[
  {
    "path": ".github/workflows/linux.yaml",
    "content": "name: Build Benchmarks on Ubuntu\non: [push]\njobs:\n  BuildBenchmarks:\n    # Only Ubuntu for now.\n    runs-on: ubuntu-latest\n    steps:\n      - name: Install prerequisites\n        run: sudo apt update && sudo apt -qq --assume-yes full-upgrade && sudo apt install -qq -y build-essential crossbuild-essential-arm64 gcc-riscv64-linux-gnu ocl-icd-opencl-dev opencl-headers libnuma-dev b3sum unzip\n      - name: Wild tomfoolery attempt\n        run: eval \"$(/home/linuxbrew/.linuxbrew/bin/brew shellenv)\" && brew install mingw-w64\n      - name: Check out repository code\n        uses: actions/checkout@v3\n      - name: Build all benchmarks\n        run: eval \"$(/home/linuxbrew/.linuxbrew/bin/brew shellenv)\" && make ci\n      - name: Package benchmarks\n        run: make package\n      - name: b3sum\n        run: b3sum clammarks.txz\n#      - name: Upload package\n#        env:\n#          UPLOAD_KEY: ${{ secrets.UPLOAD_KEY }}\n#          UPLOAD_URL: ${{ secrets.UPLOAD_URL }}\n#        run:  curl -X PUT -T clammarks.txz -H \"$UPLOAD_KEY\" \"$UPLOAD_URL\"\n"
  },
  {
    "path": ".gitignore",
    "content": "## Ignore Visual Studio temporary files, build results, and\r\n## files generated by popular Visual Studio add-ons.\r\n##\r\n## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore\r\n\r\n# User-specific files\r\n*.rsuser\r\n*.suo\r\n*.user\r\n*.userosscache\r\n*.sln.docstates\r\n*.swp\r\n*generatednasm*\r\n*.exe\r\nMemoryBandwidth/membw_*\r\nMemoryLatency/MemoryLatency\r\n\r\n# User-specific files (MonoDevelop/Xamarin Studio)\r\n*.userprefs\r\n\r\n# Mono auto generated files\r\nmono_crash.*\r\n\r\n# Build results\r\n[Dd]ebug/\r\n[Dd]ebugPublic/\r\n[Rr]elease/\r\n[Rr]eleases/\r\nx64/\r\nx86/\r\n[Ww][Ii][Nn]32/\r\n[Aa][Rr][Mm]/\r\n[Aa][Rr][Mm]64/\r\nbld/\r\n[Bb]in/\r\n[Oo]bj/\r\n[Ll]og/\r\n[Ll]ogs/\r\nclammicrobench/*.asm\r\n\r\n# Visual Studio 2015/2017 cache/options directory\r\n.vs/\r\n# Uncomment if you have tasks that create the project's static files in wwwroot\r\n#wwwroot/\r\n\r\n# Visual Studio 2017 auto generated files\r\nGenerated\\ Files/\r\n\r\n# MSTest test Results\r\n[Tt]est[Rr]esult*/\r\n[Bb]uild[Ll]og.*\r\n\r\n# NUnit\r\n*.VisualState.xml\r\nTestResult.xml\r\nnunit-*.xml\r\n\r\n# Build Results of an ATL Project\r\n[Dd]ebugPS/\r\n[Rr]eleasePS/\r\ndlldata.c\r\n\r\n# Benchmark Results\r\nBenchmarkDotNet.Artifacts/\r\n\r\n# .NET Core\r\nproject.lock.json\r\nproject.fragment.lock.json\r\nartifacts/\r\n\r\n# ASP.NET Scaffolding\r\nScaffoldingReadMe.txt\r\n\r\n# StyleCop\r\nStyleCopReport.xml\r\n\r\n# Files built by Visual Studio\r\n*_i.c\r\n*_p.c\r\n*_h.h\r\n*.ilk\r\n*.meta\r\n*.obj\r\n*.iobj\r\n*.pch\r\n*.pdb\r\n*.ipdb\r\n*.pgc\r\n*.pgd\r\n*.rsp\r\n*.sbr\r\n*.tlb\r\n*.tli\r\n*.tlh\r\n*.tmp\r\n*.tmp_proj\r\n*_wpftmp.csproj\r\n*.log\r\n*.tlog\r\n*.vspscc\r\n*.vssscc\r\n.builds\r\n*.pidb\r\n*.svclog\r\n*.scc\r\n\r\n# Chutzpah Test files\r\n_Chutzpah*\r\n\r\n# Visual C++ cache files\r\nipch/\r\n*.aps\r\n*.ncb\r\n*.opendb\r\n*.opensdf\r\n*.sdf\r\n*.cachefile\r\n*.VC.db\r\n*.VC.VC.opendb\r\n\r\n# Visual Studio 
profiler\r\n*.psess\r\n*.vsp\r\n*.vspx\r\n*.sap\r\n\r\n# Visual Studio Trace Files\r\n*.e2e\r\n\r\n# TFS 2012 Local Workspace\r\n$tf/\r\n\r\n# Guidance Automation Toolkit\r\n*.gpState\r\n\r\n# ReSharper is a .NET coding add-in\r\n_ReSharper*/\r\n*.[Rr]e[Ss]harper\r\n*.DotSettings.user\r\n\r\n# TeamCity is a build add-in\r\n_TeamCity*\r\n\r\n# DotCover is a Code Coverage Tool\r\n*.dotCover\r\n\r\n# AxoCover is a Code Coverage Tool\r\n.axoCover/*\r\n!.axoCover/settings.json\r\n\r\n# Coverlet is a free, cross platform Code Coverage Tool\r\ncoverage*.json\r\ncoverage*.xml\r\ncoverage*.info\r\n\r\n# Visual Studio code coverage results\r\n*.coverage\r\n*.coveragexml\r\n\r\n# NCrunch\r\n_NCrunch_*\r\n.*crunch*.local.xml\r\nnCrunchTemp_*\r\n\r\n# MightyMoose\r\n*.mm.*\r\nAutoTest.Net/\r\n\r\n# Web workbench (sass)\r\n.sass-cache/\r\n\r\n# Installshield output folder\r\n[Ee]xpress/\r\n\r\n# DocProject is a documentation generator add-in\r\nDocProject/buildhelp/\r\nDocProject/Help/*.HxT\r\nDocProject/Help/*.HxC\r\nDocProject/Help/*.hhc\r\nDocProject/Help/*.hhk\r\nDocProject/Help/*.hhp\r\nDocProject/Help/Html2\r\nDocProject/Help/html\r\n\r\n# Click-Once directory\r\npublish/\r\n\r\n# Publish Web Output\r\n*.[Pp]ublish.xml\r\n*.azurePubxml\r\n# Note: Comment the next line if you want to checkin your web deploy settings,\r\n# but database connection strings (with potential passwords) will be unencrypted\r\n*.pubxml\r\n*.publishproj\r\n\r\n# Microsoft Azure Web App publish settings. 
Comment the next line if you want to\r\n# checkin your Azure Web App publish settings, but sensitive information contained\r\n# in these scripts will be unencrypted\r\nPublishScripts/\r\n\r\n# NuGet Packages\r\n*.nupkg\r\n# NuGet Symbol Packages\r\n*.snupkg\r\n# The packages folder can be ignored because of Package Restore\r\n**/[Pp]ackages/*\r\n# except build/, which is used as an MSBuild target.\r\n!**/[Pp]ackages/build/\r\n# Uncomment if necessary however generally it will be regenerated when needed\r\n#!**/[Pp]ackages/repositories.config\r\n# NuGet v3's project.json files produces more ignorable files\r\n*.nuget.props\r\n*.nuget.targets\r\n\r\n# Nuget personal access tokens and Credentials\r\nnuget.config\r\n\r\n# Microsoft Azure Build Output\r\ncsx/\r\n*.build.csdef\r\n\r\n# Microsoft Azure Emulator\r\necf/\r\nrcf/\r\n\r\n# Windows Store app package directories and files\r\nAppPackages/\r\nBundleArtifacts/\r\nPackage.StoreAssociation.xml\r\n_pkginfo.txt\r\n*.appx\r\n*.appxbundle\r\n*.appxupload\r\n\r\n# Visual Studio cache files\r\n# files ending in .cache can be ignored\r\n*.[Cc]ache\r\n# but keep track of directories ending in .cache\r\n!?*.[Cc]ache/\r\n\r\n# Others\r\nClientBin/\r\n~$*\r\n*~\r\n*.dbmdl\r\n*.dbproj.schemaview\r\n*.jfm\r\n*.pfx\r\n*.publishsettings\r\norleans.codegen.cs\r\n\r\n# Including strong name files can present a security risk\r\n# (https://github.com/github/gitignore/pull/2483#issue-259490424)\r\n#*.snk\r\n\r\n# Since there are multiple workflows, uncomment next line to ignore bower_components\r\n# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)\r\n#bower_components/\r\n\r\n# RIA/Silverlight projects\r\nGenerated_Code/\r\n\r\n# Backup & report files from converting an old project file\r\n# to a newer Visual Studio version. 
Backup files are not needed,\r\n# because we have git ;-)\r\n_UpgradeReport_Files/\r\nBackup*/\r\nUpgradeLog*.XML\r\nUpgradeLog*.htm\r\nServiceFabricBackup/\r\n*.rptproj.bak\r\n\r\n# SQL Server files\r\n*.mdf\r\n*.ldf\r\n*.ndf\r\n\r\n# Business Intelligence projects\r\n*.rdl.data\r\n*.bim.layout\r\n*.bim_*.settings\r\n*.rptproj.rsuser\r\n*- [Bb]ackup.rdl\r\n*- [Bb]ackup ([0-9]).rdl\r\n*- [Bb]ackup ([0-9][0-9]).rdl\r\n\r\n# Microsoft Fakes\r\nFakesAssemblies/\r\n\r\n# GhostDoc plugin setting file\r\n*.GhostDoc.xml\r\n\r\n# Node.js Tools for Visual Studio\r\n.ntvs_analysis.dat\r\nnode_modules/\r\n\r\n# Visual Studio 6 build log\r\n*.plg\r\n\r\n# Visual Studio 6 workspace options file\r\n*.opt\r\n\r\n# Visual Studio 6 auto-generated workspace file (contains which files were open etc.)\r\n*.vbw\r\n\r\n# Visual Studio LightSwitch build output\r\n**/*.HTMLClient/GeneratedArtifacts\r\n**/*.DesktopClient/GeneratedArtifacts\r\n**/*.DesktopClient/ModelManifest.xml\r\n**/*.Server/GeneratedArtifacts\r\n**/*.Server/ModelManifest.xml\r\n_Pvt_Extensions\r\n\r\n# Paket dependency manager\r\n.paket/paket.exe\r\npaket-files/\r\n\r\n# FAKE - F# Make\r\n.fake/\r\n\r\n# CodeRush personal settings\r\n.cr/personal\r\n\r\n# Python Tools for Visual Studio (PTVS)\r\n__pycache__/\r\n*.pyc\r\n\r\n# Cake - Uncomment if you are using it\r\n# tools/**\r\n# !tools/packages.config\r\n\r\n# Tabs Studio\r\n*.tss\r\n\r\n# Telerik's JustMock configuration file\r\n*.jmconfig\r\n\r\n# BizTalk build output\r\n*.btp.cs\r\n*.btm.cs\r\n*.odx.cs\r\n*.xsd.cs\r\n\r\n# OpenCover UI analysis results\r\nOpenCover/\r\n\r\n# Azure Stream Analytics local run output\r\nASALocalRun/\r\n\r\n# MSBuild Binary and Structured Log\r\n*.binlog\r\n\r\n# NVidia Nsight GPU debugger configuration file\r\n*.nvuser\r\n\r\n# MFractors (Xamarin productivity tool) working folder\r\n.mfractor/\r\n\r\n# Local History for Visual Studio\r\n.localhistory/\r\n\r\n# BeatPulse healthcheck temp database\r\nhealthchecksdb\r\n\r\n# Backup 
folder for Package Reference Convert tool in Visual Studio 2017\r\nMigrationBackup/\r\n\r\n# Ionide (cross platform F# VS Code tools) working folder\r\n.ionide/\r\n\r\n# Fody - auto-generated XML schema\r\nFodyWeavers.xsd\r\n\r\n# VS Code files for those working on multiple tools\r\n.vscode/*\r\n!.vscode/settings.json\r\n!.vscode/tasks.json\r\n!.vscode/launch.json\r\n!.vscode/extensions.json\r\n*.code-workspace\r\n\r\n# Local History for Visual Studio Code\r\n.history/\r\n\r\n# Windows Installer files from build outputs\r\n*.cab\r\n*.msi\r\n*.msix\r\n*.msm\r\n*.msp\r\n\r\n# JetBrains Rider\r\n.idea/\r\n*.sln.iml\r\n"
  },
  {
    "path": "AsmGen/AsmGen.csproj",
    "content": "<Project Sdk=\"Microsoft.NET.Sdk\">\r\n\r\n  <PropertyGroup>\r\n    <OutputType>Exe</OutputType>\r\n    <TargetFramework>net8.0</TargetFramework>\r\n    <Prefer32Bit>false</Prefer32Bit>\r\n    <PlatformTarget>x64</PlatformTarget>\r\n    <Platforms>AnyCPU;x64</Platforms>\r\n  </PropertyGroup>\r\n\r\n  <ItemGroup>\r\n    <None Update=\"$([System.IO.Path]::Combine('Datafiles','clammicrobench.vcxproj_template'))\">\r\n      <CopyToOutputDirectory>Always</CopyToOutputDirectory>\r\n    </None>\r\n    <None Update=\"$([System.IO.Path]::Combine('Datafiles','GccBranchHistFunction.c'))\">\r\n      <CopyToOutputDirectory>Always</CopyToOutputDirectory>\r\n    </None>\r\n    <None Update=\"$([System.IO.Path]::Combine('Datafiles','IndirectBranchTestBlock.c'))\">\r\n      <CopyToOutputDirectory>Always</CopyToOutputDirectory>\r\n    </None>\r\n    <None Update=\"$([System.IO.Path]::Combine('Datafiles','BranchhistTestBlock.c'))\">\r\n      <CopyToOutputDirectory>Always</CopyToOutputDirectory>\r\n    </None>\r\n    <None Update=\"$([System.IO.Path]::Combine('Datafiles','GccIndirectBranchFunction.c'))\">\r\n      <CopyToOutputDirectory>Always</CopyToOutputDirectory>\r\n    </None>\r\n    <None Update=\"$([System.IO.Path]::Combine('Datafiles','CommonFunctions.c'))\">\r\n      <CopyToOutputDirectory>Always</CopyToOutputDirectory>\r\n    </None>\r\n  </ItemGroup>\r\n</Project>\r\n"
  },
  {
    "path": "AsmGen/AsmGen.sln",
    "content": "﻿\r\nMicrosoft Visual Studio Solution File, Format Version 12.00\r\n# Visual Studio Version 17\r\nVisualStudioVersion = 17.2.32516.85\r\nMinimumVisualStudioVersion = 10.0.40219.1\r\nProject(\"{9A19103F-16F7-4668-BE54-9A1E7A4F7556}\") = \"AsmGen\", \"AsmGen.csproj\", \"{B8930E86-946C-4831-B088-F571E73EEDC4}\"\r\nEndProject\r\nGlobal\r\n\tGlobalSection(SolutionConfigurationPlatforms) = preSolution\r\n\t\tDebug|Any CPU = Debug|Any CPU\r\n\t\tDebug|x64 = Debug|x64\r\n\t\tRelease|Any CPU = Release|Any CPU\r\n\t\tRelease|x64 = Release|x64\r\n\tEndGlobalSection\r\n\tGlobalSection(ProjectConfigurationPlatforms) = postSolution\r\n\t\t{B8930E86-946C-4831-B088-F571E73EEDC4}.Debug|Any CPU.ActiveCfg = Debug|Any CPU\r\n\t\t{B8930E86-946C-4831-B088-F571E73EEDC4}.Debug|Any CPU.Build.0 = Debug|Any CPU\r\n\t\t{B8930E86-946C-4831-B088-F571E73EEDC4}.Debug|x64.ActiveCfg = Debug|x64\r\n\t\t{B8930E86-946C-4831-B088-F571E73EEDC4}.Debug|x64.Build.0 = Debug|x64\r\n\t\t{B8930E86-946C-4831-B088-F571E73EEDC4}.Release|Any CPU.ActiveCfg = Release|Any CPU\r\n\t\t{B8930E86-946C-4831-B088-F571E73EEDC4}.Release|Any CPU.Build.0 = Release|Any CPU\r\n\t\t{B8930E86-946C-4831-B088-F571E73EEDC4}.Release|x64.ActiveCfg = Release|x64\r\n\t\t{B8930E86-946C-4831-B088-F571E73EEDC4}.Release|x64.Build.0 = Release|x64\r\n\tEndGlobalSection\r\n\tGlobalSection(SolutionProperties) = preSolution\r\n\t\tHideSolutionNode = FALSE\r\n\tEndGlobalSection\r\n\tGlobalSection(ExtensibilityGlobals) = postSolution\r\n\t\tSolutionGuid = {4433D029-CD62-44B9-862E-A8DE52DA45CE}\r\n\tEndGlobalSection\r\nEndGlobal\r\n"
  },
  {
    "path": "AsmGen/DataFiles/BranchhistTestBlock.c",
    "content": "﻿uint32_t testSizeCount = sizeof(branchHistoryLengths) / sizeof(int);\r\ninitializeBranchHistFuncArr();\r\nsrand(time(NULL));\r\n\r\nsize_t resultSize = sizeof(float) * maxBranchCount * testSizeCount;\r\nfloat* randomResults = (float*)malloc(resultSize);\r\nfloat* predictableResults = (float*)malloc(resultSize);\r\nfor (uint32_t branchCountIdx = 0; branchCountIdx < maxBranchCount; branchCountIdx++) {\r\n    for (uint32_t testSizeIdx = 0; testSizeIdx < testSizeCount; testSizeIdx++) {\r\n        uint32_t testSize = branchHistoryLengths[testSizeIdx];\r\n        uint32_t branchCount = branchCounts[branchCountIdx];\r\n        printf(\"Testing branch count %d history length %d\\n\", branchCount, testSize);\r\n        randomResults[branchCountIdx * testSizeCount + testSizeIdx] = runBranchHistTest(testSize, branchCountIdx, 1);\r\n        predictableResults[branchCountIdx * testSizeCount + testSizeIdx] = runBranchHistTest(testSize, branchCountIdx, 0);\r\n        printf(\"%d, %f, %f\\n\", testSize,\r\n            randomResults[branchCountIdx * testSizeCount + testSizeIdx],\r\n            predictableResults[branchCountIdx * testSizeCount + testSizeIdx]);\r\n    }\r\n}\r\n\r\nprintf(\"Random:\\n\");\r\nprintResultFloatArr(randomResults, branchHistoryLengths, testSizeCount, branchCounts, maxBranchCount);\r\nprintf(\"\\nPredictable:\\n\");\r\nprintResultFloatArr(predictableResults, branchHistoryLengths, testSizeCount, branchCounts, maxBranchCount);\r\n\r\nfree(randomResults);\r\nfree(predictableResults);\n"
  },
  {
    "path": "AsmGen/DataFiles/CommonFunctions.c",
    "content": "﻿// this is a partial C file that's appended into generated code\r\n// stuff here is generic enough to work for both windows/vs and gcc\r\n\r\n#ifndef __MINGW32__\r\n// optional affinity setting for effed up qualcomm/android bs\r\n#include <sched.h>\r\n#include <unistd.h>\r\n#include <sys/types.h>\r\n#include <sys/syscall.h>\r\n#include <pthread.h>\r\n\r\nvoid setAffinity(int core) {\r\n    cpu_set_t cpuset;\r\n    CPU_ZERO(&cpuset);\r\n    CPU_SET(core, &cpuset);\r\n    printf(\"Set affinity to core %d\\n\", core);\r\n    // sched_setaffinity(gettid(), sizeof(cpu_set_t), &cpuset);\r\n}\r\n#endif\r\n\r\nstruct ThreadData {\r\n    int* A;\r\n    int* B;\r\n    float* fpArr;\r\n    uint32_t list_size;\r\n    uint64_t structIterations;\r\n};\r\n\r\nvoid printCsvHeader(uint32_t* xCounts, uint32_t xLen) {\r\n    printf(\"x\");\r\n    for (uint32_t testSizeIdx = 0; testSizeIdx < xLen; testSizeIdx++) {\r\n        printf(\", %d\", xCounts[testSizeIdx]);\r\n    }\r\n\r\n    printf(\"\\n\");\r\n}\r\n\r\n// print results in format that excel can take\r\nvoid printResultFloatArr(float* arr, uint32_t *xCounts, uint32_t xLen, uint32_t *yCounts, uint32_t yLen) {\r\n    uint32_t testSizeCount = xLen;\r\n    printCsvHeader(xCounts, xLen);\r\n    for (uint32_t branchCountIdx = 0; branchCountIdx < yLen; branchCountIdx++) {\r\n        // row header\r\n        printf(\"%d\", yCounts[branchCountIdx]);\r\n        for (uint32_t testSizeIdx = 0; testSizeIdx < testSizeCount; testSizeIdx++) {\r\n            printf(\",%f\", arr[branchCountIdx * testSizeCount + testSizeIdx]);\r\n        }\r\n\r\n        printf(\"\\n\");\r\n    }\r\n}\r\n\r\nvoid FillPatternArr(uint32_t* pattern_arr, uint32_t list_size, uint32_t byte_increment) {\r\n    uint32_t increment = byte_increment / sizeof(uint32_t);\r\n    uint32_t element_count = list_size / increment;\r\n    for (int i = 0; i < element_count; i++) {\r\n        pattern_arr[i * increment] = i * increment;\r\n    }\r\n\r\n    int iter = 
element_count;\r\n    while (iter > 1) {\r\n        iter -= 1;\r\n        int j = iter - 1 == 0 ? 0 : rand() % (iter - 1);\r\n        uint32_t tmp = pattern_arr[iter * increment];\r\n        pattern_arr[iter * increment] = pattern_arr[j * increment];\r\n        pattern_arr[j * increment] = tmp;\r\n    }\r\n}"
  },
  {
    "path": "AsmGen/DataFiles/GccBranchHistFunction.c",
    "content": "﻿// this is a partial C file that's appended into generated code\r\n\r\n// Run a test, return the result in time (ns) per branch\r\n// historyLen: length of random array that the test loops through\r\n// branchCountIdx: index into array of branch counts, max determined by generated header/asm\r\n// random: if 1, randomize test array contents. If 0, fill with zeroes\r\nfloat runBranchHistTest(uint32_t historyLen, uint32_t branchCountIdx, int random) {\r\n    struct timeval startTv, endTv;\r\n    struct timezone startTz, endTz;\r\n    uint32_t branchCount = branchCounts[branchCountIdx];\r\n    uint64_t iterations = 320000000 / branchCount;\r\n    uint64_t(*branchtestFunc)(uint64_t, uint32_t**, uint32_t) __attribute((sysv_abi)) = branchtestFuncArr[branchCountIdx];\r\n    float onesCount = 0.0f;\r\n\r\n    uint32_t** testArrToArr = (uint32_t**)malloc(sizeof(uint32_t*) * branchCount);\r\n    for (int testArrIdx = 0; testArrIdx < branchCount; testArrIdx++) {\r\n        uint32_t* testArr = (uint32_t*)malloc(sizeof(uint32_t) * historyLen);\r\n        for (uint32_t i = 0; i < historyLen; i++) {\r\n            testArr[i] = random ? 
rand() % 2 : 0;\r\n            if (testArr[i] > 0)\r\n            {\r\n                onesCount += 1.0f;\r\n            }\r\n        }\r\n        testArrToArr[testArrIdx] = testArr;\r\n    }\r\n\r\n    fprintf(stderr, \"Starting test, should have %0.2f percent ones\\n\", onesCount / ((float)historyLen * branchCount));\r\n    gettimeofday(&startTv, &startTz);\r\n    uint64_t takenBranchCount = branchtestFunc(iterations, testArrToArr, historyLen);\r\n    gettimeofday(&endTv, &endTz);\r\n    uint64_t time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 1000);\r\n    float latency = 1e6 * (float)time_diff_ms / (float)iterations;\r\n\r\n    // give result in latency per branch\r\n    latency = latency / branchCount;\r\n    fprintf(stderr, \"History length %u, branch count %u: %0.2f percent not-taken\\n\", historyLen, branchCount, 100 * (float)takenBranchCount / ((float)iterations * branchCount));\r\n\r\n    for (int testArrIdx = 0; testArrIdx < branchCount; testArrIdx++) free(testArrToArr[testArrIdx]);\r\n    free(testArrToArr);\r\n    return latency;\r\n}\n"
  },
  {
    "path": "AsmGen/DataFiles/GccIndirectBranchFunction.c",
    "content": "﻿// similar but for indirect branch test\r\n// needs indirectBranchTestFuncArr generated\r\n// mode:\r\n// 0 - cycle through targets\r\n// 1 - random target selection\r\n// 2 - jump to middle\r\nfloat runIndirectBranchTest(uint32_t branchCountIdx, uint32_t targetCountIdx, uint32_t mode) {\r\n    struct timeval startTv, endTv;\r\n    struct timezone startTz, endTz;\r\n    uint32_t branchCount = indirectBranchCounts[branchCountIdx];\r\n    uint32_t targetCount = indirectBranchTargetCounts[targetCountIdx];\r\n    uint64_t iterations = 80000000 / branchCount;\r\n    uint64_t(*branchtestFunc)(uint64_t, uint32_t**, uint32_t, uint64_t **) __attribute((sysv_abi)) = indirectBranchTestFuncArr[branchCountIdx][targetCountIdx];\r\n\r\n    // generate an array containing jump target indexes for every branch\r\n    uint32_t** testArrToArr = (uint32_t**)malloc(sizeof(uint32_t*) * branchCount);\r\n    for (int testArrIdx = 0; testArrIdx < branchCount; testArrIdx++) {\r\n        uint32_t* testArr = (uint32_t*)malloc(sizeof(uint32_t) * targetCount);\r\n        if (mode == 1)\r\n            for (uint32_t i = 0; i < targetCount; i++) testArr[i] = rand() % targetCount;\r\n        else if (mode == 0)\r\n            for (uint32_t i = 0; i < targetCount; i++) testArr[i] = i;\r\n        else if (mode == 2)\r\n            for (uint32_t i = 0; i < targetCount; i++) testArr[i] = targetCount / 2;\r\n        testArrToArr[testArrIdx] = testArr;\r\n    }\r\n\r\n    // each branch needs a jump table\r\n    uint64_t** jumpTables = (uint64_t**)malloc(sizeof(uint64_t*) * branchCount);\r\n    for (int jumpTableIdx = 0; jumpTableIdx < branchCount; jumpTableIdx++)\r\n    {\r\n        uint64_t* jumpTable = (uint64_t*)malloc(sizeof(uint64_t) * targetCount);\r\n        jumpTables[jumpTableIdx] = jumpTable;\r\n    }\r\n\r\n    gettimeofday(&startTv, &startTz);\r\n    // uint64_t iterations, uint32_t **arr, uint32_t arrLen, uint64_t **scratch\r\n    branchtestFunc(iterations, testArrToArr, 
targetCount, jumpTables);\r\n    gettimeofday(&endTv, &endTz);\r\n    uint64_t time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 1000);\r\n    float latency = 1e6 * (float)time_diff_ms / (float)iterations;\r\n\r\n    // give result in latency per branch\r\n    latency = latency / branchCount;\r\n\r\n    for (int testArrIdx = 0; testArrIdx < branchCount; testArrIdx++) free(testArrToArr[testArrIdx]);\r\n    free(testArrToArr);\r\n    for (int jumpTableIdx = 0; jumpTableIdx < branchCount; jumpTableIdx++) free(jumpTables[jumpTableIdx]);\r\n    free(jumpTables);\r\n    return latency;\r\n}\n"
  },
  {
    "path": "AsmGen/DataFiles/IndirectBranchTestBlock.c",
    "content": "﻿// generated code will have:\r\n// - indirectBranchTargetCounts = array containing # of targets per branch\r\n// - indirectBranchCounts = array containing # of branches to test\r\n// - maxIndirectBranchCount = length of ^^\r\n// - initializeIndirectBranchFuncArr = populates\r\n\r\nuint32_t testSizeCount = sizeof(indirectBranchTargetCounts) / sizeof(int);\r\ninitializeIndirectBranchFuncArr();\r\nsrand(time(NULL));\r\n\r\nsize_t resultSize = sizeof(float) * maxIndirectBranchCount * testSizeCount;\r\nfloat* results = (float*)malloc(resultSize);\r\nfloat* refResults = (float*)malloc(resultSize);\r\nfor (uint32_t branchCountIdx = 0; branchCountIdx < maxIndirectBranchCount; branchCountIdx++) {\r\n    for (uint32_t targetCountIdx = 0; targetCountIdx < testSizeCount; targetCountIdx++) {\r\n        uint32_t testSize = indirectBranchTargetCounts[targetCountIdx];\r\n        uint32_t branchCount = indirectBranchCounts[branchCountIdx];\r\n        printf(\"Testing branch count %d target count %d:\", branchCount, testSize);\r\n        results[branchCountIdx * testSizeCount + targetCountIdx] = runIndirectBranchTest(branchCountIdx, targetCountIdx, 0);\r\n        refResults[branchCountIdx * testSizeCount + targetCountIdx] = runIndirectBranchTest(branchCountIdx, targetCountIdx, 2);\r\n        printf(\"%f ns, reference %f ns\\n\",\r\n            results[branchCountIdx * testSizeCount + targetCountIdx],\r\n            refResults[branchCountIdx * testSizeCount + targetCountIdx]);\r\n    }\r\n}\r\n\r\nprintf(\"Indirect branch results:\\n\");\r\nprintResultFloatArr(results, indirectBranchTargetCounts, testSizeCount, indirectBranchCounts, maxIndirectBranchCount);\r\nprintf(\"Reference indirect branch results:\\n\");\r\nprintResultFloatArr(refResults, indirectBranchTargetCounts, testSizeCount, indirectBranchCounts, maxIndirectBranchCount);\r\n\r\nfree(results);\r\nfree(refResults);\n"
  },
  {
    "path": "AsmGen/DataFiles/clammicrobench.vcxproj_template",
    "content": "<?xml version=\"1.0\" encoding=\"utf-8\"?>\r\n<Project DefaultTargets=\"Build\" xmlns=\"http://schemas.microsoft.com/developer/msbuild/2003\">\r\n  <ItemGroup Label=\"ProjectConfigurations\">\r\n    <ProjectConfiguration Include=\"Debug|Win32\">\r\n      <Configuration>Debug</Configuration>\r\n      <Platform>Win32</Platform>\r\n    </ProjectConfiguration>\r\n    <ProjectConfiguration Include=\"Release|Win32\">\r\n      <Configuration>Release</Configuration>\r\n      <Platform>Win32</Platform>\r\n    </ProjectConfiguration>\r\n    <ProjectConfiguration Include=\"Debug|x64\">\r\n      <Configuration>Debug</Configuration>\r\n      <Platform>x64</Platform>\r\n    </ProjectConfiguration>\r\n    <ProjectConfiguration Include=\"Release|x64\">\r\n      <Configuration>Release</Configuration>\r\n      <Platform>x64</Platform>\r\n    </ProjectConfiguration>\r\n  </ItemGroup>\r\n  <PropertyGroup Label=\"Globals\">\r\n    <VCProjectVersion>16.0</VCProjectVersion>\r\n    <Keyword>Win32Proj</Keyword>\r\n    <ProjectGuid>{7e8cf2ba-57a7-4b42-b721-97e02bf9a8b8}</ProjectGuid>\r\n    <RootNamespace>clammicrobench</RootNamespace>\r\n    <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>\r\n  </PropertyGroup>\r\n  <Import Project=\"$(VCTargetsPath)\\Microsoft.Cpp.Default.props\" />\r\n  <PropertyGroup Condition=\"'$(Configuration)|$(Platform)'=='Debug|Win32'\" Label=\"Configuration\">\r\n    <ConfigurationType>Application</ConfigurationType>\r\n    <UseDebugLibraries>true</UseDebugLibraries>\r\n    <PlatformToolset>v142</PlatformToolset>\r\n    <CharacterSet>Unicode</CharacterSet>\r\n  </PropertyGroup>\r\n  <PropertyGroup Condition=\"'$(Configuration)|$(Platform)'=='Release|Win32'\" Label=\"Configuration\">\r\n    <ConfigurationType>Application</ConfigurationType>\r\n    <UseDebugLibraries>false</UseDebugLibraries>\r\n    <PlatformToolset>v142</PlatformToolset>\r\n    <WholeProgramOptimization>true</WholeProgramOptimization>\r\n    
<CharacterSet>Unicode</CharacterSet>\r\n  </PropertyGroup>\r\n  <PropertyGroup Condition=\"'$(Configuration)|$(Platform)'=='Debug|x64'\" Label=\"Configuration\">\r\n    <ConfigurationType>Application</ConfigurationType>\r\n    <UseDebugLibraries>true</UseDebugLibraries>\r\n    <PlatformToolset>v142</PlatformToolset>\r\n    <CharacterSet>Unicode</CharacterSet>\r\n  </PropertyGroup>\r\n  <PropertyGroup Condition=\"'$(Configuration)|$(Platform)'=='Release|x64'\" Label=\"Configuration\">\r\n    <ConfigurationType>Application</ConfigurationType>\r\n    <UseDebugLibraries>false</UseDebugLibraries>\r\n    <PlatformToolset>v142</PlatformToolset>\r\n    <WholeProgramOptimization>true</WholeProgramOptimization>\r\n    <CharacterSet>Unicode</CharacterSet>\r\n  </PropertyGroup>\r\n  <Import Project=\"$(VCTargetsPath)\\Microsoft.Cpp.props\" />\r\n  <ImportGroup Label=\"ExtensionSettings\">\r\n  </ImportGroup>\r\n  <ImportGroup Label=\"Shared\">\r\n  </ImportGroup>\r\n  <ImportGroup Label=\"PropertySheets\" Condition=\"'$(Configuration)|$(Platform)'=='Debug|Win32'\">\r\n    <Import Project=\"$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props\" Condition=\"exists('$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props')\" Label=\"LocalAppDataPlatform\" />\r\n  </ImportGroup>\r\n  <ImportGroup Label=\"PropertySheets\" Condition=\"'$(Configuration)|$(Platform)'=='Release|Win32'\">\r\n    <Import Project=\"$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props\" Condition=\"exists('$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props')\" Label=\"LocalAppDataPlatform\" />\r\n  </ImportGroup>\r\n  <ImportGroup Label=\"PropertySheets\" Condition=\"'$(Configuration)|$(Platform)'=='Debug|x64'\">\r\n    <Import Project=\"$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props\" Condition=\"exists('$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props')\" Label=\"LocalAppDataPlatform\" />\r\n  </ImportGroup>\r\n  <ImportGroup Label=\"PropertySheets\" 
Condition=\"'$(Configuration)|$(Platform)'=='Release|x64'\">\r\n    <Import Project=\"$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props\" Condition=\"exists('$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props')\" Label=\"LocalAppDataPlatform\" />\r\n  </ImportGroup>\r\n  <PropertyGroup Label=\"UserMacros\" />\r\n  <PropertyGroup Condition=\"'$(Configuration)|$(Platform)'=='Debug|Win32'\">\r\n    <LinkIncremental>true</LinkIncremental>\r\n  </PropertyGroup>\r\n  <PropertyGroup Condition=\"'$(Configuration)|$(Platform)'=='Release|Win32'\">\r\n    <LinkIncremental>false</LinkIncremental>\r\n  </PropertyGroup>\r\n  <PropertyGroup Condition=\"'$(Configuration)|$(Platform)'=='Debug|x64'\">\r\n    <LinkIncremental>true</LinkIncremental>\r\n  </PropertyGroup>\r\n  <PropertyGroup Condition=\"'$(Configuration)|$(Platform)'=='Release|x64'\">\r\n    <LinkIncremental>false</LinkIncremental>\r\n  </PropertyGroup>\r\n  <ItemDefinitionGroup Condition=\"'$(Configuration)|$(Platform)'=='Debug|Win32'\">\r\n    <ClCompile>\r\n      <WarningLevel>Level3</WarningLevel>\r\n      <SDLCheck>true</SDLCheck>\r\n      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>\r\n      <ConformanceMode>true</ConformanceMode>\r\n    </ClCompile>\r\n    <Link>\r\n      <SubSystem>Console</SubSystem>\r\n      <GenerateDebugInformation>true</GenerateDebugInformation>\r\n    </Link>\r\n  </ItemDefinitionGroup>\r\n  <ItemDefinitionGroup Condition=\"'$(Configuration)|$(Platform)'=='Release|Win32'\">\r\n    <ClCompile>\r\n      <WarningLevel>Level3</WarningLevel>\r\n      <FunctionLevelLinking>true</FunctionLevelLinking>\r\n      <IntrinsicFunctions>true</IntrinsicFunctions>\r\n      <SDLCheck>true</SDLCheck>\r\n      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>\r\n      <ConformanceMode>true</ConformanceMode>\r\n    </ClCompile>\r\n    <Link>\r\n      <SubSystem>Console</SubSystem>\r\n      
<EnableCOMDATFolding>true</EnableCOMDATFolding>\r\n      <OptimizeReferences>true</OptimizeReferences>\r\n      <GenerateDebugInformation>true</GenerateDebugInformation>\r\n    </Link>\r\n  </ItemDefinitionGroup>\r\n  <ItemDefinitionGroup Condition=\"'$(Configuration)|$(Platform)'=='Debug|x64'\">\r\n    <ClCompile>\r\n      <WarningLevel>Level3</WarningLevel>\r\n      <SDLCheck>true</SDLCheck>\r\n      <PreprocessorDefinitions>_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>\r\n      <ConformanceMode>true</ConformanceMode>\r\n    </ClCompile>\r\n    <Link>\r\n      <SubSystem>Console</SubSystem>\r\n      <GenerateDebugInformation>true</GenerateDebugInformation>\r\n    </Link>\r\n  </ItemDefinitionGroup>\r\n  <ItemDefinitionGroup Condition=\"'$(Configuration)|$(Platform)'=='Release|x64'\">\r\n    <ClCompile>\r\n      <WarningLevel>Level3</WarningLevel>\r\n      <FunctionLevelLinking>true</FunctionLevelLinking>\r\n      <IntrinsicFunctions>true</IntrinsicFunctions>\r\n      <SDLCheck>true</SDLCheck>\r\n      <PreprocessorDefinitions>NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>\r\n      <ConformanceMode>true</ConformanceMode>\r\n    </ClCompile>\r\n    <Link>\r\n      <SubSystem>Console</SubSystem>\r\n      <EnableCOMDATFolding>true</EnableCOMDATFolding>\r\n      <OptimizeReferences>true</OptimizeReferences>\r\n      <GenerateDebugInformation>true</GenerateDebugInformation>\r\n    </Link>\r\n  </ItemDefinitionGroup>\r\n  <ItemGroup>\r\n    <ClCompile Include=\"clammicrobench.cpp\" />\r\n  </ItemGroup>\r\n%REPLACEWITHCUSTOMBUILD%\r\n  <Import Project=\"$(VCTargetsPath)\\Microsoft.Cpp.targets\" />\r\n  <ImportGroup Label=\"ExtensionTargets\">\r\n  </ImportGroup>\r\n</Project>\n"
  },
  {
    "path": "AsmGen/IUarchTest.cs",
    "content": "﻿using System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public interface IUarchTest\r\n    {\r\n        public const string ThreadLaunchFunctionPrefix = \"ThreadLaunch_\";\r\n        // enough to generate global lines, function calls, and let user pick from tests\r\n        public string Prefix { get; }\r\n        public string Description { get; }\r\n        public bool DivideTimeByCount { get; }\r\n        public bool SupportsIsa(ISA isa);\r\n\r\n        public void GenerateAsm(StringBuilder sb, ISA isa);\r\n        public void GenerateTestBlock(StringBuilder sb, ISA isa);\r\n        public void GenerateAsmGlobalLines(StringBuilder sb);\r\n        public void GenerateExternLines(StringBuilder sb);\r\n\r\n        public enum ISA\r\n        {\r\n            amd64,   // 64-bit x86\r\n            aarch64, // 64-bit arm\r\n            mips64,   // 64-bit MIPS, for loongson\r\n            riscv,   // 64-bit risc-v\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/Program.cs",
    "content": "﻿using System;\r\nusing System.Collections.Generic;\r\nusing System.IO;\r\nusing System.Text;\r\nusing System.Threading.Tasks;\r\n\r\nnamespace AsmGen\r\n{\r\n    class Program\r\n    {\r\n        public static string DataFilesDir = \"DataFiles\";\r\n\r\n        static int structTestIterations = 5000000;\r\n        static int iterations = 100 * structTestIterations;\r\n        static int latencyListSize = 131072 * 1024 / 4; // 128 MB\r\n\r\n        static void Main(string[] args)\r\n        {\r\n            List<IUarchTest> tests = new List<IUarchTest>();\r\n            tests.Add(new BtbTest(4, BtbTest.BranchType.Unconditional));\r\n            tests.Add(new BtbTest(8, BtbTest.BranchType.Unconditional));\r\n            tests.Add(new BtbTest(16, BtbTest.BranchType.Unconditional));\r\n            tests.Add(new BtbTest(32, BtbTest.BranchType.Unconditional));\r\n            tests.Add(new BtbTest(64, BtbTest.BranchType.Unconditional));\r\n            tests.Add(new BtbTest(4, BtbTest.BranchType.Conditional));\r\n            tests.Add(new BtbTest(8, BtbTest.BranchType.Conditional));\r\n            tests.Add(new BtbTest(16, BtbTest.BranchType.Conditional));\r\n            tests.Add(new BtbTest(32, BtbTest.BranchType.Conditional));\r\n            tests.Add(new BranchHistoryTest());\r\n\r\n            List<Task> tasks = new List<Task>();\r\n            tasks.Add(Task.Run(() => GenerateCFile(tests, IUarchTest.ISA.amd64)));\r\n            tasks.Add(Task.Run(() => GenerateCFile(tests, IUarchTest.ISA.aarch64)));\r\n            tasks.Add(Task.Run(() => GenerateCFile(tests, IUarchTest.ISA.mips64)));\r\n            tasks.Add(Task.Run(() => GenerateCFile(tests, IUarchTest.ISA.riscv)));\r\n\r\n            tasks.Add(Task.Run(() => GenerateAsmFile(tests, IUarchTest.ISA.amd64)));\r\n            tasks.Add(Task.Run(() => GenerateAsmFile(tests, IUarchTest.ISA.aarch64)));\r\n            tasks.Add(Task.Run(() => GenerateAsmFile(tests, IUarchTest.ISA.mips64)));\r\n            
tasks.Add(Task.Run(() => GenerateAsmFile(tests, IUarchTest.ISA.riscv)));\r\n            Task.WaitAll(tasks.ToArray());\r\n\r\n            GenerateMakefile();\r\n        }\r\n\r\n        static void GenerateCFile(List<IUarchTest> tests, IUarchTest.ISA isa)\r\n        {\r\n            StringBuilder sb = new StringBuilder();\r\n            sb.AppendLine(\"#define _GNU_SOURCE\");\r\n            sb.AppendLine(\"#include <stdio.h>\\n#include<stdint.h>\\n#include<sys/time.h>\\n#include <stdlib.h>\\n#include <string.h>\\n#include <time.h>\\n\");\r\n            sb.AppendLine(\"#pragma GCC diagnostic ignored \\\"-Wattributes\\\"\");\r\n            string commonFunctions = File.ReadAllText(Path.Combine(DataFilesDir, \"CommonFunctions.c\"));\r\n            sb.AppendLine(commonFunctions);\r\n\r\n            foreach (IUarchTest test in tests)\r\n            {\r\n                if (test.SupportsIsa(isa))\r\n                {\r\n                    test.GenerateExternLines(sb);\r\n                    Console.WriteLine(\"Test \" + test.Prefix + \" supports ISA \" + isa);\r\n                }\r\n            }\r\n\r\n            // no indexed addressing mode on these architectures, so make sure we can do pointer\r\n            // chasing with a single instruction\r\n            if (isa == IUarchTest.ISA.mips64 || isa == IUarchTest.ISA.riscv)\r\n            {\r\n                sb.AppendLine(\"extern void preplatencyarr(int *arr, uint32_t list_size);\");\r\n            }\r\n\r\n            AddCommonInitCode(sb, tests, isa);\r\n            foreach (IUarchTest test in tests)\r\n            {\r\n                if (test.SupportsIsa(isa)) test.GenerateTestBlock(sb, isa);\r\n            }\r\n\r\n            AddCommonEndCode(sb);\r\n\r\n            File.WriteAllText(\"clammicrobench_\" + isa.ToString() + \".c\", sb.ToString());\r\n        }\r\n\r\n        static void GenerateAsmFile(List<IUarchTest> tests, IUarchTest.ISA isa)\r\n        {\r\n            string filename = 
\"clammicrobench_\" + isa.ToString() + \".s\";\r\n            StringBuilder sb = new StringBuilder();\r\n            sb.AppendLine(\".text\");\r\n\r\n            if (isa == IUarchTest.ISA.mips64)\r\n            {\r\n                UarchTest.GenerateMipsPrepArrayFunction(sb);\r\n            }\r\n            else if (isa == IUarchTest.ISA.riscv)\r\n            {\r\n                UarchTest.GenerateRiscvPrepArrayFunction(sb);\r\n            }\r\n\r\n            File.WriteAllText(filename, sb.ToString());\r\n            sb.Clear();\r\n\r\n            foreach (IUarchTest test in tests)\r\n            {\r\n                if (test.SupportsIsa(isa))\r\n                {\r\n                    sb.Clear();\r\n                    test.GenerateAsmGlobalLines(sb);\r\n                    test.GenerateAsm(sb, isa);\r\n                    File.AppendAllText(filename, sb.ToString());\r\n                }\r\n            }\r\n        }\r\n\r\n        static void GenerateMakefile()\r\n        {\r\n            StringBuilder sb = new StringBuilder();\r\n            foreach (IUarchTest.ISA isa in Enum.GetValues(typeof(IUarchTest.ISA)))\r\n            {\r\n                sb.AppendLine(isa.ToString() + \":\");\r\n                if (isa == IUarchTest.ISA.aarch64)\r\n                {\r\n                    sb.AppendLine($\"\\tgcc -march=armv8.5-a+aes clammicrobench_{isa.ToString()}.c clammicrobench_{isa.ToString()}.s -o cb -static\");\r\n                    // hack for stupid compilers that need a ton of flags to do basic things\r\n                    sb.AppendLine(\"android:\");\r\n                    sb.AppendLine(\"\\tclang -march=armv8.3-a -mfpu=neon-fp-armv8 clammicrobench_aarch64.c clammicrobench_aarch64.s -o cb\");\r\n                }\r\n                else sb.AppendLine($\"\\tgcc -pthread clammicrobench_{isa.ToString()}.c clammicrobench_{isa.ToString()}.s -o cb\");\r\n            }\r\n\r\n            sb.AppendLine(\"win64:\");\r\n            
sb.AppendLine($\"\\tx86_64-w64-mingw32-gcc clammicrobench_{IUarchTest.ISA.amd64.ToString()}.c clammicrobench_{IUarchTest.ISA.amd64.ToString()}.s -o cb.exe\");\r\n\r\n            sb.AppendLine(\"clean:\");\r\n            sb.AppendLine(\"\\trm clammicrobench_* cb\");\r\n\r\n            File.WriteAllText(\"Makefile\", sb.ToString());\r\n        }\r\n\r\n        // Adds largely ISA independent initialization code that gives tests a basic foundation,\r\n        // like a pointer chasing array\r\n        static void AddCommonInitCode(StringBuilder sb, List<IUarchTest> tests, IUarchTest.ISA isa)\r\n        {\r\n            sb.AppendLine(\"int main(int argc, char *argv[]) {\");\r\n            sb.AppendLine($\"  uint64_t time_diff_ms, iterations = {iterations}, structIterations = {structTestIterations}, tmp;\");\r\n            sb.AppendLine(\"  double latency; int *A = NULL, *B = NULL; float *fpArr = NULL; char *test_name = NULL; int core_affinity = -1; int threads = 1;\");\r\n            sb.AppendLine(\"  uint64_t tmpsink;\");\r\n            sb.AppendLine(\"  uint32_t list_size = \" + latencyListSize + \";\");\r\n\r\n            // print a help message based on tests available\r\n            sb.AppendLine($\"  printf(\\\"Usage: -test [test name] -listsize [latency list size = {latencyListSize}] -iterations [struct iterations = {structTestIterations}]\\\\n\\\");\");\r\n            sb.AppendLine(\"  if (argc < 2) {\");\r\n            sb.AppendLine(\"    printf(\\\"List of tests:\\\\n\\\");\");\r\n            foreach (IUarchTest test in tests)\r\n            {\r\n                if (test.SupportsIsa(isa)) sb.AppendLine($\"    printf(\\\"  {test.Prefix} - {test.Description}\\\\n\\\");\");\r\n            }\r\n\r\n            // args provided. 
parse them and run test\r\n            sb.AppendLine(\"  } else {\");\r\n\r\n            // args handling\r\n            sb.AppendLine(\"    for (int argIdx = 1; argIdx < argc; argIdx++) {\");\r\n            sb.AppendLine(\"      if (*(argv[argIdx]) == '-') { char *arg = argv[argIdx] + 1;\");\r\n            sb.AppendLine(\"        if (strncmp(arg, \\\"test\\\", 4) == 0) { argIdx++; test_name = argv[argIdx]; }\");\r\n            sb.AppendLine(\"        if (strncmp(arg, \\\"iterations\\\", 10) == 0) { argIdx++; iterations = 100 * atoi(argv[argIdx]); }\");\r\n            sb.AppendLine(\"        if (strncmp(arg, \\\"listsize\\\", 8) == 0) { argIdx++; list_size = atoi(argv[argIdx]); }\");\r\n            sb.AppendLine(\"        if (strncmp(arg, \\\"affinity\\\", 8) == 0) { argIdx++; core_affinity = atoi(argv[argIdx]); }\");\r\n            sb.AppendLine(\"        if (strncmp(arg, \\\"threads\\\", 7) == 0) { argIdx++; threads = atoi(argv[argIdx]); }\");\r\n            sb.AppendLine(\"      }\"); // end -arg handling if\r\n            sb.AppendLine(\"    }\"); // end args handling for loop\r\n\r\n            sb.AppendLine(\"    if (test_name == NULL) { fprintf(stderr, \\\"No test specified\\\\n\\\"); return 0; }\");\r\n\r\n            // Optional affinity setting for certain troublesome platforms\r\n            // don't need a version that uses Windows affinity APIs because Windows platforms never have this issue\r\n            sb.AppendLine(\"#ifndef __MINGW32__\");\r\n            sb.AppendLine(\"  if (core_affinity != -1) setAffinity(core_affinity);\");\r\n            sb.AppendLine(\"#endif\");\r\n\r\n            // Generate array for pointer chasing unless we're doing a BTB test\r\n            sb.AppendLine(\"  if (argc == 1 || argc > 1 && strncmp(test_name, \\\"btb\\\", 3) != 0) {\");\r\n            GenerateLatencyTestArray(sb);\r\n            sb.AppendLine(\"  }\"); // end of ptr chasing array generation\r\n            sb.AppendLine(\"  struct timeval startTv, 
endTv;\");\r\n            sb.AppendLine(\"  struct timezone startTz, endTz;\");\r\n        }\r\n\r\n\r\n        static void AddCommonEndCode(StringBuilder sb)\r\n        {\r\n            sb.AppendLine(\"  free(A); free(B); free(fpArr);\");\r\n            sb.AppendLine(\"  }\"); // end else\r\n            sb.AppendLine(\"  return 0; }\");\r\n        }\r\n\r\n        static void GenerateLatencyTestArray(StringBuilder sb)\r\n        {\r\n            // Fill list to create random access pattern\r\n            sb.AppendLine(\"  A = (int*)malloc(sizeof(int) * list_size);\");\r\n            sb.AppendLine(\"  srand(time(NULL));\");\r\n            sb.AppendLine(\"  FillPatternArr(A, list_size, 64);\\n\");\r\n\r\n            sb.AppendLine(\"#ifdef _WIN32\");\r\n            sb.AppendLine(\"  B = (int*)_aligned_malloc(sizeof(int) * list_size, 64);\\n\");\r\n            sb.AppendLine(\"#else\");\r\n            sb.AppendLine(\"  posix_memalign((void **)&B, 64, sizeof(int) * list_size);\\n\");\r\n            sb.AppendLine(\"#endif\");\r\n            sb.AppendLine(\"  for (int i = 0; i < list_size; i++) { B[i] = i; }\\n\");\r\n            sb.AppendLine(\"#ifdef _WIN32\");\r\n            sb.AppendLine(\"  fpArr = (float*)_aligned_malloc(sizeof(float) * list_size, 64);\\n\");\r\n            sb.AppendLine(\"#else\");\r\n            sb.AppendLine(\"  posix_memalign((void **)&fpArr, 64, sizeof(float) * list_size);\");\r\n            sb.AppendLine(\"#endif\");\r\n            sb.AppendLine(\"  for (int i = 0;i < list_size; i++) { fpArr[i] = i + .1; }\\n\");\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/Properties/launchSettings.json",
    "content": "{\r\n  \"profiles\": {\r\n    \"AsmGen\": {\r\n      \"commandName\": \"Project\",\r\n      \"commandLineArgs\": \"autocopy\"\r\n    }\r\n  }\r\n}\n"
  },
  {
    "path": "AsmGen/README.md",
    "content": "# Microbenchmark Generator\r\nC# project to generate C and assembly for CPU structure size benchmarks that use different code for each data point, making them\r\nimpractical to write by hand. For more details on methodology for out-of-order structure size measurement, see https://blog.stuffedcow.net/2013/05/measuring-rob-capacity/\r\n\r\nFirst, go to Program.cs and set the expected sizes for the structures you want to measure. The constructor for each test generally has the same (low, high, step) format. For example if you anticipate ROB capacity will be between 128 and 512 entries, you can do `tests.Add(new RobTest(128, 1, 512))`\r\n\r\n# Building\r\n\r\nCompile the project and run AsmGen.exe. That gives several output files. Compilation for Linux:\r\n`gcc clammicrobench.c clammicrobench_x86.s -o clammicrobench` for x86_64\r\n`gcc clammicrobench.c clammicrobench_arm.s -o clammicrobench` for aarch64\r\n`aarch64-linux-gnu-gcc clammicrobench.c clammicrobench_arm.s -o clammicrobench` to cross compile for aarch64 (for example from a fast desktop)\r\n\r\nFor Windows, run `AsmGen.exe autocopy`. That copies generated files to the /clammicrobench directory, assuming it's run from the default VS output location. Then, open /clammicrobench/clammicrobench.sln and build. You need nasm in your path for that, as covered on README.md at repo root.\r\n\r\nThe indirect branch test can take a while to build with nasm, so you might want to reduce the branch and target counts for that. Or just keep it commented out.\r\n\r\n# Running\r\nGenerally, the syntax is `clammicrobench -test [test name] -listsize [list size for latency test] -iterations [iteration count]`. The last two parameters are optional.\r\n\r\n# Tests\r\n\r\nRunning the program without parameters will spit out a list of tests and brief descriptions. Most are structure size tests. Instructions that consume certain core resources are placed between two pointer chasing loads. 
Once the two cache misses can't overlap, the structure being tested is full. Some tests, especially those measuring scheduler capacity, will hit a mix of instructions to see whether capacity is shared across different categories of instructions. \r\n\r\nAlongside structure size tests, AsmGen is a convenient place to put other microbenchmarks that involve generating tons of code. There are several branch predictor tests:\r\n- btb16Unconditional, etc: Creates a chain of taken branches in a loop to measure taken branch latency. Useful for showing BTB size and speed. Different distances between branches are useful because branch predictors sometimes have trouble tracking branches that are too close together.\r\n- btb16Conditional: Same as above but with always-taken conditional branches\r\n- branchhist - Branch history test: Generates branches that are taken or not taken in some random pattern, then increases the length of that pattern and the number of branches. Each branch is given its own pattern. This test thus tries to see how long of a pattern the branch predictor can track before getting a lot of mispredicts.\r\n- indirectbranch - Indirect branch prediction test: Varies the number of branch targets and branches to see how many total targets the indirect branch predictor can track\r\n- returnstack - Tests return prediction with nested calls of varying depths. When the return stack overflows, you'll see an increase in time per call/return pair.\r\n"
  },
  {
    "path": "AsmGen/UarchTest.cs",
    "content": "﻿using System.Runtime.Serialization;\r\nusing System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public abstract class UarchTest : IUarchTest\r\n    {\r\n        public string Prefix { get; set; }\r\n\r\n        public string Description { get; set; }\r\n\r\n        public int[] Counts;\r\n\r\n        public string FunctionDefinitionParameters { get; set; }\r\n\r\n        public string GetFunctionCallParameters { get; set; }\r\n\r\n        public bool DivideTimeByCount { get; set; }\r\n\r\n        public abstract bool SupportsIsa(IUarchTest.ISA isa);\r\n        public abstract void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa);\r\n\r\n        public void GenerateAsmGlobalLines(StringBuilder sb)\r\n        {\r\n            for (int i = 0; i < Counts.Length; i++)\r\n                sb.AppendLine(\".global \" + Prefix + Counts[i]);\r\n        }\r\n\r\n        public void GenerateExternLines(StringBuilder sb)\r\n        {\r\n            for (int i = 0; i < Counts.Length; i++)\r\n            {\r\n                sb.AppendLine(\"extern uint64_t \" + Prefix + Counts[i] + $\"({FunctionDefinitionParameters}) __attribute((sysv_abi));\");\r\n\r\n                // Function that can be launched in a pthread\r\n                sb.AppendLine($\"void *{IUarchTest.ThreadLaunchFunctionPrefix}{Prefix}{Counts[i]}(void *pa)\");\r\n                sb.AppendLine(\"{\");\r\n                sb.AppendLine(\"    struct ThreadData *td = (struct ThreadData *)pa;\");\r\n                sb.AppendLine(\"    int *A = td->A;\");\r\n                sb.AppendLine(\"    int *B = td->B;\");\r\n                sb.AppendLine(\"    float *fpArr = td->fpArr;\");\r\n                sb.AppendLine(\"    uint32_t list_size = td->list_size;\");\r\n                sb.AppendLine(\"    int structIterations = td->structIterations;\");\r\n                sb.AppendLine(\"    \" + Prefix + Counts[i] + $\"({GetFunctionCallParameters});\");\r\n                sb.AppendLine(\"    return NULL;\");\r\n   
             sb.AppendLine(\"}\");\r\n            }\r\n        }\r\n\r\n        public void GenerateTestBlock(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            sb.AppendLine(\"  if (argc > 1 && strcmp(test_name, \\\"\" + Prefix + \"\\\") == 0) {\");\r\n            sb.AppendLine(\"    printf(\\\"\" + Description + \":\\\\n\\\");\");\r\n\r\n            if (isa == IUarchTest.ISA.mips64 || isa == IUarchTest.ISA.riscv)\r\n            {\r\n                sb.AppendLine(\"  if (argc == 1 || argc > 1 && strncmp(test_name, \\\"btb\\\", 3) != 0) {\");\r\n                sb.AppendLine(\"preplatencyarr(A, list_size);\");\r\n                sb.AppendLine(\"  }\");\r\n            }\r\n\r\n            for (int i = 0; i < Counts.Length; i++)\r\n            {\r\n                // use more iterations (iterations = structIterations * 100) and divide iteration count by tested-thing count\r\n                // for certain tests like call stack depth\r\n                if (DivideTimeByCount)\r\n                {\r\n                    sb.AppendLine(\"    tmp = structIterations;\");\r\n                    sb.AppendLine(\"    structIterations = iterations / \" + Counts[i] + \";\");\r\n                }\r\n\r\n                sb.AppendLine(\"    gettimeofday(&startTv, &startTz);\");\r\n                sb.AppendLine(\"#ifndef __MINGW32__\");\r\n                sb.AppendLine(\"    if (threads > 1) {\");\r\n                sb.AppendLine(\"        struct ThreadData testThreadData;\");\r\n                sb.AppendLine(\"        pthread_t *testThreads = (pthread_t *)malloc(threads * sizeof(pthread_t));\");\r\n                sb.AppendLine(\"        testThreadData.A = A;\");\r\n                sb.AppendLine(\"        testThreadData.B = B;\");\r\n                sb.AppendLine(\"        testThreadData.fpArr = fpArr;\");\r\n                sb.AppendLine(\"        testThreadData.list_size = list_size;\");\r\n                sb.AppendLine(\"        testThreadData.structIterations = 
structIterations;\");\r\n                sb.AppendLine(\"        for (int threadIdx = 0; threadIdx < threads; threadIdx++) {\");\r\n                sb.AppendLine($\"            pthread_create(testThreads + threadIdx, NULL, {IUarchTest.ThreadLaunchFunctionPrefix}{Prefix}{Counts[i]}, &testThreadData);\");\r\n                sb.AppendLine(\"        }\");\r\n                sb.AppendLine(\"        for (int threadIdx = 0; threadIdx < threads; threadIdx++) {\");\r\n                sb.AppendLine(\"             pthread_join(testThreads[threadIdx], NULL);\");\r\n                sb.AppendLine(\"        }\");\r\n                sb.AppendLine(\"        free(testThreads);\");\r\n                // launch threads\r\n                sb.AppendLine(\"    } else \");\r\n                sb.AppendLine(\"        \" + Prefix + Counts[i] + $\"({GetFunctionCallParameters});\");\r\n                sb.AppendLine(\"#else\");\r\n                sb.AppendLine(\"    \" + Prefix + Counts[i] + $\"({GetFunctionCallParameters});\");\r\n                sb.AppendLine(\"#endif\");\r\n                sb.AppendLine(\"    gettimeofday(&endTv, &endTz);\");\r\n                sb.AppendLine(\"    time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 1000);\");\r\n                //sb.AppendLine(\"    fprintf(stderr, \\\"%lu ms elapsed, %lu iter\\\\n\\\", time_diff_ms, structIterations);\");\r\n                if (DivideTimeByCount)\r\n                    sb.AppendLine(\"    latency = 1e6 * (float)time_diff_ms / (float)(iterations);\");\r\n                else\r\n                    sb.AppendLine(\"    latency = 1e6 * (float)time_diff_ms / (float)(structIterations);\");\r\n                sb.AppendLine(\"    printf(\\\"\" + Counts[i] + \",%f\\\\n\\\", latency);\\n\");\r\n\r\n                if (DivideTimeByCount)\r\n                {\r\n                    sb.AppendLine(\"    structIterations = tmp;\");\r\n                }\r\n            }\r\n\r\n            
sb.AppendLine(\"  }\\n\");\r\n        }\r\n\r\n        /// <summary>\r\n        /// MIPS doesn't have an indexed load instruction which means we'd have to use an\r\n        /// add+shift (extra two instructions), which would complicate measurements\r\n        /// So screw around in order to use direct addressing\r\n        /// </summary>\r\n        /// <param name=\"sb\"></param>\r\n        public static void GenerateMipsPrepArrayFunction(StringBuilder sb)\r\n        {\r\n            // r4 = ptr to arr, r5 = arr len, in 32-bit elements\r\n            sb.AppendLine(\".global preplatencyarr\");\r\n            sb.AppendLine(\"preplatencyarr:\");\r\n            sb.AppendLine(\"  xor $r12, $r12, $r12\");\r\n            sb.AppendLine(\"  xor $r13, $r13, $r13\");\r\n            sb.AppendLine(\"  xor $r14, $r14, $r14\");\r\n            sb.AppendLine(\"  xor $r15, $r15, $r15\"); // array index\r\n            sb.AppendLine(\"  addi.d $r14, $r14, 1\");\r\n            sb.AppendLine(\"preplatencyarr_loop:\");\r\n            sb.AppendLine(\"  alsl.d $r12, $r15, $r0, 0x3\"); // shift by 3 = multiply by 8 for 64-bit\r\n            sb.AppendLine(\"  add.d $r12, $r4, $r12\"); // add loaded value to base address\r\n            sb.AppendLine(\" ld.d $r13, $r12, 0\");\r\n            sb.AppendLine(\"  alsl.d $r13, $r13, $r0, 0x2\"); // address calculation for loaded index. 
this is in 32-bit values\r\n            sb.AppendLine(\"  add.d $r13, $r4, $r13\");\r\n            sb.AppendLine(\"  st.d $r13, $r12, 0\");  // save calculated address\r\n            sb.AppendLine(\"  add.d $r15, $r15, $r14\");\r\n            sb.AppendLine(\"  alsl.d $r16, $r15, $r0, 0x1\"); // multiply 64-bit index by 2 to prevent out of bounds for 32-bit list size count\r\n            sb.AppendLine(\"  bne $r16, $r5, preplatencyarr_loop\"); // while idx != len\r\n            sb.AppendLine(\"  jr $r1\");\r\n        }\r\n\r\n        public static void GenerateRiscvPrepArrayFunction(StringBuilder sb)\r\n        {\r\n            sb.AppendLine(\".global preplatencyarr\");\r\n            sb.AppendLine(\"preplatencyarr:\");\r\n            sb.AppendLine(\"  li x7, 0\");\r\n            sb.AppendLine(\"  mv x5, x10\");\r\n            sb.AppendLine(\"preplatencyarr_loop:\");\r\n            sb.AppendLine(\"  ld x28, (x5)\");\r\n            sb.AppendLine(\"  slli x28, x28, 2\"); // index specified in 32-bit values\r\n            sb.AppendLine(\"  add x28, x28, x10\");\r\n            sb.AppendLine(\"  sd x28, (x5)\");\r\n            sb.AppendLine(\"  addi x5, x5, 8\"); // next element\r\n            sb.AppendLine(\"  addi x7, x7, 2\"); // list size is given in 32-bit elements\r\n            sb.AppendLine(\"  blt x7, x11, preplatencyarr_loop\");\r\n            sb.AppendLine(\"  ret\");\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/UarchTestHelpers.cs",
    "content": "﻿using System.IO;\r\nusing System.Collections.Generic;\r\nusing System.Linq;\r\nusing System.Text;\r\nusing System.Threading.Tasks;\r\n\r\nnamespace AsmGen\r\n{\r\n    public static class UarchTestHelpers\r\n    {\r\n        public static int[] GenerateCountArray(int low, int high, int step)\r\n        {\r\n            List<int> countList = new List<int>();\r\n            for (int i = low; i <= high; i += step)\r\n            {\r\n                countList.Add(i);\r\n            }\r\n\r\n            return countList.ToArray();\r\n        }\r\n\r\n        public static void GenerateNasmGlobalLines(StringBuilder sb, UarchTest test)\r\n        {\r\n            int[] counts = test.Counts;\r\n            for (int i = 0; i < counts.Length; i++)\r\n                sb.AppendLine(\"global \" + test.Prefix + counts[i]);\r\n        }\r\n\r\n        public static void GenerateAsmGlobalLines(StringBuilder sb, UarchTest test)\r\n        {\r\n            int[] counts = test.Counts;\r\n            for (int i = 0; i < counts.Length; i++)\r\n                sb.AppendLine(\".global \" + test.Prefix + counts[i]);\r\n        }\r\n\r\n        public static void GenerateExternLines(StringBuilder sb, UarchTest test)\r\n        {\r\n            int[] counts = test.Counts;\r\n            for (int i = 0; i < counts.Length; i++)\r\n                sb.AppendLine(\"extern uint64_t \" + test.Prefix + counts[i] + $\"({test.FunctionDefinitionParameters}) __attribute((sysv_abi));\"); ;\r\n        }\r\n\r\n        public static void GenerateVsExternLines(StringBuilder sb, UarchTest test)\r\n        {\r\n            int[] counts = test.Counts;\r\n            for (int i = 0; i < counts.Length; i++)\r\n                sb.AppendLine(\"extern \\\"C\\\" uint64_t \" + test.Prefix + counts[i] + $\"({test.FunctionDefinitionParameters});\");\r\n        }\r\n\r\n        /// <summary>\r\n        /// Generates test functions in assembly, with filler instructions between two divs\r\n        /// 
Args are put into rcx, rdx, r8 (in that order) to match Windows calling convention\r\n        /// </summary>\r\n        /// <param name=\"sb\">StringBuilder to append to</param>\r\n        /// <param name=\"counts\">Sizes to test the structure at</param>\r\n        /// <param name=\"funcNamePrefix\">Function name prefix</param>\r\n        /// <param name=\"fillerInstrs1\">Filler instructions after first ptr chasing load</param>\r\n        /// <param name=\"fillerInstrs2\">Filler instructions after second ptr chasing load</param>\r\n        /// <param name=\"includePtrChasingLoads\">If true, count pointer chasing loads as consuming the tested resource\r\n        /// (i.e. ptr chasing loads consume a ROB and integer RF slot) </param>\r\n        /// <param name=\"initInstrs\">Any extra initialization instructions</param>\r\n        public static void GenerateX86AsmDivStructureTestFuncs(StringBuilder sb, int[] counts, string funcNamePrefix, string[] fillerInstrs1, string[] fillerInstrs2, bool includePtrChasingLoads = true, string initInstrs = null)\r\n        {\r\n            for (int i = 0; i < counts.Length; i++)\r\n            {\r\n                string funcName = funcNamePrefix + counts[i];\r\n                sb.AppendLine(\"\\n\" + funcName + \":\");\r\n                sb.AppendLine(\"  push %rsi\");\r\n                sb.AppendLine(\"  push %rdi\");\r\n                sb.AppendLine(\"  push %r15\");\r\n                sb.AppendLine(\"  push %r14\");\r\n                sb.AppendLine(\"  push %r13\");\r\n                sb.AppendLine(\"  push %r12\");\r\n                sb.AppendLine(\"  push %r11\");\r\n                sb.AppendLine(\"  push %r8\");\r\n                sb.AppendLine(\"  push %r9\");\r\n                sb.AppendLine(\"  push %rcx\");\r\n                sb.AppendLine(\"  push %rdx\");\r\n\r\n                // arguments are in RDI, RSI, RDX, RCX, R8, and R9\r\n                // move them into familiar windows argument regs (rcx, rdx, r8)\r\n        
        sb.AppendLine(\"  mov %rcx, %r9\"); // r9 <- rcx\r\n                sb.AppendLine(\"  mov %rdx, %r8\"); // r8 <- rdx\r\n                sb.AppendLine(\"  mov %rsi, %rdx\"); // rdx <- rsi\r\n                sb.AppendLine(\"  mov %rdi, %rcx\"); // rcx <- rdi\r\n\r\n                sb.AppendLine(\"  xor %r15, %r15\");\r\n                sb.AppendLine(\"  mov $0x10, %r14\");\r\n                sb.AppendLine(\"  mov $0x20, %r13\");\r\n                sb.AppendLine(\"  mov $0x30, %r12\");\r\n                sb.AppendLine(\"  mov $0x40, %r11\");\r\n\r\n                if (initInstrs != null) sb.AppendLine(initInstrs);\r\n\r\n                sb.AppendLine(\"  mov %rdx, %rdi\");\r\n                sb.AppendLine(\"  mov %rdx, %rsi\");\r\n                sb.AppendLine(\"\\n\" + funcName + \"start:\");\r\n\r\n                // keep dividing list size by itself\r\n                sb.AppendLine(\"  xor %rdx, %rdx\");\r\n                sb.AppendLine(\"  mov %rdi, %rax\");\r\n                sb.AppendLine(\"  idiv %rsi\");\r\n                sb.AppendLine(\"  xor %rdx, %rdx\");\r\n                sb.AppendLine(\"  idiv %rsi\");\r\n                sb.AppendLine(\"  xor %rdx, %rdx\");\r\n                sb.AppendLine(\"  idiv %rsi\");\r\n                sb.AppendLine(\"  xor %rdx, %rdx\");\r\n                sb.AppendLine(\"  idiv %rsi\");\r\n                sb.AppendLine(\"  xor %rdx, %rdx\");\r\n                sb.AppendLine(\"  idiv %rsi\");\r\n                sb.AppendLine(\"  xor %rdx, %rdx\");\r\n                sb.AppendLine(\"  idiv %rsi\");\r\n                sb.AppendLine(\"  sub %rax, %rsi\");\r\n                sb.AppendLine(\"  inc %rsi\");\r\n\r\n                // rdx is the remainder, rax is the quotient\r\n                int fillerInstrCount = includePtrChasingLoads ? 
counts[i] - 2 : counts[i];\r\n                for (int fillerIdx = 0, instrIdx = 0; fillerIdx < fillerInstrCount; fillerIdx++)\r\n                {\r\n                    sb.AppendLine(fillerInstrs1[instrIdx]);\r\n                    instrIdx = (instrIdx + 1) % fillerInstrs1.Length;\r\n                }\r\n\r\n                sb.AppendLine(\"  xor %rdx, %rdx\");\r\n                sb.AppendLine(\"  mov %rsi, %rax\");\r\n                sb.AppendLine(\"  idiv %rdi\");\r\n                sb.AppendLine(\"  xor %rdx, %rdx\");\r\n                sb.AppendLine(\"  idiv %rdi\");\r\n                sb.AppendLine(\"  xor %rdx, %rdx\");\r\n                sb.AppendLine(\"  idiv %rdi\");\r\n                sb.AppendLine(\"  xor %rdx, %rdx\");\r\n                sb.AppendLine(\"  idiv %rdi\");\r\n                sb.AppendLine(\"  xor %rdx, %rdx\");\r\n                sb.AppendLine(\"  idiv %rdi\");\r\n                sb.AppendLine(\"  xor %rdx, %rdx\");\r\n                sb.AppendLine(\"  idiv %rdi\");\r\n                sb.AppendLine(\"  sub %rax, %rdi\");\r\n                sb.AppendLine(\"  inc %rdi\");\r\n                for (int fillerIdx = 0, instrIdx = 0; fillerIdx < fillerInstrCount; fillerIdx++)\r\n                {\r\n                    sb.AppendLine(fillerInstrs2[instrIdx]);\r\n                    instrIdx = (instrIdx + 1) % fillerInstrs2.Length;\r\n                }\r\n\r\n                sb.AppendLine(\"  dec %rcx\");\r\n                sb.AppendLine(\"  jne \" + funcName + \"start\");\r\n                sb.AppendLine(\"  pop %rdx\");\r\n                sb.AppendLine(\"  pop %rcx\");\r\n                sb.AppendLine(\"  pop %r9\");\r\n                sb.AppendLine(\"  pop %r8\");\r\n                sb.AppendLine(\"  pop %r11\");\r\n                sb.AppendLine(\"  pop %r12\");\r\n                sb.AppendLine(\"  pop %r13\");\r\n                sb.AppendLine(\"  pop %r14\");\r\n                sb.AppendLine(\"  pop %r15\");\r\n                sb.AppendLine(\"  
pop %rdi\");\r\n                sb.AppendLine(\"  pop %rsi\");\r\n                sb.AppendLine(\"  ret\\n\\n\");\r\n            }\r\n        }\r\n\r\n        public static void GenerateX86AsmDivNsqTestFuncs(StringBuilder sb,\r\n            int maxSize,\r\n            int[] counts,\r\n            string funcNamePrefix,\r\n            string[] depInstrs,\r\n            string[] indepInstrs,\r\n            bool divsInSq = false,\r\n            string initInstrs = null)\r\n        {\r\n            for (int i = 0; i < counts.Length; i++)\r\n            {\r\n                string funcName = funcNamePrefix + counts[i];\r\n                sb.AppendLine(\"\\n\" + funcName + \":\");\r\n                sb.AppendLine(\"  push %rsi\");\r\n                sb.AppendLine(\"  push %rdi\");\r\n                sb.AppendLine(\"  push %r15\");\r\n                sb.AppendLine(\"  push %r14\");\r\n                sb.AppendLine(\"  push %r13\");\r\n                sb.AppendLine(\"  push %r12\");\r\n                sb.AppendLine(\"  push %r11\");\r\n                sb.AppendLine(\"  push %r8\");\r\n                sb.AppendLine(\"  push %rcx\");\r\n                sb.AppendLine(\"  push %rdx\");\r\n\r\n                // arguments are in RDI, RSI, RDX, RCX, R8, and R9\r\n                // move them into familiar windows argument regs (rcx, rdx, r8)\r\n                sb.AppendLine(\"  mov %rdx, %r8\"); // r8 <- rdx\r\n                sb.AppendLine(\"  mov %rsi, %rdx\"); // rdx <- rsi\r\n                sb.AppendLine(\"  mov %rdi, %rcx\"); // rcx <- rdi\r\n\r\n                sb.AppendLine(\"  xor %r15, %r15\");\r\n                sb.AppendLine(\"  mov $0x10, %r14\");\r\n                sb.AppendLine(\"  mov $0x20, %r13\");\r\n                sb.AppendLine(\"  mov $0x30, %r12\");\r\n                sb.AppendLine(\"  mov $0x40, %r11\");\r\n\r\n                if (initInstrs != null) sb.AppendLine(initInstrs);\r\n\r\n                sb.AppendLine(\"  mov %rdx, %rdi\");\r\n                
sb.AppendLine(\"  mov %rdx, %rsi\");\r\n                sb.AppendLine(\"\\n\" + funcName + \"start:\");\r\n\r\n                // keep dividing list size by itself\r\n                sb.AppendLine(\"  xor %rdx, %rdx\");\r\n                sb.AppendLine(\"  mov %rdi, %rax\");  // divide rdi by rsi\r\n                sb.AppendLine(\"  idiv %rsi\");\r\n                sb.AppendLine(\"  xor %rdx, %rdx\");\r\n                sb.AppendLine(\"  idiv %rsi\");\r\n                sb.AppendLine(\"  xor %rdx, %rdx\");\r\n                sb.AppendLine(\"  idiv %rsi\");\r\n                sb.AppendLine(\"  xor %rdx, %rdx\");\r\n                sb.AppendLine(\"  idiv %rsi\");\r\n                sb.AppendLine(\"  xor %rdx, %rdx\");\r\n                sb.AppendLine(\"  idiv %rsi\");\r\n                sb.AppendLine(\"  xor %rdx, %rdx\");\r\n                sb.AppendLine(\"  idiv %rsi\");\r\n                sb.AppendLine(\"  sub %rax, %rsi\");\r\n                sb.AppendLine(\"  inc %rsi\");\r\n\r\n                // rdx is the remainder, rax is the quotient\r\n                int fillerInstrCount = divsInSq ? 
counts[i] - 6 : counts[i];\r\n                for (int fillerIdx = 0, depInstrIdx = 0, indepInstrIdx = 0; fillerIdx < maxSize; fillerIdx++)\r\n                {\r\n                    if (fillerIdx < fillerInstrCount)\r\n                    {\r\n                        sb.AppendLine(depInstrs[depInstrIdx]);\r\n                        depInstrIdx = (depInstrIdx + 1) % depInstrs.Length;\r\n                    }\r\n                    else\r\n                    {\r\n                        sb.AppendLine(indepInstrs[indepInstrIdx]);\r\n                        indepInstrIdx = (indepInstrIdx + 1) % indepInstrs.Length;\r\n                    }\r\n                }\r\n\r\n                sb.AppendLine(\"  xor %rdx, %rdx\");\r\n                sb.AppendLine(\"  mov %rsi, %rax\");  // divide rsi by rdi\r\n                sb.AppendLine(\"  idiv %rdi\");\r\n                sb.AppendLine(\"  xor %rdx, %rdx\");\r\n                sb.AppendLine(\"  idiv %rdi\");\r\n                sb.AppendLine(\"  xor %rdx, %rdx\");\r\n                sb.AppendLine(\"  idiv %rdi\");\r\n                sb.AppendLine(\"  xor %rdx, %rdx\");\r\n                sb.AppendLine(\"  idiv %rdi\");\r\n                sb.AppendLine(\"  xor %rdx, %rdx\");\r\n                sb.AppendLine(\"  idiv %rdi\");\r\n                sb.AppendLine(\"  xor %rdx, %rdx\");\r\n                sb.AppendLine(\"  idiv %rdi\");\r\n                sb.AppendLine(\"  sub %rax, %rdi\");\r\n                sb.AppendLine(\"  inc %rdi\");\r\n\r\n                for (int fillerIdx = 0, depInstrIdx = 0, indepInstrIdx = 0; fillerIdx < maxSize; fillerIdx++)\r\n                {\r\n                    if (fillerIdx < fillerInstrCount)\r\n                    {\r\n                        sb.AppendLine(depInstrs[depInstrIdx]);\r\n                        depInstrIdx = (depInstrIdx + 1) % depInstrs.Length;\r\n                    }\r\n                    else\r\n                    {\r\n                        
sb.AppendLine(indepInstrs[indepInstrIdx]);\r\n                        indepInstrIdx = (indepInstrIdx + 1) % indepInstrs.Length;\r\n                    }\r\n                }\r\n\r\n                sb.AppendLine(\"  dec %rcx\");\r\n                sb.AppendLine(\"  jne \" + funcName + \"start\");\r\n                sb.AppendLine(\"  pop %rdx\");\r\n                sb.AppendLine(\"  pop %rcx\");\r\n                sb.AppendLine(\"  pop %r8\");\r\n                sb.AppendLine(\"  pop %r11\");\r\n                sb.AppendLine(\"  pop %r12\");\r\n                sb.AppendLine(\"  pop %r13\");\r\n                sb.AppendLine(\"  pop %r14\");\r\n                sb.AppendLine(\"  pop %r15\");\r\n                sb.AppendLine(\"  pop %rdi\");\r\n                sb.AppendLine(\"  pop %rsi\");\r\n                sb.AppendLine(\"  ret\\n\\n\");\r\n            }\r\n        }\r\n\r\n        public static void GenerateX86AsmStructureTestFuncs(StringBuilder sb,\r\n            int[] counts,\r\n            string funcNamePrefix,\r\n            string[] fillerInstrs1,\r\n            string[] fillerInstrs2,\r\n            bool includePtrChasingLoads = true,\r\n            string initInstrs = null,\r\n            string postLoadInstrs1 = null,\r\n            string postLoadInstrs2 = null,\r\n            bool lfence = true,\r\n            string cleanupInstrs = null)\r\n        {\r\n            for (int i = 0; i < counts.Length; i++)\r\n            {\r\n                string funcName = funcNamePrefix + counts[i];\r\n                sb.AppendLine(\"\\n\" + funcName + \":\");\r\n                sb.AppendLine(\"  push %rsi\");\r\n                sb.AppendLine(\"  push %rdi\");\r\n                sb.AppendLine(\"  push %r15\");\r\n                sb.AppendLine(\"  push %r14\");\r\n                sb.AppendLine(\"  push %r13\");\r\n                sb.AppendLine(\"  push %r12\");\r\n                sb.AppendLine(\"  push %r11\");\r\n                sb.AppendLine(\"  push %r8\");\r\n        
        sb.AppendLine(\"  push %rcx\");\r\n                sb.AppendLine(\"  push %rdx\");\r\n\r\n                // arguments are in RDI, RSI, RDX, RCX, R8, and R9\r\n                // move them into familiar windows argument regs (rcx, rdx, r8)\r\n                sb.AppendLine(\"  mov %rdx, %r8\"); // r8 <- rdx\r\n                sb.AppendLine(\"  mov %rsi, %rdx\"); // rdx <- rsi\r\n                sb.AppendLine(\"  mov %rdi, %rcx\"); // rcx <- rdi\r\n\r\n                sb.AppendLine(\"  xor %r15, %r15\");\r\n                sb.AppendLine(\"  mov $0x1, %r14\");\r\n                sb.AppendLine(\"  mov $0x2, %r13\");\r\n                sb.AppendLine(\"  mov $0x3, %r12\");\r\n                sb.AppendLine(\"  mov $0x4, %r11\");\r\n\r\n                if (initInstrs != null) sb.AppendLine(initInstrs);\r\n\r\n                sb.AppendLine(\"  xor %rdi, %rdi\");\r\n                sb.AppendLine(\"  mov $0x40, %esi\");\r\n                sb.AppendLine(\"  mov (%rdx,%rdi,4), %edi\");\r\n                sb.AppendLine(\"  mov (%rdx,%rsi,4), %esi\");\r\n                sb.AppendLine(\"\\n\" + funcName + \"start:\");\r\n                sb.AppendLine(\"  mov (%rdx,%rdi,4), %edi\");\r\n                if (postLoadInstrs1 != null) sb.AppendLine(postLoadInstrs1);\r\n                int fillerInstrCount = includePtrChasingLoads ? 
counts[i] - 2 : counts[i];\r\n                for (int fillerIdx = 0, instrIdx = 0; fillerIdx < fillerInstrCount; fillerIdx++)\r\n                {\r\n                    sb.AppendLine(fillerInstrs1[instrIdx]);\r\n                    instrIdx = (instrIdx + 1) % fillerInstrs1.Length;\r\n                }\r\n\r\n                sb.AppendLine(\"  mov (%rdx,%rsi,4), %esi\");\r\n                if (lfence) sb.AppendLine(\"lfence\");\r\n                else\r\n                {\r\n                    if (postLoadInstrs2 != null) sb.AppendLine(postLoadInstrs2);\r\n                    for (int fillerIdx = 0, instrIdx = 0; fillerIdx < fillerInstrCount; fillerIdx++)\r\n                    {\r\n                        sb.AppendLine(fillerInstrs2[instrIdx]);\r\n                        instrIdx = (instrIdx + 1) % fillerInstrs2.Length;\r\n                    }\r\n                }\r\n\r\n                sb.AppendLine(\"  dec %rcx\");\r\n                sb.AppendLine(\"  jne \" + funcName + \"start\");\r\n                if (cleanupInstrs != null) sb.AppendLine(cleanupInstrs);\r\n                sb.AppendLine(\"  pop %rdx\");\r\n                sb.AppendLine(\"  pop %rcx\");\r\n                sb.AppendLine(\"  pop %r8\");\r\n                sb.AppendLine(\"  pop %r11\");\r\n                sb.AppendLine(\"  pop %r12\");\r\n                sb.AppendLine(\"  pop %r13\");\r\n                sb.AppendLine(\"  pop %r14\");\r\n                sb.AppendLine(\"  pop %r15\");\r\n                sb.AppendLine(\"  pop %rdi\");\r\n                sb.AppendLine(\"  pop %rsi\");\r\n                sb.AppendLine(\"  ret\\n\\n\");\r\n            }\r\n        }\r\n\r\n        /// <summary>\r\n        /// Generate test functions to see how big a scheduler is, without a NSQ\r\n        /// Dependent ops are followed by independent ops, total op count = max\r\n        /// If number of dependent ops is greater than NSQ size, indep ops can't be executed and\r\n        /// there will be a dispatch 
stall\r\n        /// </summary>\r\n        /// <param name=\"sb\">StringBuilder to append to</param>\r\n        /// <param name=\"totalOps\">number of ops between dependent loads. must be less than RF size but greater than SQ+NSQ size</param>\r\n        /// <param name=\"counts\">array of data points to test (SQ sizes in this case)</param>\r\n        /// <param name=\"funcNamePrefix\">function name prefix; each function is named prefix + count</param>\r\n        /// <param name=\"dependentInstrs\">instructions cycled through for the first counts[i] filler slots (counts[i] - 2 if ptrChasingLoadsInSq is set)</param>\r\n        /// <param name=\"indepInstrs\">instructions cycled through for the remaining filler slots, up to totalOps</param>\r\n        /// <param name=\"ptrChasingLoadsInSq\">Do ptr chasing loads occupy entries in the SQ being measured?</param>\r\n        /// <param name=\"initInstrs\">optional instructions emitted once, before the loop</param>\r\n        /// <param name=\"postLoadInstrs\">optional instructions emitted right after the first ptr chasing load in the loop</param>\r\n        public static void GenerateX86AsmNsqTestFuncs(StringBuilder sb,\r\n            int totalOps,\r\n            int[] counts,\r\n            string funcNamePrefix,\r\n            string[] dependentInstrs,\r\n            string[] indepInstrs,\r\n            bool ptrChasingLoadsInSq = false,\r\n            string initInstrs = null,\r\n            string postLoadInstrs = null)\r\n        {\r\n            for (int i = 0; i < counts.Length; i++)\r\n            {\r\n                string funcName = funcNamePrefix + counts[i];\r\n                sb.AppendLine(\"\\n\" + funcName + \":\");\r\n                sb.AppendLine(\"  push %rsi\");\r\n                sb.AppendLine(\"  push %rdi\");\r\n                sb.AppendLine(\"  push %r15\");\r\n                sb.AppendLine(\"  push %r14\");\r\n                sb.AppendLine(\"  push %r13\");\r\n                sb.AppendLine(\"  push %r12\");\r\n                sb.AppendLine(\"  push %r11\");\r\n                sb.AppendLine(\"  push %r8\");\r\n                sb.AppendLine(\"  push %rcx\");\r\n                sb.AppendLine(\"  push %rdx\");\r\n\r\n                // arguments are in RDI, RSI, RDX, RCX, R8, and R9\r\n                // move them into familiar windows argument regs (rcx, rdx, r8)\r\n                sb.AppendLine(\"  mov %rdx, %r8\"); // r8 <- rdx\r\n                
sb.AppendLine(\"  mov %rsi, %rdx\"); // rdx <- rsi\r\n                sb.AppendLine(\"  mov %rdi, %rcx\"); // rcx <- rdi\r\n\r\n                sb.AppendLine(\"  xor %r15, %r15\");\r\n                sb.AppendLine(\"  mov $0x1, %r14\");\r\n                sb.AppendLine(\"  mov $0x2, %r13\");\r\n                sb.AppendLine(\"  mov $0x3, %r12\");\r\n                sb.AppendLine(\"  mov $0x4, %r11\");\r\n\r\n                if (initInstrs != null) sb.AppendLine(initInstrs);\r\n\r\n                sb.AppendLine(\"  xor %rdi, %rdi\");\r\n                sb.AppendLine(\"  mov $0x40, %esi\");\r\n                sb.AppendLine(\"  mov (%rdx,%rdi,4), %edi\");\r\n                sb.AppendLine(\"  mov (%rdx,%rsi,4), %esi\");\r\n                sb.AppendLine(\"\\n\" + funcName + \"start:\");\r\n                sb.AppendLine(\"  mov (%rdx,%rdi,4), %edi\");\r\n                if (postLoadInstrs != null) sb.AppendLine(postLoadInstrs);\r\n                int sqInstrs = ptrChasingLoadsInSq ? counts[i] - 2 : counts[i];\r\n                for (int fillerIdx = 0, depInstrIdx = 0, indepInstrIdx = 0; fillerIdx < totalOps; fillerIdx++)\r\n                {\r\n                    if (fillerIdx < sqInstrs)\r\n                    {\r\n                        sb.AppendLine(dependentInstrs[depInstrIdx]);\r\n                        depInstrIdx = (depInstrIdx + 1) % dependentInstrs.Length;\r\n                    }\r\n                    else\r\n                    {\r\n                        sb.AppendLine(indepInstrs[indepInstrIdx]);\r\n                        indepInstrIdx = (indepInstrIdx + 1) % indepInstrs.Length;\r\n                    }\r\n                }\r\n\r\n                sb.AppendLine(\"  mov (%rdx,%rsi,4), %esi\");\r\n                sb.AppendLine(\"  lfence\");\r\n                sb.AppendLine(\"  dec %rcx\");\r\n                sb.AppendLine(\"  jne \" + funcName + \"start\");\r\n                sb.AppendLine(\"  pop %rdx\");\r\n                sb.AppendLine(\"  pop 
%rcx\");\r\n                sb.AppendLine(\"  pop %r8\");\r\n                sb.AppendLine(\"  pop %r11\");\r\n                sb.AppendLine(\"  pop %r12\");\r\n                sb.AppendLine(\"  pop %r13\");\r\n                sb.AppendLine(\"  pop %r14\");\r\n                sb.AppendLine(\"  pop %r15\");\r\n                sb.AppendLine(\"  pop %rdi\");\r\n                sb.AppendLine(\"  pop %rsi\");\r\n                sb.AppendLine(\"  ret\\n\\n\");\r\n            }\r\n        }\r\n\r\n        /// <summary>\r\n        /// Generate test functions for testing integer scheduler capacity\r\n        /// R15's value is dependent on the pointer chasing load results\r\n        /// </summary>\r\n        /// <param name=\"sb\"></param>\r\n        /// <param name=\"counts\"></param>\r\n        /// <param name=\"funcNamePrefix\"></param>\r\n        /// <param name=\"fillerInstrs1\"></param>\r\n        /// <param name=\"fillerInstrs2\"></param>\r\n        /// <param name=\"divs\"></param>\r\n        /// <param name=\"initInstrs\"></param>\r\n        public static void GenerateX86AsmIntSchedTestFuncs(StringBuilder sb, int[] counts, string funcNamePrefix, string[] fillerInstrs1, string[] fillerInstrs2, bool divs = true, string initInstrs = null)\r\n        {\r\n            for (int i = 0; i < counts.Length; i++)\r\n            {\r\n                string funcName = funcNamePrefix + counts[i];\r\n                sb.AppendLine(\"\\n\" + funcName + \":\");\r\n                sb.AppendLine(\"  push %rsi\");\r\n                sb.AppendLine(\"  push %rdi\");\r\n                sb.AppendLine(\"  push %r15\");\r\n                sb.AppendLine(\"  push %r14\");\r\n                sb.AppendLine(\"  push %r13\");\r\n                sb.AppendLine(\"  push %r12\");\r\n                sb.AppendLine(\"  push %r11\");\r\n                sb.AppendLine(\"  push %r8\");\r\n                sb.AppendLine(\"  push %rcx\");\r\n                sb.AppendLine(\"  push %rdx\");\r\n\r\n                
// arguments are in RDI, RSI, RDX, RCX, R8, and R9\r\n                // move them into familiar windows argument regs (rcx, rdx, r8)\r\n                sb.AppendLine(\"  mov %rdx, %r8\"); // r8 <- rdx\r\n                sb.AppendLine(\"  mov %rsi, %rdx\"); // rdx <- rsi\r\n                sb.AppendLine(\"  mov %rdi, %rcx\"); // rcx <- rdi\r\n\r\n                sb.AppendLine(\"  xor %r15, %r15\");\r\n                sb.AppendLine(\"  mov $0x1, %r14\");\r\n                sb.AppendLine(\"  mov $0x2, %r13\");\r\n                sb.AppendLine(\"  mov $0x3, %r12\");\r\n                sb.AppendLine(\"  mov $0x4, %r11\");\r\n\r\n                if (initInstrs != null) sb.AppendLine(initInstrs);\r\n\r\n                sb.AppendLine(\"  xor %rdi, %rdi\");\r\n                sb.AppendLine(\"  mov $0x40, %esi\");\r\n                sb.AppendLine(\"  mov (%rdx,%rdi,4), %edi\");\r\n                sb.AppendLine(\"  mov (%rdx,%rsi,4), %esi\");\r\n                sb.AppendLine(\"\\n\" + funcName + \"start:\");\r\n                sb.AppendLine(\"  mov (%rdx,%rdi,4), %edi\");\r\n                sb.AppendLine(\"  mov %rdi, %r15\");\r\n                int fillerInstrCount = divs ? 
counts[i] - 2 : counts[i];\r\n                for (int fillerIdx = 0, instrIdx = 0; fillerIdx < fillerInstrCount; fillerIdx++)\r\n                {\r\n                    sb.AppendLine(fillerInstrs1[instrIdx]);\r\n                    instrIdx = (instrIdx + 1) % fillerInstrs1.Length;\r\n                }\r\n\r\n                sb.AppendLine(\"  mov (%rdx,%rsi,4), %esi\");\r\n                sb.AppendLine(\"  mov %rsi, %r15\");\r\n                for (int fillerIdx = 0, instrIdx = 0; fillerIdx < fillerInstrCount; fillerIdx++)\r\n                {\r\n                    sb.AppendLine(fillerInstrs2[instrIdx]);\r\n                    instrIdx = (instrIdx + 1) % fillerInstrs2.Length;\r\n                }\r\n\r\n                sb.AppendLine(\"  dec %rcx\");\r\n                sb.AppendLine(\"  jne \" + funcName + \"start\");\r\n                sb.AppendLine(\"  pop %rdx\");\r\n                sb.AppendLine(\"  pop %rcx\");\r\n                sb.AppendLine(\"  pop %r8\");\r\n                sb.AppendLine(\"  pop %r11\");\r\n                sb.AppendLine(\"  pop %r12\");\r\n                sb.AppendLine(\"  pop %r13\");\r\n                sb.AppendLine(\"  pop %r14\");\r\n                sb.AppendLine(\"  pop %r15\");\r\n                sb.AppendLine(\"  pop %rdi\");\r\n                sb.AppendLine(\"  pop %rsi\");\r\n                sb.AppendLine(\"  ret\\n\\n\");\r\n            }\r\n        }\r\n\r\n        /// <summary>\r\n        /// Generates pointer chasing test functions in assembly, with xmm0 set from each ptr chasing load result (cvtsi2ss)\r\n        /// xmm1-5 are loaded from the third argument before the loop and can be used by filler instructions\r\n        /// </summary>\r\n        /// <param name=\"sb\">StringBuilder to append to</param>\r\n        /// <param name=\"counts\">filler instruction counts; one function is generated per entry</param>\r\n        /// <param name=\"funcNamePrefix\">function name prefix; each function is named prefix + count</param>\r\n        /// <param name=\"fillerInstrs1\">instructions cycled through after the first ptr chasing load</param>\r\n        /// <param name=\"fillerInstrs2\">instructions cycled through after the second ptr chasing load</param>\r\n        public static void GenerateX86AsmFpSchedTestFuncs(StringBuilder sb, int[] counts, string 
funcNamePrefix, string[] fillerInstrs1, string[] fillerInstrs2)\r\n        {\r\n            for (int i = 0; i < counts.Length; i++)\r\n            {\r\n                string funcName = funcNamePrefix + counts[i];\r\n                sb.AppendLine(\"\\n\" + funcName + \":\");\r\n                sb.AppendLine(\"  push %rsi\");\r\n                sb.AppendLine(\"  push %rdi\");\r\n                sb.AppendLine(\"  push %r15\");\r\n                sb.AppendLine(\"  push %r14\");\r\n                sb.AppendLine(\"  push %r13\");\r\n                sb.AppendLine(\"  push %r12\");\r\n                sb.AppendLine(\"  push %r8\");\r\n                sb.AppendLine(\"  push %rcx\");\r\n                sb.AppendLine(\"  push %rdx\");\r\n\r\n                // arguments are in RDI, RSI, RDX, RCX, R8, and R9\r\n                // move them into familiar windows argument regs (rcx, rdx, r8)\r\n                sb.AppendLine(\"  mov %rdx, %r8\"); // r8 <- rdx\r\n                sb.AppendLine(\"  mov %rsi, %rdx\"); // rdx <- rsi\r\n                sb.AppendLine(\"  mov %rdi, %rcx\"); // rcx <- rdi\r\n\r\n                sb.AppendLine(\"  xor %r15, %r15\");\r\n                sb.AppendLine(\"  mov $0x1, %r14\");\r\n                sb.AppendLine(\"  mov $0x1, %r13\");\r\n                sb.AppendLine(\"  mov $0x3, %r12\");\r\n\r\n                // initialize some FP values off r8 (third argument)\r\n                sb.AppendLine(\"  movss (%r8), %xmm1\");\r\n                sb.AppendLine(\"  movss 4(%r8), %xmm2\");\r\n                sb.AppendLine(\"  movss 8(%r8), %xmm3\");\r\n                sb.AppendLine(\"  movss 12(%r8), %xmm4\");\r\n                sb.AppendLine(\"  movss 16(%r8), %xmm5\");\r\n\r\n                // start one chain at 0, and the other at 0x40\r\n                sb.AppendLine(\"  xor %rdi, %rdi\");\r\n                sb.AppendLine(\"  mov $0x40, %esi\");\r\n                sb.AppendLine(\"\\n\" + funcName + \"start:\");\r\n                sb.AppendLine(\"  mov 
(%rdx,%rdi,4), %edi\");\r\n                sb.AppendLine(\"  cvtsi2ss %rdi, %xmm0\");\r\n                int fillerInstrCount = counts[i];\r\n                for (int fillerIdx = 0, instrIdx = 0; fillerIdx < fillerInstrCount; fillerIdx++)\r\n                {\r\n                    sb.AppendLine(fillerInstrs1[instrIdx]);\r\n                    instrIdx = (instrIdx + 1) % fillerInstrs1.Length;\r\n                }\r\n\r\n                sb.AppendLine(\"  mov (%rdx,%rsi,4), %esi\");\r\n                sb.AppendLine(\"  cvtsi2ss %rsi, %xmm0\");\r\n                for (int fillerIdx = 0, instrIdx = 0; fillerIdx < fillerInstrCount; fillerIdx++)\r\n                {\r\n                    sb.AppendLine(fillerInstrs2[instrIdx]);\r\n                    instrIdx = (instrIdx + 1) % fillerInstrs2.Length;\r\n                }\r\n\r\n                sb.AppendLine(\"  dec %rcx\");\r\n                sb.AppendLine(\"  jne \" + funcName + \"start\");\r\n                sb.AppendLine(\"  pop %rdx\");\r\n                sb.AppendLine(\"  pop %rcx\");\r\n                sb.AppendLine(\"  pop %r8\");\r\n                sb.AppendLine(\"  pop %r12\");\r\n                sb.AppendLine(\"  pop %r13\");\r\n                sb.AppendLine(\"  pop %r14\");\r\n                sb.AppendLine(\"  pop %r15\");\r\n                sb.AppendLine(\"  pop %rdi\");\r\n                sb.AppendLine(\"  pop %rsi\");\r\n                sb.AppendLine(\"  ret\\n\\n\");\r\n            }\r\n        }\r\n\r\n        public static void GenerateX86AsmFp256SchedTestFuncs(StringBuilder sb, int[] counts, string funcNamePrefix, string[] fillerInstrs1, string[] fillerInstrs2)\r\n        {\r\n            for (int i = 0; i < counts.Length; i++)\r\n            {\r\n                string funcName = funcNamePrefix + counts[i];\r\n                sb.AppendLine(\"\\n\" + funcName + \":\");\r\n                sb.AppendLine(\"  push %rsi\");\r\n                sb.AppendLine(\"  push %rdi\");\r\n                sb.AppendLine(\" 
 push %r15\");\r\n                sb.AppendLine(\"  push %r14\");\r\n                sb.AppendLine(\"  push %r13\");\r\n                sb.AppendLine(\"  push %r12\");\r\n                sb.AppendLine(\"  push %r8\");\r\n                sb.AppendLine(\"  push %rcx\");\r\n                sb.AppendLine(\"  push %rdx\");\r\n\r\n                // arguments are in RDI, RSI, RDX, RCX, R8, and R9\r\n                // move them into familiar windows argument regs (rcx, rdx, r8)\r\n                sb.AppendLine(\"  mov %rdx, %r8\"); // r8 <- rdx\r\n                sb.AppendLine(\"  mov %rsi, %rdx\"); // rdx <- rsi\r\n                sb.AppendLine(\"  mov %rdi, %rcx\"); // rcx <- rdi\r\n\r\n                sb.AppendLine(\"  xor %r15, %r15\");\r\n                sb.AppendLine(\"  mov $0x1, %r14\");\r\n                sb.AppendLine(\"  mov $0x1, %r13\");\r\n                sb.AppendLine(\"  mov $0x3, %r12\");\r\n\r\n                // initialize some FP values off r8 (third argument)\r\n                sb.AppendLine(\"  vzeroupper\");\r\n                sb.AppendLine(\"  vmovups (%r8), %ymm1\");\r\n                sb.AppendLine(\"  vmovups 32(%r8), %ymm2\");\r\n                sb.AppendLine(\"  vmovups 64(%r8), %ymm3\");\r\n                sb.AppendLine(\"  vmovups 96(%r8), %ymm4\");\r\n                sb.AppendLine(\"  vmovups 128(%r8), %ymm5\");\r\n\r\n                // start one chain at 0, and the other at 0x40\r\n                sb.AppendLine(\"  xor %rdi, %rdi\");\r\n                sb.AppendLine(\"  mov $0x40, %esi\");\r\n                sb.AppendLine(\"\\n\" + funcName + \"start:\");\r\n                sb.AppendLine(\"  mov (%rdx,%rdi,4), %edi\");\r\n                sb.AppendLine(\"  vbroadcastss (%r8,%rdi,4), %ymm0\");\r\n                int fillerInstrCount = counts[i];\r\n                for (int fillerIdx = 0, instrIdx = 0; fillerIdx < fillerInstrCount; fillerIdx++)\r\n                {\r\n                    sb.AppendLine(fillerInstrs1[instrIdx]);\r\n           
         instrIdx = (instrIdx + 1) % fillerInstrs1.Length;\r\n                }\r\n\r\n                sb.AppendLine(\"  mov (%rdx,%rsi,4), %esi\");\r\n                sb.AppendLine(\"  vbroadcastss (%r8,%rsi,4), %ymm0\");\r\n                for (int fillerIdx = 0, instrIdx = 0; fillerIdx < fillerInstrCount; fillerIdx++)\r\n                {\r\n                    sb.AppendLine(fillerInstrs2[instrIdx]);\r\n                    instrIdx = (instrIdx + 1) % fillerInstrs2.Length;\r\n                }\r\n\r\n                sb.AppendLine(\"  dec %rcx\");\r\n                sb.AppendLine(\"  jne \" + funcName + \"start\");\r\n                sb.AppendLine(\"  pop %rdx\");\r\n                sb.AppendLine(\"  pop %rcx\");\r\n                sb.AppendLine(\"  pop %r8\");\r\n                sb.AppendLine(\"  pop %r12\");\r\n                sb.AppendLine(\"  pop %r13\");\r\n                sb.AppendLine(\"  pop %r14\");\r\n                sb.AppendLine(\"  pop %r15\");\r\n                sb.AppendLine(\"  pop %rdi\");\r\n                sb.AppendLine(\"  pop %rsi\");\r\n                sb.AppendLine(\"  ret\\n\\n\");\r\n            }\r\n        }\r\n\r\n        /// <summary>\r\n        /// Generates test functions in assembly, with filler instructions between two divs\r\n        /// Args are put into rcx, rdx, r8 (in that order) to match Windows calling convention\r\n        /// </summary>\r\n        /// <param name=\"sb\">StringBuilder to append to</param>\r\n        /// <param name=\"counts\">Sizes to test the structure at</param>\r\n        /// <param name=\"funcNamePrefix\">Function name prefix</param>\r\n        /// <param name=\"fillerInstrs1\">Filler instructions after first ptr chasing load</param>\r\n        /// <param name=\"fillerInstrs2\">Filler instructions after second ptr chasing load</param>\r\n        /// <param name=\"includePtrChasingLoads\">If true, count pointer chasing loads as consuming the tested resource\r\n        /// (i.e. 
ptr chasing loads consume a ROB and integer RF slot) </param>\r\n        /// <param name=\"initInstrs\">Any extra initialization instructions</param>\r\n        public static void GenerateX86NasmDivStructureTestFuncs(StringBuilder sb, int[] counts, string funcNamePrefix, string[] fillerInstrs1, string[] fillerInstrs2, bool includePtrChasingLoads = true, string initInstrs = null)\r\n        {\r\n            for (int i = 0; i < counts.Length; i++)\r\n            {\r\n                string funcName = funcNamePrefix + counts[i];\r\n                sb.AppendLine(\"\\n\" + funcName + \":\");\r\n                sb.AppendLine(\"  push rsi\");\r\n                sb.AppendLine(\"  push rdi\");\r\n                sb.AppendLine(\"  push r15\");\r\n                sb.AppendLine(\"  push r14\");\r\n                sb.AppendLine(\"  push r13\");\r\n                sb.AppendLine(\"  push r12\");\r\n                sb.AppendLine(\"  push r11\");\r\n\r\n                sb.AppendLine(\"  xor r15, r15\");\r\n                sb.AppendLine(\"  mov r14, 0x10\");\r\n                sb.AppendLine(\"  mov r13, 0x20\");\r\n                sb.AppendLine(\"  mov r12, 0x30\");\r\n                sb.AppendLine(\"  mov r11, 0x40\");\r\n\r\n\r\n                if (initInstrs != null) sb.AppendLine(initInstrs);\r\n\r\n                sb.AppendLine(\"  mov rdi, rdx\");\r\n                sb.AppendLine(\"  mov rsi, rdx\");\r\n                sb.AppendLine(\"\\n\" + funcName + \"start:\");\r\n                sb.AppendLine(\"  xor rdx, rdx\");\r\n                sb.AppendLine(\"  mov rax, rdi\");\r\n                sb.AppendLine(\"  idiv rsi\");\r\n                sb.AppendLine(\"  xor rdx, rdx\");\r\n                sb.AppendLine(\"  idiv rsi\");\r\n                sb.AppendLine(\"  xor rdx, rdx\");\r\n                sb.AppendLine(\"  idiv rsi\");\r\n                sb.AppendLine(\"  xor rdx, rdx\");\r\n                sb.AppendLine(\"  idiv rsi\");\r\n                sb.AppendLine(\"  xor rdx, 
rdx\");\r\n                sb.AppendLine(\"  idiv rsi\");\r\n                sb.AppendLine(\"  xor rdx, rdx\");\r\n                sb.AppendLine(\"  idiv rsi\");\r\n                sb.AppendLine(\"  sub rsi, rax\");\r\n                sb.AppendLine(\"  inc rsi\");\r\n                int fillerInstrCount = includePtrChasingLoads ? counts[i] - 2 : counts[i];\r\n                for (int fillerIdx = 0, instrIdx = 0; fillerIdx < fillerInstrCount; fillerIdx++)\r\n                {\r\n                    sb.AppendLine(fillerInstrs1[instrIdx]);\r\n                    instrIdx = (instrIdx + 1) % fillerInstrs1.Length;\r\n                }\r\n\r\n                sb.AppendLine(\"  xor rdx, rdx\");\r\n                sb.AppendLine(\"  mov rax, rsi\");\r\n                sb.AppendLine(\"  idiv rdi\");\r\n                sb.AppendLine(\"  xor rdx, rdx\");\r\n                sb.AppendLine(\"  idiv rdi\");\r\n                sb.AppendLine(\"  xor rdx, rdx\");\r\n                sb.AppendLine(\"  idiv rdi\");\r\n                sb.AppendLine(\"  xor rdx, rdx\");\r\n                sb.AppendLine(\"  idiv rdi\");\r\n                sb.AppendLine(\"  xor rdx, rdx\");\r\n                sb.AppendLine(\"  idiv rdi\");\r\n                sb.AppendLine(\"  xor rdx, rdx\");\r\n                sb.AppendLine(\"  idiv rdi\");\r\n                sb.AppendLine(\"  sub rdi, rax\");\r\n                sb.AppendLine(\"  inc rdi\");\r\n                for (int fillerIdx = 0, instrIdx = 0; fillerIdx < fillerInstrCount; fillerIdx++)\r\n                {\r\n                    sb.AppendLine(fillerInstrs2[instrIdx]);\r\n                    instrIdx = (instrIdx + 1) % fillerInstrs2.Length;\r\n                }\r\n\r\n                sb.AppendLine(\"  dec rcx\");\r\n                sb.AppendLine(\"  jne \" + funcName + \"start\");\r\n                sb.AppendLine(\"  pop r11\");\r\n                sb.AppendLine(\"  pop r12\");\r\n                sb.AppendLine(\"  pop r13\");\r\n                
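sb.AppendLine(\"  pop r14\");\r\n                sb.AppendLine(\"  pop r15\");\r\n                sb.AppendLine(\"  pop rdi\");\r\n                sb.AppendLine(\"  pop rsi\");\r\n                sb.AppendLine(\"  ret\\n\\n\");\r\n            }\r\n        }\r\n\r\n        /// <summary>\r\n        /// Generates test functions in ARM assembly.\r\n        /// Registers x10-x15 are saved, set to small constants, and can be used for integer stuff\r\n        /// Args are in x0, x1, x2 (x0 is the loop count, x1 the ptr chasing array)\r\n        /// </summary>\r\n        /// <param name=\"sb\">StringBuilder to append to</param>\r\n        /// <param name=\"counts\">Sizes to test the structure at; one function is generated per entry</param>\r\n        /// <param name=\"funcNamePrefix\">Function name prefix; each function is named prefix + count</param>\r\n        /// <param name=\"fillerInstrs1\">Filler instructions after first ptr chasing load</param>\r\n        /// <param name=\"fillerInstrs2\">Filler instructions after second ptr chasing load (not emitted when dsb is true)</param>\r\n        /// <param name=\"includePtrChasingLoads\">If true, the two ptr chasing loads count toward the total, so counts[i] - 2 filler instructions are emitted</param>\r\n        /// <param name=\"initInstrs\">Any extra initialization instructions, emitted before the loop</param>\r\n        /// <param name=\"postLoadInstrs1\">Optional instructions emitted right after the first ptr chasing load</param>\r\n        /// <param name=\"postLoadInstrs2\">Optional instructions emitted right after the second ptr chasing load (not emitted when dsb is true)</param>\r\n        /// <param name=\"dsb\">If true, emit dsb sy and isb sy after the second load (used like lfence) instead of postLoadInstrs2 and fillerInstrs2</param>\r\n        public static void GenerateArmAsmStructureTestFuncs(StringBuilder sb,\r\n            int[] counts,\r\n            string funcNamePrefix,\r\n            string[] fillerInstrs1,\r\n            string[] fillerInstrs2,\r\n            bool includePtrChasingLoads = false,\r\n            string initInstrs = null,\r\n            string postLoadInstrs1 = null,\r\n            string postLoadInstrs2 = null,\r\n            bool dsb = true)\r\n        {\r\n            for (int i = 0; i < counts.Length; i++)\r\n            {\r\n                string funcName = funcNamePrefix + counts[i];\r\n\r\n                // args in x0, x1\r\n                sb.AppendLine(\"\\n\" + funcName + \":\");\r\n                sb.AppendLine(\"  sub sp, sp, #0x50\");\r\n                sb.AppendLine(\"  stp x14, x15, [sp, #0x10]\");\r\n                sb.AppendLine(\"  stp x12, x13, [sp, #0x20]\");\r\n                sb.AppendLine(\"  stp x10, x11, [sp, #0x30]\");\r\n                sb.AppendLine(\"  stp x25, x26, [sp, #0x40]\");\r\n                sb.AppendLine(\"  mov x15, 1\");\r\n                sb.AppendLine(\"  mov x14, 2\");\r\n                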
sb.AppendLine(\"  mov x13, 3\");\r\n                sb.AppendLine(\"  mov x12, 4\");\r\n                sb.AppendLine(\"  mov x11, 5\");\r\n                sb.AppendLine(\"  mov x10, 6\");\r\n                if (initInstrs != null) sb.AppendLine(initInstrs);\r\n                sb.AppendLine(\"  mov w25, 0x0\");\r\n                sb.AppendLine(\"  mov w26, 0x40\");\r\n                sb.AppendLine(\"\\n\" + funcName + \"start:\");\r\n                sb.AppendLine(\"  ldr w25, [x1, w25, uxtw #2]\"); // current = A[current]\r\n                if (postLoadInstrs1 != null) sb.AppendLine(postLoadInstrs1);\r\n                int fillerInstrCount = includePtrChasingLoads ? counts[i] - 2 : counts[i];\r\n                for (int nopIdx = 0, addIdx = 0; nopIdx < fillerInstrCount; nopIdx++)\r\n                {\r\n                    sb.AppendLine(fillerInstrs1[addIdx]);\r\n                    addIdx = (addIdx + 1) % fillerInstrs1.Length;\r\n                }\r\n\r\n                sb.AppendLine(\"  ldr w26, [x1, w26, uxtw #2]\");\r\n                if (dsb)\r\n                {\r\n                    sb.AppendLine(\"  dsb sy\");\r\n                    sb.AppendLine(\"  isb sy\");\r\n                }\r\n                else\r\n                {\r\n                    if (postLoadInstrs2 != null) sb.AppendLine(postLoadInstrs2);\r\n                    for (int nopIdx = 0, addIdx = 0; nopIdx < fillerInstrCount; nopIdx++)\r\n                    {\r\n                        sb.AppendLine(fillerInstrs2[addIdx]);\r\n                        addIdx = (addIdx + 1) % fillerInstrs2.Length;\r\n                    }\r\n                }\r\n\r\n                sb.AppendLine(\"  sub x0, x0, 1\");\r\n                sb.AppendLine(\"  cbnz x0, \" + funcName + \"start\");\r\n                sb.AppendLine(\"  ldp x25, x26, [sp, #0x40]\");\r\n                sb.AppendLine(\"  ldp x10, x11, [sp, #0x30]\");\r\n                sb.AppendLine(\"  ldp x12, x13, [sp, #0x20]\");\r\n                
sb.AppendLine(\"  ldp x14, x15, [sp, #0x10]\");\r\n                sb.AppendLine(\"  add sp, sp, #0x50\");\r\n                sb.AppendLine(\"  ret\\n\\n\");\r\n            }\r\n        }\r\n\r\n        public static void GenerateArmAsmNsqTestFuncs(StringBuilder sb,\r\n            int totalOps,\r\n            int[] counts,\r\n            string funcNamePrefix,\r\n            string[] dependentInstrs,\r\n            string[] indepInstrs,\r\n            bool ptrChasingLoadsInSq = false,\r\n            string initInstrs = null,\r\n            string postLoadInstrs = null)\r\n        {\r\n            for (int i = 0; i < counts.Length; i++)\r\n            {\r\n                string funcName = funcNamePrefix + counts[i];\r\n\r\n                // args in x0, x1\r\n                sb.AppendLine(\"\\n\" + funcName + \":\");\r\n                sb.AppendLine(\"  sub sp, sp, #0x50\");\r\n                sb.AppendLine(\"  stp x14, x15, [sp, #0x10]\");\r\n                sb.AppendLine(\"  stp x12, x13, [sp, #0x20]\");\r\n                sb.AppendLine(\"  stp x10, x11, [sp, #0x30]\");\r\n                sb.AppendLine(\"  stp x25, x26, [sp, #0x40]\");\r\n                sb.AppendLine(\"  mov x15, 1\");\r\n                sb.AppendLine(\"  mov x14, 2\");\r\n                sb.AppendLine(\"  mov x13, 3\");\r\n                sb.AppendLine(\"  mov x12, 4\");\r\n                sb.AppendLine(\"  mov x11, 5\");\r\n                sb.AppendLine(\"  mov x10, 6\");\r\n                if (initInstrs != null) sb.AppendLine(initInstrs);\r\n                sb.AppendLine(\"  mov w25, 0x0\");\r\n                sb.AppendLine(\"  mov w26, 0x40\");\r\n                sb.AppendLine(\"\\n\" + funcName + \"start:\");\r\n                sb.AppendLine(\"  ldr w25, [x1, w25, uxtw #2]\"); // current = A[current]\r\n                if (postLoadInstrs != null) sb.AppendLine(postLoadInstrs);\r\n                int sqInstrs = ptrChasingLoadsInSq ? 
counts[i] - 2 : counts[i];\r\n                for (int fillerIdx = 0, instrIdx = 0; fillerIdx < totalOps; fillerIdx++)\r\n                {\r\n                    if (fillerIdx < sqInstrs)\r\n                        sb.AppendLine(dependentInstrs[instrIdx]);\r\n                    else\r\n                        sb.AppendLine(indepInstrs[instrIdx]);\r\n\r\n                    instrIdx = (instrIdx + 1) % dependentInstrs.Length;\r\n                }\r\n\r\n                sb.AppendLine(\"  ldr w26, [x1, w26, uxtw #2]\");\r\n                sb.AppendLine(\"  dsb sy\"); // close enough to lfence\r\n                sb.AppendLine(\"  isb sy\");\r\n                sb.AppendLine(\"  sub x0, x0, 1\");\r\n                sb.AppendLine(\"  cbnz x0, \" + funcName + \"start\");\r\n                sb.AppendLine(\"  ldp x25, x26, [sp, #0x40]\");\r\n                sb.AppendLine(\"  ldp x10, x11, [sp, #0x30]\");\r\n                sb.AppendLine(\"  ldp x12, x13, [sp, #0x20]\");\r\n                sb.AppendLine(\"  ldp x14, x15, [sp, #0x10]\");\r\n                sb.AppendLine(\"  add sp, sp, #0x50\");\r\n                sb.AppendLine(\"  ret\\n\\n\");\r\n            }\r\n        }\r\n\r\n        /// <summary>\r\n        /// Filler for todo functions\r\n        /// </summary>\r\n        /// <param name=\"sb\"></param>\r\n        /// <param name=\"counts\"></param>\r\n        /// <param name=\"funcNamePrefix\"></param>\r\n        public static void GenerateStub(StringBuilder sb, int[] counts, string funcNamePrefix)\r\n        {\r\n            for (int i = 0; i < counts.Length; i++)\r\n            {\r\n                string funcName = funcNamePrefix + counts[i];\r\n                sb.AppendLine(\"\\n\" + funcName + \":\");\r\n                sb.AppendLine(\"  ret\");\r\n            }\r\n        }\r\n\r\n        public static void GenerateArmAsmFpSchedTestFuncs(StringBuilder sb, int[] counts, string funcNamePrefix, string[] fillerInstrs1, string[] fillerInstrs2)\r\n        {\r\n       
     GenerateArmAsmStructureTestFuncs(sb,\r\n                counts,\r\n                funcNamePrefix,\r\n                fillerInstrs1,\r\n                fillerInstrs2,\r\n                false,\r\n                null,\r\n                \"  ldr s16, [x2, w25, uxtw #2]\",\r\n                \"  ldr s16, [x2, w26, uxtw #2]\");\r\n        }\r\n\r\n        public static void GenerateArmAsmDivStructureTestFuncs(StringBuilder sb,\r\n            int[] counts,\r\n            string funcNamePrefix,\r\n            string[] fillerInstrs1,\r\n            string[] fillerInstrs2,\r\n            bool includePtrChasingLoads = false,\r\n            string initInstrs = null)\r\n        {\r\n            for (int i = 0; i < counts.Length; i++)\r\n            {\r\n                string funcName = funcNamePrefix + counts[i];\r\n\r\n                // args in x0 = iterations, x1 = list size, x2 = list (sink)\r\n                sb.AppendLine(\"\\n\" + funcName + \":\");\r\n                sb.AppendLine(\"  sub sp, sp, #0x50\");\r\n                sb.AppendLine(\"  stp x14, x15, [sp, #0x10]\");\r\n                sb.AppendLine(\"  stp x12, x13, [sp, #0x20]\");\r\n                sb.AppendLine(\"  stp x10, x11, [sp, #0x30]\");\r\n                sb.AppendLine(\"  stp x25, x26, [sp, #0x40]\");\r\n                sb.AppendLine(\"  mov x15, 1\");\r\n                sb.AppendLine(\"  mov x14, 2\");\r\n                sb.AppendLine(\"  mov x13, 3\");\r\n                sb.AppendLine(\"  mov x12, 4\");\r\n                sb.AppendLine(\"  mov x11, 5\");\r\n                if (initInstrs != null) sb.AppendLine(initInstrs);\r\n                sb.AppendLine(\"  mov w25, 0x0\");\r\n                sb.AppendLine(\"  mov w26, 0x40\");\r\n                sb.AppendLine(\"\\n\" + funcName + \"start:\");\r\n                sb.AppendLine(\"  mov w25, w1\");\r\n                sb.AppendLine(\"  udiv w25, w25, w13\");\r\n                sb.AppendLine(\"  udiv w25, w25, w13\");\r\n                
sb.AppendLine(\"  udiv w25, w25, w13\");\r\n                sb.AppendLine(\"  udiv w25, w25, w13\");\r\n                sb.AppendLine(\"  udiv w25, w25, w13\");\r\n                int fillerInstrCount = includePtrChasingLoads ? counts[i] - 2 : counts[i];\r\n                for (int instrIdx = 0, addIdx = 0; instrIdx < fillerInstrCount; instrIdx++)\r\n                {\r\n                    sb.AppendLine(fillerInstrs1[addIdx]);\r\n                    addIdx = (addIdx + 1) % fillerInstrs1.Length;\r\n                }\r\n\r\n                sb.AppendLine(\"  mov w26, w1\");\r\n                sb.AppendLine(\"  udiv w26, w26, w13\");\r\n                sb.AppendLine(\"  udiv w26, w26, w13\");\r\n                sb.AppendLine(\"  udiv w26, w26, w13\");\r\n                sb.AppendLine(\"  udiv w26, w26, w13\");\r\n                sb.AppendLine(\"  udiv w26, w26, w13\");\r\n\r\n                for (int instrIdx = 0, addIdx = 0; instrIdx < fillerInstrCount; instrIdx++)\r\n                {\r\n                    sb.AppendLine(fillerInstrs2[addIdx]);\r\n                    addIdx = (addIdx + 1) % fillerInstrs2.Length;\r\n                }\r\n\r\n                sb.AppendLine(\"  sub x0, x0, 1\");\r\n                sb.AppendLine(\"  cbnz x0, \" + funcName + \"start\");\r\n                sb.AppendLine(\"  ldp x25, x26, [sp, #0x40]\");\r\n                sb.AppendLine(\"  ldp x10, x11, [sp, #0x30]\");\r\n                sb.AppendLine(\"  ldp x12, x13, [sp, #0x20]\");\r\n                sb.AppendLine(\"  ldp x14, x15, [sp, #0x10]\");\r\n                sb.AppendLine(\"  add sp, sp, #0x50\");\r\n                sb.AppendLine(\"  ret\\n\\n\");\r\n            }\r\n        }\r\n\r\n        // Just to deal with A73\r\n        public static string GetArmDependentBranch(string prefix)\r\n        {\r\n            return $\"  cmp x25, x26\\n  b.eq {prefix}_badthing\";\r\n        }\r\n\r\n        public static string GetArmDependentBranchTarget(string prefix)\r\n        {\r\n         
   return $\"{prefix}_badthing:\\n  .word 0xf7f0a000\";\r\n        }\r\n\r\n        public static string GetRiscvDependentBranch(string prefix)\r\n        {\r\n            return $\"  beq x5, x6, {prefix}_badthing\";\r\n        }\r\n\r\n        public static string GetRiscvDependentBranchTarget(string prefix)\r\n        {\r\n            return $\"{prefix}_badthing:\\n  .word 0x00000000\";\r\n        }\r\n\r\n        public static void GenerateArmAsmDivNsqTestFuncs(StringBuilder sb,\r\n            int maxSize,\r\n            int[] counts,\r\n            string funcNamePrefix,\r\n            string[] depInstrs,\r\n            string[] indepInstrs,\r\n            bool divsInSq = false,\r\n            string initInstrs = null)\r\n        {\r\n            for (int i = 0; i < counts.Length; i++)\r\n            {\r\n                string funcName = funcNamePrefix + counts[i];\r\n\r\n                // args in x0 = iterations, x1 = list size, x2 = list (sink)\r\n                sb.AppendLine(\"\\n\" + funcName + \":\");\r\n                sb.AppendLine(\"  sub sp, sp, #0x50\");\r\n                sb.AppendLine(\"  stp x14, x15, [sp, #0x10]\");\r\n                sb.AppendLine(\"  stp x12, x13, [sp, #0x20]\");\r\n                sb.AppendLine(\"  stp x10, x11, [sp, #0x30]\");\r\n                sb.AppendLine(\"  stp x25, x26, [sp, #0x40]\");\r\n                sb.AppendLine(\"  mov x15, 1\");\r\n                sb.AppendLine(\"  mov x14, 2\");\r\n                sb.AppendLine(\"  mov x13, 3\");\r\n                sb.AppendLine(\"  mov x12, 4\");\r\n                sb.AppendLine(\"  mov x11, 5\");\r\n                if (initInstrs != null) sb.AppendLine(initInstrs);\r\n                sb.AppendLine(\"  mov w25, 0x0\");\r\n                sb.AppendLine(\"  mov w26, 0x40\");\r\n                sb.AppendLine(\"\\n\" + funcName + \"start:\");\r\n                sb.AppendLine(\"  mov w25, w1\");\r\n                sb.AppendLine(\"  udiv w25, w25, w13\");\r\n                
sb.AppendLine(\"  udiv w25, w25, w13\");\r\n                sb.AppendLine(\"  udiv w25, w25, w13\");\r\n                sb.AppendLine(\"  udiv w25, w25, w13\");\r\n                sb.AppendLine(\"  udiv w25, w25, w13\");\r\n                int fillerInstrCount = divsInSq ? counts[i] - 6 : counts[i];\r\n                for (int fillerIdx = 0, depInstrIdx = 0, indepInstrIdx = 0; fillerIdx < maxSize; fillerIdx++)\r\n                {\r\n                    if (fillerIdx < fillerInstrCount)\r\n                    {\r\n                        sb.AppendLine(depInstrs[depInstrIdx]);\r\n                        depInstrIdx = (depInstrIdx + 1) % depInstrs.Length;\r\n                    }\r\n                    else\r\n                    {\r\n                        sb.AppendLine(indepInstrs[indepInstrIdx]);\r\n                        indepInstrIdx = (indepInstrIdx + 1) % indepInstrs.Length;\r\n                    }\r\n                }\r\n                sb.AppendLine(\"  mov w26, w1\");\r\n                sb.AppendLine(\"  udiv w26, w26, w13\");\r\n                sb.AppendLine(\"  udiv w26, w26, w13\");\r\n                sb.AppendLine(\"  udiv w26, w26, w13\");\r\n                sb.AppendLine(\"  udiv w26, w26, w13\");\r\n                sb.AppendLine(\"  udiv w26, w26, w13\");\r\n                sb.AppendLine(\"  mov w25, w26\");\r\n\r\n                for (int fillerIdx = 0, depInstrIdx = 0, indepInstrIdx = 0; fillerIdx < maxSize; fillerIdx++)\r\n                {\r\n                    if (fillerIdx < fillerInstrCount)\r\n                    {\r\n                        sb.AppendLine(depInstrs[depInstrIdx]);\r\n                        depInstrIdx = (depInstrIdx + 1) % depInstrs.Length;\r\n                    }\r\n                    else\r\n                    {\r\n                        sb.AppendLine(indepInstrs[indepInstrIdx]);\r\n                        indepInstrIdx = (indepInstrIdx + 1) % indepInstrs.Length;\r\n                    }\r\n                }\r\n\r\n 
               sb.AppendLine(\"  sub x0, x0, 1\");\r\n                sb.AppendLine(\"  cbnz x0, \" + funcName + \"start\");\r\n                sb.AppendLine(\"  ldp x25, x26, [sp, #0x40]\");\r\n                sb.AppendLine(\"  ldp x10, x11, [sp, #0x30]\");\r\n                sb.AppendLine(\"  ldp x12, x13, [sp, #0x20]\");\r\n                sb.AppendLine(\"  ldp x14, x15, [sp, #0x10]\");\r\n                sb.AppendLine(\"  add sp, sp, #0x50\");\r\n                sb.AppendLine(\"  ret\\n\\n\");\r\n            }\r\n        }\r\n\r\n        public static void GenerateMipsAsmStructureTestFuncs(StringBuilder sb,\r\n            int[] counts,\r\n            string funcNamePrefix,\r\n            string[] fillerInstrs1,\r\n            string[] fillerInstrs2,\r\n            bool includePtrChasingLoads = false,\r\n            string initInstrs = null,\r\n            string postLoadInstrs1 = null,\r\n            string postLoadInstrs2 = null,\r\n            bool dsb = false)\r\n        {\r\n            for (int i = 0; i < counts.Length; i++)\r\n            {\r\n                string funcName = funcNamePrefix + counts[i];\r\n\r\n                // args in r4 = iterations, r5 = list, r6 = list (sink)\r\n                // use r12 and r13 for ptr chasing loads, r14 as decrement for iteration count\r\n                sb.AppendLine(\"\\n\" + funcName + \":\");\r\n                sb.AppendLine(\"  ld.d $r12, $r5, 0\");\r\n                sb.AppendLine(\"  ld.d $r13, $r5, 64\");\r\n                sb.AppendLine(\"  xor $r14, $r14, $r14\");\r\n                sb.AppendLine(\"  addi.d $r14, $r14, 1\");\r\n                if (initInstrs != null) sb.AppendLine(initInstrs);\r\n                sb.AppendLine(\"\\n\" + funcName + \"start:\");\r\n                sb.AppendLine(\"  ld.d $r12, $r12, 0\");\r\n                if (postLoadInstrs1 != null) sb.AppendLine(postLoadInstrs1);\r\n                int fillerInstrCount = includePtrChasingLoads ? 
counts[i] - 2 : counts[i];\r\n                for (int instrIdx = 0, addIdx = 0; instrIdx < fillerInstrCount; instrIdx++)\r\n                {\r\n                    sb.AppendLine(fillerInstrs1[addIdx]);\r\n                    addIdx = (addIdx + 1) % fillerInstrs1.Length;\r\n                }\r\n                sb.AppendLine(\"  ld.d $r13, $r13, 0\");\r\n                if (postLoadInstrs2 != null) sb.AppendLine(postLoadInstrs2);\r\n                for (int instrIdx = 0, addIdx = 0; instrIdx < fillerInstrCount; instrIdx++)\r\n                {\r\n                    sb.AppendLine(fillerInstrs2[addIdx]);\r\n                    addIdx = (addIdx + 1) % fillerInstrs2.Length;\r\n                }\r\n                sb.AppendLine(\"  sub.d $r4, $r4, $r14\");\r\n                sb.AppendLine(\"  bnez $r4, \" + funcName + \"start\");\r\n                sb.AppendLine(\" jr $r1\");\r\n            }\r\n        }\r\n\r\n        public static void GenerateRiscvAsmStructureTestFuncs(StringBuilder sb,\r\n            int[] counts,\r\n            string funcNamePrefix,\r\n            string[] fillerInstrs1,\r\n            string[] fillerInstrs2,\r\n            bool includePtrChasingLoads = false,\r\n            string initInstrs = null,\r\n            string postLoadInstrs1 = null,\r\n            string postLoadInstrs2 = null,\r\n            bool fence = true)\r\n        {\r\n            for (int i = 0; i < counts.Length; i++)\r\n            {\r\n                string funcName = funcNamePrefix + counts[i];\r\n\r\n                // args in x10 = iterations, x11 = list, x12 = list (sink)\r\n                // temporaries are x5-x7, x28-x31\r\n                // x18-27 are to be saved\r\n                // use x5 and x6 for ptr chasing loads\r\n                sb.AppendLine(\"\\n\" + funcName + \":\");\r\n                sb.AppendLine(\"  addi sp, sp, -88\");\r\n                sb.AppendLine(\"  sd x18, 0(sp)\");\r\n                sb.AppendLine(\"  sd x19, 8(sp)\");\r\n             
   sb.AppendLine(\"  sd x20, 16(sp)\");\r\n                sb.AppendLine(\"  sd x21, 24(sp)\");\r\n                sb.AppendLine(\"  sd x22, 32(sp)\");\r\n                sb.AppendLine(\"  sd x23, 40(sp)\");\r\n                sb.AppendLine(\"  sd x24, 48(sp)\");\r\n                sb.AppendLine(\"  sd x25, 56(sp)\");\r\n                sb.AppendLine(\"  sd x26, 64(sp)\");\r\n                sb.AppendLine(\"  sd x27, 72(sp)\");\r\n\r\n                sb.AppendLine(\"  addi x28, x28, 1\");\r\n                sb.AppendLine(\"  addi x29, x29, 1\");\r\n                sb.AppendLine(\"  addi x30, x30, 1\");\r\n                sb.AppendLine(\"  addi x31, x31, 1\");\r\n                sb.AppendLine(\"  addi x18, x18, 2\");\r\n                sb.AppendLine(\"  addi x19, x19, 3\");\r\n                sb.AppendLine(\"  addi x20, x20, 4\");\r\n                sb.AppendLine(\"  addi x22, x21, 5\");\r\n\r\n                sb.AppendLine(\"  ld x5, (x11)\");\r\n                sb.AppendLine(\"  ld x6, 64(x11)\");\r\n\r\n                if (initInstrs != null) sb.AppendLine(initInstrs);\r\n                sb.AppendLine(\"\\n\" + funcName + \"start:\");\r\n                sb.AppendLine(\"  ld x5, (x5)\");\r\n                if (postLoadInstrs1 != null) sb.AppendLine(postLoadInstrs1);\r\n                int fillerInstrCount = includePtrChasingLoads ? 
counts[i] - 2 : counts[i];\r\n                for (int instrIdx = 0, addIdx = 0; instrIdx < fillerInstrCount; instrIdx++)\r\n                {\r\n                    sb.AppendLine(fillerInstrs1[addIdx]);\r\n                    addIdx = (addIdx + 1) % fillerInstrs1.Length;\r\n                }\r\n                sb.AppendLine(\"  ld x6, (x6)\");\r\n                if (fence) sb.AppendLine(\"  fence\");\r\n                else\r\n                {\r\n                    if (postLoadInstrs2 != null) sb.AppendLine(postLoadInstrs2);\r\n                    for (int instrIdx = 0, addIdx = 0; instrIdx < fillerInstrCount; instrIdx++)\r\n                    {\r\n                        sb.AppendLine(fillerInstrs2[addIdx]);\r\n                        addIdx = (addIdx + 1) % fillerInstrs2.Length;\r\n                    }\r\n                }\r\n\r\n                sb.AppendLine(\"  addi x10, x10, -1\");\r\n                sb.AppendLine(\"  bge x10, x0, \" + funcName + \"start\");\r\n\r\n                sb.AppendLine(\"  ld x18, 0(sp)\");\r\n                sb.AppendLine(\"  ld x19, 8(sp)\");\r\n                sb.AppendLine(\"  ld x20, 16(sp)\");\r\n                sb.AppendLine(\"  ld x21, 24(sp)\");\r\n                sb.AppendLine(\"  ld x22, 32(sp)\");\r\n                sb.AppendLine(\"  ld x23, 40(sp)\");\r\n                sb.AppendLine(\"  ld x24, 48(sp)\");\r\n                sb.AppendLine(\"  ld x25, 56(sp)\");\r\n                sb.AppendLine(\"  ld x26, 64(sp)\");\r\n                sb.AppendLine(\"  ld x27, 72(sp)\");\r\n                sb.AppendLine(\"  addi sp, sp, 88\");\r\n                sb.AppendLine(\" ret\");\r\n            }\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/tests/A73RobTest.cs",
    "content": "﻿using System.Collections.Generic;\r\nusing System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    /// <summary>\r\n    /// Looking for reordering capacity limits on A73 by combining several different instruction types\r\n    /// </summary>\r\n    public class A73RobTest : UarchTest\r\n    {\r\n        public A73RobTest(int low, int high, int step)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"a73rob\";\r\n            this.Description = \"Mixed integer/vec128 + stores\";\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr, float *floatArr\";\r\n            this.GetFunctionCallParameters = \"structIterations, A, fpArr\";\r\n            this.DivideTimeByCount = false;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.aarch64) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.aarch64)\r\n            {\r\n                string postLoadInstrs = UarchTestHelpers.GetArmDependentBranch(this.Prefix);\r\n                string initInstrs = \"  ldr q0, [x1]\\n\" +\r\n                \"  ldr q1, [x1, #0x10]\\n\" +\r\n                \"  ldr q2, [x1, #0x20]\\n\" +\r\n                \"  ldr q3, [x1, #0x30]\\n\" +\r\n                \"  ldr q4, [x1, #0x40]\\n\";\r\n\r\n                List<string> fillerInstrs = new List<string>();\r\n                for (int i = 0; i < this.Counts[this.Counts.Length - 1];i++)\r\n                {\r\n                    if (i < 33) fillerInstrs.Add(\"  add v1.4s, v1.4s, v0.4s\");\r\n                    else if (i < 66) fillerInstrs.Add(\"  add x15, x15, x11\");\r\n                    else fillerInstrs.Add(\"  str x12, [x2]\");\r\n                }\r\n\r\n                string[] fillerInstrsArr = 
fillerInstrs.ToArray();\r\n                UarchTestHelpers.GenerateArmAsmStructureTestFuncs(\r\n                    sb, this.Counts, this.Prefix, fillerInstrsArr, fillerInstrsArr, false, initInstrs, postLoadInstrs1: postLoadInstrs, postLoadInstrs2: postLoadInstrs);\r\n                sb.AppendLine(UarchTestHelpers.GetArmDependentBranchTarget(this.Prefix));\r\n            }\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/tests/AddLoopTest.cs",
    "content": "﻿using System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class AddLoopTest : UarchTest\r\n    {\r\n        /// <summary>\r\n        ///\r\n        /// </summary>\r\n        /// <param name=\"low\">must be greater than 2</param>\r\n        /// <param name=\"high\"></param>\r\n        /// <param name=\"step\"></param>\r\n        public AddLoopTest(int low, int high, int step)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"addloop\";\r\n            this.Description = $\"ADD throughput for various loop sizes. Avoids NOP fusing\";\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations\";\r\n            this.GetFunctionCallParameters = \"structIterations\";\r\n            this.DivideTimeByCount = true;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64) return true;\r\n            if (isa == IUarchTest.ISA.aarch64) return true;\r\n            if (isa == IUarchTest.ISA.mips64) return false;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64) GenerateX86GccAsm(sb);\r\n            if (isa == IUarchTest.ISA.aarch64) GenerateArmAsm(sb);\r\n        }\r\n\r\n        public void GenerateX86GccAsm(StringBuilder sb)\r\n        {\r\n            string[] unrolledAdds = new string[4];\r\n            unrolledAdds[0] = \"  add %r11, %r15\";\r\n            unrolledAdds[1] = \"  add %r11, %r14\";\r\n            unrolledAdds[2] = \"  add %r11, %r13\";\r\n            unrolledAdds[3] = \"  add %r11, %r12\";\r\n\r\n            for (int i = 0; i < Counts.Length; i++)\r\n            {\r\n                string funcName = this.Prefix + this.Counts[i];\r\n                sb.AppendLine(funcName + \":\");\r\n\r\n                // count dec, jnz as 
instructions in the loop\r\n                for (int nopIdx = 0; nopIdx < this.Counts[i] - 2; nopIdx++) sb.AppendLine(unrolledAdds[nopIdx & 3]);\r\n                sb.AppendLine(\"  dec %rdi\");\r\n                sb.AppendLine(\"  jnz \" + funcName);\r\n                sb.AppendLine(\"  ret\");\r\n            }\r\n        }\r\n\r\n        public void GenerateArmAsm(StringBuilder sb)\r\n        {\r\n            string[] unrolledAdds = new string[4];\r\n            unrolledAdds[0] = \"  add x15, x15, x11\";\r\n            unrolledAdds[1] = \"  add x14, x14, x11\";\r\n            unrolledAdds[2] = \"  add x13, x13, x11\";\r\n            unrolledAdds[3] = \"  add x12, x12, x11\";\r\n\r\n            for (int i = 0; i < Counts.Length; i++)\r\n            {\r\n                string funcName = this.Prefix + this.Counts[i];\r\n                sb.AppendLine(funcName + \":\");\r\n\r\n                for (int nopIdx = 0; nopIdx < this.Counts[i] - 2; nopIdx++) sb.AppendLine(unrolledAdds[nopIdx & 3]);\r\n                sb.AppendLine(\"  sub x0, x0, 1\");\r\n                sb.AppendLine(\"  cbnz x0, \" + funcName);\r\n                sb.AppendLine(\"  ret\");\r\n            }\r\n        }\r\n    }\r\n}"
  },
  {
    "path": "AsmGen/tests/AddNsq.cs",
    "content": "﻿using System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class AddNsq : UarchTest\r\n    {\r\n        private int totalOps;\r\n        public AddNsq(int low, int high, int step, int totalOps)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"addnsq\" + totalOps;\r\n            this.Description = \"Integer adds, excluding possible NSQ\";\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr, float *floatArr\";\r\n            this.GetFunctionCallParameters = \"structIterations, A, fpArr\";\r\n            this.DivideTimeByCount = false;\r\n            this.totalOps = totalOps;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            // if (isa == IUarchTest.ISA.aarch64) return true;\r\n            if (isa == IUarchTest.ISA.amd64) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64)\r\n            {\r\n                string[] depInstrs = new string[2];\r\n                depInstrs[0] = \"  add %rdi, %r15\";\r\n                depInstrs[1] = \"  add %rdi, %r14\";\r\n\r\n                string[] indepInstrs = new string[2];\r\n                indepInstrs[0] = \"  add %r13, %r11\";\r\n                indepInstrs[1] = \"  add %r12, %r11\";\r\n                UarchTestHelpers.GenerateX86AsmNsqTestFuncs(sb, this.totalOps, this.Counts, this.Prefix, depInstrs, indepInstrs, false);\r\n            }\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/tests/AddSchedTest.cs",
    "content": "﻿using System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class AddSchedTest : UarchTest\r\n    {\r\n        public AddSchedTest(int low, int high, int step)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"addsched\";\r\n            this.Description = \"Scheduler, Integer Adds\";\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr\";\r\n            this.GetFunctionCallParameters = \"structIterations, A\";\r\n            this.DivideTimeByCount = false;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64) return true;\r\n            if (isa == IUarchTest.ISA.aarch64) return true;\r\n            if (isa == IUarchTest.ISA.mips64) return true;\r\n            if (isa == IUarchTest.ISA.riscv) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64)\r\n            {\r\n                string[] unrolledAdds = new string[4];\r\n                unrolledAdds[0] = \"  add %rdi, %r15\";\r\n                unrolledAdds[1] = \"  add %rdi, %r14\";\r\n                unrolledAdds[2] = \"  add %rdi, %r13\";\r\n                unrolledAdds[3] = \"  add %rdi, %r12\";\r\n                UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: false);\r\n            }\r\n            else if (isa == IUarchTest.ISA.aarch64)\r\n            {\r\n                string[] unrolledAdds = new string[4];\r\n                unrolledAdds[0] = \"  add x15, x15, x25\";\r\n                unrolledAdds[1] = \"  add x14, x14, x25\";\r\n                unrolledAdds[2] = \"  add x13, x13, x25\";\r\n                unrolledAdds[3] = \"  add x12, x12, x25\";\r\n               
 UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: false);\r\n            }\r\n            else if (isa == IUarchTest.ISA.mips64)\r\n            {\r\n                string[] unrolledAdds = new string[4];\r\n                unrolledAdds[0] = \"  add.d $r15, $r15, $r12\";\r\n                unrolledAdds[1] = \"  add.d $r16, $r16, $r12\";\r\n                unrolledAdds[2] = \"  add.d $r17, $r17, $r12\";\r\n                unrolledAdds[3] = \"  add.d $r18, $r18, $r12\";\r\n\r\n                string[] unrolledAdds1 = new string[4];\r\n                unrolledAdds1[0] = \"  add.d $r15, $r15, $r13\";\r\n                unrolledAdds1[1] = \"  add.d $r16, $r16, $r13\";\r\n                unrolledAdds1[2] = \"  add.d $r17, $r17, $r13\";\r\n                unrolledAdds1[3] = \"  add.d $r18, $r18, $r13\";\r\n                UarchTestHelpers.GenerateMipsAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds1, includePtrChasingLoads: false);\r\n            }\r\n            else if (isa == IUarchTest.ISA.riscv)\r\n            {\r\n                string[] unrolledAdds = new string[4];\r\n                unrolledAdds[0] = \"  add x30, x30, x5\";\r\n                unrolledAdds[1] = \"  add x29, x29, x5\";\r\n                unrolledAdds[2] = \"  add x28, x28, x5\";\r\n                unrolledAdds[3] = \"  add x31, x31, x5\";\r\n\r\n                string[] unrolledAdds1 = new string[4];\r\n                unrolledAdds1[0] = \"  add x30, x30, x6\";\r\n                unrolledAdds1[1] = \"  add x31, x31, x6\";\r\n                unrolledAdds1[2] = \"  add x28, x28, x6\";\r\n                unrolledAdds1[3] = \"  add x29, x29, x6\";\r\n                UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds1, false);\r\n            }\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/tests/AddvNsq.cs",
    "content": "﻿using System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class AddvNsq : UarchTest\r\n    {\r\n        private int totalOps;\r\n        public AddvNsq(int low, int high, int step, int totalOps)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"addvnsq\";\r\n            this.Description = \"ADDV, excluding possible NSQ\";\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr, float *floatArr\";\r\n            this.GetFunctionCallParameters = \"structIterations, A, fpArr\";\r\n            this.DivideTimeByCount = false;\r\n            this.totalOps = totalOps;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.aarch64) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.aarch64)\r\n            {\r\n                string postLoadInstrs1 = \"  ldr d16, [x2, w25, sxtw #0]\";\r\n                string initInstrs = \"  ldr d15, [x2]\";\r\n                string[] depInstrs = new string[4];\r\n                depInstrs[0] = \"  addv h1, v16.4h\";\r\n                depInstrs[1] = \"  addv h2, v16.4h\";\r\n                depInstrs[2] = \"  addv h3, v16.4h\";\r\n                depInstrs[3] = \"  addv h4, v16.4h\";\r\n\r\n                string[] indepInstrs = new string[4];\r\n                indepInstrs[0] = \"  addv h1, v15.4h\";\r\n                indepInstrs[1] = \"  addv h2, v15.4h\";\r\n                indepInstrs[2] = \"  addv h3, v15.4h\";\r\n                indepInstrs[3] = \"  addv h4, v15.4h\";\r\n                UarchTestHelpers.GenerateArmAsmNsqTestFuncs(sb, this.totalOps, this.Counts, this.Prefix, depInstrs, indepInstrs, false, initInstrs,\r\n                    postLoadInstrs: postLoadInstrs1);\r\n            
}\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/tests/AddvSchedTest.cs",
    "content": "﻿using System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class AddvSched : UarchTest\r\n    {\r\n        public AddvSched(int low, int high, int step)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"addvsched\";\r\n            this.Description = \"ADDV Scheduler\";\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr, float *floatArr\";\r\n            this.GetFunctionCallParameters = \"structIterations, A, fpArr\";\r\n            this.DivideTimeByCount = false;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.aarch64) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.aarch64)\r\n            {\r\n                string postLoadInstrs1 = \"  ldr q16, [x2, w25, sxtw #0]\";\r\n                string postLoadInstrs2 = \"  ldr q16, [x2, w25, sxtw #0]\";\r\n                string[] unrolledInstrs = new string[4];\r\n                unrolledInstrs[0] = \"  addv h1, v16.4h\";\r\n                unrolledInstrs[1] = \"  addv h2, v16.4h\";\r\n                unrolledInstrs[2] = \"  addv h3, v16.4h\";\r\n                unrolledInstrs[3] = \"  addv h4, v16.4h\";\r\n                UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledInstrs, unrolledInstrs, false, null,\r\n                    postLoadInstrs1: postLoadInstrs1, postLoadInstrs2: postLoadInstrs2);\r\n            }\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/tests/AeseSchedTest.cs",
    "content": "﻿using System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class AeseSchedTest : UarchTest\r\n    {\r\n        public AeseSchedTest(int low, int high, int step)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"aesesched\";\r\n            this.Description = \"aese scheduler\";\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr, float *floatArr\";\r\n            this.GetFunctionCallParameters = \"structIterations, A, fpArr\";\r\n            this.DivideTimeByCount = false;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64) return true;\r\n            if (isa == IUarchTest.ISA.aarch64) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64)\r\n            {\r\n                string[] unrolledAdds = new string[4];\r\n                unrolledAdds[0] = \"  aesenc %xmm0, %xmm1\";\r\n                unrolledAdds[1] = \"  aesenc %xmm0, %xmm2\";\r\n                unrolledAdds[2] = \"  aesenc %xmm0, %xmm3\";\r\n                unrolledAdds[3] = \"  aesenc %xmm0, %xmm4\";\r\n\r\n                UarchTestHelpers.GenerateX86AsmFpSchedTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds);\r\n            }\r\n\r\n            if (isa == IUarchTest.ISA.aarch64)\r\n            {\r\n                string postLoadInstrs1 = \"  ldr q0, [x2, w25, uxtw#0]\";\r\n                string postLoadInstrs2 = \"  ldr q0, [x2, w26, uxtw#0]\";\r\n                string[] unrolledAdds = new string[4];\r\n                unrolledAdds[0] = \"  aese v1.16b, v0.16b\";\r\n                unrolledAdds[1] = \"  aese v2.16b, v0.16b\";\r\n                unrolledAdds[2] = \"  aese v3.16b, v0.16b\";\r\n                unrolledAdds[3] 
= \"  aese v4.16b, v0.16b\";\r\n                UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, false, null, postLoadInstrs1, postLoadInstrs2);\r\n            }\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/tests/AesencNsq.cs",
    "content": "﻿using System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class AesencNsq : UarchTest\r\n    {\r\n        private int totalOps;\r\n        public AesencNsq(int low, int high, int step, int totalOps)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"aesencnsq\" + totalOps;\r\n            this.Description = \"AESENC, excluding possible NSQ\";\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr, float *floatArr\";\r\n            this.GetFunctionCallParameters = \"structIterations, A, fpArr\";\r\n            this.DivideTimeByCount = false;\r\n            this.totalOps = totalOps;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            // if (isa == IUarchTest.ISA.aarch64) return true;\r\n            if (isa == IUarchTest.ISA.amd64) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64)\r\n            {\r\n                string postLoadInstrs = \"  mov %rdi, %r15\\n  add %r8, %r15\\n  movdqu (%r15), %xmm1\";\r\n                string initInstrs = \"  movdqu (%r8), %xmm2\";\r\n                string[] depInstrs = new string[4];\r\n                depInstrs[0] = \"  aesenc %xmm1, %xmm0\";\r\n                depInstrs[1] = \"  aesenc %xmm1, %xmm3\";\r\n                depInstrs[2] = \"  aesenc %xmm1, %xmm4\";\r\n                depInstrs[3] = \"  aesenc %xmm1, %xmm5\";\r\n\r\n                string[] indepInstrs = new string[2];\r\n                indepInstrs[0] = \"  aesenc %xmm2, %xmm6\";\r\n                indepInstrs[1] = \"  aesenc %xmm2, %xmm7\";\r\n                UarchTestHelpers.GenerateX86AsmNsqTestFuncs(sb, this.totalOps, this.Counts, this.Prefix, depInstrs, indepInstrs, false, initInstrs, postLoadInstrs);\r\n            }\r\n            
else if (isa == IUarchTest.ISA.aarch64)\r\n            {\r\n                string postLoadInstrs1 = \"  ldr s16, [x2, w25, uxtw #2]\";\r\n                string initInstrs = \"  ldr s15, [x2]\";\r\n                string[] depInstrs = new string[4];\r\n                depInstrs[0] = \"  fadd s0, s0, s16\";\r\n                depInstrs[1] = \"  fadd s1, s1, s16\";\r\n                depInstrs[2] = \"  fadd s2, s2, s16\";\r\n                depInstrs[3] = \"  fadd s3, s3, s16\";\r\n\r\n                string[] indepInstrs = new string[4];\r\n                indepInstrs[0] = \"  fadd s17, s17, s15\";\r\n                indepInstrs[1] = \"  fadd s18, s18, s15\";\r\n                indepInstrs[2] = \"  fadd s19, s19, s15\";\r\n                indepInstrs[3] = \"  fadd s20, s20, s15\";\r\n                UarchTestHelpers.GenerateArmAsmNsqTestFuncs(sb, this.totalOps, this.Counts, this.Prefix, depInstrs, indepInstrs, false, initInstrs,\r\n                    postLoadInstrs: postLoadInstrs1);\r\n            }\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/tests/BranchBufferTest.cs",
    "content": "﻿using System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class BranchBufferTest : UarchTest\r\n    {\r\n        private bool mixNops;\r\n        private bool initialDependentBranch;\r\n        public BranchBufferTest(int low, int high, int step, bool mixNops = false, bool initialDependentBranch = false)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"bob\" + (initialDependentBranch ? \"db\" : string.Empty);\r\n            this.Description = \"Branch Order Buffer Test (not-taken branches pending retire)\" + (initialDependentBranch ? \", preceded by dependent branch\" : string.Empty); ;\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr\";\r\n            this.GetFunctionCallParameters = \"structIterations, A\";\r\n            this.DivideTimeByCount = false;\r\n            this.mixNops = mixNops;\r\n            this.initialDependentBranch = initialDependentBranch;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (this.initialDependentBranch && isa != IUarchTest.ISA.aarch64) return false;\r\n            if (isa == IUarchTest.ISA.amd64) return true;\r\n            if (isa == IUarchTest.ISA.aarch64) return true;\r\n            if (isa == IUarchTest.ISA.mips64) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64)\r\n            {\r\n                GenerateX86GccAsm(sb);\r\n            }\r\n            else if (isa == IUarchTest.ISA.aarch64)\r\n            {\r\n                GenerateArmAsm(sb);\r\n                if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetArmDependentBranchTarget(this.Prefix));\r\n            }\r\n            else if (isa == IUarchTest.ISA.mips64)\r\n            {\r\n                
GenerateMipsAsm(sb);\r\n            }\r\n        }\r\n\r\n        public void GenerateX86GccAsm(StringBuilder sb)\r\n        {\r\n            for (int i = 0; i < Counts.Length; i++)\r\n            {\r\n                string funcName = Prefix + Counts[i];\r\n                sb.AppendLine(\"\\n\" + funcName + \":\");\r\n                sb.AppendLine(\"  push %rsi\");\r\n                sb.AppendLine(\"  push %rdi\");\r\n                sb.AppendLine(\"  push %r15\");\r\n                sb.AppendLine(\"  push %r14\");\r\n                sb.AppendLine(\"  push %r13\");\r\n                sb.AppendLine(\"  push %r12\");\r\n                sb.AppendLine(\"  push %r11\");\r\n                sb.AppendLine(\"  push %r8\");\r\n                sb.AppendLine(\"  push %rcx\");\r\n                sb.AppendLine(\"  push %rdx\");\r\n\r\n                // arguments are in RDI, RSI, RDX, RCX, R8, and R9\r\n                // move them into familiar windows argument regs (rcx, rdx, r8)\r\n                sb.AppendLine(\"  mov %rdx, %r8\"); // r8 <- rdx\r\n                sb.AppendLine(\"  mov %rsi, %rdx\"); // rdx <- rsi\r\n                sb.AppendLine(\"  mov %rdi, %rcx\"); // rcx <- rdi\r\n\r\n                sb.AppendLine(\"  xor %r15, %r15\");\r\n                sb.AppendLine(\"  mov $0x1, %r14\");\r\n                sb.AppendLine(\"  mov $0x2, %r13\");\r\n                sb.AppendLine(\"  mov $0x3, %r12\");\r\n                sb.AppendLine(\"  mov $0x4, %r11\");\r\n\r\n                sb.AppendLine(\"  xor %rdi, %rdi\");\r\n                sb.AppendLine(\"  mov $0x40, %esi\");\r\n                sb.AppendLine(\"  mov (%rdx,%rdi,4), %edi\");\r\n                sb.AppendLine(\"  mov (%rdx,%rsi,4), %esi\");\r\n                sb.AppendLine(\"\\n\" + funcName + \"start:\");\r\n                sb.AppendLine(\"  mov (%rdx,%rdi,4), %edi\");\r\n                for (int fillerIdx = 0; fillerIdx < Counts[i]; fillerIdx++)\r\n                {\r\n                    string jumpLabel = 
$\"{funcName}_edi_target{fillerIdx}\";\r\n                    sb.AppendLine($\"  cmp %r14, %r11\");\r\n                    sb.AppendLine($\"  je {jumpLabel}\");\r\n                    // try to space the jumps out a bit\r\n                    if (this.mixNops) sb.AppendLine($\"  nop\");\r\n                    sb.AppendLine($\"{jumpLabel}:\");\r\n                }\r\n\r\n                sb.AppendLine(\"  mov (%rdx,%rsi,4), %esi\");\r\n                for (int fillerIdx = 0; fillerIdx < Counts[i]; fillerIdx++)\r\n                {\r\n                    string jumpLabel = $\"{funcName}_esi_target{fillerIdx}\";\r\n                    sb.AppendLine($\"  cmp %r14, %r11\");\r\n                    sb.AppendLine($\"  je {jumpLabel}\");\r\n                    if (this.mixNops) sb.AppendLine($\"  nop\");\r\n                    // try to space the jumps out a bit\r\n                    sb.AppendLine($\"{jumpLabel}:\");\r\n                }\r\n\r\n                sb.AppendLine(\"  dec %rcx\");\r\n                sb.AppendLine(\"  jne \" + funcName + \"start\");\r\n                sb.AppendLine(\"  pop %rdx\");\r\n                sb.AppendLine(\"  pop %rcx\");\r\n                sb.AppendLine(\"  pop %r8\");\r\n                sb.AppendLine(\"  pop %r11\");\r\n                sb.AppendLine(\"  pop %r12\");\r\n                sb.AppendLine(\"  pop %r13\");\r\n                sb.AppendLine(\"  pop %r14\");\r\n                sb.AppendLine(\"  pop %r15\");\r\n                sb.AppendLine(\"  pop %rdi\");\r\n                sb.AppendLine(\"  pop %rsi\");\r\n                sb.AppendLine(\"  ret\\n\\n\");\r\n            }\r\n        }\r\n\r\n        public void GenerateArmAsm(StringBuilder sb)\r\n        {\r\n            string dependentBranch = this.initialDependentBranch ? 
UarchTestHelpers.GetArmDependentBranch(this.Prefix) : null;\r\n            for (int i = 0; i < Counts.Length; i++)\r\n            {\r\n                string funcName = Prefix + Counts[i];\r\n\r\n                sb.AppendLine(\"\\n\" + funcName + \":\");\r\n                sb.AppendLine(\"  sub sp, sp, #0x50\");\r\n                sb.AppendLine(\"  stp x14, x15, [sp, #0x10]\");\r\n                sb.AppendLine(\"  stp x12, x13, [sp, #0x20]\");\r\n                sb.AppendLine(\"  stp x10, x11, [sp, #0x30]\");\r\n                sb.AppendLine(\"  stp x25, x26, [sp, #0x40]\");\r\n                sb.AppendLine(\"  mov x15, 1\");\r\n                sb.AppendLine(\"  mov x14, 2\");\r\n                sb.AppendLine(\"  mov x13, 3\");\r\n                sb.AppendLine(\"  mov x12, 4\");\r\n                sb.AppendLine(\"  mov x11, 5\");\r\n                sb.AppendLine(\"  mov x10, 6\");\r\n\r\n                sb.AppendLine(\"  mov w25, 0x0\");\r\n                sb.AppendLine(\"  mov w26, 0x40\");\r\n                sb.AppendLine(\"\\n\" + funcName + \"start:\");\r\n                sb.AppendLine(\"  ldr w25, [x1, w25, uxtw #2]\"); // current = A[current]\r\n                if (this.initialDependentBranch) sb.AppendLine(dependentBranch);\r\n                for (int fillerIdx = 0; fillerIdx < Counts[i]; fillerIdx++)\r\n                {\r\n                    string jumpLabel = $\"{funcName}_w25_target{fillerIdx}\";\r\n                    sb.AppendLine($\"  cmp x15, x10\");\r\n                    sb.AppendLine($\"  b.eq {jumpLabel}\");\r\n                    sb.AppendLine($\"{jumpLabel}:\");\r\n                }\r\n\r\n                sb.AppendLine(\"  ldr w26, [x1, w26, uxtw #2]\");\r\n                if (this.initialDependentBranch) sb.AppendLine(dependentBranch);\r\n                for (int fillerIdx = 0; fillerIdx < Counts[i]; fillerIdx++)\r\n                {\r\n                    string jumpLabel = $\"{funcName}_w26_target{fillerIdx}\";\r\n                    
sb.AppendLine($\"  cmp x15, x10\");\r\n                    sb.AppendLine($\"  b.eq {jumpLabel}\");\r\n                    sb.AppendLine($\"{jumpLabel}:\");\r\n                }\r\n\r\n                sb.AppendLine(\"  sub x0, x0, 1\");\r\n                sb.AppendLine(\"  cbnz x0, \" + funcName + \"start\");\r\n                sb.AppendLine(\"  ldp x25, x26, [sp, #0x40]\");\r\n                sb.AppendLine(\"  ldp x10, x11, [sp, #0x30]\");\r\n                sb.AppendLine(\"  ldp x12, x13, [sp, #0x20]\");\r\n                sb.AppendLine(\"  ldp x14, x15, [sp, #0x10]\");\r\n                sb.AppendLine(\"  add sp, sp, #0x50\");\r\n                sb.AppendLine(\"  ret\\n\\n\");\r\n            }\r\n        }\r\n\r\n        public void GenerateMipsAsm(StringBuilder sb)\r\n        {\r\n            StringBuilder ntJumpTargets = new StringBuilder();\r\n            for (int i = 0; i < Counts.Length; i++)\r\n            {\r\n                string initInstrs = \"  move $r15, $r0\\n  addi.d $r15, $r15, 15\";\r\n                string funcName = this.Prefix + Counts[i];\r\n\r\n                // args in r4 = iterations, r5 = list, r6 = list (sink)\r\n                // use r12 and r13 for ptr chasing loads, r14 as decrement for iteration count\r\n                sb.AppendLine(\"\\n\" + funcName + \":\");\r\n                sb.AppendLine(\"  ld.d $r12, $r5, 0\");\r\n                sb.AppendLine(\"  ld.d $r13, $r5, 64\");\r\n                sb.AppendLine(\"  xor $r14, $r14, $r14\");\r\n                sb.AppendLine(\"  addi.d $r14, $r14, 1\");\r\n                sb.AppendLine(initInstrs);\r\n                sb.AppendLine(\"\\n\" + funcName + \"start:\");\r\n                sb.AppendLine(\"  ld.d $r12, $r12, 0\");\r\n                int fillerInstrCount = Counts[i];\r\n                for (int instrIdx = 0; instrIdx < fillerInstrCount; instrIdx++)\r\n                {\r\n                    string jumpLabel = \"dontenduphere_r12_\" + this.Prefix + \"_\" + Counts[i] + \"_\" + 
instrIdx;\r\n                    sb.AppendLine($\"  beqz $r15, {jumpLabel}\");\r\n                    ntJumpTargets.AppendLine(jumpLabel + \":\");\r\n                    ntJumpTargets.AppendLine(\"  jr $r1\");\r\n                }\r\n                sb.AppendLine(\"  ld.d $r13, $r13, 0\");\r\n                for (int instrIdx = 0; instrIdx < fillerInstrCount; instrIdx++)\r\n                {\r\n                    string jumpLabel = \"dontenduphere_r13_\" + this.Prefix + \"_\" + Counts[i] + \"_\" + instrIdx;\r\n                    sb.AppendLine($\"  beqz $r15, {jumpLabel}\");\r\n                    ntJumpTargets.AppendLine(jumpLabel + \":\");\r\n                    ntJumpTargets.AppendLine(\"  jr $r1\");\r\n                }\r\n                sb.AppendLine(\"  sub.d $r4, $r4, $r14\");\r\n                sb.AppendLine(\"  bnez $r4, \" + funcName + \"start\");\r\n                sb.AppendLine(\" jr $r1\");\r\n            }\r\n\r\n            sb.AppendLine(ntJumpTargets.ToString());\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/tests/BranchHistoryTest.cs",
    "content": "﻿using System.IO;\nusing System.Text;\n\nnamespace AsmGen\n{\n    public class BranchHistoryTest : IUarchTest\n    {\n        public string Prefix { get; private set; }\n\n        public string Description { get; private set; }\n\n        public string FunctionDefinitionParameters { get; private set; }\n\n        public string GetFunctionCallParameters { get; private set; }\n\n        public bool DivideTimeByCount { get; private set; }\n\n        private int[] branchCounts;\n        private int[] historyCounts;\n\n        public BranchHistoryTest()\n        {\n            Prefix = \"branchhist\";\n            Description = \"Branch predictor pattern recognition\";\n            FunctionDefinitionParameters = \"uint64_t iterations, uint32_t **arr, uint32_t arrLen\";\n            GetFunctionCallParameters = \"structIterations\";\n            DivideTimeByCount = true;\n            branchCounts = new int[] { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512 };\n            historyCounts = new int[] { 2, 4, 8, 12, 16, 24, 32, 48, 64, 96, 128, 192, 256, 512, 600, 768, 1024, 1536,\n              2048, 3072, 4096, 5120, 6144, 8192, 10240, 12288, 16384, 24567, 32768 };\n        }\n\n        public bool SupportsIsa(IUarchTest.ISA isa)\n        {\n            if (isa == IUarchTest.ISA.amd64) return true;\n            if (isa == IUarchTest.ISA.aarch64) return true;\n            if (isa == IUarchTest.ISA.mips64) return true;\n            if (isa == IUarchTest.ISA.riscv) return true;\n            return false;\n        }\n\n        public void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\n        {\n            if (isa == IUarchTest.ISA.aarch64) GenerateArmAsm(sb);\n            if (isa == IUarchTest.ISA.amd64) GenerateX86GccAsm(sb);\n            if (isa == IUarchTest.ISA.mips64) GenerateMipsAsm(sb);\n            if (isa == IUarchTest.ISA.riscv) GenerateRiscvAsm(sb);\n        }\n\n        public void GenerateArmAsm(StringBuilder sb)\n        {\n            for (int i = 0; 
i < branchCounts.Length; i++)\n            {\n                string functionLabel = Prefix + branchCounts[i];\n                string loopLabel = functionLabel + \"_loop\";\n                sb.AppendLine(\"\\n\" + functionLabel + \":\");\n                sb.AppendLine(\"  sub sp, sp, #0x40\");\n                sb.AppendLine(\"  stp x11, x12, [sp, #0x30]\");\n                sb.AppendLine(\"  stp x15, x16, [sp, #0x20]\");\n                sb.AppendLine(\"  stp x13, x14, [sp, #0x10]\");\n                sb.AppendLine(\"  eor x16, x16, x16\");\n                sb.AppendLine(\"  eor x15, x15, x15\");\n                sb.AppendLine(\"  eor x12, x12, x12\");\n                sb.AppendLine(\"  eor x11, x11, x11\");\n\n                // w14 = branch index, w16 = pattern array index\n                sb.AppendLine(loopLabel + \":\");\n                sb.AppendLine(\"  eor w14, w14, w14\");\n\n                // generate branch blocks\n                for (int branchCount = 0; branchCount < branchCounts[i]; branchCount++)\n                {\n                    string jumpTarget = functionLabel + branchCounts[i] + \"_zero\" + branchCount;\n                    sb.AppendLine(\"  ldr x15, [x1, w14, uxtw #3]\");\n                    sb.AppendLine(\"  add w14, w14, 1\");\n                    sb.AppendLine(\"  ldr w13, [x15, w16, uxtw #2]\");\n                    sb.AppendLine($\"  cbnz x13, {jumpTarget}\");\n                    sb.AppendLine(\"  add x12, x12, 1\");\n                    sb.AppendLine(jumpTarget + \":\");\n                }\n\n                // increment w16, and basically cmov 0 -> w16 if w16 = list length\n                sb.AppendLine(\"  add w16, w16, 1\");\n                sb.AppendLine(\"  cmp w16, w2\");\n                sb.AppendLine(\"  csel w16, w11, w16, EQ\");\n                sb.AppendLine(\"  sub x0, x0, 1\");\n                sb.AppendLine($\"  cbnz x0, {loopLabel}\");\n                sb.AppendLine(\"  mov x0, x12\");\n                
sb.AppendLine(\"  ldp x11, x12, [sp, #0x30]\");\n                sb.AppendLine(\"  ldp x15, x16, [sp, #0x20]\");\n                sb.AppendLine(\"  ldp x13, x14, [sp, #0x10]\");\n                sb.AppendLine(\"  add sp, sp, #0x40\");\n                sb.AppendLine(\"  ret\");\n            }\n        }\n\n        public void GenerateX86GccAsm(StringBuilder sb)\n        {\n            for (int i = 0; i < branchCounts.Length; i++)\n            {\n                string functionLabel = Prefix + branchCounts[i];\n                sb.AppendLine(\"\\n\" + functionLabel + \":\");\n                sb.AppendLine(\"  push %rbx\");\n                sb.AppendLine(\"  push %r8\");\n                sb.AppendLine(\"  push %r9\");\n                sb.AppendLine(\"  xor %rbx, %rbx\");\n                sb.AppendLine(\"  xor %r8, %r8\");\n                sb.AppendLine(\"  xor %r9, %r9\");\n\n                string loopLabel = functionLabel + \"_loop\";\n                sb.AppendLine(\"\\n\" + loopLabel + \":\");\n                sb.AppendLine(\"  xor %r11, %r11\"); // set index into arr of arrs to 0\n                for (int branchCount = 0; branchCount < branchCounts[i]; branchCount++)\n                {\n                    sb.AppendLine(\"  mov (%rsi,%r11,8), %r10\");  // load array base pointer into r10\n                    sb.AppendLine(\"  inc %r11\");\n                    sb.AppendLine(\"  mov (%r10,%rbx,4), %eax \"); // read element from branch history test array\n                    sb.AppendLine(\"  test %eax, %eax\");\n\n                    // conditional branch on test array value\n                    string zeroLabel = Prefix + branchCounts[i] + \"_zero\" + branchCount;\n                    sb.AppendLine(\"  jz \" + zeroLabel);\n                    sb.AppendLine(\"  inc %r8\"); // r8 is just a sink here\n                    sb.AppendLine(zeroLabel + \":\");\n                }\n\n                // loop around in pattern history test array if necessary\n                // 
avoiding an extra branch to not pollute BPU history\n                sb.AppendLine(\"  inc %rbx\");\n                sb.AppendLine(\"  cmp %rbx, %rdx\");\n                sb.AppendLine(\"  cmove %r9, %rbx\");\n\n                // end of main loop over iteration count\n                sb.AppendLine(\"  dec %rdi\");\n                sb.AppendLine(\"  jnz \" + loopLabel);\n\n                // function epilogue\n                sb.AppendLine(\"  mov %r8, %rax\");\n                sb.AppendLine(\"  pop %r9\");\n                sb.AppendLine(\"  pop %r8\");\n                sb.AppendLine(\"  pop %rbx\");\n                sb.AppendLine(\"  ret\");\n            }\n        }\n\n        public void GenerateMipsAsm(StringBuilder sb)\n        {\n            // Generate an array of branch history test functions, one for each branch count\n            for (int i = 0; i < branchCounts.Length; i++)\n            {\n                // branchtestFunc(iterations, testArrToArr, historyLen)\n                // r4 = iterations, r5 = array of pointers to pattern arrays for each branch, r6 = history length (length of each array)\n                // temporary registers: r12-r20\n\n                // write code here\n                string functionLabel = Prefix + branchCounts[i];\n                sb.AppendLine(\"\\n\" + functionLabel + \":\");\n\n                // r12 = branch index, r13 = index into pattern array\n                sb.AppendLine(\"  move $r13, $r0\");\n                sb.AppendLine(\"  move $r18, $r0\");\n                sb.AppendLine(\"  move $r20, $r0\");\n                sb.AppendLine(\"  addi.d $r20, $r20, 1\");\n\n                string loopLabel = functionLabel + \"_loop\";\n                sb.AppendLine(\"\\n\" + loopLabel + \":\");\n                sb.AppendLine(\"  move $r12, $r0\"); // set branch index to zero\n\n                // generate branch blocks\n                for (int branchCount = 0; branchCount < branchCounts[i]; branchCount++)\n                {\n 
                   string jumpTarget = functionLabel + branchCounts[i] + \"_zero\" + branchCount;\n\n                    // load the branch's pattern array\n                    sb.AppendLine(\"  alsl.d $r14, $r12, $r0, 0x3\");    // get offset into array in bytes, using r12 as array index.\n                    sb.AppendLine(\"  add.d $r14, $r14, $r5\");          // get address into r14\n                    sb.AppendLine(\"  ld.d $r15, $r14, 0\");             // r15 = base address of current branch's target array\n                    sb.AppendLine(\"  addi.d $r12, $r12, 1\");            // next branch\n\n                    // load element from pattern array indicating where we should branch\n                    sb.AppendLine(\"  alsl.d $r16, $r13, $r0, 0x2\");    // use r13 to index into pattern array\n                    sb.AppendLine(\"  add.d $r16, $r16, $r15\");         // r16 = address of element we want to load\n                    sb.AppendLine(\"  ld.w $r17, $r16, 0\");\n                    sb.AppendLine($\"  bnez $r17, {jumpTarget}\");       // branch if 1\n                    sb.AppendLine(\"  addi.d $r18, $r18, 1\");\n                    sb.AppendLine(jumpTarget + \":\");\n                }\n\n                // increment r13, and basically cmov 0 -> r13 if r13 = list length\n                // increment r13 (idx into pattern array)\n                sb.AppendLine(\"  addi.d $r13, $r13, 1\");\n                sb.AppendLine(\"  sub.d $r19, $r6, $r13\");    // r19 = history length - index\n                sb.AppendLine(\"  maskeqz $r13, $r13, $r19\"); // set index back to 0 to repeat pattern, if history length - index == 0\n                sb.AppendLine(\"  sub.d $r4, $r4, $r20\");            // decrement iteration count\n                sb.AppendLine($\"  bnez $r4, {loopLabel}\");\n                sb.AppendLine(\"  move $r4, $r18\"); // return the count of NT branches for tracking RNG quality\n\n                sb.AppendLine(\"  jr $r1\");\n            }\n  
      }\n\n        public void GenerateRiscvAsm(StringBuilder sb)\n        {\n            // Generate an array of branch history test functions, one for each branch count\n            for (int i = 0; i < branchCounts.Length; i++)\n            {\n                // branchtestFunc(iterations, testArrToArr, historyLen)\n                // a0 = iterations, a1 = array of pointers to pattern arrays for each branch, a2 = length of each array (history length)\n                // t0-t6 temporary registers\n\n                // write code here\n                string functionLabel = Prefix + branchCounts[i];\n                sb.AppendLine(\"\\n\" + functionLabel + \":\");\n                sb.AppendLine(\"  addi sp, sp, -16\");\n                sb.AppendLine(\"  sd s0, (sp)\");\n                // t1 = index into pattern array\n                sb.AppendLine(\"  li t1, 0\");\n                sb.AppendLine(\"  li t6, 0\");\n\n                string loopLabel = functionLabel + \"_loop\";\n                sb.AppendLine(\"\\n\" + loopLabel + \":\");\n                sb.AppendLine(\"  mv t2, a1\"); // start of array of pointers to pattern arrays\n\n                // generate branchCount blocks, each of which traverses its own array\n                for (int branchCount = 0; branchCount < branchCounts[i]; branchCount++)\n                {\n                    string jumpTarget = functionLabel + branchCounts[i] + \"_zero\" + branchCount;\n\n                    // load the branch's pattern array (a1 -> ptr -> array)\n                    sb.AppendLine(\"  ld t3, (t2)\");            // load pointer to array\n\n                    // t3 = base address of branch's array\n                    sb.AppendLine(\"  slli t4, t1, 2\");\n                    sb.AppendLine(\"  add t4, t4, t3\");\n                    sb.AppendLine(\"  lw t5, (t4)\");    // should have 1 or 0\n                    sb.AppendLine(\"  addi t2, t2, 8\"); // next branch\n                    sb.AppendLine($\"  beq t5, x0, 
{jumpTarget}\");\n                    sb.AppendLine(\"  addi t6, t6, 1\");  // dummy increment to track not-taken/taken branch ratio\n                    sb.AppendLine(jumpTarget + \":\");\n                }\n\n                sb.AppendLine(\"  addi t1, t1, 1\"); // increment array index\n                sb.AppendLine(\"  slt s0, t1, a2\"); // 1 if within range\n                sb.AppendLine(\"  mul t1, t1, s0\"); // multiply by 1 if within range, 0 otherwise\n\n                // decrement iteration count\n                sb.AppendLine(\"  addi a0, a0, -1\");\n                sb.AppendLine($\"  bne a0, x0, {loopLabel}\");\n                sb.AppendLine(\"  mv a0, t6\");\n                sb.AppendLine(\"  ld s0, (sp)\");\n                sb.AppendLine(\"  addi sp, sp, 16\");\n                sb.AppendLine(\"  ret\");\n            }\n        }\n\n        public void GenerateTestBlock(StringBuilder sb, IUarchTest.ISA isa)\n        {\n            sb.AppendLine(\"  if (argc > 1 && strcmp(test_name, \\\"\" + Prefix + \"\\\") == 0) {\");\n            sb.AppendLine(\"    printf(\\\"\" + Description + \":\\\\n\\\");\");\n            GenerateCommonTestBlock(sb);\n            sb.AppendLine(\"  }\\n\");\n        }\n\n        public void GenerateAsmGlobalLines(StringBuilder sb)\n        {\n            for (int i = 0; i < branchCounts.Length; i++)\n                sb.AppendLine(\".global \" + Prefix + branchCounts[i]);\n        }\n\n        // kinda hack this to put in initialization code we need\n        public void GenerateExternLines(StringBuilder sb)\n        {\n            for (int i = 0; i < branchCounts.Length; i++)\n                sb.AppendLine(\"extern uint64_t \" + Prefix + branchCounts[i] + $\"({FunctionDefinitionParameters}) __attribute((sysv_abi));\");\n\n            GenerateInitializationCode(sb, true);\n\n            string gccFunction = File.ReadAllText(Path.Combine(Program.DataFilesDir, \"GccBranchHistFunction.c\"));\n            sb.AppendLine(gccFunction);\n  
      }\n\n        public void GenerateInitializationCode(StringBuilder sb, bool gcc)\n        {\n            sb.AppendLine($\"uint32_t maxBranchCount = {branchCounts.Length};\");\n            sb.Append($\"uint32_t branchCounts[{branchCounts.Length}] = \");\n            sb.Append(\"{  \" + branchCounts[0]);\n            for (int i = 1; i < branchCounts.Length; i++) sb.Append(\", \" + branchCounts[i]);\n            sb.AppendLine(\" };\");\n            sb.Append($\"uint32_t branchHistoryLengths[{historyCounts.Length}] = \");\n            sb.Append(\"{  \" + historyCounts[0]);\n            for (int i = 1; i < historyCounts.Length; i++) sb.Append(\", \" + historyCounts[i]);\n            sb.AppendLine(\" };\");\n\n            if (gcc) sb.AppendLine($\"uint64_t (__attribute((sysv_abi)) *branchtestFuncArr[{branchCounts.Length}])(uint64_t iterations, uint32_t **arr, uint32_t arrLen);\");\n            else sb.AppendLine($\"uint64_t (*branchtestFuncArr[{branchCounts.Length}])(uint64_t iterations, uint32_t **arr, uint32_t arrLen);\");\n\n            sb.AppendLine(\"void initializeBranchHistFuncArr() {\");\n            for (int i = 0; i < branchCounts.Length; i++)\n            {\n                sb.AppendLine($\"  branchtestFuncArr[{i}] = {Prefix + branchCounts[i]};\");\n            }\n\n            sb.AppendLine(\"}\");\n        }\n\n        public void GenerateCommonTestBlock(StringBuilder sb)\n        {\n            string branchhistMain = File.ReadAllText(Path.Combine(Program.DataFilesDir, \"BranchhistTestBlock.c\"));\n            sb.AppendLine(branchhistMain);\n        }\n    }\n}\n"
  },
  {
    "path": "AsmGen/tests/BtbTest.cs",
    "content": "﻿using System;\r\nusing System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class BtbTest : UarchTest\r\n    {\r\n        private int spacing;\r\n        private BranchType branchType;\r\n        private bool varyspacing;\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64) return true;\r\n            if (isa == IUarchTest.ISA.aarch64) return true;\r\n            if (isa == IUarchTest.ISA.mips64) return true;\r\n            if (isa == IUarchTest.ISA.riscv) return true;\r\n            return false;\r\n        }\r\n\r\n        public enum BranchType\r\n        {\r\n            /// <summary>\r\n            /// Conditional branches that are always taken\r\n            /// </summary>\r\n            Conditional,\r\n\r\n            /// <summary>\r\n            /// Unconditional jmps\r\n            /// </summary>\r\n            Unconditional,\r\n\r\n            /// <summary>\r\n            /// A mix of both to max out Zen 2's BTB capacity\r\n            /// Optimization guide says one entry can track two branches if they're in the same 64B line\r\n            /// and the first is conditional\r\n            /// </summary>\r\n            ZenMix\r\n        }\r\n\r\n        /// <summary>\r\n        /// Constructor for BTB test\r\n        /// </summary>\r\n        /// <param name=\"spacing\">How far apart branches should be. Valid values are 4, 8, 16</param>\r\n        /// <param name=\"conditional\">If true, use conditional branches (still always taken)</param>\r\n        public BtbTest(int spacing, BranchType branchType, bool varyspacing = false)\r\n        {\r\n            this.Counts = new int[] { 1, 2, 4, 8, 16, 32, 48, 56, 64, 128, 256, 512, 768, 1024, 1536, 2048,\r\n                3072, 4096, 4608, 5120, 6144, 7168, 8192, 10240, 12288, 14336, 16384, 20480, 24576, 28672, 32768, 40960, 49152 };\r\n            this.Prefix = \"btb\" + spacing + (varyspacing ? 
\"v\" : \"\") + branchType;\r\n            this.Description = $\"Branch Target Buffer, \" + branchType + $\" branch every {spacing} bytes \" + (varyspacing ? \" (varied spacing)\" : \"\");\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations\";\r\n            this.GetFunctionCallParameters = \"structIterations\";\r\n            this.DivideTimeByCount = true;\r\n            this.spacing = spacing;\r\n            this.branchType = branchType;\r\n            this.varyspacing = varyspacing;\r\n        }\r\n\r\n        private string GetBranchFuncName(int branchCount) { return Prefix + branchCount; }\r\n        public string GetLabelName(string funcName, int part) { return funcName + \"part\" + part; }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64)\r\n            {\r\n                GenerateX86GccAsm(sb);\r\n            }\r\n            else if (isa == IUarchTest.ISA.aarch64)\r\n            {\r\n                GenerateArmAsm(sb);\r\n            }\r\n            else if (isa == IUarchTest.ISA.mips64)\r\n            {\r\n                GenerateMipsAsm(sb);\r\n            }\r\n            else if (isa == IUarchTest.ISA.riscv)\r\n            {\r\n                GenerateRiscvAsm(sb);\r\n            }\r\n        }\r\n\r\n        public void GenerateX86GccAsm(StringBuilder sb)\r\n        {\r\n            string paddingAlign = \"  .align \" + spacing;\r\n            int spacingNops = 0;\r\n            for (int i = 0; i < Counts.Length; i++)\r\n            {\r\n                string funcName = GetBranchFuncName(Counts[i]);\r\n                //sb.AppendLine(\"; Start of function for branch count \" + branchCounts[i] + \" padding \" + paddings[p]);\r\n                sb.AppendLine(funcName + \":\\n\");\r\n                sb.AppendLine(\"  xor %rax, %rax\");\r\n\r\n                if (branchType == BranchType.ZenMix) sb.AppendLine(\"  .align 64\");\r\n       
         for (int branchIdx = 1; branchIdx < Counts[i]; branchIdx++)\r\n                {\r\n                    string labelName = GetLabelName(funcName, branchIdx);\r\n\r\n                    if (branchType == BranchType.Conditional)\r\n                    {\r\n                        sb.AppendLine(\"  test %rax, %rax\");\r\n                        sb.AppendLine(\"  jz \" + labelName); // should always be set\r\n                    }\r\n                    else if (branchType == BranchType.Unconditional)\r\n                    {\r\n                        sb.AppendLine(\"  jmp \" + labelName);\r\n                    }\r\n                    else if (branchType == BranchType.ZenMix)\r\n                    {\r\n                        if ((branchIdx & 0x1) == 0)\r\n                        {\r\n                            sb.AppendLine(\"  jmp \" + labelName);\r\n                        }\r\n                        else\r\n                        {\r\n                            sb.AppendLine(\"  test %rax, %rax\");\r\n                            sb.AppendLine(\"  jz \" + labelName);\r\n                        }\r\n                    }\r\n\r\n                    sb.AppendLine(paddingAlign);\r\n\r\n                    if (varyspacing)\r\n                    {\r\n                        for (int nopIdx = 0; nopIdx < spacingNops; nopIdx++)\r\n                        {\r\n                            sb.AppendLine(\"  nop\");\r\n                        }\r\n                        spacingNops++;\r\n                        if (spacingNops > 6) spacingNops = 0;\r\n                    }\r\n\r\n                    sb.AppendLine(labelName + \":\");\r\n                }\r\n\r\n                sb.AppendLine(\"  dec %rdi\");\r\n                sb.AppendLine(\"  jne \" + funcName);\r\n                sb.AppendLine(\"  ret\\n\\n\");\r\n\r\n                // don't let it get too close to the next branch\r\n                sb.AppendLine(paddingAlign);\r\n            }\r\n        
}\r\n\r\n        private string Get4BNopAlign()\r\n        {\r\n            string paddingAlign = \"\";\r\n            if (spacing == 8)\r\n            {\r\n                paddingAlign = \"  nop\";\r\n            }\r\n            else if (spacing == 16)\r\n            {\r\n                paddingAlign = \"  nop\\n  nop\\n  nop\";\r\n            }\r\n            else if (spacing == 32)\r\n            {\r\n                paddingAlign = \"  nop\\n  nop\\n  nop\\n  nop\\n  nop\\n  nop\\n  nop\";\r\n            }\r\n            else if (spacing == 64)\r\n            {\r\n                paddingAlign = \"  nop\\n  nop\\n  nop\\n  nop\\n  nop\\n  nop\\n  nop\\n\";\r\n                paddingAlign += \"  nop\\n  nop\\n  nop\\n  nop\\n  nop\\n  nop\\n  nop\\n  nop\";\r\n            }\r\n            else if (spacing != 4)\r\n            {\r\n                Console.WriteLine($\"Unsupported padding value {spacing}\");\r\n                throw new NotImplementedException(\"Unsupported padding value\");\r\n            }\r\n\r\n            return paddingAlign;\r\n        }\r\n\r\n        public void GenerateArmAsm(StringBuilder sb)\r\n        {\r\n            // things are 4 bytes on aarch64\r\n            string paddingAlign = Get4BNopAlign();\r\n\r\n            for (int i = 0; i < Counts.Length; i++)\r\n            {\r\n                string funcName = GetBranchFuncName(Counts[i]);\r\n                string funcTargetName = GetBranchFuncName(Counts[i]) + \"_itarget\";\r\n                sb.AppendLine(funcName + \":\");\r\n                sb.AppendLine($\"  adrp x2, {funcName}\");\r\n                sb.AppendLine($\"  add x2, x2, :lo12:{funcName}\");\r\n                sb.AppendLine(\"  mov x1, 1\");\r\n                sb.AppendLine(\".align 16\");\r\n                sb.AppendLine(funcTargetName + \":\");\r\n                for (int branchIdx = 1; branchIdx < Counts[i]; branchIdx++)\r\n                {\r\n                    string labelName = GetLabelName(funcName, 
branchIdx);\r\n                    if (branchType == BranchType.Unconditional)\r\n                        sb.AppendLine(\"  b \" + labelName);\r\n                    else if (branchType == BranchType.Conditional)\r\n                        sb.AppendLine(\"  cbnz x1, \" + labelName); // x1 = 1 from earlier, should never be zero\r\n                    else if (branchType == BranchType.ZenMix)\r\n                    {\r\n                        if ((branchIdx & 0x1) == 0) sb.AppendLine(\"  b \" + labelName);\r\n                        else sb.AppendLine(\"  cbnz x1, \" + labelName);\r\n                    }\r\n\r\n                    sb.AppendLine(paddingAlign);\r\n                    sb.AppendLine(labelName + \":\");\r\n                }\r\n\r\n                sb.AppendLine(paddingAlign);\r\n                sb.AppendLine(\"  sub x0, x0, 1\");\r\n\r\n                // aarch64 is a mess. try to avoid 'relocation truncated to fit' issues with an indirect branch\r\n                if (spacing * Counts[i] >= (1024 * 1024 - 20))\r\n                {\r\n                    string workaroundTarget = funcName + \"_aarch64_indirect_workaround\";\r\n\r\n                    // jump over indirect branch to return, on zero\r\n                    // this branch should be not taken for all except the last iteration, and should have minimal\r\n                    // impact on results because a predicted NT branch is sort of 'free' on most architectures\r\n                    sb.AppendLine(\"  cbz x0, \" + workaroundTarget);\r\n                    sb.AppendLine(\"  br x2\");\r\n                    sb.AppendLine(workaroundTarget + \":\");\r\n                }\r\n                else\r\n                {\r\n                    sb.AppendLine(\"  cbnz x0, \" + funcTargetName);\r\n                }\r\n\r\n                sb.AppendLine(\"  ret\\n\\n\");\r\n\r\n                // don't let it get too close to the next branch\r\n                sb.AppendLine(paddingAlign);\r\n            
}\r\n        }\r\n\r\n        public void GenerateMipsAsm(StringBuilder sb)\r\n        {\r\n            string paddingAlign = Get4BNopAlign();\r\n            for (int i = 0; i < Counts.Length; i++)\r\n            {\r\n                string funcName = GetBranchFuncName(Counts[i]);\r\n                string funcTargetName = GetBranchFuncName(Counts[i]) + \"_itarget\";\r\n\r\n                sb.AppendLine(funcName + \":\");\r\n                sb.AppendLine(\"  xor $r12, $r12, $r12\");\r\n                sb.AppendLine(\"  addi.d $r12, $r12, 1\");\r\n                sb.AppendLine(\"  xor $r13, $r13, $r13\");\r\n                sb.AppendLine(\"  la $r14, \" + funcTargetName);\r\n                sb.AppendLine(funcTargetName + \":\");\r\n                for (int branchIdx = 1; branchIdx < Counts[i]; branchIdx++)\r\n                {\r\n                    string labelName = GetLabelName(funcName, branchIdx);\r\n                    sb.AppendLine(\"  beqz $r13, \" + labelName);\r\n                    sb.AppendLine(paddingAlign);\r\n                    sb.AppendLine(labelName + \":\");\r\n                }\r\n\r\n                sb.AppendLine(\"  sub.d $r4, $r4, $r12\"); // decrement iteration count\r\n\r\n                int distance = spacing * Counts[i];\r\n                if (distance < 1024)\r\n                {\r\n                    sb.AppendLine(\"  bnez $r4, \" + funcTargetName); // short branch if we're not too far away\r\n                }\r\n                else\r\n                {\r\n                    string workaroundTarget = funcName + \"_mips_indirect_workaround\";\r\n                    sb.AppendLine(\"  beqz $r4, \" + workaroundTarget); // jump over indirect branch if iteration count is reached\r\n                    sb.AppendLine(\"  jr $r14\"); // jump back to target (start of loop)\r\n                    sb.AppendLine(workaroundTarget + \":\");\r\n                }\r\n\r\n                sb.AppendLine(\"  jr $r1\");\r\n            }\r\n        
}\r\n\r\n        private string GetRiscvNopAlign()\r\n        {\r\n            // branch takes 16 bits (2 bytes)\r\n            int paddingNeeded = spacing - 2;\r\n\r\n            // each NOP is 2 bytes\r\n            StringBuilder nopSb = new StringBuilder();\r\n            for (int i = 0; i < paddingNeeded; i += 2)\r\n            {\r\n                nopSb.AppendLine(\"  nop\");\r\n            }\r\n\r\n            return nopSb.ToString();\r\n        }\r\n\r\n        public void GenerateRiscvAsm(StringBuilder sb)\r\n        {\r\n            string paddingAlign = GetRiscvNopAlign();\r\n            for (int i = 0; i < Counts.Length; i++)\r\n            {\r\n                string funcName = GetBranchFuncName(Counts[i]);\r\n                string funcTargetName = GetBranchFuncName(Counts[i]) + \"_itarget\";\r\n\r\n                sb.AppendLine(funcName + \":\");\r\n                sb.AppendLine(\"  la x5, \" + funcTargetName);\r\n                sb.AppendLine(funcTargetName + \":\");\r\n                for (int branchIdx = 1; branchIdx < Counts[i]; branchIdx++)\r\n                {\r\n                    string labelName = GetLabelName(funcName, branchIdx);\r\n                    sb.AppendLine(\"  j \" + labelName);\r\n                    sb.AppendLine(paddingAlign);\r\n                    sb.AppendLine(labelName + \":\");\r\n                }\r\n\r\n                sb.AppendLine(\"  addi x10, x10, -1\"); // decrement iteration count\r\n\r\n                int distance = spacing * Counts[i];\r\n                if (distance < 1024)\r\n                {\r\n                    sb.AppendLine(\"  bne x10, x0, \" + funcTargetName); // short branch if we're not too far away\r\n                }\r\n                else\r\n                {\r\n                    string workaroundTarget = funcName + \"_riscv_indirect_workaround\";\r\n                    sb.AppendLine(\"  beq x10, x0, \" + workaroundTarget); // jump over indirect branch if iteration count is reached\r\n        
            sb.AppendLine(\"  jalr x0, x5\"); // jump back to target (start of loop)\r\n                    sb.AppendLine(workaroundTarget + \":\");\r\n                }\r\n\r\n                sb.AppendLine(\"  ret\");\r\n            }\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/tests/CvtSchedTest.cs",
    "content": "﻿using System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class CvtSchedTest : UarchTest\r\n    {\r\n        public CvtSchedTest(int low, int high, int step)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"cvtsched\";\r\n            this.Description = \"F2I Scheduler\";\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr, float *floatArr\";\r\n            this.GetFunctionCallParameters = \"structIterations, A, fpArr\";\r\n            this.DivideTimeByCount = false;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64) return true;\r\n            if (isa == IUarchTest.ISA.aarch64) return true;\r\n            // if (isa == IUarchTest.ISA.mips64) return true;\r\n            // if (isa == IUarchTest.ISA.riscv) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64)\r\n            {\r\n                string[] unrolledInstrs = new string[4];\r\n                unrolledInstrs[0] = \"  cvtsi2ss %rdi, %xmm1\";\r\n                unrolledInstrs[1] = \"  cvtsi2ss %rdi, %xmm2\";\r\n                unrolledInstrs[2] = \"  cvtsi2ss %rdi, %xmm3\";\r\n                unrolledInstrs[3] = \"  cvtsi2ss %rdi, %xmm4\";\r\n\r\n                string[] unrolledInstrs1 = new string[4];\r\n                unrolledInstrs1[0] = \"  cvtsi2ss %rsi, %xmm1\";\r\n                unrolledInstrs1[1] = \"  cvtsi2ss %rsi, %xmm2\";\r\n                unrolledInstrs1[2] = \"  cvtsi2ss %rsi, %xmm3\";\r\n                unrolledInstrs1[3] = \"  cvtsi2ss %rsi, %xmm4\";\r\n\r\n                UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledInstrs, unrolledInstrs1);\r\n            }\r\n            else if 
(isa == IUarchTest.ISA.aarch64)\r\n            {\r\n                string[] unrolledInstrs = new string[4];\r\n                unrolledInstrs[0] = \"  scvtf s0, w25\";\r\n                unrolledInstrs[1] = \"  scvtf s1, w25\";\r\n                unrolledInstrs[2] = \"  scvtf s2, w25\";\r\n                unrolledInstrs[3] = \"  scvtf s3, w25\";\r\n\r\n                string[] unrolledInstrs1 = new string[4];\r\n                unrolledInstrs1[0] = \"  scvtf s0, w26\";\r\n                unrolledInstrs1[1] = \"  scvtf s1, w26\";\r\n                unrolledInstrs1[2] = \"  scvtf s2, w26\";\r\n                unrolledInstrs1[3] = \"  scvtf s3, w26\";\r\n\r\n                UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledInstrs, unrolledInstrs1);\r\n            }\r\n            else if (isa == IUarchTest.ISA.mips64)\r\n            {\r\n                \r\n            }\r\n            else if (isa == IUarchTest.ISA.riscv)\r\n            {\r\n                \r\n            }\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/tests/FAdd256RfTest.cs",
    "content": "﻿using System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class Fadd256RfTest : UarchTest\r\n    {\r\n        public enum TestMode\r\n        {\r\n            none,\r\n            setavx512regs,\r\n            pendingavx512instr\r\n        }\r\n        private bool populateAvx512Regs;\r\n        private bool pendingAvx512Instr;\r\n        public Fadd256RfTest(int low, int high, int step, TestMode mode)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"fadd256rf\" + mode;\r\n            this.Description = \"256-bit FP/vector RF capacity, \" + mode;\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr, float *floatArr\";\r\n            this.GetFunctionCallParameters = \"structIterations, A, fpArr\";\r\n            this.DivideTimeByCount = false;\r\n            if (mode == TestMode.setavx512regs) populateAvx512Regs = true;\r\n            else if (mode == TestMode.pendingavx512instr) pendingAvx512Instr = true;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64) return true;\r\n            if (isa == IUarchTest.ISA.aarch64) return false;\r\n            if (isa == IUarchTest.ISA.mips64) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64)\r\n            {\r\n                string initInstrs = \"  vmovups (%r8), %ymm0\\n\" +\r\n                 \"  vmovups %ymm0, %ymm1\\n\" +\r\n                 \"  vmovups %ymm0, %ymm2\\n\" +\r\n                 \"  vmovups %ymm0, %ymm3\\n\" +\r\n                 \"  vmovups %ymm0, %ymm4\\n\";\r\n\r\n                if (this.populateAvx512Regs)\r\n                {\r\n                    for (int i = 5; i < 32; i++)\r\n                    {\r\n                        
initInstrs += \"  vmovups 64(%r8), %zmm\" + i + \"\\n\";\r\n                    }\r\n                }\r\n\r\n                string postLoadInstr = string.Empty;\r\n\r\n                if (this.pendingAvx512Instr)\r\n                {\r\n                    initInstrs += \"  vmovups 64(%r8), %zmm5\\n  vmovups 128(%r8), %zmm6\\n\";\r\n                    postLoadInstr = \"  vaddps %zmm5, %zmm6, %zmm6\";\r\n                }\r\n\r\n                string[] unrolledAdds = new string[4];\r\n                unrolledAdds[0] = \"  vaddps %ymm0, %ymm1, %ymm1\";\r\n                unrolledAdds[1] = \"  vaddps %ymm0, %ymm2, %ymm2\";\r\n                unrolledAdds[2] = \"  vaddps %ymm0, %ymm3, %ymm3\";\r\n                unrolledAdds[3] = \"  vaddps %ymm0, %ymm4, %ymm3\";\r\n\r\n                UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, initInstrs: initInstrs, postLoadInstrs1: postLoadInstr, postLoadInstrs2: postLoadInstr);\r\n            }\r\n            else if (isa == IUarchTest.ISA.aarch64)\r\n            {\r\n                \r\n            }\r\n            else if (isa == IUarchTest.ISA.mips64)\r\n            {\r\n                string initInstrs = \"\";\r\n                for (int regIdx = 0; regIdx < 32; regIdx++)\r\n                {\r\n                    initInstrs += \"  xvld $xr\" + regIdx + \", $r6, \" + regIdx * 32 + \"\\n\";\r\n                }\r\n\r\n                string[] unrolledAdds = new string[4];\r\n                unrolledAdds[0] = \"  xvfadd.s $xr1, $xr1, $xr1\";\r\n                unrolledAdds[1] = \"  xvfadd.s $xr2, $xr2, $xr2\";\r\n                unrolledAdds[2] = \"  xvfadd.s $xr3, $xr3, $xr3\";\r\n                unrolledAdds[3] = \"  xvfadd.s $xr4, $xr4, $xr4\";\r\n                UarchTestHelpers.GenerateMipsAsmStructureTestFuncs(\r\n                    sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: false, initInstrs: initInstrs);\r\n      
      }\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/tests/Fadd128RfTest.cs",
    "content": "﻿using System.Collections.Generic;\r\nusing System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class Fadd128RfTest : UarchTest\r\n    {\r\n        private bool initialDependentBranch;\r\n        public Fadd128RfTest(int low, int high, int step, bool initialDependentBranch)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"fadd128rf\" + (initialDependentBranch ? \"db\" : string.Empty);\r\n            this.Description = \"128-bit FP/vector RF capacity\" + (initialDependentBranch ? \", preceded by dependent branch\" : string.Empty);\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr, float *floatArr\";\r\n            this.GetFunctionCallParameters = \"structIterations, A, fpArr\";\r\n            this.DivideTimeByCount = false;\r\n            this.initialDependentBranch = initialDependentBranch;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (this.initialDependentBranch)\r\n            {\r\n                if (isa == IUarchTest.ISA.aarch64) return true;\r\n                if (isa == IUarchTest.ISA.riscv) return true;\r\n                return false;\r\n            }\r\n\r\n            if (isa == IUarchTest.ISA.amd64) return true;\r\n            if (isa == IUarchTest.ISA.aarch64) return true;\r\n            if (isa == IUarchTest.ISA.mips64) return false;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64)\r\n            {\r\n                string initInstrs = \"  vmovups (%r8), %ymm0\\n\";\r\n\r\n                for (int i = 1; i < 16; i++) initInstrs += $\"  vmovups %ymm0, %ymm{i}\\n\";\r\n\r\n                List<string> unrolledAddsList = new List<string>();\r\n                for (int i = 1; i < 16; i++) unrolledAddsList.Add($\"  vaddps 
%ymm0, %ymm{i}, %ymm{i}\");\r\n                string[] unrolledAdds = unrolledAddsList.ToArray();\r\n\r\n                UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, initInstrs: initInstrs);\r\n            }\r\n            else if (isa == IUarchTest.ISA.aarch64)\r\n            {\r\n                string postLoadInstrs = this.initialDependentBranch ? UarchTestHelpers.GetArmDependentBranch(this.Prefix) : null;\r\n                string initInstrs = \"  ldr q0, [x1]\\n\" +\r\n                \"  ldr q1, [x1, #0x10]\\n\" +\r\n                \"  ldr q2, [x1, #0x20]\\n\" +\r\n                \"  ldr q3, [x1, #0x30]\\n\" +\r\n                \"  ldr q4, [x1, #0x40]\\n\";\r\n\r\n                string[] unrolledAdds = new string[4];\r\n                unrolledAdds[0] = \"  add v1.4s, v1.4s, v0.4s\";\r\n                unrolledAdds[1] = \"  add v2.4s, v2.4s, v0.4s\";\r\n                unrolledAdds[2] = \"  add v3.4s, v3.4s, v0.4s\";\r\n                unrolledAdds[3] = \"  add v4.4s, v4.4s, v0.4s\";\r\n                UarchTestHelpers.GenerateArmAsmStructureTestFuncs(\r\n                    sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, false, initInstrs, postLoadInstrs1: postLoadInstrs, postLoadInstrs2: postLoadInstrs);\r\n                if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetArmDependentBranchTarget(this.Prefix));\r\n            }\r\n            else if (isa == IUarchTest.ISA.riscv)\r\n            {\r\n                string initInstrs = \"  vsetvli t5, t6, e32\\n  vlw.v v0, (a1)\\n  vlw.v v1, (a1)\\n  vlw.v v2, (a1)\\n  vlw.v v3, (a1)\";\r\n                string postLoadInstrs = this.initialDependentBranch ? 
UarchTestHelpers.GetRiscvDependentBranch(this.Prefix) : string.Empty;\r\n                postLoadInstrs += \"\\n  mv t6, a2\";\r\n                string[] unrolledInstrs = new string[1];\r\n                unrolledInstrs[0] = \"  vfadd.vv v0, v0, v0\";\r\n                UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledInstrs, unrolledInstrs, false,\r\n                    initInstrs: initInstrs, postLoadInstrs1: postLoadInstrs, postLoadInstrs2: postLoadInstrs);\r\n                if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetRiscvDependentBranchTarget(this.Prefix));\r\n            }\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/tests/Fadd128SchedTest.cs",
    "content": "﻿using System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class Fadd128SchedTest : UarchTest\r\n    {\r\n        public Fadd128SchedTest(int low, int high, int step)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"fadd128sched\";\r\n            this.Description = \"128-bit Vector FP Add Scheduler\";\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr, float *floatArr\";\r\n            this.GetFunctionCallParameters = \"structIterations, A, fpArr\";\r\n            this.DivideTimeByCount = false;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64) return true;\r\n            if (isa == IUarchTest.ISA.aarch64) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64)\r\n            {\r\n                string[] unrolledAdds = new string[4];\r\n                unrolledAdds[0] = \"  addps %xmm0, %xmm1\";\r\n                unrolledAdds[1] = \"  addps %xmm0, %xmm2\";\r\n                unrolledAdds[2] = \"  addps %xmm0, %xmm3\";\r\n                unrolledAdds[3] = \"  addps %xmm0, %xmm4\";\r\n\r\n                UarchTestHelpers.GenerateX86AsmFpSchedTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds);\r\n            }\r\n            else if (isa == IUarchTest.ISA.aarch64)\r\n            {\r\n                string postLoadInstrs1 = \"  ldr q0, [x2, w25, uxtw#0]\";\r\n                string postLoadInstrs2 = \"  ldr q0, [x2, w26, uxtw#0]\";\r\n                string[] unrolledAdds = new string[4];\r\n                unrolledAdds[0] = \"  add v1.4s, v1.4s, v0.4s\";\r\n                unrolledAdds[1] = \"  add v2.4s, v2.4s, v0.4s\";\r\n                unrolledAdds[2] = \"  add v3.4s, v3.4s, 
v0.4s\";\r\n                unrolledAdds[3] = \"  add v4.4s, v4.4s, v0.4s\";\r\n                UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, false, null, postLoadInstrs1, postLoadInstrs2);\r\n            }\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/tests/Fadd256SchedTest.cs",
    "content": "﻿using System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class Fadd256SchedTest : UarchTest\r\n    {\r\n        public Fadd256SchedTest(int low, int high, int step)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"fadd256sched\";\r\n            this.Description = \"256-bit FP add scheduler\";\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr, float *floatArr\";\r\n            this.GetFunctionCallParameters = \"structIterations, A, fpArr\";\r\n            this.DivideTimeByCount = false;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64) return true;\r\n            if (isa == IUarchTest.ISA.aarch64) return false;\r\n            if (isa == IUarchTest.ISA.mips64) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64)\r\n            {\r\n                // ymm0 is dependent on ptr chasing load\r\n                string[] unrolledAdds = new string[4];\r\n                unrolledAdds[0] = \"  vaddps %ymm0, %ymm1, %ymm1\";\r\n                unrolledAdds[1] = \"  vaddps %ymm0, %ymm2, %ymm2\";\r\n                unrolledAdds[2] = \"  vaddps %ymm0, %ymm3, %ymm3\";\r\n                unrolledAdds[3] = \"  vaddps %ymm0, %ymm4, %ymm3\";\r\n\r\n                UarchTestHelpers.GenerateX86AsmFp256SchedTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds);\r\n            }\r\n            else if (isa == IUarchTest.ISA.aarch64)\r\n            {\r\n                \r\n            }\r\n            else if (isa == IUarchTest.ISA.mips64)\r\n            {\r\n                string initInstrs = \"\";\r\n                for (int regIdx = 0; regIdx < 32; regIdx++)\r\n                {\r\n                    
initInstrs += \"  xvld $xr\" + regIdx + \", $r6, \" + regIdx * 32 + \"\\n\";\r\n                }\r\n                initInstrs += \"  move $r16, $r0\\n  addi.d $r16, $r16, 0xF\"; // load mask into r16\r\n\r\n                string postLoadInstrs1 = \"  and $r15, $r12, $r16\\n  xvldx $xr1, $r6, $r15\";\r\n                string postLoadInstrs2 = \"  and $r15, $r13, $r16\\n  xvldx $xr1, $r6, $r15\";\r\n\r\n                string[] unrolledAdds = new string[4];\r\n                unrolledAdds[0] = \"  xvfadd.s $xr2, $xr2, $xr1\";\r\n                unrolledAdds[1] = \"  xvfadd.s $xr3, $xr3, $xr1\";\r\n                unrolledAdds[2] = \"  xvfadd.s $xr4, $xr4, $xr1\";\r\n                unrolledAdds[3] = \"  xvfadd.s $xr5, $xr5, $xr1\";\r\n                UarchTestHelpers.GenerateMipsAsmStructureTestFuncs(\r\n                    sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: false, initInstrs: initInstrs,\r\n                    postLoadInstrs1: postLoadInstrs1, postLoadInstrs2: postLoadInstrs2);\r\n            }\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/tests/FaddNsq.cs",
    "content": "﻿using System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class FaddNsq : UarchTest\r\n    {\r\n        private int totalOps;\r\n        public FaddNsq(int low, int high, int step, int totalOps)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"faddnsq\" + totalOps;\r\n            this.Description = \"FADD, excluding possible NSQ\";\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr, float *floatArr\";\r\n            this.GetFunctionCallParameters = \"structIterations, A, fpArr\";\r\n            this.DivideTimeByCount = false;\r\n            this.totalOps = totalOps;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.aarch64) return true;\r\n            if (isa == IUarchTest.ISA.amd64) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64)\r\n            {\r\n                string postLoadInstrs = \"  cvtsi2ss %edi, %xmm1\";\r\n                string initInstrs = \"  cvtsi2ss %r12, %xmm2\";\r\n                string[] depInstrs = new string[4];\r\n                depInstrs[0] = \"  addss %xmm1, %xmm0\";\r\n                depInstrs[1] = \"  addss %xmm1, %xmm3\";\r\n                depInstrs[2] = \"  addss %xmm1, %xmm4\";\r\n                depInstrs[3] = \"  addss %xmm1, %xmm5\";\r\n\r\n                string[] indepInstrs = new string[2];\r\n                indepInstrs[0] = \"  addss %xmm2, %xmm6\";\r\n                indepInstrs[1] = \"  addss %xmm2, %xmm7\";\r\n                UarchTestHelpers.GenerateX86AsmNsqTestFuncs(sb, this.totalOps, this.Counts, this.Prefix, depInstrs, indepInstrs, false, initInstrs, postLoadInstrs);\r\n            }\r\n            else if (isa == IUarchTest.ISA.aarch64)\r\n           
 {\r\n                string postLoadInstrs1 = \"  ldr s16, [x2, w25, uxtw #2]\";\r\n                string initInstrs = \"  ldr s15, [x2]\";\r\n                string[] depInstrs = new string[4];\r\n                depInstrs[0] = \"  fadd s0, s0, s16\";\r\n                depInstrs[1] = \"  fadd s1, s1, s16\";\r\n                depInstrs[2] = \"  fadd s2, s2, s16\";\r\n                depInstrs[3] = \"  fadd s3, s3, s16\";\r\n\r\n                string[] indepInstrs = new string[4];\r\n                indepInstrs[0] = \"  fadd s17, s17, s15\";\r\n                indepInstrs[1] = \"  fadd s18, s18, s15\";\r\n                indepInstrs[2] = \"  fadd s19, s19, s15\";\r\n                indepInstrs[3] = \"  fadd s20, s20, s15\";\r\n                UarchTestHelpers.GenerateArmAsmNsqTestFuncs(sb, this.totalOps, this.Counts, this.Prefix, depInstrs, indepInstrs, false, initInstrs,\r\n                    postLoadInstrs: postLoadInstrs1);\r\n            }\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/tests/FaddSchedTest.cs",
    "content": "﻿using System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class FaddSchedTest : UarchTest\r\n    {\r\n        public FaddSchedTest(int low, int high, int step)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"faddsched\";\r\n            this.Description = \"FP Add Scheduler\";\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr, float *floatArr\";\r\n            this.GetFunctionCallParameters = \"structIterations, A, fpArr\";\r\n            this.DivideTimeByCount = false;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64) return true;\r\n            if (isa == IUarchTest.ISA.aarch64) return true;\r\n            if (isa == IUarchTest.ISA.mips64) return true;\r\n            if (isa == IUarchTest.ISA.riscv) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64)\r\n            {\r\n                string[] unrolledAdds = new string[4];\r\n                unrolledAdds[0] = \"  addss %xmm0, %xmm1\";\r\n                unrolledAdds[1] = \"  addss %xmm0, %xmm2\";\r\n                unrolledAdds[2] = \"  addss %xmm0, %xmm3\";\r\n                unrolledAdds[3] = \"  addss %xmm0, %xmm4\";\r\n\r\n                UarchTestHelpers.GenerateX86AsmFpSchedTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds);\r\n            }\r\n            else if (isa == IUarchTest.ISA.aarch64)\r\n            {\r\n                string[] unrolledAdds = new string[4];\r\n                unrolledAdds[0] = \"  fadd s17, s17, s16\";\r\n                unrolledAdds[1] = \"  fadd s18, s18, s16\";\r\n                unrolledAdds[2] = \"  fadd s19, s19, s16\";\r\n                unrolledAdds[3] = \"  fadd s20, s20, s16\";\r\n    
            UarchTestHelpers.GenerateArmAsmFpSchedTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds);\r\n            }\r\n            else if (isa == IUarchTest.ISA.mips64)\r\n            {\r\n                string initInstrs = \"  fld.s $f8, $r6, 0\\n\" +\r\n                    \"  fld.s $f9, $r6, 4\\n\" +\r\n                    \"  fld.s $f10, $r6, 8\\n\" +\r\n                    \"  fld.s $f11, $r6, 12\\n\" +\r\n                    \"  fld.s $f12, $r6, 16\\n\";\r\n\r\n                string postLoadInstrs1 = \"  andi $r19, $r12, 0xF\\n  add.d $r19, $r19, $r6\\n fld.s $f8, $r19, 0\";\r\n                string[] dependentAdds = new string[4];\r\n                dependentAdds[0] = \"  fadd.s $f9, $f9, $f8\";\r\n                dependentAdds[1] = \"  fadd.s $f10, $f10, $f8\";\r\n                dependentAdds[2] = \"  fadd.s $f11, $f11, $f8\";\r\n                dependentAdds[3] = \"  fadd.s $f12, $f12, $f8\";\r\n\r\n                string postLoadInstrs2 = \"  andi $r19, $r13, 0xF\\n  add.d $r19, $r19, $r6\\n fld.s $f8, $r19, 0\";\r\n                UarchTestHelpers.GenerateMipsAsmStructureTestFuncs(\r\n                    sb, this.Counts, this.Prefix, dependentAdds, dependentAdds, includePtrChasingLoads: false, initInstrs: initInstrs, \r\n                    postLoadInstrs1: postLoadInstrs1, postLoadInstrs2: postLoadInstrs2);\r\n            }\r\n            else if (isa == IUarchTest.ISA.riscv)\r\n            {\r\n                string initInstrs = \"  fld f0, (x12)\\n\" +\r\n                    \"  fld f1, 8(x12)\\n\" +\r\n                    \"  fld f2, 16(x12)\\n\" +\r\n                    \"  fld f3, 24(x12)\\n\" +\r\n                    \"  fld f4, 32(x12)\\n\";\r\n\r\n                string[] unrolledAdds = new string[4];\r\n                unrolledAdds[0] = \"  fadd.s f0, f0, f4\";\r\n                unrolledAdds[1] = \"  fadd.s f1, f1, f4\";\r\n                unrolledAdds[2] = \"  fadd.s f2, f2, f4\";\r\n                unrolledAdds[3] = 
\"  fadd.s f3, f3, f4\";\r\n\r\n                string postLoadInstrs1 = \"  andi x7, x5, 0xF\\n  add x7, x7, x12\\n  fld f4, (x7)\";\r\n                string postLoadInstrs2 = \"  andi x7, x6, 0xF\\n  add x7, x7, x12\\n  fld f4, (x7)\";\r\n                UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: false,\r\n                    initInstrs, postLoadInstrs1, postLoadInstrs2);\r\n            }\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/tests/FcmpSchedTest.cs",
    "content": "﻿using System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class FcmpSchedTest : UarchTest\r\n    {\r\n        public FcmpSchedTest(int low, int high, int step)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"fcmpsched\";\r\n            this.Description = \"FCMP Scheduler\";\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr, float *floatArr\";\r\n            this.GetFunctionCallParameters = \"structIterations, A, fpArr\";\r\n            this.DivideTimeByCount = false;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.aarch64) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.aarch64)\r\n            {\r\n                string[] unrolledAdds = new string[4];\r\n                unrolledAdds[0] = \"  fcmp s17, s16\";\r\n                unrolledAdds[1] = \"  fcmp s19, s16\";\r\n                unrolledAdds[2] = \"  fcmp s19, s16\";\r\n                unrolledAdds[3] = \"  fcmp s20, s16\";\r\n                UarchTestHelpers.GenerateArmAsmFpSchedTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds);\r\n            }\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/tests/FlagRfTest.cs",
    "content": "﻿using System.Text;\n\nnamespace AsmGen\n{\n    public class FlagRfTest : UarchTest\n    {\n        private bool initialDependentBranch;\n        public FlagRfTest(int low, int high, int step, bool initialDependentBranch)\n        {\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\n            this.Prefix = \"flagrf\" + (initialDependentBranch ? \"db\" : string.Empty);\n            this.Description = \"Flags Register File\" + (initialDependentBranch ? \", preceded by dependent branch\" : string.Empty);\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr\";\n            this.GetFunctionCallParameters = \"structIterations, A\";\n            this.DivideTimeByCount = false;\n            this.initialDependentBranch = initialDependentBranch;\n        }\n\n        public override bool SupportsIsa(IUarchTest.ISA isa)\n        {\n            if (this.initialDependentBranch && isa != IUarchTest.ISA.aarch64) return false;\n            if (isa == IUarchTest.ISA.amd64) return true;\n            if (isa == IUarchTest.ISA.aarch64) return true;\n            if (isa == IUarchTest.ISA.mips64) return false;\n            return false;\n        }\n\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\n        {\n            if (isa == IUarchTest.ISA.amd64)\n            {\n                string[] unrolledAdds = new string[1];\n                unrolledAdds[0] = \"  test %r15, %r14\";\n                UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: true);\n            }\n            else if (isa == IUarchTest.ISA.aarch64)\n            {\n                string postLoadInstrs = this.initialDependentBranch ? 
UarchTestHelpers.GetArmDependentBranch(this.Prefix) : null;\n                string[] unrolledAdds = new string[1];\n                unrolledAdds[0] = \"  cmp x14, x15\";\n                UarchTestHelpers.GenerateArmAsmStructureTestFuncs(\n                    sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: true, postLoadInstrs1: postLoadInstrs, postLoadInstrs2: postLoadInstrs);\n                if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetArmDependentBranchTarget(this.Prefix));\n            }\n        }\n    }\n}\n"
  },
  {
    "path": "AsmGen/tests/Fma256SchedTest.cs",
    "content": "﻿using System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class Fma256SchedTest : UarchTest\r\n    {\r\n        public Fma256SchedTest(int low, int high, int step)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"fma256sched\";\r\n            this.Description = \"256-bit FP add scheduler\";\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr, float *floatArr\";\r\n            this.GetFunctionCallParameters = \"structIterations, A, fpArr\";\r\n            this.DivideTimeByCount = false;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64) return false;\r\n            if (isa == IUarchTest.ISA.aarch64) return false;\r\n            if (isa == IUarchTest.ISA.mips64) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64)\r\n            {\r\n                // ymm0 is dependent on ptr chasing load\r\n                string[] unrolledAdds = new string[4];\r\n                unrolledAdds[0] = \"  vaddps %ymm0, %ymm1, %ymm1\";\r\n                unrolledAdds[1] = \"  vaddps %ymm0, %ymm2, %ymm2\";\r\n                unrolledAdds[2] = \"  vaddps %ymm0, %ymm3, %ymm3\";\r\n                unrolledAdds[3] = \"  vaddps %ymm0, %ymm4, %ymm3\";\r\n\r\n                UarchTestHelpers.GenerateX86AsmFp256SchedTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds);\r\n            }\r\n            else if (isa == IUarchTest.ISA.aarch64)\r\n            {\r\n                \r\n            }\r\n            else if (isa == IUarchTest.ISA.mips64)\r\n            {\r\n                string initInstrs = \"\";\r\n                for (int regIdx = 0; regIdx < 32; regIdx++)\r\n                {\r\n                    
initInstrs += \"  xvld $xr\" + regIdx + \", $r6, \" + regIdx * 32 + \"\\n\";\r\n                }\r\n                initInstrs += \"  move $r16, $r0\\n  addi.d $r16, $r16, 0xF\"; // load mask into r16\r\n\r\n                string postLoadInstrs1 = \"  and $r15, $r12, $r16\\n  xvldx $xr1, $r6, $r15\";\r\n                string postLoadInstrs2 = \"  and $r15, $r13, $r16\\n  xvldx $xr1, $r6, $r15\";\r\n\r\n                string[] unrolledAdds = new string[4];\r\n                unrolledAdds[0] = \"  xvfmadd.s $xr2, $xr2, $xr2, $xr1\";\r\n                unrolledAdds[1] = \"  xvfmadd.s $xr3, $xr3, $xr3, $xr1\";\r\n                unrolledAdds[2] = \"  xvfmadd.s $xr4, $xr4, $xr4, $xr1\";\r\n                unrolledAdds[3] = \"  xvfmadd.s $xr5, $xr5, $xr5, $xr1\";\r\n                UarchTestHelpers.GenerateMipsAsmStructureTestFuncs(\r\n                    sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: false, initInstrs: initInstrs,\r\n                    postLoadInstrs1: postLoadInstrs1, postLoadInstrs2: postLoadInstrs2);\r\n            }\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/tests/FmovSched.cs",
    "content": "﻿using System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class FmovSched : UarchTest\r\n    {\r\n        public FmovSched(int low, int high, int step)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"fmovsched\";\r\n            this.Description = \"FMOV vec to gpr Scheduler\";\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr, float *floatArr\";\r\n            this.GetFunctionCallParameters = \"structIterations, A, fpArr\";\r\n            this.DivideTimeByCount = false;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.aarch64) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.aarch64)\r\n            {\r\n                string postLoadInstrs1 = \"  ldr d16, [x2, w25, sxtw #0]\";\r\n                string postLoadInstrs2 = \"  ldr d16, [x2, w25, sxtw #0]\";\r\n                string[] unrolledInstrs = new string[4];\r\n                unrolledInstrs[0] = \"  fmov x15, d16\";\r\n                unrolledInstrs[1] = \"  fmov x14, d16\";\r\n                unrolledInstrs[2] = \"  fmov x13, d16\";\r\n                unrolledInstrs[3] = \"  fmov x12, d16\";\r\n                UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledInstrs, unrolledInstrs, false, null,\r\n                    postLoadInstrs1: postLoadInstrs1, postLoadInstrs2: postLoadInstrs2);\r\n            }\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/tests/FmulSchedTest.cs",
    "content": "﻿using System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class FmulSchedTest : UarchTest\r\n    {\r\n        public FmulSchedTest(int low, int high, int step)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"fmulsched\";\r\n            this.Description = \"FP (32-bit multiply) Scheduler Capacity Test\";\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr, float *floatArr\";\r\n            this.GetFunctionCallParameters = \"structIterations, A, fpArr\";\r\n            this.DivideTimeByCount = false;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64) return true;\r\n            if (isa == IUarchTest.ISA.aarch64) return true;\r\n            if (isa == IUarchTest.ISA.riscv) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64)\r\n            {\r\n                GenerateX86Asm(sb);\r\n            }\r\n            else if (isa == IUarchTest.ISA.aarch64)\r\n            {\r\n                GenerateArmAsm(sb);\r\n            }\r\n            else if (isa == IUarchTest.ISA.riscv)\r\n            {\r\n                GenerateRiscvAsm(sb);\r\n            }\r\n        }\r\n\r\n        public void GenerateX86Asm(StringBuilder sb)\r\n        {\r\n            // xmm0 is dependent on ptr chasing load\r\n            string[] unrolledAdds = new string[4];\r\n            unrolledAdds[0] = \"  mulss %xmm0, %xmm1\";\r\n            unrolledAdds[1] = \"  mulss %xmm0, %xmm2\";\r\n            unrolledAdds[2] = \"  mulss %xmm0, %xmm3\";\r\n            unrolledAdds[3] = \"  mulss %xmm0, %xmm4\";\r\n\r\n            UarchTestHelpers.GenerateX86AsmFpSchedTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds);\r\n       
 }\r\n\r\n        public void GenerateArmAsm(StringBuilder sb)\r\n        {\r\n            string[] unrolledAdds = new string[4];\r\n            unrolledAdds[0] = \"  fmul s17, s17, s16\";\r\n            unrolledAdds[1] = \"  fmul s18, s18, s16\";\r\n            unrolledAdds[2] = \"  fmul s19, s19, s16\";\r\n            unrolledAdds[3] = \"  fmul s20, s20, s16\";\r\n            UarchTestHelpers.GenerateArmAsmFpSchedTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds);\r\n        }\r\n\r\n        public void GenerateRiscvAsm(StringBuilder sb)\r\n        {\r\n            string initInstrs = \"  fld f0, (x12)\\n\" +\r\n                \"  fld f1, 8(x12)\\n\" +\r\n                \"  fld f2, 16(x12)\\n\" +\r\n                \"  fld f3, 24(x12)\\n\" +\r\n                \"  fld f4, 32(x12)\\n\";\r\n\r\n            string[] unrolledAdds = new string[4];\r\n            unrolledAdds[0] = \"  fmul.s f0, f0, f4\";\r\n            unrolledAdds[1] = \"  fmul.s f1, f1, f4\";\r\n            unrolledAdds[2] = \"  fmul.s f2, f2, f4\";\r\n            unrolledAdds[3] = \"  fmul.s f3, f3, f4\";\r\n\r\n            string postLoadInstrs1 = \"  andi x7, x5, 0xF\\n  add x7, x7, x12\\n  fld f4, (x7)\";\r\n            string postLoadInstrs2 = \"  andi x7, x6, 0xF\\n  add x7, x7, x12\\n  fld f4, (x7)\";\r\n            UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: false,\r\n                initInstrs, postLoadInstrs1, postLoadInstrs2);\r\n        }\r\n    }\r\n}"
  },
  {
    "path": "AsmGen/tests/FpRfTest.cs",
    "content": "﻿using System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class FpRfTest : UarchTest\r\n    {\r\n        private bool initialDependentBranch;\r\n        public FpRfTest(int low, int high, int step, bool initialDependentBranch)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"fprf\" + (initialDependentBranch ? \"db\" : string.Empty);\r\n            this.Description = \"FP Register File\" + (initialDependentBranch ? \", preceded by dependent branch\" : string.Empty);\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr, float *floatArr\";\r\n            this.GetFunctionCallParameters = \"structIterations, A, fpArr\";\r\n            this.DivideTimeByCount = false;\r\n            this.initialDependentBranch = initialDependentBranch;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (this.initialDependentBranch)\r\n            {\r\n                if (isa == IUarchTest.ISA.aarch64) return true;\r\n                if (isa == IUarchTest.ISA.riscv) return true;\r\n                return false;\r\n            }\r\n\r\n            if (isa == IUarchTest.ISA.amd64) return true;\r\n            if (isa == IUarchTest.ISA.aarch64) return true;\r\n            if (isa == IUarchTest.ISA.mips64) return true;\r\n            if (isa == IUarchTest.ISA.riscv) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64)\r\n            {\r\n                string initInstrs = \"  movss (%r8), %xmm1\\n\" +\r\n                    \"  movss 4(%r8), %xmm2\\n\" +\r\n                    \"  movss 8(%r8), %xmm3\\n\" +\r\n                    \"  movss 12(%r8), %xmm4\\n\" +\r\n                    \"  movss 16(%r8), %xmm5\\n\";\r\n\r\n                string[] unrolledAdds = 
new string[4];\r\n                unrolledAdds[0] = \"  addss %xmm1, %xmm2\";\r\n                unrolledAdds[1] = \"  addss %xmm1, %xmm3\";\r\n                unrolledAdds[2] = \"  addss %xmm1, %xmm4\";\r\n                unrolledAdds[3] = \"  addss %xmm1, %xmm5\";\r\n                UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: false, initInstrs);\r\n            }\r\n            else if (isa == IUarchTest.ISA.aarch64)\r\n            {\r\n                string postLoadInstrs = this.initialDependentBranch ? UarchTestHelpers.GetArmDependentBranch(this.Prefix) : null;\r\n                string initInstrs = \"  ldr s17, [x2]\\n\" +\r\n                    \"  ldr s18, [x2, 4]\\n\" +\r\n                    \"  ldr s19, [x2, 8]\\n\" +\r\n                    \"  ldr s20, [x2, 12]\\n\" +\r\n                    \"  ldr s21, [x2, 16]\\n\";\r\n\r\n                string[] unrolledAdds = new string[4];\r\n                unrolledAdds[0] = \"  fadd s18, s18, s17\";\r\n                unrolledAdds[1] = \"  fadd s19, s19, s17\";\r\n                unrolledAdds[2] = \"  fadd s20, s20, s17\";\r\n                unrolledAdds[3] = \"  fadd s21, s21, s17\";\r\n                UarchTestHelpers.GenerateArmAsmStructureTestFuncs(\r\n                    sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: false, initInstrs, postLoadInstrs1: postLoadInstrs, postLoadInstrs2: postLoadInstrs);\r\n                if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetArmDependentBranchTarget(this.Prefix));\r\n            }\r\n            else if (isa == IUarchTest.ISA.mips64)\r\n            {\r\n                string initInstrs = \"  fld.s $f8, $r6, 0\\n\" +\r\n                    \"  fld.s $f9, $r6, 4\\n\" +\r\n                    \"  fld.s $f10, $r6, 8\\n\" +\r\n                    \"  fld.s $f11, $r6, 12\\n\" +\r\n                    \"  fld.s $f12, $r6, 
16\\n\";\r\n\r\n                string[] unrolledAdds = new string[4];\r\n                unrolledAdds[0] = \"  fadd.s $f9, $f9, $f8\";\r\n                unrolledAdds[1] = \"  fadd.s $f10, $f10, $f8\";\r\n                unrolledAdds[2] = \"  fadd.s $f11, $f11, $f8\";\r\n                unrolledAdds[3] = \"  fadd.s $f12, $f12, $f8\";\r\n                UarchTestHelpers.GenerateMipsAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: false, initInstrs);\r\n            }\r\n            else if (isa == IUarchTest.ISA.riscv)\r\n            {\r\n                string postLoadInstrs = this.initialDependentBranch ? UarchTestHelpers.GetRiscvDependentBranch(this.Prefix) : null;\r\n                if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetRiscvDependentBranchTarget(this.Prefix));\r\n                string initInstrs = \"  fld f0, (x12)\\n\" +\r\n                    \"  fld f1, 8(x12)\\n\" +\r\n                    \"  fld f2, 16(x12)\\n\" +\r\n                    \"  fld f3, 24(x12)\\n\" +\r\n                    \"  fld f4, 32(x12)\\n\";\r\n\r\n                string[] unrolledAdds = new string[4];\r\n                unrolledAdds[0] = \"  fadd.s f0, f0, f4\";\r\n                unrolledAdds[1] = \"  fadd.s f1, f1, f4\";\r\n                unrolledAdds[2] = \"  fadd.s f2, f2, f4\";\r\n                unrolledAdds[3] = \"  fadd.s f3, f3, f4\";\r\n                UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, \r\n                    includePtrChasingLoads: false, initInstrs, postLoadInstrs1: postLoadInstrs, postLoadInstrs2: postLoadInstrs);\r\n            }\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/tests/FpStoreDataNsq.cs",
    "content": "﻿using System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class FpStoreDataNsqTest : UarchTest\r\n    {\r\n        public FpStoreDataNsqTest(int low, int high, int step)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"fpstoredatansq\" + high;\r\n            this.Description = \"Store FP 32-bit data scheduler capacity, excluding nsq\";\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr, float *floatArr\";\r\n            this.GetFunctionCallParameters = \"structIterations, A, fpArr\";\r\n            this.DivideTimeByCount = false;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64)\r\n            {\r\n                string initInstrs = \"  vzeroupper\\n  vpcmpeqd %xmm2, %xmm2, %xmm2\\n  vpxor %xmm2, %xmm3, %xmm3\\n  cvtsi2ss %r11, %xmm3\\n movss %xmm3, %xmm4\\n  movss %xmm3, %xmm5\\n  movss %xmm3, %xmm6\";\r\n                string postLoadInstr = \"  cvtsi2ss %rdi, %xmm1\";\r\n                string[] dependentStores = new string[4];\r\n                dependentStores[0] = \"  movss %xmm1, (%r8)\";\r\n                dependentStores[1] = \"  movss %xmm1, (%r8, %r14, 4)\";\r\n                dependentStores[2] = \"  movss %xmm1, (%r8, %r13, 4)\";\r\n                dependentStores[3] = \"  movss %xmm1, (%r8, %r12, 4)\";\r\n\r\n                string[] indepFpInstrs = new string[4];\r\n                indepFpInstrs[0] = \"  addss %xmm2, %xmm3\";\r\n                indepFpInstrs[1] = \"  addss %xmm2, %xmm4\";\r\n                indepFpInstrs[2] = \"  addss %xmm2, %xmm5\";\r\n                indepFpInstrs[3] = \"  addss %xmm2, %xmm6\";\r\n\r\n           
     UarchTestHelpers.GenerateX86AsmNsqTestFuncs(sb, this.Counts[this.Counts.Length - 1], this.Counts, this.Prefix, dependentStores, indepFpInstrs, false, initInstrs: initInstrs, postLoadInstrs: postLoadInstr);\r\n            }\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/tests/IdrfTest.cs",
    "content": "﻿using System.Collections.Generic;\r\nusing System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class IdrfTest : UarchTest\r\n    {\r\n        public IdrfTest(int low, int high, int step)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"idrf\";\r\n            this.Description = \"Immediate/Displacement Register File\";\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr, float *floatArr\";\r\n            this.GetFunctionCallParameters = \"structIterations, A, fpArr\";\r\n            this.DivideTimeByCount = false;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            const string dummyBranchTargetName = \"idrftest_badtarget\";\r\n            if (isa == IUarchTest.ISA.amd64)\r\n            {\r\n                const int storeCount = 40;\r\n                const int addCount = 130;\r\n                List<string> testInstructions = new List<string>();\r\n                int storeIdx = 0, addIdx = 0;\r\n                for (int i = 0; i < this.Counts[this.Counts.Length - 1]; i++)\r\n                {\r\n                    if (addIdx < addCount)\r\n                    {\r\n                        string addInstr = \"  add $\" + (i + 1) + \", %r\" + (12 + (i % 4));\r\n                        testInstructions.Add(addInstr);\r\n                        addIdx++;\r\n                    }\r\n                    else if (storeIdx < storeCount)\r\n                    {\r\n                        string storeInstr = \"  mov %r11d, \" + +(((i + 1) & 0xFF) * 4) + \"(%r8)\";\r\n                        testInstructions.Add(storeInstr);\r\n                        storeIdx++;\r\n                    }\r\n 
                   else\r\n                    {\r\n                        string branchInstr = $\"  test %r11, %r11\\n  je {dummyBranchTargetName}\";\r\n                        testInstructions.Add(branchInstr);\r\n                    }\r\n                }\r\n\r\n                string[] unrolledAdds = testInstructions.ToArray();\r\n                UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: true);\r\n\r\n                sb.AppendLine($\"{dummyBranchTargetName}:\\n  int3\");\r\n            }\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/tests/IndirectBranchTest.cs",
    "content": "﻿using System.Text;\r\nusing System.IO;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class IndirectBranchTest : IUarchTest\r\n    {\r\n        private int[] branchCounts;\r\n        private int[] targetCounts;\r\n        private int globalHistoryAssistBits;\r\n        private bool assists;\r\n\r\n        public IndirectBranchTest(bool assist)\r\n        {\r\n            Prefix = \"indirectbranch\";\r\n            Description = \"Indirect branch prediction\";\r\n            FunctionDefinitionParameters = \"uint64_t iterations, uint32_t **arr, uint32_t arrLen, uint64_t **scratch\";\r\n            DivideTimeByCount = true;\r\n            branchCounts = new int[] { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024 };\r\n            targetCounts = new int[] { 2, 4, 8, 12, 16, 24, 32, 48, 64, 96, 128, 160, 192, 256, 384, 512 };\r\n            globalHistoryAssistBits = 4;\r\n            this.assists = assist;\r\n        }\r\n\r\n        public bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64) return true;\r\n            if (isa == IUarchTest.ISA.aarch64) return true;\r\n            if (isa == IUarchTest.ISA.mips64) return true;\r\n            return false;\r\n        }\r\n\r\n        public void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64)\r\n            {\r\n                GenerateX86GccAsm(sb);\r\n            }\r\n            else if (isa == IUarchTest.ISA.aarch64)\r\n            {\r\n                GenerateArmAsm(sb);\r\n            }\r\n            else if (isa == IUarchTest.ISA.mips64)\r\n            {\r\n                GenerateMipsAsm(sb);\r\n            }\r\n        }\r\n\r\n        private string GetFunctionName(int branchCount, int targetCount)\r\n        {\r\n            return Prefix + branchCount + \"targets\" + targetCount;\r\n        }\r\n\r\n        private string GetTargetLabelName(int branchCount, int targetCount, int branchIndex, 
int targetIndex)\r\n        {\r\n            return GetFunctionName(branchCount, targetCount) + \"branch\" + branchIndex + \"target\" + targetIndex;\r\n        }\r\n\r\n        // Emits one aarch64 function per (target count, branch count) pair. Each function first fills\r\n        // the per-branch jump tables with the addresses of its target labels, then loops for the\r\n        // requested number of iterations, executing every indirect branch once per pass.\r\n        // Register use: x0 = iterations, x1 = array of target selection arrays, w2 = pattern length,\r\n        // x3 = array of jump tables.\r\n        public void GenerateArmAsm(StringBuilder sb)\r\n        {\r\n            for (int targetCountIdx = 0; targetCountIdx < targetCounts.Length; targetCountIdx++)\r\n            {\r\n                int currentTargetCount = targetCounts[targetCountIdx];\r\n                for (int branchCountIdx = 0; branchCountIdx < branchCounts.Length; branchCountIdx++)\r\n                {\r\n                    int currentBranchCount = branchCounts[branchCountIdx];\r\n                    string functionLabel = GetFunctionName(currentBranchCount, currentTargetCount);\r\n                    string loopLabel = functionLabel + \"_loop\";\r\n                    sb.AppendLine(\"\\n\" + functionLabel + \":\");\r\n                    // 0x60-byte frame: five register pairs at 0x10..0x50, each pair in its own slot.\r\n                    // x17/x18 must not share the 0x40 slot with x9/x10 or they come back corrupted.\r\n                    sb.AppendLine(\"  sub sp, sp, #0x60\");\r\n                    sb.AppendLine(\"  stp x17, x18, [sp, #0x50]\");\r\n                    sb.AppendLine(\"  stp x9, x10, [sp, #0x40]\");\r\n                    sb.AppendLine(\"  stp x11, x12, [sp, #0x30]\");\r\n                    sb.AppendLine(\"  stp x15, x16, [sp, #0x20]\");\r\n                    sb.AppendLine(\"  stp x13, x14, [sp, #0x10]\");\r\n                    sb.AppendLine(\"  eor x16, x16, x16\");\r\n                    sb.AppendLine(\"  eor x15, x15, x15\");\r\n                    sb.AppendLine(\"  eor x14, x14, x14\");\r\n                    sb.AppendLine(\"  eor x12, x12, x12\");\r\n                    sb.AppendLine(\"  eor x11, x11, x11\");\r\n\r\n                    // fill in jump tables for every branch. 
there has to be a better way to do this\r\n                    for (int branchIdx = 0; branchIdx < currentBranchCount; branchIdx++)\r\n                    {\r\n                        // x3 = array of ptrs to jump tables\r\n                        // x14 = index into array of jump tables\r\n                        // x17 = ptr to jump table\r\n                        sb.AppendLine(\"  ldr x17, [x3, w14, uxtw #3]\");\r\n                        for (int targetIdx = 0; targetIdx < currentTargetCount; targetIdx++)\r\n                        {\r\n                            // assuming 64-bit pointers and 4K page size\r\n                            // use x16 = label index\r\n                            string targetLabelName = GetTargetLabelName(currentBranchCount, currentTargetCount, branchIdx, targetIdx);\r\n                            sb.AppendLine($\"  adrp x10, {targetLabelName}\");\r\n                            sb.AppendLine($\"  add x10, x10, :lo12:{targetLabelName}\");\r\n                            sb.AppendLine(\"  str x10, [x17, w16, uxtw #3]\");\r\n                            sb.AppendLine(\"  add w16, w16, 1\");\r\n                        }\r\n\r\n                        sb.AppendLine(\"  eor x16, x16, x16\");\r\n                        sb.AppendLine(\"  add w14, w14, 1\");\r\n                    }\r\n\r\n                    // w14 = branch index, w16 = pattern (target) array index\r\n                    sb.AppendLine(loopLabel + \":\");\r\n                    sb.AppendLine(\"  eor w14, w14, w14\");\r\n\r\n                    // generate branch blocks\r\n                    for (int branchIdx = 0; branchIdx < currentBranchCount; branchIdx++)\r\n                    {\r\n                        // get a pointer to the jump table\r\n                        sb.AppendLine(\"  ldr x9, [x3, w14, uxtw #3]\");\r\n\r\n                        // look up which target to jump to\r\n                        sb.AppendLine(\"  ldr x15, [x1, w14, uxtw #3]\");\r\n           
             sb.AppendLine(\"  add w14, w14, 1\");\r\n                        sb.AppendLine(\"  ldr w13, [x15, w16, uxtw #2]\");\r\n\r\n                        // use the target index (w13) to index into the jump table, and branch on it\r\n                        sb.AppendLine(\"  ldr x17, [x9, w13, uxtw #3]\");\r\n\r\n                        // global history assist branches\r\n                        // w13 = index into jump table. make that correlate with global history\r\n                        if (this.assists)\r\n                        {\r\n                            sb.AppendLine(\"  mov x18, 1\");\r\n                            sb.AppendLine(\"  eor w12, w12, w12\");\r\n                            for (int eaxBits = 0; eaxBits < globalHistoryAssistBits; eaxBits++)\r\n                            {\r\n                                string targetName = functionLabel + \"branch\" + branchIdx + \"ghist\" + eaxBits;\r\n                                sb.AppendLine(\"  and w12, w13, w18\");\r\n                                sb.AppendLine($\"  cbnz w12, {targetName}\");\r\n                                sb.AppendLine(\"  nop\");\r\n                                sb.AppendLine($\"{targetName}:\");\r\n                                sb.AppendLine(\"  lsl w18, w18, 1\");\r\n                            }\r\n                        }\r\n\r\n                        // branch on value of x17\r\n                        sb.AppendLine($\"  br x17\");\r\n                        sb.AppendLine(\"  nop\");\r\n\r\n                        // generate targets\r\n                        for (int targetIdx = 0; targetIdx < currentTargetCount; targetIdx++)\r\n                        {\r\n                            sb.AppendLine(GetTargetLabelName(currentBranchCount, currentTargetCount, branchIdx, targetIdx) + \":\");\r\n                            sb.AppendLine($\"  nop\");\r\n                        }\r\n                    }\r\n\r\n                    // increment w16, and 
basically cmov 0 -> w16 if w16 = list length\r\n                    sb.AppendLine(\"  add w16, w16, 1\");\r\n                    sb.AppendLine(\"  cmp w16, w2\");\r\n                    sb.AppendLine(\"  csel w16, w11, w16, EQ\");\r\n                    sb.AppendLine(\"  sub x0, x0, 1\");\r\n                    sb.AppendLine($\"  cbnz x0, {loopLabel}\");\r\n                    sb.AppendLine(\"  mov x0, x12\");\r\n                    // restore every pair from the slot it was saved to in the prologue\r\n                    sb.AppendLine(\"  ldp x9, x10, [sp, #0x40]\");\r\n                    sb.AppendLine(\"  ldp x11, x12, [sp, #0x30]\");\r\n                    sb.AppendLine(\"  ldp x15, x16, [sp, #0x20]\");\r\n                    sb.AppendLine(\"  ldp x13, x14, [sp, #0x10]\");\r\n                    sb.AppendLine(\"  ldp x17, x18, [sp, #0x50]\");\r\n                    sb.AppendLine(\"  add sp, sp, #0x60\");\r\n                    sb.AppendLine(\"  ret\");\r\n                }\r\n            }\r\n        }\r\n\r\n        public void GenerateX86GccAsm(StringBuilder sb)\r\n        {\r\n            for (int targetCountIdx = 0; targetCountIdx < targetCounts.Length; targetCountIdx++)\r\n            {\r\n                int currentTargetCount = targetCounts[targetCountIdx];\r\n                for (int branchCountIdx = 0; branchCountIdx < branchCounts.Length; branchCountIdx++)\r\n                {\r\n                    /* rdi = iteration count\r\n                     * rsi = array of target selection arrays, one for each branch\r\n                     * rdx = length of pattern array\r\n                     * rcx = array of jump tables, one for each branch\r\n                     */\r\n                    int currentBranchCount = branchCounts[branchCountIdx];\r\n                    string functionLabel = GetFunctionName(currentBranchCount, currentTargetCount);\r\n                    sb.AppendLine(\"\\n\" + functionLabel + \":\");\r\n                    sb.AppendLine(\"  push %rbx\");\r\n                    sb.AppendLine(\"  push %r8\");\r\n                    
sb.AppendLine(\"  push %r9\");\r\n                    sb.AppendLine(\"  push %r13\");\r\n                    sb.AppendLine(\"  push %r15\");\r\n                    sb.AppendLine(\"  push %r14\");\r\n                    sb.AppendLine(\"  xor %rbx, %rbx\");\r\n                    sb.AppendLine(\"  xor %r8, %r8\");\r\n                    sb.AppendLine(\"  xor %r9, %r9\");\r\n\r\n                    // initialize jump table\r\n                    for (int branchIdx = 0; branchIdx < currentBranchCount; branchIdx++)\r\n                    {\r\n                        // rcx = array of ptrs to jump tables\r\n                        // r9 = index into array of jump tables\r\n                        // r15 = ptr to jump table\r\n\r\n                        // load jump table base address into r15\r\n                        sb.AppendLine(\"  mov (%rcx,%r9,8), %r15\");\r\n                        for (int targetIdx = 0; targetIdx < currentTargetCount; targetIdx++)\r\n                        {\r\n                            // assuming 64-bit pointers and 4K page size\r\n                            // use rbx = index into\r\n                            string targetLabelName = GetTargetLabelName(currentBranchCount, currentTargetCount, branchIdx, targetIdx);\r\n                            sb.AppendLine($\"  lea {targetLabelName}(%rip), %rax\");\r\n                            sb.AppendLine($\"  mov %rax, (%r15,%rbx,8)\");\r\n                            sb.AppendLine(\"  inc %rbx\");\r\n                        }\r\n\r\n                        sb.AppendLine(\"  xor %rbx, %rbx\");\r\n                        sb.AppendLine(\"  inc %r9\");\r\n                    }\r\n\r\n                    sb.AppendLine(\"  xor %r8, %r8\");\r\n                    sb.AppendLine(\"  xor %r9, %r9\");\r\n\r\n                    string loopLabel = functionLabel + \"_loop\";\r\n                    sb.AppendLine(\"\\n\" + loopLabel + \":\");\r\n                    sb.AppendLine(\"  xor %r11, %r11\"); // set 
index into arr of arrs to 0\r\n                    for (int branchIdx = 0; branchIdx < currentBranchCount; branchIdx++)\r\n                    {\r\n                        sb.AppendLine(\"  mov (%rcx,%r11,8), %r15\");  // load jump table base pointer into r15\r\n                        sb.AppendLine(\"  mov (%rsi,%r11,8), %r10\");  // load target select array base pointer into r10\r\n                        sb.AppendLine(\"  inc %r11\");\r\n                        sb.AppendLine(\"  mov (%r10,%rbx,4), %eax\"); // get the target for the current iteration into eax\r\n                        sb.AppendLine(\"  mov (%r15,%rax,8), %r14\");  // load address of jump target from jump table\r\n\r\n                        if (assists)\r\n                        {\r\n                            // rsi is reused as the bit mask below, so park the target select array pointer in r13\r\n                            sb.AppendLine(\"  mov %rsi, %r13\");\r\n                            sb.AppendLine(\"  mov $1, %rsi\");\r\n                            for (int eaxBits = 0; eaxBits < 7; eaxBits++)\r\n                            {\r\n                                string targetName = functionLabel + \"branch\" + branchIdx + \"ghist\" + eaxBits;\r\n                                sb.AppendLine(\"  test %eax, %esi\");\r\n                                sb.AppendLine($\"  jnz {targetName}\");\r\n                                sb.AppendLine(\"  nop\");\r\n                                sb.AppendLine($\"{targetName}:\");\r\n\r\n                                sb.AppendLine(\"  shl $1, %esi\");\r\n                            }\r\n\r\n                            // restore rsi only when it was saved above. without assists r13 is never\r\n                            // loaded, and copying it would clobber the target select array pointer\r\n                            sb.AppendLine(\"  mov %r13, %rsi\");\r\n                        }\r\n\r\n                        sb.AppendLine(\"  jmp *%r14\");                // and jump to it\r\n                        // generate targets\r\n                        for (int targetIdx = 0; targetIdx < currentTargetCount; targetIdx++)\r\n                        {\r\n                            sb.AppendLine(GetTargetLabelName(currentBranchCount, currentTargetCount, branchIdx, targetIdx) 
+ \":\");\r\n                            sb.AppendLine($\"  nop\");\r\n                        }\r\n                    }\r\n\r\n                    // loop around in pattern history test array if necessary\r\n                    // avoiding an extra branch to not pollute BPU history\r\n                    sb.AppendLine(\"  inc %rbx\");\r\n                    sb.AppendLine(\"  cmp %rbx, %rdx\");\r\n                    sb.AppendLine(\"  cmove %r9, %rbx\");\r\n\r\n                    // end of main loop over iteration count\r\n                    sb.AppendLine(\"  dec %rdi\");\r\n                    sb.AppendLine(\"  jnz \" + loopLabel);\r\n\r\n                    // function epilogue\r\n                    sb.AppendLine(\"  mov %r8, %rax\");\r\n                    sb.AppendLine(\"  pop %r14\");\r\n                    sb.AppendLine(\"  pop %r15\");\r\n                    sb.AppendLine(\"  pop %r13\");\r\n                    sb.AppendLine(\"  pop %r9\");\r\n                    sb.AppendLine(\"  pop %r8\");\r\n                    sb.AppendLine(\"  pop %rbx\");\r\n                    sb.AppendLine(\"  ret\");\r\n                }\r\n            }\r\n        }\r\n\r\n        public void GenerateMipsAsm(StringBuilder sb)\r\n        {\r\n            for (int targetCountIdx = 0; targetCountIdx < targetCounts.Length; targetCountIdx++)\r\n            {\r\n                int currentTargetCount = targetCounts[targetCountIdx];\r\n                for (int branchCountIdx = 0; branchCountIdx < branchCounts.Length; branchCountIdx++)\r\n                {\r\n                    /* r4 = iteration count\r\n                     * r5 = array of target selection arrays, one for each branch\r\n                     * r6 = length of pattern array\r\n                     * r7 = array of jump tables, one for each branch\r\n                     */\r\n                    int currentBranchCount = branchCounts[branchCountIdx];\r\n                    string functionLabel = 
GetFunctionName(currentBranchCount, currentTargetCount);\r\n                    sb.AppendLine(\"\\n\" + functionLabel + \":\");\r\n\r\n                    // initialize jump tables. r12-r20 are temporary regs. \r\n                    sb.AppendLine(\"  move $r13, $r7\"); // use r13 to access array of pointers to jump tables\r\n                    for (int branchIdx = 0; branchIdx < currentBranchCount; branchIdx++)\r\n                    {\r\n                        sb.AppendLine(\"  ld.d $r15, $r13, 0\");          // load address of branch's jump table into r15\r\n\r\n                        // initialize the jump table. r15 = base addr. rely on C# for bounds :)\r\n                        for (int targetIdx = 0; targetIdx < currentTargetCount; targetIdx++)\r\n                        {\r\n                            // write label addresses into array\r\n                            string targetLabelName = GetTargetLabelName(currentBranchCount, currentTargetCount, branchIdx, targetIdx);\r\n                            sb.AppendLine(\"  la $r16, \" + targetLabelName); // load branch target address into r16\r\n                            sb.AppendLine(\"  st.d $r16, $r15, 0\");          // store branch target address\r\n                            sb.AppendLine(\"  addi.d $r15, $r15, 8\");        // increment array pointer\r\n                        }\r\n\r\n                        sb.AppendLine(\"  addi.d $r13, $r13, 8\");    // increment array pointer for array of pointers to jump tables\r\n                    }\r\n\r\n                    // loop through branches for (iterations) times\r\n                    string loopLabel = functionLabel + \"_loop\";\r\n                    sb.AppendLine(\"  move $r14, $r0\");    // r14 = branch target index\r\n                    sb.AppendLine(\"  move $r17, $r0\");\r\n                    sb.AppendLine(\"  addi.d $r17, $r17, 1\"); // use r17 just to store 1\r\n                    sb.AppendLine(\"\\n\" + loopLabel + \":\");\r\n      
              sb.AppendLine(\"  move $r12, $r5\");      // r12 to hold pointer to target selection array\r\n                    sb.AppendLine(\"  move $r13, $r7\");      // r13 to hold pointer to jump target array\r\n                    for (int branchIdx = 0; branchIdx < currentBranchCount; branchIdx++)\r\n                    {\r\n                        sb.AppendLine(\"  ld.d $r16, $r12, 0\"); // r16 = base address of target select array\r\n                        sb.AppendLine(\"  ld.d $r18, $r13, 0\"); // r18 = base address of jump target array\r\n\r\n                        // target select array[target index]\r\n                        sb.AppendLine(\"  alsl.d $r15, $r14, $r0, 0x2\");\r\n                        sb.AppendLine(\"  add.d $r15, $r15, $r16\");\r\n                        sb.AppendLine(\"  ld.w $r19, $r15, 0\");          // load 32-bit target index\r\n\r\n                        sb.AppendLine(\"  alsl.d $r15, $r19, $r0, 0x3\"); // now index into jump table\r\n                        sb.AppendLine(\"  add.d $r15, $r18, $r15\");\r\n                        sb.AppendLine(\"  ld.d $r20, $r15, 0\");\r\n\r\n                        // increment pointers for next branch\r\n                        sb.AppendLine(\"  addi.d $r12, $r12, 8\");\r\n                        sb.AppendLine(\"  addi.d $r13, $r13, 8\");\r\n                        sb.AppendLine(\"  jr $r20\");\r\n\r\n                        // generate targets\r\n                        for (int targetIdx = 0; targetIdx < currentTargetCount; targetIdx++)\r\n                        {\r\n                            sb.AppendLine(GetTargetLabelName(currentBranchCount, currentTargetCount, branchIdx, targetIdx) + \":\");\r\n                            sb.AppendLine($\"  nop\");\r\n                        }\r\n                    }\r\n\r\n                    // loop back. 
and try to reset branch index without a branch\r\n                    sb.AppendLine(\"  addi.d $r14, $r14, 1\"); // if r14 == r6 (pattern array length), set r14 back to 0 somehow\r\n                    sb.AppendLine(\"  sub.d $r12, $r14, $r6\"); // 12 = temporary result of comparison\r\n                    sb.AppendLine(\"  maskeqz $r14, $r14, $r12\"); // if r12 = 0, set r14 to 0. otherwise use current value\r\n                    sb.AppendLine(\"  sub.d $r4, $r4, $r17\");\r\n                    sb.AppendLine(\"  bnez $r4, \" + loopLabel);\r\n                    sb.AppendLine(\"  jr $r1\");\r\n                }\r\n            }\r\n        }\r\n\r\n        // kinda hack this to put in initialization code we need\r\n        public void GenerateExternLines(StringBuilder sb)\r\n        {\r\n            for (int branchCountIdx = 0; branchCountIdx < branchCounts.Length; branchCountIdx++)\r\n                for (int targetCountIdx = 0; targetCountIdx < targetCounts.Length; targetCountIdx++)\r\n                    sb.AppendLine(\"extern uint64_t \" + GetFunctionName(branchCounts[branchCountIdx], targetCounts[targetCountIdx]) + $\"({FunctionDefinitionParameters}) __attribute((sysv_abi));\");\r\n\r\n            GenerateInitializationCode(sb);\r\n            string gccFunction = File.ReadAllText(Path.Combine(Program.DataFilesDir, \"GccIndirectBranchFunction.c\"));\r\n            sb.AppendLine(gccFunction);\r\n        }\r\n\r\n        public void GenerateInitializationCode(StringBuilder sb)\r\n        {\r\n            sb.AppendLine($\"uint32_t maxIndirectBranchCount = {branchCounts.Length};\");\r\n            sb.Append($\"uint32_t indirectBranchCounts[{branchCounts.Length}] = \");\r\n            sb.Append(\"{  \" + branchCounts[0]);\r\n            for (int i = 1; i < branchCounts.Length; i++) sb.Append(\", \" + branchCounts[i]);\r\n            sb.AppendLine(\" };\");\r\n            sb.Append($\"uint32_t indirectBranchTargetCounts[{targetCounts.Length}] = \");\r\n            
sb.Append(\"{  \" + targetCounts[0]);\r\n            for (int i = 1; i < targetCounts.Length; i++) sb.Append(\", \" + targetCounts[i]);\r\n            sb.AppendLine(\" };\");\r\n\r\n            // TODO: need to make this a 2D array - [branch count][target count]\r\n            sb.AppendLine($\"uint64_t (__attribute((sysv_abi)) *indirectBranchTestFuncArr[{branchCounts.Length}][{targetCounts.Length}])({FunctionDefinitionParameters});\");\r\n\r\n            sb.AppendLine(\"void initializeIndirectBranchFuncArr() {\");\r\n            for (int i = 0; i < branchCounts.Length; i++)\r\n            {\r\n                for (int targetCountIdx = 0; targetCountIdx < targetCounts.Length; targetCountIdx++)\r\n                {\r\n                    sb.AppendLine($\"  indirectBranchTestFuncArr[{i}][{targetCountIdx}] = {GetFunctionName(branchCounts[i], targetCounts[targetCountIdx])};\");\r\n                }\r\n            }\r\n\r\n            sb.AppendLine(\"}\");\r\n        }\r\n\r\n        public string Prefix { get; set; }\r\n        public string Description { get; set; }\r\n        public int[] Counts;\r\n        public string FunctionDefinitionParameters { get; set; }\r\n        public string GetFunctionCallParameters { get; set; }\r\n        public bool DivideTimeByCount { get; set; }\r\n        public void GenerateAsmGlobalLines(StringBuilder sb)\r\n        {\r\n            for (int branchCountIdx = 0; branchCountIdx < branchCounts.Length; branchCountIdx++)\r\n                for (int targetCountIdx = 0; targetCountIdx < targetCounts.Length; targetCountIdx++)\r\n                    sb.AppendLine(\".global \" + GetFunctionName(branchCounts[branchCountIdx], targetCounts[targetCountIdx]));\r\n        }\r\n\r\n        public void GenerateTestBlock(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            sb.AppendLine(\"  if (argc > 1 && strncmp(test_name, \\\"\" + Prefix + \"\\\", \" + Prefix.Length + \") == 0) {\");\r\n            sb.AppendLine(\"    printf(\\\"\" 
+ Description + \":\\\\n\\\");\");\r\n            string ibMain = File.ReadAllText(Path.Combine(Program.DataFilesDir, \"IndirectBranchTestBlock.c\"));\r\n            sb.AppendLine(ibMain);\r\n            sb.AppendLine(\"  }\\n\");\r\n        }\r\n    }\r\n}"
  },
  {
    "path": "AsmGen/tests/IntRfDepStoreTest.cs",
    "content": "﻿using System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class IntRfTestDependentStore : UarchTest\r\n    {\r\n        public IntRfTestDependentStore(int low, int high, int step)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"intrfds\";\r\n            this.Description = \"Integer Register File, preceded by a dependent store\";\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr, float *floatArr\";\r\n            this.GetFunctionCallParameters = \"structIterations, A, fpArr\";\r\n            this.DivideTimeByCount = false;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.aarch64) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64)\r\n            {\r\n                string[] unrolledAdds = new string[4];\r\n                unrolledAdds[0] = \"  add %r11, %r15\";\r\n                unrolledAdds[1] = \"  add %r11, %r14\";\r\n                unrolledAdds[2] = \"  add %r11, %r13\";\r\n                unrolledAdds[3] = \"  add %r11, %r12\";\r\n                UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: true);\r\n            }\r\n            else if (isa == IUarchTest.ISA.aarch64)\r\n            {\r\n                string postLoadInstrs1 = \"str w15, [x2, w25, uxtw #2]\";\r\n                string postLoadInstrs2 = \"str w15, [x2, w26, uxtw #2]\";\r\n                string[] unrolledAdds = new string[4];\r\n                unrolledAdds[0] = \"  add x15, x15, x11\";\r\n                unrolledAdds[1] = \"  add x14, x14, x11\";\r\n                unrolledAdds[2] = \"  add x13, x13, x11\";\r\n                unrolledAdds[3] = 
\"  add x12, x12, x11\";\r\n                UarchTestHelpers.GenerateArmAsmStructureTestFuncs(\r\n                    sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: true, postLoadInstrs1: postLoadInstrs1, postLoadInstrs2: postLoadInstrs2);\r\n            }\r\n            else if (isa == IUarchTest.ISA.mips64)\r\n            {\r\n                string[] unrolledAdds = new string[4];\r\n                unrolledAdds[0] = \"  add.d $r15, $r15, $r14\";\r\n                unrolledAdds[1] = \"  add.d $r16, $r16, $r14\";\r\n                unrolledAdds[2] = \"  add.d $r17, $r17, $r14\";\r\n                unrolledAdds[3] = \"  add.d $r18, $r18, $r14\";\r\n                UarchTestHelpers.GenerateMipsAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: true);\r\n            }\r\n            else if (isa == IUarchTest.ISA.riscv)\r\n            {\r\n                string[] unrolledAdds = new string[4];\r\n                unrolledAdds[0] = \"  add x28, x28, x29\";\r\n                unrolledAdds[1] = \"  add x30, x30, x29\";\r\n                unrolledAdds[2] = \"  add x31, x31, x29\";\r\n                unrolledAdds[3] = \"  add x18, x18, x29\";\r\n                UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: true);\r\n            }\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/tests/IntRfTest.cs",
    "content": "﻿using System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class IntRfTest : UarchTest\r\n    {\r\n        private bool initialDependentBranch;\r\n        public IntRfTest(int low, int high, int step, bool initialDependentBranch)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"intrf\" + (initialDependentBranch ? \"db\" : string.Empty);\r\n            this.Description = \"Integer Register File\" + (initialDependentBranch ? \", preceded by dependent branch\" : string.Empty);\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr\";\r\n            this.GetFunctionCallParameters = \"structIterations, A\";\r\n            this.DivideTimeByCount = false;\r\n            this.initialDependentBranch = initialDependentBranch;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (this.initialDependentBranch)\r\n            {\r\n                if (isa == IUarchTest.ISA.aarch64) return true;\r\n                if (isa == IUarchTest.ISA.riscv) return true;\r\n                return false;\r\n            }\r\n\r\n            if (isa == IUarchTest.ISA.amd64) return true;\r\n            if (isa == IUarchTest.ISA.aarch64) return true;\r\n            if (isa == IUarchTest.ISA.mips64) return true;\r\n            if (isa == IUarchTest.ISA.riscv) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64)\r\n            {\r\n                string[] unrolledAdds = new string[4];\r\n                unrolledAdds[0] = \"  add %r11, %r15\";\r\n                unrolledAdds[1] = \"  add %r11, %r14\";\r\n                unrolledAdds[2] = \"  add %r11, %r13\";\r\n                unrolledAdds[3] = \"  add %r11, %r12\";\r\n                
UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: true);\r\n            }\r\n            else if (isa == IUarchTest.ISA.aarch64)\r\n            {\r\n                string postLoadInstrs = this.initialDependentBranch ? UarchTestHelpers.GetArmDependentBranch(this.Prefix) : null;\r\n                string[] unrolledAdds = new string[4];\r\n                unrolledAdds[0] = \"  add x15, x15, x11\";\r\n                unrolledAdds[1] = \"  add x14, x14, x11\";\r\n                unrolledAdds[2] = \"  add x13, x13, x11\";\r\n                unrolledAdds[3] = \"  add x12, x12, x11\";\r\n                UarchTestHelpers.GenerateArmAsmStructureTestFuncs(\r\n                    sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: true, postLoadInstrs1: postLoadInstrs, postLoadInstrs2: postLoadInstrs);\r\n                if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetArmDependentBranchTarget(this.Prefix));\r\n            }\r\n            else if (isa == IUarchTest.ISA.mips64)\r\n            {\r\n                string[] unrolledAdds = new string[4];\r\n                unrolledAdds[0] = \"  add.d $r15, $r15, $r14\";\r\n                unrolledAdds[1] = \"  add.d $r16, $r16, $r14\";\r\n                unrolledAdds[2] = \"  add.d $r17, $r17, $r14\";\r\n                unrolledAdds[3] = \"  add.d $r18, $r18, $r14\";\r\n                UarchTestHelpers.GenerateMipsAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: true);\r\n            }\r\n            else if (isa == IUarchTest.ISA.riscv)\r\n            {\r\n                if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetRiscvDependentBranchTarget(this.Prefix));\r\n                string postLoadInstrs = this.initialDependentBranch ? 
UarchTestHelpers.GetRiscvDependentBranch(this.Prefix) : null;\r\n                string[] unrolledAdds = new string[4];\r\n                unrolledAdds[0] = \"  add x28, x28, x29\";\r\n                unrolledAdds[1] = \"  add x30, x30, x29\";\r\n                unrolledAdds[2] = \"  add x31, x31, x29\";\r\n                unrolledAdds[3] = \"  add x18, x18, x29\";\r\n                UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: true, postLoadInstrs1: postLoadInstrs, postLoadInstrs2: postLoadInstrs);\r\n            }\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/tests/JsCvtNsq.cs",
    "content": "﻿using System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class JsCvtNsq : UarchTest\r\n    {\r\n        private int totalOps;\r\n        public JsCvtNsq(int low, int high, int step, int totalOps)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"jscvtnsq\";\r\n            this.Description = \"FJCVTZS (FP Javascript Convert to Signed Fixed Point, Rounding toward Zero) Scheduler, excluding possible NSQ\";\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr, float *floatArr\";\r\n            this.GetFunctionCallParameters = \"structIterations, A, fpArr\";\r\n            this.DivideTimeByCount = false;\r\n            this.totalOps = totalOps;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.aarch64) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.aarch64)\r\n            {\r\n                string postLoadInstrs1 = \"  ldr d16, [x2, w25, sxtw #0]\";\r\n                string initInstrs = \"  ldr d15, [x2]\";\r\n                string[] depInstrs = new string[4];\r\n                depInstrs[0] = \"  fjcvtzs w15, d16\";\r\n                depInstrs[1] = \"  fjcvtzs w14, d16\";\r\n                depInstrs[2] = \"  fjcvtzs w13, d16\";\r\n                depInstrs[3] = \"  fjcvtzs w12, d16\";\r\n\r\n                string[] indepInstrs = new string[4];\r\n                indepInstrs[0] = \"  fjcvtzs w15, d15\";\r\n                indepInstrs[1] = \"  fjcvtzs w14, d15\";\r\n                indepInstrs[2] = \"  fjcvtzs w13, d15\";\r\n                indepInstrs[3] = \"  fjcvtzs w12, d15\";\r\n                UarchTestHelpers.GenerateArmAsmNsqTestFuncs(sb, this.totalOps, this.Counts, this.Prefix, depInstrs, 
indepInstrs, false, initInstrs,\r\n                    postLoadInstrs: postLoadInstrs1);\r\n            }\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/tests/JsCvtSched.cs",
    "content": "﻿using System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class JsCvtSched : UarchTest\r\n    {\r\n        public JsCvtSched(int low, int high, int step)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"jscvtsched\";\r\n            this.Description = \"FJCVTZS (FP Javascript Convert to Signed Fixed Point, Rounding toward Zero) Scheduler\";\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr, float *floatArr\";\r\n            this.GetFunctionCallParameters = \"structIterations, A, fpArr\";\r\n            this.DivideTimeByCount = false;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.aarch64) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.aarch64)\r\n            {\r\n                string postLoadInstrs1 = \"  ldr d16, [x2, w25, sxtw #0]\";\r\n                string postLoadInstrs2 = \"  ldr d16, [x2, w25, sxtw #0]\";\r\n                string[] unrolledInstrs = new string[4];\r\n                unrolledInstrs[0] = \"  fjcvtzs w15, d16\";\r\n                unrolledInstrs[1] = \"  fjcvtzs w14, d16\";\r\n                unrolledInstrs[2] = \"  fjcvtzs w13, d16\";\r\n                unrolledInstrs[3] = \"  fjcvtzs w12, d16\";\r\n                UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledInstrs, unrolledInstrs, false, null,\r\n                    postLoadInstrs1: postLoadInstrs1, postLoadInstrs2: postLoadInstrs2);\r\n            }\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/tests/JumpNsqTest.cs",
    "content": "﻿using System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class JumpNsqTest : UarchTest\r\n    {\r\n        public JumpNsqTest(int low, int high, int step)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"jumpnsq\";\r\n            this.Description = \"Scheduler, Not-Taken Jumps, excluding possible nsq\";\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr\";\r\n            this.GetFunctionCallParameters = \"structIterations, A\";\r\n            this.DivideTimeByCount = false;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64) return true;\r\n            // if (isa == IUarchTest.ISA.aarch64) return true;\r\n            // if (isa == IUarchTest.ISA.mips64) return true;\r\n            // if (isa == IUarchTest.ISA.riscv) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64)\r\n            {\r\n                string[] dependentJumps = new string[1];\r\n                dependentJumps[0] = \"  cmp %rdi, %rsi\\n  je jumpnsq_reallybadthing\";\r\n                string[] independentJumps = new string[1];\r\n                independentJumps[0] = \"  cmp %r13, %r14\\n  je jumpnsq_reallybadthing\";\r\n                UarchTestHelpers.GenerateX86AsmNsqTestFuncs(sb, this.Counts[this.Counts.Length - 1], this.Counts, this.Prefix, dependentJumps, independentJumps);\r\n\r\n                sb.AppendLine(\"jumpnsq_reallybadthing:\\n  int3\");\r\n            }\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/tests/JumpSchedTest.cs",
    "content": "﻿using System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class JumpSchedTest : UarchTest\r\n    {\r\n        public JumpSchedTest(int low, int high, int step)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"jumpsched\";\r\n            this.Description = \"Scheduler, Not-Taken Jumps\";\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr\";\r\n            this.GetFunctionCallParameters = \"structIterations, A\";\r\n            this.DivideTimeByCount = false;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64) return true;\r\n            if (isa == IUarchTest.ISA.aarch64) return true;\r\n            // if (isa == IUarchTest.ISA.mips64) return true;\r\n            if (isa == IUarchTest.ISA.riscv) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64)\r\n            {\r\n                string[] unrolledJumps = new string[1];\r\n                unrolledJumps[0] = \"  cmp %rdi, %rsi\\n  je jumpsched_reallybadthing\";\r\n                UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledJumps, unrolledJumps, includePtrChasingLoads: true);\r\n\r\n                sb.AppendLine(\"jumpsched_reallybadthing:\\n  int3\");\r\n            }\r\n            else if (isa == IUarchTest.ISA.aarch64)\r\n            {\r\n                string[] unrolledJumps = new string[1];\r\n                unrolledJumps[0] = \"  cmp x25, x26\\n  b.eq jumpsched_reallybadthing\";\r\n                UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledJumps, unrolledJumps, includePtrChasingLoads: true);\r\n                sb.AppendLine(\"jumpsched_reallybadthing:\\n  
.word 0xf7f0a000\");\r\n            }\r\n            else if (isa == IUarchTest.ISA.riscv)\r\n            {\r\n                // todo\r\n                string[] unrolledJumps = new string[1];\r\n                unrolledJumps[0] = \"  beq x5, x6, jumpsched_reallybadthing\";\r\n                UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledJumps, unrolledJumps, false);\r\n                sb.AppendLine(\"jumpsched_reallybadthing:\\n  .word 0x00000000\");\r\n            }\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/tests/LdqTest.cs",
    "content": "﻿using System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class LdqTest : UarchTest\r\n    {\r\n        bool initialDependentBranch;\r\n        public LdqTest(int low, int high, int step, bool initialDependentBranch)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"ldq\" + (initialDependentBranch ? \"db\" : string.Empty);\r\n            this.Description = \"Load Queue\" + (initialDependentBranch ? \", preceded by dependent branch\"  : string.Empty);\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr, float *floatArr\";\r\n            this.GetFunctionCallParameters = \"structIterations, A, fpArr\";\r\n            this.DivideTimeByCount = false;\r\n            this.initialDependentBranch = initialDependentBranch;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (this.initialDependentBranch)\r\n            {\r\n                if (isa == IUarchTest.ISA.aarch64) return true;\r\n                if (isa == IUarchTest.ISA.riscv) return true;\r\n                return false;\r\n            }\r\n\r\n            if (isa == IUarchTest.ISA.amd64) return true;\r\n            if (isa == IUarchTest.ISA.aarch64) return true;\r\n            if (isa == IUarchTest.ISA.mips64) return true;\r\n            if (isa == IUarchTest.ISA.riscv) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64)\r\n            {\r\n                string[] unrolledLoads = new string[4];\r\n                unrolledLoads[0] = \"  mov (%r8), %r15\";\r\n                unrolledLoads[1] = \"  mov (%r8), %r14\";\r\n                unrolledLoads[2] = \"  mov (%r8), %r13\";\r\n                unrolledLoads[3] = \"  mov (%r8), %r12\";\r\n                
UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledLoads, unrolledLoads, includePtrChasingLoads: true);\r\n            }\r\n            else if (isa == IUarchTest.ISA.aarch64)\r\n            {\r\n                string postLoadInstr = this.initialDependentBranch ? UarchTestHelpers.GetArmDependentBranch(this.Prefix) : null;\r\n                string[] unrolledLoads = new string[4];\r\n                unrolledLoads[0] = \"  ldr x15, [x2]\";\r\n                unrolledLoads[1] = \"  ldr x14, [x2]\";\r\n                unrolledLoads[2] = \"  ldr x13, [x2]\";\r\n                unrolledLoads[3] = \"  ldr x12, [x2]\";\r\n                UarchTestHelpers.GenerateArmAsmStructureTestFuncs(\r\n                    sb, this.Counts, this.Prefix, unrolledLoads, unrolledLoads, includePtrChasingLoads: true, postLoadInstrs1: postLoadInstr, postLoadInstrs2: postLoadInstr);\r\n                if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetArmDependentBranchTarget(this.Prefix));\r\n            }\r\n            else if (isa == IUarchTest.ISA.mips64)\r\n            {\r\n                string[] unrolledLoads = new string[4];\r\n                unrolledLoads[0] = \"  ld.d $r15, $r6, 0\";\r\n                unrolledLoads[1] = \"  ld.d $r16, $r6, 8\";\r\n                unrolledLoads[2] = \"  ld.d $r17, $r6, 16\";\r\n                unrolledLoads[3] = \"  ld.d $r18, $r6, 24\";\r\n                UarchTestHelpers.GenerateMipsAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledLoads, unrolledLoads, includePtrChasingLoads: true);\r\n            }\r\n            else if (isa == IUarchTest.ISA.riscv)\r\n            {\r\n                string postLoadInstrs = this.initialDependentBranch ? 
UarchTestHelpers.GetRiscvDependentBranch(this.Prefix) : null;\r\n                string[] unrolledLoads = new string[4];\r\n                unrolledLoads[0] = \"  ld x28, (x11)\";\r\n                unrolledLoads[1] = \"  ld x29, 8(x11)\";\r\n                unrolledLoads[2] = \"  ld x30, 16(x11)\";\r\n                unrolledLoads[3] = \"  ld x31, 24(x11)\";\r\n                UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledLoads, unrolledLoads, \r\n                    includePtrChasingLoads: true, postLoadInstrs1: postLoadInstrs, postLoadInstrs2: postLoadInstrs);\r\n                if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetRiscvDependentBranchTarget(this.Prefix));\r\n            }\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/tests/LeaSchedTest.cs",
    "content": "﻿using System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class LeaSchedTest : UarchTest\r\n    {\r\n        public LeaSchedTest(int low, int high, int step)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"leasched\";\r\n            this.Description = \"Scheduler, lea with base + index + offset\";\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr\";\r\n            this.GetFunctionCallParameters = \"structIterations, A\";\r\n            this.DivideTimeByCount = false;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64)\r\n            {\r\n                string[] unrolledAdds = new string[4];\r\n                unrolledAdds[0] = \"  lea 128(%r15, %rdi), %r15\";\r\n                unrolledAdds[1] = \"  lea 128(%r14, %rdi), %r14\";\r\n                unrolledAdds[2] = \"  lea 128(%r13, %rdi), %r13\";\r\n                unrolledAdds[3] = \"  lea 128(%r12, %rdi), %r12\";\r\n                UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: false);\r\n            }\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/tests/LoadNsq.cs",
    "content": "﻿using System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class LoadNsq : UarchTest\r\n    {\r\n        public LoadNsq(int low, int high, int step)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"loadnsq\";\r\n            this.Description = \"Load Address Scheduler, Excluding any NSQ\";\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr, float *floatArr\";\r\n            this.GetFunctionCallParameters = \"structIterations, A, fpArr\";\r\n            this.DivideTimeByCount = false;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.aarch64) return true;\r\n            if (isa == IUarchTest.ISA.amd64) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64)\r\n            {\r\n                string[] dep = new string[3];\r\n                dep[0] = \"  mov (%r8, %rdi, 4), %r15\";\r\n                dep[1] = \"  mov (%r8, %rdi, 4), %r14\";\r\n                dep[2] = \"  mov (%r8, %rdi, 4), %r13\";\r\n\r\n                string[] indep = new string[3];\r\n                indep[0] = \"  mov (%r8), %r15\";\r\n                indep[1] = \"  mov (%r8), %r14\";\r\n                indep[2] = \"  mov (%r8), %r13\";\r\n\r\n                UarchTestHelpers.GenerateX86AsmNsqTestFuncs(sb, this.Counts[this.Counts.Length - 1], this.Counts, this.Prefix, dep, indep, ptrChasingLoadsInSq: true);\r\n            }\r\n            if (isa == IUarchTest.ISA.aarch64)\r\n            {\r\n                string[] dep = new string[3];\r\n                dep[0] = \"  ldr w15, [x2, w25, uxtw #2]\";\r\n                dep[1] = \"  ldr w14, [x2, w25, uxtw #2]\";\r\n                dep[2] = \"  ldr w13, [x2, w25, uxtw #2]\";\r\n\r\n            
    string[] indep = new string[3];\r\n                indep[0] = \"  ldr w12, [x2]\";\r\n                indep[1] = \"  ldr w11, [x2]\";\r\n                indep[2] = \"  ldr w10, [x2]\";\r\n                UarchTestHelpers.GenerateArmAsmNsqTestFuncs(sb, this.Counts[this.Counts.Length - 1], this.Counts, this.Prefix, dep, indep);\r\n            }\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/tests/LoadSchedTest.cs",
    "content": "﻿using System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class LoadSchedTest : UarchTest\r\n    {\r\n        public LoadSchedTest(int low, int high, int step)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"loadsched\";\r\n            this.Description = \"Load Scheduler\";\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr, float *floatArr\";\r\n            this.GetFunctionCallParameters = \"structIterations, A, fpArr\";\r\n            this.DivideTimeByCount = false;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64) return true;\r\n            if (isa == IUarchTest.ISA.aarch64) return true;\r\n            if (isa == IUarchTest.ISA.mips64) return true;\r\n            if (isa == IUarchTest.ISA.riscv) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64)\r\n            {\r\n                string[] dependentLoads = new string[4];\r\n                dependentLoads[0] = \"  mov (%r8, %rdi, 4), %r15\";\r\n                dependentLoads[1] = \"  mov (%r8, %rdi, 4), %r14\";\r\n                dependentLoads[2] = \"  mov (%r8, %rdi, 4), %r13\";\r\n                dependentLoads[3] = \"  mov (%r8, %rdi, 4), %r12\";\r\n\r\n                string[] dependentLoads1 = new string[4];\r\n                dependentLoads1[0] = \"  mov (%r8, %rsi, 4), %r15\";\r\n                dependentLoads1[1] = \"  mov (%r8, %rsi, 4), %r14\";\r\n                dependentLoads1[2] = \"  mov (%r8, %rsi, 4), %r13\";\r\n                dependentLoads1[3] = \"  mov (%r8, %rsi, 4), %r12\";\r\n                UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, dependentLoads, dependentLoads1, 
includePtrChasingLoads: true);\r\n            }\r\n            else if (isa == IUarchTest.ISA.aarch64)\r\n            {\r\n                string[] dependentLoads = new string[4];\r\n                dependentLoads[0] = \"  ldr w15, [x2, w25, uxtw #2]\";\r\n                dependentLoads[1] = \"  ldr w14, [x2, w25, uxtw #2]\";\r\n                dependentLoads[2] = \"  ldr w13, [x2, w25, uxtw #2]\";\r\n                dependentLoads[3] = \"  ldr w12, [x2, w25, uxtw #2]\";\r\n\r\n                string[] dependentLoads1 = new string[4];\r\n                dependentLoads1[0] = \"  ldr w15, [x2, w26, uxtw #2]\";\r\n                dependentLoads1[1] = \"  ldr w14, [x2, w26, uxtw #2]\";\r\n                dependentLoads1[2] = \"  ldr w13, [x2, w26, uxtw #2]\";\r\n                dependentLoads1[3] = \"  ldr w12, [x2, w26, uxtw #2]\";\r\n                UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, dependentLoads, dependentLoads1, includePtrChasingLoads: true);\r\n            }\r\n            else if (isa == IUarchTest.ISA.mips64)\r\n            {\r\n                string postLoadInstrs1 = \"  andi $r19, $r12, 0xF\\n  add.d $r19, $r19, $r6\";\r\n                string[] dependentLoads = new string[4];\r\n                dependentLoads[0] = \"  ld.d $r15, $r19, 0\";\r\n                dependentLoads[1] = \"  ld.d $r16, $r19, 8\";\r\n                dependentLoads[2] = \"  ld.d $r17, $r19, 12\";\r\n                dependentLoads[3] = \"  ld.d $r18, $r19, 16\";\r\n\r\n                string postLoadInstrs2 = \"  andi $r19, $r13, 0xF\\n  add.d $r19, $r19, $r6\";\r\n                UarchTestHelpers.GenerateMipsAsmStructureTestFuncs(\r\n                    sb, this.Counts, this.Prefix, dependentLoads, dependentLoads, includePtrChasingLoads: true, null, \r\n                    postLoadInstrs1: postLoadInstrs1, postLoadInstrs2: postLoadInstrs2);\r\n            }\r\n            else if (isa == IUarchTest.ISA.riscv)\r\n            {\r\n           
     // x5 and x6 are pointer chasing loads\r\n                string postLoadInstrs1 = \"  andi x7, x5, 0xF\\n  add x7, x7, x12\";\r\n                string postLoadInstrs2 = \"  andi x7, x6, 0xF\\n  add x7, x7, x12\";\r\n                string[] dependentLoads = new string[4];\r\n                dependentLoads[0] = \"  ld x28, (x7)\";\r\n                dependentLoads[1] = \"  ld x29, 8(x7)\";\r\n                dependentLoads[2] = \"  ld x30, 16(x7)\";\r\n                dependentLoads[3] = \"  ld x31, 24(x7)\";\r\n                UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, dependentLoads, dependentLoads, includePtrChasingLoads: true,\r\n                    postLoadInstrs1: postLoadInstrs1, postLoadInstrs2: postLoadInstrs2);\r\n            }\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/tests/MaddSchedTest.cs",
    "content": "﻿using System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class MaddSchedTest : UarchTest\r\n    {\r\n        public MaddSchedTest(int low, int high, int step)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"maddsched\";\r\n            this.Description = \"Scheduler, Integer Multiply-Add\";\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr\";\r\n            this.GetFunctionCallParameters = \"structIterations, A\";\r\n            this.DivideTimeByCount = false;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.aarch64) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.aarch64)\r\n            {\r\n                string[] unrolledMuls = new string[4];\r\n                unrolledMuls[0] = \"  madd x15, x15, x25, x10\";\r\n                unrolledMuls[1] = \"  madd x14, x14, x25, x10\";\r\n                unrolledMuls[2] = \"  madd x13, x13, x25, x10\";\r\n                unrolledMuls[3] = \"  madd x12, x12, x25, x10\";\r\n                UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledMuls, unrolledMuls, includePtrChasingLoads: false);\r\n            }\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/tests/MaskRfTest.cs",
    "content": "﻿using System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class MaskRfTest : UarchTest\r\n    {\r\n        public MaskRfTest(int low, int high, int step)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"maskrf\";\r\n            this.Description = \"Mask Registers - AVX-512 only\";\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr\";\r\n            this.GetFunctionCallParameters = \"structIterations, A\";\r\n            this.DivideTimeByCount = false;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64)\r\n            {\r\n                string[] unrolledAdds = new string[4];\r\n                unrolledAdds[0] = \"  kaddb %k0, %k1, %k1\";\r\n                unrolledAdds[1] = \"  kaddb %k0, %k2, %k2\";\r\n                unrolledAdds[2] = \"  kaddb %k0, %k3, %k3\";\r\n                unrolledAdds[3] = \"  kaddb %k0, %k4, %k4\";\r\n                UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, false);\r\n            }\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/tests/MixAddJumpSched.cs",
    "content": "﻿using System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class MixAddJumpSchedTest : UarchTest\r\n    {\r\n        public MixAddJumpSchedTest(int low, int high, int step)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"mixaddjumpsched\";\r\n            this.Description = \"Scheduler, Mixed Adds and Not-Taken Jumps in 3:1 ratio\";\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr\";\r\n            this.GetFunctionCallParameters = \"structIterations, A\";\r\n            this.DivideTimeByCount = false;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64) return true;\r\n            if (isa == IUarchTest.ISA.aarch64) return true;\r\n            // if (isa == IUarchTest.ISA.mips64) return true;\r\n            // if (isa == IUarchTest.ISA.riscv) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64)\r\n            {\r\n                string[] unrolledJumps = new string[4];\r\n                unrolledJumps[0] = \"  cmp %rdi, %rsi\\n  je mixaddjumpsched_reallybadthing\";\r\n                unrolledJumps[1] = \"  add %rsi, %r15\";\r\n                unrolledJumps[2] = \"  add %rsi, %r14\";\r\n                unrolledJumps[3] = \"  add %rsi, %r14\";\r\n                UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledJumps, unrolledJumps, includePtrChasingLoads: true);\r\n\r\n                sb.AppendLine(\"mixaddjumpsched_reallybadthing:\\n  int3\");\r\n            }\r\n            else if (isa == IUarchTest.ISA.aarch64)\r\n            {\r\n                string[] unrolledJumps = new string[4];\r\n                unrolledJumps[0] = \"  cmp x25, x26\\n  b.eq 
mixaddjumpsched_reallybadthing\";\r\n                unrolledJumps[1] = \"  add x15, x15, x25\";\r\n                unrolledJumps[2] = \"  add x14, x14, x25\";\r\n                unrolledJumps[3] = \"  add x14, x14, x25\";\r\n                UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledJumps, unrolledJumps, includePtrChasingLoads: true);\r\n                sb.AppendLine(\"mixaddjumpsched_reallybadthing:\\n  .word 0xf7f0a000\");\r\n            }\r\n            else if (isa == IUarchTest.ISA.riscv)\r\n            {\r\n                // todo\r\n                string[] unrolledAdds = new string[4];\r\n                unrolledAdds[0] = \"  mul x30, x30, x5\";\r\n                unrolledAdds[1] = \"  mul x29, x29, x5\";\r\n                unrolledAdds[2] = \"  mul x28, x28, x5\";\r\n                unrolledAdds[3] = \"  mul x31, x31, x5\";\r\n\r\n                string[] unrolledAdds1 = new string[4];\r\n                unrolledAdds1[0] = \"  mul x30, x30, x6\";\r\n                unrolledAdds1[1] = \"  mul x31, x31, x6\";\r\n                unrolledAdds1[2] = \"  mul x28, x28, x6\";\r\n                unrolledAdds1[3] = \"  mul x29, x29, x6\";\r\n                UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds1, false);\r\n            }\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/tests/MixAddvJsCvtNsq.cs",
    "content": "﻿using System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class MixAddvJsCvtNsq : UarchTest\r\n    {\r\n        public MixAddvJsCvtNsq(int low, int high, int step)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"mixaddvjscvtnsq\";\r\n            this.Description = \"ADDV and fjcvtzs Scheduler, Excluding any NSQ\";\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr, float *floatArr\";\r\n            this.GetFunctionCallParameters = \"structIterations, A, fpArr\";\r\n            this.DivideTimeByCount = false;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.aarch64) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.aarch64)\r\n            {\r\n                string postLoadInstrs1 = \"  ldr q16, [x2, w25, sxtw #0]\\n  ldr d2, [x2, w25, sxtw #0]\";\r\n                string initInstrs = \"  ldr q17, [x2]\\n  ldr d15, [x2]\";\r\n                string[] depInstrs = new string[4];\r\n                depInstrs[0] = \"  addv h1, v16.4h\";\r\n                depInstrs[1] = \"  fjcvtzs w15, d2\";\r\n                depInstrs[2] = \"  addv h3, v16.4h\";\r\n                depInstrs[3] = \"  fjcvtzs w14, d2\";\r\n\r\n                string[] indepInstrs = new string[4];\r\n                indepInstrs[0] = \"  addv h4, v17.4h\";\r\n                indepInstrs[1] = \"  fjcvtzs w12, d15\";\r\n                indepInstrs[2] = \"  addv h5, v17.4h\";\r\n                indepInstrs[3] = \"  fjcvtzs w13, d15\";\r\n                UarchTestHelpers.GenerateArmAsmNsqTestFuncs(sb, this.Counts[this.Counts.Length - 1], this.Counts, this.Prefix, depInstrs, indepInstrs, false, initInstrs: initInstrs,\r\n                    
postLoadInstrs: postLoadInstrs1);\r\n            }\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/tests/MixAddvJsCvtSched.cs",
    "content": "﻿using System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class MixAddvJsCvtSched : UarchTest\r\n    {\r\n        public MixAddvJsCvtSched(int low, int high, int step)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"mixaddvjscvtsched\";\r\n            this.Description = \"ADDV and fjcvtzs Scheduler\";\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr, float *floatArr\";\r\n            this.GetFunctionCallParameters = \"structIterations, A, fpArr\";\r\n            this.DivideTimeByCount = false;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.aarch64) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.aarch64)\r\n            {\r\n                string postLoadInstrs1 = \"  ldr q16, [x2, w25, sxtw #0]\\n  ldr d2, [x2, w25, sxtw #0]\";\r\n                string postLoadInstrs2 = \"  ldr q16, [x2, w26, sxtw #0]\\n  ldr d2, [x2, w26, sxtw #0]\";\r\n                string[] unrolledInstrs = new string[4];\r\n                unrolledInstrs[0] = \"  addv h1, v16.4h\";\r\n                unrolledInstrs[1] = \"  fjcvtzs w15, d2\";\r\n                unrolledInstrs[2] = \"  addv h3, v16.4h\";\r\n                unrolledInstrs[3] = \"  fjcvtzs w14, d2\";\r\n                UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledInstrs, unrolledInstrs, false, null,\r\n                    postLoadInstrs1: postLoadInstrs1, postLoadInstrs2: postLoadInstrs2);\r\n            }\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/tests/MixBranchStoreTest.cs",
    "content": "﻿using System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class MixBranchStoreTest : UarchTest\r\n    {\r\n        private bool mixNops;\r\n        private bool initialDependentBranch;\r\n        public MixBranchStoreTest(int low, int high, int step, bool mixNops = false, bool initialDependentBranch = false)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"mixstqbob\" + (initialDependentBranch ? \"db\" : string.Empty);\r\n            this.Description = \"Mixed NT branches and stores\" + (initialDependentBranch ? \", preceded by dependent branch\" : string.Empty); ;\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr, float *floatArr\";\r\n            this.GetFunctionCallParameters = \"structIterations, A, fpArr\";\r\n            this.DivideTimeByCount = false;\r\n            this.mixNops = mixNops;\r\n            this.initialDependentBranch = initialDependentBranch;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (this.initialDependentBranch && isa != IUarchTest.ISA.aarch64) return false;\r\n            if (isa == IUarchTest.ISA.aarch64) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.aarch64)\r\n            {\r\n                GenerateArmAsm(sb);\r\n                if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetArmDependentBranchTarget(this.Prefix));\r\n            }\r\n        }\r\n\r\n        public void GenerateArmAsm(StringBuilder sb)\r\n        {\r\n            string dependentBranch = this.initialDependentBranch ? 
UarchTestHelpers.GetArmDependentBranch(this.Prefix) : null;\r\n            for (int i = 0; i < Counts.Length; i++)\r\n            {\r\n                string funcName = Prefix + Counts[i];\r\n\r\n                sb.AppendLine(\"\\n\" + funcName + \":\");\r\n                sb.AppendLine(\"  sub sp, sp, #0x50\");\r\n                sb.AppendLine(\"  stp x14, x15, [sp, #0x10]\");\r\n                sb.AppendLine(\"  stp x12, x13, [sp, #0x20]\");\r\n                sb.AppendLine(\"  stp x10, x11, [sp, #0x30]\");\r\n                sb.AppendLine(\"  stp x25, x26, [sp, #0x40]\");\r\n                sb.AppendLine(\"  mov x15, 1\");\r\n                sb.AppendLine(\"  mov x14, 2\");\r\n                sb.AppendLine(\"  mov x13, 3\");\r\n                sb.AppendLine(\"  mov x12, 4\");\r\n                sb.AppendLine(\"  mov x11, 5\");\r\n                sb.AppendLine(\"  mov x10, 6\");\r\n\r\n                sb.AppendLine(\"  mov w25, 0x0\");\r\n                sb.AppendLine(\"  mov w26, 0x40\");\r\n                sb.AppendLine(\"\\n\" + funcName + \"start:\");\r\n                sb.AppendLine(\"  ldr w25, [x1, w25, uxtw #2]\"); // current = A[current]\r\n                if (this.initialDependentBranch) sb.AppendLine(dependentBranch);\r\n                for (int fillerIdx = 0; fillerIdx < Counts[i]; fillerIdx++)\r\n                {\r\n                    string jumpLabel = $\"{funcName}_w25_target{fillerIdx}\";\r\n                    sb.AppendLine($\"  cmp x15, x10\");\r\n                    sb.AppendLine($\"  b.eq {jumpLabel}\");\r\n                    sb.AppendLine($\"{jumpLabel}:\");\r\n                }\r\n\r\n                sb.AppendLine(\"  ldr w26, [x1, w26, uxtw #2]\");\r\n                if (this.initialDependentBranch) sb.AppendLine(dependentBranch);\r\n                for (int fillerIdx = 0; fillerIdx < Counts[i]; fillerIdx++)\r\n                {\r\n                    string jumpLabel = $\"{funcName}_w26_target{fillerIdx}\";\r\n                    
sb.AppendLine($\"  cmp x15, x10\");\r\n                    sb.AppendLine($\"  b.eq {jumpLabel}\");\r\n                    sb.AppendLine($\"{jumpLabel}:\");\r\n                }\r\n\r\n                sb.AppendLine(\"  sub x0, x0, 1\");\r\n                sb.AppendLine(\"  cbnz x0, \" + funcName + \"start\");\r\n                sb.AppendLine(\"  ldp x25, x26, [sp, #0x40]\");\r\n                sb.AppendLine(\"  ldp x10, x11, [sp, #0x30]\");\r\n                sb.AppendLine(\"  ldp x12, x13, [sp, #0x20]\");\r\n                sb.AppendLine(\"  ldp x14, x15, [sp, #0x10]\");\r\n                sb.AppendLine(\"  add sp, sp, #0x50\");\r\n                sb.AppendLine(\"  ret\\n\\n\");\r\n            }\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/tests/MixFAdd256and32RfTest.cs",
    "content": "﻿using System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class MixFAdd256and32RfTest : UarchTest\r\n    {\r\n        public MixFAdd256and32RfTest(int low, int high, int step)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"fadd256and32rf\";\r\n            this.Description = \"Mixed 32-bit scalar and 256-bit FP RF capacity\";\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr, float *floatArr\";\r\n            this.GetFunctionCallParameters = \"structIterations, A, fpArr\";\r\n            this.DivideTimeByCount = false;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64) return true;\r\n            if (isa == IUarchTest.ISA.aarch64) return false;\r\n            if (isa == IUarchTest.ISA.mips64) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64)\r\n            {\r\n                string initInstrs = \"  vmovups (%r8), %ymm0\\n\" +\r\n                 \"  movss (%r8), %xmm1\\n\" +\r\n                 \"  vmovups %ymm0, %ymm2\\n\" +\r\n                 \"  movss (%r8), %xmm3\\n\" +\r\n                 \"  vmovups %ymm0, %ymm4\\n\" +\r\n                 \"  movss (%r8), %xmm5\\n\";\r\n\r\n                string[] unrolledAdds = new string[4];\r\n                unrolledAdds[0] = \"  vaddps %ymm0, %ymm1, %ymm1\";\r\n                unrolledAdds[1] = \"  addss %xmm5, %xmm2\";\r\n                unrolledAdds[2] = \"  vaddps %ymm0, %ymm3, %ymm3\";\r\n                unrolledAdds[3] = \"  addss %xmm5, %xmm4\";\r\n\r\n                UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, initInstrs: initInstrs);\r\n            }\r\n            else if (isa 
== IUarchTest.ISA.aarch64)\r\n            {\r\n                \r\n            }\r\n            else if (isa == IUarchTest.ISA.mips64)\r\n            {\r\n                string initInstrs = \"\";\r\n                for (int regIdx = 0; regIdx < 32; regIdx++)\r\n                {\r\n                    initInstrs += \"  xvld $xr\" + regIdx + \", $r6, \" + regIdx * 32 + \"\\n\";\r\n                    initInstrs += \"  fld.s $f\" + regIdx + \", $r6, \" + regIdx * 4 + \"\\n\";\r\n                }\r\n\r\n                string[] unrolledAdds = new string[4];\r\n                unrolledAdds[0] = \"  xvfadd.s $xr1, $xr1, $xr1\";\r\n                unrolledAdds[1] = \"  fadd.s $f11, $f11, $f11\";\r\n                unrolledAdds[2] = \"  xvfadd.s $xr3, $xr3, $xr3\";\r\n                unrolledAdds[3] = \"  fadd.s $f12, $f12, $f12\";\r\n                UarchTestHelpers.GenerateMipsAsmStructureTestFuncs(\r\n                    sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: false, initInstrs: initInstrs);\r\n            }\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/tests/MixFpRfDepBranchTest.cs",
    "content": "﻿using System.Collections.Generic;\r\nusing System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class MixFpRfDepBranchTest : UarchTest\r\n    {\r\n        private int interval;\r\n        public MixFpRfDepBranchTest(int low, int high, int step, int interval)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"mixfprfdepbranch\" + interval;\r\n            this.Description = \"FP Register File, with some dependent branches\";\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr, float *fpArr\";\r\n            this.GetFunctionCallParameters = \"structIterations, A, fpArr\";\r\n            this.DivideTimeByCount = false;\r\n            this.interval = interval;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.aarch64) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.aarch64)\r\n            {\r\n                string initInstrs = \"  ldr s17, [x2]\\n\" +\r\n                    \"  ldr s18, [x2, 4]\\n\" +\r\n                    \"  ldr s19, [x2, 8]\\n\" +\r\n                    \"  ldr s20, [x2, 12]\\n\" +\r\n                    \"  ldr s21, [x2, 16]\\n\";\r\n\r\n                List<string> unrolledAddsList = new List<string>();\r\n                for (int i = 0; i < this.Counts[this.Counts.Length - 1]; i++)\r\n                {\r\n                    int regnum = 18 + (i % 4);\r\n                    unrolledAddsList.Add($\"  fadd s{regnum}, s{regnum}, s17\");\r\n                    if (i % interval == 0) unrolledAddsList.Add(\"  cmp x25, x26\\n  b.eq mixfpjumpsched_badthing\" + interval);\r\n                }\r\n                string[] unrolledAdds = unrolledAddsList.ToArray();\r\n                
UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: true, initInstrs: initInstrs);\r\n\r\n                sb.AppendLine($\"mixfpjumpsched_badthing{interval}:\\n  .word 0xf7f0a000\");\r\n            }\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/tests/MixFpVecRfTest.cs",
    "content": "﻿using System.Collections.Generic;\r\nusing System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class MixFpVecRfTest : UarchTest\r\n    {\r\n        private bool initialDependentBranch;\r\n        public MixFpVecRfTest(int low, int high, int step, bool initialDependentBranch)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"mixfpvecrf\" + (initialDependentBranch ? \"db\" : string.Empty);\r\n            this.Description = \"Mixed FP/128-bit FP vec rf\" + (initialDependentBranch ? \", preceded by dependent branch\" : string.Empty);\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr, float *floatArr\";\r\n            this.GetFunctionCallParameters = \"structIterations, A, fpArr\";\r\n            this.DivideTimeByCount = false;\r\n            this.initialDependentBranch = initialDependentBranch;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (this.initialDependentBranch)\r\n            {\r\n                if (isa == IUarchTest.ISA.riscv) return true;\r\n                return false;\r\n            }\r\n\r\n            if (isa == IUarchTest.ISA.riscv) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.riscv)\r\n            {\r\n                string initInstrs = \"  vsetvli t5, t6, e32\\n  vlw.v v0, (a1)\\n    fld f0, (a1)\";\r\n                string postLoadInstrs = this.initialDependentBranch ? 
UarchTestHelpers.GetRiscvDependentBranch(this.Prefix) : string.Empty;\r\n                postLoadInstrs += \"\\n  mv t6, a2\";\r\n                string[] unrolledInstrs = new string[2];\r\n                unrolledInstrs[0] = \"  vfadd.vv v0, v0, v0\";\r\n                unrolledInstrs[1] = \"  fadd.s f0, f0, f0\";\r\n                UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledInstrs, unrolledInstrs, false,\r\n                    initInstrs: initInstrs, postLoadInstrs1: postLoadInstrs, postLoadInstrs2: postLoadInstrs);\r\n                if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetRiscvDependentBranchTarget(this.Prefix));\r\n            }\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/tests/MixIntRfDepBranchTest.cs",
    "content": "﻿using System.Collections.Generic;\r\nusing System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class MixIntRfDepBranchTest : UarchTest\r\n    {\r\n        private int interval;\r\n        public MixIntRfDepBranchTest(int low, int high, int step, int interval)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"mixintrfdepbranch\" + interval;\r\n            this.Description = \"Integer Register File, with some dependent branches\";\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr\";\r\n            this.GetFunctionCallParameters = \"structIterations, A\";\r\n            this.DivideTimeByCount = false;\r\n            this.interval = interval;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.aarch64) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.aarch64)\r\n            {\r\n                List<string> unrolledAddsList = new List<string>();\r\n                for (int i = 1; i < this.Counts[this.Counts.Length - 1] + 1; i++)\r\n                {\r\n                    int regnum = 12 + (i % 4);\r\n                    unrolledAddsList.Add($\"  add x{regnum}, x{regnum}, x11\");\r\n                    if (i % interval == 0) unrolledAddsList.Add(\"  cmp x25, x26\\n  b.eq mixintjumpsched_badthing\" + interval);\r\n                }\r\n                string[] unrolledAdds = unrolledAddsList.ToArray();\r\n                UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: true);\r\n\r\n                sb.AppendLine($\"mixintjumpsched_badthing{interval}:\\n  .word 0xf7f0a000\");\r\n            }\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/tests/MixIntVec128RfTest.cs",
    "content": "﻿using System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class MixIntVec128RfTest : UarchTest\r\n    {\r\n        private bool initialDependentBranch;\r\n        public MixIntVec128RfTest(int low, int high, int step, bool initialDependentBranch)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"mixintvec128\" + (initialDependentBranch ? \"db\" : string.Empty);\r\n            this.Description = \"Mixed integer and 128-bit vector register file capacity\" + (initialDependentBranch ? \", preceded by dependent branch\" : string.Empty);\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr, float *floatArr\";\r\n            this.GetFunctionCallParameters = \"structIterations, A, fpArr\";\r\n            this.DivideTimeByCount = false;\r\n            this.initialDependentBranch = initialDependentBranch;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (this.initialDependentBranch && isa != IUarchTest.ISA.aarch64) return false;\r\n            if (isa == IUarchTest.ISA.amd64) return true;\r\n            if (isa == IUarchTest.ISA.aarch64) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64)\r\n            {\r\n                string initInstrs = \"  vmovups (%r8), %ymm0\\n\" +\r\n                 \"  movss (%r8), %xmm1\\n\" +\r\n                 \"  vmovups %ymm0, %ymm2\\n\" +\r\n                 \"  movss (%r8), %xmm3\\n\" +\r\n                 \"  vmovups %ymm0, %ymm4\\n\" +\r\n                 \"  movss (%r8), %xmm5\\n\";\r\n\r\n                string[] unrolledAdds = new string[4];\r\n                unrolledAdds[0] = \"  add %r11, %r15\";\r\n                unrolledAdds[1] = \"  addss %xmm5, %xmm2\";\r\n                unrolledAdds[2] = 
\"  add %r11, %r14\";\r\n                unrolledAdds[3] = \"  addss %xmm5, %xmm4\";\r\n\r\n                UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, initInstrs: initInstrs);\r\n            }\r\n            else if (isa == IUarchTest.ISA.aarch64)\r\n            {\r\n                string postLoadInstrs = this.initialDependentBranch ? UarchTestHelpers.GetArmDependentBranch(this.Prefix) : null;\r\n                string initInstrs = \"  ldr q0, [x1]\\n\" +\r\n                \"  ldr q1, [x1, #0x10]\\n\" +\r\n                \"  ldr q2, [x1, #0x20]\\n\" +\r\n                \"  ldr q3, [x1, #0x30]\\n\" +\r\n                \"  ldr q4, [x1, #0x40]\\n\";\r\n\r\n                string[] unrolledAdds = new string[4];\r\n                unrolledAdds[0] = \"  add v1.4s, v1.4s, v0.4s\";\r\n                unrolledAdds[1] = \"  add x15, x15, x11\";\r\n                unrolledAdds[2] = \"  add v2.4s, v2.4s, v0.4s\";\r\n                unrolledAdds[3] = \"  add x14, x14, x11\";\r\n                UarchTestHelpers.GenerateArmAsmStructureTestFuncs(\r\n                    sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, false, initInstrs, postLoadInstrs1: postLoadInstrs, postLoadInstrs2: postLoadInstrs);\r\n                if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetArmDependentBranchTarget(this.Prefix));\r\n            }\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/tests/MixIntrfFprfTest.cs",
    "content": "﻿using System.Collections.Generic;\r\nusing System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class MixIntFpRfTest : UarchTest\r\n    {\r\n        private bool initialDependentBranch;\r\n        public MixIntFpRfTest(int low, int high, int step, bool initialDependentBranch)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"mixintfprf\" + (initialDependentBranch ? \"db\" : string.Empty);\r\n            this.Description = \"Mixed INT/FP Register File\" + (initialDependentBranch ? \", preceded by dependent branch\" : string.Empty);\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr, float *floatArr\";\r\n            this.GetFunctionCallParameters = \"structIterations, A, fpArr\";\r\n            this.DivideTimeByCount = false;\r\n            this.initialDependentBranch = initialDependentBranch;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            //if (this.initialDependentBranch && isa != IUarchTest.ISA.aarch64) return false;\r\n            //if (isa == IUarchTest.ISA.amd64) return true;\r\n            //if (isa == IUarchTest.ISA.aarch64) return true;\r\n            //if (isa == IUarchTest.ISA.mips64) return true;\r\n            if (isa == IUarchTest.ISA.riscv) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64)\r\n            {\r\n                // todo\r\n                string initInstrs = \"  movss (%r8), %xmm1\\n\" +\r\n                    \"  movss 4(%r8), %xmm2\\n\" +\r\n                    \"  movss 8(%r8), %xmm3\\n\" +\r\n                    \"  movss 12(%r8), %xmm4\\n\" +\r\n                    \"  movss 16(%r8), %xmm5\\n\";\r\n\r\n                string[] unrolledAdds = new string[4];\r\n                unrolledAdds[0] = \"  
addss %xmm1, %xmm2\";\r\n                unrolledAdds[1] = \"  addss %xmm1, %xmm3\";\r\n                unrolledAdds[2] = \"  addss %xmm1, %xmm4\";\r\n                unrolledAdds[3] = \"  addss %xmm1, %xmm5\";\r\n                UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: false, initInstrs);\r\n            }\r\n            else if (isa == IUarchTest.ISA.aarch64)\r\n            {// todo\r\n                string postLoadInstrs = this.initialDependentBranch ? UarchTestHelpers.GetArmDependentBranch(this.Prefix) : null;\r\n                string initInstrs = \"  ldr s17, [x2]\\n\" +\r\n                    \"  ldr s18, [x2, 4]\\n\" +\r\n                    \"  ldr s19, [x2, 8]\\n\" +\r\n                    \"  ldr s20, [x2, 12]\\n\" +\r\n                    \"  ldr s21, [x2, 16]\\n\";\r\n\r\n                string[] unrolledAdds = new string[4];\r\n                unrolledAdds[0] = \"  fadd s18, s18, s17\";\r\n                unrolledAdds[1] = \"  fadd s19, s19, s17\";\r\n                unrolledAdds[2] = \"  fadd s20, s20, s17\";\r\n                unrolledAdds[3] = \"  fadd s21, s21, s17\";\r\n                UarchTestHelpers.GenerateArmAsmStructureTestFuncs(\r\n                    sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: false, initInstrs, postLoadInstrs1: postLoadInstrs, postLoadInstrs2: postLoadInstrs);\r\n                if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetArmDependentBranchTarget(this.Prefix));\r\n            }\r\n            else if (isa == IUarchTest.ISA.mips64)\r\n            {// todo\r\n                string initInstrs = \"  fld.s $f8, $r6, 0\\n\" +\r\n                    \"  fld.s $f9, $r6, 4\\n\" +\r\n                    \"  fld.s $f10, $r6, 8\\n\" +\r\n                    \"  fld.s $f11, $r6, 12\\n\" +\r\n                    \"  fld.s $f12, $r6, 16\\n\";\r\n\r\n                string[] 
unrolledAdds = new string[4];\r\n                unrolledAdds[0] = \"  fadd.s $f9, $f9, $f8\";\r\n                unrolledAdds[1] = \"  fadd.s $f10, $f10, $f8\";\r\n                unrolledAdds[2] = \"  fadd.s $f11, $f11, $f8\";\r\n                unrolledAdds[3] = \"  fadd.s $f12, $f12, $f8\";\r\n                UarchTestHelpers.GenerateMipsAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: false, initInstrs);\r\n            }\r\n            else if (isa == IUarchTest.ISA.riscv)\r\n            {\r\n                if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetRiscvDependentBranchTarget(this.Prefix));\r\n                string postLoadInstrs = this.initialDependentBranch ? UarchTestHelpers.GetRiscvDependentBranch(this.Prefix) : string.Empty;\r\n                string initInstrs = \"  fld f0, (x12)\\n\" +\r\n                    \"  fld f1, 8(x12)\\n\" +\r\n                    \"  fld f2, 16(x12)\\n\" +\r\n                    \"  fld f3, 24(x12)\\n\" +\r\n                    \"  fld f4, 32(x12)\\n\";\r\n\r\n                List<string> unrolledAdds = new List<string>();\r\n                /* for C910 */\r\n                for (int i = 0; i < 30; i++) unrolledAdds.Add($\"  fadd.s f{i % 4}, f{i % 4}, f4\");\r\n                for (int i = 0; i < 200; i++) unrolledAdds.Add($\"  add x28, x28, x29\");\r\n                /*unrolledAdds.Add(\"  fadd.s f0, f0, f4\");\r\n                unrolledAdds.Add(\"  add x28, x28, x29\");\r\n                unrolledAdds.Add(\"  fadd.s f1, f1, f4\");\r\n                unrolledAdds.Add(\"  add x30, x30, x29\");\r\n                unrolledAdds.Add(\"  fadd.s f2, f2, f4\");\r\n                unrolledAdds.Add(\"  add x31, x31, x29\");\r\n                unrolledAdds.Add(\"  fadd.s f3, f3, f4\");\r\n                unrolledAdds.Add(\"  add x18, x18, x29\");*/\r\n                string[] unrolledAddsArr = unrolledAdds.ToArray();\r\n                
UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAddsArr, unrolledAddsArr, \r\n                    includePtrChasingLoads: false, initInstrs, postLoadInstrs1: postLoadInstrs, postLoadInstrs2: postLoadInstrs);\r\n            }\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/tests/MixJumpStoreDataSched.cs",
    "content": "﻿using System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class MixJumpStoreDataSched : UarchTest\r\n    {\r\n        public MixJumpStoreDataSched(int low, int high, int step)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"mixjumpstoredatasched\";\r\n            this.Description = \"Scheduler, Mixed Jumps and Store Data\";\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr, float *floatarr\";\r\n            this.GetFunctionCallParameters = \"structIterations, A, fpArr\";\r\n            this.DivideTimeByCount = false;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64) return true;\r\n            //if (isa == IUarchTest.ISA.aarch64) return true;\r\n            // if (isa == IUarchTest.ISA.mips64) return true;\r\n            // if (isa == IUarchTest.ISA.riscv) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64)\r\n            {\r\n                string[] unrolledJumps = new string[4];\r\n                unrolledJumps[0] = \"  cmp %rdi, %rsi\\n  je mixjumpstoredatasched_reallybadthing\";\r\n                unrolledJumps[1] = \"  mov %rdi, (%r8)\";\r\n                unrolledJumps[2] = \"  cmp %rdi, %rsi\\n  je mixjumpstoredatasched_reallybadthing\";\r\n                unrolledJumps[3] = \"  mov %rdi, 64(%r8)\";\r\n                UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledJumps, unrolledJumps, includePtrChasingLoads: true);\r\n\r\n                sb.AppendLine(\"mixjumpstoredatasched_reallybadthing:\\n  int3\");\r\n            }\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/tests/MixJumpStoreSchedTest.cs",
    "content": "﻿using System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class MixJumpStoreSchedTest : UarchTest\r\n    {\r\n        public MixJumpStoreSchedTest(int low, int high, int step)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"mixjumpstoresched\";\r\n            this.Description = \"Scheduler, Mixed Jumps and Stores (Address Dependency)\";\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr, float *floatarr\";\r\n            this.GetFunctionCallParameters = \"structIterations, A, fpArr\";\r\n            this.DivideTimeByCount = false;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64) return true;\r\n            //if (isa == IUarchTest.ISA.aarch64) return true;\r\n            // if (isa == IUarchTest.ISA.mips64) return true;\r\n            // if (isa == IUarchTest.ISA.riscv) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64)\r\n            {\r\n                string[] unrolledJumps = new string[4];\r\n                unrolledJumps[0] = \"  cmp %rdi, %rsi\\n  je mixstorejumpsched_reallybadthing\";\r\n                unrolledJumps[1] = \"  mov %r14, (%r8, %rdi, 2)\";\r\n                unrolledJumps[2] = \"  cmp %rdi, %rsi\\n  je mixstorejumpsched_reallybadthing\";\r\n                unrolledJumps[3] = \"  mov %r14, 64(%r8, %rdi, 2)\";\r\n                UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledJumps, unrolledJumps, includePtrChasingLoads: true);\r\n\r\n                sb.AppendLine(\"mixstorejumpsched_reallybadthing:\\n  int3\");\r\n            }\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/tests/MixJumpThenAddSched.cs",
    "content": "﻿using System.Collections.Generic;\r\nusing System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class MixJumpThenAddSched : UarchTest\r\n    {\r\n        public MixJumpThenAddSched(int low, int high, int step)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"mixjumpthenaddsched\";\r\n            this.Description = \"Scheduler, 40 NT jumps + adds\";\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr\";\r\n            this.GetFunctionCallParameters = \"structIterations, A\";\r\n            this.DivideTimeByCount = false;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            // if (isa == IUarchTest.ISA.amd64) return true;\r\n            if (isa == IUarchTest.ISA.aarch64) return true;\r\n            // if (isa == IUarchTest.ISA.mips64) return true;\r\n            // if (isa == IUarchTest.ISA.riscv) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.aarch64)\r\n            {\r\n                List<string> unrolledJumps = new List<string>();\r\n                int instrIdx;\r\n                for (instrIdx = 0; instrIdx < 40; instrIdx++) unrolledJumps.Add(\"  cmp x25, x26\\n  b.eq mixaddthenjumpsched_reallybadthing\");\r\n                for (; instrIdx < this.Counts[this.Counts.Length - 1]; instrIdx++) unrolledJumps.Add(\"  add x15, x15, x25\");\r\n                string[] instrs = unrolledJumps.ToArray();\r\n                UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, instrs, instrs, includePtrChasingLoads: true, dsb: true);\r\n                sb.AppendLine(\"mixaddthenjumpsched_reallybadthing:\\n  .word 0xf7f0a000\");\r\n            }\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/tests/MixLdqStqTest.cs",
    "content": "﻿using System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class MixLdqStqTest : UarchTest\r\n    {\r\n        private bool initialDependentBranch;\r\n        public MixLdqStqTest(int low, int high, int step, bool initialDependentBranch)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"mixldqstq\" + (initialDependentBranch ? \"db\" : string.Empty);\r\n            this.Description = \"Mixed Load/Store Queue Test (mem ops pending retire)\" + (initialDependentBranch ? \", preceded by dependent branch\" : string.Empty);\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr, int *arr1\";\r\n            this.GetFunctionCallParameters = \"structIterations, A, B\";\r\n            this.initialDependentBranch = initialDependentBranch;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.aarch64) return true;\r\n            if (isa == IUarchTest.ISA.amd64) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64)\r\n            {\r\n                GenerateX86GccAsm(sb);\r\n            }\r\n            else if (isa == IUarchTest.ISA.aarch64)\r\n            {\r\n                GenerateArmAsm(sb);\r\n            }\r\n        }\r\n\r\n        public void GenerateX86GccAsm(StringBuilder sb)\r\n        {\r\n            string[] instrs = new string[4];\r\n            instrs[0] = \"  mov %r15, (%r8)\";\r\n            instrs[1] = \"  mov (%rdx), %r14\";\r\n            instrs[2] = \"  mov %r13, (%r8)\";\r\n            instrs[3] = \"  mov (%rdx), %r12\";\r\n            UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, instrs, instrs, true);\r\n        }\r\n\r\n        public void GenerateArmAsm(StringBuilder 
sb)\r\n        {\r\n            string postLoadInstrs = this.initialDependentBranch ? UarchTestHelpers.GetArmDependentBranch(this.Prefix) : null;\r\n            string[] instrs = new string[4];\r\n            instrs[0] = \"  str x15, [x2]\";\r\n            instrs[1] = \"  ldr x14, [x1]\";\r\n            instrs[2] = \"  str x13, [x2]\";\r\n            instrs[3] = \"  ldr x12, [x1]\";\r\n            UarchTestHelpers.GenerateArmAsmStructureTestFuncs(\r\n                sb, this.Counts, this.Prefix, instrs, instrs, true, postLoadInstrs1: postLoadInstrs, postLoadInstrs2: postLoadInstrs);\r\n            if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetArmDependentBranchTarget(this.Prefix));\r\n        }\r\n    }\r\n}"
  },
  {
    "path": "AsmGen/tests/MixLoadStoreDivSchedTest.cs",
    "content": "﻿using System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class MixLoadStoreDivSchedTest : UarchTest\r\n    {\r\n        public MixLoadStoreDivSchedTest(int low, int high, int step)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"mixloadstoredivsched\";\r\n            this.Description = \"Load/Store Scheduler Capacity Test, using divs to block retirement\";\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int count, int *arr2, int *arr3\";\r\n            this.GetFunctionCallParameters = \"structIterations, list_size, B, A\";\r\n            this.DivideTimeByCount = false;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64) return true;\r\n            if (isa == IUarchTest.ISA.aarch64) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64)\r\n            {\r\n                GenerateX86Asm(sb);\r\n            }\r\n            else if (isa == IUarchTest.ISA.aarch64)\r\n            {\r\n                GenerateArmAsm(sb);\r\n            }\r\n        }\r\n\r\n        public void GenerateX86Asm(StringBuilder sb)\r\n        {\r\n            string[] dependentLoads = new string[2];\r\n            dependentLoads[0] = \"  mov (%r9, %rdx, 4), %r15\";\r\n            dependentLoads[1] = \"  mov %r14, (%r8, %rdx, 4)\";\r\n\r\n            UarchTestHelpers.GenerateX86AsmDivStructureTestFuncs(sb, this.Counts, this.Prefix, dependentLoads, dependentLoads, false);\r\n        }\r\n\r\n        public void GenerateArmAsm(StringBuilder sb)\r\n        {\r\n            string[] dependentLoads = new string[2];\r\n            dependentLoads[0] = \"  ldr w15, [x3, w25, uxtw #2]\";\r\n            dependentLoads[1] = \"  str w14, [x2, w25, 
uxtw #2]\";\r\n\r\n            string[] dependentLoads1 = new string[2];\r\n            dependentLoads1[0] = \"  ldr w15, [x3, w26, uxtw #2]\";\r\n            dependentLoads1[1] = \"  str w14, [x2, w26, uxtw #2]\";\r\n\r\n            UarchTestHelpers.GenerateArmAsmDivStructureTestFuncs(sb, this.Counts, this.Prefix, dependentLoads, dependentLoads1, false);\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/tests/MixLoadStoreSchedTest.cs",
    "content": "﻿using System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class MixLoadStoreSched : UarchTest\r\n    {\r\n        public MixLoadStoreSched(int low, int high, int step)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"mixloadstoresched\";\r\n            this.Description = \"Mixed Load/Store Address Scheduler\";\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr, float *floatArr\";\r\n            this.GetFunctionCallParameters = \"structIterations, A, fpArr\";\r\n            this.DivideTimeByCount = false;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64) return true;\r\n            if (isa == IUarchTest.ISA.aarch64) return true;\r\n            if (isa == IUarchTest.ISA.riscv) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64)\r\n            {\r\n                string[] dependentLoads = new string[4];\r\n                dependentLoads[0] = \"  mov %r15, (%r8, %rdi, 4)\";\r\n                dependentLoads[1] = \"  mov (%r8, %rdi, 2), %r14\";\r\n                dependentLoads[2] = \"  mov %r13, (%r8, %rdi, 4)\";\r\n                dependentLoads[3] = \"  mov (%r8, %rdi, 2), %r12\";\r\n\r\n                string[] dependentLoads1 = new string[4];\r\n                dependentLoads1[0] = \"  mov %r15, (%r8, %rsi, 4)\";\r\n                dependentLoads1[1] = \"  mov (%r8, %rsi, 4), %r14\";\r\n                dependentLoads1[2] = \"  mov %r13, (%r8, %rsi, 4)\";\r\n                dependentLoads1[3] = \"  mov (%r8, %rsi, 4), %r12\";\r\n                UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, dependentLoads, dependentLoads1, includePtrChasingLoads: true);\r\n           
 }\r\n            else if (isa == IUarchTest.ISA.aarch64)\r\n            {\r\n                string[] dependentLoads = new string[4];\r\n                dependentLoads[0] = \"  str w15, [x2, w25, uxtw #2]\";\r\n                dependentLoads[1] = \"  ldr w14, [x1, w25, uxtw #0]\";\r\n                dependentLoads[2] = \"  str w13, [x2, w25, uxtw #2]\";\r\n                dependentLoads[3] = \"  ldr w12, [x1, w25, uxtw #0]\";\r\n\r\n                string[] dependentLoads1 = new string[4];\r\n                dependentLoads1[0] = \"  str w15, [x2, w26, uxtw #2]\";\r\n                dependentLoads1[1] = \"  ldr w14, [x1, w26, uxtw #0]\";\r\n                dependentLoads1[2] = \"  str w13, [x2, w26, uxtw #2]\";\r\n                dependentLoads1[3] = \"  ldr w12, [x1, w26, uxtw #0]\";\r\n                UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, dependentLoads, dependentLoads1, includePtrChasingLoads: true);\r\n            }\r\n            else if (isa == IUarchTest.ISA.riscv)\r\n            {\r\n                // x5 and x6 are pointer chasing loads\r\n                string postLoadInstrs1 = \"  andi x7, x5, 0xF\\n  add x7, x7, x12\";\r\n                string postLoadInstrs2 = \"  andi x7, x6, 0xF\\n  add x7, x7, x12\";\r\n                string[] dependentLoads = new string[4];\r\n                dependentLoads[0] = \"  sd x28, (a2)\";\r\n                dependentLoads[1] = \"  ld x29, 8(a2)\";\r\n                dependentLoads[2] = \"  sd x30, 16(a2)\";\r\n                dependentLoads[3] = \"  ld x31, 24(a2)\";\r\n                UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, dependentLoads, dependentLoads, includePtrChasingLoads: true,\r\n                    postLoadInstrs1: postLoadInstrs1, postLoadInstrs2: postLoadInstrs2);\r\n            }\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/tests/MixStoreDivSchedTest.cs",
    "content": "﻿using System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class MixStoreDivSchedTest : UarchTest\r\n    {\r\n        public MixStoreDivSchedTest(int low, int high, int step)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"mixstoresched\";\r\n            this.Description = \"Store (Mixed Data/Address) Scheduler Capacity Test\";\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int count, int *arr2\";\r\n            this.GetFunctionCallParameters = \"structIterations, list_size, B\";\r\n            this.DivideTimeByCount = false;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64) return true;\r\n            if (isa == IUarchTest.ISA.aarch64) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64)\r\n            {\r\n                GenerateX86Asm(sb);\r\n            }\r\n            else if (isa == IUarchTest.ISA.aarch64)\r\n            {\r\n                GenerateArmAsm(sb);\r\n            }\r\n        }\r\n\r\n        public void GenerateX86Asm(StringBuilder sb)\r\n        {\r\n            string[] dependentStores = new string[4];\r\n            dependentStores[0] = \"  mov %rdx, (%r8, %r15, 4)\";\r\n            dependentStores[1] = \"  mov %r15, (%r8, %rdx, 4)\";\r\n            dependentStores[2] = \"  mov %rdx, (%r8, %r15, 4)\";\r\n            dependentStores[3] = \"  mov %r15, (%r8, %rdx, 4)\";\r\n\r\n            string[] dependentStores1 = new string[4];\r\n            dependentStores1[0] = \"  mov %rdx, (%r8, %r11, 4)\";\r\n            dependentStores1[1] = \"  mov %r11, (%r8, %rdx, 4)\";\r\n            dependentStores1[2] = \"  mov %rdx, (%r8, %r11, 4)\";\r\n            dependentStores1[3] = \"  mov 
%r11, (%r8, %rdx, 4)\";\r\n            UarchTestHelpers.GenerateX86AsmDivStructureTestFuncs(sb, this.Counts, this.Prefix, dependentStores, dependentStores1, false);\r\n        }\r\n\r\n        public void GenerateArmAsm(StringBuilder sb)\r\n        {\r\n            string[] dependentStores = new string[4];\r\n            dependentStores[0] = \"  str w25, [x2, w15, uxtw #2]\";\r\n            dependentStores[1] = \"  str w15, [x2, w25, uxtw #2]\";\r\n            dependentStores[2] = \"  str w25, [x2, w15, uxtw #2]\";\r\n            dependentStores[3] = \"  str w15, [x2, w25, uxtw #2]\";\r\n\r\n            string[] dependentStores1 = new string[4];\r\n            dependentStores1[0] = \"  str w26, [x2, w15, uxtw #2]\";\r\n            dependentStores1[1] = \"  str w15, [x2, w26, uxtw #2]\";\r\n            dependentStores1[2] = \"  str w26, [x2, w15, uxtw #2]\";\r\n            dependentStores1[3] = \"  str w15, [x2, w26, uxtw #2]\";\r\n            UarchTestHelpers.GenerateArmAsmDivStructureTestFuncs(sb, this.Counts, this.Prefix, dependentStores, dependentStores1, false);\r\n        }\r\n    }\r\n}"
  },
  {
    "path": "AsmGen/tests/MixVec512Vec256BlockRfTest.cs",
    "content": "﻿using System.Collections.Generic;\r\nusing System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class MixVec512Vec256BlockRfTest : UarchTest\r\n    {\r\n        // number of tiny registers\r\n        private int nTiny;\r\n\r\n        public MixVec512Vec256BlockRfTest(int low, int high, int step, int nTiny)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"mixvec512vec256blockrf\" + nTiny;\r\n            this.Description = $\"Mixed zmm/ymm regs - AVX-512 only, {nTiny} 256-bit then 512-bit\";\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr, float *floatArr\";\r\n            this.GetFunctionCallParameters = \"structIterations, A, fpArr\";\r\n            this.DivideTimeByCount = false;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64)\r\n            {\r\n                // use even numbered regs for ymm testing\r\n                string initInstrs = \"  vmovups (%r8), %zmm1\\n\" +\r\n                \"  vmovups 64(%r8), %ymm2\\n\" +\r\n                \"  vmovups 128(%r8), %zmm3\\n\" +\r\n                \"  vmovups 192(%r8), %ymm4\\n\" +\r\n                \"  vmovups 256(%r8), %zmm5\\n\";\r\n\r\n                // use all zmm regs\r\n                for (int i = 6; i < 32; i++)\r\n                {\r\n                    if ((i & 1) == 0) initInstrs += \"vmovups %ymm2, %ymm\" + i + \"\\n\";\r\n                    else initInstrs += \"vmovups %zmm5, %zmm\" + i + \"\\n\";\r\n                }\r\n\r\n                List<string> instrsList = new List<string>();\r\n                for (int i = 0; i < nTiny; i++)\r\n                {\r\n                  
  int regNum = ((i & 1) == 0) ? i & 0x1F : (i + 1) & 0x1F;\r\n                    instrsList.Add($\"  vxorps %ymm2, %ymm{regNum}, %ymm{regNum}\");\r\n                }\r\n\r\n                for (int i = nTiny; i < this.Counts[this.Counts.Length - 1];i++)\r\n                {\r\n                    int regNum = ((i & 1) == 0) ? i: (i + 1);\r\n                    regNum = (regNum + 1) & 0x1F;\r\n                    instrsList.Add($\"  vxorps %zmm1, %zmm{regNum}, %zmm{regNum}\");\r\n                }\r\n\r\n                string[] unrolledAdds = instrsList.ToArray();\r\n                UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, false, initInstrs);\r\n            }\r\n        }\r\n    }\r\n}"
  },
  {
    "path": "AsmGen/tests/MixVec512Vec256RfTest.cs",
    "content": "﻿using System.Collections.Generic;\r\nusing System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class MixVec512Vec256RfTest : UarchTest\r\n    {\r\n        public MixVec512Vec256RfTest(int low, int high, int step)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"mixvec512vec256rf\";\r\n            this.Description = \"Mixed zmm/ymm regs - AVX-512 only, alternating\";\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr, float *floatArr\";\r\n            this.GetFunctionCallParameters = \"structIterations, A, fpArr\";\r\n            this.DivideTimeByCount = false;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64)\r\n            {\r\n                // use even numbered regs for ymm testing\r\n                string initInstrs = \"  vmovups (%r8), %zmm1\\n\" +\r\n                \"  vmovups 64(%r8), %ymm2\\n\" +\r\n                \"  vmovups 128(%r8), %zmm3\\n\" +\r\n                \"  vmovups 192(%r8), %ymm4\\n\" +\r\n                \"  vmovups 256(%r8), %zmm5\\n\";\r\n\r\n                // use all zmm regs\r\n                for (int i = 6; i < 32; i++)\r\n                {\r\n                    if ((i & 1) == 0) initInstrs += \"vmovups %ymm2, %ymm\" + i + \"\\n\";\r\n                    else initInstrs += \"vmovups %zmm5, %zmm\" + i + \"\\n\";\r\n                }\r\n\r\n                List<string> instrsList = new List<string>();\r\n                for (int i = 1; i < 32; i++)\r\n                {\r\n                    if ((i & 1) == 0) instrsList.Add($\"  vaddps %ymm2, %ymm{i}, %ymm{i}\");\r\n                    else instrsList.Add($\"  
vaddps %zmm1, %zmm{i}, %zmm{i}\");\r\n                }\r\n\r\n                string[] unrolledAdds = instrsList.ToArray();\r\n                UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, false, initInstrs);\r\n            }\r\n        }\r\n    }\r\n}"
  },
  {
    "path": "AsmGen/tests/MmxRfTest.cs",
    "content": "﻿using System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class MmxRfTest : UarchTest\r\n    {\r\n        public MmxRfTest(int low, int high, int step)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"mmxrf\";\r\n            this.Description = \"64-bit MMX RF Capacity Test. x86 only\";\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr, int *arr2\";\r\n            this.GetFunctionCallParameters = \"structIterations, A, B\";\r\n            this.DivideTimeByCount = false;\r\n        }\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64) GenerateX86GccAsm(sb);\r\n        }\r\n\r\n        public void GenerateX86GccAsm(StringBuilder sb)\r\n        {\r\n            string initInstrs = \r\n                \"  fsave (%r8)\\n\" +\r\n                \"  movq (%rdx), %mm0\\n\" +\r\n                \"  movq 8(%rdx), %mm1\\n\" +\r\n                \"  movq 16(%rdx), %mm2\\n\" +\r\n                \"  movq 24(%rdx), %mm3\\n\" +\r\n                \"  movq 32(%rdx), %mm4\\n\";\r\n\r\n            string cleanupInstrs = \"  frstor (%r8)\";\r\n\r\n            string[] unrolledAdds = new string[4];\r\n            unrolledAdds[0] = \"  paddw %mm0, %mm1\";\r\n            unrolledAdds[1] = \"  paddw %mm0, %mm2\";\r\n            unrolledAdds[2] = \"  paddw %mm0, %mm3\";\r\n            unrolledAdds[3] = \"  paddw %mm0, %mm4\";\r\n\r\n            UarchTestHelpers.GenerateX86AsmStructureTestFuncs(\r\n                sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, initInstrs: initInstrs, cleanupInstrs: cleanupInstrs);\r\n        }\r\n    }\r\n}"
  },
  {
    "path": "AsmGen/tests/MulSchedTest.cs",
    "content": "﻿using System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class MulSchedTest : UarchTest\r\n    {\r\n        public MulSchedTest(int low, int high, int step)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"mulsched\";\r\n            this.Description = \"Scheduler, Integer Multiplies\";\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr\";\r\n            this.GetFunctionCallParameters = \"structIterations, A\";\r\n            this.DivideTimeByCount = false;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64) return true;\r\n            if (isa == IUarchTest.ISA.aarch64) return true;\r\n            if (isa == IUarchTest.ISA.mips64) return true;\r\n            if (isa == IUarchTest.ISA.riscv) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64)\r\n            {\r\n                string[] unrolledMuls = new string[4];\r\n                unrolledMuls[0] = \"  imul %rdi, %r15\";\r\n                unrolledMuls[1] = \"  imul %rdi, %r14\";\r\n                unrolledMuls[2] = \"  imul %rdi, %r13\";\r\n                unrolledMuls[3] = \"  imul %rdi, %r12\";\r\n                UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledMuls, unrolledMuls, includePtrChasingLoads: false);\r\n            }\r\n            else if (isa == IUarchTest.ISA.aarch64)\r\n            {\r\n                string[] unrolledMuls = new string[4];\r\n                unrolledMuls[0] = \"  mul x15, x15, x25\";\r\n                unrolledMuls[1] = \"  mul x14, x14, x25\";\r\n                unrolledMuls[2] = \"  mul x13, x13, x25\";\r\n                unrolledMuls[3] = \"  mul x12, x12, x25\";\r\n     
           UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledMuls, unrolledMuls, includePtrChasingLoads: false);\r\n            }\r\n            else if (isa == IUarchTest.ISA.mips64)\r\n            {\r\n                string[] unrolledAdds = new string[4];\r\n                unrolledAdds[0] = \"  mul.d $r15, $r15, $r12\";\r\n                unrolledAdds[1] = \"  mul.d $r16, $r16, $r12\";\r\n                unrolledAdds[2] = \"  mul.d $r17, $r17, $r12\";\r\n                unrolledAdds[3] = \"  mul.d $r18, $r18, $r12\";\r\n\r\n                string[] unrolledAdds1 = new string[4];\r\n                unrolledAdds1[0] = \"  mul.d $r15, $r15, $r13\";\r\n                unrolledAdds1[1] = \"  mul.d $r16, $r16, $r13\";\r\n                unrolledAdds1[2] = \"  mul.d $r17, $r17, $r13\";\r\n                unrolledAdds1[3] = \"  mul.d $r18, $r18, $r13\";\r\n                UarchTestHelpers.GenerateMipsAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds1, includePtrChasingLoads: true);\r\n            }\r\n            else if (isa == IUarchTest.ISA.riscv)\r\n            {\r\n                string[] unrolledMuls = new string[4];\r\n                unrolledMuls[0] = \"  mul x30, x30, x5\";\r\n                unrolledMuls[1] = \"  mul x29, x29, x5\";\r\n                unrolledMuls[2] = \"  mul x28, x28, x5\";\r\n                unrolledMuls[3] = \"  mul x31, x31, x5\";\r\n\r\n                string[] unrolledMuls1 = new string[4];\r\n                unrolledMuls1[0] = \"  mul x30, x30, x6\";\r\n                unrolledMuls1[1] = \"  mul x31, x31, x6\";\r\n                unrolledMuls1[2] = \"  mul x28, x28, x6\";\r\n                unrolledMuls1[3] = \"  mul x29, x29, x6\";\r\n                UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledMuls, unrolledMuls1, false);\r\n            }\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/tests/NopLoopTest.cs",
    "content": "﻿using System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class NopLoopTest : UarchTest\r\n    {\r\n        /// <summary>\r\n        ///\r\n        /// </summary>\r\n        /// <param name=\"low\">must be greater than 2</param>\r\n        /// <param name=\"high\"></param>\r\n        /// <param name=\"step\"></param>\r\n        public NopLoopTest(int high, int step)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(3, high, step);\r\n            this.Prefix = \"noploop\";\r\n            this.Description = $\"NOP throughput for various loop sizes\";\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations\";\r\n            this.GetFunctionCallParameters = \"structIterations\";\r\n            this.DivideTimeByCount = true;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64) return true;\r\n            if (isa == IUarchTest.ISA.aarch64) return true;\r\n            if (isa == IUarchTest.ISA.mips64) return false;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64) GenerateX86GccAsm(sb);\r\n            if (isa == IUarchTest.ISA.aarch64) GenerateArmAsm(sb);\r\n        }\r\n\r\n        public void GenerateX86GccAsm(StringBuilder sb)\r\n        {\r\n            for (int i = 0; i < Counts.Length; i++)\r\n            {\r\n                string funcName = this.Prefix + this.Counts[i];\r\n                sb.AppendLine(funcName + \":\");\r\n\r\n                // count dec, jnz as instructions in the loop\r\n                for (int nopIdx = 0; nopIdx < this.Counts[i] - 2; nopIdx++) sb.AppendLine(\"  nop\");\r\n                sb.AppendLine(\"  dec %rdi\");\r\n                sb.AppendLine(\"  jnz \" + funcName);\r\n                sb.AppendLine(\"  ret\");\r\n            }\r\n        
}\r\n\r\n        public void GenerateArmAsm(StringBuilder sb)\r\n        {\r\n            for (int i = 0; i < Counts.Length; i++)\r\n            {\r\n                string funcName = this.Prefix + this.Counts[i];\r\n                sb.AppendLine(funcName + \":\");\r\n\r\n                // count sub, cbnz as instructions in the loop\r\n                for (int nopIdx = 0; nopIdx < this.Counts[i] - 2; nopIdx++) sb.AppendLine(\"  nop\");\r\n                sb.AppendLine(\"  sub x0, x0, 1\");\r\n                sb.AppendLine(\"  cbnz x0, \" + funcName);\r\n                sb.AppendLine(\"  ret\");\r\n            }\r\n        }\r\n    }\r\n}"
  },
  {
    "path": "AsmGen/tests/PdepSchedTest.cs",
    "content": "﻿using System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class PdepSchedTest : UarchTest\r\n    {\r\n        public PdepSchedTest(int low, int high, int step)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"pdepsched\";\r\n            this.Description = \"Scheduler, PDEP\";\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr\";\r\n            this.GetFunctionCallParameters = \"structIterations, A\";\r\n            this.DivideTimeByCount = false;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64)\r\n            {\r\n                string[] unrolledAdds = new string[4];\r\n                unrolledAdds[0] = \"  pdep %rdi, %r15, %r15\";\r\n                unrolledAdds[1] = \"  pdep %rdi, %r14, %r14\";\r\n                unrolledAdds[2] = \"  pdep %rdi, %r13, %r13\";\r\n                unrolledAdds[3] = \"  pdep %rdi, %r12, %r12\";\r\n                UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: false);\r\n            }\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/tests/ReturnStackTest.cs",
    "content": "﻿using System;\r\nusing System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class ReturnStackTest : UarchTest\r\n    {\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64) return true;\r\n            if (isa == IUarchTest.ISA.aarch64) return true;\r\n            if (isa == IUarchTest.ISA.mips64) return true;\r\n            if (isa == IUarchTest.ISA.riscv) return true;\r\n            return false;\r\n        }\r\n\r\n        public ReturnStackTest(int low, int high, int step)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"returnstack\";\r\n            this.Description = \"Return Stack Depth Test\";\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations\";\r\n            this.GetFunctionCallParameters = \"structIterations\";\r\n            this.DivideTimeByCount = true;\r\n        }\r\n\r\n        private string GetFunctionName(int count, int depth) { return $\"returnstack{count}_{depth}\"; }\r\n\r\n        private string GetBranchFuncName(int branchCount) { return Prefix + branchCount; }\r\n        public string GetLabelName(string funcName, int part) { return funcName + \"part\" + part; }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64)\r\n            {\r\n                GenerateX86GccAsm(sb);\r\n            }\r\n            else if (isa == IUarchTest.ISA.aarch64)\r\n            {\r\n                GenerateArmAsm(sb);\r\n            }\r\n            else if (isa == IUarchTest.ISA.mips64)\r\n            {\r\n                GenerateMipsAsm(sb);\r\n            }\r\n            else if (isa == IUarchTest.ISA.riscv)\r\n            {\r\n                GenerateRiscvAsm(sb);\r\n            }\r\n        }\r\n\r\n        public void GenerateX86GccAsm(StringBuilder sb)\r\n        {\r\n     
       for (int countIdx = 0; countIdx < this.Counts.Length; countIdx++)\r\n            {\r\n                int callDepth = this.Counts[countIdx];\r\n                string topLevelFunctionLabel = this.Prefix + callDepth;\r\n                sb.AppendLine($\"{topLevelFunctionLabel}:\");\r\n                sb.AppendLine(\"  xor %rax, %rax\");\r\n                sb.AppendLine($\"{topLevelFunctionLabel}_loop:\");\r\n                sb.AppendLine($\"  call \" + GetFunctionName(callDepth, 0));\r\n                sb.AppendLine($\"  dec %rdi\");\r\n                sb.AppendLine($\"  jne {topLevelFunctionLabel}_loop\");\r\n                sb.AppendLine(\"  ret\");\r\n\r\n                // generate a batch of functions so we aren't returning to the same address\r\n                // otherwise a simple predictor will suffice\r\n                for (int callIdx = 0; callIdx < callDepth; callIdx++)\r\n                {\r\n                    string funcName = GetFunctionName(callDepth, callIdx);\r\n                    sb.AppendLine($\".global {funcName}\");\r\n                    sb.AppendLine(\".align 128\"); // https://github.com/clamchowder/Microbenchmarks/issues/14\r\n                    sb.AppendLine($\"{funcName}:\");\r\n                    if (callIdx < callDepth - 1)\r\n                    {\r\n                        sb.AppendLine($\"  add %rdi, %rax\");\r\n                        sb.AppendLine(\"  call \" + GetFunctionName(callDepth, callIdx + 1));\r\n                    }\r\n\r\n                    sb.AppendLine(\".align 128\");\r\n                    sb.AppendLine(\"  ret\");\r\n                }\r\n            }\r\n        }\r\n\r\n        public void GenerateArmAsm(StringBuilder sb)\r\n        {\r\n            for (int countIdx = 0; countIdx < this.Counts.Length; countIdx++)\r\n            {\r\n                int callDepth = this.Counts[countIdx];\r\n                string topLevelFunctionLabel = this.Prefix + callDepth;\r\n                
sb.AppendLine($\"{topLevelFunctionLabel}:\");\r\n                sb.AppendLine(\"  sub sp, sp, #0x20\");\r\n                sb.AppendLine(\"  stp x29, x30, [sp, #0x10]\");\r\n                sb.AppendLine(\"  eor x3, x3, x3\");\r\n                sb.AppendLine($\"{topLevelFunctionLabel}_loop:\");\r\n                sb.AppendLine($\"  bl \" + GetFunctionName(callDepth, 0));\r\n                sb.AppendLine(\"  sub x0, x0, 1\");\r\n                sb.AppendLine($\"  cbnz x0, {topLevelFunctionLabel}_loop\");\r\n                sb.AppendLine(\"  ldp x29, x30, [sp, #0x10]\");\r\n                sb.AppendLine(\"  add sp, sp, #0x20\");\r\n                sb.AppendLine(\"  ret\");\r\n\r\n                for (int callIdx = 0; callIdx < callDepth; callIdx++)\r\n                {\r\n                    string funcName = GetFunctionName(callDepth, callIdx);\r\n                    sb.AppendLine($\".global {funcName}\");\r\n                    sb.AppendLine($\"{funcName}:\");\r\n                    sb.AppendLine($\"  add x3, x3, x0\");\r\n                    if (callIdx < callDepth - 1)\r\n                    {\r\n                        // 'bl' is like x86 'call', except it's like the kid that falls asleep in the middle of class\r\n                        // it doesn't push the return address, so you have to do that yourself\r\n                        sb.AppendLine(\"  sub sp, sp, #0x20\");\r\n                        sb.AppendLine(\"  stp x29, x30, [sp, #0x10]\");\r\n                        sb.AppendLine(\"  bl \" + GetFunctionName(callDepth, callIdx + 1));\r\n                        sb.AppendLine(\"  ldp x29, x30, [sp, #0x10]\");\r\n                        sb.AppendLine(\"  add sp, sp, #0x20\");\r\n                    }\r\n\r\n                    sb.AppendLine(\"  ret\");\r\n                }\r\n            }\r\n        }\r\n\r\n        public void GenerateMipsAsm(StringBuilder sb)\r\n        {\r\n            for (int countIdx = 0; countIdx < this.Counts.Length; 
countIdx++)\r\n            {\r\n                int callDepth = this.Counts[countIdx];\r\n                string topLevelFunctionLabel = this.Prefix + callDepth;\r\n                sb.AppendLine($\"{topLevelFunctionLabel}:\");\r\n                // top level function runs for specified number of iterations\r\n                sb.AppendLine(\"  xor $r12, $r12, $r12\");\r\n                sb.AppendLine(\"  xor $r13, $r13, $r13\");\r\n                sb.AppendLine(\"  addi.d $r12, $r12, 1\");\r\n                sb.AppendLine(\"  addi.d $r13, $r13, 8\");\r\n                sb.AppendLine(\"  sub.d $sp, $sp, $r13\");\r\n                sb.AppendLine(\"  st.d $r1, $sp, 0\");\r\n                sb.AppendLine($\"{topLevelFunctionLabel}_loop:\");\r\n\r\n                // mips stack grows down\r\n                sb.AppendLine($\"  bl \" + GetFunctionName(callDepth, 0));\r\n                sb.AppendLine(\"  sub.d $r4, $r4, $r12\");\r\n                sb.AppendLine($\"  bnez $r4, {topLevelFunctionLabel}_loop\");\r\n                sb.AppendLine(\"  ld.d $r1, $sp, 0\");\r\n                sb.AppendLine(\"  add.d $sp, $sp, $r13\");\r\n                sb.AppendLine(\"  jr $r1\");\r\n\r\n                // generate the dummy functions\r\n                for (int callIdx = 0; callIdx < callDepth; callIdx++)\r\n                {\r\n                    string funcName = GetFunctionName(callDepth, callIdx);\r\n                    sb.AppendLine($\".global {funcName}\");\r\n                    sb.AppendLine($\"{funcName}:\");\r\n                    if (callIdx < callDepth - 1)\r\n                    {\r\n                        sb.AppendLine(\"  sub.d $sp, $sp, $r13\");\r\n                        sb.AppendLine(\"  st.d $r1, $sp, 0\"); // save return address\r\n                        sb.AppendLine(\"  bl \" + GetFunctionName(callDepth, callIdx + 1));\r\n                        sb.AppendLine(\"  ld.d $r1, $sp, 0\"); // load return address\r\n                        sb.AppendLine(\"  add.d 
$sp, $sp, $r13\");\r\n                    }\r\n\r\n                    sb.AppendLine(\"  jr $r1\");\r\n                }\r\n            }\r\n        }\r\n\r\n        public void GenerateRiscvAsm(StringBuilder sb)\r\n        {\r\n            for (int countIdx = 0; countIdx < this.Counts.Length; countIdx++)\r\n            {\r\n                int callDepth = this.Counts[countIdx];\r\n                string topLevelFunctionLabel = this.Prefix + callDepth;\r\n                sb.AppendLine($\"{topLevelFunctionLabel}:\");\r\n                // top level function runs for specified number of iterations\r\n                // iteration count in x10\r\n                sb.AppendLine(\"  addi sp, sp, -16\");\r\n                sb.AppendLine(\"  sd ra, (sp)\");\r\n                sb.AppendLine($\"{topLevelFunctionLabel}_loop:\");\r\n                sb.AppendLine($\"  jal \" + GetFunctionName(callDepth, 0));\r\n                sb.AppendLine(\"  addi x10, x10, -1\");\r\n                sb.AppendLine($\"  bge x10, x0, {topLevelFunctionLabel}_loop\");\r\n                sb.AppendLine(\"  ld ra, (sp)\");\r\n                sb.AppendLine(\"  addi sp, sp, 16\");\r\n                sb.AppendLine(\"  ret\");\r\n\r\n                // generate the dummy functions\r\n                for (int callIdx = 0; callIdx < callDepth; callIdx++)\r\n                {\r\n                    string funcName = GetFunctionName(callDepth, callIdx);\r\n                    sb.AppendLine($\".global {funcName}\");\r\n                    sb.AppendLine($\"{funcName}:\");\r\n                    if (callIdx < callDepth - 1)\r\n                    {\r\n                        sb.AppendLine(\"  addi sp, sp, -16\"); // keep stack pointer 16B aligned even though we only save a 8B reg\r\n                        sb.AppendLine(\"  sd ra, (sp)\"); // save return address\r\n                        sb.AppendLine(\"  jal \" + GetFunctionName(callDepth, callIdx + 1));\r\n                        sb.AppendLine(\"  ld ra, 
(sp)\"); // load return address\r\n                        sb.AppendLine(\"  addi sp, sp, 16\");\r\n                    }\r\n\r\n                    sb.AppendLine(\"  ret\");\r\n                }\r\n            }\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/tests/RobTest.cs",
    "content": "﻿using System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class RobTest : UarchTest\r\n    {\r\n        private string[] nops;\r\n        private bool initialDependentBranch;\r\n        public RobTest(int low, int high, int step, bool initialDependentBranch)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"rob\" + (initialDependentBranch ? \"db\" : string.Empty);\r\n            this.Description = \"Reorder Buffer Test\" + (initialDependentBranch ? \" preceded by dependent branch\" : string.Empty);\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr\";\r\n            this.GetFunctionCallParameters = \"structIterations, A\";\r\n            this.DivideTimeByCount = false;\r\n            this.nops = new string[] { \"nop\" };\r\n            this.initialDependentBranch = initialDependentBranch;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (this.initialDependentBranch)\r\n            {\r\n                if (isa == IUarchTest.ISA.aarch64) return true;\r\n                if (isa == IUarchTest.ISA.riscv) return true;\r\n                return false;\r\n            }\r\n\r\n            if (isa == IUarchTest.ISA.amd64) return true;\r\n            if (isa == IUarchTest.ISA.aarch64) return true;\r\n            if (isa == IUarchTest.ISA.mips64) return true;\r\n            if (isa == IUarchTest.ISA.riscv) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64)\r\n            {\r\n                UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, nops, nops, true);\r\n            }\r\n            else if (isa == IUarchTest.ISA.aarch64)\r\n            {\r\n                string postLoadInstrs = 
this.initialDependentBranch ? UarchTestHelpers.GetArmDependentBranch(this.Prefix) : null;\r\n                UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, nops, nops, true, postLoadInstrs1: postLoadInstrs, postLoadInstrs2: postLoadInstrs);\r\n                if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetArmDependentBranchTarget(this.Prefix));\r\n            }\r\n            else if (isa == IUarchTest.ISA.mips64)\r\n            {\r\n                UarchTestHelpers.GenerateMipsAsmStructureTestFuncs(sb, this.Counts, this.Prefix, nops, nops, includePtrChasingLoads: true);\r\n            }\r\n            else if (isa == IUarchTest.ISA.riscv)\r\n            {\r\n                string postLoadInstrs = this.initialDependentBranch ? UarchTestHelpers.GetRiscvDependentBranch(this.Prefix) : null;\r\n                UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, nops, nops, includePtrChasingLoads: true, postLoadInstrs1: postLoadInstrs, postLoadInstrs2: postLoadInstrs);\r\n                if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetRiscvDependentBranchTarget(this.Prefix));\r\n            }\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/tests/RorSchedTest.cs",
    "content": "﻿using System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class RorSchedTest : UarchTest\r\n    {\r\n        public RorSchedTest(int low, int high, int step)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"rorsched\";\r\n            this.Description = \"Scheduler, Integer Rotate by Immediate (1)\";\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr\";\r\n            this.GetFunctionCallParameters = \"structIterations, A\";\r\n            this.DivideTimeByCount = false;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64)\r\n            {\r\n                string postLoadInstrs = \"  mov %rdi, %r15\";\r\n                string postLoadInstrs2 = \"  mov %rsi, %r15\";\r\n                string[] unrolledInstrs = new string[1];\r\n                unrolledInstrs[0] = \"  ror $1, %r15\";\r\n                UarchTestHelpers.GenerateX86AsmStructureTestFuncs(\r\n                    sb, \r\n                    this.Counts, \r\n                    this.Prefix, \r\n                    unrolledInstrs, \r\n                    unrolledInstrs, \r\n                    postLoadInstrs1: postLoadInstrs, \r\n                    postLoadInstrs2: postLoadInstrs2, \r\n                    includePtrChasingLoads: false);\r\n            }\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/tests/ShlSchedTest.cs",
    "content": "﻿using System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class ShlSchedTest : UarchTest\r\n    {\r\n        public ShlSchedTest(int low, int high, int step)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"shlsched\";\r\n            this.Description = \"Scheduler, Integer Shift by Immediate (1)\";\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr\";\r\n            this.GetFunctionCallParameters = \"structIterations, A\";\r\n            this.DivideTimeByCount = false;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64)\r\n            {\r\n                string postLoadInstrs = \"  mov %rdi, %r15\";\r\n                string postLoadInstrs2 = \"  mov %rsi, %r15\";\r\n                string[] unrolledInstrs = new string[1];\r\n                unrolledInstrs[0] = \" shl $1, %r15\";\r\n                UarchTestHelpers.GenerateX86AsmStructureTestFuncs(\r\n                    sb, \r\n                    this.Counts, \r\n                    this.Prefix, \r\n                    unrolledInstrs, \r\n                    unrolledInstrs, \r\n                    postLoadInstrs1: postLoadInstrs, \r\n                    postLoadInstrs2: postLoadInstrs2, \r\n                    includePtrChasingLoads: false);\r\n            }\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/tests/StoreDataDivNsqTest.cs",
    "content": "﻿using System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class StoreDataDivNsqTest : UarchTest\r\n    {\r\n        public StoreDataDivNsqTest(int low, int high, int step)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"storedatadivnsq\";\r\n            this.Description = \"Store Data Scheduler, using DIVs to block retirement\";\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr, float *floatArr\";\r\n            this.GetFunctionCallParameters = \"structIterations, A, fpArr\";\r\n            this.DivideTimeByCount = false;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64) return true;\r\n            if (isa == IUarchTest.ISA.aarch64) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64)\r\n            {\r\n                // idiv puts remainder in RDX\r\n                string[] dependentStores = new string[4];\r\n                dependentStores[0] = \"  mov %rdx, (%r8, %r15, 4)\";\r\n                dependentStores[1] = \"  mov %rdx, (%r8, %r15, 4)\";\r\n                dependentStores[2] = \"  mov %rdx, (%r8, %r15, 4)\";\r\n                dependentStores[3] = \"  mov %rdx, (%r8, %r15, 4)\";\r\n\r\n                string[] independentStores = new string[4];\r\n                independentStores[0] = \"  mov %r14, (%r8, %r11, 4)\";\r\n                independentStores[1] = \"  mov %r14, (%r8, %r11, 4)\";\r\n                independentStores[2] = \"  mov %r14, (%r8, %r11, 4)\";\r\n                independentStores[3] = \"  mov %r14, (%r8, %r11, 4)\";\r\n                UarchTestHelpers.GenerateX86AsmDivNsqTestFuncs(sb, this.Counts[this.Counts.Length - 1], this.Counts, this.Prefix, dependentStores, 
independentStores);\r\n            }\r\n            else if (isa == IUarchTest.ISA.aarch64)\r\n            {\r\n                string[] dependentStores = new string[1];\r\n                dependentStores[0] = \"  str w25, [x2, w15, uxtw #2]\";\r\n\r\n                string[] independentStores = new string[1];\r\n                independentStores[0] = \"  str w15, [x2, w15, uxtw #2]\";\r\n\r\n                UarchTestHelpers.GenerateArmAsmDivNsqTestFuncs(sb, this.Counts[this.Counts.Length - 1], this.Counts, this.Prefix, dependentStores, independentStores);\r\n            }\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/tests/StoreDataNsqTest.cs",
    "content": "﻿using System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class StoreDataNsq : UarchTest\r\n    {\r\n        public StoreDataNsq(int low, int high, int step)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"storedatansq\";\r\n            this.Description = \"Store Data Scheduler, excluding NSQ\";\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr, float *floatArr\";\r\n            this.GetFunctionCallParameters = \"structIterations, A, fpArr\";\r\n            this.DivideTimeByCount = false;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64) return true;\r\n            // if (isa == IUarchTest.ISA.aarch64) return true;\r\n            // if (isa == IUarchTest.ISA.mips64) return true;\r\n            // if (isa == IUarchTest.ISA.riscv) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64)\r\n            {\r\n                string[] dependentLoads = new string[4];\r\n                dependentLoads[0] = \"  mov %rdi, (%r8)\";\r\n                dependentLoads[1] = \"  mov %rdi, 8(%r8)\";\r\n                dependentLoads[2] = \"  mov %rdi, 16(%r8)\";\r\n                dependentLoads[3] = \"  mov %rdi, 24(%r8)\";\r\n\r\n                string[] independentLoads = new string[4];\r\n                independentLoads[0] = \"  mov %r14, (%r8)\";\r\n                independentLoads[1] = \"  mov %r14, 8(%r8)\";\r\n                independentLoads[2] = \"  mov %r14, 16(%r8)\";\r\n                independentLoads[3] = \"  mov %r14, 24(%r8)\";\r\n                UarchTestHelpers.GenerateX86AsmNsqTestFuncs(sb, this.Counts[this.Counts.Length - 1], this.Counts, this.Prefix, dependentLoads, independentLoads);\r\n         
   }\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/tests/StoreDataSchedTest.cs",
    "content": "﻿using System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class StoreDataSchedTest : UarchTest\r\n    {\r\n        public StoreDataSchedTest(int low, int high, int step)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"storedatasched\";\r\n            this.Description = \"Store Data Scheduler\";\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr, float *floatArr\";\r\n            this.GetFunctionCallParameters = \"structIterations, A, fpArr\";\r\n            this.DivideTimeByCount = false;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64) return true;\r\n            if (isa == IUarchTest.ISA.aarch64) return true;\r\n            // if (isa == IUarchTest.ISA.mips64) return true;\r\n            // if (isa == IUarchTest.ISA.riscv) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64)\r\n            {\r\n                string[] dependentLoads = new string[4];\r\n                dependentLoads[0] = \"  mov %rdi, (%r8)\";\r\n                dependentLoads[1] = \"  mov %rdi, 8(%r8)\";\r\n                dependentLoads[2] = \"  mov %rdi, 16(%r8)\";\r\n                dependentLoads[3] = \"  mov %rdi, 24(%r8)\";\r\n\r\n                string[] dependentLoads1 = new string[4];\r\n                dependentLoads1[0] = \"  mov %rsi, (%r8)\";\r\n                dependentLoads1[1] = \"  mov %rsi, 8(%r8)\";\r\n                dependentLoads1[2] = \"  mov %rsi, 16(%r8)\";\r\n                dependentLoads1[3] = \"  mov %rsi, 24(%r8)\";\r\n                UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, dependentLoads, dependentLoads1, includePtrChasingLoads: true);\r\n            }\r\n   
         else if (isa == IUarchTest.ISA.aarch64)\r\n            {\r\n                string[] dependentLoads = new string[4];\r\n                dependentLoads[0] = \"  str w25, [x2, 8]\";\r\n                dependentLoads[1] = \"  str w25, [x2, 16]\";\r\n                dependentLoads[2] = \"  str w25, [x2, 24]\";\r\n                dependentLoads[3] = \"  str w25, [x2, 32]\";\r\n\r\n                string[] dependentLoads1 = new string[4];\r\n                dependentLoads1[0] = \"  str w26, [x2, 8]\";\r\n                dependentLoads1[1] = \"  str w26, [x2, 16]\";\r\n                dependentLoads1[2] = \"  str w26, [x2, 24]\";\r\n                dependentLoads1[3] = \"  str w26, [x2, 32]\";\r\n                UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, dependentLoads, dependentLoads1, includePtrChasingLoads: true);\r\n            }\r\n            else if (isa == IUarchTest.ISA.mips64)\r\n            {\r\n                string postLoadInstrs1 = \"  andi $r19, $r12, 0xF\\n  add.d $r19, $r19, $r6\";\r\n                string[] dependentLoads = new string[4];\r\n                dependentLoads[0] = \"  ld.d $r15, $r19, 0\";\r\n                dependentLoads[1] = \"  ld.d $r16, $r19, 8\";\r\n                dependentLoads[2] = \"  ld.d $r17, $r19, 12\";\r\n                dependentLoads[3] = \"  ld.d $r18, $r19, 16\";\r\n\r\n                string postLoadInstrs2 = \"  andi $r19, $r13, 0xF\\n  add.d $r19, $r19, $r6\";\r\n                UarchTestHelpers.GenerateMipsAsmStructureTestFuncs(\r\n                    sb, this.Counts, this.Prefix, dependentLoads, dependentLoads, includePtrChasingLoads: true, null, \r\n                    postLoadInstrs1: postLoadInstrs1, postLoadInstrs2: postLoadInstrs2);\r\n            }\r\n            else if (isa == IUarchTest.ISA.riscv)\r\n            {\r\n                // x5 and x6 are pointer chasing loads\r\n                string postLoadInstrs1 = \"  andi x7, x5, 0xF\\n  add x7, x7, x12\";\r\n 
               string postLoadInstrs2 = \"  andi x7, x6, 0xF\\n  add x7, x7, x12\";\r\n                string[] dependentLoads = new string[4];\r\n                dependentLoads[0] = \"  ld x28, (x7)\";\r\n                dependentLoads[1] = \"  ld x29, 8(x7)\";\r\n                dependentLoads[2] = \"  ld x30, 16(x7)\";\r\n                dependentLoads[3] = \"  ld x31, 24(x7)\";\r\n                UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, dependentLoads, dependentLoads, includePtrChasingLoads: true,\r\n                    postLoadInstrs1: postLoadInstrs1, postLoadInstrs2: postLoadInstrs2);\r\n            }\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/tests/StoreDivNsqTest.cs",
    "content": "﻿using System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class StoreDivNsqTest : UarchTest\r\n    {\r\n        public StoreDivNsqTest(int low, int high, int step)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"storedivnsq\";\n            this.Description = \"Store Scheduler, using DIVs to block retirement, excluding NSQ\";\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr, float *floatArr\";\r\n            this.GetFunctionCallParameters = \"structIterations, A, fpArr\";\r\n            this.DivideTimeByCount = false;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64) return true;\n            if (isa == IUarchTest.ISA.aarch64) return true;\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64)\r\n            {\r\n                // idiv puts remainder in RDX\n                string[] dependentStores = new string[4];\r\n                dependentStores[0] = \"  mov %r15w, (%r8, %rdx, 2)\";\r\n                dependentStores[1] = \"  mov %r15w, 2(%r8, %rdx, 2)\";\r\n                dependentStores[2] = \"  mov %r15w, 4(%r8, %rdx, 2)\";\r\n                dependentStores[3] = \"  mov %r15w, 6(%r8, %rdx, 2)\";\r\n\r\n                string[] indepStores = new string[4];\r\n                indepStores[0] = \"  mov %r11w, (%r8)\";\r\n                indepStores[1] = \"  mov %r11w, 2(%r8)\";\r\n                indepStores[2] = \"  mov %r11w, 4(%r8)\";\r\n                indepStores[3] = \"  mov %r11w, 6(%r8)\";\r\n                UarchTestHelpers.GenerateX86AsmDivNsqTestFuncs(sb, this.Counts[this.Counts.Length - 1], this.Counts, this.Prefix, dependentStores, indepStores);\r\n            }\r\n            else if (isa == 
IUarchTest.ISA.aarch64)\r\n            {\r\n                string[] dependentStores = new string[1];\r\n                dependentStores[0] = \"  str w15, [x2, w25, uxtw #2]\";\r\n\r\n                string[] independentStores = new string[1];\r\n                independentStores[0] = \"  str w15, [x2, w15, uxtw #2]\";\r\n\r\n                UarchTestHelpers.GenerateArmAsmDivNsqTestFuncs(sb, this.Counts[this.Counts.Length - 1], this.Counts, this.Prefix, dependentStores, independentStores);\n            }\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/tests/StoreDivSchedTest.cs",
    "content": "﻿using System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class StoreDivSchedTest : UarchTest\r\n    {\r\n        public StoreDivSchedTest(int low, int high, int step)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"storedivsched\";\r\n            this.Description = \"Store Address Scheduler Capacity Test, using divs to block retirement\";\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int count, int *arr2\";\r\n            this.GetFunctionCallParameters = \"structIterations, list_size, B\";\r\n            this.DivideTimeByCount = false;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64) return true;\r\n            if (isa == IUarchTest.ISA.aarch64) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64)\r\n            {\r\n                GenerateX86Asm(sb);\r\n            }\r\n            else if (isa == IUarchTest.ISA.aarch64)\r\n            {\r\n                GenerateArmAsm(sb);\r\n            }\r\n        }\r\n\r\n        public void GenerateX86Asm(StringBuilder sb)\r\n        {\r\n            string[] dependentStores = new string[4];\r\n            dependentStores[0] = \"  mov %r15, (%r8, %rdx, 4)\";\r\n            dependentStores[1] = \"  mov %r15, (%r8, %rdx, 4)\";\r\n            dependentStores[2] = \"  mov %r15, (%r8, %rdx, 4)\";\r\n            dependentStores[3] = \"  mov %r15, (%r8, %rdx, 4)\";\r\n\r\n            string[] dependentStores1 = new string[4];\r\n            dependentStores1[0] = \"  mov %r11, (%r8, %rdx, 4)\";\r\n            dependentStores1[1] = \"  mov %r11, (%r8, %rdx, 4)\";\r\n            dependentStores1[2] = \"  mov %r11, (%r8, %rdx, 4)\";\r\n            dependentStores1[3] = 
\"  mov %r11, (%r8, %rdx, 4)\";\r\n\r\n            // instead of using pointer chasing loads, use a nasty block of chained integer divisions to block retirement\r\n            // some older/less capable architectures will not reorder loads ahead of stores with unknown addresses,\r\n            // which breaks the usual technique\r\n            UarchTestHelpers.GenerateX86AsmDivStructureTestFuncs(sb, this.Counts, this.Prefix, dependentStores, dependentStores1, false);\r\n        }\r\n\r\n        public void GenerateArmAsm(StringBuilder sb)\r\n        {\r\n            string[] dependentStores = new string[4];\r\n            dependentStores[0] = \"  str w15, [x2, w25, uxtw #2]\";\r\n            dependentStores[1] = \"  str w15, [x2, w25, uxtw #2]\";\r\n            dependentStores[2] = \"  str w15, [x2, w25, uxtw #2]\";\r\n            dependentStores[3] = \"  str w15, [x2, w25, uxtw #2]\";\r\n\r\n            string[] dependentStores1 = new string[4];\r\n            dependentStores1[0] = \"  str w15, [x2, w26, uxtw #2]\";\r\n            dependentStores1[1] = \"  str w15, [x2, w26, uxtw #2]\";\r\n            dependentStores1[2] = \"  str w15, [x2, w26, uxtw #2]\";\r\n            dependentStores1[3] = \"  str w15, [x2, w26, uxtw #2]\";\r\n            UarchTestHelpers.GenerateArmAsmDivStructureTestFuncs(sb, this.Counts, this.Prefix, dependentStores, dependentStores1, false);\r\n        }\r\n    }\r\n}"
  },
  {
    "path": "AsmGen/tests/StoreNsq.cs",
    "content": "﻿using System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class StoreNsq : UarchTest\r\n    {\r\n        public StoreNsq(int low, int high, int step)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"storensq\";\r\n            this.Description = \"Store Address Scheduler, Excluding any NSQ\";\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr, float *floatArr\";\r\n            this.GetFunctionCallParameters = \"structIterations, A, fpArr\";\r\n            this.DivideTimeByCount = false;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.aarch64) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.aarch64)\r\n            {\r\n                string[] depStores = new string[4];\r\n                depStores[0] = \"  str w15, [x2, w25, uxtw #2]\";\r\n                depStores[1] = \"  str w14, [x2, w25, uxtw #2]\";\r\n                depStores[2] = \"  str w13, [x2, w25, uxtw #2]\";\r\n                depStores[3] = \"  str w12, [x2, w25, uxtw #2]\";\r\n\r\n                string[] indepStores = new string[4];\r\n                indepStores[0] = \"  str w15, [x2, w26, uxtw #2]\";\r\n                indepStores[1] = \"  str w14, [x2, w26, uxtw #2]\";\r\n                indepStores[2] = \"  str w13, [x2, w26, uxtw #2]\";\r\n                indepStores[3] = \"  str w12, [x2, w26, uxtw #2]\";\r\n                UarchTestHelpers.GenerateArmAsmNsqTestFuncs(sb, this.Counts[this.Counts.Length - 1], this.Counts, this.Prefix, depStores, indepStores);\r\n            }\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/tests/StoreSchedTest.cs",
    "content": "﻿using System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class StoreSchedTest : UarchTest\r\n    {\r\n        public StoreSchedTest(int low, int high, int step)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"storesched\";\r\n            this.Description = \"Store Address Scheduler\";\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr, float *floatArr\";\r\n            this.GetFunctionCallParameters = \"structIterations, A, fpArr\";\r\n            this.DivideTimeByCount = false;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64) return true;\r\n            if (isa == IUarchTest.ISA.aarch64) return true;\r\n            if (isa == IUarchTest.ISA.riscv) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64)\r\n            {\r\n                string[] dependentStores = new string[4];\r\n                dependentStores[0] = \"  mov %r15, (%r8, %rdi, 4)\";\r\n                dependentStores[1] = \"  mov %r14, (%r8, %rdi, 4)\";\r\n                dependentStores[2] = \"  mov %r13, (%r8, %rdi, 4)\";\r\n                dependentStores[3] = \"  mov %r12, (%r8, %rdi, 4)\";\r\n\r\n                string[] dependentStores1 = new string[4];\r\n                dependentStores1[0] = \"  mov %r15, (%r8, %rsi, 4)\";\r\n                dependentStores1[1] = \"  mov %r14, (%r8, %rsi, 4)\";\r\n                dependentStores1[2] = \"  mov %r13, (%r8, %rsi, 4)\";\r\n                dependentStores1[3] = \"  mov %r12, (%r8, %rsi, 4)\";\r\n                UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, dependentStores, dependentStores1, includePtrChasingLoads: true);\r\n            }\r\n      
      else if (isa == IUarchTest.ISA.aarch64)\r\n            {\r\n                string[] dependentStores = new string[4];\r\n                dependentStores[0] = \"  str w15, [x2, w25, uxtw #2]\";\r\n                dependentStores[1] = \"  str w14, [x2, w25, uxtw #2]\";\r\n                dependentStores[2] = \"  str w13, [x2, w25, uxtw #2]\";\r\n                dependentStores[3] = \"  str w12, [x2, w25, uxtw #2]\";\r\n\r\n                string[] dependentStores1 = new string[4];\r\n                dependentStores1[0] = \"  str w15, [x2, w26, uxtw #2]\";\r\n                dependentStores1[1] = \"  str w14, [x2, w26, uxtw #2]\";\r\n                dependentStores1[2] = \"  str w13, [x2, w26, uxtw #2]\";\r\n                dependentStores1[3] = \"  str w12, [x2, w26, uxtw #2]\";\r\n                UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, dependentStores, dependentStores1, includePtrChasingLoads: true);\r\n            }\r\n            else if (isa == IUarchTest.ISA.riscv)\r\n            {\r\n                // x5 and x6 are pointer chasing loads\r\n                string postLoadInstrs1 = \"  andi x7, x5, 0xF\\n  add x7, x7, x12\";\r\n                string postLoadInstrs2 = \"  andi x7, x6, 0xF\\n  add x7, x7, x12\";\r\n                string[] dependentLoads = new string[4];\r\n                dependentLoads[0] = \"  sd x28, (a2)\";\r\n                dependentLoads[1] = \"  sd x29, 8(a2)\";\r\n                dependentLoads[2] = \"  sd x30, 16(a2)\";\r\n                dependentLoads[3] = \"  sd x31, 24(a2)\";\r\n                UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, dependentLoads, dependentLoads, includePtrChasingLoads: true,\r\n                    postLoadInstrs1: postLoadInstrs1, postLoadInstrs2: postLoadInstrs2);\r\n            }\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/tests/Stq128Test.cs",
    "content": "﻿using System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class Stq128Test : UarchTest\r\n    {\r\n        private bool initialDependentBranch;\r\n        public Stq128Test(int low, int high, int step, bool initialDependentBranch)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"stq128\" + (initialDependentBranch ? \"db\" : string.Empty);\r\n            this.Description = \"Store Queue with 128-bit stores\" + (initialDependentBranch ? \", preceded by independent branch\" : string.Empty);\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr, float *floatArr\";\r\n            this.GetFunctionCallParameters = \"structIterations, A, fpArr\";\r\n            this.DivideTimeByCount = false;\r\n            this.initialDependentBranch = initialDependentBranch;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (this.initialDependentBranch)\r\n            {\r\n                if (isa == IUarchTest.ISA.aarch64) return true;\r\n                if (isa == IUarchTest.ISA.riscv) return true;\r\n                return false;\r\n            }\r\n\r\n            if (isa == IUarchTest.ISA.amd64) return true;\r\n            if (isa == IUarchTest.ISA.aarch64) return true;\r\n            if (isa == IUarchTest.ISA.riscv) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64)\r\n            {\r\n                string initInstrs = \"  movups (%rdx), %xmm1\";\r\n                string[] unrolledStores = new string[4];\r\n                unrolledStores[0] = \"  movaps %xmm1, (%r8)\";\r\n                unrolledStores[1] = \"  movaps %xmm1, (%r8)\";\r\n                unrolledStores[2] = \"  movaps %xmm1, (%r8)\";\r\n                unrolledStores[3] = \"  
movaps %xmm1, (%r8)\";\r\n                UarchTestHelpers.GenerateX86AsmStructureTestFuncs(\r\n                    sb, this.Counts, this.Prefix, unrolledStores, unrolledStores, initInstrs: initInstrs, includePtrChasingLoads: false);\r\n            }\r\n            else if (isa == IUarchTest.ISA.aarch64)\r\n            {\r\n                string initInstrs = \"  ldr q0, [x1]\";\r\n                string postLoadInstrs = this.initialDependentBranch ? UarchTestHelpers.GetArmDependentBranch(this.Prefix) : null;\r\n                string[] unrolledStores = new string[4];\r\n                unrolledStores[0] = \"  str q0, [x2]\";\r\n                unrolledStores[1] = \"  str q0, [x2]\";\r\n                unrolledStores[2] = \"  str q0, [x2]\";\r\n                unrolledStores[3] = \"  str q0, [x2]\";\r\n                UarchTestHelpers.GenerateArmAsmStructureTestFuncs(\r\n                    sb, this.Counts, this.Prefix, unrolledStores, unrolledStores, includePtrChasingLoads: false, postLoadInstrs1: postLoadInstrs, postLoadInstrs2: postLoadInstrs);\r\n                if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetArmDependentBranchTarget(this.Prefix));\r\n            }\r\n            else if (isa == IUarchTest.ISA.riscv)\r\n            {\r\n                string initInstrs = \"  mv t6, x0\\n  addi t6, t6, 16\\n  vsetvli t5, t6, e32\\n  vlw.v v0, (a1)\";\r\n                string postLoadInstrs = this.initialDependentBranch ? 
UarchTestHelpers.GetRiscvDependentBranch(this.Prefix) : string.Empty;\r\n                postLoadInstrs += \"\\n  mv t6, a2\";\r\n                string[] unrolledStores = new string[1];\r\n                unrolledStores[0] = \"  vsw.v v0, (t6)\\n  addi t6, t6, 64\";\r\n                UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledStores, unrolledStores, false,\r\n                    initInstrs: initInstrs, postLoadInstrs1: postLoadInstrs, postLoadInstrs2: postLoadInstrs);\r\n                if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetRiscvDependentBranchTarget(this.Prefix));\r\n            }\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/tests/Stq512Test.cs",
    "content": "﻿using System.Collections.Generic;\r\nusing System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class Stq512Test : UarchTest\r\n    {\r\n        private bool differentLines;\r\n        public Stq512Test(int low, int high, int step, bool differentLines)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"stq512\" + (differentLines ? \"dl\" : string.Empty);\r\n            this.Description = \"Store Queue with 512-bit stores - AVX-512 only\";\r\n            if (differentLines) this.Description += \" with multiple lines\";\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr, float *floatArr\";\r\n            this.GetFunctionCallParameters = \"structIterations, A, fpArr\";\r\n            this.DivideTimeByCount = false;\r\n            this.differentLines = differentLines;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64)\r\n            {\r\n                string initInstrs = \"  vmovaps (%r8), %zmm0\\n  vmovaps %zmm0, %zmm1\";\r\n                string[] unrolledStores;\r\n                if (differentLines)\r\n                {\r\n                    List<string> unrolledStoresList = new List<string>();\r\n                    int maxOffset = 512, currentOffset = 0;\r\n                    for (int i = 0; i < this.Counts[this.Counts.Length - 1]; i++)\r\n                    {\r\n                        string loadOffset = currentOffset > 0 ? 
currentOffset.ToString() : string.Empty;\r\n                        string nextInstr = $\"  vmovaps %zmm0, {loadOffset}(%r8)\";\r\n                        unrolledStoresList.Add(nextInstr);\r\n                        if (currentOffset >= maxOffset)\r\n                        {\r\n                            currentOffset = 0;\r\n                        }\r\n                        else currentOffset += 64;\r\n                        unrolledStoresList.Add(\"  vmovaps %zmm0, (%r8)\");\r\n                    }\r\n\r\n                    unrolledStores = unrolledStoresList.ToArray();\r\n                }\r\n                else\r\n                {\r\n                    unrolledStores = new string[2];\r\n                    unrolledStores[0] = \"  vmovaps %zmm0, (%r8)\";\r\n                    unrolledStores[1] = \"  vmovaps %zmm1, (%r8)\";\r\n                }\r\n\r\n                UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledStores, unrolledStores, includePtrChasingLoads: false, initInstrs: initInstrs);\r\n            }\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/tests/StqTest.cs",
    "content": "﻿using System.Collections.Generic;\r\nusing System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class StqTest : UarchTest\r\n    {\r\n        private bool initialDependentBranch;\r\n        private bool spaced;\r\n\r\n        public StqTest(int low, int high, int step, bool initialDependentBranch, bool spaced)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"stq\" + (initialDependentBranch ? \"db\" : string.Empty);\r\n            this.Description = \"Store Queue\" + (initialDependentBranch ? \", preceded by independent branch\" : string.Empty);\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr, float *floatArr\";\r\n            this.GetFunctionCallParameters = \"structIterations, A, fpArr\";\r\n            this.DivideTimeByCount = false;\r\n            this.initialDependentBranch = initialDependentBranch;\r\n            this.spaced = spaced;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (this.initialDependentBranch)\r\n            {\r\n                if (isa == IUarchTest.ISA.aarch64) return true;\r\n                if (isa == IUarchTest.ISA.riscv) return true;\r\n                return false;\r\n            }\r\n\r\n            if (isa == IUarchTest.ISA.amd64) return true;\r\n            if (isa == IUarchTest.ISA.aarch64) return true;\r\n            if (isa == IUarchTest.ISA.mips64) return true;\r\n            if (isa == IUarchTest.ISA.riscv) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64)\r\n            {\r\n                string[] unrolledStores;\r\n                string postLoadInstrs = \"\";\r\n                if (spaced)\r\n                {\r\n                    postLoadInstrs = \"mov %r8, %r11\";\r\n          
          List<string> storeInstrs = new List<string>();\r\n                    for (int i = 0; i < this.Counts[Counts.Length - 1]; i++)\r\n                    {\r\n                        // Send to different cache lines\r\n                        storeInstrs.Add(\"  mov %r15, (%r11)\\n  add $64, %r11\");\r\n                    }\r\n\r\n                    unrolledStores = storeInstrs.ToArray();\r\n                }\r\n                else\r\n                {\r\n                    unrolledStores = new string[4];\r\n                    unrolledStores[0] = \"  mov %r15, (%r8)\";\r\n                    unrolledStores[1] = \"  mov %r14, (%r8)\";\r\n                    unrolledStores[2] = \"  mov %r13, (%r8)\";\r\n                    unrolledStores[3] = \"  mov %r12, (%r8)\";\r\n                }\r\n\r\n                UarchTestHelpers.GenerateX86AsmStructureTestFuncs(\r\n                    sb, \r\n                    this.Counts, \r\n                    this.Prefix, \r\n                    unrolledStores, \r\n                    unrolledStores, \r\n                    postLoadInstrs1: postLoadInstrs, \r\n                    postLoadInstrs2: postLoadInstrs, \r\n                    includePtrChasingLoads: false);\r\n            }\r\n            else if (isa == IUarchTest.ISA.aarch64)\r\n            {\r\n                string postLoadInstrs = this.initialDependentBranch ? 
UarchTestHelpers.GetArmDependentBranch(this.Prefix) : null;\r\n                string[] unrolledStores = new string[4];\r\n                unrolledStores[0] = \"  str x15, [x2]\";\r\n                unrolledStores[1] = \"  str x14, [x2]\";\r\n                unrolledStores[2] = \"  str x13, [x2]\";\r\n                unrolledStores[3] = \"  str x12, [x2]\";\r\n                UarchTestHelpers.GenerateArmAsmStructureTestFuncs(\r\n                    sb, this.Counts, this.Prefix, unrolledStores, unrolledStores, includePtrChasingLoads: false, postLoadInstrs1: postLoadInstrs, postLoadInstrs2: postLoadInstrs);\r\n                if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetArmDependentBranchTarget(this.Prefix));\r\n            }\r\n            else if (isa == IUarchTest.ISA.mips64)\r\n            {\r\n                string[] unrolledStores = new string[4];\r\n                unrolledStores[0] = \"  st.d $r15, $r6, 0\";\r\n                unrolledStores[1] = \"  st.d $r16, $r6, 0\";\r\n                unrolledStores[2] = \"  st.d $r17, $r6, 0\";\r\n                unrolledStores[3] = \"  st.d $r18, $r6, 0\";\r\n                UarchTestHelpers.GenerateMipsAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledStores, unrolledStores, includePtrChasingLoads: false);\r\n            }\r\n            else if (isa == IUarchTest.ISA.riscv)\r\n            {\r\n                string postLoadInstrs = this.initialDependentBranch ? 
UarchTestHelpers.GetRiscvDependentBranch(this.Prefix) : null;\r\n                string[] unrolledStores;\r\n                if (this.spaced)\r\n                {\r\n                    List<string> stores = new List<string>();\r\n                    for (int i = 0; i < 32; i++)\r\n                    {\r\n                        stores.Add($\"  sd x28, {i * 16}(x12)\");\r\n                    }\r\n\r\n                    unrolledStores = stores.ToArray();\r\n                }\r\n                else\r\n                {\r\n                    unrolledStores = new string[4];\r\n                    unrolledStores[0] = \"  sd x28, (x12)\";\r\n                    unrolledStores[1] = \"  sd x29, 8(x12)\";\r\n                    unrolledStores[2] = \"  sd x30, 16(x12)\";\r\n                    unrolledStores[3] = \"  sd x31, 24(x12)\";\r\n                }\r\n\r\n                UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledStores, unrolledStores, false,\r\n                    postLoadInstrs1: postLoadInstrs, postLoadInstrs2: postLoadInstrs);\r\n                if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetRiscvDependentBranchTarget(this.Prefix));\r\n            }\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/tests/TakenBranchBufferTest.cs",
    "content": "﻿using System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class TakenBranchBufferTest : UarchTest\r\n    {\r\n        private bool initialDependentBranch;\r\n        public TakenBranchBufferTest(int low, int high, int step, bool initialDependentBranch)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"tbb\" + (initialDependentBranch ? \"db\" : string.Empty);\r\n            this.Description = \"Taken Branch Buffer Test (taken branches pending retire)\" + (initialDependentBranch ? \", preceded by dependent branch\" : string.Empty);\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr\";\r\n            this.GetFunctionCallParameters = \"structIterations, A\";\r\n            this.DivideTimeByCount = false;\r\n            this.initialDependentBranch = initialDependentBranch;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (this.initialDependentBranch && isa != IUarchTest.ISA.aarch64) return false;\r\n            if (isa == IUarchTest.ISA.amd64) return true;\r\n            if (isa == IUarchTest.ISA.aarch64) return true;\r\n            // if (isa == IUarchTest.ISA.mips64) return true;\r\n            // if (isa == IUarchTest.ISA.riscv) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64) GenerateX86GccAsm(sb);\r\n            else if (isa == IUarchTest.ISA.aarch64) GenerateArmAsm(sb);\r\n        }\r\n\r\n        public void GenerateX86GccAsm(StringBuilder sb)\r\n        {\r\n            for (int i = 0; i < Counts.Length; i++)\r\n            {\r\n                string funcName = Prefix + Counts[i];\r\n                sb.AppendLine(\"\\n\" + funcName + \":\");\r\n                sb.AppendLine(\"  push %rsi\");\r\n                
sb.AppendLine(\"  push %rdi\");\r\n                sb.AppendLine(\"  push %r15\");\r\n                sb.AppendLine(\"  push %r14\");\r\n                sb.AppendLine(\"  push %r13\");\r\n                sb.AppendLine(\"  push %r12\");\r\n                sb.AppendLine(\"  push %r11\");\r\n                sb.AppendLine(\"  push %r8\");\r\n                sb.AppendLine(\"  push %rcx\");\r\n                sb.AppendLine(\"  push %rdx\");\r\n\r\n                // arguments are in RDI, RSI, RDX, RCX, R8, and R9\r\n                // move them into familiar windows argument regs (rcx, rdx, r8)\r\n                sb.AppendLine(\"  mov %rdx, %r8\"); // r8 <- rdx\r\n                sb.AppendLine(\"  mov %rsi, %rdx\"); // rdx <- rsi\r\n                sb.AppendLine(\"  mov %rdi, %rcx\"); // rcx <- rdi\r\n\r\n                sb.AppendLine(\"  xor %r15, %r15\");\r\n                sb.AppendLine(\"  mov $0x1, %r14\");\r\n                sb.AppendLine(\"  mov $0x2, %r13\");\r\n                sb.AppendLine(\"  mov $0x3, %r12\");\r\n                sb.AppendLine(\"  mov $0x4, %r11\");\r\n\r\n                sb.AppendLine(\"  xor %rdi, %rdi\");\r\n                sb.AppendLine(\"  mov $0x40, %esi\");\r\n                sb.AppendLine(\"  mov (%rdx,%rdi,4), %edi\");\r\n                sb.AppendLine(\"  mov (%rdx,%rsi,4), %esi\");\r\n                sb.AppendLine(\"\\n\" + funcName + \"start:\");\r\n                sb.AppendLine(\"  mov (%rdx,%rdi,4), %edi\");\r\n                for (int fillerIdx = 0; fillerIdx < Counts[i]; fillerIdx++)\r\n                {\r\n                    string jumpLabel = $\"{funcName}_edi_target{fillerIdx}\";\r\n                    sb.AppendLine($\"  jmp {jumpLabel}\");\r\n                    sb.AppendLine(\".align 16\");\r\n                    if (fillerIdx % 2 == 0) sb.AppendLine(\"  nop\");\r\n                    sb.AppendLine($\"{jumpLabel}:\");\r\n                }\r\n\r\n                sb.AppendLine(\"  mov (%rdx,%rsi,4), %esi\");\r\n              
  for (int fillerIdx = 0; fillerIdx < Counts[i]; fillerIdx++)\r\n                {\r\n                    string jumpLabel = $\"{funcName}_esi_target{fillerIdx}\";\r\n                    sb.AppendLine($\"  jmp {jumpLabel}\");\r\n                    // try to space the jumps out a bit\r\n                    sb.AppendLine(\".align 16\");\r\n                    if (fillerIdx % 2 == 0) sb.AppendLine(\"  nop\");\r\n                    sb.AppendLine($\"{jumpLabel}:\");\r\n                }\r\n\r\n                sb.AppendLine(\"  dec %rcx\");\r\n                sb.AppendLine(\"  jne \" + funcName + \"start\");\r\n                sb.AppendLine(\"  pop %rdx\");\r\n                sb.AppendLine(\"  pop %rcx\");\r\n                sb.AppendLine(\"  pop %r8\");\r\n                sb.AppendLine(\"  pop %r11\");\r\n                sb.AppendLine(\"  pop %r12\");\r\n                sb.AppendLine(\"  pop %r13\");\r\n                sb.AppendLine(\"  pop %r14\");\r\n                sb.AppendLine(\"  pop %r15\");\r\n                sb.AppendLine(\"  pop %rdi\");\r\n                sb.AppendLine(\"  pop %rsi\");\r\n                sb.AppendLine(\"  ret\\n\\n\");\r\n            }\r\n        }\r\n\r\n        public void GenerateArmAsm(StringBuilder sb)\r\n        {\r\n            string dependentBranch = this.initialDependentBranch ? 
UarchTestHelpers.GetArmDependentBranch(this.Prefix) : null;\r\n            for (int i = 0; i < Counts.Length; i++)\r\n            {\r\n                string funcName = Prefix + Counts[i];\r\n\r\n                sb.AppendLine(\"\\n\" + funcName + \":\");\r\n                sb.AppendLine(\"  sub sp, sp, #0x50\");\r\n                sb.AppendLine(\"  stp x14, x15, [sp, #0x10]\");\r\n                sb.AppendLine(\"  stp x12, x13, [sp, #0x20]\");\r\n                sb.AppendLine(\"  stp x10, x11, [sp, #0x30]\");\r\n                sb.AppendLine(\"  stp x25, x26, [sp, #0x40]\");\r\n                sb.AppendLine(\"  mov x15, 1\");\r\n                sb.AppendLine(\"  mov x14, 2\");\r\n                sb.AppendLine(\"  mov x13, 3\");\r\n                sb.AppendLine(\"  mov x12, 4\");\r\n                sb.AppendLine(\"  mov x11, 5\");\r\n                sb.AppendLine(\"  mov x10, 6\");\r\n\r\n                sb.AppendLine(\"  mov w25, 0x0\");\r\n                sb.AppendLine(\"  mov w26, 0x40\");\r\n                sb.AppendLine(\"\\n\" + funcName + \"start:\");\r\n                sb.AppendLine(\"  ldr w25, [x1, w25, uxtw #2]\"); // current = A[current]\r\n                if (this.initialDependentBranch) sb.AppendLine(dependentBranch);\r\n                for (int fillerIdx = 0; fillerIdx < Counts[i]; fillerIdx++)\r\n                {\r\n                    string jumpLabel = $\"{funcName}_w25_target{fillerIdx}\";\r\n                    sb.AppendLine($\"  b {jumpLabel}\");\r\n                    sb.AppendLine($\"{jumpLabel}:\");\r\n                }\r\n\r\n                sb.AppendLine(\"  ldr w26, [x1, w26, uxtw #2]\");\r\n                if (this.initialDependentBranch) sb.AppendLine(dependentBranch);\r\n                for (int fillerIdx = 0; fillerIdx < Counts[i]; fillerIdx++)\r\n                {\r\n                    string jumpLabel = $\"{funcName}_w26_target{fillerIdx}\";\r\n                    sb.AppendLine($\"  b {jumpLabel}\");\r\n                    
sb.AppendLine($\"{jumpLabel}:\");\r\n                }\r\n\r\n                sb.AppendLine(\"  sub x0, x0, 1\");\r\n                sb.AppendLine(\"  cbnz x0, \" + funcName + \"start\");\r\n                sb.AppendLine(\"  ldp x25, x26, [sp, #0x40]\");\r\n                sb.AppendLine(\"  ldp x10, x11, [sp, #0x30]\");\r\n                sb.AppendLine(\"  ldp x12, x13, [sp, #0x20]\");\r\n                sb.AppendLine(\"  ldp x14, x15, [sp, #0x10]\");\r\n                sb.AppendLine(\"  add sp, sp, #0x50\");\r\n                sb.AppendLine(\"  ret\\n\\n\");\r\n            }\r\n\r\n            if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetArmDependentBranchTarget(this.Prefix));\r\n        }\r\n    }\r\n}"
  },
  {
    "path": "AsmGen/tests/TakenJumpSchedTest.cs",
    "content": "﻿using System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class TakenJumpSchedTest : UarchTest\r\n    {\r\n        public TakenJumpSchedTest(int low, int high, int step)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"takenjumpsched\";\r\n            this.Description = \"Scheduler, Taken Jumps\";\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr\";\r\n            this.GetFunctionCallParameters = \"structIterations, A\";\r\n            this.DivideTimeByCount = false;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64) return true;\r\n            if (isa == IUarchTest.ISA.aarch64) return true;\r\n            // if (isa == IUarchTest.ISA.mips64) return true;\r\n            // if (isa == IUarchTest.ISA.riscv) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64)\r\n            {\r\n                for (int i = 0; i < this.Counts.Length; i++)\r\n                {\r\n                    string funcName = this.Prefix + this.Counts[i];\r\n                    sb.AppendLine(\"\\n\" + funcName + \":\");\r\n                    sb.AppendLine(\"  push %rsi\");\r\n                    sb.AppendLine(\"  push %rdi\");\r\n                    sb.AppendLine(\"  push %r8\");\r\n                    sb.AppendLine(\"  push %rcx\");\r\n                    sb.AppendLine(\"  push %rdx\");\r\n\r\n                    // arguments are in RDI, RSI, RDX, RCX, R8, and R9\r\n                    // move them into familiar windows argument regs (rcx, rdx, r8)\r\n                    sb.AppendLine(\"  mov %rdx, %r8\"); // r8 <- rdx\r\n                    sb.AppendLine(\"  mov %rsi, %rdx\"); // rdx <- rsi\r\n                    
sb.AppendLine(\"  mov %rdi, %rcx\"); // rcx <- rdi\r\n\r\n                    sb.AppendLine(\"  xor %rdi, %rdi\");\r\n                    sb.AppendLine(\"  mov $0x40, %esi\");\r\n                    sb.AppendLine(\"  mov (%rdx,%rdi,4), %edi\");\r\n                    sb.AppendLine(\"  mov (%rdx,%rsi,4), %esi\");\r\n                    sb.AppendLine(\"\\n\" + funcName + \"start:\");\r\n                    sb.AppendLine(\"  mov (%rdx,%rdi,4), %edi\");\r\n                    for (int fillerIdx = 0;fillerIdx < this.Counts[i]; fillerIdx++)\r\n                    {\r\n                        string labelName = funcName + \"part\" + fillerIdx;\r\n                        sb.AppendLine(\"  cmp %rdi, %rsi\");\r\n                        sb.AppendLine(\"  jne \" + labelName);\r\n                        sb.AppendLine(\"  inc %rax\");\r\n                        sb.AppendLine(\".align 16\");\r\n                        sb.AppendLine(labelName + \":\");\r\n\r\n                    }\r\n\r\n                    sb.AppendLine(\"  mov (%rdx,%rsi,4), %esi\");\r\n                    sb.AppendLine(\"lfence\");\r\n\r\n                    sb.AppendLine(\"  dec %rcx\");\r\n                    sb.AppendLine(\"  jne \" + funcName + \"start\");\r\n                    sb.AppendLine(\"  pop %rdx\");\r\n                    sb.AppendLine(\"  pop %rcx\");\r\n                    sb.AppendLine(\"  pop %r8\");\r\n                    sb.AppendLine(\"  pop %rdi\");\r\n                    sb.AppendLine(\"  pop %rsi\");\r\n                    sb.AppendLine(\"  ret\\n\\n\");\r\n                }\r\n            }\r\n            else if (isa == IUarchTest.ISA.aarch64)\r\n            {\r\n                for (int i = 0; i < this.Counts.Length; i++)\r\n                {\r\n                    string funcName = this.Prefix + this.Counts[i];\r\n\r\n                    // args in x0, x1\r\n                    sb.AppendLine(\"\\n\" + funcName + \":\");\r\n                    sb.AppendLine(\"  sub sp, sp, 
#0x50\");\r\n                    sb.AppendLine(\"  stp x14, x15, [sp, #0x10]\");\r\n                    sb.AppendLine(\"  stp x12, x13, [sp, #0x20]\");\r\n                    sb.AppendLine(\"  stp x10, x11, [sp, #0x30]\");\r\n                    sb.AppendLine(\"  stp x25, x26, [sp, #0x40]\");\r\n                    sb.AppendLine(\"  mov x15, 1\");\r\n                    sb.AppendLine(\"  mov w25, 0x0\");\r\n                    sb.AppendLine(\"  mov w26, 0x40\");\r\n                    sb.AppendLine(\"\\n\" + funcName + \"start:\");\r\n                    sb.AppendLine(\"  ldr w25, [x1, w25, uxtw #2]\"); // current = A[current]\r\n                    for (int nopIdx = 0; nopIdx < this.Counts[i]; nopIdx++)\r\n                    {\r\n                        string labelName = funcName + \"part\" + nopIdx;\r\n                        sb.AppendLine(\"  cmp w25, w26\");\r\n                        sb.AppendLine(\"  b.ne \" + labelName);\r\n                        sb.AppendLine(\"  add x15, x15, 1\");\r\n                        sb.AppendLine(\"  nop\\n  nop\\n  nop\");\r\n                        sb.AppendLine(labelName + \":\");\r\n                    }\r\n\r\n                    sb.AppendLine(\"  ldr w26, [x1, w26, uxtw #2]\");\r\n                    sb.AppendLine(\"  dsb sy\");\r\n                    sb.AppendLine(\"  isb sy\");\r\n\r\n                    sb.AppendLine(\"  sub x0, x0, 1\");\r\n                    sb.AppendLine(\"  cbnz x0, \" + funcName + \"start\");\r\n                    sb.AppendLine(\"  ldp x25, x26, [sp, #0x40]\");\r\n                    sb.AppendLine(\"  ldp x10, x11, [sp, #0x30]\");\r\n                    sb.AppendLine(\"  ldp x12, x13, [sp, #0x20]\");\r\n                    sb.AppendLine(\"  ldp x14, x15, [sp, #0x10]\");\r\n                    sb.AppendLine(\"  add sp, sp, #0x50\");\r\n                    sb.AppendLine(\"  ret\\n\\n\");\r\n                }\r\n            }\r\n            else if (isa == IUarchTest.ISA.riscv)\r\n            {\r\n 
               // todo\r\n                string[] unrolledAdds = new string[4];\r\n                unrolledAdds[0] = \"  mul x30, x30, x5\";\r\n                unrolledAdds[1] = \"  mul x29, x29, x5\";\r\n                unrolledAdds[2] = \"  mul x28, x28, x5\";\r\n                unrolledAdds[3] = \"  mul x31, x31, x5\";\r\n\r\n                string[] unrolledAdds1 = new string[4];\r\n                unrolledAdds1[0] = \"  mul x30, x30, x6\";\r\n                unrolledAdds1[1] = \"  mul x31, x31, x6\";\r\n                unrolledAdds1[2] = \"  mul x28, x28, x6\";\r\n                unrolledAdds1[3] = \"  mul x29, x29, x6\";\r\n                UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds1, false);\r\n            }\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/tests/Vec512RfTest.cs",
    "content": "﻿using System.Collections.Generic;\r\nusing System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class Vec512RfTest : UarchTest\r\n    {\r\n        public Vec512RfTest(int low, int high, int step)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"vec512rf\";\r\n            this.Description = \"Vector (512-bit packed fp) RF Test - AVX-512 only\";\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr, float *floatArr\";\r\n            this.GetFunctionCallParameters = \"structIterations, A, fpArr\";\r\n            this.DivideTimeByCount = false;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64)\r\n            {\r\n                // it's ok, the ptr chasing arr should be way bigger than this\r\n                string initInstrs = \"  vmovups (%r8), %zmm1\\n\" +\r\n                \"  vmovups 64(%r8), %zmm2\\n\" +\r\n                \"  vmovups 128(%r8), %zmm3\\n\" +\r\n                \"  vmovups 192(%r8), %zmm4\\n\" +\r\n                \"  vmovups 256(%r8), %zmm5\\n\";\r\n\r\n                // use all zmm regs\r\n                for (int i = 6; i < 32; i++)\r\n                {\r\n                    initInstrs += \"vmovups %zmm5, %zmm\" + i + \"\\n\";\r\n                }\r\n\r\n                List<string> instrsList = new List<string>();\r\n                for (int i = 1; i < 32; i++)\r\n                {\r\n                    instrsList.Add($\"  vaddps %zmm1, %zmm{i}, %zmm{i}\");\r\n                }\r\n\r\n                string[] unrolledAdds = instrsList.ToArray();\r\n                UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, 
this.Counts, this.Prefix, unrolledAdds, unrolledAdds, false, initInstrs);\r\n            }\r\n        }\r\n    }\r\n}"
  },
  {
    "path": "AsmGen/tests/VecMulNsq.cs",
    "content": "﻿using System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class VecMulNsq : UarchTest\r\n    {\r\n        private int totalOps;\r\n        public VecMulNsq(int low, int high, int step, int totalOps)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"vecmulnsq\" + totalOps;\r\n            this.Description = \"Vector Integer Multiply, excluding possible NSQ\";\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr, float *floatArr\";\r\n            this.GetFunctionCallParameters = \"structIterations, A, fpArr\";\r\n            this.DivideTimeByCount = false;\r\n            this.totalOps = totalOps;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.aarch64) return true;\r\n            if (isa == IUarchTest.ISA.amd64) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64)\r\n            {\r\n                string postLoadInstrs = \"  mov %rdi, %r15\\n  add %r8, %r15\\n  movdqu (%r15), %xmm1\";\r\n                string initInstrs = \"  movdqu (%r8), %xmm2\";\r\n                string[] depInstrs = new string[4];\r\n                depInstrs[0] = \"  pmulld %xmm1, %xmm0\";\r\n                depInstrs[1] = \"  pmulld %xmm1, %xmm3\";\r\n                depInstrs[2] = \"  pmulld %xmm1, %xmm4\";\r\n                depInstrs[3] = \"  pmulld %xmm1, %xmm5\";\r\n\r\n                string[] indepInstrs = new string[2];\r\n                indepInstrs[0] = \"  pmulld %xmm2, %xmm6\";\r\n                indepInstrs[1] = \"  pmulld %xmm2, %xmm7\";\r\n                UarchTestHelpers.GenerateX86AsmNsqTestFuncs(sb, this.totalOps, this.Counts, this.Prefix, depInstrs, indepInstrs, false, initInstrs, postLoadInstrs);\r\n            
}\r\n            else if (isa == IUarchTest.ISA.aarch64)\r\n            {\r\n                string postLoadInstrs1 = \"  ldr s16, [x2, w25, uxtw #2]\";\r\n                string initInstrs = \"  ldr s15, [x2]\";\r\n                string[] depInstrs = new string[4];\r\n                depInstrs[0] = \"  fadd s0, s0, s16\";\r\n                depInstrs[1] = \"  fadd s1, s1, s16\";\r\n                depInstrs[2] = \"  fadd s2, s2, s16\";\r\n                depInstrs[3] = \"  fadd s3, s3, s16\";\r\n\r\n                string[] indepInstrs = new string[4];\r\n                indepInstrs[0] = \"  fadd s17, s17, s15\";\r\n                indepInstrs[1] = \"  fadd s18, s18, s15\";\r\n                indepInstrs[2] = \"  fadd s19, s19, s15\";\r\n                indepInstrs[3] = \"  fadd s20, s20, s15\";\r\n                UarchTestHelpers.GenerateArmAsmNsqTestFuncs(sb, this.totalOps, this.Counts, this.Prefix, depInstrs, indepInstrs, false, initInstrs,\r\n                    postLoadInstrs: postLoadInstrs1);\r\n            }\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "AsmGen/tests/ZeroRobTest.cs",
    "content": "﻿using System.Text;\r\n\r\nnamespace AsmGen\r\n{\r\n    public class ZeroRobTest : UarchTest\r\n    {\r\n        private bool initialDependentBranch;\r\n        public ZeroRobTest(int low, int high, int step, bool initialDependentBranch)\r\n        {\r\n            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);\r\n            this.Prefix = \"zerorob\" + (initialDependentBranch ? \"db\" : string.Empty);\r\n            this.Description = \"Reorder Buffer Test with Zeroing Idioms\" + (initialDependentBranch ? \", preceded by dependent branch\" : string.Empty);\r\n            this.FunctionDefinitionParameters = \"uint64_t iterations, int *arr\";\r\n            this.GetFunctionCallParameters = \"structIterations, A\";\r\n            this.DivideTimeByCount = false;\r\n            this.initialDependentBranch = initialDependentBranch;\r\n        }\r\n\r\n        public override bool SupportsIsa(IUarchTest.ISA isa)\r\n        {\r\n            if (this.initialDependentBranch && isa != IUarchTest.ISA.aarch64) return false;\r\n            if (isa == IUarchTest.ISA.amd64) return true;\r\n            if (isa == IUarchTest.ISA.aarch64) return true;\r\n            if (isa == IUarchTest.ISA.mips64) return true;\r\n            return false;\r\n        }\r\n\r\n        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)\r\n        {\r\n            if (isa == IUarchTest.ISA.amd64)\r\n            {\r\n                string[] nops = new string[] { \"  xor %r11, %r11\" };\r\n                UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, nops, nops, true);\r\n            }\r\n            else if (isa == IUarchTest.ISA.aarch64)\r\n            {\r\n                string postLoadInstrs = this.initialDependentBranch ? 
UarchTestHelpers.GetArmDependentBranch(this.Prefix) : null;\r\n                string[] nops = new string[] { \"  mov x10, 0\" };\r\n                UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, nops, nops, true);\r\n                if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetArmDependentBranchTarget(this.Prefix));\r\n            }\r\n            else if (isa == IUarchTest.ISA.mips64)\r\n            {\r\n                string[] nops = new string[] { \"  move $r14, $r0\" };\r\n                UarchTestHelpers.GenerateMipsAsmStructureTestFuncs(sb, this.Counts, this.Prefix, nops, nops, includePtrChasingLoads: true);\r\n            }\r\n            else if (isa == IUarchTest.ISA.riscv)\r\n            {\r\n                string[] nops = new string[] { \"  mov $r14, $r0\" };\r\n                UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, nops, nops, includePtrChasingLoads: true);\r\n            }\r\n        }\r\n    }\r\n}\r\n"
  },
  {
    "path": "CoherencyLatency/CoherencyLatency.cpp",
    "content": "#include <stdio.h>\r\n#include <stdint.h>\r\n\r\n#ifndef __MINGW32__\r\n    #include <sys\\timeb.h>\r\n#else\r\n    #include <sys/timeb.h>\r\n#endif\r\n\r\n#include <intrin.h>\r\n#include <windows.h>\r\n\r\n#define ITERATIONS 10000000;\r\n\r\nfloat RunTest(unsigned int processor1, unsigned int processor2, uint64_t iter);\r\nfloat RunOwnedTest(unsigned int processor1, unsigned int processor2, uint64_t iter);\r\nDWORD WINAPI LatencyTestThread(LPVOID param);\r\nDWORD WINAPI ReadLatencyTestThread(LPVOID param);\r\n\r\nLONG64* bouncyBase;\r\nLONG64* bouncy;\r\n\r\ntypedef struct LatencyThreadData {\r\n    uint64_t start;       // initial value to write into target\r\n    uint64_t iterations;  // number of iterations to run\r\n    LONG64 *target;       // value to bounce between threads, init with start - 1\r\n    LONG64 *readTarget;   // for read test, memory location to read from (owned by other core)\r\n    DWORD affinityMask;   // thread affinity mask to set\r\n} LatencyData;\r\n\r\nint main(int argc, char *argv[]) {\r\n    SYSTEM_INFO sysInfo;\r\n    DWORD numProcs;\r\n    float** latencies;\r\n    uint64_t iter = ITERATIONS;\r\n    int offsets = 1;\r\n    float (*test)(unsigned int, unsigned int, uint64_t) = RunTest;\r\n\r\n    for (int argIdx = 1; argIdx < argc; argIdx++) {\r\n        if (*(argv[argIdx]) == '-') {\r\n            char* arg = argv[argIdx] + 1;\r\n            if (_strnicmp(arg, \"iterations\", 10) == 0) {\r\n                argIdx++;\r\n                iter = atoi(argv[argIdx]);\r\n                fprintf(stderr, \"%lu iterations requested\\n\", iter);\r\n            }\r\n            else if (_strnicmp(arg, \"bounce\", 6) == 0) {\r\n                fprintf(stderr, \"Bouncy\\n\");\r\n            }\r\n            else if (_strnicmp(arg, \"owned\", 5) == 0) {\r\n                test = RunOwnedTest;\r\n                fprintf(stderr, \"Using separate cache lines for each thread to write to\\n\");\r\n            }\r\n            else if 
(_strnicmp(arg, \"offset\", 6) == 0) {\r\n                argIdx++;\r\n                offsets = atoi(argv[argIdx]);\r\n                fprintf(stderr, \"Offsets: %d\\n\", offsets);\r\n            }\r\n        }\r\n    }\r\n\r\n    bouncyBase = (LONG64*)_aligned_malloc(64 * offsets, 4096);\r\n    bouncy = bouncyBase;\r\n    if (bouncy == NULL) {\r\n        fprintf(stderr, \"Could not allocate aligned mem\\n\");\r\n    }\r\n\r\n    GetSystemInfo(&sysInfo);\r\n    numProcs = sysInfo.dwNumberOfProcessors;\r\n    fprintf(stderr, \"Number of CPUs: %u\\n\", numProcs);\r\n    latencies = (float **)malloc(sizeof(float*) * offsets);\r\n    if (latencies == NULL) {\r\n        fprintf(stderr, \"couldn't allocate result array\\n\");\r\n        return 0;\r\n    }\r\n\r\n    for (DWORD offsetIdx = 0; offsetIdx < offsets; offsetIdx++) {\r\n        bouncy = (LONG64*)((char*)bouncyBase + offsetIdx * 64);\r\n        latencies[offsetIdx] = (float*)malloc(sizeof(float) * numProcs * numProcs);\r\n        float* latenciesPtr = latencies[offsetIdx];\r\n\r\n        // Run all to all, skipping testing a core against itself ofc\r\n        // technically can skip the other way around (start j = i + 1) but meh\r\n        for (DWORD i = 0; i < numProcs; i++) {\r\n            for (DWORD j = 0; j < numProcs; j++) {\r\n                latenciesPtr[j + i * numProcs] = i == j ? 
0 : test(i, j, iter);\r\n            }\r\n        }\r\n    }\r\n\r\n    for (DWORD offsetIdx = 0; offsetIdx < offsets; offsetIdx++) {\r\n        printf(\"Cache line offset: %d\\n\", offsetIdx);\r\n        float* latenciesPtr = latencies[offsetIdx];\r\n\r\n        // print thing to copy to excel\r\n        for (DWORD i = 0; i < numProcs; i++) {\r\n            for (DWORD j = 0; j < numProcs; j++) {\r\n                if (j != 0) printf(\",\");\r\n                if (j == i) printf(\"x\");\r\n                else printf(\"%f\", latenciesPtr[j + i * numProcs]);\r\n            }\r\n            printf(\"\\n\");\r\n        }\r\n\r\n        free(latenciesPtr);\r\n    }\r\n\r\n    free(latencies);\r\n    _aligned_free(bouncyBase);\r\n    return 0;\r\n}\r\n\r\nfloat TimeThreads(unsigned int processor1, unsigned int processor2, uint64_t iter, LatencyData lat1, LatencyData lat2, DWORD (*threadFunc)(LPVOID)) {\r\n    struct timeb start, end;\r\n    HANDLE testThreads[2];\r\n    DWORD tid1, tid2;\r\n\r\n    testThreads[0] = CreateThread(NULL, 0, threadFunc, &lat1, CREATE_SUSPENDED, &tid1);\r\n    testThreads[1] = CreateThread(NULL, 0, threadFunc, &lat2, CREATE_SUSPENDED, &tid2);\r\n\r\n    if (testThreads[0] == NULL || testThreads[1] == NULL) {\r\n        fprintf(stderr, \"Failed to create test threads\\n\");\r\n        return -1;\r\n    }\r\n\r\n    SetThreadAffinityMask(testThreads[0], 1ULL << (uint64_t)processor1);\r\n    SetThreadAffinityMask(testThreads[1], 1ULL << (uint64_t)processor2);\r\n\r\n    ftime(&start);\r\n    ResumeThread(testThreads[0]);\r\n    ResumeThread(testThreads[1]);\r\n    WaitForMultipleObjects(2, testThreads, TRUE, INFINITE);\r\n    ftime(&end);\r\n\r\n    int64_t time_diff_ms = 1000 * (end.time - start.time) + (end.millitm - start.millitm);\r\n    float latency = 1e6 * (float)time_diff_ms / (float)iter;\r\n\r\n    fprintf(stderr, \"%d to %d: %f ns\\n\", processor1, processor2, latency);\r\n\r\n    CloseHandle(testThreads[0]);\r\n    
CloseHandle(testThreads[1]);\r\n\r\n    // each thread does interlocked compare and exchange iterations times. divide by 2 to get overall count of locked ops\r\n    return latency / 2;\r\n}\r\n\r\n/// <summary>\r\n/// Measures latency from one processor core to another\r\n/// </summary>\r\n/// <param name=\"processor1\">processor number 1</param>\r\n/// <param name=\"processor2\">processor number 2</param>\r\n/// <param name=\"iter\">Number of iterations</param>\r\n/// <param name=\"bouncy\">aligned mem to bounce around</param>\r\n/// <returns>latency per iteration in ns</returns>\r\nfloat RunTest(unsigned int processor1, unsigned int processor2, uint64_t iter) {\r\n    LatencyData lat1, lat2;\r\n    float latency;\r\n\r\n    *bouncy = 0;\r\n    lat1.iterations = iter;\r\n    lat1.start = 1;\r\n    lat1.target = bouncy;\r\n    lat2.iterations = iter;\r\n    lat2.start = 2;\r\n    lat2.target = bouncy;\r\n\r\n    latency = TimeThreads(processor1, processor2, iter, lat1, lat2, LatencyTestThread);\r\n    return latency;\r\n}\r\n\r\nfloat RunOwnedTest(unsigned int processor1, unsigned int processor2, uint64_t iter) {\r\n    LatencyData lat1, lat2;\r\n    LONG64* target1, * target2;\r\n    float latency;\r\n\r\n    // drop them on different cache lines\r\n    target1 = (LONG64*)_aligned_malloc(128, 64);\r\n    target2 = target1 + 8;\r\n    if (target1 == NULL) {\r\n        fprintf(stderr, \"Could not allocate aligned mem\\n\");\r\n    }\r\n\r\n    *target1 = 1;\r\n    *target2 = 0;\r\n    lat1.iterations = iter;\r\n    lat1.start = 3;\r\n    lat1.target = target1;\r\n    lat1.readTarget = target2;\r\n    lat2.iterations = iter;\r\n    lat2.start = 2;\r\n    lat2.target = target2;\r\n    lat2.readTarget = target1;\r\n\r\n    latency = TimeThreads(processor1, processor2, iter, lat1, lat2, ReadLatencyTestThread);\r\n    _aligned_free(target1);\r\n    return latency;\r\n}\r\n\r\n/// <summary>\r\n/// Runs one thread of the latency test. 
should be run in pairs\r\n/// Always writes to target\r\n/// </summary>\r\n/// <param name=\"param\">Latency test params</param>\r\n/// <returns>next value that would have been written to shared memory</returns>\r\nDWORD WINAPI LatencyTestThread(LPVOID param) {\r\n    LatencyData *latencyData = (LatencyData *)param;\r\n    uint64_t current = latencyData->start;\r\n    while (current <= 2 * latencyData->iterations) {\r\n        if (_InterlockedCompareExchange64(latencyData->target, current, current - 1) == current - 1) {\r\n            current += 2;\r\n        }\r\n    }\r\n\r\n    return current;\r\n}\r\n\r\n/// <summary>\r\n/// Similar thing but tries to not bounce cache line ownership\r\n/// Instead, threads write to different cache lines\r\n/// </summary>\r\n/// <param name=\"param\">Latency test params</param>\r\n/// <returns>next value that would have been written to owned mem</returns>\r\nDWORD WINAPI ReadLatencyTestThread(LPVOID param) {\r\n    LatencyData* latencyData = (LatencyData*)param;\r\n    uint64_t current = latencyData->start;\r\n    uint64_t startTsc = __rdtsc();\r\n    while (current <= 2 * latencyData->iterations) {\r\n        if (*(latencyData->readTarget) == current - 1) {\r\n            *(latencyData->target) = current;\r\n            current += 2;\r\n            _mm_sfence();\r\n        }\r\n    }\r\n\r\n    return current;\r\n}\r\n"
  },
  {
    "path": "CoherencyLatency/CoherencyLatency.sln",
    "content": "﻿\r\nMicrosoft Visual Studio Solution File, Format Version 12.00\r\n# Visual Studio Version 16\r\nVisualStudioVersion = 16.0.31025.194\r\nMinimumVisualStudioVersion = 10.0.40219.1\r\nProject(\"{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}\") = \"CoherencyLatency\", \"CoherencyLatency.vcxproj\", \"{6D9CCC8C-09F5-484B-8630-BE18A9CF1995}\"\r\nEndProject\r\nGlobal\r\n\tGlobalSection(SolutionConfigurationPlatforms) = preSolution\r\n\t\tDebug|x64 = Debug|x64\r\n\t\tDebug|x86 = Debug|x86\r\n\t\tRelease|x64 = Release|x64\r\n\t\tRelease|x86 = Release|x86\r\n\tEndGlobalSection\r\n\tGlobalSection(ProjectConfigurationPlatforms) = postSolution\r\n\t\t{6D9CCC8C-09F5-484B-8630-BE18A9CF1995}.Debug|x64.ActiveCfg = Debug|x64\r\n\t\t{6D9CCC8C-09F5-484B-8630-BE18A9CF1995}.Debug|x64.Build.0 = Debug|x64\r\n\t\t{6D9CCC8C-09F5-484B-8630-BE18A9CF1995}.Debug|x86.ActiveCfg = Debug|Win32\r\n\t\t{6D9CCC8C-09F5-484B-8630-BE18A9CF1995}.Debug|x86.Build.0 = Debug|Win32\r\n\t\t{6D9CCC8C-09F5-484B-8630-BE18A9CF1995}.Release|x64.ActiveCfg = Release|x64\r\n\t\t{6D9CCC8C-09F5-484B-8630-BE18A9CF1995}.Release|x64.Build.0 = Release|x64\r\n\t\t{6D9CCC8C-09F5-484B-8630-BE18A9CF1995}.Release|x86.ActiveCfg = Release|Win32\r\n\t\t{6D9CCC8C-09F5-484B-8630-BE18A9CF1995}.Release|x86.Build.0 = Release|Win32\r\n\tEndGlobalSection\r\n\tGlobalSection(SolutionProperties) = preSolution\r\n\t\tHideSolutionNode = FALSE\r\n\tEndGlobalSection\r\n\tGlobalSection(ExtensibilityGlobals) = postSolution\r\n\t\tSolutionGuid = {A6E60C3D-60ED-4DBF-B4AA-7C1C3A140325}\r\n\tEndGlobalSection\r\nEndGlobal\r\n"
  },
  {
    "path": "CoherencyLatency/CoherencyLatency.vcxproj",
    "content": "<?xml version=\"1.0\" encoding=\"utf-8\"?>\r\n<Project DefaultTargets=\"Build\" xmlns=\"http://schemas.microsoft.com/developer/msbuild/2003\">\r\n  <ItemGroup Label=\"ProjectConfigurations\">\r\n    <ProjectConfiguration Include=\"Debug|Win32\">\r\n      <Configuration>Debug</Configuration>\r\n      <Platform>Win32</Platform>\r\n    </ProjectConfiguration>\r\n    <ProjectConfiguration Include=\"Release|Win32\">\r\n      <Configuration>Release</Configuration>\r\n      <Platform>Win32</Platform>\r\n    </ProjectConfiguration>\r\n    <ProjectConfiguration Include=\"Debug|x64\">\r\n      <Configuration>Debug</Configuration>\r\n      <Platform>x64</Platform>\r\n    </ProjectConfiguration>\r\n    <ProjectConfiguration Include=\"Release|x64\">\r\n      <Configuration>Release</Configuration>\r\n      <Platform>x64</Platform>\r\n    </ProjectConfiguration>\r\n  </ItemGroup>\r\n  <PropertyGroup Label=\"Globals\">\r\n    <VCProjectVersion>16.0</VCProjectVersion>\r\n    <Keyword>Win32Proj</Keyword>\r\n    <ProjectGuid>{6d9ccc8c-09f5-484b-8630-be18a9cf1995}</ProjectGuid>\r\n    <RootNamespace>CoherencyLatency</RootNamespace>\r\n    <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>\r\n  </PropertyGroup>\r\n  <Import Project=\"$(VCTargetsPath)\\Microsoft.Cpp.Default.props\" />\r\n  <PropertyGroup Condition=\"'$(Configuration)|$(Platform)'=='Debug|Win32'\" Label=\"Configuration\">\r\n    <ConfigurationType>Application</ConfigurationType>\r\n    <UseDebugLibraries>true</UseDebugLibraries>\r\n    <PlatformToolset>v142</PlatformToolset>\r\n    <CharacterSet>Unicode</CharacterSet>\r\n  </PropertyGroup>\r\n  <PropertyGroup Condition=\"'$(Configuration)|$(Platform)'=='Release|Win32'\" Label=\"Configuration\">\r\n    <ConfigurationType>Application</ConfigurationType>\r\n    <UseDebugLibraries>false</UseDebugLibraries>\r\n    <PlatformToolset>v142</PlatformToolset>\r\n    <WholeProgramOptimization>true</WholeProgramOptimization>\r\n    
<CharacterSet>Unicode</CharacterSet>\r\n  </PropertyGroup>\r\n  <PropertyGroup Condition=\"'$(Configuration)|$(Platform)'=='Debug|x64'\" Label=\"Configuration\">\r\n    <ConfigurationType>Application</ConfigurationType>\r\n    <UseDebugLibraries>true</UseDebugLibraries>\r\n    <PlatformToolset>v142</PlatformToolset>\r\n    <CharacterSet>Unicode</CharacterSet>\r\n  </PropertyGroup>\r\n  <PropertyGroup Condition=\"'$(Configuration)|$(Platform)'=='Release|x64'\" Label=\"Configuration\">\r\n    <ConfigurationType>Application</ConfigurationType>\r\n    <UseDebugLibraries>false</UseDebugLibraries>\r\n    <PlatformToolset>v142</PlatformToolset>\r\n    <WholeProgramOptimization>true</WholeProgramOptimization>\r\n    <CharacterSet>Unicode</CharacterSet>\r\n  </PropertyGroup>\r\n  <Import Project=\"$(VCTargetsPath)\\Microsoft.Cpp.props\" />\r\n  <ImportGroup Label=\"ExtensionSettings\">\r\n  </ImportGroup>\r\n  <ImportGroup Label=\"Shared\">\r\n  </ImportGroup>\r\n  <ImportGroup Label=\"PropertySheets\" Condition=\"'$(Configuration)|$(Platform)'=='Debug|Win32'\">\r\n    <Import Project=\"$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props\" Condition=\"exists('$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props')\" Label=\"LocalAppDataPlatform\" />\r\n  </ImportGroup>\r\n  <ImportGroup Label=\"PropertySheets\" Condition=\"'$(Configuration)|$(Platform)'=='Release|Win32'\">\r\n    <Import Project=\"$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props\" Condition=\"exists('$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props')\" Label=\"LocalAppDataPlatform\" />\r\n  </ImportGroup>\r\n  <ImportGroup Label=\"PropertySheets\" Condition=\"'$(Configuration)|$(Platform)'=='Debug|x64'\">\r\n    <Import Project=\"$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props\" Condition=\"exists('$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props')\" Label=\"LocalAppDataPlatform\" />\r\n  </ImportGroup>\r\n  <ImportGroup Label=\"PropertySheets\" 
Condition=\"'$(Configuration)|$(Platform)'=='Release|x64'\">\r\n    <Import Project=\"$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props\" Condition=\"exists('$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props')\" Label=\"LocalAppDataPlatform\" />\r\n  </ImportGroup>\r\n  <PropertyGroup Label=\"UserMacros\" />\r\n  <PropertyGroup Condition=\"'$(Configuration)|$(Platform)'=='Debug|Win32'\">\r\n    <LinkIncremental>true</LinkIncremental>\r\n  </PropertyGroup>\r\n  <PropertyGroup Condition=\"'$(Configuration)|$(Platform)'=='Release|Win32'\">\r\n    <LinkIncremental>false</LinkIncremental>\r\n  </PropertyGroup>\r\n  <PropertyGroup Condition=\"'$(Configuration)|$(Platform)'=='Debug|x64'\">\r\n    <LinkIncremental>true</LinkIncremental>\r\n  </PropertyGroup>\r\n  <PropertyGroup Condition=\"'$(Configuration)|$(Platform)'=='Release|x64'\">\r\n    <LinkIncremental>false</LinkIncremental>\r\n  </PropertyGroup>\r\n  <ItemDefinitionGroup Condition=\"'$(Configuration)|$(Platform)'=='Debug|Win32'\">\r\n    <ClCompile>\r\n      <WarningLevel>Level3</WarningLevel>\r\n      <SDLCheck>true</SDLCheck>\r\n      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>\r\n      <ConformanceMode>true</ConformanceMode>\r\n    </ClCompile>\r\n    <Link>\r\n      <SubSystem>Console</SubSystem>\r\n      <GenerateDebugInformation>true</GenerateDebugInformation>\r\n    </Link>\r\n  </ItemDefinitionGroup>\r\n  <ItemDefinitionGroup Condition=\"'$(Configuration)|$(Platform)'=='Release|Win32'\">\r\n    <ClCompile>\r\n      <WarningLevel>Level3</WarningLevel>\r\n      <FunctionLevelLinking>true</FunctionLevelLinking>\r\n      <IntrinsicFunctions>true</IntrinsicFunctions>\r\n      <SDLCheck>true</SDLCheck>\r\n      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>\r\n      <ConformanceMode>true</ConformanceMode>\r\n    </ClCompile>\r\n    <Link>\r\n      <SubSystem>Console</SubSystem>\r\n      
<EnableCOMDATFolding>true</EnableCOMDATFolding>\r\n      <OptimizeReferences>true</OptimizeReferences>\r\n      <GenerateDebugInformation>true</GenerateDebugInformation>\r\n    </Link>\r\n  </ItemDefinitionGroup>\r\n  <ItemDefinitionGroup Condition=\"'$(Configuration)|$(Platform)'=='Debug|x64'\">\r\n    <ClCompile>\r\n      <WarningLevel>Level3</WarningLevel>\r\n      <SDLCheck>true</SDLCheck>\r\n      <PreprocessorDefinitions>_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>\r\n      <ConformanceMode>true</ConformanceMode>\r\n    </ClCompile>\r\n    <Link>\r\n      <SubSystem>Console</SubSystem>\r\n      <GenerateDebugInformation>true</GenerateDebugInformation>\r\n    </Link>\r\n  </ItemDefinitionGroup>\r\n  <ItemDefinitionGroup Condition=\"'$(Configuration)|$(Platform)'=='Release|x64'\">\r\n    <ClCompile>\r\n      <WarningLevel>Level3</WarningLevel>\r\n      <FunctionLevelLinking>true</FunctionLevelLinking>\r\n      <IntrinsicFunctions>true</IntrinsicFunctions>\r\n      <SDLCheck>true</SDLCheck>\r\n      <PreprocessorDefinitions>NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>\r\n      <ConformanceMode>true</ConformanceMode>\r\n    </ClCompile>\r\n    <Link>\r\n      <SubSystem>Console</SubSystem>\r\n      <EnableCOMDATFolding>true</EnableCOMDATFolding>\r\n      <OptimizeReferences>true</OptimizeReferences>\r\n      <GenerateDebugInformation>true</GenerateDebugInformation>\r\n    </Link>\r\n  </ItemDefinitionGroup>\r\n  <ItemGroup>\r\n    <ClCompile Include=\"CoherencyLatency.cpp\" />\r\n  </ItemGroup>\r\n  <Import Project=\"$(VCTargetsPath)\\Microsoft.Cpp.targets\" />\r\n  <ImportGroup Label=\"ExtensionTargets\">\r\n  </ImportGroup>\r\n</Project>\n"
  },
  {
    "path": "CoherencyLatency/Makefile",
    "content": "include ../Common/arch_detect.mk\n\nCFLAGS = -pthread -O3\n\nall: $(TARGET)\n\namd64:\n\t$(CC) $(CFLAGS) PThreadsCoherencyLatency.c -o CoherencyLatency_amd64 $(LDFLAGS)\n\naarch64:\n\t$(CC) $(CFLAGS) PThreadsCoherencyLatency.c -o CoherencyLatency_aarch64 $(LDFLAGS)\n\nriscv64:\n\t$(CC) $(CFLAGS) PThreadsCoherencyLatency.c -o CoherencyLatency_riscv64 $(LDFLAGS)\n\nw64:\n\t$(CC) $(CFLAGS) CoherencyLatency.cpp -o CoherencyLatency_w64.exe $(LDFLAGS)\n\n# w64 can build with mingw 11, which isn't available on jammy\n\nci: amd64 aarch64 riscv64\n\nclean:\n\trm -rf *.o *.zip \"ocl-icd-libopencl1*\" \"OpenCL-SDK*\" && find . -type f -executable -delete\n\n.PHONY: all ci clean\n"
  },
  {
    "path": "CoherencyLatency/PThreadsCoherencyLatency.c",
    "content": "#define _GNU_SOURCE\n\n#include <stdio.h>\n#include <string.h>\n#include <stdlib.h>\n#include <stdint.h>\n#include <sys/sysinfo.h>\n#include <sys/time.h>\n#include <sys/types.h>\n#include <sys/syscall.h>\n#include <unistd.h>\n#include <sched.h>\n#include <pthread.h>\n\n#define ITERATIONS 10000000;\n\n// kidding right?\n#define gettid() syscall(SYS_gettid)\n\ntypedef struct LatencyThreadData {\n    uint64_t start;\n    uint64_t iterations;\n    volatile uint64_t *target;\n    unsigned int processorIndex;\n} LatencyData;\n\ntypedef struct LatencyPairRunData {\n    uint32_t processor1;\n    uint32_t processor2;\n    uint64_t iter;\n    float result;\n    uint64_t *target;\n} LatencyPairRunData;\n\nvoid *LatencyTestThread(void *param);\nvoid *NoLockLatencyTestThread(void *param);\nvoid *(*testFunc)(void *) = LatencyTestThread;\nvoid *RunTest(void *param);\n\nint main(int argc, char *argv[]) {\n    float **latencies;\n    int *parallelTestState;\n    int numProcs, offsets = 1, parallelismFactor = 1;\n    uint64_t iter = ITERATIONS;\n    uint64_t *bouncyArr;\n\n    numProcs = get_nprocs();\n    fprintf(stderr, \"Number of CPUs: %u\\n\", numProcs);\n\n    for (int argIdx = 1; argIdx < argc; argIdx++) {\n        if (*(argv[argIdx]) == '-') {\n            char* arg = argv[argIdx] + 1;\n            if (strncmp(arg, \"iterations\", 10) == 0) {\n                argIdx++;\n                iter = atoi(argv[argIdx]);\n                fprintf(stderr, \"%lu iterations requested\\n\", iter);\n            }\n            else if (strncmp(arg, \"nolock\", 6) == 0) {\n                fprintf(stderr, \"No locks, plain loads and stores\\n\");\n                testFunc = NoLockLatencyTestThread;\n            }\n            else if (strncmp(arg, \"offset\", 6) == 0) {\n                argIdx++;\n                offsets = atoi(argv[argIdx]);\n                fprintf(stderr, \"Offsets: %d\\n\", offsets);\n            }\n            else if (strncmp(arg, \"parallel\", 8) == 0) 
{\n                argIdx++;\n                parallelismFactor = atoi(argv[argIdx]);\n                fprintf(stderr, \"Will go for %d runs in parallel\\n\", parallelismFactor);\n            }\n        }\n    }\n\n    latencies = (float **)malloc(sizeof(float *) * offsets);\n    parallelTestState = (int *)malloc(sizeof(int) * numProcs * numProcs);\n    memset(latencies, 0, sizeof(float) * offsets);\n    if (0 != posix_memalign((void **)(&bouncyArr), 4096, 4096 * parallelismFactor)) {\n        fprintf(stderr, \"Could not allocate aligned mem\\n\");\n        return 0;\n    } \n\n    LatencyPairRunData *pairRunData = (LatencyPairRunData *)malloc(sizeof(LatencyPairRunData) * parallelismFactor);\n\n    for (int offsetIdx = 0; offsetIdx < offsets; offsetIdx++) {\n        latencies[offsetIdx] = (float *)malloc(sizeof(float) * numProcs * numProcs);\n        memset(parallelTestState, 0, sizeof(int) * numProcs * numProcs);\n        float *latenciesPtr = latencies[offsetIdx];\n\n        while (1) {\n            // select parallelismFactor threads\n            int selectedParallelTestCount = 0;\n            memset(pairRunData, 0, sizeof(LatencyPairRunData) * parallelismFactor);\n            for (int i = 0;i < numProcs && selectedParallelTestCount < parallelismFactor; i++) {\n                for (int j = 0;j < numProcs && selectedParallelTestCount < parallelismFactor; j++) {\n                    if (j == i) { latenciesPtr[j + i * numProcs] = 0; continue; }\n                    if (parallelTestState[j + i * numProcs] == 1) {\n                        fprintf(stderr, \"Thread unexpectedly did not complete\\n\");\n                        exit(0);\n                    }\n                    if (parallelTestState[j + i * numProcs] == 0) {\n                        // neither thread can already have a pending run\n                        int validPair = 1;\n                        for (int c = 0; c < numProcs; c++) {\n                            if (parallelTestState[j + c * numProcs] 
== 1 || \n                                parallelTestState[c + i * numProcs] == 1 ||\n                                parallelTestState[i + c * numProcs] == 1 ||\n                                parallelTestState[c + j * numProcs] == 1) {\n                                validPair = 0;\n                                break;\n                            }\n                        }\n\n                        if (!validPair) continue;\n\n                        // for SMT enabled CPUs, check sibling threads. will do later\n                        parallelTestState[j + i * numProcs] = 1;\n                        pairRunData[selectedParallelTestCount].processor1 = i;\n                        pairRunData[selectedParallelTestCount].processor2 = j;\n                        pairRunData[selectedParallelTestCount].iter = iter;\n                        pairRunData[selectedParallelTestCount].result = 0.0f;\n                        pairRunData[selectedParallelTestCount].target = bouncyArr + (512 * selectedParallelTestCount + 8 * offsetIdx);\n                        fprintf(stderr, \"Selected %d -> %d\\n\", i, j);\n                        selectedParallelTestCount++;\n                    }\n                }\n            }\n            \n            if (selectedParallelTestCount == 0) break;\n\n            // launch threads\n            fprintf(stderr, \"Selected %d pairs for parallel testing\\n\", selectedParallelTestCount);\n            pthread_t *testThreads = (pthread_t *)malloc(selectedParallelTestCount * sizeof(pthread_t));\n            memset(testThreads, 0, selectedParallelTestCount * sizeof(pthread_t));\n            for (int parallelIdx = 0; parallelIdx < selectedParallelTestCount; parallelIdx++) {\n                if (pairRunData[parallelIdx].processor1 == 0 && pairRunData[parallelIdx].processor2 == 0) break;\n                pthread_create(testThreads + parallelIdx, NULL, RunTest, (void *)(pairRunData + parallelIdx));\n            }\n\n            // join threads\n  
          for (int parallelIdx = 0; parallelIdx < selectedParallelTestCount; parallelIdx++) {\n                pthread_join(testThreads[parallelIdx], NULL);\n                int i = pairRunData[parallelIdx].processor1;\n                int j = pairRunData[parallelIdx].processor2;\n                latenciesPtr[j + i * numProcs] = pairRunData[parallelIdx].result;\n                parallelTestState[j + i * numProcs] = 2;\n            }\n\n            free(testThreads);\n        }\n    }\n\n      for (int offsetIdx = 0; offsetIdx < offsets; offsetIdx++) {\n        float *latenciesPtr = latencies[offsetIdx];\n        printf(\"Cache line offset: %d\\n\", offsetIdx);\n        for (int i = 0;i < numProcs; i++) {\n            for (int j = 0;j < numProcs; j++) {\n                if (j != 0) printf(\",\");\n                if (j == i) printf(\"x\");\n                // to maintain consistency, divide by 2 (see justification in windows version)\n                else printf(\"%f\", latenciesPtr[j + i * numProcs] / 2);\n            }\n            printf(\"\\n\");\n        }\n\n        free(latenciesPtr);\n    }\n\n    free(parallelTestState);\n    free(pairRunData);\n    free(latencies);\n    free(bouncyArr);\n    return 0;\n}\n\n// run test and gather timing data using the specified thread function\nfloat TimeThreads(unsigned int proc1,\n                  unsigned int proc2,\n                  uint64_t iter,\n                  LatencyData *lat1,\n                  LatencyData *lat2,\n                  void *(*threadFunc)(void *)) {\n    struct timeval startTv, endTv;\n    struct timezone startTz, endTz;\n    pthread_t testThreads[2];\n    int t1rc, t2rc;\n    void *res1, *res2;\n\n    gettimeofday(&startTv, &startTz);\n    t1rc = pthread_create(&testThreads[0], NULL, threadFunc, (void *)lat1);\n    t2rc = pthread_create(&testThreads[1], NULL, threadFunc, (void *)lat2);\n    if (t1rc != 0 || t2rc != 0) {\n      fprintf(stderr, \"Could not create threads\\n\");\n      return 0;\n 
   }\n\n    pthread_join(testThreads[0], &res1);\n    pthread_join(testThreads[1], &res2);\n    gettimeofday(&endTv, &endTz);\n\n    uint64_t time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 1000);\n    float latency = 1e6 * (float)time_diff_ms / (float)iter;\n    return latency;\n}\n\n// test latency between two logical CPUs\n// float RunTest(unsigned int processor1, unsigned int processor2, uint64_t iter) {\nvoid *RunTest(void *param) {\n  LatencyPairRunData *pairRunData = (LatencyPairRunData *)param;\n  uint32_t processor1 = pairRunData->processor1;\n  uint32_t processor2 = pairRunData->processor2;\n  uint64_t iter = pairRunData->iter;\n  LatencyData lat1, lat2;\n  float latency;\n\n  *(pairRunData->target) = 0;\n  lat1.iterations = iter;\n  lat1.start = 1;\n  lat1.target = pairRunData->target;\n  lat1.processorIndex = processor1;\n  lat2.iterations = iter;\n  lat2.start = 2;\n  lat2.target = pairRunData->target;\n  lat2.processorIndex = processor2;\n  latency = TimeThreads(processor1, processor2, iter, &lat1, &lat2, testFunc);\n  fprintf(stderr, \"%d to %d: %f ns\\n\", processor1, processor2, latency);\n  pairRunData->result = latency;\n  return NULL;\n}\n\nvoid *LatencyTestThread(void *param) {\n    LatencyData *latencyData = (LatencyData *)param;\n    cpu_set_t cpuset;\n    uint64_t current = latencyData->start;\n\n    CPU_ZERO(&cpuset);\n    CPU_SET(latencyData->processorIndex, &cpuset);\n    sched_setaffinity(gettid(), sizeof(cpu_set_t), &cpuset);\n    //fprintf(stderr, \"thread %ld set affinity %d\\n\", gettid(), latencyData->processorIndex);\n\n    while (current <= 2 * latencyData->iterations) {\n        if (__sync_bool_compare_and_swap(latencyData->target, current - 1, current)) current += 2;\n    }\n\n    pthread_exit(NULL);\n}\n\nvoid *NoLockLatencyTestThread(void *param) {\n    LatencyData *latencyData = (LatencyData *)param;\n    cpu_set_t cpuset;\n    uint64_t current = 
latencyData->start;\n\n    CPU_ZERO(&cpuset);\n    CPU_SET(latencyData->processorIndex, &cpuset);\n    sched_setaffinity(gettid(), sizeof(cpu_set_t), &cpuset);\n\n    while (current <= 2 * latencyData->iterations) {\n        if (*(latencyData->target) == current - 1) {\n            *(latencyData->target) = current;\n            current += 2;\n        } \n    }\n\n    pthread_exit(NULL);\n} \n"
  },
  {
    "path": "CoherencyLatency/c2cparse/Program.cs",
    "content": "﻿// See https://aka.ms/new-console-template for more information\r\nusing System;\r\n\r\npublic class C2CParse\r\n{\r\n    public static void Main(string[] args)\r\n    {\r\n        if (args.Length == 0)\r\n        {\r\n            Console.WriteLine(\"Need filename as arg\");\r\n            return;\r\n        }\r\n\r\n        string[] inputLatencies = null;\r\n        string[] outputLatencies = null;\r\n        string inputFile = File.ReadAllText(args[0]);\r\n        string[] inputLines = inputFile.Split('\\n');\r\n        for (int row = 0; row < inputLines.Length; row++)\r\n        {\r\n            string[] lineSplit = inputLines[row].Split(',');\r\n            if (inputLatencies == null)\r\n            {\r\n                inputLatencies = new string[inputLines.Length * lineSplit.Length];\r\n                outputLatencies = new string[inputLines.Length * lineSplit.Length];\r\n                if (inputLines.Length != lineSplit.Length)\r\n                {\r\n                    Console.WriteLine(\"Line count: {0}, line segments: {1} must be equal\", inputLines.Length, lineSplit.Length);\r\n                    return;\r\n                }\r\n            }\r\n\r\n            for (int i = 0; i < inputLines.Length; i++)\r\n            {\r\n                inputLatencies[row * lineSplit.Length + i] = lineSplit[i];\r\n            }\r\n        }\r\n\r\n        for (int row = 0; row < inputLines.Length; row++)\r\n        {\r\n            for (int col = 0; col < inputLines.Length; col++)\r\n            {\r\n                string v1 = inputLatencies[row * inputLines.Length + col];\r\n                // translate both row and col\r\n                int newRow = GetCoreIndex(row, 4, 64);\r\n                int newCol = GetCoreIndex(col, 4, 64);\r\n                outputLatencies[newRow * inputLines.Length + newCol] = v1;\r\n            }\r\n        }\r\n\r\n        for (int row = 0; row < inputLines.Length; row++)\r\n        {\r\n            for (int col = 
0; col < inputLines.Length; col++)\r\n            {\r\n                Console.Write(\",{0}\", outputLatencies[row * inputLines.Length + col]);\r\n            }\r\n\r\n            Console.WriteLine();\r\n        }\r\n    }\r\n\r\n    /// <summary>\r\n    /// Convert linux index to windows index\r\n    /// </summary>\r\n    /// <param name=\"inputIndex\"></param>\r\n    /// <param name=\"smtCount\"></param>\r\n    /// <param name=\"coreCount\"></param>\r\n    /// <returns></returns>\r\n    public static int GetCoreIndex(int inputIndex, int smtCount, int coreCount)\r\n    {\r\n        int physicalCoreIndex = inputIndex % coreCount;\r\n        int smtIndex = inputIndex / coreCount;\r\n        return physicalCoreIndex * smtCount + smtIndex;\r\n    }\r\n}"
  },
  {
    "path": "CoherencyLatency/c2cparse/c2cparse.csproj",
    "content": "<Project Sdk=\"Microsoft.NET.Sdk\">\r\n\r\n  <PropertyGroup>\r\n    <OutputType>Exe</OutputType>\r\n    <TargetFramework>net6.0</TargetFramework>\r\n    <ImplicitUsings>enable</ImplicitUsings>\r\n    <Nullable>enable</Nullable>\r\n  </PropertyGroup>\r\n\r\n</Project>\r\n"
  },
  {
    "path": "CoherencyLatency/c2cparse/c2cparse.sln",
    "content": "﻿\r\nMicrosoft Visual Studio Solution File, Format Version 12.00\r\n# Visual Studio Version 17\r\nVisualStudioVersion = 17.4.33110.190\r\nMinimumVisualStudioVersion = 10.0.40219.1\r\nProject(\"{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}\") = \"c2cparse\", \"c2cparse.csproj\", \"{F9E172EC-1A9A-4908-9512-4547CD1CFD80}\"\r\nEndProject\r\nGlobal\r\n\tGlobalSection(SolutionConfigurationPlatforms) = preSolution\r\n\t\tDebug|Any CPU = Debug|Any CPU\r\n\t\tRelease|Any CPU = Release|Any CPU\r\n\tEndGlobalSection\r\n\tGlobalSection(ProjectConfigurationPlatforms) = postSolution\r\n\t\t{F9E172EC-1A9A-4908-9512-4547CD1CFD80}.Debug|Any CPU.ActiveCfg = Debug|Any CPU\r\n\t\t{F9E172EC-1A9A-4908-9512-4547CD1CFD80}.Debug|Any CPU.Build.0 = Debug|Any CPU\r\n\t\t{F9E172EC-1A9A-4908-9512-4547CD1CFD80}.Release|Any CPU.ActiveCfg = Release|Any CPU\r\n\t\t{F9E172EC-1A9A-4908-9512-4547CD1CFD80}.Release|Any CPU.Build.0 = Release|Any CPU\r\n\tEndGlobalSection\r\n\tGlobalSection(SolutionProperties) = preSolution\r\n\t\tHideSolutionNode = FALSE\r\n\tEndGlobalSection\r\n\tGlobalSection(ExtensibilityGlobals) = postSolution\r\n\t\tSolutionGuid = {4C3856A5-1183-4D5F-80BE-3D694765A594}\r\n\tEndGlobalSection\r\nEndGlobal\r\n"
  },
  {
    "path": "Common/arch_detect.mk",
    "content": "TARGET ?= amd64\n\nifeq ($(OS),Windows_NT)\n    TARGET = w64\nelse\n    UNAME_M := $(shell uname -m)\n    ifeq ($(UNAME_M),x86_64)\n        TARGET = amd64\n    endif\n    ifeq ($(UNAME_M),aarch64)\n        TARGET = aarch64\n    endif\n    ifeq ($(UNAME_M),riscv64)\n        TARGET = riscv64\n    endif\n    UNAME_S := $(shell uname -s)\n    ifeq ($(UNAME_S),Darwin)\n    TARGET = darwin\n    endif\nendif\n\namd64: CC = x86_64-linux-gnu-gcc\namd64_numa: CC = x86_64-linux-gnu-gcc\naarch64: CC := gcc\naarch64_numa: CC = aarch64-linux-gnu-gcc\nriscv64: CC = riscv64-linux-gnu-gcc\nw64: CC = x86_64-w64-mingw32-gcc\ndarwin: CC = clang\n"
  },
  {
    "path": "Common/ci_gpumemlatency.sh",
    "content": "#!/bin/sh\n\nmake_all () {\n\tmake amd64\n\tmake clean-obj\n\tLDFLAGS=\"-lm -L ocl-icd-arm64/usr/lib/aarch64-linux-gnu -lOpenCL\" make aarch64\n\tmake clean-obj\n\tLDFLAGS=\"-lm -L ocl-icd-riscv64/usr/lib/riscv64-linux-gnu -lOpenCL\" make riscv64\n\tmake clean-obj\n\tCPPFLAGS=\"-I OpenCL-SDK-${OCL_VER}-Win-x64/include\" LDFLAGS=\"-lm -L OpenCL-SDK-${OCL_VER}-Win-x64/lib -lOpenCL\" make w64\n\tmake clean-obj\n}\n\nlinux_deps () {\n\tfor ARCH in arm64 riscv64; do\n\tif ! grep -q $ARCH /etc/apt/sources.list; then\n\t\techo \"deb [arch=${ARCH}] http://ports.ubuntu.com/ubuntu-ports $(lsb_release -c -s) universe\" | sudo tee -a /etc/apt/sources.list\n\t\techo \"deb-src [arch=${ARCH}] http://ports.ubuntu.com/ubuntu-ports $(lsb_release -c -s) universe\" | sudo tee -a /etc/apt/sources.list\n\t\tsudo apt update\n\tfi\n\t\tapt-get download \"ocl-icd-libopencl1:${ARCH}\"\n\t\tfind . -type f -name \"*${ARCH}*.deb\" -exec dpkg-deb -x {} \"ocl-icd-${ARCH}\" \\;\ndone\ncp ocl-icd-arm64/usr/lib/aarch64-linux-gnu/libOpenCL.so.1 ocl-icd-arm64/usr/lib/aarch64-linux-gnu/libOpenCL.so\ncp ocl-icd-riscv64/usr/lib/riscv64-linux-gnu/libOpenCL.so.1 ocl-icd-riscv64/usr/lib/riscv64-linux-gnu/libOpenCL.so\n}\n\nw64_deps () {\n\tcurl -fssLO \"https://github.com/KhronosGroup/OpenCL-SDK/releases/download/${OCL_VER}/OpenCL-SDK-${OCL_VER}-Win-x64.zip\"\n\tunzip \"OpenCL-SDK-${OCL_VER}-Win-x64.zip\"\n}\n\nlinux_deps\nw64_deps\nmake_all\n"
  },
  {
    "path": "Common/ci_package.sh",
    "content": "#!/bin/sh\n\nPKG=\"clammarks-$(git rev-parse --short HEAD)\"\nrm -rf \"$PKG\" \"clammarks.txz\"\nmkdir -p \"$PKG\"\n\nfor TARGET in \"amd64\" \"aarch64\" \"riscv64\" \"w64\"; do\n\tmkdir \"$PKG/$TARGET\"\n\tfor COMPONENT in CoherencyLatency MemoryLatency MemoryBandwidth InstructionRate Meshsim CoreClockChecker GpuMemLatency; do\n\t\tfind \"$COMPONENT\" -type f -name \"*$TARGET*\" -executable -exec cp {} \"$PKG/$TARGET\" \\;\n\tdone\n\tfind \"GpuMemLatency\" -type f -name \"*.cl\" -exec cp {} \"$PKG/$TARGET\" \\;\ndone\n\ncp \"LICENSE\" \"$PKG\"\n\ntar caf \"clammarks.txz\" \"$PKG\"\n"
  },
  {
    "path": "Common/perfmon.h",
    "content": "// Stuff that only works on Linux. Should be #ifdef-ed out for mingw cross compilation\nuint64_t readmsr(uint32_t coreindex, uint32_t msrindex) {\n    char buf[256];\n    memset(buf, 0, 256);\n    snprintf(buf, 256, \"/dev/cpu/%d/msr\", coreindex);\n    int fd;\n    uint64_t msrvalue = 0;\n    fd = open(buf, O_RDWR);\n    if (fd == -1) {\n        fprintf(stderr, \"Could not open msr\\n\");\n        return 0;\n    }\n    \n    lseek(fd, msrindex, SEEK_SET);\n    read(fd, &msrvalue, 8);\n    close(fd);\n    return msrvalue;\n}\n\n#define PERF_NUM_EVENTS 4\nstruct perf_read_data {\n    uint64_t nr;\n    struct {\n        uint64_t value;\n        uint64_t id;\n    } values[PERF_NUM_EVENTS];\n};\n\nstruct perf_select_data {\n    uint64_t id;   // id used to identify the event when it comes back in a group\n    int fd;        // file descriptor\n    struct perf_event_attr attr;\n    uint64_t value;\n    const char *description;\n};\n\nstruct perf_select_data perf_selected_events[PERF_NUM_EVENTS];\nstruct perf_read_data perfReadData;\nstruct timeval perf_startTv, perf_endTv;\nuint64_t perf_time_ms;\n\n// populates basic properties\nvoid initialize_hw_event(struct perf_event_attr *attr, uint64_t cfg, uint32_t hwid) {\n    memset(attr, 0, sizeof(struct perf_event_attr));\n    \n    // low 32 bits of config = hardware event id\n    // high 32 bits = PMU id (atom/core). 
Get from /sys/devices/<the thing>/type\n    // on Arrow Lake, atom = 10, core = 4 \n    attr->config = cfg | ((uint64_t)hwid << 32);\n    attr->type = PERF_TYPE_HARDWARE;\n    attr->size = sizeof(struct perf_event_attr);\n    attr->disabled = 1;\n    attr->exclude_kernel = 1;\n    attr->exclude_hv = 1;\n    attr->inherit = 1; // include child threads\n    attr->read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID;\n}\n\nvoid set_hw_event(struct perf_select_data *evt, int groupfd) {\n    evt->fd = syscall(__NR_perf_event_open, &(evt->attr), 0, -1, groupfd, 0);\n    ioctl(evt->fd, PERF_EVENT_IOC_ID, &(evt->id));\n}\n\nvoid open_perf_monitoring() {\n    int groupLeaderFd = -1;\n    memset(perf_selected_events, 0, sizeof(struct perf_select_data) * PERF_NUM_EVENTS);\n    \n    perf_selected_events[0].description = \"instructions\";\n    initialize_hw_event(&(perf_selected_events[0].attr), PERF_COUNT_HW_INSTRUCTIONS, 0);\n    set_hw_event(perf_selected_events, -1);\n    groupLeaderFd = perf_selected_events[0].fd;\n\n    perf_selected_events[1].description = \"cycles\";\n    initialize_hw_event(&(perf_selected_events[1].attr), PERF_COUNT_HW_CPU_CYCLES, 0);\n    set_hw_event(perf_selected_events + 1, groupLeaderFd);\n\n    perf_selected_events[2].description = \"llc_ref\";\n    initialize_hw_event(&(perf_selected_events[2].attr), 0x4F2E, 0);\n    perf_selected_events[2].attr.type = PERF_TYPE_RAW;\n    set_hw_event(perf_selected_events + 2, groupLeaderFd);\n\n    perf_selected_events[3].description = \"llc_miss\";\n    initialize_hw_event(&(perf_selected_events[3].attr), 0x412E, 0);\n    perf_selected_events[3].attr.type = PERF_TYPE_RAW;\n    set_hw_event(perf_selected_events + 3, groupLeaderFd);\n}\n\nvoid start_perf_monitoring() {\n    gettimeofday(&perf_startTv, NULL);\n    int groupLeaderFd = perf_selected_events[0].fd;\n    ioctl(groupLeaderFd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP);\n    ioctl(groupLeaderFd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP); \n}\n\nuint64_t 
instrs, cycles, llcRef, llcMiss;\nvoid stop_perf_monitoring() {\n    int readbytes = 0;\n    int groupLeaderFd = perf_selected_events[0].fd;\n    ioctl(groupLeaderFd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);\n    // fprintf(stderr, \"read %d bytes\\n\", sizeof(struct perf_read_data));\n    readbytes = read(groupLeaderFd, &perfReadData, sizeof(struct perf_read_data));\n    //fprintf(stderr, \"Read %d bytes into perf_read_data. nr = %lu\\n\", readbytes, perfReadData.nr);\n    for (int i = 0; i < perfReadData.nr; i++) {\n        for (int evt_idx = 0; evt_idx < PERF_NUM_EVENTS; evt_idx++) {\n            if (perf_selected_events[evt_idx].id == perfReadData.values[i].id) {\n                struct perf_select_data *selected_evt = perf_selected_events + evt_idx;\n                selected_evt->value = perfReadData.values[i].value;\n                // fprintf(stderr, \"%s: %lu\\n\", selected_evt->description, selected_evt->value);\n            }\n        }\n    }\n\n    gettimeofday(&perf_endTv, NULL);\n    perf_time_ms = ((perf_endTv.tv_sec - perf_startTv.tv_sec) * 1000 + (perf_endTv.tv_usec - perf_startTv.tv_usec) / 1000); \n}\n\nvoid close_perf_monitoring() {\n    for (int evt_idx = 0; evt_idx < PERF_NUM_EVENTS; evt_idx++) close(perf_selected_events[evt_idx].fd);\n}\n\nvoid append_perf_header() {\n    for (int evt_idx = 0; evt_idx < PERF_NUM_EVENTS; evt_idx++) {\n        printf(\",%s\", perf_selected_events[evt_idx].description);\n    }\n\n    printf(\",Time (ms)\");\n}\n\nvoid append_perf_values() {\n    for (int evt_idx = 0; evt_idx < PERF_NUM_EVENTS; evt_idx++) {\n        printf(\",%lu\", perf_selected_events[evt_idx].value);\n    }\n    \n    printf(\",%lu\", perf_time_ms);\n}\n"
  },
  {
    "path": "Common/timing.c",
    "content": "#ifdef _MSC_VER\r\n#include <sys\\timeb.h>\r\n__declspec(selectany) struct timeb start, end;\r\nvoid start_timing() {\r\n    ftime(&start);\r\n}\r\n\r\nunsigned int end_timing() {\r\n    ftime(&end);\r\n    return 1000 * (end.time - start.time) + (end.millitm - start.millitm);\r\n}\r\n\r\nvoid start_timing_ts(struct timeb *startTimeb) {\r\n    ftime(startTimeb);\r\n}\r\n\r\nunsigned int end_timing_ts(struct timeb* startTimeb) {\r\n    struct timeb end;\r\n    ftime(&end);\r\n    return 1000 * (end.time - startTimeb->time) + (end.millitm - startTimeb->millitm);\r\n}\r\n#else\r\n#include <sys/time.h>\r\n#include <stddef.h>\r\nstruct timeval startTv, endTv;\r\nvoid start_timing() {\r\n    gettimeofday(&startTv, NULL);\r\n}\r\n\r\nunsigned int end_timing() {\r\n    gettimeofday(&endTv, NULL);\r\n    return (unsigned int)((endTv.tv_sec - startTv.tv_sec) * 1000 + (endTv.tv_usec - startTv.tv_usec) / 1000);\r\n}\r\n\r\nvoid start_timing_ts(struct timeval* start) {\r\n    gettimeofday(start, NULL);\r\n}\r\n\r\nunsigned int end_timing_ts(struct timeval* start) {\r\n    struct timeval end;\r\n    gettimeofday(&end, NULL);\r\n    return (unsigned int)((end.tv_sec - start->tv_sec) * 1000 + (end.tv_usec - start->tv_usec) / 1000);\r\n\r\n}\r\n#endif\r\n\r\nunsigned long long scale_iterations_to_target(unsigned long long last_iteration_count, float last_time, float target_time) {\r\n  // safety measure to deal with nasty timer precision issues if the system is fast\r\n  if (last_time < 50) return last_iteration_count * 2;\r\n  return last_iteration_count * (target_time / last_time);\r\n}\r\n"
  },
  {
    "path": "Common/timing.h",
    "content": "#ifndef timingincluded\r\n#define timingincluded\r\n#ifdef _MSC_VER\r\n#include <sys\\timeb.h>\r\n#else\r\n#include <sys/time.h>\r\n#endif\r\nextern struct timeb start, end;\r\ninline void start_timing();\r\ninline unsigned int end_timing();\r\n\r\n#ifdef _MSC_VER\r\nvoid start_timing_ts(struct timeb* startTimeb);\r\nunsigned int end_timing_ts(struct timeb* startTimeb);\r\n#else\r\nvoid start_timing_ts(struct timeval* start);\r\nunsigned int end_timing_ts(struct timeval* start);\r\n#endif\r\nunsigned long long scale_iterations_to_target(unsigned long long last_iteration_count, float last_time, float target_time);\n#endif\n"
  },
  {
    "path": "CoreClockChecker/BoostClockChecker.c",
    "content": "#include <stdio.h>\n#include <time.h>\n#include <sys/time.h>\n#include <stdint.h>\n#include <stdlib.h> \n#include <string.h>\n#include <unistd.h>\n\nextern uint64_t clktsctest(uint64_t iterations) __attribute((ms_abi));\n\nint main(int argc, char *argv[]) {\n    struct timeval startTv, endTv;\n    uint64_t iterations = 500000, samples = 100;\n    unsigned int sleepSeconds = 5;\n    time_t time_diff_ms;\n\n    for (int argIdx = 1; argIdx < argc; argIdx++) {\n        if (*(argv[argIdx]) == '-') {\n            char *arg = argv[argIdx] + 1;\n\t    if (strncmp(arg, \"samples\", 7) == 0) {\n\t        argIdx++;\n\t\tsamples = atol(argv[argIdx]);\n\t    } else if (strncmp(arg, \"iterations\", 10) == 0) {\n\t        argIdx++;\n\t\titerations = atol(argv[argIdx]);\n            } else if (strncmp(arg, \"sleep\", 5) == 0) {\n\t        argIdx++;\n\t\tsleepSeconds = atoi(argv[argIdx]);\n\t    }\n\t}\n    }\n\n    sleep(sleepSeconds);\n\n    uint64_t *measuredTscs = malloc(samples * sizeof(uint64_t));\n    for (uint64_t sampleIdx = 0; sampleIdx < samples; sampleIdx++) {\n        uint64_t elapsedTsc = clktsctest(iterations);\n\tmeasuredTscs[sampleIdx] = elapsedTsc;\n    }\n\n    fprintf(stderr, \"Used %lu samples\\n\", samples);\n    fprintf(stderr, \"Used %lu iterations\\n\", iterations);\n    // figure out TSC to real time ratio\n    fprintf(stderr, \"Checking TSC ratio...\\n\");\n    uint64_t iterationsHi = 8e9; // should be a couple seconds at least?\n    gettimeofday(&startTv, NULL);\n    uint64_t referenceElapsedTsc = clktsctest(iterationsHi);\n    gettimeofday(&endTv, NULL);\n    time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 1000);\n    float tsc_per_ms = (float)referenceElapsedTsc / (float)time_diff_ms;\n    float tsc_per_ns = tsc_per_ms / 1e6;\n    fprintf(stderr, \"TSC = %lu, elapsed ms = %lu\\n\", referenceElapsedTsc, time_diff_ms);\n    fprintf(stderr, \"TSC per ms: %f, TSC per ns: %f\\n\", tsc_per_ms, 
tsc_per_ns);\n\n    printf(\"Time (ms), Clk (GHz), TSC\\n\");\n    float elapsedTime = 0;\n    for (uint64_t sampleIdx = 0; sampleIdx < samples; sampleIdx++) {\n\t// (tsc / ms) * tsc = 1 / ms\n\tfloat elapsedTimeMs = measuredTscs[sampleIdx] / tsc_per_ms;\n\telapsedTime += elapsedTimeMs;\n\tfloat latency = 1e6 * elapsedTimeMs / (float)iterations;\n\tfloat addsPerNs = 1 / latency;\n\tprintf(\"%f,%f,%lu\\n\", elapsedTime, addsPerNs, measuredTscs[sampleIdx]);\n    }\n\n    return 0;\n}\n"
  },
  {
    "path": "CoreClockChecker/BoostClockChecker_arm.s",
    "content": ".text\n.global clktsctest\n\n.global _clktsctest\n\n.balign 4\n\n/* x0 = iterations, return elapsed TSC in x0 */\n_clktsctest:\nclktsctest:\n  sub sp, sp, #0x40\n  stp x10, x11, [sp, #0x10]\n  stp x12, x13, [sp, #0x20]\n  stp x14, x15, [sp, #0x30]\n  mov x10, 1\n  mov x11, 20\n  mov x12, 0\n  /* stackoverflow says this is a good idea */\n  mrs x14, cntvct_el0\nclktsctest_loop:\n  add x12, x12, x10\n  add x12, x12, x10\n  add x12, x12, x10\n  add x12, x12, x10\n  add x12, x12, x10\n  add x12, x12, x10\n  add x12, x12, x10\n  add x12, x12, x10\n  add x12, x12, x10\n  add x12, x12, x10\n  add x12, x12, x10\n  add x12, x12, x10\n  add x12, x12, x10\n  add x12, x12, x10\n  add x12, x12, x10\n  add x12, x12, x10\n  add x12, x12, x10\n  add x12, x12, x10\n  add x12, x12, x10\n  add x12, x12, x10\n  sub x0, x0, x11\n  cbnz x0, clktsctest_loop\n  mrs x15, cntvct_el0\n  sub x0, x15, x14\n  ldp x14, x15, [sp, #0x30]\n  ldp x12, x13, [sp, #0x20]\n  ldp x10, x11, [sp, #0x10]\n  add sp, sp, #0x40\n  ret\n"
  },
  {
    "path": "CoreClockChecker/BoostClockChecker_x86.s",
    "content": ".global clktsctest\n\n/* rcx = iterations, return elapsed TSC in rax */ \nclktsctest:\n  push %rdx\n  push %rbx\n  push %r8\n  push %r9\n  push %r10\n  mov %rcx, %rdi\n  mov $1, %r8\n  mov $20, %r9\n  xor %rbx, %rbx\n  rdtsc            /* high 32 bits in EDX, low 32 bits in EAX */\n  shl $32, %rdx    /* shift high 32 bits into upper half of EDX */\n  add %rax, %rdx   /* place full 64-bit value in rdx */\n  mov %rdx, %r10\nclktsctest_loop:\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  sub %r9, %rdi\n  jnz clktsctest_loop\n  rdtsc\n  shl $32, %rdx\n  add %rdx, %rax /* now rax has the new value */\n  sub %r10, %rax /* subtract old TSC value from the new one, which should be larger */\n  pop %r10\n  pop %r9\n  pop %r8\n  pop %rbx\n  pop %rdx\n  ret  \n"
  },
  {
    "path": "CoreClockChecker/CoreClockChecker.c",
    "content": "#define _GNU_SOURCE\n#include <cpuid.h>\n#include <stdio.h>\n#include <time.h>\n#include <stdint.h>\n#include <stdlib.h>\n#include <string.h>\n#include <pthread.h>\n#include <sys/sysinfo.h>\n#include <sys/time.h>\n#include <unistd.h>\n#include <fcntl.h>\n#include <math.h>\n\n#define MSR_RAPL_PWR_UNIT 0xC0010299\n#define HWCR 0xC0010015\n#define MSR_CORE_ENERGY_STAT 0xC001029A\n#define MSR_PKG_ENERGY_STAT 0xC001029B\n\n#define INTEL_MSR_RAPL_PWR_UNIT 0x606\n#define INTEL_MSR_PP0_ENERGY_STATUS 0x639\n#define INTEL_MSR_PKG_ENERGY_STATUS 0x611\n\nextern uint64_t clktest(uint64_t iterations) __attribute((sysv_abi));\n\nvoid detectCpuMaker();\nvoid setBoost(int on);\nvoid setAffinity(int core);\nint openMsr(int core);\nuint64_t readMsr(int fd, uint32_t addr);\nvoid writeMsr(int fd, uint32_t addr, uint64_t value);\nfloat getEnergyStatusUnits();\nuint64_t getCoreEnergyStat(int core);\nuint64_t getPkgEnergyStat(int core);\nuint64_t getTotalCoreEnergy();\nint *msrFds;\nint amdCpu = 1;\nint numProcs = 0;\n\nint main(int argc, char *argv[]) {\n    struct timeval startTv, endTv;\n    time_t time_diff_ms;\n    float latency, clockSpeedGhz, energyUnits;\n    uint64_t startEnergy, endEnergy, startPkgEnergy, endPkgEnergy;\n    uint64_t iterationsHigh = 8e9;\n\n    detectCpuMaker();\n    numProcs = get_nprocs();\n    fprintf(stderr, \"Number of CPUs: %u\\n\", numProcs);\n    msrFds = (int *)malloc(sizeof(int) * numProcs);\n    memset(msrFds, 0, sizeof(int) * numProcs);\n\n    if (argc > 1 && strncmp(argv[1], \"disableboost\", 12) == 0) {\n        setBoost(0);\n    } else if (argc > 1 && strncmp(argv[1], \"enableboost\", 11) == 0) {\n        setBoost(1);\n    } else if (argc > 1 && strncmp(argv[1], \"power\", 5) == 0) {\n        iterationsHigh *= 2; // try for more accuracy\n\tenergyUnits = getEnergyStatusUnits();\n\tprintf(\"Core, Core Power, Package Power\\n\");\n        for (int i = 0; i < numProcs; i++) {\n            setAffinity(i);\n\n            
gettimeofday(&startTv, NULL);\n            startEnergy = getCoreEnergyStat(i);\n            startPkgEnergy = getPkgEnergyStat(i);\n            clktest(iterationsHigh);\n            endPkgEnergy = getPkgEnergyStat(i);\n\t    endEnergy = getCoreEnergyStat(i);\n            gettimeofday(&endTv, NULL);\n\n            time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 1000);\n            latency = 1e6 * (float)time_diff_ms / (float)iterationsHigh;\n            clockSpeedGhz = 1 / latency;\n            //printf(\"runtime: %llu ms\\n\", time_diff_ms);\n            //printf(\"%d, %f GHz\\n\", i, clockSpeedGhz);\n\t    printf(\"%d, %f, %f\\n\", i,\n\t        ((endEnergy - startEnergy) * energyUnits) / (time_diff_ms / 1000),\n\t        ((endPkgEnergy - startPkgEnergy) * energyUnits) / (time_diff_ms / 1000));\n        }\n    } else if (argc > 2 && strncmp(argv[1], \"measurecmd\", 9) == 0) {\n        int rc;\n\tfloat coreJoules, pkgJoules;\n        fprintf(stderr, \"argv[2] is %s\\nOnly handling Intel at the moment\\n\", argv[2]);\n\tenergyUnits = getEnergyStatusUnits();\n\n\tgettimeofday(&startTv, NULL);\n\tstartEnergy = getTotalCoreEnergy();\n\tstartPkgEnergy = getPkgEnergyStat(0);\n\trc = system(argv[2]);\n\tendEnergy = getTotalCoreEnergy();\n\tendPkgEnergy = getPkgEnergyStat(0);\n\tgettimeofday(&endTv, NULL);\n\tfprintf(stderr, \"system() returned %d\\n\", rc);\n\n        time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 1000);\n\tcoreJoules = (endEnergy - startEnergy) * energyUnits;\n\tpkgJoules = (endPkgEnergy - startPkgEnergy) * energyUnits;\n\tprintf(\"Core Joules: %f\\n\", coreJoules);\n\tprintf(\"Package Joules: %f\\n\", pkgJoules);\n\tprintf(\"Elapsed time, seconds: %f\\n\", (double)time_diff_ms / 1000);\n    }\n    else {\n        for (int i = 0; i < numProcs; i++) {\n            setAffinity(i);\n\n            gettimeofday(&startTv, NULL);\n            clktest(iterationsHigh);\n     
       gettimeofday(&endTv, NULL);\n            time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 1000);\n            latency = 1e6 * (float)time_diff_ms / (float)iterationsHigh;\n            clockSpeedGhz = 1 / latency;\n            //printf(\"runtime: %llu ms\\n\", time_diff_ms);\n            printf(\"%d, %f GHz\\n\", i, clockSpeedGhz);\n        }\n    }\n\n    free(msrFds);\n    return 0;\n}\n\nvoid detectCpuMaker() {\n    uint32_t cpuidEax, cpuidEbx, cpuidEcx, cpuidEdx;\n    uint32_t *uintPtr;\n    char cpuName[13];\n    amdCpu = 0;\n    __cpuid_count(0, 0, cpuidEax, cpuidEbx, cpuidEcx, cpuidEdx);\n    uintPtr = (uint32_t *)cpuName;\n    uintPtr[0] = cpuidEbx;\n    uintPtr[1] = cpuidEdx;\n    uintPtr[2] = cpuidEcx;\n    cpuName[12] = 0;\n    fprintf(stderr, \"CPU name: %s\\n\", cpuName);\n    if (memcmp(cpuName, \"GenuineIntel\", 12) == 0) {\n        amdCpu = 0;\n\tfprintf(stderr, \"Looks like Intel\\n\");\n    } else if (memcmp(cpuName, \"AuthenticAMD\", 12) == 0) {\n        amdCpu = 1;\n\tfprintf(stderr, \"Looks like AMD\\n\");\n    }\n}\n\nvoid setAffinity(int core) {\n    int rc;\n    cpu_set_t cpuset;\n    pthread_t thread = pthread_self();\n    CPU_ZERO(&cpuset);\n    CPU_SET(core, &cpuset);\n    rc = pthread_setaffinity_np(thread, sizeof(cpuset), &cpuset);\n    if (rc != 0) {\n        fprintf(stderr, \"unable to set thread affinity to %d\\n\", core);\n    }\n}\n\nint openMsr(int core) {\n    char msrFilename[255];\n    int fd;\n    sprintf(msrFilename, \"/dev/cpu/%d/msr\", core);\n    fd = open(msrFilename, O_RDWR);\n    if (fd < 0) {\n        fprintf(stderr, \"Could not open MSR file, core %d\\n\", core);\n        return -1;\n    }\n    return fd;\n}\n\nuint64_t readMsr(int fd, uint32_t addr) {\n    uint64_t result, bytesRead;\n    bytesRead = pread(fd, &result, sizeof(result), addr);\n    if (bytesRead != sizeof(result)) {\n        fprintf(stderr, \"Could not read from fd %d, msr %u\\n\", fd, addr);\n    }\n   
 return result;\n}\n\nvoid writeMsr(int fd, uint32_t addr, uint64_t value) {\n    uint64_t bytesWritten, newValue;\n    bytesWritten = pwrite(fd, &value, sizeof(value), addr);\n    if (bytesWritten != sizeof(value)) {\n        fprintf(stderr, \"Could not write to fd %d, msr %u, value %lu\\n\", fd, addr, value);\n    }\n\n    newValue = readMsr(fd, addr);\n    if (value != newValue) {\n        fprintf(stderr, \"Wrote to fd %d, msr %u, value %lu, but write did not take effect\\n\", fd, addr, value);\n    }\n}\n\nvoid setBoost(int on) {\n    uint64_t hwcrValue;\n    for (int i = 0; i < numProcs; i++) {\n        setAffinity(i);\n\tif (!msrFds[i]) msrFds[i] = openMsr(i);\n\thwcrValue = readMsr(msrFds[i], HWCR);\n        if (on) {\n\t    hwcrValue &= ~(1UL << 25);  // unset bit to request CPB on\n\t    //fprintf(stderr, \"Requesting CPB on (unsetting bit 25 in HWCR): 0x%08x\\n\", hwcrValue);\n\t} else {\n\t    hwcrValue |= (1UL << 25);      // set bit to disable CPB\n\t    //fprintf(stderr, \"Requesting CPB off (setting bit 25 in HWCR): 0x%08x\\n\", hwcrValue);\n\t}\n\n\twriteMsr(msrFds[i], HWCR, hwcrValue);\n    }\n}\n\nfloat getEnergyStatusUnits() {\n    uint64_t energyUnits, raplPwrUnit;\n    setAffinity(0);\n    if (!msrFds[0]) msrFds[0] = openMsr(0);\n\n    if (amdCpu) {\n        raplPwrUnit = readMsr(msrFds[0], MSR_RAPL_PWR_UNIT);\n    }\n    else\n    {\n        raplPwrUnit = readMsr(msrFds[0], INTEL_MSR_RAPL_PWR_UNIT);\n    }\n\n    energyUnits = (raplPwrUnit >> 8) & 0x1F;\n    return (float)pow(0.5, (double)energyUnits);\n}\n\nuint64_t getCoreEnergyStat(int core) {\n    if (!msrFds[core]) msrFds[core] = openMsr(core);\n\n    if (amdCpu)\n        return readMsr(msrFds[core], MSR_CORE_ENERGY_STAT);\n    else\n        return readMsr(msrFds[core], INTEL_MSR_PP0_ENERGY_STATUS);\n}\n\nuint64_t getPkgEnergyStat(int core) {\n    if (!msrFds[core]) msrFds[core] = openMsr(core);\n    if (amdCpu)\n        return readMsr(msrFds[core], MSR_PKG_ENERGY_STAT);\n    else\n       
 return readMsr(msrFds[core], INTEL_MSR_PKG_ENERGY_STATUS);\n}\n\nuint64_t getTotalCoreEnergy() {\n    if (amdCpu) {\n        uint64_t totalCoreEnergy = 0;\n\n\t// only testing the 5950X and 3950X for now, and physical cores\n\t// are 0-15 on linux. hack around this until I have time to\n\t// programatically figure out SMT siblings\n        for (int i = 0; i < 16; i++) {\n            totalCoreEnergy += getCoreEnergyStat(i);\n        }\n\n\treturn totalCoreEnergy;\n    } else {\n        // intel does not track power per core\n        return getCoreEnergyStat(0);\n    }\n}\n"
  },
  {
    "path": "CoreClockChecker/CoreClockChecker_x86.s",
    "content": ".global clktest\n\n/*\n  %rdi = arg0 = iteration count\n*/\nclktest:\n  push %rbx\n  push %r8\n  push %r9\n  mov $1, %r8\n  mov $20, %r9\n  xor %rbx, %rbx\nclktest_loop:\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  sub %r9, %rdi\n  jnz clktest_loop\n  pop %r9\n  pop %r8\n  pop %rbx\n  ret\n"
  },
  {
    "path": "CoreClockChecker/Makefile",
    "content": "include ../Common/arch_detect.mk\n\nCFLAGS = -O3\nLDFLAGS = -lm\n\nall: $(TARGET)\n\namd64:\n\t$(CC) $(CFLAGS) -pthread CoreClockChecker.c CoreClockChecker_x86.s -o CoreClockChecker_amd64 $(LDFLAGS)\n\t$(CC) $(CFLAGS) BoostClockChecker.c BoostClockChecker_x86.s -o BoostClockChecker_amd64 $(LDFLAGS)\n\naarch64:\n\t$(CC) $(CFLAGS) BoostClockChecker.c BoostClockChecker_arm.s -o BoostClockChecker_aarch64 $(LDFLAGS)\n\nw64:\n\t$(CC) $(CFLAGS) BoostClockChecker.c BoostClockChecker_x86.s -o BoostClockChecker_w64.exe $(LDFLAGS)\n\nci: amd64 aarch64 w64\n\nclean:\n\trm -f *.o && find . -type f -executable -delete\n\n.PHONY: all ci clean\n"
  },
  {
    "path": "CoreClockChecker/WinCoreClockChecker/CoreClockCheckFunctions.asm",
    "content": "section .text\r\nbits 64\r\n\r\nglobal clktest\r\n\r\n; rcx = iteration count\r\n; rdx = address of memory location to monitor\r\n; return elapsed tsc\r\nclktest:\r\n  push rdx\r\n  push rbx\r\n  push r8\r\n  push r9\r\n  push r10\r\n  push r11\r\n  xor rbx, rbx\r\n  mov r8, 1 ; GLC will eliminate adds with immediates or increments\r\nclktest_loop:\r\n  add rbx, r8\r\n  add rbx, r8\r\n  add rbx, r8\r\n  add rbx, r8\r\n  add rbx, r8\r\n  add rbx, r8\r\n  add rbx, r8\r\n  add rbx, r8\r\n  add rbx, r8\r\n  add rbx, r8\r\n  add rbx, r8\r\n  add rbx, r8\r\n  add rbx, r8\r\n  add rbx, r8\r\n  add rbx, r8\r\n  add rbx, r8\r\n  add rbx, r8\r\n  add rbx, r8\r\n  add rbx, r8\r\n  add rbx, r8\r\n  mov r11d, [rdx]\r\n  test r11d, r11d\r\n  jnz clktest_loop_end ; early exit condition (someone else exited)\r\n  sub rcx, 20\r\n  jg clktest_loop\r\n  mov [rdx], r8\r\nclktest_loop_end:\r\n  mov rax, rbx\r\n  pop r11\r\n  pop r10\r\n  pop r9\r\n  pop r8\r\n  pop rbx\r\n  pop rdx\r\n  ret"
  },
  {
    "path": "CoreClockChecker/WinCoreClockChecker/WinCoreClockChecker.cpp",
    "content": "// WinCoreClockChecker.cpp : This file contains the 'main' function. Program execution begins and ends there.\r\n//\r\n\r\n#include <stdio.h>\r\n#include <stdlib.h>\r\n#include <stdint.h>\r\n#include <sys\\timeb.h>\r\n#include <windows.h>\r\n\r\nextern \"C\" uint64_t clktest(uint64_t iterations, uint64_t *flag);\r\n\r\nint ECoreTestOrder[] = { 2, 3, 4, 5, 6, 7, 8, 9 };\r\nint BackwardECoreTestOrder[] = { 9, 8, 7, 6, 5, 4, 3, 2 };\r\nint AlternatingECoreTestOrder[] = { 2, 6, 3, 7, 4, 8, 5, 9 };\r\nint PCoreTestOrder[] = { 12, 10, 14, 16, 18, 0 };\r\nint AllECores[] = { 20, 21, 2, 3, 4, 5, 6, 7, 8, 9 };\r\nint AllCores[] = { 12, 10, 14, 16, 18, 0, 2, 3, 4, 5, 6, 7, 8, 9, 20, 21 };\r\n\r\nstruct ClockTestData {\r\n    uint64_t iterations;\r\n    uint64_t completed_iterations;\r\n    uint64_t *flag;\r\n};\r\n\r\nfloat* runMtClockTest(int* cores, int nCores);\r\nvoid PrintResults(int* cores, float* results, int coreCount);\r\nvoid RunCoreByCoreClockTest(int* cores, int coreCount);\r\nvoid RunEvenCoreTest(int coreCount);\r\n\r\nuint64_t start_iterations = 8e9;\r\n\r\nint main(int argc, char *argv[])\r\n{\r\n    // Test E-Cores one by one\r\n    start_iterations = 8e9;\r\n\r\n    if (argc > 1)\r\n    {\r\n        int evenCoreCount = atoi(argv[1]);\r\n        printf(\"Even Cores, core count %d\\n\");\r\n        RunEvenCoreTest(evenCoreCount);\r\n    }\r\n\r\n    int eCoreCount = sizeof(ECoreTestOrder) / sizeof(int);\r\n    printf(\"E-Cores, Warmup:\\n\");\r\n    RunCoreByCoreClockTest(ECoreTestOrder, sizeof(ECoreTestOrder) / sizeof(int));\r\n    printf(\"E-Cores, filling one cluster first:\\n\");\r\n    RunCoreByCoreClockTest(ECoreTestOrder, sizeof(ECoreTestOrder) / sizeof(int));\r\n    printf(\"E-Cores, filling other cluster first but still one cluster at a time:\\n\");\r\n    RunCoreByCoreClockTest(BackwardECoreTestOrder, sizeof(BackwardECoreTestOrder) / sizeof(int));\r\n    printf(\"E-Cores, alternating cores between clusters:\\n\");\r\n    
RunCoreByCoreClockTest(AlternatingECoreTestOrder, sizeof(AlternatingECoreTestOrder) / sizeof(int));\r\n    printf(\"E-Cores, LPE first:\\n\");\r\n    RunCoreByCoreClockTest(AllECores, sizeof(AllECores) / sizeof(int));\r\n\r\n    start_iterations = 12e9;\r\n    printf(\"P-Cores, warmup:\\n\");\r\n    RunCoreByCoreClockTest(PCoreTestOrder, sizeof(PCoreTestOrder) / sizeof(int));\r\n    printf(\"P-Cores, fastest core first:\\n\");\r\n    RunCoreByCoreClockTest(PCoreTestOrder, sizeof(PCoreTestOrder) / sizeof(int));\r\n    printf(\"All cores, fastest core first:\\n\");\r\n    RunCoreByCoreClockTest(AllCores, sizeof(AllCores) / sizeof(int));\r\n\r\n    return 0;\r\n}\r\n\r\nvoid RunEvenCoreTest(int coreCount)\r\n{\r\n    int* coreSequence = (int *)malloc(sizeof(int) * coreCount);\r\n    for (int i = 0; i < coreCount; i++)\r\n    {\r\n        coreSequence[i] = i * 2;\r\n    }\r\n\r\n    RunCoreByCoreClockTest(coreSequence, coreCount);\r\n    free(coreSequence);\r\n}\r\n\r\nvoid RunCoreByCoreClockTest(int *cores, int coreCount)\r\n{\r\n    float* coreByCoreResults = (float*)malloc(sizeof(float) * coreCount * coreCount);\r\n    memset(coreByCoreResults, 0, sizeof(float) * coreCount * coreCount);\r\n    for (int i = 0; i < coreCount; i++)\r\n    {\r\n        float* results = runMtClockTest(cores, i + 1);\r\n        for (int j = 0; j < (i + 1); j++)\r\n        {\r\n            coreByCoreResults[coreCount * i + j] = results[j];\r\n        }\r\n\r\n        free(results);\r\n    }\r\n\r\n    PrintResults(cores, coreByCoreResults, coreCount);\r\n    free(coreByCoreResults);\r\n}\r\n\r\nvoid PrintResults(int *cores, float* results, int coreCount)\r\n{\r\n    // print csv header\r\n    for (int i = 0; i < coreCount; i++)\r\n    {\r\n        printf(\",%d\", cores[i]);\r\n    }\r\n\r\n    printf(\"\\n\");\r\n    for (int currentCoreCountIndex = 0; currentCoreCountIndex < coreCount; currentCoreCountIndex++)\r\n    {\r\n        printf(\"%d\", currentCoreCountIndex + 1);\r\n        for 
(int currentCoreIdx = 0; currentCoreIdx < coreCount; currentCoreIdx++)\r\n        {\r\n            float currentResult = results[coreCount * currentCoreCountIndex + currentCoreIdx];\r\n            if (currentResult != 0.0f) printf(\",%f\", currentResult);\r\n            else printf(\",-\");\r\n        }\r\n\r\n        printf(\"\\n\");\r\n    }\r\n}\r\n\r\nDWORD WINAPI ClockTestThread(LPVOID param)\r\n{\r\n    struct ClockTestData* testData = (struct ClockTestData*)param;\r\n    testData->completed_iterations = clktest(testData->iterations, testData->flag);\r\n    return 0;\r\n}\r\n\r\n// cores = array of test order -> logical core id\r\nfloat* runMtClockTest(int* cores, int nCores)\r\n{\r\n    struct timeb start, end;\r\n    struct ClockTestData* threadData = (struct ClockTestData*)malloc(sizeof(struct ClockTestData) * nCores);\r\n    float* results = (float*)malloc(sizeof(float) * nCores);\r\n    memset(results, 0, sizeof(float) * nCores);\r\n    HANDLE* testThreads = (HANDLE*)malloc(sizeof(HANDLE) * nCores);\r\n\r\n    // try to align test times\r\n    float maxThreadTsc, minThreadTsc;\r\n    float time_diff_sec;\r\n    uint64_t flag = 0;\r\n\r\n    for (int i = 0; i < nCores; i++)\r\n    {\r\n        threadData[i].iterations = start_iterations;\r\n        threadData[i].flag = &flag;\r\n        testThreads[i] = CreateThread(NULL, 0, ClockTestThread, threadData + i, CREATE_SUSPENDED, NULL);\r\n        SetThreadAffinityMask(testThreads[i], 1ULL << (uint64_t)cores[i]);\r\n    }\r\n\r\n    ftime(&start);\r\n    for (int i = 0; i < nCores; i++)\r\n    {\r\n        ResumeThread(testThreads[i]);\r\n    }\r\n\r\n    WaitForMultipleObjects(nCores, testThreads, TRUE, INFINITE);\r\n    ftime(&end);\r\n    time_diff_sec = (float)(end.time - start.time) + 0.001f * (end.millitm - start.millitm);\r\n    for (int i = 0; i < nCores; i++)\r\n    {\r\n        // fprintf(stderr, \"Core %d: %llu iterations in %f sec\\n\", cores[i], threadData[i].completed_iterations, 
time_diff_sec);\r\n        float ghz = ((float)threadData[i].completed_iterations / 1e9) / time_diff_sec;\r\n        // fprintf(stderr, \"Core %d: %f GHz\\n\", cores[i], ghz);\r\n        results[i] = ghz;\r\n    }\r\n\r\n    free(testThreads);\r\n    free(threadData);\r\n    return results;\r\n}"
  },
  {
    "path": "CoreClockChecker/WinCoreClockChecker/WinCoreClockChecker.sln",
    "content": "﻿\r\nMicrosoft Visual Studio Solution File, Format Version 12.00\r\n# Visual Studio Version 17\r\nVisualStudioVersion = 17.9.34723.18\r\nMinimumVisualStudioVersion = 10.0.40219.1\r\nProject(\"{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}\") = \"WinCoreClockChecker\", \"WinCoreClockChecker.vcxproj\", \"{D70EC1DD-794C-4156-8483-227E566CC76B}\"\r\nEndProject\r\nGlobal\r\n\tGlobalSection(SolutionConfigurationPlatforms) = preSolution\r\n\t\tDebug|x64 = Debug|x64\r\n\t\tDebug|x86 = Debug|x86\r\n\t\tRelease|x64 = Release|x64\r\n\t\tRelease|x86 = Release|x86\r\n\tEndGlobalSection\r\n\tGlobalSection(ProjectConfigurationPlatforms) = postSolution\r\n\t\t{D70EC1DD-794C-4156-8483-227E566CC76B}.Debug|x64.ActiveCfg = Debug|x64\r\n\t\t{D70EC1DD-794C-4156-8483-227E566CC76B}.Debug|x64.Build.0 = Debug|x64\r\n\t\t{D70EC1DD-794C-4156-8483-227E566CC76B}.Debug|x86.ActiveCfg = Debug|Win32\r\n\t\t{D70EC1DD-794C-4156-8483-227E566CC76B}.Debug|x86.Build.0 = Debug|Win32\r\n\t\t{D70EC1DD-794C-4156-8483-227E566CC76B}.Release|x64.ActiveCfg = Release|x64\r\n\t\t{D70EC1DD-794C-4156-8483-227E566CC76B}.Release|x64.Build.0 = Release|x64\r\n\t\t{D70EC1DD-794C-4156-8483-227E566CC76B}.Release|x86.ActiveCfg = Release|Win32\r\n\t\t{D70EC1DD-794C-4156-8483-227E566CC76B}.Release|x86.Build.0 = Release|Win32\r\n\tEndGlobalSection\r\n\tGlobalSection(SolutionProperties) = preSolution\r\n\t\tHideSolutionNode = FALSE\r\n\tEndGlobalSection\r\n\tGlobalSection(ExtensibilityGlobals) = postSolution\r\n\t\tSolutionGuid = {6AA7051E-EAEF-48CA-9C08-8641D57B3EB1}\r\n\tEndGlobalSection\r\nEndGlobal\r\n"
  },
  {
    "path": "CoreClockChecker/WinCoreClockChecker/WinCoreClockChecker.vcxproj",
    "content": "<?xml version=\"1.0\" encoding=\"utf-8\"?>\r\n<Project DefaultTargets=\"Build\" xmlns=\"http://schemas.microsoft.com/developer/msbuild/2003\">\r\n  <ItemGroup Label=\"ProjectConfigurations\">\r\n    <ProjectConfiguration Include=\"Debug|Win32\">\r\n      <Configuration>Debug</Configuration>\r\n      <Platform>Win32</Platform>\r\n    </ProjectConfiguration>\r\n    <ProjectConfiguration Include=\"Release|Win32\">\r\n      <Configuration>Release</Configuration>\r\n      <Platform>Win32</Platform>\r\n    </ProjectConfiguration>\r\n    <ProjectConfiguration Include=\"Debug|x64\">\r\n      <Configuration>Debug</Configuration>\r\n      <Platform>x64</Platform>\r\n    </ProjectConfiguration>\r\n    <ProjectConfiguration Include=\"Release|x64\">\r\n      <Configuration>Release</Configuration>\r\n      <Platform>x64</Platform>\r\n    </ProjectConfiguration>\r\n  </ItemGroup>\r\n  <PropertyGroup Label=\"Globals\">\r\n    <VCProjectVersion>17.0</VCProjectVersion>\r\n    <Keyword>Win32Proj</Keyword>\r\n    <ProjectGuid>{d70ec1dd-794c-4156-8483-227e566cc76b}</ProjectGuid>\r\n    <RootNamespace>WinCoreClockChecker</RootNamespace>\r\n    <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>\r\n  </PropertyGroup>\r\n  <Import Project=\"$(VCTargetsPath)\\Microsoft.Cpp.Default.props\" />\r\n  <PropertyGroup Condition=\"'$(Configuration)|$(Platform)'=='Debug|Win32'\" Label=\"Configuration\">\r\n    <ConfigurationType>Application</ConfigurationType>\r\n    <UseDebugLibraries>true</UseDebugLibraries>\r\n    <PlatformToolset>v143</PlatformToolset>\r\n    <CharacterSet>Unicode</CharacterSet>\r\n  </PropertyGroup>\r\n  <PropertyGroup Condition=\"'$(Configuration)|$(Platform)'=='Release|Win32'\" Label=\"Configuration\">\r\n    <ConfigurationType>Application</ConfigurationType>\r\n    <UseDebugLibraries>false</UseDebugLibraries>\r\n    <PlatformToolset>v143</PlatformToolset>\r\n    <WholeProgramOptimization>true</WholeProgramOptimization>\r\n    
<CharacterSet>Unicode</CharacterSet>\r\n  </PropertyGroup>\r\n  <PropertyGroup Condition=\"'$(Configuration)|$(Platform)'=='Debug|x64'\" Label=\"Configuration\">\r\n    <ConfigurationType>Application</ConfigurationType>\r\n    <UseDebugLibraries>true</UseDebugLibraries>\r\n    <PlatformToolset>v143</PlatformToolset>\r\n    <CharacterSet>Unicode</CharacterSet>\r\n  </PropertyGroup>\r\n  <PropertyGroup Condition=\"'$(Configuration)|$(Platform)'=='Release|x64'\" Label=\"Configuration\">\r\n    <ConfigurationType>Application</ConfigurationType>\r\n    <UseDebugLibraries>false</UseDebugLibraries>\r\n    <PlatformToolset>v143</PlatformToolset>\r\n    <WholeProgramOptimization>true</WholeProgramOptimization>\r\n    <CharacterSet>Unicode</CharacterSet>\r\n  </PropertyGroup>\r\n  <Import Project=\"$(VCTargetsPath)\\Microsoft.Cpp.props\" />\r\n  <ImportGroup Label=\"ExtensionSettings\">\r\n  </ImportGroup>\r\n  <ImportGroup Label=\"Shared\">\r\n  </ImportGroup>\r\n  <ImportGroup Label=\"PropertySheets\" Condition=\"'$(Configuration)|$(Platform)'=='Debug|Win32'\">\r\n    <Import Project=\"$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props\" Condition=\"exists('$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props')\" Label=\"LocalAppDataPlatform\" />\r\n  </ImportGroup>\r\n  <ImportGroup Label=\"PropertySheets\" Condition=\"'$(Configuration)|$(Platform)'=='Release|Win32'\">\r\n    <Import Project=\"$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props\" Condition=\"exists('$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props')\" Label=\"LocalAppDataPlatform\" />\r\n  </ImportGroup>\r\n  <ImportGroup Label=\"PropertySheets\" Condition=\"'$(Configuration)|$(Platform)'=='Debug|x64'\">\r\n    <Import Project=\"$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props\" Condition=\"exists('$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props')\" Label=\"LocalAppDataPlatform\" />\r\n  </ImportGroup>\r\n  <ImportGroup Label=\"PropertySheets\" 
Condition=\"'$(Configuration)|$(Platform)'=='Release|x64'\">\r\n    <Import Project=\"$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props\" Condition=\"exists('$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props')\" Label=\"LocalAppDataPlatform\" />\r\n  </ImportGroup>\r\n  <PropertyGroup Label=\"UserMacros\" />\r\n  <ItemDefinitionGroup Condition=\"'$(Configuration)|$(Platform)'=='Debug|Win32'\">\r\n    <ClCompile>\r\n      <WarningLevel>Level3</WarningLevel>\r\n      <SDLCheck>true</SDLCheck>\r\n      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>\r\n      <ConformanceMode>true</ConformanceMode>\r\n    </ClCompile>\r\n    <Link>\r\n      <SubSystem>Console</SubSystem>\r\n      <GenerateDebugInformation>true</GenerateDebugInformation>\r\n    </Link>\r\n  </ItemDefinitionGroup>\r\n  <ItemDefinitionGroup Condition=\"'$(Configuration)|$(Platform)'=='Release|Win32'\">\r\n    <ClCompile>\r\n      <WarningLevel>Level3</WarningLevel>\r\n      <FunctionLevelLinking>true</FunctionLevelLinking>\r\n      <IntrinsicFunctions>true</IntrinsicFunctions>\r\n      <SDLCheck>true</SDLCheck>\r\n      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>\r\n      <ConformanceMode>true</ConformanceMode>\r\n    </ClCompile>\r\n    <Link>\r\n      <SubSystem>Console</SubSystem>\r\n      <EnableCOMDATFolding>true</EnableCOMDATFolding>\r\n      <OptimizeReferences>true</OptimizeReferences>\r\n      <GenerateDebugInformation>true</GenerateDebugInformation>\r\n    </Link>\r\n  </ItemDefinitionGroup>\r\n  <ItemDefinitionGroup Condition=\"'$(Configuration)|$(Platform)'=='Debug|x64'\">\r\n    <ClCompile>\r\n      <WarningLevel>Level3</WarningLevel>\r\n      <SDLCheck>true</SDLCheck>\r\n      <PreprocessorDefinitions>_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>\r\n      <ConformanceMode>true</ConformanceMode>\r\n    </ClCompile>\r\n    <Link>\r\n      
<SubSystem>Console</SubSystem>\r\n      <GenerateDebugInformation>true</GenerateDebugInformation>\r\n    </Link>\r\n  </ItemDefinitionGroup>\r\n  <ItemDefinitionGroup Condition=\"'$(Configuration)|$(Platform)'=='Release|x64'\">\r\n    <ClCompile>\r\n      <WarningLevel>Level3</WarningLevel>\r\n      <FunctionLevelLinking>true</FunctionLevelLinking>\r\n      <IntrinsicFunctions>true</IntrinsicFunctions>\r\n      <SDLCheck>true</SDLCheck>\r\n      <PreprocessorDefinitions>NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>\r\n      <ConformanceMode>true</ConformanceMode>\r\n    </ClCompile>\r\n    <Link>\r\n      <SubSystem>Console</SubSystem>\r\n      <EnableCOMDATFolding>true</EnableCOMDATFolding>\r\n      <OptimizeReferences>true</OptimizeReferences>\r\n      <GenerateDebugInformation>true</GenerateDebugInformation>\r\n    </Link>\r\n  </ItemDefinitionGroup>\r\n  <ItemGroup>\r\n    <ClCompile Include=\"WinCoreClockChecker.cpp\" />\r\n  </ItemGroup>\r\n  <ItemGroup>\r\n    <CustomBuild Include=\"CoreClockCheckFunctions.asm\">\r\n      <FileType>Document</FileType>\r\n      <Command Condition=\"'$(Configuration)|$(Platform)'=='Release|Win32'\">nasm -f win64 CoreClockCheckFunctions.asm</Command>\r\n      <Outputs Condition=\"'$(Configuration)|$(Platform)'=='Release|Win32'\">CoreClockCheckFunctions.obj</Outputs>\r\n      <Command Condition=\"'$(Configuration)|$(Platform)'=='Release|x64'\">nasm -f win64 CoreClockCheckFunctions.asm</Command>\r\n      <Outputs Condition=\"'$(Configuration)|$(Platform)'=='Release|x64'\">CoreClockCheckFunctions.obj</Outputs>\r\n      <Command Condition=\"'$(Configuration)|$(Platform)'=='Debug|Win32'\">nasm -f win64 CoreClockCheckFunctions.asm</Command>\r\n      <Outputs Condition=\"'$(Configuration)|$(Platform)'=='Debug|Win32'\">CoreClockCheckFunctions.obj</Outputs>\r\n      <Command Condition=\"'$(Configuration)|$(Platform)'=='Debug|x64'\">nasm -f win64 CoreClockCheckFunctions.asm</Command>\r\n      <Outputs 
Condition=\"'$(Configuration)|$(Platform)'=='Debug|x64'\">CoreClockCheckFunctions.obj</Outputs>\r\n    </CustomBuild>\r\n  </ItemGroup>\r\n  <Import Project=\"$(VCTargetsPath)\\Microsoft.Cpp.targets\" />\r\n  <ImportGroup Label=\"ExtensionTargets\">\r\n  </ImportGroup>\r\n</Project>"
  },
  {
    "path": "CoreClockChecker/WinCoreClockChecker/WinCoreClockChecker.vcxproj.filters",
    "content": "﻿<?xml version=\"1.0\" encoding=\"utf-8\"?>\r\n<Project ToolsVersion=\"4.0\" xmlns=\"http://schemas.microsoft.com/developer/msbuild/2003\">\r\n  <ItemGroup>\r\n    <Filter Include=\"Source Files\">\r\n      <UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>\r\n      <Extensions>cpp;c;cc;cxx;c++;cppm;ixx;def;odl;idl;hpj;bat;asm;asmx</Extensions>\r\n    </Filter>\r\n    <Filter Include=\"Header Files\">\r\n      <UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>\r\n      <Extensions>h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd</Extensions>\r\n    </Filter>\r\n    <Filter Include=\"Resource Files\">\r\n      <UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier>\r\n      <Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>\r\n    </Filter>\r\n  </ItemGroup>\r\n  <ItemGroup>\r\n    <ClCompile Include=\"WinCoreClockChecker.cpp\">\r\n      <Filter>Source Files</Filter>\r\n    </ClCompile>\r\n  </ItemGroup>\r\n  <ItemGroup>\r\n    <CustomBuild Include=\"CoreClockCheckFunctions.asm\">\r\n      <Filter>Source Files</Filter>\r\n    </CustomBuild>\r\n  </ItemGroup>\r\n</Project>"
  },
  {
    "path": "GpuMemLatency/Makefile",
    "content": "include ../Common/arch_detect.mk\n\nOCL_VER = v2023.04.17\nCI_SCRIPT = ../Common/ci_gpumemlatency.sh\n\nCFLAGS = -O3 -I ../Common\nDEPS = ../Common/timings.h\nOBJ = opencltest.o latency_test.o bw_test.o common.o atomic_test.o instruction_rate.o timing.o\nLDFLAGS ?= -lm -lOpenCL\nifeq ($(TARGET), Darwin)\n    LDFLAGS = -lm -framework OpenCL\nendif\n\nall: $(TARGET)\n\nGpuMemLatency: $(OBJ)\n\t$(CC) $(CPPFLAGS) $(CFLAGS) $^ -o $@ $(LDFLAGS)\n\n%.o: %.c $(DEPS)\n\t$(CC) $(CFLAGS) -c -o $@ $<\n\ntiming.o:\n\t$(CC) $(CFLAGS) -c ../Common/timing.c -o timing.o\n\namd64: $(OBJ)\n\t$(CC) $(CFLAGS) $^ -o GpuMemLatency_amd64 $(LDFLAGS)\n\naarch64: $(OBJ)\n\t$(CC) $(CFLAGS) $^ -o GpuMemLatency_aarch64 $(LDFLAGS)\n\nriscv64: $(OBJ)\n\t$(CC) $(CFLAGS) $^ -o GpuMemLatency_riscv64 $(LDFLAGS)\n\nw64: $(OBJ)\n\t$(CC) $(CFLAGS) $^ -o GpuMemLatency_w64.exe $(LDFLAGS)\n\ndarwin: $(OBJ)\n\t$(CC) $(CFLAGS) $^ -o GpuMemLatency_darwin $(LDFLAGS)\n\nci: clean\n\t@OCL_VER=$(OCL_VER) sh $(CI_SCRIPT)\n\nclean-ci:\n\trm -rf \"*.deb\" \"*.zip\" \"ocl-icd-*\" \"OpenCL-SDK-*\"\n\nclean-obj: \n\trm -f *.o\n\nclean: clean-ci clean-obj\n\tfind . -type f -executable -delete\n\n.PHONY: all ci clean-ci clean-obj clean\n"
  },
  {
    "path": "GpuMemLatency/OpenCL/LICENSE",
    "content": "\n                                 Apache License\n                           Version 2.0, January 2004\n                        http://www.apache.org/licenses/\n\n   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION\n\n   1. Definitions.\n\n      \"License\" shall mean the terms and conditions for use, reproduction,\n      and distribution as defined by Sections 1 through 9 of this document.\n\n      \"Licensor\" shall mean the copyright owner or entity authorized by\n      the copyright owner that is granting the License.\n\n      \"Legal Entity\" shall mean the union of the acting entity and all\n      other entities that control, are controlled by, or are under common\n      control with that entity. For the purposes of this definition,\n      \"control\" means (i) the power, direct or indirect, to cause the\n      direction or management of such entity, whether by contract or\n      otherwise, or (ii) ownership of fifty percent (50%) or more of the\n      outstanding shares, or (iii) beneficial ownership of such entity.\n\n      \"You\" (or \"Your\") shall mean an individual or Legal Entity\n      exercising permissions granted by this License.\n\n      \"Source\" form shall mean the preferred form for making modifications,\n      including but not limited to software source code, documentation\n      source, and configuration files.\n\n      \"Object\" form shall mean any form resulting from mechanical\n      transformation or translation of a Source form, including but\n      not limited to compiled object code, generated documentation,\n      and conversions to other media types.\n\n      \"Work\" shall mean the work of authorship, whether in Source or\n      Object form, made available under the License, as indicated by a\n      copyright notice that is included in or attached to the work\n      (an example is provided in the Appendix below).\n\n      \"Derivative Works\" shall mean any work, whether in Source or Object\n      
form, that is based on (or derived from) the Work and for which the\n      editorial revisions, annotations, elaborations, or other modifications\n      represent, as a whole, an original work of authorship. For the purposes\n      of this License, Derivative Works shall not include works that remain\n      separable from, or merely link (or bind by name) to the interfaces of,\n      the Work and Derivative Works thereof.\n\n      \"Contribution\" shall mean any work of authorship, including\n      the original version of the Work and any modifications or additions\n      to that Work or Derivative Works thereof, that is intentionally\n      submitted to Licensor for inclusion in the Work by the copyright owner\n      or by an individual or Legal Entity authorized to submit on behalf of\n      the copyright owner. For the purposes of this definition, \"submitted\"\n      means any form of electronic, verbal, or written communication sent\n      to the Licensor or its representatives, including but not limited to\n      communication on electronic mailing lists, source code control systems,\n      and issue tracking systems that are managed by, or on behalf of, the\n      Licensor for the purpose of discussing and improving the Work, but\n      excluding communication that is conspicuously marked or otherwise\n      designated in writing by the copyright owner as \"Not a Contribution.\"\n\n      \"Contributor\" shall mean Licensor and any individual or Legal Entity\n      on behalf of whom a Contribution has been received by Licensor and\n      subsequently incorporated within the Work.\n\n   2. Grant of Copyright License. 
Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      copyright license to reproduce, prepare Derivative Works of,\n      publicly display, publicly perform, sublicense, and distribute the\n      Work and such Derivative Works in Source or Object form.\n\n   3. Grant of Patent License. Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      (except as stated in this section) patent license to make, have made,\n      use, offer to sell, sell, import, and otherwise transfer the Work,\n      where such license applies only to those patent claims licensable\n      by such Contributor that are necessarily infringed by their\n      Contribution(s) alone or by combination of their Contribution(s)\n      with the Work to which such Contribution(s) was submitted. If You\n      institute patent litigation against any entity (including a\n      cross-claim or counterclaim in a lawsuit) alleging that the Work\n      or a Contribution incorporated within the Work constitutes direct\n      or contributory patent infringement, then any patent licenses\n      granted to You under this License for that Work shall terminate\n      as of the date such litigation is filed.\n\n   4. Redistribution. 
You may reproduce and distribute copies of the\n      Work or Derivative Works thereof in any medium, with or without\n      modifications, and in Source or Object form, provided that You\n      meet the following conditions:\n\n      (a) You must give any other recipients of the Work or\n          Derivative Works a copy of this License; and\n\n      (b) You must cause any modified files to carry prominent notices\n          stating that You changed the files; and\n\n      (c) You must retain, in the Source form of any Derivative Works\n          that You distribute, all copyright, patent, trademark, and\n          attribution notices from the Source form of the Work,\n          excluding those notices that do not pertain to any part of\n          the Derivative Works; and\n\n      (d) If the Work includes a \"NOTICE\" text file as part of its\n          distribution, then any Derivative Works that You distribute must\n          include a readable copy of the attribution notices contained\n          within such NOTICE file, excluding those notices that do not\n          pertain to any part of the Derivative Works, in at least one\n          of the following places: within a NOTICE text file distributed\n          as part of the Derivative Works; within the Source form or\n          documentation, if provided along with the Derivative Works; or,\n          within a display generated by the Derivative Works, if and\n          wherever such third-party notices normally appear. The contents\n          of the NOTICE file are for informational purposes only and\n          do not modify the License. 
You may add Your own attribution\n          notices within Derivative Works that You distribute, alongside\n          or as an addendum to the NOTICE text from the Work, provided\n          that such additional attribution notices cannot be construed\n          as modifying the License.\n\n      You may add Your own copyright statement to Your modifications and\n      may provide additional or different license terms and conditions\n      for use, reproduction, or distribution of Your modifications, or\n      for any such Derivative Works as a whole, provided Your use,\n      reproduction, and distribution of the Work otherwise complies with\n      the conditions stated in this License.\n\n   5. Submission of Contributions. Unless You explicitly state otherwise,\n      any Contribution intentionally submitted for inclusion in the Work\n      by You to the Licensor shall be under the terms and conditions of\n      this License, without any additional terms or conditions.\n      Notwithstanding the above, nothing herein shall supersede or modify\n      the terms of any separate license agreement you may have executed\n      with Licensor regarding such Contributions.\n\n   6. Trademarks. This License does not grant permission to use the trade\n      names, trademarks, service marks, or product names of the Licensor,\n      except as required for reasonable and customary use in describing the\n      origin of the Work and reproducing the content of the NOTICE file.\n\n   7. Disclaimer of Warranty. Unless required by applicable law or\n      agreed to in writing, Licensor provides the Work (and each\n      Contributor provides its Contributions) on an \"AS IS\" BASIS,\n      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or\n      implied, including, without limitation, any warranties or conditions\n      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A\n      PARTICULAR PURPOSE. 
You are solely responsible for determining the\n      appropriateness of using or redistributing the Work and assume any\n      risks associated with Your exercise of permissions under this License.\n\n   8. Limitation of Liability. In no event and under no legal theory,\n      whether in tort (including negligence), contract, or otherwise,\n      unless required by applicable law (such as deliberate and grossly\n      negligent acts) or agreed to in writing, shall any Contributor be\n      liable to You for damages, including any direct, indirect, special,\n      incidental, or consequential damages of any character arising as a\n      result of this License or out of the use or inability to use the\n      Work (including but not limited to damages for loss of goodwill,\n      work stoppage, computer failure or malfunction, or any and all\n      other commercial damages or losses), even if such Contributor\n      has been advised of the possibility of such damages.\n\n   9. Accepting Warranty or Additional Liability. While redistributing\n      the Work or Derivative Works thereof, You may choose to offer,\n      and charge a fee for, acceptance of support, warranty, indemnity,\n      or other liability obligations and/or rights consistent with this\n      License. However, in accepting such obligations, You may act only\n      on Your own behalf and on Your sole responsibility, not on behalf\n      of any other Contributor, and only if You agree to indemnify,\n      defend, and hold each Contributor harmless for any liability\n      incurred by, or claims asserted against, such Contributor by reason\n      of your accepting any such warranty or additional liability.\n\n   END OF TERMS AND CONDITIONS\n\n   APPENDIX: How to apply the Apache License to your work.\n\n      To apply the Apache License to your work, attach the following\n      boilerplate notice, with the fields enclosed by brackets \"[]\"\n      replaced with your own identifying information. 
(Don't include\n      the brackets!)  The text should be enclosed in the appropriate\n      comment syntax for the file format. We also recommend that a\n      file or class name and description of purpose be included on the\n      same \"printed page\" as the copyright notice for easier\n      identification within third-party archives.\n\n   Copyright [yyyy] [name of copyright owner]\n\n   Licensed under the Apache License, Version 2.0 (the \"License\");\n   you may not use this file except in compliance with the License.\n   You may obtain a copy of the License at\n\n       http://www.apache.org/licenses/LICENSE-2.0\n\n   Unless required by applicable law or agreed to in writing, software\n   distributed under the License is distributed on an \"AS IS\" BASIS,\n   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n   See the License for the specific language governing permissions and\n   limitations under the License.\n"
  },
  {
    "path": "GpuMemLatency/OpenCL/README.md",
    "content": "# OpenCL<sup>TM</sup> API Headers\n\nThis repository contains C language headers for the OpenCL API.\n\nThe authoritative public repository for these headers is located at:\n\nhttps://github.com/KhronosGroup/OpenCL-Headers\n\nIssues, proposed fixes for issues, and other suggested changes should be\ncreated using Github.\n\n## Branch Structure\n\nThe OpenCL API headers in this repository are Unified headers and are designed\nto work with all released OpenCL versions.  This differs from previous OpenCL\nAPI headers, where version-specific API headers either existed in separate\nbranches, or in separate folders in a branch.\n\n## Compiling for a Specific OpenCL Version\n\nBy default, the OpenCL API headers in this repository are for the latest\nOpenCL version (currently OpenCL 2.2).  To use these API headers to target\na different OpenCL version, an application may `#define` the preprocessor\nvalue `CL_TARGET_OPENCL_VERSION` before including the OpenCL API headers.\nThe `CL_TARGET_OPENCL_VERSION` is a three digit decimal value representing\nthe OpenCL API version.\n\nFor example, to enforce usage of no more than the OpenCL 1.2 APIs, you may\ninclude the OpenCL API headers as follows:\n\n```\n#define CL_TARGET_OPENCL_VERSION 120\n#include <CL/opencl.h>\n```\n\n## Directory Structure\n\n```\nREADME.md               This file\nLICENSE                 Source license for the OpenCL API headers\nCL/                     Unified OpenCL API headers tree\n```\n\n## License\n\nSee [LICENSE](LICENSE).\n\n---\n\nOpenCL and the OpenCL logo are trademarks of Apple Inc. used by permission by Khronos.\n"
  },
  {
    "path": "GpuMemLatency/OpenCL/include/CL/cl.h",
    "content": "/*******************************************************************************\n * Copyright (c) 2008-2020 The Khronos Group Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *    http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n ******************************************************************************/\n\n#ifndef __OPENCL_CL_H\n#define __OPENCL_CL_H\n\n#include <CL/cl_version.h>\n#include <CL/cl_platform.h>\n\n#ifdef __cplusplus\nextern \"C\" {\n#endif\n\n/******************************************************************************/\n\ntypedef struct _cl_platform_id *    cl_platform_id;\ntypedef struct _cl_device_id *      cl_device_id;\ntypedef struct _cl_context *        cl_context;\ntypedef struct _cl_command_queue *  cl_command_queue;\ntypedef struct _cl_mem *            cl_mem;\ntypedef struct _cl_program *        cl_program;\ntypedef struct _cl_kernel *         cl_kernel;\ntypedef struct _cl_event *          cl_event;\ntypedef struct _cl_sampler *        cl_sampler;\n\ntypedef cl_uint             cl_bool;                     /* WARNING!  Unlike cl_ types in cl_platform.h, cl_bool is not guaranteed to be the same size as the bool in kernels. 
*/\ntypedef cl_ulong            cl_bitfield;\ntypedef cl_bitfield         cl_device_type;\ntypedef cl_uint             cl_platform_info;\ntypedef cl_uint             cl_device_info;\ntypedef cl_bitfield         cl_device_fp_config;\ntypedef cl_uint             cl_device_mem_cache_type;\ntypedef cl_uint             cl_device_local_mem_type;\ntypedef cl_bitfield         cl_device_exec_capabilities;\n#ifdef CL_VERSION_2_0\ntypedef cl_bitfield         cl_device_svm_capabilities;\n#endif\ntypedef cl_bitfield         cl_command_queue_properties;\n#ifdef CL_VERSION_1_2\ntypedef intptr_t            cl_device_partition_property;\ntypedef cl_bitfield         cl_device_affinity_domain;\n#endif\n\ntypedef intptr_t            cl_context_properties;\ntypedef cl_uint             cl_context_info;\n#ifdef CL_VERSION_2_0\ntypedef cl_bitfield         cl_queue_properties;\n#endif\ntypedef cl_uint             cl_command_queue_info;\ntypedef cl_uint             cl_channel_order;\ntypedef cl_uint             cl_channel_type;\ntypedef cl_bitfield         cl_mem_flags;\n#ifdef CL_VERSION_2_0\ntypedef cl_bitfield         cl_svm_mem_flags;\n#endif\ntypedef cl_uint             cl_mem_object_type;\ntypedef cl_uint             cl_mem_info;\n#ifdef CL_VERSION_1_2\ntypedef cl_bitfield         cl_mem_migration_flags;\n#endif\ntypedef cl_uint             cl_image_info;\n#ifdef CL_VERSION_1_1\ntypedef cl_uint             cl_buffer_create_type;\n#endif\ntypedef cl_uint             cl_addressing_mode;\ntypedef cl_uint             cl_filter_mode;\ntypedef cl_uint             cl_sampler_info;\ntypedef cl_bitfield         cl_map_flags;\n#ifdef CL_VERSION_2_0\ntypedef intptr_t            cl_pipe_properties;\ntypedef cl_uint             cl_pipe_info;\n#endif\ntypedef cl_uint             cl_program_info;\ntypedef cl_uint             cl_program_build_info;\n#ifdef CL_VERSION_1_2\ntypedef cl_uint             cl_program_binary_type;\n#endif\ntypedef cl_int              cl_build_status;\ntypedef cl_uint         
    cl_kernel_info;\n#ifdef CL_VERSION_1_2\ntypedef cl_uint             cl_kernel_arg_info;\ntypedef cl_uint             cl_kernel_arg_address_qualifier;\ntypedef cl_uint             cl_kernel_arg_access_qualifier;\ntypedef cl_bitfield         cl_kernel_arg_type_qualifier;\n#endif\ntypedef cl_uint             cl_kernel_work_group_info;\n#ifdef CL_VERSION_2_1\ntypedef cl_uint             cl_kernel_sub_group_info;\n#endif\ntypedef cl_uint             cl_event_info;\ntypedef cl_uint             cl_command_type;\ntypedef cl_uint             cl_profiling_info;\n#ifdef CL_VERSION_2_0\ntypedef cl_bitfield         cl_sampler_properties;\ntypedef cl_uint             cl_kernel_exec_info;\n#endif\n#ifdef CL_VERSION_3_0\ntypedef cl_bitfield         cl_device_atomic_capabilities;\ntypedef cl_uint             cl_khronos_vendor_id;\ntypedef cl_bitfield         cl_mem_properties;\ntypedef cl_uint             cl_version;\n#endif\n\ntypedef struct _cl_image_format {\n    cl_channel_order        image_channel_order;\n    cl_channel_type         image_channel_data_type;\n} cl_image_format;\n\n#ifdef CL_VERSION_1_2\n\ntypedef struct _cl_image_desc {\n    cl_mem_object_type      image_type;\n    size_t                  image_width;\n    size_t                  image_height;\n    size_t                  image_depth;\n    size_t                  image_array_size;\n    size_t                  image_row_pitch;\n    size_t                  image_slice_pitch;\n    cl_uint                 num_mip_levels;\n    cl_uint                 num_samples;\n#ifdef CL_VERSION_2_0\n#ifdef __GNUC__\n    __extension__   /* Prevents warnings about anonymous union in -pedantic builds */\n#endif\n#ifdef _MSC_VER\n#pragma warning( push )\n#pragma warning( disable : 4201 ) /* Prevents warning about nameless struct/union in /W4 /Za builds */\n#endif\n    union {\n#endif\n      cl_mem                  buffer;\n#ifdef CL_VERSION_2_0\n      cl_mem                  mem_object;\n    };\n#ifdef _MSC_VER\n#pragma 
warning( pop )\n#endif\n#endif\n} cl_image_desc;\n\n#endif\n\n#ifdef CL_VERSION_1_1\n\ntypedef struct _cl_buffer_region {\n    size_t                  origin;\n    size_t                  size;\n} cl_buffer_region;\n\n#endif\n\n#ifdef CL_VERSION_3_0\n\n#define CL_NAME_VERSION_MAX_NAME_SIZE 64\n\ntypedef struct _cl_name_version {\n    cl_version              version;\n    char                    name[CL_NAME_VERSION_MAX_NAME_SIZE];\n} cl_name_version;\n\n#endif\n\n/******************************************************************************/\n\n/* Error Codes */\n#define CL_SUCCESS                                  0\n#define CL_DEVICE_NOT_FOUND                         -1\n#define CL_DEVICE_NOT_AVAILABLE                     -2\n#define CL_COMPILER_NOT_AVAILABLE                   -3\n#define CL_MEM_OBJECT_ALLOCATION_FAILURE            -4\n#define CL_OUT_OF_RESOURCES                         -5\n#define CL_OUT_OF_HOST_MEMORY                       -6\n#define CL_PROFILING_INFO_NOT_AVAILABLE             -7\n#define CL_MEM_COPY_OVERLAP                         -8\n#define CL_IMAGE_FORMAT_MISMATCH                    -9\n#define CL_IMAGE_FORMAT_NOT_SUPPORTED               -10\n#define CL_BUILD_PROGRAM_FAILURE                    -11\n#define CL_MAP_FAILURE                              -12\n#ifdef CL_VERSION_1_1\n#define CL_MISALIGNED_SUB_BUFFER_OFFSET             -13\n#define CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST -14\n#endif\n#ifdef CL_VERSION_1_2\n#define CL_COMPILE_PROGRAM_FAILURE                  -15\n#define CL_LINKER_NOT_AVAILABLE                     -16\n#define CL_LINK_PROGRAM_FAILURE                     -17\n#define CL_DEVICE_PARTITION_FAILED                  -18\n#define CL_KERNEL_ARG_INFO_NOT_AVAILABLE            -19\n#endif\n\n#define CL_INVALID_VALUE                            -30\n#define CL_INVALID_DEVICE_TYPE                      -31\n#define CL_INVALID_PLATFORM                         -32\n#define CL_INVALID_DEVICE                           
-33\n#define CL_INVALID_CONTEXT                          -34\n#define CL_INVALID_QUEUE_PROPERTIES                 -35\n#define CL_INVALID_COMMAND_QUEUE                    -36\n#define CL_INVALID_HOST_PTR                         -37\n#define CL_INVALID_MEM_OBJECT                       -38\n#define CL_INVALID_IMAGE_FORMAT_DESCRIPTOR          -39\n#define CL_INVALID_IMAGE_SIZE                       -40\n#define CL_INVALID_SAMPLER                          -41\n#define CL_INVALID_BINARY                           -42\n#define CL_INVALID_BUILD_OPTIONS                    -43\n#define CL_INVALID_PROGRAM                          -44\n#define CL_INVALID_PROGRAM_EXECUTABLE               -45\n#define CL_INVALID_KERNEL_NAME                      -46\n#define CL_INVALID_KERNEL_DEFINITION                -47\n#define CL_INVALID_KERNEL                           -48\n#define CL_INVALID_ARG_INDEX                        -49\n#define CL_INVALID_ARG_VALUE                        -50\n#define CL_INVALID_ARG_SIZE                         -51\n#define CL_INVALID_KERNEL_ARGS                      -52\n#define CL_INVALID_WORK_DIMENSION                   -53\n#define CL_INVALID_WORK_GROUP_SIZE                  -54\n#define CL_INVALID_WORK_ITEM_SIZE                   -55\n#define CL_INVALID_GLOBAL_OFFSET                    -56\n#define CL_INVALID_EVENT_WAIT_LIST                  -57\n#define CL_INVALID_EVENT                            -58\n#define CL_INVALID_OPERATION                        -59\n#define CL_INVALID_GL_OBJECT                        -60\n#define CL_INVALID_BUFFER_SIZE                      -61\n#define CL_INVALID_MIP_LEVEL                        -62\n#define CL_INVALID_GLOBAL_WORK_SIZE                 -63\n#ifdef CL_VERSION_1_1\n#define CL_INVALID_PROPERTY                         -64\n#endif\n#ifdef CL_VERSION_1_2\n#define CL_INVALID_IMAGE_DESCRIPTOR                 -65\n#define CL_INVALID_COMPILER_OPTIONS                 -66\n#define CL_INVALID_LINKER_OPTIONS                   
-67\n#define CL_INVALID_DEVICE_PARTITION_COUNT           -68\n#endif\n#ifdef CL_VERSION_2_0\n#define CL_INVALID_PIPE_SIZE                        -69\n#define CL_INVALID_DEVICE_QUEUE                     -70\n#endif\n#ifdef CL_VERSION_2_2\n#define CL_INVALID_SPEC_ID                          -71\n#define CL_MAX_SIZE_RESTRICTION_EXCEEDED            -72\n#endif\n\n\n/* cl_bool */\n#define CL_FALSE                                    0\n#define CL_TRUE                                     1\n#ifdef CL_VERSION_1_2\n#define CL_BLOCKING                                 CL_TRUE\n#define CL_NON_BLOCKING                             CL_FALSE\n#endif\n\n/* cl_platform_info */\n#define CL_PLATFORM_PROFILE                         0x0900\n#define CL_PLATFORM_VERSION                         0x0901\n#define CL_PLATFORM_NAME                            0x0902\n#define CL_PLATFORM_VENDOR                          0x0903\n#define CL_PLATFORM_EXTENSIONS                      0x0904\n#ifdef CL_VERSION_2_1\n#define CL_PLATFORM_HOST_TIMER_RESOLUTION           0x0905\n#endif\n#ifdef CL_VERSION_3_0\n#define CL_PLATFORM_NUMERIC_VERSION                 0x0906\n#define CL_PLATFORM_EXTENSIONS_WITH_VERSION         0x0907\n#endif\n\n/* cl_device_type - bitfield */\n#define CL_DEVICE_TYPE_DEFAULT                      (1 << 0)\n#define CL_DEVICE_TYPE_CPU                          (1 << 1)\n#define CL_DEVICE_TYPE_GPU                          (1 << 2)\n#define CL_DEVICE_TYPE_ACCELERATOR                  (1 << 3)\n#ifdef CL_VERSION_1_2\n#define CL_DEVICE_TYPE_CUSTOM                       (1 << 4)\n#endif\n#define CL_DEVICE_TYPE_ALL                          0xFFFFFFFF\n\n/* cl_device_info */\n#define CL_DEVICE_TYPE                                   0x1000\n#define CL_DEVICE_VENDOR_ID                              0x1001\n#define CL_DEVICE_MAX_COMPUTE_UNITS                      0x1002\n#define CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS               0x1003\n#define CL_DEVICE_MAX_WORK_GROUP_SIZE                    
0x1004\n#define CL_DEVICE_MAX_WORK_ITEM_SIZES                    0x1005\n#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR            0x1006\n#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT           0x1007\n#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT             0x1008\n#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG            0x1009\n#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT           0x100A\n#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE          0x100B\n#define CL_DEVICE_MAX_CLOCK_FREQUENCY                    0x100C\n#define CL_DEVICE_ADDRESS_BITS                           0x100D\n#define CL_DEVICE_MAX_READ_IMAGE_ARGS                    0x100E\n#define CL_DEVICE_MAX_WRITE_IMAGE_ARGS                   0x100F\n#define CL_DEVICE_MAX_MEM_ALLOC_SIZE                     0x1010\n#define CL_DEVICE_IMAGE2D_MAX_WIDTH                      0x1011\n#define CL_DEVICE_IMAGE2D_MAX_HEIGHT                     0x1012\n#define CL_DEVICE_IMAGE3D_MAX_WIDTH                      0x1013\n#define CL_DEVICE_IMAGE3D_MAX_HEIGHT                     0x1014\n#define CL_DEVICE_IMAGE3D_MAX_DEPTH                      0x1015\n#define CL_DEVICE_IMAGE_SUPPORT                          0x1016\n#define CL_DEVICE_MAX_PARAMETER_SIZE                     0x1017\n#define CL_DEVICE_MAX_SAMPLERS                           0x1018\n#define CL_DEVICE_MEM_BASE_ADDR_ALIGN                    0x1019\n#define CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE               0x101A\n#define CL_DEVICE_SINGLE_FP_CONFIG                       0x101B\n#define CL_DEVICE_GLOBAL_MEM_CACHE_TYPE                  0x101C\n#define CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE              0x101D\n#define CL_DEVICE_GLOBAL_MEM_CACHE_SIZE                  0x101E\n#define CL_DEVICE_GLOBAL_MEM_SIZE                        0x101F\n#define CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE               0x1020\n#define CL_DEVICE_MAX_CONSTANT_ARGS                      0x1021\n#define CL_DEVICE_LOCAL_MEM_TYPE                         0x1022\n#define CL_DEVICE_LOCAL_MEM_SIZE          
               0x1023\n#define CL_DEVICE_ERROR_CORRECTION_SUPPORT               0x1024\n#define CL_DEVICE_PROFILING_TIMER_RESOLUTION             0x1025\n#define CL_DEVICE_ENDIAN_LITTLE                          0x1026\n#define CL_DEVICE_AVAILABLE                              0x1027\n#define CL_DEVICE_COMPILER_AVAILABLE                     0x1028\n#define CL_DEVICE_EXECUTION_CAPABILITIES                 0x1029\n#define CL_DEVICE_QUEUE_PROPERTIES                       0x102A    /* deprecated */\n#ifdef CL_VERSION_2_0\n#define CL_DEVICE_QUEUE_ON_HOST_PROPERTIES               0x102A\n#endif\n#define CL_DEVICE_NAME                                   0x102B\n#define CL_DEVICE_VENDOR                                 0x102C\n#define CL_DRIVER_VERSION                                0x102D\n#define CL_DEVICE_PROFILE                                0x102E\n#define CL_DEVICE_VERSION                                0x102F\n#define CL_DEVICE_EXTENSIONS                             0x1030\n#define CL_DEVICE_PLATFORM                               0x1031\n#ifdef CL_VERSION_1_2\n#define CL_DEVICE_DOUBLE_FP_CONFIG                       0x1032\n#endif\n/* 0x1033 reserved for CL_DEVICE_HALF_FP_CONFIG which is already defined in \"cl_ext.h\" */\n#ifdef CL_VERSION_1_1\n#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF            0x1034\n#define CL_DEVICE_HOST_UNIFIED_MEMORY                    0x1035   /* deprecated */\n#define CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR               0x1036\n#define CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT              0x1037\n#define CL_DEVICE_NATIVE_VECTOR_WIDTH_INT                0x1038\n#define CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG               0x1039\n#define CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT              0x103A\n#define CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE             0x103B\n#define CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF               0x103C\n#define CL_DEVICE_OPENCL_C_VERSION                       0x103D\n#endif\n#ifdef CL_VERSION_1_2\n#define CL_DEVICE_LINKER_AVAILABLE     
                  0x103E\n#define CL_DEVICE_BUILT_IN_KERNELS                       0x103F\n#define CL_DEVICE_IMAGE_MAX_BUFFER_SIZE                  0x1040\n#define CL_DEVICE_IMAGE_MAX_ARRAY_SIZE                   0x1041\n#define CL_DEVICE_PARENT_DEVICE                          0x1042\n#define CL_DEVICE_PARTITION_MAX_SUB_DEVICES              0x1043\n#define CL_DEVICE_PARTITION_PROPERTIES                   0x1044\n#define CL_DEVICE_PARTITION_AFFINITY_DOMAIN              0x1045\n#define CL_DEVICE_PARTITION_TYPE                         0x1046\n#define CL_DEVICE_REFERENCE_COUNT                        0x1047\n#define CL_DEVICE_PREFERRED_INTEROP_USER_SYNC            0x1048\n#define CL_DEVICE_PRINTF_BUFFER_SIZE                     0x1049\n#endif\n#ifdef CL_VERSION_2_0\n#define CL_DEVICE_IMAGE_PITCH_ALIGNMENT                  0x104A\n#define CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT           0x104B\n#define CL_DEVICE_MAX_READ_WRITE_IMAGE_ARGS              0x104C\n#define CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE               0x104D\n#define CL_DEVICE_QUEUE_ON_DEVICE_PROPERTIES             0x104E\n#define CL_DEVICE_QUEUE_ON_DEVICE_PREFERRED_SIZE         0x104F\n#define CL_DEVICE_QUEUE_ON_DEVICE_MAX_SIZE               0x1050\n#define CL_DEVICE_MAX_ON_DEVICE_QUEUES                   0x1051\n#define CL_DEVICE_MAX_ON_DEVICE_EVENTS                   0x1052\n#define CL_DEVICE_SVM_CAPABILITIES                       0x1053\n#define CL_DEVICE_GLOBAL_VARIABLE_PREFERRED_TOTAL_SIZE   0x1054\n#define CL_DEVICE_MAX_PIPE_ARGS                          0x1055\n#define CL_DEVICE_PIPE_MAX_ACTIVE_RESERVATIONS           0x1056\n#define CL_DEVICE_PIPE_MAX_PACKET_SIZE                   0x1057\n#define CL_DEVICE_PREFERRED_PLATFORM_ATOMIC_ALIGNMENT    0x1058\n#define CL_DEVICE_PREFERRED_GLOBAL_ATOMIC_ALIGNMENT      0x1059\n#define CL_DEVICE_PREFERRED_LOCAL_ATOMIC_ALIGNMENT       0x105A\n#endif\n#ifdef CL_VERSION_2_1\n#define CL_DEVICE_IL_VERSION                             0x105B\n#define 
CL_DEVICE_MAX_NUM_SUB_GROUPS                     0x105C\n#define CL_DEVICE_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS 0x105D\n#endif\n#ifdef CL_VERSION_3_0\n#define CL_DEVICE_NUMERIC_VERSION                        0x105E\n#define CL_DEVICE_EXTENSIONS_WITH_VERSION                0x1060\n#define CL_DEVICE_ILS_WITH_VERSION                       0x1061\n#define CL_DEVICE_BUILT_IN_KERNELS_WITH_VERSION          0x1062\n#define CL_DEVICE_ATOMIC_MEMORY_CAPABILITIES             0x1063\n#define CL_DEVICE_ATOMIC_FENCE_CAPABILITIES              0x1064\n#define CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT         0x1065\n#define CL_DEVICE_OPENCL_C_ALL_VERSIONS                  0x1066\n#define CL_DEVICE_PREFERRED_WORK_GROUP_SIZE_MULTIPLE     0x1067\n#define CL_DEVICE_WORK_GROUP_COLLECTIVE_FUNCTIONS_SUPPORT 0x1068\n#define CL_DEVICE_GENERIC_ADDRESS_SPACE_SUPPORT          0x1069\n/* 0x106A to 0x106E - Reserved for upcoming KHR extension */\n#define CL_DEVICE_OPENCL_C_FEATURES                      0x106F\n#define CL_DEVICE_DEVICE_ENQUEUE_SUPPORT                 0x1070\n#define CL_DEVICE_PIPE_SUPPORT                           0x1071\n#endif\n\n/* cl_device_fp_config - bitfield */\n#define CL_FP_DENORM                                (1 << 0)\n#define CL_FP_INF_NAN                               (1 << 1)\n#define CL_FP_ROUND_TO_NEAREST                      (1 << 2)\n#define CL_FP_ROUND_TO_ZERO                         (1 << 3)\n#define CL_FP_ROUND_TO_INF                          (1 << 4)\n#define CL_FP_FMA                                   (1 << 5)\n#ifdef CL_VERSION_1_1\n#define CL_FP_SOFT_FLOAT                            (1 << 6)\n#endif\n#ifdef CL_VERSION_1_2\n#define CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT         (1 << 7)\n#endif\n\n/* cl_device_mem_cache_type */\n#define CL_NONE                                     0x0\n#define CL_READ_ONLY_CACHE                          0x1\n#define CL_READ_WRITE_CACHE                         0x2\n\n/* cl_device_local_mem_type */\n#define CL_LOCAL             
                       0x1\n#define CL_GLOBAL                                   0x2\n\n/* cl_device_exec_capabilities - bitfield */\n#define CL_EXEC_KERNEL                              (1 << 0)\n#define CL_EXEC_NATIVE_KERNEL                       (1 << 1)\n\n/* cl_command_queue_properties - bitfield */\n#define CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE      (1 << 0)\n#define CL_QUEUE_PROFILING_ENABLE                   (1 << 1)\n#ifdef CL_VERSION_2_0\n#define CL_QUEUE_ON_DEVICE                          (1 << 2)\n#define CL_QUEUE_ON_DEVICE_DEFAULT                  (1 << 3)\n#endif\n\n/* cl_context_info */\n#define CL_CONTEXT_REFERENCE_COUNT                  0x1080\n#define CL_CONTEXT_DEVICES                          0x1081\n#define CL_CONTEXT_PROPERTIES                       0x1082\n#ifdef CL_VERSION_1_1\n#define CL_CONTEXT_NUM_DEVICES                      0x1083\n#endif\n\n/* cl_context_properties */\n#define CL_CONTEXT_PLATFORM                         0x1084\n#ifdef CL_VERSION_1_2\n#define CL_CONTEXT_INTEROP_USER_SYNC                0x1085\n#endif\n\n#ifdef CL_VERSION_1_2\n\n/* cl_device_partition_property */\n#define CL_DEVICE_PARTITION_EQUALLY                 0x1086\n#define CL_DEVICE_PARTITION_BY_COUNTS               0x1087\n#define CL_DEVICE_PARTITION_BY_COUNTS_LIST_END      0x0\n#define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN      0x1088\n\n#endif\n\n#ifdef CL_VERSION_1_2\n\n/* cl_device_affinity_domain */\n#define CL_DEVICE_AFFINITY_DOMAIN_NUMA               (1 << 0)\n#define CL_DEVICE_AFFINITY_DOMAIN_L4_CACHE           (1 << 1)\n#define CL_DEVICE_AFFINITY_DOMAIN_L3_CACHE           (1 << 2)\n#define CL_DEVICE_AFFINITY_DOMAIN_L2_CACHE           (1 << 3)\n#define CL_DEVICE_AFFINITY_DOMAIN_L1_CACHE           (1 << 4)\n#define CL_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE (1 << 5)\n\n#endif\n\n#ifdef CL_VERSION_2_0\n\n/* cl_device_svm_capabilities */\n#define CL_DEVICE_SVM_COARSE_GRAIN_BUFFER           (1 << 0)\n#define CL_DEVICE_SVM_FINE_GRAIN_BUFFER             (1 
<< 1)\n#define CL_DEVICE_SVM_FINE_GRAIN_SYSTEM             (1 << 2)\n#define CL_DEVICE_SVM_ATOMICS                       (1 << 3)\n\n#endif\n\n/* cl_command_queue_info */\n#define CL_QUEUE_CONTEXT                            0x1090\n#define CL_QUEUE_DEVICE                             0x1091\n#define CL_QUEUE_REFERENCE_COUNT                    0x1092\n#define CL_QUEUE_PROPERTIES                         0x1093\n#ifdef CL_VERSION_2_0\n#define CL_QUEUE_SIZE                               0x1094\n#endif\n#ifdef CL_VERSION_2_1\n#define CL_QUEUE_DEVICE_DEFAULT                     0x1095\n#endif\n#ifdef CL_VERSION_3_0\n#define CL_QUEUE_PROPERTIES_ARRAY                   0x1098\n#endif\n\n/* cl_mem_flags and cl_svm_mem_flags - bitfield */\n#define CL_MEM_READ_WRITE                           (1 << 0)\n#define CL_MEM_WRITE_ONLY                           (1 << 1)\n#define CL_MEM_READ_ONLY                            (1 << 2)\n#define CL_MEM_USE_HOST_PTR                         (1 << 3)\n#define CL_MEM_ALLOC_HOST_PTR                       (1 << 4)\n#define CL_MEM_COPY_HOST_PTR                        (1 << 5)\n/* reserved                                         (1 << 6)    */\n#ifdef CL_VERSION_1_2\n#define CL_MEM_HOST_WRITE_ONLY                      (1 << 7)\n#define CL_MEM_HOST_READ_ONLY                       (1 << 8)\n#define CL_MEM_HOST_NO_ACCESS                       (1 << 9)\n#endif\n#ifdef CL_VERSION_2_0\n#define CL_MEM_SVM_FINE_GRAIN_BUFFER                (1 << 10)   /* used by cl_svm_mem_flags only */\n#define CL_MEM_SVM_ATOMICS                          (1 << 11)   /* used by cl_svm_mem_flags only */\n#define CL_MEM_KERNEL_READ_AND_WRITE                (1 << 12)\n#endif\n\n#ifdef CL_VERSION_1_2\n\n/* cl_mem_migration_flags - bitfield */\n#define CL_MIGRATE_MEM_OBJECT_HOST                  (1 << 0)\n#define CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED     (1 << 1)\n\n#endif\n\n/* cl_channel_order */\n#define CL_R                                        0x10B0\n#define CL_A        
                                0x10B1\n#define CL_RG                                       0x10B2\n#define CL_RA                                       0x10B3\n#define CL_RGB                                      0x10B4\n#define CL_RGBA                                     0x10B5\n#define CL_BGRA                                     0x10B6\n#define CL_ARGB                                     0x10B7\n#define CL_INTENSITY                                0x10B8\n#define CL_LUMINANCE                                0x10B9\n#ifdef CL_VERSION_1_1\n#define CL_Rx                                       0x10BA\n#define CL_RGx                                      0x10BB\n#define CL_RGBx                                     0x10BC\n#endif\n#ifdef CL_VERSION_1_2\n#define CL_DEPTH                                    0x10BD\n#define CL_DEPTH_STENCIL                            0x10BE\n#endif\n#ifdef CL_VERSION_2_0\n#define CL_sRGB                                     0x10BF\n#define CL_sRGBx                                    0x10C0\n#define CL_sRGBA                                    0x10C1\n#define CL_sBGRA                                    0x10C2\n#define CL_ABGR                                     0x10C3\n#endif\n\n/* cl_channel_type */\n#define CL_SNORM_INT8                               0x10D0\n#define CL_SNORM_INT16                              0x10D1\n#define CL_UNORM_INT8                               0x10D2\n#define CL_UNORM_INT16                              0x10D3\n#define CL_UNORM_SHORT_565                          0x10D4\n#define CL_UNORM_SHORT_555                          0x10D5\n#define CL_UNORM_INT_101010                         0x10D6\n#define CL_SIGNED_INT8                              0x10D7\n#define CL_SIGNED_INT16                             0x10D8\n#define CL_SIGNED_INT32                             0x10D9\n#define CL_UNSIGNED_INT8                            0x10DA\n#define CL_UNSIGNED_INT16                           0x10DB\n#define CL_UNSIGNED_INT32                 
          0x10DC\n#define CL_HALF_FLOAT                               0x10DD\n#define CL_FLOAT                                    0x10DE\n#ifdef CL_VERSION_1_2\n#define CL_UNORM_INT24                              0x10DF\n#endif\n#ifdef CL_VERSION_2_1\n#define CL_UNORM_INT_101010_2                       0x10E0\n#endif\n\n/* cl_mem_object_type */\n#define CL_MEM_OBJECT_BUFFER                        0x10F0\n#define CL_MEM_OBJECT_IMAGE2D                       0x10F1\n#define CL_MEM_OBJECT_IMAGE3D                       0x10F2\n#ifdef CL_VERSION_1_2\n#define CL_MEM_OBJECT_IMAGE2D_ARRAY                 0x10F3\n#define CL_MEM_OBJECT_IMAGE1D                       0x10F4\n#define CL_MEM_OBJECT_IMAGE1D_ARRAY                 0x10F5\n#define CL_MEM_OBJECT_IMAGE1D_BUFFER                0x10F6\n#endif\n#ifdef CL_VERSION_2_0\n#define CL_MEM_OBJECT_PIPE                          0x10F7\n#endif\n\n/* cl_mem_info */\n#define CL_MEM_TYPE                                 0x1100\n#define CL_MEM_FLAGS                                0x1101\n#define CL_MEM_SIZE                                 0x1102\n#define CL_MEM_HOST_PTR                             0x1103\n#define CL_MEM_MAP_COUNT                            0x1104\n#define CL_MEM_REFERENCE_COUNT                      0x1105\n#define CL_MEM_CONTEXT                              0x1106\n#ifdef CL_VERSION_1_1\n#define CL_MEM_ASSOCIATED_MEMOBJECT                 0x1107\n#define CL_MEM_OFFSET                               0x1108\n#endif\n#ifdef CL_VERSION_2_0\n#define CL_MEM_USES_SVM_POINTER                     0x1109\n#endif\n#ifdef CL_VERSION_3_0\n#define CL_MEM_PROPERTIES                           0x110A\n#endif\n\n/* cl_image_info */\n#define CL_IMAGE_FORMAT                             0x1110\n#define CL_IMAGE_ELEMENT_SIZE                       0x1111\n#define CL_IMAGE_ROW_PITCH                          0x1112\n#define CL_IMAGE_SLICE_PITCH                        0x1113\n#define CL_IMAGE_WIDTH                              0x1114\n#define 
CL_IMAGE_HEIGHT                             0x1115\n#define CL_IMAGE_DEPTH                              0x1116\n#ifdef CL_VERSION_1_2\n#define CL_IMAGE_ARRAY_SIZE                         0x1117\n#define CL_IMAGE_BUFFER                             0x1118\n#define CL_IMAGE_NUM_MIP_LEVELS                     0x1119\n#define CL_IMAGE_NUM_SAMPLES                        0x111A\n#endif\n\n\n/* cl_pipe_info */\n#ifdef CL_VERSION_2_0\n#define CL_PIPE_PACKET_SIZE                         0x1120\n#define CL_PIPE_MAX_PACKETS                         0x1121\n#endif\n#ifdef CL_VERSION_3_0\n#define CL_PIPE_PROPERTIES                          0x1122\n#endif\n\n/* cl_addressing_mode */\n#define CL_ADDRESS_NONE                             0x1130\n#define CL_ADDRESS_CLAMP_TO_EDGE                    0x1131\n#define CL_ADDRESS_CLAMP                            0x1132\n#define CL_ADDRESS_REPEAT                           0x1133\n#ifdef CL_VERSION_1_1\n#define CL_ADDRESS_MIRRORED_REPEAT                  0x1134\n#endif\n\n/* cl_filter_mode */\n#define CL_FILTER_NEAREST                           0x1140\n#define CL_FILTER_LINEAR                            0x1141\n\n/* cl_sampler_info */\n#define CL_SAMPLER_REFERENCE_COUNT                  0x1150\n#define CL_SAMPLER_CONTEXT                          0x1151\n#define CL_SAMPLER_NORMALIZED_COORDS                0x1152\n#define CL_SAMPLER_ADDRESSING_MODE                  0x1153\n#define CL_SAMPLER_FILTER_MODE                      0x1154\n#ifdef CL_VERSION_2_0\n/* These enumerants are for the cl_khr_mipmap_image extension.\n   They have since been added to cl_ext.h with an appropriate\n   KHR suffix, but are left here for backwards compatibility. 
*/\n#define CL_SAMPLER_MIP_FILTER_MODE                  0x1155\n#define CL_SAMPLER_LOD_MIN                          0x1156\n#define CL_SAMPLER_LOD_MAX                          0x1157\n#endif\n#ifdef CL_VERSION_3_0\n#define CL_SAMPLER_PROPERTIES                       0x1158\n#endif\n\n/* cl_map_flags - bitfield */\n#define CL_MAP_READ                                 (1 << 0)\n#define CL_MAP_WRITE                                (1 << 1)\n#ifdef CL_VERSION_1_2\n#define CL_MAP_WRITE_INVALIDATE_REGION              (1 << 2)\n#endif\n\n/* cl_program_info */\n#define CL_PROGRAM_REFERENCE_COUNT                  0x1160\n#define CL_PROGRAM_CONTEXT                          0x1161\n#define CL_PROGRAM_NUM_DEVICES                      0x1162\n#define CL_PROGRAM_DEVICES                          0x1163\n#define CL_PROGRAM_SOURCE                           0x1164\n#define CL_PROGRAM_BINARY_SIZES                     0x1165\n#define CL_PROGRAM_BINARIES                         0x1166\n#ifdef CL_VERSION_1_2\n#define CL_PROGRAM_NUM_KERNELS                      0x1167\n#define CL_PROGRAM_KERNEL_NAMES                     0x1168\n#endif\n#ifdef CL_VERSION_2_1\n#define CL_PROGRAM_IL                               0x1169\n#endif\n#ifdef CL_VERSION_2_2\n#define CL_PROGRAM_SCOPE_GLOBAL_CTORS_PRESENT       0x116A\n#define CL_PROGRAM_SCOPE_GLOBAL_DTORS_PRESENT       0x116B\n#endif\n\n/* cl_program_build_info */\n#define CL_PROGRAM_BUILD_STATUS                     0x1181\n#define CL_PROGRAM_BUILD_OPTIONS                    0x1182\n#define CL_PROGRAM_BUILD_LOG                        0x1183\n#ifdef CL_VERSION_1_2\n#define CL_PROGRAM_BINARY_TYPE                      0x1184\n#endif\n#ifdef CL_VERSION_2_0\n#define CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE 0x1185\n#endif\n\n#ifdef CL_VERSION_1_2\n\n/* cl_program_binary_type */\n#define CL_PROGRAM_BINARY_TYPE_NONE                 0x0\n#define CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT      0x1\n#define CL_PROGRAM_BINARY_TYPE_LIBRARY              0x2\n#define 
CL_PROGRAM_BINARY_TYPE_EXECUTABLE           0x4\n\n#endif\n\n/* cl_build_status */\n#define CL_BUILD_SUCCESS                            0\n#define CL_BUILD_NONE                               -1\n#define CL_BUILD_ERROR                              -2\n#define CL_BUILD_IN_PROGRESS                        -3\n\n/* cl_kernel_info */\n#define CL_KERNEL_FUNCTION_NAME                     0x1190\n#define CL_KERNEL_NUM_ARGS                          0x1191\n#define CL_KERNEL_REFERENCE_COUNT                   0x1192\n#define CL_KERNEL_CONTEXT                           0x1193\n#define CL_KERNEL_PROGRAM                           0x1194\n#ifdef CL_VERSION_1_2\n#define CL_KERNEL_ATTRIBUTES                        0x1195\n#endif\n\n#ifdef CL_VERSION_1_2\n\n/* cl_kernel_arg_info */\n#define CL_KERNEL_ARG_ADDRESS_QUALIFIER             0x1196\n#define CL_KERNEL_ARG_ACCESS_QUALIFIER              0x1197\n#define CL_KERNEL_ARG_TYPE_NAME                     0x1198\n#define CL_KERNEL_ARG_TYPE_QUALIFIER                0x1199\n#define CL_KERNEL_ARG_NAME                          0x119A\n\n#endif\n\n#ifdef CL_VERSION_1_2\n\n/* cl_kernel_arg_address_qualifier */\n#define CL_KERNEL_ARG_ADDRESS_GLOBAL                0x119B\n#define CL_KERNEL_ARG_ADDRESS_LOCAL                 0x119C\n#define CL_KERNEL_ARG_ADDRESS_CONSTANT              0x119D\n#define CL_KERNEL_ARG_ADDRESS_PRIVATE               0x119E\n\n#endif\n\n#ifdef CL_VERSION_1_2\n\n/* cl_kernel_arg_access_qualifier */\n#define CL_KERNEL_ARG_ACCESS_READ_ONLY              0x11A0\n#define CL_KERNEL_ARG_ACCESS_WRITE_ONLY             0x11A1\n#define CL_KERNEL_ARG_ACCESS_READ_WRITE             0x11A2\n#define CL_KERNEL_ARG_ACCESS_NONE                   0x11A3\n\n#endif\n\n#ifdef CL_VERSION_1_2\n\n/* cl_kernel_arg_type_qualifier */\n#define CL_KERNEL_ARG_TYPE_NONE                     0\n#define CL_KERNEL_ARG_TYPE_CONST                    (1 << 0)\n#define CL_KERNEL_ARG_TYPE_RESTRICT                 (1 << 1)\n#define CL_KERNEL_ARG_TYPE_VOLATILE       
          (1 << 2)\n#ifdef CL_VERSION_2_0\n#define CL_KERNEL_ARG_TYPE_PIPE                     (1 << 3)\n#endif\n\n#endif\n\n/* cl_kernel_work_group_info */\n#define CL_KERNEL_WORK_GROUP_SIZE                   0x11B0\n#define CL_KERNEL_COMPILE_WORK_GROUP_SIZE           0x11B1\n#define CL_KERNEL_LOCAL_MEM_SIZE                    0x11B2\n#define CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE 0x11B3\n#define CL_KERNEL_PRIVATE_MEM_SIZE                  0x11B4\n#ifdef CL_VERSION_1_2\n#define CL_KERNEL_GLOBAL_WORK_SIZE                  0x11B5\n#endif\n\n#ifdef CL_VERSION_2_1\n\n/* cl_kernel_sub_group_info */\n#define CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE    0x2033\n#define CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE       0x2034\n#define CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT    0x11B8\n#define CL_KERNEL_MAX_NUM_SUB_GROUPS                0x11B9\n#define CL_KERNEL_COMPILE_NUM_SUB_GROUPS            0x11BA\n\n#endif\n\n#ifdef CL_VERSION_2_0\n\n/* cl_kernel_exec_info */\n#define CL_KERNEL_EXEC_INFO_SVM_PTRS                0x11B6\n#define CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM   0x11B7\n\n#endif\n\n/* cl_event_info */\n#define CL_EVENT_COMMAND_QUEUE                      0x11D0\n#define CL_EVENT_COMMAND_TYPE                       0x11D1\n#define CL_EVENT_REFERENCE_COUNT                    0x11D2\n#define CL_EVENT_COMMAND_EXECUTION_STATUS           0x11D3\n#ifdef CL_VERSION_1_1\n#define CL_EVENT_CONTEXT                            0x11D4\n#endif\n\n/* cl_command_type */\n#define CL_COMMAND_NDRANGE_KERNEL                   0x11F0\n#define CL_COMMAND_TASK                             0x11F1\n#define CL_COMMAND_NATIVE_KERNEL                    0x11F2\n#define CL_COMMAND_READ_BUFFER                      0x11F3\n#define CL_COMMAND_WRITE_BUFFER                     0x11F4\n#define CL_COMMAND_COPY_BUFFER                      0x11F5\n#define CL_COMMAND_READ_IMAGE                       0x11F6\n#define CL_COMMAND_WRITE_IMAGE                      0x11F7\n#define CL_COMMAND_COPY_IMAGE         
              0x11F8\n#define CL_COMMAND_COPY_IMAGE_TO_BUFFER             0x11F9\n#define CL_COMMAND_COPY_BUFFER_TO_IMAGE             0x11FA\n#define CL_COMMAND_MAP_BUFFER                       0x11FB\n#define CL_COMMAND_MAP_IMAGE                        0x11FC\n#define CL_COMMAND_UNMAP_MEM_OBJECT                 0x11FD\n#define CL_COMMAND_MARKER                           0x11FE\n#define CL_COMMAND_ACQUIRE_GL_OBJECTS               0x11FF\n#define CL_COMMAND_RELEASE_GL_OBJECTS               0x1200\n#ifdef CL_VERSION_1_1\n#define CL_COMMAND_READ_BUFFER_RECT                 0x1201\n#define CL_COMMAND_WRITE_BUFFER_RECT                0x1202\n#define CL_COMMAND_COPY_BUFFER_RECT                 0x1203\n#define CL_COMMAND_USER                             0x1204\n#endif\n#ifdef CL_VERSION_1_2\n#define CL_COMMAND_BARRIER                          0x1205\n#define CL_COMMAND_MIGRATE_MEM_OBJECTS              0x1206\n#define CL_COMMAND_FILL_BUFFER                      0x1207\n#define CL_COMMAND_FILL_IMAGE                       0x1208\n#endif\n#ifdef CL_VERSION_2_0\n#define CL_COMMAND_SVM_FREE                         0x1209\n#define CL_COMMAND_SVM_MEMCPY                       0x120A\n#define CL_COMMAND_SVM_MEMFILL                      0x120B\n#define CL_COMMAND_SVM_MAP                          0x120C\n#define CL_COMMAND_SVM_UNMAP                        0x120D\n#endif\n#ifdef CL_VERSION_3_0\n#define CL_COMMAND_SVM_MIGRATE_MEM                  0x120E\n#endif\n\n/* command execution status */\n#define CL_COMPLETE                                 0x0\n#define CL_RUNNING                                  0x1\n#define CL_SUBMITTED                                0x2\n#define CL_QUEUED                                   0x3\n\n/* cl_buffer_create_type */\n#ifdef CL_VERSION_1_1\n#define CL_BUFFER_CREATE_TYPE_REGION                0x1220\n#endif\n\n/* cl_profiling_info */\n#define CL_PROFILING_COMMAND_QUEUED                 0x1280\n#define CL_PROFILING_COMMAND_SUBMIT                 
0x1281\n#define CL_PROFILING_COMMAND_START                  0x1282\n#define CL_PROFILING_COMMAND_END                    0x1283\n#ifdef CL_VERSION_2_0\n#define CL_PROFILING_COMMAND_COMPLETE               0x1284\n#endif\n\n/* cl_device_atomic_capabilities - bitfield */\n#ifdef CL_VERSION_3_0\n#define CL_DEVICE_ATOMIC_ORDER_RELAXED          (1 << 0)\n#define CL_DEVICE_ATOMIC_ORDER_ACQ_REL          (1 << 1)\n#define CL_DEVICE_ATOMIC_ORDER_SEQ_CST          (1 << 2)\n#define CL_DEVICE_ATOMIC_SCOPE_WORK_ITEM        (1 << 3)\n#define CL_DEVICE_ATOMIC_SCOPE_WORK_GROUP       (1 << 4)\n#define CL_DEVICE_ATOMIC_SCOPE_DEVICE           (1 << 5)\n#define CL_DEVICE_ATOMIC_SCOPE_ALL_DEVICES      (1 << 6)\n#endif\n\n/* cl_khronos_vendor_id */\n#define CL_KHRONOS_VENDOR_ID_CODEPLAY               0x10004\n\n#ifdef CL_VERSION_3_0\n\n/* cl_version */\n#define CL_VERSION_MAJOR_BITS (10)\n#define CL_VERSION_MINOR_BITS (10)\n#define CL_VERSION_PATCH_BITS (12)\n\n#define CL_VERSION_MAJOR_MASK ((1 << CL_VERSION_MAJOR_BITS) - 1)\n#define CL_VERSION_MINOR_MASK ((1 << CL_VERSION_MINOR_BITS) - 1)\n#define CL_VERSION_PATCH_MASK ((1 << CL_VERSION_PATCH_BITS) - 1)\n\n#define CL_VERSION_MAJOR(version) \\\n  ((version) >> (CL_VERSION_MINOR_BITS + CL_VERSION_PATCH_BITS))\n\n#define CL_VERSION_MINOR(version) \\\n  (((version) >> CL_VERSION_PATCH_BITS) & CL_VERSION_MINOR_MASK)\n\n#define CL_VERSION_PATCH(version) ((version) & CL_VERSION_PATCH_MASK)\n\n#define CL_MAKE_VERSION(major, minor, patch)                      \\\n  ((((major) & CL_VERSION_MAJOR_MASK)                             \\\n       << (CL_VERSION_MINOR_BITS + CL_VERSION_PATCH_BITS)) |      \\\n   (((minor) & CL_VERSION_MINOR_MASK) << CL_VERSION_PATCH_BITS) | \\\n   ((patch) & CL_VERSION_PATCH_MASK))\n\n#endif\n\n/********************************************************************************************************/\n\n/* Platform API */\nextern CL_API_ENTRY cl_int CL_API_CALL\nclGetPlatformIDs(cl_uint          num_entries,\n              
   cl_platform_id * platforms,\n                 cl_uint *        num_platforms) CL_API_SUFFIX__VERSION_1_0;\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclGetPlatformInfo(cl_platform_id   platform,\n                  cl_platform_info param_name,\n                  size_t           param_value_size,\n                  void *           param_value,\n                  size_t *         param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;\n\n/* Device APIs */\nextern CL_API_ENTRY cl_int CL_API_CALL\nclGetDeviceIDs(cl_platform_id   platform,\n               cl_device_type   device_type,\n               cl_uint          num_entries,\n               cl_device_id *   devices,\n               cl_uint *        num_devices) CL_API_SUFFIX__VERSION_1_0;\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclGetDeviceInfo(cl_device_id    device,\n                cl_device_info  param_name,\n                size_t          param_value_size,\n                void *          param_value,\n                size_t *        param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;\n\n#ifdef CL_VERSION_1_2\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclCreateSubDevices(cl_device_id                         in_device,\n                   const cl_device_partition_property * properties,\n                   cl_uint                              num_devices,\n                   cl_device_id *                       out_devices,\n                   cl_uint *                            num_devices_ret) CL_API_SUFFIX__VERSION_1_2;\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclRetainDevice(cl_device_id device) CL_API_SUFFIX__VERSION_1_2;\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclReleaseDevice(cl_device_id device) CL_API_SUFFIX__VERSION_1_2;\n\n#endif\n\n#ifdef CL_VERSION_2_1\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclSetDefaultDeviceCommandQueue(cl_context           context,\n                               cl_device_id         device,\n                               cl_command_queue     command_queue) 
CL_API_SUFFIX__VERSION_2_1;\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclGetDeviceAndHostTimer(cl_device_id    device,\n                        cl_ulong*       device_timestamp,\n                        cl_ulong*       host_timestamp) CL_API_SUFFIX__VERSION_2_1;\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclGetHostTimer(cl_device_id device,\n               cl_ulong *   host_timestamp) CL_API_SUFFIX__VERSION_2_1;\n\n#endif\n\n/* Context APIs */\nextern CL_API_ENTRY cl_context CL_API_CALL\nclCreateContext(const cl_context_properties * properties,\n                cl_uint              num_devices,\n                const cl_device_id * devices,\n                void (CL_CALLBACK * pfn_notify)(const char * errinfo,\n                                                const void * private_info,\n                                                size_t       cb,\n                                                void *       user_data),\n                void *               user_data,\n                cl_int *             errcode_ret) CL_API_SUFFIX__VERSION_1_0;\n\nextern CL_API_ENTRY cl_context CL_API_CALL\nclCreateContextFromType(const cl_context_properties * properties,\n                        cl_device_type      device_type,\n                        void (CL_CALLBACK * pfn_notify)(const char * errinfo,\n                                                        const void * private_info,\n                                                        size_t       cb,\n                                                        void *       user_data),\n                        void *              user_data,\n                        cl_int *            errcode_ret) CL_API_SUFFIX__VERSION_1_0;\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclRetainContext(cl_context context) CL_API_SUFFIX__VERSION_1_0;\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclReleaseContext(cl_context context) CL_API_SUFFIX__VERSION_1_0;\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclGetContextInfo(cl_context         context,\n 
                cl_context_info    param_name,\n                 size_t             param_value_size,\n                 void *             param_value,\n                 size_t *           param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;\n\n/* Command Queue APIs */\n\n#ifdef CL_VERSION_2_0\n\nextern CL_API_ENTRY cl_command_queue CL_API_CALL\nclCreateCommandQueueWithProperties(cl_context               context,\n                                   cl_device_id             device,\n                                   const cl_queue_properties *    properties,\n                                   cl_int *                 errcode_ret) CL_API_SUFFIX__VERSION_2_0;\n\n#endif\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclRetainCommandQueue(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0;\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclReleaseCommandQueue(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0;\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclGetCommandQueueInfo(cl_command_queue      command_queue,\n                      cl_command_queue_info param_name,\n                      size_t                param_value_size,\n                      void *                param_value,\n                      size_t *              param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;\n\n/* Memory Object APIs */\nextern CL_API_ENTRY cl_mem CL_API_CALL\nclCreateBuffer(cl_context   context,\n               cl_mem_flags flags,\n               size_t       size,\n               void *       host_ptr,\n               cl_int *     errcode_ret) CL_API_SUFFIX__VERSION_1_0;\n\n#ifdef CL_VERSION_1_1\n\nextern CL_API_ENTRY cl_mem CL_API_CALL\nclCreateSubBuffer(cl_mem                   buffer,\n                  cl_mem_flags             flags,\n                  cl_buffer_create_type    buffer_create_type,\n                  const void *             buffer_create_info,\n                  cl_int *                 errcode_ret) CL_API_SUFFIX__VERSION_1_1;\n\n#endif\n\n#ifdef 
CL_VERSION_1_2\n\nextern CL_API_ENTRY cl_mem CL_API_CALL\nclCreateImage(cl_context              context,\n              cl_mem_flags            flags,\n              const cl_image_format * image_format,\n              const cl_image_desc *   image_desc,\n              void *                  host_ptr,\n              cl_int *                errcode_ret) CL_API_SUFFIX__VERSION_1_2;\n\n#endif\n\n#ifdef CL_VERSION_2_0\n\nextern CL_API_ENTRY cl_mem CL_API_CALL\nclCreatePipe(cl_context                 context,\n             cl_mem_flags               flags,\n             cl_uint                    pipe_packet_size,\n             cl_uint                    pipe_max_packets,\n             const cl_pipe_properties * properties,\n             cl_int *                   errcode_ret) CL_API_SUFFIX__VERSION_2_0;\n\n#endif\n\n#ifdef CL_VERSION_3_0\n\nextern CL_API_ENTRY cl_mem CL_API_CALL\nclCreateBufferWithProperties(cl_context                context,\n                             const cl_mem_properties * properties,\n                             cl_mem_flags              flags,\n                             size_t                    size,\n                             void *                    host_ptr,\n                             cl_int *                  errcode_ret) CL_API_SUFFIX__VERSION_3_0;\n\nextern CL_API_ENTRY cl_mem CL_API_CALL\nclCreateImageWithProperties(cl_context                context,\n                            const cl_mem_properties * properties,\n                            cl_mem_flags              flags,\n                            const cl_image_format *   image_format,\n                            const cl_image_desc *     image_desc,\n                            void *                    host_ptr,\n                            cl_int *                  errcode_ret) CL_API_SUFFIX__VERSION_3_0;\n\n#endif\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclRetainMemObject(cl_mem memobj) CL_API_SUFFIX__VERSION_1_0;\n\nextern CL_API_ENTRY cl_int 
CL_API_CALL\nclReleaseMemObject(cl_mem memobj) CL_API_SUFFIX__VERSION_1_0;\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclGetSupportedImageFormats(cl_context           context,\n                           cl_mem_flags         flags,\n                           cl_mem_object_type   image_type,\n                           cl_uint              num_entries,\n                           cl_image_format *    image_formats,\n                           cl_uint *            num_image_formats) CL_API_SUFFIX__VERSION_1_0;\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclGetMemObjectInfo(cl_mem           memobj,\n                   cl_mem_info      param_name,\n                   size_t           param_value_size,\n                   void *           param_value,\n                   size_t *         param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclGetImageInfo(cl_mem           image,\n               cl_image_info    param_name,\n               size_t           param_value_size,\n               void *           param_value,\n               size_t *         param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;\n\n#ifdef CL_VERSION_2_0\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclGetPipeInfo(cl_mem           pipe,\n              cl_pipe_info     param_name,\n              size_t           param_value_size,\n              void *           param_value,\n              size_t *         param_value_size_ret) CL_API_SUFFIX__VERSION_2_0;\n\n#endif\n\n#ifdef CL_VERSION_1_1\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclSetMemObjectDestructorCallback(cl_mem memobj,\n                                 void (CL_CALLBACK * pfn_notify)(cl_mem memobj,\n                                                                 void * user_data),\n                                 void * user_data) CL_API_SUFFIX__VERSION_1_1;\n\n#endif\n\n/* SVM Allocation APIs */\n\n#ifdef CL_VERSION_2_0\n\nextern CL_API_ENTRY void * CL_API_CALL\nclSVMAlloc(cl_context       context,\n        
   cl_svm_mem_flags flags,\n           size_t           size,\n           cl_uint          alignment) CL_API_SUFFIX__VERSION_2_0;\n\nextern CL_API_ENTRY void CL_API_CALL\nclSVMFree(cl_context        context,\n          void *            svm_pointer) CL_API_SUFFIX__VERSION_2_0;\n\n#endif\n\n/* Sampler APIs */\n\n#ifdef CL_VERSION_2_0\n\nextern CL_API_ENTRY cl_sampler CL_API_CALL\nclCreateSamplerWithProperties(cl_context                     context,\n                              const cl_sampler_properties *  sampler_properties,\n                              cl_int *                       errcode_ret) CL_API_SUFFIX__VERSION_2_0;\n\n#endif\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclRetainSampler(cl_sampler sampler) CL_API_SUFFIX__VERSION_1_0;\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclReleaseSampler(cl_sampler sampler) CL_API_SUFFIX__VERSION_1_0;\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclGetSamplerInfo(cl_sampler         sampler,\n                 cl_sampler_info    param_name,\n                 size_t             param_value_size,\n                 void *             param_value,\n                 size_t *           param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;\n\n/* Program Object APIs */\nextern CL_API_ENTRY cl_program CL_API_CALL\nclCreateProgramWithSource(cl_context        context,\n                          cl_uint           count,\n                          const char **     strings,\n                          const size_t *    lengths,\n                          cl_int *          errcode_ret) CL_API_SUFFIX__VERSION_1_0;\n\nextern CL_API_ENTRY cl_program CL_API_CALL\nclCreateProgramWithBinary(cl_context                     context,\n                          cl_uint                        num_devices,\n                          const cl_device_id *           device_list,\n                          const size_t *                 lengths,\n                          const unsigned char **         binaries,\n                          cl_int *           
            binary_status,\n                          cl_int *                       errcode_ret) CL_API_SUFFIX__VERSION_1_0;\n\n#ifdef CL_VERSION_1_2\n\nextern CL_API_ENTRY cl_program CL_API_CALL\nclCreateProgramWithBuiltInKernels(cl_context            context,\n                                  cl_uint               num_devices,\n                                  const cl_device_id *  device_list,\n                                  const char *          kernel_names,\n                                  cl_int *              errcode_ret) CL_API_SUFFIX__VERSION_1_2;\n\n#endif\n\n#ifdef CL_VERSION_2_1\n\nextern CL_API_ENTRY cl_program CL_API_CALL\nclCreateProgramWithIL(cl_context    context,\n                     const void*    il,\n                     size_t         length,\n                     cl_int*        errcode_ret) CL_API_SUFFIX__VERSION_2_1;\n\n#endif\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclRetainProgram(cl_program program) CL_API_SUFFIX__VERSION_1_0;\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclReleaseProgram(cl_program program) CL_API_SUFFIX__VERSION_1_0;\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclBuildProgram(cl_program           program,\n               cl_uint              num_devices,\n               const cl_device_id * device_list,\n               const char *         options,\n               void (CL_CALLBACK *  pfn_notify)(cl_program program,\n                                                void * user_data),\n               void *               user_data) CL_API_SUFFIX__VERSION_1_0;\n\n#ifdef CL_VERSION_1_2\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclCompileProgram(cl_program           program,\n                 cl_uint              num_devices,\n                 const cl_device_id * device_list,\n                 const char *         options,\n                 cl_uint              num_input_headers,\n                 const cl_program *   input_headers,\n                 const char **        header_include_names,\n                 void 
(CL_CALLBACK *  pfn_notify)(cl_program program,\n                                                  void * user_data),\n                 void *               user_data) CL_API_SUFFIX__VERSION_1_2;\n\nextern CL_API_ENTRY cl_program CL_API_CALL\nclLinkProgram(cl_context           context,\n              cl_uint              num_devices,\n              const cl_device_id * device_list,\n              const char *         options,\n              cl_uint              num_input_programs,\n              const cl_program *   input_programs,\n              void (CL_CALLBACK *  pfn_notify)(cl_program program,\n                                               void * user_data),\n              void *               user_data,\n              cl_int *             errcode_ret) CL_API_SUFFIX__VERSION_1_2;\n\n#endif\n\n#ifdef CL_VERSION_2_2\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclSetProgramReleaseCallback(cl_program          program,\n                            void (CL_CALLBACK * pfn_notify)(cl_program program,\n                                                            void * user_data),\n                            void *              user_data) CL_API_SUFFIX__VERSION_2_2;\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclSetProgramSpecializationConstant(cl_program  program,\n                                   cl_uint     spec_id,\n                                   size_t      spec_size,\n                                   const void* spec_value) CL_API_SUFFIX__VERSION_2_2;\n\n#endif\n\n#ifdef CL_VERSION_1_2\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclUnloadPlatformCompiler(cl_platform_id platform) CL_API_SUFFIX__VERSION_1_2;\n\n#endif\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclGetProgramInfo(cl_program         program,\n                 cl_program_info    param_name,\n                 size_t             param_value_size,\n                 void *             param_value,\n                 size_t *           param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;\n\nextern 
CL_API_ENTRY cl_int CL_API_CALL\nclGetProgramBuildInfo(cl_program            program,\n                      cl_device_id          device,\n                      cl_program_build_info param_name,\n                      size_t                param_value_size,\n                      void *                param_value,\n                      size_t *              param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;\n\n/* Kernel Object APIs */\nextern CL_API_ENTRY cl_kernel CL_API_CALL\nclCreateKernel(cl_program      program,\n               const char *    kernel_name,\n               cl_int *        errcode_ret) CL_API_SUFFIX__VERSION_1_0;\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclCreateKernelsInProgram(cl_program     program,\n                         cl_uint        num_kernels,\n                         cl_kernel *    kernels,\n                         cl_uint *      num_kernels_ret) CL_API_SUFFIX__VERSION_1_0;\n\n#ifdef CL_VERSION_2_1\n\nextern CL_API_ENTRY cl_kernel CL_API_CALL\nclCloneKernel(cl_kernel     source_kernel,\n              cl_int*       errcode_ret) CL_API_SUFFIX__VERSION_2_1;\n\n#endif\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclRetainKernel(cl_kernel    kernel) CL_API_SUFFIX__VERSION_1_0;\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclReleaseKernel(cl_kernel   kernel) CL_API_SUFFIX__VERSION_1_0;\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclSetKernelArg(cl_kernel    kernel,\n               cl_uint      arg_index,\n               size_t       arg_size,\n               const void * arg_value) CL_API_SUFFIX__VERSION_1_0;\n\n#ifdef CL_VERSION_2_0\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclSetKernelArgSVMPointer(cl_kernel    kernel,\n                         cl_uint      arg_index,\n                         const void * arg_value) CL_API_SUFFIX__VERSION_2_0;\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclSetKernelExecInfo(cl_kernel            kernel,\n                    cl_kernel_exec_info  param_name,\n                    size_t               
param_value_size,\n                    const void *         param_value) CL_API_SUFFIX__VERSION_2_0;\n\n#endif\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclGetKernelInfo(cl_kernel       kernel,\n                cl_kernel_info  param_name,\n                size_t          param_value_size,\n                void *          param_value,\n                size_t *        param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;\n\n#ifdef CL_VERSION_1_2\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclGetKernelArgInfo(cl_kernel       kernel,\n                   cl_uint         arg_indx,\n                   cl_kernel_arg_info  param_name,\n                   size_t          param_value_size,\n                   void *          param_value,\n                   size_t *        param_value_size_ret) CL_API_SUFFIX__VERSION_1_2;\n\n#endif\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclGetKernelWorkGroupInfo(cl_kernel                  kernel,\n                         cl_device_id               device,\n                         cl_kernel_work_group_info  param_name,\n                         size_t                     param_value_size,\n                         void *                     param_value,\n                         size_t *                   param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;\n\n#ifdef CL_VERSION_2_1\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclGetKernelSubGroupInfo(cl_kernel                   kernel,\n                        cl_device_id                device,\n                        cl_kernel_sub_group_info    param_name,\n                        size_t                      input_value_size,\n                        const void*                 input_value,\n                        size_t                      param_value_size,\n                        void*                       param_value,\n                        size_t*                     param_value_size_ret) CL_API_SUFFIX__VERSION_2_1;\n\n#endif\n\n/* Event Object APIs */\nextern CL_API_ENTRY cl_int 
CL_API_CALL\nclWaitForEvents(cl_uint             num_events,\n                const cl_event *    event_list) CL_API_SUFFIX__VERSION_1_0;\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclGetEventInfo(cl_event         event,\n               cl_event_info    param_name,\n               size_t           param_value_size,\n               void *           param_value,\n               size_t *         param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;\n\n#ifdef CL_VERSION_1_1\n\nextern CL_API_ENTRY cl_event CL_API_CALL\nclCreateUserEvent(cl_context    context,\n                  cl_int *      errcode_ret) CL_API_SUFFIX__VERSION_1_1;\n\n#endif\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclRetainEvent(cl_event event) CL_API_SUFFIX__VERSION_1_0;\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclReleaseEvent(cl_event event) CL_API_SUFFIX__VERSION_1_0;\n\n#ifdef CL_VERSION_1_1\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclSetUserEventStatus(cl_event   event,\n                     cl_int     execution_status) CL_API_SUFFIX__VERSION_1_1;\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclSetEventCallback(cl_event    event,\n                   cl_int      command_exec_callback_type,\n                   void (CL_CALLBACK * pfn_notify)(cl_event event,\n                                                   cl_int   event_command_status,\n                                                   void *   user_data),\n                   void *      user_data) CL_API_SUFFIX__VERSION_1_1;\n\n#endif\n\n/* Profiling APIs */\nextern CL_API_ENTRY cl_int CL_API_CALL\nclGetEventProfilingInfo(cl_event            event,\n                        cl_profiling_info   param_name,\n                        size_t              param_value_size,\n                        void *              param_value,\n                        size_t *            param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;\n\n/* Flush and Finish APIs */\nextern CL_API_ENTRY cl_int CL_API_CALL\nclFlush(cl_command_queue command_queue) 
CL_API_SUFFIX__VERSION_1_0;\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclFinish(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0;\n\n/* Enqueued Commands APIs */\nextern CL_API_ENTRY cl_int CL_API_CALL\nclEnqueueReadBuffer(cl_command_queue    command_queue,\n                    cl_mem              buffer,\n                    cl_bool             blocking_read,\n                    size_t              offset,\n                    size_t              size,\n                    void *              ptr,\n                    cl_uint             num_events_in_wait_list,\n                    const cl_event *    event_wait_list,\n                    cl_event *          event) CL_API_SUFFIX__VERSION_1_0;\n\n#ifdef CL_VERSION_1_1\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclEnqueueReadBufferRect(cl_command_queue    command_queue,\n                        cl_mem              buffer,\n                        cl_bool             blocking_read,\n                        const size_t *      buffer_offset,\n                        const size_t *      host_offset,\n                        const size_t *      region,\n                        size_t              buffer_row_pitch,\n                        size_t              buffer_slice_pitch,\n                        size_t              host_row_pitch,\n                        size_t              host_slice_pitch,\n                        void *              ptr,\n                        cl_uint             num_events_in_wait_list,\n                        const cl_event *    event_wait_list,\n                        cl_event *          event) CL_API_SUFFIX__VERSION_1_1;\n\n#endif\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclEnqueueWriteBuffer(cl_command_queue   command_queue,\n                     cl_mem             buffer,\n                     cl_bool            blocking_write,\n                     size_t             offset,\n                     size_t             size,\n                     const void *       ptr,\n   
                  cl_uint            num_events_in_wait_list,\n                     const cl_event *   event_wait_list,\n                     cl_event *         event) CL_API_SUFFIX__VERSION_1_0;\n\n#ifdef CL_VERSION_1_1\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclEnqueueWriteBufferRect(cl_command_queue    command_queue,\n                         cl_mem              buffer,\n                         cl_bool             blocking_write,\n                         const size_t *      buffer_offset,\n                         const size_t *      host_offset,\n                         const size_t *      region,\n                         size_t              buffer_row_pitch,\n                         size_t              buffer_slice_pitch,\n                         size_t              host_row_pitch,\n                         size_t              host_slice_pitch,\n                         const void *        ptr,\n                         cl_uint             num_events_in_wait_list,\n                         const cl_event *    event_wait_list,\n                         cl_event *          event) CL_API_SUFFIX__VERSION_1_1;\n\n#endif\n\n#ifdef CL_VERSION_1_2\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclEnqueueFillBuffer(cl_command_queue   command_queue,\n                    cl_mem             buffer,\n                    const void *       pattern,\n                    size_t             pattern_size,\n                    size_t             offset,\n                    size_t             size,\n                    cl_uint            num_events_in_wait_list,\n                    const cl_event *   event_wait_list,\n                    cl_event *         event) CL_API_SUFFIX__VERSION_1_2;\n\n#endif\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclEnqueueCopyBuffer(cl_command_queue    command_queue,\n                    cl_mem              src_buffer,\n                    cl_mem              dst_buffer,\n                    size_t              src_offset,\n                    
size_t              dst_offset,\n                    size_t              size,\n                    cl_uint             num_events_in_wait_list,\n                    const cl_event *    event_wait_list,\n                    cl_event *          event) CL_API_SUFFIX__VERSION_1_0;\n\n#ifdef CL_VERSION_1_1\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclEnqueueCopyBufferRect(cl_command_queue    command_queue,\n                        cl_mem              src_buffer,\n                        cl_mem              dst_buffer,\n                        const size_t *      src_origin,\n                        const size_t *      dst_origin,\n                        const size_t *      region,\n                        size_t              src_row_pitch,\n                        size_t              src_slice_pitch,\n                        size_t              dst_row_pitch,\n                        size_t              dst_slice_pitch,\n                        cl_uint             num_events_in_wait_list,\n                        const cl_event *    event_wait_list,\n                        cl_event *          event) CL_API_SUFFIX__VERSION_1_1;\n\n#endif\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclEnqueueReadImage(cl_command_queue     command_queue,\n                   cl_mem               image,\n                   cl_bool              blocking_read,\n                   const size_t *       origin,\n                   const size_t *       region,\n                   size_t               row_pitch,\n                   size_t               slice_pitch,\n                   void *               ptr,\n                   cl_uint              num_events_in_wait_list,\n                   const cl_event *     event_wait_list,\n                   cl_event *           event) CL_API_SUFFIX__VERSION_1_0;\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclEnqueueWriteImage(cl_command_queue    command_queue,\n                    cl_mem              image,\n                    cl_bool             
blocking_write,\n                    const size_t *      origin,\n                    const size_t *      region,\n                    size_t              input_row_pitch,\n                    size_t              input_slice_pitch,\n                    const void *        ptr,\n                    cl_uint             num_events_in_wait_list,\n                    const cl_event *    event_wait_list,\n                    cl_event *          event) CL_API_SUFFIX__VERSION_1_0;\n\n#ifdef CL_VERSION_1_2\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclEnqueueFillImage(cl_command_queue   command_queue,\n                   cl_mem             image,\n                   const void *       fill_color,\n                   const size_t *     origin,\n                   const size_t *     region,\n                   cl_uint            num_events_in_wait_list,\n                   const cl_event *   event_wait_list,\n                   cl_event *         event) CL_API_SUFFIX__VERSION_1_2;\n\n#endif\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclEnqueueCopyImage(cl_command_queue     command_queue,\n                   cl_mem               src_image,\n                   cl_mem               dst_image,\n                   const size_t *       src_origin,\n                   const size_t *       dst_origin,\n                   const size_t *       region,\n                   cl_uint              num_events_in_wait_list,\n                   const cl_event *     event_wait_list,\n                   cl_event *           event) CL_API_SUFFIX__VERSION_1_0;\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclEnqueueCopyImageToBuffer(cl_command_queue command_queue,\n                           cl_mem           src_image,\n                           cl_mem           dst_buffer,\n                           const size_t *   src_origin,\n                           const size_t *   region,\n                           size_t           dst_offset,\n                           cl_uint          
num_events_in_wait_list,\n                           const cl_event * event_wait_list,\n                           cl_event *       event) CL_API_SUFFIX__VERSION_1_0;\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclEnqueueCopyBufferToImage(cl_command_queue command_queue,\n                           cl_mem           src_buffer,\n                           cl_mem           dst_image,\n                           size_t           src_offset,\n                           const size_t *   dst_origin,\n                           const size_t *   region,\n                           cl_uint          num_events_in_wait_list,\n                           const cl_event * event_wait_list,\n                           cl_event *       event) CL_API_SUFFIX__VERSION_1_0;\n\nextern CL_API_ENTRY void * CL_API_CALL\nclEnqueueMapBuffer(cl_command_queue command_queue,\n                   cl_mem           buffer,\n                   cl_bool          blocking_map,\n                   cl_map_flags     map_flags,\n                   size_t           offset,\n                   size_t           size,\n                   cl_uint          num_events_in_wait_list,\n                   const cl_event * event_wait_list,\n                   cl_event *       event,\n                   cl_int *         errcode_ret) CL_API_SUFFIX__VERSION_1_0;\n\nextern CL_API_ENTRY void * CL_API_CALL\nclEnqueueMapImage(cl_command_queue  command_queue,\n                  cl_mem            image,\n                  cl_bool           blocking_map,\n                  cl_map_flags      map_flags,\n                  const size_t *    origin,\n                  const size_t *    region,\n                  size_t *          image_row_pitch,\n                  size_t *          image_slice_pitch,\n                  cl_uint           num_events_in_wait_list,\n                  const cl_event *  event_wait_list,\n                  cl_event *        event,\n                  cl_int *          errcode_ret) 
CL_API_SUFFIX__VERSION_1_0;\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclEnqueueUnmapMemObject(cl_command_queue command_queue,\n                        cl_mem           memobj,\n                        void *           mapped_ptr,\n                        cl_uint          num_events_in_wait_list,\n                        const cl_event * event_wait_list,\n                        cl_event *       event) CL_API_SUFFIX__VERSION_1_0;\n\n#ifdef CL_VERSION_1_2\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclEnqueueMigrateMemObjects(cl_command_queue       command_queue,\n                           cl_uint                num_mem_objects,\n                           const cl_mem *         mem_objects,\n                           cl_mem_migration_flags flags,\n                           cl_uint                num_events_in_wait_list,\n                           const cl_event *       event_wait_list,\n                           cl_event *             event) CL_API_SUFFIX__VERSION_1_2;\n\n#endif\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclEnqueueNDRangeKernel(cl_command_queue command_queue,\n                       cl_kernel        kernel,\n                       cl_uint          work_dim,\n                       const size_t *   global_work_offset,\n                       const size_t *   global_work_size,\n                       const size_t *   local_work_size,\n                       cl_uint          num_events_in_wait_list,\n                       const cl_event * event_wait_list,\n                       cl_event *       event) CL_API_SUFFIX__VERSION_1_0;\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclEnqueueNativeKernel(cl_command_queue  command_queue,\n                      void (CL_CALLBACK * user_func)(void *),\n                      void *            args,\n                      size_t            cb_args,\n                      cl_uint           num_mem_objects,\n                      const cl_mem *    mem_list,\n                      const void **     args_mem_loc,\n    
                  cl_uint           num_events_in_wait_list,\n                      const cl_event *  event_wait_list,\n                      cl_event *        event) CL_API_SUFFIX__VERSION_1_0;\n\n#ifdef CL_VERSION_1_2\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclEnqueueMarkerWithWaitList(cl_command_queue  command_queue,\n                            cl_uint           num_events_in_wait_list,\n                            const cl_event *  event_wait_list,\n                            cl_event *        event) CL_API_SUFFIX__VERSION_1_2;\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclEnqueueBarrierWithWaitList(cl_command_queue  command_queue,\n                             cl_uint           num_events_in_wait_list,\n                             const cl_event *  event_wait_list,\n                             cl_event *        event) CL_API_SUFFIX__VERSION_1_2;\n\n#endif\n\n#ifdef CL_VERSION_2_0\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclEnqueueSVMFree(cl_command_queue  command_queue,\n                 cl_uint           num_svm_pointers,\n                 void *            svm_pointers[],\n                 void (CL_CALLBACK * pfn_free_func)(cl_command_queue queue,\n                                                    cl_uint          num_svm_pointers,\n                                                    void *           svm_pointers[],\n                                                    void *           user_data),\n                 void *            user_data,\n                 cl_uint           num_events_in_wait_list,\n                 const cl_event *  event_wait_list,\n                 cl_event *        event) CL_API_SUFFIX__VERSION_2_0;\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclEnqueueSVMMemcpy(cl_command_queue  command_queue,\n                   cl_bool           blocking_copy,\n                   void *            dst_ptr,\n                   const void *      src_ptr,\n                   size_t            size,\n                   cl_uint           
num_events_in_wait_list,\n                   const cl_event *  event_wait_list,\n                   cl_event *        event) CL_API_SUFFIX__VERSION_2_0;\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclEnqueueSVMMemFill(cl_command_queue  command_queue,\n                    void *            svm_ptr,\n                    const void *      pattern,\n                    size_t            pattern_size,\n                    size_t            size,\n                    cl_uint           num_events_in_wait_list,\n                    const cl_event *  event_wait_list,\n                    cl_event *        event) CL_API_SUFFIX__VERSION_2_0;\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclEnqueueSVMMap(cl_command_queue  command_queue,\n                cl_bool           blocking_map,\n                cl_map_flags      flags,\n                void *            svm_ptr,\n                size_t            size,\n                cl_uint           num_events_in_wait_list,\n                const cl_event *  event_wait_list,\n                cl_event *        event) CL_API_SUFFIX__VERSION_2_0;\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclEnqueueSVMUnmap(cl_command_queue  command_queue,\n                  void *            svm_ptr,\n                  cl_uint           num_events_in_wait_list,\n                  const cl_event *  event_wait_list,\n                  cl_event *        event) CL_API_SUFFIX__VERSION_2_0;\n\n#endif\n\n#ifdef CL_VERSION_2_1\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclEnqueueSVMMigrateMem(cl_command_queue         command_queue,\n                       cl_uint                  num_svm_pointers,\n                       const void **            svm_pointers,\n                       const size_t *           sizes,\n                       cl_mem_migration_flags   flags,\n                       cl_uint                  num_events_in_wait_list,\n                       const cl_event *         event_wait_list,\n                       cl_event *               event) 
CL_API_SUFFIX__VERSION_2_1;\n\n#endif\n\n#ifdef CL_VERSION_1_2\n\n/* Extension function access\n *\n * Returns the extension function address for the given function name,\n * or NULL if a valid function can not be found.  The client must\n * check to make sure the address is not NULL, before using or\n * calling the returned function address.\n */\nextern CL_API_ENTRY void * CL_API_CALL\nclGetExtensionFunctionAddressForPlatform(cl_platform_id platform,\n                                         const char *   func_name) CL_API_SUFFIX__VERSION_1_2;\n\n#endif\n\n#ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS\n    /*\n     *  WARNING:\n     *     This API introduces mutable state into the OpenCL implementation. It has been REMOVED\n     *  to better facilitate thread safety.  The 1.0 API is not thread safe. It is not tested by the\n     *  OpenCL 1.1 conformance test, and consequently may not work or may not work dependably.\n     *  It is likely to be non-performant. Use of this API is not advised. 
Use at your own risk.\n     *\n     *  Software developers previously relying on this API are instructed to set the command queue\n     *  properties when creating the queue, instead.\n     */\n    extern CL_API_ENTRY cl_int CL_API_CALL\n    clSetCommandQueueProperty(cl_command_queue              command_queue,\n                              cl_command_queue_properties   properties,\n                              cl_bool                       enable,\n                              cl_command_queue_properties * old_properties) CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED;\n#endif /* CL_USE_DEPRECATED_OPENCL_1_0_APIS */\n\n/* Deprecated OpenCL 1.1 APIs */\nextern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL\nclCreateImage2D(cl_context              context,\n                cl_mem_flags            flags,\n                const cl_image_format * image_format,\n                size_t                  image_width,\n                size_t                  image_height,\n                size_t                  image_row_pitch,\n                void *                  host_ptr,\n                cl_int *                errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;\n\nextern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL\nclCreateImage3D(cl_context              context,\n                cl_mem_flags            flags,\n                const cl_image_format * image_format,\n                size_t                  image_width,\n                size_t                  image_height,\n                size_t                  image_depth,\n                size_t                  image_row_pitch,\n                size_t                  image_slice_pitch,\n                void *                  host_ptr,\n                cl_int *                errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;\n\nextern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL\nclEnqueueMarker(cl_command_queue    command_queue,\n            
    cl_event *          event) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;\n\nextern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL\nclEnqueueWaitForEvents(cl_command_queue  command_queue,\n                        cl_uint          num_events,\n                        const cl_event * event_list) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;\n\nextern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL\nclEnqueueBarrier(cl_command_queue command_queue) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;\n\nextern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL\nclUnloadCompiler(void) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;\n\nextern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED void * CL_API_CALL\nclGetExtensionFunctionAddress(const char * func_name) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;\n\n/* Deprecated OpenCL 2.0 APIs */\nextern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_2_DEPRECATED cl_command_queue CL_API_CALL\nclCreateCommandQueue(cl_context                     context,\n                     cl_device_id                   device,\n                     cl_command_queue_properties    properties,\n                     cl_int *                       errcode_ret) CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED;\n\nextern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_2_DEPRECATED cl_sampler CL_API_CALL\nclCreateSampler(cl_context          context,\n                cl_bool             normalized_coords,\n                cl_addressing_mode  addressing_mode,\n                cl_filter_mode      filter_mode,\n                cl_int *            errcode_ret) CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED;\n\nextern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_2_DEPRECATED cl_int CL_API_CALL\nclEnqueueTask(cl_command_queue  command_queue,\n              cl_kernel         kernel,\n              cl_uint           num_events_in_wait_list,\n              const cl_event *  event_wait_list,\n              cl_event *        event) 
CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED;\n\n#ifdef __cplusplus\n}\n#endif\n\n#endif  /* __OPENCL_CL_H */\n"
  },
  {
    "path": "GpuMemLatency/OpenCL/include/CL/cl_d3d10.h",
    "content": "/*******************************************************************************\n * Copyright (c) 2008-2020 The Khronos Group Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *    http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n ******************************************************************************/\n\n#ifndef __OPENCL_CL_D3D10_H\n#define __OPENCL_CL_D3D10_H\n\n#include <d3d10.h>\n#include <CL/cl.h>\n#include <CL/cl_platform.h>\n\n#ifdef __cplusplus\nextern \"C\" {\n#endif\n\n/******************************************************************************\n * cl_khr_d3d10_sharing                                                       */\n#define cl_khr_d3d10_sharing 1\n\ntypedef cl_uint cl_d3d10_device_source_khr;\ntypedef cl_uint cl_d3d10_device_set_khr;\n\n/******************************************************************************/\n\n/* Error Codes */\n#define CL_INVALID_D3D10_DEVICE_KHR                  -1002\n#define CL_INVALID_D3D10_RESOURCE_KHR                -1003\n#define CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR       -1004\n#define CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR           -1005\n\n/* cl_d3d10_device_source_nv */\n#define CL_D3D10_DEVICE_KHR                          0x4010\n#define CL_D3D10_DXGI_ADAPTER_KHR                    0x4011\n\n/* cl_d3d10_device_set_nv */\n#define CL_PREFERRED_DEVICES_FOR_D3D10_KHR           0x4012\n#define CL_ALL_DEVICES_FOR_D3D10_KHR                 0x4013\n\n/* cl_context_info */\n#define CL_CONTEXT_D3D10_DEVICE_KHR                 
 0x4014\n#define CL_CONTEXT_D3D10_PREFER_SHARED_RESOURCES_KHR 0x402C\n\n/* cl_mem_info */\n#define CL_MEM_D3D10_RESOURCE_KHR                    0x4015\n\n/* cl_image_info */\n#define CL_IMAGE_D3D10_SUBRESOURCE_KHR               0x4016\n\n/* cl_command_type */\n#define CL_COMMAND_ACQUIRE_D3D10_OBJECTS_KHR         0x4017\n#define CL_COMMAND_RELEASE_D3D10_OBJECTS_KHR         0x4018\n\n/******************************************************************************/\n\ntypedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D10KHR_fn)(\n    cl_platform_id             platform,\n    cl_d3d10_device_source_khr d3d_device_source,\n    void *                     d3d_object,\n    cl_d3d10_device_set_khr    d3d_device_set,\n    cl_uint                    num_entries,\n    cl_device_id *             devices,\n    cl_uint *                  num_devices) CL_API_SUFFIX__VERSION_1_0;\n\ntypedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10BufferKHR_fn)(\n    cl_context     context,\n    cl_mem_flags   flags,\n    ID3D10Buffer * resource,\n    cl_int *       errcode_ret) CL_API_SUFFIX__VERSION_1_0;\n\ntypedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture2DKHR_fn)(\n    cl_context        context,\n    cl_mem_flags      flags,\n    ID3D10Texture2D * resource,\n    UINT              subresource,\n    cl_int *          errcode_ret) CL_API_SUFFIX__VERSION_1_0;\n\ntypedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture3DKHR_fn)(\n    cl_context        context,\n    cl_mem_flags      flags,\n    ID3D10Texture3D * resource,\n    UINT              subresource,\n    cl_int *          errcode_ret) CL_API_SUFFIX__VERSION_1_0;\n\ntypedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D10ObjectsKHR_fn)(\n    cl_command_queue command_queue,\n    cl_uint          num_objects,\n    const cl_mem *   mem_objects,\n    cl_uint          num_events_in_wait_list,\n    const cl_event * event_wait_list,\n    cl_event *       event) 
CL_API_SUFFIX__VERSION_1_0;\n\ntypedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D10ObjectsKHR_fn)(\n    cl_command_queue command_queue,\n    cl_uint          num_objects,\n    const cl_mem *   mem_objects,\n    cl_uint          num_events_in_wait_list,\n    const cl_event * event_wait_list,\n    cl_event *       event) CL_API_SUFFIX__VERSION_1_0;\n\n#ifdef __cplusplus\n}\n#endif\n\n#endif  /* __OPENCL_CL_D3D10_H */\n"
  },
  {
    "path": "GpuMemLatency/OpenCL/include/CL/cl_d3d11.h",
    "content": "/*******************************************************************************\n * Copyright (c) 2008-2020 The Khronos Group Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *    http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n ******************************************************************************/\n\n#ifndef __OPENCL_CL_D3D11_H\n#define __OPENCL_CL_D3D11_H\n\n#include <d3d11.h>\n#include <CL/cl.h>\n#include <CL/cl_platform.h>\n\n#ifdef __cplusplus\nextern \"C\" {\n#endif\n\n/******************************************************************************\n * cl_khr_d3d11_sharing                                                       */\n#define cl_khr_d3d11_sharing 1\n\ntypedef cl_uint cl_d3d11_device_source_khr;\ntypedef cl_uint cl_d3d11_device_set_khr;\n\n/******************************************************************************/\n\n/* Error Codes */\n#define CL_INVALID_D3D11_DEVICE_KHR                  -1006\n#define CL_INVALID_D3D11_RESOURCE_KHR                -1007\n#define CL_D3D11_RESOURCE_ALREADY_ACQUIRED_KHR       -1008\n#define CL_D3D11_RESOURCE_NOT_ACQUIRED_KHR           -1009\n\n/* cl_d3d11_device_source */\n#define CL_D3D11_DEVICE_KHR                          0x4019\n#define CL_D3D11_DXGI_ADAPTER_KHR                    0x401A\n\n/* cl_d3d11_device_set */\n#define CL_PREFERRED_DEVICES_FOR_D3D11_KHR           0x401B\n#define CL_ALL_DEVICES_FOR_D3D11_KHR                 0x401C\n\n/* cl_context_info */\n#define CL_CONTEXT_D3D11_DEVICE_KHR                  
0x401D\n#define CL_CONTEXT_D3D11_PREFER_SHARED_RESOURCES_KHR 0x402D\n\n/* cl_mem_info */\n#define CL_MEM_D3D11_RESOURCE_KHR                    0x401E\n\n/* cl_image_info */\n#define CL_IMAGE_D3D11_SUBRESOURCE_KHR               0x401F\n\n/* cl_command_type */\n#define CL_COMMAND_ACQUIRE_D3D11_OBJECTS_KHR         0x4020\n#define CL_COMMAND_RELEASE_D3D11_OBJECTS_KHR         0x4021\n\n/******************************************************************************/\n\ntypedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D11KHR_fn)(\n    cl_platform_id             platform,\n    cl_d3d11_device_source_khr d3d_device_source,\n    void *                     d3d_object,\n    cl_d3d11_device_set_khr    d3d_device_set,\n    cl_uint                    num_entries,\n    cl_device_id *             devices,\n    cl_uint *                  num_devices) CL_API_SUFFIX__VERSION_1_2;\n\ntypedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11BufferKHR_fn)(\n    cl_context     context,\n    cl_mem_flags   flags,\n    ID3D11Buffer * resource,\n    cl_int *       errcode_ret) CL_API_SUFFIX__VERSION_1_2;\n\ntypedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture2DKHR_fn)(\n    cl_context        context,\n    cl_mem_flags      flags,\n    ID3D11Texture2D * resource,\n    UINT              subresource,\n    cl_int *          errcode_ret) CL_API_SUFFIX__VERSION_1_2;\n\ntypedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture3DKHR_fn)(\n    cl_context        context,\n    cl_mem_flags      flags,\n    ID3D11Texture3D * resource,\n    UINT              subresource,\n    cl_int *          errcode_ret) CL_API_SUFFIX__VERSION_1_2;\n\ntypedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D11ObjectsKHR_fn)(\n    cl_command_queue command_queue,\n    cl_uint          num_objects,\n    const cl_mem *   mem_objects,\n    cl_uint          num_events_in_wait_list,\n    const cl_event * event_wait_list,\n    cl_event *       event) 
CL_API_SUFFIX__VERSION_1_2;\n\ntypedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D11ObjectsKHR_fn)(\n    cl_command_queue command_queue,\n    cl_uint          num_objects,\n    const cl_mem *   mem_objects,\n    cl_uint          num_events_in_wait_list,\n    const cl_event * event_wait_list,\n    cl_event *       event) CL_API_SUFFIX__VERSION_1_2;\n\n#ifdef __cplusplus\n}\n#endif\n\n#endif  /* __OPENCL_CL_D3D11_H */\n"
  },
  {
    "path": "GpuMemLatency/OpenCL/include/CL/cl_dx9_media_sharing.h",
    "content": "/*******************************************************************************\n * Copyright (c) 2008-2020 The Khronos Group Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *    http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n ******************************************************************************/\n\n#ifndef __OPENCL_CL_DX9_MEDIA_SHARING_H\n#define __OPENCL_CL_DX9_MEDIA_SHARING_H\n\n#include <CL/cl.h>\n#include <CL/cl_platform.h>\n\n#ifdef __cplusplus\nextern \"C\" {\n#endif\n\n/******************************************************************************/\n/* cl_khr_dx9_media_sharing                                                   */\n#define cl_khr_dx9_media_sharing 1\n\ntypedef cl_uint             cl_dx9_media_adapter_type_khr;\ntypedef cl_uint             cl_dx9_media_adapter_set_khr;\n\n#if defined(_WIN32)\n#include <d3d9.h>\ntypedef struct _cl_dx9_surface_info_khr\n{\n    IDirect3DSurface9 *resource;\n    HANDLE shared_handle;\n} cl_dx9_surface_info_khr;\n#endif\n\n\n/******************************************************************************/\n\n/* Error Codes */\n#define CL_INVALID_DX9_MEDIA_ADAPTER_KHR                -1010\n#define CL_INVALID_DX9_MEDIA_SURFACE_KHR                -1011\n#define CL_DX9_MEDIA_SURFACE_ALREADY_ACQUIRED_KHR       -1012\n#define CL_DX9_MEDIA_SURFACE_NOT_ACQUIRED_KHR           -1013\n\n/* cl_media_adapter_type_khr */\n#define CL_ADAPTER_D3D9_KHR                              0x2020\n#define CL_ADAPTER_D3D9EX_KHR                     
       0x2021\n#define CL_ADAPTER_DXVA_KHR                              0x2022\n\n/* cl_media_adapter_set_khr */\n#define CL_PREFERRED_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR   0x2023\n#define CL_ALL_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR         0x2024\n\n/* cl_context_info */\n#define CL_CONTEXT_ADAPTER_D3D9_KHR                      0x2025\n#define CL_CONTEXT_ADAPTER_D3D9EX_KHR                    0x2026\n#define CL_CONTEXT_ADAPTER_DXVA_KHR                      0x2027\n\n/* cl_mem_info */\n#define CL_MEM_DX9_MEDIA_ADAPTER_TYPE_KHR                0x2028\n#define CL_MEM_DX9_MEDIA_SURFACE_INFO_KHR                0x2029\n\n/* cl_image_info */\n#define CL_IMAGE_DX9_MEDIA_PLANE_KHR                     0x202A\n\n/* cl_command_type */\n#define CL_COMMAND_ACQUIRE_DX9_MEDIA_SURFACES_KHR        0x202B\n#define CL_COMMAND_RELEASE_DX9_MEDIA_SURFACES_KHR        0x202C\n\n/******************************************************************************/\n\ntypedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromDX9MediaAdapterKHR_fn)(\n    cl_platform_id                   platform,\n    cl_uint                          num_media_adapters,\n    cl_dx9_media_adapter_type_khr *  media_adapter_type,\n    void *                           media_adapters,\n    cl_dx9_media_adapter_set_khr     media_adapter_set,\n    cl_uint                          num_entries,\n    cl_device_id *                   devices,\n    cl_uint *                        num_devices) CL_API_SUFFIX__VERSION_1_2;\n\ntypedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromDX9MediaSurfaceKHR_fn)(\n    cl_context                    context,\n    cl_mem_flags                  flags,\n    cl_dx9_media_adapter_type_khr adapter_type,\n    void *                        surface_info,\n    cl_uint                       plane,\n    cl_int *                      errcode_ret) CL_API_SUFFIX__VERSION_1_2;\n\ntypedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireDX9MediaSurfacesKHR_fn)(\n    cl_command_queue command_queue,\n    cl_uint  
        num_objects,\n    const cl_mem *   mem_objects,\n    cl_uint          num_events_in_wait_list,\n    const cl_event * event_wait_list,\n    cl_event *       event) CL_API_SUFFIX__VERSION_1_2;\n\ntypedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseDX9MediaSurfacesKHR_fn)(\n    cl_command_queue command_queue,\n    cl_uint          num_objects,\n    const cl_mem *   mem_objects,\n    cl_uint          num_events_in_wait_list,\n    const cl_event * event_wait_list,\n    cl_event *       event) CL_API_SUFFIX__VERSION_1_2;\n\n#ifdef __cplusplus\n}\n#endif\n\n#endif  /* __OPENCL_CL_DX9_MEDIA_SHARING_H */\n"
  },
  {
    "path": "GpuMemLatency/OpenCL/include/CL/cl_dx9_media_sharing_intel.h",
    "content": "/*******************************************************************************\n * Copyright (c) 2008-2020 The Khronos Group Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *    http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n ******************************************************************************/\n/*****************************************************************************\\\n\nCopyright (c) 2013-2019 Intel Corporation All Rights Reserved.\n\nTHESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\n\"AS IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\nLIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\nA PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS\nCONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\nEXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\nPROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\nPROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY\nOF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING\nNEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE\nMATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\nFile Name: cl_dx9_media_sharing_intel.h\n\nAbstract:\n\nNotes:\n\n\\*****************************************************************************/\n\n#ifndef __OPENCL_CL_DX9_MEDIA_SHARING_INTEL_H\n#define __OPENCL_CL_DX9_MEDIA_SHARING_INTEL_H\n\n#include <CL/cl.h>\n#include <CL/cl_platform.h>\n#include <d3d9.h>\n#include <dxvahd.h>\n#include <wtypes.h>\n#include <d3d9types.h>\n\n#ifdef __cplusplus\nextern \"C\" {\n#endif\n\n/***************************************\n* cl_intel_dx9_media_sharing extension *\n****************************************/\n\n#define cl_intel_dx9_media_sharing 1\n\ntypedef cl_uint cl_dx9_device_source_intel;\ntypedef cl_uint cl_dx9_device_set_intel;\n\n/* error codes */\n#define CL_INVALID_DX9_DEVICE_INTEL                   -1010\n#define CL_INVALID_DX9_RESOURCE_INTEL                 -1011\n#define CL_DX9_RESOURCE_ALREADY_ACQUIRED_INTEL        -1012\n#define CL_DX9_RESOURCE_NOT_ACQUIRED_INTEL            -1013\n\n/* cl_dx9_device_source_intel */\n#define CL_D3D9_DEVICE_INTEL                          0x4022\n#define CL_D3D9EX_DEVICE_INTEL                        0x4070\n#define CL_DXVA_DEVICE_INTEL                          0x4071\n\n/* cl_dx9_device_set_intel */\n#define CL_PREFERRED_DEVICES_FOR_DX9_INTEL            0x4024\n#define CL_ALL_DEVICES_FOR_DX9_INTEL                  0x4025\n\n/* cl_context_info */\n#define CL_CONTEXT_D3D9_DEVICE_INTEL                  0x4026\n#define CL_CONTEXT_D3D9EX_DEVICE_INTEL                0x4072\n#define 
CL_CONTEXT_DXVA_DEVICE_INTEL                  0x4073\n\n/* cl_mem_info */\n#define CL_MEM_DX9_RESOURCE_INTEL                     0x4027\n#define CL_MEM_DX9_SHARED_HANDLE_INTEL                0x4074\n\n/* cl_image_info */\n#define CL_IMAGE_DX9_PLANE_INTEL                      0x4075\n\n/* cl_command_type */\n#define CL_COMMAND_ACQUIRE_DX9_OBJECTS_INTEL          0x402A\n#define CL_COMMAND_RELEASE_DX9_OBJECTS_INTEL          0x402B\n/******************************************************************************/\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclGetDeviceIDsFromDX9INTEL(\n    cl_platform_id              platform,\n    cl_dx9_device_source_intel  dx9_device_source,\n    void*                       dx9_object,\n    cl_dx9_device_set_intel     dx9_device_set,\n    cl_uint                     num_entries,\n    cl_device_id*               devices,\n    cl_uint*                    num_devices) CL_EXT_SUFFIX__VERSION_1_1;\n\ntypedef CL_API_ENTRY cl_int (CL_API_CALL* clGetDeviceIDsFromDX9INTEL_fn)(\n    cl_platform_id              platform,\n    cl_dx9_device_source_intel  dx9_device_source,\n    void*                       dx9_object,\n    cl_dx9_device_set_intel     dx9_device_set,\n    cl_uint                     num_entries,\n    cl_device_id*               devices,\n    cl_uint*                    num_devices) CL_EXT_SUFFIX__VERSION_1_1;\n\nextern CL_API_ENTRY cl_mem CL_API_CALL\nclCreateFromDX9MediaSurfaceINTEL(\n    cl_context                  context,\n    cl_mem_flags                flags,\n    IDirect3DSurface9*          resource,\n    HANDLE                      sharedHandle,\n    UINT                        plane,\n    cl_int*                     errcode_ret) CL_EXT_SUFFIX__VERSION_1_1;\n\ntypedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromDX9MediaSurfaceINTEL_fn)(\n    cl_context                  context,\n    cl_mem_flags                flags,\n    IDirect3DSurface9*          resource,\n    HANDLE                      sharedHandle,\n    UINT         
               plane,\n    cl_int*                     errcode_ret) CL_EXT_SUFFIX__VERSION_1_1;\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclEnqueueAcquireDX9ObjectsINTEL(\n    cl_command_queue            command_queue,\n    cl_uint                     num_objects,\n    const cl_mem*               mem_objects,\n    cl_uint                     num_events_in_wait_list,\n    const cl_event*             event_wait_list,\n    cl_event*                   event) CL_EXT_SUFFIX__VERSION_1_1;\n\ntypedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireDX9ObjectsINTEL_fn)(\n    cl_command_queue            command_queue,\n    cl_uint                     num_objects,\n    const cl_mem*               mem_objects,\n    cl_uint                     num_events_in_wait_list,\n    const cl_event*             event_wait_list,\n    cl_event*                   event) CL_EXT_SUFFIX__VERSION_1_1;\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclEnqueueReleaseDX9ObjectsINTEL(\n    cl_command_queue            command_queue,\n    cl_uint                     num_objects,\n    cl_mem*                     mem_objects,\n    cl_uint                     num_events_in_wait_list,\n    const cl_event*             event_wait_list,\n    cl_event*                   event) CL_EXT_SUFFIX__VERSION_1_1;\n\ntypedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseDX9ObjectsINTEL_fn)(\n    cl_command_queue            command_queue,\n    cl_uint                     num_objects,\n    cl_mem*                     mem_objects,\n    cl_uint                     num_events_in_wait_list,\n    const cl_event*             event_wait_list,\n    cl_event*                   event) CL_EXT_SUFFIX__VERSION_1_1;\n\n#ifdef __cplusplus\n}\n#endif\n\n#endif  /* __OPENCL_CL_DX9_MEDIA_SHARING_INTEL_H */\n"
  },
  {
    "path": "GpuMemLatency/OpenCL/include/CL/cl_egl.h",
    "content": "/*******************************************************************************\n * Copyright (c) 2008-2020 The Khronos Group Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *    http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n ******************************************************************************/\n\n#ifndef __OPENCL_CL_EGL_H\n#define __OPENCL_CL_EGL_H\n\n#include <CL/cl.h>\n\n#ifdef __cplusplus\nextern \"C\" {\n#endif\n\n\n/* Command type for events created with clEnqueueAcquireEGLObjectsKHR */\n#define CL_COMMAND_EGL_FENCE_SYNC_OBJECT_KHR  0x202F\n#define CL_COMMAND_ACQUIRE_EGL_OBJECTS_KHR    0x202D\n#define CL_COMMAND_RELEASE_EGL_OBJECTS_KHR    0x202E\n\n/* Error type for clCreateFromEGLImageKHR */\n#define CL_INVALID_EGL_OBJECT_KHR             -1093\n#define CL_EGL_RESOURCE_NOT_ACQUIRED_KHR      -1092\n\n/* CLeglImageKHR is an opaque handle to an EGLImage */\ntypedef void* CLeglImageKHR;\n\n/* CLeglDisplayKHR is an opaque handle to an EGLDisplay */\ntypedef void* CLeglDisplayKHR;\n\n/* CLeglSyncKHR is an opaque handle to an EGLSync object */\ntypedef void* CLeglSyncKHR;\n\n/* properties passed to clCreateFromEGLImageKHR */\ntypedef intptr_t cl_egl_image_properties_khr;\n\n\n#define cl_khr_egl_image 1\n\nextern CL_API_ENTRY cl_mem CL_API_CALL\nclCreateFromEGLImageKHR(cl_context                  context,\n                        CLeglDisplayKHR             egldisplay,\n                        CLeglImageKHR               eglimage,\n                        cl_mem_flags   
             flags,\n                        const cl_egl_image_properties_khr * properties,\n                        cl_int *                    errcode_ret) CL_API_SUFFIX__VERSION_1_0;\n\ntypedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromEGLImageKHR_fn)(\n    cl_context                  context,\n    CLeglDisplayKHR             egldisplay,\n    CLeglImageKHR               eglimage,\n    cl_mem_flags                flags,\n    const cl_egl_image_properties_khr * properties,\n    cl_int *                    errcode_ret);\n\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclEnqueueAcquireEGLObjectsKHR(cl_command_queue command_queue,\n                              cl_uint          num_objects,\n                              const cl_mem *   mem_objects,\n                              cl_uint          num_events_in_wait_list,\n                              const cl_event * event_wait_list,\n                              cl_event *       event) CL_API_SUFFIX__VERSION_1_0;\n\ntypedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireEGLObjectsKHR_fn)(\n    cl_command_queue command_queue,\n    cl_uint          num_objects,\n    const cl_mem *   mem_objects,\n    cl_uint          num_events_in_wait_list,\n    const cl_event * event_wait_list,\n    cl_event *       event);\n\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclEnqueueReleaseEGLObjectsKHR(cl_command_queue command_queue,\n                              cl_uint          num_objects,\n                              const cl_mem *   mem_objects,\n                              cl_uint          num_events_in_wait_list,\n                              const cl_event * event_wait_list,\n                              cl_event *       event) CL_API_SUFFIX__VERSION_1_0;\n\ntypedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseEGLObjectsKHR_fn)(\n    cl_command_queue command_queue,\n    cl_uint          num_objects,\n    const cl_mem *   mem_objects,\n    cl_uint          num_events_in_wait_list,\n    const cl_event * 
event_wait_list,\n    cl_event *       event);\n\n\n#define cl_khr_egl_event 1\n\nextern CL_API_ENTRY cl_event CL_API_CALL\nclCreateEventFromEGLSyncKHR(cl_context      context,\n                            CLeglSyncKHR    sync,\n                            CLeglDisplayKHR display,\n                            cl_int *        errcode_ret) CL_API_SUFFIX__VERSION_1_0;\n\ntypedef CL_API_ENTRY cl_event (CL_API_CALL *clCreateEventFromEGLSyncKHR_fn)(\n    cl_context      context,\n    CLeglSyncKHR    sync,\n    CLeglDisplayKHR display,\n    cl_int *        errcode_ret);\n\n#ifdef __cplusplus\n}\n#endif\n\n#endif /* __OPENCL_CL_EGL_H */\n"
  },
  {
    "path": "GpuMemLatency/OpenCL/include/CL/cl_ext.h",
    "content": "/*******************************************************************************\n * Copyright (c) 2008-2020 The Khronos Group Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *    http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n ******************************************************************************/\n\n/* cl_ext.h contains OpenCL extensions which don't have external */\n/* (OpenGL, D3D) dependencies.                                   */\n\n#ifndef __CL_EXT_H\n#define __CL_EXT_H\n\n#ifdef __cplusplus\nextern \"C\" {\n#endif\n\n#include <CL/cl.h>\n\n/* cl_khr_fp64 extension - no extension #define since it has no functions  */\n/* CL_DEVICE_DOUBLE_FP_CONFIG is defined in CL.h for OpenCL >= 120 */\n\n#if CL_TARGET_OPENCL_VERSION <= 110\n#define CL_DEVICE_DOUBLE_FP_CONFIG                       0x1032\n#endif\n\n/* cl_khr_fp16 extension - no extension #define since it has no functions  */\n#define CL_DEVICE_HALF_FP_CONFIG                    0x1033\n\n/* Memory object destruction\n *\n * Apple extension for use to manage externally allocated buffers used with cl_mem objects with CL_MEM_USE_HOST_PTR\n *\n * Registers a user callback function that will be called when the memory object is deleted and its resources\n * freed. Each call to clSetMemObjectCallbackFn registers the specified user callback function on a callback\n * stack associated with memobj. The registered user callback functions are called in the reverse order in\n * which they were registered. 
The user callback functions are called and then the memory object is deleted\n * and its resources freed. This provides a mechanism for the application (and libraries) using memobj to be\n * notified when the memory referenced by host_ptr, specified when the memory object is created and used as\n * the storage bits for the memory object, can be reused or freed.\n *\n * The application may not call CL api's with the cl_mem object passed to the pfn_notify.\n *\n * Please check for the \"cl_APPLE_SetMemObjectDestructor\" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS)\n * before using.\n */\n#define cl_APPLE_SetMemObjectDestructor 1\ncl_int  CL_API_ENTRY clSetMemObjectDestructorAPPLE(  cl_mem memobj,\n                                        void (* pfn_notify)(cl_mem memobj, void * user_data),\n                                        void * user_data)             CL_EXT_SUFFIX__VERSION_1_0;\n\n\n/* Context Logging Functions\n *\n * The next three convenience functions are intended to be used as the pfn_notify parameter to clCreateContext().\n * Please check for the \"cl_APPLE_ContextLoggingFunctions\" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS)\n * before using.\n *\n * clLogMessagesToSystemLog forwards on all log messages to the Apple System Logger\n */\n#define cl_APPLE_ContextLoggingFunctions 1\nextern void CL_API_ENTRY clLogMessagesToSystemLogAPPLE(  const char * errstr,\n                                            const void * private_info,\n                                            size_t       cb,\n                                            void *       user_data)  CL_EXT_SUFFIX__VERSION_1_0;\n\n/* clLogMessagesToStdout sends all log messages to the file descriptor stdout */\nextern void CL_API_ENTRY clLogMessagesToStdoutAPPLE(   const char * errstr,\n                                          const void * private_info,\n                                          size_t       cb,\n                                          void *       
user_data)    CL_EXT_SUFFIX__VERSION_1_0;\n\n/* clLogMessagesToStderr sends all log messages to the file descriptor stderr */\nextern void CL_API_ENTRY clLogMessagesToStderrAPPLE(   const char * errstr,\n                                          const void * private_info,\n                                          size_t       cb,\n                                          void *       user_data)    CL_EXT_SUFFIX__VERSION_1_0;\n\n\n/************************\n* cl_khr_icd extension *\n************************/\n#define cl_khr_icd 1\n\n/* cl_platform_info                                                        */\n#define CL_PLATFORM_ICD_SUFFIX_KHR                  0x0920\n\n/* Additional Error Codes                                                  */\n#define CL_PLATFORM_NOT_FOUND_KHR                   -1001\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclIcdGetPlatformIDsKHR(cl_uint          num_entries,\n                       cl_platform_id * platforms,\n                       cl_uint *        num_platforms);\n\ntypedef CL_API_ENTRY cl_int\n(CL_API_CALL *clIcdGetPlatformIDsKHR_fn)(cl_uint          num_entries,\n                                         cl_platform_id * platforms,\n                                         cl_uint *        num_platforms);\n\n\n/*******************************\n * cl_khr_il_program extension *\n *******************************/\n#define cl_khr_il_program 1\n\n/* New property to clGetDeviceInfo for retrieving supported intermediate\n * languages\n */\n#define CL_DEVICE_IL_VERSION_KHR                    0x105B\n\n/* New property to clGetProgramInfo for retrieving for retrieving the IL of a\n * program\n */\n#define CL_PROGRAM_IL_KHR                           0x1169\n\nextern CL_API_ENTRY cl_program CL_API_CALL\nclCreateProgramWithILKHR(cl_context   context,\n                         const void * il,\n                         size_t       length,\n                         cl_int *     errcode_ret);\n\ntypedef CL_API_ENTRY 
cl_program\n(CL_API_CALL *clCreateProgramWithILKHR_fn)(cl_context   context,\n                                           const void * il,\n                                           size_t       length,\n                                           cl_int *     errcode_ret) CL_EXT_SUFFIX__VERSION_1_2;\n\n/* Extension: cl_khr_image2d_from_buffer\n *\n * This extension allows a 2D image to be created from a cl_mem buffer without\n * a copy. The type associated with a 2D image created from a buffer in an\n * OpenCL program is image2d_t. Both the sampler and sampler-less read_image\n * built-in functions are supported for 2D images and 2D images created from\n * a buffer.  Similarly, the write_image built-ins are also supported for 2D\n * images created from a buffer.\n *\n * When the 2D image from buffer is created, the client must specify the\n * width, height, image format (i.e. channel order and channel data type)\n * and optionally the row pitch.\n *\n * The pitch specified must be a multiple of\n * CL_DEVICE_IMAGE_PITCH_ALIGNMENT_KHR pixels.\n * The base address of the buffer must be aligned to\n * CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT_KHR pixels.\n */\n\n#define CL_DEVICE_IMAGE_PITCH_ALIGNMENT_KHR              0x104A\n#define CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT_KHR       0x104B\n\n\n/**************************************\n * cl_khr_initialize_memory extension *\n **************************************/\n\n#define CL_CONTEXT_MEMORY_INITIALIZE_KHR            0x2030\n\n\n/**************************************\n * cl_khr_terminate_context extension *\n **************************************/\n\n#define CL_CONTEXT_TERMINATED_KHR                   -1121\n\n#define CL_DEVICE_TERMINATE_CAPABILITY_KHR          0x2031\n#define CL_CONTEXT_TERMINATE_KHR                    0x2032\n\n#define cl_khr_terminate_context 1\nextern CL_API_ENTRY cl_int CL_API_CALL\nclTerminateContextKHR(cl_context context) CL_EXT_SUFFIX__VERSION_1_2;\n\ntypedef CL_API_ENTRY cl_int\n(CL_API_CALL 
*clTerminateContextKHR_fn)(cl_context context) CL_EXT_SUFFIX__VERSION_1_2;\n\n\n/*\n * Extension: cl_khr_spir\n *\n * This extension adds support to create an OpenCL program object from a\n * Standard Portable Intermediate Representation (SPIR) instance\n */\n\n#define CL_DEVICE_SPIR_VERSIONS                     0x40E0\n#define CL_PROGRAM_BINARY_TYPE_INTERMEDIATE         0x40E1\n\n\n/*****************************************\n * cl_khr_create_command_queue extension *\n *****************************************/\n#define cl_khr_create_command_queue 1\n\ntypedef cl_bitfield cl_queue_properties_khr;\n\nextern CL_API_ENTRY cl_command_queue CL_API_CALL\nclCreateCommandQueueWithPropertiesKHR(cl_context context,\n                                      cl_device_id device,\n                                      const cl_queue_properties_khr* properties,\n                                      cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_2;\n\ntypedef CL_API_ENTRY cl_command_queue\n(CL_API_CALL *clCreateCommandQueueWithPropertiesKHR_fn)(cl_context context,\n                                                        cl_device_id device,\n                                                        const cl_queue_properties_khr* properties,\n                                                        cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_2;\n\n\n/******************************************\n* cl_nv_device_attribute_query extension *\n******************************************/\n\n/* cl_nv_device_attribute_query extension - no extension #define since it has no functions */\n#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV       0x4000\n#define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV       0x4001\n#define CL_DEVICE_REGISTERS_PER_BLOCK_NV            0x4002\n#define CL_DEVICE_WARP_SIZE_NV                      0x4003\n#define CL_DEVICE_GPU_OVERLAP_NV                    0x4004\n#define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV            0x4005\n#define CL_DEVICE_INTEGRATED_MEMORY_NV              
0x4006\n\n\n/*********************************\n* cl_amd_device_attribute_query *\n*********************************/\n\n#define CL_DEVICE_PROFILING_TIMER_OFFSET_AMD            0x4036\n#define CL_DEVICE_TOPOLOGY_AMD                          0x4037\n#define CL_DEVICE_BOARD_NAME_AMD                        0x4038\n#define CL_DEVICE_GLOBAL_FREE_MEMORY_AMD                0x4039\n#define CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD             0x4040\n#define CL_DEVICE_SIMD_WIDTH_AMD                        0x4041\n#define CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD            0x4042\n#define CL_DEVICE_WAVEFRONT_WIDTH_AMD                   0x4043\n#define CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD               0x4044\n#define CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD          0x4045\n#define CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD     0x4046\n#define CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD   0x4047\n#define CL_DEVICE_LOCAL_MEM_BANKS_AMD                   0x4048\n#define CL_DEVICE_THREAD_TRACE_SUPPORTED_AMD            0x4049\n#define CL_DEVICE_GFXIP_MAJOR_AMD                       0x404A\n#define CL_DEVICE_GFXIP_MINOR_AMD                       0x404B\n#define CL_DEVICE_AVAILABLE_ASYNC_QUEUES_AMD            0x404C\n#define CL_DEVICE_PREFERRED_WORK_GROUP_SIZE_AMD         0x4030\n#define CL_DEVICE_MAX_WORK_GROUP_SIZE_AMD               0x4031\n#define CL_DEVICE_PREFERRED_CONSTANT_BUFFER_SIZE_AMD    0x4033\n#define CL_DEVICE_PCIE_ID_AMD                           0x4034\n\n\n/*********************************\n* cl_arm_printf extension\n*********************************/\n\n#define CL_PRINTF_CALLBACK_ARM                      0x40B0\n#define CL_PRINTF_BUFFERSIZE_ARM                    0x40B1\n\n\n/***********************************\n* cl_ext_device_fission extension\n***********************************/\n#define cl_ext_device_fission   1\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclReleaseDeviceEXT(cl_device_id device) CL_EXT_SUFFIX__VERSION_1_1;\n\ntypedef CL_API_ENTRY cl_int\n(CL_API_CALL 
*clReleaseDeviceEXT_fn)(cl_device_id device) CL_EXT_SUFFIX__VERSION_1_1;\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclRetainDeviceEXT(cl_device_id device) CL_EXT_SUFFIX__VERSION_1_1;\n\ntypedef CL_API_ENTRY cl_int\n(CL_API_CALL *clRetainDeviceEXT_fn)(cl_device_id device) CL_EXT_SUFFIX__VERSION_1_1;\n\ntypedef cl_ulong  cl_device_partition_property_ext;\nextern CL_API_ENTRY cl_int CL_API_CALL\nclCreateSubDevicesEXT(cl_device_id   in_device,\n                      const cl_device_partition_property_ext * properties,\n                      cl_uint        num_entries,\n                      cl_device_id * out_devices,\n                      cl_uint *      num_devices) CL_EXT_SUFFIX__VERSION_1_1;\n\ntypedef CL_API_ENTRY cl_int\n(CL_API_CALL * clCreateSubDevicesEXT_fn)(cl_device_id   in_device,\n                                         const cl_device_partition_property_ext * properties,\n                                         cl_uint        num_entries,\n                                         cl_device_id * out_devices,\n                                         cl_uint *      num_devices) CL_EXT_SUFFIX__VERSION_1_1;\n\n/* cl_device_partition_property_ext */\n#define CL_DEVICE_PARTITION_EQUALLY_EXT             0x4050\n#define CL_DEVICE_PARTITION_BY_COUNTS_EXT           0x4051\n#define CL_DEVICE_PARTITION_BY_NAMES_EXT            0x4052\n#define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN_EXT  0x4053\n\n/* clDeviceGetInfo selectors */\n#define CL_DEVICE_PARENT_DEVICE_EXT                 0x4054\n#define CL_DEVICE_PARTITION_TYPES_EXT               0x4055\n#define CL_DEVICE_AFFINITY_DOMAINS_EXT              0x4056\n#define CL_DEVICE_REFERENCE_COUNT_EXT               0x4057\n#define CL_DEVICE_PARTITION_STYLE_EXT               0x4058\n\n/* error codes */\n#define CL_DEVICE_PARTITION_FAILED_EXT              -1057\n#define CL_INVALID_PARTITION_COUNT_EXT              -1058\n#define CL_INVALID_PARTITION_NAME_EXT               -1059\n\n/* CL_AFFINITY_DOMAINs */\n#define 
CL_AFFINITY_DOMAIN_L1_CACHE_EXT             0x1\n#define CL_AFFINITY_DOMAIN_L2_CACHE_EXT             0x2\n#define CL_AFFINITY_DOMAIN_L3_CACHE_EXT             0x3\n#define CL_AFFINITY_DOMAIN_L4_CACHE_EXT             0x4\n#define CL_AFFINITY_DOMAIN_NUMA_EXT                 0x10\n#define CL_AFFINITY_DOMAIN_NEXT_FISSIONABLE_EXT     0x100\n\n/* cl_device_partition_property_ext list terminators */\n#define CL_PROPERTIES_LIST_END_EXT                  ((cl_device_partition_property_ext) 0)\n#define CL_PARTITION_BY_COUNTS_LIST_END_EXT         ((cl_device_partition_property_ext) 0)\n#define CL_PARTITION_BY_NAMES_LIST_END_EXT          ((cl_device_partition_property_ext) 0 - 1)\n\n\n/***********************************\n * cl_ext_migrate_memobject extension definitions\n ***********************************/\n#define cl_ext_migrate_memobject 1\n\ntypedef cl_bitfield cl_mem_migration_flags_ext;\n\n#define CL_MIGRATE_MEM_OBJECT_HOST_EXT              0x1\n\n#define CL_COMMAND_MIGRATE_MEM_OBJECT_EXT           0x4040\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclEnqueueMigrateMemObjectEXT(cl_command_queue command_queue,\n                             cl_uint          num_mem_objects,\n                             const cl_mem *   mem_objects,\n                             cl_mem_migration_flags_ext flags,\n                             cl_uint          num_events_in_wait_list,\n                             const cl_event * event_wait_list,\n                             cl_event *       event);\n\ntypedef CL_API_ENTRY cl_int\n(CL_API_CALL *clEnqueueMigrateMemObjectEXT_fn)(cl_command_queue command_queue,\n                                               cl_uint          num_mem_objects,\n                                               const cl_mem *   mem_objects,\n                                               cl_mem_migration_flags_ext flags,\n                                               cl_uint          num_events_in_wait_list,\n                                               const 
cl_event * event_wait_list,\n                                               cl_event *       event);\n\n\n/*********************************\n* cl_qcom_ext_host_ptr extension\n*********************************/\n#define cl_qcom_ext_host_ptr 1\n\n#define CL_MEM_EXT_HOST_PTR_QCOM                  (1 << 29)\n\n#define CL_DEVICE_EXT_MEM_PADDING_IN_BYTES_QCOM   0x40A0\n#define CL_DEVICE_PAGE_SIZE_QCOM                  0x40A1\n#define CL_IMAGE_ROW_ALIGNMENT_QCOM               0x40A2\n#define CL_IMAGE_SLICE_ALIGNMENT_QCOM             0x40A3\n#define CL_MEM_HOST_UNCACHED_QCOM                 0x40A4\n#define CL_MEM_HOST_WRITEBACK_QCOM                0x40A5\n#define CL_MEM_HOST_WRITETHROUGH_QCOM             0x40A6\n#define CL_MEM_HOST_WRITE_COMBINING_QCOM          0x40A7\n\ntypedef cl_uint                                   cl_image_pitch_info_qcom;\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclGetDeviceImageInfoQCOM(cl_device_id             device,\n                         size_t                   image_width,\n                         size_t                   image_height,\n                         const cl_image_format   *image_format,\n                         cl_image_pitch_info_qcom param_name,\n                         size_t                   param_value_size,\n                         void                    *param_value,\n                         size_t                  *param_value_size_ret);\n\ntypedef struct _cl_mem_ext_host_ptr\n{\n    /* Type of external memory allocation. */\n    /* Legal values will be defined in layered extensions. */\n    cl_uint  allocation_type;\n\n    /* Host cache policy for this external memory allocation. 
*/\n    cl_uint  host_cache_policy;\n\n} cl_mem_ext_host_ptr;\n\n\n/*******************************************\n* cl_qcom_ext_host_ptr_iocoherent extension\n********************************************/\n\n/* Cache policy specifying io-coherence */\n#define CL_MEM_HOST_IOCOHERENT_QCOM               0x40A9\n\n\n/*********************************\n* cl_qcom_ion_host_ptr extension\n*********************************/\n\n#define CL_MEM_ION_HOST_PTR_QCOM                  0x40A8\n\ntypedef struct _cl_mem_ion_host_ptr\n{\n    /* Type of external memory allocation. */\n    /* Must be CL_MEM_ION_HOST_PTR_QCOM for ION allocations. */\n    cl_mem_ext_host_ptr  ext_host_ptr;\n\n    /* ION file descriptor */\n    int                  ion_filedesc;\n\n    /* Host pointer to the ION allocated memory */\n    void*                ion_hostptr;\n\n} cl_mem_ion_host_ptr;\n\n\n/*********************************\n* cl_qcom_android_native_buffer_host_ptr extension\n*********************************/\n\n#define CL_MEM_ANDROID_NATIVE_BUFFER_HOST_PTR_QCOM                  0x40C6\n\ntypedef struct _cl_mem_android_native_buffer_host_ptr\n{\n    /* Type of external memory allocation. */\n    /* Must be CL_MEM_ANDROID_NATIVE_BUFFER_HOST_PTR_QCOM for Android native buffers. 
*/\n    cl_mem_ext_host_ptr  ext_host_ptr;\n\n    /* Virtual pointer to the android native buffer */\n    void*                anb_ptr;\n\n} cl_mem_android_native_buffer_host_ptr;\n\n\n/******************************************\n * cl_img_yuv_image extension *\n ******************************************/\n\n/* Image formats used in clCreateImage */\n#define CL_NV21_IMG                                 0x40D0\n#define CL_YV12_IMG                                 0x40D1\n\n\n/******************************************\n * cl_img_cached_allocations extension *\n ******************************************/\n\n/* Flag values used by clCreateBuffer */\n#define CL_MEM_USE_UNCACHED_CPU_MEMORY_IMG          (1 << 26)\n#define CL_MEM_USE_CACHED_CPU_MEMORY_IMG            (1 << 27)\n\n\n/******************************************\n * cl_img_use_gralloc_ptr extension *\n ******************************************/\n#define cl_img_use_gralloc_ptr 1\n\n/* Flag values used by clCreateBuffer */\n#define CL_MEM_USE_GRALLOC_PTR_IMG                  (1 << 28)\n\n/* To be used by clGetEventInfo: */\n#define CL_COMMAND_ACQUIRE_GRALLOC_OBJECTS_IMG      0x40D2\n#define CL_COMMAND_RELEASE_GRALLOC_OBJECTS_IMG      0x40D3\n\n/* Error code from clEnqueueReleaseGrallocObjectsIMG */\n#define CL_GRALLOC_RESOURCE_NOT_ACQUIRED_IMG        0x40D4\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclEnqueueAcquireGrallocObjectsIMG(cl_command_queue      command_queue,\n                                  cl_uint               num_objects,\n                                  const cl_mem *        mem_objects,\n                                  cl_uint               num_events_in_wait_list,\n                                  const cl_event *      event_wait_list,\n                                  cl_event *            event) CL_EXT_SUFFIX__VERSION_1_2;\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclEnqueueReleaseGrallocObjectsIMG(cl_command_queue      command_queue,\n                                  cl_uint         
      num_objects,\n                                  const cl_mem *        mem_objects,\n                                  cl_uint               num_events_in_wait_list,\n                                  const cl_event *      event_wait_list,\n                                  cl_event *            event) CL_EXT_SUFFIX__VERSION_1_2;\n\n\n/*********************************\n* cl_khr_subgroups extension\n*********************************/\n#define cl_khr_subgroups 1\n\n#if !defined(CL_VERSION_2_1)\n/* For OpenCL 2.1 and newer, cl_kernel_sub_group_info is declared in CL.h.\n   In hindsight, there should have been a khr suffix on this type for\n   the extension, but keeping it un-suffixed to maintain backwards\n   compatibility. */\ntypedef cl_uint             cl_kernel_sub_group_info;\n#endif\n\n/* cl_kernel_sub_group_info */\n#define CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR    0x2033\n#define CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE_KHR       0x2034\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclGetKernelSubGroupInfoKHR(cl_kernel    in_kernel,\n                           cl_device_id in_device,\n                           cl_kernel_sub_group_info param_name,\n                           size_t       input_value_size,\n                           const void * input_value,\n                           size_t       param_value_size,\n                           void *       param_value,\n                           size_t *     param_value_size_ret) CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED;\n\ntypedef CL_API_ENTRY cl_int\n(CL_API_CALL * clGetKernelSubGroupInfoKHR_fn)(cl_kernel    in_kernel,\n                                              cl_device_id in_device,\n                                              cl_kernel_sub_group_info param_name,\n                                              size_t       input_value_size,\n                                              const void * input_value,\n                                              size_t       param_value_size,\n         
                                     void *       param_value,\n                                              size_t *     param_value_size_ret) CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED;\n\n\n/*********************************\n* cl_khr_mipmap_image extension\n*********************************/\n\n/* cl_sampler_properties */\n#define CL_SAMPLER_MIP_FILTER_MODE_KHR              0x1155\n#define CL_SAMPLER_LOD_MIN_KHR                      0x1156\n#define CL_SAMPLER_LOD_MAX_KHR                      0x1157\n\n\n/*********************************\n* cl_khr_priority_hints extension\n*********************************/\n/* This extension define is for backwards compatibility.\n   It shouldn't be required since this extension has no new functions. */\n#define cl_khr_priority_hints 1\n\ntypedef cl_uint  cl_queue_priority_khr;\n\n/* cl_command_queue_properties */\n#define CL_QUEUE_PRIORITY_KHR 0x1096\n\n/* cl_queue_priority_khr */\n#define CL_QUEUE_PRIORITY_HIGH_KHR (1<<0)\n#define CL_QUEUE_PRIORITY_MED_KHR (1<<1)\n#define CL_QUEUE_PRIORITY_LOW_KHR (1<<2)\n\n\n/*********************************\n* cl_khr_throttle_hints extension\n*********************************/\n/* This extension define is for backwards compatibility.\n   It shouldn't be required since this extension has no new functions. */\n#define cl_khr_throttle_hints 1\n\ntypedef cl_uint  cl_queue_throttle_khr;\n\n/* cl_command_queue_properties */\n#define CL_QUEUE_THROTTLE_KHR 0x1097\n\n/* cl_queue_throttle_khr */\n#define CL_QUEUE_THROTTLE_HIGH_KHR (1<<0)\n#define CL_QUEUE_THROTTLE_MED_KHR (1<<1)\n#define CL_QUEUE_THROTTLE_LOW_KHR (1<<2)\n\n\n/*********************************\n* cl_khr_subgroup_named_barrier\n*********************************/\n/* This extension define is for backwards compatibility.\n   It shouldn't be required since this extension has no new functions. 
*/\n#define cl_khr_subgroup_named_barrier 1\n\n/* cl_device_info */\n#define CL_DEVICE_MAX_NAMED_BARRIER_COUNT_KHR       0x2035\n\n\n/*********************************\n* cl_khr_extended_versioning\n*********************************/\n\n#define cl_khr_extended_versioning 1\n\n#define CL_VERSION_MAJOR_BITS_KHR (10)\n#define CL_VERSION_MINOR_BITS_KHR (10)\n#define CL_VERSION_PATCH_BITS_KHR (12)\n\n#define CL_VERSION_MAJOR_MASK_KHR ((1 << CL_VERSION_MAJOR_BITS_KHR) - 1)\n#define CL_VERSION_MINOR_MASK_KHR ((1 << CL_VERSION_MINOR_BITS_KHR) - 1)\n#define CL_VERSION_PATCH_MASK_KHR ((1 << CL_VERSION_PATCH_BITS_KHR) - 1)\n\n#define CL_VERSION_MAJOR_KHR(version) ((version) >> (CL_VERSION_MINOR_BITS_KHR + CL_VERSION_PATCH_BITS_KHR))\n#define CL_VERSION_MINOR_KHR(version) (((version) >> CL_VERSION_PATCH_BITS_KHR) & CL_VERSION_MINOR_MASK_KHR)\n#define CL_VERSION_PATCH_KHR(version) ((version) & CL_VERSION_PATCH_MASK_KHR)\n\n#define CL_MAKE_VERSION_KHR(major, minor, patch) \\\n    ((((major) & CL_VERSION_MAJOR_MASK_KHR) << (CL_VERSION_MINOR_BITS_KHR + CL_VERSION_PATCH_BITS_KHR)) | \\\n    (((minor) &  CL_VERSION_MINOR_MASK_KHR) << CL_VERSION_PATCH_BITS_KHR) | \\\n    ((patch) & CL_VERSION_PATCH_MASK_KHR))\n\ntypedef cl_uint cl_version_khr;\n\n#define CL_NAME_VERSION_MAX_NAME_SIZE_KHR 64\n\ntypedef struct _cl_name_version_khr\n{\n    cl_version_khr version;\n    char name[CL_NAME_VERSION_MAX_NAME_SIZE_KHR];\n} cl_name_version_khr;\n\n/* cl_platform_info */\n#define CL_PLATFORM_NUMERIC_VERSION_KHR                  0x0906\n#define CL_PLATFORM_EXTENSIONS_WITH_VERSION_KHR          0x0907\n\n/* cl_device_info */\n#define CL_DEVICE_NUMERIC_VERSION_KHR                    0x105E\n#define CL_DEVICE_OPENCL_C_NUMERIC_VERSION_KHR           0x105F\n#define CL_DEVICE_EXTENSIONS_WITH_VERSION_KHR            0x1060\n#define CL_DEVICE_ILS_WITH_VERSION_KHR                   0x1061\n#define CL_DEVICE_BUILT_IN_KERNELS_WITH_VERSION_KHR      0x1062\n\n\n/*********************************\n* 
cl_khr_device_uuid extension\n*********************************/\n#define cl_khr_device_uuid 1\n\n#define CL_UUID_SIZE_KHR 16\n#define CL_LUID_SIZE_KHR 8\n\n#define CL_DEVICE_UUID_KHR          0x106A\n#define CL_DRIVER_UUID_KHR          0x106B\n#define CL_DEVICE_LUID_VALID_KHR    0x106C\n#define CL_DEVICE_LUID_KHR          0x106D\n#define CL_DEVICE_NODE_MASK_KHR     0x106E\n\n\n/**********************************\n * cl_arm_import_memory extension *\n **********************************/\n#define cl_arm_import_memory 1\n\ntypedef intptr_t cl_import_properties_arm;\n\n/* Default and valid properties name for cl_arm_import_memory */\n#define CL_IMPORT_TYPE_ARM                        0x40B2\n\n/* Host process memory type default value for CL_IMPORT_TYPE_ARM property */\n#define CL_IMPORT_TYPE_HOST_ARM                   0x40B3\n\n/* DMA BUF memory type value for CL_IMPORT_TYPE_ARM property */\n#define CL_IMPORT_TYPE_DMA_BUF_ARM                0x40B4\n\n/* Protected memory property */\n#define CL_IMPORT_TYPE_PROTECTED_ARM              0x40B5\n\n/* Android hardware buffer type value for CL_IMPORT_TYPE_ARM property */\n#define CL_IMPORT_TYPE_ANDROID_HARDWARE_BUFFER_ARM 0x41E2\n\n/* Data consistency with host property */\n#define CL_IMPORT_DMA_BUF_DATA_CONSISTENCY_WITH_HOST_ARM 0x41E3\n\n/* Import memory size value to indicate a size for the whole buffer */\n#define CL_IMPORT_MEMORY_WHOLE_ALLOCATION_ARM SIZE_MAX\n\n/* This extension adds a new function that allows for direct memory import into\n * OpenCL via the clImportMemoryARM function.\n *\n * Memory imported through this interface will be mapped into the device's page\n * tables directly, providing zero copy access. 
It will never fall back to copy\n * operations and aliased buffers.\n *\n * Types of memory supported for import are specified as additional extension\n * strings.\n *\n * This extension produces cl_mem allocations which are compatible with all other\n * users of cl_mem in the standard API.\n *\n * This extension maps pages with the same properties as the normal buffer creation\n * function clCreateBuffer.\n */\nextern CL_API_ENTRY cl_mem CL_API_CALL\nclImportMemoryARM( cl_context context,\n                   cl_mem_flags flags,\n                   const cl_import_properties_arm *properties,\n                   void *memory,\n                   size_t size,\n                   cl_int *errcode_ret) CL_EXT_SUFFIX__VERSION_1_0;\n\n\n/******************************************\n * cl_arm_shared_virtual_memory extension *\n ******************************************/\n#define cl_arm_shared_virtual_memory 1\n\n/* Used by clGetDeviceInfo */\n#define CL_DEVICE_SVM_CAPABILITIES_ARM                  0x40B6\n\n/* Used by clGetMemObjectInfo */\n#define CL_MEM_USES_SVM_POINTER_ARM                     0x40B7\n\n/* Used by clSetKernelExecInfoARM: */\n#define CL_KERNEL_EXEC_INFO_SVM_PTRS_ARM                0x40B8\n#define CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM_ARM   0x40B9\n\n/* To be used by clGetEventInfo: */\n#define CL_COMMAND_SVM_FREE_ARM                         0x40BA\n#define CL_COMMAND_SVM_MEMCPY_ARM                       0x40BB\n#define CL_COMMAND_SVM_MEMFILL_ARM                      0x40BC\n#define CL_COMMAND_SVM_MAP_ARM                          0x40BD\n#define CL_COMMAND_SVM_UNMAP_ARM                        0x40BE\n\n/* Flag values returned by clGetDeviceInfo with CL_DEVICE_SVM_CAPABILITIES_ARM as the param_name. 
*/\n#define CL_DEVICE_SVM_COARSE_GRAIN_BUFFER_ARM           (1 << 0)\n#define CL_DEVICE_SVM_FINE_GRAIN_BUFFER_ARM             (1 << 1)\n#define CL_DEVICE_SVM_FINE_GRAIN_SYSTEM_ARM             (1 << 2)\n#define CL_DEVICE_SVM_ATOMICS_ARM                       (1 << 3)\n\n/* Flag values used by clSVMAllocARM: */\n#define CL_MEM_SVM_FINE_GRAIN_BUFFER_ARM                (1 << 10)\n#define CL_MEM_SVM_ATOMICS_ARM                          (1 << 11)\n\ntypedef cl_bitfield cl_svm_mem_flags_arm;\ntypedef cl_uint     cl_kernel_exec_info_arm;\ntypedef cl_bitfield cl_device_svm_capabilities_arm;\n\nextern CL_API_ENTRY void * CL_API_CALL\nclSVMAllocARM(cl_context       context,\n              cl_svm_mem_flags_arm flags,\n              size_t           size,\n              cl_uint          alignment) CL_EXT_SUFFIX__VERSION_1_2;\n\nextern CL_API_ENTRY void CL_API_CALL\nclSVMFreeARM(cl_context        context,\n             void *            svm_pointer) CL_EXT_SUFFIX__VERSION_1_2;\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclEnqueueSVMFreeARM(cl_command_queue  command_queue,\n                    cl_uint           num_svm_pointers,\n                    void *            svm_pointers[],\n                    void (CL_CALLBACK * pfn_free_func)(cl_command_queue queue,\n                                                       cl_uint          num_svm_pointers,\n                                                       void *           svm_pointers[],\n                                                       void *           user_data),\n                    void *            user_data,\n                    cl_uint           num_events_in_wait_list,\n                    const cl_event *  event_wait_list,\n                    cl_event *        event) CL_EXT_SUFFIX__VERSION_1_2;\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclEnqueueSVMMemcpyARM(cl_command_queue  command_queue,\n                      cl_bool           blocking_copy,\n                      void *            dst_ptr,\n                    
  const void *      src_ptr,\n                      size_t            size,\n                      cl_uint           num_events_in_wait_list,\n                      const cl_event *  event_wait_list,\n                      cl_event *        event) CL_EXT_SUFFIX__VERSION_1_2;\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclEnqueueSVMMemFillARM(cl_command_queue  command_queue,\n                       void *            svm_ptr,\n                       const void *      pattern,\n                       size_t            pattern_size,\n                       size_t            size,\n                       cl_uint           num_events_in_wait_list,\n                       const cl_event *  event_wait_list,\n                       cl_event *        event) CL_EXT_SUFFIX__VERSION_1_2;\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclEnqueueSVMMapARM(cl_command_queue  command_queue,\n                   cl_bool           blocking_map,\n                   cl_map_flags      flags,\n                   void *            svm_ptr,\n                   size_t            size,\n                   cl_uint           num_events_in_wait_list,\n                   const cl_event *  event_wait_list,\n                   cl_event *        event) CL_EXT_SUFFIX__VERSION_1_2;\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclEnqueueSVMUnmapARM(cl_command_queue  command_queue,\n                     void *            svm_ptr,\n                     cl_uint           num_events_in_wait_list,\n                     const cl_event *  event_wait_list,\n                     cl_event *        event) CL_EXT_SUFFIX__VERSION_1_2;\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclSetKernelArgSVMPointerARM(cl_kernel    kernel,\n                            cl_uint      arg_index,\n                            const void * arg_value) CL_EXT_SUFFIX__VERSION_1_2;\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclSetKernelExecInfoARM(cl_kernel            kernel,\n                       cl_kernel_exec_info_arm  param_name,\n                
       size_t               param_value_size,\n                       const void *         param_value) CL_EXT_SUFFIX__VERSION_1_2;\n\n/********************************\n * cl_arm_get_core_id extension *\n ********************************/\n\n#ifdef CL_VERSION_1_2\n\n#define cl_arm_get_core_id 1\n\n/* Device info property for bitfield of cores present */\n#define CL_DEVICE_COMPUTE_UNITS_BITFIELD_ARM      0x40BF\n\n#endif  /* CL_VERSION_1_2 */\n\n/*********************************\n* cl_arm_job_slot_selection\n*********************************/\n\n#define cl_arm_job_slot_selection 1\n\n/* cl_device_info */\n#define CL_DEVICE_JOB_SLOTS_ARM                   0x41E0\n\n/* cl_command_queue_properties */\n#define CL_QUEUE_JOB_SLOT_ARM                     0x41E1\n\n#ifdef __cplusplus\n}\n#endif\n\n\n#endif /* __CL_EXT_H */\n"
  },
  {
    "path": "GpuMemLatency/OpenCL/include/CL/cl_ext_intel.h",
    "content": "/*******************************************************************************\n * Copyright (c) 2008-2020 The Khronos Group Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *    http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n *\n ******************************************************************************/\n/*****************************************************************************\\\n\nCopyright (c) 2013-2020 Intel Corporation All Rights Reserved.\n\nTHESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\n\"AS IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\nLIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\nA PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS\nCONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\nEXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\nPROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\nPROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY\nOF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING\nNEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE\nMATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\nFile Name: cl_ext_intel.h\n\nAbstract:\n\nNotes:\n\n\\*****************************************************************************/\n\n#ifndef __CL_EXT_INTEL_H\n#define __CL_EXT_INTEL_H\n\n#include <CL/cl.h>\n#include <CL/cl_platform.h>\n\n#ifdef __cplusplus\nextern \"C\" {\n#endif\n\n/***************************************\n* cl_intel_thread_local_exec extension *\n****************************************/\n\n#define cl_intel_thread_local_exec 1\n\n#define CL_QUEUE_THREAD_LOCAL_EXEC_ENABLE_INTEL      (((cl_bitfield)1) << 31)\n\n/***********************************************\n* cl_intel_device_partition_by_names extension *\n************************************************/\n\n#define cl_intel_device_partition_by_names 1\n\n#define CL_DEVICE_PARTITION_BY_NAMES_INTEL          0x4052\n#define CL_PARTITION_BY_NAMES_LIST_END_INTEL        -1\n\n/************************************************\n* cl_intel_accelerator extension                *\n* cl_intel_motion_estimation extension          *\n* cl_intel_advanced_motion_estimation extension *\n*************************************************/\n\n#define cl_intel_accelerator 1\n#define cl_intel_motion_estimation 1\n#define cl_intel_advanced_motion_estimation 1\n\ntypedef struct _cl_accelerator_intel* cl_accelerator_intel;\ntypedef cl_uint cl_accelerator_type_intel;\ntypedef cl_uint cl_accelerator_info_intel;\n\ntypedef struct _cl_motion_estimation_desc_intel {\n    cl_uint mb_block_type;\n    cl_uint 
subpixel_mode;\n    cl_uint sad_adjust_mode;\n    cl_uint search_path_type;\n} cl_motion_estimation_desc_intel;\n\n/* error codes */\n#define CL_INVALID_ACCELERATOR_INTEL                              -1094\n#define CL_INVALID_ACCELERATOR_TYPE_INTEL                         -1095\n#define CL_INVALID_ACCELERATOR_DESCRIPTOR_INTEL                   -1096\n#define CL_ACCELERATOR_TYPE_NOT_SUPPORTED_INTEL                   -1097\n\n/* cl_accelerator_type_intel */\n#define CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL               0x0\n\n/* cl_accelerator_info_intel */\n#define CL_ACCELERATOR_DESCRIPTOR_INTEL                           0x4090\n#define CL_ACCELERATOR_REFERENCE_COUNT_INTEL                      0x4091\n#define CL_ACCELERATOR_CONTEXT_INTEL                              0x4092\n#define CL_ACCELERATOR_TYPE_INTEL                                 0x4093\n\n/* cl_motion_detect_desc_intel flags */\n#define CL_ME_MB_TYPE_16x16_INTEL                                 0x0\n#define CL_ME_MB_TYPE_8x8_INTEL                                   0x1\n#define CL_ME_MB_TYPE_4x4_INTEL                                   0x2\n\n#define CL_ME_SUBPIXEL_MODE_INTEGER_INTEL                         0x0\n#define CL_ME_SUBPIXEL_MODE_HPEL_INTEL                            0x1\n#define CL_ME_SUBPIXEL_MODE_QPEL_INTEL                            0x2\n\n#define CL_ME_SAD_ADJUST_MODE_NONE_INTEL                          0x0\n#define CL_ME_SAD_ADJUST_MODE_HAAR_INTEL                          0x1\n\n#define CL_ME_SEARCH_PATH_RADIUS_2_2_INTEL                        0x0\n#define CL_ME_SEARCH_PATH_RADIUS_4_4_INTEL                        0x1\n#define CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL                      0x5\n\n#define CL_ME_SKIP_BLOCK_TYPE_16x16_INTEL                         0x0\n#define CL_ME_CHROMA_INTRA_PREDICT_ENABLED_INTEL                  0x1\n#define CL_ME_LUMA_INTRA_PREDICT_ENABLED_INTEL                    0x2\n#define CL_ME_SKIP_BLOCK_TYPE_8x8_INTEL                           0x4\n\n#define 
CL_ME_FORWARD_INPUT_MODE_INTEL                            0x1\n#define CL_ME_BACKWARD_INPUT_MODE_INTEL                           0x2\n#define CL_ME_BIDIRECTION_INPUT_MODE_INTEL                        0x3\n\n#define CL_ME_BIDIR_WEIGHT_QUARTER_INTEL                          16\n#define CL_ME_BIDIR_WEIGHT_THIRD_INTEL                            21\n#define CL_ME_BIDIR_WEIGHT_HALF_INTEL                             32\n#define CL_ME_BIDIR_WEIGHT_TWO_THIRD_INTEL                        43\n#define CL_ME_BIDIR_WEIGHT_THREE_QUARTER_INTEL                    48\n\n#define CL_ME_COST_PENALTY_NONE_INTEL                             0x0\n#define CL_ME_COST_PENALTY_LOW_INTEL                              0x1\n#define CL_ME_COST_PENALTY_NORMAL_INTEL                           0x2\n#define CL_ME_COST_PENALTY_HIGH_INTEL                             0x3\n\n#define CL_ME_COST_PRECISION_QPEL_INTEL                           0x0\n#define CL_ME_COST_PRECISION_HPEL_INTEL                           0x1\n#define CL_ME_COST_PRECISION_PEL_INTEL                            0x2\n#define CL_ME_COST_PRECISION_DPEL_INTEL                           0x3\n\n#define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_INTEL                  0x0\n#define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_INTEL                0x1\n#define CL_ME_LUMA_PREDICTOR_MODE_DC_INTEL                        0x2\n#define CL_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_LEFT_INTEL        0x3\n\n#define CL_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_RIGHT_INTEL       0x4\n#define CL_ME_LUMA_PREDICTOR_MODE_PLANE_INTEL                     0x4\n#define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_RIGHT_INTEL            0x5\n#define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_DOWN_INTEL           0x6\n#define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_LEFT_INTEL             0x7\n#define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_UP_INTEL             0x8\n\n#define CL_ME_CHROMA_PREDICTOR_MODE_DC_INTEL                      0x0\n#define CL_ME_CHROMA_PREDICTOR_MODE_HORIZONTAL_INTEL              0x1\n#define 
CL_ME_CHROMA_PREDICTOR_MODE_VERTICAL_INTEL                0x2\n#define CL_ME_CHROMA_PREDICTOR_MODE_PLANE_INTEL                   0x3\n\n/* cl_device_info */\n#define CL_DEVICE_ME_VERSION_INTEL                                0x407E\n\n#define CL_ME_VERSION_LEGACY_INTEL                                0x0\n#define CL_ME_VERSION_ADVANCED_VER_1_INTEL                        0x1\n#define CL_ME_VERSION_ADVANCED_VER_2_INTEL                        0x2\n\nextern CL_API_ENTRY cl_accelerator_intel CL_API_CALL\nclCreateAcceleratorINTEL(\n    cl_context                   context,\n    cl_accelerator_type_intel    accelerator_type,\n    size_t                       descriptor_size,\n    const void*                  descriptor,\n    cl_int*                      errcode_ret) CL_EXT_SUFFIX__VERSION_1_2;\n\ntypedef CL_API_ENTRY cl_accelerator_intel (CL_API_CALL *clCreateAcceleratorINTEL_fn)(\n    cl_context                   context,\n    cl_accelerator_type_intel    accelerator_type,\n    size_t                       descriptor_size,\n    const void*                  descriptor,\n    cl_int*                      errcode_ret) CL_EXT_SUFFIX__VERSION_1_2;\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclGetAcceleratorInfoINTEL(\n    cl_accelerator_intel         accelerator,\n    cl_accelerator_info_intel    param_name,\n    size_t                       param_value_size,\n    void*                        param_value,\n    size_t*                      param_value_size_ret) CL_EXT_SUFFIX__VERSION_1_2;\n\ntypedef CL_API_ENTRY cl_int (CL_API_CALL *clGetAcceleratorInfoINTEL_fn)(\n    cl_accelerator_intel         accelerator,\n    cl_accelerator_info_intel    param_name,\n    size_t                       param_value_size,\n    void*                        param_value,\n    size_t*                      param_value_size_ret) CL_EXT_SUFFIX__VERSION_1_2;\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclRetainAcceleratorINTEL(\n    cl_accelerator_intel         accelerator) 
CL_EXT_SUFFIX__VERSION_1_2;\n\ntypedef CL_API_ENTRY cl_int (CL_API_CALL *clRetainAcceleratorINTEL_fn)(\n    cl_accelerator_intel         accelerator) CL_EXT_SUFFIX__VERSION_1_2;\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclReleaseAcceleratorINTEL(\n    cl_accelerator_intel         accelerator) CL_EXT_SUFFIX__VERSION_1_2;\n\ntypedef CL_API_ENTRY cl_int (CL_API_CALL *clReleaseAcceleratorINTEL_fn)(\n    cl_accelerator_intel         accelerator) CL_EXT_SUFFIX__VERSION_1_2;\n\n/******************************************\n* cl_intel_simultaneous_sharing extension *\n*******************************************/\n\n#define cl_intel_simultaneous_sharing 1\n\n#define CL_DEVICE_SIMULTANEOUS_INTEROPS_INTEL            0x4104\n#define CL_DEVICE_NUM_SIMULTANEOUS_INTEROPS_INTEL        0x4105\n\n/***********************************\n* cl_intel_egl_image_yuv extension *\n************************************/\n\n#define cl_intel_egl_image_yuv 1\n\n#define CL_EGL_YUV_PLANE_INTEL                           0x4107\n\n/********************************\n* cl_intel_packed_yuv extension *\n*********************************/\n\n#define cl_intel_packed_yuv 1\n\n#define CL_YUYV_INTEL                                    0x4076\n#define CL_UYVY_INTEL                                    0x4077\n#define CL_YVYU_INTEL                                    0x4078\n#define CL_VYUY_INTEL                                    0x4079\n\n/********************************************\n* cl_intel_required_subgroup_size extension *\n*********************************************/\n\n#define cl_intel_required_subgroup_size 1\n\n#define CL_DEVICE_SUB_GROUP_SIZES_INTEL                  0x4108\n#define CL_KERNEL_SPILL_MEM_SIZE_INTEL                   0x4109\n#define CL_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL           0x410A\n\n/****************************************\n* cl_intel_driver_diagnostics extension *\n*****************************************/\n\n#define cl_intel_driver_diagnostics 1\n\ntypedef cl_uint 
cl_diagnostics_verbose_level;\n\n#define CL_CONTEXT_SHOW_DIAGNOSTICS_INTEL                0x4106\n\n#define CL_CONTEXT_DIAGNOSTICS_LEVEL_ALL_INTEL           ( 0xff )\n#define CL_CONTEXT_DIAGNOSTICS_LEVEL_GOOD_INTEL          ( 1 )\n#define CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL           ( 1 << 1 )\n#define CL_CONTEXT_DIAGNOSTICS_LEVEL_NEUTRAL_INTEL       ( 1 << 2 )\n\n/********************************\n* cl_intel_planar_yuv extension *\n*********************************/\n\n#define CL_NV12_INTEL                                       0x410E\n\n#define CL_MEM_NO_ACCESS_INTEL                              ( 1 << 24 )\n#define CL_MEM_ACCESS_FLAGS_UNRESTRICTED_INTEL              ( 1 << 25 )\n\n#define CL_DEVICE_PLANAR_YUV_MAX_WIDTH_INTEL                0x417E\n#define CL_DEVICE_PLANAR_YUV_MAX_HEIGHT_INTEL               0x417F\n\n/*******************************************************\n* cl_intel_device_side_avc_motion_estimation extension *\n********************************************************/\n\n#define CL_DEVICE_AVC_ME_VERSION_INTEL                      0x410B\n#define CL_DEVICE_AVC_ME_SUPPORTS_TEXTURE_SAMPLER_USE_INTEL 0x410C\n#define CL_DEVICE_AVC_ME_SUPPORTS_PREEMPTION_INTEL          0x410D\n\n#define CL_AVC_ME_VERSION_0_INTEL                           0x0   /* No support. */\n#define CL_AVC_ME_VERSION_1_INTEL                           0x1   /* First supported version. 
*/\n\n#define CL_AVC_ME_MAJOR_16x16_INTEL                         0x0\n#define CL_AVC_ME_MAJOR_16x8_INTEL                          0x1\n#define CL_AVC_ME_MAJOR_8x16_INTEL                          0x2\n#define CL_AVC_ME_MAJOR_8x8_INTEL                           0x3\n\n#define CL_AVC_ME_MINOR_8x8_INTEL                           0x0\n#define CL_AVC_ME_MINOR_8x4_INTEL                           0x1\n#define CL_AVC_ME_MINOR_4x8_INTEL                           0x2\n#define CL_AVC_ME_MINOR_4x4_INTEL                           0x3\n\n#define CL_AVC_ME_MAJOR_FORWARD_INTEL                       0x0\n#define CL_AVC_ME_MAJOR_BACKWARD_INTEL                      0x1\n#define CL_AVC_ME_MAJOR_BIDIRECTIONAL_INTEL                 0x2\n\n#define CL_AVC_ME_PARTITION_MASK_ALL_INTEL                  0x0\n#define CL_AVC_ME_PARTITION_MASK_16x16_INTEL                0x7E\n#define CL_AVC_ME_PARTITION_MASK_16x8_INTEL                 0x7D\n#define CL_AVC_ME_PARTITION_MASK_8x16_INTEL                 0x7B\n#define CL_AVC_ME_PARTITION_MASK_8x8_INTEL                  0x77\n#define CL_AVC_ME_PARTITION_MASK_8x4_INTEL                  0x6F\n#define CL_AVC_ME_PARTITION_MASK_4x8_INTEL                  0x5F\n#define CL_AVC_ME_PARTITION_MASK_4x4_INTEL                  0x3F\n\n#define CL_AVC_ME_SEARCH_WINDOW_EXHAUSTIVE_INTEL            0x0\n#define CL_AVC_ME_SEARCH_WINDOW_SMALL_INTEL                 0x1\n#define CL_AVC_ME_SEARCH_WINDOW_TINY_INTEL                  0x2\n#define CL_AVC_ME_SEARCH_WINDOW_EXTRA_TINY_INTEL            0x3\n#define CL_AVC_ME_SEARCH_WINDOW_DIAMOND_INTEL               0x4\n#define CL_AVC_ME_SEARCH_WINDOW_LARGE_DIAMOND_INTEL         0x5\n#define CL_AVC_ME_SEARCH_WINDOW_RESERVED0_INTEL             0x6\n#define CL_AVC_ME_SEARCH_WINDOW_RESERVED1_INTEL             0x7\n#define CL_AVC_ME_SEARCH_WINDOW_CUSTOM_INTEL                0x8\n#define CL_AVC_ME_SEARCH_WINDOW_16x12_RADIUS_INTEL          0x9\n#define CL_AVC_ME_SEARCH_WINDOW_4x4_RADIUS_INTEL            0x2\n#define 
CL_AVC_ME_SEARCH_WINDOW_2x2_RADIUS_INTEL            0xa\n\n#define CL_AVC_ME_SAD_ADJUST_MODE_NONE_INTEL                0x0\n#define CL_AVC_ME_SAD_ADJUST_MODE_HAAR_INTEL                0x2\n\n#define CL_AVC_ME_SUBPIXEL_MODE_INTEGER_INTEL               0x0\n#define CL_AVC_ME_SUBPIXEL_MODE_HPEL_INTEL                  0x1\n#define CL_AVC_ME_SUBPIXEL_MODE_QPEL_INTEL                  0x3\n\n#define CL_AVC_ME_COST_PRECISION_QPEL_INTEL                 0x0\n#define CL_AVC_ME_COST_PRECISION_HPEL_INTEL                 0x1\n#define CL_AVC_ME_COST_PRECISION_PEL_INTEL                  0x2\n#define CL_AVC_ME_COST_PRECISION_DPEL_INTEL                 0x3\n\n#define CL_AVC_ME_BIDIR_WEIGHT_QUARTER_INTEL                0x10\n#define CL_AVC_ME_BIDIR_WEIGHT_THIRD_INTEL                  0x15\n#define CL_AVC_ME_BIDIR_WEIGHT_HALF_INTEL                   0x20\n#define CL_AVC_ME_BIDIR_WEIGHT_TWO_THIRD_INTEL              0x2B\n#define CL_AVC_ME_BIDIR_WEIGHT_THREE_QUARTER_INTEL          0x30\n\n#define CL_AVC_ME_BORDER_REACHED_LEFT_INTEL                 0x0\n#define CL_AVC_ME_BORDER_REACHED_RIGHT_INTEL                0x2\n#define CL_AVC_ME_BORDER_REACHED_TOP_INTEL                  0x4\n#define CL_AVC_ME_BORDER_REACHED_BOTTOM_INTEL               0x8\n\n#define CL_AVC_ME_SKIP_BLOCK_PARTITION_16x16_INTEL          0x0\n#define CL_AVC_ME_SKIP_BLOCK_PARTITION_8x8_INTEL            0x4000\n\n#define CL_AVC_ME_SKIP_BLOCK_16x16_FORWARD_ENABLE_INTEL     ( 0x1 << 24 )\n#define CL_AVC_ME_SKIP_BLOCK_16x16_BACKWARD_ENABLE_INTEL    ( 0x2 << 24 )\n#define CL_AVC_ME_SKIP_BLOCK_16x16_DUAL_ENABLE_INTEL        ( 0x3 << 24 )\n#define CL_AVC_ME_SKIP_BLOCK_8x8_FORWARD_ENABLE_INTEL       ( 0x55 << 24 )\n#define CL_AVC_ME_SKIP_BLOCK_8x8_BACKWARD_ENABLE_INTEL      ( 0xAA << 24 )\n#define CL_AVC_ME_SKIP_BLOCK_8x8_DUAL_ENABLE_INTEL          ( 0xFF << 24 )\n#define CL_AVC_ME_SKIP_BLOCK_8x8_0_FORWARD_ENABLE_INTEL     ( 0x1 << 24 )\n#define CL_AVC_ME_SKIP_BLOCK_8x8_0_BACKWARD_ENABLE_INTEL    ( 0x2 << 24 )\n#define 
CL_AVC_ME_SKIP_BLOCK_8x8_1_FORWARD_ENABLE_INTEL     ( 0x1 << 26 )\n#define CL_AVC_ME_SKIP_BLOCK_8x8_1_BACKWARD_ENABLE_INTEL    ( 0x2 << 26 )\n#define CL_AVC_ME_SKIP_BLOCK_8x8_2_FORWARD_ENABLE_INTEL     ( 0x1 << 28 )\n#define CL_AVC_ME_SKIP_BLOCK_8x8_2_BACKWARD_ENABLE_INTEL    ( 0x2 << 28 )\n#define CL_AVC_ME_SKIP_BLOCK_8x8_3_FORWARD_ENABLE_INTEL     ( 0x1 << 30 )\n#define CL_AVC_ME_SKIP_BLOCK_8x8_3_BACKWARD_ENABLE_INTEL    ( 0x2 << 30 )\n\n#define CL_AVC_ME_BLOCK_BASED_SKIP_4x4_INTEL                0x00\n#define CL_AVC_ME_BLOCK_BASED_SKIP_8x8_INTEL                0x80\n\n#define CL_AVC_ME_INTRA_16x16_INTEL                         0x0\n#define CL_AVC_ME_INTRA_8x8_INTEL                           0x1\n#define CL_AVC_ME_INTRA_4x4_INTEL                           0x2\n\n#define CL_AVC_ME_INTRA_LUMA_PARTITION_MASK_16x16_INTEL     0x6\n#define CL_AVC_ME_INTRA_LUMA_PARTITION_MASK_8x8_INTEL       0x5\n#define CL_AVC_ME_INTRA_LUMA_PARTITION_MASK_4x4_INTEL       0x3\n\n#define CL_AVC_ME_INTRA_NEIGHBOR_LEFT_MASK_ENABLE_INTEL         0x60\n#define CL_AVC_ME_INTRA_NEIGHBOR_UPPER_MASK_ENABLE_INTEL        0x10\n#define CL_AVC_ME_INTRA_NEIGHBOR_UPPER_RIGHT_MASK_ENABLE_INTEL  0x8\n#define CL_AVC_ME_INTRA_NEIGHBOR_UPPER_LEFT_MASK_ENABLE_INTEL   0x4\n\n#define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_INTEL            0x0\n#define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_INTEL          0x1\n#define CL_AVC_ME_LUMA_PREDICTOR_MODE_DC_INTEL                  0x2\n#define CL_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_LEFT_INTEL  0x3\n#define CL_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_RIGHT_INTEL 0x4\n#define CL_AVC_ME_LUMA_PREDICTOR_MODE_PLANE_INTEL               0x4\n#define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_RIGHT_INTEL      0x5\n#define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_DOWN_INTEL     0x6\n#define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_LEFT_INTEL       0x7\n#define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_UP_INTEL       0x8\n#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_DC_INTEL          
      0x0\n#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_HORIZONTAL_INTEL        0x1\n#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_VERTICAL_INTEL          0x2\n#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_PLANE_INTEL             0x3\n\n#define CL_AVC_ME_FRAME_FORWARD_INTEL                       0x1\n#define CL_AVC_ME_FRAME_BACKWARD_INTEL                      0x2\n#define CL_AVC_ME_FRAME_DUAL_INTEL                          0x3\n\n#define CL_AVC_ME_SLICE_TYPE_PRED_INTEL                     0x0\n#define CL_AVC_ME_SLICE_TYPE_BPRED_INTEL                    0x1\n#define CL_AVC_ME_SLICE_TYPE_INTRA_INTEL                    0x2\n\n#define CL_AVC_ME_INTERLACED_SCAN_TOP_FIELD_INTEL           0x0\n#define CL_AVC_ME_INTERLACED_SCAN_BOTTOM_FIELD_INTEL        0x1\n\n/*******************************************\n* cl_intel_unified_shared_memory extension *\n********************************************/\n\n/* These APIs are in sync with Revision O of the cl_intel_unified_shared_memory spec! */\n\n#define cl_intel_unified_shared_memory 1\n\n/* cl_device_info */\n#define CL_DEVICE_HOST_MEM_CAPABILITIES_INTEL                   0x4190\n#define CL_DEVICE_DEVICE_MEM_CAPABILITIES_INTEL                 0x4191\n#define CL_DEVICE_SINGLE_DEVICE_SHARED_MEM_CAPABILITIES_INTEL   0x4192\n#define CL_DEVICE_CROSS_DEVICE_SHARED_MEM_CAPABILITIES_INTEL    0x4193\n#define CL_DEVICE_SHARED_SYSTEM_MEM_CAPABILITIES_INTEL          0x4194\n\ntypedef cl_bitfield cl_device_unified_shared_memory_capabilities_intel;\n\n/* cl_device_unified_shared_memory_capabilities_intel - bitfield */\n#define CL_UNIFIED_SHARED_MEMORY_ACCESS_INTEL                   (1 << 0)\n#define CL_UNIFIED_SHARED_MEMORY_ATOMIC_ACCESS_INTEL            (1 << 1)\n#define CL_UNIFIED_SHARED_MEMORY_CONCURRENT_ACCESS_INTEL        (1 << 2)\n#define CL_UNIFIED_SHARED_MEMORY_CONCURRENT_ATOMIC_ACCESS_INTEL (1 << 3)\n\ntypedef cl_bitfield cl_mem_properties_intel;\n\n/* cl_mem_properties_intel */\n#define CL_MEM_ALLOC_FLAGS_INTEL        0x4195\n\ntypedef cl_bitfield 
cl_mem_alloc_flags_intel;\n\n/* cl_mem_alloc_flags_intel - bitfield */\n#define CL_MEM_ALLOC_WRITE_COMBINED_INTEL               (1 << 0)\n\ntypedef cl_uint cl_mem_info_intel;\n\n/* cl_mem_alloc_info_intel */\n#define CL_MEM_ALLOC_TYPE_INTEL         0x419A\n#define CL_MEM_ALLOC_BASE_PTR_INTEL     0x419B\n#define CL_MEM_ALLOC_SIZE_INTEL         0x419C\n#define CL_MEM_ALLOC_DEVICE_INTEL       0x419D\n/* Enum values 0x419E-0x419F are reserved for future queries. */\n\ntypedef cl_uint cl_unified_shared_memory_type_intel;\n\n/* cl_unified_shared_memory_type_intel */\n#define CL_MEM_TYPE_UNKNOWN_INTEL       0x4196\n#define CL_MEM_TYPE_HOST_INTEL          0x4197\n#define CL_MEM_TYPE_DEVICE_INTEL        0x4198\n#define CL_MEM_TYPE_SHARED_INTEL        0x4199\n\ntypedef cl_uint cl_mem_advice_intel;\n\n/* cl_mem_advice_intel */\n/* Enum values 0x4208-0x420F are reserved for future memory advices. */\n\n/* cl_kernel_exec_info */\n#define CL_KERNEL_EXEC_INFO_INDIRECT_HOST_ACCESS_INTEL      0x4200\n#define CL_KERNEL_EXEC_INFO_INDIRECT_DEVICE_ACCESS_INTEL    0x4201\n#define CL_KERNEL_EXEC_INFO_INDIRECT_SHARED_ACCESS_INTEL    0x4202\n#define CL_KERNEL_EXEC_INFO_USM_PTRS_INTEL                  0x4203\n\n/* cl_command_type */\n#define CL_COMMAND_MEMFILL_INTEL        0x4204\n#define CL_COMMAND_MEMCPY_INTEL         0x4205\n#define CL_COMMAND_MIGRATEMEM_INTEL     0x4206\n#define CL_COMMAND_MEMADVISE_INTEL      0x4207\n\nextern CL_API_ENTRY void* CL_API_CALL\nclHostMemAllocINTEL(\n            cl_context context,\n            const cl_mem_properties_intel* properties,\n            size_t size,\n            cl_uint alignment,\n            cl_int* errcode_ret);\n\ntypedef CL_API_ENTRY void* (CL_API_CALL *\nclHostMemAllocINTEL_fn)(\n            cl_context context,\n            const cl_mem_properties_intel* properties,\n            size_t size,\n            cl_uint alignment,\n            cl_int* errcode_ret);\n\nextern CL_API_ENTRY void* CL_API_CALL\nclDeviceMemAllocINTEL(\n            
cl_context context,\n            cl_device_id device,\n            const cl_mem_properties_intel* properties,\n            size_t size,\n            cl_uint alignment,\n            cl_int* errcode_ret);\n\ntypedef CL_API_ENTRY void* (CL_API_CALL *\nclDeviceMemAllocINTEL_fn)(\n            cl_context context,\n            cl_device_id device,\n            const cl_mem_properties_intel* properties,\n            size_t size,\n            cl_uint alignment,\n            cl_int* errcode_ret);\n\nextern CL_API_ENTRY void* CL_API_CALL\nclSharedMemAllocINTEL(\n            cl_context context,\n            cl_device_id device,\n            const cl_mem_properties_intel* properties,\n            size_t size,\n            cl_uint alignment,\n            cl_int* errcode_ret);\n\ntypedef CL_API_ENTRY void* (CL_API_CALL *\nclSharedMemAllocINTEL_fn)(\n            cl_context context,\n            cl_device_id device,\n            const cl_mem_properties_intel* properties,\n            size_t size,\n            cl_uint alignment,\n            cl_int* errcode_ret);\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclMemFreeINTEL(\n            cl_context context,\n            void* ptr);\n\ntypedef CL_API_ENTRY cl_int (CL_API_CALL *\nclMemFreeINTEL_fn)(\n            cl_context context,\n            void* ptr);\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclGetMemAllocInfoINTEL(\n            cl_context context,\n            const void* ptr,\n            cl_mem_info_intel param_name,\n            size_t param_value_size,\n            void* param_value,\n            size_t* param_value_size_ret);\n\ntypedef CL_API_ENTRY cl_int (CL_API_CALL *\nclGetMemAllocInfoINTEL_fn)(\n            cl_context context,\n            const void* ptr,\n            cl_mem_info_intel param_name,\n            size_t param_value_size,\n            void* param_value,\n            size_t* param_value_size_ret);\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclSetKernelArgMemPointerINTEL(\n            cl_kernel kernel,\n        
    cl_uint arg_index,\n            const void* arg_value);\n\ntypedef CL_API_ENTRY cl_int (CL_API_CALL *\nclSetKernelArgMemPointerINTEL_fn)(\n            cl_kernel kernel,\n            cl_uint arg_index,\n            const void* arg_value);\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclEnqueueMemsetINTEL(       /* Deprecated */\n            cl_command_queue command_queue,\n            void* dst_ptr,\n            cl_int value,\n            size_t size,\n            cl_uint num_events_in_wait_list,\n            const cl_event* event_wait_list,\n            cl_event* event);\n\ntypedef CL_API_ENTRY cl_int (CL_API_CALL *\nclEnqueueMemsetINTEL_fn)(   /* Deprecated */\n            cl_command_queue command_queue,\n            void* dst_ptr,\n            cl_int value,\n            size_t size,\n            cl_uint num_events_in_wait_list,\n            const cl_event* event_wait_list,\n            cl_event* event);\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclEnqueueMemFillINTEL(\n            cl_command_queue command_queue,\n            void* dst_ptr,\n            const void* pattern,\n            size_t pattern_size,\n            size_t size,\n            cl_uint num_events_in_wait_list,\n            const cl_event* event_wait_list,\n            cl_event* event);\n\ntypedef CL_API_ENTRY cl_int (CL_API_CALL *\nclEnqueueMemFillINTEL_fn)(\n            cl_command_queue command_queue,\n            void* dst_ptr,\n            const void* pattern,\n            size_t pattern_size,\n            size_t size,\n            cl_uint num_events_in_wait_list,\n            const cl_event* event_wait_list,\n            cl_event* event);\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclEnqueueMemcpyINTEL(\n            cl_command_queue command_queue,\n            cl_bool blocking,\n            void* dst_ptr,\n            const void* src_ptr,\n            size_t size,\n            cl_uint num_events_in_wait_list,\n            const cl_event* event_wait_list,\n            cl_event* 
event);\n\ntypedef CL_API_ENTRY cl_int (CL_API_CALL *\nclEnqueueMemcpyINTEL_fn)(\n            cl_command_queue command_queue,\n            cl_bool blocking,\n            void* dst_ptr,\n            const void* src_ptr,\n            size_t size,\n            cl_uint num_events_in_wait_list,\n            const cl_event* event_wait_list,\n            cl_event* event);\n\n#ifdef CL_VERSION_1_2\n\n/* Because these APIs use cl_mem_migration_flags, they require\n   OpenCL 1.2: */\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclEnqueueMigrateMemINTEL(\n            cl_command_queue command_queue,\n            const void* ptr,\n            size_t size,\n            cl_mem_migration_flags flags,\n            cl_uint num_events_in_wait_list,\n            const cl_event* event_wait_list,\n            cl_event* event);\n\ntypedef CL_API_ENTRY cl_int (CL_API_CALL *\nclEnqueueMigrateMemINTEL_fn)(\n            cl_command_queue command_queue,\n            const void* ptr,\n            size_t size,\n            cl_mem_migration_flags flags,\n            cl_uint num_events_in_wait_list,\n            const cl_event* event_wait_list,\n            cl_event* event);\n\n#endif\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclEnqueueMemAdviseINTEL(\n            cl_command_queue command_queue,\n            const void* ptr,\n            size_t size,\n            cl_mem_advice_intel advice,\n            cl_uint num_events_in_wait_list,\n            const cl_event* event_wait_list,\n            cl_event* event);\n\ntypedef CL_API_ENTRY cl_int (CL_API_CALL *\nclEnqueueMemAdviseINTEL_fn)(\n            cl_command_queue command_queue,\n            const void* ptr,\n            size_t size,\n            cl_mem_advice_intel advice,\n            cl_uint num_events_in_wait_list,\n            const cl_event* event_wait_list,\n            cl_event* event);\n\n#ifdef __cplusplus\n}\n#endif\n\n#endif /* __CL_EXT_INTEL_H */\n"
  },
  {
    "path": "GpuMemLatency/OpenCL/include/CL/cl_gl.h",
    "content": "/*******************************************************************************\n * Copyright (c) 2008-2020 The Khronos Group Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *    http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n ******************************************************************************/\n\n#ifndef __OPENCL_CL_GL_H\n#define __OPENCL_CL_GL_H\n\n#include <CL/cl.h>\n\n#ifdef __cplusplus\nextern \"C\" {\n#endif\n\ntypedef cl_uint     cl_gl_object_type;\ntypedef cl_uint     cl_gl_texture_info;\ntypedef cl_uint     cl_gl_platform_info;\ntypedef struct __GLsync *cl_GLsync;\n\n/* cl_gl_object_type = 0x2000 - 0x200F enum values are currently taken           */\n#define CL_GL_OBJECT_BUFFER                     0x2000\n#define CL_GL_OBJECT_TEXTURE2D                  0x2001\n#define CL_GL_OBJECT_TEXTURE3D                  0x2002\n#define CL_GL_OBJECT_RENDERBUFFER               0x2003\n#ifdef CL_VERSION_1_2\n#define CL_GL_OBJECT_TEXTURE2D_ARRAY            0x200E\n#define CL_GL_OBJECT_TEXTURE1D                  0x200F\n#define CL_GL_OBJECT_TEXTURE1D_ARRAY            0x2010\n#define CL_GL_OBJECT_TEXTURE_BUFFER             0x2011\n#endif\n\n/* cl_gl_texture_info           */\n#define CL_GL_TEXTURE_TARGET                    0x2004\n#define CL_GL_MIPMAP_LEVEL                      0x2005\n#ifdef CL_VERSION_1_2\n#define CL_GL_NUM_SAMPLES                       0x2012\n#endif\n\n\nextern CL_API_ENTRY cl_mem CL_API_CALL\nclCreateFromGLBuffer(cl_context     context,\n                   
  cl_mem_flags   flags,\n                     cl_GLuint      bufobj,\n                     cl_int *       errcode_ret) CL_API_SUFFIX__VERSION_1_0;\n\n#ifdef CL_VERSION_1_2\n\nextern CL_API_ENTRY cl_mem CL_API_CALL\nclCreateFromGLTexture(cl_context      context,\n                      cl_mem_flags    flags,\n                      cl_GLenum       target,\n                      cl_GLint        miplevel,\n                      cl_GLuint       texture,\n                      cl_int *        errcode_ret) CL_API_SUFFIX__VERSION_1_2;\n\n#endif\n\nextern CL_API_ENTRY cl_mem CL_API_CALL\nclCreateFromGLRenderbuffer(cl_context   context,\n                           cl_mem_flags flags,\n                           cl_GLuint    renderbuffer,\n                           cl_int *     errcode_ret) CL_API_SUFFIX__VERSION_1_0;\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclGetGLObjectInfo(cl_mem                memobj,\n                  cl_gl_object_type *   gl_object_type,\n                  cl_GLuint *           gl_object_name) CL_API_SUFFIX__VERSION_1_0;\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclGetGLTextureInfo(cl_mem               memobj,\n                   cl_gl_texture_info   param_name,\n                   size_t               param_value_size,\n                   void *               param_value,\n                   size_t *             param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclEnqueueAcquireGLObjects(cl_command_queue      command_queue,\n                          cl_uint               num_objects,\n                          const cl_mem *        mem_objects,\n                          cl_uint               num_events_in_wait_list,\n                          const cl_event *      event_wait_list,\n                          cl_event *            event) CL_API_SUFFIX__VERSION_1_0;\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclEnqueueReleaseGLObjects(cl_command_queue      command_queue,\n                          cl_uint      
         num_objects,\n                          const cl_mem *        mem_objects,\n                          cl_uint               num_events_in_wait_list,\n                          const cl_event *      event_wait_list,\n                          cl_event *            event) CL_API_SUFFIX__VERSION_1_0;\n\n\n/* Deprecated OpenCL 1.1 APIs */\nextern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL\nclCreateFromGLTexture2D(cl_context      context,\n                        cl_mem_flags    flags,\n                        cl_GLenum       target,\n                        cl_GLint        miplevel,\n                        cl_GLuint       texture,\n                        cl_int *        errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;\n\nextern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL\nclCreateFromGLTexture3D(cl_context      context,\n                        cl_mem_flags    flags,\n                        cl_GLenum       target,\n                        cl_GLint        miplevel,\n                        cl_GLuint       texture,\n                        cl_int *        errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;\n\n/* cl_khr_gl_sharing extension  */\n\n#define cl_khr_gl_sharing 1\n\ntypedef cl_uint     cl_gl_context_info;\n\n/* Additional Error Codes  */\n#define CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR  -1000\n\n/* cl_gl_context_info  */\n#define CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR    0x2006\n#define CL_DEVICES_FOR_GL_CONTEXT_KHR           0x2007\n\n/* Additional cl_context_properties  */\n#define CL_GL_CONTEXT_KHR                       0x2008\n#define CL_EGL_DISPLAY_KHR                      0x2009\n#define CL_GLX_DISPLAY_KHR                      0x200A\n#define CL_WGL_HDC_KHR                          0x200B\n#define CL_CGL_SHAREGROUP_KHR                   0x200C\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclGetGLContextInfoKHR(const cl_context_properties * properties,\n                      cl_gl_context_info  
          param_name,\n                      size_t                        param_value_size,\n                      void *                        param_value,\n                      size_t *                      param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;\n\ntypedef CL_API_ENTRY cl_int (CL_API_CALL *clGetGLContextInfoKHR_fn)(\n    const cl_context_properties * properties,\n    cl_gl_context_info            param_name,\n    size_t                        param_value_size,\n    void *                        param_value,\n    size_t *                      param_value_size_ret);\n\n#ifdef __cplusplus\n}\n#endif\n\n#endif  /* __OPENCL_CL_GL_H */\n"
  },
  {
    "path": "GpuMemLatency/OpenCL/include/CL/cl_gl_ext.h",
    "content": "/*******************************************************************************\n * Copyright (c) 2008-2020 The Khronos Group Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *    http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n ******************************************************************************/\n\n#ifndef __OPENCL_CL_GL_EXT_H\n#define __OPENCL_CL_GL_EXT_H\n\n#ifdef __cplusplus\nextern \"C\" {\n#endif\n\n#include <CL/cl_gl.h>\n\n/*\n *  cl_khr_gl_event extension\n */\n#define CL_COMMAND_GL_FENCE_SYNC_OBJECT_KHR     0x200D\n\nextern CL_API_ENTRY cl_event CL_API_CALL\nclCreateEventFromGLsyncKHR(cl_context context,\n                           cl_GLsync  cl_GLsync,\n                           cl_int *   errcode_ret) CL_EXT_SUFFIX__VERSION_1_1;\n\n#ifdef __cplusplus\n}\n#endif\n\n#endif\t/* __OPENCL_CL_GL_EXT_H  */\n"
  },
  {
    "path": "GpuMemLatency/OpenCL/include/CL/cl_half.h",
    "content": "/*******************************************************************************\n * Copyright (c) 2019-2020 The Khronos Group Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *    http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n ******************************************************************************/\n\n/**\n * This is a header-only utility library that provides OpenCL host code with\n * routines for converting to/from cl_half values.\n *\n * Example usage:\n *\n *    #include <CL/cl_half.h>\n *    ...\n *    cl_half h = cl_half_from_float(0.5f, CL_HALF_RTE);\n *    cl_float f = cl_half_to_float(h);\n */\n\n#ifndef OPENCL_CL_HALF_H\n#define OPENCL_CL_HALF_H\n\n#include <CL/cl_platform.h>\n\n#include <stdint.h>\n\n#ifdef __cplusplus\nextern \"C\" {\n#endif\n\n\n/**\n * Rounding mode used when converting to cl_half.\n */\ntypedef enum\n{\n  CL_HALF_RTE, // round to nearest even\n  CL_HALF_RTZ, // round towards zero\n  CL_HALF_RTP, // round towards positive infinity\n  CL_HALF_RTN, // round towards negative infinity\n} cl_half_rounding_mode;\n\n\n/* Private utility macros. 
*/\n#define CL_HALF_EXP_MASK 0x7C00\n#define CL_HALF_MAX_FINITE_MAG 0x7BFF\n\n\n/*\n * Utility to deal with values that overflow when converting to half precision.\n */\nstatic inline cl_half cl_half_handle_overflow(cl_half_rounding_mode rounding_mode,\n                                              uint16_t sign)\n{\n  if (rounding_mode == CL_HALF_RTZ)\n  {\n    // Round overflow towards zero -> largest finite number (preserving sign)\n    return (sign << 15) | CL_HALF_MAX_FINITE_MAG;\n  }\n  else if (rounding_mode == CL_HALF_RTP && sign)\n  {\n    // Round negative overflow towards positive infinity -> most negative finite number\n    return (1 << 15) | CL_HALF_MAX_FINITE_MAG;\n  }\n  else if (rounding_mode == CL_HALF_RTN && !sign)\n  {\n    // Round positive overflow towards negative infinity -> largest finite number\n    return CL_HALF_MAX_FINITE_MAG;\n  }\n\n  // Overflow to infinity\n  return (sign << 15) | CL_HALF_EXP_MASK;\n}\n\n/*\n * Utility to deal with values that underflow when converting to half precision.\n */\nstatic inline cl_half cl_half_handle_underflow(cl_half_rounding_mode rounding_mode,\n                                               uint16_t sign)\n{\n  if (rounding_mode == CL_HALF_RTP && !sign)\n  {\n    // Round underflow towards positive infinity -> smallest positive value\n    return (sign << 15) | 1;\n  }\n  else if (rounding_mode == CL_HALF_RTN && sign)\n  {\n    // Round underflow towards negative infinity -> largest negative value\n    return (sign << 15) | 1;\n  }\n\n  // Flush to zero\n  return (sign << 15);\n}\n\n\n/**\n * Convert a cl_float to a cl_half.\n */\nstatic inline cl_half cl_half_from_float(cl_float f, cl_half_rounding_mode rounding_mode)\n{\n  // Type-punning to get direct access to underlying bits\n  union\n  {\n    cl_float f;\n    uint32_t i;\n  } f32;\n  f32.f = f;\n\n  // Extract sign bit\n  uint16_t sign = f32.i >> 31;\n\n  // Extract FP32 exponent and mantissa\n  uint32_t f_exp = (f32.i >> (CL_FLT_MANT_DIG - 1)) & 
0xFF;\n  uint32_t f_mant = f32.i & ((1 << (CL_FLT_MANT_DIG - 1)) - 1);\n\n  // Remove FP32 exponent bias\n  int32_t exp = f_exp - CL_FLT_MAX_EXP + 1;\n\n  // Add FP16 exponent bias\n  uint16_t h_exp = exp + CL_HALF_MAX_EXP - 1;\n\n  // Position of the bit that will become the FP16 mantissa LSB\n  uint32_t lsb_pos = CL_FLT_MANT_DIG - CL_HALF_MANT_DIG;\n\n  // Check for NaN / infinity\n  if (f_exp == 0xFF)\n  {\n    if (f_mant)\n    {\n      // NaN -> propagate mantissa and silence it\n      uint16_t h_mant = f_mant >> lsb_pos;\n      h_mant |= 0x200;\n      return (sign << 15) | CL_HALF_EXP_MASK | h_mant;\n    }\n    else\n    {\n      // Infinity -> zero mantissa\n      return (sign << 15) | CL_HALF_EXP_MASK;\n    }\n  }\n\n  // Check for zero\n  if (!f_exp && !f_mant)\n  {\n    return (sign << 15);\n  }\n\n  // Check for overflow\n  if (exp >= CL_HALF_MAX_EXP)\n  {\n    return cl_half_handle_overflow(rounding_mode, sign);\n  }\n\n  // Check for underflow\n  if (exp < (CL_HALF_MIN_EXP - CL_HALF_MANT_DIG - 1))\n  {\n    return cl_half_handle_underflow(rounding_mode, sign);\n  }\n\n  // Check for value that will become denormal\n  if (exp < -14)\n  {\n    // Denormal -> include the implicit 1 from the FP32 mantissa\n    h_exp = 0;\n    f_mant |= 1 << (CL_FLT_MANT_DIG - 1);\n\n    // Mantissa shift amount depends on exponent\n    lsb_pos = -exp + (CL_FLT_MANT_DIG - 25);\n  }\n\n  // Generate FP16 mantissa by shifting FP32 mantissa\n  uint16_t h_mant = f_mant >> lsb_pos;\n\n  // Check whether we need to round\n  uint32_t halfway = 1 << (lsb_pos - 1);\n  uint32_t mask = (halfway << 1) - 1;\n  switch (rounding_mode)\n  {\n    case CL_HALF_RTE:\n      if ((f_mant & mask) > halfway)\n      {\n        // More than halfway -> round up\n        h_mant += 1;\n      }\n      else if ((f_mant & mask) == halfway)\n      {\n        // Exactly halfway -> round to nearest even\n        if (h_mant & 0x1)\n          h_mant += 1;\n      }\n      break;\n    case CL_HALF_RTZ:\n      // 
Mantissa has already been truncated -> do nothing\n      break;\n    case CL_HALF_RTP:\n      if ((f_mant & mask) && !sign)\n      {\n        // Round positive numbers up\n        h_mant += 1;\n      }\n      break;\n    case CL_HALF_RTN:\n      if ((f_mant & mask) && sign)\n      {\n        // Round negative numbers down\n        h_mant += 1;\n      }\n      break;\n  }\n\n  // Check for mantissa overflow\n  if (h_mant & 0x400)\n  {\n    h_exp += 1;\n    h_mant = 0;\n  }\n\n  return (sign << 15) | (h_exp << 10) | h_mant;\n}\n\n\n/**\n * Convert a cl_double to a cl_half.\n */\nstatic inline cl_half cl_half_from_double(cl_double d, cl_half_rounding_mode rounding_mode)\n{\n  // Type-punning to get direct access to underlying bits\n  union\n  {\n    cl_double d;\n    uint64_t i;\n  } f64;\n  f64.d = d;\n\n  // Extract sign bit\n  uint16_t sign = f64.i >> 63;\n\n  // Extract FP64 exponent and mantissa\n  uint64_t d_exp = (f64.i >> (CL_DBL_MANT_DIG - 1)) & 0x7FF;\n  uint64_t d_mant = f64.i & (((uint64_t)1 << (CL_DBL_MANT_DIG - 1)) - 1);\n\n  // Remove FP64 exponent bias\n  int64_t exp = d_exp - CL_DBL_MAX_EXP + 1;\n\n  // Add FP16 exponent bias\n  uint16_t h_exp = (uint16_t)(exp + CL_HALF_MAX_EXP - 1);\n\n  // Position of the bit that will become the FP16 mantissa LSB\n  uint32_t lsb_pos = CL_DBL_MANT_DIG - CL_HALF_MANT_DIG;\n\n  // Check for NaN / infinity\n  if (d_exp == 0x7FF)\n  {\n    if (d_mant)\n    {\n      // NaN -> propagate mantissa and silence it\n      uint16_t h_mant = (uint16_t)(d_mant >> lsb_pos);\n      h_mant |= 0x200;\n      return (sign << 15) | CL_HALF_EXP_MASK | h_mant;\n    }\n    else\n    {\n      // Infinity -> zero mantissa\n      return (sign << 15) | CL_HALF_EXP_MASK;\n    }\n  }\n\n  // Check for zero\n  if (!d_exp && !d_mant)\n  {\n    return (sign << 15);\n  }\n\n  // Check for overflow\n  if (exp >= CL_HALF_MAX_EXP)\n  {\n    return cl_half_handle_overflow(rounding_mode, sign);\n  }\n\n  // Check for underflow\n  if (exp < 
(CL_HALF_MIN_EXP - CL_HALF_MANT_DIG - 1))\n  {\n    return cl_half_handle_underflow(rounding_mode, sign);\n  }\n\n  // Check for value that will become denormal\n  if (exp < -14)\n  {\n    // Include the implicit 1 from the FP64 mantissa\n    h_exp = 0;\n    d_mant |= (uint64_t)1 << (CL_DBL_MANT_DIG - 1);\n\n    // Mantissa shift amount depends on exponent\n    lsb_pos = (uint32_t)(-exp + (CL_DBL_MANT_DIG - 25));\n  }\n\n  // Generate FP16 mantissa by shifting FP64 mantissa\n  uint16_t h_mant = (uint16_t)(d_mant >> lsb_pos);\n\n  // Check whether we need to round\n  uint64_t halfway = (uint64_t)1 << (lsb_pos - 1);\n  uint64_t mask = (halfway << 1) - 1;\n  switch (rounding_mode)\n  {\n    case CL_HALF_RTE:\n      if ((d_mant & mask) > halfway)\n      {\n        // More than halfway -> round up\n        h_mant += 1;\n      }\n      else if ((d_mant & mask) == halfway)\n      {\n        // Exactly halfway -> round to nearest even\n        if (h_mant & 0x1)\n          h_mant += 1;\n      }\n      break;\n    case CL_HALF_RTZ:\n      // Mantissa has already been truncated -> do nothing\n      break;\n    case CL_HALF_RTP:\n      if ((d_mant & mask) && !sign)\n      {\n        // Round positive numbers up\n        h_mant += 1;\n      }\n      break;\n    case CL_HALF_RTN:\n      if ((d_mant & mask) && sign)\n      {\n        // Round negative numbers down\n        h_mant += 1;\n      }\n      break;\n  }\n\n  // Check for mantissa overflow\n  if (h_mant & 0x400)\n  {\n    h_exp += 1;\n    h_mant = 0;\n  }\n\n  return (sign << 15) | (h_exp << 10) | h_mant;\n}\n\n\n/**\n * Convert a cl_half to a cl_float.\n */\nstatic inline cl_float cl_half_to_float(cl_half h)\n{\n  // Type-punning to get direct access to underlying bits\n  union\n  {\n    cl_float f;\n    uint32_t i;\n  } f32;\n\n  // Extract sign bit\n  uint16_t sign = h >> 15;\n\n  // Extract FP16 exponent and mantissa\n  uint16_t h_exp = (h >> (CL_HALF_MANT_DIG - 1)) & 0x1F;\n  uint16_t h_mant = h & 0x3FF;\n\n  // 
Remove FP16 exponent bias\n  int32_t exp = h_exp - CL_HALF_MAX_EXP + 1;\n\n  // Add FP32 exponent bias\n  uint32_t f_exp = exp + CL_FLT_MAX_EXP - 1;\n\n  // Check for NaN / infinity\n  if (h_exp == 0x1F)\n  {\n    if (h_mant)\n    {\n      // NaN -> propagate mantissa and silence it\n      uint32_t f_mant = h_mant << (CL_FLT_MANT_DIG - CL_HALF_MANT_DIG);\n      f_mant |= 0x400000;\n      f32.i = (sign << 31) | 0x7F800000 | f_mant;\n      return f32.f;\n    }\n    else\n    {\n      // Infinity -> zero mantissa\n      f32.i = (sign << 31) | 0x7F800000;\n      return f32.f;\n    }\n  }\n\n  // Check for zero / denormal\n  if (h_exp == 0)\n  {\n    if (h_mant == 0)\n    {\n      // Zero -> zero exponent\n      f_exp = 0;\n    }\n    else\n    {\n      // Denormal -> normalize it\n      // - Shift mantissa to make most-significant 1 implicit\n      // - Adjust exponent accordingly\n      uint32_t shift = 0;\n      while ((h_mant & 0x400) == 0)\n      {\n        h_mant <<= 1;\n        shift++;\n      }\n      h_mant &= 0x3FF;\n      f_exp -= shift - 1;\n    }\n  }\n\n  f32.i = (sign << 31) | (f_exp << 23) | (h_mant << 13);\n  return f32.f;\n}\n\n\n#undef CL_HALF_EXP_MASK\n#undef CL_HALF_MAX_FINITE_MAG\n\n\n#ifdef __cplusplus\n}\n#endif\n\n\n#endif  /* OPENCL_CL_HALF_H */\n"
  },
  {
    "path": "GpuMemLatency/OpenCL/include/CL/cl_icd.h",
    "content": "/*******************************************************************************\n * Copyright (c) 2019-2020 The Khronos Group Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *    http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n ******************************************************************************/\n\n#ifndef OPENCL_CL_ICD_H\n#define OPENCL_CL_ICD_H\n\n#include <CL/cl.h>\n#include <CL/cl_egl.h>\n#include <CL/cl_ext.h>\n#include <CL/cl_gl.h>\n\n#if defined(_WIN32)\n#include <CL/cl_d3d11.h>\n#include <CL/cl_d3d10.h>\n#include <CL/cl_dx9_media_sharing.h>\n#endif\n\n#ifdef __cplusplus\nextern \"C\" {\n#endif\n\n/*\n * This file contains pointer type definitions for each of the CL API calls as\n * well as a type definition for the dispatch table used by the Khronos ICD\n * loader (see cl_khr_icd extension specification for background).\n */\n\n/* API function pointer definitions */\n\n// Platform APIs\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetPlatformIDs)(\n    cl_uint num_entries, cl_platform_id *platforms,\n    cl_uint *num_platforms) CL_API_SUFFIX__VERSION_1_0;\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetPlatformInfo)(\n    cl_platform_id platform, cl_platform_info param_name,\n    size_t param_value_size, void *param_value,\n    size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;\n\n// Device APIs\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetDeviceIDs)(\n    cl_platform_id platform, cl_device_type device_type, cl_uint num_entries,\n   
 cl_device_id *devices, cl_uint *num_devices) CL_API_SUFFIX__VERSION_1_0;\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetDeviceInfo)(\n    cl_device_id device, cl_device_info param_name, size_t param_value_size,\n    void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;\n\n#ifdef CL_VERSION_1_2\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clCreateSubDevices)(\n    cl_device_id in_device,\n    const cl_device_partition_property *partition_properties,\n    cl_uint num_entries, cl_device_id *out_devices, cl_uint *num_devices);\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainDevice)(\n    cl_device_id device) CL_API_SUFFIX__VERSION_1_2;\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseDevice)(\n    cl_device_id device) CL_API_SUFFIX__VERSION_1_2;\n\n#else\n\ntypedef void *cl_api_clCreateSubDevices;\ntypedef void *cl_api_clRetainDevice;\ntypedef void *cl_api_clReleaseDevice;\n\n#endif\n\n// Context APIs\ntypedef CL_API_ENTRY cl_context(CL_API_CALL *cl_api_clCreateContext)(\n    const cl_context_properties *properties, cl_uint num_devices,\n    const cl_device_id *devices,\n    void(CL_CALLBACK *pfn_notify)(const char *, const void *, size_t, void *),\n    void *user_data, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0;\n\ntypedef CL_API_ENTRY cl_context(CL_API_CALL *cl_api_clCreateContextFromType)(\n    const cl_context_properties *properties, cl_device_type device_type,\n    void(CL_CALLBACK *pfn_notify)(const char *, const void *, size_t, void *),\n    void *user_data, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0;\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainContext)(\n    cl_context context) CL_API_SUFFIX__VERSION_1_0;\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseContext)(\n    cl_context context) CL_API_SUFFIX__VERSION_1_0;\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetContextInfo)(\n    cl_context context, cl_context_info param_name, size_t param_value_size,\n    
void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;\n\n// Command Queue APIs\ntypedef CL_API_ENTRY cl_command_queue(CL_API_CALL *cl_api_clCreateCommandQueue)(\n    cl_context context, cl_device_id device,\n    cl_command_queue_properties properties,\n    cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0;\n\n#ifdef CL_VERSION_2_0\n\ntypedef CL_API_ENTRY\ncl_command_queue(CL_API_CALL *cl_api_clCreateCommandQueueWithProperties)(\n    cl_context /* context */, cl_device_id /* device */,\n    const cl_queue_properties * /* properties */,\n    cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_2_0;\n\n#else\n\ntypedef void *cl_api_clCreateCommandQueueWithProperties;\n\n#endif\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainCommandQueue)(\n    cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0;\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseCommandQueue)(\n    cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0;\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetCommandQueueInfo)(\n    cl_command_queue command_queue, cl_command_queue_info param_name,\n    size_t param_value_size, void *param_value,\n    size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;\n\n// Memory Object APIs\ntypedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateBuffer)(\n    cl_context context, cl_mem_flags flags, size_t size, void *host_ptr,\n    cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0;\n\n#ifdef CL_VERSION_1_2\n\ntypedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateImage)(\n    cl_context context, cl_mem_flags flags, const cl_image_format *image_format,\n    const cl_image_desc *image_desc, void *host_ptr,\n    cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2;\n\n#else\n\ntypedef void *cl_api_clCreateImage;\n\n#endif\n\n#ifdef CL_VERSION_3_0\n\ntypedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateBufferWithProperties)(\n    cl_context context, const cl_mem_properties *properties, cl_mem_flags flags,\n    size_t 
size, void *host_ptr,\n    cl_int *errcode_ret) CL_API_SUFFIX__VERSION_3_0;\n\ntypedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateImageWithProperties)(\n    cl_context context, const cl_mem_properties *properties, cl_mem_flags flags,\n    const cl_image_format *image_format, const cl_image_desc *image_desc,\n    void *host_ptr, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_3_0;\n\n#else\n\ntypedef void *cl_api_clCreateBufferWithProperties;\ntypedef void *cl_api_clCreateImageWithProperties;\n\n#endif\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainMemObject)(\n    cl_mem memobj) CL_API_SUFFIX__VERSION_1_0;\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseMemObject)(\n    cl_mem memobj) CL_API_SUFFIX__VERSION_1_0;\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetSupportedImageFormats)(\n    cl_context context, cl_mem_flags flags, cl_mem_object_type image_type,\n    cl_uint num_entries, cl_image_format *image_formats,\n    cl_uint *num_image_formats) CL_API_SUFFIX__VERSION_1_0;\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetMemObjectInfo)(\n    cl_mem memobj, cl_mem_info param_name, size_t param_value_size,\n    void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetImageInfo)(\n    cl_mem image, cl_image_info param_name, size_t param_value_size,\n    void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;\n\n#ifdef CL_VERSION_2_0\n\ntypedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreatePipe)(\n    cl_context /* context */, cl_mem_flags /* flags */,\n    cl_uint /* pipe_packet_size */, cl_uint /* pipe_max_packets */,\n    const cl_pipe_properties * /* properties */,\n    cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_2_0;\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetPipeInfo)(\n    cl_mem /* pipe */, cl_pipe_info /* param_name */,\n    size_t /* param_value_size */, void * /* param_value */,\n    size_t * /* 
param_value_size_ret */) CL_API_SUFFIX__VERSION_2_0;\n\ntypedef CL_API_ENTRY void *(CL_API_CALL *cl_api_clSVMAlloc)(\n    cl_context /* context */, cl_svm_mem_flags /* flags */, size_t /* size */,\n    unsigned int /* alignment */)CL_API_SUFFIX__VERSION_2_0;\n\ntypedef CL_API_ENTRY void(CL_API_CALL *cl_api_clSVMFree)(\n    cl_context /* context */,\n    void * /* svm_pointer */) CL_API_SUFFIX__VERSION_2_0;\n\n#else\n\ntypedef void *cl_api_clCreatePipe;\ntypedef void *cl_api_clGetPipeInfo;\ntypedef void *cl_api_clSVMAlloc;\ntypedef void *cl_api_clSVMFree;\n\n#endif\n\n// Sampler APIs\ntypedef CL_API_ENTRY cl_sampler(CL_API_CALL *cl_api_clCreateSampler)(\n    cl_context context, cl_bool normalized_coords,\n    cl_addressing_mode addressing_mode, cl_filter_mode filter_mode,\n    cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0;\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainSampler)(\n    cl_sampler sampler) CL_API_SUFFIX__VERSION_1_0;\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseSampler)(\n    cl_sampler sampler) CL_API_SUFFIX__VERSION_1_0;\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetSamplerInfo)(\n    cl_sampler sampler, cl_sampler_info param_name, size_t param_value_size,\n    void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;\n\n#ifdef CL_VERSION_2_0\n\ntypedef CL_API_ENTRY\ncl_sampler(CL_API_CALL *cl_api_clCreateSamplerWithProperties)(\n    cl_context /* context */,\n    const cl_sampler_properties * /* sampler_properties */,\n    cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_2_0;\n\n#else\n\ntypedef void *cl_api_clCreateSamplerWithProperties;\n\n#endif\n\n// Program Object APIs\ntypedef CL_API_ENTRY cl_program(CL_API_CALL *cl_api_clCreateProgramWithSource)(\n    cl_context context, cl_uint count, const char **strings,\n    const size_t *lengths, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0;\n\ntypedef CL_API_ENTRY cl_program(CL_API_CALL *cl_api_clCreateProgramWithBinary)(\n    cl_context 
context, cl_uint num_devices, const cl_device_id *device_list,\n    const size_t *lengths, const unsigned char **binaries,\n    cl_int *binary_status, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0;\n\n#ifdef CL_VERSION_1_2\n\ntypedef CL_API_ENTRY\ncl_program(CL_API_CALL *cl_api_clCreateProgramWithBuiltInKernels)(\n    cl_context context, cl_uint num_devices, const cl_device_id *device_list,\n    const char *kernel_names, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2;\n\n#else\n\ntypedef void *cl_api_clCreateProgramWithBuiltInKernels;\n\n#endif\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainProgram)(\n    cl_program program) CL_API_SUFFIX__VERSION_1_0;\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseProgram)(\n    cl_program program) CL_API_SUFFIX__VERSION_1_0;\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clBuildProgram)(\n    cl_program program, cl_uint num_devices, const cl_device_id *device_list,\n    const char *options,\n    void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data),\n    void *user_data) CL_API_SUFFIX__VERSION_1_0;\n\n#ifdef CL_VERSION_1_2\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clCompileProgram)(\n    cl_program program, cl_uint num_devices, const cl_device_id *device_list,\n    const char *options, cl_uint num_input_headers,\n    const cl_program *input_headers, const char **header_include_names,\n    void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data),\n    void *user_data) CL_API_SUFFIX__VERSION_1_2;\n\ntypedef CL_API_ENTRY cl_program(CL_API_CALL *cl_api_clLinkProgram)(\n    cl_context context, cl_uint num_devices, const cl_device_id *device_list,\n    const char *options, cl_uint num_input_programs,\n    const cl_program *input_programs,\n    void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data),\n    void *user_data, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2;\n\n#else\n\ntypedef void *cl_api_clCompileProgram;\ntypedef void 
*cl_api_clLinkProgram;\n\n#endif\n\n#ifdef CL_VERSION_2_2\n\ntypedef CL_API_ENTRY\ncl_int(CL_API_CALL *cl_api_clSetProgramSpecializationConstant)(\n    cl_program program, cl_uint spec_id, size_t spec_size,\n    const void *spec_value) CL_API_SUFFIX__VERSION_2_2;\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clSetProgramReleaseCallback)(\n    cl_program program,\n    void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data),\n    void *user_data) CL_API_SUFFIX__VERSION_2_2;\n\n#else\n\ntypedef void *cl_api_clSetProgramSpecializationConstant;\ntypedef void *cl_api_clSetProgramReleaseCallback;\n\n#endif\n\n#ifdef CL_VERSION_1_2\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clUnloadPlatformCompiler)(\n    cl_platform_id platform) CL_API_SUFFIX__VERSION_1_2;\n\n#else\n\ntypedef void *cl_api_clUnloadPlatformCompiler;\n\n#endif\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetProgramInfo)(\n    cl_program program, cl_program_info param_name, size_t param_value_size,\n    void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetProgramBuildInfo)(\n    cl_program program, cl_device_id device, cl_program_build_info param_name,\n    size_t param_value_size, void *param_value,\n    size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;\n\n// Kernel Object APIs\ntypedef CL_API_ENTRY cl_kernel(CL_API_CALL *cl_api_clCreateKernel)(\n    cl_program program, const char *kernel_name,\n    cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0;\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clCreateKernelsInProgram)(\n    cl_program program, cl_uint num_kernels, cl_kernel *kernels,\n    cl_uint *num_kernels_ret) CL_API_SUFFIX__VERSION_1_0;\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainKernel)(\n    cl_kernel kernel) CL_API_SUFFIX__VERSION_1_0;\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseKernel)(\n    cl_kernel kernel) 
CL_API_SUFFIX__VERSION_1_0;\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clSetKernelArg)(\n    cl_kernel kernel, cl_uint arg_index, size_t arg_size,\n    const void *arg_value) CL_API_SUFFIX__VERSION_1_0;\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetKernelInfo)(\n    cl_kernel kernel, cl_kernel_info param_name, size_t param_value_size,\n    void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;\n\n#ifdef CL_VERSION_1_2\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetKernelArgInfo)(\n    cl_kernel kernel, cl_uint arg_indx, cl_kernel_arg_info param_name,\n    size_t param_value_size, void *param_value,\n    size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_2;\n\n#else\n\ntypedef void *cl_api_clGetKernelArgInfo;\n\n#endif\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetKernelWorkGroupInfo)(\n    cl_kernel kernel, cl_device_id device, cl_kernel_work_group_info param_name,\n    size_t param_value_size, void *param_value,\n    size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;\n\n#ifdef CL_VERSION_2_0\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clSetKernelArgSVMPointer)(\n    cl_kernel /* kernel */, cl_uint /* arg_index */,\n    const void * /* arg_value */) CL_API_SUFFIX__VERSION_2_0;\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clSetKernelExecInfo)(\n    cl_kernel /* kernel */, cl_kernel_exec_info /* param_name */,\n    size_t /* param_value_size */,\n    const void * /* param_value */) CL_API_SUFFIX__VERSION_2_0;\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetKernelSubGroupInfoKHR)(\n    cl_kernel /* in_kernel */, cl_device_id /*in_device*/,\n    cl_kernel_sub_group_info /* param_name */, size_t /*input_value_size*/,\n    const void * /*input_value*/, size_t /*param_value_size*/,\n    void * /*param_value*/,\n    size_t * /*param_value_size_ret*/) CL_EXT_SUFFIX__VERSION_2_0;\n\n#else\n\ntypedef void *cl_api_clSetKernelArgSVMPointer;\ntypedef void 
*cl_api_clSetKernelExecInfo;\ntypedef void *cl_api_clGetKernelSubGroupInfoKHR;\n\n#endif\n\n// Event Object APIs\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clWaitForEvents)(\n    cl_uint num_events, const cl_event *event_list) CL_API_SUFFIX__VERSION_1_0;\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetEventInfo)(\n    cl_event event, cl_event_info param_name, size_t param_value_size,\n    void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainEvent)(cl_event event)\n    CL_API_SUFFIX__VERSION_1_0;\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseEvent)(cl_event event)\n    CL_API_SUFFIX__VERSION_1_0;\n\n// Profiling APIs\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetEventProfilingInfo)(\n    cl_event event, cl_profiling_info param_name, size_t param_value_size,\n    void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;\n\n// Flush and Finish APIs\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clFlush)(\n    cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0;\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clFinish)(\n    cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0;\n\n// Enqueued Commands APIs\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueReadBuffer)(\n    cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_read,\n    size_t offset, size_t cb, void *ptr, cl_uint num_events_in_wait_list,\n    const cl_event *event_wait_list,\n    cl_event *event) CL_API_SUFFIX__VERSION_1_0;\n\n#ifdef CL_VERSION_1_1\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueReadBufferRect)(\n    cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_read,\n    const size_t *buffer_origin, const size_t *host_origin,\n    const size_t *region, size_t buffer_row_pitch, size_t buffer_slice_pitch,\n    size_t host_row_pitch, size_t host_slice_pitch, void *ptr,\n    cl_uint 
num_events_in_wait_list, const cl_event *event_wait_list,\n    cl_event *event) CL_API_SUFFIX__VERSION_1_1;\n\n#else\n\ntypedef void *cl_api_clEnqueueReadBufferRect;\n\n#endif\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueWriteBuffer)(\n    cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_write,\n    size_t offset, size_t cb, const void *ptr, cl_uint num_events_in_wait_list,\n    const cl_event *event_wait_list,\n    cl_event *event) CL_API_SUFFIX__VERSION_1_0;\n\n#ifdef CL_VERSION_1_1\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueWriteBufferRect)(\n    cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_read,\n    const size_t *buffer_origin, const size_t *host_origin,\n    const size_t *region, size_t buffer_row_pitch, size_t buffer_slice_pitch,\n    size_t host_row_pitch, size_t host_slice_pitch, const void *ptr,\n    cl_uint num_events_in_wait_list, const cl_event *event_wait_list,\n    cl_event *event) CL_API_SUFFIX__VERSION_1_1;\n\n#else\n\ntypedef void *cl_api_clEnqueueWriteBufferRect;\n\n#endif\n\n#ifdef CL_VERSION_1_2\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueFillBuffer)(\n    cl_command_queue command_queue, cl_mem buffer, const void *pattern,\n    size_t pattern_size, size_t offset, size_t cb,\n    cl_uint num_events_in_wait_list, const cl_event *event_wait_list,\n    cl_event *event) CL_API_SUFFIX__VERSION_1_2;\n\n#else\n\ntypedef void *cl_api_clEnqueueFillBuffer;\n\n#endif\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueCopyBuffer)(\n    cl_command_queue command_queue, cl_mem src_buffer, cl_mem dst_buffer,\n    size_t src_offset, size_t dst_offset, size_t cb,\n    cl_uint num_events_in_wait_list, const cl_event *event_wait_list,\n    cl_event *event) CL_API_SUFFIX__VERSION_1_0;\n\n#ifdef CL_VERSION_1_1\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueCopyBufferRect)(\n    cl_command_queue command_queue, cl_mem src_buffer, cl_mem dst_buffer,\n    const size_t 
*src_origin, const size_t *dst_origin, const size_t *region,\n    size_t src_row_pitch, size_t src_slice_pitch, size_t dst_row_pitch,\n    size_t dst_slice_pitch, cl_uint num_events_in_wait_list,\n    const cl_event *event_wait_list,\n    cl_event *event) CL_API_SUFFIX__VERSION_1_1;\n\n#else\n\ntypedef void *cl_api_clEnqueueCopyBufferRect;\n\n#endif\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueReadImage)(\n    cl_command_queue command_queue, cl_mem image, cl_bool blocking_read,\n    const size_t *origin, const size_t *region, size_t row_pitch,\n    size_t slice_pitch, void *ptr, cl_uint num_events_in_wait_list,\n    const cl_event *event_wait_list,\n    cl_event *event) CL_API_SUFFIX__VERSION_1_0;\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueWriteImage)(\n    cl_command_queue command_queue, cl_mem image, cl_bool blocking_write,\n    const size_t *origin, const size_t *region, size_t input_row_pitch,\n    size_t input_slice_pitch, const void *ptr, cl_uint num_events_in_wait_list,\n    const cl_event *event_wait_list,\n    cl_event *event) CL_API_SUFFIX__VERSION_1_0;\n\n#ifdef CL_VERSION_1_2\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueFillImage)(\n    cl_command_queue command_queue, cl_mem image, const void *fill_color,\n    const size_t origin[3], const size_t region[3],\n    cl_uint num_events_in_wait_list, const cl_event *event_wait_list,\n    cl_event *event) CL_API_SUFFIX__VERSION_1_2;\n\n#else\n\ntypedef void *cl_api_clEnqueueFillImage;\n\n#endif\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueCopyImage)(\n    cl_command_queue command_queue, cl_mem src_image, cl_mem dst_image,\n    const size_t *src_origin, const size_t *dst_origin, const size_t *region,\n    cl_uint num_events_in_wait_list, const cl_event *event_wait_list,\n    cl_event *event) CL_API_SUFFIX__VERSION_1_0;\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueCopyImageToBuffer)(\n    cl_command_queue command_queue, cl_mem 
src_image, cl_mem dst_buffer,\n    const size_t *src_origin, const size_t *region, size_t dst_offset,\n    cl_uint num_events_in_wait_list, const cl_event *event_wait_list,\n    cl_event *event) CL_API_SUFFIX__VERSION_1_0;\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueCopyBufferToImage)(\n    cl_command_queue command_queue, cl_mem src_buffer, cl_mem dst_image,\n    size_t src_offset, const size_t *dst_origin, const size_t *region,\n    cl_uint num_events_in_wait_list, const cl_event *event_wait_list,\n    cl_event *event) CL_API_SUFFIX__VERSION_1_0;\n\ntypedef CL_API_ENTRY void *(CL_API_CALL *cl_api_clEnqueueMapBuffer)(\n    cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_map,\n    cl_map_flags map_flags, size_t offset, size_t cb,\n    cl_uint num_events_in_wait_list, const cl_event *event_wait_list,\n    cl_event *event, cl_int *errcode_ret)CL_API_SUFFIX__VERSION_1_0;\n\ntypedef CL_API_ENTRY void *(CL_API_CALL *cl_api_clEnqueueMapImage)(\n    cl_command_queue command_queue, cl_mem image, cl_bool blocking_map,\n    cl_map_flags map_flags, const size_t *origin, const size_t *region,\n    size_t *image_row_pitch, size_t *image_slice_pitch,\n    cl_uint num_events_in_wait_list, const cl_event *event_wait_list,\n    cl_event *event, cl_int *errcode_ret)CL_API_SUFFIX__VERSION_1_0;\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueUnmapMemObject)(\n    cl_command_queue command_queue, cl_mem memobj, void *mapped_ptr,\n    cl_uint num_events_in_wait_list, const cl_event *event_wait_list,\n    cl_event *event) CL_API_SUFFIX__VERSION_1_0;\n\n#ifdef CL_VERSION_1_2\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueMigrateMemObjects)(\n    cl_command_queue command_queue, cl_uint num_mem_objects,\n    const cl_mem *mem_objects, cl_mem_migration_flags flags,\n    cl_uint num_events_in_wait_list, const cl_event *event_wait_list,\n    cl_event *event) CL_API_SUFFIX__VERSION_1_2;\n\n#else\n\ntypedef void 
*cl_api_clEnqueueMigrateMemObjects;\n\n#endif\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueNDRangeKernel)(\n    cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim,\n    const size_t *global_work_offset, const size_t *global_work_size,\n    const size_t *local_work_size, cl_uint num_events_in_wait_list,\n    const cl_event *event_wait_list,\n    cl_event *event) CL_API_SUFFIX__VERSION_1_0;\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueTask)(\n    cl_command_queue command_queue, cl_kernel kernel,\n    cl_uint num_events_in_wait_list, const cl_event *event_wait_list,\n    cl_event *event) CL_API_SUFFIX__VERSION_1_0;\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueNativeKernel)(\n    cl_command_queue command_queue, void(CL_CALLBACK *user_func)(void *),\n    void *args, size_t cb_args, cl_uint num_mem_objects, const cl_mem *mem_list,\n    const void **args_mem_loc, cl_uint num_events_in_wait_list,\n    const cl_event *event_wait_list,\n    cl_event *event) CL_API_SUFFIX__VERSION_1_0;\n\n#ifdef CL_VERSION_1_2\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueMarkerWithWaitList)(\n    cl_command_queue command_queue, cl_uint num_events_in_wait_list,\n    const cl_event *event_wait_list,\n    cl_event *event) CL_API_SUFFIX__VERSION_1_2;\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueBarrierWithWaitList)(\n    cl_command_queue command_queue, cl_uint num_events_in_wait_list,\n    const cl_event *event_wait_list,\n    cl_event *event) CL_API_SUFFIX__VERSION_1_2;\n\ntypedef CL_API_ENTRY void *(\n    CL_API_CALL *cl_api_clGetExtensionFunctionAddressForPlatform)(\n    cl_platform_id platform,\n    const char *function_name)CL_API_SUFFIX__VERSION_1_2;\n\n#else\n\ntypedef void *cl_api_clEnqueueMarkerWithWaitList;\ntypedef void *cl_api_clEnqueueBarrierWithWaitList;\ntypedef void *cl_api_clGetExtensionFunctionAddressForPlatform;\n\n#endif\n\n// Shared Virtual Memory APIs\n\n#ifdef 
CL_VERSION_2_0\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueSVMFree)(\n    cl_command_queue /* command_queue */, cl_uint /* num_svm_pointers */,\n    void ** /* svm_pointers */,\n    void(CL_CALLBACK *pfn_free_func)(cl_command_queue /* queue */,\n                                     cl_uint /* num_svm_pointers */,\n                                     void ** /* svm_pointers[] */,\n                                     void * /* user_data */),\n    void * /* user_data */, cl_uint /* num_events_in_wait_list */,\n    const cl_event * /* event_wait_list */,\n    cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0;\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueSVMMemcpy)(\n    cl_command_queue /* command_queue */, cl_bool /* blocking_copy */,\n    void * /* dst_ptr */, const void * /* src_ptr */, size_t /* size */,\n    cl_uint /* num_events_in_wait_list */,\n    const cl_event * /* event_wait_list */,\n    cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0;\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueSVMMemFill)(\n    cl_command_queue /* command_queue */, void * /* svm_ptr */,\n    const void * /* pattern */, size_t /* pattern_size */, size_t /* size */,\n    cl_uint /* num_events_in_wait_list */,\n    const cl_event * /* event_wait_list */,\n    cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0;\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueSVMMap)(\n    cl_command_queue /* command_queue */, cl_bool /* blocking_map */,\n    cl_map_flags /* map_flags */, void * /* svm_ptr */, size_t /* size */,\n    cl_uint /* num_events_in_wait_list */,\n    const cl_event * /* event_wait_list */,\n    cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0;\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueSVMUnmap)(\n    cl_command_queue /* command_queue */, void * /* svm_ptr */,\n    cl_uint /* num_events_in_wait_list */,\n    const cl_event * /* event_wait_list */,\n    cl_event * /* event */) 
CL_API_SUFFIX__VERSION_2_0;\n\n#else\n\ntypedef void *cl_api_clEnqueueSVMFree;\ntypedef void *cl_api_clEnqueueSVMMemcpy;\ntypedef void *cl_api_clEnqueueSVMMemFill;\ntypedef void *cl_api_clEnqueueSVMMap;\ntypedef void *cl_api_clEnqueueSVMUnmap;\n\n#endif\n\n// Deprecated APIs\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clSetCommandQueueProperty)(\n    cl_command_queue command_queue, cl_command_queue_properties properties,\n    cl_bool enable, cl_command_queue_properties *old_properties)\n    CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED;\n\ntypedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateImage2D)(\n    cl_context context, cl_mem_flags flags, const cl_image_format *image_format,\n    size_t image_width, size_t image_height, size_t image_row_pitch,\n    void *host_ptr, cl_int *errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;\n\ntypedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateImage3D)(\n    cl_context context, cl_mem_flags flags, const cl_image_format *image_format,\n    size_t image_width, size_t image_height, size_t image_depth,\n    size_t image_row_pitch, size_t image_slice_pitch, void *host_ptr,\n    cl_int *errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clUnloadCompiler)(void)\n    CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueMarker)(\n    cl_command_queue command_queue,\n    cl_event *event) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueWaitForEvents)(\n    cl_command_queue command_queue, cl_uint num_events,\n    const cl_event *event_list) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueBarrier)(\n    cl_command_queue command_queue) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;\n\ntypedef CL_API_ENTRY void *(CL_API_CALL *cl_api_clGetExtensionFunctionAddress)(\n    const char *function_name)CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;\n\n// GL and 
other APIs\ntypedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromGLBuffer)(\n    cl_context context, cl_mem_flags flags, cl_GLuint bufobj,\n    int *errcode_ret) CL_API_SUFFIX__VERSION_1_0;\n\ntypedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromGLTexture)(\n    cl_context context, cl_mem_flags flags, cl_GLenum target, cl_GLint miplevel,\n    cl_GLuint texture, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2;\n\ntypedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromGLTexture2D)(\n    cl_context context, cl_mem_flags flags, cl_GLenum target, cl_GLint miplevel,\n    cl_GLuint texture, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0;\n\ntypedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromGLTexture3D)(\n    cl_context context, cl_mem_flags flags, cl_GLenum target, cl_GLint miplevel,\n    cl_GLuint texture, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0;\n\ntypedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromGLRenderbuffer)(\n    cl_context context, cl_mem_flags flags, cl_GLuint renderbuffer,\n    cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0;\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetGLObjectInfo)(\n    cl_mem memobj, cl_gl_object_type *gl_object_type,\n    cl_GLuint *gl_object_name) CL_API_SUFFIX__VERSION_1_0;\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetGLTextureInfo)(\n    cl_mem memobj, cl_gl_texture_info param_name, size_t param_value_size,\n    void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueAcquireGLObjects)(\n    cl_command_queue command_queue, cl_uint num_objects,\n    const cl_mem *mem_objects, cl_uint num_events_in_wait_list,\n    const cl_event *event_wait_list,\n    cl_event *event) CL_API_SUFFIX__VERSION_1_0;\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueReleaseGLObjects)(\n    cl_command_queue command_queue, cl_uint num_objects,\n    const cl_mem *mem_objects, cl_uint 
num_events_in_wait_list,\n    const cl_event *event_wait_list,\n    cl_event *event) CL_API_SUFFIX__VERSION_1_0;\n\n/* cl_khr_gl_sharing */\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetGLContextInfoKHR)(\n    const cl_context_properties *properties, cl_gl_context_info param_name,\n    size_t param_value_size, void *param_value, size_t *param_value_size_ret);\n\n/* cl_khr_gl_event */\ntypedef CL_API_ENTRY cl_event(CL_API_CALL *cl_api_clCreateEventFromGLsyncKHR)(\n    cl_context context, cl_GLsync sync, cl_int *errcode_ret);\n\n#if defined(_WIN32)\n\n/* cl_khr_d3d10_sharing */\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetDeviceIDsFromD3D10KHR)(\n    cl_platform_id platform, cl_d3d10_device_source_khr d3d_device_source,\n    void *d3d_object, cl_d3d10_device_set_khr d3d_device_set,\n    cl_uint num_entries, cl_device_id *devices,\n    cl_uint *num_devices) CL_API_SUFFIX__VERSION_1_0;\n\ntypedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromD3D10BufferKHR)(\n    cl_context context, cl_mem_flags flags, ID3D10Buffer *resource,\n    cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0;\n\ntypedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromD3D10Texture2DKHR)(\n    cl_context context, cl_mem_flags flags, ID3D10Texture2D *resource,\n    UINT subresource, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0;\n\ntypedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromD3D10Texture3DKHR)(\n    cl_context context, cl_mem_flags flags, ID3D10Texture3D *resource,\n    UINT subresource, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0;\n\ntypedef CL_API_ENTRY\ncl_int(CL_API_CALL *cl_api_clEnqueueAcquireD3D10ObjectsKHR)(\n    cl_command_queue command_queue, cl_uint num_objects,\n    const cl_mem *mem_objects, cl_uint num_events_in_wait_list,\n    const cl_event *event_wait_list,\n    cl_event *event) CL_API_SUFFIX__VERSION_1_0;\n\ntypedef CL_API_ENTRY\ncl_int(CL_API_CALL *cl_api_clEnqueueReleaseD3D10ObjectsKHR)(\n    cl_command_queue command_queue, cl_uint 
num_objects,\n    const cl_mem *mem_objects, cl_uint num_events_in_wait_list,\n    const cl_event *event_wait_list,\n    cl_event *event) CL_API_SUFFIX__VERSION_1_0;\n\nextern CL_API_ENTRY cl_int CL_API_CALL clGetDeviceIDsFromD3D10KHR(\n    cl_platform_id platform, cl_d3d10_device_source_khr d3d_device_source,\n    void *d3d_object, cl_d3d10_device_set_khr d3d_device_set,\n    cl_uint num_entries, cl_device_id *devices, cl_uint *num_devices);\n\nextern CL_API_ENTRY cl_mem CL_API_CALL\nclCreateFromD3D10BufferKHR(cl_context context, cl_mem_flags flags,\n                           ID3D10Buffer *resource, cl_int *errcode_ret);\n\nextern CL_API_ENTRY cl_mem CL_API_CALL clCreateFromD3D10Texture2DKHR(\n    cl_context context, cl_mem_flags flags, ID3D10Texture2D *resource,\n    UINT subresource, cl_int *errcode_ret);\n\nextern CL_API_ENTRY cl_mem CL_API_CALL clCreateFromD3D10Texture3DKHR(\n    cl_context context, cl_mem_flags flags, ID3D10Texture3D *resource,\n    UINT subresource, cl_int *errcode_ret);\n\nextern CL_API_ENTRY cl_int CL_API_CALL clEnqueueAcquireD3D10ObjectsKHR(\n    cl_command_queue command_queue, cl_uint num_objects,\n    const cl_mem *mem_objects, cl_uint num_events_in_wait_list,\n    const cl_event *event_wait_list, cl_event *event);\n\nextern CL_API_ENTRY cl_int CL_API_CALL clEnqueueReleaseD3D10ObjectsKHR(\n    cl_command_queue command_queue, cl_uint num_objects,\n    const cl_mem *mem_objects, cl_uint num_events_in_wait_list,\n    const cl_event *event_wait_list, cl_event *event);\n\n/* cl_khr_d3d11_sharing */\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetDeviceIDsFromD3D11KHR)(\n    cl_platform_id platform, cl_d3d11_device_source_khr d3d_device_source,\n    void *d3d_object, cl_d3d11_device_set_khr d3d_device_set,\n    cl_uint num_entries, cl_device_id *devices,\n    cl_uint *num_devices) CL_API_SUFFIX__VERSION_1_2;\n\ntypedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromD3D11BufferKHR)(\n    cl_context context, cl_mem_flags flags, 
ID3D11Buffer *resource,\n    cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2;\n\ntypedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromD3D11Texture2DKHR)(\n    cl_context context, cl_mem_flags flags, ID3D11Texture2D *resource,\n    UINT subresource, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2;\n\ntypedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromD3D11Texture3DKHR)(\n    cl_context context, cl_mem_flags flags, ID3D11Texture3D *resource,\n    UINT subresource, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2;\n\ntypedef CL_API_ENTRY\ncl_int(CL_API_CALL *cl_api_clEnqueueAcquireD3D11ObjectsKHR)(\n    cl_command_queue command_queue, cl_uint num_objects,\n    const cl_mem *mem_objects, cl_uint num_events_in_wait_list,\n    const cl_event *event_wait_list,\n    cl_event *event) CL_API_SUFFIX__VERSION_1_2;\n\ntypedef CL_API_ENTRY\ncl_int(CL_API_CALL *cl_api_clEnqueueReleaseD3D11ObjectsKHR)(\n    cl_command_queue command_queue, cl_uint num_objects,\n    const cl_mem *mem_objects, cl_uint num_events_in_wait_list,\n    const cl_event *event_wait_list,\n    cl_event *event) CL_API_SUFFIX__VERSION_1_2;\n\n/* cl_khr_dx9_media_sharing */\ntypedef CL_API_ENTRY\ncl_int(CL_API_CALL *cl_api_clGetDeviceIDsFromDX9MediaAdapterKHR)(\n    cl_platform_id platform, cl_uint num_media_adapters,\n    cl_dx9_media_adapter_type_khr *media_adapters_type, void *media_adapters,\n    cl_dx9_media_adapter_set_khr media_adapter_set, cl_uint num_entries,\n    cl_device_id *devices, cl_uint *num_devices) CL_API_SUFFIX__VERSION_1_2;\n\ntypedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromDX9MediaSurfaceKHR)(\n    cl_context context, cl_mem_flags flags,\n    cl_dx9_media_adapter_type_khr adapter_type, void *surface_info,\n    cl_uint plane, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2;\n\ntypedef CL_API_ENTRY\ncl_int(CL_API_CALL *cl_api_clEnqueueAcquireDX9MediaSurfacesKHR)(\n    cl_command_queue command_queue, cl_uint num_objects,\n    const cl_mem *mem_objects, cl_uint 
num_events_in_wait_list,\n    const cl_event *event_wait_list,\n    cl_event *event) CL_API_SUFFIX__VERSION_1_2;\n\ntypedef CL_API_ENTRY\ncl_int(CL_API_CALL *cl_api_clEnqueueReleaseDX9MediaSurfacesKHR)(\n    cl_command_queue command_queue, cl_uint num_objects,\n    const cl_mem *mem_objects, cl_uint num_events_in_wait_list,\n    const cl_event *event_wait_list,\n    cl_event *event) CL_API_SUFFIX__VERSION_1_2;\n\n/* cl_khr_d3d11_sharing */\nextern CL_API_ENTRY cl_int CL_API_CALL clGetDeviceIDsFromD3D11KHR(\n    cl_platform_id platform, cl_d3d11_device_source_khr d3d_device_source,\n    void *d3d_object, cl_d3d11_device_set_khr d3d_device_set,\n    cl_uint num_entries, cl_device_id *devices, cl_uint *num_devices);\n\nextern CL_API_ENTRY cl_mem CL_API_CALL\nclCreateFromD3D11BufferKHR(cl_context context, cl_mem_flags flags,\n                           ID3D11Buffer *resource, cl_int *errcode_ret);\n\nextern CL_API_ENTRY cl_mem CL_API_CALL clCreateFromD3D11Texture2DKHR(\n    cl_context context, cl_mem_flags flags, ID3D11Texture2D *resource,\n    UINT subresource, cl_int *errcode_ret);\n\nextern CL_API_ENTRY cl_mem CL_API_CALL clCreateFromD3D11Texture3DKHR(\n    cl_context context, cl_mem_flags flags, ID3D11Texture3D *resource,\n    UINT subresource, cl_int *errcode_ret);\n\nextern CL_API_ENTRY cl_int CL_API_CALL clEnqueueAcquireD3D11ObjectsKHR(\n    cl_command_queue command_queue, cl_uint num_objects,\n    const cl_mem *mem_objects, cl_uint num_events_in_wait_list,\n    const cl_event *event_wait_list, cl_event *event);\n\nextern CL_API_ENTRY cl_int CL_API_CALL clEnqueueReleaseD3D11ObjectsKHR(\n    cl_command_queue command_queue, cl_uint num_objects,\n    const cl_mem *mem_objects, cl_uint num_events_in_wait_list,\n    const cl_event *event_wait_list, cl_event *event);\n\n/* cl_khr_dx9_media_sharing */\nextern CL_API_ENTRY cl_int CL_API_CALL clGetDeviceIDsFromDX9MediaAdapterKHR(\n    cl_platform_id platform, cl_uint num_media_adapters,\n    cl_dx9_media_adapter_type_khr 
*media_adapter_type, void *media_adapters,\n    cl_dx9_media_adapter_set_khr media_adapter_set, cl_uint num_entries,\n    cl_device_id *devices, cl_uint *num_devices);\n\nextern CL_API_ENTRY cl_mem CL_API_CALL clCreateFromDX9MediaSurfaceKHR(\n    cl_context context, cl_mem_flags flags,\n    cl_dx9_media_adapter_type_khr adapter_type, void *surface_info,\n    cl_uint plane, cl_int *errcode_ret);\n\nextern CL_API_ENTRY cl_int CL_API_CALL clEnqueueAcquireDX9MediaSurfacesKHR(\n    cl_command_queue command_queue, cl_uint num_objects,\n    const cl_mem *mem_objects, cl_uint num_events_in_wait_list,\n    const cl_event *event_wait_list, cl_event *event);\n\nextern CL_API_ENTRY cl_int CL_API_CALL clEnqueueReleaseDX9MediaSurfacesKHR(\n    cl_command_queue command_queue, cl_uint num_objects,\n    const cl_mem *mem_objects, cl_uint num_events_in_wait_list,\n    const cl_event *event_wait_list, cl_event *event);\n\n#else\n\n/* cl_khr_d3d10_sharing */\ntypedef void *cl_api_clGetDeviceIDsFromD3D10KHR;\ntypedef void *cl_api_clCreateFromD3D10BufferKHR;\ntypedef void *cl_api_clCreateFromD3D10Texture2DKHR;\ntypedef void *cl_api_clCreateFromD3D10Texture3DKHR;\ntypedef void *cl_api_clEnqueueAcquireD3D10ObjectsKHR;\ntypedef void *cl_api_clEnqueueReleaseD3D10ObjectsKHR;\n\n/* cl_khr_d3d11_sharing */\ntypedef void *cl_api_clGetDeviceIDsFromD3D11KHR;\ntypedef void *cl_api_clCreateFromD3D11BufferKHR;\ntypedef void *cl_api_clCreateFromD3D11Texture2DKHR;\ntypedef void *cl_api_clCreateFromD3D11Texture3DKHR;\ntypedef void *cl_api_clEnqueueAcquireD3D11ObjectsKHR;\ntypedef void *cl_api_clEnqueueReleaseD3D11ObjectsKHR;\n\n/* cl_khr_dx9_media_sharing */\ntypedef void *cl_api_clCreateFromDX9MediaSurfaceKHR;\ntypedef void *cl_api_clEnqueueAcquireDX9MediaSurfacesKHR;\ntypedef void *cl_api_clEnqueueReleaseDX9MediaSurfacesKHR;\ntypedef void *cl_api_clGetDeviceIDsFromDX9MediaAdapterKHR;\n\n#endif\n\n/* OpenCL 1.1 */\n\n#ifdef CL_VERSION_1_1\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL 
*cl_api_clSetEventCallback)(\n    cl_event /* event */, cl_int /* command_exec_callback_type */,\n    void(CL_CALLBACK * /* pfn_notify */)(cl_event, cl_int, void *),\n    void * /* user_data */) CL_API_SUFFIX__VERSION_1_1;\n\ntypedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateSubBuffer)(\n    cl_mem /* buffer */, cl_mem_flags /* flags */,\n    cl_buffer_create_type /* buffer_create_type */,\n    const void * /* buffer_create_info */,\n    cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1;\n\ntypedef CL_API_ENTRY\ncl_int(CL_API_CALL *cl_api_clSetMemObjectDestructorCallback)(\n    cl_mem /* memobj */,\n    void(CL_CALLBACK * /*pfn_notify*/)(cl_mem /* memobj */,\n                                       void * /*user_data*/),\n    void * /*user_data */) CL_API_SUFFIX__VERSION_1_1;\n\ntypedef CL_API_ENTRY cl_event(CL_API_CALL *cl_api_clCreateUserEvent)(\n    cl_context /* context */,\n    cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1;\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clSetUserEventStatus)(\n    cl_event /* event */,\n    cl_int /* execution_status */) CL_API_SUFFIX__VERSION_1_1;\n\n#else\n\ntypedef void *cl_api_clSetEventCallback;\ntypedef void *cl_api_clCreateSubBuffer;\ntypedef void *cl_api_clSetMemObjectDestructorCallback;\ntypedef void *cl_api_clCreateUserEvent;\ntypedef void *cl_api_clSetUserEventStatus;\n\n#endif\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clCreateSubDevicesEXT)(\n    cl_device_id in_device,\n    const cl_device_partition_property_ext *partition_properties,\n    cl_uint num_entries, cl_device_id *out_devices, cl_uint *num_devices);\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainDeviceEXT)(\n    cl_device_id device) CL_API_SUFFIX__VERSION_1_0;\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseDeviceEXT)(\n    cl_device_id device) CL_API_SUFFIX__VERSION_1_0;\n\n/* cl_khr_egl_image */\ntypedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromEGLImageKHR)(\n    cl_context context, 
CLeglDisplayKHR display, CLeglImageKHR image,\n    cl_mem_flags flags, const cl_egl_image_properties_khr *properties,\n    cl_int *errcode_ret);\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueAcquireEGLObjectsKHR)(\n    cl_command_queue command_queue, cl_uint num_objects,\n    const cl_mem *mem_objects, cl_uint num_events_in_wait_list,\n    const cl_event *event_wait_list, cl_event *event);\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueReleaseEGLObjectsKHR)(\n    cl_command_queue command_queue, cl_uint num_objects,\n    const cl_mem *mem_objects, cl_uint num_events_in_wait_list,\n    const cl_event *event_wait_list, cl_event *event);\n\n/* cl_khr_egl_event */\ntypedef CL_API_ENTRY cl_event(CL_API_CALL *cl_api_clCreateEventFromEGLSyncKHR)(\n    cl_context context, CLeglSyncKHR sync, CLeglDisplayKHR display,\n    cl_int *errcode_ret);\n\n#ifdef CL_VERSION_2_1\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clSetDefaultDeviceCommandQueue)(\n    cl_context context, cl_device_id device,\n    cl_command_queue command_queue) CL_API_SUFFIX__VERSION_2_1;\n\ntypedef CL_API_ENTRY cl_program(CL_API_CALL *cl_api_clCreateProgramWithIL)(\n    cl_context context, const void *il, size_t length,\n    cl_int *errcode_ret) CL_API_SUFFIX__VERSION_2_1;\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetKernelSubGroupInfo)(\n    cl_kernel kernel, cl_device_id device, cl_kernel_sub_group_info param_name,\n    size_t input_value_size, const void *input_value, size_t param_value_size,\n    void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_2_1;\n\ntypedef CL_API_ENTRY cl_kernel(CL_API_CALL *cl_api_clCloneKernel)(\n    cl_kernel source_kernel, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_2_1;\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueSVMMigrateMem)(\n    cl_command_queue command_queue, cl_uint num_svm_pointers,\n    const void **svm_pointers, const size_t *sizes,\n    cl_mem_migration_flags flags, cl_uint 
num_events_in_wait_list,\n    const cl_event *event_wait_list,\n    cl_event *event) CL_API_SUFFIX__VERSION_2_1;\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetDeviceAndHostTimer)(\n    cl_device_id device, cl_ulong *device_timestamp,\n    cl_ulong *host_timestamp) CL_API_SUFFIX__VERSION_2_1;\n\ntypedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetHostTimer)(\n    cl_device_id device, cl_ulong *host_timestamp) CL_API_SUFFIX__VERSION_2_1;\n\n#else\n\ntypedef void *cl_api_clSetDefaultDeviceCommandQueue;\ntypedef void *cl_api_clCreateProgramWithIL;\ntypedef void *cl_api_clGetKernelSubGroupInfo;\ntypedef void *cl_api_clCloneKernel;\ntypedef void *cl_api_clEnqueueSVMMigrateMem;\ntypedef void *cl_api_clGetDeviceAndHostTimer;\ntypedef void *cl_api_clGetHostTimer;\n\n#endif\n\n/* Vendor dispatch table structure */\n\ntypedef struct _cl_icd_dispatch {\n  /* OpenCL 1.0 */\n  cl_api_clGetPlatformIDs clGetPlatformIDs;\n  cl_api_clGetPlatformInfo clGetPlatformInfo;\n  cl_api_clGetDeviceIDs clGetDeviceIDs;\n  cl_api_clGetDeviceInfo clGetDeviceInfo;\n  cl_api_clCreateContext clCreateContext;\n  cl_api_clCreateContextFromType clCreateContextFromType;\n  cl_api_clRetainContext clRetainContext;\n  cl_api_clReleaseContext clReleaseContext;\n  cl_api_clGetContextInfo clGetContextInfo;\n  cl_api_clCreateCommandQueue clCreateCommandQueue;\n  cl_api_clRetainCommandQueue clRetainCommandQueue;\n  cl_api_clReleaseCommandQueue clReleaseCommandQueue;\n  cl_api_clGetCommandQueueInfo clGetCommandQueueInfo;\n  cl_api_clSetCommandQueueProperty clSetCommandQueueProperty;\n  cl_api_clCreateBuffer clCreateBuffer;\n  cl_api_clCreateImage2D clCreateImage2D;\n  cl_api_clCreateImage3D clCreateImage3D;\n  cl_api_clRetainMemObject clRetainMemObject;\n  cl_api_clReleaseMemObject clReleaseMemObject;\n  cl_api_clGetSupportedImageFormats clGetSupportedImageFormats;\n  cl_api_clGetMemObjectInfo clGetMemObjectInfo;\n  cl_api_clGetImageInfo clGetImageInfo;\n  cl_api_clCreateSampler clCreateSampler;\n  
cl_api_clRetainSampler clRetainSampler;\n  cl_api_clReleaseSampler clReleaseSampler;\n  cl_api_clGetSamplerInfo clGetSamplerInfo;\n  cl_api_clCreateProgramWithSource clCreateProgramWithSource;\n  cl_api_clCreateProgramWithBinary clCreateProgramWithBinary;\n  cl_api_clRetainProgram clRetainProgram;\n  cl_api_clReleaseProgram clReleaseProgram;\n  cl_api_clBuildProgram clBuildProgram;\n  cl_api_clUnloadCompiler clUnloadCompiler;\n  cl_api_clGetProgramInfo clGetProgramInfo;\n  cl_api_clGetProgramBuildInfo clGetProgramBuildInfo;\n  cl_api_clCreateKernel clCreateKernel;\n  cl_api_clCreateKernelsInProgram clCreateKernelsInProgram;\n  cl_api_clRetainKernel clRetainKernel;\n  cl_api_clReleaseKernel clReleaseKernel;\n  cl_api_clSetKernelArg clSetKernelArg;\n  cl_api_clGetKernelInfo clGetKernelInfo;\n  cl_api_clGetKernelWorkGroupInfo clGetKernelWorkGroupInfo;\n  cl_api_clWaitForEvents clWaitForEvents;\n  cl_api_clGetEventInfo clGetEventInfo;\n  cl_api_clRetainEvent clRetainEvent;\n  cl_api_clReleaseEvent clReleaseEvent;\n  cl_api_clGetEventProfilingInfo clGetEventProfilingInfo;\n  cl_api_clFlush clFlush;\n  cl_api_clFinish clFinish;\n  cl_api_clEnqueueReadBuffer clEnqueueReadBuffer;\n  cl_api_clEnqueueWriteBuffer clEnqueueWriteBuffer;\n  cl_api_clEnqueueCopyBuffer clEnqueueCopyBuffer;\n  cl_api_clEnqueueReadImage clEnqueueReadImage;\n  cl_api_clEnqueueWriteImage clEnqueueWriteImage;\n  cl_api_clEnqueueCopyImage clEnqueueCopyImage;\n  cl_api_clEnqueueCopyImageToBuffer clEnqueueCopyImageToBuffer;\n  cl_api_clEnqueueCopyBufferToImage clEnqueueCopyBufferToImage;\n  cl_api_clEnqueueMapBuffer clEnqueueMapBuffer;\n  cl_api_clEnqueueMapImage clEnqueueMapImage;\n  cl_api_clEnqueueUnmapMemObject clEnqueueUnmapMemObject;\n  cl_api_clEnqueueNDRangeKernel clEnqueueNDRangeKernel;\n  cl_api_clEnqueueTask clEnqueueTask;\n  cl_api_clEnqueueNativeKernel clEnqueueNativeKernel;\n  cl_api_clEnqueueMarker clEnqueueMarker;\n  cl_api_clEnqueueWaitForEvents clEnqueueWaitForEvents;\n  
cl_api_clEnqueueBarrier clEnqueueBarrier;\n  cl_api_clGetExtensionFunctionAddress clGetExtensionFunctionAddress;\n  cl_api_clCreateFromGLBuffer clCreateFromGLBuffer;\n  cl_api_clCreateFromGLTexture2D clCreateFromGLTexture2D;\n  cl_api_clCreateFromGLTexture3D clCreateFromGLTexture3D;\n  cl_api_clCreateFromGLRenderbuffer clCreateFromGLRenderbuffer;\n  cl_api_clGetGLObjectInfo clGetGLObjectInfo;\n  cl_api_clGetGLTextureInfo clGetGLTextureInfo;\n  cl_api_clEnqueueAcquireGLObjects clEnqueueAcquireGLObjects;\n  cl_api_clEnqueueReleaseGLObjects clEnqueueReleaseGLObjects;\n  cl_api_clGetGLContextInfoKHR clGetGLContextInfoKHR;\n\n  /* cl_khr_d3d10_sharing */\n  cl_api_clGetDeviceIDsFromD3D10KHR clGetDeviceIDsFromD3D10KHR;\n  cl_api_clCreateFromD3D10BufferKHR clCreateFromD3D10BufferKHR;\n  cl_api_clCreateFromD3D10Texture2DKHR clCreateFromD3D10Texture2DKHR;\n  cl_api_clCreateFromD3D10Texture3DKHR clCreateFromD3D10Texture3DKHR;\n  cl_api_clEnqueueAcquireD3D10ObjectsKHR clEnqueueAcquireD3D10ObjectsKHR;\n  cl_api_clEnqueueReleaseD3D10ObjectsKHR clEnqueueReleaseD3D10ObjectsKHR;\n\n  /* OpenCL 1.1 */\n  cl_api_clSetEventCallback clSetEventCallback;\n  cl_api_clCreateSubBuffer clCreateSubBuffer;\n  cl_api_clSetMemObjectDestructorCallback clSetMemObjectDestructorCallback;\n  cl_api_clCreateUserEvent clCreateUserEvent;\n  cl_api_clSetUserEventStatus clSetUserEventStatus;\n  cl_api_clEnqueueReadBufferRect clEnqueueReadBufferRect;\n  cl_api_clEnqueueWriteBufferRect clEnqueueWriteBufferRect;\n  cl_api_clEnqueueCopyBufferRect clEnqueueCopyBufferRect;\n\n  /* cl_ext_device_fission */\n  cl_api_clCreateSubDevicesEXT clCreateSubDevicesEXT;\n  cl_api_clRetainDeviceEXT clRetainDeviceEXT;\n  cl_api_clReleaseDeviceEXT clReleaseDeviceEXT;\n\n  /* cl_khr_gl_event */\n  cl_api_clCreateEventFromGLsyncKHR clCreateEventFromGLsyncKHR;\n\n  /* OpenCL 1.2 */\n  cl_api_clCreateSubDevices clCreateSubDevices;\n  cl_api_clRetainDevice clRetainDevice;\n  cl_api_clReleaseDevice clReleaseDevice;\n  
cl_api_clCreateImage clCreateImage;\n  cl_api_clCreateProgramWithBuiltInKernels clCreateProgramWithBuiltInKernels;\n  cl_api_clCompileProgram clCompileProgram;\n  cl_api_clLinkProgram clLinkProgram;\n  cl_api_clUnloadPlatformCompiler clUnloadPlatformCompiler;\n  cl_api_clGetKernelArgInfo clGetKernelArgInfo;\n  cl_api_clEnqueueFillBuffer clEnqueueFillBuffer;\n  cl_api_clEnqueueFillImage clEnqueueFillImage;\n  cl_api_clEnqueueMigrateMemObjects clEnqueueMigrateMemObjects;\n  cl_api_clEnqueueMarkerWithWaitList clEnqueueMarkerWithWaitList;\n  cl_api_clEnqueueBarrierWithWaitList clEnqueueBarrierWithWaitList;\n  cl_api_clGetExtensionFunctionAddressForPlatform\n      clGetExtensionFunctionAddressForPlatform;\n  cl_api_clCreateFromGLTexture clCreateFromGLTexture;\n\n  /* cl_khr_d3d11_sharing */\n  cl_api_clGetDeviceIDsFromD3D11KHR clGetDeviceIDsFromD3D11KHR;\n  cl_api_clCreateFromD3D11BufferKHR clCreateFromD3D11BufferKHR;\n  cl_api_clCreateFromD3D11Texture2DKHR clCreateFromD3D11Texture2DKHR;\n  cl_api_clCreateFromD3D11Texture3DKHR clCreateFromD3D11Texture3DKHR;\n  cl_api_clCreateFromDX9MediaSurfaceKHR clCreateFromDX9MediaSurfaceKHR;\n  cl_api_clEnqueueAcquireD3D11ObjectsKHR clEnqueueAcquireD3D11ObjectsKHR;\n  cl_api_clEnqueueReleaseD3D11ObjectsKHR clEnqueueReleaseD3D11ObjectsKHR;\n\n  /* cl_khr_dx9_media_sharing */\n  cl_api_clGetDeviceIDsFromDX9MediaAdapterKHR\n      clGetDeviceIDsFromDX9MediaAdapterKHR;\n  cl_api_clEnqueueAcquireDX9MediaSurfacesKHR\n      clEnqueueAcquireDX9MediaSurfacesKHR;\n  cl_api_clEnqueueReleaseDX9MediaSurfacesKHR\n      clEnqueueReleaseDX9MediaSurfacesKHR;\n\n  /* cl_khr_egl_image */\n  cl_api_clCreateFromEGLImageKHR clCreateFromEGLImageKHR;\n  cl_api_clEnqueueAcquireEGLObjectsKHR clEnqueueAcquireEGLObjectsKHR;\n  cl_api_clEnqueueReleaseEGLObjectsKHR clEnqueueReleaseEGLObjectsKHR;\n\n  /* cl_khr_egl_event */\n  cl_api_clCreateEventFromEGLSyncKHR clCreateEventFromEGLSyncKHR;\n\n  /* OpenCL 2.0 */\n  cl_api_clCreateCommandQueueWithProperties 
clCreateCommandQueueWithProperties;\n  cl_api_clCreatePipe clCreatePipe;\n  cl_api_clGetPipeInfo clGetPipeInfo;\n  cl_api_clSVMAlloc clSVMAlloc;\n  cl_api_clSVMFree clSVMFree;\n  cl_api_clEnqueueSVMFree clEnqueueSVMFree;\n  cl_api_clEnqueueSVMMemcpy clEnqueueSVMMemcpy;\n  cl_api_clEnqueueSVMMemFill clEnqueueSVMMemFill;\n  cl_api_clEnqueueSVMMap clEnqueueSVMMap;\n  cl_api_clEnqueueSVMUnmap clEnqueueSVMUnmap;\n  cl_api_clCreateSamplerWithProperties clCreateSamplerWithProperties;\n  cl_api_clSetKernelArgSVMPointer clSetKernelArgSVMPointer;\n  cl_api_clSetKernelExecInfo clSetKernelExecInfo;\n\n  /* cl_khr_sub_groups */\n  cl_api_clGetKernelSubGroupInfoKHR clGetKernelSubGroupInfoKHR;\n\n  /* OpenCL 2.1 */\n  cl_api_clCloneKernel clCloneKernel;\n  cl_api_clCreateProgramWithIL clCreateProgramWithIL;\n  cl_api_clEnqueueSVMMigrateMem clEnqueueSVMMigrateMem;\n  cl_api_clGetDeviceAndHostTimer clGetDeviceAndHostTimer;\n  cl_api_clGetHostTimer clGetHostTimer;\n  cl_api_clGetKernelSubGroupInfo clGetKernelSubGroupInfo;\n  cl_api_clSetDefaultDeviceCommandQueue clSetDefaultDeviceCommandQueue;\n\n  /* OpenCL 2.2 */\n  cl_api_clSetProgramReleaseCallback clSetProgramReleaseCallback;\n  cl_api_clSetProgramSpecializationConstant clSetProgramSpecializationConstant;\n\n  /* OpenCL 3.0 */\n  cl_api_clCreateBufferWithProperties clCreateBufferWithProperties;\n  cl_api_clCreateImageWithProperties clCreateImageWithProperties;\n\n} cl_icd_dispatch;\n\n#ifdef __cplusplus\n}\n#endif\n\n#endif /* #ifndef OPENCL_CL_ICD_H */\n"
  },
  {
    "path": "GpuMemLatency/OpenCL/include/CL/cl_platform.h",
    "content": "/*******************************************************************************\n * Copyright (c) 2008-2020 The Khronos Group Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *    http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n ******************************************************************************/\n\n#ifndef __CL_PLATFORM_H\n#define __CL_PLATFORM_H\n\n#include <CL/cl_version.h>\n\n#ifdef __cplusplus\nextern \"C\" {\n#endif\n\n#if defined(_WIN32)\n    #define CL_API_ENTRY\n    #define CL_API_CALL     __stdcall\n    #define CL_CALLBACK     __stdcall\n#else\n    #define CL_API_ENTRY\n    #define CL_API_CALL\n    #define CL_CALLBACK\n#endif\n\n/*\n * Deprecation flags refer to the last version of the header in which the\n * feature was not deprecated.\n *\n * E.g. 
VERSION_1_1_DEPRECATED means the feature is present in 1.1 without\n * deprecation but is deprecated in versions later than 1.1.\n */\n\n#define CL_EXTENSION_WEAK_LINK\n#define CL_API_SUFFIX__VERSION_1_0\n#define CL_EXT_SUFFIX__VERSION_1_0\n#define CL_API_SUFFIX__VERSION_1_1\n#define CL_EXT_SUFFIX__VERSION_1_1\n#define CL_API_SUFFIX__VERSION_1_2\n#define CL_EXT_SUFFIX__VERSION_1_2\n#define CL_API_SUFFIX__VERSION_2_0\n#define CL_EXT_SUFFIX__VERSION_2_0\n#define CL_API_SUFFIX__VERSION_2_1\n#define CL_EXT_SUFFIX__VERSION_2_1\n#define CL_API_SUFFIX__VERSION_2_2\n#define CL_EXT_SUFFIX__VERSION_2_2\n#define CL_API_SUFFIX__VERSION_3_0\n#define CL_EXT_SUFFIX__VERSION_3_0\n#define CL_API_SUFFIX__EXPERIMENTAL\n#define CL_EXT_SUFFIX__EXPERIMENTAL\n\n\n#ifdef __GNUC__\n  #define CL_EXT_SUFFIX_DEPRECATED __attribute__((deprecated))\n  #define CL_EXT_PREFIX_DEPRECATED\n#elif defined(_WIN32)\n  #define CL_EXT_SUFFIX_DEPRECATED\n  #define CL_EXT_PREFIX_DEPRECATED __declspec(deprecated)\n#else\n  #define CL_EXT_SUFFIX_DEPRECATED\n  #define CL_EXT_PREFIX_DEPRECATED\n#endif\n\n#ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS\n    #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED\n    #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED\n#else\n    #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED CL_EXT_SUFFIX_DEPRECATED\n    #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED CL_EXT_PREFIX_DEPRECATED\n#endif\n\n#ifdef CL_USE_DEPRECATED_OPENCL_1_1_APIS\n    #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED\n    #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED\n#else\n    #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED CL_EXT_SUFFIX_DEPRECATED\n    #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED CL_EXT_PREFIX_DEPRECATED\n#endif\n\n#ifdef CL_USE_DEPRECATED_OPENCL_1_2_APIS\n    #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED\n    #define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED\n#else\n    #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED CL_EXT_SUFFIX_DEPRECATED\n    #define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED 
CL_EXT_PREFIX_DEPRECATED\n #endif\n\n#ifdef CL_USE_DEPRECATED_OPENCL_2_0_APIS\n    #define CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED\n    #define CL_EXT_PREFIX__VERSION_2_0_DEPRECATED\n#else\n    #define CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED CL_EXT_SUFFIX_DEPRECATED\n    #define CL_EXT_PREFIX__VERSION_2_0_DEPRECATED CL_EXT_PREFIX_DEPRECATED\n#endif\n\n#ifdef CL_USE_DEPRECATED_OPENCL_2_1_APIS\n    #define CL_EXT_SUFFIX__VERSION_2_1_DEPRECATED\n    #define CL_EXT_PREFIX__VERSION_2_1_DEPRECATED\n#else\n    #define CL_EXT_SUFFIX__VERSION_2_1_DEPRECATED CL_EXT_SUFFIX_DEPRECATED\n    #define CL_EXT_PREFIX__VERSION_2_1_DEPRECATED CL_EXT_PREFIX_DEPRECATED\n#endif\n\n#ifdef CL_USE_DEPRECATED_OPENCL_2_2_APIS\n    #define CL_EXT_SUFFIX__VERSION_2_2_DEPRECATED\n    #define CL_EXT_PREFIX__VERSION_2_2_DEPRECATED\n#else\n    #define CL_EXT_SUFFIX__VERSION_2_2_DEPRECATED CL_EXT_SUFFIX_DEPRECATED\n    #define CL_EXT_PREFIX__VERSION_2_2_DEPRECATED CL_EXT_PREFIX_DEPRECATED\n#endif\n\n#if (defined (_WIN32) && defined(_MSC_VER))\n\n/* scalar types  */\ntypedef signed   __int8         cl_char;\ntypedef unsigned __int8         cl_uchar;\ntypedef signed   __int16        cl_short;\ntypedef unsigned __int16        cl_ushort;\ntypedef signed   __int32        cl_int;\ntypedef unsigned __int32        cl_uint;\ntypedef signed   __int64        cl_long;\ntypedef unsigned __int64        cl_ulong;\n\ntypedef unsigned __int16        cl_half;\ntypedef float                   cl_float;\ntypedef double                  cl_double;\n\n/* Macro names and corresponding values defined by OpenCL */\n#define CL_CHAR_BIT         8\n#define CL_SCHAR_MAX        127\n#define CL_SCHAR_MIN        (-127-1)\n#define CL_CHAR_MAX         CL_SCHAR_MAX\n#define CL_CHAR_MIN         CL_SCHAR_MIN\n#define CL_UCHAR_MAX        255\n#define CL_SHRT_MAX         32767\n#define CL_SHRT_MIN         (-32767-1)\n#define CL_USHRT_MAX        65535\n#define CL_INT_MAX          2147483647\n#define CL_INT_MIN          
(-2147483647-1)\n#define CL_UINT_MAX         0xffffffffU\n#define CL_LONG_MAX         ((cl_long) 0x7FFFFFFFFFFFFFFFLL)\n#define CL_LONG_MIN         ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL)\n#define CL_ULONG_MAX        ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL)\n\n#define CL_FLT_DIG          6\n#define CL_FLT_MANT_DIG     24\n#define CL_FLT_MAX_10_EXP   +38\n#define CL_FLT_MAX_EXP      +128\n#define CL_FLT_MIN_10_EXP   -37\n#define CL_FLT_MIN_EXP      -125\n#define CL_FLT_RADIX        2\n#define CL_FLT_MAX          340282346638528859811704183484516925440.0f\n#define CL_FLT_MIN          1.175494350822287507969e-38f\n#define CL_FLT_EPSILON      1.1920928955078125e-7f\n\n#define CL_HALF_DIG          3\n#define CL_HALF_MANT_DIG     11\n#define CL_HALF_MAX_10_EXP   +4\n#define CL_HALF_MAX_EXP      +16\n#define CL_HALF_MIN_10_EXP   -4\n#define CL_HALF_MIN_EXP      -13\n#define CL_HALF_RADIX        2\n#define CL_HALF_MAX          65504.0f\n#define CL_HALF_MIN          6.103515625e-05f\n#define CL_HALF_EPSILON      9.765625e-04f\n\n#define CL_DBL_DIG          15\n#define CL_DBL_MANT_DIG     53\n#define CL_DBL_MAX_10_EXP   +308\n#define CL_DBL_MAX_EXP      +1024\n#define CL_DBL_MIN_10_EXP   -307\n#define CL_DBL_MIN_EXP      -1021\n#define CL_DBL_RADIX        2\n#define CL_DBL_MAX          1.7976931348623158e+308\n#define CL_DBL_MIN          2.225073858507201383090e-308\n#define CL_DBL_EPSILON      2.220446049250313080847e-16\n\n#define CL_M_E              2.7182818284590452354\n#define CL_M_LOG2E          1.4426950408889634074\n#define CL_M_LOG10E         0.43429448190325182765\n#define CL_M_LN2            0.69314718055994530942\n#define CL_M_LN10           2.30258509299404568402\n#define CL_M_PI             3.14159265358979323846\n#define CL_M_PI_2           1.57079632679489661923\n#define CL_M_PI_4           0.78539816339744830962\n#define CL_M_1_PI           0.31830988618379067154\n#define CL_M_2_PI           0.63661977236758134308\n#define CL_M_2_SQRTPI       
1.12837916709551257390\n#define CL_M_SQRT2          1.41421356237309504880\n#define CL_M_SQRT1_2        0.70710678118654752440\n\n#define CL_M_E_F            2.718281828f\n#define CL_M_LOG2E_F        1.442695041f\n#define CL_M_LOG10E_F       0.434294482f\n#define CL_M_LN2_F          0.693147181f\n#define CL_M_LN10_F         2.302585093f\n#define CL_M_PI_F           3.141592654f\n#define CL_M_PI_2_F         1.570796327f\n#define CL_M_PI_4_F         0.785398163f\n#define CL_M_1_PI_F         0.318309886f\n#define CL_M_2_PI_F         0.636619772f\n#define CL_M_2_SQRTPI_F     1.128379167f\n#define CL_M_SQRT2_F        1.414213562f\n#define CL_M_SQRT1_2_F      0.707106781f\n\n#define CL_NAN              (CL_INFINITY - CL_INFINITY)\n#define CL_HUGE_VALF        ((cl_float) 1e50)\n#define CL_HUGE_VAL         ((cl_double) 1e500)\n#define CL_MAXFLOAT         CL_FLT_MAX\n#define CL_INFINITY         CL_HUGE_VALF\n\n#else\n\n#include <stdint.h>\n\n/* scalar types  */\ntypedef int8_t          cl_char;\ntypedef uint8_t         cl_uchar;\ntypedef int16_t         cl_short;\ntypedef uint16_t        cl_ushort;\ntypedef int32_t         cl_int;\ntypedef uint32_t        cl_uint;\ntypedef int64_t         cl_long;\ntypedef uint64_t        cl_ulong;\n\ntypedef uint16_t        cl_half;\ntypedef float           cl_float;\ntypedef double          cl_double;\n\n/* Macro names and corresponding values defined by OpenCL */\n#define CL_CHAR_BIT         8\n#define CL_SCHAR_MAX        127\n#define CL_SCHAR_MIN        (-127-1)\n#define CL_CHAR_MAX         CL_SCHAR_MAX\n#define CL_CHAR_MIN         CL_SCHAR_MIN\n#define CL_UCHAR_MAX        255\n#define CL_SHRT_MAX         32767\n#define CL_SHRT_MIN         (-32767-1)\n#define CL_USHRT_MAX        65535\n#define CL_INT_MAX          2147483647\n#define CL_INT_MIN          (-2147483647-1)\n#define CL_UINT_MAX         0xffffffffU\n#define CL_LONG_MAX         ((cl_long) 0x7FFFFFFFFFFFFFFFLL)\n#define CL_LONG_MIN         ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 
1LL)\n#define CL_ULONG_MAX        ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL)\n\n#define CL_FLT_DIG          6\n#define CL_FLT_MANT_DIG     24\n#define CL_FLT_MAX_10_EXP   +38\n#define CL_FLT_MAX_EXP      +128\n#define CL_FLT_MIN_10_EXP   -37\n#define CL_FLT_MIN_EXP      -125\n#define CL_FLT_RADIX        2\n#define CL_FLT_MAX          340282346638528859811704183484516925440.0f\n#define CL_FLT_MIN          1.175494350822287507969e-38f\n#define CL_FLT_EPSILON      1.1920928955078125e-7f\n\n#define CL_HALF_DIG          3\n#define CL_HALF_MANT_DIG     11\n#define CL_HALF_MAX_10_EXP   +4\n#define CL_HALF_MAX_EXP      +16\n#define CL_HALF_MIN_10_EXP   -4\n#define CL_HALF_MIN_EXP      -13\n#define CL_HALF_RADIX        2\n#define CL_HALF_MAX          65504.0f\n#define CL_HALF_MIN          6.103515625e-05f\n#define CL_HALF_EPSILON      9.765625e-04f\n\n#define CL_DBL_DIG          15\n#define CL_DBL_MANT_DIG     53\n#define CL_DBL_MAX_10_EXP   +308\n#define CL_DBL_MAX_EXP      +1024\n#define CL_DBL_MIN_10_EXP   -307\n#define CL_DBL_MIN_EXP      -1021\n#define CL_DBL_RADIX        2\n#define CL_DBL_MAX          179769313486231570814527423731704356798070567525844996598917476803157260780028538760589558632766878171540458953514382464234321326889464182768467546703537516986049910576551282076245490090389328944075868508455133942304583236903222948165808559332123348274797826204144723168738177180919299881250404026184124858368.0\n#define CL_DBL_MIN          2.225073858507201383090e-308\n#define CL_DBL_EPSILON      2.220446049250313080847e-16\n\n#define CL_M_E              2.7182818284590452354\n#define CL_M_LOG2E          1.4426950408889634074\n#define CL_M_LOG10E         0.43429448190325182765\n#define CL_M_LN2            0.69314718055994530942\n#define CL_M_LN10           2.30258509299404568402\n#define CL_M_PI             3.14159265358979323846\n#define CL_M_PI_2           1.57079632679489661923\n#define CL_M_PI_4           0.78539816339744830962\n#define CL_M_1_PI           
0.31830988618379067154\n#define CL_M_2_PI           0.63661977236758134308\n#define CL_M_2_SQRTPI       1.12837916709551257390\n#define CL_M_SQRT2          1.41421356237309504880\n#define CL_M_SQRT1_2        0.70710678118654752440\n\n#define CL_M_E_F            2.718281828f\n#define CL_M_LOG2E_F        1.442695041f\n#define CL_M_LOG10E_F       0.434294482f\n#define CL_M_LN2_F          0.693147181f\n#define CL_M_LN10_F         2.302585093f\n#define CL_M_PI_F           3.141592654f\n#define CL_M_PI_2_F         1.570796327f\n#define CL_M_PI_4_F         0.785398163f\n#define CL_M_1_PI_F         0.318309886f\n#define CL_M_2_PI_F         0.636619772f\n#define CL_M_2_SQRTPI_F     1.128379167f\n#define CL_M_SQRT2_F        1.414213562f\n#define CL_M_SQRT1_2_F      0.707106781f\n\n#if defined( __GNUC__ )\n   #define CL_HUGE_VALF     __builtin_huge_valf()\n   #define CL_HUGE_VAL      __builtin_huge_val()\n   #define CL_NAN           __builtin_nanf( \"\" )\n#else\n   #define CL_HUGE_VALF     ((cl_float) 1e50)\n   #define CL_HUGE_VAL      ((cl_double) 1e500)\n   float nanf( const char * );\n   #define CL_NAN           nanf( \"\" )\n#endif\n#define CL_MAXFLOAT         CL_FLT_MAX\n#define CL_INFINITY         CL_HUGE_VALF\n\n#endif\n\n#include <stddef.h>\n\n/* Mirror types to GL types. Mirror types allow us to avoid deciding which 87s to load based on whether we are using GL or GLES here. */\ntypedef unsigned int cl_GLuint;\ntypedef int          cl_GLint;\ntypedef unsigned int cl_GLenum;\n\n/*\n * Vector types\n *\n *  Note:   OpenCL requires that all types be naturally aligned.\n *          This means that vector types must be naturally aligned.\n *          For example, a vector of four floats must be aligned to\n *          a 16 byte boundary (calculated as 4 * the natural 4-byte\n *          alignment of the float).  The alignment qualifiers here\n *          will only function properly if your compiler supports them\n *          and if you don't actively work to defeat them.  
For example,\n *          in order for a cl_float4 to be 16 byte aligned in a struct,\n *          the start of the struct must itself be 16-byte aligned.\n *\n *          Maintaining proper alignment is the user's responsibility.\n */\n\n/* Define basic vector types */\n#if defined( __VEC__ )\n   #include <altivec.h>   /* may be omitted depending on compiler. AltiVec spec provides no way to detect whether the header is required. */\n   typedef __vector unsigned char     __cl_uchar16;\n   typedef __vector signed char       __cl_char16;\n   typedef __vector unsigned short    __cl_ushort8;\n   typedef __vector signed short      __cl_short8;\n   typedef __vector unsigned int      __cl_uint4;\n   typedef __vector signed int        __cl_int4;\n   typedef __vector float             __cl_float4;\n   #define  __CL_UCHAR16__  1\n   #define  __CL_CHAR16__   1\n   #define  __CL_USHORT8__  1\n   #define  __CL_SHORT8__   1\n   #define  __CL_UINT4__    1\n   #define  __CL_INT4__     1\n   #define  __CL_FLOAT4__   1\n#endif\n\n#if defined( __SSE__ )\n    #if defined( __MINGW64__ )\n        #include <intrin.h>\n    #else\n        #include <xmmintrin.h>\n    #endif\n    #if defined( __GNUC__ )\n        typedef float __cl_float4   __attribute__((vector_size(16)));\n    #else\n        typedef __m128 __cl_float4;\n    #endif\n    #define __CL_FLOAT4__   1\n#endif\n\n#if defined( __SSE2__ )\n    #if defined( __MINGW64__ )\n        #include <intrin.h>\n    #else\n        #include <emmintrin.h>\n    #endif\n    #if defined( __GNUC__ )\n        typedef cl_uchar    __cl_uchar16    __attribute__((vector_size(16)));\n        typedef cl_char     __cl_char16     __attribute__((vector_size(16)));\n        typedef cl_ushort   __cl_ushort8    __attribute__((vector_size(16)));\n        typedef cl_short    __cl_short8     __attribute__((vector_size(16)));\n        typedef cl_uint     __cl_uint4      __attribute__((vector_size(16)));\n        typedef cl_int      __cl_int4       
__attribute__((vector_size(16)));\n        typedef cl_ulong    __cl_ulong2     __attribute__((vector_size(16)));\n        typedef cl_long     __cl_long2      __attribute__((vector_size(16)));\n        typedef cl_double   __cl_double2    __attribute__((vector_size(16)));\n    #else\n        typedef __m128i __cl_uchar16;\n        typedef __m128i __cl_char16;\n        typedef __m128i __cl_ushort8;\n        typedef __m128i __cl_short8;\n        typedef __m128i __cl_uint4;\n        typedef __m128i __cl_int4;\n        typedef __m128i __cl_ulong2;\n        typedef __m128i __cl_long2;\n        typedef __m128d __cl_double2;\n    #endif\n    #define __CL_UCHAR16__  1\n    #define __CL_CHAR16__   1\n    #define __CL_USHORT8__  1\n    #define __CL_SHORT8__   1\n    #define __CL_INT4__     1\n    #define __CL_UINT4__    1\n    #define __CL_ULONG2__   1\n    #define __CL_LONG2__    1\n    #define __CL_DOUBLE2__  1\n#endif\n\n#if defined( __MMX__ )\n    #include <mmintrin.h>\n    #if defined( __GNUC__ )\n        typedef cl_uchar    __cl_uchar8     __attribute__((vector_size(8)));\n        typedef cl_char     __cl_char8      __attribute__((vector_size(8)));\n        typedef cl_ushort   __cl_ushort4    __attribute__((vector_size(8)));\n        typedef cl_short    __cl_short4     __attribute__((vector_size(8)));\n        typedef cl_uint     __cl_uint2      __attribute__((vector_size(8)));\n        typedef cl_int      __cl_int2       __attribute__((vector_size(8)));\n        typedef cl_ulong    __cl_ulong1     __attribute__((vector_size(8)));\n        typedef cl_long     __cl_long1      __attribute__((vector_size(8)));\n        typedef cl_float    __cl_float2     __attribute__((vector_size(8)));\n    #else\n        typedef __m64       __cl_uchar8;\n        typedef __m64       __cl_char8;\n        typedef __m64       __cl_ushort4;\n        typedef __m64       __cl_short4;\n        typedef __m64       __cl_uint2;\n        typedef __m64       __cl_int2;\n        typedef __m64       
__cl_ulong1;\n        typedef __m64       __cl_long1;\n        typedef __m64       __cl_float2;\n    #endif\n    #define __CL_UCHAR8__   1\n    #define __CL_CHAR8__    1\n    #define __CL_USHORT4__  1\n    #define __CL_SHORT4__   1\n    #define __CL_INT2__     1\n    #define __CL_UINT2__    1\n    #define __CL_ULONG1__   1\n    #define __CL_LONG1__    1\n    #define __CL_FLOAT2__   1\n#endif\n\n#if defined( __AVX__ )\n    #if defined( __MINGW64__ )\n        #include <intrin.h>\n    #else\n        #include <immintrin.h>\n    #endif\n    #if defined( __GNUC__ )\n        typedef cl_float    __cl_float8     __attribute__((vector_size(32)));\n        typedef cl_double   __cl_double4    __attribute__((vector_size(32)));\n    #else\n        typedef __m256      __cl_float8;\n        typedef __m256d     __cl_double4;\n    #endif\n    #define __CL_FLOAT8__   1\n    #define __CL_DOUBLE4__  1\n#endif\n\n/* Define capabilities for anonymous struct members. */\n#if !defined(__cplusplus) && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L\n#define  __CL_HAS_ANON_STRUCT__ 1\n#define  __CL_ANON_STRUCT__\n#elif defined( __GNUC__) && ! defined( __STRICT_ANSI__ )\n#define  __CL_HAS_ANON_STRUCT__ 1\n#define  __CL_ANON_STRUCT__ __extension__\n#elif defined( _WIN32) && defined(_MSC_VER)\n    #if _MSC_VER >= 1500\n   /* Microsoft Developer Studio 2008 supports anonymous structs, but\n    * complains by default. 
*/\n    #define  __CL_HAS_ANON_STRUCT__ 1\n    #define  __CL_ANON_STRUCT__\n   /* Disable warning C4201: nonstandard extension used : nameless\n    * struct/union */\n    #pragma warning( push )\n    #pragma warning( disable : 4201 )\n    #endif\n#else\n#define  __CL_HAS_ANON_STRUCT__ 0\n#define  __CL_ANON_STRUCT__\n#endif\n\n/* Define alignment keys */\n#if defined( __GNUC__ )\n    #define CL_ALIGNED(_x)          __attribute__ ((aligned(_x)))\n#elif defined( _WIN32) && (_MSC_VER)\n    /* Alignment keys neutered on windows because MSVC can't swallow function arguments with alignment requirements     */\n    /* http://msdn.microsoft.com/en-us/library/373ak2y1%28VS.71%29.aspx                                                 */\n    /* #include <crtdefs.h>                                                                                             */\n    /* #define CL_ALIGNED(_x)          _CRT_ALIGN(_x)                                                                   */\n    #define CL_ALIGNED(_x)\n#else\n   #warning  Need to implement some method to align data here\n   #define  CL_ALIGNED(_x)\n#endif\n\n/* Indicate whether .xyzw, .s0123 and .hi.lo are supported */\n#if __CL_HAS_ANON_STRUCT__\n    /* .xyzw and .s0123...{f|F} are supported */\n    #define CL_HAS_NAMED_VECTOR_FIELDS 1\n    /* .hi and .lo are supported */\n    #define CL_HAS_HI_LO_VECTOR_FIELDS 1\n#endif\n\n/* Define cl_vector types */\n\n/* ---- cl_charn ---- */\ntypedef union\n{\n    cl_char  CL_ALIGNED(2) s[2];\n#if __CL_HAS_ANON_STRUCT__\n   __CL_ANON_STRUCT__ struct{ cl_char  x, y; };\n   __CL_ANON_STRUCT__ struct{ cl_char  s0, s1; };\n   __CL_ANON_STRUCT__ struct{ cl_char  lo, hi; };\n#endif\n#if defined( __CL_CHAR2__)\n    __cl_char2     v2;\n#endif\n}cl_char2;\n\ntypedef union\n{\n    cl_char  CL_ALIGNED(4) s[4];\n#if __CL_HAS_ANON_STRUCT__\n   __CL_ANON_STRUCT__ struct{ cl_char  x, y, z, w; };\n   __CL_ANON_STRUCT__ struct{ cl_char  s0, s1, s2, s3; };\n   __CL_ANON_STRUCT__ struct{ cl_char2 lo, 
hi; };\n#endif\n#if defined( __CL_CHAR2__)\n    __cl_char2     v2[2];\n#endif\n#if defined( __CL_CHAR4__)\n    __cl_char4     v4;\n#endif\n}cl_char4;\n\n/* cl_char3 is identical in size, alignment and behavior to cl_char4. See section 6.1.5. */\ntypedef  cl_char4  cl_char3;\n\ntypedef union\n{\n    cl_char   CL_ALIGNED(8) s[8];\n#if __CL_HAS_ANON_STRUCT__\n   __CL_ANON_STRUCT__ struct{ cl_char  x, y, z, w; };\n   __CL_ANON_STRUCT__ struct{ cl_char  s0, s1, s2, s3, s4, s5, s6, s7; };\n   __CL_ANON_STRUCT__ struct{ cl_char4 lo, hi; };\n#endif\n#if defined( __CL_CHAR2__)\n    __cl_char2     v2[4];\n#endif\n#if defined( __CL_CHAR4__)\n    __cl_char4     v4[2];\n#endif\n#if defined( __CL_CHAR8__ )\n    __cl_char8     v8;\n#endif\n}cl_char8;\n\ntypedef union\n{\n    cl_char  CL_ALIGNED(16) s[16];\n#if __CL_HAS_ANON_STRUCT__\n   __CL_ANON_STRUCT__ struct{ cl_char  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };\n   __CL_ANON_STRUCT__ struct{ cl_char  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };\n   __CL_ANON_STRUCT__ struct{ cl_char8 lo, hi; };\n#endif\n#if defined( __CL_CHAR2__)\n    __cl_char2     v2[8];\n#endif\n#if defined( __CL_CHAR4__)\n    __cl_char4     v4[4];\n#endif\n#if defined( __CL_CHAR8__ )\n    __cl_char8     v8[2];\n#endif\n#if defined( __CL_CHAR16__ )\n    __cl_char16    v16;\n#endif\n}cl_char16;\n\n\n/* ---- cl_ucharn ---- */\ntypedef union\n{\n    cl_uchar  CL_ALIGNED(2) s[2];\n#if __CL_HAS_ANON_STRUCT__\n   __CL_ANON_STRUCT__ struct{ cl_uchar  x, y; };\n   __CL_ANON_STRUCT__ struct{ cl_uchar  s0, s1; };\n   __CL_ANON_STRUCT__ struct{ cl_uchar  lo, hi; };\n#endif\n#if defined( __cl_uchar2__)\n    __cl_uchar2     v2;\n#endif\n}cl_uchar2;\n\ntypedef union\n{\n    cl_uchar  CL_ALIGNED(4) s[4];\n#if __CL_HAS_ANON_STRUCT__\n   __CL_ANON_STRUCT__ struct{ cl_uchar  x, y, z, w; };\n   __CL_ANON_STRUCT__ struct{ cl_uchar  s0, s1, s2, s3; };\n   __CL_ANON_STRUCT__ struct{ cl_uchar2 lo, 
hi; };\n#endif\n#if defined( __CL_UCHAR2__)\n    __cl_uchar2     v2[2];\n#endif\n#if defined( __CL_UCHAR4__)\n    __cl_uchar4     v4;\n#endif\n}cl_uchar4;\n\n/* cl_uchar3 is identical in size, alignment and behavior to cl_uchar4. See section 6.1.5. */\ntypedef  cl_uchar4  cl_uchar3;\n\ntypedef union\n{\n    cl_uchar   CL_ALIGNED(8) s[8];\n#if __CL_HAS_ANON_STRUCT__\n   __CL_ANON_STRUCT__ struct{ cl_uchar  x, y, z, w; };\n   __CL_ANON_STRUCT__ struct{ cl_uchar  s0, s1, s2, s3, s4, s5, s6, s7; };\n   __CL_ANON_STRUCT__ struct{ cl_uchar4 lo, hi; };\n#endif\n#if defined( __CL_UCHAR2__)\n    __cl_uchar2     v2[4];\n#endif\n#if defined( __CL_UCHAR4__)\n    __cl_uchar4     v4[2];\n#endif\n#if defined( __CL_UCHAR8__ )\n    __cl_uchar8     v8;\n#endif\n}cl_uchar8;\n\ntypedef union\n{\n    cl_uchar  CL_ALIGNED(16) s[16];\n#if __CL_HAS_ANON_STRUCT__\n   __CL_ANON_STRUCT__ struct{ cl_uchar  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };\n   __CL_ANON_STRUCT__ struct{ cl_uchar  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };\n   __CL_ANON_STRUCT__ struct{ cl_uchar8 lo, hi; };\n#endif\n#if defined( __CL_UCHAR2__)\n    __cl_uchar2     v2[8];\n#endif\n#if defined( __CL_UCHAR4__)\n    __cl_uchar4     v4[4];\n#endif\n#if defined( __CL_UCHAR8__ )\n    __cl_uchar8     v8[2];\n#endif\n#if defined( __CL_UCHAR16__ )\n    __cl_uchar16    v16;\n#endif\n}cl_uchar16;\n\n\n/* ---- cl_shortn ---- */\ntypedef union\n{\n    cl_short  CL_ALIGNED(4) s[2];\n#if __CL_HAS_ANON_STRUCT__\n   __CL_ANON_STRUCT__ struct{ cl_short  x, y; };\n   __CL_ANON_STRUCT__ struct{ cl_short  s0, s1; };\n   __CL_ANON_STRUCT__ struct{ cl_short  lo, hi; };\n#endif\n#if defined( __CL_SHORT2__)\n    __cl_short2     v2;\n#endif\n}cl_short2;\n\ntypedef union\n{\n    cl_short  CL_ALIGNED(8) s[4];\n#if __CL_HAS_ANON_STRUCT__\n   __CL_ANON_STRUCT__ struct{ cl_short  x, y, z, w; };\n   __CL_ANON_STRUCT__ struct{ cl_short  s0, s1, s2, s3; };\n   
__CL_ANON_STRUCT__ struct{ cl_short2 lo, hi; };\n#endif\n#if defined( __CL_SHORT2__)\n    __cl_short2     v2[2];\n#endif\n#if defined( __CL_SHORT4__)\n    __cl_short4     v4;\n#endif\n}cl_short4;\n\n/* cl_short3 is identical in size, alignment and behavior to cl_short4. See section 6.1.5. */\ntypedef  cl_short4  cl_short3;\n\ntypedef union\n{\n    cl_short   CL_ALIGNED(16) s[8];\n#if __CL_HAS_ANON_STRUCT__\n   __CL_ANON_STRUCT__ struct{ cl_short  x, y, z, w; };\n   __CL_ANON_STRUCT__ struct{ cl_short  s0, s1, s2, s3, s4, s5, s6, s7; };\n   __CL_ANON_STRUCT__ struct{ cl_short4 lo, hi; };\n#endif\n#if defined( __CL_SHORT2__)\n    __cl_short2     v2[4];\n#endif\n#if defined( __CL_SHORT4__)\n    __cl_short4     v4[2];\n#endif\n#if defined( __CL_SHORT8__ )\n    __cl_short8     v8;\n#endif\n}cl_short8;\n\ntypedef union\n{\n    cl_short  CL_ALIGNED(32) s[16];\n#if __CL_HAS_ANON_STRUCT__\n   __CL_ANON_STRUCT__ struct{ cl_short  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };\n   __CL_ANON_STRUCT__ struct{ cl_short  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };\n   __CL_ANON_STRUCT__ struct{ cl_short8 lo, hi; };\n#endif\n#if defined( __CL_SHORT2__)\n    __cl_short2     v2[8];\n#endif\n#if defined( __CL_SHORT4__)\n    __cl_short4     v4[4];\n#endif\n#if defined( __CL_SHORT8__ )\n    __cl_short8     v8[2];\n#endif\n#if defined( __CL_SHORT16__ )\n    __cl_short16    v16;\n#endif\n}cl_short16;\n\n\n/* ---- cl_ushortn ---- */\ntypedef union\n{\n    cl_ushort  CL_ALIGNED(4) s[2];\n#if __CL_HAS_ANON_STRUCT__\n   __CL_ANON_STRUCT__ struct{ cl_ushort  x, y; };\n   __CL_ANON_STRUCT__ struct{ cl_ushort  s0, s1; };\n   __CL_ANON_STRUCT__ struct{ cl_ushort  lo, hi; };\n#endif\n#if defined( __CL_USHORT2__)\n    __cl_ushort2     v2;\n#endif\n}cl_ushort2;\n\ntypedef union\n{\n    cl_ushort  CL_ALIGNED(8) s[4];\n#if __CL_HAS_ANON_STRUCT__\n   __CL_ANON_STRUCT__ struct{ cl_ushort  x, y, z, w; };\n   
__CL_ANON_STRUCT__ struct{ cl_ushort  s0, s1, s2, s3; };\n   __CL_ANON_STRUCT__ struct{ cl_ushort2 lo, hi; };\n#endif\n#if defined( __CL_USHORT2__)\n    __cl_ushort2     v2[2];\n#endif\n#if defined( __CL_USHORT4__)\n    __cl_ushort4     v4;\n#endif\n}cl_ushort4;\n\n/* cl_ushort3 is identical in size, alignment and behavior to cl_ushort4. See section 6.1.5. */\ntypedef  cl_ushort4  cl_ushort3;\n\ntypedef union\n{\n    cl_ushort   CL_ALIGNED(16) s[8];\n#if __CL_HAS_ANON_STRUCT__\n   __CL_ANON_STRUCT__ struct{ cl_ushort  x, y, z, w; };\n   __CL_ANON_STRUCT__ struct{ cl_ushort  s0, s1, s2, s3, s4, s5, s6, s7; };\n   __CL_ANON_STRUCT__ struct{ cl_ushort4 lo, hi; };\n#endif\n#if defined( __CL_USHORT2__)\n    __cl_ushort2     v2[4];\n#endif\n#if defined( __CL_USHORT4__)\n    __cl_ushort4     v4[2];\n#endif\n#if defined( __CL_USHORT8__ )\n    __cl_ushort8     v8;\n#endif\n}cl_ushort8;\n\ntypedef union\n{\n    cl_ushort  CL_ALIGNED(32) s[16];\n#if __CL_HAS_ANON_STRUCT__\n   __CL_ANON_STRUCT__ struct{ cl_ushort  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };\n   __CL_ANON_STRUCT__ struct{ cl_ushort  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };\n   __CL_ANON_STRUCT__ struct{ cl_ushort8 lo, hi; };\n#endif\n#if defined( __CL_USHORT2__)\n    __cl_ushort2     v2[8];\n#endif\n#if defined( __CL_USHORT4__)\n    __cl_ushort4     v4[4];\n#endif\n#if defined( __CL_USHORT8__ )\n    __cl_ushort8     v8[2];\n#endif\n#if defined( __CL_USHORT16__ )\n    __cl_ushort16    v16;\n#endif\n}cl_ushort16;\n\n\n/* ---- cl_halfn ---- */\ntypedef union\n{\n    cl_half  CL_ALIGNED(4) s[2];\n#if __CL_HAS_ANON_STRUCT__\n    __CL_ANON_STRUCT__ struct{ cl_half  x, y; };\n    __CL_ANON_STRUCT__ struct{ cl_half  s0, s1; };\n    __CL_ANON_STRUCT__ struct{ cl_half  lo, hi; };\n#endif\n#if defined( __CL_HALF2__)\n    __cl_half2     v2;\n#endif\n}cl_half2;\n\ntypedef union\n{\n    cl_half  CL_ALIGNED(8) s[4];\n#if 
__CL_HAS_ANON_STRUCT__\n    __CL_ANON_STRUCT__ struct{ cl_half  x, y, z, w; };\n    __CL_ANON_STRUCT__ struct{ cl_half  s0, s1, s2, s3; };\n    __CL_ANON_STRUCT__ struct{ cl_half2 lo, hi; };\n#endif\n#if defined( __CL_HALF2__)\n    __cl_half2     v2[2];\n#endif\n#if defined( __CL_HALF4__)\n    __cl_half4     v4;\n#endif\n}cl_half4;\n\n/* cl_half3 is identical in size, alignment and behavior to cl_half4. See section 6.1.5. */\ntypedef  cl_half4  cl_half3;\n\ntypedef union\n{\n    cl_half   CL_ALIGNED(16) s[8];\n#if __CL_HAS_ANON_STRUCT__\n    __CL_ANON_STRUCT__ struct{ cl_half  x, y, z, w; };\n    __CL_ANON_STRUCT__ struct{ cl_half  s0, s1, s2, s3, s4, s5, s6, s7; };\n    __CL_ANON_STRUCT__ struct{ cl_half4 lo, hi; };\n#endif\n#if defined( __CL_HALF2__)\n    __cl_half2     v2[4];\n#endif\n#if defined( __CL_HALF4__)\n    __cl_half4     v4[2];\n#endif\n#if defined( __CL_HALF8__ )\n    __cl_half8     v8;\n#endif\n}cl_half8;\n\ntypedef union\n{\n    cl_half  CL_ALIGNED(32) s[16];\n#if __CL_HAS_ANON_STRUCT__\n    __CL_ANON_STRUCT__ struct{ cl_half  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };\n    __CL_ANON_STRUCT__ struct{ cl_half  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };\n    __CL_ANON_STRUCT__ struct{ cl_half8 lo, hi; };\n#endif\n#if defined( __CL_HALF2__)\n    __cl_half2     v2[8];\n#endif\n#if defined( __CL_HALF4__)\n    __cl_half4     v4[4];\n#endif\n#if defined( __CL_HALF8__ )\n    __cl_half8     v8[2];\n#endif\n#if defined( __CL_HALF16__ )\n    __cl_half16    v16;\n#endif\n}cl_half16;\n\n/* ---- cl_intn ---- */\ntypedef union\n{\n    cl_int  CL_ALIGNED(8) s[2];\n#if __CL_HAS_ANON_STRUCT__\n   __CL_ANON_STRUCT__ struct{ cl_int  x, y; };\n   __CL_ANON_STRUCT__ struct{ cl_int  s0, s1; };\n   __CL_ANON_STRUCT__ struct{ cl_int  lo, hi; };\n#endif\n#if defined( __CL_INT2__)\n    __cl_int2     v2;\n#endif\n}cl_int2;\n\ntypedef union\n{\n    cl_int  CL_ALIGNED(16) s[4];\n#if 
__CL_HAS_ANON_STRUCT__\n   __CL_ANON_STRUCT__ struct{ cl_int  x, y, z, w; };\n   __CL_ANON_STRUCT__ struct{ cl_int  s0, s1, s2, s3; };\n   __CL_ANON_STRUCT__ struct{ cl_int2 lo, hi; };\n#endif\n#if defined( __CL_INT2__)\n    __cl_int2     v2[2];\n#endif\n#if defined( __CL_INT4__)\n    __cl_int4     v4;\n#endif\n}cl_int4;\n\n/* cl_int3 is identical in size, alignment and behavior to cl_int4. See section 6.1.5. */\ntypedef  cl_int4  cl_int3;\n\ntypedef union\n{\n    cl_int   CL_ALIGNED(32) s[8];\n#if __CL_HAS_ANON_STRUCT__\n   __CL_ANON_STRUCT__ struct{ cl_int  x, y, z, w; };\n   __CL_ANON_STRUCT__ struct{ cl_int  s0, s1, s2, s3, s4, s5, s6, s7; };\n   __CL_ANON_STRUCT__ struct{ cl_int4 lo, hi; };\n#endif\n#if defined( __CL_INT2__)\n    __cl_int2     v2[4];\n#endif\n#if defined( __CL_INT4__)\n    __cl_int4     v4[2];\n#endif\n#if defined( __CL_INT8__ )\n    __cl_int8     v8;\n#endif\n}cl_int8;\n\ntypedef union\n{\n    cl_int  CL_ALIGNED(64) s[16];\n#if __CL_HAS_ANON_STRUCT__\n   __CL_ANON_STRUCT__ struct{ cl_int  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };\n   __CL_ANON_STRUCT__ struct{ cl_int  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };\n   __CL_ANON_STRUCT__ struct{ cl_int8 lo, hi; };\n#endif\n#if defined( __CL_INT2__)\n    __cl_int2     v2[8];\n#endif\n#if defined( __CL_INT4__)\n    __cl_int4     v4[4];\n#endif\n#if defined( __CL_INT8__ )\n    __cl_int8     v8[2];\n#endif\n#if defined( __CL_INT16__ )\n    __cl_int16    v16;\n#endif\n}cl_int16;\n\n\n/* ---- cl_uintn ---- */\ntypedef union\n{\n    cl_uint  CL_ALIGNED(8) s[2];\n#if __CL_HAS_ANON_STRUCT__\n   __CL_ANON_STRUCT__ struct{ cl_uint  x, y; };\n   __CL_ANON_STRUCT__ struct{ cl_uint  s0, s1; };\n   __CL_ANON_STRUCT__ struct{ cl_uint  lo, hi; };\n#endif\n#if defined( __CL_UINT2__)\n    __cl_uint2     v2;\n#endif\n}cl_uint2;\n\ntypedef union\n{\n    cl_uint  CL_ALIGNED(16) s[4];\n#if __CL_HAS_ANON_STRUCT__\n   __CL_ANON_STRUCT__ 
struct{ cl_uint  x, y, z, w; };\n   __CL_ANON_STRUCT__ struct{ cl_uint  s0, s1, s2, s3; };\n   __CL_ANON_STRUCT__ struct{ cl_uint2 lo, hi; };\n#endif\n#if defined( __CL_UINT2__)\n    __cl_uint2     v2[2];\n#endif\n#if defined( __CL_UINT4__)\n    __cl_uint4     v4;\n#endif\n}cl_uint4;\n\n/* cl_uint3 is identical in size, alignment and behavior to cl_uint4. See section 6.1.5. */\ntypedef  cl_uint4  cl_uint3;\n\ntypedef union\n{\n    cl_uint   CL_ALIGNED(32) s[8];\n#if __CL_HAS_ANON_STRUCT__\n   __CL_ANON_STRUCT__ struct{ cl_uint  x, y, z, w; };\n   __CL_ANON_STRUCT__ struct{ cl_uint  s0, s1, s2, s3, s4, s5, s6, s7; };\n   __CL_ANON_STRUCT__ struct{ cl_uint4 lo, hi; };\n#endif\n#if defined( __CL_UINT2__)\n    __cl_uint2     v2[4];\n#endif\n#if defined( __CL_UINT4__)\n    __cl_uint4     v4[2];\n#endif\n#if defined( __CL_UINT8__ )\n    __cl_uint8     v8;\n#endif\n}cl_uint8;\n\ntypedef union\n{\n    cl_uint  CL_ALIGNED(64) s[16];\n#if __CL_HAS_ANON_STRUCT__\n   __CL_ANON_STRUCT__ struct{ cl_uint  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };\n   __CL_ANON_STRUCT__ struct{ cl_uint  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };\n   __CL_ANON_STRUCT__ struct{ cl_uint8 lo, hi; };\n#endif\n#if defined( __CL_UINT2__)\n    __cl_uint2     v2[8];\n#endif\n#if defined( __CL_UINT4__)\n    __cl_uint4     v4[4];\n#endif\n#if defined( __CL_UINT8__ )\n    __cl_uint8     v8[2];\n#endif\n#if defined( __CL_UINT16__ )\n    __cl_uint16    v16;\n#endif\n}cl_uint16;\n\n/* ---- cl_longn ---- */\ntypedef union\n{\n    cl_long  CL_ALIGNED(16) s[2];\n#if __CL_HAS_ANON_STRUCT__\n   __CL_ANON_STRUCT__ struct{ cl_long  x, y; };\n   __CL_ANON_STRUCT__ struct{ cl_long  s0, s1; };\n   __CL_ANON_STRUCT__ struct{ cl_long  lo, hi; };\n#endif\n#if defined( __CL_LONG2__)\n    __cl_long2     v2;\n#endif\n}cl_long2;\n\ntypedef union\n{\n    cl_long  CL_ALIGNED(32) s[4];\n#if __CL_HAS_ANON_STRUCT__\n   __CL_ANON_STRUCT__ struct{ 
cl_long  x, y, z, w; };\n   __CL_ANON_STRUCT__ struct{ cl_long  s0, s1, s2, s3; };\n   __CL_ANON_STRUCT__ struct{ cl_long2 lo, hi; };\n#endif\n#if defined( __CL_LONG2__)\n    __cl_long2     v2[2];\n#endif\n#if defined( __CL_LONG4__)\n    __cl_long4     v4;\n#endif\n}cl_long4;\n\n/* cl_long3 is identical in size, alignment and behavior to cl_long4. See section 6.1.5. */\ntypedef  cl_long4  cl_long3;\n\ntypedef union\n{\n    cl_long   CL_ALIGNED(64) s[8];\n#if __CL_HAS_ANON_STRUCT__\n   __CL_ANON_STRUCT__ struct{ cl_long  x, y, z, w; };\n   __CL_ANON_STRUCT__ struct{ cl_long  s0, s1, s2, s3, s4, s5, s6, s7; };\n   __CL_ANON_STRUCT__ struct{ cl_long4 lo, hi; };\n#endif\n#if defined( __CL_LONG2__)\n    __cl_long2     v2[4];\n#endif\n#if defined( __CL_LONG4__)\n    __cl_long4     v4[2];\n#endif\n#if defined( __CL_LONG8__ )\n    __cl_long8     v8;\n#endif\n}cl_long8;\n\ntypedef union\n{\n    cl_long  CL_ALIGNED(128) s[16];\n#if __CL_HAS_ANON_STRUCT__\n   __CL_ANON_STRUCT__ struct{ cl_long  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };\n   __CL_ANON_STRUCT__ struct{ cl_long  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };\n   __CL_ANON_STRUCT__ struct{ cl_long8 lo, hi; };\n#endif\n#if defined( __CL_LONG2__)\n    __cl_long2     v2[8];\n#endif\n#if defined( __CL_LONG4__)\n    __cl_long4     v4[4];\n#endif\n#if defined( __CL_LONG8__ )\n    __cl_long8     v8[2];\n#endif\n#if defined( __CL_LONG16__ )\n    __cl_long16    v16;\n#endif\n}cl_long16;\n\n\n/* ---- cl_ulongn ---- */\ntypedef union\n{\n    cl_ulong  CL_ALIGNED(16) s[2];\n#if __CL_HAS_ANON_STRUCT__\n   __CL_ANON_STRUCT__ struct{ cl_ulong  x, y; };\n   __CL_ANON_STRUCT__ struct{ cl_ulong  s0, s1; };\n   __CL_ANON_STRUCT__ struct{ cl_ulong  lo, hi; };\n#endif\n#if defined( __CL_ULONG2__)\n    __cl_ulong2     v2;\n#endif\n}cl_ulong2;\n\ntypedef union\n{\n    cl_ulong  CL_ALIGNED(32) s[4];\n#if __CL_HAS_ANON_STRUCT__\n   __CL_ANON_STRUCT__ 
struct{ cl_ulong  x, y, z, w; };\n   __CL_ANON_STRUCT__ struct{ cl_ulong  s0, s1, s2, s3; };\n   __CL_ANON_STRUCT__ struct{ cl_ulong2 lo, hi; };\n#endif\n#if defined( __CL_ULONG2__)\n    __cl_ulong2     v2[2];\n#endif\n#if defined( __CL_ULONG4__)\n    __cl_ulong4     v4;\n#endif\n}cl_ulong4;\n\n/* cl_ulong3 is identical in size, alignment and behavior to cl_ulong4. See section 6.1.5. */\ntypedef  cl_ulong4  cl_ulong3;\n\ntypedef union\n{\n    cl_ulong   CL_ALIGNED(64) s[8];\n#if __CL_HAS_ANON_STRUCT__\n   __CL_ANON_STRUCT__ struct{ cl_ulong  x, y, z, w; };\n   __CL_ANON_STRUCT__ struct{ cl_ulong  s0, s1, s2, s3, s4, s5, s6, s7; };\n   __CL_ANON_STRUCT__ struct{ cl_ulong4 lo, hi; };\n#endif\n#if defined( __CL_ULONG2__)\n    __cl_ulong2     v2[4];\n#endif\n#if defined( __CL_ULONG4__)\n    __cl_ulong4     v4[2];\n#endif\n#if defined( __CL_ULONG8__ )\n    __cl_ulong8     v8;\n#endif\n}cl_ulong8;\n\ntypedef union\n{\n    cl_ulong  CL_ALIGNED(128) s[16];\n#if __CL_HAS_ANON_STRUCT__\n   __CL_ANON_STRUCT__ struct{ cl_ulong  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };\n   __CL_ANON_STRUCT__ struct{ cl_ulong  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };\n   __CL_ANON_STRUCT__ struct{ cl_ulong8 lo, hi; };\n#endif\n#if defined( __CL_ULONG2__)\n    __cl_ulong2     v2[8];\n#endif\n#if defined( __CL_ULONG4__)\n    __cl_ulong4     v4[4];\n#endif\n#if defined( __CL_ULONG8__ )\n    __cl_ulong8     v8[2];\n#endif\n#if defined( __CL_ULONG16__ )\n    __cl_ulong16    v16;\n#endif\n}cl_ulong16;\n\n\n/* --- cl_floatn ---- */\n\ntypedef union\n{\n    cl_float  CL_ALIGNED(8) s[2];\n#if __CL_HAS_ANON_STRUCT__\n   __CL_ANON_STRUCT__ struct{ cl_float  x, y; };\n   __CL_ANON_STRUCT__ struct{ cl_float  s0, s1; };\n   __CL_ANON_STRUCT__ struct{ cl_float  lo, hi; };\n#endif\n#if defined( __CL_FLOAT2__)\n    __cl_float2     v2;\n#endif\n}cl_float2;\n\ntypedef union\n{\n    cl_float  CL_ALIGNED(16) s[4];\n#if 
__CL_HAS_ANON_STRUCT__\n   __CL_ANON_STRUCT__ struct{ cl_float   x, y, z, w; };\n   __CL_ANON_STRUCT__ struct{ cl_float   s0, s1, s2, s3; };\n   __CL_ANON_STRUCT__ struct{ cl_float2  lo, hi; };\n#endif\n#if defined( __CL_FLOAT2__)\n    __cl_float2     v2[2];\n#endif\n#if defined( __CL_FLOAT4__)\n    __cl_float4     v4;\n#endif\n}cl_float4;\n\n/* cl_float3 is identical in size, alignment and behavior to cl_float4. See section 6.1.5. */\ntypedef  cl_float4  cl_float3;\n\ntypedef union\n{\n    cl_float   CL_ALIGNED(32) s[8];\n#if __CL_HAS_ANON_STRUCT__\n   __CL_ANON_STRUCT__ struct{ cl_float   x, y, z, w; };\n   __CL_ANON_STRUCT__ struct{ cl_float   s0, s1, s2, s3, s4, s5, s6, s7; };\n   __CL_ANON_STRUCT__ struct{ cl_float4  lo, hi; };\n#endif\n#if defined( __CL_FLOAT2__)\n    __cl_float2     v2[4];\n#endif\n#if defined( __CL_FLOAT4__)\n    __cl_float4     v4[2];\n#endif\n#if defined( __CL_FLOAT8__ )\n    __cl_float8     v8;\n#endif\n}cl_float8;\n\ntypedef union\n{\n    cl_float  CL_ALIGNED(64) s[16];\n#if __CL_HAS_ANON_STRUCT__\n   __CL_ANON_STRUCT__ struct{ cl_float  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };\n   __CL_ANON_STRUCT__ struct{ cl_float  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };\n   __CL_ANON_STRUCT__ struct{ cl_float8 lo, hi; };\n#endif\n#if defined( __CL_FLOAT2__)\n    __cl_float2     v2[8];\n#endif\n#if defined( __CL_FLOAT4__)\n    __cl_float4     v4[4];\n#endif\n#if defined( __CL_FLOAT8__ )\n    __cl_float8     v8[2];\n#endif\n#if defined( __CL_FLOAT16__ )\n    __cl_float16    v16;\n#endif\n}cl_float16;\n\n/* --- cl_doublen ---- */\n\ntypedef union\n{\n    cl_double  CL_ALIGNED(16) s[2];\n#if __CL_HAS_ANON_STRUCT__\n   __CL_ANON_STRUCT__ struct{ cl_double  x, y; };\n   __CL_ANON_STRUCT__ struct{ cl_double s0, s1; };\n   __CL_ANON_STRUCT__ struct{ cl_double lo, hi; };\n#endif\n#if defined( __CL_DOUBLE2__)\n    __cl_double2     v2;\n#endif\n}cl_double2;\n\ntypedef 
union\n{\n    cl_double  CL_ALIGNED(32) s[4];\n#if __CL_HAS_ANON_STRUCT__\n   __CL_ANON_STRUCT__ struct{ cl_double  x, y, z, w; };\n   __CL_ANON_STRUCT__ struct{ cl_double  s0, s1, s2, s3; };\n   __CL_ANON_STRUCT__ struct{ cl_double2 lo, hi; };\n#endif\n#if defined( __CL_DOUBLE2__)\n    __cl_double2     v2[2];\n#endif\n#if defined( __CL_DOUBLE4__)\n    __cl_double4     v4;\n#endif\n}cl_double4;\n\n/* cl_double3 is identical in size, alignment and behavior to cl_double4. See section 6.1.5. */\ntypedef  cl_double4  cl_double3;\n\ntypedef union\n{\n    cl_double   CL_ALIGNED(64) s[8];\n#if __CL_HAS_ANON_STRUCT__\n   __CL_ANON_STRUCT__ struct{ cl_double  x, y, z, w; };\n   __CL_ANON_STRUCT__ struct{ cl_double  s0, s1, s2, s3, s4, s5, s6, s7; };\n   __CL_ANON_STRUCT__ struct{ cl_double4 lo, hi; };\n#endif\n#if defined( __CL_DOUBLE2__)\n    __cl_double2     v2[4];\n#endif\n#if defined( __CL_DOUBLE4__)\n    __cl_double4     v4[2];\n#endif\n#if defined( __CL_DOUBLE8__ )\n    __cl_double8     v8;\n#endif\n}cl_double8;\n\ntypedef union\n{\n    cl_double  CL_ALIGNED(128) s[16];\n#if __CL_HAS_ANON_STRUCT__\n   __CL_ANON_STRUCT__ struct{ cl_double  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };\n   __CL_ANON_STRUCT__ struct{ cl_double  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };\n   __CL_ANON_STRUCT__ struct{ cl_double8 lo, hi; };\n#endif\n#if defined( __CL_DOUBLE2__)\n    __cl_double2     v2[8];\n#endif\n#if defined( __CL_DOUBLE4__)\n    __cl_double4     v4[4];\n#endif\n#if defined( __CL_DOUBLE8__ )\n    __cl_double8     v8[2];\n#endif\n#if defined( __CL_DOUBLE16__ )\n    __cl_double16    v16;\n#endif\n}cl_double16;\n\n/* Macro to facilitate debugging\n * Usage:\n *   Place CL_PROGRAM_STRING_DEBUG_INFO on the line before the first line of your source.\n *   The first line ends with:   CL_PROGRAM_STRING_DEBUG_INFO \\\"\n *   Each line thereafter of OpenCL C source must end with: \\n\\\n *   The 
last line ends in \";\n *\n *   Example:\n *\n *   const char *my_program = CL_PROGRAM_STRING_DEBUG_INFO \"\\\n *   kernel void foo( int a, float * b )             \\n\\\n *   {                                               \\n\\\n *      // my comment                                \\n\\\n *      *b[ get_global_id(0)] = a;                   \\n\\\n *   }                                               \\n\\\n *   \";\n *\n * This should correctly set up the line, (column) and file information for your source\n * string so you can do source level debugging.\n */\n#define  __CL_STRINGIFY( _x )               # _x\n#define  _CL_STRINGIFY( _x )                __CL_STRINGIFY( _x )\n#define  CL_PROGRAM_STRING_DEBUG_INFO       \"#line \"  _CL_STRINGIFY(__LINE__) \" \\\"\" __FILE__ \"\\\" \\n\\n\"\n\n#ifdef __cplusplus\n}\n#endif\n\n#undef __CL_HAS_ANON_STRUCT__\n#undef __CL_ANON_STRUCT__\n#if defined( _WIN32) && defined(_MSC_VER)\n    #if _MSC_VER >=1500\n    #pragma warning( pop )\n    #endif\n#endif\n\n#endif  /* __CL_PLATFORM_H  */\n"
  },
  {
    "path": "GpuMemLatency/OpenCL/include/CL/cl_va_api_media_sharing_intel.h",
    "content": "/*******************************************************************************\n * Copyright (c) 2008-2020 The Khronos Group Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *    http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n ******************************************************************************/\n/*****************************************************************************\\\n\nCopyright (c) 2013-2019 Intel Corporation All Rights Reserved.\n\nTHESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\n\"AS IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\nLIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\nA PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS\nCONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\nEXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\nPROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\nPROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY\nOF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING\nNEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE\nMATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\nFile Name: cl_va_api_media_sharing_intel.h\n\nAbstract:\n\nNotes:\n\n\\*****************************************************************************/\n\n\n#ifndef __OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H\n#define __OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H\n\n#include <CL/cl.h>\n#include <CL/cl_platform.h>\n#include <va/va.h>\n\n#ifdef __cplusplus\nextern \"C\" {\n#endif\n\n/******************************************\n* cl_intel_va_api_media_sharing extension *\n*******************************************/\n\n#define cl_intel_va_api_media_sharing 1\n\n/* error codes */\n#define CL_INVALID_VA_API_MEDIA_ADAPTER_INTEL               -1098\n#define CL_INVALID_VA_API_MEDIA_SURFACE_INTEL               -1099\n#define CL_VA_API_MEDIA_SURFACE_ALREADY_ACQUIRED_INTEL      -1100\n#define CL_VA_API_MEDIA_SURFACE_NOT_ACQUIRED_INTEL          -1101\n\n/* cl_va_api_device_source_intel */\n#define CL_VA_API_DISPLAY_INTEL                             0x4094\n\n/* cl_va_api_device_set_intel */\n#define CL_PREFERRED_DEVICES_FOR_VA_API_INTEL               0x4095\n#define CL_ALL_DEVICES_FOR_VA_API_INTEL                     0x4096\n\n/* cl_context_info */\n#define CL_CONTEXT_VA_API_DISPLAY_INTEL                     0x4097\n\n/* cl_mem_info */\n#define CL_MEM_VA_API_MEDIA_SURFACE_INTEL                   0x4098\n\n/* cl_image_info */\n#define CL_IMAGE_VA_API_PLANE_INTEL                         0x4099\n\n/* cl_command_type */\n#define CL_COMMAND_ACQUIRE_VA_API_MEDIA_SURFACES_INTEL      
0x409A\n#define CL_COMMAND_RELEASE_VA_API_MEDIA_SURFACES_INTEL      0x409B\n\ntypedef cl_uint cl_va_api_device_source_intel;\ntypedef cl_uint cl_va_api_device_set_intel;\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclGetDeviceIDsFromVA_APIMediaAdapterINTEL(\n    cl_platform_id                platform,\n    cl_va_api_device_source_intel media_adapter_type,\n    void*                         media_adapter,\n    cl_va_api_device_set_intel    media_adapter_set,\n    cl_uint                       num_entries,\n    cl_device_id*                 devices,\n    cl_uint*                      num_devices) CL_EXT_SUFFIX__VERSION_1_2;\n\ntypedef CL_API_ENTRY cl_int (CL_API_CALL * clGetDeviceIDsFromVA_APIMediaAdapterINTEL_fn)(\n    cl_platform_id                platform,\n    cl_va_api_device_source_intel media_adapter_type,\n    void*                         media_adapter,\n    cl_va_api_device_set_intel    media_adapter_set,\n    cl_uint                       num_entries,\n    cl_device_id*                 devices,\n    cl_uint*                      num_devices) CL_EXT_SUFFIX__VERSION_1_2;\n\nextern CL_API_ENTRY cl_mem CL_API_CALL\nclCreateFromVA_APIMediaSurfaceINTEL(\n    cl_context                    context,\n    cl_mem_flags                  flags,\n    VASurfaceID*                  surface,\n    cl_uint                       plane,\n    cl_int*                       errcode_ret) CL_EXT_SUFFIX__VERSION_1_2;\n\ntypedef CL_API_ENTRY cl_mem (CL_API_CALL * clCreateFromVA_APIMediaSurfaceINTEL_fn)(\n    cl_context                    context,\n    cl_mem_flags                  flags,\n    VASurfaceID*                  surface,\n    cl_uint                       plane,\n    cl_int*                       errcode_ret) CL_EXT_SUFFIX__VERSION_1_2;\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclEnqueueAcquireVA_APIMediaSurfacesINTEL(\n    cl_command_queue              command_queue,\n    cl_uint                       num_objects,\n    const cl_mem*                 mem_objects,\n    cl_uint 
                      num_events_in_wait_list,\n    const cl_event*               event_wait_list,\n    cl_event*                     event) CL_EXT_SUFFIX__VERSION_1_2;\n\ntypedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireVA_APIMediaSurfacesINTEL_fn)(\n    cl_command_queue              command_queue,\n    cl_uint                       num_objects,\n    const cl_mem*                 mem_objects,\n    cl_uint                       num_events_in_wait_list,\n    const cl_event*               event_wait_list,\n    cl_event*                     event) CL_EXT_SUFFIX__VERSION_1_2;\n\nextern CL_API_ENTRY cl_int CL_API_CALL\nclEnqueueReleaseVA_APIMediaSurfacesINTEL(\n    cl_command_queue              command_queue,\n    cl_uint                       num_objects,\n    const cl_mem*                 mem_objects,\n    cl_uint                       num_events_in_wait_list,\n    const cl_event*               event_wait_list,\n    cl_event*                     event) CL_EXT_SUFFIX__VERSION_1_2;\n\ntypedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseVA_APIMediaSurfacesINTEL_fn)(\n    cl_command_queue              command_queue,\n    cl_uint                       num_objects,\n    const cl_mem*                 mem_objects,\n    cl_uint                       num_events_in_wait_list,\n    const cl_event*               event_wait_list,\n    cl_event*                     event) CL_EXT_SUFFIX__VERSION_1_2;\n\n#ifdef __cplusplus\n}\n#endif\n\n#endif  /* __OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H */\n"
  },
  {
    "path": "GpuMemLatency/OpenCL/include/CL/cl_version.h",
    "content": "/*******************************************************************************\n * Copyright (c) 2018-2020 The Khronos Group Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *    http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n ******************************************************************************/\n\n#ifndef __CL_VERSION_H\n#define __CL_VERSION_H\n\n/* Detect which version to target */\n#if !defined(CL_TARGET_OPENCL_VERSION)\n#pragma message(\"cl_version.h: CL_TARGET_OPENCL_VERSION is not defined. Defaulting to 220 (OpenCL 2.2)\")\n#define CL_TARGET_OPENCL_VERSION 220\n#endif\n#if CL_TARGET_OPENCL_VERSION != 100 && \\\n    CL_TARGET_OPENCL_VERSION != 110 && \\\n    CL_TARGET_OPENCL_VERSION != 120 && \\\n    CL_TARGET_OPENCL_VERSION != 200 && \\\n    CL_TARGET_OPENCL_VERSION != 210 && \\\n    CL_TARGET_OPENCL_VERSION != 220 && \\\n    CL_TARGET_OPENCL_VERSION != 300\n#pragma message(\"cl_version: CL_TARGET_OPENCL_VERSION is not a valid value (100, 110, 120, 200, 210, 220, 300). 
Defaulting to 220 (OpenCL 2.2)\")\n#undef CL_TARGET_OPENCL_VERSION\n#define CL_TARGET_OPENCL_VERSION 220\n#endif\n\n\n/* OpenCL Version */\n#if CL_TARGET_OPENCL_VERSION >= 300 && !defined(CL_VERSION_3_0)\n#define CL_VERSION_3_0  1\n#endif\n#if CL_TARGET_OPENCL_VERSION >= 220 && !defined(CL_VERSION_2_2)\n#define CL_VERSION_2_2  1\n#endif\n#if CL_TARGET_OPENCL_VERSION >= 210 && !defined(CL_VERSION_2_1)\n#define CL_VERSION_2_1  1\n#endif\n#if CL_TARGET_OPENCL_VERSION >= 200 && !defined(CL_VERSION_2_0)\n#define CL_VERSION_2_0  1\n#endif\n#if CL_TARGET_OPENCL_VERSION >= 120 && !defined(CL_VERSION_1_2)\n#define CL_VERSION_1_2  1\n#endif\n#if CL_TARGET_OPENCL_VERSION >= 110 && !defined(CL_VERSION_1_1)\n#define CL_VERSION_1_1  1\n#endif\n#if CL_TARGET_OPENCL_VERSION >= 100 && !defined(CL_VERSION_1_0)\n#define CL_VERSION_1_0  1\n#endif\n\n/* Allow deprecated APIs for older OpenCL versions. */\n#if CL_TARGET_OPENCL_VERSION <= 220 && !defined(CL_USE_DEPRECATED_OPENCL_2_2_APIS)\n#define CL_USE_DEPRECATED_OPENCL_2_2_APIS\n#endif\n#if CL_TARGET_OPENCL_VERSION <= 210 && !defined(CL_USE_DEPRECATED_OPENCL_2_1_APIS)\n#define CL_USE_DEPRECATED_OPENCL_2_1_APIS\n#endif\n#if CL_TARGET_OPENCL_VERSION <= 200 && !defined(CL_USE_DEPRECATED_OPENCL_2_0_APIS)\n#define CL_USE_DEPRECATED_OPENCL_2_0_APIS\n#endif\n#if CL_TARGET_OPENCL_VERSION <= 120 && !defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS)\n#define CL_USE_DEPRECATED_OPENCL_1_2_APIS\n#endif\n#if CL_TARGET_OPENCL_VERSION <= 110 && !defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)\n#define CL_USE_DEPRECATED_OPENCL_1_1_APIS\n#endif\n#if CL_TARGET_OPENCL_VERSION <= 100 && !defined(CL_USE_DEPRECATED_OPENCL_1_0_APIS)\n#define CL_USE_DEPRECATED_OPENCL_1_0_APIS\n#endif\n\n#endif  /* __CL_VERSION_H */\n"
  },
  {
    "path": "GpuMemLatency/OpenCL/include/CL/opencl.h",
    "content": "/*******************************************************************************\n * Copyright (c) 2008-2020 The Khronos Group Inc.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *    http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n ******************************************************************************/\n\n#ifndef __OPENCL_H\n#define __OPENCL_H\n\n#ifdef __cplusplus\nextern \"C\" {\n#endif\n\n#include <CL/cl.h>\n#include <CL/cl_gl.h>\n#include <CL/cl_gl_ext.h>\n#include <CL/cl_ext.h>\n\n#ifdef __cplusplus\n}\n#endif\n\n#endif  /* __OPENCL_H   */\n"
  },
  {
    "path": "GpuMemLatency/atomic_test.c",
    "content": "#include \"opencltest.h\"\r\n\r\nfloat int_atomic_latency_test(cl_context context,\r\n    cl_command_queue command_queue,\r\n    cl_kernel kernel,\r\n    uint32_t iterations,\r\n    short local,\r\n    uint32_t *time_ms)\r\n{\r\n    cl_int ret;\r\n    cl_int result = 0;\r\n    size_t global_item_size = 2;\r\n    size_t local_item_size = 1;\r\n    float latency;\r\n    uint32_t time_diff_ms;\r\n    uint32_t A = 0;\r\n\r\n    if (local)\r\n    {\r\n        local_item_size = 2;\r\n    }\r\n\r\n    cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(uint32_t), NULL, &ret);\r\n    cl_mem result_obj = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_int), NULL, &result);\r\n    ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, sizeof(uint32_t), &A, 0, NULL, NULL);\r\n    ret = clEnqueueWriteBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(cl_int), &result, 0, NULL, NULL);\r\n    clFinish(command_queue);\r\n    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&a_mem_obj);\r\n    clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&iterations);\r\n    clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&result_obj);\r\n\r\n    start_timing();\r\n    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);\r\n    if (ret != CL_SUCCESS)\r\n    {\r\n        fprintf(stderr, \"Failed to submit kernel to command queue. 
clEnqueueNDRangeKernel returned %d\\n\", ret);\r\n        latency = 0;\r\n        goto cleanup;\r\n    }\r\n    clFinish(command_queue);\r\n    time_diff_ms = end_timing();\r\n    *time_ms = time_diff_ms;\r\n    latency = (1e6 * (float)time_diff_ms / (float)(iterations)) / 2;\r\n\r\ncleanup:\r\n    clFlush(command_queue);\r\n    clFinish(command_queue);\r\n    clReleaseMemObject(a_mem_obj);\r\n    clReleaseMemObject(result_obj);\r\n    return latency;\r\n}\r\n\r\nfloat c2c_atomic_latency_test(cl_context context,\r\n    cl_command_queue command_queue,\r\n    cl_kernel kernel,\r\n    uint32_t iterations)\r\n{\r\n    cl_int ret;\r\n    cl_int result = 0;\r\n    size_t global_item_size;\r\n    size_t local_item_size = 1;\r\n    float latency;\r\n    uint32_t time_diff_ms;\r\n    uint32_t A;\r\n\r\n    cl_uint cuCount = getCuCount();\r\n    cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(uint32_t), NULL, &ret);\r\n    cl_mem result_obj = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_int), NULL, &ret);\r\n    global_item_size = cuCount;\r\n\r\n    float* result_arr = (float*)malloc(sizeof(float) * cuCount * cuCount);\r\n\r\n    for (cl_int t1_idx = 0; t1_idx < cuCount; t1_idx++)\r\n    {\r\n        for (cl_int t2_idx = 0; t2_idx < cuCount; t2_idx++)\r\n        {\r\n            if (t1_idx == t2_idx) continue;\r\n            fprintf(stderr, \"Testing %d -> %d\\n\", t1_idx, t2_idx);\r\n            A = 0;\r\n            ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, sizeof(uint32_t), &A, 0, NULL, NULL);\r\n            ret = clEnqueueWriteBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(cl_int), &result, 0, NULL, NULL);\r\n            clFinish(command_queue);\r\n            clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&a_mem_obj);\r\n            clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&iterations);\r\n            clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&result_obj);\r\n            clSetKernelArg(kernel, 3, 
sizeof(cl_int), (void*)&t1_idx);\r\n            clSetKernelArg(kernel, 4, sizeof(cl_int), (void*)&t2_idx);\r\n\r\n            start_timing();\r\n            ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);\r\n            if (ret != CL_SUCCESS)\r\n            {\r\n                fprintf(stderr, \"Failed to submit kernel to command queue. clEnqueueNDRangeKernel returned %d\\n\", ret);\r\n                latency = 0;\r\n                goto cleanup;\r\n            }\r\n            clFinish(command_queue);\r\n            time_diff_ms = end_timing();\r\n            latency = (1e6 * (float)time_diff_ms / (float)(iterations)) / 2;\r\n            fprintf(stderr, \"%d -> %d: %f\\n\", t1_idx, t2_idx, latency);\r\n            result_arr[t1_idx * cuCount + t2_idx] = latency;\r\n        }\r\n    }\r\n\r\n    for (cl_int i = 0; i < cuCount; i++)\r\n    {\r\n        printf(\",%d\", i);\r\n    }\r\n    printf(\"\\n\");\r\n\r\n    for (cl_int t1_idx = 0; t1_idx < cuCount; t1_idx++)\r\n    {\r\n        printf(\"%d\", t1_idx);\r\n        for (cl_int t2_idx = 0; t2_idx < cuCount; t2_idx++)\r\n        {\r\n            if (t1_idx == t2_idx) printf(\",x\");\r\n            else printf(\",%f\", result_arr[t1_idx * cuCount + t2_idx]);\r\n        }\r\n        printf(\"\\n\");\r\n    }\r\n\r\ncleanup:\r\n    clFlush(command_queue);\r\n    clFinish(command_queue);\r\n    clReleaseMemObject(a_mem_obj);\r\n    clReleaseMemObject(result_obj);\r\n    free(result_arr);\r\n    return latency;\r\n}\n\nfloat int_atomic_add_test(cl_context context,\n    cl_command_queue command_queue,\n    cl_kernel kernel,\n    size_t threads,\n    size_t localsize)\n{\n    // Loop unroll factor\n    const float opsPerIteration = 8.0f;\n    cl_int ret;\n    int64_t time_diff_ms = 0;\n    float gOpsPerSec;\n    uint32_t iterations = 7000;\n    uint32_t* A = (uint32_t*)malloc(sizeof(uint32_t) * threads);\n    for (int i = 0; i < threads; i++) A[i] = 
i;\n\n    cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(uint32_t) * threads, NULL, &ret);\n    ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, sizeof(uint32_t) * threads, A, 0, NULL, NULL);\n    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&a_mem_obj);\n    clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&iterations);\n    clFinish(command_queue);\n\n    while (time_diff_ms < TARGET_TIME_MS / 2) {\n        start_timing();\n        ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &threads, &localsize, 0, NULL, NULL);\n        if (ret != CL_SUCCESS)\n        {\n            fprintf(stderr, \"Failed to submit kernel to command queue. clEnqueueNDRangeKernel returned %d\\n\", ret);\n            gOpsPerSec = 0;\n            goto int_atomic_add_test_end;\n        }\n\n        clFinish(command_queue);\n        time_diff_ms = end_timing();\n        float totalOps = (float)iterations * opsPerIteration * (float)threads;\n        gOpsPerSec = ((float)totalOps / 1e9) / ((float)time_diff_ms / 1000);\n        fprintf(stderr, \"GOPS: %f, elapsed time: %lld\\n\", gOpsPerSec, time_diff_ms);\n\n        iterations = adjust_iterations(iterations, time_diff_ms);\n        clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&iterations);\n    }\n\n\nint_atomic_add_test_end:\n    clReleaseMemObject(a_mem_obj);\n    free(A);\n    return gOpsPerSec;\n}"
  },
  {
    "path": "GpuMemLatency/bw_test.c",
    "content": "#include \"opencltest.h\"\r\n\r\nfloat bw_test(cl_context context,\r\n    cl_command_queue command_queue,\r\n    cl_kernel kernel,\r\n    uint64_t list_size,\r\n    uint32_t thread_count,\r\n    uint32_t local_size,\r\n    uint32_t skip,\r\n    uint32_t chase_iterations)\r\n{\r\n    size_t global_item_size = thread_count;\r\n    size_t local_item_size = local_size;\r\n    float bandwidth, total_data_gb;\r\n    cl_int ret;\r\n    cl_int float4size = list_size / 4;\r\n    int64_t time_diff_ms;\r\n\r\n    if (skip == 0)\r\n    {\r\n        // nemes's read-combining-defeating heuristic\r\n        uint32_t region_size = list_size * sizeof(float);\r\n        uint32_t current_region_steps = (uint32_t)(region_size / (local_size * 4));\r\n        skip = (chase_iterations + current_region_steps + 1) * local_size * 4;\r\n    }\r\n\r\n    float* A = (float*)malloc(sizeof(float) * list_size);\r\n    float* result = (float*)malloc(sizeof(float) * thread_count);\r\n\r\n    if (!A || !result)\r\n    {\r\n        fprintf(stderr, \"Failed to allocate memory for test size %lu KB\\n\", list_size);\r\n    }\r\n\r\n    // assume that cl_uint size is 4 bytes, same as float size\r\n    cl_uint* start_offsets = (cl_uint*)malloc(sizeof(cl_uint) * thread_count);\r\n    cl_uint* calculated_offsets = (cl_uint*)malloc(sizeof(cl_uint) * thread_count);\r\n    memset(calculated_offsets, 0, sizeof(uint32_t) * thread_count);\r\n    for (uint32_t i = 0; i < list_size; i++)\r\n    {\r\n        A[i] = (float)(i * 0.5);\r\n    }\r\n\r\n    // tell each thread where to start\r\n    for (uint32_t i = 0; i < thread_count; i++)\r\n    {\r\n        uint32_t localId = i % local_size;\r\n        uint32_t groupId = i / local_size;\r\n        start_offsets[i] = (cl_uint)((groupId * skip * local_size + localId) % (float4size - 1));\r\n\r\n        // randomly start each workgroup somewhere - ends up being really bad\r\n        /*cl_uint groupOffset = rand() % (float4size / local_size);\r\n        
start_offsets[i] = (cl_uint)((groupOffset * local_size + localId) % (float4size - 1));*/\r\n    }\r\n\r\n    // copy array to device\r\n    cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, list_size * sizeof(float), NULL, &ret);\r\n    ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, list_size * sizeof(float), A, 0, NULL, NULL);\r\n    cl_mem result_obj = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * thread_count, NULL, &ret);\r\n    ret = clEnqueueWriteBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(float) * thread_count, result, 0, NULL, NULL);\r\n    cl_mem start_offsets_obj = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_uint) * thread_count, NULL, &ret);\r\n    if (ret != 0) fprintf(stderr, \"create buffer for start offsets failed. ret = %d\\n\", ret);\r\n    ret = clEnqueueWriteBuffer(command_queue, start_offsets_obj, CL_TRUE, 0, sizeof(cl_uint) * thread_count, start_offsets, 0, NULL, NULL);\r\n    if (ret != 0) fprintf(stderr, \"enqueue write buffer for start offsets failed. 
ret = %d\\n\", ret);\r\n\r\n    // Set kernel arguments for __kernel void sum_bw_test(__global float* A, int count, int float4size, __global float* ret, int skip, __global int *startPositions)\r\n    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&a_mem_obj);\r\n    clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&chase_iterations);\r\n    clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&float4size);\r\n    clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&result_obj);\r\n    clSetKernelArg(kernel, 4, sizeof(cl_int), (void*)&skip);\r\n    clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&start_offsets_obj);\r\n    clFinish(command_queue); // writes should be blocking, but are they?\r\n\r\n    start_timing();\r\n    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);\r\n    if (ret != CL_SUCCESS)\r\n    {\r\n        fprintf(stderr, \"Failed to submit kernel to command queue. clEnqueueNDRangeKernel returned %d\\n\", ret);\r\n        bandwidth = 0;\r\n        goto cleanup;\r\n    }\r\n\r\n    ret = clFinish(command_queue); // returns success even when TDR happens?\r\n    if (ret != CL_SUCCESS)\r\n    {\r\n        printf(\"Failed to finish command queue. clFinish returned %d\\n\", ret);\r\n        bandwidth = 0;\r\n        goto cleanup;\r\n    }\r\n\r\n    time_diff_ms = end_timing();\r\n\r\n    // each thread does iterations reads\r\n    total_data_gb = sizeof(float) * ((float)chase_iterations * thread_count + thread_count) / 1e9;\r\n    bandwidth = 1000 * (float)total_data_gb / (float)time_diff_ms;\r\n\r\n    //fprintf(stderr, \"%llu ms, %llu GB\\n\", time_diff_ms, total_data_gb);\r\n\r\n    ret = clEnqueueReadBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(uint32_t) * thread_count, result, 0, NULL, NULL);\r\n    if (ret != 0) fprintf(stderr, \"enqueue read buffer for result failed. 
ret = %d\\n\", ret);\r\n    clFinish(command_queue);\r\n\r\n    ret = clEnqueueReadBuffer(command_queue, start_offsets_obj, CL_TRUE, 0, sizeof(uint32_t) * thread_count, calculated_offsets, 0, NULL, NULL);\r\n    if (ret != 0) fprintf(stderr, \"enqueue read buffer for start offsets failed. ret = %d\\n\", ret);\r\n    clFinish(command_queue);\r\n\r\n    if (memcmp(calculated_offsets, start_offsets, sizeof(uint32_t) * thread_count))\r\n    {\r\n        fprintf(stderr, \"mismatch in calculated start offsets\\n\");\r\n        for (uint32_t i = 0; i < thread_count; i++)\r\n        {\r\n            if (calculated_offsets[i] != start_offsets[i]) {\r\n                fprintf(stderr, \"At index %u, calculated from GPU = %u, calculated on CPU = %u. skip=%u\\n\", i, calculated_offsets[i], start_offsets[i], skip);\r\n                break;\r\n            }\r\n        }\r\n    }\r\n\r\n    //fprintf(stderr, \"Finished reading result. Sum: %d\\n\", result[0]);\r\n\r\ncleanup:\r\n    clFlush(command_queue);\r\n    clFinish(command_queue);\r\n    clReleaseMemObject(a_mem_obj);\r\n    clReleaseMemObject(result_obj);\r\n    clReleaseMemObject(start_offsets_obj);\r\n    free(A);\r\n    free(result);\r\n    free(start_offsets);\r\n    free(calculated_offsets);\r\n    return bandwidth;\r\n}\r\n\r\nfloat tex_bw_test(cl_context context,\r\n    cl_command_queue command_queue,\r\n    cl_kernel kernel,\r\n    uint64_t width,\r\n    uint64_t height,\r\n    uint32_t thread_count,\r\n    uint32_t local_size,\r\n    uint32_t randomize,\r\n    uint32_t chase_iterations,\r\n    int64_t *time_ms)\r\n{\r\n    size_t global_item_size = thread_count;\r\n    size_t local_item_size = local_size;\r\n    float texels = 0;\r\n    cl_int ret;\r\n    int64_t time_diff_ms;\r\n    uint64_t tex_array_size = 3 * width * height; // texture size in bytes\r\n    cl_mem tex_mem_obj = NULL, a_mem_obj = NULL, result_obj = NULL;\r\n\r\n    float* A = (float*)malloc(sizeof(float) * tex_array_size);\r\n    float* result 
= (float*)malloc(sizeof(float) * thread_count);\r\n\r\n    if (!A || !result)\r\n    {\r\n        fprintf(stderr, \"Failed to allocate memory for %lu x %lu texture\\n\", width, height);\r\n    }\r\n\r\n    // fill array\r\n    for (uint64_t i = 0; i < tex_array_size; i++)\r\n    {\r\n        A[i] = randomize ? rand() * 0.2f : (float)(i * 0.5);\r\n    }\r\n\r\n    // create texture from it\r\n    //a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, tex_array_size * sizeof(float), A, &ret);\r\n    //ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, tex_array_size * sizeof(float), A, 0, NULL, NULL);\r\n    cl_image_desc imageDesc;\r\n    memset(&imageDesc, 0, sizeof(cl_image_desc));\r\n    imageDesc.image_type = CL_MEM_OBJECT_IMAGE2D;\r\n    imageDesc.image_width = width;\r\n    imageDesc.image_height = height;\r\n    //imageDesc.mem_object = a_mem_obj;\r\n    //imageDesc.buffer = A;\r\n    cl_image_format imageFormat;\r\n    imageFormat.image_channel_order = CL_R;\r\n    imageFormat.image_channel_data_type = CL_FLOAT;\r\n    tex_mem_obj = clCreateImage(context, CL_MEM_READ_ONLY, &imageFormat, &imageDesc, A, &ret);\r\n    if (ret != CL_SUCCESS)\r\n    {\r\n        fprintf(stderr, \"Failed to create 2d texture: %d\\n\", ret);\r\n        goto tex_bw_cleanup;\r\n    }\r\n\r\n    size_t origin[] = { 0, 0, 0 };\r\n    size_t region[] = { width, height, 1 };\r\n    ret = clEnqueueWriteImage(command_queue, tex_mem_obj, CL_TRUE, origin, region, 0, 0, A, 0, NULL, NULL);\r\n    if (ret != CL_SUCCESS)\r\n    {\r\n        fprintf(stderr, \"Failed to copy 2d texture: %d\\n\", ret);\r\n        goto tex_bw_cleanup;\r\n    }\r\n\r\n    fprintf(stderr, \"Created image\\n\");\r\n\r\n    // copy array to device\r\n    result_obj = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * thread_count, NULL, &ret);\r\n    ret = clEnqueueWriteBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(float) * thread_count, result, 0, NULL, NULL);\r\n\r\n    // Set kernel 
arguments for __kernel void sum_bw_test(__global float* A, int count, int float4size, __global float* ret, int skip, __global int *startPositions)\r\n    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&tex_mem_obj);\r\n    clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&chase_iterations);\r\n    clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&result_obj);\r\n    clFinish(command_queue); // writes should be blocking, but are they?\r\n\r\n    start_timing();\r\n    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);\r\n    if (ret != CL_SUCCESS)\r\n    {\r\n        fprintf(stderr, \"Failed to submit kernel to command queue. clEnqueueNDRangeKernel returned %d\\n\", ret);\r\n        texels = 0;\r\n        goto tex_bw_cleanup;\r\n    }\r\n\r\n    ret = clFinish(command_queue); // returns success even when TDR happens?\r\n    if (ret != CL_SUCCESS)\r\n    {\r\n        printf(\"Failed to finish command queue. clFinish returned %d\\n\", ret);\r\n        texels = 0;\r\n        goto tex_bw_cleanup;\r\n    }\r\n\r\n    time_diff_ms = end_timing();\r\n    fprintf(stderr, \"elapsed time: %lld ms\\n\", time_diff_ms);\r\n\r\n    // each thread does iterations samples, and each sample returns a 4-wide vector\r\n    texels = 1000 * (float)(chase_iterations * thread_count * 4 / 1e9) / (float)time_diff_ms;\r\n    fprintf(stderr, \"%u iterations, %u threads, %lu ms\\n\", chase_iterations, thread_count, time_diff_ms);\r\n\r\n    ret = clEnqueueReadBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(uint32_t) * thread_count, result, 0, NULL, NULL);\r\n    if (ret != 0) fprintf(stderr, \"enqueue read buffer for result failed. 
ret = %d\\n\", ret);\r\n    clFinish(command_queue);\r\n\r\n    *time_ms = time_diff_ms;\r\n\r\ntex_bw_cleanup:\r\n    clFlush(command_queue);\r\n    clFinish(command_queue);\r\n    clReleaseMemObject(tex_mem_obj);\r\n    clReleaseMemObject(a_mem_obj);\r\n    clReleaseMemObject(result_obj);\r\n    free(A);\r\n    free(result);\r\n    return texels;\r\n}\r\n\r\n// must be at least as large as local memory test size in kernel\r\n// list size in 32-bit elements\r\n#define local_mem_bw_test_size 8192\r\nfloat local_bw_test(cl_context context,\r\n    cl_command_queue command_queue,\r\n    cl_kernel kernel,\r\n    uint32_t thread_count,\r\n    uint32_t local_size,\r\n    uint32_t chase_iterations,\r\n    int64_t *time_ms)\r\n{\r\n    size_t global_item_size = thread_count;\r\n    size_t local_item_size = local_size;\r\n    float bandwidth, total_data_gb;\r\n    cl_int ret;\r\n    int64_t time_diff_ms;\r\n\r\n    float* A = (uint32_t*)malloc(sizeof(uint32_t) * local_mem_bw_test_size);\r\n    float* result = (uint32_t*)malloc(sizeof(uint32_t) * thread_count);\r\n\r\n    if (!A || !result)\r\n    {\r\n        fprintf(stderr, \"Failed to allocate memory for test size %lu KB\\n\", local_mem_bw_test_size * 4);\r\n    }\r\n\r\n    for (uint32_t i = 0; i < local_mem_bw_test_size; i++)\r\n    {\r\n        A[i] = i + .02;\r\n    }\r\n\r\n    // copy array to device\r\n    cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, local_mem_bw_test_size * sizeof(float), NULL, &ret);\r\n    ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, local_mem_bw_test_size * sizeof(float), A, 0, NULL, NULL);\r\n    cl_mem result_obj = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * thread_count, NULL, &ret);\r\n    ret = clEnqueueWriteBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(float) * thread_count, result, 0, NULL, NULL);\r\n\r\n    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&a_mem_obj);\r\n    clSetKernelArg(kernel, 1, sizeof(cl_int), 
(void*)&chase_iterations);\r\n    clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&result_obj);\r\n    clFinish(command_queue); // writes should be blocking, but are they?\r\n\r\n    start_timing();\r\n    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);\r\n    if (ret != CL_SUCCESS)\r\n    {\r\n        fprintf(stderr, \"Failed to submit kernel to command queue. clEnqueueNDRangeKernel returned %d\\n\", ret);\r\n        bandwidth = 0;\r\n        goto cleanup;\r\n    }\r\n\r\n    ret = clFinish(command_queue); // returns success even when TDR happens?\r\n    if (ret != CL_SUCCESS)\r\n    {\r\n        printf(\"Failed to finish command queue. clFinish returned %d\\n\", ret);\r\n        bandwidth = 0;\r\n        goto cleanup;\r\n    }\r\n\r\n    time_diff_ms = end_timing();\r\n    *time_ms = time_diff_ms;\r\n\r\n    // each thread does iterations reads\r\n    total_data_gb = sizeof(float) * ((float)chase_iterations * thread_count) / 1e9;\r\n    bandwidth = 1000 * (float)total_data_gb / (float)time_diff_ms;\r\n\r\n    //fprintf(stderr, \"%llu ms, %llu GB\\n\", time_diff_ms, total_data_gb);\r\n\r\n    ret = clEnqueueReadBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(uint32_t) * thread_count, result, 0, NULL, NULL);\r\n    if (ret != 0) fprintf(stderr, \"enqueue read buffer for result failed. 
ret = %d\\n\", ret);\r\n    clFinish(command_queue);\r\n\r\ncleanup:\r\n    clFlush(command_queue);\r\n    clFinish(command_queue);\r\n    clReleaseMemObject(a_mem_obj);\r\n    clReleaseMemObject(result_obj);\r\n    free(A);\r\n    free(result);\r\n    return bandwidth;\r\n}\r\n\r\n#define buffer_test_size 4096 // 1024x uint4\r\nfloat buffer_bw_test(cl_context context,\r\n    cl_command_queue command_queue,\r\n    cl_kernel kernel,\r\n    uint32_t thread_count,\r\n    uint32_t local_size,\r\n    uint32_t chase_iterations,\r\n    int64_t* time_ms)\r\n{\r\n    size_t global_item_size = thread_count;\r\n    size_t local_item_size = local_size;\r\n    float bandwidth, total_data_gb;\r\n    cl_int ret;\r\n    int64_t time_diff_ms;\r\n    cl_mem result_obj;\r\n\r\n    uint32_t* A = (uint32_t*)malloc(sizeof(uint32_t) * buffer_test_size);\r\n    float* result = (uint32_t*)malloc(sizeof(float) * thread_count);\r\n\r\n    if (!A || !result)\r\n    {\r\n        fprintf(stderr, \"Failed to allocate memory for test size %lu KB\\n\", local_mem_bw_test_size * 4);\r\n    }\r\n\r\n    for (uint32_t i = 0; i < buffer_test_size; i++)\r\n    {\r\n        A[i] = i + 1;\r\n    }\r\n\r\n    // copy array to device\r\n    cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_test_size * sizeof(uint32_t), NULL, &ret);\r\n    ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, buffer_test_size * sizeof(uint32_t), A, 0, NULL, NULL);\r\n\r\n    // handle cl_image stuff\r\n    cl_image_format imageFormat;\r\n    imageFormat.image_channel_data_type = CL_UNSIGNED_INT32;\r\n    imageFormat.image_channel_order = CL_R;\r\n    cl_image_desc imageDesc;\r\n    memset(&imageDesc, 0, sizeof(cl_image_desc));\r\n    imageDesc.buffer = a_mem_obj;\r\n    imageDesc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;\r\n    imageDesc.image_width = buffer_test_size; // width in pixels\r\n    cl_mem tex_obj = tex_obj = clCreateImage(context, CL_MEM_READ_ONLY, &imageFormat, &imageDesc, NULL, 
&ret);\r\n\r\n    size_t origin[] = { 0, 0, 0 };\r\n    size_t region[] = { imageDesc.image_width, 1, 1 };\r\n    ret = clEnqueueWriteImage(command_queue, tex_obj, CL_TRUE, origin, region, 0, 0, A, 0, NULL, NULL);\r\n\r\n    result_obj = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * thread_count, NULL, &ret);\r\n    ret = clEnqueueWriteBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(float) * thread_count, result, 0, NULL, NULL);\r\n\r\n    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&tex_obj);\r\n    clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&chase_iterations);\r\n    clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&result_obj);\r\n    clFinish(command_queue); // writes should be blocking, but are they?\r\n\r\n    start_timing();\r\n    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);\r\n    if (ret != CL_SUCCESS)\r\n    {\r\n        fprintf(stderr, \"Failed to submit kernel to command queue. clEnqueueNDRangeKernel returned %d\\n\", ret);\r\n        bandwidth = 0;\r\n        goto cleanup;\r\n    }\r\n\r\n    ret = clFinish(command_queue); // returns success even when TDR happens?\r\n    if (ret != CL_SUCCESS)\r\n    {\r\n        printf(\"Failed to finish command queue. clFinish returned %d\\n\", ret);\r\n        bandwidth = 0;\r\n        goto cleanup;\r\n    }\r\n\r\n    time_diff_ms = end_timing();\r\n    *time_ms = time_diff_ms;\r\n\r\n    // each thread does iterations reads\r\n    total_data_gb = sizeof(float) * ((float)chase_iterations * thread_count) / 1e9;\r\n    bandwidth = 1000 * (float)total_data_gb / (float)time_diff_ms;\r\n\r\n    //fprintf(stderr, \"%llu ms, %llu GB\\n\", time_diff_ms, total_data_gb);\r\n\r\n    ret = clEnqueueReadBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(uint32_t) * thread_count, result, 0, NULL, NULL);\r\n    if (ret != 0) fprintf(stderr, \"enqueue read buffer for result failed. 
ret = %d\\n\", ret);\r\n    clFinish(command_queue);\r\n\r\ncleanup:\r\n    clFlush(command_queue);\r\n    clFinish(command_queue);\r\n    clReleaseMemObject(a_mem_obj);\r\n    clReleaseMemObject(result_obj);\r\n    free(A);\r\n    free(result);\r\n    return bandwidth;\r\n}\r\n\r\nfloat local_chase_bw_test(cl_context context,\r\n    cl_command_queue command_queue,\r\n    cl_kernel kernel,\r\n    uint32_t thread_count,\r\n    uint32_t local_size,\r\n    uint32_t chase_iterations,\r\n    uint32_t wave_size,\r\n    int64_t* time_ms)\r\n{\r\n    size_t global_item_size = thread_count;\r\n    size_t local_item_size = local_size;\r\n    float bandwidth, total_data_gb;\r\n    cl_int ret;\r\n    int64_t time_diff_ms;\r\n\r\n    uint32_t* A = (uint32_t*)malloc(sizeof(uint32_t) * local_mem_bw_test_size);\r\n    uint32_t* result = (uint32_t*)malloc(sizeof(uint32_t) * thread_count);\r\n\r\n    if (!A || !result)\r\n    {\r\n        fprintf(stderr, \"Failed to allocate memory for test size %lu KB\\n\", local_mem_bw_test_size * 4);\r\n    }\r\n\r\n    for (uint32_t i = 0; i < local_mem_bw_test_size; i++)\r\n    {\r\n        // assumes local_mem_bw_test_size is a power of 2.\r\n        A[i] = i + wave_size & (local_mem_bw_test_size - 1);\r\n    }\r\n\r\n    // copy array to device\r\n    cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, local_mem_bw_test_size * sizeof(uint32_t), NULL, &ret);\r\n    ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, local_mem_bw_test_size * sizeof(uint32_t), A, 0, NULL, NULL);\r\n    cl_mem result_obj = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(uint32_t) * thread_count, NULL, &ret);\r\n    ret = clEnqueueWriteBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(uint32_t) * thread_count, result, 0, NULL, NULL);\r\n\r\n    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&a_mem_obj);\r\n    clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&chase_iterations);\r\n    clSetKernelArg(kernel, 2, sizeof(cl_mem), 
(void*)&result_obj);\r\n    clFinish(command_queue); // writes should be blocking, but are they?\r\n\r\n    start_timing();\r\n    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);\r\n    if (ret != CL_SUCCESS)\r\n    {\r\n        fprintf(stderr, \"Failed to submit kernel to command queue. clEnqueueNDRangeKernel returned %d\\n\", ret);\r\n        bandwidth = 0;\r\n        goto cleanup;\r\n    }\r\n\r\n    ret = clFinish(command_queue); // returns success even when TDR happens?\r\n    if (ret != CL_SUCCESS)\r\n    {\r\n        printf(\"Failed to finish command queue. clFinish returned %d\\n\", ret);\r\n        bandwidth = 0;\r\n        goto cleanup;\r\n    }\r\n\r\n    time_diff_ms = end_timing();\r\n    *time_ms = time_diff_ms;\r\n\r\n    // each thread does iterations reads\r\n    total_data_gb = sizeof(float) * ((float)chase_iterations * thread_count) / 1e9;\r\n    bandwidth = 1000 * (float)total_data_gb / (float)time_diff_ms;\r\n\r\n    //fprintf(stderr, \"%llu ms, %llu GB\\n\", time_diff_ms, total_data_gb);\r\n\r\n    ret = clEnqueueReadBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(uint32_t) * thread_count, result, 0, NULL, NULL);\r\n    if (ret != 0) fprintf(stderr, \"enqueue read buffer for result failed. 
ret = %d\\n\", ret);\r\n    clFinish(command_queue);\r\n\r\ncleanup:\r\n    clFlush(command_queue);\r\n    clFinish(command_queue);\r\n    clReleaseMemObject(a_mem_obj);\r\n    clReleaseMemObject(result_obj);\r\n    free(A);\r\n    free(result);\r\n    return bandwidth;\r\n}\r\n\r\n#define local64_test_size 2048\r\nfloat local_64_bw_test(cl_context context,\r\n    cl_command_queue command_queue,\r\n    cl_kernel kernel,\r\n    uint32_t thread_count,\r\n    uint32_t local_size,\r\n    uint32_t chase_iterations,\r\n    int64_t* time_ms)\r\n{\r\n    size_t global_item_size = thread_count;\r\n    size_t local_item_size = local_size;\r\n    float bandwidth, total_data_gb;\r\n    cl_int ret;\r\n    int64_t time_diff_ms;\r\n\r\n    uint64_t* A = (uint64_t*)malloc(sizeof(uint64_t) * local64_test_size);\r\n    uint64_t* result = (uint64_t*)malloc(sizeof(uint64_t) * thread_count);\r\n\r\n    if (!A || !result)\r\n    {\r\n        fprintf(stderr, \"Failed to allocate memory for test size %lu KB\\n\", local64_test_size * 4);\r\n    }\r\n\r\n    for (uint64_t i = 0; i < local64_test_size; i++)\r\n    {\r\n        A[i] = i;\r\n    }\r\n\r\n    // copy array to device\r\n    cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, local64_test_size * sizeof(uint64_t), NULL, &ret);\r\n    ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, local64_test_size * sizeof(uint64_t), A, 0, NULL, NULL);\r\n    cl_mem result_obj = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(uint64_t) * thread_count, NULL, &ret);\r\n    ret = clEnqueueWriteBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(uint64_t) * thread_count, result, 0, NULL, NULL);\r\n\r\n    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&a_mem_obj);\r\n    clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&chase_iterations);\r\n    clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&result_obj);\r\n    clFinish(command_queue); // writes should be blocking, but are they?\r\n\r\n    start_timing();\r\n    ret 
= clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);\r\n    if (ret != CL_SUCCESS)\r\n    {\r\n        fprintf(stderr, \"Failed to submit kernel to command queue. clEnqueueNDRangeKernel returned %d\\n\", ret);\r\n        bandwidth = 0;\r\n        goto cleanup;\r\n    }\r\n\r\n    ret = clFinish(command_queue); // returns success even when TDR happens?\r\n    if (ret != CL_SUCCESS)\r\n    {\r\n        printf(\"Failed to finish command queue. clFinish returned %d\\n\", ret);\r\n        bandwidth = 0;\r\n        goto cleanup;\r\n    }\r\n\r\n    time_diff_ms = end_timing();\r\n    *time_ms = time_diff_ms;\r\n\r\n    // each thread does iterations reads\r\n    total_data_gb = sizeof(float) * ((float)chase_iterations * thread_count + thread_count) / 1e9;\r\n    bandwidth = 1000 * (float)total_data_gb / (float)time_diff_ms;\r\n\r\n    ret = clEnqueueReadBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(uint64_t) * thread_count, result, 0, NULL, NULL);\r\n    if (ret != 0) fprintf(stderr, \"enqueue read buffer for result failed. 
ret = %d\\n\", ret);\r\n    clFinish(command_queue);\r\n\r\ncleanup:\r\n    clFlush(command_queue);\r\n    clFinish(command_queue);\r\n    clReleaseMemObject(a_mem_obj);\r\n    clReleaseMemObject(result_obj);\r\n    free(A);\r\n    free(result);\r\n    return bandwidth;\r\n}\r\n\r\n\r\n// default test sizes for link bandwidth\r\nconst uint64_t default_link_test_sizes[] = { 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576, 2097152 };\r\n\r\nvoid link_bw_test(cl_context context,\r\n    cl_command_queue command_queue,\r\n    cl_kernel kernel,\r\n    uint32_t iterations)\r\n{\r\n    cl_int ret;\r\n    cl_int result = 0;\r\n    size_t global_item_size;\r\n    size_t local_item_size = 1;\r\n    float gpu_to_host_bandwidth, host_to_gpu_bandwidth, total_data_gb;\r\n    uint32_t time_diff_ms, loop_iterations;\r\n    uint32_t* A;\r\n\r\n    int test_size_count = sizeof(default_link_test_sizes) / sizeof(unsigned long long);\r\n    float* results = (float*)malloc(sizeof(float) * 2 * test_size_count);\r\n    memset(results, 0, sizeof(float) * 2 * test_size_count);\r\n\r\n    printf(\"Copy Size (KB), Host to GPU (GB/s), GPU to Host (GB/s)\\n\");\r\n    for (int size_idx = 0; size_idx < test_size_count; size_idx++) {\r\n        uint64_t testSizeBytes = default_link_test_sizes[size_idx] * 1024;\r\n        uint64_t testSizeKb = default_link_test_sizes[size_idx];\r\n\r\n        if (testSizeBytes > max_global_test_size) {\r\n            printf(\"%d K would exceed device's max buffer size of %lu K, stopping here.\\n\", testSizeKb, max_global_test_size / 1024);\r\n            break;\r\n        }\r\n\r\n        A = (uint32_t*)malloc(testSizeBytes);\r\n        memset(A, 0, testSizeBytes);\r\n        cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, testSizeBytes, NULL, &ret);\r\n        clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&a_mem_obj);\r\n        global_item_size = 1; // only hit the first element, not like we're going to spend time 
verifying an entire arr especially at large sizes\r\n\r\n        // use 1M iterations = 1 GB total to transfer\r\n        loop_iterations = ((uint64_t)iterations * 1000) / (uint64_t)testSizeBytes;\r\n        //fprintf(stderr, \"Size: %llu KB, Iterations: %d, base iterations: %d\\n\", testSizeKb, loop_iterations, iterations);\r\n\r\n        start_timing();\r\n        for (int iter_idx = 0; iter_idx < loop_iterations; iter_idx++)\r\n        {\r\n            ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, testSizeBytes, A, 0, NULL, NULL);\r\n            clFinish(command_queue);\r\n        }\r\n        time_diff_ms = end_timing();\r\n        total_data_gb = ((float)loop_iterations * testSizeBytes) / 1e9;\r\n        host_to_gpu_bandwidth = 1000 * (float)total_data_gb / (float)time_diff_ms;\r\n        results[size_idx * 2] = host_to_gpu_bandwidth;\r\n        //fprintf(stderr, \"Write to GPU: %f GB transferred in %d ms\\n\", total_data_gb, time_diff_ms);\r\n\r\n        start_timing();\r\n        for (int iter_idx = 0; iter_idx < loop_iterations; iter_idx++)\r\n        {\r\n            ret = clEnqueueReadBuffer(command_queue, a_mem_obj, CL_TRUE, 0, testSizeBytes, A, 0, NULL, NULL);\r\n            clFinish(command_queue);\r\n        }\r\n        time_diff_ms = end_timing();\r\n        total_data_gb = ((float)loop_iterations * testSizeBytes) / 1e9;\r\n        gpu_to_host_bandwidth = 1000 * (float)total_data_gb / (float)time_diff_ms;\r\n        results[size_idx * 2 + 1] = gpu_to_host_bandwidth;\r\n        //fprintf(stderr, \"Read from GPU: %f GB transferred in %d ms\\n\", total_data_gb, time_diff_ms);\r\n\r\n        printf(\"%llu,%f,%f\\n\", testSizeKb, host_to_gpu_bandwidth, gpu_to_host_bandwidth);\r\n\r\n        clReleaseMemObject(a_mem_obj);\r\n        free(A);\r\n    }\r\n\r\n    float max = 0;\r\n    for (int size_idx = 0; size_idx < test_size_count; size_idx++) {\r\n        if (results[size_idx * 2] > max) max = results[size_idx * 2];\r\n        if 
(results[size_idx * 2 + 1] > max) max = results[size_idx * 2 + 1];\r\n    }\r\n\r\n    printf(\"Link bandwidth: %f GB/s\\n\", max);\r\n\r\ncleanup:\r\n    free(results);\r\n    clFlush(command_queue);\r\n    clFinish(command_queue);\r\n}\n"
  },
  {
    "path": "GpuMemLatency/common.c",
    "content": "#include \"opencltest.h\"\r\n\r\ncl_device_id selected_device_id;\r\ncl_platform_id selected_platform_id;\r\ncl_ulong max_global_test_size;\r\nint saveprogram = 0;\r\n\r\n// Fills an array using Sattolo's algo\r\nvoid FillPatternArr(uint32_t* pattern_arr, uint32_t list_size, uint32_t byte_increment) {\r\n    uint32_t increment = byte_increment / sizeof(uint32_t);\r\n    uint32_t element_count = list_size / increment;\r\n    for (int i = 0; i < element_count; i++) {\r\n        pattern_arr[i * increment] = i * increment;\r\n    }\r\n\r\n    int iter = element_count;\r\n    while (iter > 1) {\r\n        iter -= 1;\r\n        int j = iter - 1 == 0 ? 0 : rand() % (iter - 1);\r\n        uint32_t tmp = pattern_arr[iter * increment];\r\n        pattern_arr[iter * increment] = pattern_arr[j * increment];\r\n        pattern_arr[j * increment] = tmp;\r\n    }\r\n}\r\n\r\ncl_uint getCuCount() {\r\n    cl_uint cuCount;\r\n    size_t cuCountLen = sizeof(cl_uint);\r\n    if (CL_SUCCESS != clGetDeviceInfo(selected_device_id, CL_DEVICE_MAX_COMPUTE_UNITS, cuCountLen, &cuCount, &cuCountLen))\r\n    {\r\n        fprintf(stderr, \"Could not get number of compute units\\n\");\r\n        return 0;\r\n    }\r\n\r\n    return cuCount;\r\n}\r\n\r\nsize_t getMaxWorkgroupSize()\r\n{\r\n    size_t maxWorkgroupSize;\r\n    size_t workgroupSizeLen = sizeof(size_t);\r\n    if (CL_SUCCESS != clGetDeviceInfo(selected_device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, workgroupSizeLen, &maxWorkgroupSize, &workgroupSizeLen))\r\n    {\r\n        fprintf(stderr, \"Could not get number of compute units\\n\");\r\n        return 0;\r\n    }\r\n\r\n    return maxWorkgroupSize;\r\n}\r\n\r\ncl_ulong get_max_constant_buffer_size() {\r\n    cl_ulong constant_buffer_size = 0;\r\n    if (CL_SUCCESS != clGetDeviceInfo(selected_device_id, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(cl_ulong), &constant_buffer_size, NULL)) {\r\n        fprintf(stderr, \"Failed to get max constant buffer size\\n\");\r\n    
}\r\n\r\n    return constant_buffer_size;\r\n}\r\n\r\ncl_ulong get_max_buffer_size() {\r\n    cl_ulong buffer_size = 0;\r\n    if (CL_SUCCESS != clGetDeviceInfo(selected_device_id, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &buffer_size, NULL)) {\r\n        fprintf(stderr, \"Failed to get max constant buffer size\\n\");\r\n    }\r\n\r\n    return buffer_size;\r\n}\r\n\r\ncl_ulong get_max_tex_buffer_size() {\r\n    cl_ulong buffer_size = 0;\r\n    if (CL_SUCCESS != clGetDeviceInfo(selected_device_id, CL_DEVICE_IMAGE_MAX_BUFFER_SIZE, sizeof(cl_ulong), &buffer_size, NULL)) {\r\n        fprintf(stderr, \"Failed to get max texture buffer size\\n\");\r\n    }\r\n\r\n    return buffer_size;\r\n}\r\n\r\ncl_ulong get_max_2d_tex_width() {\r\n    cl_ulong max_width = 0;\r\n    if (CL_SUCCESS != clGetDeviceInfo(selected_device_id, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof(cl_ulong), &max_width, NULL)) {\r\n        fprintf(stderr, \"Failed to get max texture width\\n\");\r\n    }\r\n\r\n    return max_width;\r\n}\r\n\r\ncl_ulong get_max_2d_tex_height() {\r\n    cl_ulong max_width = 0;\r\n    if (CL_SUCCESS != clGetDeviceInfo(selected_device_id, CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof(cl_ulong), &max_width, NULL)) {\r\n        fprintf(stderr, \"Failed to get max texture height\\n\");\r\n    }\r\n\r\n    return max_width;\r\n}\r\n\r\nshort checkExtensionSupport(const char *extension_name) {\r\n    size_t extensionLen = 0;\r\n    char* extensions;\r\n    if (CL_SUCCESS != clGetDeviceInfo(selected_device_id, CL_DEVICE_EXTENSIONS, 0, NULL, &extensionLen))\r\n    {\r\n        fprintf(stderr, \"Could not determine memory needed to hold OpenCL extension list\\n\");\r\n        return 0;\r\n    }\r\n\r\n    extensions = (char *)malloc(extensionLen + 1);\r\n    extensions[extensionLen] = 0;\r\n    if (CL_SUCCESS != clGetDeviceInfo(selected_device_id, CL_DEVICE_EXTENSIONS, extensionLen, extensions, &extensionLen))\r\n    {\r\n        fprintf(stderr, \"Could not get OpenCL extensions 
list\\n\");\r\n        return 0;\r\n    }\r\n\r\n    //fprintf(stderr, \"OpenCL extensions list: %s\\n\", extensions);\r\n    // extension list is space separated\r\n    size_t spaceCount = 0;\r\n    for (int i = 0; i < extensionLen; i++) {\r\n        if (extensions[i] == ' ') spaceCount++;\r\n    }\r\n\r\n    int* extensionsSpaces = (int*)malloc(sizeof(int) * (spaceCount + 1));\r\n    extensionsSpaces[0] = 0;\r\n    int spaceIdx = 1;\r\n    for (int i = 0; i < extensionLen; i++) {\r\n        if (extensions[i] == ' ') {\r\n            extensions[i] = 0;\r\n            extensionsSpaces[spaceIdx] = i + 1;\r\n            spaceIdx++;\r\n        }\r\n    }\r\n\r\n    short found = 0;\r\n    for (int i = 0; i < spaceCount; i++)\r\n    {\r\n        //fprintf(stderr, \"Looking for %s = %s\\n\", extension_name, extensions + extensionsSpaces[i]);\r\n        if (strcmp(extension_name, extensions + extensionsSpaces[i]) == 0) {\r\n            found = 1;\r\n            //fprintf(stderr, \"found\\n\");\r\n            break;\r\n        }\r\n    }\r\n\r\n    free(extensionsSpaces);\r\n    free(extensions);\r\n    return found;\r\n}\r\n\r\n/// <summary>\r\n/// populate global variables for opencl device id and platform id\r\n/// </summary>\r\n/// <param name=\"platform_index\">platform index. if -1, prompt user</param>\r\n/// <param name=\"device_index\">device index. if -1. 
prompt user</param>\r\n/// <returns>opencl context</returns>\r\ncl_context get_context_from_user(int platform_index, int device_index) {\r\n    int i = 0;\r\n    int selected_platform_index = 0, selected_device_index = 0;\r\n\r\n    // Get platform and device information\r\n    cl_uint ret_num_devices;\r\n    cl_uint ret_num_platforms;\r\n\r\n    cl_int ret = clGetPlatformIDs(0, NULL, &ret_num_platforms);\r\n    cl_platform_id* platforms = NULL;\r\n    cl_device_id* devices = NULL;\r\n    cl_context context = NULL;\r\n    platforms = (cl_platform_id*)malloc(ret_num_platforms * sizeof(cl_platform_id));\r\n\r\n    ret = clGetPlatformIDs(ret_num_platforms, platforms, NULL);\r\n    fprintf(stderr, \"clGetPlatformIDs returned %d. %d platforms\\n\", ret, ret_num_platforms);\r\n\r\n    for (i = 0; i < ret_num_platforms; i++)\r\n    {\r\n        size_t platform_name_len;\r\n        char* platform_name = NULL;\r\n        if (CL_SUCCESS != clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 0, NULL, &platform_name_len)) {\r\n            fprintf(stderr, \"Failed to get platform info for platform %d\\n\", i);\r\n            continue;\r\n        }\r\n\r\n        platform_name = (char*)malloc(platform_name_len + 1);\r\n        platform_name[platform_name_len] = 0;\r\n\r\n        if (CL_SUCCESS != clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, platform_name_len, platform_name, NULL)) {\r\n            fprintf(stderr, \"Failed to get platform name for platform %d\\n\", i);\r\n            free(platform_name);\r\n            continue;\r\n        }\r\n\r\n        fprintf(stderr, \"Platform %d: %s\\n\", i, platform_name);\r\n        free(platform_name);\r\n    }\r\n\r\n    selected_platform_index = platform_index;\r\n    if (selected_platform_index == -1)\r\n    {\r\n        printf(\"Enter platform #:\");\r\n        scanf(\"%d\", &selected_platform_index);\r\n    }\r\n\r\n    if (selected_platform_index > ret_num_platforms - 1)\r\n    {\r\n        fprintf(stderr, \"platform index out 
of range\\n\");\r\n        goto get_context_from_user_end;\r\n    }\r\n\r\n    selected_platform_id = platforms[selected_platform_index];\r\n\r\n    if (CL_SUCCESS != clGetDeviceIDs(selected_platform_id, CL_DEVICE_TYPE_ALL, 0, NULL, &ret_num_devices)) {\r\n        fprintf(stderr, \"Failed to enumerate device ids for platform\");\r\n        return NULL;\r\n    }\r\n\r\n    devices = (cl_device_id*)malloc(ret_num_devices * sizeof(cl_device_id));\r\n    if (CL_SUCCESS != clGetDeviceIDs(selected_platform_id, CL_DEVICE_TYPE_ALL, ret_num_devices, devices, NULL)) {\r\n        fprintf(stderr, \"Failed to get device ids for platform\");\r\n        free(devices);\r\n        return NULL;\r\n    }\r\n\r\n    fprintf(stderr, \"clGetDeviceIDs returned %d devices\\n\", ret_num_devices);\r\n\r\n    for (i = 0; i < ret_num_devices; i++)\r\n    {\r\n        size_t device_name_len;\r\n        char* device_name = NULL;\r\n        if (CL_SUCCESS != clGetDeviceInfo(devices[i], CL_DEVICE_NAME, 0, NULL, &device_name_len)) {\r\n            fprintf(stderr, \"Failed to get name length for device %d\\n\", i);\r\n            continue;\r\n        }\r\n\r\n        //fprintf(stderr, \"debug: device name length: %d\\n\", device_name_len);\r\n        device_name = (char*)malloc(device_name_len + 1);\r\n        device_name[device_name_len] = 0;\r\n\r\n        if (CL_SUCCESS != clGetDeviceInfo(devices[i], CL_DEVICE_NAME, device_name_len, device_name, &device_name_len)) {\r\n            fprintf(stderr, \"Failed to get name for device %d\\n\", i);\r\n            free(device_name);\r\n            continue;\r\n        }\r\n\r\n        fprintf(stderr, \"Device %d: %s\\n\", i, device_name);\r\n        free(device_name);\r\n    }\r\n\r\n    selected_device_index = device_index;\r\n    if (selected_device_index == -1)\r\n    {\r\n        fprintf(stderr, \"Enter device #:\");\r\n        scanf(\"%d\", &selected_device_index);\r\n    }\r\n\r\n\r\n    if (selected_device_index > ret_num_devices - 1)\r\n    {\r\n 
       fprintf(stderr, \"Device index out of range\\n\");\r\n        goto get_context_from_user_end;\r\n    }\r\n\r\n    selected_device_id = devices[selected_device_index];\r\n\r\n    // Create an OpenCL context\r\n    context = clCreateContext(NULL, 1, &selected_device_id, NULL, NULL, &ret);\r\n    fprintf(stderr, \"clCreateContext returned %d\\n\", ret);\r\n    fprintf(stderr, \"Max workgroup size for device: %u\\n\", getMaxWorkgroupSize());\r\n\r\nget_context_from_user_end:\r\n    free(platforms);\r\n    free(devices);\r\n    return context;\r\n}\r\n\r\ncl_program build_program(cl_context context, const char* fname, const char *params)\r\n{\r\n    cl_int ret;\r\n    FILE* fp = NULL;\r\n    char* source_str;\r\n    size_t source_size;\r\n    fp = fopen(fname, \"r\");\r\n    if (!fp) {\r\n        fprintf(stderr, \"Failed to load kernel %s.\\n\", fname);\r\n        exit(1);\r\n    }\r\n    source_str = (char*)malloc(MAX_SOURCE_SIZE);\r\n    source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);\r\n    fclose(fp);\r\n\r\n    cl_program program = clCreateProgramWithSource(context, 1, (const char**)&source_str, (const size_t*)&source_size, &ret);\r\n    ret = clBuildProgram(program, 1, &selected_device_id, params, NULL, NULL);\r\n    //fprintf(stderr, \"clBuildProgram %s returned %d\\n\", fname, ret);\r\n    if (ret == -11)\r\n    {\r\n        size_t log_size;\r\n        fprintf(stderr, \"OpenCL kernel build error\\n\");\r\n        clGetProgramBuildInfo(program, selected_device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);\r\n        char* log = (char*)malloc(log_size);\r\n        clGetProgramBuildInfo(program, selected_device_id, CL_PROGRAM_BUILD_LOG, log_size, log, NULL);\r\n        fprintf(stderr, \"%s\\n\", log);\r\n        free(log);\r\n    }\r\n\r\n    free(source_str);\r\n    return program;\r\n}\r\n\r\nvoid write_program(cl_program program, const char *name)\r\n{\r\n    size_t* binarySizes = NULL;\r\n    size_t nDevices = 0;\r\n    cl_int ret, 
memoryRequired = 0;\r\n    char fname[255];\r\n    int i;\r\n    unsigned char** binaries = NULL;\r\n\r\n    ret = clGetProgramInfo(program, CL_PROGRAM_NUM_DEVICES, sizeof(size_t), &nDevices, NULL);\r\n    if (ret != CL_SUCCESS) {\r\n        fprintf(stderr, \"Could not get number of devices for program\\n\");\r\n        return;\r\n    }\r\n\r\n    fprintf(stderr, \"Program is associated with %llu devices\\n\", nDevices);\r\n    binarySizes = (size_t*)malloc(sizeof(size_t) * nDevices);\r\n    if (binarySizes == NULL)\r\n    {\r\n        fprintf(stderr, \"Failed to allocate memory for binary sizes\\n\");\r\n        goto getProgram_Fail;\r\n    }\r\n\r\n    ret = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t) * nDevices, binarySizes, NULL);\r\n    if (ret != CL_SUCCESS)\r\n    {\r\n        fprintf(stderr, \"Could not get program binary sizes\\n\");\r\n        goto getProgram_Fail;\r\n    }\r\n\r\n    binaries = (unsigned char*)malloc(nDevices);\r\n    for (i = 0; i < nDevices; i++) {\r\n        fprintf(stderr, \"Device %d: %llu byte program\\n\", i, binarySizes[i]);\r\n        binaries[i] = (char*)malloc(binarySizes[i]);\r\n    }\r\n\r\n    ret = clGetProgramInfo(program, CL_PROGRAM_BINARIES, nDevices * sizeof(unsigned char*), binaries, NULL);\r\n    if (ret != CL_SUCCESS)\r\n    {\r\n        fprintf(stderr, \"Could not get program binaries\\n\");\r\n        goto getProgram_Fail;\r\n    }\r\n\r\n    for (int i = 0; i < nDevices; i++)\r\n    {\r\n        snprintf(fname, 254, \"prog%d_%s\", i, name);\r\n        FILE* dst = fopen(fname, \"w\");\r\n        fwrite(binaries[i], 1, binarySizes[i], dst);\r\n        fclose(dst);\r\n        fprintf(stderr, \"Wrote compiled kernel to %s\\n\", fname);\r\n    }\r\n\r\ngetProgram_Fail:\r\n    for (int i = 0; i < nDevices; i++) free(binaries[i]);\r\n    free(binaries);\r\n    free(binarySizes);\r\n}\r\n\r\n// Given last run settings, return target iteration count that should make the next run\r\n// go for 
approximately TARGET_TIME_MS\r\nuint32_t adjust_iterations(uint32_t iterations, uint64_t time_ms)\r\n{\r\n    uint32_t chase_iterations = (uint32_t)((float)iterations * TARGET_TIME_MS / (float)time_ms);\r\n    if (time_ms == 0) chase_iterations = iterations * 100;\r\n    //fprintf(stderr, \"Kernel took %llu ms. Setting iterations = %u\\n\", time_ms, chase_iterations);\r\n\r\n    return chase_iterations;\r\n}"
  },
  {
    "path": "GpuMemLatency/instruction_rate.c",
    "content": "#include \"opencltest.h\"\r\n\r\nfloat fp64_instruction_rate_test(cl_context context,\r\n    cl_command_queue command_queue,\r\n    uint32_t thread_count,\r\n    uint32_t local_size,\r\n    uint32_t chase_iterations,\r\n    int float4_element_count,\r\n    cl_mem a_mem_obj,\r\n    cl_mem result_obj,\r\n    cl_float* A,\r\n    cl_float* result);\r\n\r\nfloat fp16_instruction_rate_test(cl_context context,\r\n    cl_command_queue command_queue,\r\n    uint32_t thread_count,\r\n    uint32_t local_size,\r\n    uint32_t chase_iterations,\r\n    int float4_element_count,\r\n    cl_mem a_mem_obj,\r\n    cl_mem result_obj,\r\n    cl_float* A,\r\n    cl_float* result);\r\n\r\nfloat run_rate_test(cl_context context,\r\n    cl_command_queue command_queue,\r\n    cl_kernel kernel,\r\n    uint32_t thread_count,\r\n    uint32_t local_size,\r\n    uint32_t chase_iterations,\r\n    int float4_element_count,\r\n    cl_mem a_mem_obj,\r\n    cl_mem result_obj,\r\n    cl_float* A,\r\n    cl_float* result,\r\n    float totalOps);\r\n\r\nfloat run_latency_test(cl_context context,\r\n    cl_command_queue command_queue,\r\n    cl_kernel kernel,\r\n    uint32_t chase_iterations,\r\n    int float4_element_count,\r\n    cl_mem a_mem_obj,\r\n    cl_mem result_obj,\r\n    cl_float* A,\r\n    cl_float* result,\r\n    float opsPerIteration);\r\n\r\nfloat global_totalOps;\r\n\r\nfloat instruction_rate_test(cl_context context,\r\n    cl_command_queue command_queue,\r\n    uint32_t thread_count,\r\n    uint32_t local_size,\r\n    uint32_t chase_iterations,\r\n    int forcefp16,\r\n    int forcefp64)\r\n{\r\n    size_t global_item_size = thread_count;\r\n    size_t local_item_size = local_size;\r\n    float gOpsPerSec = 0, opsPerIteration;\r\n    cl_int ret;\r\n    int64_t time_diff_ms;\r\n    int float4_element_count = thread_count * 4;\r\n\r\n    cl_program program = build_program(context, \"instruction_rate_kernel.cl\", NULL);\r\n    if (saveprogram) write_program(program, 
\"irate\");\r\n    cl_kernel int32_add_rate_kernel = clCreateKernel(program, \"int32_add_rate_test\", &ret);\r\n    cl_kernel int32_mul_rate_kernel = clCreateKernel(program, \"int32_mul_rate_test\", &ret);\r\n    cl_kernel fp32_add_rate_kernel = clCreateKernel(program, \"fp32_add_rate_test\", &ret);\r\n    cl_kernel fp32_fma_rate_kernel = clCreateKernel(program, \"fp32_fma_rate_test\", &ret);\r\n    cl_kernel fp32_builtin_fma_rate_kernel = clCreateKernel(program, \"fp32_builtin_fma_rate_test\", &ret);\r\n    cl_kernel fp32_mad_rate_kernel = clCreateKernel(program, \"fp32_mad_rate_test\", &ret);\r\n    cl_kernel fp32_rcp_rate_kernel = clCreateKernel(program, \"fp32_rcp_rate_test\", &ret);\r\n    cl_kernel fp32_rsqrt_rate_kernel = clCreateKernel(program, \"fp32_rsqrt_rate_test\", &ret);\r\n    cl_kernel mix_fp32_int32_add_rate_kernel = clCreateKernel(program, \"mix_fp32_int32_add_rate_test\", &ret);\r\n    cl_kernel mix_fp32_int32_addmul_rate_kernel = clCreateKernel(program, \"mix_fp32_int32_addmul_rate_test\", &ret);\r\n    cl_kernel int64_add_rate_kernel = clCreateKernel(program, \"int64_add_rate_test\", &ret);\r\n    cl_kernel int64_mul_rate_kernel = clCreateKernel(program, \"int64_mul_rate_test\", &ret);\r\n    cl_kernel int16_add_rate_kernel = clCreateKernel(program, \"int16_add_rate_test\", &ret);\r\n    cl_kernel int16_mul_rate_kernel = clCreateKernel(program, \"int16_mul_rate_test\", &ret);\r\n    cl_kernel int8_add_rate_kernel = clCreateKernel(program, \"int8_add_rate_test\", &ret);\r\n    cl_kernel int8_mul_rate_kernel = clCreateKernel(program, \"int8_mul_rate_test\", &ret);\r\n    cl_kernel fp32_fma_latency_kernel = clCreateKernel(program, \"fp32_fma_latency_test\", &ret);\r\n    cl_kernel fp32_add_latency_kernel = clCreateKernel(program, \"fp32_add_latency_test\", &ret);\r\n    cl_kernel int32_add_latency_kernel = clCreateKernel(program, \"int32_add_latency_test\", &ret);\r\n    cl_kernel int32_mul_latency_kernel = clCreateKernel(program, 
\"int32_mul_latency_test\", &ret);\r\n\r\n    cl_kernel int32_add_scalar_latency_kernel = clCreateKernel(program, \"int32_add_scalar_latency_test\", &ret);\r\n    cl_kernel int32_mul_scalar_latency_kernel = clCreateKernel(program, \"int32_mul_scalar_latency_test\", &ret);\r\n    cl_kernel fp32_add_scalar_latency_kernel = clCreateKernel(program, \"fp32_add_scalar_latency_test\", &ret);\r\n    cl_kernel fp32_fma_scalar_latency_kernel = clCreateKernel(program, \"fp32_fma_scalar_latency_test\", &ret);\r\n    cl_kernel fp32_mul_scalar_latency_kernel = clCreateKernel(program, \"fp32_mul_scalar_latency_test\", &ret);\r\n    cl_kernel fp32_mul_latency_kernel = clCreateKernel(program, \"fp32_mul_latency_test\", &ret);\r\n\r\n    float* A = (float*)malloc(sizeof(float) * float4_element_count * 4);\r\n    float* result = (float*)malloc(sizeof(float) * 4 * thread_count);\r\n\r\n    if (!A || !result)\r\n    {\r\n        fprintf(stderr, \"Failed to allocate memory instruction rate test\\n\");\r\n    }\r\n\r\n    cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, float4_element_count * sizeof(float), NULL, &ret);\r\n    cl_mem result_obj = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * 4 * thread_count, NULL, &ret);\r\n\r\n    // Integer test first\r\n    uint32_t *int32_A = (uint32_t*)A;\r\n    for (int i = 0; i < float4_element_count * 4; i++)\r\n    {\r\n        int32_A[i] = i + 1;\r\n    }\r\n\r\n    // 4x int4 * 8 per iteration, and count the loop increment too\r\n    opsPerIteration = 4.0f * 8.0f;\r\n    float int32_add_rate = run_rate_test(context, command_queue, int32_add_rate_kernel, thread_count, local_size, chase_iterations,\r\n        float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration);\r\n    fprintf(stderr, \"INT32 G Adds/sec: %f\\n\", int32_add_rate);\r\n\r\n    printf(\"===== INT32 add latency =====\\n\");\r\n    float int32_add_latency = run_latency_test(context, command_queue, int32_add_latency_kernel, 
chase_iterations, float4_element_count, a_mem_obj, result_obj, A, result, 8.0f);\r\n    fprintf(stderr, \"INT32 add latency: %f ns\\n\", int32_add_latency);\r\n\r\n    printf(\"===== INT32 add latency (scalar) =====\\n\");\r\n    int32_add_latency = run_latency_test(context, command_queue, int32_add_scalar_latency_kernel, chase_iterations, float4_element_count, a_mem_obj, result_obj, A, result, 8.0f);\r\n    fprintf(stderr, \"INT32 add latency (scalar): %f ns\\n\", int32_add_latency);\r\n\r\n    printf(\"===== INT32 mul latency =====\\n\");\r\n    float int32_mul_latency = run_latency_test(context, command_queue, int32_mul_latency_kernel, chase_iterations, float4_element_count, a_mem_obj, result_obj, A, result, 8.0f);\r\n    fprintf(stderr, \"INT32 mul latency: %f ns\\n\", int32_mul_latency);\r\n\r\n    printf(\"===== INT32 mul latency (scalar) =====\\n\");\r\n    int32_mul_latency = run_latency_test(context, command_queue, int32_mul_scalar_latency_kernel, chase_iterations, float4_element_count, a_mem_obj, result_obj, A, result, 8.0f);\r\n    fprintf(stderr, \"INT32 mul latency (scalar): %f ns\\n\", int32_mul_latency);\r\n\r\n    opsPerIteration = 4.0f * 8.0f;\r\n    float int32_mul_rate = run_rate_test(context, command_queue, int32_mul_rate_kernel, thread_count, local_size, (chase_iterations / 2),\r\n        float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration);\r\n    fprintf(stderr, \"INT32 G Multiplies/sec: %f\\n\", int32_mul_rate);\r\n\r\n    // FP32 add and fma test\r\n    cl_float* fp32_A = (cl_float*)A;\r\n    for (int i = 0; i < float4_element_count * 4; i++)\r\n    {\r\n        fp32_A[i] = 0.5f * i;\r\n    }\r\n\r\n    opsPerIteration = 4.0f * 8.0f;\r\n\r\n    float fp32_add_rate = run_rate_test(context, command_queue, fp32_add_rate_kernel, thread_count, local_size, chase_iterations,\r\n        float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration);\r\n    fprintf(stderr, \"FP32 G Adds/sec: %f\\n\", 
fp32_add_rate);\r\n\r\n    printf(\"===== FP32 add latency =====\\n\");\r\n    float fp32_add_latency = run_latency_test(context, command_queue, fp32_add_latency_kernel, chase_iterations, float4_element_count, a_mem_obj, result_obj, A, result, 8.0f);\r\n    fprintf(stderr, \"FP32 add latency: %f ns\\n\", fp32_add_latency);\r\n\r\n    printf(\"===== FP32 add latency (scalar) =====\\n\");\r\n    fp32_add_latency = run_latency_test(context, command_queue, fp32_add_scalar_latency_kernel, chase_iterations, float4_element_count, a_mem_obj, result_obj, A, result, 8.0f);\r\n    fprintf(stderr, \"FP32 add latency (scalar): %f ns\\n\", fp32_add_latency);\r\n\r\n    printf(\"===== FP32 fma latency =====\\n\");\r\n    float fp32_fma_latency = run_latency_test(context, command_queue, fp32_fma_latency_kernel, chase_iterations, float4_element_count, a_mem_obj, result_obj, A, result, 8.0f);\r\n    fprintf(stderr, \"FP32 FMA latency: %f ns\\n\", fp32_fma_latency);\r\n\r\n    printf(\"===== FP32 fma latency (scalar) =====\\n\");\r\n    fp32_fma_latency = run_latency_test(context, command_queue, fp32_fma_scalar_latency_kernel, chase_iterations, float4_element_count, a_mem_obj, result_obj, A, result, 8.0f);\r\n    fprintf(stderr, \"FP32 FMA latency (scalar): %f ns\\n\", fp32_fma_latency);\r\n\r\n    printf(\"===== FP32 mul latency =====\\n\");\r\n    fp32_fma_latency = run_latency_test(context, command_queue, fp32_mul_latency_kernel, chase_iterations, float4_element_count, a_mem_obj, result_obj, A, result, 8.0f);\r\n    fprintf(stderr, \"FP32 mul latency: %f ns\\n\", fp32_fma_latency);\r\n    fp32_fma_latency = run_latency_test(context, command_queue, fp32_mul_scalar_latency_kernel, chase_iterations, float4_element_count, a_mem_obj, result_obj, A, result, 8.0f);\r\n    fprintf(stderr, \"FP32 mul latency (scalar): %f ns\\n\", fp32_fma_latency);\r\n\r\n    float fp32_fma_rate = run_rate_test(context, command_queue, fp32_fma_rate_kernel, thread_count, local_size, chase_iterations,\r\n    
    float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration);\r\n    fprintf(stderr, \"FP32 G FMA/sec: %f : %f GFLOPs\\n\", fp32_fma_rate, fp32_fma_rate * 2); \r\n\r\n    float builtin_fp32_fma_rate = run_rate_test(context, command_queue, fp32_builtin_fma_rate_kernel, thread_count, local_size, chase_iterations,\r\n        float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration);\r\n    fprintf(stderr, \"FP32 G fma()/sec: %f : %f GFLOPs\\n\", builtin_fp32_fma_rate, builtin_fp32_fma_rate * 2);\r\n\r\n    fp32_fma_rate = run_rate_test(context, command_queue, fp32_mad_rate_kernel, thread_count, local_size, chase_iterations,\r\n        float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration);\r\n    fprintf(stderr, \"FP32 G mad()/sec: %f : %f GFLOPs\\n\", fp32_fma_rate, fp32_fma_rate * 2);\r\n\r\n    float fp32_rcp_rate = run_rate_test(context, command_queue, fp32_rcp_rate_kernel, thread_count, local_size, chase_iterations,\r\n        float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration);\r\n    fprintf(stderr, \"FP32 G native_recip/sec: %f\\n\", fp32_rcp_rate);\r\n\r\n    float fp32_rsqrt_rate = run_rate_test(context, command_queue, fp32_rsqrt_rate_kernel, thread_count, local_size, chase_iterations,\r\n        float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration);\r\n    fprintf(stderr, \"FP32 G native_rsqrt/sec: %f\\n\", fp32_rsqrt_rate);\r\n\r\n    // Mixed INT32 and FP32 - 4 FP32, 4 INT32, and the loop increment\r\n    // takes FP inputs and converts some to int\r\n    opsPerIteration = 4.0f * 8.0f + 1.0f;\r\n    float mix_fp32_int32_rate = run_rate_test(context, command_queue, mix_fp32_int32_add_rate_kernel, thread_count, local_size, chase_iterations,\r\n        float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration);\r\n    fprintf(stderr, \"Mixed INT32 and FP32 G Adds/sec: %f\\n\", mix_fp32_int32_rate);\r\n\r\n    // Test the same with integer multiplies\r\n    
mix_fp32_int32_rate = run_rate_test(context, command_queue, mix_fp32_int32_addmul_rate_kernel, thread_count, local_size, chase_iterations,\r\n        float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration);\r\n    fprintf(stderr, \"Mixed INT32 Multiplies and FP32 G Adds/sec: %f\\n\", mix_fp32_int32_rate);\r\n\r\n    // INT64 add test\r\n    cl_ulong* int64_A = (cl_ulong*)A;\r\n    for (int i = 0; i < float4_element_count * 2; i++)\r\n    {\r\n        int64_A[i] = i * 2;\r\n    }\r\n\r\n    opsPerIteration = 2.0f * 8.0f;\r\n    float int64_add_rate = run_rate_test(context, command_queue, int64_add_rate_kernel, thread_count, local_size, chase_iterations / 2,\r\n        float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration);\r\n    fprintf(stderr, \"INT64 G Adds/sec: %f\\n\", int64_add_rate);\r\n\r\n    opsPerIteration = 2.0f * 8.0f;\r\n    float int64_mul_rate = run_rate_test(context, command_queue, int64_mul_rate_kernel, thread_count, local_size, chase_iterations / 8,\r\n        float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration);\r\n    fprintf(stderr, \"INT64 G Multiplies/sec: %f\\n\", int64_mul_rate);\r\n\r\n    // INT16 (short) tests\r\n    cl_ushort* int16_A = (cl_ushort*)A;\r\n    for (int i = 0; i < float4_element_count * 8; i++)\r\n    {\r\n        int16_A[i] = i;\r\n    }\r\n\r\n    // short8\r\n    opsPerIteration = 8.0f * 8.0f;\r\n    float int16_add_rate = run_rate_test(context, command_queue, int16_add_rate_kernel, thread_count, local_size, chase_iterations,\r\n        float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration);\r\n    fprintf(stderr, \"INT16 G Adds/sec: %f\\n\", int16_add_rate);\r\n\r\n    float int16_mul_rate = run_rate_test(context, command_queue, int16_mul_rate_kernel, thread_count, local_size, chase_iterations,\r\n        float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration);\r\n    fprintf(stderr, \"INT16 G Multiplies/sec: %f \\n\", 
int16_mul_rate);\r\n\r\n    // INT8 (char) tests\r\n    cl_char* int8_A = (cl_char*)A;\r\n    for (int i = 0; i < float4_element_count * 8; i++)\r\n    {\r\n        int8_A[i] = i;\r\n    }\r\n\r\n    uint32_t int8_chase_iterations = chase_iterations / 10;\r\n    opsPerIteration = 16.0f * 8.0f;\r\n    float int8_add_rate = run_rate_test(context, command_queue, int8_add_rate_kernel, thread_count, local_size, int8_chase_iterations,\r\n        float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration);\r\n    fprintf(stderr, \"INT8 G Adds/sec: %f\\n\", int8_add_rate);\r\n\r\n    float int8_mul_rate = run_rate_test(context, command_queue, int8_mul_rate_kernel, thread_count, local_size, int8_chase_iterations,\r\n        float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration);\r\n    fprintf(stderr, \"INT8 G Multiplies/sec: %f\\n\", int8_mul_rate);\r\n\r\n    short checkExtensionSupport(const char *extension_name);\r\n    \r\n    if (checkExtensionSupport(\"cl_khr_fp64\") || forcefp64) {\r\n        fp64_instruction_rate_test(context, command_queue, thread_count, local_size, chase_iterations, float4_element_count,\r\n            a_mem_obj, result_obj, A, result);\r\n    }\r\n    else {\r\n        fprintf(stderr, \"FP64 not supported\\n\");\r\n    }\r\n\r\n    if (checkExtensionSupport(\"cl_khr_fp16\") || forcefp16) {\r\n        fp16_instruction_rate_test(context, command_queue, thread_count, local_size, chase_iterations, float4_element_count,\r\n            a_mem_obj, result_obj, A, result);\r\n    }\r\n    else {\r\n        fprintf(stderr, \"FP16 not supported\\n\");\r\n    }\r\n\r\ncleanup:\r\n    clFlush(command_queue);\r\n    clFinish(command_queue);\r\n    clReleaseMemObject(a_mem_obj);\r\n    clReleaseMemObject(result_obj);\r\n    free(A);\r\n    free(result);\r\n    return gOpsPerSec;\r\n}\r\n\r\n// Runs an instruction rate test. 
The kernel is expected to perform opsPerIteration * chase_iterations operations\r\n// Mostly simplifies the uber instruction rate test above. Expects memory to be pre-allocated for example.\r\n// Returns GOPS\r\nfloat run_rate_test(cl_context context,\r\n    cl_command_queue command_queue,\r\n    cl_kernel kernel,\r\n    uint32_t thread_count,\r\n    uint32_t local_size,\r\n    uint32_t chase_iterations,\r\n    int float4_element_count,\r\n    cl_mem a_mem_obj,\r\n    cl_mem result_obj,\r\n    cl_float* A,\r\n    cl_float* result,\r\n    float opsPerIteration)\r\n{\r\n    size_t global_item_size = thread_count;\r\n    size_t local_item_size = local_size;\r\n    cl_int ret;\r\n    float totalOps, gOpsPerSec;\r\n    uint64_t time_diff_ms = 0;\r\n\r\n    memset(result, 0, sizeof(float) * 4 * thread_count);\r\n\r\n    ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, float4_element_count * sizeof(float), A, 0, NULL, NULL);\r\n    ret = clEnqueueWriteBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(float) * 4 * thread_count, result, 0, NULL, NULL);\r\n    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&a_mem_obj);\r\n    clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&chase_iterations);\r\n    clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&result_obj);\r\n    clFinish(command_queue);\r\n\r\n    //fprintf(stderr, \"Submitting fp32 add kernel to command queue\\n\");\r\n    // start with a low iteration count and try to make it work for all GPUs without needing manual iteration adjustment\r\n    while (time_diff_ms < TARGET_TIME_MS / 2) {\r\n        start_timing();\r\n        ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);\r\n        if (ret != CL_SUCCESS)\r\n        {\r\n            fprintf(stderr, \"Failed to submit kernel to command queue. 
clEnqueueNDRangeKernel returned %d\\n\", ret);\r\n            gOpsPerSec = 0;\r\n            return 0;\r\n        }\r\n\r\n        ret = clFinish(command_queue);\r\n        if (ret != CL_SUCCESS)\r\n        {\r\n            printf(\"Failed to finish command queue. clFinish returned %d\\n\", ret);\r\n            gOpsPerSec = 0;\r\n            return 0;\r\n        }\r\n\r\n        time_diff_ms = end_timing();\r\n\r\n        totalOps = (float)chase_iterations * opsPerIteration * (float)thread_count;\r\n        gOpsPerSec = ((float)totalOps / 1e9) / ((float)time_diff_ms / 1000);\r\n        //fprintf(stderr, \"chase iterations: %d, thread count: %d\\n\", chase_iterations, thread_count);\r\n        //fprintf(stderr, \"total ops: %f (%.2f G)\\ntotal time: %llu ms\\n\", totalOps, totalOps / 1e9, time_diff_ms);\r\n\r\n        chase_iterations = adjust_iterations(chase_iterations, time_diff_ms);\r\n        clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&chase_iterations);\r\n    }\r\n\r\n    return gOpsPerSec;\r\n}\r\n\r\n\r\n// Variation of the test above but input array size is aligned with assumed wave size.\r\n// if partitioning pattern, this will test partitioning with active waves in the specified pattern\r\nfloat run_divergence_rate_test(cl_context context,\r\n    cl_command_queue command_queue,\r\n    uint32_t thread_count,\r\n    uint32_t local_size,\r\n    uint32_t wave,\r\n    int *partitionPattern)\r\n{\r\n    size_t global_item_size = thread_count;\r\n    size_t local_item_size = local_size;\r\n    uint32_t active_threads = thread_count;\r\n    cl_int ret;\r\n    float totalOps, gOpsPerSec;\r\n    uint64_t time_diff_ms = 0;\r\n    uint32_t chase_iterations = 2500000;\r\n\r\n    cl_program program = build_program(context, \"instruction_rate_kernel.cl\", NULL);\r\n    cl_kernel kernel = clCreateKernel(program, partitionPattern == NULL ? 
\"fp32_divergence_rate_test\" : \"fp32_partition_rate_test\", &ret);\r\n    \r\n    float* result = (float*)malloc(sizeof(float) * thread_count);\r\n    float* A = (float*)malloc(sizeof(float) * thread_count);\r\n    memset(result, 0, sizeof(float) * thread_count);\r\n\r\n    if (partitionPattern != NULL) active_threads = 0;\r\n    if (partitionPattern != NULL) fprintf(stderr, \"\\n\");\r\n\r\n    for (int i = 0; i < thread_count; i++)\r\n    {\r\n        if (partitionPattern == NULL) {\r\n            // divergence test\r\n            if ((i / wave) % 2 == 0) A[i] = 0.2f;\r\n            else A[i] = 0.8f;\r\n        }\r\n        else\r\n        {\r\n            if (partitionPattern[(i / wave)]) {\r\n                A[i] = 0.2f;\r\n                fprintf(stderr, \"a \");\r\n                active_threads++;\r\n            }\r\n            else\r\n            {\r\n                fprintf(stderr, \"_ \");\r\n                A[i] = 1.2f;\r\n            }\r\n\r\n            if ((i + 1) % wave == 0)\r\n            {\r\n                fprintf(stderr, \"\\n\");\r\n            }\r\n        }\r\n    }\r\n\r\n    if (partitionPattern != NULL) fprintf(stderr, \"\\nActive threads: %d\\n\", active_threads);\r\n\r\n    cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, thread_count * sizeof(float), NULL, &ret);\r\n    cl_mem result_obj = clCreateBuffer(context, CL_MEM_READ_WRITE, thread_count * sizeof(float), NULL, &ret);\r\n    ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, thread_count * sizeof(float), A, 0, NULL, NULL);\r\n    ret = clEnqueueWriteBuffer(command_queue, result_obj, CL_TRUE, 0, thread_count * sizeof(float), result, 0, NULL, NULL);\r\n    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&a_mem_obj);\r\n    clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&chase_iterations);\r\n    clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&result_obj);\r\n    clFinish(command_queue);\r\n\r\n    // start with a low iteration count and try to make 
it work for all GPUs without needing manual iteration adjustment\r\n    while (time_diff_ms < TARGET_TIME_MS / 2) {\r\n        start_timing();\r\n        ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);\r\n        if (ret != CL_SUCCESS)\r\n        {\r\n            fprintf(stderr, \"Failed to submit kernel to command queue. clEnqueueNDRangeKernel returned %d\\n\", ret);\r\n            gOpsPerSec = 0;\r\n            return 0;\r\n        }\r\n\r\n        ret = clFinish(command_queue);\r\n        if (ret != CL_SUCCESS)\r\n        {\r\n            printf(\"Failed to finish command queue. clFinish returned %d\\n\", ret);\r\n            gOpsPerSec = 0;\r\n            return 0;\r\n        }\r\n\r\n        time_diff_ms = end_timing();\r\n\r\n        totalOps = (float)chase_iterations * 8 * (float)active_threads;\r\n        gOpsPerSec = ((float)totalOps / 1e9) / ((float)time_diff_ms / 1000);\r\n        //fprintf(stderr, \"chase iterations: %d, thread count: %d\\n\", chase_iterations, thread_count);\r\n        //fprintf(stderr, \"total ops: %f (%.2f G)\\ntotal time: %llu ms\\n\", totalOps, totalOps / 1e9, time_diff_ms);\r\n\r\n        chase_iterations = adjust_iterations(chase_iterations, time_diff_ms);\r\n        clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&chase_iterations);\r\n    }\r\n\r\n    clReleaseMemObject(a_mem_obj);\r\n    clReleaseMemObject(result_obj);\r\n    free(A);\r\n    free(result);\r\n    clReleaseKernel(kernel);\r\n    clReleaseProgram(program);\r\n    return gOpsPerSec;\r\n}\r\n\r\n// often takes time for clocks to settle?\r\n#define LATENCY_REPEAT 5\r\nfloat run_latency_test(cl_context context,\r\n    cl_command_queue command_queue,\r\n    cl_kernel kernel,\r\n    uint32_t chase_iterations,\r\n    int float4_element_count,\r\n    cl_mem a_mem_obj,\r\n    cl_mem result_obj,\r\n    cl_float* A,\r\n    cl_float* result,\r\n    float opsPerIteration)\r\n{\r\n    size_t global_item_size = 
1;\r\n    size_t local_item_size = 1;\r\n    cl_int ret;\r\n    float latency;\r\n    uint64_t time_diff_ms = 0;\r\n\r\n    // hack around latency taking longer\r\n    chase_iterations = chase_iterations / 50;\r\n\r\n    // testing returning a float4\r\n    memset(result, 0, sizeof(float) * 4);\r\n\r\n    ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, float4_element_count * sizeof(float), A, 0, NULL, NULL);\r\n    ret = clEnqueueWriteBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(float) * 4, result, 0, NULL, NULL);\r\n    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&a_mem_obj);\r\n    clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&chase_iterations);\r\n    clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&result_obj);\r\n    clFinish(command_queue);\r\n\r\n    //fprintf(stderr, \"Submitting fp32 add kernel to command queue\\n\");\r\n    // start with a low iteration count and try to make it work for all GPUs without needing manual iteration adjustment\r\n    while (time_diff_ms < TARGET_TIME_MS / 2) {\r\n        start_timing();\r\n        ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);\r\n        if (ret != CL_SUCCESS)\r\n        {\r\n            fprintf(stderr, \"Failed to submit kernel to command queue. clEnqueueNDRangeKernel returned %d\\n\", ret);\r\n            latency = 0;\r\n            return 0;\r\n        }\r\n\r\n        ret = clFinish(command_queue);\r\n        if (ret != CL_SUCCESS)\r\n        {\r\n            printf(\"Failed to finish command queue. 
clFinish returned %d\\n\", ret);\r\n            latency = 0;\r\n            return 0;\r\n        }\r\n\r\n        time_diff_ms = end_timing();\r\n        chase_iterations = adjust_iterations(chase_iterations, time_diff_ms);\r\n        clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&chase_iterations);\r\n    }\r\n\r\n    float totalOps = (float)chase_iterations * opsPerIteration * (float)global_item_size;\r\n    latency = (float)time_diff_ms * 1e6 / totalOps;\r\n    // fprintf(stderr, \"\\tinitial run: %f ns latency\\n\", latency);\r\n\r\n    float minLatency = 0.0f;\r\n    for (int i = 0; i < LATENCY_REPEAT; i++)\r\n    {\r\n        start_timing();\r\n        clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);\r\n        clFinish(command_queue);\r\n        time_diff_ms = end_timing();\r\n        latency = (float)time_diff_ms * 1e6 / totalOps;\r\n        // fprintf(stderr, \"\\trun %d: %f ns latency\\n\", i, latency);\r\n        if (i == 0 || latency < minLatency) minLatency = latency;\r\n    }\r\n\r\n    //fprintf(stderr, \"chase iterations: %d, thread count: %d\\n\", chase_iterations, thread_count);\r\n    //fprintf(stderr, \"total ops: %f (%.2f G)\\ntotal time: %llu ms\\n\", totalOps, totalOps / 1e9, time_diff_ms);\r\n    return minLatency;\r\n}\r\n\r\n// taking out FP64 because some implementations don't support it. 
putting another build program + create kernel section\r\n// in the main instruction rate test function would be too messy\r\nfloat fp64_instruction_rate_test(cl_context context,\r\n    cl_command_queue command_queue,\r\n    uint32_t thread_count,\r\n    uint32_t local_size,\r\n    uint32_t chase_iterations,\r\n    int float4_element_count,\r\n    cl_mem a_mem_obj,\r\n    cl_mem result_obj,\r\n    cl_float *A,\r\n    cl_float*result)\r\n{\r\n    size_t global_item_size = thread_count;\r\n    size_t local_item_size = local_size;\r\n    float gOpsPerSec, totalOps;\r\n    cl_int ret;\r\n    int64_t time_diff_ms;\r\n\r\n    // FP64 add test\r\n    uint32_t low_chase_iterations = chase_iterations / 4;\r\n    cl_double* fp64_A = (cl_double*)A;\r\n    for (int i = 0; i < float4_element_count * 2; i++)\r\n    {\r\n        fp64_A[i] = 0.5f * i;\r\n    }\r\n\r\n    memset(result, 0, sizeof(float) * 4 * thread_count);\r\n\r\n    cl_program program = build_program(context, \"instruction_rate_fp64_kernel.cl\", NULL);\r\n    if (saveprogram) write_program(program, \"fp64irate\");\r\n    cl_kernel fp64_add_rate_kernel = clCreateKernel(program, \"fp64_add_rate_test\", &ret);\r\n    cl_kernel fp64_fma_rate_kernel = clCreateKernel(program, \"fp64_fma_rate_test\", &ret);\r\n    cl_kernel fp64_mad_rate_kernel = clCreateKernel(program, \"fp64_mad_rate_test\", &ret);\r\n    totalOps = 2.0f * 8.0f;\r\n    gOpsPerSec = run_rate_test(context, command_queue, fp64_add_rate_kernel, thread_count, local_size, low_chase_iterations,\r\n        float4_element_count, a_mem_obj, result_obj, A, result, totalOps);\r\n    fprintf(stderr, \"FP64 G Adds/sec: %f\\n\", gOpsPerSec);\r\n    gOpsPerSec = run_rate_test(context, command_queue, fp64_fma_rate_kernel, thread_count, local_size, low_chase_iterations,\r\n        float4_element_count, a_mem_obj, result_obj, A, result, totalOps);\r\n    fprintf(stderr, \"FP64 G FMAs/sec: %f : %f FP64 GFLOPs\\n\", gOpsPerSec, gOpsPerSec * 2);\r\n    gOpsPerSec = 
run_rate_test(context, command_queue, fp64_mad_rate_kernel, thread_count, local_size, low_chase_iterations,\r\n        float4_element_count, a_mem_obj, result_obj, A, result, totalOps);\r\n    fprintf(stderr, \"FP64 G mad()/sec: %f : %f FP64 GFLOPs\\n\", gOpsPerSec, gOpsPerSec * 2);\r\n\r\n    return gOpsPerSec;\r\n}\r\n\r\n// taking out FP16 too because it requires an extension to be supported\r\nfloat fp16_instruction_rate_test(cl_context context,\r\n    cl_command_queue command_queue,\r\n    uint32_t thread_count,\r\n    uint32_t local_size,\r\n    uint32_t chase_iterations,\r\n    int float4_element_count,\r\n    cl_mem a_mem_obj,\r\n    cl_mem result_obj,\r\n    cl_float* A,\r\n    cl_float* result)\r\n{\r\n    size_t global_item_size = thread_count;\r\n    size_t local_item_size = local_size;\r\n    float gOpsPerSec, totalOps;\r\n    cl_int ret;\r\n    int64_t time_diff_ms;\r\n\r\n    // FP16 add test\r\n    uint32_t low_chase_iterations = chase_iterations / 4;\r\n    cl_half* fp16_A = (cl_half*)A;\r\n    for (int i = 0; i < float4_element_count * 8; i++)\r\n    {\r\n        fp16_A[i] = (cl_half)(0.5f * i);\r\n    }\r\n\r\n    memset(result, 0, sizeof(float) * 4 * thread_count);\r\n\r\n    cl_program program = build_program(context, \"instruction_rate_fp16_kernel.cl\", NULL);\r\n    if (saveprogram) write_program(program, \"fp16irate\");\r\n    cl_kernel fp16_add_rate_kernel = clCreateKernel(program, \"fp16_add_rate_test\", &ret);\r\n    cl_kernel fp16_fma_rate_kernel = clCreateKernel(program, \"fp16_fma_rate_test\", &ret);\r\n    //cl_kernel fp16_rsqrt_rate_kernel = clCreateKernel(program, \"fp16_rsqrt_rate_test\", &ret);\r\n    totalOps = 8.0f * 8.0f;\r\n    gOpsPerSec = run_rate_test(context, command_queue, fp16_add_rate_kernel, thread_count, local_size, low_chase_iterations,\r\n        float4_element_count, a_mem_obj, result_obj, A, result, totalOps);\r\n    fprintf(stderr, \"FP16 G Adds/sec: %f\\n\", gOpsPerSec);\r\n    gOpsPerSec = 
run_rate_test(context, command_queue, fp16_fma_rate_kernel, thread_count, local_size, low_chase_iterations,\r\n        float4_element_count, a_mem_obj, result_obj, A, result, totalOps);\r\n    fprintf(stderr, \"FP16 G FMAs/sec: %f : %f FP16 GFLOPs\\n\", gOpsPerSec, gOpsPerSec * 2);\r\n    /*gOpsPerSec = run_rate_test(context, command_queue, fp16_rsqrt_rate_kernel, thread_count, local_size, low_chase_iterations,\r\n        float4_element_count, a_mem_obj, result_obj, A, result, totalOps);\r\n    fprintf(stderr, \"FP16 G native_rsqrt/sec: %f\\n\", gOpsPerSec);*/\r\n\r\n    return gOpsPerSec;\r\n}\r\n"
  },
  {
    "path": "GpuMemLatency/instruction_rate_fp16_kernel.cl",
    "content": "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n#define rate_local_mem_test_size 256\r\n__kernel void fp16_add_rate_test(__global half8 *A, int count, __global half8 *ret) {\r\n    int tid = get_local_id(0);\r\n    int max_offset = get_local_size(0);\r\n    __global half8 *local_a = A;\r\n\r\n    int masked_tid = tid & (rate_local_mem_test_size - 1);\r\n    half8 v0 = local_a[masked_tid];\r\n    half8 v1 = local_a[masked_tid + 1];\r\n    half8 v2 = local_a[masked_tid + 2];\r\n    half8 v3 = local_a[masked_tid + 3];\r\n    half8 v4 = v0 + v1;\r\n    half8 v5 = v0 + v2;\r\n    half8 v6 = v0 + v3;\r\n    half8 v7 = v1 + v2;\r\n    half8 acc = local_a[0];\r\n\r\n    for (int i = 0; i < count; i++) {\r\n        v0 += acc;\r\n        v1 += acc;\r\n        v2 += acc;\r\n        v3 += acc;\r\n        v4 += acc;\r\n        v5 += acc;\r\n        v6 += acc;\r\n        v7 += acc;\r\n    }\r\n\r\n    ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;\r\n}\r\n\r\n__kernel void fp16_fma_rate_test(__global half8 *A, int count, __global half8 *ret) {\r\n    int tid = get_local_id(0);\r\n    int max_offset = get_local_size(0);\r\n    __global half8 *local_a = A;\r\n\r\n    int masked_tid = tid & (rate_local_mem_test_size - 1);\r\n    half8 v0 = local_a[masked_tid];\r\n    half8 v1 = local_a[masked_tid + 1];\r\n    half8 v2 = local_a[masked_tid + 2];\r\n    half8 v3 = local_a[masked_tid + 3];\r\n    half8 v4 = v0 + v1;\r\n    half8 v5 = v0 + v2;\r\n    half8 v6 = v0 + v3;\r\n    half8 v7 = v1 + v2;\r\n    half8 acc = local_a[0];\r\n\r\n    for (int i = 0; i < count; i++) {\r\n        v0 += acc * v0;\r\n        v1 += acc * v1;\r\n        v2 += acc * v2;\r\n        v3 += acc * v3;\r\n        v4 += acc * v4;\r\n        v5 += acc * v5;\r\n        v6 += acc * v6;\r\n        v7 += acc * v7;\r\n    }\r\n\r\n    ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;\r\n}\n\n/*__kernel void fp16_rsqrt_rate_test(__global half8 *A, int count, __global half8 
*ret) {\r\n    int tid = get_local_id(0);\r\n    int max_offset = get_local_size(0);\r\n    __global half8 *local_a = A;\r\n\r\n    int masked_tid = tid & (rate_local_mem_test_size - 1);\r\n    half8 v0 = local_a[masked_tid];\r\n    half8 v1 = local_a[masked_tid + 1];\r\n    half8 v2 = local_a[masked_tid + 2];\r\n    half8 v3 = local_a[masked_tid + 3];\r\n    half8 v4 = v0 + v1;\r\n    half8 v5 = v0 + v2;\r\n    half8 v6 = v0 + v3;\r\n    half8 v7 = v1 + v2;\r\n\r\n    for (int i = 0; i < count; i++) {\r\n        v0 = native_rsqrt(v0);\r\n        v1 = native_rsqrt(v1);\r\n        v2 = native_rsqrt(v2);\r\n        v3 = native_rsqrt(v3);\r\n        v4 = native_rsqrt(v4);\r\n        v5 = native_rsqrt(v5);\r\n        v6 = native_rsqrt(v6);\r\n        v7 = native_rsqrt(v7);\r\n    }\r\n\r\n    ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;\r\n}\n*/"
  },
  {
    "path": "GpuMemLatency/instruction_rate_fp64_kernel.cl",
    "content": "#define rate_local_mem_test_size 256\r\n__kernel void fp64_add_rate_test(__global double2 *A, int count, __global double2 *ret) {\r\n    int tid = get_local_id(0);\r\n    int max_offset = get_local_size(0);\r\n    __global double2 *local_a = A;\r\n\r\n    int masked_tid = tid & (rate_local_mem_test_size - 1);\r\n    double2 v0 = local_a[masked_tid];\r\n    double2 v1 = local_a[masked_tid + 1];\r\n    double2 v2 = local_a[masked_tid + 2];\r\n    double2 v3 = local_a[masked_tid + 3];\r\n    double2 v4 = v0 + v1;\r\n    double2 v5 = v0 + v2;\r\n    double2 v6 = v0 + v3;\r\n    double2 v7 = v1 + v2;\r\n    double2 acc = local_a[0];\r\n\r\n    for (int i = 0; i < count; i++) {\r\n        v0 += acc;\r\n        v1 += acc;\r\n        v2 += acc;\r\n        v3 += acc;\r\n        v4 += acc;\r\n        v5 += acc;\r\n        v6 += acc;\r\n        v7 += acc;\r\n    }\r\n\r\n    ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;\r\n}\r\n\r\n__kernel void fp64_fma_rate_test(__global double2 *A, int count, __global double2 *ret) {\r\n    int tid = get_local_id(0);\r\n    int max_offset = get_local_size(0);\r\n    __global double2 *local_a = A;\r\n\r\n    int masked_tid = tid & (rate_local_mem_test_size - 1);\r\n    double2 v0 = local_a[masked_tid];\r\n    double2 v1 = local_a[masked_tid + 1];\r\n    double2 v2 = local_a[masked_tid + 2];\r\n    double2 v3 = local_a[masked_tid + 3];\r\n    double2 v4 = v0 + v1;\r\n    double2 v5 = v0 + v2;\r\n    double2 v6 = v0 + v3;\r\n    double2 v7 = v1 + v2;\r\n    double2 acc = local_a[0];\r\n\r\n    for (int i = 0; i < count; i++) {\r\n        v0 += acc * v0;\r\n        v1 += acc * v1;\r\n        v2 += acc * v2;\r\n        v3 += acc * v3;\r\n        v4 += acc * v4;\r\n        v5 += acc * v5;\r\n        v6 += acc * v6;\r\n        v7 += acc * v7;\r\n    }\r\n\r\n    ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;\r\n}\n\n__kernel void fp64_mad_rate_test(__global double2 *A, int count, __global double2 
*ret) {\r\n    int tid = get_local_id(0);\r\n    int max_offset = get_local_size(0);\r\n    __global double2 *local_a = A;\r\n\r\n    int masked_tid = tid & (rate_local_mem_test_size - 1);\r\n    double2 v0 = local_a[masked_tid];\r\n    double2 v1 = local_a[masked_tid + 1];\r\n    double2 v2 = local_a[masked_tid + 2];\r\n    double2 v3 = local_a[masked_tid + 3];\r\n    double2 v4 = v0 + v1;\r\n    double2 v5 = v0 + v2;\r\n    double2 v6 = v0 + v3;\r\n    double2 v7 = v1 + v2;\r\n    double2 acc = local_a[0];\r\n\r\n    for (int i = 0; i < count; i++) {\r\n        v0 = mad(acc, v0, v0);\r\n        v1 = mad(acc, v1, v1);\r\n        v2 = mad(acc, v2, v2);\r\n        v3 = mad(acc, v3, v3);\r\n        v4 = mad(acc, v4, v4);\r\n        v5 = mad(acc, v5, v5);\r\n        v6 = mad(acc, v6, v6);\r\n        v7 = mad(acc, v7, v7);\r\n    }\r\n\r\n    ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;\r\n}\n"
  },
  {
    "path": "GpuMemLatency/instruction_rate_kernel.cl",
    "content": "#define rate_local_mem_test_size 512\r\n\r\n// A must be at least (local size * 4) uint32 elements in size, but must not exceed local mem size\r\n// jk it doesn't use local mem now\r\n__kernel void int32_add_rate_test(__global uint4 *A, int count, __global uint4 *ret) {\r\n    int tid = get_local_id(0);\r\n    int max_offset = get_local_size(0);\r\n\r\n    __local uint4 local_a[rate_local_mem_test_size];\r\n    for (int i = tid;i < rate_local_mem_test_size; i += max_offset)\r\n        local_a[i] = A[i];\r\n    barrier(CLK_LOCAL_MEM_FENCE);\r\n    // __global uint4 *local_a = A;\r\n\r\n    int masked_tid = min(tid, rate_local_mem_test_size - 8);\r\n    uint4 v0 = local_a[masked_tid];\r\n    uint4 v1 = local_a[masked_tid + 1];\r\n    uint4 v2 = local_a[masked_tid + 2];\r\n    uint4 v3 = local_a[masked_tid + 3];\r\n    uint4 v4 = local_a[masked_tid + 4];\r\n    uint4 v5 = local_a[masked_tid + 5];\r\n    uint4 v6 = local_a[masked_tid + 6];\r\n    uint4 v7 = local_a[masked_tid + 7];\r\n\r\n    for (int i = 0; i < count; i++) {\r\n        uint4 acc = local_a[i & (rate_local_mem_test_size - 1)];\r\n        v0 += acc;\r\n        v1 += acc;\r\n        v2 += acc;\r\n        v3 += acc;\r\n        v4 += acc;\r\n        v5 += acc;\r\n        v6 += acc;\r\n        v7 += acc;\r\n    }\r\n\r\n    ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;\r\n}\r\n\r\n__kernel void int32_mul_rate_test(__global uint4 *A, int count, __global uint4 *ret) {\r\n    int tid = get_local_id(0);\r\n    int max_offset = get_local_size(0);\r\n    __global uint4 *local_a = A;\r\n\r\n    int masked_tid = tid & (rate_local_mem_test_size - 1);\r\n    uint4 v0 = local_a[masked_tid];\r\n    uint4 v1 = local_a[masked_tid + 1];\r\n    uint4 v2 = local_a[masked_tid + 2];\r\n    uint4 v3 = local_a[masked_tid + 3];\r\n    uint4 v4 = v0 + v1;\r\n    uint4 v5 = v0 + v2;\r\n    uint4 v6 = v0 + v3;\r\n    uint4 v7 = v1 + v2;\r\n    uint4 acc = local_a[0];\r\n\r\n    for (int i = 0; i < 
count; i++) {\r\n        //uint4 acc = local_a[i & (rate_local_mem_test_size) - 1];\r\n        v0 *= acc;\r\n        v1 *= acc;\r\n        v2 *= acc;\r\n        v3 *= acc;\r\n        v4 *= acc;\r\n        v5 *= acc;\r\n        v6 *= acc;\r\n        v7 *= acc;\r\n    }\r\n\r\n    ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;\r\n}\r\n\r\n__kernel void fp32_add_rate_test(__global float4 *A, int count, __global float4 *ret) {\r\n    int tid = get_local_id(0);\r\n    int max_offset = get_local_size(0);\r\n    __global float4 *local_a = A;\r\n\r\n    int masked_tid = tid & (rate_local_mem_test_size - 1);\r\n    float4 v0 = local_a[masked_tid];\r\n    float4 v1 = local_a[masked_tid + 1];\r\n    float4 v2 = local_a[masked_tid + 2];\r\n    float4 v3 = local_a[masked_tid + 3];\r\n    float4 v4 = v0 + v1;\r\n    float4 v5 = v0 + v2;\r\n    float4 v6 = v0 + v3;\r\n    float4 v7 = v1 + v2;\r\n    float4 acc = local_a[0];\r\n\r\n    for (int i = 0; i < count; i++) {\r\n        //float4 acc = local_a[i & (rate_local_mem_test_size) - 1];\r\n        v0 += acc;\r\n        v1 += acc;\r\n        v2 += acc;\r\n        v3 += acc;\r\n        v4 += acc;\r\n        v5 += acc;\r\n        v6 += acc;\r\n        v7 += acc;\r\n    }\r\n\r\n    ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;\r\n}\r\n\r\n__kernel void fp32_rcp_rate_test(__global float4 *A, int count, __global float4 *ret) {\r\n    int tid = get_local_id(0);\r\n    int max_offset = get_local_size(0);\r\n    __global float4 *local_a = A;\r\n\r\n    int masked_tid = tid & (rate_local_mem_test_size - 1);\r\n    float4 v0 = local_a[masked_tid];\r\n    float4 v1 = local_a[masked_tid + 1];\r\n    float4 v2 = local_a[masked_tid + 2];\r\n    float4 v3 = local_a[masked_tid + 3];\r\n    float4 v4 = v0 + v1;\r\n    float4 v5 = v0 + v2;\r\n    float4 v6 = v0 + v3;\r\n    float4 v7 = v1 + v2;\r\n    float4 acc = local_a[0];\r\n\r\n    for (int i = 0; i < count; i++) {\r\n        v0 = native_recip(v0);\r\n        
v1 = native_recip(v1);\r\n        v2 = native_recip(v2);\r\n        v3 = native_recip(v3);\r\n        v4 = native_recip(v4);\r\n        v5 = native_recip(v5);\r\n        v6 = native_recip(v6);\r\n        v7 = native_recip(v7);\r\n    }\r\n\r\n    ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;\r\n}\r\n\r\n__kernel void fp32_rsqrt_rate_test(__global float4 *A, int count, __global float4 *ret) {\r\n    int tid = get_local_id(0);\r\n    int max_offset = get_local_size(0);\r\n    __global float4 *local_a = A;\r\n\r\n    int masked_tid = tid & (rate_local_mem_test_size - 1);\r\n    float4 v0 = local_a[masked_tid];\r\n    float4 v1 = local_a[masked_tid + 1];\r\n    float4 v2 = local_a[masked_tid + 2];\r\n    float4 v3 = local_a[masked_tid + 3];\r\n    float4 v4 = v0 + v1;\r\n    float4 v5 = v0 + v2;\r\n    float4 v6 = v0 + v3;\r\n    float4 v7 = v1 + v2;\r\n    float4 acc = local_a[0];\r\n\r\n    for (int i = 0; i < count; i++) {\r\n        v0 = native_rsqrt(v0);\r\n        v1 = native_rsqrt(v1);\r\n        v2 = native_rsqrt(v2);\r\n        v3 = native_rsqrt(v3);\r\n        v4 = native_rsqrt(v4);\r\n        v5 = native_rsqrt(v5);\r\n        v6 = native_rsqrt(v6);\r\n        v7 = native_rsqrt(v7);\r\n    }\r\n\r\n    ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;\r\n}\r\n\r\n__kernel void int64_add_rate_test(__global ulong2 *A, int count, __global ulong2 *ret) {\r\n    int tid = get_local_id(0);\r\n    int max_offset = get_local_size(0);\r\n    __global ulong2 *local_a = A;\r\n\r\n    int masked_tid = tid & (rate_local_mem_test_size - 1);\r\n    ulong2 v0 = local_a[masked_tid];\r\n    ulong2 v1 = local_a[masked_tid + 1];\r\n    ulong2 v2 = local_a[masked_tid + 2];\r\n    ulong2 v3 = local_a[masked_tid + 3];\r\n    ulong2 v4 = v0 + v1;\r\n    ulong2 v5 = v0 + v2;\r\n    ulong2 v6 = v0 + v3;\r\n    ulong2 v7 = v1 + v2;\r\n    ulong2 acc = local_a[0];\r\n\r\n    for (int i = 0; i < count; i++) {\r\n        //uint4 acc = local_a[i & 
(rate_local_mem_test_size) - 1];\r\n        v0 += acc;\r\n        v1 += acc;\r\n        v2 += acc;\r\n        v3 += acc;\r\n        v4 += acc;\r\n        v5 += acc;\r\n        v6 += acc;\r\n        v7 += acc;\r\n    }\r\n\r\n    ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;\r\n}\r\n\r\n__kernel void int64_mul_rate_test(__global ulong2 *A, int count, __global ulong2 *ret) {\r\n    int tid = get_local_id(0);\r\n    int max_offset = get_local_size(0);\r\n    __global ulong2 *local_a = A;\r\n\r\n    int masked_tid = tid & (rate_local_mem_test_size - 1);\r\n    ulong2 v0 = local_a[masked_tid];\r\n    ulong2 v1 = local_a[masked_tid + 1];\r\n    ulong2 v2 = local_a[masked_tid + 2];\r\n    ulong2 v3 = local_a[masked_tid + 3];\r\n    ulong2 v4 = v0 + v1;\r\n    ulong2 v5 = v0 + v2;\r\n    ulong2 v6 = v0 + v3;\r\n    ulong2 v7 = v1 + v2;\r\n    ulong2 acc = local_a[0];\r\n\r\n    for (int i = 0; i < count; i++) {\r\n        //uint4 acc = local_a[i & (rate_local_mem_test_size) - 1];\r\n        v0 *= acc;\r\n        v1 *= acc;\r\n        v2 *= acc;\r\n        v3 *= acc;\r\n        v4 *= acc;\r\n        v5 *= acc;\r\n        v6 *= acc;\r\n        v7 *= acc;\r\n    }\r\n\r\n    ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;\r\n}\r\n\r\n\r\n__kernel void mix_fp32_int32_add_rate_test(__global float4 *A, int count, __global float4 *ret) {\r\n    int tid = get_local_id(0);\r\n    int max_offset = get_local_size(0);\r\n    \r\n    __local int4 local_a[rate_local_mem_test_size];\r\n    for (int i = tid;i < rate_local_mem_test_size; i += max_offset)\r\n        local_a[i] = convert_int4_sat(A[i]);\r\n    barrier(CLK_LOCAL_MEM_FENCE); \r\n\r\n    int masked_tid = tid & (rate_local_mem_test_size - 1);\r\n    float4 v0 = A[masked_tid];\r\n    float4 v1 = A[masked_tid + 1];\r\n    float4 v2 = A[masked_tid + 2];\r\n    float4 v3 = A[masked_tid + 3];\r\n    int4 v4 = convert_int4_sat(v0 + v1);\r\n    int4 v5 = convert_int4_sat(v0 + v2);\r\n    int4 v6 = 
convert_int4_sat(v0 + v3);\r\n    int4 v7 = convert_int4_sat(v1 + v2);\r\n    float4 fp_acc = A[0];\r\n\r\n    for (int i = 0; i < count; i++) {\r\n\tint4 int_acc = local_a[i & (rate_local_mem_test_size - 1)];\r\n        v0 += fp_acc;\r\n        v1 += fp_acc;\r\n        v2 += fp_acc;\r\n        v3 += fp_acc;\r\n        v4 += int_acc;\r\n        v5 += int_acc;\r\n        v6 += int_acc;\r\n        v7 += int_acc;\r\n    }\r\n\r\n    ret[get_global_id(0)] = v0 + v1 + v2 + v3 + convert_float4(v4 + v5 + v6 + v7);\r\n}\r\n\r\n__kernel void mix_fp32_int32_addmul_rate_test(__global float4 *A, int count, __global float4 *ret) {\r\n    int tid = get_local_id(0);\r\n    int max_offset = get_local_size(0);\r\n    __global float4 *fp32_a = A;\r\n\r\n    int masked_tid = tid & (rate_local_mem_test_size - 1);\r\n    float4 v0 = fp32_a[masked_tid];\r\n    float4 v1 = fp32_a[masked_tid + 1];\r\n    float4 v2 = fp32_a[masked_tid + 2];\r\n    float4 v3 = fp32_a[masked_tid + 3];\r\n    int4 v4 = convert_int4_sat(v0 + v1);\r\n    int4 v5 = convert_int4_sat(v0 + v2);\r\n    int4 v6 = convert_int4_sat(v0 + v3);\r\n    int4 v7 = convert_int4_sat(v1 + v2);\r\n    float4 fp_acc = fp32_a[0];\r\n    int4 int_acc = convert_int4_sat(fp32_a[0]);\r\n\r\n    for (int i = 0; i < count; i++) {\r\n        v0 += fp_acc;\r\n        v1 += fp_acc;\r\n        v2 += fp_acc;\r\n        v3 += fp_acc;\r\n        v4 *= int_acc;\r\n        v5 *= int_acc;\r\n        v6 *= int_acc;\r\n        v7 *= int_acc;\r\n    }\r\n\r\n    ret[get_global_id(0)] = v0 + v1 + v2 + v3 + convert_float4(v4 + v5 + v6 + v7);\r\n}\r\n\r\n__kernel void fp32_fma_rate_test(__global float4 *A, int count, __global float4 *ret) {\r\n    int tid = get_local_id(0);\r\n    int max_offset = get_local_size(0);\r\n    __global float4 *local_a = A;\r\n\r\n    int masked_tid = tid & (rate_local_mem_test_size - 1);\r\n    float4 v0 = local_a[masked_tid];\r\n    float4 v1 = local_a[masked_tid + 1];\r\n    float4 v2 = local_a[masked_tid + 2];\r\n    
float4 v3 = local_a[masked_tid + 3];\r\n    float4 v4 = local_a[masked_tid + 4];\r\n    float4 v5 = local_a[masked_tid + 5];\r\n    float4 v6 = local_a[masked_tid + 6];\r\n    float4 v7 = local_a[masked_tid + 7];\r\n    float4 acc = local_a[0];\r\n\r\n    for (int i = 0; i < count; i++) {\r\n        v0 += acc * v0;\r\n        v1 += acc * v1;\r\n        v2 += acc * v2;\r\n        v3 += acc * v3;\r\n        v4 += acc * v4;\r\n        v5 += acc * v5;\r\n        v6 += acc * v6;\r\n        v7 += acc * v7;\r\n    }\r\n\r\n    ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;\r\n}\r\n\r\n__kernel void fp32_builtin_fma_rate_test(__global float4 *A, int count, __global float4 *ret) {\r\n    int tid = get_local_id(0);\r\n    int max_offset = get_local_size(0);\r\n    __global float4 *local_a = A;\r\n\r\n    int masked_tid = tid & (rate_local_mem_test_size - 1);\r\n    float4 v0 = local_a[masked_tid];\r\n    float4 v1 = local_a[masked_tid + 1];\r\n    float4 v2 = local_a[masked_tid + 2];\r\n    float4 v3 = local_a[masked_tid + 3];\r\n    float4 v4 = local_a[masked_tid + 4];\r\n    float4 v5 = local_a[masked_tid + 5];\r\n    float4 v6 = local_a[masked_tid + 6];\r\n    float4 v7 = local_a[masked_tid + 7];\r\n    float4 acc = local_a[0];\r\n\r\n    for (int i = 0; i < count; i++) {\r\n\tv0 = fma(acc, v0, v0);\r\n\tv1 = fma(acc, v1, v1);\r\n\tv2 = fma(acc, v2, v2);\r\n\tv3 = fma(acc, v3, v3);\r\n\tv4 = fma(acc, v4, v4);\r\n\tv5 = fma(acc, v5, v5);\r\n\tv6 = fma(acc, v6, v6);\r\n\tv7 = fma(acc, v7, v7);\r\n    }\r\n\r\n    ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;\r\n}\r\n\r\n__kernel void fp32_mad_rate_test(__global float4 *A, int count, __global float4 *ret) {\r\n    int tid = get_local_id(0);\r\n    int max_offset = get_local_size(0);\r\n    __global float4 *local_a = A;\r\n\r\n    int masked_tid = tid & (rate_local_mem_test_size - 1);\r\n    float4 v0 = local_a[masked_tid];\r\n    float4 v1 = local_a[masked_tid + 1];\r\n    float4 v2 = 
local_a[masked_tid + 2];\r\n    float4 v3 = local_a[masked_tid + 3];\r\n    float4 v4 = v0 + v1;\r\n    float4 v5 = v0 + v2;\r\n    float4 v6 = v0 + v3;\r\n    float4 v7 = v1 + v2;\r\n    float4 acc = local_a[0];\r\n\r\n    for (int i = 0; i < count; i++) {\r\n        //float4 acc = local_a[i & (rate_local_mem_test_size) - 1];\r\n        v0 = mad(acc, v0, v0);\r\n        v1 = mad(acc, v1, v1);\r\n        v2 = mad(acc, v2, v2);\r\n        v3 = mad(acc, v3, v3);\r\n        v4 = mad(acc, v4, v4);\r\n        v5 = mad(acc, v5, v5);\r\n        v6 = mad(acc, v6, v6);\r\n        v7 = mad(acc, v7, v7);\r\n    }\r\n\r\n    ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;\r\n}\r\n\r\n__kernel void int16_add_rate_test(__global short8 *A, int count, __global short8 *ret) {\r\n    int tid = get_local_id(0);\r\n    int max_offset = get_local_size(0);\r\n    //__global short8 *local_a = A;\r\n\r\n    __local short8 local_a[rate_local_mem_test_size];\r\n    for (int i = tid;i < rate_local_mem_test_size; i += max_offset)\r\n        local_a[i] = A[i];\r\n    barrier(CLK_LOCAL_MEM_FENCE); \r\n\r\n    int masked_tid = min(tid, rate_local_mem_test_size - 8);\r\n    short8 v0 = local_a[masked_tid];\r\n    short8 v1 = local_a[masked_tid + 1];\r\n    short8 v2 = local_a[masked_tid + 2];\r\n    short8 v3 = local_a[masked_tid + 3];\r\n    short8 v4 = local_a[masked_tid + 4];\r\n    short8 v5 = local_a[masked_tid + 5];\r\n    short8 v6 = local_a[masked_tid + 6];\r\n    short8 v7 = local_a[masked_tid + 7];\r\n\r\n    for (int i = 0; i < count; i++) {\r\n\tshort8 acc = local_a[i & (rate_local_mem_test_size - 1)];\r\n        v0 += acc;\r\n        v1 += acc;\r\n        v2 += acc;\r\n        v3 += acc;\r\n        v4 += acc;\r\n        v5 += acc;\r\n        v6 += acc;\r\n        v7 += acc;\r\n    }\r\n\r\n    ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;\r\n}\r\n\r\n__kernel void int16_mul_rate_test(__global short8 *A, int count, __global short8 *ret) {\r\n    int tid 
= get_local_id(0);\r\n    int max_offset = get_local_size(0);\r\n    //__global short8 *local_a = A;\r\n\r\n    __local short8 local_a[rate_local_mem_test_size];\r\n    for (int i = tid;i < rate_local_mem_test_size; i += max_offset)\r\n        local_a[i] = A[i];\r\n    barrier(CLK_LOCAL_MEM_FENCE);  \r\n\r\n    int masked_tid = min(tid, rate_local_mem_test_size - 8);\r\n    short8 v0 = local_a[masked_tid];\r\n    short8 v1 = local_a[masked_tid + 1];\r\n    short8 v2 = local_a[masked_tid + 2];\r\n    short8 v3 = local_a[masked_tid + 3];\r\n    short8 v4 = local_a[masked_tid + 4];\r\n    short8 v5 = local_a[masked_tid + 5];\r\n    short8 v6 = local_a[masked_tid + 6]; \r\n    short8 v7 = local_a[masked_tid + 7];\r\n\r\n    for (int i = 0; i < count; i++) {\r\n\tshort8 acc = local_a[i & (rate_local_mem_test_size - 1)];\r\n        v0 *= acc;\r\n        v1 *= acc;\r\n        v2 *= acc;\r\n        v3 *= acc;\r\n        v4 *= acc;\r\n        v5 *= acc;\r\n        v6 *= acc;\r\n        v7 *= acc;\r\n    }\r\n\r\n    ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;\r\n}\r\n\r\n__kernel void int8_add_rate_test(__global char16 *A, int count, __global char16 *ret) {\r\n    int tid = get_local_id(0);\r\n    int max_offset = get_local_size(0);\r\n    __global char16 *local_a = A;\r\n\r\n    int masked_tid = tid & (rate_local_mem_test_size - 1);\r\n    char16 v0 = local_a[masked_tid];\r\n    char16 v1 = local_a[masked_tid + 1];\r\n    char16 v2 = local_a[masked_tid + 2];\r\n    char16 v3 = local_a[masked_tid + 3];\r\n    char16 v4 = v0 + v1;\r\n    char16 v5 = v0 + v2;\r\n    char16 v6 = v0 + v3;\r\n    char16 v7 = v1 + v2;\r\n    char16 acc = local_a[0];\r\n\r\n    for (int i = 0; i < count; i++) {\r\n        v0 += acc;\r\n        v1 += acc;\r\n        v2 += acc;\r\n        v3 += acc;\r\n        v4 += acc;\r\n        v5 += acc;\r\n        v6 += acc;\r\n        v7 += acc;\r\n    }\r\n\r\n    ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + 
v7;\r\n}\r\n\r\n__kernel void int8_mul_rate_test(__global char16 *A, int count, __global char16 *ret) {\r\n    int tid = get_local_id(0);\r\n    int max_offset = get_local_size(0);\r\n    __global char16 *local_a = A;\r\n\r\n    int masked_tid = tid & (rate_local_mem_test_size - 1);\r\n    char16 v0 = local_a[masked_tid];\r\n    char16 v1 = local_a[masked_tid + 1];\r\n    char16 v2 = local_a[masked_tid + 2];\r\n    char16 v3 = local_a[masked_tid + 3];\r\n    char16 v4 = v0 + v1;\r\n    char16 v5 = v0 + v2;\r\n    char16 v6 = v0 + v3;\r\n    char16 v7 = v1 + v2;\r\n    char16 acc = local_a[0];\r\n\r\n    for (int i = 0; i < count; i++) {\r\n        v0 *= acc;\r\n        v1 *= acc;\r\n        v2 *= acc;\r\n        v3 *= acc;\r\n        v4 *= acc;\r\n        v5 *= acc;\r\n        v6 *= acc;\r\n        v7 *= acc;\r\n    }\r\n\r\n    ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;\r\n}\r\n\r\n__kernel void fp32_fma_latency_test(__global float *A, int count, __global float *ret) {\r\n    int tid = get_local_id(0);\r\n    int max_offset = get_local_size(0);\r\n    __global float *local_a = A;\r\n\r\n    int masked_tid = tid & (rate_local_mem_test_size - 1);\r\n    float v0 = local_a[masked_tid];\r\n    float v1 = local_a[masked_tid + 1];\r\n    float v2 = local_a[masked_tid + 2];\r\n    float v3 = local_a[masked_tid + 3];\r\n    float v4 = v0 + v1;\r\n    float v5 = v0 + v2;\r\n    float v6 = v0 + v3;\r\n    float v7 = v1 + v2;\r\n    float acc = local_a[0];\r\n\r\n    for (int i = 0; i < count; i += 4) {\r\n        v0 = v7 + acc * v0;\r\n        v1 = v0 + acc * v1;\r\n        v2 = v1 + acc * v2;\r\n        v3 = v2 + acc * v3;\r\n        v4 = v3 + acc * v4;\r\n        v5 = v4 + acc * v5;\r\n        v6 = v5 + acc * v6;\r\n        v7 = v6 + acc * v7;\r\n\r\n        v0 = v7 + acc * v0;\r\n        v1 = v0 + acc * v1;\r\n        v2 = v1 + acc * v2;\r\n        v3 = v2 + acc * v3;\r\n        v4 = v3 + acc * v4;\r\n        v5 = v4 + acc * v5;\r\n        v6 = v5 + 
acc * v6;\r\n        v7 = v6 + acc * v7;\r\n\r\n        v0 = v7 + acc * v0;\r\n        v1 = v0 + acc * v1;\r\n        v2 = v1 + acc * v2;\r\n        v3 = v2 + acc * v3;\r\n        v4 = v3 + acc * v4;\r\n        v5 = v4 + acc * v5;\r\n        v6 = v5 + acc * v6;\r\n        v7 = v6 + acc * v7;\r\n\r\n        v0 = v7 + acc * v0;\r\n        v1 = v0 + acc * v1;\r\n        v2 = v1 + acc * v2;\r\n        v3 = v2 + acc * v3;\r\n        v4 = v3 + acc * v4;\r\n        v5 = v4 + acc * v5;\r\n        v6 = v5 + acc * v6;\r\n        v7 = v6 + acc * v7;\r\n    }\r\n\r\n    ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;\r\n}\r\n\r\n__kernel void fp32_add_latency_test(__global float *A, int count, __global float *ret) {\r\n    int tid = get_local_id(0);\r\n    int max_offset = get_local_size(0);\r\n    __global float *local_a = A;\r\n\r\n    int masked_tid = tid & (rate_local_mem_test_size - 1);\r\n    float v0 = local_a[masked_tid];\r\n    float v1 = local_a[masked_tid + 1];\r\n    float v2 = local_a[masked_tid + 2];\r\n    float v3 = local_a[masked_tid + 3];\r\n    float v4 = v0 + v1;\r\n    float v5 = v0 + v2;\r\n    float v6 = v0 + v3;\r\n    float v7 = v1 + v2;\r\n    float acc = local_a[0];\r\n\r\n    for (int i = 0; i < count; i += 4) {\r\n        v0 = v7 + v0;\r\n        v1 = v0 + v1;\r\n        v2 = v1 + v2;\r\n        v3 = v2 + v3;\r\n        v4 = v3 + v4;\r\n        v5 = v4 + v5;\r\n        v6 = v5 + v6;\r\n        v7 = v6 + v7;\r\n\r\n        v0 = v7 + v0;\r\n        v1 = v0 + v1;\r\n        v2 = v1 + v2;\r\n        v3 = v2 + v3;\r\n        v4 = v3 + v4;\r\n        v5 = v4 + v5;\r\n        v6 = v5 + v6;\r\n        v7 = v6 + v7;\r\n\r\n        v0 = v7 + v0;\r\n        v1 = v0 + v1;\r\n        v2 = v1 + v2;\r\n        v3 = v2 + v3;\r\n        v4 = v3 + v4;\r\n        v5 = v4 + v5;\r\n        v6 = v5 + v6;\r\n        v7 = v6 + v7;\r\n\r\n        v0 = v7 + v0;\r\n        v1 = v0 + v1;\r\n        v2 = v1 + v2;\r\n        v3 = v2 + v3;\r\n        v4 = v3 + 
v4;\r\n        v5 = v4 + v5;\r\n        v6 = v5 + v6;\r\n        v7 = v6 + v7;\r\n    }\r\n\r\n    ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;\r\n}\r\n\r\n__kernel void int32_add_latency_test(__global uint *A, int count, __global uint *ret) {\r\n    int tid = get_local_id(0);\r\n    int max_offset = get_local_size(0);\r\n\r\n    int masked_tid = tid & (rate_local_mem_test_size - 1);\r\n    uint v0 = A[masked_tid];\r\n    uint v1 = A[masked_tid + 1];\r\n    uint v2 = A[masked_tid + 2];\r\n    uint v3 = A[masked_tid + 3];\r\n    uint v4 = v0 + v1;\r\n    uint v5 = v0 + v2;\r\n    uint v6 = v0 + v3;\r\n    uint v7 = v1 + v2;\r\n\r\n    for (int i = 0; i < count; i += 4) {\r\n        v0 = v7 + v0;\r\n        v1 = v0 + v1;\r\n        v2 = v1 + v2;\r\n        v3 = v2 + v3;\r\n        v4 = v3 + v4;\r\n        v5 = v4 + v5;\r\n        v6 = v5 + v6;\r\n        v7 = v6 + v7;\r\n\r\n        v0 = v7 + v0;\r\n        v1 = v0 + v1;\r\n        v2 = v1 + v2;\r\n        v3 = v2 + v3;\r\n        v4 = v3 + v4;\r\n        v5 = v4 + v5;\r\n        v6 = v5 + v6;\r\n        v7 = v6 + v7;\r\n\r\n        v0 = v7 + v0;\r\n        v1 = v0 + v1;\r\n        v2 = v1 + v2;\r\n        v3 = v2 + v3;\r\n        v4 = v3 + v4;\r\n        v5 = v4 + v5;\r\n        v6 = v5 + v6;\r\n        v7 = v6 + v7;\r\n\r\n        v0 = v7 + v0;\r\n        v1 = v0 + v1;\r\n        v2 = v1 + v2;\r\n        v3 = v2 + v3;\r\n        v4 = v3 + v4;\r\n        v5 = v4 + v5;\r\n        v6 = v5 + v6;\r\n        v7 = v6 + v7;\r\n    }\r\n\r\n    ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;\r\n}\r\n\r\n__kernel void int32_mul_latency_test(__global uint *A, int count, __global uint *ret) {\r\n    int tid = get_local_id(0);\r\n    int max_offset = get_local_size(0);\r\n    __global uint *local_a = A;\r\n\r\n    int masked_tid = tid & (rate_local_mem_test_size - 1);\r\n    uint v0 = local_a[masked_tid];\r\n    uint v1 = local_a[masked_tid + 1];\r\n    uint v2 = local_a[masked_tid + 2];\r\n    
uint v3 = local_a[masked_tid + 3];\r\n    uint v4 = v0 + v1;\r\n    uint v5 = v0 + v2;\r\n    uint v6 = v0 + v3;\r\n    uint v7 = v1 + v2;\r\n    uint acc = local_a[0];\r\n\r\n    for (int i = 0; i < count; i += 4) {\r\n        v0 = v7 * v0;\r\n        v1 = v0 * v1;\r\n        v2 = v1 * v2;\r\n        v3 = v2 * v3;\r\n        v4 = v3 * v4;\r\n        v5 = v4 * v5;\r\n        v6 = v5 * v6;\r\n        v7 = v6 * v7;\r\n\r\n        v0 = v7 * v0;\r\n        v1 = v0 * v1;\r\n        v2 = v1 * v2;\r\n        v3 = v2 * v3;\r\n        v4 = v3 * v4;\r\n        v5 = v4 * v5;\r\n        v6 = v5 * v6;\r\n        v7 = v6 * v7;\r\n\r\n        v0 = v7 * v0;\r\n        v1 = v0 * v1;\r\n        v2 = v1 * v2;\r\n        v3 = v2 * v3;\r\n        v4 = v3 * v4;\r\n        v5 = v4 * v5;\r\n        v6 = v5 * v6;\r\n        v7 = v6 * v7;\r\n\r\n        v0 = v7 * v0;\r\n        v1 = v0 * v1;\r\n        v2 = v1 * v2;\r\n        v3 = v2 * v3;\r\n        v4 = v3 * v4;\r\n        v5 = v4 * v5;\r\n        v6 = v5 * v6;\r\n        v7 = v6 * v7;\r\n    }\r\n\r\n    ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;\r\n}\r\n\r\n__kernel void fp32_divergence_rate_test(__global float *A, int count, __global float *ret) {\r\n    int tid = get_local_id(0);\r\n    int max_offset = get_local_size(0);\r\n    __global float *local_a = A;\r\n\r\n    int masked_tid = tid & (rate_local_mem_test_size - 1);\r\n    float v0 = local_a[masked_tid];\r\n    float v1 = local_a[masked_tid + 1];\r\n    float v2 = local_a[masked_tid + 2];\r\n    float v3 = local_a[masked_tid + 3];\r\n    float v4 = v0 + v1;\r\n    float v5 = v0 + v2;\r\n    float v6 = v0 + v3;\r\n    float v7 = v1 + v2;\r\n    float acc = A[0];\r\n    float op = A[get_global_id(0)];\r\n\r\n    if (op < 1.0) {\r\n        for (int i = 0; i < count; i++) {\r\n            if (op < 0.5) {\r\n                v0 += acc;\r\n                v1 += acc;\r\n                v2 += acc;\r\n                v3 += acc;\r\n                v4 += acc;\r\n        
        v5 += acc;\r\n                v6 += acc;\r\n                v7 += acc;\r\n            }\r\n            else\r\n            {\r\n                v0 *= acc;\r\n                v1 *= acc;\r\n                v2 *= acc;\r\n                v3 *= acc;\r\n                v4 *= acc;\r\n                v5 *= acc;\r\n                v6 *= acc;\r\n                v7 *= acc;\r\n            }\r\n        }\r\n    }\r\n\r\n    ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;\r\n}\r\n\r\n__kernel void fp32_partition_rate_test(__global float *A, int count, __global float *ret) {\r\n    int tid = get_local_id(0);\r\n    int max_offset = get_local_size(0);\r\n    __global float *local_a = A;\r\n\r\n    int masked_tid = tid & (rate_local_mem_test_size - 1);\r\n    float v0 = local_a[masked_tid];\r\n    float v1 = local_a[masked_tid + 1];\r\n    float v2 = local_a[masked_tid + 2];\r\n    float v3 = local_a[masked_tid + 3];\r\n    float v4 = v0 + v1;\r\n    float v5 = v0 + v2;\r\n    float v6 = v0 + v3;\r\n    float v7 = v1 + v2;\r\n    float acc = A[0];\r\n    float op = A[get_global_id(0)];\r\n\r\n    if (op < 1.0) {\r\n        for (int i = 0; i < count; i++) {\r\n            v0 += acc;\r\n            v1 += acc;\r\n            v2 += acc;\r\n            v3 += acc;\r\n            v4 += acc;\r\n            v5 += acc;\r\n            v6 += acc;\r\n            v7 += acc;\r\n        }\r\n    }\r\n\r\n    ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;\r\n}\r\n\r\n/// Scalar latency\r\n__kernel void int32_add_scalar_latency_test(__global uint* A, int count, __global uint* ret) {\r\n    int tid = 0;\r\n    int max_offset = get_local_size(0);\r\n\r\n    int masked_tid = tid & (rate_local_mem_test_size - 1);\r\n    uint v0 = A[masked_tid];\r\n    uint v1 = A[masked_tid + 1];\r\n    uint v2 = A[masked_tid + 2];\r\n    uint v3 = A[masked_tid + 3];\r\n    uint v4 = v0 + v1;\r\n    uint v5 = v0 + v2;\r\n    uint v6 = v0 + v3;\r\n    uint v7 = v1 + v2;\r\n\r\n    
for (int i = 0; i < count; i += 4) {\r\n        v0 = v7 + v0;\r\n        v1 = v0 + v1;\r\n        v2 = v1 + v2;\r\n        v3 = v2 + v3;\r\n        v4 = v3 + v4;\r\n        v5 = v4 + v5;\r\n        v6 = v5 + v6;\r\n        v7 = v6 + v7;\r\n\r\n        v0 = v7 + v0;\r\n        v1 = v0 + v1;\r\n        v2 = v1 + v2;\r\n        v3 = v2 + v3;\r\n        v4 = v3 + v4;\r\n        v5 = v4 + v5;\r\n        v6 = v5 + v6;\r\n        v7 = v6 + v7;\r\n\r\n        v0 = v7 + v0;\r\n        v1 = v0 + v1;\r\n        v2 = v1 + v2;\r\n        v3 = v2 + v3;\r\n        v4 = v3 + v4;\r\n        v5 = v4 + v5;\r\n        v6 = v5 + v6;\r\n        v7 = v6 + v7;\r\n\r\n        v0 = v7 + v0;\r\n        v1 = v0 + v1;\r\n        v2 = v1 + v2;\r\n        v3 = v2 + v3;\r\n        v4 = v3 + v4;\r\n        v5 = v4 + v5;\r\n        v6 = v5 + v6;\r\n        v7 = v6 + v7;\r\n    }\r\n\r\n    ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;\r\n}\r\n\r\n__kernel void int32_mul_scalar_latency_test(__global uint* A, int count, __global uint* ret) {\r\n    int tid = 0;\r\n    int max_offset = get_local_size(0);\r\n    __global uint* local_a = A;\r\n\r\n    int masked_tid = tid & (rate_local_mem_test_size - 1);\r\n    uint v0 = local_a[masked_tid];\r\n    uint v1 = local_a[masked_tid + 1];\r\n    uint v2 = local_a[masked_tid + 2];\r\n    uint v3 = local_a[masked_tid + 3];\r\n    uint v4 = v0 + v1;\r\n    uint v5 = v0 + v2;\r\n    uint v6 = v0 + v3;\r\n    uint v7 = v1 + v2;\r\n    uint acc = local_a[0];\r\n\r\n    for (int i = 0; i < count; i += 4) {\r\n        v0 = v7 * v0;\r\n        v1 = v0 * v1;\r\n        v2 = v1 * v2;\r\n        v3 = v2 * v3;\r\n        v4 = v3 * v4;\r\n        v5 = v4 * v5;\r\n        v6 = v5 * v6;\r\n        v7 = v6 * v7;\r\n\r\n        v0 = v7 * v0;\r\n        v1 = v0 * v1;\r\n        v2 = v1 * v2;\r\n        v3 = v2 * v3;\r\n        v4 = v3 * v4;\r\n        v5 = v4 * v5;\r\n        v6 = v5 * v6;\r\n        v7 = v6 * v7;\r\n\r\n        v0 = v7 * v0;\r\n        v1 = 
v0 * v1;\r\n        v2 = v1 * v2;\r\n        v3 = v2 * v3;\r\n        v4 = v3 * v4;\r\n        v5 = v4 * v5;\r\n        v6 = v5 * v6;\r\n        v7 = v6 * v7;\r\n\r\n        v0 = v7 * v0;\r\n        v1 = v0 * v1;\r\n        v2 = v1 * v2;\r\n        v3 = v2 * v3;\r\n        v4 = v3 * v4;\r\n        v5 = v4 * v5;\r\n        v6 = v5 * v6;\r\n        v7 = v6 * v7;\r\n    }\r\n\r\n    ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;\r\n}\r\n\r\n__kernel void fp32_add_scalar_latency_test(__global float* A, int count, __global float* ret) {\r\n    int tid = 0;\r\n    int max_offset = get_local_size(0);\r\n    __global float* local_a = A;\r\n\r\n    int masked_tid = tid & (rate_local_mem_test_size - 1);\r\n    float v0 = local_a[masked_tid];\r\n    float v1 = local_a[masked_tid + 1];\r\n    float v2 = local_a[masked_tid + 2];\r\n    float v3 = local_a[masked_tid + 3];\r\n    float v4 = v0 + v1;\r\n    float v5 = v0 + v2;\r\n    float v6 = v0 + v3;\r\n    float v7 = v1 + v2;\r\n    float acc = local_a[0];\r\n\r\n    for (int i = 0; i < count; i += 8) {\r\n        v0 = v7 + v0;\r\n        v1 = v0 + v1;\r\n        v2 = v1 + v2;\r\n        v3 = v2 + v3;\r\n        v4 = v3 + v4;\r\n        v5 = v4 + v5;\r\n        v6 = v5 + v6;\r\n        v7 = v6 + v7;\r\n\r\n        v0 = v7 + v0;\r\n        v1 = v0 + v1;\r\n        v2 = v1 + v2;\r\n        v3 = v2 + v3;\r\n        v4 = v3 + v4;\r\n        v5 = v4 + v5;\r\n        v6 = v5 + v6;\r\n        v7 = v6 + v7;\r\n\r\n        v0 = v7 + v0;\r\n        v1 = v0 + v1;\r\n        v2 = v1 + v2;\r\n        v3 = v2 + v3;\r\n        v4 = v3 + v4;\r\n        v5 = v4 + v5;\r\n        v6 = v5 + v6;\r\n        v7 = v6 + v7;\r\n\r\n        v0 = v7 + v0;\r\n        v1 = v0 + v1;\r\n        v2 = v1 + v2;\r\n        v3 = v2 + v3;\r\n        v4 = v3 + v4;\r\n        v5 = v4 + v5;\r\n        v6 = v5 + v6;\r\n        v7 = v6 + v7;\r\n\r\n        v0 = v7 + v0;\r\n        v1 = v0 + v1;\r\n        v2 = v1 + v2;\r\n        v3 = v2 + v3;\r\n       
 v4 = v3 + v4;\r\n        v5 = v4 + v5;\r\n        v6 = v5 + v6;\r\n        v7 = v6 + v7;\r\n\r\n        v0 = v7 + v0;\r\n        v1 = v0 + v1;\r\n        v2 = v1 + v2;\r\n        v3 = v2 + v3;\r\n        v4 = v3 + v4;\r\n        v5 = v4 + v5;\r\n        v6 = v5 + v6;\r\n        v7 = v6 + v7;\r\n\r\n        v0 = v7 + v0;\r\n        v1 = v0 + v1;\r\n        v2 = v1 + v2;\r\n        v3 = v2 + v3;\r\n        v4 = v3 + v4;\r\n        v5 = v4 + v5;\r\n        v6 = v5 + v6;\r\n        v7 = v6 + v7;\r\n\r\n        v0 = v7 + v0;\r\n        v1 = v0 + v1;\r\n        v2 = v1 + v2;\r\n        v3 = v2 + v3;\r\n        v4 = v3 + v4;\r\n        v5 = v4 + v5;\r\n        v6 = v5 + v6;\r\n        v7 = v6 + v7;\r\n    }\r\n\r\n    ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;\r\n}\r\n\r\n__kernel void fp32_fma_scalar_latency_test(__global float* A, int count, __global float* ret) {\r\n    int tid = 0;\r\n    int max_offset = get_local_size(0);\r\n    __global float* local_a = A;\r\n\r\n    int masked_tid = tid & (rate_local_mem_test_size - 1);\r\n    float v0 = local_a[masked_tid];\r\n    float v1 = local_a[masked_tid + 1];\r\n    float v2 = local_a[masked_tid + 2];\r\n    float v3 = local_a[masked_tid + 3];\r\n    float v4 = v0 + v1;\r\n    float v5 = v0 + v2;\r\n    float v6 = v0 + v3;\r\n    float v7 = v1 + v2;\r\n    float acc = local_a[0];\r\n\r\n    for (int i = 0; i < count; i += 4) {\r\n        v0 = v7 + acc * v0;\r\n        v1 = v0 + acc * v1;\r\n        v2 = v1 + acc * v2;\r\n        v3 = v2 + acc * v3;\r\n        v4 = v3 + acc * v4;\r\n        v5 = v4 + acc * v5;\r\n        v6 = v5 + acc * v6;\r\n        v7 = v6 + acc * v7;\r\n\r\n        v0 = v7 + acc * v0;\r\n        v1 = v0 + acc * v1;\r\n        v2 = v1 + acc * v2;\r\n        v3 = v2 + acc * v3;\r\n        v4 = v3 + acc * v4;\r\n        v5 = v4 + acc * v5;\r\n        v6 = v5 + acc * v6;\r\n        v7 = v6 + acc * v7;\r\n\r\n        v0 = v7 + acc * v0;\r\n        v1 = v0 + acc * v1;\r\n        v2 = v1 + 
acc * v2;\r\n        v3 = v2 + acc * v3;\r\n        v4 = v3 + acc * v4;\r\n        v5 = v4 + acc * v5;\r\n        v6 = v5 + acc * v6;\r\n        v7 = v6 + acc * v7;\r\n\r\n        v0 = v7 + acc * v0;\r\n        v1 = v0 + acc * v1;\r\n        v2 = v1 + acc * v2;\r\n        v3 = v2 + acc * v3;\r\n        v4 = v3 + acc * v4;\r\n        v5 = v4 + acc * v5;\r\n        v6 = v5 + acc * v6;\r\n        v7 = v6 + acc * v7;\r\n    }\r\n\r\n    ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;\r\n}\r\n\r\n__kernel void fp32_mul_scalar_latency_test(__global float* A, int count, __global float* ret) {\r\n    int tid = 0;\r\n    int max_offset = get_local_size(0);\r\n    __global float* local_a = A;\r\n\r\n    int masked_tid = tid & (rate_local_mem_test_size - 1);\r\n    float v0 = local_a[masked_tid];\r\n    float v1 = local_a[masked_tid + 1];\r\n    float v2 = local_a[masked_tid + 2];\r\n    float v3 = local_a[masked_tid + 3];\r\n    float v4 = v0 + v1;\r\n    float v5 = v0 + v2;\r\n    float v6 = v0 + v3;\r\n    float v7 = v1 + v2;\r\n    float acc = local_a[0];\r\n\r\n    for (int i = 0; i < count; i += 4) {\r\n        v0 = v7 * v0;\r\n        v1 = v0 * v1;\r\n        v2 = v1 * v2;\r\n        v3 = v2 * v3;\r\n        v4 = v3 * v4;\r\n        v5 = v4 * v5;\r\n        v6 = v5 * v6;\r\n        v7 = v6 * v7;\r\n\r\n        v0 = v7 * v0;\r\n        v1 = v0 * v1;\r\n        v2 = v1 * v2;\r\n        v3 = v2 * v3;\r\n        v4 = v3 * v4;\r\n        v5 = v4 * v5;\r\n        v6 = v5 * v6;\r\n        v7 = v6 * v7;\r\n\r\n        v0 = v7 * v0;\r\n        v1 = v0 * v1;\r\n        v2 = v1 * v2;\r\n        v3 = v2 * v3;\r\n        v4 = v3 * v4;\r\n        v5 = v4 * v5;\r\n        v6 = v5 * v6;\r\n        v7 = v6 * v7;\r\n\r\n        v0 = v7 * v0;\r\n        v1 = v0 * v1;\r\n        v2 = v1 * v2;\r\n        v3 = v2 * v3;\r\n        v4 = v3 * v4;\r\n        v5 = v4 * v5;\r\n        v6 = v5 * v6;\r\n        v7 = v6 * v7;\r\n    }\r\n\r\n    ret[get_global_id(0)] = v0 + v1 + v2 + 
v3 + v4 + v5 + v6 + v7;\r\n}\r\n\r\n__kernel void fp32_mul_latency_test(__global float* A, int count, __global float* ret) {\r\n    int tid = get_local_id(0);\r\n    int max_offset = get_local_size(0);\r\n    __global float* local_a = A;\r\n\r\n    int masked_tid = tid & (rate_local_mem_test_size - 1);\r\n    float v0 = local_a[masked_tid];\r\n    float v1 = local_a[masked_tid + 1];\r\n    float v2 = local_a[masked_tid + 2];\r\n    float v3 = local_a[masked_tid + 3];\r\n    float v4 = v0 + v1;\r\n    float v5 = v0 + v2;\r\n    float v6 = v0 + v3;\r\n    float v7 = v1 + v2;\r\n    float acc = local_a[0];\r\n\r\n    for (int i = 0; i < count; i += 4) {\r\n        v0 = v7 * v0;\r\n        v1 = v0 * v1;\r\n        v2 = v1 * v2;\r\n        v3 = v2 * v3;\r\n        v4 = v3 * v4;\r\n        v5 = v4 * v5;\r\n        v6 = v5 * v6;\r\n        v7 = v6 * v7;\r\n\r\n        v0 = v7 * v0;\r\n        v1 = v0 * v1;\r\n        v2 = v1 * v2;\r\n        v3 = v2 * v3;\r\n        v4 = v3 * v4;\r\n        v5 = v4 * v5;\r\n        v6 = v5 * v6;\r\n        v7 = v6 * v7;\r\n\r\n        v0 = v7 * v0;\r\n        v1 = v0 * v1;\r\n        v2 = v1 * v2;\r\n        v3 = v2 * v3;\r\n        v4 = v3 * v4;\r\n        v5 = v4 * v5;\r\n        v6 = v5 * v6;\r\n        v7 = v6 * v7;\r\n\r\n        v0 = v7 * v0;\r\n        v1 = v0 * v1;\r\n        v2 = v1 * v2;\r\n        v3 = v2 * v3;\r\n        v4 = v3 * v4;\r\n        v5 = v4 * v5;\r\n        v6 = v5 * v6;\r\n        v7 = v6 * v7;\r\n    }\r\n\r\n    ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;\r\n}"
  },
  {
    "path": "GpuMemLatency/kernel.cl",
    "content": "// not used, I tried\r\n__constant sampler_t direct_sampler = CLK_NORMALIZED_COORDS_FALSE | // coordinates are from 0 to max dimension size\r\n                                        CLK_ADDRESS_NONE | // if it goes out of bounds feel free to explode and die\r\n                                        CLK_FILTER_NEAREST;\r\n__kernel void tex_latency_test(__read_only image1d_buffer_t A, int count, __global int* ret, int list_size) {\r\n    int localId = get_local_id(0);\r\n    // uint4 current = read_imageui(A, direct_sampler, 0); // using sampler screws things up\r\n    int startPos = get_global_size(0) > 1 ? ret[get_global_id(0)] : 0;\r\n    uint4 current = read_imageui(A, startPos);\r\n    // printf(\"start x: %u -> %u\\n\", startPos, current.x);\r\n    for (int i = 0; i < count; i += 10) {\r\n        // printf(\"current: %u %u %u %u, address: %d\\n\", current.x, current.y, current.z, current.w, (int)current.x / 4);\r\n        //current = read_imageui(A, direct_sampler, i);\r\n        current = read_imageui(A, current.x);\r\n        current = read_imageui(A, current.x);\r\n        current = read_imageui(A, current.x);\r\n        current = read_imageui(A, current.x);\r\n        current = read_imageui(A, current.x);\r\n        current = read_imageui(A, current.x);\r\n        current = read_imageui(A, current.x);\r\n        current = read_imageui(A, current.x);\r\n        current = read_imageui(A, current.x);\r\n        current = read_imageui(A, current.x);\r\n        //printf(\"%d: current read: %u %u %u %u\\n\", i, current.x, current.y, current.z, current.w);\r\n        // local_a[localId] = current;\r\n    }\r\n\r\n    ret[get_global_id(0)] = current.x;\r\n}\r\n\r\n__constant sampler_t funny_sampler = CLK_NORMALIZED_COORDS_TRUE | // coordinates are from 0 to 1 (float)\r\n                                        CLK_ADDRESS_REPEAT | // going out of bounds = replicate\r\n                                        CLK_FILTER_NEAREST;\r\n__kernel void 
tex_bw_test(__read_only image2d_t A, int count, __global float* ret) {\r\n    int localId = get_local_id(0);\r\n    float pos = get_global_id(0) * native_recip((float)get_global_size(0));\r\n    float2 increment;\r\n    increment.x = 0.01; // guessing\r\n    increment.y = 0.01;\r\n\r\n    float2 current0, current1, current2, current3;\r\n    current0.x = pos;\r\n    current0.y = pos;\r\n    current1.x = 0.1 + (localId / 10000);\r\n    current1.y = 0.1 + (localId / 10000);\r\n    current2.x = 0.01 + (localId / 10000);\r\n    current2.y = 0.01 + (localId / 10000);\r\n    current3.x = 0.002 + (localId / 5000);\r\n    current3.y = 0.001 + (localId / 5000);\r\n\r\n    float4 tmp0 = read_imagef(A, funny_sampler, current0);\r\n    float4 tmp1 = read_imagef(A, funny_sampler, current1);\r\n    float4 tmp2 = read_imagef(A, funny_sampler, current2);\r\n    float4 tmp3 = read_imagef(A, funny_sampler, current3);\r\n    for (int i = 0; i < count; i += 4)\r\n    {\r\n        tmp0 += read_imagef(A, funny_sampler, current0);\r\n        tmp1 += read_imagef(A, funny_sampler, current1);\r\n        tmp2 += read_imagef(A, funny_sampler, current2);\r\n        tmp3 += read_imagef(A, funny_sampler, current3);\r\n        current0 += increment;\r\n        current1 += increment;\r\n        current2 += increment;\r\n        current3 += increment;\r\n    }\r\n\r\n    *ret = dot(tmp0, tmp1) + dot(tmp2, tmp3);\r\n}\r\n\r\n// Cacheline size in bytes, must correspond to what's defined for the latency test\r\n#define CACHELINE_SIZE 64\r\n\r\n// unrolled until terascale no longer saw further improvement (10x unroll)\r\n// assumes count will be a multiple of 10. but it won't be too inaccurate with a big count\r\n// not divisible by 10\r\n__kernel void unrolled_latency_test(__global const int* A, int count, __global int* ret) {\r\n    int current = get_global_size(0) > 1 ? ret[get_global_id(0)]: A[0]; // this will test vector latency on AMD. 
Set to A[0] for scalar latency\r\n    int result = 0; // sink accumulator; initialized so the += below never reads an indeterminate value\r\n    for (int i = 0; i < count; i += 10) {\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n    }\r\n\r\n    ret[0] = result;\r\n}\r\n\r\n// Ensures the loaded value will be constant across a workgroup\r\n// A = pointer chasing array (each element is the index of the next load)\r\n// count = number of dependent loads, consumed 10 per loop iteration\r\n// ret = per-workgroup start index when more than one workgroup runs; ret[0] receives the sink value\r\n__kernel void scalar_unrolled_latency_test(__global const int* A, int count, __global int* ret) {\r\n    int current = get_num_groups(0) > 1 ? ret[get_group_id(0) * get_local_size(0)]: A[0];\r\n    int result = 0; // sink accumulator; initialized so the += below never reads an indeterminate value\r\n    for (int i = 0; i < count; i += 10) {\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n    }\r\n\r\n    ret[0] = result;\r\n}\r\n\r\n// Takes size as an additional argument, meant to run many pointer chasing threads in parallel\r\n// Tries to measure a GPU's latency hiding ability at varying levels of parallelism\r\n__kernel void 
parallel_latency_test(__global const int* A, int count, int size, __global int* ret) {\r\n    size_t threadId = get_global_id(0);\r\n    int current = A[threadId % size];\r\n    int result = 0;\r\n    for (int i = 0; i < count; i += 10) {\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n    }\r\n\r\n    ret[threadId] = result;\r\n}\r\n\r\n// latency test like the unrolled one above, but with input as constant memory\r\n__kernel void constant_unrolled_latency_test(__constant const int* A, int count, __global int* ret) {\r\n    //int current = A[0];\r\n    int current = get_global_size(0) > 1 ? 
ret[get_global_id(0)]: A[0];\r\n    int result = 0; // sink accumulator; initialized so the += below never reads an indeterminate value\r\n    for (int i = 0; i < count; i += 10) {\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n    }\r\n\r\n    ret[0] = result;\r\n}\r\n\r\n#define local_mem_test_size 1024\r\n// uses local memory (LDS/shmem)\r\n__kernel void local_unrolled_latency_test(__global const int* A, int count, __global int* ret) {\r\n    __local int local_a[local_mem_test_size]; // 4 KB, should be present on all GPUs, amirite?\r\n    // better be fast\r\n    for (int i = get_local_id(0);i < local_mem_test_size; i += get_local_size(0))\r\n        local_a[i] = A[i];\r\n    barrier(CLK_LOCAL_MEM_FENCE);\r\n\r\n    // everyone else can chill/get masked off\r\n    if (get_local_id(0) == 0) {\r\n        int current = local_a[0];\r\n        int result = 0; // sink accumulator; initialized so the += below never reads an indeterminate value\r\n        for (int i = 0; i < count; i += 10) {\r\n            result += current;\r\n            current = local_a[current];\r\n            result += current;\r\n            current = local_a[current];\r\n            result += current;\r\n            current = local_a[current];\r\n            result += current;\r\n            current = local_a[current];\r\n            result += current;\r\n            current = local_a[current];\r\n            result += current;\r\n            current = local_a[current];\r\n            result += current;\r\n            current = local_a[current];\r\n            result += current;\r\n           
 current = local_a[current];\r\n            result += current;\r\n            current = local_a[current];\r\n            result += current;\r\n            current = local_a[current];\r\n        }\r\n\r\n        ret[0] = result;\r\n    }\r\n}\r\n\r\n\r\n// Global memory bandwidth test: each work item sums float4 loads from A into five accumulators\r\n// A = input buffer, read as float4s; float4size = wrap-around limit for the float4 index\r\n// count = loop bound, advanced by 20 per iteration (five float4 loads)\r\n// ret = one sink value per work item; startPositions = first float4 index for each work item\r\n// skip is only referenced by the commented-out index calculation below\r\n__kernel void sum_bw_test(__global float* A, uint count, uint float4size, __global float* ret, uint skip, __global uint *startPositions) {\r\n    int threadId = get_global_id(0);\r\n    int localId = get_local_id(0);\r\n    int localSize = get_local_size(0);\r\n    int groupId = get_group_id(0);\r\n    // vector literals need the (float4) cast; a bare parenthesized list is a comma expression\r\n    // that yields only its last value, which then gets broadcast to every lane\r\n    float4 result1 = (float4)(0.1f,0.2f,0.3f,0.4f);\r\n    float4 result2 = (float4)(1.1f,1.2f,1.3f,1.4f);\r\n    float4 result3 = (float4)(2.1f,2.2f,2.3f,2.4f);\r\n    float4 result4 = (float4)(3.0f,3.1f,3.2f,3.3f);\r\n    float4 result5 = (float4)(4.0f,4.2f,4.1f,4.3f);\r\n\r\n    int initialIdx = startPositions[threadId];\r\n    //int initialIdx = (groupId * skip * localSize + localId) % (float4size - 1);\r\n    //startPositions[threadId] = initialIdx; // for debugging\r\n\r\n    int idx = initialIdx;\r\n    __global float4 *B = (__global float4 *)A;\r\n    for (int i = 0; i < count; i += 20) {\r\n        result1 += B[idx];\r\n        idx += localSize;\r\n        if (idx >= float4size) idx = initialIdx;\r\n\r\n        result2 += B[idx];\r\n        idx += localSize;\r\n        if (idx >= float4size) idx = initialIdx;\r\n\r\n        result3 += B[idx];\r\n        idx += localSize;\r\n        if (idx >= float4size) idx = initialIdx;\r\n\r\n        result4 += B[idx];\r\n        idx += localSize;\r\n        if (idx >= float4size) idx = initialIdx;\r\n\r\n        result5 += B[idx];\r\n        idx += localSize;\r\n        if (idx >= float4size) idx = initialIdx;\r\n    }\r\n\r\n    ret[threadId] = dot(result1, result2) + dot(result3, result4) + dot(result4, result5);\r\n}\r\n\r\n#define local_mem_bw_test_size 1024\r\n// test bandwidth with local memory. 
A must be at least local_mem_bw_test_size in floats\r\n__kernel void local_bw_test(__global float* A, uint count, __global float* ret) {\r\n __local float local_a[local_mem_bw_test_size];\r\n    int threadId = get_global_id(0);\r\n    int localId = get_local_id(0);\r\n    int localSize = get_local_size(0);\r\n    int groupId = get_group_id(0);\r\n    float acc1 = 1.1;\r\n    float acc2 = 2.2;\r\n    float acc3 = 3.3;\r\n    float acc4 = 4.4;\r\n\n    //printf(\"subgroup size %d\\n\", get_sub_group_size());\n\r\n    // workgroup-wide copy from global mem into local mem\r\n    for (int i = get_local_id(0);i < local_mem_bw_test_size; i += get_local_size(0))\r\n        local_a[i] = A[i];\r\n    barrier(CLK_LOCAL_MEM_FENCE);\r\n\r\n    // assumes local memory size is at least 1024 float4s\r\n    int idx0 = localId;\r\n    int idx1 = localId + localSize;\r\n    int idx2 = localId + localSize * 2;\r\n    for (int i = 0; i < count; i += 12) { \r\n        acc1 += local_a[idx0] * local_a[idx1] + local_a[idx2];\r\n        acc2 += local_a[idx0 + 1] * local_a[idx1 + 1] + local_a[idx2 + 1];\r\n        acc3 += local_a[idx0 + 2] * local_a[idx1 + 2] + local_a[idx2 + 2];\r\n        acc4 += local_a[idx0 + 3] * local_a[idx1 + 3] + local_a[idx2 + 3];\r\n        idx0 = (idx0 + localSize) & 0x3FF;\r\n        idx1 = (idx1 + localSize) & 0x3FF;\r\n        idx2 = (idx2 + localSize) & 0x3FF;\r\n    }\r\n\r\n    ret[threadId] = acc1 + acc2 + acc3 + acc4;\r\n}\r\n\r\n__kernel void local_float4_bw_test(__global float4* A, uint count, __global float* ret) {\r\n    __local float4 local_a[local_mem_bw_test_size];\r\n    int threadId = get_global_id(0);\r\n    int localId = get_local_id(0);\r\n    int localSize = get_local_size(0);\r\n    int groupId = get_group_id(0);\r\n    float4 acc1 = A[get_global_id(0) & 0x3FF];\r\n    float4 acc2 = A[(get_global_id(0) + 1) & 0x3FF];\r\n    float4 acc3 = A[(get_global_id(0) + 2) & 0x3FF];\r\n    float4 acc4 = A[(get_global_id(0) + 3) & 0x3FF];\r\n\r\n    // 
workgroup-wide copy from global mem into local mem\r\n    for (int i = get_local_id(0);i < local_mem_bw_test_size; i += get_local_size(0))\r\n        local_a[i] = A[i];\r\n    barrier(CLK_LOCAL_MEM_FENCE);\r\n\r\n    // assumes local memory size is at least 1024 float4s\r\n    int idx0 = localId;\r\n    int idx1 = localId + localSize;\r\n    int idx2 = localId + localSize * 2;\r\n    for (int i = 0; i < count; i += (12*4)) { \r\n        acc1 += local_a[idx0] * local_a[idx1] + local_a[idx2];\r\n        acc2 += local_a[idx0 + 1] * local_a[idx1 + 1] + local_a[idx2 + 1];\r\n        acc3 += local_a[idx0 + 2] * local_a[idx1 + 2] + local_a[idx2 + 2];\r\n        acc4 += local_a[idx0 + 3] * local_a[idx1 + 3] + local_a[idx2 + 3];\r\n        idx0 = (idx0 + localSize) & 0x3FF;\r\n        idx1 = (idx1 + localSize) & 0x3FF;\r\n        idx2 = (idx2 + localSize) & 0x3FF;\r\n    }\r\n\r\n    ret[threadId] = dot(acc1, acc2) + dot(acc3, acc4);\r\n}\r\n\r\n\r\n#define local64_test_size 2048 // size was given in 4B elements. 
This test uses 8B\r\n__kernel void local_64_bw_test(__global ulong* A, uint count, __global ulong* ret) { \r\n    __local ulong local_a[local64_test_size];\r\n    int threadId = get_global_id(0);\r\n    int localId = get_local_id(0);\r\n    int localSize = get_local_size(0);\r\n    int groupId = get_group_id(0);\r\n\r\n    // workgroup-wide copy from global mem into local mem\r\n    for (int i = get_local_id(0);i < local64_test_size; i += get_local_size(0))\r\n        local_a[i] = A[i];\r\n    barrier(CLK_LOCAL_MEM_FENCE);\r\n\r\n    ulong acc0 = 0, acc1 = 0, acc2 = 0, acc3 = 0;\r\n\r\n    // assumes local memory size is at least 512x 64-bit uints\r\n    int idx0 = localId;\r\n    int idx1 = localId + localSize;\r\n    for (int i = 0; i < count; i += 8) { \r\n        acc0 ^= local_a[idx0];\r\n        acc1 ^= local_a[idx1];\r\n        acc2 ^= local_a[idx0 + 1];\r\n        acc3 ^= local_a[idx1 + 1];\r\n        idx0 = (idx0 + localSize) & 0x1FF;\r\n        idx1 = (idx1 + localSize) & 0x1FF;\r\n    }\r\n\r\n    ret[threadId] = acc0 + acc1 + acc2 + acc3;\r\n}\r\n\r\n// let's try the method from zhe jia et al\r\n__kernel void local_chase_bw(__global uint* A, uint count, __global uint* ret) {\r\n    __local ulong local_a[local_mem_bw_test_size];\r\n    int threadId = get_global_id(0);\r\n    int localId = get_local_id(0);\r\n    int localSize = get_local_size(0);\r\n    int groupId = get_group_id(0);\r\n    uint sink = localId;\r\n\r\n    // workgroup-wide copy from global mem into local mem\r\n    for (int i = get_local_id(0);i < local_mem_bw_test_size; i += get_local_size(0))\r\n        local_a[i] = A[i];\r\n    barrier(CLK_LOCAL_MEM_FENCE);\r\n\r\n    for (int i = 0; i < count; i += 4)\r\n    {\r\n        sink = local_a[sink];\r\n        sink = local_a[sink];\r\n        sink = local_a[sink];\r\n        sink = local_a[sink];\r\n    }\r\n\r\n    ret[threadId] = sink;\r\n}\r\n\r\n#define fixed_tex_test_size 1024\r\n__kernel void buffer_bw_test(__read_only image1d_buffer_t 
A, uint count, __global float* ret) {\r\n    int threadId = get_global_id(0);\r\n    int localId = get_local_id(0);\r\n    int localSize = get_local_size(0);\r\n    int groupId = get_group_id(0);\r\n    uint4 acc1 = read_imageui(A, 0);\r\n    uint4 acc2 = read_imageui(A, 1);\r\n    uint4 acc3 = read_imageui(A, 2);\r\n    uint4 acc4 = read_imageui(A, 3);\r\n\r\n    int idx0 = localId;\r\n    int idx1 = localId + localSize;\r\n    int idx2 = localId + localSize * 2;\r\n\r\n    // Each read_imageui reads out a 4-wide vector\r\n    for (int i = 0; i < count; i += 16) {\r\n        read_imageui(A, idx0);\r\n        acc1 += read_imageui(A, idx0);\r\n        acc2 += read_imageui(A, idx1);\r\n        acc3 += read_imageui(A, idx2);\r\n        acc4 += read_imageui(A, idx0 + 1);\r\n        idx0 = (idx0 + localSize) & 0x3FF;\r\n        idx1 = (idx1 + localSize) & 0x3FF;\r\n        idx2 = (idx2 + localSize) & 0x3FF;\r\n    }\r\n\r\n    float4 out1 = convert_float4(acc1);\r\n    float4 out2 = convert_float4(acc2);\r\n    float4 out3 = convert_float4(acc3);\r\n    float4 out4 = convert_float4(acc4);\r\n    ret[threadId] = dot(out1, out2) + dot(out3, out4);\r\n}\r\n\r\n// A = inputs, fixed size\r\n__kernel void int_exec_latency_test(__global int* A, int count, __global int* ret) {\r\n    int sum = 0;\r\n    int input1 = A[0], input2 = A[1], input3 = A[2], input4 = A[3];\r\n    for (int i = 0; i < count; i++) {\r\n        sum += input1;\r\n        sum += input2;\r\n        sum += input3;\r\n        sum += input4;\r\n        sum += input1;\r\n        sum += input2;\r\n        sum += input3;\r\n        sum += input4;\r\n        sum += input1;\r\n        sum += input2;\r\n        sum += input3;\r\n        sum += input4;\r\n    }\r\n}\r\n\r\n// hoping each thread/workgroup lands on a different CU\r\n// A = pointer to location being bounced around\r\n// count = iterations\r\n// ret = sink\r\n// t1 = id of thread 1\r\n// t2 = id of thread 2\r\n__kernel void 
c2c_atomic_exec_latency_test(__global int* A, int count, __global int* ret, int t1, int t2) {\r\n    int global_id = get_global_id(0);\r\n    int current = 0;\r\n    if (global_id == t1) current = 1;\r\n    else if (global_id == t2) current = 2;\r\n\r\n    if (global_id == t1 || global_id == t2) {\r\n        //printf(\"gid: %d, t1: %d, t2: %d, A: %d, current = %d\\n\", global_id, t1, t2, *A, current);\r\n        while (current <= 2 * count) {\r\n            if (atomic_cmpxchg(A, current - 1, current) == current - 1) {\r\n                current += 2;\r\n            }\r\n        }\r\n        ret[0] = current;\r\n    }\r\n}\r\n\r\n__kernel void atomic_exec_latency_test(__global int* A, int count, __global int* ret) {\r\n    int current = get_global_id(0) + 1;\r\n    while (current <= 2 * count) {\r\n        if (atomic_cmpxchg(A, current - 1, current) == current - 1) {\r\n            current += 2;\r\n        }\r\n    }\r\n}\r\n\r\n__kernel void local_atomic_latency_test(__global int* A, int count, __global int* ret) {\r\n    __local int a[1];\r\n    int current = get_global_id(0) + 1;\r\n    if (current == 1) a[0] = A[0];\r\n    barrier(CLK_LOCAL_MEM_FENCE);\r\n\r\n    while (current <= 2 * count) {\r\n        if (atomic_cmpxchg(a, current - 1, current) == current - 1) {\r\n            current += 2;\r\n        }\r\n    }\r\n}\r\n\r\n__kernel void dummy_add(__global int* A) {\r\n    A[get_global_id(0)]++;\r\n}"
  },
  {
    "path": "GpuMemLatency/kernels/atomic_exec_latency_test.cl",
    "content": "__kernel void atomic_exec_latency_test(__global int* A, int count, __global int* ret) {\r\n    int current = get_global_id(0) + 1;\r\n    while (current <= 2 * count) {\r\n        if (atomic_cmpxchg(A, current - 1, current) == current - 1) {\r\n            current += 2;\r\n        }\r\n    }\r\n}\r\n\r\n__kernel void atomic_add_test(__global int *A, int count) {\r\n    int addend = get_global_id(0);\r\n    int addend1 = addend + 5;\r\n    int addend2 = addend + 6;\r\n    int addend3 = addend + 7;\r\n    int addend4 = addend + 8;\r\n    int addend5 = addend + 9;\r\n    int addend6 = addend + 10;\r\n    int addend7 = addend + 11;\r\n    __global int *target = A + get_global_id(0);\r\n    for (int i = 0; i < count; i++)\r\n    {\r\n        atomic_add(target, addend);\r\n        atomic_add(target, addend1);\r\n        atomic_add(target, addend2);\r\n        atomic_add(target, addend3);\r\n        atomic_add(target, addend4);\r\n        atomic_add(target, addend5);\r\n        atomic_add(target, addend6);\r\n        atomic_add(target, addend7);\r\n    }\r\n}"
  },
  {
    "path": "GpuMemLatency/kernels/buffer_bw_test.cl",
    "content": "#define fixed_tex_test_size 1024\r\n__kernel void buffer_bw_test(__read_only image1d_buffer_t A, uint count, __global float* ret) {\r\n    int threadId = get_global_id(0);\r\n    int localId = get_local_id(0);\r\n    int localSize = get_local_size(0);\r\n    int groupId = get_group_id(0);\r\n    uint4 acc1 = read_imageui(A, 0);\r\n    uint4 acc2 = read_imageui(A, 1);\r\n    uint4 acc3 = read_imageui(A, 2);\r\n    uint4 acc4 = read_imageui(A, 3);\r\n\r\n    int idx0 = localId;\r\n    int idx1 = localId + localSize;\r\n    int idx2 = localId + localSize * 2;\r\n\r\n    // Each read_imageui reads out a 4-wide vector\r\n    for (int i = 0; i < count; i += 16) {\r\n        read_imageui(A, idx0);\r\n        acc1 += read_imageui(A, idx0);\r\n        acc2 += read_imageui(A, idx1);\r\n        acc3 += read_imageui(A, idx2);\r\n        acc4 += read_imageui(A, idx0 + 1);\r\n        idx0 = (idx0 + localSize) & 0x3FF;\r\n        idx1 = (idx1 + localSize) & 0x3FF;\r\n        idx2 = (idx2 + localSize) & 0x3FF;\r\n    }\r\n\r\n    float4 out1 = convert_float4(acc1);\r\n    float4 out2 = convert_float4(acc2);\r\n    float4 out3 = convert_float4(acc3);\r\n    float4 out4 = convert_float4(acc4);\r\n    ret[threadId] = dot(out1, out2) + dot(out3, out4);\r\n}"
  },
  {
    "path": "GpuMemLatency/kernels/c2c_atomic_exec_latency_test.cl",
    "content": "// hoping each thread/workgroup lands on a different CU\r\n// A = pointer to location being bounced around\r\n// count = iterations\r\n// ret = sink\r\n// t1 = id of thread 1\r\n// t2 = id of thread 2\r\n__kernel void c2c_atomic_exec_latency_test(__global int* A, int count, __global int* ret, int t1, int t2) {\r\n    int global_id = get_global_id(0);\r\n    int current = 0;\r\n    if (global_id == t1) current = 1;\r\n    else if (global_id == t2) current = 2;\r\n\r\n    if (global_id == t1 || global_id == t2) {\r\n        //printf(\"gid: %d, t1: %d, t2: %d, A: %d, current = %d\\n\", global_id, t1, t2, *A, current);\r\n        while (current <= 2 * count) {\r\n            if (atomic_cmpxchg(A, current - 1, current) == current - 1) {\r\n                current += 2;\r\n            }\r\n        }\r\n        ret[0] = current;\r\n    }\r\n}"
  },
  {
    "path": "GpuMemLatency/kernels/constant_unrolled_latency_test.cl",
    "content": "// latency test like unrolled_latency_test, but with the input in constant memory\r\n__kernel void constant_unrolled_latency_test(__constant const int* A, int count, __global int* ret) {\r\n    //int current = A[0];\r\n    int current = get_global_size(0) > 1 ? ret[get_global_id(0)]: A[0];\r\n    int result;\r\n    for (int i = 0; i < count; i += 10) {\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n    }\r\n\r\n    ret[0] = result;\r\n}"
  },
  {
    "path": "GpuMemLatency/kernels/ldst_bw_test.cl",
    "content": "#define ldst_bw_test_size 1024\r\n// test load/store bandwidth with a small test size that should fit in L1\r\n/*__kernel void ldst_bw_test(__global float* A, uint count, __global float* ret) {\r\n    int threadId = get_global_id(0);\r\n    int localId = get_local_id(0);\r\n    int localSize = get_local_size(0);\r\n    int groupId = get_group_id(0);\r\n    float acc1 = 1.1;\r\n    float acc2 = 2.2;\r\n    float acc3 = 3.3;\r\n    float acc4 = 4.4;\r\n\r\n    // assumes local memory size is at least 1024 float4s\r\n    int idx0 = localId;\r\n    int idx1 = localId + localSize;\r\n    int idx2 = localId + localSize * 2;\r\n    for (int i = 0; i < count; i += 12) { \r\n        acc1 += A[idx0] * A[idx1] + A[idx2];\r\n        idx0 = clamp(idx0 + localSize, 0, ldst_bw_test_size);\r\n        idx1 = clamp(idx1 + localSize, 0, ldst_bw_test_size);\r\n        idx2 = clamp(idx2 + localSize, 0, ldst_bw_test_size);\r\n\r\n        acc2 += A[idx0] * A[idx1] + A[idx2];\r\n        idx0 = clamp(idx0 + localSize, 0, ldst_bw_test_size);\r\n        idx1 = clamp(idx1 + localSize, 0, ldst_bw_test_size);\r\n        idx2 = clamp(idx2 + localSize, 0, ldst_bw_test_size);\r\n\r\n        acc3 += A[idx0] * A[idx1] + A[idx2];\r\n        idx0 = clamp(idx0 + localSize, 0, ldst_bw_test_size);\r\n        idx1 = clamp(idx1 + localSize, 0, ldst_bw_test_size);\r\n        idx2 = clamp(idx2 + localSize, 0, ldst_bw_test_size);\r\n\r\n        acc4 += A[idx0] * A[idx1] + A[idx2];\r\n        idx0 = clamp(idx0 + localSize, 0, ldst_bw_test_size);\r\n        idx1 = clamp(idx1 + localSize, 0, ldst_bw_test_size);\r\n        idx2 = clamp(idx2 + localSize, 0, ldst_bw_test_size);\r\n    }\r\n\r\n    ret[threadId] = acc1 + acc2 + acc3 + acc4;\r\n}*/\r\n\r\n__kernel void ldst_bw_test(__global float4* A, uint count, __global float* ret) {\r\n    int threadId = get_global_id(0);\r\n    int localId = get_local_id(0);\r\n    int localSize = get_local_size(0);\r\n    int groupId = get_group_id(0);\r\n    
float acc1 = 1.1;\r\n    float acc2 = 2.2;\r\n    float acc3 = 3.3;\r\n    float acc4 = 4.4;\r\n\r\n    // assumes local memory size is at least 1024 float4s\r\n    int idx0 = localId;\r\n    int idx1 = idx0 + localSize;\r\n    int idx2 = idx1 + localSize;\r\n    int idx3 = idx2 + localSize;\r\n    for (int i = 0; i < count; i += (16*4)) { \r\n        acc1 += dot(A[idx0], A[idx1]) + dot(A[idx2], A[idx3]);\r\n        idx0 = (idx0 + localSize) & 0x3FF;\r\n        idx1 = (idx1 + localSize) & 0x3FF;\r\n        idx2 = (idx2 + localSize) & 0x3FF;\r\n        idx3 = (idx3 + localSize) & 0x3FF;\r\n\r\n        acc2 += dot(A[idx0], A[idx1]) + dot(A[idx2], A[idx3]);\r\n        idx0 = (idx0 + localSize) & 0x3FF;\r\n        idx1 = (idx1 + localSize) & 0x3FF;\r\n        idx2 = (idx2 + localSize) & 0x3FF;\r\n        idx3 = (idx3 + localSize) & 0x3FF;\r\n\r\n        acc3 += dot(A[idx0], A[idx1]) + dot(A[idx2], A[idx3]);\r\n        idx0 = (idx0 + localSize) & 0x3FF;\r\n        idx1 = (idx1 + localSize) & 0x3FF;\r\n        idx2 = (idx2 + localSize) & 0x3FF;\r\n        idx3 = (idx3 + localSize) & 0x3FF;\r\n\r\n        acc4 += dot(A[idx0], A[idx1]) + dot(A[idx2], A[idx3]);\r\n        idx0 = (idx0 + localSize) & 0x3FF;\r\n        idx1 = (idx1 + localSize) & 0x3FF;\r\n        idx2 = (idx2 + localSize) & 0x3FF;\r\n        idx3 = (idx3 + localSize) & 0x3FF;\r\n    }\r\n\r\n    ret[threadId] =  acc1 + acc2 + acc3 + acc4;\r\n}"
  },
  {
    "path": "GpuMemLatency/kernels/local_64_bw_test.cl",
    "content": "#define local64_test_size 2048 // size was given in 4B elements. This test uses 8B\r\n__kernel void local_64_bw_test(__global ulong* A, uint count, __global ulong* ret) { \r\n    __local ulong local_a[local64_test_size];\r\n    int threadId = get_global_id(0);\r\n    int localId = get_local_id(0);\r\n    int localSize = get_local_size(0);\r\n    int groupId = get_group_id(0);\r\n\r\n    // workgroup-wide copy from global mem into local mem\r\n    for (int i = get_local_id(0);i < local64_test_size; i += get_local_size(0))\r\n        local_a[i] = A[i];\r\n    barrier(CLK_LOCAL_MEM_FENCE);\r\n\r\n    ulong acc0 = 0, acc1 = 0, acc2 = 0, acc3 = 0;\r\n\r\n    // assumes local memory size is at least 512x 64-bit uints\r\n    int idx0 = localId;\r\n    int idx1 = localId + localSize;\r\n    for (int i = 0; i < count; i += 8) { \r\n        acc0 ^= local_a[idx0];\r\n        acc1 ^= local_a[idx1];\r\n        idx0 = (idx0 + localSize) & 0x1FF;\r\n        idx1 = (idx1 + localSize) & 0x1FF;\r\n\r\n        acc2 ^= local_a[idx0];\r\n        acc3 ^= local_a[idx1];\r\n        idx0 = (idx0 + localSize) & 0x1FF;\r\n        idx1 = (idx1 + localSize) & 0x1FF;\r\n    }\r\n\r\n    ret[threadId] = acc0 + acc1 + acc2 + acc3;\r\n}\r\n"
  },
  {
    "path": "GpuMemLatency/kernels/local_atomic_latency_test.cl",
    "content": "__kernel void local_atomic_latency_test(__global int* A, int count, __global int* ret) {\r\n    __local int a[1];\r\n    int current = get_global_id(0) + 1;\r\n    if (current == 1) a[0] = A[0];\r\n    barrier(CLK_LOCAL_MEM_FENCE);\r\n\r\n    while (current <= 2 * count) {\r\n        if (atomic_cmpxchg(a, current - 1, current) == current - 1) {\r\n            current += 2;\r\n        }\r\n    }\r\n}\r\n\r\n#define local_atomic_add_wg_size 256\r\n__kernel void local_atomic_add_test(__global int *A, int count) {\r\n    __local int local_a[local_atomic_add_wg_size];\r\n    local_a[get_local_id(0)] = A[get_global_id(0)];\r\n    barrier(CLK_LOCAL_MEM_FENCE);\r\n\r\n    int addend = get_global_id(0);\r\n    int addend1 = addend + 5;\r\n    int addend2 = addend + 6;\r\n    int addend3 = addend + 7;\r\n    int addend4 = addend + 8;\r\n    int addend5 = addend + 9;\r\n    int addend6 = addend + 10;\r\n    int addend7 = addend + 11;\r\n    __local int *target = local_a + get_local_id(0);\r\n    for (int i = 0; i < count; i++)\r\n    {\r\n        atomic_add(target, addend);\r\n        atomic_add(target, addend1);\r\n        atomic_add(target, addend2);\r\n        atomic_add(target, addend3);\r\n        atomic_add(target, addend4);\r\n        atomic_add(target, addend5);\r\n        atomic_add(target, addend6);\r\n        atomic_add(target, addend7);\r\n    }\r\n\r\n    A[get_global_id(0)] = local_a[get_local_id(0)];\r\n}"
  },
  {
    "path": "GpuMemLatency/kernels/local_bw_test.cl",
    "content": "#define local_mem_bw_test_size 1024\r\n// test bandwidth with local memory. A must be at least local_mem_bw_test_size in floats\r\n__kernel void local_bw_test(__global float* A, uint count, __global float* ret) {\r\n __local float local_a[local_mem_bw_test_size];\r\n    int threadId = get_global_id(0);\r\n    int localId = get_local_id(0);\r\n    int localSize = get_local_size(0);\r\n    int groupId = get_group_id(0);\r\n    float acc1 = 1.1;\r\n    float acc2 = 2.2;\r\n    float acc3 = 3.3;\r\n    float acc4 = 4.4;\r\n\r\n    //printf(\"subgroup size %d\\n\", get_sub_group_size());\r\n\r\n    // workgroup-wide copy from global mem into local mem\r\n    for (int i = get_local_id(0);i < local_mem_bw_test_size; i += get_local_size(0))\r\n        local_a[i] = A[i];\r\n    barrier(CLK_LOCAL_MEM_FENCE);\r\n\r\n    // assumes local memory size is at least 1024 floats\r\n    int idx0 = localId;\r\n    int idx1 = localId + localSize;\r\n    int idx2 = localId + localSize * 2;\r\n    for (int i = 0; i < count; i += 12) { \r\n        acc1 += local_a[idx0] * local_a[idx1] + local_a[idx2];\r\n        idx0 = (idx0 + localSize) & 0x3FF;\r\n        idx1 = (idx1 + localSize) & 0x3FF;\r\n        idx2 = (idx2 + localSize) & 0x3FF;\r\n\r\n        acc2 += local_a[idx0] * local_a[idx1] + local_a[idx2];\r\n        idx0 = (idx0 + localSize) & 0x3FF;\r\n        idx1 = (idx1 + localSize) & 0x3FF;\r\n        idx2 = (idx2 + localSize) & 0x3FF;\r\n\r\n        acc3 += local_a[idx0] * local_a[idx1] + local_a[idx2];\r\n        idx0 = (idx0 + localSize) & 0x3FF;\r\n        idx1 = (idx1 + localSize) & 0x3FF;\r\n        idx2 = (idx2 + localSize) & 0x3FF;\r\n\r\n        acc4 += local_a[idx0] * local_a[idx1] + local_a[idx2];\r\n        idx0 = (idx0 + localSize) & 0x3FF;\r\n        idx1 = (idx1 + localSize) & 0x3FF;\r\n        idx2 = (idx2 + localSize) & 0x3FF;\r\n    }\r\n\r\n    ret[threadId] = acc1 + acc2 + acc3 + acc4;\r\n}"
  },
  {
    "path": "GpuMemLatency/kernels/local_float4_bw_test.cl",
    "content": "#define local_mem_bw_test_size 1024\r\n__kernel void local_float4_bw_test(__global float4* A, uint count, __global float* ret) {\r\n    __local float4 local_a[local_mem_bw_test_size];\r\n    int threadId = get_global_id(0);\r\n    int localId = get_local_id(0);\r\n    int localSize = get_local_size(0);\r\n    int groupId = get_group_id(0);\r\n    float4 acc1 = A[get_global_id(0) & 0x3FF];\r\n    float4 acc2 = A[(get_global_id(0) + 1) & 0x3FF];\r\n    float4 acc3 = A[(get_global_id(0) + 2) & 0x3FF];\r\n    float4 acc4 = A[(get_global_id(0) + 3) & 0x3FF];\r\n\r\n    // workgroup-wide copy from global mem into local mem\r\n    for (int i = get_local_id(0); i < local_mem_bw_test_size; i += get_local_size(0))\r\n        local_a[i] = A[i];\r\n    barrier(CLK_LOCAL_MEM_FENCE);\r\n\r\n    // assumes local memory size is at least 1024 float4s\r\n    int idx0 = localId;\r\n    int idx1 = localId + localSize;\r\n    int idx2 = localId + localSize * 2;\r\n    for (int i = 0; i < count; i += (12 * 4)) {\r\n        acc1 += local_a[idx0] * local_a[idx1] + local_a[idx2];\r\n        idx0 = (idx0 + localSize) & 0x3FF;\r\n        idx1 = (idx1 + localSize) & 0x3FF;\r\n        idx2 = (idx2 + localSize) & 0x3FF;\r\n\r\n        acc2 += local_a[idx0] * local_a[idx1] + local_a[idx2];\r\n        idx0 = (idx0 + localSize) & 0x3FF;\r\n        idx1 = (idx1 + localSize) & 0x3FF;\r\n        idx2 = (idx2 + localSize) & 0x3FF;\r\n\r\n        acc3 += local_a[idx0] * local_a[idx1] + local_a[idx2];\r\n        idx0 = (idx0 + localSize) & 0x3FF;\r\n        idx1 = (idx1 + localSize) & 0x3FF;\r\n        idx2 = (idx2 + localSize) & 0x3FF;\r\n\r\n        acc4 += local_a[idx0] * local_a[idx1] + local_a[idx2];\r\n        idx0 = (idx0 + localSize) & 0x3FF;\r\n        idx1 = (idx1 + localSize) & 0x3FF;\r\n        idx2 = (idx2 + localSize) & 0x3FF;\r\n    }\r\n\r\n    ret[threadId] = dot(acc1, acc2) + dot(acc3, acc4);\r\n}\r\n\r\n__kernel void mixed_float4_bw_test(__global float4* A, uint count, 
__global float* ret) {\r\n    __local float4 local_a[local_mem_bw_test_size];\r\n    int threadId = get_global_id(0);\r\n    int localId = get_local_id(0);\r\n    int localSize = get_local_size(0);\r\n    int groupId = get_group_id(0);\r\n    float4 acc1 = A[get_global_id(0) & 0x3FF];\r\n    float4 acc2 = A[(get_global_id(0) + 1) & 0x3FF];\r\n    float4 acc3 = A[(get_global_id(0) + 2) & 0x3FF];\r\n    float4 acc4 = A[(get_global_id(0) + 3) & 0x3FF];\r\n    float4 acc5 = A[(get_global_id(0) + 4) & 0x3FF];\r\n    float4 acc6 = A[(get_global_id(0) + 5) & 0x3FF];\r\n    float4 acc7 = A[(get_global_id(0) + 6) & 0x3FF];\r\n    float4 acc8 = A[(get_global_id(0) + 7) & 0x3FF];\r\n\r\n    // workgroup-wide copy from global mem into local mem\r\n    for (int i = get_local_id(0);i < local_mem_bw_test_size; i += get_local_size(0))\r\n        local_a[i] = A[i];\r\n    barrier(CLK_LOCAL_MEM_FENCE);\r\n\r\n    // assumes local memory size is at least 1024 float4s\r\n    int idx0 = localId;\r\n    int idx1 = localId + localSize;\r\n    int idx2 = localId + localSize * 2;\r\n    for (int i = 0; i < count; i += (16*4)) {\r\n        local_a[idx0] += A[idx1] * A[idx2]; // 4 * (3R 1W)\r\n        idx0 = (idx0 + localSize) & 0x3FF;\r\n        idx1 = (idx1 + localSize) & 0x3FF;\r\n        idx2 = (idx2 + localSize) & 0x3FF;\r\n\r\n        local_a[idx0] += A[idx1] * A[idx2];\r\n        idx0 = (idx0 + localSize) & 0x3FF;\r\n        idx1 = (idx1 + localSize) & 0x3FF;\r\n        idx2 = (idx2 + localSize) & 0x3FF;\r\n\r\n        local_a[idx0] += A[idx1] * A[idx2];\r\n        idx0 = (idx0 + localSize) & 0x3FF;\r\n        idx1 = (idx1 + localSize) & 0x3FF;\r\n        idx2 = (idx2 + localSize) & 0x3FF;\r\n\r\n        local_a[idx0] += A[idx1] * A[idx2];\r\n        idx0 = (idx0 + localSize) & 0x3FF;\r\n        idx1 = (idx1 + localSize) & 0x3FF;\r\n        idx2 = (idx2 + localSize) & 0x3FF;\r\n    }\r\n\r\n    ret[threadId] = dot(local_a[get_local_id(0)], local_a[get_local_id(0) + 1]);\r\n}"
  },
  {
    "path": "GpuMemLatency/kernels/local_unrolled_latency_test.cl",
    "content": "#define local_mem_test_size 1024\r\n// uses local memory (LDS/shmem)\r\n__kernel void local_unrolled_latency_test(__global const uint* A, int count, __global uint* ret) {\r\n    __local uint local_a[local_mem_test_size]; // 4 KB, should be present on all GPUs, amirite?\r\n    // better be fast\r\n    for (int i = get_local_id(0);i < local_mem_test_size; i += get_local_size(0))\r\n        local_a[i] = A[i];\r\n    barrier(CLK_LOCAL_MEM_FENCE);\r\n\r\n    // everyone else can chill/get masked off\r\n    if (get_local_id(0) == 0) {\r\n        uint current = local_a[0];\r\n        uint result;\r\n        for (int i = 0; i < count; i += 10) {\r\n            result += current;\r\n            current = local_a[current];\r\n            result += current;\r\n            current = local_a[current];\r\n            result += current;\r\n            current = local_a[current];\r\n            result += current;\r\n            current = local_a[current];\r\n            result += current;\r\n            current = local_a[current];\r\n            result += current;\r\n            current = local_a[current];\r\n            result += current;\r\n            current = local_a[current];\r\n            result += current;\r\n            current = local_a[current];\r\n            result += current;\r\n            current = local_a[current];\r\n            result += current;\r\n            current = local_a[current];\r\n        }\r\n\r\n        ret[0] = result;\r\n    }\r\n}"
  },
  {
    "path": "GpuMemLatency/kernels/scalar_unrolled_latency_test.cl",
    "content": "// Ensures the loaded value will be constant across a workgroup\r\n__kernel void scalar_unrolled_latency_test(__global const uint* A, int count, __global uint* ret) {\r\n    uint current = get_num_groups(0) > 1 ? ret[get_group_id(0) * get_local_size(0)]: A[0];\r\n    uint result;\r\n    for (int i = 0; i < count; i += 10) {\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n    }\r\n\r\n    ret[0] = result;\r\n}"
  },
  {
    "path": "GpuMemLatency/kernels/sum_bw_test.cl",
    "content": "__kernel void sum_bw_test(__global float* A, uint count, uint float4size, __global float* ret, uint skip, __global uint *startPositions) {\r\n    int threadId = get_global_id(0);\r\n    int localId = get_local_id(0);\r\n    int localSize = get_local_size(0);\r\n    int groupId = get_group_id(0);\r\n    float4 result1 = (0.1f,0.2f,0.3f,0.4f);\r\n    float4 result2 = (1.1f,1.2f,1.3f,1.4f);\r\n    float4 result3 = (2.1f,2.2f,2.3f,2.4f);\r\n    float4 result4 = (3.0f,3.1f,3.2f,3.3f);\r\n    float4 result5 = (4.0f,4.2f,4.1f,4.3f);\r\n\r\n    int initialIdx = startPositions[threadId];\r\n    //int initialIdx = (groupId * skip * localSize + localId) % (float4size - 1);\r\n    //startPositions[threadId] = initialIdx; // for debugging\r\n\r\n    int idx = initialIdx;\r\n    __global float4 *B = (__global float4 *)A;\r\n    for (int i = 0; i < count; i += 20) {\r\n        result1 += B[idx];\r\n        idx += localSize;\r\n        if (idx >= float4size) idx = initialIdx;\r\n\r\n        result2 += B[idx];\r\n        idx += localSize;\r\n        if (idx >= float4size) idx = initialIdx;\r\n\r\n        result3 += B[idx];\r\n        idx += localSize;\r\n        if (idx >= float4size) idx = initialIdx;\r\n\r\n        result4 += B[idx];\r\n        idx += localSize;\r\n        if (idx >= float4size) idx = initialIdx;\r\n\r\n        result5 += B[idx];\r\n        idx += localSize;\r\n        if (idx >= float4size) idx = initialIdx;\r\n    }\r\n\r\n    ret[threadId] = dot(result1, result2) + dot(result3, result4) + dot(result4, result5);\r\n}"
  },
  {
    "path": "GpuMemLatency/kernels/tex_bw_test.cl",
    "content": "__constant sampler_t funny_sampler = CLK_NORMALIZED_COORDS_TRUE | // coordinates are from 0 to 1 (float)\r\n                                        CLK_ADDRESS_REPEAT | // going out of bounds = replicate\r\n                                        CLK_FILTER_NEAREST;\r\n__kernel void tex_bw_test(__read_only image2d_t A, int count, __global float* ret) {\r\n    int localId = get_local_id(0);\r\n    float pos = get_global_id(0) * native_recip((float)get_global_size(0));\r\n    float2 increment;\r\n    increment.x = 0.01; // guessing\r\n    increment.y = 0.01;\r\n\r\n    float2 current0, current1, current2, current3;\r\n    current0.x = pos;\r\n    current0.y = pos;\r\n    current1.x = 0.1 + (localId / 10000);\r\n    current1.y = 0.1 + (localId / 10000);\r\n    current2.x = 0.01 + (localId / 10000);\r\n    current2.y = 0.01 + (localId / 10000);\r\n    current3.x = 0.002 + (localId / 5000);\r\n    current3.y = 0.001 + (localId / 5000);\r\n\r\n    float4 tmp0 = read_imagef(A, funny_sampler, current0);\r\n    float4 tmp1 = read_imagef(A, funny_sampler, current1);\r\n    float4 tmp2 = read_imagef(A, funny_sampler, current2);\r\n    float4 tmp3 = read_imagef(A, funny_sampler, current3);\r\n    for (int i = 0; i < count; i += 4)\r\n    {\r\n        tmp0 += read_imagef(A, funny_sampler, current0);\r\n        tmp1 += read_imagef(A, funny_sampler, current1);\r\n        tmp2 += read_imagef(A, funny_sampler, current2);\r\n        tmp3 += read_imagef(A, funny_sampler, current3);\r\n        current0 += increment;\r\n        current1 += increment;\r\n        current2 += increment;\r\n        current3 += increment;\r\n    }\r\n\r\n    *ret = dot(tmp0, tmp1) + dot(tmp2, tmp3);\r\n}"
  },
  {
    "path": "GpuMemLatency/kernels/tex_latency_test.cl",
    "content": "__kernel void tex_latency_test(__read_only image1d_buffer_t A, int count, __global int* ret, int list_size) {\r\n    int localId = get_local_id(0);\r\n    // uint4 current = read_imageui(A, direct_sampler, 0); // using sampler screws things up\r\n    int startPos = get_global_size(0) > 1 ? ret[get_global_id(0)] : 0;\r\n    uint4 current = read_imageui(A, startPos);\r\n    // printf(\"start x: %u -> %u\\n\", startPos, current.x);\r\n    for (int i = 0; i < count; i += 10) {\r\n        // printf(\"current: %u %u %u %u, address: %d\\n\", current.x, current.y, current.z, current.w, (int)current.x / 4);\r\n        //current = read_imageui(A, direct_sampler, i);\r\n        current = read_imageui(A, current.x);\r\n        current = read_imageui(A, current.x);\r\n        current = read_imageui(A, current.x);\r\n        current = read_imageui(A, current.x);\r\n        current = read_imageui(A, current.x);\r\n        current = read_imageui(A, current.x);\r\n        current = read_imageui(A, current.x);\r\n        current = read_imageui(A, current.x);\r\n        current = read_imageui(A, current.x);\r\n        current = read_imageui(A, current.x);\r\n        //printf(\"%d: current read: %u %u %u %u\\n\", i, current.x, current.y, current.z, current.w);\r\n        // local_a[localId] = current;\r\n    }\r\n\r\n    ret[get_global_id(0)] = current.x;\r\n}\r\n"
  },
  {
    "path": "GpuMemLatency/kernels/unrolled_latency_test.cl",
    "content": "// unrolled until terascale no longer saw further improvement (10x unroll)\r\n// assumes count will be a multiple of 10. but it won't be too inaccurate with a big count\r\n// not divisible by 10\r\n__kernel void unrolled_latency_test(__global const uint* A, int count, __global uint* ret) {\r\n    uint current = get_global_size(0) > 1 ? ret[get_global_id(0)]: A[0]; // this will test vector latency on AMD. Set to A[0] for scalar latency\r\n    uint result;\r\n    for (int i = 0; i < count; i += 10) {\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n    }\r\n\r\n    ret[0] = result;\r\n}"
  },
  {
    "path": "GpuMemLatency/latency_test.c",
    "content": "#include \"opencltest.h\"\n\n// list_size = number of 4B (32-bit) elements\nfloat latency_test(cl_context context,\n    cl_command_queue command_queue,\n    cl_kernel kernel,\n    uint32_t list_size,\n    uint32_t chase_iterations,\n    short uniform,\n    int threads,\n    int local_size,\n    int wave_size,\n    int stride,\n    uint32_t *elapsed_ms)\n{\n    size_t global_item_size = 1, local_item_size = 1;\n    cl_int ret;\n    float latency;\n    int64_t time_diff_ms;\n    uint32_t result;\n\n    if (threads && local_size)\n    {\n        local_item_size = local_size;\n        global_item_size = threads;\n    }\n\n    // fprintf(stderr, \"Testing latency with %d threads %d local size %d list size\\n\", threads, local_size, list_size);\n\n    // Sanity Checks\n    if (!uniform && ((stride * 2 > list_size * 4) || // 2 cache lines\n        ((threads > 1) && (stride * 2 > (list_size * 4 / (threads / wave_size)))))) // handle partition case\n    {\n        fprintf(stderr, \"Less than 2 lines will be visited with stride %d, list size %dx 32-bit INTs\\n\", stride, list_size);\n        return 1.0f;\n    }\n\n    // Fill pattern arr\n    uint32_t* A = (uint32_t*)malloc(sizeof(uint32_t) * list_size);\n    uint32_t* thread_start = (uint32_t*)malloc(sizeof(uint32_t) * (global_item_size));\n    memset(A, 0, sizeof(uint32_t) * list_size);\n    if (threads < 2 || uniform) {\n        FillPatternArr(A, list_size, stride);\n        thread_start[0] = 0;\n    }\n    else\n    {\n        if (wave_size <= 1) wave_size = 1;\n\n        // partition pattern arr, creating a section for each wave\n        int wave_count = threads / wave_size;\n        int sub_list_size = list_size / wave_count;\n        for (int waveId = 0; waveId < wave_count; waveId++)\n        {\n            int waveId_start = sub_list_size * waveId;\n            thread_start[wave_size * waveId] = waveId_start;\n            FillPatternArr(A + waveId_start, sub_list_size, stride);\n            // 
fprintf(stderr, \"starting thread %d at %d\\n\", threadId, threadId_start);\n\n            // offset indices\n            for (int subIdx = 0; subIdx < sub_list_size; subIdx++)\n            {\n                A[waveId_start + subIdx] += waveId_start;\n            }\n        }\n\n        // make sure all threads in a wave access the same item\n        for (int i = 1; i < threads; i++)\n        {\n            int waveId = i / wave_size;\n            thread_start[i] = thread_start[waveId * wave_size];\n            //fprintf(stderr, \"wave %d thread %d starting at %d\\n\", waveId, i, thread_start[i]);\n        }\n    }\n\n    // copy array to device\n    cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, list_size * sizeof(uint32_t), NULL, &ret);\n    clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, list_size * sizeof(uint32_t), A, 0, NULL, NULL);\n\n    cl_mem result_obj = clCreateBuffer(context, CL_MEM_READ_WRITE, global_item_size * sizeof(uint32_t), NULL, &ret);\n    clEnqueueWriteBuffer(command_queue, result_obj, CL_TRUE, 0, global_item_size * sizeof(uint32_t), thread_start, 0, NULL, NULL);\n    clFinish(command_queue);\n\n    // Set kernel arguments\n    ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&a_mem_obj);\n    if (ret != CL_SUCCESS)\n    {\n        fprintf(stderr, \"Failed to set list as kernel arg. clSetKernelArg returned %d\\n\", ret);\n        latency = 0;\n        goto cleanup;\n    }\n\n    ret = clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&chase_iterations);\n    ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&result_obj);\n\n    start_timing();\n    // Execute the OpenCL kernel. launch a single thread\n    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);\n    if (ret != CL_SUCCESS)\n    {\n        fprintf(stderr, \"Failed to submit kernel to command queue. 
clEnqueueNDRangeKernel returned %d\\n\", ret);\n        latency = 0;\n        goto cleanup;\n    }\n\n    ret = clFinish(command_queue); // returns success even when TDR happens?\n    if (ret != CL_SUCCESS)\n    {\n        printf(\"Failed to finish command queue. clFinish returned %d\\n\", ret);\n        latency = 0;\n        goto cleanup;\n    }\n\n    time_diff_ms = end_timing();\n    if (elapsed_ms != NULL) *elapsed_ms = time_diff_ms;\n    latency = 1e6 * (float)time_diff_ms / (float)chase_iterations;\n\n    ret = clEnqueueReadBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(uint32_t), &result, 0, NULL, NULL);\n    clFinish(command_queue);\n\n    //fprintf(stderr, \"Finished reading result. Sum: %d\\n\", result[0]);\n\ncleanup:\n    clFlush(command_queue);\n    clFinish(command_queue);\n    clReleaseMemObject(a_mem_obj);\n    clReleaseMemObject(result_obj);\n    free(A);\n    return latency;\n}\n\nfloat tex_latency_test(cl_context context,\n    cl_command_queue command_queue,\n    cl_kernel kernel,\n    uint32_t list_size,\n    uint32_t chase_iterations,\n    int threads,\n    int local_size,\n    int wave_size)\n{\n    size_t global_item_size = 1, local_item_size = 1;\n    cl_int ret = 0;\n    uint32_t result;\n    cl_mem a_mem_obj = NULL, result_obj = NULL, tex_obj = NULL;\n    float latency = 0;\n\n    if (threads > 1)\n    {\n        global_item_size = threads;\n        local_item_size = local_size;\n    }\n\n    uint32_t* A = (uint32_t*)malloc(sizeof(uint32_t) * list_size);\n    uint32_t* thread_start = (uint32_t*)malloc(sizeof(uint32_t) * (global_item_size));\n    memset(A, 0, sizeof(uint32_t) * list_size);\n    if (threads < 2) {\n        FillPatternArr(A, list_size, CACHELINE_SIZE);\n        thread_start[0] = 0;\n    }\n    else\n    {\n        if (wave_size <= 1) wave_size = 1;\n\n        // partition pattern arr, creating a section for each wave\n        int wave_count = threads / wave_size;\n        int sub_list_size = list_size / wave_count;\n    
    for (int waveId = 0; waveId < wave_count; waveId++)\n        {\n            int waveId_start = sub_list_size * waveId;\n            thread_start[wave_size * waveId] = waveId_start;\n            FillPatternArr(A + waveId_start, sub_list_size, CACHELINE_SIZE);\n            // fprintf(stderr, \"starting thread %d at %d\\n\", threadId, threadId_start);\n\n            // offset indices\n            for (int subIdx = 0; subIdx < sub_list_size; subIdx++)\n            {\n                A[waveId_start + subIdx] += waveId_start;\n            }\n        }\n\n        // make sure all threads in a wave access the same item\n        for (int i = 1; i < threads; i++)\n        {\n            int waveId = i / wave_size;\n            thread_start[i] = thread_start[waveId * wave_size];\n            //fprintf(stderr, \"wave %d thread %d starting at %d\\n\", waveId, i, thread_start[i]);\n        }\n    }\n\n    // use buffer as texture\n    a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, list_size * sizeof(uint32_t), NULL, &ret);\n    clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, list_size * sizeof(uint32_t), A, 0, NULL, NULL);\n    clFinish(command_queue);\n    cl_image_format imageFormat;\n    imageFormat.image_channel_data_type = CL_UNSIGNED_INT32;\n    imageFormat.image_channel_order = CL_R;\n\n    cl_image_desc imageDesc;\n    memset(&imageDesc, 0, sizeof(cl_image_desc));\n    imageDesc.buffer = a_mem_obj;\n    imageDesc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;\n    imageDesc.image_width = list_size; // width in pixels\n    //imageDesc.image_height = 1; // not used for 1D image\n    //imageDesc.image_depth = 1;  // not used for 1D image\n    //imageDesc.mem_object = a_mem_obj;\n    tex_obj = clCreateImage(context, CL_MEM_READ_ONLY, &imageFormat, &imageDesc, NULL, &ret);\n    if (ret != CL_SUCCESS)\n    {\n        fprintf(stderr, \"Failed to create image: %d\\n\", ret);\n        goto texLatencyCleanup;\n    }\n\n    size_t origin[] = { 0, 0, 0 };\n    
size_t region[] = { imageDesc.image_width, 1, 1 };\n    ret = clEnqueueWriteImage(command_queue, tex_obj, CL_TRUE, origin, region, 0, 0, A, 0, NULL, NULL);\n    if (ret != CL_SUCCESS)\n    {\n        fprintf(stderr, \"Failed to copy image: %d\\n\", ret);\n        goto texLatencyCleanup;\n    }\n    \n    result_obj = clCreateBuffer(context, CL_MEM_READ_WRITE, global_item_size * sizeof(uint32_t), NULL, &ret);\n    clEnqueueWriteBuffer(command_queue, result_obj, CL_TRUE, 0, global_item_size * sizeof(uint32_t), thread_start, 0, NULL, NULL);\n    clFinish(command_queue);\n\n    ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&tex_obj);\n    ret = clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&chase_iterations);\n    ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&result_obj);\n    ret = clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&list_size);\n\n    start_timing();\n    // Execute the OpenCL kernel\n    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);\n    if (ret != CL_SUCCESS)\n    {\n        fprintf(stderr, \"Failed to submit kernel to command queue. clEnqueueNDRangeKernel returned %d\\n\", ret);\n        latency = 0;\n        goto texLatencyCleanup;\n    }\n\n    ret = clFinish(command_queue); // returns success even when TDR happens?\n    if (ret != CL_SUCCESS)\n    {\n        printf(\"Failed to finish command queue. 
clFinish returned %d\\n\", ret);\n        latency = 0;\n        goto texLatencyCleanup;\n    }\n\n    uint64_t time_diff_ms = end_timing();\n    latency = 1e6 * (float)time_diff_ms / (float)chase_iterations;\n\n    ret = clEnqueueReadBuffer(command_queue, result_obj, CL_TRUE, 0, global_item_size * sizeof(uint32_t), thread_start, 0, NULL, NULL);\n    clFinish(command_queue);\n\n    // for (int i = 0; i < global_item_size; i++) fprintf(stderr, \"Thread %d ended at %d\\n\", i, thread_start[i]);\n\ntexLatencyCleanup:\n    clFlush(command_queue);\n    clFinish(command_queue);\n    clReleaseMemObject(a_mem_obj);\n    clReleaseMemObject(tex_obj);\n    clReleaseMemObject(result_obj);\n    free(A);\n    return latency;\n}"
  },
  {
    "path": "GpuMemLatency/local_mem_latency_kernel.cl",
    "content": "// for testing total local memory capacity by seeing when threads can no longer overlap in time\r\n// due to local mem capacity limits across the GPU\r\n// calling code expected to define LATENCY_LOCAL_MEM_SIZE\r\n__kernel void unrolled_latency_test_localmem(__global const int* A, int count, __global int* ret) {\r\n    __local int local_a[LATENCY_LOCAL_MEM_SIZE];\r\n    int start = A[0]; // this will test scalar latency, always\r\n    int current = A[start];\r\n    int result;\r\n    for (int i = 0; i < count; i += 10) {\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n        result += current;\r\n        current = A[current];\r\n        local_a[i & (LATENCY_LOCAL_MEM_SIZE - 1)] = current;\r\n    }\r\n\r\n    ret[0] = local_a[current & (LATENCY_LOCAL_MEM_SIZE - 1)];\r\n}\r\n"
  },
  {
    "path": "GpuMemLatency/opencltest.c",
    "content": "#include \"opencltest.h\"\n\n// default test sizes for latency, in KB\nint default_test_sizes[] = { 1, 2, 4, 8, 12, 16, 24, 32, 48, 64, 96, 128, 144, 160, 172, 192, 256, 384, 512, 600, 768, 1024, 1536, 2048, 3072, 4096, 5120, 6144, \n    8192, 16384, 18432, 20480, 24576, 25600, 28672, 32768, 36864, 40960, 41200, 49152, 65536, 98304, 131072, 196608, 262144, 524288, 768432,  819200, 921600, 1048576 };\n\n// lining this up with nemes's VK bw test sizes. units for this one are in bytes\nconst uint64_t default_bw_test_sizes[] = {\n    4096, 8192, 12288, 16384, 20480, 24576, 28672, 32768, 40960, 49152, 57344, 65536, 81920, 98304, 114688, 131072,\n        196608, 262144, 393216, 458752, 524288, 786432, 1048576, 1572864, 2097152, 3145728, 4194304, 6291456, 8388608, 12582912, 16777216, 20971520,\n        25165824, 33554432, 37748736, 41943040, 50331648, 58720256, 67108864, 100663296, 134217728, 201326592, 268435456, 402653184, 536870912, 805306368,\n        1073741824, 1610579968, 2147483648, 3221225472, 4294967296\n};\n\nfloat int_exec_latency_test(cl_context context,\n    cl_command_queue command_queue,\n    cl_kernel kernel,\n    uint32_t iterations);\n\nuint32_t scale_bw_iterations(uint32_t base_iterations, uint32_t size_kb);\nuint64_t scale_iterations(uint32_t size_kb, uint64_t iterations);\n\ncl_ulong get_max_buffer_size();\ncl_ulong get_max_constant_buffer_size();\n\nenum TestType {\n    VectorMemLatency,\n    ScalarMemLatency,\n    ConstantMemLatency,\n    LocalMemCapacity,\n    LocalMemLatency,\n    TexMemLatency,\n    GlobalAtomicLatency,\n    LocalAtomicLatency,\n    GlobalAtomicAdd,\n    LocalAtomicAdd,\n    GlobalMemBandwidth,\n    LocalMemBandwidth,\n    LocalMemChaseBandwidth,\n    LocalMem64Bandwidth,\n    LocalMemFloat4Bandwidth,\n    MixedFloat4Bandwidth,\n    LoadStoreBandwidth,\n    TextureThroughput,\n    BufferBandwidth,\n    MemBandwidthWorkgroupScaling,\n    CoreToCore,\n    LinkBandwidth,\n    InstructionRate,\n    Divergence,\n    
Partition,\n    MemDivergence\n};\n\n\nint main(int argc, char* argv[]) {\n    cl_int ret;\n    uint32_t stride = 64;\n    uint32_t list_size = 3840 * 2160 * 4;\n    uint32_t chase_iterations = 1e6 * 7;\n    // skip = 0 means auto\n    uint32_t thread_count = 1, local_size = 1, skip = 0, wave = 0;\n    float result;\n    int platform_index = -1, device_index = -1;\n    enum TestType testType = VectorMemLatency;\n    char thread_count_set = 0, local_size_set = 0, chase_iterations_set = 0, skip_set = 0;\n    int sizeKb = 0;\n    int forceCuCount = 0;\n    int forcefp16 = 0, forcefp64 = 0;\n\n    // vars for local mem capacity testing\n    int local_mem_size_kb = 0; // local mem allocated for each wg\n    int group_count = 0;       // max wg count\n\n    for (int argIdx = 1; argIdx < argc; argIdx++) {\n        if (*(argv[argIdx]) == '-') {\n            char* arg = argv[argIdx] + 1;\n            if (_strnicmp(arg, \"stride\", 6) == 0) {\n                argIdx++;\n                stride = atoi(argv[argIdx]);\n                fprintf(stderr, \"Using stride = %u\\n\", stride);\n            }\n            else if (_strnicmp(arg, \"iterations\", 10) == 0) {\n                argIdx++;\n                chase_iterations = atoi(argv[argIdx]);\n                chase_iterations_set = 1;\n                fprintf(stderr, \"Using %u iterations\\n\", chase_iterations);\n            }\n            else if (_strnicmp(arg, \"threads\", 7) == 0) {\n                argIdx++;\n                thread_count = atoi(argv[argIdx]);\n                thread_count_set = 1;\n                fprintf(stderr, \"Using %u threads\\n\", thread_count);\n            }\n            else if (_strnicmp(arg, \"localsize\", 9) == 0) {\n                argIdx++;\n                local_size = atoi(argv[argIdx]);\n                local_size_set = 1;\n                fprintf(stderr, \"Using local size = %u\\n\", local_size);\n            }\n            else if (_strnicmp(arg, \"wave\", 4) == 0) {\n                
argIdx++;\n                wave = atoi(argv[argIdx]);\n                fprintf(stderr, \"Estimated wave size = %u\\n\", wave);\n            }\n            else if (_strnicmp(arg, \"platform\", 8) == 0) {\n                argIdx++;\n                platform_index = atoi(argv[argIdx]);\n                fprintf(stderr, \"Using OpenCL platform index %d\\n\", platform_index);\n            }\n            else if (_strnicmp(arg, \"device\", 6) == 0) {\n                argIdx++;\n                device_index = atoi(argv[argIdx]);\n                fprintf(stderr, \"Using OpenCL device index %d\\n\", device_index);\n            }\n            else if (_strnicmp(arg, \"bwskip\", 6) == 0) {\n                argIdx++;\n                skip = atoi(argv[argIdx]);\n                fprintf(stderr, \"Workgroups will be spaced %u apart\\n\", skip);\n            }\n            else if (_strnicmp(arg, \"sizekb\", 6) == 0) {\n                argIdx++;\n                sizeKb = atoi(argv[argIdx]);\n                fprintf(stderr, \"Only testing %d KB\\n\", sizeKb);\n            }\n            else if (_strnicmp(arg, \"localmemsize\", 12) == 0)\n            {\n                argIdx++;\n                local_mem_size_kb = atoi(argv[argIdx]);\n                fprintf(stderr, \"Testing with %d of local memory allocated per WG\\n\", local_mem_size_kb);\n            }\n            else if (_strnicmp(arg, \"groupcount\", 10) == 0)\n            {\n                argIdx++;\n                group_count = atoi(argv[argIdx]);\n                fprintf(stderr, \"Testing with up to %d WGs\\n\", group_count);\n            }\n            else if (_strnicmp(arg, \"saveprogram\", 11) == 0) {\n                saveprogram = 1;\n                fprintf(stderr, \"Writing compiled program to disk\\n\");\n            }\n            else if (_strnicmp(arg, \"forcefp16\", 10) == 0) {\n                forcefp16 = 1;\n                fprintf(stderr, \"For instruction rate testing, will run FP16 tests regardless of 
whether support is advertised\\n\");\n            }\n            else if (_strnicmp(arg, \"forcefp64\", 10) == 0) {\n                forcefp64 = 1;\n                fprintf(stderr, \"For instruction rate testing, will run FP64 tests regardless of whether support is advertised\\n\");\n            }\n            else if (_strnicmp(arg, \"test\", 4) == 0) {\n                argIdx++;\n                if (_strnicmp(argv[argIdx], \"vectorlatency\", 13) == 0) {\n                    testType = VectorMemLatency;\n                    fprintf(stderr, \"Testing global memory latency, vector accesses\\n\");\n                }\n                else if (_strnicmp(argv[argIdx], \"scalarlatency\", 13) == 0) {\n                    testType = ScalarMemLatency;\n                    fprintf(stderr, \"Testing global memory latency, scalar accesses\\n\");\n                }\n                else if (_strnicmp(argv[argIdx], \"constantlatency\", 15) == 0) {\n                    testType = ConstantMemLatency;\n                    fprintf(stderr, \"Testing constant memory latency\\n\");\n                }\n                else if (_strnicmp(argv[argIdx], \"memdivergence\", 13) == 0) {\n                    testType = MemDivergence;\n                    fprintf(stderr, \"Testing memory access divergence cost\\n\");\n                }\n                else if (_strnicmp(argv[argIdx], \"localmemcapacity\", 16) == 0) {\n                    testType = LocalMemCapacity;\n                    fprintf(stderr, \"Testing GPU-wide local memory capacity. 
Make sure localmemsize/groupcount are set appropriately!\\n\");\n\n                    if (sizeKb == 0) sizeKb = 1;\n                    if (group_count == 0) group_count = 16;\n                }\n                else if (_strnicmp(argv[argIdx], \"globalatomiccmpxchg\", 19) == 0) {\n                    testType = GlobalAtomicLatency;\n                    fprintf(stderr, \"Testing global atomic latency (cmpxchg)\\n\");\n                }\n                else if (_strnicmp(argv[argIdx], \"globalatomicadd\", 15) == 0)\n                {\n                    testType = GlobalAtomicAdd;\n                    fprintf(stderr, \"Testing global atomic add\\n\");\n                }\n                else if (_strnicmp(argv[argIdx], \"locallatency\", 13) == 0) {\n                    testType = LocalMemLatency;\n                    fprintf(stderr, \"Testing local mem latency\\n\");\n                }\n                else if (_strnicmp(argv[argIdx], \"texlatency\", 10) == 0) {\n                    testType = TexMemLatency;\n                    fprintf(stderr, \"Testing texture mem latency\\n\");\n                }\n                else if (_strnicmp(argv[argIdx], \"localatomiccmpxchg\", 18) == 0) {\n                    testType = LocalAtomicLatency;\n                    fprintf(stderr, \"Testing local atomic latency (cmpxchg)\\n\");\n                }\n                else if (_strnicmp(argv[argIdx], \"localatomicadd\", 14) == 0) {\n                    testType = LocalAtomicAdd;\n                    fprintf(stderr, \"Testing local atomic add\\n\");\n                }\n                else if (_strnicmp(argv[argIdx], \"bw\", 2) == 0) {\n                    testType = GlobalMemBandwidth;\n                    fprintf(stderr, \"Testing global memory bandwidth\\n\");\n\n                    // Somewhat reasonable defaults\n                    if (!thread_count_set) thread_count = 131072;\n                    if (!local_size_set) local_size = 256;\n                    if 
(!chase_iterations_set) chase_iterations = 500000;\n                }\n                else if (_strnicmp(argv[argIdx], \"localbw\", 7) == 0) {\n                    testType = LocalMemBandwidth;\n                    if (!thread_count_set) thread_count = 262144;\n                    if (!local_size_set) local_size = 256;\n                    fprintf(stderr, \"Testing local memory bandwidth\\n\");\n                }\n                else if (_strnicmp(argv[argIdx], \"localchasebw\", 12) == 0) {\n                    testType = LocalMemChaseBandwidth;\n                    fprintf(stderr, \"Testing local memory bandwidth using pointer chasing and lots of waves\\n\");\n                }\n                else if (_strnicmp(argv[argIdx], \"local64bw\", 9) == 0) {\n                    testType = LocalMem64Bandwidth;\n                    fprintf(stderr, \"Testing local memory bandwidth using 64-bit loads\\n\");\n                }\n                else if (_strnicmp(argv[argIdx], \"localfloat4bw\", 13) == 0) {\n                    testType = LocalMemFloat4Bandwidth;\n                    fprintf(stderr, \"Testing local memory bandwidth using float4 (4x32-bit) loads\\n\");\n                }\n                else if (_strnicmp(argv[argIdx], \"mixedbw\", 7) == 0) {\n                    testType = MixedFloat4Bandwidth;\n                    fprintf(stderr, \"Mixed local/global load bw test with float4\\n\");\n                }\n                else if (_strnicmp(argv[argIdx], \"bufferbw\", 8) == 0) {\n                    testType = BufferBandwidth;\n                    fprintf(stderr, \"Testing buffer bandwidth\\n\");\n                }\n                else if (_strnicmp(argv[argIdx], \"ldstbw\", 6) == 0) {\n                    testType = LoadStoreBandwidth;\n                    fprintf(stderr, \"Testing load/store bandwidth\\n\");\n                }\n                else if (_strnicmp(argv[argIdx], \"scaling\", 7) == 0)\n                {\n                    testType = 
MemBandwidthWorkgroupScaling;\n                    fprintf(stderr, \"Testing BW scaling with workgroups\\n\");\n                    if (!chase_iterations_set) chase_iterations = 20000000;\n\n                    if (argIdx + 1 < argc && argv[argIdx + 1][0] != '-')\n                    {\n                        argIdx++;\n                        forceCuCount = atoi(argv[argIdx]);\n                        fprintf(stderr, \"Using up to %d workgroups\\n\", forceCuCount);\n                    }\n                }\n                else if (_strnicmp(argv[argIdx], \"c2c\", 3) == 0)\n                {\n                    testType = CoreToCore;\n                    fprintf(stderr, \"Testing latency with global atomics across CU count\\n\");\n                }\n                else if (_strnicmp(argv[argIdx], \"link\", 4) == 0)\n                {\n                    testType = LinkBandwidth;\n                    fprintf(stderr, \"Testing host <-> GPU link bandwidth\\n\");\n                    if (!chase_iterations_set) chase_iterations = 30000000;\n                }\n                else if (_strnicmp(argv[argIdx], \"instructionrate\", 15) == 0)\n                {\n                    testType = InstructionRate;\n                    fprintf(stderr, \"Testing instruction rate\\n\");\n                    if (!chase_iterations_set) chase_iterations = 1000;\n                    if (!local_size_set && !thread_count_set) {\n                        local_size = 256;\n                        thread_count = 32768;\n                        fprintf(stderr, \"Selecting local size = %d, threads = %d\\n\", local_size, thread_count);\n                    }\n                }\n                else if (_strnicmp(argv[argIdx], \"tmu\", 3) == 0)\n                {\n                    testType = TextureThroughput;\n                    fprintf(stderr, \"Testing TMUs\\n\");\n                }\n                else if (_strnicmp(argv[argIdx], \"divergence\", 10) == 0)\n                {\n       
             testType = Divergence;\n                    fprintf(stderr, \"Testing compute throughput with varying numbers of consecutive threads doing the same op\\n\");\n                    if (!local_size_set && !thread_count_set) {\n                        local_size = 256;\n                        thread_count = 32768;\n                        fprintf(stderr, \"Selecting local size = %d, threads = %d\\n\", local_size, thread_count);\n                    }\n                }\n                else if (_strnicmp(argv[argIdx], \"partition\", 9) == 0)\n                {\n                    testType = Partition;\n                    fprintf(stderr, \"Testing execution unit partitioning. Make sure wave size is set!\\n\");\n                }\n                else {\n                    fprintf(stderr, \"I'm so confused. Unknown test type %s\\n\", argv[argIdx]);\n                }\n            }\n        }\n    }\n\n    if (argc == 1)\n    {\n        fprintf(stderr, \"Usage:\\n\\t[-test <latency/constantlatency/globalatomic/localatomic/bw>]\\n\\t[-platform <platform id>]\\n\\t[-device <device id>]\\n\");\n        fprintf(stderr, \"\\t[-threads <opencl thread count>]\\n\\t[-localsize <opencl workgroup size>]\\n\\t[-bwskip <workgroup spacing>]\\n\");\n        fprintf(stderr, \"Number of threads (OpenCL global work size) must be divisible by local work size\\n\");\n    }\n\n    fprintf(stderr, \"Using %d threads with local size %d\\n\", thread_count, local_size);\n#pragma region opencl_overhead\n    // Create an OpenCL context\n    cl_context context = get_context_from_user(platform_index, device_index);\n    if (context == NULL) exit(1);\n\n    // Load kernel\n    cl_program program = build_program(context, \"kernel.cl\", NULL);\n    if (saveprogram) write_program(program, \"kernel\");\n\n    // Create a command queue\n    cl_command_queue command_queue = clCreateCommandQueue(context, selected_device_id, 0, &ret);\n    fprintf(stderr, \"clCreateCommandQueue returned 
%d\\n\", ret);\n\n    cl_kernel c2c_atomic_latency_test_kernel = clCreateKernel(program, \"c2c_atomic_exec_latency_test\", &ret);\n    cl_kernel dummy_add_kernel = clCreateKernel(program, \"dummy_add\", &ret);\n    cl_kernel local_bw_chase_kernel = clCreateKernel(program, \"local_chase_kernel\", &ret);\n#pragma endregion opencl_overhead\n\n    max_global_test_size = get_max_buffer_size();\n\n    if (testType == GlobalAtomicLatency)\n    {\n        cl_program prog = build_program(context, \"atomic_exec_latency_test.cl\", NULL);\n        cl_kernel atomic_latency_test_kernel = clCreateKernel(prog, \"atomic_exec_latency_test\", &ret);\n        if (saveprogram) write_program(prog, \"atomic_exec_latency_test\");\n\n        chase_iterations = 200000;\n        uint32_t elapsed_ms = 0, target_ms = 2000;\n        while (elapsed_ms < target_ms / 2) {\n            result = int_atomic_latency_test(context, command_queue, atomic_latency_test_kernel, chase_iterations, false, &elapsed_ms);\n            fprintf(stderr, \"%d iterations, %u ms => %f ns\\n\", chase_iterations, elapsed_ms, result);\n            chase_iterations = scale_iterations_to_target(chase_iterations, elapsed_ms, target_ms);\n        }\n        printf(\"global atomic latency: %f\\n\", result);\n        clReleaseKernel(atomic_latency_test_kernel);\n        clReleaseProgram(prog);\n    }\n    else if (testType == LocalAtomicLatency)\n    {\n        cl_program prog = build_program(context, \"local_atomic_latency_test.cl\", NULL);\n        cl_kernel local_atomic_latency_test_kernel = clCreateKernel(prog, \"local_atomic_latency_test\", &ret);\n        if (saveprogram) write_program(prog, \"local_atomic_latency_test\");\n\n        chase_iterations = 500000;\n        uint32_t elapsed_ms = 0, target_ms = 2000;\n        while (elapsed_ms < target_ms / 2) {\n            result = int_atomic_latency_test(context, command_queue, local_atomic_latency_test_kernel, chase_iterations, true, &elapsed_ms);\n            
fprintf(stderr, \"%d iterations, %u ms => %f ns\\n\", chase_iterations, elapsed_ms, result);\n            chase_iterations = scale_iterations_to_target(chase_iterations, (float)elapsed_ms, (float)target_ms);\n        }\n        printf(\"local atomic latency: %f\\n\", result);\n        clReleaseKernel(local_atomic_latency_test_kernel);\n        clReleaseProgram(prog);\n    }\n    else if (testType == GlobalAtomicAdd)\n    {\n        cl_program prog = build_program(context, \"atomic_exec_latency_test.cl\", NULL);\n        cl_kernel global_atomic_add_kernel = clCreateKernel(prog, \"atomic_add_test\", &ret);\n        if (saveprogram) write_program(prog, \"atomic_exec_latency_test\");\n        result = int_atomic_add_test(context, command_queue, global_atomic_add_kernel, thread_count, local_size);\n        fprintf(stderr, \"Global atomic INT32 adds: %f GOPS\\n\", result);\n    }\n    else if (testType == LocalAtomicAdd)\n    {\n        cl_program prog = build_program(context, \"local_atomic_latency_test.cl\", NULL);\n        cl_kernel local_atomic_add_kernel = clCreateKernel(prog, \"local_atomic_add_test\", &ret);\n        if (saveprogram) write_program(prog, \"local_atomic_latency_test\");\n        result = int_atomic_add_test(context, command_queue, local_atomic_add_kernel, thread_count, local_size);\n        fprintf(stderr, \"Local atomic INT32 adds: %f GOPS\\n\", result);\n    }\n    else if (testType == VectorMemLatency || testType == ScalarMemLatency)\n    {\n        cl_program prog;\n        cl_kernel globalMemLatencyKernel;\n        if (testType == ScalarMemLatency) \n        {\n            prog = build_program(context, \"scalar_unrolled_latency_test.cl\", NULL);\n            globalMemLatencyKernel = clCreateKernel(prog, \"scalar_unrolled_latency_test\", &ret);\n            if (saveprogram) write_program(prog, \"scalar_unrolled_latency_test\");\n        }\n        else // Vector mem latency\n        {\n            prog = build_program(context, 
\"unrolled_latency_test.cl\", NULL);\n            globalMemLatencyKernel = clCreateKernel(prog, \"unrolled_latency_test\", &ret);\n            if (saveprogram) write_program(prog, \"unrolled_latency_test\");\n        }\n\n        fprintf(stderr, \"Doing %d K p-chase iterations with stride %d over %d KiB region\\n\", chase_iterations / 1000, stride, list_size * 4 / 1024);\n        printf(\"\\nSattolo, global memory latency (up to %llu K) unroll:\\n\", max_global_test_size / 1024);\n\n        for (int size_idx = 0; size_idx < sizeof(default_test_sizes) / sizeof(int); size_idx++) {\n            if (max_global_test_size < sizeof(int) * 256 * default_test_sizes[size_idx]) {\n                printf(\"%d K would exceed device's max buffer size of %llu K, stopping here.\\n\", default_test_sizes[size_idx], max_global_test_size / 1024);\n                break;\n            }\n            result = latency_test(context, command_queue, \n                globalMemLatencyKernel, 256 * default_test_sizes[size_idx], scale_iterations(default_test_sizes[size_idx], chase_iterations), false, thread_count, local_size, wave, stride, NULL);\n            printf(\"%d,%f\\n\", default_test_sizes[size_idx], result);\n            if (result == 0) {\n                printf(\"Something went wrong, not testing anything bigger.\\n\");\n                break;\n            }\n        }\n\n        clReleaseKernel(globalMemLatencyKernel);\n        clReleaseProgram(prog);\n    }\n    else if (testType == MemDivergence) {\n        cl_program vecProg, texProg;\n        cl_kernel vecKernel, texKernel;\n        fprintf(stderr, \"Testing mem divergence with localsize %d, test size %d KB\\n\", local_size, sizeKb);\n\n        // vector\n        vecProg = build_program(context, \"unrolled_latency_test.cl\", NULL);\n        if (saveprogram) write_program(vecProg, \"vector_unrolled_latency_test\");\n        vecKernel = clCreateKernel(vecProg, \"unrolled_latency_test\", &ret);\n\n        texProg = 
build_program(context, \"tex_latency_test.cl\", NULL);\n        texKernel = clCreateKernel(texProg, \"tex_latency_test\", &ret);\n        if (saveprogram) write_program(texProg, \"tex_latency_test\");\n\n        float* memDivergenceResults = (float*)malloc(sizeof(float) * local_size * 2);\n        for (int threadCount = 1; threadCount <= local_size; threadCount++) {\n            float vecResult = latency_test(context, command_queue, vecKernel, 256 * sizeKb, scale_iterations(sizeKb, chase_iterations), false, threadCount, threadCount, 1, stride, NULL);\n            memDivergenceResults[threadCount * 2] = vecResult;\n\n            float texResult = tex_latency_test(context, command_queue, texKernel, 256 * sizeKb, scale_iterations(sizeKb, chase_iterations), threadCount, threadCount, 1);\n            memDivergenceResults[threadCount * 2 + 1] = texResult;\n\n            fprintf(stderr, \"%d threads: %f vec, %f tex\\n\", threadCount, vecResult, texResult);\n        }\n\n        for (int threadCount = 1; threadCount <= local_size; threadCount++) {\n            printf(\"%d,%f,%f\\n\", threadCount, memDivergenceResults[threadCount * 2], memDivergenceResults[threadCount * 2 + 1]);\n        }\n\n        clReleaseKernel(texKernel);\n        clReleaseKernel(vecKernel);\n        clReleaseProgram(texProg);\n        clReleaseProgram(vecProg);\n        free(memDivergenceResults);\n    }\n    else if (testType == LocalMemCapacity)\n    {\n        char build_options[128];\n        const char* local_mem_define_prefix = \"-D LATENCY_LOCAL_MEM_SIZE=\";\n        memset(build_options, 0, 128);\n        memcpy(build_options, local_mem_define_prefix, 26);\n        snprintf(build_options + 26, 128 - 26, \"%u\", 256 * local_mem_size_kb);\n        cl_program program = build_program(context, \"local_mem_latency_kernel.cl\", build_options);\n        cl_kernel local_mem_capacity_kernel = clCreateKernel(program, \"unrolled_latency_test_localmem\", &ret);\n        if (ret != CL_SUCCESS)\n        {\n 
           fprintf(stderr, \"Could not create local mem capacity testing kernel\\n\");\n            exit(0);\n        }\n\n        if (saveprogram) write_program(program, \"local_mem_latency_kernel\");\n\n        fprintf(stderr, \"Testing local memory capacity with %u KB of local mem per WG, up to %u WGs\\n\", local_mem_size_kb, group_count);\n        printf(\"Groups,Local Mem Capacity,Latency\\n\");\n        for (int groups = 1; groups <= group_count; groups++) {\n            result = latency_test(context, command_queue, \n                local_mem_capacity_kernel, \n                256 * sizeKb, \n                (uint32_t)scale_iterations(sizeKb, chase_iterations), \n                true, \n                groups, \n                1, \n                1, \n                64, \n                NULL);\n            printf(\"%d,%d,%f\\n\", groups, groups* local_mem_size_kb, result);\n        }\n\n        clReleaseKernel(local_mem_capacity_kernel);\n        clReleaseProgram(program);\n    }\n    else if (testType == ConstantMemLatency)\n    {\n        cl_program prog = build_program(context, \"constant_unrolled_latency_test.cl\", NULL);\n        cl_kernel constant_kernel = clCreateKernel(prog, \"constant_unrolled_latency_test\", &ret);\n        if (saveprogram) write_program(prog, \"constant_unrolled_latency_test\");\n        cl_ulong max_constant_test_size = get_max_constant_buffer_size();\n        printf(\"\\nSattolo, constant memory (up to %llu K), no-unroll:\\n\", max_constant_test_size / 1024);\n\n        for (int size_idx = 0; size_idx < sizeof(default_test_sizes) / sizeof(int); size_idx++) {\n            if (max_constant_test_size < sizeof(int) * 256 * default_test_sizes[size_idx]) {\n                printf(\"%d K would exceed device's max constant buffer size of %llu K, stopping here.\\n\", default_test_sizes[size_idx], max_constant_test_size / 1024);\n                break;\n            }\n            result = latency_test(context, command_queue, 
constant_kernel, 256 * default_test_sizes[size_idx], scale_iterations(default_test_sizes[size_idx], chase_iterations), false, thread_count, local_size, wave, stride, NULL);\n            printf(\"%d,%f\\n\", default_test_sizes[size_idx], result);\n            if (result == 0) {\n                printf(\"Something went wrong, not testing anything bigger.\\n\");\n                break;\n            }\n        }\n\n        clReleaseKernel(constant_kernel);\n        clReleaseProgram(program);\n    }\n    else if (testType == TexMemLatency)\n    {\n        cl_program prog = build_program(context, \"tex_latency_test.cl\", NULL);\n        cl_kernel tex_latency_kernel = clCreateKernel(prog, \"tex_latency_test\", &ret);\n        if (saveprogram) write_program(prog, \"tex_latency_test\");\n        cl_ulong max_tex_test_size = get_max_tex_buffer_size();\n        for (int size_idx = 0; size_idx < sizeof(default_test_sizes) / sizeof(int); size_idx++) {\n            if (default_test_sizes[size_idx] * 1024 > max_tex_test_size) {\n                printf(\"%d K would exceed device's texture buffer size of %llu K, stopping here.\\n\", default_test_sizes[size_idx], max_tex_test_size / 1024);\n                break;\n            }\n\n            result = tex_latency_test(context, command_queue, tex_latency_kernel, 256 * default_test_sizes[size_idx], scale_iterations(default_test_sizes[size_idx], chase_iterations), \n                thread_count, local_size, wave);\n            printf(\"%d,%f\\n\", default_test_sizes[size_idx], result);\n            if (result == 0) {\n                printf(\"Something went wrong, not testing anything bigger.\\n\");\n                break;\n            }\n        }\n\n        clReleaseKernel(tex_latency_kernel);\n        clReleaseProgram(prog);\n    }\n    else if (testType == LocalMemLatency)\n    {\n        cl_program prog = build_program(context, \"local_unrolled_latency_test.cl\", NULL);\n        cl_kernel local_kernel = clCreateKernel(prog, 
\"local_unrolled_latency_test\", &ret);\n        if (saveprogram) write_program(prog, \"local_unrolled_latency_test\");\n\n        uint32_t elapsed_ms = 0, target_ms = 2000;\n        chase_iterations = 50000;\n        while (elapsed_ms < target_ms / 2) {\n            result = latency_test(context, command_queue, local_kernel, 1024, chase_iterations, false, thread_count, local_size, wave, stride, &elapsed_ms);\n            fprintf(stderr, \"%u iterations, %u ms -> %f ns\\n\", chase_iterations, elapsed_ms, result);\n            chase_iterations = scale_iterations_to_target(chase_iterations, elapsed_ms, target_ms);\n        }\n        printf(\"Local mem latency: %f\\n\", result);\n\n        clReleaseKernel(local_kernel);\n        clReleaseProgram(prog);\n    }\n    else if (testType == GlobalMemBandwidth)\n    {\n        cl_program prog = build_program(context, \"sum_bw_test.cl\", NULL);\n        cl_kernel bw_kernel = clCreateKernel(prog, \"sum_bw_test\", &ret);\n        if (saveprogram) write_program(prog, \"sum_bw_test\");\n        fprintf(stderr, \"Using %u threads, %u local size, %u base iterations\\n\", thread_count, local_size, chase_iterations);\n        printf(\"\\nMemory bandwidth (up to %llu K):\\n\", max_global_test_size / 1024);\n\n        if (!sizeKb) {\n            for (int size_idx = 0; size_idx < sizeof(default_bw_test_sizes) / sizeof(unsigned long long); size_idx++) {\n                uint64_t testSizeKb = default_bw_test_sizes[size_idx] / 1024;\n                if ((max_global_test_size / 1024) < testSizeKb) {\n                    printf(\"%llu K would exceed device's max buffer size of %llu K, stopping here.\\n\", testSizeKb, max_global_test_size / 1024);\n                    break;\n                }\n\n                result = bw_test(context,\n                    command_queue,\n                    bw_kernel, 256 * testSizeKb,\n                    thread_count,\n                    local_size,\n                    skip,\n                    
scale_bw_iterations(chase_iterations, testSizeKb));\n\n                printf(\"%llu,%f\\n\", testSizeKb, result);\n                if (result == 0) {\n                    printf(\"Something went wrong, not testing anything bigger.\\n\");\n                    break;\n                }\n            }\n        }\n        else {\n            result = bw_test(context,\n                command_queue,\n                bw_kernel, 256 * sizeKb,\n                thread_count,\n                local_size,\n                skip,\n                scale_bw_iterations(chase_iterations, sizeKb));\n\n            printf(\"%lu,%f\\n\", sizeKb, result);\n            if (result == 0) {\n                printf(\"Something went wrong, not testing anything bigger.\\n\");\n            }\n        }\n\n        clReleaseKernel(bw_kernel);\n        clReleaseProgram(prog);\n    }\n    else if (testType == LocalMemBandwidth || \n        testType == LocalMem64Bandwidth || \n        testType == BufferBandwidth || \n        testType == LoadStoreBandwidth ||\n        testType == TextureThroughput ||\n        testType == LocalMemFloat4Bandwidth ||\n        testType == MixedFloat4Bandwidth)\n    {\n        cl_program prog;\n        cl_kernel local_bw_kernel = NULL, local_64_bw_kernel = NULL, local_float4_bw_kernel = NULL, buffer_bw_kernel = NULL, tex_bw_kernel = NULL, loadstore_bw_kernel = NULL;\n        cl_kernel mixed_bw_kernel = NULL;\n        if (testType == LocalMemBandwidth)\n        {\n            prog = build_program(context, \"local_bw_test.cl\", NULL);\n            local_bw_kernel = clCreateKernel(prog, \"local_bw_test\", &ret);\n            if (saveprogram) write_program(prog, \"local_bw_test\");\n        }\n        else if (testType == LocalMem64Bandwidth) {\n            prog = build_program(context, \"local_64_bw_test.cl\", NULL);\n            local_64_bw_kernel = clCreateKernel(prog, \"local_64_bw_test\", &ret);\n            if (saveprogram) write_program(prog, \"local_64_bw_test\");\n  
      }\n        else if (testType == LocalMemFloat4Bandwidth) {\n            prog = build_program(context, \"local_float4_bw_test.cl\", NULL);\n            local_float4_bw_kernel = clCreateKernel(prog, \"local_float4_bw_test\", &ret);\n            if (saveprogram) write_program(prog, \"local_float4_bw_test\");\n        }\n        else if (testType == BufferBandwidth) {\n            prog = build_program(context, \"buffer_bw_test.cl\", NULL);\n            buffer_bw_kernel = clCreateKernel(prog, \"buffer_bw_test\", &ret);\n            if (saveprogram) write_program(prog, \"buffer_bw_test\");\n        }\n        else if (testType == LoadStoreBandwidth)\n        {\n            prog = build_program(context, \"ldst_bw_test.cl\", NULL);\n            loadstore_bw_kernel = clCreateKernel(prog, \"ldst_bw_test\", &ret);\n            if (saveprogram) write_program(prog, \"ldst_bw_test\");\n        }\n        else if (testType == MixedFloat4Bandwidth)\n        {\n            prog = build_program(context, \"local_float4_bw_test.cl\", NULL);\n            mixed_bw_kernel = clCreateKernel(prog, \"mixed_float4_bw_test\", NULL);\n            if (saveprogram) write_program(prog, \"mixed_float4_bw_test\");\n        }\n        else { // tex throughput\n            prog = build_program(context, \"tex_bw_test.cl\", NULL);\n            tex_bw_kernel = clCreateKernel(prog, \"tex_bw_test\", &ret);\n            if (saveprogram) write_program(prog, \"tex_bw_test\");\n        }\n\n        uint32_t thread_low = 1024, thread_high = 1048576*4;\n        if (!thread_count_set) thread_count = thread_low;\n        float max_bw = 0;\n\n        while (true) {\n            int64_t elapsed_ms = 0, target_ms = 1500;\n            if (!chase_iterations_set) chase_iterations = 500000;\n            while (elapsed_ms < target_ms / 2)\n            {\n                if (testType == LocalMemBandwidth) {\n                    fprintf(stderr, \"Testing local mem bw\\n\");\n                    result = 
local_bw_test(context, command_queue, local_bw_kernel, thread_count, local_size, chase_iterations, &elapsed_ms);\n                }\n                else if (testType == LocalMem64Bandwidth) {\n                    fprintf(stderr, \"Testing local mem bw with 64-bit loads\\n\");\n                    result = local_64_bw_test(context, command_queue, local_64_bw_kernel, thread_count, local_size, chase_iterations, &elapsed_ms);\n                }\n                else if (testType == LocalMemFloat4Bandwidth) {\n                    fprintf(stderr, \"Testing local mem bw with float4 loads\\n\");\n                    result = local_bw_test(context, command_queue, local_float4_bw_kernel, thread_count, local_size, chase_iterations, &elapsed_ms);\n                }\n                else if (testType == MixedFloat4Bandwidth) {\n                    fprintf(stderr, \"Testing mixed local/global bw with float4 loads\\n\");\n                    result = local_bw_test(context, command_queue, mixed_bw_kernel, thread_count, local_size, chase_iterations, &elapsed_ms);\n                }\n                else if (testType == BufferBandwidth)\n                {\n                    fprintf(stderr, \"Testing buffer bw\\n\");\n                    result = buffer_bw_test(context, command_queue, buffer_bw_kernel, thread_count, local_size, chase_iterations, &elapsed_ms);\n                }\n                else if (testType == LoadStoreBandwidth)\n                {\n                    fprintf(stderr, \"Testing global load bandwidth\\n\");\n                    result = local_bw_test(context, command_queue, loadstore_bw_kernel, thread_count, local_size, chase_iterations, &elapsed_ms);\n                }\n                else if (testType == TextureThroughput)\n                {\n                    fprintf(stderr, \"Testing texture throughput\\n\");\n                    result = tex_bw_test(context,\n                        command_queue,\n                        tex_bw_kernel,\n               
         256, // width\n                        256, // height\n                        thread_count,\n                        local_size,\n                        0,\n                        chase_iterations,\n                        &elapsed_ms);\n                }\n\n                fprintf(stderr, \"%u threads, %u local size, %u iterations ==> %f GB/s, elapsed time %lld ms\\n\",\n                    thread_count, local_size, chase_iterations, result, elapsed_ms);\n                if (elapsed_ms < 25) chase_iterations *= 2;\n                else chase_iterations = (uint32_t)((float)chase_iterations * (target_ms / elapsed_ms));\n                if (result == 0)\n                {\n                    fprintf(stderr, \"Run failed\\n\");\n                    break;\n                }\n\n                if (chase_iterations_set) break;\n            }\n                    \n            if (result > max_bw) max_bw = result;\n\n            if (thread_count_set) break;\n            thread_count *= 2;\n            if (thread_count > thread_high) break;\n        }\n\n        printf(\"Bandwidth: %f GB/s\\n\", max_bw);\n    }\n    else if (testType == LocalMemChaseBandwidth)\n    {\n        int thread_scan_done = 0;\n        uint32_t thread_low = 256, thread_high = 524288 * 4;\n        fprintf(stderr, \"Testing local memory bandwidth using pointer chasing. 
Ensure wave size is set correctly with -wave\\n\");\n\n        if (!thread_count_set) thread_count = thread_low;\n\n        while (!thread_scan_done) {\n            // ignore chase iterations and auto manage it\n            int64_t elapsed_ms = 0, target_ms = 1500;\n            chase_iterations = 500000;\n\n            if (thread_count_set) thread_scan_done = 0;\n            else\n            {\n                thread_count *= 2;\n                if (thread_count > thread_high) break;\n            }\n\n            while (elapsed_ms < target_ms / 2)\n            {\n                result = local_chase_bw_test(context, command_queue, local_bw_chase_kernel, thread_count, local_size, chase_iterations, wave, &elapsed_ms);\n                fprintf(stderr, \"%u threads, %u local size, %u wave, %u iterations ==> %f GB/s, elapsed time %lld ms\\n\",\n                    thread_count, local_size, wave, chase_iterations, result, elapsed_ms);\n                if (elapsed_ms < 25) chase_iterations *= 2;\n                else chase_iterations = (uint32_t)((float)chase_iterations * (target_ms / elapsed_ms));\n                if (result == 0)\n                {\n                    fprintf(stderr, \"Run failed\\n\");\n                    break;\n                }\n            }\n        }\n\n        printf(\"Local memory bandwidth: %f GB/s\\n\", result);\n    }\n    else if (testType == MemBandwidthWorkgroupScaling)\n    {\n        cl_program prog = build_program(context, \"sum_bw_test.cl\", NULL);\n        cl_kernel bw_kernel = clCreateKernel(prog, \"sum_bw_test\", &ret);\n        if (saveprogram) write_program(prog, \"sum_bw_test\");\n        uint32_t testSizeCount = sizeof(default_bw_test_sizes) / sizeof(unsigned long long);\n        cl_uint cuCount = forceCuCount ? 
forceCuCount : getCuCount();\n\n        fprintf(stderr, \"Device has %u compute units\\n\", cuCount);\n\n        float* scalingResults = (float*)malloc(sizeof(float) * cuCount * testSizeCount);\n        for (uint32_t workgroupCount = 1; workgroupCount <= cuCount; workgroupCount++)\n        {\n            if (!sizeKb) {\n                for (int size_idx = 0; size_idx < testSizeCount; size_idx++)\n                {\n                    uint64_t testSizeKb = default_bw_test_sizes[size_idx] / 1024;\n                    fprintf(stderr, \"Testing size %llu KB, %u workgroups\\n\", testSizeKb, workgroupCount);\n                    if ((max_global_test_size / 1024) < testSizeKb) {\n                        printf(\"%llu K would exceed device's max buffer size of %llu K\\n\", testSizeKb, max_global_test_size / 1024);\n                        scalingResults[(workgroupCount - 1) * testSizeCount + size_idx] = 0;\n                        continue;\n                    }\n\n                    result = bw_test(context,\n                        command_queue,\n                        bw_kernel, 256 * testSizeKb,\n                        local_size * workgroupCount,\n                        local_size,\n                        skip,\n                        scale_bw_iterations(chase_iterations, testSizeKb));\n\n                    scalingResults[(workgroupCount - 1) * testSizeCount + size_idx] = result;\n                    fprintf(stderr, \"%u workgroups, %llu KB = %f GB/s\\n\", workgroupCount, testSizeKb, result);\n                }\n            }\n            else {\n                fprintf(stderr, \"Testing size %d KB, %u workgroups\\n\", sizeKb, workgroupCount);\n                result = bw_test(context,\n                    command_queue,\n                    bw_kernel, 256 * sizeKb,\n                    local_size * workgroupCount,\n                    local_size,\n                    skip,\n                    scale_bw_iterations(chase_iterations, sizeKb));\n                
scalingResults[workgroupCount - 1] = result;\n                fprintf(stderr, \"%u workgroups, %lu KB = %f GB/s\\n\", workgroupCount, sizeKb, result);\n            }\n        }\n\n        if (!sizeKb) {\n            for (uint32_t workgroupCount = 1; workgroupCount <= cuCount; workgroupCount++)\n            {\n                printf(\",%u\", workgroupCount);\n            }\n            printf(\"\\n\");\n\n            for (int size_idx = 0; size_idx < testSizeCount; size_idx++)\n            {\n                printf(\"%llu\", default_bw_test_sizes[size_idx] / 1024);\n                for (uint32_t workgroupCount = 1; workgroupCount <= cuCount; workgroupCount++)\n                {\n                    printf(\",%f\", scalingResults[(workgroupCount - 1) * testSizeCount + size_idx]);\n                }\n\n                printf(\"\\n\");\n            }\n        }\n        else {\n            printf(\"For %d KB:\\n\", sizeKb);\n            for (int workgroupIdx = 0; workgroupIdx < cuCount; workgroupIdx++)\n            {\n                printf(\"%d,%f\\n\", workgroupIdx + 1, scalingResults[workgroupIdx]);\n            }\n\n            printf(\"\\n\");\n        }\n\n        free(scalingResults);\n        clReleaseKernel(bw_kernel);\n        clReleaseProgram(prog);\n    }\n    else if (testType == CoreToCore)\n    {\n        c2c_atomic_latency_test(context, command_queue, c2c_atomic_latency_test_kernel, chase_iterations);\n     }\n    else if (testType == LinkBandwidth)\n    {\n        link_bw_test(context, command_queue, dummy_add_kernel, chase_iterations);\n    }\n    else if (testType == InstructionRate)\n    {\n        instruction_rate_test(context, command_queue, thread_count, local_size, chase_iterations, forcefp16, forcefp64);\n    }\n    else if (testType == Divergence)\n    {\n        int current_wave = 1;\n        int max_wave = 512;\n        printf(\"Contiguous Thread Block Size,FP32 GOPs\\n\");\n        while (current_wave <= max_wave)\n        {\n            
float gops = run_divergence_rate_test(context, command_queue, thread_count, local_size, current_wave, NULL);\n            printf(\"%d,%f\\n\", current_wave, gops);\n            current_wave *= 2;\n        }\n    }\n    else if (testType == Partition)\n    {\n        // function and its associated kernel serve two purposes\n        int pattern4[] = { 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0 };\n        float result = run_divergence_rate_test(context, command_queue, thread_count, local_size, wave, pattern4);\n        printf(\"Throughput: %f\\n\", result);\n\n\tint patterns[] = { 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0 };\n        result = run_divergence_rate_test(context, command_queue, thread_count, local_size, wave, patterns);\n        printf(\"Throughput: %f\\n\", result);\n\n        int pattern2[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 };\n        result = run_divergence_rate_test(context, command_queue, thread_count, local_size, wave, pattern2);\n        printf(\"Throughput: %f\\n\", result);\n\n        int consec_pattern[] = { 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0 };\n        result = run_divergence_rate_test(context, command_queue, thread_count, local_size, wave, consec_pattern);\n        printf(\"Throughput: %f\\n\", result);\n    }\n\n    //printf(\"If you didn't run this through cmd, now you can copy the results. 
And press ctrl+c to close\");\n    //scanf(\"\\n\");\n\n    // Clean up\n    cleanup:\n    ret = clFlush(command_queue);\n    ret = clFinish(command_queue);\n    ret = clReleaseProgram(program);\n    ret = clReleaseCommandQueue(command_queue);\n    ret = clReleaseContext(context);\n    return 0;\n}\n\n/// <summary>\n/// Heuristic to make sure test runs for enough time but not too long\n/// </summary>\n/// <param name=\"size_kb\">Region size</param>\n/// <param name=\"iterations\">base iterations</param>\n/// <returns>scaled iterations</returns>\nuint64_t scale_iterations(uint32_t size_kb, uint64_t iterations) {\n    return 10 * iterations / pow(size_kb, 1.0 / 4.0);\n}\n\n#define INT_EXEC_INPUT_SIZE 16\nfloat int_exec_latency_test(cl_context context,\n    cl_command_queue command_queue,\n    cl_kernel kernel,\n    uint32_t iterations)\n{\n    cl_int ret;\n    cl_int result = 0;\n    size_t global_item_size = 1;\n    size_t local_item_size = 1;\n    float latency;\n    uint32_t time_diff_ms;\n    uint32_t A[INT_EXEC_INPUT_SIZE];\n\n    for (int i = 0; i < INT_EXEC_INPUT_SIZE; i++) A[i] = i;\n\n    cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, INT_EXEC_INPUT_SIZE * sizeof(uint32_t), NULL, &ret);\n    cl_mem result_obj = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_int), NULL, &result);\n    ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, INT_EXEC_INPUT_SIZE * sizeof(uint32_t), A, 0, NULL, NULL);\n    ret = clEnqueueWriteBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(cl_int), &result, 0, NULL, NULL);\n    clFinish(command_queue);\n    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&a_mem_obj);\n    clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&iterations);\n    clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&result_obj);\n\n    start_timing();\n    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);\n    if (ret != CL_SUCCESS)\n    {\n        
fprintf(stderr, \"Failed to submit kernel to command queue. clEnqueueNDRangeKernel returned %d\\n\", ret);\n        latency = 0;\n        goto cleanup;\n    }\n    clFinish(command_queue);\n    time_diff_ms = end_timing();\n    latency = 1e6 * (float)time_diff_ms / (float)(iterations * 12);\n\ncleanup:\n    clFlush(command_queue);\n    clFinish(command_queue);\n    clReleaseMemObject(a_mem_obj);\n    clReleaseMemObject(result_obj);\n    return latency;\n}\n\nuint32_t scale_bw_iterations(uint32_t base_iterations, uint32_t size_kb)\n{\n    if (size_kb < 4096) return base_iterations;\n    else return base_iterations / 2;\n}\n"
  },
  {
    "path": "GpuMemLatency/opencltest.h",
    "content": "#pragma once\n\n#ifndef opencltestheader\n#define opencltestheader\n#include <stdio.h>\n#include <stdlib.h>\n#include <stdint.h>\n#include <string.h>\n#include <math.h>\n#include \"../Common/timing.h\"\n\n#define false 0\n#define true 1\n\n#define CL_USE_DEPRECATED_OPENCL_1_2_APIS\n#ifndef __APPLE__\n#include <CL/cl.h>\n#else\n#include <OpenCL/cl.h>\n#endif\n#define MAX_SOURCE_SIZE (0x100000)\n\n#define CACHELINE_SIZE 64\n#define TARGET_TIME_MS 2000\n\n#ifndef _MSC_VER\n#define _strnicmp strncmp\n#endif\nextern cl_device_id selected_device_id;\nextern cl_platform_id selected_platform_id;\nextern cl_ulong max_global_test_size;\nextern int saveprogram;\ncl_context get_context_from_user(int platform_index, int device_index);\ncl_program build_program(cl_context context, const char* fname, const char *params);\nvoid write_program(cl_program program, const char *name);\nuint32_t adjust_iterations(uint32_t iterations, uint64_t time_ms);\nvoid FillPatternArr(uint32_t* pattern_arr, uint32_t list_size, uint32_t byte_increment);\ncl_uint getCuCount();\nsize_t getMaxWorkgroupSize();\ncl_ulong get_max_constant_buffer_size();\ncl_ulong get_max_buffer_size();\ncl_ulong get_max_tex_buffer_size();\ncl_ulong get_max_2d_tex_width();\ncl_ulong get_max_2d_tex_height();\n\nfloat int_atomic_latency_test(cl_context context,\n    cl_command_queue command_queue,\n    cl_kernel kernel,\n    uint32_t iterations,\n    short local,\n    uint32_t *time_ms);\nfloat int_atomic_add_test(cl_context context,\n    cl_command_queue command_queue,\n    cl_kernel kernel,\n    size_t threads,\n    size_t localsize);\nfloat latency_test(cl_context context,\n    cl_command_queue command_queue,\n    cl_kernel kernel,\n    uint32_t list_size,\n    uint32_t chase_iterations,\n    short uniform,\n    int threads,\n    int local_size,\n    int wave,\n    int stride,\n    uint32_t *elapsed_ms);\nfloat tex_latency_test(cl_context context,\n    cl_command_queue command_queue,\n    cl_kernel 
kernel,\n    uint32_t list_size,\n    uint32_t chase_iterations,\n    int threads,\n    int local_size,\n    int wave_size);\nfloat bw_test(cl_context context,\n    cl_command_queue command_queue,\n    cl_kernel kernel,\n    uint64_t list_size,\n    uint32_t thread_count,\n    uint32_t local_size,\n    uint32_t skip,\n    uint32_t chase_iterations);\nfloat tex_bw_test(cl_context context,\n    cl_command_queue command_queue,\n    cl_kernel kernel,\n    uint64_t width,\n    uint64_t height,\n    uint32_t thread_count,\n    uint32_t local_size,\n    uint32_t randomize,\n    uint32_t chase_iterations,\n    int64_t *time_ms);\nfloat local_bw_test(cl_context context,\n    cl_command_queue command_queue,\n    cl_kernel kernel,\n    uint32_t thread_count,\n    uint32_t local_size,\n    uint32_t chase_iterations,\n    int64_t *time_ms);\nfloat local_chase_bw_test(cl_context context,\n    cl_command_queue command_queue,\n    cl_kernel kernel,\n    uint32_t thread_count,\n    uint32_t local_size,\n    uint32_t chase_iterations,\n    uint32_t wave_size,\n    int64_t* time_ms);\nfloat local_64_bw_test(cl_context context,\n    cl_command_queue command_queue,\n    cl_kernel kernel,\n    uint32_t thread_count,\n    uint32_t local_size,\n    uint32_t chase_iterations,\n    int64_t* time_ms);\nfloat buffer_bw_test(cl_context context,\n    cl_command_queue command_queue,\n    cl_kernel kernel,\n    uint32_t thread_count,\n    uint32_t local_size,\n    uint32_t chase_iterations,\n    int64_t* time_ms);\nvoid link_bw_test(cl_context context,\n    cl_command_queue command_queue,\n    cl_kernel kernel,\n    uint32_t iterations);\nfloat c2c_atomic_latency_test(cl_context context,\n    cl_command_queue command_queue,\n    cl_kernel kernel,\n    uint32_t iterations);\n\nfloat instruction_rate_test(cl_context context,\n    cl_command_queue command_queue,\n    uint32_t thread_count,\n    uint32_t local_size,\n    uint32_t chase_iterations,\n    int forcefp16,\n    int forcefp64);\n\nfloat 
run_divergence_rate_test(cl_context context,\n    cl_command_queue command_queue,\n    uint32_t thread_count,\n    uint32_t local_size,\n    uint32_t wave,\n    int *pattern);\n#endif\n"
  },
  {
    "path": "GpuMemLatency/opencltest.sln",
    "content": "﻿\r\nMicrosoft Visual Studio Solution File, Format Version 12.00\r\n# Visual Studio Version 16\r\nVisualStudioVersion = 16.0.30503.244\r\nMinimumVisualStudioVersion = 10.0.40219.1\r\nProject(\"{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}\") = \"opencltest\", \"opencltest.vcxproj\", \"{FA51D7F4-F6E0-4CB5-9CDD-AD39A3519F78}\"\r\nEndProject\r\nGlobal\r\n\tGlobalSection(SolutionConfigurationPlatforms) = preSolution\r\n\t\tDebug|x64 = Debug|x64\r\n\t\tDebug|x86 = Debug|x86\r\n\t\tRelease|x64 = Release|x64\r\n\t\tRelease|x86 = Release|x86\r\n\tEndGlobalSection\r\n\tGlobalSection(ProjectConfigurationPlatforms) = postSolution\r\n\t\t{FA51D7F4-F6E0-4CB5-9CDD-AD39A3519F78}.Debug|x64.ActiveCfg = Debug|x64\r\n\t\t{FA51D7F4-F6E0-4CB5-9CDD-AD39A3519F78}.Debug|x64.Build.0 = Debug|x64\r\n\t\t{FA51D7F4-F6E0-4CB5-9CDD-AD39A3519F78}.Debug|x86.ActiveCfg = Debug|Win32\r\n\t\t{FA51D7F4-F6E0-4CB5-9CDD-AD39A3519F78}.Debug|x86.Build.0 = Debug|Win32\r\n\t\t{FA51D7F4-F6E0-4CB5-9CDD-AD39A3519F78}.Release|x64.ActiveCfg = Release|x64\r\n\t\t{FA51D7F4-F6E0-4CB5-9CDD-AD39A3519F78}.Release|x64.Build.0 = Release|x64\r\n\t\t{FA51D7F4-F6E0-4CB5-9CDD-AD39A3519F78}.Release|x86.ActiveCfg = Release|Win32\r\n\t\t{FA51D7F4-F6E0-4CB5-9CDD-AD39A3519F78}.Release|x86.Build.0 = Release|Win32\r\n\tEndGlobalSection\r\n\tGlobalSection(SolutionProperties) = preSolution\r\n\t\tHideSolutionNode = FALSE\r\n\tEndGlobalSection\r\n\tGlobalSection(ExtensibilityGlobals) = postSolution\r\n\t\tSolutionGuid = {4447E91D-E7A1-4249-87A7-E75A78167E71}\r\n\tEndGlobalSection\r\nEndGlobal\r\n"
  },
  {
    "path": "GpuMemLatency/opencltest.vcxproj",
    "content": "<?xml version=\"1.0\" encoding=\"utf-8\"?>\r\n<Project DefaultTargets=\"Build\" xmlns=\"http://schemas.microsoft.com/developer/msbuild/2003\">\r\n  <ItemGroup Label=\"ProjectConfigurations\">\r\n    <ProjectConfiguration Include=\"Debug|Win32\">\r\n      <Configuration>Debug</Configuration>\r\n      <Platform>Win32</Platform>\r\n    </ProjectConfiguration>\r\n    <ProjectConfiguration Include=\"Release|Win32\">\r\n      <Configuration>Release</Configuration>\r\n      <Platform>Win32</Platform>\r\n    </ProjectConfiguration>\r\n    <ProjectConfiguration Include=\"Debug|x64\">\r\n      <Configuration>Debug</Configuration>\r\n      <Platform>x64</Platform>\r\n    </ProjectConfiguration>\r\n    <ProjectConfiguration Include=\"Release|x64\">\r\n      <Configuration>Release</Configuration>\r\n      <Platform>x64</Platform>\r\n    </ProjectConfiguration>\r\n  </ItemGroup>\r\n  <PropertyGroup Label=\"Globals\">\r\n    <VCProjectVersion>16.0</VCProjectVersion>\r\n    <Keyword>Win32Proj</Keyword>\r\n    <ProjectGuid>{fa51d7f4-f6e0-4cb5-9cdd-ad39a3519f78}</ProjectGuid>\r\n    <RootNamespace>opencltest</RootNamespace>\r\n    <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>\r\n  </PropertyGroup>\r\n  <Import Project=\"$(VCTargetsPath)\\Microsoft.Cpp.Default.props\" />\r\n  <PropertyGroup Condition=\"'$(Configuration)|$(Platform)'=='Debug|Win32'\" Label=\"Configuration\">\r\n    <ConfigurationType>Application</ConfigurationType>\r\n    <UseDebugLibraries>true</UseDebugLibraries>\r\n    <PlatformToolset>v143</PlatformToolset>\r\n    <CharacterSet>Unicode</CharacterSet>\r\n  </PropertyGroup>\r\n  <PropertyGroup Condition=\"'$(Configuration)|$(Platform)'=='Release|Win32'\" Label=\"Configuration\">\r\n    <ConfigurationType>Application</ConfigurationType>\r\n    <UseDebugLibraries>false</UseDebugLibraries>\r\n    <PlatformToolset>v143</PlatformToolset>\r\n    <WholeProgramOptimization>true</WholeProgramOptimization>\r\n    
<CharacterSet>Unicode</CharacterSet>\r\n  </PropertyGroup>\r\n  <PropertyGroup Condition=\"'$(Configuration)|$(Platform)'=='Debug|x64'\" Label=\"Configuration\">\r\n    <ConfigurationType>Application</ConfigurationType>\r\n    <UseDebugLibraries>true</UseDebugLibraries>\r\n    <PlatformToolset>v143</PlatformToolset>\r\n    <CharacterSet>Unicode</CharacterSet>\r\n  </PropertyGroup>\r\n  <PropertyGroup Condition=\"'$(Configuration)|$(Platform)'=='Release|x64'\" Label=\"Configuration\">\r\n    <ConfigurationType>Application</ConfigurationType>\r\n    <UseDebugLibraries>false</UseDebugLibraries>\r\n    <PlatformToolset>v143</PlatformToolset>\r\n    <WholeProgramOptimization>true</WholeProgramOptimization>\r\n    <CharacterSet>Unicode</CharacterSet>\r\n  </PropertyGroup>\r\n  <Import Project=\"$(VCTargetsPath)\\Microsoft.Cpp.props\" />\r\n  <ImportGroup Label=\"ExtensionSettings\">\r\n  </ImportGroup>\r\n  <ImportGroup Label=\"Shared\">\r\n  </ImportGroup>\r\n  <ImportGroup Label=\"PropertySheets\" Condition=\"'$(Configuration)|$(Platform)'=='Debug|Win32'\">\r\n    <Import Project=\"$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props\" Condition=\"exists('$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props')\" Label=\"LocalAppDataPlatform\" />\r\n  </ImportGroup>\r\n  <ImportGroup Label=\"PropertySheets\" Condition=\"'$(Configuration)|$(Platform)'=='Release|Win32'\">\r\n    <Import Project=\"$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props\" Condition=\"exists('$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props')\" Label=\"LocalAppDataPlatform\" />\r\n  </ImportGroup>\r\n  <ImportGroup Label=\"PropertySheets\" Condition=\"'$(Configuration)|$(Platform)'=='Debug|x64'\">\r\n    <Import Project=\"$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props\" Condition=\"exists('$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props')\" Label=\"LocalAppDataPlatform\" />\r\n  </ImportGroup>\r\n  <ImportGroup Label=\"PropertySheets\" 
Condition=\"'$(Configuration)|$(Platform)'=='Release|x64'\">\r\n    <Import Project=\"$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props\" Condition=\"exists('$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props')\" Label=\"LocalAppDataPlatform\" />\r\n  </ImportGroup>\r\n  <PropertyGroup Label=\"UserMacros\" />\r\n  <PropertyGroup Condition=\"'$(Configuration)|$(Platform)'=='Debug|Win32'\">\r\n    <LinkIncremental>true</LinkIncremental>\r\n  </PropertyGroup>\r\n  <PropertyGroup Condition=\"'$(Configuration)|$(Platform)'=='Release|Win32'\">\r\n    <LinkIncremental>false</LinkIncremental>\r\n  </PropertyGroup>\r\n  <PropertyGroup Condition=\"'$(Configuration)|$(Platform)'=='Debug|x64'\">\r\n    <LinkIncremental>true</LinkIncremental>\r\n  </PropertyGroup>\r\n  <PropertyGroup Condition=\"'$(Configuration)|$(Platform)'=='Release|x64'\">\r\n    <LinkIncremental>false</LinkIncremental>\r\n  </PropertyGroup>\r\n  <ItemDefinitionGroup Condition=\"'$(Configuration)|$(Platform)'=='Debug|Win32'\">\r\n    <ClCompile>\r\n      <WarningLevel>Level3</WarningLevel>\r\n      <SDLCheck>true</SDLCheck>\r\n      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>\r\n      <ConformanceMode>true</ConformanceMode>\r\n    </ClCompile>\r\n    <Link>\r\n      <SubSystem>Console</SubSystem>\r\n      <GenerateDebugInformation>true</GenerateDebugInformation>\r\n    </Link>\r\n  </ItemDefinitionGroup>\r\n  <ItemDefinitionGroup Condition=\"'$(Configuration)|$(Platform)'=='Release|Win32'\">\r\n    <ClCompile>\r\n      <WarningLevel>Level3</WarningLevel>\r\n      <FunctionLevelLinking>true</FunctionLevelLinking>\r\n      <IntrinsicFunctions>true</IntrinsicFunctions>\r\n      <SDLCheck>true</SDLCheck>\r\n      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>\r\n      <ConformanceMode>true</ConformanceMode>\r\n    </ClCompile>\r\n    <Link>\r\n      <SubSystem>Console</SubSystem>\r\n      
<EnableCOMDATFolding>true</EnableCOMDATFolding>\r\n      <OptimizeReferences>true</OptimizeReferences>\r\n      <GenerateDebugInformation>true</GenerateDebugInformation>\r\n    </Link>\r\n  </ItemDefinitionGroup>\r\n  <ItemDefinitionGroup Condition=\"'$(Configuration)|$(Platform)'=='Debug|x64'\">\r\n    <ClCompile>\r\n      <WarningLevel>Level3</WarningLevel>\r\n      <SDLCheck>true</SDLCheck>\r\n      <PreprocessorDefinitions>_CRT_SECURE_NO_WARNINGS;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>\r\n      <ConformanceMode>true</ConformanceMode>\r\n      <AdditionalIncludeDirectories>$(SolutionDir)\\OpenCL\\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>\r\n    </ClCompile>\r\n    <Link>\r\n      <SubSystem>Console</SubSystem>\r\n      <GenerateDebugInformation>true</GenerateDebugInformation>\r\n      <AdditionalLibraryDirectories>$(SolutionDir)\\OpenCL\\lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>\r\n      <AdditionalDependencies>OpenCL.lib;%(AdditionalDependencies)</AdditionalDependencies>\r\n    </Link>\r\n  </ItemDefinitionGroup>\r\n  <ItemDefinitionGroup Condition=\"'$(Configuration)|$(Platform)'=='Release|x64'\">\r\n    <ClCompile>\r\n      <WarningLevel>Level3</WarningLevel>\r\n      <FunctionLevelLinking>true</FunctionLevelLinking>\r\n      <IntrinsicFunctions>true</IntrinsicFunctions>\r\n      <SDLCheck>true</SDLCheck>\r\n      <PreprocessorDefinitions>_CRT_SECURE_NO_WARNINGS;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>\r\n      <ConformanceMode>true</ConformanceMode>\r\n      <AdditionalIncludeDirectories>$(SolutionDir)\\OpenCL\\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>\r\n    </ClCompile>\r\n    <Link>\r\n      <SubSystem>Console</SubSystem>\r\n      <EnableCOMDATFolding>true</EnableCOMDATFolding>\r\n      <OptimizeReferences>true</OptimizeReferences>\r\n      <GenerateDebugInformation>true</GenerateDebugInformation>\r\n      
<AdditionalLibraryDirectories>$(SolutionDir)\\OpenCL\\lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>\r\n      <AdditionalDependencies>OpenCL.lib;%(AdditionalDependencies)</AdditionalDependencies>\r\n    </Link>\r\n  </ItemDefinitionGroup>\r\n  <ItemGroup>\r\n    <ClCompile Include=\"..\\common\\timing.c\" />\r\n    <ClCompile Include=\"atomic_test.c\" />\r\n    <ClCompile Include=\"bw_test.c\" />\r\n    <ClCompile Include=\"common.c\" />\r\n    <ClCompile Include=\"instruction_rate.c\" />\r\n    <ClCompile Include=\"texturetest.c\" />\r\n    <CopyFileToFolders Include=\"instruction_rate_fp16_kernel.cl\">\r\n      <ExcludedFromBuild Condition=\"'$(Configuration)|$(Platform)'=='Release|x64'\">false</ExcludedFromBuild>\r\n      <FileType>CppCode</FileType>\r\n    </CopyFileToFolders>\r\n    <ClCompile Include=\"latency_test.c\" />\r\n    <ClCompile Include=\"opencltest.c\" />\r\n  </ItemGroup>\r\n  <ItemGroup>\r\n    <CopyFileToFolders Include=\"kernel.cl\">\r\n      <ExcludedFromBuild Condition=\"'$(Configuration)|$(Platform)'=='Debug|x64'\">false</ExcludedFromBuild>\r\n      <FileType>Document</FileType>\r\n    </CopyFileToFolders>\r\n  </ItemGroup>\r\n  <ItemGroup>\r\n    <ClInclude Include=\"..\\common\\timing.h\" />\r\n    <ClInclude Include=\"opencltest.h\" />\r\n  </ItemGroup>\r\n  <ItemGroup>\r\n    <CopyFileToFolders Include=\"instruction_rate_kernel.cl\">\r\n      <ExcludedFromBuild Condition=\"'$(Configuration)|$(Platform)'=='Release|x64'\">false</ExcludedFromBuild>\r\n      <FileType>Document</FileType>\r\n    </CopyFileToFolders>\r\n  </ItemGroup>\r\n  <ItemGroup>\r\n    <CopyFileToFolders Include=\"instruction_rate_fp64_kernel.cl\">\r\n      <ExcludedFromBuild Condition=\"'$(Configuration)|$(Platform)'=='Release|x64'\">false</ExcludedFromBuild>\r\n      <FileType>Document</FileType>\r\n    </CopyFileToFolders>\r\n  </ItemGroup>\r\n  <ItemGroup>\r\n    <CopyFileToFolders Include=\"local_mem_latency_kernel.cl\">\r\n      
<ExcludedFromBuild Condition=\"'$(Configuration)|$(Platform)'=='Release|x64'\">false</ExcludedFromBuild>\r\n      <FileType>Document</FileType>\r\n    </CopyFileToFolders>\r\n  </ItemGroup>\r\n  <ItemGroup>\r\n    <CopyFileToFolders Include=\"kernels\\atomic_exec_latency_test.cl\">\r\n      <FileType>Document</FileType>\r\n    </CopyFileToFolders>\r\n    <CopyFileToFolders Include=\"kernels\\buffer_bw_test.cl\">\r\n      <FileType>Document</FileType>\r\n    </CopyFileToFolders>\r\n    <CopyFileToFolders Include=\"kernels\\c2c_atomic_exec_latency_test.cl\">\r\n      <FileType>Document</FileType>\r\n    </CopyFileToFolders>\r\n    <CopyFileToFolders Include=\"kernels\\constant_unrolled_latency_test.cl\">\r\n      <FileType>Document</FileType>\r\n    </CopyFileToFolders>\r\n    <CopyFileToFolders Include=\"kernels\\local_64_bw_test.cl\">\r\n      <FileType>Document</FileType>\r\n    </CopyFileToFolders>\r\n    <CopyFileToFolders Include=\"kernels\\local_atomic_latency_test.cl\">\r\n      <FileType>Document</FileType>\r\n    </CopyFileToFolders>\r\n    <CopyFileToFolders Include=\"kernels\\local_bw_test.cl\">\r\n      <FileType>Document</FileType>\r\n    </CopyFileToFolders>\r\n    <CopyFileToFolders Include=\"kernels\\ldst_bw_test.cl\">\r\n      <FileType>Document</FileType>\r\n    </CopyFileToFolders>\r\n    <CopyFileToFolders Include=\"kernels\\local_float4_bw_test.cl\">\r\n      <FileType>Document</FileType>\r\n    </CopyFileToFolders>\r\n    <CopyFileToFolders Include=\"kernels\\local_unrolled_latency_test.cl\">\r\n      <FileType>Document</FileType>\r\n    </CopyFileToFolders>\r\n    <CopyFileToFolders Include=\"kernels\\scalar_unrolled_latency_test.cl\">\r\n      <FileType>Document</FileType>\r\n    </CopyFileToFolders>\r\n    <None Include=\"kernels\\sum_bw_test.cl\" />\r\n    <CopyFileToFolders Include=\"kernels\\tex_bw_test.cl\">\r\n      <FileType>Document</FileType>\r\n    </CopyFileToFolders>\r\n    <CopyFileToFolders 
Include=\"kernels\\tex_latency_test.cl\">\r\n      <FileType>Document</FileType>\r\n    </CopyFileToFolders>\r\n    <CopyFileToFolders Include=\"kernels\\unrolled_latency_test.cl\">\r\n      <FileType>Document</FileType>\r\n    </CopyFileToFolders>\r\n  </ItemGroup>\r\n  <Import Project=\"$(VCTargetsPath)\\Microsoft.Cpp.targets\" />\r\n  <ImportGroup Label=\"ExtensionTargets\">\r\n  </ImportGroup>\r\n</Project>"
  },
  {
    "path": "GpuMemLatency/opencltest.vcxproj.filters",
    "content": "﻿<?xml version=\"1.0\" encoding=\"utf-8\"?>\r\n<Project ToolsVersion=\"4.0\" xmlns=\"http://schemas.microsoft.com/developer/msbuild/2003\">\r\n  <ItemGroup>\r\n    <ClCompile Include=\"..\\common\\timing.c\" />\r\n    <ClCompile Include=\"atomic_test.c\" />\r\n    <ClCompile Include=\"bw_test.c\" />\r\n    <ClCompile Include=\"common.c\" />\r\n    <ClCompile Include=\"instruction_rate.c\" />\r\n    <ClCompile Include=\"texturetest.c\" />\r\n    <ClCompile Include=\"latency_test.c\" />\r\n    <ClCompile Include=\"opencltest.c\" />\r\n  </ItemGroup>\r\n  <ItemGroup>\r\n    <ClInclude Include=\"..\\common\\timing.h\" />\r\n    <ClInclude Include=\"opencltest.h\" />\r\n  </ItemGroup>\r\n  <ItemGroup>\r\n    <CopyFileToFolders Include=\"instruction_rate_fp16_kernel.cl\" />\r\n    <CopyFileToFolders Include=\"kernel.cl\" />\r\n    <CopyFileToFolders Include=\"instruction_rate_kernel.cl\" />\r\n    <CopyFileToFolders Include=\"instruction_rate_fp64_kernel.cl\" />\r\n    <CopyFileToFolders Include=\"local_mem_latency_kernel.cl\" />\r\n    <CopyFileToFolders Include=\"kernels\\atomic_exec_latency_test.cl\" />\r\n    <CopyFileToFolders Include=\"kernels\\buffer_bw_test.cl\" />\r\n    <CopyFileToFolders Include=\"kernels\\c2c_atomic_exec_latency_test.cl\" />\r\n    <CopyFileToFolders Include=\"kernels\\constant_unrolled_latency_test.cl\" />\r\n    <CopyFileToFolders Include=\"kernels\\local_64_bw_test.cl\" />\r\n    <CopyFileToFolders Include=\"kernels\\local_atomic_latency_test.cl\" />\r\n    <CopyFileToFolders Include=\"kernels\\tex_bw_test.cl\" />\r\n    <CopyFileToFolders Include=\"kernels\\local_bw_test.cl\" />\r\n    <CopyFileToFolders Include=\"kernels\\local_float4_bw_test.cl\" />\r\n    <CopyFileToFolders Include=\"kernels\\tex_latency_test.cl\" />\r\n  </ItemGroup>\r\n  <ItemGroup>\r\n    <None Include=\"kernels\\local_unrolled_latency_test.cl\" />\r\n    <None Include=\"kernels\\scalar_unrolled_latency_test.cl\" />\r\n    <None 
Include=\"kernels\\sum_bw_test.cl\" />\r\n    <None Include=\"kernels\\unrolled_latency_test.cl\" />\r\n  </ItemGroup>\r\n</Project>"
  },
  {
    "path": "GpuMemLatency/texturetest.c",
    "content": "#include \"opencltest.h\"\r\n\r\n"
  },
  {
    "path": "InstructionRate/Makefile",
    "content": "include ../Common/arch_detect.mk\n\nCFLAGS = -O3\n\nall: $(TARGET)\n\namd64:\n\t$(CC) $(CFLAGS) x86_instructionrate.s x86_instructionrate.c -o InstructionRate_amd64 $(LDFLAGS)\n\naarch64:\n\t$(CC) $(CFLAGS) -march=native -pthread arm_instructionrate.s arm_instructionrate.c -o InstructionRate_aarch64 $(LDFLAGS)\n\nriscv64:\n\t$(CC) $(CFLAGS) -march=rv64gc -pthread riscv_instructionrate.s riscv_instructionrate.c -o InstructionRate_riscv64 $(LDFLAGS)\n\ntermux:\n\tclang -march=armv8+aes arm_instructionrate.s arm_instructionrate.c -o InstructionRate_aarch64 $(LDFLAGS)\n\namd64_fusion:\n\t$(CC) $(CFLAGS) x86_fusion.s x86_fusion.c -o InstructionRateFusion_amd64 $(LDFLAGS)\n\nw64:\n\t$(CC) $(CFLAGS) x86_instructionrate.c x86_instructionrate.s -o InstructionRate_w64.exe $(LDFLAGS)\n\nci: amd64 amd64_fusion aarch64 riscv64 w64\n\nclean:\n\trm -f *.o && find . -type f -executable -delete\n\n.PHONY: all ci clean\n"
  },
  {
    "path": "InstructionRate/arm_instructionrate.c",
    "content": "#define  _GNU_SOURCE\n#include <stdio.h>\n#include <sys/time.h>\n#include <time.h>\n#include <stdint.h>\n#include <stdlib.h>\n#include <sys/types.h>\n#include <unistd.h>\n#include <sched.h>\n#include <pthread.h>\n#include <string.h>\n\nextern uint64_t noptest(uint64_t iterations);\nextern uint64_t clktest(uint64_t iterations);\n\nextern uint64_t addtest(uint64_t iterations);\nextern uint64_t eortest(uint64_t iterations);\nextern uint64_t maddaddtest(uint64_t iterations);\nextern uint64_t cmptest(uint64_t iterations);\nextern uint64_t addmultest(uint64_t iterations);\nextern uint64_t addmul21test(uint64_t iterations);\nextern uint64_t mul32test(uint64_t iterations);\nextern uint64_t mul64test(uint64_t iterations);\nextern uint64_t latmul64test(uint64_t iterations);\nextern uint64_t jmptest(uint64_t iterations);\nextern uint64_t fusejmptest(uint64_t iterations);\nextern uint64_t mixmuljmptest(uint64_t iterations);\nextern uint64_t mixmuljmptest21(uint64_t iterations);\nextern uint64_t mixaddjmptest(uint64_t iterations);\nextern uint64_t mixaddjmp21test(uint64_t iterations);\nextern uint64_t rortest(uint64_t iterations);\nextern uint64_t mixmulrortest(uint64_t iterations);\nextern uint64_t vecadd128test(uint64_t iterations, int arr[4]);\nextern uint64_t latvecadd128test(uint64_t iterations, int arr[4]);\nextern uint64_t vecmul128test(uint64_t iterations, int arr[4]);\nextern uint64_t latvecmul128test(uint64_t iterations, int arr[4]);\nextern uint64_t mixvecaddmul128test(uint64_t iterations, int arr[4]);\nextern uint64_t faddtest(uint64_t iterations, float arr[4]);\nextern uint64_t latfaddtest(uint64_t iterations, float arr[4]);\nextern uint64_t vecfadd128test(uint64_t iterations, float arr[4]);\nextern uint64_t vecfmul128test(uint64_t iterations, float arr[4]);\nextern uint64_t latvecfadd128test(uint64_t iterations, float arr[4]);\nextern uint64_t latvecfmul128test(uint64_t iterations, float arr[4]);\nextern uint64_t mixvecfaddfmul128test(uint64_t 
iterations, float arr[4]);\nextern uint64_t vecfma128test(uint64_t iterations, float arr[4]);\nextern uint64_t scalarfmatest(uint64_t iterations, float arr[4]);\nextern uint64_t latvecfma128test(uint64_t iterations, float arr[4]);\nextern uint64_t latscalarfmatest(uint64_t iterations, float arr[4]);\nextern uint64_t mixvecfaddfma128test(uint64_t iterations, float arr[4]);\nextern uint64_t mixvecfmulfma128test(uint64_t iterations, float arr[4]);\n\n// see if SIMD pipeline shares ports with scalar ALU ones\nextern uint64_t mixaddvecadd128test(uint64_t iterations, int arr[4]);\nextern uint64_t mix3to1addvecadd128test(uint64_t iterations, int arr[4]);\nextern uint64_t mix1to1addvecadd128test(uint64_t iterations, int arr[4]);\nextern uint64_t mixmulvecmultest(uint64_t iterations, int arr[4]);\n\n// are vec int and vec fp on the same port?\nextern uint64_t mixvecmulfmultest(uint64_t iterations, float farr[4], int iarr[4]);\nextern uint64_t mixvecaddfaddtest(uint64_t iterations, float farr[4], int iarr[4]);\n\n// where are the branch ports\nextern uint64_t mixjmpvecaddtest(uint64_t iterations, int arr[4]);\nextern uint64_t mixjmpvecmultest(uint64_t iterations, int arr[4]);\n\n// load/store\nextern uint64_t loadtest(uint64_t iterations, int arr[4]);\nextern uint64_t mixloadstoretest(uint64_t iterations, int arr[4], int sink[4]);\nextern uint64_t mix21loadstoretest(uint64_t iterations, int arr[4], int sink[4]);\nextern uint64_t vecloadtest(uint64_t iterations, int arr[4]);\nextern uint64_t vecstoretest(uint64_t iterations, int arr[4], int sink[4]);\n\n// renamer tests\nextern uint64_t indepmovtest(uint64_t iterations);\nextern uint64_t depmovtest(uint64_t iterations);\nextern uint64_t xorzerotest(uint64_t iterations);\nextern uint64_t movzerotest(uint64_t iterations);\nextern uint64_t subzerotest(uint64_t iterations);\n\n// Is crypto separate\nextern uint64_t aesetest(uint64_t iterations, int arr[4]);\nextern uint64_t mixaesevecadd128test(uint64_t iterations, int 
arr[4]);\nextern uint64_t pmulltest(uint64_t iterations, int arr[4]);\nextern uint64_t mixpmulladd128test(uint64_t iterations, int arr[4]);\n\nfloat fpTestArr[4] __attribute__ ((aligned (64))) = { 0.2, 1.5, 2.7, 3.14 };\nint intTestArr[4] __attribute__ ((aligned (64))) = { 1, 2, 3, 4 };\nint sinkArr[4] __attribute__ ((aligned (64))) = { 2, 3, 4, 5 };\n\nfloat measureFunction(uint64_t iterations, float clockSpeedGhz, uint64_t (*testfunc)(uint64_t));\nuint64_t vecadd128wrapper(uint64_t iterations);\nuint64_t latvecadd128wrapper(uint64_t iterations);\nuint64_t vecmul128wrapper(uint64_t iterations);\nuint64_t latvecmul128wrapper(uint64_t iterations);\nuint64_t mixvecaddmul128wrapper(uint64_t iterations);\nuint64_t faddwrapper(uint64_t iterations);\nuint64_t latfaddwrapper(uint64_t iterations);\nuint64_t vecfadd128wrapper(uint64_t iterations);\nuint64_t latvecfadd128wrapper(uint64_t iterations);\nuint64_t vecfmul128wrapper(uint64_t iterations);\nuint64_t latvecfmul128wrapper(uint64_t iterations);\nuint64_t mixvecfaddfmul128wrapper(uint64_t iterations);\nuint64_t mixaddvecadd128wrapper(uint64_t iterations);\nuint64_t mix3to1addvecadd128wrapper(uint64_t iterations);\nuint64_t mix1to1addvecadd128wrapper(uint64_t iterations);\nuint64_t mixmulvecmulwrapper(uint64_t iterations);\nuint64_t mixvecmulfmulwrapper(uint64_t iterations);\nuint64_t mixvecaddfaddwrapper(uint64_t iterations);\nuint64_t mixjmpvecaddwrapper(uint64_t iterations);\nuint64_t mixjmpvecmulwrapper(uint64_t iterations);\nuint64_t vecloadwrapper(uint64_t iterations);\nuint64_t loadwrapper(uint64_t iterations);\nuint64_t vecstorewrapper(uint64_t iterations);\nuint64_t mixloadstorewrapper(uint64_t iterations);\nuint64_t mix21loadstorewrapper(uint64_t iterations);\nuint64_t vecfma128wrapper(uint64_t iterations);\nuint64_t scalarfmawrapper(uint64_t iterations);\nuint64_t latscalarfmawrapper(uint64_t iterations);\nuint64_t mixvecfaddfma128wrapper(uint64_t iterations);\nuint64_t mixvecfmulfma128wrapper(uint64_t 
iterations);\nuint64_t latvecfma128wrapper(uint64_t iteration);\nuint64_t aesetestwrapper(uint64_t iterations);\nuint64_t mixaesevecadd128wrapper(uint64_t iterations);\nuint64_t pmullwrapper(uint64_t iterations);\nuint64_t mixpmulladd128wrapper(uint64_t iterations);\n\nint threads = 0, hardaffinity = 0;\ncpu_set_t cpuset;\n\nint main(int argc, char *argv[]) {\n  struct timeval startTv, endTv;\n  struct timezone startTz, endTz;\n  uint64_t iterations = 1500000000;\n  uint64_t iterationsHigh = iterations * 5;\n  uint64_t time_diff_ms;\n  float latency, opsPerNs, clockSpeedGhz;\n  \n  if (argc > 1) {\n    for (int argIdx = 1; argIdx < argc; argIdx++) {\n      if (*(argv[argIdx]) == '-') {\n        char *arg = argv[argIdx] + 1;\n\tif (strncmp(arg, \"affinity\", 8) == 0) {\n\t  argIdx++;\n\t  int targetCpu = atoi(argv[argIdx]);\n          CPU_ZERO(&cpuset);\n          CPU_SET(targetCpu, &cpuset);\n          sched_setaffinity(gettid(), sizeof(cpu_set_t), &cpuset); \n\t  fprintf(stderr, \"Set affinity to %d\\n\", targetCpu);\n\t}\n        else if (strncmp(arg, \"hardaffinity\", 12) == 0) {\n          CPU_ZERO(&cpuset);\n          CPU_SET(0, &cpuset);\n          CPU_SET(1, &cpuset);\n          sched_setaffinity(gettid(), sizeof(cpu_set_t), &cpuset); \n\t  fprintf(stderr, \"Set affinity 2,3\\n\"); \n          hardaffinity = 1;\n        }\n\telse if (strncmp(arg, \"threads\", 7) == 0) {\n\t  argIdx++;\n\t  threads = atoi(argv[argIdx]);\n\t  fprintf(stderr, \"Multithreading mode, %d threads\\n\", threads);\n\t}\n\telse if (strncmp(arg, \"iter\", 4) == 0) {\n\t  argIdx++;\n\t  int iterMul = atoi(argv[argIdx]);\n\t  iterations *= iterMul;\n\t  iterationsHigh *= iterMul;\n\t  fprintf(stderr, \"Scaled iterations by %d\\n\", iterMul);\n\t}\n      }\n    }\n  }\n\n  // figure out clock speed\n  gettimeofday(&startTv, &startTz);\n  clktest(iterations);\n  gettimeofday(&endTv, &endTz);\n  time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 
1000);\n  latency = 1e6 * (float)time_diff_ms / (float)iterations;\n  // clk speed should be 1/latency, assuming we got one add per clk, roughly\n  clockSpeedGhz = 1/latency;\n  printf(\"Estimated clock speed> %.2f GHz\\n\", clockSpeedGhz);\n\n  printf(\"Nops per clk> %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, noptest));\n  printf(\"Adds per clk> %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, addtest));\n  printf(\"XORs per clk> %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, eortest));\n  printf(\"CMPs per clk> %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, cmptest));\n  \n  printf(\"\\n----Renamer Tests----\\n\");\n  printf(\"Indepdent movs per clk> %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, indepmovtest));\n  printf(\"Dependent movs per clk> %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, depmovtest));\n  printf(\"eor -> 0 per clk> %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, xorzerotest));\n  printf(\"mov -> 0 per clk> %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, movzerotest));\n  printf(\"sub -> 0 per clk> %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, subzerotest));\n\n  printf(\"\\n----ALU Pipe Layout Tests----\\n\");\n  printf(\"Not taken jmps per clk> %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, jmptest));\n  printf(\"Jump fusion test> %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, fusejmptest));\n  printf(\"1:1 mixed not taken jmps / muls per clk> %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, mixmuljmptest));\n  printf(\"1:2 mixed not taken jmps / muls per clk> %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, mixmuljmptest21));\n  printf(\"1:1 mixed not taken jmps / adds per clk> %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, mixaddjmptest));\n  printf(\"1:2 mixed not taken jmps / adds per clk> %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, mixaddjmp21test));\n  printf(\"1:1 mixed 
add/mul per clk> %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, addmultest));\n  printf(\"2:1 mixed add/mul per clk> %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, addmul21test));\n  printf(\"ror per clk> %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, rortest));\n  printf(\"1:1 mixed mul/ror per clk> %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, mixmulrortest));\n  printf(\"1:3 madd:add per clk> %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, maddaddtest));\n  printf(\"32-bit mul per clk> %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, mul32test));\n  printf(\"64-bit mul per clk> %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, mul32test));\n  printf(\"64-bit multiply latency> %.2f clocks\\n\", 1 / measureFunction(iterations, clockSpeedGhz, latmul64test));\n\n  printf(\"\\n----FP/ASIMD Crypto Tests----\\n\");\n  printf(\"aese per clk> %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, aesetestwrapper));\n  printf(\"1:1 aese and vec 128 add per clk> %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, mixaesevecadd128wrapper));\n  printf(\"pmull per clk> %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, pmullwrapper));\n  printf(\"1:1 pmull and vec 128 add per clk> %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, mixpmulladd128wrapper));\n\n  printf (\"\\n----FP/ASIMD Tests----\\n\");\n  printf(\"scalar fp32 add per clk> %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, faddwrapper));\n  printf(\"128-bit vec int32 add per clk> %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, vecadd128wrapper));\n  printf(\"128-bit vec int32 multiply per clk> %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, vecmul128wrapper));\n  printf(\"128-bit vec int32 mixed multiply and add per clk> %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, mixvecaddmul128wrapper));\n  printf(\"128-bit vec fp32 add per clk> %.2f\\n\", measureFunction(iterationsHigh, 
clockSpeedGhz, vecfadd128wrapper));\n  printf(\"128-bit vec fp32 multiply per clk> %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, vecfmul128wrapper));\n  printf(\"128-bit vec fp32 mixed multiply and add per clk> %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, mixvecfaddfmul128wrapper));\n  printf(\"2:1 mixed scalar adds and 128-bit vec int32 add per clk> %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, mixaddvecadd128wrapper));\n  printf(\"3:1 mixed scalar adds and 128-bit vec int32 add per clk> %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, mix3to1addvecadd128wrapper));\n  printf(\"1:1 mixed scalar adds and 128-bit vec int32 add per clk> %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, mix1to1addvecadd128wrapper));\n  printf(\"1:1 mixed scalar 32-bit multiply and 128-bit vec int32 multiply per clk> %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, mixmulvecmulwrapper));\n  printf(\"1:1 mixed 128-bit vec fp32 multiply and 128-bit vec int32 multiply per clk> %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, mixvecmulfmulwrapper));\n  printf(\"1:1 mixed 128-bit vec fp32 add and 128-bit vec int32 add per clk> %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, mixvecaddfaddwrapper));\n  printf(\"1:2 mixed not taken jumps and 128-bit vec int32 add per clk> %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, mixjmpvecaddwrapper));\n  printf(\"1:1 mixed not taken jumps and 128-bit vec int32 mul per clk> %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, mixjmpvecmulwrapper));\n  printf(\"128-bit vec int32 add latency> %.2f clocks\\n\", 1 / measureFunction(iterations, clockSpeedGhz, latvecadd128wrapper));\n  printf(\"128-bit vec int32 mul latency> %.2f clocks\\n\", 1 / measureFunction(iterations, clockSpeedGhz, latvecmul128wrapper));\n  printf(\"Scalar FADD Latency> %.2f clocks\\n\", 1 / measureFunction(iterationsHigh, clockSpeedGhz, latfaddwrapper));\n  printf(\"128-bit vector FADD 
latency> %.2f clocks\\n\", 1 / measureFunction(iterations, clockSpeedGhz, latvecfadd128wrapper));\n  printf(\"128-bit vector FMUL latency> %.2f clocks\\n\", 1 / measureFunction(iterations, clockSpeedGhz, latvecfmul128wrapper));\n  printf(\"128-bit vector FMA per clk> %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, vecfma128wrapper));\n  printf(\"128-bit vector FMA latency> %.2f clocks\\n\", 1 / measureFunction(iterations, clockSpeedGhz, latvecfma128wrapper));\n  printf(\"Scalar FMA per clk> %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, scalarfmawrapper));\n  printf(\"Scalar FMA latency> %.2f clocks\\n\", 1 / measureFunction(iterationsHigh, clockSpeedGhz, latscalarfmawrapper));\n  printf(\"1:1 mixed 128-bit vector FMA/FADD per clk> %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, mixvecfaddfma128wrapper));\n  printf(\"1:1 mixed 128-bit vector FMA/FMUL per clk> %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, mixvecfmulfma128wrapper)); \n  \n  printf(\"\\n----Load/Store Tests----\\n\");\n  printf(\"128-bit vec loads per clk> %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, vecloadwrapper));\n  printf(\"128-bit vec stores per clk> %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, vecstorewrapper));\n  printf(\"64-bit loads per clk> %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, loadwrapper));\n  printf(\"1:1 mixed 64-bit loads/stores per clk> %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, mixloadstorewrapper));\n  printf(\"2:1 mixed 64-bit loads/stores per clk> %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, mix21loadstorewrapper));\n\n\n  return 0;\n}\n\nstruct TestThreadData {\n  uint64_t iterations;\n  uint64_t (*testfunc)(uint64_t);\n};\n\nvoid *TestThread(void *param) {\n  struct TestThreadData *testData = (struct TestThreadData *)param;\n  if (hardaffinity) {\n    sched_setaffinity(gettid(), sizeof(cpu_set_t), &cpuset); \n  }\n\n  testData->testfunc(testData->iterations);\n  
return NULL;\n}\n\nfloat measureFunction(uint64_t iterations, float clockSpeedGhz, uint64_t (*testfunc)(uint64_t)) {\n  struct timeval startTv, endTv;\n  struct timezone startTz, endTz;\n  uint64_t time_diff_ms;\n  float latency, opsPerNs;\n  \n  gettimeofday(&startTv, &startTz);\n  if (threads == 0) testfunc(iterations);\n  else {\n    pthread_t *testThreads = (pthread_t *)malloc(threads * sizeof(pthread_t));\n    struct TestThreadData *testData = (struct TestThreadData *)malloc(threads * sizeof(struct TestThreadData));\n    for (int threadIdx = 0; threadIdx < threads; threadIdx++) {\n      testData[threadIdx].iterations = iterations;\n      testData[threadIdx].testfunc = testfunc;\n      pthread_create(testThreads + threadIdx, NULL, TestThread, testData + threadIdx);\n    }\n\n    for (int threadIdx = 0; threadIdx < threads; threadIdx++) {\n      pthread_join(testThreads[threadIdx], NULL);\n    }\n\n    free(testThreads);\n    free(testData);\n  }\n  gettimeofday(&endTv, &endTz);\n  time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 1000);\n  latency = 1e6 * (float)time_diff_ms / (float)iterations;\n  opsPerNs = 1/latency;\n  //printf(\"%f adds/ns, %f adds/clk?\\n\", opsPerNs, opsPerNs / clockSpeedGhz);\n  return opsPerNs / clockSpeedGhz;\n}\n\nuint64_t vecadd128wrapper(uint64_t iterations) {\n  return vecadd128test(iterations, intTestArr);\n}\n\nuint64_t vecmul128wrapper(uint64_t iterations) {\n  return vecmul128test(iterations, intTestArr);\n}\n\nuint64_t latvecadd128wrapper(uint64_t iterations) {\n  return latvecadd128test(iterations, intTestArr);\n}\n\nuint64_t latvecmul128wrapper(uint64_t iterations) {\n  return latvecmul128test(iterations, intTestArr);\n}\n\nuint64_t mixvecaddmul128wrapper(uint64_t iterations) {\n  return mixvecaddmul128test(iterations, intTestArr);\n}\n\nuint64_t faddwrapper(uint64_t iterations) {\n  return faddtest(iterations, fpTestArr);\n}\n\nuint64_t latfaddwrapper(uint64_t iterations) {\n  
return latfaddtest(iterations, fpTestArr);\n}\n\nuint64_t latvecfadd128wrapper(uint64_t iterations) {\n  return latvecfadd128test(iterations, fpTestArr);\n}\n\nuint64_t latvecfmul128wrapper(uint64_t iterations) {\n  return latvecfmul128test(iterations, fpTestArr);\n}\n\nuint64_t vecfadd128wrapper(uint64_t iterations) {\n  return vecfadd128test(iterations, fpTestArr);\n}\n\nuint64_t vecfmul128wrapper(uint64_t iterations) {\n  return vecfmul128test(iterations, fpTestArr);\n}\nuint64_t mixvecfaddfmul128wrapper(uint64_t iterations) {\n  return mixvecfaddfmul128test(iterations, fpTestArr);\n}\n\nuint64_t mixaddvecadd128wrapper(uint64_t iterations) {\n  return mixaddvecadd128test(iterations, intTestArr);\n}\n\nuint64_t mix3to1addvecadd128wrapper(uint64_t iterations) {\n  return mix3to1addvecadd128test(iterations, intTestArr);\n}\n\nuint64_t mix1to1addvecadd128wrapper(uint64_t iterations) {\n  return mix1to1addvecadd128test(iterations, intTestArr);\n}\n\nuint64_t mixmulvecmulwrapper(uint64_t iterations) {\n  return mixmulvecmultest(iterations, intTestArr);\n}\n\nuint64_t mixvecmulfmulwrapper(uint64_t iterations) {\n  return mixvecmulfmultest(iterations, fpTestArr, intTestArr);\n}\n\nuint64_t mixvecaddfaddwrapper(uint64_t iterations) {\n  return mixvecaddfaddtest(iterations, fpTestArr, intTestArr);\n}\n\nuint64_t mixjmpvecaddwrapper(uint64_t iterations) {\n  return mixjmpvecaddtest(iterations, intTestArr);\n}\n\nuint64_t mixjmpvecmulwrapper(uint64_t iterations) {\n  return mixjmpvecmultest(iterations, intTestArr);\n}\n\nuint64_t vecloadwrapper(uint64_t iterations) {\n  return vecloadtest(iterations, intTestArr);\n}\n\nuint64_t vecstorewrapper(uint64_t iterations) {\n  return vecstoretest(iterations, intTestArr, sinkArr);\n}\n\nuint64_t loadwrapper(uint64_t iterations) {\n  if (((uint64_t)intTestArr & 63) != 0) {\n    printf(\"Warning - load may not be 64B aligned\\n\");\n  }\n\n  return loadtest(iterations, intTestArr);\n}\n\nuint64_t mixloadstorewrapper(uint64_t 
iterations) {\n  return mixloadstoretest(iterations, intTestArr, sinkArr);\n}\n\nuint64_t mix21loadstorewrapper(uint64_t iterations) {\n  return mix21loadstoretest(iterations, intTestArr, sinkArr);\n}\n\nuint64_t vecfma128wrapper(uint64_t iterations) {\n  return vecfma128test(iterations, fpTestArr);\n}\n\nuint64_t scalarfmawrapper(uint64_t iterations) {\n  return scalarfmatest(iterations, fpTestArr);\n}\n\nuint64_t latscalarfmawrapper(uint64_t iterations) {\n  return latscalarfmatest(iterations, fpTestArr);\n}\n\nuint64_t latvecfma128wrapper(uint64_t iterations) {\n  return latvecfma128test(iterations, fpTestArr);\n}\n\nuint64_t mixvecfmulfma128wrapper(uint64_t iterations) {\n  return mixvecfmulfma128test(iterations, fpTestArr);\n}\n\nuint64_t mixvecfaddfma128wrapper(uint64_t iterations) {\n  return mixvecfaddfma128test(iterations, fpTestArr);\n}\n\nuint64_t aesetestwrapper(uint64_t iterations) {\n  return aesetest(iterations, intTestArr);\n}\n\nuint64_t mixaesevecadd128wrapper(uint64_t iterations) {\n  return mixaesevecadd128test(iterations, intTestArr);\n}\n\nuint64_t pmullwrapper(uint64_t iterations) {\n  return pmulltest(iterations, intTestArr);\n}\n\nuint64_t mixpmulladd128wrapper(uint64_t iterations) {\n  return mixpmulladd128test(iterations, intTestArr);\n}\n"
  },
  {
    "path": "InstructionRate/arm_instructionrate.s",
    "content": ".text\n\n.global clktest\n.global addtest\n.global eortest\n.global maddaddtest\n.global cmptest\n.global addmultest\n.global addmul21test\n.global mixaddjmp21test\n.global mul32test\n.global mul64test\n.global latmul64test\n.global noptest\n.global fusejmptest\n.global jmptest\n.global mixmuljmptest\n.global mixmuljmptest21\n.global mixaddjmptest\n.global rortest\n.global mixmulrortest\n\n.global _clktest\n.global _addtest\n.global _eortest\n.global _maddaddtest\n.global _cmptest\n.global _addmultest\n.global _addmul21test\n.global _mixaddjmp21test\n.global _mul32test\n.global _mul64test\n.global _latmul64test\n.global _noptest\n.global _fusejmptest\n.global _jmptest\n.global _mixmuljmptest\n.global _mixmuljmptest21\n.global _mixaddjmptest\n.global _rortest\n.global _mixmulrortest\n\n.global vecadd128test\n.global latvecadd128test\n.global vecmul128test\n.global latvecmul128test\n.global mixvecaddmul128test\n.global faddtest\n.global latfaddtest\n.global latfmultest\n.global latvecfadd128test\n.global latvecfmul128test\n.global vecfadd128test\n.global vecfmul128test\n.global mixvecfaddfmul128test\n.global mixaddvecadd128test\n.global mix3to1addvecadd128test\n.global mix1to1addvecadd128test\n.global mixmulvecmultest\n.global mixvecmulfmultest\n.global mixvecaddfaddtest\n.global mixjmpvecaddtest\n.global mixjmpvecmultest\n.global vecfma128test\n.global latvecfma128test\n.global scalarfmatest\n.global latscalarfmatest\n.global aesetest\n.global mixaesevecadd128test\n.global pmulltest\n.global mixpmulladd128test\n\n.global _vecadd128test\n.global _latvecadd128test\n.global _vecmul128test\n.global _latvecmul128test\n.global _mixvecaddmul128test\n.global _faddtest\n.global _latfaddtest\n.global _latfmultest\n.global _latvecfadd128test\n.global _latvecfmul128test\n.global _vecfadd128test\n.global _vecfmul128test\n.global _mixvecfaddfmul128test\n.global _mixaddvecadd128test\n.global _mix3to1addvecadd128test\n.global _mix1to1addvecadd128test\n.global 
_mixmulvecmultest\n.global _mixvecmulfmultest\n.global _mixvecaddfaddtest\n.global _mixjmpvecaddtest\n.global _mixjmpvecmultest\n.global _vecfma128test\n.global _latvecfma128test\n.global _scalarfmatest\n.global _latscalarfmatest\n\n.global mixvecfaddfma128test\n.global mixvecfmulfma128test\n.global loadtest\n.global mixloadstoretest\n.global mix21loadstoretest\n.global vecloadtest\n.global vecstoretest\n\n.global _mixvecfaddfma128test\n.global _mixvecfmulfma128test\n.global _loadtest\n.global _mixloadstoretest\n.global _mix21loadstoretest\n.global _vecloadtest\n.global _vecstoretest\n\n//renamer tests\n.global indepmovtest\n.global depmovtest\n.global xorzerotest\n.global movzerotest\n.global subzerotest\n\n.global _indepmovtest\n.global _depmovtest\n.global _xorzerotest\n.global _movzerotest\n.global _subzerotest\n.global _aesetest\n.global _mixaesevecadd128test\n.global _pmulltest\n.global _mixpmulladd128test\n\n.balign 4\n\n/* x0 = arg = iteration count. Each test subtracts the per-pass constant it loads into x14 (20, 24, 30 or 40 in the tests below) and most loops exit via cbnz only when x0 reaches exactly zero, so the iteration count must be a multiple of every such constant (any multiple of 120 works for the values listed here); a multiple of 10 alone is not sufficient */\n_clktest:\nclktest:\n  sub sp, sp, #0x30\n  stp x14, x15, [sp, #0x10]\n  stp x12, x13, [sp, #0x20]\n  mov x15, 1\n  mov x14, 20\n  eor x13, x13, x13\nclktest_loop:\n  add x13, x13, x15\n  add x13, x13, x15\n  add x13, x13, x15\n  add x13, x13, x15\n  add x13, x13, x15\n  add x13, x13, x15\n  add x13, x13, x15\n  add x13, x13, x15\n  add x13, x13, x15\n  add x13, x13, x15\n  add x13, x13, x15\n  add x13, x13, x15\n  add x13, x13, x15\n  add x13, x13, x15\n  add x13, x13, x15\n  add x13, x13, x15\n  add x13, x13, x15\n  add x13, x13, x15\n  add x13, x13, x15\n  add x13, x13, x15\n  sub x0, x0, x14\n  cbnz x0, clktest_loop\n  ldp x12, x13, [sp, #0x20]\n  ldp x14, x15, [sp, #0x10]\n  add sp, sp, #0x30\n  ret\n\n_noptest:\nnoptest:\n  sub sp, sp, #0x30\n  stp x14, x15, [sp, #0x10]\n  stp x12, x13, [sp, #0x20]\n  mov x15, 1\n  mov x14, 30\n  eor x13, x13, x13\nnoptest_loop:\n  nop\n  nop\n  nop\n  nop\n  nop\n  nop\n  nop\n  nop\n  nop\n  nop\n\n  nop\n  nop\n  nop\n  
nop\n  nop\n  nop\n  nop\n  nop\n  nop\n  nop\n\n  nop\n  nop\n  nop\n  nop\n  nop\n  nop\n  nop\n  nop\n  nop\n  nop\n  sub x0, x0, x14\n  cbnz x0, noptest_loop\n  ldp x12, x13, [sp, #0x20]\n  ldp x14, x15, [sp, #0x10]\n  add sp, sp, #0x30\n  ret\n\n_addtest:\naddtest:\n  sub sp, sp, #0x50\n  stp x14, x15, [sp, #0x10]\n  stp x12, x13, [sp, #0x20]\n  stp x10, x11, [sp, #0x30]\n  stp x8, x9, [sp, #0x40]\n  mov x15, 1\n  mov x14, 30\n  eor x13, x13, x13\n  eor x12, x12, x12\n  eor x11, x11, x11\n  eor x10, x10, x10\n  eor x9, x9, x9\naddtest_loop:\n  add x13, x13, x15\n  add x12, x12, x15\n  add x11, x11, x15\n  add x10, x10, x15\n  add x9, x9, x15\n  add x13, x13, x15\n  add x12, x12, x15\n  add x11, x11, x15\n  add x10, x10, x15\n  add x9, x9, x15\n  add x13, x13, x15\n  add x12, x12, x15\n  add x11, x11, x15\n  add x10, x10, x15\n  add x9, x9, x15\n  add x13, x13, x15\n  add x12, x12, x15\n  add x11, x11, x15\n  add x10, x10, x15\n  add x9, x9, x15\n  add x13, x13, x15\n  add x12, x12, x15\n  add x11, x11, x15\n  add x10, x10, x15\n  add x9, x9, x15\n  add x13, x13, x15\n  add x12, x12, x15\n  add x11, x11, x15\n  add x10, x10, x15\n  add x9, x9, x15\n  sub x0, x0, x14\n  cbnz x0, addtest_loop\n  ldp x8, x9, [sp, #0x40]\n  ldp x10, x11, [sp, #0x30]\n  ldp x12, x13, [sp, #0x20]\n  ldp x14, x15, [sp, #0x10]\n  add sp, sp, #0x50\n  ret \n\n_maddaddtest:\nmaddaddtest:\n  sub sp, sp, #0x50\n  stp x14, x15, [sp, #0x10]\n  stp x12, x13, [sp, #0x20]\n  stp x10, x11, [sp, #0x30]\n  stp x8, x9, [sp, #0x40]\n  mov x15, 1\n  mov x14, 20\n  eor x13, x13, x13\n  eor x12, x12, x12\n  eor x11, x11, x11\n  mov x10, 2\n  eor x9, x9, x9\n  mov x8, 3\nmaddaddtest_loop:\n  add x13, x13, x15\n  add x12, x12, x15\n  add x11, x11, x15\n  madd x10, x8, x0, x15\n  add x13, x13, x15\n  add x12, x12, x15\n  add x11, x11, x15\n  madd x10, x8, x0, x15 \n  add x13, x13, x15\n  add x12, x12, x15\n  add x11, x11, x15\n  madd x10, x8, x0, x15  \n  add x13, x13, x15\n  add x12, x12, x15\n  add x11, 
x11, x15\n  madd x10, x8, x0, x15  \n  add x13, x13, x15\n  add x12, x12, x15\n  add x11, x11, x15\n  madd x10, x8, x0, x15  \n  sub x0, x0, x14\n  cbnz x0, maddaddtest_loop\n  ldp x8, x9, [sp, #0x40]\n  ldp x10, x11, [sp, #0x30]\n  ldp x12, x13, [sp, #0x20]\n  ldp x14, x15, [sp, #0x10]\n  add sp, sp, #0x50\n  ret \n\n_eortest:\neortest:\n  sub sp, sp, #0x50\n  stp x14, x15, [sp, #0x10]\n  stp x12, x13, [sp, #0x20]\n  stp x10, x11, [sp, #0x30]\n  stp x8, x9, [sp, #0x40]\n  mov x15, 1\n  mov x14, 30\n  eor x13, x13, x13\n  eor x12, x12, x12\n  eor x11, x11, x11\n  eor x10, x10, x10\n  eor x9, x9, x9\neortest_loop:\n  eor x13, x13, x15\n  eor x12, x12, x15\n  eor x11, x11, x15\n  eor x10, x10, x15\n  eor x9, x9, x15\n  eor x13, x13, x15\n  eor x12, x12, x15\n  eor x11, x11, x15\n  eor x10, x10, x15\n  eor x9, x9, x15\n  eor x13, x13, x15\n  eor x12, x12, x15\n  eor x11, x11, x15\n  eor x10, x10, x15\n  eor x9, x9, x15\n  eor x13, x13, x15\n  eor x12, x12, x15\n  eor x11, x11, x15\n  eor x10, x10, x15\n  eor x9, x9, x15\n  eor x13, x13, x15\n  eor x12, x12, x15\n  eor x11, x11, x15\n  eor x10, x10, x15\n  eor x9, x9, x15\n  eor x13, x13, x15\n  eor x12, x12, x15\n  eor x11, x11, x15\n  eor x10, x10, x15\n  eor x9, x9, x15\n  sub x0, x0, x14\n  cbnz x0, eortest_loop\n  ldp x8, x9, [sp, #0x40]\n  ldp x10, x11, [sp, #0x30]\n  ldp x12, x13, [sp, #0x20]\n  ldp x14, x15, [sp, #0x10]\n  add sp, sp, #0x50\n  ret\n\n_cmptest:\ncmptest:\n  sub sp, sp, #0x50\n  stp x14, x15, [sp, #0x10]\n  stp x12, x13, [sp, #0x20]\n  stp x10, x11, [sp, #0x30]\n  stp x8, x9, [sp, #0x40]\n  mov x15, 1\n  mov x14, 30\n  eor x13, x13, x13\n  eor x12, x12, x12\n  eor x11, x11, x11\n  eor x10, x10, x10\n  eor x9, x9, x9\ncmptest_loop:\n  cmp x13, x13\n  cmp x12, x12\n  cmp x11, x11\n  cmp x10, x10\n  cmp x9, x9 \n  cmp x13, x13\n  cmp x12, x12\n  cmp x11, x11\n  cmp x10, x10\n  cmp x9, x9 \n  cmp x13, x13\n  cmp x12, x12\n  cmp x11, x11\n  cmp x10, x10\n  cmp x9, x9 \n  cmp x13, x13\n  cmp x12, x12\n 
 cmp x11, x11\n  cmp x10, x10\n  cmp x9, x9 \n  cmp x13, x13\n  cmp x12, x12\n  cmp x11, x11\n  cmp x10, x10\n  cmp x9, x9 \n  cmp x13, x13\n  cmp x12, x12\n  cmp x11, x11\n  cmp x10, x10\n  cmp x9, x9 \n  sub x0, x0, x14\n  cbnz x0, cmptest_loop\n  ldp x8, x9, [sp, #0x40]\n  ldp x10, x11, [sp, #0x30]\n  ldp x12, x13, [sp, #0x20]\n  ldp x14, x15, [sp, #0x10]\n  add sp, sp, #0x50\n  ret \n\n_addmultest:\naddmultest:\n  sub sp, sp, #0x50\n  stp x14, x15, [sp, #0x10]\n  stp x12, x13, [sp, #0x20]\n  stp x10, x11, [sp, #0x30]\n  stp x8, x9, [sp, #0x40]\n  mov x15, 2\n  mov x14, 20\n  mov x13, 2\n  eor x12, x12, x12\n  mov x11, 2\n  eor x10, x10, x10\n  mov x9, 2\n  mov x8, 2\naddmultest_loop:\n  mul w13, w13, w15\n  add x12, x12, x15\n  mul w11, w11, w15\n  add x10, x10, x15\n  mul w9, w9, w15\n  add x12, x12, x15\n  mul w8, w8, w15\n  add x10, x10, x15\n  mul w13, w13, w15\n  add x12, x12, x15\n  mul w11, w11, w15\n  add x10, x10, x15\n  mul w9, w9, w15\n  add x12, x12, x15\n  mul w8, w8, w15\n  add x10, x10, x15\n  mul w13, w13, w15\n  add x12, x12, x15\n  mul w11, w11, w15\n  add x10, x10, x15\n  sub x0, x0, x14\n  cbnz x0, addmultest_loop\n  ldp x8, x9, [sp, #0x40]\n  ldp x10, x11, [sp, #0x30]\n  ldp x12, x13, [sp, #0x20]\n  ldp x14, x15, [sp, #0x10]\n  add sp, sp, #0x50\n  ret\n\n_addmul21test:\naddmul21test:\n  sub sp, sp, #0x50\n  stp x14, x15, [sp, #0x10]\n  stp x12, x13, [sp, #0x20]\n  stp x10, x11, [sp, #0x30]\n  stp x8, x9, [sp, #0x40]\n  mov x15, 2\n  mov x14, 24\n  mov x13, 2\n  eor x12, x12, x12\n  mov x11, 2\n  eor x10, x10, x10\n  mov x9, 2\n  mov x8, 2\naddmul21test_loop:\n  mul w13, w13, w15\n  add x12, x12, x15\n  add x10, x10, x15\n  mul w11, w11, w15\n  add x12, x12, x15\n  add x10, x10, x15\n  mul w9, w9, w15\n  add x12, x12, x15\n  add x10, x10, x15\n  mul w8, w8, w15\n  add x12, x12, x15\n  add x10, x10, x15\n  mul w13, w13, w15\n  add x12, x12, x15\n  add x10, x10, x15\n  mul w11, w11, w15\n  add x12, x12, x15\n  add x10, x10, x15\n  mul w9, w9, 
w15\n  add x12, x12, x15\n  add x10, x10, x15\n  mul w8, w8, w15\n  add x12, x12, x15\n  add x10, x10, x15\n  sub x0, x0, x14\n  cmp x0, 0\n  b.gt addmul21test_loop\n  ldp x8, x9, [sp, #0x40]\n  ldp x10, x11, [sp, #0x30]\n  ldp x12, x13, [sp, #0x20]\n  ldp x14, x15, [sp, #0x10]\n  add sp, sp, #0x50\n  ret \n\n_mul32test:\nmul32test:\n  sub sp, sp, #0x50\n  stp x14, x15, [sp, #0x10]\n  stp x12, x13, [sp, #0x20]\n  stp x10, x11, [sp, #0x30]\n  stp x8, x9, [sp, #0x40]\n  mov x15, 2\n  mov x14, 20\n  mov x13, x15\n  mov x12, x15\n  mov x11, x15\n  mov x10, x15\n  mov x9, x15\n  mov x8, x15\nmul32test_loop:\n  mul w13, w13, w15\n  mul w12, w12, w15\n  mul w11, w11, w15\n  mul w10, w10, w15\n  mul w9, w9, w15\n  mul w8, w8, w15\n  mul w13, w13, w15\n  mul w12, w12, w15\n  mul w11, w11, w15\n  mul w10, w10, w15\n  mul w9, w9, w15\n  mul w8, w8, w15\n  mul w13, w13, w15\n  mul w12, w12, w15\n  mul w11, w11, w15\n  mul w10, w10, w15\n  mul w9, w9, w15\n  mul w8, w8, w15\n  mul w13, w13, w15\n  mul w12, w12, w15\n  sub x0, x0, x14\n  cbnz x0, mul32test_loop\n  ldp x8, x9, [sp, #0x40]\n  ldp x10, x11, [sp, #0x30]\n  ldp x12, x13, [sp, #0x20]\n  ldp x14, x15, [sp, #0x10]\n  add sp, sp, #0x50\n  ret\n\n_mul64test:\nmul64test:\n  sub sp, sp, #0x50\n  stp x14, x15, [sp, #0x10]\n  stp x12, x13, [sp, #0x20]\n  stp x10, x11, [sp, #0x30]\n  stp x8, x9, [sp, #0x40]\n  mov x15, 2\n  mov x14, 20\n  mov x13, x15\n  mov x12, x15\n  mov x11, x15\n  mov x10, x15\n  mov x9, x15\n  mov x8, x15\nmul64test_loop:\n  mul x13, x13, x15\n  mul x12, x12, x15\n  mul x11, x11, x15\n  mul x10, x10, x15\n  mul x9, x9, x15\n  mul x8, x8, x15\n  mul x13, x13, x15\n  mul x12, x12, x15\n  mul x11, x11, x15\n  mul x10, x10, x15\n  mul x9, x9, x15\n  mul x8, x8, x15\n  mul x13, x13, x15\n  mul x12, x12, x15\n  mul x11, x11, x15\n  mul x10, x10, x15\n  mul x9, x9, x15\n  mul x8, x8, x15\n  mul x13, x13, x15\n  mul x12, x12, x15\n  sub x0, x0, x14\n  cbnz x0, mul64test_loop\n  ldp x8, x9, [sp, #0x40]\n  ldp 
x10, x11, [sp, #0x30]\n  ldp x12, x13, [sp, #0x20]\n  ldp x14, x15, [sp, #0x10]\n  add sp, sp, #0x50\n  ret\n\n_latmul64test:\nlatmul64test:\n  sub sp, sp, #0x50\n  stp x14, x15, [sp, #0x10]\n  stp x12, x13, [sp, #0x20]\n  stp x10, x11, [sp, #0x30]\n  stp x8, x9, [sp, #0x40]\n  mov x15, 2\n  mov x14, 20\n  mov x13, x15\nlatmul64test_loop:\n  mul x13, x13, x13\n  mul x13, x13, x13\n  mul x13, x13, x13\n  mul x13, x13, x13\n  mul x13, x13, x13\n  mul x13, x13, x13\n  mul x13, x13, x13\n  mul x13, x13, x13\n  mul x13, x13, x13\n  mul x13, x13, x13\n  mul x13, x13, x13\n  mul x13, x13, x13\n  mul x13, x13, x13\n  mul x13, x13, x13\n  mul x13, x13, x13\n  mul x13, x13, x13\n  mul x13, x13, x13\n  mul x13, x13, x13\n  mul x13, x13, x13\n  mul x13, x13, x13\n  sub x0, x0, x14\n  cbnz x0, latmul64test_loop\n  ldp x8, x9, [sp, #0x40]\n  ldp x10, x11, [sp, #0x30]\n  ldp x12, x13, [sp, #0x20]\n  ldp x14, x15, [sp, #0x10]\n  add sp, sp, #0x50\n  ret\n\n/* needs an additional parameter passed in x1 - ptr to array of 4 floats */\n_vecadd128test:\nvecadd128test:\n  sub sp, sp, #0x20\n  stp x14, x15, [sp, #0x10]\n  mov x14, 20\n  ldr q16, [x1]\n  ldr q17, [x1]\n  ldr q18, [x1]\n  ldr q19, [x1]\n  ldr q20, [x1]\n  ldr q21, [x1]\nvecadd128test_loop:\n  add v16.4s, v16.4s, v16.4s\n  add v17.4s, v17.4s, v17.4s\n  add v18.4s, v18.4s, v18.4s\n  add v19.4s, v19.4s, v19.4s\n  add v20.4s, v20.4s, v20.4s\n  add v21.4s, v21.4s, v21.4s\n  add v16.4s, v16.4s, v16.4s\n  add v17.4s, v17.4s, v17.4s\n  add v18.4s, v18.4s, v18.4s\n  add v19.4s, v19.4s, v19.4s\n  add v20.4s, v20.4s, v20.4s\n  add v21.4s, v21.4s, v21.4s\n  add v16.4s, v16.4s, v16.4s\n  add v17.4s, v17.4s, v17.4s\n  add v18.4s, v18.4s, v18.4s\n  add v19.4s, v19.4s, v19.4s\n  add v20.4s, v20.4s, v20.4s\n  add v21.4s, v21.4s, v21.4s\n  add v16.4s, v16.4s, v16.4s\n  add v17.4s, v17.4s, v17.4s\n  sub x0, x0, x14\n  cbnz x0, vecadd128test_loop\n  ldp x14, x15, [sp, #0x10]\n  add sp, sp, #0x20\n  
ret\n\n_latvecadd128test:\nlatvecadd128test:\n  sub sp, sp, #0x20\n  stp x14, x15, [sp, #0x10]\n  mov x14, 20\n  ldr q16, [x1]\nlatvecadd128test_loop:\n  add v16.4s, v16.4s, v16.4s\n  add v16.4s, v16.4s, v16.4s\n  add v16.4s, v16.4s, v16.4s\n  add v16.4s, v16.4s, v16.4s\n  add v16.4s, v16.4s, v16.4s\n  add v16.4s, v16.4s, v16.4s\n  add v16.4s, v16.4s, v16.4s\n  add v16.4s, v16.4s, v16.4s\n  add v16.4s, v16.4s, v16.4s\n  add v16.4s, v16.4s, v16.4s\n  add v16.4s, v16.4s, v16.4s\n  add v16.4s, v16.4s, v16.4s\n  add v16.4s, v16.4s, v16.4s\n  add v16.4s, v16.4s, v16.4s\n  add v16.4s, v16.4s, v16.4s\n  add v16.4s, v16.4s, v16.4s\n  add v16.4s, v16.4s, v16.4s\n  add v16.4s, v16.4s, v16.4s\n  add v16.4s, v16.4s, v16.4s\n  add v16.4s, v16.4s, v16.4s\n  sub x0, x0, x14\n  cbnz x0, latvecadd128test_loop\n  ldp x14, x15, [sp, #0x10]\n  add sp, sp, #0x20\n  ret\n\n_faddtest:\nfaddtest:\n  sub sp, sp, #0x20\n  stp x14, x15, [sp, #0x10]\n  mov x14, 20\n  ldr s16, [x1]\n  ldr s17, [x1, #0x4]\n  ldr s18, [x1, #0x8]\n  ldr s19, [x1, #0xC]\n  ldr s20, [x1]\n  ldr s21, [x1, #0x4]\nfaddtest_loop:\n  fadd s16, s16, s16\n  fadd s17, s17, s17\n  fadd s18, s18, s18\n  fadd s19, s19, s19\n  fadd s20, s20, s20\n  fadd s21, s21, s21\n  fadd s16, s16, s16\n  fadd s17, s17, s17\n  fadd s18, s18, s18\n  fadd s19, s19, s19\n  fadd s20, s20, s20\n  fadd s21, s21, s21\n  fadd s16, s16, s16\n  fadd s17, s17, s17\n  fadd s18, s18, s18\n  fadd s19, s19, s19\n  fadd s20, s20, s20\n  fadd s21, s21, s21\n  fadd s16, s16, s16\n  fadd s17, s17, s17\n  sub x0, x0, x14\n  cbnz x0, faddtest_loop\n  ldp x14, x15, [sp, #0x10]\n  add sp, sp, #0x20\n  ret\n\n_latfaddtest:\nlatfaddtest:\n  sub sp, sp, #0x20\n  stp x14, x15, [sp, #0x10]\n  mov x14, 20\n  ldr s16, [x1]\nlatfaddtest_loop:\n  fadd s16, s16, s16\n  fadd s16, s16, s16\n  fadd s16, s16, s16\n  fadd s16, s16, s16\n  fadd s16, s16, s16\n  fadd s16, s16, s16\n  fadd s16, s16, s16\n  fadd s16, s16, s16\n  fadd s16, s16, s16\n  fadd s16, s16, s16\n  fadd s16, 
s16, s16\n  fadd s16, s16, s16\n  fadd s16, s16, s16\n  fadd s16, s16, s16\n  fadd s16, s16, s16\n  fadd s16, s16, s16\n  fadd s16, s16, s16\n  fadd s16, s16, s16\n  fadd s16, s16, s16\n  fadd s16, s16, s16\n  sub x0, x0, x14\n  cbnz x0, latfaddtest_loop\n  ldp x14, x15, [sp, #0x10]\n  add sp, sp, #0x20\n  ret\n\n_latfmultest:\nlatfmultest:\n  sub sp, sp, #0x20\n  stp x14, x15, [sp, #0x10]\n  mov x14, 20\n  ldr s16, [x1]\nlatfmultest_loop:\n  fmul s16, s16, s16\n  fmul s16, s16, s16\n  fmul s16, s16, s16\n  fmul s16, s16, s16\n  fmul s16, s16, s16\n  fmul s16, s16, s16\n  fmul s16, s16, s16\n  fmul s16, s16, s16\n  fmul s16, s16, s16\n  fmul s16, s16, s16\n  fmul s16, s16, s16\n  fmul s16, s16, s16\n  fmul s16, s16, s16\n  fmul s16, s16, s16\n  fmul s16, s16, s16\n  fmul s16, s16, s16\n  fmul s16, s16, s16\n  fmul s16, s16, s16\n  fmul s16, s16, s16\n  fmul s16, s16, s16\n  sub x0, x0, x14\n  cbnz x0, latfmultest_loop\n  ldp x14, x15, [sp, #0x10]\n  add sp, sp, #0x20\n  ret\n\n_latvecmul128test:\nlatvecmul128test:\n  sub sp, sp, #0x20\n  stp x14, x15, [sp, #0x10]\n  mov x14, 20\n  ldr q16, [x1]\nlatvecmul128test_loop:\n  mul v16.4s, v16.4s, v16.4s\n  mul v16.4s, v16.4s, v16.4s\n  mul v16.4s, v16.4s, v16.4s\n  mul v16.4s, v16.4s, v16.4s\n  mul v16.4s, v16.4s, v16.4s\n  mul v16.4s, v16.4s, v16.4s\n  mul v16.4s, v16.4s, v16.4s\n  mul v16.4s, v16.4s, v16.4s\n  mul v16.4s, v16.4s, v16.4s\n  mul v16.4s, v16.4s, v16.4s\n  mul v16.4s, v16.4s, v16.4s\n  mul v16.4s, v16.4s, v16.4s\n  mul v16.4s, v16.4s, v16.4s\n  mul v16.4s, v16.4s, v16.4s\n  mul v16.4s, v16.4s, v16.4s\n  mul v16.4s, v16.4s, v16.4s\n  mul v16.4s, v16.4s, v16.4s\n  mul v16.4s, v16.4s, v16.4s\n  mul v16.4s, v16.4s, v16.4s\n  mul v16.4s, v16.4s, v16.4s\n  sub x0, x0, x14\n  cbnz x0, latvecmul128test_loop\n  ldp x14, x15, [sp, #0x10]\n  add sp, sp, #0x20\n  ret\n\n_vecmul128test:\nvecmul128test:\n  sub sp, sp, #0x20\n  stp x14, x15, [sp, #0x10]\n  mov x14, 20\n  ldr q16, [x1]\n  ldr q17, [x1]\n  ldr q18, [x1]\n  
ldr q19, [x1]\n  ldr q20, [x1]\n  ldr q21, [x1]\nvecmul128test_loop:\n  mul v16.4s, v16.4s, v16.4s\n  mul v17.4s, v17.4s, v17.4s\n  mul v18.4s, v18.4s, v18.4s\n  mul v19.4s, v19.4s, v19.4s\n  mul v20.4s, v20.4s, v20.4s\n  mul v21.4s, v21.4s, v21.4s\n  mul v16.4s, v16.4s, v16.4s\n  mul v17.4s, v17.4s, v17.4s\n  mul v18.4s, v18.4s, v18.4s\n  mul v19.4s, v19.4s, v19.4s\n  mul v20.4s, v20.4s, v20.4s\n  mul v21.4s, v21.4s, v21.4s\n  mul v16.4s, v16.4s, v16.4s\n  mul v17.4s, v17.4s, v17.4s\n  mul v18.4s, v18.4s, v18.4s\n  mul v19.4s, v19.4s, v19.4s\n  mul v20.4s, v20.4s, v20.4s\n  mul v21.4s, v21.4s, v21.4s\n  mul v16.4s, v16.4s, v16.4s\n  mul v17.4s, v17.4s, v17.4s\n  sub x0, x0, x14\n  cbnz x0, vecmul128test_loop\n  ldp x14, x15, [sp, #0x10]\n  add sp, sp, #0x20\n  ret\n\n_scalarfmatest:\nscalarfmatest:\n  sub sp, sp, #0x20\n  stp x14, x15, [sp, #0x10]\n  mov x14, 20\n  ldr q16, [x1]\n  ldr q17, [x1]\n  ldr q18, [x1]\n  ldr q19, [x1]\n  ldr q20, [x1]\n  ldr q21, [x1]\n  ldr q22, [x1]\n  ldr q23, [x1]\n  ldr q24, [x1]\n  ldr q25, [x1]\nscalarfmatest_loop:\n  fmadd s16, s16, s16, s16\n  fmadd s17, s17, s17, s17\n  fmadd s18, s18, s18, s18\n  fmadd s19, s19, s19, s19\n  fmadd s20, s20, s20, s20\n  fmadd s21, s21, s21, s21\n  fmadd s22, s22, s22, s22\n  fmadd s23, s23, s23, s23\n  fmadd s24, s24, s24, s24\n  fmadd s25, s25, s25, s25\n  fmadd s16, s16, s16, s16\n  fmadd s17, s17, s17, s17\n  fmadd s18, s18, s18, s18\n  fmadd s19, s19, s19, s19\n  fmadd s20, s20, s20, s20\n  fmadd s21, s21, s21, s21\n  fmadd s22, s22, s22, s22\n  fmadd s23, s23, s23, s23\n  fmadd s24, s24, s24, s24\n  fmadd s25, s25, s25, s25\n  sub x0, x0, x14\n  cbnz x0, scalarfmatest_loop\n  ldp x14, x15, [sp, #0x10]\n  add sp, sp, #0x20\n  ret\n\n_latscalarfmatest:\nlatscalarfmatest:\n  sub sp, sp, #0x20\n  stp x14, x15, [sp, #0x10]\n  mov x14, 20\n  ldr q16, [x1]\nlatscalarfmatest_loop:\n  fmadd s16, s16, s16, s16\n  fmadd s16, s16, s16, s16\n  fmadd s16, s16, s16, s16\n  fmadd s16, s16, s16, s16\n  
fmadd s16, s16, s16, s16\n  fmadd s16, s16, s16, s16\n  fmadd s16, s16, s16, s16\n  fmadd s16, s16, s16, s16\n  fmadd s16, s16, s16, s16\n  fmadd s16, s16, s16, s16\n  fmadd s16, s16, s16, s16\n  fmadd s16, s16, s16, s16\n  fmadd s16, s16, s16, s16\n  fmadd s16, s16, s16, s16\n  fmadd s16, s16, s16, s16\n  fmadd s16, s16, s16, s16\n  fmadd s16, s16, s16, s16\n  fmadd s16, s16, s16, s16\n  fmadd s16, s16, s16, s16\n  fmadd s16, s16, s16, s16\n  sub x0, x0, x14\n  cbnz x0, latscalarfmatest_loop\n  ldp x14, x15, [sp, #0x10]\n  add sp, sp, #0x20\n  ret\n\n_vecfma128test:\nvecfma128test:\n  sub sp, sp, #0x20\n  stp x14, x15, [sp, #0x10]\n  mov x14, 20\n  ldr q16, [x1]\n  ldr q17, [x1]\n  ldr q18, [x1]\n  ldr q19, [x1]\n  ldr q20, [x1]\n  ldr q21, [x1]\n  ldr q22, [x1]\n  ldr q23, [x1]\n  ldr q24, [x1]\n  ldr q25, [x1]\nvecfma128test_loop:\n  fmla v16.4s, v16.4s, v16.4s\n  fmla v17.4s, v17.4s, v17.4s\n  fmla v18.4s, v18.4s, v18.4s\n  fmla v19.4s, v19.4s, v19.4s\n  fmla v20.4s, v20.4s, v20.4s\n  fmla v21.4s, v21.4s, v21.4s\n  fmla v22.4s, v22.4s, v22.4s\n  fmla v23.4s, v23.4s, v23.4s\n  fmla v24.4s, v24.4s, v24.4s\n  fmla v25.4s, v25.4s, v25.4s\n  fmla v16.4s, v16.4s, v16.4s\n  fmla v17.4s, v17.4s, v17.4s\n  fmla v18.4s, v18.4s, v18.4s\n  fmla v19.4s, v19.4s, v19.4s\n  fmla v20.4s, v20.4s, v20.4s\n  fmla v21.4s, v21.4s, v21.4s\n  fmla v22.4s, v22.4s, v22.4s\n  fmla v23.4s, v23.4s, v23.4s\n  fmla v24.4s, v24.4s, v24.4s\n  fmla v25.4s, v25.4s, v25.4s\n  sub x0, x0, x14\n  cbnz x0, vecfma128test_loop\n  ldp x14, x15, [sp, #0x10]\n  add sp, sp, #0x20\n  ret\n\n_mixvecfmulfma128test:\nmixvecfmulfma128test:\n  sub sp, sp, #0x20\n  stp x14, x15, [sp, #0x10]\n  mov x14, 20\n  ldr q16, [x1]\n  ldr q17, [x1]\n  ldr q18, [x1]\n  ldr q19, [x1]\n  ldr q20, [x1]\n  ldr q21, [x1]\n  ldr q22, [x1]\n  ldr q23, [x1]\n  ldr q24, [x1]\n  ldr q25, [x1]\nmixvecfmulfma128test_loop:\n  fmla v16.4s, v16.4s, v16.4s\n  fmul v17.4s, v17.4s, v17.4s\n  fmla v18.4s, v18.4s, v18.4s\n  fmul v19.4s, 
v19.4s, v19.4s\n  fmla v20.4s, v20.4s, v20.4s\n  fmul v21.4s, v21.4s, v21.4s\n  fmla v22.4s, v22.4s, v22.4s\n  fmul v23.4s, v23.4s, v23.4s\n  fmla v24.4s, v24.4s, v24.4s\n  fmul v25.4s, v25.4s, v25.4s\n  fmla v16.4s, v16.4s, v16.4s\n  fmul v17.4s, v17.4s, v17.4s\n  fmla v18.4s, v18.4s, v18.4s\n  fmul v19.4s, v19.4s, v19.4s\n  fmla v20.4s, v20.4s, v20.4s\n  fmul v21.4s, v21.4s, v21.4s\n  fmla v22.4s, v22.4s, v22.4s\n  fmul v23.4s, v23.4s, v23.4s\n  fmla v24.4s, v24.4s, v24.4s\n  fmul v25.4s, v25.4s, v25.4s\n  sub x0, x0, x14\n  cbnz x0, mixvecfmulfma128test_loop\n  ldp x14, x15, [sp, #0x10]\n  add sp, sp, #0x20\n  ret\n\n_mixvecfaddfma128test:\nmixvecfaddfma128test:\n  sub sp, sp, #0x20\n  stp x14, x15, [sp, #0x10]\n  mov x14, 20\n  ldr q16, [x1]\n  ldr q17, [x1]\n  ldr q18, [x1]\n  ldr q19, [x1]\n  ldr q20, [x1]\n  ldr q21, [x1]\n  ldr q22, [x1]\n  ldr q23, [x1]\n  ldr q24, [x1]\n  ldr q25, [x1]\nmixvecfaddfma128test_loop:\n  fmla v16.4s, v16.4s, v16.4s\n  fadd v17.4s, v17.4s, v17.4s\n  fmla v18.4s, v18.4s, v18.4s\n  fadd v19.4s, v19.4s, v19.4s\n  fmla v20.4s, v20.4s, v20.4s\n  fadd v21.4s, v21.4s, v21.4s\n  fmla v22.4s, v22.4s, v22.4s\n  fadd v23.4s, v23.4s, v23.4s\n  fmla v24.4s, v24.4s, v24.4s\n  fadd v25.4s, v25.4s, v25.4s\n  fmla v16.4s, v16.4s, v16.4s\n  fadd v17.4s, v17.4s, v17.4s\n  fmla v18.4s, v18.4s, v18.4s\n  fadd v19.4s, v19.4s, v19.4s\n  fmla v20.4s, v20.4s, v20.4s\n  fadd v21.4s, v21.4s, v21.4s\n  fmla v22.4s, v22.4s, v22.4s\n  fadd v23.4s, v23.4s, v23.4s\n  fmla v24.4s, v24.4s, v24.4s\n  fadd v25.4s, v25.4s, v25.4s\n  sub x0, x0, x14\n  cbnz x0, mixvecfaddfma128test_loop\n  ldp x14, x15, [sp, #0x10]\n  add sp, sp, #0x20\n  ret\n\n_latvecfma128test:\nlatvecfma128test:\n  sub sp, sp, #0x20\n  stp x14, x15, [sp, #0x10]\n  mov x14, 20\n  ldr q16, [x1]\nlatvecfma128test_loop:\n  fmla v16.4s, v16.4s, v16.4s\n  fmla v16.4s, v16.4s, v16.4s\n  fmla v16.4s, v16.4s, v16.4s\n  fmla v16.4s, v16.4s, v16.4s\n  fmla v16.4s, v16.4s, v16.4s\n  fmla v16.4s, v16.4s, 
v16.4s\n  fmla v16.4s, v16.4s, v16.4s\n  fmla v16.4s, v16.4s, v16.4s\n  fmla v16.4s, v16.4s, v16.4s\n  fmla v16.4s, v16.4s, v16.4s\n  fmla v16.4s, v16.4s, v16.4s\n  fmla v16.4s, v16.4s, v16.4s\n  fmla v16.4s, v16.4s, v16.4s\n  fmla v16.4s, v16.4s, v16.4s\n  fmla v16.4s, v16.4s, v16.4s\n  fmla v16.4s, v16.4s, v16.4s\n  fmla v16.4s, v16.4s, v16.4s\n  fmla v16.4s, v16.4s, v16.4s\n  fmla v16.4s, v16.4s, v16.4s\n  fmla v16.4s, v16.4s, v16.4s\n  sub x0, x0, x14\n  cbnz x0, latvecfma128test_loop\n  ldp x14, x15, [sp, #0x10]\n  add sp, sp, #0x20\n  ret\n\n_vecfadd128test:\nvecfadd128test:\n  sub sp, sp, #0x20\n  stp x14, x15, [sp, #0x10]\n  mov x14, 20\n  ldr q16, [x1]\n  ldr q17, [x1]\n  ldr q18, [x1]\n  ldr q19, [x1]\n  ldr q20, [x1]\n  ldr q21, [x1]\nvecfadd128test_loop:\n  fadd v16.4s, v16.4s, v16.4s\n  fadd v17.4s, v17.4s, v17.4s\n  fadd v18.4s, v18.4s, v18.4s\n  fadd v19.4s, v19.4s, v19.4s\n  fadd v20.4s, v20.4s, v20.4s\n  fadd v21.4s, v21.4s, v21.4s\n  fadd v16.4s, v16.4s, v16.4s\n  fadd v17.4s, v17.4s, v17.4s\n  fadd v18.4s, v18.4s, v18.4s\n  fadd v19.4s, v19.4s, v19.4s\n  fadd v20.4s, v20.4s, v20.4s\n  fadd v21.4s, v21.4s, v21.4s\n  fadd v16.4s, v16.4s, v16.4s\n  fadd v17.4s, v17.4s, v17.4s\n  fadd v18.4s, v18.4s, v18.4s\n  fadd v19.4s, v19.4s, v19.4s\n  fadd v20.4s, v20.4s, v20.4s\n  fadd v21.4s, v21.4s, v21.4s\n  fadd v16.4s, v16.4s, v16.4s\n  fadd v17.4s, v17.4s, v17.4s\n  sub x0, x0, x14\n  cbnz x0, vecfadd128test_loop\n  ldp x14, x15, [sp, #0x10]\n  add sp, sp, #0x20\n  ret\n\n_latvecfadd128test:\nlatvecfadd128test:\n  sub sp, sp, #0x20\n  stp x14, x15, [sp, #0x10]\n  mov x14, 20\n  ldr q16, [x1]\nlatvecfadd128test_loop:\n  fadd v16.4s, v16.4s, v16.4s\n  fadd v16.4s, v16.4s, v16.4s\n  fadd v16.4s, v16.4s, v16.4s\n  fadd v16.4s, v16.4s, v16.4s\n  fadd v16.4s, v16.4s, v16.4s\n  fadd v16.4s, v16.4s, v16.4s\n  fadd v16.4s, v16.4s, v16.4s\n  fadd v16.4s, v16.4s, v16.4s\n  fadd v16.4s, v16.4s, v16.4s\n  fadd v16.4s, v16.4s, v16.4s\n  fadd v16.4s, v16.4s, v16.4s\n  
fadd v16.4s, v16.4s, v16.4s\n  fadd v16.4s, v16.4s, v16.4s\n  fadd v16.4s, v16.4s, v16.4s\n  fadd v16.4s, v16.4s, v16.4s\n  fadd v16.4s, v16.4s, v16.4s\n  fadd v16.4s, v16.4s, v16.4s\n  fadd v16.4s, v16.4s, v16.4s\n  fadd v16.4s, v16.4s, v16.4s\n  fadd v16.4s, v16.4s, v16.4s\n  sub x0, x0, x14\n  cbnz x0, latvecfadd128test_loop\n  ldp x14, x15, [sp, #0x10]\n  add sp, sp, #0x20\n  ret\n\n_vecfmul128test:\nvecfmul128test:\n  sub sp, sp, #0x20\n  stp x14, x15, [sp, #0x10]\n  mov x14, 20\n  ldr q16, [x1]\n  ldr q17, [x1]\n  ldr q18, [x1]\n  ldr q19, [x1]\n  ldr q20, [x1]\n  ldr q21, [x1]\nvecfmul128test_loop:\n  fmul v16.4s, v16.4s, v16.4s\n  fmul v17.4s, v17.4s, v17.4s\n  fmul v18.4s, v18.4s, v18.4s\n  fmul v19.4s, v19.4s, v19.4s\n  fmul v20.4s, v20.4s, v20.4s\n  fmul v21.4s, v21.4s, v21.4s\n  fmul v16.4s, v16.4s, v16.4s\n  fmul v17.4s, v17.4s, v17.4s\n  fmul v18.4s, v18.4s, v18.4s\n  fmul v19.4s, v19.4s, v19.4s\n  fmul v20.4s, v20.4s, v20.4s\n  fmul v21.4s, v21.4s, v21.4s\n  fmul v16.4s, v16.4s, v16.4s\n  fmul v17.4s, v17.4s, v17.4s\n  fmul v18.4s, v18.4s, v18.4s\n  fmul v19.4s, v19.4s, v19.4s\n  fmul v20.4s, v20.4s, v20.4s\n  fmul v21.4s, v21.4s, v21.4s\n  fmul v16.4s, v16.4s, v16.4s\n  fmul v17.4s, v17.4s, v17.4s\n  sub x0, x0, x14\n  cbnz x0, vecfmul128test_loop\n  ldp x14, x15, [sp, #0x10]\n  add sp, sp, #0x20\n  ret\n\n_latvecfmul128test:\nlatvecfmul128test:\n  sub sp, sp, #0x20\n  stp x14, x15, [sp, #0x10]\n  mov x14, 20\n  ldr q16, [x1]\nlatvecfmul128test_loop:\n  fmul v16.4s, v16.4s, v16.4s\n  fmul v16.4s, v16.4s, v16.4s\n  fmul v16.4s, v16.4s, v16.4s\n  fmul v16.4s, v16.4s, v16.4s\n  fmul v16.4s, v16.4s, v16.4s\n  fmul v16.4s, v16.4s, v16.4s\n  fmul v16.4s, v16.4s, v16.4s\n  fmul v16.4s, v16.4s, v16.4s\n  fmul v16.4s, v16.4s, v16.4s\n  fmul v16.4s, v16.4s, v16.4s\n  fmul v16.4s, v16.4s, v16.4s\n  fmul v16.4s, v16.4s, v16.4s\n  fmul v16.4s, v16.4s, v16.4s\n  fmul v16.4s, v16.4s, v16.4s\n  fmul v16.4s, v16.4s, v16.4s\n  fmul v16.4s, v16.4s, v16.4s\n  fmul 
v16.4s, v16.4s, v16.4s\n  fmul v16.4s, v16.4s, v16.4s\n  fmul v16.4s, v16.4s, v16.4s\n  fmul v16.4s, v16.4s, v16.4s\n  sub x0, x0, x14\n  cbnz x0, latvecfmul128test_loop\n  ldp x14, x15, [sp, #0x10]\n  add sp, sp, #0x20\n  ret\n\n_mixvecfaddfmul128test:\nmixvecfaddfmul128test:\n  sub sp, sp, #0x20\n  stp x14, x15, [sp, #0x10]\n  mov x14, 20\n  ldr q16, [x1]\n  ldr q17, [x1]\n  ldr q18, [x1]\n  ldr q19, [x1]\n  ldr q20, [x1]\n  ldr q21, [x1]\n  ldr q22, [x1]\n  ldr q23, [x1]\nmixvecfaddfmul128test_loop:\n  fmul v16.4s, v16.4s, v16.4s\n  fadd v17.4s, v17.4s, v17.4s\n  fmul v18.4s, v18.4s, v18.4s\n  fadd v19.4s, v19.4s, v19.4s\n  fmul v20.4s, v20.4s, v20.4s\n  fadd v21.4s, v21.4s, v21.4s\n  fmul v22.4s, v22.4s, v22.4s\n  fadd v23.4s, v23.4s, v23.4s\n  fmul v16.4s, v16.4s, v16.4s\n  fadd v17.4s, v17.4s, v17.4s\n  fmul v18.4s, v18.4s, v18.4s\n  fadd v19.4s, v19.4s, v19.4s\n  fmul v20.4s, v20.4s, v20.4s\n  fadd v21.4s, v21.4s, v21.4s\n  fmul v22.4s, v22.4s, v22.4s\n  fadd v23.4s, v23.4s, v23.4s\n  fmul v16.4s, v16.4s, v16.4s\n  fadd v17.4s, v17.4s, v17.4s\n  fmul v18.4s, v18.4s, v18.4s\n  fadd v19.4s, v19.4s, v19.4s\n  sub x0, x0, x14\n  cbnz x0, mixvecfaddfmul128test_loop\n  ldp x14, x15, [sp, #0x10]\n  add sp, sp, #0x20\n  ret\n\n_mixvecaddmul128test:\nmixvecaddmul128test:\n  sub sp, sp, #0x20\n  stp x14, x15, [sp, #0x10]\n  mov x14, 20\n  ldr q16, [x1]\n  ldr q17, [x1]\n  ldr q18, [x1]\n  ldr q19, [x1]\n  ldr q20, [x1]\n  ldr q21, [x1]\n  ldr q22, [x1]\n  ldr q23, [x1]\nmixvecaddmul128test_loop:\n  mul v16.4s, v16.4s, v16.4s\n  add v17.4s, v17.4s, v17.4s\n  mul v18.4s, v18.4s, v18.4s\n  add v19.4s, v19.4s, v19.4s\n  mul v20.4s, v20.4s, v20.4s\n  add v21.4s, v21.4s, v21.4s\n  mul v22.4s, v22.4s, v22.4s\n  add v23.4s, v23.4s, v23.4s\n  mul v16.4s, v16.4s, v16.4s\n  add v17.4s, v17.4s, v17.4s\n  mul v18.4s, v18.4s, v18.4s\n  add v19.4s, v19.4s, v19.4s\n  mul v20.4s, v20.4s, v20.4s\n  add v21.4s, v21.4s, v21.4s\n  mul v22.4s, v22.4s, v22.4s\n  add v23.4s, v23.4s, v23.4s\n 
 mul v16.4s, v16.4s, v16.4s\n  add v17.4s, v17.4s, v17.4s\n  mul v18.4s, v18.4s, v18.4s\n  add v19.4s, v19.4s, v19.4s\n  sub x0, x0, x14\n  cbnz x0, mixvecaddmul128test_loop\n  ldp x14, x15, [sp, #0x10]\n  add sp, sp, #0x20\n  ret\n\n_mixaddvecadd128test:\nmixaddvecadd128test:\n  sub sp, sp, #0x50\n  stp x14, x15, [sp, #0x10]\n  stp x12, x13, [sp, #0x20]\n  stp x10, x11, [sp, #0x30]\n  stp x8, x9, [sp, #0x40]\n  mov x15, 1\n  mov x14, 30\n  eor x13, x13, x13\n  eor x12, x12, x12\n  eor x11, x11, x11\n  eor x10, x10, x10\n  eor x9, x9, x9\n  eor x8, x8, x8\n  ldr q16, [x1]\n  ldr q17, [x1]\n  ldr q18, [x1]\n  ldr q19, [x1]\n  ldr q20, [x1]\n  ldr q21, [x1]\n  ldr q22, [x1]\n  ldr q23, [x1]\nmixaddvecadd128test_loop:\n  add x13, x13, x15\n  add x12, x12, x15\n  add x11, x11, x15\n  add x10, x10, x15\n  add v16.4s, v16.4s, v16.4s\n  add v17.4s, v17.4s, v17.4s\n  add x13, x13, x15\n  add x12, x12, x15\n  add x11, x11, x15\n  add x10, x10, x15\n  add v18.4s, v18.4s, v18.4s\n  add v19.4s, v19.4s, v19.4s\n  add x13, x13, x15\n  add x12, x12, x15\n  add x11, x11, x15\n  add x10, x10, x15\n  add v20.4s, v20.4s, v20.4s\n  add v21.4s, v21.4s, v21.4s\n  add x13, x13, x15\n  add x12, x12, x15\n  add x11, x11, x15\n  add x10, x10, x15\n  add v20.4s, v20.4s, v20.4s\n  add v21.4s, v21.4s, v21.4s\n  add x13, x13, x15\n  add x12, x12, x15\n  add x11, x11, x15\n  add x10, x10, x15\n  add v22.4s, v22.4s, v22.4s\n  add v23.4s, v23.4s, v23.4s\n  sub x0, x0, x14\n  cbnz x0, mixaddvecadd128test_loop\n  ldp x8, x9, [sp, #0x40]\n  ldp x10, x11, [sp, #0x30]\n  ldp x12, x13, [sp, #0x20]\n  ldp x14, x15, [sp, #0x10]\n  add sp, sp, #0x50\n  ret\n\n_mix3to1addvecadd128test:\nmix3to1addvecadd128test:\n  sub sp, sp, #0x50\n  stp x14, x15, [sp, #0x10]\n  stp x12, x13, [sp, #0x20]\n  stp x10, x11, [sp, #0x30]\n  stp x8, x9, [sp, #0x40]\n  mov x15, 1\n  mov x14, 40\n  eor x13, x13, x13\n  eor x12, x12, x12\n  eor x11, x11, x11\n  eor x10, x10, x10\n  eor x9, x9, x9\n  eor x8, x8, x8\n  ldr q16, 
[x1]\n  ldr q17, [x1]\n  ldr q18, [x1]\n  ldr q19, [x1]\n  ldr q20, [x1]\n  ldr q21, [x1]\n  ldr q22, [x1]\n  ldr q23, [x1]\nmix3to1addvecadd128test_loop:\n  add x13, x13, x15\n  add x12, x12, x15\n  add x11, x11, x15\n  add v16.4s, v16.4s, v16.4s\n  add x13, x13, x15\n  add x12, x12, x15\n  add x11, x11, x15\n  add v17.4s, v17.4s, v17.4s\n  add x13, x13, x15\n  add x12, x12, x15\n  add x11, x11, x15\n  add v18.4s, v18.4s, v18.4s\n  add x13, x13, x15\n  add x12, x12, x15\n  add x11, x11, x15\n  add v19.4s, v19.4s, v19.4s\n  add x13, x13, x15\n  add x12, x12, x15\n  add x11, x11, x15\n  add v20.4s, v20.4s, v20.4s\n  add x13, x13, x15\n  add x12, x12, x15\n  add x11, x11, x15\n  add v16.4s, v16.4s, v16.4s\n  add x13, x13, x15\n  add x12, x12, x15\n  add x11, x11, x15\n  add v17.4s, v17.4s, v17.4s\n  add x13, x13, x15\n  add x12, x12, x15\n  add x11, x11, x15\n  add v18.4s, v18.4s, v18.4s\n  add x13, x13, x15\n  add x12, x12, x15\n  add x11, x11, x15\n  add v19.4s, v19.4s, v19.4s\n  add x13, x13, x15\n  add x12, x12, x15\n  add x11, x11, x15\n  add v20.4s, v20.4s, v20.4s\n  sub x0, x0, x14\n  cbnz x0, mix3to1addvecadd128test_loop\n  ldp x8, x9, [sp, #0x40]\n  ldp x10, x11, [sp, #0x30]\n  ldp x12, x13, [sp, #0x20]\n  ldp x14, x15, [sp, #0x10]\n  add sp, sp, #0x50\n  ret\n\n_mix1to1addvecadd128test:\nmix1to1addvecadd128test:\n  sub sp, sp, #0x50\n  stp x14, x15, [sp, #0x10]\n  stp x12, x13, [sp, #0x20]\n  stp x10, x11, [sp, #0x30]\n  stp x8, x9, [sp, #0x40]\n  mov x15, 1\n  mov x14, 40\n  eor x13, x13, x13\n  eor x12, x12, x12\n  eor x11, x11, x11\n  eor x10, x10, x10\n  eor x9, x9, x9\n  eor x8, x8, x8\n  ldr q16, [x1]\n  ldr q17, [x1]\n  ldr q18, [x1]\n  ldr q19, [x1]\n  ldr q20, [x1]\n  ldr q21, [x1]\n  ldr q22, [x1]\n  ldr q23, [x1]\nmix1to1addvecadd128test_loop:\n  add x13, x13, x15\n  add v16.4s, v16.4s, v16.4s\n  add x12, x12, x15\n  add v17.4s, v17.4s, v17.4s\n  add x11, x11, x15\n  add v18.4s, v18.4s, v18.4s\n  add x10, x10, x15\n  add v19.4s, v19.4s, 
v19.4s\n\n  add x13, x13, x15\n  add v16.4s, v16.4s, v16.4s\n  add x12, x12, x15\n  add v17.4s, v17.4s, v17.4s\n  add x11, x11, x15\n  add v18.4s, v18.4s, v18.4s\n  add x10, x10, x15\n  add v19.4s, v19.4s, v19.4s\n\n  add x13, x13, x15\n  add v16.4s, v16.4s, v16.4s\n  add x12, x12, x15\n  add v17.4s, v17.4s, v17.4s\n  add x11, x11, x15\n  add v18.4s, v18.4s, v18.4s\n  add x10, x10, x15\n  add v19.4s, v19.4s, v19.4s\n\n  add x13, x13, x15\n  add v16.4s, v16.4s, v16.4s\n  add x12, x12, x15\n  add v17.4s, v17.4s, v17.4s\n  add x11, x11, x15\n  add v18.4s, v18.4s, v18.4s\n  add x10, x10, x15\n  add v19.4s, v19.4s, v19.4s\n\n  add x13, x13, x15\n  add v16.4s, v16.4s, v16.4s\n  add x12, x12, x15\n  add v17.4s, v17.4s, v17.4s\n  add x11, x11, x15\n  add v18.4s, v18.4s, v18.4s\n  add x10, x10, x15\n  add v19.4s, v19.4s, v19.4s\n\n  sub x0, x0, x14\n  cbnz x0, mix1to1addvecadd128test_loop\n  ldp x8, x9, [sp, #0x40]\n  ldp x10, x11, [sp, #0x30]\n  ldp x12, x13, [sp, #0x20]\n  ldp x14, x15, [sp, #0x10]\n  add sp, sp, #0x50\n  ret\n\n_mixmulvecmultest:\nmixmulvecmultest:\n  sub sp, sp, #0x50\n  stp x14, x15, [sp, #0x10]\n  stp x14, x15, [sp, #0x10]\n  stp x12, x13, [sp, #0x20]\n  stp x10, x11, [sp, #0x30]\n  stp x8, x9, [sp, #0x40]\n  mov x15, 2\n  mov x14, 20\n  mov x13, x15\n  mov x12, x15\n  mov x11, x15\n  mov x10, x15\n  mov x9, x15\n  mov x8, x15\n  mov x7, x15\n  ldr q16, [x1]\n  ldr q17, [x1]\n  ldr q18, [x1]\n  ldr q19, [x1]\n  ldr q20, [x1]\n  ldr q21, [x1]\n  ldr q22, [x1]\n  ldr q23, [x1]\nmixmulvecmultest_loop:\n  mul w8, w8, w15\n  mul v16.4s, v16.4s, v16.4s\n  mul w9, w9, w15\n  mul v17.4s, v17.4s, v17.4s\n  mul w10, w10, w15\n  mul v18.4s, v18.4s, v18.4s\n  mul w11, w11, w15\n  mul v19.4s, v19.4s, v19.4s\n  mul w12, w12, w15\n  mul v20.4s, v20.4s, v20.4s\n  mul w8, w8, w15\n  mul v16.4s, v16.4s, v16.4s\n  mul w9, w9, w15\n  mul v17.4s, v17.4s, v17.4s\n  mul w10, w10, w15\n  mul v18.4s, v18.4s, v18.4s\n  mul w11, w11, w15\n  mul v19.4s, v19.4s, v19.4s\n  mul 
w12, w12, w15\n  mul v20.4s, v20.4s, v20.4s\n  sub x0, x0, x14\n  cbnz x0, mixmulvecmultest_loop\n  ldp x8, x9, [sp, #0x40]\n  ldp x10, x11, [sp, #0x30]\n  ldp x12, x13, [sp, #0x20]\n  ldp x14, x15, [sp, #0x10]\n  add sp, sp, #0x50\n  ret\n\n_mixvecmulfmultest:\nmixvecmulfmultest:\n  sub sp, sp, #0x20\n  stp x14, x15, [sp, #0x10]\n  mov x14, 20\n  ldr q16, [x1]\n  ldr q17, [x2]\n  ldr q18, [x1]\n  ldr q19, [x2]\n  ldr q20, [x1]\n  ldr q21, [x2]\nmixvecmulfmultest_loop:\n  fmul v16.4s, v16.4s, v16.4s\n  mul v17.4s, v17.4s, v17.4s\n  fmul v18.4s, v18.4s, v18.4s\n  mul v19.4s, v19.4s, v19.4s\n  fmul v20.4s, v20.4s, v20.4s\n  mul v21.4s, v21.4s, v21.4s\n  fmul v16.4s, v16.4s, v16.4s\n  mul v17.4s, v17.4s, v17.4s\n  fmul v18.4s, v18.4s, v18.4s\n  mul v19.4s, v19.4s, v19.4s\n  fmul v20.4s, v20.4s, v20.4s\n  mul v21.4s, v21.4s, v21.4s\n  fmul v16.4s, v16.4s, v16.4s\n  mul v17.4s, v17.4s, v17.4s\n  fmul v18.4s, v18.4s, v18.4s\n  mul v19.4s, v19.4s, v19.4s\n  fmul v20.4s, v20.4s, v20.4s\n  mul v21.4s, v21.4s, v21.4s\n  fmul v16.4s, v16.4s, v16.4s\n  mul v17.4s, v17.4s, v17.4s\n  sub x0, x0, x14\n  cbnz x0, mixvecmulfmultest_loop\n  ldp x14, x15, [sp, #0x10]\n  add sp, sp, #0x20\n  ret\n\n_mixvecaddfaddtest:\nmixvecaddfaddtest:\n  sub sp, sp, #0x20\n  stp x14, x15, [sp, #0x10]\n  mov x14, 20\n  ldr q16, [x1]\n  ldr q17, [x2]\n  ldr q18, [x1]\n  ldr q19, [x2]\n  ldr q20, [x1]\n  ldr q21, [x2]\nmixvecaddfaddtest_loop:\n  fadd v16.4s, v16.4s, v16.4s\n  add v17.4s, v17.4s, v17.4s\n  fadd v18.4s, v18.4s, v18.4s\n  add v19.4s, v19.4s, v19.4s\n  fadd v20.4s, v20.4s, v20.4s\n  add v21.4s, v21.4s, v21.4s\n  fadd v16.4s, v16.4s, v16.4s\n  add v17.4s, v17.4s, v17.4s\n  fadd v18.4s, v18.4s, v18.4s\n  add v19.4s, v19.4s, v19.4s\n  fadd v20.4s, v20.4s, v20.4s\n  add v21.4s, v21.4s, v21.4s\n  fadd v16.4s, v16.4s, v16.4s\n  add v17.4s, v17.4s, v17.4s\n  fadd v18.4s, v18.4s, v18.4s\n  add v19.4s, v19.4s, v19.4s\n  fadd v20.4s, v20.4s, v20.4s\n  add v21.4s, v21.4s, v21.4s\n  fadd v16.4s, 
v16.4s, v16.4s\n  add v17.4s, v17.4s, v17.4s\n  sub x0, x0, x14\n  cbnz x0, mixvecaddfaddtest_loop\n  ldp x14, x15, [sp, #0x10]\n  add sp, sp, #0x20\n  ret\n\n_mixjmpvecaddtest:\nmixjmpvecaddtest:\n  sub sp, sp, #0x20\n  stp x14, x15, [sp, #0x10]\n  mov x14, 30\n  ldr q16, [x1]\n  ldr q17, [x1]\n  ldr q18, [x1]\n  ldr q19, [x1]\n  ldr q20, [x1]\nmixjmpvecaddtest_loop:\n  add v16.4s, v16.4s, v16.4s\n  add v17.4s, v17.4s, v17.4s\n  cbz x0, mixjmpvecaddtest_jellydonut\n  add v18.4s, v18.4s, v18.4s\n  add v19.4s, v19.4s, v19.4s\n  cbz x0, mixjmpvecaddtest_jellydonut\n  add v20.4s, v20.4s, v20.4s\n  add v16.4s, v16.4s, v16.4s\n  cbz x0, mixjmpvecaddtest_jellydonut\n  add v17.4s, v17.4s, v17.4s\n  add v18.4s, v18.4s, v18.4s\n  cbz x0, mixjmpvecaddtest_jellydonut\n  add v19.4s, v19.4s, v19.4s\n  add v20.4s, v20.4s, v20.4s\n  cbz x0, mixjmpvecaddtest_jellydonut\n  add v16.4s, v16.4s, v16.4s\n  add v17.4s, v17.4s, v17.4s\n  cbz x0, mixjmpvecaddtest_jellydonut\n  add v18.4s, v18.4s, v18.4s\n  add v19.4s, v19.4s, v19.4s\n  cbz x0, mixjmpvecaddtest_jellydonut\n  add v20.4s, v20.4s, v20.4s\n  add v16.4s, v16.4s, v16.4s\n  cbz x0, mixjmpvecaddtest_jellydonut\n  add v17.4s, v17.4s, v17.4s\n  add v18.4s, v18.4s, v18.4s\n  cbz x0, mixjmpvecaddtest_jellydonut\n  add v19.4s, v19.4s, v19.4s\n  add v20.4s, v20.4s, v20.4s\n  cbz x0, mixjmpvecaddtest_jellydonut\n  sub x0, x0, x14\n  cbnz x0, mixjmpvecaddtest_loop\nmixjmpvecaddtest_jellydonut:\n  ldp x14, x15, [sp, #0x10]\n  add sp, sp, #0x20\n  ret\n\n_mixjmpvecmultest:\nmixjmpvecmultest:\n  sub sp, sp, #0x20\n  stp x14, x15, [sp, #0x10]\n  mov x14, 20\n  ldr q16, [x1]\n  ldr q17, [x1]\n  ldr q18, [x1]\n  ldr q19, [x1]\n  ldr q20, [x1]\nmixjmpvecmultest_loop:\n  mul v16.4s, v16.4s, v16.4s\n  cbz x0, mixjmpvecmultest_jellydonut\n  mul v17.4s, v17.4s, v17.4s\n  cbz x0, mixjmpvecmultest_jellydonut\n  mul v18.4s, v18.4s, v18.4s\n  cbz x0, mixjmpvecmultest_jellydonut\n  mul v19.4s, v19.4s, v19.4s\n  cbz x0, mixjmpvecmultest_jellydonut\n  mul 
v20.4s, v20.4s, v20.4s\n  cbz x0, mixjmpvecmultest_jellydonut\n  mul v16.4s, v16.4s, v16.4s\n  cbz x0, mixjmpvecmultest_jellydonut\n  mul v17.4s, v17.4s, v17.4s\n  cbz x0, mixjmpvecmultest_jellydonut\n  mul v18.4s, v18.4s, v18.4s\n  cbz x0, mixjmpvecmultest_jellydonut\n  mul v19.4s, v19.4s, v19.4s\n  cbz x0, mixjmpvecmultest_jellydonut\n  mul v20.4s, v20.4s, v20.4s\n  cbz x0, mixjmpvecmultest_jellydonut\n  sub x0, x0, x14\n  cbnz x0, mixjmpvecmultest_loop\nmixjmpvecmultest_jellydonut:\n  ldp x14, x15, [sp, #0x10]\n  add sp, sp, #0x20\n  ret\n\n_vecloadtest:\nvecloadtest:\n  sub sp, sp, #0x20\n  stp x14, x15, [sp, #0x10]\n  mov x14, 20\nvecloadtest_loop:\n  ldr q16, [x1]\n  ldr q17, [x1]\n  ldr q18, [x1]\n  ldr q19, [x1]\n  ldr q20, [x1]\n  ldr q16, [x1]\n  ldr q17, [x1]\n  ldr q18, [x1]\n  ldr q19, [x1]\n  ldr q20, [x1]\n  ldr q16, [x1]\n  ldr q17, [x1]\n  ldr q18, [x1]\n  ldr q19, [x1]\n  ldr q20, [x1]\n  ldr q16, [x1]\n  ldr q17, [x1]\n  ldr q18, [x1]\n  ldr q19, [x1]\n  ldr q20, [x1]\n  sub x0, x0, x14\n  cbnz x0, vecloadtest_loop\n  ldp x14, x15, [sp, #0x10]\n  add sp, sp, #0x20\n  ret\n\n_vecstoretest:\nvecstoretest:\n  sub sp, sp, #0x20\n  stp x14, x15, [sp, #0x10]\n  mov x14, 20\n  ldr q16, [x1]\n  ldr q17, [x1]\n  ldr q18, [x1]\n  ldr q19, [x1]\n  ldr q20, [x1]\nvecstoretest_loop:\n  str q16, [x2]\n  str q17, [x2]\n  str q18, [x2]\n  str q19, [x2]\n  str q20, [x2]\n  str q16, [x2]\n  str q17, [x2]\n  str q18, [x2]\n  str q19, [x2]\n  str q20, [x2]\n  str q16, [x2]\n  str q17, [x2]\n  str q18, [x2]\n  str q19, [x2]\n  str q20, [x2]\n  str q16, [x2]\n  str q17, [x2]\n  str q18, [x2]\n  str q19, [x2]\n  str q20, [x2]\n  sub x0, x0, x14\n  cbnz x0, vecstoretest_loop\n  ldp x14, x15, [sp, #0x10]\n  add sp, sp, #0x20\n  ret\n\n_loadtest:\nloadtest:\n  sub sp, sp, #0x50\n  stp x14, x15, [sp, #0x10]\n  stp x12, x13, [sp, #0x20]\n  stp x10, x11, [sp, #0x30]\n  stp x8, x9, [sp, #0x40]\n  mov x14, 20\nloadtest_loop:\n  ldr x10, [x1]\n  ldr x11, [x1]\n  ldr x12, [x1]\n 
 ldr x13, [x1]\n  ldr x15, [x1]\n  ldr x10, [x1]\n  ldr x11, [x1]\n  ldr x12, [x1]\n  ldr x13, [x1]\n  ldr x15, [x1]\n  ldr x10, [x1]\n  ldr x11, [x1]\n  ldr x12, [x1]\n  ldr x13, [x1]\n  ldr x15, [x1]\n  ldr x10, [x1]\n  ldr x11, [x1]\n  ldr x12, [x1]\n  ldr x13, [x1]\n  ldr x15, [x1]\n  sub x0, x0, x14\n  cbnz x0, loadtest_loop\n  ldp x8, x9, [sp, #0x40]\n  ldp x10, x11, [sp, #0x30]\n  ldp x12, x13, [sp, #0x20]\n  ldp x14, x15, [sp, #0x10]\n  add sp, sp, #0x50\n  ret\n\n_mixloadstoretest:\nmixloadstoretest:\n  sub sp, sp, #0x50\n  stp x14, x15, [sp, #0x10]\n  stp x12, x13, [sp, #0x20]\n  stp x10, x11, [sp, #0x30]\n  stp x8, x9, [sp, #0x40]\n  mov x14, 20\nmixloadstoretest_loop:\n  ldr x10, [x1]\n  str x14, [x2]\n  ldr x11, [x1]\n  str x14, [x2]\n  ldr x12, [x1]\n  str x14, [x2]\n  ldr x13, [x1]\n  str x14, [x2]\n  ldr x15, [x1]\n  str x14, [x2]\n  ldr x10, [x1]\n  str x14, [x2]\n  ldr x11, [x1]\n  str x14, [x2]\n  ldr x12, [x1]\n  str x14, [x2]\n  ldr x13, [x1]\n  str x14, [x2]\n  ldr x15, [x1]\n  str x14, [x2]\n  sub x0, x0, x14\n  cbnz x0, mixloadstoretest_loop\n  ldp x8, x9, [sp, #0x40]\n  ldp x10, x11, [sp, #0x30]\n  ldp x12, x13, [sp, #0x20]\n  ldp x14, x15, [sp, #0x10]\n  add sp, sp, #0x50\n  ret\n\n_mix21loadstoretest:\nmix21loadstoretest:\n  sub sp, sp, #0x50\n  stp x14, x15, [sp, #0x10]\n  stp x12, x13, [sp, #0x20]\n  stp x10, x11, [sp, #0x30]\n  stp x8, x9, [sp, #0x40]\n  mov x14, 30\nmix21loadstoretest_loop:\n  ldr x10, [x1]\n  ldr x11, [x1]\n  str x14, [x2]\n  ldr x12, [x1]\n  ldr x13, [x1]\n  str x14, [x2]\n  ldr x10, [x1]\n  ldr x11, [x1]\n  str x14, [x2]\n  ldr x12, [x1]\n  ldr x13, [x1]\n  str x14, [x2]\n  ldr x10, [x1]\n  ldr x11, [x1]\n  str x14, [x2]\n  ldr x12, [x1]\n  ldr x13, [x1]\n  str x14, [x2]\n  ldr x10, [x1]\n  ldr x11, [x1]\n  str x14, [x2]\n  ldr x12, [x1]\n  ldr x13, [x1]\n  str x14, [x2]\n  ldr x10, [x1]\n  ldr x11, [x1]\n  str x14, [x2]\n  ldr x12, [x1]\n  ldr x13, [x1]\n  str x14, [x2]\n  sub x0, x0, x14\n  cbnz x0, 
mix21loadstoretest_loop\n  ldp x8, x9, [sp, #0x40]\n  ldp x10, x11, [sp, #0x30]\n  ldp x12, x13, [sp, #0x20]\n  ldp x14, x15, [sp, #0x10]\n  add sp, sp, #0x50\n  ret\n\n_jmptest:\njmptest:\n  sub sp, sp, #0x20\n  stp x14, x15, [sp, #0x10]\n  mov x14, 20\njmptest_loop:\n  cbz x0, jmptest_jellydonut\n  cbz x0, jmptest_jellydonut\n  cbz x0, jmptest_jellydonut\n  cbz x0, jmptest_jellydonut\n  cbz x0, jmptest_jellydonut\n  cbz x0, jmptest_jellydonut\n  cbz x0, jmptest_jellydonut\n  cbz x0, jmptest_jellydonut\n  cbz x0, jmptest_jellydonut\n  cbz x0, jmptest_jellydonut\n  cbz x0, jmptest_jellydonut\n  cbz x0, jmptest_jellydonut\n  cbz x0, jmptest_jellydonut\n  cbz x0, jmptest_jellydonut\n  cbz x0, jmptest_jellydonut\n  cbz x0, jmptest_jellydonut\n  cbz x0, jmptest_jellydonut\n  cbz x0, jmptest_jellydonut\n  cbz x0, jmptest_jellydonut\n  cbz x0, jmptest_jellydonut\n  sub x0, x0, x14\n  cbnz x0, jmptest_loop\njmptest_jellydonut:\n  ldp x14, x15, [sp, #0x10]\n  add sp, sp, #0x20\n  ret\n\n_fusejmptest:\nfusejmptest:\n  sub sp, sp, #0x20\n  stp x14, x15, [sp, #0x10]\n  mov x14, 20\nfusejmptest_loop:\n  nop\n  nop\n  cmp x0, 0\n  b.eq fusejmptest_jellydonut\n  nop\n  nop\n  cmp x0, 0\n  b.eq fusejmptest_jellydonut\n  nop\n  nop\n  cmp x0, 0\n  b.eq fusejmptest_jellydonut\n  nop\n  nop\n  cmp x0, 0\n  b.eq fusejmptest_jellydonut\n  nop\n  sub x0, x0, x14\n  cmp x0, 0\n  b.ne fusejmptest_loop\nfusejmptest_jellydonut:\n  ldp x14, x15, [sp, #0x10]\n  add sp, sp, #0x20\n  ret\n\n_mixmuljmptest:\nmixmuljmptest:\n  sub sp, sp, #0x50\n  stp x14, x15, [sp, #0x10]\n  stp x12, x13, [sp, #0x20]\n  stp x10, x11, [sp, #0x30]\n  stp x8, x9, [sp, #0x40]\n  mov x8, 7\n  mov x9, 6\n  mov x10, 1\n  mov x11, 2\n  mov x12, 3\n  mov x13, 4\n  mov x15, 5\n  mov x14, 20\nmixmuljmptest_loop:\n  mul x10, x10, x15\n  mul x11, x11, x15\n  mul x12, x12, x15\n  mul x13, x13, x15\n  mul x9, x9, x15\n  mul x8, x8, x15\n  mul x10, x10, x15\n  mul x11, x11, x15\n  mul x12, x12, x15\n  mul x13, x13, x15\n  cbz x0, 
mixmuljmptest_jellydonut\n  cbz x0, mixmuljmptest_jellydonut\n  cbz x0, mixmuljmptest_jellydonut\n  cbz x0, mixmuljmptest_jellydonut\n  cbz x0, mixmuljmptest_jellydonut\n  cbz x0, mixmuljmptest_jellydonut\n  cbz x0, mixmuljmptest_jellydonut\n  cbz x0, mixmuljmptest_jellydonut\n  cbz x0, mixmuljmptest_jellydonut\n  cbz x0, mixmuljmptest_jellydonut\n  sub x0, x0, x14\n  cbnz x0, mixmuljmptest_loop\nmixmuljmptest_jellydonut:\n  ldp x8, x9, [sp, #0x40]\n  ldp x10, x11, [sp, #0x30]\n  ldp x12, x13, [sp, #0x20]\n  ldp x14, x15, [sp, #0x10]\n  add sp, sp, #0x50\n  ret\n\n_mixmuljmptest21:\nmixmuljmptest21:\n  sub sp, sp, #0x50\n  stp x14, x15, [sp, #0x10]\n  stp x12, x13, [sp, #0x20]\n  stp x10, x11, [sp, #0x30]\n  stp x8, x9, [sp, #0x40]\n  mov x8, 7\n  mov x9, 6\n  mov x10, 1\n  mov x11, 2\n  mov x12, 3\n  mov x13, 4\n  mov x15, 5\n  mov x14, 30\nmixmuljmptest21_loop:\n  mul x10, x10, x15\n  mul x11, x11, x15\n  mul x12, x12, x15\n  mul x13, x13, x15\n  mul x9, x9, x15\n  mul x8, x8, x15\n  mul x10, x10, x15\n  mul x11, x11, x15\n  mul x12, x12, x15\n  mul x13, x13, x15\n  cbz x0, mixmuljmptest21_jellydonut\n  cbz x0, mixmuljmptest21_jellydonut\n  cbz x0, mixmuljmptest21_jellydonut\n  cbz x0, mixmuljmptest21_jellydonut\n  cbz x0, mixmuljmptest21_jellydonut\n  cbz x0, mixmuljmptest21_jellydonut\n  cbz x0, mixmuljmptest21_jellydonut\n  cbz x0, mixmuljmptest21_jellydonut\n  cbz x0, mixmuljmptest21_jellydonut\n  cbz x0, mixmuljmptest21_jellydonut\n  mul x10, x10, x15\n  mul x11, x11, x15\n  mul x12, x12, x15\n  mul x13, x13, x15\n  mul x9, x9, x15\n  mul x8, x8, x15\n  mul x10, x10, x15\n  mul x11, x11, x15\n  mul x12, x12, x15\n  mul x13, x13, x15\n  sub x0, x0, x14\n  cbnz x0, mixmuljmptest21_loop\nmixmuljmptest21_jellydonut:\n  ldp x8, x9, [sp, #0x40]\n  ldp x10, x11, [sp, #0x30]\n  ldp x12, x13, [sp, #0x20]\n  ldp x14, x15, [sp, #0x10]\n  add sp, sp, #0x50\n  ret\n\n_mixaddjmptest:\nmixaddjmptest:\n  sub sp, sp, #0x50\n  stp x14, x15, [sp, #0x10]\n  stp x12, x13, [sp, 
#0x20]\n  stp x10, x11, [sp, #0x30]\n  stp x8, x9, [sp, #0x40]\n  mov x8, 7\n  mov x9, 6\n  mov x10, 1\n  mov x11, 2\n  mov x12, 3\n  mov x13, 4\n  mov x15, 5\n  mov x14, 20\nmixaddjmptest_loop:\n  add x10, x10, x15\n  add x11, x11, x15\n  add x12, x12, x15\n  add x13, x13, x15\n  add x9, x9, x15\n  add x8, x8, x15\n  add x10, x10, x15\n  add x11, x11, x15\n  add x12, x12, x15\n  add x13, x13, x15\n  cbz x0, mixaddjmptest_jellydonut\n  cbz x0, mixaddjmptest_jellydonut\n  cbz x0, mixaddjmptest_jellydonut\n  cbz x0, mixaddjmptest_jellydonut\n  cbz x0, mixaddjmptest_jellydonut\n  cbz x0, mixaddjmptest_jellydonut\n  cbz x0, mixaddjmptest_jellydonut\n  cbz x0, mixaddjmptest_jellydonut\n  cbz x0, mixaddjmptest_jellydonut\n  cbz x0, mixaddjmptest_jellydonut\n  sub x0, x0, x14\n  cbnz x0, mixaddjmptest_loop\nmixaddjmptest_jellydonut:\n  ldp x8, x9, [sp, #0x40]\n  ldp x10, x11, [sp, #0x30]\n  ldp x12, x13, [sp, #0x20]\n  ldp x14, x15, [sp, #0x10]\n  add sp, sp, #0x50\n  ret\n\n_mixaddjmp21test:\nmixaddjmp21test:\n  sub sp, sp, #0x50\n  stp x14, x15, [sp, #0x10]\n  stp x12, x13, [sp, #0x20]\n  stp x10, x11, [sp, #0x30]\n  stp x8, x9, [sp, #0x40] \n  mov x8, 7\n  mov x9, 6\n  mov x10, 1\n  mov x11, 2\n  mov x12, 3\n  mov x13, 4\n  mov x15, 5\n  mov x14, 15\nmixaddjmp21test_loop:\n  add x10, x10, x15\n  add x11, x11, x15\n  cbz x0, mixaddjmp21test_jellydonut\n  \n  add x12, x12, x15\n  add x13, x13, x15\n  cbz x0, mixaddjmp21test_jellydonut\n  \n  add x9, x9, x15\n  add x8, x8, x15\n  cbz x0, mixaddjmp21test_jellydonut\n  \n  add x10, x10, x15\n  add x11, x11, x15\n  cbz x0, mixaddjmp21test_jellydonut\n  \n  add x12, x12, x15\n  add x13, x13, x15\n  cbz x0, mixaddjmp21test_jellydonut\n\n  sub x0, x0, x14\n  cmp x0, 0\n  b.gt mixaddjmp21test_loop\nmixaddjmp21test_jellydonut:\n  ldp x8, x9, [sp, #0x40]\n  ldp x10, x11, [sp, #0x30]\n  ldp x12, x13, [sp, #0x20]\n  ldp x14, x15, [sp, #0x10]\n  add sp, sp, #0x50 \n  ret\n\n_mixmulrortest:\nmixmulrortest:\n  sub sp, sp, #0x80\n  stp 
x14, x15, [sp, #0x10]\n  stp x12, x13, [sp, #0x20]\n  stp x10, x11, [sp, #0x30]\n  stp x8, x9, [sp, #0x40]\n  stp x19, x20, [sp, #0x50]\n  stp x21, x22, [sp, #0x60]\n  stp x23, x24, [sp, #0x70]\n  mov x8, 7\n  mov x9, 6\n  mov x10, 1\n  mov x11, 2\n  mov x12, 3\n  mov x13, 4\n  mov x15, 5\n  mov x19, x8\n  mov x20, x8\n  mov x21, x8\n  mov x22, x8\n  mov x23, x8\n  mov x24, x8\n  mov x14, 20\nmixmulrortest_loop:\n  ror x24, x24, 1\n  ror x23, x23, 1\n  ror x22, x22, 1\n  ror x21, x21, 1\n  ror x20, x20, 1\n  mul x10, x10, x15\n  mul x11, x11, x15\n  mul x12, x12, x15\n  mul x13, x13, x15\n  mul x9, x9, x15\n  ror x24, x24, 1\n  ror x23, x23, 1\n  ror x22, x22, 1\n  ror x21, x21, 1\n  ror x20, x20, 1\n  mul x8, x8, x15\n  mul x10, x10, x15\n  mul x11, x11, x15\n  mul x12, x12, x15\n  mul x13, x13, x15\n  sub x0, x0, x14\n  cbnz x0, mixmulrortest_loop\n  ldp x23, x24, [sp, #0x70]\n  ldp x21, x22, [sp, #0x60]\n  ldp x19, x20, [sp, #0x50]\n  ldp x8, x9, [sp, #0x40]\n  ldp x10, x11, [sp, #0x30]\n  ldp x12, x13, [sp, #0x20]\n  ldp x14, x15, [sp, #0x10]\n  add sp, sp, #0x80\n  ret\n\n_rortest:\nrortest:\n  sub sp, sp, #0x50\n  stp x14, x15, [sp, #0x10]\n  stp x12, x13, [sp, #0x20]\n  stp x10, x11, [sp, #0x30]\n  stp x8, x9, [sp, #0x40]\n  mov x8, 7\n  mov x9, 6\n  mov x10, 1\n  mov x11, 2\n  mov x12, 3\n  mov x13, 4\n  mov x15, 5\n  mov x14, 20\nrortest_loop:\n  ror x10, x10, 1\n  ror x11, x11, 1\n  ror x12, x12, 1\n  ror x13, x13, 1\n  ror x9, x9, 1\n  ror x8, x8, 1\n  ror x10, x10, 1\n  ror x11, x11, 1\n  ror x12, x12, 1\n  ror x13, x13, 1\n  ror x10, x10, 1\n  ror x11, x11, 1\n  ror x12, x12, 1\n  ror x13, x13, 1\n  ror x9, x9, 1\n  ror x8, x8, 1\n  ror x10, x10, 1\n  ror x11, x11, 1\n  ror x12, x12, 1\n  ror x13, x13, 1\n  sub x0, x0, x14\n  cbnz x0, rortest_loop\nrortest_jellydonut:\n  ldp x8, x9, [sp, #0x40]\n  ldp x10, x11, [sp, #0x30]\n  ldp x12, x13, [sp, #0x20]\n  ldp x14, x15, [sp, #0x10]\n  add sp, sp, #0x50\n  ret\n\n_depmovtest:\ndepmovtest:\n  sub sp, sp, 
#0x40\n  stp x14, x15, [sp, #0x10]\n  stp x12, x13, [sp, #0x20]\n  stp x10, x11, [sp, #0x30]\n  mov x15, 1\n  mov x14, 20\n  eor x13, x13, x13\ndepmovtest_loop:\n  mov x12, x15\n  mov x10, x12\n  mov x13, x10\n  mov x11, x13\n  mov x15, x11\n  mov x12, x15\n  mov x10, x12\n  mov x13, x10\n  mov x11, x13\n  mov x15, x11\n  mov x12, x15\n  mov x10, x12\n  mov x13, x10\n  mov x11, x13\n  mov x15, x11\n  mov x12, x15\n  mov x10, x12\n  mov x13, x10\n  mov x11, x13\n  mov x15, x11\n  sub x0, x0, x14\n  cbnz x0, depmovtest_loop\n  ldp x10, x11, [sp, #0x30]\n  ldp x12, x13, [sp, #0x20]\n  ldp x14, x15, [sp, #0x10]\n  add sp, sp, #0x40\n  ret\n\n_indepmovtest:\nindepmovtest:\n  sub sp, sp, #0x50\n  stp x14, x15, [sp, #0x10]\n  stp x12, x13, [sp, #0x20]\n  stp x10, x11, [sp, #0x30]\n  stp x8, x9, [sp, #0x40]\n  mov x15, 1\n  mov x14, 20\n  eor x13, x13, x13\nindepmovtest_loop:\n  mov x10, x15\n  mov x11, x14\n  mov x12, x13\n  mov x9, x15\n  mov x8, x14\n  mov x10, x15\n  mov x11, x14\n  mov x12, x13\n  mov x9, x15\n  mov x8, x14\n  mov x10, x15\n  mov x11, x14\n  mov x12, x13\n  mov x9, x15\n  mov x8, x14\n  mov x10, x15\n  mov x11, x14\n  mov x12, x13\n  mov x9, x15\n  mov x8, x14\n  sub x0, x0, x14\n  cbnz x0, indepmovtest_loop\n  ldp x8, x9, [sp, #0x40]\n  ldp x10, x11, [sp, #0x30]\n  ldp x12, x13, [sp, #0x20]\n  ldp x14, x15, [sp, #0x10]\n  add sp, sp, #0x50\n  ret\n\n_xorzerotest:\nxorzerotest:\n  sub sp, sp, #0x50\n  stp x14, x15, [sp, #0x10]\n  stp x12, x13, [sp, #0x20]\n  stp x10, x11, [sp, #0x30]\n  stp x8, x9, [sp, #0x40]\n  mov x15, 1\n  mov x14, 20\nxorzerotest_loop:\n  eor x15, x15, x15\n  eor x15, x15, x15\n  eor x15, x15, x15\n  eor x15, x15, x15\n  eor x15, x15, x15\n  eor x15, x15, x15\n  eor x15, x15, x15\n  eor x15, x15, x15\n  eor x15, x15, x15\n  eor x15, x15, x15\n  eor x15, x15, x15\n  eor x15, x15, x15\n  eor x15, x15, x15\n  eor x15, x15, x15\n  eor x15, x15, x15\n  eor x15, x15, x15\n  eor x15, x15, x15\n  eor x15, x15, x15\n  eor x15, x15, x15\n  
eor x15, x15, x15\n  sub x0, x0, x14\n  cbnz x0, xorzerotest_loop\n  ldp x8, x9, [sp, #0x40]\n  ldp x10, x11, [sp, #0x30]\n  ldp x12, x13, [sp, #0x20]\n  ldp x14, x15, [sp, #0x10]\n  add sp, sp, #0x50\n  ret\n\n_movzerotest:\nmovzerotest:\n  sub sp, sp, #0x50\n  stp x14, x15, [sp, #0x10]\n  stp x12, x13, [sp, #0x20]\n  stp x10, x11, [sp, #0x30]\n  stp x8, x9, [sp, #0x40]\n  mov x15, 1\n  mov x14, 20\nmovzerotest_loop:\n  mov x15, 0\n  mov x15, 0\n  mov x15, 0\n  mov x15, 0\n  mov x15, 0\n  mov x15, 0\n  mov x15, 0\n  mov x15, 0\n  mov x15, 0\n  mov x15, 0\n  mov x15, 0\n  mov x15, 0\n  mov x15, 0\n  mov x15, 0\n  mov x15, 0\n  mov x15, 0\n  mov x15, 0\n  mov x15, 0\n  mov x15, 0\n  mov x15, 0\n  sub x0, x0, x14\n  cbnz x0, movzerotest_loop\n  ldp x8, x9, [sp, #0x40]\n  ldp x10, x11, [sp, #0x30]\n  ldp x12, x13, [sp, #0x20]\n  ldp x14, x15, [sp, #0x10]\n  add sp, sp, #0x50\n  ret\n\n_subzerotest:\nsubzerotest:\n  sub sp, sp, #0x50\n  stp x14, x15, [sp, #0x10]\n  stp x12, x13, [sp, #0x20]\n  stp x10, x11, [sp, #0x30]\n  stp x8, x9, [sp, #0x40]\n  mov x15, 1\n  mov x14, 20\nsubzerotest_loop:\n  sub x15, x15, x15\n  sub x15, x15, x15\n  sub x15, x15, x15\n  sub x15, x15, x15\n  sub x15, x15, x15\n  sub x15, x15, x15\n  sub x15, x15, x15\n  sub x15, x15, x15\n  sub x15, x15, x15\n  sub x15, x15, x15\n  sub x15, x15, x15\n  sub x15, x15, x15\n  sub x15, x15, x15\n  sub x15, x15, x15\n  sub x15, x15, x15\n  sub x15, x15, x15\n  sub x15, x15, x15\n  sub x15, x15, x15\n  sub x15, x15, x15\n  sub x15, x15, x15\n  sub x0, x0, x14\n  cbnz x0, subzerotest_loop\n  ldp x8, x9, [sp, #0x40]\n  ldp x10, x11, [sp, #0x30]\n  ldp x12, x13, [sp, #0x20]\n  ldp x14, x15, [sp, #0x10]\n  add sp, sp, #0x50\n  ret\n\n_aesetest:\naesetest:\n  sub sp, sp, #0x50\n  ldr q16, [x1]\n  ldr q17, [x1]\n  ldr q18, [x1]\n  ldr q19, [x1]\n  ldr q20, [x1]\n  ldr q21, [x1] \n  mov x14, 20\naesetest_loop:\n  aese v0.16b, v16.16b\n  aese v1.16b, v17.16b\n  aese v2.16b, v18.16b\n  aese v3.16b, v19.16b\n  aese 
v4.16b, v20.16b\n  aese v0.16b, v16.16b\n  aese v1.16b, v17.16b\n  aese v2.16b, v18.16b\n  aese v3.16b, v19.16b\n  aese v4.16b, v20.16b \n  aese v0.16b, v16.16b\n  aese v1.16b, v17.16b\n  aese v2.16b, v18.16b\n  aese v3.16b, v19.16b\n  aese v4.16b, v20.16b  \n  aese v0.16b, v16.16b\n  aese v1.16b, v17.16b\n  aese v2.16b, v18.16b\n  aese v3.16b, v19.16b\n  aese v4.16b, v20.16b \n  sub x0, x0, x14\n  cbnz x0, aesetest_loop\n  add sp, sp, #0x50\n  ret\n\n_mixaesevecadd128test:\nmixaesevecadd128test:\n  sub sp, sp, #0x50\n  ldr q16, [x1]\n  ldr q17, [x1]\n  ldr q18, [x1]\n  ldr q19, [x1]\n  ldr q20, [x1]\n  ldr q21, [x1] \n  mov x14, 20\nmixaesevecadd128test_loop:\n  aese v0.16b, v16.16b\n  add v5.4s, v9.4s, v16.4s\n  aese v1.16b, v17.16b\n  add v6.4s, v10.4s, v16.4s\n  aese v2.16b, v18.16b\n  add v7.4s, v11.4s, v16.4s\n  aese v3.16b, v19.16b\n  add v31.4s, v12.4s, v16.4s\n  aese v4.16b, v20.16b\n  add v30.4s, v13.4s, v16.4s\n  aese v0.16b, v16.16b\n  add v5.4s, v9.4s, v16.4s\n  aese v1.16b, v17.16b\n  add v6.4s, v10.4s, v16.4s\n  aese v2.16b, v18.16b\n  add v7.4s, v11.4s, v16.4s\n  aese v3.16b, v19.16b\n  add v31.4s, v12.4s, v16.4s\n  aese v4.16b, v20.16b\n  add v30.4s, v13.4s, v16.4s \n  sub x0, x0, x14\n  cbnz x0, mixaesevecadd128test_loop\n  add sp, sp, #0x50\n  ret \n\n_pmulltest:\npmulltest:\n  sub sp, sp, #0x50\n  ldr q16, [x1]\n  ldr q17, [x1]\n  ldr q18, [x1]\n  ldr q19, [x1]\n  ldr q20, [x1]\n  ldr q21, [x1] \n  mov x14, 20\npmulltest_loop:\n  pmull v0.1q, v16.1d, v17.1d\n  pmull v1.1q, v16.1d, v17.1d\n  pmull v2.1q, v16.1d, v17.1d\n  pmull v3.1q, v16.1d, v17.1d\n  pmull v4.1q, v16.1d, v17.1d\n  pmull v0.1q, v16.1d, v17.1d\n  pmull v1.1q, v16.1d, v17.1d\n  pmull v2.1q, v16.1d, v17.1d\n  pmull v3.1q, v16.1d, v17.1d\n  pmull v4.1q, v16.1d, v17.1d \n  pmull v0.1q, v16.1d, v17.1d\n  pmull v1.1q, v16.1d, v17.1d\n  pmull v2.1q, v16.1d, v17.1d\n  pmull v3.1q, v16.1d, v17.1d\n  pmull v4.1q, v16.1d, v17.1d \n  pmull v0.1q, v16.1d, v17.1d\n  pmull v1.1q, v16.1d, 
v17.1d\n  pmull v2.1q, v16.1d, v17.1d\n  pmull v3.1q, v16.1d, v17.1d\n  pmull v4.1q, v16.1d, v17.1d \n  sub x0, x0, x14\n  cbnz x0, pmulltest_loop\n  add sp, sp, #0x50\n  ret \n\n_mixpmulladd128test:\nmixpmulladd128test:\n  sub sp, sp, #0x50\n  ldr q16, [x1]\n  ldr q17, [x1]\n  ldr q18, [x1]\n  ldr q19, [x1]\n  ldr q20, [x1]\n  ldr q21, [x1] \n  mov x14, 20\nmixpmulladd128test_loop:\n  pmull v0.1q, v16.1d, v17.1d\n  add v5.4s, v9.4s, v16.4s\n  pmull v1.1q, v16.1d, v17.1d\n  add v6.4s, v9.4s, v16.4s\n  pmull v2.1q, v16.1d, v17.1d\n  add v7.4s, v9.4s, v16.4s\n  pmull v3.1q, v16.1d, v17.1d\n  add v31.4s, v9.4s, v16.4s\n  pmull v4.1q, v16.1d, v17.1d\n  add v30.4s, v9.4s, v16.4s\n  pmull v0.1q, v16.1d, v17.1d\n  add v5.4s, v9.4s, v16.4s\n  pmull v1.1q, v16.1d, v17.1d\n  add v6.4s, v9.4s, v16.4s\n  pmull v2.1q, v16.1d, v17.1d\n  add v7.4s, v9.4s, v16.4s\n  pmull v3.1q, v16.1d, v17.1d\n  add v31.4s, v9.4s, v16.4s\n  pmull v4.1q, v16.1d, v17.1d\n  add v30.4s, v9.4s, v16.4s \n  sub x0, x0, x14\n  cbnz x0, mixpmulladd128test_loop\n  add sp, sp, #0x50\n  ret \n"
  },
  {
    "path": "InstructionRate/riscv_instructionrate.c",
    "content": "#define  _GNU_SOURCE\n#include <stdio.h>\n#include <sys/time.h>\n#include <time.h>\n#include <stdint.h>\n#include <stdlib.h>\n#include <sys/types.h>\n#include <unistd.h>\n#include <string.h>\n\nfloat measureFunction(uint64_t iterations, float clockSpeedGhz, void *arr, uint64_t (*testfunc)(uint64_t, void *));\n\nextern uint64_t clktest(uint64_t iterations, void *data);\nextern uint64_t addtest(uint64_t iterations, void *data);\nextern uint64_t faddtest(uint64_t iterations, void *data);\nextern uint64_t fmultest(uint64_t iterations, void *data);\nextern uint64_t mixfaddfmultest(uint64_t iterations, void *data);\nextern uint64_t fmatest(uint64_t iterations, void *data);\nextern uint64_t faddlattest(uint64_t iterations, void *data);\nextern uint64_t fmullattest(uint64_t iterations, void *data);\nextern uint64_t fmalattest(uint64_t iterations, void *data);\n\nfloat fpTestArr[4] __attribute__ ((aligned (64))) = { 0.2, 1.5, 2.7, 3.14 };\nint intTestArr[4] __attribute__ ((aligned (64))) = { 1, 2, 3, 4 };\nint sinkArr[4] __attribute__ ((aligned (64))) = { 2, 3, 4, 5 };\n\nint main(int argc, char *argv[]) {\n  struct timeval startTv, endTv;\n  struct timezone startTz, endTz;\n  uint64_t iterations = 1500000000;\n  uint64_t iterationsHigh = iterations * 5;\n  uint64_t time_diff_ms;\n  float latency, opsPerNs, clockSpeedGhz;\n  if (argc > 1) {\n    for (int argIdx = 1; argIdx < argc; argIdx++) {\n      if (*(argv[argIdx]) == '-') {\n        char *arg = argv[argIdx] + 1;\n        if (strncmp(arg, \"iter\", 4) == 0) {\n\t  argIdx++;\n\t  int iterMul = atoi(argv[argIdx]);\n\t  iterations *= iterMul;\n\t  iterationsHigh *= iterMul;\n\t  fprintf(stderr, \"Scaled iterations by %d\\n\", iterMul);\n\t}\n      }\n    }\n  }\n\n  gettimeofday(&startTv, &startTz);\n  clktest(iterations, NULL);\n  gettimeofday(&endTv, &endTz);\n  time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 1000);\n  latency = 1e6 * (float)time_diff_ms / 
(float)iterations;\n  // clk speed should be 1/latency, assuming we got one add per clk, roughly\n  clockSpeedGhz = 1/latency;\n  printf(\"Estimated clock speed> %.2f GHz\\n\", clockSpeedGhz);\n\n  // integer side\n  printf(\"Adds per clk> %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, NULL, addtest));\n\n  // FP\n  printf(\"FP32 Adds per clk> %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, fpTestArr, faddtest));\n  printf(\"FP32 Add latency> %.2f cycles\\n\", 1 / measureFunction(iterations, clockSpeedGhz, fpTestArr, faddlattest));\n  printf(\"FP32 Multiplies per clk> %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, fpTestArr, fmultest));\n  printf(\"FP32 Multiply latency> %.2f cycles\\n\", 1 / measureFunction(iterations, clockSpeedGhz, fpTestArr, fmullattest));\n  printf(\"1:1 FP32 Add:Mul per clk> %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, fpTestArr, mixfaddfmultest));\n  printf(\"FP32 FMA per clk> %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, fpTestArr, fmatest));\n  printf(\"FP32 FMA latency> %.2f cycles\\n\", 1 / measureFunction(iterations, clockSpeedGhz, fpTestArr, fmalattest));\n\n  return 0;\n}\n\nfloat measureFunction(uint64_t iterations, float clockSpeedGhz, void *arr, uint64_t (*testfunc)(uint64_t, void *)) {\n  struct timeval startTv, endTv;\n  struct timezone startTz, endTz;\n  uint64_t time_diff_ms, retval;\n  float latency, opsPerNs;\n\n  gettimeofday(&startTv, &startTz);\n  retval = testfunc(iterations, arr);\n  gettimeofday(&endTv, &endTz);\n  time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 1000);\n  latency = 1e6 * (float)time_diff_ms / (float)iterations;\n  opsPerNs = 1/latency;\n  //printf(\"return value: %lu\\n\", retval);\n  return opsPerNs / clockSpeedGhz;\n}\n"
  },
  {
    "path": "InstructionRate/riscv_instructionrate.s",
    "content": ".text\n\n.global clktest\n.global addtest\n.global faddtest\n.global fmultest\n.global mixfaddfmultest\n.global fmatest\n.global faddlattest\n.global fmullattest\n.global fmalattest\n\n/* a0 = iterations, a1 = data arr */\nclktest:\n  mv t0, x0\n  mv t1, x0\n  addi t1, t1, 1\nclktest_loop:\n  add t0, t0, t1\n  add t0, t0, t1\n  add t0, t0, t1\n  add t0, t0, t1\n  add t0, t0, t1\n  add t0, t0, t1\n  add t0, t0, t1\n  add t0, t0, t1\n  add t0, t0, t1\n  add t0, t0, t1\n  add t0, t0, t1\n  add t0, t0, t1\n  add t0, t0, t1\n  add t0, t0, t1\n  add t0, t0, t1\n  add t0, t0, t1\n  add t0, t0, t1\n  add t0, t0, t1\n  add t0, t0, t1\n  add t0, t0, t1\n  addi a0, a0, -20\n  blt x0, a0, clktest_loop\n  ret\n\naddtest:\n  mv t0, x0\n  addi t0, t0, 1\n  mv t1, t0\n  mv t2, t0\n  mv t3, t0\n  mv t4, t0\n  mv t5, t0\n  mv t6, t0\naddtest_loop:\n  add t1, t1, t6\n  add t2, t2, t6\n  add t3, t3, t6\n  add t4, t4, t6\n  add t5, t5, t6\n  add t1, t1, t6\n  add t2, t2, t6\n  add t3, t3, t6\n  add t4, t4, t6\n  add t5, t5, t6 \n  add t1, t1, t6\n  add t2, t2, t6\n  add t3, t3, t6\n  add t4, t4, t6\n  add t5, t5, t6\n  add t1, t1, t6\n  add t2, t2, t6\n  add t3, t3, t6\n  add t4, t4, t6\n  add t5, t5, t6  \n  addi a0, a0, -20\n  blt x0, a0, addtest_loop\n  ret\n\n/* f0-7 are fp temporaries */\nfaddtest:\n  flw f0, (a1)\n  flw f1, 4(a1)\n  flw f2, 8(a1)\n  flw f3, 12(a1)\n  fsub.d f4, f4, f4\n  fsub.d f5, f5, f5\n  fsub.d f6, f6, f6\n  fsub.d f7, f7, f7\n  fadd.d f4, f4, f0\n  fadd.d f5, f5, f0\n  fadd.d f6, f6, f0\n  fadd.d f7, f7, f0\nfaddtest_loop:\n  fadd.d f1, f1, f0\n  fadd.d f2, f2, f0\n  fadd.d f3, f3, f0\n  fadd.d f4, f4, f0\n  fadd.d f5, f5, f0\n  fadd.d f6, f6, f0\n  fadd.d f7, f7, f0\n  fadd.d f1, f1, f0\n  fadd.d f2, f2, f0\n  fadd.d f3, f3, f0\n  fadd.d f4, f4, f0\n  fadd.d f5, f5, f0\n  fadd.d f6, f6, f0\n  fadd.d f7, f7, f0 \n  addi a0, a0, -14\n  blt x0, a0, faddtest_loop\n  ret\n\nfaddlattest:\n  flw f0, (a1)\n  flw f1, 4(a1)\n  flw f2, 8(a1)\n  flw f3, 
12(a1)\n  fsub.d f4, f4, f4\n  fsub.d f5, f5, f5\n  fsub.d f6, f6, f6\n  fsub.d f7, f7, f7\n  fadd.d f4, f4, f0\n  fadd.d f5, f5, f0\n  fadd.d f6, f6, f0\n  fadd.d f7, f7, f0\nfaddlattest_loop:\n  fadd.d f1, f1, f1\n  fadd.d f1, f1, f1\n  fadd.d f1, f1, f1\n  fadd.d f1, f1, f1\n  fadd.d f1, f1, f1\n  fadd.d f1, f1, f1\n  fadd.d f1, f1, f1\n  fadd.d f1, f1, f1\n  fadd.d f1, f1, f1\n  fadd.d f1, f1, f1\n  fadd.d f1, f1, f1\n  fadd.d f1, f1, f1\n  fadd.d f1, f1, f1\n  fadd.d f1, f1, f1 \n  addi a0, a0, -14\n  blt x0, a0, faddlattest_loop\n  ret \n\nfmultest:\n  flw f0, (a1)\n  flw f1, 4(a1)\n  flw f2, 8(a1)\n  flw f3, 12(a1)\n  fsub.d f4, f4, f4\n  fsub.d f5, f5, f5\n  fsub.d f6, f6, f6\n  fsub.d f7, f7, f7\n  fadd.d f4, f4, f0\n  fadd.d f5, f5, f0\n  fadd.d f6, f6, f0\n  fadd.d f7, f7, f0\nfmultest_loop:\n  fmul.d f1, f1, f0\n  fmul.d f2, f2, f0\n  fmul.d f3, f3, f0\n  fmul.d f4, f4, f0\n  fmul.d f5, f5, f0\n  fmul.d f6, f6, f0\n  fmul.d f7, f7, f0\n  fmul.d f1, f1, f0\n  fmul.d f2, f2, f0\n  fmul.d f3, f3, f0\n  fmul.d f4, f4, f0\n  fmul.d f5, f5, f0\n  fmul.d f6, f6, f0\n  fmul.d f7, f7, f0 \n  addi a0, a0, -14\n  blt x0, a0, fmultest_loop\n  ret \n\nfmullattest:\n  flw f0, (a1)\n  flw f1, 4(a1)\n  flw f2, 8(a1)\n  flw f3, 12(a1)\n  fsub.d f4, f4, f4\n  fsub.d f5, f5, f5\n  fsub.d f6, f6, f6\n  fsub.d f7, f7, f7\n  fadd.d f4, f4, f0\n  fadd.d f5, f5, f0\n  fadd.d f6, f6, f0\n  fadd.d f7, f7, f0\nfmullattest_loop:\n  fmul.d f1, f1, f1\n  fmul.d f1, f1, f1\n  fmul.d f1, f1, f1\n  fmul.d f1, f1, f1\n  fmul.d f1, f1, f1\n  fmul.d f1, f1, f1\n  fmul.d f1, f1, f1\n  fmul.d f1, f1, f1\n  fmul.d f1, f1, f1\n  fmul.d f1, f1, f1\n  fmul.d f1, f1, f1\n  fmul.d f1, f1, f1\n  fmul.d f1, f1, f1\n  fmul.d f1, f1, f1 \n  addi a0, a0, -14\n  blt x0, a0, fmullattest_loop\n  ret\n\nmixfaddfmultest:\n  flw f0, (a1)\n  flw f1, 4(a1)\n  flw f2, 8(a1)\n  flw f3, 12(a1)\n  fsub.d f4, f4, f4\n  fsub.d f5, f5, f5\n  fsub.d f6, f6, f6\n  fsub.d f7, f7, f7\n  fadd.d f4, f4, f0\n  fadd.d f5, 
f5, f0\n  fadd.d f6, f6, f0\n  fadd.d f7, f7, f0\nmixfaddfmultest_loop:\n  fadd.d f1, f1, f0\n  fmul.d f2, f2, f0\n  fadd.d f3, f3, f0\n  fmul.d f4, f4, f0\n  fadd.d f5, f5, f0\n  fmul.d f6, f6, f0\n  fadd.d f7, f7, f0\n  fmul.d f1, f1, f0\n  fadd.d f2, f2, f0\n  fmul.d f3, f3, f0\n  fadd.d f4, f4, f0\n  fmul.d f5, f5, f0\n  fadd.d f6, f6, f0\n  fmul.d f7, f7, f0 \n  addi a0, a0, -14\n  blt x0, a0, mixfaddfmultest_loop\n  ret  \n\nfmatest:\n  flw f0, (a1)\n  flw f1, 4(a1)\n  flw f2, 8(a1)\n  flw f3, 12(a1)\n  fsub.d f4, f4, f4\n  fsub.d f5, f5, f5\n  fsub.d f6, f6, f6\n  fsub.d f7, f7, f7\n  fadd.d f4, f4, f0\n  fadd.d f5, f5, f0\n  fadd.d f6, f6, f0\n  fadd.d f7, f7, f0\nfmatest_loop:\n  fmadd.d f1, f1, f1, f0\n  fmadd.d f2, f2, f2, f0\n  fmadd.d f3, f3, f3, f0\n  fmadd.d f4, f4, f4, f0\n  fmadd.d f5, f5, f5, f0\n  fmadd.d f6, f6, f6, f0\n  fmadd.d f7, f7, f7, f0\n  fmadd.d f1, f1, f1, f0\n  fmadd.d f2, f2, f2, f0\n  fmadd.d f3, f3, f3, f0\n  fmadd.d f4, f4, f4, f0\n  fmadd.d f5, f5, f5, f0\n  fmadd.d f6, f6, f6, f0\n  fmadd.d f7, f7, f7, f0 \n  addi a0, a0, -14\n  blt x0, a0, fmatest_loop\n  ret  \n\nfmalattest:\n  flw f0, (a1)\n  flw f1, 4(a1)\n  flw f2, 8(a1)\n  flw f3, 12(a1)\n  fsub.d f4, f4, f4\n  fsub.d f5, f5, f5\n  fsub.d f6, f6, f6\n  fsub.d f7, f7, f7\n  fadd.d f4, f4, f0\n  fadd.d f5, f5, f0\n  fadd.d f6, f6, f0\n  fadd.d f7, f7, f0\nfmalattest_loop:\n  fmadd.d f1, f1, f1, f0\n  fmadd.d f1, f1, f1, f0\n  fmadd.d f1, f1, f1, f0\n  fmadd.d f1, f1, f1, f0\n  fmadd.d f1, f1, f1, f0\n  fmadd.d f1, f1, f1, f0\n  fmadd.d f1, f1, f1, f0\n  fmadd.d f1, f1, f1, f0\n  fmadd.d f1, f1, f1, f0\n  fmadd.d f1, f1, f1, f0\n  fmadd.d f1, f1, f1, f0\n  fmadd.d f1, f1, f1, f0\n  fmadd.d f1, f1, f1, f0\n  fmadd.d f1, f1, f1, f0 \n  addi a0, a0, -14\n  blt x0, a0, fmalattest_loop\n  ret\n"
  },
  {
    "path": "InstructionRate/test.s",
    "content": "\nx86_instructionrate:     file format elf64-x86-64\n\n\nDisassembly of section .init:\n\n0000000000001000 <_init>:\n    1000:\tf3 0f 1e fa          \tendbr64 \n    1004:\t48 83 ec 08          \tsub    $0x8,%rsp\n    1008:\t48 8b 05 d9 cf 00 00 \tmov    0xcfd9(%rip),%rax        # dfe8 <__gmon_start__>\n    100f:\t48 85 c0             \ttest   %rax,%rax\n    1012:\t74 02                \tje     1016 <_init+0x16>\n    1014:\tff d0                \tcallq  *%rax\n    1016:\t48 83 c4 08          \tadd    $0x8,%rsp\n    101a:\tc3                   \tretq   \n\nDisassembly of section .plt:\n\n0000000000001020 <.plt>:\n    1020:\tff 35 62 cf 00 00    \tpushq  0xcf62(%rip)        # df88 <_GLOBAL_OFFSET_TABLE_+0x8>\n    1026:\tff 25 64 cf 00 00    \tjmpq   *0xcf64(%rip)        # df90 <_GLOBAL_OFFSET_TABLE_+0x10>\n    102c:\t0f 1f 40 00          \tnopl   0x0(%rax)\n\n0000000000001030 <strncmp@plt>:\n    1030:\tff 25 62 cf 00 00    \tjmpq   *0xcf62(%rip)        # df98 <strncmp@GLIBC_2.2.5>\n    1036:\t68 00 00 00 00       \tpushq  $0x0\n    103b:\te9 e0 ff ff ff       \tjmpq   1020 <.plt>\n\n0000000000001040 <__stack_chk_fail@plt>:\n    1040:\tff 25 5a cf 00 00    \tjmpq   *0xcf5a(%rip)        # dfa0 <__stack_chk_fail@GLIBC_2.4>\n    1046:\t68 01 00 00 00       \tpushq  $0x1\n    104b:\te9 d0 ff ff ff       \tjmpq   1020 <.plt>\n\n0000000000001050 <gettimeofday@plt>:\n    1050:\tff 25 52 cf 00 00    \tjmpq   *0xcf52(%rip)        # dfa8 <gettimeofday@GLIBC_2.2.5>\n    1056:\t68 02 00 00 00       \tpushq  $0x2\n    105b:\te9 c0 ff ff ff       \tjmpq   1020 <.plt>\n\n0000000000001060 <strcmp@plt>:\n    1060:\tff 25 4a cf 00 00    \tjmpq   *0xcf4a(%rip)        # dfb0 <strcmp@GLIBC_2.2.5>\n    1066:\t68 03 00 00 00       \tpushq  $0x3\n    106b:\te9 b0 ff ff ff       \tjmpq   1020 <.plt>\n\n0000000000001070 <strtol@plt>:\n    1070:\tff 25 42 cf 00 00    \tjmpq   *0xcf42(%rip)        # dfb8 <strtol@GLIBC_2.2.5>\n    1076:\t68 04 00 00 00       \tpushq  $0x4\n    
107b:\te9 a0 ff ff ff       \tjmpq   1020 <.plt>\n\n0000000000001080 <__printf_chk@plt>:\n    1080:\tff 25 3a cf 00 00    \tjmpq   *0xcf3a(%rip)        # dfc0 <__printf_chk@GLIBC_2.3.4>\n    1086:\t68 05 00 00 00       \tpushq  $0x5\n    108b:\te9 90 ff ff ff       \tjmpq   1020 <.plt>\n\n0000000000001090 <fwrite@plt>:\n    1090:\tff 25 32 cf 00 00    \tjmpq   *0xcf32(%rip)        # dfc8 <fwrite@GLIBC_2.2.5>\n    1096:\t68 06 00 00 00       \tpushq  $0x6\n    109b:\te9 80 ff ff ff       \tjmpq   1020 <.plt>\n\n00000000000010a0 <aligned_alloc@plt>:\n    10a0:\tff 25 2a cf 00 00    \tjmpq   *0xcf2a(%rip)        # dfd0 <aligned_alloc@GLIBC_2.16>\n    10a6:\t68 07 00 00 00       \tpushq  $0x7\n    10ab:\te9 70 ff ff ff       \tjmpq   1020 <.plt>\n\nDisassembly of section .plt.got:\n\n00000000000010b0 <__cxa_finalize@plt>:\n    10b0:\tff 25 42 cf 00 00    \tjmpq   *0xcf42(%rip)        # dff8 <__cxa_finalize@GLIBC_2.2.5>\n    10b6:\t66 90                \txchg   %ax,%ax\n\nDisassembly of section .text:\n\n00000000000010c0 <main>:\n    10c0:\tf3 0f 1e fa          \tendbr64 \n    10c4:\t41 57                \tpush   %r15\n    10c6:\t41 56                \tpush   %r14\n    10c8:\t41 55                \tpush   %r13\n    10ca:\t41 54                \tpush   %r12\n    10cc:\t41 89 fc             \tmov    %edi,%r12d\n    10cf:\tbf 40 00 00 00       \tmov    $0x40,%edi\n    10d4:\t55                   \tpush   %rbp\n    10d5:\t48 89 f5             \tmov    %rsi,%rbp\n    10d8:\tbe 00 10 00 00       \tmov    $0x1000,%esi\n    10dd:\t53                   \tpush   %rbx\n    10de:\t48 83 ec 58          \tsub    $0x58,%rsp\n    10e2:\t64 48 8b 04 25 28 00 \tmov    %fs:0x28,%rax\n    10e9:\t00 00 \n    10eb:\t48 89 44 24 48       \tmov    %rax,0x48(%rsp)\n    10f0:\t31 c0                \txor    %eax,%eax\n    10f2:\te8 a9 ff ff ff       \tcallq  10a0 <aligned_alloc@plt>\n    10f7:\t66 0f 6f 0d 91 af 00 \tmovdqa 0xaf91(%rip),%xmm1        # c090 <_IO_stdin_used+0x1090>\n    10fe:\t00 
\n    10ff:\t66 0f 6f 25 99 af 00 \tmovdqa 0xaf99(%rip),%xmm4        # c0a0 <_IO_stdin_used+0x10a0>\n    1106:\t00 \n    1107:\t48 89 05 f2 cf 00 00 \tmov    %rax,0xcff2(%rip)        # e100 <intTestArr>\n    110e:\t66 0f 6f 1d 9a af 00 \tmovdqa 0xaf9a(%rip),%xmm3        # c0b0 <_IO_stdin_used+0x10b0>\n    1115:\t00 \n    1116:\t48 8d 90 00 10 00 00 \tlea    0x1000(%rax),%rdx\n    111d:\t0f 1f 00             \tnopl   (%rax)\n    1120:\t66 0f 6f c1          \tmovdqa %xmm1,%xmm0\n    1124:\t48 83 c0 10          \tadd    $0x10,%rax\n    1128:\t66 0f d4 cc          \tpaddq  %xmm4,%xmm1\n    112c:\t66 0f 6f d0          \tmovdqa %xmm0,%xmm2\n    1130:\t66 0f d4 d3          \tpaddq  %xmm3,%xmm2\n    1134:\t0f c6 c2 88          \tshufps $0x88,%xmm2,%xmm0\n    1138:\t0f 29 40 f0          \tmovaps %xmm0,-0x10(%rax)\n    113c:\t48 39 c2             \tcmp    %rax,%rdx\n    113f:\t75 df                \tjne    1120 <main+0x60>\n    1141:\t49 be 00 eb 08 bf 01 \tmovabs $0x1bf08eb00,%r14\n    1148:\t00 00 00 \n    114b:\t41 83 fc 02          \tcmp    $0x2,%r12d\n    114f:\t0f 8f db 35 00 00    \tjg     4730 <main+0x3670>\n    1155:\t4c 8d 2d 94 cf 00 00 \tlea    0xcf94(%rip),%r13        # e0f0 <__cpu_model>\n    115c:\t41 f6 45 0d 02       \ttestb  $0x2,0xd(%r13)\n    1161:\t0f 85 a7 35 00 00    \tjne    470e <main+0x364e>\n    1167:\t41 f6 45 0d 04       \ttestb  $0x4,0xd(%r13)\n    116c:\t0f 85 7a 35 00 00    \tjne    46ec <main+0x362c>\n    1172:\t41 f6 45 0e 02       \ttestb  $0x2,0xe(%r13)\n    1177:\t0f 85 4d 35 00 00    \tjne    46ca <main+0x360a>\n    117d:\tb8 07 00 00 00       \tmov    $0x7,%eax\n    1182:\t31 c9                \txor    %ecx,%ecx\n    1184:\t0f a2                \tcpuid  \n    1186:\t81 e3 00 00 01 00    \tand    $0x10000,%ebx\n    118c:\t0f 85 af 1f 00 00    \tjne    3141 <main+0x2081>\n    1192:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    1196:\t0f 84 9c 47 00 00    \tje     5938 <main+0x4878>\n    119c:\tf2 0f 10 05 dc ae 00 \tmovsd  
0xaedc(%rip),%xmm0        # c080 <_IO_stdin_used+0x1080>\n    11a3:\t00 \n    11a4:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    11a9:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    11ae:\t48 8d 35 c3 a5 00 00 \tlea    0xa5c3(%rip),%rsi        # b778 <_IO_stdin_used+0x778>\n    11b5:\te8 c6 fe ff ff       \tcallq  1080 <__printf_chk@plt>\n    11ba:\tf3 0f 10 35 c6 ae 00 \tmovss  0xaec6(%rip),%xmm6        # c088 <_IO_stdin_used+0x1088>\n    11c1:\t00 \n    11c2:\tf3 0f 11 74 24 0c    \tmovss  %xmm6,0xc(%rsp)\n    11c8:\tf3 0f 11 74 24 08    \tmovss  %xmm6,0x8(%rsp)\n    11ce:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    11d2:\t0f 8e 6e 17 00 00    \tjle    2946 <main+0x1886>\n    11d8:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    11dc:\tba 05 00 00 00       \tmov    $0x5,%edx\n    11e1:\t48 8d 35 86 a5 00 00 \tlea    0xa586(%rip),%rsi        # b76e <_IO_stdin_used+0x76e>\n    11e8:\t4c 89 ef             \tmov    %r13,%rdi\n    11eb:\te8 40 fe ff ff       \tcallq  1030 <strncmp@plt>\n    11f0:\t85 c0                \ttest   %eax,%eax\n    11f2:\t0f 85 a3 17 00 00    \tjne    299b <main+0x18db>\n    11f8:\t48 8d 35 79 5b 00 00 \tlea    0x5b79(%rip),%rsi        # 6d78 <noptest1b>\n    11ff:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    1205:\t4c 89 f7             \tmov    %r14,%rdi\n    1208:\te8 33 98 00 00       \tcallq  aa40 <measureFunction>\n    120d:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    1212:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    1217:\t48 8d 35 2e a5 00 00 \tlea    0xa52e(%rip),%rsi        # b74c <_IO_stdin_used+0x74c>\n    121e:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    1222:\te8 59 fe ff ff       \tcallq  1080 <__printf_chk@plt>\n    1227:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    122b:\tba 05 00 00 00       \tmov    $0x5,%edx\n    1230:\t48 8d 35 8e 9f 00 00 \tlea    0x9f8e(%rip),%rsi        # b1c5 <_IO_stdin_used+0x1c5>\n    1237:\t4c 89 ef             \tmov    %r13,%rdi\n    123a:\te8 f1 fd ff ff       \tcallq 
 1030 <strncmp@plt>\n    123f:\t85 c0                \ttest   %eax,%eax\n    1241:\t0f 85 70 17 00 00    \tjne    29b7 <main+0x18f7>\n    1247:\t48 8d 35 f1 5a 00 00 \tlea    0x5af1(%rip),%rsi        # 6d3f <noptest>\n    124e:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    1254:\t4c 89 f7             \tmov    %r14,%rdi\n    1257:\te8 e4 97 00 00       \tcallq  aa40 <measureFunction>\n    125c:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    1261:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    1266:\t48 8d 35 c4 a4 00 00 \tlea    0xa4c4(%rip),%rsi        # b731 <_IO_stdin_used+0x731>\n    126d:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    1271:\te8 0a fe ff ff       \tcallq  1080 <__printf_chk@plt>\n    1276:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    127a:\tba 03 00 00 00       \tmov    $0x3,%edx\n    127f:\t48 8d 35 89 9f 00 00 \tlea    0x9f89(%rip),%rsi        # b20f <_IO_stdin_used+0x20f>\n    1286:\t4c 89 ef             \tmov    %r13,%rdi\n    1289:\te8 a2 fd ff ff       \tcallq  1030 <strncmp@plt>\n    128e:\t85 c0                \ttest   %eax,%eax\n    1290:\t0f 85 3d 17 00 00    \tjne    29d3 <main+0x1913>\n    1296:\t48 8d 35 01 5b 00 00 \tlea    0x5b01(%rip),%rsi        # 6d9e <addtest>\n    129d:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    12a3:\t4c 89 f7             \tmov    %r14,%rdi\n    12a6:\te8 95 97 00 00       \tcallq  aa40 <measureFunction>\n    12ab:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    12b0:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    12b5:\t48 8d 35 61 a4 00 00 \tlea    0xa461(%rip),%rsi        # b71d <_IO_stdin_used+0x71d>\n    12bc:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    12c0:\te8 bb fd ff ff       \tcallq  1080 <__printf_chk@plt>\n    12c5:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    12c9:\t48 8d 35 fb 9e 00 00 \tlea    0x9efb(%rip),%rsi        # b1cb <_IO_stdin_used+0x1cb>\n    12d0:\t4c 89 ef             \tmov    %r13,%rdi\n    12d3:\te8 88 fd ff ff       \tcallq  1060 <strcmp@plt>\n    
12d8:\t85 c0                \ttest   %eax,%eax\n    12da:\t0f 85 0a 17 00 00    \tjne    29ea <main+0x192a>\n    12e0:\t4c 8d 3d 43 5b 00 00 \tlea    0x5b43(%rip),%r15        # 6e2a <addnoptest>\n    12e7:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    12ed:\t4c 89 f7             \tmov    %r14,%rdi\n    12f0:\t4c 89 fe             \tmov    %r15,%rsi\n    12f3:\te8 48 97 00 00       \tcallq  aa40 <measureFunction>\n    12f8:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    12fd:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    1302:\t48 8d 35 f7 a3 00 00 \tlea    0xa3f7(%rip),%rsi        # b700 <_IO_stdin_used+0x700>\n    1309:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    130d:\te8 6e fd ff ff       \tcallq  1080 <__printf_chk@plt>\n    1312:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    1316:\t48 8d 35 b5 9e 00 00 \tlea    0x9eb5(%rip),%rsi        # b1d2 <_IO_stdin_used+0x1d2>\n    131d:\t4c 89 ef             \tmov    %r13,%rdi\n    1320:\te8 3b fd ff ff       \tcallq  1060 <strcmp@plt>\n    1325:\t85 c0                \ttest   %eax,%eax\n    1327:\t0f 85 d4 16 00 00    \tjne    2a01 <main+0x1941>\n    132d:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    1333:\t4c 89 fe             \tmov    %r15,%rsi\n    1336:\t4c 89 f7             \tmov    %r14,%rdi\n    1339:\te8 02 97 00 00       \tcallq  aa40 <measureFunction>\n    133e:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    1343:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    1348:\t48 8d 35 94 a3 00 00 \tlea    0xa394(%rip),%rsi        # b6e3 <_IO_stdin_used+0x6e3>\n    134f:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    1353:\te8 28 fd ff ff       \tcallq  1080 <__printf_chk@plt>\n    1358:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    135c:\tba 06 00 00 00       \tmov    $0x6,%edx\n    1361:\t48 8d 35 73 9e 00 00 \tlea    0x9e73(%rip),%rsi        # b1db <_IO_stdin_used+0x1db>\n    1368:\t4c 89 ef             \tmov    %r13,%rdi\n    136b:\te8 c0 fc ff ff       \tcallq  1030 <strncmp@plt>\n    
1370:\t85 c0                \ttest   %eax,%eax\n    1372:\t0f 85 a5 16 00 00    \tjne    2a1d <main+0x195d>\n    1378:\t48 8d 35 c8 91 00 00 \tlea    0x91c8(%rip),%rsi        # a547 <depmovtest>\n    137f:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    1385:\t4c 89 f7             \tmov    %r14,%rdi\n    1388:\te8 b3 96 00 00       \tcallq  aa40 <measureFunction>\n    138d:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    1392:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    1397:\t48 8d 35 27 a3 00 00 \tlea    0xa327(%rip),%rsi        # b6c5 <_IO_stdin_used+0x6c5>\n    139e:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    13a2:\te8 d9 fc ff ff       \tcallq  1080 <__printf_chk@plt>\n    13a7:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    13ab:\tba 08 00 00 00       \tmov    $0x8,%edx\n    13b0:\t48 8d 35 22 9e 00 00 \tlea    0x9e22(%rip),%rsi        # b1d9 <_IO_stdin_used+0x1d9>\n    13b7:\t4c 89 ef             \tmov    %r13,%rdi\n    13ba:\te8 71 fc ff ff       \tcallq  1030 <strncmp@plt>\n    13bf:\t85 c0                \ttest   %eax,%eax\n    13c1:\t0f 85 72 16 00 00    \tjne    2a39 <main+0x1979>\n    13c7:\t48 8d 35 ee 91 00 00 \tlea    0x91ee(%rip),%rsi        # a5bc <indepmovtest>\n    13ce:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    13d4:\t4c 89 f7             \tmov    %r14,%rdi\n    13d7:\te8 64 96 00 00       \tcallq  aa40 <measureFunction>\n    13dc:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    13e1:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    13e6:\t48 8d 35 3b ac 00 00 \tlea    0xac3b(%rip),%rsi        # c028 <_IO_stdin_used+0x1028>\n    13ed:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    13f1:\te8 8a fc ff ff       \tcallq  1080 <__printf_chk@plt>\n    13f6:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    13fa:\tba 07 00 00 00       \tmov    $0x7,%edx\n    13ff:\t48 8d 35 dc 9d 00 00 \tlea    0x9ddc(%rip),%rsi        # b1e2 <_IO_stdin_used+0x1e2>\n    1406:\t4c 89 ef             \tmov    %r13,%rdi\n    1409:\te8 22 fc ff ff   
    \tcallq  1030 <strncmp@plt>\n    140e:\t85 c0                \ttest   %eax,%eax\n    1410:\t0f 85 3f 16 00 00    \tjne    2a55 <main+0x1995>\n    1416:\t48 8d 35 e1 92 00 00 \tlea    0x92e1(%rip),%rsi        # a6fe <xorzerotest>\n    141d:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    1423:\t4c 89 f7             \tmov    %r14,%rdi\n    1426:\te8 15 96 00 00       \tcallq  aa40 <measureFunction>\n    142b:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    1430:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    1435:\t48 8d 35 71 a2 00 00 \tlea    0xa271(%rip),%rsi        # b6ad <_IO_stdin_used+0x6ad>\n    143c:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    1440:\te8 3b fc ff ff       \tcallq  1080 <__printf_chk@plt>\n    1445:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    1449:\tba 07 00 00 00       \tmov    $0x7,%edx\n    144e:\t48 8d 35 95 9d 00 00 \tlea    0x9d95(%rip),%rsi        # b1ea <_IO_stdin_used+0x1ea>\n    1455:\t4c 89 ef             \tmov    %r13,%rdi\n    1458:\te8 d3 fb ff ff       \tcallq  1030 <strncmp@plt>\n    145d:\t85 c0                \ttest   %eax,%eax\n    145f:\t0f 85 0c 16 00 00    \tjne    2a71 <main+0x19b1>\n    1465:\t48 8d 35 c7 91 00 00 \tlea    0x91c7(%rip),%rsi        # a633 <movzerotest>\n    146c:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    1472:\t4c 89 f7             \tmov    %r14,%rdi\n    1475:\te8 c6 95 00 00       \tcallq  aa40 <measureFunction>\n    147a:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    147f:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    1484:\t48 8d 35 0a a2 00 00 \tlea    0xa20a(%rip),%rsi        # b695 <_IO_stdin_used+0x695>\n    148b:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    148f:\te8 ec fb ff ff       \tcallq  1080 <__printf_chk@plt>\n    1494:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    1498:\tba 07 00 00 00       \tmov    $0x7,%edx\n    149d:\t48 8d 35 4e 9d 00 00 \tlea    0x9d4e(%rip),%rsi        # b1f2 <_IO_stdin_used+0x1f2>\n    14a4:\t4c 89 ef             \tmov    
%r13,%rdi\n    14a7:\te8 84 fb ff ff       \tcallq  1030 <strncmp@plt>\n    14ac:\t85 c0                \ttest   %eax,%eax\n    14ae:\t0f 85 d9 15 00 00    \tjne    2a8d <main+0x19cd>\n    14b4:\t48 8d 35 ba 92 00 00 \tlea    0x92ba(%rip),%rsi        # a775 <subzerotest>\n    14bb:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    14c1:\t4c 89 f7             \tmov    %r14,%rdi\n    14c4:\te8 77 95 00 00       \tcallq  aa40 <measureFunction>\n    14c9:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    14ce:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    14d3:\t48 8d 35 a3 a1 00 00 \tlea    0xa1a3(%rip),%rsi        # b67d <_IO_stdin_used+0x67d>\n    14da:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    14de:\te8 9d fb ff ff       \tcallq  1080 <__printf_chk@plt>\n    14e3:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    14e7:\tba 06 00 00 00       \tmov    $0x6,%edx\n    14ec:\t48 8d 35 07 9d 00 00 \tlea    0x9d07(%rip),%rsi        # b1fa <_IO_stdin_used+0x1fa>\n    14f3:\t4c 89 ef             \tmov    %r13,%rdi\n    14f6:\te8 35 fb ff ff       \tcallq  1030 <strncmp@plt>\n    14fb:\t85 c0                \ttest   %eax,%eax\n    14fd:\t0f 85 a6 15 00 00    \tjne    2aa9 <main+0x19e9>\n    1503:\t48 8d 35 6d 93 00 00 \tlea    0x936d(%rip),%rsi        # a877 <depinctest>\n    150a:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    1510:\t4c 89 f7             \tmov    %r14,%rdi\n    1513:\te8 28 95 00 00       \tcallq  aa40 <measureFunction>\n    1518:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    151d:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    1522:\t48 8d 35 3d a1 00 00 \tlea    0xa13d(%rip),%rsi        # b666 <_IO_stdin_used+0x666>\n    1529:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    152d:\te8 4e fb ff ff       \tcallq  1080 <__printf_chk@plt>\n    1532:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    1536:\tba 06 00 00 00       \tmov    $0x6,%edx\n    153b:\t48 8d 35 bf 9c 00 00 \tlea    0x9cbf(%rip),%rsi        # b201 <_IO_stdin_used+0x201>\n    
1542:\t4c 89 ef             \tmov    %r13,%rdi\n    1545:\te8 e6 fa ff ff       \tcallq  1030 <strncmp@plt>\n    154a:\t85 c0                \ttest   %eax,%eax\n    154c:\t0f 85 73 15 00 00    \tjne    2ac5 <main+0x1a05>\n    1552:\t48 8d 35 95 93 00 00 \tlea    0x9395(%rip),%rsi        # a8ee <depdectest>\n    1559:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    155f:\t4c 89 f7             \tmov    %r14,%rdi\n    1562:\te8 d9 94 00 00       \tcallq  aa40 <measureFunction>\n    1567:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    156c:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    1571:\t48 8d 35 d7 a0 00 00 \tlea    0xa0d7(%rip),%rsi        # b64f <_IO_stdin_used+0x64f>\n    1578:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    157c:\te8 ff fa ff ff       \tcallq  1080 <__printf_chk@plt>\n    1581:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    1585:\tba 06 00 00 00       \tmov    $0x6,%edx\n    158a:\t48 8d 35 70 9c 00 00 \tlea    0x9c70(%rip),%rsi        # b201 <_IO_stdin_used+0x201>\n    1591:\t4c 89 ef             \tmov    %r13,%rdi\n    1594:\te8 97 fa ff ff       \tcallq  1030 <strncmp@plt>\n    1599:\t85 c0                \ttest   %eax,%eax\n    159b:\t75 33                \tjne    15d0 <main+0x510>\n    159d:\t48 8d 35 48 92 00 00 \tlea    0x9248(%rip),%rsi        # a7ec <depaddimmtest>\n    15a4:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    15aa:\t4c 89 f7             \tmov    %r14,%rdi\n    15ad:\te8 8e 94 00 00       \tcallq  aa40 <measureFunction>\n    15b2:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    15b7:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    15bc:\t48 8d 35 3d aa 00 00 \tlea    0xaa3d(%rip),%rsi        # c000 <_IO_stdin_used+0x1000>\n    15c3:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    15c7:\te8 b4 fa ff ff       \tcallq  1080 <__printf_chk@plt>\n    15cc:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    15d0:\tba 06 00 00 00       \tmov    $0x6,%edx\n    15d5:\t48 8d 35 8b a1 00 00 \tlea    0xa18b(%rip),%rsi      
  # b767 <_IO_stdin_used+0x767>\n    15dc:\t4c 89 ef             \tmov    %r13,%rdi\n    15df:\te8 4c fa ff ff       \tcallq  1030 <strncmp@plt>\n    15e4:\t85 c0                \ttest   %eax,%eax\n    15e6:\t0f 85 f5 14 00 00    \tjne    2ae1 <main+0x1a21>\n    15ec:\t48 8d 35 b3 56 00 00 \tlea    0x56b3(%rip),%rsi        # 6ca6 <clkmovtest>\n    15f3:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    15f9:\t4c 89 f7             \tmov    %r14,%rdi\n    15fc:\te8 3f 94 00 00       \tcallq  aa40 <measureFunction>\n    1601:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    1606:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    160b:\t48 8d 35 c6 a9 00 00 \tlea    0xa9c6(%rip),%rsi        # bfd8 <_IO_stdin_used+0xfd8>\n    1612:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    1616:\te8 65 fa ff ff       \tcallq  1080 <__printf_chk@plt>\n    161b:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    161f:\tba 0a 00 00 00       \tmov    $0xa,%edx\n    1624:\t48 8d 35 dd 9b 00 00 \tlea    0x9bdd(%rip),%rsi        # b208 <_IO_stdin_used+0x208>\n    162b:\t4c 89 ef             \tmov    %r13,%rdi\n    162e:\te8 fd f9 ff ff       \tcallq  1030 <strncmp@plt>\n    1633:\t85 c0                \ttest   %eax,%eax\n    1635:\t0f 85 c2 14 00 00    \tjne    2afd <main+0x1a3d>\n    163b:\t48 8d 35 62 61 00 00 \tlea    0x6162(%rip),%rsi        # 77a4 <addmultest>\n    1642:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    1648:\t4c 89 f7             \tmov    %r14,%rdi\n    164b:\te8 f0 93 00 00       \tcallq  aa40 <measureFunction>\n    1650:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    1655:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    165a:\t48 8d 35 d1 9f 00 00 \tlea    0x9fd1(%rip),%rsi        # b632 <_IO_stdin_used+0x632>\n    1661:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    1665:\te8 16 fa ff ff       \tcallq  1080 <__printf_chk@plt>\n    166a:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    166e:\tba 06 00 00 00       \tmov    $0x6,%edx\n    1673:\t48 8d 35 99 9b 00 
00 \tlea    0x9b99(%rip),%rsi        # b213 <_IO_stdin_used+0x213>\n    167a:\t4c 89 ef             \tmov    %r13,%rdi\n    167d:\te8 ae f9 ff ff       \tcallq  1030 <strncmp@plt>\n    1682:\t85 c0                \ttest   %eax,%eax\n    1684:\t0f 85 8f 14 00 00    \tjne    2b19 <main+0x1a59>\n    168a:\t48 8d 35 56 60 00 00 \tlea    0x6056(%rip),%rsi        # 76e7 <jmpmultest>\n    1691:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    1697:\t4c 89 f7             \tmov    %r14,%rdi\n    169a:\te8 a1 93 00 00       \tcallq  aa40 <measureFunction>\n    169f:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    16a4:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    16a9:\t48 8d 35 67 9f 00 00 \tlea    0x9f67(%rip),%rsi        # b617 <_IO_stdin_used+0x617>\n    16b0:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    16b4:\te8 c7 f9 ff ff       \tcallq  1080 <__printf_chk@plt>\n    16b9:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    16bd:\tba 03 00 00 00       \tmov    $0x3,%edx\n    16c2:\t48 8d 35 53 9b 00 00 \tlea    0x9b53(%rip),%rsi        # b21c <_IO_stdin_used+0x21c>\n    16c9:\t4c 89 ef             \tmov    %r13,%rdi\n    16cc:\te8 5f f9 ff ff       \tcallq  1030 <strncmp@plt>\n    16d1:\t85 c0                \ttest   %eax,%eax\n    16d3:\t0f 85 5c 14 00 00    \tjne    2b35 <main+0x1a75>\n    16d9:\t48 8d 35 11 5e 00 00 \tlea    0x5e11(%rip),%rsi        # 74f1 <jmptest>\n    16e0:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    16e6:\t4c 89 f7             \tmov    %r14,%rdi\n    16e9:\te8 52 93 00 00       \tcallq  aa40 <measureFunction>\n    16ee:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    16f3:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    16f8:\t48 8d 35 ff 9e 00 00 \tlea    0x9eff(%rip),%rsi        # b5fe <_IO_stdin_used+0x5fe>\n    16ff:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    1703:\te8 78 f9 ff ff       \tcallq  1080 <__printf_chk@plt>\n    1708:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    170c:\tba 05 00 00 00       \tmov    
$0x5,%edx\n    1711:\t48 8d 35 02 9b 00 00 \tlea    0x9b02(%rip),%rsi        # b21a <_IO_stdin_used+0x21a>\n    1718:\t4c 89 ef             \tmov    %r13,%rdi\n    171b:\te8 10 f9 ff ff       \tcallq  1030 <strncmp@plt>\n    1720:\t85 c0                \ttest   %eax,%eax\n    1722:\t0f 85 29 14 00 00    \tjne    2b51 <main+0x1a91>\n    1728:\t48 8d 35 9a 5e 00 00 \tlea    0x5e9a(%rip),%rsi        # 75c9 <ntjmptest>\n    172f:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    1735:\t4c 89 f7             \tmov    %r14,%rdi\n    1738:\te8 03 93 00 00       \tcallq  aa40 <measureFunction>\n    173d:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    1742:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    1747:\t48 8d 35 9a 9e 00 00 \tlea    0x9e9a(%rip),%rsi        # b5e8 <_IO_stdin_used+0x5e8>\n    174e:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    1752:\te8 29 f9 ff ff       \tcallq  1080 <__printf_chk@plt>\n    1757:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    175b:\tba 04 00 00 00       \tmov    $0x4,%edx\n    1760:\t48 8d 35 b9 9a 00 00 \tlea    0x9ab9(%rip),%rsi        # b220 <_IO_stdin_used+0x220>\n    1767:\t4c 89 ef             \tmov    %r13,%rdi\n    176a:\te8 c1 f8 ff ff       \tcallq  1030 <strncmp@plt>\n    176f:\t85 c0                \ttest   %eax,%eax\n    1771:\t0f 85 f6 13 00 00    \tjne    2b6d <main+0x1aad>\n    1777:\t48 8d 35 b2 8b 00 00 \tlea    0x8bb2(%rip),%rsi        # a330 <pdeptest>\n    177e:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    1784:\t4c 89 f7             \tmov    %r14,%rdi\n    1787:\te8 b4 92 00 00       \tcallq  aa40 <measureFunction>\n    178c:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    1791:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    1796:\t48 8d 35 37 9e 00 00 \tlea    0x9e37(%rip),%rsi        # b5d4 <_IO_stdin_used+0x5d4>\n    179d:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    17a1:\te8 da f8 ff ff       \tcallq  1080 <__printf_chk@plt>\n    17a6:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    
17aa:\tba 04 00 00 00       \tmov    $0x4,%edx\n    17af:\t48 8d 35 6f 9a 00 00 \tlea    0x9a6f(%rip),%rsi        # b225 <_IO_stdin_used+0x225>\n    17b6:\t4c 89 ef             \tmov    %r13,%rdi\n    17b9:\te8 72 f8 ff ff       \tcallq  1030 <strncmp@plt>\n    17be:\t85 c0                \ttest   %eax,%eax\n    17c0:\t0f 85 c3 13 00 00    \tjne    2b89 <main+0x1ac9>\n    17c6:\t48 8d 35 c6 8c 00 00 \tlea    0x8cc6(%rip),%rsi        # a493 <pexttest>\n    17cd:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    17d3:\t4c 89 f7             \tmov    %r14,%rdi\n    17d6:\te8 65 92 00 00       \tcallq  aa40 <measureFunction>\n    17db:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    17e0:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    17e5:\t48 8d 35 d4 9d 00 00 \tlea    0x9dd4(%rip),%rsi        # b5c0 <_IO_stdin_used+0x5c0>\n    17ec:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    17f0:\te8 8b f8 ff ff       \tcallq  1080 <__printf_chk@plt>\n    17f5:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    17f9:\tba 07 00 00 00       \tmov    $0x7,%edx\n    17fe:\t48 8d 35 25 9a 00 00 \tlea    0x9a25(%rip),%rsi        # b22a <_IO_stdin_used+0x22a>\n    1805:\t4c 89 ef             \tmov    %r13,%rdi\n    1808:\te8 23 f8 ff ff       \tcallq  1030 <strncmp@plt>\n    180d:\t85 c0                \ttest   %eax,%eax\n    180f:\t0f 85 90 13 00 00    \tjne    2ba5 <main+0x1ae5>\n    1815:\t48 8d 35 c8 8b 00 00 \tlea    0x8bc8(%rip),%rsi        # a3e4 <pdepmultest>\n    181c:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    1822:\t4c 89 f7             \tmov    %r14,%rdi\n    1825:\te8 16 92 00 00       \tcallq  aa40 <measureFunction>\n    182a:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    182f:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    1834:\t48 8d 35 69 9d 00 00 \tlea    0x9d69(%rip),%rsi        # b5a4 <_IO_stdin_used+0x5a4>\n    183b:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    183f:\te8 3c f8 ff ff       \tcallq  1080 <__printf_chk@plt>\n    1844:\t4c 8b 6d 08      
    \tmov    0x8(%rbp),%r13\n    1848:\tba 03 00 00 00       \tmov    $0x3,%edx\n    184d:\t48 8d 35 e8 99 00 00 \tlea    0x99e8(%rip),%rsi        # b23c <_IO_stdin_used+0x23c>\n    1854:\t4c 89 ef             \tmov    %r13,%rdi\n    1857:\te8 d4 f7 ff ff       \tcallq  1030 <strncmp@plt>\n    185c:\t85 c0                \ttest   %eax,%eax\n    185e:\t0f 85 5d 13 00 00    \tjne    2bc1 <main+0x1b01>\n    1864:\t48 8d 35 5b 57 00 00 \tlea    0x575b(%rip),%rsi        # 6fc6 <shltest>\n    186b:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    1871:\t4c 89 f7             \tmov    %r14,%rdi\n    1874:\te8 c7 91 00 00       \tcallq  aa40 <measureFunction>\n    1879:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    187e:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    1883:\t48 8d 35 03 9d 00 00 \tlea    0x9d03(%rip),%rsi        # b58d <_IO_stdin_used+0x58d>\n    188a:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    188e:\te8 ed f7 ff ff       \tcallq  1080 <__printf_chk@plt>\n    1893:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    1897:\tba 03 00 00 00       \tmov    $0x3,%edx\n    189c:\t48 8d 35 8f 99 00 00 \tlea    0x998f(%rip),%rsi        # b232 <_IO_stdin_used+0x232>\n    18a3:\t4c 89 ef             \tmov    %r13,%rdi\n    18a6:\te8 85 f7 ff ff       \tcallq  1030 <strncmp@plt>\n    18ab:\t85 c0                \ttest   %eax,%eax\n    18ad:\t0f 85 2a 13 00 00    \tjne    2bdd <main+0x1b1d>\n    18b3:\t48 8d 35 80 56 00 00 \tlea    0x5680(%rip),%rsi        # 6f3a <rortest>\n    18ba:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    18c0:\t4c 89 f7             \tmov    %r14,%rdi\n    18c3:\te8 78 91 00 00       \tcallq  aa40 <measureFunction>\n    18c8:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    18cd:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    18d2:\t48 8d 35 9d 9c 00 00 \tlea    0x9c9d(%rip),%rsi        # b576 <_IO_stdin_used+0x576>\n    18d9:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    18dd:\te8 9e f7 ff ff       \tcallq  1080 
<__printf_chk@plt>\n    18e2:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    18e6:\tba 09 00 00 00       \tmov    $0x9,%edx\n    18eb:\t48 8d 35 44 99 00 00 \tlea    0x9944(%rip),%rsi        # b236 <_IO_stdin_used+0x236>\n    18f2:\t4c 89 ef             \tmov    %r13,%rdi\n    18f5:\te8 36 f7 ff ff       \tcallq  1030 <strncmp@plt>\n    18fa:\t85 c0                \ttest   %eax,%eax\n    18fc:\t0f 85 f7 12 00 00    \tjne    2bf9 <main+0x1b39>\n    1902:\t48 8d 35 49 57 00 00 \tlea    0x5749(%rip),%rsi        # 7052 <mixrorshltest>\n    1909:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    190f:\t4c 89 f7             \tmov    %r14,%rdi\n    1912:\te8 29 91 00 00       \tcallq  aa40 <measureFunction>\n    1917:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    191c:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    1921:\t48 8d 35 90 a6 00 00 \tlea    0xa690(%rip),%rsi        # bfb8 <_IO_stdin_used+0xfb8>\n    1928:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    192c:\te8 4f f7 ff ff       \tcallq  1080 <__printf_chk@plt>\n    1931:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    1935:\tba 03 00 00 00       \tmov    $0x3,%edx\n    193a:\t48 8d 35 ff 98 00 00 \tlea    0x98ff(%rip),%rsi        # b240 <_IO_stdin_used+0x240>\n    1941:\t4c 89 ef             \tmov    %r13,%rdi\n    1944:\te8 e7 f6 ff ff       \tcallq  1030 <strncmp@plt>\n    1949:\t85 c0                \ttest   %eax,%eax\n    194b:\t0f 85 c4 12 00 00    \tjne    2c15 <main+0x1b55>\n    1951:\t48 8d 35 86 57 00 00 \tlea    0x5786(%rip),%rsi        # 70de <mixrormultest>\n    1958:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    195e:\t4c 89 f7             \tmov    %r14,%rdi\n    1961:\te8 da 90 00 00       \tcallq  aa40 <measureFunction>\n    1966:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    196b:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    1970:\t48 8d 35 e4 9b 00 00 \tlea    0x9be4(%rip),%rsi        # b55b <_IO_stdin_used+0x55b>\n    1977:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    
197b:\te8 00 f7 ff ff       \tcallq  1080 <__printf_chk@plt>\n    1980:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    1984:\tba 03 00 00 00       \tmov    $0x3,%edx\n    1989:\t48 8d 35 c0 98 00 00 \tlea    0x98c0(%rip),%rsi        # b250 <_IO_stdin_used+0x250>\n    1990:\t4c 89 ef             \tmov    %r13,%rdi\n    1993:\te8 98 f6 ff ff       \tcallq  1030 <strncmp@plt>\n    1998:\t85 c0                \ttest   %eax,%eax\n    199a:\t0f 85 91 12 00 00    \tjne    2c31 <main+0x1b71>\n    19a0:\t48 8d 35 89 58 00 00 \tlea    0x5889(%rip),%rsi        # 7230 <btstest>\n    19a7:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    19ad:\t4c 89 f7             \tmov    %r14,%rdi\n    19b0:\te8 8b 90 00 00       \tcallq  aa40 <measureFunction>\n    19b5:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    19ba:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    19bf:\t48 8d 35 82 9b 00 00 \tlea    0x9b82(%rip),%rsi        # b548 <_IO_stdin_used+0x548>\n    19c6:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    19ca:\te8 b1 f6 ff ff       \tcallq  1080 <__printf_chk@plt>\n    19cf:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    19d3:\tba 09 00 00 00       \tmov    $0x9,%edx\n    19d8:\t48 8d 35 6b 98 00 00 \tlea    0x986b(%rip),%rsi        # b24a <_IO_stdin_used+0x24a>\n    19df:\t4c 89 ef             \tmov    %r13,%rdi\n    19e2:\te8 49 f6 ff ff       \tcallq  1030 <strncmp@plt>\n    19e7:\t85 c0                \ttest   %eax,%eax\n    19e9:\t0f 85 5e 12 00 00    \tjne    2c4d <main+0x1b8d>\n    19ef:\t48 8d 35 33 5a 00 00 \tlea    0x5a33(%rip),%rsi        # 7429 <btsmultest>\n    19f6:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    19fc:\t4c 89 f7             \tmov    %r14,%rdi\n    19ff:\te8 3c 90 00 00       \tcallq  aa40 <measureFunction>\n    1a04:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    1a09:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    1a0e:\t48 8d 35 18 9b 00 00 \tlea    0x9b18(%rip),%rsi        # b52d <_IO_stdin_used+0x52d>\n    1a15:\tf3 0f 5a c0       
   \tcvtss2sd %xmm0,%xmm0\n    1a19:\te8 62 f6 ff ff       \tcallq  1080 <__printf_chk@plt>\n    1a1e:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    1a22:\tba 09 00 00 00       \tmov    $0x9,%edx\n    1a27:\t48 8d 35 26 98 00 00 \tlea    0x9826(%rip),%rsi        # b254 <_IO_stdin_used+0x254>\n    1a2e:\t4c 89 ef             \tmov    %r13,%rdi\n    1a31:\te8 fa f5 ff ff       \tcallq  1030 <strncmp@plt>\n    1a36:\t85 c0                \ttest   %eax,%eax\n    1a38:\t0f 85 2b 12 00 00    \tjne    2c69 <main+0x1ba9>\n    1a3e:\t48 8d 35 4e 57 00 00 \tlea    0x574e(%rip),%rsi        # 7193 <rorbtstest>\n    1a45:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    1a4b:\t4c 89 f7             \tmov    %r14,%rdi\n    1a4e:\te8 ed 8f 00 00       \tcallq  aa40 <measureFunction>\n    1a53:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    1a58:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    1a5d:\t48 8d 35 ae 9a 00 00 \tlea    0x9aae(%rip),%rsi        # b512 <_IO_stdin_used+0x512>\n    1a64:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    1a68:\te8 13 f6 ff ff       \tcallq  1080 <__printf_chk@plt>\n    1a6d:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    1a71:\tba 03 00 00 00       \tmov    $0x3,%edx\n    1a76:\t48 8d 35 e7 97 00 00 \tlea    0x97e7(%rip),%rsi        # b264 <_IO_stdin_used+0x264>\n    1a7d:\t4c 89 ef             \tmov    %r13,%rdi\n    1a80:\te8 ab f5 ff ff       \tcallq  1030 <strncmp@plt>\n    1a85:\t85 c0                \ttest   %eax,%eax\n    1a87:\t0f 85 f8 11 00 00    \tjne    2c85 <main+0x1bc5>\n    1a8d:\t48 8d 35 3f 58 00 00 \tlea    0x583f(%rip),%rsi        # 72d3 <leatest>\n    1a94:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    1a9a:\t4c 89 f7             \tmov    %r14,%rdi\n    1a9d:\te8 9e 8f 00 00       \tcallq  aa40 <measureFunction>\n    1aa2:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    1aa7:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    1aac:\t48 8d 35 46 9a 00 00 \tlea    0x9a46(%rip),%rsi        # b4f9 
<_IO_stdin_used+0x4f9>\n    1ab3:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    1ab7:\te8 c4 f5 ff ff       \tcallq  1080 <__printf_chk@plt>\n    1abc:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    1ac0:\tba 09 00 00 00       \tmov    $0x9,%edx\n    1ac5:\t48 8d 35 92 97 00 00 \tlea    0x9792(%rip),%rsi        # b25e <_IO_stdin_used+0x25e>\n    1acc:\t4c 89 ef             \tmov    %r13,%rdi\n    1acf:\te8 5c f5 ff ff       \tcallq  1030 <strncmp@plt>\n    1ad4:\t85 c0                \ttest   %eax,%eax\n    1ad6:\t0f 85 c5 11 00 00    \tjne    2ca1 <main+0x1be1>\n    1adc:\t48 8d 35 93 58 00 00 \tlea    0x5893(%rip),%rsi        # 7376 <leamultest>\n    1ae3:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    1ae9:\t4c 89 f7             \tmov    %r14,%rdi\n    1aec:\te8 4f 8f 00 00       \tcallq  aa40 <measureFunction>\n    1af1:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    1af6:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    1afb:\t48 8d 35 8e a4 00 00 \tlea    0xa48e(%rip),%rsi        # bf90 <_IO_stdin_used+0xf90>\n    1b02:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    1b06:\te8 75 f5 ff ff       \tcallq  1080 <__printf_chk@plt>\n    1b0b:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    1b0f:\tba 09 00 00 00       \tmov    $0x9,%edx\n    1b14:\t48 8d 35 5f 97 00 00 \tlea    0x975f(%rip),%rsi        # b27a <_IO_stdin_used+0x27a>\n    1b1b:\t4c 89 ef             \tmov    %r13,%rdi\n    1b1e:\te8 0d f5 ff ff       \tcallq  1030 <strncmp@plt>\n    1b23:\t85 c0                \ttest   %eax,%eax\n    1b25:\t0f 85 92 11 00 00    \tjne    2cbd <main+0x1bfd>\n    1b2b:\t48 8d 35 59 5d 00 00 \tlea    0x5d59(%rip),%rsi        # 788b <add256int>\n    1b32:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    1b38:\t4c 89 f7             \tmov    %r14,%rdi\n    1b3b:\te8 00 8f 00 00       \tcallq  aa40 <measureFunction>\n    1b40:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    1b45:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    1b4a:\t48 8d 35 17 a4 00 00 \tlea   
 0xa417(%rip),%rsi        # bf68 <_IO_stdin_used+0xf68>\n    1b51:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    1b55:\te8 26 f5 ff ff       \tcallq  1080 <__printf_chk@plt>\n    1b5a:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    1b5e:\tba 0c 00 00 00       \tmov    $0xc,%edx\n    1b63:\t48 8d 35 fe 96 00 00 \tlea    0x96fe(%rip),%rsi        # b268 <_IO_stdin_used+0x268>\n    1b6a:\t4c 89 ef             \tmov    %r13,%rdi\n    1b6d:\te8 be f4 ff ff       \tcallq  1030 <strncmp@plt>\n    1b72:\t85 c0                \ttest   %eax,%eax\n    1b74:\t0f 85 5f 11 00 00    \tjne    2cd9 <main+0x1c19>\n    1b7a:\t48 8d 35 c3 61 00 00 \tlea    0x61c3(%rip),%rsi        # 7d44 <mixadd256int>\n    1b81:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    1b87:\t4c 89 f7             \tmov    %r14,%rdi\n    1b8a:\te8 b1 8e 00 00       \tcallq  aa40 <measureFunction>\n    1b8f:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    1b94:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    1b99:\t48 8d 35 90 a3 00 00 \tlea    0xa390(%rip),%rsi        # bf30 <_IO_stdin_used+0xf30>\n    1ba0:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    1ba4:\te8 d7 f4 ff ff       \tcallq  1080 <__printf_chk@plt>\n    1ba9:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    1bad:\tba 0e 00 00 00       \tmov    $0xe,%edx\n    1bb2:\t48 8d 35 bc 96 00 00 \tlea    0x96bc(%rip),%rsi        # b275 <_IO_stdin_used+0x275>\n    1bb9:\t4c 89 ef             \tmov    %r13,%rdi\n    1bbc:\te8 6f f4 ff ff       \tcallq  1030 <strncmp@plt>\n    1bc1:\t85 c0                \ttest   %eax,%eax\n    1bc3:\t0f 85 2c 11 00 00    \tjne    2cf5 <main+0x1c35>\n    1bc9:\t48 8d 35 39 62 00 00 \tlea    0x6239(%rip),%rsi        # 7e09 <mixadd256int11>\n    1bd0:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    1bd6:\t4c 89 f7             \tmov    %r14,%rdi\n    1bd9:\te8 62 8e 00 00       \tcallq  aa40 <measureFunction>\n    1bde:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    1be3:\tb8 01 00 00 00       \tmov    
$0x1,%eax\n    1be8:\t48 8d 35 09 a3 00 00 \tlea    0xa309(%rip),%rsi        # bef8 <_IO_stdin_used+0xef8>\n    1bef:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    1bf3:\te8 88 f4 ff ff       \tcallq  1080 <__printf_chk@plt>\n    1bf8:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    1bfc:\tba 0e 00 00 00       \tmov    $0xe,%edx\n    1c01:\t48 8d 35 7c 96 00 00 \tlea    0x967c(%rip),%rsi        # b284 <_IO_stdin_used+0x284>\n    1c08:\t4c 89 ef             \tmov    %r13,%rdi\n    1c0b:\te8 20 f4 ff ff       \tcallq  1030 <strncmp@plt>\n    1c10:\t85 c0                \ttest   %eax,%eax\n    1c12:\t0f 85 f9 10 00 00    \tjne    2d11 <main+0x1c51>\n    1c18:\t48 8d 35 2c 5f 00 00 \tlea    0x5f2c(%rip),%rsi        # 7b4b <mixadd256fpint>\n    1c1f:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    1c25:\t4c 89 f7             \tmov    %r14,%rdi\n    1c28:\te8 13 8e 00 00       \tcallq  aa40 <measureFunction>\n    1c2d:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    1c32:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    1c37:\t48 8d 35 8a a2 00 00 \tlea    0xa28a(%rip),%rsi        # bec8 <_IO_stdin_used+0xec8>\n    1c3e:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    1c42:\te8 39 f4 ff ff       \tcallq  1080 <__printf_chk@plt>\n    1c47:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    1c4b:\tba 08 00 00 00       \tmov    $0x8,%edx\n    1c50:\t48 8d 35 3c 96 00 00 \tlea    0x963c(%rip),%rsi        # b293 <_IO_stdin_used+0x293>\n    1c57:\t4c 89 ef             \tmov    %r13,%rdi\n    1c5a:\te8 d1 f3 ff ff       \tcallq  1030 <strncmp@plt>\n    1c5f:\t85 c0                \ttest   %eax,%eax\n    1c61:\t0f 85 c6 10 00 00    \tjne    2d2d <main+0x1c6d>\n    1c67:\t48 8d 35 32 60 00 00 \tlea    0x6032(%rip),%rsi        # 7ca0 <mix256fp>\n    1c6e:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    1c74:\t4c 89 f7             \tmov    %r14,%rdi\n    1c77:\te8 c4 8d 00 00       \tcallq  aa40 <measureFunction>\n    1c7c:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    
1c81:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    1c86:\t48 8d 35 0b a2 00 00 \tlea    0xa20b(%rip),%rsi        # be98 <_IO_stdin_used+0xe98>\n    1c8d:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    1c91:\te8 ea f3 ff ff       \tcallq  1080 <__printf_chk@plt>\n    1c96:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    1c9a:\tba 0c 00 00 00       \tmov    $0xc,%edx\n    1c9f:\t48 8d 35 f6 95 00 00 \tlea    0x95f6(%rip),%rsi        # b29c <_IO_stdin_used+0x29c>\n    1ca6:\t4c 89 ef             \tmov    %r13,%rdi\n    1ca9:\te8 82 f3 ff ff       \tcallq  1030 <strncmp@plt>\n    1cae:\t85 c0                \ttest   %eax,%eax\n    1cb0:\t0f 85 93 10 00 00    \tjne    2d49 <main+0x1c89>\n    1cb6:\t48 8d 35 f3 61 00 00 \tlea    0x61f3(%rip),%rsi        # 7eb0 <latadd256int>\n    1cbd:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    1cc3:\t4c 89 f7             \tmov    %r14,%rdi\n    1cc6:\te8 75 8d 00 00       \tcallq  aa40 <measureFunction>\n    1ccb:\tf3 0f 10 7c 24 0c    \tmovss  0xc(%rsp),%xmm7\n    1cd1:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    1cd6:\t48 8d 35 8b a1 00 00 \tlea    0xa18b(%rip),%rsi        # be68 <_IO_stdin_used+0xe68>\n    1cdd:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    1ce2:\tf3 0f 5e f8          \tdivss  %xmm0,%xmm7\n    1ce6:\t66 0f ef c0          \tpxor   %xmm0,%xmm0\n    1cea:\tf3 0f 5a c7          \tcvtss2sd %xmm7,%xmm0\n    1cee:\te8 8d f3 ff ff       \tcallq  1080 <__printf_chk@plt>\n    1cf3:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    1cf7:\tba 0c 00 00 00       \tmov    $0xc,%edx\n    1cfc:\t48 8d 35 a6 95 00 00 \tlea    0x95a6(%rip),%rsi        # b2a9 <_IO_stdin_used+0x2a9>\n    1d03:\t4c 89 ef             \tmov    %r13,%rdi\n    1d06:\te8 25 f3 ff ff       \tcallq  1030 <strncmp@plt>\n    1d0b:\t85 c0                \ttest   %eax,%eax\n    1d0d:\t0f 85 52 10 00 00    \tjne    2d65 <main+0x1ca5>\n    1d13:\t48 8d 35 81 65 00 00 \tlea    0x6581(%rip),%rsi        # 829b <latmul256int>\n    1d1a:\tf3 0f 10 44 24 
08    \tmovss  0x8(%rsp),%xmm0\n    1d20:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    1d25:\te8 16 8d 00 00       \tcallq  aa40 <measureFunction>\n    1d2a:\tf3 0f 10 74 24 0c    \tmovss  0xc(%rsp),%xmm6\n    1d30:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    1d35:\t48 8d 35 f4 a0 00 00 \tlea    0xa0f4(%rip),%rsi        # be30 <_IO_stdin_used+0xe30>\n    1d3c:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    1d41:\tf3 0f 5e f0          \tdivss  %xmm0,%xmm6\n    1d45:\t66 0f ef c0          \tpxor   %xmm0,%xmm0\n    1d49:\tf3 0f 5a c6          \tcvtss2sd %xmm6,%xmm0\n    1d4d:\te8 2e f3 ff ff       \tcallq  1080 <__printf_chk@plt>\n    1d52:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    1d56:\tba 0c 00 00 00       \tmov    $0xc,%edx\n    1d5b:\t48 8d 35 54 95 00 00 \tlea    0x9554(%rip),%rsi        # b2b6 <_IO_stdin_used+0x2b6>\n    1d62:\t4c 89 ef             \tmov    %r13,%rdi\n    1d65:\te8 c6 f2 ff ff       \tcallq  1030 <strncmp@plt>\n    1d6a:\t85 c0                \ttest   %eax,%eax\n    1d6c:\t0f 85 0f 10 00 00    \tjne    2d81 <main+0x1cc1>\n    1d72:\t48 8d 35 d0 65 00 00 \tlea    0x65d0(%rip),%rsi        # 8349 <latadd128int>\n    1d79:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    1d7f:\t4c 89 f7             \tmov    %r14,%rdi\n    1d82:\te8 b9 8c 00 00       \tcallq  aa40 <measureFunction>\n    1d87:\tf3 0f 10 6c 24 0c    \tmovss  0xc(%rsp),%xmm5\n    1d8d:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    1d92:\t48 8d 35 67 a0 00 00 \tlea    0xa067(%rip),%rsi        # be00 <_IO_stdin_used+0xe00>\n    1d99:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    1d9e:\tf3 0f 5e e8          \tdivss  %xmm0,%xmm5\n    1da2:\t66 0f ef c0          \tpxor   %xmm0,%xmm0\n    1da6:\tf3 0f 5a c5          \tcvtss2sd %xmm5,%xmm0\n    1daa:\te8 d1 f2 ff ff       \tcallq  1080 <__printf_chk@plt>\n    1daf:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    1db3:\tba 0c 00 00 00       \tmov    $0xc,%edx\n    1db8:\t48 8d 35 04 95 00 00 \tlea    0x9504(%rip),%rsi   
     # b2c3 <_IO_stdin_used+0x2c3>\n    1dbf:\t4c 89 ef             \tmov    %r13,%rdi\n    1dc2:\te8 69 f2 ff ff       \tcallq  1030 <strncmp@plt>\n    1dc7:\t85 c0                \ttest   %eax,%eax\n    1dc9:\t0f 85 ce 0f 00 00    \tjne    2d9d <main+0x1cdd>\n    1dcf:\t48 8d 35 c6 6a 00 00 \tlea    0x6ac6(%rip),%rsi        # 889c <latmul128int>\n    1dd6:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    1ddc:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    1de1:\te8 5a 8c 00 00       \tcallq  aa40 <measureFunction>\n    1de6:\tf3 0f 10 7c 24 0c    \tmovss  0xc(%rsp),%xmm7\n    1dec:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    1df1:\t48 8d 35 d0 9f 00 00 \tlea    0x9fd0(%rip),%rsi        # bdc8 <_IO_stdin_used+0xdc8>\n    1df8:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    1dfd:\tf3 0f 5e f8          \tdivss  %xmm0,%xmm7\n    1e01:\t66 0f ef c0          \tpxor   %xmm0,%xmm0\n    1e05:\tf3 0f 5a c7          \tcvtss2sd %xmm7,%xmm0\n    1e09:\te8 72 f2 ff ff       \tcallq  1080 <__printf_chk@plt>\n    1e0e:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    1e12:\tba 0b 00 00 00       \tmov    $0xb,%edx\n    1e17:\t48 8d 35 b2 94 00 00 \tlea    0x94b2(%rip),%rsi        # b2d0 <_IO_stdin_used+0x2d0>\n    1e1e:\t4c 89 ef             \tmov    %r13,%rdi\n    1e21:\te8 0a f2 ff ff       \tcallq  1030 <strncmp@plt>\n    1e26:\t85 c0                \ttest   %eax,%eax\n    1e28:\t0f 85 8b 0f 00 00    \tjne    2db9 <main+0x1cf9>\n    1e2e:\t48 8d 35 8c 6b 00 00 \tlea    0x6b8c(%rip),%rsi        # 89c1 <latadd256fp>\n    1e35:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    1e3b:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    1e40:\te8 fb 8b 00 00       \tcallq  aa40 <measureFunction>\n    1e45:\tf3 0f 10 74 24 0c    \tmovss  0xc(%rsp),%xmm6\n    1e4b:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    1e50:\t48 8d 35 49 9f 00 00 \tlea    0x9f49(%rip),%rsi        # bda0 <_IO_stdin_used+0xda0>\n    1e57:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    1e5c:\tf3 0f 
5e f0          \tdivss  %xmm0,%xmm6\n    1e60:\t66 0f ef c0          \tpxor   %xmm0,%xmm0\n    1e64:\tf3 0f 5a c6          \tcvtss2sd %xmm6,%xmm0\n    1e68:\te8 13 f2 ff ff       \tcallq  1080 <__printf_chk@plt>\n    1e6d:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    1e71:\tba 0b 00 00 00       \tmov    $0xb,%edx\n    1e76:\t48 8d 35 5f 94 00 00 \tlea    0x945f(%rip),%rsi        # b2dc <_IO_stdin_used+0x2dc>\n    1e7d:\t4c 89 ef             \tmov    %r13,%rdi\n    1e80:\te8 ab f1 ff ff       \tcallq  1030 <strncmp@plt>\n    1e85:\t85 c0                \ttest   %eax,%eax\n    1e87:\t0f 85 48 0f 00 00    \tjne    2dd5 <main+0x1d15>\n    1e8d:\t48 8d 35 e7 6c 00 00 \tlea    0x6ce7(%rip),%rsi        # 8b7b <latmul256fp>\n    1e94:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    1e9a:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    1e9f:\te8 9c 8b 00 00       \tcallq  aa40 <measureFunction>\n    1ea4:\tf3 0f 10 6c 24 0c    \tmovss  0xc(%rsp),%xmm5\n    1eaa:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    1eaf:\t48 8d 35 c2 9e 00 00 \tlea    0x9ec2(%rip),%rsi        # bd78 <_IO_stdin_used+0xd78>\n    1eb6:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    1ebb:\tf3 0f 5e e8          \tdivss  %xmm0,%xmm5\n    1ebf:\t66 0f ef c0          \tpxor   %xmm0,%xmm0\n    1ec3:\tf3 0f 5a c5          \tcvtss2sd %xmm5,%xmm0\n    1ec7:\te8 b4 f1 ff ff       \tcallq  1080 <__printf_chk@plt>\n    1ecc:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    1ed0:\tba 0b 00 00 00       \tmov    $0xb,%edx\n    1ed5:\t48 8d 35 0c 94 00 00 \tlea    0x940c(%rip),%rsi        # b2e8 <_IO_stdin_used+0x2e8>\n    1edc:\t4c 89 ef             \tmov    %r13,%rdi\n    1edf:\te8 4c f1 ff ff       \tcallq  1030 <strncmp@plt>\n    1ee4:\t85 c0                \ttest   %eax,%eax\n    1ee6:\t0f 85 05 0f 00 00    \tjne    2df1 <main+0x1d31>\n    1eec:\t48 8d 35 99 79 00 00 \tlea    0x7999(%rip),%rsi        # 988c <latadd128fp>\n    1ef3:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    1ef9:\tbf 00 
2f 68 59       \tmov    $0x59682f00,%edi\n    1efe:\te8 3d 8b 00 00       \tcallq  aa40 <measureFunction>\n    1f03:\tf3 0f 10 7c 24 0c    \tmovss  0xc(%rsp),%xmm7\n    1f09:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    1f0e:\t48 8d 35 3b 9e 00 00 \tlea    0x9e3b(%rip),%rsi        # bd50 <_IO_stdin_used+0xd50>\n    1f15:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    1f1a:\tf3 0f 5e f8          \tdivss  %xmm0,%xmm7\n    1f1e:\t66 0f ef c0          \tpxor   %xmm0,%xmm0\n    1f22:\tf3 0f 5a c7          \tcvtss2sd %xmm7,%xmm0\n    1f26:\te8 55 f1 ff ff       \tcallq  1080 <__printf_chk@plt>\n    1f2b:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    1f2f:\tba 0b 00 00 00       \tmov    $0xb,%edx\n    1f34:\t48 8d 35 b9 93 00 00 \tlea    0x93b9(%rip),%rsi        # b2f4 <_IO_stdin_used+0x2f4>\n    1f3b:\t4c 89 ef             \tmov    %r13,%rdi\n    1f3e:\te8 ed f0 ff ff       \tcallq  1030 <strncmp@plt>\n    1f43:\t85 c0                \ttest   %eax,%eax\n    1f45:\t0f 85 c2 0e 00 00    \tjne    2e0d <main+0x1d4d>\n    1f4b:\t48 8d 35 9a 79 00 00 \tlea    0x799a(%rip),%rsi        # 98ec <latmul128fp>\n    1f52:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    1f58:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    1f5d:\te8 de 8a 00 00       \tcallq  aa40 <measureFunction>\n    1f62:\tf3 0f 10 74 24 0c    \tmovss  0xc(%rsp),%xmm6\n    1f68:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    1f6d:\t48 8d 35 b4 9d 00 00 \tlea    0x9db4(%rip),%rsi        # bd28 <_IO_stdin_used+0xd28>\n    1f74:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    1f79:\tf3 0f 5e f0          \tdivss  %xmm0,%xmm6\n    1f7d:\t66 0f ef c0          \tpxor   %xmm0,%xmm0\n    1f81:\tf3 0f 5a c6          \tcvtss2sd %xmm6,%xmm0\n    1f85:\te8 f6 f0 ff ff       \tcallq  1080 <__printf_chk@plt>\n    1f8a:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    1f8e:\tba 08 00 00 00       \tmov    $0x8,%edx\n    1f93:\t48 8d 35 51 93 00 00 \tlea    0x9351(%rip),%rsi        # b2eb <_IO_stdin_used+0x2eb>\n    
1f9a:\t4c 89 ef             \tmov    %r13,%rdi\n    1f9d:\te8 8e f0 ff ff       \tcallq  1030 <strncmp@plt>\n    1fa2:\t85 c0                \ttest   %eax,%eax\n    1fa4:\t0f 85 7f 0e 00 00    \tjne    2e29 <main+0x1d69>\n    1faa:\t48 8d 35 0a 7a 00 00 \tlea    0x7a0a(%rip),%rsi        # 99bb <add128fp>\n    1fb1:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    1fb7:\t4c 89 f7             \tmov    %r14,%rdi\n    1fba:\te8 81 8a 00 00       \tcallq  aa40 <measureFunction>\n    1fbf:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    1fc4:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    1fc9:\t48 8d 35 38 9d 00 00 \tlea    0x9d38(%rip),%rsi        # bd08 <_IO_stdin_used+0xd08>\n    1fd0:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    1fd4:\te8 a7 f0 ff ff       \tcallq  1080 <__printf_chk@plt>\n    1fd9:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    1fdd:\tba 08 00 00 00       \tmov    $0x8,%edx\n    1fe2:\t48 8d 35 0e 93 00 00 \tlea    0x930e(%rip),%rsi        # b2f7 <_IO_stdin_used+0x2f7>\n    1fe9:\t4c 89 ef             \tmov    %r13,%rdi\n    1fec:\te8 3f f0 ff ff       \tcallq  1030 <strncmp@plt>\n    1ff1:\t85 c0                \ttest   %eax,%eax\n    1ff3:\t0f 85 4c 0e 00 00    \tjne    2e45 <main+0x1d85>\n    1ff9:\t48 8d 35 4c 79 00 00 \tlea    0x794c(%rip),%rsi        # 994c <mul128fp>\n    2000:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    2006:\t4c 89 f7             \tmov    %r14,%rdi\n    2009:\te8 32 8a 00 00       \tcallq  aa40 <measureFunction>\n    200e:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    2013:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    2018:\t48 8d 35 c9 9c 00 00 \tlea    0x9cc9(%rip),%rsi        # bce8 <_IO_stdin_used+0xce8>\n    201f:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    2023:\te8 58 f0 ff ff       \tcallq  1080 <__printf_chk@plt>\n    2028:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    202c:\tba 09 00 00 00       \tmov    $0x9,%edx\n    2031:\t48 8d 35 81 92 00 00 \tlea    0x9281(%rip),%rsi        # 
b2b9 <_IO_stdin_used+0x2b9>\n    2038:\t4c 89 ef             \tmov    %r13,%rdi\n    203b:\te8 f0 ef ff ff       \tcallq  1030 <strncmp@plt>\n    2040:\t85 c0                \ttest   %eax,%eax\n    2042:\t0f 85 19 0e 00 00    \tjne    2e61 <main+0x1da1>\n    2048:\t48 8d 35 65 63 00 00 \tlea    0x6365(%rip),%rsi        # 83b4 <add128int>\n    204f:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    2055:\t4c 89 f7             \tmov    %r14,%rdi\n    2058:\te8 e3 89 00 00       \tcallq  aa40 <measureFunction>\n    205d:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    2062:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    2067:\t48 8d 35 52 9c 00 00 \tlea    0x9c52(%rip),%rsi        # bcc0 <_IO_stdin_used+0xcc0>\n    206e:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    2072:\te8 09 f0 ff ff       \tcallq  1080 <__printf_chk@plt>\n    2077:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    207b:\tba 09 00 00 00       \tmov    $0x9,%edx\n    2080:\t48 8d 35 4e 93 00 00 \tlea    0x934e(%rip),%rsi        # b3d5 <_IO_stdin_used+0x3d5>\n    2087:\t4c 89 ef             \tmov    %r13,%rdi\n    208a:\te8 a1 ef ff ff       \tcallq  1030 <strncmp@plt>\n    208f:\t85 c0                \ttest   %eax,%eax\n    2091:\t0f 85 e6 0d 00 00    \tjne    2e7d <main+0x1dbd>\n    2097:\t48 8d 35 7f 67 00 00 \tlea    0x677f(%rip),%rsi        # 881d <mul128int>\n    209e:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    20a4:\t4c 89 f7             \tmov    %r14,%rdi\n    20a7:\te8 94 89 00 00       \tcallq  aa40 <measureFunction>\n    20ac:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    20b1:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    20b6:\t48 8d 35 db 9b 00 00 \tlea    0x9bdb(%rip),%rsi        # bc98 <_IO_stdin_used+0xc98>\n    20bd:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    20c1:\te8 ba ef ff ff       \tcallq  1080 <__printf_chk@plt>\n    20c6:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    20ca:\tba 06 00 00 00       \tmov    $0x6,%edx\n    20cf:\t48 8d 35 2d 92 00 00 
\tlea    0x922d(%rip),%rsi        # b303 <_IO_stdin_used+0x303>\n    20d6:\t4c 89 ef             \tmov    %r13,%rdi\n    20d9:\te8 52 ef ff ff       \tcallq  1030 <strncmp@plt>\n    20de:\t85 c0                \ttest   %eax,%eax\n    20e0:\t0f 85 b3 0d 00 00    \tjne    2e99 <main+0x1dd9>\n    20e6:\t48 8d 35 c2 6c 00 00 \tlea    0x6cc2(%rip),%rsi        # 8daf <fma256>\n    20ed:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    20f3:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    20f8:\te8 43 89 00 00       \tcallq  aa40 <measureFunction>\n    20fd:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    2102:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    2107:\t48 8d 35 d0 93 00 00 \tlea    0x93d0(%rip),%rsi        # b4de <_IO_stdin_used+0x4de>\n    210e:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    2112:\te8 69 ef ff ff       \tcallq  1080 <__printf_chk@plt>\n    2117:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    211b:\tba 06 00 00 00       \tmov    $0x6,%edx\n    2120:\t48 8d 35 8a 90 00 00 \tlea    0x908a(%rip),%rsi        # b1b1 <_IO_stdin_used+0x1b1>\n    2127:\t4c 89 ef             \tmov    %r13,%rdi\n    212a:\te8 01 ef ff ff       \tcallq  1030 <strncmp@plt>\n    212f:\t85 c0                \ttest   %eax,%eax\n    2131:\t0f 85 7e 0d 00 00    \tjne    2eb5 <main+0x1df5>\n    2137:\t48 8d 35 29 6d 00 00 \tlea    0x6d29(%rip),%rsi        # 8e67 <fma128>\n    213e:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    2144:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    2149:\te8 f2 88 00 00       \tcallq  aa40 <measureFunction>\n    214e:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    2153:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    2158:\t48 8d 35 64 93 00 00 \tlea    0x9364(%rip),%rsi        # b4c3 <_IO_stdin_used+0x4c3>\n    215f:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    2163:\te8 18 ef ff ff       \tcallq  1080 <__printf_chk@plt>\n    2168:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    216c:\tba 09 00 00 00       \tmov    
$0x9,%edx\n    2171:\t48 8d 35 88 91 00 00 \tlea    0x9188(%rip),%rsi        # b300 <_IO_stdin_used+0x300>\n    2178:\t4c 89 ef             \tmov    %r13,%rdi\n    217b:\te8 b0 ee ff ff       \tcallq  1030 <strncmp@plt>\n    2180:\t85 c0                \ttest   %eax,%eax\n    2182:\t0f 85 49 0d 00 00    \tjne    2ed1 <main+0x1e11>\n    2188:\t48 8d 35 8a 75 00 00 \tlea    0x758a(%rip),%rsi        # 9719 <latfma256>\n    218f:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    2195:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    219a:\te8 a1 88 00 00       \tcallq  aa40 <measureFunction>\n    219f:\tf3 0f 10 6c 24 0c    \tmovss  0xc(%rsp),%xmm5\n    21a5:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    21aa:\t48 8d 35 bf 9a 00 00 \tlea    0x9abf(%rip),%rsi        # bc70 <_IO_stdin_used+0xc70>\n    21b1:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    21b6:\tf3 0f 5e e8          \tdivss  %xmm0,%xmm5\n    21ba:\t66 0f ef c0          \tpxor   %xmm0,%xmm0\n    21be:\tf3 0f 5a c5          \tcvtss2sd %xmm5,%xmm0\n    21c2:\te8 b9 ee ff ff       \tcallq  1080 <__printf_chk@plt>\n    21c7:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    21cb:\tba 09 00 00 00       \tmov    $0x9,%edx\n    21d0:\t48 8d 35 33 91 00 00 \tlea    0x9133(%rip),%rsi        # b30a <_IO_stdin_used+0x30a>\n    21d7:\t4c 89 ef             \tmov    %r13,%rdi\n    21da:\te8 51 ee ff ff       \tcallq  1030 <strncmp@plt>\n    21df:\t85 c0                \ttest   %eax,%eax\n    21e1:\t0f 85 06 0d 00 00    \tjne    2eed <main+0x1e2d>\n    21e7:\t48 8d 35 e3 75 00 00 \tlea    0x75e3(%rip),%rsi        # 97d1 <latfma128>\n    21ee:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    21f4:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    21f9:\te8 42 88 00 00       \tcallq  aa40 <measureFunction>\n    21fe:\tf3 0f 10 7c 24 0c    \tmovss  0xc(%rsp),%xmm7\n    2204:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    2209:\t48 8d 35 38 9a 00 00 \tlea    0x9a38(%rip),%rsi        # bc48 <_IO_stdin_used+0xc48>\n  
  2210:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    2215:\tf3 0f 5e f8          \tdivss  %xmm0,%xmm7\n    2219:\t66 0f ef c0          \tpxor   %xmm0,%xmm0\n    221d:\tf3 0f 5a c7          \tcvtss2sd %xmm7,%xmm0\n    2221:\te8 5a ee ff ff       \tcallq  1080 <__printf_chk@plt>\n    2226:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    222a:\tba 06 00 00 00       \tmov    $0x6,%edx\n    222f:\t48 8d 35 ec 90 00 00 \tlea    0x90ec(%rip),%rsi        # b322 <_IO_stdin_used+0x322>\n    2236:\t4c 89 ef             \tmov    %r13,%rdi\n    2239:\te8 f2 ed ff ff       \tcallq  1030 <strncmp@plt>\n    223e:\t85 c0                \ttest   %eax,%eax\n    2240:\t0f 85 c3 0c 00 00    \tjne    2f09 <main+0x1e49>\n    2246:\t48 8d 35 8f 68 00 00 \tlea    0x688f(%rip),%rsi        # 8adc <add256fp>\n    224d:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    2253:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    2258:\te8 e3 87 00 00       \tcallq  aa40 <measureFunction>\n    225d:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    2262:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    2267:\t48 8d 35 39 92 00 00 \tlea    0x9239(%rip),%rsi        # b4a7 <_IO_stdin_used+0x4a7>\n    226e:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    2272:\te8 09 ee ff ff       \tcallq  1080 <__printf_chk@plt>\n    2277:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    227b:\tba 06 00 00 00       \tmov    $0x6,%edx\n    2280:\t48 8d 35 8d 90 00 00 \tlea    0x908d(%rip),%rsi        # b314 <_IO_stdin_used+0x314>\n    2287:\t4c 89 ef             \tmov    %r13,%rdi\n    228a:\te8 a1 ed ff ff       \tcallq  1030 <strncmp@plt>\n    228f:\t85 c0                \ttest   %eax,%eax\n    2291:\t0f 85 8e 0c 00 00    \tjne    2f25 <main+0x1e65>\n    2297:\t48 8d 35 9f 67 00 00 \tlea    0x679f(%rip),%rsi        # 8a3d <mul256fp>\n    229e:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    22a4:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    22a9:\te8 92 87 00 00       \tcallq  aa40 
<measureFunction>\n    22ae:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    22b3:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    22b8:\t48 8d 35 cc 91 00 00 \tlea    0x91cc(%rip),%rsi        # b48b <_IO_stdin_used+0x48b>\n    22bf:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    22c3:\te8 b8 ed ff ff       \tcallq  1080 <__printf_chk@plt>\n    22c8:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    22cc:\tba 0c 00 00 00       \tmov    $0xc,%edx\n    22d1:\t48 8d 35 44 90 00 00 \tlea    0x9044(%rip),%rsi        # b31c <_IO_stdin_used+0x31c>\n    22d8:\t4c 89 ef             \tmov    %r13,%rdi\n    22db:\te8 50 ed ff ff       \tcallq  1030 <strncmp@plt>\n    22e0:\t85 c0                \ttest   %eax,%eax\n    22e2:\t0f 85 59 0c 00 00    \tjne    2f41 <main+0x1e81>\n    22e8:\t48 8d 35 33 6c 00 00 \tlea    0x6c33(%rip),%rsi        # 8f22 <mixfmafadd256>\n    22ef:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    22f5:\tbf 05 7a d7 03       \tmov    $0x3d77a05,%edi\n    22fa:\t48 c1 e7 09          \tshl    $0x9,%rdi\n    22fe:\te8 3d 87 00 00       \tcallq  aa40 <measureFunction>\n    2303:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    2308:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    230d:\t48 8d 35 0c 99 00 00 \tlea    0x990c(%rip),%rsi        # bc20 <_IO_stdin_used+0xc20>\n    2314:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    2318:\te8 63 ed ff ff       \tcallq  1080 <__printf_chk@plt>\n    231d:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    2321:\tba 0b 00 00 00       \tmov    $0xb,%edx\n    2326:\t48 8d 35 fd 8f 00 00 \tlea    0x8ffd(%rip),%rsi        # b32a <_IO_stdin_used+0x32a>\n    232d:\t4c 89 ef             \tmov    %r13,%rdi\n    2330:\te8 fb ec ff ff       \tcallq  1030 <strncmp@plt>\n    2335:\t85 c0                \ttest   %eax,%eax\n    2337:\t0f 85 20 0c 00 00    \tjne    2f5d <main+0x1e9d>\n    233d:\t48 8d 35 8d 6e 00 00 \tlea    0x6e8d(%rip),%rsi        # 91d1 <mixfmaadd256>\n    2344:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n 
   234a:\tbf 05 7a d7 03       \tmov    $0x3d77a05,%edi\n    234f:\t48 c1 e7 09          \tshl    $0x9,%rdi\n    2353:\te8 e8 86 00 00       \tcallq  aa40 <measureFunction>\n    2358:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    235d:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    2362:\t48 8d 35 8f 98 00 00 \tlea    0x988f(%rip),%rsi        # bbf8 <_IO_stdin_used+0xbf8>\n    2369:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    236d:\te8 0e ed ff ff       \tcallq  1080 <__printf_chk@plt>\n    2372:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    2376:\tba 0e 00 00 00       \tmov    $0xe,%edx\n    237b:\t48 8d 35 b5 8f 00 00 \tlea    0x8fb5(%rip),%rsi        # b337 <_IO_stdin_used+0x337>\n    2382:\t4c 89 ef             \tmov    %r13,%rdi\n    2385:\te8 a6 ec ff ff       \tcallq  1030 <strncmp@plt>\n    238a:\t85 c0                \ttest   %eax,%eax\n    238c:\t0f 85 e7 0b 00 00    \tjne    2f79 <main+0x1eb9>\n    2392:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    2398:\tbf 05 7a d7 03       \tmov    $0x3d77a05,%edi\n    239d:\t48 8d 35 8c 86 00 00 \tlea    0x868c(%rip),%rsi        # aa30 <mixfmaaddmem256wrapper>\n    23a4:\t48 c1 e7 09          \tshl    $0x9,%rdi\n    23a8:\te8 93 86 00 00       \tcallq  aa40 <measureFunction>\n    23ad:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    23b2:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    23b7:\t48 8d 35 0a 98 00 00 \tlea    0x980a(%rip),%rsi        # bbc8 <_IO_stdin_used+0xbc8>\n    23be:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    23c2:\te8 b9 ec ff ff       \tcallq  1080 <__printf_chk@plt>\n    23c7:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    23cb:\tba 0b 00 00 00       \tmov    $0xb,%edx\n    23d0:\t48 8d 35 70 8f 00 00 \tlea    0x8f70(%rip),%rsi        # b347 <_IO_stdin_used+0x347>\n    23d7:\t4c 89 ef             \tmov    %r13,%rdi\n    23da:\te8 51 ec ff ff       \tcallq  1030 <strncmp@plt>\n    23df:\t85 c0                \ttest   %eax,%eax\n    23e1:\t75 39                \tjne    
241c <main+0x135c>\n    23e3:\t48 8d 35 98 6e 00 00 \tlea    0x6e98(%rip),%rsi        # 9282 <mixfmaand256>\n    23ea:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    23f0:\tbf 05 7a d7 03       \tmov    $0x3d77a05,%edi\n    23f5:\t48 c1 e7 09          \tshl    $0x9,%rdi\n    23f9:\te8 42 86 00 00       \tcallq  aa40 <measureFunction>\n    23fe:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    2403:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    2408:\t48 8d 35 91 97 00 00 \tlea    0x9791(%rip),%rsi        # bba0 <_IO_stdin_used+0xba0>\n    240f:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    2413:\te8 68 ec ff ff       \tcallq  1080 <__printf_chk@plt>\n    2418:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    241c:\tba 0e 00 00 00       \tmov    $0xe,%edx\n    2421:\t48 8d 35 0f 8f 00 00 \tlea    0x8f0f(%rip),%rsi        # b337 <_IO_stdin_used+0x337>\n    2428:\t4c 89 ef             \tmov    %r13,%rdi\n    242b:\te8 00 ec ff ff       \tcallq  1030 <strncmp@plt>\n    2430:\t85 c0                \ttest   %eax,%eax\n    2432:\t0f 85 5d 0b 00 00    \tjne    2f95 <main+0x1ed5>\n    2438:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    243e:\tbf 05 7a d7 03       \tmov    $0x3d77a05,%edi\n    2443:\t48 8d 35 d6 85 00 00 \tlea    0x85d6(%rip),%rsi        # aa20 <mixfmaandmem256wrapper>\n    244a:\t48 c1 e7 09          \tshl    $0x9,%rdi\n    244e:\te8 ed 85 00 00       \tcallq  aa40 <measureFunction>\n    2453:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    2458:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    245d:\t48 8d 35 e4 9b 00 00 \tlea    0x9be4(%rip),%rsi        # c048 <_IO_stdin_used+0x1048>\n    2464:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    2468:\te8 13 ec ff ff       \tcallq  1080 <__printf_chk@plt>\n    246d:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    2471:\tba 0d 00 00 00       \tmov    $0xd,%edx\n    2476:\t48 8d 35 9a 8c 00 00 \tlea    0x8c9a(%rip),%rsi        # b117 <_IO_stdin_used+0x117>\n    247d:\t4c 89 ef             \tmov    
%r13,%rdi\n    2480:\te8 ab eb ff ff       \tcallq  1030 <strncmp@plt>\n    2485:\t85 c0                \ttest   %eax,%eax\n    2487:\t0f 85 24 0b 00 00    \tjne    2fb1 <main+0x1ef1>\n    248d:\t48 8d 35 04 71 00 00 \tlea    0x7104(%rip),%rsi        # 9598 <nemesfpumix21>\n    2494:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    249a:\tbf 05 7a d7 03       \tmov    $0x3d77a05,%edi\n    249f:\t48 c1 e7 09          \tshl    $0x9,%rdi\n    24a3:\te8 98 85 00 00       \tcallq  aa40 <measureFunction>\n    24a8:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    24ad:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    24b2:\t48 8d 35 b7 96 00 00 \tlea    0x96b7(%rip),%rsi        # bb70 <_IO_stdin_used+0xb70>\n    24b9:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    24bd:\te8 be eb ff ff       \tcallq  1080 <__printf_chk@plt>\n    24c2:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    24c6:\tba 0f 00 00 00       \tmov    $0xf,%edx\n    24cb:\t48 8d 35 82 8e 00 00 \tlea    0x8e82(%rip),%rsi        # b354 <_IO_stdin_used+0x354>\n    24d2:\t4c 89 ef             \tmov    %r13,%rdi\n    24d5:\te8 56 eb ff ff       \tcallq  1030 <strncmp@plt>\n    24da:\t85 c0                \ttest   %eax,%eax\n    24dc:\t0f 85 eb 0a 00 00    \tjne    2fcd <main+0x1f0d>\n    24e2:\t48 8d 35 0b 57 00 00 \tlea    0x570b(%rip),%rsi        # 7bf4 <mix256faddintadd>\n    24e9:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    24ef:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    24f4:\te8 47 85 00 00       \tcallq  aa40 <measureFunction>\n    24f9:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    24fe:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    2503:\t48 8d 35 3e 96 00 00 \tlea    0x963e(%rip),%rsi        # bb48 <_IO_stdin_used+0xb48>\n    250a:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    250e:\te8 6d eb ff ff       \tcallq  1080 <__printf_chk@plt>\n    2513:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    2517:\tba 08 00 00 00       \tmov    $0x8,%edx\n    251c:\t48 8d 35 42 8e 00 
00 \tlea    0x8e42(%rip),%rsi        # b365 <_IO_stdin_used+0x365>\n    2523:\t4c 89 ef             \tmov    %r13,%rdi\n    2526:\te8 05 eb ff ff       \tcallq  1030 <strncmp@plt>\n    252b:\t85 c0                \ttest   %eax,%eax\n    252d:\t0f 85 b6 0a 00 00    \tjne    2fe9 <main+0x1f29>\n    2533:\t48 8d 35 90 75 00 00 \tlea    0x7590(%rip),%rsi        # 9aca <latmul16>\n    253a:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    2540:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    2545:\te8 f6 84 00 00       \tcallq  aa40 <measureFunction>\n    254a:\tf3 0f 10 6c 24 0c    \tmovss  0xc(%rsp),%xmm5\n    2550:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    2555:\t48 8d 35 c4 95 00 00 \tlea    0x95c4(%rip),%rsi        # bb20 <_IO_stdin_used+0xb20>\n    255c:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    2561:\tf3 0f 5e e8          \tdivss  %xmm0,%xmm5\n    2565:\t66 0f ef c0          \tpxor   %xmm0,%xmm0\n    2569:\tf3 0f 5a c5          \tcvtss2sd %xmm5,%xmm0\n    256d:\te8 0e eb ff ff       \tcallq  1080 <__printf_chk@plt>\n    2572:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    2576:\tba 08 00 00 00       \tmov    $0x8,%edx\n    257b:\t48 8d 35 ec 8d 00 00 \tlea    0x8dec(%rip),%rsi        # b36e <_IO_stdin_used+0x36e>\n    2582:\t4c 89 ef             \tmov    %r13,%rdi\n    2585:\te8 a6 ea ff ff       \tcallq  1030 <strncmp@plt>\n    258a:\t85 c0                \ttest   %eax,%eax\n    258c:\t0f 85 73 0a 00 00    \tjne    3005 <main+0x1f45>\n    2592:\t48 8d 35 91 74 00 00 \tlea    0x7491(%rip),%rsi        # 9a2a <latmul64>\n    2599:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    259f:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    25a4:\te8 97 84 00 00       \tcallq  aa40 <measureFunction>\n    25a9:\tf3 0f 10 6c 24 0c    \tmovss  0xc(%rsp),%xmm5\n    25af:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    25b4:\t48 8d 35 3d 95 00 00 \tlea    0x953d(%rip),%rsi        # baf8 <_IO_stdin_used+0xaf8>\n    25bb:\tb8 01 00 00 00       \tmov    
$0x1,%eax\n    25c0:\tf3 0f 5e e8          \tdivss  %xmm0,%xmm5\n    25c4:\t66 0f ef c0          \tpxor   %xmm0,%xmm0\n    25c8:\tf3 0f 5a c5          \tcvtss2sd %xmm5,%xmm0\n    25cc:\te8 af ea ff ff       \tcallq  1080 <__printf_chk@plt>\n    25d1:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    25d5:\tba 05 00 00 00       \tmov    $0x5,%edx\n    25da:\t48 8d 35 87 8d 00 00 \tlea    0x8d87(%rip),%rsi        # b368 <_IO_stdin_used+0x368>\n    25e1:\t4c 89 ef             \tmov    %r13,%rdi\n    25e4:\te8 47 ea ff ff       \tcallq  1030 <strncmp@plt>\n    25e9:\t85 c0                \ttest   %eax,%eax\n    25eb:\t0f 85 30 0a 00 00    \tjne    3021 <main+0x1f61>\n    25f1:\t48 8d 35 86 75 00 00 \tlea    0x7586(%rip),%rsi        # 9b7e <mul16>\n    25f8:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    25fe:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    2603:\te8 38 84 00 00       \tcallq  aa40 <measureFunction>\n    2608:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    260d:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    2612:\t48 8d 35 57 8e 00 00 \tlea    0x8e57(%rip),%rsi        # b470 <_IO_stdin_used+0x470>\n    2619:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    261d:\te8 5e ea ff ff       \tcallq  1080 <__printf_chk@plt>\n    2622:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    2626:\tba 05 00 00 00       \tmov    $0x5,%edx\n    262b:\t48 8d 35 5d 8d 00 00 \tlea    0x8d5d(%rip),%rsi        # b38f <_IO_stdin_used+0x38f>\n    2632:\t4c 89 ef             \tmov    %r13,%rdi\n    2635:\te8 f6 e9 ff ff       \tcallq  1030 <strncmp@plt>\n    263a:\t85 c0                \ttest   %eax,%eax\n    263c:\t0f 85 fb 09 00 00    \tjne    303d <main+0x1f7d>\n    2642:\t48 8d 35 e9 75 00 00 \tlea    0x75e9(%rip),%rsi        # 9c32 <mul64>\n    2649:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    264f:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    2654:\te8 e7 83 00 00       \tcallq  aa40 <measureFunction>\n    2659:\tbf 01 00 00 00       \tmov    
$0x1,%edi\n    265e:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    2663:\t48 8d 35 eb 8d 00 00 \tlea    0x8deb(%rip),%rsi        # b455 <_IO_stdin_used+0x455>\n    266a:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    266e:\te8 0d ea ff ff       \tcallq  1080 <__printf_chk@plt>\n    2673:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    2677:\tba 05 00 00 00       \tmov    $0x5,%edx\n    267c:\t48 8d 35 f4 8c 00 00 \tlea    0x8cf4(%rip),%rsi        # b377 <_IO_stdin_used+0x377>\n    2683:\t4c 89 ef             \tmov    %r13,%rdi\n    2686:\te8 a5 e9 ff ff       \tcallq  1030 <strncmp@plt>\n    268b:\t85 c0                \ttest   %eax,%eax\n    268d:\t0f 85 c6 09 00 00    \tjne    3059 <main+0x1f99>\n    2693:\t48 8d 35 7a 76 00 00 \tlea    0x767a(%rip),%rsi        # 9d14 <mixmul16mul64>\n    269a:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    26a0:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    26a5:\te8 96 83 00 00       \tcallq  aa40 <measureFunction>\n    26aa:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    26af:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    26b4:\t48 8d 35 0d 94 00 00 \tlea    0x940d(%rip),%rsi        # bac8 <_IO_stdin_used+0xac8>\n    26bb:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    26bf:\te8 bc e9 ff ff       \tcallq  1080 <__printf_chk@plt>\n    26c4:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    26c8:\tba 05 00 00 00       \tmov    $0x5,%edx\n    26cd:\t48 8d 35 b1 8c 00 00 \tlea    0x8cb1(%rip),%rsi        # b385 <_IO_stdin_used+0x385>\n    26d4:\t4c 89 ef             \tmov    %r13,%rdi\n    26d7:\te8 54 e9 ff ff       \tcallq  1030 <strncmp@plt>\n    26dc:\t85 c0                \ttest   %eax,%eax\n    26de:\t0f 85 91 09 00 00    \tjne    3075 <main+0x1fb5>\n    26e4:\t48 8d 35 d5 76 00 00 \tlea    0x76d5(%rip),%rsi        # 9dc0 <mixmul16mul64_21>\n    26eb:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    26f1:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    26f6:\te8 45 83 00 00       \tcallq  aa40 
<measureFunction>\n    26fb:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    2700:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    2705:\t48 8d 35 8c 93 00 00 \tlea    0x938c(%rip),%rsi        # ba98 <_IO_stdin_used+0xa98>\n    270c:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    2710:\te8 6b e9 ff ff       \tcallq  1080 <__printf_chk@plt>\n    2715:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    2719:\tba 07 00 00 00       \tmov    $0x7,%edx\n    271e:\t48 8d 35 76 8c 00 00 \tlea    0x8c76(%rip),%rsi        # b39b <_IO_stdin_used+0x39b>\n    2725:\t4c 89 ef             \tmov    %r13,%rdi\n    2728:\te8 03 e9 ff ff       \tcallq  1030 <strncmp@plt>\n    272d:\t85 c0                \ttest   %eax,%eax\n    272f:\t0f 85 5c 09 00 00    \tjne    3091 <main+0x1fd1>\n    2735:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    273b:\t48 8d 35 2e 82 00 00 \tlea    0x822e(%rip),%rsi        # a970 <load128wrapper>\n    2742:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    2747:\te8 f4 82 00 00       \tcallq  aa40 <measureFunction>\n    274c:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    2751:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    2756:\t48 8d 35 db 8c 00 00 \tlea    0x8cdb(%rip),%rsi        # b438 <_IO_stdin_used+0x438>\n    275d:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    2761:\te8 1a e9 ff ff       \tcallq  1080 <__printf_chk@plt>\n    2766:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    276a:\tba 0d 00 00 00       \tmov    $0xd,%edx\n    276f:\t48 8d 35 1f 8c 00 00 \tlea    0x8c1f(%rip),%rsi        # b395 <_IO_stdin_used+0x395>\n    2776:\t4c 89 ef             \tmov    %r13,%rdi\n    2779:\te8 b2 e8 ff ff       \tcallq  1030 <strncmp@plt>\n    277e:\t85 c0                \ttest   %eax,%eax\n    2780:\t0f 85 27 09 00 00    \tjne    30ad <main+0x1fed>\n    2786:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    278c:\t48 8d 35 ed 81 00 00 \tlea    0x81ed(%rip),%rsi        # a980 <spacedload128wrapper>\n    2793:\tbf 00 2f 68 59       \tmov    
$0x59682f00,%edi\n    2798:\te8 a3 82 00 00       \tcallq  aa40 <measureFunction>\n    279d:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    27a2:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    27a7:\t48 8d 35 c2 92 00 00 \tlea    0x92c2(%rip),%rsi        # ba70 <_IO_stdin_used+0xa70>\n    27ae:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    27b2:\te8 c9 e8 ff ff       \tcallq  1080 <__printf_chk@plt>\n    27b7:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    27bb:\tba 07 00 00 00       \tmov    $0x7,%edx\n    27c0:\t48 8d 35 dc 8b 00 00 \tlea    0x8bdc(%rip),%rsi        # b3a3 <_IO_stdin_used+0x3a3>\n    27c7:\t4c 89 ef             \tmov    %r13,%rdi\n    27ca:\te8 61 e8 ff ff       \tcallq  1030 <strncmp@plt>\n    27cf:\t85 c0                \ttest   %eax,%eax\n    27d1:\t0f 85 f2 08 00 00    \tjne    30c9 <main+0x2009>\n    27d7:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    27dd:\t48 8d 35 bc 81 00 00 \tlea    0x81bc(%rip),%rsi        # a9a0 <load256wrapper>\n    27e4:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    27e9:\te8 52 82 00 00       \tcallq  aa40 <measureFunction>\n    27ee:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    27f3:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    27f8:\t48 8d 35 1c 8c 00 00 \tlea    0x8c1c(%rip),%rsi        # b41b <_IO_stdin_used+0x41b>\n    27ff:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    2803:\te8 78 e8 ff ff       \tcallq  1080 <__printf_chk@plt>\n    2808:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    280c:\tba 0d 00 00 00       \tmov    $0xd,%edx\n    2811:\t48 8d 35 93 8b 00 00 \tlea    0x8b93(%rip),%rsi        # b3ab <_IO_stdin_used+0x3ab>\n    2818:\t4c 89 ef             \tmov    %r13,%rdi\n    281b:\te8 10 e8 ff ff       \tcallq  1030 <strncmp@plt>\n    2820:\t85 c0                \ttest   %eax,%eax\n    2822:\t0f 85 bd 08 00 00    \tjne    30e5 <main+0x2025>\n    2828:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    282e:\t48 8d 35 5b 81 00 00 \tlea    0x815b(%rip),%rsi        # a990 
<spacedstorescalarwrapper>\n    2835:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    283a:\te8 01 82 00 00       \tcallq  aa40 <measureFunction>\n    283f:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    2844:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    2849:\t48 8d 35 f8 91 00 00 \tlea    0x91f8(%rip),%rsi        # ba48 <_IO_stdin_used+0xa48>\n    2850:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    2854:\te8 27 e8 ff ff       \tcallq  1080 <__printf_chk@plt>\n    2859:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    285d:\tba 07 00 00 00       \tmov    $0x7,%edx\n    2862:\t48 8d 35 54 8b 00 00 \tlea    0x8b54(%rip),%rsi        # b3bd <_IO_stdin_used+0x3bd>\n    2869:\t4c 89 ef             \tmov    %r13,%rdi\n    286c:\te8 bf e7 ff ff       \tcallq  1030 <strncmp@plt>\n    2871:\t85 c0                \ttest   %eax,%eax\n    2873:\t75 35                \tjne    28aa <main+0x17ea>\n    2875:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    287b:\t48 8d 35 3e 81 00 00 \tlea    0x813e(%rip),%rsi        # a9c0 <store128wrapper>\n    2882:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    2887:\te8 b4 81 00 00       \tcallq  aa40 <measureFunction>\n    288c:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    2891:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    2896:\t48 8d 35 60 8b 00 00 \tlea    0x8b60(%rip),%rsi        # b3fd <_IO_stdin_used+0x3fd>\n    289d:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    28a1:\te8 da e7 ff ff       \tcallq  1080 <__printf_chk@plt>\n    28a6:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    28aa:\tba 07 00 00 00       \tmov    $0x7,%edx\n    28af:\t48 8d 35 10 8b 00 00 \tlea    0x8b10(%rip),%rsi        # b3c6 <_IO_stdin_used+0x3c6>\n    28b6:\t4c 89 ef             \tmov    %r13,%rdi\n    28b9:\te8 72 e7 ff ff       \tcallq  1030 <strncmp@plt>\n    28be:\t85 c0                \ttest   %eax,%eax\n    28c0:\t75 3b                \tjne    28fd <main+0x183d>\n    28c2:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    
28c8:\t48 8d 35 11 81 00 00 \tlea    0x8111(%rip),%rsi        # a9e0 <store256wrapper>\n    28cf:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    28d4:\te8 67 81 00 00       \tcallq  aa40 <measureFunction>\n    28d9:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    28de:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    28e3:\t48 8d 35 f5 8a 00 00 \tlea    0x8af5(%rip),%rsi        # b3df <_IO_stdin_used+0x3df>\n    28ea:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    28ee:\te8 8d e7 ff ff       \tcallq  1080 <__printf_chk@plt>\n    28f3:\t41 83 ec 01          \tsub    $0x1,%r12d\n    28f7:\t7e 4d                \tjle    2946 <main+0x1886>\n    28f9:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    28fd:\tba 0f 00 00 00       \tmov    $0xf,%edx\n    2902:\t48 8d 35 c6 8a 00 00 \tlea    0x8ac6(%rip),%rsi        # b3cf <_IO_stdin_used+0x3cf>\n    2909:\t4c 89 ef             \tmov    %r13,%rdi\n    290c:\te8 1f e7 ff ff       \tcallq  1030 <strncmp@plt>\n    2911:\t85 c0                \ttest   %eax,%eax\n    2913:\t75 31                \tjne    2946 <main+0x1886>\n    2915:\t48 8d 35 ff 5f 00 00 \tlea    0x5fff(%rip),%rsi        # 891b <mixaddmul128int>\n    291c:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    2922:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    2927:\te8 14 81 00 00       \tcallq  aa40 <measureFunction>\n    292c:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    2931:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    2936:\t48 8d 35 db 90 00 00 \tlea    0x90db(%rip),%rsi        # ba18 <_IO_stdin_used+0xa18>\n    293d:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    2941:\te8 3a e7 ff ff       \tcallq  1080 <__printf_chk@plt>\n    2946:\t48 8b 44 24 48       \tmov    0x48(%rsp),%rax\n    294b:\t64 48 33 04 25 28 00 \txor    %fs:0x28,%rax\n    2952:\t00 00 \n    2954:\t0f 85 ce 3b 00 00    \tjne    6528 <main+0x5468>\n    295a:\t48 83 c4 58          \tadd    $0x58,%rsp\n    295e:\t31 c0                \txor    %eax,%eax\n    2960:\t5b          
         \tpop    %rbx\n    2961:\t5d                   \tpop    %rbp\n    2962:\t41 5c                \tpop    %r12\n    2964:\t41 5d                \tpop    %r13\n    2966:\t41 5e                \tpop    %r14\n    2968:\t41 5f                \tpop    %r15\n    296a:\tc3                   \tretq   \n    296b:\tba 05 00 00 00       \tmov    $0x5,%edx\n    2970:\t48 8d 35 f7 8d 00 00 \tlea    0x8df7(%rip),%rsi        # b76e <_IO_stdin_used+0x76e>\n    2977:\t4c 89 ef             \tmov    %r13,%rdi\n    297a:\te8 b1 e6 ff ff       \tcallq  1030 <strncmp@plt>\n    297f:\t85 c0                \ttest   %eax,%eax\n    2981:\t0f 84 34 3b 00 00    \tje     64bb <main+0x53fb>\n    2987:\tf3 0f 10 35 f9 96 00 \tmovss  0x96f9(%rip),%xmm6        # c088 <_IO_stdin_used+0x1088>\n    298e:\t00 \n    298f:\tf3 0f 11 74 24 0c    \tmovss  %xmm6,0xc(%rsp)\n    2995:\tf3 0f 11 74 24 08    \tmovss  %xmm6,0x8(%rsp)\n    299b:\tba 05 00 00 00       \tmov    $0x5,%edx\n    29a0:\t48 8d 35 1e 88 00 00 \tlea    0x881e(%rip),%rsi        # b1c5 <_IO_stdin_used+0x1c5>\n    29a7:\t4c 89 ef             \tmov    %r13,%rdi\n    29aa:\te8 81 e6 ff ff       \tcallq  1030 <strncmp@plt>\n    29af:\t85 c0                \ttest   %eax,%eax\n    29b1:\t0f 84 43 2f 00 00    \tje     58fa <main+0x483a>\n    29b7:\tba 03 00 00 00       \tmov    $0x3,%edx\n    29bc:\t48 8d 35 4c 88 00 00 \tlea    0x884c(%rip),%rsi        # b20f <_IO_stdin_used+0x20f>\n    29c3:\t4c 89 ef             \tmov    %r13,%rdi\n    29c6:\te8 65 e6 ff ff       \tcallq  1030 <strncmp@plt>\n    29cb:\t85 c0                \ttest   %eax,%eax\n    29cd:\t0f 84 e9 2e 00 00    \tje     58bc <main+0x47fc>\n    29d3:\t48 8d 35 f1 87 00 00 \tlea    0x87f1(%rip),%rsi        # b1cb <_IO_stdin_used+0x1cb>\n    29da:\t4c 89 ef             \tmov    %r13,%rdi\n    29dd:\te8 7e e6 ff ff       \tcallq  1060 <strcmp@plt>\n    29e2:\t85 c0                \ttest   %eax,%eax\n    29e4:\t0f 84 91 2e 00 00    \tje     587b <main+0x47bb>\n    29ea:\t48 8d 35 
e1 87 00 00 \tlea    0x87e1(%rip),%rsi        # b1d2 <_IO_stdin_used+0x1d2>\n    29f1:\t4c 89 ef             \tmov    %r13,%rdi\n    29f4:\te8 67 e6 ff ff       \tcallq  1060 <strcmp@plt>\n    29f9:\t85 c0                \ttest   %eax,%eax\n    29fb:\t0f 84 3c 2e 00 00    \tje     583d <main+0x477d>\n    2a01:\tba 06 00 00 00       \tmov    $0x6,%edx\n    2a06:\t48 8d 35 ce 87 00 00 \tlea    0x87ce(%rip),%rsi        # b1db <_IO_stdin_used+0x1db>\n    2a0d:\t4c 89 ef             \tmov    %r13,%rdi\n    2a10:\te8 1b e6 ff ff       \tcallq  1030 <strncmp@plt>\n    2a15:\t85 c0                \ttest   %eax,%eax\n    2a17:\t0f 84 e2 2d 00 00    \tje     57ff <main+0x473f>\n    2a1d:\tba 08 00 00 00       \tmov    $0x8,%edx\n    2a22:\t48 8d 35 b0 87 00 00 \tlea    0x87b0(%rip),%rsi        # b1d9 <_IO_stdin_used+0x1d9>\n    2a29:\t4c 89 ef             \tmov    %r13,%rdi\n    2a2c:\te8 ff e5 ff ff       \tcallq  1030 <strncmp@plt>\n    2a31:\t85 c0                \ttest   %eax,%eax\n    2a33:\t0f 84 88 2d 00 00    \tje     57c1 <main+0x4701>\n    2a39:\tba 07 00 00 00       \tmov    $0x7,%edx\n    2a3e:\t48 8d 35 9d 87 00 00 \tlea    0x879d(%rip),%rsi        # b1e2 <_IO_stdin_used+0x1e2>\n    2a45:\t4c 89 ef             \tmov    %r13,%rdi\n    2a48:\te8 e3 e5 ff ff       \tcallq  1030 <strncmp@plt>\n    2a4d:\t85 c0                \ttest   %eax,%eax\n    2a4f:\t0f 84 2e 2d 00 00    \tje     5783 <main+0x46c3>\n    2a55:\tba 07 00 00 00       \tmov    $0x7,%edx\n    2a5a:\t48 8d 35 89 87 00 00 \tlea    0x8789(%rip),%rsi        # b1ea <_IO_stdin_used+0x1ea>\n    2a61:\t4c 89 ef             \tmov    %r13,%rdi\n    2a64:\te8 c7 e5 ff ff       \tcallq  1030 <strncmp@plt>\n    2a69:\t85 c0                \ttest   %eax,%eax\n    2a6b:\t0f 84 d4 2c 00 00    \tje     5745 <main+0x4685>\n    2a71:\tba 07 00 00 00       \tmov    $0x7,%edx\n    2a76:\t48 8d 35 75 87 00 00 \tlea    0x8775(%rip),%rsi        # b1f2 <_IO_stdin_used+0x1f2>\n    2a7d:\t4c 89 ef             \tmov    
%r13,%rdi\n    2a80:\te8 ab e5 ff ff       \tcallq  1030 <strncmp@plt>\n    2a85:\t85 c0                \ttest   %eax,%eax\n    2a87:\t0f 84 7a 2c 00 00    \tje     5707 <main+0x4647>\n    2a8d:\tba 06 00 00 00       \tmov    $0x6,%edx\n    2a92:\t48 8d 35 61 87 00 00 \tlea    0x8761(%rip),%rsi        # b1fa <_IO_stdin_used+0x1fa>\n    2a99:\t4c 89 ef             \tmov    %r13,%rdi\n    2a9c:\te8 8f e5 ff ff       \tcallq  1030 <strncmp@plt>\n    2aa1:\t85 c0                \ttest   %eax,%eax\n    2aa3:\t0f 84 20 2c 00 00    \tje     56c9 <main+0x4609>\n    2aa9:\tba 06 00 00 00       \tmov    $0x6,%edx\n    2aae:\t48 8d 35 4c 87 00 00 \tlea    0x874c(%rip),%rsi        # b201 <_IO_stdin_used+0x201>\n    2ab5:\t4c 89 ef             \tmov    %r13,%rdi\n    2ab8:\te8 73 e5 ff ff       \tcallq  1030 <strncmp@plt>\n    2abd:\t85 c0                \ttest   %eax,%eax\n    2abf:\t0f 84 c6 2b 00 00    \tje     568b <main+0x45cb>\n    2ac5:\tba 06 00 00 00       \tmov    $0x6,%edx\n    2aca:\t48 8d 35 96 8c 00 00 \tlea    0x8c96(%rip),%rsi        # b767 <_IO_stdin_used+0x767>\n    2ad1:\t4c 89 ef             \tmov    %r13,%rdi\n    2ad4:\te8 57 e5 ff ff       \tcallq  1030 <strncmp@plt>\n    2ad9:\t85 c0                \ttest   %eax,%eax\n    2adb:\t0f 84 6c 2b 00 00    \tje     564d <main+0x458d>\n    2ae1:\tba 0a 00 00 00       \tmov    $0xa,%edx\n    2ae6:\t48 8d 35 1b 87 00 00 \tlea    0x871b(%rip),%rsi        # b208 <_IO_stdin_used+0x208>\n    2aed:\t4c 89 ef             \tmov    %r13,%rdi\n    2af0:\te8 3b e5 ff ff       \tcallq  1030 <strncmp@plt>\n    2af5:\t85 c0                \ttest   %eax,%eax\n    2af7:\t0f 84 12 2b 00 00    \tje     560f <main+0x454f>\n    2afd:\tba 06 00 00 00       \tmov    $0x6,%edx\n    2b02:\t48 8d 35 0a 87 00 00 \tlea    0x870a(%rip),%rsi        # b213 <_IO_stdin_used+0x213>\n    2b09:\t4c 89 ef             \tmov    %r13,%rdi\n    2b0c:\te8 1f e5 ff ff       \tcallq  1030 <strncmp@plt>\n    2b11:\t85 c0                \ttest   %eax,%eax\n 
   2b13:\t0f 84 b8 2a 00 00    \tje     55d1 <main+0x4511>\n    2b19:\tba 03 00 00 00       \tmov    $0x3,%edx\n    2b1e:\t48 8d 35 f7 86 00 00 \tlea    0x86f7(%rip),%rsi        # b21c <_IO_stdin_used+0x21c>\n    2b25:\t4c 89 ef             \tmov    %r13,%rdi\n    2b28:\te8 03 e5 ff ff       \tcallq  1030 <strncmp@plt>\n    2b2d:\t85 c0                \ttest   %eax,%eax\n    2b2f:\t0f 84 5e 2a 00 00    \tje     5593 <main+0x44d3>\n    2b35:\tba 05 00 00 00       \tmov    $0x5,%edx\n    2b3a:\t48 8d 35 d9 86 00 00 \tlea    0x86d9(%rip),%rsi        # b21a <_IO_stdin_used+0x21a>\n    2b41:\t4c 89 ef             \tmov    %r13,%rdi\n    2b44:\te8 e7 e4 ff ff       \tcallq  1030 <strncmp@plt>\n    2b49:\t85 c0                \ttest   %eax,%eax\n    2b4b:\t0f 84 04 2a 00 00    \tje     5555 <main+0x4495>\n    2b51:\tba 04 00 00 00       \tmov    $0x4,%edx\n    2b56:\t48 8d 35 c3 86 00 00 \tlea    0x86c3(%rip),%rsi        # b220 <_IO_stdin_used+0x220>\n    2b5d:\t4c 89 ef             \tmov    %r13,%rdi\n    2b60:\te8 cb e4 ff ff       \tcallq  1030 <strncmp@plt>\n    2b65:\t85 c0                \ttest   %eax,%eax\n    2b67:\t0f 84 aa 29 00 00    \tje     5517 <main+0x4457>\n    2b6d:\tba 04 00 00 00       \tmov    $0x4,%edx\n    2b72:\t48 8d 35 ac 86 00 00 \tlea    0x86ac(%rip),%rsi        # b225 <_IO_stdin_used+0x225>\n    2b79:\t4c 89 ef             \tmov    %r13,%rdi\n    2b7c:\te8 af e4 ff ff       \tcallq  1030 <strncmp@plt>\n    2b81:\t85 c0                \ttest   %eax,%eax\n    2b83:\t0f 84 50 29 00 00    \tje     54d9 <main+0x4419>\n    2b89:\tba 07 00 00 00       \tmov    $0x7,%edx\n    2b8e:\t48 8d 35 95 86 00 00 \tlea    0x8695(%rip),%rsi        # b22a <_IO_stdin_used+0x22a>\n    2b95:\t4c 89 ef             \tmov    %r13,%rdi\n    2b98:\te8 93 e4 ff ff       \tcallq  1030 <strncmp@plt>\n    2b9d:\t85 c0                \ttest   %eax,%eax\n    2b9f:\t0f 84 f6 28 00 00    \tje     549b <main+0x43db>\n    2ba5:\tba 03 00 00 00       \tmov    $0x3,%edx\n    
2baa:\t48 8d 35 8b 86 00 00 \tlea    0x868b(%rip),%rsi        # b23c <_IO_stdin_used+0x23c>\n    2bb1:\t4c 89 ef             \tmov    %r13,%rdi\n    2bb4:\te8 77 e4 ff ff       \tcallq  1030 <strncmp@plt>\n    2bb9:\t85 c0                \ttest   %eax,%eax\n    2bbb:\t0f 84 9c 28 00 00    \tje     545d <main+0x439d>\n    2bc1:\tba 03 00 00 00       \tmov    $0x3,%edx\n    2bc6:\t48 8d 35 65 86 00 00 \tlea    0x8665(%rip),%rsi        # b232 <_IO_stdin_used+0x232>\n    2bcd:\t4c 89 ef             \tmov    %r13,%rdi\n    2bd0:\te8 5b e4 ff ff       \tcallq  1030 <strncmp@plt>\n    2bd5:\t85 c0                \ttest   %eax,%eax\n    2bd7:\t0f 84 42 28 00 00    \tje     541f <main+0x435f>\n    2bdd:\tba 09 00 00 00       \tmov    $0x9,%edx\n    2be2:\t48 8d 35 4d 86 00 00 \tlea    0x864d(%rip),%rsi        # b236 <_IO_stdin_used+0x236>\n    2be9:\t4c 89 ef             \tmov    %r13,%rdi\n    2bec:\te8 3f e4 ff ff       \tcallq  1030 <strncmp@plt>\n    2bf1:\t85 c0                \ttest   %eax,%eax\n    2bf3:\t0f 84 e8 27 00 00    \tje     53e1 <main+0x4321>\n    2bf9:\tba 03 00 00 00       \tmov    $0x3,%edx\n    2bfe:\t48 8d 35 3b 86 00 00 \tlea    0x863b(%rip),%rsi        # b240 <_IO_stdin_used+0x240>\n    2c05:\t4c 89 ef             \tmov    %r13,%rdi\n    2c08:\te8 23 e4 ff ff       \tcallq  1030 <strncmp@plt>\n    2c0d:\t85 c0                \ttest   %eax,%eax\n    2c0f:\t0f 84 8e 27 00 00    \tje     53a3 <main+0x42e3>\n    2c15:\tba 03 00 00 00       \tmov    $0x3,%edx\n    2c1a:\t48 8d 35 2f 86 00 00 \tlea    0x862f(%rip),%rsi        # b250 <_IO_stdin_used+0x250>\n    2c21:\t4c 89 ef             \tmov    %r13,%rdi\n    2c24:\te8 07 e4 ff ff       \tcallq  1030 <strncmp@plt>\n    2c29:\t85 c0                \ttest   %eax,%eax\n    2c2b:\t0f 84 34 27 00 00    \tje     5365 <main+0x42a5>\n    2c31:\tba 09 00 00 00       \tmov    $0x9,%edx\n    2c36:\t48 8d 35 0d 86 00 00 \tlea    0x860d(%rip),%rsi        # b24a <_IO_stdin_used+0x24a>\n    2c3d:\t4c 89 ef             
\tmov    %r13,%rdi\n    2c40:\te8 eb e3 ff ff       \tcallq  1030 <strncmp@plt>\n    2c45:\t85 c0                \ttest   %eax,%eax\n    2c47:\t0f 84 da 26 00 00    \tje     5327 <main+0x4267>\n    2c4d:\tba 09 00 00 00       \tmov    $0x9,%edx\n    2c52:\t48 8d 35 fb 85 00 00 \tlea    0x85fb(%rip),%rsi        # b254 <_IO_stdin_used+0x254>\n    2c59:\t4c 89 ef             \tmov    %r13,%rdi\n    2c5c:\te8 cf e3 ff ff       \tcallq  1030 <strncmp@plt>\n    2c61:\t85 c0                \ttest   %eax,%eax\n    2c63:\t0f 84 80 26 00 00    \tje     52e9 <main+0x4229>\n    2c69:\tba 03 00 00 00       \tmov    $0x3,%edx\n    2c6e:\t48 8d 35 ef 85 00 00 \tlea    0x85ef(%rip),%rsi        # b264 <_IO_stdin_used+0x264>\n    2c75:\t4c 89 ef             \tmov    %r13,%rdi\n    2c78:\te8 b3 e3 ff ff       \tcallq  1030 <strncmp@plt>\n    2c7d:\t85 c0                \ttest   %eax,%eax\n    2c7f:\t0f 84 26 26 00 00    \tje     52ab <main+0x41eb>\n    2c85:\tba 09 00 00 00       \tmov    $0x9,%edx\n    2c8a:\t48 8d 35 cd 85 00 00 \tlea    0x85cd(%rip),%rsi        # b25e <_IO_stdin_used+0x25e>\n    2c91:\t4c 89 ef             \tmov    %r13,%rdi\n    2c94:\te8 97 e3 ff ff       \tcallq  1030 <strncmp@plt>\n    2c99:\t85 c0                \ttest   %eax,%eax\n    2c9b:\t0f 84 cc 25 00 00    \tje     526d <main+0x41ad>\n    2ca1:\tba 09 00 00 00       \tmov    $0x9,%edx\n    2ca6:\t48 8d 35 cd 85 00 00 \tlea    0x85cd(%rip),%rsi        # b27a <_IO_stdin_used+0x27a>\n    2cad:\t4c 89 ef             \tmov    %r13,%rdi\n    2cb0:\te8 7b e3 ff ff       \tcallq  1030 <strncmp@plt>\n    2cb5:\t85 c0                \ttest   %eax,%eax\n    2cb7:\t0f 84 72 25 00 00    \tje     522f <main+0x416f>\n    2cbd:\tba 0c 00 00 00       \tmov    $0xc,%edx\n    2cc2:\t48 8d 35 9f 85 00 00 \tlea    0x859f(%rip),%rsi        # b268 <_IO_stdin_used+0x268>\n    2cc9:\t4c 89 ef             \tmov    %r13,%rdi\n    2ccc:\te8 5f e3 ff ff       \tcallq  1030 <strncmp@plt>\n    2cd1:\t85 c0                \ttest   
%eax,%eax\n    2cd3:\t0f 84 18 25 00 00    \tje     51f1 <main+0x4131>\n    2cd9:\tba 0e 00 00 00       \tmov    $0xe,%edx\n    2cde:\t48 8d 35 90 85 00 00 \tlea    0x8590(%rip),%rsi        # b275 <_IO_stdin_used+0x275>\n    2ce5:\t4c 89 ef             \tmov    %r13,%rdi\n    2ce8:\te8 43 e3 ff ff       \tcallq  1030 <strncmp@plt>\n    2ced:\t85 c0                \ttest   %eax,%eax\n    2cef:\t0f 84 be 24 00 00    \tje     51b3 <main+0x40f3>\n    2cf5:\tba 0e 00 00 00       \tmov    $0xe,%edx\n    2cfa:\t48 8d 35 83 85 00 00 \tlea    0x8583(%rip),%rsi        # b284 <_IO_stdin_used+0x284>\n    2d01:\t4c 89 ef             \tmov    %r13,%rdi\n    2d04:\te8 27 e3 ff ff       \tcallq  1030 <strncmp@plt>\n    2d09:\t85 c0                \ttest   %eax,%eax\n    2d0b:\t0f 84 64 24 00 00    \tje     5175 <main+0x40b5>\n    2d11:\tba 08 00 00 00       \tmov    $0x8,%edx\n    2d16:\t48 8d 35 76 85 00 00 \tlea    0x8576(%rip),%rsi        # b293 <_IO_stdin_used+0x293>\n    2d1d:\t4c 89 ef             \tmov    %r13,%rdi\n    2d20:\te8 0b e3 ff ff       \tcallq  1030 <strncmp@plt>\n    2d25:\t85 c0                \ttest   %eax,%eax\n    2d27:\t0f 84 0a 24 00 00    \tje     5137 <main+0x4077>\n    2d2d:\tba 0c 00 00 00       \tmov    $0xc,%edx\n    2d32:\t48 8d 35 63 85 00 00 \tlea    0x8563(%rip),%rsi        # b29c <_IO_stdin_used+0x29c>\n    2d39:\t4c 89 ef             \tmov    %r13,%rdi\n    2d3c:\te8 ef e2 ff ff       \tcallq  1030 <strncmp@plt>\n    2d41:\t85 c0                \ttest   %eax,%eax\n    2d43:\t0f 84 a2 23 00 00    \tje     50eb <main+0x402b>\n    2d49:\tba 0c 00 00 00       \tmov    $0xc,%edx\n    2d4e:\t48 8d 35 54 85 00 00 \tlea    0x8554(%rip),%rsi        # b2a9 <_IO_stdin_used+0x2a9>\n    2d55:\t4c 89 ef             \tmov    %r13,%rdi\n    2d58:\te8 d3 e2 ff ff       \tcallq  1030 <strncmp@plt>\n    2d5d:\t85 c0                \ttest   %eax,%eax\n    2d5f:\t0f 84 38 23 00 00    \tje     509d <main+0x3fdd>\n    2d65:\tba 0c 00 00 00       \tmov    $0xc,%edx\n 
   2d6a:\t48 8d 35 45 85 00 00 \tlea    0x8545(%rip),%rsi        # b2b6 <_IO_stdin_used+0x2b6>\n    2d71:\t4c 89 ef             \tmov    %r13,%rdi\n    2d74:\te8 b7 e2 ff ff       \tcallq  1030 <strncmp@plt>\n    2d79:\t85 c0                \ttest   %eax,%eax\n    2d7b:\t0f 84 d0 22 00 00    \tje     5051 <main+0x3f91>\n    2d81:\tba 0c 00 00 00       \tmov    $0xc,%edx\n    2d86:\t48 8d 35 36 85 00 00 \tlea    0x8536(%rip),%rsi        # b2c3 <_IO_stdin_used+0x2c3>\n    2d8d:\t4c 89 ef             \tmov    %r13,%rdi\n    2d90:\te8 9b e2 ff ff       \tcallq  1030 <strncmp@plt>\n    2d95:\t85 c0                \ttest   %eax,%eax\n    2d97:\t0f 84 66 22 00 00    \tje     5003 <main+0x3f43>\n    2d9d:\tba 0b 00 00 00       \tmov    $0xb,%edx\n    2da2:\t48 8d 35 27 85 00 00 \tlea    0x8527(%rip),%rsi        # b2d0 <_IO_stdin_used+0x2d0>\n    2da9:\t4c 89 ef             \tmov    %r13,%rdi\n    2dac:\te8 7f e2 ff ff       \tcallq  1030 <strncmp@plt>\n    2db1:\t85 c0                \ttest   %eax,%eax\n    2db3:\t0f 84 fc 21 00 00    \tje     4fb5 <main+0x3ef5>\n    2db9:\tba 0b 00 00 00       \tmov    $0xb,%edx\n    2dbe:\t48 8d 35 17 85 00 00 \tlea    0x8517(%rip),%rsi        # b2dc <_IO_stdin_used+0x2dc>\n    2dc5:\t4c 89 ef             \tmov    %r13,%rdi\n    2dc8:\te8 63 e2 ff ff       \tcallq  1030 <strncmp@plt>\n    2dcd:\t85 c0                \ttest   %eax,%eax\n    2dcf:\t0f 84 92 21 00 00    \tje     4f67 <main+0x3ea7>\n    2dd5:\tba 0b 00 00 00       \tmov    $0xb,%edx\n    2dda:\t48 8d 35 07 85 00 00 \tlea    0x8507(%rip),%rsi        # b2e8 <_IO_stdin_used+0x2e8>\n    2de1:\t4c 89 ef             \tmov    %r13,%rdi\n    2de4:\te8 47 e2 ff ff       \tcallq  1030 <strncmp@plt>\n    2de9:\t85 c0                \ttest   %eax,%eax\n    2deb:\t0f 84 28 21 00 00    \tje     4f19 <main+0x3e59>\n    2df1:\tba 0b 00 00 00       \tmov    $0xb,%edx\n    2df6:\t48 8d 35 f7 84 00 00 \tlea    0x84f7(%rip),%rsi        # b2f4 <_IO_stdin_used+0x2f4>\n    2dfd:\t4c 89 ef          
   \tmov    %r13,%rdi\n    2e00:\te8 2b e2 ff ff       \tcallq  1030 <strncmp@plt>\n    2e05:\t85 c0                \ttest   %eax,%eax\n    2e07:\t0f 84 be 20 00 00    \tje     4ecb <main+0x3e0b>\n    2e0d:\tba 08 00 00 00       \tmov    $0x8,%edx\n    2e12:\t48 8d 35 d2 84 00 00 \tlea    0x84d2(%rip),%rsi        # b2eb <_IO_stdin_used+0x2eb>\n    2e19:\t4c 89 ef             \tmov    %r13,%rdi\n    2e1c:\te8 0f e2 ff ff       \tcallq  1030 <strncmp@plt>\n    2e21:\t85 c0                \ttest   %eax,%eax\n    2e23:\t0f 84 64 20 00 00    \tje     4e8d <main+0x3dcd>\n    2e29:\tba 08 00 00 00       \tmov    $0x8,%edx\n    2e2e:\t48 8d 35 c2 84 00 00 \tlea    0x84c2(%rip),%rsi        # b2f7 <_IO_stdin_used+0x2f7>\n    2e35:\t4c 89 ef             \tmov    %r13,%rdi\n    2e38:\te8 f3 e1 ff ff       \tcallq  1030 <strncmp@plt>\n    2e3d:\t85 c0                \ttest   %eax,%eax\n    2e3f:\t0f 84 0a 20 00 00    \tje     4e4f <main+0x3d8f>\n    2e45:\tba 09 00 00 00       \tmov    $0x9,%edx\n    2e4a:\t48 8d 35 68 84 00 00 \tlea    0x8468(%rip),%rsi        # b2b9 <_IO_stdin_used+0x2b9>\n    2e51:\t4c 89 ef             \tmov    %r13,%rdi\n    2e54:\te8 d7 e1 ff ff       \tcallq  1030 <strncmp@plt>\n    2e59:\t85 c0                \ttest   %eax,%eax\n    2e5b:\t0f 84 b0 1f 00 00    \tje     4e11 <main+0x3d51>\n    2e61:\tba 09 00 00 00       \tmov    $0x9,%edx\n    2e66:\t48 8d 35 68 85 00 00 \tlea    0x8568(%rip),%rsi        # b3d5 <_IO_stdin_used+0x3d5>\n    2e6d:\t4c 89 ef             \tmov    %r13,%rdi\n    2e70:\te8 bb e1 ff ff       \tcallq  1030 <strncmp@plt>\n    2e75:\t85 c0                \ttest   %eax,%eax\n    2e77:\t0f 84 56 1f 00 00    \tje     4dd3 <main+0x3d13>\n    2e7d:\tba 06 00 00 00       \tmov    $0x6,%edx\n    2e82:\t48 8d 35 7a 84 00 00 \tlea    0x847a(%rip),%rsi        # b303 <_IO_stdin_used+0x303>\n    2e89:\t4c 89 ef             \tmov    %r13,%rdi\n    2e8c:\te8 9f e1 ff ff       \tcallq  1030 <strncmp@plt>\n    2e91:\t85 c0                \ttest   
%eax,%eax\n    2e93:\t0f 84 fa 1e 00 00    \tje     4d93 <main+0x3cd3>\n    2e99:\tba 06 00 00 00       \tmov    $0x6,%edx\n    2e9e:\t48 8d 35 0c 83 00 00 \tlea    0x830c(%rip),%rsi        # b1b1 <_IO_stdin_used+0x1b1>\n    2ea5:\t4c 89 ef             \tmov    %r13,%rdi\n    2ea8:\te8 83 e1 ff ff       \tcallq  1030 <strncmp@plt>\n    2ead:\t85 c0                \ttest   %eax,%eax\n    2eaf:\t0f 84 9e 1e 00 00    \tje     4d53 <main+0x3c93>\n    2eb5:\tba 09 00 00 00       \tmov    $0x9,%edx\n    2eba:\t48 8d 35 3f 84 00 00 \tlea    0x843f(%rip),%rsi        # b300 <_IO_stdin_used+0x300>\n    2ec1:\t4c 89 ef             \tmov    %r13,%rdi\n    2ec4:\te8 67 e1 ff ff       \tcallq  1030 <strncmp@plt>\n    2ec9:\t85 c0                \ttest   %eax,%eax\n    2ecb:\t0f 84 34 1e 00 00    \tje     4d05 <main+0x3c45>\n    2ed1:\tba 09 00 00 00       \tmov    $0x9,%edx\n    2ed6:\t48 8d 35 2d 84 00 00 \tlea    0x842d(%rip),%rsi        # b30a <_IO_stdin_used+0x30a>\n    2edd:\t4c 89 ef             \tmov    %r13,%rdi\n    2ee0:\te8 4b e1 ff ff       \tcallq  1030 <strncmp@plt>\n    2ee5:\t85 c0                \ttest   %eax,%eax\n    2ee7:\t0f 84 ca 1d 00 00    \tje     4cb7 <main+0x3bf7>\n    2eed:\tba 06 00 00 00       \tmov    $0x6,%edx\n    2ef2:\t48 8d 35 29 84 00 00 \tlea    0x8429(%rip),%rsi        # b322 <_IO_stdin_used+0x322>\n    2ef9:\t4c 89 ef             \tmov    %r13,%rdi\n    2efc:\te8 2f e1 ff ff       \tcallq  1030 <strncmp@plt>\n    2f01:\t85 c0                \ttest   %eax,%eax\n    2f03:\t0f 84 6e 1d 00 00    \tje     4c77 <main+0x3bb7>\n    2f09:\tba 06 00 00 00       \tmov    $0x6,%edx\n    2f0e:\t48 8d 35 ff 83 00 00 \tlea    0x83ff(%rip),%rsi        # b314 <_IO_stdin_used+0x314>\n    2f15:\t4c 89 ef             \tmov    %r13,%rdi\n    2f18:\te8 13 e1 ff ff       \tcallq  1030 <strncmp@plt>\n    2f1d:\t85 c0                \ttest   %eax,%eax\n    2f1f:\t0f 84 12 1d 00 00    \tje     4c37 <main+0x3b77>\n    2f25:\tba 0c 00 00 00       \tmov    $0xc,%edx\n 
   2f2a:\t48 8d 35 eb 83 00 00 \tlea    0x83eb(%rip),%rsi        # b31c <_IO_stdin_used+0x31c>\n    2f31:\t4c 89 ef             \tmov    %r13,%rdi\n    2f34:\te8 f7 e0 ff ff       \tcallq  1030 <strncmp@plt>\n    2f39:\t85 c0                \ttest   %eax,%eax\n    2f3b:\t0f 84 b2 1c 00 00    \tje     4bf3 <main+0x3b33>\n    2f41:\tba 0b 00 00 00       \tmov    $0xb,%edx\n    2f46:\t48 8d 35 dd 83 00 00 \tlea    0x83dd(%rip),%rsi        # b32a <_IO_stdin_used+0x32a>\n    2f4d:\t4c 89 ef             \tmov    %r13,%rdi\n    2f50:\te8 db e0 ff ff       \tcallq  1030 <strncmp@plt>\n    2f55:\t85 c0                \ttest   %eax,%eax\n    2f57:\t0f 84 52 1c 00 00    \tje     4baf <main+0x3aef>\n    2f5d:\tba 0e 00 00 00       \tmov    $0xe,%edx\n    2f62:\t48 8d 35 ce 83 00 00 \tlea    0x83ce(%rip),%rsi        # b337 <_IO_stdin_used+0x337>\n    2f69:\t4c 89 ef             \tmov    %r13,%rdi\n    2f6c:\te8 bf e0 ff ff       \tcallq  1030 <strncmp@plt>\n    2f71:\t85 c0                \ttest   %eax,%eax\n    2f73:\t0f 84 f2 1b 00 00    \tje     4b6b <main+0x3aab>\n    2f79:\tba 0b 00 00 00       \tmov    $0xb,%edx\n    2f7e:\t48 8d 35 c2 83 00 00 \tlea    0x83c2(%rip),%rsi        # b347 <_IO_stdin_used+0x347>\n    2f85:\t4c 89 ef             \tmov    %r13,%rdi\n    2f88:\te8 a3 e0 ff ff       \tcallq  1030 <strncmp@plt>\n    2f8d:\t85 c0                \ttest   %eax,%eax\n    2f8f:\t0f 84 92 1b 00 00    \tje     4b27 <main+0x3a67>\n    2f95:\tba 0d 00 00 00       \tmov    $0xd,%edx\n    2f9a:\t48 8d 35 76 81 00 00 \tlea    0x8176(%rip),%rsi        # b117 <_IO_stdin_used+0x117>\n    2fa1:\t4c 89 ef             \tmov    %r13,%rdi\n    2fa4:\te8 87 e0 ff ff       \tcallq  1030 <strncmp@plt>\n    2fa9:\t85 c0                \ttest   %eax,%eax\n    2fab:\t0f 84 32 1b 00 00    \tje     4ae3 <main+0x3a23>\n    2fb1:\tba 0f 00 00 00       \tmov    $0xf,%edx\n    2fb6:\t48 8d 35 97 83 00 00 \tlea    0x8397(%rip),%rsi        # b354 <_IO_stdin_used+0x354>\n    2fbd:\t4c 89 ef          
   \tmov    %r13,%rdi\n    2fc0:\te8 6b e0 ff ff       \tcallq  1030 <strncmp@plt>\n    2fc5:\t85 c0                \ttest   %eax,%eax\n    2fc7:\t0f 84 d6 1a 00 00    \tje     4aa3 <main+0x39e3>\n    2fcd:\tba 08 00 00 00       \tmov    $0x8,%edx\n    2fd2:\t48 8d 35 8c 83 00 00 \tlea    0x838c(%rip),%rsi        # b365 <_IO_stdin_used+0x365>\n    2fd9:\t4c 89 ef             \tmov    %r13,%rdi\n    2fdc:\te8 4f e0 ff ff       \tcallq  1030 <strncmp@plt>\n    2fe1:\t85 c0                \ttest   %eax,%eax\n    2fe3:\t0f 84 6c 1a 00 00    \tje     4a55 <main+0x3995>\n    2fe9:\tba 08 00 00 00       \tmov    $0x8,%edx\n    2fee:\t48 8d 35 79 83 00 00 \tlea    0x8379(%rip),%rsi        # b36e <_IO_stdin_used+0x36e>\n    2ff5:\t4c 89 ef             \tmov    %r13,%rdi\n    2ff8:\te8 33 e0 ff ff       \tcallq  1030 <strncmp@plt>\n    2ffd:\t85 c0                \ttest   %eax,%eax\n    2fff:\t0f 84 02 1a 00 00    \tje     4a07 <main+0x3947>\n    3005:\tba 05 00 00 00       \tmov    $0x5,%edx\n    300a:\t48 8d 35 57 83 00 00 \tlea    0x8357(%rip),%rsi        # b368 <_IO_stdin_used+0x368>\n    3011:\t4c 89 ef             \tmov    %r13,%rdi\n    3014:\te8 17 e0 ff ff       \tcallq  1030 <strncmp@plt>\n    3019:\t85 c0                \ttest   %eax,%eax\n    301b:\t0f 84 a6 19 00 00    \tje     49c7 <main+0x3907>\n    3021:\tba 05 00 00 00       \tmov    $0x5,%edx\n    3026:\t48 8d 35 62 83 00 00 \tlea    0x8362(%rip),%rsi        # b38f <_IO_stdin_used+0x38f>\n    302d:\t4c 89 ef             \tmov    %r13,%rdi\n    3030:\te8 fb df ff ff       \tcallq  1030 <strncmp@plt>\n    3035:\t85 c0                \ttest   %eax,%eax\n    3037:\t0f 84 4a 19 00 00    \tje     4987 <main+0x38c7>\n    303d:\tba 05 00 00 00       \tmov    $0x5,%edx\n    3042:\t48 8d 35 2e 83 00 00 \tlea    0x832e(%rip),%rsi        # b377 <_IO_stdin_used+0x377>\n    3049:\t4c 89 ef             \tmov    %r13,%rdi\n    304c:\te8 df df ff ff       \tcallq  1030 <strncmp@plt>\n    3051:\t85 c0                \ttest   
%eax,%eax\n    3053:\t0f 84 ee 18 00 00    \tje     4947 <main+0x3887>\n    3059:\tba 05 00 00 00       \tmov    $0x5,%edx\n    305e:\t48 8d 35 20 83 00 00 \tlea    0x8320(%rip),%rsi        # b385 <_IO_stdin_used+0x385>\n    3065:\t4c 89 ef             \tmov    %r13,%rdi\n    3068:\te8 c3 df ff ff       \tcallq  1030 <strncmp@plt>\n    306d:\t85 c0                \ttest   %eax,%eax\n    306f:\t0f 84 92 18 00 00    \tje     4907 <main+0x3847>\n    3075:\tba 07 00 00 00       \tmov    $0x7,%edx\n    307a:\t48 8d 35 1a 83 00 00 \tlea    0x831a(%rip),%rsi        # b39b <_IO_stdin_used+0x39b>\n    3081:\t4c 89 ef             \tmov    %r13,%rdi\n    3084:\te8 a7 df ff ff       \tcallq  1030 <strncmp@plt>\n    3089:\t85 c0                \ttest   %eax,%eax\n    308b:\t0f 84 36 18 00 00    \tje     48c7 <main+0x3807>\n    3091:\tba 0d 00 00 00       \tmov    $0xd,%edx\n    3096:\t48 8d 35 f8 82 00 00 \tlea    0x82f8(%rip),%rsi        # b395 <_IO_stdin_used+0x395>\n    309d:\t4c 89 ef             \tmov    %r13,%rdi\n    30a0:\te8 8b df ff ff       \tcallq  1030 <strncmp@plt>\n    30a5:\t85 c0                \ttest   %eax,%eax\n    30a7:\t0f 84 da 17 00 00    \tje     4887 <main+0x37c7>\n    30ad:\tba 07 00 00 00       \tmov    $0x7,%edx\n    30b2:\t48 8d 35 ea 82 00 00 \tlea    0x82ea(%rip),%rsi        # b3a3 <_IO_stdin_used+0x3a3>\n    30b9:\t4c 89 ef             \tmov    %r13,%rdi\n    30bc:\te8 6f df ff ff       \tcallq  1030 <strncmp@plt>\n    30c1:\t85 c0                \ttest   %eax,%eax\n    30c3:\t0f 84 7e 17 00 00    \tje     4847 <main+0x3787>\n    30c9:\tba 0d 00 00 00       \tmov    $0xd,%edx\n    30ce:\t48 8d 35 d6 82 00 00 \tlea    0x82d6(%rip),%rsi        # b3ab <_IO_stdin_used+0x3ab>\n    30d5:\t4c 89 ef             \tmov    %r13,%rdi\n    30d8:\te8 53 df ff ff       \tcallq  1030 <strncmp@plt>\n    30dd:\t85 c0                \ttest   %eax,%eax\n    30df:\t0f 84 22 17 00 00    \tje     4807 <main+0x3747>\n    30e5:\tba 07 00 00 00       \tmov    $0x7,%edx\n 
   30ea:\t48 8d 35 cc 82 00 00 \tlea    0x82cc(%rip),%rsi        # b3bd <_IO_stdin_used+0x3bd>\n    30f1:\t4c 89 ef             \tmov    %r13,%rdi\n    30f4:\te8 37 df ff ff       \tcallq  1030 <strncmp@plt>\n    30f9:\t85 c0                \ttest   %eax,%eax\n    30fb:\t0f 85 a9 f7 ff ff    \tjne    28aa <main+0x17ea>\n    3101:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    3107:\t48 8d 35 b2 78 00 00 \tlea    0x78b2(%rip),%rsi        # a9c0 <store128wrapper>\n    310e:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    3113:\te8 28 79 00 00       \tcallq  aa40 <measureFunction>\n    3118:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    311d:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    3122:\t48 8d 35 d4 82 00 00 \tlea    0x82d4(%rip),%rsi        # b3fd <_IO_stdin_used+0x3fd>\n    3129:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    312d:\te8 4e df ff ff       \tcallq  1080 <__printf_chk@plt>\n    3132:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    3136:\t0f 8f 6a f7 ff ff    \tjg     28a6 <main+0x17e6>\n    313c:\te9 05 f8 ff ff       \tjmpq   2946 <main+0x1886>\n    3141:\t48 8b 0d 98 af 00 00 \tmov    0xaf98(%rip),%rcx        # e0e0 <stderr@@GLIBC_2.2.5>\n    3148:\tba 11 00 00 00       \tmov    $0x11,%edx\n    314d:\tbe 01 00 00 00       \tmov    $0x1,%esi\n    3152:\t48 8d 3d f2 7e 00 00 \tlea    0x7ef2(%rip),%rdi        # b04b <_IO_stdin_used+0x4b>\n    3159:\te8 32 df ff ff       \tcallq  1090 <fwrite@plt>\n    315e:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    3162:\t0f 84 48 33 00 00    \tje     64b0 <main+0x53f0>\n    3168:\tf2 0f 10 05 10 8f 00 \tmovsd  0x8f10(%rip),%xmm0        # c080 <_IO_stdin_used+0x1080>\n    316f:\t00 \n    3170:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    3175:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    317a:\t48 8d 35 f7 85 00 00 \tlea    0x85f7(%rip),%rsi        # b778 <_IO_stdin_used+0x778>\n    3181:\te8 fa de ff ff       \tcallq  1080 <__printf_chk@plt>\n    3186:\t41 83 fc 01          \tcmp    
$0x1,%r12d\n    318a:\t0f 8e b6 f7 ff ff    \tjle    2946 <main+0x1886>\n    3190:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    3194:\tba 06 00 00 00       \tmov    $0x6,%edx\n    3199:\t48 8d 35 70 7f 00 00 \tlea    0x7f70(%rip),%rsi        # b110 <_IO_stdin_used+0x110>\n    31a0:\t4c 89 ef             \tmov    %r13,%rdi\n    31a3:\te8 88 de ff ff       \tcallq  1030 <strncmp@plt>\n    31a8:\t85 c0                \ttest   %eax,%eax\n    31aa:\t75 37                \tjne    31e3 <main+0x2123>\n    31ac:\t48 8d 35 44 5a 00 00 \tlea    0x5a44(%rip),%rsi        # 8bf7 <fma512>\n    31b3:\tf3 0f 10 05 cd 8e 00 \tmovss  0x8ecd(%rip),%xmm0        # c088 <_IO_stdin_used+0x1088>\n    31ba:\t00 \n    31bb:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    31c0:\te8 7b 78 00 00       \tcallq  aa40 <measureFunction>\n    31c5:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    31ca:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    31cf:\t48 8d 35 87 7e 00 00 \tlea    0x7e87(%rip),%rsi        # b05d <_IO_stdin_used+0x5d>\n    31d6:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    31da:\te8 a1 de ff ff       \tcallq  1080 <__printf_chk@plt>\n    31df:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    31e3:\tba 09 00 00 00       \tmov    $0x9,%edx\n    31e8:\t48 8d 35 0e 7f 00 00 \tlea    0x7f0e(%rip),%rsi        # b0fd <_IO_stdin_used+0xfd>\n    31ef:\t4c 89 ef             \tmov    %r13,%rdi\n    31f2:\te8 39 de ff ff       \tcallq  1030 <strncmp@plt>\n    31f7:\t85 c0                \ttest   %eax,%eax\n    31f9:\t0f 84 63 15 00 00    \tje     4762 <main+0x36a2>\n    31ff:\tba 0f 00 00 00       \tmov    $0xf,%edx\n    3204:\t48 8d 35 fc 7e 00 00 \tlea    0x7efc(%rip),%rsi        # b107 <_IO_stdin_used+0x107>\n    320b:\t4c 89 ef             \tmov    %r13,%rdi\n    320e:\te8 1d de ff ff       \tcallq  1030 <strncmp@plt>\n    3213:\t85 c0                \ttest   %eax,%eax\n    3215:\t0f 85 01 32 00 00    \tjne    641c <main+0x535c>\n    321b:\t48 8d 35 b6 5a 00 00 \tlea    
0x5ab6(%rip),%rsi        # 8cd8 <mixfma256fma512>\n    3222:\tf3 0f 10 05 5e 8e 00 \tmovss  0x8e5e(%rip),%xmm0        # c088 <_IO_stdin_used+0x1088>\n    3229:\t00 \n    322a:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    322f:\te8 0c 78 00 00       \tcallq  aa40 <measureFunction>\n    3234:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    3239:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    323e:\t48 8d 35 83 85 00 00 \tlea    0x8583(%rip),%rsi        # b7c8 <_IO_stdin_used+0x7c8>\n    3245:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    3249:\te8 32 de ff ff       \tcallq  1080 <__printf_chk@plt>\n    324e:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    3252:\t0f 8e ee f6 ff ff    \tjle    2946 <main+0x1886>\n    3258:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    325c:\tba 0d 00 00 00       \tmov    $0xd,%edx\n    3261:\t48 8d 35 af 7e 00 00 \tlea    0x7eaf(%rip),%rsi        # b117 <_IO_stdin_used+0x117>\n    3268:\t4c 89 ef             \tmov    %r13,%rdi\n    326b:\te8 c0 dd ff ff       \tcallq  1030 <strncmp@plt>\n    3270:\t85 c0                \ttest   %eax,%eax\n    3272:\t0f 85 4a 31 00 00    \tjne    63c2 <main+0x5302>\n    3278:\t48 8d 35 4a 62 00 00 \tlea    0x624a(%rip),%rsi        # 94c9 <nemesfpu512mix21>\n    327f:\tf3 0f 10 05 01 8e 00 \tmovss  0x8e01(%rip),%xmm0        # c088 <_IO_stdin_used+0x1088>\n    3286:\t00 \n    3287:\tbf 05 7a d7 03       \tmov    $0x3d77a05,%edi\n    328c:\t48 c1 e7 09          \tshl    $0x9,%rdi\n    3290:\te8 ab 77 00 00       \tcallq  aa40 <measureFunction>\n    3295:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    329a:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    329f:\t48 8d 35 4a 85 00 00 \tlea    0x854a(%rip),%rsi        # b7f0 <_IO_stdin_used+0x7f0>\n    32a6:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    32aa:\te8 d1 dd ff ff       \tcallq  1080 <__printf_chk@plt>\n    32af:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    32b3:\tba 09 00 00 00       \tmov    $0x9,%edx\n    32b8:\t48 8d 35 69 7e 
00 00 \tlea    0x7e69(%rip),%rsi        # b128 <_IO_stdin_used+0x128>\n    32bf:\t4c 89 ef             \tmov    %r13,%rdi\n    32c2:\te8 69 dd ff ff       \tcallq  1030 <strncmp@plt>\n    32c7:\t85 c0                \ttest   %eax,%eax\n    32c9:\t0f 85 8b 30 00 00    \tjne    635a <main+0x529a>\n    32cf:\t48 8d 35 b7 47 00 00 \tlea    0x47b7(%rip),%rsi        # 7a8d <add512int>\n    32d6:\tf3 0f 10 05 aa 8d 00 \tmovss  0x8daa(%rip),%xmm0        # c088 <_IO_stdin_used+0x1088>\n    32dd:\t00 \n    32de:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    32e3:\te8 58 77 00 00       \tcallq  aa40 <measureFunction>\n    32e8:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    32ed:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    32f2:\t48 8d 35 27 85 00 00 \tlea    0x8527(%rip),%rsi        # b820 <_IO_stdin_used+0x820>\n    32f9:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    32fd:\te8 7e dd ff ff       \tcallq  1080 <__printf_chk@plt>\n    3302:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    3306:\tba 0c 00 00 00       \tmov    $0xc,%edx\n    330b:\t48 8d 35 13 7e 00 00 \tlea    0x7e13(%rip),%rsi        # b125 <_IO_stdin_used+0x125>\n    3312:\t4c 89 ef             \tmov    %r13,%rdi\n    3315:\te8 16 dd ff ff       \tcallq  1030 <strncmp@plt>\n    331a:\t85 c0                \ttest   %eax,%eax\n    331c:\t0f 85 de 2f 00 00    \tjne    6300 <main+0x5240>\n    3322:\t48 8d 35 87 4b 00 00 \tlea    0x4b87(%rip),%rsi        # 7eb0 <latadd256int>\n    3329:\tf3 0f 10 05 57 8d 00 \tmovss  0x8d57(%rip),%xmm0        # c088 <_IO_stdin_used+0x1088>\n    3330:\t00 \n    3331:\t4c 89 f7             \tmov    %r14,%rdi\n    3334:\te8 07 77 00 00       \tcallq  aa40 <measureFunction>\n    3339:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    333e:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    3343:\tf3 0f 10 0d 3d 8d 00 \tmovss  0x8d3d(%rip),%xmm1        # c088 <_IO_stdin_used+0x1088>\n    334a:\t00 \n    334b:\t48 8d 35 ee 84 00 00 \tlea    0x84ee(%rip),%rsi        # b840 
<_IO_stdin_used+0x840>\n    3352:\tf3 0f 5e c8          \tdivss  %xmm0,%xmm1\n    3356:\t66 0f ef c0          \tpxor   %xmm0,%xmm0\n    335a:\tf3 0f 5a c1          \tcvtss2sd %xmm1,%xmm0\n    335e:\te8 1d dd ff ff       \tcallq  1080 <__printf_chk@plt>\n    3363:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    3367:\tba 09 00 00 00       \tmov    $0x9,%edx\n    336c:\t48 8d 35 d0 7d 00 00 \tlea    0x7dd0(%rip),%rsi        # b143 <_IO_stdin_used+0x143>\n    3373:\t4c 89 ef             \tmov    %r13,%rdi\n    3376:\te8 b5 dc ff ff       \tcallq  1030 <strncmp@plt>\n    337b:\t85 c0                \ttest   %eax,%eax\n    337d:\t0f 85 1f 2f 00 00    \tjne    62a2 <main+0x51e2>\n    3383:\t48 8d 35 87 45 00 00 \tlea    0x4587(%rip),%rsi        # 7911 <mul512int>\n    338a:\tf3 0f 10 05 f6 8c 00 \tmovss  0x8cf6(%rip),%xmm0        # c088 <_IO_stdin_used+0x1088>\n    3391:\t00 \n    3392:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    3397:\te8 a4 76 00 00       \tcallq  aa40 <measureFunction>\n    339c:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    33a1:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    33a6:\t48 8d 35 bb 84 00 00 \tlea    0x84bb(%rip),%rsi        # b868 <_IO_stdin_used+0x868>\n    33ad:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    33b1:\te8 ca dc ff ff       \tcallq  1080 <__printf_chk@plt>\n    33b6:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    33ba:\tba 09 00 00 00       \tmov    $0x9,%edx\n    33bf:\t48 8d 35 8a 7d 00 00 \tlea    0x7d8a(%rip),%rsi        # b150 <_IO_stdin_used+0x150>\n    33c6:\t4c 89 ef             \tmov    %r13,%rdi\n    33c9:\te8 62 dc ff ff       \tcallq  1030 <strncmp@plt>\n    33ce:\t85 c0                \ttest   %eax,%eax\n    33d0:\t0f 85 5a 2e 00 00    \tjne    6230 <main+0x5170>\n    33d6:\t48 8d 35 f2 45 00 00 \tlea    0x45f2(%rip),%rsi        # 79cf <muldq512int>\n    33dd:\tf3 0f 10 05 a3 8c 00 \tmovss  0x8ca3(%rip),%xmm0        # c088 <_IO_stdin_used+0x1088>\n    33e4:\t00 \n    33e5:\tbf 00 2f 68 59      
 \tmov    $0x59682f00,%edi\n    33ea:\te8 51 76 00 00       \tcallq  aa40 <measureFunction>\n    33ef:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    33f4:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    33f9:\t48 8d 35 90 84 00 00 \tlea    0x8490(%rip),%rsi        # b890 <_IO_stdin_used+0x890>\n    3400:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    3404:\te8 77 dc ff ff       \tcallq  1080 <__printf_chk@plt>\n    3409:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    340d:\tba 0d 00 00 00       \tmov    $0xd,%edx\n    3412:\t48 8d 35 19 7d 00 00 \tlea    0x7d19(%rip),%rsi        # b132 <_IO_stdin_used+0x132>\n    3419:\t4c 89 ef             \tmov    %r13,%rdi\n    341c:\te8 0f dc ff ff       \tcallq  1030 <strncmp@plt>\n    3421:\t85 c0                \ttest   %eax,%eax\n    3423:\t0f 85 99 2d 00 00    \tjne    61c2 <main+0x5102>\n    3429:\t48 8d 35 99 4d 00 00 \tlea    0x4d99(%rip),%rsi        # 81c9 <latmulq512int>\n    3430:\tf3 0f 10 05 50 8c 00 \tmovss  0x8c50(%rip),%xmm0        # c088 <_IO_stdin_used+0x1088>\n    3437:\t00 \n    3438:\t4c 89 f7             \tmov    %r14,%rdi\n    343b:\te8 00 76 00 00       \tcallq  aa40 <measureFunction>\n    3440:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    3445:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    344a:\tf3 0f 10 3d 36 8c 00 \tmovss  0x8c36(%rip),%xmm7        # c088 <_IO_stdin_used+0x1088>\n    3451:\t00 \n    3452:\t48 8d 35 67 84 00 00 \tlea    0x8467(%rip),%rsi        # b8c0 <_IO_stdin_used+0x8c0>\n    3459:\tf3 0f 11 7c 24 0c    \tmovss  %xmm7,0xc(%rsp)\n    345f:\tf3 0f 5e f8          \tdivss  %xmm0,%xmm7\n    3463:\t66 0f ef c0          \tpxor   %xmm0,%xmm0\n    3467:\tf3 0f 5a c7          \tcvtss2sd %xmm7,%xmm0\n    346b:\te8 10 dc ff ff       \tcallq  1080 <__printf_chk@plt>\n    3470:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    3474:\tba 0c 00 00 00       \tmov    $0xc,%edx\n    3479:\t48 8d 35 c0 7c 00 00 \tlea    0x7cc0(%rip),%rsi        # b140 <_IO_stdin_used+0x140>\n    3480:\t4c 89 ef 
            \tmov    %r13,%rdi\n    3483:\te8 a8 db ff ff       \tcallq  1030 <strncmp@plt>\n    3488:\t85 c0                \ttest   %eax,%eax\n    348a:\t0f 85 ca 2c 00 00    \tjne    615a <main+0x509a>\n    3490:\t48 8d 35 8e 4b 00 00 \tlea    0x4b8e(%rip),%rsi        # 8025 <latmul512int>\n    3497:\tf3 0f 10 05 e9 8b 00 \tmovss  0x8be9(%rip),%xmm0        # c088 <_IO_stdin_used+0x1088>\n    349e:\t00 \n    349f:\t4c 89 f7             \tmov    %r14,%rdi\n    34a2:\te8 99 75 00 00       \tcallq  aa40 <measureFunction>\n    34a7:\tf3 0f 10 7c 24 0c    \tmovss  0xc(%rsp),%xmm7\n    34ad:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    34b2:\t48 8d 35 37 84 00 00 \tlea    0x8437(%rip),%rsi        # b8f0 <_IO_stdin_used+0x8f0>\n    34b9:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    34be:\tf3 0f 5e f8          \tdivss  %xmm0,%xmm7\n    34c2:\t66 0f ef c0          \tpxor   %xmm0,%xmm0\n    34c6:\tf3 0f 5a c7          \tcvtss2sd %xmm7,%xmm0\n    34ca:\te8 b1 db ff ff       \tcallq  1080 <__printf_chk@plt>\n    34cf:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    34d3:\tba 0d 00 00 00       \tmov    $0xd,%edx\n    34d8:\t48 8d 35 6e 7c 00 00 \tlea    0x7c6e(%rip),%rsi        # b14d <_IO_stdin_used+0x14d>\n    34df:\t4c 89 ef             \tmov    %r13,%rdi\n    34e2:\te8 49 db ff ff       \tcallq  1030 <strncmp@plt>\n    34e7:\t85 c0                \ttest   %eax,%eax\n    34e9:\t0f 85 0d 2c 00 00    \tjne    60fc <main+0x503c>\n    34ef:\t48 8d 35 01 4c 00 00 \tlea    0x4c01(%rip),%rsi        # 80f7 <latmuldq512int>\n    34f6:\tf3 0f 10 05 8a 8b 00 \tmovss  0x8b8a(%rip),%xmm0        # c088 <_IO_stdin_used+0x1088>\n    34fd:\t00 \n    34fe:\t4c 89 f7             \tmov    %r14,%rdi\n    3501:\te8 3a 75 00 00       \tcallq  aa40 <measureFunction>\n    3506:\tf3 0f 10 74 24 0c    \tmovss  0xc(%rsp),%xmm6\n    350c:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    3511:\t48 8d 35 08 84 00 00 \tlea    0x8408(%rip),%rsi        # b920 <_IO_stdin_used+0x920>\n    3518:\tb8 01 00 
00 00       \tmov    $0x1,%eax\n    351d:\tf3 0f 5e f0          \tdivss  %xmm0,%xmm6\n    3521:\t66 0f ef c0          \tpxor   %xmm0,%xmm0\n    3525:\tf3 0f 5a c6          \tcvtss2sd %xmm6,%xmm0\n    3529:\te8 52 db ff ff       \tcallq  1080 <__printf_chk@plt>\n    352e:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    3532:\tba 0b 00 00 00       \tmov    $0xb,%edx\n    3537:\t48 8d 35 1e 7c 00 00 \tlea    0x7c1e(%rip),%rsi        # b15c <_IO_stdin_used+0x15c>\n    353e:\t4c 89 ef             \tmov    %r13,%rdi\n    3541:\te8 ea da ff ff       \tcallq  1030 <strncmp@plt>\n    3546:\t85 c0                \ttest   %eax,%eax\n    3548:\t0f 85 50 2b 00 00    \tjne    609e <main+0x4fde>\n    354e:\t48 8d 35 cd 5a 00 00 \tlea    0x5acd(%rip),%rsi        # 9022 <mixfmaadd512>\n    3555:\tf3 0f 10 05 2b 8b 00 \tmovss  0x8b2b(%rip),%xmm0        # c088 <_IO_stdin_used+0x1088>\n    355c:\t00 \n    355d:\tbf 05 7a d7 03       \tmov    $0x3d77a05,%edi\n    3562:\t48 c1 e7 09          \tshl    $0x9,%rdi\n    3566:\te8 d5 74 00 00       \tcallq  aa40 <measureFunction>\n    356b:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    3570:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    3575:\t48 8d 35 dc 83 00 00 \tlea    0x83dc(%rip),%rsi        # b958 <_IO_stdin_used+0x958>\n    357c:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    3580:\te8 fb da ff ff       \tcallq  1080 <__printf_chk@plt>\n    3585:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    3589:\tba 0b 00 00 00       \tmov    $0xb,%edx\n    358e:\t48 8d 35 d4 7b 00 00 \tlea    0x7bd4(%rip),%rsi        # b169 <_IO_stdin_used+0x169>\n    3595:\t4c 89 ef             \tmov    %r13,%rdi\n    3598:\te8 93 da ff ff       \tcallq  1030 <strncmp@plt>\n    359d:\t85 c0                \ttest   %eax,%eax\n    359f:\t0f 85 9f 2a 00 00    \tjne    6044 <main+0x4f84>\n    35a5:\t48 8d 35 57 5b 00 00 \tlea    0x5b57(%rip),%rsi        # 9103 <mixfma512add256>\n    35ac:\tf3 0f 10 05 d4 8a 00 \tmovss  0x8ad4(%rip),%xmm0        # c088 
<_IO_stdin_used+0x1088>\n    35b3:\t00 \n    35b4:\tbf 05 7a d7 03       \tmov    $0x3d77a05,%edi\n    35b9:\t48 c1 e7 09          \tshl    $0x9,%rdi\n    35bd:\te8 7e 74 00 00       \tcallq  aa40 <measureFunction>\n    35c2:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    35c7:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    35cc:\t48 8d 35 ad 83 00 00 \tlea    0x83ad(%rip),%rsi        # b980 <_IO_stdin_used+0x980>\n    35d3:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    35d7:\te8 a4 da ff ff       \tcallq  1080 <__printf_chk@plt>\n    35dc:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    35e0:\tba 07 00 00 00       \tmov    $0x7,%edx\n    35e5:\t48 8d 35 8d 7b 00 00 \tlea    0x7b8d(%rip),%rsi        # b179 <_IO_stdin_used+0x179>\n    35ec:\t4c 89 ef             \tmov    %r13,%rdi\n    35ef:\te8 3c da ff ff       \tcallq  1030 <strncmp@plt>\n    35f4:\t85 c0                \ttest   %eax,%eax\n    35f6:\t0f 85 ee 29 00 00    \tjne    5fea <main+0x4f2a>\n    35fc:\tf3 0f 10 05 84 8a 00 \tmovss  0x8a84(%rip),%xmm0        # c088 <_IO_stdin_used+0x1088>\n    3603:\t00 \n    3604:\t48 8d 35 a5 73 00 00 \tlea    0x73a5(%rip),%rsi        # a9b0 <load512wrapper>\n    360b:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    3610:\te8 2b 74 00 00       \tcallq  aa40 <measureFunction>\n    3615:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    361a:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    361f:\t48 8d 35 52 7a 00 00 \tlea    0x7a52(%rip),%rsi        # b078 <_IO_stdin_used+0x78>\n    3626:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    362a:\te8 51 da ff ff       \tcallq  1080 <__printf_chk@plt>\n    362f:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    3633:\tba 07 00 00 00       \tmov    $0x7,%edx\n    3638:\t48 8d 35 42 7b 00 00 \tlea    0x7b42(%rip),%rsi        # b181 <_IO_stdin_used+0x181>\n    363f:\t4c 89 ef             \tmov    %r13,%rdi\n    3642:\te8 e9 d9 ff ff       \tcallq  1030 <strncmp@plt>\n    3647:\t85 c0                \ttest   %eax,%eax\n    
3649:\t0f 85 41 29 00 00    \tjne    5f90 <main+0x4ed0>\n    364f:\tf3 0f 10 05 31 8a 00 \tmovss  0x8a31(%rip),%xmm0        # c088 <_IO_stdin_used+0x1088>\n    3656:\t00 \n    3657:\t48 8d 35 a2 73 00 00 \tlea    0x73a2(%rip),%rsi        # aa00 <store512wrapper>\n    365e:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    3663:\te8 d8 73 00 00       \tcallq  aa40 <measureFunction>\n    3668:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    366d:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    3672:\t48 8d 35 1c 7a 00 00 \tlea    0x7a1c(%rip),%rsi        # b095 <_IO_stdin_used+0x95>\n    3679:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    367d:\te8 fe d9 ff ff       \tcallq  1080 <__printf_chk@plt>\n    3682:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    3686:\tba 09 00 00 00       \tmov    $0x9,%edx\n    368b:\t48 8d 35 f8 7a 00 00 \tlea    0x7af8(%rip),%rsi        # b18a <_IO_stdin_used+0x18a>\n    3692:\t4c 89 ef             \tmov    %r13,%rdi\n    3695:\te8 96 d9 ff ff       \tcallq  1030 <strncmp@plt>\n    369a:\t85 c0                \ttest   %eax,%eax\n    369c:\t0f 85 94 28 00 00    \tjne    5f36 <main+0x4e76>\n    36a2:\t48 8d 35 76 4d 00 00 \tlea    0x4d76(%rip),%rsi        # 841f <aesenc128>\n    36a9:\tf3 0f 10 05 d7 89 00 \tmovss  0x89d7(%rip),%xmm0        # c088 <_IO_stdin_used+0x1088>\n    36b0:\t00 \n    36b1:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    36b6:\te8 85 73 00 00       \tcallq  aa40 <measureFunction>\n    36bb:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    36c0:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    36c5:\t48 8d 35 e7 79 00 00 \tlea    0x79e7(%rip),%rsi        # b0b3 <_IO_stdin_used+0xb3>\n    36cc:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    36d0:\te8 ab d9 ff ff       \tcallq  1080 <__printf_chk@plt>\n    36d5:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    36d9:\tba 09 00 00 00       \tmov    $0x9,%edx\n    36de:\t48 8d 35 af 7a 00 00 \tlea    0x7aaf(%rip),%rsi        # b194 <_IO_stdin_used+0x194>\n    
36e5:\t4c 89 ef             \tmov    %r13,%rdi\n    36e8:\te8 43 d9 ff ff       \tcallq  1030 <strncmp@plt>\n    36ed:\t85 c0                \ttest   %eax,%eax\n    36ef:\t0f 85 e7 27 00 00    \tjne    5edc <main+0x4e1c>\n    36f5:\t48 8d 35 87 50 00 00 \tlea    0x5087(%rip),%rsi        # 8783 <aesdec128>\n    36fc:\tf3 0f 10 05 84 89 00 \tmovss  0x8984(%rip),%xmm0        # c088 <_IO_stdin_used+0x1088>\n    3703:\t00 \n    3704:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    3709:\te8 32 73 00 00       \tcallq  aa40 <measureFunction>\n    370e:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    3713:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    3718:\t48 8d 35 aa 79 00 00 \tlea    0x79aa(%rip),%rsi        # b0c9 <_IO_stdin_used+0xc9>\n    371f:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    3723:\te8 58 d9 ff ff       \tcallq  1080 <__printf_chk@plt>\n    3728:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    372c:\tba 0c 00 00 00       \tmov    $0xc,%edx\n    3731:\t48 8d 35 66 7a 00 00 \tlea    0x7a66(%rip),%rsi        # b19e <_IO_stdin_used+0x19e>\n    3738:\t4c 89 ef             \tmov    %r13,%rdi\n    373b:\te8 f0 d8 ff ff       \tcallq  1030 <strncmp@plt>\n    3740:\t85 c0                \ttest   %eax,%eax\n    3742:\t0f 85 3a 27 00 00    \tjne    5e82 <main+0x4dc2>\n    3748:\t48 8d 35 6a 4d 00 00 \tlea    0x4d6a(%rip),%rsi        # 84b9 <aesencadd128>\n    374f:\tf3 0f 10 05 31 89 00 \tmovss  0x8931(%rip),%xmm0        # c088 <_IO_stdin_used+0x1088>\n    3756:\t00 \n    3757:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    375c:\te8 df 72 00 00       \tcallq  aa40 <measureFunction>\n    3761:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    3766:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    376b:\t48 8d 35 3e 82 00 00 \tlea    0x823e(%rip),%rsi        # b9b0 <_IO_stdin_used+0x9b0>\n    3772:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    3776:\te8 05 d9 ff ff       \tcallq  1080 <__printf_chk@plt>\n    377b:\t4c 8b 6d 08          \tmov    
0x8(%rbp),%r13\n    377f:\tba 0c 00 00 00       \tmov    $0xc,%edx\n    3784:\t48 8d 35 20 7a 00 00 \tlea    0x7a20(%rip),%rsi        # b1ab <_IO_stdin_used+0x1ab>\n    378b:\t4c 89 ef             \tmov    %r13,%rdi\n    378e:\te8 9d d8 ff ff       \tcallq  1030 <strncmp@plt>\n    3793:\t85 c0                \ttest   %eax,%eax\n    3795:\t0f 85 89 26 00 00    \tjne    5e24 <main+0x4d64>\n    379b:\t48 8d 35 d1 4d 00 00 \tlea    0x4dd1(%rip),%rsi        # 8573 <aesencfma128>\n    37a2:\tf3 0f 10 05 de 88 00 \tmovss  0x88de(%rip),%xmm0        # c088 <_IO_stdin_used+0x1088>\n    37a9:\t00 \n    37aa:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    37af:\te8 8c 72 00 00       \tcallq  aa40 <measureFunction>\n    37b4:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    37b9:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    37be:\t48 8d 35 1a 79 00 00 \tlea    0x791a(%rip),%rsi        # b0df <_IO_stdin_used+0xdf>\n    37c5:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    37c9:\te8 b2 d8 ff ff       \tcallq  1080 <__printf_chk@plt>\n    37ce:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    37d2:\tba 0c 00 00 00       \tmov    $0xc,%edx\n    37d7:\t48 8d 35 da 79 00 00 \tlea    0x79da(%rip),%rsi        # b1b8 <_IO_stdin_used+0x1b8>\n    37de:\t4c 89 ef             \tmov    %r13,%rdi\n    37e1:\te8 4a d8 ff ff       \tcallq  1030 <strncmp@plt>\n    37e6:\t85 c0                \ttest   %eax,%eax\n    37e8:\t0f 85 7d f1 ff ff    \tjne    296b <main+0x18ab>\n    37ee:\t48 8d 35 f8 4e 00 00 \tlea    0x4ef8(%rip),%rsi        # 86ed <aesencmul128>\n    37f5:\tf3 0f 10 05 8b 88 00 \tmovss  0x888b(%rip),%xmm0        # c088 <_IO_stdin_used+0x1088>\n    37fc:\t00 \n    37fd:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    3802:\te8 39 72 00 00       \tcallq  aa40 <measureFunction>\n    3807:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    380c:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    3811:\t48 8d 35 b8 81 00 00 \tlea    0x81b8(%rip),%rsi        # b9d0 
<_IO_stdin_used+0x9d0>\n    3818:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    381c:\te8 5f d8 ff ff       \tcallq  1080 <__printf_chk@plt>\n    3821:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    3825:\tba 0c 00 00 00       \tmov    $0xc,%edx\n    382a:\t48 8d 35 87 79 00 00 \tlea    0x7987(%rip),%rsi        # b1b8 <_IO_stdin_used+0x1b8>\n    3831:\t4c 89 ef             \tmov    %r13,%rdi\n    3834:\te8 f7 d7 ff ff       \tcallq  1030 <strncmp@plt>\n    3839:\t85 c0                \ttest   %eax,%eax\n    383b:\t0f 85 ce 2c 00 00    \tjne    650f <main+0x544f>\n    3841:\tf3 0f 10 35 3f 88 00 \tmovss  0x883f(%rip),%xmm6        # c088 <_IO_stdin_used+0x1088>\n    3848:\t00 \n    3849:\tf3 0f 11 74 24 0c    \tmovss  %xmm6,0xc(%rsp)\n    384f:\tf3 0f 11 74 24 08    \tmovss  %xmm6,0x8(%rsp)\n    3855:\t48 8d 35 d8 4d 00 00 \tlea    0x4dd8(%rip),%rsi        # 8634 <aesencfadd128>\n    385c:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    3862:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    3867:\te8 d4 71 00 00       \tcallq  aa40 <measureFunction>\n    386c:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    3871:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    3876:\t48 8d 35 7b 81 00 00 \tlea    0x817b(%rip),%rsi        # b9f8 <_IO_stdin_used+0x9f8>\n    387d:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    3881:\te8 fa d7 ff ff       \tcallq  1080 <__printf_chk@plt>\n    3886:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    388a:\t0f 85 3e d9 ff ff    \tjne    11ce <main+0x10e>\n    3890:\t48 8d 35 e1 34 00 00 \tlea    0x34e1(%rip),%rsi        # 6d78 <noptest1b>\n    3897:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    389d:\t4c 89 f7             \tmov    %r14,%rdi\n    38a0:\te8 9b 71 00 00       \tcallq  aa40 <measureFunction>\n    38a5:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    38aa:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    38af:\t48 8d 35 96 7e 00 00 \tlea    0x7e96(%rip),%rsi        # b74c <_IO_stdin_used+0x74c>\n    38b6:\tf3 0f 5a 
c0          \tcvtss2sd %xmm0,%xmm0\n    38ba:\te8 c1 d7 ff ff       \tcallq  1080 <__printf_chk@plt>\n    38bf:\t48 8d 35 79 34 00 00 \tlea    0x3479(%rip),%rsi        # 6d3f <noptest>\n    38c6:\t4c 89 f7             \tmov    %r14,%rdi\n    38c9:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    38cf:\te8 6c 71 00 00       \tcallq  aa40 <measureFunction>\n    38d4:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    38d9:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    38de:\t48 8d 35 4c 7e 00 00 \tlea    0x7e4c(%rip),%rsi        # b731 <_IO_stdin_used+0x731>\n    38e5:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    38e9:\te8 92 d7 ff ff       \tcallq  1080 <__printf_chk@plt>\n    38ee:\t48 8d 35 a9 34 00 00 \tlea    0x34a9(%rip),%rsi        # 6d9e <addtest>\n    38f5:\t4c 89 f7             \tmov    %r14,%rdi\n    38f8:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    38fe:\te8 3d 71 00 00       \tcallq  aa40 <measureFunction>\n    3903:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    3908:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    390d:\t48 8d 35 09 7e 00 00 \tlea    0x7e09(%rip),%rsi        # b71d <_IO_stdin_used+0x71d>\n    3914:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    3918:\te8 63 d7 ff ff       \tcallq  1080 <__printf_chk@plt>\n    391d:\t48 8d 2d 06 35 00 00 \tlea    0x3506(%rip),%rbp        # 6e2a <addnoptest>\n    3924:\t4c 89 f7             \tmov    %r14,%rdi\n    3927:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    392d:\t48 89 ee             \tmov    %rbp,%rsi\n    3930:\te8 0b 71 00 00       \tcallq  aa40 <measureFunction>\n    3935:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    393a:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    393f:\t48 8d 35 ba 7d 00 00 \tlea    0x7dba(%rip),%rsi        # b700 <_IO_stdin_used+0x700>\n    3946:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    394a:\te8 31 d7 ff ff       \tcallq  1080 <__printf_chk@plt>\n    394f:\t48 89 ee             \tmov    %rbp,%rsi\n    3952:\t4c 89 f7             \tmov    
%r14,%rdi\n    3955:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    395b:\te8 e0 70 00 00       \tcallq  aa40 <measureFunction>\n    3960:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    3965:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    396a:\t48 8d 35 72 7d 00 00 \tlea    0x7d72(%rip),%rsi        # b6e3 <_IO_stdin_used+0x6e3>\n    3971:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    3975:\te8 06 d7 ff ff       \tcallq  1080 <__printf_chk@plt>\n    397a:\t48 8d 35 c6 6b 00 00 \tlea    0x6bc6(%rip),%rsi        # a547 <depmovtest>\n    3981:\t4c 89 f7             \tmov    %r14,%rdi\n    3984:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    398a:\te8 b1 70 00 00       \tcallq  aa40 <measureFunction>\n    398f:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    3994:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    3999:\t48 8d 35 25 7d 00 00 \tlea    0x7d25(%rip),%rsi        # b6c5 <_IO_stdin_used+0x6c5>\n    39a0:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    39a4:\te8 d7 d6 ff ff       \tcallq  1080 <__printf_chk@plt>\n    39a9:\t48 8d 35 0c 6c 00 00 \tlea    0x6c0c(%rip),%rsi        # a5bc <indepmovtest>\n    39b0:\t4c 89 f7             \tmov    %r14,%rdi\n    39b3:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    39b9:\te8 82 70 00 00       \tcallq  aa40 <measureFunction>\n    39be:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    39c3:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    39c8:\t48 8d 35 59 86 00 00 \tlea    0x8659(%rip),%rsi        # c028 <_IO_stdin_used+0x1028>\n    39cf:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    39d3:\te8 a8 d6 ff ff       \tcallq  1080 <__printf_chk@plt>\n    39d8:\t48 8d 35 1f 6d 00 00 \tlea    0x6d1f(%rip),%rsi        # a6fe <xorzerotest>\n    39df:\t4c 89 f7             \tmov    %r14,%rdi\n    39e2:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    39e8:\te8 53 70 00 00       \tcallq  aa40 <measureFunction>\n    39ed:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    39f2:\tb8 01 00 00 00       \tmov    $0x1,%eax\n   
 39f7:\t48 8d 35 af 7c 00 00 \tlea    0x7caf(%rip),%rsi        # b6ad <_IO_stdin_used+0x6ad>\n    39fe:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    3a02:\te8 79 d6 ff ff       \tcallq  1080 <__printf_chk@plt>\n    3a07:\t48 8d 35 25 6c 00 00 \tlea    0x6c25(%rip),%rsi        # a633 <movzerotest>\n    3a0e:\t4c 89 f7             \tmov    %r14,%rdi\n    3a11:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    3a17:\te8 24 70 00 00       \tcallq  aa40 <measureFunction>\n    3a1c:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    3a21:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    3a26:\t48 8d 35 68 7c 00 00 \tlea    0x7c68(%rip),%rsi        # b695 <_IO_stdin_used+0x695>\n    3a2d:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    3a31:\te8 4a d6 ff ff       \tcallq  1080 <__printf_chk@plt>\n    3a36:\t48 8d 35 38 6d 00 00 \tlea    0x6d38(%rip),%rsi        # a775 <subzerotest>\n    3a3d:\t4c 89 f7             \tmov    %r14,%rdi\n    3a40:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    3a46:\te8 f5 6f 00 00       \tcallq  aa40 <measureFunction>\n    3a4b:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    3a50:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    3a55:\t48 8d 35 21 7c 00 00 \tlea    0x7c21(%rip),%rsi        # b67d <_IO_stdin_used+0x67d>\n    3a5c:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    3a60:\te8 1b d6 ff ff       \tcallq  1080 <__printf_chk@plt>\n    3a65:\t48 8d 35 0b 6e 00 00 \tlea    0x6e0b(%rip),%rsi        # a877 <depinctest>\n    3a6c:\t4c 89 f7             \tmov    %r14,%rdi\n    3a6f:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    3a75:\te8 c6 6f 00 00       \tcallq  aa40 <measureFunction>\n    3a7a:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    3a7f:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    3a84:\t48 8d 35 db 7b 00 00 \tlea    0x7bdb(%rip),%rsi        # b666 <_IO_stdin_used+0x666>\n    3a8b:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    3a8f:\te8 ec d5 ff ff       \tcallq  1080 <__printf_chk@plt>\n    3a94:\t48 8d 35 53 6e 
00 00 \tlea    0x6e53(%rip),%rsi        # a8ee <depdectest>\n    3a9b:\t4c 89 f7             \tmov    %r14,%rdi\n    3a9e:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    3aa4:\te8 97 6f 00 00       \tcallq  aa40 <measureFunction>\n    3aa9:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    3aae:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    3ab3:\t48 8d 35 95 7b 00 00 \tlea    0x7b95(%rip),%rsi        # b64f <_IO_stdin_used+0x64f>\n    3aba:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    3abe:\te8 bd d5 ff ff       \tcallq  1080 <__printf_chk@plt>\n    3ac3:\t48 8d 35 22 6d 00 00 \tlea    0x6d22(%rip),%rsi        # a7ec <depaddimmtest>\n    3aca:\t4c 89 f7             \tmov    %r14,%rdi\n    3acd:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    3ad3:\te8 68 6f 00 00       \tcallq  aa40 <measureFunction>\n    3ad8:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    3add:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    3ae2:\t48 8d 35 17 85 00 00 \tlea    0x8517(%rip),%rsi        # c000 <_IO_stdin_used+0x1000>\n    3ae9:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    3aed:\te8 8e d5 ff ff       \tcallq  1080 <__printf_chk@plt>\n    3af2:\t48 8d 35 ad 31 00 00 \tlea    0x31ad(%rip),%rsi        # 6ca6 <clkmovtest>\n    3af9:\t4c 89 f7             \tmov    %r14,%rdi\n    3afc:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    3b02:\te8 39 6f 00 00       \tcallq  aa40 <measureFunction>\n    3b07:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    3b0c:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    3b11:\t48 8d 35 c0 84 00 00 \tlea    0x84c0(%rip),%rsi        # bfd8 <_IO_stdin_used+0xfd8>\n    3b18:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    3b1c:\te8 5f d5 ff ff       \tcallq  1080 <__printf_chk@plt>\n    3b21:\t48 8d 35 7c 3c 00 00 \tlea    0x3c7c(%rip),%rsi        # 77a4 <addmultest>\n    3b28:\t4c 89 f7             \tmov    %r14,%rdi\n    3b2b:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    3b31:\te8 0a 6f 00 00       \tcallq  aa40 <measureFunction>\n    
3b36:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    3b3b:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    3b40:\t48 8d 35 eb 7a 00 00 \tlea    0x7aeb(%rip),%rsi        # b632 <_IO_stdin_used+0x632>\n    3b47:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    3b4b:\te8 30 d5 ff ff       \tcallq  1080 <__printf_chk@plt>\n    3b50:\t48 8d 35 90 3b 00 00 \tlea    0x3b90(%rip),%rsi        # 76e7 <jmpmultest>\n    3b57:\t4c 89 f7             \tmov    %r14,%rdi\n    3b5a:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    3b60:\te8 db 6e 00 00       \tcallq  aa40 <measureFunction>\n    3b65:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    3b6a:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    3b6f:\t48 8d 35 a1 7a 00 00 \tlea    0x7aa1(%rip),%rsi        # b617 <_IO_stdin_used+0x617>\n    3b76:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    3b7a:\te8 01 d5 ff ff       \tcallq  1080 <__printf_chk@plt>\n    3b7f:\t48 8d 35 6b 39 00 00 \tlea    0x396b(%rip),%rsi        # 74f1 <jmptest>\n    3b86:\t4c 89 f7             \tmov    %r14,%rdi\n    3b89:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    3b8f:\te8 ac 6e 00 00       \tcallq  aa40 <measureFunction>\n    3b94:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    3b99:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    3b9e:\t48 8d 35 59 7a 00 00 \tlea    0x7a59(%rip),%rsi        # b5fe <_IO_stdin_used+0x5fe>\n    3ba5:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    3ba9:\te8 d2 d4 ff ff       \tcallq  1080 <__printf_chk@plt>\n    3bae:\t48 8d 35 14 3a 00 00 \tlea    0x3a14(%rip),%rsi        # 75c9 <ntjmptest>\n    3bb5:\t4c 89 f7             \tmov    %r14,%rdi\n    3bb8:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    3bbe:\te8 7d 6e 00 00       \tcallq  aa40 <measureFunction>\n    3bc3:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    3bc8:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    3bcd:\t48 8d 35 14 7a 00 00 \tlea    0x7a14(%rip),%rsi        # b5e8 <_IO_stdin_used+0x5e8>\n    3bd4:\tf3 0f 5a c0          \tcvtss2sd 
%xmm0,%xmm0\n    3bd8:\te8 a3 d4 ff ff       \tcallq  1080 <__printf_chk@plt>\n    3bdd:\t48 8d 35 4c 67 00 00 \tlea    0x674c(%rip),%rsi        # a330 <pdeptest>\n    3be4:\t4c 89 f7             \tmov    %r14,%rdi\n    3be7:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    3bed:\te8 4e 6e 00 00       \tcallq  aa40 <measureFunction>\n    3bf2:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    3bf7:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    3bfc:\t48 8d 35 d1 79 00 00 \tlea    0x79d1(%rip),%rsi        # b5d4 <_IO_stdin_used+0x5d4>\n    3c03:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    3c07:\te8 74 d4 ff ff       \tcallq  1080 <__printf_chk@plt>\n    3c0c:\t48 8d 35 80 68 00 00 \tlea    0x6880(%rip),%rsi        # a493 <pexttest>\n    3c13:\t4c 89 f7             \tmov    %r14,%rdi\n    3c16:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    3c1c:\te8 1f 6e 00 00       \tcallq  aa40 <measureFunction>\n    3c21:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    3c26:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    3c2b:\t48 8d 35 8e 79 00 00 \tlea    0x798e(%rip),%rsi        # b5c0 <_IO_stdin_used+0x5c0>\n    3c32:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    3c36:\te8 45 d4 ff ff       \tcallq  1080 <__printf_chk@plt>\n    3c3b:\t48 8d 35 a2 67 00 00 \tlea    0x67a2(%rip),%rsi        # a3e4 <pdepmultest>\n    3c42:\t4c 89 f7             \tmov    %r14,%rdi\n    3c45:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    3c4b:\te8 f0 6d 00 00       \tcallq  aa40 <measureFunction>\n    3c50:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    3c55:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    3c5a:\t48 8d 35 43 79 00 00 \tlea    0x7943(%rip),%rsi        # b5a4 <_IO_stdin_used+0x5a4>\n    3c61:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    3c65:\te8 16 d4 ff ff       \tcallq  1080 <__printf_chk@plt>\n    3c6a:\t48 8d 35 55 33 00 00 \tlea    0x3355(%rip),%rsi        # 6fc6 <shltest>\n    3c71:\t4c 89 f7             \tmov    %r14,%rdi\n    3c74:\tf3 0f 10 44 24 08    
\tmovss  0x8(%rsp),%xmm0\n    3c7a:\te8 c1 6d 00 00       \tcallq  aa40 <measureFunction>\n    3c7f:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    3c84:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    3c89:\t48 8d 35 fd 78 00 00 \tlea    0x78fd(%rip),%rsi        # b58d <_IO_stdin_used+0x58d>\n    3c90:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    3c94:\te8 e7 d3 ff ff       \tcallq  1080 <__printf_chk@plt>\n    3c99:\t48 8d 35 9a 32 00 00 \tlea    0x329a(%rip),%rsi        # 6f3a <rortest>\n    3ca0:\t4c 89 f7             \tmov    %r14,%rdi\n    3ca3:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    3ca9:\te8 92 6d 00 00       \tcallq  aa40 <measureFunction>\n    3cae:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    3cb3:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    3cb8:\t48 8d 35 b7 78 00 00 \tlea    0x78b7(%rip),%rsi        # b576 <_IO_stdin_used+0x576>\n    3cbf:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    3cc3:\te8 b8 d3 ff ff       \tcallq  1080 <__printf_chk@plt>\n    3cc8:\t48 8d 35 83 33 00 00 \tlea    0x3383(%rip),%rsi        # 7052 <mixrorshltest>\n    3ccf:\t4c 89 f7             \tmov    %r14,%rdi\n    3cd2:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    3cd8:\te8 63 6d 00 00       \tcallq  aa40 <measureFunction>\n    3cdd:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    3ce2:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    3ce7:\t48 8d 35 ca 82 00 00 \tlea    0x82ca(%rip),%rsi        # bfb8 <_IO_stdin_used+0xfb8>\n    3cee:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    3cf2:\te8 89 d3 ff ff       \tcallq  1080 <__printf_chk@plt>\n    3cf7:\t48 8d 35 e0 33 00 00 \tlea    0x33e0(%rip),%rsi        # 70de <mixrormultest>\n    3cfe:\t4c 89 f7             \tmov    %r14,%rdi\n    3d01:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    3d07:\te8 34 6d 00 00       \tcallq  aa40 <measureFunction>\n    3d0c:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    3d11:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    3d16:\t48 8d 35 3e 78 00 00 \tlea    
0x783e(%rip),%rsi        # b55b <_IO_stdin_used+0x55b>\n    3d1d:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    3d21:\te8 5a d3 ff ff       \tcallq  1080 <__printf_chk@plt>\n    3d26:\t48 8d 35 03 35 00 00 \tlea    0x3503(%rip),%rsi        # 7230 <btstest>\n    3d2d:\t4c 89 f7             \tmov    %r14,%rdi\n    3d30:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    3d36:\te8 05 6d 00 00       \tcallq  aa40 <measureFunction>\n    3d3b:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    3d40:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    3d45:\t48 8d 35 fc 77 00 00 \tlea    0x77fc(%rip),%rsi        # b548 <_IO_stdin_used+0x548>\n    3d4c:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    3d50:\te8 2b d3 ff ff       \tcallq  1080 <__printf_chk@plt>\n    3d55:\t48 8d 35 cd 36 00 00 \tlea    0x36cd(%rip),%rsi        # 7429 <btsmultest>\n    3d5c:\t4c 89 f7             \tmov    %r14,%rdi\n    3d5f:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    3d65:\te8 d6 6c 00 00       \tcallq  aa40 <measureFunction>\n    3d6a:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    3d6f:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    3d74:\t48 8d 35 b2 77 00 00 \tlea    0x77b2(%rip),%rsi        # b52d <_IO_stdin_used+0x52d>\n    3d7b:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    3d7f:\te8 fc d2 ff ff       \tcallq  1080 <__printf_chk@plt>\n    3d84:\t48 8d 35 08 34 00 00 \tlea    0x3408(%rip),%rsi        # 7193 <rorbtstest>\n    3d8b:\t4c 89 f7             \tmov    %r14,%rdi\n    3d8e:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    3d94:\te8 a7 6c 00 00       \tcallq  aa40 <measureFunction>\n    3d99:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    3d9e:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    3da3:\t48 8d 35 68 77 00 00 \tlea    0x7768(%rip),%rsi        # b512 <_IO_stdin_used+0x512>\n    3daa:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    3dae:\te8 cd d2 ff ff       \tcallq  1080 <__printf_chk@plt>\n    3db3:\t48 8d 35 19 35 00 00 \tlea    0x3519(%rip),%rsi        # 
72d3 <leatest>\n    3dba:\t4c 89 f7             \tmov    %r14,%rdi\n    3dbd:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    3dc3:\te8 78 6c 00 00       \tcallq  aa40 <measureFunction>\n    3dc8:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    3dcd:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    3dd2:\t48 8d 35 20 77 00 00 \tlea    0x7720(%rip),%rsi        # b4f9 <_IO_stdin_used+0x4f9>\n    3dd9:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    3ddd:\te8 9e d2 ff ff       \tcallq  1080 <__printf_chk@plt>\n    3de2:\t48 8d 35 8d 35 00 00 \tlea    0x358d(%rip),%rsi        # 7376 <leamultest>\n    3de9:\t4c 89 f7             \tmov    %r14,%rdi\n    3dec:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    3df2:\te8 49 6c 00 00       \tcallq  aa40 <measureFunction>\n    3df7:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    3dfc:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    3e01:\t48 8d 35 88 81 00 00 \tlea    0x8188(%rip),%rsi        # bf90 <_IO_stdin_used+0xf90>\n    3e08:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    3e0c:\te8 6f d2 ff ff       \tcallq  1080 <__printf_chk@plt>\n    3e11:\t48 8d 35 73 3a 00 00 \tlea    0x3a73(%rip),%rsi        # 788b <add256int>\n    3e18:\t4c 89 f7             \tmov    %r14,%rdi\n    3e1b:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    3e21:\te8 1a 6c 00 00       \tcallq  aa40 <measureFunction>\n    3e26:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    3e2b:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    3e30:\t48 8d 35 31 81 00 00 \tlea    0x8131(%rip),%rsi        # bf68 <_IO_stdin_used+0xf68>\n    3e37:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    3e3b:\te8 40 d2 ff ff       \tcallq  1080 <__printf_chk@plt>\n    3e40:\t48 8d 35 fd 3e 00 00 \tlea    0x3efd(%rip),%rsi        # 7d44 <mixadd256int>\n    3e47:\t4c 89 f7             \tmov    %r14,%rdi\n    3e4a:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    3e50:\te8 eb 6b 00 00       \tcallq  aa40 <measureFunction>\n    3e55:\tbf 01 00 00 00       \tmov    $0x1,%edi\n 
   3e5a:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    3e5f:\t48 8d 35 ca 80 00 00 \tlea    0x80ca(%rip),%rsi        # bf30 <_IO_stdin_used+0xf30>\n    3e66:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    3e6a:\te8 11 d2 ff ff       \tcallq  1080 <__printf_chk@plt>\n    3e6f:\t48 8d 35 93 3f 00 00 \tlea    0x3f93(%rip),%rsi        # 7e09 <mixadd256int11>\n    3e76:\t4c 89 f7             \tmov    %r14,%rdi\n    3e79:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    3e7f:\te8 bc 6b 00 00       \tcallq  aa40 <measureFunction>\n    3e84:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    3e89:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    3e8e:\t48 8d 35 63 80 00 00 \tlea    0x8063(%rip),%rsi        # bef8 <_IO_stdin_used+0xef8>\n    3e95:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    3e99:\te8 e2 d1 ff ff       \tcallq  1080 <__printf_chk@plt>\n    3e9e:\t48 8d 35 a6 3c 00 00 \tlea    0x3ca6(%rip),%rsi        # 7b4b <mixadd256fpint>\n    3ea5:\t4c 89 f7             \tmov    %r14,%rdi\n    3ea8:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    3eae:\te8 8d 6b 00 00       \tcallq  aa40 <measureFunction>\n    3eb3:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    3eb8:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    3ebd:\t48 8d 35 04 80 00 00 \tlea    0x8004(%rip),%rsi        # bec8 <_IO_stdin_used+0xec8>\n    3ec4:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    3ec8:\te8 b3 d1 ff ff       \tcallq  1080 <__printf_chk@plt>\n    3ecd:\t48 8d 35 cc 3d 00 00 \tlea    0x3dcc(%rip),%rsi        # 7ca0 <mix256fp>\n    3ed4:\t4c 89 f7             \tmov    %r14,%rdi\n    3ed7:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    3edd:\te8 5e 6b 00 00       \tcallq  aa40 <measureFunction>\n    3ee2:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    3ee7:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    3eec:\t48 8d 35 a5 7f 00 00 \tlea    0x7fa5(%rip),%rsi        # be98 <_IO_stdin_used+0xe98>\n    3ef3:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    3ef7:\te8 84 d1 ff ff       
\tcallq  1080 <__printf_chk@plt>\n    3efc:\t48 8d 35 ad 3f 00 00 \tlea    0x3fad(%rip),%rsi        # 7eb0 <latadd256int>\n    3f03:\t4c 89 f7             \tmov    %r14,%rdi\n    3f06:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    3f0c:\te8 2f 6b 00 00       \tcallq  aa40 <measureFunction>\n    3f11:\tf3 0f 10 6c 24 0c    \tmovss  0xc(%rsp),%xmm5\n    3f17:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    3f1c:\t48 8d 35 45 7f 00 00 \tlea    0x7f45(%rip),%rsi        # be68 <_IO_stdin_used+0xe68>\n    3f23:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    3f28:\tf3 0f 5e e8          \tdivss  %xmm0,%xmm5\n    3f2c:\t66 0f ef c0          \tpxor   %xmm0,%xmm0\n    3f30:\tf3 0f 5a c5          \tcvtss2sd %xmm5,%xmm0\n    3f34:\te8 47 d1 ff ff       \tcallq  1080 <__printf_chk@plt>\n    3f39:\t48 8d 35 5b 43 00 00 \tlea    0x435b(%rip),%rsi        # 829b <latmul256int>\n    3f40:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    3f46:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    3f4b:\te8 f0 6a 00 00       \tcallq  aa40 <measureFunction>\n    3f50:\tf3 0f 10 74 24 0c    \tmovss  0xc(%rsp),%xmm6\n    3f56:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    3f5b:\t48 8d 35 ce 7e 00 00 \tlea    0x7ece(%rip),%rsi        # be30 <_IO_stdin_used+0xe30>\n    3f62:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    3f67:\tf3 0f 5e f0          \tdivss  %xmm0,%xmm6\n    3f6b:\t66 0f ef c0          \tpxor   %xmm0,%xmm0\n    3f6f:\tf3 0f 5a c6          \tcvtss2sd %xmm6,%xmm0\n    3f73:\te8 08 d1 ff ff       \tcallq  1080 <__printf_chk@plt>\n    3f78:\t48 8d 35 ca 43 00 00 \tlea    0x43ca(%rip),%rsi        # 8349 <latadd128int>\n    3f7f:\t4c 89 f7             \tmov    %r14,%rdi\n    3f82:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    3f88:\te8 b3 6a 00 00       \tcallq  aa40 <measureFunction>\n    3f8d:\tf3 0f 10 7c 24 0c    \tmovss  0xc(%rsp),%xmm7\n    3f93:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    3f98:\t48 8d 35 61 7e 00 00 \tlea    0x7e61(%rip),%rsi        # be00 
<_IO_stdin_used+0xe00>\n    3f9f:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    3fa4:\tf3 0f 5e f8          \tdivss  %xmm0,%xmm7\n    3fa8:\t66 0f ef c0          \tpxor   %xmm0,%xmm0\n    3fac:\tf3 0f 5a c7          \tcvtss2sd %xmm7,%xmm0\n    3fb0:\te8 cb d0 ff ff       \tcallq  1080 <__printf_chk@plt>\n    3fb5:\t48 8d 35 e0 48 00 00 \tlea    0x48e0(%rip),%rsi        # 889c <latmul128int>\n    3fbc:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    3fc2:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    3fc7:\te8 74 6a 00 00       \tcallq  aa40 <measureFunction>\n    3fcc:\tf3 0f 10 6c 24 0c    \tmovss  0xc(%rsp),%xmm5\n    3fd2:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    3fd7:\t48 8d 35 ea 7d 00 00 \tlea    0x7dea(%rip),%rsi        # bdc8 <_IO_stdin_used+0xdc8>\n    3fde:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    3fe3:\tf3 0f 5e e8          \tdivss  %xmm0,%xmm5\n    3fe7:\t66 0f ef c0          \tpxor   %xmm0,%xmm0\n    3feb:\tf3 0f 5a c5          \tcvtss2sd %xmm5,%xmm0\n    3fef:\te8 8c d0 ff ff       \tcallq  1080 <__printf_chk@plt>\n    3ff4:\t48 8d 35 c6 49 00 00 \tlea    0x49c6(%rip),%rsi        # 89c1 <latadd256fp>\n    3ffb:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    4001:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    4006:\te8 35 6a 00 00       \tcallq  aa40 <measureFunction>\n    400b:\tf3 0f 10 7c 24 0c    \tmovss  0xc(%rsp),%xmm7\n    4011:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    4016:\t48 8d 35 83 7d 00 00 \tlea    0x7d83(%rip),%rsi        # bda0 <_IO_stdin_used+0xda0>\n    401d:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    4022:\tf3 0f 5e f8          \tdivss  %xmm0,%xmm7\n    4026:\t66 0f ef c0          \tpxor   %xmm0,%xmm0\n    402a:\tf3 0f 5a c7          \tcvtss2sd %xmm7,%xmm0\n    402e:\te8 4d d0 ff ff       \tcallq  1080 <__printf_chk@plt>\n    4033:\t48 8d 35 41 4b 00 00 \tlea    0x4b41(%rip),%rsi        # 8b7b <latmul256fp>\n    403a:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    4040:\tbf 00 2f 68 59    
   \tmov    $0x59682f00,%edi\n    4045:\te8 f6 69 00 00       \tcallq  aa40 <measureFunction>\n    404a:\tf3 0f 10 74 24 0c    \tmovss  0xc(%rsp),%xmm6\n    4050:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    4055:\t48 8d 35 1c 7d 00 00 \tlea    0x7d1c(%rip),%rsi        # bd78 <_IO_stdin_used+0xd78>\n    405c:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    4061:\tf3 0f 5e f0          \tdivss  %xmm0,%xmm6\n    4065:\t66 0f ef c0          \tpxor   %xmm0,%xmm0\n    4069:\tf3 0f 5a c6          \tcvtss2sd %xmm6,%xmm0\n    406d:\te8 0e d0 ff ff       \tcallq  1080 <__printf_chk@plt>\n    4072:\t48 8d 35 13 58 00 00 \tlea    0x5813(%rip),%rsi        # 988c <latadd128fp>\n    4079:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    407f:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    4084:\te8 b7 69 00 00       \tcallq  aa40 <measureFunction>\n    4089:\tf3 0f 10 6c 24 0c    \tmovss  0xc(%rsp),%xmm5\n    408f:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    4094:\t48 8d 35 b5 7c 00 00 \tlea    0x7cb5(%rip),%rsi        # bd50 <_IO_stdin_used+0xd50>\n    409b:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    40a0:\tf3 0f 5e e8          \tdivss  %xmm0,%xmm5\n    40a4:\t66 0f ef c0          \tpxor   %xmm0,%xmm0\n    40a8:\tf3 0f 5a c5          \tcvtss2sd %xmm5,%xmm0\n    40ac:\te8 cf cf ff ff       \tcallq  1080 <__printf_chk@plt>\n    40b1:\t48 8d 35 34 58 00 00 \tlea    0x5834(%rip),%rsi        # 98ec <latmul128fp>\n    40b8:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    40be:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    40c3:\te8 78 69 00 00       \tcallq  aa40 <measureFunction>\n    40c8:\tf3 0f 10 74 24 0c    \tmovss  0xc(%rsp),%xmm6\n    40ce:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    40d3:\t48 8d 35 4e 7c 00 00 \tlea    0x7c4e(%rip),%rsi        # bd28 <_IO_stdin_used+0xd28>\n    40da:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    40df:\tf3 0f 5e f0          \tdivss  %xmm0,%xmm6\n    40e3:\t66 0f ef c0          \tpxor   %xmm0,%xmm0\n    40e7:\tf3 0f 5a 
c6          \tcvtss2sd %xmm6,%xmm0\n    40eb:\te8 90 cf ff ff       \tcallq  1080 <__printf_chk@plt>\n    40f0:\t48 8d 35 c4 58 00 00 \tlea    0x58c4(%rip),%rsi        # 99bb <add128fp>\n    40f7:\t4c 89 f7             \tmov    %r14,%rdi\n    40fa:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    4100:\te8 3b 69 00 00       \tcallq  aa40 <measureFunction>\n    4105:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    410a:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    410f:\t48 8d 35 f2 7b 00 00 \tlea    0x7bf2(%rip),%rsi        # bd08 <_IO_stdin_used+0xd08>\n    4116:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    411a:\te8 61 cf ff ff       \tcallq  1080 <__printf_chk@plt>\n    411f:\t48 8d 35 26 58 00 00 \tlea    0x5826(%rip),%rsi        # 994c <mul128fp>\n    4126:\t4c 89 f7             \tmov    %r14,%rdi\n    4129:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    412f:\te8 0c 69 00 00       \tcallq  aa40 <measureFunction>\n    4134:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    4139:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    413e:\t48 8d 35 a3 7b 00 00 \tlea    0x7ba3(%rip),%rsi        # bce8 <_IO_stdin_used+0xce8>\n    4145:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    4149:\te8 32 cf ff ff       \tcallq  1080 <__printf_chk@plt>\n    414e:\t48 8d 35 5f 42 00 00 \tlea    0x425f(%rip),%rsi        # 83b4 <add128int>\n    4155:\t4c 89 f7             \tmov    %r14,%rdi\n    4158:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    415e:\te8 dd 68 00 00       \tcallq  aa40 <measureFunction>\n    4163:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    4168:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    416d:\t48 8d 35 4c 7b 00 00 \tlea    0x7b4c(%rip),%rsi        # bcc0 <_IO_stdin_used+0xcc0>\n    4174:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    4178:\te8 03 cf ff ff       \tcallq  1080 <__printf_chk@plt>\n    417d:\t48 8d 35 99 46 00 00 \tlea    0x4699(%rip),%rsi        # 881d <mul128int>\n    4184:\t4c 89 f7             \tmov    %r14,%rdi\n    
4187:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    418d:\te8 ae 68 00 00       \tcallq  aa40 <measureFunction>\n    4192:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    4197:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    419c:\t48 8d 35 f5 7a 00 00 \tlea    0x7af5(%rip),%rsi        # bc98 <_IO_stdin_used+0xc98>\n    41a3:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    41a7:\te8 d4 ce ff ff       \tcallq  1080 <__printf_chk@plt>\n    41ac:\t48 8d 35 fc 4b 00 00 \tlea    0x4bfc(%rip),%rsi        # 8daf <fma256>\n    41b3:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    41b9:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    41be:\te8 7d 68 00 00       \tcallq  aa40 <measureFunction>\n    41c3:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    41c8:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    41cd:\t48 8d 35 0a 73 00 00 \tlea    0x730a(%rip),%rsi        # b4de <_IO_stdin_used+0x4de>\n    41d4:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    41d8:\te8 a3 ce ff ff       \tcallq  1080 <__printf_chk@plt>\n    41dd:\t48 8d 35 83 4c 00 00 \tlea    0x4c83(%rip),%rsi        # 8e67 <fma128>\n    41e4:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    41ea:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    41ef:\te8 4c 68 00 00       \tcallq  aa40 <measureFunction>\n    41f4:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    41f9:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    41fe:\t48 8d 35 be 72 00 00 \tlea    0x72be(%rip),%rsi        # b4c3 <_IO_stdin_used+0x4c3>\n    4205:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    4209:\te8 72 ce ff ff       \tcallq  1080 <__printf_chk@plt>\n    420e:\t48 8d 35 04 55 00 00 \tlea    0x5504(%rip),%rsi        # 9719 <latfma256>\n    4215:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    421b:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    4220:\te8 1b 68 00 00       \tcallq  aa40 <measureFunction>\n    4225:\tf3 0f 10 7c 24 0c    \tmovss  0xc(%rsp),%xmm7\n    422b:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    
4230:\t48 8d 35 39 7a 00 00 \tlea    0x7a39(%rip),%rsi        # bc70 <_IO_stdin_used+0xc70>\n    4237:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    423c:\tf3 0f 5e f8          \tdivss  %xmm0,%xmm7\n    4240:\t66 0f ef c0          \tpxor   %xmm0,%xmm0\n    4244:\tf3 0f 5a c7          \tcvtss2sd %xmm7,%xmm0\n    4248:\te8 33 ce ff ff       \tcallq  1080 <__printf_chk@plt>\n    424d:\t48 8d 35 7d 55 00 00 \tlea    0x557d(%rip),%rsi        # 97d1 <latfma128>\n    4254:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    425a:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    425f:\te8 dc 67 00 00       \tcallq  aa40 <measureFunction>\n    4264:\tf3 0f 10 6c 24 0c    \tmovss  0xc(%rsp),%xmm5\n    426a:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    426f:\t48 8d 35 d2 79 00 00 \tlea    0x79d2(%rip),%rsi        # bc48 <_IO_stdin_used+0xc48>\n    4276:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    427b:\tf3 0f 5e e8          \tdivss  %xmm0,%xmm5\n    427f:\t66 0f ef c0          \tpxor   %xmm0,%xmm0\n    4283:\tf3 0f 5a c5          \tcvtss2sd %xmm5,%xmm0\n    4287:\te8 f4 cd ff ff       \tcallq  1080 <__printf_chk@plt>\n    428c:\t48 8d 35 49 48 00 00 \tlea    0x4849(%rip),%rsi        # 8adc <add256fp>\n    4293:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    4299:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    429e:\te8 9d 67 00 00       \tcallq  aa40 <measureFunction>\n    42a3:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    42a8:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    42ad:\t48 8d 35 f3 71 00 00 \tlea    0x71f3(%rip),%rsi        # b4a7 <_IO_stdin_used+0x4a7>\n    42b4:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    42b8:\te8 c3 cd ff ff       \tcallq  1080 <__printf_chk@plt>\n    42bd:\t48 8d 35 79 47 00 00 \tlea    0x4779(%rip),%rsi        # 8a3d <mul256fp>\n    42c4:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    42ca:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    42cf:\te8 6c 67 00 00       \tcallq  aa40 <measureFunction>\n    
42d4:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    42d9:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    42de:\t48 8d 35 a6 71 00 00 \tlea    0x71a6(%rip),%rsi        # b48b <_IO_stdin_used+0x48b>\n    42e5:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    42e9:\te8 92 cd ff ff       \tcallq  1080 <__printf_chk@plt>\n    42ee:\t48 8d 35 2d 4c 00 00 \tlea    0x4c2d(%rip),%rsi        # 8f22 <mixfmafadd256>\n    42f5:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    42fb:\tbf 05 7a d7 03       \tmov    $0x3d77a05,%edi\n    4300:\t48 c1 e7 09          \tshl    $0x9,%rdi\n    4304:\te8 37 67 00 00       \tcallq  aa40 <measureFunction>\n    4309:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    430e:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    4313:\t48 8d 35 06 79 00 00 \tlea    0x7906(%rip),%rsi        # bc20 <_IO_stdin_used+0xc20>\n    431a:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    431e:\te8 5d cd ff ff       \tcallq  1080 <__printf_chk@plt>\n    4323:\t48 8d 35 a7 4e 00 00 \tlea    0x4ea7(%rip),%rsi        # 91d1 <mixfmaadd256>\n    432a:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    4330:\tbf 05 7a d7 03       \tmov    $0x3d77a05,%edi\n    4335:\t48 c1 e7 09          \tshl    $0x9,%rdi\n    4339:\te8 02 67 00 00       \tcallq  aa40 <measureFunction>\n    433e:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    4343:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    4348:\t48 8d 35 a9 78 00 00 \tlea    0x78a9(%rip),%rsi        # bbf8 <_IO_stdin_used+0xbf8>\n    434f:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    4353:\te8 28 cd ff ff       \tcallq  1080 <__printf_chk@plt>\n    4358:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    435e:\tbf 05 7a d7 03       \tmov    $0x3d77a05,%edi\n    4363:\t48 c1 e7 09          \tshl    $0x9,%rdi\n    4367:\t48 8d 35 c2 66 00 00 \tlea    0x66c2(%rip),%rsi        # aa30 <mixfmaaddmem256wrapper>\n    436e:\te8 cd 66 00 00       \tcallq  aa40 <measureFunction>\n    4373:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    
4378:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    437d:\t48 8d 35 44 78 00 00 \tlea    0x7844(%rip),%rsi        # bbc8 <_IO_stdin_used+0xbc8>\n    4384:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    4388:\te8 f3 cc ff ff       \tcallq  1080 <__printf_chk@plt>\n    438d:\t48 8d 35 ee 4e 00 00 \tlea    0x4eee(%rip),%rsi        # 9282 <mixfmaand256>\n    4394:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    439a:\tbf 05 7a d7 03       \tmov    $0x3d77a05,%edi\n    439f:\t48 c1 e7 09          \tshl    $0x9,%rdi\n    43a3:\te8 98 66 00 00       \tcallq  aa40 <measureFunction>\n    43a8:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    43ad:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    43b2:\t48 8d 35 e7 77 00 00 \tlea    0x77e7(%rip),%rsi        # bba0 <_IO_stdin_used+0xba0>\n    43b9:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    43bd:\te8 be cc ff ff       \tcallq  1080 <__printf_chk@plt>\n    43c2:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    43c8:\tbf 05 7a d7 03       \tmov    $0x3d77a05,%edi\n    43cd:\t48 c1 e7 09          \tshl    $0x9,%rdi\n    43d1:\t48 8d 35 48 66 00 00 \tlea    0x6648(%rip),%rsi        # aa20 <mixfmaandmem256wrapper>\n    43d8:\te8 63 66 00 00       \tcallq  aa40 <measureFunction>\n    43dd:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    43e2:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    43e7:\t48 8d 35 5a 7c 00 00 \tlea    0x7c5a(%rip),%rsi        # c048 <_IO_stdin_used+0x1048>\n    43ee:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    43f2:\te8 89 cc ff ff       \tcallq  1080 <__printf_chk@plt>\n    43f7:\t48 8d 35 9a 51 00 00 \tlea    0x519a(%rip),%rsi        # 9598 <nemesfpumix21>\n    43fe:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    4404:\tbf 05 7a d7 03       \tmov    $0x3d77a05,%edi\n    4409:\t48 c1 e7 09          \tshl    $0x9,%rdi\n    440d:\te8 2e 66 00 00       \tcallq  aa40 <measureFunction>\n    4412:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    4417:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    
441c:\t48 8d 35 4d 77 00 00 \tlea    0x774d(%rip),%rsi        # bb70 <_IO_stdin_used+0xb70>\n    4423:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    4427:\te8 54 cc ff ff       \tcallq  1080 <__printf_chk@plt>\n    442c:\t48 8d 35 c1 37 00 00 \tlea    0x37c1(%rip),%rsi        # 7bf4 <mix256faddintadd>\n    4433:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    4439:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    443e:\te8 fd 65 00 00       \tcallq  aa40 <measureFunction>\n    4443:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    4448:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    444d:\t48 8d 35 f4 76 00 00 \tlea    0x76f4(%rip),%rsi        # bb48 <_IO_stdin_used+0xb48>\n    4454:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    4458:\te8 23 cc ff ff       \tcallq  1080 <__printf_chk@plt>\n    445d:\t48 8d 35 66 56 00 00 \tlea    0x5666(%rip),%rsi        # 9aca <latmul16>\n    4464:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    446a:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    446f:\te8 cc 65 00 00       \tcallq  aa40 <measureFunction>\n    4474:\tf3 0f 10 7c 24 0c    \tmovss  0xc(%rsp),%xmm7\n    447a:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    447f:\t48 8d 35 9a 76 00 00 \tlea    0x769a(%rip),%rsi        # bb20 <_IO_stdin_used+0xb20>\n    4486:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    448b:\tf3 0f 5e f8          \tdivss  %xmm0,%xmm7\n    448f:\t66 0f ef c0          \tpxor   %xmm0,%xmm0\n    4493:\tf3 0f 5a c7          \tcvtss2sd %xmm7,%xmm0\n    4497:\te8 e4 cb ff ff       \tcallq  1080 <__printf_chk@plt>\n    449c:\t48 8d 35 87 55 00 00 \tlea    0x5587(%rip),%rsi        # 9a2a <latmul64>\n    44a3:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    44a9:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    44ae:\te8 8d 65 00 00       \tcallq  aa40 <measureFunction>\n    44b3:\tf3 0f 10 6c 24 0c    \tmovss  0xc(%rsp),%xmm5\n    44b9:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    44be:\t48 8d 35 33 76 00 00 \tlea    
0x7633(%rip),%rsi        # baf8 <_IO_stdin_used+0xaf8>\n    44c5:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    44ca:\tf3 0f 5e e8          \tdivss  %xmm0,%xmm5\n    44ce:\t66 0f ef c0          \tpxor   %xmm0,%xmm0\n    44d2:\tf3 0f 5a c5          \tcvtss2sd %xmm5,%xmm0\n    44d6:\te8 a5 cb ff ff       \tcallq  1080 <__printf_chk@plt>\n    44db:\t48 8d 35 9c 56 00 00 \tlea    0x569c(%rip),%rsi        # 9b7e <mul16>\n    44e2:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    44e8:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    44ed:\te8 4e 65 00 00       \tcallq  aa40 <measureFunction>\n    44f2:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    44f7:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    44fc:\t48 8d 35 6d 6f 00 00 \tlea    0x6f6d(%rip),%rsi        # b470 <_IO_stdin_used+0x470>\n    4503:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    4507:\te8 74 cb ff ff       \tcallq  1080 <__printf_chk@plt>\n    450c:\t48 8d 35 1f 57 00 00 \tlea    0x571f(%rip),%rsi        # 9c32 <mul64>\n    4513:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    4519:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    451e:\te8 1d 65 00 00       \tcallq  aa40 <measureFunction>\n    4523:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    4528:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    452d:\t48 8d 35 21 6f 00 00 \tlea    0x6f21(%rip),%rsi        # b455 <_IO_stdin_used+0x455>\n    4534:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    4538:\te8 43 cb ff ff       \tcallq  1080 <__printf_chk@plt>\n    453d:\t48 8d 35 d0 57 00 00 \tlea    0x57d0(%rip),%rsi        # 9d14 <mixmul16mul64>\n    4544:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    454a:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    454f:\te8 ec 64 00 00       \tcallq  aa40 <measureFunction>\n    4554:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    4559:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    455e:\t48 8d 35 63 75 00 00 \tlea    0x7563(%rip),%rsi        # bac8 <_IO_stdin_used+0xac8>\n    4565:\tf3 
0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    4569:\te8 12 cb ff ff       \tcallq  1080 <__printf_chk@plt>\n    456e:\t48 8d 35 4b 58 00 00 \tlea    0x584b(%rip),%rsi        # 9dc0 <mixmul16mul64_21>\n    4575:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    457b:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    4580:\te8 bb 64 00 00       \tcallq  aa40 <measureFunction>\n    4585:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    458a:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    458f:\t48 8d 35 02 75 00 00 \tlea    0x7502(%rip),%rsi        # ba98 <_IO_stdin_used+0xa98>\n    4596:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    459a:\te8 e1 ca ff ff       \tcallq  1080 <__printf_chk@plt>\n    459f:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    45a5:\t48 8d 35 c4 63 00 00 \tlea    0x63c4(%rip),%rsi        # a970 <load128wrapper>\n    45ac:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    45b1:\te8 8a 64 00 00       \tcallq  aa40 <measureFunction>\n    45b6:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    45bb:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    45c0:\t48 8d 35 71 6e 00 00 \tlea    0x6e71(%rip),%rsi        # b438 <_IO_stdin_used+0x438>\n    45c7:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    45cb:\te8 b0 ca ff ff       \tcallq  1080 <__printf_chk@plt>\n    45d0:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    45d6:\t48 8d 35 a3 63 00 00 \tlea    0x63a3(%rip),%rsi        # a980 <spacedload128wrapper>\n    45dd:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    45e2:\te8 59 64 00 00       \tcallq  aa40 <measureFunction>\n    45e7:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    45ec:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    45f1:\t48 8d 35 78 74 00 00 \tlea    0x7478(%rip),%rsi        # ba70 <_IO_stdin_used+0xa70>\n    45f8:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    45fc:\te8 7f ca ff ff       \tcallq  1080 <__printf_chk@plt>\n    4601:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    4607:\t48 8d 35 92 63 00 00 \tlea 
   0x6392(%rip),%rsi        # a9a0 <load256wrapper>\n    460e:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    4613:\te8 28 64 00 00       \tcallq  aa40 <measureFunction>\n    4618:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    461d:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    4622:\t48 8d 35 f2 6d 00 00 \tlea    0x6df2(%rip),%rsi        # b41b <_IO_stdin_used+0x41b>\n    4629:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    462d:\te8 4e ca ff ff       \tcallq  1080 <__printf_chk@plt>\n    4632:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    4638:\t48 8d 35 51 63 00 00 \tlea    0x6351(%rip),%rsi        # a990 <spacedstorescalarwrapper>\n    463f:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    4644:\te8 f7 63 00 00       \tcallq  aa40 <measureFunction>\n    4649:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    464e:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    4653:\t48 8d 35 ee 73 00 00 \tlea    0x73ee(%rip),%rsi        # ba48 <_IO_stdin_used+0xa48>\n    465a:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    465e:\te8 1d ca ff ff       \tcallq  1080 <__printf_chk@plt>\n    4663:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    4669:\t48 8d 35 50 63 00 00 \tlea    0x6350(%rip),%rsi        # a9c0 <store128wrapper>\n    4670:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    4675:\te8 c6 63 00 00       \tcallq  aa40 <measureFunction>\n    467a:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    467f:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    4684:\t48 8d 35 72 6d 00 00 \tlea    0x6d72(%rip),%rsi        # b3fd <_IO_stdin_used+0x3fd>\n    468b:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    468f:\te8 ec c9 ff ff       \tcallq  1080 <__printf_chk@plt>\n    4694:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    469a:\t48 8d 35 3f 63 00 00 \tlea    0x633f(%rip),%rsi        # a9e0 <store256wrapper>\n    46a1:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    46a6:\te8 95 63 00 00       \tcallq  aa40 <measureFunction>\n    46ab:\tbf 01 00 00 
00       \tmov    $0x1,%edi\n    46b0:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    46b5:\t48 8d 35 23 6d 00 00 \tlea    0x6d23(%rip),%rsi        # b3df <_IO_stdin_used+0x3df>\n    46bc:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    46c0:\te8 bb c9 ff ff       \tcallq  1080 <__printf_chk@plt>\n    46c5:\te9 4b e2 ff ff       \tjmpq   2915 <main+0x1855>\n    46ca:\t48 8b 0d 0f 9a 00 00 \tmov    0x9a0f(%rip),%rcx        # e0e0 <stderr@@GLIBC_2.2.5>\n    46d1:\tba 0f 00 00 00       \tmov    $0xf,%edx\n    46d6:\tbe 01 00 00 00       \tmov    $0x1,%esi\n    46db:\t48 8d 3d 59 69 00 00 \tlea    0x6959(%rip),%rdi        # b03b <_IO_stdin_used+0x3b>\n    46e2:\te8 a9 c9 ff ff       \tcallq  1090 <fwrite@plt>\n    46e7:\te9 91 ca ff ff       \tjmpq   117d <main+0xbd>\n    46ec:\t48 8b 0d ed 99 00 00 \tmov    0x99ed(%rip),%rcx        # e0e0 <stderr@@GLIBC_2.2.5>\n    46f3:\tba 0f 00 00 00       \tmov    $0xf,%edx\n    46f8:\tbe 01 00 00 00       \tmov    $0x1,%esi\n    46fd:\t48 8d 3d 27 69 00 00 \tlea    0x6927(%rip),%rdi        # b02b <_IO_stdin_used+0x2b>\n    4704:\te8 87 c9 ff ff       \tcallq  1090 <fwrite@plt>\n    4709:\te9 64 ca ff ff       \tjmpq   1172 <main+0xb2>\n    470e:\t48 8b 0d cb 99 00 00 \tmov    0x99cb(%rip),%rcx        # e0e0 <stderr@@GLIBC_2.2.5>\n    4715:\tba 0e 00 00 00       \tmov    $0xe,%edx\n    471a:\tbe 01 00 00 00       \tmov    $0x1,%esi\n    471f:\t48 8d 3d f6 68 00 00 \tlea    0x68f6(%rip),%rdi        # b01c <_IO_stdin_used+0x1c>\n    4726:\te8 65 c9 ff ff       \tcallq  1090 <fwrite@plt>\n    472b:\te9 37 ca ff ff       \tjmpq   1167 <main+0xa7>\n    4730:\t48 8b 7d 10          \tmov    0x10(%rbp),%rdi\n    4734:\t31 f6                \txor    %esi,%esi\n    4736:\tba 0a 00 00 00       \tmov    $0xa,%edx\n    473b:\te8 30 c9 ff ff       \tcallq  1070 <strtol@plt>\n    4740:\t48 8d 35 bd 68 00 00 \tlea    0x68bd(%rip),%rsi        # b004 <_IO_stdin_used+0x4>\n    4747:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    474c:\t4c 69 f0 00 
2f 68 59 \timul   $0x59682f00,%rax,%r14\n    4753:\t31 c0                \txor    %eax,%eax\n    4755:\t4c 89 f2             \tmov    %r14,%rdx\n    4758:\te8 23 c9 ff ff       \tcallq  1080 <__printf_chk@plt>\n    475d:\te9 f3 c9 ff ff       \tjmpq   1155 <main+0x95>\n    4762:\t48 8d 35 cf 4e 00 00 \tlea    0x4ecf(%rip),%rsi        # 9638 <latfma512>\n    4769:\tf3 0f 10 05 17 79 00 \tmovss  0x7917(%rip),%xmm0        # c088 <_IO_stdin_used+0x1088>\n    4770:\t00 \n    4771:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    4776:\te8 c5 62 00 00       \tcallq  aa40 <measureFunction>\n    477b:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    4780:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    4785:\tf3 0f 10 2d fb 78 00 \tmovss  0x78fb(%rip),%xmm5        # c088 <_IO_stdin_used+0x1088>\n    478c:\t00 \n    478d:\t48 8d 35 0c 70 00 00 \tlea    0x700c(%rip),%rsi        # b7a0 <_IO_stdin_used+0x7a0>\n    4794:\tf3 0f 5e e8          \tdivss  %xmm0,%xmm5\n    4798:\t66 0f ef c0          \tpxor   %xmm0,%xmm0\n    479c:\tf3 0f 5a c5          \tcvtss2sd %xmm5,%xmm0\n    47a0:\te8 db c8 ff ff       \tcallq  1080 <__printf_chk@plt>\n    47a5:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    47a9:\t0f 8e 97 e1 ff ff    \tjle    2946 <main+0x1886>\n    47af:\t4c 8b 6d 08          \tmov    0x8(%rbp),%r13\n    47b3:\tba 0f 00 00 00       \tmov    $0xf,%edx\n    47b8:\t48 8d 35 48 69 00 00 \tlea    0x6948(%rip),%rsi        # b107 <_IO_stdin_used+0x107>\n    47bf:\t4c 89 ef             \tmov    %r13,%rdi\n    47c2:\te8 69 c8 ff ff       \tcallq  1030 <strncmp@plt>\n    47c7:\t85 c0                \ttest   %eax,%eax\n    47c9:\t0f 85 4d 1c 00 00    \tjne    641c <main+0x535c>\n    47cf:\t48 8d 35 02 45 00 00 \tlea    0x4502(%rip),%rsi        # 8cd8 <mixfma256fma512>\n    47d6:\tf3 0f 10 05 aa 78 00 \tmovss  0x78aa(%rip),%xmm0        # c088 <_IO_stdin_used+0x1088>\n    47dd:\t00 \n    47de:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    47e3:\te8 58 62 00 00       \tcallq  aa40 
<measureFunction>\n    47e8:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    47ed:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    47f2:\t48 8d 35 cf 6f 00 00 \tlea    0x6fcf(%rip),%rsi        # b7c8 <_IO_stdin_used+0x7c8>\n    47f9:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    47fd:\te8 7e c8 ff ff       \tcallq  1080 <__printf_chk@plt>\n    4802:\te9 51 ea ff ff       \tjmpq   3258 <main+0x2198>\n    4807:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    480d:\t48 8d 35 7c 61 00 00 \tlea    0x617c(%rip),%rsi        # a990 <spacedstorescalarwrapper>\n    4814:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    4819:\te8 22 62 00 00       \tcallq  aa40 <measureFunction>\n    481e:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    4823:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    4828:\t48 8d 35 19 72 00 00 \tlea    0x7219(%rip),%rsi        # ba48 <_IO_stdin_used+0xa48>\n    482f:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    4833:\te8 48 c8 ff ff       \tcallq  1080 <__printf_chk@plt>\n    4838:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    483c:\t0f 8f 17 e0 ff ff    \tjg     2859 <main+0x1799>\n    4842:\te9 ff e0 ff ff       \tjmpq   2946 <main+0x1886>\n    4847:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    484d:\t48 8d 35 4c 61 00 00 \tlea    0x614c(%rip),%rsi        # a9a0 <load256wrapper>\n    4854:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    4859:\te8 e2 61 00 00       \tcallq  aa40 <measureFunction>\n    485e:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    4863:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    4868:\t48 8d 35 ac 6b 00 00 \tlea    0x6bac(%rip),%rsi        # b41b <_IO_stdin_used+0x41b>\n    486f:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    4873:\te8 08 c8 ff ff       \tcallq  1080 <__printf_chk@plt>\n    4878:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    487c:\t0f 8f 86 df ff ff    \tjg     2808 <main+0x1748>\n    4882:\te9 bf e0 ff ff       \tjmpq   2946 <main+0x1886>\n    4887:\tf3 0f 10 44 24 08    \tmovss  
0x8(%rsp),%xmm0\n    488d:\t48 8d 35 ec 60 00 00 \tlea    0x60ec(%rip),%rsi        # a980 <spacedload128wrapper>\n    4894:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    4899:\te8 a2 61 00 00       \tcallq  aa40 <measureFunction>\n    489e:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    48a3:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    48a8:\t48 8d 35 c1 71 00 00 \tlea    0x71c1(%rip),%rsi        # ba70 <_IO_stdin_used+0xa70>\n    48af:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    48b3:\te8 c8 c7 ff ff       \tcallq  1080 <__printf_chk@plt>\n    48b8:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    48bc:\t0f 8f f5 de ff ff    \tjg     27b7 <main+0x16f7>\n    48c2:\te9 7f e0 ff ff       \tjmpq   2946 <main+0x1886>\n    48c7:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    48cd:\t48 8d 35 9c 60 00 00 \tlea    0x609c(%rip),%rsi        # a970 <load128wrapper>\n    48d4:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    48d9:\te8 62 61 00 00       \tcallq  aa40 <measureFunction>\n    48de:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    48e3:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    48e8:\t48 8d 35 49 6b 00 00 \tlea    0x6b49(%rip),%rsi        # b438 <_IO_stdin_used+0x438>\n    48ef:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    48f3:\te8 88 c7 ff ff       \tcallq  1080 <__printf_chk@plt>\n    48f8:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    48fc:\t0f 8f 64 de ff ff    \tjg     2766 <main+0x16a6>\n    4902:\te9 3f e0 ff ff       \tjmpq   2946 <main+0x1886>\n    4907:\t48 8d 35 b2 54 00 00 \tlea    0x54b2(%rip),%rsi        # 9dc0 <mixmul16mul64_21>\n    490e:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    4914:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    4919:\te8 22 61 00 00       \tcallq  aa40 <measureFunction>\n    491e:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    4923:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    4928:\t48 8d 35 69 71 00 00 \tlea    0x7169(%rip),%rsi        # ba98 <_IO_stdin_used+0xa98>\n    492f:\tf3 0f 
5a c0          \tcvtss2sd %xmm0,%xmm0\n    4933:\te8 48 c7 ff ff       \tcallq  1080 <__printf_chk@plt>\n    4938:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    493c:\t0f 8f d3 dd ff ff    \tjg     2715 <main+0x1655>\n    4942:\te9 ff df ff ff       \tjmpq   2946 <main+0x1886>\n    4947:\t48 8d 35 c6 53 00 00 \tlea    0x53c6(%rip),%rsi        # 9d14 <mixmul16mul64>\n    494e:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    4954:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    4959:\te8 e2 60 00 00       \tcallq  aa40 <measureFunction>\n    495e:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    4963:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    4968:\t48 8d 35 59 71 00 00 \tlea    0x7159(%rip),%rsi        # bac8 <_IO_stdin_used+0xac8>\n    496f:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    4973:\te8 08 c7 ff ff       \tcallq  1080 <__printf_chk@plt>\n    4978:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    497c:\t0f 8f 42 dd ff ff    \tjg     26c4 <main+0x1604>\n    4982:\te9 bf df ff ff       \tjmpq   2946 <main+0x1886>\n    4987:\t48 8d 35 a4 52 00 00 \tlea    0x52a4(%rip),%rsi        # 9c32 <mul64>\n    498e:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    4994:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    4999:\te8 a2 60 00 00       \tcallq  aa40 <measureFunction>\n    499e:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    49a3:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    49a8:\t48 8d 35 a6 6a 00 00 \tlea    0x6aa6(%rip),%rsi        # b455 <_IO_stdin_used+0x455>\n    49af:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    49b3:\te8 c8 c6 ff ff       \tcallq  1080 <__printf_chk@plt>\n    49b8:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    49bc:\t0f 8f b1 dc ff ff    \tjg     2673 <main+0x15b3>\n    49c2:\te9 7f df ff ff       \tjmpq   2946 <main+0x1886>\n    49c7:\t48 8d 35 b0 51 00 00 \tlea    0x51b0(%rip),%rsi        # 9b7e <mul16>\n    49ce:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    49d4:\tbf 00 2f 68 59       \tmov    
$0x59682f00,%edi\n    49d9:\te8 62 60 00 00       \tcallq  aa40 <measureFunction>\n    49de:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    49e3:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    49e8:\t48 8d 35 81 6a 00 00 \tlea    0x6a81(%rip),%rsi        # b470 <_IO_stdin_used+0x470>\n    49ef:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    49f3:\te8 88 c6 ff ff       \tcallq  1080 <__printf_chk@plt>\n    49f8:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    49fc:\t0f 8f 20 dc ff ff    \tjg     2622 <main+0x1562>\n    4a02:\te9 3f df ff ff       \tjmpq   2946 <main+0x1886>\n    4a07:\t48 8d 35 1c 50 00 00 \tlea    0x501c(%rip),%rsi        # 9a2a <latmul64>\n    4a0e:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    4a14:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    4a19:\te8 22 60 00 00       \tcallq  aa40 <measureFunction>\n    4a1e:\tf3 0f 10 6c 24 0c    \tmovss  0xc(%rsp),%xmm5\n    4a24:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    4a29:\t48 8d 35 c8 70 00 00 \tlea    0x70c8(%rip),%rsi        # baf8 <_IO_stdin_used+0xaf8>\n    4a30:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    4a35:\tf3 0f 5e e8          \tdivss  %xmm0,%xmm5\n    4a39:\t66 0f ef c0          \tpxor   %xmm0,%xmm0\n    4a3d:\tf3 0f 5a c5          \tcvtss2sd %xmm5,%xmm0\n    4a41:\te8 3a c6 ff ff       \tcallq  1080 <__printf_chk@plt>\n    4a46:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    4a4a:\t0f 8f 81 db ff ff    \tjg     25d1 <main+0x1511>\n    4a50:\te9 f1 de ff ff       \tjmpq   2946 <main+0x1886>\n    4a55:\t48 8d 35 6e 50 00 00 \tlea    0x506e(%rip),%rsi        # 9aca <latmul16>\n    4a5c:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    4a62:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    4a67:\te8 d4 5f 00 00       \tcallq  aa40 <measureFunction>\n    4a6c:\tf3 0f 10 6c 24 0c    \tmovss  0xc(%rsp),%xmm5\n    4a72:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    4a77:\t48 8d 35 a2 70 00 00 \tlea    0x70a2(%rip),%rsi        # bb20 <_IO_stdin_used+0xb20>\n    4a7e:\tb8 
01 00 00 00       \tmov    $0x1,%eax\n    4a83:\tf3 0f 5e e8          \tdivss  %xmm0,%xmm5\n    4a87:\t66 0f ef c0          \tpxor   %xmm0,%xmm0\n    4a8b:\tf3 0f 5a c5          \tcvtss2sd %xmm5,%xmm0\n    4a8f:\te8 ec c5 ff ff       \tcallq  1080 <__printf_chk@plt>\n    4a94:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    4a98:\t0f 8f d4 da ff ff    \tjg     2572 <main+0x14b2>\n    4a9e:\te9 a3 de ff ff       \tjmpq   2946 <main+0x1886>\n    4aa3:\t48 8d 35 4a 31 00 00 \tlea    0x314a(%rip),%rsi        # 7bf4 <mix256faddintadd>\n    4aaa:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    4ab0:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    4ab5:\te8 86 5f 00 00       \tcallq  aa40 <measureFunction>\n    4aba:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    4abf:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    4ac4:\t48 8d 35 7d 70 00 00 \tlea    0x707d(%rip),%rsi        # bb48 <_IO_stdin_used+0xb48>\n    4acb:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    4acf:\te8 ac c5 ff ff       \tcallq  1080 <__printf_chk@plt>\n    4ad4:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    4ad8:\t0f 8f 35 da ff ff    \tjg     2513 <main+0x1453>\n    4ade:\te9 63 de ff ff       \tjmpq   2946 <main+0x1886>\n    4ae3:\t48 8d 35 ae 4a 00 00 \tlea    0x4aae(%rip),%rsi        # 9598 <nemesfpumix21>\n    4aea:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    4af0:\tbf 05 7a d7 03       \tmov    $0x3d77a05,%edi\n    4af5:\t48 c1 e7 09          \tshl    $0x9,%rdi\n    4af9:\te8 42 5f 00 00       \tcallq  aa40 <measureFunction>\n    4afe:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    4b03:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    4b08:\t48 8d 35 61 70 00 00 \tlea    0x7061(%rip),%rsi        # bb70 <_IO_stdin_used+0xb70>\n    4b0f:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    4b13:\te8 68 c5 ff ff       \tcallq  1080 <__printf_chk@plt>\n    4b18:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    4b1c:\t0f 8f a0 d9 ff ff    \tjg     24c2 <main+0x1402>\n    4b22:\te9 1f de ff 
ff       \tjmpq   2946 <main+0x1886>\n    4b27:\t48 8d 35 54 47 00 00 \tlea    0x4754(%rip),%rsi        # 9282 <mixfmaand256>\n    4b2e:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    4b34:\tbf 05 7a d7 03       \tmov    $0x3d77a05,%edi\n    4b39:\t48 c1 e7 09          \tshl    $0x9,%rdi\n    4b3d:\te8 fe 5e 00 00       \tcallq  aa40 <measureFunction>\n    4b42:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    4b47:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    4b4c:\t48 8d 35 4d 70 00 00 \tlea    0x704d(%rip),%rsi        # bba0 <_IO_stdin_used+0xba0>\n    4b53:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    4b57:\te8 24 c5 ff ff       \tcallq  1080 <__printf_chk@plt>\n    4b5c:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    4b60:\t0f 8f b2 d8 ff ff    \tjg     2418 <main+0x1358>\n    4b66:\te9 db dd ff ff       \tjmpq   2946 <main+0x1886>\n    4b6b:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    4b71:\tbf 05 7a d7 03       \tmov    $0x3d77a05,%edi\n    4b76:\t48 8d 35 b3 5e 00 00 \tlea    0x5eb3(%rip),%rsi        # aa30 <mixfmaaddmem256wrapper>\n    4b7d:\t48 c1 e7 09          \tshl    $0x9,%rdi\n    4b81:\te8 ba 5e 00 00       \tcallq  aa40 <measureFunction>\n    4b86:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    4b8b:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    4b90:\t48 8d 35 31 70 00 00 \tlea    0x7031(%rip),%rsi        # bbc8 <_IO_stdin_used+0xbc8>\n    4b97:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    4b9b:\te8 e0 c4 ff ff       \tcallq  1080 <__printf_chk@plt>\n    4ba0:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    4ba4:\t0f 8f 1d d8 ff ff    \tjg     23c7 <main+0x1307>\n    4baa:\te9 97 dd ff ff       \tjmpq   2946 <main+0x1886>\n    4baf:\t48 8d 35 1b 46 00 00 \tlea    0x461b(%rip),%rsi        # 91d1 <mixfmaadd256>\n    4bb6:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    4bbc:\tbf 05 7a d7 03       \tmov    $0x3d77a05,%edi\n    4bc1:\t48 c1 e7 09          \tshl    $0x9,%rdi\n    4bc5:\te8 76 5e 00 00       \tcallq  aa40 
<measureFunction>\n    4bca:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    4bcf:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    4bd4:\t48 8d 35 1d 70 00 00 \tlea    0x701d(%rip),%rsi        # bbf8 <_IO_stdin_used+0xbf8>\n    4bdb:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    4bdf:\te8 9c c4 ff ff       \tcallq  1080 <__printf_chk@plt>\n    4be4:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    4be8:\t0f 8f 84 d7 ff ff    \tjg     2372 <main+0x12b2>\n    4bee:\te9 53 dd ff ff       \tjmpq   2946 <main+0x1886>\n    4bf3:\t48 8d 35 28 43 00 00 \tlea    0x4328(%rip),%rsi        # 8f22 <mixfmafadd256>\n    4bfa:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    4c00:\tbf 05 7a d7 03       \tmov    $0x3d77a05,%edi\n    4c05:\t48 c1 e7 09          \tshl    $0x9,%rdi\n    4c09:\te8 32 5e 00 00       \tcallq  aa40 <measureFunction>\n    4c0e:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    4c13:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    4c18:\t48 8d 35 01 70 00 00 \tlea    0x7001(%rip),%rsi        # bc20 <_IO_stdin_used+0xc20>\n    4c1f:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    4c23:\te8 58 c4 ff ff       \tcallq  1080 <__printf_chk@plt>\n    4c28:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    4c2c:\t0f 8f eb d6 ff ff    \tjg     231d <main+0x125d>\n    4c32:\te9 0f dd ff ff       \tjmpq   2946 <main+0x1886>\n    4c37:\t48 8d 35 ff 3d 00 00 \tlea    0x3dff(%rip),%rsi        # 8a3d <mul256fp>\n    4c3e:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    4c44:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    4c49:\te8 f2 5d 00 00       \tcallq  aa40 <measureFunction>\n    4c4e:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    4c53:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    4c58:\t48 8d 35 2c 68 00 00 \tlea    0x682c(%rip),%rsi        # b48b <_IO_stdin_used+0x48b>\n    4c5f:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    4c63:\te8 18 c4 ff ff       \tcallq  1080 <__printf_chk@plt>\n    4c68:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    4c6c:\t0f 8f 
56 d6 ff ff    \tjg     22c8 <main+0x1208>\n    4c72:\te9 cf dc ff ff       \tjmpq   2946 <main+0x1886>\n    4c77:\t48 8d 35 5e 3e 00 00 \tlea    0x3e5e(%rip),%rsi        # 8adc <add256fp>\n    4c7e:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    4c84:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    4c89:\te8 b2 5d 00 00       \tcallq  aa40 <measureFunction>\n    4c8e:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    4c93:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    4c98:\t48 8d 35 08 68 00 00 \tlea    0x6808(%rip),%rsi        # b4a7 <_IO_stdin_used+0x4a7>\n    4c9f:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    4ca3:\te8 d8 c3 ff ff       \tcallq  1080 <__printf_chk@plt>\n    4ca8:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    4cac:\t0f 8f c5 d5 ff ff    \tjg     2277 <main+0x11b7>\n    4cb2:\te9 8f dc ff ff       \tjmpq   2946 <main+0x1886>\n    4cb7:\t48 8d 35 13 4b 00 00 \tlea    0x4b13(%rip),%rsi        # 97d1 <latfma128>\n    4cbe:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    4cc4:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    4cc9:\te8 72 5d 00 00       \tcallq  aa40 <measureFunction>\n    4cce:\tf3 0f 10 74 24 0c    \tmovss  0xc(%rsp),%xmm6\n    4cd4:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    4cd9:\t48 8d 35 68 6f 00 00 \tlea    0x6f68(%rip),%rsi        # bc48 <_IO_stdin_used+0xc48>\n    4ce0:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    4ce5:\tf3 0f 5e f0          \tdivss  %xmm0,%xmm6\n    4ce9:\t66 0f ef c0          \tpxor   %xmm0,%xmm0\n    4ced:\tf3 0f 5a c6          \tcvtss2sd %xmm6,%xmm0\n    4cf1:\te8 8a c3 ff ff       \tcallq  1080 <__printf_chk@plt>\n    4cf6:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    4cfa:\t0f 8f 26 d5 ff ff    \tjg     2226 <main+0x1166>\n    4d00:\te9 41 dc ff ff       \tjmpq   2946 <main+0x1886>\n    4d05:\t48 8d 35 0d 4a 00 00 \tlea    0x4a0d(%rip),%rsi        # 9719 <latfma256>\n    4d0c:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    4d12:\tbf 00 2f 68 59       \tmov    
$0x59682f00,%edi\n    4d17:\te8 24 5d 00 00       \tcallq  aa40 <measureFunction>\n    4d1c:\tf3 0f 10 7c 24 0c    \tmovss  0xc(%rsp),%xmm7\n    4d22:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    4d27:\t48 8d 35 42 6f 00 00 \tlea    0x6f42(%rip),%rsi        # bc70 <_IO_stdin_used+0xc70>\n    4d2e:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    4d33:\tf3 0f 5e f8          \tdivss  %xmm0,%xmm7\n    4d37:\t66 0f ef c0          \tpxor   %xmm0,%xmm0\n    4d3b:\tf3 0f 5a c7          \tcvtss2sd %xmm7,%xmm0\n    4d3f:\te8 3c c3 ff ff       \tcallq  1080 <__printf_chk@plt>\n    4d44:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    4d48:\t0f 8f 79 d4 ff ff    \tjg     21c7 <main+0x1107>\n    4d4e:\te9 f3 db ff ff       \tjmpq   2946 <main+0x1886>\n    4d53:\t48 8d 35 0d 41 00 00 \tlea    0x410d(%rip),%rsi        # 8e67 <fma128>\n    4d5a:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    4d60:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    4d65:\te8 d6 5c 00 00       \tcallq  aa40 <measureFunction>\n    4d6a:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    4d6f:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    4d74:\t48 8d 35 48 67 00 00 \tlea    0x6748(%rip),%rsi        # b4c3 <_IO_stdin_used+0x4c3>\n    4d7b:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    4d7f:\te8 fc c2 ff ff       \tcallq  1080 <__printf_chk@plt>\n    4d84:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    4d88:\t0f 8f da d3 ff ff    \tjg     2168 <main+0x10a8>\n    4d8e:\te9 b3 db ff ff       \tjmpq   2946 <main+0x1886>\n    4d93:\t48 8d 35 15 40 00 00 \tlea    0x4015(%rip),%rsi        # 8daf <fma256>\n    4d9a:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    4da0:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    4da5:\te8 96 5c 00 00       \tcallq  aa40 <measureFunction>\n    4daa:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    4daf:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    4db4:\t48 8d 35 23 67 00 00 \tlea    0x6723(%rip),%rsi        # b4de <_IO_stdin_used+0x4de>\n    4dbb:\tf3 0f 5a c0  
        \tcvtss2sd %xmm0,%xmm0\n    4dbf:\te8 bc c2 ff ff       \tcallq  1080 <__printf_chk@plt>\n    4dc4:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    4dc8:\t0f 8f 49 d3 ff ff    \tjg     2117 <main+0x1057>\n    4dce:\te9 73 db ff ff       \tjmpq   2946 <main+0x1886>\n    4dd3:\t48 8d 35 43 3a 00 00 \tlea    0x3a43(%rip),%rsi        # 881d <mul128int>\n    4dda:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    4de0:\t4c 89 f7             \tmov    %r14,%rdi\n    4de3:\te8 58 5c 00 00       \tcallq  aa40 <measureFunction>\n    4de8:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    4ded:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    4df2:\t48 8d 35 9f 6e 00 00 \tlea    0x6e9f(%rip),%rsi        # bc98 <_IO_stdin_used+0xc98>\n    4df9:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    4dfd:\te8 7e c2 ff ff       \tcallq  1080 <__printf_chk@plt>\n    4e02:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    4e06:\t0f 8f ba d2 ff ff    \tjg     20c6 <main+0x1006>\n    4e0c:\te9 35 db ff ff       \tjmpq   2946 <main+0x1886>\n    4e11:\t48 8d 35 9c 35 00 00 \tlea    0x359c(%rip),%rsi        # 83b4 <add128int>\n    4e18:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    4e1e:\t4c 89 f7             \tmov    %r14,%rdi\n    4e21:\te8 1a 5c 00 00       \tcallq  aa40 <measureFunction>\n    4e26:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    4e2b:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    4e30:\t48 8d 35 89 6e 00 00 \tlea    0x6e89(%rip),%rsi        # bcc0 <_IO_stdin_used+0xcc0>\n    4e37:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    4e3b:\te8 40 c2 ff ff       \tcallq  1080 <__printf_chk@plt>\n    4e40:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    4e44:\t0f 8f 2d d2 ff ff    \tjg     2077 <main+0xfb7>\n    4e4a:\te9 f7 da ff ff       \tjmpq   2946 <main+0x1886>\n    4e4f:\t48 8d 35 f6 4a 00 00 \tlea    0x4af6(%rip),%rsi        # 994c <mul128fp>\n    4e56:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    4e5c:\t4c 89 f7             \tmov    %r14,%rdi\n    4e5f:\te8 
dc 5b 00 00       \tcallq  aa40 <measureFunction>\n    4e64:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    4e69:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    4e6e:\t48 8d 35 73 6e 00 00 \tlea    0x6e73(%rip),%rsi        # bce8 <_IO_stdin_used+0xce8>\n    4e75:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    4e79:\te8 02 c2 ff ff       \tcallq  1080 <__printf_chk@plt>\n    4e7e:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    4e82:\t0f 8f a0 d1 ff ff    \tjg     2028 <main+0xf68>\n    4e88:\te9 b9 da ff ff       \tjmpq   2946 <main+0x1886>\n    4e8d:\t48 8d 35 27 4b 00 00 \tlea    0x4b27(%rip),%rsi        # 99bb <add128fp>\n    4e94:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    4e9a:\t4c 89 f7             \tmov    %r14,%rdi\n    4e9d:\te8 9e 5b 00 00       \tcallq  aa40 <measureFunction>\n    4ea2:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    4ea7:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    4eac:\t48 8d 35 55 6e 00 00 \tlea    0x6e55(%rip),%rsi        # bd08 <_IO_stdin_used+0xd08>\n    4eb3:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    4eb7:\te8 c4 c1 ff ff       \tcallq  1080 <__printf_chk@plt>\n    4ebc:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    4ec0:\t0f 8f 13 d1 ff ff    \tjg     1fd9 <main+0xf19>\n    4ec6:\te9 7b da ff ff       \tjmpq   2946 <main+0x1886>\n    4ecb:\t48 8d 35 1a 4a 00 00 \tlea    0x4a1a(%rip),%rsi        # 98ec <latmul128fp>\n    4ed2:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    4ed8:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    4edd:\te8 5e 5b 00 00       \tcallq  aa40 <measureFunction>\n    4ee2:\tf3 0f 10 6c 24 0c    \tmovss  0xc(%rsp),%xmm5\n    4ee8:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    4eed:\t48 8d 35 34 6e 00 00 \tlea    0x6e34(%rip),%rsi        # bd28 <_IO_stdin_used+0xd28>\n    4ef4:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    4ef9:\tf3 0f 5e e8          \tdivss  %xmm0,%xmm5\n    4efd:\t66 0f ef c0          \tpxor   %xmm0,%xmm0\n    4f01:\tf3 0f 5a c5          \tcvtss2sd %xmm5,%xmm0\n  
  4f05:\te8 76 c1 ff ff       \tcallq  1080 <__printf_chk@plt>\n    4f0a:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    4f0e:\t0f 8f 76 d0 ff ff    \tjg     1f8a <main+0xeca>\n    4f14:\te9 2d da ff ff       \tjmpq   2946 <main+0x1886>\n    4f19:\t48 8d 35 6c 49 00 00 \tlea    0x496c(%rip),%rsi        # 988c <latadd128fp>\n    4f20:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    4f26:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    4f2b:\te8 10 5b 00 00       \tcallq  aa40 <measureFunction>\n    4f30:\tf3 0f 10 74 24 0c    \tmovss  0xc(%rsp),%xmm6\n    4f36:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    4f3b:\t48 8d 35 0e 6e 00 00 \tlea    0x6e0e(%rip),%rsi        # bd50 <_IO_stdin_used+0xd50>\n    4f42:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    4f47:\tf3 0f 5e f0          \tdivss  %xmm0,%xmm6\n    4f4b:\t66 0f ef c0          \tpxor   %xmm0,%xmm0\n    4f4f:\tf3 0f 5a c6          \tcvtss2sd %xmm6,%xmm0\n    4f53:\te8 28 c1 ff ff       \tcallq  1080 <__printf_chk@plt>\n    4f58:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    4f5c:\t0f 8f c9 cf ff ff    \tjg     1f2b <main+0xe6b>\n    4f62:\te9 df d9 ff ff       \tjmpq   2946 <main+0x1886>\n    4f67:\t48 8d 35 0d 3c 00 00 \tlea    0x3c0d(%rip),%rsi        # 8b7b <latmul256fp>\n    4f6e:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    4f74:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    4f79:\te8 c2 5a 00 00       \tcallq  aa40 <measureFunction>\n    4f7e:\tf3 0f 10 7c 24 0c    \tmovss  0xc(%rsp),%xmm7\n    4f84:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    4f89:\t48 8d 35 e8 6d 00 00 \tlea    0x6de8(%rip),%rsi        # bd78 <_IO_stdin_used+0xd78>\n    4f90:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    4f95:\tf3 0f 5e f8          \tdivss  %xmm0,%xmm7\n    4f99:\t66 0f ef c0          \tpxor   %xmm0,%xmm0\n    4f9d:\tf3 0f 5a c7          \tcvtss2sd %xmm7,%xmm0\n    4fa1:\te8 da c0 ff ff       \tcallq  1080 <__printf_chk@plt>\n    4fa6:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    4faa:\t0f 8f 
1c cf ff ff    \tjg     1ecc <main+0xe0c>\n    4fb0:\te9 91 d9 ff ff       \tjmpq   2946 <main+0x1886>\n    4fb5:\t48 8d 35 05 3a 00 00 \tlea    0x3a05(%rip),%rsi        # 89c1 <latadd256fp>\n    4fbc:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    4fc2:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    4fc7:\te8 74 5a 00 00       \tcallq  aa40 <measureFunction>\n    4fcc:\tf3 0f 10 6c 24 0c    \tmovss  0xc(%rsp),%xmm5\n    4fd2:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    4fd7:\t48 8d 35 c2 6d 00 00 \tlea    0x6dc2(%rip),%rsi        # bda0 <_IO_stdin_used+0xda0>\n    4fde:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    4fe3:\tf3 0f 5e e8          \tdivss  %xmm0,%xmm5\n    4fe7:\t66 0f ef c0          \tpxor   %xmm0,%xmm0\n    4feb:\tf3 0f 5a c5          \tcvtss2sd %xmm5,%xmm0\n    4fef:\te8 8c c0 ff ff       \tcallq  1080 <__printf_chk@plt>\n    4ff4:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    4ff8:\t0f 8f 6f ce ff ff    \tjg     1e6d <main+0xdad>\n    4ffe:\te9 43 d9 ff ff       \tjmpq   2946 <main+0x1886>\n    5003:\t48 8d 35 92 38 00 00 \tlea    0x3892(%rip),%rsi        # 889c <latmul128int>\n    500a:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    5010:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    5015:\te8 26 5a 00 00       \tcallq  aa40 <measureFunction>\n    501a:\tf3 0f 10 74 24 0c    \tmovss  0xc(%rsp),%xmm6\n    5020:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    5025:\t48 8d 35 9c 6d 00 00 \tlea    0x6d9c(%rip),%rsi        # bdc8 <_IO_stdin_used+0xdc8>\n    502c:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    5031:\tf3 0f 5e f0          \tdivss  %xmm0,%xmm6\n    5035:\t66 0f ef c0          \tpxor   %xmm0,%xmm0\n    5039:\tf3 0f 5a c6          \tcvtss2sd %xmm6,%xmm0\n    503d:\te8 3e c0 ff ff       \tcallq  1080 <__printf_chk@plt>\n    5042:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    5046:\t0f 8f c2 cd ff ff    \tjg     1e0e <main+0xd4e>\n    504c:\te9 f5 d8 ff ff       \tjmpq   2946 <main+0x1886>\n    5051:\t48 8d 35 f1 32 00 
00 \tlea    0x32f1(%rip),%rsi        # 8349 <latadd128int>\n    5058:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    505e:\t4c 89 f7             \tmov    %r14,%rdi\n    5061:\te8 da 59 00 00       \tcallq  aa40 <measureFunction>\n    5066:\tf3 0f 10 7c 24 0c    \tmovss  0xc(%rsp),%xmm7\n    506c:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    5071:\t48 8d 35 88 6d 00 00 \tlea    0x6d88(%rip),%rsi        # be00 <_IO_stdin_used+0xe00>\n    5078:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    507d:\tf3 0f 5e f8          \tdivss  %xmm0,%xmm7\n    5081:\t66 0f ef c0          \tpxor   %xmm0,%xmm0\n    5085:\tf3 0f 5a c7          \tcvtss2sd %xmm7,%xmm0\n    5089:\te8 f2 bf ff ff       \tcallq  1080 <__printf_chk@plt>\n    508e:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    5092:\t0f 8f 17 cd ff ff    \tjg     1daf <main+0xcef>\n    5098:\te9 a9 d8 ff ff       \tjmpq   2946 <main+0x1886>\n    509d:\t48 8d 35 f7 31 00 00 \tlea    0x31f7(%rip),%rsi        # 829b <latmul256int>\n    50a4:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    50aa:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    50af:\te8 8c 59 00 00       \tcallq  aa40 <measureFunction>\n    50b4:\tf3 0f 10 6c 24 0c    \tmovss  0xc(%rsp),%xmm5\n    50ba:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    50bf:\t48 8d 35 6a 6d 00 00 \tlea    0x6d6a(%rip),%rsi        # be30 <_IO_stdin_used+0xe30>\n    50c6:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    50cb:\tf3 0f 5e e8          \tdivss  %xmm0,%xmm5\n    50cf:\t66 0f ef c0          \tpxor   %xmm0,%xmm0\n    50d3:\tf3 0f 5a c5          \tcvtss2sd %xmm5,%xmm0\n    50d7:\te8 a4 bf ff ff       \tcallq  1080 <__printf_chk@plt>\n    50dc:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    50e0:\t0f 8f 6c cc ff ff    \tjg     1d52 <main+0xc92>\n    50e6:\te9 5b d8 ff ff       \tjmpq   2946 <main+0x1886>\n    50eb:\t48 8d 35 be 2d 00 00 \tlea    0x2dbe(%rip),%rsi        # 7eb0 <latadd256int>\n    50f2:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    50f8:\t4c 89 f7  
           \tmov    %r14,%rdi\n    50fb:\te8 40 59 00 00       \tcallq  aa40 <measureFunction>\n    5100:\tf3 0f 10 74 24 0c    \tmovss  0xc(%rsp),%xmm6\n    5106:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    510b:\t48 8d 35 56 6d 00 00 \tlea    0x6d56(%rip),%rsi        # be68 <_IO_stdin_used+0xe68>\n    5112:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    5117:\tf3 0f 5e f0          \tdivss  %xmm0,%xmm6\n    511b:\t66 0f ef c0          \tpxor   %xmm0,%xmm0\n    511f:\tf3 0f 5a c6          \tcvtss2sd %xmm6,%xmm0\n    5123:\te8 58 bf ff ff       \tcallq  1080 <__printf_chk@plt>\n    5128:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    512c:\t0f 8f c1 cb ff ff    \tjg     1cf3 <main+0xc33>\n    5132:\te9 0f d8 ff ff       \tjmpq   2946 <main+0x1886>\n    5137:\t48 8d 35 62 2b 00 00 \tlea    0x2b62(%rip),%rsi        # 7ca0 <mix256fp>\n    513e:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    5144:\t4c 89 f7             \tmov    %r14,%rdi\n    5147:\te8 f4 58 00 00       \tcallq  aa40 <measureFunction>\n    514c:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    5151:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    5156:\t48 8d 35 3b 6d 00 00 \tlea    0x6d3b(%rip),%rsi        # be98 <_IO_stdin_used+0xe98>\n    515d:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    5161:\te8 1a bf ff ff       \tcallq  1080 <__printf_chk@plt>\n    5166:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    516a:\t0f 8f 26 cb ff ff    \tjg     1c96 <main+0xbd6>\n    5170:\te9 d1 d7 ff ff       \tjmpq   2946 <main+0x1886>\n    5175:\t48 8d 35 cf 29 00 00 \tlea    0x29cf(%rip),%rsi        # 7b4b <mixadd256fpint>\n    517c:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    5182:\t4c 89 f7             \tmov    %r14,%rdi\n    5185:\te8 b6 58 00 00       \tcallq  aa40 <measureFunction>\n    518a:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    518f:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    5194:\t48 8d 35 2d 6d 00 00 \tlea    0x6d2d(%rip),%rsi        # bec8 <_IO_stdin_used+0xec8>\n    519b:\tf3 0f 
5a c0          \tcvtss2sd %xmm0,%xmm0\n    519f:\te8 dc be ff ff       \tcallq  1080 <__printf_chk@plt>\n    51a4:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    51a8:\t0f 8f 99 ca ff ff    \tjg     1c47 <main+0xb87>\n    51ae:\te9 93 d7 ff ff       \tjmpq   2946 <main+0x1886>\n    51b3:\t48 8d 35 4f 2c 00 00 \tlea    0x2c4f(%rip),%rsi        # 7e09 <mixadd256int11>\n    51ba:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    51c0:\t4c 89 f7             \tmov    %r14,%rdi\n    51c3:\te8 78 58 00 00       \tcallq  aa40 <measureFunction>\n    51c8:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    51cd:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    51d2:\t48 8d 35 1f 6d 00 00 \tlea    0x6d1f(%rip),%rsi        # bef8 <_IO_stdin_used+0xef8>\n    51d9:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    51dd:\te8 9e be ff ff       \tcallq  1080 <__printf_chk@plt>\n    51e2:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    51e6:\t0f 8f 0c ca ff ff    \tjg     1bf8 <main+0xb38>\n    51ec:\te9 55 d7 ff ff       \tjmpq   2946 <main+0x1886>\n    51f1:\t48 8d 35 4c 2b 00 00 \tlea    0x2b4c(%rip),%rsi        # 7d44 <mixadd256int>\n    51f8:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    51fe:\t4c 89 f7             \tmov    %r14,%rdi\n    5201:\te8 3a 58 00 00       \tcallq  aa40 <measureFunction>\n    5206:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    520b:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    5210:\t48 8d 35 19 6d 00 00 \tlea    0x6d19(%rip),%rsi        # bf30 <_IO_stdin_used+0xf30>\n    5217:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    521b:\te8 60 be ff ff       \tcallq  1080 <__printf_chk@plt>\n    5220:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    5224:\t0f 8f 7f c9 ff ff    \tjg     1ba9 <main+0xae9>\n    522a:\te9 17 d7 ff ff       \tjmpq   2946 <main+0x1886>\n    522f:\t48 8d 35 55 26 00 00 \tlea    0x2655(%rip),%rsi        # 788b <add256int>\n    5236:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    523c:\t4c 89 f7             \tmov    %r14,%rdi\n 
   523f:\te8 fc 57 00 00       \tcallq  aa40 <measureFunction>\n    5244:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    5249:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    524e:\t48 8d 35 13 6d 00 00 \tlea    0x6d13(%rip),%rsi        # bf68 <_IO_stdin_used+0xf68>\n    5255:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    5259:\te8 22 be ff ff       \tcallq  1080 <__printf_chk@plt>\n    525e:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    5262:\t0f 8f f2 c8 ff ff    \tjg     1b5a <main+0xa9a>\n    5268:\te9 d9 d6 ff ff       \tjmpq   2946 <main+0x1886>\n    526d:\t48 8d 35 02 21 00 00 \tlea    0x2102(%rip),%rsi        # 7376 <leamultest>\n    5274:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    527a:\t4c 89 f7             \tmov    %r14,%rdi\n    527d:\te8 be 57 00 00       \tcallq  aa40 <measureFunction>\n    5282:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    5287:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    528c:\t48 8d 35 fd 6c 00 00 \tlea    0x6cfd(%rip),%rsi        # bf90 <_IO_stdin_used+0xf90>\n    5293:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    5297:\te8 e4 bd ff ff       \tcallq  1080 <__printf_chk@plt>\n    529c:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    52a0:\t0f 8f 65 c8 ff ff    \tjg     1b0b <main+0xa4b>\n    52a6:\te9 9b d6 ff ff       \tjmpq   2946 <main+0x1886>\n    52ab:\t48 8d 35 21 20 00 00 \tlea    0x2021(%rip),%rsi        # 72d3 <leatest>\n    52b2:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    52b8:\t4c 89 f7             \tmov    %r14,%rdi\n    52bb:\te8 80 57 00 00       \tcallq  aa40 <measureFunction>\n    52c0:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    52c5:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    52ca:\t48 8d 35 28 62 00 00 \tlea    0x6228(%rip),%rsi        # b4f9 <_IO_stdin_used+0x4f9>\n    52d1:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    52d5:\te8 a6 bd ff ff       \tcallq  1080 <__printf_chk@plt>\n    52da:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    52de:\t0f 8f d8 c7 ff ff    \tjg     
1abc <main+0x9fc>\n    52e4:\te9 5d d6 ff ff       \tjmpq   2946 <main+0x1886>\n    52e9:\t48 8d 35 a3 1e 00 00 \tlea    0x1ea3(%rip),%rsi        # 7193 <rorbtstest>\n    52f0:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    52f6:\t4c 89 f7             \tmov    %r14,%rdi\n    52f9:\te8 42 57 00 00       \tcallq  aa40 <measureFunction>\n    52fe:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    5303:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    5308:\t48 8d 35 03 62 00 00 \tlea    0x6203(%rip),%rsi        # b512 <_IO_stdin_used+0x512>\n    530f:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    5313:\te8 68 bd ff ff       \tcallq  1080 <__printf_chk@plt>\n    5318:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    531c:\t0f 8f 4b c7 ff ff    \tjg     1a6d <main+0x9ad>\n    5322:\te9 1f d6 ff ff       \tjmpq   2946 <main+0x1886>\n    5327:\t48 8d 35 fb 20 00 00 \tlea    0x20fb(%rip),%rsi        # 7429 <btsmultest>\n    532e:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    5334:\t4c 89 f7             \tmov    %r14,%rdi\n    5337:\te8 04 57 00 00       \tcallq  aa40 <measureFunction>\n    533c:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    5341:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    5346:\t48 8d 35 e0 61 00 00 \tlea    0x61e0(%rip),%rsi        # b52d <_IO_stdin_used+0x52d>\n    534d:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    5351:\te8 2a bd ff ff       \tcallq  1080 <__printf_chk@plt>\n    5356:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    535a:\t0f 8f be c6 ff ff    \tjg     1a1e <main+0x95e>\n    5360:\te9 e1 d5 ff ff       \tjmpq   2946 <main+0x1886>\n    5365:\t48 8d 35 c4 1e 00 00 \tlea    0x1ec4(%rip),%rsi        # 7230 <btstest>\n    536c:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    5372:\t4c 89 f7             \tmov    %r14,%rdi\n    5375:\te8 c6 56 00 00       \tcallq  aa40 <measureFunction>\n    537a:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    537f:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    5384:\t48 8d 35 bd 61 00 00 \tlea  
  0x61bd(%rip),%rsi        # b548 <_IO_stdin_used+0x548>\n    538b:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    538f:\te8 ec bc ff ff       \tcallq  1080 <__printf_chk@plt>\n    5394:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    5398:\t0f 8f 31 c6 ff ff    \tjg     19cf <main+0x90f>\n    539e:\te9 a3 d5 ff ff       \tjmpq   2946 <main+0x1886>\n    53a3:\t48 8d 35 34 1d 00 00 \tlea    0x1d34(%rip),%rsi        # 70de <mixrormultest>\n    53aa:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    53b0:\t4c 89 f7             \tmov    %r14,%rdi\n    53b3:\te8 88 56 00 00       \tcallq  aa40 <measureFunction>\n    53b8:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    53bd:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    53c2:\t48 8d 35 92 61 00 00 \tlea    0x6192(%rip),%rsi        # b55b <_IO_stdin_used+0x55b>\n    53c9:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    53cd:\te8 ae bc ff ff       \tcallq  1080 <__printf_chk@plt>\n    53d2:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    53d6:\t0f 8f a4 c5 ff ff    \tjg     1980 <main+0x8c0>\n    53dc:\te9 65 d5 ff ff       \tjmpq   2946 <main+0x1886>\n    53e1:\t48 8d 35 6a 1c 00 00 \tlea    0x1c6a(%rip),%rsi        # 7052 <mixrorshltest>\n    53e8:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    53ee:\t4c 89 f7             \tmov    %r14,%rdi\n    53f1:\te8 4a 56 00 00       \tcallq  aa40 <measureFunction>\n    53f6:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    53fb:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    5400:\t48 8d 35 b1 6b 00 00 \tlea    0x6bb1(%rip),%rsi        # bfb8 <_IO_stdin_used+0xfb8>\n    5407:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    540b:\te8 70 bc ff ff       \tcallq  1080 <__printf_chk@plt>\n    5410:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    5414:\t0f 8f 17 c5 ff ff    \tjg     1931 <main+0x871>\n    541a:\te9 27 d5 ff ff       \tjmpq   2946 <main+0x1886>\n    541f:\t48 8d 35 14 1b 00 00 \tlea    0x1b14(%rip),%rsi        # 6f3a <rortest>\n    5426:\tf3 0f 10 44 24 08    
\tmovss  0x8(%rsp),%xmm0\n    542c:\t4c 89 f7             \tmov    %r14,%rdi\n    542f:\te8 0c 56 00 00       \tcallq  aa40 <measureFunction>\n    5434:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    5439:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    543e:\t48 8d 35 31 61 00 00 \tlea    0x6131(%rip),%rsi        # b576 <_IO_stdin_used+0x576>\n    5445:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    5449:\te8 32 bc ff ff       \tcallq  1080 <__printf_chk@plt>\n    544e:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    5452:\t0f 8f 8a c4 ff ff    \tjg     18e2 <main+0x822>\n    5458:\te9 e9 d4 ff ff       \tjmpq   2946 <main+0x1886>\n    545d:\t48 8d 35 62 1b 00 00 \tlea    0x1b62(%rip),%rsi        # 6fc6 <shltest>\n    5464:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    546a:\t4c 89 f7             \tmov    %r14,%rdi\n    546d:\te8 ce 55 00 00       \tcallq  aa40 <measureFunction>\n    5472:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    5477:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    547c:\t48 8d 35 0a 61 00 00 \tlea    0x610a(%rip),%rsi        # b58d <_IO_stdin_used+0x58d>\n    5483:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    5487:\te8 f4 bb ff ff       \tcallq  1080 <__printf_chk@plt>\n    548c:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    5490:\t0f 8f fd c3 ff ff    \tjg     1893 <main+0x7d3>\n    5496:\te9 ab d4 ff ff       \tjmpq   2946 <main+0x1886>\n    549b:\t48 8d 35 42 4f 00 00 \tlea    0x4f42(%rip),%rsi        # a3e4 <pdepmultest>\n    54a2:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    54a8:\t4c 89 f7             \tmov    %r14,%rdi\n    54ab:\te8 90 55 00 00       \tcallq  aa40 <measureFunction>\n    54b0:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    54b5:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    54ba:\t48 8d 35 e3 60 00 00 \tlea    0x60e3(%rip),%rsi        # b5a4 <_IO_stdin_used+0x5a4>\n    54c1:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    54c5:\te8 b6 bb ff ff       \tcallq  1080 <__printf_chk@plt>\n    54ca:\t41 83 
fc 01          \tcmp    $0x1,%r12d\n    54ce:\t0f 8f 70 c3 ff ff    \tjg     1844 <main+0x784>\n    54d4:\te9 6d d4 ff ff       \tjmpq   2946 <main+0x1886>\n    54d9:\t48 8d 35 b3 4f 00 00 \tlea    0x4fb3(%rip),%rsi        # a493 <pexttest>\n    54e0:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    54e6:\t4c 89 f7             \tmov    %r14,%rdi\n    54e9:\te8 52 55 00 00       \tcallq  aa40 <measureFunction>\n    54ee:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    54f3:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    54f8:\t48 8d 35 c1 60 00 00 \tlea    0x60c1(%rip),%rsi        # b5c0 <_IO_stdin_used+0x5c0>\n    54ff:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    5503:\te8 78 bb ff ff       \tcallq  1080 <__printf_chk@plt>\n    5508:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    550c:\t0f 8f e3 c2 ff ff    \tjg     17f5 <main+0x735>\n    5512:\te9 2f d4 ff ff       \tjmpq   2946 <main+0x1886>\n    5517:\t48 8d 35 12 4e 00 00 \tlea    0x4e12(%rip),%rsi        # a330 <pdeptest>\n    551e:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    5524:\t4c 89 f7             \tmov    %r14,%rdi\n    5527:\te8 14 55 00 00       \tcallq  aa40 <measureFunction>\n    552c:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    5531:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    5536:\t48 8d 35 97 60 00 00 \tlea    0x6097(%rip),%rsi        # b5d4 <_IO_stdin_used+0x5d4>\n    553d:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    5541:\te8 3a bb ff ff       \tcallq  1080 <__printf_chk@plt>\n    5546:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    554a:\t0f 8f 56 c2 ff ff    \tjg     17a6 <main+0x6e6>\n    5550:\te9 f1 d3 ff ff       \tjmpq   2946 <main+0x1886>\n    5555:\t48 8d 35 6d 20 00 00 \tlea    0x206d(%rip),%rsi        # 75c9 <ntjmptest>\n    555c:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    5562:\t4c 89 f7             \tmov    %r14,%rdi\n    5565:\te8 d6 54 00 00       \tcallq  aa40 <measureFunction>\n    556a:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    556f:\tb8 
01 00 00 00       \tmov    $0x1,%eax\n    5574:\t48 8d 35 6d 60 00 00 \tlea    0x606d(%rip),%rsi        # b5e8 <_IO_stdin_used+0x5e8>\n    557b:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    557f:\te8 fc ba ff ff       \tcallq  1080 <__printf_chk@plt>\n    5584:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    5588:\t0f 8f c9 c1 ff ff    \tjg     1757 <main+0x697>\n    558e:\te9 b3 d3 ff ff       \tjmpq   2946 <main+0x1886>\n    5593:\t48 8d 35 57 1f 00 00 \tlea    0x1f57(%rip),%rsi        # 74f1 <jmptest>\n    559a:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    55a0:\t4c 89 f7             \tmov    %r14,%rdi\n    55a3:\te8 98 54 00 00       \tcallq  aa40 <measureFunction>\n    55a8:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    55ad:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    55b2:\t48 8d 35 45 60 00 00 \tlea    0x6045(%rip),%rsi        # b5fe <_IO_stdin_used+0x5fe>\n    55b9:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    55bd:\te8 be ba ff ff       \tcallq  1080 <__printf_chk@plt>\n    55c2:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    55c6:\t0f 8f 3c c1 ff ff    \tjg     1708 <main+0x648>\n    55cc:\te9 75 d3 ff ff       \tjmpq   2946 <main+0x1886>\n    55d1:\t48 8d 35 0f 21 00 00 \tlea    0x210f(%rip),%rsi        # 76e7 <jmpmultest>\n    55d8:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    55de:\t4c 89 f7             \tmov    %r14,%rdi\n    55e1:\te8 5a 54 00 00       \tcallq  aa40 <measureFunction>\n    55e6:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    55eb:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    55f0:\t48 8d 35 20 60 00 00 \tlea    0x6020(%rip),%rsi        # b617 <_IO_stdin_used+0x617>\n    55f7:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    55fb:\te8 80 ba ff ff       \tcallq  1080 <__printf_chk@plt>\n    5600:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    5604:\t0f 8f af c0 ff ff    \tjg     16b9 <main+0x5f9>\n    560a:\te9 37 d3 ff ff       \tjmpq   2946 <main+0x1886>\n    560f:\t48 8d 35 8e 21 00 00 \tlea    
0x218e(%rip),%rsi        # 77a4 <addmultest>\n    5616:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    561c:\t4c 89 f7             \tmov    %r14,%rdi\n    561f:\te8 1c 54 00 00       \tcallq  aa40 <measureFunction>\n    5624:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    5629:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    562e:\t48 8d 35 fd 5f 00 00 \tlea    0x5ffd(%rip),%rsi        # b632 <_IO_stdin_used+0x632>\n    5635:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    5639:\te8 42 ba ff ff       \tcallq  1080 <__printf_chk@plt>\n    563e:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    5642:\t0f 8f 22 c0 ff ff    \tjg     166a <main+0x5aa>\n    5648:\te9 f9 d2 ff ff       \tjmpq   2946 <main+0x1886>\n    564d:\t48 8d 35 52 16 00 00 \tlea    0x1652(%rip),%rsi        # 6ca6 <clkmovtest>\n    5654:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    565a:\t4c 89 f7             \tmov    %r14,%rdi\n    565d:\te8 de 53 00 00       \tcallq  aa40 <measureFunction>\n    5662:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    5667:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    566c:\t48 8d 35 65 69 00 00 \tlea    0x6965(%rip),%rsi        # bfd8 <_IO_stdin_used+0xfd8>\n    5673:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    5677:\te8 04 ba ff ff       \tcallq  1080 <__printf_chk@plt>\n    567c:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    5680:\t0f 8f 95 bf ff ff    \tjg     161b <main+0x55b>\n    5686:\te9 bb d2 ff ff       \tjmpq   2946 <main+0x1886>\n    568b:\t48 8d 35 5c 52 00 00 \tlea    0x525c(%rip),%rsi        # a8ee <depdectest>\n    5692:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    5698:\t4c 89 f7             \tmov    %r14,%rdi\n    569b:\te8 a0 53 00 00       \tcallq  aa40 <measureFunction>\n    56a0:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    56a5:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    56aa:\t48 8d 35 9e 5f 00 00 \tlea    0x5f9e(%rip),%rsi        # b64f <_IO_stdin_used+0x64f>\n    56b1:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    
56b5:\te8 c6 b9 ff ff       \tcallq  1080 <__printf_chk@plt>\n    56ba:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    56be:\t0f 8f bd be ff ff    \tjg     1581 <main+0x4c1>\n    56c4:\te9 7d d2 ff ff       \tjmpq   2946 <main+0x1886>\n    56c9:\t48 8d 35 a7 51 00 00 \tlea    0x51a7(%rip),%rsi        # a877 <depinctest>\n    56d0:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    56d6:\t4c 89 f7             \tmov    %r14,%rdi\n    56d9:\te8 62 53 00 00       \tcallq  aa40 <measureFunction>\n    56de:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    56e3:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    56e8:\t48 8d 35 77 5f 00 00 \tlea    0x5f77(%rip),%rsi        # b666 <_IO_stdin_used+0x666>\n    56ef:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    56f3:\te8 88 b9 ff ff       \tcallq  1080 <__printf_chk@plt>\n    56f8:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    56fc:\t0f 8f 30 be ff ff    \tjg     1532 <main+0x472>\n    5702:\te9 3f d2 ff ff       \tjmpq   2946 <main+0x1886>\n    5707:\t48 8d 35 67 50 00 00 \tlea    0x5067(%rip),%rsi        # a775 <subzerotest>\n    570e:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    5714:\t4c 89 f7             \tmov    %r14,%rdi\n    5717:\te8 24 53 00 00       \tcallq  aa40 <measureFunction>\n    571c:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    5721:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    5726:\t48 8d 35 50 5f 00 00 \tlea    0x5f50(%rip),%rsi        # b67d <_IO_stdin_used+0x67d>\n    572d:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    5731:\te8 4a b9 ff ff       \tcallq  1080 <__printf_chk@plt>\n    5736:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    573a:\t0f 8f a3 bd ff ff    \tjg     14e3 <main+0x423>\n    5740:\te9 01 d2 ff ff       \tjmpq   2946 <main+0x1886>\n    5745:\t48 8d 35 e7 4e 00 00 \tlea    0x4ee7(%rip),%rsi        # a633 <movzerotest>\n    574c:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    5752:\t4c 89 f7             \tmov    %r14,%rdi\n    5755:\te8 e6 52 00 00       \tcallq  aa40 
<measureFunction>\n    575a:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    575f:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    5764:\t48 8d 35 2a 5f 00 00 \tlea    0x5f2a(%rip),%rsi        # b695 <_IO_stdin_used+0x695>\n    576b:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    576f:\te8 0c b9 ff ff       \tcallq  1080 <__printf_chk@plt>\n    5774:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    5778:\t0f 8f 16 bd ff ff    \tjg     1494 <main+0x3d4>\n    577e:\te9 c3 d1 ff ff       \tjmpq   2946 <main+0x1886>\n    5783:\t48 8d 35 74 4f 00 00 \tlea    0x4f74(%rip),%rsi        # a6fe <xorzerotest>\n    578a:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    5790:\t4c 89 f7             \tmov    %r14,%rdi\n    5793:\te8 a8 52 00 00       \tcallq  aa40 <measureFunction>\n    5798:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    579d:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    57a2:\t48 8d 35 04 5f 00 00 \tlea    0x5f04(%rip),%rsi        # b6ad <_IO_stdin_used+0x6ad>\n    57a9:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    57ad:\te8 ce b8 ff ff       \tcallq  1080 <__printf_chk@plt>\n    57b2:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    57b6:\t0f 8f 89 bc ff ff    \tjg     1445 <main+0x385>\n    57bc:\te9 85 d1 ff ff       \tjmpq   2946 <main+0x1886>\n    57c1:\t48 8d 35 f4 4d 00 00 \tlea    0x4df4(%rip),%rsi        # a5bc <indepmovtest>\n    57c8:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    57ce:\t4c 89 f7             \tmov    %r14,%rdi\n    57d1:\te8 6a 52 00 00       \tcallq  aa40 <measureFunction>\n    57d6:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    57db:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    57e0:\t48 8d 35 41 68 00 00 \tlea    0x6841(%rip),%rsi        # c028 <_IO_stdin_used+0x1028>\n    57e7:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    57eb:\te8 90 b8 ff ff       \tcallq  1080 <__printf_chk@plt>\n    57f0:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    57f4:\t0f 8f fc bb ff ff    \tjg     13f6 <main+0x336>\n    57fa:\te9 47 d1 ff 
ff       \tjmpq   2946 <main+0x1886>\n    57ff:\t48 8d 35 41 4d 00 00 \tlea    0x4d41(%rip),%rsi        # a547 <depmovtest>\n    5806:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    580c:\t4c 89 f7             \tmov    %r14,%rdi\n    580f:\te8 2c 52 00 00       \tcallq  aa40 <measureFunction>\n    5814:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    5819:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    581e:\t48 8d 35 a0 5e 00 00 \tlea    0x5ea0(%rip),%rsi        # b6c5 <_IO_stdin_used+0x6c5>\n    5825:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    5829:\te8 52 b8 ff ff       \tcallq  1080 <__printf_chk@plt>\n    582e:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    5832:\t0f 8f 6f bb ff ff    \tjg     13a7 <main+0x2e7>\n    5838:\te9 09 d1 ff ff       \tjmpq   2946 <main+0x1886>\n    583d:\t48 8d 35 e6 15 00 00 \tlea    0x15e6(%rip),%rsi        # 6e2a <addnoptest>\n    5844:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    584a:\t4c 89 f7             \tmov    %r14,%rdi\n    584d:\te8 ee 51 00 00       \tcallq  aa40 <measureFunction>\n    5852:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    5857:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    585c:\t48 8d 35 80 5e 00 00 \tlea    0x5e80(%rip),%rsi        # b6e3 <_IO_stdin_used+0x6e3>\n    5863:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    5867:\te8 14 b8 ff ff       \tcallq  1080 <__printf_chk@plt>\n    586c:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    5870:\t0f 8f e2 ba ff ff    \tjg     1358 <main+0x298>\n    5876:\te9 cb d0 ff ff       \tjmpq   2946 <main+0x1886>\n    587b:\t4c 8d 3d a8 15 00 00 \tlea    0x15a8(%rip),%r15        # 6e2a <addnoptest>\n    5882:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    5888:\t4c 89 f7             \tmov    %r14,%rdi\n    588b:\t4c 89 fe             \tmov    %r15,%rsi\n    588e:\te8 ad 51 00 00       \tcallq  aa40 <measureFunction>\n    5893:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    5898:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    589d:\t48 8d 35 5c 5e 
00 00 \tlea    0x5e5c(%rip),%rsi        # b700 <_IO_stdin_used+0x700>\n    58a4:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    58a8:\te8 d3 b7 ff ff       \tcallq  1080 <__printf_chk@plt>\n    58ad:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    58b1:\t0f 8f 5b ba ff ff    \tjg     1312 <main+0x252>\n    58b7:\te9 8a d0 ff ff       \tjmpq   2946 <main+0x1886>\n    58bc:\t48 8d 35 db 14 00 00 \tlea    0x14db(%rip),%rsi        # 6d9e <addtest>\n    58c3:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    58c9:\t4c 89 f7             \tmov    %r14,%rdi\n    58cc:\te8 6f 51 00 00       \tcallq  aa40 <measureFunction>\n    58d1:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    58d6:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    58db:\t48 8d 35 3b 5e 00 00 \tlea    0x5e3b(%rip),%rsi        # b71d <_IO_stdin_used+0x71d>\n    58e2:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    58e6:\te8 95 b7 ff ff       \tcallq  1080 <__printf_chk@plt>\n    58eb:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    58ef:\t0f 8f d0 b9 ff ff    \tjg     12c5 <main+0x205>\n    58f5:\te9 4c d0 ff ff       \tjmpq   2946 <main+0x1886>\n    58fa:\t48 8d 35 3e 14 00 00 \tlea    0x143e(%rip),%rsi        # 6d3f <noptest>\n    5901:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    5907:\t4c 89 f7             \tmov    %r14,%rdi\n    590a:\te8 31 51 00 00       \tcallq  aa40 <measureFunction>\n    590f:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    5914:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    5919:\t48 8d 35 11 5e 00 00 \tlea    0x5e11(%rip),%rsi        # b731 <_IO_stdin_used+0x731>\n    5920:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    5924:\te8 57 b7 ff ff       \tcallq  1080 <__printf_chk@plt>\n    5929:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    592d:\t0f 8f 43 b9 ff ff    \tjg     1276 <main+0x1b6>\n    5933:\te9 0e d0 ff ff       \tjmpq   2946 <main+0x1886>\n    5938:\t45 31 ed             \txor    %r13d,%r13d\n    593b:\t48 8d 74 24 10       \tlea    0x10(%rsp),%rsi\n    
5940:\t48 8d 7c 24 20       \tlea    0x20(%rsp),%rdi\n    5945:\te8 06 b7 ff ff       \tcallq  1050 <gettimeofday@plt>\n    594a:\t4c 89 f7             \tmov    %r14,%rdi\n    594d:\te8 f7 12 00 00       \tcallq  6c49 <clktest>\n    5952:\t48 8d 74 24 18       \tlea    0x18(%rsp),%rsi\n    5957:\t48 8d 7c 24 30       \tlea    0x30(%rsp),%rdi\n    595c:\te8 ef b6 ff ff       \tcallq  1050 <gettimeofday@plt>\n    5961:\t48 8b 44 24 38       \tmov    0x38(%rsp),%rax\n    5966:\t48 2b 44 24 28       \tsub    0x28(%rsp),%rax\n    596b:\tbe e8 03 00 00       \tmov    $0x3e8,%esi\n    5970:\t48 99                \tcqto   \n    5972:\t48 8b 4c 24 30       \tmov    0x30(%rsp),%rcx\n    5977:\t48 2b 4c 24 20       \tsub    0x20(%rsp),%rcx\n    597c:\t48 f7 fe             \tidiv   %rsi\n    597f:\t48 69 c9 e8 03 00 00 \timul   $0x3e8,%rcx,%rcx\n    5986:\t48 01 c1             \tadd    %rax,%rcx\n    5989:\t0f 88 03 0b 00 00    \tjs     6492 <main+0x53d2>\n    598f:\t66 0f ef c0          \tpxor   %xmm0,%xmm0\n    5993:\tf3 48 0f 2a c1       \tcvtsi2ss %rcx,%xmm0\n    5998:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    599c:\tf2 0f 59 05 d4 66 00 \tmulsd  0x66d4(%rip),%xmm0        # c078 <_IO_stdin_used+0x1078>\n    59a3:\t00 \n    59a4:\t4d 85 f6             \ttest   %r14,%r14\n    59a7:\t0f 88 cd 0a 00 00    \tjs     647a <main+0x53ba>\n    59ad:\t66 0f ef c9          \tpxor   %xmm1,%xmm1\n    59b1:\tf3 49 0f 2a ce       \tcvtsi2ss %r14,%xmm1\n    59b6:\tf3 0f 5a c9          \tcvtss2sd %xmm1,%xmm1\n    59ba:\tf2 0f 5e c1          \tdivsd  %xmm1,%xmm0\n    59be:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    59c3:\tf3 0f 10 2d bd 66 00 \tmovss  0x66bd(%rip),%xmm5        # c088 <_IO_stdin_used+0x1088>\n    59ca:\t00 \n    59cb:\t48 8d 35 a6 5d 00 00 \tlea    0x5da6(%rip),%rsi        # b778 <_IO_stdin_used+0x778>\n    59d2:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    59d7:\tf3 0f 11 6c 24 0c    \tmovss  %xmm5,0xc(%rsp)\n    59dd:\tf2 0f 5a c0          \tcvtsd2ss 
%xmm0,%xmm0\n    59e1:\tf3 0f 5e e8          \tdivss  %xmm0,%xmm5\n    59e5:\t66 0f ef c0          \tpxor   %xmm0,%xmm0\n    59e9:\tf3 0f 11 6c 24 08    \tmovss  %xmm5,0x8(%rsp)\n    59ef:\tf3 0f 5a c5          \tcvtss2sd %xmm5,%xmm0\n    59f3:\te8 88 b6 ff ff       \tcallq  1080 <__printf_chk@plt>\n    59f8:\t45 85 ed             \ttest   %r13d,%r13d\n    59fb:\t0f 84 8f de ff ff    \tje     3890 <main+0x27d0>\n    5a01:\t48 8d 35 ef 31 00 00 \tlea    0x31ef(%rip),%rsi        # 8bf7 <fma512>\n    5a08:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    5a0e:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    5a13:\te8 28 50 00 00       \tcallq  aa40 <measureFunction>\n    5a18:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    5a1d:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    5a22:\t48 8d 35 34 56 00 00 \tlea    0x5634(%rip),%rsi        # b05d <_IO_stdin_used+0x5d>\n    5a29:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    5a2d:\te8 4e b6 ff ff       \tcallq  1080 <__printf_chk@plt>\n    5a32:\t48 8d 35 ff 3b 00 00 \tlea    0x3bff(%rip),%rsi        # 9638 <latfma512>\n    5a39:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    5a3f:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    5a44:\te8 f7 4f 00 00       \tcallq  aa40 <measureFunction>\n    5a49:\tf3 0f 10 7c 24 0c    \tmovss  0xc(%rsp),%xmm7\n    5a4f:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    5a54:\t48 8d 35 45 5d 00 00 \tlea    0x5d45(%rip),%rsi        # b7a0 <_IO_stdin_used+0x7a0>\n    5a5b:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    5a60:\tf3 0f 5e f8          \tdivss  %xmm0,%xmm7\n    5a64:\t66 0f ef c0          \tpxor   %xmm0,%xmm0\n    5a68:\tf3 0f 5a c7          \tcvtss2sd %xmm7,%xmm0\n    5a6c:\te8 0f b6 ff ff       \tcallq  1080 <__printf_chk@plt>\n    5a71:\t48 8d 35 60 32 00 00 \tlea    0x3260(%rip),%rsi        # 8cd8 <mixfma256fma512>\n    5a78:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    5a7e:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    5a83:\te8 b8 4f 00 00      
 \tcallq  aa40 <measureFunction>\n    5a88:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    5a8d:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    5a92:\t48 8d 35 2f 5d 00 00 \tlea    0x5d2f(%rip),%rsi        # b7c8 <_IO_stdin_used+0x7c8>\n    5a99:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    5a9d:\te8 de b5 ff ff       \tcallq  1080 <__printf_chk@plt>\n    5aa2:\t48 8d 35 20 3a 00 00 \tlea    0x3a20(%rip),%rsi        # 94c9 <nemesfpu512mix21>\n    5aa9:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    5aaf:\tbf 05 7a d7 03       \tmov    $0x3d77a05,%edi\n    5ab4:\t48 c1 e7 09          \tshl    $0x9,%rdi\n    5ab8:\te8 83 4f 00 00       \tcallq  aa40 <measureFunction>\n    5abd:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    5ac2:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    5ac7:\t48 8d 35 22 5d 00 00 \tlea    0x5d22(%rip),%rsi        # b7f0 <_IO_stdin_used+0x7f0>\n    5ace:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    5ad2:\te8 a9 b5 ff ff       \tcallq  1080 <__printf_chk@plt>\n    5ad7:\t48 8d 35 af 1f 00 00 \tlea    0x1faf(%rip),%rsi        # 7a8d <add512int>\n    5ade:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    5ae4:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    5ae9:\te8 52 4f 00 00       \tcallq  aa40 <measureFunction>\n    5aee:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    5af3:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    5af8:\t48 8d 35 21 5d 00 00 \tlea    0x5d21(%rip),%rsi        # b820 <_IO_stdin_used+0x820>\n    5aff:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    5b03:\te8 78 b5 ff ff       \tcallq  1080 <__printf_chk@plt>\n    5b08:\t48 8d 35 a1 23 00 00 \tlea    0x23a1(%rip),%rsi        # 7eb0 <latadd256int>\n    5b0f:\t4c 89 f7             \tmov    %r14,%rdi\n    5b12:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    5b18:\te8 23 4f 00 00       \tcallq  aa40 <measureFunction>\n    5b1d:\tf3 0f 10 7c 24 0c    \tmovss  0xc(%rsp),%xmm7\n    5b23:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    5b28:\t48 8d 35 11 5d 00 
00 \tlea    0x5d11(%rip),%rsi        # b840 <_IO_stdin_used+0x840>\n    5b2f:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    5b34:\tf3 0f 5e f8          \tdivss  %xmm0,%xmm7\n    5b38:\t66 0f ef c0          \tpxor   %xmm0,%xmm0\n    5b3c:\tf3 0f 5a c7          \tcvtss2sd %xmm7,%xmm0\n    5b40:\te8 3b b5 ff ff       \tcallq  1080 <__printf_chk@plt>\n    5b45:\t48 8d 35 c5 1d 00 00 \tlea    0x1dc5(%rip),%rsi        # 7911 <mul512int>\n    5b4c:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    5b52:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    5b57:\te8 e4 4e 00 00       \tcallq  aa40 <measureFunction>\n    5b5c:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    5b61:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    5b66:\t48 8d 35 fb 5c 00 00 \tlea    0x5cfb(%rip),%rsi        # b868 <_IO_stdin_used+0x868>\n    5b6d:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    5b71:\te8 0a b5 ff ff       \tcallq  1080 <__printf_chk@plt>\n    5b76:\t48 8d 35 52 1e 00 00 \tlea    0x1e52(%rip),%rsi        # 79cf <muldq512int>\n    5b7d:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    5b83:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    5b88:\te8 b3 4e 00 00       \tcallq  aa40 <measureFunction>\n    5b8d:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    5b92:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    5b97:\t48 8d 35 f2 5c 00 00 \tlea    0x5cf2(%rip),%rsi        # b890 <_IO_stdin_used+0x890>\n    5b9e:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    5ba2:\te8 d9 b4 ff ff       \tcallq  1080 <__printf_chk@plt>\n    5ba7:\t48 8d 35 1b 26 00 00 \tlea    0x261b(%rip),%rsi        # 81c9 <latmulq512int>\n    5bae:\t4c 89 f7             \tmov    %r14,%rdi\n    5bb1:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    5bb7:\te8 84 4e 00 00       \tcallq  aa40 <measureFunction>\n    5bbc:\tf3 0f 10 6c 24 0c    \tmovss  0xc(%rsp),%xmm5\n    5bc2:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    5bc7:\t48 8d 35 f2 5c 00 00 \tlea    0x5cf2(%rip),%rsi        # b8c0 
<_IO_stdin_used+0x8c0>\n    5bce:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    5bd3:\tf3 0f 5e e8          \tdivss  %xmm0,%xmm5\n    5bd7:\t66 0f ef c0          \tpxor   %xmm0,%xmm0\n    5bdb:\tf3 0f 5a c5          \tcvtss2sd %xmm5,%xmm0\n    5bdf:\te8 9c b4 ff ff       \tcallq  1080 <__printf_chk@plt>\n    5be4:\t48 8d 35 3a 24 00 00 \tlea    0x243a(%rip),%rsi        # 8025 <latmul512int>\n    5beb:\t4c 89 f7             \tmov    %r14,%rdi\n    5bee:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    5bf4:\te8 47 4e 00 00       \tcallq  aa40 <measureFunction>\n    5bf9:\tf3 0f 10 7c 24 0c    \tmovss  0xc(%rsp),%xmm7\n    5bff:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    5c04:\t48 8d 35 e5 5c 00 00 \tlea    0x5ce5(%rip),%rsi        # b8f0 <_IO_stdin_used+0x8f0>\n    5c0b:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    5c10:\tf3 0f 5e f8          \tdivss  %xmm0,%xmm7\n    5c14:\t66 0f ef c0          \tpxor   %xmm0,%xmm0\n    5c18:\tf3 0f 5a c7          \tcvtss2sd %xmm7,%xmm0\n    5c1c:\te8 5f b4 ff ff       \tcallq  1080 <__printf_chk@plt>\n    5c21:\t48 8d 35 cf 24 00 00 \tlea    0x24cf(%rip),%rsi        # 80f7 <latmuldq512int>\n    5c28:\t4c 89 f7             \tmov    %r14,%rdi\n    5c2b:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    5c31:\te8 0a 4e 00 00       \tcallq  aa40 <measureFunction>\n    5c36:\tf3 0f 10 7c 24 0c    \tmovss  0xc(%rsp),%xmm7\n    5c3c:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    5c41:\t48 8d 35 d8 5c 00 00 \tlea    0x5cd8(%rip),%rsi        # b920 <_IO_stdin_used+0x920>\n    5c48:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    5c4d:\tf3 0f 5e f8          \tdivss  %xmm0,%xmm7\n    5c51:\t66 0f ef c0          \tpxor   %xmm0,%xmm0\n    5c55:\tf3 0f 5a c7          \tcvtss2sd %xmm7,%xmm0\n    5c59:\te8 22 b4 ff ff       \tcallq  1080 <__printf_chk@plt>\n    5c5e:\t48 8d 35 bd 33 00 00 \tlea    0x33bd(%rip),%rsi        # 9022 <mixfmaadd512>\n    5c65:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    5c6b:\tbf 05 7a d7 03       \tmov  
  $0x3d77a05,%edi\n    5c70:\t48 c1 e7 09          \tshl    $0x9,%rdi\n    5c74:\te8 c7 4d 00 00       \tcallq  aa40 <measureFunction>\n    5c79:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    5c7e:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    5c83:\t48 8d 35 ce 5c 00 00 \tlea    0x5cce(%rip),%rsi        # b958 <_IO_stdin_used+0x958>\n    5c8a:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    5c8e:\te8 ed b3 ff ff       \tcallq  1080 <__printf_chk@plt>\n    5c93:\t48 8d 35 69 34 00 00 \tlea    0x3469(%rip),%rsi        # 9103 <mixfma512add256>\n    5c9a:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    5ca0:\tbf 05 7a d7 03       \tmov    $0x3d77a05,%edi\n    5ca5:\t48 c1 e7 09          \tshl    $0x9,%rdi\n    5ca9:\te8 92 4d 00 00       \tcallq  aa40 <measureFunction>\n    5cae:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    5cb3:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    5cb8:\t48 8d 35 c1 5c 00 00 \tlea    0x5cc1(%rip),%rsi        # b980 <_IO_stdin_used+0x980>\n    5cbf:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    5cc3:\te8 b8 b3 ff ff       \tcallq  1080 <__printf_chk@plt>\n    5cc8:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    5cce:\t48 8d 35 db 4c 00 00 \tlea    0x4cdb(%rip),%rsi        # a9b0 <load512wrapper>\n    5cd5:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    5cda:\te8 61 4d 00 00       \tcallq  aa40 <measureFunction>\n    5cdf:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    5ce4:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    5ce9:\t48 8d 35 88 53 00 00 \tlea    0x5388(%rip),%rsi        # b078 <_IO_stdin_used+0x78>\n    5cf0:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    5cf4:\te8 87 b3 ff ff       \tcallq  1080 <__printf_chk@plt>\n    5cf9:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    5cff:\t48 8d 35 fa 4c 00 00 \tlea    0x4cfa(%rip),%rsi        # aa00 <store512wrapper>\n    5d06:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    5d0b:\te8 30 4d 00 00       \tcallq  aa40 <measureFunction>\n    5d10:\tbf 01 00 00 
00       \tmov    $0x1,%edi\n    5d15:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    5d1a:\t48 8d 35 74 53 00 00 \tlea    0x5374(%rip),%rsi        # b095 <_IO_stdin_used+0x95>\n    5d21:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    5d25:\te8 56 b3 ff ff       \tcallq  1080 <__printf_chk@plt>\n    5d2a:\t48 8d 35 ee 26 00 00 \tlea    0x26ee(%rip),%rsi        # 841f <aesenc128>\n    5d31:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    5d37:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    5d3c:\te8 ff 4c 00 00       \tcallq  aa40 <measureFunction>\n    5d41:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    5d46:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    5d4b:\t48 8d 35 61 53 00 00 \tlea    0x5361(%rip),%rsi        # b0b3 <_IO_stdin_used+0xb3>\n    5d52:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    5d56:\te8 25 b3 ff ff       \tcallq  1080 <__printf_chk@plt>\n    5d5b:\t48 8d 35 21 2a 00 00 \tlea    0x2a21(%rip),%rsi        # 8783 <aesdec128>\n    5d62:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    5d68:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    5d6d:\te8 ce 4c 00 00       \tcallq  aa40 <measureFunction>\n    5d72:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    5d77:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    5d7c:\t48 8d 35 46 53 00 00 \tlea    0x5346(%rip),%rsi        # b0c9 <_IO_stdin_used+0xc9>\n    5d83:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    5d87:\te8 f4 b2 ff ff       \tcallq  1080 <__printf_chk@plt>\n    5d8c:\t48 8d 35 26 27 00 00 \tlea    0x2726(%rip),%rsi        # 84b9 <aesencadd128>\n    5d93:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    5d99:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    5d9e:\te8 9d 4c 00 00       \tcallq  aa40 <measureFunction>\n    5da3:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    5da8:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    5dad:\t48 8d 35 fc 5b 00 00 \tlea    0x5bfc(%rip),%rsi        # b9b0 <_IO_stdin_used+0x9b0>\n    5db4:\tf3 0f 5a c0          \tcvtss2sd 
%xmm0,%xmm0\n    5db8:\te8 c3 b2 ff ff       \tcallq  1080 <__printf_chk@plt>\n    5dbd:\t48 8d 35 af 27 00 00 \tlea    0x27af(%rip),%rsi        # 8573 <aesencfma128>\n    5dc4:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    5dca:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    5dcf:\te8 6c 4c 00 00       \tcallq  aa40 <measureFunction>\n    5dd4:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    5dd9:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    5dde:\t48 8d 35 fa 52 00 00 \tlea    0x52fa(%rip),%rsi        # b0df <_IO_stdin_used+0xdf>\n    5de5:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    5de9:\te8 92 b2 ff ff       \tcallq  1080 <__printf_chk@plt>\n    5dee:\t48 8d 35 f8 28 00 00 \tlea    0x28f8(%rip),%rsi        # 86ed <aesencmul128>\n    5df5:\tf3 0f 10 44 24 08    \tmovss  0x8(%rsp),%xmm0\n    5dfb:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    5e00:\te8 3b 4c 00 00       \tcallq  aa40 <measureFunction>\n    5e05:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    5e0a:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    5e0f:\t48 8d 35 ba 5b 00 00 \tlea    0x5bba(%rip),%rsi        # b9d0 <_IO_stdin_used+0x9d0>\n    5e16:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    5e1a:\te8 61 b2 ff ff       \tcallq  1080 <__printf_chk@plt>\n    5e1f:\te9 31 da ff ff       \tjmpq   3855 <main+0x2795>\n    5e24:\tba 0c 00 00 00       \tmov    $0xc,%edx\n    5e29:\t48 8d 35 88 53 00 00 \tlea    0x5388(%rip),%rsi        # b1b8 <_IO_stdin_used+0x1b8>\n    5e30:\t4c 89 ef             \tmov    %r13,%rdi\n    5e33:\te8 f8 b1 ff ff       \tcallq  1030 <strncmp@plt>\n    5e38:\t85 c0                \ttest   %eax,%eax\n    5e3a:\t0f 85 2b cb ff ff    \tjne    296b <main+0x18ab>\n    5e40:\t48 8d 35 a6 28 00 00 \tlea    0x28a6(%rip),%rsi        # 86ed <aesencmul128>\n    5e47:\tf3 0f 10 05 39 62 00 \tmovss  0x6239(%rip),%xmm0        # c088 <_IO_stdin_used+0x1088>\n    5e4e:\t00 \n    5e4f:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    5e54:\te8 e7 4b 00 00       
\tcallq  aa40 <measureFunction>\n    5e59:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    5e5e:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    5e63:\t48 8d 35 66 5b 00 00 \tlea    0x5b66(%rip),%rsi        # b9d0 <_IO_stdin_used+0x9d0>\n    5e6a:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    5e6e:\te8 0d b2 ff ff       \tcallq  1080 <__printf_chk@plt>\n    5e73:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    5e77:\t0f 8f a4 d9 ff ff    \tjg     3821 <main+0x2761>\n    5e7d:\te9 c4 ca ff ff       \tjmpq   2946 <main+0x1886>\n    5e82:\tba 0c 00 00 00       \tmov    $0xc,%edx\n    5e87:\t48 8d 35 1d 53 00 00 \tlea    0x531d(%rip),%rsi        # b1ab <_IO_stdin_used+0x1ab>\n    5e8e:\t4c 89 ef             \tmov    %r13,%rdi\n    5e91:\te8 9a b1 ff ff       \tcallq  1030 <strncmp@plt>\n    5e96:\t85 c0                \ttest   %eax,%eax\n    5e98:\t75 8a                \tjne    5e24 <main+0x4d64>\n    5e9a:\t48 8d 35 d2 26 00 00 \tlea    0x26d2(%rip),%rsi        # 8573 <aesencfma128>\n    5ea1:\tf3 0f 10 05 df 61 00 \tmovss  0x61df(%rip),%xmm0        # c088 <_IO_stdin_used+0x1088>\n    5ea8:\t00 \n    5ea9:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    5eae:\te8 8d 4b 00 00       \tcallq  aa40 <measureFunction>\n    5eb3:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    5eb8:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    5ebd:\t48 8d 35 1b 52 00 00 \tlea    0x521b(%rip),%rsi        # b0df <_IO_stdin_used+0xdf>\n    5ec4:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    5ec8:\te8 b3 b1 ff ff       \tcallq  1080 <__printf_chk@plt>\n    5ecd:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    5ed1:\t0f 8f f7 d8 ff ff    \tjg     37ce <main+0x270e>\n    5ed7:\te9 6a ca ff ff       \tjmpq   2946 <main+0x1886>\n    5edc:\tba 0c 00 00 00       \tmov    $0xc,%edx\n    5ee1:\t48 8d 35 b6 52 00 00 \tlea    0x52b6(%rip),%rsi        # b19e <_IO_stdin_used+0x19e>\n    5ee8:\t4c 89 ef             \tmov    %r13,%rdi\n    5eeb:\te8 40 b1 ff ff       \tcallq  1030 <strncmp@plt>\n    
5ef0:\t85 c0                \ttest   %eax,%eax\n    5ef2:\t75 8e                \tjne    5e82 <main+0x4dc2>\n    5ef4:\t48 8d 35 be 25 00 00 \tlea    0x25be(%rip),%rsi        # 84b9 <aesencadd128>\n    5efb:\tf3 0f 10 05 85 61 00 \tmovss  0x6185(%rip),%xmm0        # c088 <_IO_stdin_used+0x1088>\n    5f02:\t00 \n    5f03:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    5f08:\te8 33 4b 00 00       \tcallq  aa40 <measureFunction>\n    5f0d:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    5f12:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    5f17:\t48 8d 35 92 5a 00 00 \tlea    0x5a92(%rip),%rsi        # b9b0 <_IO_stdin_used+0x9b0>\n    5f1e:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    5f22:\te8 59 b1 ff ff       \tcallq  1080 <__printf_chk@plt>\n    5f27:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    5f2b:\t0f 8f 4a d8 ff ff    \tjg     377b <main+0x26bb>\n    5f31:\te9 10 ca ff ff       \tjmpq   2946 <main+0x1886>\n    5f36:\tba 09 00 00 00       \tmov    $0x9,%edx\n    5f3b:\t48 8d 35 52 52 00 00 \tlea    0x5252(%rip),%rsi        # b194 <_IO_stdin_used+0x194>\n    5f42:\t4c 89 ef             \tmov    %r13,%rdi\n    5f45:\te8 e6 b0 ff ff       \tcallq  1030 <strncmp@plt>\n    5f4a:\t85 c0                \ttest   %eax,%eax\n    5f4c:\t75 8e                \tjne    5edc <main+0x4e1c>\n    5f4e:\t48 8d 35 2e 28 00 00 \tlea    0x282e(%rip),%rsi        # 8783 <aesdec128>\n    5f55:\tf3 0f 10 05 2b 61 00 \tmovss  0x612b(%rip),%xmm0        # c088 <_IO_stdin_used+0x1088>\n    5f5c:\t00 \n    5f5d:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    5f62:\te8 d9 4a 00 00       \tcallq  aa40 <measureFunction>\n    5f67:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    5f6c:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    5f71:\t48 8d 35 51 51 00 00 \tlea    0x5151(%rip),%rsi        # b0c9 <_IO_stdin_used+0xc9>\n    5f78:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    5f7c:\te8 ff b0 ff ff       \tcallq  1080 <__printf_chk@plt>\n    5f81:\t41 83 fc 01          \tcmp    
$0x1,%r12d\n    5f85:\t0f 8f 9d d7 ff ff    \tjg     3728 <main+0x2668>\n    5f8b:\te9 b6 c9 ff ff       \tjmpq   2946 <main+0x1886>\n    5f90:\tba 09 00 00 00       \tmov    $0x9,%edx\n    5f95:\t48 8d 35 ee 51 00 00 \tlea    0x51ee(%rip),%rsi        # b18a <_IO_stdin_used+0x18a>\n    5f9c:\t4c 89 ef             \tmov    %r13,%rdi\n    5f9f:\te8 8c b0 ff ff       \tcallq  1030 <strncmp@plt>\n    5fa4:\t85 c0                \ttest   %eax,%eax\n    5fa6:\t75 8e                \tjne    5f36 <main+0x4e76>\n    5fa8:\t48 8d 35 70 24 00 00 \tlea    0x2470(%rip),%rsi        # 841f <aesenc128>\n    5faf:\tf3 0f 10 05 d1 60 00 \tmovss  0x60d1(%rip),%xmm0        # c088 <_IO_stdin_used+0x1088>\n    5fb6:\t00 \n    5fb7:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    5fbc:\te8 7f 4a 00 00       \tcallq  aa40 <measureFunction>\n    5fc1:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    5fc6:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    5fcb:\t48 8d 35 e1 50 00 00 \tlea    0x50e1(%rip),%rsi        # b0b3 <_IO_stdin_used+0xb3>\n    5fd2:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    5fd6:\te8 a5 b0 ff ff       \tcallq  1080 <__printf_chk@plt>\n    5fdb:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    5fdf:\t0f 8f f0 d6 ff ff    \tjg     36d5 <main+0x2615>\n    5fe5:\te9 5c c9 ff ff       \tjmpq   2946 <main+0x1886>\n    5fea:\tba 07 00 00 00       \tmov    $0x7,%edx\n    5fef:\t48 8d 35 8b 51 00 00 \tlea    0x518b(%rip),%rsi        # b181 <_IO_stdin_used+0x181>\n    5ff6:\t4c 89 ef             \tmov    %r13,%rdi\n    5ff9:\te8 32 b0 ff ff       \tcallq  1030 <strncmp@plt>\n    5ffe:\t85 c0                \ttest   %eax,%eax\n    6000:\t75 8e                \tjne    5f90 <main+0x4ed0>\n    6002:\tf3 0f 10 05 7e 60 00 \tmovss  0x607e(%rip),%xmm0        # c088 <_IO_stdin_used+0x1088>\n    6009:\t00 \n    600a:\t48 8d 35 ef 49 00 00 \tlea    0x49ef(%rip),%rsi        # aa00 <store512wrapper>\n    6011:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    6016:\te8 25 4a 00 00   
    \tcallq  aa40 <measureFunction>\n    601b:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    6020:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    6025:\t48 8d 35 69 50 00 00 \tlea    0x5069(%rip),%rsi        # b095 <_IO_stdin_used+0x95>\n    602c:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    6030:\te8 4b b0 ff ff       \tcallq  1080 <__printf_chk@plt>\n    6035:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    6039:\t0f 8f 43 d6 ff ff    \tjg     3682 <main+0x25c2>\n    603f:\te9 02 c9 ff ff       \tjmpq   2946 <main+0x1886>\n    6044:\tba 07 00 00 00       \tmov    $0x7,%edx\n    6049:\t48 8d 35 29 51 00 00 \tlea    0x5129(%rip),%rsi        # b179 <_IO_stdin_used+0x179>\n    6050:\t4c 89 ef             \tmov    %r13,%rdi\n    6053:\te8 d8 af ff ff       \tcallq  1030 <strncmp@plt>\n    6058:\t85 c0                \ttest   %eax,%eax\n    605a:\t75 8e                \tjne    5fea <main+0x4f2a>\n    605c:\tf3 0f 10 05 24 60 00 \tmovss  0x6024(%rip),%xmm0        # c088 <_IO_stdin_used+0x1088>\n    6063:\t00 \n    6064:\t48 8d 35 45 49 00 00 \tlea    0x4945(%rip),%rsi        # a9b0 <load512wrapper>\n    606b:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    6070:\te8 cb 49 00 00       \tcallq  aa40 <measureFunction>\n    6075:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    607a:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    607f:\t48 8d 35 f2 4f 00 00 \tlea    0x4ff2(%rip),%rsi        # b078 <_IO_stdin_used+0x78>\n    6086:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    608a:\te8 f1 af ff ff       \tcallq  1080 <__printf_chk@plt>\n    608f:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    6093:\t0f 8f 96 d5 ff ff    \tjg     362f <main+0x256f>\n    6099:\te9 a8 c8 ff ff       \tjmpq   2946 <main+0x1886>\n    609e:\tba 0b 00 00 00       \tmov    $0xb,%edx\n    60a3:\t48 8d 35 bf 50 00 00 \tlea    0x50bf(%rip),%rsi        # b169 <_IO_stdin_used+0x169>\n    60aa:\t4c 89 ef             \tmov    %r13,%rdi\n    60ad:\te8 7e af ff ff       \tcallq  1030 <strncmp@plt>\n  
  60b2:\t85 c0                \ttest   %eax,%eax\n    60b4:\t75 8e                \tjne    6044 <main+0x4f84>\n    60b6:\t48 8d 35 46 30 00 00 \tlea    0x3046(%rip),%rsi        # 9103 <mixfma512add256>\n    60bd:\tf3 0f 10 05 c3 5f 00 \tmovss  0x5fc3(%rip),%xmm0        # c088 <_IO_stdin_used+0x1088>\n    60c4:\t00 \n    60c5:\tbf 05 7a d7 03       \tmov    $0x3d77a05,%edi\n    60ca:\t48 c1 e7 09          \tshl    $0x9,%rdi\n    60ce:\te8 6d 49 00 00       \tcallq  aa40 <measureFunction>\n    60d3:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    60d8:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    60dd:\t48 8d 35 9c 58 00 00 \tlea    0x589c(%rip),%rsi        # b980 <_IO_stdin_used+0x980>\n    60e4:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    60e8:\te8 93 af ff ff       \tcallq  1080 <__printf_chk@plt>\n    60ed:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    60f1:\t0f 8f e5 d4 ff ff    \tjg     35dc <main+0x251c>\n    60f7:\te9 4a c8 ff ff       \tjmpq   2946 <main+0x1886>\n    60fc:\tba 0b 00 00 00       \tmov    $0xb,%edx\n    6101:\t48 8d 35 54 50 00 00 \tlea    0x5054(%rip),%rsi        # b15c <_IO_stdin_used+0x15c>\n    6108:\t4c 89 ef             \tmov    %r13,%rdi\n    610b:\te8 20 af ff ff       \tcallq  1030 <strncmp@plt>\n    6110:\t85 c0                \ttest   %eax,%eax\n    6112:\t75 8a                \tjne    609e <main+0x4fde>\n    6114:\t48 8d 35 07 2f 00 00 \tlea    0x2f07(%rip),%rsi        # 9022 <mixfmaadd512>\n    611b:\tf3 0f 10 05 65 5f 00 \tmovss  0x5f65(%rip),%xmm0        # c088 <_IO_stdin_used+0x1088>\n    6122:\t00 \n    6123:\tbf 05 7a d7 03       \tmov    $0x3d77a05,%edi\n    6128:\t48 c1 e7 09          \tshl    $0x9,%rdi\n    612c:\te8 0f 49 00 00       \tcallq  aa40 <measureFunction>\n    6131:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    6136:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    613b:\t48 8d 35 16 58 00 00 \tlea    0x5816(%rip),%rsi        # b958 <_IO_stdin_used+0x958>\n    6142:\tf3 0f 5a c0          \tcvtss2sd 
%xmm0,%xmm0\n    6146:\te8 35 af ff ff       \tcallq  1080 <__printf_chk@plt>\n    614b:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    614f:\t0f 8f 30 d4 ff ff    \tjg     3585 <main+0x24c5>\n    6155:\te9 ec c7 ff ff       \tjmpq   2946 <main+0x1886>\n    615a:\tba 0d 00 00 00       \tmov    $0xd,%edx\n    615f:\t48 8d 35 e7 4f 00 00 \tlea    0x4fe7(%rip),%rsi        # b14d <_IO_stdin_used+0x14d>\n    6166:\t4c 89 ef             \tmov    %r13,%rdi\n    6169:\te8 c2 ae ff ff       \tcallq  1030 <strncmp@plt>\n    616e:\t85 c0                \ttest   %eax,%eax\n    6170:\t75 8a                \tjne    60fc <main+0x503c>\n    6172:\t48 8d 35 7e 1f 00 00 \tlea    0x1f7e(%rip),%rsi        # 80f7 <latmuldq512int>\n    6179:\tf3 0f 10 05 07 5f 00 \tmovss  0x5f07(%rip),%xmm0        # c088 <_IO_stdin_used+0x1088>\n    6180:\t00 \n    6181:\t4c 89 f7             \tmov    %r14,%rdi\n    6184:\te8 b7 48 00 00       \tcallq  aa40 <measureFunction>\n    6189:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    618e:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    6193:\tf3 0f 10 0d ed 5e 00 \tmovss  0x5eed(%rip),%xmm1        # c088 <_IO_stdin_used+0x1088>\n    619a:\t00 \n    619b:\t48 8d 35 7e 57 00 00 \tlea    0x577e(%rip),%rsi        # b920 <_IO_stdin_used+0x920>\n    61a2:\tf3 0f 5e c8          \tdivss  %xmm0,%xmm1\n    61a6:\t66 0f ef c0          \tpxor   %xmm0,%xmm0\n    61aa:\tf3 0f 5a c1          \tcvtss2sd %xmm1,%xmm0\n    61ae:\te8 cd ae ff ff       \tcallq  1080 <__printf_chk@plt>\n    61b3:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    61b7:\t0f 8f 71 d3 ff ff    \tjg     352e <main+0x246e>\n    61bd:\te9 84 c7 ff ff       \tjmpq   2946 <main+0x1886>\n    61c2:\tba 0c 00 00 00       \tmov    $0xc,%edx\n    61c7:\t48 8d 35 72 4f 00 00 \tlea    0x4f72(%rip),%rsi        # b140 <_IO_stdin_used+0x140>\n    61ce:\t4c 89 ef             \tmov    %r13,%rdi\n    61d1:\te8 5a ae ff ff       \tcallq  1030 <strncmp@plt>\n    61d6:\t85 c0                \ttest   %eax,%eax\n    
61d8:\t75 80                \tjne    615a <main+0x509a>\n    61da:\t48 8d 35 44 1e 00 00 \tlea    0x1e44(%rip),%rsi        # 8025 <latmul512int>\n    61e1:\tf3 0f 10 05 9f 5e 00 \tmovss  0x5e9f(%rip),%xmm0        # c088 <_IO_stdin_used+0x1088>\n    61e8:\t00 \n    61e9:\t4c 89 f7             \tmov    %r14,%rdi\n    61ec:\te8 4f 48 00 00       \tcallq  aa40 <measureFunction>\n    61f1:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    61f6:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    61fb:\tf3 0f 10 3d 85 5e 00 \tmovss  0x5e85(%rip),%xmm7        # c088 <_IO_stdin_used+0x1088>\n    6202:\t00 \n    6203:\t48 8d 35 e6 56 00 00 \tlea    0x56e6(%rip),%rsi        # b8f0 <_IO_stdin_used+0x8f0>\n    620a:\tf3 0f 11 7c 24 0c    \tmovss  %xmm7,0xc(%rsp)\n    6210:\tf3 0f 5e f8          \tdivss  %xmm0,%xmm7\n    6214:\t66 0f ef c0          \tpxor   %xmm0,%xmm0\n    6218:\tf3 0f 5a c7          \tcvtss2sd %xmm7,%xmm0\n    621c:\te8 5f ae ff ff       \tcallq  1080 <__printf_chk@plt>\n    6221:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    6225:\t0f 8f a4 d2 ff ff    \tjg     34cf <main+0x240f>\n    622b:\te9 16 c7 ff ff       \tjmpq   2946 <main+0x1886>\n    6230:\tba 0d 00 00 00       \tmov    $0xd,%edx\n    6235:\t48 8d 35 f6 4e 00 00 \tlea    0x4ef6(%rip),%rsi        # b132 <_IO_stdin_used+0x132>\n    623c:\t4c 89 ef             \tmov    %r13,%rdi\n    623f:\te8 ec ad ff ff       \tcallq  1030 <strncmp@plt>\n    6244:\t85 c0                \ttest   %eax,%eax\n    6246:\t0f 85 76 ff ff ff    \tjne    61c2 <main+0x5102>\n    624c:\t48 8d 35 76 1f 00 00 \tlea    0x1f76(%rip),%rsi        # 81c9 <latmulq512int>\n    6253:\tf3 0f 10 05 2d 5e 00 \tmovss  0x5e2d(%rip),%xmm0        # c088 <_IO_stdin_used+0x1088>\n    625a:\t00 \n    625b:\t4c 89 f7             \tmov    %r14,%rdi\n    625e:\te8 dd 47 00 00       \tcallq  aa40 <measureFunction>\n    6263:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    6268:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    626d:\tf3 0f 10 3d 13 5e 00 \tmovss  
0x5e13(%rip),%xmm7        # c088 <_IO_stdin_used+0x1088>\n    6274:\t00 \n    6275:\t48 8d 35 44 56 00 00 \tlea    0x5644(%rip),%rsi        # b8c0 <_IO_stdin_used+0x8c0>\n    627c:\tf3 0f 11 7c 24 0c    \tmovss  %xmm7,0xc(%rsp)\n    6282:\tf3 0f 5e f8          \tdivss  %xmm0,%xmm7\n    6286:\t66 0f ef c0          \tpxor   %xmm0,%xmm0\n    628a:\tf3 0f 5a c7          \tcvtss2sd %xmm7,%xmm0\n    628e:\te8 ed ad ff ff       \tcallq  1080 <__printf_chk@plt>\n    6293:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    6297:\t0f 8f d3 d1 ff ff    \tjg     3470 <main+0x23b0>\n    629d:\te9 a4 c6 ff ff       \tjmpq   2946 <main+0x1886>\n    62a2:\tba 09 00 00 00       \tmov    $0x9,%edx\n    62a7:\t48 8d 35 a2 4e 00 00 \tlea    0x4ea2(%rip),%rsi        # b150 <_IO_stdin_used+0x150>\n    62ae:\t4c 89 ef             \tmov    %r13,%rdi\n    62b1:\te8 7a ad ff ff       \tcallq  1030 <strncmp@plt>\n    62b6:\t85 c0                \ttest   %eax,%eax\n    62b8:\t0f 85 72 ff ff ff    \tjne    6230 <main+0x5170>\n    62be:\t48 8d 35 0a 17 00 00 \tlea    0x170a(%rip),%rsi        # 79cf <muldq512int>\n    62c5:\tf3 0f 10 05 bb 5d 00 \tmovss  0x5dbb(%rip),%xmm0        # c088 <_IO_stdin_used+0x1088>\n    62cc:\t00 \n    62cd:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    62d2:\te8 69 47 00 00       \tcallq  aa40 <measureFunction>\n    62d7:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    62dc:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    62e1:\t48 8d 35 a8 55 00 00 \tlea    0x55a8(%rip),%rsi        # b890 <_IO_stdin_used+0x890>\n    62e8:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    62ec:\te8 8f ad ff ff       \tcallq  1080 <__printf_chk@plt>\n    62f1:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    62f5:\t0f 8f 0e d1 ff ff    \tjg     3409 <main+0x2349>\n    62fb:\te9 46 c6 ff ff       \tjmpq   2946 <main+0x1886>\n    6300:\tba 09 00 00 00       \tmov    $0x9,%edx\n    6305:\t48 8d 35 37 4e 00 00 \tlea    0x4e37(%rip),%rsi        # b143 <_IO_stdin_used+0x143>\n    630c:\t4c 89 
ef             \tmov    %r13,%rdi\n    630f:\te8 1c ad ff ff       \tcallq  1030 <strncmp@plt>\n    6314:\t85 c0                \ttest   %eax,%eax\n    6316:\t75 8a                \tjne    62a2 <main+0x51e2>\n    6318:\t48 8d 35 f2 15 00 00 \tlea    0x15f2(%rip),%rsi        # 7911 <mul512int>\n    631f:\tf3 0f 10 05 61 5d 00 \tmovss  0x5d61(%rip),%xmm0        # c088 <_IO_stdin_used+0x1088>\n    6326:\t00 \n    6327:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    632c:\te8 0f 47 00 00       \tcallq  aa40 <measureFunction>\n    6331:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    6336:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    633b:\t48 8d 35 26 55 00 00 \tlea    0x5526(%rip),%rsi        # b868 <_IO_stdin_used+0x868>\n    6342:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    6346:\te8 35 ad ff ff       \tcallq  1080 <__printf_chk@plt>\n    634b:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    634f:\t0f 8f 61 d0 ff ff    \tjg     33b6 <main+0x22f6>\n    6355:\te9 ec c5 ff ff       \tjmpq   2946 <main+0x1886>\n    635a:\tba 0c 00 00 00       \tmov    $0xc,%edx\n    635f:\t48 8d 35 bf 4d 00 00 \tlea    0x4dbf(%rip),%rsi        # b125 <_IO_stdin_used+0x125>\n    6366:\t4c 89 ef             \tmov    %r13,%rdi\n    6369:\te8 c2 ac ff ff       \tcallq  1030 <strncmp@plt>\n    636e:\t85 c0                \ttest   %eax,%eax\n    6370:\t75 8e                \tjne    6300 <main+0x5240>\n    6372:\t48 8d 35 37 1b 00 00 \tlea    0x1b37(%rip),%rsi        # 7eb0 <latadd256int>\n    6379:\tf3 0f 10 05 07 5d 00 \tmovss  0x5d07(%rip),%xmm0        # c088 <_IO_stdin_used+0x1088>\n    6380:\t00 \n    6381:\t4c 89 f7             \tmov    %r14,%rdi\n    6384:\te8 b7 46 00 00       \tcallq  aa40 <measureFunction>\n    6389:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    638e:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    6393:\tf3 0f 10 0d ed 5c 00 \tmovss  0x5ced(%rip),%xmm1        # c088 <_IO_stdin_used+0x1088>\n    639a:\t00 \n    639b:\t48 8d 35 9e 54 00 00 \tlea    
0x549e(%rip),%rsi        # b840 <_IO_stdin_used+0x840>\n    63a2:\tf3 0f 5e c8          \tdivss  %xmm0,%xmm1\n    63a6:\t66 0f ef c0          \tpxor   %xmm0,%xmm0\n    63aa:\tf3 0f 5a c1          \tcvtss2sd %xmm1,%xmm0\n    63ae:\te8 cd ac ff ff       \tcallq  1080 <__printf_chk@plt>\n    63b3:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    63b7:\t0f 8f a6 cf ff ff    \tjg     3363 <main+0x22a3>\n    63bd:\te9 84 c5 ff ff       \tjmpq   2946 <main+0x1886>\n    63c2:\tba 09 00 00 00       \tmov    $0x9,%edx\n    63c7:\t48 8d 35 5a 4d 00 00 \tlea    0x4d5a(%rip),%rsi        # b128 <_IO_stdin_used+0x128>\n    63ce:\t4c 89 ef             \tmov    %r13,%rdi\n    63d1:\te8 5a ac ff ff       \tcallq  1030 <strncmp@plt>\n    63d6:\t85 c0                \ttest   %eax,%eax\n    63d8:\t75 80                \tjne    635a <main+0x529a>\n    63da:\t48 8d 35 ac 16 00 00 \tlea    0x16ac(%rip),%rsi        # 7a8d <add512int>\n    63e1:\tf3 0f 10 05 9f 5c 00 \tmovss  0x5c9f(%rip),%xmm0        # c088 <_IO_stdin_used+0x1088>\n    63e8:\t00 \n    63e9:\tbf 00 2f 68 59       \tmov    $0x59682f00,%edi\n    63ee:\te8 4d 46 00 00       \tcallq  aa40 <measureFunction>\n    63f3:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    63f8:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    63fd:\t48 8d 35 1c 54 00 00 \tlea    0x541c(%rip),%rsi        # b820 <_IO_stdin_used+0x820>\n    6404:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    6408:\te8 73 ac ff ff       \tcallq  1080 <__printf_chk@plt>\n    640d:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    6411:\t0f 8f eb ce ff ff    \tjg     3302 <main+0x2242>\n    6417:\te9 2a c5 ff ff       \tjmpq   2946 <main+0x1886>\n    641c:\tba 0d 00 00 00       \tmov    $0xd,%edx\n    6421:\t48 8d 35 ef 4c 00 00 \tlea    0x4cef(%rip),%rsi        # b117 <_IO_stdin_used+0x117>\n    6428:\t4c 89 ef             \tmov    %r13,%rdi\n    642b:\te8 00 ac ff ff       \tcallq  1030 <strncmp@plt>\n    6430:\t85 c0                \ttest   %eax,%eax\n    6432:\t75 8e           
     \tjne    63c2 <main+0x5302>\n    6434:\t48 8d 35 8e 30 00 00 \tlea    0x308e(%rip),%rsi        # 94c9 <nemesfpu512mix21>\n    643b:\tf3 0f 10 05 45 5c 00 \tmovss  0x5c45(%rip),%xmm0        # c088 <_IO_stdin_used+0x1088>\n    6442:\t00 \n    6443:\tbf 05 7a d7 03       \tmov    $0x3d77a05,%edi\n    6448:\t48 c1 e7 09          \tshl    $0x9,%rdi\n    644c:\te8 ef 45 00 00       \tcallq  aa40 <measureFunction>\n    6451:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    6456:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    645b:\t48 8d 35 8e 53 00 00 \tlea    0x538e(%rip),%rsi        # b7f0 <_IO_stdin_used+0x7f0>\n    6462:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    6466:\te8 15 ac ff ff       \tcallq  1080 <__printf_chk@plt>\n    646b:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    646f:\t0f 8f 3a ce ff ff    \tjg     32af <main+0x21ef>\n    6475:\te9 cc c4 ff ff       \tjmpq   2946 <main+0x1886>\n    647a:\t4c 89 f0             \tmov    %r14,%rax\n    647d:\t66 0f ef c9          \tpxor   %xmm1,%xmm1\n    6481:\t48 d1 e8             \tshr    %rax\n    6484:\tf3 48 0f 2a c8       \tcvtsi2ss %rax,%xmm1\n    6489:\tf3 0f 58 c9          \taddss  %xmm1,%xmm1\n    648d:\te9 24 f5 ff ff       \tjmpq   59b6 <main+0x48f6>\n    6492:\t48 89 c8             \tmov    %rcx,%rax\n    6495:\t83 e1 01             \tand    $0x1,%ecx\n    6498:\t66 0f ef c0          \tpxor   %xmm0,%xmm0\n    649c:\t48 d1 e8             \tshr    %rax\n    649f:\t48 09 c8             \tor     %rcx,%rax\n    64a2:\tf3 48 0f 2a c0       \tcvtsi2ss %rax,%xmm0\n    64a7:\tf3 0f 58 c0          \taddss  %xmm0,%xmm0\n    64ab:\te9 e8 f4 ff ff       \tjmpq   5998 <main+0x48d8>\n    64b0:\t41 bd 01 00 00 00    \tmov    $0x1,%r13d\n    64b6:\te9 80 f4 ff ff       \tjmpq   593b <main+0x487b>\n    64bb:\t48 8d 35 b6 08 00 00 \tlea    0x8b6(%rip),%rsi        # 6d78 <noptest1b>\n    64c2:\tf3 0f 10 05 be 5b 00 \tmovss  0x5bbe(%rip),%xmm0        # c088 <_IO_stdin_used+0x1088>\n    64c9:\t00 \n    64ca:\t4c 89 f7  
           \tmov    %r14,%rdi\n    64cd:\te8 6e 45 00 00       \tcallq  aa40 <measureFunction>\n    64d2:\tbf 01 00 00 00       \tmov    $0x1,%edi\n    64d7:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    64dc:\t48 8d 35 69 52 00 00 \tlea    0x5269(%rip),%rsi        # b74c <_IO_stdin_used+0x74c>\n    64e3:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    64e7:\te8 94 ab ff ff       \tcallq  1080 <__printf_chk@plt>\n    64ec:\t41 83 fc 01          \tcmp    $0x1,%r12d\n    64f0:\t0f 8e 50 c4 ff ff    \tjle    2946 <main+0x1886>\n    64f6:\tf3 0f 10 3d 8a 5b 00 \tmovss  0x5b8a(%rip),%xmm7        # c088 <_IO_stdin_used+0x1088>\n    64fd:\t00 \n    64fe:\tf3 0f 11 7c 24 0c    \tmovss  %xmm7,0xc(%rsp)\n    6504:\tf3 0f 11 7c 24 08    \tmovss  %xmm7,0x8(%rsp)\n    650a:\te9 18 ad ff ff       \tjmpq   1227 <main+0x167>\n    650f:\tf3 0f 10 2d 71 5b 00 \tmovss  0x5b71(%rip),%xmm5        # c088 <_IO_stdin_used+0x1088>\n    6516:\t00 \n    6517:\tf3 0f 11 6c 24 0c    \tmovss  %xmm5,0xc(%rsp)\n    651d:\tf3 0f 11 6c 24 08    \tmovss  %xmm5,0x8(%rsp)\n    6523:\te9 b4 ac ff ff       \tjmpq   11dc <main+0x11c>\n    6528:\te8 13 ab ff ff       \tcallq  1040 <__stack_chk_fail@plt>\n    652d:\t0f 1f 00             \tnopl   (%rax)\n\n0000000000006530 <get_available_features>:\n    6530:\t53                   \tpush   %rbx\n    6531:\t41 89 d1             \tmov    %edx,%r9d\n    6534:\tf7 c7 00 00 00 08    \ttest   $0x8000000,%edi\n    653a:\t74 13                \tje     654f <get_available_features+0x1f>\n    653c:\t31 c9                \txor    %ecx,%ecx\n    653e:\t0f 01 d0             \txgetbv \n    6541:\t89 c2                \tmov    %eax,%edx\n    6543:\t83 e2 06             \tand    $0x6,%edx\n    6546:\t83 fa 06             \tcmp    $0x6,%edx\n    6549:\t0f 84 4a 02 00 00    \tje     6799 <get_available_features+0x269>\n    654f:\t45 31 db             \txor    %r11d,%r11d\n    6552:\t45 31 d2             \txor    %r10d,%r10d\n    6555:\t41 89 f0             \tmov    
%esi,%r8d\n    6558:\t41 c1 e8 0f          \tshr    $0xf,%r8d\n    655c:\t41 83 e0 01          \tand    $0x1,%r8d\n    6560:\t44 89 c0             \tmov    %r8d,%eax\n    6563:\t83 c8 02             \tor     $0x2,%eax\n    6566:\tf7 c6 00 00 80 00    \ttest   $0x800000,%esi\n    656c:\t44 0f 45 c0          \tcmovne %eax,%r8d\n    6570:\t44 89 c0             \tmov    %r8d,%eax\n    6573:\t83 c8 08             \tor     $0x8,%eax\n    6576:\tf7 c6 00 00 00 02    \ttest   $0x2000000,%esi\n    657c:\t44 0f 45 c0          \tcmovne %eax,%r8d\n    6580:\t44 89 c0             \tmov    %r8d,%eax\n    6583:\t83 c8 10             \tor     $0x10,%eax\n    6586:\t81 e6 00 00 00 04    \tand    $0x4000000,%esi\n    658c:\t44 0f 45 c0          \tcmovne %eax,%r8d\n    6590:\t44 89 c0             \tmov    %r8d,%eax\n    6593:\t83 c8 04             \tor     $0x4,%eax\n    6596:\tf7 c7 00 00 80 00    \ttest   $0x800000,%edi\n    659c:\t44 0f 45 c0          \tcmovne %eax,%r8d\n    65a0:\t44 89 c0             \tmov    %r8d,%eax\n    65a3:\t0d 00 00 04 00       \tor     $0x40000,%eax\n    65a8:\tf7 c7 00 00 00 02    \ttest   $0x2000000,%edi\n    65ae:\t44 0f 45 c0          \tcmovne %eax,%r8d\n    65b2:\t44 89 c0             \tmov    %r8d,%eax\n    65b5:\t0d 00 00 08 00       \tor     $0x80000,%eax\n    65ba:\t40 f6 c7 02          \ttest   $0x2,%dil\n    65be:\t44 0f 45 c0          \tcmovne %eax,%r8d\n    65c2:\t44 89 c0             \tmov    %r8d,%eax\n    65c5:\t83 c8 20             \tor     $0x20,%eax\n    65c8:\t40 f6 c7 01          \ttest   $0x1,%dil\n    65cc:\t44 0f 45 c0          \tcmovne %eax,%r8d\n    65d0:\t44 89 c0             \tmov    %r8d,%eax\n    65d3:\t83 c8 40             \tor     $0x40,%eax\n    65d6:\tf7 c7 00 02 00 00    \ttest   $0x200,%edi\n    65dc:\t44 0f 45 c0          \tcmovne %eax,%r8d\n    65e0:\t44 89 c0             \tmov    %r8d,%eax\n    65e3:\t0c 80                \tor     $0x80,%al\n    65e5:\tf7 c7 00 00 08 00    \ttest   $0x80000,%edi\n    65eb:\t44 0f 45 
c0          \tcmovne %eax,%r8d\n    65ef:\t44 89 c0             \tmov    %r8d,%eax\n    65f2:\t80 cc 01             \tor     $0x1,%ah\n    65f5:\tf7 c7 00 00 10 00    \ttest   $0x100000,%edi\n    65fb:\t44 0f 45 c0          \tcmovne %eax,%r8d\n    65ff:\t45 85 d2             \ttest   %r10d,%r10d\n    6602:\t74 1b                \tje     661f <get_available_features+0xef>\n    6604:\tf7 c7 00 00 00 10    \ttest   $0x10000000,%edi\n    660a:\t74 07                \tje     6613 <get_available_features+0xe3>\n    660c:\t41 81 c8 00 02 00 00 \tor     $0x200,%r8d\n    6613:\t81 e7 00 10 00 00    \tand    $0x1000,%edi\n    6619:\t0f 85 6e 01 00 00    \tjne    678d <get_available_features+0x25d>\n    661f:\t31 f6                \txor    %esi,%esi\n    6621:\t41 83 f9 06          \tcmp    $0x6,%r9d\n    6625:\t7e 50                \tjle    6677 <get_available_features+0x147>\n    6627:\tb8 07 00 00 00       \tmov    $0x7,%eax\n    662c:\t89 f1                \tmov    %esi,%ecx\n    662e:\t0f a2                \tcpuid  \n    6630:\tf6 c3 08             \ttest   $0x8,%bl\n    6633:\t74 07                \tje     663c <get_available_features+0x10c>\n    6635:\t41 81 c8 00 00 01 00 \tor     $0x10000,%r8d\n    663c:\t31 f6                \txor    %esi,%esi\n    663e:\t45 85 d2             \ttest   %r10d,%r10d\n    6641:\t74 1b                \tje     665e <get_available_features+0x12e>\n    6643:\tf6 c3 20             \ttest   $0x20,%bl\n    6646:\t74 07                \tje     664f <get_available_features+0x11f>\n    6648:\t41 81 c8 00 04 00 00 \tor     $0x400,%r8d\n    664f:\t89 ce                \tmov    %ecx,%esi\n    6651:\t81 e6 00 04 00 00    \tand    $0x400,%esi\n    6657:\t74 05                \tje     665e <get_available_features+0x12e>\n    6659:\tbe 02 00 00 00       \tmov    $0x2,%esi\n    665e:\tf6 c7 01             \ttest   $0x1,%bh\n    6661:\t74 07                \tje     666a <get_available_features+0x13a>\n    6663:\t41 81 c8 00 00 02 00 \tor     
$0x20000,%r8d\n    666a:\tf6 c5 01             \ttest   $0x1,%ch\n    666d:\t74 03                \tje     6672 <get_available_features+0x142>\n    666f:\t83 ce 01             \tor     $0x1,%esi\n    6672:\t45 85 db             \ttest   %r11d,%r11d\n    6675:\t75 50                \tjne    66c7 <get_available_features+0x197>\n    6677:\tb8 00 00 00 80       \tmov    $0x80000000,%eax\n    667c:\t0f a2                \tcpuid  \n    667e:\t3d 00 00 00 80       \tcmp    $0x80000000,%eax\n    6683:\t76 33                \tjbe    66b8 <get_available_features+0x188>\n    6685:\tb8 01 00 00 80       \tmov    $0x80000001,%eax\n    668a:\t0f a2                \tcpuid  \n    668c:\tf6 c1 40             \ttest   $0x40,%cl\n    668f:\t74 07                \tje     6698 <get_available_features+0x168>\n    6691:\t41 81 c8 00 08 00 00 \tor     $0x800,%r8d\n    6698:\t45 85 d2             \ttest   %r10d,%r10d\n    669b:\t74 1b                \tje     66b8 <get_available_features+0x188>\n    669d:\tf7 c1 00 00 01 00    \ttest   $0x10000,%ecx\n    66a3:\t74 07                \tje     66ac <get_available_features+0x17c>\n    66a5:\t41 81 c8 00 10 00 00 \tor     $0x1000,%r8d\n    66ac:\t80 e5 08             \tand    $0x8,%ch\n    66af:\t74 07                \tje     66b8 <get_available_features+0x188>\n    66b1:\t41 81 c8 00 20 00 00 \tor     $0x2000,%r8d\n    66b8:\t44 89 05 3d 7a 00 00 \tmov    %r8d,0x7a3d(%rip)        # e0fc <__cpu_model+0xc>\n    66bf:\t5b                   \tpop    %rbx\n    66c0:\t89 35 42 7a 00 00    \tmov    %esi,0x7a42(%rip)        # e108 <__cpu_features2>\n    66c6:\tc3                   \tretq   \n    66c7:\tf7 c3 00 00 01 00    \ttest   $0x10000,%ebx\n    66cd:\t74 07                \tje     66d6 <get_available_features+0x1a6>\n    66cf:\t41 81 c8 00 80 00 00 \tor     $0x8000,%r8d\n    66d6:\t85 db                \ttest   %ebx,%ebx\n    66d8:\t0f 88 d7 00 00 00    \tjs     67b5 <get_available_features+0x285>\n    66de:\tf7 c3 00 00 00 40    \ttest   
$0x40000000,%ebx\n    66e4:\t74 07                \tje     66ed <get_available_features+0x1bd>\n    66e6:\t41 81 c8 00 00 20 00 \tor     $0x200000,%r8d\n    66ed:\tf7 c3 00 00 02 00    \ttest   $0x20000,%ebx\n    66f3:\t74 07                \tje     66fc <get_available_features+0x1cc>\n    66f5:\t41 81 c8 00 00 40 00 \tor     $0x400000,%r8d\n    66fc:\tf7 c3 00 00 00 10    \ttest   $0x10000000,%ebx\n    6702:\t74 07                \tje     670b <get_available_features+0x1db>\n    6704:\t41 81 c8 00 00 80 00 \tor     $0x800000,%r8d\n    670b:\tf7 c3 00 00 00 04    \ttest   $0x4000000,%ebx\n    6711:\t74 07                \tje     671a <get_available_features+0x1ea>\n    6713:\t41 81 c8 00 00 00 02 \tor     $0x2000000,%r8d\n    671a:\tf7 c3 00 00 00 08    \ttest   $0x8000000,%ebx\n    6720:\t74 07                \tje     6729 <get_available_features+0x1f9>\n    6722:\t41 81 c8 00 00 00 01 \tor     $0x1000000,%r8d\n    6729:\t81 e3 00 00 20 00    \tand    $0x200000,%ebx\n    672f:\t74 07                \tje     6738 <get_available_features+0x208>\n    6731:\t41 81 c8 00 00 00 08 \tor     $0x8000000,%r8d\n    6738:\tf6 c1 02             \ttest   $0x2,%cl\n    673b:\t74 07                \tje     6744 <get_available_features+0x214>\n    673d:\t41 81 c8 00 00 00 04 \tor     $0x4000000,%r8d\n    6744:\tf6 c1 40             \ttest   $0x40,%cl\n    6747:\t74 07                \tje     6750 <get_available_features+0x220>\n    6749:\t41 81 c8 00 00 00 80 \tor     $0x80000000,%r8d\n    6750:\tf6 c5 08             \ttest   $0x8,%ch\n    6753:\t74 03                \tje     6758 <get_available_features+0x228>\n    6755:\t83 ce 04             \tor     $0x4,%esi\n    6758:\tf6 c5 10             \ttest   $0x10,%ch\n    675b:\t74 03                \tje     6760 <get_available_features+0x230>\n    675d:\t83 ce 08             \tor     $0x8,%esi\n    6760:\t80 e5 40             \tand    $0x40,%ch\n    6763:\t74 07                \tje     676c <get_available_features+0x23c>\n    
6765:\t41 81 c8 00 00 00 40 \tor     $0x40000000,%r8d\n    676c:\tf6 c2 04             \ttest   $0x4,%dl\n    676f:\t74 07                \tje     6778 <get_available_features+0x248>\n    6771:\t41 81 c8 00 00 00 10 \tor     $0x10000000,%r8d\n    6778:\t80 e2 08             \tand    $0x8,%dl\n    677b:\t0f 84 f6 fe ff ff    \tje     6677 <get_available_features+0x147>\n    6781:\t41 81 c8 00 00 00 20 \tor     $0x20000000,%r8d\n    6788:\te9 ea fe ff ff       \tjmpq   6677 <get_available_features+0x147>\n    678d:\t41 81 c8 00 40 00 00 \tor     $0x4000,%r8d\n    6794:\te9 86 fe ff ff       \tjmpq   661f <get_available_features+0xef>\n    6799:\t25 e6 00 00 00       \tand    $0xe6,%eax\n    679e:\t45 31 db             \txor    %r11d,%r11d\n    67a1:\t41 ba 01 00 00 00    \tmov    $0x1,%r10d\n    67a7:\t3d e6 00 00 00       \tcmp    $0xe6,%eax\n    67ac:\t41 0f 94 c3          \tsete   %r11b\n    67b0:\te9 a0 fd ff ff       \tjmpq   6555 <get_available_features+0x25>\n    67b5:\t41 81 c8 00 00 10 00 \tor     $0x100000,%r8d\n    67bc:\te9 1d ff ff ff       \tjmpq   66de <get_available_features+0x1ae>\n    67c1:\t66 66 2e 0f 1f 84 00 \tdata16 nopw %cs:0x0(%rax,%rax,1)\n    67c8:\t00 00 00 00 \n    67cc:\t0f 1f 40 00          \tnopl   0x0(%rax)\n\n00000000000067d0 <__cpu_indicator_init>:\n    67d0:\tf3 0f 1e fa          \tendbr64 \n    67d4:\t8b 05 16 79 00 00    \tmov    0x7916(%rip),%eax        # e0f0 <__cpu_model>\n    67da:\t45 31 c9             \txor    %r9d,%r9d\n    67dd:\t85 c0                \ttest   %eax,%eax\n    67df:\t75 78                \tjne    6859 <__cpu_indicator_init+0x89>\n    67e1:\t53                   \tpush   %rbx\n    67e2:\t44 89 c8             \tmov    %r9d,%eax\n    67e5:\t0f a2                \tcpuid  \n    67e7:\t85 c0                \ttest   %eax,%eax\n    67e9:\t0f 84 cc 00 00 00    \tje     68bb <__cpu_indicator_init+0xeb>\n    67ef:\t44 89 c8             \tmov    %r9d,%eax\n    67f2:\t0f a2                \tcpuid  \n    67f4:\t41 89 da   
          \tmov    %ebx,%r10d\n    67f7:\t41 89 c0             \tmov    %eax,%r8d\n    67fa:\t85 c0                \ttest   %eax,%eax\n    67fc:\t0f 8e b9 00 00 00    \tjle    68bb <__cpu_indicator_init+0xeb>\n    6802:\t44 89 c8             \tmov    %r9d,%eax\n    6805:\t0f a2                \tcpuid  \n    6807:\t85 c0                \ttest   %eax,%eax\n    6809:\t0f 84 ac 00 00 00    \tje     68bb <__cpu_indicator_init+0xeb>\n    680f:\tb8 01 00 00 00       \tmov    $0x1,%eax\n    6814:\t0f a2                \tcpuid  \n    6816:\t89 d6                \tmov    %edx,%esi\n    6818:\t89 cf                \tmov    %ecx,%edi\n    681a:\t89 c2                \tmov    %eax,%edx\n    681c:\t89 c1                \tmov    %eax,%ecx\n    681e:\tc1 ea 04             \tshr    $0x4,%edx\n    6821:\t41 89 c3             \tmov    %eax,%r11d\n    6824:\tc1 e9 08             \tshr    $0x8,%ecx\n    6827:\t41 c1 eb 0c          \tshr    $0xc,%r11d\n    682b:\t83 e2 0f             \tand    $0xf,%edx\n    682e:\t83 e1 0f             \tand    $0xf,%ecx\n    6831:\t41 81 e3 f0 00 00 00 \tand    $0xf0,%r11d\n    6838:\t41 81 fa 47 65 6e 75 \tcmp    $0x756e6547,%r10d\n    683f:\t74 1c                \tje     685d <__cpu_indicator_init+0x8d>\n    6841:\t41 81 fa 41 75 74 68 \tcmp    $0x68747541,%r10d\n    6848:\t74 33                \tje     687d <__cpu_indicator_init+0xad>\n    684a:\tc7 05 9c 78 00 00 03 \tmovl   $0x3,0x789c(%rip)        # e0f0 <__cpu_model>\n    6851:\t00 00 00 \n    6854:\t44 89 c8             \tmov    %r9d,%eax\n    6857:\t5b                   \tpop    %rbx\n    6858:\tc3                   \tretq   \n    6859:\t44 89 c8             \tmov    %r9d,%eax\n    685c:\tc3                   \tretq   \n    685d:\t83 f9 06             \tcmp    $0x6,%ecx\n    6860:\t0f 84 fe 00 00 00    \tje     6964 <__cpu_indicator_init+0x194>\n    6866:\t44 89 c2             \tmov    %r8d,%edx\n    6869:\te8 c2 fc ff ff       \tcallq  6530 <get_available_features>\n    686e:\t45 31 c9         
    \txor    %r9d,%r9d\n    6871:\tc7 05 75 78 00 00 01 \tmovl   $0x1,0x7875(%rip)        # e0f0 <__cpu_model>\n    6878:\t00 00 00 \n    687b:\teb d7                \tjmp    6854 <__cpu_indicator_init+0x84>\n    687d:\t83 f9 0f             \tcmp    $0xf,%ecx\n    6880:\t74 17                \tje     6899 <__cpu_indicator_init+0xc9>\n    6882:\t44 89 c2             \tmov    %r8d,%edx\n    6885:\te8 a6 fc ff ff       \tcallq  6530 <get_available_features>\n    688a:\t45 31 c9             \txor    %r9d,%r9d\n    688d:\tc7 05 59 78 00 00 02 \tmovl   $0x2,0x7859(%rip)        # e0f0 <__cpu_model>\n    6894:\t00 00 00 \n    6897:\teb bb                \tjmp    6854 <__cpu_indicator_init+0x84>\n    6899:\tc1 e8 14             \tshr    $0x14,%eax\n    689c:\t44 09 da             \tor     %r11d,%edx\n    689f:\t0f b6 c0             \tmovzbl %al,%eax\n    68a2:\t83 e8 01             \tsub    $0x1,%eax\n    68a5:\t83 f8 07             \tcmp    $0x7,%eax\n    68a8:\t77 d8                \tja     6882 <__cpu_indicator_init+0xb2>\n    68aa:\t48 8d 0d 0f 58 00 00 \tlea    0x580f(%rip),%rcx        # c0c0 <_IO_stdin_used+0x10c0>\n    68b1:\t48 63 04 81          \tmovslq (%rcx,%rax,4),%rax\n    68b5:\t48 01 c8             \tadd    %rcx,%rax\n    68b8:\t3e ff e0             \tnotrack jmpq *%rax\n    68bb:\tc7 05 2b 78 00 00 03 \tmovl   $0x3,0x782b(%rip)        # e0f0 <__cpu_model>\n    68c2:\t00 00 00 \n    68c5:\t41 83 c9 ff          \tor     $0xffffffff,%r9d\n    68c9:\teb 89                \tjmp    6854 <__cpu_indicator_init+0x84>\n    68cb:\tc7 05 1f 78 00 00 0a \tmovl   $0xa,0x781f(%rip)        # e0f4 <__cpu_model+0x4>\n    68d2:\t00 00 00 \n    68d5:\t83 fa 1f             \tcmp    $0x1f,%edx\n    68d8:\t0f 87 05 02 00 00    \tja     6ae3 <__cpu_indicator_init+0x313>\n    68de:\tc7 05 10 78 00 00 0b \tmovl   $0xb,0x7810(%rip)        # e0f8 <__cpu_model+0x8>\n    68e5:\t00 00 00 \n    68e8:\teb 98                \tjmp    6882 <__cpu_indicator_init+0xb2>\n    68ea:\tc7 05 00 78 00 
00 09 \tmovl   $0x9,0x7800(%rip)        # e0f4 <__cpu_model+0x4>\n    68f1:\t00 00 00 \n    68f4:\teb 8c                \tjmp    6882 <__cpu_indicator_init+0xb2>\n    68f6:\tc7 05 f4 77 00 00 05 \tmovl   $0x5,0x77f4(%rip)        # e0f4 <__cpu_model+0x4>\n    68fd:\t00 00 00 \n    6900:\t83 fa 02             \tcmp    $0x2,%edx\n    6903:\t0f 84 cb 01 00 00    \tje     6ad4 <__cpu_indicator_init+0x304>\n    6909:\t83 fa 0f             \tcmp    $0xf,%edx\n    690c:\t0f 87 e9 01 00 00    \tja     6afb <__cpu_indicator_init+0x32b>\n    6912:\tc7 05 dc 77 00 00 07 \tmovl   $0x7,0x77dc(%rip)        # e0f8 <__cpu_model+0x8>\n    6919:\t00 00 00 \n    691c:\te9 61 ff ff ff       \tjmpq   6882 <__cpu_indicator_init+0xb2>\n    6921:\tc7 05 c9 77 00 00 08 \tmovl   $0x8,0x77c9(%rip)        # e0f4 <__cpu_model+0x4>\n    6928:\t00 00 00 \n    692b:\te9 52 ff ff ff       \tjmpq   6882 <__cpu_indicator_init+0xb2>\n    6930:\tc7 05 ba 77 00 00 04 \tmovl   $0x4,0x77ba(%rip)        # e0f4 <__cpu_model+0x4>\n    6937:\t00 00 00 \n    693a:\t83 fa 04             \tcmp    $0x4,%edx\n    693d:\t0f 84 e0 01 00 00    \tje     6b23 <__cpu_indicator_init+0x353>\n    6943:\t83 fa 08             \tcmp    $0x8,%edx\n    6946:\t0f 84 c8 01 00 00    \tje     6b14 <__cpu_indicator_init+0x344>\n    694c:\t83 fa 02             \tcmp    $0x2,%edx\n    694f:\t0f 85 2d ff ff ff    \tjne    6882 <__cpu_indicator_init+0xb2>\n    6955:\tc7 05 99 77 00 00 04 \tmovl   $0x4,0x7799(%rip)        # e0f8 <__cpu_model+0x8>\n    695c:\t00 00 00 \n    695f:\te9 1e ff ff ff       \tjmpq   6882 <__cpu_indicator_init+0xb2>\n    6964:\t89 d0                \tmov    %edx,%eax\n    6966:\t44 09 d8             \tor     %r11d,%eax\n    6969:\t84 db                \ttest   %bl,%bl\n    696b:\t0f 85 f5 fe ff ff    \tjne    6866 <__cpu_indicator_init+0x96>\n    6971:\t83 e8 0f             \tsub    $0xf,%eax\n    6974:\t3d 8f 00 00 00       \tcmp    $0x8f,%eax\n    6979:\t0f 87 e7 fe ff ff    \tja     6866 
<__cpu_indicator_init+0x96>\n    697f:\t48 8d 15 5a 57 00 00 \tlea    0x575a(%rip),%rdx        # c0e0 <_IO_stdin_used+0x10e0>\n    6986:\t48 63 04 82          \tmovslq (%rdx,%rax,4),%rax\n    698a:\t48 01 d0             \tadd    %rdx,%rax\n    698d:\t3e ff e0             \tnotrack jmpq *%rax\n    6990:\tc7 05 5a 77 00 00 0b \tmovl   $0xb,0x775a(%rip)        # e0f4 <__cpu_model+0x4>\n    6997:\t00 00 00 \n    699a:\te9 c7 fe ff ff       \tjmpq   6866 <__cpu_indicator_init+0x96>\n    699f:\tc7 05 4b 77 00 00 0d \tmovl   $0xd,0x774b(%rip)        # e0f4 <__cpu_model+0x4>\n    69a6:\t00 00 00 \n    69a9:\te9 b8 fe ff ff       \tjmpq   6866 <__cpu_indicator_init+0x96>\n    69ae:\t48 b8 03 00 00 00 11 \tmovabs $0x1100000003,%rax\n    69b5:\t00 00 00 \n    69b8:\t48 89 05 35 77 00 00 \tmov    %rax,0x7735(%rip)        # e0f4 <__cpu_model+0x4>\n    69bf:\te9 a2 fe ff ff       \tjmpq   6866 <__cpu_indicator_init+0x96>\n    69c4:\tc7 05 26 77 00 00 0c \tmovl   $0xc,0x7726(%rip)        # e0f4 <__cpu_model+0x4>\n    69cb:\t00 00 00 \n    69ce:\te9 93 fe ff ff       \tjmpq   6866 <__cpu_indicator_init+0x96>\n    69d3:\tc7 05 17 77 00 00 07 \tmovl   $0x7,0x7717(%rip)        # e0f4 <__cpu_model+0x4>\n    69da:\t00 00 00 \n    69dd:\te9 84 fe ff ff       \tjmpq   6866 <__cpu_indicator_init+0x96>\n    69e2:\tc7 05 08 77 00 00 03 \tmovl   $0x3,0x7708(%rip)        # e0f4 <__cpu_model+0x4>\n    69e9:\t00 00 00 \n    69ec:\tb8 07 00 00 00       \tmov    $0x7,%eax\n    69f1:\t31 c9                \txor    %ecx,%ecx\n    69f3:\t0f a2                \tcpuid  \n    69f5:\t80 e5 08             \tand    $0x8,%ch\n    69f8:\t0f 84 4c 01 00 00    \tje     6b4a <__cpu_indicator_init+0x37a>\n    69fe:\tc7 05 f0 76 00 00 15 \tmovl   $0x15,0x76f0(%rip)        # e0f8 <__cpu_model+0x8>\n    6a05:\t00 00 00 \n    6a08:\te9 59 fe ff ff       \tjmpq   6866 <__cpu_indicator_init+0x96>\n    6a0d:\t48 b8 03 00 00 00 0f \tmovabs $0xf00000003,%rax\n    6a14:\t00 00 00 \n    6a17:\t48 89 05 d6 76 00 00 \tmov   
 %rax,0x76d6(%rip)        # e0f4 <__cpu_model+0x4>\n    6a1e:\te9 43 fe ff ff       \tjmpq   6866 <__cpu_indicator_init+0x96>\n    6a23:\t48 b8 03 00 00 00 0e \tmovabs $0xe00000003,%rax\n    6a2a:\t00 00 00 \n    6a2d:\t48 89 05 c0 76 00 00 \tmov    %rax,0x76c0(%rip)        # e0f4 <__cpu_model+0x4>\n    6a34:\te9 2d fe ff ff       \tjmpq   6866 <__cpu_indicator_init+0x96>\n    6a39:\t48 b8 03 00 00 00 0d \tmovabs $0xd00000003,%rax\n    6a40:\t00 00 00 \n    6a43:\t48 89 05 aa 76 00 00 \tmov    %rax,0x76aa(%rip)        # e0f4 <__cpu_model+0x4>\n    6a4a:\te9 17 fe ff ff       \tjmpq   6866 <__cpu_indicator_init+0x96>\n    6a4f:\t48 b8 03 00 00 00 0c \tmovabs $0xc00000003,%rax\n    6a56:\t00 00 00 \n    6a59:\t48 89 05 94 76 00 00 \tmov    %rax,0x7694(%rip)        # e0f4 <__cpu_model+0x4>\n    6a60:\te9 01 fe ff ff       \tjmpq   6866 <__cpu_indicator_init+0x96>\n    6a65:\tc7 05 85 76 00 00 06 \tmovl   $0x6,0x7685(%rip)        # e0f4 <__cpu_model+0x4>\n    6a6c:\t00 00 00 \n    6a6f:\te9 f2 fd ff ff       \tjmpq   6866 <__cpu_indicator_init+0x96>\n    6a74:\t48 b8 03 00 00 00 03 \tmovabs $0x300000003,%rax\n    6a7b:\t00 00 00 \n    6a7e:\t48 89 05 6f 76 00 00 \tmov    %rax,0x766f(%rip)        # e0f4 <__cpu_model+0x4>\n    6a85:\te9 dc fd ff ff       \tjmpq   6866 <__cpu_indicator_init+0x96>\n    6a8a:\t48 b8 03 00 00 00 02 \tmovabs $0x200000003,%rax\n    6a91:\t00 00 00 \n    6a94:\t48 89 05 59 76 00 00 \tmov    %rax,0x7659(%rip)        # e0f4 <__cpu_model+0x4>\n    6a9b:\te9 c6 fd ff ff       \tjmpq   6866 <__cpu_indicator_init+0x96>\n    6aa0:\tc7 05 4a 76 00 00 01 \tmovl   $0x1,0x764a(%rip)        # e0f4 <__cpu_model+0x4>\n    6aa7:\t00 00 00 \n    6aaa:\te9 b7 fd ff ff       \tjmpq   6866 <__cpu_indicator_init+0x96>\n    6aaf:\t48 b8 03 00 00 00 01 \tmovabs $0x100000003,%rax\n    6ab6:\t00 00 00 \n    6ab9:\t48 89 05 34 76 00 00 \tmov    %rax,0x7634(%rip)        # e0f4 <__cpu_model+0x4>\n    6ac0:\te9 a1 fd ff ff       \tjmpq   6866 <__cpu_indicator_init+0x96>\n 
   6ac5:\tc7 05 25 76 00 00 02 \tmovl   $0x2,0x7625(%rip)        # e0f4 <__cpu_model+0x4>\n    6acc:\t00 00 00 \n    6acf:\te9 92 fd ff ff       \tjmpq   6866 <__cpu_indicator_init+0x96>\n    6ad4:\tc7 05 1a 76 00 00 08 \tmovl   $0x8,0x761a(%rip)        # e0f8 <__cpu_model+0x8>\n    6adb:\t00 00 00 \n    6ade:\te9 9f fd ff ff       \tjmpq   6882 <__cpu_indicator_init+0xb2>\n    6ae3:\t83 fa 2f             \tcmp    $0x2f,%edx\n    6ae6:\t0f 86 96 fd ff ff    \tjbe    6882 <__cpu_indicator_init+0xb2>\n    6aec:\tc7 05 02 76 00 00 14 \tmovl   $0x14,0x7602(%rip)        # e0f8 <__cpu_model+0x8>\n    6af3:\t00 00 00 \n    6af6:\te9 87 fd ff ff       \tjmpq   6882 <__cpu_indicator_init+0xb2>\n    6afb:\t83 fa 2f             \tcmp    $0x2f,%edx\n    6afe:\t76 d4                \tjbe    6ad4 <__cpu_indicator_init+0x304>\n    6b00:\t83 fa 4f             \tcmp    $0x4f,%edx\n    6b03:\t77 2d                \tja     6b32 <__cpu_indicator_init+0x362>\n    6b05:\tc7 05 e9 75 00 00 09 \tmovl   $0x9,0x75e9(%rip)        # e0f8 <__cpu_model+0x8>\n    6b0c:\t00 00 00 \n    6b0f:\te9 6e fd ff ff       \tjmpq   6882 <__cpu_indicator_init+0xb2>\n    6b14:\tc7 05 da 75 00 00 06 \tmovl   $0x6,0x75da(%rip)        # e0f8 <__cpu_model+0x8>\n    6b1b:\t00 00 00 \n    6b1e:\te9 5f fd ff ff       \tjmpq   6882 <__cpu_indicator_init+0xb2>\n    6b23:\tc7 05 cb 75 00 00 05 \tmovl   $0x5,0x75cb(%rip)        # e0f8 <__cpu_model+0x8>\n    6b2a:\t00 00 00 \n    6b2d:\te9 50 fd ff ff       \tjmpq   6882 <__cpu_indicator_init+0xb2>\n    6b32:\t83 fa 7f             \tcmp    $0x7f,%edx\n    6b35:\t0f 87 47 fd ff ff    \tja     6882 <__cpu_indicator_init+0xb2>\n    6b3b:\tc7 05 b3 75 00 00 0a \tmovl   $0xa,0x75b3(%rip)        # e0f8 <__cpu_model+0x8>\n    6b42:\t00 00 00 \n    6b45:\te9 38 fd ff ff       \tjmpq   6882 <__cpu_indicator_init+0xb2>\n    6b4a:\tc7 05 a4 75 00 00 10 \tmovl   $0x10,0x75a4(%rip)        # e0f8 <__cpu_model+0x8>\n    6b51:\t00 00 00 \n    6b54:\te9 0d fd ff ff       \tjmpq   6866 
<__cpu_indicator_init+0x96>\n    6b59:\t0f 1f 80 00 00 00 00 \tnopl   0x0(%rax)\n\n0000000000006b60 <_start>:\n    6b60:\tf3 0f 1e fa          \tendbr64 \n    6b64:\t31 ed                \txor    %ebp,%ebp\n    6b66:\t49 89 d1             \tmov    %rdx,%r9\n    6b69:\t5e                   \tpop    %rsi\n    6b6a:\t48 89 e2             \tmov    %rsp,%rdx\n    6b6d:\t48 83 e4 f0          \tand    $0xfffffffffffffff0,%rsp\n    6b71:\t50                   \tpush   %rax\n    6b72:\t54                   \tpush   %rsp\n    6b73:\t4c 8d 05 66 40 00 00 \tlea    0x4066(%rip),%r8        # abe0 <__libc_csu_fini>\n    6b7a:\t48 8d 0d ef 3f 00 00 \tlea    0x3fef(%rip),%rcx        # ab70 <__libc_csu_init>\n    6b81:\t48 8d 3d 38 a5 ff ff \tlea    -0x5ac8(%rip),%rdi        # 10c0 <main>\n    6b88:\tff 15 52 74 00 00    \tcallq  *0x7452(%rip)        # dfe0 <__libc_start_main@GLIBC_2.2.5>\n    6b8e:\tf4                   \thlt    \n    6b8f:\t90                   \tnop\n\n0000000000006b90 <deregister_tm_clones>:\n    6b90:\t48 8d 3d 49 75 00 00 \tlea    0x7549(%rip),%rdi        # e0e0 <stderr@@GLIBC_2.2.5>\n    6b97:\t48 8d 05 42 75 00 00 \tlea    0x7542(%rip),%rax        # e0e0 <stderr@@GLIBC_2.2.5>\n    6b9e:\t48 39 f8             \tcmp    %rdi,%rax\n    6ba1:\t74 15                \tje     6bb8 <deregister_tm_clones+0x28>\n    6ba3:\t48 8b 05 2e 74 00 00 \tmov    0x742e(%rip),%rax        # dfd8 <_ITM_deregisterTMCloneTable>\n    6baa:\t48 85 c0             \ttest   %rax,%rax\n    6bad:\t74 09                \tje     6bb8 <deregister_tm_clones+0x28>\n    6baf:\tff e0                \tjmpq   *%rax\n    6bb1:\t0f 1f 80 00 00 00 00 \tnopl   0x0(%rax)\n    6bb8:\tc3                   \tretq   \n    6bb9:\t0f 1f 80 00 00 00 00 \tnopl   0x0(%rax)\n\n0000000000006bc0 <register_tm_clones>:\n    6bc0:\t48 8d 3d 19 75 00 00 \tlea    0x7519(%rip),%rdi        # e0e0 <stderr@@GLIBC_2.2.5>\n    6bc7:\t48 8d 35 12 75 00 00 \tlea    0x7512(%rip),%rsi        # e0e0 <stderr@@GLIBC_2.2.5>\n    
6bce:\t48 29 fe             \tsub    %rdi,%rsi\n    6bd1:\t48 89 f0             \tmov    %rsi,%rax\n    6bd4:\t48 c1 ee 3f          \tshr    $0x3f,%rsi\n    6bd8:\t48 c1 f8 03          \tsar    $0x3,%rax\n    6bdc:\t48 01 c6             \tadd    %rax,%rsi\n    6bdf:\t48 d1 fe             \tsar    %rsi\n    6be2:\t74 14                \tje     6bf8 <register_tm_clones+0x38>\n    6be4:\t48 8b 05 05 74 00 00 \tmov    0x7405(%rip),%rax        # dff0 <_ITM_registerTMCloneTable>\n    6beb:\t48 85 c0             \ttest   %rax,%rax\n    6bee:\t74 08                \tje     6bf8 <register_tm_clones+0x38>\n    6bf0:\tff e0                \tjmpq   *%rax\n    6bf2:\t66 0f 1f 44 00 00    \tnopw   0x0(%rax,%rax,1)\n    6bf8:\tc3                   \tretq   \n    6bf9:\t0f 1f 80 00 00 00 00 \tnopl   0x0(%rax)\n\n0000000000006c00 <__do_global_dtors_aux>:\n    6c00:\tf3 0f 1e fa          \tendbr64 \n    6c04:\t80 3d dd 74 00 00 00 \tcmpb   $0x0,0x74dd(%rip)        # e0e8 <completed.8061>\n    6c0b:\t75 2b                \tjne    6c38 <__do_global_dtors_aux+0x38>\n    6c0d:\t55                   \tpush   %rbp\n    6c0e:\t48 83 3d e2 73 00 00 \tcmpq   $0x0,0x73e2(%rip)        # dff8 <__cxa_finalize@GLIBC_2.2.5>\n    6c15:\t00 \n    6c16:\t48 89 e5             \tmov    %rsp,%rbp\n    6c19:\t74 0c                \tje     6c27 <__do_global_dtors_aux+0x27>\n    6c1b:\t48 8b 3d e6 73 00 00 \tmov    0x73e6(%rip),%rdi        # e008 <__dso_handle>\n    6c22:\te8 89 a4 ff ff       \tcallq  10b0 <__cxa_finalize@plt>\n    6c27:\te8 64 ff ff ff       \tcallq  6b90 <deregister_tm_clones>\n    6c2c:\tc6 05 b5 74 00 00 01 \tmovb   $0x1,0x74b5(%rip)        # e0e8 <completed.8061>\n    6c33:\t5d                   \tpop    %rbp\n    6c34:\tc3                   \tretq   \n    6c35:\t0f 1f 00             \tnopl   (%rax)\n    6c38:\tc3                   \tretq   \n    6c39:\t0f 1f 80 00 00 00 00 \tnopl   0x0(%rax)\n\n0000000000006c40 <frame_dummy>:\n    6c40:\tf3 0f 1e fa          \tendbr64 \n    
6c44:\te9 77 ff ff ff       \tjmpq   6bc0 <register_tm_clones>\n\n0000000000006c49 <clktest>:\n    6c49:\t53                   \tpush   %rbx\n    6c4a:\t41 50                \tpush   %r8\n    6c4c:\t41 51                \tpush   %r9\n    6c4e:\t49 c7 c0 01 00 00 00 \tmov    $0x1,%r8\n    6c55:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n    6c5c:\t48 31 db             \txor    %rbx,%rbx\n\n0000000000006c5f <clktest_loop>:\n    6c5f:\t4c 01 c3             \tadd    %r8,%rbx\n    6c62:\t4c 01 c3             \tadd    %r8,%rbx\n    6c65:\t4c 01 c3             \tadd    %r8,%rbx\n    6c68:\t4c 01 c3             \tadd    %r8,%rbx\n    6c6b:\t4c 01 c3             \tadd    %r8,%rbx\n    6c6e:\t4c 01 c3             \tadd    %r8,%rbx\n    6c71:\t4c 01 c3             \tadd    %r8,%rbx\n    6c74:\t4c 01 c3             \tadd    %r8,%rbx\n    6c77:\t4c 01 c3             \tadd    %r8,%rbx\n    6c7a:\t4c 01 c3             \tadd    %r8,%rbx\n    6c7d:\t4c 01 c3             \tadd    %r8,%rbx\n    6c80:\t4c 01 c3             \tadd    %r8,%rbx\n    6c83:\t4c 01 c3             \tadd    %r8,%rbx\n    6c86:\t4c 01 c3             \tadd    %r8,%rbx\n    6c89:\t4c 01 c3             \tadd    %r8,%rbx\n    6c8c:\t4c 01 c3             \tadd    %r8,%rbx\n    6c8f:\t4c 01 c3             \tadd    %r8,%rbx\n    6c92:\t4c 01 c3             \tadd    %r8,%rbx\n    6c95:\t4c 01 c3             \tadd    %r8,%rbx\n    6c98:\t4c 01 c3             \tadd    %r8,%rbx\n    6c9b:\t4c 29 cf             \tsub    %r9,%rdi\n    6c9e:\t75 bf                \tjne    6c5f <clktest_loop>\n    6ca0:\t41 59                \tpop    %r9\n    6ca2:\t41 58                \tpop    %r8\n    6ca4:\t5b                   \tpop    %rbx\n    6ca5:\tc3                   \tretq   \n\n0000000000006ca6 <clkmovtest>:\n    6ca6:\t53                   \tpush   %rbx\n    6ca7:\t41 50                \tpush   %r8\n    6ca9:\t41 51                \tpush   %r9\n    6cab:\t49 c7 c0 01 00 00 00 \tmov    $0x1,%r8\n    6cb2:\t49 c7 c1 14 00 00 00 
\tmov    $0x14,%r9\n    6cb9:\t48 31 db             \txor    %rbx,%rbx\n\n0000000000006cbc <clkmovtest_loop>:\n    6cbc:\t4c 01 c3             \tadd    %r8,%rbx\n    6cbf:\t49 89 d8             \tmov    %rbx,%r8\n    6cc2:\t4c 01 c3             \tadd    %r8,%rbx\n    6cc5:\t49 89 d8             \tmov    %rbx,%r8\n    6cc8:\t4c 01 c3             \tadd    %r8,%rbx\n    6ccb:\t49 89 d8             \tmov    %rbx,%r8\n    6cce:\t4c 01 c3             \tadd    %r8,%rbx\n    6cd1:\t49 89 d8             \tmov    %rbx,%r8\n    6cd4:\t4c 01 c3             \tadd    %r8,%rbx\n    6cd7:\t49 89 d8             \tmov    %rbx,%r8\n    6cda:\t4c 01 c3             \tadd    %r8,%rbx\n    6cdd:\t49 89 d8             \tmov    %rbx,%r8\n    6ce0:\t4c 01 c3             \tadd    %r8,%rbx\n    6ce3:\t49 89 d8             \tmov    %rbx,%r8\n    6ce6:\t4c 01 c3             \tadd    %r8,%rbx\n    6ce9:\t49 89 d8             \tmov    %rbx,%r8\n    6cec:\t4c 01 c3             \tadd    %r8,%rbx\n    6cef:\t49 89 d8             \tmov    %rbx,%r8\n    6cf2:\t4c 01 c3             \tadd    %r8,%rbx\n    6cf5:\t49 89 d8             \tmov    %rbx,%r8\n    6cf8:\t4c 01 c3             \tadd    %r8,%rbx\n    6cfb:\t49 89 d8             \tmov    %rbx,%r8\n    6cfe:\t4c 01 c3             \tadd    %r8,%rbx\n    6d01:\t49 89 d8             \tmov    %rbx,%r8\n    6d04:\t4c 01 c3             \tadd    %r8,%rbx\n    6d07:\t49 89 d8             \tmov    %rbx,%r8\n    6d0a:\t4c 01 c3             \tadd    %r8,%rbx\n    6d0d:\t49 89 d8             \tmov    %rbx,%r8\n    6d10:\t4c 01 c3             \tadd    %r8,%rbx\n    6d13:\t49 89 d8             \tmov    %rbx,%r8\n    6d16:\t4c 01 c3             \tadd    %r8,%rbx\n    6d19:\t49 89 d8             \tmov    %rbx,%r8\n    6d1c:\t4c 01 c3             \tadd    %r8,%rbx\n    6d1f:\t49 89 d8             \tmov    %rbx,%r8\n    6d22:\t4c 01 c3             \tadd    %r8,%rbx\n    6d25:\t49 89 d8             \tmov    %rbx,%r8\n    6d28:\t4c 01 c3             \tadd    %r8,%rbx\n  
  6d2b:\t49 89 d8             \tmov    %rbx,%r8\n    6d2e:\t4c 01 c3             \tadd    %r8,%rbx\n    6d31:\t49 89 d8             \tmov    %rbx,%r8\n    6d34:\t4c 29 cf             \tsub    %r9,%rdi\n    6d37:\t75 83                \tjne    6cbc <clkmovtest_loop>\n    6d39:\t41 59                \tpop    %r9\n    6d3b:\t41 58                \tpop    %r8\n    6d3d:\t5b                   \tpop    %rbx\n    6d3e:\tc3                   \tretq   \n\n0000000000006d3f <noptest>:\n    6d3f:\t53                   \tpush   %rbx\n    6d40:\t41 51                \tpush   %r9\n    6d42:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n\n0000000000006d49 <noptest_loop>:\n    6d49:\t66 90                \txchg   %ax,%ax\n    6d4b:\t66 90                \txchg   %ax,%ax\n    6d4d:\t66 90                \txchg   %ax,%ax\n    6d4f:\t66 90                \txchg   %ax,%ax\n    6d51:\t66 90                \txchg   %ax,%ax\n    6d53:\t66 90                \txchg   %ax,%ax\n    6d55:\t66 90                \txchg   %ax,%ax\n    6d57:\t66 90                \txchg   %ax,%ax\n    6d59:\t66 90                \txchg   %ax,%ax\n    6d5b:\t66 90                \txchg   %ax,%ax\n    6d5d:\t66 90                \txchg   %ax,%ax\n    6d5f:\t66 90                \txchg   %ax,%ax\n    6d61:\t66 90                \txchg   %ax,%ax\n    6d63:\t66 90                \txchg   %ax,%ax\n    6d65:\t66 90                \txchg   %ax,%ax\n    6d67:\t66 90                \txchg   %ax,%ax\n    6d69:\t66 90                \txchg   %ax,%ax\n    6d6b:\t66 90                \txchg   %ax,%ax\n    6d6d:\t66 90                \txchg   %ax,%ax\n    6d6f:\t4c 29 cf             \tsub    %r9,%rdi\n    6d72:\t75 d5                \tjne    6d49 <noptest_loop>\n    6d74:\t41 59                \tpop    %r9\n    6d76:\t5b                   \tpop    %rbx\n    6d77:\tc3                   \tretq   \n\n0000000000006d78 <noptest1b>:\n    6d78:\t53                   \tpush   %rbx\n    6d79:\t41 51                \tpush   %r9\n    6d7b:\t49 
c7 c1 14 00 00 00 \tmov    $0x14,%r9\n\n0000000000006d82 <noptest1b_loop>:\n    6d82:\t90                   \tnop\n    6d83:\t90                   \tnop\n    6d84:\t90                   \tnop\n    6d85:\t90                   \tnop\n    6d86:\t90                   \tnop\n    6d87:\t90                   \tnop\n    6d88:\t90                   \tnop\n    6d89:\t90                   \tnop\n    6d8a:\t90                   \tnop\n    6d8b:\t90                   \tnop\n    6d8c:\t90                   \tnop\n    6d8d:\t90                   \tnop\n    6d8e:\t90                   \tnop\n    6d8f:\t90                   \tnop\n    6d90:\t90                   \tnop\n    6d91:\t90                   \tnop\n    6d92:\t90                   \tnop\n    6d93:\t90                   \tnop\n    6d94:\t90                   \tnop\n    6d95:\t4c 29 cf             \tsub    %r9,%rdi\n    6d98:\t75 e8                \tjne    6d82 <noptest1b_loop>\n    6d9a:\t41 59                \tpop    %r9\n    6d9c:\t5b                   \tpop    %rbx\n    6d9d:\tc3                   \tretq   \n\n0000000000006d9e <addtest>:\n    6d9e:\t53                   \tpush   %rbx\n    6d9f:\t51                   \tpush   %rcx\n    6da0:\t41 50                \tpush   %r8\n    6da2:\t41 51                \tpush   %r9\n    6da4:\t41 52                \tpush   %r10\n    6da6:\t41 53                \tpush   %r11\n    6da8:\t41 54                \tpush   %r12\n    6daa:\t41 55                \tpush   %r13\n    6dac:\t41 56                \tpush   %r14\n    6dae:\t41 57                \tpush   %r15\n    6db0:\t49 c7 c0 01 00 00 00 \tmov    $0x1,%r8\n    6db7:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n    6dbe:\t48 31 db             \txor    %rbx,%rbx\n    6dc1:\t48 31 c9             \txor    %rcx,%rcx\n    6dc4:\t4d 31 d2             \txor    %r10,%r10\n    6dc7:\t4d 31 db             \txor    %r11,%r11\n    6dca:\t4d 31 e4             \txor    %r12,%r12\n    6dcd:\t4d 31 ed             \txor    %r13,%r13\n    6dd0:\t4d 31 
f6             \txor    %r14,%r14\n    6dd3:\t4d 31 ff             \txor    %r15,%r15\n\n0000000000006dd6 <addtest_loop>:\n    6dd6:\t4d 01 c7             \tadd    %r8,%r15\n    6dd9:\t4d 01 c6             \tadd    %r8,%r14\n    6ddc:\t4d 01 c5             \tadd    %r8,%r13\n    6ddf:\t4d 01 c4             \tadd    %r8,%r12\n    6de2:\t4d 01 c3             \tadd    %r8,%r11\n    6de5:\t4d 01 c2             \tadd    %r8,%r10\n    6de8:\t4c 01 c1             \tadd    %r8,%rcx\n    6deb:\t4d 01 c7             \tadd    %r8,%r15\n    6dee:\t4d 01 c6             \tadd    %r8,%r14\n    6df1:\t4d 01 c5             \tadd    %r8,%r13\n    6df4:\t4d 01 c4             \tadd    %r8,%r12\n    6df7:\t4d 01 c3             \tadd    %r8,%r11\n    6dfa:\t4d 01 c2             \tadd    %r8,%r10\n    6dfd:\t4c 01 c1             \tadd    %r8,%rcx\n    6e00:\t4d 01 c7             \tadd    %r8,%r15\n    6e03:\t4d 01 c6             \tadd    %r8,%r14\n    6e06:\t4d 01 c5             \tadd    %r8,%r13\n    6e09:\t4d 01 c4             \tadd    %r8,%r12\n    6e0c:\t4d 01 c3             \tadd    %r8,%r11\n    6e0f:\t4d 01 c2             \tadd    %r8,%r10\n    6e12:\t4c 29 cf             \tsub    %r9,%rdi\n    6e15:\t75 bf                \tjne    6dd6 <addtest_loop>\n    6e17:\t41 5f                \tpop    %r15\n    6e19:\t41 5e                \tpop    %r14\n    6e1b:\t41 5d                \tpop    %r13\n    6e1d:\t41 5c                \tpop    %r12\n    6e1f:\t41 5b                \tpop    %r11\n    6e21:\t41 5a                \tpop    %r10\n    6e23:\t41 59                \tpop    %r9\n    6e25:\t41 58                \tpop    %r8\n    6e27:\t59                   \tpop    %rcx\n    6e28:\t5b                   \tpop    %rbx\n    6e29:\tc3                   \tretq   \n\n0000000000006e2a <addnoptest>:\n    6e2a:\t53                   \tpush   %rbx\n    6e2b:\t51                   \tpush   %rcx\n    6e2c:\t41 50                \tpush   %r8\n    6e2e:\t41 51                \tpush   %r9\n    
6e30:\t41 52                \tpush   %r10\n    6e32:\t41 53                \tpush   %r11\n    6e34:\t41 54                \tpush   %r12\n    6e36:\t41 55                \tpush   %r13\n    6e38:\t41 56                \tpush   %r14\n    6e3a:\t41 57                \tpush   %r15\n    6e3c:\t49 c7 c0 01 00 00 00 \tmov    $0x1,%r8\n    6e43:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n    6e4a:\t48 31 db             \txor    %rbx,%rbx\n    6e4d:\t48 31 c9             \txor    %rcx,%rcx\n    6e50:\t4d 31 d2             \txor    %r10,%r10\n    6e53:\t4d 31 db             \txor    %r11,%r11\n    6e56:\t4d 31 e4             \txor    %r12,%r12\n    6e59:\t4d 31 ed             \txor    %r13,%r13\n    6e5c:\t4d 31 f6             \txor    %r14,%r14\n    6e5f:\t4d 31 ff             \txor    %r15,%r15\n\n0000000000006e62 <addnoptest_loop>:\n    6e62:\t4d 01 c7             \tadd    %r8,%r15\n    6e65:\t4d 01 c6             \tadd    %r8,%r14\n    6e68:\t4d 01 c5             \tadd    %r8,%r13\n    6e6b:\t4d 01 c4             \tadd    %r8,%r12\n    6e6e:\t90                   \tnop\n    6e6f:\t4d 01 c2             \tadd    %r8,%r10\n    6e72:\t4c 01 c1             \tadd    %r8,%rcx\n    6e75:\t4d 01 c7             \tadd    %r8,%r15\n    6e78:\t4d 01 c6             \tadd    %r8,%r14\n    6e7b:\t90                   \tnop\n    6e7c:\t4d 01 c4             \tadd    %r8,%r12\n    6e7f:\t4d 01 c3             \tadd    %r8,%r11\n    6e82:\t4d 01 c2             \tadd    %r8,%r10\n    6e85:\t4c 01 c1             \tadd    %r8,%rcx\n    6e88:\t90                   \tnop\n    6e89:\t4d 01 c6             \tadd    %r8,%r14\n    6e8c:\t4d 01 c5             \tadd    %r8,%r13\n    6e8f:\t4d 01 c4             \tadd    %r8,%r12\n    6e92:\t4d 01 c3             \tadd    %r8,%r11\n    6e95:\t90                   \tnop\n    6e96:\t4c 29 cf             \tsub    %r9,%rdi\n    6e99:\t75 c7                \tjne    6e62 <addnoptest_loop>\n    6e9b:\t41 5f                \tpop    %r15\n    6e9d:\t41 5e             
   \tpop    %r14\n    6e9f:\t41 5d                \tpop    %r13\n    6ea1:\t41 5c                \tpop    %r12\n    6ea3:\t41 5b                \tpop    %r11\n    6ea5:\t41 5a                \tpop    %r10\n    6ea7:\t41 59                \tpop    %r9\n    6ea9:\t41 58                \tpop    %r8\n    6eab:\t59                   \tpop    %rcx\n    6eac:\t5b                   \tpop    %rbx\n    6ead:\tc3                   \tretq   \n\n0000000000006eae <addmovtest>:\n    6eae:\t53                   \tpush   %rbx\n    6eaf:\t51                   \tpush   %rcx\n    6eb0:\t41 50                \tpush   %r8\n    6eb2:\t41 51                \tpush   %r9\n    6eb4:\t41 52                \tpush   %r10\n    6eb6:\t41 53                \tpush   %r11\n    6eb8:\t41 54                \tpush   %r12\n    6eba:\t41 55                \tpush   %r13\n    6ebc:\t41 56                \tpush   %r14\n    6ebe:\t41 57                \tpush   %r15\n    6ec0:\t49 c7 c0 01 00 00 00 \tmov    $0x1,%r8\n    6ec7:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n    6ece:\t48 31 db             \txor    %rbx,%rbx\n    6ed1:\t48 31 c9             \txor    %rcx,%rcx\n    6ed4:\t4d 31 d2             \txor    %r10,%r10\n    6ed7:\t4d 31 db             \txor    %r11,%r11\n    6eda:\t4d 31 e4             \txor    %r12,%r12\n    6edd:\t4d 31 ed             \txor    %r13,%r13\n    6ee0:\t4d 31 f6             \txor    %r14,%r14\n    6ee3:\t4d 31 ff             \txor    %r15,%r15\n\n0000000000006ee6 <addmovtest_loop>:\n    6ee6:\t4d 01 c7             \tadd    %r8,%r15\n    6ee9:\t4d 01 c6             \tadd    %r8,%r14\n    6eec:\t4d 01 c5             \tadd    %r8,%r13\n    6eef:\t4d 01 c4             \tadd    %r8,%r12\n    6ef2:\t4c 89 fa             \tmov    %r15,%rdx\n    6ef5:\t4d 01 c2             \tadd    %r8,%r10\n    6ef8:\t4c 01 c1             \tadd    %r8,%rcx\n    6efb:\t4d 01 c7             \tadd    %r8,%r15\n    6efe:\t4d 01 c6             \tadd    %r8,%r14\n    6f01:\t4c 89 fa             \tmov    
%r15,%rdx\n    6f04:\t4d 01 c4             \tadd    %r8,%r12\n    6f07:\t4d 01 c3             \tadd    %r8,%r11\n    6f0a:\t4d 01 c2             \tadd    %r8,%r10\n    6f0d:\t4c 01 c1             \tadd    %r8,%rcx\n    6f10:\t4c 89 fa             \tmov    %r15,%rdx\n    6f13:\t4d 01 c6             \tadd    %r8,%r14\n    6f16:\t4d 01 c5             \tadd    %r8,%r13\n    6f19:\t4d 01 c4             \tadd    %r8,%r12\n    6f1c:\t4d 01 c3             \tadd    %r8,%r11\n    6f1f:\t4c 89 fa             \tmov    %r15,%rdx\n    6f22:\t4c 29 cf             \tsub    %r9,%rdi\n    6f25:\t75 bf                \tjne    6ee6 <addmovtest_loop>\n    6f27:\t41 5f                \tpop    %r15\n    6f29:\t41 5e                \tpop    %r14\n    6f2b:\t41 5d                \tpop    %r13\n    6f2d:\t41 5c                \tpop    %r12\n    6f2f:\t41 5b                \tpop    %r11\n    6f31:\t41 5a                \tpop    %r10\n    6f33:\t41 59                \tpop    %r9\n    6f35:\t41 58                \tpop    %r8\n    6f37:\t59                   \tpop    %rcx\n    6f38:\t5b                   \tpop    %rbx\n    6f39:\tc3                   \tretq   \n\n0000000000006f3a <rortest>:\n    6f3a:\t53                   \tpush   %rbx\n    6f3b:\t51                   \tpush   %rcx\n    6f3c:\t41 50                \tpush   %r8\n    6f3e:\t41 51                \tpush   %r9\n    6f40:\t41 52                \tpush   %r10\n    6f42:\t41 53                \tpush   %r11\n    6f44:\t41 54                \tpush   %r12\n    6f46:\t41 55                \tpush   %r13\n    6f48:\t41 56                \tpush   %r14\n    6f4a:\t41 57                \tpush   %r15\n    6f4c:\t49 c7 c0 01 00 00 00 \tmov    $0x1,%r8\n    6f53:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n    6f5a:\t4c 89 c3             \tmov    %r8,%rbx\n    6f5d:\t4c 89 c1             \tmov    %r8,%rcx\n    6f60:\t4d 89 c2             \tmov    %r8,%r10\n    6f63:\t4d 89 c3             \tmov    %r8,%r11\n    6f66:\t4d 89 c4             \tmov    
%r8,%r12\n    6f69:\t4d 89 c5             \tmov    %r8,%r13\n    6f6c:\t4d 89 c6             \tmov    %r8,%r14\n    6f6f:\t4d 89 c7             \tmov    %r8,%r15\n\n0000000000006f72 <rortest_loop>:\n    6f72:\t49 d1 cf             \tror    %r15\n    6f75:\t49 d1 ce             \tror    %r14\n    6f78:\t49 d1 cd             \tror    %r13\n    6f7b:\t49 d1 cc             \tror    %r12\n    6f7e:\t49 d1 cb             \tror    %r11\n    6f81:\t49 d1 cf             \tror    %r15\n    6f84:\t49 d1 ce             \tror    %r14\n    6f87:\t49 d1 cd             \tror    %r13\n    6f8a:\t49 d1 cc             \tror    %r12\n    6f8d:\t49 d1 cb             \tror    %r11\n    6f90:\t49 d1 cf             \tror    %r15\n    6f93:\t49 d1 ce             \tror    %r14\n    6f96:\t49 d1 cd             \tror    %r13\n    6f99:\t49 d1 cc             \tror    %r12\n    6f9c:\t49 d1 cb             \tror    %r11\n    6f9f:\t49 d1 cf             \tror    %r15\n    6fa2:\t49 d1 ce             \tror    %r14\n    6fa5:\t49 d1 cd             \tror    %r13\n    6fa8:\t49 d1 cc             \tror    %r12\n    6fab:\t49 d1 cb             \tror    %r11\n    6fae:\t4c 29 cf             \tsub    %r9,%rdi\n    6fb1:\t75 bf                \tjne    6f72 <rortest_loop>\n    6fb3:\t41 5f                \tpop    %r15\n    6fb5:\t41 5e                \tpop    %r14\n    6fb7:\t41 5d                \tpop    %r13\n    6fb9:\t41 5c                \tpop    %r12\n    6fbb:\t41 5b                \tpop    %r11\n    6fbd:\t41 5a                \tpop    %r10\n    6fbf:\t41 59                \tpop    %r9\n    6fc1:\t41 58                \tpop    %r8\n    6fc3:\t59                   \tpop    %rcx\n    6fc4:\t5b                   \tpop    %rbx\n    6fc5:\tc3                   \tretq   \n\n0000000000006fc6 <shltest>:\n    6fc6:\t53                   \tpush   %rbx\n    6fc7:\t51                   \tpush   %rcx\n    6fc8:\t41 50                \tpush   %r8\n    6fca:\t41 51                \tpush   %r9\n    6fcc:\t41 52    
            \tpush   %r10\n    6fce:\t41 53                \tpush   %r11\n    6fd0:\t41 54                \tpush   %r12\n    6fd2:\t41 55                \tpush   %r13\n    6fd4:\t41 56                \tpush   %r14\n    6fd6:\t41 57                \tpush   %r15\n    6fd8:\t49 c7 c0 01 00 00 00 \tmov    $0x1,%r8\n    6fdf:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n    6fe6:\t4c 89 c3             \tmov    %r8,%rbx\n    6fe9:\t4c 89 c1             \tmov    %r8,%rcx\n    6fec:\t4d 89 c2             \tmov    %r8,%r10\n    6fef:\t4d 89 c3             \tmov    %r8,%r11\n    6ff2:\t4d 89 c4             \tmov    %r8,%r12\n    6ff5:\t4d 89 c5             \tmov    %r8,%r13\n    6ff8:\t4d 89 c6             \tmov    %r8,%r14\n    6ffb:\t4d 89 c7             \tmov    %r8,%r15\n\n0000000000006ffe <shltest_loop>:\n    6ffe:\t49 d1 e7             \tshl    %r15\n    7001:\t49 d1 e6             \tshl    %r14\n    7004:\t49 d1 e5             \tshl    %r13\n    7007:\t49 d1 e4             \tshl    %r12\n    700a:\t49 d1 e3             \tshl    %r11\n    700d:\t49 d1 e7             \tshl    %r15\n    7010:\t49 d1 e6             \tshl    %r14\n    7013:\t49 d1 e5             \tshl    %r13\n    7016:\t49 d1 e4             \tshl    %r12\n    7019:\t49 d1 e3             \tshl    %r11\n    701c:\t49 d1 e7             \tshl    %r15\n    701f:\t49 d1 e6             \tshl    %r14\n    7022:\t49 d1 e5             \tshl    %r13\n    7025:\t49 d1 e4             \tshl    %r12\n    7028:\t49 d1 e3             \tshl    %r11\n    702b:\t49 d1 e7             \tshl    %r15\n    702e:\t49 d1 e6             \tshl    %r14\n    7031:\t49 d1 e5             \tshl    %r13\n    7034:\t49 d1 e4             \tshl    %r12\n    7037:\t49 d1 e3             \tshl    %r11\n    703a:\t4c 29 cf             \tsub    %r9,%rdi\n    703d:\t75 bf                \tjne    6ffe <shltest_loop>\n    703f:\t41 5f                \tpop    %r15\n    7041:\t41 5e                \tpop    %r14\n    7043:\t41 5d                \tpop    
%r13\n    7045:\t41 5c                \tpop    %r12\n    7047:\t41 5b                \tpop    %r11\n    7049:\t41 5a                \tpop    %r10\n    704b:\t41 59                \tpop    %r9\n    704d:\t41 58                \tpop    %r8\n    704f:\t59                   \tpop    %rcx\n    7050:\t5b                   \tpop    %rbx\n    7051:\tc3                   \tretq   \n\n0000000000007052 <mixrorshltest>:\n    7052:\t53                   \tpush   %rbx\n    7053:\t51                   \tpush   %rcx\n    7054:\t41 50                \tpush   %r8\n    7056:\t41 51                \tpush   %r9\n    7058:\t41 52                \tpush   %r10\n    705a:\t41 53                \tpush   %r11\n    705c:\t41 54                \tpush   %r12\n    705e:\t41 55                \tpush   %r13\n    7060:\t41 56                \tpush   %r14\n    7062:\t41 57                \tpush   %r15\n    7064:\t49 c7 c0 01 00 00 00 \tmov    $0x1,%r8\n    706b:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n    7072:\t4c 89 c3             \tmov    %r8,%rbx\n    7075:\t4c 89 c1             \tmov    %r8,%rcx\n    7078:\t4d 89 c2             \tmov    %r8,%r10\n    707b:\t4d 89 c3             \tmov    %r8,%r11\n    707e:\t4d 89 c4             \tmov    %r8,%r12\n    7081:\t4d 89 c5             \tmov    %r8,%r13\n    7084:\t4d 89 c6             \tmov    %r8,%r14\n    7087:\t4d 89 c7             \tmov    %r8,%r15\n\n000000000000708a <mixrorshltest_loop>:\n    708a:\t49 d1 cf             \tror    %r15\n    708d:\t49 d1 e6             \tshl    %r14\n    7090:\t49 d1 cd             \tror    %r13\n    7093:\t49 d1 e4             \tshl    %r12\n    7096:\t49 d1 cb             \tror    %r11\n    7099:\t49 d1 e7             \tshl    %r15\n    709c:\t49 d1 ce             \tror    %r14\n    709f:\t49 d1 e5             \tshl    %r13\n    70a2:\t49 d1 cc             \tror    %r12\n    70a5:\t49 d1 e3             \tshl    %r11\n    70a8:\t49 d1 cf             \tror    %r15\n    70ab:\t49 d1 e6             \tshl    %r14\n  
  70ae:\t49 d1 cd             \tror    %r13\n    70b1:\t49 d1 e4             \tshl    %r12\n    70b4:\t49 d1 cb             \tror    %r11\n    70b7:\t49 d1 e7             \tshl    %r15\n    70ba:\t49 d1 ce             \tror    %r14\n    70bd:\t49 d1 e5             \tshl    %r13\n    70c0:\t49 d1 cc             \tror    %r12\n    70c3:\t49 d1 e3             \tshl    %r11\n    70c6:\t4c 29 cf             \tsub    %r9,%rdi\n    70c9:\t75 bf                \tjne    708a <mixrorshltest_loop>\n    70cb:\t41 5f                \tpop    %r15\n    70cd:\t41 5e                \tpop    %r14\n    70cf:\t41 5d                \tpop    %r13\n    70d1:\t41 5c                \tpop    %r12\n    70d3:\t41 5b                \tpop    %r11\n    70d5:\t41 5a                \tpop    %r10\n    70d7:\t41 59                \tpop    %r9\n    70d9:\t41 58                \tpop    %r8\n    70db:\t59                   \tpop    %rcx\n    70dc:\t5b                   \tpop    %rbx\n    70dd:\tc3                   \tretq   \n\n00000000000070de <mixrormultest>:\n    70de:\t53                   \tpush   %rbx\n    70df:\t51                   \tpush   %rcx\n    70e0:\t56                   \tpush   %rsi\n    70e1:\t52                   \tpush   %rdx\n    70e2:\t41 50                \tpush   %r8\n    70e4:\t41 51                \tpush   %r9\n    70e6:\t41 52                \tpush   %r10\n    70e8:\t41 53                \tpush   %r11\n    70ea:\t41 54                \tpush   %r12\n    70ec:\t41 55                \tpush   %r13\n    70ee:\t41 56                \tpush   %r14\n    70f0:\t41 57                \tpush   %r15\n    70f2:\t49 c7 c0 03 00 00 00 \tmov    $0x3,%r8\n    70f9:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n    7100:\t4c 89 c3             \tmov    %r8,%rbx\n    7103:\t4c 89 c1             \tmov    %r8,%rcx\n    7106:\t4d 89 c2             \tmov    %r8,%r10\n    7109:\t4d 89 c3             \tmov    %r8,%r11\n    710c:\t4d 89 c4             \tmov    %r8,%r12\n    710f:\t4d 89 c5             \tmov  
  %r8,%r13\n    7112:\t4d 89 c6             \tmov    %r8,%r14\n    7115:\t4d 89 c7             \tmov    %r8,%r15\n\n0000000000007118 <mixrormultest_loop>:\n    7118:\t49 d1 cf             \tror    %r15\n    711b:\t4d 0f af f0          \timul   %r8,%r14\n    711f:\t4d 89 ce             \tmov    %r9,%r14\n    7122:\t49 d1 cd             \tror    %r13\n    7125:\t4d 0f af e0          \timul   %r8,%r12\n    7129:\t4d 89 cc             \tmov    %r9,%r12\n    712c:\t49 d1 cb             \tror    %r11\n    712f:\t4d 0f af d0          \timul   %r8,%r10\n    7133:\t4d 89 ca             \tmov    %r9,%r10\n    7136:\t48 d1 cb             \tror    %rbx\n    7139:\t49 0f af c8          \timul   %r8,%rcx\n    713d:\t4c 89 c9             \tmov    %r9,%rcx\n    7140:\t48 d1 ce             \tror    %rsi\n    7143:\t49 0f af c0          \timul   %r8,%rax\n    7147:\t4c 89 c8             \tmov    %r9,%rax\n    714a:\t49 d1 cf             \tror    %r15\n    714d:\t4d 0f af f0          \timul   %r8,%r14\n    7151:\t4d 89 ce             \tmov    %r9,%r14\n    7154:\t49 d1 cd             \tror    %r13\n    7157:\t4d 0f af e0          \timul   %r8,%r12\n    715b:\t4d 89 cc             \tmov    %r9,%r12\n    715e:\t49 d1 cb             \tror    %r11\n    7161:\t4d 0f af d0          \timul   %r8,%r10\n    7165:\t4d 89 ca             \tmov    %r9,%r10\n    7168:\t48 d1 cb             \tror    %rbx\n    716b:\t49 0f af c8          \timul   %r8,%rcx\n    716f:\t4c 89 c9             \tmov    %r9,%rcx\n    7172:\t48 d1 ce             \tror    %rsi\n    7175:\t49 0f af d0          \timul   %r8,%rdx\n    7179:\t4c 29 cf             \tsub    %r9,%rdi\n    717c:\t75 9a                \tjne    7118 <mixrormultest_loop>\n    717e:\t41 5f                \tpop    %r15\n    7180:\t41 5e                \tpop    %r14\n    7182:\t41 5d                \tpop    %r13\n    7184:\t41 5c                \tpop    %r12\n    7186:\t41 5b                \tpop    %r11\n    7188:\t41 5a                \tpop    %r10\n    
718a:\t41 59                \tpop    %r9\n    718c:\t41 58                \tpop    %r8\n    718e:\t5a                   \tpop    %rdx\n    718f:\t5e                   \tpop    %rsi\n    7190:\t59                   \tpop    %rcx\n    7191:\t5b                   \tpop    %rbx\n    7192:\tc3                   \tretq   \n\n0000000000007193 <rorbtstest>:\n    7193:\t53                   \tpush   %rbx\n    7194:\t51                   \tpush   %rcx\n    7195:\t52                   \tpush   %rdx\n    7196:\t56                   \tpush   %rsi\n    7197:\t41 50                \tpush   %r8\n    7199:\t41 51                \tpush   %r9\n    719b:\t41 52                \tpush   %r10\n    719d:\t41 53                \tpush   %r11\n    719f:\t41 54                \tpush   %r12\n    71a1:\t41 55                \tpush   %r13\n    71a3:\t41 56                \tpush   %r14\n    71a5:\t41 57                \tpush   %r15\n    71a7:\t49 c7 c0 01 00 00 00 \tmov    $0x1,%r8\n    71ae:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n    71b5:\t4c 89 c3             \tmov    %r8,%rbx\n    71b8:\t4c 89 c1             \tmov    %r8,%rcx\n    71bb:\t4d 89 c2             \tmov    %r8,%r10\n    71be:\t4d 89 c3             \tmov    %r8,%r11\n    71c1:\t4d 89 c4             \tmov    %r8,%r12\n    71c4:\t4d 89 c5             \tmov    %r8,%r13\n    71c7:\t4d 89 c6             \tmov    %r8,%r14\n    71ca:\t4d 89 c7             \tmov    %r8,%r15\n    71cd:\t49 ff c0             \tinc    %r8\n\n00000000000071d0 <rorbtstest_loop>:\n    71d0:\t4d 0f ab c7          \tbts    %r8,%r15\n    71d4:\t49 d1 ce             \tror    %r14\n    71d7:\t4d 0f ab c5          \tbts    %r8,%r13\n    71db:\t49 d1 cc             \tror    %r12\n    71de:\t4d 0f ab c3          \tbts    %r8,%r11\n    71e2:\t49 d1 ca             \tror    %r10\n    71e5:\t4c 0f ab c1          \tbts    %r8,%rcx\n    71e9:\t48 d1 cb             \tror    %rbx\n    71ec:\t4c 0f ab c2          \tbts    %r8,%rdx\n    71f0:\t48 d1 ce             \tror    
%rsi\n    71f3:\t4d 0f ab c7          \tbts    %r8,%r15\n    71f7:\t49 d1 ce             \tror    %r14\n    71fa:\t4d 0f ab c5          \tbts    %r8,%r13\n    71fe:\t49 d1 cc             \tror    %r12\n    7201:\t4d 0f ab c3          \tbts    %r8,%r11\n    7205:\t49 d1 ca             \tror    %r10\n    7208:\t4c 0f ab c1          \tbts    %r8,%rcx\n    720c:\t48 d1 cb             \tror    %rbx\n    720f:\t4c 0f ab c2          \tbts    %r8,%rdx\n    7213:\t48 d1 ce             \tror    %rsi\n    7216:\t4c 29 cf             \tsub    %r9,%rdi\n    7219:\t75 b5                \tjne    71d0 <rorbtstest_loop>\n    721b:\t41 5f                \tpop    %r15\n    721d:\t41 5e                \tpop    %r14\n    721f:\t41 5d                \tpop    %r13\n    7221:\t41 5c                \tpop    %r12\n    7223:\t41 5b                \tpop    %r11\n    7225:\t41 5a                \tpop    %r10\n    7227:\t41 59                \tpop    %r9\n    7229:\t41 58                \tpop    %r8\n    722b:\t5e                   \tpop    %rsi\n    722c:\t5a                   \tpop    %rdx\n    722d:\t59                   \tpop    %rcx\n    722e:\t5b                   \tpop    %rbx\n    722f:\tc3                   \tretq   \n\n0000000000007230 <btstest>:\n    7230:\t53                   \tpush   %rbx\n    7231:\t51                   \tpush   %rcx\n    7232:\t41 50                \tpush   %r8\n    7234:\t41 51                \tpush   %r9\n    7236:\t41 52                \tpush   %r10\n    7238:\t41 53                \tpush   %r11\n    723a:\t41 54                \tpush   %r12\n    723c:\t41 55                \tpush   %r13\n    723e:\t41 56                \tpush   %r14\n    7240:\t41 57                \tpush   %r15\n    7242:\t49 c7 c0 01 00 00 00 \tmov    $0x1,%r8\n    7249:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n    7250:\t4c 89 c3             \tmov    %r8,%rbx\n    7253:\t4c 89 c1             \tmov    %r8,%rcx\n    7256:\t4d 89 c2             \tmov    %r8,%r10\n    7259:\t4d 89 c3         
    \tmov    %r8,%r11\n    725c:\t4d 89 c4             \tmov    %r8,%r12\n    725f:\t4d 89 c5             \tmov    %r8,%r13\n    7262:\t4d 89 c6             \tmov    %r8,%r14\n    7265:\t4d 89 c7             \tmov    %r8,%r15\n    7268:\t49 ff c0             \tinc    %r8\n\n000000000000726b <btstest_loop>:\n    726b:\t4d 0f ab c7          \tbts    %r8,%r15\n    726f:\t4d 0f ab c6          \tbts    %r8,%r14\n    7273:\t4d 0f ab c5          \tbts    %r8,%r13\n    7277:\t4d 0f ab c4          \tbts    %r8,%r12\n    727b:\t4d 0f ab c3          \tbts    %r8,%r11\n    727f:\t4d 0f ab c7          \tbts    %r8,%r15\n    7283:\t4d 0f ab c6          \tbts    %r8,%r14\n    7287:\t4d 0f ab c5          \tbts    %r8,%r13\n    728b:\t4d 0f ab c4          \tbts    %r8,%r12\n    728f:\t4d 0f ab c3          \tbts    %r8,%r11\n    7293:\t4d 0f ab c7          \tbts    %r8,%r15\n    7297:\t4d 0f ab c6          \tbts    %r8,%r14\n    729b:\t4d 0f ab c5          \tbts    %r8,%r13\n    729f:\t4d 0f ab c4          \tbts    %r8,%r12\n    72a3:\t4d 0f ab c3          \tbts    %r8,%r11\n    72a7:\t4d 0f ab c7          \tbts    %r8,%r15\n    72ab:\t4d 0f ab c6          \tbts    %r8,%r14\n    72af:\t4d 0f ab c5          \tbts    %r8,%r13\n    72b3:\t4d 0f ab c4          \tbts    %r8,%r12\n    72b7:\t4d 0f ab c3          \tbts    %r8,%r11\n    72bb:\t4c 29 cf             \tsub    %r9,%rdi\n    72be:\t75 ab                \tjne    726b <btstest_loop>\n    72c0:\t41 5f                \tpop    %r15\n    72c2:\t41 5e                \tpop    %r14\n    72c4:\t41 5d                \tpop    %r13\n    72c6:\t41 5c                \tpop    %r12\n    72c8:\t41 5b                \tpop    %r11\n    72ca:\t41 5a                \tpop    %r10\n    72cc:\t41 59                \tpop    %r9\n    72ce:\t41 58                \tpop    %r8\n    72d0:\t59                   \tpop    %rcx\n    72d1:\t5b                   \tpop    %rbx\n    72d2:\tc3                   \tretq   \n\n00000000000072d3 <leatest>:\n    72d3:\t53   
                \tpush   %rbx\n    72d4:\t51                   \tpush   %rcx\n    72d5:\t41 50                \tpush   %r8\n    72d7:\t41 51                \tpush   %r9\n    72d9:\t41 52                \tpush   %r10\n    72db:\t41 53                \tpush   %r11\n    72dd:\t41 54                \tpush   %r12\n    72df:\t41 55                \tpush   %r13\n    72e1:\t41 56                \tpush   %r14\n    72e3:\t41 57                \tpush   %r15\n    72e5:\t49 c7 c0 01 00 00 00 \tmov    $0x1,%r8\n    72ec:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n    72f3:\t4c 89 c3             \tmov    %r8,%rbx\n    72f6:\t4c 89 c1             \tmov    %r8,%rcx\n    72f9:\t4d 89 c2             \tmov    %r8,%r10\n    72fc:\t4d 89 c3             \tmov    %r8,%r11\n    72ff:\t4d 89 c4             \tmov    %r8,%r12\n    7302:\t4d 89 c5             \tmov    %r8,%r13\n    7305:\t4d 89 c6             \tmov    %r8,%r14\n    7308:\t4d 89 c7             \tmov    %r8,%r15\n    730b:\t49 ff c0             \tinc    %r8\n\n000000000000730e <leatest_loop>:\n    730e:\t4f 8d 14 d1          \tlea    (%r9,%r10,8),%r10\n    7312:\t4f 8d 1c d9          \tlea    (%r9,%r11,8),%r11\n    7316:\t4f 8d 24 e1          \tlea    (%r9,%r12,8),%r12\n    731a:\t4f 8d 2c e9          \tlea    (%r9,%r13,8),%r13\n    731e:\t4f 8d 34 f1          \tlea    (%r9,%r14,8),%r14\n    7322:\t4f 8d 3c f9          \tlea    (%r9,%r15,8),%r15\n    7326:\t4f 8d 14 d1          \tlea    (%r9,%r10,8),%r10\n    732a:\t4f 8d 1c d9          \tlea    (%r9,%r11,8),%r11\n    732e:\t4f 8d 24 e1          \tlea    (%r9,%r12,8),%r12\n    7332:\t4f 8d 2c e9          \tlea    (%r9,%r13,8),%r13\n    7336:\t4f 8d 34 f1          \tlea    (%r9,%r14,8),%r14\n    733a:\t4f 8d 3c f9          \tlea    (%r9,%r15,8),%r15\n    733e:\t4f 8d 14 d1          \tlea    (%r9,%r10,8),%r10\n    7342:\t4f 8d 1c d9          \tlea    (%r9,%r11,8),%r11\n    7346:\t4f 8d 24 e1          \tlea    (%r9,%r12,8),%r12\n    734a:\t4f 8d 2c e9          \tlea    
(%r9,%r13,8),%r13\n    734e:\t4f 8d 34 f1          \tlea    (%r9,%r14,8),%r14\n    7352:\t4f 8d 3c f9          \tlea    (%r9,%r15,8),%r15\n    7356:\t4f 8d 14 d1          \tlea    (%r9,%r10,8),%r10\n    735a:\t4f 8d 1c d9          \tlea    (%r9,%r11,8),%r11\n    735e:\t4c 29 cf             \tsub    %r9,%rdi\n    7361:\t75 ab                \tjne    730e <leatest_loop>\n    7363:\t41 5f                \tpop    %r15\n    7365:\t41 5e                \tpop    %r14\n    7367:\t41 5d                \tpop    %r13\n    7369:\t41 5c                \tpop    %r12\n    736b:\t41 5b                \tpop    %r11\n    736d:\t41 5a                \tpop    %r10\n    736f:\t41 59                \tpop    %r9\n    7371:\t41 58                \tpop    %r8\n    7373:\t59                   \tpop    %rcx\n    7374:\t5b                   \tpop    %rbx\n    7375:\tc3                   \tretq   \n\n0000000000007376 <leamultest>:\n    7376:\t53                   \tpush   %rbx\n    7377:\t51                   \tpush   %rcx\n    7378:\t52                   \tpush   %rdx\n    7379:\t56                   \tpush   %rsi\n    737a:\t41 50                \tpush   %r8\n    737c:\t41 51                \tpush   %r9\n    737e:\t41 52                \tpush   %r10\n    7380:\t41 53                \tpush   %r11\n    7382:\t41 54                \tpush   %r12\n    7384:\t41 55                \tpush   %r13\n    7386:\t41 56                \tpush   %r14\n    7388:\t41 57                \tpush   %r15\n    738a:\t49 c7 c0 01 00 00 00 \tmov    $0x1,%r8\n    7391:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n    7398:\t4c 89 c3             \tmov    %r8,%rbx\n    739b:\t4c 89 c1             \tmov    %r8,%rcx\n    739e:\t4d 89 c2             \tmov    %r8,%r10\n    73a1:\t4d 89 c3             \tmov    %r8,%r11\n    73a4:\t4d 89 c4             \tmov    %r8,%r12\n    73a7:\t4d 89 c5             \tmov    %r8,%r13\n    73aa:\t4d 89 c6             \tmov    %r8,%r14\n    73ad:\t4d 89 c7             \tmov    %r8,%r15\n    
73b0:\t49 ff c0             \tinc    %r8\n\n00000000000073b3 <leamultest_loop>:\n    73b3:\t4f 8d 3c f9          \tlea    (%r9,%r15,8),%r15\n    73b7:\t4d 0f af f0          \timul   %r8,%r14\n    73bb:\t4d 89 c6             \tmov    %r8,%r14\n    73be:\t4f 8d 2c e9          \tlea    (%r9,%r13,8),%r13\n    73c2:\t4d 0f af e0          \timul   %r8,%r12\n    73c6:\t4d 89 c4             \tmov    %r8,%r12\n    73c9:\t4f 8d 1c d9          \tlea    (%r9,%r11,8),%r11\n    73cd:\t4d 0f af d0          \timul   %r8,%r10\n    73d1:\t4d 89 c2             \tmov    %r8,%r10\n    73d4:\t49 8d 1c d9          \tlea    (%r9,%rbx,8),%rbx\n    73d8:\t49 0f af c8          \timul   %r8,%rcx\n    73dc:\t4c 89 c1             \tmov    %r8,%rcx\n    73df:\t49 8d 14 d1          \tlea    (%r9,%rdx,8),%rdx\n    73e3:\t49 0f af c0          \timul   %r8,%rax\n    73e7:\t4f 8d 3c f9          \tlea    (%r9,%r15,8),%r15\n    73eb:\t4d 0f af f0          \timul   %r8,%r14\n    73ef:\t4f 8d 2c e9          \tlea    (%r9,%r13,8),%r13\n    73f3:\t4d 0f af e0          \timul   %r8,%r12\n    73f7:\t4f 8d 1c d9          \tlea    (%r9,%r11,8),%r11\n    73fb:\t4d 0f af d0          \timul   %r8,%r10\n    73ff:\t49 8d 1c d9          \tlea    (%r9,%rbx,8),%rbx\n    7403:\t49 0f af c8          \timul   %r8,%rcx\n    7407:\t49 8d 14 d1          \tlea    (%r9,%rdx,8),%rdx\n    740b:\t49 0f af c0          \timul   %r8,%rax\n    740f:\t4c 29 cf             \tsub    %r9,%rdi\n    7412:\t75 9f                \tjne    73b3 <leamultest_loop>\n    7414:\t41 5f                \tpop    %r15\n    7416:\t41 5e                \tpop    %r14\n    7418:\t41 5d                \tpop    %r13\n    741a:\t41 5c                \tpop    %r12\n    741c:\t41 5b                \tpop    %r11\n    741e:\t41 5a                \tpop    %r10\n    7420:\t41 59                \tpop    %r9\n    7422:\t41 58                \tpop    %r8\n    7424:\t5e                   \tpop    %rsi\n    7425:\t5a                   \tpop    %rdx\n    7426:\t59        
           \tpop    %rcx\n    7427:\t5b                   \tpop    %rbx\n    7428:\tc3                   \tretq   \n\n0000000000007429 <btsmultest>:\n    7429:\t53                   \tpush   %rbx\n    742a:\t51                   \tpush   %rcx\n    742b:\t56                   \tpush   %rsi\n    742c:\t52                   \tpush   %rdx\n    742d:\t41 50                \tpush   %r8\n    742f:\t41 51                \tpush   %r9\n    7431:\t41 52                \tpush   %r10\n    7433:\t41 53                \tpush   %r11\n    7435:\t41 54                \tpush   %r12\n    7437:\t41 55                \tpush   %r13\n    7439:\t41 56                \tpush   %r14\n    743b:\t41 57                \tpush   %r15\n    743d:\t49 c7 c0 01 00 00 00 \tmov    $0x1,%r8\n    7444:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n    744b:\t4c 89 c3             \tmov    %r8,%rbx\n    744e:\t4c 89 c2             \tmov    %r8,%rdx\n    7451:\t4c 89 c6             \tmov    %r8,%rsi\n    7454:\t4c 89 c1             \tmov    %r8,%rcx\n    7457:\t4d 89 c2             \tmov    %r8,%r10\n    745a:\t4d 89 c3             \tmov    %r8,%r11\n    745d:\t4d 89 c4             \tmov    %r8,%r12\n    7460:\t4d 89 c5             \tmov    %r8,%r13\n    7463:\t4d 89 c6             \tmov    %r8,%r14\n    7466:\t4d 89 c7             \tmov    %r8,%r15\n    7469:\t49 ff c0             \tinc    %r8\n\n000000000000746c <btsmultest_loop>:\n    746c:\t4d 0f af f0          \timul   %r8,%r14\n    7470:\t4d 0f ab c5          \tbts    %r8,%r13\n    7474:\t4d 89 c5             \tmov    %r8,%r13\n    7477:\t4d 0f af e0          \timul   %r8,%r12\n    747b:\t4d 0f ab c3          \tbts    %r8,%r11\n    747f:\t4d 89 c3             \tmov    %r8,%r11\n    7482:\t4d 0f af d0          \timul   %r8,%r10\n    7486:\t4c 0f ab c3          \tbts    %r8,%rbx\n    748a:\t49 0f af c8          \timul   %r8,%rcx\n    748e:\t4c 89 c1             \tmov    %r8,%rcx\n    7491:\t4c 0f ab c6          \tbts    %r8,%rsi\n    7495:\t49 0f af c0       
   \timul   %r8,%rax\n    7499:\t4c 89 c0             \tmov    %r8,%rax\n    749c:\t4d 0f ab c7          \tbts    %r8,%r15\n    74a0:\t4d 0f af f0          \timul   %r8,%r14\n    74a4:\t4d 89 c6             \tmov    %r8,%r14\n    74a7:\t4d 0f ab c5          \tbts    %r8,%r13\n    74ab:\t4d 0f af e0          \timul   %r8,%r12\n    74af:\t4d 89 c4             \tmov    %r8,%r12\n    74b2:\t4d 0f ab c3          \tbts    %r8,%r11\n    74b6:\t4d 0f af d0          \timul   %r8,%r10\n    74ba:\t4d 89 c2             \tmov    %r8,%r10\n    74bd:\t4c 0f ab c3          \tbts    %r8,%rbx\n    74c1:\t49 0f af c8          \timul   %r8,%rcx\n    74c5:\t4c 89 c1             \tmov    %r8,%rcx\n    74c8:\t4c 0f ab c6          \tbts    %r8,%rsi\n    74cc:\t49 0f af d0          \timul   %r8,%rdx\n    74d0:\t4c 89 c2             \tmov    %r8,%rdx\n    74d3:\t4d 0f ab c3          \tbts    %r8,%r11\n    74d7:\t4c 29 cf             \tsub    %r9,%rdi\n    74da:\t75 90                \tjne    746c <btsmultest_loop>\n    74dc:\t41 5f                \tpop    %r15\n    74de:\t41 5e                \tpop    %r14\n    74e0:\t41 5d                \tpop    %r13\n    74e2:\t41 5c                \tpop    %r12\n    74e4:\t41 5b                \tpop    %r11\n    74e6:\t41 5a                \tpop    %r10\n    74e8:\t41 59                \tpop    %r9\n    74ea:\t41 58                \tpop    %r8\n    74ec:\t5a                   \tpop    %rdx\n    74ed:\t5e                   \tpop    %rsi\n    74ee:\t59                   \tpop    %rcx\n    74ef:\t5b                   \tpop    %rbx\n    74f0:\tc3                   \tretq   \n\n00000000000074f1 <jmptest>:\n    74f1:\t56                   \tpush   %rsi\n    74f2:\t53                   \tpush   %rbx\n    74f3:\t51                   \tpush   %rcx\n    74f4:\t52                   \tpush   %rdx\n    74f5:\t41 50                \tpush   %r8\n    74f7:\t41 51                \tpush   %r9\n    74f9:\t41 52                \tpush   %r10\n    74fb:\t41 53                
\tpush   %r11\n    74fd:\t41 54                \tpush   %r12\n    74ff:\t41 55                \tpush   %r13\n    7501:\t41 56                \tpush   %r14\n    7503:\t41 57                \tpush   %r15\n    7505:\t49 c7 c0 01 00 00 00 \tmov    $0x1,%r8\n    750c:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n    7513:\t48 31 db             \txor    %rbx,%rbx\n    7516:\t48 31 c9             \txor    %rcx,%rcx\n    7519:\t4d 31 d2             \txor    %r10,%r10\n    751c:\t4d 31 db             \txor    %r11,%r11\n    751f:\t4d 31 e4             \txor    %r12,%r12\n    7522:\t4d 31 ed             \txor    %r13,%r13\n    7525:\t4d 31 f6             \txor    %r14,%r14\n    7528:\t4d 31 ff             \txor    %r15,%r15\n    752b:\t48 31 f6             \txor    %rsi,%rsi\n    752e:\t4d 89 c2             \tmov    %r8,%r10\n    7531:\t4d 89 c3             \tmov    %r8,%r11\n    7534:\t4c 89 c6             \tmov    %r8,%rsi\n    7537:\t4c 89 c0             \tmov    %r8,%rax\n    753a:\t4c 89 c2             \tmov    %r8,%rdx\n\n000000000000753d <jmptest_loop>:\n    753d:\teb 04                \tjmp    7543 <jmptest1>\n    753f:\t48 83 c0 01          \tadd    $0x1,%rax\n\n0000000000007543 <jmptest1>:\n    7543:\teb 04                \tjmp    7549 <jmptest2>\n    7545:\t48 83 c0 02          \tadd    $0x2,%rax\n\n0000000000007549 <jmptest2>:\n    7549:\teb 04                \tjmp    754f <jmptest3>\n    754b:\t48 83 c0 03          \tadd    $0x3,%rax\n\n000000000000754f <jmptest3>:\n    754f:\teb 04                \tjmp    7555 <jmptest4>\n    7551:\t48 83 c0 04          \tadd    $0x4,%rax\n\n0000000000007555 <jmptest4>:\n    7555:\teb 04                \tjmp    755b <jmptest5>\n    7557:\t48 83 c0 05          \tadd    $0x5,%rax\n\n000000000000755b <jmptest5>:\n    755b:\teb 04                \tjmp    7561 <jmptest6>\n    755d:\t48 83 c0 06          \tadd    $0x6,%rax\n\n0000000000007561 <jmptest6>:\n    7561:\teb 04                \tjmp    7567 <jmptest7>\n    7563:\t48 83 c0 07   
       \tadd    $0x7,%rax\n\n0000000000007567 <jmptest7>:\n    7567:\teb 04                \tjmp    756d <jmptest8>\n    7569:\t48 83 c0 08          \tadd    $0x8,%rax\n\n000000000000756d <jmptest8>:\n    756d:\teb 04                \tjmp    7573 <jmptest9>\n    756f:\t48 83 c0 09          \tadd    $0x9,%rax\n\n0000000000007573 <jmptest9>:\n    7573:\teb 04                \tjmp    7579 <jmptest10>\n    7575:\t48 83 c0 0a          \tadd    $0xa,%rax\n\n0000000000007579 <jmptest10>:\n    7579:\teb 04                \tjmp    757f <jmptest11>\n    757b:\t48 83 c0 0b          \tadd    $0xb,%rax\n\n000000000000757f <jmptest11>:\n    757f:\teb 04                \tjmp    7585 <jmptest12>\n    7581:\t48 83 c0 0c          \tadd    $0xc,%rax\n\n0000000000007585 <jmptest12>:\n    7585:\teb 04                \tjmp    758b <jmptest13>\n    7587:\t48 83 c0 0d          \tadd    $0xd,%rax\n\n000000000000758b <jmptest13>:\n    758b:\teb 04                \tjmp    7591 <jmptest14>\n    758d:\t48 83 c0 0e          \tadd    $0xe,%rax\n\n0000000000007591 <jmptest14>:\n    7591:\teb 04                \tjmp    7597 <jmptest15>\n    7593:\t48 83 c0 0f          \tadd    $0xf,%rax\n\n0000000000007597 <jmptest15>:\n    7597:\teb 04                \tjmp    759d <jmptest16>\n    7599:\t48 83 c0 10          \tadd    $0x10,%rax\n\n000000000000759d <jmptest16>:\n    759d:\teb 04                \tjmp    75a3 <jmptest17>\n    759f:\t48 83 c0 11          \tadd    $0x11,%rax\n\n00000000000075a3 <jmptest17>:\n    75a3:\teb 04                \tjmp    75a9 <jmptest18>\n    75a5:\t48 83 c0 12          \tadd    $0x12,%rax\n\n00000000000075a9 <jmptest18>:\n    75a9:\teb 04                \tjmp    75af <jmptest19>\n    75ab:\t48 83 c0 13          \tadd    $0x13,%rax\n\n00000000000075af <jmptest19>:\n    75af:\t4c 29 cf             \tsub    %r9,%rdi\n    75b2:\t75 89                \tjne    753d <jmptest_loop>\n\n00000000000075b4 <jmptest_jellydonut>:\n    75b4:\t41 5f                \tpop    %r15\n    
75b6:\t41 5e                \tpop    %r14\n    75b8:\t41 5d                \tpop    %r13\n    75ba:\t41 5c                \tpop    %r12\n    75bc:\t41 5b                \tpop    %r11\n    75be:\t41 5a                \tpop    %r10\n    75c0:\t41 59                \tpop    %r9\n    75c2:\t41 58                \tpop    %r8\n    75c4:\t5a                   \tpop    %rdx\n    75c5:\t59                   \tpop    %rcx\n    75c6:\t5b                   \tpop    %rbx\n    75c7:\t5e                   \tpop    %rsi\n    75c8:\tc3                   \tretq   \n\n00000000000075c9 <ntjmptest>:\n    75c9:\t56                   \tpush   %rsi\n    75ca:\t53                   \tpush   %rbx\n    75cb:\t51                   \tpush   %rcx\n    75cc:\t52                   \tpush   %rdx\n    75cd:\t41 50                \tpush   %r8\n    75cf:\t41 51                \tpush   %r9\n    75d1:\t41 52                \tpush   %r10\n    75d3:\t41 53                \tpush   %r11\n    75d5:\t41 54                \tpush   %r12\n    75d7:\t41 55                \tpush   %r13\n    75d9:\t41 56                \tpush   %r14\n    75db:\t41 57                \tpush   %r15\n    75dd:\t49 c7 c0 01 00 00 00 \tmov    $0x1,%r8\n    75e4:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n    75eb:\t48 31 db             \txor    %rbx,%rbx\n    75ee:\t48 31 c9             \txor    %rcx,%rcx\n    75f1:\t4d 31 d2             \txor    %r10,%r10\n    75f4:\t4d 31 db             \txor    %r11,%r11\n    75f7:\t4d 31 e4             \txor    %r12,%r12\n    75fa:\t4d 31 ed             \txor    %r13,%r13\n    75fd:\t4d 31 f6             \txor    %r14,%r14\n    7600:\t4d 31 ff             \txor    %r15,%r15\n    7603:\t48 31 f6             \txor    %rsi,%rsi\n    7606:\t4d 89 c2             \tmov    %r8,%r10\n    7609:\t4d 89 c3             \tmov    %r8,%r11\n    760c:\t4c 89 c6             \tmov    %r8,%rsi\n    760f:\t4c 89 c0             \tmov    %r8,%rax\n    7612:\t4c 89 c2             \tmov    %r8,%rdx\n\n0000000000007615 
<ntjmptest_loop>:\n    7615:\t4d 39 c1             \tcmp    %r8,%r9\n    7618:\t0f 84 71 01 00 00    \tje     778f <jmpmultest_jellydonut>\n    761e:\t4d 39 c1             \tcmp    %r8,%r9\n    7621:\t0f 84 68 01 00 00    \tje     778f <jmpmultest_jellydonut>\n    7627:\t4d 39 c1             \tcmp    %r8,%r9\n    762a:\t0f 84 5f 01 00 00    \tje     778f <jmpmultest_jellydonut>\n    7630:\t4d 39 c1             \tcmp    %r8,%r9\n    7633:\t0f 84 56 01 00 00    \tje     778f <jmpmultest_jellydonut>\n    7639:\t4d 39 c1             \tcmp    %r8,%r9\n    763c:\t0f 84 4d 01 00 00    \tje     778f <jmpmultest_jellydonut>\n    7642:\t4d 39 c1             \tcmp    %r8,%r9\n    7645:\t0f 84 44 01 00 00    \tje     778f <jmpmultest_jellydonut>\n    764b:\t4d 39 c1             \tcmp    %r8,%r9\n    764e:\t0f 84 3b 01 00 00    \tje     778f <jmpmultest_jellydonut>\n    7654:\t4d 39 c1             \tcmp    %r8,%r9\n    7657:\t0f 84 32 01 00 00    \tje     778f <jmpmultest_jellydonut>\n    765d:\t4d 39 c1             \tcmp    %r8,%r9\n    7660:\t0f 84 29 01 00 00    \tje     778f <jmpmultest_jellydonut>\n    7666:\t4d 39 c1             \tcmp    %r8,%r9\n    7669:\t0f 84 20 01 00 00    \tje     778f <jmpmultest_jellydonut>\n    766f:\t4d 39 c1             \tcmp    %r8,%r9\n    7672:\t0f 84 17 01 00 00    \tje     778f <jmpmultest_jellydonut>\n    7678:\t4d 39 c1             \tcmp    %r8,%r9\n    767b:\t0f 84 0e 01 00 00    \tje     778f <jmpmultest_jellydonut>\n    7681:\t4d 39 c1             \tcmp    %r8,%r9\n    7684:\t0f 84 05 01 00 00    \tje     778f <jmpmultest_jellydonut>\n    768a:\t4d 39 c1             \tcmp    %r8,%r9\n    768d:\t0f 84 fc 00 00 00    \tje     778f <jmpmultest_jellydonut>\n    7693:\t4d 39 c1             \tcmp    %r8,%r9\n    7696:\t0f 84 f3 00 00 00    \tje     778f <jmpmultest_jellydonut>\n    769c:\t4d 39 c1             \tcmp    %r8,%r9\n    769f:\t0f 84 ea 00 00 00    \tje     778f <jmpmultest_jellydonut>\n    76a5:\t4d 39 c1             \tcmp    
%r8,%r9\n    76a8:\t0f 84 e1 00 00 00    \tje     778f <jmpmultest_jellydonut>\n    76ae:\t4d 39 c1             \tcmp    %r8,%r9\n    76b1:\t0f 84 d8 00 00 00    \tje     778f <jmpmultest_jellydonut>\n    76b7:\t4d 39 c1             \tcmp    %r8,%r9\n    76ba:\t0f 84 cf 00 00 00    \tje     778f <jmpmultest_jellydonut>\n    76c0:\t4d 39 c1             \tcmp    %r8,%r9\n    76c3:\t0f 84 c6 00 00 00    \tje     778f <jmpmultest_jellydonut>\n    76c9:\t4c 29 cf             \tsub    %r9,%rdi\n    76cc:\t0f 85 43 ff ff ff    \tjne    7615 <ntjmptest_loop>\n\n00000000000076d2 <ntjmptest_jellydonut>:\n    76d2:\t41 5f                \tpop    %r15\n    76d4:\t41 5e                \tpop    %r14\n    76d6:\t41 5d                \tpop    %r13\n    76d8:\t41 5c                \tpop    %r12\n    76da:\t41 5b                \tpop    %r11\n    76dc:\t41 5a                \tpop    %r10\n    76de:\t41 59                \tpop    %r9\n    76e0:\t41 58                \tpop    %r8\n    76e2:\t5a                   \tpop    %rdx\n    76e3:\t59                   \tpop    %rcx\n    76e4:\t5b                   \tpop    %rbx\n    76e5:\t5e                   \tpop    %rsi\n    76e6:\tc3                   \tretq   \n\n00000000000076e7 <jmpmultest>:\n    76e7:\t56                   \tpush   %rsi\n    76e8:\t53                   \tpush   %rbx\n    76e9:\t51                   \tpush   %rcx\n    76ea:\t52                   \tpush   %rdx\n    76eb:\t41 50                \tpush   %r8\n    76ed:\t41 51                \tpush   %r9\n    76ef:\t41 52                \tpush   %r10\n    76f1:\t41 53                \tpush   %r11\n    76f3:\t41 54                \tpush   %r12\n    76f5:\t41 55                \tpush   %r13\n    76f7:\t41 56                \tpush   %r14\n    76f9:\t41 57                \tpush   %r15\n    76fb:\t49 c7 c0 02 00 00 00 \tmov    $0x2,%r8\n    7702:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n    7709:\t48 31 db             \txor    %rbx,%rbx\n    770c:\t48 31 c9             \txor    
%rcx,%rcx\n    770f:\t4d 31 db             \txor    %r11,%r11\n    7712:\t4d 31 e4             \txor    %r12,%r12\n    7715:\t4d 31 ed             \txor    %r13,%r13\n    7718:\t4d 31 f6             \txor    %r14,%r14\n    771b:\t4d 31 ff             \txor    %r15,%r15\n    771e:\t48 31 f6             \txor    %rsi,%rsi\n    7721:\t4d 89 c2             \tmov    %r8,%r10\n    7724:\t4d 89 c3             \tmov    %r8,%r11\n    7727:\t4c 89 c6             \tmov    %r8,%rsi\n    772a:\t4c 89 c0             \tmov    %r8,%rax\n    772d:\t4c 89 c2             \tmov    %r8,%rdx\n\n0000000000007730 <jmpmultest_loop>:\n    7730:\t4d 39 c1             \tcmp    %r8,%r9\n    7733:\t74 5a                \tje     778f <jmpmultest_jellydonut>\n    7735:\t45 0f af d0          \timul   %r8d,%r10d\n    7739:\t4d 39 c1             \tcmp    %r8,%r9\n    773c:\t74 51                \tje     778f <jmpmultest_jellydonut>\n    773e:\t41 0f af f0          \timul   %r8d,%esi\n    7742:\t4d 39 c1             \tcmp    %r8,%r9\n    7745:\t74 48                \tje     778f <jmpmultest_jellydonut>\n    7747:\t41 0f af d8          \timul   %r8d,%ebx\n    774b:\t4d 39 c1             \tcmp    %r8,%r9\n    774e:\t74 3f                \tje     778f <jmpmultest_jellydonut>\n    7750:\t41 0f af d0          \timul   %r8d,%edx\n    7754:\t4d 39 c1             \tcmp    %r8,%r9\n    7757:\t74 36                \tje     778f <jmpmultest_jellydonut>\n    7759:\t45 0f af d0          \timul   %r8d,%r10d\n    775d:\t4d 39 c1             \tcmp    %r8,%r9\n    7760:\t74 2d                \tje     778f <jmpmultest_jellydonut>\n    7762:\t41 0f af f0          \timul   %r8d,%esi\n    7766:\t4d 39 c1             \tcmp    %r8,%r9\n    7769:\t74 24                \tje     778f <jmpmultest_jellydonut>\n    776b:\t41 0f af d8          \timul   %r8d,%ebx\n    776f:\t4d 39 c1             \tcmp    %r8,%r9\n    7772:\t74 1b                \tje     778f <jmpmultest_jellydonut>\n    7774:\t41 0f af d0          \timul   
%r8d,%edx\n    7778:\t4d 39 c1             \tcmp    %r8,%r9\n    777b:\t74 12                \tje     778f <jmpmultest_jellydonut>\n    777d:\t45 0f af f8          \timul   %r8d,%r15d\n    7781:\t4d 39 c1             \tcmp    %r8,%r9\n    7784:\t74 09                \tje     778f <jmpmultest_jellydonut>\n    7786:\t45 0f af f0          \timul   %r8d,%r14d\n    778a:\t4c 29 cf             \tsub    %r9,%rdi\n    778d:\t75 a1                \tjne    7730 <jmpmultest_loop>\n\n000000000000778f <jmpmultest_jellydonut>:\n    778f:\t41 5f                \tpop    %r15\n    7791:\t41 5e                \tpop    %r14\n    7793:\t41 5d                \tpop    %r13\n    7795:\t41 5c                \tpop    %r12\n    7797:\t41 5b                \tpop    %r11\n    7799:\t41 5a                \tpop    %r10\n    779b:\t41 59                \tpop    %r9\n    779d:\t41 58                \tpop    %r8\n    779f:\t5a                   \tpop    %rdx\n    77a0:\t59                   \tpop    %rcx\n    77a1:\t5b                   \tpop    %rbx\n    77a2:\t5e                   \tpop    %rsi\n    77a3:\tc3                   \tretq   \n\n00000000000077a4 <addmultest>:\n    77a4:\t56                   \tpush   %rsi\n    77a5:\t53                   \tpush   %rbx\n    77a6:\t51                   \tpush   %rcx\n    77a7:\t52                   \tpush   %rdx\n    77a8:\t41 50                \tpush   %r8\n    77aa:\t41 51                \tpush   %r9\n    77ac:\t41 52                \tpush   %r10\n    77ae:\t41 53                \tpush   %r11\n    77b0:\t41 54                \tpush   %r12\n    77b2:\t41 55                \tpush   %r13\n    77b4:\t41 56                \tpush   %r14\n    77b6:\t41 57                \tpush   %r15\n    77b8:\t49 c7 c0 01 00 00 00 \tmov    $0x1,%r8\n    77bf:\t49 c7 c1 28 00 00 00 \tmov    $0x28,%r9\n    77c6:\t48 31 db             \txor    %rbx,%rbx\n    77c9:\t48 31 c9             \txor    %rcx,%rcx\n    77cc:\t4d 31 d2             \txor    %r10,%r10\n    77cf:\t4d 31 db 
            \txor    %r11,%r11\n    77d2:\t4d 31 e4             \txor    %r12,%r12\n    77d5:\t4d 31 ed             \txor    %r13,%r13\n    77d8:\t4d 31 f6             \txor    %r14,%r14\n    77db:\t4d 31 ff             \txor    %r15,%r15\n    77de:\t48 31 f6             \txor    %rsi,%rsi\n    77e1:\t4d 89 c2             \tmov    %r8,%r10\n    77e4:\t4d 89 c3             \tmov    %r8,%r11\n    77e7:\t4c 89 c6             \tmov    %r8,%rsi\n    77ea:\t4c 89 c0             \tmov    %r8,%rax\n    77ed:\t4c 89 c2             \tmov    %r8,%rdx\n\n00000000000077f0 <addmultest_loop>:\n    77f0:\t4d 01 c7             \tadd    %r8,%r15\n    77f3:\t4d 01 c6             \tadd    %r8,%r14\n    77f6:\t4d 01 c5             \tadd    %r8,%r13\n    77f9:\t4d 01 c4             \tadd    %r8,%r12\n    77fc:\t4d 0f af d0          \timul   %r8,%r10\n    7800:\t4d 01 c7             \tadd    %r8,%r15\n    7803:\t4d 01 c6             \tadd    %r8,%r14\n    7806:\t4d 01 c5             \tadd    %r8,%r13\n    7809:\t4d 01 c4             \tadd    %r8,%r12\n    780c:\t49 0f af f0          \timul   %r8,%rsi\n    7810:\t4d 01 c7             \tadd    %r8,%r15\n    7813:\t4d 01 c6             \tadd    %r8,%r14\n    7816:\t4d 01 c5             \tadd    %r8,%r13\n    7819:\t4d 01 c4             \tadd    %r8,%r12\n    781c:\t49 0f af d8          \timul   %r8,%rbx\n    7820:\t4d 01 c7             \tadd    %r8,%r15\n    7823:\t4d 01 c7             \tadd    %r8,%r15\n    7826:\t4d 01 c5             \tadd    %r8,%r13\n    7829:\t4d 01 c4             \tadd    %r8,%r12\n    782c:\t49 0f af d0          \timul   %r8,%rdx\n    7830:\t4d 01 c7             \tadd    %r8,%r15\n    7833:\t4d 01 c6             \tadd    %r8,%r14\n    7836:\t4d 01 c5             \tadd    %r8,%r13\n    7839:\t4d 01 c4             \tadd    %r8,%r12\n    783c:\t4d 0f af d0          \timul   %r8,%r10\n    7840:\t4d 01 c7             \tadd    %r8,%r15\n    7843:\t4d 01 c6             \tadd    %r8,%r14\n    7846:\t4d 01 c5             
\tadd    %r8,%r13\n    7849:\t4d 01 c4             \tadd    %r8,%r12\n    784c:\t49 0f af f0          \timul   %r8,%rsi\n    7850:\t4d 01 c7             \tadd    %r8,%r15\n    7853:\t4d 01 c6             \tadd    %r8,%r14\n    7856:\t4d 01 c5             \tadd    %r8,%r13\n    7859:\t4d 01 c4             \tadd    %r8,%r12\n    785c:\t49 0f af d8          \timul   %r8,%rbx\n    7860:\t4d 01 c7             \tadd    %r8,%r15\n    7863:\t4d 01 c5             \tadd    %r8,%r13\n    7866:\t4d 01 c4             \tadd    %r8,%r12\n    7869:\t49 0f af d0          \timul   %r8,%rdx\n    786d:\t4c 29 cf             \tsub    %r9,%rdi\n    7870:\t0f 85 7a ff ff ff    \tjne    77f0 <addmultest_loop>\n    7876:\t41 5f                \tpop    %r15\n    7878:\t41 5e                \tpop    %r14\n    787a:\t41 5d                \tpop    %r13\n    787c:\t41 5c                \tpop    %r12\n    787e:\t41 5b                \tpop    %r11\n    7880:\t41 5a                \tpop    %r10\n    7882:\t41 59                \tpop    %r9\n    7884:\t41 58                \tpop    %r8\n    7886:\t5a                   \tpop    %rdx\n    7887:\t59                   \tpop    %rcx\n    7888:\t5b                   \tpop    %rbx\n    7889:\t5e                   \tpop    %rsi\n    788a:\tc3                   \tretq   \n\n000000000000788b <add256int>:\n    788b:\t41 51                \tpush   %r9\n    788d:\t41 50                \tpush   %r8\n    788f:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n    7896:\t66 49 0f 6e c9       \tmovq   %r9,%xmm1\n    789b:\tc5 fe 6f c8          \tvmovdqu %ymm0,%ymm1\n    789f:\tc5 fe 6f d0          \tvmovdqu %ymm0,%ymm2\n    78a3:\tc5 fe 6f d8          \tvmovdqu %ymm0,%ymm3\n    78a7:\tc5 fe 6f e0          \tvmovdqu %ymm0,%ymm4\n    78ab:\tc5 fe 6f e8          \tvmovdqu %ymm0,%ymm5\n\n00000000000078af <add256int_loop>:\n    78af:\tc5 f5 d4 c8          \tvpaddq %ymm0,%ymm1,%ymm1\n    78b3:\tc5 ed d4 d0          \tvpaddq %ymm0,%ymm2,%ymm2\n    78b7:\tc5 e5 d4 d8          
\tvpaddq %ymm0,%ymm3,%ymm3\n    78bb:\tc5 dd d4 e0          \tvpaddq %ymm0,%ymm4,%ymm4\n    78bf:\tc5 d5 d4 e8          \tvpaddq %ymm0,%ymm5,%ymm5\n    78c3:\tc5 f5 d4 c8          \tvpaddq %ymm0,%ymm1,%ymm1\n    78c7:\tc5 ed d4 d0          \tvpaddq %ymm0,%ymm2,%ymm2\n    78cb:\tc5 e5 d4 d8          \tvpaddq %ymm0,%ymm3,%ymm3\n    78cf:\tc5 dd d4 e0          \tvpaddq %ymm0,%ymm4,%ymm4\n    78d3:\tc5 d5 d4 e8          \tvpaddq %ymm0,%ymm5,%ymm5\n    78d7:\tc5 f5 d4 c8          \tvpaddq %ymm0,%ymm1,%ymm1\n    78db:\tc5 ed d4 d0          \tvpaddq %ymm0,%ymm2,%ymm2\n    78df:\tc5 e5 d4 d8          \tvpaddq %ymm0,%ymm3,%ymm3\n    78e3:\tc5 dd d4 e0          \tvpaddq %ymm0,%ymm4,%ymm4\n    78e7:\tc5 d5 d4 e8          \tvpaddq %ymm0,%ymm5,%ymm5\n    78eb:\tc5 f5 d4 c8          \tvpaddq %ymm0,%ymm1,%ymm1\n    78ef:\tc5 ed d4 d0          \tvpaddq %ymm0,%ymm2,%ymm2\n    78f3:\tc5 e5 d4 d8          \tvpaddq %ymm0,%ymm3,%ymm3\n    78f7:\tc5 dd d4 e0          \tvpaddq %ymm0,%ymm4,%ymm4\n    78fb:\tc5 d5 d4 e8          \tvpaddq %ymm0,%ymm5,%ymm5\n    78ff:\t4c 29 cf             \tsub    %r9,%rdi\n    7902:\t75 ab                \tjne    78af <add256int_loop>\n    7904:\t66 48 0f 7e c8       \tmovq   %xmm1,%rax\n    7909:\tc5 f8 77             \tvzeroupper \n    790c:\t41 58                \tpop    %r8\n    790e:\t41 59                \tpop    %r9\n    7910:\tc3                   \tretq   \n\n0000000000007911 <mul512int>:\n    7911:\t41 51                \tpush   %r9\n    7913:\t41 50                \tpush   %r8\n    7915:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n    791c:\t66 49 0f 6e c9       \tmovq   %r9,%xmm1\n    7921:\t62 f2 fd 48 59 c1    \tvpbroadcastq %xmm1,%zmm0\n    7927:\t62 f1 fe 48 6f c8    \tvmovdqu64 %zmm0,%zmm1\n    792d:\t62 f1 fe 48 6f d0    \tvmovdqu64 %zmm0,%zmm2\n    7933:\t62 f1 fe 48 6f d8    \tvmovdqu64 %zmm0,%zmm3\n    7939:\t62 f1 fe 48 6f e0    \tvmovdqu64 %zmm0,%zmm4\n    793f:\t62 f1 fe 48 6f e8    \tvmovdqu64 %zmm0,%zmm5\n\n0000000000007945 
<mul512int_loop>:\n    7945:\t62 f2 75 48 40 c8    \tvpmulld %zmm0,%zmm1,%zmm1\n    794b:\t62 f2 6d 48 40 d0    \tvpmulld %zmm0,%zmm2,%zmm2\n    7951:\t62 f2 65 48 40 d8    \tvpmulld %zmm0,%zmm3,%zmm3\n    7957:\t62 f2 5d 48 40 e0    \tvpmulld %zmm0,%zmm4,%zmm4\n    795d:\t62 f2 55 48 40 e8    \tvpmulld %zmm0,%zmm5,%zmm5\n    7963:\t62 f2 75 48 40 c8    \tvpmulld %zmm0,%zmm1,%zmm1\n    7969:\t62 f2 6d 48 40 d0    \tvpmulld %zmm0,%zmm2,%zmm2\n    796f:\t62 f2 65 48 40 d8    \tvpmulld %zmm0,%zmm3,%zmm3\n    7975:\t62 f2 5d 48 40 e0    \tvpmulld %zmm0,%zmm4,%zmm4\n    797b:\t62 f2 55 48 40 e8    \tvpmulld %zmm0,%zmm5,%zmm5\n    7981:\t62 f2 75 48 40 c8    \tvpmulld %zmm0,%zmm1,%zmm1\n    7987:\t62 f2 6d 48 40 d0    \tvpmulld %zmm0,%zmm2,%zmm2\n    798d:\t62 f2 65 48 40 d8    \tvpmulld %zmm0,%zmm3,%zmm3\n    7993:\t62 f2 5d 48 40 e0    \tvpmulld %zmm0,%zmm4,%zmm4\n    7999:\t62 f2 55 48 40 e8    \tvpmulld %zmm0,%zmm5,%zmm5\n    799f:\t62 f2 75 48 40 c8    \tvpmulld %zmm0,%zmm1,%zmm1\n    79a5:\t62 f2 6d 48 40 d0    \tvpmulld %zmm0,%zmm2,%zmm2\n    79ab:\t62 f2 65 48 40 d8    \tvpmulld %zmm0,%zmm3,%zmm3\n    79b1:\t62 f2 5d 48 40 e0    \tvpmulld %zmm0,%zmm4,%zmm4\n    79b7:\t62 f2 55 48 40 e8    \tvpmulld %zmm0,%zmm5,%zmm5\n    79bd:\t4c 29 cf             \tsub    %r9,%rdi\n    79c0:\t75 83                \tjne    7945 <mul512int_loop>\n    79c2:\t66 48 0f 7e c8       \tmovq   %xmm1,%rax\n    79c7:\tc5 f8 77             \tvzeroupper \n    79ca:\t41 58                \tpop    %r8\n    79cc:\t41 59                \tpop    %r9\n    79ce:\tc3                   \tretq   \n\n00000000000079cf <muldq512int>:\n    79cf:\t41 51                \tpush   %r9\n    79d1:\t41 50                \tpush   %r8\n    79d3:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n    79da:\t66 49 0f 6e c9       \tmovq   %r9,%xmm1\n    79df:\t62 f2 fd 48 59 c1    \tvpbroadcastq %xmm1,%zmm0\n    79e5:\t62 f1 fe 48 6f c8    \tvmovdqu64 %zmm0,%zmm1\n    79eb:\t62 f1 fe 48 6f d0    \tvmovdqu64 %zmm0,%zmm2\n    
79f1:\t62 f1 fe 48 6f d8    \tvmovdqu64 %zmm0,%zmm3\n    79f7:\t62 f1 fe 48 6f e0    \tvmovdqu64 %zmm0,%zmm4\n    79fd:\t62 f1 fe 48 6f e8    \tvmovdqu64 %zmm0,%zmm5\n\n0000000000007a03 <muldq512int_loop>:\n    7a03:\t62 f2 f5 48 28 c8    \tvpmuldq %zmm0,%zmm1,%zmm1\n    7a09:\t62 f2 ed 48 28 d0    \tvpmuldq %zmm0,%zmm2,%zmm2\n    7a0f:\t62 f2 e5 48 28 d8    \tvpmuldq %zmm0,%zmm3,%zmm3\n    7a15:\t62 f2 dd 48 28 e0    \tvpmuldq %zmm0,%zmm4,%zmm4\n    7a1b:\t62 f2 d5 48 28 e8    \tvpmuldq %zmm0,%zmm5,%zmm5\n    7a21:\t62 f2 f5 48 28 c8    \tvpmuldq %zmm0,%zmm1,%zmm1\n    7a27:\t62 f2 ed 48 28 d0    \tvpmuldq %zmm0,%zmm2,%zmm2\n    7a2d:\t62 f2 e5 48 28 d8    \tvpmuldq %zmm0,%zmm3,%zmm3\n    7a33:\t62 f2 dd 48 28 e0    \tvpmuldq %zmm0,%zmm4,%zmm4\n    7a39:\t62 f2 d5 48 28 e8    \tvpmuldq %zmm0,%zmm5,%zmm5\n    7a3f:\t62 f2 f5 48 28 c8    \tvpmuldq %zmm0,%zmm1,%zmm1\n    7a45:\t62 f2 ed 48 28 d0    \tvpmuldq %zmm0,%zmm2,%zmm2\n    7a4b:\t62 f2 e5 48 28 d8    \tvpmuldq %zmm0,%zmm3,%zmm3\n    7a51:\t62 f2 dd 48 28 e0    \tvpmuldq %zmm0,%zmm4,%zmm4\n    7a57:\t62 f2 d5 48 28 e8    \tvpmuldq %zmm0,%zmm5,%zmm5\n    7a5d:\t62 f2 f5 48 28 c8    \tvpmuldq %zmm0,%zmm1,%zmm1\n    7a63:\t62 f2 ed 48 28 d0    \tvpmuldq %zmm0,%zmm2,%zmm2\n    7a69:\t62 f2 e5 48 28 d8    \tvpmuldq %zmm0,%zmm3,%zmm3\n    7a6f:\t62 f2 dd 48 28 e0    \tvpmuldq %zmm0,%zmm4,%zmm4\n    7a75:\t62 f2 d5 48 28 e8    \tvpmuldq %zmm0,%zmm5,%zmm5\n    7a7b:\t4c 29 cf             \tsub    %r9,%rdi\n    7a7e:\t75 83                \tjne    7a03 <muldq512int_loop>\n    7a80:\t66 48 0f 7e c8       \tmovq   %xmm1,%rax\n    7a85:\tc5 f8 77             \tvzeroupper \n    7a88:\t41 58                \tpop    %r8\n    7a8a:\t41 59                \tpop    %r9\n    7a8c:\tc3                   \tretq   \n\n0000000000007a8d <add512int>:\n    7a8d:\t41 51                \tpush   %r9\n    7a8f:\t41 50                \tpush   %r8\n    7a91:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n    7a98:\t66 49 0f 6e c9       \tmovq   
%r9,%xmm1\n    7a9d:\t62 f2 fd 48 59 c1    \tvpbroadcastq %xmm1,%zmm0\n    7aa3:\t62 f1 fe 48 6f c8    \tvmovdqu64 %zmm0,%zmm1\n    7aa9:\t62 f1 fe 48 6f d0    \tvmovdqu64 %zmm0,%zmm2\n    7aaf:\t62 f1 fe 48 6f d8    \tvmovdqu64 %zmm0,%zmm3\n    7ab5:\t62 f1 fe 48 6f e0    \tvmovdqu64 %zmm0,%zmm4\n    7abb:\t62 f1 fe 48 6f e8    \tvmovdqu64 %zmm0,%zmm5\n\n0000000000007ac1 <add512int_loop>:\n    7ac1:\t62 f1 f5 48 d4 c8    \tvpaddq %zmm0,%zmm1,%zmm1\n    7ac7:\t62 f1 ed 48 d4 d0    \tvpaddq %zmm0,%zmm2,%zmm2\n    7acd:\t62 f1 e5 48 d4 d8    \tvpaddq %zmm0,%zmm3,%zmm3\n    7ad3:\t62 f1 dd 48 d4 e0    \tvpaddq %zmm0,%zmm4,%zmm4\n    7ad9:\t62 f1 d5 48 d4 e8    \tvpaddq %zmm0,%zmm5,%zmm5\n    7adf:\t62 f1 f5 48 d4 c8    \tvpaddq %zmm0,%zmm1,%zmm1\n    7ae5:\t62 f1 ed 48 d4 d0    \tvpaddq %zmm0,%zmm2,%zmm2\n    7aeb:\t62 f1 e5 48 d4 d8    \tvpaddq %zmm0,%zmm3,%zmm3\n    7af1:\t62 f1 dd 48 d4 e0    \tvpaddq %zmm0,%zmm4,%zmm4\n    7af7:\t62 f1 d5 48 d4 e8    \tvpaddq %zmm0,%zmm5,%zmm5\n    7afd:\t62 f1 f5 48 d4 c8    \tvpaddq %zmm0,%zmm1,%zmm1\n    7b03:\t62 f1 ed 48 d4 d0    \tvpaddq %zmm0,%zmm2,%zmm2\n    7b09:\t62 f1 e5 48 d4 d8    \tvpaddq %zmm0,%zmm3,%zmm3\n    7b0f:\t62 f1 dd 48 d4 e0    \tvpaddq %zmm0,%zmm4,%zmm4\n    7b15:\t62 f1 d5 48 d4 e8    \tvpaddq %zmm0,%zmm5,%zmm5\n    7b1b:\t62 f1 f5 48 d4 c8    \tvpaddq %zmm0,%zmm1,%zmm1\n    7b21:\t62 f1 ed 48 d4 d0    \tvpaddq %zmm0,%zmm2,%zmm2\n    7b27:\t62 f1 e5 48 d4 d8    \tvpaddq %zmm0,%zmm3,%zmm3\n    7b2d:\t62 f1 dd 48 d4 e0    \tvpaddq %zmm0,%zmm4,%zmm4\n    7b33:\t62 f1 d5 48 d4 e8    \tvpaddq %zmm0,%zmm5,%zmm5\n    7b39:\t4c 29 cf             \tsub    %r9,%rdi\n    7b3c:\t75 83                \tjne    7ac1 <add512int_loop>\n    7b3e:\t66 48 0f 7e c8       \tmovq   %xmm1,%rax\n    7b43:\tc5 f8 77             \tvzeroupper \n    7b46:\t41 58                \tpop    %r8\n    7b48:\t41 59                \tpop    %r9\n    7b4a:\tc3                   \tretq   \n\n0000000000007b4b <mixadd256fpint>:\n    7b4b:\t41 51  
              \tpush   %r9\n    7b4d:\t41 50                \tpush   %r8\n    7b4f:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n    7b56:\t66 49 0f 6e c9       \tmovq   %r9,%xmm1\n    7b5b:\tc4 e2 7d 59 c1       \tvpbroadcastq %xmm1,%ymm0\n    7b60:\tc5 fe 6f c8          \tvmovdqu %ymm0,%ymm1\n    7b64:\tc5 fe 6f d0          \tvmovdqu %ymm0,%ymm2\n    7b68:\tc5 fe 6f d8          \tvmovdqu %ymm0,%ymm3\n    7b6c:\tc5 fe 6f e0          \tvmovdqu %ymm0,%ymm4\n    7b70:\tc5 fe 6f e8          \tvmovdqu %ymm0,%ymm5\n    7b74:\tf3 49 0f 2a f1       \tcvtsi2ss %r9,%xmm6\n    7b79:\tc4 e2 7d 18 f6       \tvbroadcastss %xmm6,%ymm6\n    7b7e:\tc5 fc 10 fe          \tvmovups %ymm6,%ymm7\n    7b82:\tc5 7c 10 c6          \tvmovups %ymm6,%ymm8\n    7b86:\tc5 7c 10 ce          \tvmovups %ymm6,%ymm9\n    7b8a:\tc5 7c 10 d6          \tvmovups %ymm6,%ymm10\n    7b8e:\tc5 7c 10 de          \tvmovups %ymm6,%ymm11\n\n0000000000007b92 <mixadd256fpint_loop>:\n    7b92:\tc5 f5 d4 c8          \tvpaddq %ymm0,%ymm1,%ymm1\n    7b96:\tc5 c4 58 fe          \tvaddps %ymm6,%ymm7,%ymm7\n    7b9a:\tc5 ed d4 d0          \tvpaddq %ymm0,%ymm2,%ymm2\n    7b9e:\tc5 3c 58 c6          \tvaddps %ymm6,%ymm8,%ymm8\n    7ba2:\tc5 e5 d4 d8          \tvpaddq %ymm0,%ymm3,%ymm3\n    7ba6:\tc5 34 58 ce          \tvaddps %ymm6,%ymm9,%ymm9\n    7baa:\tc5 dd d4 e0          \tvpaddq %ymm0,%ymm4,%ymm4\n    7bae:\tc5 2c 58 d6          \tvaddps %ymm6,%ymm10,%ymm10\n    7bb2:\tc5 d5 d4 e8          \tvpaddq %ymm0,%ymm5,%ymm5\n    7bb6:\tc5 24 58 de          \tvaddps %ymm6,%ymm11,%ymm11\n    7bba:\tc5 f5 d4 c8          \tvpaddq %ymm0,%ymm1,%ymm1\n    7bbe:\tc5 c4 58 fe          \tvaddps %ymm6,%ymm7,%ymm7\n    7bc2:\tc5 ed d4 d0          \tvpaddq %ymm0,%ymm2,%ymm2\n    7bc6:\tc5 3c 58 c6          \tvaddps %ymm6,%ymm8,%ymm8\n    7bca:\tc5 e5 d4 d8          \tvpaddq %ymm0,%ymm3,%ymm3\n    7bce:\tc5 34 58 ce          \tvaddps %ymm6,%ymm9,%ymm9\n    7bd2:\tc5 dd d4 e0          \tvpaddq %ymm0,%ymm4,%ymm4\n    7bd6:\tc5 2c 58 d6       
   \tvaddps %ymm6,%ymm10,%ymm10\n    7bda:\tc5 d5 d4 e8          \tvpaddq %ymm0,%ymm5,%ymm5\n    7bde:\tc5 24 58 de          \tvaddps %ymm6,%ymm11,%ymm11\n    7be2:\t4c 29 cf             \tsub    %r9,%rdi\n    7be5:\t75 ab                \tjne    7b92 <mixadd256fpint_loop>\n    7be7:\t66 48 0f 7e c8       \tmovq   %xmm1,%rax\n    7bec:\tc5 f8 77             \tvzeroupper \n    7bef:\t41 58                \tpop    %r8\n    7bf1:\t41 59                \tpop    %r9\n    7bf3:\tc3                   \tretq   \n\n0000000000007bf4 <mix256faddintadd>:\n    7bf4:\t41 51                \tpush   %r9\n    7bf6:\t41 50                \tpush   %r8\n    7bf8:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n    7bff:\t66 49 0f 6e c9       \tmovq   %r9,%xmm1\n    7c04:\tc4 62 7d 59 c1       \tvpbroadcastq %xmm1,%ymm8\n    7c09:\tf3 49 0f 2a f1       \tcvtsi2ss %r9,%xmm6\n    7c0e:\tc4 e2 7d 18 f6       \tvbroadcastss %xmm6,%ymm6\n    7c13:\tc5 fc 10 fe          \tvmovups %ymm6,%ymm7\n    7c17:\tc5 7c 10 ce          \tvmovups %ymm6,%ymm9\n    7c1b:\tc5 7c 10 de          \tvmovups %ymm6,%ymm11\n    7c1f:\tc5 7c 10 ee          \tvmovups %ymm6,%ymm13\n    7c23:\tc5 7c 10 fe          \tvmovups %ymm6,%ymm15\n    7c27:\tc4 41 7e 6f d0       \tvmovdqu %ymm8,%ymm10\n    7c2c:\tc4 41 7e 6f e0       \tvmovdqu %ymm8,%ymm12\n    7c31:\tc4 41 7e 6f f0       \tvmovdqu %ymm8,%ymm14\n\n0000000000007c36 <mix256faddintadd_loop>:\n    7c36:\tc5 c4 58 fe          \tvaddps %ymm6,%ymm7,%ymm7\n    7c3a:\tc4 41 3d fe c0       \tvpaddd %ymm8,%ymm8,%ymm8\n    7c3f:\tc5 34 58 ce          \tvaddps %ymm6,%ymm9,%ymm9\n    7c43:\tc4 41 2d fe d2       \tvpaddd %ymm10,%ymm10,%ymm10\n    7c48:\tc5 24 58 de          \tvaddps %ymm6,%ymm11,%ymm11\n    7c4c:\tc4 41 1d fe e4       \tvpaddd %ymm12,%ymm12,%ymm12\n    7c51:\tc5 14 58 ee          \tvaddps %ymm6,%ymm13,%ymm13\n    7c55:\tc4 41 0d fe f6       \tvpaddd %ymm14,%ymm14,%ymm14\n    7c5a:\tc5 04 58 fe          \tvaddps %ymm6,%ymm15,%ymm15\n    7c5e:\tc5 d5 fe ed          
\tvpaddd %ymm5,%ymm5,%ymm5\n    7c62:\tc5 c4 58 fe          \tvaddps %ymm6,%ymm7,%ymm7\n    7c66:\tc4 41 3d fe c0       \tvpaddd %ymm8,%ymm8,%ymm8\n    7c6b:\tc5 34 58 ce          \tvaddps %ymm6,%ymm9,%ymm9\n    7c6f:\tc4 41 2d fe d2       \tvpaddd %ymm10,%ymm10,%ymm10\n    7c74:\tc5 24 58 de          \tvaddps %ymm6,%ymm11,%ymm11\n    7c78:\tc4 41 1d fe e4       \tvpaddd %ymm12,%ymm12,%ymm12\n    7c7d:\tc5 14 58 ee          \tvaddps %ymm6,%ymm13,%ymm13\n    7c81:\tc4 41 0d fe f6       \tvpaddd %ymm14,%ymm14,%ymm14\n    7c86:\tc5 04 58 fe          \tvaddps %ymm6,%ymm15,%ymm15\n    7c8a:\tc5 d5 fe ed          \tvpaddd %ymm5,%ymm5,%ymm5\n    7c8e:\t4c 29 cf             \tsub    %r9,%rdi\n    7c91:\t75 a3                \tjne    7c36 <mix256faddintadd_loop>\n    7c93:\t66 48 0f 7e c8       \tmovq   %xmm1,%rax\n    7c98:\tc5 f8 77             \tvzeroupper \n    7c9b:\t41 58                \tpop    %r8\n    7c9d:\t41 59                \tpop    %r9\n    7c9f:\tc3                   \tretq   \n\n0000000000007ca0 <mix256fp>:\n    7ca0:\t41 51                \tpush   %r9\n    7ca2:\t41 50                \tpush   %r8\n    7ca4:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n    7cab:\t66 49 0f 6e c9       \tmovq   %r9,%xmm1\n    7cb0:\tf3 49 0f 2a f1       \tcvtsi2ss %r9,%xmm6\n    7cb5:\tc4 e2 7d 18 f6       \tvbroadcastss %xmm6,%ymm6\n    7cba:\tc5 fc 10 ee          \tvmovups %ymm6,%ymm5\n    7cbe:\tc5 fc 10 fe          \tvmovups %ymm6,%ymm7\n    7cc2:\tc5 7c 10 c6          \tvmovups %ymm6,%ymm8\n    7cc6:\tc5 7c 10 ce          \tvmovups %ymm6,%ymm9\n    7cca:\tc5 7c 10 d6          \tvmovups %ymm6,%ymm10\n    7cce:\tc5 7c 10 de          \tvmovups %ymm6,%ymm11\n    7cd2:\tc5 7c 10 e6          \tvmovups %ymm6,%ymm12\n    7cd6:\tc5 7c 10 ee          \tvmovups %ymm6,%ymm13\n    7cda:\tc5 7c 10 f6          \tvmovups %ymm6,%ymm14\n    7cde:\tc5 7c 10 fe          \tvmovups %ymm6,%ymm15\n\n0000000000007ce2 <mix256fp_loop>:\n    7ce2:\tc5 c4 58 fe          \tvaddps %ymm6,%ymm7,%ymm7\n    
7ce6:\tc5 3c 59 c6          \tvmulps %ymm6,%ymm8,%ymm8\n    7cea:\tc5 34 58 ce          \tvaddps %ymm6,%ymm9,%ymm9\n    7cee:\tc5 2c 59 d6          \tvmulps %ymm6,%ymm10,%ymm10\n    7cf2:\tc5 24 58 de          \tvaddps %ymm6,%ymm11,%ymm11\n    7cf6:\tc5 1c 59 e6          \tvmulps %ymm6,%ymm12,%ymm12\n    7cfa:\tc5 14 58 ee          \tvaddps %ymm6,%ymm13,%ymm13\n    7cfe:\tc5 0c 59 f6          \tvmulps %ymm6,%ymm14,%ymm14\n    7d02:\tc5 04 58 fe          \tvaddps %ymm6,%ymm15,%ymm15\n    7d06:\tc5 d4 59 ee          \tvmulps %ymm6,%ymm5,%ymm5\n    7d0a:\tc5 c4 58 fe          \tvaddps %ymm6,%ymm7,%ymm7\n    7d0e:\tc5 3c 59 c6          \tvmulps %ymm6,%ymm8,%ymm8\n    7d12:\tc5 34 58 ce          \tvaddps %ymm6,%ymm9,%ymm9\n    7d16:\tc5 2c 59 d6          \tvmulps %ymm6,%ymm10,%ymm10\n    7d1a:\tc5 24 58 de          \tvaddps %ymm6,%ymm11,%ymm11\n    7d1e:\tc5 1c 59 e6          \tvmulps %ymm6,%ymm12,%ymm12\n    7d22:\tc5 14 58 ee          \tvaddps %ymm6,%ymm13,%ymm13\n    7d26:\tc5 0c 59 f6          \tvmulps %ymm6,%ymm14,%ymm14\n    7d2a:\tc5 04 58 fe          \tvaddps %ymm6,%ymm15,%ymm15\n    7d2e:\tc5 d4 59 ee          \tvmulps %ymm6,%ymm5,%ymm5\n    7d32:\t4c 29 cf             \tsub    %r9,%rdi\n    7d35:\t75 ab                \tjne    7ce2 <mix256fp_loop>\n    7d37:\t66 48 0f 7e c8       \tmovq   %xmm1,%rax\n    7d3c:\tc5 f8 77             \tvzeroupper \n    7d3f:\t41 58                \tpop    %r8\n    7d41:\t41 59                \tpop    %r9\n    7d43:\tc3                   \tretq   \n\n0000000000007d44 <mixadd256int>:\n    7d44:\t41 51                \tpush   %r9\n    7d46:\t41 50                \tpush   %r8\n    7d48:\t41 57                \tpush   %r15\n    7d4a:\t41 56                \tpush   %r14\n    7d4c:\t41 55                \tpush   %r13\n    7d4e:\t41 54                \tpush   %r12\n    7d50:\t41 53                \tpush   %r11\n    7d52:\t49 c7 c1 1e 00 00 00 \tmov    $0x1e,%r9\n    7d59:\t66 49 0f 6e c9       \tmovq   %r9,%xmm1\n    7d5e:\tc4 e2 7d 59 
c1       \tvpbroadcastq %xmm1,%ymm0\n    7d63:\tc5 fe 6f c8          \tvmovdqu %ymm0,%ymm1\n    7d67:\tc5 fe 6f d0          \tvmovdqu %ymm0,%ymm2\n    7d6b:\tc5 fe 6f d8          \tvmovdqu %ymm0,%ymm3\n    7d6f:\tc5 fe 6f e0          \tvmovdqu %ymm0,%ymm4\n    7d73:\tc5 fe 6f e8          \tvmovdqu %ymm0,%ymm5\n    7d77:\t4d 89 cf             \tmov    %r9,%r15\n    7d7a:\t4d 89 ce             \tmov    %r9,%r14\n    7d7d:\t4d 89 cd             \tmov    %r9,%r13\n    7d80:\t4d 89 cc             \tmov    %r9,%r12\n    7d83:\t4d 89 cb             \tmov    %r9,%r11\n    7d86:\t4d 89 c8             \tmov    %r9,%r8\n\n0000000000007d89 <mixadd256int_loop>:\n    7d89:\t4d 01 c3             \tadd    %r8,%r11\n    7d8c:\t4d 01 c4             \tadd    %r8,%r12\n    7d8f:\t4d 01 c5             \tadd    %r8,%r13\n    7d92:\t4d 01 c6             \tadd    %r8,%r14\n    7d95:\t4d 01 c7             \tadd    %r8,%r15\n    7d98:\tc5 f5 d4 c8          \tvpaddq %ymm0,%ymm1,%ymm1\n    7d9c:\tc5 ed d4 d0          \tvpaddq %ymm0,%ymm2,%ymm2\n    7da0:\tc5 e5 d4 d8          \tvpaddq %ymm0,%ymm3,%ymm3\n    7da4:\tc5 dd d4 e0          \tvpaddq %ymm0,%ymm4,%ymm4\n    7da8:\tc5 d5 d4 e8          \tvpaddq %ymm0,%ymm5,%ymm5\n    7dac:\t4d 01 c3             \tadd    %r8,%r11\n    7daf:\t4d 01 c4             \tadd    %r8,%r12\n    7db2:\t4d 01 c5             \tadd    %r8,%r13\n    7db5:\t4d 01 c6             \tadd    %r8,%r14\n    7db8:\t4d 01 c7             \tadd    %r8,%r15\n    7dbb:\t4d 01 c3             \tadd    %r8,%r11\n    7dbe:\t4d 01 c4             \tadd    %r8,%r12\n    7dc1:\t4d 01 c5             \tadd    %r8,%r13\n    7dc4:\t4d 01 c6             \tadd    %r8,%r14\n    7dc7:\t4d 01 c7             \tadd    %r8,%r15\n    7dca:\tc5 f5 d4 c8          \tvpaddq %ymm0,%ymm1,%ymm1\n    7dce:\tc5 ed d4 d0          \tvpaddq %ymm0,%ymm2,%ymm2\n    7dd2:\tc5 e5 d4 d8          \tvpaddq %ymm0,%ymm3,%ymm3\n    7dd6:\tc5 dd d4 e0          \tvpaddq %ymm0,%ymm4,%ymm4\n    7dda:\tc5 d5 d4 e8          
\tvpaddq %ymm0,%ymm5,%ymm5\n    7dde:\t4d 01 c3             \tadd    %r8,%r11\n    7de1:\t4d 01 c4             \tadd    %r8,%r12\n    7de4:\t4d 01 c5             \tadd    %r8,%r13\n    7de7:\t4d 01 c6             \tadd    %r8,%r14\n    7dea:\t4d 01 c7             \tadd    %r8,%r15\n    7ded:\t4c 29 cf             \tsub    %r9,%rdi\n    7df0:\t75 97                \tjne    7d89 <mixadd256int_loop>\n    7df2:\t66 48 0f 7e c8       \tmovq   %xmm1,%rax\n    7df7:\tc5 f8 77             \tvzeroupper \n    7dfa:\t41 5b                \tpop    %r11\n    7dfc:\t41 5c                \tpop    %r12\n    7dfe:\t41 5d                \tpop    %r13\n    7e00:\t41 5e                \tpop    %r14\n    7e02:\t41 5f                \tpop    %r15\n    7e04:\t41 58                \tpop    %r8\n    7e06:\t41 59                \tpop    %r9\n    7e08:\tc3                   \tretq   \n\n0000000000007e09 <mixadd256int11>:\n    7e09:\t41 51                \tpush   %r9\n    7e0b:\t41 50                \tpush   %r8\n    7e0d:\t41 57                \tpush   %r15\n    7e0f:\t41 56                \tpush   %r14\n    7e11:\t41 55                \tpush   %r13\n    7e13:\t41 54                \tpush   %r12\n    7e15:\t41 53                \tpush   %r11\n    7e17:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n    7e1e:\t66 49 0f 6e c9       \tmovq   %r9,%xmm1\n    7e23:\tc4 e2 7d 59 c1       \tvpbroadcastq %xmm1,%ymm0\n    7e28:\tc5 fe 6f c8          \tvmovdqu %ymm0,%ymm1\n    7e2c:\tc5 fe 6f d0          \tvmovdqu %ymm0,%ymm2\n    7e30:\tc5 fe 6f d8          \tvmovdqu %ymm0,%ymm3\n    7e34:\tc5 fe 6f e0          \tvmovdqu %ymm0,%ymm4\n    7e38:\tc5 fe 6f e8          \tvmovdqu %ymm0,%ymm5\n    7e3c:\t4d 89 cf             \tmov    %r9,%r15\n    7e3f:\t4d 89 ce             \tmov    %r9,%r14\n    7e42:\t4d 89 cd             \tmov    %r9,%r13\n    7e45:\t4d 89 cc             \tmov    %r9,%r12\n    7e48:\t4d 89 cb             \tmov    %r9,%r11\n    7e4b:\t4d 89 c8             \tmov    %r9,%r8\n\n0000000000007e4e 
<mixadd256int11_loop>:\n    7e4e:\t4d 01 c3             \tadd    %r8,%r11\n    7e51:\t4d 01 c4             \tadd    %r8,%r12\n    7e54:\t4d 01 c5             \tadd    %r8,%r13\n    7e57:\t4d 01 c6             \tadd    %r8,%r14\n    7e5a:\t4d 01 c7             \tadd    %r8,%r15\n    7e5d:\tc5 f5 d4 c8          \tvpaddq %ymm0,%ymm1,%ymm1\n    7e61:\tc5 ed d4 d0          \tvpaddq %ymm0,%ymm2,%ymm2\n    7e65:\tc5 e5 d4 d8          \tvpaddq %ymm0,%ymm3,%ymm3\n    7e69:\tc5 dd d4 e0          \tvpaddq %ymm0,%ymm4,%ymm4\n    7e6d:\tc5 d5 d4 e8          \tvpaddq %ymm0,%ymm5,%ymm5\n    7e71:\t4d 01 c3             \tadd    %r8,%r11\n    7e74:\t4d 01 c4             \tadd    %r8,%r12\n    7e77:\t4d 01 c5             \tadd    %r8,%r13\n    7e7a:\t4d 01 c6             \tadd    %r8,%r14\n    7e7d:\t4d 01 c7             \tadd    %r8,%r15\n    7e80:\tc5 f5 d4 c8          \tvpaddq %ymm0,%ymm1,%ymm1\n    7e84:\tc5 ed d4 d0          \tvpaddq %ymm0,%ymm2,%ymm2\n    7e88:\tc5 e5 d4 d8          \tvpaddq %ymm0,%ymm3,%ymm3\n    7e8c:\tc5 dd d4 e0          \tvpaddq %ymm0,%ymm4,%ymm4\n    7e90:\tc5 d5 d4 e8          \tvpaddq %ymm0,%ymm5,%ymm5\n    7e94:\t4c 29 cf             \tsub    %r9,%rdi\n    7e97:\t75 b5                \tjne    7e4e <mixadd256int11_loop>\n    7e99:\t66 48 0f 7e c8       \tmovq   %xmm1,%rax\n    7e9e:\tc5 f8 77             \tvzeroupper \n    7ea1:\t41 5b                \tpop    %r11\n    7ea3:\t41 5c                \tpop    %r12\n    7ea5:\t41 5d                \tpop    %r13\n    7ea7:\t41 5e                \tpop    %r14\n    7ea9:\t41 5f                \tpop    %r15\n    7eab:\t41 58                \tpop    %r8\n    7ead:\t41 59                \tpop    %r9\n    7eaf:\tc3                   \tretq   \n\n0000000000007eb0 <latadd256int>:\n    7eb0:\t41 51                \tpush   %r9\n    7eb2:\t41 50                \tpush   %r8\n    7eb4:\t41 57                \tpush   %r15\n    7eb6:\t41 56                \tpush   %r14\n    7eb8:\t41 55                \tpush   %r13\n    
7eba:\t41 54                \tpush   %r12\n    7ebc:\t41 53                \tpush   %r11\n    7ebe:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n    7ec5:\t66 49 0f 6e c9       \tmovq   %r9,%xmm1\n    7eca:\tc4 e2 7d 59 c1       \tvpbroadcastq %xmm1,%ymm0\n    7ecf:\tc5 fe 6f c8          \tvmovdqu %ymm0,%ymm1\n    7ed3:\tc5 fe 6f d0          \tvmovdqu %ymm0,%ymm2\n    7ed7:\tc5 fe 6f d8          \tvmovdqu %ymm0,%ymm3\n    7edb:\tc5 fe 6f e0          \tvmovdqu %ymm0,%ymm4\n    7edf:\tc5 fe 6f e8          \tvmovdqu %ymm0,%ymm5\n\n0000000000007ee3 <latadd256int_loop>:\n    7ee3:\tc5 fd d4 c0          \tvpaddq %ymm0,%ymm0,%ymm0\n    7ee7:\tc5 fd d4 c0          \tvpaddq %ymm0,%ymm0,%ymm0\n    7eeb:\tc5 fd d4 c0          \tvpaddq %ymm0,%ymm0,%ymm0\n    7eef:\tc5 fd d4 c0          \tvpaddq %ymm0,%ymm0,%ymm0\n    7ef3:\tc5 fd d4 c0          \tvpaddq %ymm0,%ymm0,%ymm0\n    7ef7:\tc5 fd d4 c0          \tvpaddq %ymm0,%ymm0,%ymm0\n    7efb:\tc5 fd d4 c0          \tvpaddq %ymm0,%ymm0,%ymm0\n    7eff:\tc5 fd d4 c0          \tvpaddq %ymm0,%ymm0,%ymm0\n    7f03:\tc5 fd d4 c0          \tvpaddq %ymm0,%ymm0,%ymm0\n    7f07:\tc5 fd d4 c0          \tvpaddq %ymm0,%ymm0,%ymm0\n    7f0b:\tc5 fd d4 c0          \tvpaddq %ymm0,%ymm0,%ymm0\n    7f0f:\tc5 fd d4 c0          \tvpaddq %ymm0,%ymm0,%ymm0\n    7f13:\tc5 fd d4 c0          \tvpaddq %ymm0,%ymm0,%ymm0\n    7f17:\tc5 fd d4 c0          \tvpaddq %ymm0,%ymm0,%ymm0\n    7f1b:\tc5 fd d4 c0          \tvpaddq %ymm0,%ymm0,%ymm0\n    7f1f:\tc5 fd d4 c0          \tvpaddq %ymm0,%ymm0,%ymm0\n    7f23:\tc5 fd d4 c0          \tvpaddq %ymm0,%ymm0,%ymm0\n    7f27:\tc5 fd d4 c0          \tvpaddq %ymm0,%ymm0,%ymm0\n    7f2b:\tc5 fd d4 c0          \tvpaddq %ymm0,%ymm0,%ymm0\n    7f2f:\tc5 fd d4 c0          \tvpaddq %ymm0,%ymm0,%ymm0\n    7f33:\t4c 29 cf             \tsub    %r9,%rdi\n    7f36:\t75 ab                \tjne    7ee3 <latadd256int_loop>\n    7f38:\t66 48 0f 7e c8       \tmovq   %xmm1,%rax\n    7f3d:\tc5 f8 77             \tvzeroupper \n    
7f40:\t41 5b                \tpop    %r11\n    7f42:\t41 5c                \tpop    %r12\n    7f44:\t41 5d                \tpop    %r13\n    7f46:\t41 5e                \tpop    %r14\n    7f48:\t41 5f                \tpop    %r15\n    7f4a:\t41 58                \tpop    %r8\n    7f4c:\t41 59                \tpop    %r9\n    7f4e:\tc3                   \tretq   \n\n0000000000007f4f <latadd512int>:\n    7f4f:\t41 51                \tpush   %r9\n    7f51:\t41 50                \tpush   %r8\n    7f53:\t41 57                \tpush   %r15\n    7f55:\t41 56                \tpush   %r14\n    7f57:\t41 55                \tpush   %r13\n    7f59:\t41 54                \tpush   %r12\n    7f5b:\t41 53                \tpush   %r11\n    7f5d:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n    7f64:\t66 49 0f 6e c9       \tmovq   %r9,%xmm1\n    7f69:\t62 f2 fd 48 59 c1    \tvpbroadcastq %xmm1,%zmm0\n    7f6f:\t62 f1 fd 48 6f c8    \tvmovdqa64 %zmm0,%zmm1\n    7f75:\t62 f1 fd 48 6f d0    \tvmovdqa64 %zmm0,%zmm2\n    7f7b:\t62 f1 fd 48 6f d8    \tvmovdqa64 %zmm0,%zmm3\n    7f81:\t62 f1 fd 48 6f e0    \tvmovdqa64 %zmm0,%zmm4\n    7f87:\t62 f1 fd 48 6f e8    \tvmovdqa64 %zmm0,%zmm5\n\n0000000000007f8d <latadd51a2int_loop>:\n    7f8d:\t62 f1 fd 48 d4 c0    \tvpaddq %zmm0,%zmm0,%zmm0\n    7f93:\t62 f1 fd 48 d4 c0    \tvpaddq %zmm0,%zmm0,%zmm0\n    7f99:\t62 f1 fd 48 d4 c0    \tvpaddq %zmm0,%zmm0,%zmm0\n    7f9f:\t62 f1 fd 48 d4 c0    \tvpaddq %zmm0,%zmm0,%zmm0\n    7fa5:\t62 f1 fd 48 d4 c0    \tvpaddq %zmm0,%zmm0,%zmm0\n    7fab:\t62 f1 fd 48 d4 c0    \tvpaddq %zmm0,%zmm0,%zmm0\n    7fb1:\t62 f1 fd 48 d4 c0    \tvpaddq %zmm0,%zmm0,%zmm0\n    7fb7:\t62 f1 fd 48 d4 c0    \tvpaddq %zmm0,%zmm0,%zmm0\n    7fbd:\t62 f1 fd 48 d4 c0    \tvpaddq %zmm0,%zmm0,%zmm0\n    7fc3:\t62 f1 fd 48 d4 c0    \tvpaddq %zmm0,%zmm0,%zmm0\n    7fc9:\t62 f1 fd 48 d4 c0    \tvpaddq %zmm0,%zmm0,%zmm0\n    7fcf:\t62 f1 fd 48 d4 c0    \tvpaddq %zmm0,%zmm0,%zmm0\n    7fd5:\t62 f1 fd 48 d4 c0    \tvpaddq 
%zmm0,%zmm0,%zmm0\n    7fdb:\t62 f1 fd 48 d4 c0    \tvpaddq %zmm0,%zmm0,%zmm0\n    7fe1:\t62 f1 fd 48 d4 c0    \tvpaddq %zmm0,%zmm0,%zmm0\n    7fe7:\t62 f1 fd 48 d4 c0    \tvpaddq %zmm0,%zmm0,%zmm0\n    7fed:\t62 f1 fd 48 d4 c0    \tvpaddq %zmm0,%zmm0,%zmm0\n    7ff3:\t62 f1 fd 48 d4 c0    \tvpaddq %zmm0,%zmm0,%zmm0\n    7ff9:\t62 f1 fd 48 d4 c0    \tvpaddq %zmm0,%zmm0,%zmm0\n    7fff:\t62 f1 fd 48 d4 c0    \tvpaddq %zmm0,%zmm0,%zmm0\n    8005:\t4c 29 cf             \tsub    %r9,%rdi\n    8008:\t0f 85 d5 fe ff ff    \tjne    7ee3 <latadd256int_loop>\n    800e:\t66 48 0f 7e c8       \tmovq   %xmm1,%rax\n    8013:\tc5 f8 77             \tvzeroupper \n    8016:\t41 5b                \tpop    %r11\n    8018:\t41 5c                \tpop    %r12\n    801a:\t41 5d                \tpop    %r13\n    801c:\t41 5e                \tpop    %r14\n    801e:\t41 5f                \tpop    %r15\n    8020:\t41 58                \tpop    %r8\n    8022:\t41 59                \tpop    %r9\n    8024:\tc3                   \tretq   \n\n0000000000008025 <latmul512int>:\n    8025:\t41 51                \tpush   %r9\n    8027:\t41 50                \tpush   %r8\n    8029:\t41 57                \tpush   %r15\n    802b:\t41 56                \tpush   %r14\n    802d:\t41 55                \tpush   %r13\n    802f:\t41 54                \tpush   %r12\n    8031:\t41 53                \tpush   %r11\n    8033:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n    803a:\t66 49 0f 6e c9       \tmovq   %r9,%xmm1\n    803f:\t62 f2 7d 48 58 c1    \tvpbroadcastd %xmm1,%zmm0\n    8045:\t62 f1 fe 48 6f c8    \tvmovdqu64 %zmm0,%zmm1\n    804b:\t62 f1 fe 48 6f d0    \tvmovdqu64 %zmm0,%zmm2\n    8051:\t62 f1 fe 48 6f d8    \tvmovdqu64 %zmm0,%zmm3\n    8057:\t62 f1 fe 48 6f e0    \tvmovdqu64 %zmm0,%zmm4\n    805d:\t62 f1 fe 48 6f e8    \tvmovdqu64 %zmm0,%zmm5\n\n0000000000008063 <latmul512int_loop>:\n    8063:\t62 f2 7d 48 40 c0    \tvpmulld %zmm0,%zmm0,%zmm0\n    8069:\t62 f2 7d 48 40 c0    \tvpmulld 
%zmm0,%zmm0,%zmm0\n    806f:\t62 f2 7d 48 40 c0    \tvpmulld %zmm0,%zmm0,%zmm0\n    8075:\t62 f2 7d 48 40 c0    \tvpmulld %zmm0,%zmm0,%zmm0\n    807b:\t62 f2 7d 48 40 c0    \tvpmulld %zmm0,%zmm0,%zmm0\n    8081:\t62 f2 7d 48 40 c0    \tvpmulld %zmm0,%zmm0,%zmm0\n    8087:\t62 f2 7d 48 40 c0    \tvpmulld %zmm0,%zmm0,%zmm0\n    808d:\t62 f2 7d 48 40 c0    \tvpmulld %zmm0,%zmm0,%zmm0\n    8093:\t62 f2 7d 48 40 c0    \tvpmulld %zmm0,%zmm0,%zmm0\n    8099:\t62 f2 7d 48 40 c0    \tvpmulld %zmm0,%zmm0,%zmm0\n    809f:\t62 f2 7d 48 40 c0    \tvpmulld %zmm0,%zmm0,%zmm0\n    80a5:\t62 f2 7d 48 40 c0    \tvpmulld %zmm0,%zmm0,%zmm0\n    80ab:\t62 f2 7d 48 40 c0    \tvpmulld %zmm0,%zmm0,%zmm0\n    80b1:\t62 f2 7d 48 40 c0    \tvpmulld %zmm0,%zmm0,%zmm0\n    80b7:\t62 f2 7d 48 40 c0    \tvpmulld %zmm0,%zmm0,%zmm0\n    80bd:\t62 f2 7d 48 40 c0    \tvpmulld %zmm0,%zmm0,%zmm0\n    80c3:\t62 f2 7d 48 40 c0    \tvpmulld %zmm0,%zmm0,%zmm0\n    80c9:\t62 f2 7d 48 40 c0    \tvpmulld %zmm0,%zmm0,%zmm0\n    80cf:\t62 f2 7d 48 40 c0    \tvpmulld %zmm0,%zmm0,%zmm0\n    80d5:\t62 f2 7d 48 40 c0    \tvpmulld %zmm0,%zmm0,%zmm0\n    80db:\t4c 29 cf             \tsub    %r9,%rdi\n    80de:\t75 83                \tjne    8063 <latmul512int_loop>\n    80e0:\t66 48 0f 7e c8       \tmovq   %xmm1,%rax\n    80e5:\tc5 f8 77             \tvzeroupper \n    80e8:\t41 5b                \tpop    %r11\n    80ea:\t41 5c                \tpop    %r12\n    80ec:\t41 5d                \tpop    %r13\n    80ee:\t41 5e                \tpop    %r14\n    80f0:\t41 5f                \tpop    %r15\n    80f2:\t41 58                \tpop    %r8\n    80f4:\t41 59                \tpop    %r9\n    80f6:\tc3                   \tretq   \n\n00000000000080f7 <latmuldq512int>:\n    80f7:\t41 51                \tpush   %r9\n    80f9:\t41 50                \tpush   %r8\n    80fb:\t41 57                \tpush   %r15\n    80fd:\t41 56                \tpush   %r14\n    80ff:\t41 55                \tpush   %r13\n    8101:\t41 54        
        \tpush   %r12\n    8103:\t41 53                \tpush   %r11\n    8105:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n    810c:\t66 49 0f 6e c9       \tmovq   %r9,%xmm1\n    8111:\t62 f2 7d 48 58 c1    \tvpbroadcastd %xmm1,%zmm0\n    8117:\t62 f1 fe 48 6f c8    \tvmovdqu64 %zmm0,%zmm1\n    811d:\t62 f1 fe 48 6f d0    \tvmovdqu64 %zmm0,%zmm2\n    8123:\t62 f1 fe 48 6f d8    \tvmovdqu64 %zmm0,%zmm3\n    8129:\t62 f1 fe 48 6f e0    \tvmovdqu64 %zmm0,%zmm4\n    812f:\t62 f1 fe 48 6f e8    \tvmovdqu64 %zmm0,%zmm5\n\n0000000000008135 <latmuldq512int_loop>:\n    8135:\t62 f2 fd 48 28 c0    \tvpmuldq %zmm0,%zmm0,%zmm0\n    813b:\t62 f2 fd 48 28 c0    \tvpmuldq %zmm0,%zmm0,%zmm0\n    8141:\t62 f2 fd 48 28 c0    \tvpmuldq %zmm0,%zmm0,%zmm0\n    8147:\t62 f2 fd 48 28 c0    \tvpmuldq %zmm0,%zmm0,%zmm0\n    814d:\t62 f2 fd 48 28 c0    \tvpmuldq %zmm0,%zmm0,%zmm0\n    8153:\t62 f2 fd 48 28 c0    \tvpmuldq %zmm0,%zmm0,%zmm0\n    8159:\t62 f2 fd 48 28 c0    \tvpmuldq %zmm0,%zmm0,%zmm0\n    815f:\t62 f2 fd 48 28 c0    \tvpmuldq %zmm0,%zmm0,%zmm0\n    8165:\t62 f2 fd 48 28 c0    \tvpmuldq %zmm0,%zmm0,%zmm0\n    816b:\t62 f2 fd 48 28 c0    \tvpmuldq %zmm0,%zmm0,%zmm0\n    8171:\t62 f2 fd 48 28 c0    \tvpmuldq %zmm0,%zmm0,%zmm0\n    8177:\t62 f2 fd 48 28 c0    \tvpmuldq %zmm0,%zmm0,%zmm0\n    817d:\t62 f2 fd 48 28 c0    \tvpmuldq %zmm0,%zmm0,%zmm0\n    8183:\t62 f2 fd 48 28 c0    \tvpmuldq %zmm0,%zmm0,%zmm0\n    8189:\t62 f2 fd 48 28 c0    \tvpmuldq %zmm0,%zmm0,%zmm0\n    818f:\t62 f2 fd 48 28 c0    \tvpmuldq %zmm0,%zmm0,%zmm0\n    8195:\t62 f2 fd 48 28 c0    \tvpmuldq %zmm0,%zmm0,%zmm0\n    819b:\t62 f2 fd 48 28 c0    \tvpmuldq %zmm0,%zmm0,%zmm0\n    81a1:\t62 f2 fd 48 28 c0    \tvpmuldq %zmm0,%zmm0,%zmm0\n    81a7:\t62 f2 fd 48 28 c0    \tvpmuldq %zmm0,%zmm0,%zmm0\n    81ad:\t4c 29 cf             \tsub    %r9,%rdi\n    81b0:\t75 83                \tjne    8135 <latmuldq512int_loop>\n    81b2:\t66 48 0f 7e c8       \tmovq   %xmm1,%rax\n    81b7:\tc5 f8 77             \tvzeroupper 
\n    81ba:\t41 5b                \tpop    %r11\n    81bc:\t41 5c                \tpop    %r12\n    81be:\t41 5d                \tpop    %r13\n    81c0:\t41 5e                \tpop    %r14\n    81c2:\t41 5f                \tpop    %r15\n    81c4:\t41 58                \tpop    %r8\n    81c6:\t41 59                \tpop    %r9\n    81c8:\tc3                   \tretq   \n\n00000000000081c9 <latmulq512int>:\n    81c9:\t41 51                \tpush   %r9\n    81cb:\t41 50                \tpush   %r8\n    81cd:\t41 57                \tpush   %r15\n    81cf:\t41 56                \tpush   %r14\n    81d1:\t41 55                \tpush   %r13\n    81d3:\t41 54                \tpush   %r12\n    81d5:\t41 53                \tpush   %r11\n    81d7:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n    81de:\t66 49 0f 6e c9       \tmovq   %r9,%xmm1\n    81e3:\t62 f2 7d 48 58 c1    \tvpbroadcastd %xmm1,%zmm0\n    81e9:\t62 f1 fe 48 6f c8    \tvmovdqu64 %zmm0,%zmm1\n    81ef:\t62 f1 fe 48 6f d0    \tvmovdqu64 %zmm0,%zmm2\n    81f5:\t62 f1 fe 48 6f d8    \tvmovdqu64 %zmm0,%zmm3\n    81fb:\t62 f1 fe 48 6f e0    \tvmovdqu64 %zmm0,%zmm4\n    8201:\t62 f1 fe 48 6f e8    \tvmovdqu64 %zmm0,%zmm5\n\n0000000000008207 <latmulq512int_loop>:\n    8207:\t62 f2 fd 48 40 c0    \tvpmullq %zmm0,%zmm0,%zmm0\n    820d:\t62 f2 fd 48 40 c0    \tvpmullq %zmm0,%zmm0,%zmm0\n    8213:\t62 f2 fd 48 40 c0    \tvpmullq %zmm0,%zmm0,%zmm0\n    8219:\t62 f2 fd 48 40 c0    \tvpmullq %zmm0,%zmm0,%zmm0\n    821f:\t62 f2 fd 48 40 c0    \tvpmullq %zmm0,%zmm0,%zmm0\n    8225:\t62 f2 fd 48 40 c0    \tvpmullq %zmm0,%zmm0,%zmm0\n    822b:\t62 f2 fd 48 40 c0    \tvpmullq %zmm0,%zmm0,%zmm0\n    8231:\t62 f2 fd 48 40 c0    \tvpmullq %zmm0,%zmm0,%zmm0\n    8237:\t62 f2 fd 48 40 c0    \tvpmullq %zmm0,%zmm0,%zmm0\n    823d:\t62 f2 fd 48 40 c0    \tvpmullq %zmm0,%zmm0,%zmm0\n    8243:\t62 f2 fd 48 40 c0    \tvpmullq %zmm0,%zmm0,%zmm0\n    8249:\t62 f2 fd 48 40 c0    \tvpmullq %zmm0,%zmm0,%zmm0\n    824f:\t62 f2 fd 48 40 c0    
\tvpmullq %zmm0,%zmm0,%zmm0\n    8255:\t62 f2 fd 48 40 c0    \tvpmullq %zmm0,%zmm0,%zmm0\n    825b:\t62 f2 fd 48 40 c0    \tvpmullq %zmm0,%zmm0,%zmm0\n    8261:\t62 f2 fd 48 40 c0    \tvpmullq %zmm0,%zmm0,%zmm0\n    8267:\t62 f2 fd 48 40 c0    \tvpmullq %zmm0,%zmm0,%zmm0\n    826d:\t62 f2 fd 48 40 c0    \tvpmullq %zmm0,%zmm0,%zmm0\n    8273:\t62 f2 fd 48 40 c0    \tvpmullq %zmm0,%zmm0,%zmm0\n    8279:\t62 f2 fd 48 40 c0    \tvpmullq %zmm0,%zmm0,%zmm0\n    827f:\t4c 29 cf             \tsub    %r9,%rdi\n    8282:\t75 83                \tjne    8207 <latmulq512int_loop>\n    8284:\t66 48 0f 7e c8       \tmovq   %xmm1,%rax\n    8289:\tc5 f8 77             \tvzeroupper \n    828c:\t41 5b                \tpop    %r11\n    828e:\t41 5c                \tpop    %r12\n    8290:\t41 5d                \tpop    %r13\n    8292:\t41 5e                \tpop    %r14\n    8294:\t41 5f                \tpop    %r15\n    8296:\t41 58                \tpop    %r8\n    8298:\t41 59                \tpop    %r9\n    829a:\tc3                   \tretq   \n\n000000000000829b <latmul256int>:\n    829b:\t41 51                \tpush   %r9\n    829d:\t41 50                \tpush   %r8\n    829f:\t41 57                \tpush   %r15\n    82a1:\t41 56                \tpush   %r14\n    82a3:\t41 55                \tpush   %r13\n    82a5:\t41 54                \tpush   %r12\n    82a7:\t41 53                \tpush   %r11\n    82a9:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n    82b0:\t66 49 0f 6e c9       \tmovq   %r9,%xmm1\n    82b5:\tc5 fe 6f c8          \tvmovdqu %ymm0,%ymm1\n    82b9:\tc5 fe 6f d0          \tvmovdqu %ymm0,%ymm2\n    82bd:\tc5 fe 6f d8          \tvmovdqu %ymm0,%ymm3\n    82c1:\tc5 fe 6f e0          \tvmovdqu %ymm0,%ymm4\n    82c5:\tc5 fe 6f e8          \tvmovdqu %ymm0,%ymm5\n\n00000000000082c9 <latmul256int_loop>:\n    82c9:\tc4 e2 7d 40 c0       \tvpmulld %ymm0,%ymm0,%ymm0\n    82ce:\tc4 e2 7d 40 c0       \tvpmulld %ymm0,%ymm0,%ymm0\n    82d3:\tc4 e2 7d 40 c0       \tvpmulld 
%ymm0,%ymm0,%ymm0\n    82d8:\tc4 e2 7d 40 c0       \tvpmulld %ymm0,%ymm0,%ymm0\n    82dd:\tc4 e2 7d 40 c0       \tvpmulld %ymm0,%ymm0,%ymm0\n    82e2:\tc4 e2 7d 40 c0       \tvpmulld %ymm0,%ymm0,%ymm0\n    82e7:\tc4 e2 7d 40 c0       \tvpmulld %ymm0,%ymm0,%ymm0\n    82ec:\tc4 e2 7d 40 c0       \tvpmulld %ymm0,%ymm0,%ymm0\n    82f1:\tc4 e2 7d 40 c0       \tvpmulld %ymm0,%ymm0,%ymm0\n    82f6:\tc4 e2 7d 40 c0       \tvpmulld %ymm0,%ymm0,%ymm0\n    82fb:\tc4 e2 7d 40 c0       \tvpmulld %ymm0,%ymm0,%ymm0\n    8300:\tc4 e2 7d 40 c0       \tvpmulld %ymm0,%ymm0,%ymm0\n    8305:\tc4 e2 7d 40 c0       \tvpmulld %ymm0,%ymm0,%ymm0\n    830a:\tc4 e2 7d 40 c0       \tvpmulld %ymm0,%ymm0,%ymm0\n    830f:\tc4 e2 7d 40 c0       \tvpmulld %ymm0,%ymm0,%ymm0\n    8314:\tc4 e2 7d 40 c0       \tvpmulld %ymm0,%ymm0,%ymm0\n    8319:\tc4 e2 7d 40 c0       \tvpmulld %ymm0,%ymm0,%ymm0\n    831e:\tc4 e2 7d 40 c0       \tvpmulld %ymm0,%ymm0,%ymm0\n    8323:\tc4 e2 7d 40 c0       \tvpmulld %ymm0,%ymm0,%ymm0\n    8328:\tc4 e2 7d 40 c0       \tvpmulld %ymm0,%ymm0,%ymm0\n    832d:\t4c 29 cf             \tsub    %r9,%rdi\n    8330:\t75 97                \tjne    82c9 <latmul256int_loop>\n    8332:\t66 48 0f 7e c8       \tmovq   %xmm1,%rax\n    8337:\tc5 f8 77             \tvzeroupper \n    833a:\t41 5b                \tpop    %r11\n    833c:\t41 5c                \tpop    %r12\n    833e:\t41 5d                \tpop    %r13\n    8340:\t41 5e                \tpop    %r14\n    8342:\t41 5f                \tpop    %r15\n    8344:\t41 58                \tpop    %r8\n    8346:\t41 59                \tpop    %r9\n    8348:\tc3                   \tretq   \n\n0000000000008349 <latadd128int>:\n    8349:\t41 51                \tpush   %r9\n    834b:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n    8352:\t66 49 0f 6e c9       \tmovq   %r9,%xmm1\n\n0000000000008357 <latadd128int_loop>:\n    8357:\t66 0f d4 c0          \tpaddq  %xmm0,%xmm0\n    835b:\t66 0f d4 c0          \tpaddq  %xmm0,%xmm0\n    835f:\t66 0f d4 
c0          \tpaddq  %xmm0,%xmm0\n    8363:\t66 0f d4 c0          \tpaddq  %xmm0,%xmm0\n    8367:\t66 0f d4 c0          \tpaddq  %xmm0,%xmm0\n    836b:\t66 0f d4 c0          \tpaddq  %xmm0,%xmm0\n    836f:\t66 0f d4 c0          \tpaddq  %xmm0,%xmm0\n    8373:\t66 0f d4 c0          \tpaddq  %xmm0,%xmm0\n    8377:\t66 0f d4 c0          \tpaddq  %xmm0,%xmm0\n    837b:\t66 0f d4 c0          \tpaddq  %xmm0,%xmm0\n    837f:\t66 0f d4 c0          \tpaddq  %xmm0,%xmm0\n    8383:\t66 0f d4 c0          \tpaddq  %xmm0,%xmm0\n    8387:\t66 0f d4 c0          \tpaddq  %xmm0,%xmm0\n    838b:\t66 0f d4 c0          \tpaddq  %xmm0,%xmm0\n    838f:\t66 0f d4 c0          \tpaddq  %xmm0,%xmm0\n    8393:\t66 0f d4 c0          \tpaddq  %xmm0,%xmm0\n    8397:\t66 0f d4 c0          \tpaddq  %xmm0,%xmm0\n    839b:\t66 0f d4 c0          \tpaddq  %xmm0,%xmm0\n    839f:\t66 0f d4 c0          \tpaddq  %xmm0,%xmm0\n    83a3:\t66 0f d4 c0          \tpaddq  %xmm0,%xmm0\n    83a7:\t4c 29 cf             \tsub    %r9,%rdi\n    83aa:\t75 ab                \tjne    8357 <latadd128int_loop>\n    83ac:\t66 48 0f 7e c8       \tmovq   %xmm1,%rax\n    83b1:\t41 59                \tpop    %r9\n    83b3:\tc3                   \tretq   \n\n00000000000083b4 <add128int>:\n    83b4:\t41 51                \tpush   %r9\n    83b6:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n    83bd:\t66 49 0f 6e c9       \tmovq   %r9,%xmm1\n\n00000000000083c2 <add128int_loop>:\n    83c2:\t66 0f d4 c0          \tpaddq  %xmm0,%xmm0\n    83c6:\t66 0f d4 c9          \tpaddq  %xmm1,%xmm1\n    83ca:\t66 0f d4 d2          \tpaddq  %xmm2,%xmm2\n    83ce:\t66 0f d4 db          \tpaddq  %xmm3,%xmm3\n    83d2:\t66 0f d4 e4          \tpaddq  %xmm4,%xmm4\n    83d6:\t66 0f d4 c0          \tpaddq  %xmm0,%xmm0\n    83da:\t66 0f d4 c9          \tpaddq  %xmm1,%xmm1\n    83de:\t66 0f d4 d2          \tpaddq  %xmm2,%xmm2\n    83e2:\t66 0f d4 db          \tpaddq  %xmm3,%xmm3\n    83e6:\t66 0f d4 e4          \tpaddq  %xmm4,%xmm4\n    83ea:\t66 0f d4 c0     
     \tpaddq  %xmm0,%xmm0\n    83ee:\t66 0f d4 c9          \tpaddq  %xmm1,%xmm1\n    83f2:\t66 0f d4 d2          \tpaddq  %xmm2,%xmm2\n    83f6:\t66 0f d4 db          \tpaddq  %xmm3,%xmm3\n    83fa:\t66 0f d4 e4          \tpaddq  %xmm4,%xmm4\n    83fe:\t66 0f d4 c0          \tpaddq  %xmm0,%xmm0\n    8402:\t66 0f d4 c9          \tpaddq  %xmm1,%xmm1\n    8406:\t66 0f d4 d2          \tpaddq  %xmm2,%xmm2\n    840a:\t66 0f d4 db          \tpaddq  %xmm3,%xmm3\n    840e:\t66 0f d4 e4          \tpaddq  %xmm4,%xmm4\n    8412:\t4c 29 cf             \tsub    %r9,%rdi\n    8415:\t75 ab                \tjne    83c2 <add128int_loop>\n    8417:\t66 48 0f 7e c8       \tmovq   %xmm1,%rax\n    841c:\t41 59                \tpop    %r9\n    841e:\tc3                   \tretq   \n\n000000000000841f <aesenc128>:\n    841f:\t41 51                \tpush   %r9\n    8421:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n    8428:\t66 49 0f 6e c9       \tmovq   %r9,%xmm1\n    842d:\tc5 fc 77             \tvzeroall \n    8430:\t66 0f ef c0          \tpxor   %xmm0,%xmm0\n    8434:\t66 0f ef c9          \tpxor   %xmm1,%xmm1\n    8438:\t66 0f ef d2          \tpxor   %xmm2,%xmm2\n    843c:\t66 0f ef db          \tpxor   %xmm3,%xmm3\n    8440:\t66 0f ef e4          \tpxor   %xmm4,%xmm4\n    8444:\t66 0f ef ed          \tpxor   %xmm5,%xmm5\n\n0000000000008448 <aesenc128_loop>:\n    8448:\t66 0f 38 dc c8       \taesenc %xmm0,%xmm1\n    844d:\t66 0f 38 dc d0       \taesenc %xmm0,%xmm2\n    8452:\t66 0f 38 dc d8       \taesenc %xmm0,%xmm3\n    8457:\t66 0f 38 dc e0       \taesenc %xmm0,%xmm4\n    845c:\t66 0f 38 dc e8       \taesenc %xmm0,%xmm5\n    8461:\t66 0f 38 dc c8       \taesenc %xmm0,%xmm1\n    8466:\t66 0f 38 dc d0       \taesenc %xmm0,%xmm2\n    846b:\t66 0f 38 dc d8       \taesenc %xmm0,%xmm3\n    8470:\t66 0f 38 dc e0       \taesenc %xmm0,%xmm4\n    8475:\t66 0f 38 dc e8       \taesenc %xmm0,%xmm5\n    847a:\t66 0f 38 dc c8       \taesenc %xmm0,%xmm1\n    847f:\t66 0f 38 dc d0       \taesenc 
%xmm0,%xmm2\n    8484:\t66 0f 38 dc d8       \taesenc %xmm0,%xmm3\n    8489:\t66 0f 38 dc e0       \taesenc %xmm0,%xmm4\n    848e:\t66 0f 38 dc e8       \taesenc %xmm0,%xmm5\n    8493:\t66 0f 38 dc c8       \taesenc %xmm0,%xmm1\n    8498:\t66 0f 38 dc d0       \taesenc %xmm0,%xmm2\n    849d:\t66 0f 38 dc d8       \taesenc %xmm0,%xmm3\n    84a2:\t66 0f 38 dc e0       \taesenc %xmm0,%xmm4\n    84a7:\t66 0f 38 dc e8       \taesenc %xmm0,%xmm5\n    84ac:\t4c 29 cf             \tsub    %r9,%rdi\n    84af:\t75 97                \tjne    8448 <aesenc128_loop>\n    84b1:\t66 48 0f 7e c8       \tmovq   %xmm1,%rax\n    84b6:\t41 59                \tpop    %r9\n    84b8:\tc3                   \tretq   \n\n00000000000084b9 <aesencadd128>:\n    84b9:\t41 51                \tpush   %r9\n    84bb:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n    84c2:\t66 49 0f 6e c9       \tmovq   %r9,%xmm1\n    84c7:\tc5 fc 77             \tvzeroall \n    84ca:\t66 0f ef c0          \tpxor   %xmm0,%xmm0\n    84ce:\t66 0f ef c9          \tpxor   %xmm1,%xmm1\n    84d2:\t66 0f ef d2          \tpxor   %xmm2,%xmm2\n    84d6:\t66 0f ef db          \tpxor   %xmm3,%xmm3\n    84da:\t66 0f ef e4          \tpxor   %xmm4,%xmm4\n    84de:\t66 0f ef ed          \tpxor   %xmm5,%xmm5\n    84e2:\t66 0f ef f6          \tpxor   %xmm6,%xmm6\n    84e6:\t66 0f ef ff          \tpxor   %xmm7,%xmm7\n    84ea:\t66 45 0f ef c0       \tpxor   %xmm8,%xmm8\n    84ef:\t66 45 0f ef c9       \tpxor   %xmm9,%xmm9\n    84f4:\t66 45 0f ef d2       \tpxor   %xmm10,%xmm10\n    84f9:\t66 45 0f ef db       \tpxor   %xmm11,%xmm11\n    84fe:\t66 45 0f ef e4       \tpxor   %xmm12,%xmm12\n    8503:\t66 45 0f ef ed       \tpxor   %xmm13,%xmm13\n\n0000000000008508 <aesencadd128_loop>:\n    8508:\t66 0f 38 dc c8       \taesenc %xmm0,%xmm1\n    850d:\t66 0f fe d6          \tpaddd  %xmm6,%xmm2\n    8511:\t66 0f fe de          \tpaddd  %xmm6,%xmm3\n    8515:\t66 0f fe e6          \tpaddd  %xmm6,%xmm4\n    8519:\t66 0f 38 dc e8       \taesenc 
%xmm0,%xmm5\n    851e:\t66 0f fe fe          \tpaddd  %xmm6,%xmm7\n    8522:\t66 44 0f fe c6       \tpaddd  %xmm6,%xmm8\n    8527:\t66 44 0f fe ce       \tpaddd  %xmm6,%xmm9\n    852c:\t66 44 0f 38 dc d0    \taesenc %xmm0,%xmm10\n    8532:\t66 0f fe d6          \tpaddd  %xmm6,%xmm2\n    8536:\t66 0f fe de          \tpaddd  %xmm6,%xmm3\n    853a:\t66 0f fe e6          \tpaddd  %xmm6,%xmm4\n    853e:\t66 0f 38 dc c8       \taesenc %xmm0,%xmm1\n    8543:\t66 0f fe fe          \tpaddd  %xmm6,%xmm7\n    8547:\t66 44 0f fe c6       \tpaddd  %xmm6,%xmm8\n    854c:\t66 44 0f fe ce       \tpaddd  %xmm6,%xmm9\n    8551:\t66 44 0f 38 dc d0    \taesenc %xmm0,%xmm10\n    8557:\t66 44 0f fe de       \tpaddd  %xmm6,%xmm11\n    855c:\t66 44 0f fe e6       \tpaddd  %xmm6,%xmm12\n    8561:\t66 44 0f fe ee       \tpaddd  %xmm6,%xmm13\n    8566:\t4c 29 cf             \tsub    %r9,%rdi\n    8569:\t75 9d                \tjne    8508 <aesencadd128_loop>\n    856b:\t66 48 0f 7e c8       \tmovq   %xmm1,%rax\n    8570:\t41 59                \tpop    %r9\n    8572:\tc3                   \tretq   \n\n0000000000008573 <aesencfma128>:\n    8573:\t41 51                \tpush   %r9\n    8575:\t49 c7 c1 0f 00 00 00 \tmov    $0xf,%r9\n    857c:\t66 49 0f 6e c9       \tmovq   %r9,%xmm1\n    8581:\tc5 fc 77             \tvzeroall \n    8584:\t66 0f ef c0          \tpxor   %xmm0,%xmm0\n    8588:\t66 0f ef c9          \tpxor   %xmm1,%xmm1\n    858c:\t0f 57 d2             \txorps  %xmm2,%xmm2\n    858f:\t0f 57 db             \txorps  %xmm3,%xmm3\n    8592:\t0f 57 e4             \txorps  %xmm4,%xmm4\n    8595:\t66 0f ef ed          \tpxor   %xmm5,%xmm5\n    8599:\t0f 57 f6             \txorps  %xmm6,%xmm6\n    859c:\t0f 57 ff             \txorps  %xmm7,%xmm7\n    859f:\t45 0f 57 c0          \txorps  %xmm8,%xmm8\n    85a3:\t45 0f 57 c9          \txorps  %xmm9,%xmm9\n    85a7:\t66 45 0f ef d2       \tpxor   %xmm10,%xmm10\n    85ac:\t45 0f 57 db          \txorps  %xmm11,%xmm11\n    85b0:\t45 0f 57 e4        
  \txorps  %xmm12,%xmm12\n    85b4:\t45 0f 57 ed          \txorps  %xmm13,%xmm13\n    85b8:\t45 0f 57 f6          \txorps  %xmm14,%xmm14\n    85bc:\t45 0f 57 ff          \txorps  %xmm15,%xmm15\n    85c0:\t62 a1 7c 00 57 c0    \tvxorps %xmm16,%xmm16,%xmm16\n    85c6:\t62 a1 74 00 57 c9    \tvxorps %xmm17,%xmm17,%xmm17\n    85cc:\t62 a1 6c 00 57 d2    \tvxorps %xmm18,%xmm18,%xmm18\n    85d2:\t62 a1 64 00 57 db    \tvxorps %xmm19,%xmm19,%xmm19\n\n00000000000085d8 <aesencfma128_loop>:\n    85d8:\t66 0f 38 dc c8       \taesenc %xmm0,%xmm1\n    85dd:\tc4 e2 69 98 d6       \tvfmadd132ps %xmm6,%xmm2,%xmm2\n    85e2:\tc4 e2 61 98 de       \tvfmadd132ps %xmm6,%xmm3,%xmm3\n    85e7:\t66 0f 38 dc e8       \taesenc %xmm0,%xmm5\n    85ec:\tc4 e2 41 98 fe       \tvfmadd132ps %xmm6,%xmm7,%xmm7\n    85f1:\tc4 62 39 98 c6       \tvfmadd132ps %xmm6,%xmm8,%xmm8\n    85f6:\t66 44 0f 38 dc d0    \taesenc %xmm0,%xmm10\n    85fc:\tc4 62 21 98 de       \tvfmadd132ps %xmm6,%xmm11,%xmm11\n    8601:\tc4 62 19 98 e6       \tvfmadd132ps %xmm6,%xmm12,%xmm12\n    8606:\t66 0f 38 dc c8       \taesenc %xmm0,%xmm1\n    860b:\tc4 62 09 98 f6       \tvfmadd132ps %xmm6,%xmm14,%xmm14\n    8610:\tc4 62 01 98 fe       \tvfmadd132ps %xmm6,%xmm15,%xmm15\n    8615:\t66 44 0f 38 dc d0    \taesenc %xmm0,%xmm10\n    861b:\t62 e2 75 00 98 ce    \tvfmadd132ps %xmm6,%xmm17,%xmm17\n    8621:\t62 e2 6d 00 98 d6    \tvfmadd132ps %xmm6,%xmm18,%xmm18\n    8627:\t4c 29 cf             \tsub    %r9,%rdi\n    862a:\t75 ac                \tjne    85d8 <aesencfma128_loop>\n    862c:\t66 48 0f 7e c8       \tmovq   %xmm1,%rax\n    8631:\t41 59                \tpop    %r9\n    8633:\tc3                   \tretq   \n\n0000000000008634 <aesencfadd128>:\n    8634:\t41 51                \tpush   %r9\n    8636:\t49 c7 c1 0f 00 00 00 \tmov    $0xf,%r9\n    863d:\t66 49 0f 6e c9       \tmovq   %r9,%xmm1\n    8642:\tc5 fc 77             \tvzeroall \n    8645:\t66 0f ef c0          \tpxor   %xmm0,%xmm0\n    8649:\t66 0f ef c9          
\tpxor   %xmm1,%xmm1\n    864d:\t0f 57 d2             \txorps  %xmm2,%xmm2\n    8650:\t0f 57 db             \txorps  %xmm3,%xmm3\n    8653:\t0f 57 e4             \txorps  %xmm4,%xmm4\n    8656:\t66 0f ef ed          \tpxor   %xmm5,%xmm5\n    865a:\t0f 57 f6             \txorps  %xmm6,%xmm6\n    865d:\t0f 57 ff             \txorps  %xmm7,%xmm7\n    8660:\t45 0f 57 c0          \txorps  %xmm8,%xmm8\n    8664:\t45 0f 57 c9          \txorps  %xmm9,%xmm9\n    8668:\t66 45 0f ef d2       \tpxor   %xmm10,%xmm10\n    866d:\t45 0f 57 db          \txorps  %xmm11,%xmm11\n    8671:\t45 0f 57 e4          \txorps  %xmm12,%xmm12\n    8675:\t45 0f 57 ed          \txorps  %xmm13,%xmm13\n    8679:\t45 0f 57 f6          \txorps  %xmm14,%xmm14\n    867d:\t45 0f 57 ff          \txorps  %xmm15,%xmm15\n    8681:\t62 a1 7c 00 57 c0    \tvxorps %xmm16,%xmm16,%xmm16\n    8687:\t62 a1 74 00 57 c9    \tvxorps %xmm17,%xmm17,%xmm17\n    868d:\t62 a1 6c 00 57 d2    \tvxorps %xmm18,%xmm18,%xmm18\n    8693:\t62 a1 64 00 57 db    \tvxorps %xmm19,%xmm19,%xmm19\n\n0000000000008699 <aesencfadd128_loop>:\n    8699:\t66 0f 38 dc c8       \taesenc %xmm0,%xmm1\n    869e:\tc5 e8 58 d6          \tvaddps %xmm6,%xmm2,%xmm2\n    86a2:\tc5 e0 58 de          \tvaddps %xmm6,%xmm3,%xmm3\n    86a6:\t66 0f 38 dc e8       \taesenc %xmm0,%xmm5\n    86ab:\tc5 c0 58 fe          \tvaddps %xmm6,%xmm7,%xmm7\n    86af:\tc5 38 58 c6          \tvaddps %xmm6,%xmm8,%xmm8\n    86b3:\t66 44 0f 38 dc d0    \taesenc %xmm0,%xmm10\n    86b9:\tc5 20 58 de          \tvaddps %xmm6,%xmm11,%xmm11\n    86bd:\tc5 18 58 e6          \tvaddps %xmm6,%xmm12,%xmm12\n    86c1:\t66 0f 38 dc c8       \taesenc %xmm0,%xmm1\n    86c6:\tc5 08 58 f6          \tvaddps %xmm6,%xmm14,%xmm14\n    86ca:\tc5 00 58 fe          \tvaddps %xmm6,%xmm15,%xmm15\n    86ce:\t66 44 0f 38 dc d0    \taesenc %xmm0,%xmm10\n    86d4:\t62 e1 74 00 58 ce    \tvaddps %xmm6,%xmm17,%xmm17\n    86da:\t62 e1 6c 00 58 d6    \tvaddps %xmm6,%xmm18,%xmm18\n    86e0:\t4c 29 cf             
\tsub    %r9,%rdi\n    86e3:\t7f b4                \tjg     8699 <aesencfadd128_loop>\n    86e5:\t66 48 0f 7e c8       \tmovq   %xmm1,%rax\n    86ea:\t41 59                \tpop    %r9\n    86ec:\tc3                   \tretq   \n\n00000000000086ed <aesencmul128>:\n    86ed:\t41 51                \tpush   %r9\n    86ef:\t49 c7 c1 0f 00 00 00 \tmov    $0xf,%r9\n    86f6:\tc5 fc 77             \tvzeroall \n    86f9:\t66 49 0f 6e f1       \tmovq   %r9,%xmm6\n    86fe:\t66 0f ef c0          \tpxor   %xmm0,%xmm0\n    8702:\t66 0f ef ed          \tpxor   %xmm5,%xmm5\n    8706:\t66 45 0f ef d2       \tpxor   %xmm10,%xmm10\n    870b:\t0f 57 c9             \txorps  %xmm1,%xmm1\n    870e:\t0f 57 d2             \txorps  %xmm2,%xmm2\n    8711:\t0f 57 db             \txorps  %xmm3,%xmm3\n    8714:\t0f 57 e4             \txorps  %xmm4,%xmm4\n    8717:\t0f 57 ff             \txorps  %xmm7,%xmm7\n    871a:\t45 0f 57 c0          \txorps  %xmm8,%xmm8\n    871e:\t45 0f 57 db          \txorps  %xmm11,%xmm11\n    8722:\t45 0f 57 e4          \txorps  %xmm12,%xmm12\n    8726:\t45 0f 57 f6          \txorps  %xmm14,%xmm14\n    872a:\t45 0f 57 ff          \txorps  %xmm15,%xmm15\n\n000000000000872e <aesencmul128_loop>:\n    872e:\t66 0f 38 dc c8       \taesenc %xmm0,%xmm1\n    8733:\t66 0f d5 d6          \tpmullw %xmm6,%xmm2\n    8737:\t66 0f d5 de          \tpmullw %xmm6,%xmm3\n    873b:\t66 0f 38 dc e8       \taesenc %xmm0,%xmm5\n    8740:\t66 0f d5 fe          \tpmullw %xmm6,%xmm7\n    8744:\t66 44 0f d5 c6       \tpmullw %xmm6,%xmm8\n    8749:\t66 44 0f 38 dc d0    \taesenc %xmm0,%xmm10\n    874f:\t66 44 0f d5 de       \tpmullw %xmm6,%xmm11\n    8754:\t66 44 0f d5 e6       \tpmullw %xmm6,%xmm12\n    8759:\t66 0f 38 dc c8       \taesenc %xmm0,%xmm1\n    875e:\t66 0f d5 e6          \tpmullw %xmm6,%xmm4\n    8762:\t66 0f d5 f6          \tpmullw %xmm6,%xmm6\n    8766:\t66 44 0f 38 dc d0    \taesenc %xmm0,%xmm10\n    876c:\t66 44 0f d5 ee       \tpmullw %xmm6,%xmm13\n    8771:\t66 44 0f d5 f6  
     \tpmullw %xmm6,%xmm14\n    8776:\t4c 29 cf             \tsub    %r9,%rdi\n    8779:\t7f b3                \tjg     872e <aesencmul128_loop>\n    877b:\t66 48 0f 7e c8       \tmovq   %xmm1,%rax\n    8780:\t41 59                \tpop    %r9\n    8782:\tc3                   \tretq   \n\n0000000000008783 <aesdec128>:\n    8783:\t41 51                \tpush   %r9\n    8785:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n    878c:\t66 49 0f 6e c9       \tmovq   %r9,%xmm1\n    8791:\tc5 fc 77             \tvzeroall \n    8794:\t66 0f ef c0          \tpxor   %xmm0,%xmm0\n    8798:\t66 0f ef c9          \tpxor   %xmm1,%xmm1\n    879c:\t66 0f ef d2          \tpxor   %xmm2,%xmm2\n    87a0:\t66 0f ef db          \tpxor   %xmm3,%xmm3\n    87a4:\t66 0f ef e4          \tpxor   %xmm4,%xmm4\n    87a8:\t66 0f ef ed          \tpxor   %xmm5,%xmm5\n\n00000000000087ac <aesdec128_loop>:\n    87ac:\t66 0f 38 de c8       \taesdec %xmm0,%xmm1\n    87b1:\t66 0f 38 de d0       \taesdec %xmm0,%xmm2\n    87b6:\t66 0f 38 de d8       \taesdec %xmm0,%xmm3\n    87bb:\t66 0f 38 de e0       \taesdec %xmm0,%xmm4\n    87c0:\t66 0f 38 de e8       \taesdec %xmm0,%xmm5\n    87c5:\t66 0f 38 de c8       \taesdec %xmm0,%xmm1\n    87ca:\t66 0f 38 de d0       \taesdec %xmm0,%xmm2\n    87cf:\t66 0f 38 de d8       \taesdec %xmm0,%xmm3\n    87d4:\t66 0f 38 de e0       \taesdec %xmm0,%xmm4\n    87d9:\t66 0f 38 de e8       \taesdec %xmm0,%xmm5\n    87de:\t66 0f 38 de c8       \taesdec %xmm0,%xmm1\n    87e3:\t66 0f 38 de d0       \taesdec %xmm0,%xmm2\n    87e8:\t66 0f 38 de d8       \taesdec %xmm0,%xmm3\n    87ed:\t66 0f 38 de e0       \taesdec %xmm0,%xmm4\n    87f2:\t66 0f 38 de e8       \taesdec %xmm0,%xmm5\n    87f7:\t66 0f 38 de c8       \taesdec %xmm0,%xmm1\n    87fc:\t66 0f 38 de d0       \taesdec %xmm0,%xmm2\n    8801:\t66 0f 38 de d8       \taesdec %xmm0,%xmm3\n    8806:\t66 0f 38 de e0       \taesdec %xmm0,%xmm4\n    880b:\t66 0f 38 de e8       \taesdec %xmm0,%xmm5\n    8810:\t4c 29 cf             \tsub    
%r9,%rdi\n    8813:\t75 97                \tjne    87ac <aesdec128_loop>\n    8815:\t66 48 0f 7e c8       \tmovq   %xmm1,%rax\n    881a:\t41 59                \tpop    %r9\n    881c:\tc3                   \tretq   \n\n000000000000881d <mul128int>:\n    881d:\t41 51                \tpush   %r9\n    881f:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n    8826:\t66 49 0f 6e c9       \tmovq   %r9,%xmm1\n\n000000000000882b <mul128int_loop>:\n    882b:\t66 0f 38 40 c0       \tpmulld %xmm0,%xmm0\n    8830:\t66 0f 38 40 c9       \tpmulld %xmm1,%xmm1\n    8835:\t66 0f 38 40 d2       \tpmulld %xmm2,%xmm2\n    883a:\t66 0f 38 40 db       \tpmulld %xmm3,%xmm3\n    883f:\t66 0f 38 40 e4       \tpmulld %xmm4,%xmm4\n    8844:\t66 0f 38 40 c0       \tpmulld %xmm0,%xmm0\n    8849:\t66 0f 38 40 c9       \tpmulld %xmm1,%xmm1\n    884e:\t66 0f 38 40 d2       \tpmulld %xmm2,%xmm2\n    8853:\t66 0f 38 40 db       \tpmulld %xmm3,%xmm3\n    8858:\t66 0f 38 40 e4       \tpmulld %xmm4,%xmm4\n    885d:\t66 0f 38 40 c0       \tpmulld %xmm0,%xmm0\n    8862:\t66 0f 38 40 c9       \tpmulld %xmm1,%xmm1\n    8867:\t66 0f 38 40 d2       \tpmulld %xmm2,%xmm2\n    886c:\t66 0f 38 40 db       \tpmulld %xmm3,%xmm3\n    8871:\t66 0f 38 40 e4       \tpmulld %xmm4,%xmm4\n    8876:\t66 0f 38 40 c0       \tpmulld %xmm0,%xmm0\n    887b:\t66 0f 38 40 c9       \tpmulld %xmm1,%xmm1\n    8880:\t66 0f 38 40 d2       \tpmulld %xmm2,%xmm2\n    8885:\t66 0f 38 40 db       \tpmulld %xmm3,%xmm3\n    888a:\t66 0f 38 40 e4       \tpmulld %xmm4,%xmm4\n    888f:\t4c 29 cf             \tsub    %r9,%rdi\n    8892:\t75 97                \tjne    882b <mul128int_loop>\n    8894:\t66 48 0f 7e c8       \tmovq   %xmm1,%rax\n    8899:\t41 59                \tpop    %r9\n    889b:\tc3                   \tretq   \n\n000000000000889c <latmul128int>:\n    889c:\t41 51                \tpush   %r9\n    889e:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n    88a5:\t66 49 0f 6e c9       \tmovq   %r9,%xmm1\n\n00000000000088aa 
<latmul128int_loop>:\n    88aa:\t66 0f 38 40 c0       \tpmulld %xmm0,%xmm0\n    88af:\t66 0f 38 40 c0       \tpmulld %xmm0,%xmm0\n    88b4:\t66 0f 38 40 c0       \tpmulld %xmm0,%xmm0\n    88b9:\t66 0f 38 40 c0       \tpmulld %xmm0,%xmm0\n    88be:\t66 0f 38 40 c0       \tpmulld %xmm0,%xmm0\n    88c3:\t66 0f 38 40 c0       \tpmulld %xmm0,%xmm0\n    88c8:\t66 0f 38 40 c0       \tpmulld %xmm0,%xmm0\n    88cd:\t66 0f 38 40 c0       \tpmulld %xmm0,%xmm0\n    88d2:\t66 0f 38 40 c0       \tpmulld %xmm0,%xmm0\n    88d7:\t66 0f 38 40 c0       \tpmulld %xmm0,%xmm0\n    88dc:\t66 0f 38 40 c0       \tpmulld %xmm0,%xmm0\n    88e1:\t66 0f 38 40 c0       \tpmulld %xmm0,%xmm0\n    88e6:\t66 0f 38 40 c0       \tpmulld %xmm0,%xmm0\n    88eb:\t66 0f 38 40 c0       \tpmulld %xmm0,%xmm0\n    88f0:\t66 0f 38 40 c0       \tpmulld %xmm0,%xmm0\n    88f5:\t66 0f 38 40 c0       \tpmulld %xmm0,%xmm0\n    88fa:\t66 0f 38 40 c0       \tpmulld %xmm0,%xmm0\n    88ff:\t66 0f 38 40 c0       \tpmulld %xmm0,%xmm0\n    8904:\t66 0f 38 40 c0       \tpmulld %xmm0,%xmm0\n    8909:\t66 0f 38 40 c0       \tpmulld %xmm0,%xmm0\n    890e:\t4c 29 cf             \tsub    %r9,%rdi\n    8911:\t75 97                \tjne    88aa <latmul128int_loop>\n    8913:\t66 48 0f 7e c8       \tmovq   %xmm1,%rax\n    8918:\t41 59                \tpop    %r9\n    891a:\tc3                   \tretq   \n\n000000000000891b <mixaddmul128int>:\n    891b:\t41 51                \tpush   %r9\n    891d:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n    8924:\t66 49 0f 6e c9       \tmovq   %r9,%xmm1\n    8929:\t66 0f 6f c8          \tmovdqa %xmm0,%xmm1\n    892d:\t66 0f 6f d0          \tmovdqa %xmm0,%xmm2\n    8931:\t66 0f 6f d8          \tmovdqa %xmm0,%xmm3\n    8935:\t66 0f 6f e0          \tmovdqa %xmm0,%xmm4\n    8939:\t66 0f 6f e8          \tmovdqa %xmm0,%xmm5\n    893d:\t66 0f 6f f0          \tmovdqa %xmm0,%xmm6\n    8941:\t66 0f 6f f8          \tmovdqa %xmm0,%xmm7\n    8945:\t66 44 0f 6f c0       \tmovdqa %xmm0,%xmm8\n    894a:\t66 44 
0f 6f c8       \tmovdqa %xmm0,%xmm9\n    894f:\t66 44 0f 6f d0       \tmovdqa %xmm0,%xmm10\n\n0000000000008954 <mixaddmul128int_loop>:\n    8954:\t66 0f 38 40 c8       \tpmulld %xmm0,%xmm1\n    8959:\t66 0f fe d0          \tpaddd  %xmm0,%xmm2\n    895d:\t66 0f 38 40 d8       \tpmulld %xmm0,%xmm3\n    8962:\t66 0f fe e0          \tpaddd  %xmm0,%xmm4\n    8966:\t66 0f 38 40 e8       \tpmulld %xmm0,%xmm5\n    896b:\t66 0f fe f0          \tpaddd  %xmm0,%xmm6\n    896f:\t66 0f 38 40 f8       \tpmulld %xmm0,%xmm7\n    8974:\t66 44 0f fe c0       \tpaddd  %xmm0,%xmm8\n    8979:\t66 44 0f 38 40 c8    \tpmulld %xmm0,%xmm9\n    897f:\t66 44 0f fe d0       \tpaddd  %xmm0,%xmm10\n    8984:\t66 0f 38 40 c8       \tpmulld %xmm0,%xmm1\n    8989:\t66 0f fe d0          \tpaddd  %xmm0,%xmm2\n    898d:\t66 0f 38 40 d8       \tpmulld %xmm0,%xmm3\n    8992:\t66 0f fe e0          \tpaddd  %xmm0,%xmm4\n    8996:\t66 0f 38 40 e8       \tpmulld %xmm0,%xmm5\n    899b:\t66 0f fe f0          \tpaddd  %xmm0,%xmm6\n    899f:\t66 0f 38 40 f8       \tpmulld %xmm0,%xmm7\n    89a4:\t66 44 0f fe c0       \tpaddd  %xmm0,%xmm8\n    89a9:\t66 44 0f 38 40 c8    \tpmulld %xmm0,%xmm9\n    89af:\t66 44 0f fe d0       \tpaddd  %xmm0,%xmm10\n    89b4:\t4c 29 cf             \tsub    %r9,%rdi\n    89b7:\t75 9b                \tjne    8954 <mixaddmul128int_loop>\n    89b9:\t66 48 0f 7e c8       \tmovq   %xmm1,%rax\n    89be:\t41 59                \tpop    %r9\n    89c0:\tc3                   \tretq   \n\n00000000000089c1 <latadd256fp>:\n    89c1:\t41 51                \tpush   %r9\n    89c3:\t41 50                \tpush   %r8\n    89c5:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n    89cc:\t66 49 0f 6e c9       \tmovq   %r9,%xmm1\n    89d1:\tf3 49 0f 2a f1       \tcvtsi2ss %r9,%xmm6\n    89d6:\tc4 e2 7d 18 f6       \tvbroadcastss %xmm6,%ymm6\n\n00000000000089db <latadd256fp_loop>:\n    89db:\tc5 cc 58 f6          \tvaddps %ymm6,%ymm6,%ymm6\n    89df:\tc5 cc 58 f6          \tvaddps %ymm6,%ymm6,%ymm6\n    89e3:\tc5 
cc 58 f6          \tvaddps %ymm6,%ymm6,%ymm6\n    89e7:\tc5 cc 58 f6          \tvaddps %ymm6,%ymm6,%ymm6\n    89eb:\tc5 cc 58 f6          \tvaddps %ymm6,%ymm6,%ymm6\n    89ef:\tc5 cc 58 f6          \tvaddps %ymm6,%ymm6,%ymm6\n    89f3:\tc5 cc 58 f6          \tvaddps %ymm6,%ymm6,%ymm6\n    89f7:\tc5 cc 58 f6          \tvaddps %ymm6,%ymm6,%ymm6\n    89fb:\tc5 cc 58 f6          \tvaddps %ymm6,%ymm6,%ymm6\n    89ff:\tc5 cc 58 f6          \tvaddps %ymm6,%ymm6,%ymm6\n    8a03:\tc5 cc 58 f6          \tvaddps %ymm6,%ymm6,%ymm6\n    8a07:\tc5 cc 58 f6          \tvaddps %ymm6,%ymm6,%ymm6\n    8a0b:\tc5 cc 58 f6          \tvaddps %ymm6,%ymm6,%ymm6\n    8a0f:\tc5 cc 58 f6          \tvaddps %ymm6,%ymm6,%ymm6\n    8a13:\tc5 cc 58 f6          \tvaddps %ymm6,%ymm6,%ymm6\n    8a17:\tc5 cc 58 f6          \tvaddps %ymm6,%ymm6,%ymm6\n    8a1b:\tc5 cc 58 f6          \tvaddps %ymm6,%ymm6,%ymm6\n    8a1f:\tc5 cc 58 f6          \tvaddps %ymm6,%ymm6,%ymm6\n    8a23:\tc5 cc 58 f6          \tvaddps %ymm6,%ymm6,%ymm6\n    8a27:\tc5 cc 58 f6          \tvaddps %ymm6,%ymm6,%ymm6\n    8a2b:\t4c 29 cf             \tsub    %r9,%rdi\n    8a2e:\t75 ab                \tjne    89db <latadd256fp_loop>\n    8a30:\t66 48 0f 7e c8       \tmovq   %xmm1,%rax\n    8a35:\tc5 f8 77             \tvzeroupper \n    8a38:\t41 58                \tpop    %r8\n    8a3a:\t41 59                \tpop    %r9\n    8a3c:\tc3                   \tretq   \n\n0000000000008a3d <mul256fp>:\n    8a3d:\t41 51                \tpush   %r9\n    8a3f:\t41 50                \tpush   %r8\n    8a41:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n    8a48:\tf3 49 0f 2a c1       \tcvtsi2ss %r9,%xmm0\n    8a4d:\tc4 e2 7d 18 f0       \tvbroadcastss %xmm0,%ymm6\n    8a52:\tc5 fd 6f c8          \tvmovdqa %ymm0,%ymm1\n    8a56:\tc5 fd 6f d0          \tvmovdqa %ymm0,%ymm2\n    8a5a:\tc5 fd 6f d8          \tvmovdqa %ymm0,%ymm3\n    8a5e:\tc5 fd 6f e0          \tvmovdqa %ymm0,%ymm4\n    8a62:\tc5 fd 6f e8          \tvmovdqa %ymm0,%ymm5\n    8a66:\tc5 fd 
6f f0          \tvmovdqa %ymm0,%ymm6\n    8a6a:\tc5 fd 6f f8          \tvmovdqa %ymm0,%ymm7\n    8a6e:\tc5 7d 6f c0          \tvmovdqa %ymm0,%ymm8\n    8a72:\tc5 7d 6f c8          \tvmovdqa %ymm0,%ymm9\n    8a76:\tc5 7d 6f d0          \tvmovdqa %ymm0,%ymm10\n\n0000000000008a7a <mul256fp_loop>:\n    8a7a:\tc5 f4 59 c8          \tvmulps %ymm0,%ymm1,%ymm1\n    8a7e:\tc5 ec 59 d0          \tvmulps %ymm0,%ymm2,%ymm2\n    8a82:\tc5 e4 59 d8          \tvmulps %ymm0,%ymm3,%ymm3\n    8a86:\tc5 dc 59 e0          \tvmulps %ymm0,%ymm4,%ymm4\n    8a8a:\tc5 d4 59 e8          \tvmulps %ymm0,%ymm5,%ymm5\n    8a8e:\tc5 cc 59 f0          \tvmulps %ymm0,%ymm6,%ymm6\n    8a92:\tc5 c4 59 f8          \tvmulps %ymm0,%ymm7,%ymm7\n    8a96:\tc5 3c 59 c0          \tvmulps %ymm0,%ymm8,%ymm8\n    8a9a:\tc5 34 59 c8          \tvmulps %ymm0,%ymm9,%ymm9\n    8a9e:\tc5 2c 59 d0          \tvmulps %ymm0,%ymm10,%ymm10\n    8aa2:\tc5 f4 59 c8          \tvmulps %ymm0,%ymm1,%ymm1\n    8aa6:\tc5 ec 59 d0          \tvmulps %ymm0,%ymm2,%ymm2\n    8aaa:\tc5 e4 59 d8          \tvmulps %ymm0,%ymm3,%ymm3\n    8aae:\tc5 dc 59 e0          \tvmulps %ymm0,%ymm4,%ymm4\n    8ab2:\tc5 d4 59 e8          \tvmulps %ymm0,%ymm5,%ymm5\n    8ab6:\tc5 cc 59 f0          \tvmulps %ymm0,%ymm6,%ymm6\n    8aba:\tc5 c4 59 f8          \tvmulps %ymm0,%ymm7,%ymm7\n    8abe:\tc5 3c 59 c0          \tvmulps %ymm0,%ymm8,%ymm8\n    8ac2:\tc5 34 59 c8          \tvmulps %ymm0,%ymm9,%ymm9\n    8ac6:\tc5 2c 59 d0          \tvmulps %ymm0,%ymm10,%ymm10\n    8aca:\t4c 29 cf             \tsub    %r9,%rdi\n    8acd:\t75 ab                \tjne    8a7a <mul256fp_loop>\n    8acf:\t66 48 0f 7e c8       \tmovq   %xmm1,%rax\n    8ad4:\tc5 f8 77             \tvzeroupper \n    8ad7:\t41 58                \tpop    %r8\n    8ad9:\t41 59                \tpop    %r9\n    8adb:\tc3                   \tretq   \n\n0000000000008adc <add256fp>:\n    8adc:\t41 51                \tpush   %r9\n    8ade:\t41 50                \tpush   %r8\n    8ae0:\t49 c7 c1 14 00 
00 00 \tmov    $0x14,%r9\n    8ae7:\tf3 49 0f 2a c1       \tcvtsi2ss %r9,%xmm0\n    8aec:\tc4 e2 7d 18 f0       \tvbroadcastss %xmm0,%ymm6\n    8af1:\tc5 fd 6f c8          \tvmovdqa %ymm0,%ymm1\n    8af5:\tc5 fd 6f d0          \tvmovdqa %ymm0,%ymm2\n    8af9:\tc5 fd 6f d8          \tvmovdqa %ymm0,%ymm3\n    8afd:\tc5 fd 6f e0          \tvmovdqa %ymm0,%ymm4\n    8b01:\tc5 fd 6f e8          \tvmovdqa %ymm0,%ymm5\n    8b05:\tc5 fd 6f f0          \tvmovdqa %ymm0,%ymm6\n    8b09:\tc5 fd 6f f8          \tvmovdqa %ymm0,%ymm7\n    8b0d:\tc5 7d 6f c0          \tvmovdqa %ymm0,%ymm8\n    8b11:\tc5 7d 6f c8          \tvmovdqa %ymm0,%ymm9\n    8b15:\tc5 7d 6f d0          \tvmovdqa %ymm0,%ymm10\n\n0000000000008b19 <add256fp_loop>:\n    8b19:\tc5 f4 58 c8          \tvaddps %ymm0,%ymm1,%ymm1\n    8b1d:\tc5 ec 58 d0          \tvaddps %ymm0,%ymm2,%ymm2\n    8b21:\tc5 e4 58 d8          \tvaddps %ymm0,%ymm3,%ymm3\n    8b25:\tc5 dc 58 e0          \tvaddps %ymm0,%ymm4,%ymm4\n    8b29:\tc5 d4 58 e8          \tvaddps %ymm0,%ymm5,%ymm5\n    8b2d:\tc5 cc 58 f0          \tvaddps %ymm0,%ymm6,%ymm6\n    8b31:\tc5 c4 58 f8          \tvaddps %ymm0,%ymm7,%ymm7\n    8b35:\tc5 3c 58 c0          \tvaddps %ymm0,%ymm8,%ymm8\n    8b39:\tc5 34 58 c8          \tvaddps %ymm0,%ymm9,%ymm9\n    8b3d:\tc5 2c 58 d0          \tvaddps %ymm0,%ymm10,%ymm10\n    8b41:\tc5 f4 58 c8          \tvaddps %ymm0,%ymm1,%ymm1\n    8b45:\tc5 ec 58 d0          \tvaddps %ymm0,%ymm2,%ymm2\n    8b49:\tc5 e4 58 d8          \tvaddps %ymm0,%ymm3,%ymm3\n    8b4d:\tc5 dc 58 e0          \tvaddps %ymm0,%ymm4,%ymm4\n    8b51:\tc5 d4 58 e8          \tvaddps %ymm0,%ymm5,%ymm5\n    8b55:\tc5 cc 58 f0          \tvaddps %ymm0,%ymm6,%ymm6\n    8b59:\tc5 c4 58 f8          \tvaddps %ymm0,%ymm7,%ymm7\n    8b5d:\tc5 3c 58 c0          \tvaddps %ymm0,%ymm8,%ymm8\n    8b61:\tc5 34 58 c8          \tvaddps %ymm0,%ymm9,%ymm9\n    8b65:\tc5 2c 58 d0          \tvaddps %ymm0,%ymm10,%ymm10\n    8b69:\t4c 29 cf             \tsub    %r9,%rdi\n    8b6c:\t75 ab 
               \tjne    8b19 <add256fp_loop>\n    8b6e:\t66 48 0f 7e c8       \tmovq   %xmm1,%rax\n    8b73:\tc5 f8 77             \tvzeroupper \n    8b76:\t41 58                \tpop    %r8\n    8b78:\t41 59                \tpop    %r9\n    8b7a:\tc3                   \tretq   \n\n0000000000008b7b <latmul256fp>:\n    8b7b:\t41 51                \tpush   %r9\n    8b7d:\t41 50                \tpush   %r8\n    8b7f:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n    8b86:\t66 49 0f 6e c9       \tmovq   %r9,%xmm1\n    8b8b:\tf3 49 0f 2a f1       \tcvtsi2ss %r9,%xmm6\n    8b90:\tc4 e2 7d 18 f6       \tvbroadcastss %xmm6,%ymm6\n\n0000000000008b95 <latmul256fp_loop>:\n    8b95:\tc5 cc 59 f6          \tvmulps %ymm6,%ymm6,%ymm6\n    8b99:\tc5 cc 59 f6          \tvmulps %ymm6,%ymm6,%ymm6\n    8b9d:\tc5 cc 59 f6          \tvmulps %ymm6,%ymm6,%ymm6\n    8ba1:\tc5 cc 59 f6          \tvmulps %ymm6,%ymm6,%ymm6\n    8ba5:\tc5 cc 59 f6          \tvmulps %ymm6,%ymm6,%ymm6\n    8ba9:\tc5 cc 59 f6          \tvmulps %ymm6,%ymm6,%ymm6\n    8bad:\tc5 cc 59 f6          \tvmulps %ymm6,%ymm6,%ymm6\n    8bb1:\tc5 cc 59 f6          \tvmulps %ymm6,%ymm6,%ymm6\n    8bb5:\tc5 cc 59 f6          \tvmulps %ymm6,%ymm6,%ymm6\n    8bb9:\tc5 cc 59 f6          \tvmulps %ymm6,%ymm6,%ymm6\n    8bbd:\tc5 cc 59 f6          \tvmulps %ymm6,%ymm6,%ymm6\n    8bc1:\tc5 cc 59 f6          \tvmulps %ymm6,%ymm6,%ymm6\n    8bc5:\tc5 cc 59 f6          \tvmulps %ymm6,%ymm6,%ymm6\n    8bc9:\tc5 cc 59 f6          \tvmulps %ymm6,%ymm6,%ymm6\n    8bcd:\tc5 cc 59 f6          \tvmulps %ymm6,%ymm6,%ymm6\n    8bd1:\tc5 cc 59 f6          \tvmulps %ymm6,%ymm6,%ymm6\n    8bd5:\tc5 cc 59 f6          \tvmulps %ymm6,%ymm6,%ymm6\n    8bd9:\tc5 cc 59 f6          \tvmulps %ymm6,%ymm6,%ymm6\n    8bdd:\tc5 cc 59 f6          \tvmulps %ymm6,%ymm6,%ymm6\n    8be1:\tc5 cc 59 f6          \tvmulps %ymm6,%ymm6,%ymm6\n    8be5:\t4c 29 cf             \tsub    %r9,%rdi\n    8be8:\t75 ab                \tjne    8b95 <latmul256fp_loop>\n    8bea:\t66 48 
0f 7e c8       \tmovq   %xmm1,%rax\n    8bef:\tc5 f8 77             \tvzeroupper \n    8bf2:\t41 58                \tpop    %r8\n    8bf4:\t41 59                \tpop    %r9\n    8bf6:\tc3                   \tretq   \n\n0000000000008bf7 <fma512>:\n    8bf7:\t41 51                \tpush   %r9\n    8bf9:\t41 50                \tpush   %r8\n    8bfb:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n    8c02:\t66 49 0f 6e c9       \tmovq   %r9,%xmm1\n    8c07:\tf3 49 0f 2a f1       \tcvtsi2ss %r9,%xmm6\n    8c0c:\t62 f2 7d 48 18 f6    \tvbroadcastss %xmm6,%zmm6\n    8c12:\t62 f1 7c 48 10 ee    \tvmovups %zmm6,%zmm5\n    8c18:\t62 f1 7c 48 10 fe    \tvmovups %zmm6,%zmm7\n    8c1e:\t62 71 7c 48 10 c6    \tvmovups %zmm6,%zmm8\n    8c24:\t62 71 7c 48 10 ce    \tvmovups %zmm6,%zmm9\n    8c2a:\t62 71 7c 48 10 d6    \tvmovups %zmm6,%zmm10\n    8c30:\t62 71 7c 48 10 de    \tvmovups %zmm6,%zmm11\n    8c36:\t62 71 7c 48 10 e6    \tvmovups %zmm6,%zmm12\n    8c3c:\t62 71 7c 48 10 ee    \tvmovups %zmm6,%zmm13\n    8c42:\t62 71 7c 48 10 f6    \tvmovups %zmm6,%zmm14\n    8c48:\t62 71 7c 48 10 fe    \tvmovups %zmm6,%zmm15\n\n0000000000008c4e <fma512_loop>:\n    8c4e:\t62 f2 55 48 98 ee    \tvfmadd132ps %zmm6,%zmm5,%zmm5\n    8c54:\t62 f2 45 48 98 fe    \tvfmadd132ps %zmm6,%zmm7,%zmm7\n    8c5a:\t62 72 3d 48 98 c6    \tvfmadd132ps %zmm6,%zmm8,%zmm8\n    8c60:\t62 72 35 48 98 ce    \tvfmadd132ps %zmm6,%zmm9,%zmm9\n    8c66:\t62 72 2d 48 98 d6    \tvfmadd132ps %zmm6,%zmm10,%zmm10\n    8c6c:\t62 72 25 48 98 de    \tvfmadd132ps %zmm6,%zmm11,%zmm11\n    8c72:\t62 72 1d 48 98 e6    \tvfmadd132ps %zmm6,%zmm12,%zmm12\n    8c78:\t62 72 15 48 98 ee    \tvfmadd132ps %zmm6,%zmm13,%zmm13\n    8c7e:\t62 72 0d 48 98 f6    \tvfmadd132ps %zmm6,%zmm14,%zmm14\n    8c84:\t62 72 05 48 98 fe    \tvfmadd132ps %zmm6,%zmm15,%zmm15\n    8c8a:\t62 f2 55 48 98 ee    \tvfmadd132ps %zmm6,%zmm5,%zmm5\n    8c90:\t62 f2 45 48 98 fe    \tvfmadd132ps %zmm6,%zmm7,%zmm7\n    8c96:\t62 72 3d 48 98 c6    \tvfmadd132ps 
%zmm6,%zmm8,%zmm8\n    8c9c:\t62 72 35 48 98 ce    \tvfmadd132ps %zmm6,%zmm9,%zmm9\n    8ca2:\t62 72 2d 48 98 d6    \tvfmadd132ps %zmm6,%zmm10,%zmm10\n    8ca8:\t62 72 25 48 98 de    \tvfmadd132ps %zmm6,%zmm11,%zmm11\n    8cae:\t62 72 1d 48 98 e6    \tvfmadd132ps %zmm6,%zmm12,%zmm12\n    8cb4:\t62 72 15 48 98 ee    \tvfmadd132ps %zmm6,%zmm13,%zmm13\n    8cba:\t62 72 0d 48 98 f6    \tvfmadd132ps %zmm6,%zmm14,%zmm14\n    8cc0:\t62 72 05 48 98 fe    \tvfmadd132ps %zmm6,%zmm15,%zmm15\n    8cc6:\t4c 29 cf             \tsub    %r9,%rdi\n    8cc9:\t75 83                \tjne    8c4e <fma512_loop>\n    8ccb:\t66 48 0f 7e c8       \tmovq   %xmm1,%rax\n    8cd0:\tc5 f8 77             \tvzeroupper \n    8cd3:\t41 58                \tpop    %r8\n    8cd5:\t41 59                \tpop    %r9\n    8cd7:\tc3                   \tretq   \n\n0000000000008cd8 <mixfma256fma512>:\n    8cd8:\t41 51                \tpush   %r9\n    8cda:\t41 50                \tpush   %r8\n    8cdc:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n    8ce3:\t66 49 0f 6e c9       \tmovq   %r9,%xmm1\n    8ce8:\tf3 49 0f 2a f1       \tcvtsi2ss %r9,%xmm6\n    8ced:\t62 f2 7d 48 18 f6    \tvbroadcastss %xmm6,%zmm6\n    8cf3:\t62 f1 7c 48 10 ee    \tvmovups %zmm6,%zmm5\n    8cf9:\t62 f1 7c 48 10 fe    \tvmovups %zmm6,%zmm7\n    8cff:\t62 71 7c 48 10 c6    \tvmovups %zmm6,%zmm8\n    8d05:\t62 71 7c 48 10 ce    \tvmovups %zmm6,%zmm9\n    8d0b:\t62 71 7c 48 10 d6    \tvmovups %zmm6,%zmm10\n    8d11:\t62 71 7c 48 10 de    \tvmovups %zmm6,%zmm11\n    8d17:\t62 71 7c 48 10 e6    \tvmovups %zmm6,%zmm12\n    8d1d:\t62 71 7c 48 10 ee    \tvmovups %zmm6,%zmm13\n    8d23:\t62 71 7c 48 10 f6    \tvmovups %zmm6,%zmm14\n    8d29:\t62 71 7c 48 10 fe    \tvmovups %zmm6,%zmm15\n\n0000000000008d2f <mixfma256fma512_loop>:\n    8d2f:\tc4 e2 55 98 ee       \tvfmadd132ps %ymm6,%ymm5,%ymm5\n    8d34:\t62 f2 45 48 98 fe    \tvfmadd132ps %zmm6,%zmm7,%zmm7\n    8d3a:\tc4 62 3d 98 c6       \tvfmadd132ps %ymm6,%ymm8,%ymm8\n    8d3f:\t62 72 35 48 
98 ce    \tvfmadd132ps %zmm6,%zmm9,%zmm9\n    8d45:\tc4 62 2d 98 d6       \tvfmadd132ps %ymm6,%ymm10,%ymm10\n    8d4a:\t62 72 25 48 98 de    \tvfmadd132ps %zmm6,%zmm11,%zmm11\n    8d50:\tc4 62 1d 98 e6       \tvfmadd132ps %ymm6,%ymm12,%ymm12\n    8d55:\t62 72 15 48 98 ee    \tvfmadd132ps %zmm6,%zmm13,%zmm13\n    8d5b:\tc4 62 0d 98 f6       \tvfmadd132ps %ymm6,%ymm14,%ymm14\n    8d60:\t62 72 05 48 98 fe    \tvfmadd132ps %zmm6,%zmm15,%zmm15\n    8d66:\tc4 e2 55 98 ee       \tvfmadd132ps %ymm6,%ymm5,%ymm5\n    8d6b:\t62 f2 45 48 98 fe    \tvfmadd132ps %zmm6,%zmm7,%zmm7\n    8d71:\tc4 62 3d 98 c6       \tvfmadd132ps %ymm6,%ymm8,%ymm8\n    8d76:\t62 72 35 48 98 ce    \tvfmadd132ps %zmm6,%zmm9,%zmm9\n    8d7c:\tc4 62 2d 98 d6       \tvfmadd132ps %ymm6,%ymm10,%ymm10\n    8d81:\t62 72 25 48 98 de    \tvfmadd132ps %zmm6,%zmm11,%zmm11\n    8d87:\tc4 62 1d 98 e6       \tvfmadd132ps %ymm6,%ymm12,%ymm12\n    8d8c:\t62 72 15 48 98 ee    \tvfmadd132ps %zmm6,%zmm13,%zmm13\n    8d92:\tc4 62 0d 98 f6       \tvfmadd132ps %ymm6,%ymm14,%ymm14\n    8d97:\t62 72 05 48 98 fe    \tvfmadd132ps %zmm6,%zmm15,%zmm15\n    8d9d:\t4c 29 cf             \tsub    %r9,%rdi\n    8da0:\t75 8d                \tjne    8d2f <mixfma256fma512_loop>\n    8da2:\t66 48 0f 7e c8       \tmovq   %xmm1,%rax\n    8da7:\tc5 f8 77             \tvzeroupper \n    8daa:\t41 58                \tpop    %r8\n    8dac:\t41 59                \tpop    %r9\n    8dae:\tc3                   \tretq   \n\n0000000000008daf <fma256>:\n    8daf:\t41 51                \tpush   %r9\n    8db1:\t41 50                \tpush   %r8\n    8db3:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n    8dba:\t66 49 0f 6e c9       \tmovq   %r9,%xmm1\n    8dbf:\tf3 49 0f 2a f1       \tcvtsi2ss %r9,%xmm6\n    8dc4:\tc4 e2 7d 18 f6       \tvbroadcastss %xmm6,%ymm6\n    8dc9:\tc5 fc 10 ee          \tvmovups %ymm6,%ymm5\n    8dcd:\tc5 fc 10 fe          \tvmovups %ymm6,%ymm7\n    8dd1:\tc5 7c 10 c6          \tvmovups %ymm6,%ymm8\n    8dd5:\tc5 7c 10 ce          
\tvmovups %ymm6,%ymm9\n    8dd9:\tc5 7c 10 d6          \tvmovups %ymm6,%ymm10\n    8ddd:\tc5 7c 10 de          \tvmovups %ymm6,%ymm11\n    8de1:\tc5 7c 10 e6          \tvmovups %ymm6,%ymm12\n    8de5:\tc5 7c 10 ee          \tvmovups %ymm6,%ymm13\n    8de9:\tc5 7c 10 f6          \tvmovups %ymm6,%ymm14\n    8ded:\tc5 7c 10 fe          \tvmovups %ymm6,%ymm15\n\n0000000000008df1 <fma256_loop>:\n    8df1:\tc4 e2 55 98 ee       \tvfmadd132ps %ymm6,%ymm5,%ymm5\n    8df6:\tc4 e2 45 98 fe       \tvfmadd132ps %ymm6,%ymm7,%ymm7\n    8dfb:\tc4 62 3d 98 c6       \tvfmadd132ps %ymm6,%ymm8,%ymm8\n    8e00:\tc4 62 35 98 ce       \tvfmadd132ps %ymm6,%ymm9,%ymm9\n    8e05:\tc4 62 2d 98 d6       \tvfmadd132ps %ymm6,%ymm10,%ymm10\n    8e0a:\tc4 62 25 98 de       \tvfmadd132ps %ymm6,%ymm11,%ymm11\n    8e0f:\tc4 62 1d 98 e6       \tvfmadd132ps %ymm6,%ymm12,%ymm12\n    8e14:\tc4 62 15 98 ee       \tvfmadd132ps %ymm6,%ymm13,%ymm13\n    8e19:\tc4 62 0d 98 f6       \tvfmadd132ps %ymm6,%ymm14,%ymm14\n    8e1e:\tc4 62 05 98 fe       \tvfmadd132ps %ymm6,%ymm15,%ymm15\n    8e23:\tc4 e2 55 98 ee       \tvfmadd132ps %ymm6,%ymm5,%ymm5\n    8e28:\tc4 e2 45 98 fe       \tvfmadd132ps %ymm6,%ymm7,%ymm7\n    8e2d:\tc4 62 3d 98 c6       \tvfmadd132ps %ymm6,%ymm8,%ymm8\n    8e32:\tc4 62 35 98 ce       \tvfmadd132ps %ymm6,%ymm9,%ymm9\n    8e37:\tc4 62 2d 98 d6       \tvfmadd132ps %ymm6,%ymm10,%ymm10\n    8e3c:\tc4 62 25 98 de       \tvfmadd132ps %ymm6,%ymm11,%ymm11\n    8e41:\tc4 62 1d 98 e6       \tvfmadd132ps %ymm6,%ymm12,%ymm12\n    8e46:\tc4 62 15 98 ee       \tvfmadd132ps %ymm6,%ymm13,%ymm13\n    8e4b:\tc4 62 0d 98 f6       \tvfmadd132ps %ymm6,%ymm14,%ymm14\n    8e50:\tc4 62 05 98 fe       \tvfmadd132ps %ymm6,%ymm15,%ymm15\n    8e55:\t4c 29 cf             \tsub    %r9,%rdi\n    8e58:\t75 97                \tjne    8df1 <fma256_loop>\n    8e5a:\t66 48 0f 7e c8       \tmovq   %xmm1,%rax\n    8e5f:\tc5 f8 77             \tvzeroupper \n    8e62:\t41 58                \tpop    %r8\n    8e64:\t41 59        
        \tpop    %r9\n    8e66:\tc3                   \tretq   \n\n0000000000008e67 <fma128>:\n    8e67:\t41 51                \tpush   %r9\n    8e69:\t41 50                \tpush   %r8\n    8e6b:\tc5 f8 77             \tvzeroupper \n    8e6e:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n    8e75:\t66 49 0f 6e c9       \tmovq   %r9,%xmm1\n    8e7a:\tf3 49 0f 2a f1       \tcvtsi2ss %r9,%xmm6\n    8e7f:\tc4 e2 79 18 f6       \tvbroadcastss %xmm6,%xmm6\n    8e84:\tc5 f8 10 ee          \tvmovups %xmm6,%xmm5\n    8e88:\tc5 f8 10 fe          \tvmovups %xmm6,%xmm7\n    8e8c:\tc5 78 10 c6          \tvmovups %xmm6,%xmm8\n    8e90:\tc5 78 10 ce          \tvmovups %xmm6,%xmm9\n    8e94:\tc5 78 10 d6          \tvmovups %xmm6,%xmm10\n    8e98:\tc5 78 10 de          \tvmovups %xmm6,%xmm11\n    8e9c:\tc5 78 10 e6          \tvmovups %xmm6,%xmm12\n    8ea0:\tc5 78 10 ee          \tvmovups %xmm6,%xmm13\n    8ea4:\tc5 78 10 f6          \tvmovups %xmm6,%xmm14\n    8ea8:\tc5 78 10 fe          \tvmovups %xmm6,%xmm15\n\n0000000000008eac <fma128_loop>:\n    8eac:\tc4 e2 51 98 ee       \tvfmadd132ps %xmm6,%xmm5,%xmm5\n    8eb1:\tc4 e2 41 98 fe       \tvfmadd132ps %xmm6,%xmm7,%xmm7\n    8eb6:\tc4 62 39 98 c6       \tvfmadd132ps %xmm6,%xmm8,%xmm8\n    8ebb:\tc4 62 31 98 ce       \tvfmadd132ps %xmm6,%xmm9,%xmm9\n    8ec0:\tc4 62 29 98 d6       \tvfmadd132ps %xmm6,%xmm10,%xmm10\n    8ec5:\tc4 62 21 98 de       \tvfmadd132ps %xmm6,%xmm11,%xmm11\n    8eca:\tc4 62 19 98 e6       \tvfmadd132ps %xmm6,%xmm12,%xmm12\n    8ecf:\tc4 62 11 98 ee       \tvfmadd132ps %xmm6,%xmm13,%xmm13\n    8ed4:\tc4 62 09 98 f6       \tvfmadd132ps %xmm6,%xmm14,%xmm14\n    8ed9:\tc4 62 01 98 fe       \tvfmadd132ps %xmm6,%xmm15,%xmm15\n    8ede:\tc4 e2 51 98 ee       \tvfmadd132ps %xmm6,%xmm5,%xmm5\n    8ee3:\tc4 e2 41 98 fe       \tvfmadd132ps %xmm6,%xmm7,%xmm7\n    8ee8:\tc4 62 39 98 c6       \tvfmadd132ps %xmm6,%xmm8,%xmm8\n    8eed:\tc4 62 31 98 ce       \tvfmadd132ps %xmm6,%xmm9,%xmm9\n    8ef2:\tc4 62 29 98 d6       
\tvfmadd132ps %xmm6,%xmm10,%xmm10\n    8ef7:\tc4 62 21 98 de       \tvfmadd132ps %xmm6,%xmm11,%xmm11\n    8efc:\tc4 62 19 98 e6       \tvfmadd132ps %xmm6,%xmm12,%xmm12\n    8f01:\tc4 62 11 98 ee       \tvfmadd132ps %xmm6,%xmm13,%xmm13\n    8f06:\tc4 62 09 98 f6       \tvfmadd132ps %xmm6,%xmm14,%xmm14\n    8f0b:\tc4 62 01 98 fe       \tvfmadd132ps %xmm6,%xmm15,%xmm15\n    8f10:\t4c 29 cf             \tsub    %r9,%rdi\n    8f13:\t75 97                \tjne    8eac <fma128_loop>\n    8f15:\t66 48 0f 7e c8       \tmovq   %xmm1,%rax\n    8f1a:\tc5 f8 77             \tvzeroupper \n    8f1d:\t41 58                \tpop    %r8\n    8f1f:\t41 59                \tpop    %r9\n    8f21:\tc3                   \tretq   \n\n0000000000008f22 <mixfmafadd256>:\n    8f22:\t41 51                \tpush   %r9\n    8f24:\t41 50                \tpush   %r8\n    8f26:\t49 c7 c1 1e 00 00 00 \tmov    $0x1e,%r9\n    8f2d:\t66 49 0f 6e c9       \tmovq   %r9,%xmm1\n    8f32:\tf3 49 0f 2a f1       \tcvtsi2ss %r9,%xmm6\n    8f37:\tc4 e2 7d 18 f6       \tvbroadcastss %xmm6,%ymm6\n    8f3c:\tc5 fc 10 c6          \tvmovups %ymm6,%ymm0\n    8f40:\tc5 fc 10 ce          \tvmovups %ymm6,%ymm1\n    8f44:\tc5 fc 10 d6          \tvmovups %ymm6,%ymm2\n    8f48:\tc5 fc 10 de          \tvmovups %ymm6,%ymm3\n    8f4c:\tc5 fc 10 e6          \tvmovups %ymm6,%ymm4\n    8f50:\tc5 fc 10 ee          \tvmovups %ymm6,%ymm5\n    8f54:\tc5 fc 10 fe          \tvmovups %ymm6,%ymm7\n    8f58:\tc5 7c 10 c6          \tvmovups %ymm6,%ymm8\n    8f5c:\tc5 7c 10 ce          \tvmovups %ymm6,%ymm9\n    8f60:\tc5 7c 10 d6          \tvmovups %ymm6,%ymm10\n    8f64:\tc5 7c 10 de          \tvmovups %ymm6,%ymm11\n    8f68:\tc5 7c 10 e6          \tvmovups %ymm6,%ymm12\n    8f6c:\tc5 7c 10 ee          \tvmovups %ymm6,%ymm13\n    8f70:\tc5 7c 10 f6          \tvmovups %ymm6,%ymm14\n    8f74:\tc5 7c 10 fe          \tvmovups %ymm6,%ymm15\n\n0000000000008f78 <mixfmafadd256_loop>:\n    8f78:\tc4 e2 55 98 ee       \tvfmadd132ps 
%ymm6,%ymm5,%ymm5\n    8f7d:\tc4 e2 45 98 fe       \tvfmadd132ps %ymm6,%ymm7,%ymm7\n    8f82:\tc4 41 54 58 da       \tvaddps %ymm10,%ymm5,%ymm11\n    8f87:\tc4 62 3d 98 c6       \tvfmadd132ps %ymm6,%ymm8,%ymm8\n    8f8c:\tc4 62 35 98 ce       \tvfmadd132ps %ymm6,%ymm9,%ymm9\n    8f91:\tc4 41 54 58 ec       \tvaddps %ymm12,%ymm5,%ymm13\n    8f96:\tc4 62 0d 98 f6       \tvfmadd132ps %ymm6,%ymm14,%ymm14\n    8f9b:\tc4 62 05 98 fe       \tvfmadd132ps %ymm6,%ymm15,%ymm15\n    8fa0:\tc4 41 4c 58 ec       \tvaddps %ymm12,%ymm6,%ymm13\n    8fa5:\tc4 e2 7d 98 ce       \tvfmadd132ps %ymm6,%ymm0,%ymm1\n    8faa:\tc4 e2 6d 98 de       \tvfmadd132ps %ymm6,%ymm2,%ymm3\n    8faf:\tc5 d4 58 e6          \tvaddps %ymm6,%ymm5,%ymm4\n    8fb3:\tc4 e2 55 98 ee       \tvfmadd132ps %ymm6,%ymm5,%ymm5\n    8fb8:\tc4 e2 45 98 fe       \tvfmadd132ps %ymm6,%ymm7,%ymm7\n    8fbd:\tc4 41 4c 58 da       \tvaddps %ymm10,%ymm6,%ymm11\n    8fc2:\tc4 62 3d 98 c6       \tvfmadd132ps %ymm6,%ymm8,%ymm8\n    8fc7:\tc4 62 35 98 ce       \tvfmadd132ps %ymm6,%ymm9,%ymm9\n    8fcc:\tc4 41 44 58 ec       \tvaddps %ymm12,%ymm7,%ymm13\n    8fd1:\tc4 62 0d 98 f6       \tvfmadd132ps %ymm6,%ymm14,%ymm14\n    8fd6:\tc4 62 05 98 fe       \tvfmadd132ps %ymm6,%ymm15,%ymm15\n    8fdb:\tc4 41 54 58 ec       \tvaddps %ymm12,%ymm5,%ymm13\n    8fe0:\tc4 e2 7d 98 ce       \tvfmadd132ps %ymm6,%ymm0,%ymm1\n    8fe5:\tc4 e2 6d 98 de       \tvfmadd132ps %ymm6,%ymm2,%ymm3\n    8fea:\tc5 d4 58 e6          \tvaddps %ymm6,%ymm5,%ymm4\n    8fee:\tc4 e2 55 98 ee       \tvfmadd132ps %ymm6,%ymm5,%ymm5\n    8ff3:\tc4 e2 45 98 fe       \tvfmadd132ps %ymm6,%ymm7,%ymm7\n    8ff8:\tc4 41 4c 58 da       \tvaddps %ymm10,%ymm6,%ymm11\n    8ffd:\tc4 62 3d 98 c6       \tvfmadd132ps %ymm6,%ymm8,%ymm8\n    9002:\tc4 62 35 98 ce       \tvfmadd132ps %ymm6,%ymm9,%ymm9\n    9007:\tc4 41 54 58 ec       \tvaddps %ymm12,%ymm5,%ymm13\n    900c:\t4c 29 cf             \tsub    %r9,%rdi\n    900f:\t0f 85 63 ff ff ff    \tjne    8f78 <mixfmafadd256_loop>\n   
 9015:\t66 48 0f 7e c8       \tmovq   %xmm1,%rax\n    901a:\tc5 f8 77             \tvzeroupper \n    901d:\t41 58                \tpop    %r8\n    901f:\t41 59                \tpop    %r9\n    9021:\tc3                   \tretq   \n\n0000000000009022 <mixfmaadd512>:\n    9022:\t41 51                \tpush   %r9\n    9024:\t41 50                \tpush   %r8\n    9026:\t49 c7 c1 10 00 00 00 \tmov    $0x10,%r9\n    902d:\t66 49 0f 6e c1       \tmovq   %r9,%xmm0\n    9032:\t62 f2 fd 48 59 c0    \tvpbroadcastq %xmm0,%zmm0\n    9038:\tf3 49 0f 2a c9       \tcvtsi2ss %r9,%xmm1\n    903d:\t62 f2 7d 48 18 c9    \tvbroadcastss %xmm1,%zmm1\n    9043:\t62 f1 fd 48 6f d8    \tvmovdqa64 %zmm0,%zmm3\n    9049:\t62 f1 fd 48 6f f0    \tvmovdqa64 %zmm0,%zmm6\n    904f:\t62 71 fd 48 6f c8    \tvmovdqa64 %zmm0,%zmm9\n    9055:\t62 71 fd 48 6f e0    \tvmovdqa64 %zmm0,%zmm12\n    905b:\t62 71 fd 48 6f f8    \tvmovdqa64 %zmm0,%zmm15\n    9061:\t62 f1 7c 48 28 d1    \tvmovaps %zmm1,%zmm2\n    9067:\t62 f1 7c 48 28 e1    \tvmovaps %zmm1,%zmm4\n    906d:\t62 f1 7c 48 28 e9    \tvmovaps %zmm1,%zmm5\n    9073:\t62 f1 7c 48 28 f9    \tvmovaps %zmm1,%zmm7\n    9079:\t62 71 7c 48 28 c1    \tvmovaps %zmm1,%zmm8\n    907f:\t62 71 7c 48 28 d1    \tvmovaps %zmm1,%zmm10\n    9085:\t62 71 7c 48 28 d9    \tvmovaps %zmm1,%zmm11\n    908b:\t62 71 7c 48 28 e9    \tvmovaps %zmm1,%zmm13\n    9091:\t62 71 7c 48 28 f1    \tvmovaps %zmm1,%zmm14\n\n0000000000009097 <mixfmaadd512_loop>:\n    9097:\t62 f1 85 48 d4 c0    \tvpaddq %zmm0,%zmm15,%zmm0\n    909d:\t62 f2 75 48 98 c9    \tvfmadd132ps %zmm1,%zmm1,%zmm1\n    90a3:\t62 f2 6d 48 98 d2    \tvfmadd132ps %zmm2,%zmm2,%zmm2\n    90a9:\t62 f1 85 48 d4 db    \tvpaddq %zmm3,%zmm15,%zmm3\n    90af:\t62 f2 5d 48 98 e4    \tvfmadd132ps %zmm4,%zmm4,%zmm4\n    90b5:\t62 f2 55 48 98 ed    \tvfmadd132ps %zmm5,%zmm5,%zmm5\n    90bb:\t62 f1 85 48 d4 f6    \tvpaddq %zmm6,%zmm15,%zmm6\n    90c1:\t62 f2 45 48 98 ff    \tvfmadd132ps %zmm7,%zmm7,%zmm7\n    90c7:\t62 52 3d 48 98 
c0    \tvfmadd132ps %zmm8,%zmm8,%zmm8\n    90cd:\t62 51 85 48 d4 c9    \tvpaddq %zmm9,%zmm15,%zmm9\n    90d3:\t62 52 2d 48 98 d2    \tvfmadd132ps %zmm10,%zmm10,%zmm10\n    90d9:\t62 52 25 48 98 db    \tvfmadd132ps %zmm11,%zmm11,%zmm11\n    90df:\t62 51 85 48 d4 e4    \tvpaddq %zmm12,%zmm15,%zmm12\n    90e5:\t62 52 15 48 98 ed    \tvfmadd132ps %zmm13,%zmm13,%zmm13\n    90eb:\t62 52 0d 48 98 f6    \tvfmadd132ps %zmm14,%zmm14,%zmm14\n    90f1:\t4c 29 cf             \tsub    %r9,%rdi\n    90f4:\t7f a1                \tjg     9097 <mixfmaadd512_loop>\n    90f6:\t66 48 0f 7e c8       \tmovq   %xmm1,%rax\n    90fb:\tc5 f8 77             \tvzeroupper \n    90fe:\t41 58                \tpop    %r8\n    9100:\t41 59                \tpop    %r9\n    9102:\tc3                   \tretq   \n\n0000000000009103 <mixfma512add256>:\n    9103:\t41 51                \tpush   %r9\n    9105:\t41 50                \tpush   %r8\n    9107:\t49 c7 c1 10 00 00 00 \tmov    $0x10,%r9\n    910e:\t66 49 0f 6e c1       \tmovq   %r9,%xmm0\n    9113:\tc4 e2 7d 59 c0       \tvpbroadcastq %xmm0,%ymm0\n    9118:\tf3 49 0f 2a c9       \tcvtsi2ss %r9,%xmm1\n    911d:\t62 f2 7d 48 18 c9    \tvbroadcastss %xmm1,%zmm1\n    9123:\tc5 fd 6f d8          \tvmovdqa %ymm0,%ymm3\n    9127:\tc5 fd 6f f0          \tvmovdqa %ymm0,%ymm6\n    912b:\tc5 7d 6f c8          \tvmovdqa %ymm0,%ymm9\n    912f:\tc5 7d 6f e0          \tvmovdqa %ymm0,%ymm12\n    9133:\tc5 7d 6f f8          \tvmovdqa %ymm0,%ymm15\n    9137:\t62 f1 7c 48 28 d1    \tvmovaps %zmm1,%zmm2\n    913d:\t62 f1 7c 48 28 e1    \tvmovaps %zmm1,%zmm4\n    9143:\t62 f1 7c 48 28 e9    \tvmovaps %zmm1,%zmm5\n    9149:\t62 f1 7c 48 28 f9    \tvmovaps %zmm1,%zmm7\n    914f:\t62 71 7c 48 28 c1    \tvmovaps %zmm1,%zmm8\n    9155:\t62 71 7c 48 28 d1    \tvmovaps %zmm1,%zmm10\n    915b:\t62 71 7c 48 28 d9    \tvmovaps %zmm1,%zmm11\n    9161:\t62 71 7c 48 28 e9    \tvmovaps %zmm1,%zmm13\n    9167:\t62 71 7c 48 28 f1    \tvmovaps %zmm1,%zmm14\n\n000000000000916d 
<mixfma512add256_loop>:\n    916d:\tc5 85 d4 c0          \tvpaddq %ymm0,%ymm15,%ymm0\n    9171:\t62 f2 75 48 98 c9    \tvfmadd132ps %zmm1,%zmm1,%zmm1\n    9177:\t62 f2 6d 48 98 d2    \tvfmadd132ps %zmm2,%zmm2,%zmm2\n    917d:\tc5 85 d4 db          \tvpaddq %ymm3,%ymm15,%ymm3\n    9181:\t62 f2 5d 48 98 e4    \tvfmadd132ps %zmm4,%zmm4,%zmm4\n    9187:\t62 f2 55 48 98 ed    \tvfmadd132ps %zmm5,%zmm5,%zmm5\n    918d:\tc5 85 d4 f6          \tvpaddq %ymm6,%ymm15,%ymm6\n    9191:\t62 f2 45 48 98 ff    \tvfmadd132ps %zmm7,%zmm7,%zmm7\n    9197:\t62 52 3d 48 98 c0    \tvfmadd132ps %zmm8,%zmm8,%zmm8\n    919d:\tc4 41 05 d4 c9       \tvpaddq %ymm9,%ymm15,%ymm9\n    91a2:\t62 52 2d 48 98 d2    \tvfmadd132ps %zmm10,%zmm10,%zmm10\n    91a8:\t62 52 25 48 98 db    \tvfmadd132ps %zmm11,%zmm11,%zmm11\n    91ae:\tc4 41 05 d4 e4       \tvpaddq %ymm12,%ymm15,%ymm12\n    91b3:\t62 52 15 48 98 ed    \tvfmadd132ps %zmm13,%zmm13,%zmm13\n    91b9:\t62 52 0d 48 98 f6    \tvfmadd132ps %zmm14,%zmm14,%zmm14\n    91bf:\t4c 29 cf             \tsub    %r9,%rdi\n    91c2:\t7f a9                \tjg     916d <mixfma512add256_loop>\n    91c4:\t66 48 0f 7e c8       \tmovq   %xmm1,%rax\n    91c9:\tc5 f8 77             \tvzeroupper \n    91cc:\t41 58                \tpop    %r8\n    91ce:\t41 59                \tpop    %r9\n    91d0:\tc3                   \tretq   \n\n00000000000091d1 <mixfmaadd256>:\n    91d1:\t41 51                \tpush   %r9\n    91d3:\t41 50                \tpush   %r8\n    91d5:\t49 c7 c1 10 00 00 00 \tmov    $0x10,%r9\n    91dc:\t66 49 0f 6e c1       \tmovq   %r9,%xmm0\n    91e1:\tc4 e2 7d 59 c0       \tvpbroadcastq %xmm0,%ymm0\n    91e6:\tf3 49 0f 2a c9       \tcvtsi2ss %r9,%xmm1\n    91eb:\tc4 e2 7d 18 c9       \tvbroadcastss %xmm1,%ymm1\n    91f0:\tc5 fd 6f d8          \tvmovdqa %ymm0,%ymm3\n    91f4:\tc5 fd 6f f0          \tvmovdqa %ymm0,%ymm6\n    91f8:\tc5 7d 6f c8          \tvmovdqa %ymm0,%ymm9\n    91fc:\tc5 7d 6f e0          \tvmovdqa %ymm0,%ymm12\n    9200:\tc5 7d 6f f8 
         \tvmovdqa %ymm0,%ymm15\n    9204:\tc5 fc 28 d1          \tvmovaps %ymm1,%ymm2\n    9208:\tc5 fc 28 e1          \tvmovaps %ymm1,%ymm4\n    920c:\tc5 fc 28 e9          \tvmovaps %ymm1,%ymm5\n    9210:\tc5 fc 28 f9          \tvmovaps %ymm1,%ymm7\n    9214:\tc5 7c 28 c1          \tvmovaps %ymm1,%ymm8\n    9218:\tc5 7c 28 d1          \tvmovaps %ymm1,%ymm10\n    921c:\tc5 7c 28 d9          \tvmovaps %ymm1,%ymm11\n    9220:\tc5 7c 28 e9          \tvmovaps %ymm1,%ymm13\n    9224:\tc5 7c 28 f1          \tvmovaps %ymm1,%ymm14\n\n0000000000009228 <mixfmaadd256_loop>:\n    9228:\tc5 85 d4 c0          \tvpaddq %ymm0,%ymm15,%ymm0\n    922c:\tc4 e2 75 98 c9       \tvfmadd132ps %ymm1,%ymm1,%ymm1\n    9231:\tc4 e2 6d 98 d2       \tvfmadd132ps %ymm2,%ymm2,%ymm2\n    9236:\tc5 85 d4 db          \tvpaddq %ymm3,%ymm15,%ymm3\n    923a:\tc4 e2 5d 98 e4       \tvfmadd132ps %ymm4,%ymm4,%ymm4\n    923f:\tc4 e2 55 98 ed       \tvfmadd132ps %ymm5,%ymm5,%ymm5\n    9244:\tc5 85 d4 f6          \tvpaddq %ymm6,%ymm15,%ymm6\n    9248:\tc4 e2 45 98 ff       \tvfmadd132ps %ymm7,%ymm7,%ymm7\n    924d:\tc4 42 3d 98 c0       \tvfmadd132ps %ymm8,%ymm8,%ymm8\n    9252:\tc4 41 05 d4 c9       \tvpaddq %ymm9,%ymm15,%ymm9\n    9257:\tc4 42 2d 98 d2       \tvfmadd132ps %ymm10,%ymm10,%ymm10\n    925c:\tc4 42 25 98 db       \tvfmadd132ps %ymm11,%ymm11,%ymm11\n    9261:\tc4 41 05 d4 e4       \tvpaddq %ymm12,%ymm15,%ymm12\n    9266:\tc4 42 15 98 ed       \tvfmadd132ps %ymm13,%ymm13,%ymm13\n    926b:\tc4 42 0d 98 f6       \tvfmadd132ps %ymm14,%ymm14,%ymm14\n    9270:\t4c 29 cf             \tsub    %r9,%rdi\n    9273:\t7f b3                \tjg     9228 <mixfmaadd256_loop>\n    9275:\t66 48 0f 7e c8       \tmovq   %xmm1,%rax\n    927a:\tc5 f8 77             \tvzeroupper \n    927d:\t41 58                \tpop    %r8\n    927f:\t41 59                \tpop    %r9\n    9281:\tc3                   \tretq   \n\n0000000000009282 <mixfmaand256>:\n    9282:\t41 51                \tpush   %r9\n    9284:\t41 50       
         \tpush   %r8\n    9286:\t49 c7 c1 0f 00 00 00 \tmov    $0xf,%r9\n    928d:\t66 49 0f 6e c1       \tmovq   %r9,%xmm0\n    9292:\tc4 e2 7d 59 c0       \tvpbroadcastq %xmm0,%ymm0\n    9297:\tf3 49 0f 2a c9       \tcvtsi2ss %r9,%xmm1\n    929c:\tc4 e2 7d 18 c9       \tvbroadcastss %xmm1,%ymm1\n    92a1:\tc5 fd 6f d8          \tvmovdqa %ymm0,%ymm3\n    92a5:\tc5 fd 6f f0          \tvmovdqa %ymm0,%ymm6\n    92a9:\tc5 7d 6f c8          \tvmovdqa %ymm0,%ymm9\n    92ad:\tc5 7d 6f e0          \tvmovdqa %ymm0,%ymm12\n    92b1:\tc5 7d 6f f8          \tvmovdqa %ymm0,%ymm15\n    92b5:\tc5 fc 28 d1          \tvmovaps %ymm1,%ymm2\n    92b9:\tc5 fc 28 e1          \tvmovaps %ymm1,%ymm4\n    92bd:\tc5 fc 28 e9          \tvmovaps %ymm1,%ymm5\n    92c1:\tc5 fc 28 f9          \tvmovaps %ymm1,%ymm7\n    92c5:\tc5 7c 28 c1          \tvmovaps %ymm1,%ymm8\n    92c9:\tc5 7c 28 d1          \tvmovaps %ymm1,%ymm10\n    92cd:\tc5 7c 28 d9          \tvmovaps %ymm1,%ymm11\n    92d1:\tc5 7c 28 e9          \tvmovaps %ymm1,%ymm13\n    92d5:\tc5 7c 28 f1          \tvmovaps %ymm1,%ymm14\n\n00000000000092d9 <mixfmaand256_loop>:\n    92d9:\tc5 85 db c0          \tvpand  %ymm0,%ymm15,%ymm0\n    92dd:\tc4 e2 75 98 c9       \tvfmadd132ps %ymm1,%ymm1,%ymm1\n    92e2:\tc4 e2 6d 98 d2       \tvfmadd132ps %ymm2,%ymm2,%ymm2\n    92e7:\tc5 85 db db          \tvpand  %ymm3,%ymm15,%ymm3\n    92eb:\tc4 e2 5d 98 e4       \tvfmadd132ps %ymm4,%ymm4,%ymm4\n    92f0:\tc4 e2 55 98 ed       \tvfmadd132ps %ymm5,%ymm5,%ymm5\n    92f5:\tc5 85 db f6          \tvpand  %ymm6,%ymm15,%ymm6\n    92f9:\tc4 e2 45 98 ff       \tvfmadd132ps %ymm7,%ymm7,%ymm7\n    92fe:\tc4 42 3d 98 c0       \tvfmadd132ps %ymm8,%ymm8,%ymm8\n    9303:\tc4 41 05 db c9       \tvpand  %ymm9,%ymm15,%ymm9\n    9308:\tc4 42 2d 98 d2       \tvfmadd132ps %ymm10,%ymm10,%ymm10\n    930d:\tc4 42 25 98 db       \tvfmadd132ps %ymm11,%ymm11,%ymm11\n    9312:\tc4 41 05 db e4       \tvpand  %ymm12,%ymm15,%ymm12\n    9317:\tc4 42 15 98 ed       \tvfmadd132ps 
%ymm13,%ymm13,%ymm13\n    931c:\tc4 42 0d 98 f6       \tvfmadd132ps %ymm14,%ymm14,%ymm14\n    9321:\t4c 29 cf             \tsub    %r9,%rdi\n    9324:\t7f b3                \tjg     92d9 <mixfmaand256_loop>\n    9326:\t66 48 0f 7e c8       \tmovq   %xmm1,%rax\n    932b:\tc5 f8 77             \tvzeroupper \n    932e:\t41 58                \tpop    %r8\n    9330:\t41 59                \tpop    %r9\n    9332:\tc3                   \tretq   \n\n0000000000009333 <mixfmaandmem256>:\n    9333:\t41 51                \tpush   %r9\n    9335:\t41 50                \tpush   %r8\n    9337:\t49 c7 c1 16 00 00 00 \tmov    $0x16,%r9\n    933e:\t66 49 0f 6e c1       \tmovq   %r9,%xmm0\n    9343:\tc4 e2 7d 59 c0       \tvpbroadcastq %xmm0,%ymm0\n    9348:\tf3 49 0f 2a c9       \tcvtsi2ss %r9,%xmm1\n    934d:\tc4 e2 7d 18 c9       \tvbroadcastss %xmm1,%ymm1\n    9352:\tc5 fd 6f d8          \tvmovdqa %ymm0,%ymm3\n    9356:\tc5 fc 28 f1          \tvmovaps %ymm1,%ymm6\n    935a:\tc5 7c 28 c9          \tvmovaps %ymm1,%ymm9\n    935e:\tc5 7c 28 e1          \tvmovaps %ymm1,%ymm12\n    9362:\tc5 7c 28 f9          \tvmovaps %ymm1,%ymm15\n    9366:\tc5 fc 28 d1          \tvmovaps %ymm1,%ymm2\n    936a:\tc5 fc 28 e1          \tvmovaps %ymm1,%ymm4\n    936e:\tc5 fc 28 e9          \tvmovaps %ymm1,%ymm5\n    9372:\tc5 fc 28 f9          \tvmovaps %ymm1,%ymm7\n    9376:\tc5 7c 28 c1          \tvmovaps %ymm1,%ymm8\n    937a:\tc5 7c 28 d1          \tvmovaps %ymm1,%ymm10\n    937e:\tc5 7c 28 d9          \tvmovaps %ymm1,%ymm11\n    9382:\tc5 7c 28 e9          \tvmovaps %ymm1,%ymm13\n    9386:\tc5 7c 28 f1          \tvmovaps %ymm1,%ymm14\n\n000000000000938a <mixfmaandmem256_loop>:\n    938a:\tc5 fd db c0          \tvpand  %ymm0,%ymm0,%ymm0\n    938e:\tc4 e2 75 98 c9       \tvfmadd132ps %ymm1,%ymm1,%ymm1\n    9393:\tc4 e2 6d 98 16       \tvfmadd132ps (%rsi),%ymm2,%ymm2\n    9398:\tc5 e5 db db          \tvpand  %ymm3,%ymm3,%ymm3\n    939c:\tc4 e2 5d 98 e4       \tvfmadd132ps %ymm4,%ymm4,%ymm4\n    
93a1:\tc4 e2 55 98 2e       \tvfmadd132ps (%rsi),%ymm5,%ymm5\n    93a6:\tc5 fd db c0          \tvpand  %ymm0,%ymm0,%ymm0\n    93aa:\tc4 e2 45 98 ff       \tvfmadd132ps %ymm7,%ymm7,%ymm7\n    93af:\tc4 62 3d 98 06       \tvfmadd132ps (%rsi),%ymm8,%ymm8\n    93b4:\tc5 e5 db db          \tvpand  %ymm3,%ymm3,%ymm3\n    93b8:\tc4 42 2d 98 d2       \tvfmadd132ps %ymm10,%ymm10,%ymm10\n    93bd:\tc4 62 25 98 1e       \tvfmadd132ps (%rsi),%ymm11,%ymm11\n    93c2:\tc5 fd db c0          \tvpand  %ymm0,%ymm0,%ymm0\n    93c6:\tc4 42 15 98 ed       \tvfmadd132ps %ymm13,%ymm13,%ymm13\n    93cb:\tc4 62 0d 98 36       \tvfmadd132ps (%rsi),%ymm14,%ymm14\n    93d0:\tc5 e5 db db          \tvpand  %ymm3,%ymm3,%ymm3\n    93d4:\tc4 e2 4d 98 f6       \tvfmadd132ps %ymm6,%ymm6,%ymm6\n    93d9:\tc4 62 35 98 0e       \tvfmadd132ps (%rsi),%ymm9,%ymm9\n    93de:\tc5 fd db c0          \tvpand  %ymm0,%ymm0,%ymm0\n    93e2:\tc4 42 1d 98 e4       \tvfmadd132ps %ymm12,%ymm12,%ymm12\n    93e7:\tc4 62 05 98 3e       \tvfmadd132ps (%rsi),%ymm15,%ymm15\n    93ec:\t4c 29 cf             \tsub    %r9,%rdi\n    93ef:\t7f 99                \tjg     938a <mixfmaandmem256_loop>\n    93f1:\t66 48 0f 7e c8       \tmovq   %xmm1,%rax\n    93f6:\tc5 f8 77             \tvzeroupper \n    93f9:\t41 58                \tpop    %r8\n    93fb:\t41 59                \tpop    %r9\n    93fd:\tc3                   \tretq   \n\n00000000000093fe <mixfmaaddmem256>:\n    93fe:\t41 51                \tpush   %r9\n    9400:\t41 50                \tpush   %r8\n    9402:\t49 c7 c1 16 00 00 00 \tmov    $0x16,%r9\n    9409:\t66 49 0f 6e c1       \tmovq   %r9,%xmm0\n    940e:\tc4 e2 7d 59 c0       \tvpbroadcastq %xmm0,%ymm0\n    9413:\tf3 49 0f 2a c9       \tcvtsi2ss %r9,%xmm1\n    9418:\tc4 e2 7d 18 c9       \tvbroadcastss %xmm1,%ymm1\n    941d:\tc5 fd 6f d8          \tvmovdqa %ymm0,%ymm3\n    9421:\tc5 fc 28 f1          \tvmovaps %ymm1,%ymm6\n    9425:\tc5 7c 28 c9          \tvmovaps %ymm1,%ymm9\n    9429:\tc5 7c 28 e1          
\tvmovaps %ymm1,%ymm12\n    942d:\tc5 7c 28 f9          \tvmovaps %ymm1,%ymm15\n    9431:\tc5 fc 28 d1          \tvmovaps %ymm1,%ymm2\n    9435:\tc5 fc 28 e1          \tvmovaps %ymm1,%ymm4\n    9439:\tc5 fc 28 e9          \tvmovaps %ymm1,%ymm5\n    943d:\tc5 fc 28 f9          \tvmovaps %ymm1,%ymm7\n    9441:\tc5 7c 28 c1          \tvmovaps %ymm1,%ymm8\n    9445:\tc5 7c 28 d1          \tvmovaps %ymm1,%ymm10\n    9449:\tc5 7c 28 d9          \tvmovaps %ymm1,%ymm11\n    944d:\tc5 7c 28 e9          \tvmovaps %ymm1,%ymm13\n    9451:\tc5 7c 28 f1          \tvmovaps %ymm1,%ymm14\n\n0000000000009455 <mixfmaaddmem256_loop>:\n    9455:\tc5 fd d4 c0          \tvpaddq %ymm0,%ymm0,%ymm0\n    9459:\tc4 e2 75 98 c9       \tvfmadd132ps %ymm1,%ymm1,%ymm1\n    945e:\tc4 e2 6d 98 16       \tvfmadd132ps (%rsi),%ymm2,%ymm2\n    9463:\tc5 e5 d4 db          \tvpaddq %ymm3,%ymm3,%ymm3\n    9467:\tc4 e2 5d 98 e4       \tvfmadd132ps %ymm4,%ymm4,%ymm4\n    946c:\tc4 e2 55 98 2e       \tvfmadd132ps (%rsi),%ymm5,%ymm5\n    9471:\tc5 fd d4 c0          \tvpaddq %ymm0,%ymm0,%ymm0\n    9475:\tc4 e2 45 98 ff       \tvfmadd132ps %ymm7,%ymm7,%ymm7\n    947a:\tc4 62 3d 98 06       \tvfmadd132ps (%rsi),%ymm8,%ymm8\n    947f:\tc5 e5 d4 db          \tvpaddq %ymm3,%ymm3,%ymm3\n    9483:\tc4 42 2d 98 d2       \tvfmadd132ps %ymm10,%ymm10,%ymm10\n    9488:\tc4 62 25 98 1e       \tvfmadd132ps (%rsi),%ymm11,%ymm11\n    948d:\tc5 fd d4 c0          \tvpaddq %ymm0,%ymm0,%ymm0\n    9491:\tc4 42 15 98 ed       \tvfmadd132ps %ymm13,%ymm13,%ymm13\n    9496:\tc4 62 0d 98 36       \tvfmadd132ps (%rsi),%ymm14,%ymm14\n    949b:\tc5 e5 d4 db          \tvpaddq %ymm3,%ymm3,%ymm3\n    949f:\tc4 e2 4d 98 f6       \tvfmadd132ps %ymm6,%ymm6,%ymm6\n    94a4:\tc4 62 35 98 0e       \tvfmadd132ps (%rsi),%ymm9,%ymm9\n    94a9:\tc5 fd d4 c0          \tvpaddq %ymm0,%ymm0,%ymm0\n    94ad:\tc4 42 1d 98 e4       \tvfmadd132ps %ymm12,%ymm12,%ymm12\n    94b2:\tc4 62 05 98 3e       \tvfmadd132ps (%rsi),%ymm15,%ymm15\n    94b7:\t4c 29 cf      
       \tsub    %r9,%rdi\n    94ba:\t7f 99                \tjg     9455 <mixfmaaddmem256_loop>\n    94bc:\t66 48 0f 7e c8       \tmovq   %xmm1,%rax\n    94c1:\tc5 f8 77             \tvzeroupper \n    94c4:\t41 58                \tpop    %r8\n    94c6:\t41 59                \tpop    %r9\n    94c8:\tc3                   \tretq   \n\n00000000000094c9 <nemesfpu512mix21>:\n    94c9:\t41 51                \tpush   %r9\n    94cb:\t49 c7 c1 10 00 00 00 \tmov    $0x10,%r9\n    94d2:\tf3 49 0f 2a c1       \tcvtsi2ss %r9,%xmm0\n    94d7:\t62 f2 7d 48 18 c8    \tvbroadcastss %xmm0,%zmm1\n    94dd:\t62 f1 fd 48 6f d1    \tvmovdqa64 %zmm1,%zmm2\n    94e3:\t62 f1 fd 48 6f d9    \tvmovdqa64 %zmm1,%zmm3\n    94e9:\t62 f1 fd 48 6f e1    \tvmovdqa64 %zmm1,%zmm4\n    94ef:\t62 f1 fd 48 6f e9    \tvmovdqa64 %zmm1,%zmm5\n    94f5:\t62 f1 fd 48 6f f1    \tvmovdqa64 %zmm1,%zmm6\n    94fb:\t62 f1 fd 48 6f f9    \tvmovdqa64 %zmm1,%zmm7\n    9501:\t62 71 fd 48 6f c1    \tvmovdqa64 %zmm1,%zmm8\n    9507:\t62 71 fd 48 6f c9    \tvmovdqa64 %zmm1,%zmm9\n    950d:\t62 71 fd 48 6f d1    \tvmovdqa64 %zmm1,%zmm10\n    9513:\t62 71 fd 48 6f d9    \tvmovdqa64 %zmm1,%zmm11\n    9519:\t62 71 fd 48 6f e1    \tvmovdqa64 %zmm1,%zmm12\n    951f:\t62 71 fd 48 6f e9    \tvmovdqa64 %zmm1,%zmm13\n    9525:\t62 71 fd 48 6f f1    \tvmovdqa64 %zmm1,%zmm14\n    952b:\t62 71 fd 48 6f f9    \tvmovdqa64 %zmm1,%zmm15\n\n0000000000009531 <nemesfpu512mix21_loop>:\n    9531:\t62 f1 7c 48 58 c0    \tvaddps %zmm0,%zmm0,%zmm0\n    9537:\t62 f2 75 48 98 c9    \tvfmadd132ps %zmm1,%zmm1,%zmm1\n    953d:\t62 f2 6d 48 98 d2    \tvfmadd132ps %zmm2,%zmm2,%zmm2\n    9543:\t62 f1 64 48 58 db    \tvaddps %zmm3,%zmm3,%zmm3\n    9549:\t62 f2 5d 48 98 e4    \tvfmadd132ps %zmm4,%zmm4,%zmm4\n    954f:\t62 f2 55 48 98 ed    \tvfmadd132ps %zmm5,%zmm5,%zmm5\n    9555:\t62 f1 4c 48 58 f6    \tvaddps %zmm6,%zmm6,%zmm6\n    955b:\t62 f2 45 48 98 ff    \tvfmadd132ps %zmm7,%zmm7,%zmm7\n    9561:\t62 52 3d 48 98 c0    \tvfmadd132ps 
%zmm8,%zmm8,%zmm8\n    9567:\t62 51 34 48 58 c9    \tvaddps %zmm9,%zmm9,%zmm9\n    956d:\t62 52 2d 48 98 d2    \tvfmadd132ps %zmm10,%zmm10,%zmm10\n    9573:\t62 52 25 48 98 db    \tvfmadd132ps %zmm11,%zmm11,%zmm11\n    9579:\tc4 41 1c 58 e4       \tvaddps %ymm12,%ymm12,%ymm12\n    957e:\t62 52 15 48 98 ed    \tvfmadd132ps %zmm13,%zmm13,%zmm13\n    9584:\t62 52 0d 48 98 f6    \tvfmadd132ps %zmm14,%zmm14,%zmm14\n    958a:\t62 51 04 48 58 ff    \tvaddps %zmm15,%zmm15,%zmm15\n    9590:\t4c 29 cf             \tsub    %r9,%rdi\n    9593:\t7f 9c                \tjg     9531 <nemesfpu512mix21_loop>\n    9595:\t41 59                \tpop    %r9\n    9597:\tc3                   \tretq   \n\n0000000000009598 <nemesfpumix21>:\n    9598:\t41 51                \tpush   %r9\n    959a:\t49 c7 c1 10 00 00 00 \tmov    $0x10,%r9\n    95a1:\tf3 49 0f 2a c1       \tcvtsi2ss %r9,%xmm0\n    95a6:\tc4 e2 7d 18 c8       \tvbroadcastss %xmm0,%ymm1\n    95ab:\tc5 fd 6f d1          \tvmovdqa %ymm1,%ymm2\n    95af:\tc5 fd 6f d9          \tvmovdqa %ymm1,%ymm3\n    95b3:\tc5 fd 6f e1          \tvmovdqa %ymm1,%ymm4\n    95b7:\tc5 fd 6f e9          \tvmovdqa %ymm1,%ymm5\n    95bb:\tc5 fd 6f f1          \tvmovdqa %ymm1,%ymm6\n    95bf:\tc5 fd 6f f9          \tvmovdqa %ymm1,%ymm7\n    95c3:\tc5 7d 6f c1          \tvmovdqa %ymm1,%ymm8\n    95c7:\tc5 7d 6f c9          \tvmovdqa %ymm1,%ymm9\n    95cb:\tc5 7d 6f d1          \tvmovdqa %ymm1,%ymm10\n    95cf:\tc5 7d 6f d9          \tvmovdqa %ymm1,%ymm11\n    95d3:\tc5 7d 6f e1          \tvmovdqa %ymm1,%ymm12\n    95d7:\tc5 7d 6f e9          \tvmovdqa %ymm1,%ymm13\n    95db:\tc5 7d 6f f1          \tvmovdqa %ymm1,%ymm14\n    95df:\tc5 7d 6f f9          \tvmovdqa %ymm1,%ymm15\n\n00000000000095e3 <nemesfpumix21_loop>:\n    95e3:\tc5 fc 58 c0          \tvaddps %ymm0,%ymm0,%ymm0\n    95e7:\tc4 e2 75 98 c9       \tvfmadd132ps %ymm1,%ymm1,%ymm1\n    95ec:\tc4 e2 6d 98 d2       \tvfmadd132ps %ymm2,%ymm2,%ymm2\n    95f1:\tc5 e4 58 db          \tvaddps 
%ymm3,%ymm3,%ymm3\n    95f5:\tc4 e2 5d 98 e4       \tvfmadd132ps %ymm4,%ymm4,%ymm4\n    95fa:\tc4 e2 55 98 ed       \tvfmadd132ps %ymm5,%ymm5,%ymm5\n    95ff:\tc5 cc 58 f6          \tvaddps %ymm6,%ymm6,%ymm6\n    9603:\tc4 e2 45 98 ff       \tvfmadd132ps %ymm7,%ymm7,%ymm7\n    9608:\tc4 42 3d 98 c0       \tvfmadd132ps %ymm8,%ymm8,%ymm8\n    960d:\tc4 41 34 58 c9       \tvaddps %ymm9,%ymm9,%ymm9\n    9612:\tc4 42 2d 98 d2       \tvfmadd132ps %ymm10,%ymm10,%ymm10\n    9617:\tc4 42 25 98 db       \tvfmadd132ps %ymm11,%ymm11,%ymm11\n    961c:\tc4 41 1c 58 e4       \tvaddps %ymm12,%ymm12,%ymm12\n    9621:\tc4 42 15 98 ed       \tvfmadd132ps %ymm13,%ymm13,%ymm13\n    9626:\tc4 42 0d 98 f6       \tvfmadd132ps %ymm14,%ymm14,%ymm14\n    962b:\tc4 41 04 58 ff       \tvaddps %ymm15,%ymm15,%ymm15\n    9630:\t4c 29 cf             \tsub    %r9,%rdi\n    9633:\t7f ae                \tjg     95e3 <nemesfpumix21_loop>\n    9635:\t41 59                \tpop    %r9\n    9637:\tc3                   \tretq   \n\n0000000000009638 <latfma512>:\n    9638:\t41 51                \tpush   %r9\n    963a:\t41 50                \tpush   %r8\n    963c:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n    9643:\t66 49 0f 6e c9       \tmovq   %r9,%xmm1\n    9648:\tf3 49 0f 2a f1       \tcvtsi2ss %r9,%xmm6\n    964d:\t62 f2 7d 48 18 f6    \tvbroadcastss %xmm6,%zmm6\n    9653:\t62 f1 7c 48 10 ee    \tvmovups %zmm6,%zmm5\n    9659:\t62 f1 7c 48 10 fe    \tvmovups %zmm6,%zmm7\n    965f:\t62 71 7c 48 10 c6    \tvmovups %zmm6,%zmm8\n    9665:\t62 71 7c 48 10 ce    \tvmovups %zmm6,%zmm9\n    966b:\t62 71 7c 48 10 d6    \tvmovups %zmm6,%zmm10\n    9671:\t62 71 7c 48 10 de    \tvmovups %zmm6,%zmm11\n    9677:\t62 71 7c 48 10 e6    \tvmovups %zmm6,%zmm12\n    967d:\t62 71 7c 48 10 ee    \tvmovups %zmm6,%zmm13\n    9683:\t62 71 7c 48 10 f6    \tvmovups %zmm6,%zmm14\n    9689:\t62 71 7c 48 10 fe    \tvmovups %zmm6,%zmm15\n\n000000000000968f <latfma512_loop>:\n    968f:\t62 f2 55 48 98 fe    \tvfmadd132ps 
%zmm6,%zmm5,%zmm7\n    9695:\t62 f2 55 48 98 fe    \tvfmadd132ps %zmm6,%zmm5,%zmm7\n    969b:\t62 f2 55 48 98 fe    \tvfmadd132ps %zmm6,%zmm5,%zmm7\n    96a1:\t62 f2 55 48 98 fe    \tvfmadd132ps %zmm6,%zmm5,%zmm7\n    96a7:\t62 f2 55 48 98 fe    \tvfmadd132ps %zmm6,%zmm5,%zmm7\n    96ad:\t62 f2 55 48 98 fe    \tvfmadd132ps %zmm6,%zmm5,%zmm7\n    96b3:\t62 f2 55 48 98 fe    \tvfmadd132ps %zmm6,%zmm5,%zmm7\n    96b9:\t62 f2 55 48 98 fe    \tvfmadd132ps %zmm6,%zmm5,%zmm7\n    96bf:\t62 f2 55 48 98 fe    \tvfmadd132ps %zmm6,%zmm5,%zmm7\n    96c5:\t62 f2 55 48 98 fe    \tvfmadd132ps %zmm6,%zmm5,%zmm7\n    96cb:\t62 f2 55 48 98 fe    \tvfmadd132ps %zmm6,%zmm5,%zmm7\n    96d1:\t62 f2 55 48 98 fe    \tvfmadd132ps %zmm6,%zmm5,%zmm7\n    96d7:\t62 f2 55 48 98 fe    \tvfmadd132ps %zmm6,%zmm5,%zmm7\n    96dd:\t62 f2 55 48 98 fe    \tvfmadd132ps %zmm6,%zmm5,%zmm7\n    96e3:\t62 f2 55 48 98 fe    \tvfmadd132ps %zmm6,%zmm5,%zmm7\n    96e9:\t62 f2 55 48 98 fe    \tvfmadd132ps %zmm6,%zmm5,%zmm7\n    96ef:\t62 f2 55 48 98 fe    \tvfmadd132ps %zmm6,%zmm5,%zmm7\n    96f5:\t62 f2 55 48 98 fe    \tvfmadd132ps %zmm6,%zmm5,%zmm7\n    96fb:\t62 f2 55 48 98 fe    \tvfmadd132ps %zmm6,%zmm5,%zmm7\n    9701:\t62 f2 55 48 98 fe    \tvfmadd132ps %zmm6,%zmm5,%zmm7\n    9707:\t4c 29 cf             \tsub    %r9,%rdi\n    970a:\t75 83                \tjne    968f <latfma512_loop>\n    970c:\t66 48 0f 7e c8       \tmovq   %xmm1,%rax\n    9711:\tc5 f8 77             \tvzeroupper \n    9714:\t41 58                \tpop    %r8\n    9716:\t41 59                \tpop    %r9\n    9718:\tc3                   \tretq   \n\n0000000000009719 <latfma256>:\n    9719:\t41 51                \tpush   %r9\n    971b:\t41 50                \tpush   %r8\n    971d:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n    9724:\t66 49 0f 6e c9       \tmovq   %r9,%xmm1\n    9729:\tf3 49 0f 2a f1       \tcvtsi2ss %r9,%xmm6\n    972e:\tc4 e2 7d 18 f6       \tvbroadcastss %xmm6,%ymm6\n    9733:\tc5 fc 10 ee          \tvmovups 
%ymm6,%ymm5\n    9737:\tc5 fc 10 fe          \tvmovups %ymm6,%ymm7\n    973b:\tc5 7c 10 c6          \tvmovups %ymm6,%ymm8\n    973f:\tc5 7c 10 ce          \tvmovups %ymm6,%ymm9\n    9743:\tc5 7c 10 d6          \tvmovups %ymm6,%ymm10\n    9747:\tc5 7c 10 de          \tvmovups %ymm6,%ymm11\n    974b:\tc5 7c 10 e6          \tvmovups %ymm6,%ymm12\n    974f:\tc5 7c 10 ee          \tvmovups %ymm6,%ymm13\n    9753:\tc5 7c 10 f6          \tvmovups %ymm6,%ymm14\n    9757:\tc5 7c 10 fe          \tvmovups %ymm6,%ymm15\n\n000000000000975b <latfma256_loop>:\n    975b:\tc4 e2 55 98 fe       \tvfmadd132ps %ymm6,%ymm5,%ymm7\n    9760:\tc4 e2 55 98 fe       \tvfmadd132ps %ymm6,%ymm5,%ymm7\n    9765:\tc4 e2 55 98 fe       \tvfmadd132ps %ymm6,%ymm5,%ymm7\n    976a:\tc4 e2 55 98 fe       \tvfmadd132ps %ymm6,%ymm5,%ymm7\n    976f:\tc4 e2 55 98 fe       \tvfmadd132ps %ymm6,%ymm5,%ymm7\n    9774:\tc4 e2 55 98 fe       \tvfmadd132ps %ymm6,%ymm5,%ymm7\n    9779:\tc4 e2 55 98 fe       \tvfmadd132ps %ymm6,%ymm5,%ymm7\n    977e:\tc4 e2 55 98 fe       \tvfmadd132ps %ymm6,%ymm5,%ymm7\n    9783:\tc4 e2 55 98 fe       \tvfmadd132ps %ymm6,%ymm5,%ymm7\n    9788:\tc4 e2 55 98 fe       \tvfmadd132ps %ymm6,%ymm5,%ymm7\n    978d:\tc4 e2 55 98 fe       \tvfmadd132ps %ymm6,%ymm5,%ymm7\n    9792:\tc4 e2 55 98 fe       \tvfmadd132ps %ymm6,%ymm5,%ymm7\n    9797:\tc4 e2 55 98 fe       \tvfmadd132ps %ymm6,%ymm5,%ymm7\n    979c:\tc4 e2 55 98 fe       \tvfmadd132ps %ymm6,%ymm5,%ymm7\n    97a1:\tc4 e2 55 98 fe       \tvfmadd132ps %ymm6,%ymm5,%ymm7\n    97a6:\tc4 e2 55 98 fe       \tvfmadd132ps %ymm6,%ymm5,%ymm7\n    97ab:\tc4 e2 55 98 fe       \tvfmadd132ps %ymm6,%ymm5,%ymm7\n    97b0:\tc4 e2 55 98 fe       \tvfmadd132ps %ymm6,%ymm5,%ymm7\n    97b5:\tc4 e2 55 98 fe       \tvfmadd132ps %ymm6,%ymm5,%ymm7\n    97ba:\tc4 e2 55 98 fe       \tvfmadd132ps %ymm6,%ymm5,%ymm7\n    97bf:\t4c 29 cf             \tsub    %r9,%rdi\n    97c2:\t75 97                \tjne    975b <latfma256_loop>\n    97c4:\t66 48 0f 7e c8       
\tmovq   %xmm1,%rax\n    97c9:\tc5 f8 77             \tvzeroupper \n    97cc:\t41 58                \tpop    %r8\n    97ce:\t41 59                \tpop    %r9\n    97d0:\tc3                   \tretq   \n\n00000000000097d1 <latfma128>:\n    97d1:\t41 51                \tpush   %r9\n    97d3:\t41 50                \tpush   %r8\n    97d5:\tc5 f8 77             \tvzeroupper \n    97d8:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n    97df:\t66 49 0f 6e c9       \tmovq   %r9,%xmm1\n    97e4:\tf3 49 0f 2a f1       \tcvtsi2ss %r9,%xmm6\n    97e9:\tc4 e2 79 18 f6       \tvbroadcastss %xmm6,%xmm6\n    97ee:\tc5 f8 10 ee          \tvmovups %xmm6,%xmm5\n    97f2:\tc5 f8 10 fe          \tvmovups %xmm6,%xmm7\n    97f6:\tc5 78 10 c6          \tvmovups %xmm6,%xmm8\n    97fa:\tc5 78 10 ce          \tvmovups %xmm6,%xmm9\n    97fe:\tc5 78 10 d6          \tvmovups %xmm6,%xmm10\n    9802:\tc5 78 10 de          \tvmovups %xmm6,%xmm11\n    9806:\tc5 78 10 e6          \tvmovups %xmm6,%xmm12\n    980a:\tc5 78 10 ee          \tvmovups %xmm6,%xmm13\n    980e:\tc5 78 10 f6          \tvmovups %xmm6,%xmm14\n    9812:\tc5 78 10 fe          \tvmovups %xmm6,%xmm15\n\n0000000000009816 <latfma128_loop>:\n    9816:\tc4 e2 51 98 fe       \tvfmadd132ps %xmm6,%xmm5,%xmm7\n    981b:\tc4 e2 51 98 fe       \tvfmadd132ps %xmm6,%xmm5,%xmm7\n    9820:\tc4 e2 51 98 fe       \tvfmadd132ps %xmm6,%xmm5,%xmm7\n    9825:\tc4 e2 51 98 fe       \tvfmadd132ps %xmm6,%xmm5,%xmm7\n    982a:\tc4 e2 51 98 fe       \tvfmadd132ps %xmm6,%xmm5,%xmm7\n    982f:\tc4 e2 51 98 fe       \tvfmadd132ps %xmm6,%xmm5,%xmm7\n    9834:\tc4 e2 51 98 fe       \tvfmadd132ps %xmm6,%xmm5,%xmm7\n    9839:\tc4 e2 51 98 fe       \tvfmadd132ps %xmm6,%xmm5,%xmm7\n    983e:\tc4 e2 51 98 fe       \tvfmadd132ps %xmm6,%xmm5,%xmm7\n    9843:\tc4 e2 51 98 fe       \tvfmadd132ps %xmm6,%xmm5,%xmm7\n    9848:\tc4 e2 51 98 fe       \tvfmadd132ps %xmm6,%xmm5,%xmm7\n    984d:\tc4 e2 51 98 fe       \tvfmadd132ps %xmm6,%xmm5,%xmm7\n    9852:\tc4 e2 51 98 fe       
\tvfmadd132ps %xmm6,%xmm5,%xmm7\n    9857:\tc4 e2 51 98 fe       \tvfmadd132ps %xmm6,%xmm5,%xmm7\n    985c:\tc4 e2 51 98 fe       \tvfmadd132ps %xmm6,%xmm5,%xmm7\n    9861:\tc4 e2 51 98 fe       \tvfmadd132ps %xmm6,%xmm5,%xmm7\n    9866:\tc4 e2 51 98 fe       \tvfmadd132ps %xmm6,%xmm5,%xmm7\n    986b:\tc4 e2 51 98 fe       \tvfmadd132ps %xmm6,%xmm5,%xmm7\n    9870:\tc4 e2 51 98 fe       \tvfmadd132ps %xmm6,%xmm5,%xmm7\n    9875:\tc4 e2 51 98 fe       \tvfmadd132ps %xmm6,%xmm5,%xmm7\n    987a:\t4c 29 cf             \tsub    %r9,%rdi\n    987d:\t75 97                \tjne    9816 <latfma128_loop>\n    987f:\t66 48 0f 7e c8       \tmovq   %xmm1,%rax\n    9884:\tc5 f8 77             \tvzeroupper \n    9887:\t41 58                \tpop    %r8\n    9889:\t41 59                \tpop    %r9\n    988b:\tc3                   \tretq   \n\n000000000000988c <latadd128fp>:\n    988c:\t41 51                \tpush   %r9\n    988e:\t41 50                \tpush   %r8\n    9890:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n    9897:\t66 49 0f 6e c9       \tmovq   %r9,%xmm1\n    989c:\tf3 49 0f 2a f1       \tcvtsi2ss %r9,%xmm6\n\n00000000000098a1 <latadd128fp_loop>:\n    98a1:\t0f 58 f6             \taddps  %xmm6,%xmm6\n    98a4:\t0f 58 f6             \taddps  %xmm6,%xmm6\n    98a7:\t0f 58 f6             \taddps  %xmm6,%xmm6\n    98aa:\t0f 58 f6             \taddps  %xmm6,%xmm6\n    98ad:\t0f 58 f6             \taddps  %xmm6,%xmm6\n    98b0:\t0f 58 f6             \taddps  %xmm6,%xmm6\n    98b3:\t0f 58 f6             \taddps  %xmm6,%xmm6\n    98b6:\t0f 58 f6             \taddps  %xmm6,%xmm6\n    98b9:\t0f 58 f6             \taddps  %xmm6,%xmm6\n    98bc:\t0f 58 f6             \taddps  %xmm6,%xmm6\n    98bf:\t0f 58 f6             \taddps  %xmm6,%xmm6\n    98c2:\t0f 58 f6             \taddps  %xmm6,%xmm6\n    98c5:\t0f 58 f6             \taddps  %xmm6,%xmm6\n    98c8:\t0f 58 f6             \taddps  %xmm6,%xmm6\n    98cb:\t0f 58 f6             \taddps  %xmm6,%xmm6\n    98ce:\t0f 58 f6        
     \taddps  %xmm6,%xmm6\n    98d1:\t0f 58 f6             \taddps  %xmm6,%xmm6\n    98d4:\t0f 58 f6             \taddps  %xmm6,%xmm6\n    98d7:\t0f 58 f6             \taddps  %xmm6,%xmm6\n    98da:\t0f 58 f6             \taddps  %xmm6,%xmm6\n    98dd:\t4c 29 cf             \tsub    %r9,%rdi\n    98e0:\t75 bf                \tjne    98a1 <latadd128fp_loop>\n    98e2:\t66 48 0f 7e c8       \tmovq   %xmm1,%rax\n    98e7:\t41 58                \tpop    %r8\n    98e9:\t41 59                \tpop    %r9\n    98eb:\tc3                   \tretq   \n\n00000000000098ec <latmul128fp>:\n    98ec:\t41 51                \tpush   %r9\n    98ee:\t41 50                \tpush   %r8\n    98f0:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n    98f7:\t66 49 0f 6e c9       \tmovq   %r9,%xmm1\n    98fc:\tf3 49 0f 2a f1       \tcvtsi2ss %r9,%xmm6\n\n0000000000009901 <latmul128fp_loop>:\n    9901:\t0f 59 f6             \tmulps  %xmm6,%xmm6\n    9904:\t0f 59 f6             \tmulps  %xmm6,%xmm6\n    9907:\t0f 59 f6             \tmulps  %xmm6,%xmm6\n    990a:\t0f 59 f6             \tmulps  %xmm6,%xmm6\n    990d:\t0f 59 f6             \tmulps  %xmm6,%xmm6\n    9910:\t0f 59 f6             \tmulps  %xmm6,%xmm6\n    9913:\t0f 59 f6             \tmulps  %xmm6,%xmm6\n    9916:\t0f 59 f6             \tmulps  %xmm6,%xmm6\n    9919:\t0f 59 f6             \tmulps  %xmm6,%xmm6\n    991c:\t0f 59 f6             \tmulps  %xmm6,%xmm6\n    991f:\t0f 59 f6             \tmulps  %xmm6,%xmm6\n    9922:\t0f 59 f6             \tmulps  %xmm6,%xmm6\n    9925:\t0f 59 f6             \tmulps  %xmm6,%xmm6\n    9928:\t0f 59 f6             \tmulps  %xmm6,%xmm6\n    992b:\t0f 59 f6             \tmulps  %xmm6,%xmm6\n    992e:\t0f 59 f6             \tmulps  %xmm6,%xmm6\n    9931:\t0f 59 f6             \tmulps  %xmm6,%xmm6\n    9934:\t0f 59 f6             \tmulps  %xmm6,%xmm6\n    9937:\t0f 59 f6             \tmulps  %xmm6,%xmm6\n    993a:\t0f 59 f6             \tmulps  %xmm6,%xmm6\n    993d:\t4c 29 cf             \tsub    
%r9,%rdi\n    9940:\t75 bf                \tjne    9901 <latmul128fp_loop>\n    9942:\t66 48 0f 7e c8       \tmovq   %xmm1,%rax\n    9947:\t41 58                \tpop    %r8\n    9949:\t41 59                \tpop    %r9\n    994b:\tc3                   \tretq   \n\n000000000000994c <mul128fp>:\n    994c:\t41 51                \tpush   %r9\n    994e:\t41 50                \tpush   %r8\n    9950:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n    9957:\tf3 49 0f 2a e1       \tcvtsi2ss %r9,%xmm4\n    995c:\tf3 49 0f 2a d9       \tcvtsi2ss %r9,%xmm3\n    9961:\tf3 49 0f 2a d1       \tcvtsi2ss %r9,%xmm2\n    9966:\tf3 49 0f 2a c9       \tcvtsi2ss %r9,%xmm1\n    996b:\tf3 49 0f 2a c1       \tcvtsi2ss %r9,%xmm0\n\n0000000000009970 <mul128fp_loop>:\n    9970:\t0f 59 c0             \tmulps  %xmm0,%xmm0\n    9973:\t0f 59 c9             \tmulps  %xmm1,%xmm1\n    9976:\t0f 59 d2             \tmulps  %xmm2,%xmm2\n    9979:\t0f 59 db             \tmulps  %xmm3,%xmm3\n    997c:\t0f 59 e4             \tmulps  %xmm4,%xmm4\n    997f:\t0f 59 c0             \tmulps  %xmm0,%xmm0\n    9982:\t0f 59 c9             \tmulps  %xmm1,%xmm1\n    9985:\t0f 59 d2             \tmulps  %xmm2,%xmm2\n    9988:\t0f 59 db             \tmulps  %xmm3,%xmm3\n    998b:\t0f 59 e4             \tmulps  %xmm4,%xmm4\n    998e:\t0f 59 c0             \tmulps  %xmm0,%xmm0\n    9991:\t0f 59 c9             \tmulps  %xmm1,%xmm1\n    9994:\t0f 59 d2             \tmulps  %xmm2,%xmm2\n    9997:\t0f 59 db             \tmulps  %xmm3,%xmm3\n    999a:\t0f 59 e4             \tmulps  %xmm4,%xmm4\n    999d:\t0f 59 c0             \tmulps  %xmm0,%xmm0\n    99a0:\t0f 59 c9             \tmulps  %xmm1,%xmm1\n    99a3:\t0f 59 d2             \tmulps  %xmm2,%xmm2\n    99a6:\t0f 59 db             \tmulps  %xmm3,%xmm3\n    99a9:\t0f 59 e4             \tmulps  %xmm4,%xmm4\n    99ac:\t4c 29 cf             \tsub    %r9,%rdi\n    99af:\t75 bf                \tjne    9970 <mul128fp_loop>\n    99b1:\t66 48 0f 7e c8       \tmovq   %xmm1,%rax\n    
99b6:\t41 58                \tpop    %r8\n    99b8:\t41 59                \tpop    %r9\n    99ba:\tc3                   \tretq   \n\n00000000000099bb <add128fp>:\n    99bb:\t41 51                \tpush   %r9\n    99bd:\t41 50                \tpush   %r8\n    99bf:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n    99c6:\tf3 49 0f 2a e1       \tcvtsi2ss %r9,%xmm4\n    99cb:\tf3 49 0f 2a d9       \tcvtsi2ss %r9,%xmm3\n    99d0:\tf3 49 0f 2a d1       \tcvtsi2ss %r9,%xmm2\n    99d5:\tf3 49 0f 2a c9       \tcvtsi2ss %r9,%xmm1\n    99da:\tf3 49 0f 2a c1       \tcvtsi2ss %r9,%xmm0\n\n00000000000099df <add128fp_loop>:\n    99df:\t0f 58 c0             \taddps  %xmm0,%xmm0\n    99e2:\t0f 58 c9             \taddps  %xmm1,%xmm1\n    99e5:\t0f 58 d2             \taddps  %xmm2,%xmm2\n    99e8:\t0f 58 db             \taddps  %xmm3,%xmm3\n    99eb:\t0f 58 e4             \taddps  %xmm4,%xmm4\n    99ee:\t0f 58 c0             \taddps  %xmm0,%xmm0\n    99f1:\t0f 58 c9             \taddps  %xmm1,%xmm1\n    99f4:\t0f 58 d2             \taddps  %xmm2,%xmm2\n    99f7:\t0f 58 db             \taddps  %xmm3,%xmm3\n    99fa:\t0f 58 e4             \taddps  %xmm4,%xmm4\n    99fd:\t0f 58 c0             \taddps  %xmm0,%xmm0\n    9a00:\t0f 58 c9             \taddps  %xmm1,%xmm1\n    9a03:\t0f 58 d2             \taddps  %xmm2,%xmm2\n    9a06:\t0f 58 db             \taddps  %xmm3,%xmm3\n    9a09:\t0f 58 e4             \taddps  %xmm4,%xmm4\n    9a0c:\t0f 58 c0             \taddps  %xmm0,%xmm0\n    9a0f:\t0f 58 c9             \taddps  %xmm1,%xmm1\n    9a12:\t0f 58 d2             \taddps  %xmm2,%xmm2\n    9a15:\t0f 58 db             \taddps  %xmm3,%xmm3\n    9a18:\t0f 58 e4             \taddps  %xmm4,%xmm4\n    9a1b:\t4c 29 cf             \tsub    %r9,%rdi\n    9a1e:\t75 bf                \tjne    99df <add128fp_loop>\n    9a20:\t66 48 0f 7e c8       \tmovq   %xmm1,%rax\n    9a25:\t41 58                \tpop    %r8\n    9a27:\t41 59                \tpop    %r9\n    9a29:\tc3                   \tretq   
\n\n0000000000009a2a <latmul64>:\n    9a2a:\t53                   \tpush   %rbx\n    9a2b:\t51                   \tpush   %rcx\n    9a2c:\t41 50                \tpush   %r8\n    9a2e:\t41 51                \tpush   %r9\n    9a30:\t41 52                \tpush   %r10\n    9a32:\t41 53                \tpush   %r11\n    9a34:\t41 54                \tpush   %r12\n    9a36:\t41 55                \tpush   %r13\n    9a38:\t41 56                \tpush   %r14\n    9a3a:\t41 57                \tpush   %r15\n    9a3c:\t49 c7 c0 01 00 00 00 \tmov    $0x1,%r8\n    9a43:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n    9a4a:\t4c 89 c3             \tmov    %r8,%rbx\n    9a4d:\t4c 89 c1             \tmov    %r8,%rcx\n    9a50:\t4d 89 c2             \tmov    %r8,%r10\n    9a53:\t4d 89 c3             \tmov    %r8,%r11\n    9a56:\t4d 89 c4             \tmov    %r8,%r12\n    9a59:\t4d 89 c5             \tmov    %r8,%r13\n    9a5c:\t4d 89 c6             \tmov    %r8,%r14\n    9a5f:\t4d 89 cf             \tmov    %r9,%r15\n\n0000000000009a62 <latmul64_loop>:\n    9a62:\t4d 0f af f9          \timul   %r9,%r15\n    9a66:\t4d 0f af f9          \timul   %r9,%r15\n    9a6a:\t4d 0f af f9          \timul   %r9,%r15\n    9a6e:\t4d 0f af f9          \timul   %r9,%r15\n    9a72:\t4d 0f af f9          \timul   %r9,%r15\n    9a76:\t4d 0f af f9          \timul   %r9,%r15\n    9a7a:\t4d 0f af f9          \timul   %r9,%r15\n    9a7e:\t4d 0f af f9          \timul   %r9,%r15\n    9a82:\t4d 0f af f9          \timul   %r9,%r15\n    9a86:\t4d 0f af f9          \timul   %r9,%r15\n    9a8a:\t4d 0f af f9          \timul   %r9,%r15\n    9a8e:\t4d 0f af f9          \timul   %r9,%r15\n    9a92:\t4d 0f af f9          \timul   %r9,%r15\n    9a96:\t4d 0f af f9          \timul   %r9,%r15\n    9a9a:\t4d 0f af f9          \timul   %r9,%r15\n    9a9e:\t4d 0f af f9          \timul   %r9,%r15\n    9aa2:\t4d 0f af f9          \timul   %r9,%r15\n    9aa6:\t4d 0f af f9          \timul   %r9,%r15\n    9aaa:\t4d 0f af f9          
\timul   %r9,%r15\n    9aae:\t4d 0f af f9          \timul   %r9,%r15\n    9ab2:\t4c 29 cf             \tsub    %r9,%rdi\n    9ab5:\t75 ab                \tjne    9a62 <latmul64_loop>\n    9ab7:\t41 5f                \tpop    %r15\n    9ab9:\t41 5e                \tpop    %r14\n    9abb:\t41 5d                \tpop    %r13\n    9abd:\t41 5c                \tpop    %r12\n    9abf:\t41 5b                \tpop    %r11\n    9ac1:\t41 5a                \tpop    %r10\n    9ac3:\t41 59                \tpop    %r9\n    9ac5:\t41 58                \tpop    %r8\n    9ac7:\t59                   \tpop    %rcx\n    9ac8:\t5b                   \tpop    %rbx\n    9ac9:\tc3                   \tretq   \n\n0000000000009aca <latmul16>:\n    9aca:\t53                   \tpush   %rbx\n    9acb:\t51                   \tpush   %rcx\n    9acc:\t41 50                \tpush   %r8\n    9ace:\t41 51                \tpush   %r9\n    9ad0:\t41 52                \tpush   %r10\n    9ad2:\t41 53                \tpush   %r11\n    9ad4:\t41 54                \tpush   %r12\n    9ad6:\t41 55                \tpush   %r13\n    9ad8:\t41 56                \tpush   %r14\n    9ada:\t41 57                \tpush   %r15\n    9adc:\t49 c7 c0 01 00 00 00 \tmov    $0x1,%r8\n    9ae3:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n    9aea:\t4c 89 c3             \tmov    %r8,%rbx\n    9aed:\t4c 89 c1             \tmov    %r8,%rcx\n    9af0:\t4d 89 c2             \tmov    %r8,%r10\n    9af3:\t4d 89 c3             \tmov    %r8,%r11\n    9af6:\t4d 89 c4             \tmov    %r8,%r12\n    9af9:\t4d 89 c5             \tmov    %r8,%r13\n    9afc:\t4d 89 c6             \tmov    %r8,%r14\n    9aff:\t4d 89 cf             \tmov    %r9,%r15\n\n0000000000009b02 <latmul16_loop>:\n    9b02:\t66 45 0f af f9       \timul   %r9w,%r15w\n    9b07:\t66 45 0f af f9       \timul   %r9w,%r15w\n    9b0c:\t66 45 0f af f9       \timul   %r9w,%r15w\n    9b11:\t66 45 0f af f9       \timul   %r9w,%r15w\n    9b16:\t66 45 0f af f9       \timul   
%r9w,%r15w\n    9b1b:\t66 45 0f af f9       \timul   %r9w,%r15w\n    9b20:\t66 45 0f af f9       \timul   %r9w,%r15w\n    9b25:\t66 45 0f af f9       \timul   %r9w,%r15w\n    9b2a:\t66 45 0f af f9       \timul   %r9w,%r15w\n    9b2f:\t66 45 0f af f9       \timul   %r9w,%r15w\n    9b34:\t66 45 0f af f9       \timul   %r9w,%r15w\n    9b39:\t66 45 0f af f9       \timul   %r9w,%r15w\n    9b3e:\t66 45 0f af f9       \timul   %r9w,%r15w\n    9b43:\t66 45 0f af f9       \timul   %r9w,%r15w\n    9b48:\t66 45 0f af f9       \timul   %r9w,%r15w\n    9b4d:\t66 45 0f af f9       \timul   %r9w,%r15w\n    9b52:\t66 45 0f af f9       \timul   %r9w,%r15w\n    9b57:\t66 45 0f af f9       \timul   %r9w,%r15w\n    9b5c:\t66 45 0f af f9       \timul   %r9w,%r15w\n    9b61:\t66 45 0f af f9       \timul   %r9w,%r15w\n    9b66:\t4c 29 cf             \tsub    %r9,%rdi\n    9b69:\t75 97                \tjne    9b02 <latmul16_loop>\n    9b6b:\t41 5f                \tpop    %r15\n    9b6d:\t41 5e                \tpop    %r14\n    9b6f:\t41 5d                \tpop    %r13\n    9b71:\t41 5c                \tpop    %r12\n    9b73:\t41 5b                \tpop    %r11\n    9b75:\t41 5a                \tpop    %r10\n    9b77:\t41 59                \tpop    %r9\n    9b79:\t41 58                \tpop    %r8\n    9b7b:\t59                   \tpop    %rcx\n    9b7c:\t5b                   \tpop    %rbx\n    9b7d:\tc3                   \tretq   \n\n0000000000009b7e <mul16>:\n    9b7e:\t53                   \tpush   %rbx\n    9b7f:\t51                   \tpush   %rcx\n    9b80:\t41 50                \tpush   %r8\n    9b82:\t41 51                \tpush   %r9\n    9b84:\t41 52                \tpush   %r10\n    9b86:\t41 53                \tpush   %r11\n    9b88:\t41 54                \tpush   %r12\n    9b8a:\t41 55                \tpush   %r13\n    9b8c:\t41 56                \tpush   %r14\n    9b8e:\t41 57                \tpush   %r15\n    9b90:\t49 c7 c0 01 00 00 00 \tmov    $0x1,%r8\n    9b97:\t49 c7 c1 
14 00 00 00 \tmov    $0x14,%r9\n    9b9e:\t4c 89 c3             \tmov    %r8,%rbx\n    9ba1:\t4c 89 c1             \tmov    %r8,%rcx\n    9ba4:\t4d 89 c2             \tmov    %r8,%r10\n    9ba7:\t4d 89 c3             \tmov    %r8,%r11\n    9baa:\t4d 89 c4             \tmov    %r8,%r12\n    9bad:\t4d 89 c5             \tmov    %r8,%r13\n    9bb0:\t4d 89 c6             \tmov    %r8,%r14\n    9bb3:\t4d 89 cf             \tmov    %r9,%r15\n\n0000000000009bb6 <mul16_loop>:\n    9bb6:\t66 45 0f af f9       \timul   %r9w,%r15w\n    9bbb:\t66 45 0f af f1       \timul   %r9w,%r14w\n    9bc0:\t66 45 0f af e9       \timul   %r9w,%r13w\n    9bc5:\t66 45 0f af e1       \timul   %r9w,%r12w\n    9bca:\t66 45 0f af d9       \timul   %r9w,%r11w\n    9bcf:\t66 45 0f af f9       \timul   %r9w,%r15w\n    9bd4:\t66 45 0f af f1       \timul   %r9w,%r14w\n    9bd9:\t66 45 0f af e9       \timul   %r9w,%r13w\n    9bde:\t66 45 0f af e1       \timul   %r9w,%r12w\n    9be3:\t66 45 0f af d9       \timul   %r9w,%r11w\n    9be8:\t66 45 0f af f9       \timul   %r9w,%r15w\n    9bed:\t66 45 0f af f1       \timul   %r9w,%r14w\n    9bf2:\t66 45 0f af e9       \timul   %r9w,%r13w\n    9bf7:\t66 45 0f af e1       \timul   %r9w,%r12w\n    9bfc:\t66 45 0f af d9       \timul   %r9w,%r11w\n    9c01:\t66 45 0f af f9       \timul   %r9w,%r15w\n    9c06:\t66 45 0f af f1       \timul   %r9w,%r14w\n    9c0b:\t66 45 0f af e9       \timul   %r9w,%r13w\n    9c10:\t66 45 0f af e1       \timul   %r9w,%r12w\n    9c15:\t66 45 0f af d9       \timul   %r9w,%r11w\n    9c1a:\t4c 29 cf             \tsub    %r9,%rdi\n    9c1d:\t75 97                \tjne    9bb6 <mul16_loop>\n    9c1f:\t41 5f                \tpop    %r15\n    9c21:\t41 5e                \tpop    %r14\n    9c23:\t41 5d                \tpop    %r13\n    9c25:\t41 5c                \tpop    %r12\n    9c27:\t41 5b                \tpop    %r11\n    9c29:\t41 5a                \tpop    %r10\n    9c2b:\t41 59                \tpop    %r9\n    9c2d:\t41 58           
     \tpop    %r8\n    9c2f:\t59                   \tpop    %rcx\n    9c30:\t5b                   \tpop    %rbx\n    9c31:\tc3                   \tretq   \n\n0000000000009c32 <mul64>:\n    9c32:\t53                   \tpush   %rbx\n    9c33:\t51                   \tpush   %rcx\n    9c34:\t56                   \tpush   %rsi\n    9c35:\t41 50                \tpush   %r8\n    9c37:\t41 51                \tpush   %r9\n    9c39:\t41 52                \tpush   %r10\n    9c3b:\t41 53                \tpush   %r11\n    9c3d:\t41 54                \tpush   %r12\n    9c3f:\t41 55                \tpush   %r13\n    9c41:\t41 56                \tpush   %r14\n    9c43:\t41 57                \tpush   %r15\n    9c45:\t49 c7 c0 01 00 00 00 \tmov    $0x1,%r8\n    9c4c:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n    9c53:\t4c 89 c3             \tmov    %r8,%rbx\n    9c56:\t4c 89 c1             \tmov    %r8,%rcx\n    9c59:\t4d 89 c2             \tmov    %r8,%r10\n    9c5c:\t4d 89 c3             \tmov    %r8,%r11\n    9c5f:\t4d 89 c4             \tmov    %r8,%r12\n    9c62:\t4d 89 c5             \tmov    %r8,%r13\n    9c65:\t4d 89 c6             \tmov    %r8,%r14\n    9c68:\t4d 89 cf             \tmov    %r9,%r15\n\n0000000000009c6b <mul64_loop>:\n    9c6b:\t4d 0f af f9          \timul   %r9,%r15\n    9c6f:\t4d 89 cf             \tmov    %r9,%r15\n    9c72:\t4d 0f af f1          \timul   %r9,%r14\n    9c76:\t4d 89 ce             \tmov    %r9,%r14\n    9c79:\t4d 0f af e9          \timul   %r9,%r13\n    9c7d:\t4d 89 cd             \tmov    %r9,%r13\n    9c80:\t4d 0f af e1          \timul   %r9,%r12\n    9c84:\t4d 89 cc             \tmov    %r9,%r12\n    9c87:\t4d 0f af d9          \timul   %r9,%r11\n    9c8b:\t4d 89 cb             \tmov    %r9,%r11\n    9c8e:\t4d 0f af d1          \timul   %r9,%r10\n    9c92:\t4d 89 ca             \tmov    %r9,%r10\n    9c95:\t4d 0f af c1          \timul   %r9,%r8\n    9c99:\t4d 89 c8             \tmov    %r9,%r8\n    9c9c:\t49 0f af d9          \timul   
%r9,%rbx\n    9ca0:\t4c 89 cb             \tmov    %r9,%rbx\n    9ca3:\t49 0f af c9          \timul   %r9,%rcx\n    9ca7:\t4c 89 c9             \tmov    %r9,%rcx\n    9caa:\t49 0f af f1          \timul   %r9,%rsi\n    9cae:\t4c 89 ce             \tmov    %r9,%rsi\n    9cb1:\t4d 0f af f9          \timul   %r9,%r15\n    9cb5:\t4d 89 cf             \tmov    %r9,%r15\n    9cb8:\t4d 0f af f1          \timul   %r9,%r14\n    9cbc:\t4d 89 ce             \tmov    %r9,%r14\n    9cbf:\t4d 0f af e9          \timul   %r9,%r13\n    9cc3:\t4d 89 cd             \tmov    %r9,%r13\n    9cc6:\t4d 0f af e1          \timul   %r9,%r12\n    9cca:\t4d 89 cc             \tmov    %r9,%r12\n    9ccd:\t4d 0f af d9          \timul   %r9,%r11\n    9cd1:\t4d 89 cb             \tmov    %r9,%r11\n    9cd4:\t4d 0f af d1          \timul   %r9,%r10\n    9cd8:\t4d 89 ca             \tmov    %r9,%r10\n    9cdb:\t4d 0f af c1          \timul   %r9,%r8\n    9cdf:\t4d 89 c8             \tmov    %r9,%r8\n    9ce2:\t49 0f af d9          \timul   %r9,%rbx\n    9ce6:\t4c 89 cb             \tmov    %r9,%rbx\n    9ce9:\t49 0f af c9          \timul   %r9,%rcx\n    9ced:\t4c 89 c9             \tmov    %r9,%rcx\n    9cf0:\t49 0f af f1          \timul   %r9,%rsi\n    9cf4:\t4c 89 ce             \tmov    %r9,%rsi\n    9cf7:\t4c 29 cf             \tsub    %r9,%rdi\n    9cfa:\t0f 85 6b ff ff ff    \tjne    9c6b <mul64_loop>\n    9d00:\t41 5f                \tpop    %r15\n    9d02:\t41 5e                \tpop    %r14\n    9d04:\t41 5d                \tpop    %r13\n    9d06:\t41 5c                \tpop    %r12\n    9d08:\t41 5b                \tpop    %r11\n    9d0a:\t41 5a                \tpop    %r10\n    9d0c:\t41 59                \tpop    %r9\n    9d0e:\t41 58                \tpop    %r8\n    9d10:\t5e                   \tpop    %rsi\n    9d11:\t59                   \tpop    %rcx\n    9d12:\t5b                   \tpop    %rbx\n    9d13:\tc3                   \tretq   \n\n0000000000009d14 <mixmul16mul64>:\n    
9d14:\t53                   \tpush   %rbx\n    9d15:\t51                   \tpush   %rcx\n    9d16:\t56                   \tpush   %rsi\n    9d17:\t41 50                \tpush   %r8\n    9d19:\t41 51                \tpush   %r9\n    9d1b:\t41 52                \tpush   %r10\n    9d1d:\t41 53                \tpush   %r11\n    9d1f:\t41 54                \tpush   %r12\n    9d21:\t41 55                \tpush   %r13\n    9d23:\t41 56                \tpush   %r14\n    9d25:\t41 57                \tpush   %r15\n    9d27:\t49 c7 c0 01 00 00 00 \tmov    $0x1,%r8\n    9d2e:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n    9d35:\t4c 89 c3             \tmov    %r8,%rbx\n    9d38:\t4c 89 c1             \tmov    %r8,%rcx\n    9d3b:\t4d 89 c2             \tmov    %r8,%r10\n    9d3e:\t4d 89 c3             \tmov    %r8,%r11\n    9d41:\t4d 89 c4             \tmov    %r8,%r12\n    9d44:\t4d 89 c5             \tmov    %r8,%r13\n    9d47:\t4d 89 c6             \tmov    %r8,%r14\n    9d4a:\t4d 89 cf             \tmov    %r9,%r15\n\n0000000000009d4d <mixmul16mul64_loop>:\n    9d4d:\t4d 0f af f9          \timul   %r9,%r15\n    9d51:\t66 45 0f af f1       \timul   %r9w,%r14w\n    9d56:\t4d 0f af e9          \timul   %r9,%r13\n    9d5a:\t66 45 0f af e1       \timul   %r9w,%r12w\n    9d5f:\t4d 0f af d9          \timul   %r9,%r11\n    9d63:\t66 45 0f af d1       \timul   %r9w,%r10w\n    9d68:\t4d 0f af c1          \timul   %r9,%r8\n    9d6c:\t66 41 0f af d9       \timul   %r9w,%bx\n    9d71:\t49 0f af c9          \timul   %r9,%rcx\n    9d75:\t66 41 0f af f1       \timul   %r9w,%si\n    9d7a:\t4d 0f af f9          \timul   %r9,%r15\n    9d7e:\t66 45 0f af f1       \timul   %r9w,%r14w\n    9d83:\t4d 0f af e9          \timul   %r9,%r13\n    9d87:\t66 45 0f af e1       \timul   %r9w,%r12w\n    9d8c:\t4d 0f af d9          \timul   %r9,%r11\n    9d90:\t66 45 0f af d1       \timul   %r9w,%r10w\n    9d95:\t4d 0f af c1          \timul   %r9,%r8\n    9d99:\t66 41 0f af d9       \timul   %r9w,%bx\n    
9d9e:\t49 0f af c9          \timul   %r9,%rcx\n    9da2:\t66 41 0f af f1       \timul   %r9w,%si\n    9da7:\t4c 29 cf             \tsub    %r9,%rdi\n    9daa:\t75 a1                \tjne    9d4d <mixmul16mul64_loop>\n    9dac:\t41 5f                \tpop    %r15\n    9dae:\t41 5e                \tpop    %r14\n    9db0:\t41 5d                \tpop    %r13\n    9db2:\t41 5c                \tpop    %r12\n    9db4:\t41 5b                \tpop    %r11\n    9db6:\t41 5a                \tpop    %r10\n    9db8:\t41 59                \tpop    %r9\n    9dba:\t41 58                \tpop    %r8\n    9dbc:\t5e                   \tpop    %rsi\n    9dbd:\t59                   \tpop    %rcx\n    9dbe:\t5b                   \tpop    %rbx\n    9dbf:\tc3                   \tretq   \n\n0000000000009dc0 <mixmul16mul64_21>:\n    9dc0:\t53                   \tpush   %rbx\n    9dc1:\t51                   \tpush   %rcx\n    9dc2:\t52                   \tpush   %rdx\n    9dc3:\t56                   \tpush   %rsi\n    9dc4:\t41 50                \tpush   %r8\n    9dc6:\t41 51                \tpush   %r9\n    9dc8:\t41 52                \tpush   %r10\n    9dca:\t41 53                \tpush   %r11\n    9dcc:\t41 54                \tpush   %r12\n    9dce:\t41 55                \tpush   %r13\n    9dd0:\t41 56                \tpush   %r14\n    9dd2:\t41 57                \tpush   %r15\n    9dd4:\t49 c7 c0 01 00 00 00 \tmov    $0x1,%r8\n    9ddb:\t49 c7 c1 18 00 00 00 \tmov    $0x18,%r9\n    9de2:\t4c 89 c3             \tmov    %r8,%rbx\n    9de5:\t4c 89 c1             \tmov    %r8,%rcx\n    9de8:\t4c 89 c6             \tmov    %r8,%rsi\n    9deb:\t4d 89 c2             \tmov    %r8,%r10\n    9dee:\t4d 89 c3             \tmov    %r8,%r11\n    9df1:\t4d 89 c4             \tmov    %r8,%r12\n    9df4:\t4d 89 c5             \tmov    %r8,%r13\n    9df7:\t4d 89 c6             \tmov    %r8,%r14\n    9dfa:\t4d 89 cf             \tmov    %r9,%r15\n\n0000000000009dfd <mixmul16mul64_21_loop>:\n    9dfd:\t4d 
0f af f9          \timul   %r9,%r15\n    9e01:\t66 45 0f af f1       \timul   %r9w,%r14w\n    9e06:\t66 45 0f af e9       \timul   %r9w,%r13w\n    9e0b:\t4d 0f af e1          \timul   %r9,%r12\n    9e0f:\t66 45 0f af d9       \timul   %r9w,%r11w\n    9e14:\t66 45 0f af d1       \timul   %r9w,%r10w\n    9e19:\t4d 0f af c1          \timul   %r9,%r8\n    9e1d:\t66 45 0f af f1       \timul   %r9w,%r14w\n    9e22:\t66 45 0f af e9       \timul   %r9w,%r13w\n    9e27:\t49 0f af c9          \timul   %r9,%rcx\n    9e2b:\t66 45 0f af d9       \timul   %r9w,%r11w\n    9e30:\t66 45 0f af d1       \timul   %r9w,%r10w\n    9e35:\t49 0f af d9          \timul   %r9,%rbx\n    9e39:\t66 45 0f af f1       \timul   %r9w,%r14w\n    9e3e:\t66 45 0f af e9       \timul   %r9w,%r13w\n    9e43:\t49 0f af c1          \timul   %r9,%rax\n    9e47:\t66 45 0f af d9       \timul   %r9w,%r11w\n    9e4c:\t66 45 0f af d1       \timul   %r9w,%r10w\n    9e51:\t49 0f af f1          \timul   %r9,%rsi\n    9e55:\t66 45 0f af f1       \timul   %r9w,%r14w\n    9e5a:\t66 45 0f af e9       \timul   %r9w,%r13w\n    9e5f:\t49 0f af d1          \timul   %r9,%rdx\n    9e63:\t66 45 0f af d9       \timul   %r9w,%r11w\n    9e68:\t66 45 0f af d1       \timul   %r9w,%r10w\n    9e6d:\t4c 29 cf             \tsub    %r9,%rdi\n    9e70:\t7d 8b                \tjge    9dfd <mixmul16mul64_21_loop>\n    9e72:\t41 5f                \tpop    %r15\n    9e74:\t41 5e                \tpop    %r14\n    9e76:\t41 5d                \tpop    %r13\n    9e78:\t41 5c                \tpop    %r12\n    9e7a:\t41 5b                \tpop    %r11\n    9e7c:\t41 5a                \tpop    %r10\n    9e7e:\t41 59                \tpop    %r9\n    9e80:\t41 58                \tpop    %r8\n    9e82:\t5e                   \tpop    %rsi\n    9e83:\t5a                   \tpop    %rdx\n    9e84:\t59                   \tpop    %rcx\n    9e85:\t5b                   \tpop    %rbx\n    9e86:\tc3                   \tretq   \n\n0000000000009e87 
<spacedstorescalar>:\n    9e87:\t53                   \tpush   %rbx\n    9e88:\t51                   \tpush   %rcx\n    9e89:\t41 50                \tpush   %r8\n    9e8b:\t41 51                \tpush   %r9\n    9e8d:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n\n0000000000009e94 <spacedstorescalar_loop>:\n    9e94:\t48 89 3e             \tmov    %rdi,(%rsi)\n    9e97:\t48 89 7e 40          \tmov    %rdi,0x40(%rsi)\n    9e9b:\t48 89 be 80 00 00 00 \tmov    %rdi,0x80(%rsi)\n    9ea2:\t48 89 be c0 00 00 00 \tmov    %rdi,0xc0(%rsi)\n    9ea9:\t48 89 be 00 01 00 00 \tmov    %rdi,0x100(%rsi)\n    9eb0:\t48 89 be 40 01 00 00 \tmov    %rdi,0x140(%rsi)\n    9eb7:\t48 89 be 80 01 00 00 \tmov    %rdi,0x180(%rsi)\n    9ebe:\t48 89 be c0 01 00 00 \tmov    %rdi,0x1c0(%rsi)\n    9ec5:\t48 89 be 00 02 00 00 \tmov    %rdi,0x200(%rsi)\n    9ecc:\t48 89 be 40 02 00 00 \tmov    %rdi,0x240(%rsi)\n    9ed3:\t48 89 be 80 02 00 00 \tmov    %rdi,0x280(%rsi)\n    9eda:\t48 89 be c0 02 00 00 \tmov    %rdi,0x2c0(%rsi)\n    9ee1:\t48 89 be 00 03 00 00 \tmov    %rdi,0x300(%rsi)\n    9ee8:\t48 89 be 40 03 00 00 \tmov    %rdi,0x340(%rsi)\n    9eef:\t48 89 be 80 03 00 00 \tmov    %rdi,0x380(%rsi)\n    9ef6:\t48 89 be c0 03 00 00 \tmov    %rdi,0x3c0(%rsi)\n    9efd:\t48 89 be 00 04 00 00 \tmov    %rdi,0x400(%rsi)\n    9f04:\t48 89 be 40 04 00 00 \tmov    %rdi,0x440(%rsi)\n    9f0b:\t48 89 be 80 04 00 00 \tmov    %rdi,0x480(%rsi)\n    9f12:\t48 89 be c0 04 00 00 \tmov    %rdi,0x4c0(%rsi)\n    9f19:\t4c 29 cf             \tsub    %r9,%rdi\n    9f1c:\t0f 85 72 ff ff ff    \tjne    9e94 <spacedstorescalar_loop>\n    9f22:\t41 59                \tpop    %r9\n    9f24:\t41 58                \tpop    %r8\n    9f26:\t59                   \tpop    %rcx\n    9f27:\t5b                   \tpop    %rbx\n    9f28:\tc3                   \tretq   \n\n0000000000009f29 <spacedload128>:\n    9f29:\t53                   \tpush   %rbx\n    9f2a:\t51                   \tpush   %rcx\n    9f2b:\t41 50                \tpush 
  %r8\n    9f2d:\t41 51                \tpush   %r9\n    9f2f:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n\n0000000000009f36 <spacedload128_loop>:\n    9f36:\t66 44 0f 6f 16       \tmovdqa (%rsi),%xmm10\n    9f3b:\t66 44 0f 6f 5e 40    \tmovdqa 0x40(%rsi),%xmm11\n    9f41:\t66 44 0f 6f a6 80 00 \tmovdqa 0x80(%rsi),%xmm12\n    9f48:\t00 00 \n    9f4a:\t66 44 0f 6f ae c0 00 \tmovdqa 0xc0(%rsi),%xmm13\n    9f51:\t00 00 \n    9f53:\t66 44 0f 6f b6 00 01 \tmovdqa 0x100(%rsi),%xmm14\n    9f5a:\t00 00 \n    9f5c:\t66 44 0f 6f 96 40 01 \tmovdqa 0x140(%rsi),%xmm10\n    9f63:\t00 00 \n    9f65:\t66 44 0f 6f 9e 80 01 \tmovdqa 0x180(%rsi),%xmm11\n    9f6c:\t00 00 \n    9f6e:\t66 44 0f 6f a6 c0 01 \tmovdqa 0x1c0(%rsi),%xmm12\n    9f75:\t00 00 \n    9f77:\t66 44 0f 6f ae 00 02 \tmovdqa 0x200(%rsi),%xmm13\n    9f7e:\t00 00 \n    9f80:\t66 44 0f 6f b6 40 02 \tmovdqa 0x240(%rsi),%xmm14\n    9f87:\t00 00 \n    9f89:\t66 44 0f 6f 96 80 02 \tmovdqa 0x280(%rsi),%xmm10\n    9f90:\t00 00 \n    9f92:\t66 44 0f 6f 9e c0 02 \tmovdqa 0x2c0(%rsi),%xmm11\n    9f99:\t00 00 \n    9f9b:\t66 44 0f 6f a6 00 03 \tmovdqa 0x300(%rsi),%xmm12\n    9fa2:\t00 00 \n    9fa4:\t66 44 0f 6f ae 40 03 \tmovdqa 0x340(%rsi),%xmm13\n    9fab:\t00 00 \n    9fad:\t66 44 0f 6f b6 80 03 \tmovdqa 0x380(%rsi),%xmm14\n    9fb4:\t00 00 \n    9fb6:\t66 44 0f 6f 96 c0 03 \tmovdqa 0x3c0(%rsi),%xmm10\n    9fbd:\t00 00 \n    9fbf:\t66 44 0f 6f 9e 00 04 \tmovdqa 0x400(%rsi),%xmm11\n    9fc6:\t00 00 \n    9fc8:\t66 44 0f 6f a6 40 04 \tmovdqa 0x440(%rsi),%xmm12\n    9fcf:\t00 00 \n    9fd1:\t66 44 0f 6f ae 80 04 \tmovdqa 0x480(%rsi),%xmm13\n    9fd8:\t00 00 \n    9fda:\t66 44 0f 6f b6 c0 04 \tmovdqa 0x4c0(%rsi),%xmm14\n    9fe1:\t00 00 \n    9fe3:\t4c 29 cf             \tsub    %r9,%rdi\n    9fe6:\t0f 85 4a ff ff ff    \tjne    9f36 <spacedload128_loop>\n    9fec:\t41 59                \tpop    %r9\n    9fee:\t41 58                \tpop    %r8\n    9ff0:\t59                   \tpop    %rcx\n    9ff1:\t5b                   \tpop    
%rbx\n    9ff2:\tc3                   \tretq   \n\n0000000000009ff3 <load128>:\n    9ff3:\t53                   \tpush   %rbx\n    9ff4:\t51                   \tpush   %rcx\n    9ff5:\t41 50                \tpush   %r8\n    9ff7:\t41 51                \tpush   %r9\n    9ff9:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n\n000000000000a000 <load128_loop>:\n    a000:\t66 44 0f 6f 16       \tmovdqa (%rsi),%xmm10\n    a005:\t66 44 0f 6f 1e       \tmovdqa (%rsi),%xmm11\n    a00a:\t66 44 0f 6f 26       \tmovdqa (%rsi),%xmm12\n    a00f:\t66 44 0f 6f 2e       \tmovdqa (%rsi),%xmm13\n    a014:\t66 44 0f 6f 36       \tmovdqa (%rsi),%xmm14\n    a019:\t66 44 0f 6f 16       \tmovdqa (%rsi),%xmm10\n    a01e:\t66 44 0f 6f 1e       \tmovdqa (%rsi),%xmm11\n    a023:\t66 44 0f 6f 26       \tmovdqa (%rsi),%xmm12\n    a028:\t66 44 0f 6f 2e       \tmovdqa (%rsi),%xmm13\n    a02d:\t66 44 0f 6f 36       \tmovdqa (%rsi),%xmm14\n    a032:\t66 44 0f 6f 16       \tmovdqa (%rsi),%xmm10\n    a037:\t66 44 0f 6f 1e       \tmovdqa (%rsi),%xmm11\n    a03c:\t66 44 0f 6f 26       \tmovdqa (%rsi),%xmm12\n    a041:\t66 44 0f 6f 2e       \tmovdqa (%rsi),%xmm13\n    a046:\t66 44 0f 6f 36       \tmovdqa (%rsi),%xmm14\n    a04b:\t66 44 0f 6f 16       \tmovdqa (%rsi),%xmm10\n    a050:\t66 44 0f 6f 1e       \tmovdqa (%rsi),%xmm11\n    a055:\t66 44 0f 6f 26       \tmovdqa (%rsi),%xmm12\n    a05a:\t66 44 0f 6f 2e       \tmovdqa (%rsi),%xmm13\n    a05f:\t66 44 0f 6f 36       \tmovdqa (%rsi),%xmm14\n    a064:\t4c 29 cf             \tsub    %r9,%rdi\n    a067:\t75 97                \tjne    a000 <load128_loop>\n    a069:\t41 59                \tpop    %r9\n    a06b:\t41 58                \tpop    %r8\n    a06d:\t59                   \tpop    %rcx\n    a06e:\t5b                   \tpop    %rbx\n    a06f:\tc3                   \tretq   \n\n000000000000a070 <load256>:\n    a070:\t53                   \tpush   %rbx\n    a071:\t51                   \tpush   %rcx\n    a072:\t41 50                \tpush   %r8\n    
a074:\t41 51                \tpush   %r9\n    a076:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n\n000000000000a07d <load256_loop>:\n    a07d:\tc5 7c 28 16          \tvmovaps (%rsi),%ymm10\n    a081:\tc5 7c 28 1e          \tvmovaps (%rsi),%ymm11\n    a085:\tc5 7c 28 26          \tvmovaps (%rsi),%ymm12\n    a089:\tc5 7c 28 2e          \tvmovaps (%rsi),%ymm13\n    a08d:\tc5 7c 28 36          \tvmovaps (%rsi),%ymm14\n    a091:\tc5 7c 28 16          \tvmovaps (%rsi),%ymm10\n    a095:\tc5 7c 28 1e          \tvmovaps (%rsi),%ymm11\n    a099:\tc5 7c 28 26          \tvmovaps (%rsi),%ymm12\n    a09d:\tc5 7c 28 2e          \tvmovaps (%rsi),%ymm13\n    a0a1:\tc5 7c 28 36          \tvmovaps (%rsi),%ymm14\n    a0a5:\tc5 7c 28 16          \tvmovaps (%rsi),%ymm10\n    a0a9:\tc5 7c 28 1e          \tvmovaps (%rsi),%ymm11\n    a0ad:\tc5 7c 28 26          \tvmovaps (%rsi),%ymm12\n    a0b1:\tc5 7c 28 2e          \tvmovaps (%rsi),%ymm13\n    a0b5:\tc5 7c 28 36          \tvmovaps (%rsi),%ymm14\n    a0b9:\tc5 7c 28 16          \tvmovaps (%rsi),%ymm10\n    a0bd:\tc5 7c 28 1e          \tvmovaps (%rsi),%ymm11\n    a0c1:\tc5 7c 28 26          \tvmovaps (%rsi),%ymm12\n    a0c5:\tc5 7c 28 2e          \tvmovaps (%rsi),%ymm13\n    a0c9:\tc5 7c 28 36          \tvmovaps (%rsi),%ymm14\n    a0cd:\t4c 29 cf             \tsub    %r9,%rdi\n    a0d0:\t75 ab                \tjne    a07d <load256_loop>\n    a0d2:\t41 59                \tpop    %r9\n    a0d4:\t41 58                \tpop    %r8\n    a0d6:\t59                   \tpop    %rcx\n    a0d7:\t5b                   \tpop    %rbx\n    a0d8:\tc3                   \tretq   \n\n000000000000a0d9 <load512>:\n    a0d9:\t53                   \tpush   %rbx\n    a0da:\t51                   \tpush   %rcx\n    a0db:\t41 50                \tpush   %r8\n    a0dd:\t41 51                \tpush   %r9\n    a0df:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n\n000000000000a0e6 <load512_loop>:\n    a0e6:\t62 71 7c 48 28 16    \tvmovaps (%rsi),%zmm10\n    a0ec:\t62 71 7c 48 28 
1e    \tvmovaps (%rsi),%zmm11\n    a0f2:\t62 71 7c 48 28 26    \tvmovaps (%rsi),%zmm12\n    a0f8:\t62 71 7c 48 28 2e    \tvmovaps (%rsi),%zmm13\n    a0fe:\t62 71 7c 48 28 36    \tvmovaps (%rsi),%zmm14\n    a104:\t62 71 7c 48 28 16    \tvmovaps (%rsi),%zmm10\n    a10a:\t62 71 7c 48 28 1e    \tvmovaps (%rsi),%zmm11\n    a110:\t62 71 7c 48 28 26    \tvmovaps (%rsi),%zmm12\n    a116:\t62 71 7c 48 28 2e    \tvmovaps (%rsi),%zmm13\n    a11c:\t62 71 7c 48 28 36    \tvmovaps (%rsi),%zmm14\n    a122:\t62 71 7c 48 28 16    \tvmovaps (%rsi),%zmm10\n    a128:\t62 71 7c 48 28 1e    \tvmovaps (%rsi),%zmm11\n    a12e:\t62 71 7c 48 28 26    \tvmovaps (%rsi),%zmm12\n    a134:\t62 71 7c 48 28 2e    \tvmovaps (%rsi),%zmm13\n    a13a:\t62 71 7c 48 28 36    \tvmovaps (%rsi),%zmm14\n    a140:\t62 71 7c 48 28 16    \tvmovaps (%rsi),%zmm10\n    a146:\t62 71 7c 48 28 1e    \tvmovaps (%rsi),%zmm11\n    a14c:\t62 71 7c 48 28 26    \tvmovaps (%rsi),%zmm12\n    a152:\t62 71 7c 48 28 2e    \tvmovaps (%rsi),%zmm13\n    a158:\t62 71 7c 48 28 36    \tvmovaps (%rsi),%zmm14\n    a15e:\t4c 29 cf             \tsub    %r9,%rdi\n    a161:\t75 83                \tjne    a0e6 <load512_loop>\n    a163:\t41 59                \tpop    %r9\n    a165:\t41 58                \tpop    %r8\n    a167:\t59                   \tpop    %rcx\n    a168:\t5b                   \tpop    %rbx\n    a169:\tc3                   \tretq   \n\n000000000000a16a <store128>:\n    a16a:\t53                   \tpush   %rbx\n    a16b:\t51                   \tpush   %rcx\n    a16c:\t41 50                \tpush   %r8\n    a16e:\t41 51                \tpush   %r9\n    a170:\t66 44 0f 6f 16       \tmovdqa (%rsi),%xmm10\n    a175:\t66 45 0f 6f da       \tmovdqa %xmm10,%xmm11\n    a17a:\t66 45 0f 6f e2       \tmovdqa %xmm10,%xmm12\n    a17f:\t66 45 0f 6f ea       \tmovdqa %xmm10,%xmm13\n    a184:\t66 45 0f 6f f2       \tmovdqa %xmm10,%xmm14\n    a189:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n\n000000000000a190 <store128_loop>:\n    a190:\t66 
44 0f 7f 12       \tmovdqa %xmm10,(%rdx)\n    a195:\t66 44 0f 7f 1a       \tmovdqa %xmm11,(%rdx)\n    a19a:\t66 44 0f 7f 22       \tmovdqa %xmm12,(%rdx)\n    a19f:\t66 44 0f 7f 2a       \tmovdqa %xmm13,(%rdx)\n    a1a4:\t66 44 0f 7f 32       \tmovdqa %xmm14,(%rdx)\n    a1a9:\t66 44 0f 7f 12       \tmovdqa %xmm10,(%rdx)\n    a1ae:\t66 44 0f 7f 1a       \tmovdqa %xmm11,(%rdx)\n    a1b3:\t66 44 0f 7f 22       \tmovdqa %xmm12,(%rdx)\n    a1b8:\t66 44 0f 7f 2a       \tmovdqa %xmm13,(%rdx)\n    a1bd:\t66 44 0f 7f 32       \tmovdqa %xmm14,(%rdx)\n    a1c2:\t66 44 0f 7f 12       \tmovdqa %xmm10,(%rdx)\n    a1c7:\t66 44 0f 7f 1a       \tmovdqa %xmm11,(%rdx)\n    a1cc:\t66 44 0f 7f 22       \tmovdqa %xmm12,(%rdx)\n    a1d1:\t66 44 0f 7f 2a       \tmovdqa %xmm13,(%rdx)\n    a1d6:\t66 44 0f 7f 32       \tmovdqa %xmm14,(%rdx)\n    a1db:\t66 44 0f 7f 12       \tmovdqa %xmm10,(%rdx)\n    a1e0:\t66 44 0f 7f 1a       \tmovdqa %xmm11,(%rdx)\n    a1e5:\t66 44 0f 7f 22       \tmovdqa %xmm12,(%rdx)\n    a1ea:\t66 44 0f 7f 2a       \tmovdqa %xmm13,(%rdx)\n    a1ef:\t66 44 0f 7f 32       \tmovdqa %xmm14,(%rdx)\n    a1f4:\t4c 29 cf             \tsub    %r9,%rdi\n    a1f7:\t75 97                \tjne    a190 <store128_loop>\n    a1f9:\t41 59                \tpop    %r9\n    a1fb:\t41 58                \tpop    %r8\n    a1fd:\t59                   \tpop    %rcx\n    a1fe:\t5b                   \tpop    %rbx\n    a1ff:\tc3                   \tretq   \n\n000000000000a200 <store256>:\n    a200:\t53                   \tpush   %rbx\n    a201:\t51                   \tpush   %rcx\n    a202:\t41 50                \tpush   %r8\n    a204:\t41 51                \tpush   %r9\n    a206:\tc5 7c 28 16          \tvmovaps (%rsi),%ymm10\n    a20a:\tc4 41 7c 28 da       \tvmovaps %ymm10,%ymm11\n    a20f:\tc4 41 7c 28 e2       \tvmovaps %ymm10,%ymm12\n    a214:\tc4 41 7c 28 ea       \tvmovaps %ymm10,%ymm13\n    a219:\tc4 41 7c 28 f2       \tvmovaps %ymm10,%ymm14\n    a21e:\t49 c7 c1 14 00 00 00 \tmov    
$0x14,%r9\n\n000000000000a225 <store256_loop>:\n    a225:\tc5 7c 29 12          \tvmovaps %ymm10,(%rdx)\n    a229:\tc5 7c 29 1a          \tvmovaps %ymm11,(%rdx)\n    a22d:\tc5 7c 29 22          \tvmovaps %ymm12,(%rdx)\n    a231:\tc5 7c 29 2a          \tvmovaps %ymm13,(%rdx)\n    a235:\tc5 7c 29 32          \tvmovaps %ymm14,(%rdx)\n    a239:\tc5 7c 29 12          \tvmovaps %ymm10,(%rdx)\n    a23d:\tc5 7c 29 1a          \tvmovaps %ymm11,(%rdx)\n    a241:\tc5 7c 29 22          \tvmovaps %ymm12,(%rdx)\n    a245:\tc5 7c 29 2a          \tvmovaps %ymm13,(%rdx)\n    a249:\tc5 7c 29 32          \tvmovaps %ymm14,(%rdx)\n    a24d:\tc5 7c 29 12          \tvmovaps %ymm10,(%rdx)\n    a251:\tc5 7c 29 1a          \tvmovaps %ymm11,(%rdx)\n    a255:\tc5 7c 29 22          \tvmovaps %ymm12,(%rdx)\n    a259:\tc5 7c 29 2a          \tvmovaps %ymm13,(%rdx)\n    a25d:\tc5 7c 29 32          \tvmovaps %ymm14,(%rdx)\n    a261:\tc5 7c 29 12          \tvmovaps %ymm10,(%rdx)\n    a265:\tc5 7c 29 1a          \tvmovaps %ymm11,(%rdx)\n    a269:\tc5 7c 29 22          \tvmovaps %ymm12,(%rdx)\n    a26d:\tc5 7c 29 2a          \tvmovaps %ymm13,(%rdx)\n    a271:\tc5 7c 29 32          \tvmovaps %ymm14,(%rdx)\n    a275:\t4c 29 cf             \tsub    %r9,%rdi\n    a278:\t75 ab                \tjne    a225 <store256_loop>\n    a27a:\t41 59                \tpop    %r9\n    a27c:\t41 58                \tpop    %r8\n    a27e:\t59                   \tpop    %rcx\n    a27f:\t5b                   \tpop    %rbx\n    a280:\tc3                   \tretq   \n\n000000000000a281 <store512>:\n    a281:\t53                   \tpush   %rbx\n    a282:\t51                   \tpush   %rcx\n    a283:\t41 50                \tpush   %r8\n    a285:\t41 51                \tpush   %r9\n    a287:\t62 71 7c 48 28 16    \tvmovaps (%rsi),%zmm10\n    a28d:\t62 51 7c 48 28 da    \tvmovaps %zmm10,%zmm11\n    a293:\t62 51 7c 48 28 e2    \tvmovaps %zmm10,%zmm12\n    a299:\t62 51 7c 48 28 ea    \tvmovaps %zmm10,%zmm13\n    a29f:\t62 51 7c 48 
28 f2    \tvmovaps %zmm10,%zmm14\n    a2a5:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n\n000000000000a2ac <store512_loop>:\n    a2ac:\t62 71 7c 48 29 12    \tvmovaps %zmm10,(%rdx)\n    a2b2:\t62 71 7c 48 29 1a    \tvmovaps %zmm11,(%rdx)\n    a2b8:\t62 71 7c 48 29 22    \tvmovaps %zmm12,(%rdx)\n    a2be:\t62 71 7c 48 29 2a    \tvmovaps %zmm13,(%rdx)\n    a2c4:\t62 71 7c 48 29 32    \tvmovaps %zmm14,(%rdx)\n    a2ca:\t62 71 7c 48 29 12    \tvmovaps %zmm10,(%rdx)\n    a2d0:\t62 71 7c 48 29 1a    \tvmovaps %zmm11,(%rdx)\n    a2d6:\t62 71 7c 48 29 22    \tvmovaps %zmm12,(%rdx)\n    a2dc:\t62 71 7c 48 29 2a    \tvmovaps %zmm13,(%rdx)\n    a2e2:\t62 71 7c 48 29 32    \tvmovaps %zmm14,(%rdx)\n    a2e8:\t62 71 7c 48 29 12    \tvmovaps %zmm10,(%rdx)\n    a2ee:\t62 71 7c 48 29 1a    \tvmovaps %zmm11,(%rdx)\n    a2f4:\t62 71 7c 48 29 22    \tvmovaps %zmm12,(%rdx)\n    a2fa:\t62 71 7c 48 29 2a    \tvmovaps %zmm13,(%rdx)\n    a300:\t62 71 7c 48 29 32    \tvmovaps %zmm14,(%rdx)\n    a306:\t62 71 7c 48 29 12    \tvmovaps %zmm10,(%rdx)\n    a30c:\t62 71 7c 48 29 1a    \tvmovaps %zmm11,(%rdx)\n    a312:\t62 71 7c 48 29 22    \tvmovaps %zmm12,(%rdx)\n    a318:\t62 71 7c 48 29 2a    \tvmovaps %zmm13,(%rdx)\n    a31e:\t62 71 7c 48 29 32    \tvmovaps %zmm14,(%rdx)\n    a324:\t4c 29 cf             \tsub    %r9,%rdi\n    a327:\t75 83                \tjne    a2ac <store512_loop>\n    a329:\t41 59                \tpop    %r9\n    a32b:\t41 58                \tpop    %r8\n    a32d:\t59                   \tpop    %rcx\n    a32e:\t5b                   \tpop    %rbx\n    a32f:\tc3                   \tretq   \n\n000000000000a330 <pdeptest>:\n    a330:\t53                   \tpush   %rbx\n    a331:\t51                   \tpush   %rcx\n    a332:\t41 50                \tpush   %r8\n    a334:\t41 51                \tpush   %r9\n    a336:\t41 52                \tpush   %r10\n    a338:\t41 53                \tpush   %r11\n    a33a:\t41 54                \tpush   %r12\n    a33c:\t41 55                
\tpush   %r13\n    a33e:\t41 56                \tpush   %r14\n    a340:\t41 57                \tpush   %r15\n    a342:\t49 c7 c0 01 00 00 00 \tmov    $0x1,%r8\n    a349:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n    a350:\t48 31 db             \txor    %rbx,%rbx\n    a353:\t48 31 c9             \txor    %rcx,%rcx\n    a356:\t4d 31 d2             \txor    %r10,%r10\n    a359:\t4d 31 db             \txor    %r11,%r11\n    a35c:\t4d 31 e4             \txor    %r12,%r12\n    a35f:\t4d 31 ed             \txor    %r13,%r13\n    a362:\t4d 31 f6             \txor    %r14,%r14\n    a365:\t4d 31 ff             \txor    %r15,%r15\n\n000000000000a368 <pdeptest_loop>:\n    a368:\tc4 42 83 f5 f8       \tpdep   %r8,%r15,%r15\n    a36d:\tc4 42 8b f5 f0       \tpdep   %r8,%r14,%r14\n    a372:\tc4 42 93 f5 e8       \tpdep   %r8,%r13,%r13\n    a377:\tc4 42 9b f5 e0       \tpdep   %r8,%r12,%r12\n    a37c:\tc4 42 a3 f5 d8       \tpdep   %r8,%r11,%r11\n    a381:\tc4 42 ab f5 d0       \tpdep   %r8,%r10,%r10\n    a386:\tc4 c2 f3 f5 c8       \tpdep   %r8,%rcx,%rcx\n    a38b:\tc4 c2 e3 f5 d8       \tpdep   %r8,%rbx,%rbx\n    a390:\tc4 42 83 f5 f8       \tpdep   %r8,%r15,%r15\n    a395:\tc4 42 8b f5 f0       \tpdep   %r8,%r14,%r14\n    a39a:\tc4 42 93 f5 e8       \tpdep   %r8,%r13,%r13\n    a39f:\tc4 42 9b f5 e0       \tpdep   %r8,%r12,%r12\n    a3a4:\tc4 42 a3 f5 d8       \tpdep   %r8,%r11,%r11\n    a3a9:\tc4 42 ab f5 d0       \tpdep   %r8,%r10,%r10\n    a3ae:\tc4 c2 f3 f5 c8       \tpdep   %r8,%rcx,%rcx\n    a3b3:\tc4 c2 e3 f5 d8       \tpdep   %r8,%rbx,%rbx\n    a3b8:\tc4 42 83 f5 f8       \tpdep   %r8,%r15,%r15\n    a3bd:\tc4 42 8b f5 f0       \tpdep   %r8,%r14,%r14\n    a3c2:\tc4 42 93 f5 e8       \tpdep   %r8,%r13,%r13\n    a3c7:\tc4 42 9b f5 e0       \tpdep   %r8,%r12,%r12\n    a3cc:\t4c 29 cf             \tsub    %r9,%rdi\n    a3cf:\t75 97                \tjne    a368 <pdeptest_loop>\n    a3d1:\t41 5f                \tpop    %r15\n    a3d3:\t41 5e                \tpop    %r14\n    
a3d5:\t41 5d                \tpop    %r13\n    a3d7:\t41 5c                \tpop    %r12\n    a3d9:\t41 5b                \tpop    %r11\n    a3db:\t41 5a                \tpop    %r10\n    a3dd:\t41 59                \tpop    %r9\n    a3df:\t41 58                \tpop    %r8\n    a3e1:\t59                   \tpop    %rcx\n    a3e2:\t5b                   \tpop    %rbx\n    a3e3:\tc3                   \tretq   \n\n000000000000a3e4 <pdepmultest>:\n    a3e4:\t53                   \tpush   %rbx\n    a3e5:\t51                   \tpush   %rcx\n    a3e6:\t56                   \tpush   %rsi\n    a3e7:\t41 50                \tpush   %r8\n    a3e9:\t41 51                \tpush   %r9\n    a3eb:\t41 52                \tpush   %r10\n    a3ed:\t41 53                \tpush   %r11\n    a3ef:\t41 54                \tpush   %r12\n    a3f1:\t41 55                \tpush   %r13\n    a3f3:\t41 56                \tpush   %r14\n    a3f5:\t41 57                \tpush   %r15\n    a3f7:\t49 c7 c0 01 00 00 00 \tmov    $0x1,%r8\n    a3fe:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n    a405:\t48 31 db             \txor    %rbx,%rbx\n    a408:\t48 31 c9             \txor    %rcx,%rcx\n    a40b:\t48 31 f6             \txor    %rsi,%rsi\n    a40e:\t4d 31 d2             \txor    %r10,%r10\n    a411:\t4d 31 db             \txor    %r11,%r11\n    a414:\t4d 31 e4             \txor    %r12,%r12\n    a417:\t4d 31 ed             \txor    %r13,%r13\n    a41a:\t4d 31 f6             \txor    %r14,%r14\n    a41d:\t4d 31 ff             \txor    %r15,%r15\n\n000000000000a420 <pdepmultest_loop>:\n    a420:\tc4 42 83 f5 f8       \tpdep   %r8,%r15,%r15\n    a425:\t4d 0f af f1          \timul   %r9,%r14\n    a429:\tc4 42 93 f5 e8       \tpdep   %r8,%r13,%r13\n    a42e:\t4d 0f af e1          \timul   %r9,%r12\n    a432:\tc4 42 a3 f5 d8       \tpdep   %r8,%r11,%r11\n    a437:\t4d 0f af d1          \timul   %r9,%r10\n    a43b:\tc4 c2 f3 f5 c8       \tpdep   %r8,%rcx,%rcx\n    a440:\t49 0f af d9          \timul   
%r9,%rbx\n    a444:\tc4 42 83 f5 f8       \tpdep   %r8,%r15,%r15\n    a449:\t49 0f af f1          \timul   %r9,%rsi\n    a44d:\tc4 42 83 f5 f8       \tpdep   %r8,%r15,%r15\n    a452:\t4d 0f af f1          \timul   %r9,%r14\n    a456:\tc4 42 93 f5 e8       \tpdep   %r8,%r13,%r13\n    a45b:\t4d 0f af e1          \timul   %r9,%r12\n    a45f:\tc4 42 a3 f5 d8       \tpdep   %r8,%r11,%r11\n    a464:\t4d 0f af d1          \timul   %r9,%r10\n    a468:\tc4 c2 f3 f5 c8       \tpdep   %r8,%rcx,%rcx\n    a46d:\t49 0f af d9          \timul   %r9,%rbx\n    a471:\tc4 42 83 f5 f8       \tpdep   %r8,%r15,%r15\n    a476:\t49 0f af f1          \timul   %r9,%rsi\n    a47a:\t4c 29 cf             \tsub    %r9,%rdi\n    a47d:\t75 a1                \tjne    a420 <pdepmultest_loop>\n    a47f:\t41 5f                \tpop    %r15\n    a481:\t41 5e                \tpop    %r14\n    a483:\t41 5d                \tpop    %r13\n    a485:\t41 5c                \tpop    %r12\n    a487:\t41 5b                \tpop    %r11\n    a489:\t41 5a                \tpop    %r10\n    a48b:\t41 59                \tpop    %r9\n    a48d:\t41 58                \tpop    %r8\n    a48f:\t5e                   \tpop    %rsi\n    a490:\t59                   \tpop    %rcx\n    a491:\t5b                   \tpop    %rbx\n    a492:\tc3                   \tretq   \n\n000000000000a493 <pexttest>:\n    a493:\t53                   \tpush   %rbx\n    a494:\t51                   \tpush   %rcx\n    a495:\t41 50                \tpush   %r8\n    a497:\t41 51                \tpush   %r9\n    a499:\t41 52                \tpush   %r10\n    a49b:\t41 53                \tpush   %r11\n    a49d:\t41 54                \tpush   %r12\n    a49f:\t41 55                \tpush   %r13\n    a4a1:\t41 56                \tpush   %r14\n    a4a3:\t41 57                \tpush   %r15\n    a4a5:\t49 c7 c0 01 00 00 00 \tmov    $0x1,%r8\n    a4ac:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n    a4b3:\t48 31 db             \txor    %rbx,%rbx\n    a4b6:\t48 31 
c9             \txor    %rcx,%rcx\n    a4b9:\t4d 31 d2             \txor    %r10,%r10\n    a4bc:\t4d 31 db             \txor    %r11,%r11\n    a4bf:\t4d 31 e4             \txor    %r12,%r12\n    a4c2:\t4d 31 ed             \txor    %r13,%r13\n    a4c5:\t4d 31 f6             \txor    %r14,%r14\n    a4c8:\t4d 31 ff             \txor    %r15,%r15\n\n000000000000a4cb <pexttest_loop>:\n    a4cb:\tc4 42 82 f5 f8       \tpext   %r8,%r15,%r15\n    a4d0:\tc4 42 8a f5 f0       \tpext   %r8,%r14,%r14\n    a4d5:\tc4 42 92 f5 e8       \tpext   %r8,%r13,%r13\n    a4da:\tc4 42 9a f5 e0       \tpext   %r8,%r12,%r12\n    a4df:\tc4 42 a2 f5 d8       \tpext   %r8,%r11,%r11\n    a4e4:\tc4 42 aa f5 d0       \tpext   %r8,%r10,%r10\n    a4e9:\tc4 c2 f2 f5 c8       \tpext   %r8,%rcx,%rcx\n    a4ee:\tc4 c2 e2 f5 d8       \tpext   %r8,%rbx,%rbx\n    a4f3:\tc4 42 82 f5 f8       \tpext   %r8,%r15,%r15\n    a4f8:\tc4 42 8a f5 f0       \tpext   %r8,%r14,%r14\n    a4fd:\tc4 42 92 f5 e8       \tpext   %r8,%r13,%r13\n    a502:\tc4 42 9a f5 e0       \tpext   %r8,%r12,%r12\n    a507:\tc4 42 a2 f5 d8       \tpext   %r8,%r11,%r11\n    a50c:\tc4 42 aa f5 d0       \tpext   %r8,%r10,%r10\n    a511:\tc4 c2 f2 f5 c8       \tpext   %r8,%rcx,%rcx\n    a516:\tc4 c2 e2 f5 d8       \tpext   %r8,%rbx,%rbx\n    a51b:\tc4 42 82 f5 f8       \tpext   %r8,%r15,%r15\n    a520:\tc4 42 8a f5 f0       \tpext   %r8,%r14,%r14\n    a525:\tc4 42 92 f5 e8       \tpext   %r8,%r13,%r13\n    a52a:\tc4 42 9a f5 e0       \tpext   %r8,%r12,%r12\n    a52f:\t4c 29 cf             \tsub    %r9,%rdi\n    a532:\t75 97                \tjne    a4cb <pexttest_loop>\n    a534:\t41 5f                \tpop    %r15\n    a536:\t41 5e                \tpop    %r14\n    a538:\t41 5d                \tpop    %r13\n    a53a:\t41 5c                \tpop    %r12\n    a53c:\t41 5b                \tpop    %r11\n    a53e:\t41 5a                \tpop    %r10\n    a540:\t41 59                \tpop    %r9\n    a542:\t41 58                \tpop    %r8\n    
a544:\t59                   \tpop    %rcx\n    a545:\t5b                   \tpop    %rbx\n    a546:\tc3                   \tretq   \n\n000000000000a547 <depmovtest>:\n    a547:\t53                   \tpush   %rbx\n    a548:\t41 50                \tpush   %r8\n    a54a:\t41 51                \tpush   %r9\n    a54c:\t41 57                \tpush   %r15\n    a54e:\t41 56                \tpush   %r14\n    a550:\t41 55                \tpush   %r13\n    a552:\t41 54                \tpush   %r12\n    a554:\t41 53                \tpush   %r11\n    a556:\t41 52                \tpush   %r10\n    a558:\t49 c7 c0 01 00 00 00 \tmov    $0x1,%r8\n    a55f:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n    a566:\t48 31 db             \txor    %rbx,%rbx\n\n000000000000a569 <depmovtest_loop>:\n    a569:\t4d 89 fc             \tmov    %r15,%r12\n    a56c:\t4d 89 e6             \tmov    %r12,%r14\n    a56f:\t4d 89 f5             \tmov    %r14,%r13\n    a572:\t4d 89 eb             \tmov    %r13,%r11\n    a575:\t4d 89 df             \tmov    %r11,%r15\n    a578:\t4d 89 fc             \tmov    %r15,%r12\n    a57b:\t4d 89 e6             \tmov    %r12,%r14\n    a57e:\t4d 89 f5             \tmov    %r14,%r13\n    a581:\t4d 89 eb             \tmov    %r13,%r11\n    a584:\t4d 89 df             \tmov    %r11,%r15\n    a587:\t4d 89 fc             \tmov    %r15,%r12\n    a58a:\t4d 89 e6             \tmov    %r12,%r14\n    a58d:\t4d 89 f5             \tmov    %r14,%r13\n    a590:\t4d 89 eb             \tmov    %r13,%r11\n    a593:\t4d 89 df             \tmov    %r11,%r15\n    a596:\t4d 89 fc             \tmov    %r15,%r12\n    a599:\t4d 89 e6             \tmov    %r12,%r14\n    a59c:\t4d 89 f5             \tmov    %r14,%r13\n    a59f:\t4d 89 eb             \tmov    %r13,%r11\n    a5a2:\t4d 89 df             \tmov    %r11,%r15\n    a5a5:\t4c 29 cf             \tsub    %r9,%rdi\n    a5a8:\t75 bf                \tjne    a569 <depmovtest_loop>\n    a5aa:\t41 5a                \tpop    %r10\n    a5ac:\t41 
5b                \tpop    %r11\n    a5ae:\t41 5c                \tpop    %r12\n    a5b0:\t41 5d                \tpop    %r13\n    a5b2:\t41 5e                \tpop    %r14\n    a5b4:\t41 5f                \tpop    %r15\n    a5b6:\t41 59                \tpop    %r9\n    a5b8:\t41 58                \tpop    %r8\n    a5ba:\t5b                   \tpop    %rbx\n    a5bb:\tc3                   \tretq   \n\n000000000000a5bc <indepmovtest>:\n    a5bc:\t53                   \tpush   %rbx\n    a5bd:\t51                   \tpush   %rcx\n    a5be:\t41 50                \tpush   %r8\n    a5c0:\t41 51                \tpush   %r9\n    a5c2:\t41 57                \tpush   %r15\n    a5c4:\t41 56                \tpush   %r14\n    a5c6:\t41 55                \tpush   %r13\n    a5c8:\t41 54                \tpush   %r12\n    a5ca:\t41 53                \tpush   %r11\n    a5cc:\t41 52                \tpush   %r10\n    a5ce:\t49 c7 c0 01 00 00 00 \tmov    $0x1,%r8\n    a5d5:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n    a5dc:\t48 31 db             \txor    %rbx,%rbx\n\n000000000000a5df <indepmovtest_loop>:\n    a5df:\t4d 89 d7             \tmov    %r10,%r15\n    a5e2:\t4d 89 de             \tmov    %r11,%r14\n    a5e5:\t4d 89 e5             \tmov    %r12,%r13\n    a5e8:\t49 89 c7             \tmov    %rax,%r15\n    a5eb:\t49 89 ce             \tmov    %rcx,%r14\n    a5ee:\t4d 89 d7             \tmov    %r10,%r15\n    a5f1:\t4d 89 de             \tmov    %r11,%r14\n    a5f4:\t4d 89 e5             \tmov    %r12,%r13\n    a5f7:\t49 89 c7             \tmov    %rax,%r15\n    a5fa:\t49 89 ce             \tmov    %rcx,%r14\n    a5fd:\t4d 89 d7             \tmov    %r10,%r15\n    a600:\t4d 89 de             \tmov    %r11,%r14\n    a603:\t4d 89 e5             \tmov    %r12,%r13\n    a606:\t49 89 c7             \tmov    %rax,%r15\n    a609:\t49 89 ce             \tmov    %rcx,%r14\n    a60c:\t4d 89 d7             \tmov    %r10,%r15\n    a60f:\t4d 89 de             \tmov    %r11,%r14\n    
a612:\t4d 89 e5             \tmov    %r12,%r13\n    a615:\t49 89 c7             \tmov    %rax,%r15\n    a618:\t49 89 ce             \tmov    %rcx,%r14\n    a61b:\t4c 29 cf             \tsub    %r9,%rdi\n    a61e:\t75 bf                \tjne    a5df <indepmovtest_loop>\n    a620:\t41 5a                \tpop    %r10\n    a622:\t41 5b                \tpop    %r11\n    a624:\t41 5c                \tpop    %r12\n    a626:\t41 5d                \tpop    %r13\n    a628:\t41 5e                \tpop    %r14\n    a62a:\t41 5f                \tpop    %r15\n    a62c:\t41 59                \tpop    %r9\n    a62e:\t41 58                \tpop    %r8\n    a630:\t59                   \tpop    %rcx\n    a631:\t5b                   \tpop    %rbx\n    a632:\tc3                   \tretq   \n\n000000000000a633 <movzerotest>:\n    a633:\t53                   \tpush   %rbx\n    a634:\t51                   \tpush   %rcx\n    a635:\t41 50                \tpush   %r8\n    a637:\t41 51                \tpush   %r9\n    a639:\t41 57                \tpush   %r15\n    a63b:\t41 56                \tpush   %r14\n    a63d:\t41 55                \tpush   %r13\n    a63f:\t41 54                \tpush   %r12\n    a641:\t41 53                \tpush   %r11\n    a643:\t41 52                \tpush   %r10\n    a645:\t49 c7 c0 01 00 00 00 \tmov    $0x1,%r8\n    a64c:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n    a653:\t48 31 db             \txor    %rbx,%rbx\n\n000000000000a656 <movzerotest_loop>:\n    a656:\t49 c7 c7 00 00 00 00 \tmov    $0x0,%r15\n    a65d:\t49 c7 c7 00 00 00 00 \tmov    $0x0,%r15\n    a664:\t49 c7 c7 00 00 00 00 \tmov    $0x0,%r15\n    a66b:\t49 c7 c7 00 00 00 00 \tmov    $0x0,%r15\n    a672:\t49 c7 c7 00 00 00 00 \tmov    $0x0,%r15\n    a679:\t49 c7 c7 00 00 00 00 \tmov    $0x0,%r15\n    a680:\t49 c7 c7 00 00 00 00 \tmov    $0x0,%r15\n    a687:\t49 c7 c7 00 00 00 00 \tmov    $0x0,%r15\n    a68e:\t49 c7 c7 00 00 00 00 \tmov    $0x0,%r15\n    a695:\t49 c7 c7 00 00 00 00 \tmov    $0x0,%r15\n 
   a69c:\t49 c7 c7 00 00 00 00 \tmov    $0x0,%r15\n    a6a3:\t49 c7 c7 00 00 00 00 \tmov    $0x0,%r15\n    a6aa:\t49 c7 c7 00 00 00 00 \tmov    $0x0,%r15\n    a6b1:\t49 c7 c7 00 00 00 00 \tmov    $0x0,%r15\n    a6b8:\t49 c7 c7 00 00 00 00 \tmov    $0x0,%r15\n    a6bf:\t49 c7 c7 00 00 00 00 \tmov    $0x0,%r15\n    a6c6:\t49 c7 c7 00 00 00 00 \tmov    $0x0,%r15\n    a6cd:\t49 c7 c7 00 00 00 00 \tmov    $0x0,%r15\n    a6d4:\t49 c7 c7 00 00 00 00 \tmov    $0x0,%r15\n    a6db:\t49 c7 c7 00 00 00 00 \tmov    $0x0,%r15\n    a6e2:\t4c 29 cf             \tsub    %r9,%rdi\n    a6e5:\t0f 85 6b ff ff ff    \tjne    a656 <movzerotest_loop>\n    a6eb:\t41 5a                \tpop    %r10\n    a6ed:\t41 5b                \tpop    %r11\n    a6ef:\t41 5c                \tpop    %r12\n    a6f1:\t41 5d                \tpop    %r13\n    a6f3:\t41 5e                \tpop    %r14\n    a6f5:\t41 5f                \tpop    %r15\n    a6f7:\t41 59                \tpop    %r9\n    a6f9:\t41 58                \tpop    %r8\n    a6fb:\t59                   \tpop    %rcx\n    a6fc:\t5b                   \tpop    %rbx\n    a6fd:\tc3                   \tretq   \n\n000000000000a6fe <xorzerotest>:\n    a6fe:\t53                   \tpush   %rbx\n    a6ff:\t51                   \tpush   %rcx\n    a700:\t41 50                \tpush   %r8\n    a702:\t41 51                \tpush   %r9\n    a704:\t41 57                \tpush   %r15\n    a706:\t41 56                \tpush   %r14\n    a708:\t41 55                \tpush   %r13\n    a70a:\t41 54                \tpush   %r12\n    a70c:\t41 53                \tpush   %r11\n    a70e:\t41 52                \tpush   %r10\n    a710:\t49 c7 c0 01 00 00 00 \tmov    $0x1,%r8\n    a717:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n    a71e:\t48 31 db             \txor    %rbx,%rbx\n\n000000000000a721 <xorzerotest_loop>:\n    a721:\t4d 31 ff             \txor    %r15,%r15\n    a724:\t4d 31 ff             \txor    %r15,%r15\n    a727:\t4d 31 ff             \txor    
%r15,%r15\n    a72a:\t4d 31 ff             \txor    %r15,%r15\n    a72d:\t4d 31 ff             \txor    %r15,%r15\n    a730:\t4d 31 ff             \txor    %r15,%r15\n    a733:\t4d 31 ff             \txor    %r15,%r15\n    a736:\t4d 31 ff             \txor    %r15,%r15\n    a739:\t4d 31 ff             \txor    %r15,%r15\n    a73c:\t4d 31 ff             \txor    %r15,%r15\n    a73f:\t4d 31 ff             \txor    %r15,%r15\n    a742:\t4d 31 ff             \txor    %r15,%r15\n    a745:\t4d 31 ff             \txor    %r15,%r15\n    a748:\t4d 31 ff             \txor    %r15,%r15\n    a74b:\t4d 31 ff             \txor    %r15,%r15\n    a74e:\t4d 31 ff             \txor    %r15,%r15\n    a751:\t4d 31 ff             \txor    %r15,%r15\n    a754:\t4d 31 ff             \txor    %r15,%r15\n    a757:\t4d 31 ff             \txor    %r15,%r15\n    a75a:\t4d 31 ff             \txor    %r15,%r15\n    a75d:\t4c 29 cf             \tsub    %r9,%rdi\n    a760:\t75 bf                \tjne    a721 <xorzerotest_loop>\n    a762:\t41 5a                \tpop    %r10\n    a764:\t41 5b                \tpop    %r11\n    a766:\t41 5c                \tpop    %r12\n    a768:\t41 5d                \tpop    %r13\n    a76a:\t41 5e                \tpop    %r14\n    a76c:\t41 5f                \tpop    %r15\n    a76e:\t41 59                \tpop    %r9\n    a770:\t41 58                \tpop    %r8\n    a772:\t59                   \tpop    %rcx\n    a773:\t5b                   \tpop    %rbx\n    a774:\tc3                   \tretq   \n\n000000000000a775 <subzerotest>:\n    a775:\t53                   \tpush   %rbx\n    a776:\t51                   \tpush   %rcx\n    a777:\t41 50                \tpush   %r8\n    a779:\t41 51                \tpush   %r9\n    a77b:\t41 57                \tpush   %r15\n    a77d:\t41 56                \tpush   %r14\n    a77f:\t41 55                \tpush   %r13\n    a781:\t41 54                \tpush   %r12\n    a783:\t41 53                \tpush   %r11\n    a785:\t41 52     
           \tpush   %r10\n    a787:\t49 c7 c0 01 00 00 00 \tmov    $0x1,%r8\n    a78e:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n    a795:\t48 31 db             \txor    %rbx,%rbx\n\n000000000000a798 <subzerotest_loop>:\n    a798:\t4d 29 ff             \tsub    %r15,%r15\n    a79b:\t4d 29 ff             \tsub    %r15,%r15\n    a79e:\t4d 29 ff             \tsub    %r15,%r15\n    a7a1:\t4d 29 ff             \tsub    %r15,%r15\n    a7a4:\t4d 29 ff             \tsub    %r15,%r15\n    a7a7:\t4d 29 ff             \tsub    %r15,%r15\n    a7aa:\t4d 29 ff             \tsub    %r15,%r15\n    a7ad:\t4d 29 ff             \tsub    %r15,%r15\n    a7b0:\t4d 29 ff             \tsub    %r15,%r15\n    a7b3:\t4d 29 ff             \tsub    %r15,%r15\n    a7b6:\t4d 29 ff             \tsub    %r15,%r15\n    a7b9:\t4d 29 ff             \tsub    %r15,%r15\n    a7bc:\t4d 29 ff             \tsub    %r15,%r15\n    a7bf:\t4d 29 ff             \tsub    %r15,%r15\n    a7c2:\t4d 29 ff             \tsub    %r15,%r15\n    a7c5:\t4d 29 ff             \tsub    %r15,%r15\n    a7c8:\t4d 29 ff             \tsub    %r15,%r15\n    a7cb:\t4d 29 ff             \tsub    %r15,%r15\n    a7ce:\t4d 29 ff             \tsub    %r15,%r15\n    a7d1:\t4d 29 ff             \tsub    %r15,%r15\n    a7d4:\t4c 29 cf             \tsub    %r9,%rdi\n    a7d7:\t75 bf                \tjne    a798 <subzerotest_loop>\n    a7d9:\t41 5a                \tpop    %r10\n    a7db:\t41 5b                \tpop    %r11\n    a7dd:\t41 5c                \tpop    %r12\n    a7df:\t41 5d                \tpop    %r13\n    a7e1:\t41 5e                \tpop    %r14\n    a7e3:\t41 5f                \tpop    %r15\n    a7e5:\t41 59                \tpop    %r9\n    a7e7:\t41 58                \tpop    %r8\n    a7e9:\t59                   \tpop    %rcx\n    a7ea:\t5b                   \tpop    %rbx\n    a7eb:\tc3                   \tretq   \n\n000000000000a7ec <depaddimmtest>:\n    a7ec:\t53                   \tpush   %rbx\n    a7ed:\t51              
     \tpush   %rcx\n    a7ee:\t41 50                \tpush   %r8\n    a7f0:\t41 51                \tpush   %r9\n    a7f2:\t41 57                \tpush   %r15\n    a7f4:\t41 56                \tpush   %r14\n    a7f6:\t41 55                \tpush   %r13\n    a7f8:\t41 54                \tpush   %r12\n    a7fa:\t41 53                \tpush   %r11\n    a7fc:\t41 52                \tpush   %r10\n    a7fe:\t49 c7 c0 01 00 00 00 \tmov    $0x1,%r8\n    a805:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n    a80c:\t48 31 db             \txor    %rbx,%rbx\n\n000000000000a80f <depaddimmtest_loop>:\n    a80f:\t49 83 c7 01          \tadd    $0x1,%r15\n    a813:\t49 83 c7 02          \tadd    $0x2,%r15\n    a817:\t49 83 c7 03          \tadd    $0x3,%r15\n    a81b:\t49 83 c7 04          \tadd    $0x4,%r15\n    a81f:\t49 83 c7 05          \tadd    $0x5,%r15\n    a823:\t49 83 c7 06          \tadd    $0x6,%r15\n    a827:\t49 83 c7 07          \tadd    $0x7,%r15\n    a82b:\t49 83 c7 08          \tadd    $0x8,%r15\n    a82f:\t49 83 c7 09          \tadd    $0x9,%r15\n    a833:\t49 83 c7 0a          \tadd    $0xa,%r15\n    a837:\t49 83 c7 0b          \tadd    $0xb,%r15\n    a83b:\t49 83 c7 0c          \tadd    $0xc,%r15\n    a83f:\t49 83 c7 0d          \tadd    $0xd,%r15\n    a843:\t49 83 c7 0e          \tadd    $0xe,%r15\n    a847:\t49 83 c7 0f          \tadd    $0xf,%r15\n    a84b:\t49 83 c7 10          \tadd    $0x10,%r15\n    a84f:\t49 83 c7 11          \tadd    $0x11,%r15\n    a853:\t49 83 c7 12          \tadd    $0x12,%r15\n    a857:\t49 83 c7 13          \tadd    $0x13,%r15\n    a85b:\t49 83 c7 14          \tadd    $0x14,%r15\n    a85f:\t4c 29 cf             \tsub    %r9,%rdi\n    a862:\t75 ab                \tjne    a80f <depaddimmtest_loop>\n    a864:\t41 5a                \tpop    %r10\n    a866:\t41 5b                \tpop    %r11\n    a868:\t41 5c                \tpop    %r12\n    a86a:\t41 5d                \tpop    %r13\n    a86c:\t41 5e                \tpop    %r14\n    
a86e:\t41 5f                \tpop    %r15\n    a870:\t41 59                \tpop    %r9\n    a872:\t41 58                \tpop    %r8\n    a874:\t59                   \tpop    %rcx\n    a875:\t5b                   \tpop    %rbx\n    a876:\tc3                   \tretq   \n\n000000000000a877 <depinctest>:\n    a877:\t53                   \tpush   %rbx\n    a878:\t51                   \tpush   %rcx\n    a879:\t41 50                \tpush   %r8\n    a87b:\t41 51                \tpush   %r9\n    a87d:\t41 57                \tpush   %r15\n    a87f:\t41 56                \tpush   %r14\n    a881:\t41 55                \tpush   %r13\n    a883:\t41 54                \tpush   %r12\n    a885:\t41 53                \tpush   %r11\n    a887:\t41 52                \tpush   %r10\n    a889:\t49 c7 c0 01 00 00 00 \tmov    $0x1,%r8\n    a890:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n    a897:\t48 31 db             \txor    %rbx,%rbx\n\n000000000000a89a <depinctest_loop>:\n    a89a:\t49 ff c7             \tinc    %r15\n    a89d:\t49 ff c7             \tinc    %r15\n    a8a0:\t49 ff c7             \tinc    %r15\n    a8a3:\t49 ff c7             \tinc    %r15\n    a8a6:\t49 ff c7             \tinc    %r15\n    a8a9:\t49 ff c7             \tinc    %r15\n    a8ac:\t49 ff c7             \tinc    %r15\n    a8af:\t49 ff c7             \tinc    %r15\n    a8b2:\t49 ff c7             \tinc    %r15\n    a8b5:\t49 ff c7             \tinc    %r15\n    a8b8:\t49 ff c7             \tinc    %r15\n    a8bb:\t49 ff c7             \tinc    %r15\n    a8be:\t49 ff c7             \tinc    %r15\n    a8c1:\t49 ff c7             \tinc    %r15\n    a8c4:\t49 ff c7             \tinc    %r15\n    a8c7:\t49 ff c7             \tinc    %r15\n    a8ca:\t49 ff c7             \tinc    %r15\n    a8cd:\t49 ff c7             \tinc    %r15\n    a8d0:\t49 ff c7             \tinc    %r15\n    a8d3:\t49 ff c7             \tinc    %r15\n    a8d6:\t4c 29 cf             \tsub    %r9,%rdi\n    a8d9:\t75 bf                \tjne    
a89a <depinctest_loop>\n    a8db:\t41 5a                \tpop    %r10\n    a8dd:\t41 5b                \tpop    %r11\n    a8df:\t41 5c                \tpop    %r12\n    a8e1:\t41 5d                \tpop    %r13\n    a8e3:\t41 5e                \tpop    %r14\n    a8e5:\t41 5f                \tpop    %r15\n    a8e7:\t41 59                \tpop    %r9\n    a8e9:\t41 58                \tpop    %r8\n    a8eb:\t59                   \tpop    %rcx\n    a8ec:\t5b                   \tpop    %rbx\n    a8ed:\tc3                   \tretq   \n\n000000000000a8ee <depdectest>:\n    a8ee:\t53                   \tpush   %rbx\n    a8ef:\t51                   \tpush   %rcx\n    a8f0:\t41 50                \tpush   %r8\n    a8f2:\t41 51                \tpush   %r9\n    a8f4:\t41 57                \tpush   %r15\n    a8f6:\t41 56                \tpush   %r14\n    a8f8:\t41 55                \tpush   %r13\n    a8fa:\t41 54                \tpush   %r12\n    a8fc:\t41 53                \tpush   %r11\n    a8fe:\t41 52                \tpush   %r10\n    a900:\t49 c7 c0 01 00 00 00 \tmov    $0x1,%r8\n    a907:\t49 c7 c1 14 00 00 00 \tmov    $0x14,%r9\n    a90e:\t4d 31 ff             \txor    %r15,%r15\n    a911:\t49 f7 d7             \tnot    %r15\n    a914:\t48 31 db             \txor    %rbx,%rbx\n\n000000000000a917 <depdectest_loop>:\n    a917:\t49 ff cf             \tdec    %r15\n    a91a:\t49 ff cf             \tdec    %r15\n    a91d:\t49 ff cf             \tdec    %r15\n    a920:\t49 ff cf             \tdec    %r15\n    a923:\t49 ff cf             \tdec    %r15\n    a926:\t49 ff cf             \tdec    %r15\n    a929:\t49 ff cf             \tdec    %r15\n    a92c:\t49 ff cf             \tdec    %r15\n    a92f:\t49 ff cf             \tdec    %r15\n    a932:\t49 ff cf             \tdec    %r15\n    a935:\t49 ff cf             \tdec    %r15\n    a938:\t49 ff cf             \tdec    %r15\n    a93b:\t49 ff cf             \tdec    %r15\n    a93e:\t49 ff cf             \tdec    %r15\n    
a941:\t49 ff cf             \tdec    %r15\n    a944:\t49 ff cf             \tdec    %r15\n    a947:\t49 ff cf             \tdec    %r15\n    a94a:\t49 ff cf             \tdec    %r15\n    a94d:\t49 ff cf             \tdec    %r15\n    a950:\t49 ff cf             \tdec    %r15\n    a953:\t4c 29 cf             \tsub    %r9,%rdi\n    a956:\t75 bf                \tjne    a917 <depdectest_loop>\n    a958:\t41 5a                \tpop    %r10\n    a95a:\t41 5b                \tpop    %r11\n    a95c:\t41 5c                \tpop    %r12\n    a95e:\t41 5d                \tpop    %r13\n    a960:\t41 5e                \tpop    %r14\n    a962:\t41 5f                \tpop    %r15\n    a964:\t41 59                \tpop    %r9\n    a966:\t41 58                \tpop    %r8\n    a968:\t59                   \tpop    %rcx\n    a969:\t5b                   \tpop    %rbx\n    a96a:\tc3                   \tretq   \n    a96b:\t0f 1f 44 00 00       \tnopl   0x0(%rax,%rax,1)\n\n000000000000a970 <load128wrapper>:\n    a970:\tf3 0f 1e fa          \tendbr64 \n    a974:\t48 8b 35 85 37 00 00 \tmov    0x3785(%rip),%rsi        # e100 <intTestArr>\n    a97b:\te9 73 f6 ff ff       \tjmpq   9ff3 <load128>\n\n000000000000a980 <spacedload128wrapper>:\n    a980:\tf3 0f 1e fa          \tendbr64 \n    a984:\t48 8b 35 75 37 00 00 \tmov    0x3775(%rip),%rsi        # e100 <intTestArr>\n    a98b:\te9 99 f5 ff ff       \tjmpq   9f29 <spacedload128>\n\n000000000000a990 <spacedstorescalarwrapper>:\n    a990:\tf3 0f 1e fa          \tendbr64 \n    a994:\t48 8b 35 65 37 00 00 \tmov    0x3765(%rip),%rsi        # e100 <intTestArr>\n    a99b:\te9 e7 f4 ff ff       \tjmpq   9e87 <spacedstorescalar>\n\n000000000000a9a0 <load256wrapper>:\n    a9a0:\tf3 0f 1e fa          \tendbr64 \n    a9a4:\t48 8d 35 15 37 00 00 \tlea    0x3715(%rip),%rsi        # e0c0 <fpTestArr>\n    a9ab:\te9 c0 f6 ff ff       \tjmpq   a070 <load256>\n\n000000000000a9b0 <load512wrapper>:\n    a9b0:\tf3 0f 1e fa          \tendbr64 \n    a9b4:\t48 8d 
35 05 37 00 00 \tlea    0x3705(%rip),%rsi        # e0c0 <fpTestArr>\n    a9bb:\te9 19 f7 ff ff       \tjmpq   a0d9 <load512>\n\n000000000000a9c0 <store128wrapper>:\n    a9c0:\tf3 0f 1e fa          \tendbr64 \n    a9c4:\t48 8b 35 35 37 00 00 \tmov    0x3735(%rip),%rsi        # e100 <intTestArr>\n    a9cb:\t48 8d 15 6e 36 00 00 \tlea    0x366e(%rip),%rdx        # e040 <intSinkArr>\n    a9d2:\te9 93 f7 ff ff       \tjmpq   a16a <store128>\n    a9d7:\t66 0f 1f 84 00 00 00 \tnopw   0x0(%rax,%rax,1)\n    a9de:\t00 00 \n\n000000000000a9e0 <store256wrapper>:\n    a9e0:\tf3 0f 1e fa          \tendbr64 \n    a9e4:\t48 8d 15 95 36 00 00 \tlea    0x3695(%rip),%rdx        # e080 <fpSinkArr>\n    a9eb:\t48 8d 35 ce 36 00 00 \tlea    0x36ce(%rip),%rsi        # e0c0 <fpTestArr>\n    a9f2:\te9 09 f8 ff ff       \tjmpq   a200 <store256>\n    a9f7:\t66 0f 1f 84 00 00 00 \tnopw   0x0(%rax,%rax,1)\n    a9fe:\t00 00 \n\n000000000000aa00 <store512wrapper>:\n    aa00:\tf3 0f 1e fa          \tendbr64 \n    aa04:\t48 8d 15 75 36 00 00 \tlea    0x3675(%rip),%rdx        # e080 <fpSinkArr>\n    aa0b:\t48 8d 35 ae 36 00 00 \tlea    0x36ae(%rip),%rsi        # e0c0 <fpTestArr>\n    aa12:\te9 6a f8 ff ff       \tjmpq   a281 <store512>\n    aa17:\t66 0f 1f 84 00 00 00 \tnopw   0x0(%rax,%rax,1)\n    aa1e:\t00 00 \n\n000000000000aa20 <mixfmaandmem256wrapper>:\n    aa20:\tf3 0f 1e fa          \tendbr64 \n    aa24:\t48 8d 35 95 36 00 00 \tlea    0x3695(%rip),%rsi        # e0c0 <fpTestArr>\n    aa2b:\te9 03 e9 ff ff       \tjmpq   9333 <mixfmaandmem256>\n\n000000000000aa30 <mixfmaaddmem256wrapper>:\n    aa30:\tf3 0f 1e fa          \tendbr64 \n    aa34:\t48 8d 35 85 36 00 00 \tlea    0x3685(%rip),%rsi        # e0c0 <fpTestArr>\n    aa3b:\te9 be e9 ff ff       \tjmpq   93fe <mixfmaaddmem256>\n\n000000000000aa40 <measureFunction>:\n    aa40:\tf3 0f 1e fa          \tendbr64 \n    aa44:\t55                   \tpush   %rbp\n    aa45:\t48 89 f5             \tmov    %rsi,%rbp\n    aa48:\t53                   
\tpush   %rbx\n    aa49:\t48 89 fb             \tmov    %rdi,%rbx\n    aa4c:\t48 83 ec 58          \tsub    $0x58,%rsp\n    aa50:\tf3 0f 11 44 24 0c    \tmovss  %xmm0,0xc(%rsp)\n    aa56:\t48 8d 74 24 10       \tlea    0x10(%rsp),%rsi\n    aa5b:\t48 8d 7c 24 20       \tlea    0x20(%rsp),%rdi\n    aa60:\t64 48 8b 04 25 28 00 \tmov    %fs:0x28,%rax\n    aa67:\t00 00 \n    aa69:\t48 89 44 24 48       \tmov    %rax,0x48(%rsp)\n    aa6e:\t31 c0                \txor    %eax,%eax\n    aa70:\te8 db 65 ff ff       \tcallq  1050 <gettimeofday@plt>\n    aa75:\t48 89 df             \tmov    %rbx,%rdi\n    aa78:\tff d5                \tcallq  *%rbp\n    aa7a:\t48 8d 74 24 18       \tlea    0x18(%rsp),%rsi\n    aa7f:\t48 8d 7c 24 30       \tlea    0x30(%rsp),%rdi\n    aa84:\te8 c7 65 ff ff       \tcallq  1050 <gettimeofday@plt>\n    aa89:\t48 8b 74 24 38       \tmov    0x38(%rsp),%rsi\n    aa8e:\t48 2b 74 24 28       \tsub    0x28(%rsp),%rsi\n    aa93:\t48 ba cf f7 53 e3 a5 \tmovabs $0x20c49ba5e353f7cf,%rdx\n    aa9a:\t9b c4 20 \n    aa9d:\t48 89 f0             \tmov    %rsi,%rax\n    aaa0:\t48 8b 4c 24 30       \tmov    0x30(%rsp),%rcx\n    aaa5:\t48 2b 4c 24 20       \tsub    0x20(%rsp),%rcx\n    aaaa:\t48 c1 fe 3f          \tsar    $0x3f,%rsi\n    aaae:\t48 f7 ea             \timul   %rdx\n    aab1:\t48 69 c9 e8 03 00 00 \timul   $0x3e8,%rcx,%rcx\n    aab8:\t48 c1 fa 07          \tsar    $0x7,%rdx\n    aabc:\t48 29 f2             \tsub    %rsi,%rdx\n    aabf:\t48 01 d1             \tadd    %rdx,%rcx\n    aac2:\t78 5c                \tjs     ab20 <measureFunction+0xe0>\n    aac4:\t66 0f ef c9          \tpxor   %xmm1,%xmm1\n    aac8:\tf3 48 0f 2a c9       \tcvtsi2ss %rcx,%xmm1\n    aacd:\tf3 0f 5a c9          \tcvtss2sd %xmm1,%xmm1\n    aad1:\tf2 0f 59 0d 9f 15 00 \tmulsd  0x159f(%rip),%xmm1        # c078 <_IO_stdin_used+0x1078>\n    aad8:\t00 \n    aad9:\t48 85 db             \ttest   %rbx,%rbx\n    aadc:\t78 62                \tjs     ab40 <measureFunction+0x100>\n    
aade:\t66 0f ef c0          \tpxor   %xmm0,%xmm0\n    aae2:\tf3 48 0f 2a c3       \tcvtsi2ss %rbx,%xmm0\n    aae7:\tf3 0f 5a c0          \tcvtss2sd %xmm0,%xmm0\n    aaeb:\t48 8b 44 24 48       \tmov    0x48(%rsp),%rax\n    aaf0:\t64 48 33 04 25 28 00 \txor    %fs:0x28,%rax\n    aaf7:\t00 00 \n    aaf9:\tf2 0f 5e c8          \tdivsd  %xmm0,%xmm1\n    aafd:\tf3 0f 10 05 83 15 00 \tmovss  0x1583(%rip),%xmm0        # c088 <_IO_stdin_used+0x1088>\n    ab04:\t00 \n    ab05:\tf2 0f 5a c9          \tcvtsd2ss %xmm1,%xmm1\n    ab09:\tf3 0f 5e c1          \tdivss  %xmm1,%xmm0\n    ab0d:\tf3 0f 5e 44 24 0c    \tdivss  0xc(%rsp),%xmm0\n    ab13:\t75 46                \tjne    ab5b <measureFunction+0x11b>\n    ab15:\t48 83 c4 58          \tadd    $0x58,%rsp\n    ab19:\t5b                   \tpop    %rbx\n    ab1a:\t5d                   \tpop    %rbp\n    ab1b:\tc3                   \tretq   \n    ab1c:\t0f 1f 40 00          \tnopl   0x0(%rax)\n    ab20:\t48 89 c8             \tmov    %rcx,%rax\n    ab23:\t83 e1 01             \tand    $0x1,%ecx\n    ab26:\t66 0f ef c9          \tpxor   %xmm1,%xmm1\n    ab2a:\t48 d1 e8             \tshr    %rax\n    ab2d:\t48 09 c8             \tor     %rcx,%rax\n    ab30:\tf3 48 0f 2a c8       \tcvtsi2ss %rax,%xmm1\n    ab35:\tf3 0f 58 c9          \taddss  %xmm1,%xmm1\n    ab39:\teb 92                \tjmp    aacd <measureFunction+0x8d>\n    ab3b:\t0f 1f 44 00 00       \tnopl   0x0(%rax,%rax,1)\n    ab40:\t48 89 d8             \tmov    %rbx,%rax\n    ab43:\t83 e3 01             \tand    $0x1,%ebx\n    ab46:\t66 0f ef c0          \tpxor   %xmm0,%xmm0\n    ab4a:\t48 d1 e8             \tshr    %rax\n    ab4d:\t48 09 d8             \tor     %rbx,%rax\n    ab50:\tf3 48 0f 2a c0       \tcvtsi2ss %rax,%xmm0\n    ab55:\tf3 0f 58 c0          \taddss  %xmm0,%xmm0\n    ab59:\teb 8c                \tjmp    aae7 <measureFunction+0xa7>\n    ab5b:\te8 e0 64 ff ff       \tcallq  1040 <__stack_chk_fail@plt>\n\n000000000000ab60 <spacedload256wrapper>:\n    
ab60:\tf3 0f 1e fa          \tendbr64 \n    ab64:\t48 8b 35 95 35 00 00 \tmov    0x3595(%rip),%rsi        # e100 <intTestArr>\n    ab6b:\te9 b9 f3 ff ff       \tjmpq   9f29 <spacedload128>\n\n000000000000ab70 <__libc_csu_init>:\n    ab70:\tf3 0f 1e fa          \tendbr64 \n    ab74:\t41 57                \tpush   %r15\n    ab76:\t4c 8d 3d fb 31 00 00 \tlea    0x31fb(%rip),%r15        # dd78 <__init_array_start>\n    ab7d:\t41 56                \tpush   %r14\n    ab7f:\t49 89 d6             \tmov    %rdx,%r14\n    ab82:\t41 55                \tpush   %r13\n    ab84:\t49 89 f5             \tmov    %rsi,%r13\n    ab87:\t41 54                \tpush   %r12\n    ab89:\t41 89 fc             \tmov    %edi,%r12d\n    ab8c:\t55                   \tpush   %rbp\n    ab8d:\t48 8d 2d f4 31 00 00 \tlea    0x31f4(%rip),%rbp        # dd88 <__do_global_dtors_aux_fini_array_entry>\n    ab94:\t53                   \tpush   %rbx\n    ab95:\t4c 29 fd             \tsub    %r15,%rbp\n    ab98:\t48 83 ec 08          \tsub    $0x8,%rsp\n    ab9c:\te8 5f 64 ff ff       \tcallq  1000 <_init>\n    aba1:\t48 c1 fd 03          \tsar    $0x3,%rbp\n    aba5:\t74 1f                \tje     abc6 <__libc_csu_init+0x56>\n    aba7:\t31 db                \txor    %ebx,%ebx\n    aba9:\t0f 1f 80 00 00 00 00 \tnopl   0x0(%rax)\n    abb0:\t4c 89 f2             \tmov    %r14,%rdx\n    abb3:\t4c 89 ee             \tmov    %r13,%rsi\n    abb6:\t44 89 e7             \tmov    %r12d,%edi\n    abb9:\t41 ff 14 df          \tcallq  *(%r15,%rbx,8)\n    abbd:\t48 83 c3 01          \tadd    $0x1,%rbx\n    abc1:\t48 39 dd             \tcmp    %rbx,%rbp\n    abc4:\t75 ea                \tjne    abb0 <__libc_csu_init+0x40>\n    abc6:\t48 83 c4 08          \tadd    $0x8,%rsp\n    abca:\t5b                   \tpop    %rbx\n    abcb:\t5d                   \tpop    %rbp\n    abcc:\t41 5c                \tpop    %r12\n    abce:\t41 5d                \tpop    %r13\n    abd0:\t41 5e                \tpop    %r14\n    abd2:\t41 5f  
              \tpop    %r15\n    abd4:\tc3                   \tretq   \n    abd5:\t66 66 2e 0f 1f 84 00 \tdata16 nopw %cs:0x0(%rax,%rax,1)\n    abdc:\t00 00 00 00 \n\n000000000000abe0 <__libc_csu_fini>:\n    abe0:\tf3 0f 1e fa          \tendbr64 \n    abe4:\tc3                   \tretq   \n\nDisassembly of section .fini:\n\n000000000000abe8 <_fini>:\n    abe8:\tf3 0f 1e fa          \tendbr64 \n    abec:\t48 83 ec 08          \tsub    $0x8,%rsp\n    abf0:\t48 83 c4 08          \tadd    $0x8,%rsp\n    abf4:\tc3                   \tretq   \n"
  },
  {
    "path": "InstructionRate/x86_fusion.c",
    "content": "/* This is a one-off microbenchmark for attempts to figure out what\n * instructions are fused on Centaur's CNS\n */\n#include <stdio.h>\n#include <sys/time.h>\n#include <time.h>\n#include <stdint.h>\n#include <stdlib.h>\n#include <string.h>\n#include <cpuid.h>\n\n// make mingw happy for cross compiling\n#ifdef __MINGW32__\n#define aligned_alloc(align, size) _aligned_malloc(size, align)\n#endif\n\nextern uint64_t noptest(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t clktest(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t addtest(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t testfusion(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t cmpfusion(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t subfusion(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t nopfusion(uint64_t iterations) __attribute((sysv_abi));\n\nfloat fpTestArr[8] __attribute__ ((aligned (64))) = { 0.2, 1.5, 2.7, 3.14, 5.16, 6.3, 7.7, 9.45 };\nfloat fpSinkArr[8] __attribute__ ((aligned (64))) = { 2.1, 3.2, 4.3, 5.4, 6.2, 7.8, 8.3, 9.4 };\nint *intTestArr;\nint intSinkArr[8] __attribute__ ((aligned (64))) = { 2, 3, 4, 5, 6, 7, 8, 9 };\n\nfloat measureFunction(uint64_t iterations, float clockSpeedGhz, __attribute((sysv_abi)) uint64_t (*testfunc)(uint64_t));\n\nint main(int argc, char *argv[]) {\n  struct timeval startTv, endTv;\n  struct timezone startTz, endTz;\n  uint64_t iterations = 1500000000;\n  uint64_t iterationsHigh = iterations * 5;\n  uint64_t time_diff_ms;\n  float latency, opsPerNs, clockSpeedGhz;\n  uint64_t intTestArrLength = 1024;\n\n  intTestArr = aligned_alloc(64, sizeof(int) * intTestArrLength);\n  for (uint64_t i = 0; i < intTestArrLength; i++) {\n    intTestArr[i] = i;\n  }\n\n  if (argc > 2) {\n    iterationsHigh =  1500000000 * (uint64_t)atol(argv[2]);\n    printf(\"setting %lu iterations\\n\", iterationsHigh);\n  }\n\n  // figure out clock speed\n  gettimeofday(&startTv, 
&startTz);\n  clktest(iterationsHigh);\n  gettimeofday(&endTv, &endTz);\n  time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 1000);\n  latency = 1e6 * (float)time_diff_ms / (float)iterationsHigh;\n  // clk speed should be 1/latency, assuming we got one add per clk, roughly\n  clockSpeedGhz = 1/latency;\n\n  printf(\"Estimated clock speed: %.2f GHz\\n\", clockSpeedGhz);\n\n  // throughput\n  printf(\"2-byte nops per clk: %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, noptest));\n  printf(\"Adds per clk: %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, addtest));\n  printf(\"test+jnz: %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, testfusion));\n  printf(\"cmp+jnz: %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, cmpfusion));\n  printf(\"sub+jnz: %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, subfusion));\n  printf(\"nop+jnz: %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, nopfusion));\n\n  return 0;\n}\n\nfloat measureFunction(uint64_t iterations, float clockSpeedGhz,  __attribute((sysv_abi)) uint64_t (*testfunc)(uint64_t)) {\n  struct timeval startTv, endTv;\n  struct timezone startTz, endTz;\n  uint64_t time_diff_ms, retval;\n  float latency, opsPerNs;\n\n  gettimeofday(&startTv, &startTz);\n  retval = testfunc(iterations);\n  gettimeofday(&endTv, &endTz);\n  time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 1000);\n  latency = 1e6 * (float)time_diff_ms / (float)iterations;\n  opsPerNs = 1/latency;\n  //printf(\"%f adds/ns, %f adds/clk?\\n\", opsPerNs, opsPerNs / clockSpeedGhz);\n  //printf(\"return value: %lu\\n\", retval);\n  return opsPerNs / clockSpeedGhz;\n}\n"
  },
  {
    "path": "InstructionRate/x86_fusion.s",
    "content": ".text\n\n.global clktest\n.global addtest\n.global noptest\n.global testfusion\n.global cmpfusion\n.global subfusion\n.global nopfusion\n\ntestfusion:\n  push %rbx\n  push %r8\n  push %r9\n  push %r10\n  xor %rax, %rax\n  not %rax\ntestfusion_loop:\n  xor %r8, %r8\n  xor %r9, %r9\n  sub $5, %rdi\n  test %rdi, %rax\n  jnz testfusion_loop\n  pop %r10\n  pop %r9\n  pop %r8\n  pop %rbx\n  ret\n\ncmpfusion:\n  push %rbx\n  push %r8\n  push %r9\n  push %r10\n  xor %rax, %rax\ncmpfusion_loop:\n  xor %r8, %r8\n  xor %r9, %r9\n  sub $5, %rdi\n  cmp %rdi, %rax\n  jnz cmpfusion_loop\n  pop %r10\n  pop %r9\n  pop %r8\n  pop %rbx\n  ret\n\nsubfusion:\n  push %rbx\n  push %r8\n  push %r9\n  push %r10\n  xor %rax, %rax\nsubfusion_loop:\n  xor %r8, %r8\n  xor %r9, %r9\n  xor %r10, %r10\n  sub $5, %rdi\n  jnz subfusion_loop\n  pop %r10\n  pop %r9\n  pop %r8\n  pop %rbx\n  ret\n\nnopfusion:\n  push %rbx\n  push %r8\n  push %r9\n  push %r10\n  xor %rax, %rax\nnopfusion_loop:\n  sub $5, %rdi\n  nop\n  nop\n  nop\n  jnz nopfusion_loop\n  pop %r10\n  pop %r9\n  pop %r8\n  pop %rbx\n  ret\n\nclktest:\n  push %rbx\n  push %r8\n  push %r9\n  mov $1, %r8\n  mov $20, %r9\n  xor %rbx, %rbx\nclktest_loop:\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  sub %r9, %rdi\n  jnz clktest_loop\n  pop %r9\n  pop %r8\n  pop %rbx\n  ret\n\nnoptest:\n  push %rbx\n  push %r9\n  mov $20, %r9\nnoptest_loop:\n  xchg %ax,%ax\n  xchg %ax,%ax\n  xchg %ax,%ax\n  xchg %ax,%ax\n  xchg %ax,%ax\n  xchg %ax,%ax\n  xchg %ax,%ax\n  xchg %ax,%ax\n  xchg %ax,%ax\n  xchg %ax,%ax\n  xchg %ax,%ax\n  xchg %ax,%ax\n  xchg %ax,%ax\n  xchg %ax,%ax\n  xchg %ax,%ax\n  xchg %ax,%ax\n  xchg %ax,%ax\n  xchg %ax,%ax\n  xchg 
%ax,%ax\n  sub %r9, %rdi\n  jnz noptest_loop\n  pop %r9\n  pop %rbx\n  ret\n\naddtest:\n  push %rbx\n  push %rcx\n  push %r8\n  push %r9\n  push %r10\n  push %r11\n  push %r12\n  push %r13\n  push %r14\n  push %r15\n  mov $1, %r8\n  mov $20, %r9\n  xor %rbx, %rbx\n  xor %rcx, %rcx\n  xor %r10, %r10\n  xor %r11, %r11\n  xor %r12, %r12\n  xor %r13, %r13\n  xor %r14, %r14\n  xor %r15, %r15\naddtest_loop:\n  add %r8, %r15\n  add %r8, %r14\n  add %r8, %r13\n  add %r8, %r12\n  add %r8, %r11\n  add %r8, %r10\n  add %r8, %rcx\n  add %r8, %r15\n  add %r8, %r14\n  add %r8, %r13\n  add %r8, %r12\n  add %r8, %r11\n  add %r8, %r10\n  add %r8, %rcx\n  add %r8, %r15\n  add %r8, %r14\n  add %r8, %r13\n  add %r8, %r12\n  add %r8, %r11\n  add %r8, %r10\n  sub %r9, %rdi\n  jnz addtest_loop\n  pop %r15\n  pop %r14\n  pop %r13\n  pop %r12\n  pop %r11\n  pop %r10\n  pop %r9\n  pop %r8\n  pop %rcx\n  pop %rbx\n  ret\n"
  },
  {
    "path": "InstructionRate/x86_instructionrate.c",
    "content": "/* This is a one-off microbenchmark for attempts to dissect\n * Zhaoxin's KX-6640MA (LuJiaZui) architecture\n */\n#include <stdio.h>\n#include <sys/time.h>\n#include <time.h>\n#include <stdint.h>\n#include <stdlib.h>\n#include <string.h>\n#include <cpuid.h>\n#include <pthread.h>\n#include <xmmintrin.h>\n#include <pmmintrin.h>\n\n// make mingw happy for cross compiling\n#ifdef __MINGW32__\n#define aligned_alloc(align, size) _aligned_malloc(size, align)\n#endif\n\nextern uint64_t noptest(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t noptest1b(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t clktest(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t clkmovtest(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t addtest(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t addnoptest(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t addmovtest(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t leatest(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t leamultest(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t rortest(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t shltest(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t rorbtstest(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t mixrormultest(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t mixrorshltest(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t btstest(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t btsmultest(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t addmultest(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t addjmptest(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t jmpmultest(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t jmptest(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t ntjmptest(uint64_t iterations) 
__attribute((sysv_abi));\nextern uint64_t mixadd256int(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t mixadd256int11(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t mixadd256fpint(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t mix256fp(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t mix256fp11(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t latadd512int(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t latadd256int(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t latadd128int(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t latadd256fp(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t latmul128int(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t latmul256int(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t latmul512int(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t latmulq512int(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t latmuldq512int(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t latmul256fp(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t latadd128fp(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t latmul128fp(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t latfma512(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t latfma256(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t latfma128(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t add128int(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t add256int(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t add512int(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t mul512int(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t muldq512int(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t mul128int(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t 
add128fp(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t mul128fp(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t fma512(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t mixfma256fma512(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t mix21fma256fma512(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t fma256(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t fma128(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t mixfmafadd256(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t mixfmaadd256(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t mixfmaadd512(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t mixfma512add256(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t mixfmaand256(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t mixfmaandmem256(uint64_t iterations, float *arr) __attribute((sysv_abi));\nextern uint64_t mixfmaaddmem256(uint64_t iterations, float *arr) __attribute((sysv_abi));\nextern uint64_t nemesfpumix21(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t nemesfpu512mix21(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t mul256fp(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t add256fp(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t latmul64(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t latmul16(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t mul16(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t mul64(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t load128(uint64_t iterations, int *arr) __attribute((sysv_abi));\nextern uint64_t spacedload128(uint64_t iterations, int *arr) __attribute((sysv_abi));\nextern uint64_t loadscalar(uint64_t iterations, int *arr) __attribute((sysv_abi));\nextern uint64_t mixedscalarloadstore(uint64_t iterations, int *arr) __attribute((sysv_abi));\nextern 
uint64_t load256(uint64_t iterations, float *arr) __attribute((sysv_abi));\nextern uint64_t load512(uint64_t iterations, float *arr) __attribute((sysv_abi));\nextern uint64_t store128(uint64_t iterations, int *arr, int *sink) __attribute((sysv_abi));\nextern uint64_t store256(uint64_t iterations, float *arr, float *sink) __attribute((sysv_abi));\nextern uint64_t store512(uint64_t iterations, float *arr, float *sink) __attribute((sysv_abi));\nextern uint64_t mixaddmul128int(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t mixmul16mul64(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t mixmul16mul64_21(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t pdeptest(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t pdepmultest(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t pexttest(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t indepmovtest(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t vecindepmovtest(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t depmovtest(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t vecdepmovtest(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t xorzerotest(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t vecxorzerotest(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t movzerotest(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t subzerotest(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t vecsubzerotest(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t depinctest(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t depdectest(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t depaddimmtest(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t memrenametest(uint64_t iterations, int *arr) __attribute((sysv_abi));\nextern uint64_t spacedstorescalar(uint64_t iterations, int *arr) __attribute((sysv_abi));\nextern 
uint64_t aesenc128(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t aesdec128(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t aesencfadd128(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t aesencadd128(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t aesencfma128(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t aesencmul128(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t mix256faddintadd(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t movqtoxmmtest(uint64_t iterations) __attribute((sysv_abi));\n\nextern uint64_t fma4_256(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t fma4_128(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t fdivtest(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t fdivlattest(uint64_t iterations) __attribute((sysv_abi));\nextern uint64_t fmuldenormtest(uint64_t iterations) __attribute((sysv_abi)); \nextern uint64_t fmuldenormlattest(uint64_t iterations) __attribute((sysv_abi)); \n\nfloat fpTestArr[8] __attribute__ ((aligned (64))) = { 0.2, 1.5, 2.7, 3.14, 5.16, 6.3, 7.7, 9.45 };\nfloat fpSinkArr[8] __attribute__ ((aligned (64))) = { 2.1, 3.2, 4.3, 5.4, 6.2, 7.8, 8.3, 9.4 };\nint *intTestArr;\nint intSinkArr[8] __attribute__ ((aligned (64))) = { 2, 3, 4, 5, 6, 7, 8, 9 };\n\nuint64_t load128wrapper(uint64_t iterations) __attribute((sysv_abi));\nuint64_t loadscalarwrapper(uint64_t iterations) __attribute((sysv_abi));\nuint64_t mixedscalarloadstorewrapper(uint64_t iterations) __attribute((sysv_abi));\nuint64_t spacedload128wrapper(uint64_t iterations) __attribute((sysv_abi));\nuint64_t spacedstorescalarwrapper(uint64_t iterations) __attribute((sysv_abi));\nuint64_t load256wrapper(uint64_t iterations) __attribute((sysv_abi));\nuint64_t load512wrapper(uint64_t iterations) __attribute((sysv_abi));\nuint64_t store128wrapper(uint64_t iterations) __attribute((sysv_abi));\nuint64_t store256wrapper(uint64_t 
iterations) __attribute((sysv_abi));\nuint64_t store512wrapper(uint64_t iterations) __attribute((sysv_abi));\nuint64_t mixfmaandmem256wrapper(uint64_t iterations)  __attribute((sysv_abi));\nuint64_t mixfmaaddmem256wrapper(uint64_t iterations)  __attribute((sysv_abi));\nuint64_t memrenamewrapper(uint64_t iterations) __attribute((sysv_abi));\n\nfloat measureFunction(uint64_t iterations, float clockSpeedGhz, __attribute((sysv_abi)) uint64_t (*testfunc)(uint64_t));\n\nint threads = 0;\n\nint main(int argc, char *argv[]) {\n  struct timeval startTv, endTv;\n  struct timezone startTz, endTz;\n  uint64_t iterations = 1500000000;\n  uint64_t iterationsHigh = iterations * 5;\n  uint64_t time_diff_ms;\n  float latency, opsPerNs, clockSpeedGhz;\n  uint64_t intTestArrLength = 1024;\n  int avxSupported = 0, avx2Supported = 0, bmi2Supported = 0, avx512Supported = 0;\n  int fmaSupported = 0, fma4Supported = 0;\n  char *testName = NULL;\n\n  if (argc > 1) {\n      for (int argIdx = 1; argIdx < argc; argIdx++) {\n          if (*(argv[argIdx]) == '-') {\n              char *arg = argv[argIdx] + 1;\n              if (strncmp(arg, \"threads\", 7) == 0) {\n                  argIdx++;\n                  threads = atoi(argv[argIdx]);\n                  fprintf(stderr, \"Multithreading mode, %d threads\\n\", threads);\n              } else if (strncmp(arg, \"iter\", 4) == 0) {\n                  argIdx++;\n                  int iterMul = atoi(argv[argIdx]);\n                  iterations *= iterMul;\n                  iterationsHigh *= iterMul;\n                  fprintf(stderr, \"Scaled iterations by %d\\n\", iterMul);\n              } else if (strncmp(arg, \"test\", 4) == 0) {\n                  argIdx++;\n                  testName = argv[argIdx];\n                  fprintf(stderr, \"Only running test %s\\n\", testName);\n              }\n          }\n      }\n  }\n\n  intTestArr = aligned_alloc(64, sizeof(int) * intTestArrLength);\n  for (uint64_t i = 0; i < intTestArrLength; i++) {\n  
  intTestArr[i] = i;\n  }\n\n  if (__builtin_cpu_supports(\"avx\")) {\n    fprintf(stderr, \"avx supported\\n\");\n    avxSupported = 1;\n  }\n\n  if (__builtin_cpu_supports(\"avx2\")) {\n    fprintf(stderr, \"avx2 supported\\n\");\n    avx2Supported = 1;\n  }\n\n  if (__builtin_cpu_supports(\"bmi2\")) {\n    fprintf(stderr, \"bmi2 supported\\n\");\n    bmi2Supported = 1;\n  }\n\n  if (__builtin_cpu_supports(\"fma\")) {\n      fprintf(stderr, \"fma3 supported\\n\");\n      fmaSupported = 1;\n  }\n\n  if (__builtin_cpu_supports(\"fma4\")) {\n      fprintf(stderr, \"fma4 supported\\n\");\n      fma4Supported = 1;\n  }\n\n  uint32_t cpuidEax, cpuidEbx, cpuidEcx, cpuidEdx;\n  __cpuid_count(7, 0, cpuidEax, cpuidEbx, cpuidEcx, cpuidEdx);\n  if (cpuidEbx & (1UL << 16)) {\n      fprintf(stderr, \"AVX512 supported\\n\");\n      avx512Supported = 1;\n  }\n\n  // figure out clock speed\n  gettimeofday(&startTv, &startTz);\n  clktest(iterationsHigh);\n  gettimeofday(&endTv, &endTz);\n  time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 1000);\n  latency = 1e6 * (float)time_diff_ms / (float)iterationsHigh;\n  // clk speed should be 1/latency, assuming we got one add per clk, roughly\n  clockSpeedGhz = 1/latency;\n\n  printf(\"Estimated clock speed: %.2f GHz\\n\", clockSpeedGhz);\n\n  // avx-512 testing\n  if (avx512Supported) {\n    if (testName == NULL || argc > 1 && strncmp(argv[1], \"fma512\", 6) == 0)\n      printf(\"512-bit FMA per clk: %.2f\\n\", measureFunction(iterations, clockSpeedGhz, fma512));\n    if (testName == NULL || argc > 1 && strncmp(argv[1], \"latfma512\", 9) == 0)\n      printf(\"512-bit FMA latency: %.2f clocks\\n\", 1 / measureFunction(iterations, clockSpeedGhz, latfma512));\n    if (testName == NULL || argc > 1 && strncmp(argv[1], \"mixfma256fma512\", 15) == 0)\n      printf(\"1:1 256-bit/512-bit FMA per clk: %.2f\\n\", measureFunction(iterations, clockSpeedGhz, mixfma256fma512));\n    if (testName == NULL || 
argc > 1 && strncmp(argv[1], \"mix21fma256fma512\", 17) == 0)\n      printf(\"2:1 256-bit/512-bit FMA per clk: %.2f\\n\", measureFunction(iterations, clockSpeedGhz, mix21fma256fma512));\n    if (testName == NULL || argc > 1 && strncmp(argv[1], \"nemesfpumix21\", 13) == 0)\n      printf(\"1:2 512b FMA:FADD per clk (nemes): %.2f\\n\", measureFunction(iterations * 22, clockSpeedGhz, nemesfpu512mix21));\n    if (testName == NULL || argc > 1 && strncmp(argv[1], \"add512int\", 9) == 0)\n      printf(\"512-bit int add per clk: %.2f\\n\", measureFunction(iterations, clockSpeedGhz, add512int));\n    if (testName == NULL || argc > 1 && strncmp(argv[1], \"latadd512int\", 12) == 0)\n      printf(\"512-bit int add latency: %.2f clocks\\n\", 1 / measureFunction(iterationsHigh, clockSpeedGhz, latadd256int));\n    if (testName == NULL || argc > 1 && strncmp(argv[1], \"mul512int\", 9) == 0)\n      printf(\"512-bit 32-bit int mul per clk: %.2f\\n\", measureFunction(iterations, clockSpeedGhz, mul512int));\n    if (testName == NULL || argc > 1 && strncmp(argv[1], \"muldq512int\", 9) == 0)\n      printf(\"512-bit 32->64-bit int mul per clk: %.2f\\n\", measureFunction(iterations, clockSpeedGhz, muldq512int));\n    if (testName == NULL || argc > 1 && strncmp(argv[1], \"latmulq512int\", 13) == 0)\n      printf(\"512-bit 64-bit int mul latency: %.2f clocks\\n\", 1 / measureFunction(iterationsHigh, clockSpeedGhz, latmulq512int));\n    if (testName == NULL || argc > 1 && strncmp(argv[1], \"latmul512int\", 12) == 0)\n      printf(\"512-bit 32-bit int mul latency: %.2f clocks\\n\", 1 / measureFunction(iterationsHigh, clockSpeedGhz, latmul512int));\n    if (testName == NULL || argc > 1 && strncmp(argv[1], \"latmuldq512int\", 13) == 0)\n      printf(\"512-bit 32->64-bit int mul latency: %.2f clocks\\n\", 1 / measureFunction(iterationsHigh, clockSpeedGhz, latmuldq512int));\n    if (testName == NULL || argc > 1 && strncmp(argv[1], \"mixfmaadd512\", 11) == 0)\n      printf(\"1:2 512b PADDQ:FMA per 
clk: %.2f\\n\", measureFunction(iterations * 22, clockSpeedGhz, mixfmaadd512));\n    if (testName == NULL || argc > 1 && strncmp(argv[1], \"mixfma512add256\", 11) == 0)\n      printf(\"1:2 256b PADDQ : 512b FMA per clk: %.2f\\n\", measureFunction(iterations * 22, clockSpeedGhz, mixfma512add256));\n\n\n    if (testName == NULL || argc > 1 && strncmp(argv[1], \"load512\", 7) == 0)\n      printf(\"512-bit loads per clk: %.2f\\n\", measureFunction(iterations, clockSpeedGhz, load512wrapper));\n    if (testName == NULL || argc > 1 && strncmp(argv[1], \"store512\", 7) == 0)\n      printf(\"512-bit stores per clk: %.2f\\n\", measureFunction(iterations, clockSpeedGhz, store512wrapper));\n\n    if (testName == NULL || argc > 1 && strncmp(argv[1], \"aesenc128\", 9) == 0)\n      printf(\"aesenc per clk: %.2f\\n\", measureFunction(iterations, clockSpeedGhz, aesenc128));\n    if (testName == NULL || argc > 1 && strncmp(argv[1], \"aesdec128\", 9) == 0)\n      printf(\"aesdec per clk: %.2f\\n\", measureFunction(iterations, clockSpeedGhz, aesdec128));\n    if (testName == NULL || argc > 1 && strncmp(argv[1], \"aesencadd128\", 12) == 0)\n      printf(\"1:3 aesenc+paddd per clk: %.2f\\n\", measureFunction(iterations, clockSpeedGhz, aesencadd128));\n    if (testName == NULL || argc > 1 && strncmp(argv[1], \"aesencfma128\", 12) == 0)\n      printf(\"1:2 aesenc+fma per clk: %.2f\\n\", measureFunction(iterations, clockSpeedGhz, aesencfma128));\n    if (testName == NULL || argc > 1 && strncmp(argv[1], \"aesencmul128\", 12) == 0)\n      printf(\"1:2 aesenc+pmullw per clk: %.2f\\n\", measureFunction(iterations, clockSpeedGhz, aesencmul128));\n    if (testName == NULL || argc > 1 && strncmp(argv[1], \"aesencmul128\", 12) == 0)\n      printf(\"1:2 aesenc+addps per clk: %.2f\\n\", measureFunction(iterations, clockSpeedGhz, aesencfadd128));\n  }\n\n  // throughput\n  if (testName == NULL || argc > 1 && strncmp(argv[1], \"1bnop\", 5) == 0)\n    printf(\"1-byte nops per clk: %.2f\\n\", 
measureFunction(iterationsHigh, clockSpeedGhz, noptest1b));\n  if (testName == NULL || argc > 1 && strncmp(argv[1], \"2bnop\", 5) == 0)\n    printf(\"2-byte nops per clk: %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, noptest));\n  if (testName == NULL || argc > 1 && strncmp(argv[1], \"add\", 3) == 0)\n    printf(\"Adds per clk: %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, addtest));\n  if (testName == NULL || argc > 1 && strncmp(argv[1], \"addnop\", 7) == 0)\n    printf(\"1:4 nops/adds per clk: %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, addnoptest));\n  if (testName == NULL || argc > 1 && strncmp(argv[1], \"addmov\", 7) == 0)\n    printf(\"1:4 movs/adds per clk: %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, addnoptest));\n\n  // renamer throughput\n  printf(\"--- Renamer tests ---\\n\");\n  if (testName == NULL || argc > 1 && strncmp(argv[1], \"depmov\", 6) == 0)\n    printf(\"Dependent movs per clk: %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, depmovtest));\n  if (testName == NULL || argc > 1 && strncmp(argv[1], \"indepmov\", 8) == 0)\n    printf(\"Independent movs per clk: %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, indepmovtest));\n  if (testName == NULL || argc > 1 && strncmp(argv[1], \"xorzero\", 7) == 0)\n    printf(\"xor -> 0 per clk: %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, xorzerotest));\n  if (testName == NULL || argc > 1 && strncmp(argv[1], \"movzero\", 7) == 0)\n    printf(\"mov -> 0 per clk: %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, movzerotest));\n  if (testName == NULL || argc > 1 && strncmp(argv[1], \"subzero\", 7) == 0)\n    printf(\"sub -> 0 per clk: %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, subzerotest));\n  if (testName == NULL || argc > 1 && strncmp(argv[1], \"depinc\", 6) == 0)\n    printf(\"dep inc per clk: %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, depinctest));\n  if (testName == NULL || argc > 1 && 
strncmp(argv[1], \"depdec\", 6) == 0)\n    printf(\"dep dec per clk: %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, depdectest));\n  if (testName == NULL || argc > 1 && strncmp(argv[1], \"depdec\", 6) == 0)\n    printf(\"dep add immediate per clk: %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, depaddimmtest));\n  if (testName == NULL || argc > 1 && strncmp(argv[1], \"clkmov\", 6) == 0)\n    printf(\"dep add + mov pair per clk: %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, clkmovtest));\n  if (testName == NULL || argc > 1 && strncmp(argv[1], \"vecdepmov\", 9) == 0)\n    printf(\"Dependent vec movs per clk: %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, vecdepmovtest));\n  if (testName == NULL || argc > 1 && strncmp(argv[1], \"vecindepmov\", 12) == 0)\n    printf(\"Independent vec movs per clk: %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, vecindepmovtest));\n  if (testName == NULL || argc > 1 && strncmp(argv[1], \"vecxorzero\", 10) == 0)\n    printf(\"xor xmm -> 0 per clk: %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, vecxorzerotest));\n  if (testName == NULL || argc > 1 && strncmp(argv[1], \"vecsubzero\", 10) == 0)\n    printf(\"sub xmm -> 0 per clk: %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, vecsubzerotest));\n  if (testName == NULL || argc > 1 && strncmp(argv[1], \"memrename\", 9) == 0) \n    printf(\"mov -> [r] -> mov latency: %.2f\\n\", 1 / measureFunction(iterations, clockSpeedGhz, memrenamewrapper));\n  // misc mixed integer tests\n  if (testName == NULL || argc > 1 && strncmp(argv[1], \"miximuladd\", 10) == 0)\n    printf(\"4:1 adds/imul per clk: %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, addmultest));\n  if (testName == NULL || argc > 1 && strncmp(argv[1], \"jmpmul\", 6) == 0)\n    printf(\"1:1 mul/jmp per clk: %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, jmpmultest));\n  if (testName == NULL || argc > 1 && strncmp(argv[1], \"addjmp\", 6) == 0)\n  
  printf(\"3:1 add/jmp per clk: %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, addjmptest));\n  if (testName == NULL || argc > 1 && strncmp(argv[1], \"jmp\", 3) == 0)\n    printf(\"taken jmp per clk: %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, jmptest));\n  if (testName == NULL || argc > 1 && strncmp(argv[1], \"ntjmp\", 5) == 0)\n    printf(\"nt jmp per clk: %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, ntjmptest));\n  if (bmi2Supported && (testName == NULL || argc > 1 && strncmp(argv[1], \"pdep\", 4) == 0))\n    printf(\"pdep per clk: %.4f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, pdeptest));\n  if (bmi2Supported && (testName == NULL || argc > 1 && strncmp(argv[1], \"pext\", 4) == 0))\n    printf(\"pext per clk: %.4f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, pexttest));\n  if (bmi2Supported && (testName == NULL || argc > 1 && strncmp(argv[1], \"pdepmul\", 7) == 0))\n    printf(\"1:1 pdep/mul per clk: %.4f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, pdepmultest));\n  if (testName == NULL || argc > 1 && strncmp(argv[1], \"shl\", 3) == 0)\n    printf(\"shl r,1 per clk: %.4f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, shltest));\n  if (testName == NULL || argc > 1 && strncmp(argv[1], \"ror\", 3) == 0)\n    printf(\"ror r,1 per clk: %.4f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, rortest));\n  if (testName == NULL || argc > 1 && strncmp(argv[1], \"mixrorshl\", 9) == 0)\n    printf(\"1:1 shl/ror r,1 per clk: %.4f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, mixrorshltest));\n  if (testName == NULL || argc > 1 && strncmp(argv[1], \"mixrormul\", 3) == 0)\n    printf(\"1:1 ror/mul per clk: %.4f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, mixrormultest));\n  if (testName == NULL || argc > 1 && strncmp(argv[1], \"bts\", 3) == 0)\n    printf(\"bts per clk: %.4f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, btstest));\n  if (testName == NULL || argc > 1 && 
strncmp(argv[1], \"mixmulbts\", 9) == 0)\n    printf(\"1:1 bts/mul per clk: %.4f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, btsmultest));\n  if (testName == NULL || argc > 1 && strncmp(argv[1], \"mixrorbts\", 9) == 0)\n    printf(\"1:1 bts/ror per clk: %.4f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, rorbtstest));\n  if (testName == NULL || argc > 1 && strncmp(argv[1], \"lea\", 3) == 0)\n    printf(\"lea r+r*8 per clk: %.4f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, leatest));\n  if (testName == NULL || argc > 1 && strncmp(argv[1], \"mixmullea\", 9) == 0)\n    printf(\"1:1 lea r+r*8/mul per clk: %.4f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, leamultest));\n\n  // vector and FP\n  if (testName == NULL || argc > 1 && strncmp(argv[1], \"fdiv\", 4) == 0) \n    printf(\"divss per clk: %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, fdivtest));\n  if (testName == NULL || argc > 1 && strncmp(argv[1], \"latfdiv\", 7) == 0)\n    printf(\"divss latency: %.2f\\n\", 1 / measureFunction(iterationsHigh, clockSpeedGhz, fdivlattest));\n  if (avx2Supported && (testName == NULL || argc > 1 && strncmp(argv[1], \"avx256int\", 9) == 0))\n    printf(\"256-bit avx integer add per clk: %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, add256int));\n  if (avx2Supported && (testName == NULL || argc > 1 && strncmp(argv[1], \"mixavx256int\", 12) == 0))\n    printf(\"2:1 scalar add/256-bit avx integer add per clk: %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, mixadd256int));\n  if (avx2Supported && (testName == NULL || argc > 1 && strncmp(argv[1], \"mix11avx256int\", 14) == 0))\n    printf(\"1:1 scalar add/256-bit avx integer add per clk: %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, mixadd256int11));\n  if (avx2Supported && (testName == NULL || argc > 1 && strncmp(argv[1], \"mixavx256fpint\", 14) == 0))\n    printf(\"1:1 256-bit avx int add/avx fadd per clk: %.2f\\n\", measureFunction(iterationsHigh, 
clockSpeedGhz, mixadd256fpint));\n  if (avxSupported && (testName == NULL || argc > 1 && strncmp(argv[1], \"mix256fp\", 8) == 0))\n    printf(\"1:1 256-bit avx fp mul/add per clk: %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, mix256fp));\n  if (avx2Supported && (testName == NULL || argc > 1 && strncmp(argv[1], \"latadd256int\", 12) == 0))\n    printf(\"256-bit avx2 integer add latency: %.2f clocks\\n\", 1 / measureFunction(iterationsHigh, clockSpeedGhz, latadd256int));\n  if (avx2Supported && (testName == NULL || argc > 1 && strncmp(argv[1], \"latmul256int\", 12) == 0))\n    printf(\"256-bit avx2 integer multiply latency: %.2f clocks\\n\", 1 / measureFunction(iterations, clockSpeedGhz, latmul256int));\n  if (testName == NULL || argc > 1 && strncmp(argv[1], \"latadd128int\", 12) == 0)\n    printf(\"128-bit sse integer add latency: %.2f clocks\\n\", 1 / measureFunction(iterationsHigh, clockSpeedGhz, latadd128int));\n  if (testName == NULL || argc > 1 && strncmp(argv[1], \"latmul128int\", 12) == 0)\n    printf(\"128-bit sse integer multiply latency: %.2f clocks\\n\", 1 / measureFunction(iterations, clockSpeedGhz, latmul128int));\n  if (avxSupported && (testName == NULL || argc > 1 && strncmp(argv[1], \"latadd256fp\", 11) == 0))\n    printf(\"256-bit avx fadd latency: %.2f clocks\\n\", 1 / measureFunction(iterations, clockSpeedGhz, latadd256fp));\n  if (avxSupported && (testName == NULL || argc > 1 && strncmp(argv[1], \"latmul256fp\", 11) == 0))\n    printf(\"256-bit avx fmul latency: %.2f clocks\\n\", 1 / measureFunction(iterations, clockSpeedGhz, latmul256fp));\n  if (testName == NULL || argc > 1 && strncmp(argv[1], \"latadd128fp\", 11) == 0)\n    printf(\"128-bit sse fadd latency: %.2f clocks\\n\", 1 / measureFunction(iterations, clockSpeedGhz, latadd128fp));\n  if (testName == NULL || argc > 1 && strncmp(argv[1], \"latmul128fp\", 11) == 0)\n    printf(\"128-bit sse fmul latency: %.2f clocks\\n\", 1 / measureFunction(iterations, clockSpeedGhz, 
latmul128fp));\n  if (testName == NULL || argc > 1 && strncmp(argv[1], \"add128fp\", 8) == 0)\n    printf(\"128-bit sse fadd per clk: %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, add128fp));\n  if (testName == NULL || argc > 1 && strncmp(argv[1], \"mul128fp\", 8) == 0)\n    printf(\"128-bit sse fmul per clk: %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, mul128fp));\n  if (testName == NULL || argc > 1 && strncmp(argv[1], \"add128int\", 9) == 0)\n    printf(\"128-bit sse int add per clk: %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, add128int));\n  if (testName == NULL || argc > 1 && strncmp(argv[1], \"mul128int\", 9) == 0)\n    printf(\"128-bit sse int mul per clk: %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, mul128int));\n\n  // set no ftz or daz\n  _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF);\n  _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_OFF);\n  if (argc == 1 || argc > 1 && strncmp(argv[1], \"fmuldenorm\", 10) == 0)  {\n    float denormTp = measureFunction(iterations, clockSpeedGhz, fmuldenormtest);\n    printf(\"Scalar FP32 multiply -> denorm per clk: %.2f (%.2f recip)\\n\", denormTp, 1/denormTp);\n  }\n\n  _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);\n  _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); \n  if (argc == 1 || argc > 1 && strncmp(argv[1], \"fmuldenormftz\", 13) == 0) {\n    printf(\"Scalar FP32 multiply -> denorm (ftz/daz) per clk: %.2f\\n\", measureFunction(iterationsHigh, clockSpeedGhz, fmuldenormtest)); \n  }\n\n  if (fmaSupported) {\n      if (avx2Supported && (testName == NULL || argc > 1 && strncmp(argv[1], \"fma256\", 6) == 0))\n          printf(\"256-bit FMA per clk: %.2f\\n\", measureFunction(iterations, clockSpeedGhz, fma256));\n      if (avx2Supported && (testName == NULL || argc > 1 && strncmp(argv[1], \"fma128\", 6) == 0))\n          printf(\"128-bit FMA per clk: %.2f\\n\", measureFunction(iterations, clockSpeedGhz, fma128));\n      if (avx2Supported && (testName == NULL || argc > 
1 && strncmp(argv[1], \"latfma256\", 9) == 0))\n          printf(\"256-bit FMA latency: %.2f clocks\\n\", 1 / measureFunction(iterations, clockSpeedGhz, latfma256));\n      if (avx2Supported && (testName == NULL || argc > 1 && strncmp(argv[1], \"latfma128\", 9) == 0))\n          printf(\"128-bit FMA latency: %.2f clocks\\n\", 1 / measureFunction(iterations, clockSpeedGhz, latfma128));\n      if (avx2Supported && (testName == NULL || argc > 1 && strncmp(argv[1], \"mixfmafadd256\", 12) == 0))\n          printf(\"1:2 256b FMA:FADD per clk: %.2f\\n\", measureFunction(iterations * 22, clockSpeedGhz, mixfmafadd256));\n      if (avx2Supported && (testName == NULL || argc > 1 && strncmp(argv[1], \"mixfmaadd256\", 11) == 0))\n          printf(\"2:1 256b FMA:PADDQ per clk: %.2f\\n\", measureFunction(iterations * 22, clockSpeedGhz, mixfmaadd256));\n      if (avx2Supported && (testName == NULL || argc > 1 && strncmp(argv[1], \"mixfmaandmem256\", 14) == 0))\n          printf(\"2:1 256b FMA:PADDQ load-op per clk: %.2f\\n\", measureFunction(iterations * 22, clockSpeedGhz, mixfmaaddmem256wrapper));\n      if (avx2Supported && (testName == NULL || argc > 1 && strncmp(argv[1], \"mixfmaand256\", 11) == 0))\n          printf(\"2:1 256b FMA:PAND per clk: %.2f\\n\", measureFunction(iterations * 22, clockSpeedGhz, mixfmaand256));\n      if (avx2Supported && (testName == NULL || argc > 1 && strncmp(argv[1], \"mixfmaandmem256\", 14) == 0))\n          printf(\"2:1 256b FMA:PAND load-op per clk: %.2f\\n\", measureFunction(iterations * 22, clockSpeedGhz, mixfmaandmem256wrapper));\n      if (avx2Supported && (testName == NULL || argc > 1 && strncmp(argv[1], \"nemesfpumix21\", 13) == 0))\n          printf(\"1:2 256b FMA:FADD per clk (nemes): %.2f\\n\", measureFunction(iterations * 22, clockSpeedGhz, nemesfpumix21));\n      if (avx2Supported && (testName == NULL || argc > 1 && strncmp(argv[1], \"mix256faddintadd\", 15) == 0))\n          printf(\"1:2 256b FMA:PADD per clk: %.2f\\n\", 
measureFunction(iterations, clockSpeedGhz, mix256faddintadd));\n  }\n\n  if (fma4Supported)\n  {\n      if (testName == NULL || argc > 1 && strncmp(argv[1], \"fma4_256\", 8) == 0)\n          printf(\"256-bit FMA4 per clk: %.2f\\n\", measureFunction(iterations, clockSpeedGhz, fma4_256));\n      if (testName == NULL || argc > 1 && strncmp(argv[1], \"fma4_256\", 8) == 0)\n          printf(\"128-bit FMA4 per clk: %.2f\\n\", measureFunction(iterations, clockSpeedGhz, fma4_128));\n  }\n\n  if (avx2Supported && (testName == NULL || argc > 1 && strncmp(argv[1], \"fadd256\", 6) == 0))\n    printf(\"256-bit FADD per clk: %.2f\\n\", measureFunction(iterations, clockSpeedGhz, add256fp));\n  if (avx2Supported && (testName == NULL || argc > 1 && strncmp(argv[1], \"fmul256\", 6) == 0))\n    printf(\"256-bit FMUL per clk: %.2f\\n\", measureFunction(iterations, clockSpeedGhz, mul256fp));\n\n  if (testName == NULL || argc > 1 && strncmp(argv[1], \"movqtoxmm\", 9) == 0) \n    printf(\"MOVQ GPR <-> XMM: %.2f clocks\\n\", 1 / measureFunction(iterations, clockSpeedGhz, movqtoxmmtest));\n\n  // integer multiply. 
zhaoxin appears to handle 16-bit and 64-bit multiplies differently\n  // unlike Intel/AMD CPUs that behave similarly regardless of register width\n  if (testName == NULL || argc > 1 && strncmp(argv[1], \"latmul16\", 8) == 0)\n    printf(\"16-bit imul latency: %.2f clocks\\n\", 1 / measureFunction(iterations, clockSpeedGhz, latmul16));\n  if (testName == NULL || argc > 1 && strncmp(argv[1], \"latmul64\", 8) == 0)\n    printf(\"64-bit imul latency: %.2f clocks\\n\", 1 / measureFunction(iterations, clockSpeedGhz, latmul64));\n  if (testName == NULL || argc > 1 && strncmp(argv[1], \"mul16\", 5) == 0)\n    printf(\"16-bit imul per clk: %.2f\\n\", measureFunction(iterations, clockSpeedGhz, mul16));\n  if (testName == NULL || argc > 1 && strncmp(argv[1], \"mul64\", 5) == 0)\n    printf(\"64-bit imul per clk: %.2f\\n\", measureFunction(iterations, clockSpeedGhz, mul64));\n  if (testName == NULL || argc > 1 && strncmp(argv[1], \"mixmul16mul64\", 5) == 0)\n    printf(\"1:1 mixed 16-bit/64-bit imul per clk: %.2f\\n\", measureFunction(iterations, clockSpeedGhz, mixmul16mul64));\n  if (testName == NULL || argc > 1 && strncmp(argv[1], \"mix21mul16mul64\", 5) == 0)\n    printf(\"2:1 mixed 16-bit/64-bit imul per clk: %.2f\\n\", measureFunction(iterations, clockSpeedGhz, mixmul16mul64_21));\n\n  // load/store\n  if (testName == NULL || argc > 1 && strncmp(argv[1], \"loadscalar\", 10) == 0)\n    printf(\"64-bit scalar loads per clk: %.2f\\n\", measureFunction(iterations, clockSpeedGhz, loadscalarwrapper));\n  if (testName == NULL || argc > 1 && strncmp(argv[1], \"mixedscalarloadstore\", 20) == 0)\n    printf(\"2:1 64-bit scalar loads:stores per clk: %.2f\\n\", measureFunction(iterations, clockSpeedGhz, mixedscalarloadstorewrapper));\n  if (testName == NULL || argc > 1 && strncmp(argv[1], \"load128\", 7) == 0)\n    printf(\"128-bit loads per clk: %.2f\\n\", measureFunction(iterations, clockSpeedGhz, load128wrapper));\n  if (testName == NULL || argc > 1 && strncmp(argv[1], 
\"spacedload128\", 13) == 0)\n    printf(\"128-bit loads (spaced) per clk: %.2f\\n\", measureFunction(iterations, clockSpeedGhz, spacedload128wrapper));\n  if (avxSupported && (testName == NULL || argc > 1 && strncmp(argv[1], \"load256\", 7) == 0))\n    printf(\"256-bit loads per clk: %.2f\\n\", measureFunction(iterations, clockSpeedGhz, load256wrapper));\n  if (testName == NULL || argc > 1 && strncmp(argv[1], \"spacedstorescalar\", 13) == 0)\n    printf(\"scalar stores (spaced) per clk: %.2f\\n\", measureFunction(iterations, clockSpeedGhz, spacedstorescalarwrapper));\n  if (testName == NULL || argc > 1 && strncmp(argv[1], \"store128\", 7) == 0)\n    printf(\"128-bit stores per clk: %.2f\\n\", measureFunction(iterations, clockSpeedGhz, store128wrapper));\n  if (avxSupported && (testName == NULL || argc > 1 && strncmp(argv[1], \"store256\", 7) == 0))\n    printf(\"256-bit stores per clk: %.2f\\n\", measureFunction(iterations, clockSpeedGhz, store256wrapper));\n  if (testName == NULL || argc > 1 && strncmp(argv[1], \"mixaddmul128int\", 15) == 0)\n    printf(\"1:1 mixed 128-bit vec add/mul per clk: %.2f\\n\", measureFunction(iterations, clockSpeedGhz, mixaddmul128int));\n\n  return 0;\n}\n\nstruct TestThreadData {\n    uint64_t iterations;\n    uint64_t (*testfunc)(uint64_t);\n};\n\nvoid *TestThread(void *param) {\n    struct TestThreadData *testData = (struct TestThreadData *)param;\n    testData->testfunc(testData->iterations);\n    return NULL;\n}\n\nfloat measureFunction(uint64_t iterations, float clockSpeedGhz,  __attribute((sysv_abi)) uint64_t (*testfunc)(uint64_t)) {\n  struct timeval startTv, endTv;\n  struct timezone startTz, endTz;\n  uint64_t time_diff_ms, retval;\n  float latency, opsPerNs;\n\n  gettimeofday(&startTv, &startTz);\n  if (threads == 0) retval = testfunc(iterations);\n  else {\n      pthread_t *testThreads = (pthread_t *)malloc(threads * sizeof(pthread_t));\n      struct TestThreadData *testData = (struct TestThreadData*)malloc(threads * 
sizeof(struct TestThreadData));\n      for (int threadIdx = 0; threadIdx < threads; threadIdx++) {\n          testData[threadIdx].iterations = iterations;\n          testData[threadIdx].testfunc = testfunc;\n          pthread_create(testThreads + threadIdx, NULL, TestThread, testData + threadIdx);\n      }\n\n      for (int threadIdx = 0; threadIdx < threads; threadIdx++) {\n          pthread_join(testThreads[threadIdx], NULL);\n      }\n\n      free(testThreads);\n      free(testData);\n  }\n  gettimeofday(&endTv, &endTz);\n  time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 1000);\n  latency = 1e6 * (float)time_diff_ms / (float)iterations;\n  opsPerNs = 1/latency;\n  //printf(\"%f adds/ns, %f adds/clk?\\n\", opsPerNs, opsPerNs / clockSpeedGhz);\n  //printf(\"return value: %lu\\n\", retval);\n  return opsPerNs / clockSpeedGhz;\n}\n\n__attribute((sysv_abi)) uint64_t load128wrapper(uint64_t iterations) {\n  return load128(iterations, intTestArr);\n}\n\n__attribute((sysv_abi)) uint64_t spacedload128wrapper(uint64_t iterations) {\n  return spacedload128(iterations, intTestArr);\n}\n\n__attribute((sysv_abi)) uint64_t spacedstorescalarwrapper(uint64_t iterations) {\n  return spacedstorescalar(iterations, intTestArr);\n}\n\n__attribute((sysv_abi)) uint64_t load256wrapper(uint64_t iterations) {\n  return load256(iterations, fpTestArr);\n}\n\n__attribute((sysv_abi)) uint64_t loadscalarwrapper(uint64_t iterations) {\n  return loadscalar(iterations, intTestArr);\n}\n\n__attribute((sysv_abi)) uint64_t mixedscalarloadstorewrapper(uint64_t iterations) {\n  return mixedscalarloadstore(iterations, intTestArr);\n}\n \n\n__attribute((sysv_abi)) uint64_t load512wrapper(uint64_t iterations) {\n  return load512(iterations, fpTestArr);\n}\n\n__attribute((sysv_abi)) uint64_t spacedload256wrapper(uint64_t iterations) {\n  return spacedload128(iterations, intTestArr);\n}\n\n__attribute((sysv_abi)) uint64_t store128wrapper(uint64_t iterations) {\n 
 return store128(iterations, intTestArr, intSinkArr);\n}\n\n__attribute((sysv_abi)) uint64_t store256wrapper(uint64_t iterations) {\n  return store256(iterations, fpTestArr, fpSinkArr);\n}\n\n__attribute((sysv_abi)) uint64_t store512wrapper(uint64_t iterations) {\n  return store512(iterations, fpTestArr, fpSinkArr);\n}\n\n__attribute((sysv_abi)) uint64_t mixfmaandmem256wrapper(uint64_t iterations) {\n  return mixfmaandmem256(iterations, fpTestArr);\n}\n\n__attribute((sysv_abi)) uint64_t mixfmaaddmem256wrapper(uint64_t iterations) {\n  return mixfmaaddmem256(iterations, fpTestArr);\n}\n__attribute((sysv_abi)) uint64_t memrenamewrapper(uint64_t iterations) {\n  return memrenametest(iterations, intSinkArr);\n}\n"
  },
  {
    "path": "InstructionRate/x86_instructionrate.s",
    "content": ".text\n\n.global clktest\n.global clkmovtest\n.global addtest\n.global addnoptest\n.global addmovtest\n.global rortest\n.global shltest\n.global mixrorshltest\n.global mixrormultest\n.global btstest\n.global leatest\n.global leamultest\n.global rorbtstest\n.global btsmultest\n.global depmovtest\n.global indepmovtest\n.global vecindepmovtest\n.global vecdepmovtest\n.global xorzerotest\n.global vecxorzerotest\n.global movzerotest\n.global subzerotest\n.global vecsubzerotest\n.global depinctest\n.global depdectest\n.global depaddimmtest\n.global memrenametest\n.global addmultest\n.global jmpmultest\n.global addjmptest\n.global jmptest\n.global ntjmptest\n.global noptest\n.global noptest1b\n.global add256int\n.global add512int\n.global mul512int\n.global muldq512int\n.global mixadd256int\n.global mixadd256int11\n.global mixadd256fpint\n.global mix256fp\n.global latadd256int\n.global latadd128int\n.global latmul256int\n.global latmul512int\n.global latmulq512int\n.global latmuldq512int\n.global latmul128int\n.global latadd256int\n.global latmul256fp\n.global latadd256fp\n.global latmul128fp\n.global latadd128fp\n.global fma512\n.global mixfma256fma512\n.global mix21fma256fma512\n.global fma256\n.global fma128\n.global mixfmafadd256\n.global mixfmaadd256\n.global mixfmaadd512\n.global mixfma512add256\n.global mixfmaand256\n.global nemesfpumix21\n.global nemesfpu512mix21\n.global mixfmaandmem256\n.global mixfmaaddmem256\n.global latfma512\n.global latfma256\n.global latfma128\n.global mul256fp\n.global add256fp\n.global add128fp\n.global mul128fp\n.global latmul64\n.global latmul16\n.global mul16\n.global mul64\n.global load128\n.global spacedload128\n.global load256\n.global load512\n.global store128\n.global store256\n.global store512\n.global loadscalar\n.global mixedscalarloadstore\n.global spacedstorescalar\n.global mixaddmul128int\n.global mixmul16mul64\n.global mixmul16mul64_21\n.global add128int\n.global mul128int\n.global mix256faddintadd\n.global 
movqtoxmmtest\n\n.global pdeptest\n.global pexttest\n.global pdepmultest\n\n.global aesenc128\n.global aesdec128\n.global aesencadd128\n.global aesencfma128\n.global aesencfadd128\n.global aesencmul128\n\n.global fma4_256\n.global fma4_128\n.global fdivtest\n.global fdivlattest\n.global fmuldenormtest\n.global fmuldenormlattest\n\n/*\n  %rdi = arg0 = iteration count\n*/\nclktest:\n  push %rbx\n  push %r8\n  push %r9\n  mov $1, %r8\n  mov $20, %r9\n  xor %rbx, %rbx\nclktest_loop:\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  add %r8, %rbx\n  sub %r9, %rdi\n  jnz clktest_loop\n  pop %r9\n  pop %r8\n  pop %rbx\n  ret\n\nclkmovtest:\n  push %rbx\n  push %r8\n  push %r9\n  mov $1, %r8\n  mov $20, %r9\n  xor %rbx, %rbx\nclkmovtest_loop:\n  add %r8, %rbx\n  mov %rbx, %r8\n  add %r8, %rbx\n  mov %rbx, %r8\n  add %r8, %rbx\n  mov %rbx, %r8\n  add %r8, %rbx\n  mov %rbx, %r8\n  add %r8, %rbx\n  mov %rbx, %r8\n  add %r8, %rbx\n  mov %rbx, %r8\n  add %r8, %rbx\n  mov %rbx, %r8\n  add %r8, %rbx\n  mov %rbx, %r8\n  add %r8, %rbx\n  mov %rbx, %r8\n  add %r8, %rbx\n  mov %rbx, %r8\n  add %r8, %rbx\n  mov %rbx, %r8\n  add %r8, %rbx\n  mov %rbx, %r8\n  add %r8, %rbx\n  mov %rbx, %r8\n  add %r8, %rbx\n  mov %rbx, %r8\n  add %r8, %rbx\n  mov %rbx, %r8\n  add %r8, %rbx\n  mov %rbx, %r8\n  add %r8, %rbx\n  mov %rbx, %r8\n  add %r8, %rbx\n  mov %rbx, %r8\n  add %r8, %rbx\n  mov %rbx, %r8\n  add %r8, %rbx\n  mov %rbx, %r8\n  sub %r9, %rdi\n  jnz clkmovtest_loop\n  pop %r9\n  pop %r8\n  pop %rbx\n  ret\n\nnoptest:\n  push %rbx\n  push %r9\n  mov $20, %r9\nnoptest_loop:\n  xchg %ax,%ax\n  xchg %ax,%ax\n  xchg %ax,%ax\n  xchg %ax,%ax\n  xchg %ax,%ax\n  xchg %ax,%ax\n  xchg %ax,%ax\n  xchg %ax,%ax\n  xchg %ax,%ax\n  
xchg %ax,%ax\n  xchg %ax,%ax\n  xchg %ax,%ax\n  xchg %ax,%ax\n  xchg %ax,%ax\n  xchg %ax,%ax\n  xchg %ax,%ax\n  xchg %ax,%ax\n  xchg %ax,%ax\n  xchg %ax,%ax\n  sub %r9, %rdi\n  jnz noptest_loop\n  pop %r9\n  pop %rbx\n  ret\n\nnoptest1b:\n  push %rbx\n  push %r9\n  mov $20, %r9\nnoptest1b_loop:\n  nop\n  nop\n  nop\n  nop\n  nop\n  nop\n  nop\n  nop\n  nop\n  nop\n  nop\n  nop\n  nop\n  nop\n  nop\n  nop\n  nop\n  nop\n  nop\n  sub %r9, %rdi\n  jnz noptest1b_loop\n  pop %r9\n  pop %rbx\n  ret\n\naddtest:\n  push %rbx\n  push %rcx\n  push %r8\n  push %r9\n  push %r10\n  push %r11\n  push %r12\n  push %r13\n  push %r14\n  push %r15\n  mov $1, %r8\n  mov $20, %r9\n  xor %rbx, %rbx\n  xor %rcx, %rcx\n  xor %r10, %r10\n  xor %r11, %r11\n  xor %r12, %r12\n  xor %r13, %r13\n  xor %r14, %r14\n  xor %r15, %r15\naddtest_loop:\n  add %r8, %r15\n  add %r8, %r14\n  add %r8, %r13\n  add %r8, %r12\n  add %r8, %r11\n  add %r8, %r10\n  add %r8, %rcx\n  add %r8, %r15\n  add %r8, %r14\n  add %r8, %r13\n  add %r8, %r12\n  add %r8, %r11\n  add %r8, %r10\n  add %r8, %rcx\n  add %r8, %r15\n  add %r8, %r14\n  add %r8, %r13\n  add %r8, %r12\n  add %r8, %r11\n  add %r8, %r10\n  sub %r9, %rdi\n  jnz addtest_loop\n  pop %r15\n  pop %r14\n  pop %r13\n  pop %r12\n  pop %r11\n  pop %r10\n  pop %r9\n  pop %r8\n  pop %rcx\n  pop %rbx\n  ret\n\naddnoptest:\n  push %rbx\n  push %rcx\n  push %r8\n  push %r9\n  push %r10\n  push %r11\n  push %r12\n  push %r13\n  push %r14\n  push %r15\n  mov $1, %r8\n  mov $20, %r9\n  xor %rbx, %rbx\n  xor %rcx, %rcx\n  xor %r10, %r10\n  xor %r11, %r11\n  xor %r12, %r12\n  xor %r13, %r13\n  xor %r14, %r14\n  xor %r15, %r15\naddnoptest_loop:\n  add %r8, %r15\n  add %r8, %r14\n  add %r8, %r13\n  add %r8, %r12\n  nop\n  add %r8, %r10\n  add %r8, %rcx\n  add %r8, %r15\n  add %r8, %r14\n  nop\n  add %r8, %r12\n  add %r8, %r11\n  add %r8, %r10\n  add %r8, %rcx\n  nop\n  add %r8, %r14\n  add %r8, %r13\n  add %r8, %r12\n  add %r8, %r11\n  nop\n  sub %r9, %rdi\n  jnz 
addnoptest_loop\n  pop %r15\n  pop %r14\n  pop %r13\n  pop %r12\n  pop %r11\n  pop %r10\n  pop %r9\n  pop %r8\n  pop %rcx\n  pop %rbx\n  ret\n\naddmovtest:\n  push %rbx\n  push %rcx\n  push %r8\n  push %r9\n  push %r10\n  push %r11\n  push %r12\n  push %r13\n  push %r14\n  push %r15\n  mov $1, %r8\n  mov $20, %r9\n  xor %rbx, %rbx\n  xor %rcx, %rcx\n  xor %r10, %r10\n  xor %r11, %r11\n  xor %r12, %r12\n  xor %r13, %r13\n  xor %r14, %r14\n  xor %r15, %r15\naddmovtest_loop:\n  add %r8, %r15\n  add %r8, %r14\n  add %r8, %r13\n  add %r8, %r12\n  mov %r15, %rdx\n  add %r8, %r10\n  add %r8, %rcx\n  add %r8, %r15\n  add %r8, %r14\n  mov %r15, %rdx\n  add %r8, %r12\n  add %r8, %r11\n  add %r8, %r10\n  add %r8, %rcx\n  mov %r15, %rdx\n  add %r8, %r14\n  add %r8, %r13\n  add %r8, %r12\n  add %r8, %r11\n  mov %r15, %rdx\n  sub %r9, %rdi\n  jnz addmovtest_loop\n  pop %r15\n  pop %r14\n  pop %r13\n  pop %r12\n  pop %r11\n  pop %r10\n  pop %r9\n  pop %r8\n  pop %rcx\n  pop %rbx\n  ret\n\nrortest:\n  push %rbx\n  push %rcx\n  push %r8\n  push %r9\n  push %r10\n  push %r11\n  push %r12\n  push %r13\n  push %r14\n  push %r15\n  mov $1, %r8\n  mov $20, %r9\n  mov %r8, %rbx\n  mov %r8, %rcx\n  mov %r8, %r10\n  mov %r8, %r11\n  mov %r8, %r12\n  mov %r8, %r13\n  mov %r8, %r14\n  mov %r8, %r15\nrortest_loop:\n  ror $1, %r15\n  ror $1, %r14\n  ror $1, %r13\n  ror $1, %r12\n  ror $1, %r11\n  ror $1, %r15\n  ror $1, %r14\n  ror $1, %r13\n  ror $1, %r12\n  ror $1, %r11\n  ror $1, %r15\n  ror $1, %r14\n  ror $1, %r13\n  ror $1, %r12\n  ror $1, %r11\n  ror $1, %r15\n  ror $1, %r14\n  ror $1, %r13\n  ror $1, %r12\n  ror $1, %r11\n  sub %r9, %rdi\n  jnz rortest_loop\n  pop %r15\n  pop %r14\n  pop %r13\n  pop %r12\n  pop %r11\n  pop %r10\n  pop %r9\n  pop %r8\n  pop %rcx\n  pop %rbx\n  ret\n\nshltest:\n  push %rbx\n  push %rcx\n  push %r8\n  push %r9\n  push %r10\n  push %r11\n  push %r12\n  push %r13\n  push %r14\n  push %r15\n  mov $1, %r8\n  mov $20, %r9\n  mov %r8, %rbx\n  mov %r8, %rcx\n  
mov %r8, %r10\n  mov %r8, %r11\n  mov %r8, %r12\n  mov %r8, %r13\n  mov %r8, %r14\n  mov %r8, %r15\nshltest_loop:\n  shl $1, %r15\n  shl $1, %r14\n  shl $1, %r13\n  shl $1, %r12\n  shl $1, %r11\n  shl $1, %r15\n  shl $1, %r14\n  shl $1, %r13\n  shl $1, %r12\n  shl $1, %r11\n  shl $1, %r15\n  shl $1, %r14\n  shl $1, %r13\n  shl $1, %r12\n  shl $1, %r11\n  shl $1, %r15\n  shl $1, %r14\n  shl $1, %r13\n  shl $1, %r12\n  shl $1, %r11\n  sub %r9, %rdi\n  jnz shltest_loop\n  pop %r15\n  pop %r14\n  pop %r13\n  pop %r12\n  pop %r11\n  pop %r10\n  pop %r9\n  pop %r8\n  pop %rcx\n  pop %rbx\n  ret\n\nmixrorshltest:\n  push %rbx\n  push %rcx\n  push %r8\n  push %r9\n  push %r10\n  push %r11\n  push %r12\n  push %r13\n  push %r14\n  push %r15\n  mov $1, %r8\n  mov $20, %r9\n  mov %r8, %rbx\n  mov %r8, %rcx\n  mov %r8, %r10\n  mov %r8, %r11\n  mov %r8, %r12\n  mov %r8, %r13\n  mov %r8, %r14\n  mov %r8, %r15\nmixrorshltest_loop:\n  ror $1, %r15\n  shl $1, %r14\n  ror $1, %r13\n  shl $1, %r12\n  ror $1, %r11\n  shl $1, %r15\n  ror $1, %r14\n  shl $1, %r13\n  ror $1, %r12\n  shl $1, %r11\n  ror $1, %r15\n  shl $1, %r14\n  ror $1, %r13\n  shl $1, %r12\n  ror $1, %r11\n  shl $1, %r15\n  ror $1, %r14\n  shl $1, %r13\n  ror $1, %r12\n  shl $1, %r11\n  sub %r9, %rdi\n  jnz mixrorshltest_loop\n  pop %r15\n  pop %r14\n  pop %r13\n  pop %r12\n  pop %r11\n  pop %r10\n  pop %r9\n  pop %r8\n  pop %rcx\n  pop %rbx\n  ret\n\nmixrormultest:\n  push %rbx\n  push %rcx\n  push %rsi\n  push %rdx\n  push %r8\n  push %r9\n  push %r10\n  push %r11\n  push %r12\n  push %r13\n  push %r14\n  push %r15\n  mov $3, %r8\n  mov $20, %r9\n  mov %r8, %rbx\n  mov %r8, %rcx\n  mov %r8, %r10\n  mov %r8, %r11\n  mov %r8, %r12\n  mov %r8, %r13\n  mov %r8, %r14\n  mov %r8, %r15\nmixrormultest_loop:\n  ror $1, %r15\n  imul %r8, %r14\n  mov %r9, %r14\n  ror $1, %r13\n  imul %r8, %r12\n  mov %r9, %r12\n  ror $1, %r11\n  imul %r8, %r10\n  mov %r9, %r10\n  ror $1, %rbx\n  imul %r8, %rcx\n  mov %r9, %rcx\n  ror $1, %rsi\n 
 imul %r8, %rax\n  mov %r9, %rax\n  ror $1, %r15\n  imul %r8, %r14\n  mov %r9, %r14\n  ror $1, %r13\n  imul %r8, %r12\n  mov %r9, %r12\n  ror $1, %r11\n  imul %r8, %r10\n  mov %r9, %r10\n  ror $1, %rbx\n  imul %r8, %rcx\n  mov %r9, %rcx\n  ror $1, %rsi\n  imul %r8, %rdx\n  sub %r9, %rdi\n  jnz mixrormultest_loop\n  pop %r15\n  pop %r14\n  pop %r13\n  pop %r12\n  pop %r11\n  pop %r10\n  pop %r9\n  pop %r8\n  pop %rdx\n  pop %rsi\n  pop %rcx\n  pop %rbx\n  ret\n\nrorbtstest:\n  push %rbx\n  push %rcx\n  push %rdx\n  push %rsi\n  push %r8\n  push %r9\n  push %r10\n  push %r11\n  push %r12\n  push %r13\n  push %r14\n  push %r15\n  mov $1, %r8\n  mov $20, %r9\n  mov %r8, %rbx\n  mov %r8, %rcx\n  mov %r8, %r10\n  mov %r8, %r11\n  mov %r8, %r12\n  mov %r8, %r13\n  mov %r8, %r14\n  mov %r8, %r15\n  inc %r8\nrorbtstest_loop:\n  bts %r8, %r15\n  ror $1, %r14\n  bts %r8, %r13\n  ror $1, %r12\n  bts %r8, %r11\n  ror $1, %r10\n  bts %r8, %rcx\n  ror $1, %rbx\n  bts %r8, %rdx\n  ror $1, %rsi\n  bts %r8, %r15\n  ror $1, %r14\n  bts %r8, %r13\n  ror $1, %r12\n  bts %r8, %r11\n  ror $1, %r10\n  bts %r8, %rcx\n  ror $1, %rbx\n  bts %r8, %rdx\n  ror $1, %rsi\n  sub %r9, %rdi\n  jnz rorbtstest_loop\n  pop %r15\n  pop %r14\n  pop %r13\n  pop %r12\n  pop %r11\n  pop %r10\n  pop %r9\n  pop %r8\n  pop %rsi\n  pop %rdx\n  pop %rcx\n  pop %rbx\n  ret\n\nbtstest:\n  push %rbx\n  push %rcx\n  push %r8\n  push %r9\n  push %r10\n  push %r11\n  push %r12\n  push %r13\n  push %r14\n  push %r15\n  mov $1, %r8\n  mov $20, %r9\n  mov %r8, %rbx\n  mov %r8, %rcx\n  mov %r8, %r10\n  mov %r8, %r11\n  mov %r8, %r12\n  mov %r8, %r13\n  mov %r8, %r14\n  mov %r8, %r15\n  inc %r8\nbtstest_loop:\n  bts %r8, %r15\n  bts %r8, %r14\n  bts %r8, %r13\n  bts %r8, %r12\n  bts %r8, %r11\n  bts %r8, %r15\n  bts %r8, %r14\n  bts %r8, %r13\n  bts %r8, %r12\n  bts %r8, %r11\n  bts %r8, %r15\n  bts %r8, %r14\n  bts %r8, %r13\n  bts %r8, %r12\n  bts %r8, %r11\n  bts %r8, %r15\n  bts %r8, %r14\n  bts %r8, %r13\n  bts %r8, 
%r12\n  bts %r8, %r11\n  sub %r9, %rdi\n  jnz btstest_loop\n  pop %r15\n  pop %r14\n  pop %r13\n  pop %r12\n  pop %r11\n  pop %r10\n  pop %r9\n  pop %r8\n  pop %rcx\n  pop %rbx\n  ret\n\nleatest:\n  push %rbx\n  push %rcx\n  push %r8\n  push %r9\n  push %r10\n  push %r11\n  push %r12\n  push %r13\n  push %r14\n  push %r15\n  mov $1, %r8\n  mov $20, %r9\n  mov %r8, %rbx\n  mov %r8, %rcx\n  mov %r8, %r10\n  mov %r8, %r11\n  mov %r8, %r12\n  mov %r8, %r13\n  mov %r8, %r14\n  mov %r8, %r15\n  inc %r8\nleatest_loop:\n  lea (%r9,%r10,8), %r10\n  lea (%r9,%r11,8), %r11\n  lea (%r9,%r12,8), %r12\n  lea (%r9,%r13,8), %r13\n  lea (%r9,%r14,8), %r14\n  lea (%r9,%r15,8), %r15\n  lea (%r9,%r10,8), %r10\n  lea (%r9,%r11,8), %r11\n  lea (%r9,%r12,8), %r12\n  lea (%r9,%r13,8), %r13\n  lea (%r9,%r14,8), %r14\n  lea (%r9,%r15,8), %r15\n  lea (%r9,%r10,8), %r10\n  lea (%r9,%r11,8), %r11\n  lea (%r9,%r12,8), %r12\n  lea (%r9,%r13,8), %r13\n  lea (%r9,%r14,8), %r14\n  lea (%r9,%r15,8), %r15\n  lea (%r9,%r10,8), %r10\n  lea (%r9,%r11,8), %r11\n  sub %r9, %rdi\n  jnz leatest_loop\n  pop %r15\n  pop %r14\n  pop %r13\n  pop %r12\n  pop %r11\n  pop %r10\n  pop %r9\n  pop %r8\n  pop %rcx\n  pop %rbx\n  ret\n\nleamultest:\n  push %rbx\n  push %rcx\n  push %rdx\n  push %rsi\n  push %r8\n  push %r9\n  push %r10\n  push %r11\n  push %r12\n  push %r13\n  push %r14\n  push %r15\n  mov $1, %r8\n  mov $20, %r9\n  mov %r8, %rbx\n  mov %r8, %rcx\n  mov %r8, %r10\n  mov %r8, %r11\n  mov %r8, %r12\n  mov %r8, %r13\n  mov %r8, %r14\n  mov %r8, %r15\n  inc %r8\nleamultest_loop:\n  lea (%r9,%r15,8), %r15\n  imul %r8, %r14\n  mov %r8, %r14\n  lea (%r9,%r13,8), %r13\n  imul %r8, %r12\n  mov %r8, %r12\n  lea (%r9,%r11,8), %r11\n  imul %r8, %r10\n  mov %r8, %r10\n  lea (%r9,%rbx,8), %rbx\n  imul %r8, %rcx\n  mov %r8, %rcx\n  lea (%r9,%rdx,8), %rdx\n  imul %r8, %rax\n  lea (%r9,%r15,8), %r15\n  imul %r8, %r14\n  lea (%r9,%r13,8), %r13\n  imul %r8, %r12\n  lea (%r9,%r11,8), %r11\n  imul %r8, %r10\n  lea 
(%r9,%rbx,8), %rbx\n  imul %r8, %rcx\n  lea (%r9,%rdx,8), %rdx\n  imul %r8, %rax\n  sub %r9, %rdi\n  jnz leamultest_loop\n  pop %r15\n  pop %r14\n  pop %r13\n  pop %r12\n  pop %r11\n  pop %r10\n  pop %r9\n  pop %r8\n  pop %rsi\n  pop %rdx\n  pop %rcx\n  pop %rbx\n  ret\n\n\nbtsmultest:\n  push %rbx\n  push %rcx\n  push %rsi\n  push %rdx\n  push %r8\n  push %r9\n  push %r10\n  push %r11\n  push %r12\n  push %r13\n  push %r14\n  push %r15\n  mov $1, %r8\n  mov $20, %r9\n  mov %r8, %rbx\n  mov %r8, %rdx\n  mov %r8, %rsi\n  mov %r8, %rcx\n  mov %r8, %r10\n  mov %r8, %r11\n  mov %r8, %r12\n  mov %r8, %r13\n  mov %r8, %r14\n  mov %r8, %r15\n  inc %r8\nbtsmultest_loop:\n  imul %r8, %r14\n  bts %r8, %r13\n  mov %r8, %r13\n  imul %r8, %r12\n  bts %r8, %r11\n  mov %r8, %r11\n  imul %r8, %r10\n  bts %r8, %rbx\n  imul %r8, %rcx\n  mov %r8, %rcx\n  bts %r8, %rsi\n  imul %r8, %rax\n  mov %r8, %rax\n  bts %r8, %r15\n  imul %r8, %r14\n  mov %r8, %r14\n  bts %r8, %r13\n  imul %r8, %r12\n  mov %r8, %r12\n  bts %r8, %r11\n  imul %r8, %r10\n  mov %r8, %r10\n  bts %r8, %rbx\n  imul %r8, %rcx\n  mov %r8, %rcx\n  bts %r8, %rsi\n  imul %r8, %rdx\n  mov %r8, %rdx\n  bts %r8, %r11\n  sub %r9, %rdi\n  jnz btsmultest_loop\n  pop %r15\n  pop %r14\n  pop %r13\n  pop %r12\n  pop %r11\n  pop %r10\n  pop %r9\n  pop %r8\n  pop %rdx\n  pop %rsi\n  pop %rcx\n  pop %rbx\n  ret\n\n\njmptest:\n  push %rsi\n  push %rbx\n  push %rcx\n  push %rdx\n  push %r8\n  push %r9\n  push %r10\n  push %r11\n  push %r12\n  push %r13\n  push %r14\n  push %r15\n  mov $1, %r8\n  mov $20, %r9\n  xor %rbx, %rbx\n  xor %rcx, %rcx\n  xor %r10, %r10\n  xor %r11, %r11\n  xor %r12, %r12\n  xor %r13, %r13\n  xor %r14, %r14\n  xor %r15, %r15\n  xor %rsi, %rsi\n  mov %r8, %r10\n  mov %r8, %r11\n  mov %r8, %rsi\n  mov %r8, %rax\n  mov %r8, %rdx\njmptest_loop:\n  jmp jmptest1\n  add $1, %rax\njmptest1:\n  jmp jmptest2\n  add $2, %rax\njmptest2:\n  jmp jmptest3\n  add $3, %rax\njmptest3:\n  jmp jmptest4\n  add $4, %rax\njmptest4:\n  
jmp jmptest5\n  add $5, %rax\njmptest5:\n  jmp jmptest6\n  add $6, %rax\njmptest6:\n  jmp jmptest7\n  add $7, %rax\njmptest7:\n  jmp jmptest8\n  add $8, %rax\njmptest8:\n  jmp jmptest9\n  add $9, %rax\njmptest9:\n  jmp jmptest10\n  add $10, %rax\njmptest10:\n  jmp jmptest11\n  add $11, %rax\njmptest11:\n  jmp jmptest12\n  add $12, %rax\njmptest12:\n  jmp jmptest13\n  add $13, %rax\njmptest13:\n  jmp jmptest14\n  add $14, %rax\njmptest14:\n  jmp jmptest15\n  add $15, %rax\njmptest15:\n  jmp jmptest16\n  add $16, %rax\njmptest16:\n  jmp jmptest17\n  add $17, %rax\njmptest17:\n  jmp jmptest18\n  add $18, %rax\njmptest18:\n  jmp jmptest19\n  add $19, %rax\njmptest19:      /* jump back counts as nr 20 */\n  sub %r9, %rdi\n  jnz jmptest_loop\njmptest_jellydonut:\n  pop %r15\n  pop %r14\n  pop %r13\n  pop %r12\n  pop %r11\n  pop %r10\n  pop %r9\n  pop %r8\n  pop %rdx\n  pop %rcx\n  pop %rbx\n  pop %rsi\n  ret\n\nntjmptest:\n  push %rsi\n  push %rbx\n  push %rcx\n  push %rdx\n  push %r8\n  push %r9\n  push %r10\n  push %r11\n  push %r12\n  push %r13\n  push %r14\n  push %r15\n  mov $1, %r8\n  mov $20, %r9\n  xor %rbx, %rbx\n  xor %rcx, %rcx\n  xor %r10, %r10\n  xor %r11, %r11\n  xor %r12, %r12\n  xor %r13, %r13\n  xor %r14, %r14\n  xor %r15, %r15\n  xor %rsi, %rsi\n  mov %r8, %r10\n  mov %r8, %r11\n  mov %r8, %rsi\n  mov %r8, %rax\n  mov %r8, %rdx\nntjmptest_loop:\n  cmp %r8, %r9\n  je jmpmultest_jellydonut\n  cmp %r8, %r9\n  je jmpmultest_jellydonut\n  cmp %r8, %r9\n  je jmpmultest_jellydonut\n  cmp %r8, %r9\n  je jmpmultest_jellydonut\n  cmp %r8, %r9\n  je jmpmultest_jellydonut\n  cmp %r8, %r9\n  je jmpmultest_jellydonut\n  cmp %r8, %r9\n  je jmpmultest_jellydonut\n  cmp %r8, %r9\n  je jmpmultest_jellydonut\n  cmp %r8, %r9\n  je jmpmultest_jellydonut\n  cmp %r8, %r9\n  je jmpmultest_jellydonut\n  cmp %r8, %r9\n  je jmpmultest_jellydonut\n  cmp %r8, %r9\n  je jmpmultest_jellydonut\n  cmp %r8, %r9\n  je jmpmultest_jellydonut\n  cmp %r8, %r9\n  je jmpmultest_jellydonut\n  
cmp %r8, %r9\n  je jmpmultest_jellydonut\n  cmp %r8, %r9\n  je jmpmultest_jellydonut\n  cmp %r8, %r9\n  je jmpmultest_jellydonut\n  cmp %r8, %r9\n  je jmpmultest_jellydonut\n  cmp %r8, %r9\n  je jmpmultest_jellydonut\n  cmp %r8, %r9\n  je jmpmultest_jellydonut\n  sub %r9, %rdi\n  jnz ntjmptest_loop\nntjmptest_jellydonut:\n  pop %r15\n  pop %r14\n  pop %r13\n  pop %r12\n  pop %r11\n  pop %r10\n  pop %r9\n  pop %r8\n  pop %rdx\n  pop %rcx\n  pop %rbx\n  pop %rsi\n  ret\n\naddjmptest:\n  push %rsi\n  push %rbx\n  push %rcx\n  push %rdx\n  push %r8\n  push %r9\n  push %r10\n  push %r11\n  push %r12\n  push %r13\n  push %r14\n  push %r15\n  mov $2, %r8\n  mov $20, %r9\n  xor %rbx, %rbx\n  xor %rcx, %rcx\n  xor %r11, %r11\n  xor %r12, %r12\n  xor %r13, %r13\n  xor %r14, %r14\n  xor %r15, %r15\n  xor %rsi, %rsi\n  mov %r8, %r10\n  mov %r8, %r11\n  mov %r8, %rsi\n  mov %r8, %rax\n  mov %r8, %rdx\naddjmptest_loop:\n  add %r8, %r10\n  add %r11, %r12\n  add %r13, %r14\n  jnz addjmptest_jellydonut\n\n  add %r8, %r10\n  add %r11, %r12\n  add %r13, %r14 \n  jnz addjmptest_jellydonut\n\n  add %r8, %r10\n  add %r11, %r12\n  add %r13, %r14 \n  jnz addjmptest_jellydonut\n\n  add %r8, %r10\n  add %r11, %r12\n  add %r13, %r14  \n  jnz addjmptest_jellydonut\n\n  add %r8, %r10\n  add %r11, %r12\n  add %r13, %r14    \n  jnz addjmptest_jellydonut\n\n  sub %r9, %rdi\n  jnz addjmptest_loop\naddjmptest_jellydonut:\n  pop %r15\n  pop %r14\n  pop %r13\n  pop %r12\n  pop %r11\n  pop %r10\n  pop %r9\n  pop %r8\n  pop %rdx\n  pop %rcx\n  pop %rbx\n  pop %rsi\n  ret \n\njmpmultest:\n  push %rsi\n  push %rbx\n  push %rcx\n  push %rdx\n  push %r8\n  push %r9\n  push %r10\n  push %r11\n  push %r12\n  push %r13\n  push %r14\n  push %r15\n  mov $2, %r8\n  mov $20, %r9\n  xor %rbx, %rbx\n  xor %rcx, %rcx\n  xor %r11, %r11\n  xor %r12, %r12\n  xor %r13, %r13\n  xor %r14, %r14\n  xor %r15, %r15\n  xor %rsi, %rsi\n  mov %r8, %r10\n  mov %r8, %r11\n  mov %r8, %rsi\n  mov %r8, %rax\n  mov %r8, 
%rdx\njmpmultest_loop:\n  cmp %r8, %r9\n  je jmpmultest_jellydonut\n  imul %r8d, %r10d\n\n  cmp %r8, %r9\n  je jmpmultest_jellydonut\n  imul %r8d, %esi\n\n  cmp %r8, %r9\n  je jmpmultest_jellydonut\n  imul %r8d, %ebx\n\n  cmp %r8, %r9\n  je jmpmultest_jellydonut\n  imul %r8d, %edx\n\n  cmp %r8, %r9\n  je jmpmultest_jellydonut\n  imul %r8d, %r10d\n\n  cmp %r8, %r9\n  je jmpmultest_jellydonut\n  imul %r8d, %esi\n\n  cmp %r8, %r9\n  je jmpmultest_jellydonut\n  imul %r8d, %ebx\n\n  cmp %r8, %r9\n  je jmpmultest_jellydonut\n  imul %r8d, %edx\n\n  cmp %r8, %r9\n  je jmpmultest_jellydonut\n  imul %r8d, %r15d\n\n  cmp %r8, %r9\n  je jmpmultest_jellydonut\n  imul %r8d, %r14d\n\n  sub %r9, %rdi\n  jnz jmpmultest_loop\njmpmultest_jellydonut:\n  pop %r15\n  pop %r14\n  pop %r13\n  pop %r12\n  pop %r11\n  pop %r10\n  pop %r9\n  pop %r8\n  pop %rdx\n  pop %rcx\n  pop %rbx\n  pop %rsi\n  ret\n\n/* addmultest: each loop pass issues 8 groups of 4 adds (one each into r15, r14, r13, r12) plus 1 imul, 40 ops in total, matching the 40 held in r9 that is subtracted from rdi per pass */\naddmultest:\n  push %rsi\n  push %rbx\n  push %rcx\n  push %rdx\n  push %r8\n  push %r9\n  push %r10\n  push %r11\n  push %r12\n  push %r13\n  push %r14\n  push %r15\n  mov $1, %r8\n  mov $40, %r9\n  xor %rbx, %rbx\n  xor %rcx, %rcx\n  xor %r10, %r10\n  xor %r11, %r11\n  xor %r12, %r12\n  xor %r13, %r13\n  xor %r14, %r14\n  xor %r15, %r15\n  xor %rsi, %rsi\n  mov %r8, %r10\n  mov %r8, %r11\n  mov %r8, %rsi\n  mov %r8, %rax\n  mov %r8, %rdx\naddmultest_loop:\n  add %r8, %r15\n  add %r8, %r14\n  add %r8, %r13\n  add %r8, %r12\n  imul %r8, %r10\n\n  add %r8, %r15\n  add %r8, %r14\n  add %r8, %r13\n  add %r8, %r12\n  imul %r8, %rsi\n\n  add %r8, %r15\n  add %r8, %r14\n  add %r8, %r13\n  add %r8, %r12\n  imul %r8, %rbx\n\n  add %r8, %r15\n  add %r8, %r14\n  add %r8, %r13\n  add %r8, %r12\n  imul %r8, %rdx\n\n  add %r8, %r15\n  add %r8, %r14\n  add %r8, %r13\n  add %r8, %r12\n  imul %r8, %r10\n\n  add %r8, %r15\n  add %r8, %r14\n  add %r8, %r13\n  add %r8, %r12\n  imul %r8, %rsi\n\n  add %r8, %r15\n  add %r8, %r14\n  add %r8, %r13\n  add %r8, %r12\n  imul %r8, %rbx\n\n  add %r8, %r15\n  add %r8, %r14\n  add %r8, 
%r13\n  add %r8, %r12\n  imul %r8, %rdx\n\n  sub %r9, %rdi\n  jnz addmultest_loop\n  pop %r15\n  pop %r14\n  pop %r13\n  pop %r12\n  pop %r11\n  pop %r10\n  pop %r9\n  pop %r8\n  pop %rdx\n  pop %rcx\n  pop %rbx\n  pop %rsi\n  ret\n\nadd256int:\n  push %r9\n  push %r8\n  mov $20, %r9\n  movq %r9, %xmm1\n  //vpbroadcastq %xmm1, %ymm0\n  vmovdqu %ymm0, %ymm1\n  vmovdqu %ymm0, %ymm2\n  vmovdqu %ymm0, %ymm3\n  vmovdqu %ymm0, %ymm4\n  vmovdqu %ymm0, %ymm5\nadd256int_loop:\n  vpaddq %ymm0, %ymm1, %ymm1\n  vpaddq %ymm0, %ymm2, %ymm2\n  vpaddq %ymm0, %ymm3, %ymm3\n  vpaddq %ymm0, %ymm4, %ymm4\n  vpaddq %ymm0, %ymm5, %ymm5\n  vpaddq %ymm0, %ymm1, %ymm1\n  vpaddq %ymm0, %ymm2, %ymm2\n  vpaddq %ymm0, %ymm3, %ymm3\n  vpaddq %ymm0, %ymm4, %ymm4\n  vpaddq %ymm0, %ymm5, %ymm5\n  vpaddq %ymm0, %ymm1, %ymm1\n  vpaddq %ymm0, %ymm2, %ymm2\n  vpaddq %ymm0, %ymm3, %ymm3\n  vpaddq %ymm0, %ymm4, %ymm4\n  vpaddq %ymm0, %ymm5, %ymm5\n  vpaddq %ymm0, %ymm1, %ymm1\n  vpaddq %ymm0, %ymm2, %ymm2\n  vpaddq %ymm0, %ymm3, %ymm3\n  vpaddq %ymm0, %ymm4, %ymm4\n  vpaddq %ymm0, %ymm5, %ymm5\n  sub %r9, %rdi\n  jnz add256int_loop\n  movq %xmm1, %rax\n  vzeroupper\n  pop %r8\n  pop %r9\n  ret\n\nmul512int:\n  push %r9\n  push %r8\n  mov $20, %r9\n  movq %r9, %xmm1\n  vpbroadcastq %xmm1, %zmm0\n  vmovdqu64 %zmm0, %zmm1\n  vmovdqu64 %zmm0, %zmm2\n  vmovdqu64 %zmm0, %zmm3\n  vmovdqu64 %zmm0, %zmm4\n  vmovdqu64 %zmm0, %zmm5\nmul512int_loop:\n  vpmulld %zmm0, %zmm1, %zmm1\n  vpmulld %zmm0, %zmm2, %zmm2\n  vpmulld %zmm0, %zmm3, %zmm3\n  vpmulld %zmm0, %zmm4, %zmm4\n  vpmulld %zmm0, %zmm5, %zmm5\n  vpmulld %zmm0, %zmm1, %zmm1\n  vpmulld %zmm0, %zmm2, %zmm2\n  vpmulld %zmm0, %zmm3, %zmm3\n  vpmulld %zmm0, %zmm4, %zmm4\n  vpmulld %zmm0, %zmm5, %zmm5\n  vpmulld %zmm0, %zmm1, %zmm1\n  vpmulld %zmm0, %zmm2, %zmm2\n  vpmulld %zmm0, %zmm3, %zmm3\n  vpmulld %zmm0, %zmm4, %zmm4\n  vpmulld %zmm0, %zmm5, %zmm5\n  vpmulld %zmm0, %zmm1, %zmm1\n  vpmulld %zmm0, %zmm2, %zmm2\n  vpmulld %zmm0, %zmm3, %zmm3\n  vpmulld %zmm0, 
%zmm4, %zmm4\n  vpmulld %zmm0, %zmm5, %zmm5\n  sub %r9, %rdi\n  jnz mul512int_loop\n  movq %xmm1, %rax\n  vzeroupper\n  pop %r8\n  pop %r9\n  ret\n\nmuldq512int:\n  push %r9\n  push %r8\n  mov $20, %r9\n  movq %r9, %xmm1\n  vpbroadcastq %xmm1, %zmm0\n  vmovdqu64 %zmm0, %zmm1\n  vmovdqu64 %zmm0, %zmm2\n  vmovdqu64 %zmm0, %zmm3\n  vmovdqu64 %zmm0, %zmm4\n  vmovdqu64 %zmm0, %zmm5\nmuldq512int_loop:\n  vpmuldq %zmm0, %zmm1, %zmm1\n  vpmuldq %zmm0, %zmm2, %zmm2\n  vpmuldq %zmm0, %zmm3, %zmm3\n  vpmuldq %zmm0, %zmm4, %zmm4\n  vpmuldq %zmm0, %zmm5, %zmm5\n  vpmuldq %zmm0, %zmm1, %zmm1\n  vpmuldq %zmm0, %zmm2, %zmm2\n  vpmuldq %zmm0, %zmm3, %zmm3\n  vpmuldq %zmm0, %zmm4, %zmm4\n  vpmuldq %zmm0, %zmm5, %zmm5\n  vpmuldq %zmm0, %zmm1, %zmm1\n  vpmuldq %zmm0, %zmm2, %zmm2\n  vpmuldq %zmm0, %zmm3, %zmm3\n  vpmuldq %zmm0, %zmm4, %zmm4\n  vpmuldq %zmm0, %zmm5, %zmm5\n  vpmuldq %zmm0, %zmm1, %zmm1\n  vpmuldq %zmm0, %zmm2, %zmm2\n  vpmuldq %zmm0, %zmm3, %zmm3\n  vpmuldq %zmm0, %zmm4, %zmm4\n  vpmuldq %zmm0, %zmm5, %zmm5\n  sub %r9, %rdi\n  jnz muldq512int_loop\n  movq %xmm1, %rax\n  vzeroupper\n  pop %r8\n  pop %r9\n  ret\n\nadd512int:\n  push %r9\n  push %r8\n  mov $20, %r9\n  movq %r9, %xmm1\n  vpbroadcastq %xmm1, %zmm0\n  vmovdqu64 %zmm0, %zmm1\n  vmovdqu64 %zmm0, %zmm2\n  vmovdqu64 %zmm0, %zmm3\n  vmovdqu64 %zmm0, %zmm4\n  vmovdqu64 %zmm0, %zmm5\nadd512int_loop:\n  vpaddq %zmm0, %zmm1, %zmm1\n  vpaddq %zmm0, %zmm2, %zmm2\n  vpaddq %zmm0, %zmm3, %zmm3\n  vpaddq %zmm0, %zmm4, %zmm4\n  vpaddq %zmm0, %zmm5, %zmm5\n  vpaddq %zmm0, %zmm1, %zmm1\n  vpaddq %zmm0, %zmm2, %zmm2\n  vpaddq %zmm0, %zmm3, %zmm3\n  vpaddq %zmm0, %zmm4, %zmm4\n  vpaddq %zmm0, %zmm5, %zmm5\n  vpaddq %zmm0, %zmm1, %zmm1\n  vpaddq %zmm0, %zmm2, %zmm2\n  vpaddq %zmm0, %zmm3, %zmm3\n  vpaddq %zmm0, %zmm4, %zmm4\n  vpaddq %zmm0, %zmm5, %zmm5\n  vpaddq %zmm0, %zmm1, %zmm1\n  vpaddq %zmm0, %zmm2, %zmm2\n  vpaddq %zmm0, %zmm3, %zmm3\n  vpaddq %zmm0, %zmm4, %zmm4\n  vpaddq %zmm0, %zmm5, %zmm5\n  sub %r9, %rdi\n  jnz 
add512int_loop\n  movq %xmm1, %rax\n  vzeroupper\n  pop %r8\n  pop %r9\n  ret\n\nmixadd256fpint:\n  push %r9\n  push %r8\n  mov $20, %r9\n  movq %r9, %xmm1\n  vpbroadcastq %xmm1, %ymm0\n  vmovdqu %ymm0, %ymm1\n  vmovdqu %ymm0, %ymm2\n  vmovdqu %ymm0, %ymm3\n  vmovdqu %ymm0, %ymm4\n  vmovdqu %ymm0, %ymm5\n  cvtsi2ss %r9, %xmm6\n  vbroadcastss %xmm6, %ymm6\n  vmovups %ymm6, %ymm7\n  vmovups %ymm6, %ymm8\n  vmovups %ymm6, %ymm9\n  vmovups %ymm6, %ymm10\n  vmovups %ymm6, %ymm11\nmixadd256fpint_loop:\n  vpaddq %ymm0, %ymm1, %ymm1\n  vaddps %ymm6, %ymm7, %ymm7\n  vpaddq %ymm0, %ymm2, %ymm2\n  vaddps %ymm6, %ymm8, %ymm8\n  vpaddq %ymm0, %ymm3, %ymm3\n  vaddps %ymm6, %ymm9, %ymm9\n  vpaddq %ymm0, %ymm4, %ymm4\n  vaddps %ymm6, %ymm10, %ymm10\n  vpaddq %ymm0, %ymm5, %ymm5\n  vaddps %ymm6, %ymm11, %ymm11\n  vpaddq %ymm0, %ymm1, %ymm1\n  vaddps %ymm6, %ymm7, %ymm7\n  vpaddq %ymm0, %ymm2, %ymm2\n  vaddps %ymm6, %ymm8, %ymm8\n  vpaddq %ymm0, %ymm3, %ymm3\n  vaddps %ymm6, %ymm9, %ymm9\n  vpaddq %ymm0, %ymm4, %ymm4\n  vaddps %ymm6, %ymm10, %ymm10\n  vpaddq %ymm0, %ymm5, %ymm5\n  vaddps %ymm6, %ymm11, %ymm11\n  sub %r9, %rdi\n  jnz mixadd256fpint_loop\n  movq %xmm1, %rax\n  vzeroupper\n  pop %r8\n  pop %r9\n  ret\n\nmix256faddintadd:\n  push %r9\n  push %r8\n  mov $20, %r9\n  movq %r9, %xmm1\n  vpbroadcastq %xmm1, %ymm8\n  cvtsi2ss %r9, %xmm6\n  vbroadcastss %xmm6, %ymm6\n  vmovups %ymm6, %ymm7\n  vmovups %ymm6, %ymm9\n  vmovups %ymm6, %ymm11\n  vmovups %ymm6, %ymm13\n  vmovups %ymm6, %ymm15\n  vmovdqu %ymm8, %ymm10\n  vmovdqu %ymm8, %ymm12\n  vmovdqu %ymm8, %ymm14\nmix256faddintadd_loop:\n  vaddps %ymm6, %ymm7, %ymm7\n  vpaddd %ymm8, %ymm8, %ymm8\n  vaddps %ymm6, %ymm9, %ymm9\n  vpaddd %ymm10, %ymm10, %ymm10\n  vaddps %ymm6, %ymm11, %ymm11\n  vpaddd %ymm12, %ymm12, %ymm12\n  vaddps %ymm6, %ymm13, %ymm13\n  vpaddd %ymm14, %ymm14, %ymm14\n  vaddps %ymm6, %ymm15, %ymm15\n  vpaddd %ymm5, %ymm5, %ymm5\n  vaddps %ymm6, %ymm7, %ymm7\n  vpaddd %ymm8, %ymm8, %ymm8\n  vaddps %ymm6, %ymm9, 
%ymm9\n  vpaddd %ymm10, %ymm10, %ymm10\n  vaddps %ymm6, %ymm11, %ymm11\n  vpaddd %ymm12, %ymm12, %ymm12\n  vaddps %ymm6, %ymm13, %ymm13\n  vpaddd %ymm14, %ymm14, %ymm14\n  vaddps %ymm6, %ymm15, %ymm15\n  vpaddd %ymm5, %ymm5, %ymm5\n  sub %r9, %rdi\n  jnz mix256faddintadd_loop\n  movq %xmm1, %rax\n  vzeroupper\n  pop %r8\n  pop %r9\n  ret\n\nmix256fp:\n  push %r9\n  push %r8\n  mov $20, %r9\n  movq %r9, %xmm1\n  cvtsi2ss %r9, %xmm6\n  movups %xmm6, -32(%rsp)\n  vbroadcastss -32(%rsp), %ymm6\n  vmovups %ymm6, %ymm5\n  vmovups %ymm6, %ymm7\n  vmovups %ymm6, %ymm8\n  vmovups %ymm6, %ymm9\n  vmovups %ymm6, %ymm10\n  vmovups %ymm6, %ymm11\n  vmovups %ymm6, %ymm12\n  vmovups %ymm6, %ymm13\n  vmovups %ymm6, %ymm14\n  vmovups %ymm6, %ymm15\nmix256fp_loop:\n  vaddps %ymm6, %ymm7, %ymm7\n  vmulps %ymm6, %ymm8, %ymm8\n  vaddps %ymm6, %ymm9, %ymm9\n  vmulps %ymm6, %ymm10, %ymm10\n  vaddps %ymm6, %ymm11, %ymm11\n  vmulps %ymm6, %ymm12, %ymm12\n  vaddps %ymm6, %ymm13, %ymm13\n  vmulps %ymm6, %ymm14, %ymm14\n  vaddps %ymm6, %ymm15, %ymm15\n  vmulps %ymm6, %ymm5, %ymm5\n  vaddps %ymm6, %ymm7, %ymm7\n  vmulps %ymm6, %ymm8, %ymm8\n  vaddps %ymm6, %ymm9, %ymm9\n  vmulps %ymm6, %ymm10, %ymm10\n  vaddps %ymm6, %ymm11, %ymm11\n  vmulps %ymm6, %ymm12, %ymm12\n  vaddps %ymm6, %ymm13, %ymm13\n  vmulps %ymm6, %ymm14, %ymm14\n  vaddps %ymm6, %ymm15, %ymm15\n  vmulps %ymm6, %ymm5, %ymm5\n  sub %r9, %rdi\n  jnz mix256fp_loop\n  movq %xmm1, %rax\n  vzeroupper\n  pop %r8\n  pop %r9\n  ret\n\nmixadd256int:\n  push %r9\n  push %r8\n  push %r15\n  push %r14\n  push %r13\n  push %r12\n  push %r11\n  mov $30, %r9\n  movq %r9, %xmm1\n  vpbroadcastq %xmm1, %ymm0\n  vmovdqu %ymm0, %ymm1\n  vmovdqu %ymm0, %ymm2\n  vmovdqu %ymm0, %ymm3\n  vmovdqu %ymm0, %ymm4\n  vmovdqu %ymm0, %ymm5\n  mov %r9, %r15\n  mov %r9, %r14\n  mov %r9, %r13\n  mov %r9, %r12\n  mov %r9, %r11\n  mov %r9, %r8\nmixadd256int_loop:\n  add %r8, %r11\n  add %r8, %r12\n  add %r8, %r13\n  add %r8, %r14\n  add %r8, %r15\n  vpaddq %ymm0, 
%ymm1, %ymm1\n  vpaddq %ymm0, %ymm2, %ymm2\n  vpaddq %ymm0, %ymm3, %ymm3\n  vpaddq %ymm0, %ymm4, %ymm4\n  vpaddq %ymm0, %ymm5, %ymm5\n  add %r8, %r11\n  add %r8, %r12\n  add %r8, %r13\n  add %r8, %r14\n  add %r8, %r15\n  add %r8, %r11\n  add %r8, %r12\n  add %r8, %r13\n  add %r8, %r14\n  add %r8, %r15\n  vpaddq %ymm0, %ymm1, %ymm1\n  vpaddq %ymm0, %ymm2, %ymm2\n  vpaddq %ymm0, %ymm3, %ymm3\n  vpaddq %ymm0, %ymm4, %ymm4\n  vpaddq %ymm0, %ymm5, %ymm5\n  add %r8, %r11\n  add %r8, %r12\n  add %r8, %r13\n  add %r8, %r14\n  add %r8, %r15\n  sub %r9, %rdi\n  jnz mixadd256int_loop\n  movq %xmm1, %rax\n  vzeroupper\n  pop %r11\n  pop %r12\n  pop %r13\n  pop %r14\n  pop %r15\n  pop %r8\n  pop %r9\n  ret\n\nmixadd256int11:\n  push %r9\n  push %r8\n  push %r15\n  push %r14\n  push %r13\n  push %r12\n  push %r11\n  mov $20, %r9\n  movq %r9, %xmm1\n  vpbroadcastq %xmm1, %ymm0\n  vmovdqu %ymm0, %ymm1\n  vmovdqu %ymm0, %ymm2\n  vmovdqu %ymm0, %ymm3\n  vmovdqu %ymm0, %ymm4\n  vmovdqu %ymm0, %ymm5\n  mov %r9, %r15\n  mov %r9, %r14\n  mov %r9, %r13\n  mov %r9, %r12\n  mov %r9, %r11\n  mov %r9, %r8\nmixadd256int11_loop:\n  add %r8, %r11\n  add %r8, %r12\n  add %r8, %r13\n  add %r8, %r14\n  add %r8, %r15\n  vpaddq %ymm0, %ymm1, %ymm1\n  vpaddq %ymm0, %ymm2, %ymm2\n  vpaddq %ymm0, %ymm3, %ymm3\n  vpaddq %ymm0, %ymm4, %ymm4\n  vpaddq %ymm0, %ymm5, %ymm5\n  add %r8, %r11\n  add %r8, %r12\n  add %r8, %r13\n  add %r8, %r14\n  add %r8, %r15\n  vpaddq %ymm0, %ymm1, %ymm1\n  vpaddq %ymm0, %ymm2, %ymm2\n  vpaddq %ymm0, %ymm3, %ymm3\n  vpaddq %ymm0, %ymm4, %ymm4\n  vpaddq %ymm0, %ymm5, %ymm5\n  sub %r9, %rdi\n  jnz mixadd256int11_loop\n  movq %xmm1, %rax\n  vzeroupper\n  pop %r11\n  pop %r12\n  pop %r13\n  pop %r14\n  pop %r15\n  pop %r8\n  pop %r9\n  ret\n\nlatadd256int:\n  push %r9\n  push %r8\n  push %r15\n  push %r14\n  push %r13\n  push %r12\n  push %r11\n  mov $20, %r9\n  movq %r9, %xmm1\n  vpbroadcastq %xmm1, %ymm0\n  vmovdqu %ymm0, %ymm1\n  vmovdqu %ymm0, %ymm2\n  vmovdqu %ymm0, %ymm3\n  
vmovdqu %ymm0, %ymm4\n  vmovdqu %ymm0, %ymm5\nlatadd256int_loop:\n  vpaddq %ymm0, %ymm0, %ymm0\n  vpaddq %ymm0, %ymm0, %ymm0\n  vpaddq %ymm0, %ymm0, %ymm0\n  vpaddq %ymm0, %ymm0, %ymm0\n  vpaddq %ymm0, %ymm0, %ymm0\n  vpaddq %ymm0, %ymm0, %ymm0\n  vpaddq %ymm0, %ymm0, %ymm0\n  vpaddq %ymm0, %ymm0, %ymm0\n  vpaddq %ymm0, %ymm0, %ymm0\n  vpaddq %ymm0, %ymm0, %ymm0\n  vpaddq %ymm0, %ymm0, %ymm0\n  vpaddq %ymm0, %ymm0, %ymm0\n  vpaddq %ymm0, %ymm0, %ymm0\n  vpaddq %ymm0, %ymm0, %ymm0\n  vpaddq %ymm0, %ymm0, %ymm0\n  vpaddq %ymm0, %ymm0, %ymm0\n  vpaddq %ymm0, %ymm0, %ymm0\n  vpaddq %ymm0, %ymm0, %ymm0\n  vpaddq %ymm0, %ymm0, %ymm0\n  vpaddq %ymm0, %ymm0, %ymm0\n  sub %r9, %rdi\n  jnz latadd256int_loop\n  movq %xmm1, %rax\n  vzeroupper\n  pop %r11\n  pop %r12\n  pop %r13\n  pop %r14\n  pop %r15\n  pop %r8\n  pop %r9\n  ret\n\n/* latadd512int: 20 back-to-back vpaddq on zmm0 per loop pass, each consuming the previous result; rdi drops by 20 (r9) per pass and the loop exits when it reaches zero */\nlatadd512int:\n  push %r9\n  push %r8\n  push %r15\n  push %r14\n  push %r13\n  push %r12\n  push %r11\n  mov $20, %r9\n  movq %r9, %xmm1\n  vpbroadcastq %xmm1, %zmm0\n  vmovdqa64 %zmm0, %zmm1\n  vmovdqa64 %zmm0, %zmm2\n  vmovdqa64 %zmm0, %zmm3\n  vmovdqa64 %zmm0, %zmm4\n  vmovdqa64 %zmm0, %zmm5\nlatadd512int_loop:\n  vpaddq %zmm0, %zmm0, %zmm0\n  vpaddq %zmm0, %zmm0, %zmm0\n  vpaddq %zmm0, %zmm0, %zmm0\n  vpaddq %zmm0, %zmm0, %zmm0\n  vpaddq %zmm0, %zmm0, %zmm0\n  vpaddq %zmm0, %zmm0, %zmm0\n  vpaddq %zmm0, %zmm0, %zmm0\n  vpaddq %zmm0, %zmm0, %zmm0\n  vpaddq %zmm0, %zmm0, %zmm0\n  vpaddq %zmm0, %zmm0, %zmm0\n  vpaddq %zmm0, %zmm0, %zmm0\n  vpaddq %zmm0, %zmm0, %zmm0\n  vpaddq %zmm0, %zmm0, %zmm0\n  vpaddq %zmm0, %zmm0, %zmm0\n  vpaddq %zmm0, %zmm0, %zmm0\n  vpaddq %zmm0, %zmm0, %zmm0\n  vpaddq %zmm0, %zmm0, %zmm0\n  vpaddq %zmm0, %zmm0, %zmm0\n  vpaddq %zmm0, %zmm0, %zmm0\n  vpaddq %zmm0, %zmm0, %zmm0\n  sub %r9, %rdi\n  jnz latadd512int_loop\n  movq %xmm1, %rax\n  vzeroupper\n  pop %r11\n  pop %r12\n  pop %r13\n  pop %r14\n  pop %r15\n  pop %r8\n  pop %r9\n  ret\n\nlatmul512int:\n  push %r9\n  push %r8\n  push %r15\n  push %r14\n  push %r13\n  
push %r12\n  push %r11\n  mov $20, %r9\n  movq %r9, %xmm1\n  vpbroadcastd %xmm1, %zmm0\n  vmovdqu64 %zmm0, %zmm1\n  vmovdqu64 %zmm0, %zmm2\n  vmovdqu64 %zmm0, %zmm3\n  vmovdqu64 %zmm0, %zmm4\n  vmovdqu64 %zmm0, %zmm5\nlatmul512int_loop:\n  vpmulld %zmm0, %zmm0, %zmm0\n  vpmulld %zmm0, %zmm0, %zmm0\n  vpmulld %zmm0, %zmm0, %zmm0\n  vpmulld %zmm0, %zmm0, %zmm0\n  vpmulld %zmm0, %zmm0, %zmm0\n  vpmulld %zmm0, %zmm0, %zmm0\n  vpmulld %zmm0, %zmm0, %zmm0\n  vpmulld %zmm0, %zmm0, %zmm0\n  vpmulld %zmm0, %zmm0, %zmm0\n  vpmulld %zmm0, %zmm0, %zmm0\n  vpmulld %zmm0, %zmm0, %zmm0\n  vpmulld %zmm0, %zmm0, %zmm0\n  vpmulld %zmm0, %zmm0, %zmm0\n  vpmulld %zmm0, %zmm0, %zmm0\n  vpmulld %zmm0, %zmm0, %zmm0\n  vpmulld %zmm0, %zmm0, %zmm0\n  vpmulld %zmm0, %zmm0, %zmm0\n  vpmulld %zmm0, %zmm0, %zmm0\n  vpmulld %zmm0, %zmm0, %zmm0\n  vpmulld %zmm0, %zmm0, %zmm0\n  sub %r9, %rdi\n  jnz latmul512int_loop\n  movq %xmm1, %rax\n  vzeroupper\n  pop %r11\n  pop %r12\n  pop %r13\n  pop %r14\n  pop %r15\n  pop %r8\n  pop %r9\n  ret\n\nlatmuldq512int:\n  push %r9\n  push %r8\n  push %r15\n  push %r14\n  push %r13\n  push %r12\n  push %r11\n  mov $20, %r9\n  movq %r9, %xmm1\n  vpbroadcastd %xmm1, %zmm0\n  vmovdqu64 %zmm0, %zmm1\n  vmovdqu64 %zmm0, %zmm2\n  vmovdqu64 %zmm0, %zmm3\n  vmovdqu64 %zmm0, %zmm4\n  vmovdqu64 %zmm0, %zmm5\nlatmuldq512int_loop:\n  vpmuldq %zmm0, %zmm0, %zmm0\n  vpmuldq %zmm0, %zmm0, %zmm0\n  vpmuldq %zmm0, %zmm0, %zmm0\n  vpmuldq %zmm0, %zmm0, %zmm0\n  vpmuldq %zmm0, %zmm0, %zmm0\n  vpmuldq %zmm0, %zmm0, %zmm0\n  vpmuldq %zmm0, %zmm0, %zmm0\n  vpmuldq %zmm0, %zmm0, %zmm0\n  vpmuldq %zmm0, %zmm0, %zmm0\n  vpmuldq %zmm0, %zmm0, %zmm0\n  vpmuldq %zmm0, %zmm0, %zmm0\n  vpmuldq %zmm0, %zmm0, %zmm0\n  vpmuldq %zmm0, %zmm0, %zmm0\n  vpmuldq %zmm0, %zmm0, %zmm0\n  vpmuldq %zmm0, %zmm0, %zmm0\n  vpmuldq %zmm0, %zmm0, %zmm0\n  vpmuldq %zmm0, %zmm0, %zmm0\n  vpmuldq %zmm0, %zmm0, %zmm0\n  vpmuldq %zmm0, %zmm0, %zmm0\n  vpmuldq %zmm0, %zmm0, %zmm0\n  sub %r9, %rdi\n  jnz 
latmuldq512int_loop\n  movq %xmm1, %rax\n  vzeroupper\n  pop %r11\n  pop %r12\n  pop %r13\n  pop %r14\n  pop %r15\n  pop %r8\n  pop %r9\n  ret\n\nlatmulq512int:\n  push %r9\n  push %r8\n  push %r15\n  push %r14\n  push %r13\n  push %r12\n  push %r11\n  mov $20, %r9\n  movq %r9, %xmm1\n  vpbroadcastd %xmm1, %zmm0\n  vmovdqu64 %zmm0, %zmm1\n  vmovdqu64 %zmm0, %zmm2\n  vmovdqu64 %zmm0, %zmm3\n  vmovdqu64 %zmm0, %zmm4\n  vmovdqu64 %zmm0, %zmm5\nlatmulq512int_loop:\n  vpmullq %zmm0, %zmm0, %zmm0\n  vpmullq %zmm0, %zmm0, %zmm0\n  vpmullq %zmm0, %zmm0, %zmm0\n  vpmullq %zmm0, %zmm0, %zmm0\n  vpmullq %zmm0, %zmm0, %zmm0\n  vpmullq %zmm0, %zmm0, %zmm0\n  vpmullq %zmm0, %zmm0, %zmm0\n  vpmullq %zmm0, %zmm0, %zmm0\n  vpmullq %zmm0, %zmm0, %zmm0\n  vpmullq %zmm0, %zmm0, %zmm0\n  vpmullq %zmm0, %zmm0, %zmm0\n  vpmullq %zmm0, %zmm0, %zmm0\n  vpmullq %zmm0, %zmm0, %zmm0\n  vpmullq %zmm0, %zmm0, %zmm0\n  vpmullq %zmm0, %zmm0, %zmm0\n  vpmullq %zmm0, %zmm0, %zmm0\n  vpmullq %zmm0, %zmm0, %zmm0\n  vpmullq %zmm0, %zmm0, %zmm0\n  vpmullq %zmm0, %zmm0, %zmm0\n  vpmullq %zmm0, %zmm0, %zmm0\n  sub %r9, %rdi\n  jnz latmulq512int_loop\n  movq %xmm1, %rax\n  vzeroupper\n  pop %r11\n  pop %r12\n  pop %r13\n  pop %r14\n  pop %r15\n  pop %r8\n  pop %r9\n  ret\n\nlatmul256int:\n  push %r9\n  push %r8\n  push %r15\n  push %r14\n  push %r13\n  push %r12\n  push %r11\n  mov $20, %r9\n  movq %r9, %xmm1\n  //vpbroadcastd %xmm1, %ymm0\n  vmovdqu %ymm0, %ymm1\n  vmovdqu %ymm0, %ymm2\n  vmovdqu %ymm0, %ymm3\n  vmovdqu %ymm0, %ymm4\n  vmovdqu %ymm0, %ymm5\nlatmul256int_loop:\n  vpmulld %ymm0, %ymm0, %ymm0\n  vpmulld %ymm0, %ymm0, %ymm0\n  vpmulld %ymm0, %ymm0, %ymm0\n  vpmulld %ymm0, %ymm0, %ymm0\n  vpmulld %ymm0, %ymm0, %ymm0\n  vpmulld %ymm0, %ymm0, %ymm0\n  vpmulld %ymm0, %ymm0, %ymm0\n  vpmulld %ymm0, %ymm0, %ymm0\n  vpmulld %ymm0, %ymm0, %ymm0\n  vpmulld %ymm0, %ymm0, %ymm0\n  vpmulld %ymm0, %ymm0, %ymm0\n  vpmulld %ymm0, %ymm0, %ymm0\n  vpmulld %ymm0, %ymm0, %ymm0\n  vpmulld %ymm0, %ymm0, %ymm0\n  
vpmulld %ymm0, %ymm0, %ymm0\n  vpmulld %ymm0, %ymm0, %ymm0\n  vpmulld %ymm0, %ymm0, %ymm0\n  vpmulld %ymm0, %ymm0, %ymm0\n  vpmulld %ymm0, %ymm0, %ymm0\n  vpmulld %ymm0, %ymm0, %ymm0\n  sub %r9, %rdi\n  jnz latmul256int_loop\n  movq %xmm1, %rax\n  vzeroupper\n  pop %r11\n  pop %r12\n  pop %r13\n  pop %r14\n  pop %r15\n  pop %r8\n  pop %r9\n  ret\n\nlatadd128int:\n  push %r9\n  mov $20, %r9\n  movq %r9, %xmm1\n  //vpbroadcastq %xmm1, %xmm0\nlatadd128int_loop:\n  paddq %xmm0, %xmm0\n  paddq %xmm0, %xmm0\n  paddq %xmm0, %xmm0\n  paddq %xmm0, %xmm0\n  paddq %xmm0, %xmm0\n  paddq %xmm0, %xmm0\n  paddq %xmm0, %xmm0\n  paddq %xmm0, %xmm0\n  paddq %xmm0, %xmm0\n  paddq %xmm0, %xmm0\n  paddq %xmm0, %xmm0\n  paddq %xmm0, %xmm0\n  paddq %xmm0, %xmm0\n  paddq %xmm0, %xmm0\n  paddq %xmm0, %xmm0\n  paddq %xmm0, %xmm0\n  paddq %xmm0, %xmm0\n  paddq %xmm0, %xmm0\n  paddq %xmm0, %xmm0\n  paddq %xmm0, %xmm0\n  sub %r9, %rdi\n  jnz latadd128int_loop\n  movq %xmm1, %rax\n  pop %r9\n  ret\n\nadd128int:\n  push %r9\n  mov $16, %r9\n  movq %r9, %xmm1\n  //vpbroadcastq %xmm1, %xmm0\nadd128int_loop:\n  paddq %xmm0, %xmm0\n  paddq %xmm1, %xmm1\n  paddq %xmm2, %xmm2\n  paddq %xmm3, %xmm3\n  paddq %xmm4, %xmm4\n  paddq %xmm5, %xmm5\n  paddq %xmm6, %xmm6\n  paddq %xmm7, %xmm7\n  paddq %xmm0, %xmm0\n  paddq %xmm1, %xmm1\n  paddq %xmm2, %xmm2\n  paddq %xmm3, %xmm3\n  paddq %xmm4, %xmm4\n  paddq %xmm5, %xmm5\n  paddq %xmm6, %xmm6\n  paddq %xmm7, %xmm7 \n  sub %r9, %rdi\n  jg add128int_loop\n  movq %xmm1, %rax\n  pop %r9\n  ret\n\naesenc128:\n  push %r9\n  mov $20, %r9\n  movq %r9, %xmm1\n  vzeroall\n  pxor %xmm0, %xmm0\n  pxor %xmm1, %xmm1\n  pxor %xmm2, %xmm2\n  pxor %xmm3, %xmm3\n  pxor %xmm4, %xmm4\n  pxor %xmm5, %xmm5\naesenc128_loop:\n  aesenc %xmm0, %xmm1\n  aesenc %xmm0, %xmm2\n  aesenc %xmm0, %xmm3\n  aesenc %xmm0, %xmm4\n  aesenc %xmm0, %xmm5\n  aesenc %xmm0, %xmm1\n  aesenc %xmm0, %xmm2\n  aesenc %xmm0, %xmm3\n  aesenc %xmm0, %xmm4\n  aesenc %xmm0, %xmm5\n  aesenc %xmm0, %xmm1\n  aesenc 
%xmm0, %xmm2\n  aesenc %xmm0, %xmm3\n  aesenc %xmm0, %xmm4\n  aesenc %xmm0, %xmm5\n  aesenc %xmm0, %xmm1\n  aesenc %xmm0, %xmm2\n  aesenc %xmm0, %xmm3\n  aesenc %xmm0, %xmm4\n  aesenc %xmm0, %xmm5\n  sub %r9, %rdi\n  jnz aesenc128_loop\n  movq %xmm1, %rax\n  pop %r9\n  ret\n\naesencadd128:\n  push %r9\n  mov $20, %r9\n  movq %r9, %xmm1\n  vzeroall\n  pxor %xmm0, %xmm0\n  pxor %xmm1, %xmm1\n  pxor %xmm2, %xmm2\n  pxor %xmm3, %xmm3\n  pxor %xmm4, %xmm4\n  pxor %xmm5, %xmm5\n  pxor %xmm6, %xmm6\n  pxor %xmm7, %xmm7\n  pxor %xmm8, %xmm8\n  pxor %xmm9, %xmm9\n  pxor %xmm10, %xmm10\n  pxor %xmm11, %xmm11\n  pxor %xmm12, %xmm12\n  pxor %xmm13, %xmm13\naesencadd128_loop:\n  aesenc %xmm0, %xmm1\n  paddd %xmm6, %xmm2\n  paddd %xmm6, %xmm3\n  paddd %xmm6, %xmm4\n  aesenc %xmm0, %xmm5\n  paddd %xmm6, %xmm7\n  paddd %xmm6, %xmm8\n  paddd %xmm6, %xmm9\n  aesenc %xmm0, %xmm10\n  paddd %xmm6, %xmm2\n  paddd %xmm6, %xmm3\n  paddd %xmm6, %xmm4\n  aesenc %xmm0, %xmm1\n  paddd %xmm6, %xmm7\n  paddd %xmm6, %xmm8\n  paddd %xmm6, %xmm9\n  aesenc %xmm0, %xmm10\n  paddd %xmm6, %xmm11\n  paddd %xmm6, %xmm12\n  paddd %xmm6, %xmm13\n  sub %r9, %rdi\n  jnz aesencadd128_loop\n  movq %xmm1, %rax\n  pop %r9\n  ret\n\naesencfma128:\n  push %r9\n  mov $15, %r9\n  movq %r9, %xmm1\n  vzeroall\n  pxor %xmm0, %xmm0\n  pxor %xmm1, %xmm1\n  xorps %xmm2, %xmm2\n  xorps %xmm3, %xmm3\n  xorps %xmm4, %xmm4\n  pxor %xmm5, %xmm5\n  xorps %xmm6, %xmm6\n  xorps %xmm7, %xmm7\n  xorps %xmm8, %xmm8\n  xorps %xmm9, %xmm9\n  pxor %xmm10, %xmm10\n  xorps %xmm11, %xmm11\n  xorps %xmm12, %xmm12\n  xorps %xmm13, %xmm13\n  xorps %xmm14, %xmm14\n  xorps %xmm15, %xmm15\n  vxorps %xmm16, %xmm16, %xmm16\n  vxorps %xmm17, %xmm17, %xmm17\n  vxorps %xmm18, %xmm18, %xmm18\n  vxorps %xmm19, %xmm19, %xmm19\naesencfma128_loop:\n  aesenc %xmm0, %xmm1\n  vfmadd132ps %xmm6, %xmm2, %xmm2\n  vfmadd132ps %xmm6, %xmm3, %xmm3\n  aesenc %xmm0, %xmm5\n  vfmadd132ps %xmm6, %xmm7, %xmm7\n  vfmadd132ps %xmm6, %xmm8, %xmm8\n  aesenc %xmm0, 
%xmm10\n  vfmadd132ps %xmm6, %xmm11, %xmm11\n  vfmadd132ps %xmm6, %xmm12, %xmm12\n  aesenc %xmm0, %xmm1\n  vfmadd132ps %xmm6, %xmm14, %xmm14\n  vfmadd132ps %xmm6, %xmm15, %xmm15\n  aesenc %xmm0, %xmm10\n  vfmadd132ps %xmm6, %xmm17, %xmm17\n  vfmadd132ps %xmm6, %xmm18, %xmm18\n  sub %r9, %rdi\n  jnz aesencfma128_loop\n  movq %xmm1, %rax\n  pop %r9\n  ret\n\naesencfadd128:\n  push %r9\n  mov $15, %r9\n  movq %r9, %xmm1\n  vzeroall\n  pxor %xmm0, %xmm0\n  pxor %xmm1, %xmm1\n  xorps %xmm2, %xmm2\n  xorps %xmm3, %xmm3\n  xorps %xmm4, %xmm4\n  pxor %xmm5, %xmm5\n  xorps %xmm6, %xmm6\n  xorps %xmm7, %xmm7\n  xorps %xmm8, %xmm8\n  xorps %xmm9, %xmm9\n  pxor %xmm10, %xmm10\n  xorps %xmm11, %xmm11\n  xorps %xmm12, %xmm12\n  xorps %xmm13, %xmm13\n  xorps %xmm14, %xmm14\n  xorps %xmm15, %xmm15\n  vxorps %xmm16, %xmm16, %xmm16\n  vxorps %xmm17, %xmm17, %xmm17\n  vxorps %xmm18, %xmm18, %xmm18\n  vxorps %xmm19, %xmm19, %xmm19\naesencfadd128_loop:\n  aesenc %xmm0, %xmm1\n  vaddps %xmm6, %xmm2, %xmm2\n  vaddps %xmm6, %xmm3, %xmm3\n  aesenc %xmm0, %xmm5\n  vaddps %xmm6, %xmm7, %xmm7\n  vaddps %xmm6, %xmm8, %xmm8\n  aesenc %xmm0, %xmm10\n  vaddps %xmm6, %xmm11, %xmm11\n  vaddps %xmm6, %xmm12, %xmm12\n  aesenc %xmm0, %xmm1\n  vaddps %xmm6, %xmm14, %xmm14\n  vaddps %xmm6, %xmm15, %xmm15\n  aesenc %xmm0, %xmm10\n  vaddps %xmm6, %xmm17, %xmm17\n  vaddps %xmm6, %xmm18, %xmm18\n  sub %r9, %rdi\n  jg aesencfadd128_loop\n  movq %xmm1, %rax\n  pop %r9\n  ret\n\naesencmul128:\n  push %r9\n  mov $15, %r9\n  vzeroall\n  movq %r9, %xmm6\n  pxor %xmm0, %xmm0\n  pxor %xmm5, %xmm5\n  pxor %xmm10, %xmm10\n  xorps %xmm1, %xmm1\n  xorps %xmm2, %xmm2\n  xorps %xmm3, %xmm3\n  xorps %xmm4, %xmm4\n  xorps %xmm7, %xmm7\n  xorps %xmm8, %xmm8\n  xorps %xmm11, %xmm11\n  xorps %xmm12, %xmm12\n  xorps %xmm14, %xmm14\n  xorps %xmm15, %xmm15\naesencmul128_loop:\n  aesenc %xmm0, %xmm1\n  pmullw %xmm6, %xmm2\n  pmullw %xmm6, %xmm3\n  aesenc %xmm0, %xmm5\n  pmullw %xmm6, %xmm7\n  pmullw %xmm6, %xmm8\n  aesenc %xmm0, 
%xmm10\n  pmullw %xmm6, %xmm11\n  pmullw %xmm6, %xmm12\n  aesenc %xmm0, %xmm1\n  pmullw %xmm6, %xmm4\n  pmullw %xmm6, %xmm6\n  aesenc %xmm0, %xmm10\n  pmullw %xmm6, %xmm13\n  pmullw %xmm6, %xmm14\n  sub %r9, %rdi\n  jg aesencmul128_loop\n  movq %xmm1, %rax\n  pop %r9\n  ret\n\naesdec128:\n  push %r9\n  mov $20, %r9\n  movq %r9, %xmm1\n  vzeroall\n  pxor %xmm0, %xmm0\n  pxor %xmm1, %xmm1\n  pxor %xmm2, %xmm2\n  pxor %xmm3, %xmm3\n  pxor %xmm4, %xmm4\n  pxor %xmm5, %xmm5\naesdec128_loop:\n  aesdec %xmm0, %xmm1\n  aesdec %xmm0, %xmm2\n  aesdec %xmm0, %xmm3\n  aesdec %xmm0, %xmm4\n  aesdec %xmm0, %xmm5\n  aesdec %xmm0, %xmm1\n  aesdec %xmm0, %xmm2\n  aesdec %xmm0, %xmm3\n  aesdec %xmm0, %xmm4\n  aesdec %xmm0, %xmm5\n  aesdec %xmm0, %xmm1\n  aesdec %xmm0, %xmm2\n  aesdec %xmm0, %xmm3\n  aesdec %xmm0, %xmm4\n  aesdec %xmm0, %xmm5\n  aesdec %xmm0, %xmm1\n  aesdec %xmm0, %xmm2\n  aesdec %xmm0, %xmm3\n  aesdec %xmm0, %xmm4\n  aesdec %xmm0, %xmm5\n  sub %r9, %rdi\n  jnz aesdec128_loop\n  movq %xmm1, %rax\n  pop %r9\n  ret\n\nmul128int:\n  push %r9\n  mov $20, %r9\n  movq %r9, %xmm1\n  //vpbroadcastd %xmm1, %xmm0\nmul128int_loop:\n  pmulld %xmm0, %xmm0\n  pmulld %xmm1, %xmm1\n  pmulld %xmm2, %xmm2\n  pmulld %xmm3, %xmm3\n  pmulld %xmm4, %xmm4\n  pmulld %xmm0, %xmm0\n  pmulld %xmm1, %xmm1\n  pmulld %xmm2, %xmm2\n  pmulld %xmm3, %xmm3\n  pmulld %xmm4, %xmm4\n  pmulld %xmm0, %xmm0\n  pmulld %xmm1, %xmm1\n  pmulld %xmm2, %xmm2\n  pmulld %xmm3, %xmm3\n  pmulld %xmm4, %xmm4\n  pmulld %xmm0, %xmm0\n  pmulld %xmm1, %xmm1\n  pmulld %xmm2, %xmm2\n  pmulld %xmm3, %xmm3\n  pmulld %xmm4, %xmm4\n  sub %r9, %rdi\n  jnz mul128int_loop\n  movq %xmm1, %rax\n  pop %r9\n  ret\n\nlatmul128int:\n  push %r9\n  mov $20, %r9\n  movq %r9, %xmm1\n  //vpbroadcastd %xmm1, %xmm0\nlatmul128int_loop:\n  pmulld %xmm0, %xmm0\n  pmulld %xmm0, %xmm0\n  pmulld %xmm0, %xmm0\n  pmulld %xmm0, %xmm0\n  pmulld %xmm0, %xmm0\n  pmulld %xmm0, %xmm0\n  pmulld %xmm0, %xmm0\n  pmulld %xmm0, %xmm0\n  pmulld %xmm0, %xmm0\n  
pmulld %xmm0, %xmm0\n  pmulld %xmm0, %xmm0\n  pmulld %xmm0, %xmm0\n  pmulld %xmm0, %xmm0\n  pmulld %xmm0, %xmm0\n  pmulld %xmm0, %xmm0\n  pmulld %xmm0, %xmm0\n  pmulld %xmm0, %xmm0\n  pmulld %xmm0, %xmm0\n  pmulld %xmm0, %xmm0\n  pmulld %xmm0, %xmm0\n  sub %r9, %rdi\n  jnz latmul128int_loop\n  movq %xmm1, %rax\n  pop %r9\n  ret\n\nmixaddmul128int:\n  push %r9\n  mov $20, %r9\n  movq %r9, %xmm1\n  //vpbroadcastd %xmm1, %xmm0\n  movdqa %xmm0, %xmm1\n  movdqa %xmm0, %xmm2\n  movdqa %xmm0, %xmm3\n  movdqa %xmm0, %xmm4\n  movdqa %xmm0, %xmm5\n  movdqa %xmm0, %xmm6\n  movdqa %xmm0, %xmm7\n  movdqa %xmm0, %xmm8\n  movdqa %xmm0, %xmm9\n  movdqa %xmm0, %xmm10\nmixaddmul128int_loop:\n  pmulld %xmm0, %xmm1\n  paddd %xmm0, %xmm2\n  pmulld %xmm0, %xmm3\n  paddd %xmm0, %xmm4\n  pmulld %xmm0, %xmm5\n  paddd %xmm0, %xmm6\n  pmulld %xmm0, %xmm7\n  paddd %xmm0, %xmm8\n  pmulld %xmm0, %xmm9\n  paddd %xmm0, %xmm10\n  pmulld %xmm0, %xmm1\n  paddd %xmm0, %xmm2\n  pmulld %xmm0, %xmm3\n  paddd %xmm0, %xmm4\n  pmulld %xmm0, %xmm5\n  paddd %xmm0, %xmm6\n  pmulld %xmm0, %xmm7\n  paddd %xmm0, %xmm8\n  pmulld %xmm0, %xmm9\n  paddd %xmm0, %xmm10\n  sub %r9, %rdi\n  jnz mixaddmul128int_loop\n  movq %xmm1, %rax\n  pop %r9\n  ret\n\nlatadd256fp:\n  push %r9\n  push %r8\n  mov $20, %r9\n  movq %r9, %xmm1\n  cvtsi2ss %r9, %xmm6\n  movups %xmm6, -32(%rsp)\n  vbroadcastss -32(%rsp), %ymm6\nlatadd256fp_loop:\n  vaddps %ymm6, %ymm6, %ymm6\n  vaddps %ymm6, %ymm6, %ymm6\n  vaddps %ymm6, %ymm6, %ymm6\n  vaddps %ymm6, %ymm6, %ymm6\n  vaddps %ymm6, %ymm6, %ymm6\n  vaddps %ymm6, %ymm6, %ymm6\n  vaddps %ymm6, %ymm6, %ymm6\n  vaddps %ymm6, %ymm6, %ymm6\n  vaddps %ymm6, %ymm6, %ymm6\n  vaddps %ymm6, %ymm6, %ymm6\n  vaddps %ymm6, %ymm6, %ymm6\n  vaddps %ymm6, %ymm6, %ymm6\n  vaddps %ymm6, %ymm6, %ymm6\n  vaddps %ymm6, %ymm6, %ymm6\n  vaddps %ymm6, %ymm6, %ymm6\n  vaddps %ymm6, %ymm6, %ymm6\n  vaddps %ymm6, %ymm6, %ymm6\n  vaddps %ymm6, %ymm6, %ymm6\n  vaddps %ymm6, %ymm6, %ymm6\n  vaddps %ymm6, %ymm6, %ymm6\n  
sub %r9, %rdi\n  jnz latadd256fp_loop\n  movq %xmm1, %rax\n  vzeroupper\n  pop %r8\n  pop %r9\n  ret\n\nmul256fp:\n  push %r9\n  push %r8\n  mov $20, %r9\n  cvtsi2ss %r9, %xmm0\n  movups %xmm0, -32(%rsp)\n  vbroadcastss -32(%rsp), %ymm6\n  vmovdqa %ymm0, %ymm1\n  vmovdqa %ymm0, %ymm2\n  vmovdqa %ymm0, %ymm3\n  vmovdqa %ymm0, %ymm4\n  vmovdqa %ymm0, %ymm5\n  vmovdqa %ymm0, %ymm6\n  vmovdqa %ymm0, %ymm7\n  vmovdqa %ymm0, %ymm8\n  vmovdqa %ymm0, %ymm9\n  vmovdqa %ymm0, %ymm10\nmul256fp_loop:\n  vmulps %ymm0, %ymm1, %ymm1\n  vmulps %ymm0, %ymm2, %ymm2\n  vmulps %ymm0, %ymm3, %ymm3\n  vmulps %ymm0, %ymm4, %ymm4\n  vmulps %ymm0, %ymm5, %ymm5\n  vmulps %ymm0, %ymm6, %ymm6\n  vmulps %ymm0, %ymm7, %ymm7\n  vmulps %ymm0, %ymm8, %ymm8\n  vmulps %ymm0, %ymm9, %ymm9\n  vmulps %ymm0, %ymm10, %ymm10\n  vmulps %ymm0, %ymm1, %ymm1\n  vmulps %ymm0, %ymm2, %ymm2\n  vmulps %ymm0, %ymm3, %ymm3\n  vmulps %ymm0, %ymm4, %ymm4\n  vmulps %ymm0, %ymm5, %ymm5\n  vmulps %ymm0, %ymm6, %ymm6\n  vmulps %ymm0, %ymm7, %ymm7\n  vmulps %ymm0, %ymm8, %ymm8\n  vmulps %ymm0, %ymm9, %ymm9\n  vmulps %ymm0, %ymm10, %ymm10\n  sub %r9, %rdi\n  jnz mul256fp_loop\n  movq %xmm1, %rax\n  vzeroupper\n  pop %r8\n  pop %r9\n  ret\n\nadd256fp:\n  push %r9\n  push %r8\n  mov $20, %r9\n  cvtsi2ss %r9, %xmm0\n  movups %xmm0, -32(%rsp)\n  vbroadcastss -32(%rsp), %ymm6\n  vmovdqa %ymm0, %ymm1\n  vmovdqa %ymm0, %ymm2\n  vmovdqa %ymm0, %ymm3\n  vmovdqa %ymm0, %ymm4\n  vmovdqa %ymm0, %ymm5\n  vmovdqa %ymm0, %ymm6\n  vmovdqa %ymm0, %ymm7\n  vmovdqa %ymm0, %ymm8\n  vmovdqa %ymm0, %ymm9\n  vmovdqa %ymm0, %ymm10\nadd256fp_loop:\n  vaddps %ymm0, %ymm1, %ymm1\n  vaddps %ymm0, %ymm2, %ymm2\n  vaddps %ymm0, %ymm3, %ymm3\n  vaddps %ymm0, %ymm4, %ymm4\n  vaddps %ymm0, %ymm5, %ymm5\n  vaddps %ymm0, %ymm6, %ymm6\n  vaddps %ymm0, %ymm7, %ymm7\n  vaddps %ymm0, %ymm8, %ymm8\n  vaddps %ymm0, %ymm9, %ymm9\n  vaddps %ymm0, %ymm10, %ymm10\n  vaddps %ymm0, %ymm1, %ymm1\n  vaddps %ymm0, %ymm2, %ymm2\n  vaddps %ymm0, %ymm3, %ymm3\n  vaddps 
%ymm0, %ymm4, %ymm4\n  vaddps %ymm0, %ymm5, %ymm5\n  vaddps %ymm0, %ymm6, %ymm6\n  vaddps %ymm0, %ymm7, %ymm7\n  vaddps %ymm0, %ymm8, %ymm8\n  vaddps %ymm0, %ymm9, %ymm9\n  vaddps %ymm0, %ymm10, %ymm10\n  sub %r9, %rdi\n  jnz add256fp_loop\n  movq %xmm1, %rax\n  vzeroupper\n  pop %r8\n  pop %r9\n  ret\n\n\nlatmul256fp:\n  push %r9\n  push %r8\n  mov $20, %r9\n  movq %r9, %xmm1\n  cvtsi2ss %r9, %xmm6\n  movups %xmm6, -32(%rsp)\n  vbroadcastss -32(%rsp), %ymm6\nlatmul256fp_loop:\n  vmulps %ymm6, %ymm6, %ymm6\n  vmulps %ymm6, %ymm6, %ymm6\n  vmulps %ymm6, %ymm6, %ymm6\n  vmulps %ymm6, %ymm6, %ymm6\n  vmulps %ymm6, %ymm6, %ymm6\n  vmulps %ymm6, %ymm6, %ymm6\n  vmulps %ymm6, %ymm6, %ymm6\n  vmulps %ymm6, %ymm6, %ymm6\n  vmulps %ymm6, %ymm6, %ymm6\n  vmulps %ymm6, %ymm6, %ymm6\n  vmulps %ymm6, %ymm6, %ymm6\n  vmulps %ymm6, %ymm6, %ymm6\n  vmulps %ymm6, %ymm6, %ymm6\n  vmulps %ymm6, %ymm6, %ymm6\n  vmulps %ymm6, %ymm6, %ymm6\n  vmulps %ymm6, %ymm6, %ymm6\n  vmulps %ymm6, %ymm6, %ymm6\n  vmulps %ymm6, %ymm6, %ymm6\n  vmulps %ymm6, %ymm6, %ymm6\n  vmulps %ymm6, %ymm6, %ymm6\n  sub %r9, %rdi\n  jnz latmul256fp_loop\n  movq %xmm1, %rax\n  vzeroupper\n  pop %r8\n  pop %r9\n  ret\n\nfma512:\n  push %r9\n  push %r8\n  mov $20, %r9\n  movq %r9, %xmm1\n  cvtsi2ss %r9, %xmm6\n  vbroadcastss %xmm6, %zmm6\n  vmovups %zmm6, %zmm5\n  vmovups %zmm6, %zmm7\n  vmovups %zmm6, %zmm8\n  vmovups %zmm6, %zmm9\n  vmovups %zmm6, %zmm10\n  vmovups %zmm6, %zmm11\n  vmovups %zmm6, %zmm12\n  vmovups %zmm6, %zmm13\n  vmovups %zmm6, %zmm14\n  vmovups %zmm6, %zmm15\nfma512_loop:\n  vfmadd132ps %zmm6, %zmm5, %zmm5\n  vfmadd132ps %zmm6, %zmm7, %zmm7\n  vfmadd132ps %zmm6, %zmm8, %zmm8\n  vfmadd132ps %zmm6, %zmm9, %zmm9\n  vfmadd132ps %zmm6, %zmm10, %zmm10\n  vfmadd132ps %zmm6, %zmm11, %zmm11\n  vfmadd132ps %zmm6, %zmm12, %zmm12\n  vfmadd132ps %zmm6, %zmm13, %zmm13\n  vfmadd132ps %zmm6, %zmm14, %zmm14\n  vfmadd132ps %zmm6, %zmm15, %zmm15\n  vfmadd132ps %zmm6, %zmm5, %zmm5\n  vfmadd132ps %zmm6, %zmm7, 
%zmm7\n  vfmadd132ps %zmm6, %zmm8, %zmm8\n  vfmadd132ps %zmm6, %zmm9, %zmm9\n  vfmadd132ps %zmm6, %zmm10, %zmm10\n  vfmadd132ps %zmm6, %zmm11, %zmm11\n  vfmadd132ps %zmm6, %zmm12, %zmm12\n  vfmadd132ps %zmm6, %zmm13, %zmm13\n  vfmadd132ps %zmm6, %zmm14, %zmm14\n  vfmadd132ps %zmm6, %zmm15, %zmm15\n  sub %r9, %rdi\n  jnz fma512_loop\n  movq %xmm1, %rax\n  vzeroupper\n  pop %r8\n  pop %r9\n  ret\n\nmix21fma256fma512:\n  push %r9\n  push %r8\n  mov $18, %r9\n  movq %r9, %xmm1\n  cvtsi2ss %r9, %xmm6\n  vbroadcastss %xmm6, %zmm6\n  vmovups %zmm6, %zmm5\n  vmovups %zmm6, %zmm7\n  vmovups %zmm6, %zmm8\n  vmovups %zmm6, %zmm9\n  vmovups %zmm6, %zmm10\n  vmovups %zmm6, %zmm11\n  vmovups %zmm6, %zmm12\n  vmovups %zmm6, %zmm13\n  vmovups %zmm6, %zmm14\n  vmovups %zmm6, %zmm15\nmix21fma256fma512_loop:\n  vfmadd132ps %ymm6, %ymm5, %ymm5\n  vfmadd132ps %ymm6, %ymm7, %ymm7\n  vfmadd132ps %zmm6, %zmm8, %zmm8\n  vfmadd132ps %ymm6, %ymm9, %ymm9\n  vfmadd132ps %ymm6, %ymm10, %ymm10\n  vfmadd132ps %zmm6, %zmm11, %zmm11\n  vfmadd132ps %ymm6, %ymm12, %ymm12\n  vfmadd132ps %ymm6, %ymm13, %ymm13\n  vfmadd132ps %zmm6, %zmm14, %zmm14\n  vfmadd132ps %ymm6, %ymm5, %ymm5\n  vfmadd132ps %ymm6, %ymm7, %ymm7\n  vfmadd132ps %zmm6, %zmm8, %zmm8\n  vfmadd132ps %ymm6, %ymm9, %ymm9\n  vfmadd132ps %ymm6, %ymm10, %ymm10\n  vfmadd132ps %zmm6, %zmm11, %zmm11\n  vfmadd132ps %ymm6, %ymm12, %ymm12\n  vfmadd132ps %ymm6, %ymm13, %ymm13\n  vfmadd132ps %zmm6, %zmm14, %zmm14 \n  sub %r9, %rdi\n  jg mix21fma256fma512_loop\n  movq %xmm1, %rax\n  vzeroupper\n  pop %r8\n  pop %r9\n  ret \n\nmixfma256fma512:\n  push %r9\n  push %r8\n  mov $20, %r9\n  movq %r9, %xmm1\n  cvtsi2ss %r9, %xmm6\n  vbroadcastss %xmm6, %zmm6\n  vmovups %zmm6, %zmm5\n  vmovups %zmm6, %zmm7\n  vmovups %zmm6, %zmm8\n  vmovups %zmm6, %zmm9\n  vmovups %zmm6, %zmm10\n  vmovups %zmm6, %zmm11\n  vmovups %zmm6, %zmm12\n  vmovups %zmm6, %zmm13\n  vmovups %zmm6, %zmm14\n  vmovups %zmm6, %zmm15\nmixfma256fma512_loop:\n  vfmadd132ps %ymm6, %ymm5, %ymm5\n  
vfmadd132ps %zmm6, %zmm7, %zmm7\n  vfmadd132ps %ymm6, %ymm8, %ymm8\n  vfmadd132ps %zmm6, %zmm9, %zmm9\n  vfmadd132ps %ymm6, %ymm10, %ymm10\n  vfmadd132ps %zmm6, %zmm11, %zmm11\n  vfmadd132ps %ymm6, %ymm12, %ymm12\n  vfmadd132ps %zmm6, %zmm13, %zmm13\n  vfmadd132ps %ymm6, %ymm14, %ymm14\n  vfmadd132ps %zmm6, %zmm15, %zmm15\n  vfmadd132ps %ymm6, %ymm5, %ymm5\n  vfmadd132ps %zmm6, %zmm7, %zmm7\n  vfmadd132ps %ymm6, %ymm8, %ymm8\n  vfmadd132ps %zmm6, %zmm9, %zmm9\n  vfmadd132ps %ymm6, %ymm10, %ymm10\n  vfmadd132ps %zmm6, %zmm11, %zmm11\n  vfmadd132ps %ymm6, %ymm12, %ymm12\n  vfmadd132ps %zmm6, %zmm13, %zmm13\n  vfmadd132ps %ymm6, %ymm14, %ymm14\n  vfmadd132ps %zmm6, %zmm15, %zmm15\n  sub %r9, %rdi\n  jnz mixfma256fma512_loop\n  movq %xmm1, %rax\n  vzeroupper\n  pop %r8\n  pop %r9\n  ret\n\nfma256:\n  push %r9\n  push %r8\n  mov $20, %r9\n  movq %r9, %xmm1\n  cvtsi2ss %r9, %xmm6\n  movups %xmm6, -32(%rsp)\n  vbroadcastss -32(%rsp), %ymm6\n  vmovups %ymm6, %ymm5\n  vmovups %ymm6, %ymm7\n  vmovups %ymm6, %ymm8\n  vmovups %ymm6, %ymm9\n  vmovups %ymm6, %ymm10\n  vmovups %ymm6, %ymm11\n  vmovups %ymm6, %ymm12\n  vmovups %ymm6, %ymm13\n  vmovups %ymm6, %ymm14\n  vmovups %ymm6, %ymm15\nfma256_loop:\n  vfmadd132ps %ymm6, %ymm5, %ymm5\n  vfmadd132ps %ymm6, %ymm7, %ymm7\n  vfmadd132ps %ymm6, %ymm8, %ymm8\n  vfmadd132ps %ymm6, %ymm9, %ymm9\n  vfmadd132ps %ymm6, %ymm10, %ymm10\n  vfmadd132ps %ymm6, %ymm11, %ymm11\n  vfmadd132ps %ymm6, %ymm12, %ymm12\n  vfmadd132ps %ymm6, %ymm13, %ymm13\n  vfmadd132ps %ymm6, %ymm14, %ymm14\n  vfmadd132ps %ymm6, %ymm15, %ymm15\n  vfmadd132ps %ymm6, %ymm5, %ymm5\n  vfmadd132ps %ymm6, %ymm7, %ymm7\n  vfmadd132ps %ymm6, %ymm8, %ymm8\n  vfmadd132ps %ymm6, %ymm9, %ymm9\n  vfmadd132ps %ymm6, %ymm10, %ymm10\n  vfmadd132ps %ymm6, %ymm11, %ymm11\n  vfmadd132ps %ymm6, %ymm12, %ymm12\n  vfmadd132ps %ymm6, %ymm13, %ymm13\n  vfmadd132ps %ymm6, %ymm14, %ymm14\n  vfmadd132ps %ymm6, %ymm15, %ymm15\n  sub %r9, %rdi\n  jnz fma256_loop\n  movq %xmm1, %rax\n  
vzeroupper\n  pop %r8\n  pop %r9\n  ret\n\nfma128:\n  push %r9\n  push %r8\n  vzeroupper\n  mov $20, %r9\n  movq %r9, %xmm1\n  cvtsi2ss %r9, %xmm6\n  movups %xmm6, -16(%rsp)\n  vbroadcastss -16(%rsp), %xmm6\n  vmovups %xmm6, %xmm5\n  vmovups %xmm6, %xmm7\n  vmovups %xmm6, %xmm8\n  vmovups %xmm6, %xmm9\n  vmovups %xmm6, %xmm10\n  vmovups %xmm6, %xmm11\n  vmovups %xmm6, %xmm12\n  vmovups %xmm6, %xmm13\n  vmovups %xmm6, %xmm14\n  vmovups %xmm6, %xmm15\nfma128_loop:\n  vfmadd132ps %xmm6, %xmm5, %xmm5\n  vfmadd132ps %xmm6, %xmm7, %xmm7\n  vfmadd132ps %xmm6, %xmm8, %xmm8\n  vfmadd132ps %xmm6, %xmm9, %xmm9\n  vfmadd132ps %xmm6, %xmm10, %xmm10\n  vfmadd132ps %xmm6, %xmm11, %xmm11\n  vfmadd132ps %xmm6, %xmm12, %xmm12\n  vfmadd132ps %xmm6, %xmm13, %xmm13\n  vfmadd132ps %xmm6, %xmm14, %xmm14\n  vfmadd132ps %xmm6, %xmm15, %xmm15\n  vfmadd132ps %xmm6, %xmm5, %xmm5\n  vfmadd132ps %xmm6, %xmm7, %xmm7\n  vfmadd132ps %xmm6, %xmm8, %xmm8\n  vfmadd132ps %xmm6, %xmm9, %xmm9\n  vfmadd132ps %xmm6, %xmm10, %xmm10\n  vfmadd132ps %xmm6, %xmm11, %xmm11\n  vfmadd132ps %xmm6, %xmm12, %xmm12\n  vfmadd132ps %xmm6, %xmm13, %xmm13\n  vfmadd132ps %xmm6, %xmm14, %xmm14\n  vfmadd132ps %xmm6, %xmm15, %xmm15\n  sub %r9, %rdi\n  jnz fma128_loop\n  movq %xmm1, %rax\n  vzeroupper\n  pop %r8\n  pop %r9\n  ret\n\nmixfmafadd256:\n  push %r9\n  push %r8\n  mov $30, %r9\n  movq %r9, %xmm1\n  cvtsi2ss %r9, %xmm6\n  vbroadcastss %xmm6, %ymm6\n  vmovups %ymm6, %ymm0\n  vmovups %ymm6, %ymm1\n  vmovups %ymm6, %ymm2\n  vmovups %ymm6, %ymm3\n  vmovups %ymm6, %ymm4\n  vmovups %ymm6, %ymm5\n  vmovups %ymm6, %ymm7\n  vmovups %ymm6, %ymm8\n  vmovups %ymm6, %ymm9\n  vmovups %ymm6, %ymm10\n  vmovups %ymm6, %ymm11\n  vmovups %ymm6, %ymm12\n  vmovups %ymm6, %ymm13\n  vmovups %ymm6, %ymm14\n  vmovups %ymm6, %ymm15\nmixfmafadd256_loop:\n  vfmadd132ps %ymm6, %ymm5, %ymm5\n  vfmadd132ps %ymm6, %ymm7, %ymm7\n  vaddps %ymm10, %ymm5, %ymm11\n  vfmadd132ps %ymm6, %ymm8, %ymm8\n  vfmadd132ps %ymm6, %ymm9, %ymm9\n  vaddps %ymm12, 
%ymm5, %ymm13\n  vfmadd132ps %ymm6, %ymm14, %ymm14\n  vfmadd132ps %ymm6, %ymm15, %ymm15\n  vaddps %ymm12, %ymm6, %ymm13\n  vfmadd132ps %ymm6, %ymm0, %ymm1\n  vfmadd132ps %ymm6, %ymm2, %ymm3\n  vaddps %ymm6, %ymm5, %ymm4\n  vfmadd132ps %ymm6, %ymm5, %ymm5\n  vfmadd132ps %ymm6, %ymm7, %ymm7\n  vaddps %ymm10, %ymm6, %ymm11\n  vfmadd132ps %ymm6, %ymm8, %ymm8\n  vfmadd132ps %ymm6, %ymm9, %ymm9\n  vaddps %ymm12, %ymm7, %ymm13\n  vfmadd132ps %ymm6, %ymm14, %ymm14\n  vfmadd132ps %ymm6, %ymm15, %ymm15\n  vaddps %ymm12, %ymm5, %ymm13\n  vfmadd132ps %ymm6, %ymm0, %ymm1\n  vfmadd132ps %ymm6, %ymm2, %ymm3\n  vaddps %ymm6, %ymm5, %ymm4\n  vfmadd132ps %ymm6, %ymm5, %ymm5\n  vfmadd132ps %ymm6, %ymm7, %ymm7\n  vaddps %ymm10, %ymm6, %ymm11\n  vfmadd132ps %ymm6, %ymm8, %ymm8\n  vfmadd132ps %ymm6, %ymm9, %ymm9\n  vaddps %ymm12, %ymm5, %ymm13\n  sub %r9, %rdi\n  jnz mixfmafadd256_loop\n  movq %xmm1, %rax\n  vzeroupper\n  pop %r8\n  pop %r9\n  ret\n\nmixfmaadd512:\n  push %r9\n  push %r8\n  mov $16, %r9\n  movq %r9, %xmm0\n  vpbroadcastq %xmm0, %zmm0\n  cvtsi2ss %r9, %xmm1\n  vbroadcastss %xmm1, %zmm1\n  vmovdqa64 %zmm0, %zmm3\n  vmovdqa64 %zmm0, %zmm6\n  vmovdqa64 %zmm0, %zmm9\n  vmovdqa64 %zmm0, %zmm12\n  vmovdqa64 %zmm0, %zmm15\n  vmovaps %zmm1, %zmm2\n  vmovaps %zmm1, %zmm4\n  vmovaps %zmm1, %zmm5\n  vmovaps %zmm1, %zmm7\n  vmovaps %zmm1, %zmm8\n  vmovaps %zmm1, %zmm10\n  vmovaps %zmm1, %zmm11\n  vmovaps %zmm1, %zmm13\n  vmovaps %zmm1, %zmm14\nmixfmaadd512_loop:\n  vpaddq %zmm0, %zmm15, %zmm0\n  vfmadd132ps %zmm1, %zmm1, %zmm1\n  vfmadd132ps %zmm2, %zmm2, %zmm2\n  vpaddq %zmm3, %zmm15, %zmm3\n  vfmadd132ps %zmm4, %zmm4, %zmm4\n  vfmadd132ps %zmm5, %zmm5, %zmm5\n  vpaddq %zmm6, %zmm15, %zmm6\n  vfmadd132ps %zmm7, %zmm7, %zmm7\n  vfmadd132ps %zmm8, %zmm8, %zmm8\n  vpaddq %zmm9, %zmm15, %zmm9\n  vfmadd132ps %zmm10, %zmm10, %zmm10\n  vfmadd132ps %zmm11, %zmm11, %zmm11\n  vpaddq %zmm12, %zmm15, %zmm12\n  vfmadd132ps %zmm13, %zmm13, %zmm13\n  vfmadd132ps %zmm14, %zmm14, %zmm14\n  sub %r9, 
%rdi\n  jg mixfmaadd512_loop\n  movq %xmm1, %rax\n  vzeroupper\n  pop %r8\n  pop %r9\n  ret\n\nmixfma512add256:\n  push %r9\n  push %r8\n  mov $16, %r9\n  movq %r9, %xmm0\n  vpbroadcastq %xmm0, %ymm0\n  cvtsi2ss %r9, %xmm1\n  vbroadcastss %xmm1, %zmm1\n  vmovdqa %ymm0, %ymm3\n  vmovdqa %ymm0, %ymm6\n  vmovdqa %ymm0, %ymm9\n  vmovdqa %ymm0, %ymm12\n  vmovdqa %ymm0, %ymm15\n  vmovaps %zmm1, %zmm2\n  vmovaps %zmm1, %zmm4\n  vmovaps %zmm1, %zmm5\n  vmovaps %zmm1, %zmm7\n  vmovaps %zmm1, %zmm8\n  vmovaps %zmm1, %zmm10\n  vmovaps %zmm1, %zmm11\n  vmovaps %zmm1, %zmm13\n  vmovaps %zmm1, %zmm14\nmixfma512add256_loop:\n  vpaddq %ymm0, %ymm15, %ymm0\n  vfmadd132ps %zmm1, %zmm1, %zmm1\n  vfmadd132ps %zmm2, %zmm2, %zmm2\n  vpaddq %ymm3, %ymm15, %ymm3\n  vfmadd132ps %zmm4, %zmm4, %zmm4\n  vfmadd132ps %zmm5, %zmm5, %zmm5\n  vpaddq %ymm6, %ymm15, %ymm6\n  vfmadd132ps %zmm7, %zmm7, %zmm7\n  vfmadd132ps %zmm8, %zmm8, %zmm8\n  vpaddq %ymm9, %ymm15, %ymm9\n  vfmadd132ps %zmm10, %zmm10, %zmm10\n  vfmadd132ps %zmm11, %zmm11, %zmm11\n  vpaddq %ymm12, %ymm15, %ymm12\n  vfmadd132ps %zmm13, %zmm13, %zmm13\n  vfmadd132ps %zmm14, %zmm14, %zmm14\n  sub %r9, %rdi\n  jg mixfma512add256_loop\n  movq %xmm1, %rax\n  vzeroupper\n  pop %r8\n  pop %r9\n  ret\n\nmixfmaadd256:\n  push %r9\n  push %r8\n  mov $16, %r9\n  movq %r9, %xmm0\n  vpbroadcastq %xmm0, %ymm0\n  cvtsi2ss %r9, %xmm1\n  vbroadcastss %xmm1, %ymm1\n  vmovdqa %ymm0, %ymm3\n  vmovdqa %ymm0, %ymm6\n  vmovdqa %ymm0, %ymm9\n  vmovdqa %ymm0, %ymm12\n  vmovdqa %ymm0, %ymm15\n  vmovaps %ymm1, %ymm2\n  vmovaps %ymm1, %ymm4\n  vmovaps %ymm1, %ymm5\n  vmovaps %ymm1, %ymm7\n  vmovaps %ymm1, %ymm8\n  vmovaps %ymm1, %ymm10\n  vmovaps %ymm1, %ymm11\n  vmovaps %ymm1, %ymm13\n  vmovaps %ymm1, %ymm14\nmixfmaadd256_loop:\n  vpaddq %ymm0, %ymm15, %ymm0\n  vfmadd132ps %ymm1, %ymm1, %ymm1\n  vfmadd132ps %ymm2, %ymm2, %ymm2\n  vpaddq %ymm3, %ymm15, %ymm3\n  vfmadd132ps %ymm4, %ymm4, %ymm4\n  vfmadd132ps %ymm5, %ymm5, %ymm5\n  vpaddq %ymm6, %ymm15, %ymm6\n  
vfmadd132ps %ymm7, %ymm7, %ymm7\n  vfmadd132ps %ymm8, %ymm8, %ymm8\n  vpaddq %ymm9, %ymm15, %ymm9\n  vfmadd132ps %ymm10, %ymm10, %ymm10\n  vfmadd132ps %ymm11, %ymm11, %ymm11\n  vpaddq %ymm12, %ymm15, %ymm12\n  vfmadd132ps %ymm13, %ymm13, %ymm13\n  vfmadd132ps %ymm14, %ymm14, %ymm14\n  sub %r9, %rdi\n  jg mixfmaadd256_loop\n  movq %xmm1, %rax\n  vzeroupper\n  pop %r8\n  pop %r9\n  ret\n\nmixfmaand256:\n  push %r9\n  push %r8\n  mov $15, %r9\n  movq %r9, %xmm0\n  vpbroadcastq %xmm0, %ymm0\n  cvtsi2ss %r9, %xmm1\n  vbroadcastss %xmm1, %ymm1\n  vmovdqa %ymm0, %ymm3\n  vmovdqa %ymm0, %ymm6\n  vmovdqa %ymm0, %ymm9\n  vmovdqa %ymm0, %ymm12\n  vmovdqa %ymm0, %ymm15\n  vmovaps %ymm1, %ymm2\n  vmovaps %ymm1, %ymm4\n  vmovaps %ymm1, %ymm5\n  vmovaps %ymm1, %ymm7\n  vmovaps %ymm1, %ymm8\n  vmovaps %ymm1, %ymm10\n  vmovaps %ymm1, %ymm11\n  vmovaps %ymm1, %ymm13\n  vmovaps %ymm1, %ymm14\nmixfmaand256_loop:\n  vpand %ymm0, %ymm15, %ymm0\n  vfmadd132ps %ymm1, %ymm1, %ymm1\n  vfmadd132ps %ymm2, %ymm2, %ymm2\n  vpand %ymm3, %ymm15, %ymm3\n  vfmadd132ps %ymm4, %ymm4, %ymm4\n  vfmadd132ps %ymm5, %ymm5, %ymm5\n  vpand %ymm6, %ymm15, %ymm6\n  vfmadd132ps %ymm7, %ymm7, %ymm7\n  vfmadd132ps %ymm8, %ymm8, %ymm8\n  vpand %ymm9, %ymm15, %ymm9\n  vfmadd132ps %ymm10, %ymm10, %ymm10\n  vfmadd132ps %ymm11, %ymm11, %ymm11\n  vpand %ymm12, %ymm15, %ymm12\n  vfmadd132ps %ymm13, %ymm13, %ymm13\n  vfmadd132ps %ymm14, %ymm14, %ymm14\n  sub %r9, %rdi\n  jg mixfmaand256_loop\n  movq %xmm1, %rax\n  vzeroupper\n  pop %r8\n  pop %r9\n  ret\n\nmixfmaandmem256:\n  push %r9\n  push %r8\n  mov $22, %r9\n  movq %r9, %xmm0\n  vpbroadcastq %xmm0, %ymm0\n  cvtsi2ss %r9, %xmm1\n  vbroadcastss %xmm1, %ymm1\n  vmovdqa %ymm0, %ymm3\n  vmovaps %ymm1, %ymm6\n  vmovaps %ymm1, %ymm9\n  vmovaps %ymm1, %ymm12\n  vmovaps %ymm1, %ymm15\n  vmovaps %ymm1, %ymm2\n  vmovaps %ymm1, %ymm4\n  vmovaps %ymm1, %ymm5\n  vmovaps %ymm1, %ymm7\n  vmovaps %ymm1, %ymm8\n  vmovaps %ymm1, %ymm10\n  vmovaps %ymm1, %ymm11\n  vmovaps %ymm1, 
%ymm13\n  vmovaps %ymm1, %ymm14\nmixfmaandmem256_loop:\n  vpand %ymm0, %ymm0, %ymm0\n  vfmadd132ps %ymm1, %ymm1, %ymm1\n  vfmadd132ps (%rsi), %ymm2, %ymm2\n  vpand %ymm3, %ymm3, %ymm3\n  vfmadd132ps %ymm4, %ymm4, %ymm4\n  vfmadd132ps (%rsi), %ymm5, %ymm5\n  vpand %ymm0, %ymm0, %ymm0\n  vfmadd132ps %ymm7, %ymm7, %ymm7\n  vfmadd132ps (%rsi), %ymm8, %ymm8\n  vpand %ymm3, %ymm3, %ymm3\n  vfmadd132ps %ymm10, %ymm10, %ymm10\n  vfmadd132ps (%rsi), %ymm11, %ymm11\n  vpand %ymm0, %ymm0, %ymm0\n  vfmadd132ps %ymm13, %ymm13, %ymm13\n  vfmadd132ps (%rsi), %ymm14, %ymm14\n\n  vpand %ymm3, %ymm3, %ymm3\n  vfmadd132ps %ymm6, %ymm6, %ymm6\n  vfmadd132ps (%rsi), %ymm9, %ymm9\n  vpand %ymm0, %ymm0, %ymm0\n  vfmadd132ps %ymm12, %ymm12, %ymm12\n  vfmadd132ps (%rsi), %ymm15, %ymm15\n  sub %r9, %rdi\n  jg mixfmaandmem256_loop\n  movq %xmm1, %rax\n  vzeroupper\n  pop %r8\n  pop %r9\n  ret\n\nmixfmaaddmem256:\n  push %r9\n  push %r8\n  mov $22, %r9\n  movq %r9, %xmm0\n  vpbroadcastq %xmm0, %ymm0\n  cvtsi2ss %r9, %xmm1\n  vbroadcastss %xmm1, %ymm1\n  vmovdqa %ymm0, %ymm3\n  vmovaps %ymm1, %ymm6\n  vmovaps %ymm1, %ymm9\n  vmovaps %ymm1, %ymm12\n  vmovaps %ymm1, %ymm15\n  vmovaps %ymm1, %ymm2\n  vmovaps %ymm1, %ymm4\n  vmovaps %ymm1, %ymm5\n  vmovaps %ymm1, %ymm7\n  vmovaps %ymm1, %ymm8\n  vmovaps %ymm1, %ymm10\n  vmovaps %ymm1, %ymm11\n  vmovaps %ymm1, %ymm13\n  vmovaps %ymm1, %ymm14\nmixfmaaddmem256_loop:\n  vpaddq %ymm0, %ymm0, %ymm0\n  vfmadd132ps %ymm1, %ymm1, %ymm1\n  vfmadd132ps (%rsi), %ymm2, %ymm2\n  vpaddq %ymm3, %ymm3, %ymm3\n  vfmadd132ps %ymm4, %ymm4, %ymm4\n  vfmadd132ps (%rsi), %ymm5, %ymm5\n  vpaddq %ymm0, %ymm0, %ymm0\n  vfmadd132ps %ymm7, %ymm7, %ymm7\n  vfmadd132ps (%rsi), %ymm8, %ymm8\n  vpaddq %ymm3, %ymm3, %ymm3\n  vfmadd132ps %ymm10, %ymm10, %ymm10\n  vfmadd132ps (%rsi), %ymm11, %ymm11\n  vpaddq %ymm0, %ymm0, %ymm0\n  vfmadd132ps %ymm13, %ymm13, %ymm13\n  vfmadd132ps (%rsi), %ymm14, %ymm14\n\n  vpaddq %ymm3, %ymm3, %ymm3\n  vfmadd132ps %ymm6, %ymm6, %ymm6\n  
vfmadd132ps (%rsi), %ymm9, %ymm9\n  vpaddq %ymm0, %ymm0, %ymm0\n  vfmadd132ps %ymm12, %ymm12, %ymm12\n  vfmadd132ps (%rsi), %ymm15, %ymm15\n\n  sub %r9, %rdi\n  jg mixfmaaddmem256_loop\n  movq %xmm1, %rax\n  vzeroupper\n  pop %r8\n  pop %r9\n  ret\n\nnemesfpu512mix21:\n  push %r9\n  mov $16, %r9\n  cvtsi2ss %r9, %xmm0\n  vbroadcastss %xmm0, %zmm1\n  vmovdqa64 %zmm1, %zmm2\n  vmovdqa64 %zmm1, %zmm3\n  vmovdqa64 %zmm1, %zmm4\n  vmovdqa64 %zmm1, %zmm5\n  vmovdqa64 %zmm1, %zmm6\n  vmovdqa64 %zmm1, %zmm7\n  vmovdqa64 %zmm1, %zmm8\n  vmovdqa64 %zmm1, %zmm9\n  vmovdqa64 %zmm1, %zmm10\n  vmovdqa64 %zmm1, %zmm11\n  vmovdqa64 %zmm1, %zmm12\n  vmovdqa64 %zmm1, %zmm13\n  vmovdqa64 %zmm1, %zmm14\n  vmovdqa64 %zmm1, %zmm15\nnemesfpu512mix21_loop:\n  vaddps %zmm0, %zmm0, %zmm0\n  vfmadd132ps %zmm1, %zmm1, %zmm1\n  vfmadd132ps %zmm2, %zmm2, %zmm2\n  vaddps %zmm3, %zmm3, %zmm3\n  vfmadd132ps %zmm4, %zmm4, %zmm4\n  vfmadd132ps %zmm5, %zmm5, %zmm5\n  vaddps %zmm6, %zmm6, %zmm6\n  vfmadd132ps %zmm7, %zmm7, %zmm7\n  vfmadd132ps %zmm8, %zmm8, %zmm8\n  vaddps %zmm9, %zmm9, %zmm9\n  vfmadd132ps %zmm10, %zmm10, %zmm10\n  vfmadd132ps %zmm11, %zmm11, %zmm11\n  vaddps %ymm12, %ymm12, %ymm12\n  vfmadd132ps %zmm13, %zmm13, %zmm13\n  vfmadd132ps %zmm14, %zmm14, %zmm14\n  vaddps %zmm15, %zmm15, %zmm15\n  sub %r9, %rdi\n  jg nemesfpu512mix21_loop\n  pop %r9\n  ret\n\nnemesfpumix21:\n  push %r9\n  mov $16, %r9\n  cvtsi2ss %r9, %xmm0\n  vbroadcastss %xmm0, %ymm1\n  vmovdqa %ymm1, %ymm2\n  vmovdqa %ymm1, %ymm3\n  vmovdqa %ymm1, %ymm4\n  vmovdqa %ymm1, %ymm5\n  vmovdqa %ymm1, %ymm6\n  vmovdqa %ymm1, %ymm7\n  vmovdqa %ymm1, %ymm8\n  vmovdqa %ymm1, %ymm9\n  vmovdqa %ymm1, %ymm10\n  vmovdqa %ymm1, %ymm11\n  vmovdqa %ymm1, %ymm12\n  vmovdqa %ymm1, %ymm13\n  vmovdqa %ymm1, %ymm14\n  vmovdqa %ymm1, %ymm15\nnemesfpumix21_loop:\n  vaddps %ymm0, %ymm0, %ymm0\n  vfmadd132ps %ymm1, %ymm1, %ymm1\n  vfmadd132ps %ymm2, %ymm2, %ymm2\n  vaddps %ymm3, %ymm3, %ymm3\n  vfmadd132ps %ymm4, %ymm4, %ymm4\n  vfmadd132ps 
%ymm5, %ymm5, %ymm5\n  vaddps %ymm6, %ymm6, %ymm6\n  vfmadd132ps %ymm7, %ymm7, %ymm7\n  vfmadd132ps %ymm8, %ymm8, %ymm8\n  vaddps %ymm9, %ymm9, %ymm9\n  vfmadd132ps %ymm10, %ymm10, %ymm10\n  vfmadd132ps %ymm11, %ymm11, %ymm11\n  vaddps %ymm12, %ymm12, %ymm12\n  vfmadd132ps %ymm13, %ymm13, %ymm13\n  vfmadd132ps %ymm14, %ymm14, %ymm14\n  vaddps %ymm15, %ymm15, %ymm15\n  sub %r9, %rdi\n  jg nemesfpumix21_loop\n  pop %r9\n  ret\n\nlatfma512:\n  push %r9\n  push %r8\n  mov $20, %r9\n  movq %r9, %xmm1\n  cvtsi2ss %r9, %xmm6\n  vbroadcastss %xmm6, %zmm6\n  vmovups %zmm6, %zmm5\n  vmovups %zmm6, %zmm7\n  vmovups %zmm6, %zmm8\n  vmovups %zmm6, %zmm9\n  vmovups %zmm6, %zmm10\n  vmovups %zmm6, %zmm11\n  vmovups %zmm6, %zmm12\n  vmovups %zmm6, %zmm13\n  vmovups %zmm6, %zmm14\n  vmovups %zmm6, %zmm15\nlatfma512_loop:\n  vfmadd132ps %zmm6, %zmm5, %zmm7\n  vfmadd132ps %zmm6, %zmm5, %zmm7\n  vfmadd132ps %zmm6, %zmm5, %zmm7\n  vfmadd132ps %zmm6, %zmm5, %zmm7\n  vfmadd132ps %zmm6, %zmm5, %zmm7\n  vfmadd132ps %zmm6, %zmm5, %zmm7\n  vfmadd132ps %zmm6, %zmm5, %zmm7\n  vfmadd132ps %zmm6, %zmm5, %zmm7\n  vfmadd132ps %zmm6, %zmm5, %zmm7\n  vfmadd132ps %zmm6, %zmm5, %zmm7\n  vfmadd132ps %zmm6, %zmm5, %zmm7\n  vfmadd132ps %zmm6, %zmm5, %zmm7\n  vfmadd132ps %zmm6, %zmm5, %zmm7\n  vfmadd132ps %zmm6, %zmm5, %zmm7\n  vfmadd132ps %zmm6, %zmm5, %zmm7\n  vfmadd132ps %zmm6, %zmm5, %zmm7\n  vfmadd132ps %zmm6, %zmm5, %zmm7\n  vfmadd132ps %zmm6, %zmm5, %zmm7\n  vfmadd132ps %zmm6, %zmm5, %zmm7\n  vfmadd132ps %zmm6, %zmm5, %zmm7\n  sub %r9, %rdi\n  jnz latfma512_loop\n  movq %xmm1, %rax\n  vzeroupper\n  pop %r8\n  pop %r9\n  ret\n\nlatfma256:\n  push %r9\n  push %r8\n  mov $20, %r9\n  movq %r9, %xmm1\n  cvtsi2ss %r9, %xmm6\n  vbroadcastss %xmm6, %ymm6\n  vmovups %ymm6, %ymm5\n  vmovups %ymm6, %ymm7\n  vmovups %ymm6, %ymm8\n  vmovups %ymm6, %ymm9\n  vmovups %ymm6, %ymm10\n  vmovups %ymm6, %ymm11\n  vmovups %ymm6, %ymm12\n  vmovups %ymm6, %ymm13\n  vmovups %ymm6, %ymm14\n  vmovups %ymm6, 
%ymm15\nlatfma256_loop:\n  vfmadd132ps %ymm6, %ymm5, %ymm7\n  vfmadd132ps %ymm6, %ymm5, %ymm7\n  vfmadd132ps %ymm6, %ymm5, %ymm7\n  vfmadd132ps %ymm6, %ymm5, %ymm7\n  vfmadd132ps %ymm6, %ymm5, %ymm7\n  vfmadd132ps %ymm6, %ymm5, %ymm7\n  vfmadd132ps %ymm6, %ymm5, %ymm7\n  vfmadd132ps %ymm6, %ymm5, %ymm7\n  vfmadd132ps %ymm6, %ymm5, %ymm7\n  vfmadd132ps %ymm6, %ymm5, %ymm7\n  vfmadd132ps %ymm6, %ymm5, %ymm7\n  vfmadd132ps %ymm6, %ymm5, %ymm7\n  vfmadd132ps %ymm6, %ymm5, %ymm7\n  vfmadd132ps %ymm6, %ymm5, %ymm7\n  vfmadd132ps %ymm6, %ymm5, %ymm7\n  vfmadd132ps %ymm6, %ymm5, %ymm7\n  vfmadd132ps %ymm6, %ymm5, %ymm7\n  vfmadd132ps %ymm6, %ymm5, %ymm7\n  vfmadd132ps %ymm6, %ymm5, %ymm7\n  vfmadd132ps %ymm6, %ymm5, %ymm7\n  sub %r9, %rdi\n  jnz latfma256_loop\n  movq %xmm1, %rax\n  vzeroupper\n  pop %r8\n  pop %r9\n  ret\n\nlatfma128:\n  push %r9\n  push %r8\n  vzeroupper\n  mov $20, %r9\n  movq %r9, %xmm1\n  cvtsi2ss %r9, %xmm6\n  vbroadcastss %xmm6, %xmm6\n  vmovups %xmm6, %xmm5\n  vmovups %xmm6, %xmm7\n  vmovups %xmm6, %xmm8\n  vmovups %xmm6, %xmm9\n  vmovups %xmm6, %xmm10\n  vmovups %xmm6, %xmm11\n  vmovups %xmm6, %xmm12\n  vmovups %xmm6, %xmm13\n  vmovups %xmm6, %xmm14\n  vmovups %xmm6, %xmm15\nlatfma128_loop:\n  vfmadd132ps %xmm6, %xmm5, %xmm7\n  vfmadd132ps %xmm6, %xmm5, %xmm7\n  vfmadd132ps %xmm6, %xmm5, %xmm7\n  vfmadd132ps %xmm6, %xmm5, %xmm7\n  vfmadd132ps %xmm6, %xmm5, %xmm7\n  vfmadd132ps %xmm6, %xmm5, %xmm7\n  vfmadd132ps %xmm6, %xmm5, %xmm7\n  vfmadd132ps %xmm6, %xmm5, %xmm7\n  vfmadd132ps %xmm6, %xmm5, %xmm7\n  vfmadd132ps %xmm6, %xmm5, %xmm7\n  vfmadd132ps %xmm6, %xmm5, %xmm7\n  vfmadd132ps %xmm6, %xmm5, %xmm7\n  vfmadd132ps %xmm6, %xmm5, %xmm7\n  vfmadd132ps %xmm6, %xmm5, %xmm7\n  vfmadd132ps %xmm6, %xmm5, %xmm7\n  vfmadd132ps %xmm6, %xmm5, %xmm7\n  vfmadd132ps %xmm6, %xmm5, %xmm7\n  vfmadd132ps %xmm6, %xmm5, %xmm7\n  vfmadd132ps %xmm6, %xmm5, %xmm7\n  vfmadd132ps %xmm6, %xmm5, %xmm7\n  sub %r9, %rdi\n  jnz latfma128_loop\n  movq %xmm1, %rax\n  
vzeroupper\n  pop %r8\n  pop %r9\n  ret\n\n\nlatadd128fp:\n  push %r9\n  push %r8\n  mov $20, %r9\n  movq %r9, %xmm1\n  cvtsi2ss %r9, %xmm6\n  //vbroadcastss %xmm6, %xmm6\nlatadd128fp_loop:\n  addps %xmm6, %xmm6\n  addps %xmm6, %xmm6\n  addps %xmm6, %xmm6\n  addps %xmm6, %xmm6\n  addps %xmm6, %xmm6\n  addps %xmm6, %xmm6\n  addps %xmm6, %xmm6\n  addps %xmm6, %xmm6\n  addps %xmm6, %xmm6\n  addps %xmm6, %xmm6\n  addps %xmm6, %xmm6\n  addps %xmm6, %xmm6\n  addps %xmm6, %xmm6\n  addps %xmm6, %xmm6\n  addps %xmm6, %xmm6\n  addps %xmm6, %xmm6\n  addps %xmm6, %xmm6\n  addps %xmm6, %xmm6\n  addps %xmm6, %xmm6\n  addps %xmm6, %xmm6\n  sub %r9, %rdi\n  jnz latadd128fp_loop\n  movq %xmm1, %rax\n  pop %r8\n  pop %r9\n  ret\n\nlatmul128fp:\n  push %r9\n  push %r8\n  mov $20, %r9\n  movq %r9, %xmm1\n  cvtsi2ss %r9, %xmm6\n  //vbroadcastss %xmm6, %xmm6\nlatmul128fp_loop:\n  mulps %xmm6, %xmm6\n  mulps %xmm6, %xmm6\n  mulps %xmm6, %xmm6\n  mulps %xmm6, %xmm6\n  mulps %xmm6, %xmm6\n  mulps %xmm6, %xmm6\n  mulps %xmm6, %xmm6\n  mulps %xmm6, %xmm6\n  mulps %xmm6, %xmm6\n  mulps %xmm6, %xmm6\n  mulps %xmm6, %xmm6\n  mulps %xmm6, %xmm6\n  mulps %xmm6, %xmm6\n  mulps %xmm6, %xmm6\n  mulps %xmm6, %xmm6\n  mulps %xmm6, %xmm6\n  mulps %xmm6, %xmm6\n  mulps %xmm6, %xmm6\n  mulps %xmm6, %xmm6\n  mulps %xmm6, %xmm6\n  sub %r9, %rdi\n  jnz latmul128fp_loop\n  movq %xmm1, %rax\n  pop %r8\n  pop %r9\n  ret\n\nmul128fp:\n  push %r9\n  push %r8\n  mov $20, %r9\n  cvtsi2ss %r9, %xmm4\n  cvtsi2ss %r9, %xmm3\n  cvtsi2ss %r9, %xmm2\n  cvtsi2ss %r9, %xmm1\n  cvtsi2ss %r9, %xmm0\nmul128fp_loop:\n  mulps %xmm0, %xmm0\n  mulps %xmm1, %xmm1\n  mulps %xmm2, %xmm2\n  mulps %xmm3, %xmm3\n  mulps %xmm4, %xmm4\n  mulps %xmm0, %xmm0\n  mulps %xmm1, %xmm1\n  mulps %xmm2, %xmm2\n  mulps %xmm3, %xmm3\n  mulps %xmm4, %xmm4\n  mulps %xmm0, %xmm0\n  mulps %xmm1, %xmm1\n  mulps %xmm2, %xmm2\n  mulps %xmm3, %xmm3\n  mulps %xmm4, %xmm4\n  mulps %xmm0, %xmm0\n  mulps %xmm1, %xmm1\n  mulps %xmm2, %xmm2\n  mulps %xmm3, 
%xmm3\n  mulps %xmm4, %xmm4\n  sub %r9, %rdi\n  jnz mul128fp_loop\n  movq %xmm1, %rax\n  pop %r8\n  pop %r9\n  ret\n\nadd128fp:\n  push %r9\n  push %r8\n  mov $20, %r9\n  cvtsi2ss %r9, %xmm4\n  cvtsi2ss %r9, %xmm3\n  cvtsi2ss %r9, %xmm2\n  cvtsi2ss %r9, %xmm1\n  cvtsi2ss %r9, %xmm0\nadd128fp_loop:\n  addps %xmm0, %xmm0\n  addps %xmm1, %xmm1\n  addps %xmm2, %xmm2\n  addps %xmm3, %xmm3\n  addps %xmm4, %xmm4\n  addps %xmm0, %xmm0\n  addps %xmm1, %xmm1\n  addps %xmm2, %xmm2\n  addps %xmm3, %xmm3\n  addps %xmm4, %xmm4\n  addps %xmm0, %xmm0\n  addps %xmm1, %xmm1\n  addps %xmm2, %xmm2\n  addps %xmm3, %xmm3\n  addps %xmm4, %xmm4\n  addps %xmm0, %xmm0\n  addps %xmm1, %xmm1\n  addps %xmm2, %xmm2\n  addps %xmm3, %xmm3\n  addps %xmm4, %xmm4\n  sub %r9, %rdi\n  jnz add128fp_loop\n  movq %xmm1, %rax\n  pop %r8\n  pop %r9\n  ret\n\nlatmul64:\n  push %rbx\n  push %rcx\n  push %r8\n  push %r9\n  push %r10\n  push %r11\n  push %r12\n  push %r13\n  push %r14\n  push %r15\n  mov $1, %r8\n  mov $20, %r9\n  mov %r8, %rbx\n  mov %r8, %rcx\n  mov %r8, %r10\n  mov %r8, %r11\n  mov %r8, %r12\n  mov %r8, %r13\n  mov %r8, %r14\n  mov %r9, %r15\nlatmul64_loop:\n  imul %r9, %r15\n  imul %r9, %r15\n  imul %r9, %r15\n  imul %r9, %r15\n  imul %r9, %r15\n  imul %r9, %r15\n  imul %r9, %r15\n  imul %r9, %r15\n  imul %r9, %r15\n  imul %r9, %r15\n  imul %r9, %r15\n  imul %r9, %r15\n  imul %r9, %r15\n  imul %r9, %r15\n  imul %r9, %r15\n  imul %r9, %r15\n  imul %r9, %r15\n  imul %r9, %r15\n  imul %r9, %r15\n  imul %r9, %r15\n  sub %r9, %rdi\n  jnz latmul64_loop\n  pop %r15\n  pop %r14\n  pop %r13\n  pop %r12\n  pop %r11\n  pop %r10\n  pop %r9\n  pop %r8\n  pop %rcx\n  pop %rbx\n  ret\n\nlatmul16:\n  push %rbx\n  push %rcx\n  push %r8\n  push %r9\n  push %r10\n  push %r11\n  push %r12\n  push %r13\n  push %r14\n  push %r15\n  mov $1, %r8\n  mov $20, %r9\n  mov %r8, %rbx\n  mov %r8, %rcx\n  mov %r8, %r10\n  mov %r8, %r11\n  mov %r8, %r12\n  mov %r8, %r13\n  mov %r8, %r14\n  mov %r9, %r15\nlatmul16_loop:\n  
imul %r9w, %r15w\n  imul %r9w, %r15w\n  imul %r9w, %r15w\n  imul %r9w, %r15w\n  imul %r9w, %r15w\n  imul %r9w, %r15w\n  imul %r9w, %r15w\n  imul %r9w, %r15w\n  imul %r9w, %r15w\n  imul %r9w, %r15w\n  imul %r9w, %r15w\n  imul %r9w, %r15w\n  imul %r9w, %r15w\n  imul %r9w, %r15w\n  imul %r9w, %r15w\n  imul %r9w, %r15w\n  imul %r9w, %r15w\n  imul %r9w, %r15w\n  imul %r9w, %r15w\n  imul %r9w, %r15w\n  sub %r9, %rdi\n  jnz latmul16_loop\n  pop %r15\n  pop %r14\n  pop %r13\n  pop %r12\n  pop %r11\n  pop %r10\n  pop %r9\n  pop %r8\n  pop %rcx\n  pop %rbx\n  ret\n\nmul16:\n  push %rbx\n  push %rcx\n  push %r8\n  push %r9\n  push %r10\n  push %r11\n  push %r12\n  push %r13\n  push %r14\n  push %r15\n  mov $1, %r8\n  mov $20, %r9\n  mov %r8, %rbx\n  mov %r8, %rcx\n  mov %r8, %r10\n  mov %r8, %r11\n  mov %r8, %r12\n  mov %r8, %r13\n  mov %r8, %r14\n  mov %r9, %r15\nmul16_loop:\n  imul %r9w, %r15w\n  imul %r9w, %r14w\n  imul %r9w, %r13w\n  imul %r9w, %r12w\n  imul %r9w, %r11w\n  imul %r9w, %r15w\n  imul %r9w, %r14w\n  imul %r9w, %r13w\n  imul %r9w, %r12w\n  imul %r9w, %r11w\n  imul %r9w, %r15w\n  imul %r9w, %r14w\n  imul %r9w, %r13w\n  imul %r9w, %r12w\n  imul %r9w, %r11w\n  imul %r9w, %r15w\n  imul %r9w, %r14w\n  imul %r9w, %r13w\n  imul %r9w, %r12w\n  imul %r9w, %r11w\n  sub %r9, %rdi\n  jnz mul16_loop\n  pop %r15\n  pop %r14\n  pop %r13\n  pop %r12\n  pop %r11\n  pop %r10\n  pop %r9\n  pop %r8\n  pop %rcx\n  pop %rbx\n  ret\n\nmul64:\n  push %rbx\n  push %rcx\n  push %rsi\n  push %r8\n  push %r9\n  push %r10\n  push %r11\n  push %r12\n  push %r13\n  push %r14\n  push %r15\n  mov $1, %r8\n  mov $20, %r9\n  mov %r8, %rbx\n  mov %r8, %rcx\n  mov %r8, %r10\n  mov %r8, %r11\n  mov %r8, %r12\n  mov %r8, %r13\n  mov %r8, %r14\n  mov %r9, %r15\nmul64_loop:\n  imul %r9, %r15\n  mov %r9, %r15\n  imul %r9, %r14\n  mov %r9, %r14\n  imul %r9, %r13\n  mov %r9, %r13\n  imul %r9, %r12\n  mov %r9, %r12\n  imul %r9, %r11\n  mov %r9, %r11\n  imul %r9, %r10\n  mov %r9, %r10\n  imul %r9, %r8\n  
mov %r9, %r8\n  imul %r9, %rbx\n  mov %r9, %rbx\n  imul %r9, %rcx\n  mov %r9, %rcx\n  imul %r9, %rsi\n  mov %r9, %rsi\n  imul %r9, %r15\n  mov %r9, %r15\n  imul %r9, %r14\n  mov %r9, %r14\n  imul %r9, %r13\n  mov %r9, %r13\n  imul %r9, %r12\n  mov %r9, %r12\n  imul %r9, %r11\n  mov %r9, %r11\n  imul %r9, %r10\n  mov %r9, %r10\n  imul %r9, %r8\n  mov %r9, %r8\n  imul %r9, %rbx\n  mov %r9, %rbx\n  imul %r9, %rcx\n  mov %r9, %rcx\n  imul %r9, %rsi\n  mov %r9, %rsi\n  sub %r9, %rdi\n  jnz mul64_loop\n  pop %r15\n  pop %r14\n  pop %r13\n  pop %r12\n  pop %r11\n  pop %r10\n  pop %r9\n  pop %r8\n  pop %rsi\n  pop %rcx\n  pop %rbx\n  ret\n\nmixmul16mul64:\n  push %rbx\n  push %rcx\n  push %rsi\n  push %r8\n  push %r9\n  push %r10\n  push %r11\n  push %r12\n  push %r13\n  push %r14\n  push %r15\n  mov $1, %r8\n  mov $20, %r9\n  mov %r8, %rbx\n  mov %r8, %rcx\n  mov %r8, %r10\n  mov %r8, %r11\n  mov %r8, %r12\n  mov %r8, %r13\n  mov %r8, %r14\n  mov %r9, %r15\nmixmul16mul64_loop:\n  imul %r9, %r15\n  imul %r9w, %r14w\n  imul %r9, %r13\n  imul %r9w, %r12w\n  imul %r9, %r11\n  imul %r9w, %r10w\n  imul %r9, %r8\n  imul %r9w, %bx\n  imul %r9, %rcx\n  imul %r9w, %si\n  imul %r9, %r15\n  imul %r9w, %r14w\n  imul %r9, %r13\n  imul %r9w, %r12w\n  imul %r9, %r11\n  imul %r9w, %r10w\n  imul %r9, %r8\n  imul %r9w, %bx\n  imul %r9, %rcx\n  imul %r9w, %si\n  sub %r9, %rdi\n  jnz mixmul16mul64_loop\n  pop %r15\n  pop %r14\n  pop %r13\n  pop %r12\n  pop %r11\n  pop %r10\n  pop %r9\n  pop %r8\n  pop %rsi\n  pop %rcx\n  pop %rbx\n  ret\n\nmixmul16mul64_21:\n  push %rbx\n  push %rcx\n  push %rdx\n  push %rsi\n  push %r8\n  push %r9\n  push %r10\n  push %r11\n  push %r12\n  push %r13\n  push %r14\n  push %r15\n  mov $1, %r8\n  mov $24, %r9\n  mov %r8, %rbx\n  mov %r8, %rcx\n  mov %r8, %rsi\n  mov %r8, %r10\n  mov %r8, %r11\n  mov %r8, %r12\n  mov %r8, %r13\n  mov %r8, %r14\n  mov %r9, %r15\nmixmul16mul64_21_loop:\n  imul %r9, %r15\n  imul %r9w, %r14w\n  imul %r9w, %r13w\n  imul %r9, %r12\n  
imul %r9w, %r11w\n  imul %r9w, %r10w\n\n  imul %r9, %r8\n  imul %r9w, %r14w\n  imul %r9w, %r13w\n  imul %r9, %rcx\n  imul %r9w, %r11w\n  imul %r9w, %r10w\n\n  imul %r9, %rbx\n  imul %r9w, %r14w\n  imul %r9w, %r13w\n  imul %r9, %rax\n  imul %r9w, %r11w\n  imul %r9w, %r10w\n\n  imul %r9, %rsi\n  imul %r9w, %r14w\n  imul %r9w, %r13w\n  imul %r9, %rdx\n  imul %r9w, %r11w\n  imul %r9w, %r10w\n\n  sub %r9, %rdi\n  jge mixmul16mul64_21_loop\n  pop %r15\n  pop %r14\n  pop %r13\n  pop %r12\n  pop %r11\n  pop %r10\n  pop %r9\n  pop %r8\n  pop %rsi\n  pop %rdx\n  pop %rcx\n  pop %rbx\n  ret\n\nloadscalar:\n  push %rbx\n  push %rcx\n  push %r8\n  push %r9\n  push %r10\n  push %r11\n  push %r12\n  push %r13\n  push %r14\n  push %r15\n  mov $20, %r9\nloadscalar_loop:\n  mov (%rsi), %r15\n  mov 8(%rsi), %r14\n  mov 16(%rsi), %r13\n  mov 24(%rsi), %r12\n  mov 32(%rsi), %r11\n  mov 40(%rsi), %r10\n  mov 48(%rsi), %r15\n  mov 56(%rsi), %r14\n  mov 64(%rsi), %r13\n  mov 72(%rsi), %r12 \n  mov 80(%rsi), %r11\n  mov 88(%rsi), %r10\n  mov 96(%rsi), %r15\n  mov 104(%rsi), %r14\n  mov 112(%rsi), %r13  \n  mov 120(%rsi), %r12\n  mov 128(%rsi), %r11\n  mov 136(%rsi), %r10\n  mov 144(%rsi), %r15\n  mov 152(%rsi), %r14   \n  sub %r9, %rdi\n  jnz loadscalar_loop\n  pop %r15\n  pop %r14\n  pop %r13\n  pop %r12\n  pop %r11\n  pop %r10\n  pop %r9\n  pop %r8\n  pop %rcx\n  pop %rbx\n  ret \n\nspacedstorescalar:\n  push %rbx\n  push %rcx\n  push %r8\n  push %r9\n  mov $20, %r9\nspacedstorescalar_loop:\n  mov %rdi, (%rsi)\n  mov %rdi, 64(%rsi)\n  mov %rdi, 128(%rsi)\n  mov %rdi, 192(%rsi)\n  mov %rdi, 256(%rsi)\n  mov %rdi, 320(%rsi)\n  mov %rdi, 384(%rsi)\n  mov %rdi, 448(%rsi)\n  mov %rdi, 512(%rsi)\n  mov %rdi, 576(%rsi)\n  mov %rdi, 640(%rsi)\n  mov %rdi, 704(%rsi)\n  mov %rdi, 768(%rsi)\n  mov %rdi, 832(%rsi)\n  mov %rdi, 896(%rsi)\n  mov %rdi, 960(%rsi)\n  mov %rdi, 1024(%rsi)\n  mov %rdi, 1088(%rsi)\n  mov %rdi, 1152(%rsi)\n  mov %rdi, 1216(%rsi)\n  sub %r9, %rdi\n  jnz 
spacedstorescalar_loop\n  pop %r9\n  pop %r8\n  pop %rcx\n  pop %rbx\n  ret\n\nmixedscalarloadstore:\n  push %rbx\n  push %rcx\n  push %r8\n  push %r9\n  push %r10\n  push %r11\n  push %r12\n  push %r13\n  push %r14\n  push %r15\n  mov $12, %r9\nmixedscalarloadstore_loop:\n  mov (%rsi), %r15\n  mov 8(%rsi), %r14\n  mov %r9, 400(%rsi)\n\n  mov 16(%rsi), %r13\n  mov 24(%rsi), %r12\n  mov %r9, 408(%rsi)\n\n  mov 32(%rsi), %r11\n  mov 40(%rsi), %r10\n  mov %r9, 416(%rsi)\n\n  mov 48(%rsi), %r15\n  mov 56(%rsi), %r14\n  mov %r9, 424(%rsi)\n\n  sub %r9, %rdi\n  jg mixedscalarloadstore_loop\n  pop %r15\n  pop %r14\n  pop %r13\n  pop %r12\n  pop %r11\n  pop %r10\n  pop %r9\n  pop %r8\n  pop %rcx\n  pop %rbx\n  ret\n\n\nspacedload128:\n  push %rbx\n  push %rcx\n  push %r8\n  push %r9\n  mov $20, %r9\nspacedload128_loop:\n  movdqa (%rsi), %xmm10\n  movdqa 64(%rsi), %xmm11\n  movdqa 128(%rsi), %xmm12\n  movdqa 192(%rsi), %xmm13\n  movdqa 256(%rsi), %xmm14\n  movdqa 320(%rsi), %xmm10\n  movdqa 384(%rsi), %xmm11\n  movdqa 448(%rsi), %xmm12\n  movdqa 512(%rsi), %xmm13\n  movdqa 576(%rsi), %xmm14\n  movdqa 640(%rsi), %xmm10\n  movdqa 704(%rsi), %xmm11\n  movdqa 768(%rsi), %xmm12\n  movdqa 832(%rsi), %xmm13\n  movdqa 896(%rsi), %xmm14\n  movdqa 960(%rsi), %xmm10\n  movdqa 1024(%rsi), %xmm11\n  movdqa 1088(%rsi), %xmm12\n  movdqa 1152(%rsi), %xmm13\n  movdqa 1216(%rsi), %xmm14\n  sub %r9, %rdi\n  jnz spacedload128_loop\n  pop %r9\n  pop %r8\n  pop %rcx\n  pop %rbx\n  ret\n\nload128:\n  push %rbx\n  push %rcx\n  push %r8\n  push %r9\n  mov $20, %r9\nload128_loop:\n  movdqa (%rsi), %xmm10\n  movdqa (%rsi), %xmm11\n  movdqa (%rsi), %xmm12\n  movdqa (%rsi), %xmm13\n  movdqa (%rsi), %xmm14\n  movdqa (%rsi), %xmm10\n  movdqa (%rsi), %xmm11\n  movdqa (%rsi), %xmm12\n  movdqa (%rsi), %xmm13\n  movdqa (%rsi), %xmm14\n  movdqa (%rsi), %xmm10\n  movdqa (%rsi), %xmm11\n  movdqa (%rsi), %xmm12\n  movdqa (%rsi), %xmm13\n  movdqa (%rsi), %xmm14\n  movdqa (%rsi), %xmm10\n  movdqa (%rsi), %xmm11\n  
movdqa (%rsi), %xmm12\n  movdqa (%rsi), %xmm13\n  movdqa (%rsi), %xmm14\n  sub %r9, %rdi\n  jnz load128_loop\n  pop %r9\n  pop %r8\n  pop %rcx\n  pop %rbx\n  ret\n\nload256:\n  push %rbx\n  push %rcx\n  push %r8\n  push %r9\n  mov $20, %r9\nload256_loop:\n  vmovaps (%rsi), %ymm10\n  vmovaps (%rsi), %ymm11\n  vmovaps (%rsi), %ymm12\n  vmovaps (%rsi), %ymm13\n  vmovaps (%rsi), %ymm14\n  vmovaps (%rsi), %ymm10\n  vmovaps (%rsi), %ymm11\n  vmovaps (%rsi), %ymm12\n  vmovaps (%rsi), %ymm13\n  vmovaps (%rsi), %ymm14\n  vmovaps (%rsi), %ymm10\n  vmovaps (%rsi), %ymm11\n  vmovaps (%rsi), %ymm12\n  vmovaps (%rsi), %ymm13\n  vmovaps (%rsi), %ymm14\n  vmovaps (%rsi), %ymm10\n  vmovaps (%rsi), %ymm11\n  vmovaps (%rsi), %ymm12\n  vmovaps (%rsi), %ymm13\n  vmovaps (%rsi), %ymm14\n  sub %r9, %rdi\n  jnz load256_loop\n  pop %r9\n  pop %r8\n  pop %rcx\n  pop %rbx\n  ret\n\nload512:\n  push %rbx\n  push %rcx\n  push %r8\n  push %r9\n  mov $20, %r9\nload512_loop:\n  vmovaps (%rsi), %zmm10\n  vmovaps (%rsi), %zmm11\n  vmovaps (%rsi), %zmm12\n  vmovaps (%rsi), %zmm13\n  vmovaps (%rsi), %zmm14\n  vmovaps (%rsi), %zmm10\n  vmovaps (%rsi), %zmm11\n  vmovaps (%rsi), %zmm12\n  vmovaps (%rsi), %zmm13\n  vmovaps (%rsi), %zmm14\n  vmovaps (%rsi), %zmm10\n  vmovaps (%rsi), %zmm11\n  vmovaps (%rsi), %zmm12\n  vmovaps (%rsi), %zmm13\n  vmovaps (%rsi), %zmm14\n  vmovaps (%rsi), %zmm10\n  vmovaps (%rsi), %zmm11\n  vmovaps (%rsi), %zmm12\n  vmovaps (%rsi), %zmm13\n  vmovaps (%rsi), %zmm14\n  sub %r9, %rdi\n  jnz load512_loop\n  pop %r9\n  pop %r8\n  pop %rcx\n  pop %rbx\n  ret\n\nstore128:\n  push %rbx\n  push %rcx\n  push %r8\n  push %r9\n  movdqa (%rsi), %xmm10\n  movdqa %xmm10, %xmm11\n  movdqa %xmm10, %xmm12\n  movdqa %xmm10, %xmm13\n  movdqa %xmm10, %xmm14\n  mov $20, %r9\nstore128_loop:\n  movdqa %xmm10, (%rdx)\n  movdqa %xmm11, (%rdx)\n  movdqa %xmm12, (%rdx)\n  movdqa %xmm13, (%rdx)\n  movdqa %xmm14, (%rdx)\n  movdqa %xmm10, (%rdx)\n  movdqa %xmm11, (%rdx)\n  movdqa %xmm12, (%rdx)\n  movdqa 
%xmm13, (%rdx)\n  movdqa %xmm14, (%rdx)\n  movdqa %xmm10, (%rdx)\n  movdqa %xmm11, (%rdx)\n  movdqa %xmm12, (%rdx)\n  movdqa %xmm13, (%rdx)\n  movdqa %xmm14, (%rdx)\n  movdqa %xmm10, (%rdx)\n  movdqa %xmm11, (%rdx)\n  movdqa %xmm12, (%rdx)\n  movdqa %xmm13, (%rdx)\n  movdqa %xmm14, (%rdx)\n  sub %r9, %rdi\n  jnz store128_loop\n  pop %r9\n  pop %r8\n  pop %rcx\n  pop %rbx\n  ret\n\nstore256:\n  push %rbx\n  push %rcx\n  push %r8\n  push %r9\n  vmovaps (%rsi), %ymm10\n  vmovaps %ymm10, %ymm11\n  vmovaps %ymm10, %ymm12\n  vmovaps %ymm10, %ymm13\n  vmovaps %ymm10, %ymm14\n  mov $20, %r9\nstore256_loop:\n  vmovaps %ymm10, (%rdx)\n  vmovaps %ymm11, (%rdx)\n  vmovaps %ymm12, (%rdx)\n  vmovaps %ymm13, (%rdx)\n  vmovaps %ymm14, (%rdx)\n  vmovaps %ymm10, (%rdx)\n  vmovaps %ymm11, (%rdx)\n  vmovaps %ymm12, (%rdx)\n  vmovaps %ymm13, (%rdx)\n  vmovaps %ymm14, (%rdx)\n  vmovaps %ymm10, (%rdx)\n  vmovaps %ymm11, (%rdx)\n  vmovaps %ymm12, (%rdx)\n  vmovaps %ymm13, (%rdx)\n  vmovaps %ymm14, (%rdx)\n  vmovaps %ymm10, (%rdx)\n  vmovaps %ymm11, (%rdx)\n  vmovaps %ymm12, (%rdx)\n  vmovaps %ymm13, (%rdx)\n  vmovaps %ymm14, (%rdx)\n  sub %r9, %rdi\n  jnz store256_loop\n  pop %r9\n  pop %r8\n  pop %rcx\n  pop %rbx\n  ret\n\nstore512:\n  push %rbx\n  push %rcx\n  push %r8\n  push %r9\n  vmovaps (%rsi), %zmm10\n  vmovaps %zmm10, %zmm11\n  vmovaps %zmm10, %zmm12\n  vmovaps %zmm10, %zmm13\n  vmovaps %zmm10, %zmm14\n  mov $20, %r9\nstore512_loop:\n  vmovaps %zmm10, (%rdx)\n  vmovaps %zmm11, (%rdx)\n  vmovaps %zmm12, (%rdx)\n  vmovaps %zmm13, (%rdx)\n  vmovaps %zmm14, (%rdx)\n  vmovaps %zmm10, (%rdx)\n  vmovaps %zmm11, (%rdx)\n  vmovaps %zmm12, (%rdx)\n  vmovaps %zmm13, (%rdx)\n  vmovaps %zmm14, (%rdx)\n  vmovaps %zmm10, (%rdx)\n  vmovaps %zmm11, (%rdx)\n  vmovaps %zmm12, (%rdx)\n  vmovaps %zmm13, (%rdx)\n  vmovaps %zmm14, (%rdx)\n  vmovaps %zmm10, (%rdx)\n  vmovaps %zmm11, (%rdx)\n  vmovaps %zmm12, (%rdx)\n  vmovaps %zmm13, (%rdx)\n  vmovaps %zmm14, (%rdx)\n  sub %r9, %rdi\n  jnz 
store512_loop\n  pop %r9\n  pop %r8\n  pop %rcx\n  pop %rbx\n  ret\n\npdeptest:\n  push %rbx\n  push %rcx\n  push %r8\n  push %r9\n  push %r10\n  push %r11\n  push %r12\n  push %r13\n  push %r14\n  push %r15\n  mov $1, %r8\n  mov $20, %r9\n  xor %rbx, %rbx\n  xor %rcx, %rcx\n  xor %r10, %r10\n  xor %r11, %r11\n  xor %r12, %r12\n  xor %r13, %r13\n  xor %r14, %r14\n  xor %r15, %r15\npdeptest_loop:\n  pdep %r8, %r15, %r15\n  pdep %r8, %r14, %r14\n  pdep %r8, %r13, %r13\n  pdep %r8, %r12, %r12\n  pdep %r8, %r11, %r11\n  pdep %r8, %r10, %r10\n  pdep %r8, %rcx, %rcx\n  pdep %r8, %rbx, %rbx\n  pdep %r8, %r15, %r15\n  pdep %r8, %r14, %r14\n  pdep %r8, %r13, %r13\n  pdep %r8, %r12, %r12\n  pdep %r8, %r11, %r11\n  pdep %r8, %r10, %r10\n  pdep %r8, %rcx, %rcx\n  pdep %r8, %rbx, %rbx\n  pdep %r8, %r15, %r15\n  pdep %r8, %r14, %r14\n  pdep %r8, %r13, %r13\n  pdep %r8, %r12, %r12\n  sub %r9, %rdi\n  jnz pdeptest_loop\n  pop %r15\n  pop %r14\n  pop %r13\n  pop %r12\n  pop %r11\n  pop %r10\n  pop %r9\n  pop %r8\n  pop %rcx\n  pop %rbx\n  ret\n\npdepmultest:\n  push %rbx\n  push %rcx\n  push %rsi\n  push %r8\n  push %r9\n  push %r10\n  push %r11\n  push %r12\n  push %r13\n  push %r14\n  push %r15\n  mov $1, %r8\n  mov $20, %r9\n  xor %rbx, %rbx\n  xor %rcx, %rcx\n  xor %rsi, %rsi\n  xor %r10, %r10\n  xor %r11, %r11\n  xor %r12, %r12\n  xor %r13, %r13\n  xor %r14, %r14\n  xor %r15, %r15\npdepmultest_loop:\n  pdep %r8, %r15, %r15\n  imul %r9, %r14\n  pdep %r8, %r13, %r13\n  imul %r9, %r12\n  pdep %r8, %r11, %r11\n  imul %r9, %r10\n  pdep %r8, %rcx, %rcx\n  imul %r9, %rbx\n  pdep %r8, %r15, %r15\n  imul %r9, %rsi\n  pdep %r8, %r15, %r15\n  imul %r9, %r14\n  pdep %r8, %r13, %r13\n  imul %r9, %r12\n  pdep %r8, %r11, %r11\n  imul %r9, %r10\n  pdep %r8, %rcx, %rcx\n  imul %r9, %rbx\n  pdep %r8, %r15, %r15\n  imul %r9, %rsi\n  sub %r9, %rdi\n  jnz pdepmultest_loop\n  pop %r15\n  pop %r14\n  pop %r13\n  pop %r12\n  pop %r11\n  pop %r10\n  pop %r9\n  pop %r8\n  pop %rsi\n  pop %rcx\n  pop 
%rbx\n  ret\n\n\npexttest:\n  push %rbx\n  push %rcx\n  push %r8\n  push %r9\n  push %r10\n  push %r11\n  push %r12\n  push %r13\n  push %r14\n  push %r15\n  mov $1, %r8\n  mov $20, %r9\n  xor %rbx, %rbx\n  xor %rcx, %rcx\n  xor %r10, %r10\n  xor %r11, %r11\n  xor %r12, %r12\n  xor %r13, %r13\n  xor %r14, %r14\n  xor %r15, %r15\npexttest_loop:\n  pext %r8, %r15, %r15\n  pext %r8, %r14, %r14\n  pext %r8, %r13, %r13\n  pext %r8, %r12, %r12\n  pext %r8, %r11, %r11\n  pext %r8, %r10, %r10\n  pext %r8, %rcx, %rcx\n  pext %r8, %rbx, %rbx\n  pext %r8, %r15, %r15\n  pext %r8, %r14, %r14\n  pext %r8, %r13, %r13\n  pext %r8, %r12, %r12\n  pext %r8, %r11, %r11\n  pext %r8, %r10, %r10\n  pext %r8, %rcx, %rcx\n  pext %r8, %rbx, %rbx\n  pext %r8, %r15, %r15\n  pext %r8, %r14, %r14\n  pext %r8, %r13, %r13\n  pext %r8, %r12, %r12\n  sub %r9, %rdi\n  jnz pexttest_loop\n  pop %r15\n  pop %r14\n  pop %r13\n  pop %r12\n  pop %r11\n  pop %r10\n  pop %r9\n  pop %r8\n  pop %rcx\n  pop %rbx\n  ret\n\ndepmovtest:\n  push %rbx\n  push %r8\n  push %r9\n  push %r15\n  push %r14\n  push %r13\n  push %r12\n  push %r11\n  push %r10\n  mov $1, %r8\n  mov $20, %r9\n  xor %rbx, %rbx\ndepmovtest_loop:\n  mov %r15, %r12\n  mov %r12, %r14\n  mov %r14, %r13\n  mov %r13, %r11\n  mov %r11, %r15\n  mov %r15, %r12\n  mov %r12, %r14\n  mov %r14, %r13\n  mov %r13, %r11\n  mov %r11, %r15\n  mov %r15, %r12\n  mov %r12, %r14\n  mov %r14, %r13\n  mov %r13, %r11\n  mov %r11, %r15\n  mov %r15, %r12\n  mov %r12, %r14\n  mov %r14, %r13\n  mov %r13, %r11\n  mov %r11, %r15\n  sub %r9, %rdi\n  jnz depmovtest_loop\n  pop %r10\n  pop %r11\n  pop %r12\n  pop %r13\n  pop %r14\n  pop %r15\n  pop %r9\n  pop %r8\n  pop %rbx\n  ret\n\nvecdepmovtest:\n  push %rbx\n  push %r8\n  push %r9\n  push %r15\n  push %r14\n  push %r13\n  push %r12\n  push %r11\n  push %r10\n  mov $1, %r8\n  mov $20, %r9\n  cvtsi2ss %r9, %xmm15\n  xor %rbx, %rbx\nvecdepmovtest_loop:\n  movaps %xmm15, %xmm12\n  movaps %xmm12, %xmm14\n  movaps %xmm14, 
%xmm13\n  movaps %xmm13, %xmm11\n  movaps %xmm11, %xmm15\n  movaps %xmm15, %xmm12\n  movaps %xmm12, %xmm14\n  movaps %xmm14, %xmm13\n  movaps %xmm13, %xmm11\n  movaps %xmm11, %xmm15\n  movaps %xmm15, %xmm12\n  movaps %xmm12, %xmm14\n  movaps %xmm14, %xmm13\n  movaps %xmm13, %xmm11\n  movaps %xmm11, %xmm15\n  movaps %xmm15, %xmm12\n  movaps %xmm12, %xmm14\n  movaps %xmm14, %xmm13\n  movaps %xmm13, %xmm11\n  movaps %xmm11, %xmm15\n  sub %r9, %rdi\n  jnz vecdepmovtest_loop\n  pop %r10\n  pop %r11\n  pop %r12\n  pop %r13\n  pop %r14\n  pop %r15\n  pop %r9\n  pop %r8\n  pop %rbx\n  ret \n\nvecindepmovtest:\n  push %rbx\n  push %rcx\n  push %r8\n  push %r9\n  push %r15\n  push %r14\n  push %r13\n  push %r12\n  push %r11\n  push %r10\n  mov $1, %r8\n  mov $20, %r9\n  cvtsi2ss %r9, %xmm0\n  movaps %xmm0, %xmm1\n  movaps %xmm0, %xmm10\n  movaps %xmm0, %xmm11\n  movaps %xmm0, %xmm12\n  xor %rbx, %rbx\nvecindepmovtest_loop:\n  movaps %xmm10, %xmm15\n  movaps %xmm11, %xmm14\n  movaps %xmm12, %xmm13\n  movaps %xmm0, %xmm15\n  movaps %xmm1, %xmm14\n  movaps %xmm10, %xmm15\n  movaps %xmm11, %xmm14\n  movaps %xmm12, %xmm13\n  movaps %xmm0, %xmm15\n  movaps %xmm1, %xmm14\n  movaps %xmm10, %xmm15\n  movaps %xmm11, %xmm14\n  movaps %xmm12, %xmm13\n  movaps %xmm0, %xmm15\n  movaps %xmm1, %xmm14\n  movaps %xmm10, %xmm15\n  movaps %xmm11, %xmm14\n  movaps %xmm12, %xmm13\n  movaps %xmm0, %xmm15\n  movaps %xmm1, %xmm14\n  sub %r9, %rdi\n  jnz vecindepmovtest_loop\n  pop %r10\n  pop %r11\n  pop %r12\n  pop %r13\n  pop %r14\n  pop %r15\n  pop %r9\n  pop %r8\n  pop %rcx\n  pop %rbx\n  ret\n\nindepmovtest:\n  push %rbx\n  push %rcx\n  push %r8\n  push %r9\n  push %r15\n  push %r14\n  push %r13\n  push %r12\n  push %r11\n  push %r10\n  mov $1, %r8\n  mov $20, %r9\n  xor %rbx, %rbx\nindepmovtest_loop:\n  mov %r10, %r15\n  mov %r11, %r14\n  mov %r12, %r13\n  mov %rax, %r15\n  mov %rcx, %r14\n  mov %r10, %r15\n  mov %r11, %r14\n  mov %r12, %r13\n  mov %rax, %r15\n  mov %rcx, %r14\n  mov %r10, 
%r15\n  mov %r11, %r14\n  mov %r12, %r13\n  mov %rax, %r15\n  mov %rcx, %r14\n  mov %r10, %r15\n  mov %r11, %r14\n  mov %r12, %r13\n  mov %rax, %r15\n  mov %rcx, %r14\n  sub %r9, %rdi\n  jnz indepmovtest_loop\n  pop %r10\n  pop %r11\n  pop %r12\n  pop %r13\n  pop %r14\n  pop %r15\n  pop %r9\n  pop %r8\n  pop %rcx\n  pop %rbx\n  ret\n\nmovzerotest:\n  push %rbx\n  push %rcx\n  push %r8\n  push %r9\n  push %r15\n  push %r14\n  push %r13\n  push %r12\n  push %r11\n  push %r10\n  mov $1, %r8\n  mov $20, %r9\n  xor %rbx, %rbx\nmovzerotest_loop:\n  mov $0, %r15\n  mov $0, %r15\n  mov $0, %r15\n  mov $0, %r15\n  mov $0, %r15\n  mov $0, %r15\n  mov $0, %r15\n  mov $0, %r15\n  mov $0, %r15\n  mov $0, %r15\n  mov $0, %r15\n  mov $0, %r15\n  mov $0, %r15\n  mov $0, %r15\n  mov $0, %r15\n  mov $0, %r15\n  mov $0, %r15\n  mov $0, %r15\n  mov $0, %r15\n  mov $0, %r15\n  sub %r9, %rdi\n  jnz movzerotest_loop\n  pop %r10\n  pop %r11\n  pop %r12\n  pop %r13\n  pop %r14\n  pop %r15\n  pop %r9\n  pop %r8\n  pop %rcx\n  pop %rbx\n  ret\n\nxorzerotest:\n  push %rbx\n  push %rcx\n  push %r8\n  push %r9\n  push %r15\n  push %r14\n  push %r13\n  push %r12\n  push %r11\n  push %r10\n  mov $1, %r8\n  mov $20, %r9\n  xor %rbx, %rbx\nxorzerotest_loop:\n  xor %r15, %r15\n  xor %r15, %r15\n  xor %r15, %r15\n  xor %r15, %r15\n  xor %r15, %r15\n  xor %r15, %r15\n  xor %r15, %r15\n  xor %r15, %r15\n  xor %r15, %r15\n  xor %r15, %r15\n  xor %r15, %r15\n  xor %r15, %r15\n  xor %r15, %r15\n  xor %r15, %r15\n  xor %r15, %r15\n  xor %r15, %r15\n  xor %r15, %r15\n  xor %r15, %r15\n  xor %r15, %r15\n  xor %r15, %r15\n  sub %r9, %rdi\n  jnz xorzerotest_loop\n  pop %r10\n  pop %r11\n  pop %r12\n  pop %r13\n  pop %r14\n  pop %r15\n  pop %r9\n  pop %r8\n  pop %rcx\n  pop %rbx\n  ret\n\nvecxorzerotest:\n  push %rbx\n  push %rcx\n  push %r8\n  push %r9\n  push %r15\n  push %r14\n  push %r13\n  push %r12\n  push %r11\n  push %r10\n  mov $1, %r8\n  mov $20, %r9\n  cvtsi2ss %r9, %xmm0\n  xor %rbx, 
%rbx\nvecxorzerotest_loop:\n  xorps %xmm0, %xmm0\n  xorps %xmm0, %xmm0\n  xorps %xmm0, %xmm0\n  xorps %xmm0, %xmm0\n  xorps %xmm0, %xmm0\n  xorps %xmm0, %xmm0\n  xorps %xmm0, %xmm0\n  xorps %xmm0, %xmm0\n  xorps %xmm0, %xmm0\n  xorps %xmm0, %xmm0\n  xorps %xmm0, %xmm0\n  xorps %xmm0, %xmm0\n  xorps %xmm0, %xmm0\n  xorps %xmm0, %xmm0\n  xorps %xmm0, %xmm0\n  xorps %xmm0, %xmm0\n  xorps %xmm0, %xmm0\n  xorps %xmm0, %xmm0\n  xorps %xmm0, %xmm0\n  xorps %xmm0, %xmm0\n  sub %r9, %rdi\n  jnz vecxorzerotest_loop\n  pop %r10\n  pop %r11\n  pop %r12\n  pop %r13\n  pop %r14\n  pop %r15\n  pop %r9\n  pop %r8\n  pop %rcx\n  pop %rbx\n  ret \n\nsubzerotest:\n  push %rbx\n  push %rcx\n  push %r8\n  push %r9\n  push %r15\n  push %r14\n  push %r13\n  push %r12\n  push %r11\n  push %r10\n  mov $1, %r8\n  mov $20, %r9\n  xor %rbx, %rbx\nsubzerotest_loop:\n  sub %r15, %r15\n  sub %r15, %r15\n  sub %r15, %r15\n  sub %r15, %r15\n  sub %r15, %r15\n  sub %r15, %r15\n  sub %r15, %r15\n  sub %r15, %r15\n  sub %r15, %r15\n  sub %r15, %r15\n  sub %r15, %r15\n  sub %r15, %r15\n  sub %r15, %r15\n  sub %r15, %r15\n  sub %r15, %r15\n  sub %r15, %r15\n  sub %r15, %r15\n  sub %r15, %r15\n  sub %r15, %r15\n  sub %r15, %r15\n  sub %r9, %rdi\n  jnz subzerotest_loop\n  pop %r10\n  pop %r11\n  pop %r12\n  pop %r13\n  pop %r14\n  pop %r15\n  pop %r9\n  pop %r8\n  pop %rcx\n  pop %rbx\n  ret\n\nvecsubzerotest:\n  push %rbx\n  push %rcx\n  push %r8\n  push %r9\n  push %r15\n  push %r14\n  push %r13\n  push %r12\n  push %r11\n  push %r10\n  mov $1, %r8\n  mov $20, %r9\n  cvtsi2ss %r9, %xmm15\n  xor %rbx, %rbx\nvecsubzerotest_loop:\n  subss %xmm15, %xmm15\n  subss %xmm15, %xmm15\n  subss %xmm15, %xmm15\n  subss %xmm15, %xmm15\n  subss %xmm15, %xmm15\n  subss %xmm15, %xmm15\n  subss %xmm15, %xmm15\n  subss %xmm15, %xmm15\n  subss %xmm15, %xmm15\n  subss %xmm15, %xmm15\n  subss %xmm15, %xmm15\n  subss %xmm15, %xmm15\n  subss %xmm15, %xmm15\n  subss %xmm15, %xmm15\n  subss %xmm15, %xmm15\n  subss %xmm15, 
%xmm15\n  subss %xmm15, %xmm15\n  subss %xmm15, %xmm15\n  subss %xmm15, %xmm15\n  subss %xmm15, %xmm15\n  sub %r9, %rdi\n  jnz vecsubzerotest_loop\n  pop %r10\n  pop %r11\n  pop %r12\n  pop %r13\n  pop %r14\n  pop %r15\n  pop %r9\n  pop %r8\n  pop %rcx\n  pop %rbx\n  ret \n\ndepaddimmtest:\n  push %rbx\n  push %rcx\n  push %r8\n  push %r9\n  push %r15\n  push %r14\n  push %r13\n  push %r12\n  push %r11\n  push %r10\n  mov $1, %r8\n  mov $20, %r9\n  xor %rbx, %rbx\ndepaddimmtest_loop:\n  add $1, %r15\n  add $2, %r15\n  add $3, %r15\n  add $4, %r15\n  add $5, %r15\n  add $6, %r15\n  add $7, %r15\n  add $8, %r15\n  add $9, %r15\n  add $10, %r15\n  add $11, %r15\n  add $12, %r15\n  add $13, %r15\n  add $14, %r15\n  add $15, %r15\n  add $16, %r15\n  add $17, %r15\n  add $18, %r15\n  add $19, %r15\n  add $20, %r15\n  sub %r9, %rdi\n  jnz depaddimmtest_loop\n  pop %r10\n  pop %r11\n  pop %r12\n  pop %r13\n  pop %r14\n  pop %r15\n  pop %r9\n  pop %r8\n  pop %rcx\n  pop %rbx\n  ret\n\nmemrenametest:\n  push %rbx\n  push %rcx\n  push %r8\n  push %r9\n  push %r15\n  push %r14\n  push %r13\n  push %r12\n  push %r11\n  push %r10\n  mov $10, %r10\n  mov $1, %r8\n  mov $20, %r9\n  xor %rbx, %rbx \nmemrenametest_loop:\n  .rept 20\n  mov %r10, (%rsi)\n  mov (%rsi), %r10\n  .endr\n  sub %r9, %rdi\n  jnz memrenametest_loop\n  pop %r10\n  pop %r11\n  pop %r12\n  pop %r13\n  pop %r14\n  pop %r15\n  pop %r9\n  pop %r8\n  pop %rcx\n  pop %rbx\n  ret \n\ndepinctest:\n  push %rbx\n  push %rcx\n  push %r8\n  push %r9\n  push %r15\n  push %r14\n  push %r13\n  push %r12\n  push %r11\n  push %r10\n  mov $1, %r8\n  mov $20, %r9\n  xor %rbx, %rbx\ndepinctest_loop:\n  inc %r15\n  inc %r15\n  inc %r15\n  inc %r15\n  inc %r15\n  inc %r15\n  inc %r15\n  inc %r15\n  inc %r15\n  inc %r15\n  inc %r15\n  inc %r15\n  inc %r15\n  inc %r15\n  inc %r15\n  inc %r15\n  inc %r15\n  inc %r15\n  inc %r15\n  inc %r15\n  sub %r9, %rdi\n  jnz depinctest_loop\n  pop %r10\n  pop %r11\n  pop %r12\n  pop %r13\n  pop 
%r14\n  pop %r15\n  pop %r9\n  pop %r8\n  pop %rcx\n  pop %rbx\n  ret\n\ndepdectest:\n  push %rbx\n  push %rcx\n  push %r8\n  push %r9\n  push %r15\n  push %r14\n  push %r13\n  push %r12\n  push %r11\n  push %r10\n  mov $1, %r8\n  mov $20, %r9\n  xor %r15, %r15\n  not %r15\n  xor %rbx, %rbx\ndepdectest_loop:\n  dec %r15\n  dec %r15\n  dec %r15\n  dec %r15\n  dec %r15\n  dec %r15\n  dec %r15\n  dec %r15\n  dec %r15\n  dec %r15\n  dec %r15\n  dec %r15\n  dec %r15\n  dec %r15\n  dec %r15\n  dec %r15\n  dec %r15\n  dec %r15\n  dec %r15\n  dec %r15\n  sub %r9, %rdi\n  jnz depdectest_loop\n  pop %r10\n  pop %r11\n  pop %r12\n  pop %r13\n  pop %r14\n  pop %r15\n  pop %r9\n  pop %r8\n  pop %rcx\n  pop %rbx\n  ret\n\n/* FMA4 tests */\nfma4_256:\n  push %r9\n  push %r8\n  mov $20, %r9\n  movq %r9, %xmm1\n  cvtsi2ss %r9, %xmm6\n  movups %xmm6, -32(%rsp)\n  vbroadcastss -32(%rsp), %ymm6\n  vmovups %ymm6, %ymm5\n  vmovups %ymm6, %ymm7\n  vmovups %ymm6, %ymm8\n  vmovups %ymm6, %ymm9\n  vmovups %ymm6, %ymm10\n  vmovups %ymm6, %ymm11\n  vmovups %ymm6, %ymm12\n  vmovups %ymm6, %ymm13\n  vmovups %ymm6, %ymm14\n  vmovups %ymm6, %ymm15\nfma4_256_loop:\n  vfmaddps %ymm6, %ymm6, %ymm5, %ymm5\n  vfmaddps %ymm6, %ymm6, %ymm7, %ymm7\n  vfmaddps %ymm6, %ymm6, %ymm8, %ymm8\n  vfmaddps %ymm6, %ymm6, %ymm9, %ymm9\n  vfmaddps %ymm6, %ymm6, %ymm10, %ymm10\n  vfmaddps %ymm6, %ymm6, %ymm11, %ymm11\n  vfmaddps %ymm6, %ymm6, %ymm12, %ymm12\n  vfmaddps %ymm6, %ymm6, %ymm13, %ymm13\n  vfmaddps %ymm6, %ymm6, %ymm14, %ymm14\n  vfmaddps %ymm6, %ymm6, %ymm15, %ymm15\n  vfmaddps %ymm6, %ymm6, %ymm5, %ymm5\n  vfmaddps %ymm6, %ymm6, %ymm7, %ymm7\n  vfmaddps %ymm6, %ymm6, %ymm8, %ymm8\n  vfmaddps %ymm6, %ymm6, %ymm9, %ymm9\n  vfmaddps %ymm6, %ymm6, %ymm10, %ymm10\n  vfmaddps %ymm6, %ymm6, %ymm11, %ymm11\n  vfmaddps %ymm6, %ymm6, %ymm12, %ymm12\n  vfmaddps %ymm6, %ymm6, %ymm13, %ymm13\n  vfmaddps %ymm6, %ymm6, %ymm14, %ymm14\n  vfmaddps %ymm6, %ymm6, %ymm15, %ymm15\n  sub %r9, %rdi\n  jnz fma4_256_loop\n  movq 
%xmm1, %rax\n  vzeroupper\n  pop %r8\n  pop %r9\n  ret\n\nfma4_128:\n  push %r9\n  push %r8\n  mov $20, %r9\n  movq %r9, %xmm1\n  cvtsi2ss %r9, %xmm6\n  movups %xmm6, -32(%rsp)\n  vbroadcastss -32(%rsp), %xmm6\n  movups %xmm6, %xmm5\n  movups %xmm6, %xmm7\n  movups %xmm6, %xmm8\n  movups %xmm6, %xmm9\n  movups %xmm6, %xmm10\n  movups %xmm6, %xmm11\n  movups %xmm6, %xmm12\n  movups %xmm6, %xmm13\n  movups %xmm6, %xmm14\n  movups %xmm6, %xmm15\nfma4_128_loop:\n  vfmaddps %xmm6, %xmm6, %xmm5, %xmm5\n  vfmaddps %xmm6, %xmm6, %xmm7, %xmm7\n  vfmaddps %xmm6, %xmm6, %xmm8, %xmm8\n  vfmaddps %xmm6, %xmm6, %xmm9, %xmm9\n  vfmaddps %xmm6, %xmm6, %xmm10, %xmm10\n  vfmaddps %xmm6, %xmm6, %xmm11, %xmm11\n  vfmaddps %xmm6, %xmm6, %xmm12, %xmm12\n  vfmaddps %xmm6, %xmm6, %xmm13, %xmm13\n  vfmaddps %xmm6, %xmm6, %xmm14, %xmm14\n  vfmaddps %xmm6, %xmm6, %xmm15, %xmm15\n  vfmaddps %xmm6, %xmm6, %xmm5, %xmm5\n  vfmaddps %xmm6, %xmm6, %xmm7, %xmm7\n  vfmaddps %xmm6, %xmm6, %xmm8, %xmm8\n  vfmaddps %xmm6, %xmm6, %xmm9, %xmm9\n  vfmaddps %xmm6, %xmm6, %xmm10, %xmm10\n  vfmaddps %xmm6, %xmm6, %xmm11, %xmm11\n  vfmaddps %xmm6, %xmm6, %xmm12, %xmm12\n  vfmaddps %xmm6, %xmm6, %xmm13, %xmm13\n  vfmaddps %xmm6, %xmm6, %xmm14, %xmm14\n  vfmaddps %xmm6, %xmm6, %xmm15, %xmm15\n  sub %r9, %rdi\n  jnz fma4_128_loop\n  movq %xmm1, %rax\n  vzeroupper\n  pop %r8\n  pop %r9\n  ret\n\n\n\nfdivtest:\n  push %r9\n  push %r8\n  mov $20, %r9\n  cvtsi2ss %r9, %xmm6\n  movss %xmm6, %xmm5\n  movss %xmm6, %xmm7\n  movss %xmm6, %xmm8\n  movss %xmm6, %xmm9\n  movss %xmm6, %xmm10\n  movss %xmm6, %xmm11\n  movss %xmm6, %xmm12\n  movss %xmm6, %xmm13\n  movss %xmm6, %xmm14\n  movss %xmm6, %xmm15\nfdivtest_loop:\n  divss %xmm6, %xmm5 \n  divss %xmm6, %xmm7 \n  divss %xmm6, %xmm8 \n  divss %xmm6, %xmm9 \n  divss %xmm6, %xmm10\n  divss %xmm6, %xmm11\n  divss %xmm6, %xmm12\n  divss %xmm6, %xmm13\n  divss %xmm6, %xmm14\n  divss %xmm6, %xmm15\n  divss %xmm6, %xmm5 \n  divss %xmm6, %xmm7 \n  divss %xmm6, %xmm8 \n  divss 
%xmm6, %xmm9 \n  divss %xmm6, %xmm10\n  divss %xmm6, %xmm11\n  divss %xmm6, %xmm12\n  divss %xmm6, %xmm13\n  divss %xmm6, %xmm14\n  divss %xmm6, %xmm15\n  sub %r9, %rdi\n  jnz fdivtest_loop\n  movq %xmm1, %rax\n  pop %r8\n  pop %r9\n  ret \n\nfdivlattest:\n  push %r9\n  push %r8\n  mov $20, %r9\n  cvtsi2ss %r9, %xmm6\nfdivlattest_loop:\n  divss %xmm6, %xmm6\n  divss %xmm6, %xmm6\n  divss %xmm6, %xmm6\n  divss %xmm6, %xmm6\n  divss %xmm6, %xmm6\n  divss %xmm6, %xmm6\n  divss %xmm6, %xmm6\n  divss %xmm6, %xmm6\n  divss %xmm6, %xmm6\n  divss %xmm6, %xmm6\n  divss %xmm6, %xmm6\n  divss %xmm6, %xmm6\n  divss %xmm6, %xmm6\n  divss %xmm6, %xmm6\n  divss %xmm6, %xmm6\n  divss %xmm6, %xmm6\n  divss %xmm6, %xmm6\n  divss %xmm6, %xmm6\n  divss %xmm6, %xmm6\n  divss %xmm6, %xmm6\n  sub %r9, %rdi\n  jnz fdivlattest_loop\n  movq %xmm1, %rax\n  vzeroupper\n  pop %r8\n  pop %r9\n  ret\n\nfmuldenormlattest:\n  push %r9\n  push %r8\n  mov $0x00800000, %r9 /* smallest normal */\n  mov $0x3f000000, %r8 /* 0.5 */\n  movq %r9, %xmm6\n  movq %r8, %xmm7\n\tmov $0x40000000, %r8 /* 2 */\n\tmovq %r8, %xmm4\n  mov $20, %r9\nfmuldenormlattest_loop:\n  mulss %xmm5, %xmm6\n  mulss %xmm5, %xmm6\n  mulss %xmm5, %xmm6\n  mulss %xmm5, %xmm6\n  mulss %xmm5, %xmm6\n  mulss %xmm5, %xmm6\n  mulss %xmm5, %xmm6\n  mulss %xmm5, %xmm6\n  mulss %xmm5, %xmm6\n  mulss %xmm5, %xmm6\n  mulss %xmm5, %xmm6\n  mulss %xmm5, %xmm6\n  mulss %xmm4, %xmm6\n  mulss %xmm4, %xmm6\n  mulss %xmm4, %xmm6\n  mulss %xmm4, %xmm6\n  mulss %xmm4, %xmm6\n  mulss %xmm4, %xmm6\n  mulss %xmm4, %xmm6\n  mulss %xmm4, %xmm6\n  sub %r9, %rdi\n  jnz fmuldenormlattest_loop\n  movq %xmm1, %rax\n  vzeroupper\n  pop %r8\n  pop %r9\n  ret\n\nfmuldenormtest:\n  push %r9\n  push %r8\n  mov $0x00800000, %r9\n  mov $0x3e4ccccd, %r8\n  movq %r9, %xmm6\n  movq %r8, %xmm7\n  movaps %xmm7, %xmm5\n  mov $20, %r9\nfmuldenormtest_loop:\n  mulss %xmm6, %xmm5\n  movaps %xmm7, %xmm5\n  mulss %xmm6, %xmm5\n  movaps %xmm7, %xmm5\n  mulss %xmm6, %xmm5\n  movaps 
%xmm7, %xmm5\n  mulss %xmm6, %xmm5\n  movaps %xmm7, %xmm5\n  mulss %xmm6, %xmm5\n  movaps %xmm7, %xmm5\n  mulss %xmm6, %xmm5\n  movaps %xmm7, %xmm5\n  mulss %xmm6, %xmm5\n  movaps %xmm7, %xmm5\n  mulss %xmm6, %xmm5\n  movaps %xmm7, %xmm5\n  mulss %xmm6, %xmm5\n  movaps %xmm7, %xmm5\n  mulss %xmm6, %xmm5\n  movaps %xmm7, %xmm5\n  mulss %xmm6, %xmm5\n  movaps %xmm7, %xmm5\n  mulss %xmm6, %xmm5\n  movaps %xmm7, %xmm5\n  mulss %xmm6, %xmm5\n  movaps %xmm7, %xmm5\n  mulss %xmm6, %xmm5\n  movaps %xmm7, %xmm5\n  mulss %xmm6, %xmm5\n  movaps %xmm7, %xmm5\n  mulss %xmm6, %xmm5\n  movaps %xmm7, %xmm5\n  mulss %xmm6, %xmm5\n  movaps %xmm7, %xmm5\n  mulss %xmm6, %xmm5\n  movaps %xmm7, %xmm5\n  mulss %xmm6, %xmm5\n  movaps %xmm7, %xmm5\n  mulss %xmm6, %xmm5\n  movaps %xmm7, %xmm5\n  sub %r9, %rdi\n  jnz fmuldenormtest_loop\n  movq %xmm1, %rax\n  vzeroupper\n  pop %r8\n  pop %r9\n  ret \n\nmovqtoxmmtest:\n  push %r9\n  push %r8\n  push %r10\n  mov $20, %r9\n  mov $123, %r10\nmovqtoxmmtest_loop:\n  movq %r10, %xmm1\n  movq %xmm1, %r10\n  movq %r10, %xmm1\n  movq %xmm1, %r10 \n  movq %r10, %xmm1\n  movq %xmm1, %r10 \n  movq %r10, %xmm1\n  movq %xmm1, %r10 \n  movq %r10, %xmm1\n  movq %xmm1, %r10 \n  movq %r10, %xmm1\n  movq %xmm1, %r10\n  movq %r10, %xmm1\n  movq %xmm1, %r10 \n  movq %r10, %xmm1\n  movq %xmm1, %r10 \n  movq %r10, %xmm1\n  movq %xmm1, %r10 \n  movq %r10, %xmm1\n  movq %xmm1, %r10  \n  sub %r9, %rdi\n  jnz movqtoxmmtest_loop\n  movq %xmm1, %rax\n  pop %r10\n  pop %r8\n  pop %r9\n  ret \n"
  },
  {
    "path": "LICENSE",
    "content": "                                 Apache License\n                           Version 2.0, January 2004\n                        http://www.apache.org/licenses/\n\n   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION\n\n   1. Definitions.\n\n      \"License\" shall mean the terms and conditions for use, reproduction,\n      and distribution as defined by Sections 1 through 9 of this document.\n\n      \"Licensor\" shall mean the copyright owner or entity authorized by\n      the copyright owner that is granting the License.\n\n      \"Legal Entity\" shall mean the union of the acting entity and all\n      other entities that control, are controlled by, or are under common\n      control with that entity. For the purposes of this definition,\n      \"control\" means (i) the power, direct or indirect, to cause the\n      direction or management of such entity, whether by contract or\n      otherwise, or (ii) ownership of fifty percent (50%) or more of the\n      outstanding shares, or (iii) beneficial ownership of such entity.\n\n      \"You\" (or \"Your\") shall mean an individual or Legal Entity\n      exercising permissions granted by this License.\n\n      \"Source\" form shall mean the preferred form for making modifications,\n      including but not limited to software source code, documentation\n      source, and configuration files.\n\n      \"Object\" form shall mean any form resulting from mechanical\n      transformation or translation of a Source form, including but\n      not limited to compiled object code, generated documentation,\n      and conversions to other media types.\n\n      \"Work\" shall mean the work of authorship, whether in Source or\n      Object form, made available under the License, as indicated by a\n      copyright notice that is included in or attached to the work\n      (an example is provided in the Appendix below).\n\n      \"Derivative Works\" shall mean any work, whether in Source or Object\n      
form, that is based on (or derived from) the Work and for which the\n      editorial revisions, annotations, elaborations, or other modifications\n      represent, as a whole, an original work of authorship. For the purposes\n      of this License, Derivative Works shall not include works that remain\n      separable from, or merely link (or bind by name) to the interfaces of,\n      the Work and Derivative Works thereof.\n\n      \"Contribution\" shall mean any work of authorship, including\n      the original version of the Work and any modifications or additions\n      to that Work or Derivative Works thereof, that is intentionally\n      submitted to Licensor for inclusion in the Work by the copyright owner\n      or by an individual or Legal Entity authorized to submit on behalf of\n      the copyright owner. For the purposes of this definition, \"submitted\"\n      means any form of electronic, verbal, or written communication sent\n      to the Licensor or its representatives, including but not limited to\n      communication on electronic mailing lists, source code control systems,\n      and issue tracking systems that are managed by, or on behalf of, the\n      Licensor for the purpose of discussing and improving the Work, but\n      excluding communication that is conspicuously marked or otherwise\n      designated in writing by the copyright owner as \"Not a Contribution.\"\n\n      \"Contributor\" shall mean Licensor and any individual or Legal Entity\n      on behalf of whom a Contribution has been received by Licensor and\n      subsequently incorporated within the Work.\n\n   2. Grant of Copyright License. 
Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      copyright license to reproduce, prepare Derivative Works of,\n      publicly display, publicly perform, sublicense, and distribute the\n      Work and such Derivative Works in Source or Object form.\n\n   3. Grant of Patent License. Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      (except as stated in this section) patent license to make, have made,\n      use, offer to sell, sell, import, and otherwise transfer the Work,\n      where such license applies only to those patent claims licensable\n      by such Contributor that are necessarily infringed by their\n      Contribution(s) alone or by combination of their Contribution(s)\n      with the Work to which such Contribution(s) was submitted. If You\n      institute patent litigation against any entity (including a\n      cross-claim or counterclaim in a lawsuit) alleging that the Work\n      or a Contribution incorporated within the Work constitutes direct\n      or contributory patent infringement, then any patent licenses\n      granted to You under this License for that Work shall terminate\n      as of the date such litigation is filed.\n\n   4. Redistribution. 
You may reproduce and distribute copies of the\n      Work or Derivative Works thereof in any medium, with or without\n      modifications, and in Source or Object form, provided that You\n      meet the following conditions:\n\n      (a) You must give any other recipients of the Work or\n          Derivative Works a copy of this License; and\n\n      (b) You must cause any modified files to carry prominent notices\n          stating that You changed the files; and\n\n      (c) You must retain, in the Source form of any Derivative Works\n          that You distribute, all copyright, patent, trademark, and\n          attribution notices from the Source form of the Work,\n          excluding those notices that do not pertain to any part of\n          the Derivative Works; and\n\n      (d) If the Work includes a \"NOTICE\" text file as part of its\n          distribution, then any Derivative Works that You distribute must\n          include a readable copy of the attribution notices contained\n          within such NOTICE file, excluding those notices that do not\n          pertain to any part of the Derivative Works, in at least one\n          of the following places: within a NOTICE text file distributed\n          as part of the Derivative Works; within the Source form or\n          documentation, if provided along with the Derivative Works; or,\n          within a display generated by the Derivative Works, if and\n          wherever such third-party notices normally appear. The contents\n          of the NOTICE file are for informational purposes only and\n          do not modify the License. 
You may add Your own attribution\n          notices within Derivative Works that You distribute, alongside\n          or as an addendum to the NOTICE text from the Work, provided\n          that such additional attribution notices cannot be construed\n          as modifying the License.\n\n      You may add Your own copyright statement to Your modifications and\n      may provide additional or different license terms and conditions\n      for use, reproduction, or distribution of Your modifications, or\n      for any such Derivative Works as a whole, provided Your use,\n      reproduction, and distribution of the Work otherwise complies with\n      the conditions stated in this License.\n\n   5. Submission of Contributions. Unless You explicitly state otherwise,\n      any Contribution intentionally submitted for inclusion in the Work\n      by You to the Licensor shall be under the terms and conditions of\n      this License, without any additional terms or conditions.\n      Notwithstanding the above, nothing herein shall supersede or modify\n      the terms of any separate license agreement you may have executed\n      with Licensor regarding such Contributions.\n\n   6. Trademarks. This License does not grant permission to use the trade\n      names, trademarks, service marks, or product names of the Licensor,\n      except as required for reasonable and customary use in describing the\n      origin of the Work and reproducing the content of the NOTICE file.\n\n   7. Disclaimer of Warranty. Unless required by applicable law or\n      agreed to in writing, Licensor provides the Work (and each\n      Contributor provides its Contributions) on an \"AS IS\" BASIS,\n      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or\n      implied, including, without limitation, any warranties or conditions\n      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A\n      PARTICULAR PURPOSE. 
You are solely responsible for determining the\n      appropriateness of using or redistributing the Work and assume any\n      risks associated with Your exercise of permissions under this License.\n\n   8. Limitation of Liability. In no event and under no legal theory,\n      whether in tort (including negligence), contract, or otherwise,\n      unless required by applicable law (such as deliberate and grossly\n      negligent acts) or agreed to in writing, shall any Contributor be\n      liable to You for damages, including any direct, indirect, special,\n      incidental, or consequential damages of any character arising as a\n      result of this License or out of the use or inability to use the\n      Work (including but not limited to damages for loss of goodwill,\n      work stoppage, computer failure or malfunction, or any and all\n      other commercial damages or losses), even if such Contributor\n      has been advised of the possibility of such damages.\n\n   9. Accepting Warranty or Additional Liability. While redistributing\n      the Work or Derivative Works thereof, You may choose to offer,\n      and charge a fee for, acceptance of support, warranty, indemnity,\n      or other liability obligations and/or rights consistent with this\n      License. However, in accepting such obligations, You may act only\n      on Your own behalf and on Your sole responsibility, not on behalf\n      of any other Contributor, and only if You agree to indemnify,\n      defend, and hold each Contributor harmless for any liability\n      incurred by, or claims asserted against, such Contributor by reason\n      of your accepting any such warranty or additional liability.\n\n   END OF TERMS AND CONDITIONS\n\n   APPENDIX: How to apply the Apache License to your work.\n\n      To apply the Apache License to your work, attach the following\n      boilerplate notice, with the fields enclosed by brackets \"[]\"\n      replaced with your own identifying information. 
(Don't include\n      the brackets!)  The text should be enclosed in the appropriate\n      comment syntax for the file format. We also recommend that a\n      file or class name and description of purpose be included on the\n      same \"printed page\" as the copyright notice for easier\n      identification within third-party archives.\n\n   Copyright [yyyy] [name of copyright owner]\n\n   Licensed under the Apache License, Version 2.0 (the \"License\");\n   you may not use this file except in compliance with the License.\n   You may obtain a copy of the License at\n\n       http://www.apache.org/licenses/LICENSE-2.0\n\n   Unless required by applicable law or agreed to in writing, software\n   distributed under the License is distributed on an \"AS IS\" BASIS,\n   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n   See the License for the specific language governing permissions and\n   limitations under the License.\n"
  },
  {
    "path": "LoadedMemoryLatency/LoadedMemoryLatency/LoadedMemoryLatency.asm",
    "content": "section .text\r\nbits 64\r\n\r\nglobal asm_read\r\n\r\n; rcx = ptr to array\r\n; rdx = array length in bytes\r\n; r8 = stop flag\r\n; r9 = throttle factor\r\n; return bytes read in rax\r\nasm_read:\r\n  push rdi\r\n  push rsi\r\n  push r10\r\n  push r11\r\n  mov rdi, rcx  ; save array base address\r\n  xor rsi, rsi  ; index\r\n  xor rax, rax  ; return value\r\nasm_read_pass_loop:\r\n  movups xmm0, [rdi]\r\n  movups xmm0, [rdi + 16]\r\n  movups xmm0, [rdi + 32]\r\n  movups xmm0, [rdi + 48]\r\n  movups xmm0, [rdi + 64]\r\n  movups xmm0, [rdi + 80]\r\n  movups xmm0, [rdi + 96]\r\n  movups xmm0, [rdi + 112]\r\n\r\n  add rdi, 128\r\n  add rsi, 128    ; update index\r\n  add rax, 128    ; update return value\r\n\r\n  test r9, r9               ; need to throttle?\r\n  jz asm_read_throttle_end\r\n  mov r10, r9\r\nasm_read_throttle:\r\n  dec r10\r\n  jnz asm_read_throttle;\r\nasm_read_throttle_end:\r\n  mov r10d, [r8]           ; check stop flag\r\n  test r10d, r10d\r\n  jnz asm_read_end\r\n\r\n  cmp rdx, rsi             ; array len - index > 0?\r\n  jg asm_read_pass_loop\r\n  mov rdi, rcx             ; reset to start\r\n  xor rsi, rsi             ; and reset index\r\n  jmp asm_read_pass_loop\r\nasm_read_end:\r\n  pop r11\r\n  pop r10\r\n  pop rsi\r\n  pop rdi\r\n  ret"
  },
  {
    "path": "LoadedMemoryLatency/LoadedMemoryLatency/LoadedMemoryLatency.cpp",
    "content": "#include <stdio.h>\r\n#include <stdlib.h>\r\n#include <stdint.h>\r\n#include <string.h>\r\n#include <math.h>\r\n#include <errno.h>\r\n#include <sys/timeb.h>\r\n#include <Windows.h>\r\n\r\n#define CACHELINE_SIZE 64\r\n\r\nstruct BandwidthTestThreadData {\r\n    uint64_t read_bytes;\r\n    uint64_t arr_length_bytes;\r\n    char* arr;\r\n    volatile int* flag;\r\n    HANDLE threadHandle;\r\n};\r\n\r\nstruct LatencyTestData {\r\n    uint32_t iterations;\r\n    uint32_t* arr;\r\n    float latency;\r\n    HANDLE threadHandle;\r\n};\r\n\r\nextern \"C\" uint64_t asm_read(char* arr, uint64_t arr_length, volatile int* flag, int waitfactor);\r\nDWORD ReadBandwidthTestThread(void* param);\r\nDWORD FillBandwidthTestArr(void* param);\r\nvoid FillPatternArr(uint32_t* pattern_arr, uint32_t list_size, uint32_t byte_increment);\r\nDWORD RunLatencyTest(void* param);\r\nbool GetPrivilege();\r\nfloat RunTest(uint64_t latencyAffinityMask, uint64_t bwAffinityMask, int bwThreadCount, int hugepages, float* measuredBw);\r\n\r\nvoid StartMonitoring();\r\nvoid EndMonitoring();\r\nvoid SetupMonitoring();\r\nvoid CloseMonitoring();\r\n\r\nuint64_t BandwidthTestMemoryKB = 1048576 * 4;\r\nuint64_t LatencyTestMemoryKB = 1048576;\r\nuint64_t LatencyTestIterations = 1e5;\r\nuint64_t throttle = 0;\r\n\r\nint main(int argc, char* argv[]) {\r\n    SYSTEM_INFO sysInfo;\r\n    GetSystemInfo(&sysInfo);\r\n    int bwThreadCap = sysInfo.dwNumberOfProcessors - 1;\r\n    int coreCount = sysInfo.dwNumberOfProcessors;\r\n    int latencyCore = 0;\r\n    int* customCores = NULL;\r\n    if (argc == 1) {\r\n        fprintf(stderr, \"Options:\\n\");\r\n        fprintf(stderr, \"-bwthreads [int]: Number of bandwidth test threads\\n\");\r\n        fprintf(stderr, \"-latencyaffinity [int]: Core to run latency test thread on\\n\");\r\n        fprintf(stderr, \"-bwcores [comma separated list]: Cores to run bandwidth load on\\n\");\r\n        fprintf(stderr, \"-scaleiterations [int]: Iterations scaling 
factor\\n\");\r\n        fprintf(stderr, \"-throttle [int]: Reduce bandwidth load per bandwidth test thread\\n\");\r\n    }\r\n    for (int argIdx = 1; argIdx < argc; argIdx++) {\r\n        if (*(argv[argIdx]) == '-') {\r\n            char* arg = argv[argIdx] + 1;\r\n            if (strncmp(arg, \"bwthreads\", 9) == 0) {\r\n                argIdx++;\r\n                bwThreadCap = atoi(argv[argIdx]);\r\n                fprintf(stderr, \"Using up to %d bw threads\\n\", bwThreadCap);\r\n            }\r\n            else if (strncmp(arg, \"latencyaffinity\", 15) == 0) {\r\n                argIdx++;\r\n                latencyCore = atoi(argv[argIdx]);\r\n                fprintf(stderr, \"Latency test thread will run in core %d\\n\", latencyCore);\r\n            }\r\n            else if (strncmp(arg, \"scaleiterations\", 15) == 0) {\r\n                argIdx++;\r\n                int scaleFactor = atoi(argv[argIdx]);\r\n                LatencyTestIterations *= scaleFactor;\r\n                fprintf(stderr, \"Scaling iterations up by a factor of %d\\n\", scaleFactor);\r\n            }\r\n            else if (strncmp(arg, \"throttle\", 8) == 0) {\r\n                argIdx++;\r\n                throttle = atoi(argv[argIdx]);\r\n                fprintf(stderr, \"Pulling memory bandwidth test threads back, factor of %lld\\n\", throttle);\r\n            }\r\n            else if (strncmp(arg, \"bwcores\", 7) == 0) {\r\n                argIdx++;\r\n                char* customCoreListStr = argv[argIdx];\r\n                bwThreadCap = 1;\r\n                for (int i = 0; customCoreListStr[i] != 0; i++) {   // shell should null terminate this\r\n                    if (customCoreListStr[i] == ',') {\r\n                        bwThreadCap++;\r\n                    }\r\n                }\r\n\r\n                customCores = (int*)malloc(sizeof(int) * bwThreadCap);\r\n                memset(customCores, 0, sizeof(int) * bwThreadCap);\r\n                int commaIdx = 1;\r\n     
           for (int i = 0; customCoreListStr[i] != 0; i++) {\r\n                    if (customCoreListStr[i] == ',') {\r\n                        customCores[commaIdx] = i + 1;\r\n                        commaIdx++;\r\n                        customCoreListStr[i] = '\\0';\r\n                    }\r\n                }\r\n\r\n                fprintf(stderr, \"Cores used for bandwidth load:\");\r\n                for (int i = 0; i < bwThreadCap; i++) {\r\n                    customCores[i] = atoi(customCoreListStr + customCores[i]);\r\n                    fprintf(stderr, \" %d\", customCores[i]);\r\n                }\r\n\r\n                fprintf(stderr, \"\\n\");\r\n            }\r\n        }\r\n    }\r\n\r\n    GetPrivilege();\r\n    //SetupMonitoring();\r\n\r\n    uint64_t latencyAffinityMask = 1ULL << latencyCore;\r\n    uint64_t bwAffinityMask = 0;\r\n\r\n    fprintf(stderr, \"%d cores, will use up to %d for BW threads\\n\", coreCount, bwThreadCap);\r\n    float* latencies = (float*)malloc(sizeof(float) * (bwThreadCap + 1));\r\n    float* bandwidths = (float*)malloc(sizeof(float) * (bwThreadCap + 1));\r\n    for (int bwThreadCount = 0; bwThreadCount <= bwThreadCap; bwThreadCount++) {\r\n        float bw;\r\n        int nextCore;\r\n        if (bwThreadCount > 0) {\r\n            if (customCores == NULL) nextCore = coreCount - bwThreadCount - 1;\r\n            else nextCore = customCores[bwThreadCount - 1];\r\n            fprintf(stderr, \"next core is %d\\n\", nextCore);\r\n            bwAffinityMask |= 1ULL << nextCore;\r\n        }\r\n\r\n        float latencyNs = RunTest(latencyAffinityMask, bwAffinityMask, bwThreadCount, 1, &bw);\r\n        fprintf(stderr, \"%d bw threads %f GB/s %f ns\\n\", bwThreadCount, bw, latencyNs);\r\n        latencies[bwThreadCount] = latencyNs;\r\n        bandwidths[bwThreadCount] = bw;\r\n    }\r\n\r\n    printf(\"BW Threads, Bandwidth (GB/s), Latency (ns)\\n\");\r\n    for (int bwThreadCount = 0; bwThreadCount <= bwThreadCap; 
bwThreadCount++) {\r\n        printf(\"%d, %f, %f\\n\", bwThreadCount, bandwidths[bwThreadCount], latencies[bwThreadCount]);\r\n    }\r\n\r\n    free(latencies);\r\n    free(bandwidths);\r\n    if (customCores != NULL) free(customCores);\r\n    //CloseMonitoring();\r\n    return 0;\r\n}\r\n\r\n// returns latency in ns\r\n// sets measuredBw = measured bandwidth\r\nfloat RunTest(uint64_t latencyAffinity, uint64_t bwAffinity, int bwThreadCount, int hugepages, float* measuredBw) {\r\n    uint64_t perThreadArrSizeBytes = ceil((double)BandwidthTestMemoryKB / (double)bwThreadCount) * 1024;\r\n    volatile int flag = 0;  // set 1 to stop\r\n    struct timeb start, end;\r\n    int map_failed = 0;\r\n\r\n    // MT bw test array fill\r\n    struct BandwidthTestThreadData* bandwidthTestData = (struct BandwidthTestThreadData*)malloc(sizeof(struct BandwidthTestThreadData) * bwThreadCount);\r\n    HANDLE* threadHandles = (HANDLE*)malloc(sizeof(HANDLE) * bwThreadCount);\r\n    for (int threadIdx = 0; threadIdx < bwThreadCount; threadIdx++) {\r\n        bandwidthTestData[threadIdx].read_bytes = 0;\r\n        bandwidthTestData[threadIdx].flag = &flag;\r\n        bandwidthTestData[threadIdx].arr = (char*)malloc(perThreadArrSizeBytes);\r\n        bandwidthTestData[threadIdx].arr_length_bytes = perThreadArrSizeBytes;\r\n        threadHandles[threadIdx] = CreateThread(NULL, 0, FillBandwidthTestArr, bandwidthTestData + threadIdx, 0, NULL);\r\n    }\r\n\r\n    // set up latency test\r\n    uint32_t* latencyArr;\r\n    latencyArr = (uint32_t *)VirtualAlloc(NULL, LatencyTestMemoryKB * 1024, MEM_RESERVE | MEM_COMMIT | MEM_LARGE_PAGES, PAGE_READWRITE);\r\n    if (latencyArr == NULL) {  // MAP_FAILED\r\n        fprintf(stderr, \"Failed to get memory via VirtualAlloc. 
Using plain malloc\\n\");\r\n        latencyArr = (uint32_t *)malloc(LatencyTestMemoryKB * 1024);\r\n        map_failed = 1;\r\n    }\r\n\r\n    struct LatencyTestData latencyTestData;\r\n    latencyTestData.iterations = LatencyTestIterations;\r\n    latencyTestData.latency = 0.0f;\r\n    latencyTestData.arr = latencyArr;\r\n    FillPatternArr(latencyArr, (LatencyTestMemoryKB * 256), CACHELINE_SIZE);\r\n\r\n    WaitForMultipleObjects(bwThreadCount, threadHandles, true, INFINITE);\r\n    for (int threadIdx = 0; threadIdx < bwThreadCount; threadIdx++) threadHandles[threadIdx] = INVALID_HANDLE_VALUE;\r\n\r\n    // create bw test threads\r\n    for (int threadIdx = 0; threadIdx < bwThreadCount; threadIdx++)\r\n    {\r\n        threadHandles[threadIdx] = CreateThread(NULL, 0, ReadBandwidthTestThread, bandwidthTestData + threadIdx, CREATE_SUSPENDED, NULL);\r\n        SetThreadAffinityMask(threadHandles[threadIdx], bwAffinity);\r\n    }\r\n\r\n    //StartMonitoring();\r\n    ftime(&start);\r\n    // start bw test threads\r\n    for (int threadIdx = 0; threadIdx < bwThreadCount; threadIdx++) {\r\n        ResumeThread(threadHandles[threadIdx]);\r\n    }\r\n\r\n    HANDLE latencyThreadHandle = CreateThread(NULL, 0, RunLatencyTest, (void*)&latencyTestData, CREATE_SUSPENDED, NULL);\r\n    SetThreadAffinityMask(latencyThreadHandle, latencyAffinity);\r\n    ResumeThread(latencyThreadHandle);\r\n    WaitForSingleObject(latencyThreadHandle, INFINITE);\r\n    flag = 1;\r\n\r\n    WaitForMultipleObjects(bwThreadCount, threadHandles, true, INFINITE);\r\n    ftime(&end);\r\n    //EndMonitoring();\r\n\r\n    // count on a cacheline basis even though the test only loads 4B at a time\r\n    uint64_t latencyReadBytes = 64 * LatencyTestIterations;\r\n\r\n    uint64_t time_diff_ms = 1000 * (end.time - start.time) + (end.millitm - start.millitm);\r\n    float totalReadData = (float)latencyReadBytes;\r\n    float bwReadBytes = 0.0f;\r\n    for (int threadIdx = 0; threadIdx < bwThreadCount; 
threadIdx++) {\r\n        free(bandwidthTestData[threadIdx].arr);\r\n        if (threadHandles[threadIdx] != NULL) CloseHandle(threadHandles[threadIdx]);\r\n        totalReadData += (float)bandwidthTestData[threadIdx].read_bytes;\r\n        bwReadBytes += (float)bandwidthTestData[threadIdx].read_bytes;\r\n    }\r\n\r\n    *measuredBw = 1000 * (totalReadData / (float)1e9) / (float)time_diff_ms;\r\n    float bwBandwidth = 1000 * (bwReadBytes / (float)1e9) / (float)time_diff_ms;\r\n    float latencyBandwidth = 1000 * (latencyReadBytes / (float)1e9) / (float)time_diff_ms;\r\n\r\n    fprintf(stderr, \"%d bw threads - %f BW bandwidth, %f latency bandwidth\\n\", bwThreadCount, bwBandwidth, latencyBandwidth);\r\n\r\n    free(bandwidthTestData);\r\n    free(threadHandles);\r\n    if (latencyThreadHandle != NULL) CloseHandle(latencyThreadHandle);\r\n    if (map_failed) free(latencyArr);\r\n    else VirtualFree(latencyArr, 0, MEM_RELEASE);\r\n    return latencyTestData.latency;\r\n}\r\n\r\nvoid FillPatternArr(uint32_t* pattern_arr, uint32_t list_size, uint32_t byte_increment) {\r\n    uint32_t increment = byte_increment / sizeof(uint32_t);\r\n    uint32_t element_count = list_size / increment;\r\n    for (int i = 0; i < element_count; i++) {\r\n        pattern_arr[i * increment] = i * increment;\r\n    }\r\n\r\n    int iter = element_count;\r\n    while (iter > 1) {\r\n        iter -= 1;\r\n        int j = iter - 1 == 0 ? 
0 : rand() % (iter - 1);\r\n        uint32_t tmp = pattern_arr[iter * increment];\r\n        pattern_arr[iter * increment] = pattern_arr[j * increment];\r\n        pattern_arr[j * increment] = tmp;\r\n    }\r\n}\r\n\r\n// No need for simple addressing because this test should be operating well in DRAM\r\n// where an extra cycle for indexed addressing should not make a big difference\r\n// returns load to use latency in nanoseconds\r\n// size_kb should be divisible by 2M, or whatever the hugepage size is\r\nDWORD RunLatencyTest(void* param) {\r\n    struct timeb start, end;\r\n    struct LatencyTestData* testData = (struct LatencyTestData*)param;\r\n    uint32_t* A = testData->arr;\r\n    uint32_t iterations = testData->iterations;\r\n    uint32_t sum = 0, current;\r\n\r\n    // Run test\r\n    ftime(&start);\r\n    current = A[0];\r\n    for (int i = 0; i < iterations; i++) {\r\n        current = A[current];\r\n        sum += current;\r\n    }\r\n    ftime(&end);\r\n    uint64_t time_diff_ms = 1000 * (end.time - start.time) + (end.millitm - start.millitm);\r\n    testData->latency = 1e6 * (float)time_diff_ms / (float)iterations;\r\n\r\n    return sum;\r\n}\r\n\r\nDWORD FillBandwidthTestArr(void* param) {\r\n    struct BandwidthTestThreadData* bwTestData = (struct BandwidthTestThreadData*)param;\r\n    float* arr = (float*)bwTestData->arr;\r\n    uint64_t float_elements = bwTestData->arr_length_bytes / 4;\r\n    for (int i = 0; i < float_elements; i++) {\r\n        arr[i] = (i + ((uint64_t)arr & 0x3)) + 0.2f;\r\n    }\r\n\r\n    return 0;\r\n}\r\n\r\nDWORD ReadBandwidthTestThread(void* param) {\r\n    struct BandwidthTestThreadData* bwTestData = (struct BandwidthTestThreadData*)param;\r\n    uint64_t totalDataBytes = asm_read(bwTestData->arr, bwTestData->arr_length_bytes, bwTestData->flag, throttle);\r\n    bwTestData->read_bytes = totalDataBytes;\r\n    return 0;\r\n}\r\n\r\n// For winring0\r\n#define RDMSR_FUNCTION 0x821\r\n#define WRMSR_FUNCTION 0x822\r\n#define 
WINRING0_DEVICE_TYPE 40000\r\nHANDLE driverHandle = INVALID_HANDLE_VALUE;\r\n\r\nvoid SetupMonitoring() {\r\n    driverHandle = CreateFileA(\"\\\\\\\\.\\\\WinRing0_1_2_0\", FILE_SHARE_READ | FILE_SHARE_WRITE, 0, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);\r\n    if (driverHandle == INVALID_HANDLE_VALUE)\r\n    {\r\n        fprintf(stderr, \"Could not open WinRing0 driver: %d\\n\", GetLastError());\r\n    }\r\n}\r\n\r\nuint64_t ReadMsr(uint32_t index)\r\n{\r\n    uint32_t code = (WINRING0_DEVICE_TYPE << 16) | (RDMSR_FUNCTION << 2);\r\n    uint64_t rc;\r\n    DWORD bytesReturned;\r\n    if (!DeviceIoControl(driverHandle, code, &index, sizeof(uint32_t), &rc, sizeof(uint64_t), &bytesReturned, NULL))\r\n    {\r\n        fprintf(stderr, \"ReadMsr failed (ioctl returned false)\\n\");\r\n    }\r\n\r\n    return rc;\r\n}\r\n\r\nvoid WriteMsr(uint32_t index, uint64_t value)\r\n{\r\n    uint32_t code = (WINRING0_DEVICE_TYPE << 16) | (WRMSR_FUNCTION << 2);\r\n    char inputBuffer[sizeof(uint32_t) + sizeof(uint64_t)];\r\n    *(uint32_t*)inputBuffer = index;\r\n    *(uint64_t*)(inputBuffer + sizeof(uint32_t)) = value;\r\n    DWORD bytesReturned;\r\n    if (!DeviceIoControl(driverHandle, code, &inputBuffer, sizeof(uint32_t) + sizeof(uint64_t), NULL, 0, &bytesReturned, NULL))\r\n    {\r\n        fprintf(stderr, \"WriteMsr failed (ioctl returned false)\\n\");\r\n    }\r\n}\r\n\r\n#define L3_PERF_CTL0 0xC0010230\r\n#define L3_PERF_CTL1 0xC0010232\r\n#define L3_PERF_CTL2 0xC0010234\r\n#define L3_PERF_CTL3 0xC0010236\r\n#define L3_PERF_CTR0 0xC0010231\r\n#define L3_PERF_CTR1 0xC0010233\r\n#define L3_PERF_CTR2 0xC0010235\r\n#define L3_PERF_CTR3 0xC0010237\r\n\r\nvoid ClearL3Counters() {\r\n    WriteMsr(L3_PERF_CTR0, 0);\r\n    WriteMsr(L3_PERF_CTR1, 0);\r\n    WriteMsr(L3_PERF_CTR2, 0);\r\n    WriteMsr(L3_PERF_CTR3, 0);\r\n}\r\n\r\nvoid StartMonitoring() {\r\n    uint64_t l3access = 0x0300c0000040ff04;\r\n    uint64_t l3miss = 0x0300c00000400104;\r\n    uint64_t 
l3miss_sampled_dram_req = 0x0303c000004003ad;\r\n    uint64_t l3miss_sampled_dram_req_latency = 0x0303c000004003ac;\r\n\r\n    SetThreadAffinityMask(GetCurrentThread(), 1); // use core 0 in ccd 0\r\n    WriteMsr(L3_PERF_CTL0, l3access);\r\n    WriteMsr(L3_PERF_CTL1, l3miss);\r\n    WriteMsr(L3_PERF_CTL2, l3miss_sampled_dram_req);\r\n    WriteMsr(L3_PERF_CTL3, l3miss_sampled_dram_req_latency);\r\n    ClearL3Counters();\r\n\r\n    SetThreadAffinityMask(GetCurrentThread(), 16); // use core 0 in ccd 1\r\n    WriteMsr(L3_PERF_CTL0, l3access);\r\n    WriteMsr(L3_PERF_CTL1, l3miss);\r\n    WriteMsr(L3_PERF_CTL2, l3miss_sampled_dram_req);\r\n    WriteMsr(L3_PERF_CTL3, l3miss_sampled_dram_req_latency);\r\n    ClearL3Counters();\r\n}\r\n\r\nvoid EndMonitoring() {\r\n    SetThreadAffinityMask(GetCurrentThread(), 1); // use core 0 in ccd 0\r\n    uint64_t ccd0L3Access = ReadMsr(L3_PERF_CTR0);\r\n    uint64_t ccd0L3Miss = ReadMsr(L3_PERF_CTR1);\r\n    uint64_t ccd0L3SampledDramReq = ReadMsr(L3_PERF_CTR2);\r\n    uint64_t ccd0L3SampledDramReqLatency = ReadMsr(L3_PERF_CTR3);\r\n    float ccd0SampledLatencyNs = 10.0f * ccd0L3SampledDramReqLatency / ccd0L3SampledDramReq;\r\n    ClearL3Counters();\r\n\r\n    SetThreadAffinityMask(GetCurrentThread(), 16); // use core 0 in ccd 1\r\n    uint64_t ccd1L3Access = ReadMsr(L3_PERF_CTR0);\r\n    uint64_t ccd1L3Miss = ReadMsr(L3_PERF_CTR1);\r\n    uint64_t ccd1L3SampledDramReq = ReadMsr(L3_PERF_CTR2);\r\n    uint64_t ccd1L3SampledDramReqLatency = ReadMsr(L3_PERF_CTR3);\r\n    float ccd1SampledLatencyNs = 10.0f * ccd1L3SampledDramReqLatency / ccd1L3SampledDramReq;\r\n    ClearL3Counters();\r\n\r\n    fprintf(stderr, \"CCD 0: %f ns, CCD1: %f ns\\n\", ccd0SampledLatencyNs, ccd1SampledLatencyNs);\r\n}\r\n\r\nvoid CloseMonitoring()\r\n{\r\n    if (driverHandle != INVALID_HANDLE_VALUE) CloseHandle(driverHandle);\r\n    driverHandle = INVALID_HANDLE_VALUE;\r\n}\r\n\r\nbool GetPrivilege()\r\n{\r\n    HANDLE           hToken;\r\n    TOKEN_PRIVILEGES 
tp;\r\n    BOOL             status;\r\n    DWORD            error;\r\n\r\n    // open process token\r\n    if (!OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &hToken))\r\n    {\r\n        fprintf(stderr, \"OpenProcessToken failed: %d\\n\", GetLastError());\r\n        return false;\r\n    }\r\n\r\n    // get the luid\r\n    if (!LookupPrivilegeValue(NULL, TEXT(\"SeLockMemoryPrivilege\"), &tp.Privileges[0].Luid))\r\n    {\r\n        fprintf(stderr, \"Could not get luid: %d\\n\", GetLastError());\r\n        return false;\r\n    }\r\n\r\n    // enable privilege\r\n    tp.PrivilegeCount = 1;\r\n    tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;\r\n    status = AdjustTokenPrivileges(hToken, FALSE, &tp, 0, (PTOKEN_PRIVILEGES)NULL, 0);\r\n\r\n    // It is possible for AdjustTokenPrivileges to return TRUE and still not succeed.\r\n    // So always check for the last error value.\r\n    error = GetLastError();\r\n    if (!status || (error != ERROR_SUCCESS))\r\n    {\r\n        fprintf(stderr, \"AdjustTokenPrivileges failed with status %d, error %d\\n\", status, error);\r\n        return false;\r\n    }\r\n\r\n    // close the handle\r\n    if (!CloseHandle(hToken))\r\n    {\r\n        fprintf(stderr, \"CloseHandle failed: %d\\n\", GetLastError());\r\n        return false;\r\n    }\r\n\r\n    fprintf(stderr, \"Got SeLockMemoryPrivilege\\n\");\r\n    return true;\r\n}\r\n"
  },
  {
    "path": "LoadedMemoryLatency/LoadedMemoryLatency/LoadedMemoryLatency.sln",
    "content": "﻿\r\nMicrosoft Visual Studio Solution File, Format Version 12.00\r\n# Visual Studio Version 17\r\nVisualStudioVersion = 17.11.35327.3\r\nMinimumVisualStudioVersion = 10.0.40219.1\r\nProject(\"{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}\") = \"LoadedMemoryLatency\", \"LoadedMemoryLatency.vcxproj\", \"{E7B51ED8-5C4A-4CB5-9874-DC4B9CAF056D}\"\r\nEndProject\r\nGlobal\r\n\tGlobalSection(SolutionConfigurationPlatforms) = preSolution\r\n\t\tDebug|x64 = Debug|x64\r\n\t\tDebug|x86 = Debug|x86\r\n\t\tRelease|x64 = Release|x64\r\n\t\tRelease|x86 = Release|x86\r\n\tEndGlobalSection\r\n\tGlobalSection(ProjectConfigurationPlatforms) = postSolution\r\n\t\t{E7B51ED8-5C4A-4CB5-9874-DC4B9CAF056D}.Debug|x64.ActiveCfg = Debug|x64\r\n\t\t{E7B51ED8-5C4A-4CB5-9874-DC4B9CAF056D}.Debug|x64.Build.0 = Debug|x64\r\n\t\t{E7B51ED8-5C4A-4CB5-9874-DC4B9CAF056D}.Debug|x86.ActiveCfg = Debug|Win32\r\n\t\t{E7B51ED8-5C4A-4CB5-9874-DC4B9CAF056D}.Debug|x86.Build.0 = Debug|Win32\r\n\t\t{E7B51ED8-5C4A-4CB5-9874-DC4B9CAF056D}.Release|x64.ActiveCfg = Release|x64\r\n\t\t{E7B51ED8-5C4A-4CB5-9874-DC4B9CAF056D}.Release|x64.Build.0 = Release|x64\r\n\t\t{E7B51ED8-5C4A-4CB5-9874-DC4B9CAF056D}.Release|x86.ActiveCfg = Release|Win32\r\n\t\t{E7B51ED8-5C4A-4CB5-9874-DC4B9CAF056D}.Release|x86.Build.0 = Release|Win32\r\n\tEndGlobalSection\r\n\tGlobalSection(SolutionProperties) = preSolution\r\n\t\tHideSolutionNode = FALSE\r\n\tEndGlobalSection\r\n\tGlobalSection(ExtensibilityGlobals) = postSolution\r\n\t\tSolutionGuid = {5656BCBF-7F82-471C-8AFE-1FE48AD34114}\r\n\tEndGlobalSection\r\nEndGlobal\r\n"
  },
  {
    "path": "LoadedMemoryLatency/LoadedMemoryLatency/LoadedMemoryLatency.vcxproj",
    "content": "﻿<?xml version=\"1.0\" encoding=\"utf-8\"?>\r\n<Project DefaultTargets=\"Build\" xmlns=\"http://schemas.microsoft.com/developer/msbuild/2003\">\r\n  <ItemGroup Label=\"ProjectConfigurations\">\r\n    <ProjectConfiguration Include=\"Debug|Win32\">\r\n      <Configuration>Debug</Configuration>\r\n      <Platform>Win32</Platform>\r\n    </ProjectConfiguration>\r\n    <ProjectConfiguration Include=\"Release|Win32\">\r\n      <Configuration>Release</Configuration>\r\n      <Platform>Win32</Platform>\r\n    </ProjectConfiguration>\r\n    <ProjectConfiguration Include=\"Debug|x64\">\r\n      <Configuration>Debug</Configuration>\r\n      <Platform>x64</Platform>\r\n    </ProjectConfiguration>\r\n    <ProjectConfiguration Include=\"Release|x64\">\r\n      <Configuration>Release</Configuration>\r\n      <Platform>x64</Platform>\r\n    </ProjectConfiguration>\r\n  </ItemGroup>\r\n  <PropertyGroup Label=\"Globals\">\r\n    <VCProjectVersion>17.0</VCProjectVersion>\r\n    <Keyword>Win32Proj</Keyword>\r\n    <ProjectGuid>{e7b51ed8-5c4a-4cb5-9874-dc4b9caf056d}</ProjectGuid>\r\n    <RootNamespace>LoadedMemoryLatency</RootNamespace>\r\n    <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>\r\n  </PropertyGroup>\r\n  <Import Project=\"$(VCTargetsPath)\\Microsoft.Cpp.Default.props\" />\r\n  <PropertyGroup Condition=\"'$(Configuration)|$(Platform)'=='Debug|Win32'\" Label=\"Configuration\">\r\n    <ConfigurationType>Application</ConfigurationType>\r\n    <UseDebugLibraries>true</UseDebugLibraries>\r\n    <PlatformToolset>v143</PlatformToolset>\r\n    <CharacterSet>Unicode</CharacterSet>\r\n  </PropertyGroup>\r\n  <PropertyGroup Condition=\"'$(Configuration)|$(Platform)'=='Release|Win32'\" Label=\"Configuration\">\r\n    <ConfigurationType>Application</ConfigurationType>\r\n    <UseDebugLibraries>false</UseDebugLibraries>\r\n    <PlatformToolset>v143</PlatformToolset>\r\n    <WholeProgramOptimization>true</WholeProgramOptimization>\r\n    
<CharacterSet>Unicode</CharacterSet>\r\n  </PropertyGroup>\r\n  <PropertyGroup Condition=\"'$(Configuration)|$(Platform)'=='Debug|x64'\" Label=\"Configuration\">\r\n    <ConfigurationType>Application</ConfigurationType>\r\n    <UseDebugLibraries>true</UseDebugLibraries>\r\n    <PlatformToolset>v143</PlatformToolset>\r\n    <CharacterSet>Unicode</CharacterSet>\r\n  </PropertyGroup>\r\n  <PropertyGroup Condition=\"'$(Configuration)|$(Platform)'=='Release|x64'\" Label=\"Configuration\">\r\n    <ConfigurationType>Application</ConfigurationType>\r\n    <UseDebugLibraries>false</UseDebugLibraries>\r\n    <PlatformToolset>v143</PlatformToolset>\r\n    <WholeProgramOptimization>true</WholeProgramOptimization>\r\n    <CharacterSet>Unicode</CharacterSet>\r\n  </PropertyGroup>\r\n  <Import Project=\"$(VCTargetsPath)\\Microsoft.Cpp.props\" />\r\n  <ImportGroup Label=\"ExtensionSettings\">\r\n  </ImportGroup>\r\n  <ImportGroup Label=\"Shared\">\r\n  </ImportGroup>\r\n  <ImportGroup Label=\"PropertySheets\" Condition=\"'$(Configuration)|$(Platform)'=='Debug|Win32'\">\r\n    <Import Project=\"$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props\" Condition=\"exists('$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props')\" Label=\"LocalAppDataPlatform\" />\r\n  </ImportGroup>\r\n  <ImportGroup Label=\"PropertySheets\" Condition=\"'$(Configuration)|$(Platform)'=='Release|Win32'\">\r\n    <Import Project=\"$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props\" Condition=\"exists('$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props')\" Label=\"LocalAppDataPlatform\" />\r\n  </ImportGroup>\r\n  <ImportGroup Label=\"PropertySheets\" Condition=\"'$(Configuration)|$(Platform)'=='Debug|x64'\">\r\n    <Import Project=\"$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props\" Condition=\"exists('$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props')\" Label=\"LocalAppDataPlatform\" />\r\n  </ImportGroup>\r\n  <ImportGroup Label=\"PropertySheets\" 
Condition=\"'$(Configuration)|$(Platform)'=='Release|x64'\">\r\n    <Import Project=\"$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props\" Condition=\"exists('$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props')\" Label=\"LocalAppDataPlatform\" />\r\n  </ImportGroup>\r\n  <PropertyGroup Label=\"UserMacros\" />\r\n  <ItemDefinitionGroup Condition=\"'$(Configuration)|$(Platform)'=='Debug|Win32'\">\r\n    <ClCompile>\r\n      <WarningLevel>Level3</WarningLevel>\r\n      <SDLCheck>true</SDLCheck>\r\n      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>\r\n      <ConformanceMode>true</ConformanceMode>\r\n    </ClCompile>\r\n    <Link>\r\n      <SubSystem>Console</SubSystem>\r\n      <GenerateDebugInformation>true</GenerateDebugInformation>\r\n    </Link>\r\n  </ItemDefinitionGroup>\r\n  <ItemDefinitionGroup Condition=\"'$(Configuration)|$(Platform)'=='Release|Win32'\">\r\n    <ClCompile>\r\n      <WarningLevel>Level3</WarningLevel>\r\n      <FunctionLevelLinking>true</FunctionLevelLinking>\r\n      <IntrinsicFunctions>true</IntrinsicFunctions>\r\n      <SDLCheck>true</SDLCheck>\r\n      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>\r\n      <ConformanceMode>true</ConformanceMode>\r\n    </ClCompile>\r\n    <Link>\r\n      <SubSystem>Console</SubSystem>\r\n      <EnableCOMDATFolding>true</EnableCOMDATFolding>\r\n      <OptimizeReferences>true</OptimizeReferences>\r\n      <GenerateDebugInformation>true</GenerateDebugInformation>\r\n    </Link>\r\n  </ItemDefinitionGroup>\r\n  <ItemDefinitionGroup Condition=\"'$(Configuration)|$(Platform)'=='Debug|x64'\">\r\n    <ClCompile>\r\n      <WarningLevel>Level3</WarningLevel>\r\n      <SDLCheck>true</SDLCheck>\r\n      <PreprocessorDefinitions>_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>\r\n      <ConformanceMode>true</ConformanceMode>\r\n    </ClCompile>\r\n    <Link>\r\n      
<SubSystem>Console</SubSystem>\r\n      <GenerateDebugInformation>true</GenerateDebugInformation>\r\n    </Link>\r\n  </ItemDefinitionGroup>\r\n  <ItemDefinitionGroup Condition=\"'$(Configuration)|$(Platform)'=='Release|x64'\">\r\n    <ClCompile>\r\n      <WarningLevel>Level3</WarningLevel>\r\n      <FunctionLevelLinking>true</FunctionLevelLinking>\r\n      <IntrinsicFunctions>true</IntrinsicFunctions>\r\n      <SDLCheck>true</SDLCheck>\r\n      <PreprocessorDefinitions>NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>\r\n      <ConformanceMode>true</ConformanceMode>\r\n    </ClCompile>\r\n    <Link>\r\n      <SubSystem>Console</SubSystem>\r\n      <EnableCOMDATFolding>true</EnableCOMDATFolding>\r\n      <OptimizeReferences>true</OptimizeReferences>\r\n      <GenerateDebugInformation>true</GenerateDebugInformation>\r\n    </Link>\r\n  </ItemDefinitionGroup>\r\n  <ItemGroup>\r\n    <CustomBuild Include=\"LoadedMemoryLatency.asm\">\r\n      <FileType>Document</FileType>\r\n      <Message Condition=\"'$(Configuration)|$(Platform)'=='Release|x64'\">Running NASM</Message>\r\n      <Command Condition=\"'$(Configuration)|$(Platform)'=='Debug|x64'\">nasm -f win64 LoadedMemoryLatency.asm</Command>\r\n      <Message Condition=\"'$(Configuration)|$(Platform)'=='Debug|x64'\">Running NASM</Message>\r\n      <Command Condition=\"'$(Configuration)|$(Platform)'=='Release|x64'\">nasm -f win64 LoadedMemoryLatency.asm</Command>\r\n      <ExcludedFromBuild Condition=\"'$(Configuration)|$(Platform)'=='Debug|x64'\">false</ExcludedFromBuild>\r\n      <ExcludedFromBuild Condition=\"'$(Configuration)|$(Platform)'=='Release|x64'\">false</ExcludedFromBuild>\r\n      <Outputs Condition=\"'$(Configuration)|$(Platform)'=='Release|x64'\">LoadedMemoryLatency.obj</Outputs>\r\n      <Outputs Condition=\"'$(Configuration)|$(Platform)'=='Debug|x64'\">LoadedMemoryLatency.obj</Outputs>\r\n    </CustomBuild>\r\n  </ItemGroup>\r\n  <ItemGroup>\r\n    <ClCompile 
Include=\"LoadedMemoryLatency.cpp\" />\r\n  </ItemGroup>\r\n  <Import Project=\"$(VCTargetsPath)\\Microsoft.Cpp.targets\" />\r\n  <ImportGroup Label=\"ExtensionTargets\">\r\n  </ImportGroup>\r\n</Project>"
  },
  {
    "path": "LoadedMemoryLatency/LoadedMemoryLatency/LoadedMemoryLatency.vcxproj.filters",
    "content": "﻿<?xml version=\"1.0\" encoding=\"utf-8\"?>\r\n<Project ToolsVersion=\"4.0\" xmlns=\"http://schemas.microsoft.com/developer/msbuild/2003\">\r\n  <ItemGroup>\r\n    <Filter Include=\"Source Files\">\r\n      <UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>\r\n      <Extensions>cpp;c;cc;cxx;c++;cppm;ixx;def;odl;idl;hpj;bat;asm;asmx</Extensions>\r\n    </Filter>\r\n    <Filter Include=\"Header Files\">\r\n      <UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>\r\n      <Extensions>h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd</Extensions>\r\n    </Filter>\r\n    <Filter Include=\"Resource Files\">\r\n      <UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier>\r\n      <Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>\r\n    </Filter>\r\n  </ItemGroup>\r\n  <ItemGroup>\r\n    <ClCompile Include=\"LoadedMemoryLatency.cpp\">\r\n      <Filter>Source Files</Filter>\r\n    </ClCompile>\r\n  </ItemGroup>\r\n  <ItemGroup>\r\n    <CustomBuild Include=\"LoadedMemoryLatency.asm\">\r\n      <Filter>Source Files</Filter>\r\n    </CustomBuild>\r\n  </ItemGroup>\r\n</Project>"
  },
  {
    "path": "LoadedMemoryLatency/LoadedMemoryLatency.c",
    "content": "#define _GNU_SOURCE\n#include <stdio.h>\n#include <stdlib.h>\n#include <stdint.h>\n#include <string.h>\n#include <unistd.h>\n#include <sys/mman.h>\n#include <sys/sysinfo.h>\n#include <sys/time.h>\n#include <sched.h>\n#include <pthread.h>\n#include <math.h>\n#include <errno.h>\n\n#define CACHELINE_SIZE 64\n\n// Access pattern used by the bandwidth load threads\nenum TestMethod {\n    Read,\n    Add\n};\n\n// Per-thread state for one bandwidth load thread\nstruct BandwidthTestThreadData {\n    uint64_t read_bytes;          // written by the thread: byte count returned by the asm kernel\n    uint64_t arr_length_bytes;\n    char *arr;\n    volatile int *flag;           // shared stop flag, set to 1 by the coordinating thread\n    cpu_set_t cpuset;\n    pthread_t handle;\n    enum TestMethod test_method;\n};\n\n// State for the pointer chasing latency thread\nstruct LatencyTestData {\n    uint32_t iterations;\n    uint32_t *arr;\n    float latency;                // written by the thread: ns per dependent load\n    cpu_set_t cpuset;\n    pthread_t handle;\n};\n\nint default_test_sizes[] = { 2, 4, 8, 12, 16, 24, 32, 48, 64, 96, 128, 192, 256, 384, 512, 600, 768, 1024, 1536, 2048, 2304, 2560,\n                               3072, 4096, 5120, 6144, 8192, 10240, 12288, 13312, 14336, 15360, 16384, 18432, 20480, 24567, 32768, 65536, 98304,\n                               131072, 262144, 393216, 524288, 1048576 };\n\nextern uint64_t asm_read(char *arr, uint64_t arr_length, volatile int *flag, int waitfactor) __attribute__((ms_abi)); \nextern uint64_t asm_add(char *arr, uint64_t arr_length, volatile int *flag, int waitfactor) __attribute__((ms_abi)); \nvoid *ReadBandwidthTestThread(void *param);\nvoid *FillBandwidthTestArr(void *param);\nvoid FillPatternArr(uint32_t *pattern_arr, uint32_t list_size, uint32_t byte_increment);\nvoid *RunLatencyTest(void *param);\nfloat RunTest(cpu_set_t latencyAffinity, cpu_set_t bwAffinity, int bwThreadCount, int hugepages, int sharedLatency, float *measuredBw); \nfloat RunBandwidthOnlyTest(cpu_set_t bwAffinity, int bwThreadCount, int sizeKb);\n\nuint64_t BandwidthTestMemoryKB = 1048576;\nuint64_t LatencyTestMemoryKB = 1048576;\nuint64_t LatencyTestIterations = 1e5;\nuint64_t throttle = 0;\nenum TestMethod testMethod = Read;\n\n// Returns the value that follows the option at argv[*argIdx] and advances *argIdx to it.\n// Exits with a message when the option is the last argument, instead of handing\n// argv[argc] (NULL) to atoi/strncmp.\nstatic char *NextArgValue(int argc, char *argv[], int *argIdx) {\n    if (*argIdx + 1 >= argc) {\n        fprintf(stderr, \"Option %s requires a value\\n\", argv[*argIdx]);\n        exit(1);\n    }\n\n    (*argIdx)++;\n    return argv[*argIdx];\n}\n\n// Parses options, then runs one of three modes:\n//   -bwonly: bandwidth only, across default_test_sizes\n//   -sharedlatency: latency + bandwidth threads sharing one array, across default_test_sizes\n//   default: latency under load, adding one bandwidth thread per step\nint main(int argc, char *argv[]) {\n    int bwThreadCap = get_nprocs() - 1;\n    int coreCount = get_nprocs();\n    int latencyCore = 0;\n    int *customCores = NULL;\n    int sharedLatency = 0, bwonly = 0;\n    if (argc == 1) {\n        fprintf(stderr, \"Options:\\n\");\n        fprintf(stderr, \"-bwthreads [int]: Number of bandwidth test threads\\n\");\n        fprintf(stderr, \"-latencyaffinity [int]: Core to run latency test thread on\\n\");\n        fprintf(stderr, \"-bwcores [comma separated list]: Cores to run bandwidth load on\\n\");\n        fprintf(stderr, \"-scaleiterations [int]: Iterations scaling factor\\n\");\n        fprintf(stderr, \"-throttle [int]: Reduce bandwidth load per bandwidth test thread\\n\");\n    }\n    for (int argIdx = 1; argIdx < argc; argIdx++) {\n        if (*(argv[argIdx]) == '-') {\n            char *arg = argv[argIdx] + 1;\n            if (strncmp(arg, \"bwthreads\", 9) == 0) {\n                bwThreadCap = atoi(NextArgValue(argc, argv, &argIdx));\n                fprintf(stderr, \"Using up to %d bw threads\\n\", bwThreadCap);\n            } else if (strncmp(arg, \"latencyaffinity\", 15) == 0) {\n                latencyCore = atoi(NextArgValue(argc, argv, &argIdx));\n                fprintf(stderr, \"Latency test thread will run in core %d\\n\", latencyCore);\n            } else if (strncmp(arg, \"scaleiterations\", 15) == 0) {\n                int scaleFactor = atoi(NextArgValue(argc, argv, &argIdx));\n                LatencyTestIterations *= scaleFactor;\n                fprintf(stderr, \"Scaling iterations up by a factor of %d\\n\", scaleFactor);\n            } else if (strncmp(arg, \"throttle\", 8) == 0) {\n                throttle = atol(NextArgValue(argc, argv, &argIdx));\n                fprintf(stderr, \"Pulling memory bandwidth test threads back, factor of %lu\\n\", throttle);\n            } else if (strncmp(arg, \"bwcores\", 7) == 0) {\n                char *customCoreListStr = NextArgValue(argc, argv, &argIdx);\n                bwThreadCap = 1;\n                for (int i = 0; customCoreListStr[i] != 0; i++) {   // shell should null terminate this\n                    if (customCoreListStr[i] == ',') {\n                        bwThreadCap++;\n                    }\n                }\n\n                // first pass stores the string offset of each list entry and splits the\n                // list in place; second pass converts each entry to a core number\n                customCores = (int *)malloc(sizeof(int) * bwThreadCap);\n                memset(customCores, 0, sizeof(int) * bwThreadCap);\n                int commaIdx = 1;\n                for (int i = 0; customCoreListStr[i] != 0; i++) {\n                    if (customCoreListStr[i] == ',') {\n                        customCores[commaIdx] = i + 1;\n                        commaIdx++;\n                        customCoreListStr[i] = '\\0';\n                    }\n                }\n\n                fprintf(stderr, \"Cores used for bandwidth load:\");\n                for (int i = 0; i < bwThreadCap; i++) {\n                    customCores[i] = atoi(customCoreListStr + customCores[i]);\n                    fprintf(stderr, \" %d\", customCores[i]);\n                }\n\n                fprintf(stderr, \"\\n\");\n            } else if (strncmp(arg, \"sharedlatency\", 13) == 0) {\n                fprintf(stderr, \"Shared arr bw+latency\\n\");\n                sharedLatency = 1;\n            } else if (strncmp(arg, \"bwonly\", 6) == 0) {\n                fprintf(stderr, \"Only testing bandwidth\\n\");\n                bwonly = 1;\n            } else if (strncmp(arg, \"method\", 6) == 0) {\n                char *methodStr = NextArgValue(argc, argv, &argIdx);\n                if (strncmp(methodStr, \"read\", 4) == 0) {\n                    testMethod = Read; \n                    fprintf(stderr, \"Testing with reads\\n\");\n                } else if (strncmp(methodStr, \"add\", 3) == 0) {\n                    testMethod = Add;\n                    fprintf(stderr, \"Testing with adds (RMW)\\n\");\n                }\n            }\n        }\n    }\n        \n    cpu_set_t latency_cpuset;\n    CPU_ZERO(&latency_cpuset);\n    CPU_SET(latencyCore, &latency_cpuset);\n    \n    cpu_set_t bw_cpuset;\n    CPU_ZERO(&bw_cpuset);\n\n    if (bwonly) {\n        fprintf(stderr, \"Only testing bandwidth to abuse the iteration logic\\n\");\n        int testSizeCount = sizeof(default_test_sizes) / sizeof(int);\n        float *bandwidths = (float *)malloc(sizeof(float) * testSizeCount);\n        memset(bandwidths, 0, sizeof(float) * testSizeCount);\n        // set the entire affinity mask right away\n        for (int bwThreadCount = 0; bwThreadCount <= bwThreadCap; bwThreadCount++) {\n            int nextCore;\n            if (bwThreadCount > 0) {\n                if (customCores == NULL) nextCore = coreCount - bwThreadCount - 1;\n                else nextCore = customCores[bwThreadCount - 1] ;\n                fprintf(stderr, \"next core is %d\\n\", nextCore);\n                CPU_SET(nextCore, &bw_cpuset);\n            } \n        }\n\n        for (int testSizeIdx = 0; testSizeIdx < testSizeCount; testSizeIdx++) {\n            int testSizeKb = default_test_sizes[testSizeIdx];\n            if (testSizeKb < bwThreadCap) {\n                fprintf(stderr, \"Skipping size %d because it's too small for specified thread count of %d\\n\", testSizeKb, bwThreadCap);\n                continue;\n            }\n\n            float bandwidth = RunBandwidthOnlyTest(bw_cpuset, bwThreadCap, testSizeKb);\n            bandwidths[testSizeIdx] = bandwidth;\n            fprintf(stderr, \"Test Size %d KB: %f GB/s\\n\", default_test_sizes[testSizeIdx], bandwidths[testSizeIdx]);\n        }\n\n        for (int testSizeIdx = 0; testSizeIdx < testSizeCount; testSizeIdx++) {\n            if (bandwidths[testSizeIdx] == 0.0f) continue;\n            printf(\"%d,%f\\n\", default_test_sizes[testSizeIdx], bandwidths[testSizeIdx]);\n        }\n\n        free(bandwidths);\n    }\n    else if (!sharedLatency) {\n        fprintf(stderr, \"%d cores, will use up to %d for BW threads\\n\", coreCount, bwThreadCap);\n        // One slot per thread count 0..bwThreadCap inclusive. Zero-initialized so entries\n        // that are never measured (early break below) print as 0 rather than garbage.\n        float *latencies = (float *)calloc(bwThreadCap + 1, sizeof(float));\n        float *bandwidths = (float *)calloc(bwThreadCap + 1, sizeof(float));\n        for (int bwThreadCount = 0; bwThreadCount <= bwThreadCap; bwThreadCount++) {\n            float bw;\n            int nextCore = 0;   // no bandwidth core is selected for the 0-thread baseline run\n            if (bwThreadCount > 0) {\n                if (customCores == NULL) nextCore = coreCount - bwThreadCount - 1;\n                else nextCore = customCores[bwThreadCount - 1] ;\n                fprintf(stderr, \"next core is %d\\n\", nextCore);\n                CPU_SET(nextCore, &bw_cpuset);\n            }\n\n            if (nextCore < 0) break;\n\n            // sharedlatency will always be false in this run mode\n            float latencyNs = RunTest(latency_cpuset, bw_cpuset, bwThreadCount, 1, sharedLatency, &bw);\n            fprintf(stderr, \"%d bw threads %f GB/s %f ns\\n\", bwThreadCount, bw, latencyNs);\n            latencies[bwThreadCount] = latencyNs;\n            bandwidths[bwThreadCount] = bw;\n        }\n\n        printf(\"BW Threads, Bandwidth (GB/s), Latency (ns)\\n\");\n        for (int bwThreadCount = 0; bwThreadCount <= bwThreadCap; bwThreadCount++) {\n            printf(\"%d, %f, %f\\n\", bwThreadCount, bandwidths[bwThreadCount], latencies[bwThreadCount]);\n        }\n        free(latencies);\n        free(bandwidths);\n    } else {\n        int testSizeCount = sizeof(default_test_sizes) / sizeof(int);\n        float *latencies = (float*)malloc(sizeof(float) * testSizeCount);\n        float *bandwidths = (float*)malloc(sizeof(float) * testSizeCount);\n        // set mask to all selected cores\n        for (int bwThreadCount = 0; bwThreadCount < bwThreadCap; bwThreadCount++) {\n            int nextCore;\n            if (customCores == NULL) nextCore = coreCount - bwThreadCount - 1;\n            else nextCore = customCores[bwThreadCount];\n            CPU_SET(nextCore, &bw_cpuset);\n            fprintf(stderr, \"Set core %d\\n\", nextCore);\n        }\n\n        for (int i = 0; i < testSizeCount; i++) {\n            LatencyTestMemoryKB = default_test_sizes[i];\n            latencies[i] = RunTest(latency_cpuset, bw_cpuset, bwThreadCap, 1, sharedLatency, bandwidths + i);\n            fprintf(stderr, \"%lu KB: %f ns %f GB/s\\n\", LatencyTestMemoryKB, latencies[i], bandwidths[i]);\n        }\n\n        printf(\"Test Size (KB), Latency (ns), Bandwidth (GB/s)\\n\");\n        for (int i = 0; i < testSizeCount; i++) {\n            printf(\"%d,%f,%f\\n\", default_test_sizes[i], latencies[i], bandwidths[i]);\n        }\n\n        free(latencies);\n        free(bandwidths);\n    }\n\n    if (customCores != NULL) free(customCores);\n    return 0;\n}\n\n// Caller ensures at least 1 KB per thread. Runs in private mode\n// (each thread gets its own array). Returns measured bandwidth in GB/s,\n// or 0 when there are no threads to run.\nfloat RunBandwidthOnlyTest(cpu_set_t bwAffinity, int bwThreadCount, int sizeKb) {\n    // avoids dividing the test size by zero below\n    if (bwThreadCount <= 0) return 0.0f;\n\n    volatile int flag = 0;\n    struct timeval startTv, endTv;\n    struct timezone startTz, endTz;  \n    struct BandwidthTestThreadData *bandwidthTestData = (struct BandwidthTestThreadData *)malloc(sizeof(struct BandwidthTestThreadData) * bwThreadCount);\n    uint64_t perThreadArrSizeBytes = ceil((double)sizeKb / (double)bwThreadCount) * 1024;\n\n    // Same initialization routine\n    for (int threadIdx = 0; threadIdx < bwThreadCount; threadIdx++) {\n        bandwidthTestData[threadIdx].read_bytes = 0;\n        bandwidthTestData[threadIdx].test_method = testMethod;\n        bandwidthTestData[threadIdx].flag = &flag;\n        bandwidthTestData[threadIdx].cpuset = bwAffinity;\n\n        bandwidthTestData[threadIdx].arr = (char *)malloc(perThreadArrSizeBytes);\n        bandwidthTestData[threadIdx].arr_length_bytes = perThreadArrSizeBytes;\n        pthread_create(&(bandwidthTestData[threadIdx].handle), NULL, FillBandwidthTestArr, (void *)(bandwidthTestData + threadIdx));\n    }\n\n    for (int threadIdx = 0; threadIdx < bwThreadCount; threadIdx++) {\n        pthread_join(bandwidthTestData[threadIdx].handle, NULL);\n    } \n\n    // Run bandwidth threads for a few seconds and get results\n    gettimeofday(&startTv, &startTz);\n    for (int threadIdx = 0; threadIdx < bwThreadCount; threadIdx++) {\n        pthread_create(&(bandwidthTestData[threadIdx].handle), NULL, ReadBandwidthTestThread, (void *)(bandwidthTestData + threadIdx));\n    }\n\n    sleep(3);\n    flag = 1;\n\n    for (int threadIdx = 0; threadIdx < bwThreadCount; threadIdx++) {\n        pthread_join(bandwidthTestData[threadIdx].handle, NULL);\n    }\n    \n    gettimeofday(&endTv, &endTz);\n\n\n    uint64_t time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 1000);\n    float totalReadData = 0;\n    for (int threadIdx = 0; threadIdx < bwThreadCount; threadIdx++) {\n        free(bandwidthTestData[threadIdx].arr);\n        totalReadData += (float)bandwidthTestData[threadIdx].read_bytes;\n    }\n\n    float measuredBw = 1000 * (totalReadData / (float)1e9) / (float)time_diff_ms; \n    free(bandwidthTestData); \n\n    return measuredBw;\n}\n// returns latency in ns\n// sets measuredBw = measured bandwidth\n// bwThreadCount may be 0 (unloaded baseline); sharedLatency makes the bandwidth\n// threads walk the latency array instead of private arrays\nfloat RunTest(cpu_set_t latencyAffinity, cpu_set_t bwAffinity, int bwThreadCount, int hugepages, int sharedLatency, float *measuredBw) {\n    // only meaningful with at least one bandwidth thread; dividing by a zero thread\n    // count would produce inf, and converting inf to an integer is undefined\n    uint64_t perThreadArrSizeBytes = 0;\n    if (bwThreadCount > 0) perThreadArrSizeBytes = ceil((double)BandwidthTestMemoryKB / (double)bwThreadCount) * 1024;\n    volatile int flag = 0;  // set 1 to stop\n    struct timeval startTv, endTv;\n    struct timezone startTz, endTz; \n    int map_failed = 0;\n\n    // MT bw test array fill\n    struct BandwidthTestThreadData *bandwidthTestData = (struct BandwidthTestThreadData *)malloc(sizeof(struct BandwidthTestThreadData) * bwThreadCount);\n    for (int threadIdx = 0; threadIdx < bwThreadCount; threadIdx++) {\n        bandwidthTestData[threadIdx].read_bytes = 0;\n        bandwidthTestData[threadIdx].test_method = testMethod;\n        bandwidthTestData[threadIdx].flag = &flag;\n        bandwidthTestData[threadIdx].cpuset = bwAffinity;\n\n        if (!sharedLatency) {\n            bandwidthTestData[threadIdx].arr = (char *)malloc(perThreadArrSizeBytes);\n            bandwidthTestData[threadIdx].arr_length_bytes = perThreadArrSizeBytes;\n            pthread_create(&(bandwidthTestData[threadIdx].handle), NULL, FillBandwidthTestArr, (void *)(bandwidthTestData + threadIdx));\n        }\n    }\n\n    // set up latency test\n    uint32_t *latencyArr;\n    latencyArr = mmap(NULL, LatencyTestMemoryKB * 1024, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);\n    if (latencyArr == (void *)-1) {  // MAP_FAILED\n        fprintf(stderr, \"Failed to map hugepages arr, will use madvise\\n\");\n        if (0 != posix_memalign((void **)(&latencyArr), 64, LatencyTestMemoryKB * 1024)) {\n            fprintf(stderr, \"Failed to allocate %lu KB of memory for latency test\\n\", LatencyTestMemoryKB);\n            *measuredBw = 0.0f;  // callers print this even when the test could not run\n            return 0.0f;\n        }\n\n        madvise(latencyArr, LatencyTestMemoryKB * 1024, MADV_HUGEPAGE);\n        map_failed = 1;\n    }\n\n    struct LatencyTestData latencyTestData;\n    latencyTestData.iterations = LatencyTestIterations;\n    latencyTestData.latency = 0.0f;\n    latencyTestData.cpuset = latencyAffinity;\n    latencyTestData.arr = latencyArr;\n    FillPatternArr(latencyArr, (LatencyTestMemoryKB * 256), CACHELINE_SIZE);\n\n    // let bw array fills finish\n    for (int threadIdx = 0; threadIdx < bwThreadCount && !sharedLatency; threadIdx++) {\n        pthread_join(bandwidthTestData[threadIdx].handle, NULL);\n    }\n\n    // use one array for all bw test threads. latency test size applies across bw threads\n    if (sharedLatency) {\n        for (int threadIdx = 0; threadIdx < bwThreadCount; threadIdx++) {\n            bandwidthTestData[threadIdx].arr = (char *)latencyArr;\n            bandwidthTestData[threadIdx].arr_length_bytes = LatencyTestMemoryKB * 1024;\n        }\n    }\n\n    gettimeofday(&startTv, &startTz);\n    // start bw test threads\n    for (int threadIdx = 0; threadIdx < bwThreadCount; threadIdx++) {\n        pthread_create(&(bandwidthTestData[threadIdx].handle), NULL, ReadBandwidthTestThread, (void *)(bandwidthTestData + threadIdx));\n    }\n\n    pthread_create(&(latencyTestData.handle), NULL, RunLatencyTest, (void *)&latencyTestData); \n    pthread_join(latencyTestData.handle, NULL);\n    flag = 1;\n\n    for (int threadIdx = 0; threadIdx < bwThreadCount; threadIdx++) {\n        pthread_join(bandwidthTestData[threadIdx].handle, NULL);\n    }\n    \n    gettimeofday(&endTv, &endTz);\n\n    // count on a cacheline basis even though the test only loads 4B at a time\n    uint64_t latencyReadBytes = 64 * LatencyTestIterations;\n\n    uint64_t time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 1000);\n    float totalReadData = (float)latencyReadBytes;\n    for (int threadIdx = 0; threadIdx < bwThreadCount; threadIdx++) {\n        if (!sharedLatency) free(bandwidthTestData[threadIdx].arr);\n        totalReadData += (float)bandwidthTestData[threadIdx].read_bytes;\n    }\n\n    *measuredBw = 1000 * (totalReadData / (float)1e9) / (float)time_diff_ms; \n\n    free(bandwidthTestData);\n    if (map_failed) free(latencyArr);\n    else munmap(latencyArr, LatencyTestMemoryKB * 1024); \n    return latencyTestData.latency;\n}\n\n// Builds a randomized pointer chasing chain: one element every byte_increment bytes,\n// each holding the index of the next element to visit. list_size is in 4-byte elements.\nvoid FillPatternArr(uint32_t *pattern_arr, uint32_t list_size, uint32_t byte_increment) {\n    uint32_t increment = byte_increment / sizeof(uint32_t);\n    uint32_t element_count = list_size / increment;\n    for (int i = 0; i < element_count; i++) {\n        pattern_arr[i * increment] = i * increment;\n    }\n\n    // swap each slot with a strictly lower one, working from the end down\n    int iter = element_count;\n    while (iter > 1) {\n        iter -= 1;\n        int j = iter - 1 == 0 ? 0 : rand() % (iter - 1);\n        uint32_t tmp = pattern_arr[iter * increment];\n        pattern_arr[iter * increment] = pattern_arr[j * increment];\n        pattern_arr[j * increment] = tmp;\n    }\n}\n\n// No need for simple addressing because this test should be operating well in DRAM\n// where an extra cycle for indexed addressing should not make a big difference\n// returns load to use latency in nanoseconds\n// size_kb should be divisible by 2M, or whatever the hugepage size is\n// Thread entry point: param is a struct LatencyTestData *, result goes to its latency field\nvoid *RunLatencyTest(void *param) {\n    struct timeval startTv, endTv;\n    struct timezone startTz, endTz;\n    struct LatencyTestData *testData = (struct LatencyTestData *)param;\n    uint32_t *A = testData->arr;\n    uint32_t iterations = testData->iterations;\n    uint32_t sum = 0, current;\n\n    // pin this thread to the requested core; a failure is reported but the test still runs\n    int rc = sched_setaffinity(0, sizeof(cpu_set_t), &(testData->cpuset));\n    if (rc != 0) fprintf(stderr, \"Latency thread failed to set affinity\\n\");\n\n    // Run test\n    gettimeofday(&startTv, &startTz);\n    current = A[0];\n    for (int i = 0; i < iterations; i++) {\n        current = A[current];\n        sum += current;\n    }\n    gettimeofday(&endTv, &endTz);\n    uint64_t time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 1000);\n    testData->latency = 1e6 * (float)time_diff_ms / (float)iterations;\n\n    // sum is printed so the dependent load chain has an observable result\n    if (sum == 0) printf(\"sum == 0 (?)\\n\");\n    return NULL;\n}\n\n// Thread entry point: writes a float to every 4 bytes of the thread's array\n// before it is used for the bandwidth test\nvoid *FillBandwidthTestArr(void *param) {\n    struct BandwidthTestThreadData *bwTestData = (struct BandwidthTestThreadData *)param;\n    float *arr = (float *)bwTestData->arr;\n    uint64_t float_elements = bwTestData->arr_length_bytes / 4;\n    for (int i = 0; i < float_elements;i++) {\n        arr[i] = (i + ((uint64_t)arr & 0x3)) + 0.2f;\n    }\n\n    return NULL;\n}\n\n// Thread entry point: pins itself to the bandwidth core set, runs the selected asm\n// kernel until the stop flag is set, and stores the kernel's byte count in read_bytes\nvoid *ReadBandwidthTestThread(void *param) {\n    struct BandwidthTestThreadData *bwTestData = (struct BandwidthTestThreadData *)param;\n    int rc = sched_setaffinity(0, sizeof(cpu_set_t), &(bwTestData->cpuset));\n    if (rc != 0) {\n        fprintf(stderr, \"BW test thread failed to set affinity: %s\\n\", strerror(errno));\n        for (int i = 0; i < 8; i++) {\n            if (CPU_ISSET(i, &(bwTestData->cpuset))) fprintf(stderr, \"\\tCPU %d is set\\n\", i);\n            else fprintf(stderr, \"\\tCPU %d is NOT set\\n\", i);\n        }\n    }\n\n    // stays 0 if the method is unsupported, so no indeterminate value is reported\n    uint64_t totalDataBytes = 0;\n    if (bwTestData->test_method == Read) totalDataBytes = asm_read(bwTestData->arr, bwTestData->arr_length_bytes, bwTestData->flag, throttle);\n    else if (bwTestData->test_method == Add) totalDataBytes = asm_add(bwTestData->arr, bwTestData->arr_length_bytes, bwTestData->flag, throttle);\n    else fprintf(stderr, \"Unsupported test method\\n\");\n    bwTestData->read_bytes = totalDataBytes;\n    return NULL;\n}\n"
  },
  {
    "path": "LoadedMemoryLatency/LoadedMemoryLatency_amd64.s",
    "content": ".global asm_read\n.global asm_add\n\n/* Both routines are declared ms_abi on the C side:\n   rcx = ptr to array\n   rdx = arr length in bytes\n   r8 = stop flag\n   r9 = throttle factor\n   return bytes read in rax\n*/\nasm_read:\n  push %rdi\n  push %rsi\n  push %r10\n  push %r11\n  mov %rcx, %rdi\n  xor %rsi, %rsi\n  xor %rax, %rax\nasm_read_pass_loop:\n  /* load 128B */\n  movups (%rdi), %xmm0\n  movups 16(%rdi), %xmm0\n  movups 32(%rdi), %xmm0\n  movups 48(%rdi), %xmm0\n  movups 64(%rdi), %xmm0\n  movups 80(%rdi), %xmm0\n  movups 96(%rdi), %xmm0\n  movups 112(%rdi), %xmm0\n\n  add $128, %rdi\n  add $128, %rsi\n  add $128, %rax\n\n  test %r9, %r9\n  jz asm_read_throttle_end\n  mov %r9, %r10\nasm_read_throttle:\n  dec %r10\n  jnz asm_read_throttle\nasm_read_throttle_end:\n  /* check stop flag */\n  mov (%r8), %r10d\n  test %r10d, %r10d\n  jnz asm_read_end\n\n  cmp %rsi, %rdx\n  jg asm_read_pass_loop\n  mov %rcx, %rdi\n  xor %rsi, %rsi\n  jmp asm_read_pass_loop\nasm_read_end:\n  pop %r11\n  pop %r10\n  pop %rsi\n  pop %rdi\n  ret\n\n/* Read-modify-write variant. xmm0 holds the first 16B of the array; it is added to\n   every 16B chunk and the result is stored back to the same chunk. Each byte is\n   therefore loaded and stored once, which is why the count in rax is doubled on return.\n*/\nasm_add:\n  push %rdi\n  push %rsi\n  push %r10\n  push %r11\n  mov %rcx, %rdi\n  xor %rsi, %rsi\n  xor %rax, %rax\n  movups (%rdi), %xmm0\nasm_add_pass_loop:\n  /* read-modify-write 128B */\n  movups %xmm0, %xmm1\n  addps (%rdi), %xmm1\n  movups %xmm1, (%rdi)\n  \n  movups %xmm0, %xmm1\n  addps 16(%rdi), %xmm1\n  movups %xmm1, 16(%rdi)\n\n  movups %xmm0, %xmm1\n  addps 32(%rdi), %xmm1\n  movups %xmm1, 32(%rdi)\n  \n  movups %xmm0, %xmm1\n  addps 48(%rdi), %xmm1\n  movups %xmm1, 48(%rdi)\n\n  movups %xmm0, %xmm1\n  addps 64(%rdi), %xmm1\n  movups %xmm1, 64(%rdi)\n\n  movups %xmm0, %xmm1\n  addps 80(%rdi), %xmm1\n  movups %xmm1, 80(%rdi)\n\n  movups %xmm0, %xmm1\n  addps 96(%rdi), %xmm1\n  movups %xmm1, 96(%rdi)\n\n  movups %xmm0, %xmm1\n  addps 112(%rdi), %xmm1\n  movups %xmm1, 112(%rdi)\n\n  add $128, %rdi\n  add $128, %rsi\n  add $128, %rax\n\n  test %r9, %r9\n  jz asm_add_throttle_end\n  mov %r9, %r10\nasm_add_throttle:\n  dec %r10\n  jnz asm_add_throttle\nasm_add_throttle_end:\n  /* check stop flag */\n  mov (%r8), %r10d\n  test %r10d, %r10d\n  jnz asm_add_end\n\n  cmp %rsi, %rdx\n  jg asm_add_pass_loop\n  mov %rcx, %rdi\n  xor %rsi, %rsi\n  jmp asm_add_pass_loop\nasm_add_end:\n  pop %r11\n  pop %r10\n  pop %rsi\n  pop %rdi\n  shl $1, %rax /* count rmw as 2 */\n  ret \n"
  },
  {
    "path": "LoadedMemoryLatency/LoadedMemoryLatency_arm.s",
    "content": ".global asm_read\n.global _asm_read\n\n.global asm_add\n.global _asm_add\n\n/* x0 = ptr to array\n   x1 = arr length in bytes\n   x2 = stop flag\n   x3 = throttle factor\n   return bytes read in x0\n*/\n_asm_read:\nasm_read:\n  sub sp, sp, #0x40\n  stp x14, x15, [sp, #0x10]\n  stp x12, x13, [sp, #0x20]\n  stp x11, x10, [sp, #0x30]\n  sub x1, x1, 128\n  mov x15, x0    /* ptr into array */\n  mov x12, 0     /* current offset into array */\n  mov x13, 0     /* data transferred in bytes */\nasm_read_pass_loop:\n  /* load 128B */\n  ldr q16, [x15]\n  ldr q16, [x15, 16]\n  ldr q16, [x15, 32]\n  ldr q16, [x15, 48]\n  ldr q16, [x15, 64]\n  ldr q16, [x15, 80]\n  ldr q16, [x15, 96]\n  ldr q16, [x15, 112]\n  add x12, x12, 128\n  add x15, x15, 128\n  add x13, x13, 128\n\n  cbz x3, asm_read_throttle_end\n  mov x10, x3    /* save throttle factor */\nasm_read_throttle:\n  sub x10, x10, 1\n  cbnz x10, asm_read_throttle\nasm_read_throttle_end:  \n\n  /* end condition */\n  ldr w14, [x2]\n  cbnz x14, asm_read_end\n\n  /* loop back condition */\n  cmp x1, x12\n  b.gt asm_read_pass_loop\n  mov x15, x0\n  mov x12, 0\n  b asm_read_pass_loop\nasm_read_end:\n  mov x0, x13\n  ldp x11, x10, [sp, #0x30]\n  ldp x12, x13, [sp, #0x20]\n  ldp x14, x15, [sp, #0x10]\n  add sp, sp, #0x40\n  ret\n\n/* Read-modify-write variant: adds the first 16B of the array (held in v17) to every\n   16B chunk and stores the result back, counting 256 bytes (128 loaded + 128 stored)\n   per pass. v17 is used for the addend because v16-v31 are caller-saved, while the\n   low halves of v8-v15 must be preserved across calls.\n*/\n_asm_add:\nasm_add:\n  sub sp, sp, #0x40\n  stp x14, x15, [sp, #0x10]\n  stp x12, x13, [sp, #0x20]\n  stp x11, x10, [sp, #0x30]\n  sub x1, x1, 128\n  mov x15, x0    /* ptr into array */\n  mov x12, 0     /* current offset into array */\n  mov x13, 0     /* data transferred in bytes */\n  ldr q17, [x15]\nasm_add_pass_loop:\n  /* read-modify-write 128B */\n  ldr q16, [x15]\n  add v16.4s, v16.4s, v17.4s\n  str q16, [x15]\n  ldr q16, [x15, 16]\n  add v16.4s, v16.4s, v17.4s\n  str q16, [x15, 16]\n  ldr q16, [x15, 32]\n  add v16.4s, v16.4s, v17.4s\n  str q16, [x15, 32]\n  ldr q16, [x15, 48]\n  add v16.4s, v16.4s, v17.4s\n  str q16, [x15, 48]\n  ldr q16, [x15, 64]\n  add v16.4s, v16.4s, v17.4s\n  str q16, [x15, 64]\n  ldr q16, [x15, 80]\n  add v16.4s, v16.4s, v17.4s\n  str q16, [x15, 80]\n  ldr q16, [x15, 96]\n  add v16.4s, v16.4s, v17.4s\n  str q16, [x15, 96]\n  ldr q16, [x15, 112]\n  add v16.4s, v16.4s, v17.4s\n  str q16, [x15, 112]\n  add x12, x12, 128\n  add x15, x15, 128\n  add x13, x13, 256\n\n  cbz x3, asm_add_throttle_end\n  mov x10, x3    /* save throttle factor */\nasm_add_throttle:\n  sub x10, x10, 1\n  cbnz x10, asm_add_throttle\nasm_add_throttle_end:  \n\n  /* end condition */\n  ldr w14, [x2]\n  cbnz x14, asm_add_end\n\n  /* loop back condition */\n  cmp x1, x12\n  b.gt asm_add_pass_loop\n  mov x15, x0\n  mov x12, 0\n  b asm_add_pass_loop\nasm_add_end:\n  mov x0, x13\n  ldp x11, x10, [sp, #0x30]\n  ldp x12, x13, [sp, #0x20]\n  ldp x14, x15, [sp, #0x10]\n  add sp, sp, #0x40\n  ret \n"
  },
  {
    "path": "LoadedMemoryLatency/Makefile",
    "content": "# LoadedMemoryLatency.c creates threads with pthread_create, so build with -pthread\n# (older glibc versions keep the pthread symbols in a separate libpthread).\namd64:\n\tgcc -O3 -pthread LoadedMemoryLatency.c LoadedMemoryLatency_amd64.s -o loadedlat_amd64 -lm\naarch64:\n\tgcc -O3 -pthread LoadedMemoryLatency.c LoadedMemoryLatency_arm.s -o loadedlat_aarch64 -lm\n\n# Neither target produces a file named after itself\n.PHONY: amd64 aarch64\n"
  },
  {
    "path": "Makefile",
    "content": "include Common/arch_detect.mk\n\nCOMPONENTS = CoherencyLatency MemoryLatency MemoryBandwidth InstructionRate Meshsim CoreClockChecker GpuMemLatency\n\nall: $(COMPONENTS) \n\n# Stop at the first component that fails to build. A bare shell for loop only reports\n# the exit status of its last iteration, so earlier failures would go unnoticed in CI.\nci:\n\tfor COMPONENT in $(COMPONENTS); do $(MAKE) -C $$COMPONENT ci || exit 1; done\n\npackage:\n\t@sh Common/ci_package.sh\n\nclean-package:\n\tfind . -maxdepth 1 -type d -name \"clammarks-*\" -exec rm -rf {} \\; && rm -f \"clammarks.txz\"\n\nclean: \n\tfor COMPONENT in $(COMPONENTS); do $(MAKE) -C $$COMPONENT clean; done\n\n$(COMPONENTS): .FORCE\n\t$(MAKE) -C $@ \n\n.FORCE:\n\n.PHONY: all ci package clean-package clean\n"
  },
  {
    "path": "MemoryBandwidth/Makefile",
    "content": "include ../Common/arch_detect.mk\n\nCFLAGS = -pthread -O3\nLDFLAGS= -lm\n\nall: $(TARGET)\n\namd64:\n\t$(CC) $(CFLAGS) MemoryBandwidth.c MemoryBandwidth_x86.s -o MemoryBandwidth_amd64 $(LDFLAGS)\n\namd64-numa:\n\t$(CC) $(CFLAGS) -DNUMA MemoryBandwidth.c MemoryBandwidth_x86.s -o MemoryBandwidth_numa_amd64 $(LDFLAGS) -lnuma\n\naarch64:\n\t$(CC) $(CFLAGS) MemoryBandwidth.c MemoryBandwidth_arm.s -o MemoryBandwidth_aarch64 $(LDFLAGS)\n\ntermux:\n\tgcc -O3 -pthread MemoryBandwidth.c MemoryBandwidth_arm.s -o MemoryBandwidth_aarch64 -lm\n\naarch64-numa:\n\t$(CC) $(CFLAGS) -DNUMA MemoryBandwidth.c MemoryBandwidth_arm.s -o MemoryBandwidth_numa_aarch64 $(LDFLAGS) -lnuma\n\nriscv64:\n\t$(CC) $(CFLAGS) -march=rv64gcv0p7 MemoryBandwidth.c MemoryBandwidth_riscv.s -o MemoryBandwidth_riscv64 $(LDFLAGS)\n\nw64:\n\t$(CC) $(CFLAGS) MemoryBandwidth.c MemoryBandwidth_x86.s -o MemoryBandwidth_w64.exe $(LDFLAGS)\n\nci: amd64 amd64-numa aarch64 w64\n\nclean:\n\trm -f *.o && find . -type f -executable -delete\n\n# The architecture targets name a build flavour, not the file they produce\n.PHONY: all ci clean amd64 amd64-numa aarch64 termux aarch64-numa riscv64 w64\n"
  },
  {
    "path": "MemoryBandwidth/MemoryBandwidth/MemoryBandwidth.cpp",
    "content": "// MemoryBandwidth.cpp : This file contains the 'main' function. Program execution begins and ends there.\r\n//\r\n\r\n#include <stdio.h>\r\n#include <stdlib.h>\r\n#include <stdint.h>\r\n#include <string.h>\r\n#ifdef __MINGW32__\r\n    #include <sys/timeb.h>\r\n#else\r\n    #include <sys\\timeb.h>\r\n#endif\r\n#include <math.h>\r\n#include <intrin.h>\r\n#include <immintrin.h>\r\n#include <windows.h>\r\n\r\n#define NUMA_STRIPE 1\r\n#define NUMA_SEQ 2\r\n#define NUMA_CROSSNODE 3\r\n#define NUMA_AUTO 4\r\n\r\n#ifdef _WIN64\r\nint default_test_sizes[39] = { 2, 4, 8, 12, 16, 24, 32, 48, 64, 96, 128, 192, 256, 512, 600, 768, 1024, 1536, 2048,\r\n                               3072, 4096, 5120, 6144, 8192, 10240, 12288, 16384, 24567, 32768, 65536, 98304,\r\n                               131072, 262144, 393216, 524288, 1048576, 1572864, 2097152, 3145728 };\r\n#else\r\nint default_test_sizes[35] = { 2, 4, 8, 12, 16, 24, 32, 48, 64, 96, 128, 192, 256, 512, 600, 768, 1024, 1536, 2048,\r\n                               3072, 4096, 5120, 6144, 8192, 10240, 12288, 16384, 24567, 32768, 65536, 98304,\r\n                               131072, 262144, 393216, 524288 };\r\n#endif\r\n\r\nenum NopType { None, FourByte, EightByte, K8_FourByte, Branch16, TenByte, LEA };\r\n\r\nstruct BandwidthTestThreadData {\r\n    uint32_t iterations;\r\n    uint32_t arr_length;\r\n    float* arr;\r\n    float bw; // written to by the thread\r\n};\r\n\r\n#ifdef _WIN64\r\nuint32_t dataGb = 512;\r\n#else\r\nuint32_t dataGb = 96;\r\n#endif\r\n//__int32 dataGb = 32;\r\n\r\n// array length = number of 4 byte elements\r\nfloat _fastcall scalar_read(void* arr, uint32_t arr_length, uint32_t iterations);\r\n\r\n#ifdef _WIN64\r\nextern \"C\" float sse_asm_read(void* arr, uint64_t arr_length, uint64_t iterations);\r\nextern \"C\" float sse_asm_write(void* arr, uint64_t arr_length, uint64_t iterations);\r\nextern \"C\" float sse_asm_ntwrite(void* arr, uint64_t arr_length, uint64_t 
iterations);\r\nextern \"C\" float sse_asm_copy(void* arr, uint64_t arr_length, uint64_t iterations);\r\nextern \"C\" float sse_asm_add(void* arr, uint64_t arr_length, uint64_t iterations);\r\nextern \"C\" float avx_asm_read(void* arr, uint64_t arr_length, uint64_t iterations);\r\nextern \"C\" float avx_asm_write(void* arr, uint64_t arr_length, uint64_t iterations);\r\nextern \"C\" float avx_asm_copy(void* arr, uint64_t arr_length, uint64_t iterations);\r\nextern \"C\" float avx_asm_cflip(void* arr, uint64_t arr_length, uint64_t iterations);\r\nextern \"C\" float avx_asm_add(void* arr, uint64_t arr_length, uint64_t iterations);\r\nextern \"C\" float avx512_asm_read(void* arr, uint64_t arr_length, uint64_t iterations);\r\nextern \"C\" float repmovsb_copy(void* arr, uint64_t arr_length, uint64_t iterations);\r\nextern \"C\" float repstosb_write(void* arr, uint64_t arr_length, uint64_t iterations);\r\nextern \"C\" float clzero_asm_write(void* arr, uint64_t arr_length, uint64_t iterations);\r\nfloat (*bw_func)(void*, uint64_t, uint64_t) = sse_asm_read;\r\n\r\n#else\r\nextern \"C\" float __fastcall scalar_asm_read32(void* arr, uint32_t arr_length, uint32_t iterations);\r\nextern \"C\" float __fastcall mmx_asm_read32(void* arr, uint32_t arr_length, uint32_t iterations);\r\nextern \"C\" float __fastcall sse_asm_read32(void* arr, uint32_t arr_length, uint32_t iterations);\r\nextern \"C\" float __fastcall dummy(void* arr, uint32_t arr_length, uint32_t iterations);\r\nfloat(_fastcall *bw_func)(void*, uint32_t, uint32_t) = dummy;\r\n#endif\r\n\r\nfloat MeasureBw(uint32_t sizeKb, uint32_t iterations, uint32_t threads, int shared, enum NopType instr);\r\nfloat MeasureInstructionBw(uint64_t sizeKb, uint64_t iterations, enum NopType nopSize, uint32_t threads, int shared);\r\nvoid FillInstructionArray(uint64_t* arr, uint64_t sizeKb, enum NopType nopSize);\r\n#ifdef _WIN64\r\nfloat __fastcall instr_read(void* arr, uint64_t arr_length, uint64_t iterations);\r\n#else\r\nfloat 
__fastcall instr_read(void* arr, uint32_t arr_length, uint32_t iterations);\r\n#endif\r\n\r\nvoid PrintNumaInfo();\r\nuint32_t GetIterationCount(uint32_t testSize, uint32_t threads);\r\nDWORD WINAPI ReadBandwidthTestThread(LPVOID param);\r\n\r\nint numa = 0;\r\nchar coreNode, memNode;\r\nchar GetSeqNode(uint64_t);\r\nchar GetStripeNode(uint64_t);\r\n\r\nint main(int argc, char *argv[]) {\r\n    int threads = 1, shared = 0, methodSet = 0;\r\n    enum NopType instr = None;\r\n    int cpuid_data[4];\r\n    int singleSize = 0;\r\n\r\n    if (argc == 1) {\r\n        printf(\"Usage: [-threads <thread count>] [-method <scalar/sse/avx/asm_avx/asm_avx512>] [-shared] [-private] [-data <base GB to transfer, default = %d>]\\n\", dataGb);\r\n    }\r\n\r\n    for (int argIdx = 1; argIdx < argc; argIdx++) {\r\n        if (*(argv[argIdx]) == '-') {\r\n            char* arg = argv[argIdx] + 1;\r\n            if (_strnicmp(arg, \"threads\", 7) == 0) {\r\n                argIdx++;\r\n                threads = atoi(argv[argIdx]);\r\n                fprintf(stderr, \"Using %d threads\\n\", threads);\r\n            }\r\n            else if (_strnicmp(arg, \"shared\", 6) == 0) {\r\n                shared = 1;\r\n                fprintf(stderr, \"Using one array shared across all threads\\n\");\r\n            }\r\n            else if (_strnicmp(arg, \"private\", 7) == 0) {\r\n                shared = 0;\r\n                fprintf(stderr, \"Using private array for each thread\\n\");\r\n            }\r\n            else if (_strnicmp(arg, \"method\", 6) == 0) {\r\n                methodSet = 1;\r\n                argIdx++;\r\n#ifdef _WIN64\r\n                if (_strnicmp(argv[argIdx], \"read_asm_sse\", 7) == 0) {\r\n                    bw_func = sse_asm_read;\r\n                    fprintf(stderr, \"Using SSE assembly\\n\");\r\n                }\r\n                else if (_strnicmp(argv[argIdx], \"read_asm_avx512\", 10) == 0) {\r\n                    bw_func = avx512_asm_read;\r\n         
           fprintf(stderr, \"Using AVX512 assembly\\n\");\r\n                }\r\n                else if (_strnicmp(argv[argIdx], \"write_asm_avx\", 14) == 0) {\r\n                    bw_func = avx_asm_write;\r\n                    fprintf(stderr, \"Using AVX assembly, writing instead of reading\\n\");\r\n                }\r\n                else if (_strnicmp(argv[argIdx], \"read_asm_avx\", 12) == 0) {\r\n                    bw_func = avx_asm_read;\r\n                    fprintf(stderr, \"Using AVX assembly\\n\");\r\n                }\r\n                else if (_strnicmp(argv[argIdx], \"copy_asm_avx\", 12) == 0) {\r\n                    bw_func = avx_asm_copy;\r\n                    fprintf(stderr, \"Using AVX assembly, copying one half of array to the other\\n\");\r\n                }\r\n                else if (_strnicmp(argv[argIdx], \"cflip_asm_avx\", 13) == 0) {\r\n                    bw_func = avx_asm_cflip;\r\n                    fprintf(stderr, \"Using AVX assembly, flipping order of vec sized elements within a cacheline\\n\");\r\n                }\r\n                else if (_strnicmp(argv[argIdx], \"add_asm_avx\", 11) == 0) {\r\n                    bw_func = avx_asm_add;\r\n                    fprintf(stderr, \"Using AVX assembly, adding constant to array\\n\");\r\n                }\r\n                else if (_strnicmp(argv[argIdx], \"copy_asm_sse\", 12) == 0) {\r\n                    bw_func = sse_asm_copy;\r\n                    fprintf(stderr, \"Using SSE assembly, copying one half of array to the other\\n\");\r\n                }\r\n                else if (_strnicmp(argv[argIdx], \"write_asm_sse\", 13) == 0) {\r\n                    bw_func = sse_asm_write;\r\n                    fprintf(stderr, \"Using SSE assembly, writing\\n\");\r\n                }\r\n                else if (_strnicmp(argv[argIdx], \"ntwrite_asm_sse\", 13) == 0) {\r\n                    bw_func = sse_asm_ntwrite;\r\n                    fprintf(stderr, \"Using SSE assembly, 
non-temporal writes\\n\");\r\n                }\r\n                else if (_strnicmp(argv[argIdx], \"add_asm_sse\", 11) == 0) {\r\n                    bw_func = sse_asm_add;\r\n                    fprintf(stderr, \"Using SSE assembly, adding constant to array\\n\");\r\n                }\r\n                else if (_strnicmp(argv[argIdx], \"copy_repmovsb\", 11) == 0) {\r\n                    bw_func = repmovsb_copy;\r\n                    fprintf(stderr, \"Using assembly, rep movsb to copy one half of the array to the other\\n\");\r\n                }\r\n                else if (_strnicmp(argv[argIdx], \"write_repstosb\", 11) == 0) {\r\n                    bw_func = repstosb_write;\r\n                    fprintf(stderr, \"Using assembly, rep stosb to set array contents to 1\\n\");\r\n                }\r\n                else if (_strnicmp(argv[argIdx], \"clzero\", 11) == 0) {\r\n                    bw_func = clzero_asm_write;\r\n                    fprintf(stderr, \"Using assembly, clzero to set array contents to 0\\n\");\r\n                }\r\n#else\r\n                if (_strnicmp(argv[argIdx], \"scalar\", 6) == 0) {\r\n                    bw_func = scalar_asm_read32;\r\n                    fprintf(stderr, \"Using scalar MOV r <- mem32\\n\");\r\n                }\r\n                else if (_strnicmp(argv[argIdx], \"sse\", 3) == 0) {\r\n                    bw_func = sse_asm_read32;\r\n                    fprintf(stderr, \"Using SSE MOVAPS xmm <- mem128\\n\");\r\n                }\r\n                else if (_strnicmp(argv[argIdx], \"mmx\", 3) == 0) {\r\n                    bw_func = mmx_asm_read32;\r\n                    fprintf(stderr, \"Using MMX MOVQ mm <- mem64\\n\");\r\n                }\r\n#endif\r\n                else if (_strnicmp(argv[argIdx], \"instr8\", 6) == 0) {\r\n                    instr = EightByte;\r\n                    fprintf(stderr, \"Using 8B NOPs\\n\");\r\n                }\r\n                else if (_strnicmp(argv[argIdx], \"instr4\", 
6) == 0) {\r\n                    instr = FourByte;\r\n                    fprintf(stderr, \"Using 4B NOPs\\n\");\r\n                }\r\n                else if (_strnicmp(argv[argIdx], \"instrk8_4\", 6) == 0) {\r\n                    instr = K8_FourByte;\r\n                    fprintf(stderr, \"Using 4B NOPs, with encoding recommended in the Athlon optimization manual\\n\");\r\n                }\r\n                else if (_strnicmp(argv[argIdx], \"instr_lea\", 6) == 0) {\r\n                    instr = LEA;\r\n                    fprintf(stderr, \"Using LEA\\n\");\r\n                }\r\n                else if (_strnicmp(argv[argIdx], \"branch16\", 6) == 0) {\r\n                    instr = Branch16;\r\n                    fprintf(stderr, \"Using branch per 16B\\n\");\r\n                }\r\n                else if (_strnicmp(argv[argIdx], \"instr10\", 7) == 0)\r\n                {\r\n                    instr = TenByte;\r\n                    fprintf(stderr, \"Using 10B NOPs\\n\");\r\n                }\r\n                else {\r\n                    methodSet = 0;\r\n                    fprintf(stderr, \"I'm so confused. 
Gonna use whatever the CPU supports I guess\\n\");\r\n                }\r\n            }\r\n            else if (_strnicmp(arg, \"data\", 4) == 0) {\r\n                argIdx++;\r\n                dataGb = atoi(argv[argIdx]);\r\n                fprintf(stderr, \"Base data to transfer: %u\\n\", dataGb);\r\n            }\r\n            else if (_strnicmp(arg, \"printnumainfo\", 8) == 0) {\r\n                fprintf(stderr, \"Printing NUMA info and exiting\\n\");\r\n                PrintNumaInfo();\r\n                return 0;\r\n            }\r\n            else if (_strnicmp(arg, \"numa\", 4) == 0) {\r\n                argIdx++;\r\n                fprintf(stderr, \"Attempting to be NUMA aware\\n\");\r\n                numa = NUMA_SEQ;\r\n                if (_strnicmp(argv[argIdx], \"stripe\", 6) == 0) {\r\n                    numa = NUMA_STRIPE;\r\n                }\r\n                else if (_strnicmp(argv[argIdx], \"seq\", 3) == 0) {\r\n                    numa = NUMA_SEQ;\r\n                }\r\n\r\n                if (numa == NUMA_SEQ) fprintf(stderr, \"Filling NUMA nodes one by one\\n\");\r\n                else if (numa == NUMA_STRIPE) fprintf(stderr, \"Striping threads across NUMA nodes\\n\");\r\n            }\r\n            else if (_strnicmp(arg, \"autonuma\", 8) == 0) {\r\n                numa = NUMA_AUTO;\r\n            }\r\n            else if (_strnicmp(arg, \"crossnode\", 9) == 0) {\r\n                numa = NUMA_CROSSNODE;\r\n                argIdx++;\r\n                coreNode = atoi(argv[argIdx]);\r\n                argIdx++;\r\n                memNode = atoi(argv[argIdx]);\r\n                fprintf(stderr, \"Testing %d -> %d\\n\", coreNode, memNode);\r\n            }\r\n            else if (_strnicmp(arg, \"singlesize\", 10) == 0) {\r\n                argIdx++;\r\n                singleSize = atoi(argv[argIdx]);\r\n                fprintf(stderr, \"Testing %d KB\\n\", singleSize);\r\n            }\r\n        }\r\n    }\r\n\r\n    if (!methodSet) 
{\r\n        // cpuid_data[0] = eax, [1] = ebx, [2] = ecx, [3] = edx\r\n        __cpuidex(cpuid_data, 1, 0);\r\n#ifdef _WIN64\r\n        // EDX bit 25 = SSE\r\n        if (cpuid_data[3] & (1UL << 25)) {\r\n            fprintf(stderr, \"SSE supported\\n\");\r\n            bw_func = sse_asm_read;\r\n        }\r\n\r\n        if (cpuid_data[2] & (1UL << 28)) {\r\n            fprintf(stderr, \"AVX supported\\n\");\r\n            bw_func = avx_asm_read;\r\n        }\r\n\r\n        __cpuidex(cpuid_data, 7, 0);\r\n        if (cpuid_data[1] & (1UL << 16)) {\r\n            fprintf(stderr, \"AVX512 supported\\n\");\r\n            bw_func = avx512_asm_read;\r\n        }\r\n#else\r\n        int choice = 0;\r\n        printf(\"Pick a method. Choose wisely:\\n\");\r\n        printf(\"1. SSE movaps xmm <- mem128\");\r\n        if (cpuid_data[3] & (1UL << 25)) printf(\" (looks supported)\\n\");\r\n        else printf(\" (looks unsupported)\\n\");\r\n\r\n        printf(\"2. MMX movq mm <- mem64\");\r\n        if (cpuid_data[3] & (1UL << 23)) printf(\"  (looks supported)\\n\");\r\n        else printf(\"  (looks unsupported\\n\");\r\n\r\n        printf(\"3. mov gpr <- mem32 (better work)\\n\");\r\n        printf(\"4. instruction side, 8B NOPs (0F 1F 84 00 00 00 00 00)\\n\");\r\n        printf(\"5. instruction side, 4B NOPs (0F 1F 40 00)\\n\");\r\n        printf(\"6. 
instruction side, 4B NOPs (66 66 66 90)\\n\");\r\n        printf(\"Your choice: \");\r\n        scanf_s(\"%d\", &choice);\r\n        if (choice == 1) bw_func = sse_asm_read32;\r\n        else if (choice == 2) bw_func = mmx_asm_read32;\r\n        else if (choice == 3) bw_func = scalar_asm_read32;\r\n        else if (choice == 4) instr = EightByte;\r\n        else if (choice == 5) instr = FourByte;\r\n        else if (choice == 6) instr = K8_FourByte;\r\n        else { printf(\"Bye\\n\"); return 0; }\r\n#endif\r\n    }\r\n\r\n    if (instr) {\r\n        bw_func = instr_read;\r\n    }\r\n\r\n    if (singleSize) {\r\n        float bw = MeasureBw(singleSize, GetIterationCount(singleSize, threads), threads, shared, instr);\r\n        printf(\"%d,%f\\n\", singleSize, bw);\r\n    }\r\n    else if (numa == NUMA_AUTO) {\r\n        ULONG highestNumaNode;\r\n        if (!GetNumaHighestNodeNumber(&highestNumaNode)) {\r\n            fprintf(stderr, \"Could not get highest NUMA node number: %d\\n\", GetLastError());\r\n            return 0;\r\n        }\r\n\r\n        for (int coreNode = 0; coreNode <= highestNumaNode; coreNode++) printf(\",%d\", coreNode);\r\n        printf(\"\\n\");\r\n\r\n        for (int coreNodeIdx = 0; coreNodeIdx <= highestNumaNode; coreNodeIdx++) {\r\n            printf(\"%d\", coreNodeIdx);\r\n            for (int memNodeIdx = 0; memNodeIdx <= highestNumaNode; memNodeIdx++) {\r\n                ULONGLONG mask;\r\n                DWORD index;\r\n                coreNode = coreNodeIdx;\r\n                memNode = memNodeIdx;\r\n                numa = NUMA_CROSSNODE; // hacky, oh well\r\n                float bw = MeasureBw(\r\n                    default_test_sizes[(sizeof(default_test_sizes) / sizeof(int)) - 1],\r\n                    GetIterationCount(default_test_sizes[(sizeof(default_test_sizes) / sizeof(int)) - 1], threads),\r\n                    threads,\r\n                    shared,\r\n                    instr);\r\n                
printf(\",%f\", bw);\r\n            }\r\n\r\n            printf(\"\\n\");\r\n        }\r\n    }\r\n    else {\r\n        printf(\"Using %d threads\\n\", threads);\r\n        for (int i = 0; i < sizeof(default_test_sizes) / sizeof(int); i++) {\r\n            float bw = MeasureBw(default_test_sizes[i], GetIterationCount(default_test_sizes[i], threads), threads, shared, instr);\r\n            if (bw > 0) printf(\"%d,%f\\n\", default_test_sizes[i], bw);\r\n        }\r\n    }\r\n\r\n    return 0;\r\n}\r\n\r\n/// <summary>\r\n/// Given test size in KB, return a good iteration count\r\n/// </summary>\r\n/// <param name=\"testSize\">test size in KB</param>\r\n/// <returns>Iterations per thread</returns>\r\nuint32_t GetIterationCount(uint32_t testSize, uint32_t threads)\r\n{\r\n    uint32_t gbToTransfer = dataGb;\r\n    if (testSize > 64) gbToTransfer = dataGb / 2;\r\n    if (testSize > 512) gbToTransfer = dataGb / 4;\r\n    if (testSize > 8192) gbToTransfer = dataGb / 8;\r\n    uint32_t iterations = gbToTransfer * 1024 * 1024 / testSize;\r\n    if (iterations % 2 != 0) iterations += 1;\r\n\r\n    if (iterations < 8) return 8; // set a minimum to reduce noise\r\n    else return iterations;\r\n}\r\n\r\nfloat MeasureBw(uint32_t sizeKb, uint32_t iterations, uint32_t threads, int shared, enum NopType instr) {\r\n    struct timeb start, end;\r\n    float bw = 0;\r\n    uint32_t elements = sizeKb * 1024 / sizeof(float);\r\n    uint32_t private_elements = ceil((double)sizeKb / (double)threads) * 256;\r\n    DWORD protection_flags = PAGE_EXECUTE_READWRITE;\r\n\r\n    //if (instr != None) protection_flags = PAGE_EXECUTE_READWRITE;\r\n    if (!shared) elements = private_elements;\r\n\r\n    //fprintf(stderr, \"%llu elements per thread\\n\", elements);\r\n\r\n    if (!shared && sizeKb < threads) {\r\n        //fprintf(stderr, \"Too many threads for this size, continuing\\n\");\r\n        return 0;\r\n    }\r\n\r\n    // make array and fill it with something\r\n    float* testArr = 
NULL;\r\n    if (shared) {\r\n        testArr = (float*)VirtualAlloc(NULL, elements * sizeof(float), MEM_COMMIT | MEM_RESERVE, protection_flags);\r\n        if (testArr == NULL) {\r\n            fprintf(stderr, \"Could not allocate memory\\n\");\r\n            return 0;\r\n        }\r\n\r\n        if (instr != None)\r\n        {\r\n            FillInstructionArray((uint64_t*)testArr, sizeKb, instr);\r\n        }\r\n        else {\r\n            for (uint32_t i = 0; i < elements; i++) {\r\n                testArr[i] = i + 0.5f;\r\n            }\r\n        }\r\n    }\r\n\r\n    HANDLE* testThreads = (HANDLE*)malloc(threads * sizeof(HANDLE));\r\n    DWORD* tids = (DWORD*)malloc(threads * sizeof(DWORD));\r\n    struct BandwidthTestThreadData* threadData = (struct BandwidthTestThreadData*)malloc(threads * sizeof(struct BandwidthTestThreadData));\r\n\r\n    for (uint64_t i = 0; i < threads; i++) {\r\n        char node;\r\n        if (shared) {\r\n            threadData[i].arr = testArr;\r\n            threadData[i].iterations = iterations;\r\n        }\r\n        else {\r\n            if (!numa) threadData[i].arr = (float*)VirtualAlloc(NULL, elements * sizeof(float), MEM_COMMIT | MEM_RESERVE, protection_flags);\r\n            else if (numa == NUMA_STRIPE) {\r\n                node = GetStripeNode(i);\r\n                threadData[i].arr = (float *)VirtualAllocExNuma(\r\n                    GetCurrentProcess(),\r\n                    NULL,\r\n                    elements * sizeof(float),\r\n                    MEM_RESERVE | MEM_COMMIT,\r\n                    protection_flags,\r\n                    node\r\n                );\r\n            }\r\n            else if (numa == NUMA_SEQ) {\r\n                node = GetSeqNode(i);\r\n                threadData[i].arr = (float*)VirtualAllocExNuma(\r\n                    GetCurrentProcess(),\r\n                    NULL,\r\n                    elements * sizeof(float),\r\n                    MEM_RESERVE | MEM_COMMIT,\r\n           
         protection_flags,\r\n                    node\r\n                );\r\n            }\r\n            else if (numa == NUMA_CROSSNODE) {\r\n                threadData[i].arr = (float*)VirtualAllocExNuma(\r\n                    GetCurrentProcess(),\r\n                    NULL,\r\n                    elements * sizeof(float),\r\n                    MEM_RESERVE | MEM_COMMIT,\r\n                    protection_flags,\r\n                    memNode\r\n                );\r\n\r\n                node = memNode;\r\n            }\r\n\r\n            if (threadData[i].arr == NULL) {\r\n                fprintf(stderr, \"Could not allocate memory for thread %llu\\n\", i);\r\n                return 0;\r\n            }\r\n\r\n            if (instr != None)\r\n            {\r\n                FillInstructionArray((uint64_t*)threadData[i].arr, (elements * 4) / 1024, instr);\r\n            }\r\n            else\r\n            {\r\n                for (uint64_t arr_idx = 0; arr_idx < elements; arr_idx++) {\r\n                    threadData[i].arr[arr_idx] = arr_idx + i + 0.5f;\r\n                }\r\n            }\r\n\r\n            threadData[i].iterations = iterations * threads;\r\n        }\r\n\r\n        threadData[i].arr_length = elements;\r\n        threadData[i].bw = 0;\r\n        testThreads[i] = CreateThread(NULL, 0, ReadBandwidthTestThread, threadData + i, CREATE_SUSPENDED, tids + i);\r\n\r\n        // turns out setting affinity makes no difference, and it's easier to set affinity via start /affinity <mask> anyway\r\n        //SetThreadAffinityMask(testThreads[i], 1UL << i);\r\n        if (numa == NUMA_STRIPE || numa == NUMA_SEQ) {\r\n            ULONGLONG mask;\r\n            //fprintf(stderr, \"Thread %d pinned to node %d\\n\", i, node);\r\n            GetNumaNodeProcessorMask(node, &mask);\r\n            SetThreadAffinityMask(testThreads[i], mask);\r\n        }\r\n        else if (numa == NUMA_CROSSNODE) {\r\n            ULONGLONG mask;\r\n            
GetNumaNodeProcessorMask(coreNode, &mask);\r\n            SetThreadAffinityMask(testThreads[i], mask);\r\n        }\r\n    }\r\n\r\n    ftime(&start);\r\n    for (uint32_t i = 0; i < threads; i++) ResumeThread(testThreads[i]);\r\n    WaitForMultipleObjects((DWORD)threads, testThreads, TRUE, INFINITE);\r\n    ftime(&end);\r\n\r\n    int64_t time_diff_ms = 1000 * (end.time - start.time) + (end.millitm - start.millitm);\r\n    double gbTransferred = (uint64_t)iterations * sizeof(float) * elements * threads / (double)1e9;\r\n    bw = 1000 * gbTransferred / (double)time_diff_ms;\r\n    if (!shared) bw = bw * threads;\r\n    //printf(\"%u iterations\\n\", iterations);\r\n    //printf(\"%f GB, %lu ms\\n\", gbTransferred, time_diff_ms);\r\n\r\n    free(testThreads);\r\n    if (shared) VirtualFree(testArr, elements * sizeof(float), MEM_RELEASE);\r\n    free(tids);\r\n\r\n    if (!shared) {\r\n        for (int i = 0; i < threads; i++) {\r\n            VirtualFreeEx(GetCurrentProcess(), threadData[i].arr, 0, MEM_RELEASE);\r\n        }\r\n    }\r\n\r\n    free(threadData);\r\n    return bw;\r\n}\r\n\r\nvoid FillInstructionArray(uint64_t* arr, uint64_t sizeKb, enum NopType nopSize)\r\n{\r\n    char nop8b[8] = { 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00 };\r\n\r\n    // zen/piledriver optimization manual uses this pattern\r\n    char nop4b[8] = { 0x0F, 0x1F, 0x40, 0x00, 0x0F, 0x1F, 0x40, 0x00 };\r\n\r\n    // athlon64 (K8) optimization manual pattern\r\n    char k8_nop4b[8] = { 0x66, 0x66, 0x66, 0x90, 0x66, 0x66, 0x66, 0x90 };\r\n\r\n    char lea[8] = { 0x48, 0x8D, 0x04, 0x4B, 0x66, 0x0F, 0xEF, 0xC0 };\r\n\r\n    char nop10b[10] = { 0x66, 0x66, 0xf, 0x1f, 0x84, 0, 0, 0, 0, 0 };\r\n\r\n    uint64_t elements = (sizeKb * 1024 / 8) - 1; // leave room for ret\r\n    unsigned char* functionEnd = (unsigned char*)(arr + elements);\r\n\r\n    if (nopSize != Branch16 && nopSize != TenByte) {\r\n        uint64_t* nopPtr;\r\n        if (nopSize == EightByte) nopPtr = 
(uint64_t*)(nop8b);\r\n        else if (nopSize == FourByte) nopPtr = (uint64_t*)(nop4b);\r\n        else if (nopSize == K8_FourByte) nopPtr = (uint64_t*)(k8_nop4b);\r\n        else if (nopSize == LEA) nopPtr = (uint64_t*)(lea);\r\n        else {\r\n            fprintf(stderr, \"%d (enum value) NOP size isn't supported :(\\n\", nopSize);\r\n            return;\r\n        }\r\n\r\n        for (uint64_t nopIdx = 0; nopIdx < elements; nopIdx++) {\r\n            arr[nopIdx] = *nopPtr;\r\n        }\r\n\r\n        functionEnd[0] = 0xC3;\r\n    }\r\n    else if (nopSize == TenByte) {\r\n        char* targetArr = (char*)arr;\r\n        uint64_t targetArrLenBytes = sizeKb * 1024 - 2; // leave room for ret\r\n        int targetArrIdx;\r\n        for (targetArrIdx = 0; targetArrIdx + 10 < targetArrLenBytes; targetArrIdx += 10)\r\n        {\r\n            memcpy(targetArr + targetArrIdx, nop10b, 10);\r\n        }\r\n\r\n        targetArr[targetArrIdx] = 0xC3;\r\n    }\r\n    else if (nopSize == Branch16) {\r\n        // jump forward 14 bytes\r\n        char branch16b[8] = { 0xEB, 0x0E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };\r\n        char ret8b[8] = { 0xC3, 0, 0, 0, 0, 0, 0, 0 };\r\n        uint64_t *branchPtr = (uint64_t*)(branch16b);\r\n        uint64_t* nopPtr = (uint64_t*)(nop8b); // doesn't really matter, we should never hit this\r\n\r\n        // last iteration must have nopIdx % 2 == 1, so the jump will go to the return statement\r\n        // i.e. branchElements for loop must be even, so the last iteration is odd\r\n        uint64_t branchElements = elements % 2 == 0 ? elements : elements - 1;\r\n        uint64_t nopIdx;\r\n        for (nopIdx = 0; nopIdx < branchElements; nopIdx++) {\r\n            arr[nopIdx] = nopIdx % 2 == 0 ? 
*branchPtr : *nopPtr;\r\n        }\r\n\r\n        arr[nopIdx] = *(uint64_t*)ret8b;\r\n    }\r\n}\r\n\r\n#ifdef _WIN64\r\nfloat __fastcall instr_read(void* arr, uint64_t arr_length, uint64_t iterations)\r\n#else\r\nfloat __fastcall instr_read(void* arr, uint32_t arr_length, uint32_t iterations)\r\n#endif\r\n{\r\n    void (*nopfunc)(uint64_t);\r\n    nopfunc = (void(*)(uint64_t))arr;\r\n    int iterIdx;\r\n    for (iterIdx = 0; iterIdx < iterations; iterIdx++) nopfunc(iterations);\r\n    return iterIdx;\r\n}\r\n\r\nfloat __fastcall scalar_read(void* a, uint32_t arr_length, uint32_t iterations)  {\r\n    float sum = 0;\r\n    if (16 >= arr_length) return 0;\r\n\r\n    uint32_t iter_idx = 0, i = 0;\r\n    float s1 = 0, s2 = 1, s3 = 0, s4 = 1, s5 = 0, s6 = 1, s7 = 0, s8 = 1;\r\n    float* arr = (float*)a;\r\n    while (iter_idx < iterations) {\r\n        s1 += arr[i];\r\n        s2 *= arr[i + 1];\r\n        s3 += arr[i + 2];\r\n        s4 *= arr[i + 3];\r\n        s5 += arr[i + 4];\r\n        s6 *= arr[i + 5];\r\n        s7 += arr[i + 6];\r\n        s8 *= arr[i + 7];\r\n        i += 8;\r\n        if (i + 7 >= arr_length) i = 0;\r\n        if (i == 0) iter_idx++;\r\n    }\r\n\r\n    sum += s1 + s2 + s3 + s4 + s5 + s6 + s7 + s8;\r\n\r\n    return sum;\r\n}\r\n\r\n// return sum of array\r\nfloat sse_read(float* arr, uint64_t arr_length, uint64_t iterations) {\r\n    float sum = 0;\r\n    float iterSum = 0;\r\n    // zero two sums\r\n    __m128 s1 = _mm_setzero_ps();\r\n    __m128 s2 = _mm_setzero_ps();\r\n    __m128 s3 = _mm_loadu_ps(arr);\r\n    __m128 s4 = _mm_loadu_ps(arr);\r\n    __m128 s5 = _mm_setzero_ps();\r\n    __m128 s6 = _mm_setzero_ps();\r\n    __m128 s7 = _mm_loadu_ps(arr);\r\n    __m128 s8 = _mm_loadu_ps(arr);\r\n    __m128 zero = _mm_setzero_ps();\r\n\r\n    uint64_t iter_idx = 0, i = 0;\r\n    while (iter_idx < iterations) {\r\n        __m128 e1 = _mm_loadu_ps(arr + i);\r\n        __m128 e2 = _mm_loadu_ps(arr + i + 4);\r\n        __m128 e3 = 
_mm_loadu_ps(arr + i + 8);\r\n        __m128 e4 = _mm_loadu_ps(arr + i + 12);\r\n        __m128 e5 = _mm_loadu_ps(arr + i + 16);\r\n        __m128 e6 = _mm_loadu_ps(arr + i + 20);\r\n        __m128 e7 = _mm_loadu_ps(arr + i + 24);\r\n        __m128 e8 = _mm_loadu_ps(arr + i + 28);\r\n        s1 = _mm_add_ps(s1, e1);\r\n        s2 = _mm_add_ps(s2, e2);\r\n        s3 = _mm_mul_ps(s3, e3);\r\n        s4 = _mm_mul_ps(s4, e4);\r\n        s5 = _mm_add_ps(s5, e5);\r\n        s6 = _mm_add_ps(s6, e6);\r\n        s7 = _mm_mul_ps(s7, e7);\r\n        s8 = _mm_mul_ps(s8, e8);\r\n        i += 32;\r\n        if (i + 31 >= arr_length) i = 0;\r\n        if (i == 0) iter_idx++;\r\n    }\r\n\r\n    iterSum = _mm_cvtss_f32(s1) + _mm_cvtss_f32(s2) + _mm_cvtss_f32(s3) + _mm_cvtss_f32(s4) +\r\n        _mm_cvtss_f32(s5) + _mm_cvtss_f32(s6) + _mm_cvtss_f32(s7) + _mm_cvtss_f32(s8);\r\n    sum = iterSum;\r\n    return sum;\r\n}\r\n\r\n#ifdef _WIN64\r\nfloat avx_read(float* arr, uint64_t arr_length, uint64_t iterations) {\r\n    float sum = 0;\r\n    float iterSum = 0;\r\n    __m256 s1 = _mm256_setzero_ps();\r\n    __m256 s2 = _mm256_loadu_ps(arr);\r\n    __m256 s3 = _mm256_setzero_ps();\r\n    __m256 s4 = _mm256_loadu_ps(arr);\r\n    __m256 s5 = _mm256_loadu_ps(arr);\r\n    __m256 s6 = _mm256_loadu_ps(arr);\r\n    __m256 s7 = _mm256_loadu_ps(arr);\r\n    __m256 s8 = _mm256_loadu_ps(arr);\r\n    uint64_t iter_idx = 0, i = 0;\r\n\r\n    while (iter_idx < iterations) {\r\n        __m256 e1 = _mm256_loadu_ps(arr + i);\r\n        __m256 e2 = _mm256_loadu_ps(arr + i + 8);\r\n        __m256 e3 = _mm256_loadu_ps(arr + i + 16);\r\n        __m256 e4 = _mm256_loadu_ps(arr + i + 24);\r\n        __m256 e5 = _mm256_loadu_ps(arr + i + 32);\r\n        __m256 e6 = _mm256_loadu_ps(arr + i + 40);\r\n        __m256 e7 = _mm256_loadu_ps(arr + i + 48);\r\n        __m256 e8 = _mm256_loadu_ps(arr + i + 56);\r\n        s1 = _mm256_add_ps(s1, e1);\r\n        s2 = _mm256_mul_ps(s2, e2);\r\n        s3 = 
_mm256_add_ps(s3, e3);\r\n        s4 = _mm256_mul_ps(s4, e4);\r\n        s5 = _mm256_mul_ps(s5, e5);\r\n        s6 = _mm256_mul_ps(s6, e6);\r\n        s7 = _mm256_mul_ps(s7, e7);\r\n        s8 = _mm256_mul_ps(s8, e8);\r\n        i += 64;\r\n        if (i + 63 >= arr_length) i = 0;\r\n        if (i == 0) iter_idx++;\r\n    }\r\n\r\n    // sink the result somehow\r\n    iterSum = _mm256_cvtss_f32(s1) + _mm256_cvtss_f32(s2) + _mm256_cvtss_f32(s3) + _mm256_cvtss_f32(s4) +\r\n        _mm256_cvtss_f32(s5) + _mm256_cvtss_f32(s6) + _mm256_cvtss_f32(s7) + _mm256_cvtss_f32(s8);\r\n    sum = iterSum;\r\n\r\n    return sum;\r\n}\r\n#endif\r\n\r\nDWORD WINAPI ReadBandwidthTestThread(LPVOID param) {\r\n    BandwidthTestThreadData* bwTestData = (BandwidthTestThreadData*)param;\r\n    float sum = bw_func(bwTestData->arr, bwTestData->arr_length, bwTestData->iterations);\r\n    if (sum == 0) printf(\"woohoo\\n\");\r\n    return 0;\r\n}\r\n\r\nvoid PrintNumaInfo() {\r\n    ULONG highestNumaNode;\r\n    DWORD nProcs;\r\n    SYSTEM_INFO SystemInfo;\r\n    GetSystemInfo(&SystemInfo);\r\n    nProcs = SystemInfo.dwNumberOfProcessors;\r\n    if (!GetNumaHighestNodeNumber(&highestNumaNode)) {\r\n        fprintf(stderr, \"Could not get highest NUMA node number: %d\\n\", GetLastError());\r\n        return;\r\n    }\r\n\r\n    printf(\"%d processors, highest NUMA node is %lu\\n\", nProcs, highestNumaNode);\r\n\r\n    if (highestNumaNode == 0)\r\n    {\r\n        return;\r\n    }\r\n\r\n    for (int procIdx = 0; procIdx < nProcs; procIdx++)\r\n    {\r\n        unsigned char node;\r\n        GetNumaProcessorNode(procIdx, &node);\r\n        printf(\"Processor %d is on node %d\\n\", procIdx, node);\r\n    }\r\n\r\n    for (char nodeIdx = 0; nodeIdx <= highestNumaNode; nodeIdx++) {\r\n        ULONGLONG mask;\r\n        GetNumaNodeProcessorMask(nodeIdx, &mask);\r\n        printf(\"Node %d: %llx\\n\", nodeIdx, mask);\r\n    }\r\n}\r\n\r\nchar GetStripeNode(uint64_t threadIdx) {\r\n    ULONG 
highestNumaNode;\r\n    if (!GetNumaHighestNodeNumber(&highestNumaNode)) {\r\n        fprintf(stderr, \"Could not get highest NUMA node number: %d\\n\", GetLastError());\r\n        return 0;\r\n    }\r\n\r\n    return threadIdx % highestNumaNode;\r\n}\r\n\r\nchar GetSeqNode(uint64_t threadIdx) {\r\n    SYSTEM_INFO SystemInfo;\r\n    GetSystemInfo(&SystemInfo);\r\n    unsigned int clippedThreadIdx = threadIdx % SystemInfo.dwNumberOfProcessors;\r\n    unsigned char node;\r\n    GetNumaProcessorNode(clippedThreadIdx, &node);\r\n    return node;\r\n}\r\n"
  },
  {
    "path": "MemoryBandwidth/MemoryBandwidth/MemoryBandwidth.sln",
    "content": "﻿\r\nMicrosoft Visual Studio Solution File, Format Version 12.00\r\n# Visual Studio Version 17\r\nVisualStudioVersion = 17.6.33815.320\r\nMinimumVisualStudioVersion = 10.0.40219.1\r\nProject(\"{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}\") = \"MemoryBandwidth\", \"MemoryBandwidth.vcxproj\", \"{E968D202-64A2-43A5-8BBD-D7D010D06564}\"\r\nEndProject\r\nProject(\"{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}\") = \"MixedMemoryBandwidthTest\", \"..\\MixedMemoryBandwidthTest\\MixedMemoryBandwidthTest.vcxproj\", \"{5AB9DDE0-C954-4D2F-AA46-BFA87EC585C4}\"\r\nEndProject\r\nGlobal\r\n\tGlobalSection(SolutionConfigurationPlatforms) = preSolution\r\n\t\tDebug|x64 = Debug|x64\r\n\t\tDebug|x86 = Debug|x86\r\n\t\tRelease|x64 = Release|x64\r\n\t\tRelease|x86 = Release|x86\r\n\tEndGlobalSection\r\n\tGlobalSection(ProjectConfigurationPlatforms) = postSolution\r\n\t\t{E968D202-64A2-43A5-8BBD-D7D010D06564}.Debug|x64.ActiveCfg = Debug|x64\r\n\t\t{E968D202-64A2-43A5-8BBD-D7D010D06564}.Debug|x64.Build.0 = Debug|x64\r\n\t\t{E968D202-64A2-43A5-8BBD-D7D010D06564}.Debug|x86.ActiveCfg = Debug|Win32\r\n\t\t{E968D202-64A2-43A5-8BBD-D7D010D06564}.Debug|x86.Build.0 = Debug|Win32\r\n\t\t{E968D202-64A2-43A5-8BBD-D7D010D06564}.Release|x64.ActiveCfg = Release|x64\r\n\t\t{E968D202-64A2-43A5-8BBD-D7D010D06564}.Release|x64.Build.0 = Release|x64\r\n\t\t{E968D202-64A2-43A5-8BBD-D7D010D06564}.Release|x86.ActiveCfg = Release|Win32\r\n\t\t{E968D202-64A2-43A5-8BBD-D7D010D06564}.Release|x86.Build.0 = Release|Win32\r\n\t\t{5AB9DDE0-C954-4D2F-AA46-BFA87EC585C4}.Debug|x64.ActiveCfg = Debug|x64\r\n\t\t{5AB9DDE0-C954-4D2F-AA46-BFA87EC585C4}.Debug|x64.Build.0 = Debug|x64\r\n\t\t{5AB9DDE0-C954-4D2F-AA46-BFA87EC585C4}.Debug|x86.ActiveCfg = Debug|Win32\r\n\t\t{5AB9DDE0-C954-4D2F-AA46-BFA87EC585C4}.Debug|x86.Build.0 = Debug|Win32\r\n\t\t{5AB9DDE0-C954-4D2F-AA46-BFA87EC585C4}.Release|x64.ActiveCfg = Release|x64\r\n\t\t{5AB9DDE0-C954-4D2F-AA46-BFA87EC585C4}.Release|x64.Build.0 = 
Release|x64\r\n\t\t{5AB9DDE0-C954-4D2F-AA46-BFA87EC585C4}.Release|x86.ActiveCfg = Release|Win32\r\n\t\t{5AB9DDE0-C954-4D2F-AA46-BFA87EC585C4}.Release|x86.Build.0 = Release|Win32\r\n\tEndGlobalSection\r\n\tGlobalSection(SolutionProperties) = preSolution\r\n\t\tHideSolutionNode = FALSE\r\n\tEndGlobalSection\r\n\tGlobalSection(ExtensibilityGlobals) = postSolution\r\n\t\tSolutionGuid = {2EA86D6F-CEE0-40A9-B4DD-AC59CCAD358F}\r\n\tEndGlobalSection\r\nEndGlobal\r\n"
  },
  {
    "path": "MemoryBandwidth/MemoryBandwidth/MemoryBandwidth.vcxproj",
    "content": "<?xml version=\"1.0\" encoding=\"utf-8\"?>\r\n<Project DefaultTargets=\"Build\" xmlns=\"http://schemas.microsoft.com/developer/msbuild/2003\">\r\n  <ItemGroup Label=\"ProjectConfigurations\">\r\n    <ProjectConfiguration Include=\"Debug|Win32\">\r\n      <Configuration>Debug</Configuration>\r\n      <Platform>Win32</Platform>\r\n    </ProjectConfiguration>\r\n    <ProjectConfiguration Include=\"Release|Win32\">\r\n      <Configuration>Release</Configuration>\r\n      <Platform>Win32</Platform>\r\n    </ProjectConfiguration>\r\n    <ProjectConfiguration Include=\"Debug|x64\">\r\n      <Configuration>Debug</Configuration>\r\n      <Platform>x64</Platform>\r\n    </ProjectConfiguration>\r\n    <ProjectConfiguration Include=\"Release|x64\">\r\n      <Configuration>Release</Configuration>\r\n      <Platform>x64</Platform>\r\n    </ProjectConfiguration>\r\n  </ItemGroup>\r\n  <PropertyGroup Label=\"Globals\">\r\n    <VCProjectVersion>16.0</VCProjectVersion>\r\n    <Keyword>Win32Proj</Keyword>\r\n    <ProjectGuid>{e968d202-64a2-43a5-8bbd-d7d010d06564}</ProjectGuid>\r\n    <RootNamespace>MemoryBandwidth</RootNamespace>\r\n    <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>\r\n  </PropertyGroup>\r\n  <Import Project=\"$(VCTargetsPath)\\Microsoft.Cpp.Default.props\" />\r\n  <PropertyGroup Condition=\"'$(Configuration)|$(Platform)'=='Debug|Win32'\" Label=\"Configuration\">\r\n    <ConfigurationType>Application</ConfigurationType>\r\n    <UseDebugLibraries>true</UseDebugLibraries>\r\n    <PlatformToolset>v143</PlatformToolset>\r\n    <CharacterSet>Unicode</CharacterSet>\r\n  </PropertyGroup>\r\n  <PropertyGroup Condition=\"'$(Configuration)|$(Platform)'=='Release|Win32'\" Label=\"Configuration\">\r\n    <ConfigurationType>Application</ConfigurationType>\r\n    <UseDebugLibraries>false</UseDebugLibraries>\r\n    <PlatformToolset>v143</PlatformToolset>\r\n    <WholeProgramOptimization>true</WholeProgramOptimization>\r\n    
<CharacterSet>Unicode</CharacterSet>\r\n  </PropertyGroup>\r\n  <PropertyGroup Condition=\"'$(Configuration)|$(Platform)'=='Debug|x64'\" Label=\"Configuration\">\r\n    <ConfigurationType>Application</ConfigurationType>\r\n    <UseDebugLibraries>true</UseDebugLibraries>\r\n    <PlatformToolset>v143</PlatformToolset>\r\n    <CharacterSet>Unicode</CharacterSet>\r\n  </PropertyGroup>\r\n  <PropertyGroup Condition=\"'$(Configuration)|$(Platform)'=='Release|x64'\" Label=\"Configuration\">\r\n    <ConfigurationType>Application</ConfigurationType>\r\n    <UseDebugLibraries>false</UseDebugLibraries>\r\n    <PlatformToolset>v143</PlatformToolset>\r\n    <WholeProgramOptimization>true</WholeProgramOptimization>\r\n    <CharacterSet>Unicode</CharacterSet>\r\n  </PropertyGroup>\r\n  <Import Project=\"$(VCTargetsPath)\\Microsoft.Cpp.props\" />\r\n  <ImportGroup Label=\"ExtensionSettings\">\r\n  </ImportGroup>\r\n  <ImportGroup Label=\"Shared\">\r\n  </ImportGroup>\r\n  <ImportGroup Label=\"PropertySheets\" Condition=\"'$(Configuration)|$(Platform)'=='Debug|Win32'\">\r\n    <Import Project=\"$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props\" Condition=\"exists('$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props')\" Label=\"LocalAppDataPlatform\" />\r\n  </ImportGroup>\r\n  <ImportGroup Label=\"PropertySheets\" Condition=\"'$(Configuration)|$(Platform)'=='Release|Win32'\">\r\n    <Import Project=\"$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props\" Condition=\"exists('$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props')\" Label=\"LocalAppDataPlatform\" />\r\n  </ImportGroup>\r\n  <ImportGroup Label=\"PropertySheets\" Condition=\"'$(Configuration)|$(Platform)'=='Debug|x64'\">\r\n    <Import Project=\"$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props\" Condition=\"exists('$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props')\" Label=\"LocalAppDataPlatform\" />\r\n  </ImportGroup>\r\n  <ImportGroup Label=\"PropertySheets\" 
Condition=\"'$(Configuration)|$(Platform)'=='Release|x64'\">\r\n    <Import Project=\"$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props\" Condition=\"exists('$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props')\" Label=\"LocalAppDataPlatform\" />\r\n  </ImportGroup>\r\n  <PropertyGroup Label=\"UserMacros\" />\r\n  <PropertyGroup Condition=\"'$(Configuration)|$(Platform)'=='Debug|Win32'\">\r\n    <LinkIncremental>true</LinkIncremental>\r\n  </PropertyGroup>\r\n  <PropertyGroup Condition=\"'$(Configuration)|$(Platform)'=='Release|Win32'\">\r\n    <LinkIncremental>false</LinkIncremental>\r\n  </PropertyGroup>\r\n  <PropertyGroup Condition=\"'$(Configuration)|$(Platform)'=='Debug|x64'\">\r\n    <LinkIncremental>true</LinkIncremental>\r\n  </PropertyGroup>\r\n  <PropertyGroup Condition=\"'$(Configuration)|$(Platform)'=='Release|x64'\">\r\n    <LinkIncremental>false</LinkIncremental>\r\n  </PropertyGroup>\r\n  <ItemDefinitionGroup Condition=\"'$(Configuration)|$(Platform)'=='Debug|Win32'\">\r\n    <ClCompile>\r\n      <WarningLevel>Level3</WarningLevel>\r\n      <SDLCheck>true</SDLCheck>\r\n      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>\r\n      <ConformanceMode>true</ConformanceMode>\r\n    </ClCompile>\r\n    <Link>\r\n      <SubSystem>Console</SubSystem>\r\n      <GenerateDebugInformation>true</GenerateDebugInformation>\r\n    </Link>\r\n  </ItemDefinitionGroup>\r\n  <ItemDefinitionGroup Condition=\"'$(Configuration)|$(Platform)'=='Release|Win32'\">\r\n    <ClCompile>\r\n      <WarningLevel>Level3</WarningLevel>\r\n      <FunctionLevelLinking>true</FunctionLevelLinking>\r\n      <IntrinsicFunctions>true</IntrinsicFunctions>\r\n      <SDLCheck>true</SDLCheck>\r\n      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>\r\n      <ConformanceMode>true</ConformanceMode>\r\n    </ClCompile>\r\n    <Link>\r\n      <SubSystem>Console</SubSystem>\r\n      
<EnableCOMDATFolding>true</EnableCOMDATFolding>\r\n      <OptimizeReferences>true</OptimizeReferences>\r\n      <GenerateDebugInformation>true</GenerateDebugInformation>\r\n    </Link>\r\n  </ItemDefinitionGroup>\r\n  <ItemDefinitionGroup Condition=\"'$(Configuration)|$(Platform)'=='Debug|x64'\">\r\n    <ClCompile>\r\n      <WarningLevel>Level3</WarningLevel>\r\n      <SDLCheck>true</SDLCheck>\r\n      <PreprocessorDefinitions>_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>\r\n      <ConformanceMode>true</ConformanceMode>\r\n    </ClCompile>\r\n    <Link>\r\n      <SubSystem>Console</SubSystem>\r\n      <GenerateDebugInformation>true</GenerateDebugInformation>\r\n    </Link>\r\n  </ItemDefinitionGroup>\r\n  <ItemDefinitionGroup Condition=\"'$(Configuration)|$(Platform)'=='Release|x64'\">\r\n    <ClCompile>\r\n      <WarningLevel>Level3</WarningLevel>\r\n      <FunctionLevelLinking>true</FunctionLevelLinking>\r\n      <IntrinsicFunctions>true</IntrinsicFunctions>\r\n      <SDLCheck>true</SDLCheck>\r\n      <PreprocessorDefinitions>NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>\r\n      <ConformanceMode>true</ConformanceMode>\r\n    </ClCompile>\r\n    <Link>\r\n      <SubSystem>Console</SubSystem>\r\n      <EnableCOMDATFolding>true</EnableCOMDATFolding>\r\n      <OptimizeReferences>true</OptimizeReferences>\r\n      <GenerateDebugInformation>true</GenerateDebugInformation>\r\n    </Link>\r\n  </ItemDefinitionGroup>\r\n  <ItemGroup>\r\n    <ClCompile Include=\"MemoryBandwidth.cpp\" />\r\n  </ItemGroup>\r\n  <ItemGroup>\r\n    <CustomBuild Include=\"MemoryBandwidthFunctions.asm\">\r\n      <FileType>Document</FileType>\r\n      <Command Condition=\"'$(Configuration)|$(Platform)'=='Release|x64'\">nasm -f win64 MemoryBandwidthFunctions.asm</Command>\r\n      <Message Condition=\"'$(Configuration)|$(Platform)'=='Release|x64'\">Running NASM</Message>\r\n      <Outputs 
Condition=\"'$(Configuration)|$(Platform)'=='Release|x64'\">MemoryBandwidthFunctions.obj</Outputs>\r\n      <Command Condition=\"'$(Configuration)|$(Platform)'=='Debug|x64'\">nasm -f win64 MemoryBandwidthFunctions.asm</Command>\r\n      <Message Condition=\"'$(Configuration)|$(Platform)'=='Debug|x64'\">Running NASM</Message>\r\n      <Outputs Condition=\"'$(Configuration)|$(Platform)'=='Debug|x64'\">MemoryBandwidthFunctions.obj</Outputs>\r\n    </CustomBuild>\r\n  </ItemGroup>\r\n  <ItemGroup>\r\n    <CustomBuild Include=\"MemoryBandwidthFunctions32.asm\">\r\n      <FileType>Document</FileType>\r\n      <Command Condition=\"'$(Configuration)|$(Platform)'=='Release|Win32'\">nasm -f win32 MemoryBandwidthFunctions32.asm</Command>\r\n      <Message Condition=\"'$(Configuration)|$(Platform)'=='Release|Win32'\">Running NASM, targeting 32-bit</Message>\r\n      <Outputs Condition=\"'$(Configuration)|$(Platform)'=='Release|Win32'\">MemoryBandwidthFunctions32.obj</Outputs>\r\n    </CustomBuild>\r\n  </ItemGroup>\r\n  <Import Project=\"$(VCTargetsPath)\\Microsoft.Cpp.targets\" />\r\n  <ImportGroup Label=\"ExtensionTargets\">\r\n  </ImportGroup>\r\n</Project>"
  },
  {
    "path": "MemoryBandwidth/MemoryBandwidth/MemoryBandwidth.vcxproj.filters",
    "content": "﻿<?xml version=\"1.0\" encoding=\"utf-8\"?>\r\n<Project ToolsVersion=\"4.0\" xmlns=\"http://schemas.microsoft.com/developer/msbuild/2003\">\r\n  <ItemGroup>\r\n    <Filter Include=\"Source Files\">\r\n      <UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>\r\n      <Extensions>cpp;c;cc;cxx;c++;cppm;ixx;def;odl;idl;hpj;bat;asm;asmx</Extensions>\r\n    </Filter>\r\n    <Filter Include=\"Header Files\">\r\n      <UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>\r\n      <Extensions>h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd</Extensions>\r\n    </Filter>\r\n    <Filter Include=\"Resource Files\">\r\n      <UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier>\r\n      <Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>\r\n    </Filter>\r\n  </ItemGroup>\r\n  <ItemGroup>\r\n    <ClCompile Include=\"MemoryBandwidth.cpp\">\r\n      <Filter>Source Files</Filter>\r\n    </ClCompile>\r\n  </ItemGroup>\r\n  <ItemGroup>\r\n    <CustomBuild Include=\"MemoryBandwidthFunctions.asm\">\r\n      <Filter>Source Files</Filter>\r\n    </CustomBuild>\r\n    <CustomBuild Include=\"MemoryBandwidthFunctions32.asm\">\r\n      <Filter>Source Files</Filter>\r\n    </CustomBuild>\r\n  </ItemGroup>\r\n</Project>"
  },
  {
    "path": "MemoryBandwidth/MemoryBandwidth/MemoryBandwidthFunctions.asm",
    "content": "section .text\r\n\r\nbits 64\r\n\r\nglobal sse_asm_read\r\nglobal sse_asm_copy\r\nglobal sse_asm_write\r\nglobal sse_asm_ntwrite\r\nglobal sse_asm_add\r\nglobal avx_asm_read\r\nglobal avx_asm_write\r\nglobal avx_asm_ntwrite\r\nglobal avx_asm_copy\r\nglobal avx_asm_cflip\r\nglobal avx_asm_add\r\nglobal avx512_asm_read\r\nglobal clzero_asm_write\r\n\r\nglobal repmovsb_copy\r\nglobal repstosb_write\r\n\r\n; rcx = float ptr to arr, rdx = fp32 elements in arr, r8 = iterations, r9 = start index\r\n; return something in xmm0\r\navx_asm_read:\r\n  push rsi\r\n  push rdi\r\n  push rbx\r\n  push r15\r\n  push r14\r\n  mov r15, 256 ; load in blocks of 256 bytes\r\n  sub rdx, 128 ; last iteration: rsi == rdx. rsi > rdx = break\r\n  xor r9, r9   ; not doing start anymore, too lazy to clean up code\r\n  mov rsi, r9  ; assume we're passed in an aligned start location O.o\r\n  xor rbx, rbx\r\n  lea rdi, [rcx + rsi * 4]\r\n  mov r14, rdi\r\navx_asm_read_pass_loop:\r\n  ; xmm0 to 5 are considered volatile\r\n  vmovaps ymm0, [rdi]\r\n  vmovaps ymm1, [rdi + 32]\r\n  vmovaps ymm2, [rdi + 64]\r\n  vmovaps ymm3, [rdi + 96]\r\n  vmovaps ymm0, [rdi + 128]\r\n  vmovaps ymm1, [rdi + 160]\r\n  vmovaps ymm2, [rdi + 192]\r\n  vmovaps ymm3, [rdi + 224]\r\n  add rsi, 64\r\n  add rdi, r15\r\n  vmovaps ymm0, [rdi]\r\n  vmovaps ymm1, [rdi + 32]\r\n  vmovaps ymm2, [rdi + 64]\r\n  vmovaps ymm3, [rdi + 96]\r\n  vmovaps ymm0, [rdi + 128]\r\n  vmovaps ymm1, [rdi + 160]\r\n  vmovaps ymm2, [rdi + 192]\r\n  vmovaps ymm3, [rdi + 224]\r\n  add rsi, 64\r\n  add rdi, r15\r\n  cmp rdx, rsi\r\n  jge asm_avx_test_iteration_count\r\n  mov rsi, rbx\r\n  lea rdi, [rcx + rsi * 4]  ; back to start\r\nasm_avx_test_iteration_count:\r\n  cmp r9, rsi\r\n  jnz avx_asm_read_pass_loop ; skip iteration decrement if we're not back to start\r\n  dec r8\r\n  jnz avx_asm_read_pass_loop\r\n  pop r14\r\n  pop r15\r\n  pop rbx\r\n  pop rdi\r\n  pop rsi\r\n  ret\r\n\r\navx_asm_write:\r\n  push rsi\r\n  push rdi\r\n  
push rbx\r\n  push r15\r\n  push r14\r\n  mov r15, 256 ; load in blocks of 256 bytes\r\n  sub rdx, 128 ; last iteration: rsi == rdx. rsi > rdx = break\r\n  xor r9, r9   ; not doing start anymore, too lazy to clean up code\r\n  mov rsi, r9  ; assume we're passed in an aligned start location O.o\r\n  xor rbx, rbx\r\n  lea rdi, [rcx + rsi * 4]\r\n  mov r14, rdi\r\n  vmovaps ymm0, [rcx]\r\navx_asm_write_pass_loop:\r\n  vmovaps [rdi], ymm0\r\n  vmovaps [rdi + 32], ymm0\r\n  vmovaps [rdi + 64], ymm0\r\n  vmovaps [rdi + 96], ymm0\r\n  vmovaps [rdi + 128], ymm0\r\n  vmovaps [rdi + 160], ymm0\r\n  vmovaps [rdi + 192], ymm0\r\n  vmovaps [rdi + 224], ymm0\r\n  add rsi, 64\r\n  add rdi, r15\r\n  vmovaps [rdi], ymm0\r\n  vmovaps [rdi + 32], ymm0\r\n  vmovaps [rdi + 64], ymm0\r\n  vmovaps [rdi + 96], ymm0\r\n  vmovaps [rdi + 128], ymm0\r\n  vmovaps [rdi + 160], ymm0\r\n  vmovaps [rdi + 192], ymm0\r\n  vmovaps [rdi + 224], ymm0\r\n  add rsi, 64\r\n  add rdi, r15\r\n  cmp rdx, rsi\r\n  jge asm_avx_write_iteration_count\r\n  mov rsi, rbx\r\n  lea rdi, [rcx + rsi * 4]  ; back to start\r\nasm_avx_write_iteration_count:\r\n  cmp r9, rsi\r\n  jnz avx_asm_write_pass_loop ; skip iteration decrement if we're not back to start\r\n  dec r8\r\n  jnz avx_asm_write_pass_loop\r\n  pop r14\r\n  pop r15\r\n  pop rbx\r\n  pop rdi\r\n  pop rsi\r\n  ret\r\n\r\navx_asm_ntwrite:\r\n  push rsi\r\n  push rdi\r\n  push rbx\r\n  push r15\r\n  push r14\r\n  mov r15, 256 ; load in blocks of 256 bytes\r\n  sub rdx, 128 ; last iteration: rsi == rdx. 
rsi > rdx = break\r\n  xor r9, r9   ; not doing start anymore, too lazy to clean up code\r\n  mov rsi, r9  ; assume we're passed in an aligned start location O.o\r\n  xor rbx, rbx\r\n  lea rdi, [rcx + rsi * 4]\r\n  mov r14, rdi\r\n  vmovaps ymm0, [rcx]\r\navx_asm_ntwrite_pass_loop:\r\n  vmovntps [rdi], ymm0\r\n  vmovntps [rdi + 32], ymm0\r\n  vmovntps [rdi + 64], ymm0\r\n  vmovntps [rdi + 96], ymm0\r\n  vmovntps [rdi + 128], ymm0\r\n  vmovntps [rdi + 160], ymm0\r\n  vmovntps [rdi + 192], ymm0\r\n  vmovntps [rdi + 224], ymm0\r\n  add rsi, 64\r\n  add rdi, r15\r\n  vmovntps [rdi], ymm0\r\n  vmovntps [rdi + 32], ymm0\r\n  vmovntps [rdi + 64], ymm0\r\n  vmovntps [rdi + 96], ymm0\r\n  vmovntps [rdi + 128], ymm0\r\n  vmovntps [rdi + 160], ymm0\r\n  vmovntps [rdi + 192], ymm0\r\n  vmovntps [rdi + 224], ymm0\r\n  add rsi, 64\r\n  add rdi, r15\r\n  cmp rdx, rsi\r\n  jge asm_avx_ntwrite_iteration_count\r\n  mov rsi, rbx\r\n  lea rdi, [rcx + rsi * 4]  ; back to start\r\nasm_avx_ntwrite_iteration_count:\r\n  cmp r9, rsi\r\n  jnz avx_asm_ntwrite_pass_loop ; skip iteration decrement if we're not back to start\r\n  dec r8\r\n  jnz avx_asm_ntwrite_pass_loop\r\n  pop r14\r\n  pop r15\r\n  pop rbx\r\n  pop rdi\r\n  pop rsi\r\n  ret\r\n\r\n; rcx = ptr to arr\r\n; rdx = arr_length\r\n; r8 = iterations\r\navx_asm_copy:\r\n  push rsi\r\n  push rdi\r\n  push rbx\r\n  push r15\r\n  push r14\r\n  push r13\r\n  xor rsi, rsi\r\n  mov r9, rdx\r\n  shr r9, 1    ; start destination at array + length / 2\r\n  mov r15, 256 ; load in blocks of 128 bytes\r\n  mov r13, r9\r\n  sub r13, 64\r\n  lea rdi, [rcx + rsi * 4]\r\n  lea r14, [rcx + r9 * 4]\r\navx_copy_pass_loop:\r\n  vmovaps ymm0, [rdi]\r\n  vmovaps ymm1, [rdi + 32]\r\n  vmovaps ymm2, [rdi + 64]\r\n  vmovaps ymm3, [rdi + 96]\r\n  vmovaps ymm4, [rdi + 128]\r\n  vmovaps ymm5, [rdi + 160]\r\n  vmovaps ymm6, [rdi + 192]\r\n  vmovaps ymm7, [rdi + 224]\r\n  vmovaps [r14], ymm0\r\n  vmovaps [r14 + 32], ymm1\r\n  vmovaps [r14 + 64], ymm2\r\n  vmovaps 
[r14 + 96], ymm3\r\n  vmovaps [r14 + 128], ymm4\r\n  vmovaps [r14 + 160], ymm5\r\n  vmovaps [r14 + 192], ymm6\r\n  vmovaps [r14 + 224], ymm7\r\n  add rsi, 64\r\n  add rdi, r15  ; increment src/dst pointers\r\n  add r14, r15\r\n  cmp r13, rsi  ; end location is at half\r\n  jge avx_copy_pass_loop\r\n  xor rsi, rsi\r\n  lea rdi, [rcx + rsi * 4] ; back to start\r\n  lea r14, [rcx + r9 * 4]\r\n  dec r8                  ; decrement iteration counter\r\n  jnz avx_copy_pass_loop\r\n  pop r13\r\n  pop r14\r\n  pop r15\r\n  pop rbx\r\n  pop rdi\r\n  pop rsi\r\n  ret\r\n\r\n\r\n; changes the ordering of vector sized elements within a cacheline\r\n avx_asm_cflip:\r\n  push rsi\r\n  push rdi\r\n  push rbx\r\n  push r15\r\n  push r14\r\n  mov r15, 256 ; load in blocks of 256 bytes\r\n  sub rdx, 128 ; last iteration: rsi == rdx. rsi > rdx = break. 128 elements per iteration\r\n  xor r9, r9   ; not doing start anymore, too lazy to clean up code\r\n  ; mov rsi, r9  ; assume we're passed in an aligned start location O.o\r\n  xor rsi, rsi\r\n  xor rbx, rbx\r\n  lea rdi, [rcx + rsi * 4]\r\n  mov r14, rdi\r\navx_asm_cflip_pass_loop:\r\n  vmovaps ymm0, [rdi]\r\n  vmovaps ymm1, [rdi + 32]\r\n  vmovaps ymm2, [rdi + 64]\r\n  vmovaps ymm3, [rdi + 96]\r\n  vmovaps [rdi + 96], ymm0\r\n  vmovaps [rdi + 64], ymm1\r\n  vmovaps [rdi + 32], ymm2\r\n  vmovaps [rdi], ymm3\r\n  vmovaps ymm0, [rdi + 128]\r\n  vmovaps ymm1, [rdi + 160]\r\n  vmovaps ymm2, [rdi + 192]\r\n  vmovaps ymm3, [rdi + 224]\r\n  vmovaps [rdi + 224], ymm0\r\n  vmovaps [rdi + 192], ymm1\r\n  vmovaps [rdi + 160], ymm2\r\n  vmovaps [rdi + 128], ymm3\r\n  add rsi, 64\r\n  add rdi, r15\r\n  vmovaps ymm0, [rdi]\r\n  vmovaps ymm1, [rdi + 32]\r\n  vmovaps ymm2, [rdi + 64]\r\n  vmovaps ymm3, [rdi + 96]\r\n  vmovaps [rdi + 96], ymm0\r\n  vmovaps [rdi + 64], ymm1\r\n  vmovaps [rdi + 32], ymm2\r\n  vmovaps [rdi], ymm3\r\n  vmovaps ymm0, [rdi + 128]\r\n  vmovaps ymm1, [rdi + 160]\r\n  vmovaps ymm2, [rdi + 192]\r\n  vmovaps ymm3, [rdi + 
224]\r\n  vmovaps [rdi + 224], ymm0\r\n  vmovaps [rdi + 192], ymm1\r\n  vmovaps [rdi + 160], ymm2\r\n  vmovaps [rdi + 128], ymm3\r\n  add rsi, 64\r\n  add rdi, r15\r\n  cmp rdx, rsi\r\n  jge asm_avx_cflip_iteration_count\r\n  mov rsi, rbx\r\n  lea rdi, [rcx + rsi * 4]  ; back to start\r\nasm_avx_cflip_iteration_count:\r\n  cmp r9, rsi\r\n  jnz avx_asm_cflip_pass_loop ; skip iteration decrement if we're not back to start\r\n  sub r8, 2\r\n  jnz avx_asm_cflip_pass_loop\r\n  pop r14\r\n  pop r15\r\n  pop rbx\r\n  pop rdi\r\n  pop rsi\r\n  ret\r\n\r\navx_asm_add:\r\n  push rsi\r\n  push rdi\r\n  push rbx\r\n  push r15\r\n  push r14\r\n  mov r15, 256 ; load in blocks of 256 bytes\r\n  sub rdx, 128 ; last iteration: rsi == rdx. rsi > rdx = break\r\n  xor r9, r9   ; not doing start anymore, too lazy to clean up code\r\n  ; mov rsi, r9  ; assume we're passed in an aligned start location O.o\r\n  xor rsi, rsi\r\n  xor rbx, rbx\r\n  lea rdi, [rcx + rsi * 4]\r\n  mov r14, rdi\r\n  vmovaps ymm4, [rdi]\r\navx_asm_add_pass_loop:\r\n  ; xmm0 to 5 are considered volatile\r\n  vaddps ymm0, ymm4, [rdi]\r\n  vaddps ymm1, ymm4, [rdi + 32]\r\n  vaddps ymm2, ymm4, [rdi + 64]\r\n  vaddps ymm3, ymm4, [rdi + 96]\r\n  vmovaps [rdi], ymm0\r\n  vmovaps [rdi + 32], ymm1\r\n  vmovaps [rdi + 64], ymm2\r\n  vmovaps [rdi + 96], ymm3\r\n  vaddps ymm0, ymm4, [rdi + 128]\r\n  vaddps ymm1, ymm4, [rdi + 160]\r\n  vaddps ymm2, ymm4, [rdi + 192]\r\n  vaddps ymm3, ymm4, [rdi + 224]\r\n  vmovaps [rdi + 128], ymm0\r\n  vmovaps [rdi + 160], ymm1\r\n  vmovaps [rdi + 192], ymm2\r\n  vmovaps [rdi + 224], ymm3\r\n\r\n  add rsi, 64\r\n  add rdi, r15\r\n  vaddps ymm0, ymm4, [rdi]\r\n  vaddps ymm1, ymm4, [rdi + 32]\r\n  vaddps ymm2, ymm4, [rdi + 64]\r\n  vaddps ymm3, ymm4, [rdi + 96]\r\n  vmovaps [rdi], ymm0\r\n  vmovaps [rdi + 32], ymm1\r\n  vmovaps [rdi + 64], ymm2\r\n  vmovaps [rdi + 96], ymm3\r\n  vaddps ymm0, ymm4, [rdi + 128]\r\n  vaddps ymm1, ymm4, [rdi + 160]\r\n  vaddps ymm2, ymm4, [rdi + 192]\r\n  vaddps 
ymm3, ymm4, [rdi + 224]\r\n  vmovaps [rdi + 128], ymm0\r\n  vmovaps [rdi + 160], ymm1\r\n  vmovaps [rdi + 192], ymm2\r\n  vmovaps [rdi + 224], ymm3\r\n  add rsi, 64\r\n  add rdi, r15\r\n  cmp rdx, rsi\r\n  jge asm_avx_add_iteration_count\r\n  mov rsi, rbx\r\n  lea rdi, [rcx + rsi * 4]  ; back to start\r\nasm_avx_add_iteration_count:\r\n  cmp r9, rsi\r\n  jnz avx_asm_add_pass_loop ; skip iteration decrement if we're not back to start\r\n  sub r8, 2\r\n  jnz avx_asm_add_pass_loop\r\n  pop r14\r\n  pop r15\r\n  pop rbx\r\n  pop rdi\r\n  pop rsi\r\n  ret\r\n\r\navx512_asm_read:\r\n  push rsi\r\n  push rdi\r\n  push rbx\r\n  push r15\r\n  push r14\r\n  mov r15, 256 ; load in blocks of 256 bytes\r\n  sub rdx, 128 ; last iteration: rsi == rdx. rsi > rdx = break\r\n  xor r9, r9   ; not doing start anymore, too lazy to clean up code\r\n  ; mov rsi, r9  ; assume we're passed in an aligned start location O.o\r\n  xor rsi, rsi\r\n  xor rbx, rbx\r\n  lea rdi, [rcx + rsi * 4]\r\n  mov r14, rdi\r\navx512_asm_read_pass_loop:\r\n  vmovaps zmm0, [rdi]\r\n  vmovaps zmm1, [rdi + 64]\r\n  vmovaps zmm2, [rdi + 128]\r\n  vmovaps zmm3, [rdi + 192]\r\n  add rsi, 64\r\n  add rdi, r15\r\n  vmovaps zmm0, [rdi]\r\n  vmovaps zmm1, [rdi + 64]\r\n  vmovaps zmm2, [rdi + 128]\r\n  vmovaps zmm3, [rdi + 192]\r\n  add rsi, 64\r\n  add rdi, r15\r\n  cmp rdx, rsi\r\n  jge asm_avx512_test_iteration_count\r\n  mov rsi, rbx\r\n  lea rdi, [rcx + rsi * 4]  ; back to start\r\nasm_avx512_test_iteration_count:\r\n  cmp r9, rsi\r\n  jnz avx512_asm_read_pass_loop ; skip iteration decrement if we're not back to start\r\n  dec r8\r\n  jnz avx512_asm_read_pass_loop\r\n  pop r14\r\n  pop r15\r\n  pop rbx\r\n  pop rdi\r\n  pop rsi\r\n  ret\r\n\r\nclzero_asm_write:\r\n  push rsi\r\n  push rdi\r\n  push rbx\r\n  push r15\r\n  push r14\r\n  mov r15, 256 ; load in blocks of 256 bytes\r\n  sub rdx, 128 ; last iteration: rsi == rdx. 
rsi > rdx = break\r\n  xor r9, r9   ; not doing start anymore, too lazy to clean up code\r\n  ; mov rsi, r9  ; assume we're passed in an aligned start location O.o\r\n  xor rsi, rsi\r\n  xor rbx, rbx\r\n  lea rdi, [rcx + rsi * 4]\r\n  mov r14, rdi\r\nclzero_asm_write_pass_loop:\r\n  mov rax, rdi\r\n  clzero\r\n  add rax, 64\r\n  clzero\r\n  add rax, 64\r\n  clzero\r\n  add rax, 64\r\n  clzero\r\n  add rsi, 64\r\n  add rdi, r15\r\n  mov rax, rdi\r\n  clzero\r\n  add rax, 64\r\n  clzero\r\n  add rax, 64\r\n  clzero\r\n  add rax, 64\r\n  clzero\r\n  add rsi, 64\r\n  add rdi, r15\r\n  cmp rdx, rsi\r\n  jge clzero_asm_write_iteration_count\r\n  mov rsi, rbx\r\n  lea rdi, [rcx + rsi * 4]  ; back to start\r\nclzero_asm_write_iteration_count:\r\n  cmp r9, rsi\r\n  jnz clzero_asm_write_pass_loop ; skip iteration decrement if we're not back to start\r\n  dec r8\r\n  sfence\r\n  jnz clzero_asm_write_pass_loop\r\n  mov rax, 1\r\n  pop r14\r\n  pop r15\r\n  pop rbx\r\n  pop rdi\r\n  pop rsi\r\n  ret\r\n\r\nsse_asm_read:\r\n  push rsi\r\n  push rdi\r\n  push rbx\r\n  push r15\r\n  push r14\r\n  mov r15, 256 ; load in blocks of 256 bytes\r\n  sub rdx, 128 ; last iteration: rsi == rdx. 
rsi > rdx = break\r\n  xor r9, r9\r\n  xor rsi, rsi\r\n  xor rbx, rbx\r\n  lea rdi, [rcx + rsi * 4]\r\n  mov r14, rdi\r\nsse_read_pass_loop:\r\n  ; xmm0 to 5 are considered volatile\r\n  movaps xmm0, [rdi]\r\n  movaps xmm1, [rdi + 16]\r\n  movaps xmm2, [rdi + 32]\r\n  movaps xmm3, [rdi + 48]\r\n  movaps xmm0, [rdi + 64]\r\n  movaps xmm1, [rdi + 80]\r\n  movaps xmm2, [rdi + 96]\r\n  movaps xmm3, [rdi + 112]\r\n  movaps xmm0, [rdi + 128]\r\n  movaps xmm1, [rdi + 144]\r\n  movaps xmm2, [rdi + 160]\r\n  movaps xmm3, [rdi + 176]\r\n  movaps xmm0, [rdi + 192]\r\n  movaps xmm2, [rdi + 208]\r\n  movaps xmm2, [rdi + 224]\r\n  movaps xmm2, [rdi + 240]\r\n  add rsi, 64\r\n  add rdi, r15\r\n  movaps xmm0, [rdi]\r\n  movaps xmm1, [rdi + 16]\r\n  movaps xmm2, [rdi + 32]\r\n  movaps xmm3, [rdi + 48]\r\n  movaps xmm0, [rdi + 64]\r\n  movaps xmm1, [rdi + 80]\r\n  movaps xmm2, [rdi + 96]\r\n  movaps xmm3, [rdi + 112]\r\n  movaps xmm0, [rdi + 128]\r\n  movaps xmm1, [rdi + 144]\r\n  movaps xmm2, [rdi + 160]\r\n  movaps xmm3, [rdi + 176]\r\n  movaps xmm0, [rdi + 192]\r\n  movaps xmm2, [rdi + 208]\r\n  movaps xmm2, [rdi + 224]\r\n  movaps xmm2, [rdi + 240]\r\n  add rsi, 64\r\n  add rdi, r15\r\n  cmp rdx, rsi\r\n  jge sse_test_iteration_count\r\n  mov rsi, rbx\r\n  lea rdi, [rcx + rsi * 4]  ; back to start\r\nsse_test_iteration_count:\r\n  cmp r9, rsi\r\n  jnz sse_read_pass_loop ; skip iteration decrement if we're not back to start\r\n  dec r8\r\n  jnz sse_read_pass_loop\r\n  pop r14\r\n  pop r15\r\n  pop rbx\r\n  pop rdi\r\n  pop rsi\r\n  ret\r\n\r\n; rcx = float ptr to arr, rdx = fp32 elements in arr, r8 = iterations\r\nsse_asm_write:\r\n  push rsi\r\n  push rdi\r\n  push rbx\r\n  push r15\r\n  push r14\r\n  mov r15, 256 ; load in blocks of 256 bytes\r\n  sub rdx, 128 ; last iteration: rsi == rdx. 
rsi > rdx = break\r\n  xor r9, r9\r\n  xor rsi, rsi\r\n  xor rbx, rbx\r\n  lea rdi, [rcx + rsi * 4]\r\n  mov r14, rdi\r\n  movaps xmm0, [rdi]\r\nsse_write_pass_loop:\r\n  movaps [rdi], xmm0\r\n  movaps [rdi + 16], xmm0\r\n  movaps [rdi + 32], xmm0\r\n  movaps [rdi + 48], xmm0\r\n  movaps [rdi + 64], xmm0\r\n  movaps [rdi + 80], xmm0\r\n  movaps [rdi + 96], xmm0\r\n  movaps [rdi + 112], xmm0\r\n  movaps [rdi + 128], xmm0\r\n  movaps [rdi + 144], xmm0\r\n  movaps [rdi + 160], xmm0\r\n  movaps [rdi + 176], xmm0\r\n  movaps [rdi + 192], xmm0\r\n  movaps [rdi + 208], xmm0\r\n  movaps [rdi + 224], xmm0\r\n  movaps [rdi + 240], xmm0\r\n  add rsi, 64\r\n  add rdi, r15\r\n  movaps [rdi], xmm0\r\n  movaps [rdi + 16], xmm0\r\n  movaps [rdi + 32], xmm0\r\n  movaps [rdi + 48], xmm0\r\n  movaps [rdi + 64], xmm0\r\n  movaps [rdi + 80], xmm0\r\n  movaps [rdi + 96], xmm0\r\n  movaps [rdi + 112], xmm0\r\n  movaps [rdi + 128], xmm0\r\n  movaps [rdi + 144], xmm0\r\n  movaps [rdi + 160], xmm0\r\n  movaps [rdi + 176], xmm0\r\n  movaps [rdi + 192], xmm0\r\n  movaps [rdi + 208], xmm0\r\n  movaps [rdi + 224], xmm0\r\n  movaps [rdi + 240], xmm0\r\n  add rsi, 64\r\n  add rdi, r15\r\n  cmp rdx, rsi\r\n  jge sse_write_iteration_count\r\n  mov rsi, rbx\r\n  lea rdi, [rcx + rsi * 4]  ; back to start\r\nsse_write_iteration_count:\r\n  cmp r9, rsi\r\n  jnz sse_write_pass_loop ; skip iteration decrement if we're not back to start\r\n  dec r8\r\n  jg sse_write_pass_loop\r\n  pop r14\r\n  pop r15\r\n  pop rbx\r\n  pop rdi\r\n  pop rsi\r\n  ret\r\n\r\nsse_asm_ntwrite:\r\n  push rsi\r\n  push rdi\r\n  push rbx\r\n  push r15\r\n  push r14\r\n  mov r15, 256 ; load in blocks of 256 bytes\r\n  sub rdx, 128 ; last iteration: rsi == rdx. 
rsi > rdx = break\r\n  xor r9, r9\r\n  xor rsi, rsi\r\n  xor rbx, rbx\r\n  lea rdi, [rcx + rsi * 4]\r\n  mov r14, rdi\r\n  movaps xmm0, [rdi]\r\nsse_ntwrite_pass_loop:\r\n  movntps [rdi], xmm0\r\n  movntps [rdi + 16], xmm0\r\n  movntps [rdi + 32], xmm0\r\n  movntps [rdi + 48], xmm0\r\n  movntps [rdi + 64], xmm0\r\n  movntps [rdi + 80], xmm0\r\n  movntps [rdi + 96], xmm0\r\n  movntps [rdi + 112], xmm0\r\n  movntps [rdi + 128], xmm0\r\n  movntps [rdi + 144], xmm0\r\n  movntps [rdi + 160], xmm0\r\n  movntps [rdi + 176], xmm0\r\n  movntps [rdi + 192], xmm0\r\n  movntps [rdi + 208], xmm0\r\n  movntps [rdi + 224], xmm0\r\n  movntps [rdi + 240], xmm0\r\n  add rsi, 64\r\n  add rdi, r15\r\n  movntps [rdi], xmm0\r\n  movntps [rdi + 16], xmm0\r\n  movntps [rdi + 32], xmm0\r\n  movntps [rdi + 48], xmm0\r\n  movntps [rdi + 64], xmm0\r\n  movntps [rdi + 80], xmm0\r\n  movntps [rdi + 96], xmm0\r\n  movntps [rdi + 112], xmm0\r\n  movntps [rdi + 128], xmm0\r\n  movntps [rdi + 144], xmm0\r\n  movntps [rdi + 160], xmm0\r\n  movntps [rdi + 176], xmm0\r\n  movntps [rdi + 192], xmm0\r\n  movntps [rdi + 208], xmm0\r\n  movntps [rdi + 224], xmm0\r\n  movntps [rdi + 240], xmm0\r\n  add rsi, 64\r\n  add rdi, r15\r\n  cmp rdx, rsi\r\n  jge sse_ntwrite_iteration_count\r\n  mov rsi, rbx\r\n  lea rdi, [rcx + rsi * 4]  ; back to start\r\nsse_ntwrite_iteration_count:\r\n  cmp r9, rsi\r\n  jnz sse_ntwrite_pass_loop ; skip iteration decrement if we're not back to start\r\n  dec r8\r\n  jg sse_ntwrite_pass_loop\r\n  pop r14\r\n  pop r15\r\n  pop rbx\r\n  pop rdi\r\n  pop rsi\r\n  ret\r\n\r\n\r\n; rcx = ptr to arr\r\n; rdx = arr_length\r\n; r8 = iterations\r\nsse_asm_copy:\r\n  push rsi\r\n  push rdi\r\n  push rbx\r\n  push r15\r\n  push r14\r\n  push r13\r\n  xor rsi, rsi\r\n  mov r9, rdx\r\n  shr r9, 1    ; start destination at array + length / 2\r\n  mov r15, 256 ; load in blocks of 128 bytes\r\n  mov r13, r9\r\n  sub r13, 64\r\n  lea rdi, [rcx + rsi * 4]\r\n  lea r14, [rcx + r9 * 
4]\r\nsse_copy_pass_loop:\r\n  movaps xmm0, [rdi]\r\n  movaps xmm1, [rdi + 16]\r\n  movaps xmm2, [rdi + 32]\r\n  movaps xmm3, [rdi + 48]\r\n  movaps xmm4, [rdi + 64]\r\n  movaps xmm5, [rdi + 80]\r\n  movaps xmm6, [rdi + 96]\r\n  movaps xmm7, [rdi + 112]\r\n  movaps [r14], xmm0\r\n  movaps [r14 + 16], xmm1\r\n  movaps [r14 + 32], xmm2\r\n  movaps [r14 + 48], xmm3\r\n  movaps [r14 + 64], xmm4\r\n  movaps [r14 + 80], xmm5\r\n  movaps [r14 + 96], xmm6\r\n  movaps [r14 + 112], xmm7\r\n\r\n  movaps xmm0, [rdi + 128]\r\n  movaps xmm1, [rdi + 144]\r\n  movaps xmm2, [rdi + 160]\r\n  movaps xmm3, [rdi + 176]\r\n  movaps xmm4, [rdi + 192]\r\n  movaps xmm5, [rdi + 208]\r\n  movaps xmm6, [rdi + 224]\r\n  movaps xmm7, [rdi + 240]\r\n  movaps [r14 + 128], xmm0\r\n  movaps [r14 + 144], xmm1\r\n  movaps [r14 + 160], xmm2\r\n  movaps [r14 + 176], xmm3\r\n  movaps [r14 + 192], xmm4\r\n  movaps [r14 + 208], xmm5\r\n  movaps [r14 + 224], xmm6\r\n  movaps [r14 + 240], xmm7\r\n\r\n  add rsi, 64\r\n  add rdi, r15  ; increment src/dst pointers\r\n  add r14, r15\r\n  cmp r13, rsi  ; end location is at half\r\n  jge sse_copy_pass_loop\r\n  xor rsi, rsi\r\n  lea rdi, [rcx + rsi * 4] ; back to start\r\n  lea r14, [rcx + r9 * 4]\r\n  dec r8                  ; decrement iteration counter\r\n  jnz sse_copy_pass_loop\r\n  pop r13\r\n  pop r14\r\n  pop r15\r\n  pop rbx\r\n  pop rdi\r\n  pop rsi\r\n  ret\r\n\r\nsse_asm_add:\r\n  push rsi\r\n  push rdi\r\n  push rbx\r\n  push r15\r\n  push r14\r\n  mov r15, 256 ; load in blocks of 256 bytes\r\n  sub rdx, 128 ; last iteration: rsi == rdx. 
rsi > rdx = break\r\n  xor r9, r9\r\n  xor rsi, rsi\r\n  xor rbx, rbx\r\n  lea rdi, [rcx + rsi * 4]\r\n  mov r14, rdi\r\n  movaps xmm5, [rdi]\r\nsse_add_pass_loop:\r\n  movaps xmm0, xmm5\r\n  movaps xmm1, xmm5\r\n  movaps xmm2, xmm5\r\n  movaps xmm3, xmm5\r\n  addps xmm0, [rdi]\r\n  addps xmm1, [rdi + 16]\r\n  addps xmm2, [rdi + 32]\r\n  addps xmm3, [rdi + 48]\r\n  movaps [rdi], xmm0\r\n  movaps [rdi + 16], xmm1\r\n  movaps [rdi + 32], xmm2\r\n  movaps [rdi + 48], xmm3\r\n\r\n  movaps xmm0, xmm5\r\n  movaps xmm1, xmm5\r\n  movaps xmm2, xmm5\r\n  movaps xmm3, xmm5\r\n  addps xmm0, [rdi + 64]\r\n  addps xmm1, [rdi + 80]\r\n  addps xmm2, [rdi + 96]\r\n  addps xmm3, [rdi + 112]\r\n  movaps [rdi + 64], xmm0\r\n  movaps [rdi + 80], xmm1\r\n  movaps [rdi + 96], xmm2\r\n  movaps [rdi + 112], xmm3\r\n\r\n  movaps xmm0, xmm5\r\n  movaps xmm1, xmm5\r\n  movaps xmm2, xmm5\r\n  movaps xmm3, xmm5\r\n  addps xmm0, [rdi + 128]\r\n  addps xmm1, [rdi + 144]\r\n  addps xmm2, [rdi + 160]\r\n  addps xmm3, [rdi + 176]\r\n  movaps [rdi + 128], xmm0\r\n  movaps [rdi + 144], xmm1\r\n  movaps [rdi + 160], xmm2\r\n  movaps [rdi + 176], xmm3\r\n\r\n  movaps xmm0, xmm5\r\n  movaps xmm1, xmm5\r\n  movaps xmm2, xmm5\r\n  movaps xmm3, xmm5\r\n  addps xmm0, [rdi + 192]\r\n  addps xmm1, [rdi + 208]\r\n  addps xmm2, [rdi + 224]\r\n  addps xmm3, [rdi + 240]\r\n  movaps [rdi + 192], xmm0\r\n  movaps [rdi + 208], xmm1\r\n  movaps [rdi + 224], xmm2\r\n  movaps [rdi + 240], xmm3\r\n\r\n  add rsi, 64\r\n  add rdi, r15\r\n  movaps xmm0, xmm5\r\n  movaps xmm1, xmm5\r\n  movaps xmm2, xmm5\r\n  movaps xmm3, xmm5\r\n  addps xmm0, [rdi]\r\n  addps xmm1, [rdi + 16]\r\n  addps xmm2, [rdi + 32]\r\n  addps xmm3, [rdi + 48]\r\n  movaps [rdi], xmm0\r\n  movaps [rdi + 16], xmm1\r\n  movaps [rdi + 32], xmm2\r\n  movaps [rdi + 48], xmm3\r\n\r\n  movaps xmm0, xmm5\r\n  movaps xmm1, xmm5\r\n  movaps xmm2, xmm5\r\n  movaps xmm3, xmm5\r\n  addps xmm0, [rdi + 64]\r\n  addps xmm1, [rdi + 80]\r\n  addps xmm2, [rdi + 96]\r\n  
addps xmm3, [rdi + 112]\r\n  movaps [rdi + 64], xmm0\r\n  movaps [rdi + 80], xmm1\r\n  movaps [rdi + 96], xmm2\r\n  movaps [rdi + 112], xmm3\r\n\r\n  movaps xmm0, xmm5\r\n  movaps xmm1, xmm5\r\n  movaps xmm2, xmm5\r\n  movaps xmm3, xmm5\r\n  addps xmm0, [rdi + 128]\r\n  addps xmm1, [rdi + 144]\r\n  addps xmm2, [rdi + 160]\r\n  addps xmm3, [rdi + 176]\r\n  movaps [rdi + 128], xmm0\r\n  movaps [rdi + 144], xmm1\r\n  movaps [rdi + 160], xmm2\r\n  movaps [rdi + 176], xmm3\r\n\r\n  movaps xmm0, xmm5\r\n  movaps xmm1, xmm5\r\n  movaps xmm2, xmm5\r\n  movaps xmm3, xmm5\r\n  addps xmm0, [rdi + 192]\r\n  addps xmm1, [rdi + 208]\r\n  addps xmm2, [rdi + 224]\r\n  addps xmm3, [rdi + 240]\r\n  movaps [rdi + 192], xmm0\r\n  movaps [rdi + 208], xmm1\r\n  movaps [rdi + 224], xmm2\r\n  movaps [rdi + 240], xmm3\r\n  add rsi, 64\r\n  add rdi, r15\r\n  cmp rdx, rsi\r\n  jge sse_add_iteration_count\r\n  mov rsi, rbx\r\n  lea rdi, [rcx + rsi * 4]  ; back to start\r\nsse_add_iteration_count:\r\n  cmp r9, rsi\r\n  jnz sse_add_pass_loop ; skip iteration decrement if we're not back to start\r\n  sub r8, 2\r\n  jg sse_add_pass_loop\r\n  pop r14\r\n  pop r15\r\n  pop rbx\r\n  pop rdi\r\n  pop rsi\r\n  ret\r\n\r\n\r\n; rcx = float ptr to arr, rdx = fp32 elements in arr, r8 = iterations\r\nrepmovsb_copy:\r\n  push r15\r\n  push r14\r\n  push r13\r\n  push r12\r\n  push rsi\r\n  push rdi\r\n  push rax\r\n  cld\r\n  ; source = rsi, destination = rdi, count (in bytes) = rcx\r\n  mov rsi, rcx  ; set source\r\n  shr rdx, 1    ; set destination = source + (size / 2)\r\n  mov rdi, rcx\r\n  add rdi, rdx\r\n  mov rcx, rdx  ; set count = (size / 2) * (4 bytes per fp32 element)\r\n  shl rcx, 2\r\n  mov r12, rsi\r\n  mov r13, rdi\r\n  mov r14, rcx\r\nrepmovsb_copy_pass_loop:\r\n  mov rsi, r12\r\n  mov rdi, r13\r\n  mov rcx, r14\r\n  rep movsb\r\n  dec r8\r\n  jnz repmovsb_copy_pass_loop\r\n  movss xmm0, [r12]\r\n  pop rax\r\n  pop rdi\r\n  pop rsi\r\n  pop r12\r\n  pop r13\r\n  pop r14\r\n  pop r15\r\n  
ret\r\n\r\n; rcx = float ptr to arr, rdx = fp32 elements in arr, r8 = iterations\r\nrepstosb_write:\r\n  push r15\r\n  push r14\r\n  push r13\r\n  push r12\r\n  push rsi\r\n  push rdi\r\n  push rax\r\n  cld\r\n  ; source = value in al, destination = rdi, count (in bytes) = rcx\r\n  mov al, 1  ; set source\r\n  mov r13, rcx  ; destination = start of arr\r\n  mov r14, rdx  \r\n  shl r14, 2    ; count = (nr of FP32 elements) * 4\r\nrepstosb_write_pass_loop:\r\n  mov rdi, r13\r\n  mov rcx, r14\r\n  rep stosb\r\n  dec r8\r\n  jnz repstosb_write_pass_loop\r\n  movss xmm0, [r13]\r\n  pop rax\r\n  pop rdi\r\n  pop rsi\r\n  pop r12\r\n  pop r13\r\n  pop r14\r\n  pop r15\r\n  ret"
  },
  {
    "path": "MemoryBandwidth/MemoryBandwidth/MemoryBandwidthFunctions32.asm",
    "content": "section .text\n\nbits 32\n\n; 32-bit x86 memory bandwidth read kernels.\n;\n; Every routine here is fastcall with 12 bytes of arguments (the @12 suffix):\n;   ecx       = pointer to the array\n;   edx       = array length, counted in 4-byte elements\n;   [esp + 4] = iterations (number of passes over the array)\n; fastcall is callee-cleanup, so each routine releases the single stack\n; argument with ret 4. Inside the loops esi is the element index and edi is\n; the address currently being read.\n\nglobal @sse_asm_read32@12\nglobal sse_asm_read32\nglobal @mmx_asm_read32@12\nglobal mmx_asm_read32\nglobal @scalar_asm_read32@12\nglobal scalar_asm_read32\nglobal @dummy@12\n\n; Touches no memory: pops the stack argument and returns.\n; NOTE(review): pushes no x87 return value - confirm the C prototype does not expect a float result.\n@dummy@12:\n  ret 4\n\n; Reads the array with 16-byte SSE loads, 128 elements (512 bytes) per loop trip.\n; ecx = ptr to float array\n; edx = arr length\n; [esp + 4] = iterations, put this into eax\n; NOTE(review): unlike the MMX and scalar kernels below, this one pushes no x87\n; return value - confirm the C prototype does not expect a float result.\nsse_asm_read32:\n@sse_asm_read32@12:\n  mov eax, [esp + 4]\n  push ecx\n  push edx\n  push esi\n  push edi\n  sub edx, 128 ; last trip: esi == edx. esi > edx = pass finished\n  xor esi, esi ; index into array = 0\n  lea edi, [ecx + esi * 4]\nsse_read32_pass_loop:\n  movaps xmm0, [edi]\n  movaps xmm1, [edi + 16]\n  movaps xmm2, [edi + 32]\n  movaps xmm3, [edi + 48]\n  movaps xmm0, [edi + 64]\n  movaps xmm1, [edi + 80]\n  movaps xmm2, [edi + 96]\n  movaps xmm3, [edi + 112]\n  movaps xmm0, [edi + 128]\n  movaps xmm1, [edi + 144]\n  movaps xmm2, [edi + 160]\n  movaps xmm3, [edi + 176]\n  movaps xmm0, [edi + 192]\n  movaps xmm2, [edi + 208]\n  movaps xmm2, [edi + 224]\n  movaps xmm2, [edi + 240]\n  add esi, 64\n  add edi, 256\n  movaps xmm0, [edi]\n  movaps xmm1, [edi + 16]\n  movaps xmm2, [edi + 32]\n  movaps xmm3, [edi + 48]\n  movaps xmm0, [edi + 64]\n  movaps xmm1, [edi + 80]\n  movaps xmm2, [edi + 96]\n  movaps xmm3, [edi + 112]\n  movaps xmm0, [edi + 128]\n  movaps xmm1, [edi + 144]\n  movaps xmm2, [edi + 160]\n  movaps xmm3, [edi + 176]\n  movaps xmm0, [edi + 192]\n  movaps xmm2, [edi + 208]\n  movaps xmm2, [edi + 224]\n  movaps xmm2, [edi + 240]\n  add esi, 64\n  add edi, 256\n  cmp edx, esi              ; bounds check, expects size to be a multiple of 128 elements\n  jge sse_read32_pass_loop\n\n  ; zero the index, get back to start, decrement iteration count\n  xor esi, esi\n  lea edi, [ecx + esi * 4]\n  dec eax\n  jnz sse_read32_pass_loop\n  pop edi\n  pop esi\n  pop edx\n  pop ecx\n\n  ; fastcall: the callee removes the 4-byte iterations argument from the stack\n  ret 4\n\n; Reads the array with 8-byte MMX loads, 64 elements (256 bytes) per loop trip.\n; Same arguments as sse_asm_read32; leaves 1.0 in st0 as the return value.\nmmx_asm_read32:\n@mmx_asm_read32@12:\n  mov eax, [esp + 4]\n  push ecx\n  push edx\n  push esi\n  push edi\n  sub edx, 64 ; last trip: esi == edx. esi > edx = pass finished\n  xor esi, esi ; index into array = 0\n  lea edi, [ecx + esi * 4]\nmmx_read32_pass_loop:\n  movq mm0, [edi]\n  movq mm1, [edi + 8]\n  movq mm2, [edi + 16]\n  movq mm3, [edi + 24]\n  movq mm4, [edi + 32]\n  movq mm5, [edi + 40]\n  movq mm6, [edi + 48]\n  movq mm7, [edi + 56]\n\n  movq mm0, [edi + 64]\n  movq mm1, [edi + 72]\n  movq mm2, [edi + 80]\n  movq mm3, [edi + 88]\n  movq mm4, [edi + 96]\n  movq mm5, [edi + 104]\n  movq mm6, [edi + 112]\n  movq mm7, [edi + 120]\n\n  movq mm0, [edi + 128]\n  movq mm1, [edi + 136]\n  movq mm2, [edi + 144]\n  movq mm3, [edi + 152]\n  movq mm4, [edi + 160]\n  movq mm5, [edi + 168]\n  movq mm6, [edi + 176]\n  movq mm7, [edi + 184]\n\n  movq mm0, [edi + 192]\n  movq mm1, [edi + 200]\n  movq mm2, [edi + 208]\n  movq mm3, [edi + 216]\n  movq mm4, [edi + 224]\n  movq mm5, [edi + 232]\n  movq mm6, [edi + 240]\n  movq mm7, [edi + 248]\n  add esi, 64\n  add edi, 256\n  cmp edx, esi              ; bounds check, expects size to be multiple of 64 elements\n  jge mmx_read32_pass_loop\n\n  ; zero the index, get back to start, decrement iteration count\n  xor esi, esi\n  lea edi, [ecx + esi * 4]\n  dec eax\n  jnz mmx_read32_pass_loop\n  pop edi\n  pop esi\n  pop edx\n  pop ecx\n\n  ; MMX registers alias the x87 stack and mark every slot as in use.\n  ; Clear that state first, otherwise fld1 overflows the x87 stack and the\n  ; caller is left with a corrupted FPU.\n  emms\n  fld1        ; float return value in st0\n  ret 4       ; fastcall: callee pops the iterations argument\n\n; Reads the array with 4-byte integer loads, 32 elements (128 bytes) per loop trip.\n; ecx = ptr to array, edx = arr length\n; [esp + 4] = iterations\n; eax and ebx are load targets here, so the iteration count stays in its stack slot.\nscalar_asm_read32:\n@scalar_asm_read32@12:\n  push ebx\n  push ecx\n  push edx\n  push esi\n  push edi\n  sub edx, 32 ; last trip: esi == edx. esi > edx = pass finished\n  xor esi, esi ; index into array = 0\n  lea edi, [ecx + esi * 4]\nscalar_read32_pass_loop:\n  mov eax, [edi]\n  mov ebx, [edi + 4]\n  mov eax, [edi + 8]\n  mov ebx, [edi + 12]\n  mov eax, [edi + 16]\n  mov ebx, [edi + 20]\n  mov eax, [edi + 24]\n  mov ebx, [edi + 28]\n  mov eax, [edi + 32]\n  mov ebx, [edi + 36]\n  mov eax, [edi + 40]\n  mov ebx, [edi + 44]\n  mov eax, [edi + 48]\n  mov ebx, [edi + 52]\n  mov eax, [edi + 56]\n  mov ebx, [edi + 60]\n\n  mov eax, [edi + 64]\n  mov ebx, [edi + 68]\n  mov eax, [edi + 72]\n  mov ebx, [edi + 76]\n  mov eax, [edi + 80]\n  mov ebx, [edi + 84]\n  mov eax, [edi + 88]\n  mov ebx, [edi + 92]\n  mov eax, [edi + 96]\n  mov ebx, [edi + 100]\n  mov eax, [edi + 104]\n  mov ebx, [edi + 108]\n  mov eax, [edi + 112]\n  mov ebx, [edi + 116]\n  mov eax, [edi + 120]\n  mov ebx, [edi + 124]\n\n  add esi, 32\n  add edi, 128\n  cmp edx, esi              ; bounds check, expects size to be a multiple of 32 elements\n  jge scalar_read32_pass_loop\n\n  ; zero the index, get back to start, decrement iteration count\n  xor esi, esi\n  lea edi, [ecx + esi * 4]\n  dec dword [esp + 24]      ; iterations: 5 saved registers (20 bytes) + return address (4 bytes)\n  jnz scalar_read32_pass_loop\n  pop edi\n  pop esi\n  pop edx\n  pop ecx\n  pop ebx\n\n  fld1        ; float return value in st0\n  ret 4       ; fastcall: callee pops the iterations argument\n"
  },
  {
    "path": "MemoryBandwidth/MemoryBandwidth.c",
    "content": "// MemoryBandwidth.c : Version for linux (x86 and ARM)\r\n// Mostly the same as the x86-only VS version, but a bit more manual\r\n\r\n#define _GNU_SOURCE\r\n#include <stdio.h>\r\n#include <stdlib.h>\r\n#include <stdint.h>\r\n#include <string.h>\r\n\r\n#include <sys/time.h>\r\n#include <unistd.h>\r\n#include <sched.h>\r\n#include <pthread.h>\r\n#include <sched.h>\r\n#include <math.h>\r\n#include <errno.h>\r\n\r\n#ifndef __MINGW32__\r\n#include <sys/mman.h>\r\n#include <sys/syscall.h>\r\n#include <sys/ioctl.h>\r\n#include <linux/perf_event.h>\r\n#include <unistd.h>\r\n#include <sys/types.h>\r\n#include <sys/stat.h>\r\n#include <fcntl.h> \r\n#include \"../Common/perfmon.h\"\r\n#endif \r\n\r\n#ifdef NUMA\r\n#include <sys/sysinfo.h>\r\n#include <numa.h>\r\n#endif\r\n\r\n#ifndef gettid\r\n#define gettid() ((pid_t)syscall(SYS_gettid))\r\n#endif\r\n\r\n#define HUGEPAGE_HACK 1\r\n#undef HUGEPAGE_HACK\r\n\r\n#pragma GCC diagnostic ignored \"-Wattributes\"\r\n\r\nint default_test_sizes[] = { 2, 4, 8, 12, 16, 24, 32, 48, 64, 96, 128, 192, 256, 384, 400, 448, 512, 600, 768, 1024, 1536, 2048, 2560,\r\n                               3072, 4096, 5120, 6144, 8192, 10240, 12288, 14336, 15360, 16384, 18432, 20480, 24567, 32768, 40960, 51200, 61440, 65536, 98304,\r\n                               131072, 262144, 393216, 524288, 1048576, 1572864, 2097152, 3145728 };\r\n\r\ntypedef struct BandwidthTestThreadData {\r\n    uint64_t iterations;\r\n    uint64_t arr_length;\r\n    uint64_t start;\r\n    float* arr;\r\n    float bw; // written to by the thread\r\n    #ifdef NUMA\r\n    cpu_set_t cpuset; // if numa set, will set affinity\r\n    #endif\r\n} BandwidthTestThreadData;\r\n\r\nfloat MeasureBw(uint64_t sizeKb, uint64_t iterations, uint64_t threads, int shared, int nopBytes, int coreNode, int memNode);\r\n\r\n#ifdef __x86_64\r\n#include <cpuid.h>\r\nfloat scalar_read(float* arr, uint64_t arr_length, uint64_t iterations, uint64_t start) __attribute((ms_abi));\r\nextern 
float sse_read(float* arr, uint64_t arr_length, uint64_t iterations, uint64_t start) __attribute__((ms_abi));\r\nextern float sse_write(float* arr, uint64_t arr_length, uint64_t iterations, uint64_t start) __attribute__((ms_abi));\r\nextern float sse_ntwrite(float* arr, uint64_t arr_length, uint64_t iterations, uint64_t start) __attribute__((ms_abi));\r\nextern float avx512_read(float* arr, uint64_t arr_length, uint64_t iterations, uint64_t start) __attribute__((ms_abi));\r\nextern float avx512_write(float* arr, uint64_t arr_length, uint64_t iterations, uint64_t start) __attribute__((ms_abi));\r\nextern float avx512_copy(float* arr, uint64_t arr_length, uint64_t iterations, uint64_t start) __attribute__((ms_abi));\r\nextern float avx512_add(float* arr, uint64_t arr_length, uint64_t iterations, uint64_t start) __attribute__((ms_abi));\r\nextern float repmovsb_copy(float *arr, uint64_t arr_length, uint64_t iterations, uint64_t start) __attribute__((ms_abi));\r\nextern float repmovsd_copy(float *arr, uint64_t arr_length, uint64_t iterations, uint64_t start) __attribute__((ms_abi));\r\nextern float repstosb_write(float *arr, uint64_t arr_length, uint64_t iterations, uint64_t start) __attribute__((ms_abi));\r\nextern float repstosd_write(float *arr, uint64_t arr_length, uint64_t iterations, uint64_t start) __attribute__((ms_abi));\r\nfloat (*bw_func)(float*, uint64_t, uint64_t, uint64_t start) __attribute__((ms_abi));\r\n#else\r\nfloat scalar_read(float* arr, uint64_t arr_length, uint64_t iterations, uint64_t start);\r\nfloat (*bw_func)(float*, uint64_t, uint64_t, uint64_t start);\r\n#endif\r\n\r\n#ifdef __x86_64\r\nextern float asm_read(float* arr, uint64_t arr_length, uint64_t iterations, uint64_t start) __attribute__((ms_abi));\r\nextern float asm_write(float* arr, uint64_t arr_length, uint64_t iterations, uint64_t start) __attribute__((ms_abi));\r\nextern float asm_copy(float *arr, uint64_t arr_length, uint64_t iterations, uint64_t start) 
__attribute__((ms_abi));\r\nextern float asm_cflip(float *arr, uint64_t arr_length, uint64_t iterations, uint64_t start) __attribute__((ms_abi));\r\nextern float asm_add(float *arr, uint64_t arr_length, uint64_t iterations, uint64_t start) __attribute__((ms_abi));\r\n#else\r\nextern float asm_read(float* arr, uint64_t arr_length, uint64_t iterations, uint64_t start);\r\nextern float asm_write(float* arr, uint64_t arr_length, uint64_t iterations, uint64_t start);\r\nextern float asm_copy(float *arr, uint64_t arr_length, uint64_t iterations, uint64_t start);\r\nextern float asm_cflip(float *arr, uint64_t arr_length, uint64_t iterations, uint64_t start);\r\nextern float asm_add(float *arr, uint64_t arr_length, uint64_t iterations, uint64_t start);\r\n#endif\r\n\r\n#ifdef __aarch64__\r\nextern void flush_icache(void *arr, uint64_t length);\r\n#endif\r\n\r\n#ifdef __x86_64\r\n__attribute((ms_abi)) float instr_read(float *arr, uint64_t arr_length, uint64_t iterations, uint64_t start) {\r\n#else\r\nfloat instr_read(float *arr, uint64_t arr_length, uint64_t iterations, uint64_t start) { \r\n#endif\r\n    void (*nopfunc)(uint64_t) __attribute((ms_abi)) = (__attribute((ms_abi)) void(*)(uint64_t))arr;\r\n    for (int iterIdx = 0; iterIdx < iterations; iterIdx++) nopfunc(iterations);\r\n    return 1.1f;\r\n}\r\n\r\nvoid FillInstructionArray(uint64_t *nops, uint64_t sizeKb, int nopSize, int branchInterval); \r\nuint64_t GetIterationCount(uint64_t testSize, uint64_t threads);\r\nvoid *ReadBandwidthTestThread(void *param);\r\nvoid *allocate_memory(size_t bytes, unsigned int threadOffset);\r\nuint64_t gbToTransfer = 512;\r\nint branchInterval = 0; \r\n\r\ncpu_set_t global_cpuset;\r\nint hardaffinity = 0;\r\n\r\n#ifdef NUMA\r\n#define NUMA_STRIPE 1\r\n#define NUMA_SEQ 2\r\n#define NUMA_CROSSNODE 3\r\n#define NUMA_AUTO 4\r\n#define NUMA_DOUBLE_CROSSNODE 5\r\nint numa = 0;\r\n#endif\r\n\r\nint pmon = 0;\r\n\r\nint main(int argc, char *argv[]) {\r\n    int threads = 1;\r\n    int 
cpuid_data[4];\r\n    int shared = 1;\r\n    int sleepTime = 0;\r\n    int methodSet = 0, nopBytes = 0, testBankConflict = 0;\r\n    int testBankConflict128 = 0;\r\n    int singleSize = 0, autothreads = 0;\r\n    int testSizeCount = sizeof(default_test_sizes) / sizeof(int);\r\n\r\n#ifdef __x86_64\r\n    int sseSupported = 0, avxSupported = 0, avx512Supported = 0;\r\n    sseSupported = __builtin_cpu_supports(\"sse\");\r\n    if (sseSupported) fprintf(stderr, \"SSE supported\\n\");\r\n    avxSupported = __builtin_cpu_supports(\"avx\");\r\n    if (avxSupported) fprintf(stderr, \"AVX supported\\n\");\r\n    // gcc has no __builtin_cpu_supports for avx512, so check by hand.\r\n    // eax = 7 -> extended features, bit 16 of ebx = avx512f\r\n    uint32_t cpuidEax, cpuidEbx, cpuidEcx, cpuidEdx;\r\n    __cpuid_count(7, 0, cpuidEax, cpuidEbx, cpuidEcx, cpuidEdx);\r\n    if (cpuidEbx & (1UL << 16)) {\r\n        fprintf(stderr, \"AVX512 supported\\n\");\r\n        avx512Supported = 1;\r\n    }\r\n#endif\r\n\r\n    bw_func = asm_read;\r\n    for (int argIdx = 1; argIdx < argc; argIdx++) {\r\n        if (*(argv[argIdx]) == '-') {\r\n            char *arg = argv[argIdx] + 1;\r\n            if (strncmp(arg, \"threads\", 7) == 0) {\r\n                argIdx++;\r\n                threads = atoi(argv[argIdx]);\r\n                fprintf(stderr, \"Using %d threads\\n\", threads);\r\n            } else if (strncmp(arg, \"shared\", 6) == 0) {\r\n                shared = 1;\r\n                fprintf(stderr, \"Using shared array\\n\");\r\n            } else if (strncmp(arg, \"hardaffinity\", 12) == 0) {\r\n                hardaffinity = 1;\r\n                CPU_ZERO(&global_cpuset);\r\n                CPU_SET(0, &global_cpuset);\r\n                CPU_SET(1, &global_cpuset);\r\n                sched_setaffinity(gettid(), sizeof(cpu_set_t), &global_cpuset);\r\n                fprintf(stderr, \"hardaffinity 0,1\\n\");\r\n            }\r\n            else if (strncmp(arg, \"sleep\", 5) == 
0) {\r\n                argIdx++;\r\n                sleepTime = atoi(argv[argIdx]);\r\n                fprintf(stderr, \"Sleeping for %d second between tests\\n\", sleepTime);\r\n            } else if (strncmp(arg, \"private\", 7) == 0) {\r\n                shared = 0;\r\n                fprintf(stderr, \"Using private array for each thread\\n\");\r\n            } else if (strncmp(arg, \"branchinterval\", 14) == 0) {\r\n                argIdx++;\r\n                branchInterval = atoi(argv[argIdx]);\r\n                fprintf(stderr, \"Will add a branch roughly every %d bytes\\n\", branchInterval * 8);\r\n            } else if (strncmp(arg, \"sizekb\", 6) == 0) {\r\n                argIdx++;\r\n        singleSize = atoi(argv[argIdx]);\r\n                fprintf(stderr, \"Testing %d KB\\n\", singleSize);\r\n            } else if (strncmp(arg, \"data\", 4) == 0) {\r\n                argIdx++;\r\n                gbToTransfer = atoi(argv[argIdx]);\r\n                fprintf(stderr, \"Base GB to transfer: %lu\\n\", gbToTransfer);\r\n            }\r\n            else if (strncmp(arg, \"autothreads\", 11) == 0) {\r\n                argIdx++;\r\n                autothreads = atoi(argv[argIdx]);\r\n                fprintf(stderr, \"Testing bw scaling up to %d threads\\n\", autothreads);\r\n            }\r\n#ifndef __MINGW32__\r\n            else if (strncmp(arg, \"pmon\", 4) == 0) {\r\n                pmon = 1;\r\n                fprintf(stderr, \"Using hardware performance monitoring\\n\");\r\n            }\r\n#endif\r\n#ifdef NUMA\r\n            else if (strncmp(arg, \"numa\", 4) == 0) {\r\n                argIdx++;\r\n                fprintf(stderr, \"Attempting to be NUMA aware\\n\");\r\n                if (strncmp(argv[argIdx], \"crossnode\", 4) == 0) {\r\n                    fprintf(stderr, \"Testing node to node bandwidth, 1 GB test size\\n\");\r\n                    numa = NUMA_CROSSNODE;\r\n                    singleSize = 1048576;\r\n                } else if 
(strncmp(argv[argIdx], \"seq\", 3) == 0) {\r\n                    fprintf(stderr, \"Filling NUMA nodes one by one\\n\");\r\n                    numa = NUMA_SEQ;\r\n                } else if (strncmp(argv[argIdx], \"stripe\", 6) == 0) {\r\n                    fprintf(stderr, \"Striping threads across NUMA nodes\\n\");\r\n                    numa = NUMA_STRIPE;\r\n                } else if (strncmp(argv[argIdx], \"doublecross\", 10) == 0) {\r\n                    fprintf(stderr, \"Crossnode, with two nodes\\n\");\r\n                    numa = NUMA_DOUBLE_CROSSNODE;\r\n                }\r\n            }\r\n#endif\r\n            else if (strncmp(arg, \"method\", 6) == 0) {\r\n                methodSet = 1;\r\n                argIdx++;\r\n                if (strncmp(argv[argIdx], \"scalar\", 6) == 0) {\r\n                    bw_func = scalar_read;\r\n                    fprintf(stderr, \"Using scalar C code\\n\");\r\n                } else if (strncmp(argv[argIdx], \"asm\", 3) == 0) {\r\n                    bw_func = asm_read;\r\n                    fprintf(stderr, \"Using ASM code (AVX or NEON)\\n\");\r\n                } else if (strncmp(argv[argIdx], \"write\", 5) == 0) {\r\n                    bw_func = asm_write;\r\n                    fprintf(stderr, \"Using ASM code (AVX or NEON), testing write bw instead of read\\n\");\r\n                    #ifdef __x86_64\r\n                    if (avx512Supported) {\r\n                        fprintf(stderr, \"Using AVX-512 because that's supported\\n\");\r\n                        bw_func = avx512_write;\r\n                    }\r\n                    #endif\r\n                } else if (strncmp(argv[argIdx], \"copy\", 4) == 0) {\r\n                    bw_func = asm_copy;\r\n                    fprintf(stderr, \"Using ASM code (AVX or NEON), testing copy bw instead of read\\n\");\r\n                    #ifdef __x86_64\r\n                    if (avx512Supported) {\r\n                        fprintf(stderr, \"Using AVX-512 
because that's supported\\n\");\r\n                        bw_func = avx512_copy;\r\n                    }\r\n                    #endif\r\n                } else if (strncmp(argv[argIdx], \"cflip\", 5) == 0) {\r\n                    bw_func = asm_cflip;\r\n                    fprintf(stderr, \"Using ASM code (AVX or NEON), flipping order of elements within cacheline\\n\");\r\n                } else if (strncmp(argv[argIdx], \"add\", 3) == 0) {\r\n                    bw_func = asm_add;\r\n                    fprintf(stderr, \"Using ASM code (AVX or NEON), adding constant to array\\n\");\r\n                    #ifdef __x86_64\r\n                    if (avx512Supported) {\r\n                        fprintf(stderr, \"Using AVX-512 because that's supported\\n\");\r\n                        bw_func = avx512_add;\r\n                    }\r\n                    #endif\r\n                }\r\n\r\n                else if (strncmp(argv[argIdx], \"instr8\", 6) == 0) {\r\n                    nopBytes = 8;\r\n                     bw_func = instr_read;\r\n                    fprintf(stderr, \"Testing instruction fetch bandwidth with 8 byte instructions.\\n\");\r\n                } else if (strncmp(argv[argIdx], \"instr4\", 6) == 0) {\r\n                    nopBytes = 4;\r\n                     bw_func = instr_read;\r\n                    fprintf(stderr, \"Testing instruction fetch bandwidth with 4 byte instructions.\\n\");\r\n                } else if (strncmp(argv[argIdx], \"instr2\", 6) == 0) {\r\n                    nopBytes = 2;\r\n                    bw_func = instr_read;\r\n                    fprintf(stderr, \"Testing instruction fetch bandwith with 2 byte instructions.\\n\");\r\n                }\r\n                #ifdef __x86_64\r\n                else if (strncmp(argv[argIdx], \"instrk8_4\", 8) == 0) {\r\n                    nopBytes = 3;\r\n                    bw_func = instr_read;\r\n                    fprintf(stderr, \"Testing instruction bandwidth using 4B NOP 
encoding recommended in the Athlon optimization manual\\n\");\r\n                }\r\n                else if (strncmp(argv[argIdx], \"instr_funcs\", 11) == 0) {\r\n                    nopBytes = -1;\r\n                    bw_func = instr_read;\r\n                    fprintf(stderr, \"Testing instruction bandwidth with call to function/return blocks\\n\");\r\n                } \r\n                else if (strncmp(argv[argIdx], \"avx512\", 6) == 0) {\r\n                    bw_func = avx512_read;\r\n                    fprintf(stderr, \"Using ASM code, AVX512\\n\");\r\n                }\r\n                else if (strncmp(argv[argIdx], \"sse_write\", 9) == 0) {\r\n                    bw_func = sse_write;\r\n                    fprintf(stderr, \"Using SSE to test write bandwidth\\n\");\r\n                }\r\n                else if (strncmp(argv[argIdx], \"sse_ntwrite\", 11) == 0) {\r\n                    bw_func = sse_ntwrite;\r\n                    fprintf(stderr, \"Using SSE NT writes to test write bandwidth\\n\");\r\n                } \r\n                else if (strncmp(argv[argIdx], \"sse\", 3) == 0) {\r\n                    bw_func = sse_read;\r\n                    fprintf(stderr, \"Using ASM code, SSE\\n\");\r\n                }\r\n                else if (strncmp(argv[argIdx], \"avx\", 3) == 0) {\r\n                    bw_func = asm_read;\r\n                    fprintf(stderr, \"Using ASM code, AVX\\n\");\r\n                } \r\n                else if (strncmp(argv[argIdx], \"repmovsb\", 8) == 0) {\r\n                    bw_func = repmovsb_copy;\r\n                    fprintf(stderr, \"Using REP MOVSB to copy\\n\");\r\n                }\r\n                else if (strncmp(argv[argIdx], \"repmovsd\", 8) == 0) {\r\n                    bw_func = repmovsd_copy;\r\n                    fprintf(stderr, \"Using REP MOVSD to copy\\n\");\r\n                }\r\n                else if (strncmp(argv[argIdx], \"repstosb\", 9) == 0) {\r\n                    bw_func = 
repstosb_write;\r\n                    fprintf(stderr, \"Using REP STOSB to write\\n\");\r\n                } \r\n                else if (strncmp(argv[argIdx], \"repstosd\", 9) == 0) {\r\n                    bw_func = repstosd_write;\r\n                    fprintf(stderr, \"Using REP STOSD to write\\n\");\r\n                }  \r\n                #endif\r\n        \r\n            }\r\n        } else {\r\n            fprintf(stderr, \"Expected - parameter\\n\");\r\n            fprintf(stderr, \"Usage: [-threads <thread count>] [-private] [-method <scalar/asm/avx512>] [-sleep <time in seconds>] [-sizekb <single test size>]\\n\");\r\n        }\r\n    }\r\n\r\n#ifdef __x86_64\r\n    // if no method was specified, attempt to pick the best one for x86\r\n    // for aarch64 we'll just use NEON because SVE basically doesn't exist\r\n    if (!methodSet) {\r\n        bw_func = scalar_read;\r\n        if (sseSupported) {\r\n            bw_func = sse_read;\r\n        }\r\n\r\n        if (avxSupported) {\r\n            bw_func = asm_read;\r\n        }\r\n\r\n\r\n        if (avx512Supported) {\r\n            bw_func = avx512_read;\r\n        }\r\n    }\r\n#endif\r\n\r\n    if (autothreads > 0) {\r\n        float *threadResults = (float *)malloc(sizeof(float) * autothreads * testSizeCount);\r\n        printf(\"Auto threads mode, up to %d threads\\n\", autothreads);\r\n        for (int threadIdx = 1; threadIdx <= autothreads; threadIdx++) {\r\n            if (singleSize != 0) {\r\n                threadResults[threadIdx - 1] = MeasureBw(singleSize, GetIterationCount(singleSize, threadIdx), threadIdx, shared, nopBytes, 0, 0);\r\n                fprintf(stderr, \"%d threads: %f GB/s\\n\", threadIdx, threadResults[threadIdx - 1]);\r\n            } else {\r\n                for (int i = 0; i < testSizeCount; i++) {\r\n                    int currentTestSize = default_test_sizes[i];\r\n                    //fprintf(stderr, \"Testing size %d\\n\", currentTestSize);\r\n                  
  threadResults[(threadIdx - 1) * testSizeCount + i] = MeasureBw(currentTestSize, GetIterationCount(currentTestSize, threadIdx), threadIdx, shared, nopBytes, 0, 0);\r\n                    fprintf(stderr, \"%d threads, %d KB total: %f GB/s\\n\", threadIdx, currentTestSize, threadResults[(threadIdx - 1) * testSizeCount + i]);\r\n                }\r\n            }\r\n        }\r\n\r\n        if (singleSize != 0) {\r\n            printf(\"Threads, BW (GB/s)\\n\");\r\n            for (int i = 0;i < autothreads; i++) {\r\n                printf(\"%d,%f\\n\", i + 1, threadResults[i]);\r\n            }\r\n        } else {\r\n            printf(\"Test size down, threads across, value = GB/s\\n\");\r\n            for (int sizeIdx = 0; sizeIdx < testSizeCount; sizeIdx++) {\r\n                printf(\"%d\", default_test_sizes[sizeIdx]);\r\n                for (int threadIdx = 1; threadIdx <= autothreads; threadIdx++) {\r\n                    printf(\",%f\", threadResults[(threadIdx - 1) * testSizeCount + sizeIdx]);\r\n                }\r\n\r\n                printf(\"\\n\");\r\n            }\r\n        }\r\n\r\n        free(threadResults);\r\n    } \r\n#ifdef NUMA\r\n    else if (numa == NUMA_CROSSNODE) {\r\n        if (numa_available() == -1) {\r\n        fprintf(stderr, \"NUMA is not available\\n\");\r\n        return 0;\r\n    }\r\n\r\n        struct bitmask *nodeBitmask = numa_allocate_cpumask();\r\n    int numaNodeCount = numa_max_node() + 1;\r\n    fprintf(stderr, \"System has %d NUMA nodes\\n\", numaNodeCount);\r\n        float *crossnodeBandwidths = (float *)malloc(sizeof(float) * numaNodeCount * numaNodeCount);\r\n    memset(crossnodeBandwidths, 0, sizeof(float) * numaNodeCount * numaNodeCount);\r\n        for (int cpuNode = 0; cpuNode < numaNodeCount; cpuNode++) {\r\n            numa_node_to_cpus(cpuNode, nodeBitmask);\r\n        int nodeCpuCount = numa_bitmask_weight(nodeBitmask);\r\n        if (nodeCpuCount == 0) {\r\n            fprintf(stderr, \"Node %d has no 
cores\\n\", cpuNode);\r\n            continue;\r\n        }\r\n\r\n        fprintf(stderr, \"Node %d has %d cores\\n\", cpuNode, nodeCpuCount);\r\n            for (int memNode = 0; memNode < numaNodeCount; memNode++) {\r\n            fprintf(stderr, \"Testing CPU node %d to mem node %d\\n\", cpuNode, memNode);\r\n                crossnodeBandwidths[cpuNode * numaNodeCount + memNode] = \r\n                MeasureBw(singleSize, GetIterationCount(singleSize, nodeCpuCount), nodeCpuCount, shared, nopBytes, cpuNode, memNode);\r\n            fprintf(stderr, \"CPU node %d <- mem node %d: %f\\n\", cpuNode, memNode, crossnodeBandwidths[cpuNode * numaNodeCount + memNode]);\r\n            }\r\n        }\r\n\r\n        for (int memNode = 0; memNode < numaNodeCount; memNode++) {\r\n        printf(\",%d\", memNode);\r\n    }\r\n\r\n    printf(\"\\n\");\r\n    for (int cpuNode = 0; cpuNode < numaNodeCount; cpuNode++) {\r\n        printf(\"%d\", cpuNode);\r\n        for (int memNode = 0; memNode < numaNodeCount; memNode++) {\r\n            printf(\",%f\", crossnodeBandwidths[cpuNode * numaNodeCount + memNode]);\r\n        }\r\n\r\n        printf(\"\\n\");\r\n    }\r\n\r\n        numa_free_cpumask(nodeBitmask);\r\n    free(crossnodeBandwidths);\r\n    }\r\n#endif\r\n    else {\r\n        printf(\"Using %d threads\\n\", threads);\r\n        printf(\"Size (KB),Bandwidth (GB/s)\");\r\n#ifndef __MINGW32__\r\n        if (pmon) {\r\n            open_perf_monitoring();\r\n            append_perf_header();\r\n        }\r\n#endif\r\n        printf(\"\\n\");\r\n        if (singleSize == 0)\r\n        {\r\n            for (int i = 0; i < testSizeCount; i++)\r\n            {\r\n                printf(\"%d,%f\", default_test_sizes[i], MeasureBw(default_test_sizes[i], GetIterationCount(default_test_sizes[i], threads), threads, shared, nopBytes, 0, 0));\r\n\r\n#ifndef __MINGW32__\r\n                if (pmon) append_perf_values();\r\n#endif\r\n                printf(\"\\n\");\r\n                if 
(sleepTime > 0) sleep(sleepTime);\r\n            }\r\n        }\r\n        else\r\n        {\r\n            printf(\"%d,%f\", singleSize, MeasureBw(singleSize, GetIterationCount(singleSize, threads), threads, shared, nopBytes, 0, 0));\r\n            append_perf_values();\r\n            printf(\"\\n\");\r\n        }\r\n\r\n        close_perf_monitoring();\r\n    }\r\n\r\n    return 0;\r\n}\r\n\r\n/// <summary>\r\n/// Given test size in KB, return a good iteration count\r\n/// </summary>\r\n/// <param name=\"testSize\">test size in KB</param>\r\n/// <returns>Iterations per thread</returns>\r\nuint64_t GetIterationCount(uint64_t testSize, uint64_t threads)\r\n{\r\n    int scaledGbToTransfer = gbToTransfer;\r\n    if (testSize > 64) scaledGbToTransfer = gbToTransfer / 8;\r\n    uint64_t iterations = scaledGbToTransfer * 1024 * 1024 / testSize;\r\n    if (iterations % 2 != 0) iterations += 1;  // must be even\r\n\r\n    if (iterations < 8) return 8; // set a minimum to reduce noise\r\n    else return iterations;\r\n}\r\n\r\n// Writes 7B NOP + return\r\nvoid WriteReturn8BBlock(char *dst) {\r\n    dst[0] = 0xF;\r\n    dst[1] = 0x1F;\r\n    dst[2] = 0x80;\r\n    for (int i = 0; i < 4; i++) dst[i + 3] = 0;\r\n    dst[7] = 0xC3;\r\n}\r\n\r\nvoid FillInstructionArray(uint64_t *nops, uint64_t sizeKb, int nopSize, int branchInterval) {\r\n#ifdef __x86_64\r\n    char nop2b[8] = { 0x66, 0x90, 0x66, 0x90, 0x66, 0x90, 0x66, 0x90 };\r\n    char nop2b_xor[8] = { 0x31, 0xc0, 0x31, 0xc0, 0x31, 0xc0, 0x31, 0xc0 };\r\n    char nop8b[8] = { 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00 };\r\n\r\n    // zen/piledriver optimization manual uses this pattern\r\n    char nop4b[8] = { 0x0F, 0x1F, 0x40, 0x00, 0x0F, 0x1F, 0x40, 0x00 };\r\n\r\n    // athlon64 (K8) optimization manual pattern\r\n    char k8_nop4b[8] = { 0x66, 0x66, 0x66, 0x90, 0x66, 0x66, 0x66, 0x90 };\r\n    char nop4b_with_branch[8] = { 0x0F, 0x1F, 0x40, 0x00, 0xEB, 0x00, 0x66, 0x90 };\r\n#endif\r\n\r\n#ifdef __aarch64__\r\n    
char nop4b[8] = { 0x1F, 0x20, 0x03, 0xD5, 0x1F, 0x20, 0x03, 0xD5 };\r\n\r\n    // hack this to deal with graviton 1 / A72\r\n    // nop + mov x0, 0\r\n    char nop8b[8] = { 0x00, 0x00, 0x80, 0xD2, 0x00, 0x00, 0x80, 0xD2 }; \r\n    // mov x0, 0 + ldr x0, [sp] \r\n    char nop8b1[8] = { 0x00, 0x00, 0x80, 0xD2, 0x00, 0x00, 0x80, 0xD2 }; \r\n#endif\r\n\r\n#ifdef __riscv\r\n    // nop, fmv.s fa0, fa5\r\n    char nop4b[8] = { 0x13, 0x00, 0x00, 0x00, 0x53, 0x85, 0xf7, 0x20 };\r\n\r\n    // hack this to deal with graviton 1 / A72\r\n    // nop + mov x0, 0\r\n    char nop8b[8] = { 0x13, 0x00, 0x00, 0x00, 0x53, 0x85, 0xf7, 0x20  }; \r\n    // mov x0, 0 + ldr x0, [sp] \r\n    char nop8b1[8] = { 0x13, 0x00, 0x00, 0x00, 0xe0, 0x03, 0x40, 0xf9 };  \r\n#endif \r\n    \r\n    int specialFill = 0;\r\n    uint64_t *nop8bptr;\r\n    if (nopSize == 8) nop8bptr = (uint64_t *)(nop8b);\r\n    else if (nopSize == 4) nop8bptr = (uint64_t *)(nop4b);\r\n    #ifdef __x86_64\r\n    else if (nopSize == 2) nop8bptr = (uint64_t *)(nop2b_xor);\r\n    else if (nopSize == 3) nop8bptr = (uint64_t *)(k8_nop4b);\r\n    else if (nopSize == -1) {\r\n        // Special case for calls.\r\n        // [ cacheline ]    [ cacheline ]\r\n        //  call ---------->         ret\r\n        // each call+ret will take 128B\r\n        // Size is in KB so it's guaranteed to be divisible by 128B\r\n        // Each 1 KB block has eight 128B blocks\r\n        uint64_t callCount = sizeKb * 8;\r\n        char *instrArr = (char *)nops;\r\n        for (uint64_t callIdx = 0; callIdx < callCount; callIdx++) {\r\n            uint64_t callOffset = 64 * callIdx;\r\n            uint32_t callDestinationOffsetInArray = (sizeKb * 1024) / 2 + 64 * callIdx;\r\n            // call instruction: E8 [4B relative displacement], 5B total. 
\r\n            instrArr[callOffset] = 0xE8;\r\n            uint32_t *relativeDisplacementPtr = (uint32_t*)(instrArr + callOffset + 1);\r\n            *relativeDisplacementPtr = callDestinationOffsetInArray - callOffset - 5;\r\n\r\n            // pad out rest of 64B with NOPs, but no more than 8B per NOP\r\n            // finish out first 8B segment with a 3B NOP\r\n            instrArr[callOffset + 5] = 0x0F;\r\n            instrArr[callOffset + 6] = 0x1F;\r\n            instrArr[callOffset + 7] = 0;\r\n\r\n            // Then pad out the rest with 7x 8B NOPs\r\n            nop8bptr = (uint64_t *)(nop8b);\r\n            for (int nop8bIdx = 0; nop8bIdx < 7; nop8bIdx++) {\r\n                *(uint64_t *)(instrArr + callOffset + 8 * (nop8bIdx + 1)) = *nop8bptr;\r\n            }\r\n\r\n            // Last call block should have a return at the end\r\n            if (callIdx == callCount - 1) {\r\n                WriteReturn8BBlock(instrArr + callOffset + 56);\r\n            }\r\n\r\n            // 7x 8B NOPs in call target\r\n            for (int nop8bIdx = 0; nop8bIdx < 7; nop8bIdx++) {\r\n                *(uint64_t *)(instrArr + callDestinationOffsetInArray + (8 * nop8bIdx)) = *nop8bptr;\r\n            }\r\n\r\n            WriteReturn8BBlock(instrArr + callDestinationOffsetInArray + 56);\r\n        }\r\n\r\n        specialFill = 1;\r\n    }\r\n    #endif\r\n    else {\r\n        fprintf(stderr, \"%d byte instruction length isn't supported :(\\n\", nopSize);\r\n    }\r\n\r\n    uint64_t elements = sizeKb * 1024 / 8 - 1;\r\n    if (!specialFill) {\r\n        for (uint64_t nopIdx = 0; nopIdx < elements; nopIdx++) {\r\n            nops[nopIdx] = *nop8bptr;\r\n#ifdef __x86_64\r\n            uint64_t *nopBranchPtr = (uint64_t *)nop4b_with_branch;\r\n            if (branchInterval > 1 && nopIdx % branchInterval == 0) nops[nopIdx] = *nopBranchPtr;\r\n#endif\r\n#ifdef __aarch64__\r\n            if (nopSize == 8) {\r\n                  uint64_t *otherNops = (uint64_t 
*)nop8b1;\r\n                  if (nopIdx & 1) nops[nopIdx] = *otherNops;\r\n            }\r\n#endif\r\n        }\r\n        \r\n        // ret\r\n        #ifdef __x86_64\r\n        unsigned char *functionEnd = (unsigned char *)(nops + elements);\r\n        functionEnd[0] = 0xC3;\r\n        #endif\r\n        #ifdef __aarch64__\r\n        uint64_t *functionEnd = (uint64_t *)(nops + elements);\r\n        functionEnd[0] = 0XD65F03C0;\r\n        //flush_icache((void *)nops, funcLen);\r\n        __builtin___clear_cache(nops, functionEnd);\r\n        #endif\r\n        #ifdef __riscv\r\n        uint64_t *functionEnd = (unsigned char *)(nops + elements);\r\n        functionEnd[0] = 0x8082;\r\n        #endif \r\n    }\r\n\r\n#ifndef HUGEPAGE_HACK\r\n    size_t funcLen = sizeKb * 1024;\r\n    uint64_t nopfuncPage = (~0xFFF) & (uint64_t)(nops);\r\n    size_t mprotectLen = (0xFFF & (uint64_t)(nops)) + funcLen;\r\n    \r\n    if (mprotect((void *)nopfuncPage, mprotectLen, PROT_EXEC | PROT_READ | PROT_WRITE) < 0) {\r\n        fprintf(stderr, \"mprotect failed, errno %d\\n\", errno);\r\n    }\r\n#endif\r\n}\r\n\r\n// If coreNode and memNode are set, use the specified numa config\r\n// otherwise if numa is set to stripe or seq, respect that\r\nfloat MeasureBw(uint64_t sizeKb, uint64_t iterations, uint64_t threads, int shared, int nopBytes, int coreNode, int memNode) {\r\n    struct timeval startTv, endTv;\r\n    struct timezone startTz, endTz;\r\n    float bw = 0;\r\n    uint64_t elements = sizeKb * 1024 / sizeof(float);\r\n\r\n    if (!shared && sizeKb < threads) {\r\n        fprintf(stderr, \"Too many threads for this test size\\n\");\r\n        return 0;\r\n    }\r\n\r\n    // make sure this is divisble by 512 bytes, since the unrolled asm loop depends on that\r\n    // it's hard enough to get close to theoretical L1D BW as is, so we don't want additional cmovs or branches\r\n    // in the hot loop\r\n    uint64_t private_elements = ceil((double)sizeKb / (double)threads) * 
256;\r\n    //fprintf(stderr, \"Actual data: %lu B\\n\", private_elements * 4 * threads);\r\n    //fprintf(stderr, \"Data per thread: %lu B\\n\", private_elements * 4);\r\n\r\n    // make array and fill it with something, if shared\r\n    float* testArr = NULL;\r\n    if (shared){\r\n        //testArr = (float*)aligned_alloc(64, elements * sizeof(float));\r\n        testArr = allocate_memory(elements * sizeof(float), 0);\r\n        if (testArr == NULL) {\r\n                fprintf(stderr, \"Could not allocate memory\\n\");\r\n                return 0;\r\n        }\r\n\r\n        if (nopBytes == 0) {\r\n            for (uint64_t i = 0; i < elements; i++) {\r\n                testArr[i] = i + 0.5f;\r\n            }\r\n        } else FillInstructionArray((uint64_t *)testArr, sizeKb, nopBytes, branchInterval);\r\n    }\r\n    else\r\n    {\r\n        elements = private_elements; // will fill arrays below, per-thread\r\n    }\r\n\r\n    pthread_t* testThreads = (pthread_t*)malloc(threads * sizeof(pthread_t));\r\n    struct BandwidthTestThreadData* threadData = (struct BandwidthTestThreadData*)malloc(threads * sizeof(struct BandwidthTestThreadData));\r\n#ifdef NUMA\r\n    // if numa, tell each thread to set an affinity mask\r\n    struct bitmask *nodeBitmask = NULL;\r\n    cpu_set_t cpuset;\r\n    \r\n    if (numa == NUMA_CROSSNODE) {\r\n        nodeBitmask = numa_allocate_cpumask();\r\n    int nprocs = get_nprocs();\r\n        numa_node_to_cpus(coreNode, nodeBitmask); \r\n    CPU_ZERO(&cpuset);\r\n\r\n    // provided functions for manipultaing bitmask don't work\r\n    // for (int i = 0; i < nprocs; i++)\r\n    //   if (numa_bitmask_isbitset(nodeBitmask, i)) CPU_SET(i, &cpuset);\r\n    // bitmask has fields:\r\n    // - size = number of bits\r\n    // - maskp = pointer to bitmap\r\n    // cpu_set_t has field __bits. 
have to assume it's CPU_SETSIZE bits\r\n    // also assume bitmap size is divisible by 8 (byte size)\r\n    memcpy(cpuset.__bits, nodeBitmask->maskp, nodeBitmask->size / 8);\r\n    }\r\n#endif\r\n\r\n    for (uint64_t i = 0; i < threads; i++) {\r\n        if (shared)\r\n        {\r\n            threadData[i].arr = testArr;\r\n            threadData[i].iterations = iterations;\r\n        }\r\n        else\r\n        {\r\n#ifdef NUMA\r\n            int cpuCount = get_nprocs();\r\n            if (numa == NUMA_CROSSNODE) {\r\n                threadData[i].arr = numa_alloc_onnode(elements * sizeof(float), memNode);\r\n                threadData[i].cpuset = cpuset;\r\n            } else if (numa) {\r\n                // Figure out which nodes actually have CPUs and memory\r\n                //int numaNodeCount = numa_max_node() + 1;\r\n                int numaNodeCount = 4;   // for knl. geez\r\n                if (numa == NUMA_SEQ) {\r\n                    // unimplemented\r\n                    fprintf(stderr, \"sequential numa node fill not implemented yet\\n\");\r\n                } else if (numa == NUMA_STRIPE) {\r\n                    memNode = i % numaNodeCount;\r\n                    coreNode = memNode;\r\n                } else if (numa == NUMA_DOUBLE_CROSSNODE) {\r\n                    // hardcode source nodes to 0,1 and destinations 2,3\r\n\t\t    // edit this later for one-off testing\r\n                    coreNode = i & 1;\r\n                    memNode = (i & 1);\r\n                    fprintf(stderr, \"Thread %d: Core %d -> mem %d\\n\", i, coreNode, memNode);\r\n                }\r\n\r\n                for(int cpuIdx = 0; cpuIdx < get_nprocs(); cpuIdx++) {\r\n                    CPU_ZERO(&(threadData[i].cpuset));\r\n                    if(CPU_ISSET(i, &(threadData[i].cpuset))) {\r\n                        fprintf(stderr, \"bitmask not cleared\\n\");\r\n                    }\r\n                }\r\n\r\n                threadData[i].arr = 
numa_alloc_onnode(elements * sizeof(float), memNode);\r\n\r\n                for(int cpuIdx = 0; cpuIdx < get_nprocs(); cpuIdx++) {\r\n                    CPU_ZERO(&(threadData[i].cpuset));\r\n                    if(CPU_ISSET(i, &(threadData[i].cpuset))) {\r\n                        fprintf(stderr, \"bitmask not cleared\\n\");\r\n                    }\r\n                }\r\n\r\n                // cpu node affinity has to be set for each thread\r\n                nodeBitmask = numa_allocate_cpumask();\r\n                numa_node_to_cpus(coreNode, nodeBitmask); \r\n                CPU_ZERO(&(threadData[i].cpuset));\r\n                fprintf(stderr, \"\\tNode %d has CPUs:\", coreNode);\r\n                for (int cpuIdx = 0; cpuIdx < cpuCount; cpuIdx++) { \r\n                    if (numa_bitmask_isbitset(nodeBitmask, cpuIdx))  {\r\n                        CPU_SET(cpuIdx, &(threadData[i].cpuset)); \r\n                    }\r\n                }\r\n            } else {\r\n#endif\r\n                // Not NUMA aware. 
Allocate memory normally\r\n\t\t//threadData[i].arr = (float*)aligned_alloc(64, elements * sizeof(float));\r\n                threadData[i].arr = allocate_memory(elements * sizeof(float), i);\r\n                if (threadData[i].arr == NULL)\r\n                {\r\n                    fprintf(stderr, \"Could not allocate memory for thread %ld\\n\", i);\r\n                    return 0;\r\n                }\r\n#ifdef NUMA\r\n\t}\r\n#endif\r\n\r\n        if (nopBytes == 0) {\r\n            for (uint64_t arr_idx = 0; arr_idx < elements; arr_idx++) {\r\n                threadData[i].arr[arr_idx] = arr_idx + i + 0.5f;\r\n            }\r\n        } else FillInstructionArray((uint64_t *)threadData[i].arr, elements * sizeof(float) / 1024, nopBytes, branchInterval);\r\n\r\n            threadData[i].iterations = iterations * threads;\r\n        }\r\n\r\n        threadData[i].arr_length = elements;\r\n        threadData[i].bw = 0;\r\n        threadData[i].start = 0;\r\n        //if (elements > 8192 * 1024) threadData[i].start = 4096 * i; // must be multiple of 128 because of unrolling\r\n        //int pthreadRc = pthread_create(testThreads + i, NULL, ReadBandwidthTestThread, (void *)(threadData + i));\r\n    }\r\n\r\n#ifndef __MINGW32__\r\n    if (pmon) start_perf_monitoring();\r\n#endif\r\n    gettimeofday(&startTv, &startTz);\r\n    for (uint64_t i = 0; i < threads; i++) pthread_create(testThreads + i, NULL, ReadBandwidthTestThread, (void *)(threadData + i));\r\n    for (uint64_t i = 0; i < threads; i++) pthread_join(testThreads[i], NULL);\r\n    gettimeofday(&endTv, &endTz);\r\n#ifndef __MINGW32__\r\n    if (pmon) stop_perf_monitoring();\r\n#endif\r\n\r\n    uint64_t time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 1000);\r\n    double gbTransferred = iterations * sizeof(float) * elements * threads / (double)1e9;\r\n    bw = 1000 * gbTransferred / (double)time_diff_ms;\r\n    if (!shared) bw = bw * threads; // iteration count is 
divided by thread count if in thread private mode\r\n    //printf(\"%f GB, %lu ms\\n\", gbTransferred, time_diff_ms);\r\n#ifdef NUMA\r\n    if (numa) numa_free_cpumask(nodeBitmask);\r\n#endif\r\n    free(testThreads);\r\n    #ifndef HUGEPAGE_HACK\r\n    free(testArr); // should be null in not-shared (private) mode\r\n    #endif\r\n\r\n    if (!shared) {\r\n        for (uint64_t i = 0; i < threads; i++) {\r\n#ifdef NUMA\r\n        if (numa) numa_free(threadData[i].arr, elements * sizeof(float));\r\n        else\r\n#endif\r\n#ifndef HUGEPAGE_HACK\r\n            free(threadData[i].arr);\r\n#endif\r\n        }\r\n    }\r\n\r\n    free(threadData);\r\n    return bw;\r\n}\r\n\r\n// one place to make memory allocation calls\r\n#define HUGEPAGE_HACK_SIZE (1048576*1024)\r\nvoid *hugepageBuffer = NULL;\r\nvoid *allocate_memory(size_t bytes, unsigned int threadOffset)\r\n{\r\n    void *dst = NULL;\r\n    #ifndef HUGEPAGE_HACK\r\n    int posix_memalign_rc = 0;\r\n    if (posix_memalign_rc != posix_memalign((void **)(&dst), 64, bytes)) {\r\n        fprintf(stderr, \"Could not allocate memory: %d\\n\", posix_memalign_rc);\r\n        return NULL;\r\n    }\r\n\r\n    return dst;\r\n    #else\r\n    // todo: make this less of a hack\r\n    if (hugepageBuffer == NULL)\r\n    {\r\n        hugepageBuffer = mmap(NULL, HUGEPAGE_HACK_SIZE, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);\r\n        if (hugepageBuffer == NULL)\r\n        {\r\n            fprintf(stderr, \"Could not mmap memory with hugetlb\\n\");\r\n            return NULL;\r\n        }\r\n\r\n        if (threadOffset * bytes + bytes > HUGEPAGE_HACK_SIZE)\r\n        {\r\n            fprintf(stderr, \"Oh no\\n\");\r\n            return NULL;\r\n        }\r\n    }\r\n\r\n    // fprintf(stderr, \"Array offset for thread %d is %llu KB\\n\", threadOffset, bytes * threadOffset / 1024);\r\n    return (void *)((char *)hugepageBuffer + (bytes * threadOffset));\r\n    #endif\r\n}\r\n\r\n#ifdef 
__x86_64\r\n__attribute((ms_abi)) float scalar_read(float* arr, uint64_t arr_length, uint64_t iterations, uint64_t start) {\r\n#else\r\nfloat scalar_read(float* arr, uint64_t arr_length, uint64_t iterations, uint64_t start) {\r\n#endif\r\n    float sum = 0;\r\n    if (start + 16 >= arr_length) return 0;\r\n\r\n    uint64_t iter_idx = 0, i = start;\r\n    float s1 = 0, s2 = 1, s3 = 0, s4 = 1, s5 = 0, s6 = 1, s7 = 0, s8 = 1;\r\n    while (iter_idx < iterations) {\r\n        s1 += arr[i];\r\n        s2 *= arr[i + 1];\r\n        s3 += arr[i + 2];\r\n        s4 *= arr[i + 3];\r\n        s5 += arr[i + 4];\r\n        s6 *= arr[i + 5];\r\n        s7 += arr[i + 6];\r\n        s8 *= arr[i + 7];\r\n        i += 8;\r\n        if (i + 7 >= arr_length) i = 0;\r\n        if (i == start) iter_idx++;\r\n    }\r\n\r\n    sum += s1 + s2 + s3 + s4 + s5 + s6 + s7 + s8;\r\n\r\n    return sum;\r\n}\r\n\r\nvoid *ReadBandwidthTestThread(void *param) {\r\n    BandwidthTestThreadData* bwTestData = (BandwidthTestThreadData*)param;\r\n    if (hardaffinity) sched_setaffinity(gettid(), sizeof(cpu_set_t), &global_cpuset);\r\n#ifdef NUMA\r\n    if (numa) {\r\n        int affinity_rc = sched_setaffinity(gettid(), sizeof(cpu_set_t), &(bwTestData->cpuset));\r\n    if (affinity_rc != 0) {\r\n        fprintf(stderr, \"wtf set affinity failed: %s\\n\",strerror(errno));\r\n        \r\n    }\r\n    }\r\n#endif\r\n    float sum = bw_func(bwTestData->arr, bwTestData->arr_length, bwTestData->iterations, bwTestData->start);\r\n    if (sum == 0) printf(\"woohoo\\n\");\r\n    pthread_exit(NULL);\r\n}\r\n"
  },
  {
    "path": "MemoryBandwidth/MemoryBandwidth_arm.s",
    "content": ".arch armv8-a\r\n.text\r\n\r\n.global asm_read\r\n.global asm_write\r\n.global asm_cflip\r\n.global asm_copy\r\n.global asm_add\r\n.global flush_icache\r\n.global readbankconflict\r\n.global readbankconflict128\r\n\r\n.global _asm_read\r\n.global _asm_write\r\n.global _asm_cflip\r\n.global _asm_copy\r\n.global _asm_add\r\n.global _flush_icache\r\n.global _readbankconflict\r\n\r\n.balign 4\r\n\r\n/* x0 = ptr to array (was rcx)\r\n * x1 = arr length (was rdx)\r\n * x2 = iterations (was r8)\r\n * x3 = start (was r9)\r\n */\r\n_asm_read:\r\nasm_read:\r\n  sub sp, sp, #0x30\r\n  stp x14, x15, [sp, #0x10]\r\n  stp x12, x13, [sp, #0x20]\r\n  sub x1, x1, 128\r\n  mov x14, x3     /* set x14 = index into array to start location (x3) */\r\n  eor x13, x13, x13 /* x13 = 0 (for comparison) */\r\nasm_read_pass_loop:\r\n  lsl x12, x14, 2  /* x12 = x14 * 4, because float is 4B */\r\n  add x15, x0, x12 /* ptr (x15) to next element = x0 (base) + x12 (index *4) */\r\n  ldr q16, [x15]\r\n  ldr q17, [x15, 16]\r\n  ldr q18, [x15, 32]\r\n  ldr q19, [x15, 48]\r\n  ldr q20, [x15, 64]\r\n  ldr q21, [x15, 80]\r\n  ldr q22, [x15, 96]\r\n  ldr q22, [x15, 112]\r\n  add x14, x14, 32\r\n\r\n  lsl x12, x14, 2\r\n  add x15, x0, x12\r\n  ldr q16, [x15]\r\n  ldr q17, [x15, 16]\r\n  ldr q18, [x15, 32]\r\n  ldr q19, [x15, 48]\r\n  ldr q20, [x15, 64]\r\n  ldr q21, [x15, 80]\r\n  ldr q22, [x15, 96]\r\n  ldr q22, [x15, 112]\r\n  add x14, x14, 32\r\n\r\n  lsl x12, x14, 2\r\n  add x15, x0, x12\r\n  ldr q16, [x15]\r\n  ldr q17, [x15, 16]\r\n  ldr q18, [x15, 32]\r\n  ldr q19, [x15, 48]\r\n  ldr q20, [x15, 64]\r\n  ldr q21, [x15, 80]\r\n  ldr q22, [x15, 96]\r\n  ldr q22, [x15, 112]\r\n  add x14, x14, 32\r\n\r\n  lsl x12, x14, 2\r\n  add x15, x0, x12\r\n  ldr q16, [x15]\r\n  ldr q17, [x15, 16]\r\n  ldr q18, [x15, 32]\r\n  ldr q19, [x15, 48]\r\n  ldr q20, [x15, 64]\r\n  ldr q21, [x15, 80]\r\n  ldr q22, [x15, 96]\r\n  ldr q22, [x15, 112]\r\n  add x14, x14, 32\r\n\r\n  cmp x1, x14 /* if x1 (len - 
128) - x14 < 0, loop back around */\r\n  csel x14, x13, x14, LT\r\n  cmp x14, x3\r\n  b.ne asm_read_pass_loop /* skip iteration decrement if we're not back to start */\r\n  sub x2, x2, 1\r\n  cbnz x2, asm_read_pass_loop\r\n  add v0.4s, v16.4s, v16.4s\r\n  ldp x12, x13, [sp, #0x20]\r\n  ldp x14, x15, [sp, #0x10]\r\n  add sp, sp, #0x30\r\n  ret\r\n\r\n_asm_write:\r\nasm_write:\r\n  sub sp, sp, #0x30\r\n  stp x14, x15, [sp, #0x10]\r\n  stp x12, x13, [sp, #0x20]\r\n  sub x1, x1, 128 /* last iteration: rsi == rdx. rsi > rdx = break */\r\n  mov x14, x3     /* set x14 = index into array to start location (x3) */\r\n  eor x13, x13, x13 /* x13 = 0 (for comparison) */\r\n  ldr q16, [x0]\r\nasm_write_pass_loop:\r\n  lsl x12, x14, 2  /* x12 = x14 * 4, because float is 4B */\r\n  add x15, x0, x12 /* ptr (x15) to next element = x0 (base) + x12 (index *4) */\r\n  str q16, [x15]\r\n  str q16, [x15, 16]\r\n  str q16, [x15, 32]\r\n  str q16, [x15, 48]\r\n  str q16, [x15, 64]\r\n  str q16, [x15, 80]\r\n  str q16, [x15, 96]\r\n  str q16, [x15, 112]\r\n  add x14, x14, 32\r\n\r\n  lsl x12, x14, 2\r\n  add x15, x0, x12\r\n  str q16, [x15]\r\n  str q16, [x15, 16]\r\n  str q16, [x15, 32]\r\n  str q16, [x15, 48]\r\n  str q16, [x15, 64]\r\n  str q16, [x15, 80]\r\n  str q16, [x15, 96]\r\n  str q16, [x15, 112]\r\n  add x14, x14, 32\r\n\r\n  lsl x12, x14, 2\r\n  add x15, x0, x12\r\n  str q16, [x15]\r\n  str q16, [x15, 16]\r\n  str q16, [x15, 32]\r\n  str q16, [x15, 48]\r\n  str q16, [x15, 64]\r\n  str q16, [x15, 80]\r\n  str q16, [x15, 96]\r\n  str q16, [x15, 112]\r\n  add x14, x14, 32\r\n\r\n  lsl x12, x14, 2\r\n  add x15, x0, x12\r\n  str q16, [x15]\r\n  str q16, [x15, 16]\r\n  str q16, [x15, 32]\r\n  str q16, [x15, 48]\r\n  str q16, [x15, 64]\r\n  str q16, [x15, 80]\r\n  str q16, [x15, 96]\r\n  str q16, [x15, 112]\r\n  add x14, x14, 32\r\n\r\n  cmp x1, x14 /* if x1 (len - 128) - x14 < 0, loop back around */\r\n  csel x14, x13, x14, LT\r\n  cmp x14, x3\r\n  b.ne asm_write_pass_loop /* skip 
iteration decrement if we're not back to start */\r\n  sub x2, x2, 1\r\n  cbnz x2, asm_write_pass_loop\r\n  add v0.4s, v16.4s, v16.4s\r\n  ldp x12, x13, [sp, #0x20]\r\n  ldp x14, x15, [sp, #0x10]\r\n  add sp, sp, #0x30\r\n  ret\r\n\r\n_asm_cflip:\r\nasm_cflip:\r\n  sub sp, sp, #0x30\r\n  stp x14, x15, [sp, #0x10]\r\n  stp x12, x13, [sp, #0x20]\r\n  sub x1, x1, 128\r\n  mov x14, x3     /* set x14 = index into array to start location (x3) */\r\n  eor x13, x13, x13 /* x13 = 0 (for comparison) */\r\nasm_cflip_pass_loop:\r\n  lsl x12, x14, 2  /* x12 = x14 * 4, because float is 4B */\r\n  add x15, x0, x12 /* ptr (x15) to next element = x0 (base) + x12 (index *4) */\r\n  ldr q16, [x15]\r\n  ldr q17, [x15, 16]\r\n  ldr q18, [x15, 32]\r\n  ldr q19, [x15, 48]\r\n  str q16, [x15, 48]\r\n  str q17, [x15, 32]\r\n  str q18, [x15, 16]\r\n  str q19, [x15]\r\n  ldr q16, [x15, 64]\r\n  ldr q17, [x15, 80]\r\n  ldr q18, [x15, 96]\r\n  ldr q19, [x15, 112]\r\n  str q16, [x15, 112]\r\n  str q17, [x15, 96]\r\n  str q18, [x15, 80]\r\n  str q19, [x15, 64]\r\n\r\n  add x14, x14, 32\r\n  lsl x12, x14, 2\r\n  add x15, x0, x12\r\n  ldr q16, [x15]\r\n  ldr q17, [x15, 16]\r\n  ldr q18, [x15, 32]\r\n  ldr q19, [x15, 48]\r\n  str q16, [x15, 48]\r\n  str q17, [x15, 32]\r\n  str q18, [x15, 16]\r\n  str q19, [x15]\r\n  ldr q16, [x15, 64]\r\n  ldr q17, [x15, 80]\r\n  ldr q18, [x15, 96]\r\n  ldr q19, [x15, 112]\r\n  str q16, [x15, 112]\r\n  str q17, [x15, 96]\r\n  str q18, [x15, 80]\r\n  str q19, [x15, 64]\r\n\r\n  add x14, x14, 32\r\n  lsl x12, x14, 2\r\n  add x15, x0, x12\r\n  ldr q16, [x15]\r\n  ldr q17, [x15, 16]\r\n  ldr q18, [x15, 32]\r\n  ldr q19, [x15, 48]\r\n  str q16, [x15, 48]\r\n  str q17, [x15, 32]\r\n  str q18, [x15, 16]\r\n  str q19, [x15]\r\n  ldr q16, [x15, 64]\r\n  ldr q17, [x15, 80]\r\n  ldr q18, [x15, 96]\r\n  ldr q19, [x15, 112]\r\n  str q16, [x15, 112]\r\n  str q17, [x15, 96]\r\n  str q18, [x15, 80]\r\n  str q19, [x15, 64]\r\n\r\n  add x14, x14, 32\r\n  lsl x12, x14, 2\r\n  add 
x15, x0, x12\r\n  ldr q16, [x15]\r\n  ldr q17, [x15, 16]\r\n  ldr q18, [x15, 32]\r\n  ldr q19, [x15, 48]\r\n  str q16, [x15, 48]\r\n  str q17, [x15, 32]\r\n  str q18, [x15, 16]\r\n  str q19, [x15]\r\n  ldr q16, [x15, 64]\r\n  ldr q17, [x15, 80]\r\n  ldr q18, [x15, 96]\r\n  ldr q19, [x15, 112]\r\n  str q16, [x15, 112]\r\n  str q17, [x15, 96]\r\n  str q18, [x15, 80]\r\n  str q19, [x15, 64]\r\n\r\n  cmp x1, x14 /* if x1 (len - 128) - x14 < 0, loop back around */\r\n  csel x14, x13, x14, LT\r\n  cmp x14, x3\r\n  b.ne asm_cflip_pass_loop /* skip iteration decrement if we're not back to start */\r\n  sub x2, x2, 2\r\n  cbnz x2, asm_cflip_pass_loop\r\n  add v0.4s, v16.4s, v16.4s\r\n  ldp x12, x13, [sp, #0x20]\r\n  ldp x14, x15, [sp, #0x10]\r\n  add sp, sp, #0x30\r\n  ret\r\n\r\n/* x0 = ptr to array (was rcx)\r\n * x1 = arr length (was rdx)\r\n * x2 = iterations (was r8)\r\n * x3 = start (was r9)\r\n */\r\n_asm_copy:\r\nasm_copy:\r\n  sub sp, sp, #0x50\r\n  stp x14, x15, [sp, #0x10]\r\n  stp x12, x13, [sp, #0x20]\r\n  stp x10, x11, [sp, #0x30]\r\n  stp x8, x9, [sp, #0x40]\r\n  asr x11, x1, 1    /* x11 = destination index (length / 2) */\r\n  sub x1, x1, 128\r\n  mov x10, x11      /* use x10 as index into destination */\r\n  mov x14, x3     /* set x14 = index into array to start location (x3) */\r\n  eor x13, x13, x13 /* x13 = 0 (for comparison) */\r\nasm_copy_pass_loop:\r\n  lsl x12, x14, 2  /* x12 = x14 * 4, because float is 4B */\r\n  add x15, x0, x12 /* ptr (x15) to next element = x0 (base) + x12 (index *4) */\r\n  lsl x12, x10, 2  /* x12 = x10 * 4, to calculate destination */\r\n  add x9, x0, x12  /* x9 = ptr to destination */\r\n  ldr q16, [x15]\r\n  ldr q17, [x15, 16]\r\n  ldr q18, [x15, 32]\r\n  ldr q19, [x15, 48]\r\n  ldr q20, [x15, 64]\r\n  ldr q21, [x15, 80]\r\n  ldr q22, [x15, 96]\r\n  ldr q23, [x15, 112]\r\n  str q16, [x9]\r\n  str q17, [x9, 16]\r\n  str q18, [x9, 32]\r\n  str q19, [x9, 48]\r\n  str q20, [x9, 64]\r\n  str q21, [x9, 80]\r\n  str q22, [x9, 
96]\r\n  str q23, [x9, 112]\r\n  add x14, x14, 32\r\n  add x10, x10, 32\r\n\r\n  lsl x12, x14, 2\r\n  add x15, x0, x12\r\n  lsl x12, x10, 2\r\n  add x9, x0, x12\r\n  ldr q16, [x15]\r\n  ldr q17, [x15, 16]\r\n  ldr q18, [x15, 32]\r\n  ldr q19, [x15, 48]\r\n  ldr q20, [x15, 64]\r\n  ldr q21, [x15, 80]\r\n  ldr q22, [x15, 96]\r\n  ldr q23, [x15, 112]\r\n  str q16, [x9]\r\n  str q17, [x9, 16]\r\n  str q18, [x9, 32]\r\n  str q19, [x9, 48]\r\n  str q20, [x9, 64]\r\n  str q21, [x9, 80]\r\n  str q22, [x9, 96]\r\n  str q23, [x9, 112]\r\n  add x14, x14, 32\r\n  add x10, x10, 32\r\n\r\n  cmp x1, x10 /* if destination hits end, loop around */\r\n  csel x14, x13, x14, LT\r\n  csel x10, x11, x10, LT\r\n  cmp x14, x3\r\n  b.ne asm_copy_pass_loop /* skip iteration decrement if we're not back to start */\r\n  sub x2, x2, 1\r\n  cbnz x2, asm_copy_pass_loop\r\n  add v0.4s, v16.4s, v16.4s\r\n  ldp x8, x9, [sp, #0x40]\r\n  ldp x10, x11, [sp, #0x30]\r\n  ldp x12, x13, [sp, #0x20]\r\n  ldp x14, x15, [sp, #0x10]\r\n  add sp, sp, #0x50\r\n  ret\r\n\r\n/* x0 = ptr to array (was rcx)\r\n * x1 = arr length (was rdx)\r\n * x2 = iterations (was r8)\r\n * x3 = start (was r9)\r\n */\r\nasm_add:\r\n_asm_add:\r\n  sub sp, sp, #0x30\r\n  stp x14, x15, [sp, #0x10]\r\n  stp x12, x13, [sp, #0x20]\r\n  sub x1, x1, 128\r\n  mov x14, x3     /* set x14 = index into array to start location (x3) */\r\n  eor x13, x13, x13 /* x13 = 0 (for comparison) */\r\n  ldr q15, [x0]\r\nasm_add_pass_loop:\r\n  lsl x12, x14, 2  /* x12 = x14 * 4, because float is 4B */\r\n  add x15, x0, x12 /* ptr (x15) to next element = x0 (base) + x12 (index *4) */\r\n  ldr q16, [x15]\r\n  ldr q17, [x15, 16]\r\n  ldr q18, [x15, 32]\r\n  ldr q19, [x15, 48]\r\n  ldr q20, [x15, 64]\r\n  ldr q21, [x15, 80]\r\n  ldr q22, [x15, 96]\r\n  ldr q23, [x15, 112]\r\n  add v16.4s, v16.4s, v15.4s\r\n  add v17.4s, v17.4s, v15.4s\r\n  add v18.4s, v18.4s, v15.4s\r\n  add v19.4s, v19.4s, v15.4s\r\n  add v20.4s, v20.4s, v15.4s\r\n  add v21.4s, v21.4s, 
v15.4s\r\n  add v22.4s, v22.4s, v15.4s\r\n  add v23.4s, v23.4s, v15.4s\r\n  str q16, [x15]\r\n  str q17, [x15, 16]\r\n  str q18, [x15, 32]\r\n  str q19, [x15, 48]\r\n  str q20, [x15, 64]\r\n  str q21, [x15, 80]\r\n  str q22, [x15, 96]\r\n  str q23, [x15, 112]\r\n  add x14, x14, 32\r\n\r\n  lsl x12, x14, 2\r\n  add x15, x0, x12\r\n  ldr q16, [x15]\r\n  ldr q17, [x15, 16]\r\n  ldr q18, [x15, 32]\r\n  ldr q19, [x15, 48]\r\n  ldr q20, [x15, 64]\r\n  ldr q21, [x15, 80]\r\n  ldr q22, [x15, 96]\r\n  ldr q23, [x15, 112]\r\n  add v16.4s, v16.4s, v15.4s\r\n  add v17.4s, v17.4s, v15.4s\r\n  add v18.4s, v18.4s, v15.4s\r\n  add v19.4s, v19.4s, v15.4s\r\n  add v20.4s, v20.4s, v15.4s\r\n  add v21.4s, v21.4s, v15.4s\r\n  add v22.4s, v22.4s, v15.4s\r\n  add v23.4s, v23.4s, v15.4s\r\n  str q16, [x15]\r\n  str q17, [x15, 16]\r\n  str q18, [x15, 32]\r\n  str q19, [x15, 48]\r\n  str q20, [x15, 64]\r\n  str q21, [x15, 80]\r\n  str q22, [x15, 96]\r\n  str q23, [x15, 112]\r\n  add x14, x14, 32\r\n\r\n  lsl x12, x14, 2\r\n  add x15, x0, x12\r\n  ldr q16, [x15]\r\n  ldr q17, [x15, 16]\r\n  ldr q18, [x15, 32]\r\n  ldr q19, [x15, 48]\r\n  ldr q20, [x15, 64]\r\n  ldr q21, [x15, 80]\r\n  ldr q22, [x15, 96]\r\n  ldr q23, [x15, 112]\r\n  add v16.4s, v16.4s, v15.4s\r\n  add v17.4s, v17.4s, v15.4s\r\n  add v18.4s, v18.4s, v15.4s\r\n  add v19.4s, v19.4s, v15.4s\r\n  add v20.4s, v20.4s, v15.4s\r\n  add v21.4s, v21.4s, v15.4s\r\n  add v22.4s, v22.4s, v15.4s\r\n  add v23.4s, v23.4s, v15.4s\r\n  str q16, [x15]\r\n  str q17, [x15, 16]\r\n  str q18, [x15, 32]\r\n  str q19, [x15, 48]\r\n  str q20, [x15, 64]\r\n  str q21, [x15, 80]\r\n  str q22, [x15, 96]\r\n  str q23, [x15, 112]\r\n  add x14, x14, 32\r\n\r\n  lsl x12, x14, 2\r\n  add x15, x0, x12\r\n  ldr q16, [x15]\r\n  ldr q17, [x15, 16]\r\n  ldr q18, [x15, 32]\r\n  ldr q19, [x15, 48]\r\n  ldr q20, [x15, 64]\r\n  ldr q21, [x15, 80]\r\n  ldr q22, [x15, 96]\r\n  ldr q23, [x15, 112]\r\n  add v16.4s, v16.4s, v15.4s\r\n  add v17.4s, v17.4s, v15.4s\r\n  add 
v18.4s, v18.4s, v15.4s\r\n  add v19.4s, v19.4s, v15.4s\r\n  add v20.4s, v20.4s, v15.4s\r\n  add v21.4s, v21.4s, v15.4s\r\n  add v22.4s, v22.4s, v15.4s\r\n  add v23.4s, v23.4s, v15.4s\r\n  str q16, [x15]\r\n  str q17, [x15, 16]\r\n  str q18, [x15, 32]\r\n  str q19, [x15, 48]\r\n  str q20, [x15, 64]\r\n  str q21, [x15, 80]\r\n  str q22, [x15, 96]\r\n  str q23, [x15, 112]\r\n  add x14, x14, 32\r\n\r\n  cmp x1, x14 /* if x1 (len - 128) - x14 < 0, loop back around */\r\n  csel x14, x13, x14, LT\r\n  cmp x14, x3\r\n  b.ne asm_add_pass_loop /* skip iteration decrement if we're not back to start */\r\n  sub x2, x2, 2\r\n  cmp x2, 0\r\n  b.gt asm_add_pass_loop\r\n  ldr q0, [x0]\r\n  ldp x12, x13, [sp, #0x20]\r\n  ldp x14, x15, [sp, #0x10]\r\n  add sp, sp, #0x30\r\n  ret\r\n\r\n\r\n/* Tests for cache bank conflicts by reading from two locations, spaced by some\r\n   number of bytes\r\n   x0 = ptr to array. first 32-bit int = increment step, because I'm too lazy to mess with the stack\r\n   x1 = array length, in bytes\r\n   x2 = load spacing, in bytes\r\n   x3 = iter count (number of loads to execute) */\r\nreadbankconflict:\r\nreadbankconflict128:\r\n_readbankconflict:\r\n   sub sp, sp, #0x40\r\n   stp x14, x15, [sp, #0x10]\r\n   stp x12, x13, [sp, #0x20]\r\n   stp x10, x11, [sp, #0x30]\r\n   cmp x1, x2               /* basic check - subtract load spacing from array len */\r\n   b.le readbankconflict_end /* exit immediately if we don't have enough space to iterate */\r\n   sub x12, x1, 20          /* use x12 to check bytes remaining */\r\n   mov x14, x0\r\n   add x13, x0, x2           /* x14 = first load location, x13 = second load location */\r\n   sub x12, x12, 20          /* we're reading 20B ahead */\r\n   ldr x11, [x0]   /* increment, not used right now */\r\nreadbankconflict_loop:\r\n   ldr x10, [x14]\r\n   ldr x15, [x13]\r\n   add x14, x14, 1\r\n   add x13, x13, 1\r\n\r\n   ldr x10, [x14]\r\n   ldr x15, [x13]\r\n   add x14, x14, 1\r\n   add x13, x13, 1\r\n\r\n   ldr 
x10, [x14]\r\n   ldr x15, [x13]\r\n   add x14, x14, 1\r\n   add x13, x13, 1\r\n\r\n   ldr x10, [x14]\r\n   ldr x15, [x13]\r\n   add x14, x14, 1\r\n   add x13, x13, 1\r\n\r\n   ldr x10, [x14]\r\n   ldr x15, [x13]\r\n   add x14, x14, 1\r\n   add x13, x13, 1\r\n\r\n   ldr x10, [x14]\r\n   ldr x15, [x13]\r\n   add x14, x14, 1\r\n   add x13, x13, 1\r\n\r\n   ldr x10, [x14]\r\n   ldr x15, [x13]\r\n   add x14, x14, 1\r\n   add x13, x13, 1\r\n\r\n   ldr x10, [x14]\r\n   ldr x15, [x13]\r\n   add x14, x14, 1\r\n   add x13, x13, 1\r\n\r\n   ldr x10, [x14]\r\n   ldr x15, [x13]\r\n   add x14, x14, 1\r\n   add x13, x13, 1\r\n\r\n   ldr x10, [x14]\r\n   ldr x15, [x13]\r\n   add x14, x14, 1\r\n   add x13, x13, 1\r\n\r\n   sub x12, x12, 20\r\n   sub x3, x3, 20\r\n   cmp x3, 0\r\n   b.le readbankconflict_end  /* iteration count = exit condition */\r\n   cmp x12, 0                 /* check bytes remaining */\r\n   b.ge readbankconflict_loop /* if positive or equal, continue loop */\r\n   sub x12, x1, 20     /* reset bytes remaining */\r\n   mov x14, x1\r\n   add x13, x1, x2\r\n   b readbankconflict_loop\r\nreadbankconflict_end:\r\n   ldp x10, x11, [sp, #0x30]\r\n   ldp x12, x13, [sp, #0x20]\r\n   ldp x14, x15, [sp, #0x10]\r\n   add sp, sp, #0x40\r\n   ret\r\n\r\n/* x0: ptr to array\r\n   x1: array size in bytes */\r\nflush_icache:\r\n_flush_icache:\r\n  sub sp, sp, #0x20\r\n  stp x14, x15, [sp, #0x10]\r\n  asr x0, x0, 6   /* align to 64B cacheline */\r\n  lsl x0, x0, 6\r\n  mov x14, x0\r\n  mov x15, x1\r\nflush_icache_clean_dcache_loop:\r\n  dc civac, x14\r\n  add x14, x14, 64\r\n  sub x15, x15, 64\r\n  b.gt flush_icache_clean_dcache_loop\r\n  dsb ish\r\n  mov x14, x0\r\n  mov x15, x1\r\nflush_icache_clean_icache_loop:\r\n  ic ivau, x14\r\n  add x14, x14, 64\r\n  sub x15, x15, 64\r\n  b.gt flush_icache_clean_icache_loop\r\n  dsb ish\r\n  isb\r\n  ldp x14, x15, [sp, #0x10]\r\n  add sp, sp, #0x20\r\n  ret\r\n"
  },
  {
    "path": "MemoryBandwidth/MemoryBandwidth_riscv.s",
    "content": ".text\n\n.global asm_read\n.global asm_write\n.global asm_cflip\n.global asm_copy\n.global asm_add\n.global flush_icache\n.global readbankconflict\n.global readbankconflict128\n\n/* a0 = arr, a1 = arr_len, a2 = iterations */\nasm_read:\n  addi sp, sp, -16\n  sd s0, (sp)\n  mv t4, x0\n  addi t4, t4, 16     /* ??? */\n  vsetvli t0, t4, e32  /* request vector length of some big value? always gives back 4 but not if we ask for 4*/\n  mv t4, t0\n  mv t1, a0           /* t1 = current address */\n  slli t0, t0, 4      /* increment every loop by vec len * unroll factor of 4 */\n  slli a1, a1, 2      /* get array length in bytes */\n  add a1, a1, a0      /* array bound address */\n  sub a1, a1, t0      /* make sure there's enough room for the last iteration */\n  mv t3, x0           /* t3 = iteration counter */\n\n  /* precompute offsets */\n  slli t4, t4, 2  /* vec len given in number of 32-bit elements */\n  mv t5, t4\n  mv t6, t4\n  slli t5, t5, 1 /* vec len * 2 */\n  add t6, t6, t5 /* t6 = vec len * 3 */\nasm_read_pass_loop:\n  vlw.v v0, (t1)\n  add s0, t1, t4\n  vlw.v v1, (s0)\n  add s0, t1, t5\n  vlw.v v2, (s0)\n  add s0, t1, t6\n  vlw.v v3, (s0)\n  add t1, t1, t0      /* increment address */\n  blt t1, a1, asm_read_pass_loop\n  addi t3, t3, 1\n  mv t1, a0           /* reset array addr*/\n  blt t3, a2, asm_read_pass_loop\n  ld s0, (sp)\n  addi sp, sp, 16\n  fld fa0, (a0)\n  ret\n\nasm_write:\n  addi sp, sp, -16\n  sd s0, (sp)\n  mv t4, x0\n  addi t4, t4, 16     /* ??? */\n  vsetvli t0, t4, e32  /* request vector length of some big value? 
always gives back 4 but not if we ask for 4*/\n  mv t4, t0\n  mv t1, a0           /* t1 = current address */\n  slli t0, t0, 4      /* increment every loop by vec len * unroll factor of 4 */\n  slli a1, a1, 2      /* get array length in bytes */\n  add a1, a1, a0      /* array bound address */\n  sub a1, a1, t0      /* make sure there's enough room for the last iteration */\n  mv t3, x0           /* t3 = iteration counter */\n\n  /* precompute offsets */\n  slli t4, t4, 2  /* vec len given in number of 32-bit elements */\n  mv t5, t4\n  mv t6, t4\n  slli t5, t5, 1 /* vec len * 2 */\n  add t6, t6, t5 /* t6 = vec len * 3 */\n  vlw.v v0, (a0)\nasm_write_pass_loop:\n  vsw.v v0, (t1)\n  add s0, t1, t4\n  vsw.v v1, (s0)\n  add s0, t1, t5\n  vsw.v v2, (s0)\n  add s0, t1, t6\n  vsw.v v3, (s0)\n  add t1, t1, t0      /* increment address */\n  blt t1, a1, asm_write_pass_loop\n  addi t3, t3, 1\n  mv t1, a0           /* reset array addr*/\n  blt t3, a2, asm_write_pass_loop\n  ld s0, (sp)\n  addi sp, sp, 16\n  fld fa0, (a0)\n  ret\n\nasm_copy:\n  ret\n\nasm_add:\n  addi sp, sp, -16\n  sd s0, (sp)\n  mv t4, x0\n  addi t4, t4, 16     /* ??? */\n  vsetvli t0, t4, e32  /* request vector length of some big value? 
always gives back 4 but not if we ask for 4*/\n  mv t4, t0\n  mv t1, a0           /* t1 = current address */\n  slli t0, t0, 4      /* increment every loop by vec len * unroll factor of 4 */\n  slli a1, a1, 2      /* get array length in bytes */\n  add a1, a1, a0      /* array bound address */\n  sub a1, a1, t0      /* make sure there's enough room for the last iteration */\n  mv t3, x0           /* t3 = iteration counter */\n\n  /* precompute offsets */\n  slli t4, t4, 2  /* vec len given in number of 32-bit elements */\n  mv t5, t4\n  mv t6, t4\n  slli t5, t5, 1 /* vec len * 2 */\n  add t6, t6, t5 /* t6 = vec len * 3 */\n  vlw.v v4, (a0)\nasm_add_pass_loop:\n  vlw.v v0, (t1)\n  vadd.vv v0, v0, v4\n  vsw.v v0, (t1)\n\n  add s0, t1, t4\n  vlw.v v1, (s0)\n  vadd.vv v1, v1, v4\n  vsw.v v1, (s0)\n\n  add s0, t1, t5\n  vlw.v v2, (s0)\n  vadd.vv v2, v2, v4\n  vsw.v v2, (s0)\n\n  add s0, t1, t6\n  vlw.v v3, (s0)\n  vadd.vv v3, v3, v4\n  vsw.v v3, (s0)\n\n  add t1, t1, t0      /* increment address */\n  blt t1, a1, asm_add_pass_loop\n  addi t3, t3, 1\n  mv t1, a0           /* reset array addr*/\n  blt t3, a2, asm_add_pass_loop\n  ld s0, (sp)\n  addi sp, sp, 16\n  fld fa0, (a0)\n  ret\n\nasm_cflip:\n  ret\n\nreadbankconflict:\n  ret\n\nreadbankconflict128:\n  ret\n"
  },
  {
    "path": "MemoryBandwidth/MemoryBandwidth_x86.s",
    "content": ".text\r\n\r\n.global asm_read\r\n.global asm_write\r\n.global asm_copy\r\n.global asm_cflip\r\n.global asm_add\r\n.global sse_read\r\n.global sse_write\r\n.global sse_ntwrite\r\n.global avx512_read\r\n.global avx512_write\r\n.global avx512_copy\r\n.global avx512_add\r\n.global readbankconflict\r\n.global readbankconflict128\r\n\r\n.global repstosd_write\r\n.global repstosb_write\r\n.global repmovsb_copy\r\n.global repmovsd_copy\r\n\r\nasm_read:\r\n  push %rsi\r\n  push %rdi\r\n  push %rbx\r\n  push %r15\r\n  push %r14\r\n  mov $256, %r15 /* load in blocks of 256 bytes */\r\n  sub $128, %rdx /* last iteration: rsi == rdx. rsi > rdx = break */\r\n  mov %r9, %rsi  /* assume we're passed in an aligned start location O.o */\r\n  xor %rbx, %rbx\r\n  lea (%rcx,%rsi,4), %rdi\r\n  mov %rdi, %r14\r\navx_asm_read_pass_loop:\r\n\r\n  vmovaps (%rdi), %ymm0\r\n  vmovaps 32(%rdi), %ymm1\r\n  vmovaps 64(%rdi), %ymm2\r\n  vmovaps 96(%rdi), %ymm3\r\n  vmovaps 128(%rdi), %ymm0\r\n  vmovaps 160(%rdi), %ymm1\r\n  vmovaps 192(%rdi), %ymm2\r\n  vmovaps 224(%rdi), %ymm3\r\n  add $64, %rsi\r\n  add %r15, %rdi\r\n  vmovaps (%rdi), %ymm0\r\n  vmovaps 32(%rdi), %ymm1\r\n  vmovaps 64(%rdi), %ymm2\r\n  vmovaps 96(%rdi), %ymm3\r\n  vmovaps 128(%rdi), %ymm0\r\n  vmovaps 160(%rdi), %ymm1\r\n  vmovaps 192(%rdi), %ymm2\r\n  vmovaps 224(%rdi), %ymm3\r\n  add $64, %rsi\r\n  add %r15, %rdi\r\n  cmp %rsi, %rdx\r\n  jge asm_avx_test_iteration_count\r\n  mov %rbx, %rsi\r\n  lea (%rcx,%rsi,4), %rdi /* back to start */\r\nasm_avx_test_iteration_count:\r\n\r\n  cmp %rsi, %r9\r\n  jnz avx_asm_read_pass_loop /* skip iteration decrement if we're not back to start */\r\n  dec %r8\r\n  jnz avx_asm_read_pass_loop\r\n  pop %r14\r\n  pop %r15\r\n  pop %rbx\r\n  pop %rdi\r\n  pop %rsi\r\n  ret\r\n\r\nasm_write:\r\n  push %rsi\r\n  push %rdi\r\n  push %rbx\r\n  push %r15\r\n  push %r14\r\n  mov $256, %r15 /* load in blocks of 256 bytes */\r\n  sub $128, %rdx /* last iteration: rsi == rdx. 
rsi > rdx = break */\r\n  mov %r9, %rsi  /* assume we're passed in an aligned start location O.o */\r\n  xor %rbx, %rbx\r\n  lea (%rcx,%rsi,4), %rdi\r\n  mov %rdi, %r14\r\n  vmovaps (%rcx), %ymm0\r\navx_asm_write_pass_loop:\r\n\r\n  vmovaps %ymm0, (%rdi)\r\n  vmovaps %ymm0, 32(%rdi)\r\n  vmovaps %ymm0, 64(%rdi)\r\n  vmovaps %ymm0, 96(%rdi)\r\n  vmovaps %ymm0, 128(%rdi)\r\n  vmovaps %ymm0, 160(%rdi)\r\n  vmovaps %ymm0, 192(%rdi)\r\n  vmovaps %ymm0, 224(%rdi)\r\n  add $64, %rsi\r\n  add %r15, %rdi\r\n  vmovaps %ymm0, (%rdi)\r\n  vmovaps %ymm0, 32(%rdi)\r\n  vmovaps %ymm0, 64(%rdi)\r\n  vmovaps %ymm0, 96(%rdi)\r\n  vmovaps %ymm0, 128(%rdi)\r\n  vmovaps %ymm0, 160(%rdi)\r\n  vmovaps %ymm0, 192(%rdi)\r\n  vmovaps %ymm0, 224(%rdi)\r\n  add $64, %rsi\r\n  add %r15, %rdi\r\n  cmp %rsi, %rdx\r\n  jge asm_avx_write_iteration_count\r\n  mov %rbx, %rsi\r\n  lea (%rcx,%rsi,4), %rdi /* back to start */\r\nasm_avx_write_iteration_count:\r\n\r\n  cmp %rsi, %r9\r\n  jnz avx_asm_write_pass_loop /* skip iteration decrement if we're not back to start */\r\n  dec %r8\r\n  jnz avx_asm_write_pass_loop\r\n  pop %r14\r\n  pop %r15\r\n  pop %rbx\r\n  pop %rdi\r\n  pop %rsi\r\n  ret\r\n\r\n/* rcx = ptr to arr\r\n   rdx = arr_length\r\n   r8 = iterations */\r\nasm_copy:\r\n  push %rsi\r\n  push %rdi\r\n  push %rbx\r\n  push %r15\r\n  push %r14\r\n  push %r13\r\n  xor %rsi, %rsi\r\n  mov %rdx, %r9\r\n  shr $1, %r9    /* start destination at array + length / 2 */\r\n  mov $256, %r15 /* load in blocks of 128 bytes */\r\n  mov %r9, %r13\r\n  sub $64, %r13 /* place loop limit 256B before end */\r\n  lea (%rcx,%rsi,4), %rdi\r\n  lea (%rcx,%r9,4), %r14\r\navx_asm_copy_pass_loop:\r\n\r\n  vmovaps (%rdi), %ymm0\r\n  vmovaps 32(%rdi), %ymm1\r\n  vmovaps 64(%rdi), %ymm2\r\n  vmovaps 96(%rdi), %ymm3\r\n  vmovaps 128(%rdi), %ymm4\r\n  vmovaps 160(%rdi), %ymm5\r\n  vmovaps 192(%rdi), %ymm6\r\n  vmovaps 224(%rdi), %ymm7\r\n  vmovaps %ymm0, (%r14)\r\n  vmovaps %ymm1, 32(%r14)\r\n  vmovaps %ymm2, 64(%r14)\r\n 
 vmovaps %ymm3, 96(%r14)\r\n  vmovaps %ymm4, 128(%r14)\r\n  vmovaps %ymm5, 160(%r14)\r\n  vmovaps %ymm6, 192(%r14)\r\n  vmovaps %ymm7, 224(%r14)\r\n  add $64, %rsi\r\n  add %r15, %rdi  /* increment src/dst pointers */\r\n  add %r15, %r14\r\n  cmp %rsi, %r13   /* end location is at half */\r\n  jge avx_asm_copy_pass_loop\r\n  xor %rsi, %rsi\r\n  lea (%rcx,%rsi,4), %rdi /* back to start */\r\n  lea (%rcx,%r9,4), %r14\r\n  dec %r8                 /* decrement iteration counter */\r\n  jnz avx_asm_copy_pass_loop\r\n  pop %r13\r\n  pop %r14\r\n  pop %r15\r\n  pop %rbx\r\n  pop %rdi\r\n  pop %rsi\r\n  ret\r\n\r\n\r\nasm_cflip:\r\n  push %rsi\r\n  push %rdi\r\n  push %rbx\r\n  push %r15\r\n  push %r14\r\n  mov $256, %r15 /* load in blocks of 256 bytes */\r\n  sub $128, %rdx /* last iteration: rsi == rdx. rsi > rdx = break */\r\n  mov %r9, %rsi  /* assume we're passed in an aligned start location O.o */\r\n  xor %rbx, %rbx\r\n  lea (%rcx,%rsi,4), %rdi\r\n  mov %rdi, %r14\r\navx_asm_cflip_pass_loop:\r\n\r\n  vmovaps (%rdi), %ymm0\r\n  vmovaps 32(%rdi), %ymm1\r\n  vmovaps 64(%rdi), %ymm2\r\n  vmovaps 96(%rdi), %ymm3\r\n  vmovaps %ymm0, 96(%rdi)\r\n  vmovaps %ymm1, 64(%rdi)\r\n  vmovaps %ymm2, 32(%rdi)\r\n  vmovaps %ymm3, (%rdi)\r\n  vmovaps 128(%rdi), %ymm0\r\n  vmovaps 160(%rdi), %ymm1\r\n  vmovaps 192(%rdi), %ymm2\r\n  vmovaps 224(%rdi), %ymm3\r\n  vmovaps %ymm0, 224(%rdi)\r\n  vmovaps %ymm1, 192(%rdi)\r\n  vmovaps %ymm2, 160(%rdi)\r\n  vmovaps %ymm3, 128(%rdi)\r\n  add $64, %rsi\r\n  add %r15, %rdi\r\n  vmovaps (%rdi), %ymm0\r\n  vmovaps 32(%rdi), %ymm1\r\n  vmovaps 64(%rdi), %ymm2\r\n  vmovaps 96(%rdi), %ymm3\r\n  vmovaps %ymm0, 96(%rdi)\r\n  vmovaps %ymm1, 64(%rdi)\r\n  vmovaps %ymm2, 32(%rdi)\r\n  vmovaps %ymm3, (%rdi)\r\n  vmovaps 128(%rdi), %ymm0\r\n  vmovaps 160(%rdi), %ymm1\r\n  vmovaps 192(%rdi), %ymm2\r\n  vmovaps 224(%rdi), %ymm3\r\n  vmovaps %ymm0, 224(%rdi)\r\n  vmovaps %ymm1, 192(%rdi)\r\n  vmovaps %ymm2, 160(%rdi)\r\n  vmovaps %ymm3, 128(%rdi)\r\n  add $64, 
%rsi\r\n  add %r15, %rdi\r\n  cmp %rsi, %rdx\r\n  jge asm_avx_cflip_iteration_count\r\n  mov %rbx, %rsi\r\n  lea (%rcx,%rsi,4), %rdi /* back to start */\r\nasm_avx_cflip_iteration_count:\r\n  cmp %rsi, %r9\r\n  jnz avx_asm_cflip_pass_loop /* skip iteration decrement if we're not back to start */\r\n  sub $2, %r8  /* each iteration counts as two (hitting each element twice) */\r\n  jnz avx_asm_cflip_pass_loop\r\n  pop %r14\r\n  pop %r15\r\n  pop %rbx\r\n  pop %rdi\r\n  pop %rsi\r\n  ret\r\n\r\nasm_add:\r\n  push %rsi\r\n  push %rdi\r\n  push %rbx\r\n  push %r15\r\n  push %r14\r\n  mov $256, %r15 /* load in blocks of 256 bytes */\r\n  sub $128, %rdx /* last iteration: rsi == rdx. rsi > rdx = break */\r\n  mov %r9, %rsi  /* assume we're passed in an aligned start location O.o */\r\n  xor %rbx, %rbx\r\n  lea (%rcx,%rsi,4), %rdi\r\n  mov %rdi, %r14\r\n  vmovaps (%rdi), %ymm4\r\navx_asm_add_pass_loop:\r\n  vaddps (%rdi), %ymm4, %ymm0\r\n  vaddps 32(%rdi), %ymm4, %ymm1\r\n  vaddps 64(%rdi), %ymm4, %ymm2\r\n  vaddps 96(%rdi), %ymm4, %ymm3\r\n  vmovaps %ymm0, (%rdi)\r\n  vmovaps %ymm1, 32(%rdi)\r\n  vmovaps %ymm2, 64(%rdi)\r\n  vmovaps %ymm3, 96(%rdi)\r\n  vaddps 128(%rdi), %ymm4, %ymm0\r\n  vaddps 160(%rdi), %ymm4, %ymm1\r\n  vaddps 192(%rdi), %ymm4, %ymm2\r\n  vaddps 224(%rdi), %ymm4, %ymm3\r\n  vmovaps %ymm0, 128(%rdi)\r\n  vmovaps %ymm1, 160(%rdi)\r\n  vmovaps %ymm2, 192(%rdi)\r\n  vmovaps %ymm3, 224(%rdi)\r\n  add $64, %rsi\r\n  add %r15, %rdi\r\n  vaddps (%rdi), %ymm4, %ymm0\r\n  vaddps 32(%rdi), %ymm4, %ymm1\r\n  vaddps 64(%rdi), %ymm4, %ymm2\r\n  vaddps 96(%rdi), %ymm4, %ymm3\r\n  vmovaps %ymm0, (%rdi)\r\n  vmovaps %ymm1, 32(%rdi)\r\n  vmovaps %ymm2, 64(%rdi)\r\n  vmovaps %ymm3, 96(%rdi)\r\n  vaddps 128(%rdi), %ymm4, %ymm0\r\n  vaddps 160(%rdi), %ymm4, %ymm1\r\n  vaddps 192(%rdi), %ymm4, %ymm2\r\n  vaddps 224(%rdi), %ymm4, %ymm3\r\n  vmovaps %ymm0, 128(%rdi)\r\n  vmovaps %ymm1, 160(%rdi)\r\n  vmovaps %ymm2, 192(%rdi)\r\n  vmovaps %ymm3, 224(%rdi)\r\n  add $64, 
%rsi\r\n  add %r15, %rdi\r\n  cmp %rsi, %rdx\r\n  jge asm_avx_add_iteration_count\r\n  mov %rbx, %rsi\r\n  lea (%rcx,%rsi,4), %rdi /* back to start */\r\nasm_avx_add_iteration_count:\r\n  cmp %rsi, %r9\r\n  jnz avx_asm_add_pass_loop /* skip iteration decrement if we're not back to start */\r\n  sub $2, %r8\r\n  jg avx_asm_add_pass_loop\r\n  pop %r14\r\n  pop %r15\r\n  pop %rbx\r\n  pop %rdi\r\n  pop %rsi\r\n  movss (%rcx), %xmm0 /* return arr[0] through rcx (array pointer); rdi was just restored to the caller's value */\r\n  ret\r\n\r\nsse_read:\r\n  push %rsi\r\n  push %rdi\r\n  push %rbx\r\n  push %r15\r\n  push %r14\r\n  mov $256, %r15 /* load in blocks of 256 bytes */\r\n  sub $128, %rdx /* last iteration: rsi == rdx. rsi > rdx = break */\r\n  mov %r9, %rsi  /* assume we're passed in an aligned start location O.o */\r\n  xor %rbx, %rbx\r\n  lea (%rcx,%rsi,4), %rdi\r\n  mov %rdi, %r14\r\nsse_read_pass_loop:\r\n\r\n  movaps (%rdi), %xmm0\r\n  movaps 16(%rdi), %xmm1\r\n  movaps 32(%rdi), %xmm2\r\n  movaps 48(%rdi), %xmm3\r\n  movaps 64(%rdi), %xmm0\r\n  movaps 80(%rdi), %xmm1\r\n  movaps 96(%rdi), %xmm2\r\n  movaps 112(%rdi), %xmm3\r\n  movaps 128(%rdi), %xmm0\r\n  movaps 144(%rdi), %xmm1\r\n  movaps 160(%rdi), %xmm2\r\n  movaps 176(%rdi), %xmm3\r\n  movaps 192(%rdi), %xmm0\r\n  movaps 208(%rdi), %xmm1\r\n  movaps 224(%rdi), %xmm2\r\n  movaps 240(%rdi), %xmm3\r\n  add $64, %rsi\r\n  add %r15, %rdi\r\n  movaps (%rdi), %xmm0\r\n  movaps 16(%rdi), %xmm1\r\n  movaps 32(%rdi), %xmm2\r\n  movaps 48(%rdi), %xmm3\r\n  movaps 64(%rdi), %xmm0\r\n  movaps 80(%rdi), %xmm1\r\n  movaps 96(%rdi), %xmm2\r\n  movaps 112(%rdi), %xmm3\r\n  movaps 128(%rdi), %xmm0\r\n  movaps 144(%rdi), %xmm1\r\n  movaps 160(%rdi), %xmm2\r\n  movaps 176(%rdi), %xmm3\r\n  movaps 192(%rdi), %xmm0\r\n  movaps 208(%rdi), %xmm1\r\n  movaps 224(%rdi), %xmm2\r\n  movaps 240(%rdi), %xmm3\r\n  add $64, %rsi\r\n  add %r15, %rdi\r\n  cmp %rsi, %rdx\r\n  jge sse_test_iteration_count\r\n  mov %rbx, %rsi\r\n  lea (%rcx,%rsi,4), %rdi /* back to start */\r\nsse_test_iteration_count:\r\n\r\n  cmp %rsi, 
%r9\r\n  jnz sse_read_pass_loop /* skip iteration decrement if we're not back to start */\r\n  dec %r8\r\n  jnz sse_read_pass_loop\r\n  pop %r14\r\n  pop %r15\r\n  pop %rbx\r\n  pop %rdi\r\n  pop %rsi\r\n  ret\r\n\r\nsse_write:\r\n  push %rsi\r\n  push %rdi\r\n  push %rbx\r\n  push %r15\r\n  push %r14\r\n  mov $256, %r15 /* load in blocks of 256 bytes */\r\n  sub $128, %rdx /* last iteration: rsi == rdx. rsi > rdx = break */\r\n  mov %r9, %rsi  /* assume we're passed in an aligned start location O.o */\r\n  xor %rbx, %rbx\r\n  lea (%rcx,%rsi,4), %rdi\r\n  mov %rdi, %r14\r\n  movaps (%rdi), %xmm0\r\n  movaps 16(%rdi), %xmm1\r\n  movaps 32(%rdi), %xmm2\r\n  movaps 48(%rdi), %xmm3\r\nsse_write_pass_loop:\r\n\r\n  movaps %xmm0, (%rdi)\r\n  movaps %xmm1, 16(%rdi)\r\n  movaps %xmm2, 32(%rdi)\r\n  movaps %xmm3, 48(%rdi)\r\n  movaps %xmm0, 64(%rdi)\r\n  movaps %xmm1, 80(%rdi)\r\n  movaps %xmm2, 96(%rdi)\r\n  movaps %xmm3, 112(%rdi)\r\n  movaps %xmm0, 128(%rdi)\r\n  movaps %xmm1, 144(%rdi)\r\n  movaps %xmm2, 160(%rdi)\r\n  movaps %xmm3, 176(%rdi)\r\n  movaps %xmm0, 192(%rdi)\r\n  movaps %xmm1, 208(%rdi)\r\n  movaps %xmm2, 224(%rdi)\r\n  movaps %xmm3, 240(%rdi)\r\n  add $64, %rsi\r\n  add %r15, %rdi\r\n  movaps %xmm0, (%rdi)\r\n  movaps %xmm1, 16(%rdi)\r\n  movaps %xmm2, 32(%rdi)\r\n  movaps %xmm3, 48(%rdi)\r\n  movaps %xmm0, 64(%rdi)\r\n  movaps %xmm1, 80(%rdi)\r\n  movaps %xmm2, 96(%rdi)\r\n  movaps %xmm3, 112(%rdi)\r\n  movaps %xmm0, 128(%rdi)\r\n  movaps %xmm1, 144(%rdi)\r\n  movaps %xmm2, 160(%rdi)\r\n  movaps %xmm3, 176(%rdi)\r\n  movaps %xmm0, 192(%rdi)\r\n  movaps %xmm1, 208(%rdi)\r\n  movaps %xmm2, 224(%rdi)\r\n  movaps %xmm3, 240(%rdi)\r\n  add $64, %rsi\r\n  add %r15, %rdi\r\n  cmp %rsi, %rdx\r\n  jge sse_write_iteration_count\r\n  mov %rbx, %rsi\r\n  lea (%rcx,%rsi,4), %rdi /* back to start */\r\nsse_write_iteration_count:\r\n\r\n  cmp %rsi, %r9\r\n  jnz sse_write_pass_loop /* skip iteration decrement if we're not back to start */\r\n  dec %r8\r\n  jnz 
sse_write_pass_loop\r\n  movaps (%rcx), %xmm0\r\n  pop %r14\r\n  pop %r15\r\n  pop %rbx\r\n  pop %rdi\r\n  pop %rsi\r\n  ret\r\n\r\nsse_ntwrite:\r\n  push %rsi\r\n  push %rdi\r\n  push %rbx\r\n  push %r15\r\n  push %r14\r\n  mov $256, %r15 /* load in blocks of 256 bytes */\r\n  sub $128, %rdx /* last iteration: rsi == rdx. rsi > rdx = break */\r\n  mov %r9, %rsi  /* assume we're passed in an aligned start location O.o */\r\n  xor %rbx, %rbx\r\n  lea (%rcx,%rsi,4), %rdi\r\n  mov %rdi, %r14\r\n  movaps (%rdi), %xmm0\r\n  movaps 16(%rdi), %xmm1\r\n  movaps 32(%rdi), %xmm2\r\n  movaps 48(%rdi), %xmm3\r\nsse_ntwrite_pass_loop:\r\n  movntps %xmm0, (%rdi)\r\n  movntps %xmm1, 16(%rdi)\r\n  movntps %xmm2, 32(%rdi)\r\n  movntps %xmm3, 48(%rdi)\r\n  movntps %xmm0, 64(%rdi)\r\n  movntps %xmm1, 80(%rdi)\r\n  movntps %xmm2, 96(%rdi)\r\n  movntps %xmm3, 112(%rdi)\r\n  movntps %xmm0, 128(%rdi)\r\n  movntps %xmm1, 144(%rdi)\r\n  movntps %xmm2, 160(%rdi)\r\n  movntps %xmm3, 176(%rdi)\r\n  movntps %xmm0, 192(%rdi)\r\n  movntps %xmm1, 208(%rdi)\r\n  movntps %xmm2, 224(%rdi)\r\n  movntps %xmm3, 240(%rdi)\r\n  add $64, %rsi\r\n  add %r15, %rdi\r\n  movntps %xmm0, (%rdi)\r\n  movntps %xmm1, 16(%rdi)\r\n  movntps %xmm2, 32(%rdi)\r\n  movntps %xmm3, 48(%rdi)\r\n  movntps %xmm0, 64(%rdi)\r\n  movntps %xmm1, 80(%rdi)\r\n  movntps %xmm2, 96(%rdi)\r\n  movntps %xmm3, 112(%rdi)\r\n  movntps %xmm0, 128(%rdi)\r\n  movntps %xmm1, 144(%rdi)\r\n  movntps %xmm2, 160(%rdi)\r\n  movntps %xmm3, 176(%rdi)\r\n  movntps %xmm0, 192(%rdi)\r\n  movntps %xmm1, 208(%rdi)\r\n  movntps %xmm2, 224(%rdi)\r\n  movntps %xmm3, 240(%rdi)\r\n  add $64, %rsi\r\n  add %r15, %rdi\r\n  cmp %rsi, %rdx\r\n  jge sse_ntwrite_iteration_count\r\n  mov %rbx, %rsi\r\n  lea (%rcx,%rsi,4), %rdi /* back to start */\r\nsse_ntwrite_iteration_count:\r\n\r\n  cmp %rsi, %r9\r\n  jnz sse_ntwrite_pass_loop /* skip iteration decrement if we're not back to start */\r\n  dec %r8\r\n  jnz sse_ntwrite_pass_loop\r\n  movaps (%rcx), %xmm0\r\n  pop 
%r14\r\n  pop %r15\r\n  pop %rbx\r\n  pop %rdi\r\n  pop %rsi\r\n  ret \r\n\r\n\r\navx512_read:\r\n  push %rsi\r\n  push %rdi\r\n  push %rbx\r\n  push %r15\r\n  push %r14\r\n  mov $256, %r15 /* load in blocks of 256 bytes */\r\n  sub $128, %rdx /* last iteration: rsi == rdx. rsi > rdx = break */\r\n  mov %r9, %rsi  /* assume we're passed in an aligned start location O.o */\r\n  xor %rbx, %rbx\r\n  lea (%rcx,%rsi,4), %rdi\r\n  mov %rdi, %r14\r\navx512_read_pass_loop:\r\n\r\n  vmovaps (%rdi), %zmm0\r\n  vmovaps 64(%rdi), %zmm1\r\n  vmovaps 128(%rdi), %zmm2\r\n  vmovaps 192(%rdi), %zmm3\r\n  add $64, %rsi\r\n  add %r15, %rdi\r\n  vmovaps (%rdi), %zmm0\r\n  vmovaps 64(%rdi), %zmm1\r\n  vmovaps 128(%rdi), %zmm2\r\n  vmovaps 192(%rdi), %zmm3\r\n  add $64, %rsi\r\n  add %r15, %rdi\r\n  cmp %rsi, %rdx\r\n  jge avx512_test_iteration_count\r\n  mov %rbx, %rsi\r\n  lea (%rcx,%rsi,4), %rdi /* back to start */\r\navx512_test_iteration_count:\r\n\r\n  cmp %rsi, %r9\r\n  jnz avx512_read_pass_loop /* skip iteration decrement if we're not back to start */\r\n  dec %r8\r\n  jnz avx512_read_pass_loop\r\n  pop %r14\r\n  pop %r15\r\n  pop %rbx\r\n  pop %rdi\r\n  pop %rsi\r\n  ret\r\n\r\navx512_write:\r\n  push %rsi\r\n  push %rdi\r\n  push %rbx\r\n  push %r15\r\n  push %r14\r\n  mov $256, %r15 /* load in blocks of 256 bytes */\r\n  sub $128, %rdx /* last iteration: rsi == rdx. 
rsi > rdx = break */\r\n  mov %r9, %rsi  /* assume we're passed in an aligned start location O.o */\r\n  xor %rbx, %rbx\r\n  lea (%rcx,%rsi,4), %rdi\r\n  mov %rdi, %r14\r\n  vmovaps (%rdi), %zmm0\r\navx512_write_pass_loop:\r\n  vmovaps %zmm0, (%rdi)\r\n  vmovaps %zmm1, 64(%rdi)\r\n  vmovaps %zmm2, 128(%rdi)\r\n  vmovaps %zmm3, 192(%rdi)\r\n  add $64, %rsi\r\n  add %r15, %rdi\r\n  vmovaps %zmm0, (%rdi)\r\n  vmovaps %zmm1, 64(%rdi)\r\n  vmovaps %zmm2, 128(%rdi)\r\n  vmovaps %zmm3, 192(%rdi)\r\n  add $64, %rsi\r\n  add %r15, %rdi\r\n  cmp %rsi, %rdx\r\n  jge avx512_write_iteration_count\r\n  mov %rbx, %rsi\r\n  lea (%rcx,%rsi,4), %rdi /* back to start */\r\navx512_write_iteration_count:\r\n\r\n  cmp %rsi, %r9\r\n  jnz avx512_write_pass_loop /* skip iteration decrement if we're not back to start */\r\n  dec %r8\r\n  jnz avx512_write_pass_loop\r\n  pop %r14\r\n  pop %r15\r\n  pop %rbx\r\n  pop %rdi\r\n  pop %rsi\r\n  ret\r\n\r\n/* Copies from the start of the array to array + length / 2, 512 bytes per pass.\r\n   rcx = ptr to arr\r\n   rdx = arr_length\r\n   r8 = iterations */\r\navx512_copy:\r\n  push %rsi\r\n  push %rdi\r\n  push %rbx\r\n  push %r15\r\n  push %r14\r\n  push %r13\r\n  xor %rsi, %rsi\r\n  mov %rdx, %r9\r\n  shr $1, %r9    /* start destination at array + length / 2 */\r\n  mov $512, %r15 /* pointer step per pass: 8 zmm registers x 64 bytes = 512 bytes (128 fp32 elements, matching add $128 to rsi) */\r\n  mov %r9, %r13\r\n  sub $128, %r13 /* place loop limit 512B before end */\r\n  lea (%rcx,%rsi,4), %rdi\r\n  lea (%rcx,%r9,4), %r14\r\navx512_copy_pass_loop:\r\n\r\n  vmovaps (%rdi), %zmm0\r\n  vmovaps 64(%rdi), %zmm1\r\n  vmovaps 128(%rdi), %zmm2\r\n  vmovaps 192(%rdi), %zmm3\r\n  vmovaps 256(%rdi), %zmm4\r\n  vmovaps 320(%rdi), %zmm5\r\n  vmovaps 384(%rdi), %zmm6\r\n  vmovaps 448(%rdi), %zmm7\r\n  vmovaps %zmm0, (%r14)\r\n  vmovaps %zmm1, 64(%r14)\r\n  vmovaps %zmm2, 128(%r14)\r\n  vmovaps %zmm3, 192(%r14)\r\n  vmovaps %zmm4, 256(%r14)\r\n  vmovaps %zmm5, 320(%r14)\r\n  vmovaps %zmm6, 384(%r14)\r\n  vmovaps %zmm7, 448(%r14)\r\n  add $128, %rsi\r\n  add %r15, %rdi  /* increment src/dst pointers */\r\n  add %r15, 
%r14\r\n  cmp %rsi, %r13   /* end location is at half */\r\n  jge avx512_copy_pass_loop\r\n  xor %rsi, %rsi\r\n  lea (%rcx,%rsi,4), %rdi /* back to start */\r\n  lea (%rcx,%r9,4), %r14\r\n  dec %r8                 /* decrement iteration counter */\r\n  jnz avx512_copy_pass_loop\r\n  pop %r13\r\n  pop %r14\r\n  pop %r15\r\n  pop %rbx\r\n  pop %rdi\r\n  pop %rsi\r\n  ret\r\n\r\navx512_add:\r\n  push %rsi\r\n  push %rdi\r\n  push %rbx\r\n  push %r15\r\n  push %r14\r\n  mov $512, %r15 /* load in blocks of 512 bytes */\r\n  sub $128, %rdx /* last iteration: rsi == rdx. rsi > rdx = break */\r\n  mov %r9, %rsi  /* assume we're passed in an aligned start location O.o */\r\n  xor %rbx, %rbx\r\n  lea (%rcx,%rsi,4), %rdi\r\n  mov %rdi, %r14\r\n  vmovaps (%rcx), %zmm4\r\navx512_add_pass_loop:\r\n  vaddps (%rdi), %zmm4, %zmm0\r\n  vaddps 64(%rdi), %zmm4, %zmm1\r\n  vaddps 128(%rdi), %zmm4, %zmm2\r\n  vaddps 192(%rdi), %zmm4, %zmm3\r\n  vmovaps %zmm0, (%rdi)\r\n  vmovaps %zmm1, 64(%rdi)\r\n  vmovaps %zmm2, 128(%rdi)\r\n  vmovaps %zmm3, 192(%rdi)\r\n  vaddps 256(%rdi), %zmm4, %zmm0\r\n  vaddps 320(%rdi), %zmm4, %zmm1\r\n  vaddps 384(%rdi), %zmm4, %zmm2\r\n  vaddps 448(%rdi), %zmm4, %zmm3\r\n  vmovaps %zmm0, 256(%rdi)\r\n  vmovaps %zmm1, 320(%rdi)\r\n  vmovaps %zmm2, 384(%rdi)\r\n  vmovaps %zmm3, 448(%rdi)\r\n  add $128, %rsi\r\n  add %r15, %rdi\r\n  cmp %rsi, %rdx\r\n  jge avx512_add_iteration_count\r\n  mov %rbx, %rsi\r\n  lea (%rcx,%rsi,4), %rdi /* back to start */\r\navx512_add_iteration_count:\r\n\r\n  cmp %rsi, %r9\r\n  jnz avx512_add_pass_loop /* skip iteration decrement if we're not back to start */\r\n  sub $2, %r8\r\n  jg avx512_add_pass_loop\r\n  pop %r14\r\n  pop %r15\r\n  pop %rbx\r\n  pop %rdi\r\n  pop %rsi\r\n  movss (%rcx), %xmm0\r\n  ret\r\n\r\n/* rcx = ptr to arr, rdx = nr of fp32 elements in arr, r8 = iteration count */\r\nrepmovsb_copy:\r\n  push %r15\r\n  push %r14\r\n  push %r13\r\n  push %r12\r\n  push %rsi\r\n  push %rdi\r\n  cld\r\n  mov %rcx, %rsi  /* set 
source */\r\n  shr $1, %rdx    /* point destination to second half of array, or rcx + (rdx / 2) */\r\n  mov %rcx, %rdi\r\n  add %rdx, %rdi\r\n  mov %rdx, %rcx  /* rcx = count. set to (size / 2) * (4 bytes per FP32 element) */\r\n  shl $2, %rcx\r\n  mov %rsi, %r12\r\n  mov %rdi, %r13\r\n  mov %rcx, %r14\r\nrepmovsb_copy_pass_loop:\r\n  mov %r12, %rsi\r\n  mov %r13, %rdi\r\n  mov %r14, %rcx\r\n  rep movsb\r\n  dec %r8\r\n  jnz repmovsb_copy_pass_loop\r\n  movss (%r12), %xmm0\r\n  pop %rdi\r\n  pop %rsi\r\n  pop %r12\r\n  pop %r13\r\n  pop %r14\r\n  pop %r15\r\n  ret\r\n\r\n\r\nrepmovsd_copy:\r\n  push %r15\r\n  push %r14\r\n  push %r13\r\n  push %r12\r\n  push %rsi\r\n  push %rdi\r\n  cld\r\n  mov %rcx, %rsi  /* set source */\r\n  shr $1, %rdx    /* point destination to second half of array, or rcx + (rdx / 2) */\r\n  mov %rcx, %rdi\r\n  add %rdx, %rdi\r\n  mov %rdx, %rcx  /* rcx = count. set to (size / 2) */\r\n  mov %rsi, %r12\r\n  mov %rdi, %r13\r\n  mov %rcx, %r14\r\nrepmovsd_copy_pass_loop:\r\n  mov %r12, %rsi\r\n  mov %r13, %rdi\r\n  mov %r14, %rcx\r\n  rep movsd\r\n  dec %r8\r\n  jnz repmovsd_copy_pass_loop\r\n  movss (%r12), %xmm0\r\n  pop %rdi\r\n  pop %rsi\r\n  pop %r12\r\n  pop %r13\r\n  pop %r14\r\n  pop %r15\r\n  ret \r\n\r\nrepstosb_write:\r\n  push %r15\r\n  push %r14\r\n  push %r13\r\n  push %r12\r\n  push %rsi\r\n  push %rdi\r\n  cld\r\n  mov $1, %al     /* set source (1) */\r\n  mov %rcx, %r13  /* save destination into r13 */\r\n  mov %rdx, %r14  /* save count into r14 */\r\n  shl $2, %r14    /* multiply count by 4 because count is in FP32 elements and stosb works with bytes */\r\nrepstosb_copy_pass_loop:\r\n  mov %r13, %rdi\r\n  mov %r14, %rcx\r\n  rep stosb\r\n  dec %r8\r\n  jnz repstosb_copy_pass_loop\r\n  movss (%r13), %xmm0\r\n  pop %rdi\r\n  pop %rsi\r\n  pop %r12\r\n  pop %r13\r\n  pop %r14\r\n  pop %r15\r\n  ret  \r\n\r\nrepstosd_write:\r\n  push %r15\r\n  push %r14\r\n  push %r13\r\n  push %r12\r\n  push %rsi\r\n  push %rdi\r\n  cld\r\n  
mov $1, %al     /* set source (1) */\r\n  mov %rcx, %r13  /* save destination into r13 */\r\n  mov %rdx, %r14  /* save count into r14 */\r\nrepstosd_copy_pass_loop:\r\n  mov %r13, %rdi\r\n  mov %r14, %rcx\r\n  rep stosl\r\n  dec %r8\r\n  jnz repstosd_copy_pass_loop\r\n  movss (%r13), %xmm0\r\n  pop %rdi\r\n  pop %rsi\r\n  pop %r12\r\n  pop %r13\r\n  pop %r14\r\n  pop %r15\r\n  ret   \r\n\r\n\r\n/* Tests for cache bank conflicts by reading from two locations, spaced by some\r\n   number of bytes\r\n   rcx = ptr to array. first 32-bit int = increment step, because I'm too lazy to mess with the stack\r\n   rdx = array length, in bytes\r\n   r8 = load spacing, in bytes\r\n   r9 = iter count (number of loads to execute) */\r\nreadbankconflict:\r\n   push %rbx\r\n   push %rdi\r\n   push %rsi\r\n   push %r10\r\n   push %r11\r\n   push %r12\r\n   mov $1, %rax\r\n   cmp %r8, %rdx  /* basic check - subtract load spacing from array len */\r\n   jle readbankconflict_end /* exit immediately if we don't have enough space to iterate */\r\n   xor %rax, %rax\r\n   mov %rcx, %rdi\r\n   mov %rcx, %rsi\r\n   mov %rcx, %r12\r\n   add %rdx, %r12  /* set end location */\r\n   sub $10, %r12   /* we're reading 10B ahead */\r\n   add %r8, %rsi   /* rdi = first load location, rsi = second load location */\r\n   mov (%rcx), %rbx  /* rbx = increment */\r\nreadbankconflict_loop:\r\n   mov (%rdi), %r10\r\n   mov (%rsi), %r11\r\n   mov (%rdi), %r10\r\n   mov (%rsi), %r11\r\n   mov (%rdi), %r10\r\n   mov (%rsi), %r11\r\n   mov (%rdi), %r10\r\n   mov (%rsi), %r11\r\n   mov (%rdi), %r10\r\n   mov (%rsi), %r11\r\n   mov (%rdi), %r10\r\n   mov (%rsi), %r11\r\n   mov (%rdi), %r10\r\n   mov (%rsi), %r11\r\n   mov (%rdi), %r10\r\n   mov (%rsi), %r11\r\n   mov (%rdi), %r10\r\n   mov (%rsi), %r11\r\n   mov (%rdi), %r10\r\n   mov (%rsi), %r11\r\n   sub $20, %r9\r\n   jl readbankconflict_end\r\n   cmp %rsi, %r12  /* subtract leading location from end location */\r\n   jg readbankconflict_loop /* if positive 
or equal, continue loop */\r\n   mov %rcx, %rdi  /* reset to start */\r\n   mov %rcx, %rsi\r\n   add %r8, %rsi\r\n   jmp readbankconflict_loop\r\nreadbankconflict_end:\r\n   pop %r12\r\n   pop %r11\r\n   pop %r10\r\n   pop %rsi\r\n   pop %rdi\r\n   pop %rbx\r\n   ret\r\n\r\nreadbankconflict128:\r\n   push %rbx\r\n   push %rdi\r\n   push %rsi\r\n   push %r10\r\n   push %r11\r\n   push %r12\r\n   mov $1, %rax\r\n   cmp %r8, %rdx  /* basic check - subtract load spacing from array len */\r\n   jle readbankconflict128_end /* exit immediately if we don't have enough space to iterate */\r\n   xor %rax, %rax\r\n   mov %rcx, %rdi\r\n   mov %rcx, %rsi\r\n   mov %rcx, %r12\r\n   add %rdx, %r12  /* set end location */\r\n   sub $10, %r12   /* we're reading 10B ahead */\r\n   add %r8, %rsi   /* rdi = first load location, rsi = second load location */\r\n   mov (%rcx), %rbx  /* rbx = increment */\r\nreadbankconflict128_loop:\r\n   movups (%rdi), %xmm0\r\n   movups (%rsi), %xmm1\r\n   movups (%rdi), %xmm0\r\n   movups (%rsi), %xmm1\r\n   movups (%rdi), %xmm0\r\n   movups (%rsi), %xmm1\r\n   movups (%rdi), %xmm0\r\n   movups (%rsi), %xmm1\r\n   movups (%rdi), %xmm0\r\n   movups (%rsi), %xmm1\r\n   movups (%rdi), %xmm0\r\n   movups (%rsi), %xmm1\r\n   movups (%rdi), %xmm0\r\n   movups (%rsi), %xmm1\r\n   movups (%rdi), %xmm0\r\n   movups (%rsi), %xmm1\r\n   movups (%rdi), %xmm0\r\n   movups (%rsi), %xmm1\r\n   movups (%rdi), %xmm0\r\n   movups (%rsi), %xmm1\r\n   sub $20, %r9\r\n   jl readbankconflict128_end\r\n   cmp %rsi, %r12  /* subtract leading location from end location */\r\n   jg readbankconflict128_loop /* if positive or equal, continue loop */\r\n   mov %rcx, %rdi  /* reset to start */\r\n   mov %rcx, %rsi\r\n   add %r8, %rsi\r\n   jmp readbankconflict128_loop\r\nreadbankconflict128_end:\r\n   pop %r12\r\n   pop %r11\r\n   pop %r10\r\n   pop %rsi\r\n   pop %rdi\r\n   pop %rbx\r\n   ret \r\n"
  },
  {
    "path": "MemoryBandwidth/MixedMemoryBandwidthTest/MemoryBandwidth.h",
    "content": "#pragma once\r\n\r\n#include <stdint.h>\r\nextern \"C\" float sse_asm_read(void* arr, uint64_t arr_length, uint64_t iterations);\r\nextern \"C\" float avx_asm_read(void* arr, uint64_t arr_length, uint64_t iterations);\r\nextern \"C\" float avx512_asm_read(void* arr, uint64_t arr_length, uint64_t iterations);\r\nextern \"C\" float avx_asm_read(void* arr, uint64_t arr_length, uint64_t iterations);\r\n\r\nfloat __fastcall instr_read(void* arr, uint64_t arr_length, uint64_t iterations)\r\n{\r\n    void (*nopfunc)(uint64_t);\r\n    nopfunc = (void(*)(uint64_t))arr;\r\n    int iterIdx;\r\n    for (iterIdx = 0; iterIdx < iterations; iterIdx++) nopfunc(iterations);\r\n    return iterIdx;\r\n}\r\n\r\nfloat (*bw_func)(void*, uint64_t, uint64_t) = sse_asm_read;\r\n\r\nenum InstructionTestType { None, FourByte, EightByte, K8_FourByte, Branch16 };\r\nstruct BandwidthTestThreadData {\r\n    uint32_t iterations;\r\n    uint32_t arr_length;\r\n    uint64_t tsc_duration;\r\n    float* arr;\r\n    float (*bw_func)(void*, uint64_t, uint64_t);\r\n};\r\n\r\n\r\n/// <summary>\r\n/// Automatically picks the best assembly read function supported by the current CPU\r\n/// </summary>\r\nvoid auto_set_bw_func()\r\n{\r\n    int cpuid_data[4];\r\n    __cpuidex(cpuid_data, 1, 0);\r\n    if (cpuid_data[3] & (1UL << 25)) {\r\n        fprintf(stderr, \"SSE supported\\n\");\r\n        bw_func = sse_asm_read;\r\n    }\r\n\r\n    if (cpuid_data[2] & (1UL << 28)) {\r\n        fprintf(stderr, \"AVX supported\\n\");\r\n        bw_func = avx_asm_read;\r\n    }\r\n\r\n    __cpuidex(cpuid_data, 7, 0);\r\n    if (cpuid_data[1] & (1UL << 16)) {\r\n        fprintf(stderr, \"AVX512 supported\\n\");\r\n        bw_func = avx512_asm_read;\r\n    }\r\n}\r\n\r\nvoid FillInstructionArray(uint64_t* arr, uint64_t sizeKb, enum InstructionTestType nopSize)\r\n{\r\n    char nop8b[8] = { 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00 };\r\n\r\n    // zen/piledriver optimization manual uses this pattern\r\n 
   char nop4b[8] = { 0x0F, 0x1F, 0x40, 0x00, 0x0F, 0x1F, 0x40, 0x00 };\r\n\r\n    // athlon64 (K8) optimization manual pattern\r\n    char k8_nop4b[8] = { 0x66, 0x66, 0x66, 0x90, 0x66, 0x66, 0x66, 0x90 };\r\n\r\n    uint64_t elements = (sizeKb * 1024 / 8) - 1; // leave room for ret\r\n    unsigned char* functionEnd = (unsigned char*)(arr + elements);\r\n\r\n    if (nopSize != Branch16) {\r\n        uint64_t* nopPtr;\r\n        if (nopSize == EightByte) nopPtr = (uint64_t*)(nop8b);\r\n        else if (nopSize == FourByte) nopPtr = (uint64_t*)(nop4b);\r\n        else if (nopSize == K8_FourByte) nopPtr = (uint64_t*)(k8_nop4b);\r\n        else {\r\n            fprintf(stderr, \"%d (enum value) NOP size isn't supported :(\\n\", nopSize);\r\n            return;\r\n        }\r\n\r\n        for (uint64_t nopIdx = 0; nopIdx < elements; nopIdx++) {\r\n            arr[nopIdx] = *nopPtr;\r\n        }\r\n\r\n        functionEnd[0] = 0xC3;\r\n    }\r\n    else {\r\n        // jump forward 14 bytes\r\n        char branch16b[8] = { 0xEB, 0x0E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };\r\n        char ret8b[8] = { 0xC3, 0, 0, 0, 0, 0, 0, 0 };\r\n        uint64_t* branchPtr = (uint64_t*)(branch16b);\r\n        uint64_t* nopPtr = (uint64_t*)(nop8b); // doesn't really matter, we should never hit this\r\n\r\n        // last iteration must have nopIdx % 2 == 1, so the jump will go to the return statement\r\n        // i.e. branchElements for loop must be even, so the last iteration is odd\r\n        uint64_t branchElements = elements % 2 == 0 ? elements : elements - 1;\r\n        uint64_t nopIdx;\r\n        for (nopIdx = 0; nopIdx < branchElements; nopIdx++) {\r\n            arr[nopIdx] = nopIdx % 2 == 0 ? 
*branchPtr : *nopPtr;\r\n        }\r\n\r\n        arr[nopIdx] = *(uint64_t*)ret8b;\r\n    }\r\n}\r\n\r\n/// <summary>\r\n/// Given test size in KB, return a good iteration count\r\n/// </summary>\r\n/// <param name=\"testSize\">test size in KB</param>\r\n/// <returns>Iterations per thread</returns>\r\nuint32_t GetIterationCount(uint32_t testSize, uint32_t threads)\r\n{\r\n    uint32_t gbToTransfer = 512;\r\n    if (testSize > 64) gbToTransfer = 512 / 2;\r\n    if (testSize > 512) gbToTransfer = 512 / 4;\r\n    if (testSize > 8192) gbToTransfer = 512 / 8;\r\n    uint32_t iterations = gbToTransfer * 1024 * 1024 / testSize;\r\n    if (iterations % 2 != 0) iterations += 1;\r\n\r\n    if (iterations < 8) return 8; // set a minimum to reduce noise\r\n    else return iterations;\r\n}"
  },
  {
    "path": "MemoryBandwidth/MixedMemoryBandwidthTest/MemoryBandwidthFunctions.asm",
    "content": "section .text\r\n\r\nbits 64\r\n\r\nglobal sse_asm_read\r\nglobal sse_asm_copy\r\nglobal sse_asm_write\r\nglobal sse_asm_ntwrite\r\nglobal sse_asm_add\r\nglobal avx_asm_read\r\nglobal avx_asm_write\r\nglobal avx_asm_ntwrite\r\nglobal avx_asm_copy\r\nglobal avx_asm_cflip\r\nglobal avx_asm_add\r\nglobal avx512_asm_read\r\n\r\nglobal repmovsb_copy\r\nglobal repstosb_write\r\n\r\n; rcx = float ptr to arr, rdx = fp32 elements in arr, r8 = iterations, r9 = start index\r\navx_asm_read:\r\n  push rsi\r\n  push rdi\r\n  push rbx\r\n  push r15\r\n  push r14\r\n  mov r15, 256 ; load in blocks of 256 bytes\r\n  sub rdx, 128 ; last iteration: rsi == rdx. rsi > rdx = break\r\n  xor r9, r9   ; not doing start anymore, too lazy to clean up code\r\n  mov rsi, r9  ; assume we're passed in an aligned start location O.o\r\n  xor rbx, rbx\r\n  lea rdi, [rcx + rsi * 4]\r\n  mov r14, rdi\r\navx_asm_read_pass_loop:\r\n  ; xmm0 to 5 are considered volatile\r\n  vmovaps ymm0, [rdi]\r\n  vmovaps ymm1, [rdi + 32]\r\n  vmovaps ymm2, [rdi + 64]\r\n  vmovaps ymm3, [rdi + 96]\r\n  vmovaps ymm0, [rdi + 128]\r\n  vmovaps ymm1, [rdi + 160]\r\n  vmovaps ymm2, [rdi + 192]\r\n  vmovaps ymm3, [rdi + 224]\r\n  add rsi, 64\r\n  add rdi, r15\r\n  vmovaps ymm0, [rdi]\r\n  vmovaps ymm1, [rdi + 32]\r\n  vmovaps ymm2, [rdi + 64]\r\n  vmovaps ymm3, [rdi + 96]\r\n  vmovaps ymm0, [rdi + 128]\r\n  vmovaps ymm1, [rdi + 160]\r\n  vmovaps ymm2, [rdi + 192]\r\n  vmovaps ymm3, [rdi + 224]\r\n  add rsi, 64\r\n  add rdi, r15\r\n  cmp rdx, rsi\r\n  jge asm_avx_test_iteration_count\r\n  mov rsi, rbx\r\n  lea rdi, [rcx + rsi * 4]  ; back to start\r\nasm_avx_test_iteration_count:\r\n  cmp r9, rsi\r\n  jnz avx_asm_read_pass_loop ; skip iteration decrement if we're not back to start\r\n  dec r8\r\n  jnz avx_asm_read_pass_loop\r\n  pop r14\r\n  pop r15\r\n  pop rbx\r\n  pop rdi\r\n  pop rsi\r\n  ret\r\n\r\navx_asm_write:\r\n  push rsi\r\n  push rdi\r\n  push rbx\r\n  push r15\r\n  push r14\r\n  mov r15, 256 ; 
load in blocks of 256 bytes\r\n  sub rdx, 128 ; last iteration: rsi == rdx. rsi > rdx = break\r\n  xor r9, r9   ; not doing start anymore, too lazy to clean up code\r\n  mov rsi, r9  ; assume we're passed in an aligned start location O.o\r\n  xor rbx, rbx\r\n  lea rdi, [rcx + rsi * 4]\r\n  mov r14, rdi\r\n  vmovaps ymm0, [rcx]\r\navx_asm_write_pass_loop:\r\n  vmovaps [rdi], ymm0\r\n  vmovaps [rdi + 32], ymm0\r\n  vmovaps [rdi + 64], ymm0\r\n  vmovaps [rdi + 96], ymm0\r\n  vmovaps [rdi + 128], ymm0\r\n  vmovaps [rdi + 160], ymm0\r\n  vmovaps [rdi + 192], ymm0\r\n  vmovaps [rdi + 224], ymm0\r\n  add rsi, 64\r\n  add rdi, r15\r\n  vmovaps [rdi], ymm0\r\n  vmovaps [rdi + 32], ymm0\r\n  vmovaps [rdi + 64], ymm0\r\n  vmovaps [rdi + 96], ymm0\r\n  vmovaps [rdi + 128], ymm0\r\n  vmovaps [rdi + 160], ymm0\r\n  vmovaps [rdi + 192], ymm0\r\n  vmovaps [rdi + 224], ymm0\r\n  add rsi, 64\r\n  add rdi, r15\r\n  cmp rdx, rsi\r\n  jge asm_avx_write_iteration_count\r\n  mov rsi, rbx\r\n  lea rdi, [rcx + rsi * 4]  ; back to start\r\nasm_avx_write_iteration_count:\r\n  cmp r9, rsi\r\n  jnz avx_asm_write_pass_loop ; skip iteration decrement if we're not back to start\r\n  dec r8\r\n  jnz avx_asm_write_pass_loop\r\n  pop r14\r\n  pop r15\r\n  pop rbx\r\n  pop rdi\r\n  pop rsi\r\n  ret\r\n\r\navx_asm_ntwrite:\r\n  push rsi\r\n  push rdi\r\n  push rbx\r\n  push r15\r\n  push r14\r\n  mov r15, 256 ; load in blocks of 256 bytes\r\n  sub rdx, 128 ; last iteration: rsi == rdx. 
rsi > rdx = break\r\n  xor r9, r9   ; not doing start anymore, too lazy to clean up code\r\n  mov rsi, r9  ; assume we're passed in an aligned start location O.o\r\n  xor rbx, rbx\r\n  lea rdi, [rcx + rsi * 4]\r\n  mov r14, rdi\r\n  vmovaps ymm0, [rcx]\r\navx_asm_ntwrite_pass_loop:\r\n  vmovntps [rdi], ymm0\r\n  vmovntps [rdi + 32], ymm0\r\n  vmovntps [rdi + 64], ymm0\r\n  vmovntps [rdi + 96], ymm0\r\n  vmovntps [rdi + 128], ymm0\r\n  vmovntps [rdi + 160], ymm0\r\n  vmovntps [rdi + 192], ymm0\r\n  vmovntps [rdi + 224], ymm0\r\n  add rsi, 64\r\n  add rdi, r15\r\n  vmovntps [rdi], ymm0\r\n  vmovntps [rdi + 32], ymm0\r\n  vmovntps [rdi + 64], ymm0\r\n  vmovntps [rdi + 96], ymm0\r\n  vmovntps [rdi + 128], ymm0\r\n  vmovntps [rdi + 160], ymm0\r\n  vmovntps [rdi + 192], ymm0\r\n  vmovntps [rdi + 224], ymm0\r\n  add rsi, 64\r\n  add rdi, r15\r\n  cmp rdx, rsi\r\n  jge asm_avx_ntwrite_iteration_count\r\n  mov rsi, rbx\r\n  lea rdi, [rcx + rsi * 4]  ; back to start\r\nasm_avx_ntwrite_iteration_count:\r\n  cmp r9, rsi\r\n  jnz avx_asm_ntwrite_pass_loop ; skip iteration decrement if we're not back to start\r\n  dec r8\r\n  jnz avx_asm_ntwrite_pass_loop\r\n  pop r14\r\n  pop r15\r\n  pop rbx\r\n  pop rdi\r\n  pop rsi\r\n  ret\r\n\r\n; rcx = ptr to arr\r\n; rdx = arr_length\r\n; r8 = iterations\r\navx_asm_copy:\r\n  push rsi\r\n  push rdi\r\n  push rbx\r\n  push r15\r\n  push r14\r\n  push r13\r\n  xor rsi, rsi\r\n  mov r9, rdx\r\n  shr r9, 1    ; start destination at array + length / 2\r\n  mov r15, 256 ; load in blocks of 128 bytes\r\n  mov r13, r9\r\n  sub r13, 64\r\n  lea rdi, [rcx + rsi * 4]\r\n  lea r14, [rcx + r9 * 4]\r\navx_copy_pass_loop:\r\n  vmovaps ymm0, [rdi]\r\n  vmovaps ymm1, [rdi + 32]\r\n  vmovaps ymm2, [rdi + 64]\r\n  vmovaps ymm3, [rdi + 96]\r\n  vmovaps ymm4, [rdi + 128]\r\n  vmovaps ymm5, [rdi + 160]\r\n  vmovaps ymm6, [rdi + 192]\r\n  vmovaps ymm7, [rdi + 224]\r\n  vmovaps [r14], ymm0\r\n  vmovaps [r14 + 32], ymm1\r\n  vmovaps [r14 + 64], ymm2\r\n  vmovaps 
[r14 + 96], ymm3\r\n  vmovaps [r14 + 128], ymm4\r\n  vmovaps [r14 + 160], ymm5\r\n  vmovaps [r14 + 192], ymm6\r\n  vmovaps [r14 + 224], ymm7\r\n  add rsi, 64\r\n  add rdi, r15  ; increment src/dst pointers\r\n  add r14, r15\r\n  cmp r13, rsi  ; end location is at half\r\n  jge avx_copy_pass_loop\r\n  xor rsi, rsi\r\n  lea rdi, [rcx + rsi * 4] ; back to start\r\n  lea r14, [rcx + r9 * 4]\r\n  dec r8                  ; decrement iteration counter\r\n  jnz avx_copy_pass_loop\r\n  pop r13\r\n  pop r14\r\n  pop r15\r\n  pop rbx\r\n  pop rdi\r\n  pop rsi\r\n  ret\r\n\r\n\r\n; changes the ordering of vector sized elements within a cacheline\r\n avx_asm_cflip:\r\n  push rsi\r\n  push rdi\r\n  push rbx\r\n  push r15\r\n  push r14\r\n  mov r15, 256 ; load in blocks of 256 bytes\r\n  sub rdx, 128 ; last iteration: rsi == rdx. rsi > rdx = break. 128 elements per iteration\r\n  xor r9, r9   ; not doing start anymore, too lazy to clean up code\r\n  ; mov rsi, r9  ; assume we're passed in an aligned start location O.o\r\n  xor rsi, rsi\r\n  xor rbx, rbx\r\n  lea rdi, [rcx + rsi * 4]\r\n  mov r14, rdi\r\navx_asm_cflip_pass_loop:\r\n  vmovaps ymm0, [rdi]\r\n  vmovaps ymm1, [rdi + 32]\r\n  vmovaps ymm2, [rdi + 64]\r\n  vmovaps ymm3, [rdi + 96]\r\n  vmovaps [rdi + 96], ymm0\r\n  vmovaps [rdi + 64], ymm1\r\n  vmovaps [rdi + 32], ymm2\r\n  vmovaps [rdi], ymm3\r\n  vmovaps ymm0, [rdi + 128]\r\n  vmovaps ymm1, [rdi + 160]\r\n  vmovaps ymm2, [rdi + 192]\r\n  vmovaps ymm3, [rdi + 224]\r\n  vmovaps [rdi + 224], ymm0\r\n  vmovaps [rdi + 192], ymm1\r\n  vmovaps [rdi + 160], ymm2\r\n  vmovaps [rdi + 128], ymm3\r\n  add rsi, 64\r\n  add rdi, r15\r\n  vmovaps ymm0, [rdi]\r\n  vmovaps ymm1, [rdi + 32]\r\n  vmovaps ymm2, [rdi + 64]\r\n  vmovaps ymm3, [rdi + 96]\r\n  vmovaps [rdi + 96], ymm0\r\n  vmovaps [rdi + 64], ymm1\r\n  vmovaps [rdi + 32], ymm2\r\n  vmovaps [rdi], ymm3\r\n  vmovaps ymm0, [rdi + 128]\r\n  vmovaps ymm1, [rdi + 160]\r\n  vmovaps ymm2, [rdi + 192]\r\n  vmovaps ymm3, [rdi + 
224]\r\n  vmovaps [rdi + 224], ymm0\r\n  vmovaps [rdi + 192], ymm1\r\n  vmovaps [rdi + 160], ymm2\r\n  vmovaps [rdi + 128], ymm3\r\n  add rsi, 64\r\n  add rdi, r15\r\n  cmp rdx, rsi\r\n  jge asm_avx_cflip_iteration_count\r\n  mov rsi, rbx\r\n  lea rdi, [rcx + rsi * 4]  ; back to start\r\nasm_avx_cflip_iteration_count:\r\n  cmp r9, rsi\r\n  jnz avx_asm_cflip_pass_loop ; skip iteration decrement if we're not back to start\r\n  sub r8, 2\r\n  jnz avx_asm_cflip_pass_loop\r\n  pop r14\r\n  pop r15\r\n  pop rbx\r\n  pop rdi\r\n  pop rsi\r\n  ret\r\n\r\navx_asm_add:\r\n  push rsi\r\n  push rdi\r\n  push rbx\r\n  push r15\r\n  push r14\r\n  mov r15, 256 ; load in blocks of 256 bytes\r\n  sub rdx, 128 ; last iteration: rsi == rdx. rsi > rdx = break\r\n  xor r9, r9   ; not doing start anymore, too lazy to clean up code\r\n  ; mov rsi, r9  ; assume we're passed in an aligned start location O.o\r\n  xor rsi, rsi\r\n  xor rbx, rbx\r\n  lea rdi, [rcx + rsi * 4]\r\n  mov r14, rdi\r\n  vmovaps ymm4, [rdi]\r\navx_asm_add_pass_loop:\r\n  ; xmm0 to 5 are considered volatile\r\n  vaddps ymm0, ymm4, [rdi]\r\n  vaddps ymm1, ymm4, [rdi + 32]\r\n  vaddps ymm2, ymm4, [rdi + 64]\r\n  vaddps ymm3, ymm4, [rdi + 96]\r\n  vmovaps [rdi], ymm0\r\n  vmovaps [rdi + 32], ymm1\r\n  vmovaps [rdi + 64], ymm2\r\n  vmovaps [rdi + 96], ymm3\r\n  vaddps ymm0, ymm4, [rdi + 128]\r\n  vaddps ymm1, ymm4, [rdi + 160]\r\n  vaddps ymm2, ymm4, [rdi + 192]\r\n  vaddps ymm3, ymm4, [rdi + 224]\r\n  vmovaps [rdi + 128], ymm0\r\n  vmovaps [rdi + 160], ymm1\r\n  vmovaps [rdi + 192], ymm2\r\n  vmovaps [rdi + 224], ymm3\r\n\r\n  add rsi, 64\r\n  add rdi, r15\r\n  vaddps ymm0, ymm4, [rdi]\r\n  vaddps ymm1, ymm4, [rdi + 32]\r\n  vaddps ymm2, ymm4, [rdi + 64]\r\n  vaddps ymm3, ymm4, [rdi + 96]\r\n  vmovaps [rdi], ymm0\r\n  vmovaps [rdi + 32], ymm1\r\n  vmovaps [rdi + 64], ymm2\r\n  vmovaps [rdi + 96], ymm3\r\n  vaddps ymm0, ymm4, [rdi + 128]\r\n  vaddps ymm1, ymm4, [rdi + 160]\r\n  vaddps ymm2, ymm4, [rdi + 192]\r\n  vaddps 
ymm3, ymm4, [rdi + 224]\r\n  vmovaps [rdi + 128], ymm0\r\n  vmovaps [rdi + 160], ymm1\r\n  vmovaps [rdi + 192], ymm2\r\n  vmovaps [rdi + 224], ymm3\r\n  add rsi, 64\r\n  add rdi, r15\r\n  cmp rdx, rsi\r\n  jge asm_avx_add_iteration_count\r\n  mov rsi, rbx\r\n  lea rdi, [rcx + rsi * 4]  ; back to start\r\nasm_avx_add_iteration_count:\r\n  cmp r9, rsi\r\n  jnz avx_asm_add_pass_loop ; skip iteration decrement if we're not back to start\r\n  sub r8, 2\r\n  jnz avx_asm_add_pass_loop\r\n  pop r14\r\n  pop r15\r\n  pop rbx\r\n  pop rdi\r\n  pop rsi\r\n  ret\r\n\r\navx512_asm_read:\r\n  push rsi\r\n  push rdi\r\n  push rbx\r\n  push r15\r\n  push r14\r\n  mov r15, 256 ; load in blocks of 256 bytes\r\n  sub rdx, 128 ; last iteration: rsi == rdx. rsi > rdx = break\r\n  xor r9, r9   ; not doing start anymore, too lazy to clean up code\r\n  ; mov rsi, r9  ; assume we're passed in an aligned start location O.o\r\n  xor rsi, rsi\r\n  xor rbx, rbx\r\n  lea rdi, [rcx + rsi * 4]\r\n  mov r14, rdi\r\navx512_asm_read_pass_loop:\r\n  vmovaps zmm0, [rdi]\r\n  vmovaps zmm1, [rdi + 64]\r\n  vmovaps zmm2, [rdi + 128]\r\n  vmovaps zmm3, [rdi + 192]\r\n  add rsi, 64\r\n  add rdi, r15\r\n  vmovaps zmm0, [rdi]\r\n  vmovaps zmm1, [rdi + 64]\r\n  vmovaps zmm2, [rdi + 128]\r\n  vmovaps zmm3, [rdi + 192]\r\n  add rsi, 64\r\n  add rdi, r15\r\n  cmp rdx, rsi\r\n  jge asm_avx512_test_iteration_count\r\n  mov rsi, rbx\r\n  lea rdi, [rcx + rsi * 4]  ; back to start\r\nasm_avx512_test_iteration_count:\r\n  cmp r9, rsi\r\n  jnz avx512_asm_read_pass_loop ; skip iteration decrement if we're not back to start\r\n  dec r8\r\n  jnz avx512_asm_read_pass_loop\r\n  pop r14\r\n  pop r15\r\n  pop rbx\r\n  pop rdi\r\n  pop rsi\r\n  ret\r\n\r\nsse_asm_read:\r\n  push rsi\r\n  push rdi\r\n  push rbx\r\n  push r15\r\n  push r14\r\n  mov r15, 256 ; load in blocks of 256 bytes\r\n  sub rdx, 128 ; last iteration: rsi == rdx. 
rsi > rdx = break\r\n  xor r9, r9\r\n  xor rsi, rsi\r\n  xor rbx, rbx\r\n  lea rdi, [rcx + rsi * 4]\r\n  mov r14, rdi\r\nsse_read_pass_loop:\r\n  ; xmm0 to 5 are considered volatile\r\n  movaps xmm0, [rdi]\r\n  movaps xmm1, [rdi + 16]\r\n  movaps xmm2, [rdi + 32]\r\n  movaps xmm3, [rdi + 48]\r\n  movaps xmm0, [rdi + 64]\r\n  movaps xmm1, [rdi + 80]\r\n  movaps xmm2, [rdi + 96]\r\n  movaps xmm3, [rdi + 112]\r\n  movaps xmm0, [rdi + 128]\r\n  movaps xmm1, [rdi + 144]\r\n  movaps xmm2, [rdi + 160]\r\n  movaps xmm3, [rdi + 176]\r\n  movaps xmm0, [rdi + 192]\r\n  movaps xmm2, [rdi + 208]\r\n  movaps xmm2, [rdi + 224]\r\n  movaps xmm2, [rdi + 240]\r\n  add rsi, 64\r\n  add rdi, r15\r\n  movaps xmm0, [rdi]\r\n  movaps xmm1, [rdi + 16]\r\n  movaps xmm2, [rdi + 32]\r\n  movaps xmm3, [rdi + 48]\r\n  movaps xmm0, [rdi + 64]\r\n  movaps xmm1, [rdi + 80]\r\n  movaps xmm2, [rdi + 96]\r\n  movaps xmm3, [rdi + 112]\r\n  movaps xmm0, [rdi + 128]\r\n  movaps xmm1, [rdi + 144]\r\n  movaps xmm2, [rdi + 160]\r\n  movaps xmm3, [rdi + 176]\r\n  movaps xmm0, [rdi + 192]\r\n  movaps xmm2, [rdi + 208]\r\n  movaps xmm2, [rdi + 224]\r\n  movaps xmm2, [rdi + 240]\r\n  add rsi, 64\r\n  add rdi, r15\r\n  cmp rdx, rsi\r\n  jge sse_test_iteration_count\r\n  mov rsi, rbx\r\n  lea rdi, [rcx + rsi * 4]  ; back to start\r\nsse_test_iteration_count:\r\n  cmp r9, rsi\r\n  jnz sse_read_pass_loop ; skip iteration decrement if we're not back to start\r\n  dec r8\r\n  jnz sse_read_pass_loop\r\n  pop r14\r\n  pop r15\r\n  pop rbx\r\n  pop rdi\r\n  pop rsi\r\n  ret\r\n\r\n; rcx = float ptr to arr, rdx = fp32 elements in arr, r8 = iterations\r\nsse_asm_write:\r\n  push rsi\r\n  push rdi\r\n  push rbx\r\n  push r15\r\n  push r14\r\n  mov r15, 256 ; load in blocks of 256 bytes\r\n  sub rdx, 128 ; last iteration: rsi == rdx. 
rsi > rdx = break\r\n  xor r9, r9\r\n  xor rsi, rsi\r\n  xor rbx, rbx\r\n  lea rdi, [rcx + rsi * 4]\r\n  mov r14, rdi\r\n  movaps xmm0, [rdi]\r\nsse_write_pass_loop:\r\n  movaps [rdi], xmm0\r\n  movaps [rdi + 16], xmm0\r\n  movaps [rdi + 32], xmm0\r\n  movaps [rdi + 48], xmm0\r\n  movaps [rdi + 64], xmm0\r\n  movaps [rdi + 80], xmm0\r\n  movaps [rdi + 96], xmm0\r\n  movaps [rdi + 112], xmm0\r\n  movaps [rdi + 128], xmm0\r\n  movaps [rdi + 144], xmm0\r\n  movaps [rdi + 160], xmm0\r\n  movaps [rdi + 176], xmm0\r\n  movaps [rdi + 192], xmm0\r\n  movaps [rdi + 208], xmm0\r\n  movaps [rdi + 224], xmm0\r\n  movaps [rdi + 240], xmm0\r\n  add rsi, 64\r\n  add rdi, r15\r\n  movaps [rdi], xmm0\r\n  movaps [rdi + 16], xmm0\r\n  movaps [rdi + 32], xmm0\r\n  movaps [rdi + 48], xmm0\r\n  movaps [rdi + 64], xmm0\r\n  movaps [rdi + 80], xmm0\r\n  movaps [rdi + 96], xmm0\r\n  movaps [rdi + 112], xmm0\r\n  movaps [rdi + 128], xmm0\r\n  movaps [rdi + 144], xmm0\r\n  movaps [rdi + 160], xmm0\r\n  movaps [rdi + 176], xmm0\r\n  movaps [rdi + 192], xmm0\r\n  movaps [rdi + 208], xmm0\r\n  movaps [rdi + 224], xmm0\r\n  movaps [rdi + 240], xmm0\r\n  add rsi, 64\r\n  add rdi, r15\r\n  cmp rdx, rsi\r\n  jge sse_write_iteration_count\r\n  mov rsi, rbx\r\n  lea rdi, [rcx + rsi * 4]  ; back to start\r\nsse_write_iteration_count:\r\n  cmp r9, rsi\r\n  jnz sse_write_pass_loop ; skip iteration decrement if we're not back to start\r\n  dec r8\r\n  jg sse_write_pass_loop\r\n  pop r14\r\n  pop r15\r\n  pop rbx\r\n  pop rdi\r\n  pop rsi\r\n  ret\r\n\r\nsse_asm_ntwrite:\r\n  push rsi\r\n  push rdi\r\n  push rbx\r\n  push r15\r\n  push r14\r\n  mov r15, 256 ; load in blocks of 256 bytes\r\n  sub rdx, 128 ; last iteration: rsi == rdx. 
rsi > rdx = break\r\n  xor r9, r9\r\n  xor rsi, rsi\r\n  xor rbx, rbx\r\n  lea rdi, [rcx + rsi * 4]\r\n  mov r14, rdi\r\n  movaps xmm0, [rdi]\r\nsse_ntwrite_pass_loop:\r\n  movntps [rdi], xmm0\r\n  movntps [rdi + 16], xmm0\r\n  movntps [rdi + 32], xmm0\r\n  movntps [rdi + 48], xmm0\r\n  movntps [rdi + 64], xmm0\r\n  movntps [rdi + 80], xmm0\r\n  movntps [rdi + 96], xmm0\r\n  movntps [rdi + 112], xmm0\r\n  movntps [rdi + 128], xmm0\r\n  movntps [rdi + 144], xmm0\r\n  movntps [rdi + 160], xmm0\r\n  movntps [rdi + 176], xmm0\r\n  movntps [rdi + 192], xmm0\r\n  movntps [rdi + 208], xmm0\r\n  movntps [rdi + 224], xmm0\r\n  movntps [rdi + 240], xmm0\r\n  add rsi, 64\r\n  add rdi, r15\r\n  movntps [rdi], xmm0\r\n  movntps [rdi + 16], xmm0\r\n  movntps [rdi + 32], xmm0\r\n  movntps [rdi + 48], xmm0\r\n  movntps [rdi + 64], xmm0\r\n  movntps [rdi + 80], xmm0\r\n  movntps [rdi + 96], xmm0\r\n  movntps [rdi + 112], xmm0\r\n  movntps [rdi + 128], xmm0\r\n  movntps [rdi + 144], xmm0\r\n  movntps [rdi + 160], xmm0\r\n  movntps [rdi + 176], xmm0\r\n  movntps [rdi + 192], xmm0\r\n  movntps [rdi + 208], xmm0\r\n  movntps [rdi + 224], xmm0\r\n  movntps [rdi + 240], xmm0\r\n  add rsi, 64\r\n  add rdi, r15\r\n  cmp rdx, rsi\r\n  jge sse_ntwrite_iteration_count\r\n  mov rsi, rbx\r\n  lea rdi, [rcx + rsi * 4]  ; back to start\r\nsse_ntwrite_iteration_count:\r\n  cmp r9, rsi\r\n  jnz sse_ntwrite_pass_loop ; skip iteration decrement if we're not back to start\r\n  dec r8\r\n  jg sse_ntwrite_pass_loop\r\n  pop r14\r\n  pop r15\r\n  pop rbx\r\n  pop rdi\r\n  pop rsi\r\n  ret\r\n\r\n\r\n; rcx = ptr to arr\r\n; rdx = arr_length\r\n; r8 = iterations\r\nsse_asm_copy:\r\n  push rsi\r\n  push rdi\r\n  push rbx\r\n  push r15\r\n  push r14\r\n  push r13\r\n  xor rsi, rsi\r\n  mov r9, rdx\r\n  shr r9, 1    ; start destination at array + length / 2\r\n  mov r15, 256 ; load in blocks of 128 bytes\r\n  mov r13, r9\r\n  sub r13, 64\r\n  lea rdi, [rcx + rsi * 4]\r\n  lea r14, [rcx + r9 * 
4]\r\nsse_copy_pass_loop:\r\n  movaps xmm0, [rdi]\r\n  movaps xmm1, [rdi + 16]\r\n  movaps xmm2, [rdi + 32]\r\n  movaps xmm3, [rdi + 48]\r\n  movaps xmm4, [rdi + 64]\r\n  movaps xmm5, [rdi + 80]\r\n  movaps xmm6, [rdi + 96]\r\n  movaps xmm7, [rdi + 112]\r\n  movaps [r14], xmm0\r\n  movaps [r14 + 16], xmm1\r\n  movaps [r14 + 32], xmm2\r\n  movaps [r14 + 48], xmm3\r\n  movaps [r14 + 64], xmm4\r\n  movaps [r14 + 80], xmm5\r\n  movaps [r14 + 96], xmm6\r\n  movaps [r14 + 112], xmm7\r\n\r\n  movaps xmm0, [rdi + 128]\r\n  movaps xmm1, [rdi + 144]\r\n  movaps xmm2, [rdi + 160]\r\n  movaps xmm3, [rdi + 176]\r\n  movaps xmm4, [rdi + 192]\r\n  movaps xmm5, [rdi + 208]\r\n  movaps xmm6, [rdi + 224]\r\n  movaps xmm7, [rdi + 240]\r\n  movaps [r14 + 128], xmm0\r\n  movaps [r14 + 144], xmm1\r\n  movaps [r14 + 160], xmm2\r\n  movaps [r14 + 176], xmm3\r\n  movaps [r14 + 192], xmm4\r\n  movaps [r14 + 208], xmm5\r\n  movaps [r14 + 224], xmm6\r\n  movaps [r14 + 240], xmm7\r\n\r\n  add rsi, 64\r\n  add rdi, r15  ; increment src/dst pointers\r\n  add r14, r15\r\n  cmp r13, rsi  ; end location is at half\r\n  jge sse_copy_pass_loop\r\n  xor rsi, rsi\r\n  lea rdi, [rcx + rsi * 4] ; back to start\r\n  lea r14, [rcx + r9 * 4]\r\n  dec r8                  ; decrement iteration counter\r\n  jnz sse_copy_pass_loop\r\n  pop r13\r\n  pop r14\r\n  pop r15\r\n  pop rbx\r\n  pop rdi\r\n  pop rsi\r\n  ret\r\n\r\nsse_asm_add:\r\n  push rsi\r\n  push rdi\r\n  push rbx\r\n  push r15\r\n  push r14\r\n  mov r15, 256 ; load in blocks of 256 bytes\r\n  sub rdx, 128 ; last iteration: rsi == rdx. 
rsi > rdx = break\r\n  xor r9, r9\r\n  xor rsi, rsi\r\n  xor rbx, rbx\r\n  lea rdi, [rcx + rsi * 4]\r\n  mov r14, rdi\r\n  movaps xmm5, [rdi]\r\nsse_add_pass_loop:\r\n  movaps xmm0, xmm5\r\n  movaps xmm1, xmm5\r\n  movaps xmm2, xmm5\r\n  movaps xmm3, xmm5\r\n  addps xmm0, [rdi]\r\n  addps xmm1, [rdi + 16]\r\n  addps xmm2, [rdi + 32]\r\n  addps xmm3, [rdi + 48]\r\n  movaps [rdi], xmm0\r\n  movaps [rdi + 16], xmm1\r\n  movaps [rdi + 32], xmm2\r\n  movaps [rdi + 48], xmm3\r\n\r\n  movaps xmm0, xmm5\r\n  movaps xmm1, xmm5\r\n  movaps xmm2, xmm5\r\n  movaps xmm3, xmm5\r\n  addps xmm0, [rdi + 64]\r\n  addps xmm1, [rdi + 80]\r\n  addps xmm2, [rdi + 96]\r\n  addps xmm3, [rdi + 112]\r\n  movaps [rdi + 64], xmm0\r\n  movaps [rdi + 80], xmm1\r\n  movaps [rdi + 96], xmm2\r\n  movaps [rdi + 112], xmm3\r\n\r\n  movaps xmm0, xmm5\r\n  movaps xmm1, xmm5\r\n  movaps xmm2, xmm5\r\n  movaps xmm3, xmm5\r\n  addps xmm0, [rdi + 128]\r\n  addps xmm1, [rdi + 144]\r\n  addps xmm2, [rdi + 160]\r\n  addps xmm3, [rdi + 176]\r\n  movaps [rdi + 128], xmm0\r\n  movaps [rdi + 144], xmm1\r\n  movaps [rdi + 160], xmm2\r\n  movaps [rdi + 176], xmm3\r\n\r\n  movaps xmm0, xmm5\r\n  movaps xmm1, xmm5\r\n  movaps xmm2, xmm5\r\n  movaps xmm3, xmm5\r\n  addps xmm0, [rdi + 192]\r\n  addps xmm1, [rdi + 208]\r\n  addps xmm2, [rdi + 224]\r\n  addps xmm3, [rdi + 240]\r\n  movaps [rdi + 192], xmm0\r\n  movaps [rdi + 208], xmm1\r\n  movaps [rdi + 224], xmm2\r\n  movaps [rdi + 240], xmm3\r\n\r\n  add rsi, 64\r\n  add rdi, r15\r\n  movaps xmm0, xmm5\r\n  movaps xmm1, xmm5\r\n  movaps xmm2, xmm5\r\n  movaps xmm3, xmm5\r\n  addps xmm0, [rdi]\r\n  addps xmm1, [rdi + 16]\r\n  addps xmm2, [rdi + 32]\r\n  addps xmm3, [rdi + 48]\r\n  movaps [rdi], xmm0\r\n  movaps [rdi + 16], xmm1\r\n  movaps [rdi + 32], xmm2\r\n  movaps [rdi + 48], xmm3\r\n\r\n  movaps xmm0, xmm5\r\n  movaps xmm1, xmm5\r\n  movaps xmm2, xmm5\r\n  movaps xmm3, xmm5\r\n  addps xmm0, [rdi + 64]\r\n  addps xmm1, [rdi + 80]\r\n  addps xmm2, [rdi + 96]\r\n  
addps xmm3, [rdi + 112]\r\n  movaps [rdi + 64], xmm0\r\n  movaps [rdi + 80], xmm1\r\n  movaps [rdi + 96], xmm2\r\n  movaps [rdi + 112], xmm3\r\n\r\n  movaps xmm0, xmm5\r\n  movaps xmm1, xmm5\r\n  movaps xmm2, xmm5\r\n  movaps xmm3, xmm5\r\n  addps xmm0, [rdi + 128]\r\n  addps xmm1, [rdi + 144]\r\n  addps xmm2, [rdi + 160]\r\n  addps xmm3, [rdi + 176]\r\n  movaps [rdi + 128], xmm0\r\n  movaps [rdi + 144], xmm1\r\n  movaps [rdi + 160], xmm2\r\n  movaps [rdi + 176], xmm3\r\n\r\n  movaps xmm0, xmm5\r\n  movaps xmm1, xmm5\r\n  movaps xmm2, xmm5\r\n  movaps xmm3, xmm5\r\n  addps xmm0, [rdi + 192]\r\n  addps xmm1, [rdi + 208]\r\n  addps xmm2, [rdi + 224]\r\n  addps xmm3, [rdi + 240]\r\n  movaps [rdi + 192], xmm0\r\n  movaps [rdi + 208], xmm1\r\n  movaps [rdi + 224], xmm2\r\n  movaps [rdi + 240], xmm3\r\n  add rsi, 64\r\n  add rdi, r15\r\n  cmp rdx, rsi\r\n  jge sse_add_iteration_count\r\n  mov rsi, rbx\r\n  lea rdi, [rcx + rsi * 4]  ; back to start\r\nsse_add_iteration_count:\r\n  cmp r9, rsi\r\n  jnz sse_add_pass_loop ; skip iteration decrement if we're not back to start\r\n  sub r8, 2\r\n  jg sse_add_pass_loop\r\n  pop r14\r\n  pop r15\r\n  pop rbx\r\n  pop rdi\r\n  pop rsi\r\n  ret\r\n\r\n\r\n; rcx = float ptr to arr, rdx = fp32 elements in arr, r8 = iterations\r\nrepmovsb_copy:\r\n  push r15\r\n  push r14\r\n  push r13\r\n  push r12\r\n  push rsi\r\n  push rdi\r\n  push rax\r\n  cld\r\n  ; source = rsi, destination = rdi, count (in bytes) = rcx\r\n  mov rsi, rcx  ; set source\r\n  shr rdx, 1    ; set destination = source + (size / 2)\r\n  mov rdi, rcx\r\n  add rdi, rdx\r\n  mov rcx, rdx  ; set count = (size / 2) * (4 bytes per fp32 element)\r\n  shl rcx, 2\r\n  mov r12, rsi\r\n  mov r13, rdi\r\n  mov r14, rcx\r\nrepmovsb_copy_pass_loop:\r\n  mov rsi, r12\r\n  mov rdi, r13\r\n  mov rcx, r14\r\n  rep movsb\r\n  dec r8\r\n  jnz repmovsb_copy_pass_loop\r\n  movss xmm0, [r12]\r\n  pop rax\r\n  pop rdi\r\n  pop rsi\r\n  pop r12\r\n  pop r13\r\n  pop r14\r\n  pop r15\r\n  
ret\r\n\r\n; rcx = float ptr to arr, rdx = fp32 elements in arr, r8 = iterations\r\nrepstosb_write:\r\n  push r15\r\n  push r14\r\n  push r13\r\n  push r12\r\n  push rsi\r\n  push rdi\r\n  push rax\r\n  cld\r\n  ; source = value in al, destination = rdi, count (in bytes) = rcx\r\n  mov al, 1  ; set source\r\n  mov r13, rcx  ; destination = start of arr\r\n  mov r14, rdx  \r\n  shl r14, 2    ; count = (nr of FP32 elements) * 4\r\nrepstosb_write_pass_loop:\r\n  mov rdi, r13\r\n  mov rcx, r14\r\n  rep stosb\r\n  dec r8\r\n  jnz repstosb_write_pass_loop\r\n  movss xmm0, [r13]\r\n  pop rax\r\n  pop rdi\r\n  pop rsi\r\n  pop r12\r\n  pop r13\r\n  pop r14\r\n  pop r15\r\n  ret"
  },
  {
    "path": "MemoryBandwidth/MixedMemoryBandwidthTest/MixedMemoryBandwidthTest.cpp",
    "content": "// MixedMemoryBandwidthTest.cpp : One-off microbenchmark for hitting L2 with both instruction and data accesses\r\n//\r\n\r\n#include <stdio.h>\r\n#include <intrin.h>\r\n#include <immintrin.h>\r\n#include <sys\\timeb.h>\r\n#include <math.h>\r\n#include <windows.h>\r\n#include \"MemoryBandwidth.h\"\r\n\r\nint default_test_sizes[39] = { 2, 4, 8, 12, 16, 24, 32, 48, 64, 96, 128, 192, 256, 512, 600, 768, 1024, 1536, 2048,\r\n                               3072, 4096, 5120, 6144, 8192, 10240, 12288, 16384, 24567, 32768, 65536, 98304,\r\n                               131072, 262144, 393216, 524288, 1048576, 1572864, 2097152, 3145728 };\r\n\r\ndouble Measure2TBw(uint32_t sizeKb, uint32_t iterations, int shared, enum InstructionTestType instr, double* final_instr_bw, double* final_data_bw);\r\n\r\nint main(int argc, char *argv[])\r\n{\r\n    int shared = 0;\r\n\r\n    for (int argIdx = 1; argIdx < argc; argIdx++) {\r\n        if (*(argv[argIdx]) == '-') {\r\n            char* arg = argv[argIdx] + 1;\r\n            if (_strnicmp(arg, \"shared\", 6) == 0) {\r\n                shared = 1;\r\n                fprintf(stderr, \"Using one array shared across all threads\\n\");\r\n            }\r\n        }\r\n    }\r\n    auto_set_bw_func();\r\n    double* test_results = (double *)malloc(2 * sizeof(double) * (sizeof(default_test_sizes) / sizeof(int)));\r\n    memset(test_results, 0, 2 * sizeof(double) * (sizeof(default_test_sizes) / sizeof(int)));\r\n    for (int test_size_idx = 0; test_size_idx < sizeof(default_test_sizes) / sizeof(int); test_size_idx++)\r\n    {\r\n        fprintf(stderr, \"Testing %d KB\\n\", default_test_sizes[test_size_idx]);\r\n        Measure2TBw(default_test_sizes[test_size_idx], GetIterationCount(default_test_sizes[test_size_idx], 2), shared, EightByte, &test_results[test_size_idx * 2], &test_results[test_size_idx * 2 + 1]);\r\n    }\r\n\r\n    printf(\"Test Size (KB), Instruction Bandwidth (GB/s), Data Bandwidth (GB/s)\\n\");\r\n    for 
(int test_size_idx = 0; test_size_idx < sizeof(default_test_sizes) / sizeof(int); test_size_idx++)\r\n    {\r\n        printf(\"%d,%f,%f\\n\", default_test_sizes[test_size_idx], test_results[test_size_idx * 2], test_results[test_size_idx * 2 + 1]);\r\n    }\r\n\r\n    free(test_results);\r\n    return 0;\r\n}\r\n\r\nDWORD WINAPI ReadBandwidthTestThread(LPVOID param) {\r\n    BandwidthTestThreadData* bwTestData = (BandwidthTestThreadData*)param;\r\n    uint64_t start_tsc = __rdtsc();\r\n    bwTestData->bw_func(bwTestData->arr, bwTestData->arr_length, bwTestData->iterations);\r\n    bwTestData->tsc_duration = __rdtsc() - start_tsc;\r\n    return 0;\r\n}\r\n\r\n// Use two threads to measure bandwidth and pin them to sibling SMT cores\r\n// One thread measures instruction bandwidth, the other measures data bw\r\n// Auto-adjusts iteration counts to prevent long-tailed behavior where one thread finishes first\r\ndouble Measure2TBw(uint32_t sizeKb, uint32_t iterations, int shared, enum InstructionTestType instr, double *final_instr_bw, double *final_data_bw) {\r\n    struct timeb start, end;\r\n    float bw = 0;\r\n    uint32_t elements = sizeKb * 1024 / sizeof(float);\r\n    uint32_t private_elements = ceil((double)sizeKb / 2) * 256;\r\n    DWORD protection_flags = PAGE_EXECUTE_READWRITE;\r\n    DWORD tids[2];\r\n    struct BandwidthTestThreadData instrThreadData, dataThreadData;\r\n\r\n    if (!shared) elements = private_elements;\r\n    if (!shared && sizeKb < 2) {\r\n        return 0;\r\n    }\r\n\r\n    // make array and fill it\r\n    float* sharedTestArr = NULL;\r\n    if (shared) {\r\n        // shared case: both threads read from the same array. 
it has to contain valid instructions\r\n        sharedTestArr = (float*)VirtualAlloc(NULL, elements * sizeof(float), MEM_COMMIT | MEM_RESERVE, protection_flags);\r\n        if (sharedTestArr == NULL) {\r\n            fprintf(stderr, \"Could not allocate memory\\n\");\r\n            return 0;\r\n        }\r\n\r\n        FillInstructionArray((uint64_t*)sharedTestArr, sizeKb, instr);\r\n        instrThreadData.arr = sharedTestArr;\r\n        dataThreadData.arr = sharedTestArr;\r\n    }\r\n    else {\r\n        // Give threads different arrays\r\n        instrThreadData.arr = (float*)VirtualAlloc(NULL, elements * sizeof(float), MEM_COMMIT | MEM_RESERVE, protection_flags);\r\n        dataThreadData.arr = (float*)VirtualAlloc(NULL, elements * sizeof(float), MEM_COMMIT | MEM_RESERVE, protection_flags);\r\n        FillInstructionArray((uint64_t*)instrThreadData.arr, (elements * 4) / 1024, instr);\r\n        for (uint64_t arr_idx = 0; arr_idx < elements; arr_idx++) {\r\n            dataThreadData.arr[arr_idx] = arr_idx + 0.5f;\r\n        }\r\n    }\r\n\r\n    instrThreadData.arr_length = elements;\r\n    instrThreadData.bw_func = instr_read;\r\n    instrThreadData.iterations = iterations;\r\n    dataThreadData.arr_length = elements;\r\n    dataThreadData.bw_func = bw_func;\r\n    dataThreadData.iterations = iterations;\r\n\r\n    while (true) {\r\n        HANDLE instrThread = CreateThread(NULL, 0, ReadBandwidthTestThread, &instrThreadData, CREATE_SUSPENDED, &tids[0]);\r\n        HANDLE dataThread = CreateThread(NULL, 0, ReadBandwidthTestThread, &dataThreadData, CREATE_SUSPENDED, &tids[1]);\r\n    \r\n        // set thread affinity to sibling SMT threads\r\n        ULONGLONG instrMask = 0, dataMask = 0;\r\n        instrMask = (1UL << 2);\r\n        dataMask = (1UL << 3);\r\n        SetThreadAffinityMask(instrThread, instrMask);\r\n        SetThreadAffinityMask(dataThread, dataMask);\r\n\r\n        ftime(&start);\r\n        ResumeThread(instrThread);\r\n        
ResumeThread(dataThread);\r\n        WaitForSingleObject(dataThread, INFINITE);\r\n        WaitForSingleObject(instrThread, INFINITE);\r\n        ftime(&end);\r\n\r\n        int64_t time_diff_ms = 1000 * (end.time - start.time) + (end.millitm - start.millitm);\r\n        double instrGbTransferred = (uint64_t)instrThreadData.iterations * sizeof(float) * elements / (double)1e9;\r\n        double dataGbTransferred = (uint64_t)dataThreadData.iterations * sizeof(float) * elements / (double)1e9;\r\n        double dataBw = 1000 * instrGbTransferred / (double)time_diff_ms;\r\n        double instrBw = 1000 * dataGbTransferred / (double)time_diff_ms;\r\n        bw = dataBw + instrBw;\r\n\r\n        double instr_over_data_ratio = (double)instrThreadData.tsc_duration / (double)dataThreadData.tsc_duration;\r\n        fprintf(stderr, \"Instr %f GB/s in %f G ticks, data %f GB/s in %f G ticks, time ratio %f\\n\", \r\n            instrBw, instrThreadData.tsc_duration / 1e9, dataBw, dataThreadData.tsc_duration / 1e9, instr_over_data_ratio);\r\n        if (fabs(instr_over_data_ratio - 1.0f) < .1f)\r\n        {\r\n            *final_instr_bw = instrBw;\r\n            *final_data_bw = dataBw;\r\n            break;\r\n        }\r\n        else\r\n        {\r\n            // adjust iteration count on data thread until they finish close enough\r\n            dataThreadData.iterations *= instr_over_data_ratio;\r\n        }\r\n    }\r\n\r\n    if (shared) VirtualFree(sharedTestArr, 0, MEM_RELEASE);\r\n    else {\r\n        VirtualFree(instrThreadData.arr, 0, MEM_RELEASE);\r\n        VirtualFree(dataThreadData.arr, 0, MEM_RELEASE);\r\n    }\r\n\r\n    return bw;\r\n}"
  },
  {
    "path": "MemoryBandwidth/MixedMemoryBandwidthTest/MixedMemoryBandwidthTest.vcxproj",
    "content": "<?xml version=\"1.0\" encoding=\"utf-8\"?>\r\n<Project DefaultTargets=\"Build\" xmlns=\"http://schemas.microsoft.com/developer/msbuild/2003\">\r\n  <ItemGroup Label=\"ProjectConfigurations\">\r\n    <ProjectConfiguration Include=\"Debug|Win32\">\r\n      <Configuration>Debug</Configuration>\r\n      <Platform>Win32</Platform>\r\n    </ProjectConfiguration>\r\n    <ProjectConfiguration Include=\"Release|Win32\">\r\n      <Configuration>Release</Configuration>\r\n      <Platform>Win32</Platform>\r\n    </ProjectConfiguration>\r\n    <ProjectConfiguration Include=\"Debug|x64\">\r\n      <Configuration>Debug</Configuration>\r\n      <Platform>x64</Platform>\r\n    </ProjectConfiguration>\r\n    <ProjectConfiguration Include=\"Release|x64\">\r\n      <Configuration>Release</Configuration>\r\n      <Platform>x64</Platform>\r\n    </ProjectConfiguration>\r\n  </ItemGroup>\r\n  <PropertyGroup Label=\"Globals\">\r\n    <VCProjectVersion>16.0</VCProjectVersion>\r\n    <Keyword>Win32Proj</Keyword>\r\n    <ProjectGuid>{5ab9dde0-c954-4d2f-aa46-bfa87ec585c4}</ProjectGuid>\r\n    <RootNamespace>MixedMemoryBandwidthTest</RootNamespace>\r\n    <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>\r\n  </PropertyGroup>\r\n  <Import Project=\"$(VCTargetsPath)\\Microsoft.Cpp.Default.props\" />\r\n  <PropertyGroup Condition=\"'$(Configuration)|$(Platform)'=='Debug|Win32'\" Label=\"Configuration\">\r\n    <ConfigurationType>Application</ConfigurationType>\r\n    <UseDebugLibraries>true</UseDebugLibraries>\r\n    <PlatformToolset>v143</PlatformToolset>\r\n    <CharacterSet>Unicode</CharacterSet>\r\n  </PropertyGroup>\r\n  <PropertyGroup Condition=\"'$(Configuration)|$(Platform)'=='Release|Win32'\" Label=\"Configuration\">\r\n    <ConfigurationType>Application</ConfigurationType>\r\n    <UseDebugLibraries>false</UseDebugLibraries>\r\n    <PlatformToolset>v143</PlatformToolset>\r\n    <WholeProgramOptimization>true</WholeProgramOptimization>\r\n    
<CharacterSet>Unicode</CharacterSet>\r\n  </PropertyGroup>\r\n  <PropertyGroup Condition=\"'$(Configuration)|$(Platform)'=='Debug|x64'\" Label=\"Configuration\">\r\n    <ConfigurationType>Application</ConfigurationType>\r\n    <UseDebugLibraries>true</UseDebugLibraries>\r\n    <PlatformToolset>v143</PlatformToolset>\r\n    <CharacterSet>Unicode</CharacterSet>\r\n  </PropertyGroup>\r\n  <PropertyGroup Condition=\"'$(Configuration)|$(Platform)'=='Release|x64'\" Label=\"Configuration\">\r\n    <ConfigurationType>Application</ConfigurationType>\r\n    <UseDebugLibraries>false</UseDebugLibraries>\r\n    <PlatformToolset>v143</PlatformToolset>\r\n    <WholeProgramOptimization>true</WholeProgramOptimization>\r\n    <CharacterSet>Unicode</CharacterSet>\r\n  </PropertyGroup>\r\n  <Import Project=\"$(VCTargetsPath)\\Microsoft.Cpp.props\" />\r\n  <ImportGroup Label=\"ExtensionSettings\">\r\n  </ImportGroup>\r\n  <ImportGroup Label=\"Shared\">\r\n  </ImportGroup>\r\n  <ImportGroup Label=\"PropertySheets\" Condition=\"'$(Configuration)|$(Platform)'=='Debug|Win32'\">\r\n    <Import Project=\"$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props\" Condition=\"exists('$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props')\" Label=\"LocalAppDataPlatform\" />\r\n  </ImportGroup>\r\n  <ImportGroup Label=\"PropertySheets\" Condition=\"'$(Configuration)|$(Platform)'=='Release|Win32'\">\r\n    <Import Project=\"$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props\" Condition=\"exists('$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props')\" Label=\"LocalAppDataPlatform\" />\r\n  </ImportGroup>\r\n  <ImportGroup Label=\"PropertySheets\" Condition=\"'$(Configuration)|$(Platform)'=='Debug|x64'\">\r\n    <Import Project=\"$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props\" Condition=\"exists('$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props')\" Label=\"LocalAppDataPlatform\" />\r\n  </ImportGroup>\r\n  <ImportGroup Label=\"PropertySheets\" 
Condition=\"'$(Configuration)|$(Platform)'=='Release|x64'\">\r\n    <Import Project=\"$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props\" Condition=\"exists('$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props')\" Label=\"LocalAppDataPlatform\" />\r\n  </ImportGroup>\r\n  <PropertyGroup Label=\"UserMacros\" />\r\n  <ItemDefinitionGroup Condition=\"'$(Configuration)|$(Platform)'=='Debug|Win32'\">\r\n    <ClCompile>\r\n      <WarningLevel>Level3</WarningLevel>\r\n      <SDLCheck>true</SDLCheck>\r\n      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>\r\n      <ConformanceMode>true</ConformanceMode>\r\n    </ClCompile>\r\n    <Link>\r\n      <SubSystem>Console</SubSystem>\r\n      <GenerateDebugInformation>true</GenerateDebugInformation>\r\n    </Link>\r\n  </ItemDefinitionGroup>\r\n  <ItemDefinitionGroup Condition=\"'$(Configuration)|$(Platform)'=='Release|Win32'\">\r\n    <ClCompile>\r\n      <WarningLevel>Level3</WarningLevel>\r\n      <FunctionLevelLinking>true</FunctionLevelLinking>\r\n      <IntrinsicFunctions>true</IntrinsicFunctions>\r\n      <SDLCheck>true</SDLCheck>\r\n      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>\r\n      <ConformanceMode>true</ConformanceMode>\r\n    </ClCompile>\r\n    <Link>\r\n      <SubSystem>Console</SubSystem>\r\n      <EnableCOMDATFolding>true</EnableCOMDATFolding>\r\n      <OptimizeReferences>true</OptimizeReferences>\r\n      <GenerateDebugInformation>true</GenerateDebugInformation>\r\n    </Link>\r\n  </ItemDefinitionGroup>\r\n  <ItemDefinitionGroup Condition=\"'$(Configuration)|$(Platform)'=='Debug|x64'\">\r\n    <ClCompile>\r\n      <WarningLevel>Level3</WarningLevel>\r\n      <SDLCheck>true</SDLCheck>\r\n      <PreprocessorDefinitions>_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>\r\n      <ConformanceMode>true</ConformanceMode>\r\n    </ClCompile>\r\n    <Link>\r\n      
<SubSystem>Console</SubSystem>\r\n      <GenerateDebugInformation>true</GenerateDebugInformation>\r\n    </Link>\r\n  </ItemDefinitionGroup>\r\n  <ItemDefinitionGroup Condition=\"'$(Configuration)|$(Platform)'=='Release|x64'\">\r\n    <ClCompile>\r\n      <WarningLevel>Level3</WarningLevel>\r\n      <FunctionLevelLinking>true</FunctionLevelLinking>\r\n      <IntrinsicFunctions>true</IntrinsicFunctions>\r\n      <SDLCheck>true</SDLCheck>\r\n      <PreprocessorDefinitions>NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>\r\n      <ConformanceMode>true</ConformanceMode>\r\n    </ClCompile>\r\n    <Link>\r\n      <SubSystem>Console</SubSystem>\r\n      <EnableCOMDATFolding>true</EnableCOMDATFolding>\r\n      <OptimizeReferences>true</OptimizeReferences>\r\n      <GenerateDebugInformation>true</GenerateDebugInformation>\r\n    </Link>\r\n  </ItemDefinitionGroup>\r\n  <ItemGroup>\r\n    <ClCompile Include=\"MixedMemoryBandwidthTest.cpp\" />\r\n  </ItemGroup>\r\n  <ItemGroup>\r\n    <CustomBuild Include=\"MemoryBandwidthFunctions.asm\">\r\n      <FileType>Document</FileType>\r\n      <Command Condition=\"'$(Configuration)|$(Platform)'=='Release|x64'\">nasm -f win64 MemoryBandwidthFunctions.asm</Command>\r\n      <Message Condition=\"'$(Configuration)|$(Platform)'=='Release|x64'\">Running NASM</Message>\r\n      <Outputs Condition=\"'$(Configuration)|$(Platform)'=='Release|x64'\">MemoryBandwidthFunctions.obj</Outputs>\r\n      <Command Condition=\"'$(Configuration)|$(Platform)'=='Debug|x64'\">nasm -f win64 MemoryBandwidthFunctions.asm</Command>\r\n      <Message Condition=\"'$(Configuration)|$(Platform)'=='Debug|x64'\">Running NASM</Message>\r\n      <Outputs Condition=\"'$(Configuration)|$(Platform)'=='Debug|x64'\">MemoryBandwidthFunctions.obj</Outputs>\r\n    </CustomBuild>\r\n  </ItemGroup>\r\n  <ItemGroup>\r\n    <ClInclude Include=\"MemoryBandwidth.h\" />\r\n  </ItemGroup>\r\n  <Import Project=\"$(VCTargetsPath)\\Microsoft.Cpp.targets\" />\r\n  
<ImportGroup Label=\"ExtensionTargets\">\r\n  </ImportGroup>\r\n</Project>"
  },
  {
    "path": "MemoryBandwidth/MixedMemoryBandwidthTest/MixedMemoryBandwidthTest.vcxproj.filters",
    "content": "﻿<?xml version=\"1.0\" encoding=\"utf-8\"?>\r\n<Project ToolsVersion=\"4.0\" xmlns=\"http://schemas.microsoft.com/developer/msbuild/2003\">\r\n  <ItemGroup>\r\n    <Filter Include=\"Source Files\">\r\n      <UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>\r\n      <Extensions>cpp;c;cc;cxx;c++;cppm;ixx;def;odl;idl;hpj;bat;asm;asmx</Extensions>\r\n    </Filter>\r\n    <Filter Include=\"Header Files\">\r\n      <UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>\r\n      <Extensions>h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd</Extensions>\r\n    </Filter>\r\n    <Filter Include=\"Resource Files\">\r\n      <UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier>\r\n      <Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>\r\n    </Filter>\r\n  </ItemGroup>\r\n  <ItemGroup>\r\n    <ClCompile Include=\"MixedMemoryBandwidthTest.cpp\">\r\n      <Filter>Source Files</Filter>\r\n    </ClCompile>\r\n  </ItemGroup>\r\n  <ItemGroup>\r\n    <CustomBuild Include=\"MemoryBandwidthFunctions32.asm\">\r\n      <Filter>Source Files</Filter>\r\n    </CustomBuild>\r\n  </ItemGroup>\r\n  <ItemGroup>\r\n    <ClInclude Include=\"MemoryBandwidth.h\">\r\n      <Filter>Header Files</Filter>\r\n    </ClInclude>\r\n  </ItemGroup>\r\n</Project>"
  },
  {
    "path": "MemoryBandwidth/README.md",
    "content": "# Memory Bandwidth Benchmark\r\nThis is a C and assembly project that tests memory bandwidth. There's a version in this directory for Linux that uses POSIX threads for multithreading. There's a Windows version in the MemoryBandwidth subdirectory that uses Windows threading APIs. The Windows version requires Visual Studio and nasm in the path to compile.\r\n\r\nTo compile the linux version, do `make amd64` or `make aarch64`, depending on the target architecture\r\n\r\n# Example usage\r\n\r\nTesting single threaded bandwidth: `MemoryBandwidth.exe` or `./membw_amd64` or `./membw_aarch64`\r\n\r\n# General parameters\r\n`-threads` - How many threads to spawn. If you spawn more than one (i.e. with `-threads 4`) you might want to specify `-private` or `-shared`\r\n\r\n`-private` - A separate test array is allocated for each thread. Each thread will access its own block of data, with the total amount of test data equal to the test size. For example, with a test size of 16 KB and 4 threads, each thread is given a 4 KB array. With this mode, test results will reflect combined cache capacity. If you have four cores, each with a private 32 KB L1D, expect to see L1D bandwidth up to 4 * 32 KB = 128 KB. This is usually the best mode to use because memory bandwidth results won't be inflated by request combining.\r\n\r\n`-shared` - A single test array is accessed by all threads. For example, with 4 threads and a 16 KB test size, a single 16 KB array will be allocated and all four threads will hit it. Useful for seeing small shared caches, where the sum of private cache capacity is very close to (or exceeds) shared cache capacity. This mode often gives erroneously high memory bandwidth results because requests to the same cachelines from multiple cores may be combined. Of course using this mode with anything other than read-only access patterns is....stupid.\r\n\r\n`-method` - What test to run. 
Methods will vary depending on which platform you're targeting and which version (Windows or Linux) you're using. There's some naming inconsistency here that I have to clean up. Good luck. If you don't specify a method, it should pick the best read-only test function to use on your system. Here are a few options:\r\n- `asm` (Linux only) - Uses a default read-only test function with a handwritten, unrolled assembly loop. On x86, AVX is used. NEON is used on aarch64.\r\n- `avx512` (Linux, x86-64 only) - Uses AVX-512 instructions.\r\n- `write` (Linux) - Tests write bandwidth instead of read bandwidth. Will use AVX-512 if available.\r\n- `copy` (Linux) - Copies one half of the array to the other.\r\n- `scalar` - Plain C code that should work on any system. This is the only option available if you're on a weird (not x86 or aarch64) platform. Unsuitable for testing cache bandwidth because compilers are really, really bad at autovectorization.\r\n- `instr8`, `instr4` - Tests instruction-side bandwidth (as opposed to data-side) by filling an array with NOPs and a return at the end, marking it executable, and calling it as if it were a function. On x86-64, `instr8` uses 8-byte NOPs, while `instr4` uses 4-byte NOPs.\n"
  },
  {
    "path": "MemoryLatency/Makefile",
    "content": "include ../Common/arch_detect.mk\n\nCFLAGS = -O3\nLDFLAGS = -lm\n\nall: $(TARGET)\n\namd64:\n\t$(CC) $(CFLAGS) MemoryLatency.c MemoryLatency_x86.s -o MemoryLatency_amd64 $(LDFLAGS)\n\namd64-numa:\n\t$(CC) $(CFLAGS) -DNUMA MemoryLatency.c MemoryLatency_x86.s -o MemoryLatency_numa_amd64 $(LDFLAGS) -lnuma\n\naarch64:\n\t$(CC) $(CFLAGS) MemoryLatency.c MemoryLatency_arm.s -o MemoryLatency_aarch64 $(LDFLAGS)\n\naarch64-numa:\n\t$(CC) $(CFLAGS) -DNUMA MemoryLatency.c MemoryLatency_arm.s -o MemoryLatency_aarch64 $(LDFLAGS) -lnuma\n\nriscv64:\n\t$(CC) $(CFLAGS) MemoryLatency.c MemoryLatency_riscv.s -o MemoryLatency_riscv64 $(LDFLAGS)\n\nriscv64-numa:\n\t$(CC) $(CFLAGS) -DNUMA MemoryLatency.c MemoryLatency_riscv.s -o MemoryLatency_riscv64 $(LDFLAGS) -lnuma\n\nw64:\n\t$(CC) $(CFLAGS) MemoryLatency.cpp MemoryLatency_x86.s -o MemoryLatency_w64.exe $(LDFLAGS)\n\n# w64 can build with mingw 11, which isn't available on jammy\n\nci: amd64 amd64-numa aarch64 riscv64 w64\n\nclean:\n\trm -f *.o && find . -type f -executable -delete\n\n.PHONY: all ci clean\n"
  },
  {
    "path": "MemoryLatency/MemoryLatency.c",
    "content": "#define _GNU_SOURCE\n#include <stdio.h>\n#include <stdint.h>\n#include <stdlib.h>\n#include <string.h>\n#include <limits.h>\n#include <math.h>\n#include <sys/time.h>\n#include <unistd.h>\n\n#ifndef __MINGW32__\n#include <sys/mman.h>\n#endif\n\n#ifdef NUMA\n#include <numa.h>\n#include <numaif.h>\n#include <sys/sysinfo.h>\n#endif\n\n#include <errno.h>\n#include <sched.h>\n\n// TODO: possibly get this programatically\n#define PAGE_SIZE 4096\n#define CACHELINE_SIZE 64\n\nint default_test_sizes[] = { 2, 4, 8, 12, 16, 24, 32, 48, 64, 96, 128, 192, 256, 384, 512, 600, 768, 1024, 1536, 2048, 2304, 2560,\n                               3072, 4096, 5120, 6144, 8192, 10240, 12288, 13312, 14336, 15360, 16384, 18432, 20480, 24567, 32768, 65536, 98304,\n                               131072, 262144, 393216, 524288, 1048576 }; //2097152 };\n\n#ifdef __x86_64\nextern void preplatencyarr(uint64_t *arr, uint64_t len) __attribute__((ms_abi));\nextern uint32_t latencytest(uint64_t iterations, uint64_t *arr) __attribute((ms_abi));\n\n#ifdef __MINGW32__\nint posix_memalign(void **memptr, size_t alignment, size_t size)\n{\n    *memptr = _aligned_malloc(alignment, size);\n    return *memptr != NULL;\n}\n#endif\n\n#define LONGPATTERN 1\nextern uint32_t longpatternlatencytest(uint64_t iterations, uint64_t *arr) __attribute((ms_abi));\n\nextern void stlftest(uint64_t iterations, char *arr) __attribute((ms_abi));\nextern void matchedstlftest(uint64_t iterations, char *arr) __attribute((ms_abi));\nextern void stlftest32(uint64_t iterations, char *arr) __attribute((ms_abi));\nextern void stlftest128(uint64_t iterations, char *arr) __attribute((ms_abi));\nvoid (*stlfFunc)(uint64_t, char *) __attribute__((ms_abi)) = stlftest;\n#elif __i686\nextern void preplatencyarr(uint32_t *arr, uint32_t len) __attribute__((fastcall));\nextern uint32_t latencytest(uint32_t iterations, uint32_t *arr) __attribute((fastcall));\nextern void stlftest(uint32_t iterations, char *arr) 
__attribute((fastcall));\nextern void matchedstlftest(uint32_t iterations, char *arr) __attribute((fastcall));\nvoid (*stlfFunc)(uint32_t, char *) __attribute__((fastcall)) = stlftest;\n#define BITS_32\n#elif __aarch64__\nextern void preplatencyarr(uint64_t *arr, uint64_t len);\nextern uint32_t latencytest(uint64_t iterations, uint64_t *arr);\n\n#define LONGPATTERN 1\nextern uint32_t longpatternlatencytest(uint64_t iterations, uint64_t *arr);\n\nextern void matchedstlftest(uint64_t iterations, char *arr);\nextern void stlftest(uint64_t iterations, char *arr);\nextern void stlftest32(uint64_t iterations, char *arr);\nextern void stlftest128(uint64_t iterations, char *arr);\nvoid (*stlfFunc)(uint64_t, char *) = stlftest;\n#elif __riscv\nextern void preplatencyarr(uint64_t *arr, uint64_t len);\nextern uint32_t latencytest(uint64_t iterations, uint64_t *arr);\nextern void matchedstlftest(uint64_t iterations, char *arr);\nextern void stlftest(uint64_t iterations, char *arr);\nextern void stlftest32(uint64_t iterations, char *arr);\nextern void stlftest128(uint64_t iterations, char *arr);\nvoid (*stlfFunc)(uint64_t, char *) = stlftest; \n#else\n#define UNKNOWN_ARCH 1\nextern uint32_t latencytest(uint64_t iterations, uint64_t *arr);\nvoid (*stlfFunc)(uint64_t, char *) = NULL;\n#endif\n\nfloat RunTest(uint32_t size_kb, uint32_t iterations, uint32_t *preallocatedArr);\nfloat RunAsmTest(uint32_t size_kb, uint32_t iterations, uint32_t *preallocatedArr);\nfloat RunTlbTest(uint32_t size_kb, uint32_t iterations, uint32_t *preallocatedArr);\nfloat RunMlpTest(uint32_t size_kb, uint32_t iterations, uint32_t parallelism);\nfloat RunAopTest(uint32_t size_kb, uint32_t iterations, uint32_t *preallocatedArr);\nvoid RunStlfTest(uint32_t iterations, int mode, int pageEnd, int loadDistance);\nvoid FillPatternArr64(uint64_t *pattern_arr, uint64_t list_size, uint64_t byte_increment);\nvoid FillPatternArr(uint32_t *pattern_arr, uint32_t list_size, uint32_t byte_increment);\n\nfloat 
(*testFunc)(uint32_t, uint32_t, uint32_t *) = RunTest;\n\nuint32_t ITERATIONS = 100000000;\nuint32_t pageByPage = 0;\nuint32_t longpattern = 0;\n\nint main(int argc, char* argv[]) {\n    uint32_t maxTestSizeMb = 0;\n    uint32_t singleSize = 0;\n    uint32_t testSizeCount = sizeof(default_test_sizes) / sizeof(int);\n    int mlpTest = 0;  // if > 0, run MLP test with (value) levels of parallelism max\n    int stlf = 0, hugePages = 0;\n    int stlfPageEnd = 0, numa = 0, stlfLoadDistance = 0;\n    uint32_t *hugePagesArr = NULL;\n    size_t hugePagesAllocatedBytes = 0;\n    for (int argIdx = 1; argIdx < argc; argIdx++) {\n        if (*(argv[argIdx]) == '-') {\n            char *arg = argv[argIdx] + 1;\n            if (strncmp(arg, \"test\", 4) == 0) {\n                argIdx++;\n                char *testType = argv[argIdx];\n\n        if (strncmp(testType, \"c\", 1) == 0) {\n                    testFunc = RunTest;\n                    fprintf(stderr, \"Using simple C test\\n\");\n                } else if (strncmp(testType, \"tlb\", 3) == 0) {\n                    testFunc = RunTlbTest;\n                    fprintf(stderr, \"Testing TLB with one element accessed per 4K page\\n\");\n                } else if (strncmp(testType, \"mlp\", 3) == 0) {\n                    mlpTest = 32;\n                    fprintf(stderr, \"Running memory parallelism test\\n\");\n                } else if (strncmp(testType, \"aop\", 3) == 0) {\n                    testFunc = RunAopTest;\n                    fprintf(stderr, \"Running array-of-pointers test\\n\");\n                }\n                #ifndef UNKNOWN_ARCH\n                else if (strncmp(testType, \"asm\", 3) == 0) {\n                    testFunc = RunAsmTest;\n                    fprintf(stderr, \"Using ASM (simple address) test\\n\");\n                } else if (strncmp(testType, \"stlf\", 4) == 0) {\n                    stlf = 1;\n                    fprintf(stderr, \"Running store to load forwarding test\\n\");\n           
     } else if (strncmp(testType, \"matched_stlf\", 4) == 0) {\n                    stlf = 1;\n                    stlfFunc = matchedstlftest;\n                    fprintf(stderr, \"Running store to load forwarding test, with matched load/store sizes\\n\");\n                } else if (strncmp(testType, \"128_stlf\", 4) == 0) {\n                    stlf = 1;\n                    stlfFunc = stlftest128;\n                    fprintf(stderr, \"Running store to load forwarding test, with 128-bit store, 64-bit load\\n\");\n                } \n                #ifdef LONGPATTERN\n                else if (strncmp(testType, \"longpattern\", 11) == 0) {\n                    testFunc = RunAsmTest;\n                    longpattern = 1;\n                    fprintf(stderr, \"Using ASM (simple address) test with longer pattern\\n\");\n                }\n                #endif\n                #ifndef BITS_32\n                else if (strncmp(testType, \"dword_stlf\", 9) == 0) {\n                    stlf = 2;\n                    stlfFunc = stlftest32;\n                    fprintf(stderr, \"Running store to load forwarding test, with 32-bit stores\\n\");\n                }\n                #endif\n                #endif  // end UNKNOWN_ARCH\n                else {\n                    fprintf(stderr, \"Unrecognized test type: %s\\n\", testType);\n                    fprintf(stderr, \"Valid test types: c, tlb, mlp\");\n            #ifndef UNKNOWN_ARCH\n            fprintf(stderr, \", asm, stlf, matched_stlf, dword_stlf\");\n            #endif\n            fprintf(stderr, \"\\n\");\n                }\n            } else if (strncmp(arg, \"maxsizemb\", 9) == 0) {\n                argIdx++;\n                maxTestSizeMb = atoi(argv[argIdx]);\n                fprintf(stderr, \"Will not exceed %u MB\\n\", maxTestSizeMb);\n            } else if (strncmp(arg, \"iter\", 4) == 0) {\n                argIdx++;\n                ITERATIONS = atoi(argv[argIdx]);\n                fprintf(stderr, 
\"Base iterations: %u\\n\", ITERATIONS);\n            } \n            else if (strncmp(arg, \"stlf_page_end\", 13) == 0) {\n                    argIdx++;\n                    stlfPageEnd = atoi(argv[argIdx]);\n                    fprintf(stderr, \"Store to load forwarding test will be pushed to end of %d byte page\\n\", stlfPageEnd);\n            }\n            else if (strncmp(arg, \"stlf_load_offset\", 16) == 0) {\n                    argIdx++;\n                    stlfLoadDistance = atoi(argv[argIdx]);\n                    fprintf(stderr, \"Loads will be offset by %d bytes\\n\", stlfLoadDistance);\n            }\n            #ifndef __MINGW32__\n            else if (strncmp(arg, \"hugepages\", 9) == 0) {\n                  hugePages = 1;\n                  fprintf(stderr, \"If applicable, will use huge pages. Will allocate max memory at start, make sure system has enough memory.\\n\");\n            } \n\t    else if (strncmp(arg, \"affinity\", 8) == 0) {\n                argIdx++;\n\t\tint targetThread = atoi(argv[argIdx]);\n                fprintf(stderr, \"Affinity set to core %d\\n\", targetThread);\n                cpu_set_t cpuset;\n                CPU_ZERO(&cpuset);\n                CPU_SET(targetThread, &cpuset);\n                sched_setaffinity(gettid(), sizeof(cpu_set_t), &cpuset);\n\t    }\n            #endif\n            else if (strncmp(arg, \"pagebypage\", 10) == 0) {\n                pageByPage = 1;\n                fprintf(stderr, \"If applicable, will hit all elements in a page before moving to another page to reduce TLB penalties\\n\");\n            }\n            else if (strncmp(arg, \"sizekb\", 6) == 0) {\n                argIdx++;\n                singleSize = atoi(argv[argIdx]);\n                fprintf(stderr, \"Testing %u KB only\\n\", singleSize);\n            }\n\n#ifdef NUMA\n            else if (strncmp(arg, \"numa\", 4) == 0) {\n                numa = 1;\n                singleSize = 1048576;\n                fprintf(stderr, 
\"Testing node to node latency. If test size is not set, it will be 1 GB\\n\");\n            }\n#endif\n        else {\n                fprintf(stderr, \"Unrecognized option: %s\\n\", arg);\n            }\n        }\n    }\n\n    if (argc == 1) {\n        fprintf(stderr, \"Usage: [-test <c/asm/tlb/mlp>] [-maxsizemb <max test size in MB>] [-iter <base iterations, default 100000000]\\n\");\n    }\n\n#ifdef __linux__\n    if (hugePages) {\n       size_t hugePageSize = 1 << 21;\n       size_t testSizeKb = singleSize ? singleSize : default_test_sizes[testSizeCount - 1];\n       size_t maxMemRequired = testSizeKb * (size_t)1024;\n       hugePagesAllocatedBytes = maxMemRequired;\n       if (maxTestSizeMb > 0 && maxMemRequired > maxTestSizeMb * 1024 * 1024) maxMemRequired = maxTestSizeMb * 1024 * 1024;\n       maxMemRequired = (((maxMemRequired - 1) / hugePageSize) + 1) * hugePageSize;\n       fprintf(stderr, \"mmap-ing %lu bytes\\n\", maxMemRequired);\n       hugePagesArr = mmap(NULL, maxMemRequired, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);\n       if (hugePagesArr == (void *)-1) { // on failure, mmap will return MAP_FAILED, or (void *)-1\n           fprintf(stderr, \"Failed to mmap huge pages, errno %d = %s\\nWill try to use madvise\\n\", errno, strerror(errno));\n           if (0 != posix_memalign((void **)(&hugePagesArr), 2 * 1024 * 1024, maxMemRequired)) {\n               fprintf(stderr, \"Failed to allocate 2 MB aligned memory, will not use hugepages\\n\");\n           hugePagesArr = NULL;\n               return 0;\n           }\n\n           madvise(hugePagesArr, maxMemRequired, MADV_HUGEPAGE);\n       }\n    }\n#endif\n\n    if (mlpTest) {\n        // allocate arr to hold results\n        float *results = (float *)malloc(testSizeCount * mlpTest * sizeof(float));\n        for (int size_idx = 0; size_idx < testSizeCount; size_idx++) {\n            for (int parallelism = 0; parallelism < mlpTest; parallelism++) {\n                
results[size_idx * mlpTest + parallelism] = RunMlpTest(default_test_sizes[size_idx], ITERATIONS, parallelism + 1);\n                printf(\"%d KB, %dx parallelism, %f MB/s\\n\", default_test_sizes[size_idx], parallelism + 1, results[size_idx * mlpTest + parallelism]);\n            }\n        }\n\n        for (int size_idx = 0; size_idx < testSizeCount; size_idx++) {\n            printf(\",%d\", default_test_sizes[size_idx]);\n        }\n\n        printf(\"\\n\");\n\n        for (int parallelism = 0; parallelism < mlpTest; parallelism++) {\n            printf(\"%d\", parallelism + 1);\n            for (int size_idx = 0; size_idx < default_test_sizes[size_idx]; size_idx++) {\n                printf(\",%f\", results[size_idx * mlpTest + parallelism]);\n            }\n            printf(\"\\n\");\n        }\n\n        free(results);\n    } else if (stlf) {\n        RunStlfTest(ITERATIONS, stlf, stlfPageEnd, stlfLoadDistance);\n    } \n#ifdef NUMA\n    else if (numa) {\n        if (numa_available() == -1) {\n        fprintf(stderr, \"NUMA is not available\\n\");\n        return 0;\n    }\n\n    int numaNodeCount = numa_max_node() + 1;\n    if (numaNodeCount > 64) {\n        fprintf(stderr, \"Too many NUMA nodes. 
Go home.\\n\");\n        return 0;\n    }\n\n    struct bitmask *nodeBitmask = numa_allocate_cpumask();\n    float *crossnodeLatencies = (float *)malloc(sizeof(float) * numaNodeCount * numaNodeCount);\n    memset(crossnodeLatencies, 0, sizeof(float) * numaNodeCount * numaNodeCount);\n        for (int cpuNode = 0; cpuNode < numaNodeCount; cpuNode++) {\n        numa_node_to_cpus(cpuNode, nodeBitmask);\n        int nodeCpuCount = numa_bitmask_weight(nodeBitmask);\n        if (nodeCpuCount == 0) {\n            fprintf(stderr, \"Node %d has no cores\\n\", cpuNode);\n        continue;\n        }\n\n            fprintf(stderr, \"Node %d has %d cores\\n\", cpuNode, nodeCpuCount);\n        cpu_set_t cpuset;\n        memcpy(cpuset.__bits, nodeBitmask->maskp, nodeBitmask->size / 8);\n            // for (int i = 0; i < get_nprocs(); i++) \n            //  if (numa_bitmask_isbitset(nodeBitmask, i)) CPU_SET(i, &cpuset); \n\n        sched_setaffinity(gettid(), sizeof(cpu_set_t), &cpuset);\n\n        for (int memNode = 0; memNode < numaNodeCount; memNode++) {\n            uint64_t nodeMask = 1UL << memNode;\n        uint32_t *arr;\n            if (hugePagesArr) {\n            fprintf(stderr, \"mbind-ing pre-allocated arr, size %lu bytes\\n\", hugePagesAllocatedBytes);\n            long mbind_rc = mbind(hugePagesArr, hugePagesAllocatedBytes, MPOL_BIND, &nodeMask, 64, MPOL_MF_STRICT | MPOL_MF_MOVE);\n            fprintf(stderr, \"mbind returned %ld\\n\", mbind_rc);\n            if (mbind_rc != 0) {\n                fprintf(stderr, \"errno: %d\\n\", errno);\n            }\n            arr = hugePagesArr;\n        } else {\n                    arr = numa_alloc_onnode(singleSize * 1024, memNode);\n                    madvise(arr, singleSize * 1024, MADV_HUGEPAGE);\n        }\n            \n        float latency = testFunc(singleSize, ITERATIONS, arr);\n        crossnodeLatencies[cpuNode * numaNodeCount + memNode] = latency;\n        fprintf(stderr, \"CPU node %d -> mem node %d: %f 
ns\\n\", cpuNode, memNode, latency);\n        if (!hugePages) numa_free(arr, singleSize * 1024);\n        }\n    }\n\n    for (int memNode = 0; memNode < numaNodeCount; memNode++) {\n        printf(\",%d\", memNode);\n    }\n\n    printf(\"\\n\");\n    for (int cpuNode = 0; cpuNode < numaNodeCount; cpuNode++) {\n        printf(\"%d\", cpuNode);\n        for (int memNode = 0; memNode < numaNodeCount; memNode++) {\n            printf(\",%f\", crossnodeLatencies[cpuNode * numaNodeCount + memNode]);\n        }\n\n        printf(\"\\n\");\n    }\n\n    free(crossnodeLatencies);\n    }\n#endif\n    else {\n        if (singleSize == 0) {\n        printf(\"Region,Latency (ns)\\n\");\n            for (int i = 0; i < testSizeCount; i++) {\n                if ((maxTestSizeMb == 0) || (default_test_sizes[i] <= maxTestSizeMb * 1024))\n                    printf(\"%d,%f\\n\", default_test_sizes[i], testFunc(default_test_sizes[i], ITERATIONS, hugePagesArr));\n                else {\n                    fprintf(stderr, \"Test size %u KB exceeds max test size of %u KB\\n\", default_test_sizes[i], maxTestSizeMb * 1024);\n                    break;\n                }\n            }\n        } else {\n            printf(\"%d,%f\\n\", singleSize, testFunc(singleSize, ITERATIONS, hugePagesArr));\n        }\n    }\n\n    return 0;\n}\n\n/// <summary>\n/// Heuristic to make sure test runs for enough time but not too long\n/// </summary>\n/// <param name=\"size_kb\">Region size</param>\n/// <param name=\"iterations\">base iterations</param>\n/// <returns>scaled iterations</returns>\nuint64_t scale_iterations(uint32_t size_kb, uint32_t iterations) {\n    return 10 * iterations / pow(size_kb, 1.0 / 4.0);\n}\n\n// Fills an array so that traversal completes within one page before going to another\n// random page. 
Tries to avoid TLB penalties at the cost of not being completely random\n// list_size = size of pattern arr in 32-bit elements\nvoid FillPageByPage(uint32_t *pattern_arr, uint32_t list_size, uint32_t byte_increment) {\n    uint32_t pageCount = list_size * sizeof(uint32_t) / PAGE_SIZE;\n    uint32_t page_element_count = PAGE_SIZE / sizeof(uint32_t);\n    if (pageCount <= 2) {\n        FillPatternArr(pattern_arr, list_size, byte_increment);\n        return;\n    }\n\n    // If test size is not divisible by page size, handle the extra page separately\n    short extraPage = 0;\n    if (pageCount * PAGE_SIZE / sizeof(uint32_t) < list_size) extraPage = 1;\n\n    uint32_t *pagePatternArr = malloc(sizeof(uint32_t) * (pageCount + extraPage));\n    FillPatternArr(pagePatternArr, pageCount + extraPage, 4);\n    for (uint32_t page_idx = 0; page_idx < pageCount; page_idx++)\n    {\n        uint32_t *page_base = pattern_arr + (page_element_count * page_idx);\n        FillPatternArr(page_base, page_element_count, byte_increment);\n\n        uint32_t page_last_element_index;\n        for (uint32_t page_element_idx = 0; page_element_idx < (PAGE_SIZE / sizeof(uint32_t)); page_element_idx += (byte_increment / sizeof(uint32_t))) {\n            // element that points to 0 should be directed to the next page\n            if (page_base[page_element_idx] == 0) page_base[page_element_idx] = pagePatternArr[page_idx] * (PAGE_SIZE / sizeof(uint32_t));\n\n            // otherwise make sure the offset is set relative to the start of the uber-array\n            else page_base[page_element_idx] += page_element_count * page_idx;\n        }\n    }\n\n    free(pagePatternArr);\n    return;\n}\n\n// Fills an array so that traversal completes within one page before going to another\n// random page. 
Tries to avoid TLB penalties at the cost of not being completely random\n// list_size = size of pattern arr in 32-bit elements\nvoid FillPageByPage64(uint64_t *pattern_arr, uint32_t list_size, uint32_t byte_increment) {\n    uint32_t pageCount = list_size * sizeof(uint64_t) / PAGE_SIZE;\n    uint32_t page_element_count = PAGE_SIZE / sizeof(uint64_t);\n    if (pageCount <= 2) {\n        FillPatternArr64(pattern_arr, list_size, byte_increment);\n        return;\n    }\n\n    // If test size is not divisible by page size, handle the extra page separately\n    short extraPage = 0;\n    if (pageCount * PAGE_SIZE / sizeof(uint64_t) < list_size) extraPage = 1;\n\n    uint32_t *pagePatternArr = malloc(sizeof(uint32_t) * (pageCount + extraPage));\n    FillPatternArr(pagePatternArr, pageCount + extraPage, 4);\n    for (uint32_t page_idx = 0; page_idx < pageCount; page_idx++)\n    {\n        uint64_t *page_base = pattern_arr + (page_element_count * page_idx);\n        FillPatternArr((uint32_t *)page_base, page_element_count, byte_increment);\n\n        uint32_t page_last_element_index;\n        for (uint32_t page_element_idx = 0; page_element_idx < (PAGE_SIZE / sizeof(uint64_t)); page_element_idx += (byte_increment / sizeof(uint64_t))) {\n            // element that points to 0 should be directed to the next page\n            if (page_base[page_element_idx] == 0) page_base[page_element_idx] = pagePatternArr[page_idx] * (PAGE_SIZE / sizeof(uint64_t));\n\n            // otherwise make sure the offset is set relative to the start of the uber-array\n            else page_base[page_element_idx] += page_element_count * page_idx;\n        }\n    }\n\n    free(pagePatternArr);\n    return;\n}\n\n// Fills an array using Sattolo's algo\n// pattern_arr = array to fill\n// list_size = size of pattern arr in 32-bit elements\n// byte_increment = one element per this many bytes\nvoid FillPatternArr(uint32_t *pattern_arr, uint32_t list_size, uint32_t byte_increment) {\n    uint32_t increment 
= byte_increment / sizeof(uint32_t);\n    uint32_t element_count = list_size / increment;\n    for (int i = 0; i < element_count; i++) {\n        pattern_arr[i * increment] = i * increment;\n    }\n\n    int iter = element_count;\n    while (iter > 1) {\n        iter -= 1;\n        int j = iter - 1 == 0 ? 0 : rand() % (iter - 1);\n        uint32_t tmp = pattern_arr[iter * increment];\n        pattern_arr[iter * increment] = pattern_arr[j * increment];\n        pattern_arr[j * increment] = tmp;\n    }\n}\n\n// Same thing but with 64-bit elements\n// pattern_arr = array to fill\n// list_size = number of 64-bit elements in array\n// byte_increment = cacheline size, in bytes\nvoid FillPatternArr64(uint64_t *pattern_arr, uint64_t list_size, uint64_t byte_increment) {\n    uint32_t increment = byte_increment / sizeof(uint64_t); // number of 64-bit integers in a cacheline\n    uint32_t element_count = list_size / increment;\n    for (int increment_offset = 0; increment_offset < increment; increment_offset++) {\n        for (int i = 0; i < element_count; i++) {\n            pattern_arr[i * increment + increment_offset] = i * increment + increment_offset;\n        }\n\n        int iter = element_count;\n        while (iter > 1) {\n            iter -= 1;\n            int j = iter - 1 == 0 ? 
0 : rand() % (iter - 1);\n            uint64_t tmp = pattern_arr[iter * increment + increment_offset];\n            pattern_arr[iter * increment + increment_offset] = pattern_arr[j * increment + increment_offset];\n            pattern_arr[j * increment + increment_offset] = tmp;\n        }\n    }\n}\n\nfloat RunTest(uint32_t size_kb, uint32_t iterations, uint32_t *preallocatedArr) {\n    struct timeval startTv, endTv;\n    struct timezone startTz, endTz;\n    uint32_t list_size = size_kb * 1024 / 4;\n    uint32_t sum = 0, current;\n\n    // Fill list to create random access pattern\n    uint32_t *A;\n    if (preallocatedArr == NULL) {\n        if (0 != posix_memalign((void **)(&A), 64, sizeof(uint32_t) * list_size)) {\n            fprintf(stderr, \"Failed to allocate memory for %u KB test\\n\", size_kb);\n        }\n    } else {\n        A = (uint32_t *)preallocatedArr;\n    }\n\n    if (!pageByPage) FillPatternArr(A, list_size, CACHELINE_SIZE);\n    else FillPageByPage(A, list_size, CACHELINE_SIZE);\n\n    uint32_t scaled_iterations = scale_iterations(size_kb, iterations);\n\n    // Run test\n    gettimeofday(&startTv, &startTz);\n    current = A[0];\n    for (int i = 0; i < scaled_iterations; i++) {\n        current = A[current];\n        sum += current;\n    }\n    gettimeofday(&endTv, &endTz);\n    uint64_t time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 1000);\n    float latency = 1e6 * (float)time_diff_ms / (float)scaled_iterations;\n    if (preallocatedArr == NULL) free(A);\n\n    if (sum == 0) printf(\"sum == 0 (?)\\n\");\n    return latency;\n}\n\n// Test array of pointers\nfloat RunAopTest(uint32_t size_kb, uint32_t iterations, uint32_t *preallocatedArr) {\n    struct timeval startTv, endTv;\n    struct timezone startTz, endTz;\n    uint32_t element_count = size_kb * 1024 / 64;  // 64B cachelines\n    uint32_t sum = 0, current;\n\n    // allocate pattern array\n    uint32_t *pattern_arr = (uint32_t 
*)malloc(element_count * sizeof(uint32_t));\n    uint32_t **pointer_arr = (uint32_t **)malloc(element_count * sizeof(uint32_t *));\n\n    for (int i = 0; i < element_count; i++) pattern_arr[i] = i;\n    \n    int iter = element_count;\n    while (iter > 1) {\n        iter -= 1;\n        int j = iter - 1 == 0 ? 0 : rand() % (iter - 1);\n        uint32_t tmp = pattern_arr[iter];\n        pattern_arr[iter] = pattern_arr[j];\n        pattern_arr[j] = tmp;\n    }\n\n    uint32_t *A;\n    if (preallocatedArr == NULL) {\n        if (0 != posix_memalign((void **)(&A), 64, 1024 * size_kb)) {\n            fprintf(stderr, \"Failed to allocate memory for %u KB test\\n\", size_kb);\n        }\n    } else {\n        A = (uint32_t *)preallocatedArr;\n    }\n\n    // make pattern array actually pointers\n    for (int i = 0; i < element_count; i++) {\n        pointer_arr[i] = A + (pattern_arr[i] * (64 / sizeof(uint32_t)));\n        *pointer_arr[i] = i + 1;\n    }\n    free(pattern_arr); \n\n    uint32_t scaled_iterations = scale_iterations(size_kb, iterations);\n    gettimeofday(&startTv, &startTz);\n    for (int i = 0; i < scaled_iterations;) {\n        for (int pointer_idx = 0; (pointer_idx < element_count) && (i < scaled_iterations); pointer_idx++, i++)\n            sum += *pointer_arr[pointer_idx]; \n    }\n    gettimeofday(&endTv, &endTz);\n    uint64_t time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 1000);\n    float latency = 1e6 * (float)time_diff_ms / (float)scaled_iterations;\n    if (sum == 0) fprintf(stderr, \"something is not right\\n\");\n    if (preallocatedArr == NULL) free(A);\n\n    free(pointer_arr);\n    return latency;\n}\n\n// Tests memory level parallelism. 
Returns achieved BW in MB/s using specified number of\n// independent pointer chasing chains\nfloat RunMlpTest(uint32_t size_kb, uint32_t iterations, uint32_t parallelism) {\n    struct timeval startTv, endTv;\n    struct timezone startTz, endTz;\n    uint32_t list_size = size_kb * 1024 / 4;\n    uint32_t sum = 0, current;\n\n    if (parallelism < 1) return 0;\n\n    // Fill list to create random access pattern, and hold temporary data\n    uint32_t *A = (uint32_t *)malloc(sizeof(uint32_t) * list_size);\n    uint32_t *offsets = (uint32_t *)malloc(sizeof(uint32_t) * parallelism);\n    if (!A || !offsets) {\n        fprintf(stderr, \"Failed to allocate memory for %u KB test\\n\", size_kb);\n        return 0;\n    }\n\n    FillPatternArr(A, list_size, CACHELINE_SIZE);\n    for (int i = 0; i < parallelism; i++) offsets[i] = i * (CACHELINE_SIZE / sizeof(uint32_t));\n    uint32_t scaled_iterations = scale_iterations(size_kb, iterations) / parallelism;\n\n    // Run test\n    gettimeofday(&startTv, &startTz);\n    for (uint32_t i = 0; i < scaled_iterations; i++) {\n        for (uint32_t j = 0; j < parallelism; j++)\n        {\n            offsets[j] = A[offsets[j]];\n        }\n    }\n    gettimeofday(&endTv, &endTz);\n    uint64_t time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 1000);\n    double mbTransferred = (scaled_iterations * parallelism * sizeof(uint32_t))  / (double)1e6;\n    float bw = 1000 * mbTransferred / (double)time_diff_ms;\n\n    sum = 0;\n    for (uint32_t i = 0; i < parallelism; i++) sum += offsets[i];\n    if (sum == 0) printf(\"sum == 0 (?)\\n\");\n\n    free(A);\n    free (offsets);\n    return bw;\n}\n\n#ifdef __i686\n#define POINTER_SIZE 4\n#define POINTER_INT uint32_t\n#else\n#define POINTER_SIZE 8\n#define POINTER_INT uint64_t\n#endif\n\n#ifndef UNKNOWN_ARCH\nfloat RunAsmTest(uint32_t size_kb, uint32_t iterations, uint32_t *preallocatedArr) {\n    struct timeval startTv, endTv;\n    struct timezone 
startTz, endTz;\n    uint64_t list_size = size_kb * 1024 / POINTER_SIZE; // number of pointer-sized elements (POINTER_SIZE bytes each)\n    uint32_t sum = 0, current;\n\n    // Fill list to create random access pattern\n    POINTER_INT *A;\n    if (preallocatedArr == NULL) {\n        if (0 != posix_memalign((void **)(&A), 64, POINTER_SIZE * list_size)) {\n            fprintf(stderr, \"Failed to allocate memory for %u KB test\\n\", size_kb);\n        }\n    } else {\n        A = (POINTER_INT *)preallocatedArr;\n    }\n\n    memset(A, 0, POINTER_SIZE * list_size);\n\n#ifdef __i686\n    if (!pageByPage) FillPatternArr(A, list_size, CACHELINE_SIZE);\n    else FillPageByPage(A, list_size, CACHELINE_SIZE);\n#else\n    if (!pageByPage) FillPatternArr64(A, list_size, CACHELINE_SIZE);\n    else FillPageByPage64(A, list_size, CACHELINE_SIZE);\n#endif\n\n    preplatencyarr(A, list_size);\n\n    uint32_t scaled_iterations = scale_iterations(size_kb, iterations);\n\n    // Run test\n    gettimeofday(&startTv, &startTz);\n    #ifdef LONGPATTERN\n    if (longpattern)\n        sum = longpatternlatencytest(scaled_iterations, A);\n    else\n        sum = latencytest(scaled_iterations, A);\n    #endif\n    gettimeofday(&endTv, &endTz);\n    uint64_t time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 1000);\n    float latency = 1e6 * (float)time_diff_ms / (float)scaled_iterations;\n    if (preallocatedArr == NULL) free(A);\n\n    // if (sum == 0) printf(\"sum == 0 (?)\\n\");\n    return latency;\n}\n#endif\n\n// Tries to isolate virtual to physical address translation latency by accessing\n// one element per page, and checking latency difference between that and hitting the same amount of \"hot\"\n// cachelines using a normal latency test. 
4 KB pages are assumed.\nfloat RunTlbTest(uint32_t size_kb, uint32_t iterations, uint32_t *preallocatedArr) {\n    struct timeval startTv, endTv;\n    struct timezone startTz, endTz;\n    uint32_t element_count = size_kb / 4;\n    uint32_t list_size = size_kb * 1024 / 4;\n    uint32_t sum = 0, current;\n\n    if (element_count == 0) element_count = 1;\n\n    //fprintf(stderr, \"Element count for size %u: %u\\n\", size_kb, element_count);\n\n    // create access pattern first, then fill it into the test array spaced by page size\n    uint32_t *pattern_arr = (uint32_t*)malloc(sizeof(uint32_t) * element_count);\n    if (!pattern_arr) {\n        fprintf(stderr, \"Failed to allocate memory for %u KB test (offset array)\\n\", size_kb);\n        return 0;\n    }\n\n    for (int i = 0; i < element_count; i++) {\n        pattern_arr[i] = i;\n    }\n\n    int iter = element_count;\n    while (iter > 1) {\n        iter -= 1;\n        int j = iter - 1 == 0 ? 0 : rand() % (iter - 1);\n        uint32_t tmp = pattern_arr[iter];\n        pattern_arr[iter] = pattern_arr[j];\n        pattern_arr[j] = tmp;\n    }\n\n    // translate offsets and fill the test array\n    // [offset-------page-------][offset-----page------....etc\n    uint32_t *A;\n    if (preallocatedArr == NULL) {\n        A = (uint32_t *)malloc(sizeof(uint32_t) * list_size);\n        if (!A) {\n            fprintf(stderr, \"Failed to allocate memory for %u KB test (pointer array)\\n\", size_kb);\n        }\n    } else {\n        A = preallocatedArr;\n    }\n\n    memset(A, INT_MAX, list_size); // catch any bad accesses immediately\n    int pageIncrement = PAGE_SIZE / sizeof(uint32_t);\n    for (int i = 0;i < element_count; i++) {\n        // offset each by i cachelines to avoid conflict misses. 
If we just use the first cacheline\n        // in each page, the index bits for every VIPT access will be the same and we'll run into L1D misses\n        // faster than we would like\n        int idx = i * pageIncrement + ((i * 16) & (pageIncrement - 1));\n        int target_idx = pattern_arr[i] * pageIncrement + ((pattern_arr[i] * 16) & (pageIncrement - 1));\n        A[idx] = target_idx;\n    }\n\n    free(pattern_arr);  // don't need this anymore\n\n    uint32_t scaled_iterations = scale_iterations(size_kb, iterations);\n\n    // Run test\n    gettimeofday(&startTv, &startTz);\n    current = A[0];\n    for (int i = 0; i < scaled_iterations; i++) {\n        current = A[current];\n        sum += current;\n        //if (size_kb == 48) fprintf(stderr, \"idx: %u\\n\", current);\n    }\n    gettimeofday(&endTv, &endTz);\n    uint64_t time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 1000);\n    float latency = 1e6 * (float)time_diff_ms / (float)scaled_iterations;\n    if (preallocatedArr == NULL) free(A);\n\n    if (element_count > 1 && sum == 0) printf(\"sum == 0 (?)\\n\");\n\n    // Get a reference timing for the size, to isolate TLB latency from cache latency\n    uint32_t memoryUsedKb = (element_count * CACHELINE_SIZE) / 1024;\n    if (memoryUsedKb == 0) memoryUsedKb = 1;\n    float cacheLatency = RunTest(memoryUsedKb, iterations, preallocatedArr);\n\n    //fprintf(stderr, \"Memory used - %u KB, latency: %f, ref latency: %f\\n\", memoryUsedKb, latency, cacheLatency);\n    return latency - cacheLatency;\n}\n\n// Run store to load forwarding test, as described in https://blog.stuffedcow.net/2014/01/x86-memory-disambiguation/\n// uses 4B loads and 8B stores to see when/if store forwarding can succeed when sizes are not matched\n// pageEnd = push test to the end of (pageEnd) sized page. 
0 = just test cacheline\n// loadDistance = how far ahead to push the load (for testing aliasing)\n// cannot set both pageEnd and loadDistance\nvoid RunStlfTest(uint32_t iterations, int mode, int pageEnd, int loadDistance) {\n    struct timeval startTv, endTv;\n    struct timezone startTz, endTz;\n    uint64_t time_diff_ms;\n    float latency;\n    float stlfResults[64][64];\n    char *arr; \n    char *allocArr;\n\n    // defaults: grab a couple of cachelines\n    int testAlignment = 64, testAllocSize = 128, testOffset = 0;\n\n    if (pageEnd != 0) {\n        testAlignment = pageEnd;\n        testAllocSize = pageEnd * 2;\n        testOffset = pageEnd - 64;\n    } else if (loadDistance != 0) {\n        testAlignment = 4096;\n        testAllocSize = loadDistance + 128; // enough if I ever go to avx-512 loads\n    }\n\n    // obtain a couple of cachelines, assuming 64B cacheline size\n#ifdef _WIN32\n    allocArr = (char *)_aligned_malloc(testAllocSize, testAlignment);\n    if (allocArr == NULL) {\n        fprintf(stderr, \"Could not obtain aligned memory\\n\");\n        return;\n    }\n#else\n    if (0 != posix_memalign((void **)(&allocArr), testAlignment, testAllocSize)) {\n        fprintf(stderr, \"Could not obtain aligned memory\\n\");\n        return;\n    }\n#endif\n\n    arr = allocArr + testOffset;\n\n    for (int storeOffset = 0; storeOffset < 64; storeOffset++)\n        for (int loadOffset = 0; loadOffset < 64; loadOffset++) {\n            ((uint32_t *)(arr))[0] = storeOffset;\n            ((uint32_t *)(arr))[1] = loadOffset + loadDistance;\n            gettimeofday(&startTv, &startTz);\n            stlfFunc(iterations, arr);\n            gettimeofday(&endTv, &endTz);\n            time_diff_ms = 1e6 * (endTv.tv_sec - startTv.tv_sec) + (endTv.tv_usec - startTv.tv_usec);\n            latency = 1e3 * (float) time_diff_ms / (float) iterations;\n            stlfResults[storeOffset][loadOffset] = latency;\n            fprintf(stderr, \"Store offset %d, load offset 
%d: %f ns\\n\", storeOffset, loadOffset, latency);\n        }\n\n    // output as CSV\n    for (int loadOffset = 0; loadOffset < 64; loadOffset++) printf(\",%d\", loadOffset);\n    printf(\"\\n\");\n    for (int storeOffset = 0; storeOffset < 64; storeOffset++) {\n        printf(\"%d\", storeOffset);\n        for (int loadOffset = 0; loadOffset < 64; loadOffset++) {\n            printf(\",%f\", stlfResults[storeOffset][loadOffset]);\n        }\n        printf(\"\\n\");\n    }\n#ifdef _WIN32\n    _aligned_free(allocArr);\n#else\n    free(allocArr);\n#endif\n    return;\n}\n"
  },
  {
    "path": "MemoryLatency/MemoryLatency.cpp",
    "content": "#include <stdio.h>\r\n#include <stdint.h>\r\n#include <stdlib.h>\r\n#ifdef __MINGW32__\r\n    #include <sys/timeb.h>\r\n#else\r\n    #include <sys\\timeb.h>\r\n#endif\r\n#include <math.h>\r\n#include <windows.h>\r\n#include <tchar.h>\r\n#include <intrin.h>\r\n\r\n#define ITERATIONS 400000000\r\n\r\nint default_test_sizes[36] = { 2, 4, 8, 12, 16, 24, 32, 48, 64, 96, 128, 192, 256, 512, 600, 768, 1024, 1536, 2048,\r\n                               3072, 4096, 5120, 6144, 8192, 10240, 12288, 16384, 24567, 32768, 65536, 98304,\r\n                               131072, 262144, 393216, 524288, 1048576 };\r\n\r\nfloat RunTest(uint32_t size_kb, uint64_t iterations, void *mem);\r\nfloat RunAsmTest(uint32_t size_kb, uint64_t iterations, void* mem);\r\nbool GetPrivilege();\r\n\r\nextern \"C\" void preplatencyarr(uint64_t * mem, uint64_t element_count);\r\nextern \"C\" uint64_t latencytest(uint64_t iterations, uint64_t *mem);\r\n\r\nint main(int argc, char* argv[]) {\r\n    void* arr = NULL;\r\n    int numa = 0, coreNode = 0, memNode = 0, largepages = 0;\r\n    for (int argIdx = 1; argIdx < argc; argIdx++) {\r\n        if (*(argv[argIdx]) == '-') {\r\n            char* arg = argv[argIdx] + 1;\r\n            if (_strnicmp(arg, \"hugepages\", 9) == 0) {\r\n                fprintf(stderr, \"Will attempt to use large pages\\n\");\r\n                largepages = 1;\r\n                GetPrivilege();\r\n            } else if (_strnicmp(arg, \"autonuma\", 8) == 0) {\r\n                fprintf(stderr, \"Testing NUMA, 1 GB test size\\n\");\r\n                numa = 1;\r\n            }\r\n            else if (_strnicmp(arg, \"numa\", 4) == 0) {\r\n                numa = 2;\r\n                argIdx++;\r\n                coreNode = atoi(argv[argIdx]);\r\n                argIdx++;\r\n                memNode = atoi(argv[argIdx]);\r\n                fprintf(stderr, \"Testing %d -> %d\\n\", coreNode, memNode);\r\n            }\r\n        }\r\n    }\r\n\r\n    DWORD 
allocationType = MEM_RESERVE | MEM_COMMIT;\r\n    if (largepages) allocationType |= MEM_LARGE_PAGES;\r\n\r\n    if (numa == 1) {\r\n        ULONG highestNumaNode;\r\n        DWORD nProcs;\r\n        SYSTEM_INFO SystemInfo;\r\n        GetSystemInfo(&SystemInfo);\r\n        nProcs = SystemInfo.dwNumberOfProcessors;\r\n        if (!GetNumaHighestNodeNumber(&highestNumaNode)) {\r\n            fprintf(stderr, \"Could not get highest NUMA node number: %d\\n\", GetLastError());\r\n            return 0;\r\n        }\r\n\r\n        // auto numa latency mode - use highest test size and test latency from core node to mem node\r\n        for (int coreNode = 0; coreNode <= highestNumaNode; coreNode++) printf(\",%d\", coreNode);\r\n        printf(\"\\n\");\r\n\r\n        for (int coreNode = 0; coreNode <= highestNumaNode; coreNode++) {\r\n            printf(\"%d\", coreNode);\r\n            for (int memNode = 0; memNode <= highestNumaNode; memNode++) {\r\n                ULONGLONG mask;\r\n                DWORD index;\r\n                arr = VirtualAllocExNuma(GetCurrentProcess(),\r\n                    NULL,\r\n                    default_test_sizes[(sizeof(default_test_sizes) / sizeof(int)) - 1] * 1024,\r\n                    allocationType,\r\n                    PAGE_READWRITE,\r\n                    memNode);\r\n                GetNumaNodeProcessorMask(coreNode, &mask);\r\n                BitScanReverse64(&index, mask);\r\n                mask = 0;\r\n                mask |= 1ULL << (ULONGLONG)index;\r\n                SetProcessAffinityMask(GetCurrentProcess(), mask);\r\n                float latency = RunAsmTest(1048576, ITERATIONS, arr);\r\n                printf(\",%f\", latency);\r\n                VirtualFree(arr, 0, MEM_RELEASE);\r\n            }\r\n\r\n            printf(\"\\n\");\r\n        }\r\n    } else {\r\n        if (numa == 2) {\r\n            ULONG highestNumaNode;\r\n            ULONGLONG mask;\r\n            DWORD nProcs, index;\r\n            
SYSTEM_INFO SystemInfo;\r\n            GetSystemInfo(&SystemInfo);\r\n            nProcs = SystemInfo.dwNumberOfProcessors;\r\n\r\n            GetNumaNodeProcessorMask(coreNode, &mask);\r\n            fprintf(stderr, \"node core mask: %llx\\n\", mask);\r\n            BitScanReverse64(&index, mask);\r\n            mask = 0;\r\n            mask |= 1ULL << (ULONGLONG)index;\r\n            SetProcessAffinityMask(GetCurrentProcess(), mask);\r\n            fprintf(stderr, \"core mask: %llx, index %u\\n\", mask, index);\r\n            arr = VirtualAllocExNuma(GetCurrentProcess(),\r\n                NULL,\r\n                default_test_sizes[(sizeof(default_test_sizes) / sizeof(int)) - 1] * 1024,\r\n                allocationType,\r\n                PAGE_READWRITE,\r\n                memNode);\r\n        }\r\n        else if (largepages) {\r\n            arr = VirtualAlloc(NULL, default_test_sizes[(sizeof(default_test_sizes) / sizeof(int)) - 1] * 1024, MEM_RESERVE | MEM_COMMIT | MEM_LARGE_PAGES, PAGE_READWRITE);\r\n            if (arr == NULL)\r\n            {\r\n                fprintf(stderr, \"Failed to get memory via VirtualAlloc: %d\\n\", GetLastError());\r\n                return -1;\r\n            }\r\n        }\r\n\r\n        printf(\"Region,Latency (ns)\\n\");\r\n        for (int i = 0; i < sizeof(default_test_sizes) / sizeof(int); i++)\r\n        {\r\n            printf(\"%d,%f\\n\", default_test_sizes[i], RunAsmTest(default_test_sizes[i], ITERATIONS, arr));\r\n        }\r\n    }\r\n\r\n    printf(\"If you didn't run this through cmd, now you can copy the results\");\r\n\r\n    return 0;\r\n}\r\n\r\n/// <summary>\r\n/// Heuristic to make sure test runs for enough time but not too long\r\n/// </summary>\r\n/// <param name=\"size_kb\">Region size</param>\r\n/// <param name=\"iterations\">base iterations</param>\r\n/// <returns>scaled iterations</returns>\r\nuint64_t scale_iterations(uint32_t size_kb, uint64_t iterations) {\r\n    return 10 * iterations / 
pow(size_kb, 1.0 / 4.0);\r\n}\r\n\r\nvoid FillPatternArr(uint32_t* pattern_arr, uint32_t list_size, uint32_t byte_increment) {\r\n    uint32_t increment = byte_increment / sizeof(uint32_t);\r\n    uint32_t element_count = list_size / increment;\r\n    for (int i = 0; i < element_count; i++) {\r\n        pattern_arr[i * increment] = i * increment;\r\n    }\r\n\r\n    int iter = element_count;\r\n    while (iter > 1) {\r\n        iter -= 1;\r\n        int j = iter - 1 == 0 ? 0 : rand() % (iter - 1);\r\n        uint32_t tmp = pattern_arr[iter * increment];\r\n        pattern_arr[iter * increment] = pattern_arr[j * increment];\r\n        pattern_arr[j * increment] = tmp;\r\n    }\r\n}\r\n\r\nvoid FillPatternArr64(uint64_t* pattern_arr, uint64_t list_size, uint64_t byte_increment) {\r\n    uint32_t increment = byte_increment / sizeof(uint64_t);\r\n    uint32_t element_count = list_size / increment;\r\n    for (int i = 0; i < element_count; i++) {\r\n        pattern_arr[i * increment] = i * increment;\r\n    }\r\n\r\n    int iter = element_count;\r\n    while (iter > 1) {\r\n        iter -= 1;\r\n        int j = iter - 1 == 0 ? 
0 : rand() % (iter - 1);\r\n        uint64_t tmp = pattern_arr[iter * increment];\r\n        pattern_arr[iter * increment] = pattern_arr[j * increment];\r\n        pattern_arr[j * increment] = tmp;\r\n    }\r\n}\r\n\r\nfloat RunAsmTest(uint32_t size_kb, uint64_t iterations, void* mem) {\r\n    struct timeb start, end;\r\n    uint32_t list_size = size_kb * 1024 / sizeof(void *);\r\n\r\n    uint64_t* A;\r\n    if (mem == NULL) {\r\n        A = (uint64_t *)malloc(size_kb * 1024);\r\n    }\r\n    else {\r\n        A = (uint64_t *)mem;\r\n    }\r\n\r\n    memset(A, 0, 1024 * size_kb);\r\n    FillPatternArr64(A, size_kb * 1024 / sizeof(uint64_t), 64);\r\n    preplatencyarr(A, size_kb * 1024 / sizeof(uint64_t));\r\n    uint64_t scaled_iterations = scale_iterations(size_kb, iterations);\r\n\r\n    ftime(&start);\r\n    uint64_t sum = latencytest(scaled_iterations, A);\r\n    ftime(&end);\r\n    int64_t time_diff_ms = 1000 * (end.time - start.time) + (end.millitm - start.millitm);\r\n    float latency = 1e6 * (float)time_diff_ms / (float)scaled_iterations;\r\n    if (mem == NULL) free(A);\r\n\r\n    if (sum == 0) printf(\"sum == 0 (???)\\n\");\r\n    return latency;\r\n}\r\n\r\nfloat RunTest(uint32_t size_kb, uint64_t iterations, void *mem) {\r\n    struct timeb start, end;\r\n    uint32_t list_size = size_kb * 1024 / 4;\r\n    uint32_t sum = 0, current;\r\n\r\n    // Fill list to create random access pattern\r\n    int* A;\r\n    if (mem == NULL) {\r\n        A = (int*)malloc(sizeof(int) * list_size);\r\n    } else {\r\n        A = (int*)mem;\r\n    }\r\n\r\n    for (int i = 0; i < list_size; i++) {\r\n        A[i] = i;\r\n    }\r\n\r\n    int iter = list_size;\r\n    while (iter > 1) {\r\n        iter -= 1;\r\n        int j = iter - 1 == 0 ? 
0 : rand() % (iter - 1);\r\n        uint32_t tmp = A[iter];\r\n        A[iter] = A[j];\r\n        A[j] = tmp;\r\n    }\r\n\r\n    uint64_t scaled_iterations = scale_iterations(size_kb, iterations);\r\n\r\n    // Run test\r\n    ftime(&start);\r\n    current = A[0];\r\n    for (int i = 0; i < scaled_iterations; i++) {\r\n        current = A[current];\r\n        sum += current;\r\n    }\r\n    ftime(&end);\r\n    int64_t time_diff_ms = 1000 * (end.time - start.time) + (end.millitm - start.millitm);\r\n    float latency = 1e6 * (float)time_diff_ms / (float)scaled_iterations;\r\n    if (mem == NULL) free(A);\r\n\r\n    if (sum == 0) printf(\"sum == 0 (???)\\n\");\r\n    return latency;\r\n}\r\n\r\nbool GetPrivilege()\r\n{\r\n    HANDLE           hToken;\r\n    TOKEN_PRIVILEGES tp;\r\n    BOOL             status;\r\n    DWORD            error;\r\n\r\n    // open process token\r\n    if (!OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &hToken))\r\n    {\r\n        fprintf(stderr, \"OpenProcessToken failed: %d\\n\", GetLastError());\r\n        return false;\r\n    }\r\n\r\n    // get the luid\r\n    if (!LookupPrivilegeValue(NULL, TEXT(\"SeLockMemoryPrivilege\"), &tp.Privileges[0].Luid))\r\n    {\r\n        fprintf(stderr, \"Could not get luid: %d\\n\", GetLastError());\r\n        return false;\r\n    }\r\n\r\n    // enable privilege\r\n    tp.PrivilegeCount = 1;\r\n    tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;\r\n    status = AdjustTokenPrivileges(hToken, FALSE, &tp, 0, (PTOKEN_PRIVILEGES)NULL, 0);\r\n\r\n    // It is possible for AdjustTokenPrivileges to return TRUE and still not succeed.\r\n    // So always check for the last error value.\r\n    error = GetLastError();\r\n    if (!status || (error != ERROR_SUCCESS))\r\n    {\r\n        fprintf(stderr, \"AdjustTokenPrivileges failed with status %d, error %d\\n\", status, error);\r\n        return false;\r\n    }\r\n\r\n    // close the handle\r\n    if (!CloseHandle(hToken))\r\n   
 {\r\n        fprintf(stderr, \"CloseHandle failed: %d\\n\", GetLastError());\r\n        return false;\r\n    }\r\n\r\n    fprintf(stderr, \"Got SeLockMemoryPrivilege\\n\");\r\n}\r\n"
  },
  {
    "path": "MemoryLatency/MemoryLatency.sln",
    "content": "﻿\r\nMicrosoft Visual Studio Solution File, Format Version 12.00\r\n# Visual Studio Version 16\r\nVisualStudioVersion = 16.0.31229.75\r\nMinimumVisualStudioVersion = 10.0.40219.1\r\nProject(\"{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}\") = \"MemoryLatency\", \"MemoryLatency.vcxproj\", \"{3A98A230-A87B-432D-931D-369872DE24AF}\"\r\nEndProject\r\nGlobal\r\n\tGlobalSection(SolutionConfigurationPlatforms) = preSolution\r\n\t\tDebug|x64 = Debug|x64\r\n\t\tDebug|x86 = Debug|x86\r\n\t\tRelease|x64 = Release|x64\r\n\t\tRelease|x86 = Release|x86\r\n\tEndGlobalSection\r\n\tGlobalSection(ProjectConfigurationPlatforms) = postSolution\r\n\t\t{3A98A230-A87B-432D-931D-369872DE24AF}.Debug|x64.ActiveCfg = Debug|x64\r\n\t\t{3A98A230-A87B-432D-931D-369872DE24AF}.Debug|x64.Build.0 = Debug|x64\r\n\t\t{3A98A230-A87B-432D-931D-369872DE24AF}.Debug|x86.ActiveCfg = Debug|Win32\r\n\t\t{3A98A230-A87B-432D-931D-369872DE24AF}.Debug|x86.Build.0 = Debug|Win32\r\n\t\t{3A98A230-A87B-432D-931D-369872DE24AF}.Release|x64.ActiveCfg = Release|x64\r\n\t\t{3A98A230-A87B-432D-931D-369872DE24AF}.Release|x64.Build.0 = Release|x64\r\n\t\t{3A98A230-A87B-432D-931D-369872DE24AF}.Release|x86.ActiveCfg = Release|Win32\r\n\t\t{3A98A230-A87B-432D-931D-369872DE24AF}.Release|x86.Build.0 = Release|Win32\r\n\tEndGlobalSection\r\n\tGlobalSection(SolutionProperties) = preSolution\r\n\t\tHideSolutionNode = FALSE\r\n\tEndGlobalSection\r\n\tGlobalSection(ExtensibilityGlobals) = postSolution\r\n\t\tSolutionGuid = {F2D00DD2-A22B-4A3C-A2FF-9CE8CF9070D1}\r\n\tEndGlobalSection\r\nEndGlobal\r\n"
  },
  {
    "path": "MemoryLatency/MemoryLatency.vcxproj",
    "content": "<?xml version=\"1.0\" encoding=\"utf-8\"?>\r\n<Project DefaultTargets=\"Build\" xmlns=\"http://schemas.microsoft.com/developer/msbuild/2003\">\r\n  <ItemGroup Label=\"ProjectConfigurations\">\r\n    <ProjectConfiguration Include=\"Debug|Win32\">\r\n      <Configuration>Debug</Configuration>\r\n      <Platform>Win32</Platform>\r\n    </ProjectConfiguration>\r\n    <ProjectConfiguration Include=\"Release|Win32\">\r\n      <Configuration>Release</Configuration>\r\n      <Platform>Win32</Platform>\r\n    </ProjectConfiguration>\r\n    <ProjectConfiguration Include=\"Debug|x64\">\r\n      <Configuration>Debug</Configuration>\r\n      <Platform>x64</Platform>\r\n    </ProjectConfiguration>\r\n    <ProjectConfiguration Include=\"Release|x64\">\r\n      <Configuration>Release</Configuration>\r\n      <Platform>x64</Platform>\r\n    </ProjectConfiguration>\r\n  </ItemGroup>\r\n  <PropertyGroup Label=\"Globals\">\r\n    <VCProjectVersion>16.0</VCProjectVersion>\r\n    <Keyword>Win32Proj</Keyword>\r\n    <ProjectGuid>{3a98a230-a87b-432d-931d-369872de24af}</ProjectGuid>\r\n    <RootNamespace>MemoryLatency</RootNamespace>\r\n    <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>\r\n  </PropertyGroup>\r\n  <Import Project=\"$(VCTargetsPath)\\Microsoft.Cpp.Default.props\" />\r\n  <PropertyGroup Condition=\"'$(Configuration)|$(Platform)'=='Debug|Win32'\" Label=\"Configuration\">\r\n    <ConfigurationType>Application</ConfigurationType>\r\n    <UseDebugLibraries>true</UseDebugLibraries>\r\n    <PlatformToolset>v142</PlatformToolset>\r\n    <CharacterSet>Unicode</CharacterSet>\r\n  </PropertyGroup>\r\n  <PropertyGroup Condition=\"'$(Configuration)|$(Platform)'=='Release|Win32'\" Label=\"Configuration\">\r\n    <ConfigurationType>Application</ConfigurationType>\r\n    <UseDebugLibraries>false</UseDebugLibraries>\r\n    <PlatformToolset>v142</PlatformToolset>\r\n    <WholeProgramOptimization>true</WholeProgramOptimization>\r\n    
<CharacterSet>Unicode</CharacterSet>\r\n  </PropertyGroup>\r\n  <PropertyGroup Condition=\"'$(Configuration)|$(Platform)'=='Debug|x64'\" Label=\"Configuration\">\r\n    <ConfigurationType>Application</ConfigurationType>\r\n    <UseDebugLibraries>true</UseDebugLibraries>\r\n    <PlatformToolset>v142</PlatformToolset>\r\n    <CharacterSet>Unicode</CharacterSet>\r\n  </PropertyGroup>\r\n  <PropertyGroup Condition=\"'$(Configuration)|$(Platform)'=='Release|x64'\" Label=\"Configuration\">\r\n    <ConfigurationType>Application</ConfigurationType>\r\n    <UseDebugLibraries>false</UseDebugLibraries>\r\n    <PlatformToolset>v142</PlatformToolset>\r\n    <WholeProgramOptimization>true</WholeProgramOptimization>\r\n    <CharacterSet>Unicode</CharacterSet>\r\n  </PropertyGroup>\r\n  <Import Project=\"$(VCTargetsPath)\\Microsoft.Cpp.props\" />\r\n  <ImportGroup Label=\"ExtensionSettings\">\r\n  </ImportGroup>\r\n  <ImportGroup Label=\"Shared\">\r\n  </ImportGroup>\r\n  <ImportGroup Label=\"PropertySheets\" Condition=\"'$(Configuration)|$(Platform)'=='Debug|Win32'\">\r\n    <Import Project=\"$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props\" Condition=\"exists('$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props')\" Label=\"LocalAppDataPlatform\" />\r\n  </ImportGroup>\r\n  <ImportGroup Label=\"PropertySheets\" Condition=\"'$(Configuration)|$(Platform)'=='Release|Win32'\">\r\n    <Import Project=\"$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props\" Condition=\"exists('$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props')\" Label=\"LocalAppDataPlatform\" />\r\n  </ImportGroup>\r\n  <ImportGroup Label=\"PropertySheets\" Condition=\"'$(Configuration)|$(Platform)'=='Debug|x64'\">\r\n    <Import Project=\"$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props\" Condition=\"exists('$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props')\" Label=\"LocalAppDataPlatform\" />\r\n  </ImportGroup>\r\n  <ImportGroup Label=\"PropertySheets\" 
Condition=\"'$(Configuration)|$(Platform)'=='Release|x64'\">\r\n    <Import Project=\"$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props\" Condition=\"exists('$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props')\" Label=\"LocalAppDataPlatform\" />\r\n  </ImportGroup>\r\n  <PropertyGroup Label=\"UserMacros\" />\r\n  <PropertyGroup Condition=\"'$(Configuration)|$(Platform)'=='Debug|Win32'\">\r\n    <LinkIncremental>true</LinkIncremental>\r\n  </PropertyGroup>\r\n  <PropertyGroup Condition=\"'$(Configuration)|$(Platform)'=='Release|Win32'\">\r\n    <LinkIncremental>false</LinkIncremental>\r\n  </PropertyGroup>\r\n  <PropertyGroup Condition=\"'$(Configuration)|$(Platform)'=='Debug|x64'\">\r\n    <LinkIncremental>true</LinkIncremental>\r\n  </PropertyGroup>\r\n  <PropertyGroup Condition=\"'$(Configuration)|$(Platform)'=='Release|x64'\">\r\n    <LinkIncremental>false</LinkIncremental>\r\n  </PropertyGroup>\r\n  <ItemDefinitionGroup Condition=\"'$(Configuration)|$(Platform)'=='Debug|Win32'\">\r\n    <ClCompile>\r\n      <WarningLevel>Level3</WarningLevel>\r\n      <SDLCheck>true</SDLCheck>\r\n      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>\r\n      <ConformanceMode>true</ConformanceMode>\r\n    </ClCompile>\r\n    <Link>\r\n      <SubSystem>Console</SubSystem>\r\n      <GenerateDebugInformation>true</GenerateDebugInformation>\r\n    </Link>\r\n  </ItemDefinitionGroup>\r\n  <ItemDefinitionGroup Condition=\"'$(Configuration)|$(Platform)'=='Release|Win32'\">\r\n    <ClCompile>\r\n      <WarningLevel>Level3</WarningLevel>\r\n      <FunctionLevelLinking>true</FunctionLevelLinking>\r\n      <IntrinsicFunctions>true</IntrinsicFunctions>\r\n      <SDLCheck>true</SDLCheck>\r\n      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>\r\n      <ConformanceMode>true</ConformanceMode>\r\n    </ClCompile>\r\n    <Link>\r\n      <SubSystem>Console</SubSystem>\r\n      
<EnableCOMDATFolding>true</EnableCOMDATFolding>\r\n      <OptimizeReferences>true</OptimizeReferences>\r\n      <GenerateDebugInformation>true</GenerateDebugInformation>\r\n    </Link>\r\n  </ItemDefinitionGroup>\r\n  <ItemDefinitionGroup Condition=\"'$(Configuration)|$(Platform)'=='Debug|x64'\">\r\n    <ClCompile>\r\n      <WarningLevel>Level3</WarningLevel>\r\n      <SDLCheck>true</SDLCheck>\r\n      <PreprocessorDefinitions>_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>\r\n      <ConformanceMode>true</ConformanceMode>\r\n    </ClCompile>\r\n    <Link>\r\n      <SubSystem>Console</SubSystem>\r\n      <GenerateDebugInformation>true</GenerateDebugInformation>\r\n    </Link>\r\n  </ItemDefinitionGroup>\r\n  <ItemDefinitionGroup Condition=\"'$(Configuration)|$(Platform)'=='Release|x64'\">\r\n    <ClCompile>\r\n      <WarningLevel>Level3</WarningLevel>\r\n      <FunctionLevelLinking>true</FunctionLevelLinking>\r\n      <IntrinsicFunctions>true</IntrinsicFunctions>\r\n      <SDLCheck>true</SDLCheck>\r\n      <PreprocessorDefinitions>NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>\r\n      <ConformanceMode>true</ConformanceMode>\r\n    </ClCompile>\r\n    <Link>\r\n      <SubSystem>Console</SubSystem>\r\n      <EnableCOMDATFolding>true</EnableCOMDATFolding>\r\n      <OptimizeReferences>true</OptimizeReferences>\r\n      <GenerateDebugInformation>true</GenerateDebugInformation>\r\n    </Link>\r\n  </ItemDefinitionGroup>\r\n  <ItemGroup>\r\n    <ClCompile Include=\"MemoryLatency.cpp\" />\r\n  </ItemGroup>\r\n  <ItemGroup>\r\n    <CustomBuild Include=\"MemoryLatencyFunctions.asm\">\r\n      <ExcludedFromBuild Condition=\"'$(Configuration)|$(Platform)'=='Release|x64'\">false</ExcludedFromBuild>\r\n      <FileType>Document</FileType>\r\n      <Command Condition=\"'$(Configuration)|$(Platform)'=='Release|x64'\">nasm -f win64 MemoryLatencyFunctions.asm</Command>\r\n      <Outputs 
Condition=\"'$(Configuration)|$(Platform)'=='Release|x64'\">MemoryLatencyFunctions.obj</Outputs>\r\n    </CustomBuild>\r\n  </ItemGroup>\r\n  <Import Project=\"$(VCTargetsPath)\\Microsoft.Cpp.targets\" />\r\n  <ImportGroup Label=\"ExtensionTargets\">\r\n  </ImportGroup>\r\n</Project>\n"
  },
  {
    "path": "MemoryLatency/MemoryLatencyFunctions.asm",
    "content": "section .text\r\nbits 64\r\n\r\nglobal preplatencyarr\r\nglobal latencytest\r\n\r\npreplatencyarr:\r\n  push r15\r\n  push r14\r\n  xor r15, r15 ; array index\r\npreplatencyarr_loop:\r\n  mov r14, [rcx + r15 * 8]\r\n  lea r14, [rcx + r14 * 8]\r\n  mov [rcx + r15 * 8], r14\r\n  inc r15\r\n  cmp rdx, r15\r\n  jne preplatencyarr_loop\r\n  pop r14\r\n  pop r15\r\n  ret\r\n\r\nlatencytest:\r\n  push r15\r\n  mov r15, [rdx]\r\n  xor rax, rax\r\nlatencytest_loop:\r\n  mov r15, [r15]\r\n  add rax, r15\r\n  dec rcx\r\n  jnz latencytest_loop\r\n  pop r15\r\n  ret\n"
  },
  {
    "path": "MemoryLatency/MemoryLatency_arm.s",
    "content": ".text\n\n.global latencytest\n.global longpatternlatencytest\n.global preplatencyarr\n.global stlftest\n.global stlftest32\n.global stlftest128\n.global matchedstlftest\n\n.global _latencytest\n.global _longpatternlatencytest\n.global _preplatencyarr\n.global _stlftest\n.global _stlftest32\n.global _stlftest128\n.global _matchedstlftest\n\n.balign 4\n\n/* x0 = ptr to arr\n   x1 = arr len\n   convert values in array from array indexes to pointers */\n_preplatencyarr:\npreplatencyarr:\n  sub sp, sp, #0x20\n  stp x14, x15, [sp, #0x10]\n  mov x15, 0\npreplatencyarr_loop:\n  ldr x14, [x0, w15, uxtw #3]\n  lsl x14, x14, 3\n  add x14, x14, x0\n  str x14, [x0, w15, uxtw #3]\n  add w15, w15, 1\n  cmp x15, x1\n  b.ne preplatencyarr_loop\n  ldp x14, x15, [sp, #0x10]\n  add sp, sp, #0x20\n  ret\n\n/* x0 = iteration count\n   x1 = ptr to arr\n   do pointer chasing for specified iteration count */\n_latencytest:\nlatencytest:\n  sub sp, sp, #0x20\n  stp x14, x15, [sp, #0x10]\n  mov x14, 0\n  ldr x15, [x1]\nlatencytest_loop:\n  ldr x15, [x15]\n  add x14, x14, x15\n  sub x0, x0, 1\n  cbnz x0, latencytest_loop\n  mov x0, x14\n  ldp x14, x15, [sp, #0x10]\n  add sp, sp, #0x20\n  ret\n\n/* x0 = iteration count\n   x1 = ptr to arr\n   do pointer chasing with longer pattern, given different patterns\n   within each cacheline */\n_longpatternlatencytest:\nlongpatternlatencytest:\n  sub sp, sp, #0x50\n  stp x14, x15, [sp, #0x10]\n  stp x12, x13, [sp, #0x20]\n  stp x10, x11, [sp, #0x30]\n  stp x8, x9, [sp, #0x40]\n  mov x14, 0\n  ldr x15, [x1]\n  mov x12, 63    /* mask for offset into cacheline */\n  mvn x13, x12   /* mask for cacheline address comparison */\n  and x10, x13, x1  /* x10 = cacheline address of first element */\nlongpatternlatencytest_loop:\n  mov x9, x15\n  ldr x15, [x15]\n\n  /* if we're back at the first cacheline */\n  and x11, x13, x15\n  cmp x11, x10\n  b.ne longpatternlatencytest_loop_inc\n  add x14, x14, 8\n  and x14, x14, x12\n  and x15, x15, x13\n  
add x15, x15, x14  /* move to the next element within that cacheline */\nlongpatternlatencytest_loop_inc:\n  sub x0, x0, 1\n  cbnz x0, longpatternlatencytest_loop\n  mov x0, x14\n  ldp x8, x9, [sp, #0x40]\n  ldp x10, x11, [sp, #0x30]\n  ldp x12, x13, [sp, #0x20]\n  ldp x14, x15, [sp, #0x10]\n  add sp, sp, #0x50\n  ret\n\n\n/* x0 = iteration count\n   x1 = ptr to arr. first 32-bit int = store offset, second = load offset */\n_stlftest:\nstlftest:\n  sub sp, sp, #0x40\n  stp x14, x15, [sp, #0x10]\n  stp x12, x13, [sp, #0x20]  /* x12 = store ptr, x13 = load ptr */\n  ldr x15, [x1]\n  ldr w12, [x1]\n  ldr w13, [x1, 4]\n  add x12, x12, x1\n  add x13, x13, x1\nstlftest_loop:\n  str x15, [x12]\n  ldr w15, [x13]\n  str x15, [x12]\n  ldr w15, [x13]\n  str x15, [x12]\n  ldr w15, [x13]\n  str x15, [x12]\n  ldr w15, [x13]\n  str x15, [x12]\n  ldr w15, [x13]\n  sub x0, x0, 5\n  cmp x0, 0\n  b.gt stlftest_loop\n  ldp x12, x13, [sp, #0x10]\n  ldp x14, x15, [sp, #0x10]\n  add sp, sp, #0x40\n  ret\n\n_stlftest32:\nstlftest32:\n  sub sp, sp, #0x40\n  stp x14, x15, [sp, #0x10]\n  stp x12, x13, [sp, #0x20]  /* x12 = store ptr, x13 = load ptr */\n  ldr x15, [x1]\n  ldr w12, [x1]\n  ldr w13, [x1, 4]\n  add x12, x12, x1\n  add x13, x13, x1\nstlftest32_loop:\n  str w15, [x12]\n  ldrh w15, [x13]\n  str w15, [x12]\n  ldrh w15, [x13]\n  str w15, [x12]\n  ldrh w15, [x13]\n  str w15, [x12]\n  ldrh w15, [x13]\n  str w15, [x12]\n  ldrh w15, [x13]\n  sub x0, x0, 5\n  cmp x0, 0\n  b.gt stlftest32_loop\n  ldp x12, x13, [sp, #0x10]\n  ldp x14, x15, [sp, #0x10]\n  add sp, sp, #0x40\n  ret\n\n/* x0 = iteration count\n   x1 = ptr to arr. 
first 32-bit int = store offset, second = load offset */\n_stlftest128:\nstlftest128:\n  sub sp, sp, #0x40\n  stp x14, x15, [sp, #0x10]\n  stp x12, x13, [sp, #0x20]  /* x12 = store ptr, x13 = load ptr */\n  ldr x15, [x1]\n  ldr w12, [x1]\n  ldr w13, [x1, 4]\n  add x12, x12, x1\n  add x13, x13, x1\nstlftest128_loop:\n  str q15, [x12]\n  ldr d15, [x13]\n  str q15, [x12]\n  ldr d15, [x13]\n  str q15, [x12]\n  ldr d15, [x13]\n  str q15, [x12]\n  ldr d15, [x13]\n  str q15, [x12]\n  ldr d15, [x13]\n  sub x0, x0, 5\n  cmp x0, 0\n  b.gt stlftest128_loop\n  ldp x12, x13, [sp, #0x10]\n  ldp x14, x15, [sp, #0x10]\n  add sp, sp, #0x40\n  ret\n\n_matchedstlftest:\nmatchedstlftest:\n  sub sp, sp, #0x40\n  stp x14, x15, [sp, #0x10]\n  stp x12, x13, [sp, #0x20]  /* x12 = store ptr, x13 = load ptr */\n  ldr x15, [x1]\n  ldr w12, [x1]\n  ldr w13, [x1, 4]\n  add x12, x12, x1\n  add x13, x13, x1\nmatchedstlftest_loop:\n  str x15, [x12]\n  ldr x15, [x13]\n  str x15, [x12]\n  ldr x15, [x13]\n  str x15, [x12]\n  ldr x15, [x13]\n  str x15, [x12]\n  ldr x15, [x13]\n  str x15, [x12]\n  ldr x15, [x13]\n  sub x0, x0, 5\n  cmp x0, 0\n  b.gt matchedstlftest_loop\n  ldp x12, x13, [sp, #0x10]\n  ldp x14, x15, [sp, #0x10]\n  add sp, sp, #0x40\n  ret\n"
  },
  {
    "path": "MemoryLatency/MemoryLatency_i686.s",
    "content": ".text\n\n.global @latencytest@8\n.global @preplatencyarr@8\n.global @stlftest@8\n.global @matchedstlftest@8\n.global latencytest\n.global preplatencyarr\n.global stlftest\n.global matchedstlftest\n\n/* fastcall specified in source file, so\n   ecx = ptr to arr\n   edx = arr len\n   convert values in array from array indexes to pointers\n   there has to be a way to make C do this but high level\n   programming languages suck and make simple things harder than they should be\n*/\npreplatencyarr:\n@preplatencyarr@8:\n  push %eax\n  push %esi\n  xor %esi, %esi    /* esi = array index */\npreplatencyarr_loop:\n  mov (%ecx,%esi,4), %eax   /* load target array index into eax */\n  lea (%ecx,%eax,4), %eax   /* calculate target address -> eax */\n  mov %eax, (%ecx,%esi,4)   /* replace array index with target address */\n  inc %esi\n  cmp %esi, %edx\n  jne preplatencyarr_loop\n  pop %esi\n  pop %eax\n  ret\n\n/* ecx = iterations\n   edx = ptr to arr\n   do pointer chasing for specified iteration count\n*/\nlatencytest:\n@latencytest@8:\n  push %esi\n  mov (%edx), %esi\n  xor %eax, %eax\nlatencytest_loop:\n  mov (%esi), %esi\n  add %esi, %eax\n  dec %ecx\n  jnz latencytest_loop\n  pop %esi\n  ret\n\n/* ecx = iterations\n   edx = ptr to array. 
first two 32-bit ints in array are store and load offsets respectively\n   mismatch load and store sizes by using 16-bit loads and 32-bit stores\n*/\nstlftest:\n@stlftest@8:\n  push %esi\n  push %edi\n  mov (%edx), %eax   /* just get some value into rax (store value */\n  mov (%edx), %esi\n  mov 4(%edx), %edi\n  add %edx, %esi     /* esi = store ptr */\n  add %edx, %edi     /* edi = load ptr */\nstlftest_loop:\n  mov %eax, (%esi)   /* 32-bit store */\n  mov (%edi), %ax   /* 16-bit load that possibly gets forwarded result */\n  mov %eax, (%esi)\n  mov (%edi), %ax\n  mov %eax, (%esi)\n  mov (%edi), %ax\n  mov %eax, (%esi)\n  mov (%edi), %ax\n  mov %eax, (%esi)\n  mov (%edi), %ax\n  sub $5, %ecx\n  jg stlftest_loop\n  pop %edi\n  pop %esi\n  ret\n\nmatchedstlftest:\n@matchedstlftest@8:\n  push %esi\n  push %edi\n  mov (%edx), %eax   /* just get some value into rax (store value */\n  mov (%edx), %esi\n  mov 4(%edx), %edi\n  add %edx, %esi     /* esi = store ptr */\n  add %edx, %edi     /* edi = load ptr */\nmatchedstlftest_loop:\n  mov %eax, (%esi)\n  mov (%edi), %eax\n  mov %eax, (%esi)\n  mov (%edi), %eax\n  mov %eax, (%esi)\n  mov (%edi), %eax\n  mov %eax, (%esi)\n  mov (%edi), %eax\n  mov %eax, (%esi)\n  mov (%edi), %eax\n  sub $5, %ecx\n  jg matchedstlftest_loop\n  pop %edi\n  pop %esi\n  ret\n"
  },
  {
    "path": "MemoryLatency/MemoryLatency_riscv.s",
    "content": ".text\n\n.global latencytest\n.global preplatencyarr\n.global stlftest\n.global stlftest32\n.global stlftest128\n.global matchedstlftest\n\n/* x10 = ptr to arr\n   x11 = arr len\n   x5-x7, x28-31 are temporaries\n   convert values in array from array indexes to pointers */\npreplatencyarr:\n  li x7, 0   /* index */\n  mv x5, x10 /* x5 = pointer into array */\npreplatencyarr_loop:\n  ld x28, (x5)  /* x28 = index into array to translate */\n  slli x28, x28, 3\n  add x28, x28, x10\n  sd x28, (x5)\n  addi x5, x5, 8\n  addi x7, x7, 1\n  blt x7, x11, preplatencyarr_loop \n  ret\n\n/* x10 = iteration count\n   x11 = ptr to arr\n   do pointer chasing for specified iteration count */\nlatencytest:\n  li x7, 0 /* iteration count */\n  mv x5, x11\n  mv x6, x11\n  addi x6, x6, 64\n  li x28, 0\n  li x29, 0\nlatencytest_loop:\n  ld x5, (x5)\n  addi x7, x7, 1\n  blt x7, x10, latencytest_loop\n  mv x10, x5\n  ret\n\n/* a0 = iteration count\n   a1 = ptr to arr. first 32-bit int = store offset, second = load offset */\nstlftest:\n  lw t0, (a1)\n  lw t1, 4(a1)\n  add t0, t0, a1\n  add t1, t1, a1\n  mv t2, a0\n  mv t3, x0\nstlftest_loop:\n  sd t2, (t0)\n  lw t2, (t1)\n  sd t2, (t0)\n  lw t2, (t1) \n  sd t2, (t0)\n  lw t2, (t1) \n  sd t2, (t0)\n  lw t2, (t1) \n  sd t2, (t0)\n  lw t2, (t1) \n  addi t3, t3, 5\n  blt t3, a0, stlftest_loop\n  ret\n\nstlftest32:\n  lw t0, (a1)\n  lw t1, 4(a1)\n  add t0, t0, a1\n  add t1, t1, a1\n  mv t3, x0\nstlftest32_loop:\n  sw t2, (t0)\n  lh t2, (t1)\n  sw t2, (t0)\n  lh t2, (t1) \n  sw t2, (t0)\n  lh t2, (t1) \n  sw t2, (t0)\n  lh t2, (t1) \n  sw t2, (t0)\n  lh t2, (t1)\n  addi t3, t3, 5\n  blt t3, a0, stlftest32_loop\n  ret\n\n/* since I'm only dealing with C910 and I know the vec len is 128... */\nstlftest128:\n  mv t4, x0\n  addi t4, t4, 32     /* ??? */\n  vsetvli t0, t4, e8  /* request vector length of some big value? 
always gives back 4 but not if we ask for 4*/\n  lw t0, (a1)\n  lw t1, 4(a1)\n  add t0, t0, a1\n  add t1, t1, a1\n  mv t3, x0\nstlftest128_loop:\n  vsw.v v0, (t0)\n  fld f0, (t1)\n  vsw.v v0, (t0)\n  fld f0, (t1) \n  vsw.v v0, (t0)\n  fld f0, (t1) \n  vsw.v v0, (t0)\n  fld f0, (t1) \n  vsw.v v0, (t0)\n  fld f0, (t1)\n  addi t3, t3, 5\n  blt t3, a0, stlftest128_loop\n  ret\n\nmatchedstlftest:\n  lw t0, (a1)\n  lw t1, 4(a1)\n  add t0, t0, a1\n  add t1, t1, a1\n  mv t2, a0\n  mv t3, x0\nmatchedstlftest_loop:\n  sd t2, (t0)\n  ld t2, (t1)\n  sd t2, (t0)\n  ld t2, (t1) \n  sd t2, (t0)\n  ld t2, (t1) \n  sd t2, (t0)\n  ld t2, (t1) \n  sd t2, (t0)\n  ld t2, (t1) \n  addi t3, t3, 5\n  blt t3, a0, matchedstlftest_loop\n  ret\n"
  },
  {
    "path": "MemoryLatency/MemoryLatency_x86.s",
    "content": ".text\n\n.global latencytest\n.global longpatternlatencytest\n.global preplatencyarr\n.global stlftest\n.global stlftest32\n.global stlftest128\n.global matchedstlftest\n\n/* ms_abi specified in source file, so\n   rcx = ptr to arr\n   rdx = arr len\n   convert values in array from array indexes to pointers\n   there has to be a way to make C do this but high level\n   programming languages suck and make simple things harder than they should be\n*/\npreplatencyarr:\n  push %r15\n  push %r14\n  xor %r15, %r15    /* r15 = array index */\npreplatencyarr_loop:\n  mov (%rcx,%r15,8), %r14\n  lea (%rcx,%r14,8), %r14\n  mov %r14, (%rcx,%r15,8)\n  inc %r15\n  cmp %r15, %rdx\n  jne preplatencyarr_loop\n  pop %r14\n  pop %r15\n  ret\n\nlatencytest:\n  push %r15\n  mov (%rdx), %r15\n  xor %rax, %rax\nlatencytest_loop:\n  mov (%r15), %r15\n  add %r15, %rax\n  dec %rcx\n  jnz latencytest_loop\n  pop %r15\n  ret\n\n/* rcx = iterations\n   rdx = ptr to arr\n   do pointer chasing for specified iteration count\n*/\nlongpatternlatencytest:\n  push %r15\n  push %r14\n  push %r13\n  push %r12\n  push %rbx\n  mov (%rdx), %r15\n  xor %rax, %rax   /* rax = index into cacheline */\n\n  /* r14 = 64B aligned start address */\n  mov %rdx, %r14\n  mov $63, %r13\n  not %r13         /* r13 = mask for 64B cacheline addr */\n  and %r13, %r14\nlongpatternlatencytest_loop:\n  mov (%r15), %r15\n\n  /* if we're back at the first cacheline */\n  mov %r15, %r12\n  and %r13, %r12\n  cmp %r12, %r14\n  jnz longpatternlatencytest_loop_inc\n  add $8, %rax\n  and $63, %rax\n  and %r13, %r15\n  add %rax, %r15\nlongpatternlatencytest_loop_inc:\n  dec %rcx\n  jnz longpatternlatencytest_loop\n  pop %rbx\n  pop %r12\n  pop %r13\n  pop %r14\n  pop %r15\n  ret\n\n/* rcx = iterations\n   rdx = ptr to array. 
first two 32-bit ints in array are store and load offsets respectively */\nstlftest:\n  push %rsi\n  push %rdi\n  mov (%rdx), %rax   /* just get some value into rax (store value */\n  mov (%rdx), %esi\n  mov 4(%rdx), %edi\n  add %rdx, %rsi     /* rsi = store ptr */\n  add %rdx, %rdi     /* rdi = load ptr */\nstlftest_loop:\n  mov %rax, (%rsi)   /* store */\n  mov (%rdi), %eax   /* load that possibly gets forwarded result */\n  mov %rax, (%rsi)\n  mov (%rdi), %eax\n  mov %rax, (%rsi)\n  mov (%rdi), %eax\n  mov %rax, (%rsi)\n  mov (%rdi), %eax\n  mov %rax, (%rsi)\n  mov (%rdi), %eax\n  sub $5, %rcx\n  jg stlftest_loop\n  pop %rdi\n  pop %rsi\n  ret\n\nstlftest128:\n  push %rsi\n  push %rdi\n  mov (%rdx), %rax   /* just get some value into rax (store value */\n  mov (%rdx), %esi\n  mov 4(%rdx), %edi\n  add %rdx, %rsi     /* rsi = store ptr */\n  add %rdx, %rdi     /* rdi = load ptr */\nstlftest128_loop:\n  movups %xmm0, (%rsi)   /* store */\n  movsd (%rdi), %xmm0   /* load that possibly gets forwarded result */\n  movups %xmm0, (%rsi)\n  movsd (%rdi), %xmm0\n  movups %xmm0, (%rsi)\n  movsd (%rdi), %xmm0\n  movups %xmm0, (%rsi)\n  movsd (%rdi), %xmm0\n  movups %xmm0, (%rsi)\n  movsd (%rdi), %xmm0\n  sub $5, %rcx\n  jg stlftest128_loop\n  pop %rdi\n  pop %rsi\n  ret\n\nstlftest32:\n  push %rsi\n  push %rdi\n  mov (%rdx), %rax   /* just get some value into rax (store value */\n  mov (%rdx), %esi\n  mov 4(%rdx), %edi\n  add %rdx, %rsi     /* rsi = store ptr */\n  add %rdx, %rdi     /* rdi = load ptr */\nstlftest32_loop:\n  mov %eax, (%rsi)   /* store */\n  mov (%rdi), %ax    /* load that possibly gets forwarded result */\n  mov %eax, (%rsi)\n  mov (%rdi), %ax\n  mov %eax, (%rsi)\n  mov (%rdi), %ax\n  mov %eax, (%rsi)\n  mov (%rdi), %ax\n  mov %eax, (%rsi)\n  mov (%rdi), %ax\n  sub $5, %rcx\n  jg stlftest32_loop\n  pop %rdi\n  pop %rsi\n  ret\n\nmatchedstlftest:\n  push %rsi\n  push %rdi\n  mov (%rdx), %rax   /* just get some value into rax (store value */\n  mov (%rdx), 
%esi\n  mov 4(%rdx), %edi\n  add %rdx, %rsi     /* rsi = store ptr */\n  add %rdx, %rdi     /* rdi = load ptr */\nmatchedstlftest_loop:\n  mov %rax, (%rsi)   /* store */\n  mov (%rdi), %rax   /* load that possibly gets forwarded result */\n  mov %rax, (%rsi)\n  mov (%rdi), %rax\n  mov %rax, (%rsi)\n  mov (%rdi), %rax\n  mov %rax, (%rsi)\n  mov (%rdi), %rax\n  mov %rax, (%rsi)\n  mov (%rdi), %rax\n  sub $5, %rcx\n  jg matchedstlftest_loop\n  pop %rdi\n  pop %rsi\n  ret\n"
  },
  {
    "path": "MemoryLatency/README.md",
    "content": "# Memory Latency Test\n\nThis test measures random memory access latency within increasing array sizes, and (hopefully) shows the latency and size of caches as well as memory latency. Modes, passed as the first parameter:\n- (no parameter) - Uses plain C code and `current = A[current]` to measure latency\n- asm - Uses `mov r15, [r15]` for x86-64 or `ldr x15, [x15]`. This can help accurately measure L1D latency, because many x86 CPUs take an extra cycle to calculate \"complex\" addresses. And compilers like to do that for the plain C version above. This doesn't seem to make a difference for ARM\n- tlb - Accesses just one element per 4 KB region to measure virtual to physical address translation latency (so TLBs and page walkers). Cache latency is subtracted out to isolate address translation latency.\n\n# Building\n\nMake sure optimization is on, or L1D latencies may be quite a bit higher than expected.\n\n## Windows\nUnder WSL, do `x86_64-w64-mingw32-gcc-win32 -O3 MemoryLatency.c MemoryLatency_x86.s -o MemoryLatency.exe`\n\nRun with\n`MemoryLatency.exe`\n`MemoryLatency.exe asm`\n`MemoryLatency.exe tlb`\n## Linux, x86-64\n`gcc -O3 MemoryLatency.c MemoryLatency_x86.s -o MemoryLatency`\n\n## Linux/Android+Termux, aarch64\n`gcc -O3 MemoryLatency.c MemoryLatency_arm.s -o MemoryLatency`\n\n## Linux, riscv64\n`gcc -O3 MemoryLatency.c MemoryLatency_riscv.s -o MemoryLatency`\n\n## VS version\nOpen solution and build. This is only around to hit large pages on Windows. \n\n# Running (Linux/Cross-Compiled Version)\n- `./MemoryLatency -test asm` Tests cache and memory latency with the default page size\n- `./MemoryLatency -test asm -hugepages` Tests cache and memory latency with huge pages, which should minimize address translation penalties. You'll need to `echo (page count) > /proc/sys/vm/nr_hugepages` or have a kernel capable of doing transparent hugepages via madvise.\n- `./MemoryLatency -test tlb` Roughly estimates address translation penalties. 
Currently only good for measuring L2 TLB hit latency.\n- `./MemoryLatency -test stlf` An implementation of the test described at https://blog.stuffedcow.net/2014/01/x86-memory-disambiguation/ for measuring store to load forwarding latency, described under the \"fast address\" section\n- `./MemoryLatency -test 128_stlf` Henry Wong's store to load forwarding latency test but with 128-bit vector stores and 64-bit loads with vector/FP registers. On some CPUs, this can show different behavior to the STLF test above, which uses 64-bit stores and 32-bit loads on the scalar integer side. \n"
  },
  {
    "path": "README.md",
    "content": "# Microbenchmarks\nTrying to figure various CPU (or GPU) things out.\n\nBasically my playground to microbenchmark various CPU-related things like ROB/register file sizes, lock/cache coherency latency, and cache/memory performance. This repo is loose collection of various experiments and is more of a playground than a well maintained piece of software. As such, various benchmarks may not work, or may not even compile. They're also not well documented and details of what's being tested may not be intuitive. Due to time constraints and real life priorities I won't be able to maintain this repo to an acceptable standard for public use.\n\nFeel free to try running the stuff here, but I highly suggest writing your own code because that'll provide a better understanding of the theory behind the benchmarks. Consider checking out https://github.com/travisdowns/robsize or https://github.com/Veedrac/microarchitecturometer.\n\n# Building Clammicrobench with Generated Code\nGet NASM (https://www.nasm.us/) and make sure it's in your path. Then things should build under Visual Studio 2022.\n\nSome microbenchmarks have the source code and assembly generated by C# code, to avoid crazy stuff like self modifying code. For clammicrobench, build/run the AsmGen project. Pass \"autocopy\" on the command line to have it automatically place generated ASM files for Visual Studio. Then, the clammicrobench project should build.\n"
  },
  {
    "path": "mt_instructionrate/InstructionRateFunctions.asm",
    "content": "section .text\r\n\r\nbits 64\r\n\r\nglobal sse_int32_add_test\r\nglobal sse_int32_mul_test\r\nglobal sse_int64_add_test\r\nglobal sse_int64_mul_test\r\nglobal avx2_int32_add_test\r\nglobal avx2_int32_mul_test\r\nglobal avx2_int64_add_test\r\nglobal avx2_int64_mul_test\r\nglobal sse_fp32_add_test\r\nglobal sse_fp32_mul_test\r\nglobal sse_fp32_muladd_test\r\nglobal sse_fp32_rsqrt_test\r\nglobal avx_fp32_add_test\r\nglobal avx_fp32_mul_test\r\nglobal avx_fp32_muladd_test\r\nglobal avx_fp32_rsqrt_test\r\nglobal fp32_fma_test\r\nglobal fp64_fma_test\r\n\r\nglobal sse_fp64_add_test\r\nglobal sse_fp64_mul_test\r\nglobal sse_fp64_muladd_test\r\nglobal avx_fp64_add_test\r\nglobal avx_fp64_mul_test\r\nglobal avx_fp64_muladd_test\r\n\r\nglobal avx512_int32_add_test\r\nglobal avx512_int32_mul_test\r\nglobal avx512_int64_add_test\r\nglobal avx512_int64_mul_test\r\nglobal avx512_fp32_rsqrt_test\r\nglobal avx512_fp32_add_test\r\nglobal avx512_fp32_fma_test\r\nglobal avx512_fp64_add_test\r\nglobal avx512_fp64_fma_test\r\n\r\nsse_int32_add_test:\r\n  movdqu xmm0, [rdx]\r\n  movdqu xmm1, [rdx + 16]\r\n  movdqu xmm2, [rdx + 32]\r\n  movdqu xmm3, [rdx + 48]\r\n  movdqu xmm4, [rdx + 64]\r\n  movdqu xmm5, [rdx + 72]\r\nsse_int32_add_test_loop:\r\n  paddd xmm0, xmm0\r\n  paddd xmm1, xmm1\r\n  paddd xmm2, xmm2\r\n  paddd xmm3, xmm3\r\n  paddd xmm4, xmm4\r\n  paddd xmm5, xmm5\r\n  sub rcx, 24\r\n  cmp rcx, 0\r\n  jg sse_int32_add_test_loop\r\n  ret\r\n\r\nsse_int64_add_test:\r\n  movdqu xmm0, [rdx]\r\n  movdqu xmm1, [rdx + 16]\r\n  movdqu xmm2, [rdx + 32]\r\n  movdqu xmm3, [rdx + 48]\r\n  movdqu xmm4, [rdx + 64]\r\n  movdqu xmm5, [rdx + 72]\r\nsse_int64_add_test_loop:\r\n  paddq xmm0, xmm0\r\n  paddq xmm1, xmm1\r\n  paddq xmm2, xmm2\r\n  paddq xmm3, xmm3\r\n  paddq xmm4, xmm4\r\n  paddq xmm5, xmm5\r\n  sub rcx, 12\r\n  jg sse_int64_add_test_loop\r\n  ret\r\n\r\nsse_int32_mul_test:\r\n  movdqu xmm0, [rdx]\r\n  movdqu xmm1, [rdx + 16]\r\n  movdqu xmm2, [rdx + 32]\r\n  movdqu 
xmm3, [rdx + 48]\r\n  movdqu xmm4, [rdx + 64]\r\n  movdqu xmm5, [rdx + 72]\r\nsse_int32_mul_test_loop:\r\n  pmulld xmm0, xmm0\r\n  pmulld xmm1, xmm1\r\n  pmulld xmm2, xmm2\r\n  pmulld xmm3, xmm3\r\n  pmulld xmm4, xmm4\r\n  pmulld xmm5, xmm5\r\n  sub rcx, 24\r\n  jg sse_int32_mul_test_loop\r\n  ret\r\n\r\nsse_int64_mul_test:\r\n  movdqu xmm0, [rdx]\r\n  movdqu xmm1, [rdx + 16]\r\n  movdqu xmm2, [rdx + 32]\r\n  movdqu xmm3, [rdx + 48]\r\n  movdqu xmm4, [rdx + 64]\r\n  movdqu xmm5, [rdx + 72]\r\nsse_int64_mul_test_loop:\r\n  pmuludq xmm0, xmm0\r\n  pmuludq xmm1, xmm1\r\n  pmuludq xmm2, xmm2\r\n  pmuludq xmm3, xmm3\r\n  pmuludq xmm4, xmm4\r\n  pmuludq xmm5, xmm5\r\n  sub rcx, 12\r\n  jg sse_int64_mul_test_loop\r\n  ret\r\n\r\navx2_int32_add_test:\r\n  vmovdqu ymm0, [rdx]\r\n  vmovdqu ymm1, [rdx + 32]\r\n  vmovdqu ymm2, [rdx + 64]\r\n  vmovdqu ymm3, [rdx + 96]\r\n  vmovdqu ymm4, [rdx + 128]\r\n  vmovdqu ymm5, [rdx + 160]\r\navx2_int32_add_test_loop:\r\n  vpaddd ymm0, ymm0, ymm0\r\n  vpaddd ymm1, ymm1, ymm1\r\n  vpaddd ymm2, ymm2, ymm2\r\n  vpaddd ymm3, ymm3, ymm3\r\n  vpaddd ymm4, ymm4, ymm4\r\n  vpaddd ymm5, ymm5, ymm5\r\n  sub rcx, 48\r\n  cmp rcx, 0\r\n  jg avx2_int32_add_test_loop\r\n  ret\r\n\r\navx2_int32_mul_test:\r\n  vmovdqu ymm0, [rdx]\r\n  vmovdqu ymm1, [rdx + 32]\r\n  vmovdqu ymm2, [rdx + 64]\r\n  vmovdqu ymm3, [rdx + 96]\r\n  vmovdqu ymm4, [rdx + 128]\r\n  vmovdqu ymm5, [rdx + 160]\r\navx2_int32_mul_test_loop:\r\n  vpmulld ymm0, ymm0, ymm0\r\n  vpmulld ymm1, ymm1, ymm1\r\n  vpmulld ymm2, ymm2, ymm2\r\n  vpmulld ymm3, ymm3, ymm3\r\n  vpmulld ymm4, ymm4, ymm4\r\n  vpmulld ymm5, ymm5, ymm5\r\n  sub rcx, 48\r\n  jg avx2_int32_mul_test_loop\r\n  ret\r\n\r\navx2_int64_add_test:\r\n  vmovdqu ymm0, [rdx]\r\n  vmovdqu ymm1, [rdx + 32]\r\n  vmovdqu ymm2, [rdx + 64]\r\n  vmovdqu ymm3, [rdx + 96]\r\n  vmovdqu ymm4, [rdx + 128]\r\n  vmovdqu ymm5, [rdx + 160]\r\navx2_int64_add_test_loop:\r\n  vpaddq ymm0, ymm0, ymm0\r\n  vpaddq ymm1, ymm1, ymm1\r\n  vpaddq ymm2, ymm2, 
ymm2\r\n  vpaddq ymm3, ymm3, ymm3\r\n  vpaddq ymm4, ymm4, ymm4\r\n  vpaddq ymm5, ymm5, ymm5\r\n  sub rcx, 24\r\n  jg avx2_int64_add_test_loop\r\n  ret\r\n\r\navx2_int64_mul_test:\r\n  vmovdqu ymm0, [rdx]\r\n  vmovdqu ymm1, [rdx + 32]\r\n  vmovdqu ymm2, [rdx + 64]\r\n  vmovdqu ymm3, [rdx + 96]\r\n  vmovdqu ymm4, [rdx + 128]\r\n  vmovdqu ymm5, [rdx + 160]\r\navx2_int64_mul_test_loop:\r\n  vpmuldq ymm0, ymm0, ymm0\r\n  vpmuldq ymm1, ymm1, ymm1\r\n  vpmuldq ymm2, ymm2, ymm2\r\n  vpmuldq ymm3, ymm3, ymm3\r\n  vpmuldq ymm4, ymm4, ymm4\r\n  vpmuldq ymm5, ymm5, ymm5\r\n  sub rcx, 24\r\n  jg avx2_int64_mul_test_loop\r\n  ret\r\n\r\nsse_fp32_add_test:\r\n  movups xmm0, [rdx]\r\n  movups xmm1, [rdx + 16]\r\n  movups xmm2, [rdx + 32]\r\n  movups xmm3, [rdx + 48]\r\n  movups xmm4, [rdx + 64]\r\n  movups xmm5, [rdx + 72]\r\nsse_fp32_add_test_loop:\r\n  addps xmm0, xmm0\r\n  addps xmm1, xmm1\r\n  addps xmm2, xmm2\r\n  addps xmm3, xmm3\r\n  addps xmm4, xmm4\r\n  addps xmm5, xmm5\r\n  sub rcx, 24\r\n  jg sse_fp32_add_test_loop\r\n  ret\r\n\r\nsse_fp64_add_test:\r\n  movups xmm0, [rdx]\r\n  movups xmm1, [rdx + 16]\r\n  movups xmm2, [rdx + 32]\r\n  movups xmm3, [rdx + 48]\r\n  movups xmm4, [rdx + 64]\r\n  movups xmm5, [rdx + 72]\r\nsse_fp64_add_test_loop:\r\n  addpd xmm0, xmm0\r\n  addpd xmm1, xmm1\r\n  addpd xmm2, xmm2\r\n  addpd xmm3, xmm3\r\n  addpd xmm4, xmm4\r\n  addpd xmm5, xmm5\r\n  sub rcx, 12\r\n  jg sse_fp64_add_test_loop\r\n  ret\r\n\r\nsse_fp32_mul_test:\r\n  movups xmm0, [rdx]\r\n  movups xmm1, [rdx + 16]\r\n  movups xmm2, [rdx + 32]\r\n  movups xmm3, [rdx + 48]\r\n  movups xmm4, [rdx + 64]\r\n  movups xmm5, [rdx + 72]\r\nsse_fp32_mul_test_loop:\r\n  mulps xmm0, xmm0\r\n  mulps xmm1, xmm1\r\n  mulps xmm2, xmm2\r\n  mulps xmm3, xmm3\r\n  mulps xmm4, xmm4\r\n  mulps xmm5, xmm5\r\n  sub rcx, 24\r\n  jg sse_fp32_mul_test_loop\r\n  ret\r\n\r\nsse_fp64_mul_test:\r\n  movups xmm0, [rdx]\r\n  movups xmm1, [rdx + 16]\r\n  movups xmm2, [rdx + 32]\r\n  movups xmm3, [rdx + 48]\r\n  
movups xmm4, [rdx + 64]\r\n  movups xmm5, [rdx + 72]\r\nsse_fp64_mul_test_loop:\r\n  mulpd xmm0, xmm0\r\n  mulpd xmm1, xmm1\r\n  mulpd xmm2, xmm2\r\n  mulpd xmm3, xmm3\r\n  mulpd xmm4, xmm4\r\n  mulpd xmm5, xmm5\r\n  sub rcx, 12\r\n  jg sse_fp64_mul_test_loop\r\n  ret\r\n\r\nsse_fp32_muladd_test:\r\n  movups xmm0, [rdx]\r\n  movups xmm1, [rdx + 16]\r\n  movups xmm2, [rdx + 32]\r\n  movups xmm3, [rdx + 48]\r\n  movups xmm4, [rdx + 64]\r\n  movups xmm5, [rdx + 72]\r\nsse_fp32_muladd_test_loop:\r\n  mulps xmm0, xmm0\r\n  addps xmm0, xmm0\r\n  mulps xmm1, xmm1\r\n  addps xmm1, xmm1\r\n  mulps xmm2, xmm2\r\n  addps xmm2, xmm2\r\n  mulps xmm3, xmm3\r\n  addps xmm3, xmm3\r\n  mulps xmm4, xmm4\r\n  addps xmm4, xmm4\r\n  mulps xmm5, xmm5\r\n  addps xmm5, xmm5\r\n  sub rcx, 24\r\n  jg sse_fp32_muladd_test_loop\r\n  ret\r\n\r\nsse_fp32_rsqrt_test:\r\n  movups xmm0, [rdx]\r\n  movups xmm1, [rdx + 16]\r\n  movups xmm2, [rdx + 32]\r\n  movups xmm3, [rdx + 48]\r\n  movups xmm4, [rdx + 64]\r\n  movups xmm5, [rdx + 72]\r\nsse_fp32_rsqrt_test_loop:\r\n  rsqrtps xmm0, xmm0\r\n  rsqrtps xmm1, xmm1\r\n  rsqrtps xmm2, xmm2\r\n  rsqrtps xmm3, xmm3\r\n  rsqrtps xmm4, xmm4\r\n  rsqrtps xmm5, xmm5\r\n  sub rcx, 24\r\n  jg sse_fp32_rsqrt_test_loop\r\n  ret\r\n\r\navx_fp32_rsqrt_test:\r\n  vmovups ymm0, [rdx]\r\n  vmovups ymm1, [rdx + 32]\r\n  vmovups ymm2, [rdx + 64]\r\n  vmovups ymm3, [rdx + 96]\r\n  vmovups ymm4, [rdx + 128]\r\n  vmovups ymm5, [rdx + 160]\r\navx_fp32_rsqrt_test_loop:\r\n  vrsqrtps ymm0, ymm0\r\n  vrsqrtps ymm1, ymm1\r\n  vrsqrtps ymm2, ymm2\r\n  vrsqrtps ymm3, ymm3\r\n  vrsqrtps ymm4, ymm4\r\n  vrsqrtps ymm5, ymm5\r\n  sub rcx, 48\r\n  jg avx_fp32_rsqrt_test_loop\r\n  ret\r\n\r\nsse_fp64_muladd_test:\r\n  movups xmm0, [rdx]\r\n  movups xmm1, [rdx + 16]\r\n  movups xmm2, [rdx + 32]\r\n  movups xmm3, [rdx + 48]\r\n  movups xmm4, [rdx + 64]\r\n  movups xmm5, [rdx + 72]\r\nsse_fp64_muladd_test_loop:\r\n  mulpd xmm0, xmm0\r\n  addpd xmm0, xmm0\r\n  mulpd xmm1, xmm1\r\n  addpd 
xmm1, xmm1\r\n  mulpd xmm2, xmm2\r\n  addpd xmm2, xmm2\r\n  mulpd xmm3, xmm3\r\n  addpd xmm3, xmm3\r\n  mulpd xmm4, xmm4\r\n  addpd xmm4, xmm4\r\n  mulpd xmm5, xmm5\r\n  addpd xmm5, xmm5\r\n  sub rcx, 12\r\n  jg sse_fp64_muladd_test_loop\r\n  ret\r\n\r\navx_fp32_add_test:\r\n  vmovups ymm0, [rdx]\r\n  vmovups ymm1, [rdx + 32]\r\n  vmovups ymm2, [rdx + 64]\r\n  vmovups ymm3, [rdx + 96]\r\n  vmovups ymm4, [rdx + 128]\r\n  vmovups ymm5, [rdx + 160]\r\navx_fp32_add_test_loop:\r\n  vaddps ymm0, ymm0, ymm0\r\n  vaddps ymm1, ymm1, ymm1\r\n  vaddps ymm2, ymm2, ymm2\r\n  vaddps ymm3, ymm3, ymm3\r\n  vaddps ymm4, ymm4, ymm4\r\n  vaddps ymm5, ymm5, ymm5\r\n  sub rcx, 48\r\n  jg avx_fp32_add_test_loop\r\n  ret\r\n\r\navx_fp64_add_test:\r\n  vmovups ymm0, [rdx]\r\n  vmovups ymm1, [rdx + 32]\r\n  vmovups ymm2, [rdx + 64]\r\n  vmovups ymm3, [rdx + 96]\r\n  vmovups ymm4, [rdx + 128]\r\n  vmovups ymm5, [rdx + 160]\r\navx_fp64_add_test_loop:\r\n  vaddpd ymm0, ymm0, ymm0\r\n  vaddpd ymm1, ymm1, ymm1\r\n  vaddpd ymm2, ymm2, ymm2\r\n  vaddpd ymm3, ymm3, ymm3\r\n  vaddpd ymm4, ymm4, ymm4\r\n  vaddpd ymm5, ymm5, ymm5\r\n  sub rcx, 24\r\n  jg avx_fp64_add_test_loop\r\n  ret\r\n\r\navx_fp32_mul_test:\r\n  vmovups ymm0, [rdx]\r\n  vmovups ymm1, [rdx + 32]\r\n  vmovups ymm2, [rdx + 64]\r\n  vmovups ymm3, [rdx + 96]\r\n  vmovups ymm4, [rdx + 128]\r\n  vmovups ymm5, [rdx + 160]\r\navx_fp32_mul_test_loop:\r\n  vmulps ymm0, ymm0, ymm0\r\n  vmulps ymm1, ymm1, ymm1\r\n  vmulps ymm2, ymm2, ymm2\r\n  vmulps ymm3, ymm3, ymm3\r\n  vmulps ymm4, ymm4, ymm4\r\n  vmulps ymm5, ymm5, ymm5\r\n  sub rcx, 48\r\n  jg avx_fp32_mul_test_loop\r\n  ret\r\n\r\navx_fp64_mul_test:\r\n  vmovups ymm0, [rdx]\r\n  vmovups ymm1, [rdx + 32]\r\n  vmovups ymm2, [rdx + 64]\r\n  vmovups ymm3, [rdx + 96]\r\n  vmovups ymm4, [rdx + 128]\r\n  vmovups ymm5, [rdx + 160]\r\navx_fp64_mul_test_loop:\r\n  vmulpd ymm0, ymm0, ymm0\r\n  vmulpd ymm1, ymm1, ymm1\r\n  vmulpd ymm2, ymm2, ymm2\r\n  vmulpd ymm3, ymm3, ymm3\r\n  vmulpd ymm4, ymm4, 
ymm4\r\n  vmulpd ymm5, ymm5, ymm5\r\n  sub rcx, 24\r\n  jg avx_fp64_mul_test_loop\r\n  ret\r\n\r\navx_fp32_muladd_test:\r\n  vmovups ymm0, [rdx]\r\n  vmovups ymm1, [rdx + 32]\r\n  vmovups ymm2, [rdx + 64]\r\n  vmovups ymm3, [rdx + 96]\r\n  vmovups ymm4, [rdx + 128]\r\n  vmovups ymm5, [rdx + 160]\r\navx_fp32_muladd_test_loop:\r\n  vmulps ymm0, ymm0, ymm0\r\n  vaddps ymm0, ymm0, ymm0\r\n  vmulps ymm1, ymm1, ymm1\r\n  vaddps ymm1, ymm1, ymm1\r\n  vmulps ymm2, ymm2, ymm2\r\n  vaddps ymm2, ymm2, ymm2\r\n  vmulps ymm3, ymm3, ymm3\r\n  vaddps ymm3, ymm3, ymm3\r\n  vmulps ymm4, ymm4, ymm4\r\n  vaddps ymm4, ymm4, ymm4\r\n  vmulps ymm5, ymm5, ymm5\r\n  vaddps ymm5, ymm5, ymm5\r\n  sub rcx, 48\r\n  jg avx_fp32_muladd_test_loop\r\n  ret\r\n\r\navx_fp64_muladd_test:\r\n  vmovups ymm0, [rdx]\r\n  vmovups ymm1, [rdx + 32]\r\n  vmovups ymm2, [rdx + 64]\r\n  vmovups ymm3, [rdx + 96]\r\n  vmovups ymm4, [rdx + 128]\r\n  vmovups ymm5, [rdx + 160]\r\navx_fp64_muladd_test_loop:\r\n  vmulpd ymm0, ymm0, ymm0\r\n  vaddpd ymm0, ymm0, ymm0\r\n  vmulpd ymm1, ymm1, ymm1\r\n  vaddpd ymm1, ymm1, ymm1\r\n  vmulpd ymm2, ymm2, ymm2\r\n  vaddpd ymm2, ymm2, ymm2\r\n  vmulpd ymm3, ymm3, ymm3\r\n  vaddpd ymm3, ymm3, ymm3\r\n  vmulpd ymm4, ymm4, ymm4\r\n  vaddpd ymm4, ymm4, ymm4\r\n  vmulpd ymm5, ymm5, ymm5\r\n  vaddpd ymm5, ymm5, ymm5\r\n  sub rcx, 24\r\n  jg avx_fp64_muladd_test_loop\r\n  ret\r\n\r\nfp32_fma_test:\r\n  vzeroall\r\n  vmovups ymm0, [rdx]\r\n  vmovups ymm1, ymm0\r\n  vmovups ymm2, ymm0\r\n  vmovups ymm3, ymm0\r\n  vmovups ymm4, ymm0\r\n  vmovups ymm5, ymm0\r\n  vmovups ymm6, ymm0\r\nfp32_fma_test_loop:\r\n  vfmadd132ps ymm0, ymm0, ymm6\r\n  vfmadd132ps ymm1, ymm1, ymm6\r\n  vfmadd132ps ymm2, ymm2, ymm6\r\n  vfmadd132ps ymm3, ymm3, ymm6\r\n  vfmadd132ps ymm4, ymm4, ymm6\r\n  vfmadd132ps ymm5, ymm5, ymm6\r\n  sub rcx, 48\r\n  jg fp32_fma_test_loop\r\n  ret\r\n\r\nfp64_fma_test:\r\n  vzeroall\r\n  vmovups ymm0, [rdx]\r\n  vmovups ymm1, ymm0\r\n  vmovups ymm2, ymm0\r\n  vmovups ymm3, 
ymm0\r\n  vmovups ymm4, ymm0\r\n  vmovups ymm5, ymm0\r\n  vmovups ymm6, ymm0\r\nfp64_fma_test_loop:\r\n  vfmadd132pd ymm0, ymm0, ymm6\r\n  vfmadd132pd ymm1, ymm1, ymm6\r\n  vfmadd132pd ymm2, ymm2, ymm6\r\n  vfmadd132pd ymm3, ymm3, ymm6\r\n  vfmadd132pd ymm4, ymm4, ymm6\r\n  vfmadd132pd ymm5, ymm5, ymm6\r\n  sub rcx, 24\r\n  jg fp64_fma_test_loop\r\n  ret\r\n\r\navx512_int32_add_test:\r\n  vmovdqu16 zmm0, [rdx]\r\n  vmovdqu16 zmm1, [rdx + 64]\r\n  vmovdqu16 zmm2, [rdx + 128]\r\n  vmovdqu16 zmm3, [rdx + 192]\r\n  vmovdqu16 zmm4, [rdx + 256]\r\n  vmovdqu16 zmm5, [rdx + 384]\r\navx512_int32_add_test_loop:\r\n  vpaddd zmm0, zmm0, zmm0\r\n  vpaddd zmm1, zmm1, zmm1\r\n  vpaddd zmm2, zmm2, zmm2\r\n  vpaddd zmm3, zmm3, zmm3\r\n  vpaddd zmm4, zmm4, zmm4\r\n  vpaddd zmm5, zmm5, zmm5\r\n  sub rcx, 96\r\n  jg avx512_int32_add_test_loop\r\n  ret\r\n\r\navx512_int32_mul_test:\r\n  vmovdqu16 zmm0, [rdx]\r\n  vmovdqu16 zmm1, [rdx + 64]\r\n  vmovdqu16 zmm2, [rdx + 128]\r\n  vmovdqu16 zmm3, [rdx + 192]\r\n  vmovdqu16 zmm4, [rdx + 256]\r\n  vmovdqu16 zmm5, [rdx + 384]\r\navx512_int32_mul_test_loop:\r\n  vpmulld zmm0, zmm0, zmm0\r\n  vpmulld zmm1, zmm1, zmm1\r\n  vpmulld zmm2, zmm2, zmm2\r\n  vpmulld zmm3, zmm3, zmm3\r\n  vpmulld zmm4, zmm4, zmm4\r\n  vpmulld zmm5, zmm5, zmm5\r\n  sub rcx, 96\r\n  jg avx512_int32_mul_test_loop\r\n  ret\r\n\r\navx512_int64_add_test:\r\n  vmovdqu16 zmm0, [rdx]\r\n  vmovdqu16 zmm1, [rdx + 64]\r\n  vmovdqu16 zmm2, [rdx + 128]\r\n  vmovdqu16 zmm3, [rdx + 192]\r\n  vmovdqu16 zmm4, [rdx + 256]\r\n  vmovdqu16 zmm5, [rdx + 384]\r\navx512_int64_add_test_loop:\r\n  vpaddq zmm0, zmm0, zmm0\r\n  vpaddq zmm1, zmm1, zmm1\r\n  vpaddq zmm2, zmm2, zmm2\r\n  vpaddq zmm3, zmm3, zmm3\r\n  vpaddq zmm4, zmm4, zmm4\r\n  vpaddq zmm5, zmm5, zmm5\r\n  sub rcx, 48\r\n  jg avx512_int64_add_test_loop\r\n  ret\r\n\r\navx512_int64_mul_test:\r\n  vmovdqu16 zmm0, [rdx]\r\n  vmovdqu16 zmm1, [rdx + 64]\r\n  vmovdqu16 zmm2, [rdx + 128]\r\n  vmovdqu16 zmm3, [rdx + 192]\r\n  vmovdqu16 zmm4, 
[rdx + 256]\r\n  vmovdqu16 zmm5, [rdx + 384]\r\navx512_int64_mul_test_loop:\r\n  vpmuldq zmm0, zmm0, zmm0\r\n  vpmuldq zmm1, zmm1, zmm1\r\n  vpmuldq zmm2, zmm2, zmm2\r\n  vpmuldq zmm3, zmm3, zmm3\r\n  vpmuldq zmm4, zmm4, zmm4\r\n  vpmuldq zmm5, zmm5, zmm5\r\n  sub rcx, 48\r\n  jg avx512_int64_mul_test_loop\r\n  ret\r\n\r\navx512_fp32_rsqrt_test:\r\n  vmovups zmm0, [rdx]\r\n  vmovups zmm1, [rdx + 64]\r\n  vmovups zmm2, [rdx + 128]\r\n  vmovups zmm3, [rdx + 192]\r\n  vmovups zmm4, [rdx + 256]\r\n  vmovups zmm5, [rdx + 384]\r\navx512_fp32_rsqrt_test_loop:\r\n  vrsqrt14ps zmm0, zmm0\r\n  vrsqrt14ps zmm1, zmm1\r\n  vrsqrt14ps zmm2, zmm2\r\n  vrsqrt14ps zmm3, zmm3\r\n  vrsqrt14ps zmm4, zmm4\r\n  vrsqrt14ps zmm5, zmm5\r\n  sub rcx, 96\r\n  jg avx512_fp32_rsqrt_test_loop\r\n  ret\r\n\r\navx512_fp32_add_test:\r\n  vmovups zmm0, [rdx]\r\n  vmovups zmm1, [rdx + 64]\r\n  vmovups zmm2, [rdx + 128]\r\n  vmovups zmm3, [rdx + 192]\r\n  vmovups zmm4, [rdx + 256]\r\n  vmovups zmm5, [rdx + 384]\r\navx512_fp32_add_test_loop:\r\n  vaddps zmm0, zmm0, zmm0\r\n  vaddps zmm1, zmm1, zmm1\r\n  vaddps zmm2, zmm2, zmm2\r\n  vaddps zmm3, zmm3, zmm3\r\n  vaddps zmm4, zmm4, zmm4\r\n  vaddps zmm5, zmm5, zmm5\r\n  sub rcx, 96\r\n  jg avx512_fp32_add_test_loop\r\n  ret\r\n\r\navx512_fp32_fma_test:\r\n  vmovups zmm0, [rdx]\r\n  vmovups zmm1, [rdx + 64]\r\n  vmovups zmm2, [rdx + 128]\r\n  vmovups zmm3, [rdx + 192]\r\n  vmovups zmm4, [rdx + 256]\r\n  vmovups zmm5, [rdx + 384]\r\navx512_fp32_fma_test_loop:\r\n  vfmadd132ps zmm0, zmm0, zmm0\r\n  vfmadd132ps zmm1, zmm1, zmm1\r\n  vfmadd132ps zmm2, zmm2, zmm2\r\n  vfmadd132ps zmm3, zmm3, zmm3\r\n  vfmadd132ps zmm4, zmm4, zmm4\r\n  vfmadd132ps zmm5, zmm5, zmm5\r\n  sub rcx, 96\r\n  jg avx512_fp32_fma_test_loop\r\n  ret\r\n\r\navx512_fp64_add_test:\r\n  vmovups zmm0, [rdx]\r\n  vmovups zmm1, [rdx + 64]\r\n  vmovups zmm2, [rdx + 128]\r\n  vmovups zmm3, [rdx + 192]\r\n  vmovups zmm4, [rdx + 256]\r\n  vmovups zmm5, [rdx + 384]\r\navx512_fp64_add_test_loop:\r\n  
vfmadd132pd zmm0, zmm0, zmm0\r\n  vfmadd132pd zmm1, zmm1, zmm1\r\n  vfmadd132pd zmm2, zmm2, zmm2\r\n  vfmadd132pd zmm3, zmm3, zmm3\r\n  vfmadd132pd zmm4, zmm4, zmm4\r\n  vfmadd132pd zmm5, zmm5, zmm5\r\n  sub rcx, 48\r\n  jg avx512_fp64_add_test_loop\r\n  ret\r\n\r\navx512_fp64_fma_test:\r\n  vmovups zmm0, [rdx]\r\n  vmovups zmm1, [rdx + 64]\r\n  vmovups zmm2, [rdx + 128]\r\n  vmovups zmm3, [rdx + 192]\r\n  vmovups zmm4, [rdx + 256]\r\n  vmovups zmm5, [rdx + 384]\r\navx512_fp64_fma_test_loop:\r\n  vfmadd132ps zmm0, zmm0, zmm0\r\n  vfmadd132ps zmm1, zmm1, zmm1\r\n  vfmadd132ps zmm2, zmm2, zmm2\r\n  vfmadd132ps zmm3, zmm3, zmm3\r\n  vfmadd132ps zmm4, zmm4, zmm4\r\n  vfmadd132ps zmm5, zmm5, zmm5\r\n  sub rcx, 48\r\n  jg avx512_fp64_fma_test_loop\r\n  ret"
  },
  {
    "path": "mt_instructionrate/Makefile",
    "content": "# Each target builds the multithreaded instruction rate test for one architecture.\n# The target names are not files, so declare them phony; otherwise a stray file\n# named x86, aarch64 or ppc64 would make the target look up to date.\n.PHONY: x86 aarch64 ppc64\n\nx86:\n\tgcc -pthread -masm=intel x86_mt_instructionrate.s mt_instructionrate.c ../Common/timing.c -o x86_mt_instructionrate -static\naarch64:\n\tgcc -pthread mt_instructionrate.c arm_mt_instructionrate.s ../Common/timing.c -o arm_mt_instructionrate\nppc64:\n\tgcc -pthread -mregnames mt_instructionrate.c ppc64_mt_instructionrate.s ../Common/timing.c -o ppc64_mt_instructionrate\n"
  },
  {
    "path": "mt_instructionrate/Project1.vcxproj",
    "content": "<?xml version=\"1.0\" encoding=\"utf-8\"?>\r\n<Project DefaultTargets=\"Build\" xmlns=\"http://schemas.microsoft.com/developer/msbuild/2003\">\r\n  <ItemGroup Label=\"ProjectConfigurations\">\r\n    <ProjectConfiguration Include=\"Debug|Win32\">\r\n      <Configuration>Debug</Configuration>\r\n      <Platform>Win32</Platform>\r\n    </ProjectConfiguration>\r\n    <ProjectConfiguration Include=\"Release|Win32\">\r\n      <Configuration>Release</Configuration>\r\n      <Platform>Win32</Platform>\r\n    </ProjectConfiguration>\r\n    <ProjectConfiguration Include=\"Debug|x64\">\r\n      <Configuration>Debug</Configuration>\r\n      <Platform>x64</Platform>\r\n    </ProjectConfiguration>\r\n    <ProjectConfiguration Include=\"Release|x64\">\r\n      <Configuration>Release</Configuration>\r\n      <Platform>x64</Platform>\r\n    </ProjectConfiguration>\r\n  </ItemGroup>\r\n  <PropertyGroup Label=\"Globals\">\r\n    <VCProjectVersion>17.0</VCProjectVersion>\r\n    <Keyword>Win32Proj</Keyword>\r\n    <ProjectGuid>{0ad46eb5-549e-4e36-9cea-89d06cee1b5e}</ProjectGuid>\r\n    <RootNamespace>Project1</RootNamespace>\r\n    <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>\r\n    <ProjectName>mt_instructionrate</ProjectName>\r\n  </PropertyGroup>\r\n  <Import Project=\"$(VCTargetsPath)\\Microsoft.Cpp.Default.props\" />\r\n  <PropertyGroup Condition=\"'$(Configuration)|$(Platform)'=='Debug|Win32'\" Label=\"Configuration\">\r\n    <ConfigurationType>Application</ConfigurationType>\r\n    <UseDebugLibraries>true</UseDebugLibraries>\r\n    <PlatformToolset>v143</PlatformToolset>\r\n    <CharacterSet>Unicode</CharacterSet>\r\n  </PropertyGroup>\r\n  <PropertyGroup Condition=\"'$(Configuration)|$(Platform)'=='Release|Win32'\" Label=\"Configuration\">\r\n    <ConfigurationType>Application</ConfigurationType>\r\n    <UseDebugLibraries>false</UseDebugLibraries>\r\n    <PlatformToolset>v143</PlatformToolset>\r\n    
<WholeProgramOptimization>true</WholeProgramOptimization>\r\n    <CharacterSet>Unicode</CharacterSet>\r\n  </PropertyGroup>\r\n  <PropertyGroup Condition=\"'$(Configuration)|$(Platform)'=='Debug|x64'\" Label=\"Configuration\">\r\n    <ConfigurationType>Application</ConfigurationType>\r\n    <UseDebugLibraries>true</UseDebugLibraries>\r\n    <PlatformToolset>v143</PlatformToolset>\r\n    <CharacterSet>Unicode</CharacterSet>\r\n  </PropertyGroup>\r\n  <PropertyGroup Condition=\"'$(Configuration)|$(Platform)'=='Release|x64'\" Label=\"Configuration\">\r\n    <ConfigurationType>Application</ConfigurationType>\r\n    <UseDebugLibraries>false</UseDebugLibraries>\r\n    <PlatformToolset>v143</PlatformToolset>\r\n    <WholeProgramOptimization>true</WholeProgramOptimization>\r\n    <CharacterSet>Unicode</CharacterSet>\r\n  </PropertyGroup>\r\n  <Import Project=\"$(VCTargetsPath)\\Microsoft.Cpp.props\" />\r\n  <ImportGroup Label=\"ExtensionSettings\">\r\n  </ImportGroup>\r\n  <ImportGroup Label=\"Shared\">\r\n  </ImportGroup>\r\n  <ImportGroup Label=\"PropertySheets\" Condition=\"'$(Configuration)|$(Platform)'=='Debug|Win32'\">\r\n    <Import Project=\"$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props\" Condition=\"exists('$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props')\" Label=\"LocalAppDataPlatform\" />\r\n  </ImportGroup>\r\n  <ImportGroup Label=\"PropertySheets\" Condition=\"'$(Configuration)|$(Platform)'=='Release|Win32'\">\r\n    <Import Project=\"$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props\" Condition=\"exists('$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props')\" Label=\"LocalAppDataPlatform\" />\r\n  </ImportGroup>\r\n  <ImportGroup Label=\"PropertySheets\" Condition=\"'$(Configuration)|$(Platform)'=='Debug|x64'\">\r\n    <Import Project=\"$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props\" Condition=\"exists('$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props')\" Label=\"LocalAppDataPlatform\" />\r\n  </ImportGroup>\r\n  <ImportGroup 
Label=\"PropertySheets\" Condition=\"'$(Configuration)|$(Platform)'=='Release|x64'\">\r\n    <Import Project=\"$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props\" Condition=\"exists('$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props')\" Label=\"LocalAppDataPlatform\" />\r\n  </ImportGroup>\r\n  <PropertyGroup Label=\"UserMacros\" />\r\n  <ItemDefinitionGroup Condition=\"'$(Configuration)|$(Platform)'=='Debug|Win32'\">\r\n    <ClCompile>\r\n      <WarningLevel>Level3</WarningLevel>\r\n      <SDLCheck>true</SDLCheck>\r\n      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>\r\n      <ConformanceMode>true</ConformanceMode>\r\n    </ClCompile>\r\n    <Link>\r\n      <SubSystem>Console</SubSystem>\r\n      <GenerateDebugInformation>true</GenerateDebugInformation>\r\n    </Link>\r\n  </ItemDefinitionGroup>\r\n  <ItemDefinitionGroup Condition=\"'$(Configuration)|$(Platform)'=='Release|Win32'\">\r\n    <ClCompile>\r\n      <WarningLevel>Level3</WarningLevel>\r\n      <FunctionLevelLinking>true</FunctionLevelLinking>\r\n      <IntrinsicFunctions>true</IntrinsicFunctions>\r\n      <SDLCheck>true</SDLCheck>\r\n      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>\r\n      <ConformanceMode>true</ConformanceMode>\r\n    </ClCompile>\r\n    <Link>\r\n      <SubSystem>Console</SubSystem>\r\n      <EnableCOMDATFolding>true</EnableCOMDATFolding>\r\n      <OptimizeReferences>true</OptimizeReferences>\r\n      <GenerateDebugInformation>true</GenerateDebugInformation>\r\n    </Link>\r\n  </ItemDefinitionGroup>\r\n  <ItemDefinitionGroup Condition=\"'$(Configuration)|$(Platform)'=='Debug|x64'\">\r\n    <ClCompile>\r\n      <WarningLevel>Level3</WarningLevel>\r\n      <SDLCheck>true</SDLCheck>\r\n      <PreprocessorDefinitions>_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>\r\n      <ConformanceMode>true</ConformanceMode>\r\n    
</ClCompile>\r\n    <Link>\r\n      <SubSystem>Console</SubSystem>\r\n      <GenerateDebugInformation>true</GenerateDebugInformation>\r\n    </Link>\r\n  </ItemDefinitionGroup>\r\n  <ItemDefinitionGroup Condition=\"'$(Configuration)|$(Platform)'=='Release|x64'\">\r\n    <ClCompile>\r\n      <WarningLevel>Level3</WarningLevel>\r\n      <FunctionLevelLinking>true</FunctionLevelLinking>\r\n      <IntrinsicFunctions>true</IntrinsicFunctions>\r\n      <SDLCheck>true</SDLCheck>\r\n      <PreprocessorDefinitions>NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>\r\n      <ConformanceMode>true</ConformanceMode>\r\n    </ClCompile>\r\n    <Link>\r\n      <SubSystem>Console</SubSystem>\r\n      <EnableCOMDATFolding>true</EnableCOMDATFolding>\r\n      <OptimizeReferences>true</OptimizeReferences>\r\n      <GenerateDebugInformation>true</GenerateDebugInformation>\r\n    </Link>\r\n  </ItemDefinitionGroup>\r\n  <ItemGroup>\r\n    <ClCompile Include=\"..\\Common\\timing.c\" />\r\n    <ClCompile Include=\"mt_instructionrate.c\" />\r\n  </ItemGroup>\r\n  <ItemGroup>\r\n    <ClInclude Include=\"..\\Common\\timing.h\" />\r\n  </ItemGroup>\r\n  <ItemGroup>\r\n    <CustomBuild Include=\"InstructionRateFunctions.asm\">\r\n      <ExcludedFromBuild Condition=\"'$(Configuration)|$(Platform)'=='Release|x64'\">false</ExcludedFromBuild>\r\n      <FileType>Document</FileType>\r\n      <Command Condition=\"'$(Configuration)|$(Platform)'=='Release|x64'\">nasm -f win64 InstructionRateFunctions.asm</Command>\r\n      <Message Condition=\"'$(Configuration)|$(Platform)'=='Release|x64'\">Building asm functions</Message>\r\n      <Outputs Condition=\"'$(Configuration)|$(Platform)'=='Release|x64'\">InstructionRateFunctions.obj</Outputs>\r\n      <ExcludedFromBuild Condition=\"'$(Configuration)|$(Platform)'=='Debug|x64'\">false</ExcludedFromBuild>\r\n      <Command Condition=\"'$(Configuration)|$(Platform)'=='Debug|x64'\">nasm -f win64 
InstructionRateFunctions.asm</Command>\r\n      <Outputs Condition=\"'$(Configuration)|$(Platform)'=='Debug|x64'\">InstructionRateFunctions.obj</Outputs>\r\n      <Message Condition=\"'$(Configuration)|$(Platform)'=='Debug|x64'\">Building asm functions</Message>\r\n    </CustomBuild>\r\n  </ItemGroup>\r\n  <Import Project=\"$(VCTargetsPath)\\Microsoft.Cpp.targets\" />\r\n  <ImportGroup Label=\"ExtensionTargets\">\r\n  </ImportGroup>\r\n</Project>"
  },
  {
    "path": "mt_instructionrate/Project1.vcxproj.filters",
    "content": "﻿<?xml version=\"1.0\" encoding=\"utf-8\"?>\r\n<Project ToolsVersion=\"4.0\" xmlns=\"http://schemas.microsoft.com/developer/msbuild/2003\">\r\n  <ItemGroup>\r\n    <Filter Include=\"Source Files\">\r\n      <UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>\r\n      <Extensions>cpp;c;cc;cxx;c++;cppm;ixx;def;odl;idl;hpj;bat;asm;asmx</Extensions>\r\n    </Filter>\r\n    <Filter Include=\"Header Files\">\r\n      <UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>\r\n      <Extensions>h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd</Extensions>\r\n    </Filter>\r\n    <Filter Include=\"Resource Files\">\r\n      <UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier>\r\n      <Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>\r\n    </Filter>\r\n  </ItemGroup>\r\n  <ItemGroup>\r\n    <ClCompile Include=\"mt_instructionrate.c\">\r\n      <Filter>Source Files</Filter>\r\n    </ClCompile>\r\n    <ClCompile Include=\"..\\Common\\timing.c\">\r\n      <Filter>Source Files</Filter>\r\n    </ClCompile>\r\n  </ItemGroup>\r\n  <ItemGroup>\r\n    <ClInclude Include=\"..\\Common\\timing.h\">\r\n      <Filter>Header Files</Filter>\r\n    </ClInclude>\r\n  </ItemGroup>\r\n  <ItemGroup>\r\n    <CustomBuild Include=\"InstructionRateFunctions.asm\">\r\n      <Filter>Source Files</Filter>\r\n    </CustomBuild>\r\n  </ItemGroup>\r\n</Project>"
  },
  {
    "path": "mt_instructionrate/arm_mt_instructionrate.c",
    "content": "extern uint64_t vec_int32_add_test(uint64_t iterations, void *data);\nextern uint64_t vec_int32_mul_test(uint64_t iterations, void *data);\nextern uint64_t vec_fp32_add_test(uint64_t iterations, void *data);\nextern uint64_t vec_fp32_fma_test(uint64_t iterations, void *data);\nextern uint64_t vec_fp32_rsqrt_test(uint64_t iterations, void *data);\nextern uint64_t vec_fp64_add_test(uint64_t iterations, void *data);\nextern uint64_t vec_fp64_fma_test(uint64_t iterations, void *data);\nextern uint64_t vec_int64_add_test(uint64_t iterations, void *data);\n//extern uint64_t vec_int64_mul_test(uint64_t iterations, void *data);\n\n\n// Runs every test kernel through measureFunction and prints one result line per test.\n// Progress messages go to stderr, results go to stdout.\nvoid RunTests() {\n  uint64_t iterations = 3500000000;\n  int testDataLength = 256;\n  uint32_t *intTestArr = (uint32_t *)malloc(sizeof(uint32_t) * testDataLength);\n  // FP32 input must be stored as float; storing it through a uint32_t pointer\n  // would truncate each value to an integer\n  float *fpTestArr = (float *)malloc(sizeof(float) * testDataLength);\n  uint64_t *int64TestArr = (uint64_t *)malloc(sizeof(uint64_t) * testDataLength);\n  double *fp64TestArr = (double *)malloc(sizeof(double) * testDataLength);\n  if (intTestArr == NULL || fpTestArr == NULL || int64TestArr == NULL || fp64TestArr == NULL) {\n    fprintf(stderr, \"Could not allocate test data\\n\");\n    free(intTestArr);\n    free(fpTestArr);\n    free(int64TestArr);\n    free(fp64TestArr);\n    return;\n  }\n\n  for (int i = 0; i < testDataLength; i++) {\n    intTestArr[i] = i;\n    fpTestArr[i] = i * 1.2f;\n    int64TestArr[i] = i * 2;\n    fp64TestArr[i] = 2.0f + 0.01f * i;\n  }\n\n  fprintf(stderr, \"Measuring INT32 adds\\n\");\n  float int32adds = measureFunction(iterations, vec_int32_add_test, intTestArr);\n  fprintf(stderr, \"Measuring INT32 multiplies\\n\");\n  float int32muls = measureFunction(iterations, vec_int32_mul_test, intTestArr);\n  fprintf(stderr, \"Measuring FP32 adds\\n\");\n  float fp32adds = measureFunction(iterations, vec_fp32_add_test, fpTestArr);\n  fprintf(stderr, \"Measuring FP32 FMAs\\n\");\n  float fp32fmas = measureFunction(iterations, vec_fp32_fma_test, fpTestArr);\n  fprintf(stderr, \"Measuring FP32 inverse square roots\\n\");\n  float fp32rsqrts = measureFunction(iterations, vec_fp32_rsqrt_test, fpTestArr);\n\n  fprintf(stderr, \"Measuring INT64 Adds\\n\");\n  float int64adds = measureFunction(iterations, vec_int64_add_test, int64TestArr);\n  //fprintf(stderr, \"Measuring INT64 Multiplies\\n\");\n  //float int64muls = measureFunction(iterations, vec_int64_mul_test, int64TestArr);\n  fprintf(stderr, \"Measuring FP64 Adds\\n\");\n  float fp64adds = measureFunction(iterations, vec_fp64_add_test, fp64TestArr);\n  fprintf(stderr, \"Measuring FP64 FMAs\\n\");\n  float fp64fmas = measureFunction(iterations, vec_fp64_fma_test, fp64TestArr);\n\n  printf(\"-----GOPS/s-----\\n\");\n  printf(\"INT32 Add: %f\\n\", int32adds);\n  printf(\"INT32 Multiply: %f\\n\", int32muls);\n  printf(\"FP32 Add: %f\\n\", fp32adds);\n  printf(\"FP32 FMA: %f\\n\", fp32fmas);\n  printf(\"FP32 Inverse Square Roots: %f\\n\", fp32rsqrts);\n  printf(\"INT64 Adds: %f\\n\", int64adds);\n  //printf(\"INT64 Multiply: %f\\n\", int64muls);\n  printf(\"FP64 Adds: %f\\n\", fp64adds);\n  printf(\"FP64 FMAs: %f\\n\", fp64fmas);\n\n  // release every buffer allocated above, including the 64-bit ones\n  free(intTestArr);\n  free(fpTestArr);\n  free(int64TestArr);\n  free(fp64TestArr);\n  return;\n}\n"
  },
  {
    "path": "mt_instructionrate/arm_mt_instructionrate.s",
    "content": ".text\n\n.global vec_int32_add_test\n.global vec_int32_mul_test\n.global vec_fp32_add_test\n.global vec_fp32_fma_test\n.global vec_fp32_rsqrt_test\n.global vec_int64_add_test\n/*.global vec_int64_mul_test*/\n.global vec_fp64_add_test\n.global vec_fp64_fma_test\n\n/* x0 = iteration count, x1 = data */\n/* Each loop pass issues six independent vector instructions and subtracts the\n   number of element operations performed (6 x lanes) from the count in x0 */\nvec_int32_add_test:\n  mov x14, 24\n  ldr q16, [x1]\n  ldr q17, [x1]\n  ldr q18, [x1]\n  ldr q19, [x1]\n  ldr q20, [x1]\n  ldr q21, [x1] \nvec_int32_add_test_loop:\n  add v16.4s, v16.4s, v16.4s\n  add v17.4s, v17.4s, v17.4s\n  add v18.4s, v18.4s, v18.4s\n  add v19.4s, v19.4s, v19.4s\n  add v20.4s, v20.4s, v20.4s\n  add v21.4s, v21.4s, v21.4s \n  sub x0, x0, x14\n  cmp x0, 0\n  b.gt vec_int32_add_test_loop\n  ret\n\nvec_int32_mul_test:\n  mov x14, 24\n  ldr q16, [x1]\n  ldr q17, [x1]\n  ldr q18, [x1]\n  ldr q19, [x1]\n  ldr q20, [x1]\n  ldr q21, [x1] \nvec_int32_mul_test_loop:\n  mul v16.4s, v16.4s, v16.4s\n  mul v17.4s, v17.4s, v17.4s\n  mul v18.4s, v18.4s, v18.4s\n  mul v19.4s, v19.4s, v19.4s\n  mul v20.4s, v20.4s, v20.4s\n  mul v21.4s, v21.4s, v21.4s \n  sub x0, x0, x14\n  cmp x0, 0\n  b.gt vec_int32_mul_test_loop\n  ret\n\nvec_fp32_add_test:\n  mov x14, 24\n  ldr q16, [x1]\n  ldr q17, [x1]\n  ldr q18, [x1]\n  ldr q19, [x1]\n  ldr q20, [x1]\n  ldr q21, [x1] \nvec_fp32_add_test_loop:\n  fadd v16.4s, v16.4s, v16.4s\n  fadd v17.4s, v17.4s, v17.4s\n  fadd v18.4s, v18.4s, v18.4s\n  fadd v19.4s, v19.4s, v19.4s\n  fadd v20.4s, v20.4s, v20.4s\n  fadd v21.4s, v21.4s, v21.4s \n  sub x0, x0, x14\n  cmp x0, 0\n  b.gt vec_fp32_add_test_loop\n  ret\n\nvec_fp32_fma_test:\n  mov x14, 24\n  ldr q16, [x1]\n  ldr q17, [x1]\n  ldr q18, [x1]\n  ldr q19, [x1]\n  ldr q20, [x1]\n  ldr q21, [x1] \nvec_fp32_fma_test_loop:\n  fmla v16.4s, v16.4s, v16.4s\n  fmla v17.4s, v17.4s, v17.4s\n  fmla v18.4s, v18.4s, v18.4s\n  fmla v19.4s, v19.4s, v19.4s\n  fmla v20.4s, v20.4s, v20.4s\n  fmla v21.4s, v21.4s, v21.4s \n  sub x0, x0, x14\n  cmp x0, 0\n  b.gt vec_fp32_fma_test_loop\n  ret\n\n/* frsqrte is the floating point reciprocal square root estimate; ursqrte is the\n   unsigned integer variant and would not measure FP32 throughput */\nvec_fp32_rsqrt_test:\n  mov x14, 24\n  ldr q16, [x1]\n  ldr q17, [x1]\n  ldr q18, [x1]\n  ldr q19, [x1]\n  ldr q20, [x1]\n  ldr q21, [x1] \nvec_fp32_rsqrt_test_loop:\n  frsqrte v16.4s, v16.4s\n  frsqrte v17.4s, v17.4s\n  frsqrte v18.4s, v18.4s\n  frsqrte v19.4s, v19.4s\n  frsqrte v20.4s, v20.4s\n  frsqrte v21.4s, v21.4s \n  sub x0, x0, x14\n  cmp x0, 0\n  b.gt vec_fp32_rsqrt_test_loop\n  ret \n\nvec_int64_add_test:\n  mov x14, 12\n  ldr q16, [x1]\n  ldr q17, [x1]\n  ldr q18, [x1]\n  ldr q19, [x1]\n  ldr q20, [x1]\n  ldr q21, [x1] \nvec_int64_add_test_loop:\n  add v16.2d, v16.2d, v16.2d\n  add v17.2d, v17.2d, v17.2d\n  add v18.2d, v18.2d, v18.2d\n  add v19.2d, v19.2d, v19.2d\n  add v20.2d, v20.2d, v20.2d\n  add v21.2d, v21.2d, v21.2d \n  sub x0, x0, x14\n  cmp x0, 0\n  b.gt vec_int64_add_test_loop\n  ret \n \n/*vec_int64_mul_test:\n  mov x14, 12\n  ldr q16, [x1]\n  ldr q17, [x1]\n  ldr q18, [x1]\n  ldr q19, [x1]\n  ldr q20, [x1]\n  ldr q21, [x1] \nvec_int64_mul_test_loop:\n  mul v16.2d, v16.2d, v16.2d\n  mul v17.2d, v17.2d, v17.2d\n  mul v18.2d, v18.2d, v18.2d\n  mul v19.2d, v19.2d, v19.2d\n  mul v20.2d, v20.2d, v20.2d\n  mul v21.2d, v21.2d, v21.2d \n  sub x0, x0, x14\n  cmp x0, 0\n  b.gt vec_int64_mul_test_loop\n  ret */\n\nvec_fp64_add_test:\n  mov x14, 12\n  ldr q16, [x1]\n  ldr q17, [x1]\n  ldr q18, [x1]\n  ldr q19, [x1]\n  ldr q20, [x1]\n  ldr q21, [x1] \nvec_fp64_add_test_loop:\n  fadd v16.2d, v16.2d, v16.2d\n  fadd v17.2d, v17.2d, v17.2d\n  fadd v18.2d, v18.2d, v18.2d\n  fadd v19.2d, v19.2d, v19.2d\n  fadd v20.2d, v20.2d, v20.2d\n  fadd v21.2d, v21.2d, v21.2d \n  sub x0, x0, x14\n  cmp x0, 0\n  b.gt vec_fp64_add_test_loop\n  ret \n\nvec_fp64_fma_test:\n  mov x14, 12\n  ldr q16, [x1]\n  ldr q17, [x1]\n  ldr q18, [x1]\n  ldr q19, [x1]\n  ldr q20, [x1]\n  ldr q21, [x1] \nvec_fp64_fma_test_loop:\n  fmla v16.2d, v16.2d, v16.2d\n  fmla v17.2d, v17.2d, v17.2d\n  fmla v18.2d, v18.2d, v18.2d\n  fmla v19.2d, v19.2d, v19.2d\n  fmla v20.2d, v20.2d, v20.2d\n  fmla v21.2d, v21.2d, v21.2d \n  sub x0, x0, x14\n  cmp x0, 0\n  b.gt vec_fp64_fma_test_loop\n  ret\n"
  },
  {
    "path": "mt_instructionrate/mt_instructionrate.c",
    "content": "#define _GNU_SOURCE\n#include <stdio.h>\n#include <stdlib.h>\n#include <stdint.h>\n#include <inttypes.h>\n#include <math.h>\n#include <string.h>\n\n#ifndef _MSC_VER\n#include <pthread.h>\n#include <unistd.h>\n#include <sys/syscall.h>\n#ifdef __x86_64\n#define SMALLKITTEN __attribute__((ms_abi))\n#else\n#define SMALLKITTEN\n#endif\n#define gettid() ((pid_t)syscall(SYS_gettid))\n#else\n#include <Windows.h>\n#define SMALLKITTEN\n#define _CRT_SECURE_NO_WARNINGS\n#endif\n#include \"../Common/timing.h\"\n\n// Maximum number of entries accepted in a -cores list\n#define MAX_CORE_LIST_ENTRIES 64\n\n// Work description handed to each test thread, plus the slot it reports its runtime in\nstruct TestThreadData {\n    float timeMs;  // written by thread to indicate elapsed runtime for that thread\n    uint64_t iterations;\n    void *testData;\n    int core;     // -1 = don't set affinity. otherwise set affinity to specified core\n    uint64_t (*testfunc)(uint64_t, void *) SMALLKITTEN;\n};\n\nfloat measureFunction(uint64_t baseIterations, uint64_t (*testFunc)(uint64_t, void *) SMALLKITTEN, void *data);\nvoid *TestThread(void *param);\n\n#ifdef _MSC_VER\n// CreateThread requires a DWORD WINAPI (LPVOID) entry point, so adapt TestThread to it\nstatic DWORD WINAPI TestThreadEntry(LPVOID param) {\n  TestThread(param);\n  return 0;\n}\n#endif\n\nint threadCount = 1;\nint *coreList = NULL;\n\n#ifdef __aarch64__\n#include \"arm_mt_instructionrate.c\"\n#endif\n\n#ifdef __x86_64\n#include \"x86_mt_instructionrate.c\"\n#endif\n\n#ifdef _MSC_VER\n#include \"x86_mt_instructionrate.c\"\n#endif\n\n\n#ifdef __PPC64__\n#include \"ppc64_mt_instructionrate.c\"\n#endif\n\n// Parses the command line, then runs the architecture specific RunTests().\n//   -threads N      run N threads, thread i is assigned core i\n//   -cores a,b,c    run one thread per listed core\nint main(int argc, char *argv[]) {\n  char parseBuffer[512];\n  int parseIndices[MAX_CORE_LIST_ENTRIES];\n\n  for (int argIdx = 1; argIdx < argc; argIdx++) {\n    if (*(argv[argIdx]) != '-') continue;\n\n    char *arg = argv[argIdx] + 1;\n    if (strncmp(arg, \"threads\", 7) == 0) {\n      // the option needs a value; argv[argc] is NULL and must not reach atoi\n      if (argIdx + 1 >= argc) {\n        fprintf(stderr, \"-threads requires a thread count\\n\");\n        return 1;\n      }\n\n      argIdx++;\n      threadCount = atoi(argv[argIdx]);\n      if (threadCount < 1) {\n        fprintf(stderr, \"Invalid thread count: %s\\n\", argv[argIdx]);\n        return 1;\n      }\n\n      // drop any earlier -cores list, it may be shorter than the new thread count\n      free(coreList);\n      coreList = NULL;\n      fprintf(stderr, \"Using first %d cores\\n\", threadCount);\n    } else if (strncmp(arg, \"cores\", 5) == 0) {\n      if (argIdx + 1 >= argc) {\n        fprintf(stderr, \"-cores requires a comma separated core list\\n\");\n        return 1;\n      }\n\n      argIdx++;\n\n      // strncpy does not terminate the copy when the source is too long\n      strncpy(parseBuffer, argv[argIdx], sizeof(parseBuffer) - 1);\n      parseBuffer[sizeof(parseBuffer) - 1] = '\\0';\n\n      // split the list in place and remember where each entry starts;\n      // stop at the terminator so bytes past the string are never inspected\n      parseIndices[0] = 0;\n      int indexIdx = 1;\n      threadCount = 1;\n      for (int i = 0; parseBuffer[i] != '\\0' && indexIdx < MAX_CORE_LIST_ENTRIES; i++) {\n        if (parseBuffer[i] == ',') {\n          parseBuffer[i] = '\\0';\n          parseIndices[indexIdx] = i + 1;\n          indexIdx++;\n          threadCount++;\n        }\n      }\n\n      free(coreList);  // -cores may be given more than once\n      coreList = malloc(sizeof(int) * threadCount);\n      if (coreList == NULL) {\n        fprintf(stderr, \"Could not allocate core list\\n\");\n        return 1;\n      }\n\n      fprintf(stderr, \"Using %d cores:\", threadCount);\n      for (int i = 0; i < threadCount; i++) {\n        coreList[i] = atoi(parseBuffer + parseIndices[i]);\n        fprintf(stderr, \" %d\", coreList[i]);\n      }\n\n      fprintf(stderr, \"\\n\");\n    }\n  }\n\n  RunTests();\n\n  free(coreList);\n  return 0;\n}\n\n// return billion operations per second\n// test function must perform iterations ops\n// Runs testFunc on threadCount threads and repeats the run, rescaling iteration counts,\n// until it lasts over 2 seconds and the per-thread times are within 20% of each other.\nfloat measureFunction(uint64_t baseIterations, uint64_t (*testFunc)(uint64_t, void *) SMALLKITTEN, void *data){\n  int toleranceMet = 0, minTimeMet = 0;\n  unsigned int timeMs;\n\n  struct TestThreadData *testData = (struct TestThreadData *)malloc(threadCount * sizeof(struct TestThreadData));\n#ifndef _MSC_VER\n  pthread_t* testThreads = (pthread_t*)malloc(threadCount * sizeof(pthread_t));\n#else\n  HANDLE* testThreads = (HANDLE*)malloc(threadCount * sizeof(HANDLE));\n#endif\n  if (testData == NULL || testThreads == NULL) {\n    fprintf(stderr, \"Could not allocate per-thread state\\n\");\n    free(testData);\n    free(testThreads);\n    exit(1);\n  }\n\n  for (int threadIdx = 0; threadIdx < threadCount; threadIdx++) {\n    testData[threadIdx].iterations = baseIterations;\n    testData[threadIdx].testData = data;\n    testData[threadIdx].testfunc = testFunc;\n    if (coreList == NULL) testData[threadIdx].core = threadIdx;\n    else testData[threadIdx].core = coreList[threadIdx];\n  }\n\n  do {\n    start_timing();\n    for (int threadIdx = 0; threadIdx < threadCount; threadIdx++) {\n#ifndef _MSC_VER\n      if (pthread_create(testThreads + threadIdx, NULL, TestThread, testData + threadIdx) != 0) {\n        fprintf(stderr, \"Could not create thread %d\\n\", threadIdx);\n        exit(1);\n      }\n#else\n      // created suspended so the affinity mask is in place before the thread runs\n      testThreads[threadIdx] = CreateThread(NULL, 0, TestThreadEntry, testData + threadIdx, CREATE_SUSPENDED, NULL);\n      if (testThreads[threadIdx] == NULL) {\n        fprintf(stderr, \"Could not create thread %d\\n\", threadIdx);\n        exit(1);\n      }\n\n      // core == -1 means no pinning; the mask is pointer sized so cores above 31 work\n      if (testData[threadIdx].core >= 0) {\n        SetThreadAffinityMask(testThreads[threadIdx], (DWORD_PTR)1 << testData[threadIdx].core);\n      }\n\n      ResumeThread(testThreads[threadIdx]);\n#endif\n    }\n\n    float maxThreadTime = -1, minThreadTime = -1;\n    for (int threadIdx = 0; threadIdx < threadCount; threadIdx++) {\n#ifndef _MSC_VER\n      pthread_join(testThreads[threadIdx], NULL);\n#else\n      // wait per thread: WaitForMultipleObjects is limited to 64 handles.\n      // Close the handle afterwards, a new one is created on every pass.\n      WaitForSingleObject(testThreads[threadIdx], INFINITE);\n      CloseHandle(testThreads[threadIdx]);\n#endif\n      fprintf(stderr, \"Thread %d took %f ms\\n\", threadIdx, testData[threadIdx].timeMs);\n      if (maxThreadTime < 0 || testData[threadIdx].timeMs > maxThreadTime) maxThreadTime = testData[threadIdx].timeMs;\n      if (minThreadTime < 0 || testData[threadIdx].timeMs < minThreadTime) minThreadTime = testData[threadIdx].timeMs;\n    }\n\n    timeMs = end_timing();\n    minTimeMet = timeMs > 2000; // see if 2 seconds will work\n    toleranceMet = ((maxThreadTime - minThreadTime) / minThreadTime) < 0.2f; // allow 20% variation between threads\n\n    if (!minTimeMet) {\n      // Increase iteration count with 3s target\n      baseIterations = scale_iterations_to_target(baseIterations, (float)timeMs, 3000.0f);\n      for (int threadIdx = 0; threadIdx < threadCount; threadIdx++) {\n        testData[threadIdx].iterations = baseIterations;\n      }\n\n      fprintf(stderr, \"Setting %\" PRIu64 \" iterations\\n\", baseIterations);\n    } else if (!toleranceMet) {\n      // give faster threads more work so all threads finish close to the slowest one\n      for (int threadIdx = 0; threadIdx < threadCount; threadIdx++) {\n        testData[threadIdx].iterations = scale_iterations_to_target(\n          testData[threadIdx].iterations,\n          testData[threadIdx].timeMs,\n          maxThreadTime);\n        fprintf(stderr, \"Thread %d -> %\" PRIu64 \" iterations\\n\", threadIdx, testData[threadIdx].iterations);\n      }\n    }\n  } while ((!toleranceMet) || (!minTimeMet));\n\n  fprintf(stderr, \"time elapsed: %u ms\\n\", timeMs);\n\n  uint64_t totalIterations = 0;\n  for (int threadIdx = 0; threadIdx < threadCount; threadIdx++) {\n    totalIterations += testData[threadIdx].iterations;\n  }\n\n  free(testData);\n  free(testThreads);\n\n  return (1000 * totalIterations / timeMs) / 1e9;\n}\n\n// Thread body: optionally pins itself to the requested core (non-MSVC builds), runs the\n// test function and stores the elapsed time in the TestThreadData passed as param.\nvoid *TestThread(void *param) {\n  struct TestThreadData *testData = (struct TestThreadData *)param;\n\n#ifndef _MSC_VER\n  if (testData->core >= 0) {\n    cpu_set_t cpuset;\n    CPU_ZERO(&cpuset);\n    CPU_SET(testData->core, &cpuset);\n    sched_setaffinity(gettid(), sizeof(cpu_set_t), &cpuset);\n  }\n\n  struct timeval start1;\n#else\n  struct timeb start1;\n#endif\n  start_timing_ts(&start1);\n  testData->testfunc(testData->iterations, testData->testData);\n  testData->timeMs = end_timing_ts(&start1);\n\n  return NULL;\n}\n"
  },
  {
    "path": "mt_instructionrate/mt_instructionrate.sln",
    "content": "﻿\r\nMicrosoft Visual Studio Solution File, Format Version 12.00\r\n# Visual Studio Version 17\r\nVisualStudioVersion = 17.8.34511.84\r\nMinimumVisualStudioVersion = 10.0.40219.1\r\nProject(\"{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}\") = \"Project1\", \"Project1.vcxproj\", \"{0AD46EB5-549E-4E36-9CEA-89D06CEE1B5E}\"\r\nEndProject\r\nGlobal\r\n\tGlobalSection(SolutionConfigurationPlatforms) = preSolution\r\n\t\tDebug|x64 = Debug|x64\r\n\t\tDebug|x86 = Debug|x86\r\n\t\tRelease|x64 = Release|x64\r\n\t\tRelease|x86 = Release|x86\r\n\tEndGlobalSection\r\n\tGlobalSection(ProjectConfigurationPlatforms) = postSolution\r\n\t\t{0AD46EB5-549E-4E36-9CEA-89D06CEE1B5E}.Debug|x64.ActiveCfg = Debug|x64\r\n\t\t{0AD46EB5-549E-4E36-9CEA-89D06CEE1B5E}.Debug|x64.Build.0 = Debug|x64\r\n\t\t{0AD46EB5-549E-4E36-9CEA-89D06CEE1B5E}.Debug|x86.ActiveCfg = Debug|Win32\r\n\t\t{0AD46EB5-549E-4E36-9CEA-89D06CEE1B5E}.Debug|x86.Build.0 = Debug|Win32\r\n\t\t{0AD46EB5-549E-4E36-9CEA-89D06CEE1B5E}.Release|x64.ActiveCfg = Release|x64\r\n\t\t{0AD46EB5-549E-4E36-9CEA-89D06CEE1B5E}.Release|x64.Build.0 = Release|x64\r\n\t\t{0AD46EB5-549E-4E36-9CEA-89D06CEE1B5E}.Release|x86.ActiveCfg = Release|Win32\r\n\t\t{0AD46EB5-549E-4E36-9CEA-89D06CEE1B5E}.Release|x86.Build.0 = Release|Win32\r\n\tEndGlobalSection\r\n\tGlobalSection(SolutionProperties) = preSolution\r\n\t\tHideSolutionNode = FALSE\r\n\tEndGlobalSection\r\n\tGlobalSection(ExtensibilityGlobals) = postSolution\r\n\t\tSolutionGuid = {B31B466E-F833-4B33-9E21-74616F970AA2}\r\n\tEndGlobalSection\r\nEndGlobal\r\n"
  },
  {
    "path": "mt_instructionrate/ppc64_mt_instructionrate.c",
    "content": "extern uint64_t vec_int32_add_test(uint64_t iterations, void *data);\nextern uint64_t vec_int32_mul_test(uint64_t iterations, void *data);\nextern uint64_t vec_fp32_add_test(uint64_t iterations, void *data);\nextern uint64_t vec_fp32_fma_test(uint64_t iterations, void *data);\nextern uint64_t vec_fp32_isqrt_test(uint64_t iterations, void *data);\nextern uint64_t fp64_add_test(uint64_t iterations, void *data);\nextern uint64_t fp64_fma_test(uint64_t iterations, void *data);\n\n// Runs every test kernel through measureFunction and prints one result line per test.\n// Progress messages go to stderr, results go to stdout.\nvoid RunTests() {\n  uint64_t iterations = 3500000000;\n  int testDataLength = 256;\n  uint32_t *intTestArr = (uint32_t *)malloc(sizeof(uint32_t) * testDataLength);\n  // FP32 input must be stored as float; storing it through a uint32_t pointer\n  // would truncate each value to an integer\n  float *fpTestArr = (float *)malloc(sizeof(float) * testDataLength);\n  // the FP64 tests get real doubles instead of reinterpreted 32-bit data\n  double *fp64TestArr = (double *)malloc(sizeof(double) * testDataLength);\n  if (intTestArr == NULL || fpTestArr == NULL || fp64TestArr == NULL) {\n    fprintf(stderr, \"Could not allocate test data\\n\");\n    free(intTestArr);\n    free(fpTestArr);\n    free(fp64TestArr);\n    return;\n  }\n\n  for (int i = 0; i < testDataLength; i++) {\n    intTestArr[i] = i;\n    fpTestArr[i] = i * 1.2f;\n    fp64TestArr[i] = 2.0 + 0.01 * i;\n  }\n\n  fprintf(stderr, \"Measuring INT32 adds\\n\");\n  float int32adds = measureFunction(iterations, vec_int32_add_test, intTestArr);\n  fprintf(stderr, \"Measuring INT32 multiplies\\n\");\n  float int32muls = measureFunction(iterations, vec_int32_mul_test, intTestArr);\n  fprintf(stderr, \"Measuring FP32 adds\\n\");\n  float fp32adds = measureFunction(iterations, vec_fp32_add_test, fpTestArr);\n  fprintf(stderr, \"Measuring FP32 FMAs\\n\");\n  float fp32fmas = measureFunction(iterations, vec_fp32_fma_test, fpTestArr);\n  fprintf(stderr, \"Measuring FP32 inverse square roots\\n\");\n  float fp32isqrt = measureFunction(iterations, vec_fp32_isqrt_test, fpTestArr);\n  fprintf(stderr, \"Measuring FP64 adds\\n\");\n  float fp64adds = measureFunction(iterations, fp64_add_test, fp64TestArr);\n  fprintf(stderr, \"Measuring FP64 FMAs\\n\");\n  float fp64fmas = measureFunction(iterations, fp64_fma_test, fp64TestArr);\n\n  printf(\"-----GOPS/s-----\\n\");\n  printf(\"Altivec INT32 Add: %f\\n\", int32adds);\n  printf(\"Altivec INT32 Multiply: %f\\n\", int32muls);\n  printf(\"Altivec FP32 Add: %f\\n\", fp32adds);\n  printf(\"Altivec FP32 FMA: %f (%f GFLOPS)\\n\", fp32fmas, 2 * fp32fmas);\n  printf(\"Altivec FP32 Inverse Square Root: %f\\n\", fp32isqrt);\n  printf(\"FP64 Add: %f\\n\", fp64adds);\n  printf(\"FP64 FMA: %f (%f GFLOPS)\\n\", fp64fmas, 2 * fp64fmas);\n\n  free(intTestArr);\n  free(fpTestArr);\n  free(fp64TestArr);\n  return;\n}\n"
  },
  {
    "path": "mt_instructionrate/ppc64_mt_instructionrate.s",
    "content": ".text\n\n.global vec_int32_add_test\n.global vec_int32_mul_test\n.global vec_fp32_add_test\n.global vec_fp32_fma_test\n.global vec_fp32_isqrt_test\n.global fp64_add_test\n.global fp64_fma_test\n\n/* r3 = iterations, r4 = ptr to arr */\n/* Every loop keeps a running count in r9, advances it by a fixed amount per\n   pass and repeats while r3 > r9 (unsigned compare). Each register carries its\n   own dependency chain so the chains can issue independently. */\n\nvec_int32_add_test:\n  .quad .L.vec_int32_add_test,.TOC.@tocbase,0\n.L.vec_int32_add_test:\n  li r9, 0\n  lvx v0, r4, r9\n  li r9, 16\n  lvx v1, r4, r9\n  li r9, 32\n  lvx v2, r4, r9\n  li r9, 48\n  lvx v3, r4, r9\n  li r9, 64\n  lvx v4, r4, r9\n  li r9, 80\n  lvx v5, r4, r9\n  li r9, 0\nvec_int32_add_test_loop:\n  vadduwm v0, v0, v0\n  vadduwm v1, v1, v1\n  vadduwm v2, v2, v2\n  vadduwm v3, v3, v3\n  vadduwm v4, v4, v4\n  /* v5 must depend only on itself; reading v4 here would serialize it behind the v4 chain */\n  vadduwm v5, v5, v5\n  addi r9, r9, 24\n  cmpld cr7, r3, r9\n  bgt cr7, vec_int32_add_test_loop\n  blr\n\nvec_int32_mul_test:\n  .quad .L.vec_int32_mul_test,.TOC.@tocbase,0\n.L.vec_int32_mul_test:\n  li r9, 0\n  lvx v0, r4, r9\n  li r9, 16\n  lvx v1, r4, r9\n  li r9, 32\n  lvx v2, r4, r9\n  li r9, 48\n  lvx v3, r4, r9\n  li r9, 64\n  lvx v4, r4, r9\n  li r9, 80\n  lvx v5, r4, r9\n  li r9, 96\n  lvx v6, r4, r9\n  li r9, 112\n  lvx v7, r4, r9\n  li r9, 0\nvec_int32_mul_test_loop:\n  vmuleuh v0, v0, v0\n  vmuleuh v1, v1, v1\n  vmuleuh v2, v2, v2\n  vmuleuh v3, v3, v3\n  vmuleuh v4, v4, v4\n  vmuleuh v5, v5, v5\n  vmuleuh v6, v6, v6\n  vmuleuh v7, v7, v7\n  addi r9, r9, 32\n  cmpld cr7, r3, r9\n  bgt cr7, vec_int32_mul_test_loop\n  blr\n\nvec_fp32_add_test:\n  .quad .L.vec_fp32_add_test,.TOC.@tocbase,0\n.L.vec_fp32_add_test:\n  li r9, 0\n  lvx v0, r4, r9\n  li r9, 16\n  lvx v1, r4, r9\n  li r9, 32\n  lvx v2, r4, r9\n  li r9, 48\n  lvx v3, r4, r9\n  li r9, 64\n  lvx v4, r4, r9\n  li r9, 80\n  lvx v5, r4, r9\n  li r9, 96\n  lvx v6, r4, r9\n  li r9, 112\n  lvx v7, r4, r9\n  li r9, 0\nvec_fp32_add_test_loop:\n  vaddfp v0, v0, v0\n  vaddfp v1, v1, v1\n  vaddfp v2, v2, v2\n  vaddfp v3, v3, v3\n  vaddfp v4, v4, v4\n  vaddfp v5, v5, v5\n  vaddfp v6, v6, v6\n  vaddfp v7, v7, v7\n  addi r9, r9, 32\n  cmpld cr7, r3, r9\n  bgt cr7, vec_fp32_add_test_loop\n  blr\n\nvec_fp32_fma_test:\n  .quad .L.vec_fp32_fma_test,.TOC.@tocbase,0\n.L.vec_fp32_fma_test:\n  li r9, 0\n  lvx v0, r4, r9\n  li r9, 16\n  lvx v1, r4, r9\n  li r9, 32\n  lvx v2, r4, r9\n  li r9, 48\n  lvx v3, r4, r9\n  li r9, 64\n  lvx v4, r4, r9\n  li r9, 80\n  lvx v5, r4, r9\n  li r9, 96\n  lvx v6, r4, r9\n  li r9, 112\n  lvx v7, r4, r9\n  li r9, 0\nvec_fp32_fma_test_loop:\n  vmaddfp v0, v0, v0, v0\n  vmaddfp v1, v1, v1, v1\n  vmaddfp v2, v2, v2, v2\n  vmaddfp v3, v3, v3, v3\n  vmaddfp v4, v4, v4, v4\n  vmaddfp v5, v5, v5, v5\n  vmaddfp v6, v6, v6, v6\n  vmaddfp v7, v7, v7, v7\n  addi r9, r9, 32\n  cmpld cr7, r3, r9\n  /* branch back to this test's own loop, not the vaddfp loop */\n  bgt cr7, vec_fp32_fma_test_loop\n  blr\n\nvec_fp32_isqrt_test:\n  .quad .L.vec_fp32_isqrt_test,.TOC.@tocbase,0\n.L.vec_fp32_isqrt_test:\n  li r9, 0\n  lvx v0, r4, r9\n  li r9, 16\n  lvx v1, r4, r9\n  li r9, 32\n  lvx v2, r4, r9\n  li r9, 48\n  lvx v3, r4, r9\n  li r9, 64\n  lvx v4, r4, r9\n  li r9, 80\n  lvx v5, r4, r9\n  li r9, 96\n  lvx v6, r4, r9\n  li r9, 112\n  lvx v7, r4, r9\n  li r9, 0\nvec_fp32_isqrt_test_loop:\n  vrsqrtefp v0, v0\n  vrsqrtefp v1, v1\n  vrsqrtefp v2, v2\n  vrsqrtefp v3, v3\n  vrsqrtefp v4, v4\n  vrsqrtefp v5, v5\n  vrsqrtefp v6, v6\n  vrsqrtefp v7, v7\n  addi r9, r9, 32\n  cmpld cr7, r3, r9\n  bgt cr7, vec_fp32_isqrt_test_loop\n  blr\n\nfp64_add_test:\n  .quad .L.fp64_add_test,.TOC.@tocbase,0\n.L.fp64_add_test:\n  lfd f0, 0(r4)\n  lfd f1, 8(r4)\n  lfd f2, 16(r4)\n  lfd f3, 24(r4)\n  lfd f4, 32(r4)\n  lfd f5, 40(r4)\n  lfd f6, 48(r4)\n  lfd f7, 56(r4)\n  /* the scalar tests load via displacement, so r9 has not been set yet; start the count at zero */\n  li r9, 0\nfp64_add_test_loop:\n  fadd f0, f0, f0\n  fadd f1, f1, f1\n  fadd f2, f2, f2\n  fadd f3, f3, f3\n  fadd f4, f4, f4\n  fadd f5, f5, f5\n  fadd f6, f6, f6\n  fadd f7, f7, f7\n  addi r9, r9, 8\n  cmpld cr7, r3, r9\n  bgt cr7, fp64_add_test_loop\n  blr\n\nfp64_fma_test:\n  .quad .L.fp64_fma_test,.TOC.@tocbase,0\n.L.fp64_fma_test:\n  lfd f0, 0(r4)\n  lfd f1, 8(r4)\n  lfd f2, 16(r4)\n  lfd f3, 24(r4)\n  lfd f4, 32(r4)\n  lfd f5, 40(r4)\n  lfd f6, 48(r4)\n  lfd f7, 56(r4)\n  /* start the count at zero; r9 is otherwise whatever the caller left in it */\n  li r9, 0\nfp64_fma_test_loop:\n  fmadd f0, f0, f0, f0\n  fmadd f1, f1, f1, f1\n  fmadd f2, f2, f2, f2\n  fmadd f3, f3, f3, f3\n  fmadd f4, f4, f4, f4\n  fmadd f5, f5, f5, f5\n  fmadd f6, f6, f6, f6\n  fmadd f7, f7, f7, f7\n  addi r9, r9, 8\n  cmpld cr7, r3, r9\n  bgt cr7, fp64_fma_test_loop\n  blr\n"
  },
  {
    "path": "mt_instructionrate/x86_mt_instructionrate.c",
    "content": "extern uint64_t sse_int32_add_test(uint64_t iterations, void *data) SMALLKITTEN;\nextern uint64_t sse_int32_mul_test(uint64_t iterations, void* data) SMALLKITTEN;\nextern uint64_t sse_int64_add_test(uint64_t iterations, void* data) SMALLKITTEN;\nextern uint64_t sse_int64_mul_test(uint64_t iterations, void* data) SMALLKITTEN;\nextern uint64_t avx2_int32_add_test(uint64_t iterations, void* data) SMALLKITTEN;\nextern uint64_t avx2_int32_mul_test(uint64_t iterations, void* data) SMALLKITTEN;\nextern uint64_t avx2_int64_add_test(uint64_t iterations, void* data) SMALLKITTEN;\nextern uint64_t avx2_int64_mul_test(uint64_t iterations, void* data) SMALLKITTEN;\nextern uint64_t sse_fp32_add_test(uint64_t iterations, void* data) SMALLKITTEN;\nextern uint64_t sse_fp32_mul_test(uint64_t iterations, void* data) SMALLKITTEN;\nextern uint64_t sse_fp32_muladd_test(uint64_t iterations, void* data) SMALLKITTEN;\nextern uint64_t avx_fp32_add_test(uint64_t iterations, void* data) SMALLKITTEN;\nextern uint64_t avx_fp32_mul_test(uint64_t iterations, void* data) SMALLKITTEN;\nextern uint64_t avx_fp32_muladd_test(uint64_t iterations, void* data) SMALLKITTEN;\nextern uint64_t sse_fp64_add_test(uint64_t iterations, void* data) SMALLKITTEN;\nextern uint64_t sse_fp64_mul_test(uint64_t iterations, void* data) SMALLKITTEN;\nextern uint64_t sse_fp64_muladd_test(uint64_t iterations, void* data) SMALLKITTEN;\nextern uint64_t avx_fp64_add_test(uint64_t iterations, void* data) SMALLKITTEN;\nextern uint64_t avx_fp64_mul_test(uint64_t iterations, void* data) SMALLKITTEN;\nextern uint64_t avx_fp64_muladd_test(uint64_t iterations, void* data) SMALLKITTEN;\nextern uint64_t fp32_fma_test(uint64_t iterations, void* data) SMALLKITTEN;\nextern uint64_t fp64_fma_test(uint64_t iterations, void* data) SMALLKITTEN;\n\nextern uint64_t avx_fp32_rsqrt_test(uint64_t iterations, void *data) SMALLKITTEN;\nextern uint64_t sse_fp32_rsqrt_test(uint64_t iterations, void *data) SMALLKITTEN;\n\nextern uint64_t avx512_int32_add_test(uint64_t iterations, void *data) SMALLKITTEN;\nextern uint64_t avx512_int32_mul_test(uint64_t iterations, void *data) SMALLKITTEN;\nextern uint64_t avx512_int64_add_test(uint64_t iterations, void *data) SMALLKITTEN;\nextern uint64_t avx512_int64_mul_test(uint64_t iterations, void *data) SMALLKITTEN;\nextern uint64_t avx512_fp32_rsqrt_test(uint64_t iterations, void *data) SMALLKITTEN;\nextern uint64_t avx512_fp32_add_test(uint64_t iterations, void *data) SMALLKITTEN;\nextern uint64_t avx512_fp32_fma_test(uint64_t iterations, void *data) SMALLKITTEN;\nextern uint64_t avx512_fp64_add_test(uint64_t iterations, void *data) SMALLKITTEN;\nextern uint64_t avx512_fp64_fma_test(uint64_t iterations, void *data) SMALLKITTEN;\n\n#ifndef _MSC_VER\n#include <cpuid.h>\n//void __cpuidex(int *data, int function, int subfunction) {\n//  int eax, ebx, ecx, edx;\n//  __cpuid_count(function, subfunction, eax, ebx, ecx, edx);\n//  data[0] = eax;\n//  data[1] = ebx;\n//  data[2] = ecx;\n//  data[3] = edx;\n//}\n#endif\n\n\n/*\n * Runs every x86 throughput kernel the CPU reports support for and prints the\n * results to stdout. Feature-detection messages go to stderr.\n *\n * AVX and FMA are read from CPUID leaf 1 (ECX bits 28 and 12); AVX2 and\n * AVX-512F are read from CPUID leaf 7, subleaf 0 (EBX bits 5 and 16).\n * Each kernel receives the array whose element type matches its operation.\n */\nvoid RunTests() {\n  int cpuid_data[4];\n  int avx_supported = 0, avx2_supported = 0, avx512_supported = 0, fma_supported = 0;\n  uint64_t iterations = 5500000000;\n  int testDataLength = 512;\n  uint32_t *intTestArr = (uint32_t *)malloc(sizeof(uint32_t) * testDataLength);\n  uint64_t *int64TestArr = (uint64_t *)malloc(sizeof(uint64_t) * testDataLength);\n  float *fpTestArr = (float *)malloc(sizeof(float) * testDataLength);\n  double *fp64TestArr = (double *)malloc(sizeof(double) * testDataLength);\n  if (intTestArr == NULL || int64TestArr == NULL || fpTestArr == NULL || fp64TestArr == NULL) {\n    // free(NULL) is a no-op, so releasing all four is safe whichever one failed\n    fprintf(stderr, \"Failed to allocate test data\\n\");\n    free(intTestArr);\n    free(int64TestArr);\n    free(fpTestArr);\n    free(fp64TestArr);\n    return;\n  }\n\n  for (int i = 0; i < testDataLength; i++) {\n    intTestArr[i] = i;\n    int64TestArr[i] = i * 2;\n    fpTestArr[i] = 1.0f + 0.02f * i;\n    fp64TestArr[i] = 2.0f + 0.01f * i;\n  }\n\n  // CPUID leaf 1: ECX bit 28 = AVX, ECX bit 12 = FMA\n  __cpuidex(cpuid_data, 1, 0);\n  if (cpuid_data[2] & (1UL << 28)) {\n      fprintf(stderr, \"AVX supported\\n\");\n      avx_supported = 1;\n  }\n\n  if (cpuid_data[2] & (1UL << 12)) {\n      fprintf(stderr, \"FMA supported\\n\");\n      fma_supported = 1;\n  }\n\n  // CPUID leaf 7 subleaf 0: EBX bit 5 = AVX2, EBX bit 16 = AVX-512F\n  __cpuidex(cpuid_data, 7, 0);\n  if (cpuid_data[1] & (1UL << 5)) {\n      fprintf(stderr, \"AVX2 supported\\n\");\n      avx2_supported = 1;\n  }\n\n  if (cpuid_data[1] & (1UL << 16)) {\n      fprintf(stderr, \"AVX512 supported\\n\");\n      avx512_supported = 1;\n  }\n\n  fprintf(stderr, \"Measuring INT32 adds with SSE\\n\");\n  float sseInt32Adds = measureFunction(iterations, sse_int32_add_test, intTestArr);\n  float sseInt32Muls = measureFunction(iterations, sse_int32_mul_test, intTestArr);\n  float sseInt64Adds = measureFunction(iterations, sse_int64_add_test, int64TestArr);\n  float sseInt64Muls = measureFunction(iterations, sse_int64_mul_test, int64TestArr);\n  float sseFp32Adds = measureFunction(iterations, sse_fp32_add_test, fpTestArr);\n  float sseFp32Muls = measureFunction(iterations, sse_fp32_mul_test, fpTestArr);\n  float sseFp32MulAdds = measureFunction(iterations, sse_fp32_muladd_test, fpTestArr);\n  float sseFp64Adds = measureFunction(iterations, sse_fp64_add_test, fp64TestArr);\n  float sseFp64Muls = measureFunction(iterations, sse_fp64_mul_test, fp64TestArr);\n  float sseFp64Muladds = measureFunction(iterations, sse_fp64_muladd_test, fp64TestArr);\n  float sseFp32Rsqrts = measureFunction(iterations, sse_fp32_rsqrt_test, fpTestArr);\n\n  // Results below are only assigned, and only printed, when the feature is present.\n  float avx2Int32Adds, avx2Int32Muls, avx2Int64Adds, avx2Int64Muls;\n  if (avx2_supported) {\n      avx2Int32Adds = measureFunction(iterations, avx2_int32_add_test, intTestArr);\n      avx2Int32Muls = measureFunction(iterations, avx2_int32_mul_test, intTestArr);\n      avx2Int64Adds = measureFunction(iterations, avx2_int64_add_test, int64TestArr);\n      avx2Int64Muls = measureFunction(iterations, avx2_int64_mul_test, int64TestArr);\n  }\n\n  float avxFp32Adds, avxFp32Muls, avxFp32Muladds, avxFp64Adds, avxFp64Muls, avxFp64Muladds;\n  float avxFp32Rsqrts;\n  if (avx_supported)\n  {\n      avxFp32Adds = measureFunction(iterations, avx_fp32_add_test, fpTestArr);\n      avxFp32Muls = measureFunction(iterations, avx_fp32_mul_test, fpTestArr);\n      avxFp32Muladds = measureFunction(iterations, avx_fp32_muladd_test, fpTestArr);\n      avxFp64Adds = measureFunction(iterations, avx_fp64_add_test, fp64TestArr);\n      avxFp64Muls = measureFunction(iterations, avx_fp64_mul_test, fp64TestArr);\n      avxFp64Muladds = measureFunction(iterations, avx_fp64_muladd_test, fp64TestArr);\n      avxFp32Rsqrts = measureFunction(iterations, avx_fp32_rsqrt_test, fpTestArr);\n  }\n\n  float fmaFp32, fmaFp64;\n  if (fma_supported) {\n      fmaFp32 = measureFunction(iterations, fp32_fma_test, fpTestArr);\n      fmaFp64 = measureFunction(iterations, fp64_fma_test, fp64TestArr);\n  }\n\n  float avx512Fp32Rsqrts, avx512Fp32Adds, avx512Fp32Fmas, avx512Fp64Adds, avx512Fp64Fmas;\n  float avx512Int32Adds, avx512Int32Muls, avx512Int64Adds, avx512Int64Muls;\n  if (avx512_supported) {\n    avx512Fp32Rsqrts = measureFunction(iterations, avx512_fp32_rsqrt_test, fpTestArr);\n    avx512Fp32Adds = measureFunction(iterations, avx512_fp32_add_test, fpTestArr);\n    avx512Fp32Fmas = measureFunction(iterations, avx512_fp32_fma_test, fpTestArr);\n    avx512Fp64Adds = measureFunction(iterations, avx512_fp64_add_test, fp64TestArr);\n    avx512Fp64Fmas = measureFunction(iterations, avx512_fp64_fma_test, fp64TestArr);\n    avx512Int32Adds = measureFunction(iterations, avx512_int32_add_test, intTestArr);\n    avx512Int32Muls = measureFunction(iterations, avx512_int32_mul_test, intTestArr);\n    avx512Int64Adds = measureFunction(iterations, avx512_int64_add_test, int64TestArr);\n    avx512Int64Muls = measureFunction(iterations, avx512_int64_mul_test, int64TestArr);\n  }\n\n  printf(\"\\n-----GOPS/s-----\\n\");\n\n  // INT32\n  printf(\"\\n-----INT32-----\\n\");\n  printf(\"SSE INT32 Adds: %f\\n\", sseInt32Adds);\n  if (avx2_supported) printf(\"AVX2 INT32 Adds: %f\\n\", avx2Int32Adds);\n  if (avx512_supported) printf(\"AVX512 INT32 Adds: %f\\n\", avx512Int32Adds);\n  printf(\"SSE INT32 Multiplies: %f\\n\", sseInt32Muls);\n  if (avx2_supported) printf(\"AVX2 INT32 Multiplies: %f\\n\", avx2Int32Muls);\n  if (avx512_supported) printf(\"AVX512 INT32 Multiplies: %f\\n\", avx512Int32Muls);\n\n  // FP32\n  printf(\"\\n-----FP32-----\\n\");\n  printf(\"SSE FP32 Adds: %f\\n\", sseFp32Adds);\n  if (avx_supported) printf(\"AVX FP32 Adds: %f\\n\", avxFp32Adds);\n  if (avx512_supported) printf(\"AVX512 FP32 Adds: %f\\n\", avx512Fp32Adds);\n  printf(\"SSE FP32 Multiplies: %f\\n\", sseFp32Muls);\n  if (avx_supported) printf(\"AVX FP32 Multiplies: %f\\n\", avxFp32Muls);\n  printf(\"SSE FP32 Multiply+Adds: %f\\n\", sseFp32MulAdds);\n  if (avx_supported) printf(\"AVX FP32 Multiply+Adds: %f (%f GFLOPS)\\n\", avxFp32Muladds, 2 * avxFp32Muladds);\n  if (fma_supported) printf(\"FP32 FMAs: %f (%f GFLOPS)\\n\", fmaFp32, 2 * fmaFp32);\n  if (avx512_supported) printf(\"AVX512 FP32 FMAs: %f (%f GFLOPS)\\n\", avx512Fp32Fmas, avx512Fp32Fmas * 2);\n  printf(\"SSE FP32 Inverse Square Roots: %f\\n\", sseFp32Rsqrts);\n  if (avx_supported) printf(\"AVX FP32 Inverse Square Roots: %f\\n\", avxFp32Rsqrts);\n  if (avx512_supported) printf(\"AVX512 FP32 Inverse Square Roots: %f\\n\", avx512Fp32Rsqrts);\n\n  // INT64\n  printf(\"\\n-----INT64-----\\n\");\n  printf(\"SSE INT64 Adds: %f\\n\", sseInt64Adds);\n  if (avx2_supported) printf(\"AVX2 INT64 Adds: %f\\n\", avx2Int64Adds);\n  if (avx512_supported) printf(\"AVX512 INT64 Adds: %f\\n\", avx512Int64Adds);\n  printf(\"SSE INT64 Multiplies: %f\\n\", sseInt64Muls);\n  if (avx2_supported) printf(\"AVX2 INT64 Multiplies: %f\\n\", avx2Int64Muls);\n  if (avx512_supported) printf(\"AVX512 INT64 Multiplies: %f\\n\", avx512Int64Muls);\n\n  // FP64\n  printf(\"\\n-----FP64-----\\n\");\n  printf(\"SSE FP64 Adds: %f\\n\", sseFp64Adds);\n  if (avx_supported) printf(\"AVX FP64 Adds: %f\\n\", avxFp64Adds);\n  if (avx512_supported) printf(\"AVX512 FP64 Adds: %f\\n\", avx512Fp64Adds);\n  printf(\"SSE FP64 Multiplies: %f\\n\", sseFp64Muls);\n  if (avx_supported) printf(\"AVX FP64 Multiplies: %f\\n\", avxFp64Muls);\n  printf(\"SSE FP64 Multiply+Adds: %f (%f GFLOPS)\\n\", sseFp64Muladds, 2 * sseFp64Muladds);\n  if (avx_supported) printf(\"AVX FP64 Multiply+Adds: %f (%f GFLOPS)\\n\", avxFp64Muladds, 2 * avxFp64Muladds);\n  if (fma_supported) printf(\"AVX FP64 FMAs: %f (%f GFLOPS)\\n\", fmaFp64, 2 * fmaFp64);\n  if (avx512_supported) printf(\"AVX512 FP64 FMAs: %f (%f GFLOPS)\\n\", avx512Fp64Fmas, avx512Fp64Fmas * 2);\n\n  free(intTestArr);\n  free(int64TestArr);\n  free(fpTestArr);\n  free(fp64TestArr);\n  return;\n}\n"
  },
  {
    "path": "mt_instructionrate/x86_mt_instructionrate.s",
    "content": ".intel_syntax noprefix\r\n.text\r\n\r\n/* All kernels: rcx = remaining operation count (counted down until <= 0),\r\n   rdx = pointer to the input array. Each pass subtracts a fixed per-pass\r\n   operation count from rcx; the flags set by that sub drive the jg.\r\n   Every register carries its own dependency chain. */\r\n\r\n.global sse_int32_add_test\r\n.global sse_int32_mul_test\r\n.global sse_int64_add_test\r\n.global sse_int64_mul_test\r\n.global avx2_int32_add_test\r\n.global avx2_int32_mul_test\r\n.global avx2_int64_add_test\r\n.global avx2_int64_mul_test\r\n.global sse_fp32_add_test\r\n.global sse_fp32_mul_test\r\n.global sse_fp32_muladd_test\r\n.global sse_fp32_rsqrt_test\r\n.global avx_fp32_add_test\r\n.global avx_fp32_mul_test\r\n.global avx_fp32_muladd_test\r\n.global avx_fp32_rsqrt_test\r\n.global fp32_fma_test\r\n.global fp64_fma_test\r\n\r\n.global sse_fp64_add_test\r\n.global sse_fp64_mul_test\r\n.global sse_fp64_muladd_test\r\n.global avx_fp64_add_test\r\n.global avx_fp64_mul_test\r\n.global avx_fp64_muladd_test\r\n\r\n.global avx512_int32_add_test\r\n.global avx512_int32_mul_test\r\n.global avx512_int64_add_test\r\n.global avx512_int64_mul_test\r\n.global avx512_fp32_rsqrt_test\r\n.global avx512_fp32_add_test\r\n.global avx512_fp32_fma_test\r\n.global avx512_fp64_add_test\r\n.global avx512_fp64_fma_test\r\n\r\nsse_int32_add_test:\r\n  movdqu xmm0, [rdx]\r\n  movdqu xmm1, [rdx + 16]\r\n  movdqu xmm2, [rdx + 32]\r\n  movdqu xmm3, [rdx + 48]\r\n  movdqu xmm4, [rdx + 64]\r\n  movdqu xmm5, [rdx + 80]\r\nsse_int32_add_test_loop:\r\n  paddd xmm0, xmm0\r\n  paddd xmm1, xmm1\r\n  paddd xmm2, xmm2\r\n  paddd xmm3, xmm3\r\n  paddd xmm4, xmm4\r\n  paddd xmm5, xmm5\r\n  sub rcx, 24\r\n  jg sse_int32_add_test_loop\r\n  ret\r\n\r\nsse_int64_add_test:\r\n  movdqu xmm0, [rdx]\r\n  movdqu xmm1, [rdx + 16]\r\n  movdqu xmm2, [rdx + 32]\r\n  movdqu xmm3, [rdx + 48]\r\n  movdqu xmm4, [rdx + 64]\r\n  movdqu xmm5, [rdx + 80]\r\nsse_int64_add_test_loop:\r\n  paddq xmm0, xmm0\r\n  paddq xmm1, xmm1\r\n  paddq xmm2, xmm2\r\n  paddq xmm3, xmm3\r\n  paddq xmm4, xmm4\r\n  paddq xmm5, xmm5\r\n  sub rcx, 12\r\n  jg sse_int64_add_test_loop\r\n  ret\r\n\r\nsse_int32_mul_test:\r\n  movdqu xmm0, [rdx]\r\n  movdqu xmm1, [rdx + 16]\r\n  movdqu xmm2, [rdx + 32]\r\n  movdqu xmm3, [rdx + 48]\r\n  movdqu xmm4, [rdx + 64]\r\n  movdqu xmm5, [rdx + 80]\r\nsse_int32_mul_test_loop:\r\n  pmulld xmm0, xmm0\r\n  pmulld xmm1, xmm1\r\n  pmulld xmm2, xmm2\r\n  pmulld xmm3, xmm3\r\n  pmulld xmm4, xmm4\r\n  pmulld xmm5, xmm5\r\n  sub rcx, 24\r\n  jg sse_int32_mul_test_loop\r\n  ret\r\n\r\nsse_int64_mul_test:\r\n  movdqu xmm0, [rdx]\r\n  movdqu xmm1, [rdx + 16]\r\n  movdqu xmm2, [rdx + 32]\r\n  movdqu xmm3, [rdx + 48]\r\n  movdqu xmm4, [rdx + 64]\r\n  movdqu xmm5, [rdx + 80]\r\nsse_int64_mul_test_loop:\r\n  pmuludq xmm0, xmm0\r\n  pmuludq xmm1, xmm1\r\n  pmuludq xmm2, xmm2\r\n  pmuludq xmm3, xmm3\r\n  pmuludq xmm4, xmm4\r\n  pmuludq xmm5, xmm5\r\n  sub rcx, 12\r\n  jg sse_int64_mul_test_loop\r\n  ret\r\n\r\navx2_int32_add_test:\r\n  vmovdqu ymm0, [rdx]\r\n  vmovdqu ymm1, [rdx + 32]\r\n  vmovdqu ymm2, [rdx + 64]\r\n  vmovdqu ymm3, [rdx + 96]\r\n  vmovdqu ymm4, [rdx + 128]\r\n  vmovdqu ymm5, [rdx + 160]\r\navx2_int32_add_test_loop:\r\n  vpaddd ymm0, ymm0, ymm0\r\n  vpaddd ymm1, ymm1, ymm1\r\n  vpaddd ymm2, ymm2, ymm2\r\n  vpaddd ymm3, ymm3, ymm3\r\n  vpaddd ymm4, ymm4, ymm4\r\n  vpaddd ymm5, ymm5, ymm5\r\n  sub rcx, 48\r\n  jg avx2_int32_add_test_loop\r\n  ret\r\n\r\navx2_int32_mul_test:\r\n  vmovdqu ymm0, [rdx]\r\n  vmovdqu ymm1, [rdx + 32]\r\n  vmovdqu ymm2, [rdx + 64]\r\n  vmovdqu ymm3, [rdx + 96]\r\n  vmovdqu ymm4, [rdx + 128]\r\n  vmovdqu ymm5, [rdx + 160]\r\navx2_int32_mul_test_loop:\r\n  vpmulld ymm0, ymm0, ymm0\r\n  vpmulld ymm1, ymm1, ymm1\r\n  vpmulld ymm2, ymm2, ymm2\r\n  vpmulld ymm3, ymm3, ymm3\r\n  vpmulld ymm4, ymm4, ymm4\r\n  vpmulld ymm5, ymm5, ymm5\r\n  sub rcx, 48\r\n  jg avx2_int32_mul_test_loop\r\n  ret\r\n\r\navx2_int64_add_test:\r\n  vmovdqu ymm0, [rdx]\r\n  vmovdqu ymm1, [rdx + 32]\r\n  vmovdqu ymm2, [rdx + 64]\r\n  vmovdqu ymm3, [rdx + 96]\r\n  vmovdqu ymm4, [rdx + 128]\r\n  vmovdqu ymm5, [rdx + 160]\r\navx2_int64_add_test_loop:\r\n  vpaddq ymm0, ymm0, ymm0\r\n  vpaddq ymm1, ymm1, ymm1\r\n  vpaddq ymm2, ymm2, ymm2\r\n  vpaddq ymm3, ymm3, ymm3\r\n  vpaddq ymm4, ymm4, ymm4\r\n  vpaddq ymm5, ymm5, ymm5\r\n  sub rcx, 24\r\n  jg avx2_int64_add_test_loop\r\n  ret\r\n\r\navx2_int64_mul_test:\r\n  vmovdqu ymm0, [rdx]\r\n  vmovdqu ymm1, [rdx + 32]\r\n  vmovdqu ymm2, [rdx + 64]\r\n  vmovdqu ymm3, [rdx + 96]\r\n  vmovdqu ymm4, [rdx + 128]\r\n  vmovdqu ymm5, [rdx + 160]\r\navx2_int64_mul_test_loop:\r\n  vpmuldq ymm0, ymm0, ymm0\r\n  vpmuldq ymm1, ymm1, ymm1\r\n  vpmuldq ymm2, ymm2, ymm2\r\n  vpmuldq ymm3, ymm3, ymm3\r\n  vpmuldq ymm4, ymm4, ymm4\r\n  vpmuldq ymm5, ymm5, ymm5\r\n  sub rcx, 24\r\n  jg avx2_int64_mul_test_loop\r\n  ret\r\n\r\nsse_fp32_add_test:\r\n  movups xmm0, [rdx]\r\n  movups xmm1, [rdx + 16]\r\n  movups xmm2, [rdx + 32]\r\n  movups xmm3, [rdx + 48]\r\n  movups xmm4, [rdx + 64]\r\n  movups xmm5, [rdx + 80]\r\nsse_fp32_add_test_loop:\r\n  addps xmm0, xmm0\r\n  addps xmm1, xmm1\r\n  addps xmm2, xmm2\r\n  addps xmm3, xmm3\r\n  addps xmm4, xmm4\r\n  addps xmm5, xmm5\r\n  sub rcx, 24\r\n  jg sse_fp32_add_test_loop\r\n  ret\r\n\r\nsse_fp64_add_test:\r\n  movups xmm0, [rdx]\r\n  movups xmm1, [rdx + 16]\r\n  movups xmm2, [rdx + 32]\r\n  movups xmm3, [rdx + 48]\r\n  movups xmm4, [rdx + 64]\r\n  movups xmm5, [rdx + 80]\r\nsse_fp64_add_test_loop:\r\n  addpd xmm0, xmm0\r\n  addpd xmm1, xmm1\r\n  addpd xmm2, xmm2\r\n  addpd xmm3, xmm3\r\n  addpd xmm4, xmm4\r\n  addpd xmm5, xmm5\r\n  sub rcx, 12\r\n  jg sse_fp64_add_test_loop\r\n  ret\r\n\r\nsse_fp32_mul_test:\r\n  movups xmm0, [rdx]\r\n  movups xmm1, [rdx + 16]\r\n  movups xmm2, [rdx + 32]\r\n  movups xmm3, [rdx + 48]\r\n  movups xmm4, [rdx + 64]\r\n  movups xmm5, [rdx + 80]\r\nsse_fp32_mul_test_loop:\r\n  mulps xmm0, xmm0\r\n  mulps xmm1, xmm1\r\n  mulps xmm2, xmm2\r\n  mulps xmm3, xmm3\r\n  mulps xmm4, xmm4\r\n  mulps xmm5, xmm5\r\n  sub rcx, 24\r\n  jg sse_fp32_mul_test_loop\r\n  ret\r\n\r\nsse_fp64_mul_test:\r\n  movups xmm0, [rdx]\r\n  movups xmm1, [rdx + 16]\r\n  movups xmm2, [rdx + 32]\r\n  movups xmm3, [rdx + 48]\r\n  movups xmm4, [rdx + 64]\r\n  movups xmm5, [rdx + 80]\r\nsse_fp64_mul_test_loop:\r\n  mulpd xmm0, xmm0\r\n  mulpd xmm1, xmm1\r\n  mulpd xmm2, xmm2\r\n  mulpd xmm3, xmm3\r\n  mulpd xmm4, xmm4\r\n  mulpd xmm5, xmm5\r\n  sub rcx, 12\r\n  jg sse_fp64_mul_test_loop\r\n  ret\r\n\r\nsse_fp32_muladd_test:\r\n  movups xmm0, [rdx]\r\n  movups xmm1, [rdx + 16]\r\n  movups xmm2, [rdx + 32]\r\n  movups xmm3, [rdx + 48]\r\n  movups xmm4, [rdx + 64]\r\n  movups xmm5, [rdx + 80]\r\nsse_fp32_muladd_test_loop:\r\n  mulps xmm0, xmm0\r\n  addps xmm0, xmm0\r\n  mulps xmm1, xmm1\r\n  addps xmm1, xmm1\r\n  mulps xmm2, xmm2\r\n  addps xmm2, xmm2\r\n  mulps xmm3, xmm3\r\n  addps xmm3, xmm3\r\n  mulps xmm4, xmm4\r\n  addps xmm4, xmm4\r\n  mulps xmm5, xmm5\r\n  addps xmm5, xmm5\r\n  sub rcx, 24\r\n  jg sse_fp32_muladd_test_loop\r\n  ret\r\n\r\nsse_fp32_rsqrt_test:\r\n  movups xmm0, [rdx]\r\n  movups xmm1, [rdx + 16]\r\n  movups xmm2, [rdx + 32]\r\n  movups xmm3, [rdx + 48]\r\n  movups xmm4, [rdx + 64]\r\n  movups xmm5, [rdx + 80]\r\nsse_fp32_rsqrt_test_loop:\r\n  rsqrtps xmm0, xmm0\r\n  rsqrtps xmm1, xmm1\r\n  rsqrtps xmm2, xmm2\r\n  rsqrtps xmm3, xmm3\r\n  rsqrtps xmm4, xmm4\r\n  rsqrtps xmm5, xmm5\r\n  sub rcx, 24\r\n  jg sse_fp32_rsqrt_test_loop\r\n  ret\r\n\r\navx_fp32_rsqrt_test:\r\n  vmovups ymm0, [rdx]\r\n  vmovups ymm1, [rdx + 32]\r\n  vmovups ymm2, [rdx + 64]\r\n  vmovups ymm3, [rdx + 96]\r\n  vmovups ymm4, [rdx + 128]\r\n  vmovups ymm5, [rdx + 160]\r\navx_fp32_rsqrt_test_loop:\r\n  vrsqrtps ymm0, ymm0\r\n  vrsqrtps ymm1, ymm1\r\n  vrsqrtps ymm2, ymm2\r\n  vrsqrtps ymm3, ymm3\r\n  vrsqrtps ymm4, ymm4\r\n  vrsqrtps ymm5, ymm5\r\n  sub rcx, 48\r\n  jg avx_fp32_rsqrt_test_loop\r\n  ret\r\n\r\nsse_fp64_muladd_test:\r\n  movups xmm0, [rdx]\r\n  movups xmm1, [rdx + 16]\r\n  movups xmm2, [rdx + 32]\r\n  movups xmm3, [rdx + 48]\r\n  movups xmm4, [rdx + 64]\r\n  movups xmm5, [rdx + 80]\r\nsse_fp64_muladd_test_loop:\r\n  mulpd xmm0, xmm0\r\n  addpd xmm0, xmm0\r\n  mulpd xmm1, xmm1\r\n  addpd xmm1, xmm1\r\n  mulpd xmm2, xmm2\r\n  addpd xmm2, xmm2\r\n  mulpd xmm3, xmm3\r\n  addpd xmm3, xmm3\r\n  mulpd xmm4, xmm4\r\n  addpd xmm4, xmm4\r\n  mulpd xmm5, xmm5\r\n  addpd xmm5, xmm5\r\n  sub rcx, 12\r\n  jg sse_fp64_muladd_test_loop\r\n  ret\r\n\r\navx_fp32_add_test:\r\n  vmovups ymm0, [rdx]\r\n  vmovups ymm1, [rdx + 32]\r\n  vmovups ymm2, [rdx + 64]\r\n  vmovups ymm3, [rdx + 96]\r\n  vmovups ymm4, [rdx + 128]\r\n  vmovups ymm5, [rdx + 160]\r\navx_fp32_add_test_loop:\r\n  vaddps ymm0, ymm0, ymm0\r\n  vaddps ymm1, ymm1, ymm1\r\n  vaddps ymm2, ymm2, ymm2\r\n  vaddps ymm3, ymm3, ymm3\r\n  vaddps ymm4, ymm4, ymm4\r\n  vaddps ymm5, ymm5, ymm5\r\n  sub rcx, 48\r\n  jg avx_fp32_add_test_loop\r\n  ret\r\n\r\navx_fp64_add_test:\r\n  vmovups ymm0, [rdx]\r\n  vmovups ymm1, [rdx + 32]\r\n  vmovups ymm2, [rdx + 64]\r\n  vmovups ymm3, [rdx + 96]\r\n  vmovups ymm4, [rdx + 128]\r\n  vmovups ymm5, [rdx + 160]\r\navx_fp64_add_test_loop:\r\n  vaddpd ymm0, ymm0, ymm0\r\n  vaddpd ymm1, ymm1, ymm1\r\n  vaddpd ymm2, ymm2, ymm2\r\n  vaddpd ymm3, ymm3, ymm3\r\n  vaddpd ymm4, ymm4, ymm4\r\n  vaddpd ymm5, ymm5, ymm5\r\n  sub rcx, 24\r\n  jg avx_fp64_add_test_loop\r\n  ret\r\n\r\navx_fp32_mul_test:\r\n  vmovups ymm0, [rdx]\r\n  vmovups ymm1, [rdx + 32]\r\n  vmovups ymm2, [rdx + 64]\r\n  vmovups ymm3, [rdx + 96]\r\n  vmovups ymm4, [rdx + 128]\r\n  vmovups ymm5, [rdx + 160]\r\navx_fp32_mul_test_loop:\r\n  vmulps ymm0, ymm0, ymm0\r\n  vmulps ymm1, ymm1, ymm1\r\n  vmulps ymm2, ymm2, ymm2\r\n  vmulps ymm3, ymm3, ymm3\r\n  vmulps ymm4, ymm4, ymm4\r\n  vmulps ymm5, ymm5, ymm5\r\n  sub rcx, 48\r\n  jg avx_fp32_mul_test_loop\r\n  ret\r\n\r\navx_fp64_mul_test:\r\n  vmovups ymm0, [rdx]\r\n  vmovups ymm1, [rdx + 32]\r\n  vmovups ymm2, [rdx + 64]\r\n  vmovups ymm3, [rdx + 96]\r\n  vmovups ymm4, [rdx + 128]\r\n  vmovups ymm5, [rdx + 160]\r\navx_fp64_mul_test_loop:\r\n  vmulpd ymm0, ymm0, ymm0\r\n  vmulpd ymm1, ymm1, ymm1\r\n  vmulpd ymm2, ymm2, ymm2\r\n  vmulpd ymm3, ymm3, ymm3\r\n  vmulpd ymm4, ymm4, ymm4\r\n  vmulpd ymm5, ymm5, ymm5\r\n  sub rcx, 24\r\n  jg avx_fp64_mul_test_loop\r\n  ret\r\n\r\navx_fp32_muladd_test:\r\n  vmovups ymm0, [rdx]\r\n  vmovups ymm1, [rdx + 32]\r\n  vmovups ymm2, [rdx + 64]\r\n  vmovups ymm3, [rdx + 96]\r\n  vmovups ymm4, [rdx + 128]\r\n  vmovups ymm5, [rdx + 160]\r\navx_fp32_muladd_test_loop:\r\n  vmulps ymm0, ymm0, ymm0\r\n  vaddps ymm0, ymm0, ymm0\r\n  vmulps ymm1, ymm1, ymm1\r\n  vaddps ymm1, ymm1, ymm1\r\n  vmulps ymm2, ymm2, ymm2\r\n  vaddps ymm2, ymm2, ymm2\r\n  vmulps ymm3, ymm3, ymm3\r\n  vaddps ymm3, ymm3, ymm3\r\n  vmulps ymm4, ymm4, ymm4\r\n  vaddps ymm4, ymm4, ymm4\r\n  vmulps ymm5, ymm5, ymm5\r\n  vaddps ymm5, ymm5, ymm5\r\n  sub rcx, 48\r\n  jg avx_fp32_muladd_test_loop\r\n  ret\r\n\r\navx_fp64_muladd_test:\r\n  vmovups ymm0, [rdx]\r\n  vmovups ymm1, [rdx + 32]\r\n  vmovups ymm2, [rdx + 64]\r\n  vmovups ymm3, [rdx + 96]\r\n  vmovups ymm4, [rdx + 128]\r\n  vmovups ymm5, [rdx + 160]\r\navx_fp64_muladd_test_loop:\r\n  vmulpd ymm0, ymm0, ymm0\r\n  vaddpd ymm0, ymm0, ymm0\r\n  vmulpd ymm1, ymm1, ymm1\r\n  vaddpd ymm1, ymm1, ymm1\r\n  vmulpd ymm2, ymm2, ymm2\r\n  vaddpd ymm2, ymm2, ymm2\r\n  vmulpd ymm3, ymm3, ymm3\r\n  vaddpd ymm3, ymm3, ymm3\r\n  vmulpd ymm4, ymm4, ymm4\r\n  vaddpd ymm4, ymm4, ymm4\r\n  vmulpd ymm5, ymm5, ymm5\r\n  vaddpd ymm5, ymm5, ymm5\r\n  sub rcx, 24\r\n  jg avx_fp64_muladd_test_loop\r\n  ret\r\n\r\nfp32_fma_test:\r\n  vzeroall\r\n  vmovups ymm0, [rdx]\r\n  vmovups ymm1, ymm0\r\n  vmovups ymm2, ymm0\r\n  vmovups ymm3, ymm0\r\n  vmovups ymm4, ymm0\r\n  vmovups ymm5, ymm0\r\n  vmovups ymm6, ymm0\r\nfp32_fma_test_loop:\r\n  vfmadd132ps ymm0, ymm0, ymm6\r\n  vfmadd132ps ymm1, ymm1, ymm6\r\n  vfmadd132ps ymm2, ymm2, ymm6\r\n  vfmadd132ps ymm3, ymm3, ymm6\r\n  vfmadd132ps ymm4, ymm4, ymm6\r\n  vfmadd132ps ymm5, ymm5, ymm6\r\n  sub rcx, 48\r\n  jg fp32_fma_test_loop\r\n  ret\r\n\r\nfp64_fma_test:\r\n  vzeroall\r\n  vmovups ymm0, [rdx]\r\n  vmovups ymm1, ymm0\r\n  vmovups ymm2, ymm0\r\n  vmovups ymm3, ymm0\r\n  vmovups ymm4, ymm0\r\n  vmovups ymm5, ymm0\r\n  vmovups ymm6, ymm0\r\nfp64_fma_test_loop:\r\n  vfmadd132pd ymm0, ymm0, ymm6\r\n  vfmadd132pd ymm1, ymm1, ymm6\r\n  vfmadd132pd ymm2, ymm2, ymm6\r\n  vfmadd132pd ymm3, ymm3, ymm6\r\n  vfmadd132pd ymm4, ymm4, ymm6\r\n  vfmadd132pd ymm5, ymm5, ymm6\r\n  sub rcx, 24\r\n  jg fp64_fma_test_loop\r\n  ret\r\n\r\n/* The AVX-512 integer kernels load with vmovdqu32/vmovdqu64, which need only\r\n   AVX512F; vmovdqu16 would additionally require AVX512BW. */\r\navx512_int32_add_test:\r\n  vmovdqu32 zmm0, [rdx]\r\n  vmovdqu32 zmm1, [rdx + 64]\r\n  vmovdqu32 zmm2, [rdx + 128]\r\n  vmovdqu32 zmm3, [rdx + 192]\r\n  vmovdqu32 zmm4, [rdx + 256]\r\n  vmovdqu32 zmm5, [rdx + 320]\r\navx512_int32_add_test_loop:\r\n  vpaddd zmm0, zmm0, zmm0\r\n  vpaddd zmm1, zmm1, zmm1\r\n  vpaddd zmm2, zmm2, zmm2\r\n  vpaddd zmm3, zmm3, zmm3\r\n  vpaddd zmm4, zmm4, zmm4\r\n  vpaddd zmm5, zmm5, zmm5\r\n  sub rcx, 96\r\n  jg avx512_int32_add_test_loop\r\n  ret\r\n\r\navx512_int32_mul_test:\r\n  vmovdqu32 zmm0, [rdx]\r\n  vmovdqu32 zmm1, [rdx + 64]\r\n  vmovdqu32 zmm2, [rdx + 128]\r\n  vmovdqu32 zmm3, [rdx + 192]\r\n  vmovdqu32 zmm4, [rdx + 256]\r\n  vmovdqu32 zmm5, [rdx + 320]\r\navx512_int32_mul_test_loop:\r\n  vpmulld zmm0, zmm0, zmm0\r\n  vpmulld zmm1, zmm1, zmm1\r\n  vpmulld zmm2, zmm2, zmm2\r\n  vpmulld zmm3, zmm3, zmm3\r\n  vpmulld zmm4, zmm4, zmm4\r\n  vpmulld zmm5, zmm5, zmm5\r\n  sub rcx, 96\r\n  jg avx512_int32_mul_test_loop\r\n  ret\r\n\r\navx512_int64_add_test:\r\n  vmovdqu64 zmm0, [rdx]\r\n  vmovdqu64 zmm1, [rdx + 64]\r\n  vmovdqu64 zmm2, [rdx + 128]\r\n  vmovdqu64 zmm3, [rdx + 192]\r\n  vmovdqu64 zmm4, [rdx + 256]\r\n  vmovdqu64 zmm5, [rdx + 320]\r\navx512_int64_add_test_loop:\r\n  vpaddq zmm0, zmm0, zmm0\r\n  vpaddq zmm1, zmm1, zmm1\r\n  vpaddq zmm2, zmm2, zmm2\r\n  vpaddq zmm3, zmm3, zmm3\r\n  vpaddq zmm4, zmm4, zmm4\r\n  vpaddq zmm5, zmm5, zmm5\r\n  sub rcx, 48\r\n  jg avx512_int64_add_test_loop\r\n  ret\r\n\r\navx512_int64_mul_test:\r\n  vmovdqu64 zmm0, [rdx]\r\n  vmovdqu64 zmm1, [rdx + 64]\r\n  vmovdqu64 zmm2, [rdx + 128]\r\n  vmovdqu64 zmm3, [rdx + 192]\r\n  vmovdqu64 zmm4, [rdx + 256]\r\n  vmovdqu64 zmm5, [rdx + 320]\r\navx512_int64_mul_test_loop:\r\n  vpmuldq zmm0, zmm0, zmm0\r\n  vpmuldq zmm1, zmm1, zmm1\r\n  vpmuldq zmm2, zmm2, zmm2\r\n  vpmuldq zmm3, zmm3, zmm3\r\n  vpmuldq zmm4, zmm4, zmm4\r\n  vpmuldq zmm5, zmm5, zmm5\r\n  sub rcx, 48\r\n  jg avx512_int64_mul_test_loop\r\n  ret\r\n\r\navx512_fp32_rsqrt_test:\r\n  vmovups zmm0, [rdx]\r\n  vmovups zmm1, [rdx + 64]\r\n  vmovups zmm2, [rdx + 128]\r\n  vmovups zmm3, [rdx + 192]\r\n  vmovups zmm4, [rdx + 256]\r\n  vmovups zmm5, [rdx + 320]\r\navx512_fp32_rsqrt_test_loop:\r\n  vrsqrt14ps zmm0, zmm0\r\n  vrsqrt14ps zmm1, zmm1\r\n  vrsqrt14ps zmm2, zmm2\r\n  vrsqrt14ps zmm3, zmm3\r\n  vrsqrt14ps zmm4, zmm4\r\n  vrsqrt14ps zmm5, zmm5\r\n  sub rcx, 96\r\n  jg avx512_fp32_rsqrt_test_loop\r\n  ret\r\n\r\navx512_fp32_add_test:\r\n  vmovups zmm0, [rdx]\r\n  vmovups zmm1, [rdx + 64]\r\n  vmovups zmm2, [rdx + 128]\r\n  vmovups zmm3, [rdx + 192]\r\n  vmovups zmm4, [rdx + 256]\r\n  vmovups zmm5, [rdx + 320]\r\navx512_fp32_add_test_loop:\r\n  vaddps zmm0, zmm0, zmm0\r\n  vaddps zmm1, zmm1, zmm1\r\n  vaddps zmm2, zmm2, zmm2\r\n  vaddps zmm3, zmm3, zmm3\r\n  vaddps zmm4, zmm4, zmm4\r\n  vaddps zmm5, zmm5, zmm5\r\n  sub rcx, 96\r\n  jg avx512_fp32_add_test_loop\r\n  ret\r\n\r\navx512_fp32_fma_test:\r\n  vmovups zmm0, [rdx]\r\n  vmovups zmm1, [rdx + 64]\r\n  vmovups zmm2, [rdx + 128]\r\n  vmovups zmm3, [rdx + 192]\r\n  vmovups zmm4, [rdx + 256]\r\n  vmovups zmm5, [rdx + 320]\r\navx512_fp32_fma_test_loop:\r\n  vfmadd132ps zmm0, zmm0, zmm0\r\n  vfmadd132ps zmm1, zmm1, zmm1\r\n  vfmadd132ps zmm2, zmm2, zmm2\r\n  vfmadd132ps zmm3, zmm3, zmm3\r\n  vfmadd132ps zmm4, zmm4, zmm4\r\n  vfmadd132ps zmm5, zmm5, zmm5\r\n  sub rcx, 96\r\n  jg avx512_fp32_fma_test_loop\r\n  ret\r\n\r\n/* FP64 add: must issue vaddpd, not an FMA, so the reported figure is the add rate */\r\navx512_fp64_add_test:\r\n  vmovups zmm0, [rdx]\r\n  vmovups zmm1, [rdx + 64]\r\n  vmovups zmm2, [rdx + 128]\r\n  vmovups zmm3, [rdx + 192]\r\n  vmovups zmm4, [rdx + 256]\r\n  vmovups zmm5, [rdx + 320]\r\navx512_fp64_add_test_loop:\r\n  vaddpd zmm0, zmm0, zmm0\r\n  vaddpd zmm1, zmm1, zmm1\r\n  vaddpd zmm2, zmm2, zmm2\r\n  vaddpd zmm3, zmm3, zmm3\r\n  vaddpd zmm4, zmm4, zmm4\r\n  vaddpd zmm5, zmm5, zmm5\r\n  sub rcx, 48\r\n  jg avx512_fp64_add_test_loop\r\n  ret\r\n\r\n/* FP64 FMA: the pd form matches the 8-lanes-per-instruction count of 48 per pass */\r\navx512_fp64_fma_test:\r\n  vmovups zmm0, [rdx]\r\n  vmovups zmm1, [rdx + 64]\r\n  vmovups zmm2, [rdx + 128]\r\n  vmovups zmm3, [rdx + 192]\r\n  vmovups zmm4, [rdx + 256]\r\n  vmovups zmm5, [rdx + 320]\r\navx512_fp64_fma_test_loop:\r\n  vfmadd132pd zmm0, zmm0, zmm0\r\n  vfmadd132pd zmm1, zmm1, zmm1\r\n  vfmadd132pd zmm2, zmm2, zmm2\r\n  vfmadd132pd zmm3, zmm3, zmm3\r\n  vfmadd132pd zmm4, zmm4, zmm4\r\n  vfmadd132pd zmm5, zmm5, zmm5\r\n  sub rcx, 48\r\n  jg avx512_fp64_fma_test_loop\r\n  ret\r\n"
  },
  {
    "path": "svm/OpenCL/include/CL/Utils/Context.h",
    "content": "#pragma once\r\n\r\n// OpenCL Utils includes\r\n#include \"OpenCLUtils_Export.h\"\r\n\r\n// OpenCL includes\r\n#include <CL/cl.h>\r\n\r\n// STL includes\r\n#include <time.h>\r\n\r\nUTILS_EXPORT\r\ncl_context cl_util_get_context(const cl_uint plat_id, const cl_uint dev_id,\r\n                               const cl_device_type type, cl_int* const error);\r\nUTILS_EXPORT\r\ncl_device_id cl_util_get_device(const cl_uint plat_id, const cl_uint dev_id,\r\n                                const cl_device_type type, cl_int* const error);\r\n\r\nUTILS_EXPORT\r\ncl_int cl_util_print_device_info(const cl_device_id device);\r\n\r\nUTILS_EXPORT\r\nchar* cl_util_get_device_info(const cl_device_id device,\r\n                              const cl_device_info info, cl_int* const error);\r\nUTILS_EXPORT\r\nchar* cl_util_get_platform_info(const cl_platform_id platform,\r\n                                const cl_platform_info info,\r\n                                cl_int* const error);\r\n\r\n// build program and show log if build is not successful\r\nUTILS_EXPORT\r\ncl_int cl_util_build_program(const cl_program pr, const cl_device_id dev,\r\n                             const char* const opt);\r\n\r\n#define GET_CURRENT_TIMER(time)                                                \\\r\n    struct timespec time;                                                      \\\r\n    timespec_get(&time, TIME_UTC);                                             \\\r\n    {                                                                          \\\r\n    }\r\n\r\n#define TIMER_DIFFERENCE(dt, time1, time2)                                     \\\r\n    {                                                                          \\\r\n        dt = (time2.tv_sec - time1.tv_sec) * 1000000000                        \\\r\n            + (time2.tv_nsec - time1.tv_nsec);                                 \\\r\n    }\r\n\r\n#define START_TIMER GET_CURRENT_TIMER(start_timer1)\r\n#define 
STOP_TIMER(dt)                                                         \\\r\n    GET_CURRENT_TIMER(stop_timer2)                                             \\\r\n    TIMER_DIFFERENCE(dt, start_timer1, stop_timer2)\r\n"
  },
  {
    "path": "svm/OpenCL/include/CL/Utils/Context.hpp",
    "content": "#pragma once\r\n\r\n// OpenCL SDK includes\r\n#include \"OpenCLUtilsCpp_Export.h\"\r\n\r\n#include <CL/Utils/Error.hpp>\r\n\r\n// OpenCL includes\r\n#include <CL/opencl.hpp>\r\n\r\nnamespace cl {\r\nnamespace util {\r\n    Context UTILSCPP_EXPORT get_context(cl_uint plat_id, cl_uint dev_id,\r\n                                        cl_device_type type,\r\n                                        cl_int* error = nullptr);\r\n\r\n    void UTILSCPP_EXPORT print_device_info(const cl::Device& device);\r\n}\r\n}\r\n"
  },
  {
    "path": "svm/OpenCL/include/CL/Utils/Detail.hpp",
    "content": "#pragma once\r\n\r\n// STL includes\r\n#include <stddef.h>\r\n#include <utility> // std::forward, std::integer_sequence\r\n#include <tuple> // std::tuple, std::get\r\n#include <initializer_list> // std::initializer_list\r\n\r\nnamespace cl {\r\nnamespace util {\r\n    namespace detail {\r\n        // Borrowed from:\r\n        // https://www.fluentcpp.com/2019/03/05/for_each_arg-applying-a-function-to-each-argument-of-a-function-in-cpp/\r\n        template <class F, class... Args> F for_each_arg(F f, Args&&... args)\r\n        {\r\n            (void)std::initializer_list<int>{ (\r\n                (void)f(std::forward<Args>(args)), 0)... };\r\n            return f;\r\n        }\r\n\r\n        namespace impl {\r\n            // Borrowed from: https://stackoverflow.com/a/16387374/1476661\r\n            template <typename T, typename F, int... Is>\r\n            void for_each_in_tuple(T&& t, F&& f,\r\n                                   std::integer_sequence<int, Is...>)\r\n            {\r\n                auto l = {\r\n                    (std::forward<F>(f)(std::get<Is>(std::forward<T>(t))), 0)...\r\n                };\r\n                (void)l;\r\n            }\r\n        }\r\n        template <typename... Ts, typename F>\r\n        void for_each_in_tuple(std::tuple<Ts...> const& t, F&& f)\r\n        {\r\n            impl::for_each_in_tuple(\r\n                t, std::forward<F>(f),\r\n                std::make_integer_sequence<int, sizeof...(Ts)>());\r\n        }\r\n\r\n        namespace impl {\r\n            // Borrowed from\r\n            // https://codereview.stackexchange.com/questions/193420/apply-a-function-to-each-element-of-a-tuple-map-a-tuple\r\n            template <class F, typename Tuple, std::size_t... 
Is>\r\n            auto transform_tuple(Tuple&& t, F&& f, std::index_sequence<Is...>)\r\n            {\r\n                return std::make_tuple(std::forward<F>(f)(std::get<Is>(t))...);\r\n            }\r\n        }\r\n        template <class F, typename... Args>\r\n        auto transform_tuple(const std::tuple<Args...>& t, F&& f)\r\n        {\r\n            return impl::transform_tuple(\r\n                t, std::forward<F>(f),\r\n                std::make_index_sequence<sizeof...(Args)>{});\r\n        }\r\n\r\n        namespace impl {\r\n            // Borrowed from\r\n            // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2013/n3658.html\r\n            // with modifications of Casey Carter at\r\n            // https://stackoverflow.com/a/51365112/1476661\r\n            template <typename F, typename Tuple, std::size_t... I>\r\n            auto apply(F&& f, Tuple&& args, std::index_sequence<I...>)\r\n                -> decltype(std::forward<F>(f)(\r\n                    std::get<I>(std::forward<Tuple>(args))...))\r\n            {\r\n                return std::forward<F>(f)(\r\n                    std::get<I>(std::forward<Tuple>(args))...);\r\n            }\r\n        }\r\n        template <typename F, typename Tuple,\r\n                  typename Indices = std::make_index_sequence<\r\n                      std::tuple_size<std::remove_reference_t<Tuple>>::value>>\r\n        auto apply(F&& f, Tuple&& args)\r\n            -> decltype(impl::apply(std::forward<F>(f),\r\n                                    std::forward<Tuple>(args), Indices()))\r\n        {\r\n            return impl::apply(std::forward<F>(f), std::forward<Tuple>(args),\r\n                               Indices());\r\n        }\r\n    }\r\n}\r\n}\r\n"
  },
  {
    "path": "svm/OpenCL/include/CL/Utils/Device.hpp",
    "content": "#pragma once\r\n\r\n#include \"OpenCLUtilsCpp_Export.h\"\r\n#include <CL/Utils/Error.hpp>\r\n\r\n#include <CL/opencl.hpp>\r\n\r\nnamespace cl {\r\nnamespace util {\r\n    bool UTILSCPP_EXPORT opencl_c_version_contains(\r\n        const cl::Device& device, const cl::string& version_fragment);\r\n\r\n    bool UTILSCPP_EXPORT supports_extension(const cl::Device& device,\r\n                                            const cl::string& extension);\r\n\r\n#ifdef CL_VERSION_3_0\r\n    bool UTILSCPP_EXPORT supports_feature(const cl::Device& device,\r\n                                          const cl::string& feature_name);\r\n#endif\r\n}\r\n}\r\n"
  },
  {
    "path": "svm/OpenCL/include/CL/Utils/Error.h",
    "content": "#pragma once\r\n\r\n// OpenCL Utils includes\r\n#include \"OpenCLUtils_Export.h\"\r\n\r\n// OpenCL Utils includes\r\n#include <CL/Utils/ErrorCodes.h>\r\n\r\n// STL includes\r\n#include <stdio.h> // fprintf\r\n\r\n// OpenCL includes\r\n#include <CL/cl.h>\r\n\r\n// RET = function returns error code\r\n// PAR = functions sets error code in the paremeter\r\n\r\n#ifdef _DEBUG\r\n\r\n#define OCLERROR_RET(func, err, label)                                         \\\r\n    do                                                                         \\\r\n    {                                                                          \\\r\n        err = func;                                                            \\\r\n        if (err != CL_SUCCESS)                                                 \\\r\n        {                                                                      \\\r\n            cl_util_print_error(err);                                          \\\r\n            fprintf(stderr, \"on line %d, in file %s\\n%s\\n\", __LINE__,          \\\r\n                    __FILE__, #func);                                          \\\r\n            goto label;                                                        \\\r\n        }                                                                      \\\r\n    } while (0)\r\n\r\n#define OCLERROR_PAR(func, err, label)                                         \\\r\n    do                                                                         \\\r\n    {                                                                          \\\r\n        func;                                                                  \\\r\n        if (err != CL_SUCCESS)                                                 \\\r\n        {                                                                      \\\r\n            cl_util_print_error(err);                                          \\\r\n            fprintf(stderr, \"on line %d, in 
file %s\\n%s\\n\", __LINE__,          \\\r\n                    __FILE__, #func);                                          \\\r\n            goto label;                                                        \\\r\n        }                                                                      \\\r\n    } while (0)\r\n\r\n#define MEM_CHECK(func, err, label)                                            \\\r\n    do                                                                         \\\r\n    {                                                                          \\\r\n        if ((func) == NULL)                                                    \\\r\n        {                                                                      \\\r\n            err = CL_OUT_OF_HOST_MEMORY;                                       \\\r\n            cl_util_print_error(err);                                          \\\r\n            fprintf(stderr, \"on line %d, in file %s\\n%s\\n\", __LINE__,          \\\r\n                    __FILE__, #func);                                          \\\r\n            goto label;                                                        \\\r\n        }                                                                      \\\r\n    } while (0)\r\n\r\n#else\r\n\r\n#define OCLERROR_RET(func, err, label)                                         \\\r\n    do                                                                         \\\r\n    {                                                                          \\\r\n        err = func;                                                            \\\r\n        if (err != CL_SUCCESS) goto label;                                     \\\r\n    } while (0)\r\n\r\n#define OCLERROR_PAR(func, err, label)                                         \\\r\n    do                                                                         \\\r\n    {                                                                          
\\\r\n        func;                                                                  \\\r\n        if (err != CL_SUCCESS) goto label;                                     \\\r\n    } while (0)\r\n\r\n#define MEM_CHECK(func, err, label)                                            \\\r\n    do                                                                         \\\r\n    {                                                                          \\\r\n        if ((func) == NULL)                                                    \\\r\n        {                                                                      \\\r\n            err = CL_OUT_OF_HOST_MEMORY;                                       \\\r\n            goto label;                                                        \\\r\n        }                                                                      \\\r\n    } while (0)\r\n\r\n#endif\r\n\r\nUTILS_EXPORT\r\nvoid cl_util_print_error(cl_int error);\r\n"
  },
  {
    "path": "svm/OpenCL/include/CL/Utils/Error.hpp",
    "content": "#pragma once\r\n\r\n// OpenCL Utils includes\r\n#include \"OpenCLUtilsCpp_Export.h\"\r\n\r\n// OpenCL Utils includes\r\n#include <CL/Utils/ErrorCodes.h>\r\n\r\n// OpenCL includes\r\n#include <CL/opencl.hpp>\r\n\r\nnamespace cl {\r\nnamespace util {\r\n#if defined(CL_HPP_ENABLE_EXCEPTIONS)\r\n    /*! \\brief Exception class\r\n     *\r\n     *  This may be thrown by SDK utility functions when\r\n     * CL_HPP_ENABLE_EXCEPTIONS is defined.\r\n     */\r\n    class Error : public std::exception {\r\n    private:\r\n        int err_;\r\n        const char* errStr_;\r\n\r\n    public:\r\n        /*! \\brief Create a new SDK error exception for a given error code\r\n         *  and corresponding message.\r\n         *\r\n         *  \\param err error code value.\r\n         *\r\n         *  \\param errStr a descriptive string that must remain in scope until\r\n         *                handling of the exception has concluded.  If set, it\r\n         *                will be returned by what().\r\n         */\r\n        Error(cl_int err, const char* errStr = NULL): err_(err), errStr_(errStr)\r\n        {}\r\n\r\n        ~Error() noexcept {}\r\n\r\n        /*! \\brief Get error string associated with exception\r\n         *\r\n         * \\return A memory pointer to the error message string.\r\n         */\r\n        virtual const char* what() const noexcept\r\n        {\r\n            if (errStr_ == NULL)\r\n            {\r\n                return \"empty\";\r\n            }\r\n            else\r\n            {\r\n                return errStr_;\r\n            }\r\n        }\r\n\r\n        /*! 
\\brief Get error code associated with exception\r\n         *\r\n         *  \\return The error code.\r\n         */\r\n        cl_int err(void) const { return err_; }\r\n    };\r\n#endif\r\n\r\n    namespace detail {\r\n        UTILSCPP_EXPORT cl_int errHandler(cl_int err, cl_int* errPtr,\r\n                                          const char* errStr = nullptr);\r\n    }\r\n\r\n}\r\n}\r\n"
  },
  {
    "path": "svm/OpenCL/include/CL/Utils/ErrorCodes.h",
    "content": "#pragma once\r\n\r\n#define CL_UTIL_INDEX_OUT_OF_RANGE -2000\r\n#define CL_UTIL_DEVICE_NOT_INTEROPERABLE -2001\r\n#define CL_UTIL_FILE_OPERATION_ERROR -2002\r\n"
  },
  {
    "path": "svm/OpenCL/include/CL/Utils/Event.h",
    "content": "#pragma once\r\n\r\n// OpenCL Utils includes\r\n#include \"OpenCLUtils_Export.h\"\r\n\r\n// OpenCL includes\r\n#include <CL/cl.h>\r\n\r\nUTILS_EXPORT\r\ncl_ulong cl_util_get_event_duration(const cl_event event,\r\n                                    const cl_profiling_info start,\r\n                                    const cl_profiling_info end,\r\n                                    cl_int* const error);\r\n"
  },
  {
    "path": "svm/OpenCL/include/CL/Utils/Event.hpp",
    "content": "#pragma once\r\n\r\n// OpenCL SDK includes\r\n#include \"OpenCLUtilsCpp_Export.h\"\r\n\r\n// STL includes\r\n#include <chrono>\r\n\r\n// OpenCL includes\r\n#include <CL/opencl.hpp>\r\n\r\nnamespace cl {\r\nnamespace util {\r\n    template <cl_int From, cl_int To, typename Dur = std::chrono::nanoseconds>\r\n    auto get_duration(cl::Event& ev)\r\n    {\r\n        return std::chrono::duration_cast<Dur>(std::chrono::nanoseconds{\r\n            ev.getProfilingInfo<To>() - ev.getProfilingInfo<From>() });\r\n    }\r\n}\r\n}\r\n"
  },
  {
    "path": "svm/OpenCL/include/CL/Utils/File.h",
    "content": "#pragma once\r\n\r\n// OpenCL Utils includes\r\n#include \"OpenCLUtils_Export.h\"\r\n\r\n// OpenCL includes\r\n#include <CL/cl.h>\r\n\r\n// read all the text file contents securely in ANSI C89\r\n// return pointer to C-string with file contents\r\n// can handle streams with no known size and no support for fseek\r\n// based on https://stackoverflow.com/questions/14002954/ by Nominal Animal\r\nUTILS_EXPORT\r\nchar* cl_util_read_text_file(const char* const filename, size_t* const length,\r\n                             cl_int* const error);\r\n\r\n// read all the binary file contents securely in ANSI C89\r\n// return pointer to file contents\r\n// can handle streams with no known size and no support for fseek\r\n// based on https://stackoverflow.com/questions/14002954/ by Nominal Animal\r\nUTILS_EXPORT\r\nunsigned char* cl_util_read_binary_file(const char* const filename,\r\n                                        size_t* const length,\r\n                                        cl_int* const error);\r\n\r\n// write binaries of OpenCL compiled program\r\n// binaries are written as separate files for each device\r\n// with file name \"(program_file_name)_(name of device).bin\"\r\n// based on variant of Logan\r\n// http://logan.tw/posts/2014/11/22/pre-compile-the-opencl-kernel-program-part-2/\r\nUTILS_EXPORT\r\ncl_int cl_util_write_binaries(const cl_program program,\r\n                              const char* const program_file_name);\r\n\r\n// read binaries of OpenCL compiled program\r\n// from files of file names \"(program_file_name)_(name of device).bin\"\r\nUTILS_EXPORT\r\ncl_program cl_util_read_binaries(const cl_context context,\r\n                                 const cl_device_id* const devices,\r\n                                 const cl_uint num_devices,\r\n                                 const char* const program_file_name,\r\n                                 cl_int* const error);\r\n\r\n// returns the folder containing the running 
executable\r\nUTILS_EXPORT\r\ncl_int cl_util_executable_folder(char* filename, size_t* const length);\r\n\r\n// read all the text file contents securely in ANSI C89\r\n// return pointer to C-string with file contents\r\n// interprets filename relative to the folder containing\r\n// the running executable\r\nUTILS_EXPORT\r\nchar* cl_util_read_exe_relative_text_file(const char* const rel_path,\r\n                                          size_t* const length,\r\n                                          cl_int* const error);\r\n"
  },
  {
    "path": "svm/OpenCL/include/CL/Utils/File.hpp",
    "content": "#pragma once\r\n\r\n// OpenCL SDK includes\r\n#include \"OpenCLUtilsCpp_Export.h\"\r\n\r\n#include <CL/Utils/Error.hpp>\r\n\r\n// OpenCL includes\r\n#include <CL/opencl.hpp>\r\n\r\n\r\nnamespace cl {\r\nnamespace util {\r\n\r\n    std::string UTILSCPP_EXPORT read_text_file(const char* const filename,\r\n                                               cl_int* const error = nullptr);\r\n\r\n    std::vector<unsigned char> UTILSCPP_EXPORT\r\n    read_binary_file(const char* const filename, cl_int* const error = nullptr);\r\n\r\n    Program::Binaries UTILSCPP_EXPORT read_binary_files(\r\n        const std::vector<cl::Device>& devices,\r\n        const char* const program_file_name, cl_int* const error = nullptr);\r\n\r\n    cl_int UTILSCPP_EXPORT\r\n    write_binaries(const cl::Program::Binaries& binaries,\r\n                   const std::vector<cl::Device>& devices,\r\n                   const char* const program_file_name);\r\n\r\n    std::string UTILSCPP_EXPORT\r\n    executable_folder(cl_int* const error = nullptr);\r\n\r\n    std::string UTILSCPP_EXPORT read_exe_relative_text_file(\r\n        const char* const filename, cl_int* const error = nullptr);\r\n}\r\n}\r\n"
  },
  {
    "path": "svm/OpenCL/include/CL/Utils/InteropContext.hpp",
    "content": "#pragma once\r\n\r\n#include \"OpenCLUtilsCpp_Export.h\"\r\n#include <CL/Utils/Error.hpp>\r\n\r\n#include <CL/opencl.hpp>\r\n\r\nnamespace cl {\r\nnamespace util {\r\n    vector<cl_context_properties>\r\n        UTILSCPP_EXPORT get_interop_context_properties(const cl::Device& plat,\r\n                                                       cl_int* error = nullptr);\r\n\r\n    Context UTILSCPP_EXPORT get_interop_context(int plat_id, int dev_id,\r\n                                                cl_device_type type,\r\n                                                cl_int* error = nullptr);\r\n}\r\n}\r\n"
  },
  {
    "path": "svm/OpenCL/include/CL/Utils/OpenCLUtilsCpp_Export.h",
    "content": "\r\n#ifndef UTILSCPP_EXPORT_H\r\n#define UTILSCPP_EXPORT_H\r\n\r\n#ifdef OPENCLUTILSCPP_STATIC_DEFINE\r\n#  define UTILSCPP_EXPORT\r\n#  define OPENCLUTILSCPP_NO_EXPORT\r\n#else\r\n#  ifndef UTILSCPP_EXPORT\r\n#    ifdef OpenCLUtilsCpp_EXPORTS\r\n        /* We are building this library */\r\n#      define UTILSCPP_EXPORT \r\n#    else\r\n        /* We are using this library */\r\n#      define UTILSCPP_EXPORT \r\n#    endif\r\n#  endif\r\n\r\n#  ifndef OPENCLUTILSCPP_NO_EXPORT\r\n#    define OPENCLUTILSCPP_NO_EXPORT \r\n#  endif\r\n#endif\r\n\r\n#ifndef OPENCLUTILSCPP_DEPRECATED\r\n#  define OPENCLUTILSCPP_DEPRECATED __declspec(deprecated)\r\n#endif\r\n\r\n#ifndef OPENCLUTILSCPP_DEPRECATED_EXPORT\r\n#  define OPENCLUTILSCPP_DEPRECATED_EXPORT UTILSCPP_EXPORT OPENCLUTILSCPP_DEPRECATED\r\n#endif\r\n\r\n#ifndef OPENCLUTILSCPP_DEPRECATED_NO_EXPORT\r\n#  define OPENCLUTILSCPP_DEPRECATED_NO_EXPORT OPENCLUTILSCPP_NO_EXPORT OPENCLUTILSCPP_DEPRECATED\r\n#endif\r\n\r\n/* NOLINTNEXTLINE(readability-avoid-unconditional-preprocessor-if) */\r\n#if 0 /* DEFINE_NO_DEPRECATED */\r\n#  ifndef OPENCLUTILSCPP_NO_DEPRECATED\r\n#    define OPENCLUTILSCPP_NO_DEPRECATED\r\n#  endif\r\n#endif\r\n\r\n#endif /* UTILSCPP_EXPORT_H */\r\n"
  },
  {
    "path": "svm/OpenCL/include/CL/Utils/OpenCLUtils_Export.h",
    "content": "\r\n#ifndef UTILS_EXPORT_H\r\n#define UTILS_EXPORT_H\r\n\r\n#ifdef OPENCLUTILS_STATIC_DEFINE\r\n#  define UTILS_EXPORT\r\n#  define OPENCLUTILS_NO_EXPORT\r\n#else\r\n#  ifndef UTILS_EXPORT\r\n#    ifdef OpenCLUtils_EXPORTS\r\n        /* We are building this library */\r\n#      define UTILS_EXPORT \r\n#    else\r\n        /* We are using this library */\r\n#      define UTILS_EXPORT \r\n#    endif\r\n#  endif\r\n\r\n#  ifndef OPENCLUTILS_NO_EXPORT\r\n#    define OPENCLUTILS_NO_EXPORT \r\n#  endif\r\n#endif\r\n\r\n#ifndef OPENCLUTILS_DEPRECATED\r\n#  define OPENCLUTILS_DEPRECATED __declspec(deprecated)\r\n#endif\r\n\r\n#ifndef OPENCLUTILS_DEPRECATED_EXPORT\r\n#  define OPENCLUTILS_DEPRECATED_EXPORT UTILS_EXPORT OPENCLUTILS_DEPRECATED\r\n#endif\r\n\r\n#ifndef OPENCLUTILS_DEPRECATED_NO_EXPORT\r\n#  define OPENCLUTILS_DEPRECATED_NO_EXPORT OPENCLUTILS_NO_EXPORT OPENCLUTILS_DEPRECATED\r\n#endif\r\n\r\n/* NOLINTNEXTLINE(readability-avoid-unconditional-preprocessor-if) */\r\n#if 0 /* DEFINE_NO_DEPRECATED */\r\n#  ifndef OPENCLUTILS_NO_DEPRECATED\r\n#    define OPENCLUTILS_NO_DEPRECATED\r\n#  endif\r\n#endif\r\n\r\n#endif /* UTILS_EXPORT_H */\r\n"
  },
  {
    "path": "svm/OpenCL/include/CL/Utils/Platform.hpp",
    "content": "#pragma once\r\n\r\n#include \"OpenCLUtilsCpp_Export.h\"\r\n#include <CL/Utils/Error.hpp>\r\n\r\n#include <CL/opencl.hpp>\r\n\r\nnamespace cl {\r\nnamespace util {\r\n    bool UTILSCPP_EXPORT supports_extension(const cl::Platform& platform,\r\n                                            const cl::string& extension);\r\n\r\n    bool UTILSCPP_EXPORT platform_version_contains(\r\n        const cl::Platform& platform, const cl::string& version_fragment);\r\n}\r\n}\r\n"
  },
  {
    "path": "svm/OpenCL/include/CL/Utils/Utils.h",
    "content": "#pragma once\r\n\r\n// OpenCL Utils includes\r\n#include \"OpenCLUtils_Export.h\"\r\n\r\n#include <CL/Utils/Error.h>\r\n#include <CL/Utils/File.h>\r\n#include <CL/Utils/Context.h>\r\n\r\n// OpenCL includes\r\n#include <CL/cl.h>\r\n"
  },
  {
    "path": "svm/OpenCL/include/CL/Utils/Utils.hpp",
    "content": "#pragma once\r\n\r\n// OpenCL Utils includes\r\n#include \"OpenCLUtils_Export.h\"\r\n\r\n#include <CL/Utils/Detail.hpp>\r\n#include <CL/Utils/Error.hpp>\r\n#include <CL/Utils/Platform.hpp>\r\n#include <CL/Utils/Device.hpp>\r\n#include <CL/Utils/Context.hpp>\r\n#include <CL/Utils/Event.hpp>\r\n#include <CL/Utils/File.hpp>\r\n\r\n// OpenCL includes\r\n#include <CL/opencl.hpp>\r\n"
  },
  {
    "path": "svm/OpenCL/include/CL/cl.h",
    "content": "/*******************************************************************************\r\n * Copyright (c) 2008-2020 The Khronos Group Inc.\r\n *\r\n * Licensed under the Apache License, Version 2.0 (the \"License\");\r\n * you may not use this file except in compliance with the License.\r\n * You may obtain a copy of the License at\r\n *\r\n *    http://www.apache.org/licenses/LICENSE-2.0\r\n *\r\n * Unless required by applicable law or agreed to in writing, software\r\n * distributed under the License is distributed on an \"AS IS\" BASIS,\r\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\r\n * See the License for the specific language governing permissions and\r\n * limitations under the License.\r\n ******************************************************************************/\r\n\r\n#ifndef __OPENCL_CL_H\r\n#define __OPENCL_CL_H\r\n\r\n#include <CL/cl_version.h>\r\n#include <CL/cl_platform.h>\r\n\r\n#ifdef __cplusplus\r\nextern \"C\" {\r\n#endif\r\n\r\n/******************************************************************************/\r\n\r\ntypedef struct _cl_platform_id *    cl_platform_id;\r\ntypedef struct _cl_device_id *      cl_device_id;\r\ntypedef struct _cl_context *        cl_context;\r\ntypedef struct _cl_command_queue *  cl_command_queue;\r\ntypedef struct _cl_mem *            cl_mem;\r\ntypedef struct _cl_program *        cl_program;\r\ntypedef struct _cl_kernel *         cl_kernel;\r\ntypedef struct _cl_event *          cl_event;\r\ntypedef struct _cl_sampler *        cl_sampler;\r\n\r\ntypedef cl_uint             cl_bool;                     /* WARNING!  Unlike cl_ types in cl_platform.h, cl_bool is not guaranteed to be the same size as the bool in kernels. 
*/\r\ntypedef cl_ulong            cl_bitfield;\r\ntypedef cl_ulong            cl_properties;\r\ntypedef cl_bitfield         cl_device_type;\r\ntypedef cl_uint             cl_platform_info;\r\ntypedef cl_uint             cl_device_info;\r\ntypedef cl_bitfield         cl_device_fp_config;\r\ntypedef cl_uint             cl_device_mem_cache_type;\r\ntypedef cl_uint             cl_device_local_mem_type;\r\ntypedef cl_bitfield         cl_device_exec_capabilities;\r\n#ifdef CL_VERSION_2_0\r\ntypedef cl_bitfield         cl_device_svm_capabilities;\r\n#endif\r\ntypedef cl_bitfield         cl_command_queue_properties;\r\n#ifdef CL_VERSION_1_2\r\ntypedef intptr_t            cl_device_partition_property;\r\ntypedef cl_bitfield         cl_device_affinity_domain;\r\n#endif\r\n\r\ntypedef intptr_t            cl_context_properties;\r\ntypedef cl_uint             cl_context_info;\r\n#ifdef CL_VERSION_2_0\r\ntypedef cl_properties       cl_queue_properties;\r\n#endif\r\ntypedef cl_uint             cl_command_queue_info;\r\ntypedef cl_uint             cl_channel_order;\r\ntypedef cl_uint             cl_channel_type;\r\ntypedef cl_bitfield         cl_mem_flags;\r\n#ifdef CL_VERSION_2_0\r\ntypedef cl_bitfield         cl_svm_mem_flags;\r\n#endif\r\ntypedef cl_uint             cl_mem_object_type;\r\ntypedef cl_uint             cl_mem_info;\r\n#ifdef CL_VERSION_1_2\r\ntypedef cl_bitfield         cl_mem_migration_flags;\r\n#endif\r\ntypedef cl_uint             cl_image_info;\r\n#ifdef CL_VERSION_1_1\r\ntypedef cl_uint             cl_buffer_create_type;\r\n#endif\r\ntypedef cl_uint             cl_addressing_mode;\r\ntypedef cl_uint             cl_filter_mode;\r\ntypedef cl_uint             cl_sampler_info;\r\ntypedef cl_bitfield         cl_map_flags;\r\n#ifdef CL_VERSION_2_0\r\ntypedef intptr_t            cl_pipe_properties;\r\ntypedef cl_uint             cl_pipe_info;\r\n#endif\r\ntypedef cl_uint             cl_program_info;\r\ntypedef cl_uint             cl_program_build_info;\r\n#ifdef 
CL_VERSION_1_2\r\ntypedef cl_uint             cl_program_binary_type;\r\n#endif\r\ntypedef cl_int              cl_build_status;\r\ntypedef cl_uint             cl_kernel_info;\r\n#ifdef CL_VERSION_1_2\r\ntypedef cl_uint             cl_kernel_arg_info;\r\ntypedef cl_uint             cl_kernel_arg_address_qualifier;\r\ntypedef cl_uint             cl_kernel_arg_access_qualifier;\r\ntypedef cl_bitfield         cl_kernel_arg_type_qualifier;\r\n#endif\r\ntypedef cl_uint             cl_kernel_work_group_info;\r\n#ifdef CL_VERSION_2_1\r\ntypedef cl_uint             cl_kernel_sub_group_info;\r\n#endif\r\ntypedef cl_uint             cl_event_info;\r\ntypedef cl_uint             cl_command_type;\r\ntypedef cl_uint             cl_profiling_info;\r\n#ifdef CL_VERSION_2_0\r\ntypedef cl_properties       cl_sampler_properties;\r\ntypedef cl_uint             cl_kernel_exec_info;\r\n#endif\r\n#ifdef CL_VERSION_3_0\r\ntypedef cl_bitfield         cl_device_atomic_capabilities;\r\ntypedef cl_bitfield         cl_device_device_enqueue_capabilities;\r\ntypedef cl_uint             cl_khronos_vendor_id;\r\ntypedef cl_properties cl_mem_properties;\r\n#endif\r\ntypedef cl_uint cl_version;\r\n\r\ntypedef struct _cl_image_format {\r\n    cl_channel_order        image_channel_order;\r\n    cl_channel_type         image_channel_data_type;\r\n} cl_image_format;\r\n\r\n#ifdef CL_VERSION_1_2\r\n\r\ntypedef struct _cl_image_desc {\r\n    cl_mem_object_type      image_type;\r\n    size_t                  image_width;\r\n    size_t                  image_height;\r\n    size_t                  image_depth;\r\n    size_t                  image_array_size;\r\n    size_t                  image_row_pitch;\r\n    size_t                  image_slice_pitch;\r\n    cl_uint                 num_mip_levels;\r\n    cl_uint                 num_samples;\r\n#ifdef CL_VERSION_2_0\r\n#if defined(__GNUC__)\r\n    __extension__                   /* Prevents warnings about anonymous union in -pedantic builds 
*/\r\n#endif\r\n#if defined(_MSC_VER) && !defined(__STDC__)\r\n#pragma warning( push )\r\n#pragma warning( disable : 4201 )   /* Prevents warning about nameless struct/union in /W4 builds */\r\n#endif\r\n#ifdef __clang__\r\n#pragma clang diagnostic push\r\n#pragma clang diagnostic ignored \"-Wc11-extensions\" /* Prevents warning about nameless union being C11 extension*/\r\n#endif\r\n#if defined(_MSC_VER) && defined(__STDC__)\r\n    /* Anonymous unions are not supported in /Za builds */\r\n#else\r\n    union {\r\n#endif\r\n#endif\r\n      cl_mem                  buffer;\r\n#ifdef CL_VERSION_2_0\r\n#if defined(_MSC_VER) && defined(__STDC__)\r\n    /* Anonymous unions are not supported in /Za builds */\r\n#else\r\n      cl_mem                  mem_object;\r\n    };\r\n#endif\r\n#if defined(_MSC_VER) && !defined(__STDC__)\r\n#pragma warning( pop )\r\n#endif\r\n#ifdef __clang__\r\n#pragma clang diagnostic pop\r\n#endif\r\n#endif\r\n} cl_image_desc;\r\n\r\n#endif\r\n\r\n#ifdef CL_VERSION_1_1\r\n\r\ntypedef struct _cl_buffer_region {\r\n    size_t                  origin;\r\n    size_t                  size;\r\n} cl_buffer_region;\r\n\r\n#endif\r\n\r\n#ifdef CL_VERSION_3_0\r\n\r\n#define CL_NAME_VERSION_MAX_NAME_SIZE 64\r\n\r\ntypedef struct _cl_name_version {\r\n    cl_version              version;\r\n    char                    name[CL_NAME_VERSION_MAX_NAME_SIZE];\r\n} cl_name_version;\r\n\r\n#endif\r\n\r\n/******************************************************************************/\r\n\r\n/* Error Codes */\r\n#define CL_SUCCESS                                  0\r\n#define CL_DEVICE_NOT_FOUND                         -1\r\n#define CL_DEVICE_NOT_AVAILABLE                     -2\r\n#define CL_COMPILER_NOT_AVAILABLE                   -3\r\n#define CL_MEM_OBJECT_ALLOCATION_FAILURE            -4\r\n#define CL_OUT_OF_RESOURCES                         -5\r\n#define CL_OUT_OF_HOST_MEMORY                       -6\r\n#define CL_PROFILING_INFO_NOT_AVAILABLE             
-7\r\n#define CL_MEM_COPY_OVERLAP                         -8\r\n#define CL_IMAGE_FORMAT_MISMATCH                    -9\r\n#define CL_IMAGE_FORMAT_NOT_SUPPORTED               -10\r\n#define CL_BUILD_PROGRAM_FAILURE                    -11\r\n#define CL_MAP_FAILURE                              -12\r\n#ifdef CL_VERSION_1_1\r\n#define CL_MISALIGNED_SUB_BUFFER_OFFSET             -13\r\n#define CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST -14\r\n#endif\r\n#ifdef CL_VERSION_1_2\r\n#define CL_COMPILE_PROGRAM_FAILURE                  -15\r\n#define CL_LINKER_NOT_AVAILABLE                     -16\r\n#define CL_LINK_PROGRAM_FAILURE                     -17\r\n#define CL_DEVICE_PARTITION_FAILED                  -18\r\n#define CL_KERNEL_ARG_INFO_NOT_AVAILABLE            -19\r\n#endif\r\n\r\n#define CL_INVALID_VALUE                            -30\r\n#define CL_INVALID_DEVICE_TYPE                      -31\r\n#define CL_INVALID_PLATFORM                         -32\r\n#define CL_INVALID_DEVICE                           -33\r\n#define CL_INVALID_CONTEXT                          -34\r\n#define CL_INVALID_QUEUE_PROPERTIES                 -35\r\n#define CL_INVALID_COMMAND_QUEUE                    -36\r\n#define CL_INVALID_HOST_PTR                         -37\r\n#define CL_INVALID_MEM_OBJECT                       -38\r\n#define CL_INVALID_IMAGE_FORMAT_DESCRIPTOR          -39\r\n#define CL_INVALID_IMAGE_SIZE                       -40\r\n#define CL_INVALID_SAMPLER                          -41\r\n#define CL_INVALID_BINARY                           -42\r\n#define CL_INVALID_BUILD_OPTIONS                    -43\r\n#define CL_INVALID_PROGRAM                          -44\r\n#define CL_INVALID_PROGRAM_EXECUTABLE               -45\r\n#define CL_INVALID_KERNEL_NAME                      -46\r\n#define CL_INVALID_KERNEL_DEFINITION                -47\r\n#define CL_INVALID_KERNEL                           -48\r\n#define CL_INVALID_ARG_INDEX                        -49\r\n#define CL_INVALID_ARG_VALUE     
                   -50\r\n#define CL_INVALID_ARG_SIZE                         -51\r\n#define CL_INVALID_KERNEL_ARGS                      -52\r\n#define CL_INVALID_WORK_DIMENSION                   -53\r\n#define CL_INVALID_WORK_GROUP_SIZE                  -54\r\n#define CL_INVALID_WORK_ITEM_SIZE                   -55\r\n#define CL_INVALID_GLOBAL_OFFSET                    -56\r\n#define CL_INVALID_EVENT_WAIT_LIST                  -57\r\n#define CL_INVALID_EVENT                            -58\r\n#define CL_INVALID_OPERATION                        -59\r\n#define CL_INVALID_GL_OBJECT                        -60\r\n#define CL_INVALID_BUFFER_SIZE                      -61\r\n#define CL_INVALID_MIP_LEVEL                        -62\r\n#define CL_INVALID_GLOBAL_WORK_SIZE                 -63\r\n#ifdef CL_VERSION_1_1\r\n#define CL_INVALID_PROPERTY                         -64\r\n#endif\r\n#ifdef CL_VERSION_1_2\r\n#define CL_INVALID_IMAGE_DESCRIPTOR                 -65\r\n#define CL_INVALID_COMPILER_OPTIONS                 -66\r\n#define CL_INVALID_LINKER_OPTIONS                   -67\r\n#define CL_INVALID_DEVICE_PARTITION_COUNT           -68\r\n#endif\r\n#ifdef CL_VERSION_2_0\r\n#define CL_INVALID_PIPE_SIZE                        -69\r\n#define CL_INVALID_DEVICE_QUEUE                     -70\r\n#endif\r\n#ifdef CL_VERSION_2_2\r\n#define CL_INVALID_SPEC_ID                          -71\r\n#define CL_MAX_SIZE_RESTRICTION_EXCEEDED            -72\r\n#endif\r\n\r\n\r\n/* cl_bool */\r\n#define CL_FALSE                                    0\r\n#define CL_TRUE                                     1\r\n#ifdef CL_VERSION_1_2\r\n#define CL_BLOCKING                                 CL_TRUE\r\n#define CL_NON_BLOCKING                             CL_FALSE\r\n#endif\r\n\r\n/* cl_platform_info */\r\n#define CL_PLATFORM_PROFILE                         0x0900\r\n#define CL_PLATFORM_VERSION                         0x0901\r\n#define CL_PLATFORM_NAME                            0x0902\r\n#define 
CL_PLATFORM_VENDOR                          0x0903\r\n#define CL_PLATFORM_EXTENSIONS                      0x0904\r\n#ifdef CL_VERSION_2_1\r\n#define CL_PLATFORM_HOST_TIMER_RESOLUTION           0x0905\r\n#endif\r\n#ifdef CL_VERSION_3_0\r\n#define CL_PLATFORM_NUMERIC_VERSION                 0x0906\r\n#define CL_PLATFORM_EXTENSIONS_WITH_VERSION         0x0907\r\n#endif\r\n\r\n/* cl_device_type - bitfield */\r\n#define CL_DEVICE_TYPE_DEFAULT                      (1 << 0)\r\n#define CL_DEVICE_TYPE_CPU                          (1 << 1)\r\n#define CL_DEVICE_TYPE_GPU                          (1 << 2)\r\n#define CL_DEVICE_TYPE_ACCELERATOR                  (1 << 3)\r\n#ifdef CL_VERSION_1_2\r\n#define CL_DEVICE_TYPE_CUSTOM                       (1 << 4)\r\n#endif\r\n#define CL_DEVICE_TYPE_ALL                          0xFFFFFFFF\r\n\r\n/* cl_device_info */\r\n#define CL_DEVICE_TYPE                                   0x1000\r\n#define CL_DEVICE_VENDOR_ID                              0x1001\r\n#define CL_DEVICE_MAX_COMPUTE_UNITS                      0x1002\r\n#define CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS               0x1003\r\n#define CL_DEVICE_MAX_WORK_GROUP_SIZE                    0x1004\r\n#define CL_DEVICE_MAX_WORK_ITEM_SIZES                    0x1005\r\n#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR            0x1006\r\n#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT           0x1007\r\n#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT             0x1008\r\n#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG            0x1009\r\n#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT           0x100A\r\n#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE          0x100B\r\n#define CL_DEVICE_MAX_CLOCK_FREQUENCY                    0x100C\r\n#define CL_DEVICE_ADDRESS_BITS                           0x100D\r\n#define CL_DEVICE_MAX_READ_IMAGE_ARGS                    0x100E\r\n#define CL_DEVICE_MAX_WRITE_IMAGE_ARGS                   0x100F\r\n#define CL_DEVICE_MAX_MEM_ALLOC_SIZE                     
0x1010\r\n#define CL_DEVICE_IMAGE2D_MAX_WIDTH                      0x1011\r\n#define CL_DEVICE_IMAGE2D_MAX_HEIGHT                     0x1012\r\n#define CL_DEVICE_IMAGE3D_MAX_WIDTH                      0x1013\r\n#define CL_DEVICE_IMAGE3D_MAX_HEIGHT                     0x1014\r\n#define CL_DEVICE_IMAGE3D_MAX_DEPTH                      0x1015\r\n#define CL_DEVICE_IMAGE_SUPPORT                          0x1016\r\n#define CL_DEVICE_MAX_PARAMETER_SIZE                     0x1017\r\n#define CL_DEVICE_MAX_SAMPLERS                           0x1018\r\n#define CL_DEVICE_MEM_BASE_ADDR_ALIGN                    0x1019\r\n#define CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE               0x101A\r\n#define CL_DEVICE_SINGLE_FP_CONFIG                       0x101B\r\n#define CL_DEVICE_GLOBAL_MEM_CACHE_TYPE                  0x101C\r\n#define CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE              0x101D\r\n#define CL_DEVICE_GLOBAL_MEM_CACHE_SIZE                  0x101E\r\n#define CL_DEVICE_GLOBAL_MEM_SIZE                        0x101F\r\n#define CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE               0x1020\r\n#define CL_DEVICE_MAX_CONSTANT_ARGS                      0x1021\r\n#define CL_DEVICE_LOCAL_MEM_TYPE                         0x1022\r\n#define CL_DEVICE_LOCAL_MEM_SIZE                         0x1023\r\n#define CL_DEVICE_ERROR_CORRECTION_SUPPORT               0x1024\r\n#define CL_DEVICE_PROFILING_TIMER_RESOLUTION             0x1025\r\n#define CL_DEVICE_ENDIAN_LITTLE                          0x1026\r\n#define CL_DEVICE_AVAILABLE                              0x1027\r\n#define CL_DEVICE_COMPILER_AVAILABLE                     0x1028\r\n#define CL_DEVICE_EXECUTION_CAPABILITIES                 0x1029\r\n#define CL_DEVICE_QUEUE_PROPERTIES                       0x102A    /* deprecated */\r\n#ifdef CL_VERSION_2_0\r\n#define CL_DEVICE_QUEUE_ON_HOST_PROPERTIES               0x102A\r\n#endif\r\n#define CL_DEVICE_NAME                                   0x102B\r\n#define CL_DEVICE_VENDOR                                 
0x102C\r\n#define CL_DRIVER_VERSION                                0x102D\r\n#define CL_DEVICE_PROFILE                                0x102E\r\n#define CL_DEVICE_VERSION                                0x102F\r\n#define CL_DEVICE_EXTENSIONS                             0x1030\r\n#define CL_DEVICE_PLATFORM                               0x1031\r\n#ifdef CL_VERSION_1_2\r\n#define CL_DEVICE_DOUBLE_FP_CONFIG                       0x1032\r\n#endif\r\n/* 0x1033 reserved for CL_DEVICE_HALF_FP_CONFIG which is already defined in \"cl_ext.h\" */\r\n#ifdef CL_VERSION_1_1\r\n#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF            0x1034\r\n#define CL_DEVICE_HOST_UNIFIED_MEMORY                    0x1035   /* deprecated */\r\n#define CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR               0x1036\r\n#define CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT              0x1037\r\n#define CL_DEVICE_NATIVE_VECTOR_WIDTH_INT                0x1038\r\n#define CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG               0x1039\r\n#define CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT              0x103A\r\n#define CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE             0x103B\r\n#define CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF               0x103C\r\n#define CL_DEVICE_OPENCL_C_VERSION                       0x103D\r\n#endif\r\n#ifdef CL_VERSION_1_2\r\n#define CL_DEVICE_LINKER_AVAILABLE                       0x103E\r\n#define CL_DEVICE_BUILT_IN_KERNELS                       0x103F\r\n#define CL_DEVICE_IMAGE_MAX_BUFFER_SIZE                  0x1040\r\n#define CL_DEVICE_IMAGE_MAX_ARRAY_SIZE                   0x1041\r\n#define CL_DEVICE_PARENT_DEVICE                          0x1042\r\n#define CL_DEVICE_PARTITION_MAX_SUB_DEVICES              0x1043\r\n#define CL_DEVICE_PARTITION_PROPERTIES                   0x1044\r\n#define CL_DEVICE_PARTITION_AFFINITY_DOMAIN              0x1045\r\n#define CL_DEVICE_PARTITION_TYPE                         0x1046\r\n#define CL_DEVICE_REFERENCE_COUNT                        0x1047\r\n#define 
CL_DEVICE_PREFERRED_INTEROP_USER_SYNC            0x1048\r\n#define CL_DEVICE_PRINTF_BUFFER_SIZE                     0x1049\r\n#endif\r\n#ifdef CL_VERSION_2_0\r\n#define CL_DEVICE_IMAGE_PITCH_ALIGNMENT                  0x104A\r\n#define CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT           0x104B\r\n#define CL_DEVICE_MAX_READ_WRITE_IMAGE_ARGS              0x104C\r\n#define CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE               0x104D\r\n#define CL_DEVICE_QUEUE_ON_DEVICE_PROPERTIES             0x104E\r\n#define CL_DEVICE_QUEUE_ON_DEVICE_PREFERRED_SIZE         0x104F\r\n#define CL_DEVICE_QUEUE_ON_DEVICE_MAX_SIZE               0x1050\r\n#define CL_DEVICE_MAX_ON_DEVICE_QUEUES                   0x1051\r\n#define CL_DEVICE_MAX_ON_DEVICE_EVENTS                   0x1052\r\n#define CL_DEVICE_SVM_CAPABILITIES                       0x1053\r\n#define CL_DEVICE_GLOBAL_VARIABLE_PREFERRED_TOTAL_SIZE   0x1054\r\n#define CL_DEVICE_MAX_PIPE_ARGS                          0x1055\r\n#define CL_DEVICE_PIPE_MAX_ACTIVE_RESERVATIONS           0x1056\r\n#define CL_DEVICE_PIPE_MAX_PACKET_SIZE                   0x1057\r\n#define CL_DEVICE_PREFERRED_PLATFORM_ATOMIC_ALIGNMENT    0x1058\r\n#define CL_DEVICE_PREFERRED_GLOBAL_ATOMIC_ALIGNMENT      0x1059\r\n#define CL_DEVICE_PREFERRED_LOCAL_ATOMIC_ALIGNMENT       0x105A\r\n#endif\r\n#ifdef CL_VERSION_2_1\r\n#define CL_DEVICE_IL_VERSION                             0x105B\r\n#define CL_DEVICE_MAX_NUM_SUB_GROUPS                     0x105C\r\n#define CL_DEVICE_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS 0x105D\r\n#endif\r\n#ifdef CL_VERSION_3_0\r\n#define CL_DEVICE_NUMERIC_VERSION                        0x105E\r\n#define CL_DEVICE_EXTENSIONS_WITH_VERSION                0x1060\r\n#define CL_DEVICE_ILS_WITH_VERSION                       0x1061\r\n#define CL_DEVICE_BUILT_IN_KERNELS_WITH_VERSION          0x1062\r\n#define CL_DEVICE_ATOMIC_MEMORY_CAPABILITIES             0x1063\r\n#define CL_DEVICE_ATOMIC_FENCE_CAPABILITIES              0x1064\r\n#define 
CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT         0x1065\r\n#define CL_DEVICE_OPENCL_C_ALL_VERSIONS                  0x1066\r\n#define CL_DEVICE_PREFERRED_WORK_GROUP_SIZE_MULTIPLE     0x1067\r\n#define CL_DEVICE_WORK_GROUP_COLLECTIVE_FUNCTIONS_SUPPORT 0x1068\r\n#define CL_DEVICE_GENERIC_ADDRESS_SPACE_SUPPORT          0x1069\r\n/* 0x106A to 0x106E - Reserved for upcoming KHR extension */\r\n#define CL_DEVICE_OPENCL_C_FEATURES                      0x106F\r\n#define CL_DEVICE_DEVICE_ENQUEUE_CAPABILITIES            0x1070\r\n#define CL_DEVICE_PIPE_SUPPORT                           0x1071\r\n#define CL_DEVICE_LATEST_CONFORMANCE_VERSION_PASSED      0x1072\r\n#endif\r\n\r\n/* cl_device_fp_config - bitfield */\r\n#define CL_FP_DENORM                                (1 << 0)\r\n#define CL_FP_INF_NAN                               (1 << 1)\r\n#define CL_FP_ROUND_TO_NEAREST                      (1 << 2)\r\n#define CL_FP_ROUND_TO_ZERO                         (1 << 3)\r\n#define CL_FP_ROUND_TO_INF                          (1 << 4)\r\n#define CL_FP_FMA                                   (1 << 5)\r\n#ifdef CL_VERSION_1_1\r\n#define CL_FP_SOFT_FLOAT                            (1 << 6)\r\n#endif\r\n#ifdef CL_VERSION_1_2\r\n#define CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT         (1 << 7)\r\n#endif\r\n\r\n/* cl_device_mem_cache_type */\r\n#define CL_NONE                                     0x0\r\n#define CL_READ_ONLY_CACHE                          0x1\r\n#define CL_READ_WRITE_CACHE                         0x2\r\n\r\n/* cl_device_local_mem_type */\r\n#define CL_LOCAL                                    0x1\r\n#define CL_GLOBAL                                   0x2\r\n\r\n/* cl_device_exec_capabilities - bitfield */\r\n#define CL_EXEC_KERNEL                              (1 << 0)\r\n#define CL_EXEC_NATIVE_KERNEL                       (1 << 1)\r\n\r\n/* cl_command_queue_properties - bitfield */\r\n#define CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE      (1 << 0)\r\n#define CL_QUEUE_PROFILING_ENABLE    
               (1 << 1)\r\n#ifdef CL_VERSION_2_0\r\n#define CL_QUEUE_ON_DEVICE                          (1 << 2)\r\n#define CL_QUEUE_ON_DEVICE_DEFAULT                  (1 << 3)\r\n#endif\r\n\r\n/* cl_context_info */\r\n#define CL_CONTEXT_REFERENCE_COUNT                  0x1080\r\n#define CL_CONTEXT_DEVICES                          0x1081\r\n#define CL_CONTEXT_PROPERTIES                       0x1082\r\n#ifdef CL_VERSION_1_1\r\n#define CL_CONTEXT_NUM_DEVICES                      0x1083\r\n#endif\r\n\r\n/* cl_context_properties */\r\n#define CL_CONTEXT_PLATFORM                         0x1084\r\n#ifdef CL_VERSION_1_2\r\n#define CL_CONTEXT_INTEROP_USER_SYNC                0x1085\r\n#endif\r\n\r\n#ifdef CL_VERSION_1_2\r\n\r\n/* cl_device_partition_property */\r\n#define CL_DEVICE_PARTITION_EQUALLY                 0x1086\r\n#define CL_DEVICE_PARTITION_BY_COUNTS               0x1087\r\n#define CL_DEVICE_PARTITION_BY_COUNTS_LIST_END      0x0\r\n#define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN      0x1088\r\n\r\n#endif\r\n\r\n#ifdef CL_VERSION_1_2\r\n\r\n/* cl_device_affinity_domain */\r\n#define CL_DEVICE_AFFINITY_DOMAIN_NUMA               (1 << 0)\r\n#define CL_DEVICE_AFFINITY_DOMAIN_L4_CACHE           (1 << 1)\r\n#define CL_DEVICE_AFFINITY_DOMAIN_L3_CACHE           (1 << 2)\r\n#define CL_DEVICE_AFFINITY_DOMAIN_L2_CACHE           (1 << 3)\r\n#define CL_DEVICE_AFFINITY_DOMAIN_L1_CACHE           (1 << 4)\r\n#define CL_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE (1 << 5)\r\n\r\n#endif\r\n\r\n#ifdef CL_VERSION_2_0\r\n\r\n/* cl_device_svm_capabilities */\r\n#define CL_DEVICE_SVM_COARSE_GRAIN_BUFFER           (1 << 0)\r\n#define CL_DEVICE_SVM_FINE_GRAIN_BUFFER             (1 << 1)\r\n#define CL_DEVICE_SVM_FINE_GRAIN_SYSTEM             (1 << 2)\r\n#define CL_DEVICE_SVM_ATOMICS                       (1 << 3)\r\n\r\n#endif\r\n\r\n/* cl_command_queue_info */\r\n#define CL_QUEUE_CONTEXT                            0x1090\r\n#define CL_QUEUE_DEVICE                             
0x1091\r\n#define CL_QUEUE_REFERENCE_COUNT                    0x1092\r\n#define CL_QUEUE_PROPERTIES                         0x1093\r\n#ifdef CL_VERSION_2_0\r\n#define CL_QUEUE_SIZE                               0x1094\r\n#endif\r\n#ifdef CL_VERSION_2_1\r\n#define CL_QUEUE_DEVICE_DEFAULT                     0x1095\r\n#endif\r\n#ifdef CL_VERSION_3_0\r\n#define CL_QUEUE_PROPERTIES_ARRAY                   0x1098\r\n#endif\r\n\r\n/* cl_mem_flags and cl_svm_mem_flags - bitfield */\r\n#define CL_MEM_READ_WRITE                           (1 << 0)\r\n#define CL_MEM_WRITE_ONLY                           (1 << 1)\r\n#define CL_MEM_READ_ONLY                            (1 << 2)\r\n#define CL_MEM_USE_HOST_PTR                         (1 << 3)\r\n#define CL_MEM_ALLOC_HOST_PTR                       (1 << 4)\r\n#define CL_MEM_COPY_HOST_PTR                        (1 << 5)\r\n/* reserved                                         (1 << 6)    */\r\n#ifdef CL_VERSION_1_2\r\n#define CL_MEM_HOST_WRITE_ONLY                      (1 << 7)\r\n#define CL_MEM_HOST_READ_ONLY                       (1 << 8)\r\n#define CL_MEM_HOST_NO_ACCESS                       (1 << 9)\r\n#endif\r\n#ifdef CL_VERSION_2_0\r\n#define CL_MEM_SVM_FINE_GRAIN_BUFFER                (1 << 10)   /* used by cl_svm_mem_flags only */\r\n#define CL_MEM_SVM_ATOMICS                          (1 << 11)   /* used by cl_svm_mem_flags only */\r\n#define CL_MEM_KERNEL_READ_AND_WRITE                (1 << 12)\r\n#endif\r\n\r\n#ifdef CL_VERSION_1_2\r\n\r\n/* cl_mem_migration_flags - bitfield */\r\n#define CL_MIGRATE_MEM_OBJECT_HOST                  (1 << 0)\r\n#define CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED     (1 << 1)\r\n\r\n#endif\r\n\r\n/* cl_channel_order */\r\n#define CL_R                                        0x10B0\r\n#define CL_A                                        0x10B1\r\n#define CL_RG                                       0x10B2\r\n#define CL_RA                                       0x10B3\r\n#define CL_RGB                    
                  0x10B4\r\n#define CL_RGBA                                     0x10B5\r\n#define CL_BGRA                                     0x10B6\r\n#define CL_ARGB                                     0x10B7\r\n#define CL_INTENSITY                                0x10B8\r\n#define CL_LUMINANCE                                0x10B9\r\n#ifdef CL_VERSION_1_1\r\n#define CL_Rx                                       0x10BA\r\n#define CL_RGx                                      0x10BB\r\n#define CL_RGBx                                     0x10BC\r\n#endif\r\n#ifdef CL_VERSION_2_0\r\n#define CL_DEPTH                                    0x10BD\r\n#define CL_sRGB                                     0x10BF\r\n#define CL_sRGBx                                    0x10C0\r\n#define CL_sRGBA                                    0x10C1\r\n#define CL_sBGRA                                    0x10C2\r\n#define CL_ABGR                                     0x10C3\r\n#endif\r\n\r\n/* cl_channel_type */\r\n#define CL_SNORM_INT8                               0x10D0\r\n#define CL_SNORM_INT16                              0x10D1\r\n#define CL_UNORM_INT8                               0x10D2\r\n#define CL_UNORM_INT16                              0x10D3\r\n#define CL_UNORM_SHORT_565                          0x10D4\r\n#define CL_UNORM_SHORT_555                          0x10D5\r\n#define CL_UNORM_INT_101010                         0x10D6\r\n#define CL_SIGNED_INT8                              0x10D7\r\n#define CL_SIGNED_INT16                             0x10D8\r\n#define CL_SIGNED_INT32                             0x10D9\r\n#define CL_UNSIGNED_INT8                            0x10DA\r\n#define CL_UNSIGNED_INT16                           0x10DB\r\n#define CL_UNSIGNED_INT32                           0x10DC\r\n#define CL_HALF_FLOAT                               0x10DD\r\n#define CL_FLOAT                                    0x10DE\r\n#ifdef CL_VERSION_2_1\r\n#define CL_UNORM_INT_101010_2                     
  0x10E0\r\n#endif\r\n\r\n/* cl_mem_object_type */\r\n#define CL_MEM_OBJECT_BUFFER                        0x10F0\r\n#define CL_MEM_OBJECT_IMAGE2D                       0x10F1\r\n#define CL_MEM_OBJECT_IMAGE3D                       0x10F2\r\n#ifdef CL_VERSION_1_2\r\n#define CL_MEM_OBJECT_IMAGE2D_ARRAY                 0x10F3\r\n#define CL_MEM_OBJECT_IMAGE1D                       0x10F4\r\n#define CL_MEM_OBJECT_IMAGE1D_ARRAY                 0x10F5\r\n#define CL_MEM_OBJECT_IMAGE1D_BUFFER                0x10F6\r\n#endif\r\n#ifdef CL_VERSION_2_0\r\n#define CL_MEM_OBJECT_PIPE                          0x10F7\r\n#endif\r\n\r\n/* cl_mem_info */\r\n#define CL_MEM_TYPE                                 0x1100\r\n#define CL_MEM_FLAGS                                0x1101\r\n#define CL_MEM_SIZE                                 0x1102\r\n#define CL_MEM_HOST_PTR                             0x1103\r\n#define CL_MEM_MAP_COUNT                            0x1104\r\n#define CL_MEM_REFERENCE_COUNT                      0x1105\r\n#define CL_MEM_CONTEXT                              0x1106\r\n#ifdef CL_VERSION_1_1\r\n#define CL_MEM_ASSOCIATED_MEMOBJECT                 0x1107\r\n#define CL_MEM_OFFSET                               0x1108\r\n#endif\r\n#ifdef CL_VERSION_2_0\r\n#define CL_MEM_USES_SVM_POINTER                     0x1109\r\n#endif\r\n#ifdef CL_VERSION_3_0\r\n#define CL_MEM_PROPERTIES                           0x110A\r\n#endif\r\n\r\n/* cl_image_info */\r\n#define CL_IMAGE_FORMAT                             0x1110\r\n#define CL_IMAGE_ELEMENT_SIZE                       0x1111\r\n#define CL_IMAGE_ROW_PITCH                          0x1112\r\n#define CL_IMAGE_SLICE_PITCH                        0x1113\r\n#define CL_IMAGE_WIDTH                              0x1114\r\n#define CL_IMAGE_HEIGHT                             0x1115\r\n#define CL_IMAGE_DEPTH                              0x1116\r\n#ifdef CL_VERSION_1_2\r\n#define CL_IMAGE_ARRAY_SIZE                         0x1117\r\n#define 
CL_IMAGE_BUFFER                             0x1118\r\n#define CL_IMAGE_NUM_MIP_LEVELS                     0x1119\r\n#define CL_IMAGE_NUM_SAMPLES                        0x111A\r\n#endif\r\n\r\n\r\n/* cl_pipe_info */\r\n#ifdef CL_VERSION_2_0\r\n#define CL_PIPE_PACKET_SIZE                         0x1120\r\n#define CL_PIPE_MAX_PACKETS                         0x1121\r\n#endif\r\n#ifdef CL_VERSION_3_0\r\n#define CL_PIPE_PROPERTIES                          0x1122\r\n#endif\r\n\r\n/* cl_addressing_mode */\r\n#define CL_ADDRESS_NONE                             0x1130\r\n#define CL_ADDRESS_CLAMP_TO_EDGE                    0x1131\r\n#define CL_ADDRESS_CLAMP                            0x1132\r\n#define CL_ADDRESS_REPEAT                           0x1133\r\n#ifdef CL_VERSION_1_1\r\n#define CL_ADDRESS_MIRRORED_REPEAT                  0x1134\r\n#endif\r\n\r\n/* cl_filter_mode */\r\n#define CL_FILTER_NEAREST                           0x1140\r\n#define CL_FILTER_LINEAR                            0x1141\r\n\r\n/* cl_sampler_info */\r\n#define CL_SAMPLER_REFERENCE_COUNT                  0x1150\r\n#define CL_SAMPLER_CONTEXT                          0x1151\r\n#define CL_SAMPLER_NORMALIZED_COORDS                0x1152\r\n#define CL_SAMPLER_ADDRESSING_MODE                  0x1153\r\n#define CL_SAMPLER_FILTER_MODE                      0x1154\r\n#ifdef CL_VERSION_2_0\r\n/* These enumerants are for the cl_khr_mipmap_image extension.\r\n   They have since been added to cl_ext.h with an appropriate\r\n   KHR suffix, but are left here for backwards compatibility. 
*/\r\n#define CL_SAMPLER_MIP_FILTER_MODE                  0x1155\r\n#define CL_SAMPLER_LOD_MIN                          0x1156\r\n#define CL_SAMPLER_LOD_MAX                          0x1157\r\n#endif\r\n#ifdef CL_VERSION_3_0\r\n#define CL_SAMPLER_PROPERTIES                       0x1158\r\n#endif\r\n\r\n/* cl_map_flags - bitfield */\r\n#define CL_MAP_READ                                 (1 << 0)\r\n#define CL_MAP_WRITE                                (1 << 1)\r\n#ifdef CL_VERSION_1_2\r\n#define CL_MAP_WRITE_INVALIDATE_REGION              (1 << 2)\r\n#endif\r\n\r\n/* cl_program_info */\r\n#define CL_PROGRAM_REFERENCE_COUNT                  0x1160\r\n#define CL_PROGRAM_CONTEXT                          0x1161\r\n#define CL_PROGRAM_NUM_DEVICES                      0x1162\r\n#define CL_PROGRAM_DEVICES                          0x1163\r\n#define CL_PROGRAM_SOURCE                           0x1164\r\n#define CL_PROGRAM_BINARY_SIZES                     0x1165\r\n#define CL_PROGRAM_BINARIES                         0x1166\r\n#ifdef CL_VERSION_1_2\r\n#define CL_PROGRAM_NUM_KERNELS                      0x1167\r\n#define CL_PROGRAM_KERNEL_NAMES                     0x1168\r\n#endif\r\n#ifdef CL_VERSION_2_1\r\n#define CL_PROGRAM_IL                               0x1169\r\n#endif\r\n#ifdef CL_VERSION_2_2\r\n#define CL_PROGRAM_SCOPE_GLOBAL_CTORS_PRESENT       0x116A\r\n#define CL_PROGRAM_SCOPE_GLOBAL_DTORS_PRESENT       0x116B\r\n#endif\r\n\r\n/* cl_program_build_info */\r\n#define CL_PROGRAM_BUILD_STATUS                     0x1181\r\n#define CL_PROGRAM_BUILD_OPTIONS                    0x1182\r\n#define CL_PROGRAM_BUILD_LOG                        0x1183\r\n#ifdef CL_VERSION_1_2\r\n#define CL_PROGRAM_BINARY_TYPE                      0x1184\r\n#endif\r\n#ifdef CL_VERSION_2_0\r\n#define CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE 0x1185\r\n#endif\r\n\r\n#ifdef CL_VERSION_1_2\r\n\r\n/* cl_program_binary_type */\r\n#define CL_PROGRAM_BINARY_TYPE_NONE                 0x0\r\n#define 
CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT      0x1\r\n#define CL_PROGRAM_BINARY_TYPE_LIBRARY              0x2\r\n#define CL_PROGRAM_BINARY_TYPE_EXECUTABLE           0x4\r\n\r\n#endif\r\n\r\n/* cl_build_status */\r\n#define CL_BUILD_SUCCESS                            0\r\n#define CL_BUILD_NONE                               -1\r\n#define CL_BUILD_ERROR                              -2\r\n#define CL_BUILD_IN_PROGRESS                        -3\r\n\r\n/* cl_kernel_info */\r\n#define CL_KERNEL_FUNCTION_NAME                     0x1190\r\n#define CL_KERNEL_NUM_ARGS                          0x1191\r\n#define CL_KERNEL_REFERENCE_COUNT                   0x1192\r\n#define CL_KERNEL_CONTEXT                           0x1193\r\n#define CL_KERNEL_PROGRAM                           0x1194\r\n#ifdef CL_VERSION_1_2\r\n#define CL_KERNEL_ATTRIBUTES                        0x1195\r\n#endif\r\n\r\n#ifdef CL_VERSION_1_2\r\n\r\n/* cl_kernel_arg_info */\r\n#define CL_KERNEL_ARG_ADDRESS_QUALIFIER             0x1196\r\n#define CL_KERNEL_ARG_ACCESS_QUALIFIER              0x1197\r\n#define CL_KERNEL_ARG_TYPE_NAME                     0x1198\r\n#define CL_KERNEL_ARG_TYPE_QUALIFIER                0x1199\r\n#define CL_KERNEL_ARG_NAME                          0x119A\r\n\r\n#endif\r\n\r\n#ifdef CL_VERSION_1_2\r\n\r\n/* cl_kernel_arg_address_qualifier */\r\n#define CL_KERNEL_ARG_ADDRESS_GLOBAL                0x119B\r\n#define CL_KERNEL_ARG_ADDRESS_LOCAL                 0x119C\r\n#define CL_KERNEL_ARG_ADDRESS_CONSTANT              0x119D\r\n#define CL_KERNEL_ARG_ADDRESS_PRIVATE               0x119E\r\n\r\n#endif\r\n\r\n#ifdef CL_VERSION_1_2\r\n\r\n/* cl_kernel_arg_access_qualifier */\r\n#define CL_KERNEL_ARG_ACCESS_READ_ONLY              0x11A0\r\n#define CL_KERNEL_ARG_ACCESS_WRITE_ONLY             0x11A1\r\n#define CL_KERNEL_ARG_ACCESS_READ_WRITE             0x11A2\r\n#define CL_KERNEL_ARG_ACCESS_NONE                   0x11A3\r\n\r\n#endif\r\n\r\n#ifdef CL_VERSION_1_2\r\n\r\n/* cl_kernel_arg_type_qualifier 
*/\r\n#define CL_KERNEL_ARG_TYPE_NONE                     0\r\n#define CL_KERNEL_ARG_TYPE_CONST                    (1 << 0)\r\n#define CL_KERNEL_ARG_TYPE_RESTRICT                 (1 << 1)\r\n#define CL_KERNEL_ARG_TYPE_VOLATILE                 (1 << 2)\r\n#ifdef CL_VERSION_2_0\r\n#define CL_KERNEL_ARG_TYPE_PIPE                     (1 << 3)\r\n#endif\r\n\r\n#endif\r\n\r\n/* cl_kernel_work_group_info */\r\n#define CL_KERNEL_WORK_GROUP_SIZE                   0x11B0\r\n#define CL_KERNEL_COMPILE_WORK_GROUP_SIZE           0x11B1\r\n#define CL_KERNEL_LOCAL_MEM_SIZE                    0x11B2\r\n#define CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE 0x11B3\r\n#define CL_KERNEL_PRIVATE_MEM_SIZE                  0x11B4\r\n#ifdef CL_VERSION_1_2\r\n#define CL_KERNEL_GLOBAL_WORK_SIZE                  0x11B5\r\n#endif\r\n\r\n#ifdef CL_VERSION_2_1\r\n\r\n/* cl_kernel_sub_group_info */\r\n#define CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE    0x2033\r\n#define CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE       0x2034\r\n#define CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT    0x11B8\r\n#define CL_KERNEL_MAX_NUM_SUB_GROUPS                0x11B9\r\n#define CL_KERNEL_COMPILE_NUM_SUB_GROUPS            0x11BA\r\n\r\n#endif\r\n\r\n#ifdef CL_VERSION_2_0\r\n\r\n/* cl_kernel_exec_info */\r\n#define CL_KERNEL_EXEC_INFO_SVM_PTRS                0x11B6\r\n#define CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM   0x11B7\r\n\r\n#endif\r\n\r\n/* cl_event_info */\r\n#define CL_EVENT_COMMAND_QUEUE                      0x11D0\r\n#define CL_EVENT_COMMAND_TYPE                       0x11D1\r\n#define CL_EVENT_REFERENCE_COUNT                    0x11D2\r\n#define CL_EVENT_COMMAND_EXECUTION_STATUS           0x11D3\r\n#ifdef CL_VERSION_1_1\r\n#define CL_EVENT_CONTEXT                            0x11D4\r\n#endif\r\n\r\n/* cl_command_type */\r\n#define CL_COMMAND_NDRANGE_KERNEL                   0x11F0\r\n#define CL_COMMAND_TASK                             0x11F1\r\n#define CL_COMMAND_NATIVE_KERNEL                    
0x11F2\r\n#define CL_COMMAND_READ_BUFFER                      0x11F3\r\n#define CL_COMMAND_WRITE_BUFFER                     0x11F4\r\n#define CL_COMMAND_COPY_BUFFER                      0x11F5\r\n#define CL_COMMAND_READ_IMAGE                       0x11F6\r\n#define CL_COMMAND_WRITE_IMAGE                      0x11F7\r\n#define CL_COMMAND_COPY_IMAGE                       0x11F8\r\n#define CL_COMMAND_COPY_IMAGE_TO_BUFFER             0x11F9\r\n#define CL_COMMAND_COPY_BUFFER_TO_IMAGE             0x11FA\r\n#define CL_COMMAND_MAP_BUFFER                       0x11FB\r\n#define CL_COMMAND_MAP_IMAGE                        0x11FC\r\n#define CL_COMMAND_UNMAP_MEM_OBJECT                 0x11FD\r\n#define CL_COMMAND_MARKER                           0x11FE\r\n#define CL_COMMAND_ACQUIRE_GL_OBJECTS               0x11FF\r\n#define CL_COMMAND_RELEASE_GL_OBJECTS               0x1200\r\n#ifdef CL_VERSION_1_1\r\n#define CL_COMMAND_READ_BUFFER_RECT                 0x1201\r\n#define CL_COMMAND_WRITE_BUFFER_RECT                0x1202\r\n#define CL_COMMAND_COPY_BUFFER_RECT                 0x1203\r\n#define CL_COMMAND_USER                             0x1204\r\n#endif\r\n#ifdef CL_VERSION_1_2\r\n#define CL_COMMAND_BARRIER                          0x1205\r\n#define CL_COMMAND_MIGRATE_MEM_OBJECTS              0x1206\r\n#define CL_COMMAND_FILL_BUFFER                      0x1207\r\n#define CL_COMMAND_FILL_IMAGE                       0x1208\r\n#endif\r\n#ifdef CL_VERSION_2_0\r\n#define CL_COMMAND_SVM_FREE                         0x1209\r\n#define CL_COMMAND_SVM_MEMCPY                       0x120A\r\n#define CL_COMMAND_SVM_MEMFILL                      0x120B\r\n#define CL_COMMAND_SVM_MAP                          0x120C\r\n#define CL_COMMAND_SVM_UNMAP                        0x120D\r\n#endif\r\n#ifdef CL_VERSION_3_0\r\n#define CL_COMMAND_SVM_MIGRATE_MEM                  0x120E\r\n#endif\r\n\r\n/* command execution status */\r\n#define CL_COMPLETE                                 0x0\r\n#define 
CL_RUNNING                                  0x1\r\n#define CL_SUBMITTED                                0x2\r\n#define CL_QUEUED                                   0x3\r\n\r\n/* cl_buffer_create_type */\r\n#ifdef CL_VERSION_1_1\r\n#define CL_BUFFER_CREATE_TYPE_REGION                0x1220\r\n#endif\r\n\r\n/* cl_profiling_info */\r\n#define CL_PROFILING_COMMAND_QUEUED                 0x1280\r\n#define CL_PROFILING_COMMAND_SUBMIT                 0x1281\r\n#define CL_PROFILING_COMMAND_START                  0x1282\r\n#define CL_PROFILING_COMMAND_END                    0x1283\r\n#ifdef CL_VERSION_2_0\r\n#define CL_PROFILING_COMMAND_COMPLETE               0x1284\r\n#endif\r\n\r\n/* cl_device_atomic_capabilities - bitfield */\r\n#ifdef CL_VERSION_3_0\r\n#define CL_DEVICE_ATOMIC_ORDER_RELAXED          (1 << 0)\r\n#define CL_DEVICE_ATOMIC_ORDER_ACQ_REL          (1 << 1)\r\n#define CL_DEVICE_ATOMIC_ORDER_SEQ_CST          (1 << 2)\r\n#define CL_DEVICE_ATOMIC_SCOPE_WORK_ITEM        (1 << 3)\r\n#define CL_DEVICE_ATOMIC_SCOPE_WORK_GROUP       (1 << 4)\r\n#define CL_DEVICE_ATOMIC_SCOPE_DEVICE           (1 << 5)\r\n#define CL_DEVICE_ATOMIC_SCOPE_ALL_DEVICES      (1 << 6)\r\n#endif\r\n\r\n/* cl_device_device_enqueue_capabilities - bitfield */\r\n#ifdef CL_VERSION_3_0\r\n#define CL_DEVICE_QUEUE_SUPPORTED               (1 << 0)\r\n#define CL_DEVICE_QUEUE_REPLACEABLE_DEFAULT     (1 << 1)\r\n#endif\r\n\r\n/* cl_khronos_vendor_id */\r\n#define CL_KHRONOS_VENDOR_ID_CODEPLAY               0x10004\r\n\r\n/* cl_version */\r\n#define CL_VERSION_MAJOR_BITS (10)\r\n#define CL_VERSION_MINOR_BITS (10)\r\n#define CL_VERSION_PATCH_BITS (12)\r\n\r\n#define CL_VERSION_MAJOR_MASK ((1 << CL_VERSION_MAJOR_BITS) - 1)\r\n#define CL_VERSION_MINOR_MASK ((1 << CL_VERSION_MINOR_BITS) - 1)\r\n#define CL_VERSION_PATCH_MASK ((1 << CL_VERSION_PATCH_BITS) - 1)\r\n\r\n#define CL_VERSION_MAJOR(version) \\\r\n  ((version) >> (CL_VERSION_MINOR_BITS + CL_VERSION_PATCH_BITS))\r\n\r\n#define CL_VERSION_MINOR(version) 
\\\r\n  (((version) >> CL_VERSION_PATCH_BITS) & CL_VERSION_MINOR_MASK)\r\n\r\n#define CL_VERSION_PATCH(version) ((version) & CL_VERSION_PATCH_MASK)\r\n\r\n#define CL_MAKE_VERSION(major, minor, patch)                      \\\r\n  ((((major) & CL_VERSION_MAJOR_MASK)                             \\\r\n       << (CL_VERSION_MINOR_BITS + CL_VERSION_PATCH_BITS)) |      \\\r\n   (((minor) & CL_VERSION_MINOR_MASK) << CL_VERSION_PATCH_BITS) | \\\r\n   ((patch) & CL_VERSION_PATCH_MASK))\r\n\r\n/********************************************************************************************************/\r\n\r\n/* CL_NO_PROTOTYPES implies CL_NO_CORE_PROTOTYPES: */\r\n#if defined(CL_NO_PROTOTYPES) && !defined(CL_NO_CORE_PROTOTYPES)\r\n#define CL_NO_CORE_PROTOTYPES\r\n#endif\r\n\r\n#if !defined(CL_NO_CORE_PROTOTYPES)\r\n\r\n/* Platform API */\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclGetPlatformIDs(cl_uint          num_entries,\r\n                 cl_platform_id * platforms,\r\n                 cl_uint *        num_platforms) CL_API_SUFFIX__VERSION_1_0;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclGetPlatformInfo(cl_platform_id   platform,\r\n                  cl_platform_info param_name,\r\n                  size_t           param_value_size,\r\n                  void *           param_value,\r\n                  size_t *         param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;\r\n\r\n/* Device APIs */\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclGetDeviceIDs(cl_platform_id   platform,\r\n               cl_device_type   device_type,\r\n               cl_uint          num_entries,\r\n               cl_device_id *   devices,\r\n               cl_uint *        num_devices) CL_API_SUFFIX__VERSION_1_0;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclGetDeviceInfo(cl_device_id    device,\r\n                cl_device_info  param_name,\r\n                size_t          param_value_size,\r\n                void *          param_value,\r\n                size_t *        
param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;\r\n\r\n#ifdef CL_VERSION_1_2\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclCreateSubDevices(cl_device_id                         in_device,\r\n                   const cl_device_partition_property * properties,\r\n                   cl_uint                              num_devices,\r\n                   cl_device_id *                       out_devices,\r\n                   cl_uint *                            num_devices_ret) CL_API_SUFFIX__VERSION_1_2;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclRetainDevice(cl_device_id device) CL_API_SUFFIX__VERSION_1_2;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclReleaseDevice(cl_device_id device) CL_API_SUFFIX__VERSION_1_2;\r\n\r\n#endif\r\n\r\n#ifdef CL_VERSION_2_1\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclSetDefaultDeviceCommandQueue(cl_context           context,\r\n                               cl_device_id         device,\r\n                               cl_command_queue     command_queue) CL_API_SUFFIX__VERSION_2_1;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclGetDeviceAndHostTimer(cl_device_id    device,\r\n                        cl_ulong*       device_timestamp,\r\n                        cl_ulong*       host_timestamp) CL_API_SUFFIX__VERSION_2_1;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclGetHostTimer(cl_device_id device,\r\n               cl_ulong *   host_timestamp) CL_API_SUFFIX__VERSION_2_1;\r\n\r\n#endif\r\n\r\n/* Context APIs */\r\nextern CL_API_ENTRY cl_context CL_API_CALL\r\nclCreateContext(const cl_context_properties * properties,\r\n                cl_uint              num_devices,\r\n                const cl_device_id * devices,\r\n                void (CL_CALLBACK * pfn_notify)(const char * errinfo,\r\n                                                const void * private_info,\r\n                                                size_t       cb,\r\n                                                void *       
user_data),\r\n                void *               user_data,\r\n                cl_int *             errcode_ret) CL_API_SUFFIX__VERSION_1_0;\r\n\r\nextern CL_API_ENTRY cl_context CL_API_CALL\r\nclCreateContextFromType(const cl_context_properties * properties,\r\n                        cl_device_type      device_type,\r\n                        void (CL_CALLBACK * pfn_notify)(const char * errinfo,\r\n                                                        const void * private_info,\r\n                                                        size_t       cb,\r\n                                                        void *       user_data),\r\n                        void *              user_data,\r\n                        cl_int *            errcode_ret) CL_API_SUFFIX__VERSION_1_0;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclRetainContext(cl_context context) CL_API_SUFFIX__VERSION_1_0;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclReleaseContext(cl_context context) CL_API_SUFFIX__VERSION_1_0;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclGetContextInfo(cl_context         context,\r\n                 cl_context_info    param_name,\r\n                 size_t             param_value_size,\r\n                 void *             param_value,\r\n                 size_t *           param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;\r\n\r\n#ifdef CL_VERSION_3_0\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclSetContextDestructorCallback(cl_context         context,\r\n                               void (CL_CALLBACK* pfn_notify)(cl_context context,\r\n                                                              void* user_data),\r\n                               void*              user_data) CL_API_SUFFIX__VERSION_3_0;\r\n\r\n#endif\r\n\r\n/* Command Queue APIs */\r\n\r\n#ifdef CL_VERSION_2_0\r\n\r\nextern CL_API_ENTRY cl_command_queue CL_API_CALL\r\nclCreateCommandQueueWithProperties(cl_context               context,\r\n                               
    cl_device_id             device,\r\n                                   const cl_queue_properties *    properties,\r\n                                   cl_int *                 errcode_ret) CL_API_SUFFIX__VERSION_2_0;\r\n\r\n#endif\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclRetainCommandQueue(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclReleaseCommandQueue(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclGetCommandQueueInfo(cl_command_queue      command_queue,\r\n                      cl_command_queue_info param_name,\r\n                      size_t                param_value_size,\r\n                      void *                param_value,\r\n                      size_t *              param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;\r\n\r\n/* Memory Object APIs */\r\nextern CL_API_ENTRY cl_mem CL_API_CALL\r\nclCreateBuffer(cl_context   context,\r\n               cl_mem_flags flags,\r\n               size_t       size,\r\n               void *       host_ptr,\r\n               cl_int *     errcode_ret) CL_API_SUFFIX__VERSION_1_0;\r\n\r\n#ifdef CL_VERSION_1_1\r\n\r\nextern CL_API_ENTRY cl_mem CL_API_CALL\r\nclCreateSubBuffer(cl_mem                   buffer,\r\n                  cl_mem_flags             flags,\r\n                  cl_buffer_create_type    buffer_create_type,\r\n                  const void *             buffer_create_info,\r\n                  cl_int *                 errcode_ret) CL_API_SUFFIX__VERSION_1_1;\r\n\r\n#endif\r\n\r\n#ifdef CL_VERSION_1_2\r\n\r\nextern CL_API_ENTRY cl_mem CL_API_CALL\r\nclCreateImage(cl_context              context,\r\n              cl_mem_flags            flags,\r\n              const cl_image_format * image_format,\r\n              const cl_image_desc *   image_desc,\r\n              void *                  host_ptr,\r\n              cl_int *                errcode_ret) 
CL_API_SUFFIX__VERSION_1_2;\r\n\r\n#endif\r\n\r\n#ifdef CL_VERSION_2_0\r\n\r\nextern CL_API_ENTRY cl_mem CL_API_CALL\r\nclCreatePipe(cl_context                 context,\r\n             cl_mem_flags               flags,\r\n             cl_uint                    pipe_packet_size,\r\n             cl_uint                    pipe_max_packets,\r\n             const cl_pipe_properties * properties,\r\n             cl_int *                   errcode_ret) CL_API_SUFFIX__VERSION_2_0;\r\n\r\n#endif\r\n\r\n#ifdef CL_VERSION_3_0\r\n\r\nextern CL_API_ENTRY cl_mem CL_API_CALL\r\nclCreateBufferWithProperties(cl_context                context,\r\n                             const cl_mem_properties * properties,\r\n                             cl_mem_flags              flags,\r\n                             size_t                    size,\r\n                             void *                    host_ptr,\r\n                             cl_int *                  errcode_ret) CL_API_SUFFIX__VERSION_3_0;\r\n\r\nextern CL_API_ENTRY cl_mem CL_API_CALL\r\nclCreateImageWithProperties(cl_context                context,\r\n                            const cl_mem_properties * properties,\r\n                            cl_mem_flags              flags,\r\n                            const cl_image_format *   image_format,\r\n                            const cl_image_desc *     image_desc,\r\n                            void *                    host_ptr,\r\n                            cl_int *                  errcode_ret) CL_API_SUFFIX__VERSION_3_0;\r\n\r\n#endif\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclRetainMemObject(cl_mem memobj) CL_API_SUFFIX__VERSION_1_0;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclReleaseMemObject(cl_mem memobj) CL_API_SUFFIX__VERSION_1_0;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclGetSupportedImageFormats(cl_context           context,\r\n                           cl_mem_flags         flags,\r\n                           cl_mem_object_type 
  image_type,\r\n                           cl_uint              num_entries,\r\n                           cl_image_format *    image_formats,\r\n                           cl_uint *            num_image_formats) CL_API_SUFFIX__VERSION_1_0;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclGetMemObjectInfo(cl_mem           memobj,\r\n                   cl_mem_info      param_name,\r\n                   size_t           param_value_size,\r\n                   void *           param_value,\r\n                   size_t *         param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclGetImageInfo(cl_mem           image,\r\n               cl_image_info    param_name,\r\n               size_t           param_value_size,\r\n               void *           param_value,\r\n               size_t *         param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;\r\n\r\n#ifdef CL_VERSION_2_0\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclGetPipeInfo(cl_mem           pipe,\r\n              cl_pipe_info     param_name,\r\n              size_t           param_value_size,\r\n              void *           param_value,\r\n              size_t *         param_value_size_ret) CL_API_SUFFIX__VERSION_2_0;\r\n\r\n#endif\r\n\r\n#ifdef CL_VERSION_1_1\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclSetMemObjectDestructorCallback(cl_mem memobj,\r\n                                 void (CL_CALLBACK * pfn_notify)(cl_mem memobj,\r\n                                                                 void * user_data),\r\n                                 void * user_data) CL_API_SUFFIX__VERSION_1_1;\r\n\r\n#endif\r\n\r\n/* SVM Allocation APIs */\r\n\r\n#ifdef CL_VERSION_2_0\r\n\r\nextern CL_API_ENTRY void * CL_API_CALL\r\nclSVMAlloc(cl_context       context,\r\n           cl_svm_mem_flags flags,\r\n           size_t           size,\r\n           cl_uint          alignment) CL_API_SUFFIX__VERSION_2_0;\r\n\r\nextern CL_API_ENTRY void 
CL_API_CALL\r\nclSVMFree(cl_context        context,\r\n          void *            svm_pointer) CL_API_SUFFIX__VERSION_2_0;\r\n\r\n#endif\r\n\r\n/* Sampler APIs */\r\n\r\n#ifdef CL_VERSION_2_0\r\n\r\nextern CL_API_ENTRY cl_sampler CL_API_CALL\r\nclCreateSamplerWithProperties(cl_context                     context,\r\n                              const cl_sampler_properties *  sampler_properties,\r\n                              cl_int *                       errcode_ret) CL_API_SUFFIX__VERSION_2_0;\r\n\r\n#endif\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclRetainSampler(cl_sampler sampler) CL_API_SUFFIX__VERSION_1_0;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclReleaseSampler(cl_sampler sampler) CL_API_SUFFIX__VERSION_1_0;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclGetSamplerInfo(cl_sampler         sampler,\r\n                 cl_sampler_info    param_name,\r\n                 size_t             param_value_size,\r\n                 void *             param_value,\r\n                 size_t *           param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;\r\n\r\n/* Program Object APIs */\r\nextern CL_API_ENTRY cl_program CL_API_CALL\r\nclCreateProgramWithSource(cl_context        context,\r\n                          cl_uint           count,\r\n                          const char **     strings,\r\n                          const size_t *    lengths,\r\n                          cl_int *          errcode_ret) CL_API_SUFFIX__VERSION_1_0;\r\n\r\nextern CL_API_ENTRY cl_program CL_API_CALL\r\nclCreateProgramWithBinary(cl_context                     context,\r\n                          cl_uint                        num_devices,\r\n                          const cl_device_id *           device_list,\r\n                          const size_t *                 lengths,\r\n                          const unsigned char **         binaries,\r\n                          cl_int *                       binary_status,\r\n                          cl_int *      
                 errcode_ret) CL_API_SUFFIX__VERSION_1_0;\r\n\r\n#ifdef CL_VERSION_1_2\r\n\r\nextern CL_API_ENTRY cl_program CL_API_CALL\r\nclCreateProgramWithBuiltInKernels(cl_context            context,\r\n                                  cl_uint               num_devices,\r\n                                  const cl_device_id *  device_list,\r\n                                  const char *          kernel_names,\r\n                                  cl_int *              errcode_ret) CL_API_SUFFIX__VERSION_1_2;\r\n\r\n#endif\r\n\r\n#ifdef CL_VERSION_2_1\r\n\r\nextern CL_API_ENTRY cl_program CL_API_CALL\r\nclCreateProgramWithIL(cl_context    context,\r\n                     const void*    il,\r\n                     size_t         length,\r\n                     cl_int*        errcode_ret) CL_API_SUFFIX__VERSION_2_1;\r\n\r\n#endif\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclRetainProgram(cl_program program) CL_API_SUFFIX__VERSION_1_0;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclReleaseProgram(cl_program program) CL_API_SUFFIX__VERSION_1_0;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclBuildProgram(cl_program           program,\r\n               cl_uint              num_devices,\r\n               const cl_device_id * device_list,\r\n               const char *         options,\r\n               void (CL_CALLBACK *  pfn_notify)(cl_program program,\r\n                                                void * user_data),\r\n               void *               user_data) CL_API_SUFFIX__VERSION_1_0;\r\n\r\n#ifdef CL_VERSION_1_2\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclCompileProgram(cl_program           program,\r\n                 cl_uint              num_devices,\r\n                 const cl_device_id * device_list,\r\n                 const char *         options,\r\n                 cl_uint              num_input_headers,\r\n                 const cl_program *   input_headers,\r\n                 const char **        
header_include_names,\r\n                 void (CL_CALLBACK *  pfn_notify)(cl_program program,\r\n                                                  void * user_data),\r\n                 void *               user_data) CL_API_SUFFIX__VERSION_1_2;\r\n\r\nextern CL_API_ENTRY cl_program CL_API_CALL\r\nclLinkProgram(cl_context           context,\r\n              cl_uint              num_devices,\r\n              const cl_device_id * device_list,\r\n              const char *         options,\r\n              cl_uint              num_input_programs,\r\n              const cl_program *   input_programs,\r\n              void (CL_CALLBACK *  pfn_notify)(cl_program program,\r\n                                               void * user_data),\r\n              void *               user_data,\r\n              cl_int *             errcode_ret) CL_API_SUFFIX__VERSION_1_2;\r\n\r\n#endif\r\n\r\n#ifdef CL_VERSION_2_2\r\n\r\nextern CL_API_ENTRY CL_API_PREFIX__VERSION_2_2_DEPRECATED cl_int CL_API_CALL\r\nclSetProgramReleaseCallback(cl_program          program,\r\n                            void (CL_CALLBACK * pfn_notify)(cl_program program,\r\n                                                            void * user_data),\r\n                            void *              user_data) CL_API_SUFFIX__VERSION_2_2_DEPRECATED;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclSetProgramSpecializationConstant(cl_program  program,\r\n                                   cl_uint     spec_id,\r\n                                   size_t      spec_size,\r\n                                   const void* spec_value) CL_API_SUFFIX__VERSION_2_2;\r\n\r\n#endif\r\n\r\n#ifdef CL_VERSION_1_2\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclUnloadPlatformCompiler(cl_platform_id platform) CL_API_SUFFIX__VERSION_1_2;\r\n\r\n#endif\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclGetProgramInfo(cl_program         program,\r\n                 cl_program_info    param_name,\r\n                 size_t     
        param_value_size,\r\n                 void *             param_value,\r\n                 size_t *           param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclGetProgramBuildInfo(cl_program            program,\r\n                      cl_device_id          device,\r\n                      cl_program_build_info param_name,\r\n                      size_t                param_value_size,\r\n                      void *                param_value,\r\n                      size_t *              param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;\r\n\r\n/* Kernel Object APIs */\r\nextern CL_API_ENTRY cl_kernel CL_API_CALL\r\nclCreateKernel(cl_program      program,\r\n               const char *    kernel_name,\r\n               cl_int *        errcode_ret) CL_API_SUFFIX__VERSION_1_0;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclCreateKernelsInProgram(cl_program     program,\r\n                         cl_uint        num_kernels,\r\n                         cl_kernel *    kernels,\r\n                         cl_uint *      num_kernels_ret) CL_API_SUFFIX__VERSION_1_0;\r\n\r\n#ifdef CL_VERSION_2_1\r\n\r\nextern CL_API_ENTRY cl_kernel CL_API_CALL\r\nclCloneKernel(cl_kernel     source_kernel,\r\n              cl_int*       errcode_ret) CL_API_SUFFIX__VERSION_2_1;\r\n\r\n#endif\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclRetainKernel(cl_kernel    kernel) CL_API_SUFFIX__VERSION_1_0;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclReleaseKernel(cl_kernel   kernel) CL_API_SUFFIX__VERSION_1_0;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclSetKernelArg(cl_kernel    kernel,\r\n               cl_uint      arg_index,\r\n               size_t       arg_size,\r\n               const void * arg_value) CL_API_SUFFIX__VERSION_1_0;\r\n\r\n#ifdef CL_VERSION_2_0\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclSetKernelArgSVMPointer(cl_kernel    kernel,\r\n                         cl_uint      arg_index,\r\n      
                   const void * arg_value) CL_API_SUFFIX__VERSION_2_0;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclSetKernelExecInfo(cl_kernel            kernel,\r\n                    cl_kernel_exec_info  param_name,\r\n                    size_t               param_value_size,\r\n                    const void *         param_value) CL_API_SUFFIX__VERSION_2_0;\r\n\r\n#endif\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclGetKernelInfo(cl_kernel       kernel,\r\n                cl_kernel_info  param_name,\r\n                size_t          param_value_size,\r\n                void *          param_value,\r\n                size_t *        param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;\r\n\r\n#ifdef CL_VERSION_1_2\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclGetKernelArgInfo(cl_kernel       kernel,\r\n                   cl_uint         arg_indx,\r\n                   cl_kernel_arg_info  param_name,\r\n                   size_t          param_value_size,\r\n                   void *          param_value,\r\n                   size_t *        param_value_size_ret) CL_API_SUFFIX__VERSION_1_2;\r\n\r\n#endif\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclGetKernelWorkGroupInfo(cl_kernel                  kernel,\r\n                         cl_device_id               device,\r\n                         cl_kernel_work_group_info  param_name,\r\n                         size_t                     param_value_size,\r\n                         void *                     param_value,\r\n                         size_t *                   param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;\r\n\r\n#ifdef CL_VERSION_2_1\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclGetKernelSubGroupInfo(cl_kernel                   kernel,\r\n                        cl_device_id                device,\r\n                        cl_kernel_sub_group_info    param_name,\r\n                        size_t                      input_value_size,\r\n                        const 
void*                 input_value,\r\n                        size_t                      param_value_size,\r\n                        void*                       param_value,\r\n                        size_t*                     param_value_size_ret) CL_API_SUFFIX__VERSION_2_1;\r\n\r\n#endif\r\n\r\n/* Event Object APIs */\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclWaitForEvents(cl_uint             num_events,\r\n                const cl_event *    event_list) CL_API_SUFFIX__VERSION_1_0;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclGetEventInfo(cl_event         event,\r\n               cl_event_info    param_name,\r\n               size_t           param_value_size,\r\n               void *           param_value,\r\n               size_t *         param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;\r\n\r\n#ifdef CL_VERSION_1_1\r\n\r\nextern CL_API_ENTRY cl_event CL_API_CALL\r\nclCreateUserEvent(cl_context    context,\r\n                  cl_int *      errcode_ret) CL_API_SUFFIX__VERSION_1_1;\r\n\r\n#endif\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclRetainEvent(cl_event event) CL_API_SUFFIX__VERSION_1_0;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclReleaseEvent(cl_event event) CL_API_SUFFIX__VERSION_1_0;\r\n\r\n#ifdef CL_VERSION_1_1\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclSetUserEventStatus(cl_event   event,\r\n                     cl_int     execution_status) CL_API_SUFFIX__VERSION_1_1;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclSetEventCallback(cl_event    event,\r\n                   cl_int      command_exec_callback_type,\r\n                   void (CL_CALLBACK * pfn_notify)(cl_event event,\r\n                                                   cl_int   event_command_status,\r\n                                                   void *   user_data),\r\n                   void *      user_data) CL_API_SUFFIX__VERSION_1_1;\r\n\r\n#endif\r\n\r\n/* Profiling APIs */\r\nextern CL_API_ENTRY cl_int 
CL_API_CALL\r\nclGetEventProfilingInfo(cl_event            event,\r\n                        cl_profiling_info   param_name,\r\n                        size_t              param_value_size,\r\n                        void *              param_value,\r\n                        size_t *            param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;\r\n\r\n/* Flush and Finish APIs */\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclFlush(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclFinish(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0;\r\n\r\n/* Enqueued Commands APIs */\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclEnqueueReadBuffer(cl_command_queue    command_queue,\r\n                    cl_mem              buffer,\r\n                    cl_bool             blocking_read,\r\n                    size_t              offset,\r\n                    size_t              size,\r\n                    void *              ptr,\r\n                    cl_uint             num_events_in_wait_list,\r\n                    const cl_event *    event_wait_list,\r\n                    cl_event *          event) CL_API_SUFFIX__VERSION_1_0;\r\n\r\n#ifdef CL_VERSION_1_1\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclEnqueueReadBufferRect(cl_command_queue    command_queue,\r\n                        cl_mem              buffer,\r\n                        cl_bool             blocking_read,\r\n                        const size_t *      buffer_origin,\r\n                        const size_t *      host_origin,\r\n                        const size_t *      region,\r\n                        size_t              buffer_row_pitch,\r\n                        size_t              buffer_slice_pitch,\r\n                        size_t              host_row_pitch,\r\n                        size_t              host_slice_pitch,\r\n                        void *              ptr,\r\n                        cl_uint           
  num_events_in_wait_list,\r\n                        const cl_event *    event_wait_list,\r\n                        cl_event *          event) CL_API_SUFFIX__VERSION_1_1;\r\n\r\n#endif\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclEnqueueWriteBuffer(cl_command_queue   command_queue,\r\n                     cl_mem             buffer,\r\n                     cl_bool            blocking_write,\r\n                     size_t             offset,\r\n                     size_t             size,\r\n                     const void *       ptr,\r\n                     cl_uint            num_events_in_wait_list,\r\n                     const cl_event *   event_wait_list,\r\n                     cl_event *         event) CL_API_SUFFIX__VERSION_1_0;\r\n\r\n#ifdef CL_VERSION_1_1\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclEnqueueWriteBufferRect(cl_command_queue    command_queue,\r\n                         cl_mem              buffer,\r\n                         cl_bool             blocking_write,\r\n                         const size_t *      buffer_origin,\r\n                         const size_t *      host_origin,\r\n                         const size_t *      region,\r\n                         size_t              buffer_row_pitch,\r\n                         size_t              buffer_slice_pitch,\r\n                         size_t              host_row_pitch,\r\n                         size_t              host_slice_pitch,\r\n                         const void *        ptr,\r\n                         cl_uint             num_events_in_wait_list,\r\n                         const cl_event *    event_wait_list,\r\n                         cl_event *          event) CL_API_SUFFIX__VERSION_1_1;\r\n\r\n#endif\r\n\r\n#ifdef CL_VERSION_1_2\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclEnqueueFillBuffer(cl_command_queue   command_queue,\r\n                    cl_mem             buffer,\r\n                    const void *       pattern,\r\n                   
 size_t             pattern_size,\r\n                    size_t             offset,\r\n                    size_t             size,\r\n                    cl_uint            num_events_in_wait_list,\r\n                    const cl_event *   event_wait_list,\r\n                    cl_event *         event) CL_API_SUFFIX__VERSION_1_2;\r\n\r\n#endif\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclEnqueueCopyBuffer(cl_command_queue    command_queue,\r\n                    cl_mem              src_buffer,\r\n                    cl_mem              dst_buffer,\r\n                    size_t              src_offset,\r\n                    size_t              dst_offset,\r\n                    size_t              size,\r\n                    cl_uint             num_events_in_wait_list,\r\n                    const cl_event *    event_wait_list,\r\n                    cl_event *          event) CL_API_SUFFIX__VERSION_1_0;\r\n\r\n#ifdef CL_VERSION_1_1\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclEnqueueCopyBufferRect(cl_command_queue    command_queue,\r\n                        cl_mem              src_buffer,\r\n                        cl_mem              dst_buffer,\r\n                        const size_t *      src_origin,\r\n                        const size_t *      dst_origin,\r\n                        const size_t *      region,\r\n                        size_t              src_row_pitch,\r\n                        size_t              src_slice_pitch,\r\n                        size_t              dst_row_pitch,\r\n                        size_t              dst_slice_pitch,\r\n                        cl_uint             num_events_in_wait_list,\r\n                        const cl_event *    event_wait_list,\r\n                        cl_event *          event) CL_API_SUFFIX__VERSION_1_1;\r\n\r\n#endif\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclEnqueueReadImage(cl_command_queue     command_queue,\r\n                   cl_mem               image,\r\n   
                cl_bool              blocking_read,\r\n                   const size_t *       origin,\r\n                   const size_t *       region,\r\n                   size_t               row_pitch,\r\n                   size_t               slice_pitch,\r\n                   void *               ptr,\r\n                   cl_uint              num_events_in_wait_list,\r\n                   const cl_event *     event_wait_list,\r\n                   cl_event *           event) CL_API_SUFFIX__VERSION_1_0;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclEnqueueWriteImage(cl_command_queue    command_queue,\r\n                    cl_mem              image,\r\n                    cl_bool             blocking_write,\r\n                    const size_t *      origin,\r\n                    const size_t *      region,\r\n                    size_t              input_row_pitch,\r\n                    size_t              input_slice_pitch,\r\n                    const void *        ptr,\r\n                    cl_uint             num_events_in_wait_list,\r\n                    const cl_event *    event_wait_list,\r\n                    cl_event *          event) CL_API_SUFFIX__VERSION_1_0;\r\n\r\n#ifdef CL_VERSION_1_2\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclEnqueueFillImage(cl_command_queue   command_queue,\r\n                   cl_mem             image,\r\n                   const void *       fill_color,\r\n                   const size_t *     origin,\r\n                   const size_t *     region,\r\n                   cl_uint            num_events_in_wait_list,\r\n                   const cl_event *   event_wait_list,\r\n                   cl_event *         event) CL_API_SUFFIX__VERSION_1_2;\r\n\r\n#endif\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclEnqueueCopyImage(cl_command_queue     command_queue,\r\n                   cl_mem               src_image,\r\n                   cl_mem               dst_image,\r\n                   const size_t 
*       src_origin,\r\n                   const size_t *       dst_origin,\r\n                   const size_t *       region,\r\n                   cl_uint              num_events_in_wait_list,\r\n                   const cl_event *     event_wait_list,\r\n                   cl_event *           event) CL_API_SUFFIX__VERSION_1_0;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclEnqueueCopyImageToBuffer(cl_command_queue command_queue,\r\n                           cl_mem           src_image,\r\n                           cl_mem           dst_buffer,\r\n                           const size_t *   src_origin,\r\n                           const size_t *   region,\r\n                           size_t           dst_offset,\r\n                           cl_uint          num_events_in_wait_list,\r\n                           const cl_event * event_wait_list,\r\n                           cl_event *       event) CL_API_SUFFIX__VERSION_1_0;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclEnqueueCopyBufferToImage(cl_command_queue command_queue,\r\n                           cl_mem           src_buffer,\r\n                           cl_mem           dst_image,\r\n                           size_t           src_offset,\r\n                           const size_t *   dst_origin,\r\n                           const size_t *   region,\r\n                           cl_uint          num_events_in_wait_list,\r\n                           const cl_event * event_wait_list,\r\n                           cl_event *       event) CL_API_SUFFIX__VERSION_1_0;\r\n\r\nextern CL_API_ENTRY void * CL_API_CALL\r\nclEnqueueMapBuffer(cl_command_queue command_queue,\r\n                   cl_mem           buffer,\r\n                   cl_bool          blocking_map,\r\n                   cl_map_flags     map_flags,\r\n                   size_t           offset,\r\n                   size_t           size,\r\n                   cl_uint          num_events_in_wait_list,\r\n                   const 
cl_event * event_wait_list,\r\n                   cl_event *       event,\r\n                   cl_int *         errcode_ret) CL_API_SUFFIX__VERSION_1_0;\r\n\r\nextern CL_API_ENTRY void * CL_API_CALL\r\nclEnqueueMapImage(cl_command_queue  command_queue,\r\n                  cl_mem            image,\r\n                  cl_bool           blocking_map,\r\n                  cl_map_flags      map_flags,\r\n                  const size_t *    origin,\r\n                  const size_t *    region,\r\n                  size_t *          image_row_pitch,\r\n                  size_t *          image_slice_pitch,\r\n                  cl_uint           num_events_in_wait_list,\r\n                  const cl_event *  event_wait_list,\r\n                  cl_event *        event,\r\n                  cl_int *          errcode_ret) CL_API_SUFFIX__VERSION_1_0;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclEnqueueUnmapMemObject(cl_command_queue command_queue,\r\n                        cl_mem           memobj,\r\n                        void *           mapped_ptr,\r\n                        cl_uint          num_events_in_wait_list,\r\n                        const cl_event * event_wait_list,\r\n                        cl_event *       event) CL_API_SUFFIX__VERSION_1_0;\r\n\r\n#ifdef CL_VERSION_1_2\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclEnqueueMigrateMemObjects(cl_command_queue       command_queue,\r\n                           cl_uint                num_mem_objects,\r\n                           const cl_mem *         mem_objects,\r\n                           cl_mem_migration_flags flags,\r\n                           cl_uint                num_events_in_wait_list,\r\n                           const cl_event *       event_wait_list,\r\n                           cl_event *             event) CL_API_SUFFIX__VERSION_1_2;\r\n\r\n#endif\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclEnqueueNDRangeKernel(cl_command_queue command_queue,\r\n                       
cl_kernel        kernel,\r\n                       cl_uint          work_dim,\r\n                       const size_t *   global_work_offset,\r\n                       const size_t *   global_work_size,\r\n                       const size_t *   local_work_size,\r\n                       cl_uint          num_events_in_wait_list,\r\n                       const cl_event * event_wait_list,\r\n                       cl_event *       event) CL_API_SUFFIX__VERSION_1_0;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclEnqueueNativeKernel(cl_command_queue  command_queue,\r\n                      void (CL_CALLBACK * user_func)(void *),\r\n                      void *            args,\r\n                      size_t            cb_args,\r\n                      cl_uint           num_mem_objects,\r\n                      const cl_mem *    mem_list,\r\n                      const void **     args_mem_loc,\r\n                      cl_uint           num_events_in_wait_list,\r\n                      const cl_event *  event_wait_list,\r\n                      cl_event *        event) CL_API_SUFFIX__VERSION_1_0;\r\n\r\n#ifdef CL_VERSION_1_2\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclEnqueueMarkerWithWaitList(cl_command_queue  command_queue,\r\n                            cl_uint           num_events_in_wait_list,\r\n                            const cl_event *  event_wait_list,\r\n                            cl_event *        event) CL_API_SUFFIX__VERSION_1_2;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclEnqueueBarrierWithWaitList(cl_command_queue  command_queue,\r\n                             cl_uint           num_events_in_wait_list,\r\n                             const cl_event *  event_wait_list,\r\n                             cl_event *        event) CL_API_SUFFIX__VERSION_1_2;\r\n\r\n#endif\r\n\r\n#ifdef CL_VERSION_2_0\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclEnqueueSVMFree(cl_command_queue  command_queue,\r\n                 cl_uint           
num_svm_pointers,\r\n                 void *            svm_pointers[],\r\n                 void (CL_CALLBACK * pfn_free_func)(cl_command_queue queue,\r\n                                                    cl_uint          num_svm_pointers,\r\n                                                    void *           svm_pointers[],\r\n                                                    void *           user_data),\r\n                 void *            user_data,\r\n                 cl_uint           num_events_in_wait_list,\r\n                 const cl_event *  event_wait_list,\r\n                 cl_event *        event) CL_API_SUFFIX__VERSION_2_0;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclEnqueueSVMMemcpy(cl_command_queue  command_queue,\r\n                   cl_bool           blocking_copy,\r\n                   void *            dst_ptr,\r\n                   const void *      src_ptr,\r\n                   size_t            size,\r\n                   cl_uint           num_events_in_wait_list,\r\n                   const cl_event *  event_wait_list,\r\n                   cl_event *        event) CL_API_SUFFIX__VERSION_2_0;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclEnqueueSVMMemFill(cl_command_queue  command_queue,\r\n                    void *            svm_ptr,\r\n                    const void *      pattern,\r\n                    size_t            pattern_size,\r\n                    size_t            size,\r\n                    cl_uint           num_events_in_wait_list,\r\n                    const cl_event *  event_wait_list,\r\n                    cl_event *        event) CL_API_SUFFIX__VERSION_2_0;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclEnqueueSVMMap(cl_command_queue  command_queue,\r\n                cl_bool           blocking_map,\r\n                cl_map_flags      flags,\r\n                void *            svm_ptr,\r\n                size_t            size,\r\n                cl_uint           
num_events_in_wait_list,\r\n                const cl_event *  event_wait_list,\r\n                cl_event *        event) CL_API_SUFFIX__VERSION_2_0;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclEnqueueSVMUnmap(cl_command_queue  command_queue,\r\n                  void *            svm_ptr,\r\n                  cl_uint           num_events_in_wait_list,\r\n                  const cl_event *  event_wait_list,\r\n                  cl_event *        event) CL_API_SUFFIX__VERSION_2_0;\r\n\r\n#endif\r\n\r\n#ifdef CL_VERSION_2_1\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclEnqueueSVMMigrateMem(cl_command_queue         command_queue,\r\n                       cl_uint                  num_svm_pointers,\r\n                       const void **            svm_pointers,\r\n                       const size_t *           sizes,\r\n                       cl_mem_migration_flags   flags,\r\n                       cl_uint                  num_events_in_wait_list,\r\n                       const cl_event *         event_wait_list,\r\n                       cl_event *               event) CL_API_SUFFIX__VERSION_2_1;\r\n\r\n#endif\r\n\r\n#ifdef CL_VERSION_1_2\r\n\r\n/* Extension function access\r\n *\r\n * Returns the extension function address for the given function name,\r\n * or NULL if a valid function can not be found.  The client must\r\n * check to make sure the address is not NULL, before using or\r\n * calling the returned function address.\r\n */\r\nextern CL_API_ENTRY void * CL_API_CALL\r\nclGetExtensionFunctionAddressForPlatform(cl_platform_id platform,\r\n                                         const char *   func_name) CL_API_SUFFIX__VERSION_1_2;\r\n\r\n#endif\r\n\r\n#ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS\r\n    /*\r\n     *  WARNING:\r\n     *     This API introduces mutable state into the OpenCL implementation. It has been REMOVED\r\n     *  to better facilitate thread safety.  The 1.0 API is not thread safe. 
It is not tested by the\r\n     *  OpenCL 1.1 conformance test, and consequently may not work or may not work dependably.\r\n     *  It is likely to be non-performant. Use of this API is not advised. Use at your own risk.\r\n     *\r\n     *  Software developers previously relying on this API are instructed to set the command queue\r\n     *  properties when creating the queue, instead.\r\n     */\r\n    extern CL_API_ENTRY cl_int CL_API_CALL\r\n    clSetCommandQueueProperty(cl_command_queue              command_queue,\r\n                              cl_command_queue_properties   properties,\r\n                              cl_bool                       enable,\r\n                              cl_command_queue_properties * old_properties) CL_API_SUFFIX__VERSION_1_0_DEPRECATED;\r\n#endif /* CL_USE_DEPRECATED_OPENCL_1_0_APIS */\r\n\r\n/* Deprecated OpenCL 1.1 APIs */\r\nextern CL_API_ENTRY CL_API_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL\r\nclCreateImage2D(cl_context              context,\r\n                cl_mem_flags            flags,\r\n                const cl_image_format * image_format,\r\n                size_t                  image_width,\r\n                size_t                  image_height,\r\n                size_t                  image_row_pitch,\r\n                void *                  host_ptr,\r\n                cl_int *                errcode_ret) CL_API_SUFFIX__VERSION_1_1_DEPRECATED;\r\n\r\nextern CL_API_ENTRY CL_API_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL\r\nclCreateImage3D(cl_context              context,\r\n                cl_mem_flags            flags,\r\n                const cl_image_format * image_format,\r\n                size_t                  image_width,\r\n                size_t                  image_height,\r\n                size_t                  image_depth,\r\n                size_t                  image_row_pitch,\r\n                size_t                  image_slice_pitch,\r\n                void *  
                host_ptr,\r\n                cl_int *                errcode_ret) CL_API_SUFFIX__VERSION_1_1_DEPRECATED;\r\n\r\nextern CL_API_ENTRY CL_API_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL\r\nclEnqueueMarker(cl_command_queue    command_queue,\r\n                cl_event *          event) CL_API_SUFFIX__VERSION_1_1_DEPRECATED;\r\n\r\nextern CL_API_ENTRY CL_API_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL\r\nclEnqueueWaitForEvents(cl_command_queue  command_queue,\r\n                        cl_uint          num_events,\r\n                        const cl_event * event_list) CL_API_SUFFIX__VERSION_1_1_DEPRECATED;\r\n\r\nextern CL_API_ENTRY CL_API_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL\r\nclEnqueueBarrier(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_1_DEPRECATED;\r\n\r\nextern CL_API_ENTRY CL_API_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL\r\nclUnloadCompiler(void) CL_API_SUFFIX__VERSION_1_1_DEPRECATED;\r\n\r\nextern CL_API_ENTRY CL_API_PREFIX__VERSION_1_1_DEPRECATED void * CL_API_CALL\r\nclGetExtensionFunctionAddress(const char * func_name) CL_API_SUFFIX__VERSION_1_1_DEPRECATED;\r\n\r\n/* Deprecated OpenCL 2.0 APIs */\r\nextern CL_API_ENTRY CL_API_PREFIX__VERSION_1_2_DEPRECATED cl_command_queue CL_API_CALL\r\nclCreateCommandQueue(cl_context                     context,\r\n                     cl_device_id                   device,\r\n                     cl_command_queue_properties    properties,\r\n                     cl_int *                       errcode_ret) CL_API_SUFFIX__VERSION_1_2_DEPRECATED;\r\n\r\nextern CL_API_ENTRY CL_API_PREFIX__VERSION_1_2_DEPRECATED cl_sampler CL_API_CALL\r\nclCreateSampler(cl_context          context,\r\n                cl_bool             normalized_coords,\r\n                cl_addressing_mode  addressing_mode,\r\n                cl_filter_mode      filter_mode,\r\n                cl_int *            errcode_ret) CL_API_SUFFIX__VERSION_1_2_DEPRECATED;\r\n\r\nextern CL_API_ENTRY 
CL_API_PREFIX__VERSION_1_2_DEPRECATED cl_int CL_API_CALL\r\nclEnqueueTask(cl_command_queue  command_queue,\r\n              cl_kernel         kernel,\r\n              cl_uint           num_events_in_wait_list,\r\n              const cl_event *  event_wait_list,\r\n              cl_event *        event) CL_API_SUFFIX__VERSION_1_2_DEPRECATED;\r\n\r\n#endif /* !defined(CL_NO_CORE_PROTOTYPES) */\r\n\r\n#ifdef __cplusplus\r\n}\r\n#endif\r\n\r\n#endif  /* __OPENCL_CL_H */\r\n"
  },
  {
    "path": "svm/OpenCL/include/CL/cl2.hpp",
    "content": "//\r\n// Copyright (c) 2020 The Khronos Group Inc.\r\n//\r\n// Licensed under the Apache License, Version 2.0 (the \"License\");\r\n// you may not use this file except in compliance with the License.\r\n// You may obtain a copy of the License at\r\n//\r\n//    http://www.apache.org/licenses/LICENSE-2.0\r\n//\r\n// Unless required by applicable law or agreed to in writing, software\r\n// distributed under the License is distributed on an \"AS IS\" BASIS,\r\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\r\n// See the License for the specific language governing permissions and\r\n// limitations under the License.\r\n//\r\n\r\n#include <CL/opencl.hpp>\r\n#pragma message(\"cl2.hpp has been renamed to opencl.hpp to make it clear that it supports all versions of OpenCL. Please include opencl.hpp directly.\")\r\n"
  },
  {
    "path": "svm/OpenCL/include/CL/cl_d3d10.h",
    "content": "/*******************************************************************************\r\n * Copyright (c) 2008-2023 The Khronos Group Inc.\r\n *\r\n * Licensed under the Apache License, Version 2.0 (the \"License\");\r\n * you may not use this file except in compliance with the License.\r\n * You may obtain a copy of the License at\r\n *\r\n *    http://www.apache.org/licenses/LICENSE-2.0\r\n *\r\n * Unless required by applicable law or agreed to in writing, software\r\n * distributed under the License is distributed on an \"AS IS\" BASIS,\r\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\r\n * See the License for the specific language governing permissions and\r\n * limitations under the License.\r\n ******************************************************************************/\r\n\r\n#ifndef OPENCL_CL_D3D10_H_\r\n#define OPENCL_CL_D3D10_H_\r\n\r\n/*\r\n** This header is generated from the Khronos OpenCL XML API Registry.\r\n*/\r\n\r\n#if defined(_MSC_VER)\r\n#if _MSC_VER >=1500\r\n#pragma warning( push )\r\n#pragma warning( disable : 4201 )\r\n#pragma warning( disable : 5105 )\r\n#endif\r\n#endif\r\n#include <d3d10.h>\r\n#if defined(_MSC_VER)\r\n#if _MSC_VER >=1500\r\n#pragma warning( pop )\r\n#endif\r\n#endif\r\n\r\n#include <CL/cl.h>\r\n\r\n/* CL_NO_PROTOTYPES implies CL_NO_EXTENSION_PROTOTYPES: */\r\n#if defined(CL_NO_PROTOTYPES) && !defined(CL_NO_EXTENSION_PROTOTYPES)\r\n#define CL_NO_EXTENSION_PROTOTYPES\r\n#endif\r\n\r\n/* CL_NO_EXTENSION_PROTOTYPES implies\r\n   CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES and\r\n   CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES: */\r\n#if defined(CL_NO_EXTENSION_PROTOTYPES) && \\\r\n    !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES)\r\n#define CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES\r\n#endif\r\n#if defined(CL_NO_EXTENSION_PROTOTYPES) && \\\r\n    !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES)\r\n#define CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES\r\n#endif\r\n\r\n#ifdef 
__cplusplus\r\nextern \"C\" {\r\n#endif\r\n\r\n/***************************************************************\r\n* cl_khr_d3d10_sharing\r\n***************************************************************/\r\n#define cl_khr_d3d10_sharing 1\r\n#define CL_KHR_D3D10_SHARING_EXTENSION_NAME \\\r\n    \"cl_khr_d3d10_sharing\"\r\n\r\n\r\n#define CL_KHR_D3D10_SHARING_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)\r\n\r\ntypedef cl_uint             cl_d3d10_device_source_khr;\r\ntypedef cl_uint             cl_d3d10_device_set_khr;\r\n\r\n/* Error codes */\r\n#define CL_INVALID_D3D10_DEVICE_KHR                         -1002\r\n#define CL_INVALID_D3D10_RESOURCE_KHR                       -1003\r\n#define CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR              -1004\r\n#define CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR                  -1005\r\n\r\n/* cl_d3d10_device_source_khr */\r\n#define CL_D3D10_DEVICE_KHR                                 0x4010\r\n#define CL_D3D10_DXGI_ADAPTER_KHR                           0x4011\r\n\r\n/* cl_d3d10_device_set_khr */\r\n#define CL_PREFERRED_DEVICES_FOR_D3D10_KHR                  0x4012\r\n#define CL_ALL_DEVICES_FOR_D3D10_KHR                        0x4013\r\n\r\n/* cl_context_info */\r\n#define CL_CONTEXT_D3D10_DEVICE_KHR                         0x4014\r\n#define CL_CONTEXT_D3D10_PREFER_SHARED_RESOURCES_KHR        0x402C\r\n\r\n/* cl_mem_info */\r\n#define CL_MEM_D3D10_RESOURCE_KHR                           0x4015\r\n\r\n/* cl_image_info */\r\n#define CL_IMAGE_D3D10_SUBRESOURCE_KHR                      0x4016\r\n\r\n/* cl_command_type */\r\n#define CL_COMMAND_ACQUIRE_D3D10_OBJECTS_KHR                0x4017\r\n#define CL_COMMAND_RELEASE_D3D10_OBJECTS_KHR                0x4018\r\n\r\n\r\ntypedef cl_int CL_API_CALL\r\nclGetDeviceIDsFromD3D10KHR_t(\r\n    cl_platform_id platform,\r\n    cl_d3d10_device_source_khr d3d_device_source,\r\n    void* d3d_object,\r\n    cl_d3d10_device_set_khr d3d_device_set,\r\n    cl_uint num_entries,\r\n    cl_device_id* 
devices,\r\n    cl_uint* num_devices);\r\n\r\ntypedef clGetDeviceIDsFromD3D10KHR_t *\r\nclGetDeviceIDsFromD3D10KHR_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef cl_mem CL_API_CALL\r\nclCreateFromD3D10BufferKHR_t(\r\n    cl_context context,\r\n    cl_mem_flags flags,\r\n    ID3D10Buffer* resource,\r\n    cl_int* errcode_ret);\r\n\r\ntypedef clCreateFromD3D10BufferKHR_t *\r\nclCreateFromD3D10BufferKHR_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef cl_mem CL_API_CALL\r\nclCreateFromD3D10Texture2DKHR_t(\r\n    cl_context context,\r\n    cl_mem_flags flags,\r\n    ID3D10Texture2D* resource,\r\n    UINT subresource,\r\n    cl_int* errcode_ret);\r\n\r\ntypedef clCreateFromD3D10Texture2DKHR_t *\r\nclCreateFromD3D10Texture2DKHR_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef cl_mem CL_API_CALL\r\nclCreateFromD3D10Texture3DKHR_t(\r\n    cl_context context,\r\n    cl_mem_flags flags,\r\n    ID3D10Texture3D* resource,\r\n    UINT subresource,\r\n    cl_int* errcode_ret);\r\n\r\ntypedef clCreateFromD3D10Texture3DKHR_t *\r\nclCreateFromD3D10Texture3DKHR_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef cl_int CL_API_CALL\r\nclEnqueueAcquireD3D10ObjectsKHR_t(\r\n    cl_command_queue command_queue,\r\n    cl_uint num_objects,\r\n    const cl_mem* mem_objects,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event);\r\n\r\ntypedef clEnqueueAcquireD3D10ObjectsKHR_t *\r\nclEnqueueAcquireD3D10ObjectsKHR_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef cl_int CL_API_CALL\r\nclEnqueueReleaseD3D10ObjectsKHR_t(\r\n    cl_command_queue command_queue,\r\n    cl_uint num_objects,\r\n    const cl_mem* mem_objects,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event);\r\n\r\ntypedef clEnqueueReleaseD3D10ObjectsKHR_t *\r\nclEnqueueReleaseD3D10ObjectsKHR_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\n#if !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES)\r\n\r\nextern CL_API_ENTRY cl_int 
CL_API_CALL\r\nclGetDeviceIDsFromD3D10KHR(\r\n    cl_platform_id platform,\r\n    cl_d3d10_device_source_khr d3d_device_source,\r\n    void* d3d_object,\r\n    cl_d3d10_device_set_khr d3d_device_set,\r\n    cl_uint num_entries,\r\n    cl_device_id* devices,\r\n    cl_uint* num_devices) CL_API_SUFFIX__VERSION_1_0;\r\n\r\nextern CL_API_ENTRY cl_mem CL_API_CALL\r\nclCreateFromD3D10BufferKHR(\r\n    cl_context context,\r\n    cl_mem_flags flags,\r\n    ID3D10Buffer* resource,\r\n    cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_0;\r\n\r\nextern CL_API_ENTRY cl_mem CL_API_CALL\r\nclCreateFromD3D10Texture2DKHR(\r\n    cl_context context,\r\n    cl_mem_flags flags,\r\n    ID3D10Texture2D* resource,\r\n    UINT subresource,\r\n    cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_0;\r\n\r\nextern CL_API_ENTRY cl_mem CL_API_CALL\r\nclCreateFromD3D10Texture3DKHR(\r\n    cl_context context,\r\n    cl_mem_flags flags,\r\n    ID3D10Texture3D* resource,\r\n    UINT subresource,\r\n    cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_0;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclEnqueueAcquireD3D10ObjectsKHR(\r\n    cl_command_queue command_queue,\r\n    cl_uint num_objects,\r\n    const cl_mem* mem_objects,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event) CL_API_SUFFIX__VERSION_1_0;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclEnqueueReleaseD3D10ObjectsKHR(\r\n    cl_command_queue command_queue,\r\n    cl_uint num_objects,\r\n    const cl_mem* mem_objects,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event) CL_API_SUFFIX__VERSION_1_0;\r\n\r\n#endif /* !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES) */\r\n\r\n/***************************************************************\r\n* cl_intel_sharing_format_query_d3d10\r\n***************************************************************/\r\n#define cl_intel_sharing_format_query_d3d10 1\r\n#define 
CL_INTEL_SHARING_FORMAT_QUERY_D3D10_EXTENSION_NAME \\\r\n    \"cl_intel_sharing_format_query_d3d10\"\r\n\r\n\r\n#define CL_INTEL_SHARING_FORMAT_QUERY_D3D10_EXTENSION_VERSION CL_MAKE_VERSION(0, 0, 0)\r\n\r\n/* when cl_khr_d3d10_sharing is supported */\r\n\r\ntypedef cl_int CL_API_CALL\r\nclGetSupportedD3D10TextureFormatsINTEL_t(\r\n    cl_context context,\r\n    cl_mem_flags flags,\r\n    cl_mem_object_type image_type,\r\n    cl_uint num_entries,\r\n    DXGI_FORMAT* d3d10_formats,\r\n    cl_uint* num_texture_formats);\r\n\r\ntypedef clGetSupportedD3D10TextureFormatsINTEL_t *\r\nclGetSupportedD3D10TextureFormatsINTEL_fn ;\r\n\r\n#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES)\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclGetSupportedD3D10TextureFormatsINTEL(\r\n    cl_context context,\r\n    cl_mem_flags flags,\r\n    cl_mem_object_type image_type,\r\n    cl_uint num_entries,\r\n    DXGI_FORMAT* d3d10_formats,\r\n    cl_uint* num_texture_formats) ;\r\n\r\n#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */\r\n\r\n#ifdef __cplusplus\r\n}\r\n#endif\r\n\r\n#endif /* OPENCL_CL_D3D10_H_ */\r\n"
  },
  {
    "path": "svm/OpenCL/include/CL/cl_d3d11.h",
    "content": "/*******************************************************************************\r\n * Copyright (c) 2008-2023 The Khronos Group Inc.\r\n *\r\n * Licensed under the Apache License, Version 2.0 (the \"License\");\r\n * you may not use this file except in compliance with the License.\r\n * You may obtain a copy of the License at\r\n *\r\n *    http://www.apache.org/licenses/LICENSE-2.0\r\n *\r\n * Unless required by applicable law or agreed to in writing, software\r\n * distributed under the License is distributed on an \"AS IS\" BASIS,\r\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\r\n * See the License for the specific language governing permissions and\r\n * limitations under the License.\r\n ******************************************************************************/\r\n\r\n#ifndef OPENCL_CL_D3D11_H_\r\n#define OPENCL_CL_D3D11_H_\r\n\r\n/*\r\n** This header is generated from the Khronos OpenCL XML API Registry.\r\n*/\r\n\r\n#if defined(_MSC_VER)\r\n#if _MSC_VER >=1500\r\n#pragma warning( push )\r\n#pragma warning( disable : 4201 )\r\n#pragma warning( disable : 5105 )\r\n#endif\r\n#endif\r\n#include <d3d11.h>\r\n#if defined(_MSC_VER)\r\n#if _MSC_VER >=1500\r\n#pragma warning( pop )\r\n#endif\r\n#endif\r\n\r\n#include <CL/cl.h>\r\n\r\n/* CL_NO_PROTOTYPES implies CL_NO_EXTENSION_PROTOTYPES: */\r\n#if defined(CL_NO_PROTOTYPES) && !defined(CL_NO_EXTENSION_PROTOTYPES)\r\n#define CL_NO_EXTENSION_PROTOTYPES\r\n#endif\r\n\r\n/* CL_NO_EXTENSION_PROTOTYPES implies\r\n   CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES and\r\n   CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES: */\r\n#if defined(CL_NO_EXTENSION_PROTOTYPES) && \\\r\n    !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES)\r\n#define CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES\r\n#endif\r\n#if defined(CL_NO_EXTENSION_PROTOTYPES) && \\\r\n    !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES)\r\n#define CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES\r\n#endif\r\n\r\n#ifdef 
__cplusplus\r\nextern \"C\" {\r\n#endif\r\n\r\n/***************************************************************\r\n* cl_khr_d3d11_sharing\r\n***************************************************************/\r\n#define cl_khr_d3d11_sharing 1\r\n#define CL_KHR_D3D11_SHARING_EXTENSION_NAME \\\r\n    \"cl_khr_d3d11_sharing\"\r\n\r\n\r\n#define CL_KHR_D3D11_SHARING_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)\r\n\r\ntypedef cl_uint             cl_d3d11_device_source_khr;\r\ntypedef cl_uint             cl_d3d11_device_set_khr;\r\n\r\n/* Error codes */\r\n#define CL_INVALID_D3D11_DEVICE_KHR                         -1006\r\n#define CL_INVALID_D3D11_RESOURCE_KHR                       -1007\r\n#define CL_D3D11_RESOURCE_ALREADY_ACQUIRED_KHR              -1008\r\n#define CL_D3D11_RESOURCE_NOT_ACQUIRED_KHR                  -1009\r\n\r\n/* cl_d3d11_device_source_khr */\r\n#define CL_D3D11_DEVICE_KHR                                 0x4019\r\n#define CL_D3D11_DXGI_ADAPTER_KHR                           0x401A\r\n\r\n/* cl_d3d11_device_set_khr */\r\n#define CL_PREFERRED_DEVICES_FOR_D3D11_KHR                  0x401B\r\n#define CL_ALL_DEVICES_FOR_D3D11_KHR                        0x401C\r\n\r\n/* cl_context_info */\r\n#define CL_CONTEXT_D3D11_DEVICE_KHR                         0x401D\r\n#define CL_CONTEXT_D3D11_PREFER_SHARED_RESOURCES_KHR        0x402D\r\n\r\n/* cl_mem_info */\r\n#define CL_MEM_D3D11_RESOURCE_KHR                           0x401E\r\n\r\n/* cl_image_info */\r\n#define CL_IMAGE_D3D11_SUBRESOURCE_KHR                      0x401F\r\n\r\n/* cl_command_type */\r\n#define CL_COMMAND_ACQUIRE_D3D11_OBJECTS_KHR                0x4020\r\n#define CL_COMMAND_RELEASE_D3D11_OBJECTS_KHR                0x4021\r\n\r\n\r\ntypedef cl_int CL_API_CALL\r\nclGetDeviceIDsFromD3D11KHR_t(\r\n    cl_platform_id platform,\r\n    cl_d3d11_device_source_khr d3d_device_source,\r\n    void* d3d_object,\r\n    cl_d3d11_device_set_khr d3d_device_set,\r\n    cl_uint num_entries,\r\n    cl_device_id* 
devices,\r\n    cl_uint* num_devices);\r\n\r\ntypedef clGetDeviceIDsFromD3D11KHR_t *\r\nclGetDeviceIDsFromD3D11KHR_fn CL_API_SUFFIX__VERSION_1_2;\r\n\r\ntypedef cl_mem CL_API_CALL\r\nclCreateFromD3D11BufferKHR_t(\r\n    cl_context context,\r\n    cl_mem_flags flags,\r\n    ID3D11Buffer* resource,\r\n    cl_int* errcode_ret);\r\n\r\ntypedef clCreateFromD3D11BufferKHR_t *\r\nclCreateFromD3D11BufferKHR_fn CL_API_SUFFIX__VERSION_1_2;\r\n\r\ntypedef cl_mem CL_API_CALL\r\nclCreateFromD3D11Texture2DKHR_t(\r\n    cl_context context,\r\n    cl_mem_flags flags,\r\n    ID3D11Texture2D* resource,\r\n    UINT subresource,\r\n    cl_int* errcode_ret);\r\n\r\ntypedef clCreateFromD3D11Texture2DKHR_t *\r\nclCreateFromD3D11Texture2DKHR_fn CL_API_SUFFIX__VERSION_1_2;\r\n\r\ntypedef cl_mem CL_API_CALL\r\nclCreateFromD3D11Texture3DKHR_t(\r\n    cl_context context,\r\n    cl_mem_flags flags,\r\n    ID3D11Texture3D* resource,\r\n    UINT subresource,\r\n    cl_int* errcode_ret);\r\n\r\ntypedef clCreateFromD3D11Texture3DKHR_t *\r\nclCreateFromD3D11Texture3DKHR_fn CL_API_SUFFIX__VERSION_1_2;\r\n\r\ntypedef cl_int CL_API_CALL\r\nclEnqueueAcquireD3D11ObjectsKHR_t(\r\n    cl_command_queue command_queue,\r\n    cl_uint num_objects,\r\n    const cl_mem* mem_objects,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event);\r\n\r\ntypedef clEnqueueAcquireD3D11ObjectsKHR_t *\r\nclEnqueueAcquireD3D11ObjectsKHR_fn CL_API_SUFFIX__VERSION_1_2;\r\n\r\ntypedef cl_int CL_API_CALL\r\nclEnqueueReleaseD3D11ObjectsKHR_t(\r\n    cl_command_queue command_queue,\r\n    cl_uint num_objects,\r\n    const cl_mem* mem_objects,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event);\r\n\r\ntypedef clEnqueueReleaseD3D11ObjectsKHR_t *\r\nclEnqueueReleaseD3D11ObjectsKHR_fn CL_API_SUFFIX__VERSION_1_2;\r\n\r\n#if !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES)\r\n\r\nextern CL_API_ENTRY cl_int 
CL_API_CALL\r\nclGetDeviceIDsFromD3D11KHR(\r\n    cl_platform_id platform,\r\n    cl_d3d11_device_source_khr d3d_device_source,\r\n    void* d3d_object,\r\n    cl_d3d11_device_set_khr d3d_device_set,\r\n    cl_uint num_entries,\r\n    cl_device_id* devices,\r\n    cl_uint* num_devices) CL_API_SUFFIX__VERSION_1_2;\r\n\r\nextern CL_API_ENTRY cl_mem CL_API_CALL\r\nclCreateFromD3D11BufferKHR(\r\n    cl_context context,\r\n    cl_mem_flags flags,\r\n    ID3D11Buffer* resource,\r\n    cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2;\r\n\r\nextern CL_API_ENTRY cl_mem CL_API_CALL\r\nclCreateFromD3D11Texture2DKHR(\r\n    cl_context context,\r\n    cl_mem_flags flags,\r\n    ID3D11Texture2D* resource,\r\n    UINT subresource,\r\n    cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2;\r\n\r\nextern CL_API_ENTRY cl_mem CL_API_CALL\r\nclCreateFromD3D11Texture3DKHR(\r\n    cl_context context,\r\n    cl_mem_flags flags,\r\n    ID3D11Texture3D* resource,\r\n    UINT subresource,\r\n    cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclEnqueueAcquireD3D11ObjectsKHR(\r\n    cl_command_queue command_queue,\r\n    cl_uint num_objects,\r\n    const cl_mem* mem_objects,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event) CL_API_SUFFIX__VERSION_1_2;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclEnqueueReleaseD3D11ObjectsKHR(\r\n    cl_command_queue command_queue,\r\n    cl_uint num_objects,\r\n    const cl_mem* mem_objects,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event) CL_API_SUFFIX__VERSION_1_2;\r\n\r\n#endif /* !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES) */\r\n\r\n/***************************************************************\r\n* cl_intel_sharing_format_query_d3d11\r\n***************************************************************/\r\n#define cl_intel_sharing_format_query_d3d11 1\r\n#define 
CL_INTEL_SHARING_FORMAT_QUERY_D3D11_EXTENSION_NAME \\\r\n    \"cl_intel_sharing_format_query_d3d11\"\r\n\r\n\r\n#define CL_INTEL_SHARING_FORMAT_QUERY_D3D11_EXTENSION_VERSION CL_MAKE_VERSION(0, 0, 0)\r\n\r\n/* when cl_khr_d3d11_sharing is supported */\r\n\r\ntypedef cl_int CL_API_CALL\r\nclGetSupportedD3D11TextureFormatsINTEL_t(\r\n    cl_context context,\r\n    cl_mem_flags flags,\r\n    cl_mem_object_type image_type,\r\n    cl_uint plane,\r\n    cl_uint num_entries,\r\n    DXGI_FORMAT* d3d11_formats,\r\n    cl_uint* num_texture_formats);\r\n\r\ntypedef clGetSupportedD3D11TextureFormatsINTEL_t *\r\nclGetSupportedD3D11TextureFormatsINTEL_fn ;\r\n\r\n#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES)\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclGetSupportedD3D11TextureFormatsINTEL(\r\n    cl_context context,\r\n    cl_mem_flags flags,\r\n    cl_mem_object_type image_type,\r\n    cl_uint plane,\r\n    cl_uint num_entries,\r\n    DXGI_FORMAT* d3d11_formats,\r\n    cl_uint* num_texture_formats) ;\r\n\r\n#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */\r\n\r\n#ifdef __cplusplus\r\n}\r\n#endif\r\n\r\n#endif /* OPENCL_CL_D3D11_H_ */\r\n"
  },
  {
    "path": "svm/OpenCL/include/CL/cl_dx9_media_sharing.h",
    "content": "/*******************************************************************************\r\n * Copyright (c) 2008-2023 The Khronos Group Inc.\r\n *\r\n * Licensed under the Apache License, Version 2.0 (the \"License\");\r\n * you may not use this file except in compliance with the License.\r\n * You may obtain a copy of the License at\r\n *\r\n *    http://www.apache.org/licenses/LICENSE-2.0\r\n *\r\n * Unless required by applicable law or agreed to in writing, software\r\n * distributed under the License is distributed on an \"AS IS\" BASIS,\r\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\r\n * See the License for the specific language governing permissions and\r\n * limitations under the License.\r\n ******************************************************************************/\r\n\r\n#ifndef OPENCL_CL_DX9_MEDIA_SHARING_H_\r\n#define OPENCL_CL_DX9_MEDIA_SHARING_H_\r\n\r\n/*\r\n** This header is generated from the Khronos OpenCL XML API Registry.\r\n*/\r\n\r\n#if defined(_WIN32)\r\n#if defined(_MSC_VER)\r\n#if _MSC_VER >=1500\r\n#pragma warning( push )\r\n#pragma warning( disable : 4201 )\r\n#pragma warning( disable : 5105 )\r\n#endif\r\n#endif\r\n#include <d3d9.h>\r\n#if defined(_MSC_VER)\r\n#if _MSC_VER >=1500\r\n#pragma warning( pop )\r\n#endif\r\n#endif\r\n#endif\r\n\r\n#include <CL/cl.h>\r\n\r\n/* CL_NO_PROTOTYPES implies CL_NO_EXTENSION_PROTOTYPES: */\r\n#if defined(CL_NO_PROTOTYPES) && !defined(CL_NO_EXTENSION_PROTOTYPES)\r\n#define CL_NO_EXTENSION_PROTOTYPES\r\n#endif\r\n\r\n/* CL_NO_EXTENSION_PROTOTYPES implies\r\n   CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES and\r\n   CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES: */\r\n#if defined(CL_NO_EXTENSION_PROTOTYPES) && \\\r\n    !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES)\r\n#define CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES\r\n#endif\r\n#if defined(CL_NO_EXTENSION_PROTOTYPES) && \\\r\n    !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES)\r\n#define 
CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES\r\n#endif\r\n\r\n#ifdef __cplusplus\r\nextern \"C\" {\r\n#endif\r\n\r\n/***************************************************************\r\n* cl_khr_dx9_media_sharing\r\n***************************************************************/\r\n#define cl_khr_dx9_media_sharing 1\r\n#define CL_KHR_DX9_MEDIA_SHARING_EXTENSION_NAME \\\r\n    \"cl_khr_dx9_media_sharing\"\r\n\r\n\r\n#define CL_KHR_DX9_MEDIA_SHARING_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)\r\n\r\ntypedef cl_uint             cl_dx9_media_adapter_type_khr;\r\ntypedef cl_uint             cl_dx9_media_adapter_set_khr;\r\n\r\n#if defined(_WIN32)\r\ntypedef struct _cl_dx9_surface_info_khr {\r\n    IDirect3DSurface9* resource;\r\n    HANDLE shared_handle;\r\n} cl_dx9_surface_info_khr;\r\n\r\n#endif /* defined(_WIN32) */\r\n\r\n/* Error codes */\r\n#define CL_INVALID_DX9_MEDIA_ADAPTER_KHR                    -1010\r\n#define CL_INVALID_DX9_MEDIA_SURFACE_KHR                    -1011\r\n#define CL_DX9_MEDIA_SURFACE_ALREADY_ACQUIRED_KHR           -1012\r\n#define CL_DX9_MEDIA_SURFACE_NOT_ACQUIRED_KHR               -1013\r\n\r\n/* cl_media_adapter_type_khr */\r\n#define CL_ADAPTER_D3D9_KHR                                 0x2020\r\n#define CL_ADAPTER_D3D9EX_KHR                               0x2021\r\n#define CL_ADAPTER_DXVA_KHR                                 0x2022\r\n\r\n/* cl_media_adapter_set_khr */\r\n#define CL_PREFERRED_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR      0x2023\r\n#define CL_ALL_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR            0x2024\r\n\r\n/* cl_context_info */\r\n#define CL_CONTEXT_ADAPTER_D3D9_KHR                         0x2025\r\n#define CL_CONTEXT_ADAPTER_D3D9EX_KHR                       0x2026\r\n#define CL_CONTEXT_ADAPTER_DXVA_KHR                         0x2027\r\n\r\n/* cl_mem_info */\r\n#define CL_MEM_DX9_MEDIA_ADAPTER_TYPE_KHR                   0x2028\r\n#define CL_MEM_DX9_MEDIA_SURFACE_INFO_KHR                   0x2029\r\n\r\n/* cl_image_info */\r\n#define 
CL_IMAGE_DX9_MEDIA_PLANE_KHR                        0x202A\r\n\r\n/* cl_command_type */\r\n#define CL_COMMAND_ACQUIRE_DX9_MEDIA_SURFACES_KHR           0x202B\r\n#define CL_COMMAND_RELEASE_DX9_MEDIA_SURFACES_KHR           0x202C\r\n\r\n\r\ntypedef cl_int CL_API_CALL\r\nclGetDeviceIDsFromDX9MediaAdapterKHR_t(\r\n    cl_platform_id platform,\r\n    cl_uint num_media_adapters,\r\n    cl_dx9_media_adapter_type_khr* media_adapter_type,\r\n    void* media_adapters,\r\n    cl_dx9_media_adapter_set_khr media_adapter_set,\r\n    cl_uint num_entries,\r\n    cl_device_id* devices,\r\n    cl_uint* num_devices);\r\n\r\ntypedef clGetDeviceIDsFromDX9MediaAdapterKHR_t *\r\nclGetDeviceIDsFromDX9MediaAdapterKHR_fn CL_API_SUFFIX__VERSION_1_2;\r\n\r\ntypedef cl_mem CL_API_CALL\r\nclCreateFromDX9MediaSurfaceKHR_t(\r\n    cl_context context,\r\n    cl_mem_flags flags,\r\n    cl_dx9_media_adapter_type_khr adapter_type,\r\n    void* surface_info,\r\n    cl_uint plane,\r\n    cl_int* errcode_ret);\r\n\r\ntypedef clCreateFromDX9MediaSurfaceKHR_t *\r\nclCreateFromDX9MediaSurfaceKHR_fn CL_API_SUFFIX__VERSION_1_2;\r\n\r\ntypedef cl_int CL_API_CALL\r\nclEnqueueAcquireDX9MediaSurfacesKHR_t(\r\n    cl_command_queue command_queue,\r\n    cl_uint num_objects,\r\n    const cl_mem* mem_objects,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event);\r\n\r\ntypedef clEnqueueAcquireDX9MediaSurfacesKHR_t *\r\nclEnqueueAcquireDX9MediaSurfacesKHR_fn CL_API_SUFFIX__VERSION_1_2;\r\n\r\ntypedef cl_int CL_API_CALL\r\nclEnqueueReleaseDX9MediaSurfacesKHR_t(\r\n    cl_command_queue command_queue,\r\n    cl_uint num_objects,\r\n    const cl_mem* mem_objects,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event);\r\n\r\ntypedef clEnqueueReleaseDX9MediaSurfacesKHR_t *\r\nclEnqueueReleaseDX9MediaSurfacesKHR_fn CL_API_SUFFIX__VERSION_1_2;\r\n\r\n#if !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES)\r\n\r\nextern 
CL_API_ENTRY cl_int CL_API_CALL\r\nclGetDeviceIDsFromDX9MediaAdapterKHR(\r\n    cl_platform_id platform,\r\n    cl_uint num_media_adapters,\r\n    cl_dx9_media_adapter_type_khr* media_adapter_type,\r\n    void* media_adapters,\r\n    cl_dx9_media_adapter_set_khr media_adapter_set,\r\n    cl_uint num_entries,\r\n    cl_device_id* devices,\r\n    cl_uint* num_devices) CL_API_SUFFIX__VERSION_1_2;\r\n\r\nextern CL_API_ENTRY cl_mem CL_API_CALL\r\nclCreateFromDX9MediaSurfaceKHR(\r\n    cl_context context,\r\n    cl_mem_flags flags,\r\n    cl_dx9_media_adapter_type_khr adapter_type,\r\n    void* surface_info,\r\n    cl_uint plane,\r\n    cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclEnqueueAcquireDX9MediaSurfacesKHR(\r\n    cl_command_queue command_queue,\r\n    cl_uint num_objects,\r\n    const cl_mem* mem_objects,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event) CL_API_SUFFIX__VERSION_1_2;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclEnqueueReleaseDX9MediaSurfacesKHR(\r\n    cl_command_queue command_queue,\r\n    cl_uint num_objects,\r\n    const cl_mem* mem_objects,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event) CL_API_SUFFIX__VERSION_1_2;\r\n\r\n#endif /* !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES) */\r\n\r\n/***************************************************************\r\n* cl_intel_dx9_media_sharing\r\n***************************************************************/\r\n#define cl_intel_dx9_media_sharing 1\r\n#define CL_INTEL_DX9_MEDIA_SHARING_EXTENSION_NAME \\\r\n    \"cl_intel_dx9_media_sharing\"\r\n\r\n\r\n#define CL_INTEL_DX9_MEDIA_SHARING_EXTENSION_VERSION CL_MAKE_VERSION(0, 0, 0)\r\n\r\ntypedef cl_uint             cl_dx9_device_source_intel;\r\ntypedef cl_uint             cl_dx9_device_set_intel;\r\n\r\n/* Error codes */\r\n#define CL_INVALID_DX9_DEVICE_INTEL                         
-1010\r\n#define CL_INVALID_DX9_RESOURCE_INTEL                       -1011\r\n#define CL_DX9_RESOURCE_ALREADY_ACQUIRED_INTEL              -1012\r\n#define CL_DX9_RESOURCE_NOT_ACQUIRED_INTEL                  -1013\r\n\r\n/* cl_dx9_device_source_intel */\r\n#define CL_D3D9_DEVICE_INTEL                                0x4022\r\n#define CL_D3D9EX_DEVICE_INTEL                              0x4070\r\n#define CL_DXVA_DEVICE_INTEL                                0x4071\r\n\r\n/* cl_dx9_device_set_intel */\r\n#define CL_PREFERRED_DEVICES_FOR_DX9_INTEL                  0x4024\r\n#define CL_ALL_DEVICES_FOR_DX9_INTEL                        0x4025\r\n\r\n/* cl_context_info */\r\n#define CL_CONTEXT_D3D9_DEVICE_INTEL                        0x4026\r\n#define CL_CONTEXT_D3D9EX_DEVICE_INTEL                      0x4072\r\n#define CL_CONTEXT_DXVA_DEVICE_INTEL                        0x4073\r\n\r\n/* cl_mem_info */\r\n#define CL_MEM_DX9_RESOURCE_INTEL                           0x4027\r\n#define CL_MEM_DX9_SHARED_HANDLE_INTEL                      0x4074\r\n\r\n/* cl_image_info */\r\n#define CL_IMAGE_DX9_PLANE_INTEL                            0x4075\r\n\r\n/* cl_command_type */\r\n#define CL_COMMAND_ACQUIRE_DX9_OBJECTS_INTEL                0x402A\r\n#define CL_COMMAND_RELEASE_DX9_OBJECTS_INTEL                0x402B\r\n\r\n\r\ntypedef cl_int CL_API_CALL\r\nclGetDeviceIDsFromDX9INTEL_t(\r\n    cl_platform_id platform,\r\n    cl_dx9_device_source_intel dx9_device_source,\r\n    void* dx9_object,\r\n    cl_dx9_device_set_intel dx9_device_set,\r\n    cl_uint num_entries,\r\n    cl_device_id* devices,\r\n    cl_uint* num_devices);\r\n\r\ntypedef clGetDeviceIDsFromDX9INTEL_t *\r\nclGetDeviceIDsFromDX9INTEL_fn CL_API_SUFFIX__VERSION_1_1;\r\n\r\ntypedef cl_mem CL_API_CALL\r\nclCreateFromDX9MediaSurfaceINTEL_t(\r\n    cl_context context,\r\n    cl_mem_flags flags,\r\n    IDirect3DSurface9* resource,\r\n    HANDLE sharedHandle,\r\n    UINT plane,\r\n    cl_int* errcode_ret);\r\n\r\ntypedef 
clCreateFromDX9MediaSurfaceINTEL_t *\r\nclCreateFromDX9MediaSurfaceINTEL_fn CL_API_SUFFIX__VERSION_1_1;\r\n\r\ntypedef cl_int CL_API_CALL\r\nclEnqueueAcquireDX9ObjectsINTEL_t(\r\n    cl_command_queue command_queue,\r\n    cl_uint num_objects,\r\n    const cl_mem* mem_objects,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event);\r\n\r\ntypedef clEnqueueAcquireDX9ObjectsINTEL_t *\r\nclEnqueueAcquireDX9ObjectsINTEL_fn CL_API_SUFFIX__VERSION_1_1;\r\n\r\ntypedef cl_int CL_API_CALL\r\nclEnqueueReleaseDX9ObjectsINTEL_t(\r\n    cl_command_queue command_queue,\r\n    cl_uint num_objects,\r\n    cl_mem* mem_objects,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event);\r\n\r\ntypedef clEnqueueReleaseDX9ObjectsINTEL_t *\r\nclEnqueueReleaseDX9ObjectsINTEL_fn CL_API_SUFFIX__VERSION_1_1;\r\n\r\n#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES)\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclGetDeviceIDsFromDX9INTEL(\r\n    cl_platform_id platform,\r\n    cl_dx9_device_source_intel dx9_device_source,\r\n    void* dx9_object,\r\n    cl_dx9_device_set_intel dx9_device_set,\r\n    cl_uint num_entries,\r\n    cl_device_id* devices,\r\n    cl_uint* num_devices) CL_API_SUFFIX__VERSION_1_1;\r\n\r\nextern CL_API_ENTRY cl_mem CL_API_CALL\r\nclCreateFromDX9MediaSurfaceINTEL(\r\n    cl_context context,\r\n    cl_mem_flags flags,\r\n    IDirect3DSurface9* resource,\r\n    HANDLE sharedHandle,\r\n    UINT plane,\r\n    cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_1;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclEnqueueAcquireDX9ObjectsINTEL(\r\n    cl_command_queue command_queue,\r\n    cl_uint num_objects,\r\n    const cl_mem* mem_objects,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event) CL_API_SUFFIX__VERSION_1_1;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclEnqueueReleaseDX9ObjectsINTEL(\r\n    cl_command_queue 
command_queue,\r\n    cl_uint num_objects,\r\n    cl_mem* mem_objects,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event) CL_API_SUFFIX__VERSION_1_1;\r\n\r\n#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */\r\n\r\n/***************************************************************\r\n* cl_intel_sharing_format_query_dx9\r\n***************************************************************/\r\n#define cl_intel_sharing_format_query_dx9 1\r\n#define CL_INTEL_SHARING_FORMAT_QUERY_DX9_EXTENSION_NAME \\\r\n    \"cl_intel_sharing_format_query_dx9\"\r\n\r\n\r\n#define CL_INTEL_SHARING_FORMAT_QUERY_DX9_EXTENSION_VERSION CL_MAKE_VERSION(0, 0, 0)\r\n\r\n/* when cl_khr_dx9_media_sharing or cl_intel_dx9_media_sharing is supported */\r\n\r\ntypedef cl_int CL_API_CALL\r\nclGetSupportedDX9MediaSurfaceFormatsINTEL_t(\r\n    cl_context context,\r\n    cl_mem_flags flags,\r\n    cl_mem_object_type image_type,\r\n    cl_uint plane,\r\n    cl_uint num_entries,\r\n    D3DFORMAT* dx9_formats,\r\n    cl_uint* num_surface_formats);\r\n\r\ntypedef clGetSupportedDX9MediaSurfaceFormatsINTEL_t *\r\nclGetSupportedDX9MediaSurfaceFormatsINTEL_fn ;\r\n\r\n#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES)\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclGetSupportedDX9MediaSurfaceFormatsINTEL(\r\n    cl_context context,\r\n    cl_mem_flags flags,\r\n    cl_mem_object_type image_type,\r\n    cl_uint plane,\r\n    cl_uint num_entries,\r\n    D3DFORMAT* dx9_formats,\r\n    cl_uint* num_surface_formats) ;\r\n\r\n#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */\r\n\r\n#ifdef __cplusplus\r\n}\r\n#endif\r\n\r\n#endif /* OPENCL_CL_DX9_MEDIA_SHARING_H_ */\r\n"
  },
  {
    "path": "svm/OpenCL/include/CL/cl_dx9_media_sharing_intel.h",
    "content": "/*******************************************************************************\r\n * Copyright (c) 2008-2020 The Khronos Group Inc.\r\n *\r\n * Licensed under the Apache License, Version 2.0 (the \"License\");\r\n * you may not use this file except in compliance with the License.\r\n * You may obtain a copy of the License at\r\n *\r\n *    http://www.apache.org/licenses/LICENSE-2.0\r\n *\r\n * Unless required by applicable law or agreed to in writing, software\r\n * distributed under the License is distributed on an \"AS IS\" BASIS,\r\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\r\n * See the License for the specific language governing permissions and\r\n * limitations under the License.\r\n ******************************************************************************/\r\n\r\n#include <CL/cl_dx9_media_sharing.h>\r\n#pragma message(\"The Intel DX9 media sharing extensions have been moved into cl_dx9_media_sharing.h.  Please include cl_dx9_media_sharing.h directly.\")\r\n"
  },
  {
    "path": "svm/OpenCL/include/CL/cl_egl.h",
    "content": "/*******************************************************************************\r\n * Copyright (c) 2008-2023 The Khronos Group Inc.\r\n *\r\n * Licensed under the Apache License, Version 2.0 (the \"License\");\r\n * you may not use this file except in compliance with the License.\r\n * You may obtain a copy of the License at\r\n *\r\n *    http://www.apache.org/licenses/LICENSE-2.0\r\n *\r\n * Unless required by applicable law or agreed to in writing, software\r\n * distributed under the License is distributed on an \"AS IS\" BASIS,\r\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\r\n * See the License for the specific language governing permissions and\r\n * limitations under the License.\r\n ******************************************************************************/\r\n\r\n#ifndef OPENCL_CL_EGL_H_\r\n#define OPENCL_CL_EGL_H_\r\n\r\n/*\r\n** This header is generated from the Khronos OpenCL XML API Registry.\r\n*/\r\n\r\n#include <CL/cl.h>\r\n\r\n/* CL_NO_PROTOTYPES implies CL_NO_EXTENSION_PROTOTYPES: */\r\n#if defined(CL_NO_PROTOTYPES) && !defined(CL_NO_EXTENSION_PROTOTYPES)\r\n#define CL_NO_EXTENSION_PROTOTYPES\r\n#endif\r\n\r\n/* CL_NO_EXTENSION_PROTOTYPES implies\r\n   CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES and\r\n   CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES: */\r\n#if defined(CL_NO_EXTENSION_PROTOTYPES) && \\\r\n    !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES)\r\n#define CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES\r\n#endif\r\n#if defined(CL_NO_EXTENSION_PROTOTYPES) && \\\r\n    !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES)\r\n#define CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES\r\n#endif\r\n\r\n#ifdef __cplusplus\r\nextern \"C\" {\r\n#endif\r\n\r\n/***************************************************************\r\n* cl_khr_egl_image\r\n***************************************************************/\r\n#define cl_khr_egl_image 1\r\n#define CL_KHR_EGL_IMAGE_EXTENSION_NAME \\\r\n    
\"cl_khr_egl_image\"\r\n\r\n\r\n#define CL_KHR_EGL_IMAGE_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)\r\n\r\n/* Command type for events created with clEnqueueAcquireEGLObjectsKHR */\r\n#define CL_COMMAND_EGL_FENCE_SYNC_OBJECT_KHR                0x202F\r\n#define CL_COMMAND_ACQUIRE_EGL_OBJECTS_KHR                  0x202D\r\n#define CL_COMMAND_RELEASE_EGL_OBJECTS_KHR                  0x202E\r\n\r\n/* Error type for clCreateFromEGLImageKHR */\r\n#define CL_INVALID_EGL_OBJECT_KHR                           -1093\r\n#define CL_EGL_RESOURCE_NOT_ACQUIRED_KHR                    -1092\r\n\r\n/* CLeglImageKHR is an opaque handle to an EGLImage */\r\ntypedef void*               CLeglImageKHR;\r\n\r\n/* CLeglDisplayKHR is an opaque handle to an EGLDisplay */\r\ntypedef void*               CLeglDisplayKHR;\r\n\r\n/* properties passed to clCreateFromEGLImageKHR */\r\ntypedef intptr_t            cl_egl_image_properties_khr;\r\n\r\n\r\ntypedef cl_mem CL_API_CALL\r\nclCreateFromEGLImageKHR_t(\r\n    cl_context context,\r\n    CLeglDisplayKHR egldisplay,\r\n    CLeglImageKHR eglimage,\r\n    cl_mem_flags flags,\r\n    const cl_egl_image_properties_khr* properties,\r\n    cl_int* errcode_ret);\r\n\r\ntypedef clCreateFromEGLImageKHR_t *\r\nclCreateFromEGLImageKHR_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef cl_int CL_API_CALL\r\nclEnqueueAcquireEGLObjectsKHR_t(\r\n    cl_command_queue command_queue,\r\n    cl_uint num_objects,\r\n    const cl_mem* mem_objects,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event);\r\n\r\ntypedef clEnqueueAcquireEGLObjectsKHR_t *\r\nclEnqueueAcquireEGLObjectsKHR_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef cl_int CL_API_CALL\r\nclEnqueueReleaseEGLObjectsKHR_t(\r\n    cl_command_queue command_queue,\r\n    cl_uint num_objects,\r\n    const cl_mem* mem_objects,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event);\r\n\r\ntypedef 
clEnqueueReleaseEGLObjectsKHR_t *\r\nclEnqueueReleaseEGLObjectsKHR_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\n#if !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES)\r\n\r\nextern CL_API_ENTRY cl_mem CL_API_CALL\r\nclCreateFromEGLImageKHR(\r\n    cl_context context,\r\n    CLeglDisplayKHR egldisplay,\r\n    CLeglImageKHR eglimage,\r\n    cl_mem_flags flags,\r\n    const cl_egl_image_properties_khr* properties,\r\n    cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_0;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclEnqueueAcquireEGLObjectsKHR(\r\n    cl_command_queue command_queue,\r\n    cl_uint num_objects,\r\n    const cl_mem* mem_objects,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event) CL_API_SUFFIX__VERSION_1_0;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclEnqueueReleaseEGLObjectsKHR(\r\n    cl_command_queue command_queue,\r\n    cl_uint num_objects,\r\n    const cl_mem* mem_objects,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event) CL_API_SUFFIX__VERSION_1_0;\r\n\r\n#endif /* !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES) */\r\n\r\n/***************************************************************\r\n* cl_khr_egl_event\r\n***************************************************************/\r\n#define cl_khr_egl_event 1\r\n#define CL_KHR_EGL_EVENT_EXTENSION_NAME \\\r\n    \"cl_khr_egl_event\"\r\n\r\n\r\n#define CL_KHR_EGL_EVENT_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)\r\n\r\n/* CLeglDisplayKHR is an opaque handle to an EGLDisplay */\r\n/* type CLeglDisplayKHR */\r\n\r\n/* CLeglSyncKHR is an opaque handle to an EGLSync object */\r\ntypedef void*               CLeglSyncKHR;\r\n\r\n\r\ntypedef cl_event CL_API_CALL\r\nclCreateEventFromEGLSyncKHR_t(\r\n    cl_context context,\r\n    CLeglSyncKHR sync,\r\n    CLeglDisplayKHR display,\r\n    cl_int* errcode_ret);\r\n\r\ntypedef clCreateEventFromEGLSyncKHR_t *\r\nclCreateEventFromEGLSyncKHR_fn 
CL_API_SUFFIX__VERSION_1_0;\r\n\r\n#if !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES)\r\n\r\nextern CL_API_ENTRY cl_event CL_API_CALL\r\nclCreateEventFromEGLSyncKHR(\r\n    cl_context context,\r\n    CLeglSyncKHR sync,\r\n    CLeglDisplayKHR display,\r\n    cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_0;\r\n\r\n#endif /* !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES) */\r\n\r\n#ifdef __cplusplus\r\n}\r\n#endif\r\n\r\n#endif /* OPENCL_CL_EGL_H_ */\r\n"
  },
  {
    "path": "svm/OpenCL/include/CL/cl_ext.h",
    "content": "/*******************************************************************************\r\n * Copyright (c) 2008-2023 The Khronos Group Inc.\r\n *\r\n * Licensed under the Apache License, Version 2.0 (the \"License\");\r\n * you may not use this file except in compliance with the License.\r\n * You may obtain a copy of the License at\r\n *\r\n *    http://www.apache.org/licenses/LICENSE-2.0\r\n *\r\n * Unless required by applicable law or agreed to in writing, software\r\n * distributed under the License is distributed on an \"AS IS\" BASIS,\r\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\r\n * See the License for the specific language governing permissions and\r\n * limitations under the License.\r\n ******************************************************************************/\r\n\r\n#ifndef OPENCL_CL_EXT_H_\r\n#define OPENCL_CL_EXT_H_\r\n\r\n/*\r\n** This header is generated from the Khronos OpenCL XML API Registry.\r\n*/\r\n\r\n#include <CL/cl.h>\r\n\r\n/* CL_NO_PROTOTYPES implies CL_NO_EXTENSION_PROTOTYPES: */\r\n#if defined(CL_NO_PROTOTYPES) && !defined(CL_NO_EXTENSION_PROTOTYPES)\r\n#define CL_NO_EXTENSION_PROTOTYPES\r\n#endif\r\n\r\n/* CL_NO_EXTENSION_PROTOTYPES implies\r\n   CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES and\r\n   CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES: */\r\n#if defined(CL_NO_EXTENSION_PROTOTYPES) && \\\r\n    !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES)\r\n#define CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES\r\n#endif\r\n#if defined(CL_NO_EXTENSION_PROTOTYPES) && \\\r\n    !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES)\r\n#define CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES\r\n#endif\r\n\r\n#ifdef __cplusplus\r\nextern \"C\" {\r\n#endif\r\n\r\n/***************************************************************\r\n* cl_khr_command_buffer\r\n***************************************************************/\r\n#define cl_khr_command_buffer 1\r\n#define CL_KHR_COMMAND_BUFFER_EXTENSION_NAME \\\r\n    
\"cl_khr_command_buffer\"\r\n\r\n\r\n#define CL_KHR_COMMAND_BUFFER_EXTENSION_VERSION CL_MAKE_VERSION(0, 9, 5)\r\n\r\ntypedef cl_bitfield         cl_device_command_buffer_capabilities_khr;\r\ntypedef struct _cl_command_buffer_khr* cl_command_buffer_khr;\r\ntypedef cl_uint             cl_sync_point_khr;\r\ntypedef cl_uint             cl_command_buffer_info_khr;\r\ntypedef cl_uint             cl_command_buffer_state_khr;\r\ntypedef cl_properties       cl_command_buffer_properties_khr;\r\ntypedef cl_bitfield         cl_command_buffer_flags_khr;\r\ntypedef cl_properties       cl_command_properties_khr;\r\ntypedef struct _cl_mutable_command_khr* cl_mutable_command_khr;\r\n\r\n/* cl_device_info */\r\n#define CL_DEVICE_COMMAND_BUFFER_CAPABILITIES_KHR           0x12A9\r\n#define CL_DEVICE_COMMAND_BUFFER_REQUIRED_QUEUE_PROPERTIES_KHR 0x12AA\r\n\r\n/* cl_device_command_buffer_capabilities_khr - bitfield */\r\n#define CL_COMMAND_BUFFER_CAPABILITY_KERNEL_PRINTF_KHR      (1 << 0)\r\n#define CL_COMMAND_BUFFER_CAPABILITY_DEVICE_SIDE_ENQUEUE_KHR (1 << 1)\r\n#define CL_COMMAND_BUFFER_CAPABILITY_SIMULTANEOUS_USE_KHR   (1 << 2)\r\n#define CL_COMMAND_BUFFER_CAPABILITY_OUT_OF_ORDER_KHR       (1 << 3)\r\n\r\n/* cl_command_buffer_properties_khr */\r\n#define CL_COMMAND_BUFFER_FLAGS_KHR                         0x1293\r\n\r\n/* cl_command_buffer_flags_khr - bitfield */\r\n#define CL_COMMAND_BUFFER_SIMULTANEOUS_USE_KHR              (1 << 0)\r\n\r\n/* Error codes */\r\n#define CL_INVALID_COMMAND_BUFFER_KHR                       -1138\r\n#define CL_INVALID_SYNC_POINT_WAIT_LIST_KHR                 -1139\r\n#define CL_INCOMPATIBLE_COMMAND_QUEUE_KHR                   -1140\r\n\r\n/* cl_command_buffer_info_khr */\r\n#define CL_COMMAND_BUFFER_QUEUES_KHR                        0x1294\r\n#define CL_COMMAND_BUFFER_NUM_QUEUES_KHR                    0x1295\r\n#define CL_COMMAND_BUFFER_REFERENCE_COUNT_KHR               0x1296\r\n#define CL_COMMAND_BUFFER_STATE_KHR                         
0x1297\r\n#define CL_COMMAND_BUFFER_PROPERTIES_ARRAY_KHR              0x1298\r\n#define CL_COMMAND_BUFFER_CONTEXT_KHR                       0x1299\r\n\r\n/* cl_command_buffer_state_khr */\r\n#define CL_COMMAND_BUFFER_STATE_RECORDING_KHR               0\r\n#define CL_COMMAND_BUFFER_STATE_EXECUTABLE_KHR              1\r\n#define CL_COMMAND_BUFFER_STATE_PENDING_KHR                 2\r\n\r\n/* cl_command_type */\r\n#define CL_COMMAND_COMMAND_BUFFER_KHR                       0x12A8\r\n\r\n\r\ntypedef cl_command_buffer_khr CL_API_CALL\r\nclCreateCommandBufferKHR_t(\r\n    cl_uint num_queues,\r\n    const cl_command_queue* queues,\r\n    const cl_command_buffer_properties_khr* properties,\r\n    cl_int* errcode_ret);\r\n\r\ntypedef clCreateCommandBufferKHR_t *\r\nclCreateCommandBufferKHR_fn ;\r\n\r\ntypedef cl_int CL_API_CALL\r\nclFinalizeCommandBufferKHR_t(\r\n    cl_command_buffer_khr command_buffer);\r\n\r\ntypedef clFinalizeCommandBufferKHR_t *\r\nclFinalizeCommandBufferKHR_fn ;\r\n\r\ntypedef cl_int CL_API_CALL\r\nclRetainCommandBufferKHR_t(\r\n    cl_command_buffer_khr command_buffer);\r\n\r\ntypedef clRetainCommandBufferKHR_t *\r\nclRetainCommandBufferKHR_fn ;\r\n\r\ntypedef cl_int CL_API_CALL\r\nclReleaseCommandBufferKHR_t(\r\n    cl_command_buffer_khr command_buffer);\r\n\r\ntypedef clReleaseCommandBufferKHR_t *\r\nclReleaseCommandBufferKHR_fn ;\r\n\r\ntypedef cl_int CL_API_CALL\r\nclEnqueueCommandBufferKHR_t(\r\n    cl_uint num_queues,\r\n    cl_command_queue* queues,\r\n    cl_command_buffer_khr command_buffer,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event);\r\n\r\ntypedef clEnqueueCommandBufferKHR_t *\r\nclEnqueueCommandBufferKHR_fn ;\r\n\r\ntypedef cl_int CL_API_CALL\r\nclCommandBarrierWithWaitListKHR_t(\r\n    cl_command_buffer_khr command_buffer,\r\n    cl_command_queue command_queue,\r\n    const cl_command_properties_khr* properties,\r\n    cl_uint num_sync_points_in_wait_list,\r\n    const 
cl_sync_point_khr* sync_point_wait_list,\r\n    cl_sync_point_khr* sync_point,\r\n    cl_mutable_command_khr* mutable_handle);\r\n\r\ntypedef clCommandBarrierWithWaitListKHR_t *\r\nclCommandBarrierWithWaitListKHR_fn ;\r\n\r\ntypedef cl_int CL_API_CALL\r\nclCommandCopyBufferKHR_t(\r\n    cl_command_buffer_khr command_buffer,\r\n    cl_command_queue command_queue,\r\n    const cl_command_properties_khr* properties,\r\n    cl_mem src_buffer,\r\n    cl_mem dst_buffer,\r\n    size_t src_offset,\r\n    size_t dst_offset,\r\n    size_t size,\r\n    cl_uint num_sync_points_in_wait_list,\r\n    const cl_sync_point_khr* sync_point_wait_list,\r\n    cl_sync_point_khr* sync_point,\r\n    cl_mutable_command_khr* mutable_handle);\r\n\r\ntypedef clCommandCopyBufferKHR_t *\r\nclCommandCopyBufferKHR_fn ;\r\n\r\ntypedef cl_int CL_API_CALL\r\nclCommandCopyBufferRectKHR_t(\r\n    cl_command_buffer_khr command_buffer,\r\n    cl_command_queue command_queue,\r\n    const cl_command_properties_khr* properties,\r\n    cl_mem src_buffer,\r\n    cl_mem dst_buffer,\r\n    const size_t* src_origin,\r\n    const size_t* dst_origin,\r\n    const size_t* region,\r\n    size_t src_row_pitch,\r\n    size_t src_slice_pitch,\r\n    size_t dst_row_pitch,\r\n    size_t dst_slice_pitch,\r\n    cl_uint num_sync_points_in_wait_list,\r\n    const cl_sync_point_khr* sync_point_wait_list,\r\n    cl_sync_point_khr* sync_point,\r\n    cl_mutable_command_khr* mutable_handle);\r\n\r\ntypedef clCommandCopyBufferRectKHR_t *\r\nclCommandCopyBufferRectKHR_fn ;\r\n\r\ntypedef cl_int CL_API_CALL\r\nclCommandCopyBufferToImageKHR_t(\r\n    cl_command_buffer_khr command_buffer,\r\n    cl_command_queue command_queue,\r\n    const cl_command_properties_khr* properties,\r\n    cl_mem src_buffer,\r\n    cl_mem dst_image,\r\n    size_t src_offset,\r\n    const size_t* dst_origin,\r\n    const size_t* region,\r\n    cl_uint num_sync_points_in_wait_list,\r\n    const cl_sync_point_khr* sync_point_wait_list,\r\n    
cl_sync_point_khr* sync_point,\r\n    cl_mutable_command_khr* mutable_handle);\r\n\r\ntypedef clCommandCopyBufferToImageKHR_t *\r\nclCommandCopyBufferToImageKHR_fn ;\r\n\r\ntypedef cl_int CL_API_CALL\r\nclCommandCopyImageKHR_t(\r\n    cl_command_buffer_khr command_buffer,\r\n    cl_command_queue command_queue,\r\n    const cl_command_properties_khr* properties,\r\n    cl_mem src_image,\r\n    cl_mem dst_image,\r\n    const size_t* src_origin,\r\n    const size_t* dst_origin,\r\n    const size_t* region,\r\n    cl_uint num_sync_points_in_wait_list,\r\n    const cl_sync_point_khr* sync_point_wait_list,\r\n    cl_sync_point_khr* sync_point,\r\n    cl_mutable_command_khr* mutable_handle);\r\n\r\ntypedef clCommandCopyImageKHR_t *\r\nclCommandCopyImageKHR_fn ;\r\n\r\ntypedef cl_int CL_API_CALL\r\nclCommandCopyImageToBufferKHR_t(\r\n    cl_command_buffer_khr command_buffer,\r\n    cl_command_queue command_queue,\r\n    const cl_command_properties_khr* properties,\r\n    cl_mem src_image,\r\n    cl_mem dst_buffer,\r\n    const size_t* src_origin,\r\n    const size_t* region,\r\n    size_t dst_offset,\r\n    cl_uint num_sync_points_in_wait_list,\r\n    const cl_sync_point_khr* sync_point_wait_list,\r\n    cl_sync_point_khr* sync_point,\r\n    cl_mutable_command_khr* mutable_handle);\r\n\r\ntypedef clCommandCopyImageToBufferKHR_t *\r\nclCommandCopyImageToBufferKHR_fn ;\r\n\r\ntypedef cl_int CL_API_CALL\r\nclCommandFillBufferKHR_t(\r\n    cl_command_buffer_khr command_buffer,\r\n    cl_command_queue command_queue,\r\n    const cl_command_properties_khr* properties,\r\n    cl_mem buffer,\r\n    const void* pattern,\r\n    size_t pattern_size,\r\n    size_t offset,\r\n    size_t size,\r\n    cl_uint num_sync_points_in_wait_list,\r\n    const cl_sync_point_khr* sync_point_wait_list,\r\n    cl_sync_point_khr* sync_point,\r\n    cl_mutable_command_khr* mutable_handle);\r\n\r\ntypedef clCommandFillBufferKHR_t *\r\nclCommandFillBufferKHR_fn ;\r\n\r\ntypedef cl_int 
CL_API_CALL\r\nclCommandFillImageKHR_t(\r\n    cl_command_buffer_khr command_buffer,\r\n    cl_command_queue command_queue,\r\n    const cl_command_properties_khr* properties,\r\n    cl_mem image,\r\n    const void* fill_color,\r\n    const size_t* origin,\r\n    const size_t* region,\r\n    cl_uint num_sync_points_in_wait_list,\r\n    const cl_sync_point_khr* sync_point_wait_list,\r\n    cl_sync_point_khr* sync_point,\r\n    cl_mutable_command_khr* mutable_handle);\r\n\r\ntypedef clCommandFillImageKHR_t *\r\nclCommandFillImageKHR_fn ;\r\n\r\ntypedef cl_int CL_API_CALL\r\nclCommandNDRangeKernelKHR_t(\r\n    cl_command_buffer_khr command_buffer,\r\n    cl_command_queue command_queue,\r\n    const cl_command_properties_khr* properties,\r\n    cl_kernel kernel,\r\n    cl_uint work_dim,\r\n    const size_t* global_work_offset,\r\n    const size_t* global_work_size,\r\n    const size_t* local_work_size,\r\n    cl_uint num_sync_points_in_wait_list,\r\n    const cl_sync_point_khr* sync_point_wait_list,\r\n    cl_sync_point_khr* sync_point,\r\n    cl_mutable_command_khr* mutable_handle);\r\n\r\ntypedef clCommandNDRangeKernelKHR_t *\r\nclCommandNDRangeKernelKHR_fn ;\r\n\r\ntypedef cl_int CL_API_CALL\r\nclGetCommandBufferInfoKHR_t(\r\n    cl_command_buffer_khr command_buffer,\r\n    cl_command_buffer_info_khr param_name,\r\n    size_t param_value_size,\r\n    void* param_value,\r\n    size_t* param_value_size_ret);\r\n\r\ntypedef clGetCommandBufferInfoKHR_t *\r\nclGetCommandBufferInfoKHR_fn ;\r\n\r\n#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES)\r\n\r\nextern CL_API_ENTRY cl_command_buffer_khr CL_API_CALL\r\nclCreateCommandBufferKHR(\r\n    cl_uint num_queues,\r\n    const cl_command_queue* queues,\r\n    const cl_command_buffer_properties_khr* properties,\r\n    cl_int* errcode_ret) ;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclFinalizeCommandBufferKHR(\r\n    cl_command_buffer_khr command_buffer) ;\r\n\r\nextern CL_API_ENTRY cl_int 
CL_API_CALL\r\nclRetainCommandBufferKHR(\r\n    cl_command_buffer_khr command_buffer) ;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclReleaseCommandBufferKHR(\r\n    cl_command_buffer_khr command_buffer) ;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclEnqueueCommandBufferKHR(\r\n    cl_uint num_queues,\r\n    cl_command_queue* queues,\r\n    cl_command_buffer_khr command_buffer,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event) ;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclCommandBarrierWithWaitListKHR(\r\n    cl_command_buffer_khr command_buffer,\r\n    cl_command_queue command_queue,\r\n    const cl_command_properties_khr* properties,\r\n    cl_uint num_sync_points_in_wait_list,\r\n    const cl_sync_point_khr* sync_point_wait_list,\r\n    cl_sync_point_khr* sync_point,\r\n    cl_mutable_command_khr* mutable_handle) ;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclCommandCopyBufferKHR(\r\n    cl_command_buffer_khr command_buffer,\r\n    cl_command_queue command_queue,\r\n    const cl_command_properties_khr* properties,\r\n    cl_mem src_buffer,\r\n    cl_mem dst_buffer,\r\n    size_t src_offset,\r\n    size_t dst_offset,\r\n    size_t size,\r\n    cl_uint num_sync_points_in_wait_list,\r\n    const cl_sync_point_khr* sync_point_wait_list,\r\n    cl_sync_point_khr* sync_point,\r\n    cl_mutable_command_khr* mutable_handle) ;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclCommandCopyBufferRectKHR(\r\n    cl_command_buffer_khr command_buffer,\r\n    cl_command_queue command_queue,\r\n    const cl_command_properties_khr* properties,\r\n    cl_mem src_buffer,\r\n    cl_mem dst_buffer,\r\n    const size_t* src_origin,\r\n    const size_t* dst_origin,\r\n    const size_t* region,\r\n    size_t src_row_pitch,\r\n    size_t src_slice_pitch,\r\n    size_t dst_row_pitch,\r\n    size_t dst_slice_pitch,\r\n    cl_uint num_sync_points_in_wait_list,\r\n    const cl_sync_point_khr* 
sync_point_wait_list,\r\n    cl_sync_point_khr* sync_point,\r\n    cl_mutable_command_khr* mutable_handle) ;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclCommandCopyBufferToImageKHR(\r\n    cl_command_buffer_khr command_buffer,\r\n    cl_command_queue command_queue,\r\n    const cl_command_properties_khr* properties,\r\n    cl_mem src_buffer,\r\n    cl_mem dst_image,\r\n    size_t src_offset,\r\n    const size_t* dst_origin,\r\n    const size_t* region,\r\n    cl_uint num_sync_points_in_wait_list,\r\n    const cl_sync_point_khr* sync_point_wait_list,\r\n    cl_sync_point_khr* sync_point,\r\n    cl_mutable_command_khr* mutable_handle) ;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclCommandCopyImageKHR(\r\n    cl_command_buffer_khr command_buffer,\r\n    cl_command_queue command_queue,\r\n    const cl_command_properties_khr* properties,\r\n    cl_mem src_image,\r\n    cl_mem dst_image,\r\n    const size_t* src_origin,\r\n    const size_t* dst_origin,\r\n    const size_t* region,\r\n    cl_uint num_sync_points_in_wait_list,\r\n    const cl_sync_point_khr* sync_point_wait_list,\r\n    cl_sync_point_khr* sync_point,\r\n    cl_mutable_command_khr* mutable_handle) ;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclCommandCopyImageToBufferKHR(\r\n    cl_command_buffer_khr command_buffer,\r\n    cl_command_queue command_queue,\r\n    const cl_command_properties_khr* properties,\r\n    cl_mem src_image,\r\n    cl_mem dst_buffer,\r\n    const size_t* src_origin,\r\n    const size_t* region,\r\n    size_t dst_offset,\r\n    cl_uint num_sync_points_in_wait_list,\r\n    const cl_sync_point_khr* sync_point_wait_list,\r\n    cl_sync_point_khr* sync_point,\r\n    cl_mutable_command_khr* mutable_handle) ;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclCommandFillBufferKHR(\r\n    cl_command_buffer_khr command_buffer,\r\n    cl_command_queue command_queue,\r\n    const cl_command_properties_khr* properties,\r\n    cl_mem buffer,\r\n    const void* pattern,\r\n    
size_t pattern_size,\r\n    size_t offset,\r\n    size_t size,\r\n    cl_uint num_sync_points_in_wait_list,\r\n    const cl_sync_point_khr* sync_point_wait_list,\r\n    cl_sync_point_khr* sync_point,\r\n    cl_mutable_command_khr* mutable_handle) ;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclCommandFillImageKHR(\r\n    cl_command_buffer_khr command_buffer,\r\n    cl_command_queue command_queue,\r\n    const cl_command_properties_khr* properties,\r\n    cl_mem image,\r\n    const void* fill_color,\r\n    const size_t* origin,\r\n    const size_t* region,\r\n    cl_uint num_sync_points_in_wait_list,\r\n    const cl_sync_point_khr* sync_point_wait_list,\r\n    cl_sync_point_khr* sync_point,\r\n    cl_mutable_command_khr* mutable_handle) ;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclCommandNDRangeKernelKHR(\r\n    cl_command_buffer_khr command_buffer,\r\n    cl_command_queue command_queue,\r\n    const cl_command_properties_khr* properties,\r\n    cl_kernel kernel,\r\n    cl_uint work_dim,\r\n    const size_t* global_work_offset,\r\n    const size_t* global_work_size,\r\n    const size_t* local_work_size,\r\n    cl_uint num_sync_points_in_wait_list,\r\n    const cl_sync_point_khr* sync_point_wait_list,\r\n    cl_sync_point_khr* sync_point,\r\n    cl_mutable_command_khr* mutable_handle) ;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclGetCommandBufferInfoKHR(\r\n    cl_command_buffer_khr command_buffer,\r\n    cl_command_buffer_info_khr param_name,\r\n    size_t param_value_size,\r\n    void* param_value,\r\n    size_t* param_value_size_ret) ;\r\n\r\n#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */\r\n\r\n/* From version 0.9.4 of the extension */\r\n\r\ntypedef cl_int CL_API_CALL\r\nclCommandSVMMemcpyKHR_t(\r\n    cl_command_buffer_khr command_buffer,\r\n    cl_command_queue command_queue,\r\n    const cl_command_properties_khr* properties,\r\n    void* dst_ptr,\r\n    const void* src_ptr,\r\n    size_t size,\r\n    cl_uint 
num_sync_points_in_wait_list,\r\n    const cl_sync_point_khr* sync_point_wait_list,\r\n    cl_sync_point_khr* sync_point,\r\n    cl_mutable_command_khr* mutable_handle);\r\n\r\ntypedef clCommandSVMMemcpyKHR_t *\r\nclCommandSVMMemcpyKHR_fn CL_API_SUFFIX__VERSION_2_0;\r\n\r\ntypedef cl_int CL_API_CALL\r\nclCommandSVMMemFillKHR_t(\r\n    cl_command_buffer_khr command_buffer,\r\n    cl_command_queue command_queue,\r\n    const cl_command_properties_khr* properties,\r\n    void* svm_ptr,\r\n    const void* pattern,\r\n    size_t pattern_size,\r\n    size_t size,\r\n    cl_uint num_sync_points_in_wait_list,\r\n    const cl_sync_point_khr* sync_point_wait_list,\r\n    cl_sync_point_khr* sync_point,\r\n    cl_mutable_command_khr* mutable_handle);\r\n\r\ntypedef clCommandSVMMemFillKHR_t *\r\nclCommandSVMMemFillKHR_fn CL_API_SUFFIX__VERSION_2_0;\r\n\r\n#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES)\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclCommandSVMMemcpyKHR(\r\n    cl_command_buffer_khr command_buffer,\r\n    cl_command_queue command_queue,\r\n    const cl_command_properties_khr* properties,\r\n    void* dst_ptr,\r\n    const void* src_ptr,\r\n    size_t size,\r\n    cl_uint num_sync_points_in_wait_list,\r\n    const cl_sync_point_khr* sync_point_wait_list,\r\n    cl_sync_point_khr* sync_point,\r\n    cl_mutable_command_khr* mutable_handle) CL_API_SUFFIX__VERSION_2_0;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclCommandSVMMemFillKHR(\r\n    cl_command_buffer_khr command_buffer,\r\n    cl_command_queue command_queue,\r\n    const cl_command_properties_khr* properties,\r\n    void* svm_ptr,\r\n    const void* pattern,\r\n    size_t pattern_size,\r\n    size_t size,\r\n    cl_uint num_sync_points_in_wait_list,\r\n    const cl_sync_point_khr* sync_point_wait_list,\r\n    cl_sync_point_khr* sync_point,\r\n    cl_mutable_command_khr* mutable_handle) CL_API_SUFFIX__VERSION_2_0;\r\n\r\n#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) 
*/\r\n\r\n/***************************************************************\r\n* cl_khr_command_buffer_multi_device\r\n***************************************************************/\r\n#define cl_khr_command_buffer_multi_device 1\r\n#define CL_KHR_COMMAND_BUFFER_MULTI_DEVICE_EXTENSION_NAME \\\r\n    \"cl_khr_command_buffer_multi_device\"\r\n\r\n\r\n#define CL_KHR_COMMAND_BUFFER_MULTI_DEVICE_EXTENSION_VERSION CL_MAKE_VERSION(0, 9, 1)\r\n\r\ntypedef cl_bitfield         cl_platform_command_buffer_capabilities_khr;\r\n\r\n/* cl_platform_info */\r\n#define CL_PLATFORM_COMMAND_BUFFER_CAPABILITIES_KHR         0x0908\r\n\r\n/* cl_platform_command_buffer_capabilities_khr - bitfield */\r\n#define CL_COMMAND_BUFFER_PLATFORM_UNIVERSAL_SYNC_KHR       (1 << 0)\r\n#define CL_COMMAND_BUFFER_PLATFORM_REMAP_QUEUES_KHR         (1 << 1)\r\n#define CL_COMMAND_BUFFER_PLATFORM_AUTOMATIC_REMAP_KHR      (1 << 2)\r\n\r\n/* cl_device_info */\r\n#define CL_DEVICE_COMMAND_BUFFER_NUM_SYNC_DEVICES_KHR       0x12AB\r\n#define CL_DEVICE_COMMAND_BUFFER_SYNC_DEVICES_KHR           0x12AC\r\n\r\n/* cl_device_command_buffer_capabilities_khr - bitfield */\r\n#define CL_COMMAND_BUFFER_CAPABILITY_MULTIPLE_QUEUE_KHR     (1 << 4)\r\n\r\n/* cl_command_buffer_flags_khr - bitfield */\r\n#define CL_COMMAND_BUFFER_DEVICE_SIDE_SYNC_KHR              (1 << 2)\r\n\r\n\r\ntypedef cl_command_buffer_khr CL_API_CALL\r\nclRemapCommandBufferKHR_t(\r\n    cl_command_buffer_khr command_buffer,\r\n    cl_bool automatic,\r\n    cl_uint num_queues,\r\n    const cl_command_queue* queues,\r\n    cl_uint num_handles,\r\n    const cl_mutable_command_khr* handles,\r\n    cl_mutable_command_khr* handles_ret,\r\n    cl_int* errcode_ret);\r\n\r\ntypedef clRemapCommandBufferKHR_t *\r\nclRemapCommandBufferKHR_fn ;\r\n\r\n#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES)\r\n\r\nextern CL_API_ENTRY cl_command_buffer_khr CL_API_CALL\r\nclRemapCommandBufferKHR(\r\n    cl_command_buffer_khr command_buffer,\r\n    cl_bool 
automatic,\r\n    cl_uint num_queues,\r\n    const cl_command_queue* queues,\r\n    cl_uint num_handles,\r\n    const cl_mutable_command_khr* handles,\r\n    cl_mutable_command_khr* handles_ret,\r\n    cl_int* errcode_ret) ;\r\n\r\n#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */\r\n\r\n/***************************************************************\r\n* cl_khr_command_buffer_mutable_dispatch\r\n***************************************************************/\r\n#define cl_khr_command_buffer_mutable_dispatch 1\r\n#define CL_KHR_COMMAND_BUFFER_MUTABLE_DISPATCH_EXTENSION_NAME \\\r\n    \"cl_khr_command_buffer_mutable_dispatch\"\r\n\r\n\r\n#define CL_KHR_COMMAND_BUFFER_MUTABLE_DISPATCH_EXTENSION_VERSION CL_MAKE_VERSION(0, 9, 3)\r\n\r\ntypedef cl_uint             cl_command_buffer_update_type_khr;\r\ntypedef cl_bitfield         cl_mutable_dispatch_fields_khr;\r\ntypedef cl_uint             cl_mutable_command_info_khr;\r\ntypedef struct _cl_mutable_dispatch_arg_khr {\r\n    cl_uint arg_index;\r\n    size_t arg_size;\r\n    const void* arg_value;\r\n} cl_mutable_dispatch_arg_khr;\r\ntypedef struct _cl_mutable_dispatch_exec_info_khr {\r\n    cl_uint param_name;\r\n    size_t param_value_size;\r\n    const void* param_value;\r\n} cl_mutable_dispatch_exec_info_khr;\r\ntypedef struct _cl_mutable_dispatch_config_khr {\r\n    cl_mutable_command_khr command;\r\n    cl_uint num_args;\r\n    cl_uint num_svm_args;\r\n    cl_uint num_exec_infos;\r\n    cl_uint work_dim;\r\n    const cl_mutable_dispatch_arg_khr* arg_list;\r\n    const cl_mutable_dispatch_arg_khr* arg_svm_list;\r\n    const cl_mutable_dispatch_exec_info_khr* exec_info_list;\r\n    const size_t* global_work_offset;\r\n    const size_t* global_work_size;\r\n    const size_t* local_work_size;\r\n} cl_mutable_dispatch_config_khr;\r\ntypedef cl_bitfield         cl_mutable_dispatch_asserts_khr;\r\n\r\n/* cl_command_buffer_flags_khr - bitfield */\r\n#define CL_COMMAND_BUFFER_MUTABLE_KHR                    
   (1 << 1)\r\n\r\n/* Error codes */\r\n#define CL_INVALID_MUTABLE_COMMAND_KHR                      -1141\r\n\r\n/* cl_device_info */\r\n#define CL_DEVICE_MUTABLE_DISPATCH_CAPABILITIES_KHR         0x12B0\r\n\r\n/* cl_command_properties_khr */\r\n#define CL_MUTABLE_DISPATCH_UPDATABLE_FIELDS_KHR            0x12B1\r\n\r\n/* cl_mutable_dispatch_fields_khr - bitfield */\r\n#define CL_MUTABLE_DISPATCH_GLOBAL_OFFSET_KHR               (1 << 0)\r\n#define CL_MUTABLE_DISPATCH_GLOBAL_SIZE_KHR                 (1 << 1)\r\n#define CL_MUTABLE_DISPATCH_LOCAL_SIZE_KHR                  (1 << 2)\r\n#define CL_MUTABLE_DISPATCH_ARGUMENTS_KHR                   (1 << 3)\r\n#define CL_MUTABLE_DISPATCH_EXEC_INFO_KHR                   (1 << 4)\r\n\r\n/* cl_mutable_command_info_khr */\r\n#define CL_MUTABLE_COMMAND_COMMAND_QUEUE_KHR                0x12A0\r\n#define CL_MUTABLE_COMMAND_COMMAND_BUFFER_KHR               0x12A1\r\n#define CL_MUTABLE_COMMAND_COMMAND_TYPE_KHR                 0x12AD\r\n#define CL_MUTABLE_COMMAND_PROPERTIES_ARRAY_KHR             0x12A2\r\n#define CL_MUTABLE_DISPATCH_KERNEL_KHR                      0x12A3\r\n#define CL_MUTABLE_DISPATCH_DIMENSIONS_KHR                  0x12A4\r\n#define CL_MUTABLE_DISPATCH_GLOBAL_WORK_OFFSET_KHR          0x12A5\r\n#define CL_MUTABLE_DISPATCH_GLOBAL_WORK_SIZE_KHR            0x12A6\r\n#define CL_MUTABLE_DISPATCH_LOCAL_WORK_SIZE_KHR             0x12A7\r\n\r\n/* cl_command_buffer_update_type_khr */\r\n#define CL_STRUCTURE_TYPE_MUTABLE_DISPATCH_CONFIG_KHR       0\r\n\r\n/* cl_command_buffer_properties_khr */\r\n#define CL_COMMAND_BUFFER_MUTABLE_DISPATCH_ASSERTS_KHR      0x12B7\r\n\r\n/* cl_command_properties_khr */\r\n#define CL_MUTABLE_DISPATCH_ASSERTS_KHR                     0x12B8\r\n\r\n/* cl_mutable_dispatch_asserts_khr - bitfield */\r\n#define CL_MUTABLE_DISPATCH_ASSERT_NO_ADDITIONAL_WORK_GROUPS_KHR (1 << 0)\r\n\r\n\r\ntypedef cl_int CL_API_CALL\r\nclUpdateMutableCommandsKHR_t(\r\n    cl_command_buffer_khr command_buffer,\r\n    cl_uint 
num_configs,\r\n    const cl_command_buffer_update_type_khr* config_types,\r\n    const void** configs);\r\n\r\ntypedef clUpdateMutableCommandsKHR_t *\r\nclUpdateMutableCommandsKHR_fn ;\r\n\r\ntypedef cl_int CL_API_CALL\r\nclGetMutableCommandInfoKHR_t(\r\n    cl_mutable_command_khr command,\r\n    cl_mutable_command_info_khr param_name,\r\n    size_t param_value_size,\r\n    void* param_value,\r\n    size_t* param_value_size_ret);\r\n\r\ntypedef clGetMutableCommandInfoKHR_t *\r\nclGetMutableCommandInfoKHR_fn ;\r\n\r\n#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES)\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclUpdateMutableCommandsKHR(\r\n    cl_command_buffer_khr command_buffer,\r\n    cl_uint num_configs,\r\n    const cl_command_buffer_update_type_khr* config_types,\r\n    const void** configs) ;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclGetMutableCommandInfoKHR(\r\n    cl_mutable_command_khr command,\r\n    cl_mutable_command_info_khr param_name,\r\n    size_t param_value_size,\r\n    void* param_value,\r\n    size_t* param_value_size_ret) ;\r\n\r\n#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */\r\n\r\n/***************************************************************\r\n* cl_khr_fp64\r\n***************************************************************/\r\n#define cl_khr_fp64 1\r\n#define CL_KHR_FP64_EXTENSION_NAME \\\r\n    \"cl_khr_fp64\"\r\n\r\n\r\n#define CL_KHR_FP64_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)\r\n\r\n#if !defined(CL_VERSION_1_2)\r\n/* cl_device_info - defined in CL.h for OpenCL 1.2 and newer */\r\n#define CL_DEVICE_DOUBLE_FP_CONFIG                          0x1032\r\n\r\n#endif /* !defined(CL_VERSION_1_2) */\r\n\r\n/***************************************************************\r\n* cl_khr_fp16\r\n***************************************************************/\r\n#define cl_khr_fp16 1\r\n#define CL_KHR_FP16_EXTENSION_NAME \\\r\n    \"cl_khr_fp16\"\r\n\r\n\r\n#define CL_KHR_FP16_EXTENSION_VERSION 
CL_MAKE_VERSION(1, 0, 0)\r\n\r\n/* cl_device_info */\r\n#define CL_DEVICE_HALF_FP_CONFIG                            0x1033\r\n\r\n/***************************************************************\r\n* cl_APPLE_SetMemObjectDestructor\r\n***************************************************************/\r\n#define cl_APPLE_SetMemObjectDestructor 1\r\n#define CL_APPLE_SETMEMOBJECTDESTRUCTOR_EXTENSION_NAME \\\r\n    \"cl_APPLE_SetMemObjectDestructor\"\r\n\r\n\r\n#define CL_APPLE_SETMEMOBJECTDESTRUCTOR_EXTENSION_VERSION CL_MAKE_VERSION(0, 0, 0)\r\n\r\n\r\ntypedef cl_int CL_API_CALL\r\nclSetMemObjectDestructorAPPLE_t(\r\n    cl_mem memobj,\r\n    void (CL_CALLBACK* pfn_notify)(cl_mem memobj, void* user_data),\r\n    void* user_data);\r\n\r\ntypedef clSetMemObjectDestructorAPPLE_t *\r\nclSetMemObjectDestructorAPPLE_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\n#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES)\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclSetMemObjectDestructorAPPLE(\r\n    cl_mem memobj,\r\n    void (CL_CALLBACK* pfn_notify)(cl_mem memobj, void* user_data),\r\n    void* user_data) CL_API_SUFFIX__VERSION_1_0;\r\n\r\n#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */\r\n\r\n/***************************************************************\r\n* cl_APPLE_ContextLoggingFunctions\r\n***************************************************************/\r\n#define cl_APPLE_ContextLoggingFunctions 1\r\n#define CL_APPLE_CONTEXTLOGGINGFUNCTIONS_EXTENSION_NAME \\\r\n    \"cl_APPLE_ContextLoggingFunctions\"\r\n\r\n\r\n#define CL_APPLE_CONTEXTLOGGINGFUNCTIONS_EXTENSION_VERSION CL_MAKE_VERSION(0, 0, 0)\r\n\r\n\r\ntypedef void CL_API_CALL\r\nclLogMessagesToSystemLogAPPLE_t(\r\n    const char* errstr,\r\n    const void* private_info,\r\n    size_t cb,\r\n    void* user_data);\r\n\r\ntypedef clLogMessagesToSystemLogAPPLE_t *\r\nclLogMessagesToSystemLogAPPLE_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef void CL_API_CALL\r\nclLogMessagesToStdoutAPPLE_t(\r\n    
const char* errstr,\r\n    const void* private_info,\r\n    size_t cb,\r\n    void* user_data);\r\n\r\ntypedef clLogMessagesToStdoutAPPLE_t *\r\nclLogMessagesToStdoutAPPLE_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef void CL_API_CALL\r\nclLogMessagesToStderrAPPLE_t(\r\n    const char* errstr,\r\n    const void* private_info,\r\n    size_t cb,\r\n    void* user_data);\r\n\r\ntypedef clLogMessagesToStderrAPPLE_t *\r\nclLogMessagesToStderrAPPLE_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\n#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES)\r\n\r\nextern CL_API_ENTRY void CL_API_CALL\r\nclLogMessagesToSystemLogAPPLE(\r\n    const char* errstr,\r\n    const void* private_info,\r\n    size_t cb,\r\n    void* user_data) CL_API_SUFFIX__VERSION_1_0;\r\n\r\nextern CL_API_ENTRY void CL_API_CALL\r\nclLogMessagesToStdoutAPPLE(\r\n    const char* errstr,\r\n    const void* private_info,\r\n    size_t cb,\r\n    void* user_data) CL_API_SUFFIX__VERSION_1_0;\r\n\r\nextern CL_API_ENTRY void CL_API_CALL\r\nclLogMessagesToStderrAPPLE(\r\n    const char* errstr,\r\n    const void* private_info,\r\n    size_t cb,\r\n    void* user_data) CL_API_SUFFIX__VERSION_1_0;\r\n\r\n#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */\r\n\r\n/***************************************************************\r\n* cl_khr_icd\r\n***************************************************************/\r\n#define cl_khr_icd 1\r\n#define CL_KHR_ICD_EXTENSION_NAME \\\r\n    \"cl_khr_icd\"\r\n\r\n\r\n#define CL_KHR_ICD_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)\r\n\r\n/* cl_platform_info */\r\n#define CL_PLATFORM_ICD_SUFFIX_KHR                          0x0920\r\n\r\n/* Error codes */\r\n#define CL_PLATFORM_NOT_FOUND_KHR                           -1001\r\n\r\n\r\ntypedef cl_int CL_API_CALL\r\nclIcdGetPlatformIDsKHR_t(\r\n    cl_uint num_entries,\r\n    cl_platform_id* platforms,\r\n    cl_uint* num_platforms);\r\n\r\ntypedef clIcdGetPlatformIDsKHR_t *\r\nclIcdGetPlatformIDsKHR_fn ;\r\n\r\n#if 
!defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES)\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclIcdGetPlatformIDsKHR(\r\n    cl_uint num_entries,\r\n    cl_platform_id* platforms,\r\n    cl_uint* num_platforms) ;\r\n\r\n#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */\r\n\r\n/***************************************************************\r\n* cl_khr_il_program\r\n***************************************************************/\r\n#define cl_khr_il_program 1\r\n#define CL_KHR_IL_PROGRAM_EXTENSION_NAME \\\r\n    \"cl_khr_il_program\"\r\n\r\n\r\n#define CL_KHR_IL_PROGRAM_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)\r\n\r\n/* cl_device_info */\r\n#define CL_DEVICE_IL_VERSION_KHR                            0x105B\r\n\r\n/* cl_program_info */\r\n#define CL_PROGRAM_IL_KHR                                   0x1169\r\n\r\n\r\ntypedef cl_program CL_API_CALL\r\nclCreateProgramWithILKHR_t(\r\n    cl_context context,\r\n    const void* il,\r\n    size_t length,\r\n    cl_int* errcode_ret);\r\n\r\ntypedef clCreateProgramWithILKHR_t *\r\nclCreateProgramWithILKHR_fn CL_API_SUFFIX__VERSION_1_2;\r\n\r\n#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES)\r\n\r\nextern CL_API_ENTRY cl_program CL_API_CALL\r\nclCreateProgramWithILKHR(\r\n    cl_context context,\r\n    const void* il,\r\n    size_t length,\r\n    cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2;\r\n\r\n#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */\r\n\r\n/***************************************************************\r\n* cl_khr_image2d_from_buffer\r\n***************************************************************/\r\n#define cl_khr_image2d_from_buffer 1\r\n#define CL_KHR_IMAGE2D_FROM_BUFFER_EXTENSION_NAME \\\r\n    \"cl_khr_image2d_from_buffer\"\r\n\r\n\r\n#define CL_KHR_IMAGE2D_FROM_BUFFER_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)\r\n\r\n/* cl_device_info */\r\n#define CL_DEVICE_IMAGE_PITCH_ALIGNMENT_KHR                 0x104A\r\n#define 
CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT_KHR          0x104B\r\n\r\n/***************************************************************\r\n* cl_khr_initialize_memory\r\n***************************************************************/\r\n#define cl_khr_initialize_memory 1\r\n#define CL_KHR_INITIALIZE_MEMORY_EXTENSION_NAME \\\r\n    \"cl_khr_initialize_memory\"\r\n\r\n\r\n#define CL_KHR_INITIALIZE_MEMORY_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)\r\n\r\ntypedef cl_bitfield         cl_context_memory_initialize_khr;\r\n\r\n/* cl_context_properties */\r\n#define CL_CONTEXT_MEMORY_INITIALIZE_KHR                    0x2030\r\n\r\n/* cl_context_memory_initialize_khr */\r\n#define CL_CONTEXT_MEMORY_INITIALIZE_LOCAL_KHR              (1 << 0)\r\n#define CL_CONTEXT_MEMORY_INITIALIZE_PRIVATE_KHR            (1 << 1)\r\n\r\n/***************************************************************\r\n* cl_khr_terminate_context\r\n***************************************************************/\r\n#define cl_khr_terminate_context 1\r\n#define CL_KHR_TERMINATE_CONTEXT_EXTENSION_NAME \\\r\n    \"cl_khr_terminate_context\"\r\n\r\n\r\n#define CL_KHR_TERMINATE_CONTEXT_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)\r\n\r\ntypedef cl_bitfield         cl_device_terminate_capability_khr;\r\n\r\n/* cl_device_info */\r\n#define CL_DEVICE_TERMINATE_CAPABILITY_KHR                  0x2031\r\n\r\n/* cl_context_properties */\r\n#define CL_CONTEXT_TERMINATE_KHR                            0x2032\r\n\r\n/* cl_device_terminate_capability_khr */\r\n#define CL_DEVICE_TERMINATE_CAPABILITY_CONTEXT_KHR          (1 << 0)\r\n\r\n/* Error codes */\r\n#define CL_CONTEXT_TERMINATED_KHR                           -1121\r\n\r\n\r\ntypedef cl_int CL_API_CALL\r\nclTerminateContextKHR_t(\r\n    cl_context context);\r\n\r\ntypedef clTerminateContextKHR_t *\r\nclTerminateContextKHR_fn CL_API_SUFFIX__VERSION_1_2;\r\n\r\n#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES)\r\n\r\nextern CL_API_ENTRY cl_int 
CL_API_CALL\r\nclTerminateContextKHR(\r\n    cl_context context) CL_API_SUFFIX__VERSION_1_2;\r\n\r\n#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */\r\n\r\n/***************************************************************\r\n* cl_khr_spir\r\n***************************************************************/\r\n#define cl_khr_spir 1\r\n#define CL_KHR_SPIR_EXTENSION_NAME \\\r\n    \"cl_khr_spir\"\r\n\r\n\r\n#define CL_KHR_SPIR_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)\r\n\r\n/* cl_device_info */\r\n#define CL_DEVICE_SPIR_VERSIONS                             0x40E0\r\n\r\n/* cl_program_binary_type */\r\n#define CL_PROGRAM_BINARY_TYPE_INTERMEDIATE                 0x40E1\r\n\r\n/***************************************************************\r\n* cl_khr_create_command_queue\r\n***************************************************************/\r\n#define cl_khr_create_command_queue 1\r\n#define CL_KHR_CREATE_COMMAND_QUEUE_EXTENSION_NAME \\\r\n    \"cl_khr_create_command_queue\"\r\n\r\n\r\n#define CL_KHR_CREATE_COMMAND_QUEUE_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)\r\n\r\ntypedef cl_properties       cl_queue_properties_khr;\r\n\r\n\r\ntypedef cl_command_queue CL_API_CALL\r\nclCreateCommandQueueWithPropertiesKHR_t(\r\n    cl_context context,\r\n    cl_device_id device,\r\n    const cl_queue_properties_khr* properties,\r\n    cl_int* errcode_ret);\r\n\r\ntypedef clCreateCommandQueueWithPropertiesKHR_t *\r\nclCreateCommandQueueWithPropertiesKHR_fn CL_API_SUFFIX__VERSION_1_2;\r\n\r\n#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES)\r\n\r\nextern CL_API_ENTRY cl_command_queue CL_API_CALL\r\nclCreateCommandQueueWithPropertiesKHR(\r\n    cl_context context,\r\n    cl_device_id device,\r\n    const cl_queue_properties_khr* properties,\r\n    cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2;\r\n\r\n#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */\r\n\r\n/***************************************************************\r\n* 
cl_nv_device_attribute_query\r\n***************************************************************/\r\n#define cl_nv_device_attribute_query 1\r\n#define CL_NV_DEVICE_ATTRIBUTE_QUERY_EXTENSION_NAME \\\r\n    \"cl_nv_device_attribute_query\"\r\n\r\n\r\n#define CL_NV_DEVICE_ATTRIBUTE_QUERY_EXTENSION_VERSION CL_MAKE_VERSION(0, 0, 0)\r\n\r\n/* cl_device_info */\r\n#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV               0x4000\r\n#define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV               0x4001\r\n#define CL_DEVICE_REGISTERS_PER_BLOCK_NV                    0x4002\r\n#define CL_DEVICE_WARP_SIZE_NV                              0x4003\r\n#define CL_DEVICE_GPU_OVERLAP_NV                            0x4004\r\n#define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV                    0x4005\r\n#define CL_DEVICE_INTEGRATED_MEMORY_NV                      0x4006\r\n\r\n/***************************************************************\r\n* cl_amd_device_attribute_query\r\n***************************************************************/\r\n#define cl_amd_device_attribute_query 1\r\n#define CL_AMD_DEVICE_ATTRIBUTE_QUERY_EXTENSION_NAME \\\r\n    \"cl_amd_device_attribute_query\"\r\n\r\n\r\n#define CL_AMD_DEVICE_ATTRIBUTE_QUERY_EXTENSION_VERSION CL_MAKE_VERSION(0, 0, 0)\r\n\r\n/* cl_device_info */\r\n#define CL_DEVICE_PROFILING_TIMER_OFFSET_AMD                0x4036\r\n#define CL_DEVICE_TOPOLOGY_AMD                              0x4037\r\n#define CL_DEVICE_BOARD_NAME_AMD                            0x4038\r\n#define CL_DEVICE_GLOBAL_FREE_MEMORY_AMD                    0x4039\r\n#define CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD                 0x4040\r\n#define CL_DEVICE_SIMD_WIDTH_AMD                            0x4041\r\n#define CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD                0x4042\r\n#define CL_DEVICE_WAVEFRONT_WIDTH_AMD                       0x4043\r\n#define CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD                   0x4044\r\n#define CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD              0x4045\r\n#define 
CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD         0x4046\r\n#define CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD       0x4047\r\n#define CL_DEVICE_LOCAL_MEM_BANKS_AMD                       0x4048\r\n#define CL_DEVICE_THREAD_TRACE_SUPPORTED_AMD                0x4049\r\n#define CL_DEVICE_GFXIP_MAJOR_AMD                           0x404A\r\n#define CL_DEVICE_GFXIP_MINOR_AMD                           0x404B\r\n#define CL_DEVICE_AVAILABLE_ASYNC_QUEUES_AMD                0x404C\r\n#define CL_DEVICE_PREFERRED_WORK_GROUP_SIZE_AMD             0x4030\r\n#define CL_DEVICE_MAX_WORK_GROUP_SIZE_AMD                   0x4031\r\n#define CL_DEVICE_PREFERRED_CONSTANT_BUFFER_SIZE_AMD        0x4033\r\n#define CL_DEVICE_PCIE_ID_AMD                               0x4034\r\n\r\n/***************************************************************\r\n* cl_arm_printf\r\n***************************************************************/\r\n#define cl_arm_printf 1\r\n#define CL_ARM_PRINTF_EXTENSION_NAME \\\r\n    \"cl_arm_printf\"\r\n\r\n\r\n#define CL_ARM_PRINTF_EXTENSION_VERSION CL_MAKE_VERSION(0, 0, 0)\r\n\r\n/* cl_context_properties */\r\n#define CL_PRINTF_CALLBACK_ARM                              0x40B0\r\n#define CL_PRINTF_BUFFERSIZE_ARM                            0x40B1\r\n\r\n/***************************************************************\r\n* cl_ext_device_fission\r\n***************************************************************/\r\n#define cl_ext_device_fission 1\r\n#define CL_EXT_DEVICE_FISSION_EXTENSION_NAME \\\r\n    \"cl_ext_device_fission\"\r\n\r\n\r\n#define CL_EXT_DEVICE_FISSION_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)\r\n\r\ntypedef cl_ulong            cl_device_partition_property_ext;\r\n\r\n/* Error codes */\r\n#define CL_DEVICE_PARTITION_FAILED_EXT                      -1057\r\n#define CL_INVALID_PARTITION_COUNT_EXT                      -1058\r\n#define CL_INVALID_PARTITION_NAME_EXT                       -1059\r\n\r\n/* cl_device_info */\r\n#define 
CL_DEVICE_PARENT_DEVICE_EXT                         0x4054\r\n#define CL_DEVICE_PARTITION_TYPES_EXT                       0x4055\r\n#define CL_DEVICE_AFFINITY_DOMAINS_EXT                      0x4056\r\n#define CL_DEVICE_REFERENCE_COUNT_EXT                       0x4057\r\n#define CL_DEVICE_PARTITION_STYLE_EXT                       0x4058\r\n\r\n/* cl_device_partition_property_ext */\r\n#define CL_DEVICE_PARTITION_EQUALLY_EXT                     0x4050\r\n#define CL_DEVICE_PARTITION_BY_COUNTS_EXT                   0x4051\r\n#define CL_DEVICE_PARTITION_BY_NAMES_EXT                    0x4052\r\n#define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN_EXT          0x4053\r\n\r\n/* cl_device_partition_property_ext - affinity domains */\r\n#define CL_AFFINITY_DOMAIN_L1_CACHE_EXT                     0x1\r\n#define CL_AFFINITY_DOMAIN_L2_CACHE_EXT                     0x2\r\n#define CL_AFFINITY_DOMAIN_L3_CACHE_EXT                     0x3\r\n#define CL_AFFINITY_DOMAIN_L4_CACHE_EXT                     0x4\r\n#define CL_AFFINITY_DOMAIN_NUMA_EXT                         0x10\r\n#define CL_AFFINITY_DOMAIN_NEXT_FISSIONABLE_EXT             0x100\r\n\r\n/* cl_device_partition_property_ext - list terminators */\r\n#define CL_PROPERTIES_LIST_END_EXT                          ((cl_device_partition_property_ext)0)\r\n#define CL_PARTITION_BY_COUNTS_LIST_END_EXT                 ((cl_device_partition_property_ext)0)\r\n#define CL_PARTITION_BY_NAMES_LIST_END_EXT                  ((cl_device_partition_property_ext)0 - 1)\r\n\r\n\r\ntypedef cl_int CL_API_CALL\r\nclReleaseDeviceEXT_t(\r\n    cl_device_id device);\r\n\r\ntypedef clReleaseDeviceEXT_t *\r\nclReleaseDeviceEXT_fn CL_API_SUFFIX__VERSION_1_1;\r\n\r\ntypedef cl_int CL_API_CALL\r\nclRetainDeviceEXT_t(\r\n    cl_device_id device);\r\n\r\ntypedef clRetainDeviceEXT_t *\r\nclRetainDeviceEXT_fn CL_API_SUFFIX__VERSION_1_1;\r\n\r\ntypedef cl_int CL_API_CALL\r\nclCreateSubDevicesEXT_t(\r\n    cl_device_id in_device,\r\n    const 
cl_device_partition_property_ext* properties,\r\n    cl_uint num_entries,\r\n    cl_device_id* out_devices,\r\n    cl_uint* num_devices);\r\n\r\ntypedef clCreateSubDevicesEXT_t *\r\nclCreateSubDevicesEXT_fn CL_API_SUFFIX__VERSION_1_1;\r\n\r\n#if !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES)\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclReleaseDeviceEXT(\r\n    cl_device_id device) CL_API_SUFFIX__VERSION_1_1;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclRetainDeviceEXT(\r\n    cl_device_id device) CL_API_SUFFIX__VERSION_1_1;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclCreateSubDevicesEXT(\r\n    cl_device_id in_device,\r\n    const cl_device_partition_property_ext* properties,\r\n    cl_uint num_entries,\r\n    cl_device_id* out_devices,\r\n    cl_uint* num_devices) CL_API_SUFFIX__VERSION_1_1;\r\n\r\n#endif /* !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES) */\r\n\r\n/***************************************************************\r\n* cl_ext_migrate_memobject\r\n***************************************************************/\r\n#define cl_ext_migrate_memobject 1\r\n#define CL_EXT_MIGRATE_MEMOBJECT_EXTENSION_NAME \\\r\n    \"cl_ext_migrate_memobject\"\r\n\r\n\r\n#define CL_EXT_MIGRATE_MEMOBJECT_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)\r\n\r\ntypedef cl_bitfield         cl_mem_migration_flags_ext;\r\n\r\n/* cl_mem_migration_flags_ext */\r\n#define CL_MIGRATE_MEM_OBJECT_HOST_EXT                      (1 << 0)\r\n\r\n/* cl_command_type */\r\n#define CL_COMMAND_MIGRATE_MEM_OBJECT_EXT                   0x4040\r\n\r\n\r\ntypedef cl_int CL_API_CALL\r\nclEnqueueMigrateMemObjectEXT_t(\r\n    cl_command_queue command_queue,\r\n    cl_uint num_mem_objects,\r\n    const cl_mem* mem_objects,\r\n    cl_mem_migration_flags_ext flags,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event);\r\n\r\ntypedef clEnqueueMigrateMemObjectEXT_t *\r\nclEnqueueMigrateMemObjectEXT_fn ;\r\n\r\n#if 
!defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES)\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclEnqueueMigrateMemObjectEXT(\r\n    cl_command_queue command_queue,\r\n    cl_uint num_mem_objects,\r\n    const cl_mem* mem_objects,\r\n    cl_mem_migration_flags_ext flags,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event) ;\r\n\r\n#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */\r\n\r\n/***************************************************************\r\n* cl_ext_cxx_for_opencl\r\n***************************************************************/\r\n#define cl_ext_cxx_for_opencl 1\r\n#define CL_EXT_CXX_FOR_OPENCL_EXTENSION_NAME \\\r\n    \"cl_ext_cxx_for_opencl\"\r\n\r\n\r\n#define CL_EXT_CXX_FOR_OPENCL_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)\r\n\r\n/* cl_device_info */\r\n#define CL_DEVICE_CXX_FOR_OPENCL_NUMERIC_VERSION_EXT        0x4230\r\n\r\n/***************************************************************\r\n* cl_qcom_ext_host_ptr\r\n***************************************************************/\r\n#define cl_qcom_ext_host_ptr 1\r\n#define CL_QCOM_EXT_HOST_PTR_EXTENSION_NAME \\\r\n    \"cl_qcom_ext_host_ptr\"\r\n\r\n\r\n#define CL_QCOM_EXT_HOST_PTR_EXTENSION_VERSION CL_MAKE_VERSION(0, 0, 0)\r\n\r\ntypedef cl_uint             cl_image_pitch_info_qcom;\r\ntypedef struct _cl_mem_ext_host_ptr {\r\n    cl_uint allocation_type;\r\n    cl_uint host_cache_policy;\r\n} cl_mem_ext_host_ptr;\r\n\r\n/* cl_mem_flags */\r\n#define CL_MEM_EXT_HOST_PTR_QCOM                            (1 << 29)\r\n\r\n/* cl_device_info */\r\n#define CL_DEVICE_EXT_MEM_PADDING_IN_BYTES_QCOM             0x40A0\r\n#define CL_DEVICE_PAGE_SIZE_QCOM                            0x40A1\r\n\r\n/* cl_image_pitch_info_qcom */\r\n#define CL_IMAGE_ROW_ALIGNMENT_QCOM                         0x40A2\r\n#define CL_IMAGE_SLICE_ALIGNMENT_QCOM                       0x40A3\r\n\r\n/* cl_uint host_cache_policy */\r\n#define 
CL_MEM_HOST_UNCACHED_QCOM                           0x40A4\r\n#define CL_MEM_HOST_WRITEBACK_QCOM                          0x40A5\r\n#define CL_MEM_HOST_WRITETHROUGH_QCOM                       0x40A6\r\n#define CL_MEM_HOST_WRITE_COMBINING_QCOM                    0x40A7\r\n\r\n\r\ntypedef cl_int CL_API_CALL\r\nclGetDeviceImageInfoQCOM_t(\r\n    cl_device_id device,\r\n    size_t image_width,\r\n    size_t image_height,\r\n    const cl_image_format* image_format,\r\n    cl_image_pitch_info_qcom param_name,\r\n    size_t param_value_size,\r\n    void* param_value,\r\n    size_t* param_value_size_ret);\r\n\r\ntypedef clGetDeviceImageInfoQCOM_t *\r\nclGetDeviceImageInfoQCOM_fn ;\r\n\r\n#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES)\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclGetDeviceImageInfoQCOM(\r\n    cl_device_id device,\r\n    size_t image_width,\r\n    size_t image_height,\r\n    const cl_image_format* image_format,\r\n    cl_image_pitch_info_qcom param_name,\r\n    size_t param_value_size,\r\n    void* param_value,\r\n    size_t* param_value_size_ret) ;\r\n\r\n#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */\r\n\r\n/***************************************************************\r\n* cl_qcom_ext_host_ptr_iocoherent\r\n***************************************************************/\r\n#define cl_qcom_ext_host_ptr_iocoherent 1\r\n#define CL_QCOM_EXT_HOST_PTR_IOCOHERENT_EXTENSION_NAME \\\r\n    \"cl_qcom_ext_host_ptr_iocoherent\"\r\n\r\n\r\n#define CL_QCOM_EXT_HOST_PTR_IOCOHERENT_EXTENSION_VERSION CL_MAKE_VERSION(0, 0, 0)\r\n\r\n/* cl_uint host_cache_policy */\r\n#define CL_MEM_HOST_IOCOHERENT_QCOM                         0x40A9\r\n\r\n/***************************************************************\r\n* cl_qcom_ion_host_ptr\r\n***************************************************************/\r\n#define cl_qcom_ion_host_ptr 1\r\n#define CL_QCOM_ION_HOST_PTR_EXTENSION_NAME \\\r\n    \"cl_qcom_ion_host_ptr\"\r\n\r\n\r\n#define 
CL_QCOM_ION_HOST_PTR_EXTENSION_VERSION CL_MAKE_VERSION(0, 0, 0)\r\n\r\n/* type cl_mem_ext_host_ptr */\r\ntypedef struct _cl_mem_ion_host_ptr {\r\n    cl_mem_ext_host_ptr ext_host_ptr;\r\n    int ion_filedesc;\r\n    void* ion_hostptr;\r\n} cl_mem_ion_host_ptr;\r\n\r\n/* cl_uint allocation_type */\r\n#define CL_MEM_ION_HOST_PTR_QCOM                            0x40A8\r\n\r\n/***************************************************************\r\n* cl_qcom_android_native_buffer_host_ptr\r\n***************************************************************/\r\n#define cl_qcom_android_native_buffer_host_ptr 1\r\n#define CL_QCOM_ANDROID_NATIVE_BUFFER_HOST_PTR_EXTENSION_NAME \\\r\n    \"cl_qcom_android_native_buffer_host_ptr\"\r\n\r\n\r\n#define CL_QCOM_ANDROID_NATIVE_BUFFER_HOST_PTR_EXTENSION_VERSION CL_MAKE_VERSION(0, 0, 0)\r\n\r\n/* type cl_mem_ext_host_ptr */\r\ntypedef struct _cl_mem_android_native_buffer_host_ptr {\r\n    cl_mem_ext_host_ptr ext_host_ptr;\r\n    void* anb_ptr;\r\n} cl_mem_android_native_buffer_host_ptr;\r\n\r\n/* cl_uint allocation_type */\r\n#define CL_MEM_ANDROID_NATIVE_BUFFER_HOST_PTR_QCOM          0x40C6\r\n\r\n/***************************************************************\r\n* cl_img_yuv_image\r\n***************************************************************/\r\n#define cl_img_yuv_image 1\r\n#define CL_IMG_YUV_IMAGE_EXTENSION_NAME \\\r\n    \"cl_img_yuv_image\"\r\n\r\n\r\n#define CL_IMG_YUV_IMAGE_EXTENSION_VERSION CL_MAKE_VERSION(0, 0, 0)\r\n\r\n/* cl_channel_order */\r\n#define CL_NV21_IMG                                         0x40D0\r\n#define CL_YV12_IMG                                         0x40D1\r\n\r\n/***************************************************************\r\n* cl_img_cached_allocations\r\n***************************************************************/\r\n#define cl_img_cached_allocations 1\r\n#define CL_IMG_CACHED_ALLOCATIONS_EXTENSION_NAME \\\r\n    \"cl_img_cached_allocations\"\r\n\r\n\r\n#define 
CL_IMG_CACHED_ALLOCATIONS_EXTENSION_VERSION CL_MAKE_VERSION(0, 0, 0)\r\n\r\n/* cl_mem_flags */\r\n#define CL_MEM_USE_UNCACHED_CPU_MEMORY_IMG                  (1 << 26)\r\n#define CL_MEM_USE_CACHED_CPU_MEMORY_IMG                    (1 << 27)\r\n\r\n/***************************************************************\r\n* cl_img_use_gralloc_ptr\r\n***************************************************************/\r\n#define cl_img_use_gralloc_ptr 1\r\n#define CL_IMG_USE_GRALLOC_PTR_EXTENSION_NAME \\\r\n    \"cl_img_use_gralloc_ptr\"\r\n\r\n\r\n#define CL_IMG_USE_GRALLOC_PTR_EXTENSION_VERSION CL_MAKE_VERSION(0, 0, 0)\r\n\r\n/* Error codes */\r\n#define CL_GRALLOC_RESOURCE_NOT_ACQUIRED_IMG                0x40D4\r\n#define CL_INVALID_GRALLOC_OBJECT_IMG                       0x40D5\r\n\r\n/* cl_mem_flags */\r\n#define CL_MEM_USE_GRALLOC_PTR_IMG                          (1 << 28)\r\n\r\n/* cl_command_type */\r\n#define CL_COMMAND_ACQUIRE_GRALLOC_OBJECTS_IMG              0x40D2\r\n#define CL_COMMAND_RELEASE_GRALLOC_OBJECTS_IMG              0x40D3\r\n\r\n\r\ntypedef cl_int CL_API_CALL\r\nclEnqueueAcquireGrallocObjectsIMG_t(\r\n    cl_command_queue command_queue,\r\n    cl_uint num_objects,\r\n    const cl_mem* mem_objects,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event);\r\n\r\ntypedef clEnqueueAcquireGrallocObjectsIMG_t *\r\nclEnqueueAcquireGrallocObjectsIMG_fn CL_API_SUFFIX__VERSION_1_2;\r\n\r\ntypedef cl_int CL_API_CALL\r\nclEnqueueReleaseGrallocObjectsIMG_t(\r\n    cl_command_queue command_queue,\r\n    cl_uint num_objects,\r\n    const cl_mem* mem_objects,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event);\r\n\r\ntypedef clEnqueueReleaseGrallocObjectsIMG_t *\r\nclEnqueueReleaseGrallocObjectsIMG_fn CL_API_SUFFIX__VERSION_1_2;\r\n\r\n#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES)\r\n\r\nextern CL_API_ENTRY cl_int 
CL_API_CALL\r\nclEnqueueAcquireGrallocObjectsIMG(\r\n    cl_command_queue command_queue,\r\n    cl_uint num_objects,\r\n    const cl_mem* mem_objects,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event) CL_API_SUFFIX__VERSION_1_2;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclEnqueueReleaseGrallocObjectsIMG(\r\n    cl_command_queue command_queue,\r\n    cl_uint num_objects,\r\n    const cl_mem* mem_objects,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event) CL_API_SUFFIX__VERSION_1_2;\r\n\r\n#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */\r\n\r\n/***************************************************************\r\n* cl_img_generate_mipmap\r\n***************************************************************/\r\n#define cl_img_generate_mipmap 1\r\n#define CL_IMG_GENERATE_MIPMAP_EXTENSION_NAME \\\r\n    \"cl_img_generate_mipmap\"\r\n\r\n\r\n#define CL_IMG_GENERATE_MIPMAP_EXTENSION_VERSION CL_MAKE_VERSION(0, 0, 0)\r\n\r\ntypedef cl_uint             cl_mipmap_filter_mode_img;\r\n\r\n/* cl_mipmap_filter_mode_img */\r\n#define CL_MIPMAP_FILTER_ANY_IMG                            0x0\r\n#define CL_MIPMAP_FILTER_BOX_IMG                            0x1\r\n\r\n/* cl_command_type */\r\n#define CL_COMMAND_GENERATE_MIPMAP_IMG                      0x40D6\r\n\r\n\r\ntypedef cl_int CL_API_CALL\r\nclEnqueueGenerateMipmapIMG_t(\r\n    cl_command_queue command_queue,\r\n    cl_mem src_image,\r\n    cl_mem dst_image,\r\n    cl_mipmap_filter_mode_img mipmap_filter_mode,\r\n    const size_t* array_region,\r\n    const size_t* mip_region,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event);\r\n\r\ntypedef clEnqueueGenerateMipmapIMG_t *\r\nclEnqueueGenerateMipmapIMG_fn CL_API_SUFFIX__VERSION_1_2;\r\n\r\n#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES)\r\n\r\nextern CL_API_ENTRY cl_int 
CL_API_CALL\r\nclEnqueueGenerateMipmapIMG(\r\n    cl_command_queue command_queue,\r\n    cl_mem src_image,\r\n    cl_mem dst_image,\r\n    cl_mipmap_filter_mode_img mipmap_filter_mode,\r\n    const size_t* array_region,\r\n    const size_t* mip_region,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event) CL_API_SUFFIX__VERSION_1_2;\r\n\r\n#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */\r\n\r\n/***************************************************************\r\n* cl_img_mem_properties\r\n***************************************************************/\r\n#define cl_img_mem_properties 1\r\n#define CL_IMG_MEM_PROPERTIES_EXTENSION_NAME \\\r\n    \"cl_img_mem_properties\"\r\n\r\n\r\n#define CL_IMG_MEM_PROPERTIES_EXTENSION_VERSION CL_MAKE_VERSION(0, 0, 0)\r\n\r\n/* cl_mem_properties */\r\n#define CL_MEM_ALLOC_FLAGS_IMG                              0x40D7\r\n\r\n/* cl_mem_alloc_flags_img */\r\n#define CL_MEM_ALLOC_RELAX_REQUIREMENTS_IMG                 (1 << 0)\r\n#define CL_MEM_ALLOC_GPU_WRITE_COMBINE_IMG                  (1 << 1)\r\n#define CL_MEM_ALLOC_GPU_CACHED_IMG                         (1 << 2)\r\n#define CL_MEM_ALLOC_CPU_LOCAL_IMG                          (1 << 3)\r\n#define CL_MEM_ALLOC_GPU_LOCAL_IMG                          (1 << 4)\r\n#define CL_MEM_ALLOC_GPU_PRIVATE_IMG                        (1 << 5)\r\n\r\n/* cl_device_info */\r\n#define CL_DEVICE_MEMORY_CAPABILITIES_IMG                   0x40D8\r\n\r\n/***************************************************************\r\n* cl_khr_subgroups\r\n***************************************************************/\r\n#define cl_khr_subgroups 1\r\n#define CL_KHR_SUBGROUPS_EXTENSION_NAME \\\r\n    \"cl_khr_subgroups\"\r\n\r\n\r\n#define CL_KHR_SUBGROUPS_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)\r\n\r\n#if !defined(CL_VERSION_2_1)\r\n/* defined in CL.h for OpenCL 2.1 and newer */\r\ntypedef cl_uint             cl_kernel_sub_group_info;\r\n\r\n#endif 
/* !defined(CL_VERSION_2_1) */\r\n\r\n/* cl_kernel_sub_group_info */\r\n#define CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR        0x2033\r\n#define CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE_KHR           0x2034\r\n\r\n\r\ntypedef cl_int CL_API_CALL\r\nclGetKernelSubGroupInfoKHR_t(\r\n    cl_kernel in_kernel,\r\n    cl_device_id in_device,\r\n    cl_kernel_sub_group_info param_name,\r\n    size_t input_value_size,\r\n    const void* input_value,\r\n    size_t param_value_size,\r\n    void* param_value,\r\n    size_t* param_value_size_ret);\r\n\r\ntypedef clGetKernelSubGroupInfoKHR_t *\r\nclGetKernelSubGroupInfoKHR_fn CL_API_SUFFIX__VERSION_2_0_DEPRECATED;\r\n\r\n#if !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES)\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclGetKernelSubGroupInfoKHR(\r\n    cl_kernel in_kernel,\r\n    cl_device_id in_device,\r\n    cl_kernel_sub_group_info param_name,\r\n    size_t input_value_size,\r\n    const void* input_value,\r\n    size_t param_value_size,\r\n    void* param_value,\r\n    size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_2_0_DEPRECATED;\r\n\r\n#endif /* !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES) */\r\n\r\n/***************************************************************\r\n* cl_khr_mipmap_image\r\n***************************************************************/\r\n#define cl_khr_mipmap_image 1\r\n#define CL_KHR_MIPMAP_IMAGE_EXTENSION_NAME \\\r\n    \"cl_khr_mipmap_image\"\r\n\r\n\r\n#define CL_KHR_MIPMAP_IMAGE_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)\r\n\r\n/* cl_sampler_properties */\r\n#define CL_SAMPLER_MIP_FILTER_MODE_KHR                      0x1155\r\n#define CL_SAMPLER_LOD_MIN_KHR                              0x1156\r\n#define CL_SAMPLER_LOD_MAX_KHR                              0x1157\r\n\r\n/***************************************************************\r\n* cl_khr_priority_hints\r\n***************************************************************/\r\n#define cl_khr_priority_hints 1\r\n#define 
CL_KHR_PRIORITY_HINTS_EXTENSION_NAME \\\r\n    \"cl_khr_priority_hints\"\r\n\r\n\r\n#define CL_KHR_PRIORITY_HINTS_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)\r\n\r\n/* To be used by clGetEventInfo */\r\ntypedef cl_uint             cl_queue_priority_khr;\r\n\r\n/* cl_queue_properties */\r\n#define CL_QUEUE_PRIORITY_KHR                               0x1096\r\n\r\n/* cl_queue_priority_khr */\r\n#define CL_QUEUE_PRIORITY_HIGH_KHR                          (1 << 0)\r\n#define CL_QUEUE_PRIORITY_MED_KHR                           (1 << 1)\r\n#define CL_QUEUE_PRIORITY_LOW_KHR                           (1 << 2)\r\n\r\n/***************************************************************\r\n* cl_khr_throttle_hints\r\n***************************************************************/\r\n#define cl_khr_throttle_hints 1\r\n#define CL_KHR_THROTTLE_HINTS_EXTENSION_NAME \\\r\n    \"cl_khr_throttle_hints\"\r\n\r\n\r\n#define CL_KHR_THROTTLE_HINTS_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)\r\n\r\n/* To be used by clGetEventInfo */\r\ntypedef cl_uint             cl_queue_throttle_khr;\r\n\r\n/* cl_queue_properties */\r\n#define CL_QUEUE_THROTTLE_KHR                               0x1097\r\n\r\n/* cl_queue_throttle_khr */\r\n#define CL_QUEUE_THROTTLE_HIGH_KHR                          (1 << 0)\r\n#define CL_QUEUE_THROTTLE_MED_KHR                           (1 << 1)\r\n#define CL_QUEUE_THROTTLE_LOW_KHR                           (1 << 2)\r\n\r\n/***************************************************************\r\n* cl_khr_subgroup_named_barrier\r\n***************************************************************/\r\n#define cl_khr_subgroup_named_barrier 1\r\n#define CL_KHR_SUBGROUP_NAMED_BARRIER_EXTENSION_NAME \\\r\n    \"cl_khr_subgroup_named_barrier\"\r\n\r\n\r\n#define CL_KHR_SUBGROUP_NAMED_BARRIER_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)\r\n\r\n/* cl_device_info */\r\n#define CL_DEVICE_MAX_NAMED_BARRIER_COUNT_KHR               
0x2035\r\n\r\n/***************************************************************\r\n* cl_khr_extended_versioning\r\n***************************************************************/\r\n#define cl_khr_extended_versioning 1\r\n#define CL_KHR_EXTENDED_VERSIONING_EXTENSION_NAME \\\r\n    \"cl_khr_extended_versioning\"\r\n\r\n\r\n#define CL_KHR_EXTENDED_VERSIONING_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)\r\n\r\n#define CL_VERSION_MAJOR_BITS_KHR                           10\r\n#define CL_VERSION_MINOR_BITS_KHR                           10\r\n#define CL_VERSION_PATCH_BITS_KHR                           12\r\n\r\n#define CL_VERSION_MAJOR_MASK_KHR ((1 << CL_VERSION_MAJOR_BITS_KHR) - 1)\r\n#define CL_VERSION_MINOR_MASK_KHR ((1 << CL_VERSION_MINOR_BITS_KHR) - 1)\r\n#define CL_VERSION_PATCH_MASK_KHR ((1 << CL_VERSION_PATCH_BITS_KHR) - 1)\r\n\r\n#define CL_VERSION_MAJOR_KHR(version) ((version) >> (CL_VERSION_MINOR_BITS_KHR + CL_VERSION_PATCH_BITS_KHR))\r\n#define CL_VERSION_MINOR_KHR(version) (((version) >> CL_VERSION_PATCH_BITS_KHR) & CL_VERSION_MINOR_MASK_KHR)\r\n#define CL_VERSION_PATCH_KHR(version) ((version) & CL_VERSION_PATCH_MASK_KHR)\r\n\r\n#define CL_MAKE_VERSION_KHR(major, minor, patch) \\\r\n    ((((major) & CL_VERSION_MAJOR_MASK_KHR) << (CL_VERSION_MINOR_BITS_KHR + CL_VERSION_PATCH_BITS_KHR)) | \\\r\n    (((minor) &  CL_VERSION_MINOR_MASK_KHR) << CL_VERSION_PATCH_BITS_KHR) | \\\r\n    ((patch) & CL_VERSION_PATCH_MASK_KHR))\r\n\r\n#define CL_NAME_VERSION_MAX_NAME_SIZE_KHR                   64\r\n\r\ntypedef cl_uint             cl_version_khr;\r\ntypedef struct _cl_name_version_khr {\r\n    cl_version_khr version;\r\n    char name[CL_NAME_VERSION_MAX_NAME_SIZE_KHR];\r\n} cl_name_version_khr;\r\n\r\n/* cl_platform_info */\r\n#define CL_PLATFORM_NUMERIC_VERSION_KHR                     0x0906\r\n#define CL_PLATFORM_EXTENSIONS_WITH_VERSION_KHR             0x0907\r\n\r\n/* cl_device_info */\r\n#define CL_DEVICE_NUMERIC_VERSION_KHR                       0x105E\r\n#define 
CL_DEVICE_OPENCL_C_NUMERIC_VERSION_KHR              0x105F\r\n#define CL_DEVICE_EXTENSIONS_WITH_VERSION_KHR               0x1060\r\n#define CL_DEVICE_ILS_WITH_VERSION_KHR                      0x1061\r\n#define CL_DEVICE_BUILT_IN_KERNELS_WITH_VERSION_KHR         0x1062\r\n\r\n/***************************************************************\r\n* cl_khr_device_uuid\r\n***************************************************************/\r\n#define cl_khr_device_uuid 1\r\n#define CL_KHR_DEVICE_UUID_EXTENSION_NAME \\\r\n    \"cl_khr_device_uuid\"\r\n\r\n\r\n#define CL_KHR_DEVICE_UUID_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)\r\n\r\n/* Size Constants */\r\n#define CL_UUID_SIZE_KHR                                    16\r\n#define CL_LUID_SIZE_KHR                                    8\r\n\r\n/* cl_device_info */\r\n#define CL_DEVICE_UUID_KHR                                  0x106A\r\n#define CL_DRIVER_UUID_KHR                                  0x106B\r\n#define CL_DEVICE_LUID_VALID_KHR                            0x106C\r\n#define CL_DEVICE_LUID_KHR                                  0x106D\r\n#define CL_DEVICE_NODE_MASK_KHR                             0x106E\r\n\r\n/***************************************************************\r\n* cl_khr_pci_bus_info\r\n***************************************************************/\r\n#define cl_khr_pci_bus_info 1\r\n#define CL_KHR_PCI_BUS_INFO_EXTENSION_NAME \\\r\n    \"cl_khr_pci_bus_info\"\r\n\r\n\r\n#define CL_KHR_PCI_BUS_INFO_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)\r\n\r\ntypedef struct _cl_device_pci_bus_info_khr {\r\n    cl_uint pci_domain;\r\n    cl_uint pci_bus;\r\n    cl_uint pci_device;\r\n    cl_uint pci_function;\r\n} cl_device_pci_bus_info_khr;\r\n\r\n/* cl_device_info */\r\n#define CL_DEVICE_PCI_BUS_INFO_KHR                          0x410F\r\n\r\n/***************************************************************\r\n* cl_khr_suggested_local_work_size\r\n***************************************************************/\r\n#define 
cl_khr_suggested_local_work_size 1\r\n#define CL_KHR_SUGGESTED_LOCAL_WORK_SIZE_EXTENSION_NAME \\\r\n    \"cl_khr_suggested_local_work_size\"\r\n\r\n\r\n#define CL_KHR_SUGGESTED_LOCAL_WORK_SIZE_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)\r\n\r\n\r\ntypedef cl_int CL_API_CALL\r\nclGetKernelSuggestedLocalWorkSizeKHR_t(\r\n    cl_command_queue command_queue,\r\n    cl_kernel kernel,\r\n    cl_uint work_dim,\r\n    const size_t* global_work_offset,\r\n    const size_t* global_work_size,\r\n    size_t* suggested_local_work_size);\r\n\r\ntypedef clGetKernelSuggestedLocalWorkSizeKHR_t *\r\nclGetKernelSuggestedLocalWorkSizeKHR_fn CL_API_SUFFIX__VERSION_3_0;\r\n\r\n#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES)\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclGetKernelSuggestedLocalWorkSizeKHR(\r\n    cl_command_queue command_queue,\r\n    cl_kernel kernel,\r\n    cl_uint work_dim,\r\n    const size_t* global_work_offset,\r\n    const size_t* global_work_size,\r\n    size_t* suggested_local_work_size) CL_API_SUFFIX__VERSION_3_0;\r\n\r\n#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */\r\n\r\n/***************************************************************\r\n* cl_khr_integer_dot_product\r\n***************************************************************/\r\n#define cl_khr_integer_dot_product 1\r\n#define CL_KHR_INTEGER_DOT_PRODUCT_EXTENSION_NAME \\\r\n    \"cl_khr_integer_dot_product\"\r\n\r\n\r\n#define CL_KHR_INTEGER_DOT_PRODUCT_EXTENSION_VERSION CL_MAKE_VERSION(2, 0, 0)\r\n\r\ntypedef cl_bitfield         cl_device_integer_dot_product_capabilities_khr;\r\ntypedef struct _cl_device_integer_dot_product_acceleration_properties_khr {\r\n    cl_bool signed_accelerated;\r\n    cl_bool unsigned_accelerated;\r\n    cl_bool mixed_signedness_accelerated;\r\n    cl_bool accumulating_saturating_signed_accelerated;\r\n    cl_bool accumulating_saturating_unsigned_accelerated;\r\n    cl_bool accumulating_saturating_mixed_signedness_accelerated;\r\n} 
cl_device_integer_dot_product_acceleration_properties_khr;\r\n\r\n/* cl_device_integer_dot_product_capabilities_khr */\r\n#define CL_DEVICE_INTEGER_DOT_PRODUCT_INPUT_4x8BIT_PACKED_KHR (1 << 0)\r\n#define CL_DEVICE_INTEGER_DOT_PRODUCT_INPUT_4x8BIT_KHR      (1 << 1)\r\n\r\n/* cl_device_info */\r\n#define CL_DEVICE_INTEGER_DOT_PRODUCT_CAPABILITIES_KHR      0x1073\r\n#define CL_DEVICE_INTEGER_DOT_PRODUCT_ACCELERATION_PROPERTIES_8BIT_KHR 0x1074\r\n#define CL_DEVICE_INTEGER_DOT_PRODUCT_ACCELERATION_PROPERTIES_4x8BIT_PACKED_KHR 0x1075\r\n\r\n/***************************************************************\r\n* cl_khr_external_memory\r\n***************************************************************/\r\n#define cl_khr_external_memory 1\r\n#define CL_KHR_EXTERNAL_MEMORY_EXTENSION_NAME \\\r\n    \"cl_khr_external_memory\"\r\n\r\n\r\n#define CL_KHR_EXTERNAL_MEMORY_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 1)\r\n\r\ntypedef cl_uint             cl_external_memory_handle_type_khr;\r\n\r\n/* cl_platform_info */\r\n#define CL_PLATFORM_EXTERNAL_MEMORY_IMPORT_HANDLE_TYPES_KHR 0x2044\r\n\r\n/* cl_device_info */\r\n#define CL_DEVICE_EXTERNAL_MEMORY_IMPORT_HANDLE_TYPES_KHR   0x204F\r\n#define CL_DEVICE_EXTERNAL_MEMORY_IMPORT_ASSUME_LINEAR_IMAGES_HANDLE_TYPES_KHR 0x2052\r\n\r\n/* cl_mem_properties */\r\n#define CL_MEM_DEVICE_HANDLE_LIST_KHR                       0x2051\r\n#define CL_MEM_DEVICE_HANDLE_LIST_END_KHR                   0\r\n\r\n/* cl_command_type */\r\n#define CL_COMMAND_ACQUIRE_EXTERNAL_MEM_OBJECTS_KHR         0x2047\r\n#define CL_COMMAND_RELEASE_EXTERNAL_MEM_OBJECTS_KHR         0x2048\r\n\r\n\r\ntypedef cl_int CL_API_CALL\r\nclEnqueueAcquireExternalMemObjectsKHR_t(\r\n    cl_command_queue command_queue,\r\n    cl_uint num_mem_objects,\r\n    const cl_mem* mem_objects,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event);\r\n\r\ntypedef clEnqueueAcquireExternalMemObjectsKHR_t *\r\nclEnqueueAcquireExternalMemObjectsKHR_fn 
CL_API_SUFFIX__VERSION_3_0;\r\n\r\ntypedef cl_int CL_API_CALL\r\nclEnqueueReleaseExternalMemObjectsKHR_t(\r\n    cl_command_queue command_queue,\r\n    cl_uint num_mem_objects,\r\n    const cl_mem* mem_objects,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event);\r\n\r\ntypedef clEnqueueReleaseExternalMemObjectsKHR_t *\r\nclEnqueueReleaseExternalMemObjectsKHR_fn CL_API_SUFFIX__VERSION_3_0;\r\n\r\n#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES)\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclEnqueueAcquireExternalMemObjectsKHR(\r\n    cl_command_queue command_queue,\r\n    cl_uint num_mem_objects,\r\n    const cl_mem* mem_objects,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event) CL_API_SUFFIX__VERSION_3_0;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclEnqueueReleaseExternalMemObjectsKHR(\r\n    cl_command_queue command_queue,\r\n    cl_uint num_mem_objects,\r\n    const cl_mem* mem_objects,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event) CL_API_SUFFIX__VERSION_3_0;\r\n\r\n#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */\r\n\r\n/***************************************************************\r\n* cl_khr_external_memory_dma_buf\r\n***************************************************************/\r\n#define cl_khr_external_memory_dma_buf 1\r\n#define CL_KHR_EXTERNAL_MEMORY_DMA_BUF_EXTENSION_NAME \\\r\n    \"cl_khr_external_memory_dma_buf\"\r\n\r\n\r\n#define CL_KHR_EXTERNAL_MEMORY_DMA_BUF_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)\r\n\r\n/* cl_external_memory_handle_type_khr */\r\n#define CL_EXTERNAL_MEMORY_HANDLE_DMA_BUF_KHR               0x2067\r\n\r\n/***************************************************************\r\n* cl_khr_external_memory_opaque_fd\r\n***************************************************************/\r\n#define cl_khr_external_memory_opaque_fd 
1\r\n#define CL_KHR_EXTERNAL_MEMORY_OPAQUE_FD_EXTENSION_NAME \\\r\n    \"cl_khr_external_memory_opaque_fd\"\r\n\r\n\r\n#define CL_KHR_EXTERNAL_MEMORY_OPAQUE_FD_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)\r\n\r\n/* cl_external_memory_handle_type_khr */\r\n#define CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_FD_KHR             0x2060\r\n\r\n/***************************************************************\r\n* cl_khr_external_memory_win32\r\n***************************************************************/\r\n#define cl_khr_external_memory_win32 1\r\n#define CL_KHR_EXTERNAL_MEMORY_WIN32_EXTENSION_NAME \\\r\n    \"cl_khr_external_memory_win32\"\r\n\r\n\r\n#define CL_KHR_EXTERNAL_MEMORY_WIN32_EXTENSION_VERSION CL_MAKE_VERSION(1, 1, 0)\r\n\r\n/* cl_external_memory_handle_type_khr */\r\n#define CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_WIN32_KHR          0x2061\r\n#define CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_WIN32_KMT_KHR      0x2062\r\n#define CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_WIN32_NAME_KHR     0x2069\r\n\r\n/***************************************************************\r\n* cl_khr_external_semaphore\r\n***************************************************************/\r\n#define cl_khr_external_semaphore 1\r\n#define CL_KHR_EXTERNAL_SEMAPHORE_EXTENSION_NAME \\\r\n    \"cl_khr_external_semaphore\"\r\n\r\n\r\n#define CL_KHR_EXTERNAL_SEMAPHORE_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 1)\r\n\r\ntypedef struct _cl_semaphore_khr * cl_semaphore_khr;\r\ntypedef cl_uint             cl_external_semaphore_handle_type_khr;\r\n\r\n/* cl_platform_info */\r\n#define CL_PLATFORM_SEMAPHORE_IMPORT_HANDLE_TYPES_KHR       0x2037\r\n#define CL_PLATFORM_SEMAPHORE_EXPORT_HANDLE_TYPES_KHR       0x2038\r\n\r\n/* cl_device_info */\r\n#define CL_DEVICE_SEMAPHORE_IMPORT_HANDLE_TYPES_KHR         0x204D\r\n#define CL_DEVICE_SEMAPHORE_EXPORT_HANDLE_TYPES_KHR         0x204E\r\n\r\n/* cl_semaphore_properties_khr */\r\n#define CL_SEMAPHORE_EXPORT_HANDLE_TYPES_KHR                0x203F\r\n#define 
CL_SEMAPHORE_EXPORT_HANDLE_TYPES_LIST_END_KHR       0\r\n\r\n/* cl_semaphore_info_khr */\r\n#define CL_SEMAPHORE_EXPORTABLE_KHR                         0x2054\r\n\r\n\r\ntypedef cl_int CL_API_CALL\r\nclGetSemaphoreHandleForTypeKHR_t(\r\n    cl_semaphore_khr sema_object,\r\n    cl_device_id device,\r\n    cl_external_semaphore_handle_type_khr handle_type,\r\n    size_t handle_size,\r\n    void* handle_ptr,\r\n    size_t* handle_size_ret);\r\n\r\ntypedef clGetSemaphoreHandleForTypeKHR_t *\r\nclGetSemaphoreHandleForTypeKHR_fn CL_API_SUFFIX__VERSION_1_2;\r\n\r\n#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES)\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclGetSemaphoreHandleForTypeKHR(\r\n    cl_semaphore_khr sema_object,\r\n    cl_device_id device,\r\n    cl_external_semaphore_handle_type_khr handle_type,\r\n    size_t handle_size,\r\n    void* handle_ptr,\r\n    size_t* handle_size_ret) CL_API_SUFFIX__VERSION_1_2;\r\n\r\n#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */\r\n\r\n/***************************************************************\r\n* cl_khr_external_semaphore_opaque_fd\r\n***************************************************************/\r\n#define cl_khr_external_semaphore_opaque_fd 1\r\n#define CL_KHR_EXTERNAL_SEMAPHORE_OPAQUE_FD_EXTENSION_NAME \\\r\n    \"cl_khr_external_semaphore_opaque_fd\"\r\n\r\n\r\n#define CL_KHR_EXTERNAL_SEMAPHORE_OPAQUE_FD_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)\r\n\r\n/* cl_external_semaphore_handle_type_khr */\r\n#define CL_SEMAPHORE_HANDLE_OPAQUE_FD_KHR                   0x2055\r\n\r\n/***************************************************************\r\n* cl_khr_external_semaphore_sync_fd\r\n***************************************************************/\r\n#define cl_khr_external_semaphore_sync_fd 1\r\n#define CL_KHR_EXTERNAL_SEMAPHORE_SYNC_FD_EXTENSION_NAME \\\r\n    \"cl_khr_external_semaphore_sync_fd\"\r\n\r\n\r\n#define CL_KHR_EXTERNAL_SEMAPHORE_SYNC_FD_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 
0)\r\n\r\ntypedef cl_properties       cl_semaphore_reimport_properties_khr;\r\n\r\n/* cl_external_semaphore_handle_type_khr */\r\n#define CL_SEMAPHORE_HANDLE_SYNC_FD_KHR                     0x2058\r\n\r\n\r\ntypedef cl_int CL_API_CALL\r\nclReImportSemaphoreSyncFdKHR_t(\r\n    cl_semaphore_khr sema_object,\r\n    cl_semaphore_reimport_properties_khr* reimport_props,\r\n    int fd);\r\n\r\ntypedef clReImportSemaphoreSyncFdKHR_t *\r\nclReImportSemaphoreSyncFdKHR_fn CL_API_SUFFIX__VERSION_3_0;\r\n\r\n#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES)\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclReImportSemaphoreSyncFdKHR(\r\n    cl_semaphore_khr sema_object,\r\n    cl_semaphore_reimport_properties_khr* reimport_props,\r\n    int fd) CL_API_SUFFIX__VERSION_3_0;\r\n\r\n#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */\r\n\r\n/***************************************************************\r\n* cl_khr_external_semaphore_win32\r\n***************************************************************/\r\n#define cl_khr_external_semaphore_win32 1\r\n#define CL_KHR_EXTERNAL_SEMAPHORE_WIN32_EXTENSION_NAME \\\r\n    \"cl_khr_external_semaphore_win32\"\r\n\r\n\r\n#define CL_KHR_EXTERNAL_SEMAPHORE_WIN32_EXTENSION_VERSION CL_MAKE_VERSION(0, 9, 1)\r\n\r\n/* cl_external_semaphore_handle_type_khr */\r\n#define CL_SEMAPHORE_HANDLE_OPAQUE_WIN32_KHR                0x2056\r\n#define CL_SEMAPHORE_HANDLE_OPAQUE_WIN32_KMT_KHR            0x2057\r\n#define CL_SEMAPHORE_HANDLE_OPAQUE_WIN32_NAME_KHR           0x2068\r\n\r\n/***************************************************************\r\n* cl_khr_semaphore\r\n***************************************************************/\r\n#define cl_khr_semaphore 1\r\n#define CL_KHR_SEMAPHORE_EXTENSION_NAME \\\r\n    \"cl_khr_semaphore\"\r\n\r\n\r\n#define CL_KHR_SEMAPHORE_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)\r\n\r\n/* type cl_semaphore_khr */\r\ntypedef cl_properties       cl_semaphore_properties_khr;\r\ntypedef cl_uint       
      cl_semaphore_info_khr;\r\ntypedef cl_uint             cl_semaphore_type_khr;\r\ntypedef cl_ulong            cl_semaphore_payload_khr;\r\n\r\n/* cl_semaphore_type */\r\n#define CL_SEMAPHORE_TYPE_BINARY_KHR                        1\r\n\r\n/* cl_platform_info */\r\n#define CL_PLATFORM_SEMAPHORE_TYPES_KHR                     0x2036\r\n\r\n/* cl_device_info */\r\n#define CL_DEVICE_SEMAPHORE_TYPES_KHR                       0x204C\r\n\r\n/* cl_semaphore_info_khr */\r\n#define CL_SEMAPHORE_CONTEXT_KHR                            0x2039\r\n#define CL_SEMAPHORE_REFERENCE_COUNT_KHR                    0x203A\r\n#define CL_SEMAPHORE_PROPERTIES_KHR                         0x203B\r\n#define CL_SEMAPHORE_PAYLOAD_KHR                            0x203C\r\n\r\n/* cl_semaphore_info_khr or cl_semaphore_properties_khr */\r\n#define CL_SEMAPHORE_TYPE_KHR                               0x203D\r\n#define CL_SEMAPHORE_DEVICE_HANDLE_LIST_KHR                 0x2053\r\n#define CL_SEMAPHORE_DEVICE_HANDLE_LIST_END_KHR             0\r\n\r\n/* cl_command_type */\r\n#define CL_COMMAND_SEMAPHORE_WAIT_KHR                       0x2042\r\n#define CL_COMMAND_SEMAPHORE_SIGNAL_KHR                     0x2043\r\n\r\n/* Error codes */\r\n#define CL_INVALID_SEMAPHORE_KHR                            -1142\r\n\r\n\r\ntypedef cl_semaphore_khr CL_API_CALL\r\nclCreateSemaphoreWithPropertiesKHR_t(\r\n    cl_context context,\r\n    const cl_semaphore_properties_khr* sema_props,\r\n    cl_int* errcode_ret);\r\n\r\ntypedef clCreateSemaphoreWithPropertiesKHR_t *\r\nclCreateSemaphoreWithPropertiesKHR_fn CL_API_SUFFIX__VERSION_1_2;\r\n\r\ntypedef cl_int CL_API_CALL\r\nclEnqueueWaitSemaphoresKHR_t(\r\n    cl_command_queue command_queue,\r\n    cl_uint num_sema_objects,\r\n    const cl_semaphore_khr* sema_objects,\r\n    const cl_semaphore_payload_khr* sema_payload_list,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event);\r\n\r\ntypedef clEnqueueWaitSemaphoresKHR_t 
*\r\nclEnqueueWaitSemaphoresKHR_fn CL_API_SUFFIX__VERSION_1_2;\r\n\r\ntypedef cl_int CL_API_CALL\r\nclEnqueueSignalSemaphoresKHR_t(\r\n    cl_command_queue command_queue,\r\n    cl_uint num_sema_objects,\r\n    const cl_semaphore_khr* sema_objects,\r\n    const cl_semaphore_payload_khr* sema_payload_list,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event);\r\n\r\ntypedef clEnqueueSignalSemaphoresKHR_t *\r\nclEnqueueSignalSemaphoresKHR_fn CL_API_SUFFIX__VERSION_1_2;\r\n\r\ntypedef cl_int CL_API_CALL\r\nclGetSemaphoreInfoKHR_t(\r\n    cl_semaphore_khr sema_object,\r\n    cl_semaphore_info_khr param_name,\r\n    size_t param_value_size,\r\n    void* param_value,\r\n    size_t* param_value_size_ret);\r\n\r\ntypedef clGetSemaphoreInfoKHR_t *\r\nclGetSemaphoreInfoKHR_fn CL_API_SUFFIX__VERSION_1_2;\r\n\r\ntypedef cl_int CL_API_CALL\r\nclReleaseSemaphoreKHR_t(\r\n    cl_semaphore_khr sema_object);\r\n\r\ntypedef clReleaseSemaphoreKHR_t *\r\nclReleaseSemaphoreKHR_fn CL_API_SUFFIX__VERSION_1_2;\r\n\r\ntypedef cl_int CL_API_CALL\r\nclRetainSemaphoreKHR_t(\r\n    cl_semaphore_khr sema_object);\r\n\r\ntypedef clRetainSemaphoreKHR_t *\r\nclRetainSemaphoreKHR_fn CL_API_SUFFIX__VERSION_1_2;\r\n\r\n#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES)\r\n\r\nextern CL_API_ENTRY cl_semaphore_khr CL_API_CALL\r\nclCreateSemaphoreWithPropertiesKHR(\r\n    cl_context context,\r\n    const cl_semaphore_properties_khr* sema_props,\r\n    cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclEnqueueWaitSemaphoresKHR(\r\n    cl_command_queue command_queue,\r\n    cl_uint num_sema_objects,\r\n    const cl_semaphore_khr* sema_objects,\r\n    const cl_semaphore_payload_khr* sema_payload_list,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event) CL_API_SUFFIX__VERSION_1_2;\r\n\r\nextern CL_API_ENTRY cl_int 
CL_API_CALL\r\nclEnqueueSignalSemaphoresKHR(\r\n    cl_command_queue command_queue,\r\n    cl_uint num_sema_objects,\r\n    const cl_semaphore_khr* sema_objects,\r\n    const cl_semaphore_payload_khr* sema_payload_list,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event) CL_API_SUFFIX__VERSION_1_2;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclGetSemaphoreInfoKHR(\r\n    cl_semaphore_khr sema_object,\r\n    cl_semaphore_info_khr param_name,\r\n    size_t param_value_size,\r\n    void* param_value,\r\n    size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_1_2;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclReleaseSemaphoreKHR(\r\n    cl_semaphore_khr sema_object) CL_API_SUFFIX__VERSION_1_2;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclRetainSemaphoreKHR(\r\n    cl_semaphore_khr sema_object) CL_API_SUFFIX__VERSION_1_2;\r\n\r\n#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */\r\n\r\n/***************************************************************\r\n* cl_arm_import_memory\r\n***************************************************************/\r\n#define cl_arm_import_memory 1\r\n#define CL_ARM_IMPORT_MEMORY_EXTENSION_NAME \\\r\n    \"cl_arm_import_memory\"\r\n\r\n\r\n#define CL_ARM_IMPORT_MEMORY_EXTENSION_VERSION CL_MAKE_VERSION(0, 0, 0)\r\n\r\ntypedef intptr_t            cl_import_properties_arm;\r\n\r\n/* cl_import_properties_arm */\r\n#define CL_IMPORT_TYPE_ARM                                  0x40B2\r\n#define CL_IMPORT_TYPE_HOST_ARM                             0x40B3\r\n#define CL_IMPORT_TYPE_DMA_BUF_ARM                          0x40B4\r\n#define CL_IMPORT_TYPE_PROTECTED_ARM                        0x40B5\r\n#define CL_IMPORT_TYPE_ANDROID_HARDWARE_BUFFER_ARM          0x41E2\r\n#define CL_IMPORT_DMA_BUF_DATA_CONSISTENCY_WITH_HOST_ARM    0x41E3\r\n#define CL_IMPORT_MEMORY_WHOLE_ALLOCATION_ARM               SIZE_MAX\r\n#define CL_IMPORT_ANDROID_HARDWARE_BUFFER_PLANE_INDEX_ARM   
0x41EF\r\n#define CL_IMPORT_ANDROID_HARDWARE_BUFFER_LAYER_INDEX_ARM   0x41F0\r\n\r\n\r\ntypedef cl_mem CL_API_CALL\r\nclImportMemoryARM_t(\r\n    cl_context context,\r\n    cl_mem_flags flags,\r\n    const cl_import_properties_arm* properties,\r\n    void* memory,\r\n    size_t size,\r\n    cl_int* errcode_ret);\r\n\r\ntypedef clImportMemoryARM_t *\r\nclImportMemoryARM_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\n#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES)\r\n\r\nextern CL_API_ENTRY cl_mem CL_API_CALL\r\nclImportMemoryARM(\r\n    cl_context context,\r\n    cl_mem_flags flags,\r\n    const cl_import_properties_arm* properties,\r\n    void* memory,\r\n    size_t size,\r\n    cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_0;\r\n\r\n#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */\r\n\r\n/***************************************************************\r\n* cl_arm_shared_virtual_memory\r\n***************************************************************/\r\n#define cl_arm_shared_virtual_memory 1\r\n#define CL_ARM_SHARED_VIRTUAL_MEMORY_EXTENSION_NAME \\\r\n    \"cl_arm_shared_virtual_memory\"\r\n\r\n\r\n#define CL_ARM_SHARED_VIRTUAL_MEMORY_EXTENSION_VERSION CL_MAKE_VERSION(0, 0, 0)\r\n\r\ntypedef cl_bitfield         cl_svm_mem_flags_arm;\r\ntypedef cl_uint             cl_kernel_exec_info_arm;\r\ntypedef cl_bitfield         cl_device_svm_capabilities_arm;\r\n\r\n/* cl_device_info */\r\n#define CL_DEVICE_SVM_CAPABILITIES_ARM                      0x40B6\r\n\r\n/* cl_mem_info */\r\n#define CL_MEM_USES_SVM_POINTER_ARM                         0x40B7\r\n\r\n/* cl_kernel_exec_info_arm */\r\n#define CL_KERNEL_EXEC_INFO_SVM_PTRS_ARM                    0x40B8\r\n#define CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM_ARM       0x40B9\r\n\r\n/* cl_command_type */\r\n#define CL_COMMAND_SVM_FREE_ARM                             0x40BA\r\n#define CL_COMMAND_SVM_MEMCPY_ARM                           0x40BB\r\n#define CL_COMMAND_SVM_MEMFILL_ARM                          
0x40BC\r\n#define CL_COMMAND_SVM_MAP_ARM                              0x40BD\r\n#define CL_COMMAND_SVM_UNMAP_ARM                            0x40BE\r\n\r\n/* cl_device_svm_capabilities_arm */\r\n#define CL_DEVICE_SVM_COARSE_GRAIN_BUFFER_ARM               (1 << 0)\r\n#define CL_DEVICE_SVM_FINE_GRAIN_BUFFER_ARM                 (1 << 1)\r\n#define CL_DEVICE_SVM_FINE_GRAIN_SYSTEM_ARM                 (1 << 2)\r\n#define CL_DEVICE_SVM_ATOMICS_ARM                           (1 << 3)\r\n\r\n/* cl_svm_mem_flags_arm */\r\n#define CL_MEM_SVM_FINE_GRAIN_BUFFER_ARM                    (1 << 10)\r\n#define CL_MEM_SVM_ATOMICS_ARM                              (1 << 11)\r\n\r\n\r\ntypedef void* CL_API_CALL\r\nclSVMAllocARM_t(\r\n    cl_context context,\r\n    cl_svm_mem_flags_arm flags,\r\n    size_t size,\r\n    cl_uint alignment);\r\n\r\ntypedef clSVMAllocARM_t *\r\nclSVMAllocARM_fn CL_API_SUFFIX__VERSION_1_2;\r\n\r\ntypedef void CL_API_CALL\r\nclSVMFreeARM_t(\r\n    cl_context context,\r\n    void* svm_pointer);\r\n\r\ntypedef clSVMFreeARM_t *\r\nclSVMFreeARM_fn CL_API_SUFFIX__VERSION_1_2;\r\n\r\ntypedef cl_int CL_API_CALL\r\nclEnqueueSVMFreeARM_t(\r\n    cl_command_queue command_queue,\r\n    cl_uint num_svm_pointers,\r\n    void* svm_pointers[],\r\n    void (CL_CALLBACK* pfn_free_func)(cl_command_queue queue, cl_uint num_svm_pointers, void * svm_pointers[], void *user_data),\r\n    void* user_data,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event);\r\n\r\ntypedef clEnqueueSVMFreeARM_t *\r\nclEnqueueSVMFreeARM_fn CL_API_SUFFIX__VERSION_1_2;\r\n\r\ntypedef cl_int CL_API_CALL\r\nclEnqueueSVMMemcpyARM_t(\r\n    cl_command_queue command_queue,\r\n    cl_bool blocking_copy,\r\n    void* dst_ptr,\r\n    const void* src_ptr,\r\n    size_t size,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event);\r\n\r\ntypedef clEnqueueSVMMemcpyARM_t *\r\nclEnqueueSVMMemcpyARM_fn 
CL_API_SUFFIX__VERSION_1_2;\r\n\r\ntypedef cl_int CL_API_CALL\r\nclEnqueueSVMMemFillARM_t(\r\n    cl_command_queue command_queue,\r\n    void* svm_ptr,\r\n    const void* pattern,\r\n    size_t pattern_size,\r\n    size_t size,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event);\r\n\r\ntypedef clEnqueueSVMMemFillARM_t *\r\nclEnqueueSVMMemFillARM_fn CL_API_SUFFIX__VERSION_1_2;\r\n\r\ntypedef cl_int CL_API_CALL\r\nclEnqueueSVMMapARM_t(\r\n    cl_command_queue command_queue,\r\n    cl_bool blocking_map,\r\n    cl_map_flags flags,\r\n    void* svm_ptr,\r\n    size_t size,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event);\r\n\r\ntypedef clEnqueueSVMMapARM_t *\r\nclEnqueueSVMMapARM_fn CL_API_SUFFIX__VERSION_1_2;\r\n\r\ntypedef cl_int CL_API_CALL\r\nclEnqueueSVMUnmapARM_t(\r\n    cl_command_queue command_queue,\r\n    void* svm_ptr,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event);\r\n\r\ntypedef clEnqueueSVMUnmapARM_t *\r\nclEnqueueSVMUnmapARM_fn CL_API_SUFFIX__VERSION_1_2;\r\n\r\ntypedef cl_int CL_API_CALL\r\nclSetKernelArgSVMPointerARM_t(\r\n    cl_kernel kernel,\r\n    cl_uint arg_index,\r\n    const void* arg_value);\r\n\r\ntypedef clSetKernelArgSVMPointerARM_t *\r\nclSetKernelArgSVMPointerARM_fn CL_API_SUFFIX__VERSION_1_2;\r\n\r\ntypedef cl_int CL_API_CALL\r\nclSetKernelExecInfoARM_t(\r\n    cl_kernel kernel,\r\n    cl_kernel_exec_info_arm param_name,\r\n    size_t param_value_size,\r\n    const void* param_value);\r\n\r\ntypedef clSetKernelExecInfoARM_t *\r\nclSetKernelExecInfoARM_fn CL_API_SUFFIX__VERSION_1_2;\r\n\r\n#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES)\r\n\r\nextern CL_API_ENTRY void* CL_API_CALL\r\nclSVMAllocARM(\r\n    cl_context context,\r\n    cl_svm_mem_flags_arm flags,\r\n    size_t size,\r\n    cl_uint alignment) CL_API_SUFFIX__VERSION_1_2;\r\n\r\nextern CL_API_ENTRY void 
CL_API_CALL\r\nclSVMFreeARM(\r\n    cl_context context,\r\n    void* svm_pointer) CL_API_SUFFIX__VERSION_1_2;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclEnqueueSVMFreeARM(\r\n    cl_command_queue command_queue,\r\n    cl_uint num_svm_pointers,\r\n    void* svm_pointers[],\r\n    void (CL_CALLBACK* pfn_free_func)(cl_command_queue queue, cl_uint num_svm_pointers, void * svm_pointers[], void *user_data),\r\n    void* user_data,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event) CL_API_SUFFIX__VERSION_1_2;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclEnqueueSVMMemcpyARM(\r\n    cl_command_queue command_queue,\r\n    cl_bool blocking_copy,\r\n    void* dst_ptr,\r\n    const void* src_ptr,\r\n    size_t size,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event) CL_API_SUFFIX__VERSION_1_2;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclEnqueueSVMMemFillARM(\r\n    cl_command_queue command_queue,\r\n    void* svm_ptr,\r\n    const void* pattern,\r\n    size_t pattern_size,\r\n    size_t size,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event) CL_API_SUFFIX__VERSION_1_2;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclEnqueueSVMMapARM(\r\n    cl_command_queue command_queue,\r\n    cl_bool blocking_map,\r\n    cl_map_flags flags,\r\n    void* svm_ptr,\r\n    size_t size,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event) CL_API_SUFFIX__VERSION_1_2;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclEnqueueSVMUnmapARM(\r\n    cl_command_queue command_queue,\r\n    void* svm_ptr,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event) CL_API_SUFFIX__VERSION_1_2;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclSetKernelArgSVMPointerARM(\r\n    cl_kernel kernel,\r\n    cl_uint arg_index,\r\n    const 
void* arg_value) CL_API_SUFFIX__VERSION_1_2;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclSetKernelExecInfoARM(\r\n    cl_kernel kernel,\r\n    cl_kernel_exec_info_arm param_name,\r\n    size_t param_value_size,\r\n    const void* param_value) CL_API_SUFFIX__VERSION_1_2;\r\n\r\n#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */\r\n\r\n/***************************************************************\r\n* cl_arm_get_core_id\r\n***************************************************************/\r\n#if defined(CL_VERSION_1_2)\r\n\r\n#define cl_arm_get_core_id 1\r\n#define CL_ARM_GET_CORE_ID_EXTENSION_NAME \\\r\n    \"cl_arm_get_core_id\"\r\n\r\n\r\n#define CL_ARM_GET_CORE_ID_EXTENSION_VERSION CL_MAKE_VERSION(0, 0, 0)\r\n\r\n/* cl_device_info */\r\n#define CL_DEVICE_COMPUTE_UNITS_BITFIELD_ARM                0x40BF\r\n\r\n#endif /* defined(CL_VERSION_1_2) */\r\n\r\n/***************************************************************\r\n* cl_arm_job_slot_selection\r\n***************************************************************/\r\n#define cl_arm_job_slot_selection 1\r\n#define CL_ARM_JOB_SLOT_SELECTION_EXTENSION_NAME \\\r\n    \"cl_arm_job_slot_selection\"\r\n\r\n\r\n#define CL_ARM_JOB_SLOT_SELECTION_EXTENSION_VERSION CL_MAKE_VERSION(0, 0, 0)\r\n\r\n/* cl_device_info */\r\n#define CL_DEVICE_JOB_SLOTS_ARM                             0x41E0\r\n\r\n/* cl_queue_properties */\r\n#define CL_QUEUE_JOB_SLOT_ARM                               0x41E1\r\n\r\n/***************************************************************\r\n* cl_arm_scheduling_controls\r\n***************************************************************/\r\n#define cl_arm_scheduling_controls 1\r\n#define CL_ARM_SCHEDULING_CONTROLS_EXTENSION_NAME \\\r\n    \"cl_arm_scheduling_controls\"\r\n\r\n\r\n#define CL_ARM_SCHEDULING_CONTROLS_EXTENSION_VERSION CL_MAKE_VERSION(0, 0, 0)\r\n\r\n/* Types */\r\ntypedef cl_bitfield         cl_device_scheduling_controls_capabilities_arm;\r\n\r\n/* 
cl_device_scheduling_controls_capabilities_arm */\r\n#define CL_DEVICE_SCHEDULING_KERNEL_BATCHING_ARM            (1 << 0)\r\n#define CL_DEVICE_SCHEDULING_WORKGROUP_BATCH_SIZE_ARM       (1 << 1)\r\n#define CL_DEVICE_SCHEDULING_WORKGROUP_BATCH_SIZE_MODIFIER_ARM (1 << 2)\r\n#define CL_DEVICE_SCHEDULING_DEFERRED_FLUSH_ARM             (1 << 3)\r\n#define CL_DEVICE_SCHEDULING_REGISTER_ALLOCATION_ARM        (1 << 4)\r\n#define CL_DEVICE_SCHEDULING_WARP_THROTTLING_ARM            (1 << 5)\r\n#define CL_DEVICE_SCHEDULING_COMPUTE_UNIT_BATCH_QUEUE_SIZE_ARM (1 << 6)\r\n#define CL_DEVICE_SCHEDULING_COMPUTE_UNIT_LIMIT_ARM         (1 << 7)\r\n\r\n/* cl_device_info */\r\n#define CL_DEVICE_SCHEDULING_CONTROLS_CAPABILITIES_ARM      0x41E4\r\n#define CL_DEVICE_SUPPORTED_REGISTER_ALLOCATIONS_ARM        0x41EB\r\n#define CL_DEVICE_MAX_WARP_COUNT_ARM                        0x41EA\r\n\r\n/* cl_kernel_exec_info */\r\n#define CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_ARM        0x41E5\r\n#define CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_MODIFIER_ARM 0x41E6\r\n#define CL_KERNEL_EXEC_INFO_WARP_COUNT_LIMIT_ARM            0x41E8\r\n#define CL_KERNEL_EXEC_INFO_COMPUTE_UNIT_MAX_QUEUED_BATCHES_ARM 0x41F1\r\n\r\n/* cl_kernel_info */\r\n#define CL_KERNEL_MAX_WARP_COUNT_ARM                        0x41E9\r\n\r\n/* cl_queue_properties */\r\n#define CL_QUEUE_KERNEL_BATCHING_ARM                        0x41E7\r\n#define CL_QUEUE_DEFERRED_FLUSH_ARM                         0x41EC\r\n#define CL_QUEUE_COMPUTE_UNIT_LIMIT_ARM                     0x41F3\r\n\r\n/***************************************************************\r\n* cl_arm_controlled_kernel_termination\r\n***************************************************************/\r\n#define cl_arm_controlled_kernel_termination 1\r\n#define CL_ARM_CONTROLLED_KERNEL_TERMINATION_EXTENSION_NAME \\\r\n    \"cl_arm_controlled_kernel_termination\"\r\n\r\n\r\n#define CL_ARM_CONTROLLED_KERNEL_TERMINATION_EXTENSION_VERSION CL_MAKE_VERSION(0, 0, 0)\r\n\r\n/* Types 
*/\r\ntypedef cl_bitfield         cl_device_controlled_termination_capabilities_arm;\r\n\r\n/* Error codes */\r\n#define CL_COMMAND_TERMINATED_ITSELF_WITH_FAILURE_ARM       -1108\r\n\r\n/* cl_device_controlled_termination_capabilities_arm */\r\n#define CL_DEVICE_CONTROLLED_TERMINATION_SUCCESS_ARM        (1 << 0)\r\n#define CL_DEVICE_CONTROLLED_TERMINATION_FAILURE_ARM        (1 << 1)\r\n#define CL_DEVICE_CONTROLLED_TERMINATION_QUERY_ARM          (1 << 2)\r\n\r\n/* cl_device_info */\r\n#define CL_DEVICE_CONTROLLED_TERMINATION_CAPABILITIES_ARM   0x41EE\r\n\r\n/* cl_event_info */\r\n#define CL_EVENT_COMMAND_TERMINATION_REASON_ARM             0x41ED\r\n\r\n/* cl_command_termination_reason_arm */\r\n#define CL_COMMAND_TERMINATION_COMPLETION_ARM               0\r\n#define CL_COMMAND_TERMINATION_CONTROLLED_SUCCESS_ARM       1\r\n#define CL_COMMAND_TERMINATION_CONTROLLED_FAILURE_ARM       2\r\n#define CL_COMMAND_TERMINATION_ERROR_ARM                    3\r\n\r\n/***************************************************************\r\n* cl_arm_protected_memory_allocation\r\n***************************************************************/\r\n#define cl_arm_protected_memory_allocation 1\r\n#define CL_ARM_PROTECTED_MEMORY_ALLOCATION_EXTENSION_NAME \\\r\n    \"cl_arm_protected_memory_allocation\"\r\n\r\n\r\n#define CL_ARM_PROTECTED_MEMORY_ALLOCATION_EXTENSION_VERSION CL_MAKE_VERSION(0, 0, 0)\r\n\r\n#define CL_MEM_PROTECTED_ALLOC_ARM                          ((cl_bitfield)1 << 36)\r\n\r\n/***************************************************************\r\n* cl_intel_exec_by_local_thread\r\n***************************************************************/\r\n#define cl_intel_exec_by_local_thread 1\r\n#define CL_INTEL_EXEC_BY_LOCAL_THREAD_EXTENSION_NAME \\\r\n    \"cl_intel_exec_by_local_thread\"\r\n\r\n\r\n#define CL_INTEL_EXEC_BY_LOCAL_THREAD_EXTENSION_VERSION CL_MAKE_VERSION(0, 0, 0)\r\n\r\n/* cl_command_queue_properties - bitfield */\r\n#define CL_QUEUE_THREAD_LOCAL_EXEC_ENABLE_INTEL   
          ((cl_bitfield)1 << 31)\r\n\r\n/***************************************************************\r\n* cl_intel_device_attribute_query\r\n***************************************************************/\r\n#define cl_intel_device_attribute_query 1\r\n#define CL_INTEL_DEVICE_ATTRIBUTE_QUERY_EXTENSION_NAME \\\r\n    \"cl_intel_device_attribute_query\"\r\n\r\n\r\n#define CL_INTEL_DEVICE_ATTRIBUTE_QUERY_EXTENSION_VERSION CL_MAKE_VERSION(0, 0, 0)\r\n\r\ntypedef cl_bitfield         cl_device_feature_capabilities_intel;\r\n\r\n/* cl_device_feature_capabilities_intel */\r\n#define CL_DEVICE_FEATURE_FLAG_DP4A_INTEL                   (1 << 0)\r\n#define CL_DEVICE_FEATURE_FLAG_DPAS_INTEL                   (1 << 1)\r\n\r\n/* cl_device_info */\r\n#define CL_DEVICE_IP_VERSION_INTEL                          0x4250\r\n#define CL_DEVICE_ID_INTEL                                  0x4251\r\n#define CL_DEVICE_NUM_SLICES_INTEL                          0x4252\r\n#define CL_DEVICE_NUM_SUB_SLICES_PER_SLICE_INTEL            0x4253\r\n#define CL_DEVICE_NUM_EUS_PER_SUB_SLICE_INTEL               0x4254\r\n#define CL_DEVICE_NUM_THREADS_PER_EU_INTEL                  0x4255\r\n#define CL_DEVICE_FEATURE_CAPABILITIES_INTEL                0x4256\r\n\r\n/***************************************************************\r\n* cl_intel_device_partition_by_names\r\n***************************************************************/\r\n#define cl_intel_device_partition_by_names 1\r\n#define CL_INTEL_DEVICE_PARTITION_BY_NAMES_EXTENSION_NAME \\\r\n    \"cl_intel_device_partition_by_names\"\r\n\r\n\r\n#define CL_INTEL_DEVICE_PARTITION_BY_NAMES_EXTENSION_VERSION CL_MAKE_VERSION(0, 0, 0)\r\n\r\n#define CL_DEVICE_PARTITION_BY_NAMES_INTEL                  0x4052\r\n#define CL_PARTITION_BY_NAMES_LIST_END_INTEL                -1\r\n\r\n/***************************************************************\r\n* cl_intel_accelerator\r\n***************************************************************/\r\n#define 
cl_intel_accelerator 1\r\n#define CL_INTEL_ACCELERATOR_EXTENSION_NAME \\\r\n    \"cl_intel_accelerator\"\r\n\r\n\r\n#define CL_INTEL_ACCELERATOR_EXTENSION_VERSION CL_MAKE_VERSION(0, 0, 0)\r\n\r\ntypedef struct _cl_accelerator_intel* cl_accelerator_intel;\r\ntypedef cl_uint             cl_accelerator_type_intel;\r\ntypedef cl_uint             cl_accelerator_info_intel;\r\n\r\n/* cl_accelerator_info_intel */\r\n#define CL_ACCELERATOR_DESCRIPTOR_INTEL                     0x4090\r\n#define CL_ACCELERATOR_REFERENCE_COUNT_INTEL                0x4091\r\n#define CL_ACCELERATOR_CONTEXT_INTEL                        0x4092\r\n#define CL_ACCELERATOR_TYPE_INTEL                           0x4093\r\n\r\n/* Error codes */\r\n#define CL_INVALID_ACCELERATOR_INTEL                        -1094\r\n#define CL_INVALID_ACCELERATOR_TYPE_INTEL                   -1095\r\n#define CL_INVALID_ACCELERATOR_DESCRIPTOR_INTEL             -1096\r\n#define CL_ACCELERATOR_TYPE_NOT_SUPPORTED_INTEL             -1097\r\n\r\n\r\ntypedef cl_accelerator_intel CL_API_CALL\r\nclCreateAcceleratorINTEL_t(\r\n    cl_context context,\r\n    cl_accelerator_type_intel accelerator_type,\r\n    size_t descriptor_size,\r\n    const void* descriptor,\r\n    cl_int* errcode_ret);\r\n\r\ntypedef clCreateAcceleratorINTEL_t *\r\nclCreateAcceleratorINTEL_fn CL_API_SUFFIX__VERSION_1_2;\r\n\r\ntypedef cl_int CL_API_CALL\r\nclGetAcceleratorInfoINTEL_t(\r\n    cl_accelerator_intel accelerator,\r\n    cl_accelerator_info_intel param_name,\r\n    size_t param_value_size,\r\n    void* param_value,\r\n    size_t* param_value_size_ret);\r\n\r\ntypedef clGetAcceleratorInfoINTEL_t *\r\nclGetAcceleratorInfoINTEL_fn CL_API_SUFFIX__VERSION_1_2;\r\n\r\ntypedef cl_int CL_API_CALL\r\nclRetainAcceleratorINTEL_t(\r\n    cl_accelerator_intel accelerator);\r\n\r\ntypedef clRetainAcceleratorINTEL_t *\r\nclRetainAcceleratorINTEL_fn CL_API_SUFFIX__VERSION_1_2;\r\n\r\ntypedef cl_int CL_API_CALL\r\nclReleaseAcceleratorINTEL_t(\r\n    
cl_accelerator_intel accelerator);\r\n\r\ntypedef clReleaseAcceleratorINTEL_t *\r\nclReleaseAcceleratorINTEL_fn CL_API_SUFFIX__VERSION_1_2;\r\n\r\n#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES)\r\n\r\nextern CL_API_ENTRY cl_accelerator_intel CL_API_CALL\r\nclCreateAcceleratorINTEL(\r\n    cl_context context,\r\n    cl_accelerator_type_intel accelerator_type,\r\n    size_t descriptor_size,\r\n    const void* descriptor,\r\n    cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclGetAcceleratorInfoINTEL(\r\n    cl_accelerator_intel accelerator,\r\n    cl_accelerator_info_intel param_name,\r\n    size_t param_value_size,\r\n    void* param_value,\r\n    size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_1_2;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclRetainAcceleratorINTEL(\r\n    cl_accelerator_intel accelerator) CL_API_SUFFIX__VERSION_1_2;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclReleaseAcceleratorINTEL(\r\n    cl_accelerator_intel accelerator) CL_API_SUFFIX__VERSION_1_2;\r\n\r\n#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */\r\n\r\n/***************************************************************\r\n* cl_intel_motion_estimation\r\n***************************************************************/\r\n#define cl_intel_motion_estimation 1\r\n#define CL_INTEL_MOTION_ESTIMATION_EXTENSION_NAME \\\r\n    \"cl_intel_motion_estimation\"\r\n\r\n\r\n#define CL_INTEL_MOTION_ESTIMATION_EXTENSION_VERSION CL_MAKE_VERSION(0, 0, 0)\r\n\r\ntypedef struct _cl_motion_estimation_desc_intel {\r\n    cl_uint mb_block_type;\r\n    cl_uint subpixel_mode;\r\n    cl_uint sad_adjust_mode;\r\n    cl_uint search_path_type;\r\n} cl_motion_estimation_desc_intel;\r\n\r\n/* cl_accelerator_type_intel */\r\n#define CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL         0x0\r\n\r\n/* cl_uint mb_block_type */\r\n#define CL_ME_MB_TYPE_16x16_INTEL                           0x0\r\n#define CL_ME_MB_TYPE_8x8_INTEL     
                        0x1\r\n#define CL_ME_MB_TYPE_4x4_INTEL                             0x2\r\n\r\n/* cl_uint subpixel_mode */\r\n#define CL_ME_SUBPIXEL_MODE_INTEGER_INTEL                   0x0\r\n#define CL_ME_SUBPIXEL_MODE_HPEL_INTEL                      0x1\r\n#define CL_ME_SUBPIXEL_MODE_QPEL_INTEL                      0x2\r\n\r\n/* cl_uint sad_adjust_mode */\r\n#define CL_ME_SAD_ADJUST_MODE_NONE_INTEL                    0x0\r\n#define CL_ME_SAD_ADJUST_MODE_HAAR_INTEL                    0x1\r\n\r\n/* cl_uint search_path_type */\r\n#define CL_ME_SEARCH_PATH_RADIUS_2_2_INTEL                  0x0\r\n#define CL_ME_SEARCH_PATH_RADIUS_4_4_INTEL                  0x1\r\n#define CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL                0x5\r\n\r\n/***************************************************************\r\n* cl_intel_advanced_motion_estimation\r\n***************************************************************/\r\n#define cl_intel_advanced_motion_estimation 1\r\n#define CL_INTEL_ADVANCED_MOTION_ESTIMATION_EXTENSION_NAME \\\r\n    \"cl_intel_advanced_motion_estimation\"\r\n\r\n\r\n#define CL_INTEL_ADVANCED_MOTION_ESTIMATION_EXTENSION_VERSION CL_MAKE_VERSION(0, 0, 0)\r\n\r\n/* cl_device_info */\r\n#define CL_DEVICE_ME_VERSION_INTEL                          0x407E\r\n\r\n#define CL_ME_VERSION_LEGACY_INTEL                          0x0\r\n#define CL_ME_VERSION_ADVANCED_VER_1_INTEL                  0x1\r\n#define CL_ME_VERSION_ADVANCED_VER_2_INTEL                  0x2\r\n\r\n#define CL_ME_CHROMA_INTRA_PREDICT_ENABLED_INTEL            0x1\r\n#define CL_ME_LUMA_INTRA_PREDICT_ENABLED_INTEL              0x2\r\n\r\n#define CL_ME_SKIP_BLOCK_TYPE_16x16_INTEL                   0x0\r\n#define CL_ME_SKIP_BLOCK_TYPE_8x8_INTEL                     0x4\r\n\r\n#define CL_ME_COST_PENALTY_NONE_INTEL                       0x0\r\n#define CL_ME_COST_PENALTY_LOW_INTEL                        0x1\r\n#define CL_ME_COST_PENALTY_NORMAL_INTEL                     0x2\r\n#define 
CL_ME_COST_PENALTY_HIGH_INTEL                       0x3\r\n\r\n#define CL_ME_COST_PRECISION_QPEL_INTEL                     0x0\r\n#define CL_ME_COST_PRECISION_HPEL_INTEL                     0x1\r\n#define CL_ME_COST_PRECISION_PEL_INTEL                      0x2\r\n#define CL_ME_COST_PRECISION_DPEL_INTEL                     0x3\r\n\r\n#define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_INTEL            0x0\r\n#define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_INTEL          0x1\r\n#define CL_ME_LUMA_PREDICTOR_MODE_DC_INTEL                  0x2\r\n#define CL_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_LEFT_INTEL  0x3\r\n#define CL_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_RIGHT_INTEL 0x4\r\n#define CL_ME_LUMA_PREDICTOR_MODE_PLANE_INTEL               0x4\r\n#define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_RIGHT_INTEL      0x5\r\n#define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_DOWN_INTEL     0x6\r\n#define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_LEFT_INTEL       0x7\r\n#define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_UP_INTEL       0x8\r\n\r\n#define CL_ME_CHROMA_PREDICTOR_MODE_DC_INTEL                0x0\r\n#define CL_ME_CHROMA_PREDICTOR_MODE_HORIZONTAL_INTEL        0x1\r\n#define CL_ME_CHROMA_PREDICTOR_MODE_VERTICAL_INTEL          0x2\r\n#define CL_ME_CHROMA_PREDICTOR_MODE_PLANE_INTEL             0x3\r\n\r\n#define CL_ME_FORWARD_INPUT_MODE_INTEL                      0x1\r\n#define CL_ME_BACKWARD_INPUT_MODE_INTEL                     0x2\r\n#define CL_ME_BIDIRECTION_INPUT_MODE_INTEL                  0x3\r\n\r\n#define CL_ME_BIDIR_WEIGHT_QUARTER_INTEL                    16\r\n#define CL_ME_BIDIR_WEIGHT_THIRD_INTEL                      21\r\n#define CL_ME_BIDIR_WEIGHT_HALF_INTEL                       32\r\n#define CL_ME_BIDIR_WEIGHT_TWO_THIRD_INTEL                  43\r\n#define CL_ME_BIDIR_WEIGHT_THREE_QUARTER_INTEL              48\r\n\r\n/***************************************************************\r\n* cl_intel_simultaneous_sharing\r\n***************************************************************/\r\n#define 
cl_intel_simultaneous_sharing 1\r\n#define CL_INTEL_SIMULTANEOUS_SHARING_EXTENSION_NAME \\\r\n    \"cl_intel_simultaneous_sharing\"\r\n\r\n\r\n#define CL_INTEL_SIMULTANEOUS_SHARING_EXTENSION_VERSION CL_MAKE_VERSION(0, 0, 0)\r\n\r\n/* cl_device_info */\r\n#define CL_DEVICE_SIMULTANEOUS_INTEROPS_INTEL               0x4104\r\n#define CL_DEVICE_NUM_SIMULTANEOUS_INTEROPS_INTEL           0x4105\r\n\r\n/***************************************************************\r\n* cl_intel_egl_image_yuv\r\n***************************************************************/\r\n#define cl_intel_egl_image_yuv 1\r\n#define CL_INTEL_EGL_IMAGE_YUV_EXTENSION_NAME \\\r\n    \"cl_intel_egl_image_yuv\"\r\n\r\n\r\n#define CL_INTEL_EGL_IMAGE_YUV_EXTENSION_VERSION CL_MAKE_VERSION(0, 0, 0)\r\n\r\n/* cl_egl_image_properties_khr */\r\n#define CL_EGL_YUV_PLANE_INTEL                              0x4107\r\n\r\n/***************************************************************\r\n* cl_intel_packed_yuv\r\n***************************************************************/\r\n#define cl_intel_packed_yuv 1\r\n#define CL_INTEL_PACKED_YUV_EXTENSION_NAME \\\r\n    \"cl_intel_packed_yuv\"\r\n\r\n\r\n#define CL_INTEL_PACKED_YUV_EXTENSION_VERSION CL_MAKE_VERSION(0, 0, 0)\r\n\r\n/* cl_channel_order */\r\n#define CL_YUYV_INTEL                                       0x4076\r\n#define CL_UYVY_INTEL                                       0x4077\r\n#define CL_YVYU_INTEL                                       0x4078\r\n#define CL_VYUY_INTEL                                       0x4079\r\n\r\n/***************************************************************\r\n* cl_intel_required_subgroup_size\r\n***************************************************************/\r\n#define cl_intel_required_subgroup_size 1\r\n#define CL_INTEL_REQUIRED_SUBGROUP_SIZE_EXTENSION_NAME \\\r\n    \"cl_intel_required_subgroup_size\"\r\n\r\n\r\n#define CL_INTEL_REQUIRED_SUBGROUP_SIZE_EXTENSION_VERSION CL_MAKE_VERSION(0, 0, 0)\r\n\r\n/* cl_device_info 
*/\r\n#define CL_DEVICE_SUB_GROUP_SIZES_INTEL                     0x4108\r\n\r\n/* cl_kernel_work_group_info */\r\n#define CL_KERNEL_SPILL_MEM_SIZE_INTEL                      0x4109\r\n\r\n/* cl_kernel_sub_group_info */\r\n#define CL_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL              0x410A\r\n\r\n/***************************************************************\r\n* cl_intel_driver_diagnostics\r\n***************************************************************/\r\n#define cl_intel_driver_diagnostics 1\r\n#define CL_INTEL_DRIVER_DIAGNOSTICS_EXTENSION_NAME \\\r\n    \"cl_intel_driver_diagnostics\"\r\n\r\n\r\n#define CL_INTEL_DRIVER_DIAGNOSTICS_EXTENSION_VERSION CL_MAKE_VERSION(0, 0, 0)\r\n\r\ntypedef cl_bitfield         cl_diagnostic_verbose_level_intel;\r\n\r\n/* cl_context_properties */\r\n#define CL_CONTEXT_SHOW_DIAGNOSTICS_INTEL                   0x4106\r\n\r\n/* cl_diagnostic_verbose_level_intel */\r\n#define CL_CONTEXT_DIAGNOSTICS_LEVEL_ALL_INTEL              0xff\r\n#define CL_CONTEXT_DIAGNOSTICS_LEVEL_GOOD_INTEL             (1 << 0)\r\n#define CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL              (1 << 1)\r\n#define CL_CONTEXT_DIAGNOSTICS_LEVEL_NEUTRAL_INTEL          (1 << 2)\r\n\r\n/***************************************************************\r\n* cl_intel_planar_yuv\r\n***************************************************************/\r\n#define cl_intel_planar_yuv 1\r\n#define CL_INTEL_PLANAR_YUV_EXTENSION_NAME \\\r\n    \"cl_intel_planar_yuv\"\r\n\r\n\r\n#define CL_INTEL_PLANAR_YUV_EXTENSION_VERSION CL_MAKE_VERSION(0, 0, 0)\r\n\r\n/* cl_channel_order */\r\n#define CL_NV12_INTEL                                       0x410E\r\n\r\n/* cl_mem_flags */\r\n#define CL_MEM_NO_ACCESS_INTEL                              (1 << 24)\r\n#define CL_MEM_ACCESS_FLAGS_UNRESTRICTED_INTEL              (1 << 25)\r\n\r\n/* cl_device_info */\r\n#define CL_DEVICE_PLANAR_YUV_MAX_WIDTH_INTEL                0x417E\r\n#define CL_DEVICE_PLANAR_YUV_MAX_HEIGHT_INTEL               
0x417F\r\n\r\n/***************************************************************\r\n* cl_intel_device_side_avc_motion_estimation\r\n***************************************************************/\r\n#define cl_intel_device_side_avc_motion_estimation 1\r\n#define CL_INTEL_DEVICE_SIDE_AVC_MOTION_ESTIMATION_EXTENSION_NAME \\\r\n    \"cl_intel_device_side_avc_motion_estimation\"\r\n\r\n\r\n#define CL_INTEL_DEVICE_SIDE_AVC_MOTION_ESTIMATION_EXTENSION_VERSION CL_MAKE_VERSION(0, 0, 0)\r\n\r\n/* cl_device_info */\r\n#define CL_DEVICE_AVC_ME_VERSION_INTEL                      0x410B\r\n#define CL_DEVICE_AVC_ME_SUPPORTS_TEXTURE_SAMPLER_USE_INTEL 0x410C\r\n#define CL_DEVICE_AVC_ME_SUPPORTS_PREEMPTION_INTEL          0x410D\r\n\r\n/* returned by CL_DEVICE_AVC_ME_VERSION_INTEL */\r\n#define CL_AVC_ME_VERSION_0_INTEL                           0x0\r\n#define CL_AVC_ME_VERSION_1_INTEL                           0x1\r\n\r\n/* Inter macro-block major shape values */\r\n#define CL_AVC_ME_MAJOR_16x16_INTEL                         0x0\r\n#define CL_AVC_ME_MAJOR_16x8_INTEL                          0x1\r\n#define CL_AVC_ME_MAJOR_8x16_INTEL                          0x2\r\n#define CL_AVC_ME_MAJOR_8x8_INTEL                           0x3\r\n\r\n/* Inter macro-block minor shape values */\r\n#define CL_AVC_ME_MINOR_8x8_INTEL                           0x0\r\n#define CL_AVC_ME_MINOR_8x4_INTEL                           0x1\r\n#define CL_AVC_ME_MINOR_4x8_INTEL                           0x2\r\n#define CL_AVC_ME_MINOR_4x4_INTEL                           0x3\r\n\r\n/* Inter macro-block major direction values */\r\n#define CL_AVC_ME_MAJOR_FORWARD_INTEL                       0x0\r\n#define CL_AVC_ME_MAJOR_BACKWARD_INTEL                      0x1\r\n#define CL_AVC_ME_MAJOR_BIDIRECTIONAL_INTEL                 0x2\r\n\r\n/* Inter (IME) partition mask values */\r\n#define CL_AVC_ME_PARTITION_MASK_ALL_INTEL                  0x0\r\n#define CL_AVC_ME_PARTITION_MASK_16x16_INTEL                0x7E\r\n#define 
CL_AVC_ME_PARTITION_MASK_16x8_INTEL                 0x7D\r\n#define CL_AVC_ME_PARTITION_MASK_8x16_INTEL                 0x7B\r\n#define CL_AVC_ME_PARTITION_MASK_8x8_INTEL                  0x77\r\n#define CL_AVC_ME_PARTITION_MASK_8x4_INTEL                  0x6F\r\n#define CL_AVC_ME_PARTITION_MASK_4x8_INTEL                  0x5F\r\n#define CL_AVC_ME_PARTITION_MASK_4x4_INTEL                  0x3F\r\n\r\n/* Search window configuration */\r\n#define CL_AVC_ME_SEARCH_WINDOW_EXHAUSTIVE_INTEL            0x0\r\n#define CL_AVC_ME_SEARCH_WINDOW_SMALL_INTEL                 0x1\r\n#define CL_AVC_ME_SEARCH_WINDOW_TINY_INTEL                  0x2\r\n#define CL_AVC_ME_SEARCH_WINDOW_EXTRA_TINY_INTEL            0x3\r\n#define CL_AVC_ME_SEARCH_WINDOW_DIAMOND_INTEL               0x4\r\n#define CL_AVC_ME_SEARCH_WINDOW_LARGE_DIAMOND_INTEL         0x5\r\n#define CL_AVC_ME_SEARCH_WINDOW_RESERVED0_INTEL             0x6\r\n#define CL_AVC_ME_SEARCH_WINDOW_RESERVED1_INTEL             0x7\r\n#define CL_AVC_ME_SEARCH_WINDOW_CUSTOM_INTEL                0x8\r\n#define CL_AVC_ME_SEARCH_WINDOW_16x12_RADIUS_INTEL          0x9\r\n#define CL_AVC_ME_SEARCH_WINDOW_4x4_RADIUS_INTEL            0x2\r\n#define CL_AVC_ME_SEARCH_WINDOW_2x2_RADIUS_INTEL            0xa\r\n\r\n/* SAD adjustment mode */\r\n#define CL_AVC_ME_SAD_ADJUST_MODE_NONE_INTEL                0x0\r\n#define CL_AVC_ME_SAD_ADJUST_MODE_HAAR_INTEL                0x2\r\n\r\n/* Pixel resolution */\r\n#define CL_AVC_ME_SUBPIXEL_MODE_INTEGER_INTEL               0x0\r\n#define CL_AVC_ME_SUBPIXEL_MODE_HPEL_INTEL                  0x1\r\n#define CL_AVC_ME_SUBPIXEL_MODE_QPEL_INTEL                  0x3\r\n\r\n/* Cost precision values */\r\n#define CL_AVC_ME_COST_PRECISION_QPEL_INTEL                 0x0\r\n#define CL_AVC_ME_COST_PRECISION_HPEL_INTEL                 0x1\r\n#define CL_AVC_ME_COST_PRECISION_PEL_INTEL                  0x2\r\n#define CL_AVC_ME_COST_PRECISION_DPEL_INTEL                 0x3\r\n\r\n/* Inter bidirectional weights */\r\n#define 
CL_AVC_ME_BIDIR_WEIGHT_QUARTER_INTEL                0x10\r\n#define CL_AVC_ME_BIDIR_WEIGHT_THIRD_INTEL                  0x15\r\n#define CL_AVC_ME_BIDIR_WEIGHT_HALF_INTEL                   0x20\r\n#define CL_AVC_ME_BIDIR_WEIGHT_TWO_THIRD_INTEL              0x2B\r\n#define CL_AVC_ME_BIDIR_WEIGHT_THREE_QUARTER_INTEL          0x30\r\n\r\n/* Inter border reached values */\r\n#define CL_AVC_ME_BORDER_REACHED_LEFT_INTEL                 0x0\r\n#define CL_AVC_ME_BORDER_REACHED_RIGHT_INTEL                0x2\r\n#define CL_AVC_ME_BORDER_REACHED_TOP_INTEL                  0x4\r\n#define CL_AVC_ME_BORDER_REACHED_BOTTOM_INTEL               0x8\r\n\r\n/* Inter skip block partition type */\r\n#define CL_AVC_ME_SKIP_BLOCK_PARTITION_16x16_INTEL          0x0\r\n#define CL_AVC_ME_SKIP_BLOCK_PARTITION_8x8_INTEL            0x4000\r\n\r\n/* Inter skip motion vector mask */\r\n#define CL_AVC_ME_SKIP_BLOCK_16x16_FORWARD_ENABLE_INTEL     (0x1 << 24)\r\n#define CL_AVC_ME_SKIP_BLOCK_16x16_BACKWARD_ENABLE_INTEL    (0x2 << 24)\r\n#define CL_AVC_ME_SKIP_BLOCK_16x16_DUAL_ENABLE_INTEL        (0x3 << 24)\r\n#define CL_AVC_ME_SKIP_BLOCK_8x8_FORWARD_ENABLE_INTEL       (0x55 << 24)\r\n#define CL_AVC_ME_SKIP_BLOCK_8x8_BACKWARD_ENABLE_INTEL      (0xAA << 24)\r\n#define CL_AVC_ME_SKIP_BLOCK_8x8_DUAL_ENABLE_INTEL          (0xFF << 24)\r\n#define CL_AVC_ME_SKIP_BLOCK_8x8_0_FORWARD_ENABLE_INTEL     (0x1 << 24)\r\n#define CL_AVC_ME_SKIP_BLOCK_8x8_0_BACKWARD_ENABLE_INTEL    (0x2 << 24)\r\n#define CL_AVC_ME_SKIP_BLOCK_8x8_1_FORWARD_ENABLE_INTEL     (0x1 << 26)\r\n#define CL_AVC_ME_SKIP_BLOCK_8x8_1_BACKWARD_ENABLE_INTEL    (0x2 << 26)\r\n#define CL_AVC_ME_SKIP_BLOCK_8x8_2_FORWARD_ENABLE_INTEL     (0x1 << 28)\r\n#define CL_AVC_ME_SKIP_BLOCK_8x8_2_BACKWARD_ENABLE_INTEL    (0x2 << 28)\r\n#define CL_AVC_ME_SKIP_BLOCK_8x8_3_FORWARD_ENABLE_INTEL     (0x1 << 30)\r\n#define CL_AVC_ME_SKIP_BLOCK_8x8_3_BACKWARD_ENABLE_INTEL    (0x2 << 30)\r\n\r\n/* Block based skip type values */\r\n#define 
CL_AVC_ME_BLOCK_BASED_SKIP_4x4_INTEL                0x00\r\n#define CL_AVC_ME_BLOCK_BASED_SKIP_8x8_INTEL                0x80\r\n\r\n/* Intra macro-block shape values */\r\n#define CL_AVC_ME_INTRA_16x16_INTEL                         0x0\r\n#define CL_AVC_ME_INTRA_8x8_INTEL                           0x1\r\n#define CL_AVC_ME_INTRA_4x4_INTEL                           0x2\r\n\r\n/* Luma intra partition mask values */\r\n#define CL_AVC_ME_INTRA_LUMA_PARTITION_MASK_16x16_INTEL     0x6\r\n#define CL_AVC_ME_INTRA_LUMA_PARTITION_MASK_8x8_INTEL       0x5\r\n#define CL_AVC_ME_INTRA_LUMA_PARTITION_MASK_4x4_INTEL       0x3\r\n\r\n/* Intra neighbor availability mask values */\r\n#define CL_AVC_ME_INTRA_NEIGHBOR_LEFT_MASK_ENABLE_INTEL     0x60\r\n#define CL_AVC_ME_INTRA_NEIGHBOR_UPPER_MASK_ENABLE_INTEL    0x10\r\n#define CL_AVC_ME_INTRA_NEIGHBOR_UPPER_RIGHT_MASK_ENABLE_INTEL 0x8\r\n#define CL_AVC_ME_INTRA_NEIGHBOR_UPPER_LEFT_MASK_ENABLE_INTEL 0x4\r\n\r\n/* Luma intra modes */\r\n#define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_INTEL        0x0\r\n#define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_INTEL      0x1\r\n#define CL_AVC_ME_LUMA_PREDICTOR_MODE_DC_INTEL              0x2\r\n#define CL_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_LEFT_INTEL 0x3\r\n#define CL_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_RIGHT_INTEL 0x4\r\n#define CL_AVC_ME_LUMA_PREDICTOR_MODE_PLANE_INTEL           0x4\r\n#define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_RIGHT_INTEL  0x5\r\n#define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_DOWN_INTEL 0x6\r\n#define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_LEFT_INTEL   0x7\r\n#define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_UP_INTEL   0x8\r\n\r\n/* Chroma intra modes */\r\n#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_DC_INTEL            0x0\r\n#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_HORIZONTAL_INTEL    0x1\r\n#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_VERTICAL_INTEL      0x2\r\n#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_PLANE_INTEL         0x3\r\n\r\n/* Reference image select 
values */\r\n#define CL_AVC_ME_FRAME_FORWARD_INTEL                       0x1\r\n#define CL_AVC_ME_FRAME_BACKWARD_INTEL                      0x2\r\n#define CL_AVC_ME_FRAME_DUAL_INTEL                          0x3\r\n\r\n/* Slice type values */\r\n#define CL_AVC_ME_SLICE_TYPE_PRED_INTEL                     0x0\r\n#define CL_AVC_ME_SLICE_TYPE_BPRED_INTEL                    0x1\r\n#define CL_AVC_ME_SLICE_TYPE_INTRA_INTEL                    0x2\r\n\r\n/* Interlaced image field polarity values */\r\n#define CL_AVC_ME_INTERLACED_SCAN_TOP_FIELD_INTEL           0x0\r\n#define CL_AVC_ME_INTERLACED_SCAN_BOTTOM_FIELD_INTEL        0x1\r\n\r\n/***************************************************************\r\n* cl_intel_unified_shared_memory\r\n***************************************************************/\r\n#define cl_intel_unified_shared_memory 1\r\n#define CL_INTEL_UNIFIED_SHARED_MEMORY_EXTENSION_NAME \\\r\n    \"cl_intel_unified_shared_memory\"\r\n\r\n\r\n#define CL_INTEL_UNIFIED_SHARED_MEMORY_EXTENSION_VERSION CL_MAKE_VERSION(0, 0, 0)\r\n\r\ntypedef cl_bitfield         cl_device_unified_shared_memory_capabilities_intel;\r\ntypedef cl_properties       cl_mem_properties_intel;\r\ntypedef cl_bitfield         cl_mem_alloc_flags_intel;\r\ntypedef cl_uint             cl_mem_info_intel;\r\ntypedef cl_uint             cl_unified_shared_memory_type_intel;\r\ntypedef cl_uint             cl_mem_advice_intel;\r\n\r\n/* cl_device_info */\r\n#define CL_DEVICE_HOST_MEM_CAPABILITIES_INTEL               0x4190\r\n#define CL_DEVICE_DEVICE_MEM_CAPABILITIES_INTEL             0x4191\r\n#define CL_DEVICE_SINGLE_DEVICE_SHARED_MEM_CAPABILITIES_INTEL 0x4192\r\n#define CL_DEVICE_CROSS_DEVICE_SHARED_MEM_CAPABILITIES_INTEL 0x4193\r\n#define CL_DEVICE_SHARED_SYSTEM_MEM_CAPABILITIES_INTEL      0x4194\r\n\r\n/* cl_unified_shared_memory_capabilities_intel - bitfield */\r\n#define CL_UNIFIED_SHARED_MEMORY_ACCESS_INTEL               (1 << 0)\r\n#define CL_UNIFIED_SHARED_MEMORY_ATOMIC_ACCESS_INTEL        
(1 << 1)\r\n#define CL_UNIFIED_SHARED_MEMORY_CONCURRENT_ACCESS_INTEL    (1 << 2)\r\n#define CL_UNIFIED_SHARED_MEMORY_CONCURRENT_ATOMIC_ACCESS_INTEL (1 << 3)\r\n\r\n/* cl_mem_properties_intel */\r\n#define CL_MEM_ALLOC_FLAGS_INTEL                            0x4195\r\n\r\n/* cl_mem_alloc_flags_intel - bitfield */\r\n#define CL_MEM_ALLOC_WRITE_COMBINED_INTEL                   (1 << 0)\r\n#define CL_MEM_ALLOC_INITIAL_PLACEMENT_DEVICE_INTEL         (1 << 1)\r\n#define CL_MEM_ALLOC_INITIAL_PLACEMENT_HOST_INTEL           (1 << 2)\r\n\r\n/* cl_mem_alloc_info_intel */\r\n#define CL_MEM_ALLOC_TYPE_INTEL                             0x419A\r\n#define CL_MEM_ALLOC_BASE_PTR_INTEL                         0x419B\r\n#define CL_MEM_ALLOC_SIZE_INTEL                             0x419C\r\n#define CL_MEM_ALLOC_DEVICE_INTEL                           0x419D\r\n\r\n/* cl_unified_shared_memory_type_intel */\r\n#define CL_MEM_TYPE_UNKNOWN_INTEL                           0x4196\r\n#define CL_MEM_TYPE_HOST_INTEL                              0x4197\r\n#define CL_MEM_TYPE_DEVICE_INTEL                            0x4198\r\n#define CL_MEM_TYPE_SHARED_INTEL                            0x4199\r\n\r\n/* cl_kernel_exec_info */\r\n#define CL_KERNEL_EXEC_INFO_INDIRECT_HOST_ACCESS_INTEL      0x4200\r\n#define CL_KERNEL_EXEC_INFO_INDIRECT_DEVICE_ACCESS_INTEL    0x4201\r\n#define CL_KERNEL_EXEC_INFO_INDIRECT_SHARED_ACCESS_INTEL    0x4202\r\n#define CL_KERNEL_EXEC_INFO_USM_PTRS_INTEL                  0x4203\r\n\r\n/* cl_command_type */\r\n#define CL_COMMAND_MEMFILL_INTEL                            0x4204\r\n#define CL_COMMAND_MEMCPY_INTEL                             0x4205\r\n#define CL_COMMAND_MIGRATEMEM_INTEL                         0x4206\r\n#define CL_COMMAND_MEMADVISE_INTEL                          0x4207\r\n\r\n\r\ntypedef void* CL_API_CALL\r\nclHostMemAllocINTEL_t(\r\n    cl_context context,\r\n    const cl_mem_properties_intel* properties,\r\n    size_t size,\r\n    cl_uint alignment,\r\n    cl_int* 
errcode_ret);\r\n\r\ntypedef clHostMemAllocINTEL_t *\r\nclHostMemAllocINTEL_fn ;\r\n\r\ntypedef void* CL_API_CALL\r\nclDeviceMemAllocINTEL_t(\r\n    cl_context context,\r\n    cl_device_id device,\r\n    const cl_mem_properties_intel* properties,\r\n    size_t size,\r\n    cl_uint alignment,\r\n    cl_int* errcode_ret);\r\n\r\ntypedef clDeviceMemAllocINTEL_t *\r\nclDeviceMemAllocINTEL_fn ;\r\n\r\ntypedef void* CL_API_CALL\r\nclSharedMemAllocINTEL_t(\r\n    cl_context context,\r\n    cl_device_id device,\r\n    const cl_mem_properties_intel* properties,\r\n    size_t size,\r\n    cl_uint alignment,\r\n    cl_int* errcode_ret);\r\n\r\ntypedef clSharedMemAllocINTEL_t *\r\nclSharedMemAllocINTEL_fn ;\r\n\r\ntypedef cl_int CL_API_CALL\r\nclMemFreeINTEL_t(\r\n    cl_context context,\r\n    void* ptr);\r\n\r\ntypedef clMemFreeINTEL_t *\r\nclMemFreeINTEL_fn ;\r\n\r\ntypedef cl_int CL_API_CALL\r\nclMemBlockingFreeINTEL_t(\r\n    cl_context context,\r\n    void* ptr);\r\n\r\ntypedef clMemBlockingFreeINTEL_t *\r\nclMemBlockingFreeINTEL_fn ;\r\n\r\ntypedef cl_int CL_API_CALL\r\nclGetMemAllocInfoINTEL_t(\r\n    cl_context context,\r\n    const void* ptr,\r\n    cl_mem_info_intel param_name,\r\n    size_t param_value_size,\r\n    void* param_value,\r\n    size_t* param_value_size_ret);\r\n\r\ntypedef clGetMemAllocInfoINTEL_t *\r\nclGetMemAllocInfoINTEL_fn ;\r\n\r\ntypedef cl_int CL_API_CALL\r\nclSetKernelArgMemPointerINTEL_t(\r\n    cl_kernel kernel,\r\n    cl_uint arg_index,\r\n    const void* arg_value);\r\n\r\ntypedef clSetKernelArgMemPointerINTEL_t *\r\nclSetKernelArgMemPointerINTEL_fn ;\r\n\r\ntypedef cl_int CL_API_CALL\r\nclEnqueueMemFillINTEL_t(\r\n    cl_command_queue command_queue,\r\n    void* dst_ptr,\r\n    const void* pattern,\r\n    size_t pattern_size,\r\n    size_t size,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event);\r\n\r\ntypedef clEnqueueMemFillINTEL_t *\r\nclEnqueueMemFillINTEL_fn ;\r\n\r\ntypedef 
cl_int CL_API_CALL\r\nclEnqueueMemcpyINTEL_t(\r\n    cl_command_queue command_queue,\r\n    cl_bool blocking,\r\n    void* dst_ptr,\r\n    const void* src_ptr,\r\n    size_t size,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event);\r\n\r\ntypedef clEnqueueMemcpyINTEL_t *\r\nclEnqueueMemcpyINTEL_fn ;\r\n\r\ntypedef cl_int CL_API_CALL\r\nclEnqueueMemAdviseINTEL_t(\r\n    cl_command_queue command_queue,\r\n    const void* ptr,\r\n    size_t size,\r\n    cl_mem_advice_intel advice,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event);\r\n\r\ntypedef clEnqueueMemAdviseINTEL_t *\r\nclEnqueueMemAdviseINTEL_fn ;\r\n\r\n#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES)\r\n\r\nextern CL_API_ENTRY void* CL_API_CALL\r\nclHostMemAllocINTEL(\r\n    cl_context context,\r\n    const cl_mem_properties_intel* properties,\r\n    size_t size,\r\n    cl_uint alignment,\r\n    cl_int* errcode_ret) ;\r\n\r\nextern CL_API_ENTRY void* CL_API_CALL\r\nclDeviceMemAllocINTEL(\r\n    cl_context context,\r\n    cl_device_id device,\r\n    const cl_mem_properties_intel* properties,\r\n    size_t size,\r\n    cl_uint alignment,\r\n    cl_int* errcode_ret) ;\r\n\r\nextern CL_API_ENTRY void* CL_API_CALL\r\nclSharedMemAllocINTEL(\r\n    cl_context context,\r\n    cl_device_id device,\r\n    const cl_mem_properties_intel* properties,\r\n    size_t size,\r\n    cl_uint alignment,\r\n    cl_int* errcode_ret) ;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclMemFreeINTEL(\r\n    cl_context context,\r\n    void* ptr) ;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclMemBlockingFreeINTEL(\r\n    cl_context context,\r\n    void* ptr) ;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclGetMemAllocInfoINTEL(\r\n    cl_context context,\r\n    const void* ptr,\r\n    cl_mem_info_intel param_name,\r\n    size_t param_value_size,\r\n    void* param_value,\r\n    size_t* param_value_size_ret) 
;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclSetKernelArgMemPointerINTEL(\r\n    cl_kernel kernel,\r\n    cl_uint arg_index,\r\n    const void* arg_value) ;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclEnqueueMemFillINTEL(\r\n    cl_command_queue command_queue,\r\n    void* dst_ptr,\r\n    const void* pattern,\r\n    size_t pattern_size,\r\n    size_t size,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event) ;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclEnqueueMemcpyINTEL(\r\n    cl_command_queue command_queue,\r\n    cl_bool blocking,\r\n    void* dst_ptr,\r\n    const void* src_ptr,\r\n    size_t size,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event) ;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclEnqueueMemAdviseINTEL(\r\n    cl_command_queue command_queue,\r\n    const void* ptr,\r\n    size_t size,\r\n    cl_mem_advice_intel advice,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event) ;\r\n\r\n#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */\r\n\r\n#if defined(CL_VERSION_1_2)\r\n/* Requires OpenCL 1.2 for cl_mem_migration_flags: */\r\n\r\ntypedef cl_int CL_API_CALL\r\nclEnqueueMigrateMemINTEL_t(\r\n    cl_command_queue command_queue,\r\n    const void* ptr,\r\n    size_t size,\r\n    cl_mem_migration_flags flags,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event);\r\n\r\ntypedef clEnqueueMigrateMemINTEL_t *\r\nclEnqueueMigrateMemINTEL_fn ;\r\n\r\n#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES)\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclEnqueueMigrateMemINTEL(\r\n    cl_command_queue command_queue,\r\n    const void* ptr,\r\n    size_t size,\r\n    cl_mem_migration_flags flags,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event) 
;\r\n\r\n#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */\r\n\r\n#endif /* defined(CL_VERSION_1_2) */\r\n\r\n/* deprecated, use clEnqueueMemFillINTEL instead */\r\n\r\ntypedef cl_int CL_API_CALL\r\nclEnqueueMemsetINTEL_t(\r\n    cl_command_queue command_queue,\r\n    void* dst_ptr,\r\n    cl_int value,\r\n    size_t size,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event);\r\n\r\ntypedef clEnqueueMemsetINTEL_t *\r\nclEnqueueMemsetINTEL_fn ;\r\n\r\n#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES)\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclEnqueueMemsetINTEL(\r\n    cl_command_queue command_queue,\r\n    void* dst_ptr,\r\n    cl_int value,\r\n    size_t size,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event) ;\r\n\r\n#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */\r\n\r\n/***************************************************************\r\n* cl_intel_mem_alloc_buffer_location\r\n***************************************************************/\r\n#define cl_intel_mem_alloc_buffer_location 1\r\n#define CL_INTEL_MEM_ALLOC_BUFFER_LOCATION_EXTENSION_NAME \\\r\n    \"cl_intel_mem_alloc_buffer_location\"\r\n\r\n\r\n#define CL_INTEL_MEM_ALLOC_BUFFER_LOCATION_EXTENSION_VERSION CL_MAKE_VERSION(0, 0, 0)\r\n\r\n/* cl_mem_properties_intel */\r\n#define CL_MEM_ALLOC_BUFFER_LOCATION_INTEL                  0x419E\r\n\r\n/* cl_mem_alloc_info_intel */\r\n/* enum CL_MEM_ALLOC_BUFFER_LOCATION_INTEL */\r\n\r\n/***************************************************************\r\n* cl_intel_create_buffer_with_properties\r\n***************************************************************/\r\n#define cl_intel_create_buffer_with_properties 1\r\n#define CL_INTEL_CREATE_BUFFER_WITH_PROPERTIES_EXTENSION_NAME \\\r\n    \"cl_intel_create_buffer_with_properties\"\r\n\r\n\r\n#define CL_INTEL_CREATE_BUFFER_WITH_PROPERTIES_EXTENSION_VERSION 
CL_MAKE_VERSION(0, 0, 0)\r\n\r\n/* type cl_mem_properties_intel */\r\n\r\n\r\ntypedef cl_mem CL_API_CALL\r\nclCreateBufferWithPropertiesINTEL_t(\r\n    cl_context context,\r\n    const cl_mem_properties_intel* properties,\r\n    cl_mem_flags flags,\r\n    size_t size,\r\n    void* host_ptr,\r\n    cl_int* errcode_ret);\r\n\r\ntypedef clCreateBufferWithPropertiesINTEL_t *\r\nclCreateBufferWithPropertiesINTEL_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\n#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES)\r\n\r\nextern CL_API_ENTRY cl_mem CL_API_CALL\r\nclCreateBufferWithPropertiesINTEL(\r\n    cl_context context,\r\n    const cl_mem_properties_intel* properties,\r\n    cl_mem_flags flags,\r\n    size_t size,\r\n    void* host_ptr,\r\n    cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_0;\r\n\r\n#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */\r\n\r\n/***************************************************************\r\n* cl_intel_program_scope_host_pipe\r\n***************************************************************/\r\n#define cl_intel_program_scope_host_pipe 1\r\n#define CL_INTEL_PROGRAM_SCOPE_HOST_PIPE_EXTENSION_NAME \\\r\n    \"cl_intel_program_scope_host_pipe\"\r\n\r\n\r\n#define CL_INTEL_PROGRAM_SCOPE_HOST_PIPE_EXTENSION_VERSION CL_MAKE_VERSION(0, 0, 0)\r\n\r\n/* clGetEventInfo response when param_name is CL_EVENT_COMMAND_TYPE */\r\n#define CL_COMMAND_READ_HOST_PIPE_INTEL                     0x4214\r\n#define CL_COMMAND_WRITE_HOST_PIPE_INTEL                    0x4215\r\n\r\n/* clGetProgramInfo param_name */\r\n#define CL_PROGRAM_NUM_HOST_PIPES_INTEL                     0x4216\r\n#define CL_PROGRAM_HOST_PIPE_NAMES_INTEL                    0x4217\r\n\r\n\r\ntypedef cl_int CL_API_CALL\r\nclEnqueueReadHostPipeINTEL_t(\r\n    cl_command_queue command_queue,\r\n    cl_program program,\r\n    const char* pipe_symbol,\r\n    cl_bool blocking_read,\r\n    void* ptr,\r\n    size_t size,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* 
event_wait_list,\r\n    cl_event* event);\r\n\r\ntypedef clEnqueueReadHostPipeINTEL_t *\r\nclEnqueueReadHostPipeINTEL_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef cl_int CL_API_CALL\r\nclEnqueueWriteHostPipeINTEL_t(\r\n    cl_command_queue command_queue,\r\n    cl_program program,\r\n    const char* pipe_symbol,\r\n    cl_bool blocking_write,\r\n    const void* ptr,\r\n    size_t size,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event);\r\n\r\ntypedef clEnqueueWriteHostPipeINTEL_t *\r\nclEnqueueWriteHostPipeINTEL_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\n#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES)\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclEnqueueReadHostPipeINTEL(\r\n    cl_command_queue command_queue,\r\n    cl_program program,\r\n    const char* pipe_symbol,\r\n    cl_bool blocking_read,\r\n    void* ptr,\r\n    size_t size,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event) CL_API_SUFFIX__VERSION_1_0;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclEnqueueWriteHostPipeINTEL(\r\n    cl_command_queue command_queue,\r\n    cl_program program,\r\n    const char* pipe_symbol,\r\n    cl_bool blocking_write,\r\n    const void* ptr,\r\n    size_t size,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event) CL_API_SUFFIX__VERSION_1_0;\r\n\r\n#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */\r\n\r\n/***************************************************************\r\n* cl_intel_mem_channel_property\r\n***************************************************************/\r\n#define cl_intel_mem_channel_property 1\r\n#define CL_INTEL_MEM_CHANNEL_PROPERTY_EXTENSION_NAME \\\r\n    \"cl_intel_mem_channel_property\"\r\n\r\n\r\n#define CL_INTEL_MEM_CHANNEL_PROPERTY_EXTENSION_VERSION CL_MAKE_VERSION(0, 0, 0)\r\n\r\n/* cl_mem_properties_intel */\r\n#define CL_MEM_CHANNEL_INTEL                     
           0x4213\r\n\r\n/***************************************************************\r\n* cl_intel_mem_force_host_memory\r\n***************************************************************/\r\n#define cl_intel_mem_force_host_memory 1\r\n#define CL_INTEL_MEM_FORCE_HOST_MEMORY_EXTENSION_NAME \\\r\n    \"cl_intel_mem_force_host_memory\"\r\n\r\n\r\n#define CL_INTEL_MEM_FORCE_HOST_MEMORY_EXTENSION_VERSION CL_MAKE_VERSION(0, 0, 0)\r\n\r\n/* cl_mem_flags */\r\n#define CL_MEM_FORCE_HOST_MEMORY_INTEL                      (1 << 20)\r\n\r\n/***************************************************************\r\n* cl_intel_command_queue_families\r\n***************************************************************/\r\n#define cl_intel_command_queue_families 1\r\n#define CL_INTEL_COMMAND_QUEUE_FAMILIES_EXTENSION_NAME \\\r\n    \"cl_intel_command_queue_families\"\r\n\r\n\r\n#define CL_INTEL_COMMAND_QUEUE_FAMILIES_EXTENSION_VERSION CL_MAKE_VERSION(0, 0, 0)\r\n\r\ntypedef cl_bitfield         cl_command_queue_capabilities_intel;\r\n\r\n#define CL_QUEUE_FAMILY_MAX_NAME_SIZE_INTEL                 64\r\n\r\ntypedef struct _cl_queue_family_properties_intel {\r\n    cl_command_queue_properties properties;\r\n    cl_command_queue_capabilities_intel capabilities;\r\n    cl_uint count;\r\n    char name[CL_QUEUE_FAMILY_MAX_NAME_SIZE_INTEL];\r\n} cl_queue_family_properties_intel;\r\n\r\n/* cl_device_info */\r\n#define CL_DEVICE_QUEUE_FAMILY_PROPERTIES_INTEL             0x418B\r\n\r\n/* cl_queue_properties */\r\n#define CL_QUEUE_FAMILY_INTEL                               0x418C\r\n#define CL_QUEUE_INDEX_INTEL                                0x418D\r\n\r\n/* cl_command_queue_capabilities_intel */\r\n#define CL_QUEUE_DEFAULT_CAPABILITIES_INTEL                 0\r\n#define CL_QUEUE_CAPABILITY_CREATE_SINGLE_QUEUE_EVENTS_INTEL (1 << 0)\r\n#define CL_QUEUE_CAPABILITY_CREATE_CROSS_QUEUE_EVENTS_INTEL (1 << 1)\r\n#define CL_QUEUE_CAPABILITY_SINGLE_QUEUE_EVENT_WAIT_LIST_INTEL (1 << 2)\r\n#define 
CL_QUEUE_CAPABILITY_CROSS_QUEUE_EVENT_WAIT_LIST_INTEL (1 << 3)\r\n#define CL_QUEUE_CAPABILITY_TRANSFER_BUFFER_INTEL           (1 << 8)\r\n#define CL_QUEUE_CAPABILITY_TRANSFER_BUFFER_RECT_INTEL      (1 << 9)\r\n#define CL_QUEUE_CAPABILITY_MAP_BUFFER_INTEL                (1 << 10)\r\n#define CL_QUEUE_CAPABILITY_FILL_BUFFER_INTEL               (1 << 11)\r\n#define CL_QUEUE_CAPABILITY_TRANSFER_IMAGE_INTEL            (1 << 12)\r\n#define CL_QUEUE_CAPABILITY_MAP_IMAGE_INTEL                 (1 << 13)\r\n#define CL_QUEUE_CAPABILITY_FILL_IMAGE_INTEL                (1 << 14)\r\n#define CL_QUEUE_CAPABILITY_TRANSFER_BUFFER_IMAGE_INTEL     (1 << 15)\r\n#define CL_QUEUE_CAPABILITY_TRANSFER_IMAGE_BUFFER_INTEL     (1 << 16)\r\n#define CL_QUEUE_CAPABILITY_MARKER_INTEL                    (1 << 24)\r\n#define CL_QUEUE_CAPABILITY_BARRIER_INTEL                   (1 << 25)\r\n#define CL_QUEUE_CAPABILITY_KERNEL_INTEL                    (1 << 26)\r\n\r\n/***************************************************************\r\n* cl_intel_queue_no_sync_operations\r\n***************************************************************/\r\n#define cl_intel_queue_no_sync_operations 1\r\n#define CL_INTEL_QUEUE_NO_SYNC_OPERATIONS_EXTENSION_NAME \\\r\n    \"cl_intel_queue_no_sync_operations\"\r\n\r\n\r\n#define CL_INTEL_QUEUE_NO_SYNC_OPERATIONS_EXTENSION_VERSION CL_MAKE_VERSION(0, 0, 0)\r\n\r\n/* cl_command_queue_properties */\r\n#define CL_QUEUE_NO_SYNC_OPERATIONS_INTEL                   (1 << 29)\r\n\r\n/***************************************************************\r\n* cl_intel_sharing_format_query\r\n***************************************************************/\r\n#define cl_intel_sharing_format_query 1\r\n#define CL_INTEL_SHARING_FORMAT_QUERY_EXTENSION_NAME \\\r\n    \"cl_intel_sharing_format_query\"\r\n\r\n\r\n#define CL_INTEL_SHARING_FORMAT_QUERY_EXTENSION_VERSION CL_MAKE_VERSION(0, 0, 0)\r\n\r\n/***************************************************************\r\n* 
cl_ext_image_requirements_info\r\n***************************************************************/\r\n#if defined(CL_VERSION_3_0)\r\n\r\n#define cl_ext_image_requirements_info 1\r\n#define CL_EXT_IMAGE_REQUIREMENTS_INFO_EXTENSION_NAME \\\r\n    \"cl_ext_image_requirements_info\"\r\n\r\n\r\n#define CL_EXT_IMAGE_REQUIREMENTS_INFO_EXTENSION_VERSION CL_MAKE_VERSION(0, 5, 0)\r\n\r\n/* Types */\r\ntypedef cl_uint             cl_image_requirements_info_ext;\r\n\r\n/* cl_image_requirements_info_ext */\r\n#define CL_IMAGE_REQUIREMENTS_BASE_ADDRESS_ALIGNMENT_EXT    0x1292\r\n#define CL_IMAGE_REQUIREMENTS_ROW_PITCH_ALIGNMENT_EXT       0x1290\r\n#define CL_IMAGE_REQUIREMENTS_SIZE_EXT                      0x12B2\r\n#define CL_IMAGE_REQUIREMENTS_MAX_WIDTH_EXT                 0x12B3\r\n#define CL_IMAGE_REQUIREMENTS_MAX_HEIGHT_EXT                0x12B4\r\n#define CL_IMAGE_REQUIREMENTS_MAX_DEPTH_EXT                 0x12B5\r\n#define CL_IMAGE_REQUIREMENTS_MAX_ARRAY_SIZE_EXT            0x12B6\r\n\r\n/* Enqueued Commands APIs */\r\n\r\ntypedef cl_int CL_API_CALL\r\nclGetImageRequirementsInfoEXT_t(\r\n    cl_context context,\r\n    const cl_mem_properties* properties,\r\n    cl_mem_flags flags,\r\n    const cl_image_format* image_format,\r\n    const cl_image_desc* image_desc,\r\n    cl_image_requirements_info_ext param_name,\r\n    size_t param_value_size,\r\n    void* param_value,\r\n    size_t* param_value_size_ret);\r\n\r\ntypedef clGetImageRequirementsInfoEXT_t *\r\nclGetImageRequirementsInfoEXT_fn CL_API_SUFFIX__VERSION_3_0;\r\n\r\n#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES)\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclGetImageRequirementsInfoEXT(\r\n    cl_context context,\r\n    const cl_mem_properties* properties,\r\n    cl_mem_flags flags,\r\n    const cl_image_format* image_format,\r\n    const cl_image_desc* image_desc,\r\n    cl_image_requirements_info_ext param_name,\r\n    size_t param_value_size,\r\n    void* param_value,\r\n    size_t* 
param_value_size_ret) CL_API_SUFFIX__VERSION_3_0;\r\n\r\n#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */\r\n\r\n#endif /* defined(CL_VERSION_3_0) */\r\n\r\n/***************************************************************\r\n* cl_ext_image_from_buffer\r\n***************************************************************/\r\n#if defined(CL_VERSION_3_0)\r\n\r\n#define cl_ext_image_from_buffer 1\r\n#define CL_EXT_IMAGE_FROM_BUFFER_EXTENSION_NAME \\\r\n    \"cl_ext_image_from_buffer\"\r\n\r\n\r\n#define CL_EXT_IMAGE_FROM_BUFFER_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)\r\n\r\n/* cl_image_requirements_info_ext */\r\n#define CL_IMAGE_REQUIREMENTS_SLICE_PITCH_ALIGNMENT_EXT     0x1291\r\n\r\n#endif /* defined(CL_VERSION_3_0) */\r\n\r\n/***************************************************************\r\n* cl_loader_info\r\n***************************************************************/\r\n#define cl_loader_info 1\r\n#define CL_LOADER_INFO_EXTENSION_NAME \\\r\n    \"cl_loader_info\"\r\n\r\n\r\n#define CL_LOADER_INFO_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)\r\n\r\ntypedef cl_uint             cl_icdl_info;\r\n\r\n/* cl_icdl_info */\r\n#define CL_ICDL_OCL_VERSION                                 1\r\n#define CL_ICDL_VERSION                                     2\r\n#define CL_ICDL_NAME                                        3\r\n#define CL_ICDL_VENDOR                                      4\r\n\r\n\r\ntypedef cl_int CL_API_CALL\r\nclGetICDLoaderInfoOCLICD_t(\r\n    cl_icdl_info param_name,\r\n    size_t param_value_size,\r\n    void* param_value,\r\n    size_t* param_value_size_ret);\r\n\r\ntypedef clGetICDLoaderInfoOCLICD_t *\r\nclGetICDLoaderInfoOCLICD_fn ;\r\n\r\n#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES)\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclGetICDLoaderInfoOCLICD(\r\n    cl_icdl_info param_name,\r\n    size_t param_value_size,\r\n    void* param_value,\r\n    size_t* param_value_size_ret) ;\r\n\r\n#endif /* 
!defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */\r\n\r\n/***************************************************************\r\n* cl_khr_depth_images\r\n***************************************************************/\r\n#define cl_khr_depth_images 1\r\n#define CL_KHR_DEPTH_IMAGES_EXTENSION_NAME \\\r\n    \"cl_khr_depth_images\"\r\n\r\n\r\n#define CL_KHR_DEPTH_IMAGES_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)\r\n\r\n#if !defined(CL_VERSION_2_0)\r\n/* cl_channel_order - defined in CL.h for OpenCL 2.0 and newer */\r\n#define CL_DEPTH                                            0x10BD\r\n\r\n#endif /* !defined(CL_VERSION_2_0) */\r\n\r\n/***************************************************************\r\n* cl_ext_float_atomics\r\n***************************************************************/\r\n#define cl_ext_float_atomics 1\r\n#define CL_EXT_FLOAT_ATOMICS_EXTENSION_NAME \\\r\n    \"cl_ext_float_atomics\"\r\n\r\n\r\n#define CL_EXT_FLOAT_ATOMICS_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)\r\n\r\ntypedef cl_bitfield         cl_device_fp_atomic_capabilities_ext;\r\n\r\n/* cl_device_fp_atomic_capabilities_ext */\r\n#define CL_DEVICE_GLOBAL_FP_ATOMIC_LOAD_STORE_EXT           (1 << 0)\r\n#define CL_DEVICE_GLOBAL_FP_ATOMIC_ADD_EXT                  (1 << 1)\r\n#define CL_DEVICE_GLOBAL_FP_ATOMIC_MIN_MAX_EXT              (1 << 2)\r\n#define CL_DEVICE_LOCAL_FP_ATOMIC_LOAD_STORE_EXT            (1 << 16)\r\n#define CL_DEVICE_LOCAL_FP_ATOMIC_ADD_EXT                   (1 << 17)\r\n#define CL_DEVICE_LOCAL_FP_ATOMIC_MIN_MAX_EXT               (1 << 18)\r\n\r\n/* cl_device_info */\r\n#define CL_DEVICE_SINGLE_FP_ATOMIC_CAPABILITIES_EXT         0x4231\r\n#define CL_DEVICE_DOUBLE_FP_ATOMIC_CAPABILITIES_EXT         0x4232\r\n#define CL_DEVICE_HALF_FP_ATOMIC_CAPABILITIES_EXT           0x4233\r\n\r\n/***************************************************************\r\n* cl_intel_create_mem_object_properties\r\n***************************************************************/\r\n#define 
cl_intel_create_mem_object_properties 1\r\n#define CL_INTEL_CREATE_MEM_OBJECT_PROPERTIES_EXTENSION_NAME \\\r\n    \"cl_intel_create_mem_object_properties\"\r\n\r\n\r\n#define CL_INTEL_CREATE_MEM_OBJECT_PROPERTIES_EXTENSION_VERSION CL_MAKE_VERSION(0, 0, 0)\r\n\r\n/* cl_mem_properties */\r\n#define CL_MEM_LOCALLY_UNCACHED_RESOURCE_INTEL              0x4218\r\n#define CL_MEM_DEVICE_ID_INTEL                              0x4219\r\n\r\n/***************************************************************\r\n* cl_pocl_content_size\r\n***************************************************************/\r\n#define cl_pocl_content_size 1\r\n#define CL_POCL_CONTENT_SIZE_EXTENSION_NAME \\\r\n    \"cl_pocl_content_size\"\r\n\r\n\r\n#define CL_POCL_CONTENT_SIZE_EXTENSION_VERSION CL_MAKE_VERSION(0, 0, 0)\r\n\r\n\r\ntypedef cl_int CL_API_CALL\r\nclSetContentSizeBufferPoCL_t(\r\n    cl_mem buffer,\r\n    cl_mem content_size_buffer);\r\n\r\ntypedef clSetContentSizeBufferPoCL_t *\r\nclSetContentSizeBufferPoCL_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\n#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES)\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclSetContentSizeBufferPoCL(\r\n    cl_mem buffer,\r\n    cl_mem content_size_buffer) CL_API_SUFFIX__VERSION_1_0;\r\n\r\n#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */\r\n\r\n/***************************************************************\r\n* cl_ext_image_raw10_raw12\r\n***************************************************************/\r\n#define cl_ext_image_raw10_raw12 1\r\n#define CL_EXT_IMAGE_RAW10_RAW12_EXTENSION_NAME \\\r\n    \"cl_ext_image_raw10_raw12\"\r\n\r\n\r\n#define CL_EXT_IMAGE_RAW10_RAW12_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)\r\n\r\n/* cl_channel_type */\r\n#define CL_UNSIGNED_INT_RAW10_EXT                           0x10E3\r\n#define CL_UNSIGNED_INT_RAW12_EXT                           0x10E4\r\n\r\n/***************************************************************\r\n* 
cl_khr_3d_image_writes\r\n***************************************************************/\r\n#define cl_khr_3d_image_writes 1\r\n#define CL_KHR_3D_IMAGE_WRITES_EXTENSION_NAME \\\r\n    \"cl_khr_3d_image_writes\"\r\n\r\n\r\n#define CL_KHR_3D_IMAGE_WRITES_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)\r\n\r\n/***************************************************************\r\n* cl_khr_async_work_group_copy_fence\r\n***************************************************************/\r\n#define cl_khr_async_work_group_copy_fence 1\r\n#define CL_KHR_ASYNC_WORK_GROUP_COPY_FENCE_EXTENSION_NAME \\\r\n    \"cl_khr_async_work_group_copy_fence\"\r\n\r\n\r\n#define CL_KHR_ASYNC_WORK_GROUP_COPY_FENCE_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)\r\n\r\n/***************************************************************\r\n* cl_khr_byte_addressable_store\r\n***************************************************************/\r\n#define cl_khr_byte_addressable_store 1\r\n#define CL_KHR_BYTE_ADDRESSABLE_STORE_EXTENSION_NAME \\\r\n    \"cl_khr_byte_addressable_store\"\r\n\r\n\r\n#define CL_KHR_BYTE_ADDRESSABLE_STORE_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)\r\n\r\n/***************************************************************\r\n* cl_khr_device_enqueue_local_arg_types\r\n***************************************************************/\r\n#define cl_khr_device_enqueue_local_arg_types 1\r\n#define CL_KHR_DEVICE_ENQUEUE_LOCAL_ARG_TYPES_EXTENSION_NAME \\\r\n    \"cl_khr_device_enqueue_local_arg_types\"\r\n\r\n\r\n#define CL_KHR_DEVICE_ENQUEUE_LOCAL_ARG_TYPES_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)\r\n\r\n/***************************************************************\r\n* cl_khr_expect_assume\r\n***************************************************************/\r\n#define cl_khr_expect_assume 1\r\n#define CL_KHR_EXPECT_ASSUME_EXTENSION_NAME \\\r\n    \"cl_khr_expect_assume\"\r\n\r\n\r\n#define CL_KHR_EXPECT_ASSUME_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 
0)\r\n\r\n/***************************************************************\r\n* cl_khr_extended_async_copies\r\n***************************************************************/\r\n#define cl_khr_extended_async_copies 1\r\n#define CL_KHR_EXTENDED_ASYNC_COPIES_EXTENSION_NAME \\\r\n    \"cl_khr_extended_async_copies\"\r\n\r\n\r\n#define CL_KHR_EXTENDED_ASYNC_COPIES_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)\r\n\r\n/***************************************************************\r\n* cl_khr_extended_bit_ops\r\n***************************************************************/\r\n#define cl_khr_extended_bit_ops 1\r\n#define CL_KHR_EXTENDED_BIT_OPS_EXTENSION_NAME \\\r\n    \"cl_khr_extended_bit_ops\"\r\n\r\n\r\n#define CL_KHR_EXTENDED_BIT_OPS_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)\r\n\r\n/***************************************************************\r\n* cl_khr_global_int32_base_atomics\r\n***************************************************************/\r\n#define cl_khr_global_int32_base_atomics 1\r\n#define CL_KHR_GLOBAL_INT32_BASE_ATOMICS_EXTENSION_NAME \\\r\n    \"cl_khr_global_int32_base_atomics\"\r\n\r\n\r\n#define CL_KHR_GLOBAL_INT32_BASE_ATOMICS_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)\r\n\r\n/***************************************************************\r\n* cl_khr_global_int32_extended_atomics\r\n***************************************************************/\r\n#define cl_khr_global_int32_extended_atomics 1\r\n#define CL_KHR_GLOBAL_INT32_EXTENDED_ATOMICS_EXTENSION_NAME \\\r\n    \"cl_khr_global_int32_extended_atomics\"\r\n\r\n\r\n#define CL_KHR_GLOBAL_INT32_EXTENDED_ATOMICS_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)\r\n\r\n/***************************************************************\r\n* cl_khr_int64_base_atomics\r\n***************************************************************/\r\n#define cl_khr_int64_base_atomics 1\r\n#define CL_KHR_INT64_BASE_ATOMICS_EXTENSION_NAME \\\r\n    \"cl_khr_int64_base_atomics\"\r\n\r\n\r\n#define 
CL_KHR_INT64_BASE_ATOMICS_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)\r\n\r\n/***************************************************************\r\n* cl_khr_int64_extended_atomics\r\n***************************************************************/\r\n#define cl_khr_int64_extended_atomics 1\r\n#define CL_KHR_INT64_EXTENDED_ATOMICS_EXTENSION_NAME \\\r\n    \"cl_khr_int64_extended_atomics\"\r\n\r\n\r\n#define CL_KHR_INT64_EXTENDED_ATOMICS_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)\r\n\r\n/***************************************************************\r\n* cl_khr_kernel_clock\r\n***************************************************************/\r\n#define cl_khr_kernel_clock 1\r\n#define CL_KHR_KERNEL_CLOCK_EXTENSION_NAME \\\r\n    \"cl_khr_kernel_clock\"\r\n\r\n\r\n#define CL_KHR_KERNEL_CLOCK_EXTENSION_VERSION CL_MAKE_VERSION(0, 9, 0)\r\n\r\n/* cl_device_info */\r\n#define CL_DEVICE_KERNEL_CLOCK_CAPABILITIES_KHR             0x1076\r\n\r\ntypedef cl_bitfield         cl_device_kernel_clock_capabilities_khr;\r\n\r\n/* cl_device_kernel_clock_capabilities_khr */\r\n#define CL_DEVICE_KERNEL_CLOCK_SCOPE_DEVICE_KHR             (1 << 0)\r\n#define CL_DEVICE_KERNEL_CLOCK_SCOPE_WORK_GROUP_KHR         (1 << 1)\r\n#define CL_DEVICE_KERNEL_CLOCK_SCOPE_SUB_GROUP_KHR          (1 << 2)\r\n\r\n/***************************************************************\r\n* cl_khr_local_int32_base_atomics\r\n***************************************************************/\r\n#define cl_khr_local_int32_base_atomics 1\r\n#define CL_KHR_LOCAL_INT32_BASE_ATOMICS_EXTENSION_NAME \\\r\n    \"cl_khr_local_int32_base_atomics\"\r\n\r\n\r\n#define CL_KHR_LOCAL_INT32_BASE_ATOMICS_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)\r\n\r\n/***************************************************************\r\n* cl_khr_local_int32_extended_atomics\r\n***************************************************************/\r\n#define cl_khr_local_int32_extended_atomics 1\r\n#define CL_KHR_LOCAL_INT32_EXTENDED_ATOMICS_EXTENSION_NAME 
\\\r\n    \"cl_khr_local_int32_extended_atomics\"\r\n\r\n\r\n#define CL_KHR_LOCAL_INT32_EXTENDED_ATOMICS_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)\r\n\r\n/***************************************************************\r\n* cl_khr_mipmap_image_writes\r\n***************************************************************/\r\n#define cl_khr_mipmap_image_writes 1\r\n#define CL_KHR_MIPMAP_IMAGE_WRITES_EXTENSION_NAME \\\r\n    \"cl_khr_mipmap_image_writes\"\r\n\r\n\r\n#define CL_KHR_MIPMAP_IMAGE_WRITES_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)\r\n\r\n/***************************************************************\r\n* cl_khr_select_fprounding_mode\r\n***************************************************************/\r\n#define cl_khr_select_fprounding_mode 1\r\n#define CL_KHR_SELECT_FPROUNDING_MODE_EXTENSION_NAME \\\r\n    \"cl_khr_select_fprounding_mode\"\r\n\r\n\r\n#define CL_KHR_SELECT_FPROUNDING_MODE_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)\r\n\r\n/***************************************************************\r\n* cl_khr_spirv_extended_debug_info\r\n***************************************************************/\r\n#define cl_khr_spirv_extended_debug_info 1\r\n#define CL_KHR_SPIRV_EXTENDED_DEBUG_INFO_EXTENSION_NAME \\\r\n    \"cl_khr_spirv_extended_debug_info\"\r\n\r\n\r\n#define CL_KHR_SPIRV_EXTENDED_DEBUG_INFO_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)\r\n\r\n/***************************************************************\r\n* cl_khr_spirv_linkonce_odr\r\n***************************************************************/\r\n#define cl_khr_spirv_linkonce_odr 1\r\n#define CL_KHR_SPIRV_LINKONCE_ODR_EXTENSION_NAME \\\r\n    \"cl_khr_spirv_linkonce_odr\"\r\n\r\n\r\n#define CL_KHR_SPIRV_LINKONCE_ODR_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)\r\n\r\n/***************************************************************\r\n* cl_khr_spirv_no_integer_wrap_decoration\r\n***************************************************************/\r\n#define 
cl_khr_spirv_no_integer_wrap_decoration 1\r\n#define CL_KHR_SPIRV_NO_INTEGER_WRAP_DECORATION_EXTENSION_NAME \\\r\n    \"cl_khr_spirv_no_integer_wrap_decoration\"\r\n\r\n\r\n#define CL_KHR_SPIRV_NO_INTEGER_WRAP_DECORATION_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)\r\n\r\n/***************************************************************\r\n* cl_khr_srgb_image_writes\r\n***************************************************************/\r\n#define cl_khr_srgb_image_writes 1\r\n#define CL_KHR_SRGB_IMAGE_WRITES_EXTENSION_NAME \\\r\n    \"cl_khr_srgb_image_writes\"\r\n\r\n\r\n#define CL_KHR_SRGB_IMAGE_WRITES_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)\r\n\r\n/***************************************************************\r\n* cl_khr_subgroup_ballot\r\n***************************************************************/\r\n#define cl_khr_subgroup_ballot 1\r\n#define CL_KHR_SUBGROUP_BALLOT_EXTENSION_NAME \\\r\n    \"cl_khr_subgroup_ballot\"\r\n\r\n\r\n#define CL_KHR_SUBGROUP_BALLOT_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)\r\n\r\n/***************************************************************\r\n* cl_khr_subgroup_clustered_reduce\r\n***************************************************************/\r\n#define cl_khr_subgroup_clustered_reduce 1\r\n#define CL_KHR_SUBGROUP_CLUSTERED_REDUCE_EXTENSION_NAME \\\r\n    \"cl_khr_subgroup_clustered_reduce\"\r\n\r\n\r\n#define CL_KHR_SUBGROUP_CLUSTERED_REDUCE_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)\r\n\r\n/***************************************************************\r\n* cl_khr_subgroup_extended_types\r\n***************************************************************/\r\n#define cl_khr_subgroup_extended_types 1\r\n#define CL_KHR_SUBGROUP_EXTENDED_TYPES_EXTENSION_NAME \\\r\n    \"cl_khr_subgroup_extended_types\"\r\n\r\n\r\n#define CL_KHR_SUBGROUP_EXTENDED_TYPES_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)\r\n\r\n/***************************************************************\r\n* 
cl_khr_subgroup_non_uniform_arithmetic\r\n***************************************************************/\r\n#define cl_khr_subgroup_non_uniform_arithmetic 1\r\n#define CL_KHR_SUBGROUP_NON_UNIFORM_ARITHMETIC_EXTENSION_NAME \\\r\n    \"cl_khr_subgroup_non_uniform_arithmetic\"\r\n\r\n\r\n#define CL_KHR_SUBGROUP_NON_UNIFORM_ARITHMETIC_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)\r\n\r\n/***************************************************************\r\n* cl_khr_subgroup_non_uniform_vote\r\n***************************************************************/\r\n#define cl_khr_subgroup_non_uniform_vote 1\r\n#define CL_KHR_SUBGROUP_NON_UNIFORM_VOTE_EXTENSION_NAME \\\r\n    \"cl_khr_subgroup_non_uniform_vote\"\r\n\r\n\r\n#define CL_KHR_SUBGROUP_NON_UNIFORM_VOTE_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)\r\n\r\n/***************************************************************\r\n* cl_khr_subgroup_rotate\r\n***************************************************************/\r\n#define cl_khr_subgroup_rotate 1\r\n#define CL_KHR_SUBGROUP_ROTATE_EXTENSION_NAME \\\r\n    \"cl_khr_subgroup_rotate\"\r\n\r\n\r\n#define CL_KHR_SUBGROUP_ROTATE_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)\r\n\r\n/***************************************************************\r\n* cl_khr_subgroup_shuffle\r\n***************************************************************/\r\n#define cl_khr_subgroup_shuffle 1\r\n#define CL_KHR_SUBGROUP_SHUFFLE_EXTENSION_NAME \\\r\n    \"cl_khr_subgroup_shuffle\"\r\n\r\n\r\n#define CL_KHR_SUBGROUP_SHUFFLE_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)\r\n\r\n/***************************************************************\r\n* cl_khr_subgroup_shuffle_relative\r\n***************************************************************/\r\n#define cl_khr_subgroup_shuffle_relative 1\r\n#define CL_KHR_SUBGROUP_SHUFFLE_RELATIVE_EXTENSION_NAME \\\r\n    \"cl_khr_subgroup_shuffle_relative\"\r\n\r\n\r\n#define CL_KHR_SUBGROUP_SHUFFLE_RELATIVE_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 
0)\r\n\r\n/***************************************************************\r\n* cl_khr_work_group_uniform_arithmetic\r\n***************************************************************/\r\n#define cl_khr_work_group_uniform_arithmetic 1\r\n#define CL_KHR_WORK_GROUP_UNIFORM_ARITHMETIC_EXTENSION_NAME \\\r\n    \"cl_khr_work_group_uniform_arithmetic\"\r\n\r\n\r\n#define CL_KHR_WORK_GROUP_UNIFORM_ARITHMETIC_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)\r\n\r\n/***************************************************************\r\n* cl_ext_image_unorm_int_2_101010\r\n***************************************************************/\r\n#define cl_ext_image_unorm_int_2_101010 1\r\n#define CL_EXT_IMAGE_UNORM_INT_2_101010_EXTENSION_NAME \\\r\n    \"cl_ext_image_unorm_int_2_101010\"\r\n\r\n\r\n#define CL_EXT_IMAGE_UNORM_INT_2_101010_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)\r\n\r\n/* cl_channel_type */\r\n#define CL_UNORM_INT_2_101010_EXT                           0x10E5\r\n\r\n/***************************************************************\r\n* cl_img_cancel_command\r\n***************************************************************/\r\n#define cl_img_cancel_command 1\r\n#define CL_IMG_CANCEL_COMMAND_EXTENSION_NAME \\\r\n    \"cl_img_cancel_command\"\r\n\r\n\r\n#define CL_IMG_CANCEL_COMMAND_EXTENSION_VERSION CL_MAKE_VERSION(0, 0, 0)\r\n\r\n/* Error codes */\r\n#define CL_CANCELLED_IMG                                    -1126\r\n\r\n\r\ntypedef cl_int CL_API_CALL\r\nclCancelCommandsIMG_t(\r\n    const cl_event* event_list,\r\n    size_t num_events_in_list);\r\n\r\ntypedef clCancelCommandsIMG_t *\r\nclCancelCommandsIMG_fn ;\r\n\r\n#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES)\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclCancelCommandsIMG(\r\n    const cl_event* event_list,\r\n    size_t num_events_in_list) ;\r\n\r\n#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */\r\n\r\n#ifdef __cplusplus\r\n}\r\n#endif\r\n\r\n#endif /* OPENCL_CL_EXT_H_ */\r\n"
  },
  {
    "path": "svm/OpenCL/include/CL/cl_ext_intel.h",
    "content": "/*******************************************************************************\r\n * Copyright (c) 2008-2020 The Khronos Group Inc.\r\n *\r\n * Licensed under the Apache License, Version 2.0 (the \"License\");\r\n * you may not use this file except in compliance with the License.\r\n * You may obtain a copy of the License at\r\n *\r\n *    http://www.apache.org/licenses/LICENSE-2.0\r\n *\r\n * Unless required by applicable law or agreed to in writing, software\r\n * distributed under the License is distributed on an \"AS IS\" BASIS,\r\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\r\n * See the License for the specific language governing permissions and\r\n * limitations under the License.\r\n *\r\n ******************************************************************************/\r\n\r\n#include <CL/cl_ext.h>\r\n#pragma message(\"The Intel extensions have been moved into cl_ext.h.  Please include cl_ext.h directly.\")\r\n"
  },
  {
    "path": "svm/OpenCL/include/CL/cl_function_types.h",
    "content": "/*\r\n * Copyright (c) 2023 The Khronos Group Inc.\r\n *\r\n * Licensed under the Apache License, Version 2.0 (the \"License\");\r\n * you may not use this file except in compliance with the License.\r\n * You may obtain a copy of the License at\r\n *\r\n *     http://www.apache.org/licenses/LICENSE-2.0\r\n *\r\n * Unless required by applicable law or agreed to in writing, software\r\n * distributed under the License is distributed on an \"AS IS\" BASIS,\r\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\r\n * See the License for the specific language governing permissions and\r\n * limitations under the License.\r\n *\r\n * OpenCL is a trademark of Apple Inc. used under license by Khronos.\r\n */\r\n\r\n#ifndef OPENCL_CL_FUNCTION_TYPES_H_\r\n#define OPENCL_CL_FUNCTION_TYPES_H_\r\n\r\n#include <CL/cl.h>\r\n\r\ntypedef cl_int CL_API_CALL clGetPlatformIDs_t(\r\n    cl_uint num_entries,\r\n    cl_platform_id* platforms,\r\n    cl_uint* num_platforms);\r\n\r\ntypedef clGetPlatformIDs_t *\r\nclGetPlatformIDs_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef cl_int CL_API_CALL clGetPlatformInfo_t(\r\n    cl_platform_id platform,\r\n    cl_platform_info param_name,\r\n    size_t param_value_size,\r\n    void* param_value,\r\n    size_t* param_value_size_ret);\r\n\r\ntypedef clGetPlatformInfo_t *\r\nclGetPlatformInfo_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef cl_int CL_API_CALL clGetDeviceIDs_t(\r\n    cl_platform_id platform,\r\n    cl_device_type device_type,\r\n    cl_uint num_entries,\r\n    cl_device_id* devices,\r\n    cl_uint* num_devices);\r\n\r\ntypedef clGetDeviceIDs_t *\r\nclGetDeviceIDs_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef cl_int CL_API_CALL clGetDeviceInfo_t(\r\n    cl_device_id device,\r\n    cl_device_info param_name,\r\n    size_t param_value_size,\r\n    void* param_value,\r\n    size_t* param_value_size_ret);\r\n\r\ntypedef clGetDeviceInfo_t *\r\nclGetDeviceInfo_fn 
CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef cl_context CL_API_CALL clCreateContext_t(\r\n    const cl_context_properties* properties,\r\n    cl_uint num_devices,\r\n    const cl_device_id* devices,\r\n    void (CL_CALLBACK* pfn_notify)(const char* errinfo, const void* private_info, size_t cb, void* user_data),\r\n    void* user_data,\r\n    cl_int* errcode_ret);\r\n\r\ntypedef clCreateContext_t *\r\nclCreateContext_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef cl_context CL_API_CALL clCreateContextFromType_t(\r\n    const cl_context_properties* properties,\r\n    cl_device_type device_type,\r\n    void (CL_CALLBACK* pfn_notify)(const char* errinfo, const void* private_info, size_t cb, void* user_data),\r\n    void* user_data,\r\n    cl_int* errcode_ret);\r\n\r\ntypedef clCreateContextFromType_t *\r\nclCreateContextFromType_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef cl_int CL_API_CALL clRetainContext_t(\r\n    cl_context context);\r\n\r\ntypedef clRetainContext_t *\r\nclRetainContext_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef cl_int CL_API_CALL clReleaseContext_t(\r\n    cl_context context);\r\n\r\ntypedef clReleaseContext_t *\r\nclReleaseContext_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef cl_int CL_API_CALL clGetContextInfo_t(\r\n    cl_context context,\r\n    cl_context_info param_name,\r\n    size_t param_value_size,\r\n    void* param_value,\r\n    size_t* param_value_size_ret);\r\n\r\ntypedef clGetContextInfo_t *\r\nclGetContextInfo_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef cl_int CL_API_CALL clRetainCommandQueue_t(\r\n    cl_command_queue command_queue);\r\n\r\ntypedef clRetainCommandQueue_t *\r\nclRetainCommandQueue_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef cl_int CL_API_CALL clReleaseCommandQueue_t(\r\n    cl_command_queue command_queue);\r\n\r\ntypedef clReleaseCommandQueue_t *\r\nclReleaseCommandQueue_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef cl_int CL_API_CALL clGetCommandQueueInfo_t(\r\n    cl_command_queue command_queue,\r\n    
cl_command_queue_info param_name,\r\n    size_t param_value_size,\r\n    void* param_value,\r\n    size_t* param_value_size_ret);\r\n\r\ntypedef clGetCommandQueueInfo_t *\r\nclGetCommandQueueInfo_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef cl_mem CL_API_CALL clCreateBuffer_t(\r\n    cl_context context,\r\n    cl_mem_flags flags,\r\n    size_t size,\r\n    void* host_ptr,\r\n    cl_int* errcode_ret);\r\n\r\ntypedef clCreateBuffer_t *\r\nclCreateBuffer_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef cl_int CL_API_CALL clRetainMemObject_t(\r\n    cl_mem memobj);\r\n\r\ntypedef clRetainMemObject_t *\r\nclRetainMemObject_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef cl_int CL_API_CALL clReleaseMemObject_t(\r\n    cl_mem memobj);\r\n\r\ntypedef clReleaseMemObject_t *\r\nclReleaseMemObject_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef cl_int CL_API_CALL clGetSupportedImageFormats_t(\r\n    cl_context context,\r\n    cl_mem_flags flags,\r\n    cl_mem_object_type image_type,\r\n    cl_uint num_entries,\r\n    cl_image_format* image_formats,\r\n    cl_uint* num_image_formats);\r\n\r\ntypedef clGetSupportedImageFormats_t *\r\nclGetSupportedImageFormats_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef cl_int CL_API_CALL clGetMemObjectInfo_t(\r\n    cl_mem memobj,\r\n    cl_mem_info param_name,\r\n    size_t param_value_size,\r\n    void* param_value,\r\n    size_t* param_value_size_ret);\r\n\r\ntypedef clGetMemObjectInfo_t *\r\nclGetMemObjectInfo_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef cl_int CL_API_CALL clGetImageInfo_t(\r\n    cl_mem image,\r\n    cl_image_info param_name,\r\n    size_t param_value_size,\r\n    void* param_value,\r\n    size_t* param_value_size_ret);\r\n\r\ntypedef clGetImageInfo_t *\r\nclGetImageInfo_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef cl_int CL_API_CALL clRetainSampler_t(\r\n    cl_sampler sampler);\r\n\r\ntypedef clRetainSampler_t *\r\nclRetainSampler_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef cl_int CL_API_CALL clReleaseSampler_t(\r\n    
cl_sampler sampler);\r\n\r\ntypedef clReleaseSampler_t *\r\nclReleaseSampler_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef cl_int CL_API_CALL clGetSamplerInfo_t(\r\n    cl_sampler sampler,\r\n    cl_sampler_info param_name,\r\n    size_t param_value_size,\r\n    void* param_value,\r\n    size_t* param_value_size_ret);\r\n\r\ntypedef clGetSamplerInfo_t *\r\nclGetSamplerInfo_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef cl_program CL_API_CALL clCreateProgramWithSource_t(\r\n    cl_context context,\r\n    cl_uint count,\r\n    const char** strings,\r\n    const size_t* lengths,\r\n    cl_int* errcode_ret);\r\n\r\ntypedef clCreateProgramWithSource_t *\r\nclCreateProgramWithSource_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef cl_program CL_API_CALL clCreateProgramWithBinary_t(\r\n    cl_context context,\r\n    cl_uint num_devices,\r\n    const cl_device_id* device_list,\r\n    const size_t* lengths,\r\n    const unsigned char** binaries,\r\n    cl_int* binary_status,\r\n    cl_int* errcode_ret);\r\n\r\ntypedef clCreateProgramWithBinary_t *\r\nclCreateProgramWithBinary_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef cl_int CL_API_CALL clRetainProgram_t(\r\n    cl_program program);\r\n\r\ntypedef clRetainProgram_t *\r\nclRetainProgram_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef cl_int CL_API_CALL clReleaseProgram_t(\r\n    cl_program program);\r\n\r\ntypedef clReleaseProgram_t *\r\nclReleaseProgram_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef cl_int CL_API_CALL clBuildProgram_t(\r\n    cl_program program,\r\n    cl_uint num_devices,\r\n    const cl_device_id* device_list,\r\n    const char* options,\r\n    void (CL_CALLBACK* pfn_notify)(cl_program program, void* user_data),\r\n    void* user_data);\r\n\r\ntypedef clBuildProgram_t *\r\nclBuildProgram_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef cl_int CL_API_CALL clGetProgramInfo_t(\r\n    cl_program program,\r\n    cl_program_info param_name,\r\n    size_t param_value_size,\r\n    void* param_value,\r\n    size_t* 
param_value_size_ret);\r\n\r\ntypedef clGetProgramInfo_t *\r\nclGetProgramInfo_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef cl_int CL_API_CALL clGetProgramBuildInfo_t(\r\n    cl_program program,\r\n    cl_device_id device,\r\n    cl_program_build_info param_name,\r\n    size_t param_value_size,\r\n    void* param_value,\r\n    size_t* param_value_size_ret);\r\n\r\ntypedef clGetProgramBuildInfo_t *\r\nclGetProgramBuildInfo_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef cl_kernel CL_API_CALL clCreateKernel_t(\r\n    cl_program program,\r\n    const char* kernel_name,\r\n    cl_int* errcode_ret);\r\n\r\ntypedef clCreateKernel_t *\r\nclCreateKernel_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef cl_int CL_API_CALL clCreateKernelsInProgram_t(\r\n    cl_program program,\r\n    cl_uint num_kernels,\r\n    cl_kernel* kernels,\r\n    cl_uint* num_kernels_ret);\r\n\r\ntypedef clCreateKernelsInProgram_t *\r\nclCreateKernelsInProgram_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef cl_int CL_API_CALL clRetainKernel_t(\r\n    cl_kernel kernel);\r\n\r\ntypedef clRetainKernel_t *\r\nclRetainKernel_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef cl_int CL_API_CALL clReleaseKernel_t(\r\n    cl_kernel kernel);\r\n\r\ntypedef clReleaseKernel_t *\r\nclReleaseKernel_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef cl_int CL_API_CALL clSetKernelArg_t(\r\n    cl_kernel kernel,\r\n    cl_uint arg_index,\r\n    size_t arg_size,\r\n    const void* arg_value);\r\n\r\ntypedef clSetKernelArg_t *\r\nclSetKernelArg_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef cl_int CL_API_CALL clGetKernelInfo_t(\r\n    cl_kernel kernel,\r\n    cl_kernel_info param_name,\r\n    size_t param_value_size,\r\n    void* param_value,\r\n    size_t* param_value_size_ret);\r\n\r\ntypedef clGetKernelInfo_t *\r\nclGetKernelInfo_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef cl_int CL_API_CALL clGetKernelWorkGroupInfo_t(\r\n    cl_kernel kernel,\r\n    cl_device_id device,\r\n    cl_kernel_work_group_info param_name,\r\n    size_t 
param_value_size,\r\n    void* param_value,\r\n    size_t* param_value_size_ret);\r\n\r\ntypedef clGetKernelWorkGroupInfo_t *\r\nclGetKernelWorkGroupInfo_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef cl_int CL_API_CALL clWaitForEvents_t(\r\n    cl_uint num_events,\r\n    const cl_event* event_list);\r\n\r\ntypedef clWaitForEvents_t *\r\nclWaitForEvents_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef cl_int CL_API_CALL clGetEventInfo_t(\r\n    cl_event event,\r\n    cl_event_info param_name,\r\n    size_t param_value_size,\r\n    void* param_value,\r\n    size_t* param_value_size_ret);\r\n\r\ntypedef clGetEventInfo_t *\r\nclGetEventInfo_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef cl_int CL_API_CALL clRetainEvent_t(\r\n    cl_event event);\r\n\r\ntypedef clRetainEvent_t *\r\nclRetainEvent_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef cl_int CL_API_CALL clReleaseEvent_t(\r\n    cl_event event);\r\n\r\ntypedef clReleaseEvent_t *\r\nclReleaseEvent_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef cl_int CL_API_CALL clGetEventProfilingInfo_t(\r\n    cl_event event,\r\n    cl_profiling_info param_name,\r\n    size_t param_value_size,\r\n    void* param_value,\r\n    size_t* param_value_size_ret);\r\n\r\ntypedef clGetEventProfilingInfo_t *\r\nclGetEventProfilingInfo_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef cl_int CL_API_CALL clFlush_t(\r\n    cl_command_queue command_queue);\r\n\r\ntypedef clFlush_t *\r\nclFlush_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef cl_int CL_API_CALL clFinish_t(\r\n    cl_command_queue command_queue);\r\n\r\ntypedef clFinish_t *\r\nclFinish_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef cl_int CL_API_CALL clEnqueueReadBuffer_t(\r\n    cl_command_queue command_queue,\r\n    cl_mem buffer,\r\n    cl_bool blocking_read,\r\n    size_t offset,\r\n    size_t size,\r\n    void* ptr,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event);\r\n\r\ntypedef clEnqueueReadBuffer_t *\r\nclEnqueueReadBuffer_fn 
CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef cl_int CL_API_CALL clEnqueueWriteBuffer_t(\r\n    cl_command_queue command_queue,\r\n    cl_mem buffer,\r\n    cl_bool blocking_write,\r\n    size_t offset,\r\n    size_t size,\r\n    const void* ptr,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event);\r\n\r\ntypedef clEnqueueWriteBuffer_t *\r\nclEnqueueWriteBuffer_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef cl_int CL_API_CALL clEnqueueCopyBuffer_t(\r\n    cl_command_queue command_queue,\r\n    cl_mem src_buffer,\r\n    cl_mem dst_buffer,\r\n    size_t src_offset,\r\n    size_t dst_offset,\r\n    size_t size,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event);\r\n\r\ntypedef clEnqueueCopyBuffer_t *\r\nclEnqueueCopyBuffer_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef cl_int CL_API_CALL clEnqueueReadImage_t(\r\n    cl_command_queue command_queue,\r\n    cl_mem image,\r\n    cl_bool blocking_read,\r\n    const size_t* origin,\r\n    const size_t* region,\r\n    size_t row_pitch,\r\n    size_t slice_pitch,\r\n    void* ptr,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event);\r\n\r\ntypedef clEnqueueReadImage_t *\r\nclEnqueueReadImage_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef cl_int CL_API_CALL clEnqueueWriteImage_t(\r\n    cl_command_queue command_queue,\r\n    cl_mem image,\r\n    cl_bool blocking_write,\r\n    const size_t* origin,\r\n    const size_t* region,\r\n    size_t input_row_pitch,\r\n    size_t input_slice_pitch,\r\n    const void* ptr,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event);\r\n\r\ntypedef clEnqueueWriteImage_t *\r\nclEnqueueWriteImage_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef cl_int CL_API_CALL clEnqueueCopyImage_t(\r\n    cl_command_queue command_queue,\r\n    cl_mem src_image,\r\n    cl_mem dst_image,\r\n    const size_t* 
src_origin,\r\n    const size_t* dst_origin,\r\n    const size_t* region,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event);\r\n\r\ntypedef clEnqueueCopyImage_t *\r\nclEnqueueCopyImage_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef cl_int CL_API_CALL clEnqueueCopyImageToBuffer_t(\r\n    cl_command_queue command_queue,\r\n    cl_mem src_image,\r\n    cl_mem dst_buffer,\r\n    const size_t* src_origin,\r\n    const size_t* region,\r\n    size_t dst_offset,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event);\r\n\r\ntypedef clEnqueueCopyImageToBuffer_t *\r\nclEnqueueCopyImageToBuffer_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef cl_int CL_API_CALL clEnqueueCopyBufferToImage_t(\r\n    cl_command_queue command_queue,\r\n    cl_mem src_buffer,\r\n    cl_mem dst_image,\r\n    size_t src_offset,\r\n    const size_t* dst_origin,\r\n    const size_t* region,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event);\r\n\r\ntypedef clEnqueueCopyBufferToImage_t *\r\nclEnqueueCopyBufferToImage_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef void* CL_API_CALL clEnqueueMapBuffer_t(\r\n    cl_command_queue command_queue,\r\n    cl_mem buffer,\r\n    cl_bool blocking_map,\r\n    cl_map_flags map_flags,\r\n    size_t offset,\r\n    size_t size,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event,\r\n    cl_int* errcode_ret);\r\n\r\ntypedef clEnqueueMapBuffer_t *\r\nclEnqueueMapBuffer_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef void* CL_API_CALL clEnqueueMapImage_t(\r\n    cl_command_queue command_queue,\r\n    cl_mem image,\r\n    cl_bool blocking_map,\r\n    cl_map_flags map_flags,\r\n    const size_t* origin,\r\n    const size_t* region,\r\n    size_t* image_row_pitch,\r\n    size_t* image_slice_pitch,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* 
event_wait_list,\r\n    cl_event* event,\r\n    cl_int* errcode_ret);\r\n\r\ntypedef clEnqueueMapImage_t *\r\nclEnqueueMapImage_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef cl_int CL_API_CALL clEnqueueUnmapMemObject_t(\r\n    cl_command_queue command_queue,\r\n    cl_mem memobj,\r\n    void* mapped_ptr,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event);\r\n\r\ntypedef clEnqueueUnmapMemObject_t *\r\nclEnqueueUnmapMemObject_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef cl_int CL_API_CALL clEnqueueNDRangeKernel_t(\r\n    cl_command_queue command_queue,\r\n    cl_kernel kernel,\r\n    cl_uint work_dim,\r\n    const size_t* global_work_offset,\r\n    const size_t* global_work_size,\r\n    const size_t* local_work_size,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event);\r\n\r\ntypedef clEnqueueNDRangeKernel_t *\r\nclEnqueueNDRangeKernel_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef cl_int CL_API_CALL clEnqueueNativeKernel_t(\r\n    cl_command_queue command_queue,\r\n    void (CL_CALLBACK* user_func)(void*),\r\n    void* args,\r\n    size_t cb_args,\r\n    cl_uint num_mem_objects,\r\n    const cl_mem* mem_list,\r\n    const void** args_mem_loc,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event);\r\n\r\ntypedef clEnqueueNativeKernel_t *\r\nclEnqueueNativeKernel_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef cl_int CL_API_CALL clSetCommandQueueProperty_t(\r\n    cl_command_queue command_queue,\r\n    cl_command_queue_properties properties,\r\n    cl_bool enable,\r\n    cl_command_queue_properties* old_properties);\r\n\r\ntypedef clSetCommandQueueProperty_t *\r\nclSetCommandQueueProperty_fn CL_API_SUFFIX__VERSION_1_0_DEPRECATED;\r\n\r\ntypedef cl_mem CL_API_CALL clCreateImage2D_t(\r\n    cl_context context,\r\n    cl_mem_flags flags,\r\n    const cl_image_format* image_format,\r\n    size_t image_width,\r\n    size_t 
image_height,\r\n    size_t image_row_pitch,\r\n    void* host_ptr,\r\n    cl_int* errcode_ret);\r\n\r\ntypedef clCreateImage2D_t *\r\nclCreateImage2D_fn CL_API_SUFFIX__VERSION_1_1_DEPRECATED;\r\n\r\ntypedef cl_mem CL_API_CALL clCreateImage3D_t(\r\n    cl_context context,\r\n    cl_mem_flags flags,\r\n    const cl_image_format* image_format,\r\n    size_t image_width,\r\n    size_t image_height,\r\n    size_t image_depth,\r\n    size_t image_row_pitch,\r\n    size_t image_slice_pitch,\r\n    void* host_ptr,\r\n    cl_int* errcode_ret);\r\n\r\ntypedef clCreateImage3D_t *\r\nclCreateImage3D_fn CL_API_SUFFIX__VERSION_1_1_DEPRECATED;\r\n\r\ntypedef cl_int CL_API_CALL clEnqueueMarker_t(\r\n    cl_command_queue command_queue,\r\n    cl_event* event);\r\n\r\ntypedef clEnqueueMarker_t *\r\nclEnqueueMarker_fn CL_API_SUFFIX__VERSION_1_1_DEPRECATED;\r\n\r\ntypedef cl_int CL_API_CALL clEnqueueWaitForEvents_t(\r\n    cl_command_queue command_queue,\r\n    cl_uint num_events,\r\n    const cl_event* event_list);\r\n\r\ntypedef clEnqueueWaitForEvents_t *\r\nclEnqueueWaitForEvents_fn CL_API_SUFFIX__VERSION_1_1_DEPRECATED;\r\n\r\ntypedef cl_int CL_API_CALL clEnqueueBarrier_t(\r\n    cl_command_queue command_queue);\r\n\r\ntypedef clEnqueueBarrier_t *\r\nclEnqueueBarrier_fn CL_API_SUFFIX__VERSION_1_1_DEPRECATED;\r\n\r\ntypedef cl_int CL_API_CALL clUnloadCompiler_t(\r\n    void );\r\n\r\ntypedef clUnloadCompiler_t *\r\nclUnloadCompiler_fn CL_API_SUFFIX__VERSION_1_1_DEPRECATED;\r\n\r\ntypedef void* CL_API_CALL clGetExtensionFunctionAddress_t(\r\n    const char* func_name);\r\n\r\ntypedef clGetExtensionFunctionAddress_t *\r\nclGetExtensionFunctionAddress_fn CL_API_SUFFIX__VERSION_1_1_DEPRECATED;\r\n\r\ntypedef cl_command_queue CL_API_CALL clCreateCommandQueue_t(\r\n    cl_context context,\r\n    cl_device_id device,\r\n    cl_command_queue_properties properties,\r\n    cl_int* errcode_ret);\r\n\r\ntypedef clCreateCommandQueue_t *\r\nclCreateCommandQueue_fn 
CL_API_SUFFIX__VERSION_1_2_DEPRECATED;\r\n\r\ntypedef cl_sampler CL_API_CALL clCreateSampler_t(\r\n    cl_context context,\r\n    cl_bool normalized_coords,\r\n    cl_addressing_mode addressing_mode,\r\n    cl_filter_mode filter_mode,\r\n    cl_int* errcode_ret);\r\n\r\ntypedef clCreateSampler_t *\r\nclCreateSampler_fn CL_API_SUFFIX__VERSION_1_2_DEPRECATED;\r\n\r\ntypedef cl_int CL_API_CALL clEnqueueTask_t(\r\n    cl_command_queue command_queue,\r\n    cl_kernel kernel,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event);\r\n\r\ntypedef clEnqueueTask_t *\r\nclEnqueueTask_fn CL_API_SUFFIX__VERSION_1_2_DEPRECATED;\r\n\r\n#ifdef CL_VERSION_1_1\r\n\r\ntypedef cl_mem CL_API_CALL clCreateSubBuffer_t(\r\n    cl_mem buffer,\r\n    cl_mem_flags flags,\r\n    cl_buffer_create_type buffer_create_type,\r\n    const void* buffer_create_info,\r\n    cl_int* errcode_ret);\r\n\r\ntypedef clCreateSubBuffer_t *\r\nclCreateSubBuffer_fn CL_API_SUFFIX__VERSION_1_1;\r\n\r\ntypedef cl_int CL_API_CALL clSetMemObjectDestructorCallback_t(\r\n    cl_mem memobj,\r\n    void (CL_CALLBACK* pfn_notify)(cl_mem memobj, void* user_data),\r\n    void* user_data);\r\n\r\ntypedef clSetMemObjectDestructorCallback_t *\r\nclSetMemObjectDestructorCallback_fn CL_API_SUFFIX__VERSION_1_1;\r\n\r\ntypedef cl_event CL_API_CALL clCreateUserEvent_t(\r\n    cl_context context,\r\n    cl_int* errcode_ret);\r\n\r\ntypedef clCreateUserEvent_t *\r\nclCreateUserEvent_fn CL_API_SUFFIX__VERSION_1_1;\r\n\r\ntypedef cl_int CL_API_CALL clSetUserEventStatus_t(\r\n    cl_event event,\r\n    cl_int execution_status);\r\n\r\ntypedef clSetUserEventStatus_t *\r\nclSetUserEventStatus_fn CL_API_SUFFIX__VERSION_1_1;\r\n\r\ntypedef cl_int CL_API_CALL clSetEventCallback_t(\r\n    cl_event event,\r\n    cl_int command_exec_callback_type,\r\n    void (CL_CALLBACK* pfn_notify)(cl_event event, cl_int event_command_status, void *user_data),\r\n    void* user_data);\r\n\r\ntypedef 
clSetEventCallback_t *\r\nclSetEventCallback_fn CL_API_SUFFIX__VERSION_1_1;\r\n\r\ntypedef cl_int CL_API_CALL clEnqueueReadBufferRect_t(\r\n    cl_command_queue command_queue,\r\n    cl_mem buffer,\r\n    cl_bool blocking_read,\r\n    const size_t* buffer_origin,\r\n    const size_t* host_origin,\r\n    const size_t* region,\r\n    size_t buffer_row_pitch,\r\n    size_t buffer_slice_pitch,\r\n    size_t host_row_pitch,\r\n    size_t host_slice_pitch,\r\n    void* ptr,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event);\r\n\r\ntypedef clEnqueueReadBufferRect_t *\r\nclEnqueueReadBufferRect_fn CL_API_SUFFIX__VERSION_1_1;\r\n\r\ntypedef cl_int CL_API_CALL clEnqueueWriteBufferRect_t(\r\n    cl_command_queue command_queue,\r\n    cl_mem buffer,\r\n    cl_bool blocking_write,\r\n    const size_t* buffer_origin,\r\n    const size_t* host_origin,\r\n    const size_t* region,\r\n    size_t buffer_row_pitch,\r\n    size_t buffer_slice_pitch,\r\n    size_t host_row_pitch,\r\n    size_t host_slice_pitch,\r\n    const void* ptr,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event);\r\n\r\ntypedef clEnqueueWriteBufferRect_t *\r\nclEnqueueWriteBufferRect_fn CL_API_SUFFIX__VERSION_1_1;\r\n\r\ntypedef cl_int CL_API_CALL clEnqueueCopyBufferRect_t(\r\n    cl_command_queue command_queue,\r\n    cl_mem src_buffer,\r\n    cl_mem dst_buffer,\r\n    const size_t* src_origin,\r\n    const size_t* dst_origin,\r\n    const size_t* region,\r\n    size_t src_row_pitch,\r\n    size_t src_slice_pitch,\r\n    size_t dst_row_pitch,\r\n    size_t dst_slice_pitch,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event);\r\n\r\ntypedef clEnqueueCopyBufferRect_t *\r\nclEnqueueCopyBufferRect_fn CL_API_SUFFIX__VERSION_1_1;\r\n\r\n#endif /* CL_VERSION_1_1 */\r\n\r\n#ifdef CL_VERSION_1_2\r\n\r\ntypedef cl_int CL_API_CALL clCreateSubDevices_t(\r\n    
cl_device_id in_device,\r\n    const cl_device_partition_property* properties,\r\n    cl_uint num_devices,\r\n    cl_device_id* out_devices,\r\n    cl_uint* num_devices_ret);\r\n\r\ntypedef clCreateSubDevices_t *\r\nclCreateSubDevices_fn CL_API_SUFFIX__VERSION_1_2;\r\n\r\ntypedef cl_int CL_API_CALL clRetainDevice_t(\r\n    cl_device_id device);\r\n\r\ntypedef clRetainDevice_t *\r\nclRetainDevice_fn CL_API_SUFFIX__VERSION_1_2;\r\n\r\ntypedef cl_int CL_API_CALL clReleaseDevice_t(\r\n    cl_device_id device);\r\n\r\ntypedef clReleaseDevice_t *\r\nclReleaseDevice_fn CL_API_SUFFIX__VERSION_1_2;\r\n\r\ntypedef cl_mem CL_API_CALL clCreateImage_t(\r\n    cl_context context,\r\n    cl_mem_flags flags,\r\n    const cl_image_format* image_format,\r\n    const cl_image_desc* image_desc,\r\n    void* host_ptr,\r\n    cl_int* errcode_ret);\r\n\r\ntypedef clCreateImage_t *\r\nclCreateImage_fn CL_API_SUFFIX__VERSION_1_2;\r\n\r\ntypedef cl_program CL_API_CALL clCreateProgramWithBuiltInKernels_t(\r\n    cl_context context,\r\n    cl_uint num_devices,\r\n    const cl_device_id* device_list,\r\n    const char* kernel_names,\r\n    cl_int* errcode_ret);\r\n\r\ntypedef clCreateProgramWithBuiltInKernels_t *\r\nclCreateProgramWithBuiltInKernels_fn CL_API_SUFFIX__VERSION_1_2;\r\n\r\ntypedef cl_int CL_API_CALL clCompileProgram_t(\r\n    cl_program program,\r\n    cl_uint num_devices,\r\n    const cl_device_id* device_list,\r\n    const char* options,\r\n    cl_uint num_input_headers,\r\n    const cl_program* input_headers,\r\n    const char** header_include_names,\r\n    void (CL_CALLBACK* pfn_notify)(cl_program program, void* user_data),\r\n    void* user_data);\r\n\r\ntypedef clCompileProgram_t *\r\nclCompileProgram_fn CL_API_SUFFIX__VERSION_1_2;\r\n\r\ntypedef cl_program CL_API_CALL clLinkProgram_t(\r\n    cl_context context,\r\n    cl_uint num_devices,\r\n    const cl_device_id* device_list,\r\n    const char* options,\r\n    cl_uint num_input_programs,\r\n    const cl_program* 
input_programs,\r\n    void (CL_CALLBACK* pfn_notify)(cl_program program, void* user_data),\r\n    void* user_data,\r\n    cl_int* errcode_ret);\r\n\r\ntypedef clLinkProgram_t *\r\nclLinkProgram_fn CL_API_SUFFIX__VERSION_1_2;\r\n\r\ntypedef cl_int CL_API_CALL clUnloadPlatformCompiler_t(\r\n    cl_platform_id platform);\r\n\r\ntypedef clUnloadPlatformCompiler_t *\r\nclUnloadPlatformCompiler_fn CL_API_SUFFIX__VERSION_1_2;\r\n\r\ntypedef cl_int CL_API_CALL clGetKernelArgInfo_t(\r\n    cl_kernel kernel,\r\n    cl_uint arg_index,\r\n    cl_kernel_arg_info param_name,\r\n    size_t param_value_size,\r\n    void* param_value,\r\n    size_t* param_value_size_ret);\r\n\r\ntypedef clGetKernelArgInfo_t *\r\nclGetKernelArgInfo_fn CL_API_SUFFIX__VERSION_1_2;\r\n\r\ntypedef cl_int CL_API_CALL clEnqueueFillBuffer_t(\r\n    cl_command_queue command_queue,\r\n    cl_mem buffer,\r\n    const void* pattern,\r\n    size_t pattern_size,\r\n    size_t offset,\r\n    size_t size,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event);\r\n\r\ntypedef clEnqueueFillBuffer_t *\r\nclEnqueueFillBuffer_fn CL_API_SUFFIX__VERSION_1_2;\r\n\r\ntypedef cl_int CL_API_CALL clEnqueueFillImage_t(\r\n    cl_command_queue command_queue,\r\n    cl_mem image,\r\n    const void* fill_color,\r\n    const size_t* origin,\r\n    const size_t* region,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event);\r\n\r\ntypedef clEnqueueFillImage_t *\r\nclEnqueueFillImage_fn CL_API_SUFFIX__VERSION_1_2;\r\n\r\ntypedef cl_int CL_API_CALL clEnqueueMigrateMemObjects_t(\r\n    cl_command_queue command_queue,\r\n    cl_uint num_mem_objects,\r\n    const cl_mem* mem_objects,\r\n    cl_mem_migration_flags flags,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event);\r\n\r\ntypedef clEnqueueMigrateMemObjects_t *\r\nclEnqueueMigrateMemObjects_fn 
CL_API_SUFFIX__VERSION_1_2;\r\n\r\ntypedef cl_int CL_API_CALL clEnqueueMarkerWithWaitList_t(\r\n    cl_command_queue command_queue,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event);\r\n\r\ntypedef clEnqueueMarkerWithWaitList_t *\r\nclEnqueueMarkerWithWaitList_fn CL_API_SUFFIX__VERSION_1_2;\r\n\r\ntypedef cl_int CL_API_CALL clEnqueueBarrierWithWaitList_t(\r\n    cl_command_queue command_queue,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event);\r\n\r\ntypedef clEnqueueBarrierWithWaitList_t *\r\nclEnqueueBarrierWithWaitList_fn CL_API_SUFFIX__VERSION_1_2;\r\n\r\ntypedef void* CL_API_CALL clGetExtensionFunctionAddressForPlatform_t(\r\n    cl_platform_id platform,\r\n    const char* func_name);\r\n\r\ntypedef clGetExtensionFunctionAddressForPlatform_t *\r\nclGetExtensionFunctionAddressForPlatform_fn CL_API_SUFFIX__VERSION_1_2;\r\n\r\n#endif /* CL_VERSION_1_2 */\r\n\r\n#ifdef CL_VERSION_2_0\r\n\r\ntypedef cl_command_queue CL_API_CALL clCreateCommandQueueWithProperties_t(\r\n    cl_context context,\r\n    cl_device_id device,\r\n    const cl_queue_properties* properties,\r\n    cl_int* errcode_ret);\r\n\r\ntypedef clCreateCommandQueueWithProperties_t *\r\nclCreateCommandQueueWithProperties_fn CL_API_SUFFIX__VERSION_2_0;\r\n\r\ntypedef cl_mem CL_API_CALL clCreatePipe_t(\r\n    cl_context context,\r\n    cl_mem_flags flags,\r\n    cl_uint pipe_packet_size,\r\n    cl_uint pipe_max_packets,\r\n    const cl_pipe_properties* properties,\r\n    cl_int* errcode_ret);\r\n\r\ntypedef clCreatePipe_t *\r\nclCreatePipe_fn CL_API_SUFFIX__VERSION_2_0;\r\n\r\ntypedef cl_int CL_API_CALL clGetPipeInfo_t(\r\n    cl_mem pipe,\r\n    cl_pipe_info param_name,\r\n    size_t param_value_size,\r\n    void* param_value,\r\n    size_t* param_value_size_ret);\r\n\r\ntypedef clGetPipeInfo_t *\r\nclGetPipeInfo_fn CL_API_SUFFIX__VERSION_2_0;\r\n\r\ntypedef void* CL_API_CALL clSVMAlloc_t(\r\n    
cl_context context,\r\n    cl_svm_mem_flags flags,\r\n    size_t size,\r\n    cl_uint alignment);\r\n\r\ntypedef clSVMAlloc_t *\r\nclSVMAlloc_fn CL_API_SUFFIX__VERSION_2_0;\r\n\r\ntypedef void CL_API_CALL clSVMFree_t(\r\n    cl_context context,\r\n    void* svm_pointer);\r\n\r\ntypedef clSVMFree_t *\r\nclSVMFree_fn CL_API_SUFFIX__VERSION_2_0;\r\n\r\ntypedef cl_sampler CL_API_CALL clCreateSamplerWithProperties_t(\r\n    cl_context context,\r\n    const cl_sampler_properties* sampler_properties,\r\n    cl_int* errcode_ret);\r\n\r\ntypedef clCreateSamplerWithProperties_t *\r\nclCreateSamplerWithProperties_fn CL_API_SUFFIX__VERSION_2_0;\r\n\r\ntypedef cl_int CL_API_CALL clSetKernelArgSVMPointer_t(\r\n    cl_kernel kernel,\r\n    cl_uint arg_index,\r\n    const void* arg_value);\r\n\r\ntypedef clSetKernelArgSVMPointer_t *\r\nclSetKernelArgSVMPointer_fn CL_API_SUFFIX__VERSION_2_0;\r\n\r\ntypedef cl_int CL_API_CALL clSetKernelExecInfo_t(\r\n    cl_kernel kernel,\r\n    cl_kernel_exec_info param_name,\r\n    size_t param_value_size,\r\n    const void* param_value);\r\n\r\ntypedef clSetKernelExecInfo_t *\r\nclSetKernelExecInfo_fn CL_API_SUFFIX__VERSION_2_0;\r\n\r\ntypedef cl_int CL_API_CALL clEnqueueSVMFree_t(\r\n    cl_command_queue command_queue,\r\n    cl_uint num_svm_pointers,\r\n    void* svm_pointers[],\r\n    void (CL_CALLBACK* pfn_free_func)(cl_command_queue queue, cl_uint num_svm_pointers, void* svm_pointers[], void* user_data),\r\n    void* user_data,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event);\r\n\r\ntypedef clEnqueueSVMFree_t *\r\nclEnqueueSVMFree_fn CL_API_SUFFIX__VERSION_2_0;\r\n\r\ntypedef cl_int CL_API_CALL clEnqueueSVMMemcpy_t(\r\n    cl_command_queue command_queue,\r\n    cl_bool blocking_copy,\r\n    void* dst_ptr,\r\n    const void* src_ptr,\r\n    size_t size,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event);\r\n\r\ntypedef 
clEnqueueSVMMemcpy_t *\r\nclEnqueueSVMMemcpy_fn CL_API_SUFFIX__VERSION_2_0;\r\n\r\ntypedef cl_int CL_API_CALL clEnqueueSVMMemFill_t(\r\n    cl_command_queue command_queue,\r\n    void* svm_ptr,\r\n    const void* pattern,\r\n    size_t pattern_size,\r\n    size_t size,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event);\r\n\r\ntypedef clEnqueueSVMMemFill_t *\r\nclEnqueueSVMMemFill_fn CL_API_SUFFIX__VERSION_2_0;\r\n\r\ntypedef cl_int CL_API_CALL clEnqueueSVMMap_t(\r\n    cl_command_queue command_queue,\r\n    cl_bool blocking_map,\r\n    cl_map_flags flags,\r\n    void* svm_ptr,\r\n    size_t size,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event);\r\n\r\ntypedef clEnqueueSVMMap_t *\r\nclEnqueueSVMMap_fn CL_API_SUFFIX__VERSION_2_0;\r\n\r\ntypedef cl_int CL_API_CALL clEnqueueSVMUnmap_t(\r\n    cl_command_queue command_queue,\r\n    void* svm_ptr,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event);\r\n\r\ntypedef clEnqueueSVMUnmap_t *\r\nclEnqueueSVMUnmap_fn CL_API_SUFFIX__VERSION_2_0;\r\n\r\n#endif /* CL_VERSION_2_0 */\r\n\r\n#ifdef CL_VERSION_2_1\r\n\r\ntypedef cl_int CL_API_CALL clSetDefaultDeviceCommandQueue_t(\r\n    cl_context context,\r\n    cl_device_id device,\r\n    cl_command_queue command_queue);\r\n\r\ntypedef clSetDefaultDeviceCommandQueue_t *\r\nclSetDefaultDeviceCommandQueue_fn CL_API_SUFFIX__VERSION_2_1;\r\n\r\ntypedef cl_int CL_API_CALL clGetDeviceAndHostTimer_t(\r\n    cl_device_id device,\r\n    cl_ulong* device_timestamp,\r\n    cl_ulong* host_timestamp);\r\n\r\ntypedef clGetDeviceAndHostTimer_t *\r\nclGetDeviceAndHostTimer_fn CL_API_SUFFIX__VERSION_2_1;\r\n\r\ntypedef cl_int CL_API_CALL clGetHostTimer_t(\r\n    cl_device_id device,\r\n    cl_ulong* host_timestamp);\r\n\r\ntypedef clGetHostTimer_t *\r\nclGetHostTimer_fn CL_API_SUFFIX__VERSION_2_1;\r\n\r\ntypedef cl_program CL_API_CALL 
clCreateProgramWithIL_t(\r\n    cl_context context,\r\n    const void* il,\r\n    size_t length,\r\n    cl_int* errcode_ret);\r\n\r\ntypedef clCreateProgramWithIL_t *\r\nclCreateProgramWithIL_fn CL_API_SUFFIX__VERSION_2_1;\r\n\r\ntypedef cl_kernel CL_API_CALL clCloneKernel_t(\r\n    cl_kernel source_kernel,\r\n    cl_int* errcode_ret);\r\n\r\ntypedef clCloneKernel_t *\r\nclCloneKernel_fn CL_API_SUFFIX__VERSION_2_1;\r\n\r\ntypedef cl_int CL_API_CALL clGetKernelSubGroupInfo_t(\r\n    cl_kernel kernel,\r\n    cl_device_id device,\r\n    cl_kernel_sub_group_info param_name,\r\n    size_t input_value_size,\r\n    const void* input_value,\r\n    size_t param_value_size,\r\n    void* param_value,\r\n    size_t* param_value_size_ret);\r\n\r\ntypedef clGetKernelSubGroupInfo_t *\r\nclGetKernelSubGroupInfo_fn CL_API_SUFFIX__VERSION_2_1;\r\n\r\ntypedef cl_int CL_API_CALL clEnqueueSVMMigrateMem_t(\r\n    cl_command_queue command_queue,\r\n    cl_uint num_svm_pointers,\r\n    const void** svm_pointers,\r\n    const size_t* sizes,\r\n    cl_mem_migration_flags flags,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event);\r\n\r\ntypedef clEnqueueSVMMigrateMem_t *\r\nclEnqueueSVMMigrateMem_fn CL_API_SUFFIX__VERSION_2_1;\r\n\r\n#endif /* CL_VERSION_2_1 */\r\n\r\n#ifdef CL_VERSION_2_2\r\n\r\ntypedef cl_int CL_API_CALL clSetProgramSpecializationConstant_t(\r\n    cl_program program,\r\n    cl_uint spec_id,\r\n    size_t spec_size,\r\n    const void* spec_value);\r\n\r\ntypedef clSetProgramSpecializationConstant_t *\r\nclSetProgramSpecializationConstant_fn CL_API_SUFFIX__VERSION_2_2;\r\n\r\ntypedef cl_int CL_API_CALL clSetProgramReleaseCallback_t(\r\n    cl_program program,\r\n    void (CL_CALLBACK* pfn_notify)(cl_program program, void* user_data),\r\n    void* user_data);\r\n\r\ntypedef clSetProgramReleaseCallback_t *\r\nclSetProgramReleaseCallback_fn CL_API_SUFFIX__VERSION_2_2_DEPRECATED;\r\n\r\n#endif /* CL_VERSION_2_2 
*/\r\n\r\n#ifdef CL_VERSION_3_0\r\n\r\ntypedef cl_int CL_API_CALL clSetContextDestructorCallback_t(\r\n    cl_context context,\r\n    void (CL_CALLBACK* pfn_notify)(cl_context context, void* user_data),\r\n    void* user_data);\r\n\r\ntypedef clSetContextDestructorCallback_t *\r\nclSetContextDestructorCallback_fn CL_API_SUFFIX__VERSION_3_0;\r\n\r\ntypedef cl_mem CL_API_CALL clCreateBufferWithProperties_t(\r\n    cl_context context,\r\n    const cl_mem_properties* properties,\r\n    cl_mem_flags flags,\r\n    size_t size,\r\n    void* host_ptr,\r\n    cl_int* errcode_ret);\r\n\r\ntypedef clCreateBufferWithProperties_t *\r\nclCreateBufferWithProperties_fn CL_API_SUFFIX__VERSION_3_0;\r\n\r\ntypedef cl_mem CL_API_CALL clCreateImageWithProperties_t(\r\n    cl_context context,\r\n    const cl_mem_properties* properties,\r\n    cl_mem_flags flags,\r\n    const cl_image_format* image_format,\r\n    const cl_image_desc* image_desc,\r\n    void* host_ptr,\r\n    cl_int* errcode_ret);\r\n\r\ntypedef clCreateImageWithProperties_t *\r\nclCreateImageWithProperties_fn CL_API_SUFFIX__VERSION_3_0;\r\n\r\n#endif /* CL_VERSION_3_0 */\r\n\r\n#endif /* OPENCL_CL_FUNCTION_TYPES_H_ */\r\n"
  },
  {
    "path": "svm/OpenCL/include/CL/cl_gl.h",
    "content": "/*******************************************************************************\r\n * Copyright (c) 2008-2023 The Khronos Group Inc.\r\n *\r\n * Licensed under the Apache License, Version 2.0 (the \"License\");\r\n * you may not use this file except in compliance with the License.\r\n * You may obtain a copy of the License at\r\n *\r\n *    http://www.apache.org/licenses/LICENSE-2.0\r\n *\r\n * Unless required by applicable law or agreed to in writing, software\r\n * distributed under the License is distributed on an \"AS IS\" BASIS,\r\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\r\n * See the License for the specific language governing permissions and\r\n * limitations under the License.\r\n ******************************************************************************/\r\n\r\n#ifndef OPENCL_CL_GL_H_\r\n#define OPENCL_CL_GL_H_\r\n\r\n/*\r\n** This header is generated from the Khronos OpenCL XML API Registry.\r\n*/\r\n\r\n#include <CL/cl.h>\r\n\r\n/* CL_NO_PROTOTYPES implies CL_NO_EXTENSION_PROTOTYPES: */\r\n#if defined(CL_NO_PROTOTYPES) && !defined(CL_NO_EXTENSION_PROTOTYPES)\r\n#define CL_NO_EXTENSION_PROTOTYPES\r\n#endif\r\n\r\n/* CL_NO_EXTENSION_PROTOTYPES implies\r\n   CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES and\r\n   CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES: */\r\n#if defined(CL_NO_EXTENSION_PROTOTYPES) && \\\r\n    !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES)\r\n#define CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES\r\n#endif\r\n#if defined(CL_NO_EXTENSION_PROTOTYPES) && \\\r\n    !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES)\r\n#define CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES\r\n#endif\r\n\r\n#ifdef __cplusplus\r\nextern \"C\" {\r\n#endif\r\n\r\n/***************************************************************\r\n* cl_khr_gl_sharing\r\n***************************************************************/\r\n#define cl_khr_gl_sharing 1\r\n#define CL_KHR_GL_SHARING_EXTENSION_NAME \\\r\n    
\"cl_khr_gl_sharing\"\r\n\r\n\r\n#define CL_KHR_GL_SHARING_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)\r\n\r\ntypedef int                 cl_GLint;\r\ntypedef unsigned int        cl_GLenum;\r\ntypedef unsigned int        cl_GLuint;\r\n\r\ntypedef cl_uint             cl_gl_context_info;\r\n\r\n/* Error codes */\r\n#define CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR              -1000\r\n\r\n/* cl_gl_context_info */\r\n#define CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR                0x2006\r\n#define CL_DEVICES_FOR_GL_CONTEXT_KHR                       0x2007\r\n\r\n/* Additional cl_context_properties */\r\n#define CL_GL_CONTEXT_KHR                                   0x2008\r\n#define CL_EGL_DISPLAY_KHR                                  0x2009\r\n#define CL_GLX_DISPLAY_KHR                                  0x200A\r\n#define CL_WGL_HDC_KHR                                      0x200B\r\n#define CL_CGL_SHAREGROUP_KHR                               0x200C\r\n\r\ntypedef cl_uint             cl_gl_object_type;\r\ntypedef cl_uint             cl_gl_texture_info;\r\ntypedef cl_uint             cl_gl_platform_info;\r\n\r\n/* cl_gl_object_type */\r\n#define CL_GL_OBJECT_BUFFER                                 0x2000\r\n#define CL_GL_OBJECT_TEXTURE2D                              0x2001\r\n#define CL_GL_OBJECT_TEXTURE3D                              0x2002\r\n#define CL_GL_OBJECT_RENDERBUFFER                           0x2003\r\n\r\n#if defined(CL_VERSION_1_2)\r\n/* cl_gl_object_type */\r\n#define CL_GL_OBJECT_TEXTURE2D_ARRAY                        0x200E\r\n#define CL_GL_OBJECT_TEXTURE1D                              0x200F\r\n#define CL_GL_OBJECT_TEXTURE1D_ARRAY                        0x2010\r\n#define CL_GL_OBJECT_TEXTURE_BUFFER                         0x2011\r\n\r\n#endif /* defined(CL_VERSION_1_2) */\r\n\r\n/* cl_gl_texture_info */\r\n#define CL_GL_TEXTURE_TARGET                                0x2004\r\n#define CL_GL_MIPMAP_LEVEL                                  0x2005\r\n\r\n\r\ntypedef cl_int 
CL_API_CALL\r\nclGetGLContextInfoKHR_t(\r\n    const cl_context_properties* properties,\r\n    cl_gl_context_info param_name,\r\n    size_t param_value_size,\r\n    void* param_value,\r\n    size_t* param_value_size_ret);\r\n\r\ntypedef clGetGLContextInfoKHR_t *\r\nclGetGLContextInfoKHR_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef cl_mem CL_API_CALL\r\nclCreateFromGLBuffer_t(\r\n    cl_context context,\r\n    cl_mem_flags flags,\r\n    cl_GLuint bufobj,\r\n    cl_int* errcode_ret);\r\n\r\ntypedef clCreateFromGLBuffer_t *\r\nclCreateFromGLBuffer_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\n#if !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES)\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclGetGLContextInfoKHR(\r\n    const cl_context_properties* properties,\r\n    cl_gl_context_info param_name,\r\n    size_t param_value_size,\r\n    void* param_value,\r\n    size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;\r\n\r\nextern CL_API_ENTRY cl_mem CL_API_CALL\r\nclCreateFromGLBuffer(\r\n    cl_context context,\r\n    cl_mem_flags flags,\r\n    cl_GLuint bufobj,\r\n    cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_0;\r\n\r\n#endif /* !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES) */\r\n\r\n#if defined(CL_VERSION_1_2)\r\n\r\ntypedef cl_mem CL_API_CALL\r\nclCreateFromGLTexture_t(\r\n    cl_context context,\r\n    cl_mem_flags flags,\r\n    cl_GLenum target,\r\n    cl_GLint miplevel,\r\n    cl_GLuint texture,\r\n    cl_int* errcode_ret);\r\n\r\ntypedef clCreateFromGLTexture_t *\r\nclCreateFromGLTexture_fn CL_API_SUFFIX__VERSION_1_2;\r\n\r\n#if !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES)\r\n\r\nextern CL_API_ENTRY cl_mem CL_API_CALL\r\nclCreateFromGLTexture(\r\n    cl_context context,\r\n    cl_mem_flags flags,\r\n    cl_GLenum target,\r\n    cl_GLint miplevel,\r\n    cl_GLuint texture,\r\n    cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2;\r\n\r\n#endif /* !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES) */\r\n\r\n#endif /* defined(CL_VERSION_1_2) 
*/\r\n\r\n\r\ntypedef cl_mem CL_API_CALL\r\nclCreateFromGLRenderbuffer_t(\r\n    cl_context context,\r\n    cl_mem_flags flags,\r\n    cl_GLuint renderbuffer,\r\n    cl_int* errcode_ret);\r\n\r\ntypedef clCreateFromGLRenderbuffer_t *\r\nclCreateFromGLRenderbuffer_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef cl_int CL_API_CALL\r\nclGetGLObjectInfo_t(\r\n    cl_mem memobj,\r\n    cl_gl_object_type* gl_object_type,\r\n    cl_GLuint* gl_object_name);\r\n\r\ntypedef clGetGLObjectInfo_t *\r\nclGetGLObjectInfo_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef cl_int CL_API_CALL\r\nclGetGLTextureInfo_t(\r\n    cl_mem memobj,\r\n    cl_gl_texture_info param_name,\r\n    size_t param_value_size,\r\n    void* param_value,\r\n    size_t* param_value_size_ret);\r\n\r\ntypedef clGetGLTextureInfo_t *\r\nclGetGLTextureInfo_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef cl_int CL_API_CALL\r\nclEnqueueAcquireGLObjects_t(\r\n    cl_command_queue command_queue,\r\n    cl_uint num_objects,\r\n    const cl_mem* mem_objects,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event);\r\n\r\ntypedef clEnqueueAcquireGLObjects_t *\r\nclEnqueueAcquireGLObjects_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\ntypedef cl_int CL_API_CALL\r\nclEnqueueReleaseGLObjects_t(\r\n    cl_command_queue command_queue,\r\n    cl_uint num_objects,\r\n    const cl_mem* mem_objects,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event);\r\n\r\ntypedef clEnqueueReleaseGLObjects_t *\r\nclEnqueueReleaseGLObjects_fn CL_API_SUFFIX__VERSION_1_0;\r\n\r\n#if !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES)\r\n\r\nextern CL_API_ENTRY cl_mem CL_API_CALL\r\nclCreateFromGLRenderbuffer(\r\n    cl_context context,\r\n    cl_mem_flags flags,\r\n    cl_GLuint renderbuffer,\r\n    cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_0;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclGetGLObjectInfo(\r\n    cl_mem memobj,\r\n    cl_gl_object_type* 
gl_object_type,\r\n    cl_GLuint* gl_object_name) CL_API_SUFFIX__VERSION_1_0;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclGetGLTextureInfo(\r\n    cl_mem memobj,\r\n    cl_gl_texture_info param_name,\r\n    size_t param_value_size,\r\n    void* param_value,\r\n    size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclEnqueueAcquireGLObjects(\r\n    cl_command_queue command_queue,\r\n    cl_uint num_objects,\r\n    const cl_mem* mem_objects,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event) CL_API_SUFFIX__VERSION_1_0;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclEnqueueReleaseGLObjects(\r\n    cl_command_queue command_queue,\r\n    cl_uint num_objects,\r\n    const cl_mem* mem_objects,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event) CL_API_SUFFIX__VERSION_1_0;\r\n\r\n#endif /* !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES) */\r\n\r\n/* OpenCL 1.0 APIs that were deprecated in OpenCL 1.2 */\r\n\r\ntypedef cl_mem CL_API_CALL\r\nclCreateFromGLTexture2D_t(\r\n    cl_context context,\r\n    cl_mem_flags flags,\r\n    cl_GLenum target,\r\n    cl_GLint miplevel,\r\n    cl_GLuint texture,\r\n    cl_int* errcode_ret);\r\n\r\ntypedef clCreateFromGLTexture2D_t *\r\nclCreateFromGLTexture2D_fn CL_API_SUFFIX__VERSION_1_1_DEPRECATED;\r\n\r\ntypedef cl_mem CL_API_CALL\r\nclCreateFromGLTexture3D_t(\r\n    cl_context context,\r\n    cl_mem_flags flags,\r\n    cl_GLenum target,\r\n    cl_GLint miplevel,\r\n    cl_GLuint texture,\r\n    cl_int* errcode_ret);\r\n\r\ntypedef clCreateFromGLTexture3D_t *\r\nclCreateFromGLTexture3D_fn CL_API_SUFFIX__VERSION_1_1_DEPRECATED;\r\n\r\n#if !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES)\r\n\r\nextern CL_API_ENTRY cl_mem CL_API_CALL\r\nclCreateFromGLTexture2D(\r\n    cl_context context,\r\n    cl_mem_flags flags,\r\n    cl_GLenum target,\r\n    cl_GLint miplevel,\r\n   
 cl_GLuint texture,\r\n    cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_1_DEPRECATED;\r\n\r\nextern CL_API_ENTRY cl_mem CL_API_CALL\r\nclCreateFromGLTexture3D(\r\n    cl_context context,\r\n    cl_mem_flags flags,\r\n    cl_GLenum target,\r\n    cl_GLint miplevel,\r\n    cl_GLuint texture,\r\n    cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_1_DEPRECATED;\r\n\r\n#endif /* !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES) */\r\n\r\n/***************************************************************\r\n* cl_khr_gl_event\r\n***************************************************************/\r\n#define cl_khr_gl_event 1\r\n#define CL_KHR_GL_EVENT_EXTENSION_NAME \\\r\n    \"cl_khr_gl_event\"\r\n\r\n\r\n#define CL_KHR_GL_EVENT_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)\r\n\r\ntypedef struct __GLsync *   cl_GLsync;\r\n\r\n/* cl_command_type */\r\n#define CL_COMMAND_GL_FENCE_SYNC_OBJECT_KHR                 0x200D\r\n\r\n\r\ntypedef cl_event CL_API_CALL\r\nclCreateEventFromGLsyncKHR_t(\r\n    cl_context context,\r\n    cl_GLsync sync,\r\n    cl_int* errcode_ret);\r\n\r\ntypedef clCreateEventFromGLsyncKHR_t *\r\nclCreateEventFromGLsyncKHR_fn CL_API_SUFFIX__VERSION_1_1;\r\n\r\n#if !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES)\r\n\r\nextern CL_API_ENTRY cl_event CL_API_CALL\r\nclCreateEventFromGLsyncKHR(\r\n    cl_context context,\r\n    cl_GLsync sync,\r\n    cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_1;\r\n\r\n#endif /* !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES) */\r\n\r\n/***************************************************************\r\n* cl_khr_gl_depth_images\r\n***************************************************************/\r\n#define cl_khr_gl_depth_images 1\r\n#define CL_KHR_GL_DEPTH_IMAGES_EXTENSION_NAME \\\r\n    \"cl_khr_gl_depth_images\"\r\n\r\n\r\n#define CL_KHR_GL_DEPTH_IMAGES_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)\r\n\r\n/* cl_channel_order */\r\n#define CL_DEPTH_STENCIL                                    0x10BE\r\n\r\n/* cl_channel_type 
*/\r\n#define CL_UNORM_INT24                                      0x10DF\r\n\r\n/***************************************************************\r\n* cl_khr_gl_msaa_sharing\r\n***************************************************************/\r\n#define cl_khr_gl_msaa_sharing 1\r\n#define CL_KHR_GL_MSAA_SHARING_EXTENSION_NAME \\\r\n    \"cl_khr_gl_msaa_sharing\"\r\n\r\n\r\n#define CL_KHR_GL_MSAA_SHARING_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)\r\n\r\n/* cl_gl_texture_info */\r\n#define CL_GL_NUM_SAMPLES                                   0x2012\r\n\r\n/***************************************************************\r\n* cl_intel_sharing_format_query_gl\r\n***************************************************************/\r\n#define cl_intel_sharing_format_query_gl 1\r\n#define CL_INTEL_SHARING_FORMAT_QUERY_GL_EXTENSION_NAME \\\r\n    \"cl_intel_sharing_format_query_gl\"\r\n\r\n\r\n#define CL_INTEL_SHARING_FORMAT_QUERY_GL_EXTENSION_VERSION CL_MAKE_VERSION(0, 0, 0)\r\n\r\n/* when cl_khr_gl_sharing is supported */\r\n\r\ntypedef cl_int CL_API_CALL\r\nclGetSupportedGLTextureFormatsINTEL_t(\r\n    cl_context context,\r\n    cl_mem_flags flags,\r\n    cl_mem_object_type image_type,\r\n    cl_uint num_entries,\r\n    cl_GLenum* gl_formats,\r\n    cl_uint* num_texture_formats);\r\n\r\ntypedef clGetSupportedGLTextureFormatsINTEL_t *\r\nclGetSupportedGLTextureFormatsINTEL_fn ;\r\n\r\n#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES)\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclGetSupportedGLTextureFormatsINTEL(\r\n    cl_context context,\r\n    cl_mem_flags flags,\r\n    cl_mem_object_type image_type,\r\n    cl_uint num_entries,\r\n    cl_GLenum* gl_formats,\r\n    cl_uint* num_texture_formats) ;\r\n\r\n#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */\r\n\r\n#ifdef __cplusplus\r\n}\r\n#endif\r\n\r\n#endif /* OPENCL_CL_GL_H_ */\r\n"
  },
  {
    "path": "svm/OpenCL/include/CL/cl_gl_ext.h",
    "content": "/*******************************************************************************\r\n * Copyright (c) 2008-2021 The Khronos Group Inc.\r\n *\r\n * Licensed under the Apache License, Version 2.0 (the \"License\");\r\n * you may not use this file except in compliance with the License.\r\n * You may obtain a copy of the License at\r\n *\r\n *    http://www.apache.org/licenses/LICENSE-2.0\r\n *\r\n * Unless required by applicable law or agreed to in writing, software\r\n * distributed under the License is distributed on an \"AS IS\" BASIS,\r\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\r\n * See the License for the specific language governing permissions and\r\n * limitations under the License.\r\n ******************************************************************************/\r\n\r\n#include <CL/cl_gl.h>\r\n#pragma message(\"The extensions in cl_gl_ext.h have been moved into cl_gl.h.  Please include cl_gl.h directly.\")\r\n"
  },
  {
    "path": "svm/OpenCL/include/CL/cl_half.h",
    "content": "/*******************************************************************************\r\n * Copyright (c) 2019-2020 The Khronos Group Inc.\r\n *\r\n * Licensed under the Apache License, Version 2.0 (the \"License\");\r\n * you may not use this file except in compliance with the License.\r\n * You may obtain a copy of the License at\r\n *\r\n *    http://www.apache.org/licenses/LICENSE-2.0\r\n *\r\n * Unless required by applicable law or agreed to in writing, software\r\n * distributed under the License is distributed on an \"AS IS\" BASIS,\r\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\r\n * See the License for the specific language governing permissions and\r\n * limitations under the License.\r\n ******************************************************************************/\r\n\r\n/**\r\n * This is a header-only utility library that provides OpenCL host code with\r\n * routines for converting to/from cl_half values.\r\n *\r\n * Example usage:\r\n *\r\n *    #include <CL/cl_half.h>\r\n *    ...\r\n *    cl_half h = cl_half_from_float(0.5f, CL_HALF_RTE);\r\n *    cl_float f = cl_half_to_float(h);\r\n */\r\n\r\n#ifndef OPENCL_CL_HALF_H\r\n#define OPENCL_CL_HALF_H\r\n\r\n#include <CL/cl_platform.h>\r\n\r\n#include <stdint.h>\r\n\r\n#ifdef __cplusplus\r\nextern \"C\" {\r\n#endif\r\n\r\n\r\n/**\r\n * Rounding mode used when converting to cl_half.\r\n */\r\ntypedef enum\r\n{\r\n  CL_HALF_RTE, // round to nearest even\r\n  CL_HALF_RTZ, // round towards zero\r\n  CL_HALF_RTP, // round towards positive infinity\r\n  CL_HALF_RTN, // round towards negative infinity\r\n} cl_half_rounding_mode;\r\n\r\n\r\n/* Private utility macros. 
*/\r\n#define CL_HALF_EXP_MASK 0x7C00\r\n#define CL_HALF_MAX_FINITE_MAG 0x7BFF\r\n\r\n\r\n/*\r\n * Utility to deal with values that overflow when converting to half precision.\r\n */\r\nstatic inline cl_half cl_half_handle_overflow(cl_half_rounding_mode rounding_mode,\r\n                                              uint16_t sign)\r\n{\r\n  if (rounding_mode == CL_HALF_RTZ)\r\n  {\r\n    // Round overflow towards zero -> largest finite number (preserving sign)\r\n    return (sign << 15) | CL_HALF_MAX_FINITE_MAG;\r\n  }\r\n  else if (rounding_mode == CL_HALF_RTP && sign)\r\n  {\r\n    // Round negative overflow towards positive infinity -> most negative finite number\r\n    return (1 << 15) | CL_HALF_MAX_FINITE_MAG;\r\n  }\r\n  else if (rounding_mode == CL_HALF_RTN && !sign)\r\n  {\r\n    // Round positive overflow towards negative infinity -> largest finite number\r\n    return CL_HALF_MAX_FINITE_MAG;\r\n  }\r\n\r\n  // Overflow to infinity\r\n  return (sign << 15) | CL_HALF_EXP_MASK;\r\n}\r\n\r\n/*\r\n * Utility to deal with values that underflow when converting to half precision.\r\n */\r\nstatic inline cl_half cl_half_handle_underflow(cl_half_rounding_mode rounding_mode,\r\n                                               uint16_t sign)\r\n{\r\n  if (rounding_mode == CL_HALF_RTP && !sign)\r\n  {\r\n    // Round underflow towards positive infinity -> smallest positive value\r\n    return (sign << 15) | 1;\r\n  }\r\n  else if (rounding_mode == CL_HALF_RTN && sign)\r\n  {\r\n    // Round underflow towards negative infinity -> largest negative value\r\n    return (sign << 15) | 1;\r\n  }\r\n\r\n  // Flush to zero\r\n  return (sign << 15);\r\n}\r\n\r\n\r\n/**\r\n * Convert a cl_float to a cl_half.\r\n */\r\nstatic inline cl_half cl_half_from_float(cl_float f, cl_half_rounding_mode rounding_mode)\r\n{\r\n  // Type-punning to get direct access to underlying bits\r\n  union\r\n  {\r\n    cl_float f;\r\n    uint32_t i;\r\n  } f32;\r\n  f32.f = f;\r\n\r\n  // Extract sign 
bit\r\n  uint16_t sign = f32.i >> 31;\r\n\r\n  // Extract FP32 exponent and mantissa\r\n  uint32_t f_exp = (f32.i >> (CL_FLT_MANT_DIG - 1)) & 0xFF;\r\n  uint32_t f_mant = f32.i & ((1 << (CL_FLT_MANT_DIG - 1)) - 1);\r\n\r\n  // Remove FP32 exponent bias\r\n  int32_t exp = f_exp - CL_FLT_MAX_EXP + 1;\r\n\r\n  // Add FP16 exponent bias\r\n  uint16_t h_exp = (uint16_t)(exp + CL_HALF_MAX_EXP - 1);\r\n\r\n  // Position of the bit that will become the FP16 mantissa LSB\r\n  uint32_t lsb_pos = CL_FLT_MANT_DIG - CL_HALF_MANT_DIG;\r\n\r\n  // Check for NaN / infinity\r\n  if (f_exp == 0xFF)\r\n  {\r\n    if (f_mant)\r\n    {\r\n      // NaN -> propagate mantissa and silence it\r\n      uint16_t h_mant = (uint16_t)(f_mant >> lsb_pos);\r\n      h_mant |= 0x200;\r\n      return (sign << 15) | CL_HALF_EXP_MASK | h_mant;\r\n    }\r\n    else\r\n    {\r\n      // Infinity -> zero mantissa\r\n      return (sign << 15) | CL_HALF_EXP_MASK;\r\n    }\r\n  }\r\n\r\n  // Check for zero\r\n  if (!f_exp && !f_mant)\r\n  {\r\n    return (sign << 15);\r\n  }\r\n\r\n  // Check for overflow\r\n  if (exp >= CL_HALF_MAX_EXP)\r\n  {\r\n    return cl_half_handle_overflow(rounding_mode, sign);\r\n  }\r\n\r\n  // Check for underflow\r\n  if (exp < (CL_HALF_MIN_EXP - CL_HALF_MANT_DIG - 1))\r\n  {\r\n    return cl_half_handle_underflow(rounding_mode, sign);\r\n  }\r\n\r\n  // Check for value that will become denormal\r\n  if (exp < -14)\r\n  {\r\n    // Denormal -> include the implicit 1 from the FP32 mantissa\r\n    h_exp = 0;\r\n    f_mant |= 1 << (CL_FLT_MANT_DIG - 1);\r\n\r\n    // Mantissa shift amount depends on exponent\r\n    lsb_pos = -exp + (CL_FLT_MANT_DIG - 25);\r\n  }\r\n\r\n  // Generate FP16 mantissa by shifting FP32 mantissa\r\n  uint16_t h_mant = (uint16_t)(f_mant >> lsb_pos);\r\n\r\n  // Check whether we need to round\r\n  uint32_t halfway = 1 << (lsb_pos - 1);\r\n  uint32_t mask = (halfway << 1) - 1;\r\n  switch (rounding_mode)\r\n  {\r\n    case CL_HALF_RTE:\r\n      if ((f_mant & 
mask) > halfway)\r\n      {\r\n        // More than halfway -> round up\r\n        h_mant += 1;\r\n      }\r\n      else if ((f_mant & mask) == halfway)\r\n      {\r\n        // Exactly halfway -> round to nearest even\r\n        if (h_mant & 0x1)\r\n          h_mant += 1;\r\n      }\r\n      break;\r\n    case CL_HALF_RTZ:\r\n      // Mantissa has already been truncated -> do nothing\r\n      break;\r\n    case CL_HALF_RTP:\r\n      if ((f_mant & mask) && !sign)\r\n      {\r\n        // Round positive numbers up\r\n        h_mant += 1;\r\n      }\r\n      break;\r\n    case CL_HALF_RTN:\r\n      if ((f_mant & mask) && sign)\r\n      {\r\n        // Round negative numbers down\r\n        h_mant += 1;\r\n      }\r\n      break;\r\n  }\r\n\r\n  // Check for mantissa overflow\r\n  if (h_mant & 0x400)\r\n  {\r\n    h_exp += 1;\r\n    h_mant = 0;\r\n  }\r\n\r\n  return (sign << 15) | (h_exp << 10) | h_mant;\r\n}\r\n\r\n\r\n/**\r\n * Convert a cl_double to a cl_half.\r\n */\r\nstatic inline cl_half cl_half_from_double(cl_double d, cl_half_rounding_mode rounding_mode)\r\n{\r\n  // Type-punning to get direct access to underlying bits\r\n  union\r\n  {\r\n    cl_double d;\r\n    uint64_t i;\r\n  } f64;\r\n  f64.d = d;\r\n\r\n  // Extract sign bit\r\n  uint16_t sign = f64.i >> 63;\r\n\r\n  // Extract FP64 exponent and mantissa\r\n  uint64_t d_exp = (f64.i >> (CL_DBL_MANT_DIG - 1)) & 0x7FF;\r\n  uint64_t d_mant = f64.i & (((uint64_t)1 << (CL_DBL_MANT_DIG - 1)) - 1);\r\n\r\n  // Remove FP64 exponent bias\r\n  int64_t exp = d_exp - CL_DBL_MAX_EXP + 1;\r\n\r\n  // Add FP16 exponent bias\r\n  uint16_t h_exp = (uint16_t)(exp + CL_HALF_MAX_EXP - 1);\r\n\r\n  // Position of the bit that will become the FP16 mantissa LSB\r\n  uint32_t lsb_pos = CL_DBL_MANT_DIG - CL_HALF_MANT_DIG;\r\n\r\n  // Check for NaN / infinity\r\n  if (d_exp == 0x7FF)\r\n  {\r\n    if (d_mant)\r\n    {\r\n      // NaN -> propagate mantissa and silence it\r\n      uint16_t h_mant = (uint16_t)(d_mant >> 
lsb_pos);\r\n      h_mant |= 0x200;\r\n      return (sign << 15) | CL_HALF_EXP_MASK | h_mant;\r\n    }\r\n    else\r\n    {\r\n      // Infinity -> zero mantissa\r\n      return (sign << 15) | CL_HALF_EXP_MASK;\r\n    }\r\n  }\r\n\r\n  // Check for zero\r\n  if (!d_exp && !d_mant)\r\n  {\r\n    return (sign << 15);\r\n  }\r\n\r\n  // Check for overflow\r\n  if (exp >= CL_HALF_MAX_EXP)\r\n  {\r\n    return cl_half_handle_overflow(rounding_mode, sign);\r\n  }\r\n\r\n  // Check for underflow\r\n  if (exp < (CL_HALF_MIN_EXP - CL_HALF_MANT_DIG - 1))\r\n  {\r\n    return cl_half_handle_underflow(rounding_mode, sign);\r\n  }\r\n\r\n  // Check for value that will become denormal\r\n  if (exp < -14)\r\n  {\r\n    // Include the implicit 1 from the FP64 mantissa\r\n    h_exp = 0;\r\n    d_mant |= (uint64_t)1 << (CL_DBL_MANT_DIG - 1);\r\n\r\n    // Mantissa shift amount depends on exponent\r\n    lsb_pos = (uint32_t)(-exp + (CL_DBL_MANT_DIG - 25));\r\n  }\r\n\r\n  // Generate FP16 mantissa by shifting FP64 mantissa\r\n  uint16_t h_mant = (uint16_t)(d_mant >> lsb_pos);\r\n\r\n  // Check whether we need to round\r\n  uint64_t halfway = (uint64_t)1 << (lsb_pos - 1);\r\n  uint64_t mask = (halfway << 1) - 1;\r\n  switch (rounding_mode)\r\n  {\r\n    case CL_HALF_RTE:\r\n      if ((d_mant & mask) > halfway)\r\n      {\r\n        // More than halfway -> round up\r\n        h_mant += 1;\r\n      }\r\n      else if ((d_mant & mask) == halfway)\r\n      {\r\n        // Exactly halfway -> round to nearest even\r\n        if (h_mant & 0x1)\r\n          h_mant += 1;\r\n      }\r\n      break;\r\n    case CL_HALF_RTZ:\r\n      // Mantissa has already been truncated -> do nothing\r\n      break;\r\n    case CL_HALF_RTP:\r\n      if ((d_mant & mask) && !sign)\r\n      {\r\n        // Round positive numbers up\r\n        h_mant += 1;\r\n      }\r\n      break;\r\n    case CL_HALF_RTN:\r\n      if ((d_mant & mask) && sign)\r\n      {\r\n        // Round negative numbers down\r\n        h_mant 
+= 1;\r\n      }\r\n      break;\r\n  }\r\n\r\n  // Check for mantissa overflow\r\n  if (h_mant & 0x400)\r\n  {\r\n    h_exp += 1;\r\n    h_mant = 0;\r\n  }\r\n\r\n  return (sign << 15) | (h_exp << 10) | h_mant;\r\n}\r\n\r\n\r\n/**\r\n * Convert a cl_half to a cl_float.\r\n */\r\nstatic inline cl_float cl_half_to_float(cl_half h)\r\n{\r\n  // Type-punning to get direct access to underlying bits\r\n  union\r\n  {\r\n    cl_float f;\r\n    uint32_t i;\r\n  } f32;\r\n\r\n  // Extract sign bit\r\n  uint16_t sign = h >> 15;\r\n\r\n  // Extract FP16 exponent and mantissa\r\n  uint16_t h_exp = (h >> (CL_HALF_MANT_DIG - 1)) & 0x1F;\r\n  uint16_t h_mant = h & 0x3FF;\r\n\r\n  // Remove FP16 exponent bias\r\n  int32_t exp = h_exp - CL_HALF_MAX_EXP + 1;\r\n\r\n  // Add FP32 exponent bias\r\n  uint32_t f_exp = exp + CL_FLT_MAX_EXP - 1;\r\n\r\n  // Check for NaN / infinity\r\n  if (h_exp == 0x1F)\r\n  {\r\n    if (h_mant)\r\n    {\r\n      // NaN -> propagate mantissa and silence it\r\n      uint32_t f_mant = h_mant << (CL_FLT_MANT_DIG - CL_HALF_MANT_DIG);\r\n      f_mant |= 0x400000;\r\n      f32.i = (sign << 31) | 0x7F800000 | f_mant;\r\n      return f32.f;\r\n    }\r\n    else\r\n    {\r\n      // Infinity -> zero mantissa\r\n      f32.i = (sign << 31) | 0x7F800000;\r\n      return f32.f;\r\n    }\r\n  }\r\n\r\n  // Check for zero / denormal\r\n  if (h_exp == 0)\r\n  {\r\n    if (h_mant == 0)\r\n    {\r\n      // Zero -> zero exponent\r\n      f_exp = 0;\r\n    }\r\n    else\r\n    {\r\n      // Denormal -> normalize it\r\n      // - Shift mantissa to make most-significant 1 implicit\r\n      // - Adjust exponent accordingly\r\n      uint32_t shift = 0;\r\n      while ((h_mant & 0x400) == 0)\r\n      {\r\n        h_mant <<= 1;\r\n        shift++;\r\n      }\r\n      h_mant &= 0x3FF;\r\n      f_exp -= shift - 1;\r\n    }\r\n  }\r\n\r\n  f32.i = (sign << 31) | (f_exp << 23) | (h_mant << 13);\r\n  return f32.f;\r\n}\r\n\r\n\r\n#undef CL_HALF_EXP_MASK\r\n#undef 
CL_HALF_MAX_FINITE_MAG\r\n\r\n\r\n#ifdef __cplusplus\r\n}\r\n#endif\r\n\r\n\r\n#endif  /* OPENCL_CL_HALF_H */\r\n"
  },
  {
    "path": "svm/OpenCL/include/CL/cl_icd.h",
    "content": "/*******************************************************************************\r\n * Copyright (c) 2019-2020 The Khronos Group Inc.\r\n *\r\n * Licensed under the Apache License, Version 2.0 (the \"License\");\r\n * you may not use this file except in compliance with the License.\r\n * You may obtain a copy of the License at\r\n *\r\n *    http://www.apache.org/licenses/LICENSE-2.0\r\n *\r\n * Unless required by applicable law or agreed to in writing, software\r\n * distributed under the License is distributed on an \"AS IS\" BASIS,\r\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\r\n * See the License for the specific language governing permissions and\r\n * limitations under the License.\r\n ******************************************************************************/\r\n\r\n#ifndef OPENCL_CL_ICD_H\r\n#define OPENCL_CL_ICD_H\r\n\r\n#include <CL/cl.h>\r\n#include <CL/cl_function_types.h>\r\n#include <CL/cl_egl.h>\r\n#include <CL/cl_ext.h>\r\n#include <CL/cl_gl.h>\r\n\r\n#if defined(_WIN32)\r\n#include <CL/cl_d3d11.h>\r\n#include <CL/cl_d3d10.h>\r\n#include <CL/cl_dx9_media_sharing.h>\r\n#endif\r\n\r\n#ifdef __cplusplus\r\nextern \"C\" {\r\n#endif\r\n\r\n/* Vendor dispatch table structure */\r\n\r\ntypedef struct _cl_icd_dispatch {\r\n  /* OpenCL 1.0 */\r\n  clGetPlatformIDs_t *clGetPlatformIDs;\r\n  clGetPlatformInfo_t *clGetPlatformInfo;\r\n  clGetDeviceIDs_t *clGetDeviceIDs;\r\n  clGetDeviceInfo_t *clGetDeviceInfo;\r\n  clCreateContext_t *clCreateContext;\r\n  clCreateContextFromType_t *clCreateContextFromType;\r\n  clRetainContext_t *clRetainContext;\r\n  clReleaseContext_t *clReleaseContext;\r\n  clGetContextInfo_t *clGetContextInfo;\r\n  clCreateCommandQueue_t *clCreateCommandQueue;\r\n  clRetainCommandQueue_t *clRetainCommandQueue;\r\n  clReleaseCommandQueue_t *clReleaseCommandQueue;\r\n  clGetCommandQueueInfo_t *clGetCommandQueueInfo;\r\n  clSetCommandQueueProperty_t *clSetCommandQueueProperty;\r\n  
clCreateBuffer_t *clCreateBuffer;\r\n  clCreateImage2D_t *clCreateImage2D;\r\n  clCreateImage3D_t *clCreateImage3D;\r\n  clRetainMemObject_t *clRetainMemObject;\r\n  clReleaseMemObject_t *clReleaseMemObject;\r\n  clGetSupportedImageFormats_t *clGetSupportedImageFormats;\r\n  clGetMemObjectInfo_t *clGetMemObjectInfo;\r\n  clGetImageInfo_t *clGetImageInfo;\r\n  clCreateSampler_t *clCreateSampler;\r\n  clRetainSampler_t *clRetainSampler;\r\n  clReleaseSampler_t *clReleaseSampler;\r\n  clGetSamplerInfo_t *clGetSamplerInfo;\r\n  clCreateProgramWithSource_t *clCreateProgramWithSource;\r\n  clCreateProgramWithBinary_t *clCreateProgramWithBinary;\r\n  clRetainProgram_t *clRetainProgram;\r\n  clReleaseProgram_t *clReleaseProgram;\r\n  clBuildProgram_t *clBuildProgram;\r\n  clUnloadCompiler_t *clUnloadCompiler;\r\n  clGetProgramInfo_t *clGetProgramInfo;\r\n  clGetProgramBuildInfo_t *clGetProgramBuildInfo;\r\n  clCreateKernel_t *clCreateKernel;\r\n  clCreateKernelsInProgram_t *clCreateKernelsInProgram;\r\n  clRetainKernel_t *clRetainKernel;\r\n  clReleaseKernel_t *clReleaseKernel;\r\n  clSetKernelArg_t *clSetKernelArg;\r\n  clGetKernelInfo_t *clGetKernelInfo;\r\n  clGetKernelWorkGroupInfo_t *clGetKernelWorkGroupInfo;\r\n  clWaitForEvents_t *clWaitForEvents;\r\n  clGetEventInfo_t *clGetEventInfo;\r\n  clRetainEvent_t *clRetainEvent;\r\n  clReleaseEvent_t *clReleaseEvent;\r\n  clGetEventProfilingInfo_t *clGetEventProfilingInfo;\r\n  clFlush_t *clFlush;\r\n  clFinish_t *clFinish;\r\n  clEnqueueReadBuffer_t *clEnqueueReadBuffer;\r\n  clEnqueueWriteBuffer_t *clEnqueueWriteBuffer;\r\n  clEnqueueCopyBuffer_t *clEnqueueCopyBuffer;\r\n  clEnqueueReadImage_t *clEnqueueReadImage;\r\n  clEnqueueWriteImage_t *clEnqueueWriteImage;\r\n  clEnqueueCopyImage_t *clEnqueueCopyImage;\r\n  clEnqueueCopyImageToBuffer_t *clEnqueueCopyImageToBuffer;\r\n  clEnqueueCopyBufferToImage_t *clEnqueueCopyBufferToImage;\r\n  clEnqueueMapBuffer_t *clEnqueueMapBuffer;\r\n  clEnqueueMapImage_t 
*clEnqueueMapImage;\r\n  clEnqueueUnmapMemObject_t *clEnqueueUnmapMemObject;\r\n  clEnqueueNDRangeKernel_t *clEnqueueNDRangeKernel;\r\n  clEnqueueTask_t *clEnqueueTask;\r\n  clEnqueueNativeKernel_t *clEnqueueNativeKernel;\r\n  clEnqueueMarker_t *clEnqueueMarker;\r\n  clEnqueueWaitForEvents_t *clEnqueueWaitForEvents;\r\n  clEnqueueBarrier_t *clEnqueueBarrier;\r\n  clGetExtensionFunctionAddress_t *clGetExtensionFunctionAddress;\r\n  clCreateFromGLBuffer_t *clCreateFromGLBuffer;\r\n  clCreateFromGLTexture2D_t *clCreateFromGLTexture2D;\r\n  clCreateFromGLTexture3D_t *clCreateFromGLTexture3D;\r\n  clCreateFromGLRenderbuffer_t *clCreateFromGLRenderbuffer;\r\n  clGetGLObjectInfo_t *clGetGLObjectInfo;\r\n  clGetGLTextureInfo_t *clGetGLTextureInfo;\r\n  clEnqueueAcquireGLObjects_t *clEnqueueAcquireGLObjects;\r\n  clEnqueueReleaseGLObjects_t *clEnqueueReleaseGLObjects;\r\n  clGetGLContextInfoKHR_t *clGetGLContextInfoKHR;\r\n\r\n  /* cl_khr_d3d10_sharing */\r\n#ifdef _WIN32\r\n  clGetDeviceIDsFromD3D10KHR_t *clGetDeviceIDsFromD3D10KHR;\r\n  clCreateFromD3D10BufferKHR_t *clCreateFromD3D10BufferKHR;\r\n  clCreateFromD3D10Texture2DKHR_t *clCreateFromD3D10Texture2DKHR;\r\n  clCreateFromD3D10Texture3DKHR_t *clCreateFromD3D10Texture3DKHR;\r\n  clEnqueueAcquireD3D10ObjectsKHR_t *clEnqueueAcquireD3D10ObjectsKHR;\r\n  clEnqueueReleaseD3D10ObjectsKHR_t *clEnqueueReleaseD3D10ObjectsKHR;\r\n#else\r\n  void *clGetDeviceIDsFromD3D10KHR;\r\n  void *clCreateFromD3D10BufferKHR;\r\n  void *clCreateFromD3D10Texture2DKHR;\r\n  void *clCreateFromD3D10Texture3DKHR;\r\n  void *clEnqueueAcquireD3D10ObjectsKHR;\r\n  void *clEnqueueReleaseD3D10ObjectsKHR;\r\n#endif\r\n\r\n  /* OpenCL 1.1 */\r\n#ifdef CL_VERSION_1_1\r\n  clSetEventCallback_t *clSetEventCallback;\r\n  clCreateSubBuffer_t *clCreateSubBuffer;\r\n  clSetMemObjectDestructorCallback_t *clSetMemObjectDestructorCallback;\r\n  clCreateUserEvent_t *clCreateUserEvent;\r\n  clSetUserEventStatus_t *clSetUserEventStatus;\r\n  
clEnqueueReadBufferRect_t *clEnqueueReadBufferRect;\r\n  clEnqueueWriteBufferRect_t *clEnqueueWriteBufferRect;\r\n  clEnqueueCopyBufferRect_t *clEnqueueCopyBufferRect;\r\n#else\r\n  void *clSetEventCallback;\r\n  void *clCreateSubBuffer;\r\n  void *clSetMemObjectDestructorCallback;\r\n  void *clCreateUserEvent;\r\n  void *clSetUserEventStatus;\r\n  void *clEnqueueReadBufferRect;\r\n  void *clEnqueueWriteBufferRect;\r\n  void *clEnqueueCopyBufferRect;\r\n#endif\r\n\r\n  /* cl_ext_device_fission */\r\n  clCreateSubDevicesEXT_t *clCreateSubDevicesEXT;\r\n  clRetainDeviceEXT_t *clRetainDeviceEXT;\r\n  clReleaseDeviceEXT_t *clReleaseDeviceEXT;\r\n\r\n  /* cl_khr_gl_event */\r\n  clCreateEventFromGLsyncKHR_t *clCreateEventFromGLsyncKHR;\r\n\r\n  /* OpenCL 1.2 */\r\n#ifdef CL_VERSION_1_2\r\n  clCreateSubDevices_t *clCreateSubDevices;\r\n  clRetainDevice_t *clRetainDevice;\r\n  clReleaseDevice_t *clReleaseDevice;\r\n  clCreateImage_t *clCreateImage;\r\n  clCreateProgramWithBuiltInKernels_t *clCreateProgramWithBuiltInKernels;\r\n  clCompileProgram_t *clCompileProgram;\r\n  clLinkProgram_t *clLinkProgram;\r\n  clUnloadPlatformCompiler_t *clUnloadPlatformCompiler;\r\n  clGetKernelArgInfo_t *clGetKernelArgInfo;\r\n  clEnqueueFillBuffer_t *clEnqueueFillBuffer;\r\n  clEnqueueFillImage_t *clEnqueueFillImage;\r\n  clEnqueueMigrateMemObjects_t *clEnqueueMigrateMemObjects;\r\n  clEnqueueMarkerWithWaitList_t *clEnqueueMarkerWithWaitList;\r\n  clEnqueueBarrierWithWaitList_t *clEnqueueBarrierWithWaitList;\r\n  clGetExtensionFunctionAddressForPlatform_t *\r\n      clGetExtensionFunctionAddressForPlatform;\r\n  clCreateFromGLTexture_t *clCreateFromGLTexture;\r\n#else\r\n  void *clCreateSubDevices;\r\n  void *clRetainDevice;\r\n  void *clReleaseDevice;\r\n  void *clCreateImage;\r\n  void *clCreateProgramWithBuiltInKernels;\r\n  void *clCompileProgram;\r\n  void *clLinkProgram;\r\n  void *clUnloadPlatformCompiler;\r\n  void *clGetKernelArgInfo;\r\n  void *clEnqueueFillBuffer;\r\n  void 
*clEnqueueFillImage;\r\n  void *clEnqueueMigrateMemObjects;\r\n  void *clEnqueueMarkerWithWaitList;\r\n  void *clEnqueueBarrierWithWaitList;\r\n  void *clGetExtensionFunctionAddressForPlatform;\r\n  void *clCreateFromGLTexture;\r\n#endif\r\n\r\n  /* cl_khr_d3d11_sharing and cl_khr_dx9_media_sharing */\r\n#ifdef _WIN32\r\n  clGetDeviceIDsFromD3D11KHR_t *clGetDeviceIDsFromD3D11KHR;\r\n  clCreateFromD3D11BufferKHR_t *clCreateFromD3D11BufferKHR;\r\n  clCreateFromD3D11Texture2DKHR_t *clCreateFromD3D11Texture2DKHR;\r\n  clCreateFromD3D11Texture3DKHR_t *clCreateFromD3D11Texture3DKHR;\r\n  clCreateFromDX9MediaSurfaceKHR_t *clCreateFromDX9MediaSurfaceKHR;\r\n  clEnqueueAcquireD3D11ObjectsKHR_t *clEnqueueAcquireD3D11ObjectsKHR;\r\n  clEnqueueReleaseD3D11ObjectsKHR_t *clEnqueueReleaseD3D11ObjectsKHR;\r\n  clGetDeviceIDsFromDX9MediaAdapterKHR_t *\r\n      clGetDeviceIDsFromDX9MediaAdapterKHR;\r\n  clEnqueueAcquireDX9MediaSurfacesKHR_t *\r\n      clEnqueueAcquireDX9MediaSurfacesKHR;\r\n  clEnqueueReleaseDX9MediaSurfacesKHR_t *\r\n      clEnqueueReleaseDX9MediaSurfacesKHR;\r\n#else\r\n  void *clGetDeviceIDsFromD3D11KHR;\r\n  void *clCreateFromD3D11BufferKHR;\r\n  void *clCreateFromD3D11Texture2DKHR;\r\n  void *clCreateFromD3D11Texture3DKHR;\r\n  void *clCreateFromDX9MediaSurfaceKHR;\r\n  void *clEnqueueAcquireD3D11ObjectsKHR;\r\n  void *clEnqueueReleaseD3D11ObjectsKHR;\r\n  void *clGetDeviceIDsFromDX9MediaAdapterKHR;\r\n  void *clEnqueueAcquireDX9MediaSurfacesKHR;\r\n  void *clEnqueueReleaseDX9MediaSurfacesKHR;\r\n#endif\r\n\r\n  /* cl_khr_egl_image */\r\n  clCreateFromEGLImageKHR_t *clCreateFromEGLImageKHR;\r\n  clEnqueueAcquireEGLObjectsKHR_t *clEnqueueAcquireEGLObjectsKHR;\r\n  clEnqueueReleaseEGLObjectsKHR_t *clEnqueueReleaseEGLObjectsKHR;\r\n\r\n  /* cl_khr_egl_event */\r\n  clCreateEventFromEGLSyncKHR_t *clCreateEventFromEGLSyncKHR;\r\n\r\n  /* OpenCL 2.0 */\r\n#ifdef CL_VERSION_2_0\r\n  clCreateCommandQueueWithProperties_t *clCreateCommandQueueWithProperties;\r\n  
clCreatePipe_t *clCreatePipe;\r\n  clGetPipeInfo_t *clGetPipeInfo;\r\n  clSVMAlloc_t *clSVMAlloc;\r\n  clSVMFree_t *clSVMFree;\r\n  clEnqueueSVMFree_t *clEnqueueSVMFree;\r\n  clEnqueueSVMMemcpy_t *clEnqueueSVMMemcpy;\r\n  clEnqueueSVMMemFill_t *clEnqueueSVMMemFill;\r\n  clEnqueueSVMMap_t *clEnqueueSVMMap;\r\n  clEnqueueSVMUnmap_t *clEnqueueSVMUnmap;\r\n  clCreateSamplerWithProperties_t *clCreateSamplerWithProperties;\r\n  clSetKernelArgSVMPointer_t *clSetKernelArgSVMPointer;\r\n  clSetKernelExecInfo_t *clSetKernelExecInfo;\r\n#else\r\n  void *clCreateCommandQueueWithProperties;\r\n  void *clCreatePipe;\r\n  void *clGetPipeInfo;\r\n  void *clSVMAlloc;\r\n  void *clSVMFree;\r\n  void *clEnqueueSVMFree;\r\n  void *clEnqueueSVMMemcpy;\r\n  void *clEnqueueSVMMemFill;\r\n  void *clEnqueueSVMMap;\r\n  void *clEnqueueSVMUnmap;\r\n  void *clCreateSamplerWithProperties;\r\n  void *clSetKernelArgSVMPointer;\r\n  void *clSetKernelExecInfo;\r\n#endif\r\n\r\n  /* cl_khr_sub_groups */\r\n  clGetKernelSubGroupInfoKHR_t *clGetKernelSubGroupInfoKHR;\r\n\r\n  /* OpenCL 2.1 */\r\n#ifdef CL_VERSION_2_1\r\n  clCloneKernel_t *clCloneKernel;\r\n  clCreateProgramWithIL_t *clCreateProgramWithIL;\r\n  clEnqueueSVMMigrateMem_t *clEnqueueSVMMigrateMem;\r\n  clGetDeviceAndHostTimer_t *clGetDeviceAndHostTimer;\r\n  clGetHostTimer_t *clGetHostTimer;\r\n  clGetKernelSubGroupInfo_t *clGetKernelSubGroupInfo;\r\n  clSetDefaultDeviceCommandQueue_t *clSetDefaultDeviceCommandQueue;\r\n#else\r\n  void *clCloneKernel;\r\n  void *clCreateProgramWithIL;\r\n  void *clEnqueueSVMMigrateMem;\r\n  void *clGetDeviceAndHostTimer;\r\n  void *clGetHostTimer;\r\n  void *clGetKernelSubGroupInfo;\r\n  void *clSetDefaultDeviceCommandQueue;\r\n#endif\r\n\r\n  /* OpenCL 2.2 */\r\n#ifdef CL_VERSION_2_2\r\n  clSetProgramReleaseCallback_t *clSetProgramReleaseCallback;\r\n  clSetProgramSpecializationConstant_t *clSetProgramSpecializationConstant;\r\n#else\r\n  void *clSetProgramReleaseCallback;\r\n  void 
*clSetProgramSpecializationConstant;\r\n#endif\r\n\r\n  /* OpenCL 3.0 */\r\n#ifdef CL_VERSION_3_0\r\n  clCreateBufferWithProperties_t *clCreateBufferWithProperties;\r\n  clCreateImageWithProperties_t *clCreateImageWithProperties;\r\n  clSetContextDestructorCallback_t *clSetContextDestructorCallback;\r\n#else\r\n  void *clCreateBufferWithProperties;\r\n  void *clCreateImageWithProperties;\r\n  void *clSetContextDestructorCallback;\r\n#endif\r\n\r\n} cl_icd_dispatch;\r\n\r\n#ifdef __cplusplus\r\n}\r\n#endif\r\n\r\n#endif /* #ifndef OPENCL_CL_ICD_H */\r\n"
  },
  {
    "path": "svm/OpenCL/include/CL/cl_layer.h",
    "content": "/*******************************************************************************\r\n * Copyright (c) 2008-2023 The Khronos Group Inc.\r\n *\r\n * Licensed under the Apache License, Version 2.0 (the \"License\");\r\n * you may not use this file except in compliance with the License.\r\n * You may obtain a copy of the License at\r\n *\r\n *    http://www.apache.org/licenses/LICENSE-2.0\r\n *\r\n * Unless required by applicable law or agreed to in writing, software\r\n * distributed under the License is distributed on an \"AS IS\" BASIS,\r\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\r\n * See the License for the specific language governing permissions and\r\n * limitations under the License.\r\n ******************************************************************************/\r\n\r\n#ifndef OPENCL_CL_LAYER_H_\r\n#define OPENCL_CL_LAYER_H_\r\n\r\n/*\r\n** This header is generated from the Khronos OpenCL XML API Registry.\r\n*/\r\n\r\n#include <CL/cl_icd.h>\r\n\r\n#include <CL/cl.h>\r\n\r\n/* CL_NO_PROTOTYPES implies CL_NO_EXTENSION_PROTOTYPES: */\r\n#if defined(CL_NO_PROTOTYPES) && !defined(CL_NO_EXTENSION_PROTOTYPES)\r\n#define CL_NO_EXTENSION_PROTOTYPES\r\n#endif\r\n\r\n/* CL_NO_EXTENSION_PROTOTYPES implies\r\n   CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES and\r\n   CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES: */\r\n#if defined(CL_NO_EXTENSION_PROTOTYPES) && \\\r\n    !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES)\r\n#define CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES\r\n#endif\r\n#if defined(CL_NO_EXTENSION_PROTOTYPES) && \\\r\n    !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES)\r\n#define CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES\r\n#endif\r\n\r\n#ifdef __cplusplus\r\nextern \"C\" {\r\n#endif\r\n\r\n/***************************************************************\r\n* cl_loader_layers\r\n***************************************************************/\r\n#define cl_loader_layers 1\r\n#define 
CL_LOADER_LAYERS_EXTENSION_NAME \\\r\n    \"cl_loader_layers\"\r\n\r\n\r\n#define CL_LOADER_LAYERS_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)\r\n\r\ntypedef cl_uint             cl_layer_info;\r\ntypedef cl_uint             cl_layer_api_version;\r\n\r\n/* cl_layer_info */\r\n#define CL_LAYER_API_VERSION                                0x4240\r\n#define CL_LAYER_NAME                                       0x4241\r\n\r\n/* Misc API enums */\r\n#define CL_LAYER_API_VERSION_100                            100\r\n\r\n\r\ntypedef cl_int CL_API_CALL\r\nclGetLayerInfo_t(\r\n    cl_layer_info param_name,\r\n    size_t param_value_size,\r\n    void* param_value,\r\n    size_t* param_value_size_ret);\r\n\r\ntypedef clGetLayerInfo_t *\r\nclGetLayerInfo_fn ;\r\n\r\ntypedef cl_int CL_API_CALL\r\nclInitLayer_t(\r\n    cl_uint num_entries,\r\n    const cl_icd_dispatch* target_dispatch,\r\n    cl_uint* num_entries_ret,\r\n    const cl_icd_dispatch** layer_dispatch_ret);\r\n\r\ntypedef clInitLayer_t *\r\nclInitLayer_fn ;\r\n\r\n/*\r\n** The function pointer typedefs prefixed with \"pfn_\" are provided for\r\n** compatibility with earlier versions of the headers.  
New code is\r\n** encouraged to use the function pointer typedefs that are suffixed with\r\n** \"_fn\" instead, for consistency.\r\n*/\r\n\r\ntypedef clGetLayerInfo_t *\r\npfn_clGetLayerInfo ;\r\n\r\ntypedef clInitLayer_t *\r\npfn_clInitLayer ;\r\n\r\n#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES)\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclGetLayerInfo(\r\n    cl_layer_info param_name,\r\n    size_t param_value_size,\r\n    void* param_value,\r\n    size_t* param_value_size_ret) ;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclInitLayer(\r\n    cl_uint num_entries,\r\n    const cl_icd_dispatch* target_dispatch,\r\n    cl_uint* num_entries_ret,\r\n    const cl_icd_dispatch** layer_dispatch_ret) ;\r\n\r\n#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */\r\n\r\n#ifdef __cplusplus\r\n}\r\n#endif\r\n\r\n#endif /* OPENCL_CL_LAYER_H_ */\r\n"
  },
  {
    "path": "svm/OpenCL/include/CL/cl_platform.h",
    "content": "/*******************************************************************************\r\n * Copyright (c) 2008-2020 The Khronos Group Inc.\r\n *\r\n * Licensed under the Apache License, Version 2.0 (the \"License\");\r\n * you may not use this file except in compliance with the License.\r\n * You may obtain a copy of the License at\r\n *\r\n *    http://www.apache.org/licenses/LICENSE-2.0\r\n *\r\n * Unless required by applicable law or agreed to in writing, software\r\n * distributed under the License is distributed on an \"AS IS\" BASIS,\r\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\r\n * See the License for the specific language governing permissions and\r\n * limitations under the License.\r\n ******************************************************************************/\r\n\r\n#ifndef __CL_PLATFORM_H\r\n#define __CL_PLATFORM_H\r\n\r\n#include <CL/cl_version.h>\r\n\r\n#ifdef __cplusplus\r\nextern \"C\" {\r\n#endif\r\n\r\n#if defined(_WIN32)\r\n    #if !defined(CL_API_ENTRY)\r\n        #define CL_API_ENTRY\r\n    #endif\r\n    #if !defined(CL_API_CALL)\r\n        #define CL_API_CALL     __stdcall\r\n    #endif\r\n    #if !defined(CL_CALLBACK)\r\n        #define CL_CALLBACK     __stdcall\r\n    #endif\r\n#else\r\n    #if !defined(CL_API_ENTRY)\r\n        #define CL_API_ENTRY\r\n    #endif\r\n    #if !defined(CL_API_CALL)\r\n        #define CL_API_CALL\r\n    #endif\r\n    #if !defined(CL_CALLBACK)\r\n        #define CL_CALLBACK\r\n    #endif\r\n#endif\r\n\r\n/*\r\n * Deprecation flags refer to the last version of the header in which the\r\n * feature was not deprecated.\r\n *\r\n * E.g. 
VERSION_1_1_DEPRECATED means the feature is present in 1.1 without\r\n * deprecation but is deprecated in versions later than 1.1.\r\n */\r\n\r\n#ifndef CL_API_SUFFIX_USER\r\n#define CL_API_SUFFIX_USER\r\n#endif\r\n\r\n#ifndef CL_API_PREFIX_USER\r\n#define CL_API_PREFIX_USER\r\n#endif\r\n\r\n#define CL_API_SUFFIX_COMMON CL_API_SUFFIX_USER\r\n#define CL_API_PREFIX_COMMON CL_API_PREFIX_USER\r\n\r\n#define CL_API_SUFFIX__VERSION_1_0 CL_API_SUFFIX_COMMON\r\n#define CL_API_SUFFIX__VERSION_1_1 CL_API_SUFFIX_COMMON\r\n#define CL_API_SUFFIX__VERSION_1_2 CL_API_SUFFIX_COMMON\r\n#define CL_API_SUFFIX__VERSION_2_0 CL_API_SUFFIX_COMMON\r\n#define CL_API_SUFFIX__VERSION_2_1 CL_API_SUFFIX_COMMON\r\n#define CL_API_SUFFIX__VERSION_2_2 CL_API_SUFFIX_COMMON\r\n#define CL_API_SUFFIX__VERSION_3_0 CL_API_SUFFIX_COMMON\r\n#define CL_API_SUFFIX__EXPERIMENTAL CL_API_SUFFIX_COMMON\r\n\r\n\r\n#ifdef __GNUC__\r\n  #define CL_API_SUFFIX_DEPRECATED __attribute__((deprecated))\r\n  #define CL_API_PREFIX_DEPRECATED\r\n#elif defined(_MSC_VER) && !defined(__clang__)\r\n  #define CL_API_SUFFIX_DEPRECATED\r\n  #define CL_API_PREFIX_DEPRECATED __declspec(deprecated)\r\n#else\r\n  #define CL_API_SUFFIX_DEPRECATED\r\n  #define CL_API_PREFIX_DEPRECATED\r\n#endif\r\n\r\n#ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS\r\n    #define CL_API_SUFFIX__VERSION_1_0_DEPRECATED CL_API_SUFFIX_COMMON\r\n    #define CL_API_PREFIX__VERSION_1_0_DEPRECATED CL_API_PREFIX_COMMON\r\n#else\r\n    #define CL_API_SUFFIX__VERSION_1_0_DEPRECATED CL_API_SUFFIX_COMMON CL_API_SUFFIX_DEPRECATED\r\n    #define CL_API_PREFIX__VERSION_1_0_DEPRECATED CL_API_PREFIX_COMMON CL_API_PREFIX_DEPRECATED\r\n#endif\r\n\r\n#ifdef CL_USE_DEPRECATED_OPENCL_1_1_APIS\r\n    #define CL_API_SUFFIX__VERSION_1_1_DEPRECATED CL_API_SUFFIX_COMMON\r\n    #define CL_API_PREFIX__VERSION_1_1_DEPRECATED CL_API_PREFIX_COMMON\r\n#else\r\n    #define CL_API_SUFFIX__VERSION_1_1_DEPRECATED CL_API_SUFFIX_COMMON CL_API_SUFFIX_DEPRECATED\r\n    #define 
CL_API_PREFIX__VERSION_1_1_DEPRECATED CL_API_PREFIX_COMMON CL_API_PREFIX_DEPRECATED\r\n#endif\r\n\r\n#ifdef CL_USE_DEPRECATED_OPENCL_1_2_APIS\r\n    #define CL_API_SUFFIX__VERSION_1_2_DEPRECATED CL_API_SUFFIX_COMMON\r\n    #define CL_API_PREFIX__VERSION_1_2_DEPRECATED CL_API_PREFIX_COMMON\r\n#else\r\n    #define CL_API_SUFFIX__VERSION_1_2_DEPRECATED CL_API_SUFFIX_COMMON CL_API_SUFFIX_DEPRECATED\r\n    #define CL_API_PREFIX__VERSION_1_2_DEPRECATED CL_API_PREFIX_COMMON CL_API_PREFIX_DEPRECATED\r\n #endif\r\n\r\n#ifdef CL_USE_DEPRECATED_OPENCL_2_0_APIS\r\n    #define CL_API_SUFFIX__VERSION_2_0_DEPRECATED CL_API_SUFFIX_COMMON\r\n    #define CL_API_PREFIX__VERSION_2_0_DEPRECATED CL_API_PREFIX_COMMON\r\n#else\r\n    #define CL_API_SUFFIX__VERSION_2_0_DEPRECATED CL_API_SUFFIX_COMMON CL_API_SUFFIX_DEPRECATED\r\n    #define CL_API_PREFIX__VERSION_2_0_DEPRECATED CL_API_PREFIX_COMMON CL_API_PREFIX_DEPRECATED\r\n#endif\r\n\r\n#ifdef CL_USE_DEPRECATED_OPENCL_2_1_APIS\r\n    #define CL_API_SUFFIX__VERSION_2_1_DEPRECATED CL_API_SUFFIX_COMMON\r\n    #define CL_API_PREFIX__VERSION_2_1_DEPRECATED CL_API_PREFIX_COMMON\r\n#else\r\n    #define CL_API_SUFFIX__VERSION_2_1_DEPRECATED CL_API_SUFFIX_COMMON CL_API_SUFFIX_DEPRECATED\r\n    #define CL_API_PREFIX__VERSION_2_1_DEPRECATED CL_API_PREFIX_COMMON CL_API_PREFIX_DEPRECATED\r\n#endif\r\n\r\n#ifdef CL_USE_DEPRECATED_OPENCL_2_2_APIS\r\n    #define CL_API_SUFFIX__VERSION_2_2_DEPRECATED CL_API_SUFFIX_COMMON\r\n    #define CL_API_PREFIX__VERSION_2_2_DEPRECATED CL_API_PREFIX_COMMON\r\n#else\r\n    #define CL_API_SUFFIX__VERSION_2_2_DEPRECATED CL_API_SUFFIX_COMMON CL_API_SUFFIX_DEPRECATED\r\n    #define CL_API_PREFIX__VERSION_2_2_DEPRECATED CL_API_PREFIX_COMMON CL_API_PREFIX_DEPRECATED\r\n#endif\r\n\r\n#if (defined (_WIN32) && defined(_MSC_VER))\r\n\r\n#if defined(__clang__)\r\n#pragma clang diagnostic push\r\n#pragma clang diagnostic ignored \"-Wlanguage-extension-token\"\r\n#endif\r\n\r\n/* intptr_t is used in cl.h and provided by stddef.h 
in Visual C++, but not in clang */\r\n/* stdint.h was missing before Visual Studio 2010, include it for later versions and for clang */\r\n#if defined(__clang__) || _MSC_VER >= 1600\r\n    #include <stdint.h>\r\n#endif\r\n\r\n/* scalar types  */\r\ntypedef signed   __int8         cl_char;\r\ntypedef unsigned __int8         cl_uchar;\r\ntypedef signed   __int16        cl_short;\r\ntypedef unsigned __int16        cl_ushort;\r\ntypedef signed   __int32        cl_int;\r\ntypedef unsigned __int32        cl_uint;\r\ntypedef signed   __int64        cl_long;\r\ntypedef unsigned __int64        cl_ulong;\r\n\r\ntypedef unsigned __int16        cl_half;\r\ntypedef float                   cl_float;\r\ntypedef double                  cl_double;\r\n\r\n#if defined(__clang__)\r\n#pragma clang diagnostic pop\r\n#endif\r\n\r\n/* Macro names and corresponding values defined by OpenCL */\r\n#define CL_CHAR_BIT         8\r\n#define CL_SCHAR_MAX        127\r\n#define CL_SCHAR_MIN        (-127-1)\r\n#define CL_CHAR_MAX         CL_SCHAR_MAX\r\n#define CL_CHAR_MIN         CL_SCHAR_MIN\r\n#define CL_UCHAR_MAX        255\r\n#define CL_SHRT_MAX         32767\r\n#define CL_SHRT_MIN         (-32767-1)\r\n#define CL_USHRT_MAX        65535\r\n#define CL_INT_MAX          2147483647\r\n#define CL_INT_MIN          (-2147483647-1)\r\n#define CL_UINT_MAX         0xffffffffU\r\n#define CL_LONG_MAX         ((cl_long) 0x7FFFFFFFFFFFFFFFLL)\r\n#define CL_LONG_MIN         ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL)\r\n#define CL_ULONG_MAX        ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL)\r\n\r\n#define CL_FLT_DIG          6\r\n#define CL_FLT_MANT_DIG     24\r\n#define CL_FLT_MAX_10_EXP   +38\r\n#define CL_FLT_MAX_EXP      +128\r\n#define CL_FLT_MIN_10_EXP   -37\r\n#define CL_FLT_MIN_EXP      -125\r\n#define CL_FLT_RADIX        2\r\n#define CL_FLT_MAX          340282346638528859811704183484516925440.0f\r\n#define CL_FLT_MIN          1.175494350822287507969e-38f\r\n#define CL_FLT_EPSILON      
1.1920928955078125e-7f\r\n\r\n#define CL_HALF_DIG          3\r\n#define CL_HALF_MANT_DIG     11\r\n#define CL_HALF_MAX_10_EXP   +4\r\n#define CL_HALF_MAX_EXP      +16\r\n#define CL_HALF_MIN_10_EXP   -4\r\n#define CL_HALF_MIN_EXP      -13\r\n#define CL_HALF_RADIX        2\r\n#define CL_HALF_MAX          65504.0f\r\n#define CL_HALF_MIN          6.103515625e-05f\r\n#define CL_HALF_EPSILON      9.765625e-04f\r\n\r\n#define CL_DBL_DIG          15\r\n#define CL_DBL_MANT_DIG     53\r\n#define CL_DBL_MAX_10_EXP   +308\r\n#define CL_DBL_MAX_EXP      +1024\r\n#define CL_DBL_MIN_10_EXP   -307\r\n#define CL_DBL_MIN_EXP      -1021\r\n#define CL_DBL_RADIX        2\r\n#define CL_DBL_MAX          1.7976931348623158e+308\r\n#define CL_DBL_MIN          2.225073858507201383090e-308\r\n#define CL_DBL_EPSILON      2.220446049250313080847e-16\r\n\r\n#define CL_M_E              2.7182818284590452354\r\n#define CL_M_LOG2E          1.4426950408889634074\r\n#define CL_M_LOG10E         0.43429448190325182765\r\n#define CL_M_LN2            0.69314718055994530942\r\n#define CL_M_LN10           2.30258509299404568402\r\n#define CL_M_PI             3.14159265358979323846\r\n#define CL_M_PI_2           1.57079632679489661923\r\n#define CL_M_PI_4           0.78539816339744830962\r\n#define CL_M_1_PI           0.31830988618379067154\r\n#define CL_M_2_PI           0.63661977236758134308\r\n#define CL_M_2_SQRTPI       1.12837916709551257390\r\n#define CL_M_SQRT2          1.41421356237309504880\r\n#define CL_M_SQRT1_2        0.70710678118654752440\r\n\r\n#define CL_M_E_F            2.718281828f\r\n#define CL_M_LOG2E_F        1.442695041f\r\n#define CL_M_LOG10E_F       0.434294482f\r\n#define CL_M_LN2_F          0.693147181f\r\n#define CL_M_LN10_F         2.302585093f\r\n#define CL_M_PI_F           3.141592654f\r\n#define CL_M_PI_2_F         1.570796327f\r\n#define CL_M_PI_4_F         0.785398163f\r\n#define CL_M_1_PI_F         0.318309886f\r\n#define CL_M_2_PI_F         0.636619772f\r\n#define 
CL_M_2_SQRTPI_F     1.128379167f\r\n#define CL_M_SQRT2_F        1.414213562f\r\n#define CL_M_SQRT1_2_F      0.707106781f\r\n\r\n#define CL_NAN              (CL_INFINITY - CL_INFINITY)\r\n#define CL_HUGE_VALF        ((cl_float) 1e50)\r\n#define CL_HUGE_VAL         ((cl_double) 1e500)\r\n#define CL_MAXFLOAT         CL_FLT_MAX\r\n#define CL_INFINITY         CL_HUGE_VALF\r\n\r\n#else\r\n\r\n#include <stdint.h>\r\n\r\n/* scalar types  */\r\ntypedef int8_t          cl_char;\r\ntypedef uint8_t         cl_uchar;\r\ntypedef int16_t         cl_short;\r\ntypedef uint16_t        cl_ushort;\r\ntypedef int32_t         cl_int;\r\ntypedef uint32_t        cl_uint;\r\ntypedef int64_t         cl_long;\r\ntypedef uint64_t        cl_ulong;\r\n\r\ntypedef uint16_t        cl_half;\r\ntypedef float           cl_float;\r\ntypedef double          cl_double;\r\n\r\n/* Macro names and corresponding values defined by OpenCL */\r\n#define CL_CHAR_BIT         8\r\n#define CL_SCHAR_MAX        127\r\n#define CL_SCHAR_MIN        (-127-1)\r\n#define CL_CHAR_MAX         CL_SCHAR_MAX\r\n#define CL_CHAR_MIN         CL_SCHAR_MIN\r\n#define CL_UCHAR_MAX        255\r\n#define CL_SHRT_MAX         32767\r\n#define CL_SHRT_MIN         (-32767-1)\r\n#define CL_USHRT_MAX        65535\r\n#define CL_INT_MAX          2147483647\r\n#define CL_INT_MIN          (-2147483647-1)\r\n#define CL_UINT_MAX         0xffffffffU\r\n#define CL_LONG_MAX         ((cl_long) 0x7FFFFFFFFFFFFFFFLL)\r\n#define CL_LONG_MIN         ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL)\r\n#define CL_ULONG_MAX        ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL)\r\n\r\n#define CL_FLT_DIG          6\r\n#define CL_FLT_MANT_DIG     24\r\n#define CL_FLT_MAX_10_EXP   +38\r\n#define CL_FLT_MAX_EXP      +128\r\n#define CL_FLT_MIN_10_EXP   -37\r\n#define CL_FLT_MIN_EXP      -125\r\n#define CL_FLT_RADIX        2\r\n#define CL_FLT_MAX          340282346638528859811704183484516925440.0f\r\n#define CL_FLT_MIN          1.175494350822287507969e-38f\r\n#define CL_FLT_EPSILON   
   1.1920928955078125e-7f\r\n\r\n#define CL_HALF_DIG          3\r\n#define CL_HALF_MANT_DIG     11\r\n#define CL_HALF_MAX_10_EXP   +4\r\n#define CL_HALF_MAX_EXP      +16\r\n#define CL_HALF_MIN_10_EXP   -4\r\n#define CL_HALF_MIN_EXP      -13\r\n#define CL_HALF_RADIX        2\r\n#define CL_HALF_MAX          65504.0f\r\n#define CL_HALF_MIN          6.103515625e-05f\r\n#define CL_HALF_EPSILON      9.765625e-04f\r\n\r\n#define CL_DBL_DIG          15\r\n#define CL_DBL_MANT_DIG     53\r\n#define CL_DBL_MAX_10_EXP   +308\r\n#define CL_DBL_MAX_EXP      +1024\r\n#define CL_DBL_MIN_10_EXP   -307\r\n#define CL_DBL_MIN_EXP      -1021\r\n#define CL_DBL_RADIX        2\r\n#define CL_DBL_MAX          179769313486231570814527423731704356798070567525844996598917476803157260780028538760589558632766878171540458953514382464234321326889464182768467546703537516986049910576551282076245490090389328944075868508455133942304583236903222948165808559332123348274797826204144723168738177180919299881250404026184124858368.0\r\n#define CL_DBL_MIN          2.225073858507201383090e-308\r\n#define CL_DBL_EPSILON      2.220446049250313080847e-16\r\n\r\n#define CL_M_E              2.7182818284590452354\r\n#define CL_M_LOG2E          1.4426950408889634074\r\n#define CL_M_LOG10E         0.43429448190325182765\r\n#define CL_M_LN2            0.69314718055994530942\r\n#define CL_M_LN10           2.30258509299404568402\r\n#define CL_M_PI             3.14159265358979323846\r\n#define CL_M_PI_2           1.57079632679489661923\r\n#define CL_M_PI_4           0.78539816339744830962\r\n#define CL_M_1_PI           0.31830988618379067154\r\n#define CL_M_2_PI           0.63661977236758134308\r\n#define CL_M_2_SQRTPI       1.12837916709551257390\r\n#define CL_M_SQRT2          1.41421356237309504880\r\n#define CL_M_SQRT1_2        0.70710678118654752440\r\n\r\n#define CL_M_E_F            2.718281828f\r\n#define CL_M_LOG2E_F        1.442695041f\r\n#define CL_M_LOG10E_F       0.434294482f\r\n#define CL_M_LN2_F          
0.693147181f\r\n#define CL_M_LN10_F         2.302585093f\r\n#define CL_M_PI_F           3.141592654f\r\n#define CL_M_PI_2_F         1.570796327f\r\n#define CL_M_PI_4_F         0.785398163f\r\n#define CL_M_1_PI_F         0.318309886f\r\n#define CL_M_2_PI_F         0.636619772f\r\n#define CL_M_2_SQRTPI_F     1.128379167f\r\n#define CL_M_SQRT2_F        1.414213562f\r\n#define CL_M_SQRT1_2_F      0.707106781f\r\n\r\n#if defined( __GNUC__ )\r\n   #define CL_HUGE_VALF     __builtin_huge_valf()\r\n   #define CL_HUGE_VAL      __builtin_huge_val()\r\n   #define CL_NAN           __builtin_nanf( \"\" )\r\n#else\r\n   #define CL_HUGE_VALF     ((cl_float) 1e50)\r\n   #define CL_HUGE_VAL      ((cl_double) 1e500)\r\n   float nanf( const char * );\r\n   #define CL_NAN           nanf( \"\" )\r\n#endif\r\n#define CL_MAXFLOAT         CL_FLT_MAX\r\n#define CL_INFINITY         CL_HUGE_VALF\r\n\r\n#endif\r\n\r\n#include <stddef.h>\r\n\r\n/*\r\n * Vector types\r\n *\r\n *  Note:   OpenCL requires that all types be naturally aligned.\r\n *          This means that vector types must be naturally aligned.\r\n *          For example, a vector of four floats must be aligned to\r\n *          a 16 byte boundary (calculated as 4 * the natural 4-byte\r\n *          alignment of the float).  The alignment qualifiers here\r\n *          will only function properly if your compiler supports them\r\n *          and if you don't actively work to defeat them.  For example,\r\n *          in order for a cl_float4 to be 16 byte aligned in a struct,\r\n *          the start of the struct must itself be 16-byte aligned.\r\n *\r\n *          Maintaining proper alignment is the user's responsibility.\r\n */\r\n\r\n/* Define basic vector types */\r\n#if defined( __VEC__ )\r\n  #if !defined(__clang__)\r\n     #include <altivec.h>   /* may be omitted depending on compiler. AltiVec spec provides no way to detect whether the header is required. 
*/\r\n  #endif\r\n   typedef __vector unsigned char     __cl_uchar16;\r\n   typedef __vector signed char       __cl_char16;\r\n   typedef __vector unsigned short    __cl_ushort8;\r\n   typedef __vector signed short      __cl_short8;\r\n   typedef __vector unsigned int      __cl_uint4;\r\n   typedef __vector signed int        __cl_int4;\r\n   typedef __vector float             __cl_float4;\r\n   #define  __CL_UCHAR16__  1\r\n   #define  __CL_CHAR16__   1\r\n   #define  __CL_USHORT8__  1\r\n   #define  __CL_SHORT8__   1\r\n   #define  __CL_UINT4__    1\r\n   #define  __CL_INT4__     1\r\n   #define  __CL_FLOAT4__   1\r\n#endif\r\n\r\n#if defined( __SSE__ )\r\n    #if defined( __MINGW64__ )\r\n        #include <intrin.h>\r\n    #else\r\n        #include <xmmintrin.h>\r\n    #endif\r\n    #if defined( __GNUC__ )\r\n        typedef float __cl_float4   __attribute__((vector_size(16)));\r\n    #else\r\n        typedef __m128 __cl_float4;\r\n    #endif\r\n    #define __CL_FLOAT4__   1\r\n#endif\r\n\r\n#if defined( __SSE2__ )\r\n    #if defined( __MINGW64__ )\r\n        #include <intrin.h>\r\n    #else\r\n        #include <emmintrin.h>\r\n    #endif\r\n    #if defined( __GNUC__ )\r\n        typedef cl_uchar    __cl_uchar16    __attribute__((vector_size(16)));\r\n        typedef cl_char     __cl_char16     __attribute__((vector_size(16)));\r\n        typedef cl_ushort   __cl_ushort8    __attribute__((vector_size(16)));\r\n        typedef cl_short    __cl_short8     __attribute__((vector_size(16)));\r\n        typedef cl_uint     __cl_uint4      __attribute__((vector_size(16)));\r\n        typedef cl_int      __cl_int4       __attribute__((vector_size(16)));\r\n        typedef cl_ulong    __cl_ulong2     __attribute__((vector_size(16)));\r\n        typedef cl_long     __cl_long2      __attribute__((vector_size(16)));\r\n        typedef cl_double   __cl_double2    __attribute__((vector_size(16)));\r\n    #else\r\n        typedef __m128i __cl_uchar16;\r\n        typedef __m128i 
__cl_char16;\r\n        typedef __m128i __cl_ushort8;\r\n        typedef __m128i __cl_short8;\r\n        typedef __m128i __cl_uint4;\r\n        typedef __m128i __cl_int4;\r\n        typedef __m128i __cl_ulong2;\r\n        typedef __m128i __cl_long2;\r\n        typedef __m128d __cl_double2;\r\n    #endif\r\n    #define __CL_UCHAR16__  1\r\n    #define __CL_CHAR16__   1\r\n    #define __CL_USHORT8__  1\r\n    #define __CL_SHORT8__   1\r\n    #define __CL_INT4__     1\r\n    #define __CL_UINT4__    1\r\n    #define __CL_ULONG2__   1\r\n    #define __CL_LONG2__    1\r\n    #define __CL_DOUBLE2__  1\r\n#endif\r\n\r\n#if defined( __MMX__ )\r\n    #include <mmintrin.h>\r\n    #if defined( __GNUC__ )\r\n        typedef cl_uchar    __cl_uchar8     __attribute__((vector_size(8)));\r\n        typedef cl_char     __cl_char8      __attribute__((vector_size(8)));\r\n        typedef cl_ushort   __cl_ushort4    __attribute__((vector_size(8)));\r\n        typedef cl_short    __cl_short4     __attribute__((vector_size(8)));\r\n        typedef cl_uint     __cl_uint2      __attribute__((vector_size(8)));\r\n        typedef cl_int      __cl_int2       __attribute__((vector_size(8)));\r\n        typedef cl_ulong    __cl_ulong1     __attribute__((vector_size(8)));\r\n        typedef cl_long     __cl_long1      __attribute__((vector_size(8)));\r\n        typedef cl_float    __cl_float2     __attribute__((vector_size(8)));\r\n    #else\r\n        typedef __m64       __cl_uchar8;\r\n        typedef __m64       __cl_char8;\r\n        typedef __m64       __cl_ushort4;\r\n        typedef __m64       __cl_short4;\r\n        typedef __m64       __cl_uint2;\r\n        typedef __m64       __cl_int2;\r\n        typedef __m64       __cl_ulong1;\r\n        typedef __m64       __cl_long1;\r\n        typedef __m64       __cl_float2;\r\n    #endif\r\n    #define __CL_UCHAR8__   1\r\n    #define __CL_CHAR8__    1\r\n    #define __CL_USHORT4__  1\r\n    #define __CL_SHORT4__   1\r\n    #define __CL_INT2__ 
    1\r\n    #define __CL_UINT2__    1\r\n    #define __CL_ULONG1__   1\r\n    #define __CL_LONG1__    1\r\n    #define __CL_FLOAT2__   1\r\n#endif\r\n\r\n#if defined( __AVX__ )\r\n    #if defined( __MINGW64__ )\r\n        #include <intrin.h>\r\n    #else\r\n        #include <immintrin.h>\r\n    #endif\r\n    #if defined( __GNUC__ )\r\n        typedef cl_float    __cl_float8     __attribute__((vector_size(32)));\r\n        typedef cl_double   __cl_double4    __attribute__((vector_size(32)));\r\n    #else\r\n        typedef __m256      __cl_float8;\r\n        typedef __m256d     __cl_double4;\r\n    #endif\r\n    #define __CL_FLOAT8__   1\r\n    #define __CL_DOUBLE4__  1\r\n#endif\r\n\r\n/* Define capabilities for anonymous struct members. */\r\n#if !defined(__cplusplus) && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L\r\n#define  __CL_HAS_ANON_STRUCT__ 1\r\n#define  __CL_ANON_STRUCT__\r\n#elif defined(_WIN32) && defined(_MSC_VER) && !defined(__STDC__)\r\n#define  __CL_HAS_ANON_STRUCT__ 1\r\n#define  __CL_ANON_STRUCT__\r\n#elif defined(__GNUC__) && ! 
defined(__STRICT_ANSI__)\r\n#define  __CL_HAS_ANON_STRUCT__ 1\r\n#define  __CL_ANON_STRUCT__ __extension__\r\n#elif defined(__clang__)\r\n#define  __CL_HAS_ANON_STRUCT__ 1\r\n#define  __CL_ANON_STRUCT__ __extension__\r\n#else\r\n#define  __CL_HAS_ANON_STRUCT__ 0\r\n#define  __CL_ANON_STRUCT__\r\n#endif\r\n\r\n#if defined(_WIN32) && defined(_MSC_VER) && __CL_HAS_ANON_STRUCT__\r\n   /* Disable warning C4201: nonstandard extension used : nameless struct/union */\r\n    #pragma warning( push )\r\n    #pragma warning( disable : 4201 )\r\n#endif\r\n\r\n/* Define alignment keys */\r\n#if defined( __GNUC__ ) || defined(__INTEGRITY)\r\n    #define CL_ALIGNED(_x)          __attribute__ ((aligned(_x)))\r\n#elif defined( _WIN32) && (_MSC_VER)\r\n    /* Alignment keys neutered on windows because MSVC can't swallow function arguments with alignment requirements     */\r\n    /* http://msdn.microsoft.com/en-us/library/373ak2y1%28VS.71%29.aspx                                                 */\r\n    /* #include <crtdefs.h>                                                                                             */\r\n    /* #define CL_ALIGNED(_x)          _CRT_ALIGN(_x)                                                                   */\r\n    #define CL_ALIGNED(_x)\r\n#else\r\n   #warning  Need to implement some method to align data here\r\n   #define  CL_ALIGNED(_x)\r\n#endif\r\n\r\n/* Indicate whether .xyzw, .s0123 and .hi.lo are supported */\r\n#if __CL_HAS_ANON_STRUCT__\r\n    /* .xyzw and .s0123...{f|F} are supported */\r\n    #define CL_HAS_NAMED_VECTOR_FIELDS 1\r\n    /* .hi and .lo are supported */\r\n    #define CL_HAS_HI_LO_VECTOR_FIELDS 1\r\n#endif\r\n\r\n/* Define cl_vector types */\r\n\r\n/* ---- cl_charn ---- */\r\ntypedef union\r\n{\r\n    cl_char  CL_ALIGNED(2) s[2];\r\n#if __CL_HAS_ANON_STRUCT__\r\n   __CL_ANON_STRUCT__ struct{ cl_char  x, y; };\r\n   __CL_ANON_STRUCT__ struct{ cl_char  s0, s1; };\r\n   __CL_ANON_STRUCT__ struct{ cl_char  lo, hi; 
};\r\n#endif\r\n#if defined( __CL_CHAR2__)\r\n    __cl_char2     v2;\r\n#endif\r\n}cl_char2;\r\n\r\ntypedef union\r\n{\r\n    cl_char  CL_ALIGNED(4) s[4];\r\n#if __CL_HAS_ANON_STRUCT__\r\n   __CL_ANON_STRUCT__ struct{ cl_char  x, y, z, w; };\r\n   __CL_ANON_STRUCT__ struct{ cl_char  s0, s1, s2, s3; };\r\n   __CL_ANON_STRUCT__ struct{ cl_char2 lo, hi; };\r\n#endif\r\n#if defined( __CL_CHAR2__)\r\n    __cl_char2     v2[2];\r\n#endif\r\n#if defined( __CL_CHAR4__)\r\n    __cl_char4     v4;\r\n#endif\r\n}cl_char4;\r\n\r\n/* cl_char3 is identical in size, alignment and behavior to cl_char4. See section 6.1.5. */\r\ntypedef  cl_char4  cl_char3;\r\n\r\ntypedef union\r\n{\r\n    cl_char   CL_ALIGNED(8) s[8];\r\n#if __CL_HAS_ANON_STRUCT__\r\n   __CL_ANON_STRUCT__ struct{ cl_char  x, y, z, w; };\r\n   __CL_ANON_STRUCT__ struct{ cl_char  s0, s1, s2, s3, s4, s5, s6, s7; };\r\n   __CL_ANON_STRUCT__ struct{ cl_char4 lo, hi; };\r\n#endif\r\n#if defined( __CL_CHAR2__)\r\n    __cl_char2     v2[4];\r\n#endif\r\n#if defined( __CL_CHAR4__)\r\n    __cl_char4     v4[2];\r\n#endif\r\n#if defined( __CL_CHAR8__ )\r\n    __cl_char8     v8;\r\n#endif\r\n}cl_char8;\r\n\r\ntypedef union\r\n{\r\n    cl_char  CL_ALIGNED(16) s[16];\r\n#if __CL_HAS_ANON_STRUCT__\r\n   __CL_ANON_STRUCT__ struct{ cl_char  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };\r\n   __CL_ANON_STRUCT__ struct{ cl_char  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };\r\n   __CL_ANON_STRUCT__ struct{ cl_char8 lo, hi; };\r\n#endif\r\n#if defined( __CL_CHAR2__)\r\n    __cl_char2     v2[8];\r\n#endif\r\n#if defined( __CL_CHAR4__)\r\n    __cl_char4     v4[4];\r\n#endif\r\n#if defined( __CL_CHAR8__ )\r\n    __cl_char8     v8[2];\r\n#endif\r\n#if defined( __CL_CHAR16__ )\r\n    __cl_char16    v16;\r\n#endif\r\n}cl_char16;\r\n\r\n\r\n/* ---- cl_ucharn ---- */\r\ntypedef union\r\n{\r\n    cl_uchar  CL_ALIGNED(2) s[2];\r\n#if __CL_HAS_ANON_STRUCT__\r\n   
__CL_ANON_STRUCT__ struct{ cl_uchar  x, y; };\r\n   __CL_ANON_STRUCT__ struct{ cl_uchar  s0, s1; };\r\n   __CL_ANON_STRUCT__ struct{ cl_uchar  lo, hi; };\r\n#endif\r\n#if defined( __cl_uchar2__)\r\n    __cl_uchar2     v2;\r\n#endif\r\n}cl_uchar2;\r\n\r\ntypedef union\r\n{\r\n    cl_uchar  CL_ALIGNED(4) s[4];\r\n#if __CL_HAS_ANON_STRUCT__\r\n   __CL_ANON_STRUCT__ struct{ cl_uchar  x, y, z, w; };\r\n   __CL_ANON_STRUCT__ struct{ cl_uchar  s0, s1, s2, s3; };\r\n   __CL_ANON_STRUCT__ struct{ cl_uchar2 lo, hi; };\r\n#endif\r\n#if defined( __CL_UCHAR2__)\r\n    __cl_uchar2     v2[2];\r\n#endif\r\n#if defined( __CL_UCHAR4__)\r\n    __cl_uchar4     v4;\r\n#endif\r\n}cl_uchar4;\r\n\r\n/* cl_uchar3 is identical in size, alignment and behavior to cl_uchar4. See section 6.1.5. */\r\ntypedef  cl_uchar4  cl_uchar3;\r\n\r\ntypedef union\r\n{\r\n    cl_uchar   CL_ALIGNED(8) s[8];\r\n#if __CL_HAS_ANON_STRUCT__\r\n   __CL_ANON_STRUCT__ struct{ cl_uchar  x, y, z, w; };\r\n   __CL_ANON_STRUCT__ struct{ cl_uchar  s0, s1, s2, s3, s4, s5, s6, s7; };\r\n   __CL_ANON_STRUCT__ struct{ cl_uchar4 lo, hi; };\r\n#endif\r\n#if defined( __CL_UCHAR2__)\r\n    __cl_uchar2     v2[4];\r\n#endif\r\n#if defined( __CL_UCHAR4__)\r\n    __cl_uchar4     v4[2];\r\n#endif\r\n#if defined( __CL_UCHAR8__ )\r\n    __cl_uchar8     v8;\r\n#endif\r\n}cl_uchar8;\r\n\r\ntypedef union\r\n{\r\n    cl_uchar  CL_ALIGNED(16) s[16];\r\n#if __CL_HAS_ANON_STRUCT__\r\n   __CL_ANON_STRUCT__ struct{ cl_uchar  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };\r\n   __CL_ANON_STRUCT__ struct{ cl_uchar  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };\r\n   __CL_ANON_STRUCT__ struct{ cl_uchar8 lo, hi; };\r\n#endif\r\n#if defined( __CL_UCHAR2__)\r\n    __cl_uchar2     v2[8];\r\n#endif\r\n#if defined( __CL_UCHAR4__)\r\n    __cl_uchar4     v4[4];\r\n#endif\r\n#if defined( __CL_UCHAR8__ )\r\n    __cl_uchar8     v8[2];\r\n#endif\r\n#if defined( __CL_UCHAR16__ 
)\r\n    __cl_uchar16    v16;\r\n#endif\r\n}cl_uchar16;\r\n\r\n\r\n/* ---- cl_shortn ---- */\r\ntypedef union\r\n{\r\n    cl_short  CL_ALIGNED(4) s[2];\r\n#if __CL_HAS_ANON_STRUCT__\r\n   __CL_ANON_STRUCT__ struct{ cl_short  x, y; };\r\n   __CL_ANON_STRUCT__ struct{ cl_short  s0, s1; };\r\n   __CL_ANON_STRUCT__ struct{ cl_short  lo, hi; };\r\n#endif\r\n#if defined( __CL_SHORT2__)\r\n    __cl_short2     v2;\r\n#endif\r\n}cl_short2;\r\n\r\ntypedef union\r\n{\r\n    cl_short  CL_ALIGNED(8) s[4];\r\n#if __CL_HAS_ANON_STRUCT__\r\n   __CL_ANON_STRUCT__ struct{ cl_short  x, y, z, w; };\r\n   __CL_ANON_STRUCT__ struct{ cl_short  s0, s1, s2, s3; };\r\n   __CL_ANON_STRUCT__ struct{ cl_short2 lo, hi; };\r\n#endif\r\n#if defined( __CL_SHORT2__)\r\n    __cl_short2     v2[2];\r\n#endif\r\n#if defined( __CL_SHORT4__)\r\n    __cl_short4     v4;\r\n#endif\r\n}cl_short4;\r\n\r\n/* cl_short3 is identical in size, alignment and behavior to cl_short4. See section 6.1.5. */\r\ntypedef  cl_short4  cl_short3;\r\n\r\ntypedef union\r\n{\r\n    cl_short   CL_ALIGNED(16) s[8];\r\n#if __CL_HAS_ANON_STRUCT__\r\n   __CL_ANON_STRUCT__ struct{ cl_short  x, y, z, w; };\r\n   __CL_ANON_STRUCT__ struct{ cl_short  s0, s1, s2, s3, s4, s5, s6, s7; };\r\n   __CL_ANON_STRUCT__ struct{ cl_short4 lo, hi; };\r\n#endif\r\n#if defined( __CL_SHORT2__)\r\n    __cl_short2     v2[4];\r\n#endif\r\n#if defined( __CL_SHORT4__)\r\n    __cl_short4     v4[2];\r\n#endif\r\n#if defined( __CL_SHORT8__ )\r\n    __cl_short8     v8;\r\n#endif\r\n}cl_short8;\r\n\r\ntypedef union\r\n{\r\n    cl_short  CL_ALIGNED(32) s[16];\r\n#if __CL_HAS_ANON_STRUCT__\r\n   __CL_ANON_STRUCT__ struct{ cl_short  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };\r\n   __CL_ANON_STRUCT__ struct{ cl_short  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };\r\n   __CL_ANON_STRUCT__ struct{ cl_short8 lo, hi; };\r\n#endif\r\n#if defined( __CL_SHORT2__)\r\n    __cl_short2     
v2[8];\r\n#endif\r\n#if defined( __CL_SHORT4__)\r\n    __cl_short4     v4[4];\r\n#endif\r\n#if defined( __CL_SHORT8__ )\r\n    __cl_short8     v8[2];\r\n#endif\r\n#if defined( __CL_SHORT16__ )\r\n    __cl_short16    v16;\r\n#endif\r\n}cl_short16;\r\n\r\n\r\n/* ---- cl_ushortn ---- */\r\ntypedef union\r\n{\r\n    cl_ushort  CL_ALIGNED(4) s[2];\r\n#if __CL_HAS_ANON_STRUCT__\r\n   __CL_ANON_STRUCT__ struct{ cl_ushort  x, y; };\r\n   __CL_ANON_STRUCT__ struct{ cl_ushort  s0, s1; };\r\n   __CL_ANON_STRUCT__ struct{ cl_ushort  lo, hi; };\r\n#endif\r\n#if defined( __CL_USHORT2__)\r\n    __cl_ushort2     v2;\r\n#endif\r\n}cl_ushort2;\r\n\r\ntypedef union\r\n{\r\n    cl_ushort  CL_ALIGNED(8) s[4];\r\n#if __CL_HAS_ANON_STRUCT__\r\n   __CL_ANON_STRUCT__ struct{ cl_ushort  x, y, z, w; };\r\n   __CL_ANON_STRUCT__ struct{ cl_ushort  s0, s1, s2, s3; };\r\n   __CL_ANON_STRUCT__ struct{ cl_ushort2 lo, hi; };\r\n#endif\r\n#if defined( __CL_USHORT2__)\r\n    __cl_ushort2     v2[2];\r\n#endif\r\n#if defined( __CL_USHORT4__)\r\n    __cl_ushort4     v4;\r\n#endif\r\n}cl_ushort4;\r\n\r\n/* cl_ushort3 is identical in size, alignment and behavior to cl_ushort4. See section 6.1.5. 
*/\r\ntypedef  cl_ushort4  cl_ushort3;\r\n\r\ntypedef union\r\n{\r\n    cl_ushort   CL_ALIGNED(16) s[8];\r\n#if __CL_HAS_ANON_STRUCT__\r\n   __CL_ANON_STRUCT__ struct{ cl_ushort  x, y, z, w; };\r\n   __CL_ANON_STRUCT__ struct{ cl_ushort  s0, s1, s2, s3, s4, s5, s6, s7; };\r\n   __CL_ANON_STRUCT__ struct{ cl_ushort4 lo, hi; };\r\n#endif\r\n#if defined( __CL_USHORT2__)\r\n    __cl_ushort2     v2[4];\r\n#endif\r\n#if defined( __CL_USHORT4__)\r\n    __cl_ushort4     v4[2];\r\n#endif\r\n#if defined( __CL_USHORT8__ )\r\n    __cl_ushort8     v8;\r\n#endif\r\n}cl_ushort8;\r\n\r\ntypedef union\r\n{\r\n    cl_ushort  CL_ALIGNED(32) s[16];\r\n#if __CL_HAS_ANON_STRUCT__\r\n   __CL_ANON_STRUCT__ struct{ cl_ushort  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };\r\n   __CL_ANON_STRUCT__ struct{ cl_ushort  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };\r\n   __CL_ANON_STRUCT__ struct{ cl_ushort8 lo, hi; };\r\n#endif\r\n#if defined( __CL_USHORT2__)\r\n    __cl_ushort2     v2[8];\r\n#endif\r\n#if defined( __CL_USHORT4__)\r\n    __cl_ushort4     v4[4];\r\n#endif\r\n#if defined( __CL_USHORT8__ )\r\n    __cl_ushort8     v8[2];\r\n#endif\r\n#if defined( __CL_USHORT16__ )\r\n    __cl_ushort16    v16;\r\n#endif\r\n}cl_ushort16;\r\n\r\n\r\n/* ---- cl_halfn ---- */\r\ntypedef union\r\n{\r\n    cl_half  CL_ALIGNED(4) s[2];\r\n#if __CL_HAS_ANON_STRUCT__\r\n    __CL_ANON_STRUCT__ struct{ cl_half  x, y; };\r\n    __CL_ANON_STRUCT__ struct{ cl_half  s0, s1; };\r\n    __CL_ANON_STRUCT__ struct{ cl_half  lo, hi; };\r\n#endif\r\n#if defined( __CL_HALF2__)\r\n    __cl_half2     v2;\r\n#endif\r\n}cl_half2;\r\n\r\ntypedef union\r\n{\r\n    cl_half  CL_ALIGNED(8) s[4];\r\n#if __CL_HAS_ANON_STRUCT__\r\n    __CL_ANON_STRUCT__ struct{ cl_half  x, y, z, w; };\r\n    __CL_ANON_STRUCT__ struct{ cl_half  s0, s1, s2, s3; };\r\n    __CL_ANON_STRUCT__ struct{ cl_half2 lo, hi; };\r\n#endif\r\n#if defined( __CL_HALF2__)\r\n    __cl_half2 
    v2[2];\r\n#endif\r\n#if defined( __CL_HALF4__)\r\n    __cl_half4     v4;\r\n#endif\r\n}cl_half4;\r\n\r\n/* cl_half3 is identical in size, alignment and behavior to cl_half4. See section 6.1.5. */\r\ntypedef  cl_half4  cl_half3;\r\n\r\ntypedef union\r\n{\r\n    cl_half   CL_ALIGNED(16) s[8];\r\n#if __CL_HAS_ANON_STRUCT__\r\n    __CL_ANON_STRUCT__ struct{ cl_half  x, y, z, w; };\r\n    __CL_ANON_STRUCT__ struct{ cl_half  s0, s1, s2, s3, s4, s5, s6, s7; };\r\n    __CL_ANON_STRUCT__ struct{ cl_half4 lo, hi; };\r\n#endif\r\n#if defined( __CL_HALF2__)\r\n    __cl_half2     v2[4];\r\n#endif\r\n#if defined( __CL_HALF4__)\r\n    __cl_half4     v4[2];\r\n#endif\r\n#if defined( __CL_HALF8__ )\r\n    __cl_half8     v8;\r\n#endif\r\n}cl_half8;\r\n\r\ntypedef union\r\n{\r\n    cl_half  CL_ALIGNED(32) s[16];\r\n#if __CL_HAS_ANON_STRUCT__\r\n    __CL_ANON_STRUCT__ struct{ cl_half  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };\r\n    __CL_ANON_STRUCT__ struct{ cl_half  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };\r\n    __CL_ANON_STRUCT__ struct{ cl_half8 lo, hi; };\r\n#endif\r\n#if defined( __CL_HALF2__)\r\n    __cl_half2     v2[8];\r\n#endif\r\n#if defined( __CL_HALF4__)\r\n    __cl_half4     v4[4];\r\n#endif\r\n#if defined( __CL_HALF8__ )\r\n    __cl_half8     v8[2];\r\n#endif\r\n#if defined( __CL_HALF16__ )\r\n    __cl_half16    v16;\r\n#endif\r\n}cl_half16;\r\n\r\n/* ---- cl_intn ---- */\r\ntypedef union\r\n{\r\n    cl_int  CL_ALIGNED(8) s[2];\r\n#if __CL_HAS_ANON_STRUCT__\r\n   __CL_ANON_STRUCT__ struct{ cl_int  x, y; };\r\n   __CL_ANON_STRUCT__ struct{ cl_int  s0, s1; };\r\n   __CL_ANON_STRUCT__ struct{ cl_int  lo, hi; };\r\n#endif\r\n#if defined( __CL_INT2__)\r\n    __cl_int2     v2;\r\n#endif\r\n}cl_int2;\r\n\r\ntypedef union\r\n{\r\n    cl_int  CL_ALIGNED(16) s[4];\r\n#if __CL_HAS_ANON_STRUCT__\r\n   __CL_ANON_STRUCT__ struct{ cl_int  x, y, z, w; };\r\n   __CL_ANON_STRUCT__ struct{ cl_int 
 s0, s1, s2, s3; };\r\n   __CL_ANON_STRUCT__ struct{ cl_int2 lo, hi; };\r\n#endif\r\n#if defined( __CL_INT2__)\r\n    __cl_int2     v2[2];\r\n#endif\r\n#if defined( __CL_INT4__)\r\n    __cl_int4     v4;\r\n#endif\r\n}cl_int4;\r\n\r\n/* cl_int3 is identical in size, alignment and behavior to cl_int4. See section 6.1.5. */\r\ntypedef  cl_int4  cl_int3;\r\n\r\ntypedef union\r\n{\r\n    cl_int   CL_ALIGNED(32) s[8];\r\n#if __CL_HAS_ANON_STRUCT__\r\n   __CL_ANON_STRUCT__ struct{ cl_int  x, y, z, w; };\r\n   __CL_ANON_STRUCT__ struct{ cl_int  s0, s1, s2, s3, s4, s5, s6, s7; };\r\n   __CL_ANON_STRUCT__ struct{ cl_int4 lo, hi; };\r\n#endif\r\n#if defined( __CL_INT2__)\r\n    __cl_int2     v2[4];\r\n#endif\r\n#if defined( __CL_INT4__)\r\n    __cl_int4     v4[2];\r\n#endif\r\n#if defined( __CL_INT8__ )\r\n    __cl_int8     v8;\r\n#endif\r\n}cl_int8;\r\n\r\ntypedef union\r\n{\r\n    cl_int  CL_ALIGNED(64) s[16];\r\n#if __CL_HAS_ANON_STRUCT__\r\n   __CL_ANON_STRUCT__ struct{ cl_int  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };\r\n   __CL_ANON_STRUCT__ struct{ cl_int  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };\r\n   __CL_ANON_STRUCT__ struct{ cl_int8 lo, hi; };\r\n#endif\r\n#if defined( __CL_INT2__)\r\n    __cl_int2     v2[8];\r\n#endif\r\n#if defined( __CL_INT4__)\r\n    __cl_int4     v4[4];\r\n#endif\r\n#if defined( __CL_INT8__ )\r\n    __cl_int8     v8[2];\r\n#endif\r\n#if defined( __CL_INT16__ )\r\n    __cl_int16    v16;\r\n#endif\r\n}cl_int16;\r\n\r\n\r\n/* ---- cl_uintn ---- */\r\ntypedef union\r\n{\r\n    cl_uint  CL_ALIGNED(8) s[2];\r\n#if __CL_HAS_ANON_STRUCT__\r\n   __CL_ANON_STRUCT__ struct{ cl_uint  x, y; };\r\n   __CL_ANON_STRUCT__ struct{ cl_uint  s0, s1; };\r\n   __CL_ANON_STRUCT__ struct{ cl_uint  lo, hi; };\r\n#endif\r\n#if defined( __CL_UINT2__)\r\n    __cl_uint2     v2;\r\n#endif\r\n}cl_uint2;\r\n\r\ntypedef union\r\n{\r\n    cl_uint  CL_ALIGNED(16) s[4];\r\n#if 
__CL_HAS_ANON_STRUCT__\r\n   __CL_ANON_STRUCT__ struct{ cl_uint  x, y, z, w; };\r\n   __CL_ANON_STRUCT__ struct{ cl_uint  s0, s1, s2, s3; };\r\n   __CL_ANON_STRUCT__ struct{ cl_uint2 lo, hi; };\r\n#endif\r\n#if defined( __CL_UINT2__)\r\n    __cl_uint2     v2[2];\r\n#endif\r\n#if defined( __CL_UINT4__)\r\n    __cl_uint4     v4;\r\n#endif\r\n}cl_uint4;\r\n\r\n/* cl_uint3 is identical in size, alignment and behavior to cl_uint4. See section 6.1.5. */\r\ntypedef  cl_uint4  cl_uint3;\r\n\r\ntypedef union\r\n{\r\n    cl_uint   CL_ALIGNED(32) s[8];\r\n#if __CL_HAS_ANON_STRUCT__\r\n   __CL_ANON_STRUCT__ struct{ cl_uint  x, y, z, w; };\r\n   __CL_ANON_STRUCT__ struct{ cl_uint  s0, s1, s2, s3, s4, s5, s6, s7; };\r\n   __CL_ANON_STRUCT__ struct{ cl_uint4 lo, hi; };\r\n#endif\r\n#if defined( __CL_UINT2__)\r\n    __cl_uint2     v2[4];\r\n#endif\r\n#if defined( __CL_UINT4__)\r\n    __cl_uint4     v4[2];\r\n#endif\r\n#if defined( __CL_UINT8__ )\r\n    __cl_uint8     v8;\r\n#endif\r\n}cl_uint8;\r\n\r\ntypedef union\r\n{\r\n    cl_uint  CL_ALIGNED(64) s[16];\r\n#if __CL_HAS_ANON_STRUCT__\r\n   __CL_ANON_STRUCT__ struct{ cl_uint  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };\r\n   __CL_ANON_STRUCT__ struct{ cl_uint  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };\r\n   __CL_ANON_STRUCT__ struct{ cl_uint8 lo, hi; };\r\n#endif\r\n#if defined( __CL_UINT2__)\r\n    __cl_uint2     v2[8];\r\n#endif\r\n#if defined( __CL_UINT4__)\r\n    __cl_uint4     v4[4];\r\n#endif\r\n#if defined( __CL_UINT8__ )\r\n    __cl_uint8     v8[2];\r\n#endif\r\n#if defined( __CL_UINT16__ )\r\n    __cl_uint16    v16;\r\n#endif\r\n}cl_uint16;\r\n\r\n/* ---- cl_longn ---- */\r\ntypedef union\r\n{\r\n    cl_long  CL_ALIGNED(16) s[2];\r\n#if __CL_HAS_ANON_STRUCT__\r\n   __CL_ANON_STRUCT__ struct{ cl_long  x, y; };\r\n   __CL_ANON_STRUCT__ struct{ cl_long  s0, s1; };\r\n   __CL_ANON_STRUCT__ struct{ cl_long  lo, hi; };\r\n#endif\r\n#if 
defined( __CL_LONG2__)\r\n    __cl_long2     v2;\r\n#endif\r\n}cl_long2;\r\n\r\ntypedef union\r\n{\r\n    cl_long  CL_ALIGNED(32) s[4];\r\n#if __CL_HAS_ANON_STRUCT__\r\n   __CL_ANON_STRUCT__ struct{ cl_long  x, y, z, w; };\r\n   __CL_ANON_STRUCT__ struct{ cl_long  s0, s1, s2, s3; };\r\n   __CL_ANON_STRUCT__ struct{ cl_long2 lo, hi; };\r\n#endif\r\n#if defined( __CL_LONG2__)\r\n    __cl_long2     v2[2];\r\n#endif\r\n#if defined( __CL_LONG4__)\r\n    __cl_long4     v4;\r\n#endif\r\n}cl_long4;\r\n\r\n/* cl_long3 is identical in size, alignment and behavior to cl_long4. See section 6.1.5. */\r\ntypedef  cl_long4  cl_long3;\r\n\r\ntypedef union\r\n{\r\n    cl_long   CL_ALIGNED(64) s[8];\r\n#if __CL_HAS_ANON_STRUCT__\r\n   __CL_ANON_STRUCT__ struct{ cl_long  x, y, z, w; };\r\n   __CL_ANON_STRUCT__ struct{ cl_long  s0, s1, s2, s3, s4, s5, s6, s7; };\r\n   __CL_ANON_STRUCT__ struct{ cl_long4 lo, hi; };\r\n#endif\r\n#if defined( __CL_LONG2__)\r\n    __cl_long2     v2[4];\r\n#endif\r\n#if defined( __CL_LONG4__)\r\n    __cl_long4     v4[2];\r\n#endif\r\n#if defined( __CL_LONG8__ )\r\n    __cl_long8     v8;\r\n#endif\r\n}cl_long8;\r\n\r\ntypedef union\r\n{\r\n    cl_long  CL_ALIGNED(128) s[16];\r\n#if __CL_HAS_ANON_STRUCT__\r\n   __CL_ANON_STRUCT__ struct{ cl_long  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };\r\n   __CL_ANON_STRUCT__ struct{ cl_long  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };\r\n   __CL_ANON_STRUCT__ struct{ cl_long8 lo, hi; };\r\n#endif\r\n#if defined( __CL_LONG2__)\r\n    __cl_long2     v2[8];\r\n#endif\r\n#if defined( __CL_LONG4__)\r\n    __cl_long4     v4[4];\r\n#endif\r\n#if defined( __CL_LONG8__ )\r\n    __cl_long8     v8[2];\r\n#endif\r\n#if defined( __CL_LONG16__ )\r\n    __cl_long16    v16;\r\n#endif\r\n}cl_long16;\r\n\r\n\r\n/* ---- cl_ulongn ---- */\r\ntypedef union\r\n{\r\n    cl_ulong  CL_ALIGNED(16) s[2];\r\n#if __CL_HAS_ANON_STRUCT__\r\n   __CL_ANON_STRUCT__ 
struct{ cl_ulong  x, y; };\r\n   __CL_ANON_STRUCT__ struct{ cl_ulong  s0, s1; };\r\n   __CL_ANON_STRUCT__ struct{ cl_ulong  lo, hi; };\r\n#endif\r\n#if defined( __CL_ULONG2__)\r\n    __cl_ulong2     v2;\r\n#endif\r\n}cl_ulong2;\r\n\r\ntypedef union\r\n{\r\n    cl_ulong  CL_ALIGNED(32) s[4];\r\n#if __CL_HAS_ANON_STRUCT__\r\n   __CL_ANON_STRUCT__ struct{ cl_ulong  x, y, z, w; };\r\n   __CL_ANON_STRUCT__ struct{ cl_ulong  s0, s1, s2, s3; };\r\n   __CL_ANON_STRUCT__ struct{ cl_ulong2 lo, hi; };\r\n#endif\r\n#if defined( __CL_ULONG2__)\r\n    __cl_ulong2     v2[2];\r\n#endif\r\n#if defined( __CL_ULONG4__)\r\n    __cl_ulong4     v4;\r\n#endif\r\n}cl_ulong4;\r\n\r\n/* cl_ulong3 is identical in size, alignment and behavior to cl_ulong4. See section 6.1.5. */\r\ntypedef  cl_ulong4  cl_ulong3;\r\n\r\ntypedef union\r\n{\r\n    cl_ulong   CL_ALIGNED(64) s[8];\r\n#if __CL_HAS_ANON_STRUCT__\r\n   __CL_ANON_STRUCT__ struct{ cl_ulong  x, y, z, w; };\r\n   __CL_ANON_STRUCT__ struct{ cl_ulong  s0, s1, s2, s3, s4, s5, s6, s7; };\r\n   __CL_ANON_STRUCT__ struct{ cl_ulong4 lo, hi; };\r\n#endif\r\n#if defined( __CL_ULONG2__)\r\n    __cl_ulong2     v2[4];\r\n#endif\r\n#if defined( __CL_ULONG4__)\r\n    __cl_ulong4     v4[2];\r\n#endif\r\n#if defined( __CL_ULONG8__ )\r\n    __cl_ulong8     v8;\r\n#endif\r\n}cl_ulong8;\r\n\r\ntypedef union\r\n{\r\n    cl_ulong  CL_ALIGNED(128) s[16];\r\n#if __CL_HAS_ANON_STRUCT__\r\n   __CL_ANON_STRUCT__ struct{ cl_ulong  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };\r\n   __CL_ANON_STRUCT__ struct{ cl_ulong  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };\r\n   __CL_ANON_STRUCT__ struct{ cl_ulong8 lo, hi; };\r\n#endif\r\n#if defined( __CL_ULONG2__)\r\n    __cl_ulong2     v2[8];\r\n#endif\r\n#if defined( __CL_ULONG4__)\r\n    __cl_ulong4     v4[4];\r\n#endif\r\n#if defined( __CL_ULONG8__ )\r\n    __cl_ulong8     v8[2];\r\n#endif\r\n#if defined( __CL_ULONG16__ )\r\n    
__cl_ulong16    v16;\r\n#endif\r\n}cl_ulong16;\r\n\r\n\r\n/* --- cl_floatn ---- */\r\n\r\ntypedef union\r\n{\r\n    cl_float  CL_ALIGNED(8) s[2];\r\n#if __CL_HAS_ANON_STRUCT__\r\n   __CL_ANON_STRUCT__ struct{ cl_float  x, y; };\r\n   __CL_ANON_STRUCT__ struct{ cl_float  s0, s1; };\r\n   __CL_ANON_STRUCT__ struct{ cl_float  lo, hi; };\r\n#endif\r\n#if defined( __CL_FLOAT2__)\r\n    __cl_float2     v2;\r\n#endif\r\n}cl_float2;\r\n\r\ntypedef union\r\n{\r\n    cl_float  CL_ALIGNED(16) s[4];\r\n#if __CL_HAS_ANON_STRUCT__\r\n   __CL_ANON_STRUCT__ struct{ cl_float   x, y, z, w; };\r\n   __CL_ANON_STRUCT__ struct{ cl_float   s0, s1, s2, s3; };\r\n   __CL_ANON_STRUCT__ struct{ cl_float2  lo, hi; };\r\n#endif\r\n#if defined( __CL_FLOAT2__)\r\n    __cl_float2     v2[2];\r\n#endif\r\n#if defined( __CL_FLOAT4__)\r\n    __cl_float4     v4;\r\n#endif\r\n}cl_float4;\r\n\r\n/* cl_float3 is identical in size, alignment and behavior to cl_float4. See section 6.1.5. */\r\ntypedef  cl_float4  cl_float3;\r\n\r\ntypedef union\r\n{\r\n    cl_float   CL_ALIGNED(32) s[8];\r\n#if __CL_HAS_ANON_STRUCT__\r\n   __CL_ANON_STRUCT__ struct{ cl_float   x, y, z, w; };\r\n   __CL_ANON_STRUCT__ struct{ cl_float   s0, s1, s2, s3, s4, s5, s6, s7; };\r\n   __CL_ANON_STRUCT__ struct{ cl_float4  lo, hi; };\r\n#endif\r\n#if defined( __CL_FLOAT2__)\r\n    __cl_float2     v2[4];\r\n#endif\r\n#if defined( __CL_FLOAT4__)\r\n    __cl_float4     v4[2];\r\n#endif\r\n#if defined( __CL_FLOAT8__ )\r\n    __cl_float8     v8;\r\n#endif\r\n}cl_float8;\r\n\r\ntypedef union\r\n{\r\n    cl_float  CL_ALIGNED(64) s[16];\r\n#if __CL_HAS_ANON_STRUCT__\r\n   __CL_ANON_STRUCT__ struct{ cl_float  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };\r\n   __CL_ANON_STRUCT__ struct{ cl_float  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };\r\n   __CL_ANON_STRUCT__ struct{ cl_float8 lo, hi; };\r\n#endif\r\n#if defined( __CL_FLOAT2__)\r\n    __cl_float2     
v2[8];\r\n#endif\r\n#if defined( __CL_FLOAT4__)\r\n    __cl_float4     v4[4];\r\n#endif\r\n#if defined( __CL_FLOAT8__ )\r\n    __cl_float8     v8[2];\r\n#endif\r\n#if defined( __CL_FLOAT16__ )\r\n    __cl_float16    v16;\r\n#endif\r\n}cl_float16;\r\n\r\n/* --- cl_doublen ---- */\r\n\r\ntypedef union\r\n{\r\n    cl_double  CL_ALIGNED(16) s[2];\r\n#if __CL_HAS_ANON_STRUCT__\r\n   __CL_ANON_STRUCT__ struct{ cl_double  x, y; };\r\n   __CL_ANON_STRUCT__ struct{ cl_double s0, s1; };\r\n   __CL_ANON_STRUCT__ struct{ cl_double lo, hi; };\r\n#endif\r\n#if defined( __CL_DOUBLE2__)\r\n    __cl_double2     v2;\r\n#endif\r\n}cl_double2;\r\n\r\ntypedef union\r\n{\r\n    cl_double  CL_ALIGNED(32) s[4];\r\n#if __CL_HAS_ANON_STRUCT__\r\n   __CL_ANON_STRUCT__ struct{ cl_double  x, y, z, w; };\r\n   __CL_ANON_STRUCT__ struct{ cl_double  s0, s1, s2, s3; };\r\n   __CL_ANON_STRUCT__ struct{ cl_double2 lo, hi; };\r\n#endif\r\n#if defined( __CL_DOUBLE2__)\r\n    __cl_double2     v2[2];\r\n#endif\r\n#if defined( __CL_DOUBLE4__)\r\n    __cl_double4     v4;\r\n#endif\r\n}cl_double4;\r\n\r\n/* cl_double3 is identical in size, alignment and behavior to cl_double4. See section 6.1.5. 
*/\r\ntypedef  cl_double4  cl_double3;\r\n\r\ntypedef union\r\n{\r\n    cl_double   CL_ALIGNED(64) s[8];\r\n#if __CL_HAS_ANON_STRUCT__\r\n   __CL_ANON_STRUCT__ struct{ cl_double  x, y, z, w; };\r\n   __CL_ANON_STRUCT__ struct{ cl_double  s0, s1, s2, s3, s4, s5, s6, s7; };\r\n   __CL_ANON_STRUCT__ struct{ cl_double4 lo, hi; };\r\n#endif\r\n#if defined( __CL_DOUBLE2__)\r\n    __cl_double2     v2[4];\r\n#endif\r\n#if defined( __CL_DOUBLE4__)\r\n    __cl_double4     v4[2];\r\n#endif\r\n#if defined( __CL_DOUBLE8__ )\r\n    __cl_double8     v8;\r\n#endif\r\n}cl_double8;\r\n\r\ntypedef union\r\n{\r\n    cl_double  CL_ALIGNED(128) s[16];\r\n#if __CL_HAS_ANON_STRUCT__\r\n   __CL_ANON_STRUCT__ struct{ cl_double  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };\r\n   __CL_ANON_STRUCT__ struct{ cl_double  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };\r\n   __CL_ANON_STRUCT__ struct{ cl_double8 lo, hi; };\r\n#endif\r\n#if defined( __CL_DOUBLE2__)\r\n    __cl_double2     v2[8];\r\n#endif\r\n#if defined( __CL_DOUBLE4__)\r\n    __cl_double4     v4[4];\r\n#endif\r\n#if defined( __CL_DOUBLE8__ )\r\n    __cl_double8     v8[2];\r\n#endif\r\n#if defined( __CL_DOUBLE16__ )\r\n    __cl_double16    v16;\r\n#endif\r\n}cl_double16;\r\n\r\n/* Macro to facilitate debugging\r\n * Usage:\r\n *   Place CL_PROGRAM_STRING_DEBUG_INFO on the line before the first line of your source.\r\n *   The first line ends with:   CL_PROGRAM_STRING_DEBUG_INFO \\\"\r\n *   Each line thereafter of OpenCL C source must end with: \\n\\\r\n *   The last line ends in \";\r\n *\r\n *   Example:\r\n *\r\n *   const char *my_program = CL_PROGRAM_STRING_DEBUG_INFO \"\\\r\n *   kernel void foo( int a, float * b )             \\n\\\r\n *   {                                               \\n\\\r\n *      // my comment                                \\n\\\r\n *      *b[ get_global_id(0)] = a;                   \\n\\\r\n *   }                         
                      \\n\\\r\n *   \";\r\n *\r\n * This should correctly set up the line, (column) and file information for your source\r\n * string so you can do source level debugging.\r\n */\r\n#define  __CL_STRINGIFY( _x )               # _x\r\n#define  _CL_STRINGIFY( _x )                __CL_STRINGIFY( _x )\r\n#define  CL_PROGRAM_STRING_DEBUG_INFO       \"#line \"  _CL_STRINGIFY(__LINE__) \" \\\"\" __FILE__ \"\\\" \\n\\n\"\r\n\r\n#ifdef __cplusplus\r\n}\r\n#endif\r\n\r\n#if defined(_WIN32) && defined(_MSC_VER) && __CL_HAS_ANON_STRUCT__\r\n    #pragma warning( pop )\r\n#endif\r\n\r\n#endif  /* __CL_PLATFORM_H  */\r\n"
  },
  {
    "path": "svm/OpenCL/include/CL/cl_va_api_media_sharing_intel.h",
    "content": "/*******************************************************************************\r\n * Copyright (c) 2008-2023 The Khronos Group Inc.\r\n *\r\n * Licensed under the Apache License, Version 2.0 (the \"License\");\r\n * you may not use this file except in compliance with the License.\r\n * You may obtain a copy of the License at\r\n *\r\n *    http://www.apache.org/licenses/LICENSE-2.0\r\n *\r\n * Unless required by applicable law or agreed to in writing, software\r\n * distributed under the License is distributed on an \"AS IS\" BASIS,\r\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\r\n * See the License for the specific language governing permissions and\r\n * limitations under the License.\r\n ******************************************************************************/\r\n\r\n#ifndef OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H_\r\n#define OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H_\r\n\r\n/*\r\n** This header is generated from the Khronos OpenCL XML API Registry.\r\n*/\r\n\r\n#include <va/va.h>\r\n\r\n#include <CL/cl.h>\r\n\r\n/* CL_NO_PROTOTYPES implies CL_NO_EXTENSION_PROTOTYPES: */\r\n#if defined(CL_NO_PROTOTYPES) && !defined(CL_NO_EXTENSION_PROTOTYPES)\r\n#define CL_NO_EXTENSION_PROTOTYPES\r\n#endif\r\n\r\n/* CL_NO_EXTENSION_PROTOTYPES implies\r\n   CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES and\r\n   CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES: */\r\n#if defined(CL_NO_EXTENSION_PROTOTYPES) && \\\r\n    !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES)\r\n#define CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES\r\n#endif\r\n#if defined(CL_NO_EXTENSION_PROTOTYPES) && \\\r\n    !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES)\r\n#define CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES\r\n#endif\r\n\r\n#ifdef __cplusplus\r\nextern \"C\" {\r\n#endif\r\n\r\n/***************************************************************\r\n* 
cl_intel_sharing_format_query_va_api\r\n***************************************************************/\r\n#define cl_intel_sharing_format_query_va_api 1\r\n#define CL_INTEL_SHARING_FORMAT_QUERY_VA_API_EXTENSION_NAME \\\r\n    \"cl_intel_sharing_format_query_va_api\"\r\n\r\n\r\n#define CL_INTEL_SHARING_FORMAT_QUERY_VA_API_EXTENSION_VERSION CL_MAKE_VERSION(0, 0, 0)\r\n\r\n/* when cl_intel_va_api_media_sharing is supported */\r\n\r\ntypedef cl_int CL_API_CALL\r\nclGetSupportedVA_APIMediaSurfaceFormatsINTEL_t(\r\n    cl_context context,\r\n    cl_mem_flags flags,\r\n    cl_mem_object_type image_type,\r\n    cl_uint plane,\r\n    cl_uint num_entries,\r\n    VAImageFormat* va_api_formats,\r\n    cl_uint* num_surface_formats);\r\n\r\ntypedef clGetSupportedVA_APIMediaSurfaceFormatsINTEL_t *\r\nclGetSupportedVA_APIMediaSurfaceFormatsINTEL_fn ;\r\n\r\n#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES)\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclGetSupportedVA_APIMediaSurfaceFormatsINTEL(\r\n    cl_context context,\r\n    cl_mem_flags flags,\r\n    cl_mem_object_type image_type,\r\n    cl_uint plane,\r\n    cl_uint num_entries,\r\n    VAImageFormat* va_api_formats,\r\n    cl_uint* num_surface_formats) ;\r\n\r\n#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */\r\n\r\n/***************************************************************\r\n* cl_intel_va_api_media_sharing\r\n***************************************************************/\r\n#define cl_intel_va_api_media_sharing 1\r\n#define CL_INTEL_VA_API_MEDIA_SHARING_EXTENSION_NAME \\\r\n    \"cl_intel_va_api_media_sharing\"\r\n\r\n\r\n#define CL_INTEL_VA_API_MEDIA_SHARING_EXTENSION_VERSION CL_MAKE_VERSION(0, 0, 0)\r\n\r\ntypedef cl_uint             cl_va_api_device_source_intel;\r\ntypedef cl_uint             cl_va_api_device_set_intel;\r\n\r\n/* Error codes */\r\n#define CL_INVALID_VA_API_MEDIA_ADAPTER_INTEL               -1098\r\n#define CL_INVALID_VA_API_MEDIA_SURFACE_INTEL               
-1099\r\n#define CL_VA_API_MEDIA_SURFACE_ALREADY_ACQUIRED_INTEL      -1100\r\n#define CL_VA_API_MEDIA_SURFACE_NOT_ACQUIRED_INTEL          -1101\r\n\r\n/* cl_va_api_device_source_intel */\r\n#define CL_VA_API_DISPLAY_INTEL                             0x4094\r\n\r\n/* cl_va_api_device_set_intel */\r\n#define CL_PREFERRED_DEVICES_FOR_VA_API_INTEL               0x4095\r\n#define CL_ALL_DEVICES_FOR_VA_API_INTEL                     0x4096\r\n\r\n/* cl_context_info */\r\n#define CL_CONTEXT_VA_API_DISPLAY_INTEL                     0x4097\r\n\r\n/* cl_mem_info */\r\n#define CL_MEM_VA_API_MEDIA_SURFACE_INTEL                   0x4098\r\n\r\n/* cl_image_info */\r\n#define CL_IMAGE_VA_API_PLANE_INTEL                         0x4099\r\n\r\n/* cl_command_type */\r\n#define CL_COMMAND_ACQUIRE_VA_API_MEDIA_SURFACES_INTEL      0x409A\r\n#define CL_COMMAND_RELEASE_VA_API_MEDIA_SURFACES_INTEL      0x409B\r\n\r\n\r\ntypedef cl_int CL_API_CALL\r\nclGetDeviceIDsFromVA_APIMediaAdapterINTEL_t(\r\n    cl_platform_id platform,\r\n    cl_va_api_device_source_intel media_adapter_type,\r\n    void* media_adapter,\r\n    cl_va_api_device_set_intel media_adapter_set,\r\n    cl_uint num_entries,\r\n    cl_device_id* devices,\r\n    cl_uint* num_devices);\r\n\r\ntypedef clGetDeviceIDsFromVA_APIMediaAdapterINTEL_t *\r\nclGetDeviceIDsFromVA_APIMediaAdapterINTEL_fn CL_API_SUFFIX__VERSION_1_2;\r\n\r\ntypedef cl_mem CL_API_CALL\r\nclCreateFromVA_APIMediaSurfaceINTEL_t(\r\n    cl_context context,\r\n    cl_mem_flags flags,\r\n    VASurfaceID* surface,\r\n    cl_uint plane,\r\n    cl_int* errcode_ret);\r\n\r\ntypedef clCreateFromVA_APIMediaSurfaceINTEL_t *\r\nclCreateFromVA_APIMediaSurfaceINTEL_fn CL_API_SUFFIX__VERSION_1_2;\r\n\r\ntypedef cl_int CL_API_CALL\r\nclEnqueueAcquireVA_APIMediaSurfacesINTEL_t(\r\n    cl_command_queue command_queue,\r\n    cl_uint num_objects,\r\n    const cl_mem* mem_objects,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* 
event);\r\n\r\ntypedef clEnqueueAcquireVA_APIMediaSurfacesINTEL_t *\r\nclEnqueueAcquireVA_APIMediaSurfacesINTEL_fn CL_API_SUFFIX__VERSION_1_2;\r\n\r\ntypedef cl_int CL_API_CALL\r\nclEnqueueReleaseVA_APIMediaSurfacesINTEL_t(\r\n    cl_command_queue command_queue,\r\n    cl_uint num_objects,\r\n    const cl_mem* mem_objects,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event);\r\n\r\ntypedef clEnqueueReleaseVA_APIMediaSurfacesINTEL_t *\r\nclEnqueueReleaseVA_APIMediaSurfacesINTEL_fn CL_API_SUFFIX__VERSION_1_2;\r\n\r\n#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES)\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclGetDeviceIDsFromVA_APIMediaAdapterINTEL(\r\n    cl_platform_id platform,\r\n    cl_va_api_device_source_intel media_adapter_type,\r\n    void* media_adapter,\r\n    cl_va_api_device_set_intel media_adapter_set,\r\n    cl_uint num_entries,\r\n    cl_device_id* devices,\r\n    cl_uint* num_devices) CL_API_SUFFIX__VERSION_1_2;\r\n\r\nextern CL_API_ENTRY cl_mem CL_API_CALL\r\nclCreateFromVA_APIMediaSurfaceINTEL(\r\n    cl_context context,\r\n    cl_mem_flags flags,\r\n    VASurfaceID* surface,\r\n    cl_uint plane,\r\n    cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclEnqueueAcquireVA_APIMediaSurfacesINTEL(\r\n    cl_command_queue command_queue,\r\n    cl_uint num_objects,\r\n    const cl_mem* mem_objects,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event) CL_API_SUFFIX__VERSION_1_2;\r\n\r\nextern CL_API_ENTRY cl_int CL_API_CALL\r\nclEnqueueReleaseVA_APIMediaSurfacesINTEL(\r\n    cl_command_queue command_queue,\r\n    cl_uint num_objects,\r\n    const cl_mem* mem_objects,\r\n    cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list,\r\n    cl_event* event) CL_API_SUFFIX__VERSION_1_2;\r\n\r\n#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */\r\n\r\n#ifdef 
__cplusplus\r\n}\r\n#endif\r\n\r\n#endif /* OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H_ */\r\n"
  },
  {
    "path": "svm/OpenCL/include/CL/cl_version.h",
    "content": "/*******************************************************************************\r\n * Copyright (c) 2018-2020 The Khronos Group Inc.\r\n *\r\n * Licensed under the Apache License, Version 2.0 (the \"License\");\r\n * you may not use this file except in compliance with the License.\r\n * You may obtain a copy of the License at\r\n *\r\n *    http://www.apache.org/licenses/LICENSE-2.0\r\n *\r\n * Unless required by applicable law or agreed to in writing, software\r\n * distributed under the License is distributed on an \"AS IS\" BASIS,\r\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\r\n * See the License for the specific language governing permissions and\r\n * limitations under the License.\r\n ******************************************************************************/\r\n\r\n#ifndef __CL_VERSION_H\r\n#define __CL_VERSION_H\r\n\r\n/* Detect which version to target */\r\n#if !defined(CL_TARGET_OPENCL_VERSION)\r\n#pragma message(\"cl_version.h: CL_TARGET_OPENCL_VERSION is not defined. Defaulting to 300 (OpenCL 3.0)\")\r\n#define CL_TARGET_OPENCL_VERSION 300\r\n#endif\r\n#if CL_TARGET_OPENCL_VERSION != 100 && \\\r\n    CL_TARGET_OPENCL_VERSION != 110 && \\\r\n    CL_TARGET_OPENCL_VERSION != 120 && \\\r\n    CL_TARGET_OPENCL_VERSION != 200 && \\\r\n    CL_TARGET_OPENCL_VERSION != 210 && \\\r\n    CL_TARGET_OPENCL_VERSION != 220 && \\\r\n    CL_TARGET_OPENCL_VERSION != 300\r\n#pragma message(\"cl_version: CL_TARGET_OPENCL_VERSION is not a valid value (100, 110, 120, 200, 210, 220, 300). 
Defaulting to 300 (OpenCL 3.0)\")\r\n#undef CL_TARGET_OPENCL_VERSION\r\n#define CL_TARGET_OPENCL_VERSION 300\r\n#endif\r\n\r\n\r\n/* OpenCL Version */\r\n#if CL_TARGET_OPENCL_VERSION >= 300 && !defined(CL_VERSION_3_0)\r\n#define CL_VERSION_3_0  1\r\n#endif\r\n#if CL_TARGET_OPENCL_VERSION >= 220 && !defined(CL_VERSION_2_2)\r\n#define CL_VERSION_2_2  1\r\n#endif\r\n#if CL_TARGET_OPENCL_VERSION >= 210 && !defined(CL_VERSION_2_1)\r\n#define CL_VERSION_2_1  1\r\n#endif\r\n#if CL_TARGET_OPENCL_VERSION >= 200 && !defined(CL_VERSION_2_0)\r\n#define CL_VERSION_2_0  1\r\n#endif\r\n#if CL_TARGET_OPENCL_VERSION >= 120 && !defined(CL_VERSION_1_2)\r\n#define CL_VERSION_1_2  1\r\n#endif\r\n#if CL_TARGET_OPENCL_VERSION >= 110 && !defined(CL_VERSION_1_1)\r\n#define CL_VERSION_1_1  1\r\n#endif\r\n#if CL_TARGET_OPENCL_VERSION >= 100 && !defined(CL_VERSION_1_0)\r\n#define CL_VERSION_1_0  1\r\n#endif\r\n\r\n/* Allow deprecated APIs for older OpenCL versions. */\r\n#if CL_TARGET_OPENCL_VERSION <= 220 && !defined(CL_USE_DEPRECATED_OPENCL_2_2_APIS)\r\n#define CL_USE_DEPRECATED_OPENCL_2_2_APIS\r\n#endif\r\n#if CL_TARGET_OPENCL_VERSION <= 210 && !defined(CL_USE_DEPRECATED_OPENCL_2_1_APIS)\r\n#define CL_USE_DEPRECATED_OPENCL_2_1_APIS\r\n#endif\r\n#if CL_TARGET_OPENCL_VERSION <= 200 && !defined(CL_USE_DEPRECATED_OPENCL_2_0_APIS)\r\n#define CL_USE_DEPRECATED_OPENCL_2_0_APIS\r\n#endif\r\n#if CL_TARGET_OPENCL_VERSION <= 120 && !defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS)\r\n#define CL_USE_DEPRECATED_OPENCL_1_2_APIS\r\n#endif\r\n#if CL_TARGET_OPENCL_VERSION <= 110 && !defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)\r\n#define CL_USE_DEPRECATED_OPENCL_1_1_APIS\r\n#endif\r\n#if CL_TARGET_OPENCL_VERSION <= 100 && !defined(CL_USE_DEPRECATED_OPENCL_1_0_APIS)\r\n#define CL_USE_DEPRECATED_OPENCL_1_0_APIS\r\n#endif\r\n\r\n#endif  /* __CL_VERSION_H */\r\n"
  },
  {
    "path": "svm/OpenCL/include/CL/opencl.h",
    "content": "/*******************************************************************************\r\n * Copyright (c) 2008-2021 The Khronos Group Inc.\r\n *\r\n * Licensed under the Apache License, Version 2.0 (the \"License\");\r\n * you may not use this file except in compliance with the License.\r\n * You may obtain a copy of the License at\r\n *\r\n *    http://www.apache.org/licenses/LICENSE-2.0\r\n *\r\n * Unless required by applicable law or agreed to in writing, software\r\n * distributed under the License is distributed on an \"AS IS\" BASIS,\r\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\r\n * See the License for the specific language governing permissions and\r\n * limitations under the License.\r\n ******************************************************************************/\r\n\r\n#ifndef __OPENCL_H\r\n#define __OPENCL_H\r\n\r\n#ifdef __cplusplus\r\nextern \"C\" {\r\n#endif\r\n\r\n#include <CL/cl.h>\r\n#include <CL/cl_gl.h>\r\n#include <CL/cl_ext.h>\r\n\r\n#ifdef __cplusplus\r\n}\r\n#endif\r\n\r\n#endif  /* __OPENCL_H   */\r\n"
  },
  {
    "path": "svm/OpenCL/include/CL/opencl.hpp",
    "content": "//\r\n// Copyright (c) 2008-2024 The Khronos Group Inc.\r\n//\r\n// Licensed under the Apache License, Version 2.0 (the \"License\");\r\n// you may not use this file except in compliance with the License.\r\n// You may obtain a copy of the License at\r\n//\r\n//    http://www.apache.org/licenses/LICENSE-2.0\r\n//\r\n// Unless required by applicable law or agreed to in writing, software\r\n// distributed under the License is distributed on an \"AS IS\" BASIS,\r\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\r\n// See the License for the specific language governing permissions and\r\n// limitations under the License.\r\n//\r\n\r\n/*! \\file\r\n *\r\n *   \\brief C++ bindings for OpenCL 1.0, OpenCL 1.1, OpenCL 1.2,\r\n *       OpenCL 2.0, OpenCL 2.1, OpenCL 2.2, and OpenCL 3.0.\r\n *   \\author Lee Howes and Bruce Merry\r\n *\r\n *   Derived from the OpenCL 1.x C++ bindings written by\r\n *   Benedict R. Gaster, Laurent Morichetti and Lee Howes\r\n *   With additions and fixes from:\r\n *       Brian Cole, March 3rd 2010 and April 2012\r\n *       Matt Gruenke, April 2012.\r\n *       Bruce Merry, February 2013.\r\n *       Tom Deakin and Simon McIntosh-Smith, July 2013\r\n *       James Price, 2015-\r\n *   \\version 2.2.0\r\n *   \\date 2019-09-18\r\n *\r\n *   Optional extension support\r\n *\r\n *         cl_khr_d3d10_sharing\r\n *         #define CL_HPP_USE_DX_INTEROP\r\n *         cl_khr_il_program\r\n *         #define CL_HPP_USE_IL_KHR\r\n *         cl_khr_sub_groups\r\n *         #define CL_HPP_USE_CL_SUB_GROUPS_KHR\r\n *\r\n *   Doxygen documentation for this header is available here:\r\n *\r\n *       http://khronosgroup.github.io/OpenCL-CLHPP/\r\n *\r\n *   The latest version of this header can be found on the GitHub releases page:\r\n *\r\n *       https://github.com/KhronosGroup/OpenCL-CLHPP/releases\r\n *\r\n *   Bugs and patches can be submitted to the GitHub repository:\r\n *\r\n *       
https://github.com/KhronosGroup/OpenCL-CLHPP\r\n */\r\n\r\n/*! \\mainpage\r\n * \\section intro Introduction\r\n * For many large applications C++ is the language of choice and so it seems\r\n * reasonable to define C++ bindings for OpenCL.\r\n *\r\n * The interface is contained with a single C++ header file \\em opencl.hpp and all\r\n * definitions are contained within the namespace \\em cl. There is no additional\r\n * requirement to include \\em cl.h and to use either the C++ or original C\r\n * bindings; it is enough to simply include \\em opencl.hpp.\r\n *\r\n * The bindings themselves are lightweight and correspond closely to the\r\n * underlying C API. Using the C++ bindings introduces no additional execution\r\n * overhead.\r\n *\r\n * There are numerous compatibility, portability and memory management\r\n * fixes in the new header as well as additional OpenCL 2.0 features.\r\n * As a result the header is not directly backward compatible and for this\r\n * reason we release it as opencl.hpp rather than a new version of cl.hpp.\r\n * \r\n *\r\n * \\section compatibility Compatibility\r\n * Due to the evolution of the underlying OpenCL API the 2.0 C++ bindings\r\n * include an updated approach to defining supported feature versions\r\n * and the range of valid underlying OpenCL runtime versions supported.\r\n *\r\n * The combination of preprocessor macros CL_HPP_TARGET_OPENCL_VERSION and \r\n * CL_HPP_MINIMUM_OPENCL_VERSION control this range. These are three digit\r\n * decimal values representing OpenCL runtime versions. The default for \r\n * the target is 300, representing OpenCL 3.0.  
The minimum is defined as 200.\r\n * These settings would use 2.0 and newer API calls only.\r\n * If backward compatibility with a 1.2 runtime is required, the minimum\r\n * version may be set to 120.\r\n *\r\n * Note that this is a compile-time setting, and so affects linking against\r\n * a particular SDK version rather than the versioning of the loaded runtime.\r\n *\r\n * The earlier versions of the header included basic vector and string \r\n * classes based loosely on STL versions. These were difficult to \r\n * maintain and very rarely used. For the 2.0 header we now assume\r\n * the presence of the standard library unless requested otherwise.\r\n * We use std::array, std::vector, std::shared_ptr and std::string \r\n * throughout to safely manage memory and reduce the chance of a \r\n * recurrence of earlier memory management bugs.\r\n *\r\n * These classes are used through typedefs in the cl namespace: \r\n * cl::array, cl::vector, cl::pointer and cl::string.\r\n * In addition cl::allocate_pointer forwards to std::allocate_shared\r\n * by default.\r\n * In all cases these standard library classes can be replaced with \r\n * custom interface-compatible versions using the CL_HPP_NO_STD_ARRAY, \r\n * CL_HPP_NO_STD_VECTOR, CL_HPP_NO_STD_UNIQUE_PTR and \r\n * CL_HPP_NO_STD_STRING macros.\r\n *\r\n * The OpenCL 1.x versions of the C++ bindings included a size_t wrapper\r\n * class to interface with kernel enqueue. This caused unpleasant interactions\r\n * with the standard size_t declaration and led to namespacing bugs.\r\n * In the 2.0 version we have replaced this with a std::array-based interface.\r\n * However, the old behaviour can be regained for backward compatibility\r\n * using the CL_HPP_ENABLE_SIZE_T_COMPATIBILITY macro.\r\n *\r\n * Finally, the program construction interface used a clumsy vector-of-pairs\r\n * design in the earlier versions. We have replaced that with a cleaner \r\n * vector-of-vectors and vector-of-strings design. 
However, for backward \r\n * compatibility old behaviour can be regained with the\r\n * CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY macro.\r\n * \r\n * In OpenCL 2.0 OpenCL C is not entirely backward compatible with \r\n * earlier versions. As a result a flag must be passed to the OpenCL C\r\n * compiler to request OpenCL 2.0 compilation of kernels with 1.2 as\r\n * the default in the absence of the flag.\r\n * In some cases the C++ bindings automatically compile code for ease.\r\n * For those cases the compilation defaults to OpenCL C 2.0.\r\n * If this is not wanted, the CL_HPP_CL_1_2_DEFAULT_BUILD macro may\r\n * be specified to assume 1.2 compilation.\r\n * If more fine-grained decisions on a per-kernel basis are required\r\n * then explicit build operations that take the flag should be used.\r\n *\r\n *\r\n * \\section parameterization Parameters\r\n * This header may be parameterized by a set of preprocessor macros.\r\n *\r\n * - CL_HPP_TARGET_OPENCL_VERSION\r\n *\r\n *   Defines the target OpenCL runtime version to build the header\r\n *   against. Defaults to 300, representing OpenCL 3.0.\r\n *\r\n * - CL_HPP_MINIMUM_OPENCL_VERSION\r\n *\r\n *   Defines the minimum OpenCL runtime version to build the header\r\n *   against. Defaults to 200, representing OpenCL 2.0.\r\n *\r\n * - CL_HPP_NO_STD_STRING\r\n *\r\n *   Do not use the standard library string class. cl::string is not\r\n *   defined and may be defined by the user before opencl.hpp is\r\n *   included.\r\n *\r\n * - CL_HPP_NO_STD_VECTOR\r\n *\r\n *   Do not use the standard library vector class. cl::vector is not\r\n *   defined and may be defined by the user before opencl.hpp is\r\n *   included.\r\n *\r\n * - CL_HPP_NO_STD_ARRAY\r\n *\r\n *   Do not use the standard library array class. 
cl::array is not\r\n *   defined and may be defined by the user before opencl.hpp is\r\n *   included.\r\n *\r\n * - CL_HPP_NO_STD_UNIQUE_PTR\r\n *\r\n *   Do not use the standard library unique_ptr class. cl::pointer and\r\n *   the cl::allocate_pointer functions are not defined and may be\r\n *   defined by the user before opencl.hpp is included.\r\n *\r\n * - CL_HPP_ENABLE_EXCEPTIONS\r\n *\r\n *   Enable exceptions for use in the C++ bindings header. This is the\r\n *   preferred error handling mechanism but is not required.\r\n *\r\n * - CL_HPP_ENABLE_SIZE_T_COMPATIBILITY\r\n *\r\n *   Backward compatibility option to support cl.hpp-style size_t\r\n *   class.  Replaces the updated std::array derived version and\r\n *   removal of size_t from the namespace. Note that in this case the\r\n *   new size_t class is placed in the cl::compatibility namespace and\r\n *   thus requires an additional using declaration for direct backward\r\n *   compatibility.\r\n *\r\n * - CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY\r\n *\r\n *   Enable older vector of pairs interface for construction of\r\n *   programs.\r\n *\r\n * - CL_HPP_CL_1_2_DEFAULT_BUILD\r\n *\r\n *   Default to OpenCL C 1.2 compilation rather than OpenCL C 2.0\r\n *   applies to use of cl::Program construction and other program\r\n *   build variants.\r\n *\r\n *\r\n * - CL_HPP_USE_CL_SUB_GROUPS_KHR\r\n *\r\n *   Enable the cl_khr_subgroups extension.\r\n *\r\n * - CL_HPP_USE_DX_INTEROP\r\n *\r\n *   Enable the cl_khr_d3d10_sharing extension.\r\n *\r\n * - CL_HPP_USE_IL_KHR\r\n *\r\n *   Enable the cl_khr_il_program extension.\r\n *\r\n *\r\n * \\section example Example\r\n *\r\n * The following example shows a general use case for the C++\r\n * bindings, including support for the optional exception feature and\r\n * also the supplied vector and string classes, see following sections for\r\n * descriptions of these features.\r\n * \r\n * Note: the C++ bindings use std::call_once and therefore 
may need to be\r\n * compiled using special command-line options (such as \"-pthread\") on some\r\n * platforms!\r\n *\r\n * \\code\r\n    #define CL_HPP_ENABLE_EXCEPTIONS\r\n    #define CL_HPP_TARGET_OPENCL_VERSION 200\r\n\r\n    #include <CL/opencl.hpp>\r\n    #include <iostream>\r\n    #include <vector>\r\n    #include <memory>\r\n    #include <algorithm>\r\n\r\n    const int numElements = 32;\r\n\r\n    int main(void)\r\n    {\r\n        // Filter for a 2.0 or newer platform and set it as the default\r\n        std::vector<cl::Platform> platforms;\r\n        cl::Platform::get(&platforms);\r\n        cl::Platform plat;\r\n        for (auto &p : platforms) {\r\n            std::string platver = p.getInfo<CL_PLATFORM_VERSION>();\r\n            if (platver.find(\"OpenCL 2.\") != std::string::npos ||\r\n                platver.find(\"OpenCL 3.\") != std::string::npos) {\r\n                // Note: an OpenCL 3.x platform may not support all required features!\r\n                plat = p;\r\n            }\r\n        }\r\n        if (plat() == 0) {\r\n            std::cout << \"No OpenCL 2.0 or newer platform found.\\n\";\r\n            return -1;\r\n        }\r\n\r\n        cl::Platform newP = cl::Platform::setDefault(plat);\r\n        if (newP != plat) {\r\n            std::cout << \"Error setting default platform.\\n\";\r\n            return -1;\r\n        }\r\n\r\n        // C++11 raw string literal for the first kernel\r\n        std::string kernel1{R\"CLC(\r\n            global int globalA;\r\n            kernel void updateGlobal()\r\n            {\r\n              globalA = 75;\r\n            }\r\n        )CLC\"};\r\n\r\n        // Raw string literal for the second kernel\r\n        std::string kernel2{R\"CLC(\r\n            typedef struct { global int *bar; } Foo;\r\n            kernel void vectorAdd(global const Foo* aNum, global const int *inputA, global const int *inputB,\r\n                                  global int *output, int val, write_only pipe int 
outPipe, queue_t childQueue)\r\n            {\r\n              output[get_global_id(0)] = inputA[get_global_id(0)] + inputB[get_global_id(0)] + val + *(aNum->bar);\r\n              write_pipe(outPipe, &val);\r\n              queue_t default_queue = get_default_queue();\r\n              ndrange_t ndrange = ndrange_1D(get_global_size(0)/2, get_global_size(0)/2);\r\n\r\n              // Have a child kernel write into third quarter of output\r\n              enqueue_kernel(default_queue, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange,\r\n                ^{\r\n                    output[get_global_size(0)*2 + get_global_id(0)] =\r\n                      inputA[get_global_size(0)*2 + get_global_id(0)] + inputB[get_global_size(0)*2 + get_global_id(0)] + globalA;\r\n                });\r\n\r\n              // Have a child kernel write into last quarter of output\r\n              enqueue_kernel(childQueue, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange,\r\n                ^{\r\n                    output[get_global_size(0)*3 + get_global_id(0)] =\r\n                      inputA[get_global_size(0)*3 + get_global_id(0)] + inputB[get_global_size(0)*3 + get_global_id(0)] + globalA + 2;\r\n                });\r\n            }\r\n        )CLC\"};\r\n\r\n        std::vector<std::string> programStrings;\r\n        programStrings.push_back(kernel1);\r\n        programStrings.push_back(kernel2);\r\n\r\n        cl::Program vectorAddProgram(programStrings);\r\n        try {\r\n            vectorAddProgram.build(\"-cl-std=CL2.0\");\r\n        }\r\n        catch (...) 
{\r\n            // Print build info for all devices\r\n            cl_int buildErr = CL_SUCCESS;\r\n            auto buildInfo = vectorAddProgram.getBuildInfo<CL_PROGRAM_BUILD_LOG>(&buildErr);\r\n            for (auto &pair : buildInfo) {\r\n                std::cerr << pair.second << std::endl << std::endl;\r\n            }\r\n\r\n            return 1;\r\n        }\r\n\r\n        typedef struct { int *bar; } Foo;\r\n\r\n        // Get and run kernel that initializes the program-scope global\r\n        // A test for kernels that take no arguments\r\n        auto program2Kernel =\r\n            cl::KernelFunctor<>(vectorAddProgram, \"updateGlobal\");\r\n        program2Kernel(\r\n            cl::EnqueueArgs(\r\n            cl::NDRange(1)));\r\n\r\n        //////////////////\r\n        // SVM allocations\r\n\r\n        auto anSVMInt = cl::allocate_svm<int, cl::SVMTraitCoarse<>>();\r\n        *anSVMInt = 5;\r\n        cl::SVMAllocator<Foo, cl::SVMTraitCoarse<cl::SVMTraitReadOnly<>>> svmAllocReadOnly;\r\n        auto fooPointer = cl::allocate_pointer<Foo>(svmAllocReadOnly);\r\n        fooPointer->bar = anSVMInt.get();\r\n        cl::SVMAllocator<int, cl::SVMTraitCoarse<>> svmAlloc;\r\n        std::vector<int, cl::SVMAllocator<int, cl::SVMTraitCoarse<>>> inputA(numElements, 1, svmAlloc);\r\n        cl::coarse_svm_vector<int> inputB(numElements, 2, svmAlloc);\r\n\r\n        //////////////\r\n        // Traditional cl_mem allocations\r\n\r\n        std::vector<int> output(numElements, 0xdeadbeef);\r\n        cl::Buffer outputBuffer(output.begin(), output.end(), false);\r\n        cl::Pipe aPipe(sizeof(cl_int), numElements / 2);\r\n\r\n        // Default command queue, also passed in as a parameter\r\n        cl::DeviceCommandQueue defaultDeviceQueue = cl::DeviceCommandQueue::makeDefault(\r\n            cl::Context::getDefault(), cl::Device::getDefault());\r\n\r\n        auto vectorAddKernel =\r\n            cl::KernelFunctor<\r\n                decltype(fooPointer)&,\r\n 
               int*,\r\n                cl::coarse_svm_vector<int>&,\r\n                cl::Buffer,\r\n                int,\r\n                cl::Pipe&,\r\n                cl::DeviceCommandQueue\r\n                >(vectorAddProgram, \"vectorAdd\");\r\n\r\n        // Ensure that the additional SVM pointer is available to the kernel\r\n        // This one was not passed as a parameter\r\n        vectorAddKernel.setSVMPointers(anSVMInt);\r\n\r\n        cl_int error;\r\n        vectorAddKernel(\r\n            cl::EnqueueArgs(\r\n                cl::NDRange(numElements/2),\r\n                cl::NDRange(numElements/2)),\r\n            fooPointer,\r\n            inputA.data(),\r\n            inputB,\r\n            outputBuffer,\r\n            3,\r\n            aPipe,\r\n            defaultDeviceQueue,\r\n            error\r\n            );\r\n\r\n        cl::copy(outputBuffer, output.begin(), output.end());\r\n\r\n        cl::Device d = cl::Device::getDefault();\r\n\r\n        std::cout << \"Output:\\n\";\r\n        for (int i = 1; i < numElements; ++i) {\r\n            std::cout << \"\\t\" << output[i] << \"\\n\";\r\n        }\r\n        std::cout << \"\\n\\n\";\r\n\r\n        return 0;\r\n    }\r\n *\r\n * \\endcode\r\n *\r\n */\r\n#ifndef CL_HPP_\r\n#define CL_HPP_\r\n\r\n/* Handle deprecated preprocessor definitions. In each case, we only check for\r\n * the old name if the new name is not defined, so that user code can define\r\n * both and hence work with either version of the bindings.\r\n */\r\n#if !defined(CL_HPP_USE_DX_INTEROP) && defined(USE_DX_INTEROP)\r\n# pragma message(\"opencl.hpp: USE_DX_INTEROP is deprecated. Define CL_HPP_USE_DX_INTEROP instead\")\r\n# define CL_HPP_USE_DX_INTEROP\r\n#endif\r\n#if !defined(CL_HPP_ENABLE_EXCEPTIONS) && defined(__CL_ENABLE_EXCEPTIONS)\r\n# pragma message(\"opencl.hpp: __CL_ENABLE_EXCEPTIONS is deprecated. 
Define CL_HPP_ENABLE_EXCEPTIONS instead\")\r\n# define CL_HPP_ENABLE_EXCEPTIONS\r\n#endif\r\n#if !defined(CL_HPP_NO_STD_VECTOR) && defined(__NO_STD_VECTOR)\r\n# pragma message(\"opencl.hpp: __NO_STD_VECTOR is deprecated. Define CL_HPP_NO_STD_VECTOR instead\")\r\n# define CL_HPP_NO_STD_VECTOR\r\n#endif\r\n#if !defined(CL_HPP_NO_STD_STRING) && defined(__NO_STD_STRING)\r\n# pragma message(\"opencl.hpp: __NO_STD_STRING is deprecated. Define CL_HPP_NO_STD_STRING instead\")\r\n# define CL_HPP_NO_STD_STRING\r\n#endif\r\n#if defined(VECTOR_CLASS)\r\n# pragma message(\"opencl.hpp: VECTOR_CLASS is deprecated. Alias cl::vector instead\")\r\n#endif\r\n#if defined(STRING_CLASS)\r\n# pragma message(\"opencl.hpp: STRING_CLASS is deprecated. Alias cl::string instead.\")\r\n#endif\r\n#if !defined(CL_HPP_USER_OVERRIDE_ERROR_STRINGS) && defined(__CL_USER_OVERRIDE_ERROR_STRINGS)\r\n# pragma message(\"opencl.hpp: __CL_USER_OVERRIDE_ERROR_STRINGS is deprecated. Define CL_HPP_USER_OVERRIDE_ERROR_STRINGS instead\")\r\n# define CL_HPP_USER_OVERRIDE_ERROR_STRINGS\r\n#endif\r\n\r\n/* Warn about features that are no longer supported\r\n */\r\n#if defined(__USE_DEV_VECTOR)\r\n# pragma message(\"opencl.hpp: __USE_DEV_VECTOR is no longer supported. Expect compilation errors\")\r\n#endif\r\n#if defined(__USE_DEV_STRING)\r\n# pragma message(\"opencl.hpp: __USE_DEV_STRING is no longer supported. Expect compilation errors\")\r\n#endif\r\n\r\n/* Detect which version to target */\r\n#if !defined(CL_HPP_TARGET_OPENCL_VERSION)\r\n# pragma message(\"opencl.hpp: CL_HPP_TARGET_OPENCL_VERSION is not defined. 
It will default to 300 (OpenCL 3.0)\")\r\n# define CL_HPP_TARGET_OPENCL_VERSION 300\r\n#endif\r\n#if CL_HPP_TARGET_OPENCL_VERSION != 100 && \\\r\n    CL_HPP_TARGET_OPENCL_VERSION != 110 && \\\r\n    CL_HPP_TARGET_OPENCL_VERSION != 120 && \\\r\n    CL_HPP_TARGET_OPENCL_VERSION != 200 && \\\r\n    CL_HPP_TARGET_OPENCL_VERSION != 210 && \\\r\n    CL_HPP_TARGET_OPENCL_VERSION != 220 && \\\r\n    CL_HPP_TARGET_OPENCL_VERSION != 300\r\n# pragma message(\"opencl.hpp: CL_HPP_TARGET_OPENCL_VERSION is not a valid value (100, 110, 120, 200, 210, 220 or 300). It will be set to 300 (OpenCL 3.0).\")\r\n# undef CL_HPP_TARGET_OPENCL_VERSION\r\n# define CL_HPP_TARGET_OPENCL_VERSION 300\r\n#endif\r\n\r\n/* Forward target OpenCL version to C headers if necessary */\r\n#if defined(CL_TARGET_OPENCL_VERSION)\r\n/* Warn if prior definition of CL_TARGET_OPENCL_VERSION is lower than\r\n * requested C++ bindings version */\r\n#if CL_TARGET_OPENCL_VERSION < CL_HPP_TARGET_OPENCL_VERSION\r\n# pragma message(\"CL_TARGET_OPENCL_VERSION is already defined as is lower than CL_HPP_TARGET_OPENCL_VERSION\")\r\n#endif\r\n#else\r\n# define CL_TARGET_OPENCL_VERSION CL_HPP_TARGET_OPENCL_VERSION\r\n#endif\r\n\r\n#if !defined(CL_HPP_MINIMUM_OPENCL_VERSION)\r\n# define CL_HPP_MINIMUM_OPENCL_VERSION 200\r\n#endif\r\n#if CL_HPP_MINIMUM_OPENCL_VERSION != 100 && \\\r\n    CL_HPP_MINIMUM_OPENCL_VERSION != 110 && \\\r\n    CL_HPP_MINIMUM_OPENCL_VERSION != 120 && \\\r\n    CL_HPP_MINIMUM_OPENCL_VERSION != 200 && \\\r\n    CL_HPP_MINIMUM_OPENCL_VERSION != 210 && \\\r\n    CL_HPP_MINIMUM_OPENCL_VERSION != 220 && \\\r\n    CL_HPP_MINIMUM_OPENCL_VERSION != 300\r\n# pragma message(\"opencl.hpp: CL_HPP_MINIMUM_OPENCL_VERSION is not a valid value (100, 110, 120, 200, 210, 220 or 300). 
It will be set to 100\")\r\n# undef CL_HPP_MINIMUM_OPENCL_VERSION\r\n# define CL_HPP_MINIMUM_OPENCL_VERSION 100\r\n#endif\r\n#if CL_HPP_MINIMUM_OPENCL_VERSION > CL_HPP_TARGET_OPENCL_VERSION\r\n# error \"CL_HPP_MINIMUM_OPENCL_VERSION must not be greater than CL_HPP_TARGET_OPENCL_VERSION\"\r\n#endif\r\n\r\n#if CL_HPP_MINIMUM_OPENCL_VERSION <= 100 && !defined(CL_USE_DEPRECATED_OPENCL_1_0_APIS)\r\n# define CL_USE_DEPRECATED_OPENCL_1_0_APIS\r\n#endif\r\n#if CL_HPP_MINIMUM_OPENCL_VERSION <= 110 && !defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)\r\n# define CL_USE_DEPRECATED_OPENCL_1_1_APIS\r\n#endif\r\n#if CL_HPP_MINIMUM_OPENCL_VERSION <= 120 && !defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS)\r\n# define CL_USE_DEPRECATED_OPENCL_1_2_APIS\r\n#endif\r\n#if CL_HPP_MINIMUM_OPENCL_VERSION <= 200 && !defined(CL_USE_DEPRECATED_OPENCL_2_0_APIS)\r\n# define CL_USE_DEPRECATED_OPENCL_2_0_APIS\r\n#endif\r\n#if CL_HPP_MINIMUM_OPENCL_VERSION <= 210 && !defined(CL_USE_DEPRECATED_OPENCL_2_1_APIS)\r\n# define CL_USE_DEPRECATED_OPENCL_2_1_APIS\r\n#endif\r\n#if CL_HPP_MINIMUM_OPENCL_VERSION <= 220 && !defined(CL_USE_DEPRECATED_OPENCL_2_2_APIS)\r\n# define CL_USE_DEPRECATED_OPENCL_2_2_APIS\r\n#endif\r\n\r\n#ifdef _WIN32\r\n\r\n#include <malloc.h>\r\n\r\n#if defined(CL_HPP_USE_DX_INTEROP)\r\n#include <CL/cl_d3d10.h>\r\n#include <CL/cl_dx9_media_sharing.h>\r\n#endif\r\n#endif // _WIN32\r\n\r\n#if defined(_MSC_VER)\r\n#include <intrin.h>\r\n#endif // _MSC_VER \r\n \r\n // Check for a valid C++ version\r\n\r\n// Need to do both tests here because for some reason __cplusplus is not \r\n// updated in visual studio\r\n#if (!defined(_MSC_VER) && __cplusplus < 201103L) || (defined(_MSC_VER) && _MSC_VER < 1700)\r\n#error Visual studio 2013 or another C++11-supporting compiler required\r\n#endif\r\n\r\n#if defined(__APPLE__) || defined(__MACOSX)\r\n#include <OpenCL/opencl.h>\r\n#else\r\n#include <CL/opencl.h>\r\n#endif // !__APPLE__\r\n\r\n#if __cplusplus >= 201703L\r\n# define CL_HPP_DEFINE_STATIC_MEMBER_ 
inline\r\n#elif defined(_MSC_VER)\r\n# define CL_HPP_DEFINE_STATIC_MEMBER_ __declspec(selectany)\r\n#elif defined(__MINGW32__)\r\n# define CL_HPP_DEFINE_STATIC_MEMBER_ __attribute__((selectany))\r\n#else\r\n# define CL_HPP_DEFINE_STATIC_MEMBER_ __attribute__((weak))\r\n#endif // !_MSC_VER\r\n\r\n// Define deprecated prefixes and suffixes to ensure compilation\r\n// in case they are not pre-defined\r\n#if !defined(CL_API_PREFIX__VERSION_1_1_DEPRECATED)\r\n#define CL_API_PREFIX__VERSION_1_1_DEPRECATED\r\n#endif // #if !defined(CL_API_PREFIX__VERSION_1_1_DEPRECATED)\r\n#if !defined(CL_API_SUFFIX__VERSION_1_1_DEPRECATED)\r\n#define CL_API_SUFFIX__VERSION_1_1_DEPRECATED\r\n#endif // #if !defined(CL_API_SUFFIX__VERSION_1_1_DEPRECATED)\r\n\r\n#if !defined(CL_API_PREFIX__VERSION_1_2_DEPRECATED)\r\n#define CL_API_PREFIX__VERSION_1_2_DEPRECATED\r\n#endif // #if !defined(CL_API_PREFIX__VERSION_1_2_DEPRECATED)\r\n#if !defined(CL_API_SUFFIX__VERSION_1_2_DEPRECATED)\r\n#define CL_API_SUFFIX__VERSION_1_2_DEPRECATED\r\n#endif // #if !defined(CL_API_SUFFIX__VERSION_1_2_DEPRECATED)\r\n\r\n#if !defined(CL_API_PREFIX__VERSION_2_2_DEPRECATED)\r\n#define CL_API_PREFIX__VERSION_2_2_DEPRECATED\r\n#endif // #if !defined(CL_API_PREFIX__VERSION_2_2_DEPRECATED)\r\n#if !defined(CL_API_SUFFIX__VERSION_2_2_DEPRECATED)\r\n#define CL_API_SUFFIX__VERSION_2_2_DEPRECATED\r\n#endif // #if !defined(CL_API_SUFFIX__VERSION_2_2_DEPRECATED)\r\n\r\n#if !defined(CL_CALLBACK)\r\n#define CL_CALLBACK\r\n#endif //CL_CALLBACK\r\n\r\n#include <utility>\r\n#include <limits>\r\n#include <iterator>\r\n#include <mutex>\r\n#include <cstring>\r\n#include <functional>\r\n\r\n\r\n// Define a size_type to represent a correctly resolved size_t\r\n#if defined(CL_HPP_ENABLE_SIZE_T_COMPATIBILITY)\r\nnamespace cl {\r\n    using size_type = ::size_t;\r\n} // namespace cl\r\n#else // #if defined(CL_HPP_ENABLE_SIZE_T_COMPATIBILITY)\r\nnamespace cl {\r\n    using size_type = size_t;\r\n} // namespace cl\r\n#endif // #if 
defined(CL_HPP_ENABLE_SIZE_T_COMPATIBILITY)\r\n\r\n\r\n#if defined(CL_HPP_ENABLE_EXCEPTIONS)\r\n#include <exception>\r\n#endif // #if defined(CL_HPP_ENABLE_EXCEPTIONS)\r\n\r\n#if !defined(CL_HPP_NO_STD_VECTOR)\r\n#include <vector>\r\nnamespace cl {\r\n    template < class T, class Alloc = std::allocator<T> >\r\n    using vector = std::vector<T, Alloc>;\r\n} // namespace cl\r\n#endif // #if !defined(CL_HPP_NO_STD_VECTOR)\r\n\r\n#if !defined(CL_HPP_NO_STD_STRING)\r\n#include <string>\r\nnamespace cl {\r\n    using string = std::string;\r\n} // namespace cl\r\n#endif // #if !defined(CL_HPP_NO_STD_STRING)\r\n\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 200\r\n\r\n#if !defined(CL_HPP_NO_STD_UNIQUE_PTR)\r\n#include <memory>\r\nnamespace cl {\r\n    // Replace unique_ptr and allocate_pointer for internal use\r\n    // to allow user to replace them\r\n    template<class T, class D>\r\n    using pointer = std::unique_ptr<T, D>;\r\n} // namespace cl\r\n#endif \r\n#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200\r\n#if !defined(CL_HPP_NO_STD_ARRAY)\r\n#include <array>\r\nnamespace cl {\r\n    template < class T, size_type N >\r\n    using array = std::array<T, N>;\r\n} // namespace cl\r\n#endif // #if !defined(CL_HPP_NO_STD_ARRAY)\r\n\r\n// Define size_type appropriately to allow backward-compatibility\r\n// use of the old size_t interface class\r\n#if defined(CL_HPP_ENABLE_SIZE_T_COMPATIBILITY)\r\nnamespace cl {\r\n    namespace compatibility {\r\n        /*! \\brief class used to interface between C++ and\r\n        *  OpenCL C calls that require arrays of size_t values, whose\r\n        *  size is known statically.\r\n        */\r\n        template <int N>\r\n        class size_t\r\n        {\r\n        private:\r\n            size_type data_[N];\r\n\r\n        public:\r\n            //! 
\\brief Initialize size_t to all 0s\r\n            size_t()\r\n            {\r\n                for (int i = 0; i < N; ++i) {\r\n                    data_[i] = 0;\r\n                }\r\n            }\r\n\r\n            size_t(const array<size_type, N> &rhs)\r\n            {\r\n                for (int i = 0; i < N; ++i) {\r\n                    data_[i] = rhs[i];\r\n                }\r\n            }\r\n\r\n            size_type& operator[](int index)\r\n            {\r\n                return data_[index];\r\n            }\r\n\r\n            const size_type& operator[](int index) const\r\n            {\r\n                return data_[index];\r\n            }\r\n\r\n            //! \\brief Conversion operator to size_type*.\r\n            operator size_type* ()             { return data_; }\r\n\r\n            //! \\brief Conversion operator to const size_type*.\r\n            operator const size_type* () const { return data_; }\r\n\r\n            operator array<size_type, N>() const\r\n            {\r\n                array<size_type, N> ret;\r\n\r\n                for (int i = 0; i < N; ++i) {\r\n                    ret[i] = data_[i];\r\n                }\r\n                return ret;\r\n            }\r\n        };\r\n    } // namespace compatibility\r\n\r\n    template<int N>\r\n    using size_t = compatibility::size_t<N>;\r\n} // namespace cl\r\n#endif // #if defined(CL_HPP_ENABLE_SIZE_T_COMPATIBILITY)\r\n\r\n// Helper alias to avoid confusing the macros\r\nnamespace cl {\r\n    namespace detail {\r\n        using size_t_array = array<size_type, 3>;\r\n    } // namespace detail\r\n} // namespace cl\r\n\r\n\r\n/*! 
\\namespace cl\r\n *\r\n * \\brief The OpenCL C++ bindings are defined within this namespace.\r\n *\r\n */\r\nnamespace cl {\r\n\r\n#define CL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(name) \\\r\n    using PFN_##name = name##_fn\r\n\r\n#define CL_HPP_INIT_CL_EXT_FCN_PTR_(name)                               \\\r\n    if (!pfn_##name) {                                                  \\\r\n        pfn_##name = (PFN_##name)clGetExtensionFunctionAddress(#name);  \\\r\n    }\r\n\r\n#define CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, name)            \\\r\n    if (!pfn_##name) {                                                  \\\r\n        pfn_##name = (PFN_##name)                                       \\\r\n            clGetExtensionFunctionAddressForPlatform(platform, #name);  \\\r\n    }\r\n\r\n#ifdef cl_khr_external_memory\r\n    enum class ExternalMemoryType : cl_external_memory_handle_type_khr;\r\n#endif\r\n\r\n    class Memory;\r\n    class Platform;\r\n    class Program;\r\n    class Device;\r\n    class Context;\r\n    class CommandQueue;\r\n    class DeviceCommandQueue;\r\n    class Memory;\r\n    class Buffer;\r\n    class Pipe;\r\n#ifdef cl_khr_semaphore\r\n    class Semaphore;\r\n#endif\r\n#if defined(cl_khr_command_buffer)\r\n    class CommandBufferKhr;\r\n    class MutableCommandKhr;\r\n#endif // cl_khr_command_buffer\r\n\r\n#if defined(CL_HPP_ENABLE_EXCEPTIONS)\r\n    /*! \\brief Exception class \r\n     * \r\n     *  This may be thrown by API functions when CL_HPP_ENABLE_EXCEPTIONS is defined.\r\n     */\r\n    class Error : public std::exception\r\n    {\r\n    private:\r\n        cl_int err_;\r\n        const char * errStr_;\r\n    public:\r\n        /*! 
\\brief Create a new CL error exception for a given error code\r\n         *  and corresponding message.\r\n         * \r\n         *  \\param err error code value.\r\n         *\r\n         *  \\param errStr a descriptive string that must remain in scope until\r\n         *                handling of the exception has concluded.  If set, it\r\n         *                will be returned by what().\r\n         */\r\n        Error(cl_int err, const char * errStr = nullptr) : err_(err), errStr_(errStr)\r\n        {}\r\n\r\n        /*! \\brief Get error string associated with exception\r\n         *\r\n         * \\return A memory pointer to the error message string.\r\n         */\r\n        const char * what() const noexcept override\r\n        {\r\n            if (errStr_ == nullptr) {\r\n                return \"empty\";\r\n            }\r\n            else {\r\n                return errStr_;\r\n            }\r\n        }\r\n\r\n        /*! \\brief Get error code associated with exception\r\n         *\r\n         *  \\return The error code.\r\n         */\r\n        cl_int err(void) const { return err_; }\r\n    };\r\n#define CL_HPP_ERR_STR_(x) #x\r\n#else\r\n#define CL_HPP_ERR_STR_(x) nullptr\r\n#endif // CL_HPP_ENABLE_EXCEPTIONS\r\n\r\n\r\nnamespace detail\r\n{\r\n#if defined(CL_HPP_ENABLE_EXCEPTIONS)\r\nstatic inline cl_int errHandler (\r\n    cl_int err,\r\n    const char * errStr = nullptr)\r\n{\r\n    if (err != CL_SUCCESS) {\r\n        throw Error(err, errStr);\r\n    }\r\n    return err;\r\n}\r\n#else\r\nstatic inline cl_int errHandler (cl_int err, const char * errStr = nullptr)\r\n{\r\n    (void) errStr; // suppress unused variable warning\r\n    return err;\r\n}\r\n#endif // CL_HPP_ENABLE_EXCEPTIONS\r\n}\r\n\r\n\r\n\r\n//! 
\\cond DOXYGEN_DETAIL\r\n#if !defined(CL_HPP_USER_OVERRIDE_ERROR_STRINGS)\r\n#define __GET_DEVICE_INFO_ERR               CL_HPP_ERR_STR_(clGetDeviceInfo)\r\n#define __GET_PLATFORM_INFO_ERR             CL_HPP_ERR_STR_(clGetPlatformInfo)\r\n#define __GET_DEVICE_IDS_ERR                CL_HPP_ERR_STR_(clGetDeviceIDs)\r\n#define __GET_PLATFORM_IDS_ERR              CL_HPP_ERR_STR_(clGetPlatformIDs)\r\n#define __GET_CONTEXT_INFO_ERR              CL_HPP_ERR_STR_(clGetContextInfo)\r\n#define __GET_EVENT_INFO_ERR                CL_HPP_ERR_STR_(clGetEventInfo)\r\n#define __GET_EVENT_PROFILE_INFO_ERR        CL_HPP_ERR_STR_(clGetEventProfileInfo)\r\n#define __GET_MEM_OBJECT_INFO_ERR           CL_HPP_ERR_STR_(clGetMemObjectInfo)\r\n#define __GET_IMAGE_INFO_ERR                CL_HPP_ERR_STR_(clGetImageInfo)\r\n#define __GET_SAMPLER_INFO_ERR              CL_HPP_ERR_STR_(clGetSamplerInfo)\r\n#define __GET_KERNEL_INFO_ERR               CL_HPP_ERR_STR_(clGetKernelInfo)\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 120\r\n#define __GET_KERNEL_ARG_INFO_ERR           CL_HPP_ERR_STR_(clGetKernelArgInfo)\r\n#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 210\r\n#define __GET_KERNEL_SUB_GROUP_INFO_ERR     CL_HPP_ERR_STR_(clGetKernelSubGroupInfo)\r\n#else\r\n#define __GET_KERNEL_SUB_GROUP_INFO_ERR     CL_HPP_ERR_STR_(clGetKernelSubGroupInfoKHR)\r\n#endif // CL_HPP_TARGET_OPENCL_VERSION >= 210\r\n#define __GET_KERNEL_WORK_GROUP_INFO_ERR    CL_HPP_ERR_STR_(clGetKernelWorkGroupInfo)\r\n#define __GET_PROGRAM_INFO_ERR              CL_HPP_ERR_STR_(clGetProgramInfo)\r\n#define __GET_PROGRAM_BUILD_INFO_ERR        CL_HPP_ERR_STR_(clGetProgramBuildInfo)\r\n#define __GET_COMMAND_QUEUE_INFO_ERR        CL_HPP_ERR_STR_(clGetCommandQueueInfo)\r\n\r\n#define __CREATE_CONTEXT_ERR                CL_HPP_ERR_STR_(clCreateContext)\r\n#define __CREATE_CONTEXT_FROM_TYPE_ERR      CL_HPP_ERR_STR_(clCreateContextFromType)\r\n#define __GET_SUPPORTED_IMAGE_FORMATS_ERR   
CL_HPP_ERR_STR_(clGetSupportedImageFormats)\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 300\r\n#define __SET_CONTEXT_DESCTRUCTOR_CALLBACK_ERR  CL_HPP_ERR_STR_(clSetContextDestructorCallback)\r\n#endif // CL_HPP_TARGET_OPENCL_VERSION >= 300\r\n\r\n#define __CREATE_BUFFER_ERR                 CL_HPP_ERR_STR_(clCreateBuffer)\r\n#define __COPY_ERR                          CL_HPP_ERR_STR_(cl::copy)\r\n#define __CREATE_SUBBUFFER_ERR              CL_HPP_ERR_STR_(clCreateSubBuffer)\r\n#define __CREATE_GL_BUFFER_ERR              CL_HPP_ERR_STR_(clCreateFromGLBuffer)\r\n#define __CREATE_GL_RENDER_BUFFER_ERR       CL_HPP_ERR_STR_(clCreateFromGLBuffer)\r\n#define __GET_GL_OBJECT_INFO_ERR            CL_HPP_ERR_STR_(clGetGLObjectInfo)\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 120\r\n#define __CREATE_IMAGE_ERR                  CL_HPP_ERR_STR_(clCreateImage)\r\n#define __CREATE_GL_TEXTURE_ERR             CL_HPP_ERR_STR_(clCreateFromGLTexture)\r\n#define __IMAGE_DIMENSION_ERR               CL_HPP_ERR_STR_(Incorrect image dimensions)\r\n#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120\r\n#define __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR CL_HPP_ERR_STR_(clSetMemObjectDestructorCallback)\r\n\r\n#define __CREATE_USER_EVENT_ERR             CL_HPP_ERR_STR_(clCreateUserEvent)\r\n#define __SET_USER_EVENT_STATUS_ERR         CL_HPP_ERR_STR_(clSetUserEventStatus)\r\n#define __SET_EVENT_CALLBACK_ERR            CL_HPP_ERR_STR_(clSetEventCallback)\r\n#define __WAIT_FOR_EVENTS_ERR               CL_HPP_ERR_STR_(clWaitForEvents)\r\n\r\n#define __CREATE_KERNEL_ERR                 CL_HPP_ERR_STR_(clCreateKernel)\r\n#define __SET_KERNEL_ARGS_ERR               CL_HPP_ERR_STR_(clSetKernelArg)\r\n#define __CREATE_PROGRAM_WITH_SOURCE_ERR    CL_HPP_ERR_STR_(clCreateProgramWithSource)\r\n#define __CREATE_PROGRAM_WITH_BINARY_ERR    CL_HPP_ERR_STR_(clCreateProgramWithBinary)\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 210\r\n#define __CREATE_PROGRAM_WITH_IL_ERR        CL_HPP_ERR_STR_(clCreateProgramWithIL)\r\n#else\r\n#define 
__CREATE_PROGRAM_WITH_IL_ERR        CL_HPP_ERR_STR_(clCreateProgramWithILKHR)\r\n#endif // CL_HPP_TARGET_OPENCL_VERSION >= 210\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 120\r\n#define __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR    CL_HPP_ERR_STR_(clCreateProgramWithBuiltInKernels)\r\n#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120\r\n#define __BUILD_PROGRAM_ERR                 CL_HPP_ERR_STR_(clBuildProgram)\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 120\r\n#define __COMPILE_PROGRAM_ERR               CL_HPP_ERR_STR_(clCompileProgram)\r\n#define __LINK_PROGRAM_ERR                  CL_HPP_ERR_STR_(clLinkProgram)\r\n#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120\r\n#define __CREATE_KERNELS_IN_PROGRAM_ERR     CL_HPP_ERR_STR_(clCreateKernelsInProgram)\r\n\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 200\r\n#define __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR          CL_HPP_ERR_STR_(clCreateCommandQueueWithProperties)\r\n#define __CREATE_SAMPLER_WITH_PROPERTIES_ERR                CL_HPP_ERR_STR_(clCreateSamplerWithProperties)\r\n#endif // CL_HPP_TARGET_OPENCL_VERSION >= 200\r\n#define __SET_COMMAND_QUEUE_PROPERTY_ERR    CL_HPP_ERR_STR_(clSetCommandQueueProperty)\r\n#define __ENQUEUE_READ_BUFFER_ERR           CL_HPP_ERR_STR_(clEnqueueReadBuffer)\r\n#define __ENQUEUE_READ_BUFFER_RECT_ERR      CL_HPP_ERR_STR_(clEnqueueReadBufferRect)\r\n#define __ENQUEUE_WRITE_BUFFER_ERR          CL_HPP_ERR_STR_(clEnqueueWriteBuffer)\r\n#define __ENQUEUE_WRITE_BUFFER_RECT_ERR     CL_HPP_ERR_STR_(clEnqueueWriteBufferRect)\r\n#define __ENQEUE_COPY_BUFFER_ERR            CL_HPP_ERR_STR_(clEnqueueCopyBuffer)\r\n#define __ENQEUE_COPY_BUFFER_RECT_ERR       CL_HPP_ERR_STR_(clEnqueueCopyBufferRect)\r\n#define __ENQUEUE_FILL_BUFFER_ERR           CL_HPP_ERR_STR_(clEnqueueFillBuffer)\r\n#define __ENQUEUE_READ_IMAGE_ERR            CL_HPP_ERR_STR_(clEnqueueReadImage)\r\n#define __ENQUEUE_WRITE_IMAGE_ERR           CL_HPP_ERR_STR_(clEnqueueWriteImage)\r\n#define __ENQUEUE_COPY_IMAGE_ERR            
CL_HPP_ERR_STR_(clEnqueueCopyImage)\r\n#define __ENQUEUE_FILL_IMAGE_ERR            CL_HPP_ERR_STR_(clEnqueueFillImage)\r\n#define __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR  CL_HPP_ERR_STR_(clEnqueueCopyImageToBuffer)\r\n#define __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR  CL_HPP_ERR_STR_(clEnqueueCopyBufferToImage)\r\n#define __ENQUEUE_MAP_BUFFER_ERR            CL_HPP_ERR_STR_(clEnqueueMapBuffer)\r\n#define __ENQUEUE_MAP_SVM_ERR               CL_HPP_ERR_STR_(clEnqueueSVMMap)\r\n#define __ENQUEUE_FILL_SVM_ERR              CL_HPP_ERR_STR_(clEnqueueSVMMemFill)\r\n#define __ENQUEUE_COPY_SVM_ERR              CL_HPP_ERR_STR_(clEnqueueSVMMemcpy)\r\n#define __ENQUEUE_UNMAP_SVM_ERR             CL_HPP_ERR_STR_(clEnqueueSVMUnmap)\r\n#define __ENQUEUE_MAP_IMAGE_ERR             CL_HPP_ERR_STR_(clEnqueueMapImage)\r\n#define __ENQUEUE_UNMAP_MEM_OBJECT_ERR      CL_HPP_ERR_STR_(clEnqueueUnMapMemObject)\r\n#define __ENQUEUE_NDRANGE_KERNEL_ERR        CL_HPP_ERR_STR_(clEnqueueNDRangeKernel)\r\n#define __ENQUEUE_NATIVE_KERNEL             CL_HPP_ERR_STR_(clEnqueueNativeKernel)\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 120\r\n#define __ENQUEUE_MIGRATE_MEM_OBJECTS_ERR   CL_HPP_ERR_STR_(clEnqueueMigrateMemObjects)\r\n#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 210\r\n#define __ENQUEUE_MIGRATE_SVM_ERR   CL_HPP_ERR_STR_(clEnqueueSVMMigrateMem)\r\n#define __SET_DEFAULT_DEVICE_COMMAND_QUEUE_ERR   CL_HPP_ERR_STR_(clSetDefaultDeviceCommandQueue)\r\n#endif // CL_HPP_TARGET_OPENCL_VERSION >= 210\r\n\r\n\r\n#define __ENQUEUE_ACQUIRE_GL_ERR            CL_HPP_ERR_STR_(clEnqueueAcquireGLObjects)\r\n#define __ENQUEUE_RELEASE_GL_ERR            CL_HPP_ERR_STR_(clEnqueueReleaseGLObjects)\r\n\r\n#define __CREATE_PIPE_ERR             CL_HPP_ERR_STR_(clCreatePipe)\r\n#define __GET_PIPE_INFO_ERR           CL_HPP_ERR_STR_(clGetPipeInfo)\r\n\r\n#define __RETAIN_ERR                        CL_HPP_ERR_STR_(Retain Object)\r\n#define __RELEASE_ERR                       CL_HPP_ERR_STR_(Release 
Object)\r\n#define __FLUSH_ERR                         CL_HPP_ERR_STR_(clFlush)\r\n#define __FINISH_ERR                        CL_HPP_ERR_STR_(clFinish)\r\n#define __VECTOR_CAPACITY_ERR               CL_HPP_ERR_STR_(Vector capacity error)\r\n\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 210\r\n#define __GET_HOST_TIMER_ERR           CL_HPP_ERR_STR_(clGetHostTimer)\r\n#define __GET_DEVICE_AND_HOST_TIMER_ERR           CL_HPP_ERR_STR_(clGetDeviceAndHostTimer)\r\n#endif\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 220\r\n#define __SET_PROGRAM_RELEASE_CALLBACK_ERR          CL_HPP_ERR_STR_(clSetProgramReleaseCallback)\r\n#define __SET_PROGRAM_SPECIALIZATION_CONSTANT_ERR   CL_HPP_ERR_STR_(clSetProgramSpecializationConstant)\r\n#endif\r\n\r\n#ifdef cl_khr_external_memory\r\n#define __ENQUEUE_ACQUIRE_EXTERNAL_MEMORY_ERR       CL_HPP_ERR_STR_(clEnqueueAcquireExternalMemObjectsKHR)\r\n#define __ENQUEUE_RELEASE_EXTERNAL_MEMORY_ERR       CL_HPP_ERR_STR_(clEnqueueReleaseExternalMemObjectsKHR)\r\n#endif\r\n\r\n#ifdef cl_khr_semaphore\r\n#define __GET_SEMAPHORE_KHR_INFO_ERR                CL_HPP_ERR_STR_(clGetSemaphoreInfoKHR)\r\n#define __CREATE_SEMAPHORE_KHR_WITH_PROPERTIES_ERR  CL_HPP_ERR_STR_(clCreateSemaphoreWithPropertiesKHR)\r\n#define __ENQUEUE_WAIT_SEMAPHORE_KHR_ERR            CL_HPP_ERR_STR_(clEnqueueWaitSemaphoresKHR)\r\n#define __ENQUEUE_SIGNAL_SEMAPHORE_KHR_ERR          CL_HPP_ERR_STR_(clEnqueueSignalSemaphoresKHR)\r\n#define __RETAIN_SEMAPHORE_KHR_ERR                  CL_HPP_ERR_STR_(clRetainSemaphoreKHR)\r\n#define __RELEASE_SEMAPHORE_KHR_ERR                 CL_HPP_ERR_STR_(clReleaseSemaphoreKHR)\r\n#endif\r\n\r\n#ifdef cl_khr_external_semaphore\r\n#define __GET_SEMAPHORE_HANDLE_FOR_TYPE_KHR_ERR         CL_HPP_ERR_STR_(clGetSemaphoreHandleForTypeKHR)\r\n#endif // cl_khr_external_semaphore\r\n\r\n#if defined(cl_khr_command_buffer)\r\n#define __CREATE_COMMAND_BUFFER_KHR_ERR             CL_HPP_ERR_STR_(clCreateCommandBufferKHR)\r\n#define __GET_COMMAND_BUFFER_INFO_KHR_ERR           
CL_HPP_ERR_STR_(clGetCommandBufferInfoKHR)\r\n#define __FINALIZE_COMMAND_BUFFER_KHR_ERR           CL_HPP_ERR_STR_(clFinalizeCommandBufferKHR)\r\n#define __ENQUEUE_COMMAND_BUFFER_KHR_ERR            CL_HPP_ERR_STR_(clEnqueueCommandBufferKHR)\r\n#define __COMMAND_BARRIER_WITH_WAIT_LIST_KHR_ERR    CL_HPP_ERR_STR_(clCommandBarrierWithWaitListKHR)\r\n#define __COMMAND_COPY_BUFFER_KHR_ERR               CL_HPP_ERR_STR_(clCommandCopyBufferKHR)\r\n#define __COMMAND_COPY_BUFFER_RECT_KHR_ERR          CL_HPP_ERR_STR_(clCommandCopyBufferRectKHR)\r\n#define __COMMAND_COPY_BUFFER_TO_IMAGE_KHR_ERR      CL_HPP_ERR_STR_(clCommandCopyBufferToImageKHR)\r\n#define __COMMAND_COPY_IMAGE_KHR_ERR                CL_HPP_ERR_STR_(clCommandCopyImageKHR)\r\n#define __COMMAND_COPY_IMAGE_TO_BUFFER_KHR_ERR      CL_HPP_ERR_STR_(clCommandCopyImageToBufferKHR)\r\n#define __COMMAND_FILL_BUFFER_KHR_ERR               CL_HPP_ERR_STR_(clCommandFillBufferKHR)\r\n#define __COMMAND_FILL_IMAGE_KHR_ERR                CL_HPP_ERR_STR_(clCommandFillImageKHR)\r\n#define __COMMAND_NDRANGE_KERNEL_KHR_ERR            CL_HPP_ERR_STR_(clCommandNDRangeKernelKHR)\r\n#define __UPDATE_MUTABLE_COMMANDS_KHR_ERR           CL_HPP_ERR_STR_(clUpdateMutableCommandsKHR)\r\n#define __GET_MUTABLE_COMMAND_INFO_KHR_ERR          CL_HPP_ERR_STR_(clGetMutableCommandInfoKHR)\r\n#define __RETAIN_COMMAND_BUFFER_KHR_ERR             CL_HPP_ERR_STR_(clRetainCommandBufferKHR)\r\n#define __RELEASE_COMMAND_BUFFER_KHR_ERR            CL_HPP_ERR_STR_(clReleaseCommandBufferKHR)\r\n#endif // cl_khr_command_buffer\r\n\r\n#if defined(cl_ext_image_requirements_info)\r\n#define __GET_IMAGE_REQUIREMENT_INFO_EXT_ERR            CL_HPP_ERR_STR_(clGetImageRequirementsInfoEXT)\r\n#endif //cl_ext_image_requirements_info\r\n\r\n/**\r\n * CL 1.2 version that uses device fission.\r\n */\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 120\r\n#define __CREATE_SUB_DEVICES_ERR            CL_HPP_ERR_STR_(clCreateSubDevices)\r\n#else\r\n#define __CREATE_SUB_DEVICES_ERR            
CL_HPP_ERR_STR_(clCreateSubDevicesEXT)\r\n#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120\r\n\r\n/**\r\n * Deprecated APIs for 1.2\r\n */\r\n#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)\r\n#define __ENQUEUE_MARKER_ERR                CL_HPP_ERR_STR_(clEnqueueMarker)\r\n#define __ENQUEUE_WAIT_FOR_EVENTS_ERR       CL_HPP_ERR_STR_(clEnqueueWaitForEvents)\r\n#define __ENQUEUE_BARRIER_ERR               CL_HPP_ERR_STR_(clEnqueueBarrier)\r\n#define __UNLOAD_COMPILER_ERR               CL_HPP_ERR_STR_(clUnloadCompiler)\r\n#define __CREATE_GL_TEXTURE_2D_ERR          CL_HPP_ERR_STR_(clCreateFromGLTexture2D)\r\n#define __CREATE_GL_TEXTURE_3D_ERR          CL_HPP_ERR_STR_(clCreateFromGLTexture3D)\r\n#define __CREATE_IMAGE2D_ERR                CL_HPP_ERR_STR_(clCreateImage2D)\r\n#define __CREATE_IMAGE3D_ERR                CL_HPP_ERR_STR_(clCreateImage3D)\r\n#endif // #if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)\r\n\r\n/**\r\n * Deprecated APIs for 2.0\r\n */\r\n#if defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS)\r\n#define __CREATE_COMMAND_QUEUE_ERR          CL_HPP_ERR_STR_(clCreateCommandQueue)\r\n#define __ENQUEUE_TASK_ERR                  CL_HPP_ERR_STR_(clEnqueueTask)\r\n#define __CREATE_SAMPLER_ERR                CL_HPP_ERR_STR_(clCreateSampler)\r\n#endif // #if defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS)\r\n\r\n/**\r\n * CL 1.2 marker and barrier commands\r\n */\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 120\r\n#define __ENQUEUE_MARKER_WAIT_LIST_ERR                CL_HPP_ERR_STR_(clEnqueueMarkerWithWaitList)\r\n#define __ENQUEUE_BARRIER_WAIT_LIST_ERR               CL_HPP_ERR_STR_(clEnqueueBarrierWithWaitList)\r\n#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120\r\n\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 210\r\n#define __CLONE_KERNEL_ERR     CL_HPP_ERR_STR_(clCloneKernel)\r\n#endif // CL_HPP_TARGET_OPENCL_VERSION >= 210\r\n\r\n#endif // CL_HPP_USER_OVERRIDE_ERROR_STRINGS\r\n//! 
\\endcond\r\n\r\n#ifdef cl_khr_external_memory\r\nCL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clEnqueueAcquireExternalMemObjectsKHR);\r\nCL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clEnqueueReleaseExternalMemObjectsKHR);\r\n\r\nCL_HPP_DEFINE_STATIC_MEMBER_ PFN_clEnqueueAcquireExternalMemObjectsKHR pfn_clEnqueueAcquireExternalMemObjectsKHR = nullptr;\r\nCL_HPP_DEFINE_STATIC_MEMBER_ PFN_clEnqueueReleaseExternalMemObjectsKHR pfn_clEnqueueReleaseExternalMemObjectsKHR = nullptr;\r\n#endif // cl_khr_external_memory\r\n\r\n#ifdef cl_khr_semaphore\r\nCL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clCreateSemaphoreWithPropertiesKHR);\r\nCL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clReleaseSemaphoreKHR);\r\nCL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clRetainSemaphoreKHR);\r\nCL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clEnqueueWaitSemaphoresKHR);\r\nCL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clEnqueueSignalSemaphoresKHR);\r\nCL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clGetSemaphoreInfoKHR);\r\n\r\nCL_HPP_DEFINE_STATIC_MEMBER_ PFN_clCreateSemaphoreWithPropertiesKHR pfn_clCreateSemaphoreWithPropertiesKHR  = nullptr;\r\nCL_HPP_DEFINE_STATIC_MEMBER_ PFN_clReleaseSemaphoreKHR              pfn_clReleaseSemaphoreKHR               = nullptr;\r\nCL_HPP_DEFINE_STATIC_MEMBER_ PFN_clRetainSemaphoreKHR               pfn_clRetainSemaphoreKHR                = nullptr;\r\nCL_HPP_DEFINE_STATIC_MEMBER_ PFN_clEnqueueWaitSemaphoresKHR         pfn_clEnqueueWaitSemaphoresKHR          = nullptr;\r\nCL_HPP_DEFINE_STATIC_MEMBER_ PFN_clEnqueueSignalSemaphoresKHR       pfn_clEnqueueSignalSemaphoresKHR        = nullptr;\r\nCL_HPP_DEFINE_STATIC_MEMBER_ PFN_clGetSemaphoreInfoKHR              pfn_clGetSemaphoreInfoKHR               = nullptr;\r\n#endif // cl_khr_semaphore\r\n\r\n#ifdef cl_khr_external_semaphore\r\nCL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clGetSemaphoreHandleForTypeKHR);\r\nCL_HPP_DEFINE_STATIC_MEMBER_ PFN_clGetSemaphoreHandleForTypeKHR     pfn_clGetSemaphoreHandleForTypeKHR      = nullptr;\r\n#endif // cl_khr_external_semaphore\r\n\r\n#if 
defined(cl_khr_command_buffer)\r\nCL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clCreateCommandBufferKHR);\r\nCL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clFinalizeCommandBufferKHR);\r\nCL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clRetainCommandBufferKHR);\r\nCL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clReleaseCommandBufferKHR);\r\nCL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clGetCommandBufferInfoKHR);\r\nCL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clEnqueueCommandBufferKHR);\r\nCL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clCommandBarrierWithWaitListKHR);\r\nCL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clCommandCopyBufferKHR);\r\nCL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clCommandCopyBufferRectKHR);\r\nCL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clCommandCopyBufferToImageKHR);\r\nCL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clCommandCopyImageKHR);\r\nCL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clCommandCopyImageToBufferKHR);\r\nCL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clCommandFillBufferKHR);\r\nCL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clCommandFillImageKHR);\r\nCL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clCommandNDRangeKernelKHR);\r\n\r\nCL_HPP_DEFINE_STATIC_MEMBER_ PFN_clCreateCommandBufferKHR pfn_clCreateCommandBufferKHR               = nullptr;\r\nCL_HPP_DEFINE_STATIC_MEMBER_ PFN_clFinalizeCommandBufferKHR pfn_clFinalizeCommandBufferKHR           = nullptr;\r\nCL_HPP_DEFINE_STATIC_MEMBER_ PFN_clRetainCommandBufferKHR pfn_clRetainCommandBufferKHR               = nullptr;\r\nCL_HPP_DEFINE_STATIC_MEMBER_ PFN_clReleaseCommandBufferKHR pfn_clReleaseCommandBufferKHR             = nullptr;\r\nCL_HPP_DEFINE_STATIC_MEMBER_ PFN_clGetCommandBufferInfoKHR pfn_clGetCommandBufferInfoKHR             = nullptr;\r\nCL_HPP_DEFINE_STATIC_MEMBER_ PFN_clEnqueueCommandBufferKHR pfn_clEnqueueCommandBufferKHR             = nullptr;\r\nCL_HPP_DEFINE_STATIC_MEMBER_ PFN_clCommandBarrierWithWaitListKHR pfn_clCommandBarrierWithWaitListKHR = nullptr;\r\nCL_HPP_DEFINE_STATIC_MEMBER_ PFN_clCommandCopyBufferKHR pfn_clCommandCopyBufferKHR                   = nullptr;\r\nCL_HPP_DEFINE_STATIC_MEMBER_ 
PFN_clCommandCopyBufferRectKHR pfn_clCommandCopyBufferRectKHR           = nullptr;\r\nCL_HPP_DEFINE_STATIC_MEMBER_ PFN_clCommandCopyBufferToImageKHR pfn_clCommandCopyBufferToImageKHR     = nullptr;\r\nCL_HPP_DEFINE_STATIC_MEMBER_ PFN_clCommandCopyImageKHR pfn_clCommandCopyImageKHR                     = nullptr;\r\nCL_HPP_DEFINE_STATIC_MEMBER_ PFN_clCommandCopyImageToBufferKHR pfn_clCommandCopyImageToBufferKHR     = nullptr;\r\nCL_HPP_DEFINE_STATIC_MEMBER_ PFN_clCommandFillBufferKHR pfn_clCommandFillBufferKHR                   = nullptr;\r\nCL_HPP_DEFINE_STATIC_MEMBER_ PFN_clCommandFillImageKHR pfn_clCommandFillImageKHR                     = nullptr;\r\nCL_HPP_DEFINE_STATIC_MEMBER_ PFN_clCommandNDRangeKernelKHR pfn_clCommandNDRangeKernelKHR             = nullptr;\r\n#endif /* cl_khr_command_buffer */\r\n\r\n#if defined(cl_khr_command_buffer_mutable_dispatch)\r\nCL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clUpdateMutableCommandsKHR);\r\nCL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clGetMutableCommandInfoKHR);\r\n\r\nCL_HPP_DEFINE_STATIC_MEMBER_ PFN_clUpdateMutableCommandsKHR pfn_clUpdateMutableCommandsKHR           = nullptr;\r\nCL_HPP_DEFINE_STATIC_MEMBER_ PFN_clGetMutableCommandInfoKHR pfn_clGetMutableCommandInfoKHR           = nullptr;\r\n#endif /* cl_khr_command_buffer_mutable_dispatch */\r\n\r\n#if defined(cl_ext_image_requirements_info)\r\nCL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clGetImageRequirementsInfoEXT);\r\nCL_HPP_DEFINE_STATIC_MEMBER_ PFN_clGetImageRequirementsInfoEXT pfn_clGetImageRequirementsInfoEXT  = nullptr;\r\n#endif\r\n\r\n#if defined(cl_ext_device_fission)\r\nCL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_(clCreateSubDevicesEXT);\r\nCL_HPP_DEFINE_STATIC_MEMBER_ PFN_clCreateSubDevicesEXT\r\n    pfn_clCreateSubDevicesEXT = nullptr;\r\n#endif\r\n\r\nnamespace detail {\r\n\r\n// Generic getInfoHelper. 
The final parameter is used to guide overload\r\n// resolution: the actual parameter passed is an int, which makes this\r\n// a worse conversion sequence than a specialization that declares the\r\n// parameter as an int.\r\ntemplate<typename Functor, typename T>\r\ninline cl_int getInfoHelper(Functor f, cl_uint name, T* param, long)\r\n{\r\n    return f(name, sizeof(T), param, nullptr);\r\n}\r\n\r\n// Specialized for getInfo<CL_PROGRAM_BINARIES>\r\n// Assumes that the output vector was correctly resized on the way in\r\ntemplate <typename Func>\r\ninline cl_int getInfoHelper(Func f, cl_uint name, vector<vector<unsigned char>>* param, int)\r\n{\r\n    if (name != CL_PROGRAM_BINARIES) {\r\n        return CL_INVALID_VALUE;\r\n    }\r\n    if (param) {\r\n        // Create array of pointers, calculate total size and pass pointer array in\r\n        size_type numBinaries = param->size();\r\n        vector<unsigned char*> binariesPointers(numBinaries);\r\n\r\n        for (size_type i = 0; i < numBinaries; ++i)\r\n        {\r\n            binariesPointers[i] = (*param)[i].data();\r\n        }\r\n\r\n        cl_int err = f(name, numBinaries * sizeof(unsigned char*), binariesPointers.data(), nullptr);\r\n\r\n        if (err != CL_SUCCESS) {\r\n            return err;\r\n        }\r\n    }\r\n\r\n    return CL_SUCCESS;\r\n}\r\n\r\n// Specialized getInfoHelper for vector params\r\ntemplate <typename Func, typename T>\r\ninline cl_int getInfoHelper(Func f, cl_uint name, vector<T>* param, long)\r\n{\r\n    size_type required;\r\n    cl_int err = f(name, 0, nullptr, &required);\r\n    if (err != CL_SUCCESS) {\r\n        return err;\r\n    }\r\n    const size_type elements = required / sizeof(T);\r\n\r\n    // Temporary to avoid changing param on an error\r\n    vector<T> localData(elements);\r\n    err = f(name, required, localData.data(), nullptr);\r\n    if (err != CL_SUCCESS) {\r\n        return err;\r\n    }\r\n    if (param) {\r\n        *param = std::move(localData);\r\n   
 }\r\n\r\n    return CL_SUCCESS;\r\n}\r\n\r\n/* Specialization for reference-counted types. This depends on the\r\n * existence of Wrapper<T>::cl_type, and none of the other types having the\r\n * cl_type member. Note that simply specifying the parameter as Wrapper<T>\r\n * does not work, because when using a derived type (e.g. Context) the generic\r\n * template will provide a better match.\r\n */\r\ntemplate <typename Func, typename T>\r\ninline cl_int getInfoHelper(\r\n    Func f, cl_uint name, vector<T>* param, int, typename T::cl_type = 0)\r\n{\r\n    size_type required;\r\n    cl_int err = f(name, 0, nullptr, &required);\r\n    if (err != CL_SUCCESS) {\r\n        return err;\r\n    }\r\n\r\n    const size_type elements = required / sizeof(typename T::cl_type);\r\n\r\n    vector<typename T::cl_type> value(elements);\r\n    err = f(name, required, value.data(), nullptr);\r\n    if (err != CL_SUCCESS) {\r\n        return err;\r\n    }\r\n\r\n    if (param) {\r\n        // Assign to convert CL type to T for each element\r\n        param->resize(elements);\r\n\r\n        // Assign to param, constructing with retain behaviour\r\n        // to correctly capture each underlying CL object\r\n        for (size_type i = 0; i < elements; i++) {\r\n            (*param)[i] = T(value[i], true);\r\n        }\r\n    }\r\n    return CL_SUCCESS;\r\n}\r\n\r\n// Specialized GetInfoHelper for string params\r\ntemplate <typename Func>\r\ninline cl_int getInfoHelper(Func f, cl_uint name, string* param, long)\r\n{\r\n    size_type required;\r\n    cl_int err = f(name, 0, nullptr, &required);\r\n    if (err != CL_SUCCESS) {\r\n        return err;\r\n    }\r\n\r\n    // std::string has a constant data member\r\n    // a char vector does not\r\n    if (required > 0) {\r\n        vector<char> value(required);\r\n        err = f(name, required, value.data(), nullptr);\r\n        if (err != CL_SUCCESS) {\r\n            return err;\r\n        }\r\n        if (param) {\r\n            
param->assign(value.begin(), value.end() - 1);\r\n        }\r\n    }\r\n    else if (param) {\r\n        param->assign(\"\");\r\n    }\r\n    return CL_SUCCESS;\r\n}\r\n\r\n// Specialized GetInfoHelper for array<size_type, N> params\r\ntemplate <typename Func, size_type N>\r\ninline cl_int getInfoHelper(Func f, cl_uint name, array<size_type, N>* param, long)\r\n{\r\n    size_type required;\r\n    cl_int err = f(name, 0, nullptr, &required);\r\n    if (err != CL_SUCCESS) {\r\n        return err;\r\n    }\r\n\r\n    size_type elements = required / sizeof(size_type);\r\n    vector<size_type> value(elements, 0);\r\n\r\n    err = f(name, required, value.data(), nullptr);\r\n    if (err != CL_SUCCESS) {\r\n        return err;\r\n    }\r\n    \r\n    // Bound the copy with N to prevent overruns\r\n    // if the query returned more than N elements\r\n    if (elements > N) {\r\n        elements = N;\r\n    }\r\n    for (size_type i = 0; i < elements; ++i) {\r\n        (*param)[i] = value[i];\r\n    }\r\n\r\n    return CL_SUCCESS;\r\n}\r\n\r\ntemplate<typename T> struct ReferenceHandler;\r\n\r\n/* Specialization for reference-counted types. This depends on the\r\n * existence of Wrapper<T>::cl_type, and none of the other types having the\r\n * cl_type member. Note that simply specifying the parameter as Wrapper<T>\r\n * does not work, because when using a derived type (e.g. 
Context) the generic\r\n * template will provide a better match.\r\n */\r\ntemplate<typename Func, typename T>\r\ninline cl_int getInfoHelper(Func f, cl_uint name, T* param, int, typename T::cl_type = 0)\r\n{\r\n    typename T::cl_type value;\r\n    cl_int err = f(name, sizeof(value), &value, nullptr);\r\n    if (err != CL_SUCCESS) {\r\n        return err;\r\n    }\r\n    *param = value;\r\n    if (value != nullptr)\r\n    {\r\n        err = param->retain();\r\n        if (err != CL_SUCCESS) {\r\n            return err;\r\n        }\r\n    }\r\n    return CL_SUCCESS;\r\n}\r\n\r\n#define CL_HPP_PARAM_NAME_INFO_1_0_(F) \\\r\n    F(cl_platform_info, CL_PLATFORM_PROFILE, string) \\\r\n    F(cl_platform_info, CL_PLATFORM_VERSION, string) \\\r\n    F(cl_platform_info, CL_PLATFORM_NAME, string) \\\r\n    F(cl_platform_info, CL_PLATFORM_VENDOR, string) \\\r\n    F(cl_platform_info, CL_PLATFORM_EXTENSIONS, string) \\\r\n    \\\r\n    F(cl_device_info, CL_DEVICE_TYPE, cl_device_type) \\\r\n    F(cl_device_info, CL_DEVICE_VENDOR_ID, cl_uint) \\\r\n    F(cl_device_info, CL_DEVICE_MAX_COMPUTE_UNITS, cl_uint) \\\r\n    F(cl_device_info, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, cl_uint) \\\r\n    F(cl_device_info, CL_DEVICE_MAX_WORK_GROUP_SIZE, size_type) \\\r\n    F(cl_device_info, CL_DEVICE_MAX_WORK_ITEM_SIZES, cl::vector<size_type>) \\\r\n    F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, cl_uint) \\\r\n    F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, cl_uint) \\\r\n    F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, cl_uint) \\\r\n    F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, cl_uint) \\\r\n    F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, cl_uint) \\\r\n    F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, cl_uint) \\\r\n    F(cl_device_info, CL_DEVICE_MAX_CLOCK_FREQUENCY, cl_uint) \\\r\n    F(cl_device_info, CL_DEVICE_ADDRESS_BITS, cl_uint) \\\r\n    F(cl_device_info, CL_DEVICE_MAX_READ_IMAGE_ARGS, cl_uint) 
\\\r\n    F(cl_device_info, CL_DEVICE_MAX_WRITE_IMAGE_ARGS, cl_uint) \\\r\n    F(cl_device_info, CL_DEVICE_MAX_MEM_ALLOC_SIZE, cl_ulong) \\\r\n    F(cl_device_info, CL_DEVICE_IMAGE2D_MAX_WIDTH, size_type) \\\r\n    F(cl_device_info, CL_DEVICE_IMAGE2D_MAX_HEIGHT, size_type) \\\r\n    F(cl_device_info, CL_DEVICE_IMAGE3D_MAX_WIDTH, size_type) \\\r\n    F(cl_device_info, CL_DEVICE_IMAGE3D_MAX_HEIGHT, size_type) \\\r\n    F(cl_device_info, CL_DEVICE_IMAGE3D_MAX_DEPTH, size_type) \\\r\n    F(cl_device_info, CL_DEVICE_IMAGE_SUPPORT, cl_bool) \\\r\n    F(cl_device_info, CL_DEVICE_MAX_PARAMETER_SIZE, size_type) \\\r\n    F(cl_device_info, CL_DEVICE_MAX_SAMPLERS, cl_uint) \\\r\n    F(cl_device_info, CL_DEVICE_MEM_BASE_ADDR_ALIGN, cl_uint) \\\r\n    F(cl_device_info, CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE, cl_uint) \\\r\n    F(cl_device_info, CL_DEVICE_SINGLE_FP_CONFIG, cl_device_fp_config) \\\r\n    F(cl_device_info, CL_DEVICE_DOUBLE_FP_CONFIG, cl_device_fp_config) \\\r\n    F(cl_device_info, CL_DEVICE_HALF_FP_CONFIG, cl_device_fp_config) \\\r\n    F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHE_TYPE, cl_device_mem_cache_type) \\\r\n    F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, cl_uint)\\\r\n    F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, cl_ulong) \\\r\n    F(cl_device_info, CL_DEVICE_GLOBAL_MEM_SIZE, cl_ulong) \\\r\n    F(cl_device_info, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, cl_ulong) \\\r\n    F(cl_device_info, CL_DEVICE_MAX_CONSTANT_ARGS, cl_uint) \\\r\n    F(cl_device_info, CL_DEVICE_LOCAL_MEM_TYPE, cl_device_local_mem_type) \\\r\n    F(cl_device_info, CL_DEVICE_LOCAL_MEM_SIZE, cl_ulong) \\\r\n    F(cl_device_info, CL_DEVICE_ERROR_CORRECTION_SUPPORT, cl_bool) \\\r\n    F(cl_device_info, CL_DEVICE_PROFILING_TIMER_RESOLUTION, size_type) \\\r\n    F(cl_device_info, CL_DEVICE_ENDIAN_LITTLE, cl_bool) \\\r\n    F(cl_device_info, CL_DEVICE_AVAILABLE, cl_bool) \\\r\n    F(cl_device_info, CL_DEVICE_COMPILER_AVAILABLE, cl_bool) \\\r\n    F(cl_device_info, 
CL_DEVICE_EXECUTION_CAPABILITIES, cl_device_exec_capabilities) \\\r\n    F(cl_device_info, CL_DEVICE_PLATFORM, cl::Platform) \\\r\n    F(cl_device_info, CL_DEVICE_NAME, string) \\\r\n    F(cl_device_info, CL_DEVICE_VENDOR, string) \\\r\n    F(cl_device_info, CL_DRIVER_VERSION, string) \\\r\n    F(cl_device_info, CL_DEVICE_PROFILE, string) \\\r\n    F(cl_device_info, CL_DEVICE_VERSION, string) \\\r\n    F(cl_device_info, CL_DEVICE_EXTENSIONS, string) \\\r\n    \\\r\n    F(cl_context_info, CL_CONTEXT_REFERENCE_COUNT, cl_uint) \\\r\n    F(cl_context_info, CL_CONTEXT_DEVICES, cl::vector<Device>) \\\r\n    F(cl_context_info, CL_CONTEXT_PROPERTIES, cl::vector<cl_context_properties>) \\\r\n    \\\r\n    F(cl_event_info, CL_EVENT_COMMAND_QUEUE, cl::CommandQueue) \\\r\n    F(cl_event_info, CL_EVENT_COMMAND_TYPE, cl_command_type) \\\r\n    F(cl_event_info, CL_EVENT_REFERENCE_COUNT, cl_uint) \\\r\n    F(cl_event_info, CL_EVENT_COMMAND_EXECUTION_STATUS, cl_int) \\\r\n    \\\r\n    F(cl_profiling_info, CL_PROFILING_COMMAND_QUEUED, cl_ulong) \\\r\n    F(cl_profiling_info, CL_PROFILING_COMMAND_SUBMIT, cl_ulong) \\\r\n    F(cl_profiling_info, CL_PROFILING_COMMAND_START, cl_ulong) \\\r\n    F(cl_profiling_info, CL_PROFILING_COMMAND_END, cl_ulong) \\\r\n    \\\r\n    F(cl_mem_info, CL_MEM_TYPE, cl_mem_object_type) \\\r\n    F(cl_mem_info, CL_MEM_FLAGS, cl_mem_flags) \\\r\n    F(cl_mem_info, CL_MEM_SIZE, size_type) \\\r\n    F(cl_mem_info, CL_MEM_HOST_PTR, void*) \\\r\n    F(cl_mem_info, CL_MEM_MAP_COUNT, cl_uint) \\\r\n    F(cl_mem_info, CL_MEM_REFERENCE_COUNT, cl_uint) \\\r\n    F(cl_mem_info, CL_MEM_CONTEXT, cl::Context) \\\r\n    \\\r\n    F(cl_image_info, CL_IMAGE_FORMAT, cl_image_format) \\\r\n    F(cl_image_info, CL_IMAGE_ELEMENT_SIZE, size_type) \\\r\n    F(cl_image_info, CL_IMAGE_ROW_PITCH, size_type) \\\r\n    F(cl_image_info, CL_IMAGE_SLICE_PITCH, size_type) \\\r\n    F(cl_image_info, CL_IMAGE_WIDTH, size_type) \\\r\n    F(cl_image_info, CL_IMAGE_HEIGHT, size_type) \\\r\n  
  F(cl_image_info, CL_IMAGE_DEPTH, size_type) \\\r\n    \\\r\n    F(cl_sampler_info, CL_SAMPLER_REFERENCE_COUNT, cl_uint) \\\r\n    F(cl_sampler_info, CL_SAMPLER_CONTEXT, cl::Context) \\\r\n    F(cl_sampler_info, CL_SAMPLER_NORMALIZED_COORDS, cl_bool) \\\r\n    F(cl_sampler_info, CL_SAMPLER_ADDRESSING_MODE, cl_addressing_mode) \\\r\n    F(cl_sampler_info, CL_SAMPLER_FILTER_MODE, cl_filter_mode) \\\r\n    \\\r\n    F(cl_program_info, CL_PROGRAM_REFERENCE_COUNT, cl_uint) \\\r\n    F(cl_program_info, CL_PROGRAM_CONTEXT, cl::Context) \\\r\n    F(cl_program_info, CL_PROGRAM_NUM_DEVICES, cl_uint) \\\r\n    F(cl_program_info, CL_PROGRAM_DEVICES, cl::vector<Device>) \\\r\n    F(cl_program_info, CL_PROGRAM_SOURCE, string) \\\r\n    F(cl_program_info, CL_PROGRAM_BINARY_SIZES, cl::vector<size_type>) \\\r\n    F(cl_program_info, CL_PROGRAM_BINARIES, cl::vector<cl::vector<unsigned char>>) \\\r\n    \\\r\n    F(cl_program_build_info, CL_PROGRAM_BUILD_STATUS, cl_build_status) \\\r\n    F(cl_program_build_info, CL_PROGRAM_BUILD_OPTIONS, string) \\\r\n    F(cl_program_build_info, CL_PROGRAM_BUILD_LOG, string) \\\r\n    \\\r\n    F(cl_kernel_info, CL_KERNEL_FUNCTION_NAME, string) \\\r\n    F(cl_kernel_info, CL_KERNEL_NUM_ARGS, cl_uint) \\\r\n    F(cl_kernel_info, CL_KERNEL_REFERENCE_COUNT, cl_uint) \\\r\n    F(cl_kernel_info, CL_KERNEL_CONTEXT, cl::Context) \\\r\n    F(cl_kernel_info, CL_KERNEL_PROGRAM, cl::Program) \\\r\n    \\\r\n    F(cl_kernel_work_group_info, CL_KERNEL_WORK_GROUP_SIZE, size_type) \\\r\n    F(cl_kernel_work_group_info, CL_KERNEL_COMPILE_WORK_GROUP_SIZE, cl::detail::size_t_array) \\\r\n    F(cl_kernel_work_group_info, CL_KERNEL_LOCAL_MEM_SIZE, cl_ulong) \\\r\n    \\\r\n    F(cl_command_queue_info, CL_QUEUE_CONTEXT, cl::Context) \\\r\n    F(cl_command_queue_info, CL_QUEUE_DEVICE, cl::Device) \\\r\n    F(cl_command_queue_info, CL_QUEUE_REFERENCE_COUNT, cl_uint) \\\r\n    F(cl_command_queue_info, CL_QUEUE_PROPERTIES, cl_command_queue_properties)\r\n\r\n\r\n#define 
CL_HPP_PARAM_NAME_INFO_1_1_(F) \\\r\n    F(cl_context_info, CL_CONTEXT_NUM_DEVICES, cl_uint)\\\r\n    F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF, cl_uint) \\\r\n    F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR, cl_uint) \\\r\n    F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT, cl_uint) \\\r\n    F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_INT, cl_uint) \\\r\n    F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG, cl_uint) \\\r\n    F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT, cl_uint) \\\r\n    F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE, cl_uint) \\\r\n    F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF, cl_uint) \\\r\n    F(cl_device_info, CL_DEVICE_OPENCL_C_VERSION, string) \\\r\n    \\\r\n    F(cl_mem_info, CL_MEM_ASSOCIATED_MEMOBJECT, cl::Memory) \\\r\n    F(cl_mem_info, CL_MEM_OFFSET, size_type) \\\r\n    \\\r\n    F(cl_kernel_work_group_info, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, size_type) \\\r\n    F(cl_kernel_work_group_info, CL_KERNEL_PRIVATE_MEM_SIZE, cl_ulong) \\\r\n    \\\r\n    F(cl_event_info, CL_EVENT_CONTEXT, cl::Context)\r\n\r\n#define CL_HPP_PARAM_NAME_INFO_1_2_(F) \\\r\n    F(cl_program_info, CL_PROGRAM_NUM_KERNELS, size_type) \\\r\n    F(cl_program_info, CL_PROGRAM_KERNEL_NAMES, string) \\\r\n    \\\r\n    F(cl_program_build_info, CL_PROGRAM_BINARY_TYPE, cl_program_binary_type) \\\r\n    \\\r\n    F(cl_kernel_info, CL_KERNEL_ATTRIBUTES, string) \\\r\n    \\\r\n    F(cl_kernel_arg_info, CL_KERNEL_ARG_ADDRESS_QUALIFIER, cl_kernel_arg_address_qualifier) \\\r\n    F(cl_kernel_arg_info, CL_KERNEL_ARG_ACCESS_QUALIFIER, cl_kernel_arg_access_qualifier) \\\r\n    F(cl_kernel_arg_info, CL_KERNEL_ARG_TYPE_NAME, string) \\\r\n    F(cl_kernel_arg_info, CL_KERNEL_ARG_NAME, string) \\\r\n    F(cl_kernel_arg_info, CL_KERNEL_ARG_TYPE_QUALIFIER, cl_kernel_arg_type_qualifier) \\\r\n    \\\r\n    F(cl_kernel_work_group_info, CL_KERNEL_GLOBAL_WORK_SIZE, cl::detail::size_t_array) \\\r\n    \\\r\n   
 F(cl_device_info, CL_DEVICE_LINKER_AVAILABLE, cl_bool) \\\r\n    F(cl_device_info, CL_DEVICE_IMAGE_MAX_BUFFER_SIZE, size_type) \\\r\n    F(cl_device_info, CL_DEVICE_IMAGE_MAX_ARRAY_SIZE, size_type) \\\r\n    F(cl_device_info, CL_DEVICE_PARENT_DEVICE, cl::Device) \\\r\n    F(cl_device_info, CL_DEVICE_PARTITION_MAX_SUB_DEVICES, cl_uint) \\\r\n    F(cl_device_info, CL_DEVICE_PARTITION_PROPERTIES, cl::vector<cl_device_partition_property>) \\\r\n    F(cl_device_info, CL_DEVICE_PARTITION_TYPE, cl::vector<cl_device_partition_property>)  \\\r\n    F(cl_device_info, CL_DEVICE_REFERENCE_COUNT, cl_uint) \\\r\n    F(cl_device_info, CL_DEVICE_PREFERRED_INTEROP_USER_SYNC, cl_bool) \\\r\n    F(cl_device_info, CL_DEVICE_PARTITION_AFFINITY_DOMAIN, cl_device_affinity_domain) \\\r\n    F(cl_device_info, CL_DEVICE_BUILT_IN_KERNELS, string) \\\r\n    F(cl_device_info, CL_DEVICE_PRINTF_BUFFER_SIZE, size_type) \\\r\n    \\\r\n    F(cl_image_info, CL_IMAGE_ARRAY_SIZE, size_type) \\\r\n    F(cl_image_info, CL_IMAGE_NUM_MIP_LEVELS, cl_uint) \\\r\n    F(cl_image_info, CL_IMAGE_NUM_SAMPLES, cl_uint)\r\n\r\n#define CL_HPP_PARAM_NAME_INFO_2_0_(F) \\\r\n    F(cl_device_info, CL_DEVICE_QUEUE_ON_HOST_PROPERTIES, cl_command_queue_properties) \\\r\n    F(cl_device_info, CL_DEVICE_QUEUE_ON_DEVICE_PROPERTIES, cl_command_queue_properties) \\\r\n    F(cl_device_info, CL_DEVICE_QUEUE_ON_DEVICE_PREFERRED_SIZE, cl_uint) \\\r\n    F(cl_device_info, CL_DEVICE_QUEUE_ON_DEVICE_MAX_SIZE, cl_uint) \\\r\n    F(cl_device_info, CL_DEVICE_MAX_ON_DEVICE_QUEUES, cl_uint) \\\r\n    F(cl_device_info, CL_DEVICE_MAX_ON_DEVICE_EVENTS, cl_uint) \\\r\n    F(cl_device_info, CL_DEVICE_MAX_PIPE_ARGS, cl_uint) \\\r\n    F(cl_device_info, CL_DEVICE_PIPE_MAX_ACTIVE_RESERVATIONS, cl_uint) \\\r\n    F(cl_device_info, CL_DEVICE_PIPE_MAX_PACKET_SIZE, cl_uint) \\\r\n    F(cl_device_info, CL_DEVICE_SVM_CAPABILITIES, cl_device_svm_capabilities) \\\r\n    F(cl_device_info, CL_DEVICE_PREFERRED_PLATFORM_ATOMIC_ALIGNMENT, cl_uint) \\\r\n    
F(cl_device_info, CL_DEVICE_PREFERRED_GLOBAL_ATOMIC_ALIGNMENT, cl_uint) \\\r\n    F(cl_device_info, CL_DEVICE_PREFERRED_LOCAL_ATOMIC_ALIGNMENT, cl_uint) \\\r\n    F(cl_device_info, CL_DEVICE_IMAGE_PITCH_ALIGNMENT, cl_uint) \\\r\n    F(cl_device_info, CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT, cl_uint) \\\r\n    F(cl_device_info, CL_DEVICE_MAX_READ_WRITE_IMAGE_ARGS, cl_uint ) \\\r\n    F(cl_device_info, CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE, size_type ) \\\r\n    F(cl_device_info, CL_DEVICE_GLOBAL_VARIABLE_PREFERRED_TOTAL_SIZE, size_type ) \\\r\n    F(cl_profiling_info, CL_PROFILING_COMMAND_COMPLETE, cl_ulong) \\\r\n    F(cl_kernel_exec_info, CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM, cl_bool) \\\r\n    F(cl_kernel_exec_info, CL_KERNEL_EXEC_INFO_SVM_PTRS, void**) \\\r\n    F(cl_command_queue_info, CL_QUEUE_SIZE, cl_uint) \\\r\n    F(cl_mem_info, CL_MEM_USES_SVM_POINTER, cl_bool) \\\r\n    F(cl_program_build_info, CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE, size_type) \\\r\n    F(cl_pipe_info, CL_PIPE_PACKET_SIZE, cl_uint) \\\r\n    F(cl_pipe_info, CL_PIPE_MAX_PACKETS, cl_uint)\r\n\r\n#define CL_HPP_PARAM_NAME_INFO_SUBGROUP_KHR_(F) \\\r\n    F(cl_kernel_sub_group_info, CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR, size_type) \\\r\n    F(cl_kernel_sub_group_info, CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE_KHR, size_type)\r\n\r\n#define CL_HPP_PARAM_NAME_INFO_IL_KHR_(F) \\\r\n    F(cl_device_info, CL_DEVICE_IL_VERSION_KHR, string) \\\r\n    F(cl_program_info, CL_PROGRAM_IL_KHR, cl::vector<unsigned char>)\r\n\r\n#define CL_HPP_PARAM_NAME_INFO_2_1_(F) \\\r\n    F(cl_platform_info, CL_PLATFORM_HOST_TIMER_RESOLUTION, cl_ulong) \\\r\n    F(cl_program_info, CL_PROGRAM_IL, cl::vector<unsigned char>) \\\r\n    F(cl_device_info, CL_DEVICE_MAX_NUM_SUB_GROUPS, cl_uint) \\\r\n    F(cl_device_info, CL_DEVICE_IL_VERSION, string) \\\r\n    F(cl_device_info, CL_DEVICE_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS, cl_bool) \\\r\n    F(cl_command_queue_info, CL_QUEUE_DEVICE_DEFAULT, 
cl::DeviceCommandQueue) \\\r\n    F(cl_kernel_sub_group_info, CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE, size_type) \\\r\n    F(cl_kernel_sub_group_info, CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE, size_type) \\\r\n    F(cl_kernel_sub_group_info, CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT, cl::detail::size_t_array) \\\r\n    F(cl_kernel_sub_group_info, CL_KERNEL_MAX_NUM_SUB_GROUPS, size_type) \\\r\n    F(cl_kernel_sub_group_info, CL_KERNEL_COMPILE_NUM_SUB_GROUPS, size_type)\r\n\r\n#define CL_HPP_PARAM_NAME_INFO_2_2_(F) \\\r\n    F(cl_program_info, CL_PROGRAM_SCOPE_GLOBAL_CTORS_PRESENT, cl_bool) \\\r\n    F(cl_program_info, CL_PROGRAM_SCOPE_GLOBAL_DTORS_PRESENT, cl_bool)\r\n\r\n#define CL_HPP_PARAM_NAME_DEVICE_FISSION_EXT_(F) \\\r\n    F(cl_device_info, CL_DEVICE_PARENT_DEVICE_EXT, cl::Device) \\\r\n    F(cl_device_info, CL_DEVICE_PARTITION_TYPES_EXT, cl::vector<cl_device_partition_property_ext>) \\\r\n    F(cl_device_info, CL_DEVICE_AFFINITY_DOMAINS_EXT, cl::vector<cl_device_partition_property_ext>) \\\r\n    F(cl_device_info, CL_DEVICE_REFERENCE_COUNT_EXT , cl_uint) \\\r\n    F(cl_device_info, CL_DEVICE_PARTITION_STYLE_EXT, cl::vector<cl_device_partition_property_ext>)\r\n\r\n#define CL_HPP_PARAM_NAME_CL_KHR_EXTENDED_VERSIONING_CL3_SHARED_(F) \\\r\n    F(cl_platform_info, CL_PLATFORM_NUMERIC_VERSION_KHR, cl_version_khr) \\\r\n    F(cl_platform_info, CL_PLATFORM_EXTENSIONS_WITH_VERSION_KHR, cl::vector<cl_name_version_khr>) \\\r\n    \\\r\n    F(cl_device_info, CL_DEVICE_NUMERIC_VERSION_KHR, cl_version_khr) \\\r\n    F(cl_device_info, CL_DEVICE_EXTENSIONS_WITH_VERSION_KHR, cl::vector<cl_name_version_khr>) \\\r\n    F(cl_device_info, CL_DEVICE_ILS_WITH_VERSION_KHR, cl::vector<cl_name_version_khr>) \\\r\n    F(cl_device_info, CL_DEVICE_BUILT_IN_KERNELS_WITH_VERSION_KHR, cl::vector<cl_name_version_khr>)\r\n\r\n#define CL_HPP_PARAM_NAME_CL_KHR_EXTENDED_VERSIONING_KHRONLY_(F) \\\r\n    F(cl_device_info, CL_DEVICE_OPENCL_C_NUMERIC_VERSION_KHR, cl_version_khr)\r\n\r\n// Note: the 
query for CL_SEMAPHORE_DEVICE_HANDLE_LIST_KHR is handled specially!\r\n#define CL_HPP_PARAM_NAME_CL_KHR_SEMAPHORE_(F) \\\r\n    F(cl_semaphore_info_khr, CL_SEMAPHORE_CONTEXT_KHR, cl::Context) \\\r\n    F(cl_semaphore_info_khr, CL_SEMAPHORE_REFERENCE_COUNT_KHR, cl_uint) \\\r\n    F(cl_semaphore_info_khr, CL_SEMAPHORE_PROPERTIES_KHR, cl::vector<cl_semaphore_properties_khr>) \\\r\n    F(cl_semaphore_info_khr, CL_SEMAPHORE_TYPE_KHR, cl_semaphore_type_khr) \\\r\n    F(cl_semaphore_info_khr, CL_SEMAPHORE_PAYLOAD_KHR, cl_semaphore_payload_khr) \\\r\n    F(cl_platform_info, CL_PLATFORM_SEMAPHORE_TYPES_KHR,  cl::vector<cl_semaphore_type_khr>) \\\r\n    F(cl_device_info, CL_DEVICE_SEMAPHORE_TYPES_KHR,      cl::vector<cl_semaphore_type_khr>) \\\r\n\r\n#define CL_HPP_PARAM_NAME_CL_KHR_EXTERNAL_MEMORY_(F) \\\r\n    F(cl_device_info, CL_DEVICE_EXTERNAL_MEMORY_IMPORT_HANDLE_TYPES_KHR, cl::vector<cl::ExternalMemoryType>) \\\r\n    F(cl_platform_info, CL_PLATFORM_EXTERNAL_MEMORY_IMPORT_HANDLE_TYPES_KHR, cl::vector<cl::ExternalMemoryType>)\r\n\r\n#define CL_HPP_PARAM_NAME_CL_KHR_EXTERNAL_SEMAPHORE_(F) \\\r\n    F(cl_platform_info, CL_PLATFORM_SEMAPHORE_IMPORT_HANDLE_TYPES_KHR,  cl::vector<cl_external_semaphore_handle_type_khr>) \\\r\n    F(cl_platform_info, CL_PLATFORM_SEMAPHORE_EXPORT_HANDLE_TYPES_KHR,  cl::vector<cl_external_semaphore_handle_type_khr>) \\\r\n    F(cl_device_info, CL_DEVICE_SEMAPHORE_IMPORT_HANDLE_TYPES_KHR,      cl::vector<cl_external_semaphore_handle_type_khr>) \\\r\n    F(cl_device_info, CL_DEVICE_SEMAPHORE_EXPORT_HANDLE_TYPES_KHR,      cl::vector<cl_external_semaphore_handle_type_khr>) \\\r\n    F(cl_semaphore_info_khr, CL_SEMAPHORE_EXPORT_HANDLE_TYPES_KHR,      cl::vector<cl_external_semaphore_handle_type_khr>) \\\r\n\r\n#define CL_HPP_PARAM_NAME_CL_KHR_EXTERNAL_SEMAPHORE_OPAQUE_FD_EXT(F) \\\r\n    F(cl_external_semaphore_handle_type_khr, CL_SEMAPHORE_HANDLE_OPAQUE_FD_KHR, int) \\\r\n\r\n#define CL_HPP_PARAM_NAME_CL_KHR_EXTERNAL_SEMAPHORE_SYNC_FD_EXT(F) \\\r\n 
   F(cl_external_semaphore_handle_type_khr, CL_SEMAPHORE_HANDLE_SYNC_FD_KHR, int) \\\r\n\r\n#define CL_HPP_PARAM_NAME_CL_KHR_EXTERNAL_SEMAPHORE_WIN32_EXT(F) \\\r\n    F(cl_external_semaphore_handle_type_khr, CL_SEMAPHORE_HANDLE_OPAQUE_WIN32_KHR, void*) \\\r\n    F(cl_external_semaphore_handle_type_khr, CL_SEMAPHORE_HANDLE_OPAQUE_WIN32_KMT_KHR, void*) \\\r\n\r\n#define CL_HPP_PARAM_NAME_INFO_3_0_(F) \\\r\n    F(cl_platform_info, CL_PLATFORM_NUMERIC_VERSION, cl_version) \\\r\n    F(cl_platform_info, CL_PLATFORM_EXTENSIONS_WITH_VERSION, cl::vector<cl_name_version>) \\\r\n    \\\r\n    F(cl_device_info, CL_DEVICE_NUMERIC_VERSION, cl_version) \\\r\n    F(cl_device_info, CL_DEVICE_EXTENSIONS_WITH_VERSION, cl::vector<cl_name_version>) \\\r\n    F(cl_device_info, CL_DEVICE_ILS_WITH_VERSION, cl::vector<cl_name_version>) \\\r\n    F(cl_device_info, CL_DEVICE_BUILT_IN_KERNELS_WITH_VERSION, cl::vector<cl_name_version>) \\\r\n    F(cl_device_info, CL_DEVICE_ATOMIC_MEMORY_CAPABILITIES, cl_device_atomic_capabilities) \\\r\n    F(cl_device_info, CL_DEVICE_ATOMIC_FENCE_CAPABILITIES, cl_device_atomic_capabilities) \\\r\n    F(cl_device_info, CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT, cl_bool) \\\r\n    F(cl_device_info, CL_DEVICE_OPENCL_C_ALL_VERSIONS, cl::vector<cl_name_version>) \\\r\n    F(cl_device_info, CL_DEVICE_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, size_type) \\\r\n    F(cl_device_info, CL_DEVICE_WORK_GROUP_COLLECTIVE_FUNCTIONS_SUPPORT, cl_bool) \\\r\n    F(cl_device_info, CL_DEVICE_GENERIC_ADDRESS_SPACE_SUPPORT, cl_bool) \\\r\n    F(cl_device_info, CL_DEVICE_OPENCL_C_FEATURES, cl::vector<cl_name_version>) \\\r\n    F(cl_device_info, CL_DEVICE_DEVICE_ENQUEUE_CAPABILITIES, cl_device_device_enqueue_capabilities) \\\r\n    F(cl_device_info, CL_DEVICE_PIPE_SUPPORT, cl_bool) \\\r\n    F(cl_device_info, CL_DEVICE_LATEST_CONFORMANCE_VERSION_PASSED, string) \\\r\n    \\\r\n    F(cl_command_queue_info, CL_QUEUE_PROPERTIES_ARRAY, cl::vector<cl_queue_properties>) \\\r\n    F(cl_mem_info, 
CL_MEM_PROPERTIES, cl::vector<cl_mem_properties>) \\\r\n    F(cl_pipe_info, CL_PIPE_PROPERTIES, cl::vector<cl_pipe_properties>) \\\r\n    F(cl_sampler_info, CL_SAMPLER_PROPERTIES, cl::vector<cl_sampler_properties>) \\\r\n\r\n#define CL_HPP_PARAM_NAME_CL_IMAGE_REQUIREMENTS_EXT(F) \\\r\n    F(cl_image_requirements_info_ext, CL_IMAGE_REQUIREMENTS_ROW_PITCH_ALIGNMENT_EXT, size_type) \\\r\n    F(cl_image_requirements_info_ext, CL_IMAGE_REQUIREMENTS_BASE_ADDRESS_ALIGNMENT_EXT, size_type) \\\r\n    F(cl_image_requirements_info_ext, CL_IMAGE_REQUIREMENTS_SIZE_EXT, size_type) \\\r\n    F(cl_image_requirements_info_ext, CL_IMAGE_REQUIREMENTS_MAX_WIDTH_EXT, cl_uint) \\\r\n    F(cl_image_requirements_info_ext, CL_IMAGE_REQUIREMENTS_MAX_HEIGHT_EXT, cl_uint) \\\r\n    F(cl_image_requirements_info_ext, CL_IMAGE_REQUIREMENTS_MAX_DEPTH_EXT, cl_uint) \\\r\n    F(cl_image_requirements_info_ext, CL_IMAGE_REQUIREMENTS_MAX_ARRAY_SIZE_EXT, cl_uint) \\\r\n\r\n#define CL_HPP_PARAM_NAME_CL_IMAGE_REQUIREMENTS_SLICE_PITCH_ALIGNMENT_EXT(F) \\\r\n    F(cl_image_requirements_info_ext, CL_IMAGE_REQUIREMENTS_SLICE_PITCH_ALIGNMENT_EXT, size_type) \\\r\n\r\n#define CL_HPP_PARAM_NAME_CL_INTEL_COMMAND_QUEUE_FAMILIES_(F) \\\r\n    F(cl_device_info, CL_DEVICE_QUEUE_FAMILY_PROPERTIES_INTEL, cl::vector<cl_queue_family_properties_intel>) \\\r\n    \\\r\n    F(cl_command_queue_info, CL_QUEUE_FAMILY_INTEL, cl_uint) \\\r\n    F(cl_command_queue_info, CL_QUEUE_INDEX_INTEL, cl_uint)\r\n\r\n#define CL_HPP_PARAM_NAME_CL_INTEL_UNIFIED_SHARED_MEMORY_(F) \\\r\n    F(cl_device_info, CL_DEVICE_HOST_MEM_CAPABILITIES_INTEL, cl_device_unified_shared_memory_capabilities_intel ) \\\r\n    F(cl_device_info, CL_DEVICE_DEVICE_MEM_CAPABILITIES_INTEL, cl_device_unified_shared_memory_capabilities_intel ) \\\r\n    F(cl_device_info, CL_DEVICE_SINGLE_DEVICE_SHARED_MEM_CAPABILITIES_INTEL, cl_device_unified_shared_memory_capabilities_intel ) \\\r\n    F(cl_device_info, CL_DEVICE_CROSS_DEVICE_SHARED_MEM_CAPABILITIES_INTEL, 
cl_device_unified_shared_memory_capabilities_intel ) \\\r\n    F(cl_device_info, CL_DEVICE_SHARED_SYSTEM_MEM_CAPABILITIES_INTEL, cl_device_unified_shared_memory_capabilities_intel )\r\n\r\ntemplate <typename enum_type, cl_int Name>\r\nstruct param_traits {};\r\n\r\n#define CL_HPP_DECLARE_PARAM_TRAITS_(token, param_name, T) \\\r\nstruct token;                                        \\\r\ntemplate<>                                           \\\r\nstruct param_traits<detail:: token,param_name>       \\\r\n{                                                    \\\r\n    enum { value = param_name };                     \\\r\n    typedef T param_type;                            \\\r\n};\r\n\r\nCL_HPP_PARAM_NAME_INFO_1_0_(CL_HPP_DECLARE_PARAM_TRAITS_)\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 110\r\nCL_HPP_PARAM_NAME_INFO_1_1_(CL_HPP_DECLARE_PARAM_TRAITS_)\r\n#endif // CL_HPP_TARGET_OPENCL_VERSION >= 110\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 120\r\nCL_HPP_PARAM_NAME_INFO_1_2_(CL_HPP_DECLARE_PARAM_TRAITS_)\r\n#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 200\r\nCL_HPP_PARAM_NAME_INFO_2_0_(CL_HPP_DECLARE_PARAM_TRAITS_)\r\n#endif // CL_HPP_TARGET_OPENCL_VERSION >= 200\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 210\r\nCL_HPP_PARAM_NAME_INFO_2_1_(CL_HPP_DECLARE_PARAM_TRAITS_)\r\n#endif // CL_HPP_TARGET_OPENCL_VERSION >= 210\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 220\r\nCL_HPP_PARAM_NAME_INFO_2_2_(CL_HPP_DECLARE_PARAM_TRAITS_)\r\n#endif // CL_HPP_TARGET_OPENCL_VERSION >= 220\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 300\r\nCL_HPP_PARAM_NAME_INFO_3_0_(CL_HPP_DECLARE_PARAM_TRAITS_)\r\n#endif // CL_HPP_TARGET_OPENCL_VERSION >= 300\r\n\r\n#if defined(cl_khr_subgroups) && CL_HPP_TARGET_OPENCL_VERSION < 210\r\nCL_HPP_PARAM_NAME_INFO_SUBGROUP_KHR_(CL_HPP_DECLARE_PARAM_TRAITS_)\r\n#endif // #if defined(cl_khr_subgroups) && CL_HPP_TARGET_OPENCL_VERSION < 210\r\n\r\n#if defined(cl_khr_il_program) && CL_HPP_TARGET_OPENCL_VERSION < 
210\r\nCL_HPP_PARAM_NAME_INFO_IL_KHR_(CL_HPP_DECLARE_PARAM_TRAITS_)\r\n#endif // #if defined(cl_khr_il_program) && CL_HPP_TARGET_OPENCL_VERSION < 210\r\n\r\n\r\n// Flags deprecated in OpenCL 2.0\r\n#define CL_HPP_PARAM_NAME_INFO_1_0_DEPRECATED_IN_2_0_(F) \\\r\n    F(cl_device_info, CL_DEVICE_QUEUE_PROPERTIES, cl_command_queue_properties)\r\n\r\n#define CL_HPP_PARAM_NAME_INFO_1_1_DEPRECATED_IN_2_0_(F) \\\r\n    F(cl_device_info, CL_DEVICE_HOST_UNIFIED_MEMORY, cl_bool)\r\n\r\n#define CL_HPP_PARAM_NAME_INFO_1_2_DEPRECATED_IN_2_0_(F) \\\r\n    F(cl_image_info, CL_IMAGE_BUFFER, cl::Buffer)\r\n\r\n// Include deprecated query flags based on versions\r\n// Only include deprecated 1.0 flags if 2.0 not active as there is an enum clash\r\n#if CL_HPP_TARGET_OPENCL_VERSION > 100 && CL_HPP_MINIMUM_OPENCL_VERSION < 200 && CL_HPP_TARGET_OPENCL_VERSION < 200\r\nCL_HPP_PARAM_NAME_INFO_1_0_DEPRECATED_IN_2_0_(CL_HPP_DECLARE_PARAM_TRAITS_)\r\n#endif // CL_HPP_MINIMUM_OPENCL_VERSION < 110\r\n#if CL_HPP_TARGET_OPENCL_VERSION > 110 && CL_HPP_MINIMUM_OPENCL_VERSION < 200\r\nCL_HPP_PARAM_NAME_INFO_1_1_DEPRECATED_IN_2_0_(CL_HPP_DECLARE_PARAM_TRAITS_)\r\n#endif // CL_HPP_MINIMUM_OPENCL_VERSION < 120\r\n#if CL_HPP_TARGET_OPENCL_VERSION > 120 && CL_HPP_MINIMUM_OPENCL_VERSION < 200\r\nCL_HPP_PARAM_NAME_INFO_1_2_DEPRECATED_IN_2_0_(CL_HPP_DECLARE_PARAM_TRAITS_)\r\n#endif // CL_HPP_MINIMUM_OPENCL_VERSION < 200\r\n\r\n#if defined(cl_ext_device_fission)\r\nCL_HPP_PARAM_NAME_DEVICE_FISSION_EXT_(CL_HPP_DECLARE_PARAM_TRAITS_)\r\n#endif // cl_ext_device_fission\r\n\r\n#if defined(cl_khr_extended_versioning)\r\n#if CL_HPP_TARGET_OPENCL_VERSION < 300\r\nCL_HPP_PARAM_NAME_CL_KHR_EXTENDED_VERSIONING_CL3_SHARED_(CL_HPP_DECLARE_PARAM_TRAITS_)\r\n#endif // CL_HPP_TARGET_OPENCL_VERSION < 300\r\nCL_HPP_PARAM_NAME_CL_KHR_EXTENDED_VERSIONING_KHRONLY_(CL_HPP_DECLARE_PARAM_TRAITS_)\r\n#endif // cl_khr_extended_versioning\r\n\r\n#if 
defined(cl_khr_semaphore)\r\nCL_HPP_PARAM_NAME_CL_KHR_SEMAPHORE_(CL_HPP_DECLARE_PARAM_TRAITS_)\r\n#if defined(CL_SEMAPHORE_DEVICE_HANDLE_LIST_KHR)\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_semaphore_info_khr, CL_SEMAPHORE_DEVICE_HANDLE_LIST_KHR, cl::vector<cl::Device>)\r\n#endif // defined(CL_SEMAPHORE_DEVICE_HANDLE_LIST_KHR)\r\n#endif // defined(cl_khr_semaphore)\r\n\r\n#ifdef cl_khr_external_memory\r\nCL_HPP_PARAM_NAME_CL_KHR_EXTERNAL_MEMORY_(CL_HPP_DECLARE_PARAM_TRAITS_)\r\n#endif // cl_khr_external_memory\r\n\r\n#if defined(cl_khr_external_semaphore)\r\nCL_HPP_PARAM_NAME_CL_KHR_EXTERNAL_SEMAPHORE_(CL_HPP_DECLARE_PARAM_TRAITS_)\r\n#endif // cl_khr_external_semaphore\r\n\r\n#if defined(cl_khr_external_semaphore_opaque_fd)\r\nCL_HPP_PARAM_NAME_CL_KHR_EXTERNAL_SEMAPHORE_OPAQUE_FD_EXT(CL_HPP_DECLARE_PARAM_TRAITS_)\r\n#endif // cl_khr_external_semaphore_opaque_fd\r\n#if defined(cl_khr_external_semaphore_sync_fd)\r\nCL_HPP_PARAM_NAME_CL_KHR_EXTERNAL_SEMAPHORE_SYNC_FD_EXT(CL_HPP_DECLARE_PARAM_TRAITS_)\r\n#endif // cl_khr_external_semaphore_sync_fd\r\n#if defined(cl_khr_external_semaphore_win32)\r\nCL_HPP_PARAM_NAME_CL_KHR_EXTERNAL_SEMAPHORE_WIN32_EXT(CL_HPP_DECLARE_PARAM_TRAITS_)\r\n#endif // cl_khr_external_semaphore_win32\r\n\r\n#if defined(cl_khr_device_uuid)\r\nusing uuid_array = array<cl_uchar, CL_UUID_SIZE_KHR>;\r\nusing luid_array = array<cl_uchar, CL_LUID_SIZE_KHR>;\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_UUID_KHR, uuid_array)\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DRIVER_UUID_KHR, uuid_array)\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_LUID_VALID_KHR, cl_bool)\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_LUID_KHR, luid_array)\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_NODE_MASK_KHR, cl_uint)\r\n#endif\r\n\r\n#if defined(cl_khr_pci_bus_info)\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_PCI_BUS_INFO_KHR, cl_device_pci_bus_info_khr)\r\n#endif\r\n\r\n// Note: some headers do not define 
cl_khr_image2d_from_buffer\r\n#if CL_HPP_TARGET_OPENCL_VERSION < 200\r\n#if defined(CL_DEVICE_IMAGE_PITCH_ALIGNMENT_KHR)\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_IMAGE_PITCH_ALIGNMENT_KHR, cl_uint)\r\n#endif\r\n#if defined(CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT_KHR)\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT_KHR, cl_uint)\r\n#endif\r\n#endif // CL_HPP_TARGET_OPENCL_VERSION < 200\r\n\r\n#if defined(cl_khr_integer_dot_product)\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_INTEGER_DOT_PRODUCT_CAPABILITIES_KHR, cl_device_integer_dot_product_capabilities_khr)\r\n#if defined(CL_DEVICE_INTEGER_DOT_PRODUCT_ACCELERATION_PROPERTIES_8BIT_KHR)\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_INTEGER_DOT_PRODUCT_ACCELERATION_PROPERTIES_8BIT_KHR, cl_device_integer_dot_product_acceleration_properties_khr)\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_INTEGER_DOT_PRODUCT_ACCELERATION_PROPERTIES_4x8BIT_PACKED_KHR, cl_device_integer_dot_product_acceleration_properties_khr)\r\n#endif // defined(CL_DEVICE_INTEGER_DOT_PRODUCT_ACCELERATION_PROPERTIES_8BIT_KHR)\r\n#endif // defined(cl_khr_integer_dot_product)\r\n\r\n#if defined(cl_ext_image_requirements_info)\r\nCL_HPP_PARAM_NAME_CL_IMAGE_REQUIREMENTS_EXT(CL_HPP_DECLARE_PARAM_TRAITS_)\r\n#endif // cl_ext_image_requirements_info\r\n\r\n#if defined(cl_ext_image_from_buffer)\r\nCL_HPP_PARAM_NAME_CL_IMAGE_REQUIREMENTS_SLICE_PITCH_ALIGNMENT_EXT(CL_HPP_DECLARE_PARAM_TRAITS_)\r\n#endif // cl_ext_image_from_buffer\r\n\r\n#ifdef CL_PLATFORM_ICD_SUFFIX_KHR\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_platform_info, CL_PLATFORM_ICD_SUFFIX_KHR, string)\r\n#endif\r\n\r\n#ifdef CL_DEVICE_PROFILING_TIMER_OFFSET_AMD\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_PROFILING_TIMER_OFFSET_AMD, cl_ulong)\r\n#endif\r\n#ifdef CL_DEVICE_GLOBAL_FREE_MEMORY_AMD\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_GLOBAL_FREE_MEMORY_AMD, 
vector<size_type>)\r\n#endif\r\n#ifdef CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD, cl_uint)\r\n#endif\r\n#ifdef CL_DEVICE_SIMD_WIDTH_AMD\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_SIMD_WIDTH_AMD, cl_uint)\r\n#endif\r\n#ifdef CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD, cl_uint)\r\n#endif\r\n#ifdef CL_DEVICE_WAVEFRONT_WIDTH_AMD\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_WAVEFRONT_WIDTH_AMD, cl_uint)\r\n#endif\r\n#ifdef CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD, cl_uint)\r\n#endif\r\n#ifdef CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD, cl_uint)\r\n#endif\r\n#ifdef CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD, cl_uint)\r\n#endif\r\n#ifdef CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD, cl_uint)\r\n#endif\r\n#ifdef CL_DEVICE_LOCAL_MEM_BANKS_AMD\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_LOCAL_MEM_BANKS_AMD, cl_uint)\r\n#endif\r\n#ifdef CL_DEVICE_BOARD_NAME_AMD\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_BOARD_NAME_AMD, string)\r\n#endif\r\n\r\n#ifdef CL_DEVICE_COMPUTE_UNITS_BITFIELD_ARM\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_COMPUTE_UNITS_BITFIELD_ARM, cl_ulong)\r\n#endif\r\n#ifdef CL_DEVICE_JOB_SLOTS_ARM\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_JOB_SLOTS_ARM, cl_uint)\r\n#endif\r\n#ifdef CL_DEVICE_SCHEDULING_CONTROLS_CAPABILITIES_ARM\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_SCHEDULING_CONTROLS_CAPABILITIES_ARM, cl_bitfield)\r\n#endif\r\n#ifdef 
CL_DEVICE_SUPPORTED_REGISTER_ALLOCATIONS_ARM\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_SUPPORTED_REGISTER_ALLOCATIONS_ARM, vector<cl_uint>)\r\n#endif\r\n#ifdef CL_DEVICE_MAX_WARP_COUNT_ARM\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_MAX_WARP_COUNT_ARM, cl_uint)\r\n#endif\r\n#ifdef CL_KERNEL_MAX_WARP_COUNT_ARM\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_kernel_info, CL_KERNEL_MAX_WARP_COUNT_ARM, cl_uint)\r\n#endif\r\n#ifdef CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_ARM\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_kernel_exec_info, CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_ARM, cl_uint)\r\n#endif\r\n#ifdef CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_MODIFIER_ARM\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_kernel_exec_info, CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_MODIFIER_ARM, cl_int)\r\n#endif\r\n#ifdef CL_KERNEL_EXEC_INFO_WARP_COUNT_LIMIT_ARM\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_kernel_exec_info, CL_KERNEL_EXEC_INFO_WARP_COUNT_LIMIT_ARM, cl_uint)\r\n#endif\r\n#ifdef CL_KERNEL_EXEC_INFO_COMPUTE_UNIT_MAX_QUEUED_BATCHES_ARM\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_kernel_exec_info, CL_KERNEL_EXEC_INFO_COMPUTE_UNIT_MAX_QUEUED_BATCHES_ARM, cl_uint)\r\n#endif\r\n\r\n#ifdef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, cl_uint)\r\n#endif\r\n#ifdef CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, cl_uint)\r\n#endif\r\n#ifdef CL_DEVICE_REGISTERS_PER_BLOCK_NV\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_REGISTERS_PER_BLOCK_NV, cl_uint)\r\n#endif\r\n#ifdef CL_DEVICE_WARP_SIZE_NV\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_WARP_SIZE_NV, cl_uint)\r\n#endif\r\n#ifdef CL_DEVICE_GPU_OVERLAP_NV\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_GPU_OVERLAP_NV, cl_bool)\r\n#endif\r\n#ifdef CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV, 
cl_bool)\r\n#endif\r\n#ifdef CL_DEVICE_INTEGRATED_MEMORY_NV\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_INTEGRATED_MEMORY_NV, cl_bool)\r\n#endif\r\n\r\n#if defined(cl_khr_command_buffer)\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_COMMAND_BUFFER_CAPABILITIES_KHR, cl_device_command_buffer_capabilities_khr)\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_COMMAND_BUFFER_REQUIRED_QUEUE_PROPERTIES_KHR, cl_command_queue_properties)\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_command_buffer_info_khr, CL_COMMAND_BUFFER_QUEUES_KHR, cl::vector<CommandQueue>)\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_command_buffer_info_khr, CL_COMMAND_BUFFER_NUM_QUEUES_KHR, cl_uint)\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_command_buffer_info_khr, CL_COMMAND_BUFFER_REFERENCE_COUNT_KHR, cl_uint)\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_command_buffer_info_khr, CL_COMMAND_BUFFER_STATE_KHR, cl_command_buffer_state_khr)\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_command_buffer_info_khr, CL_COMMAND_BUFFER_PROPERTIES_ARRAY_KHR, cl::vector<cl_command_buffer_properties_khr>)\r\n#endif /* cl_khr_command_buffer */\r\n\r\n#if defined(cl_khr_command_buffer_mutable_dispatch)\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_mutable_command_info_khr, CL_MUTABLE_COMMAND_COMMAND_QUEUE_KHR, CommandQueue)\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_mutable_command_info_khr, CL_MUTABLE_COMMAND_COMMAND_BUFFER_KHR, CommandBufferKhr)\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_mutable_command_info_khr, CL_MUTABLE_COMMAND_COMMAND_TYPE_KHR, cl_command_type)\r\n\r\n#if CL_KHR_COMMAND_BUFFER_MUTABLE_DISPATCH_EXTENSION_VERSION > CL_MAKE_VERSION(0, 9, 2)\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_mutable_command_info_khr, CL_MUTABLE_COMMAND_PROPERTIES_ARRAY_KHR, cl::vector<cl_command_properties_khr>)\r\n#else\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_mutable_command_info_khr, CL_MUTABLE_DISPATCH_PROPERTIES_ARRAY_KHR, cl::vector<cl_ndrange_kernel_command_properties_khr>)\r\n#endif\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_mutable_command_info_khr, 
CL_MUTABLE_DISPATCH_KERNEL_KHR, cl_kernel)\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_mutable_command_info_khr, CL_MUTABLE_DISPATCH_DIMENSIONS_KHR, cl_uint)\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_mutable_command_info_khr, CL_MUTABLE_DISPATCH_GLOBAL_WORK_OFFSET_KHR, cl::vector<size_type>)\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_mutable_command_info_khr, CL_MUTABLE_DISPATCH_GLOBAL_WORK_SIZE_KHR, cl::vector<size_type>)\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_mutable_command_info_khr, CL_MUTABLE_DISPATCH_LOCAL_WORK_SIZE_KHR, cl::vector<size_type>)\r\n#endif /* cl_khr_command_buffer_mutable_dispatch */\r\n\r\n#if defined(cl_khr_kernel_clock)\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_KERNEL_CLOCK_CAPABILITIES_KHR, cl_device_kernel_clock_capabilities_khr)\r\n#endif /* cl_khr_kernel_clock */\r\n\r\n#if defined(cl_ext_float_atomics)\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_SINGLE_FP_ATOMIC_CAPABILITIES_EXT, cl_device_fp_atomic_capabilities_ext)\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_DOUBLE_FP_ATOMIC_CAPABILITIES_EXT, cl_device_fp_atomic_capabilities_ext)\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_HALF_FP_ATOMIC_CAPABILITIES_EXT, cl_device_fp_atomic_capabilities_ext)\r\n#endif /* cl_ext_float_atomics */\r\n\r\n#if defined(cl_intel_command_queue_families)\r\nCL_HPP_PARAM_NAME_CL_INTEL_COMMAND_QUEUE_FAMILIES_(CL_HPP_DECLARE_PARAM_TRAITS_)\r\n#endif // cl_intel_command_queue_families\r\n\r\n#if defined(cl_intel_device_attribute_query)\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_IP_VERSION_INTEL, cl_uint)\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_ID_INTEL, cl_uint)\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_NUM_SLICES_INTEL, cl_uint)\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_NUM_SUB_SLICES_PER_SLICE_INTEL, cl_uint)\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_NUM_EUS_PER_SUB_SLICE_INTEL, cl_uint)\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, 
CL_DEVICE_NUM_THREADS_PER_EU_INTEL, cl_uint)\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_FEATURE_CAPABILITIES_INTEL, cl_device_feature_capabilities_intel)\r\n#endif // cl_intel_device_attribute_query\r\n\r\n#if defined(cl_intel_required_subgroup_size)\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_SUB_GROUP_SIZES_INTEL, cl::vector<size_type>)\r\nCL_HPP_DECLARE_PARAM_TRAITS_(cl_kernel_work_group_info, CL_KERNEL_SPILL_MEM_SIZE_INTEL, cl_ulong)\r\n#endif // cl_intel_required_subgroup_size\r\n\r\n#if defined(cl_intel_unified_shared_memory)\r\nCL_HPP_PARAM_NAME_CL_INTEL_UNIFIED_SHARED_MEMORY_(CL_HPP_DECLARE_PARAM_TRAITS_)\r\n#endif // cl_intel_unified_shared_memory\r\n\r\n// Convenience functions\r\n\r\ntemplate <typename Func, typename T>\r\ninline cl_int\r\ngetInfo(Func f, cl_uint name, T* param)\r\n{\r\n    return getInfoHelper(f, name, param, 0);\r\n}\r\n\r\ntemplate <typename Func, typename Arg0>\r\nstruct GetInfoFunctor0\r\n{\r\n    Func f_; const Arg0& arg0_;\r\n    cl_int operator ()(\r\n        cl_uint param, size_type size, void* value, size_type* size_ret)\r\n    { return f_(arg0_, param, size, value, size_ret); }\r\n};\r\n\r\ntemplate <typename Func, typename Arg0, typename Arg1>\r\nstruct GetInfoFunctor1\r\n{\r\n    Func f_; const Arg0& arg0_; const Arg1& arg1_;\r\n    cl_int operator ()(\r\n        cl_uint param, size_type size, void* value, size_type* size_ret)\r\n    { return f_(arg0_, arg1_, param, size, value, size_ret); }\r\n};\r\n\r\ntemplate <typename Func, typename Arg0, typename T>\r\ninline cl_int\r\ngetInfo(Func f, const Arg0& arg0, cl_uint name, T* param)\r\n{\r\n    GetInfoFunctor0<Func, Arg0> f0 = { f, arg0 };\r\n    return getInfoHelper(f0, name, param, 0);\r\n}\r\n\r\ntemplate <typename Func, typename Arg0, typename Arg1, typename T>\r\ninline cl_int\r\ngetInfo(Func f, const Arg0& arg0, const Arg1& arg1, cl_uint name, T* param)\r\n{\r\n    GetInfoFunctor1<Func, Arg0, Arg1> f0 = { f, arg0, arg1 };\r\n    return 
getInfoHelper(f0, name, param, 0);\r\n}\r\n\r\n\r\ntemplate<typename T>\r\nstruct ReferenceHandler\r\n{ };\r\n\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 120\r\n/**\r\n * OpenCL 1.2 devices do have retain/release.\r\n */\r\ntemplate <>\r\nstruct ReferenceHandler<cl_device_id>\r\n{\r\n    /**\r\n     * Retain the device.\r\n     * \\param device A valid device created using createSubDevices\r\n     * \\return \r\n     *   CL_SUCCESS if the function executed successfully.\r\n     *   CL_INVALID_DEVICE if device was not a valid subdevice\r\n     *   CL_OUT_OF_RESOURCES\r\n     *   CL_OUT_OF_HOST_MEMORY\r\n     */\r\n    static cl_int retain(cl_device_id device)\r\n    { return ::clRetainDevice(device); }\r\n    /**\r\n     * Release the device.\r\n     * \\param device A valid device created using createSubDevices\r\n     * \\return \r\n     *   CL_SUCCESS if the function executed successfully.\r\n     *   CL_INVALID_DEVICE if device was not a valid subdevice\r\n     *   CL_OUT_OF_RESOURCES\r\n     *   CL_OUT_OF_HOST_MEMORY\r\n     */\r\n    static cl_int release(cl_device_id device)\r\n    { return ::clReleaseDevice(device); }\r\n};\r\n#else // CL_HPP_TARGET_OPENCL_VERSION >= 120\r\n/**\r\n * OpenCL 1.1 devices do not have retain/release.\r\n */\r\ntemplate <>\r\nstruct ReferenceHandler<cl_device_id>\r\n{\r\n    // cl_device_id does not have retain().\r\n    static cl_int retain(cl_device_id)\r\n    { return CL_SUCCESS; }\r\n    // cl_device_id does not have release().\r\n    static cl_int release(cl_device_id)\r\n    { return CL_SUCCESS; }\r\n};\r\n#endif // ! 
(CL_HPP_TARGET_OPENCL_VERSION >= 120)\r\n\r\ntemplate <>\r\nstruct ReferenceHandler<cl_platform_id>\r\n{\r\n    // cl_platform_id does not have retain().\r\n    static cl_int retain(cl_platform_id)\r\n    { return CL_SUCCESS; }\r\n    // cl_platform_id does not have release().\r\n    static cl_int release(cl_platform_id)\r\n    { return CL_SUCCESS; }\r\n};\r\n\r\ntemplate <>\r\nstruct ReferenceHandler<cl_context>\r\n{\r\n    static cl_int retain(cl_context context)\r\n    { return ::clRetainContext(context); }\r\n    static cl_int release(cl_context context)\r\n    { return ::clReleaseContext(context); }\r\n};\r\n\r\ntemplate <>\r\nstruct ReferenceHandler<cl_command_queue>\r\n{\r\n    static cl_int retain(cl_command_queue queue)\r\n    { return ::clRetainCommandQueue(queue); }\r\n    static cl_int release(cl_command_queue queue)\r\n    { return ::clReleaseCommandQueue(queue); }\r\n};\r\n\r\ntemplate <>\r\nstruct ReferenceHandler<cl_mem>\r\n{\r\n    static cl_int retain(cl_mem memory)\r\n    { return ::clRetainMemObject(memory); }\r\n    static cl_int release(cl_mem memory)\r\n    { return ::clReleaseMemObject(memory); }\r\n};\r\n\r\ntemplate <>\r\nstruct ReferenceHandler<cl_sampler>\r\n{\r\n    static cl_int retain(cl_sampler sampler)\r\n    { return ::clRetainSampler(sampler); }\r\n    static cl_int release(cl_sampler sampler)\r\n    { return ::clReleaseSampler(sampler); }\r\n};\r\n\r\ntemplate <>\r\nstruct ReferenceHandler<cl_program>\r\n{\r\n    static cl_int retain(cl_program program)\r\n    { return ::clRetainProgram(program); }\r\n    static cl_int release(cl_program program)\r\n    { return ::clReleaseProgram(program); }\r\n};\r\n\r\ntemplate <>\r\nstruct ReferenceHandler<cl_kernel>\r\n{\r\n    static cl_int retain(cl_kernel kernel)\r\n    { return ::clRetainKernel(kernel); }\r\n    static cl_int release(cl_kernel kernel)\r\n    { return ::clReleaseKernel(kernel); }\r\n};\r\n\r\ntemplate <>\r\nstruct ReferenceHandler<cl_event>\r\n{\r\n    static cl_int 
retain(cl_event event)\r\n    { return ::clRetainEvent(event); }\r\n    static cl_int release(cl_event event)\r\n    { return ::clReleaseEvent(event); }\r\n};\r\n\r\n#ifdef cl_khr_semaphore\r\ntemplate <>\r\nstruct ReferenceHandler<cl_semaphore_khr>\r\n{\r\n    static cl_int retain(cl_semaphore_khr semaphore)\r\n    { \r\n        if (pfn_clRetainSemaphoreKHR != nullptr) {\r\n            return pfn_clRetainSemaphoreKHR(semaphore);\r\n        }\r\n\r\n        return CL_INVALID_OPERATION;\r\n    }\r\n\r\n    static cl_int release(cl_semaphore_khr semaphore)\r\n    {\r\n        if (pfn_clReleaseSemaphoreKHR != nullptr) {\r\n            return pfn_clReleaseSemaphoreKHR(semaphore);\r\n        }\r\n\r\n        return CL_INVALID_OPERATION;\r\n    }\r\n};\r\n#endif // cl_khr_semaphore\r\n#if defined(cl_khr_command_buffer)\r\ntemplate <>\r\nstruct ReferenceHandler<cl_command_buffer_khr>\r\n{\r\n    static cl_int retain(cl_command_buffer_khr cmdBufferKhr)\r\n    {\r\n        if (pfn_clRetainCommandBufferKHR == nullptr) {\r\n            return detail::errHandler(CL_INVALID_OPERATION, __RETAIN_COMMAND_BUFFER_KHR_ERR);\r\n        }\r\n        return pfn_clRetainCommandBufferKHR(cmdBufferKhr);\r\n    }\r\n\r\n    static cl_int release(cl_command_buffer_khr cmdBufferKhr)\r\n    {\r\n        if (pfn_clReleaseCommandBufferKHR == nullptr) {\r\n            return detail::errHandler(CL_INVALID_OPERATION, __RELEASE_COMMAND_BUFFER_KHR_ERR);\r\n        }\r\n        return pfn_clReleaseCommandBufferKHR(cmdBufferKhr);\r\n    }\r\n};\r\n\r\ntemplate <>\r\nstruct ReferenceHandler<cl_mutable_command_khr>\r\n{\r\n    // cl_mutable_command_khr does not have retain().\r\n    static cl_int retain(cl_mutable_command_khr)\r\n    { return CL_SUCCESS; }\r\n    // cl_mutable_command_khr does not have release().\r\n    static cl_int release(cl_mutable_command_khr)\r\n    { return CL_SUCCESS; }\r\n};\r\n#endif // cl_khr_command_buffer\r\n\r\n\r\n#if (CL_HPP_TARGET_OPENCL_VERSION >= 120 && 
CL_HPP_MINIMUM_OPENCL_VERSION < 120) || \\\r\n    (CL_HPP_TARGET_OPENCL_VERSION >= 200 && CL_HPP_MINIMUM_OPENCL_VERSION < 200)\r\n// Extracts version number with major in the upper 16 bits, minor in the lower 16\r\nstatic cl_uint getVersion(const vector<char> &versionInfo)\r\n{\r\n    int highVersion = 0;\r\n    int lowVersion = 0;\r\n    int index = 7;\r\n    while(versionInfo[index] != '.' ) {\r\n        highVersion *= 10;\r\n        highVersion += versionInfo[index]-'0';\r\n        ++index;\r\n    }\r\n    ++index;\r\n    while(versionInfo[index] != ' ' &&  versionInfo[index] != '\\0') {\r\n        lowVersion *= 10;\r\n        lowVersion += versionInfo[index]-'0';\r\n        ++index;\r\n    }\r\n    return (highVersion << 16) | lowVersion;\r\n}\r\n\r\nstatic cl_uint getPlatformVersion(cl_platform_id platform)\r\n{\r\n    size_type size = 0;\r\n    clGetPlatformInfo(platform, CL_PLATFORM_VERSION, 0, nullptr, &size);\r\n\r\n    vector<char> versionInfo(size);\r\n    clGetPlatformInfo(platform, CL_PLATFORM_VERSION, size, versionInfo.data(), &size);\r\n    return getVersion(versionInfo);\r\n}\r\n\r\nstatic cl_uint getDevicePlatformVersion(cl_device_id device)\r\n{\r\n    cl_platform_id platform;\r\n    clGetDeviceInfo(device, CL_DEVICE_PLATFORM, sizeof(platform), &platform, nullptr);\r\n    return getPlatformVersion(platform);\r\n}\r\n\r\nstatic cl_uint getContextPlatformVersion(cl_context context)\r\n{\r\n    // The platform cannot be queried directly, so we first have to grab a\r\n    // device and obtain its context\r\n    size_type size = 0;\r\n    clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, nullptr, &size);\r\n    if (size == 0)\r\n        return 0;\r\n    vector<cl_device_id> devices(size/sizeof(cl_device_id));\r\n    clGetContextInfo(context, CL_CONTEXT_DEVICES, size, devices.data(), nullptr);\r\n    return getDevicePlatformVersion(devices[0]);\r\n}\r\n#endif // CL_HPP_TARGET_OPENCL_VERSION && CL_HPP_MINIMUM_OPENCL_VERSION\r\n\r\ntemplate <typename 
T>\r\nclass Wrapper\r\n{\r\npublic:\r\n    typedef T cl_type;\r\n\r\nprotected:\r\n    cl_type object_;\r\n\r\npublic:\r\n    Wrapper() : object_(nullptr) { }\r\n    \r\n    Wrapper(const cl_type &obj, bool retainObject) : object_(obj) \r\n    {\r\n        if (retainObject) { \r\n            detail::errHandler(retain(), __RETAIN_ERR); \r\n        }\r\n    }\r\n\r\n    ~Wrapper()\r\n    {\r\n        if (object_ != nullptr) { release(); }\r\n    }\r\n\r\n    Wrapper(const Wrapper<cl_type>& rhs)\r\n    {\r\n        object_ = rhs.object_;\r\n        detail::errHandler(retain(), __RETAIN_ERR);\r\n    }\r\n\r\n    Wrapper(Wrapper<cl_type>&& rhs) noexcept\r\n    {\r\n        object_ = rhs.object_;\r\n        rhs.object_ = nullptr;\r\n    }\r\n\r\n    Wrapper<cl_type>& operator = (const Wrapper<cl_type>& rhs)\r\n    {\r\n        if (this != &rhs) {\r\n            detail::errHandler(release(), __RELEASE_ERR);\r\n            object_ = rhs.object_;\r\n            detail::errHandler(retain(), __RETAIN_ERR);\r\n        }\r\n        return *this;\r\n    }\r\n\r\n    Wrapper<cl_type>& operator = (Wrapper<cl_type>&& rhs)\r\n    {\r\n        if (this != &rhs) {\r\n            detail::errHandler(release(), __RELEASE_ERR);\r\n            object_ = rhs.object_;\r\n            rhs.object_ = nullptr;\r\n        }\r\n        return *this;\r\n    }\r\n\r\n    Wrapper<cl_type>& operator = (const cl_type &rhs)\r\n    {\r\n        detail::errHandler(release(), __RELEASE_ERR);\r\n        object_ = rhs;\r\n        return *this;\r\n    }\r\n\r\n    const cl_type& operator ()() const { return object_; }\r\n\r\n    cl_type& operator ()() { return object_; }\r\n\r\n    cl_type get() const { return object_; }\r\n\r\nprotected:\r\n    template<typename Func, typename U>\r\n    friend inline cl_int getInfoHelper(Func, cl_uint, U*, int, typename U::cl_type);\r\n\r\n    cl_int retain() const\r\n    {\r\n        if (object_ != nullptr) {\r\n            return 
ReferenceHandler<cl_type>::retain(object_);\r\n        }\r\n        else {\r\n            return CL_SUCCESS;\r\n        }\r\n    }\r\n\r\n    cl_int release() const\r\n    {\r\n        if (object_ != nullptr) {\r\n            return ReferenceHandler<cl_type>::release(object_);\r\n        }\r\n        else {\r\n            return CL_SUCCESS;\r\n        }\r\n    }\r\n};\r\n\r\ntemplate <>\r\nclass Wrapper<cl_device_id>\r\n{\r\npublic:\r\n    typedef cl_device_id cl_type;\r\n\r\nprotected:\r\n    cl_type object_;\r\n    bool referenceCountable_;\r\n\r\n    static bool isReferenceCountable(cl_device_id device)\r\n    {\r\n        bool retVal = false;\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 120 && CL_HPP_MINIMUM_OPENCL_VERSION < 120\r\n        if (device != nullptr) {\r\n            int version = getDevicePlatformVersion(device);\r\n            if(version > ((1 << 16) + 1)) {\r\n                retVal = true;\r\n            }\r\n        }\r\n#elif CL_HPP_TARGET_OPENCL_VERSION >= 120\r\n        retVal = true;\r\n#endif // CL_HPP_TARGET_OPENCL_VERSION\r\n        (void)device;\r\n        return retVal;\r\n    }\r\n\r\npublic:\r\n    Wrapper() : object_(nullptr), referenceCountable_(false) \r\n    { \r\n    }\r\n    \r\n    Wrapper(const cl_type &obj, bool retainObject) : \r\n        object_(obj), \r\n        referenceCountable_(false) \r\n    {\r\n        referenceCountable_ = isReferenceCountable(obj); \r\n\r\n        if (retainObject) {\r\n            detail::errHandler(retain(), __RETAIN_ERR);\r\n        }\r\n    }\r\n\r\n    ~Wrapper()\r\n    {\r\n        release();\r\n    }\r\n    \r\n    Wrapper(const Wrapper<cl_type>& rhs)\r\n    {\r\n        object_ = rhs.object_;\r\n        referenceCountable_ = isReferenceCountable(object_); \r\n        detail::errHandler(retain(), __RETAIN_ERR);\r\n    }\r\n\r\n    Wrapper(Wrapper<cl_type>&& rhs) noexcept\r\n    {\r\n        object_ = rhs.object_;\r\n        referenceCountable_ = rhs.referenceCountable_;\r\n        rhs.object_ = 
nullptr;\r\n        rhs.referenceCountable_ = false;\r\n    }\r\n\r\n    Wrapper<cl_type>& operator = (const Wrapper<cl_type>& rhs)\r\n    {\r\n        if (this != &rhs) {\r\n            detail::errHandler(release(), __RELEASE_ERR);\r\n            object_ = rhs.object_;\r\n            referenceCountable_ = rhs.referenceCountable_;\r\n            detail::errHandler(retain(), __RETAIN_ERR);\r\n        }\r\n        return *this;\r\n    }\r\n\r\n    Wrapper<cl_type>& operator = (Wrapper<cl_type>&& rhs)\r\n    {\r\n        if (this != &rhs) {\r\n            detail::errHandler(release(), __RELEASE_ERR);\r\n            object_ = rhs.object_;\r\n            referenceCountable_ = rhs.referenceCountable_;\r\n            rhs.object_ = nullptr;\r\n            rhs.referenceCountable_ = false;\r\n        }\r\n        return *this;\r\n    }\r\n\r\n    Wrapper<cl_type>& operator = (const cl_type &rhs)\r\n    {\r\n        detail::errHandler(release(), __RELEASE_ERR);\r\n        object_ = rhs;\r\n        referenceCountable_ = isReferenceCountable(object_); \r\n        return *this;\r\n    }\r\n\r\n    const cl_type& operator ()() const { return object_; }\r\n\r\n    cl_type& operator ()() { return object_; }\r\n\r\n    cl_type get() const { return object_; }\r\n\r\nprotected:\r\n    template<typename Func, typename U>\r\n    friend inline cl_int getInfoHelper(Func, cl_uint, U*, int, typename U::cl_type);\r\n\r\n    template<typename Func, typename U>\r\n    friend inline cl_int getInfoHelper(Func, cl_uint, vector<U>*, int, typename U::cl_type);\r\n\r\n    cl_int retain() const\r\n    {\r\n        if( object_ != nullptr && referenceCountable_ ) {\r\n            return ReferenceHandler<cl_type>::retain(object_);\r\n        }\r\n        else {\r\n            return CL_SUCCESS;\r\n        }\r\n    }\r\n\r\n    cl_int release() const\r\n    {\r\n        if (object_ != nullptr && referenceCountable_) {\r\n            return ReferenceHandler<cl_type>::release(object_);\r\n        }\r\n     
   else {\r\n            return CL_SUCCESS;\r\n        }\r\n    }\r\n};\r\n\r\ntemplate <typename T>\r\ninline bool operator==(const Wrapper<T> &lhs, const Wrapper<T> &rhs)\r\n{\r\n    return lhs() == rhs();\r\n}\r\n\r\ntemplate <typename T>\r\ninline bool operator!=(const Wrapper<T> &lhs, const Wrapper<T> &rhs)\r\n{\r\n    return !operator==(lhs, rhs);\r\n}\r\n\r\n} // namespace detail\r\n//! \\endcond\r\n\r\n\r\n\r\n\r\n\r\n/*! \\struct ImageFormat\r\n *  \\brief Adds constructors and member functions for cl_image_format.\r\n *\r\n *  \\see cl_image_format\r\n */\r\nstruct ImageFormat : public cl_image_format\r\n{\r\n    //! \\brief Default constructor - performs no initialization.\r\n    ImageFormat(){}\r\n\r\n    //! \\brief Initializing constructor.\r\n    ImageFormat(cl_channel_order order, cl_channel_type type)\r\n    {\r\n        image_channel_order = order;\r\n        image_channel_data_type = type;\r\n    }\r\n\r\n    //! \\brief Copy constructor.\r\n    ImageFormat(const ImageFormat &other) { *this = other; }\r\n\r\n    //! \\brief Assignment operator.\r\n    ImageFormat& operator = (const ImageFormat& rhs)\r\n    {\r\n        if (this != &rhs) {\r\n            this->image_channel_data_type = rhs.image_channel_data_type;\r\n            this->image_channel_order     = rhs.image_channel_order;\r\n        }\r\n        return *this;\r\n    }\r\n};\r\n\r\n/*! \\brief Class interface for cl_device_id.\r\n *\r\n *  \\note Copies of these objects are inexpensive, since they don't 'own'\r\n *        any underlying resources or data structures.\r\n *\r\n *  \\see cl_device_id\r\n */\r\nclass Device : public detail::Wrapper<cl_device_id>\r\n{\r\nprivate:\r\n    static std::once_flag default_initialized_;\r\n    static Device default_;\r\n    static cl_int default_error_;\r\n\r\n    /*! \\brief Create the default device.\r\n    *\r\n    * This sets @c default_ and @c default_error_. 
It does not throw\r\n    * @c cl::Error.\r\n    */\r\n    static void makeDefault();\r\n\r\n    /*! \\brief Create the default device from a provided device.\r\n    *\r\n    * This sets @c default_. It does not throw\r\n    * @c cl::Error.\r\n    */\r\n    static void makeDefaultProvided(const Device &p) {\r\n        default_ = p;\r\n    }\r\n\r\npublic:\r\n#ifdef CL_HPP_UNIT_TEST_ENABLE\r\n    /*! \\brief Reset the default.\r\n    *\r\n    * This sets @c default_ to an empty value to support cleanup in\r\n    * the unit test framework.\r\n    * This function is not thread safe.\r\n    */\r\n    static void unitTestClearDefault() {\r\n        default_ = Device();\r\n    }\r\n#endif // #ifdef CL_HPP_UNIT_TEST_ENABLE\r\n\r\n    //! \\brief Default constructor - initializes to nullptr.\r\n    Device() : detail::Wrapper<cl_type>() { }\r\n\r\n    /*! \\brief Constructor from cl_device_id.\r\n     * \r\n     *  This simply copies the device ID value, which is an inexpensive operation.\r\n     */\r\n    explicit Device(const cl_device_id &device, bool retainObject = false) : \r\n        detail::Wrapper<cl_type>(device, retainObject) { }\r\n\r\n    /*! 
\\brief Returns the first device on the default context.\r\n     *\r\n     *  \\see Context::getDefault()\r\n     */\r\n    static Device getDefault(\r\n        cl_int *errResult = nullptr)\r\n    {\r\n        std::call_once(default_initialized_, makeDefault);\r\n        detail::errHandler(default_error_);\r\n        if (errResult != nullptr) {\r\n            *errResult = default_error_;\r\n        }\r\n        return default_;\r\n    }\r\n\r\n    /**\r\n    * Modify the default device to be used by\r\n    * subsequent operations.\r\n    * Will only set the default if no default was previously created.\r\n    * @return updated default device.\r\n    *         Should be compared to the passed value to ensure that it was updated.\r\n    */\r\n    static Device setDefault(const Device &default_device)\r\n    {\r\n        std::call_once(default_initialized_, makeDefaultProvided, std::cref(default_device));\r\n        detail::errHandler(default_error_);\r\n        return default_;\r\n    }\r\n\r\n    /*! \\brief Assignment operator from cl_device_id.\r\n     * \r\n     *  This simply copies the device ID value, which is an inexpensive operation.\r\n     */\r\n    Device& operator = (const cl_device_id& rhs)\r\n    {\r\n        detail::Wrapper<cl_type>::operator=(rhs);\r\n        return *this;\r\n    }\r\n \r\n\r\n    //! \\brief Wrapper for clGetDeviceInfo().\r\n    template <typename T>\r\n    cl_int getInfo(cl_device_info name, T* param) const\r\n    {\r\n        return detail::errHandler(\r\n            detail::getInfo(&::clGetDeviceInfo, object_, name, param),\r\n            __GET_DEVICE_INFO_ERR);\r\n    }\r\n\r\n    //! 
\\brief Wrapper for clGetDeviceInfo() that returns by value.\r\n    template <cl_device_info name> typename\r\n    detail::param_traits<detail::cl_device_info, name>::param_type\r\n    getInfo(cl_int* err = nullptr) const\r\n    {\r\n        typename detail::param_traits<\r\n            detail::cl_device_info, name>::param_type param;\r\n        cl_int result = getInfo(name, &param);\r\n        if (err != nullptr) {\r\n            *err = result;\r\n        }\r\n        return param;\r\n    }\r\n\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 210\r\n    /**\r\n     * Return the current value of the host clock as seen by the device.\r\n     * The resolution of the device timer may be queried with the\r\n     * CL_DEVICE_PROFILING_TIMER_RESOLUTION query.\r\n     * @return The host timer value.\r\n     */\r\n    cl_ulong getHostTimer(cl_int *error = nullptr)\r\n    {\r\n        cl_ulong retVal = 0;\r\n        cl_int err = \r\n            clGetHostTimer(this->get(), &retVal);\r\n        detail::errHandler(\r\n            err,\r\n            __GET_HOST_TIMER_ERR);\r\n        if (error) {\r\n            *error = err;\r\n        }\r\n        return retVal;\r\n    }\r\n\r\n    /**\r\n     * Return a synchronized pair of host and device timestamps as seen by device.\r\n     * Use to correlate the clocks and get the host timer only using getHostTimer\r\n     * as a lower cost mechanism in between calls.\r\n     * The resolution of the host timer may be queried with the \r\n     * CL_PLATFORM_HOST_TIMER_RESOLUTION query.\r\n     * The resolution of the device timer may be queried with the\r\n     * CL_DEVICE_PROFILING_TIMER_RESOLUTION query.\r\n     * @return A pair of (device timer, host timer) timer values.\r\n     */\r\n    std::pair<cl_ulong, cl_ulong> getDeviceAndHostTimer(cl_int *error = nullptr)\r\n    {\r\n        std::pair<cl_ulong, cl_ulong> retVal;\r\n        cl_int err =\r\n            clGetDeviceAndHostTimer(this->get(), &(retVal.first), &(retVal.second));\r\n        
detail::errHandler(\r\n            err,\r\n            __GET_DEVICE_AND_HOST_TIMER_ERR);\r\n        if (error) {\r\n            *error = err;\r\n        }\r\n        return retVal;\r\n    }\r\n#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 210\r\n\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 120\r\n    //! \\brief Wrapper for clCreateSubDevices().\r\n    cl_int createSubDevices(const cl_device_partition_property* properties,\r\n                            vector<Device>* devices);\r\n#endif // defined (CL_HPP_TARGET_OPENCL_VERSION >= 120)\r\n\r\n#if defined(cl_ext_device_fission)\r\n    //! \\brief Wrapper for clCreateSubDevices().\r\n    cl_int createSubDevices(const cl_device_partition_property_ext* properties,\r\n                            vector<Device>* devices);\r\n#endif // defined(cl_ext_device_fission)\r\n};\r\n\r\nusing BuildLogType = vector<std::pair<cl::Device, typename detail::param_traits<detail::cl_program_build_info, CL_PROGRAM_BUILD_LOG>::param_type>>;\r\n#if defined(CL_HPP_ENABLE_EXCEPTIONS)\r\n/**\r\n* Exception class for build errors to carry build info\r\n*/\r\nclass BuildError : public Error\r\n{\r\nprivate:\r\n    BuildLogType buildLogs;\r\npublic:\r\n    BuildError(cl_int err, const char * errStr, const BuildLogType &vec) : Error(err, errStr), buildLogs(vec)\r\n    {\r\n    }\r\n\r\n    BuildLogType getBuildLog() const\r\n    {\r\n        return buildLogs;\r\n    }\r\n};\r\nnamespace detail {\r\n    static inline cl_int buildErrHandler(\r\n        cl_int err,\r\n        const char * errStr,\r\n        const BuildLogType &buildLogs)\r\n    {\r\n        if (err != CL_SUCCESS) {\r\n            throw BuildError(err, errStr, buildLogs);\r\n        }\r\n        return err;\r\n    }\r\n} // namespace detail\r\n\r\n#else\r\nnamespace detail {\r\n    static inline cl_int buildErrHandler(\r\n        cl_int err,\r\n        const char * errStr,\r\n        const BuildLogType &buildLogs)\r\n    {\r\n        (void)buildLogs; // suppress unused variable 
warning\r\n        (void)errStr;\r\n        return err;\r\n    }\r\n} // namespace detail\r\n#endif // #if defined(CL_HPP_ENABLE_EXCEPTIONS)\r\n\r\nCL_HPP_DEFINE_STATIC_MEMBER_ std::once_flag Device::default_initialized_;\r\nCL_HPP_DEFINE_STATIC_MEMBER_ Device Device::default_;\r\nCL_HPP_DEFINE_STATIC_MEMBER_ cl_int Device::default_error_ = CL_SUCCESS;\r\n\r\n/*! \\brief Class interface for cl_platform_id.\r\n *\r\n *  \\note Copies of these objects are inexpensive, since they don't 'own'\r\n *        any underlying resources or data structures.\r\n *\r\n *  \\see cl_platform_id\r\n */\r\nclass Platform : public detail::Wrapper<cl_platform_id>\r\n{\r\nprivate:\r\n    static std::once_flag default_initialized_;\r\n    static Platform default_;\r\n    static cl_int default_error_;\r\n\r\n    /*! \\brief Create the default platform.\r\n    *\r\n    * This sets @c default_ and @c default_error_. It does not throw\r\n    * @c cl::Error.\r\n    */\r\n    static void makeDefault() {\r\n        /* Throwing an exception from a call_once invocation does not do\r\n        * what we wish, so we catch it and save the error.\r\n        */\r\n#if defined(CL_HPP_ENABLE_EXCEPTIONS)\r\n        try\r\n#endif\r\n        {\r\n            // No default was provided, so use the first platform\r\n            // reported by clGetPlatformIDs\r\n            cl_uint n = 0;\r\n\r\n            cl_int err = ::clGetPlatformIDs(0, nullptr, &n);\r\n            if (err != CL_SUCCESS) {\r\n                default_error_ = err;\r\n                return;\r\n            }\r\n            if (n == 0) {\r\n                default_error_ = CL_INVALID_PLATFORM;\r\n                return;\r\n            }\r\n\r\n            vector<cl_platform_id> ids(n);\r\n            err = ::clGetPlatformIDs(n, ids.data(), nullptr);\r\n            if (err != CL_SUCCESS) {\r\n                default_error_ = err;\r\n                return;\r\n            }\r\n\r\n            default_ = Platform(ids[0]);\r\n        }\r\n#if 
defined(CL_HPP_ENABLE_EXCEPTIONS)\r\n        catch (cl::Error &e) {\r\n            default_error_ = e.err();\r\n        }\r\n#endif\r\n    }\r\n\r\n    /*! \\brief Create the default platform from a provided platform.\r\n     *\r\n     * This sets @c default_. It does not throw\r\n     * @c cl::Error.\r\n     */\r\n    static void makeDefaultProvided(const Platform &p) {\r\n       default_ = p;\r\n    }\r\n    \r\npublic:\r\n#ifdef CL_HPP_UNIT_TEST_ENABLE\r\n    /*! \\brief Reset the default.\r\n    *\r\n    * This sets @c default_ to an empty value to support cleanup in\r\n    * the unit test framework.\r\n    * This function is not thread safe.\r\n    */\r\n    static void unitTestClearDefault() {\r\n        default_ = Platform();\r\n    }\r\n#endif // #ifdef CL_HPP_UNIT_TEST_ENABLE\r\n\r\n    //! \\brief Default constructor - initializes to nullptr.\r\n    Platform() : detail::Wrapper<cl_type>()  { }\r\n\r\n    /*! \\brief Constructor from cl_platform_id.\r\n     * \r\n     * \\param retainObject will cause the constructor to retain its cl object.\r\n     *                     Defaults to false to maintain compatibility with\r\n     *                     earlier versions.\r\n     *  This simply copies the platform ID value, which is an inexpensive operation.\r\n     */\r\n    explicit Platform(const cl_platform_id &platform, bool retainObject = false) : \r\n        detail::Wrapper<cl_type>(platform, retainObject) { }\r\n\r\n    /*! 
\\brief Assignment operator from cl_platform_id.\r\n     * \r\n     *  This simply copies the platform ID value, which is an inexpensive operation.\r\n     */\r\n    Platform& operator = (const cl_platform_id& rhs)\r\n    {\r\n        detail::Wrapper<cl_type>::operator=(rhs);\r\n        return *this;\r\n    }\r\n\r\n    static Platform getDefault(\r\n        cl_int *errResult = nullptr)\r\n    {\r\n        std::call_once(default_initialized_, makeDefault);\r\n        detail::errHandler(default_error_);\r\n        if (errResult != nullptr) {\r\n            *errResult = default_error_;\r\n        }\r\n        return default_;\r\n    }\r\n\r\n    /**\r\n     * Modify the default platform to be used by \r\n     * subsequent operations.\r\n     * Will only set the default if no default was previously created.\r\n     * @return updated default platform. \r\n     *         Should be compared to the passed value to ensure that it was updated.\r\n     */\r\n    static Platform setDefault(const Platform &default_platform)\r\n    {\r\n        std::call_once(default_initialized_, makeDefaultProvided, std::cref(default_platform));\r\n        detail::errHandler(default_error_);\r\n        return default_;\r\n    }\r\n\r\n    //! \\brief Wrapper for clGetPlatformInfo().\r\n    template <typename T>\r\n    cl_int getInfo(cl_platform_info name, T* param) const\r\n    {\r\n        return detail::errHandler(\r\n            detail::getInfo(&::clGetPlatformInfo, object_, name, param),\r\n            __GET_PLATFORM_INFO_ERR);\r\n    }\r\n\r\n    //! 
\\brief Wrapper for clGetPlatformInfo() that returns by value.\r\n    template <cl_platform_info name> typename\r\n    detail::param_traits<detail::cl_platform_info, name>::param_type\r\n    getInfo(cl_int* err = nullptr) const\r\n    {\r\n        typename detail::param_traits<\r\n            detail::cl_platform_info, name>::param_type param;\r\n        cl_int result = getInfo(name, &param);\r\n        if (err != nullptr) {\r\n            *err = result;\r\n        }\r\n        return param;\r\n    }\r\n\r\n    /*! \\brief Gets a list of devices for this platform.\r\n     * \r\n     *  Wraps clGetDeviceIDs().\r\n     */\r\n    cl_int getDevices(\r\n        cl_device_type type,\r\n        vector<Device>* devices) const\r\n    {\r\n        cl_uint n = 0;\r\n        if( devices == nullptr ) {\r\n            return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_DEVICE_IDS_ERR);\r\n        }\r\n        cl_int err = ::clGetDeviceIDs(object_, type, 0, nullptr, &n);\r\n        if (err != CL_SUCCESS  && err != CL_DEVICE_NOT_FOUND) {\r\n            return detail::errHandler(err, __GET_DEVICE_IDS_ERR);\r\n        }\r\n\r\n        vector<cl_device_id> ids(n);\r\n        if (n>0) {\r\n            err = ::clGetDeviceIDs(object_, type, n, ids.data(), nullptr);\r\n            if (err != CL_SUCCESS) {\r\n                return detail::errHandler(err, __GET_DEVICE_IDS_ERR);\r\n            }\r\n        }\r\n\r\n        // Cannot trivially assign because we need to capture intermediates \r\n        // with safe construction\r\n        // We must retain things we obtain from the API to avoid releasing\r\n        // API-owned objects.\r\n        if (devices) {\r\n            devices->resize(ids.size());\r\n\r\n            // Assign to param, constructing with retain behaviour\r\n            // to correctly capture each underlying CL object\r\n            for (size_type i = 0; i < ids.size(); i++) {\r\n                (*devices)[i] = Device(ids[i], true);\r\n            }\r\n        }\r\n 
       return CL_SUCCESS;\r\n    }\r\n\r\n#if defined(CL_HPP_USE_DX_INTEROP)\r\n   /*! \\brief Get the list of available D3D10 devices.\r\n     *\r\n     *  \\param d3d_device_source.\r\n     *\r\n     *  \\param d3d_object.\r\n     *\r\n     *  \\param d3d_device_set.\r\n     *\r\n     *  \\param devices returns a vector of OpenCL D3D10 devices found. The cl::Device\r\n     *  values returned in devices can be used to identify a specific OpenCL\r\n     *  device. If the \\a devices argument is nullptr, CL_INVALID_ARG_VALUE is\r\n     *  reported and no devices are queried.\r\n     *\r\n     *  \\return One of the following values:\r\n     *    - CL_SUCCESS if the function is executed successfully.\r\n     *\r\n     *  The application can query specific capabilities of the OpenCL device(s)\r\n     *  returned by cl::Platform::getDevices. This can be used by the application to\r\n     *  determine which device(s) to use.\r\n     *\r\n     * \\note In the case that exceptions are enabled and a return value\r\n     * other than CL_SUCCESS is generated, then cl::Error exception is\r\n     * generated.\r\n     */\r\n    cl_int getDevices(\r\n        cl_d3d10_device_source_khr d3d_device_source,\r\n        void *                     d3d_object,\r\n        cl_d3d10_device_set_khr    d3d_device_set,\r\n        vector<Device>* devices) const\r\n    {\r\n        typedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clGetDeviceIDsFromD3D10KHR)(\r\n            cl_platform_id platform, \r\n            cl_d3d10_device_source_khr d3d_device_source, \r\n            void * d3d_object,\r\n            cl_d3d10_device_set_khr d3d_device_set,\r\n            cl_uint num_entries,\r\n            cl_device_id * devices,\r\n            cl_uint* num_devices);\r\n\r\n        if( devices == nullptr ) {\r\n            return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_DEVICE_IDS_ERR);\r\n        }\r\n\r\n        static PFN_clGetDeviceIDsFromD3D10KHR pfn_clGetDeviceIDsFromD3D10KHR = nullptr;\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 120\r\n        
CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(object_, clGetDeviceIDsFromD3D10KHR);\r\n#endif\r\n#if CL_HPP_MINIMUM_OPENCL_VERSION < 120\r\n        CL_HPP_INIT_CL_EXT_FCN_PTR_(clGetDeviceIDsFromD3D10KHR);\r\n#endif\r\n\r\n        cl_uint n = 0;\r\n        cl_int err = pfn_clGetDeviceIDsFromD3D10KHR(\r\n            object_, \r\n            d3d_device_source, \r\n            d3d_object,\r\n            d3d_device_set, \r\n            0, \r\n            nullptr, \r\n            &n);\r\n        if (err != CL_SUCCESS) {\r\n            return detail::errHandler(err, __GET_DEVICE_IDS_ERR);\r\n        }\r\n\r\n        vector<cl_device_id> ids(n);\r\n        err = pfn_clGetDeviceIDsFromD3D10KHR(\r\n            object_, \r\n            d3d_device_source, \r\n            d3d_object,\r\n            d3d_device_set,\r\n            n, \r\n            ids.data(), \r\n            nullptr);\r\n        if (err != CL_SUCCESS) {\r\n            return detail::errHandler(err, __GET_DEVICE_IDS_ERR);\r\n        }\r\n\r\n        // Cannot trivially assign because we need to capture intermediates \r\n        // with safe construction\r\n        // We must retain things we obtain from the API to avoid releasing\r\n        // API-owned objects.\r\n        if (devices) {\r\n            devices->resize(ids.size());\r\n\r\n            // Assign to param, constructing with retain behaviour\r\n            // to correctly capture each underlying CL object\r\n            for (size_type i = 0; i < ids.size(); i++) {\r\n                (*devices)[i] = Device(ids[i], true);\r\n            }\r\n        }\r\n        return CL_SUCCESS;\r\n    }\r\n#endif\r\n\r\n    /*! 
\\brief Gets a list of available platforms.\r\n     * \r\n     *  Wraps clGetPlatformIDs().\r\n     */\r\n    static cl_int get(\r\n        vector<Platform>* platforms)\r\n    {\r\n        cl_uint n = 0;\r\n\r\n        if( platforms == nullptr ) {\r\n            return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_PLATFORM_IDS_ERR);\r\n        }\r\n\r\n        cl_int err = ::clGetPlatformIDs(0, nullptr, &n);\r\n        if (err != CL_SUCCESS) {\r\n            return detail::errHandler(err, __GET_PLATFORM_IDS_ERR);\r\n        }\r\n\r\n        vector<cl_platform_id> ids(n);\r\n        err = ::clGetPlatformIDs(n, ids.data(), nullptr);\r\n        if (err != CL_SUCCESS) {\r\n            return detail::errHandler(err, __GET_PLATFORM_IDS_ERR);\r\n        }\r\n\r\n        if (platforms) {\r\n            platforms->resize(ids.size());\r\n\r\n            // Platforms don't reference count\r\n            for (size_type i = 0; i < ids.size(); i++) {\r\n                (*platforms)[i] = Platform(ids[i]);\r\n            }\r\n        }\r\n        return CL_SUCCESS;\r\n    }\r\n\r\n    /*! \\brief Gets the first available platform.\r\n     * \r\n     *  Wraps clGetPlatformIDs(), returning the first result.\r\n     */\r\n    static cl_int get(\r\n        Platform * platform)\r\n    {\r\n        cl_int err;\r\n        Platform default_platform = Platform::getDefault(&err);\r\n        if (platform) {\r\n            *platform = default_platform;\r\n        }\r\n        return err;\r\n    }\r\n\r\n    /*! 
\\brief Gets the first available platform, returning it by value.\r\n     *\r\n     * \\return Returns a valid platform if one is available.\r\n     *         If no platform is available and exceptions are disabled,\r\n     *         returns a null platform.\r\n     * If exceptions are enabled, throws when no platforms are available\r\n     * or an error condition occurs.\r\n     * Wraps clGetPlatformIDs(), returning the first result.\r\n     */\r\n    static Platform get(\r\n        cl_int * errResult = nullptr)\r\n    {\r\n        cl_int err;\r\n        Platform default_platform = Platform::getDefault(&err);\r\n        if (errResult) {\r\n            *errResult = err;\r\n        }\r\n        return default_platform;\r\n    }    \r\n    \r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 120\r\n    //! \\brief Wrapper for clUnloadPlatformCompiler().\r\n    cl_int\r\n    unloadCompiler()\r\n    {\r\n        return ::clUnloadPlatformCompiler(object_);\r\n    }\r\n#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120\r\n}; // class Platform\r\n\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 120\r\n   //! 
\\brief Wrapper for clCreateSubDevices().\r\ninline cl_int Device::createSubDevices(const cl_device_partition_property* properties,\r\n                         vector<Device>* devices)\r\n{\r\n    cl_uint n = 0;\r\n    cl_int err = clCreateSubDevices(object_, properties, 0, nullptr, &n);\r\n    if (err != CL_SUCCESS)\r\n    {\r\n        return detail::errHandler(err, __CREATE_SUB_DEVICES_ERR);\r\n    }\r\n\r\n    vector<cl_device_id> ids(n);\r\n    err = clCreateSubDevices(object_, properties, n, ids.data(), nullptr);\r\n    if (err != CL_SUCCESS)\r\n    {\r\n        return detail::errHandler(err, __CREATE_SUB_DEVICES_ERR);\r\n    }\r\n\r\n    // Cannot trivially assign because we need to capture intermediates\r\n    // with safe construction\r\n    if (devices)\r\n    {\r\n        devices->resize(ids.size());\r\n\r\n        // Assign to param, wrapping each new sub-device\r\n        // without an additional retain\r\n        for (size_type i = 0; i < ids.size(); i++)\r\n        {\r\n            // We do not need to retain because this device is being created\r\n            // by the runtime\r\n            (*devices)[i] = Device(ids[i], false);\r\n        }\r\n    }\r\n\r\n    return CL_SUCCESS;\r\n}\r\n#endif // defined (CL_HPP_TARGET_OPENCL_VERSION >= 120)\r\n\r\n#if defined(cl_ext_device_fission)\r\n   //! 
\\brief Wrapper for clCreateSubDevices().\r\ninline cl_int Device::createSubDevices(const cl_device_partition_property_ext* properties,\r\n                        vector<Device>* devices)\r\n{\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 120\r\n    cl::Device device(object_);\r\n    cl_platform_id platform = device.getInfo<CL_DEVICE_PLATFORM>()();\r\n    CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clCreateSubDevicesEXT);\r\n#endif\r\n#if CL_HPP_MINIMUM_OPENCL_VERSION < 120\r\n    CL_HPP_INIT_CL_EXT_FCN_PTR_(clCreateSubDevicesEXT);\r\n#endif\r\n\r\n    cl_uint n = 0;\r\n    cl_int err = pfn_clCreateSubDevicesEXT(object_, properties, 0, nullptr, &n);\r\n    if (err != CL_SUCCESS)\r\n    {\r\n        return detail::errHandler(err, __CREATE_SUB_DEVICES_ERR);\r\n    }\r\n\r\n    vector<cl_device_id> ids(n);\r\n    err =\r\n        pfn_clCreateSubDevicesEXT(object_, properties, n, ids.data(), nullptr);\r\n    if (err != CL_SUCCESS)\r\n    {\r\n        return detail::errHandler(err, __CREATE_SUB_DEVICES_ERR);\r\n    }\r\n    // Cannot trivially assign because we need to capture intermediates\r\n    // with safe construction\r\n    if (devices)\r\n    {\r\n        devices->resize(ids.size());\r\n\r\n        // Assign to param, constructing with retain behaviour\r\n        // to correctly capture each underlying CL object\r\n        for (size_type i = 0; i < ids.size(); i++)\r\n        {\r\n            // We do not need to retain because this device is being created\r\n            // by the runtime\r\n            (*devices)[i] = Device(ids[i], false);\r\n        }\r\n    }\r\n\r\n    return CL_SUCCESS;\r\n}\r\n#endif // defined(cl_ext_device_fission)\r\n\r\nCL_HPP_DEFINE_STATIC_MEMBER_ std::once_flag Platform::default_initialized_;\r\nCL_HPP_DEFINE_STATIC_MEMBER_ Platform Platform::default_;\r\nCL_HPP_DEFINE_STATIC_MEMBER_ cl_int Platform::default_error_ = CL_SUCCESS;\r\n\r\n\r\n/**\r\n * Deprecated APIs for 1.2\r\n */\r\n#if 
defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)\r\n/**\r\n * Unload the OpenCL compiler.\r\n * \\note Deprecated for OpenCL 1.2. Use Platform::unloadCompiler instead.\r\n */\r\ninline CL_API_PREFIX__VERSION_1_1_DEPRECATED cl_int\r\nUnloadCompiler() CL_API_SUFFIX__VERSION_1_1_DEPRECATED;\r\ninline cl_int\r\nUnloadCompiler()\r\n{\r\n    return ::clUnloadCompiler();\r\n}\r\n#endif // #if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)\r\n\r\n\r\n#if defined(cl_ext_image_requirements_info)\r\nenum ImageRequirementsInfoExt : cl_image_requirements_info_ext\r\n{\r\n    RowPitchAlign = CL_IMAGE_REQUIREMENTS_ROW_PITCH_ALIGNMENT_EXT,\r\n    BaseAddAlign = CL_IMAGE_REQUIREMENTS_BASE_ADDRESS_ALIGNMENT_EXT,\r\n    Size = CL_IMAGE_REQUIREMENTS_SIZE_EXT,\r\n    MaxWidth = CL_IMAGE_REQUIREMENTS_MAX_WIDTH_EXT,\r\n    MaxHeight = CL_IMAGE_REQUIREMENTS_MAX_HEIGHT_EXT,\r\n    MaxDepth = CL_IMAGE_REQUIREMENTS_MAX_DEPTH_EXT,\r\n    MaxArraySize = CL_IMAGE_REQUIREMENTS_MAX_ARRAY_SIZE_EXT,\r\n#if defined(cl_ext_image_from_buffer)\r\n    SlicePitchAlign = CL_IMAGE_REQUIREMENTS_SLICE_PITCH_ALIGNMENT_EXT,\r\n#endif\r\n};\r\n\r\n#endif // cl_ext_image_requirements_info\r\n\r\n\r\n/*! \\brief Class interface for cl_context.\r\n *\r\n *  \\note Copies of these objects are shallow, meaning that the copy will refer\r\n *        to the same underlying cl_context as the original.  For details, see\r\n *        clRetainContext() and clReleaseContext().\r\n *\r\n *  \\see cl_context\r\n */\r\nclass Context \r\n    : public detail::Wrapper<cl_context>\r\n{\r\nprivate:\r\n    static std::once_flag default_initialized_;\r\n    static Context default_;\r\n    static cl_int default_error_;\r\n\r\n    /*! \\brief Create the default context from the default device type in the default platform.\r\n     *\r\n     * This sets @c default_ and @c default_error_. 
It does not throw\r\n     * @c cl::Error.\r\n     */\r\n    static void makeDefault() {\r\n        /* Throwing an exception from a call_once invocation does not do\r\n         * what we wish, so we catch it and save the error.\r\n         */\r\n#if defined(CL_HPP_ENABLE_EXCEPTIONS)\r\n        try\r\n#endif\r\n        {\r\n#if !defined(__APPLE__) && !defined(__MACOS)\r\n            const Platform &p = Platform::getDefault();\r\n            cl_platform_id defaultPlatform = p();\r\n            cl_context_properties properties[3] = {\r\n                CL_CONTEXT_PLATFORM, (cl_context_properties)defaultPlatform, 0\r\n            };\r\n#else // #if !defined(__APPLE__) && !defined(__MACOS)\r\n            cl_context_properties *properties = nullptr;\r\n#endif // #if !defined(__APPLE__) && !defined(__MACOS)\r\n\r\n            default_ = Context(\r\n                CL_DEVICE_TYPE_DEFAULT,\r\n                properties,\r\n                nullptr,\r\n                nullptr,\r\n                &default_error_);\r\n        }\r\n#if defined(CL_HPP_ENABLE_EXCEPTIONS)\r\n        catch (cl::Error &e) {\r\n            default_error_ = e.err();\r\n        }\r\n#endif\r\n    }\r\n\r\n\r\n    /*! \\brief Create the default context from a provided Context.\r\n     *\r\n     * This sets @c default_. 
It does not throw\r\n     * @c cl::Error.\r\n     */\r\n    static void makeDefaultProvided(const Context &c) {\r\n        default_ = c;\r\n    }\r\n\r\n#if defined(cl_ext_image_requirements_info)\r\n    struct ImageRequirementsInfo {\r\n\r\n        ImageRequirementsInfo(cl_mem_flags f, const cl_mem_properties* mem_properties, const ImageFormat* format, const cl_image_desc* desc)\r\n        {\r\n            flags = f;\r\n            properties = mem_properties;\r\n            image_format = format;\r\n            image_desc = desc;\r\n        }\r\n\r\n        cl_mem_flags flags = 0;\r\n        const cl_mem_properties* properties;\r\n        const ImageFormat* image_format;\r\n        const cl_image_desc* image_desc;\r\n    };\r\n\r\n    static cl_int getImageRequirementsInfoExtHelper(const Context &context,\r\n        const ImageRequirementsInfo &info,\r\n        cl_image_requirements_info_ext param_name,\r\n        size_type param_value_size,\r\n        void* param_value,\r\n        size_type* param_value_size_ret)\r\n    {\r\n\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 120\r\n        Device device = context.getInfo<CL_CONTEXT_DEVICES>().at(0);\r\n        cl_platform_id platform = device.getInfo<CL_DEVICE_PLATFORM>()();\r\n        CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clGetImageRequirementsInfoEXT);\r\n#else\r\n        CL_HPP_INIT_CL_EXT_FCN_PTR_(clGetImageRequirementsInfoEXT);\r\n#endif\r\n\r\n        if (pfn_clGetImageRequirementsInfoEXT == nullptr) {\r\n            return detail::errHandler(CL_INVALID_OPERATION, __GET_IMAGE_REQUIREMENT_INFO_EXT_ERR);\r\n        }\r\n\r\n        return detail::errHandler(\r\n            pfn_clGetImageRequirementsInfoEXT(context(), info.properties,\r\n                info.flags, info.image_format, info.image_desc, param_name,\r\n                param_value_size, param_value, param_value_size_ret),\r\n            __GET_IMAGE_REQUIREMENT_INFO_EXT_ERR);\r\n    }\r\n#endif // cl_ext_image_requirements_info\r\n    
\r\npublic:\r\n#ifdef CL_HPP_UNIT_TEST_ENABLE\r\n    /*! \\brief Reset the default.\r\n    *\r\n    * This sets @c default_ to an empty value to support cleanup in\r\n    * the unit test framework.\r\n    * This function is not thread safe.\r\n    */\r\n    static void unitTestClearDefault() {\r\n        default_ = Context();\r\n    }\r\n#endif // #ifdef CL_HPP_UNIT_TEST_ENABLE\r\n\r\n    /*! \\brief Constructs a context including a list of specified devices.\r\n     *\r\n     *  Wraps clCreateContext().\r\n     */\r\n    Context(\r\n        const vector<Device>& devices,\r\n        const cl_context_properties* properties = nullptr,\r\n        void (CL_CALLBACK * notifyFptr)(\r\n            const char *,\r\n            const void *,\r\n            size_type,\r\n            void *) = nullptr,\r\n        void* data = nullptr,\r\n        cl_int* err = nullptr)\r\n    {\r\n        cl_int error;\r\n\r\n        size_type numDevices = devices.size();\r\n        vector<cl_device_id> deviceIDs(numDevices);\r\n\r\n        for( size_type deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) {\r\n            deviceIDs[deviceIndex] = (devices[deviceIndex])();\r\n        }\r\n\r\n        object_ = ::clCreateContext(\r\n            properties, (cl_uint) numDevices,\r\n            deviceIDs.data(),\r\n            notifyFptr, data, &error);\r\n\r\n        detail::errHandler(error, __CREATE_CONTEXT_ERR);\r\n        if (err != nullptr) {\r\n            *err = error;\r\n        }\r\n    }\r\n\r\n    /*! 
\\brief Constructs a context including a specific device.\r\n     *\r\n     *  Wraps clCreateContext().\r\n     */\r\n    Context(\r\n        const Device& device,\r\n        const cl_context_properties* properties = nullptr,\r\n        void (CL_CALLBACK * notifyFptr)(\r\n            const char *,\r\n            const void *,\r\n            size_type,\r\n            void *) = nullptr,\r\n        void* data = nullptr,\r\n        cl_int* err = nullptr)\r\n    {\r\n        cl_int error;\r\n\r\n        cl_device_id deviceID = device();\r\n\r\n        object_ = ::clCreateContext(\r\n            properties, 1,\r\n            &deviceID,\r\n            notifyFptr, data, &error);\r\n\r\n        detail::errHandler(error, __CREATE_CONTEXT_ERR);\r\n        if (err != nullptr) {\r\n            *err = error;\r\n        }\r\n    }\r\n    \r\n    /*! \\brief Constructs a context including all or a subset of devices of a specified type.\r\n     *\r\n     *  Wraps clCreateContextFromType().\r\n     */\r\n    Context(\r\n        cl_device_type type,\r\n        const cl_context_properties* properties = nullptr,\r\n        void (CL_CALLBACK * notifyFptr)(\r\n            const char *,\r\n            const void *,\r\n            size_type,\r\n            void *) = nullptr,\r\n        void* data = nullptr,\r\n        cl_int* err = nullptr)\r\n    {\r\n        cl_int error;\r\n\r\n#if !defined(__APPLE__) && !defined(__MACOS)\r\n        cl_context_properties prop[4] = {CL_CONTEXT_PLATFORM, 0, 0, 0 };\r\n\r\n        if (properties == nullptr) {\r\n            // Get a valid platform ID as we cannot send in a blank one\r\n            vector<Platform> platforms;\r\n            error = Platform::get(&platforms);\r\n            if (error != CL_SUCCESS) {\r\n                detail::errHandler(error, __CREATE_CONTEXT_FROM_TYPE_ERR);\r\n                if (err != nullptr) {\r\n                    *err = error;\r\n                }\r\n                return;\r\n            }\r\n\r\n            // 
Check the platforms we found for a device of our specified type\r\n            cl_context_properties platform_id = 0;\r\n            for (unsigned int i = 0; i < platforms.size(); i++) {\r\n\r\n                vector<Device> devices;\r\n\r\n#if defined(CL_HPP_ENABLE_EXCEPTIONS)\r\n                try {\r\n#endif\r\n\r\n                    error = platforms[i].getDevices(type, &devices);\r\n\r\n#if defined(CL_HPP_ENABLE_EXCEPTIONS)\r\n                } catch (cl::Error& e) {\r\n                    error = e.err();\r\n                }\r\n    // Catch if exceptions are enabled as we don't want to exit if first platform has no devices of type\r\n    // We do error checking next anyway, and can throw there if needed\r\n#endif\r\n\r\n                // Only squash CL_SUCCESS and CL_DEVICE_NOT_FOUND\r\n                if (error != CL_SUCCESS && error != CL_DEVICE_NOT_FOUND) {\r\n                    detail::errHandler(error, __CREATE_CONTEXT_FROM_TYPE_ERR);\r\n                    if (err != nullptr) {\r\n                        *err = error;\r\n                    }\r\n                }\r\n\r\n                if (devices.size() > 0) {\r\n                    platform_id = (cl_context_properties)platforms[i]();\r\n                    break;\r\n                }\r\n            }\r\n\r\n            if (platform_id == 0) {\r\n                detail::errHandler(CL_DEVICE_NOT_FOUND, __CREATE_CONTEXT_FROM_TYPE_ERR);\r\n                if (err != nullptr) {\r\n                    *err = CL_DEVICE_NOT_FOUND;\r\n                }\r\n                return;\r\n            }\r\n\r\n            prop[1] = platform_id;\r\n            properties = &prop[0];\r\n        }\r\n#endif\r\n        object_ = ::clCreateContextFromType(\r\n            properties, type, notifyFptr, data, &error);\r\n\r\n        detail::errHandler(error, __CREATE_CONTEXT_FROM_TYPE_ERR);\r\n        if (err != nullptr) {\r\n            *err = error;\r\n        }\r\n    }\r\n\r\n\r\n    /*! 
\\brief Returns a singleton context including all devices of CL_DEVICE_TYPE_DEFAULT.\r\n     *\r\n     *  \\note All calls to this function return the same cl_context as the first.\r\n     */\r\n    static Context getDefault(cl_int * err = nullptr) \r\n    {\r\n        std::call_once(default_initialized_, makeDefault);\r\n        detail::errHandler(default_error_);\r\n        if (err != nullptr) {\r\n            *err = default_error_;\r\n        }\r\n        return default_;\r\n    }\r\n\r\n    /**\r\n     * Modify the default context to be used by\r\n     * subsequent operations.\r\n     * Will only set the default if no default was previously created.\r\n     * @return updated default context.\r\n     *         Should be compared to the passed value to ensure that it was updated.\r\n     */\r\n    static Context setDefault(const Context &default_context)\r\n    {\r\n        std::call_once(default_initialized_, makeDefaultProvided, std::cref(default_context));\r\n        detail::errHandler(default_error_);\r\n        return default_;\r\n    }\r\n\r\n    //! \\brief Default constructor - initializes to nullptr.\r\n    Context() : detail::Wrapper<cl_type>() { }\r\n\r\n    /*! \\brief Constructor from cl_context - takes ownership.\r\n     * \r\n     *  This effectively transfers ownership of a refcount on the cl_context\r\n     *  into the new Context object.\r\n     */\r\n    explicit Context(const cl_context& context, bool retainObject = false) : \r\n        detail::Wrapper<cl_type>(context, retainObject) { }\r\n\r\n    /*! \\brief Assignment operator from cl_context - takes ownership.\r\n     * \r\n     *  This effectively transfers ownership of a refcount on the rhs and calls\r\n     *  clReleaseContext() on the value previously held by this instance.\r\n     */\r\n    Context& operator = (const cl_context& rhs)\r\n    {\r\n        detail::Wrapper<cl_type>::operator=(rhs);\r\n        return *this;\r\n    }\r\n\r\n    //! 
\\brief Wrapper for clGetContextInfo().\r\n    template <typename T>\r\n    cl_int getInfo(cl_context_info name, T* param) const\r\n    {\r\n        return detail::errHandler(\r\n            detail::getInfo(&::clGetContextInfo, object_, name, param),\r\n            __GET_CONTEXT_INFO_ERR);\r\n    }\r\n\r\n    //! \\brief Wrapper for clGetContextInfo() that returns by value.\r\n    template <cl_context_info name> typename\r\n    detail::param_traits<detail::cl_context_info, name>::param_type\r\n    getInfo(cl_int* err = nullptr) const\r\n    {\r\n        typename detail::param_traits<\r\n            detail::cl_context_info, name>::param_type param;\r\n        cl_int result = getInfo(name, &param);\r\n        if (err != nullptr) {\r\n            *err = result;\r\n        }\r\n        return param;\r\n    }\r\n\r\n    /*! \\brief Gets a list of supported image formats.\r\n     *  \r\n     *  Wraps clGetSupportedImageFormats().\r\n     */\r\n    cl_int getSupportedImageFormats(\r\n        cl_mem_flags flags,\r\n        cl_mem_object_type type,\r\n        vector<ImageFormat>* formats) const\r\n    {\r\n        cl_uint numEntries;\r\n        \r\n        if (!formats) {\r\n            return CL_SUCCESS;\r\n        }\r\n\r\n        cl_int err = ::clGetSupportedImageFormats(\r\n           object_, \r\n           flags,\r\n           type, \r\n           0, \r\n           nullptr, \r\n           &numEntries);\r\n        if (err != CL_SUCCESS) {\r\n            return detail::errHandler(err, __GET_SUPPORTED_IMAGE_FORMATS_ERR);\r\n        }\r\n\r\n        if (numEntries > 0) {\r\n            vector<ImageFormat> value(numEntries);\r\n            err = ::clGetSupportedImageFormats(\r\n                object_,\r\n                flags,\r\n                type,\r\n                numEntries,\r\n                (cl_image_format*)value.data(),\r\n                nullptr);\r\n            if (err != CL_SUCCESS) {\r\n                return detail::errHandler(err, 
__GET_SUPPORTED_IMAGE_FORMATS_ERR);\r\n            }\r\n\r\n            formats->assign(value.begin(), value.end());\r\n        }\r\n        else {\r\n            // If no values are being returned, ensure an empty vector comes back\r\n            formats->clear();\r\n        }\r\n\r\n        return CL_SUCCESS;\r\n    }\r\n\r\n#if defined(cl_ext_image_requirements_info)\r\n    template <typename T>\r\n    cl_int getImageRequirementsInfoExt(cl_image_requirements_info_ext name,\r\n        T* param,\r\n        cl_mem_flags flags = 0,\r\n        const cl_mem_properties* properties = nullptr,\r\n        const ImageFormat* image_format = nullptr,\r\n        const cl_image_desc* image_desc = nullptr) const\r\n    {\r\n        ImageRequirementsInfo imageInfo = {flags, properties, image_format, image_desc};\r\n\r\n        return detail::errHandler(\r\n            detail::getInfo(\r\n                Context::getImageRequirementsInfoExtHelper, *this, imageInfo, name, param),\r\n                __GET_IMAGE_REQUIREMENT_INFO_EXT_ERR);\r\n    }\r\n\r\n    template <cl_image_requirements_info_ext type> typename\r\n    detail::param_traits<detail::cl_image_requirements_info_ext, type>::param_type\r\n        getImageRequirementsInfoExt(cl_mem_flags flags = 0,\r\n            const cl_mem_properties* properties = nullptr,\r\n            const ImageFormat* image_format = nullptr,\r\n            const cl_image_desc* image_desc = nullptr,\r\n            cl_int* err = nullptr) const\r\n    {\r\n        typename detail::param_traits<\r\n        detail::cl_image_requirements_info_ext, type>::param_type param;\r\n        cl_int result = getImageRequirementsInfoExt(type, &param, flags, properties, image_format, image_desc);\r\n        if (err != nullptr) {\r\n            *err = result;\r\n        }\r\n        return param;\r\n    }\r\n#endif // cl_ext_image_requirements_info\r\n\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 300\r\n    /*! 
\\brief  Registers a destructor callback function with a context.\r\n     *\r\n     *  Wraps clSetContextDestructorCallback().\r\n     * \r\n     * Each call to this function registers the specified callback function on\r\n     * a destructor callback stack associated with context. The registered\r\n     * callback functions are called in the reverse order in which they were registered.\r\n     * If a context callback function was specified when context was created,\r\n     * it will not be called after any context destructor callback is called.\r\n     */\r\n    cl_int setDestructorCallback(\r\n        void (CL_CALLBACK * pfn_notify)(cl_context, void *),\r\n        void * user_data = nullptr)\r\n    {\r\n        return detail::errHandler(\r\n            ::clSetContextDestructorCallback(\r\n                object_,\r\n                pfn_notify,\r\n                user_data),\r\n                __SET_CONTEXT_DESCTRUCTOR_CALLBACK_ERR);\r\n    }\r\n#endif // CL_HPP_TARGET_OPENCL_VERSION >= 300\r\n};\r\n\r\ninline void Device::makeDefault()\r\n{\r\n    /* Throwing an exception from a call_once invocation does not do\r\n    * what we wish, so we catch it and save the error.\r\n    */\r\n#if defined(CL_HPP_ENABLE_EXCEPTIONS)\r\n    try\r\n#endif\r\n    {\r\n        cl_int error = 0;\r\n\r\n        Context context = Context::getDefault(&error);\r\n        detail::errHandler(error, __CREATE_CONTEXT_ERR);\r\n\r\n        if (error != CL_SUCCESS) {\r\n            default_error_ = error;\r\n        }\r\n        else {\r\n            default_ = context.getInfo<CL_CONTEXT_DEVICES>()[0];\r\n            default_error_ = CL_SUCCESS;\r\n        }\r\n    }\r\n#if defined(CL_HPP_ENABLE_EXCEPTIONS)\r\n    catch (cl::Error &e) {\r\n        default_error_ = e.err();\r\n    }\r\n#endif\r\n}\r\n\r\nCL_HPP_DEFINE_STATIC_MEMBER_ std::once_flag Context::default_initialized_;\r\nCL_HPP_DEFINE_STATIC_MEMBER_ Context Context::default_;\r\nCL_HPP_DEFINE_STATIC_MEMBER_ cl_int 
Context::default_error_ = CL_SUCCESS;\r\n\r\n/*! \\brief Class interface for cl_event.\r\n *\r\n *  \\note Copies of these objects are shallow, meaning that the copy will refer\r\n *        to the same underlying cl_event as the original.  For details, see\r\n *        clRetainEvent() and clReleaseEvent().\r\n *\r\n *  \\see cl_event\r\n */\r\nclass Event : public detail::Wrapper<cl_event>\r\n{\r\npublic:\r\n    //! \\brief Default constructor - initializes to nullptr.\r\n    Event() : detail::Wrapper<cl_type>() { }\r\n\r\n    /*! \\brief Constructor from cl_event - takes ownership.\r\n     * \r\n     * \\param retainObject will cause the constructor to retain its cl object.\r\n     *                     Defaults to false to maintain compatibility with\r\n     *                     earlier versions.\r\n     *  This effectively transfers ownership of a refcount on the cl_event\r\n     *  into the new Event object.\r\n     */\r\n    explicit Event(const cl_event& event, bool retainObject = false) : \r\n        detail::Wrapper<cl_type>(event, retainObject) { }\r\n\r\n    /*! \\brief Assignment operator from cl_event - takes ownership.\r\n     *\r\n     *  This effectively transfers ownership of a refcount on the rhs and calls\r\n     *  clReleaseEvent() on the value previously held by this instance.\r\n     */\r\n    Event& operator = (const cl_event& rhs)\r\n    {\r\n        detail::Wrapper<cl_type>::operator=(rhs);\r\n        return *this;\r\n    }\r\n\r\n    //! \\brief Wrapper for clGetEventInfo().\r\n    template <typename T>\r\n    cl_int getInfo(cl_event_info name, T* param) const\r\n    {\r\n        return detail::errHandler(\r\n            detail::getInfo(&::clGetEventInfo, object_, name, param),\r\n            __GET_EVENT_INFO_ERR);\r\n    }\r\n\r\n    //! 
\\brief Wrapper for clGetEventInfo() that returns by value.\r\n    template <cl_event_info name> typename\r\n    detail::param_traits<detail::cl_event_info, name>::param_type\r\n    getInfo(cl_int* err = nullptr) const\r\n    {\r\n        typename detail::param_traits<\r\n            detail::cl_event_info, name>::param_type param;\r\n        cl_int result = getInfo(name, &param);\r\n        if (err != nullptr) {\r\n            *err = result;\r\n        }\r\n        return param;\r\n    }\r\n\r\n    //! \\brief Wrapper for clGetEventProfilingInfo().\r\n    template <typename T>\r\n    cl_int getProfilingInfo(cl_profiling_info name, T* param) const\r\n    {\r\n        return detail::errHandler(detail::getInfo(\r\n            &::clGetEventProfilingInfo, object_, name, param),\r\n            __GET_EVENT_PROFILE_INFO_ERR);\r\n    }\r\n\r\n    //! \\brief Wrapper for clGetEventProfilingInfo() that returns by value.\r\n    template <cl_profiling_info name> typename\r\n    detail::param_traits<detail::cl_profiling_info, name>::param_type\r\n    getProfilingInfo(cl_int* err = nullptr) const\r\n    {\r\n        typename detail::param_traits<\r\n            detail::cl_profiling_info, name>::param_type param;\r\n        cl_int result = getProfilingInfo(name, &param);\r\n        if (err != nullptr) {\r\n            *err = result;\r\n        }\r\n        return param;\r\n    }\r\n\r\n    /*! \\brief Blocks the calling thread until this event completes.\r\n     * \r\n     *  Wraps clWaitForEvents().\r\n     */\r\n    cl_int wait() const\r\n    {\r\n        return detail::errHandler(\r\n            ::clWaitForEvents(1, &object_),\r\n            __WAIT_FOR_EVENTS_ERR);\r\n    }\r\n\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 110\r\n    /*! 
\\brief Registers a user callback function for a specific command execution status.\r\n     *\r\n     *  Wraps clSetEventCallback().\r\n     */\r\n    cl_int setCallback(\r\n        cl_int type,\r\n        void (CL_CALLBACK * pfn_notify)(cl_event, cl_int, void *),\r\n        void * user_data = nullptr)\r\n    {\r\n        return detail::errHandler(\r\n            ::clSetEventCallback(\r\n                object_,\r\n                type,\r\n                pfn_notify,\r\n                user_data), \r\n            __SET_EVENT_CALLBACK_ERR);\r\n    }\r\n#endif // CL_HPP_TARGET_OPENCL_VERSION >= 110\r\n\r\n    /*! \\brief Blocks the calling thread until every event specified is complete.\r\n     * \r\n     *  Wraps clWaitForEvents().\r\n     */\r\n    static cl_int\r\n    waitForEvents(const vector<Event>& events)\r\n    {\r\n        static_assert(sizeof(cl::Event) == sizeof(cl_event),\r\n        \"Size of cl::Event must be equal to size of cl_event\");\r\n\r\n        return detail::errHandler(\r\n            ::clWaitForEvents(\r\n                (cl_uint) events.size(), (events.size() > 0) ? (cl_event*)&events.front() : nullptr),\r\n            __WAIT_FOR_EVENTS_ERR);\r\n    }\r\n};\r\n\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 110\r\n/*! \\brief Class interface for user events (a subset of cl_event's).\r\n * \r\n *  See Event for details about copy semantics, etc.\r\n */\r\nclass UserEvent : public Event\r\n{\r\npublic:\r\n    /*! \\brief Constructs a user event on a given context.\r\n     *\r\n     *  Wraps clCreateUserEvent().\r\n     */\r\n    UserEvent(\r\n        const Context& context,\r\n        cl_int * err = nullptr)\r\n    {\r\n        cl_int error;\r\n        object_ = ::clCreateUserEvent(\r\n            context(),\r\n            &error);\r\n\r\n        detail::errHandler(error, __CREATE_USER_EVENT_ERR);\r\n        if (err != nullptr) {\r\n            *err = error;\r\n        }\r\n    }\r\n\r\n    //! 
\\brief Default constructor - initializes to nullptr.\r\n    UserEvent() : Event() { }\r\n\r\n    /*! \\brief Sets the execution status of a user event object.\r\n     *\r\n     *  Wraps clSetUserEventStatus().\r\n     */\r\n    cl_int setStatus(cl_int status)\r\n    {\r\n        return detail::errHandler(\r\n            ::clSetUserEventStatus(object_,status), \r\n            __SET_USER_EVENT_STATUS_ERR);\r\n    }\r\n};\r\n#endif // CL_HPP_TARGET_OPENCL_VERSION >= 110\r\n\r\n/*! \\brief Blocks the calling thread until every event specified is complete.\r\n * \r\n *  Wraps clWaitForEvents().\r\n */\r\ninline static cl_int\r\nWaitForEvents(const vector<Event>& events)\r\n{\r\n    return detail::errHandler(\r\n        ::clWaitForEvents(\r\n            (cl_uint) events.size(), (events.size() > 0) ? (cl_event*)&events.front() : nullptr),\r\n        __WAIT_FOR_EVENTS_ERR);\r\n}\r\n\r\n/*! \\brief Class interface for cl_mem.\r\n *\r\n *  \\note Copies of these objects are shallow, meaning that the copy will refer\r\n *        to the same underlying cl_mem as the original.  For details, see\r\n *        clRetainMemObject() and clReleaseMemObject().\r\n *\r\n *  \\see cl_mem\r\n */\r\nclass Memory : public detail::Wrapper<cl_mem>\r\n{\r\npublic:\r\n    //! \\brief Default constructor - initializes to nullptr.\r\n    Memory() : detail::Wrapper<cl_type>() { }\r\n\r\n    /*! \\brief Constructor from cl_mem - takes ownership.\r\n     *\r\n     *  Optionally transfer ownership of a refcount on the cl_mem\r\n     *  into the new Memory object.\r\n     *\r\n     * \\param retainObject will cause the constructor to retain its cl object.\r\n     *                     Defaults to false to maintain compatibility with\r\n     *                     earlier versions.\r\n     *\r\n     *  See Memory for further details.\r\n     */\r\n    explicit Memory(const cl_mem& memory, bool retainObject) :\r\n        detail::Wrapper<cl_type>(memory, retainObject) { }\r\n\r\n    /*! 
\\brief Assignment operator from cl_mem - takes ownership.\r\n     *\r\n     *  This effectively transfers ownership of a refcount on the rhs and calls\r\n     *  clReleaseMemObject() on the value previously held by this instance.\r\n     */\r\n    Memory& operator = (const cl_mem& rhs)\r\n    {\r\n        detail::Wrapper<cl_type>::operator=(rhs);\r\n        return *this;\r\n    }\r\n\r\n    //! \\brief Wrapper for clGetMemObjectInfo().\r\n    template <typename T>\r\n    cl_int getInfo(cl_mem_info name, T* param) const\r\n    {\r\n        return detail::errHandler(\r\n            detail::getInfo(&::clGetMemObjectInfo, object_, name, param),\r\n            __GET_MEM_OBJECT_INFO_ERR);\r\n    }\r\n\r\n    //! \\brief Wrapper for clGetMemObjectInfo() that returns by value.\r\n    template <cl_mem_info name> typename\r\n    detail::param_traits<detail::cl_mem_info, name>::param_type\r\n    getInfo(cl_int* err = nullptr) const\r\n    {\r\n        typename detail::param_traits<\r\n            detail::cl_mem_info, name>::param_type param;\r\n        cl_int result = getInfo(name, &param);\r\n        if (err != nullptr) {\r\n            *err = result;\r\n        }\r\n        return param;\r\n    }\r\n\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 110\r\n    /*! 
\\brief Registers a callback function to be called when the memory object\r\n     *         is no longer needed.\r\n     *\r\n     *  Wraps clSetMemObjectDestructorCallback().\r\n     *\r\n     *  Repeated calls to this function, for a given cl_mem value, will append\r\n     *  to the list of functions called (in reverse order) when memory object's\r\n     *  resources are freed and the memory object is deleted.\r\n     *\r\n     *  \\note\r\n     *  The registered callbacks are associated with the underlying cl_mem\r\n     *  value - not the Memory class instance.\r\n     */\r\n    cl_int setDestructorCallback(\r\n        void (CL_CALLBACK * pfn_notify)(cl_mem, void *),\r\n        void * user_data = nullptr)\r\n    {\r\n        return detail::errHandler(\r\n            ::clSetMemObjectDestructorCallback(\r\n                object_,\r\n                pfn_notify,\r\n                user_data), \r\n            __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR);\r\n    }\r\n#endif // CL_HPP_TARGET_OPENCL_VERSION >= 110\r\n\r\n};\r\n\r\n// Pre-declare copy functions\r\nclass Buffer;\r\ntemplate< typename IteratorType >\r\ncl_int copy( IteratorType startIterator, IteratorType endIterator, cl::Buffer &buffer );\r\ntemplate< typename IteratorType >\r\ncl_int copy( const cl::Buffer &buffer, IteratorType startIterator, IteratorType endIterator );\r\ntemplate< typename IteratorType >\r\ncl_int copy( const CommandQueue &queue, IteratorType startIterator, IteratorType endIterator, cl::Buffer &buffer );\r\ntemplate< typename IteratorType >\r\ncl_int copy( const CommandQueue &queue, const cl::Buffer &buffer, IteratorType startIterator, IteratorType endIterator );\r\n\r\n\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 200\r\nnamespace detail\r\n{\r\n    class SVMTraitNull\r\n    {\r\n    public:\r\n        static cl_svm_mem_flags getSVMMemFlags()\r\n        {\r\n            return 0;\r\n        }\r\n    };\r\n} // namespace detail\r\n\r\ntemplate<class Trait = detail::SVMTraitNull>\r\nclass 
SVMTraitReadWrite\r\n{\r\npublic:\r\n    static cl_svm_mem_flags getSVMMemFlags()\r\n    {\r\n        return CL_MEM_READ_WRITE |\r\n            Trait::getSVMMemFlags();\r\n    }\r\n};\r\n\r\ntemplate<class Trait = detail::SVMTraitNull>\r\nclass SVMTraitReadOnly\r\n{\r\npublic:\r\n    static cl_svm_mem_flags getSVMMemFlags()\r\n    {\r\n        return CL_MEM_READ_ONLY |\r\n            Trait::getSVMMemFlags();\r\n    }\r\n};\r\n\r\ntemplate<class Trait = detail::SVMTraitNull>\r\nclass SVMTraitWriteOnly\r\n{\r\npublic:\r\n    static cl_svm_mem_flags getSVMMemFlags()\r\n    {\r\n        return CL_MEM_WRITE_ONLY |\r\n            Trait::getSVMMemFlags();\r\n    }\r\n};\r\n\r\ntemplate<class Trait = SVMTraitReadWrite<>>\r\nclass SVMTraitCoarse\r\n{\r\npublic:\r\n    static cl_svm_mem_flags getSVMMemFlags()\r\n    {\r\n        return Trait::getSVMMemFlags();\r\n    }\r\n};\r\n\r\ntemplate<class Trait = SVMTraitReadWrite<>>\r\nclass SVMTraitFine\r\n{\r\npublic:\r\n    static cl_svm_mem_flags getSVMMemFlags()\r\n    {\r\n        return CL_MEM_SVM_FINE_GRAIN_BUFFER |\r\n            Trait::getSVMMemFlags();\r\n    }\r\n};\r\n\r\ntemplate<class Trait = SVMTraitReadWrite<>>\r\nclass SVMTraitAtomic\r\n{\r\npublic:\r\n    static cl_svm_mem_flags getSVMMemFlags()\r\n    {\r\n        return\r\n            CL_MEM_SVM_FINE_GRAIN_BUFFER |\r\n            CL_MEM_SVM_ATOMICS |\r\n            Trait::getSVMMemFlags();\r\n    }\r\n};\r\n\r\n// Pre-declare SVM map function\r\ntemplate<typename T>\r\ninline cl_int enqueueMapSVM(\r\n    T* ptr,\r\n    cl_bool blocking,\r\n    cl_map_flags flags,\r\n    size_type size,\r\n    const vector<Event>* events = nullptr,\r\n    Event* event = nullptr);\r\n\r\n/**\r\n * STL-like allocator class for managing SVM objects provided for convenience.\r\n *\r\n * Note that while this behaves like an allocator for the purposes of constructing vectors and similar objects,\r\n * care must be taken when using with smart pointers.\r\n * The allocator should not be 
used to construct a unique_ptr if we are using coarse-grained SVM mode because\r\n * the coarse-grained management behaviour would behave incorrectly with respect to reference counting.\r\n *\r\n * Instead the allocator embeds a Deleter which may be used with unique_ptr and is used\r\n * with the allocate_shared and allocate_ptr supplied operations.\r\n */\r\ntemplate<typename T, class SVMTrait>\r\nclass SVMAllocator {\r\nprivate:\r\n    Context context_;\r\n\r\npublic:\r\n    typedef T value_type;\r\n    typedef value_type* pointer;\r\n    typedef const value_type* const_pointer;\r\n    typedef value_type& reference;\r\n    typedef const value_type& const_reference;\r\n    typedef std::size_t size_type;\r\n    typedef std::ptrdiff_t difference_type;\r\n\r\n    template<typename U>\r\n    struct rebind\r\n    {\r\n        typedef SVMAllocator<U, SVMTrait> other;\r\n    };\r\n\r\n    template<typename U, typename V>\r\n    friend class SVMAllocator;\r\n\r\n    SVMAllocator() :\r\n        context_(Context::getDefault())\r\n    {\r\n    }\r\n\r\n    explicit SVMAllocator(cl::Context context) :\r\n        context_(context)\r\n    {\r\n    }\r\n\r\n\r\n    SVMAllocator(const SVMAllocator &other) :\r\n        context_(other.context_)\r\n    {\r\n    }\r\n\r\n    template<typename U>\r\n    SVMAllocator(const SVMAllocator<U, SVMTrait> &other) :\r\n        context_(other.context_)\r\n    {\r\n    }\r\n\r\n    ~SVMAllocator()\r\n    {\r\n    }\r\n\r\n    pointer address(reference r) noexcept\r\n    {\r\n        return std::addressof(r);\r\n    }\r\n\r\n    const_pointer address(const_reference r) noexcept\r\n    {\r\n        return std::addressof(r);\r\n    }\r\n\r\n    /**\r\n     * Allocate an SVM pointer.\r\n     *\r\n     * If the allocator is coarse-grained, this will take ownership to allow\r\n     * containers to correctly construct data in place. 
\r\n     */\r\n    pointer allocate(\r\n        size_type size,\r\n        typename cl::SVMAllocator<void, SVMTrait>::const_pointer = 0,\r\n        bool map = true)\r\n    {\r\n        // Allocate memory with default alignment matching the size of the type\r\n        void* voidPointer =\r\n            clSVMAlloc(\r\n            context_(),\r\n            SVMTrait::getSVMMemFlags(),\r\n            size*sizeof(T),\r\n            0);\r\n        pointer retValue = reinterpret_cast<pointer>(\r\n            voidPointer);\r\n#if defined(CL_HPP_ENABLE_EXCEPTIONS)\r\n        if (!retValue) {\r\n            std::bad_alloc excep;\r\n            throw excep;\r\n        }\r\n#endif // #if defined(CL_HPP_ENABLE_EXCEPTIONS)\r\n\r\n        // If allocation was coarse-grained then map it\r\n        if (map && !(SVMTrait::getSVMMemFlags() & CL_MEM_SVM_FINE_GRAIN_BUFFER)) {\r\n            cl_int err = enqueueMapSVM(retValue, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, size*sizeof(T));\r\n            if (err != CL_SUCCESS) {\r\n                clSVMFree(context_(), retValue);\r\n                retValue = nullptr;\r\n#if defined(CL_HPP_ENABLE_EXCEPTIONS)\r\n                std::bad_alloc excep;\r\n                throw excep;\r\n#endif\r\n            }\r\n        }\r\n\r\n        // If exceptions disabled, return null pointer from allocator\r\n        return retValue;\r\n    }\r\n\r\n    void deallocate(pointer p, size_type)\r\n    {\r\n        clSVMFree(context_(), p);\r\n    }\r\n\r\n    /**\r\n     * Return the maximum possible allocation size.\r\n     * This is the minimum of the maximum sizes of all devices in the context.\r\n     */\r\n    size_type max_size() const noexcept\r\n    {\r\n        size_type maxSize = std::numeric_limits<size_type>::max() / sizeof(T);\r\n\r\n        for (const Device &d : context_.getInfo<CL_CONTEXT_DEVICES>()) {\r\n            maxSize = std::min(\r\n                maxSize, \r\n                
static_cast<size_type>(d.getInfo<CL_DEVICE_MAX_MEM_ALLOC_SIZE>()));\r\n        }\r\n\r\n        return maxSize;\r\n    }\r\n\r\n    template< class U, class... Args >\r\n    void construct(U* p, Args&&... args)\r\n    {\r\n        new(p)T(args...);\r\n    }\r\n\r\n    template< class U >\r\n    void destroy(U* p)\r\n    {\r\n        p->~U();\r\n    }\r\n\r\n    /**\r\n     * Returns true if the contexts match.\r\n     */\r\n    inline bool operator==(SVMAllocator const& rhs)\r\n    {\r\n        return (context_==rhs.context_);\r\n    }\r\n\r\n    inline bool operator!=(SVMAllocator const& a)\r\n    {\r\n        return !operator==(a);\r\n    }\r\n}; // class SVMAllocator\r\n\r\n\r\ntemplate<class SVMTrait>\r\nclass SVMAllocator<void, SVMTrait> {\r\npublic:\r\n    typedef void value_type;\r\n    typedef value_type* pointer;\r\n    typedef const value_type* const_pointer;\r\n\r\n    template<typename U>\r\n    struct rebind\r\n    {\r\n        typedef SVMAllocator<U, SVMTrait> other;\r\n    };\r\n\r\n    template<typename U, typename V>\r\n    friend class SVMAllocator;\r\n};\r\n\r\n#if !defined(CL_HPP_NO_STD_UNIQUE_PTR)\r\nnamespace detail\r\n{\r\n    template<class Alloc>\r\n    class Deleter {\r\n    private:\r\n        Alloc alloc_;\r\n        size_type copies_;\r\n\r\n    public:\r\n        typedef typename std::allocator_traits<Alloc>::pointer pointer;\r\n\r\n        Deleter(const Alloc &alloc, size_type copies) : alloc_{ alloc }, copies_{ copies }\r\n        {\r\n        }\r\n\r\n        void operator()(pointer ptr) const {\r\n            Alloc tmpAlloc{ alloc_ };\r\n            std::allocator_traits<Alloc>::destroy(tmpAlloc, std::addressof(*ptr));\r\n            std::allocator_traits<Alloc>::deallocate(tmpAlloc, ptr, copies_);\r\n        }\r\n    };\r\n} // namespace detail\r\n\r\n/**\r\n * Allocation operation compatible with std::allocate_ptr.\r\n * Creates a unique_ptr<T> by 
default.\r\n * This requirement is to ensure that the control block is not\r\n * allocated in memory inaccessible to the host.\r\n */\r\ntemplate <class T, class Alloc, class... Args>\r\ncl::pointer<T, detail::Deleter<Alloc>> allocate_pointer(const Alloc &alloc_, Args&&... args)\r\n{\r\n    Alloc alloc(alloc_);\r\n    static const size_type copies = 1;\r\n\r\n    // Ensure that creation of the management block and the\r\n    // object are dealt with separately such that we only provide a deleter\r\n\r\n    T* tmp = std::allocator_traits<Alloc>::allocate(alloc, copies);\r\n    if (!tmp) {\r\n#if defined(CL_HPP_ENABLE_EXCEPTIONS)\r\n        std::bad_alloc excep;\r\n        throw excep;\r\n#else\r\n        return nullptr;\r\n#endif\r\n    }\r\n\r\n#if defined(CL_HPP_ENABLE_EXCEPTIONS)\r\n    try\r\n#endif\r\n    {\r\n        std::allocator_traits<Alloc>::construct(\r\n            alloc,\r\n            std::addressof(*tmp),\r\n            std::forward<Args>(args)...);\r\n\r\n        return cl::pointer<T, detail::Deleter<Alloc>>(tmp, detail::Deleter<Alloc>{alloc, copies});\r\n    }\r\n#if defined(CL_HPP_ENABLE_EXCEPTIONS)\r\n    catch (std::bad_alloc&)\r\n    {\r\n        std::allocator_traits<Alloc>::deallocate(alloc, tmp, copies);\r\n        throw;\r\n    }\r\n#endif\r\n}\r\n\r\ntemplate< class T, class SVMTrait, class... Args >\r\ncl::pointer<T, detail::Deleter<SVMAllocator<T, SVMTrait>>> allocate_svm(Args... args)\r\n{\r\n    SVMAllocator<T, SVMTrait> alloc;\r\n    return cl::allocate_pointer<T>(alloc, args...);\r\n}\r\n\r\ntemplate< class T, class SVMTrait, class... Args >\r\ncl::pointer<T, detail::Deleter<SVMAllocator<T, SVMTrait>>> allocate_svm(const cl::Context &c, Args... args)\r\n{\r\n    SVMAllocator<T, SVMTrait> alloc(c);\r\n    return cl::allocate_pointer<T>(alloc, args...);\r\n}\r\n#endif // #if !defined(CL_HPP_NO_STD_UNIQUE_PTR)\r\n\r\n/*! 
\\brief Vector alias to simplify construction of coarse-grained SVM containers.\r\n * \r\n */\r\ntemplate < class T >\r\nusing coarse_svm_vector = vector<T, cl::SVMAllocator<int, cl::SVMTraitCoarse<>>>;\r\n\r\n/*! \\brief Vector alias to simplify construction of fine-grained SVM containers.\r\n*\r\n*/\r\ntemplate < class T >\r\nusing fine_svm_vector = vector<T, cl::SVMAllocator<int, cl::SVMTraitFine<>>>;\r\n\r\n/*! \\brief Vector alias to simplify construction of fine-grained SVM containers that support platform atomics.\r\n*\r\n*/\r\ntemplate < class T >\r\nusing atomic_svm_vector = vector<T, cl::SVMAllocator<int, cl::SVMTraitAtomic<>>>;\r\n\r\n#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200\r\n\r\n\r\n/*! \\brief Class interface for Buffer Memory Objects.\r\n * \r\n *  See Memory for details about copy semantics, etc.\r\n *\r\n *  \\see Memory\r\n */\r\nclass Buffer : public Memory\r\n{\r\npublic:\r\n\r\n    /*! \\brief Constructs a Buffer in a specified context.\r\n     *\r\n     *  Wraps clCreateBuffer().\r\n     *\r\n     *  \\param host_ptr Storage to be used if the CL_MEM_USE_HOST_PTR flag was\r\n     *                  specified.  Note alignment & exclusivity requirements.\r\n     */\r\n    Buffer(\r\n        const Context& context,\r\n        cl_mem_flags flags,\r\n        size_type size,\r\n        void* host_ptr = nullptr,\r\n        cl_int* err = nullptr)\r\n    {\r\n        cl_int error;\r\n        object_ = ::clCreateBuffer(context(), flags, size, host_ptr, &error);\r\n\r\n        detail::errHandler(error, __CREATE_BUFFER_ERR);\r\n        if (err != nullptr) {\r\n            *err = error;\r\n        }\r\n    }\r\n\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 300\r\n    /*! 
\\brief Constructs a Buffer in a specified context and with specified properties.\r\n     *\r\n     *  Wraps clCreateBufferWithProperties().\r\n     *\r\n     *  \\param properties Optional list of properties for the buffer object and\r\n     *                    their corresponding values. The non-empty list must\r\n     *                    end with 0. \r\n     *  \\param host_ptr Storage to be used if the CL_MEM_USE_HOST_PTR flag was\r\n     *                  specified. Note alignment & exclusivity requirements.\r\n     */\r\n    Buffer(\r\n        const Context& context,\r\n        const vector<cl_mem_properties>& properties,\r\n        cl_mem_flags flags,\r\n        size_type size,\r\n        void* host_ptr = nullptr,\r\n        cl_int* err = nullptr)\r\n    {\r\n        cl_int error;\r\n\r\n        if (properties.empty()) {\r\n            object_ = ::clCreateBufferWithProperties(context(), nullptr, flags,\r\n                                                     size, host_ptr, &error);\r\n        }\r\n        else {\r\n            object_ = ::clCreateBufferWithProperties(\r\n                context(), properties.data(), flags, size, host_ptr, &error);\r\n        }\r\n\r\n        detail::errHandler(error, __CREATE_BUFFER_ERR);\r\n        if (err != nullptr) {\r\n            *err = error;\r\n        }\r\n    }\r\n#endif\r\n\r\n    /*! \\brief Constructs a Buffer in the default context.\r\n     *\r\n     *  Wraps clCreateBuffer().\r\n     *\r\n     *  \\param host_ptr Storage to be used if the CL_MEM_USE_HOST_PTR flag was\r\n     *                  specified.  Note alignment & exclusivity requirements.\r\n     *\r\n     *  \\see Context::getDefault()\r\n     */\r\n    Buffer(\r\n        cl_mem_flags flags,\r\n        size_type size,\r\n        void* host_ptr = nullptr,\r\n        cl_int* err = nullptr) : Buffer(Context::getDefault(err), flags, size, host_ptr, err) { }\r\n\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 300\r\n    /*! 
\\brief Constructs a Buffer in the default context and with specified properties.\r\n     *\r\n     *  Wraps clCreateBufferWithProperties().\r\n     *\r\n     *  \\param properties Optional list of properties for the buffer object and\r\n     *                    their corresponding values. The non-empty list must\r\n     *                    end with 0. \r\n     *  \\param host_ptr Storage to be used if the CL_MEM_USE_HOST_PTR flag was\r\n     *                  specified. Note alignment & exclusivity requirements.\r\n     * \r\n     *  \\see Context::getDefault()\r\n     */\r\n    Buffer(\r\n        const vector<cl_mem_properties>& properties,\r\n        cl_mem_flags flags,\r\n        size_type size,\r\n        void* host_ptr = nullptr,\r\n        cl_int* err = nullptr) : Buffer(Context::getDefault(err), properties, flags, size, host_ptr, err) { }\r\n#endif\r\n\r\n    /*!\r\n     * \\brief Construct a Buffer from a host container via iterators.\r\n     * IteratorType must be random access.\r\n     * If useHostPtr is specified iterators must represent contiguous data.\r\n     */\r\n    template< typename IteratorType >\r\n    Buffer(\r\n        IteratorType startIterator,\r\n        IteratorType endIterator,\r\n        bool readOnly,\r\n        bool useHostPtr = false,\r\n        cl_int* err = nullptr)\r\n    {\r\n        typedef typename std::iterator_traits<IteratorType>::value_type DataType;\r\n        cl_int error;\r\n\r\n        cl_mem_flags flags = 0;\r\n        if( readOnly ) {\r\n            flags |= CL_MEM_READ_ONLY;\r\n        }\r\n        else {\r\n            flags |= CL_MEM_READ_WRITE;\r\n        }\r\n        if( useHostPtr ) {\r\n            flags |= CL_MEM_USE_HOST_PTR;\r\n        }\r\n        \r\n        size_type size = sizeof(DataType)*(endIterator - startIterator);\r\n\r\n        Context context = Context::getDefault(err);\r\n\r\n        if( useHostPtr ) {\r\n            object_ = ::clCreateBuffer(context(), flags, size, 
const_cast<DataType*>(&*startIterator), &error);\r\n        } else {\r\n            object_ = ::clCreateBuffer(context(), flags, size, 0, &error);\r\n        }\r\n\r\n        detail::errHandler(error, __CREATE_BUFFER_ERR);\r\n        if (err != nullptr) {\r\n            *err = error;\r\n        }\r\n\r\n        if( !useHostPtr ) {\r\n            error = cl::copy(startIterator, endIterator, *this);\r\n            detail::errHandler(error, __CREATE_BUFFER_ERR);\r\n            if (err != nullptr) {\r\n                *err = error;\r\n            }\r\n        }\r\n    }\r\n\r\n    /*!\r\n     * \\brief Construct a Buffer from a host container via iterators using a specified context.\r\n     * IteratorType must be random access.\r\n     * If useHostPtr is specified iterators must represent contiguous data.\r\n     */\r\n    template< typename IteratorType >\r\n    Buffer(const Context &context, IteratorType startIterator, IteratorType endIterator,\r\n        bool readOnly, bool useHostPtr = false, cl_int* err = nullptr);\r\n    \r\n    /*!\r\n    * \\brief Construct a Buffer from a host container via iterators using a specified queue.\r\n    * If useHostPtr is specified iterators must be random access.\r\n    */\r\n    template< typename IteratorType >\r\n    Buffer(const CommandQueue &queue, IteratorType startIterator, IteratorType endIterator,\r\n        bool readOnly, bool useHostPtr = false, cl_int* err = nullptr);\r\n\r\n    //! \\brief Default constructor - initializes to nullptr.\r\n    Buffer() : Memory() { }\r\n\r\n    /*! \\brief Constructor from cl_mem - takes ownership.\r\n     *\r\n     * \\param retainObject will cause the constructor to retain its cl object.\r\n     *                     Defaults to false to maintain compatibility with earlier versions.\r\n     *\r\n     *  See Memory for further details.\r\n     */\r\n    explicit Buffer(const cl_mem& buffer, bool retainObject = false) :\r\n        Memory(buffer, retainObject) { }\r\n\r\n    /*! 
\\brief Assignment from cl_mem - performs shallow copy.\r\n    *\r\n    *  See Memory for further details.\r\n    */\r\n    Buffer& operator = (const cl_mem& rhs)\r\n    {\r\n        Memory::operator=(rhs);\r\n        return *this;\r\n    }\r\n\r\n\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 110\r\n    /*! \\brief Creates a new buffer object from this.\r\n     *\r\n     *  Wraps clCreateSubBuffer().\r\n     */\r\n    Buffer createSubBuffer(\r\n        cl_mem_flags flags,\r\n        cl_buffer_create_type buffer_create_type,\r\n        const void * buffer_create_info,\r\n        cl_int * err = nullptr)\r\n    {\r\n        Buffer result;\r\n        cl_int error;\r\n        result.object_ = ::clCreateSubBuffer(\r\n            object_, \r\n            flags, \r\n            buffer_create_type, \r\n            buffer_create_info, \r\n            &error);\r\n\r\n        detail::errHandler(error, __CREATE_SUBBUFFER_ERR);\r\n        if (err != nullptr) {\r\n            *err = error;\r\n        }\r\n\r\n        return result;\r\n    }\r\n#endif // CL_HPP_TARGET_OPENCL_VERSION >= 110\r\n};\r\n\r\n#if defined (CL_HPP_USE_DX_INTEROP)\r\n/*! \\brief Class interface for creating OpenCL buffers from ID3D10Buffer's.\r\n *\r\n *  This is provided to facilitate interoperability with Direct3D.\r\n * \r\n *  See Memory for details about copy semantics, etc.\r\n *\r\n *  \\see Memory\r\n */\r\nclass BufferD3D10 : public Buffer\r\n{\r\npublic:\r\n   \r\n\r\n    /*! 
\\brief Constructs a BufferD3D10, in a specified context, from a\r\n     *         given ID3D10Buffer.\r\n     *\r\n     *  Wraps clCreateFromD3D10BufferKHR().\r\n     */\r\n    BufferD3D10(\r\n        const Context& context,\r\n        cl_mem_flags flags,\r\n        ID3D10Buffer* bufobj,\r\n        cl_int * err = nullptr) : pfn_clCreateFromD3D10BufferKHR(nullptr)\r\n    {\r\n        typedef CL_API_ENTRY cl_mem (CL_API_CALL *PFN_clCreateFromD3D10BufferKHR)(\r\n            cl_context context, cl_mem_flags flags, ID3D10Buffer*  buffer,\r\n            cl_int* errcode_ret);\r\n        PFN_clCreateFromD3D10BufferKHR pfn_clCreateFromD3D10BufferKHR;\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 120\r\n        vector<cl_context_properties> props = context.getInfo<CL_CONTEXT_PROPERTIES>();\r\n        cl_platform platform = nullptr;\r\n        for( int i = 0; i < props.size(); ++i ) {\r\n            if( props[i] == CL_CONTEXT_PLATFORM ) {\r\n                platform = props[i+1];\r\n            }\r\n        }\r\n        CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clCreateFromD3D10BufferKHR);\r\n#endif\r\n#if CL_HPP_MINIMUM_OPENCL_VERSION < 120\r\n        CL_HPP_INIT_CL_EXT_FCN_PTR_(clCreateFromD3D10BufferKHR);\r\n#endif\r\n\r\n        cl_int error;\r\n        object_ = pfn_clCreateFromD3D10BufferKHR(\r\n            context(),\r\n            flags,\r\n            bufobj,\r\n            &error);\r\n\r\n        // TODO: This should really have a D3D10 error code!\r\n        detail::errHandler(error, __CREATE_GL_BUFFER_ERR);\r\n        if (err != nullptr) {\r\n            *err = error;\r\n        }\r\n    }\r\n\r\n    //! \\brief Default constructor - initializes to nullptr.\r\n    BufferD3D10() : Buffer() { }\r\n\r\n    /*! 
\\brief Constructor from cl_mem - takes ownership.\r\n     *\r\n     * \\param retainObject will cause the constructor to retain its cl object.\r\n     *                     Defaults to false to maintain compatibility with \r\n     *                     earlier versions.\r\n     *  See Memory for further details.\r\n     */\r\n    explicit BufferD3D10(const cl_mem& buffer, bool retainObject = false) : \r\n        Buffer(buffer, retainObject) { }\r\n\r\n    /*! \\brief Assignment from cl_mem - performs shallow copy.\r\n     *\r\n     *  See Memory for further details.\r\n     */\r\n    BufferD3D10& operator = (const cl_mem& rhs)\r\n    {\r\n        Buffer::operator=(rhs);\r\n        return *this;\r\n    }\r\n};\r\n#endif\r\n\r\n/*! \\brief Class interface for GL Buffer Memory Objects.\r\n *\r\n *  This is provided to facilitate interoperability with OpenGL.\r\n * \r\n *  See Memory for details about copy semantics, etc.\r\n * \r\n *  \\see Memory\r\n */\r\nclass BufferGL : public Buffer\r\n{\r\npublic:\r\n    /*! \\brief Constructs a BufferGL in a specified context, from a given\r\n     *         GL buffer.\r\n     *\r\n     *  Wraps clCreateFromGLBuffer().\r\n     */\r\n    BufferGL(\r\n        const Context& context,\r\n        cl_mem_flags flags,\r\n        cl_GLuint bufobj,\r\n        cl_int * err = nullptr)\r\n    {\r\n        cl_int error;\r\n        object_ = ::clCreateFromGLBuffer(\r\n            context(),\r\n            flags,\r\n            bufobj,\r\n            &error);\r\n\r\n        detail::errHandler(error, __CREATE_GL_BUFFER_ERR);\r\n        if (err != nullptr) {\r\n            *err = error;\r\n        }\r\n    }\r\n\r\n    //! \\brief Default constructor - initializes to nullptr.\r\n    BufferGL() : Buffer() { }\r\n\r\n    /*! 
\\brief Constructor from cl_mem - takes ownership.\r\n     *\r\n     * \\param retainObject will cause the constructor to retain its cl object.\r\n     *                     Defaults to false to maintain compatibility with\r\n     *                     earlier versions.\r\n     *  See Memory for further details.\r\n     */\r\n    explicit BufferGL(const cl_mem& buffer, bool retainObject = false) :\r\n        Buffer(buffer, retainObject) { }\r\n\r\n    /*! \\brief Assignment from cl_mem - performs shallow copy.\r\n     *\r\n     *  See Memory for further details.\r\n     */\r\n    BufferGL& operator = (const cl_mem& rhs)\r\n    {\r\n        Buffer::operator=(rhs);\r\n        return *this;\r\n    }\r\n\r\n\r\n    //! \\brief Wrapper for clGetGLObjectInfo().\r\n    cl_int getObjectInfo(\r\n        cl_gl_object_type *type,\r\n        cl_GLuint * gl_object_name)\r\n    {\r\n        return detail::errHandler(\r\n            ::clGetGLObjectInfo(object_,type,gl_object_name),\r\n            __GET_GL_OBJECT_INFO_ERR);\r\n    }\r\n};\r\n\r\n/*! \\brief Class interface for GL Render Buffer Memory Objects.\r\n *\r\n *  This is provided to facilitate interoperability with OpenGL.\r\n * \r\n *  See Memory for details about copy semantics, etc.\r\n * \r\n *  \\see Memory\r\n */\r\nclass BufferRenderGL : public Buffer\r\n{\r\npublic:\r\n    /*! 
\\brief Constructs a BufferRenderGL in a specified context, from a given\r\n     *         GL Renderbuffer.\r\n     *\r\n     *  Wraps clCreateFromGLRenderbuffer().\r\n     */\r\n    BufferRenderGL(\r\n        const Context& context,\r\n        cl_mem_flags flags,\r\n        cl_GLuint bufobj,\r\n        cl_int * err = nullptr)\r\n    {\r\n        cl_int error;\r\n        object_ = ::clCreateFromGLRenderbuffer(\r\n            context(),\r\n            flags,\r\n            bufobj,\r\n            &error);\r\n\r\n        detail::errHandler(error, __CREATE_GL_RENDER_BUFFER_ERR);\r\n        if (err != nullptr) {\r\n            *err = error;\r\n        }\r\n    }\r\n\r\n    //! \\brief Default constructor - initializes to nullptr.\r\n    BufferRenderGL() : Buffer() { }\r\n\r\n    /*! \\brief Constructor from cl_mem - takes ownership.\r\n     *\r\n     * \\param retainObject will cause the constructor to retain its cl object.\r\n     *                     Defaults to false to maintain compatibility with \r\n     *                     earlier versions.\r\n     *  See Memory for further details.\r\n     */\r\n    explicit BufferRenderGL(const cl_mem& buffer, bool retainObject = false) :\r\n        Buffer(buffer, retainObject) { }\r\n\r\n    /*! \\brief Assignment from cl_mem - performs shallow copy.\r\n     *\r\n     *  See Memory for further details.\r\n     */\r\n    BufferRenderGL& operator = (const cl_mem& rhs)\r\n    {\r\n        Buffer::operator=(rhs);\r\n        return *this;\r\n    }\r\n\r\n\r\n    //! \\brief Wrapper for clGetGLObjectInfo().\r\n    cl_int getObjectInfo(\r\n        cl_gl_object_type *type,\r\n        cl_GLuint * gl_object_name)\r\n    {\r\n        return detail::errHandler(\r\n            ::clGetGLObjectInfo(object_,type,gl_object_name),\r\n            __GET_GL_OBJECT_INFO_ERR);\r\n    }\r\n};\r\n\r\n/*! 
\\brief C++ base class for Image Memory objects.\r\n *\r\n *  See Memory for details about copy semantics, etc.\r\n * \r\n *  \\see Memory\r\n */\r\nclass Image : public Memory\r\n{\r\nprotected:\r\n    //! \\brief Default constructor - initializes to nullptr.\r\n    Image() : Memory() { }\r\n\r\n    /*! \\brief Constructor from cl_mem - takes ownership.\r\n     *\r\n     * \\param retainObject will cause the constructor to retain its cl object.\r\n     *                     Defaults to false to maintain compatibility with\r\n     *                     earlier versions.\r\n     *  See Memory for further details.\r\n     */\r\n    explicit Image(const cl_mem& image, bool retainObject = false) :\r\n        Memory(image, retainObject) { }\r\n\r\n    /*! \\brief Assignment from cl_mem - performs shallow copy.\r\n     *\r\n     *  See Memory for further details.\r\n     */\r\n    Image& operator = (const cl_mem& rhs)\r\n    {\r\n        Memory::operator=(rhs);\r\n        return *this;\r\n    }\r\n\r\n\r\npublic:\r\n    //! \\brief Wrapper for clGetImageInfo().\r\n    template <typename T>\r\n    cl_int getImageInfo(cl_image_info name, T* param) const\r\n    {\r\n        return detail::errHandler(\r\n            detail::getInfo(&::clGetImageInfo, object_, name, param),\r\n            __GET_IMAGE_INFO_ERR);\r\n    }\r\n    \r\n    //! \\brief Wrapper for clGetImageInfo() that returns by value.\r\n    template <cl_image_info name> typename\r\n    detail::param_traits<detail::cl_image_info, name>::param_type\r\n    getImageInfo(cl_int* err = nullptr) const\r\n    {\r\n        typename detail::param_traits<\r\n            detail::cl_image_info, name>::param_type param;\r\n        cl_int result = getImageInfo(name, &param);\r\n        if (err != nullptr) {\r\n            *err = result;\r\n        }\r\n        return param;\r\n    }\r\n};\r\n\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 120\r\n/*! 
\\brief Class interface for 1D Image Memory objects.\r\n *\r\n *  See Memory for details about copy semantics, etc.\r\n * \r\n *  \\see Memory\r\n */\r\nclass Image1D : public Image\r\n{\r\npublic:\r\n    /*! \\brief Constructs a 1D Image in a specified context.\r\n     *\r\n     *  Wraps clCreateImage().\r\n     */\r\n    Image1D(\r\n        const Context& context,\r\n        cl_mem_flags flags,\r\n        ImageFormat format,\r\n        size_type width,\r\n        void* host_ptr = nullptr,\r\n        cl_int* err = nullptr)\r\n    {\r\n        cl_int error;\r\n\r\n        cl_image_desc desc = {};\r\n        desc.image_type = CL_MEM_OBJECT_IMAGE1D;\r\n        desc.image_width = width;\r\n\r\n        object_ = ::clCreateImage(\r\n            context(), \r\n            flags, \r\n            &format, \r\n            &desc, \r\n            host_ptr, \r\n            &error);\r\n\r\n        detail::errHandler(error, __CREATE_IMAGE_ERR);\r\n        if (err != nullptr) {\r\n            *err = error;\r\n        }\r\n    }\r\n\r\n    //! \\brief Default constructor - initializes to nullptr.\r\n    Image1D() { }\r\n\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 300\r\n    /*! \\brief Constructs a Image1D with specified properties.\r\n     *\r\n     *  Wraps clCreateImageWithProperties().\r\n     *\r\n     *  \\param properties Optional list of properties for the image object and\r\n     *                    their corresponding values. The non-empty list must\r\n     *                    end with 0.\r\n     *  \\param host_ptr Storage to be used if the CL_MEM_USE_HOST_PTR flag was\r\n     *                  specified. 
Note alignment & exclusivity requirements.\r\n     */\r\n    Image1D(const Context &context, const vector<cl_mem_properties> &properties,\r\n            cl_mem_flags flags, ImageFormat format, size_type width,\r\n            void *host_ptr = nullptr, cl_int *err = nullptr) {\r\n      cl_int error;\r\n\r\n      cl_image_desc desc = {};\r\n      desc.image_type = CL_MEM_OBJECT_IMAGE1D;\r\n      desc.image_width = width;\r\n\r\n      if (properties.empty()) {\r\n        object_ = ::clCreateImageWithProperties(\r\n            context(), nullptr, flags, &format, &desc, host_ptr, &error);\r\n      } else {\r\n        object_ =\r\n            ::clCreateImageWithProperties(context(), properties.data(), flags,\r\n                                          &format, &desc, host_ptr, &error);\r\n      }\r\n\r\n      detail::errHandler(error, __CREATE_IMAGE_ERR);\r\n      if (err != nullptr) {\r\n        *err = error;\r\n      }\r\n    }\r\n#endif //#if CL_HPP_TARGET_OPENCL_VERSION >= 300\r\n\r\n    /*! \\brief Constructor from cl_mem - takes ownership.\r\n     *\r\n     * \\param retainObject will cause the constructor to retain its cl object.\r\n     *                     Defaults to false to maintain compatibility with\r\n     *                     earlier versions.\r\n     *  See Memory for further details.\r\n     */\r\n    explicit Image1D(const cl_mem& image1D, bool retainObject = false) :\r\n        Image(image1D, retainObject) { }\r\n\r\n    /*! \\brief Assignment from cl_mem - performs shallow copy.\r\n     *\r\n     *  See Memory for further details.\r\n     */\r\n    Image1D& operator = (const cl_mem& rhs)\r\n    {\r\n        Image::operator=(rhs);\r\n        return *this;\r\n    }\r\n\r\n\r\n};\r\n\r\n/*! 
\\class Image1DBuffer\r\n * \\brief Image interface for 1D buffer images.\r\n */\r\nclass Image1DBuffer : public Image\r\n{\r\npublic:\r\n    Image1DBuffer(\r\n        const Context& context,\r\n        cl_mem_flags flags,\r\n        ImageFormat format,\r\n        size_type width,\r\n        const Buffer &buffer,\r\n        cl_int* err = nullptr)\r\n    {\r\n        cl_int error;\r\n\r\n        cl_image_desc desc = {};\r\n        desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;\r\n        desc.image_width = width;\r\n        desc.buffer = buffer();\r\n\r\n        object_ = ::clCreateImage(\r\n            context(), \r\n            flags, \r\n            &format, \r\n            &desc, \r\n            nullptr, \r\n            &error);\r\n\r\n        detail::errHandler(error, __CREATE_IMAGE_ERR);\r\n        if (err != nullptr) {\r\n            *err = error;\r\n        }\r\n    }\r\n\r\n    Image1DBuffer() { }\r\n\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 300\r\n    /*! \\brief Constructs a Image1DBuffer with specified properties.\r\n     *\r\n     *  Wraps clCreateImageWithProperties().\r\n     *\r\n     *  \\param properties Optional list of properties for the image object and\r\n     *                    their corresponding values. 
The non-empty list must\r\n     *                    end with 0.\r\n     *  \\param buffer Refer to a valid buffer or image memory object.\r\n     */\r\n    Image1DBuffer(const Context &context,\r\n                  const vector<cl_mem_properties> &properties,\r\n                  cl_mem_flags flags, ImageFormat format, size_type width,\r\n                  const Buffer &buffer, cl_int *err = nullptr) {\r\n      cl_int error;\r\n\r\n      cl_image_desc desc = {};\r\n      desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;\r\n      desc.image_width = width;\r\n      desc.buffer = buffer();\r\n\r\n      if (properties.empty()) {\r\n        object_ = ::clCreateImageWithProperties(\r\n            context(), nullptr, flags, &format, &desc, nullptr, &error);\r\n      } else {\r\n        object_ =\r\n            ::clCreateImageWithProperties(context(), properties.data(), flags,\r\n                                          &format, &desc, nullptr, &error);\r\n      }\r\n\r\n      detail::errHandler(error, __CREATE_IMAGE_ERR);\r\n      if (err != nullptr) {\r\n        *err = error;\r\n      }\r\n    }\r\n#endif //#if CL_HPP_TARGET_OPENCL_VERSION >= 300\r\n\r\n    /*! \\brief Constructor from cl_mem - takes ownership.\r\n     *\r\n     * \\param retainObject will cause the constructor to retain its cl object.\r\n     *                     Defaults to false to maintain compatibility with\r\n     *                     earlier versions.\r\n     *  See Memory for further details.\r\n     */\r\n    explicit Image1DBuffer(const cl_mem& image1D, bool retainObject = false) :\r\n        Image(image1D, retainObject) { }\r\n\r\n    Image1DBuffer& operator = (const cl_mem& rhs)\r\n    {\r\n        Image::operator=(rhs);\r\n        return *this;\r\n    }\r\n};\r\n\r\n/*! 
\\class Image1DArray\r\n * \\brief Image interface for arrays of 1D images.\r\n */\r\nclass Image1DArray : public Image\r\n{\r\npublic:\r\n    Image1DArray(\r\n        const Context& context,\r\n        cl_mem_flags flags,\r\n        ImageFormat format,\r\n        size_type arraySize,\r\n        size_type width,\r\n        size_type rowPitch,\r\n        void* host_ptr = nullptr,\r\n        cl_int* err = nullptr)\r\n    {\r\n        cl_int error;\r\n\r\n        cl_image_desc desc = {};\r\n        desc.image_type = CL_MEM_OBJECT_IMAGE1D_ARRAY;\r\n        desc.image_width = width;\r\n        desc.image_array_size = arraySize;\r\n        desc.image_row_pitch = rowPitch;\r\n\r\n        object_ = ::clCreateImage(\r\n            context(), \r\n            flags, \r\n            &format, \r\n            &desc, \r\n            host_ptr, \r\n            &error);\r\n\r\n        detail::errHandler(error, __CREATE_IMAGE_ERR);\r\n        if (err != nullptr) {\r\n            *err = error;\r\n        }\r\n    }\r\n\r\n    Image1DArray() { }\r\n\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 300\r\n    /*! \\brief Constructs a Image1DArray with specified properties.\r\n     *\r\n     *  Wraps clCreateImageWithProperties().\r\n     *\r\n     *  \\param properties Optional list of properties for the image object and\r\n     *                    their corresponding values. The non-empty list must\r\n     *                    end with 0.\r\n     *  \\param host_ptr Storage to be used if the CL_MEM_USE_HOST_PTR flag was\r\n     *                  specified. 
Note alignment & exclusivity requirements.\r\n     */\r\n    Image1DArray(const Context &context,\r\n                 const vector<cl_mem_properties> &properties,\r\n                 cl_mem_flags flags, ImageFormat format, size_type arraySize,\r\n                 size_type width, size_type rowPitch = 0,\r\n                 void *host_ptr = nullptr, cl_int *err = nullptr) {\r\n      cl_int error;\r\n\r\n      cl_image_desc desc = {};\r\n      desc.image_type = CL_MEM_OBJECT_IMAGE1D_ARRAY;\r\n      desc.image_width = width;\r\n      desc.image_array_size = arraySize;\r\n      desc.image_row_pitch = rowPitch;\r\n\r\n      if (properties.empty()) {\r\n        object_ = ::clCreateImageWithProperties(\r\n            context(), nullptr, flags, &format, &desc, host_ptr, &error);\r\n      } else {\r\n        object_ =\r\n            ::clCreateImageWithProperties(context(), properties.data(), flags,\r\n                                          &format, &desc, host_ptr, &error);\r\n      }\r\n\r\n      detail::errHandler(error, __CREATE_IMAGE_ERR);\r\n      if (err != nullptr) {\r\n        *err = error;\r\n      }\r\n    }\r\n#endif //#if CL_HPP_TARGET_OPENCL_VERSION >= 300\r\n\r\n    /*! \\brief Constructor from cl_mem - takes ownership.\r\n     *\r\n     * \\param retainObject will cause the constructor to retain its cl object.\r\n     *                     Defaults to false to maintain compatibility with\r\n     *                     earlier versions.\r\n     *  See Memory for further details.\r\n     */\r\n    explicit Image1DArray(const cl_mem& imageArray, bool retainObject = false) :\r\n        Image(imageArray, retainObject) { }\r\n\r\n\r\n    Image1DArray& operator = (const cl_mem& rhs)\r\n    {\r\n        Image::operator=(rhs);\r\n        return *this;\r\n    }\r\n\r\n\r\n};\r\n#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 120\r\n\r\n\r\n/*! 
\\brief Class interface for 2D Image Memory objects.\r\n *\r\n *  See Memory for details about copy semantics, etc.\r\n * \r\n *  \\see Memory\r\n */\r\nclass Image2D : public Image\r\n{\r\npublic:\r\n    /*! \\brief Constructs a 2D Image in a specified context.\r\n     *\r\n     *  Wraps clCreateImage().\r\n     */\r\n    Image2D(\r\n        const Context& context,\r\n        cl_mem_flags flags,\r\n        ImageFormat format,\r\n        size_type width,\r\n        size_type height,\r\n        size_type row_pitch = 0,\r\n        void* host_ptr = nullptr,\r\n        cl_int* err = nullptr)\r\n    {\r\n        cl_int error;\r\n        bool useCreateImage;\r\n\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 120 && CL_HPP_MINIMUM_OPENCL_VERSION < 120\r\n        // Run-time decision based on the actual platform\r\n        {\r\n            cl_uint version = detail::getContextPlatformVersion(context());\r\n            useCreateImage = (version >= 0x10002); // OpenCL 1.2 or above\r\n        }\r\n#elif CL_HPP_TARGET_OPENCL_VERSION >= 120\r\n        useCreateImage = true;\r\n#else\r\n        useCreateImage = false;\r\n#endif\r\n\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 120\r\n        if (useCreateImage)\r\n        {\r\n            cl_image_desc desc = {};\r\n            desc.image_type = CL_MEM_OBJECT_IMAGE2D;\r\n            desc.image_width = width;\r\n            desc.image_height = height;\r\n            desc.image_row_pitch = row_pitch;\r\n\r\n            object_ = ::clCreateImage(\r\n                context(),\r\n                flags,\r\n                &format,\r\n                &desc,\r\n                host_ptr,\r\n                &error);\r\n\r\n            detail::errHandler(error, __CREATE_IMAGE_ERR);\r\n            if (err != nullptr) {\r\n                *err = error;\r\n            }\r\n        }\r\n#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120\r\n#if CL_HPP_MINIMUM_OPENCL_VERSION < 120\r\n        if (!useCreateImage)\r\n        {\r\n            object_ = 
::clCreateImage2D(\r\n                context(), flags,&format, width, height, row_pitch, host_ptr, &error);\r\n\r\n            detail::errHandler(error, __CREATE_IMAGE2D_ERR);\r\n            if (err != nullptr) {\r\n                *err = error;\r\n            }\r\n        }\r\n#endif // CL_HPP_MINIMUM_OPENCL_VERSION < 120\r\n    }\r\n\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 120\r\n    /*! \\brief Constructs a 2D Image from a buffer.\r\n    * \\note This will share storage with the underlying buffer.\r\n    *\r\n    *  Requires OpenCL 2.0 or newer or OpenCL 1.2 and the \r\n    *  cl_khr_image2d_from_buffer extension.\r\n    *\r\n    *  Wraps clCreateImage().\r\n    */\r\n    Image2D(\r\n        const Context& context,\r\n        ImageFormat format,\r\n        const Buffer &sourceBuffer,\r\n        size_type width,\r\n        size_type height,\r\n        size_type row_pitch = 0,\r\n        cl_int* err = nullptr)\r\n    {\r\n        cl_int error;\r\n\r\n        cl_image_desc desc = {};\r\n        desc.image_type = CL_MEM_OBJECT_IMAGE2D;\r\n        desc.image_width = width;\r\n        desc.image_height = height;\r\n        desc.image_row_pitch = row_pitch;\r\n        desc.buffer = sourceBuffer();\r\n\r\n        object_ = ::clCreateImage(\r\n            context(),\r\n            0, // flags inherited from buffer\r\n            &format,\r\n            &desc,\r\n            nullptr,\r\n            &error);\r\n\r\n        detail::errHandler(error, __CREATE_IMAGE_ERR);\r\n        if (err != nullptr) {\r\n            *err = error;\r\n        }\r\n    }\r\n#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120\r\n\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 200\r\n    /*! \\brief Constructs a 2D Image from an image.\r\n    * \\note This will share storage with the underlying image but may\r\n    *       reinterpret the channel order and type.\r\n    *\r\n    * The image will be created with a descriptor matching the source. 
\r\n    *\r\n    * \\param order is the channel order to reinterpret the image data as.\r\n    *              The channel order may differ as described in the OpenCL \r\n    *              2.0 API specification.\r\n    *\r\n    * Wraps clCreateImage().\r\n    */\r\n    Image2D(\r\n        const Context& context,\r\n        cl_channel_order order,\r\n        const Image &sourceImage,\r\n        cl_int* err = nullptr)\r\n    {\r\n        cl_int error;\r\n\r\n        // Descriptor fields have to match source image\r\n        size_type sourceWidth = \r\n            sourceImage.getImageInfo<CL_IMAGE_WIDTH>();\r\n        size_type sourceHeight = \r\n            sourceImage.getImageInfo<CL_IMAGE_HEIGHT>();\r\n        size_type sourceRowPitch =\r\n            sourceImage.getImageInfo<CL_IMAGE_ROW_PITCH>();\r\n        cl_uint sourceNumMIPLevels =\r\n            sourceImage.getImageInfo<CL_IMAGE_NUM_MIP_LEVELS>();\r\n        cl_uint sourceNumSamples =\r\n            sourceImage.getImageInfo<CL_IMAGE_NUM_SAMPLES>();\r\n        cl_image_format sourceFormat =\r\n            sourceImage.getImageInfo<CL_IMAGE_FORMAT>();\r\n\r\n        // Update only the channel order. 
\r\n        // Channel format inherited from source.\r\n        sourceFormat.image_channel_order = order;\r\n\r\n        cl_image_desc desc = {};\r\n        desc.image_type = CL_MEM_OBJECT_IMAGE2D;\r\n        desc.image_width = sourceWidth;\r\n        desc.image_height = sourceHeight;\r\n        desc.image_row_pitch = sourceRowPitch;\r\n        desc.num_mip_levels = sourceNumMIPLevels;\r\n        desc.num_samples = sourceNumSamples;\r\n        desc.buffer = sourceImage();\r\n\r\n        object_ = ::clCreateImage(\r\n            context(),\r\n            0, // flags should be inherited from mem_object\r\n            &sourceFormat,\r\n            &desc,\r\n            nullptr,\r\n            &error);\r\n\r\n        detail::errHandler(error, __CREATE_IMAGE_ERR);\r\n        if (err != nullptr) {\r\n            *err = error;\r\n        }\r\n    }\r\n#endif //#if CL_HPP_TARGET_OPENCL_VERSION >= 200\r\n\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 300\r\n    /*! \\brief Constructs a Image2D with specified properties.\r\n     *\r\n     *  Wraps clCreateImageWithProperties().\r\n     *\r\n     *  \\param properties Optional list of properties for the image object and\r\n     *                    their corresponding values. The non-empty list must\r\n     *                    end with 0.\r\n     *  \\param host_ptr Storage to be used if the CL_MEM_USE_HOST_PTR flag was\r\n     *                  specified. 
Note alignment & exclusivity requirements.\r\n     */\r\n    Image2D(const Context &context, const vector<cl_mem_properties> &properties,\r\n            cl_mem_flags flags, ImageFormat format, size_type width,\r\n            size_type height, size_type row_pitch = 0, void *host_ptr = nullptr,\r\n            cl_int *err = nullptr) {\r\n      cl_int error;\r\n\r\n      cl_image_desc desc = {};\r\n      desc.image_type = CL_MEM_OBJECT_IMAGE2D;\r\n      desc.image_width = width;\r\n      desc.image_height = height;\r\n      desc.image_row_pitch = row_pitch;\r\n\r\n      if (properties.empty()) {\r\n        object_ = ::clCreateImageWithProperties(\r\n            context(), nullptr, flags, &format, &desc, host_ptr, &error);\r\n      } else {\r\n        object_ =\r\n            ::clCreateImageWithProperties(context(), properties.data(), flags,\r\n                                          &format, &desc, host_ptr, &error);\r\n      }\r\n\r\n      detail::errHandler(error, __CREATE_IMAGE_ERR);\r\n      if (err != nullptr) {\r\n        *err = error;\r\n      }\r\n    }\r\n\r\n    /*! \\brief Constructs a Image2D with specified properties.\r\n     *\r\n     *  Wraps clCreateImageWithProperties().\r\n     *\r\n     *  \\param properties Optional list of properties for the image object and\r\n     *                    their corresponding values. 
The non-empty list must\r\n     *                    end with 0.\r\n     *  \\param buffer Refer to a valid buffer or image memory object.\r\n     */\r\n    Image2D(const Context &context, const vector<cl_mem_properties> &properties,\r\n            cl_mem_flags flags, ImageFormat format, const Buffer &buffer,\r\n            size_type width, size_type height, size_type row_pitch = 0,\r\n            cl_int *err = nullptr) {\r\n      cl_int error;\r\n\r\n      cl_image_desc desc = {};\r\n      desc.image_type = CL_MEM_OBJECT_IMAGE2D;\r\n      desc.image_width = width;\r\n      desc.image_height = height;\r\n      desc.image_row_pitch = row_pitch;\r\n      desc.buffer = buffer();\r\n\r\n      if (properties.empty()) {\r\n        object_ = ::clCreateImageWithProperties(\r\n            context(), nullptr, flags, &format, &desc, nullptr, &error);\r\n      } else {\r\n        object_ =\r\n            ::clCreateImageWithProperties(context(), properties.data(), flags,\r\n                                          &format, &desc, nullptr, &error);\r\n      }\r\n\r\n      detail::errHandler(error, __CREATE_IMAGE_ERR);\r\n      if (err != nullptr) {\r\n        *err = error;\r\n      }\r\n    }\r\n\r\n#endif //#if CL_HPP_TARGET_OPENCL_VERSION >= 300\r\n\r\n    //! \\brief Default constructor - initializes to nullptr.\r\n    Image2D() { }\r\n\r\n    /*! \\brief Constructor from cl_mem - takes ownership.\r\n     *\r\n     * \\param retainObject will cause the constructor to retain its cl object.\r\n     *                     Defaults to false to maintain compatibility with\r\n     *                     earlier versions.\r\n     *  See Memory for further details.\r\n     */\r\n    explicit Image2D(const cl_mem& image2D, bool retainObject = false) :\r\n        Image(image2D, retainObject) { }\r\n\r\n    /*! 
\\brief Assignment from cl_mem - performs shallow copy.\r\n     *\r\n     *  See Memory for further details.\r\n     */\r\n    Image2D& operator = (const cl_mem& rhs)\r\n    {\r\n        Image::operator=(rhs);\r\n        return *this;\r\n    }\r\n};\r\n\r\n\r\n#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)\r\n/*! \\brief Class interface for GL 2D Image Memory objects.\r\n *\r\n *  This is provided to facilitate interoperability with OpenGL.\r\n * \r\n *  See Memory for details about copy semantics, etc.\r\n * \r\n *  \\see Memory\r\n *  \\note Deprecated for OpenCL 1.2. Please use ImageGL instead.\r\n */\r\nclass CL_API_PREFIX__VERSION_1_1_DEPRECATED Image2DGL : public Image2D \r\n{\r\npublic:\r\n    /*! \\brief Constructs an Image2DGL in a specified context, from a given\r\n     *         GL Texture.\r\n     *\r\n     *  Wraps clCreateFromGLTexture2D().\r\n     */\r\n    Image2DGL(\r\n        const Context& context,\r\n        cl_mem_flags flags,\r\n        cl_GLenum target,\r\n        cl_GLint  miplevel,\r\n        cl_GLuint texobj,\r\n        cl_int * err = nullptr)\r\n    {\r\n        cl_int error;\r\n        object_ = ::clCreateFromGLTexture2D(\r\n            context(),\r\n            flags,\r\n            target,\r\n            miplevel,\r\n            texobj,\r\n            &error);\r\n\r\n        detail::errHandler(error, __CREATE_GL_TEXTURE_2D_ERR);\r\n        if (err != nullptr) {\r\n            *err = error;\r\n        }\r\n\r\n    }\r\n    \r\n    //! \\brief Default constructor - initializes to nullptr.\r\n    Image2DGL() : Image2D() { }\r\n\r\n    /*! 
\\brief Constructor from cl_mem - takes ownership.\r\n     *\r\n     * \\param retainObject will cause the constructor to retain its cl object.\r\n     *                     Defaults to false to maintain compatibility with\r\n     *                     earlier versions.\r\n     *  See Memory for further details.\r\n     */\r\n    explicit Image2DGL(const cl_mem& image, bool retainObject = false) : \r\n        Image2D(image, retainObject) { }\r\n\r\n    /*! \\brief Assignment from cl_mem - performs shallow copy.\r\n     *\r\n     *  See Memory for further details.\r\n     */\r\n    Image2DGL& operator = (const cl_mem& rhs)\r\n    {\r\n        Image2D::operator=(rhs);\r\n        return *this;\r\n    }\r\n\r\n\r\n\r\n} CL_API_SUFFIX__VERSION_1_1_DEPRECATED;\r\n#endif // CL_USE_DEPRECATED_OPENCL_1_1_APIS\r\n\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 120\r\n/*! \\class Image2DArray\r\n * \\brief Image interface for arrays of 2D images.\r\n */\r\nclass Image2DArray : public Image\r\n{\r\npublic:\r\n    Image2DArray(\r\n        const Context& context,\r\n        cl_mem_flags flags,\r\n        ImageFormat format,\r\n        size_type arraySize,\r\n        size_type width,\r\n        size_type height,\r\n        size_type rowPitch,\r\n        size_type slicePitch,\r\n        void* host_ptr = nullptr,\r\n        cl_int* err = nullptr)\r\n    {\r\n        cl_int error;\r\n\r\n        cl_image_desc desc = {};\r\n        desc.image_type = CL_MEM_OBJECT_IMAGE2D_ARRAY;\r\n        desc.image_width = width;\r\n        desc.image_height = height;\r\n        desc.image_array_size = arraySize;\r\n        desc.image_row_pitch = rowPitch;\r\n        desc.image_slice_pitch = slicePitch;\r\n\r\n        object_ = ::clCreateImage(\r\n            context(), \r\n            flags, \r\n            &format, \r\n            &desc, \r\n            host_ptr, \r\n            &error);\r\n\r\n        detail::errHandler(error, __CREATE_IMAGE_ERR);\r\n        if (err != nullptr) {\r\n            *err = 
error;\r\n        }\r\n    }\r\n\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 300\r\n    /*! \\brief Constructs a Image2DArray with specified properties.\r\n     *\r\n     *  Wraps clCreateImageWithProperties().\r\n     *\r\n     *  \\param properties Optional list of properties for the image object and\r\n     *                    their corresponding values. The non-empty list must\r\n     *                    end with 0.\r\n     *  \\param host_ptr Storage to be used if the CL_MEM_USE_HOST_PTR flag was\r\n     *                  specified. Note alignment & exclusivity requirements.\r\n     */\r\n    Image2DArray(const Context &context,\r\n                 const vector<cl_mem_properties> &properties,\r\n                 cl_mem_flags flags, ImageFormat format, size_type arraySize,\r\n                 size_type width, size_type height, size_type rowPitch = 0,\r\n                 size_type slicePitch = 0, void *host_ptr = nullptr,\r\n                 cl_int *err = nullptr) {\r\n      cl_int error;\r\n\r\n      cl_image_desc desc = {};\r\n      desc.image_type = CL_MEM_OBJECT_IMAGE2D_ARRAY;\r\n      desc.image_width = width;\r\n      desc.image_height = height;\r\n      desc.image_array_size = arraySize;\r\n      desc.image_row_pitch = rowPitch;\r\n      desc.image_slice_pitch = slicePitch;\r\n\r\n      if (properties.empty()) {\r\n        object_ = ::clCreateImageWithProperties(\r\n            context(), nullptr, flags, &format, &desc, host_ptr, &error);\r\n      } else {\r\n        object_ =\r\n            ::clCreateImageWithProperties(context(), properties.data(), flags,\r\n                                          &format, &desc, host_ptr, &error);\r\n      }\r\n\r\n      detail::errHandler(error, __CREATE_IMAGE_ERR);\r\n      if (err != nullptr) {\r\n        *err = error;\r\n      }\r\n    }\r\n#endif //#if CL_HPP_TARGET_OPENCL_VERSION >= 300\r\n\r\n    Image2DArray() { }\r\n    \r\n    /*! 
\\brief Constructor from cl_mem - takes ownership.\r\n     *\r\n     * \\param retainObject will cause the constructor to retain its cl object.\r\n     *                     Defaults to false to maintain compatibility with\r\n     *                     earlier versions.\r\n     *  See Memory for further details.\r\n     */\r\n    explicit Image2DArray(const cl_mem& imageArray, bool retainObject = false) : Image(imageArray, retainObject) { }\r\n\r\n    Image2DArray& operator = (const cl_mem& rhs)\r\n    {\r\n        Image::operator=(rhs);\r\n        return *this;\r\n    }\r\n\r\n};\r\n#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 120\r\n\r\n/*! \\brief Class interface for 3D Image Memory objects.\r\n *\r\n *  See Memory for details about copy semantics, etc.\r\n * \r\n *  \\see Memory\r\n */\r\nclass Image3D : public Image\r\n{\r\npublic:\r\n    /*! \\brief Constructs a 3D Image in a specified context.\r\n     *\r\n     *  Wraps clCreateImage().\r\n     */\r\n    Image3D(\r\n        const Context& context,\r\n        cl_mem_flags flags,\r\n        ImageFormat format,\r\n        size_type width,\r\n        size_type height,\r\n        size_type depth,\r\n        size_type row_pitch = 0,\r\n        size_type slice_pitch = 0,\r\n        void* host_ptr = nullptr,\r\n        cl_int* err = nullptr)\r\n    {\r\n        cl_int error;\r\n        bool useCreateImage;\r\n\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 120 && CL_HPP_MINIMUM_OPENCL_VERSION < 120\r\n        // Run-time decision based on the actual platform\r\n        {\r\n            cl_uint version = detail::getContextPlatformVersion(context());\r\n            useCreateImage = (version >= 0x10002); // OpenCL 1.2 or above\r\n        }\r\n#elif CL_HPP_TARGET_OPENCL_VERSION >= 120\r\n        useCreateImage = true;\r\n#else\r\n        useCreateImage = false;\r\n#endif\r\n\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 120\r\n        if (useCreateImage)\r\n        {\r\n            cl_image_desc desc = {};\r\n            
desc.image_type = CL_MEM_OBJECT_IMAGE3D;\r\n            desc.image_width = width;\r\n            desc.image_height = height;\r\n            desc.image_depth = depth;\r\n            desc.image_row_pitch = row_pitch;\r\n            desc.image_slice_pitch = slice_pitch;\r\n\r\n            object_ = ::clCreateImage(\r\n                context(), \r\n                flags, \r\n                &format, \r\n                &desc, \r\n                host_ptr, \r\n                &error);\r\n\r\n            detail::errHandler(error, __CREATE_IMAGE_ERR);\r\n            if (err != nullptr) {\r\n                *err = error;\r\n            }\r\n        }\r\n#endif  // CL_HPP_TARGET_OPENCL_VERSION >= 120\r\n#if CL_HPP_MINIMUM_OPENCL_VERSION < 120\r\n        if (!useCreateImage)\r\n        {\r\n            object_ = ::clCreateImage3D(\r\n                context(), flags, &format, width, height, depth, row_pitch,\r\n                slice_pitch, host_ptr, &error);\r\n\r\n            detail::errHandler(error, __CREATE_IMAGE3D_ERR);\r\n            if (err != nullptr) {\r\n                *err = error;\r\n            }\r\n        }\r\n#endif // CL_HPP_MINIMUM_OPENCL_VERSION < 120\r\n    }\r\n\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 300\r\n    /*! \\brief Constructs a Image3D with specified properties.\r\n     *\r\n     *  Wraps clCreateImageWithProperties().\r\n     *\r\n     *  \\param properties Optional list of properties for the image object and\r\n     *                    their corresponding values. The non-empty list must\r\n     *                    end with 0.\r\n     *  \\param host_ptr Storage to be used if the CL_MEM_USE_HOST_PTR flag was\r\n     *                  specified. 
Note alignment & exclusivity requirements.\r\n     */\r\n    Image3D(const Context &context, const vector<cl_mem_properties> &properties,\r\n            cl_mem_flags flags, ImageFormat format, size_type width,\r\n            size_type height, size_type depth, size_type row_pitch = 0,\r\n            size_type slice_pitch = 0, void *host_ptr = nullptr,\r\n            cl_int *err = nullptr) {\r\n      cl_int error;\r\n\r\n      cl_image_desc desc = {};\r\n      desc.image_type = CL_MEM_OBJECT_IMAGE3D;\r\n      desc.image_width = width;\r\n      desc.image_height = height;\r\n      desc.image_depth = depth;\r\n      desc.image_row_pitch = row_pitch;\r\n      desc.image_slice_pitch = slice_pitch;\r\n\r\n      if (properties.empty()) {\r\n        object_ = ::clCreateImageWithProperties(\r\n            context(), nullptr, flags, &format, &desc, host_ptr, &error);\r\n      } else {\r\n        object_ =\r\n            ::clCreateImageWithProperties(context(), properties.data(), flags,\r\n                                          &format, &desc, host_ptr, &error);\r\n      }\r\n\r\n      detail::errHandler(error, __CREATE_IMAGE_ERR);\r\n      if (err != nullptr) {\r\n        *err = error;\r\n      }\r\n    }\r\n#endif //#if CL_HPP_TARGET_OPENCL_VERSION >= 300\r\n\r\n    //! \\brief Default constructor - initializes to nullptr.\r\n    Image3D() : Image() { }\r\n\r\n    /*! \\brief Constructor from cl_mem - takes ownership.\r\n     *\r\n     * \\param retainObject will cause the constructor to retain its cl object.\r\n     *                     Defaults to false to maintain compatibility with\r\n     *                     earlier versions.\r\n     *  See Memory for further details.\r\n     */\r\n    explicit Image3D(const cl_mem& image3D, bool retainObject = false) : \r\n        Image(image3D, retainObject) { }\r\n\r\n    /*! 
\\brief Assignment from cl_mem - performs shallow copy.\r\n     *\r\n     *  See Memory for further details.\r\n     */\r\n    Image3D& operator = (const cl_mem& rhs)\r\n    {\r\n        Image::operator=(rhs);\r\n        return *this;\r\n    }\r\n\r\n};\r\n\r\n#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)\r\n/*! \\brief Class interface for GL 3D Image Memory objects.\r\n *\r\n *  This is provided to facilitate interoperability with OpenGL.\r\n * \r\n *  See Memory for details about copy semantics, etc.\r\n * \r\n *  \\see Memory\r\n */\r\nclass Image3DGL : public Image3D\r\n{\r\npublic:\r\n    /*! \\brief Constructs an Image3DGL in a specified context, from a given\r\n     *         GL Texture.\r\n     *\r\n     *  Wraps clCreateFromGLTexture3D().\r\n     */\r\n    Image3DGL(\r\n        const Context& context,\r\n        cl_mem_flags flags,\r\n        cl_GLenum target,\r\n        cl_GLint  miplevel,\r\n        cl_GLuint texobj,\r\n        cl_int * err = nullptr)\r\n    {\r\n        cl_int error;\r\n        object_ = ::clCreateFromGLTexture3D(\r\n            context(),\r\n            flags,\r\n            target,\r\n            miplevel,\r\n            texobj,\r\n            &error);\r\n\r\n        detail::errHandler(error, __CREATE_GL_TEXTURE_3D_ERR);\r\n        if (err != nullptr) {\r\n            *err = error;\r\n        }\r\n    }\r\n\r\n    //! \\brief Default constructor - initializes to nullptr.\r\n    Image3DGL() : Image3D() { }\r\n\r\n    /*! \\brief Constructor from cl_mem - takes ownership.\r\n     *\r\n     * \\param retainObject will cause the constructor to retain its cl object.\r\n     *                     Defaults to false to maintain compatibility with\r\n     *                     earlier versions.\r\n     *  See Memory for further details.\r\n     */\r\n    explicit Image3DGL(const cl_mem& image, bool retainObject = false) : \r\n        Image3D(image, retainObject) { }\r\n\r\n    /*! 
\\brief Assignment from cl_mem - performs shallow copy.\r\n     *\r\n     *  See Memory for further details.\r\n     */\r\n    Image3DGL& operator = (const cl_mem& rhs)\r\n    {\r\n        Image3D::operator=(rhs);\r\n        return *this;\r\n    }\r\n\r\n};\r\n#endif // CL_USE_DEPRECATED_OPENCL_1_1_APIS\r\n\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 120\r\n/*! \\class ImageGL\r\n * \\brief general image interface for GL interop.\r\n * We abstract the 2D and 3D GL images into a single instance here\r\n * that wraps all GL sourced images on the grounds that setup information\r\n * was performed by OpenCL anyway.\r\n */\r\nclass ImageGL : public Image\r\n{\r\npublic:\r\n    ImageGL(\r\n        const Context& context,\r\n        cl_mem_flags flags,\r\n        cl_GLenum target,\r\n        cl_GLint  miplevel,\r\n        cl_GLuint texobj,\r\n        cl_int * err = nullptr)\r\n    {\r\n        cl_int error;\r\n        object_ = ::clCreateFromGLTexture(\r\n            context(), \r\n            flags, \r\n            target,\r\n            miplevel,\r\n            texobj,\r\n            &error);\r\n\r\n        detail::errHandler(error, __CREATE_GL_TEXTURE_ERR);\r\n        if (err != nullptr) {\r\n            *err = error;\r\n        }\r\n    }\r\n\r\n    ImageGL() : Image() { }\r\n    \r\n    /*! \\brief Constructor from cl_mem - takes ownership.\r\n     *\r\n     * \\param retainObject will cause the constructor to retain its cl object.\r\n     *                     Defaults to false to maintain compatibility with\r\n     *                     earlier versions.\r\n     *  See Memory for further details.\r\n     */\r\n    explicit ImageGL(const cl_mem& image, bool retainObject = false) : \r\n        Image(image, retainObject) { }\r\n\r\n    ImageGL& operator = (const cl_mem& rhs)\r\n    {\r\n        Image::operator=(rhs);\r\n        return *this;\r\n    }\r\n\r\n};\r\n#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120\r\n\r\n\r\n\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 200\r\n/*! 
\\brief Class interface for Pipe Memory Objects.\r\n*\r\n*  See Memory for details about copy semantics, etc.\r\n*\r\n*  \\see Memory\r\n*/\r\nclass Pipe : public Memory\r\n{\r\npublic:\r\n\r\n    /*! \\brief Constructs a Pipe in a specified context.\r\n     *\r\n     * Wraps clCreatePipe().\r\n     * @param context Context in which to create the pipe.\r\n     * @param flags Bitfield. Only CL_MEM_READ_WRITE and CL_MEM_HOST_NO_ACCESS are valid.\r\n     * @param packet_size Size in bytes of a single packet of the pipe.\r\n     * @param max_packets Number of packets that may be stored in the pipe.\r\n     *\r\n     */\r\n    Pipe(\r\n        const Context& context,\r\n        cl_uint packet_size,\r\n        cl_uint max_packets,\r\n        cl_int* err = nullptr)\r\n    {\r\n        cl_int error;\r\n\r\n        cl_mem_flags flags = CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS;\r\n        object_ = ::clCreatePipe(context(), flags, packet_size, max_packets, nullptr, &error);\r\n\r\n        detail::errHandler(error, __CREATE_PIPE_ERR);\r\n        if (err != nullptr) {\r\n            *err = error;\r\n        }\r\n    }\r\n\r\n    /*! \\brief Constructs a Pipe in the default context.\r\n     *\r\n     * Wraps clCreatePipe().\r\n     * @param flags Bitfield. 
Only CL_MEM_READ_WRITE and CL_MEM_HOST_NO_ACCESS are valid.\r\n     * @param packet_size Size in bytes of a single packet of the pipe.\r\n     * @param max_packets Number of packets that may be stored in the pipe.\r\n     *\r\n     */\r\n    Pipe(\r\n        cl_uint packet_size,\r\n        cl_uint max_packets,\r\n        cl_int* err = nullptr)\r\n    {\r\n        cl_int error;\r\n\r\n        Context context = Context::getDefault(err);\r\n\r\n        cl_mem_flags flags = CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS;\r\n        object_ = ::clCreatePipe(context(), flags, packet_size, max_packets, nullptr, &error);\r\n\r\n        detail::errHandler(error, __CREATE_PIPE_ERR);\r\n        if (err != nullptr) {\r\n            *err = error;\r\n        }\r\n    }\r\n\r\n    //! \\brief Default constructor - initializes to nullptr.\r\n    Pipe() : Memory() { }\r\n\r\n    /*! \\brief Constructor from cl_mem - takes ownership.\r\n     *\r\n     * \\param retainObject will cause the constructor to retain its cl object.\r\n     *                     Defaults to false to maintain compatibility with earlier versions.\r\n     *\r\n     *  See Memory for further details.\r\n     */\r\n    explicit Pipe(const cl_mem& pipe, bool retainObject = false) :\r\n        Memory(pipe, retainObject) { }\r\n\r\n    /*! \\brief Assignment from cl_mem - performs shallow copy.\r\n     *\r\n     *  See Memory for further details.\r\n     */\r\n    Pipe& operator = (const cl_mem& rhs)\r\n    {\r\n        Memory::operator=(rhs);\r\n        return *this;\r\n    }\r\n\r\n\r\n\r\n    //! \\brief Wrapper for clGetPipeInfo().\r\n    template <typename T>\r\n    cl_int getInfo(cl_pipe_info name, T* param) const\r\n    {\r\n        return detail::errHandler(\r\n            detail::getInfo(&::clGetPipeInfo, object_, name, param),\r\n            __GET_PIPE_INFO_ERR);\r\n    }\r\n\r\n    //! 
\\brief Wrapper for clGetPipeInfo() that returns by value.\r\n    template <cl_pipe_info name> typename\r\n        detail::param_traits<detail::cl_pipe_info, name>::param_type\r\n        getInfo(cl_int* err = nullptr) const\r\n    {\r\n        typename detail::param_traits<\r\n            detail::cl_pipe_info, name>::param_type param;\r\n        cl_int result = getInfo(name, &param);\r\n        if (err != nullptr) {\r\n            *err = result;\r\n        }\r\n        return param;\r\n    }\r\n}; // class Pipe\r\n#endif // CL_HPP_TARGET_OPENCL_VERSION >= 200\r\n\r\n\r\n/*! \\brief Class interface for cl_sampler.\r\n *\r\n *  \\note Copies of these objects are shallow, meaning that the copy will refer\r\n *        to the same underlying cl_sampler as the original.  For details, see\r\n *        clRetainSampler() and clReleaseSampler().\r\n *\r\n *  \\see cl_sampler \r\n */\r\nclass Sampler : public detail::Wrapper<cl_sampler>\r\n{\r\npublic:\r\n    //! \\brief Default constructor - initializes to nullptr.\r\n    Sampler() { }\r\n\r\n    /*! 
\\brief Constructs a Sampler in a specified context.\r\n     *\r\n     *  Wraps clCreateSampler().\r\n     */\r\n    Sampler(\r\n        const Context& context,\r\n        cl_bool normalized_coords,\r\n        cl_addressing_mode addressing_mode,\r\n        cl_filter_mode filter_mode,\r\n        cl_int* err = nullptr)\r\n    {\r\n        cl_int error;\r\n\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 200\r\n        cl_sampler_properties sampler_properties[] = {\r\n            CL_SAMPLER_NORMALIZED_COORDS, normalized_coords,\r\n            CL_SAMPLER_ADDRESSING_MODE, addressing_mode,\r\n            CL_SAMPLER_FILTER_MODE, filter_mode,\r\n            0 };\r\n        object_ = ::clCreateSamplerWithProperties(\r\n            context(),\r\n            sampler_properties,\r\n            &error);\r\n\r\n        detail::errHandler(error, __CREATE_SAMPLER_WITH_PROPERTIES_ERR);\r\n        if (err != nullptr) {\r\n            *err = error;\r\n        }\r\n#else\r\n        object_ = ::clCreateSampler(\r\n            context(),\r\n            normalized_coords,\r\n            addressing_mode,\r\n            filter_mode,\r\n            &error);\r\n\r\n        detail::errHandler(error, __CREATE_SAMPLER_ERR);\r\n        if (err != nullptr) {\r\n            *err = error;\r\n        }\r\n#endif        \r\n    }\r\n\r\n    /*! \\brief Constructor from cl_sampler - takes ownership.\r\n     * \r\n     * \\param retainObject will cause the constructor to retain its cl object.\r\n     *                     Defaults to false to maintain compatibility with\r\n     *                     earlier versions.\r\n     *  This effectively transfers ownership of a refcount on the cl_sampler\r\n     *  into the new Sampler object.\r\n     */\r\n    explicit Sampler(const cl_sampler& sampler, bool retainObject = false) : \r\n        detail::Wrapper<cl_type>(sampler, retainObject) { }\r\n\r\n    /*! 
\\brief Assignment operator from cl_sampler - takes ownership.\r\n     *\r\n     *  This effectively transfers ownership of a refcount on the rhs and calls\r\n     *  clReleaseSampler() on the value previously held by this instance.\r\n     */\r\n    Sampler& operator = (const cl_sampler& rhs)\r\n    {\r\n        detail::Wrapper<cl_type>::operator=(rhs);\r\n        return *this;\r\n    }\r\n\r\n  \r\n\r\n    //! \\brief Wrapper for clGetSamplerInfo().\r\n    template <typename T>\r\n    cl_int getInfo(cl_sampler_info name, T* param) const\r\n    {\r\n        return detail::errHandler(\r\n            detail::getInfo(&::clGetSamplerInfo, object_, name, param),\r\n            __GET_SAMPLER_INFO_ERR);\r\n    }\r\n\r\n    //! \\brief Wrapper for clGetSamplerInfo() that returns by value.\r\n    template <cl_sampler_info name> typename\r\n    detail::param_traits<detail::cl_sampler_info, name>::param_type\r\n    getInfo(cl_int* err = nullptr) const\r\n    {\r\n        typename detail::param_traits<\r\n            detail::cl_sampler_info, name>::param_type param;\r\n        cl_int result = getInfo(name, &param);\r\n        if (err != nullptr) {\r\n            *err = result;\r\n        }\r\n        return param;\r\n    }\r\n};\r\n\r\nclass Program;\r\nclass CommandQueue;\r\nclass DeviceCommandQueue;\r\nclass Kernel;\r\n\r\n//! \\brief Class interface for specifying NDRange values.\r\nclass NDRange\r\n{\r\nprivate:\r\n    size_type sizes_[3];\r\n    cl_uint dimensions_;\r\n\r\npublic:\r\n    //! \\brief Default constructor - resulting range has zero dimensions.\r\n    NDRange()\r\n        : dimensions_(0)\r\n    {\r\n        sizes_[0] = 0;\r\n        sizes_[1] = 0;\r\n        sizes_[2] = 0;\r\n    }\r\n\r\n    //! \\brief Constructs one-dimensional range.\r\n    NDRange(size_type size0)\r\n        : dimensions_(1)\r\n    {\r\n        sizes_[0] = size0;\r\n        sizes_[1] = 1;\r\n        sizes_[2] = 1;\r\n    }\r\n\r\n    //! 
\\brief Constructs two-dimensional range.\r\n    NDRange(size_type size0, size_type size1)\r\n        : dimensions_(2)\r\n    {\r\n        sizes_[0] = size0;\r\n        sizes_[1] = size1;\r\n        sizes_[2] = 1;\r\n    }\r\n\r\n    //! \\brief Constructs three-dimensional range.\r\n    NDRange(size_type size0, size_type size1, size_type size2)\r\n        : dimensions_(3)\r\n    {\r\n        sizes_[0] = size0;\r\n        sizes_[1] = size1;\r\n        sizes_[2] = size2;\r\n    }\r\n\r\n    //! \\brief Constructs one-dimensional range.\r\n    NDRange(array<size_type, 1> a) : NDRange(a[0]){}\r\n\r\n    //! \\brief Constructs two-dimensional range.\r\n    NDRange(array<size_type, 2> a) : NDRange(a[0], a[1]){}\r\n\r\n    //! \\brief Constructs three-dimensional range.\r\n    NDRange(array<size_type, 3> a) : NDRange(a[0], a[1], a[2]){}\r\n\r\n    /*! \\brief Conversion operator to const size_type *.\r\n     *  \r\n     *  \\returns a pointer to the size of the first dimension.\r\n     */\r\n    operator const size_type*() const { \r\n        return sizes_; \r\n    }\r\n\r\n    //! \\brief Queries the number of dimensions in the range.\r\n    size_type dimensions() const \r\n    { \r\n        return dimensions_; \r\n    }\r\n\r\n    //! \\brief Returns the size of the object in bytes based on the\r\n    // runtime number of dimensions\r\n    size_type size() const\r\n    {\r\n        return dimensions_*sizeof(size_type);\r\n    }\r\n\r\n    size_type* get()\r\n    {\r\n        return sizes_;\r\n    }\r\n    \r\n    const size_type* get() const\r\n    {\r\n        return sizes_;\r\n    }\r\n};\r\n\r\n//! \\brief A zero-dimensional range.\r\nstatic const NDRange NullRange;\r\n\r\n//! 
\\brief Local address wrapper for use with Kernel::setArg\r\nstruct LocalSpaceArg\r\n{\r\n    size_type size_;\r\n};\r\n\r\nnamespace detail {\r\n\r\ntemplate <typename T, class Enable = void>\r\nstruct KernelArgumentHandler;\r\n\r\n// Enable for objects that are not subclasses of memory\r\n// Pointers, constants etc\r\ntemplate <typename T>\r\nstruct KernelArgumentHandler<T, typename std::enable_if<!std::is_base_of<cl::Memory, T>::value>::type>\r\n{\r\n    static size_type size(const T&) { return sizeof(T); }\r\n    static const T* ptr(const T& value) { return &value; }\r\n};\r\n\r\n// Enable for subclasses of memory where we want to get a reference to the cl_mem out\r\n// and pass that in for safety\r\ntemplate <typename T>\r\nstruct KernelArgumentHandler<T, typename std::enable_if<std::is_base_of<cl::Memory, T>::value>::type>\r\n{\r\n    static size_type size(const T&) { return sizeof(cl_mem); }\r\n    static const cl_mem* ptr(const T& value) { return &(value()); }\r\n};\r\n\r\n// Specialization for DeviceCommandQueue defined later\r\n\r\ntemplate <>\r\nstruct KernelArgumentHandler<LocalSpaceArg, void>\r\n{\r\n    static size_type size(const LocalSpaceArg& value) { return value.size_; }\r\n    static const void* ptr(const LocalSpaceArg&) { return nullptr; }\r\n};\r\n\r\n} \r\n//! \\endcond\r\n\r\n/*! Local\r\n * \\brief Helper function for generating LocalSpaceArg objects.\r\n */\r\ninline LocalSpaceArg\r\nLocal(size_type size)\r\n{\r\n    LocalSpaceArg ret = { size };\r\n    return ret;\r\n}\r\n\r\n/*! \\brief Class interface for cl_kernel.\r\n *\r\n *  \\note Copies of these objects are shallow, meaning that the copy will refer\r\n *        to the same underlying cl_kernel as the original.  
For details, see\r\n *        clRetainKernel() and clReleaseKernel().\r\n *\r\n *  \\see cl_kernel\r\n */\r\nclass Kernel : public detail::Wrapper<cl_kernel>\r\n{\r\npublic:\r\n    inline Kernel(const Program& program, const string& name, cl_int* err = nullptr);\r\n    inline Kernel(const Program& program, const char* name, cl_int* err = nullptr);\r\n\r\n    //! \\brief Default constructor - initializes to nullptr.\r\n    Kernel() { }\r\n\r\n    /*! \\brief Constructor from cl_kernel - takes ownership.\r\n     * \r\n     * \\param retainObject will cause the constructor to retain its cl object.\r\n     *                     Defaults to false to maintain compatibility with\r\n     *                     earlier versions.\r\n     *  This effectively transfers ownership of a refcount on the cl_kernel\r\n     *  into the new Kernel object.\r\n     */\r\n    explicit Kernel(const cl_kernel& kernel, bool retainObject = false) : \r\n        detail::Wrapper<cl_type>(kernel, retainObject) { }\r\n\r\n    /*! 
\\brief Assignment operator from cl_kernel - takes ownership.\r\n     *\r\n     *  This effectively transfers ownership of a refcount on the rhs and calls\r\n     *  clReleaseKernel() on the value previously held by this instance.\r\n     */\r\n    Kernel& operator = (const cl_kernel& rhs)\r\n    {\r\n        detail::Wrapper<cl_type>::operator=(rhs);\r\n        return *this;\r\n    }\r\n\r\n\r\n\r\n\r\n    template <typename T>\r\n    cl_int getInfo(cl_kernel_info name, T* param) const\r\n    {\r\n        return detail::errHandler(\r\n            detail::getInfo(&::clGetKernelInfo, object_, name, param),\r\n            __GET_KERNEL_INFO_ERR);\r\n    }\r\n\r\n    template <cl_kernel_info name> typename\r\n    detail::param_traits<detail::cl_kernel_info, name>::param_type\r\n    getInfo(cl_int* err = nullptr) const\r\n    {\r\n        typename detail::param_traits<\r\n            detail::cl_kernel_info, name>::param_type param;\r\n        cl_int result = getInfo(name, &param);\r\n        if (err != nullptr) {\r\n            *err = result;\r\n        }\r\n        return param;\r\n    }\r\n\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 120\r\n    template <typename T>\r\n    cl_int getArgInfo(cl_uint argIndex, cl_kernel_arg_info name, T* param) const\r\n    {\r\n        return detail::errHandler(\r\n            detail::getInfo(&::clGetKernelArgInfo, object_, argIndex, name, param),\r\n            __GET_KERNEL_ARG_INFO_ERR);\r\n    }\r\n\r\n    template <cl_kernel_arg_info name> typename\r\n    detail::param_traits<detail::cl_kernel_arg_info, name>::param_type\r\n    getArgInfo(cl_uint argIndex, cl_int* err = nullptr) const\r\n    {\r\n        typename detail::param_traits<\r\n            detail::cl_kernel_arg_info, name>::param_type param;\r\n        cl_int result = getArgInfo(argIndex, name, &param);\r\n        if (err != nullptr) {\r\n            *err = result;\r\n        }\r\n        return param;\r\n    }\r\n#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120\r\n\r\n    
template <typename T>\r\n    cl_int getWorkGroupInfo(\r\n        const Device& device, cl_kernel_work_group_info name, T* param) const\r\n    {\r\n        return detail::errHandler(\r\n            detail::getInfo(\r\n                &::clGetKernelWorkGroupInfo, object_, device(), name, param),\r\n                __GET_KERNEL_WORK_GROUP_INFO_ERR);\r\n    }\r\n\r\n    template <cl_kernel_work_group_info name> typename\r\n    detail::param_traits<detail::cl_kernel_work_group_info, name>::param_type\r\n        getWorkGroupInfo(const Device& device, cl_int* err = nullptr) const\r\n    {\r\n        typename detail::param_traits<\r\n        detail::cl_kernel_work_group_info, name>::param_type param;\r\n        cl_int result = getWorkGroupInfo(device, name, &param);\r\n        if (err != nullptr) {\r\n            *err = result;\r\n        }\r\n        return param;\r\n    }\r\n    \r\n#if defined(CL_HPP_USE_CL_SUB_GROUPS_KHR) || CL_HPP_TARGET_OPENCL_VERSION >= 210\r\n    cl_int getSubGroupInfo(const cl::Device &dev, cl_kernel_sub_group_info name, const cl::NDRange &range, size_type* param) const\r\n    {\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 210\r\n\r\n        return detail::errHandler(\r\n            clGetKernelSubGroupInfo(object_, dev(), name, range.size(), range.get(), sizeof(size_type), param, nullptr),\r\n            __GET_KERNEL_SUB_GROUP_INFO_ERR);\r\n\r\n#else // #if CL_HPP_TARGET_OPENCL_VERSION >= 210\r\n\r\n        typedef clGetKernelSubGroupInfoKHR_fn PFN_clGetKernelSubGroupInfoKHR;\r\n        static PFN_clGetKernelSubGroupInfoKHR pfn_clGetKernelSubGroupInfoKHR = nullptr;\r\n        CL_HPP_INIT_CL_EXT_FCN_PTR_(clGetKernelSubGroupInfoKHR);\r\n\r\n        return detail::errHandler(\r\n            pfn_clGetKernelSubGroupInfoKHR(object_, dev(), name, range.size(), range.get(), sizeof(size_type), param, nullptr),\r\n            __GET_KERNEL_SUB_GROUP_INFO_ERR);\r\n\r\n#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 210\r\n    }\r\n\r\n    template 
<cl_kernel_sub_group_info name>\r\n        size_type getSubGroupInfo(const cl::Device &dev, const cl::NDRange &range, cl_int* err = nullptr) const\r\n    {\r\n        size_type param;\r\n        cl_int result = getSubGroupInfo(dev, name, range, &param);\r\n        if (err != nullptr) {\r\n            *err = result;\r\n        }\r\n        return param;\r\n    }\r\n#endif // defined(CL_HPP_USE_CL_SUB_GROUPS_KHR) || CL_HPP_TARGET_OPENCL_VERSION >= 210\r\n\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 200\r\n    /*! \\brief setArg overload taking a shared_ptr type\r\n     */\r\n    template<typename T, class D>\r\n    cl_int setArg(cl_uint index, const cl::pointer<T, D> &argPtr)\r\n    {\r\n        return detail::errHandler(\r\n            ::clSetKernelArgSVMPointer(object_, index, argPtr.get()),\r\n            __SET_KERNEL_ARGS_ERR);\r\n    }\r\n\r\n    /*! \\brief setArg overload taking a vector type.\r\n     */\r\n    template<typename T, class Alloc>\r\n    cl_int setArg(cl_uint index, const cl::vector<T, Alloc> &argPtr)\r\n    {\r\n        return detail::errHandler(\r\n            ::clSetKernelArgSVMPointer(object_, index, argPtr.data()),\r\n            __SET_KERNEL_ARGS_ERR);\r\n    }\r\n\r\n    /*! \\brief setArg overload taking a pointer type\r\n     */\r\n    template<typename T>\r\n    typename std::enable_if<std::is_pointer<T>::value, cl_int>::type\r\n        setArg(cl_uint index, const T argPtr)\r\n    {\r\n        return detail::errHandler(\r\n            ::clSetKernelArgSVMPointer(object_, index, argPtr),\r\n            __SET_KERNEL_ARGS_ERR);\r\n    }\r\n#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200\r\n\r\n    /*! 
\\brief setArg overload taking a POD type\r\n     */\r\n    template <typename T>\r\n    typename std::enable_if<!std::is_pointer<T>::value, cl_int>::type\r\n        setArg(cl_uint index, const T &value)\r\n    {\r\n        return detail::errHandler(\r\n            ::clSetKernelArg(\r\n                object_,\r\n                index,\r\n                detail::KernelArgumentHandler<T>::size(value),\r\n                detail::KernelArgumentHandler<T>::ptr(value)),\r\n            __SET_KERNEL_ARGS_ERR);\r\n    }\r\n\r\n    cl_int setArg(cl_uint index, size_type size, const void* argPtr)\r\n    {\r\n        return detail::errHandler(\r\n            ::clSetKernelArg(object_, index, size, argPtr),\r\n            __SET_KERNEL_ARGS_ERR);\r\n    }\r\n\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 200\r\n    /*!\r\n     * Specify a vector of SVM pointers that the kernel may access in \r\n     * addition to its arguments.\r\n     */\r\n    cl_int setSVMPointers(const vector<void*> &pointerList)\r\n    {\r\n        return detail::errHandler(\r\n            ::clSetKernelExecInfo(\r\n                object_,\r\n                CL_KERNEL_EXEC_INFO_SVM_PTRS,\r\n                sizeof(void*)*pointerList.size(),\r\n                pointerList.data()));\r\n    }\r\n\r\n    /*!\r\n     * Specify a std::array of SVM pointers that the kernel may access in\r\n     * addition to its arguments.\r\n     */\r\n    template<int ArrayLength>\r\n    cl_int setSVMPointers(const std::array<void*, ArrayLength> &pointerList)\r\n    {\r\n        return detail::errHandler(\r\n            ::clSetKernelExecInfo(\r\n                object_,\r\n                CL_KERNEL_EXEC_INFO_SVM_PTRS,\r\n                sizeof(void*)*pointerList.size(),\r\n                pointerList.data()));\r\n    }\r\n\r\n    /*! 
\\brief Enable fine-grained system SVM.\r\n     *\r\n     * \\note It is only possible to enable fine-grained system SVM if all devices\r\n     *       in the context associated with kernel support it.\r\n     * \r\n     * \\param svmEnabled True if fine-grained system SVM is requested. False otherwise.\r\n     * \\return CL_SUCCESS if the function was executed successfully. CL_INVALID_OPERATION\r\n     *         if no devices in the context support fine-grained system SVM.\r\n     *\r\n     * \\see clSetKernelExecInfo\r\n     */\r\n    cl_int enableFineGrainedSystemSVM(bool svmEnabled)\r\n    {\r\n        cl_bool svmEnabled_ = svmEnabled ? CL_TRUE : CL_FALSE;\r\n        return detail::errHandler(\r\n            ::clSetKernelExecInfo(\r\n                object_,\r\n                CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM,\r\n                sizeof(cl_bool),\r\n                &svmEnabled_\r\n                )\r\n            );\r\n    }\r\n    \r\n    template<int index, int ArrayLength, class D, typename T0, typename T1, typename... Ts>\r\n    void setSVMPointersHelper(std::array<void*, ArrayLength> &pointerList, const pointer<T0, D> &t0, const pointer<T1, D> &t1, Ts & ... ts)\r\n    {\r\n        pointerList[index] = static_cast<void*>(t0.get());\r\n        setSVMPointersHelper<index + 1, ArrayLength>(pointerList, t1, ts...);\r\n    }\r\n\r\n    template<int index, int ArrayLength, typename T0, typename T1, typename... Ts>\r\n    typename std::enable_if<std::is_pointer<T0>::value, void>::type\r\n    setSVMPointersHelper(std::array<void*, ArrayLength> &pointerList, T0 t0, T1 t1, Ts... 
ts)\r\n    {\r\n        pointerList[index] = static_cast<void*>(t0);\r\n        setSVMPointersHelper<index + 1, ArrayLength>(pointerList, t1, ts...);\r\n    }\r\n\r\n    template<int index, int ArrayLength, typename T0, class D>\r\n    void setSVMPointersHelper(std::array<void*, ArrayLength> &pointerList, const pointer<T0, D> &t0)\r\n    {\r\n        pointerList[index] = static_cast<void*>(t0.get());\r\n    }\r\n\r\n\r\n    template<int index, int ArrayLength, typename T0>\r\n    typename std::enable_if<std::is_pointer<T0>::value, void>::type\r\n    setSVMPointersHelper(std::array<void*, ArrayLength> &pointerList, T0 t0)\r\n    {\r\n        pointerList[index] = static_cast<void*>(t0);\r\n    }\r\n\r\n    template<typename T0, typename... Ts>\r\n    cl_int setSVMPointers(const T0 &t0, Ts & ... ts)\r\n    {\r\n        std::array<void*, 1 + sizeof...(Ts)> pointerList;\r\n\r\n        setSVMPointersHelper<0, 1 + sizeof...(Ts)>(pointerList, t0, ts...);\r\n        return detail::errHandler(\r\n            ::clSetKernelExecInfo(\r\n            object_,\r\n            CL_KERNEL_EXEC_INFO_SVM_PTRS,\r\n            sizeof(void*)*(1 + sizeof...(Ts)),\r\n            pointerList.data()));\r\n    }\r\n\r\n    template<typename T>\r\n    cl_int setExecInfo(cl_kernel_exec_info param_name, const T& val)\r\n    {\r\n        return detail::errHandler(\r\n            ::clSetKernelExecInfo(\r\n            object_,\r\n            param_name,\r\n            sizeof(T),\r\n            &val));\r\n    }\r\n\r\n    template<cl_kernel_exec_info name>\r\n    cl_int setExecInfo(typename detail::param_traits<detail::cl_kernel_exec_info, name>::param_type& val)\r\n    {\r\n        return setExecInfo(name, val);\r\n    }\r\n#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200\r\n\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 210\r\n    /**\r\n     * Make a deep copy of the kernel object including its arguments.\r\n     * @return A new kernel object with internal state entirely separate from that\r\n     *      
   of the original but with any arguments set on the original intact.\r\n     */\r\n    Kernel clone()\r\n    {\r\n        cl_int error;\r\n        Kernel retValue(clCloneKernel(this->get(), &error));\r\n\r\n        detail::errHandler(error, __CLONE_KERNEL_ERR);\r\n        return retValue;\r\n    }\r\n#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 210\r\n};\r\n\r\n/*! \\class Program\r\n * \\brief Program interface that implements cl_program.\r\n */\r\nclass Program : public detail::Wrapper<cl_program>\r\n{\r\npublic:\r\n#if !defined(CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY)\r\n    typedef vector<vector<unsigned char>> Binaries;\r\n    typedef vector<string> Sources;\r\n#else // #if !defined(CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY)\r\n    typedef vector<std::pair<const void*, size_type> > Binaries;\r\n    typedef vector<std::pair<const char*, size_type> > Sources;\r\n#endif // #if !defined(CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY)\r\n    \r\n    Program(\r\n        const string& source,\r\n        bool build = false,\r\n        cl_int* err = nullptr)\r\n    {\r\n        cl_int error;\r\n\r\n        const char * strings = source.c_str();\r\n        const size_type length  = source.size();\r\n\r\n        Context context = Context::getDefault(err);\r\n\r\n        object_ = ::clCreateProgramWithSource(\r\n            context(), (cl_uint)1, &strings, &length, &error);\r\n\r\n        detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR);\r\n\r\n        if (error == CL_SUCCESS && build) {\r\n\r\n            error = ::clBuildProgram(\r\n                object_,\r\n                0,\r\n                nullptr,\r\n#if !defined(CL_HPP_CL_1_2_DEFAULT_BUILD)\r\n                \"-cl-std=CL2.0\",\r\n#else\r\n                \"\",\r\n#endif // #if !defined(CL_HPP_CL_1_2_DEFAULT_BUILD)\r\n                nullptr,\r\n                nullptr);\r\n\r\n            detail::buildErrHandler(error, __BUILD_PROGRAM_ERR, 
getBuildInfo<CL_PROGRAM_BUILD_LOG>());\r\n        }\r\n\r\n        if (err != nullptr) {\r\n            *err = error;\r\n        }\r\n    }\r\n\r\n    Program(\r\n        const Context& context,\r\n        const string& source,\r\n        bool build = false,\r\n        cl_int* err = nullptr)\r\n    {\r\n        cl_int error;\r\n\r\n        const char * strings = source.c_str();\r\n        const size_type length  = source.size();\r\n\r\n        object_ = ::clCreateProgramWithSource(\r\n            context(), (cl_uint)1, &strings, &length, &error);\r\n\r\n        detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR);\r\n\r\n        if (error == CL_SUCCESS && build) {\r\n            error = ::clBuildProgram(\r\n                object_,\r\n                0,\r\n                nullptr,\r\n#if !defined(CL_HPP_CL_1_2_DEFAULT_BUILD)\r\n                \"-cl-std=CL2.0\",\r\n#else\r\n                \"\",\r\n#endif // #if !defined(CL_HPP_CL_1_2_DEFAULT_BUILD)\r\n                nullptr,\r\n                nullptr);\r\n            \r\n            detail::buildErrHandler(error, __BUILD_PROGRAM_ERR, getBuildInfo<CL_PROGRAM_BUILD_LOG>());\r\n        }\r\n\r\n        if (err != nullptr) {\r\n            *err = error;\r\n        }\r\n    }\r\n\r\n    /**\r\n     * Create a program from a vector of source strings and the default context.\r\n     * Does not compile or link the program.\r\n     */\r\n    Program(\r\n        const Sources& sources,\r\n        cl_int* err = nullptr)\r\n    {\r\n        cl_int error;\r\n        Context context = Context::getDefault(err);\r\n\r\n        const size_type n = (size_type)sources.size();\r\n\r\n        vector<size_type> lengths(n);\r\n        vector<const char*> strings(n);\r\n\r\n        for (size_type i = 0; i < n; ++i) {\r\n#if !defined(CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY)\r\n            strings[i] = sources[(int)i].data();\r\n            lengths[i] = sources[(int)i].length();\r\n#else // #if 
!defined(CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY)\r\n            strings[i] = sources[(int)i].first;\r\n            lengths[i] = sources[(int)i].second;\r\n#endif // #if !defined(CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY)\r\n        }\r\n\r\n        object_ = ::clCreateProgramWithSource(\r\n            context(), (cl_uint)n, strings.data(), lengths.data(), &error);\r\n\r\n        detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR);\r\n        if (err != nullptr) {\r\n            *err = error;\r\n        }\r\n    }\r\n\r\n    /**\r\n     * Create a program from a vector of source strings and a provided context.\r\n     * Does not compile or link the program.\r\n     */\r\n    Program(\r\n        const Context& context,\r\n        const Sources& sources,\r\n        cl_int* err = nullptr)\r\n    {\r\n        cl_int error;\r\n\r\n        const size_type n = (size_type)sources.size();\r\n\r\n        vector<size_type> lengths(n);\r\n        vector<const char*> strings(n);\r\n\r\n        for (size_type i = 0; i < n; ++i) {\r\n#if !defined(CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY)\r\n            strings[i] = sources[(int)i].data();\r\n            lengths[i] = sources[(int)i].length();\r\n#else // #if !defined(CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY)\r\n            strings[i] = sources[(int)i].first;\r\n            lengths[i] = sources[(int)i].second;\r\n#endif // #if !defined(CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY)\r\n        }\r\n\r\n        object_ = ::clCreateProgramWithSource(\r\n            context(), (cl_uint)n, strings.data(), lengths.data(), &error);\r\n\r\n        detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR);\r\n        if (err != nullptr) {\r\n            *err = error;\r\n        }\r\n    }\r\n\r\n#if defined(CL_HPP_USE_IL_KHR) || CL_HPP_TARGET_OPENCL_VERSION >= 210\r\n    /**\r\n     * Program constructor to allow construction of program from 
SPIR-V or another IL.\r\n     *\r\n     * Requires OpenCL 2.1 or newer or the cl_khr_il_program extension.\r\n     */\r\n    Program(\r\n        const vector<char>& IL,\r\n        bool build = false,\r\n        cl_int* err = nullptr)\r\n    {\r\n        cl_int error;\r\n\r\n        Context context = Context::getDefault(err);\r\n\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 210\r\n\r\n        object_ = ::clCreateProgramWithIL(\r\n            context(), static_cast<const void*>(IL.data()), IL.size(), &error);\r\n\r\n#else // #if CL_HPP_TARGET_OPENCL_VERSION >= 210\r\n\r\n        typedef clCreateProgramWithILKHR_fn PFN_clCreateProgramWithILKHR;\r\n        static PFN_clCreateProgramWithILKHR pfn_clCreateProgramWithILKHR = nullptr;\r\n        CL_HPP_INIT_CL_EXT_FCN_PTR_(clCreateProgramWithILKHR);\r\n\r\n        object_ = pfn_clCreateProgramWithILKHR(\r\n                context(), static_cast<const void*>(IL.data()), IL.size(), &error);\r\n\r\n#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 210\r\n\r\n        detail::errHandler(error, __CREATE_PROGRAM_WITH_IL_ERR);\r\n\r\n        if (error == CL_SUCCESS && build) {\r\n\r\n            error = ::clBuildProgram(\r\n                object_,\r\n                0,\r\n                nullptr,\r\n#if !defined(CL_HPP_CL_1_2_DEFAULT_BUILD)\r\n                \"-cl-std=CL2.0\",\r\n#else\r\n                \"\",\r\n#endif // #if !defined(CL_HPP_CL_1_2_DEFAULT_BUILD)\r\n                nullptr,\r\n                nullptr);\r\n\r\n            detail::buildErrHandler(error, __BUILD_PROGRAM_ERR, getBuildInfo<CL_PROGRAM_BUILD_LOG>());\r\n        }\r\n\r\n        if (err != nullptr) {\r\n            *err = error;\r\n        }\r\n    }\r\n\r\n    /**\r\n     * Program constructor to allow construction of program from SPIR-V or another IL\r\n     * for a specific context.\r\n     *\r\n     * Requires OpenCL 2.1 or newer or the cl_khr_il_program extension.\r\n     */\r\n    Program(\r\n        const Context& context,\r\n        const vector<char>& 
IL,\r\n        bool build = false,\r\n        cl_int* err = nullptr)\r\n    {\r\n        cl_int error;\r\n\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 210\r\n\r\n        object_ = ::clCreateProgramWithIL(\r\n            context(), static_cast<const void*>(IL.data()), IL.size(), &error);\r\n\r\n#else // #if CL_HPP_TARGET_OPENCL_VERSION >= 210\r\n\r\n        typedef clCreateProgramWithILKHR_fn PFN_clCreateProgramWithILKHR;\r\n        static PFN_clCreateProgramWithILKHR pfn_clCreateProgramWithILKHR = nullptr;\r\n        CL_HPP_INIT_CL_EXT_FCN_PTR_(clCreateProgramWithILKHR);\r\n\r\n        object_ = pfn_clCreateProgramWithILKHR(\r\n            context(), static_cast<const void*>(IL.data()), IL.size(), &error);\r\n\r\n#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 210\r\n\r\n        detail::errHandler(error, __CREATE_PROGRAM_WITH_IL_ERR);\r\n\r\n        if (error == CL_SUCCESS && build) {\r\n            error = ::clBuildProgram(\r\n                object_,\r\n                0,\r\n                nullptr,\r\n#if !defined(CL_HPP_CL_1_2_DEFAULT_BUILD)\r\n                \"-cl-std=CL2.0\",\r\n#else\r\n                \"\",\r\n#endif // #if !defined(CL_HPP_CL_1_2_DEFAULT_BUILD)\r\n                nullptr,\r\n                nullptr);\r\n\r\n            detail::buildErrHandler(error, __BUILD_PROGRAM_ERR, getBuildInfo<CL_PROGRAM_BUILD_LOG>());\r\n        }\r\n\r\n        if (err != nullptr) {\r\n            *err = error;\r\n        }\r\n    }\r\n#endif // defined(CL_HPP_USE_IL_KHR) || CL_HPP_TARGET_OPENCL_VERSION >= 210\r\n\r\n    /**\r\n     * Construct a program object from a list of devices and a per-device list of binaries.\r\n     * \\param context A valid OpenCL context in which to construct the program.\r\n     * \\param devices A vector of OpenCL device objects for which the program will be created.\r\n     * \\param binaries A vector of pairs of a pointer to a binary object and its length.\r\n     * \\param binaryStatus An optional vector that on completion will be resized 
to\r\n     *   match the size of binaries and filled with values to specify if each binary\r\n     *   was successfully loaded.\r\n     *   Set to CL_SUCCESS if the binary was successfully loaded.\r\n     *   Set to CL_INVALID_VALUE if the length is 0 or the binary pointer is nullptr.\r\n     *   Set to CL_INVALID_BINARY if the binary provided is not valid for the matching device.\r\n     * \\param err if non-nullptr will be set to CL_SUCCESS on successful operation or one of the following errors:\r\n     *   CL_INVALID_CONTEXT if context is not a valid context.\r\n     *   CL_INVALID_VALUE if the length of devices is zero; or if the length of binaries does not match the length of devices; \r\n     *     or if any entry in binaries is nullptr or has length 0.\r\n     *   CL_INVALID_DEVICE if OpenCL devices listed in devices are not in the list of devices associated with context.\r\n     *   CL_INVALID_BINARY if an invalid program binary was encountered for any device. binaryStatus will return specific status for each device.\r\n     *   CL_OUT_OF_HOST_MEMORY if there is a failure to allocate resources required by the OpenCL implementation on the host.\r\n     */\r\n    Program(\r\n        const Context& context,\r\n        const vector<Device>& devices,\r\n        const Binaries& binaries,\r\n        vector<cl_int>* binaryStatus = nullptr,\r\n        cl_int* err = nullptr)\r\n    {\r\n        cl_int error;\r\n        \r\n        const size_type numDevices = devices.size();\r\n        \r\n        // Catch size mismatch early and return\r\n        if(binaries.size() != numDevices) {\r\n            error = CL_INVALID_VALUE;\r\n            detail::errHandler(error, __CREATE_PROGRAM_WITH_BINARY_ERR);\r\n            if (err != nullptr) {\r\n                *err = error;\r\n            }\r\n            return;\r\n        }\r\n\r\n        vector<size_type> lengths(numDevices);\r\n        vector<const unsigned char*> images(numDevices);\r\n#if 
!defined(CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY)\r\n        for (size_type i = 0; i < numDevices; ++i) {\r\n            images[i] = binaries[i].data();\r\n            lengths[i] = binaries[(int)i].size();\r\n        }\r\n#else // #if !defined(CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY)\r\n        for (size_type i = 0; i < numDevices; ++i) {\r\n            images[i] = (const unsigned char*)binaries[i].first;\r\n            lengths[i] = binaries[(int)i].second;\r\n        }\r\n#endif // #if !defined(CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY)\r\n\r\n        vector<cl_device_id> deviceIDs(numDevices);\r\n        for( size_type deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) {\r\n            deviceIDs[deviceIndex] = (devices[deviceIndex])();\r\n        }\r\n\r\n        if(binaryStatus) {\r\n            binaryStatus->resize(numDevices);\r\n        }\r\n\r\n        object_ = ::clCreateProgramWithBinary(\r\n            context(), (cl_uint) devices.size(),\r\n            deviceIDs.data(),\r\n            lengths.data(), images.data(), (binaryStatus != nullptr && numDevices > 0)\r\n               ? 
&binaryStatus->front()\r\n               : nullptr, &error);\r\n\r\n        detail::errHandler(error, __CREATE_PROGRAM_WITH_BINARY_ERR);\r\n        if (err != nullptr) {\r\n            *err = error;\r\n        }\r\n    }\r\n\r\n    \r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 120\r\n    /**\r\n     * Create program using builtin kernels.\r\n     * \\param kernelNames Semi-colon separated list of builtin kernel names\r\n     */\r\n    Program(\r\n        const Context& context,\r\n        const vector<Device>& devices,\r\n        const string& kernelNames,\r\n        cl_int* err = nullptr)\r\n    {\r\n        cl_int error;\r\n\r\n\r\n        size_type numDevices = devices.size();\r\n        vector<cl_device_id> deviceIDs(numDevices);\r\n        for( size_type deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) {\r\n            deviceIDs[deviceIndex] = (devices[deviceIndex])();\r\n        }\r\n        \r\n        object_ = ::clCreateProgramWithBuiltInKernels(\r\n            context(), \r\n            (cl_uint) devices.size(),\r\n            deviceIDs.data(),\r\n            kernelNames.c_str(), \r\n            &error);\r\n\r\n        detail::errHandler(error, __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR);\r\n        if (err != nullptr) {\r\n            *err = error;\r\n        }\r\n    }\r\n#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120\r\n\r\n    Program() { }\r\n    \r\n\r\n    /*! 
\\brief Constructor from cl_program - takes ownership.\r\n     *\r\n     * \\param retainObject will cause the constructor to retain its cl object.\r\n     *                     Defaults to false to maintain compatibility with\r\n     *                     earlier versions.\r\n     */\r\n    explicit Program(const cl_program& program, bool retainObject = false) : \r\n        detail::Wrapper<cl_type>(program, retainObject) { }\r\n\r\n    Program& operator = (const cl_program& rhs)\r\n    {\r\n        detail::Wrapper<cl_type>::operator=(rhs);\r\n        return *this;\r\n    }\r\n\r\n    cl_int build(\r\n        const vector<Device>& devices,\r\n        const string& options,\r\n        void (CL_CALLBACK * notifyFptr)(cl_program, void *) = nullptr,\r\n        void* data = nullptr) const\r\n    {\r\n        return build(devices, options.c_str(), notifyFptr, data);\r\n    }\r\n\r\n    cl_int build(\r\n        const vector<Device>& devices,\r\n        const char* options = nullptr,\r\n        void (CL_CALLBACK * notifyFptr)(cl_program, void *) = nullptr,\r\n        void* data = nullptr) const\r\n    {\r\n        size_type numDevices = devices.size();\r\n        vector<cl_device_id> deviceIDs(numDevices);\r\n\r\n        for( size_type deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) {\r\n            deviceIDs[deviceIndex] = (devices[deviceIndex])();\r\n        }\r\n\r\n        cl_int buildError = ::clBuildProgram(\r\n            object_,\r\n            (cl_uint)\r\n            devices.size(),\r\n            deviceIDs.data(),\r\n            options,\r\n            notifyFptr,\r\n            data);\r\n\r\n        return detail::buildErrHandler(buildError, __BUILD_PROGRAM_ERR, getBuildInfo<CL_PROGRAM_BUILD_LOG>());\r\n    }\r\n\r\n    cl_int build(\r\n        const Device& device,\r\n        const string& options,\r\n        void (CL_CALLBACK * notifyFptr)(cl_program, void *) = nullptr,\r\n        void* data = nullptr) const\r\n    {\r\n        return build(device, 
options.c_str(), notifyFptr, data);\r\n    }\r\n\r\n    cl_int build(\r\n        const Device& device,\r\n        const char* options = nullptr,\r\n        void (CL_CALLBACK * notifyFptr)(cl_program, void *) = nullptr,\r\n        void* data = nullptr) const\r\n    {\r\n        cl_device_id deviceID = device();\r\n\r\n        cl_int buildError = ::clBuildProgram(\r\n            object_,\r\n            1,\r\n            &deviceID,\r\n            options,\r\n            notifyFptr,\r\n            data);\r\n\r\n        BuildLogType buildLog(0);\r\n        buildLog.push_back(std::make_pair(device, getBuildInfo<CL_PROGRAM_BUILD_LOG>(device)));\r\n        return detail::buildErrHandler(buildError, __BUILD_PROGRAM_ERR, buildLog);\r\n    }\r\n\r\n    cl_int build(\r\n        const string& options,\r\n        void (CL_CALLBACK * notifyFptr)(cl_program, void *) = nullptr,\r\n        void* data = nullptr) const\r\n    {\r\n        return build(options.c_str(), notifyFptr, data);\r\n    }\r\n\r\n    cl_int build(\r\n        const char* options = nullptr,\r\n        void (CL_CALLBACK * notifyFptr)(cl_program, void *) = nullptr,\r\n        void* data = nullptr) const\r\n    {\r\n        cl_int buildError = ::clBuildProgram(\r\n            object_,\r\n            0,\r\n            nullptr,\r\n            options,\r\n            notifyFptr,\r\n            data);\r\n\r\n        return detail::buildErrHandler(buildError, __BUILD_PROGRAM_ERR, getBuildInfo<CL_PROGRAM_BUILD_LOG>());\r\n    }\r\n\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 120\r\n    cl_int compile(\r\n        const string& options,\r\n        void (CL_CALLBACK * notifyFptr)(cl_program, void *) = nullptr,\r\n        void* data = nullptr) const\r\n    {\r\n        return compile(options.c_str(), notifyFptr, data);\r\n    }\r\n\r\n    cl_int compile(\r\n        const char* options = nullptr,\r\n        void (CL_CALLBACK * notifyFptr)(cl_program, void *) = nullptr,\r\n        void* data = nullptr) const\r\n    {\r\n        
cl_int error = ::clCompileProgram(\r\n            object_,\r\n            0,\r\n            nullptr,\r\n            options,\r\n            0,\r\n            nullptr,\r\n            nullptr,\r\n            notifyFptr,\r\n            data);\r\n        return detail::buildErrHandler(error, __COMPILE_PROGRAM_ERR, getBuildInfo<CL_PROGRAM_BUILD_LOG>());\r\n    }\r\n\r\n    cl_int compile(\r\n        const string& options,\r\n        const vector<Program>& inputHeaders,\r\n        const vector<string>& headerIncludeNames,\r\n        void (CL_CALLBACK * notifyFptr)(cl_program, void *) = nullptr,\r\n        void* data = nullptr) const\r\n    {\r\n        return compile(options.c_str(), inputHeaders, headerIncludeNames, notifyFptr, data);\r\n    }\r\n\r\n    cl_int compile(\r\n        const char* options,\r\n        const vector<Program>& inputHeaders,\r\n        const vector<string>& headerIncludeNames,\r\n        void (CL_CALLBACK * notifyFptr)(cl_program, void *) = nullptr,\r\n        void* data = nullptr) const\r\n    {\r\n        static_assert(sizeof(cl::Program) == sizeof(cl_program),\r\n            \"Size of cl::Program must be equal to size of cl_program\");\r\n        vector<const char*> headerIncludeNamesCStr;\r\n        for(const string& name: headerIncludeNames) {\r\n            headerIncludeNamesCStr.push_back(name.c_str());\r\n        }\r\n        cl_int error = ::clCompileProgram(\r\n            object_,\r\n            0,\r\n            nullptr,\r\n            options,\r\n            static_cast<cl_uint>(inputHeaders.size()),\r\n            reinterpret_cast<const cl_program*>(inputHeaders.data()),\r\n            reinterpret_cast<const char**>(headerIncludeNamesCStr.data()),\r\n            notifyFptr,\r\n            data);\r\n        return detail::buildErrHandler(error, __COMPILE_PROGRAM_ERR, getBuildInfo<CL_PROGRAM_BUILD_LOG>());\r\n    }\r\n\r\n    cl_int compile(\r\n        const string& options,\r\n        const vector<Device>& deviceList,\r\n        
const vector<Program>& inputHeaders = vector<Program>(),\r\n        const vector<string>& headerIncludeNames = vector<string>(),\r\n        void (CL_CALLBACK * notifyFptr)(cl_program, void *) = nullptr,\r\n        void* data = nullptr) const\r\n    {\r\n        return compile(options.c_str(), deviceList, inputHeaders, headerIncludeNames, notifyFptr, data);\r\n    }\r\n\r\n    cl_int compile(\r\n        const char* options,\r\n        const vector<Device>& deviceList,\r\n        const vector<Program>& inputHeaders = vector<Program>(),\r\n        const vector<string>& headerIncludeNames = vector<string>(),\r\n        void (CL_CALLBACK * notifyFptr)(cl_program, void *) = nullptr,\r\n        void* data = nullptr) const\r\n    {\r\n        static_assert(sizeof(cl::Program) == sizeof(cl_program),\r\n            \"Size of cl::Program must be equal to size of cl_program\");\r\n        vector<const char*> headerIncludeNamesCStr;\r\n        for(const string& name: headerIncludeNames) {\r\n            headerIncludeNamesCStr.push_back(name.c_str());\r\n        }\r\n        vector<cl_device_id> deviceIDList;\r\n        for(const Device& device: deviceList) {\r\n            deviceIDList.push_back(device());\r\n        }\r\n        cl_int error = ::clCompileProgram(\r\n            object_,\r\n            static_cast<cl_uint>(deviceList.size()),\r\n            reinterpret_cast<const cl_device_id*>(deviceIDList.data()),\r\n            options,\r\n            static_cast<cl_uint>(inputHeaders.size()),\r\n            reinterpret_cast<const cl_program*>(inputHeaders.data()),\r\n            reinterpret_cast<const char**>(headerIncludeNamesCStr.data()),\r\n            notifyFptr,\r\n            data);\r\n        return detail::buildErrHandler(error, __COMPILE_PROGRAM_ERR, getBuildInfo<CL_PROGRAM_BUILD_LOG>());\r\n    }\r\n#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120\r\n\r\n    template <typename T>\r\n    cl_int getInfo(cl_program_info name, T* param) const\r\n    {\r\n        return 
detail::errHandler(\r\n            detail::getInfo(&::clGetProgramInfo, object_, name, param),\r\n            __GET_PROGRAM_INFO_ERR);\r\n    }\r\n\r\n    template <cl_program_info name> typename\r\n    detail::param_traits<detail::cl_program_info, name>::param_type\r\n    getInfo(cl_int* err = nullptr) const\r\n    {\r\n        typename detail::param_traits<\r\n            detail::cl_program_info, name>::param_type param;\r\n        cl_int result = getInfo(name, &param);\r\n        if (err != nullptr) {\r\n            *err = result;\r\n        }\r\n        return param;\r\n    }\r\n\r\n    template <typename T>\r\n    cl_int getBuildInfo(\r\n        const Device& device, cl_program_build_info name, T* param) const\r\n    {\r\n        return detail::errHandler(\r\n            detail::getInfo(\r\n                &::clGetProgramBuildInfo, object_, device(), name, param),\r\n                __GET_PROGRAM_BUILD_INFO_ERR);\r\n    }\r\n\r\n    template <cl_program_build_info name> typename\r\n    detail::param_traits<detail::cl_program_build_info, name>::param_type\r\n    getBuildInfo(const Device& device, cl_int* err = nullptr) const\r\n    {\r\n        typename detail::param_traits<\r\n            detail::cl_program_build_info, name>::param_type param;\r\n        cl_int result = getBuildInfo(device, name, &param);\r\n        if (err != nullptr) {\r\n            *err = result;\r\n        }\r\n        return param;\r\n    }\r\n    \r\n    /**\r\n     * Build info function that returns a vector of device/info pairs for the specified \r\n     * info type and for all devices in the program.\r\n     * On an error reading the info for any device, an empty vector of info will be returned.\r\n     */\r\n    template <cl_program_build_info name>\r\n    vector<std::pair<cl::Device, typename detail::param_traits<detail::cl_program_build_info, name>::param_type>>\r\n        getBuildInfo(cl_int *err = nullptr) const\r\n    {\r\n        cl_int result = CL_SUCCESS;\r\n\r\n        auto 
devs = getInfo<CL_PROGRAM_DEVICES>(&result);\r\n        vector<std::pair<cl::Device, typename detail::param_traits<detail::cl_program_build_info, name>::param_type>>\r\n            devInfo;\r\n\r\n        // If there was an initial error from getInfo return the error\r\n        if (result != CL_SUCCESS) {\r\n            if (err != nullptr) {\r\n                *err = result;\r\n            }\r\n            return devInfo;\r\n        }\r\n\r\n        for (const cl::Device &d : devs) {\r\n            typename detail::param_traits<\r\n                detail::cl_program_build_info, name>::param_type param;\r\n            result = getBuildInfo(d, name, &param);\r\n            devInfo.push_back(\r\n                std::pair<cl::Device, typename detail::param_traits<detail::cl_program_build_info, name>::param_type>\r\n                (d, param));\r\n            if (result != CL_SUCCESS) {\r\n                // On error, leave the loop and return the error code\r\n                break;\r\n            }\r\n        }\r\n        if (err != nullptr) {\r\n            *err = result;\r\n        }\r\n        if (result != CL_SUCCESS) {\r\n            devInfo.clear();\r\n        }\r\n        return devInfo;\r\n    }\r\n\r\n    cl_int createKernels(vector<Kernel>* kernels)\r\n    {\r\n        cl_uint numKernels;\r\n        cl_int err = ::clCreateKernelsInProgram(object_, 0, nullptr, &numKernels);\r\n        if (err != CL_SUCCESS) {\r\n            return detail::errHandler(err, __CREATE_KERNELS_IN_PROGRAM_ERR);\r\n        }\r\n\r\n        vector<cl_kernel> value(numKernels);\r\n        \r\n        err = ::clCreateKernelsInProgram(\r\n            object_, numKernels, value.data(), nullptr);\r\n        if (err != CL_SUCCESS) {\r\n            return detail::errHandler(err, __CREATE_KERNELS_IN_PROGRAM_ERR);\r\n        }\r\n\r\n        if (kernels) {\r\n            kernels->resize(value.size());\r\n\r\n            // Assign to param, constructing with retain behaviour\r\n            // 
to correctly capture each underlying CL object\r\n            for (size_type i = 0; i < value.size(); i++) {\r\n                // We do not need to retain because this kernel is being created \r\n                // by the runtime\r\n                (*kernels)[i] = Kernel(value[i], false);\r\n            }\r\n        }\r\n        return CL_SUCCESS;\r\n    }\r\n\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 220\r\n#if defined(CL_USE_DEPRECATED_OPENCL_2_2_APIS)\r\n    /*! \\brief Registers a callback function to be called when destructors for\r\n     *         program scope global variables are complete and before the\r\n     *         program is released.\r\n     *\r\n     *  Wraps clSetProgramReleaseCallback().\r\n     *\r\n     *  Each call to this function registers the specified user callback function\r\n     *  on a callback stack associated with program. The registered user callback\r\n     *  functions are called in the reverse order in which they were registered.\r\n     */\r\n    CL_API_PREFIX__VERSION_2_2_DEPRECATED cl_int setReleaseCallback(\r\n        void (CL_CALLBACK * pfn_notify)(cl_program program, void * user_data),\r\n        void * user_data = nullptr) CL_API_SUFFIX__VERSION_2_2_DEPRECATED\r\n    {\r\n        return detail::errHandler(\r\n            ::clSetProgramReleaseCallback(\r\n                object_,\r\n                pfn_notify,\r\n                user_data),\r\n            __SET_PROGRAM_RELEASE_CALLBACK_ERR);\r\n    }\r\n#endif // #if defined(CL_USE_DEPRECATED_OPENCL_2_2_APIS)\r\n\r\n    /*! 
\\brief Sets a SPIR-V specialization constant.\r\n     *\r\n     *  Wraps clSetProgramSpecializationConstant().\r\n     */\r\n    template <typename T>\r\n    typename std::enable_if<!std::is_pointer<T>::value, cl_int>::type\r\n        setSpecializationConstant(cl_uint index, const T &value)\r\n    {\r\n        return detail::errHandler(\r\n            ::clSetProgramSpecializationConstant(\r\n                object_,\r\n                index,\r\n                sizeof(value),\r\n                &value),\r\n            __SET_PROGRAM_SPECIALIZATION_CONSTANT_ERR);\r\n    }\r\n\r\n    /*! \\brief Sets a SPIR-V specialization constant.\r\n     *\r\n     *  Wraps clSetProgramSpecializationConstant().\r\n     */\r\n    cl_int setSpecializationConstant(cl_uint index, size_type size, const void* value)\r\n    {\r\n        return detail::errHandler(\r\n            ::clSetProgramSpecializationConstant(\r\n                object_,\r\n                index,\r\n                size,\r\n                value),\r\n            __SET_PROGRAM_SPECIALIZATION_CONSTANT_ERR);\r\n    }\r\n#endif // CL_HPP_TARGET_OPENCL_VERSION >= 220\r\n};\r\n\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 120\r\ninline Program linkProgram(\r\n    const Program& input1,\r\n    const Program& input2,\r\n    const char* options = nullptr,\r\n    void (CL_CALLBACK * notifyFptr)(cl_program, void *) = nullptr,\r\n    void* data = nullptr,\r\n    cl_int* err = nullptr)\r\n{\r\n    cl_int error_local = CL_SUCCESS;\r\n    cl_program programs[2] = { input1(), input2() };\r\n\r\n    Context ctx = input1.getInfo<CL_PROGRAM_CONTEXT>(&error_local);\r\n    if(error_local!=CL_SUCCESS) {\r\n        detail::errHandler(error_local, __LINK_PROGRAM_ERR);\r\n    }\r\n\r\n    cl_program prog = ::clLinkProgram(\r\n        ctx(),\r\n        0,\r\n        nullptr,\r\n        options,\r\n        2,\r\n        programs,\r\n        notifyFptr,\r\n        data,\r\n        &error_local);\r\n\r\n    
detail::errHandler(error_local, __LINK_PROGRAM_ERR);\r\n    if (err != nullptr) {\r\n        *err = error_local;\r\n    }\r\n\r\n    return Program(prog);\r\n}\r\n\r\n/**\r\n * Convenience overload taking the link options as a string.\r\n * Forwards to the const char* overload for two input programs.\r\n */\r\ninline Program linkProgram(\r\n    const Program& input1,\r\n    const Program& input2,\r\n    const string& options,\r\n    void (CL_CALLBACK * notifyFptr)(cl_program, void *) = nullptr,\r\n    void* data = nullptr,\r\n    cl_int* err = nullptr)\r\n{\r\n    return linkProgram(input1, input2, options.c_str(), notifyFptr, data, err);\r\n}\r\n\r\n/**\r\n * Links the given programs into a new Program with clLinkProgram.\r\n * The context is queried from the first input program; when the vector is\r\n * empty a default-constructed Context is passed instead.\r\n * The resulting status code is written to err when err is not null.\r\n */\r\ninline Program linkProgram(\r\n    const vector<Program>& inputPrograms,\r\n    const char* options = nullptr,\r\n    void (CL_CALLBACK * notifyFptr)(cl_program, void *) = nullptr,\r\n    void* data = nullptr,\r\n    cl_int* err = nullptr)\r\n{\r\n    cl_int error_local = CL_SUCCESS;\r\n    Context ctx;\r\n\r\n    static_assert(sizeof(cl::Program) == sizeof(cl_program),\r\n        \"Size of cl::Program must be equal to size of cl_program\");\r\n\r\n    if(inputPrograms.size() > 0) {\r\n        ctx = inputPrograms[0].getInfo<CL_PROGRAM_CONTEXT>(&error_local);\r\n        if(error_local!=CL_SUCCESS) {\r\n            detail::errHandler(error_local, __LINK_PROGRAM_ERR);\r\n        }\r\n    }\r\n\r\n    cl_program prog = ::clLinkProgram(\r\n        ctx(),\r\n        0,\r\n        nullptr,\r\n        options,\r\n        static_cast<cl_uint>(inputPrograms.size()),\r\n        reinterpret_cast<const cl_program *>(inputPrograms.data()),\r\n        notifyFptr,\r\n        data,\r\n        &error_local);\r\n\r\n    // The status comes from clLinkProgram, so report it with the link error string.\r\n    detail::errHandler(error_local, __LINK_PROGRAM_ERR);\r\n    if (err != nullptr) {\r\n        *err = error_local;\r\n    }\r\n\r\n    return Program(prog);\r\n}\r\n\r\n/**\r\n * Convenience overload taking the link options as a string.\r\n * Forwards to the const char* overload for a vector of input programs.\r\n */\r\ninline Program linkProgram(\r\n    const vector<Program>& inputPrograms,\r\n    const string& options,\r\n    void (CL_CALLBACK * notifyFptr)(cl_program, void *) = nullptr,\r\n    void* data = nullptr,\r\n    cl_int* err = nullptr)\r\n{\r\n    return linkProgram(inputPrograms, options.c_str(), 
notifyFptr, data, err);\r\n}\r\n#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120\r\n\r\n// Template specialization for CL_PROGRAM_BINARIES\r\ntemplate <>\r\ninline cl_int cl::Program::getInfo(cl_program_info name, vector<vector<unsigned char>>* param) const\r\n{\r\n    if (name != CL_PROGRAM_BINARIES) {\r\n        return CL_INVALID_VALUE;\r\n    }\r\n    if (param) {\r\n        // Resize the parameter array appropriately for each allocation\r\n        // and pass down to the helper\r\n\r\n        vector<size_type> sizes = getInfo<CL_PROGRAM_BINARY_SIZES>();\r\n        size_type numBinaries = sizes.size();\r\n\r\n        // Resize the parameter array and constituent arrays\r\n        param->resize(numBinaries);\r\n        for (size_type i = 0; i < numBinaries; ++i) {\r\n            (*param)[i].resize(sizes[i]);\r\n        }\r\n\r\n        return detail::errHandler(\r\n            detail::getInfo(&::clGetProgramInfo, object_, name, param),\r\n            __GET_PROGRAM_INFO_ERR);\r\n    }\r\n\r\n    return CL_SUCCESS;\r\n}\r\n\r\ntemplate<>\r\ninline vector<vector<unsigned char>> cl::Program::getInfo<CL_PROGRAM_BINARIES>(cl_int* err) const\r\n{\r\n    vector<vector<unsigned char>> binariesVectors;\r\n\r\n    cl_int result = getInfo(CL_PROGRAM_BINARIES, &binariesVectors);\r\n    if (err != nullptr) {\r\n        *err = result;\r\n    }\r\n    return binariesVectors;\r\n}\r\n\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 220\r\n// Template specialization for clSetProgramSpecializationConstant\r\ntemplate <>\r\ninline cl_int cl::Program::setSpecializationConstant(cl_uint index, const bool &value)\r\n{\r\n    cl_uchar ucValue = value ? 
CL_UCHAR_MAX : 0;\r\n    return detail::errHandler(\r\n        ::clSetProgramSpecializationConstant(\r\n            object_,\r\n            index,\r\n            sizeof(ucValue),\r\n            &ucValue),\r\n        __SET_PROGRAM_SPECIALIZATION_CONSTANT_ERR);\r\n}\r\n#endif // CL_HPP_TARGET_OPENCL_VERSION >= 220\r\n\r\ninline Kernel::Kernel(const Program& program, const string& name, cl_int* err)\r\n{\r\n    cl_int error;\r\n\r\n    object_ = ::clCreateKernel(program(), name.c_str(), &error);\r\n    detail::errHandler(error, __CREATE_KERNEL_ERR);\r\n\r\n    if (err != nullptr) {\r\n        *err = error;\r\n    }\r\n}\r\n\r\ninline Kernel::Kernel(const Program& program, const char* name, cl_int* err)\r\n{\r\n    cl_int error;\r\n\r\n    object_ = ::clCreateKernel(program(), name, &error);\r\n    detail::errHandler(error, __CREATE_KERNEL_ERR);\r\n\r\n    if (err != nullptr) {\r\n        *err = error;\r\n    }\r\n}\r\n\r\n#ifdef cl_khr_external_memory\r\nenum class ExternalMemoryType : cl_external_memory_handle_type_khr\r\n{\r\n    None = 0,\r\n#ifdef cl_khr_external_memory_opaque_fd\r\n    OpaqueFd = CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_FD_KHR,\r\n#endif // cl_khr_external_memory_opaque_fd\r\n#ifdef cl_khr_external_memory_win32\r\n    OpaqueWin32 = CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_WIN32_KHR,\r\n    OpaqueWin32Kmt = CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_WIN32_KMT_KHR,\r\n#endif // cl_khr_external_memory_win32\r\n#ifdef cl_khr_external_memory_dma_buf\r\n    DmaBuf = CL_EXTERNAL_MEMORY_HANDLE_DMA_BUF_KHR,\r\n#endif // cl_khr_external_memory_dma_buf\r\n};\r\n#endif // cl_khr_external_memory\r\n\r\nenum class QueueProperties : cl_command_queue_properties\r\n{\r\n    None = 0,\r\n    Profiling = CL_QUEUE_PROFILING_ENABLE,\r\n    OutOfOrder = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,\r\n};\r\n\r\ninline QueueProperties operator|(QueueProperties lhs, QueueProperties rhs)\r\n{\r\n    return static_cast<QueueProperties>(static_cast<cl_command_queue_properties>(lhs) | 
static_cast<cl_command_queue_properties>(rhs));\r\n}\r\n\r\ninline QueueProperties operator&(QueueProperties lhs, QueueProperties rhs)\r\n{\r\n    return static_cast<QueueProperties>(static_cast<cl_command_queue_properties>(lhs) & static_cast<cl_command_queue_properties>(rhs));\r\n}\r\n\r\n/*! \\class CommandQueue\r\n * \\brief CommandQueue interface for cl_command_queue.\r\n */\r\nclass CommandQueue : public detail::Wrapper<cl_command_queue>\r\n{\r\nprivate:\r\n    static std::once_flag default_initialized_;\r\n    static CommandQueue default_;\r\n    static cl_int default_error_;\r\n\r\n    /*! \\brief Create the default command queue returned by @ref getDefault.\r\n     *\r\n     * It sets default_error_ to indicate success or failure. It does not throw\r\n     * @c cl::Error.\r\n     */\r\n    static void makeDefault()\r\n    {\r\n        /* We don't want to throw an error from this function, so we have to\r\n         * catch and set the error flag.\r\n         */\r\n#if defined(CL_HPP_ENABLE_EXCEPTIONS)\r\n        try\r\n#endif\r\n        {\r\n            int error;\r\n            Context context = Context::getDefault(&error);\r\n\r\n            if (error != CL_SUCCESS) {\r\n                default_error_ = error;\r\n            }\r\n            else {\r\n                Device device = Device::getDefault();\r\n                default_ = CommandQueue(context, device, 0, &default_error_);\r\n            }\r\n        }\r\n#if defined(CL_HPP_ENABLE_EXCEPTIONS)\r\n        catch (cl::Error &e) {\r\n            default_error_ = e.err();\r\n        }\r\n#endif\r\n    }\r\n\r\n    /*! \\brief Create the default command queue.\r\n     *\r\n     * This sets @c default_. 
It does not throw\r\n     * @c cl::Error.\r\n     */\r\n    static void makeDefaultProvided(const CommandQueue &c) {\r\n        default_ = c;\r\n    }\r\n\r\n#ifdef cl_khr_external_memory\r\n    static std::once_flag ext_memory_initialized_;\r\n\r\n    static void initMemoryExtension(const cl::Device& device) \r\n    {\r\n        auto platform = device.getInfo<CL_DEVICE_PLATFORM>()();\r\n\r\n        CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clEnqueueAcquireExternalMemObjectsKHR);\r\n        CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clEnqueueReleaseExternalMemObjectsKHR);\r\n\r\n        if ((pfn_clEnqueueAcquireExternalMemObjectsKHR == nullptr)\r\n            && (pfn_clEnqueueReleaseExternalMemObjectsKHR == nullptr))\r\n        {\r\n            detail::errHandler(CL_INVALID_VALUE, __ENQUEUE_ACQUIRE_EXTERNAL_MEMORY_ERR);\r\n        }\r\n    }\r\n#endif // cl_khr_external_memory\r\n\r\npublic:\r\n#ifdef CL_HPP_UNIT_TEST_ENABLE\r\n    /*! \\brief Reset the default.\r\n    *\r\n    * This sets @c default_ to an empty value to support cleanup in\r\n    * the unit test framework.\r\n    * This function is not thread safe.\r\n    */\r\n    static void unitTestClearDefault() {\r\n        default_ = CommandQueue();\r\n    }\r\n#endif // #ifdef CL_HPP_UNIT_TEST_ENABLE\r\n        \r\n\r\n    /*!\r\n     * \\brief Constructs a CommandQueue based on passed properties.\r\n     * Will return an CL_INVALID_QUEUE_PROPERTIES error if CL_QUEUE_ON_DEVICE is specified.\r\n     */\r\n   CommandQueue(\r\n        cl_command_queue_properties properties,\r\n        cl_int* err = nullptr)\r\n    {\r\n        cl_int error;\r\n\r\n        Context context = Context::getDefault(&error);\r\n        detail::errHandler(error, __CREATE_CONTEXT_ERR);\r\n\r\n        if (error != CL_SUCCESS) {\r\n            if (err != nullptr) {\r\n                *err = error;\r\n            }\r\n        }\r\n        else {\r\n            Device device = context.getInfo<CL_CONTEXT_DEVICES>()[0];\r\n          
  bool useWithProperties;\r\n\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 200 && CL_HPP_MINIMUM_OPENCL_VERSION < 200\r\n            // Run-time decision based on the actual platform\r\n            {\r\n                cl_uint version = detail::getContextPlatformVersion(context());\r\n                useWithProperties = (version >= 0x20000); // OpenCL 2.0 or above\r\n            }\r\n#elif CL_HPP_TARGET_OPENCL_VERSION >= 200\r\n            useWithProperties = true;\r\n#else\r\n            useWithProperties = false;\r\n#endif\r\n\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 200\r\n            if (useWithProperties) {\r\n                cl_queue_properties queue_properties[] = {\r\n                    CL_QUEUE_PROPERTIES, properties, 0 };\r\n                if ((properties & CL_QUEUE_ON_DEVICE) == 0) {\r\n                    object_ = ::clCreateCommandQueueWithProperties(\r\n                        context(), device(), queue_properties, &error);\r\n                }\r\n                else {\r\n                    error = CL_INVALID_QUEUE_PROPERTIES;\r\n                }\r\n\r\n                detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR);\r\n                if (err != nullptr) {\r\n                    *err = error;\r\n                }\r\n            }\r\n#endif // CL_HPP_TARGET_OPENCL_VERSION >= 200\r\n#if CL_HPP_MINIMUM_OPENCL_VERSION < 200\r\n            if (!useWithProperties) {\r\n                object_ = ::clCreateCommandQueue(\r\n                    context(), device(), properties, &error);\r\n\r\n                detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);\r\n                if (err != nullptr) {\r\n                    *err = error;\r\n                }\r\n            }\r\n#endif // CL_HPP_MINIMUM_OPENCL_VERSION < 200\r\n        }\r\n    }\r\n\r\n   /*!\r\n    * \\brief Constructs a CommandQueue based on passed properties.\r\n    * Will return an CL_INVALID_QUEUE_PROPERTIES error if CL_QUEUE_ON_DEVICE is specified.\r\n    */\r\n   
CommandQueue(\r\n       QueueProperties properties,\r\n       cl_int* err = nullptr)\r\n   {\r\n       cl_int error;\r\n\r\n       Context context = Context::getDefault(&error);\r\n       detail::errHandler(error, __CREATE_CONTEXT_ERR);\r\n\r\n       if (error != CL_SUCCESS) {\r\n           if (err != nullptr) {\r\n               *err = error;\r\n           }\r\n       }\r\n       else {\r\n           Device device = context.getInfo<CL_CONTEXT_DEVICES>()[0];\r\n           bool useWithProperties;\r\n\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 200 && CL_HPP_MINIMUM_OPENCL_VERSION < 200\r\n           // Run-time decision based on the actual platform\r\n           {\r\n               cl_uint version = detail::getContextPlatformVersion(context());\r\n               useWithProperties = (version >= 0x20000); // OpenCL 2.0 or above\r\n           }\r\n#elif CL_HPP_TARGET_OPENCL_VERSION >= 200\r\n           useWithProperties = true;\r\n#else\r\n           useWithProperties = false;\r\n#endif\r\n\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 200\r\n           if (useWithProperties) {\r\n               cl_queue_properties queue_properties[] = {\r\n                   CL_QUEUE_PROPERTIES, static_cast<cl_queue_properties>(properties), 0 };\r\n\r\n               object_ = ::clCreateCommandQueueWithProperties(\r\n                   context(), device(), queue_properties, &error);\r\n\r\n               detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR);\r\n               if (err != nullptr) {\r\n                   *err = error;\r\n               }\r\n           }\r\n#endif // CL_HPP_TARGET_OPENCL_VERSION >= 200\r\n#if CL_HPP_MINIMUM_OPENCL_VERSION < 200\r\n           if (!useWithProperties) {\r\n               object_ = ::clCreateCommandQueue(\r\n                   context(), device(), static_cast<cl_command_queue_properties>(properties), &error);\r\n\r\n               detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);\r\n               if (err != nullptr) {\r\n           
        *err = error;\r\n               }\r\n           }\r\n#endif // CL_HPP_MINIMUM_OPENCL_VERSION < 200\r\n\r\n       }\r\n   }\r\n\r\n    /*!\r\n     * \\brief Constructs a CommandQueue for an implementation defined device in the given context\r\n     * Will return an CL_INVALID_QUEUE_PROPERTIES error if CL_QUEUE_ON_DEVICE is specified.\r\n     */\r\n    explicit CommandQueue(\r\n        const Context& context,\r\n        cl_command_queue_properties properties = 0,\r\n        cl_int* err = nullptr)\r\n    {\r\n        cl_int error;\r\n        bool useWithProperties;\r\n        vector<cl::Device> devices;\r\n        error = context.getInfo(CL_CONTEXT_DEVICES, &devices);\r\n\r\n        detail::errHandler(error, __CREATE_CONTEXT_ERR);\r\n\r\n        if (error != CL_SUCCESS)\r\n        {\r\n            if (err != nullptr) {\r\n                *err = error;\r\n            }\r\n            return;\r\n        }\r\n\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 200 && CL_HPP_MINIMUM_OPENCL_VERSION < 200\r\n        // Run-time decision based on the actual platform\r\n        {\r\n            cl_uint version = detail::getContextPlatformVersion(context());\r\n            useWithProperties = (version >= 0x20000); // OpenCL 2.0 or above\r\n        }\r\n#elif CL_HPP_TARGET_OPENCL_VERSION >= 200\r\n        useWithProperties = true;\r\n#else\r\n        useWithProperties = false;\r\n#endif\r\n\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 200\r\n        if (useWithProperties) {\r\n            cl_queue_properties queue_properties[] = {\r\n                CL_QUEUE_PROPERTIES, properties, 0 };\r\n            if ((properties & CL_QUEUE_ON_DEVICE) == 0) {\r\n                object_ = ::clCreateCommandQueueWithProperties(\r\n                    context(), devices[0](), queue_properties, &error);\r\n            }\r\n            else {\r\n                error = CL_INVALID_QUEUE_PROPERTIES;\r\n            }\r\n\r\n            detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR);\r\n 
           if (err != nullptr) {\r\n                *err = error;\r\n            }\r\n        }\r\n#endif // CL_HPP_TARGET_OPENCL_VERSION >= 200\r\n#if CL_HPP_MINIMUM_OPENCL_VERSION < 200\r\n        if (!useWithProperties) {\r\n            object_ = ::clCreateCommandQueue(\r\n                context(), devices[0](), properties, &error);\r\n\r\n            detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);\r\n            if (err != nullptr) {\r\n                *err = error;\r\n            }\r\n        }\r\n#endif // CL_HPP_MINIMUM_OPENCL_VERSION < 200\r\n    }\r\n\r\n    /*!\r\n    * \\brief Constructs a CommandQueue for an implementation defined device in the given context\r\n    * Will return an CL_INVALID_QUEUE_PROPERTIES error if CL_QUEUE_ON_DEVICE is specified.\r\n    */\r\n    explicit CommandQueue(\r\n        const Context& context,\r\n        QueueProperties properties,\r\n        cl_int* err = nullptr)\r\n    {\r\n        cl_int error;\r\n        bool useWithProperties;\r\n        vector<cl::Device> devices;\r\n        error = context.getInfo(CL_CONTEXT_DEVICES, &devices);\r\n\r\n        detail::errHandler(error, __CREATE_CONTEXT_ERR);\r\n\r\n        if (error != CL_SUCCESS)\r\n        {\r\n            if (err != nullptr) {\r\n                *err = error;\r\n            }\r\n            return;\r\n        }\r\n\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 200 && CL_HPP_MINIMUM_OPENCL_VERSION < 200\r\n        // Run-time decision based on the actual platform\r\n        {\r\n            cl_uint version = detail::getContextPlatformVersion(context());\r\n            useWithProperties = (version >= 0x20000); // OpenCL 2.0 or above\r\n        }\r\n#elif CL_HPP_TARGET_OPENCL_VERSION >= 200\r\n        useWithProperties = true;\r\n#else\r\n        useWithProperties = false;\r\n#endif\r\n\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 200\r\n        if (useWithProperties) {\r\n            cl_queue_properties queue_properties[] = {\r\n                CL_QUEUE_PROPERTIES, 
static_cast<cl_queue_properties>(properties), 0 };\r\n            object_ = ::clCreateCommandQueueWithProperties(\r\n                context(), devices[0](), queue_properties, &error);\r\n\r\n            detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR);\r\n            if (err != nullptr) {\r\n                *err = error;\r\n            }\r\n        }\r\n#endif // CL_HPP_TARGET_OPENCL_VERSION >= 200\r\n#if CL_HPP_MINIMUM_OPENCL_VERSION < 200\r\n        if (!useWithProperties) {\r\n            object_ = ::clCreateCommandQueue(\r\n                context(), devices[0](), static_cast<cl_command_queue_properties>(properties), &error);\r\n\r\n            detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);\r\n            if (err != nullptr) {\r\n                *err = error;\r\n            }\r\n        }\r\n#endif // CL_HPP_MINIMUM_OPENCL_VERSION < 200\r\n    }\r\n\r\n    /*!\r\n     * \\brief Constructs a CommandQueue for a passed device and context\r\n     * Will return an CL_INVALID_QUEUE_PROPERTIES error if CL_QUEUE_ON_DEVICE is specified.\r\n     */\r\n    CommandQueue(\r\n        const Context& context,\r\n        const Device& device,\r\n        cl_command_queue_properties properties = 0,\r\n        cl_int* err = nullptr)\r\n    {\r\n        cl_int error;\r\n        bool useWithProperties;\r\n\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 200 && CL_HPP_MINIMUM_OPENCL_VERSION < 200\r\n        // Run-time decision based on the actual platform\r\n        {\r\n            cl_uint version = detail::getContextPlatformVersion(context());\r\n            useWithProperties = (version >= 0x20000); // OpenCL 2.0 or above\r\n        }\r\n#elif CL_HPP_TARGET_OPENCL_VERSION >= 200\r\n        useWithProperties = true;\r\n#else\r\n        useWithProperties = false;\r\n#endif\r\n\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 200\r\n        if (useWithProperties) {\r\n            cl_queue_properties queue_properties[] = {\r\n                CL_QUEUE_PROPERTIES, properties, 
0 };\r\n            object_ = ::clCreateCommandQueueWithProperties(\r\n                context(), device(), queue_properties, &error);\r\n\r\n            detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR);\r\n            if (err != nullptr) {\r\n                *err = error;\r\n            }\r\n        }\r\n#endif // CL_HPP_TARGET_OPENCL_VERSION >= 200\r\n#if CL_HPP_MINIMUM_OPENCL_VERSION < 200\r\n        if (!useWithProperties) {\r\n            object_ = ::clCreateCommandQueue(\r\n                context(), device(), properties, &error);\r\n\r\n            detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);\r\n            if (err != nullptr) {\r\n                *err = error;\r\n            }\r\n        }\r\n#endif // CL_HPP_MINIMUM_OPENCL_VERSION < 200\r\n    }\r\n\r\n    /*!\r\n     * \\brief Constructs a CommandQueue for a passed device and context\r\n     * Will return an CL_INVALID_QUEUE_PROPERTIES error if CL_QUEUE_ON_DEVICE is specified.\r\n     */\r\n    CommandQueue(\r\n        const Context& context,\r\n        const Device& device,\r\n        QueueProperties properties,\r\n        cl_int* err = nullptr)\r\n    {\r\n        cl_int error;\r\n        bool useWithProperties;\r\n\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 200 && CL_HPP_MINIMUM_OPENCL_VERSION < 200\r\n        // Run-time decision based on the actual platform\r\n        {\r\n            cl_uint version = detail::getContextPlatformVersion(context());\r\n            useWithProperties = (version >= 0x20000); // OpenCL 2.0 or above\r\n        }\r\n#elif CL_HPP_TARGET_OPENCL_VERSION >= 200\r\n        useWithProperties = true;\r\n#else\r\n        useWithProperties = false;\r\n#endif\r\n\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 200\r\n        if (useWithProperties) {\r\n            cl_queue_properties queue_properties[] = {\r\n                CL_QUEUE_PROPERTIES, static_cast<cl_queue_properties>(properties), 0 };\r\n            object_ = ::clCreateCommandQueueWithProperties(\r\n       
         context(), device(), queue_properties, &error);\r\n\r\n            detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR);\r\n            if (err != nullptr) {\r\n                *err = error;\r\n            }\r\n        }\r\n#endif // CL_HPP_TARGET_OPENCL_VERSION >= 200\r\n#if CL_HPP_MINIMUM_OPENCL_VERSION < 200\r\n        if (!useWithProperties) {\r\n            object_ = ::clCreateCommandQueue(\r\n                context(), device(), static_cast<cl_command_queue_properties>(properties), &error);\r\n\r\n            detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);\r\n            if (err != nullptr) {\r\n                *err = error;\r\n            }\r\n        }\r\n#endif // CL_HPP_MINIMUM_OPENCL_VERSION < 200\r\n    }\r\n\r\n    static CommandQueue getDefault(cl_int * err = nullptr) \r\n    {\r\n        std::call_once(default_initialized_, makeDefault);\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 200\r\n        detail::errHandler(default_error_, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR);\r\n#else // CL_HPP_TARGET_OPENCL_VERSION >= 200\r\n        detail::errHandler(default_error_, __CREATE_COMMAND_QUEUE_ERR);\r\n#endif // CL_HPP_TARGET_OPENCL_VERSION >= 200\r\n        if (err != nullptr) {\r\n            *err = default_error_;\r\n        }\r\n        return default_;\r\n    }\r\n\r\n    /**\r\n     * Modify the default command queue to be used by\r\n     * subsequent operations.\r\n     * Will only set the default if no default was previously created.\r\n     * @return updated default command queue.\r\n     *         Should be compared to the passed value to ensure that it was updated.\r\n     */\r\n    static CommandQueue setDefault(const CommandQueue &default_queue)\r\n    {\r\n        std::call_once(default_initialized_, makeDefaultProvided, std::cref(default_queue));\r\n        detail::errHandler(default_error_);\r\n        return default_;\r\n    }\r\n\r\n    CommandQueue() { }\r\n\r\n\r\n    /*! 
\\brief Constructor from cl_command_queue - takes ownership.\r\n     *\r\n     * \\param retainObject will cause the constructor to retain its cl object.\r\n     *                     Defaults to false to maintain compatibility with\r\n     *                     earlier versions.\r\n     */\r\n    explicit CommandQueue(const cl_command_queue& commandQueue, bool retainObject = false) : \r\n        detail::Wrapper<cl_type>(commandQueue, retainObject) { }\r\n\r\n    CommandQueue& operator = (const cl_command_queue& rhs)\r\n    {\r\n        detail::Wrapper<cl_type>::operator=(rhs);\r\n        return *this;\r\n    }\r\n\r\n    template <typename T>\r\n    cl_int getInfo(cl_command_queue_info name, T* param) const\r\n    {\r\n        return detail::errHandler(\r\n            detail::getInfo(\r\n                &::clGetCommandQueueInfo, object_, name, param),\r\n                __GET_COMMAND_QUEUE_INFO_ERR);\r\n    }\r\n\r\n    template <cl_command_queue_info name> typename\r\n    detail::param_traits<detail::cl_command_queue_info, name>::param_type\r\n    getInfo(cl_int* err = nullptr) const\r\n    {\r\n        typename detail::param_traits<\r\n            detail::cl_command_queue_info, name>::param_type param;\r\n        cl_int result = getInfo(name, &param);\r\n        if (err != nullptr) {\r\n            *err = result;\r\n        }\r\n        return param;\r\n    }\r\n\r\n    cl_int enqueueReadBuffer(\r\n        const Buffer& buffer,\r\n        cl_bool blocking,\r\n        size_type offset,\r\n        size_type size,\r\n        void* ptr,\r\n        const vector<Event>* events = nullptr,\r\n        Event* event = nullptr) const\r\n    {\r\n        cl_event tmp;\r\n        cl_int err = detail::errHandler(\r\n            ::clEnqueueReadBuffer(\r\n                object_, buffer(), blocking, offset, size,\r\n                ptr,\r\n                (events != nullptr) ? (cl_uint) events->size() : 0,\r\n                (events != nullptr && events->size() > 0) ? 
(cl_event*) &events->front() : nullptr,\r\n                (event != nullptr) ? &tmp : nullptr),\r\n            __ENQUEUE_READ_BUFFER_ERR);\r\n\r\n        if (event != nullptr && err == CL_SUCCESS)\r\n            *event = tmp;\r\n\r\n        return err;\r\n    }\r\n\r\n    cl_int enqueueWriteBuffer(\r\n        const Buffer& buffer,\r\n        cl_bool blocking,\r\n        size_type offset,\r\n        size_type size,\r\n        const void* ptr,\r\n        const vector<Event>* events = nullptr,\r\n        Event* event = nullptr) const\r\n    {\r\n        cl_event tmp;\r\n        cl_int err = detail::errHandler(\r\n            ::clEnqueueWriteBuffer(\r\n                object_, buffer(), blocking, offset, size,\r\n                ptr,\r\n                (events != nullptr) ? (cl_uint) events->size() : 0,\r\n                (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr,\r\n                (event != nullptr) ? &tmp : nullptr),\r\n                __ENQUEUE_WRITE_BUFFER_ERR);\r\n\r\n        if (event != nullptr && err == CL_SUCCESS)\r\n            *event = tmp;\r\n\r\n        return err;\r\n    }\r\n\r\n    cl_int enqueueCopyBuffer(\r\n        const Buffer& src,\r\n        const Buffer& dst,\r\n        size_type src_offset,\r\n        size_type dst_offset,\r\n        size_type size,\r\n        const vector<Event>* events = nullptr,\r\n        Event* event = nullptr) const\r\n    {\r\n        cl_event tmp;\r\n        cl_int err = detail::errHandler(\r\n            ::clEnqueueCopyBuffer(\r\n                object_, src(), dst(), src_offset, dst_offset, size,\r\n                (events != nullptr) ? (cl_uint) events->size() : 0,\r\n                (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr,\r\n                (event != nullptr) ? 
&tmp : nullptr),\r\n            __ENQEUE_COPY_BUFFER_ERR);\r\n\r\n        if (event != nullptr && err == CL_SUCCESS)\r\n            *event = tmp;\r\n\r\n        return err;\r\n    }\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 110\r\n    cl_int enqueueReadBufferRect(\r\n        const Buffer& buffer,\r\n        cl_bool blocking,\r\n        const array<size_type, 3>& buffer_offset,\r\n        const array<size_type, 3>& host_offset,\r\n        const array<size_type, 3>& region,\r\n        size_type buffer_row_pitch,\r\n        size_type buffer_slice_pitch,\r\n        size_type host_row_pitch,\r\n        size_type host_slice_pitch,\r\n        void *ptr,\r\n        const vector<Event>* events = nullptr,\r\n        Event* event = nullptr) const\r\n    {\r\n        cl_event tmp;\r\n        cl_int err = detail::errHandler(\r\n            ::clEnqueueReadBufferRect(\r\n                object_, \r\n                buffer(), \r\n                blocking,\r\n                buffer_offset.data(),\r\n                host_offset.data(),\r\n                region.data(),\r\n                buffer_row_pitch,\r\n                buffer_slice_pitch,\r\n                host_row_pitch,\r\n                host_slice_pitch,\r\n                ptr,\r\n                (events != nullptr) ? (cl_uint) events->size() : 0,\r\n                (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr,\r\n                (event != nullptr) ? 
&tmp : nullptr),\r\n                __ENQUEUE_READ_BUFFER_RECT_ERR);\r\n\r\n        if (event != nullptr && err == CL_SUCCESS)\r\n            *event = tmp;\r\n\r\n        return err;\r\n    }\r\n\r\n    cl_int enqueueReadBufferRect(\r\n        const Buffer& buffer,\r\n        cl_bool blocking,\r\n        const array<size_type, 2>& buffer_offset,\r\n        const array<size_type, 2>& host_offset,\r\n        const array<size_type, 2>& region,\r\n        size_type buffer_row_pitch,\r\n        size_type buffer_slice_pitch,\r\n        size_type host_row_pitch,\r\n        size_type host_slice_pitch,\r\n        void* ptr,\r\n        const vector<Event>* events = nullptr,\r\n        Event* event = nullptr) const\r\n    { \r\n        return enqueueReadBufferRect(\r\n            buffer,\r\n            blocking,\r\n            { buffer_offset[0], buffer_offset[1], 0 },\r\n            { host_offset[0], host_offset[1], 0 },\r\n            { region[0], region[1], 1 },\r\n            buffer_row_pitch,\r\n            buffer_slice_pitch,\r\n            host_row_pitch,\r\n            host_slice_pitch,\r\n            ptr,\r\n            events,\r\n            event);\r\n    }\r\n\r\n    cl_int enqueueWriteBufferRect(\r\n        const Buffer& buffer,\r\n        cl_bool blocking,\r\n        const array<size_type, 3>& buffer_offset,\r\n        const array<size_type, 3>& host_offset,\r\n        const array<size_type, 3>& region,\r\n        size_type buffer_row_pitch,\r\n        size_type buffer_slice_pitch,\r\n        size_type host_row_pitch,\r\n        size_type host_slice_pitch,\r\n        const void *ptr,\r\n        const vector<Event>* events = nullptr,\r\n        Event* event = nullptr) const\r\n    {\r\n        cl_event tmp;\r\n        cl_int err = detail::errHandler(\r\n            ::clEnqueueWriteBufferRect(\r\n                object_, \r\n                buffer(), \r\n                blocking,\r\n                buffer_offset.data(),\r\n                host_offset.data(),\r\n   
             region.data(),\r\n                buffer_row_pitch,\r\n                buffer_slice_pitch,\r\n                host_row_pitch,\r\n                host_slice_pitch,\r\n                ptr,\r\n                (events != nullptr) ? (cl_uint) events->size() : 0,\r\n                (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr,\r\n                (event != nullptr) ? &tmp : nullptr),\r\n                __ENQUEUE_WRITE_BUFFER_RECT_ERR);\r\n\r\n        if (event != nullptr && err == CL_SUCCESS)\r\n            *event = tmp;\r\n\r\n        return err;\r\n    }\r\n\r\n    cl_int enqueueWriteBufferRect(\r\n        const Buffer& buffer,\r\n        cl_bool blocking,\r\n        const array<size_type, 2>& buffer_offset,\r\n        const array<size_type, 2>& host_offset,\r\n        const array<size_type, 2>& region,\r\n        size_type buffer_row_pitch,\r\n        size_type buffer_slice_pitch,\r\n        size_type host_row_pitch,\r\n        size_type host_slice_pitch,\r\n        const void* ptr,\r\n        const vector<Event>* events = nullptr,\r\n        Event* event = nullptr) const\r\n    {\r\n        return enqueueWriteBufferRect(\r\n            buffer, \r\n            blocking,\r\n            { buffer_offset[0], buffer_offset[1], 0 },\r\n            { host_offset[0], host_offset[1], 0 },\r\n            { region[0], region[1], 1 },\r\n            buffer_row_pitch,\r\n            buffer_slice_pitch,\r\n            host_row_pitch,\r\n            host_slice_pitch,\r\n            ptr,\r\n            events,\r\n            event);\r\n    }\r\n\r\n    cl_int enqueueCopyBufferRect(\r\n        const Buffer& src,\r\n        const Buffer& dst,\r\n        const array<size_type, 3>& src_origin,\r\n        const array<size_type, 3>& dst_origin,\r\n        const array<size_type, 3>& region,\r\n        size_type src_row_pitch,\r\n        size_type src_slice_pitch,\r\n        size_type dst_row_pitch,\r\n        size_type dst_slice_pitch,\r\n   
     const vector<Event>* events = nullptr,\r\n        Event* event = nullptr) const\r\n    {\r\n        cl_event tmp;\r\n        cl_int err = detail::errHandler(\r\n            ::clEnqueueCopyBufferRect(\r\n                object_, \r\n                src(), \r\n                dst(), \r\n                src_origin.data(),\r\n                dst_origin.data(),\r\n                region.data(),\r\n                src_row_pitch,\r\n                src_slice_pitch,\r\n                dst_row_pitch,\r\n                dst_slice_pitch,\r\n                (events != nullptr) ? (cl_uint) events->size() : 0,\r\n                (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr,\r\n                (event != nullptr) ? &tmp : nullptr),\r\n            __ENQEUE_COPY_BUFFER_RECT_ERR);\r\n\r\n        if (event != nullptr && err == CL_SUCCESS)\r\n            *event = tmp;\r\n\r\n        return err;\r\n    }\r\n\r\n    cl_int enqueueCopyBufferRect(\r\n        const Buffer& src,\r\n        const Buffer& dst,\r\n        const array<size_type, 2>& src_origin,\r\n        const array<size_type, 2>& dst_origin,\r\n        const array<size_type, 2>& region,\r\n        size_type src_row_pitch,\r\n        size_type src_slice_pitch,\r\n        size_type dst_row_pitch,\r\n        size_type dst_slice_pitch,\r\n        const vector<Event>* events = nullptr,\r\n        Event* event = nullptr) const\r\n    {\r\n        return enqueueCopyBufferRect(\r\n            src,\r\n            dst,\r\n            { src_origin[0], src_origin[1], 0 },\r\n            { dst_origin[0], dst_origin[1], 0 },\r\n            { region[0], region[1], 1 },\r\n            src_row_pitch,\r\n            src_slice_pitch,\r\n            dst_row_pitch,\r\n            dst_slice_pitch,\r\n            events,\r\n            event);\r\n    }\r\n\r\n#endif // CL_HPP_TARGET_OPENCL_VERSION >= 110\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 120\r\n    /**\r\n     * Enqueue a command to fill a buffer 
object with a pattern\r\n     * of a given size. The pattern is specified as a vector type.\r\n     * \\tparam PatternType The datatype of the pattern field. \r\n     *     The pattern type must be an accepted OpenCL data type.\r\n     * \\param offset Is the offset in bytes into the buffer at \r\n     *     which to start filling. This must be a multiple of \r\n     *     the pattern size.\r\n     * \\param size Is the size in bytes of the region to fill.\r\n     *     This must be a multiple of the pattern size.\r\n     */\r\n    template<typename PatternType>\r\n    cl_int enqueueFillBuffer(\r\n        const Buffer& buffer,\r\n        PatternType pattern,\r\n        size_type offset,\r\n        size_type size,\r\n        const vector<Event>* events = nullptr,\r\n        Event* event = nullptr) const\r\n    {\r\n        cl_event tmp;\r\n        cl_int err = detail::errHandler(\r\n            ::clEnqueueFillBuffer(\r\n                object_, \r\n                buffer(),\r\n                static_cast<void*>(&pattern),\r\n                sizeof(PatternType), \r\n                offset, \r\n                size,\r\n                (events != nullptr) ? (cl_uint) events->size() : 0,\r\n                (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr,\r\n                (event != nullptr) ? 
&tmp : nullptr),\r\n                __ENQUEUE_FILL_BUFFER_ERR);\r\n\r\n        if (event != nullptr && err == CL_SUCCESS)\r\n            *event = tmp;\r\n\r\n        return err;\r\n    }\r\n#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120\r\n\r\n    cl_int enqueueReadImage(\r\n        const Image& image,\r\n        cl_bool blocking,\r\n        const array<size_type, 3>& origin,\r\n        const array<size_type, 3>& region,\r\n        size_type row_pitch,\r\n        size_type slice_pitch,\r\n        void* ptr,\r\n        const vector<Event>* events = nullptr,\r\n        Event* event = nullptr) const\r\n    {\r\n        cl_event tmp;\r\n        cl_int err = detail::errHandler(\r\n            ::clEnqueueReadImage(\r\n                object_, \r\n                image(), \r\n                blocking, \r\n                origin.data(),\r\n                region.data(), \r\n                row_pitch, \r\n                slice_pitch, \r\n                ptr,\r\n                (events != nullptr) ? (cl_uint) events->size() : 0,\r\n                (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr,\r\n                (event != nullptr) ? 
&tmp : nullptr),\r\n            __ENQUEUE_READ_IMAGE_ERR);\r\n\r\n        if (event != nullptr && err == CL_SUCCESS)\r\n            *event = tmp;\r\n\r\n        return err;\r\n    }\r\n\r\n    cl_int enqueueReadImage(\r\n        const Image& image,\r\n        cl_bool blocking,\r\n        const array<size_type, 2>& origin,\r\n        const array<size_type, 2>& region,\r\n        size_type row_pitch,\r\n        size_type slice_pitch,\r\n        void* ptr,\r\n        const vector<Event>* events = nullptr,\r\n        Event* event = nullptr) const\r\n    {\r\n        return enqueueReadImage(\r\n            image,\r\n            blocking,\r\n            { origin[0], origin[1], 0 },\r\n            { region[0], region[1], 1 },\r\n            row_pitch,\r\n            slice_pitch,\r\n            ptr,\r\n            events,\r\n            event);\r\n    }\r\n\r\n    cl_int enqueueWriteImage(\r\n        const Image& image,\r\n        cl_bool blocking,\r\n        const array<size_type, 3>& origin,\r\n        const array<size_type, 3>& region,\r\n        size_type row_pitch,\r\n        size_type slice_pitch,\r\n        const void* ptr,\r\n        const vector<Event>* events = nullptr,\r\n        Event* event = nullptr) const\r\n    {\r\n        cl_event tmp;\r\n        cl_int err = detail::errHandler(\r\n            ::clEnqueueWriteImage(\r\n                object_, \r\n                image(), \r\n                blocking, \r\n                origin.data(),\r\n                region.data(), \r\n                row_pitch, \r\n                slice_pitch, \r\n                ptr,\r\n                (events != nullptr) ? (cl_uint) events->size() : 0,\r\n                (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr,\r\n                (event != nullptr) ? 
&tmp : nullptr),\r\n            __ENQUEUE_WRITE_IMAGE_ERR);\r\n\r\n        if (event != nullptr && err == CL_SUCCESS)\r\n            *event = tmp;\r\n\r\n        return err;\r\n    }\r\n\r\n    cl_int enqueueWriteImage(\r\n        const Image& image,\r\n        cl_bool blocking,\r\n        const array<size_type, 2>& origin,\r\n        const array<size_type, 2>& region,\r\n        size_type row_pitch,\r\n        size_type slice_pitch,\r\n        const void* ptr,\r\n        const vector<Event>* events = nullptr,\r\n        Event* event = nullptr) const\r\n    {\r\n        return enqueueWriteImage(\r\n            image,\r\n            blocking,\r\n            { origin[0], origin[1], 0 },\r\n            { region[0], region[1], 1 },\r\n            row_pitch,\r\n            slice_pitch,\r\n            ptr,\r\n            events,\r\n            event);\r\n    }\r\n\r\n    cl_int enqueueCopyImage(\r\n        const Image& src,\r\n        const Image& dst,\r\n        const array<size_type, 3>& src_origin,\r\n        const array<size_type, 3>& dst_origin,\r\n        const array<size_type, 3>& region,\r\n        const vector<Event>* events = nullptr,\r\n        Event* event = nullptr) const\r\n    {\r\n        cl_event tmp;\r\n        cl_int err = detail::errHandler(\r\n            ::clEnqueueCopyImage(\r\n                object_, \r\n                src(), \r\n                dst(), \r\n                src_origin.data(),\r\n                dst_origin.data(), \r\n                region.data(),\r\n                (events != nullptr) ? (cl_uint) events->size() : 0,\r\n                (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr,\r\n                (event != nullptr) ? 
&tmp : nullptr),\r\n            __ENQUEUE_COPY_IMAGE_ERR);\r\n\r\n        if (event != nullptr && err == CL_SUCCESS)\r\n            *event = tmp;\r\n\r\n        return err;\r\n    }\r\n\r\n    cl_int enqueueCopyImage(\r\n        const Image& src,\r\n        const Image& dst,\r\n        const array<size_type, 2>& src_origin,\r\n        const array<size_type, 2>& dst_origin,\r\n        const array<size_type, 2>& region,\r\n        const vector<Event>* events = nullptr,\r\n        Event* event = nullptr) const\r\n    {\r\n        return enqueueCopyImage(\r\n            src,\r\n            dst,\r\n            { src_origin[0], src_origin[1], 0 },\r\n            { dst_origin[0], dst_origin[1], 0 },\r\n            { region[0], region[1], 1 },\r\n            events,\r\n            event);\r\n    }\r\n\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 120\r\n    /**\r\n     * Enqueue a command to fill an image object with a specified color.\r\n     * \\param fillColor is the color to use to fill the image.\r\n     *     This is a four component RGBA floating-point (cl_float4), signed\r\n     *     integer (cl_int4) or unsigned integer (cl_uint4) color value. Use the\r\n     *     signed or unsigned integer form when the image channel data type is an\r\n     *     unnormalized signed or unsigned integer type respectively, and the\r\n     *     floating-point form otherwise.   
\r\n     */\r\n    template <typename T>\r\n    typename std::enable_if<std::is_same<T, cl_float4>::value ||\r\n                            std::is_same<T, cl_int4  >::value ||\r\n                            std::is_same<T, cl_uint4 >::value,\r\n                            cl_int>::type \r\n     enqueueFillImage(\r\n         const Image& image, \r\n         T fillColor,\r\n         const array<size_type, 3>& origin,\r\n         const array<size_type, 3>& region,\r\n         const vector<Event>* events = nullptr,\r\n         Event* event = nullptr) const\r\n    {\r\n        cl_event tmp;\r\n        cl_int err = detail::errHandler(\r\n            ::clEnqueueFillImage(\r\n                object_,\r\n                image(),\r\n                static_cast<void*>(&fillColor),\r\n                origin.data(),\r\n                region.data(),\r\n                (events != nullptr) ? (cl_uint)events->size() : 0,\r\n                (events != nullptr && events->size() > 0) ? (cl_event*)&events->front() : NULL,\r\n                (event != NULL) ? 
&tmp : nullptr),\r\n            __ENQUEUE_FILL_IMAGE_ERR);\r\n\r\n        if (event != nullptr && err == CL_SUCCESS) *event = tmp;\r\n\r\n        return err;\r\n    }\r\n\r\n    /**\r\n     * Enqueue a command to fill a 2D region of an image object with a specified color.\r\n     * Forwards to the 3D overload with a zero third origin component and a depth of 1.\r\n     * \\param fillColor is the color to use to fill the image.\r\n     *     This is a four component RGBA floating-point (cl_float4), signed\r\n     *     integer (cl_int4) or unsigned integer (cl_uint4) color value. Use the\r\n     *     signed or unsigned integer form when the image channel data type is an\r\n     *     unnormalized signed or unsigned integer type respectively, and the\r\n     *     floating-point form otherwise.\r\n     */\r\n    template <typename T>\r\n    typename std::enable_if<std::is_same<T, cl_float4>::value ||\r\n                            std::is_same<T, cl_int4  >::value ||\r\n                            std::is_same<T, cl_uint4 >::value, cl_int>::type\r\n    enqueueFillImage(\r\n        const Image& image,\r\n        T fillColor,\r\n        const array<size_type, 2>& origin,\r\n        const array<size_type, 2>& region,\r\n        const vector<Event>* events = nullptr,\r\n        Event* event = nullptr) const\r\n    {\r\n        return enqueueFillImage(\r\n            image,\r\n            fillColor,\r\n            { origin[0], origin[1], 0 },\r\n            { region[0], region[1], 1 },\r\n            events,\r\n            event\r\n            );\r\n    }\r\n#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120\r\n\r\n    cl_int enqueueCopyImageToBuffer(\r\n        const Image& src,\r\n        const Buffer& dst,\r\n        const array<size_type, 3>& src_origin,\r\n        const array<size_type, 3>& region,\r\n        size_type dst_offset,\r\n        const vector<Event>* events = nullptr,\r\n        Event* event = nullptr) const\r\n    {\r\n        cl_event tmp;\r\n        cl_int err = detail::errHandler(\r\n            ::clEnqueueCopyImageToBuffer(\r\n                object_, \r\n                src(), \r\n                dst(), \r\n                src_origin.data(),\r\n                region.data(), \r\n                dst_offset,\r\n             
   (events != nullptr) ? (cl_uint) events->size() : 0,\r\n                (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr,\r\n                (event != nullptr) ? &tmp : nullptr),\r\n            __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR);\r\n\r\n        if (event != nullptr && err == CL_SUCCESS)\r\n            *event = tmp;\r\n\r\n        return err;\r\n    }\r\n\r\n    cl_int enqueueCopyImageToBuffer(\r\n        const Image& src,\r\n        const Buffer& dst,\r\n        const array<size_type, 2>& src_origin,\r\n        const array<size_type, 2>& region,\r\n        size_type dst_offset,\r\n        const vector<Event>* events = nullptr,\r\n        Event* event = nullptr) const\r\n    { \r\n        return enqueueCopyImageToBuffer(\r\n            src,\r\n            dst,\r\n            { src_origin[0], src_origin[1], 0 },\r\n            { region[0], region[1], 1 },\r\n            dst_offset,\r\n            events,\r\n            event);\r\n    }\r\n\r\n    cl_int enqueueCopyBufferToImage(\r\n        const Buffer& src,\r\n        const Image& dst,\r\n        size_type src_offset,\r\n        const array<size_type, 3>& dst_origin,\r\n        const array<size_type, 3>& region,\r\n        const vector<Event>* events = nullptr,\r\n        Event* event = nullptr) const\r\n    {\r\n        cl_event tmp;\r\n        cl_int err = detail::errHandler(\r\n            ::clEnqueueCopyBufferToImage(\r\n                object_, \r\n                src(), \r\n                dst(), \r\n                src_offset,\r\n                dst_origin.data(), \r\n                region.data(),\r\n                (events != nullptr) ? (cl_uint) events->size() : 0,\r\n                (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr,\r\n                (event != nullptr) ? 
&tmp : nullptr),\r\n            __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR);\r\n\r\n        if (event != nullptr && err == CL_SUCCESS)\r\n            *event = tmp;\r\n\r\n        return err;\r\n    }\r\n\r\n    cl_int enqueueCopyBufferToImage(\r\n        const Buffer& src,\r\n        const Image& dst,\r\n        size_type src_offset,\r\n        const array<size_type, 2>& dst_origin,\r\n        const array<size_type, 2>& region,\r\n        const vector<Event>* events = nullptr,\r\n        Event* event = nullptr) const\r\n    {\r\n        return enqueueCopyBufferToImage(\r\n            src,\r\n            dst, \r\n            src_offset,\r\n            { dst_origin[0], dst_origin[1], 0 },\r\n            { region[0], region[1], 1 },\r\n            events,\r\n            event);\r\n    }\r\n\r\n    void* enqueueMapBuffer(\r\n        const Buffer& buffer,\r\n        cl_bool blocking,\r\n        cl_map_flags flags,\r\n        size_type offset,\r\n        size_type size,\r\n        const vector<Event>* events = nullptr,\r\n        Event* event = nullptr,\r\n        cl_int* err = nullptr) const\r\n    {\r\n        cl_event tmp;\r\n        cl_int error;\r\n        void * result = ::clEnqueueMapBuffer(\r\n            object_, buffer(), blocking, flags, offset, size,\r\n            (events != nullptr) ? (cl_uint) events->size() : 0,\r\n            (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr,\r\n            (event != nullptr) ? 
&tmp : nullptr,\r\n            &error);\r\n\r\n        detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR);\r\n        if (err != nullptr) {\r\n            *err = error;\r\n        }\r\n        if (event != nullptr && error == CL_SUCCESS)\r\n            *event = tmp;\r\n\r\n        return result;\r\n    }\r\n\r\n    void* enqueueMapImage(\r\n        const Image& image,\r\n        cl_bool blocking,\r\n        cl_map_flags flags,\r\n        const array<size_type, 3>& origin,\r\n        const array<size_type, 3>& region,\r\n        size_type * row_pitch,\r\n        size_type * slice_pitch,\r\n        const vector<Event>* events = nullptr,\r\n        Event* event = nullptr,\r\n        cl_int* err = nullptr) const\r\n    {\r\n        cl_event tmp;\r\n        cl_int error;\r\n        void * result = ::clEnqueueMapImage(\r\n            object_, image(), blocking, flags,\r\n            origin.data(), \r\n            region.data(),\r\n            row_pitch, slice_pitch,\r\n            (events != nullptr) ? (cl_uint) events->size() : 0,\r\n            (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr,\r\n            (event != nullptr) ? 
&tmp : nullptr,\r\n            &error);\r\n\r\n        detail::errHandler(error, __ENQUEUE_MAP_IMAGE_ERR);\r\n        if (err != nullptr) {\r\n              *err = error;\r\n        }\r\n        if (event != nullptr && error == CL_SUCCESS)\r\n            *event = tmp;\r\n        return result;\r\n    }\r\n\r\n    void* enqueueMapImage(\r\n         const Image& image,\r\n         cl_bool blocking,\r\n         cl_map_flags flags,\r\n         const array<size_type, 2>& origin,\r\n         const array<size_type, 2>& region,\r\n         size_type* row_pitch,\r\n         size_type* slice_pitch,\r\n         const vector<Event>* events = nullptr,\r\n         Event* event = nullptr,\r\n         cl_int* err = nullptr) const\r\n    {\r\n        return enqueueMapImage(image, blocking, flags,\r\n                               { origin[0], origin[1], 0 },\r\n                               { region[0], region[1], 1 }, row_pitch,\r\n                               slice_pitch, events, event, err);\r\n    }\r\n\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 200\r\n\r\n    /**\r\n    * Enqueues a command that copies a region of memory from the source pointer to the destination pointer.\r\n    * This function is specifically for transferring data between the host and a coarse-grained SVM buffer.\r\n    */\r\n    template<typename T>\r\n    cl_int enqueueMemcpySVM(\r\n            T *dst_ptr,\r\n            const T *src_ptr,\r\n            cl_bool blocking,\r\n            size_type size,\r\n            const vector<Event> *events = nullptr,\r\n            Event *event = nullptr) const {\r\n        cl_event tmp;\r\n        cl_int err = detail::errHandler(::clEnqueueSVMMemcpy(\r\n                object_, blocking, static_cast<void *>(dst_ptr), static_cast<const void *>(src_ptr), size,\r\n                (events != nullptr) ? (cl_uint) events->size() : 0,\r\n                (events != nullptr && events->size() > 0) ? (cl_event *) &events->front() : nullptr,\r\n                (event != nullptr) ? 
&tmp : nullptr), __ENQUEUE_COPY_SVM_ERR);\r\n\r\n        if (event != nullptr && err == CL_SUCCESS)\r\n            *event = tmp;\r\n\r\n        return err;\r\n    }\r\n\r\n    /**\r\n    *Enqueues a command that will copy data from one coarse-grained SVM buffer to another.\r\n    *This function takes two cl::pointer instances representing the destination and source buffers.\r\n    */\r\n    template<typename T, class D>\r\n    cl_int enqueueMemcpySVM(\r\n            cl::pointer<T, D> &dst_ptr,\r\n            const cl::pointer<T, D> &src_ptr,\r\n            cl_bool blocking,\r\n            size_type size,\r\n            const vector<Event> *events = nullptr,\r\n            Event *event = nullptr) const {\r\n        cl_event tmp;\r\n        cl_int err = detail::errHandler(::clEnqueueSVMMemcpy(\r\n                object_, blocking, static_cast<void *>(dst_ptr.get()), static_cast<const void *>(src_ptr.get()),\r\n                size,\r\n                (events != nullptr) ? (cl_uint) events->size() : 0,\r\n                (events != nullptr && events->size() > 0) ? (cl_event *) &events->front() : nullptr,\r\n                (event != nullptr) ? 
&tmp : nullptr), __ENQUEUE_COPY_SVM_ERR);\r\n\r\n        if (event != nullptr && err == CL_SUCCESS)\r\n            *event = tmp;\r\n\r\n        return err;\r\n    }\r\n\r\n    /**\r\n    * Enqueues a command that will copy data from one SVM-backed container to another.\r\n    * This variant takes cl::vector instances; both containers must hold the same\r\n    * number of elements, otherwise CL_INVALID_VALUE is reported.\r\n    */\r\n    template<typename T, class Alloc>\r\n    cl_int enqueueMemcpySVM(\r\n            cl::vector<T, Alloc> &dst_container,\r\n            const cl::vector<T, Alloc> &src_container,\r\n            cl_bool blocking,\r\n            const vector<Event> *events = nullptr,\r\n            Event *event = nullptr) const {\r\n        cl_event tmp;\r\n        if(src_container.size() != dst_container.size()){\r\n            return detail::errHandler(CL_INVALID_VALUE,__ENQUEUE_COPY_SVM_ERR);\r\n        }\r\n        cl_int err = detail::errHandler(::clEnqueueSVMMemcpy(\r\n                object_, blocking, static_cast<void *>(dst_container.data()),\r\n                static_cast<const void *>(src_container.data()),\r\n                dst_container.size() * sizeof(T),\r\n                (events != nullptr) ? (cl_uint) events->size() : 0,\r\n                (events != nullptr && events->size() > 0) ? (cl_event *) &events->front() : nullptr,\r\n                (event != NULL) ? 
&tmp : nullptr), __ENQUEUE_COPY_SVM_ERR);\r\n\r\n        if (event != nullptr && err == CL_SUCCESS)\r\n            *event = tmp;\r\n\r\n        return err;\r\n    }\r\n\r\n    /**\r\n    * Enqueues a command to fill a SVM buffer with a pattern.\r\n    *\r\n    */\r\n    template<typename T, typename PatternType>\r\n    cl_int enqueueMemFillSVM(\r\n            T *ptr,\r\n            PatternType pattern,\r\n            size_type size,\r\n            const vector<Event> *events = nullptr,\r\n            Event *event = nullptr) const {\r\n        cl_event tmp;\r\n        cl_int err = detail::errHandler(::clEnqueueSVMMemFill(\r\n                object_, static_cast<void *>(ptr), static_cast<void *>(&pattern),\r\n                sizeof(PatternType), size,\r\n                (events != nullptr) ? (cl_uint) events->size() : 0,\r\n                (events != nullptr && events->size() > 0) ? (cl_event *) &events->front() : nullptr,\r\n                (event != nullptr) ? &tmp : nullptr), __ENQUEUE_FILL_SVM_ERR);\r\n\r\n        if (event != nullptr && err == CL_SUCCESS)\r\n            *event = tmp;\r\n\r\n        return err;\r\n    }\r\n\r\n    /**\r\n    * Enqueues a command that fills a region of a coarse-grained SVM buffer with a specified pattern.\r\n    * This variant takes a cl::pointer instance.\r\n    */\r\n    template<typename T, class D, typename PatternType>\r\n    cl_int enqueueMemFillSVM(\r\n            cl::pointer<T, D> &ptr,\r\n            PatternType pattern,\r\n            size_type size,\r\n            const vector<Event> *events = nullptr,\r\n            Event *event = nullptr) const {\r\n        cl_event tmp;\r\n        cl_int err = detail::errHandler(::clEnqueueSVMMemFill(\r\n                object_, static_cast<void *>(ptr.get()), static_cast<void *>(&pattern),\r\n                sizeof(PatternType), size,\r\n                (events != nullptr) ? (cl_uint) events->size() : 0,\r\n                (events != nullptr && events->size() > 0) ? 
(cl_event *) &events->front() : nullptr,\r\n                (event != nullptr) ? &tmp : nullptr), __ENQUEUE_FILL_SVM_ERR);\r\n\r\n        if (event != nullptr && err == CL_SUCCESS)\r\n            *event = tmp;\r\n\r\n        return err;\r\n    }\r\n\r\n    /**\r\n    * Enqueues a command that will allow the host to fill a region of a coarse-grained SVM buffer with a specified pattern.\r\n    * This variant takes a cl::vector instance.\r\n    */\r\n    template<typename T, class Alloc, typename PatternType>\r\n    cl_int enqueueMemFillSVM(\r\n            cl::vector<T, Alloc> &container,\r\n            PatternType pattern,\r\n            const vector<Event> *events = nullptr,\r\n            Event* event = nullptr) const\r\n    {\r\n        cl_event tmp;\r\n        cl_int err = detail::errHandler(::clEnqueueSVMMemFill(\r\n                object_, static_cast<void *>(container.data()), static_cast<void *>(&pattern),\r\n                sizeof(PatternType), container.size() * sizeof(T),\r\n                (events != nullptr) ? (cl_uint) events->size() : 0,\r\n                (events != nullptr && events->size() > 0) ? (cl_event *) &events->front() : nullptr,\r\n                (event != nullptr) ? 
&tmp : NULL), __ENQUEUE_FILL_SVM_ERR);\r\n\r\n        if (event != nullptr && err == CL_SUCCESS)\r\n            *event = tmp;\r\n\r\n        return err;\r\n    }\r\n\r\n    /**\r\n     * Enqueues a command that will allow the host to update a region of a coarse-grained SVM buffer.\r\n     * This variant takes a raw SVM pointer.\r\n     */\r\n    template<typename T>\r\n    cl_int enqueueMapSVM(\r\n        T* ptr,\r\n        cl_bool blocking,\r\n        cl_map_flags flags,\r\n        size_type size,\r\n        const vector<Event>* events = nullptr,\r\n        Event* event = nullptr) const\r\n    {\r\n        cl_event tmp;\r\n        cl_int err = detail::errHandler(::clEnqueueSVMMap(\r\n            object_, blocking, flags, static_cast<void*>(ptr), size,\r\n            (events != nullptr) ? (cl_uint)events->size() : 0,\r\n            (events != nullptr && events->size() > 0) ? (cl_event*)&events->front() : nullptr,\r\n            (event != nullptr) ? &tmp : nullptr),\r\n            __ENQUEUE_MAP_SVM_ERR);\r\n\r\n        if (event != nullptr && err == CL_SUCCESS)\r\n            *event = tmp;\r\n\r\n        return err;\r\n    }\r\n\r\n\r\n    /**\r\n     * Enqueues a command that will allow the host to update a region of a coarse-grained SVM buffer.\r\n     * This variant takes a cl::pointer instance.\r\n     */\r\n    template<typename T, class D>\r\n    cl_int enqueueMapSVM(\r\n        cl::pointer<T, D> &ptr,\r\n        cl_bool blocking,\r\n        cl_map_flags flags,\r\n        size_type size,\r\n        const vector<Event>* events = nullptr,\r\n        Event* event = nullptr) const\r\n    {\r\n        cl_event tmp;\r\n        cl_int err = detail::errHandler(::clEnqueueSVMMap(\r\n            object_, blocking, flags, static_cast<void*>(ptr.get()), size,\r\n            (events != nullptr) ? (cl_uint)events->size() : 0,\r\n            (events != nullptr && events->size() > 0) ? (cl_event*)&events->front() : nullptr,\r\n            (event != nullptr) ? 
&tmp : nullptr),\r\n            __ENQUEUE_MAP_SVM_ERR);\r\n\r\n        if (event != nullptr && err == CL_SUCCESS)\r\n            *event = tmp;\r\n\r\n        return err;\r\n    }\r\n\r\n    /**\r\n     * Enqueues a command that will allow the host to update a region of a coarse-grained SVM buffer.\r\n     * This variant takes a cl::vector instance.\r\n     */\r\n    template<typename T, class Alloc>\r\n    cl_int enqueueMapSVM(\r\n        cl::vector<T, Alloc> &container,\r\n        cl_bool blocking,\r\n        cl_map_flags flags,\r\n        const vector<Event>* events = nullptr,\r\n        Event* event = nullptr) const\r\n    {\r\n        cl_event tmp;\r\n        cl_int err = detail::errHandler(::clEnqueueSVMMap(\r\n            object_, blocking, flags, static_cast<void*>(container.data()), container.size()*sizeof(T),\r\n            (events != nullptr) ? (cl_uint)events->size() : 0,\r\n            (events != nullptr && events->size() > 0) ? (cl_event*)&events->front() : nullptr,\r\n            (event != nullptr) ? &tmp : nullptr),\r\n            __ENQUEUE_MAP_SVM_ERR);\r\n\r\n        if (event != nullptr && err == CL_SUCCESS)\r\n            *event = tmp;\r\n\r\n        return err;\r\n    }\r\n#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200\r\n\r\n    cl_int enqueueUnmapMemObject(\r\n        const Memory& memory,\r\n        void* mapped_ptr,\r\n        const vector<Event>* events = nullptr,\r\n        Event* event = nullptr) const\r\n    {\r\n        cl_event tmp;\r\n        cl_int err = detail::errHandler(\r\n            ::clEnqueueUnmapMemObject(\r\n                object_, memory(), mapped_ptr,\r\n                (events != nullptr) ? (cl_uint) events->size() : 0,\r\n                (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr,\r\n                (event != nullptr) ? 
&tmp : nullptr),\r\n            __ENQUEUE_UNMAP_MEM_OBJECT_ERR);\r\n\r\n        if (event != nullptr && err == CL_SUCCESS)\r\n            *event = tmp;\r\n\r\n        return err;\r\n    }\r\n\r\n\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 200\r\n    /**\r\n     * Enqueues a command that will release a coarse-grained SVM buffer back to the OpenCL runtime.\r\n     * This variant takes a raw SVM pointer.\r\n     */\r\n    template<typename T>\r\n    cl_int enqueueUnmapSVM(\r\n        T* ptr,\r\n        const vector<Event>* events = nullptr,\r\n        Event* event = nullptr) const\r\n    {\r\n        cl_event tmp;\r\n        cl_int err = detail::errHandler(\r\n            ::clEnqueueSVMUnmap(\r\n            object_, static_cast<void*>(ptr),\r\n            (events != nullptr) ? (cl_uint)events->size() : 0,\r\n            (events != nullptr && events->size() > 0) ? (cl_event*)&events->front() : nullptr,\r\n            (event != nullptr) ? &tmp : nullptr),\r\n            __ENQUEUE_UNMAP_SVM_ERR);\r\n\r\n        if (event != nullptr && err == CL_SUCCESS)\r\n            *event = tmp;\r\n\r\n        return err;\r\n    }\r\n\r\n    /**\r\n     * Enqueues a command that will release a coarse-grained SVM buffer back to the OpenCL runtime.\r\n     * This variant takes a cl::pointer instance.\r\n     */\r\n    template<typename T, class D>\r\n    cl_int enqueueUnmapSVM(\r\n        cl::pointer<T, D> &ptr,\r\n        const vector<Event>* events = nullptr,\r\n        Event* event = nullptr) const\r\n    {\r\n        cl_event tmp;\r\n        cl_int err = detail::errHandler(\r\n            ::clEnqueueSVMUnmap(\r\n            object_, static_cast<void*>(ptr.get()),\r\n            (events != nullptr) ? (cl_uint)events->size() : 0,\r\n            (events != nullptr && events->size() > 0) ? (cl_event*)&events->front() : nullptr,\r\n            (event != nullptr) ? 
&tmp : nullptr),\r\n            __ENQUEUE_UNMAP_SVM_ERR);\r\n\r\n        if (event != nullptr && err == CL_SUCCESS)\r\n            *event = tmp;\r\n\r\n        return err;\r\n    }\r\n\r\n    /**\r\n     * Enqueues a command that will release a coarse-grained SVM buffer back to the OpenCL runtime.\r\n     * This variant takes a cl::vector instance.\r\n     */\r\n    template<typename T, class Alloc>\r\n    cl_int enqueueUnmapSVM(\r\n        cl::vector<T, Alloc> &container,\r\n        const vector<Event>* events = nullptr,\r\n        Event* event = nullptr) const\r\n    {\r\n        cl_event tmp;\r\n        cl_int err = detail::errHandler(\r\n            ::clEnqueueSVMUnmap(\r\n            object_, static_cast<void*>(container.data()),\r\n            (events != nullptr) ? (cl_uint)events->size() : 0,\r\n            (events != nullptr && events->size() > 0) ? (cl_event*)&events->front() : nullptr,\r\n            (event != nullptr) ? &tmp : nullptr),\r\n            __ENQUEUE_UNMAP_SVM_ERR);\r\n\r\n        if (event != nullptr && err == CL_SUCCESS)\r\n            *event = tmp;\r\n\r\n        return err;\r\n    }\r\n#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200\r\n\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 120\r\n    /**\r\n     * Enqueues a marker command which waits for either a list of events to complete, \r\n     * or all previously enqueued commands to complete.\r\n     *\r\n     * Enqueues a marker command which waits for either a list of events to complete, \r\n     * or if the list is empty it waits for all commands previously enqueued in command_queue \r\n     * to complete before it completes. This command returns an event which can be waited on, \r\n     * i.e. 
this event can be waited on to ensure that all events either in the event_wait_list \r\n     * or all previously enqueued commands, queued before this command to command_queue, \r\n     * have completed.\r\n     */\r\n    cl_int enqueueMarkerWithWaitList(\r\n        const vector<Event> *events = nullptr,\r\n        Event *event = nullptr) const\r\n    {\r\n        cl_event tmp;\r\n        cl_int err = detail::errHandler(\r\n            ::clEnqueueMarkerWithWaitList(\r\n                object_,\r\n                (events != nullptr) ? (cl_uint) events->size() : 0,\r\n                (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr,\r\n                (event != nullptr) ? &tmp : nullptr),\r\n            __ENQUEUE_MARKER_WAIT_LIST_ERR);\r\n\r\n        if (event != nullptr && err == CL_SUCCESS)\r\n            *event = tmp;\r\n\r\n        return err;\r\n    }\r\n\r\n    /**\r\n     * A synchronization point that enqueues a barrier operation.\r\n     *\r\n     * Enqueues a barrier command which waits for either a list of events to complete, \r\n     * or if the list is empty it waits for all commands previously enqueued in command_queue \r\n     * to complete before it completes. This command blocks command execution, that is, any \r\n     * following commands enqueued after it do not execute until it completes. This command \r\n     * returns an event which can be waited on, i.e. this event can be waited on to ensure that \r\n     * all events either in the event_wait_list or all previously enqueued commands, queued \r\n     * before this command to command_queue, have completed.\r\n     */\r\n    cl_int enqueueBarrierWithWaitList(\r\n        const vector<Event> *events = nullptr,\r\n        Event *event = nullptr) const\r\n    {\r\n        cl_event tmp;\r\n        cl_int err = detail::errHandler(\r\n            ::clEnqueueBarrierWithWaitList(\r\n                object_,\r\n                (events != nullptr) ? 
(cl_uint) events->size() : 0,\r\n                (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr,\r\n                (event != nullptr) ? &tmp : nullptr),\r\n            __ENQUEUE_BARRIER_WAIT_LIST_ERR);\r\n\r\n        if (event != nullptr && err == CL_SUCCESS)\r\n            *event = tmp;\r\n\r\n        return err;\r\n    }\r\n    \r\n    /**\r\n     * Enqueues a command to indicate with which device a set of memory objects\r\n     * should be associated.\r\n     */\r\n    cl_int enqueueMigrateMemObjects(\r\n        const vector<Memory> &memObjects,\r\n        cl_mem_migration_flags flags,\r\n        const vector<Event>* events = nullptr,\r\n        Event* event = nullptr\r\n        ) const\r\n    {\r\n        cl_event tmp;\r\n        \r\n        vector<cl_mem> localMemObjects(memObjects.size());\r\n\r\n        for( int i = 0; i < (int)memObjects.size(); ++i ) {\r\n            localMemObjects[i] = memObjects[i]();\r\n        }\r\n        \r\n        cl_int err = detail::errHandler(\r\n            ::clEnqueueMigrateMemObjects(\r\n                object_, \r\n                (cl_uint)memObjects.size(), \r\n                localMemObjects.data(),\r\n                flags,\r\n                (events != nullptr) ? (cl_uint) events->size() : 0,\r\n                (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr,\r\n                (event != nullptr) ? 
&tmp : nullptr),\r\n            __ENQUEUE_UNMAP_MEM_OBJECT_ERR);\r\n\r\n        if (event != nullptr && err == CL_SUCCESS)\r\n            *event = tmp;\r\n\r\n        return err;\r\n    }\r\n#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120\r\n\r\n\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 210\r\n    /**\r\n     * Enqueues a command that will allow the host to associate ranges within a set of\r\n     * SVM allocations with a device.\r\n     * @param sizes - The length from each pointer to migrate.\r\n     */\r\n    template<typename T>\r\n    cl_int enqueueMigrateSVM(\r\n        const cl::vector<T*> &svmRawPointers,\r\n        const cl::vector<size_type> &sizes,\r\n        cl_mem_migration_flags flags = 0,\r\n        const vector<Event>* events = nullptr,\r\n        Event* event = nullptr) const\r\n    {\r\n        cl_event tmp;\r\n        cl_int err = detail::errHandler(::clEnqueueSVMMigrateMem(\r\n            object_,\r\n            svmRawPointers.size(), static_cast<void**>(svmRawPointers.data()),\r\n            sizes.data(), // array of sizes, one per pointer\r\n            flags,\r\n            (events != nullptr) ? (cl_uint)events->size() : 0,\r\n            (events != nullptr && events->size() > 0) ? (cl_event*)&events->front() : nullptr,\r\n            (event != nullptr) ? 
&tmp : nullptr),\r\n            __ENQUEUE_MIGRATE_SVM_ERR);\r\n\r\n        if (event != nullptr && err == CL_SUCCESS)\r\n            *event = tmp;\r\n\r\n        return err;\r\n    }\r\n\r\n    /**\r\n     * Enqueues a command that will allow the host to associate a set of SVM allocations with\r\n     * a device.\r\n     */\r\n    template<typename T>\r\n    cl_int enqueueMigrateSVM(\r\n        const cl::vector<T*> &svmRawPointers,\r\n        cl_mem_migration_flags flags = 0,\r\n        const vector<Event>* events = nullptr,\r\n        Event* event = nullptr) const\r\n    {\r\n        return enqueueMigrateSVM(svmRawPointers, cl::vector<size_type>(svmRawPointers.size()), flags, events, event);\r\n    }\r\n\r\n\r\n    /**\r\n     * Enqueues a command that will allow the host to associate ranges within a set of\r\n     * SVM allocations with a device.\r\n     * @param sizes - The length from each pointer to migrate.\r\n     */\r\n    template<typename T, class D>\r\n    cl_int enqueueMigrateSVM(\r\n        const cl::vector<cl::pointer<T, D>> &svmPointers,\r\n        const cl::vector<size_type> &sizes,\r\n        cl_mem_migration_flags flags = 0,\r\n        const vector<Event>* events = nullptr,\r\n        Event* event = nullptr) const\r\n    {\r\n        cl::vector<void*> svmRawPointers;\r\n        svmRawPointers.reserve(svmPointers.size());\r\n        for (auto p : svmPointers) {\r\n            svmRawPointers.push_back(static_cast<void*>(p.get()));\r\n        }\r\n\r\n        return enqueueMigrateSVM(svmRawPointers, sizes, flags, events, event);\r\n    }\r\n\r\n\r\n    /**\r\n     * Enqueues a command that will allow the host to associate a set of SVM allocations with\r\n     * a device.\r\n     */\r\n    template<typename T, class D>\r\n    cl_int enqueueMigrateSVM(\r\n        const cl::vector<cl::pointer<T, D>> &svmPointers,\r\n        cl_mem_migration_flags flags = 0,\r\n        const vector<Event>* events = nullptr,\r\n        Event* event = nullptr) const\r\n    {\r\n    
    return enqueueMigrateSVM(svmPointers, cl::vector<size_type>(svmPointers.size()), flags, events, event);\r\n    }\r\n\r\n    /**\r\n     * Enqueues a command that will allow the host to associate ranges within a set of\r\n     * SVM allocations with a device.\r\n     * @param sizes - The length from the beginning of each container to migrate.\r\n     */\r\n    template<typename T, class Alloc>\r\n    cl_int enqueueMigrateSVM(\r\n        const cl::vector<cl::vector<T, Alloc>> &svmContainers,\r\n        const cl::vector<size_type> &sizes,\r\n        cl_mem_migration_flags flags = 0,\r\n        const vector<Event>* events = nullptr,\r\n        Event* event = nullptr) const\r\n    {\r\n        cl::vector<void*> svmRawPointers;\r\n        svmRawPointers.reserve(svmContainers.size());\r\n        for (auto p : svmContainers) {\r\n            svmRawPointers.push_back(static_cast<void*>(p.data()));\r\n        }\r\n\r\n        return enqueueMigrateSVM(svmRawPointers, sizes, flags, events, event);\r\n    }\r\n\r\n    /**\r\n     * Enqueues a command that will allow the host to associate a set of SVM allocations with\r\n     * a device.\r\n     */\r\n    template<typename T, class Alloc>\r\n    cl_int enqueueMigrateSVM(\r\n        const cl::vector<cl::vector<T, Alloc>> &svmContainers,\r\n        cl_mem_migration_flags flags = 0,\r\n        const vector<Event>* events = nullptr,\r\n        Event* event = nullptr) const\r\n    {\r\n        return enqueueMigrateSVM(svmContainers, cl::vector<size_type>(svmContainers.size()), flags, events, event);\r\n    }\r\n\r\n#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 210\r\n    \r\n    cl_int enqueueNDRangeKernel(\r\n        const Kernel& kernel,\r\n        const NDRange& offset,\r\n        const NDRange& global,\r\n        const NDRange& local = NullRange,\r\n        const vector<Event>* events = nullptr,\r\n        Event* event = nullptr) const\r\n    {\r\n        cl_event tmp;\r\n        cl_int err = detail::errHandler(\r\n            
::clEnqueueNDRangeKernel(\r\n                object_, kernel(), (cl_uint) global.dimensions(),\r\n                offset.dimensions() != 0 ? (const size_type*) offset : nullptr,\r\n                (const size_type*) global,\r\n                local.dimensions() != 0 ? (const size_type*) local : nullptr,\r\n                (events != nullptr) ? (cl_uint) events->size() : 0,\r\n                (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr,\r\n                (event != nullptr) ? &tmp : nullptr),\r\n            __ENQUEUE_NDRANGE_KERNEL_ERR);\r\n\r\n        if (event != nullptr && err == CL_SUCCESS)\r\n            *event = tmp;\r\n\r\n        return err;\r\n    }\r\n\r\n#if defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS)\r\n    CL_API_PREFIX__VERSION_1_2_DEPRECATED cl_int enqueueTask(\r\n        const Kernel& kernel,\r\n        const vector<Event>* events = nullptr,\r\n        Event* event = nullptr) const CL_API_SUFFIX__VERSION_1_2_DEPRECATED\r\n    {\r\n        cl_event tmp;\r\n        cl_int err = detail::errHandler(\r\n            ::clEnqueueTask(\r\n                object_, kernel(),\r\n                (events != nullptr) ? (cl_uint) events->size() : 0,\r\n                (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr,\r\n                (event != nullptr) ? 
&tmp : nullptr),\r\n            __ENQUEUE_TASK_ERR);\r\n\r\n        if (event != nullptr && err == CL_SUCCESS)\r\n            *event = tmp;\r\n\r\n        return err;\r\n    }\r\n#endif // #if defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS)\r\n\r\n    cl_int enqueueNativeKernel(\r\n        void (CL_CALLBACK *userFptr)(void *),\r\n        std::pair<void*, size_type> args,\r\n        const vector<Memory>* mem_objects = nullptr,\r\n        const vector<const void*>* mem_locs = nullptr,\r\n        const vector<Event>* events = nullptr,\r\n        Event* event = nullptr) const\r\n    {\r\n        cl_event tmp;\r\n        cl_int err = detail::errHandler(\r\n            ::clEnqueueNativeKernel(\r\n                object_, userFptr, args.first, args.second,\r\n                (mem_objects != nullptr) ? (cl_uint) mem_objects->size() : 0,\r\n                (mem_objects->size() > 0 ) ? reinterpret_cast<const cl_mem *>(mem_objects->data()) : nullptr,\r\n                (mem_locs != nullptr && mem_locs->size() > 0) ? (const void **) &mem_locs->front() : nullptr,\r\n                (events != nullptr) ? (cl_uint) events->size() : 0,\r\n                (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr,\r\n                (event != nullptr) ? &tmp : nullptr),\r\n            __ENQUEUE_NATIVE_KERNEL);\r\n\r\n        if (event != nullptr && err == CL_SUCCESS)\r\n            *event = tmp;\r\n\r\n        return err;\r\n    }\r\n\r\n/**\r\n * Deprecated APIs for 1.2\r\n */\r\n#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)\r\n    CL_API_PREFIX__VERSION_1_1_DEPRECATED \r\n    cl_int enqueueMarker(Event* event = nullptr) const CL_API_SUFFIX__VERSION_1_1_DEPRECATED\r\n    {\r\n        cl_event tmp;\r\n        cl_int err = detail::errHandler(\r\n            ::clEnqueueMarker(\r\n                object_, \r\n                (event != nullptr) ? 
&tmp : nullptr),\r\n            __ENQUEUE_MARKER_ERR);\r\n\r\n        if (event != nullptr && err == CL_SUCCESS)\r\n            *event = tmp;\r\n\r\n        return err;\r\n    }\r\n\r\n    CL_API_PREFIX__VERSION_1_1_DEPRECATED\r\n    cl_int enqueueWaitForEvents(const vector<Event>& events) const CL_API_SUFFIX__VERSION_1_1_DEPRECATED\r\n    {\r\n        return detail::errHandler(\r\n            ::clEnqueueWaitForEvents(\r\n                object_,\r\n                (cl_uint) events.size(),\r\n                events.size() > 0 ? (const cl_event*) &events.front() : nullptr),\r\n            __ENQUEUE_WAIT_FOR_EVENTS_ERR);\r\n    }\r\n#endif // defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)\r\n\r\n    cl_int enqueueAcquireGLObjects(\r\n         const vector<Memory>* mem_objects = nullptr,\r\n         const vector<Event>* events = nullptr,\r\n         Event* event = nullptr) const\r\n     {\r\n        cl_event tmp;\r\n        cl_int err = detail::errHandler(\r\n             ::clEnqueueAcquireGLObjects(\r\n                 object_,\r\n                 (mem_objects != nullptr) ? (cl_uint) mem_objects->size() : 0,\r\n                 (mem_objects != nullptr && mem_objects->size() > 0) ? (const cl_mem *) &mem_objects->front(): nullptr,\r\n                 (events != nullptr) ? (cl_uint) events->size() : 0,\r\n                 (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr,\r\n                 (event != nullptr) ? 
&tmp : nullptr),\r\n             __ENQUEUE_ACQUIRE_GL_ERR);\r\n\r\n        if (event != nullptr && err == CL_SUCCESS)\r\n            *event = tmp;\r\n\r\n        return err;\r\n     }\r\n\r\n    cl_int enqueueReleaseGLObjects(\r\n         const vector<Memory>* mem_objects = nullptr,\r\n         const vector<Event>* events = nullptr,\r\n         Event* event = nullptr) const\r\n     {\r\n        cl_event tmp;\r\n        cl_int err = detail::errHandler(\r\n             ::clEnqueueReleaseGLObjects(\r\n                 object_,\r\n                 (mem_objects != nullptr) ? (cl_uint) mem_objects->size() : 0,\r\n                 (mem_objects != nullptr && mem_objects->size() > 0) ? (const cl_mem *) &mem_objects->front(): nullptr,\r\n                 (events != nullptr) ? (cl_uint) events->size() : 0,\r\n                 (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr,\r\n                 (event != nullptr) ? &tmp : nullptr),\r\n             __ENQUEUE_RELEASE_GL_ERR);\r\n\r\n        if (event != nullptr && err == CL_SUCCESS)\r\n            *event = tmp;\r\n\r\n        return err;\r\n     }\r\n\r\n#if defined (CL_HPP_USE_DX_INTEROP)\r\ntypedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clEnqueueAcquireD3D10ObjectsKHR)(\r\n    cl_command_queue command_queue, cl_uint num_objects,\r\n    const cl_mem* mem_objects, cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list, cl_event* event);\r\ntypedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clEnqueueReleaseD3D10ObjectsKHR)(\r\n    cl_command_queue command_queue, cl_uint num_objects,\r\n    const cl_mem* mem_objects,  cl_uint num_events_in_wait_list,\r\n    const cl_event* event_wait_list, cl_event* event);\r\n\r\n    cl_int enqueueAcquireD3D10Objects(\r\n         const vector<Memory>* mem_objects = nullptr,\r\n         const vector<Event>* events = nullptr,\r\n         Event* event = nullptr) const\r\n    {\r\n        static PFN_clEnqueueAcquireD3D10ObjectsKHR 
pfn_clEnqueueAcquireD3D10ObjectsKHR = nullptr;\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 120\r\n        cl_context context = getInfo<CL_QUEUE_CONTEXT>();\r\n        cl::Device device(getInfo<CL_QUEUE_DEVICE>());\r\n        cl_platform_id platform = device.getInfo<CL_DEVICE_PLATFORM>();\r\n        CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clEnqueueAcquireD3D10ObjectsKHR);\r\n#endif\r\n#if CL_HPP_MINIMUM_OPENCL_VERSION < 120\r\n        CL_HPP_INIT_CL_EXT_FCN_PTR_(clEnqueueAcquireD3D10ObjectsKHR);\r\n#endif\r\n        \r\n        cl_event tmp;\r\n        cl_int err = detail::errHandler(\r\n             pfn_clEnqueueAcquireD3D10ObjectsKHR(\r\n                 object_,\r\n                 (mem_objects != nullptr) ? (cl_uint) mem_objects->size() : 0,\r\n                 (mem_objects != nullptr && mem_objects->size() > 0) ? (const cl_mem *) &mem_objects->front(): nullptr,\r\n                 (events != nullptr) ? (cl_uint) events->size() : 0,\r\n                 (events != nullptr) ? (cl_event*) &events->front() : nullptr,\r\n                 (event != nullptr) ? 
&tmp : nullptr),\r\n             __ENQUEUE_ACQUIRE_GL_ERR);\r\n\r\n        if (event != nullptr && err == CL_SUCCESS)\r\n            *event = tmp;\r\n\r\n        return err;\r\n     }\r\n\r\n    cl_int enqueueReleaseD3D10Objects(\r\n         const vector<Memory>* mem_objects = nullptr,\r\n         const vector<Event>* events = nullptr,\r\n         Event* event = nullptr) const\r\n    {\r\n        static PFN_clEnqueueReleaseD3D10ObjectsKHR pfn_clEnqueueReleaseD3D10ObjectsKHR = nullptr;\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 120\r\n        cl_context context = getInfo<CL_QUEUE_CONTEXT>();\r\n        cl::Device device(getInfo<CL_QUEUE_DEVICE>());\r\n        cl_platform_id platform = device.getInfo<CL_DEVICE_PLATFORM>();\r\n        CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clEnqueueReleaseD3D10ObjectsKHR);\r\n#endif\r\n#if CL_HPP_MINIMUM_OPENCL_VERSION < 120\r\n        CL_HPP_INIT_CL_EXT_FCN_PTR_(clEnqueueReleaseD3D10ObjectsKHR);\r\n#endif\r\n\r\n        cl_event tmp;\r\n        cl_int err = detail::errHandler(\r\n            pfn_clEnqueueReleaseD3D10ObjectsKHR(\r\n                object_,\r\n                (mem_objects != nullptr) ? (cl_uint) mem_objects->size() : 0,\r\n                (mem_objects != nullptr && mem_objects->size() > 0) ? (const cl_mem *) &mem_objects->front(): nullptr,\r\n                (events != nullptr) ? (cl_uint) events->size() : 0,\r\n                (events != nullptr && events->size() > 0) ? (cl_event*) &events->front() : nullptr,\r\n                (event != nullptr) ? 
&tmp : nullptr),\r\n            __ENQUEUE_RELEASE_GL_ERR);\r\n\r\n        if (event != nullptr && err == CL_SUCCESS)\r\n            *event = tmp;\r\n\r\n        return err;\r\n    }\r\n#endif\r\n\r\n/**\r\n * Deprecated APIs for 1.2\r\n */\r\n#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)\r\n    CL_API_PREFIX__VERSION_1_1_DEPRECATED\r\n    cl_int enqueueBarrier() const CL_API_SUFFIX__VERSION_1_1_DEPRECATED\r\n    {\r\n        return detail::errHandler(\r\n            ::clEnqueueBarrier(object_),\r\n            __ENQUEUE_BARRIER_ERR);\r\n    }\r\n#endif // CL_USE_DEPRECATED_OPENCL_1_1_APIS\r\n\r\n    cl_int flush() const\r\n    {\r\n        return detail::errHandler(::clFlush(object_), __FLUSH_ERR);\r\n    }\r\n\r\n    cl_int finish() const\r\n    {\r\n        return detail::errHandler(::clFinish(object_), __FINISH_ERR);\r\n    }\r\n\r\n#ifdef cl_khr_external_memory\r\n    cl_int enqueueAcquireExternalMemObjects(\r\n        const vector<Memory>& mem_objects,\r\n        const vector<Event>* events_wait = nullptr,\r\n        Event *event = nullptr)\r\n    {\r\n        cl_int err = CL_INVALID_OPERATION;\r\n        cl_event tmp;\r\n\r\n        std::call_once(ext_memory_initialized_, initMemoryExtension, this->getInfo<CL_QUEUE_DEVICE>());\r\n\r\n        if (pfn_clEnqueueAcquireExternalMemObjectsKHR)\r\n        {\r\n            err = pfn_clEnqueueAcquireExternalMemObjectsKHR(\r\n                object_,\r\n                static_cast<cl_uint>(mem_objects.size()),\r\n                (mem_objects.size() > 0) ? reinterpret_cast<const cl_mem *>(mem_objects.data()) : nullptr,\r\n                (events_wait != nullptr) ? static_cast<cl_uint>(events_wait->size()) : 0,\r\n                (events_wait != nullptr && events_wait->size() > 0) ? 
reinterpret_cast<const cl_event*>(events_wait->data()) : nullptr,\r\n                &tmp);\r\n        }\r\n\r\n        detail::errHandler(err, __ENQUEUE_ACQUIRE_EXTERNAL_MEMORY_ERR);\r\n\r\n        if (event != nullptr && err == CL_SUCCESS)\r\n            *event = tmp;\r\n\r\n        return err;\r\n    }\r\n\r\n    cl_int enqueueReleaseExternalMemObjects(\r\n        const vector<Memory>& mem_objects,\r\n        const vector<Event>* events_wait = nullptr,\r\n        Event *event = nullptr)\r\n    {\r\n        cl_int err = CL_INVALID_OPERATION;\r\n        cl_event tmp;\r\n\r\n        std::call_once(ext_memory_initialized_, initMemoryExtension, this->getInfo<CL_QUEUE_DEVICE>());\r\n\r\n        if (pfn_clEnqueueReleaseExternalMemObjectsKHR)\r\n        {\r\n            err = pfn_clEnqueueReleaseExternalMemObjectsKHR(\r\n                object_,\r\n                static_cast<cl_uint>(mem_objects.size()),\r\n                (mem_objects.size() > 0) ? reinterpret_cast<const cl_mem *>(mem_objects.data()) : nullptr,\r\n                (events_wait != nullptr) ? static_cast<cl_uint>(events_wait->size()) : 0,\r\n                (events_wait != nullptr && events_wait->size() > 0) ? 
reinterpret_cast<const cl_event*>(events_wait->data()) : nullptr,\r\n                &tmp);\r\n        }\r\n\r\n        detail::errHandler(err, __ENQUEUE_RELEASE_EXTERNAL_MEMORY_ERR);\r\n\r\n        if (event != nullptr && err == CL_SUCCESS)\r\n            *event = tmp;\r\n\r\n        return err;\r\n    }\r\n#endif // cl_khr_external_memory\r\n\r\n#ifdef cl_khr_semaphore\r\n    cl_int enqueueWaitSemaphores(\r\n        const vector<Semaphore> &sema_objects,\r\n        const vector<cl_semaphore_payload_khr> &sema_payloads = {},\r\n        const vector<Event>* events_wait_list = nullptr,\r\n        Event *event = nullptr) const;\r\n\r\n    cl_int enqueueSignalSemaphores(\r\n        const vector<Semaphore> &sema_objects,\r\n        const vector<cl_semaphore_payload_khr>& sema_payloads = {},\r\n        const vector<Event>* events_wait_list = nullptr,\r\n        Event* event = nullptr);\r\n#endif // cl_khr_semaphore\r\n}; // CommandQueue\r\n\r\n#ifdef cl_khr_external_memory\r\nCL_HPP_DEFINE_STATIC_MEMBER_ std::once_flag CommandQueue::ext_memory_initialized_;\r\n#endif\r\n\r\nCL_HPP_DEFINE_STATIC_MEMBER_ std::once_flag CommandQueue::default_initialized_;\r\nCL_HPP_DEFINE_STATIC_MEMBER_ CommandQueue CommandQueue::default_;\r\nCL_HPP_DEFINE_STATIC_MEMBER_ cl_int CommandQueue::default_error_ = CL_SUCCESS;\r\n\r\n\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 200\r\nenum class DeviceQueueProperties : cl_command_queue_properties\r\n{\r\n    None = 0,\r\n    Profiling = CL_QUEUE_PROFILING_ENABLE,\r\n};\r\n\r\ninline DeviceQueueProperties operator|(DeviceQueueProperties lhs, DeviceQueueProperties rhs)\r\n{\r\n    return static_cast<DeviceQueueProperties>(static_cast<cl_command_queue_properties>(lhs) | static_cast<cl_command_queue_properties>(rhs));\r\n}\r\n\r\n/*! 
\\class DeviceCommandQueue\r\n * \\brief DeviceCommandQueue interface for device cl_command_queues.\r\n */\r\nclass DeviceCommandQueue : public detail::Wrapper<cl_command_queue>\r\n{\r\npublic:\r\n\r\n    /*!\r\n     * Trivial empty constructor to create a null queue.\r\n     */\r\n    DeviceCommandQueue() { }\r\n\r\n    /*!\r\n     * Default construct device command queue on default context and device\r\n     */\r\n    DeviceCommandQueue(DeviceQueueProperties properties, cl_int* err = nullptr)\r\n    {\r\n        cl_int error;\r\n        cl::Context context = cl::Context::getDefault();\r\n        cl::Device device = cl::Device::getDefault();\r\n\r\n        cl_command_queue_properties mergedProperties =\r\n            CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_ON_DEVICE | static_cast<cl_command_queue_properties>(properties);\r\n\r\n        cl_queue_properties queue_properties[] = {\r\n            CL_QUEUE_PROPERTIES, mergedProperties, 0 };\r\n        object_ = ::clCreateCommandQueueWithProperties(\r\n            context(), device(), queue_properties, &error);\r\n\r\n        detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR);\r\n        if (err != nullptr) {\r\n            *err = error;\r\n        }\r\n    }\r\n\r\n    /*!\r\n     * Create a device command queue for a specified device in the passed context.\r\n     */\r\n    DeviceCommandQueue(\r\n        const Context& context,\r\n        const Device& device,\r\n        DeviceQueueProperties properties = DeviceQueueProperties::None,\r\n        cl_int* err = nullptr)\r\n    {\r\n        cl_int error;\r\n\r\n        cl_command_queue_properties mergedProperties =\r\n            CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_ON_DEVICE | static_cast<cl_command_queue_properties>(properties);\r\n        cl_queue_properties queue_properties[] = {\r\n            CL_QUEUE_PROPERTIES, mergedProperties, 0 };\r\n        object_ = ::clCreateCommandQueueWithProperties(\r\n            context(), 
device(), queue_properties, &error);\r\n\r\n        detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR);\r\n        if (err != nullptr) {\r\n            *err = error;\r\n        }\r\n    }\r\n\r\n    /*!\r\n     * Create a device command queue for a specified device in the passed context.\r\n     */\r\n    DeviceCommandQueue(\r\n        const Context& context,\r\n        const Device& device,\r\n        cl_uint queueSize,\r\n        DeviceQueueProperties properties = DeviceQueueProperties::None,\r\n        cl_int* err = nullptr)\r\n    {\r\n        cl_int error;\r\n\r\n        cl_command_queue_properties mergedProperties =\r\n            CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_ON_DEVICE | static_cast<cl_command_queue_properties>(properties);\r\n        cl_queue_properties queue_properties[] = {\r\n            CL_QUEUE_PROPERTIES, mergedProperties,\r\n            CL_QUEUE_SIZE, queueSize, \r\n            0 };\r\n        object_ = ::clCreateCommandQueueWithProperties(\r\n            context(), device(), queue_properties, &error);\r\n\r\n        detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR);\r\n        if (err != nullptr) {\r\n            *err = error;\r\n        }\r\n    }\r\n\r\n    /*! 
\\brief Constructor from cl_command_queue - takes ownership.\r\n    *\r\n    * \\param retainObject will cause the constructor to retain its cl object.\r\n    *                     Defaults to false to maintain compatibility with\r\n    *                     earlier versions.\r\n    */\r\n    explicit DeviceCommandQueue(const cl_command_queue& commandQueue, bool retainObject = false) :\r\n        detail::Wrapper<cl_type>(commandQueue, retainObject) { }\r\n\r\n    DeviceCommandQueue& operator = (const cl_command_queue& rhs)\r\n    {\r\n        detail::Wrapper<cl_type>::operator=(rhs);\r\n        return *this;\r\n    }\r\n\r\n    template <typename T>\r\n    cl_int getInfo(cl_command_queue_info name, T* param) const\r\n    {\r\n        return detail::errHandler(\r\n            detail::getInfo(\r\n            &::clGetCommandQueueInfo, object_, name, param),\r\n            __GET_COMMAND_QUEUE_INFO_ERR);\r\n    }\r\n\r\n    template <cl_command_queue_info name> typename\r\n        detail::param_traits<detail::cl_command_queue_info, name>::param_type\r\n        getInfo(cl_int* err = nullptr) const\r\n    {\r\n        typename detail::param_traits<\r\n            detail::cl_command_queue_info, name>::param_type param;\r\n        cl_int result = getInfo(name, &param);\r\n        if (err != nullptr) {\r\n            *err = result;\r\n        }\r\n        return param;\r\n    }\r\n\r\n    /*!\r\n     * Create a new default device command queue for the default device,\r\n     * in the default context and of the default size.\r\n     * If there is already a default queue for the specified device this\r\n     * function will return the pre-existing queue.\r\n     */\r\n    static DeviceCommandQueue makeDefault(\r\n        cl_int *err = nullptr)\r\n    {\r\n        cl_int error;\r\n        cl::Context context = cl::Context::getDefault();\r\n        cl::Device device = cl::Device::getDefault();\r\n\r\n        cl_command_queue_properties properties =\r\n            
CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_ON_DEVICE | CL_QUEUE_ON_DEVICE_DEFAULT;\r\n        cl_queue_properties queue_properties[] = {\r\n            CL_QUEUE_PROPERTIES, properties,\r\n            0 };\r\n        DeviceCommandQueue deviceQueue(\r\n            ::clCreateCommandQueueWithProperties(\r\n            context(), device(), queue_properties, &error));\r\n\r\n        detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR);\r\n        if (err != nullptr) {\r\n            *err = error;\r\n        }\r\n\r\n        return deviceQueue;\r\n    }\r\n\r\n    /*!\r\n     * Create a new default device command queue for the specified device\r\n     * and of the default size.\r\n     * If there is already a default queue for the specified device this\r\n     * function will return the pre-existing queue.\r\n     */\r\n    static DeviceCommandQueue makeDefault(\r\n        const Context &context, const Device &device, cl_int *err = nullptr)\r\n    {\r\n        cl_int error;\r\n\r\n        cl_command_queue_properties properties =\r\n            CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_ON_DEVICE | CL_QUEUE_ON_DEVICE_DEFAULT;\r\n        cl_queue_properties queue_properties[] = {\r\n            CL_QUEUE_PROPERTIES, properties,\r\n            0 };\r\n        DeviceCommandQueue deviceQueue(\r\n            ::clCreateCommandQueueWithProperties(\r\n            context(), device(), queue_properties, &error));\r\n\r\n        detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR);\r\n        if (err != nullptr) {\r\n            *err = error;\r\n        }\r\n\r\n        return deviceQueue;\r\n    }\r\n\r\n    /*!\r\n     * Create a new default device command queue for the specified device \r\n     * and of the requested size in bytes.\r\n     * If there is already a default queue for the specified device this\r\n     * function will return the pre-existing queue.\r\n     */\r\n    static DeviceCommandQueue makeDefault(\r\n        const 
Context &context, const Device &device, cl_uint queueSize, cl_int *err = nullptr)\r\n    {\r\n        cl_int error;\r\n\r\n        cl_command_queue_properties properties =\r\n            CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_ON_DEVICE | CL_QUEUE_ON_DEVICE_DEFAULT;\r\n        cl_queue_properties queue_properties[] = {\r\n            CL_QUEUE_PROPERTIES, properties,\r\n            CL_QUEUE_SIZE, queueSize,\r\n            0 };\r\n        DeviceCommandQueue deviceQueue(\r\n            ::clCreateCommandQueueWithProperties(\r\n                context(), device(), queue_properties, &error));\r\n\r\n        detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR);\r\n        if (err != nullptr) {\r\n            *err = error;\r\n        }\r\n\r\n        return deviceQueue;\r\n    }\r\n\r\n\r\n\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 210\r\n    /*!\r\n     * Modify the default device command queue to be used for subsequent kernels.\r\n     * This can update the default command queue for a device repeatedly to account\r\n     * for kernels that rely on the default.\r\n     * @return updated default device command queue.\r\n     */\r\n    static DeviceCommandQueue updateDefault(const Context &context, const Device &device, const DeviceCommandQueue &default_queue, cl_int *err = nullptr)\r\n    {\r\n        cl_int error;\r\n        error = clSetDefaultDeviceCommandQueue(context.get(), device.get(), default_queue.get());\r\n\r\n        detail::errHandler(error, __SET_DEFAULT_DEVICE_COMMAND_QUEUE_ERR);\r\n        if (err != nullptr) {\r\n            *err = error;\r\n        }\r\n        return default_queue;\r\n    }\r\n\r\n    /*!\r\n     * Return the current default command queue for the specified command queue\r\n     */\r\n    static DeviceCommandQueue getDefault(const CommandQueue &queue, cl_int * err = nullptr)\r\n    {\r\n        return queue.getInfo<CL_QUEUE_DEVICE_DEFAULT>(err);\r\n    }\r\n\r\n#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 
210\r\n}; // DeviceCommandQueue\r\n\r\nnamespace detail\r\n{\r\n    // Specialization for device command queue\r\n    template <>\r\n    struct KernelArgumentHandler<cl::DeviceCommandQueue, void>\r\n    {\r\n        static size_type size(const cl::DeviceCommandQueue&) { return sizeof(cl_command_queue); }\r\n        static const cl_command_queue* ptr(const cl::DeviceCommandQueue& value) { return &(value()); }\r\n    };\r\n} // namespace detail\r\n\r\n#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200\r\n\r\n\r\ntemplate< typename IteratorType >\r\nBuffer::Buffer(\r\n    const Context &context,\r\n    IteratorType startIterator,\r\n    IteratorType endIterator,\r\n    bool readOnly,\r\n    bool useHostPtr,\r\n    cl_int* err)\r\n{\r\n    typedef typename std::iterator_traits<IteratorType>::value_type DataType;\r\n    cl_int error;\r\n\r\n    cl_mem_flags flags = 0;\r\n    if( readOnly ) {\r\n        flags |= CL_MEM_READ_ONLY;\r\n    }\r\n    else {\r\n        flags |= CL_MEM_READ_WRITE;\r\n    }\r\n    if( useHostPtr ) {\r\n        flags |= CL_MEM_USE_HOST_PTR;\r\n    }\r\n    \r\n    size_type size = sizeof(DataType)*(endIterator - startIterator);\r\n\r\n    if( useHostPtr ) {\r\n        object_ = ::clCreateBuffer(context(), flags, size, const_cast<DataType*>(&*startIterator), &error);\r\n    } else {\r\n        object_ = ::clCreateBuffer(context(), flags, size, 0, &error);\r\n    }\r\n\r\n    detail::errHandler(error, __CREATE_BUFFER_ERR);\r\n    if (err != nullptr) {\r\n        *err = error;\r\n    }\r\n\r\n    if( !useHostPtr ) {\r\n        CommandQueue queue(context, 0, &error);\r\n        detail::errHandler(error, __CREATE_BUFFER_ERR);\r\n        if (err != nullptr) {\r\n            *err = error;\r\n        }\r\n\r\n        error = cl::copy(queue, startIterator, endIterator, *this);\r\n        detail::errHandler(error, __CREATE_BUFFER_ERR);\r\n        if (err != nullptr) {\r\n            *err = error;\r\n        }\r\n    }\r\n}\r\n\r\ntemplate< typename 
IteratorType >\r\nBuffer::Buffer(\r\n    const CommandQueue &queue,\r\n    IteratorType startIterator,\r\n    IteratorType endIterator,\r\n    bool readOnly,\r\n    bool useHostPtr,\r\n    cl_int* err)\r\n{\r\n    typedef typename std::iterator_traits<IteratorType>::value_type DataType;\r\n    cl_int error;\r\n\r\n    cl_mem_flags flags = 0;\r\n    if (readOnly) {\r\n        flags |= CL_MEM_READ_ONLY;\r\n    }\r\n    else {\r\n        flags |= CL_MEM_READ_WRITE;\r\n    }\r\n    if (useHostPtr) {\r\n        flags |= CL_MEM_USE_HOST_PTR;\r\n    }\r\n\r\n    size_type size = sizeof(DataType)*(endIterator - startIterator);\r\n\r\n    Context context = queue.getInfo<CL_QUEUE_CONTEXT>();\r\n\r\n    if (useHostPtr) {\r\n        object_ = ::clCreateBuffer(context(), flags, size, const_cast<DataType*>(&*startIterator), &error);\r\n    }\r\n    else {\r\n        object_ = ::clCreateBuffer(context(), flags, size, 0, &error);\r\n    }\r\n\r\n    detail::errHandler(error, __CREATE_BUFFER_ERR);\r\n    if (err != nullptr) {\r\n        *err = error;\r\n    }\r\n\r\n    if (!useHostPtr) {\r\n        error = cl::copy(queue, startIterator, endIterator, *this);\r\n        detail::errHandler(error, __CREATE_BUFFER_ERR);\r\n        if (err != nullptr) {\r\n            *err = error;\r\n        }\r\n    }\r\n}\r\n\r\ninline cl_int enqueueReadBuffer(\r\n    const Buffer& buffer,\r\n    cl_bool blocking,\r\n    size_type offset,\r\n    size_type size,\r\n    void* ptr,\r\n    const vector<Event>* events = nullptr,\r\n    Event* event = nullptr)\r\n{\r\n    cl_int error;\r\n    CommandQueue queue = CommandQueue::getDefault(&error);\r\n\r\n    if (error != CL_SUCCESS) {\r\n        return error;\r\n    }\r\n\r\n    return queue.enqueueReadBuffer(buffer, blocking, offset, size, ptr, events, event);\r\n}\r\n\r\ninline cl_int enqueueWriteBuffer(\r\n        const Buffer& buffer,\r\n        cl_bool blocking,\r\n        size_type offset,\r\n        size_type size,\r\n        const void* ptr,\r\n     
   const vector<Event>* events = nullptr,\r\n        Event* event = nullptr)\r\n{\r\n    cl_int error;\r\n    CommandQueue queue = CommandQueue::getDefault(&error);\r\n\r\n    if (error != CL_SUCCESS) {\r\n        return error;\r\n    }\r\n\r\n    return queue.enqueueWriteBuffer(buffer, blocking, offset, size, ptr, events, event);\r\n}\r\n\r\ninline void* enqueueMapBuffer(\r\n        const Buffer& buffer,\r\n        cl_bool blocking,\r\n        cl_map_flags flags,\r\n        size_type offset,\r\n        size_type size,\r\n        const vector<Event>* events = nullptr,\r\n        Event* event = nullptr,\r\n        cl_int* err = nullptr)\r\n{\r\n    cl_int error;\r\n    CommandQueue queue = CommandQueue::getDefault(&error);\r\n    detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR);\r\n    if (err != nullptr) {\r\n        *err = error;\r\n    }\r\n\r\n    void * result = ::clEnqueueMapBuffer(\r\n            queue(), buffer(), blocking, flags, offset, size,\r\n            (events != nullptr) ? (cl_uint) events->size() : 0,\r\n            (events != nullptr && events->size() > 0) ? 
(cl_event*) &events->front() : nullptr,\r\n            (cl_event*) event,\r\n            &error);\r\n\r\n    detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR);\r\n    if (err != nullptr) {\r\n        *err = error;\r\n    }\r\n    return result;\r\n}\r\n\r\n\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 200\r\n/**\r\n * Enqueues to the default queue a command that will allow the host to\r\n * update a region of a coarse-grained SVM buffer.\r\n * This variant takes a raw SVM pointer.\r\n */\r\ntemplate<typename T>\r\ninline cl_int enqueueMapSVM(\r\n    T* ptr,\r\n    cl_bool blocking,\r\n    cl_map_flags flags,\r\n    size_type size,\r\n    const vector<Event>* events,\r\n    Event* event)\r\n{\r\n    cl_int error;\r\n    CommandQueue queue = CommandQueue::getDefault(&error);\r\n    if (error != CL_SUCCESS) {\r\n        return detail::errHandler(error, __ENQUEUE_MAP_SVM_ERR);\r\n    }\r\n\r\n    return queue.enqueueMapSVM(\r\n        ptr, blocking, flags, size, events, event);\r\n}\r\n\r\n/**\r\n * Enqueues to the default queue a command that will allow the host to \r\n * update a region of a coarse-grained SVM buffer.\r\n * This variant takes a cl::pointer instance.\r\n */\r\ntemplate<typename T, class D>\r\ninline cl_int enqueueMapSVM(\r\n    cl::pointer<T, D> &ptr,\r\n    cl_bool blocking,\r\n    cl_map_flags flags,\r\n    size_type size,\r\n    const vector<Event>* events = nullptr,\r\n    Event* event = nullptr)\r\n{\r\n    cl_int error;\r\n    CommandQueue queue = CommandQueue::getDefault(&error);\r\n    if (error != CL_SUCCESS) {\r\n        return detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR);\r\n    }\r\n\r\n    return queue.enqueueMapSVM(\r\n        ptr, blocking, flags, size, events, event);\r\n}\r\n\r\n/**\r\n * Enqueues to the default queue a command that will allow the host to\r\n * update a region of a coarse-grained SVM buffer.\r\n * This variant takes a cl::vector instance.\r\n */\r\ntemplate<typename T, class Alloc>\r\ninline cl_int 
enqueueMapSVM(\r\n    cl::vector<T, Alloc> &container,\r\n    cl_bool blocking,\r\n    cl_map_flags flags,\r\n    const vector<Event>* events = nullptr,\r\n    Event* event = nullptr)\r\n{\r\n    cl_int error;\r\n    CommandQueue queue = CommandQueue::getDefault(&error);\r\n    if (error != CL_SUCCESS) {\r\n        return detail::errHandler(error, __ENQUEUE_MAP_SVM_ERR);\r\n    }\r\n\r\n    return queue.enqueueMapSVM(\r\n        container, blocking, flags, events, event);\r\n}\r\n\r\n#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200\r\n\r\ninline cl_int enqueueUnmapMemObject(\r\n    const Memory& memory,\r\n    void* mapped_ptr,\r\n    const vector<Event>* events = nullptr,\r\n    Event* event = nullptr)\r\n{\r\n    cl_int error;\r\n    CommandQueue queue = CommandQueue::getDefault(&error);\r\n    detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR);\r\n    if (error != CL_SUCCESS) {\r\n        return error;\r\n    }\r\n\r\n    cl_event tmp;\r\n    cl_int err = detail::errHandler(\r\n        ::clEnqueueUnmapMemObject(\r\n        queue(), memory(), mapped_ptr,\r\n        (events != nullptr) ? (cl_uint)events->size() : 0,\r\n        (events != nullptr && events->size() > 0) ? (cl_event*)&events->front() : nullptr,\r\n        (event != nullptr) ? 
&tmp : nullptr),\r\n        __ENQUEUE_UNMAP_MEM_OBJECT_ERR);\r\n\r\n    if (event != nullptr && err == CL_SUCCESS)\r\n        *event = tmp;\r\n\r\n    return err;\r\n}\r\n\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 200\r\n/**\r\n * Enqueues to the default queue a command that will release a coarse-grained \r\n * SVM buffer back to the OpenCL runtime.\r\n * This variant takes a raw SVM pointer.\r\n */\r\ntemplate<typename T>\r\ninline cl_int enqueueUnmapSVM(\r\n    T* ptr,\r\n    const vector<Event>* events = nullptr,\r\n    Event* event = nullptr)\r\n{\r\n    cl_int error;\r\n    CommandQueue queue = CommandQueue::getDefault(&error);\r\n    if (error != CL_SUCCESS) {\r\n        return detail::errHandler(error, __ENQUEUE_UNMAP_SVM_ERR);\r\n    }\r\n\r\n    return detail::errHandler(queue.enqueueUnmapSVM(ptr, events, event), \r\n        __ENQUEUE_UNMAP_SVM_ERR);\r\n\r\n}\r\n\r\n/**\r\n * Enqueues to the default queue a command that will release a coarse-grained \r\n * SVM buffer back to the OpenCL runtime.\r\n * This variant takes a cl::pointer instance.\r\n */\r\ntemplate<typename T, class D>\r\ninline cl_int enqueueUnmapSVM(\r\n    cl::pointer<T, D> &ptr,\r\n    const vector<Event>* events = nullptr,\r\n    Event* event = nullptr)\r\n{\r\n    cl_int error;\r\n    CommandQueue queue = CommandQueue::getDefault(&error);\r\n    if (error != CL_SUCCESS) {\r\n        return detail::errHandler(error, __ENQUEUE_UNMAP_SVM_ERR);\r\n    }\r\n\r\n    return detail::errHandler(queue.enqueueUnmapSVM(ptr, events, event),\r\n        __ENQUEUE_UNMAP_SVM_ERR);\r\n}\r\n\r\n/**\r\n * Enqueues to the default queue a command that will release a coarse-grained \r\n * SVM buffer back to the OpenCL runtime.\r\n * This variant takes a cl::vector instance.\r\n */\r\ntemplate<typename T, class Alloc>\r\ninline cl_int enqueueUnmapSVM(\r\n    cl::vector<T, Alloc> &container,\r\n    const vector<Event>* events = nullptr,\r\n    Event* event = nullptr)\r\n{\r\n    cl_int error;\r\n    CommandQueue 
queue = CommandQueue::getDefault(&error);\r\n    if (error != CL_SUCCESS) {\r\n        return detail::errHandler(error, __ENQUEUE_UNMAP_SVM_ERR);\r\n    }\r\n\r\n    return detail::errHandler(queue.enqueueUnmapSVM(container, events, event),\r\n        __ENQUEUE_UNMAP_SVM_ERR);\r\n}\r\n\r\n#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200\r\n\r\ninline cl_int enqueueCopyBuffer(\r\n        const Buffer& src,\r\n        const Buffer& dst,\r\n        size_type src_offset,\r\n        size_type dst_offset,\r\n        size_type size,\r\n        const vector<Event>* events = nullptr,\r\n        Event* event = nullptr)\r\n{\r\n    cl_int error;\r\n    CommandQueue queue = CommandQueue::getDefault(&error);\r\n\r\n    if (error != CL_SUCCESS) {\r\n        return error;\r\n    }\r\n\r\n    return queue.enqueueCopyBuffer(src, dst, src_offset, dst_offset, size, events, event);\r\n}\r\n\r\n/**\r\n * Blocking copy operation between iterators and a buffer.\r\n * Host to Device.\r\n * Uses default command queue.\r\n */\r\ntemplate< typename IteratorType >\r\ninline cl_int copy( IteratorType startIterator, IteratorType endIterator, cl::Buffer &buffer )\r\n{\r\n    cl_int error;\r\n    CommandQueue queue = CommandQueue::getDefault(&error);\r\n    if (error != CL_SUCCESS)\r\n        return error;\r\n\r\n    return cl::copy(queue, startIterator, endIterator, buffer);\r\n}\r\n\r\n/**\r\n * Blocking copy operation between iterators and a buffer.\r\n * Device to Host.\r\n * Uses default command queue.\r\n */\r\ntemplate< typename IteratorType >\r\ninline cl_int copy( const cl::Buffer &buffer, IteratorType startIterator, IteratorType endIterator )\r\n{\r\n    cl_int error;\r\n    CommandQueue queue = CommandQueue::getDefault(&error);\r\n    if (error != CL_SUCCESS)\r\n        return error;\r\n\r\n    return cl::copy(queue, buffer, startIterator, endIterator);\r\n}\r\n\r\n/**\r\n * Blocking copy operation between iterators and a buffer.\r\n * Host to Device.\r\n * Uses specified queue.\r\n 
*/\r\ntemplate< typename IteratorType >\r\ninline cl_int copy( const CommandQueue &queue, IteratorType startIterator, IteratorType endIterator, cl::Buffer &buffer )\r\n{\r\n    typedef typename std::iterator_traits<IteratorType>::value_type DataType;\r\n    cl_int error;\r\n    \r\n    size_type length = endIterator-startIterator;\r\n    size_type byteLength = length*sizeof(DataType);\r\n\r\n    DataType *pointer = \r\n        static_cast<DataType*>(queue.enqueueMapBuffer(buffer, CL_TRUE, CL_MAP_WRITE, 0, byteLength, 0, 0, &error));\r\n    // if exceptions enabled, enqueueMapBuffer will throw\r\n    if( error != CL_SUCCESS ) {\r\n        return error;\r\n    }\r\n#if defined(_MSC_VER) && _MSC_VER < 1920\r\n    std::copy(\r\n        startIterator,\r\n        endIterator,\r\n        stdext::checked_array_iterator<DataType*>(\r\n            pointer, length));\r\n#else\r\n    std::copy(startIterator, endIterator, pointer);\r\n#endif // defined(_MSC_VER) && _MSC_VER < 1920\r\n    Event endEvent;\r\n    error = queue.enqueueUnmapMemObject(buffer, pointer, 0, &endEvent);\r\n    // if exceptions enabled, enqueueUnmapMemObject will throw\r\n    if( error != CL_SUCCESS ) { \r\n        return error;\r\n    }\r\n    endEvent.wait();\r\n    return CL_SUCCESS;\r\n}\r\n\r\n/**\r\n * Blocking copy operation between iterators and a buffer.\r\n * Device to Host.\r\n * Uses specified queue.\r\n */\r\ntemplate< typename IteratorType >\r\ninline cl_int copy( const CommandQueue &queue, const cl::Buffer &buffer, IteratorType startIterator, IteratorType endIterator )\r\n{\r\n    typedef typename std::iterator_traits<IteratorType>::value_type DataType;\r\n    cl_int error;\r\n        \r\n    size_type length = endIterator-startIterator;\r\n    size_type byteLength = length*sizeof(DataType);\r\n\r\n    DataType *pointer = \r\n        static_cast<DataType*>(queue.enqueueMapBuffer(buffer, CL_TRUE, CL_MAP_READ, 0, byteLength, 0, 0, &error));\r\n    // if exceptions enabled, enqueueMapBuffer 
will throw\r\n    if( error != CL_SUCCESS ) {\r\n        return error;\r\n    }\r\n    std::copy(pointer, pointer + length, startIterator);\r\n    Event endEvent;\r\n    error = queue.enqueueUnmapMemObject(buffer, pointer, 0, &endEvent);\r\n    // if exceptions enabled, enqueueUnmapMemObject will throw\r\n    if( error != CL_SUCCESS ) { \r\n        return error;\r\n    }\r\n    endEvent.wait();\r\n    return CL_SUCCESS;\r\n}\r\n\r\n\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 200\r\n/**\r\n * Blocking SVM map operation - performs a blocking map underneath.\r\n */\r\ntemplate<typename T, class Alloc>\r\ninline cl_int mapSVM(cl::vector<T, Alloc> &container)\r\n{\r\n    return enqueueMapSVM(container, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE);\r\n}\r\n\r\n/**\r\n* Blocking SVM map operation - performs a blocking map underneath.\r\n*/\r\ntemplate<typename T, class Alloc>\r\ninline cl_int unmapSVM(cl::vector<T, Alloc> &container)\r\n{\r\n    return enqueueUnmapSVM(container);\r\n}\r\n\r\n#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200\r\n\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 110\r\ninline cl_int enqueueReadBufferRect(\r\n    const Buffer& buffer,\r\n    cl_bool blocking,\r\n    const array<size_type, 3>& buffer_offset,\r\n    const array<size_type, 3>& host_offset,\r\n    const array<size_type, 3>& region,\r\n    size_type buffer_row_pitch,\r\n    size_type buffer_slice_pitch,\r\n    size_type host_row_pitch,\r\n    size_type host_slice_pitch,\r\n    void *ptr,\r\n    const vector<Event>* events = nullptr,\r\n    Event* event = nullptr)\r\n{\r\n    cl_int error;\r\n    CommandQueue queue = CommandQueue::getDefault(&error);\r\n\r\n    if (error != CL_SUCCESS) {\r\n        return error;\r\n    }\r\n\r\n    return queue.enqueueReadBufferRect(\r\n        buffer, \r\n        blocking, \r\n        buffer_offset, \r\n        host_offset,\r\n        region,\r\n        buffer_row_pitch,\r\n        buffer_slice_pitch,\r\n        host_row_pitch,\r\n        host_slice_pitch,\r\n        
ptr, \r\n        events, \r\n        event);\r\n}\r\n\r\ninline cl_int enqueueReadBufferRect(\r\n    const Buffer& buffer, \r\n    cl_bool blocking,\r\n    const array<size_type, 2>& buffer_offset,\r\n    const array<size_type, 2>& host_offset, \r\n    const array<size_type, 2>& region,\r\n    size_type buffer_row_pitch,\r\n    size_type buffer_slice_pitch,\r\n    size_type host_row_pitch,\r\n    size_type host_slice_pitch,\r\n    void* ptr,\r\n    const vector<Event>* events = nullptr,\r\n    Event* event = nullptr)\r\n{\r\n    return enqueueReadBufferRect(\r\n        buffer,\r\n        blocking,\r\n        { buffer_offset[0], buffer_offset[1], 0 },\r\n        { host_offset[0], host_offset[1], 0 },\r\n        { region[0], region[1], 1 },\r\n        buffer_row_pitch,\r\n        buffer_slice_pitch,\r\n        host_row_pitch,\r\n        host_slice_pitch,\r\n        ptr,\r\n        events,\r\n        event);\r\n}\r\n\r\ninline cl_int enqueueWriteBufferRect(\r\n    const Buffer& buffer,\r\n    cl_bool blocking,\r\n    const array<size_type, 3>& buffer_offset,\r\n    const array<size_type, 3>& host_offset,\r\n    const array<size_type, 3>& region,\r\n    size_type buffer_row_pitch,\r\n    size_type buffer_slice_pitch,\r\n    size_type host_row_pitch,\r\n    size_type host_slice_pitch,\r\n    const void *ptr,\r\n    const vector<Event>* events = nullptr,\r\n    Event* event = nullptr)\r\n{\r\n    cl_int error;\r\n    CommandQueue queue = CommandQueue::getDefault(&error);\r\n\r\n    if (error != CL_SUCCESS) {\r\n        return error;\r\n    }\r\n\r\n    return queue.enqueueWriteBufferRect(\r\n        buffer, \r\n        blocking, \r\n        buffer_offset, \r\n        host_offset,\r\n        region,\r\n        buffer_row_pitch,\r\n        buffer_slice_pitch,\r\n        host_row_pitch,\r\n        host_slice_pitch,\r\n        ptr, \r\n        events, \r\n        event);\r\n}\r\n\r\ninline cl_int enqueueWriteBufferRect(\r\n    const Buffer& buffer,\r\n    cl_bool 
blocking,\r\n    const array<size_type, 2>& buffer_offset,\r\n    const array<size_type, 2>& host_offset,\r\n    const array<size_type, 2>& region,\r\n    size_type buffer_row_pitch,\r\n    size_type buffer_slice_pitch,\r\n    size_type host_row_pitch,\r\n    size_type host_slice_pitch,\r\n    const void* ptr,\r\n    const vector<Event>* events = nullptr,\r\n    Event* event = nullptr)\r\n{\r\n    return enqueueWriteBufferRect(\r\n        buffer, \r\n        blocking,\r\n        { buffer_offset[0], buffer_offset[1], 0 },\r\n        { host_offset[0], host_offset[1], 0 },\r\n        { region[0], region[1], 1 }, \r\n        buffer_row_pitch,\r\n        buffer_slice_pitch,\r\n        host_row_pitch,\r\n        host_slice_pitch,\r\n        ptr,\r\n        events,\r\n        event);\r\n}\r\n\r\ninline cl_int enqueueCopyBufferRect(\r\n    const Buffer& src,\r\n    const Buffer& dst,\r\n    const array<size_type, 3>& src_origin,\r\n    const array<size_type, 3>& dst_origin,\r\n    const array<size_type, 3>& region,\r\n    size_type src_row_pitch,\r\n    size_type src_slice_pitch,\r\n    size_type dst_row_pitch,\r\n    size_type dst_slice_pitch,\r\n    const vector<Event>* events = nullptr,\r\n    Event* event = nullptr)\r\n{\r\n    cl_int error;\r\n    CommandQueue queue = CommandQueue::getDefault(&error);\r\n\r\n    if (error != CL_SUCCESS) {\r\n        return error;\r\n    }\r\n\r\n    return queue.enqueueCopyBufferRect(\r\n        src,\r\n        dst,\r\n        src_origin,\r\n        dst_origin,\r\n        region,\r\n        src_row_pitch,\r\n        src_slice_pitch,\r\n        dst_row_pitch,\r\n        dst_slice_pitch,\r\n        events, \r\n        event);\r\n}\r\n\r\ninline cl_int enqueueCopyBufferRect(\r\n    const Buffer& src,\r\n    const Buffer& dst,\r\n    const array<size_type, 2>& src_origin,\r\n    const array<size_type, 2>& dst_origin,\r\n    const array<size_type, 2>& region,\r\n    size_type src_row_pitch,\r\n    size_type src_slice_pitch,\r\n    
size_type dst_row_pitch,\r\n    size_type dst_slice_pitch,\r\n    const vector<Event>* events = nullptr,\r\n    Event* event = nullptr)\r\n{\r\n    return enqueueCopyBufferRect(\r\n        src,\r\n        dst, \r\n        { src_origin[0], src_origin[1], 0 },\r\n        { dst_origin[0], dst_origin[1], 0 },\r\n        { region[0], region[1], 1 }, \r\n        src_row_pitch,\r\n        src_slice_pitch,\r\n        dst_row_pitch,\r\n        dst_slice_pitch,\r\n        events,\r\n        event);\r\n}\r\n#endif // CL_HPP_TARGET_OPENCL_VERSION >= 110\r\n\r\ninline cl_int enqueueReadImage(\r\n    const Image& image,\r\n    cl_bool blocking,\r\n    const array<size_type, 3>& origin,\r\n    const array<size_type, 3>& region,\r\n    size_type row_pitch,\r\n    size_type slice_pitch,\r\n    void* ptr,\r\n    const vector<Event>* events = nullptr,\r\n    Event* event = nullptr) \r\n{\r\n    cl_int error;\r\n    CommandQueue queue = CommandQueue::getDefault(&error);\r\n\r\n    if (error != CL_SUCCESS) {\r\n        return error;\r\n    }\r\n\r\n    return queue.enqueueReadImage(\r\n        image,\r\n        blocking,\r\n        origin,\r\n        region,\r\n        row_pitch,\r\n        slice_pitch,\r\n        ptr,\r\n        events, \r\n        event);\r\n}\r\n\r\ninline cl_int enqueueReadImage(\r\n    const Image& image, \r\n    cl_bool blocking,\r\n    const array<size_type, 2>& origin,\r\n    const array<size_type, 2>& region,\r\n    size_type row_pitch,\r\n    size_type slice_pitch,\r\n    void* ptr, \r\n    const vector<Event>* events = nullptr,\r\n    Event* event = nullptr)\r\n{\r\n    return enqueueReadImage(\r\n        image,\r\n        blocking, \r\n        { origin[0], origin[1], 0 },\r\n        { region[0], region[1], 1 },\r\n        row_pitch,\r\n        slice_pitch,\r\n        ptr,\r\n        events,\r\n        event);\r\n}\r\n\r\ninline cl_int enqueueWriteImage(\r\n    const Image& image,\r\n    cl_bool blocking,\r\n    const array<size_type, 3>& origin,\r\n    
const array<size_type, 3>& region,\r\n    size_type row_pitch,\r\n    size_type slice_pitch,\r\n    const void* ptr,\r\n    const vector<Event>* events = nullptr,\r\n    Event* event = nullptr)\r\n{\r\n    cl_int error;\r\n    CommandQueue queue = CommandQueue::getDefault(&error);\r\n\r\n    if (error != CL_SUCCESS) {\r\n        return error;\r\n    }\r\n\r\n    return queue.enqueueWriteImage(\r\n        image,\r\n        blocking,\r\n        origin,\r\n        region,\r\n        row_pitch,\r\n        slice_pitch,\r\n        ptr,\r\n        events, \r\n        event);\r\n}\r\n\r\ninline cl_int enqueueWriteImage(\r\n    const Image& image, \r\n    cl_bool blocking,\r\n    const array<size_type, 2>& origin,\r\n    const array<size_type, 2>& region,\r\n    size_type row_pitch, \r\n    size_type slice_pitch,\r\n    const void* ptr,\r\n    const vector<Event>* events = nullptr,\r\n    Event* event = nullptr)\r\n{\r\n    return enqueueWriteImage(\r\n        image, \r\n        blocking, \r\n        { origin[0], origin[1], 0 },\r\n        { region[0], region[1], 1 }, \r\n        row_pitch,\r\n        slice_pitch,\r\n        ptr,\r\n        events,\r\n        event);    \r\n}\r\n\r\ninline cl_int enqueueCopyImage(\r\n    const Image& src,\r\n    const Image& dst,\r\n    const array<size_type, 3>& src_origin,\r\n    const array<size_type, 3>& dst_origin,\r\n    const array<size_type, 3>& region,\r\n    const vector<Event>* events = nullptr,\r\n    Event* event = nullptr)\r\n{\r\n    cl_int error;\r\n    CommandQueue queue = CommandQueue::getDefault(&error);\r\n\r\n    if (error != CL_SUCCESS) {\r\n        return error;\r\n    }\r\n\r\n    return queue.enqueueCopyImage(\r\n        src,\r\n        dst,\r\n        src_origin,\r\n        dst_origin,\r\n        region,\r\n        events,\r\n        event);\r\n}\r\n\r\ninline cl_int enqueueCopyImage(\r\n    const Image& src, \r\n    const Image& dst,\r\n    const array<size_type, 2>& src_origin,\r\n    const array<size_type, 2>& 
dst_origin,\r\n    const array<size_type, 2>& region,\r\n    const vector<Event>* events = nullptr,\r\n    Event* event = nullptr)\r\n{\r\n    return enqueueCopyImage(\r\n        src, \r\n        dst,\r\n        { src_origin[0], src_origin[1], 0 },\r\n        { dst_origin[0], dst_origin[1], 0 },\r\n        { region[0], region[1], 1 },\r\n        events,\r\n        event);\r\n}\r\n\r\ninline cl_int enqueueCopyImageToBuffer(\r\n    const Image& src,\r\n    const Buffer& dst,\r\n    const array<size_type, 3>& src_origin,\r\n    const array<size_type, 3>& region,\r\n    size_type dst_offset,\r\n    const vector<Event>* events = nullptr,\r\n    Event* event = nullptr)\r\n{\r\n    cl_int error;\r\n    CommandQueue queue = CommandQueue::getDefault(&error);\r\n\r\n    if (error != CL_SUCCESS) {\r\n        return error;\r\n    }\r\n\r\n    return queue.enqueueCopyImageToBuffer(\r\n        src,\r\n        dst,\r\n        src_origin,\r\n        region,\r\n        dst_offset,\r\n        events,\r\n        event);\r\n}\r\n\r\ninline cl_int enqueueCopyImageToBuffer(\r\n    const Image& src, \r\n    const Buffer& dst,\r\n    const array<size_type, 2>& src_origin,\r\n    const array<size_type, 2>& region,\r\n    size_type dst_offset,\r\n    const vector<Event>* events = nullptr,\r\n    Event* event = nullptr)\r\n{\r\n    return enqueueCopyImageToBuffer(\r\n        src,\r\n        dst,\r\n        { src_origin[0], src_origin[1], 0 },\r\n        { region[0], region[1], 1 },\r\n        dst_offset,\r\n        events,\r\n        event);\r\n}\r\n\r\ninline cl_int enqueueCopyBufferToImage(\r\n    const Buffer& src,\r\n    const Image& dst,\r\n    size_type src_offset,\r\n    const array<size_type, 3>& dst_origin,\r\n    const array<size_type, 3>& region,\r\n    const vector<Event>* events = nullptr,\r\n    Event* event = nullptr)\r\n{\r\n    cl_int error;\r\n    CommandQueue queue = CommandQueue::getDefault(&error);\r\n\r\n    if (error != CL_SUCCESS) {\r\n        return error;\r\n    
}\r\n\r\n    return queue.enqueueCopyBufferToImage(\r\n        src,\r\n        dst,\r\n        src_offset,\r\n        dst_origin,\r\n        region,\r\n        events,\r\n        event);\r\n}\r\n\r\ninline cl_int enqueueCopyBufferToImage(\r\n    const Buffer& src,\r\n    const Image& dst,\r\n    size_type src_offset,\r\n    const array<size_type, 2>& dst_origin,\r\n    const array<size_type, 2>& region,\r\n    const vector<Event>* events = nullptr,\r\n    Event* event = nullptr)\r\n{\r\n    cl_int error;\r\n    CommandQueue queue = CommandQueue::getDefault(&error);\r\n\r\n    if (error != CL_SUCCESS) {\r\n        return error;\r\n    }\r\n\r\n    return enqueueCopyBufferToImage(\r\n        src,\r\n        dst,\r\n        src_offset,\r\n        { dst_origin[0], dst_origin[1], 0 },\r\n        { region[0], region[1], 1 },\r\n        events,\r\n        event);\r\n}\r\n\r\ninline cl_int flush(void)\r\n{\r\n    cl_int error;\r\n    CommandQueue queue = CommandQueue::getDefault(&error);\r\n\r\n    if (error != CL_SUCCESS) {\r\n        return error;\r\n    }\r\n\r\n    return queue.flush();\r\n}\r\n\r\ninline cl_int finish(void)\r\n{\r\n    cl_int error;\r\n    CommandQueue queue = CommandQueue::getDefault(&error);\r\n\r\n    if (error != CL_SUCCESS) {\r\n        return error;\r\n    } \r\n\r\n\r\n    return queue.finish();\r\n}\r\n\r\nclass EnqueueArgs\r\n{\r\nprivate:\r\n    CommandQueue queue_;\r\n    const NDRange offset_;\r\n    const NDRange global_;\r\n    const NDRange local_;\r\n    vector<Event> events_;\r\n\r\n    template<typename... 
Ts>\r\n    friend class KernelFunctor;\r\n\r\npublic:\r\n    EnqueueArgs(NDRange global) : \r\n      queue_(CommandQueue::getDefault()),\r\n      offset_(NullRange), \r\n      global_(global),\r\n      local_(NullRange)\r\n    {\r\n\r\n    }\r\n\r\n    EnqueueArgs(NDRange global, NDRange local) : \r\n      queue_(CommandQueue::getDefault()),\r\n      offset_(NullRange), \r\n      global_(global),\r\n      local_(local)\r\n    {\r\n\r\n    }\r\n\r\n    EnqueueArgs(NDRange offset, NDRange global, NDRange local) : \r\n      queue_(CommandQueue::getDefault()),\r\n      offset_(offset), \r\n      global_(global),\r\n      local_(local)\r\n    {\r\n\r\n    }\r\n\r\n    EnqueueArgs(Event e, NDRange global) : \r\n      queue_(CommandQueue::getDefault()),\r\n      offset_(NullRange), \r\n      global_(global),\r\n      local_(NullRange)\r\n    {\r\n        events_.push_back(e);\r\n    }\r\n\r\n    EnqueueArgs(Event e, NDRange global, NDRange local) : \r\n      queue_(CommandQueue::getDefault()),\r\n      offset_(NullRange), \r\n      global_(global),\r\n      local_(local)\r\n    {\r\n        events_.push_back(e);\r\n    }\r\n\r\n    EnqueueArgs(Event e, NDRange offset, NDRange global, NDRange local) : \r\n      queue_(CommandQueue::getDefault()),\r\n      offset_(offset), \r\n      global_(global),\r\n      local_(local)\r\n    {\r\n        events_.push_back(e);\r\n    }\r\n\r\n    EnqueueArgs(const vector<Event> &events, NDRange global) : \r\n      queue_(CommandQueue::getDefault()),\r\n      offset_(NullRange), \r\n      global_(global),\r\n      local_(NullRange),\r\n      events_(events)\r\n    {\r\n\r\n    }\r\n\r\n    EnqueueArgs(const vector<Event> &events, NDRange global, NDRange local) : \r\n      queue_(CommandQueue::getDefault()),\r\n      offset_(NullRange), \r\n      global_(global),\r\n      local_(local),\r\n      events_(events)\r\n    {\r\n\r\n    }\r\n\r\n    EnqueueArgs(const vector<Event> &events, NDRange offset, NDRange global, NDRange local) : \r\n    
  queue_(CommandQueue::getDefault()),\r\n      offset_(offset), \r\n      global_(global),\r\n      local_(local),\r\n      events_(events)\r\n    {\r\n\r\n    }\r\n\r\n    EnqueueArgs(CommandQueue &queue, NDRange global) : \r\n      queue_(queue),\r\n      offset_(NullRange), \r\n      global_(global),\r\n      local_(NullRange)\r\n    {\r\n\r\n    }\r\n\r\n    EnqueueArgs(CommandQueue &queue, NDRange global, NDRange local) : \r\n      queue_(queue),\r\n      offset_(NullRange), \r\n      global_(global),\r\n      local_(local)\r\n    {\r\n\r\n    }\r\n\r\n    EnqueueArgs(CommandQueue &queue, NDRange offset, NDRange global, NDRange local) : \r\n      queue_(queue),\r\n      offset_(offset), \r\n      global_(global),\r\n      local_(local)\r\n    {\r\n\r\n    }\r\n\r\n    EnqueueArgs(CommandQueue &queue, Event e, NDRange global) : \r\n      queue_(queue),\r\n      offset_(NullRange), \r\n      global_(global),\r\n      local_(NullRange)\r\n    {\r\n        events_.push_back(e);\r\n    }\r\n\r\n    EnqueueArgs(CommandQueue &queue, Event e, NDRange global, NDRange local) : \r\n      queue_(queue),\r\n      offset_(NullRange), \r\n      global_(global),\r\n      local_(local)\r\n    {\r\n        events_.push_back(e);\r\n    }\r\n\r\n    EnqueueArgs(CommandQueue &queue, Event e, NDRange offset, NDRange global, NDRange local) : \r\n      queue_(queue),\r\n      offset_(offset), \r\n      global_(global),\r\n      local_(local)\r\n    {\r\n        events_.push_back(e);\r\n    }\r\n\r\n    EnqueueArgs(CommandQueue &queue, const vector<Event> &events, NDRange global) : \r\n      queue_(queue),\r\n      offset_(NullRange), \r\n      global_(global),\r\n      local_(NullRange),\r\n      events_(events)\r\n    {\r\n\r\n    }\r\n\r\n    EnqueueArgs(CommandQueue &queue, const vector<Event> &events, NDRange global, NDRange local) : \r\n      queue_(queue),\r\n      offset_(NullRange), \r\n      global_(global),\r\n      local_(local),\r\n      events_(events)\r\n    {\r\n\r\n   
 }\r\n\r\n    EnqueueArgs(CommandQueue &queue, const vector<Event> &events, NDRange offset, NDRange global, NDRange local) : \r\n      queue_(queue),\r\n      offset_(offset), \r\n      global_(global),\r\n      local_(local),\r\n      events_(events)\r\n    {\r\n\r\n    }\r\n};\r\n\r\n\r\n//----------------------------------------------------------------------------------------------\r\n\r\n\r\n/**\r\n * Type safe kernel functor.\r\n * \r\n */\r\ntemplate<typename... Ts>\r\nclass KernelFunctor\r\n{\r\nprivate:\r\n    Kernel kernel_;\r\n\r\n    template<int index, typename T0, typename... T1s>\r\n    void setArgs(T0&& t0, T1s&&... t1s)\r\n    {\r\n        kernel_.setArg(index, t0);\r\n        setArgs<index + 1, T1s...>(std::forward<T1s>(t1s)...);\r\n    }\r\n\r\n    template<int index, typename T0>\r\n    void setArgs(T0&& t0)\r\n    {\r\n        kernel_.setArg(index, t0);\r\n    }\r\n\r\n    template<int index>\r\n    void setArgs()\r\n    {\r\n    }\r\n\r\n\r\npublic:\r\n    KernelFunctor(Kernel kernel) : kernel_(kernel)\r\n    {}\r\n\r\n    KernelFunctor(\r\n        const Program& program,\r\n        const string name,\r\n        cl_int * err = nullptr) :\r\n        kernel_(program, name.c_str(), err)\r\n    {}\r\n\r\n    //! \\brief Return type of the functor\r\n    typedef Event result_type;\r\n\r\n    /**\r\n     * Enqueue kernel.\r\n     * @param args Launch parameters of the kernel.\r\n     * @param t0... List of kernel arguments based on the template type of the functor.\r\n     */\r\n    Event operator() (\r\n        const EnqueueArgs& args,\r\n        Ts... 
ts)\r\n    {\r\n        Event event;\r\n        setArgs<0>(std::forward<Ts>(ts)...);\r\n        \r\n        args.queue_.enqueueNDRangeKernel(\r\n            kernel_,\r\n            args.offset_,\r\n            args.global_,\r\n            args.local_,\r\n            &args.events_,\r\n            &event);\r\n\r\n        return event;\r\n    }\r\n\r\n    /**\r\n    * Enqueue kernel with support for error code.\r\n    * @param args Launch parameters of the kernel.\r\n    * @param t0... List of kernel arguments based on the template type of the functor.\r\n    * @param error Out parameter returning the error code from the execution.\r\n    */\r\n    Event operator() (\r\n        const EnqueueArgs& args,\r\n        Ts... ts,\r\n        cl_int &error)\r\n    {\r\n        Event event;\r\n        setArgs<0>(std::forward<Ts>(ts)...);\r\n\r\n        error = args.queue_.enqueueNDRangeKernel(\r\n            kernel_,\r\n            args.offset_,\r\n            args.global_,\r\n            args.local_,\r\n            &args.events_,\r\n            &event);\r\n        \r\n        return event;\r\n    }\r\n\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 200\r\n    cl_int setSVMPointers(const vector<void*> &pointerList)\r\n    {\r\n        return kernel_.setSVMPointers(pointerList);\r\n    }\r\n\r\n    template<typename T0, typename... T1s>\r\n    cl_int setSVMPointers(const T0 &t0, T1s &... ts)\r\n    {\r\n        return kernel_.setSVMPointers(t0, ts...);\r\n    }\r\n#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200\r\n\r\n    Kernel getKernel()\r\n    {\r\n        return kernel_;\r\n    }\r\n};\r\n\r\nnamespace compatibility {\r\n    /**\r\n     * Backward compatibility class to ensure that cl.hpp code works with opencl.hpp.\r\n     * Please use KernelFunctor directly.\r\n     */\r\n    template<typename... 
Ts>\r\n    struct make_kernel\r\n    {\r\n        typedef KernelFunctor<Ts...> FunctorType;\r\n\r\n        FunctorType functor_;\r\n\r\n        make_kernel(\r\n            const Program& program,\r\n            const string name,\r\n            cl_int * err = nullptr) :\r\n            functor_(FunctorType(program, name, err))\r\n        {}\r\n\r\n        make_kernel(\r\n            const Kernel kernel) :\r\n            functor_(FunctorType(kernel))\r\n        {}\r\n\r\n        //! \\brief Return type of the functor\r\n        typedef Event result_type;\r\n\r\n        //! \\brief Function signature of kernel functor with no event dependency.\r\n        typedef Event type_(\r\n            const EnqueueArgs&,\r\n            Ts...);\r\n\r\n        Event operator()(\r\n            const EnqueueArgs& enqueueArgs,\r\n            Ts... args)\r\n        {\r\n            return functor_(\r\n                enqueueArgs, args...);\r\n        }\r\n    };\r\n} // namespace compatibility\r\n\r\n#ifdef cl_khr_semaphore\r\n\r\n#ifdef cl_khr_external_semaphore\r\nenum ExternalSemaphoreType : cl_external_semaphore_handle_type_khr\r\n{\r\n    None = 0,\r\n#ifdef cl_khr_external_semaphore_opaque_fd\r\n    OpaqueFd = CL_SEMAPHORE_HANDLE_OPAQUE_FD_KHR,\r\n#endif // cl_khr_external_semaphore_opaque_fd\r\n#ifdef cl_khr_external_semaphore_sync_fd\r\n    SyncFd = CL_SEMAPHORE_HANDLE_SYNC_FD_KHR,\r\n#endif // cl_khr_external_semaphore_sync_fd\r\n#ifdef cl_khr_external_semaphore_win32\r\n    OpaqueWin32 = CL_SEMAPHORE_HANDLE_OPAQUE_WIN32_KHR,\r\n    OpaqueWin32Kmt = CL_SEMAPHORE_HANDLE_OPAQUE_WIN32_KMT_KHR,\r\n#endif // cl_khr_external_semaphore_win32\r\n};\r\n#endif // cl_khr_external_semaphore\r\n\r\nclass Semaphore : public detail::Wrapper<cl_semaphore_khr>\r\n{\r\npublic:\r\n    Semaphore() : detail::Wrapper<cl_type>() {}\r\n    Semaphore(\r\n        const Context &context,\r\n        const vector<cl_semaphore_properties_khr>& sema_props,\r\n        cl_int *err = nullptr) \r\n    {\r\n     
   /* initialization of addresses to extension functions (it is done only once) */\r\n        std::call_once(ext_init_, initExtensions, context);\r\n\r\n        cl_int error = CL_INVALID_OPERATION;\r\n\r\n        if (pfn_clCreateSemaphoreWithPropertiesKHR)\r\n        {\r\n            object_ = pfn_clCreateSemaphoreWithPropertiesKHR(\r\n                context(),\r\n                sema_props.data(),\r\n                &error);\r\n        }\r\n          \r\n        detail::errHandler(error, __CREATE_SEMAPHORE_KHR_WITH_PROPERTIES_ERR);\r\n\r\n        if (err != nullptr) {\r\n            *err = error;\r\n        }\r\n    }\r\n    Semaphore(\r\n        const vector<cl_semaphore_properties_khr>& sema_props,\r\n        cl_int* err = nullptr):Semaphore(Context::getDefault(err), sema_props, err) {}\r\n    \r\n    explicit Semaphore(const cl_semaphore_khr& semaphore, bool retainObject = false) :\r\n        detail::Wrapper<cl_type>(semaphore, retainObject) {}\r\n    Semaphore& operator = (const cl_semaphore_khr& rhs) {\r\n        detail::Wrapper<cl_type>::operator=(rhs);\r\n        return *this;\r\n    }\r\n    template <typename T>\r\n    cl_int getInfo(cl_semaphore_info_khr name, T* param) const\r\n    {\r\n        if (pfn_clGetSemaphoreInfoKHR == nullptr) {\r\n            return detail::errHandler(CL_INVALID_OPERATION,\r\n                                      __GET_SEMAPHORE_KHR_INFO_ERR);\r\n        }\r\n\r\n        return detail::errHandler(\r\n            detail::getInfo(pfn_clGetSemaphoreInfoKHR, object_, name, param),\r\n            __GET_SEMAPHORE_KHR_INFO_ERR);\r\n    }\r\n    template <cl_semaphore_info_khr name> typename\r\n    detail::param_traits<detail::cl_semaphore_info_khr, name>::param_type\r\n    getInfo(cl_int* err = nullptr) const\r\n    {\r\n        typename detail::param_traits<\r\n            detail::cl_semaphore_info_khr, name>::param_type param;\r\n        cl_int result = getInfo(name, &param);\r\n        if (err != nullptr) {\r\n            *err = 
result;        \r\n        }\r\n        return param;      \r\n    }\r\n\r\n#ifdef cl_khr_external_semaphore\r\n    template <typename T>\r\n    cl_int getHandleForTypeKHR(\r\n        const Device& device, cl_external_semaphore_handle_type_khr name, T* param) const\r\n    {\r\n        if (pfn_clGetSemaphoreHandleForTypeKHR == nullptr) {\r\n            return detail::errHandler(CL_INVALID_OPERATION,\r\n                                      __GET_SEMAPHORE_HANDLE_FOR_TYPE_KHR_ERR);\r\n        }\r\n\r\n        return detail::errHandler(\r\n            detail::getInfo(\r\n                pfn_clGetSemaphoreHandleForTypeKHR, object_, device(), name, param),\r\n                __GET_SEMAPHORE_HANDLE_FOR_TYPE_KHR_ERR);\r\n    }\r\n\r\n    template <cl_external_semaphore_handle_type_khr type> typename\r\n    detail::param_traits<detail::cl_external_semaphore_handle_type_khr, type>::param_type\r\n        getHandleForTypeKHR(const Device& device, cl_int* err = nullptr) const\r\n    {\r\n        typename detail::param_traits<\r\n        detail::cl_external_semaphore_handle_type_khr, type>::param_type param;\r\n        cl_int result = getHandleForTypeKHR(device, type, &param);\r\n        if (err != nullptr) {\r\n            *err = result;\r\n        }\r\n        return param;\r\n    }\r\n#endif // cl_khr_external_semaphore\r\n\r\n    cl_int retain()\r\n    { \r\n        if (pfn_clRetainSemaphoreKHR == nullptr) {\r\n            return detail::errHandler(CL_INVALID_OPERATION,\r\n                                      __RETAIN_SEMAPHORE_KHR_ERR);\r\n        }\r\n        return pfn_clRetainSemaphoreKHR(object_);\r\n    }\r\n\r\n    cl_int release()\r\n    { \r\n        if (pfn_clReleaseSemaphoreKHR == nullptr) {\r\n            return detail::errHandler(CL_INVALID_OPERATION,\r\n                                      __RELEASE_SEMAPHORE_KHR_ERR);\r\n        }\r\n        return pfn_clReleaseSemaphoreKHR(object_);\r\n    }\r\n\r\nprivate:\r\n    static std::once_flag ext_init_;\r\n\r\n   
 static void initExtensions(const Context& context)\r\n    {\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 120\r\n        Device device = context.getInfo<CL_CONTEXT_DEVICES>().at(0);\r\n        cl_platform_id platform = device.getInfo<CL_DEVICE_PLATFORM>()();\r\n        CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clCreateSemaphoreWithPropertiesKHR);\r\n        CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clReleaseSemaphoreKHR);\r\n        CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clRetainSemaphoreKHR);\r\n        CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clEnqueueWaitSemaphoresKHR);\r\n        CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clEnqueueSignalSemaphoresKHR);\r\n        CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clGetSemaphoreInfoKHR);\r\n#ifdef cl_khr_external_semaphore\r\n        CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clGetSemaphoreHandleForTypeKHR);\r\n#endif // cl_khr_external_semaphore\r\n\r\n#else\r\n        CL_HPP_INIT_CL_EXT_FCN_PTR_(clCreateSemaphoreWithPropertiesKHR);\r\n        CL_HPP_INIT_CL_EXT_FCN_PTR_(clReleaseSemaphoreKHR);\r\n        CL_HPP_INIT_CL_EXT_FCN_PTR_(clRetainSemaphoreKHR);\r\n        CL_HPP_INIT_CL_EXT_FCN_PTR_(clEnqueueWaitSemaphoresKHR);\r\n        CL_HPP_INIT_CL_EXT_FCN_PTR_(clEnqueueSignalSemaphoresKHR);\r\n        CL_HPP_INIT_CL_EXT_FCN_PTR_(clGetSemaphoreInfoKHR);\r\n#ifdef cl_khr_external_semaphore\r\n        CL_HPP_INIT_CL_EXT_FCN_PTR_(clGetSemaphoreHandleForTypeKHR);\r\n#endif // cl_khr_external_semaphore\r\n\r\n#endif\r\n        if ((pfn_clCreateSemaphoreWithPropertiesKHR == nullptr) &&\r\n            (pfn_clReleaseSemaphoreKHR              == nullptr) &&\r\n            (pfn_clRetainSemaphoreKHR               == nullptr) &&\r\n            (pfn_clEnqueueWaitSemaphoresKHR         == nullptr) &&\r\n            (pfn_clEnqueueSignalSemaphoresKHR       == nullptr) &&\r\n#ifdef cl_khr_external_semaphore\r\n            (pfn_clGetSemaphoreHandleForTypeKHR     == nullptr) &&\r\n#endif // 
cl_khr_external_semaphore\r\n            (pfn_clGetSemaphoreInfoKHR              == nullptr))\r\n        {\r\n            detail::errHandler(CL_INVALID_VALUE, __CREATE_SEMAPHORE_KHR_WITH_PROPERTIES_ERR);\r\n        }\r\n    }\r\n\r\n};\r\n\r\nCL_HPP_DEFINE_STATIC_MEMBER_ std::once_flag Semaphore::ext_init_;\r\n\r\ninline cl_int CommandQueue::enqueueWaitSemaphores(\r\n    const vector<Semaphore> &sema_objects,\r\n    const vector<cl_semaphore_payload_khr> &sema_payloads,\r\n    const vector<Event>* events_wait_list,\r\n    Event *event) const\r\n{\r\n    cl_event tmp;\r\n    cl_int err = CL_INVALID_OPERATION;\r\n\r\n    if (pfn_clEnqueueWaitSemaphoresKHR != nullptr) {\r\n        err = pfn_clEnqueueWaitSemaphoresKHR(\r\n                object_,\r\n                (cl_uint)sema_objects.size(),\r\n                (const cl_semaphore_khr *) &sema_objects.front(),\r\n                (sema_payloads.size() > 0) ? &sema_payloads.front() : nullptr,\r\n                (events_wait_list != nullptr) ? (cl_uint) events_wait_list->size() : 0,\r\n                (events_wait_list != nullptr && events_wait_list->size() > 0) ? (cl_event*) &events_wait_list->front() : nullptr,\r\n                (event != nullptr) ? 
&tmp : nullptr);\r\n    }\r\n\r\n    detail::errHandler(err, __ENQUEUE_WAIT_SEMAPHORE_KHR_ERR);\r\n\r\n    if (event != nullptr && err == CL_SUCCESS)\r\n        *event = tmp;\r\n\r\n    return err;\r\n}\r\n\r\ninline cl_int CommandQueue::enqueueSignalSemaphores(\r\n    const vector<Semaphore> &sema_objects,\r\n    const vector<cl_semaphore_payload_khr>& sema_payloads,\r\n    const vector<Event>* events_wait_list,\r\n    Event* event)\r\n{\r\n    cl_event tmp;\r\n    cl_int err = CL_INVALID_OPERATION;\r\n\r\n    if (pfn_clEnqueueSignalSemaphoresKHR != nullptr) {\r\n        err = pfn_clEnqueueSignalSemaphoresKHR(\r\n                object_,\r\n                (cl_uint)sema_objects.size(),\r\n                (const cl_semaphore_khr*) &sema_objects.front(),\r\n                (sema_payloads.size() > 0) ? &sema_payloads.front() : nullptr,\r\n                (events_wait_list != nullptr) ? (cl_uint) events_wait_list->size() : 0,\r\n                (events_wait_list != nullptr && events_wait_list->size() > 0) ? (cl_event*) &events_wait_list->front() : nullptr,\r\n                (event != nullptr) ? &tmp : nullptr);\r\n    }\r\n\r\n    detail::errHandler(err, __ENQUEUE_SIGNAL_SEMAPHORE_KHR_ERR);\r\n\r\n    if (event != nullptr && err == CL_SUCCESS)\r\n        *event = tmp;\r\n\r\n    return err;\r\n}\r\n\r\n#endif // cl_khr_semaphore\r\n\r\n#if defined(cl_khr_command_buffer)\r\n/*! \\class CommandBufferKhr\r\n * \\brief CommandBufferKhr interface for cl_command_buffer_khr.\r\n */\r\nclass CommandBufferKhr : public detail::Wrapper<cl_command_buffer_khr>\r\n{\r\npublic:\r\n    //! 
\\brief Default constructor - initializes to nullptr.\r\n    CommandBufferKhr() : detail::Wrapper<cl_type>() { }\r\n\r\n    explicit CommandBufferKhr(const vector<CommandQueue> &queues,\r\n        cl_command_buffer_properties_khr properties = 0,\r\n        cl_int* errcode_ret = nullptr)\r\n    {\r\n        cl_command_buffer_properties_khr command_buffer_properties[] = {\r\n            CL_COMMAND_BUFFER_FLAGS_KHR, properties, 0\r\n        };\r\n\r\n        /* initialization of addresses to extension functions (it is done only once) */\r\n        std::call_once(ext_init_, [&] { initExtensions(queues[0].getInfo<CL_QUEUE_DEVICE>()); });\r\n        cl_int error = CL_INVALID_OPERATION;\r\n\r\n        static_assert(sizeof(cl::CommandQueue) == sizeof(cl_command_queue),\r\n            \"Size of cl::CommandQueue must be equal to size of cl_command_queue\");\r\n\r\n        if (pfn_clCreateCommandBufferKHR)\r\n        {\r\n            object_ = pfn_clCreateCommandBufferKHR((cl_uint) queues.size(),\r\n                (cl_command_queue *) &queues.front(),\r\n                command_buffer_properties,\r\n                &error);\r\n        }\r\n\r\n        detail::errHandler(error, __CREATE_COMMAND_BUFFER_KHR_ERR);\r\n        if (errcode_ret != nullptr) {\r\n            *errcode_ret = error;\r\n        }\r\n    }\r\n\r\n    explicit CommandBufferKhr(const cl_command_buffer_khr& commandBufferKhr, bool retainObject = false) :\r\n        detail::Wrapper<cl_type>(commandBufferKhr, retainObject) { }\r\n\r\n    CommandBufferKhr& operator=(const cl_command_buffer_khr& rhs)\r\n    {\r\n        detail::Wrapper<cl_type>::operator=(rhs);\r\n        return *this;\r\n    }\r\n\r\n    template <typename T>\r\n    cl_int getInfo(cl_command_buffer_info_khr name, T* param) const\r\n    {\r\n        if (pfn_clGetCommandBufferInfoKHR == nullptr) {\r\n            return detail::errHandler(CL_INVALID_OPERATION,\r\n                    __GET_COMMAND_BUFFER_INFO_KHR_ERR);\r\n        }\r\n        return 
detail::errHandler(\r\n            detail::getInfo(pfn_clGetCommandBufferInfoKHR, object_, name, param),\r\n                __GET_COMMAND_BUFFER_INFO_KHR_ERR);\r\n    }\r\n\r\n    template <cl_command_buffer_info_khr name> typename\r\n        detail::param_traits<detail::cl_command_buffer_info_khr, name>::param_type\r\n        getInfo(cl_int* err = nullptr) const\r\n    {\r\n        typename detail::param_traits<\r\n            detail::cl_command_buffer_info_khr, name>::param_type param;\r\n        cl_int result = getInfo(name, &param);\r\n        if (err != nullptr) {\r\n            *err = result;\r\n        }\r\n        return param;\r\n    }\r\n\r\n    cl_int finalizeCommandBuffer() const\r\n    {\r\n        return detail::errHandler(::clFinalizeCommandBufferKHR(object_), __FINALIZE_COMMAND_BUFFER_KHR_ERR);\r\n    }\r\n\r\n    cl_int enqueueCommandBuffer(vector<CommandQueue> &queues,\r\n        const vector<Event>* events = nullptr,\r\n        Event* event = nullptr)\r\n    {\r\n        if (pfn_clEnqueueCommandBufferKHR == nullptr) {\r\n            return detail::errHandler(CL_INVALID_OPERATION,\r\n                    __ENQUEUE_COMMAND_BUFFER_KHR_ERR);\r\n        }\r\n\r\n         static_assert(sizeof(cl::CommandQueue) == sizeof(cl_command_queue),\r\n            \"Size of cl::CommandQueue must be equal to size of cl_command_queue\");\r\n\r\n        return detail::errHandler(pfn_clEnqueueCommandBufferKHR((cl_uint) queues.size(),\r\n                (cl_command_queue *) &queues.front(),\r\n                object_,\r\n                (events != nullptr) ? (cl_uint) events->size() : 0,\r\n                (events != nullptr && events->size() > 0) ? 
(cl_event*) &events->front() : nullptr,\r\n                (cl_event*) event),\r\n                __ENQUEUE_COMMAND_BUFFER_KHR_ERR);\r\n    }\r\n\r\n    cl_int commandBarrierWithWaitList(const vector<cl_sync_point_khr>* sync_points_vec = nullptr,\r\n        cl_sync_point_khr* sync_point = nullptr,\r\n        MutableCommandKhr* mutable_handle = nullptr,\r\n        const CommandQueue* command_queue = nullptr)\r\n    {\r\n        if (pfn_clCommandBarrierWithWaitListKHR == nullptr) {\r\n            return detail::errHandler(CL_INVALID_OPERATION,\r\n                    __COMMAND_BARRIER_WITH_WAIT_LIST_KHR_ERR);\r\n        }\r\n\r\n        cl_sync_point_khr tmp_sync_point;\r\n        cl_int error = detail::errHandler(\r\n            pfn_clCommandBarrierWithWaitListKHR(object_,\r\n                (command_queue != nullptr) ? (*command_queue)() : nullptr,\r\n#if CL_KHR_COMMAND_BUFFER_EXTENSION_VERSION > CL_MAKE_VERSION(0, 9, 4)\r\n                nullptr, // Properties\r\n#endif\r\n                (sync_points_vec != nullptr) ? (cl_uint) sync_points_vec->size() : 0,\r\n                (sync_points_vec != nullptr && sync_points_vec->size() > 0) ? &sync_points_vec->front() : nullptr,\r\n                (sync_point != nullptr) ? 
&tmp_sync_point : nullptr,\r\n                (cl_mutable_command_khr*) mutable_handle),\r\n            __COMMAND_BARRIER_WITH_WAIT_LIST_KHR_ERR);\r\n\r\n        if (sync_point != nullptr && error == CL_SUCCESS)\r\n            *sync_point = tmp_sync_point;\r\n\r\n        return error;\r\n    }\r\n\r\n    cl_int commandCopyBuffer(const Buffer& src,\r\n        const Buffer& dst,\r\n        size_type src_offset,\r\n        size_type dst_offset,\r\n        size_type size,\r\n        const vector<cl_sync_point_khr>* sync_points_vec = nullptr,\r\n        cl_sync_point_khr* sync_point = nullptr,\r\n        MutableCommandKhr* mutable_handle = nullptr,\r\n        const CommandQueue* command_queue = nullptr)\r\n    {\r\n        if (pfn_clCommandCopyBufferKHR == nullptr) {\r\n            return detail::errHandler(CL_INVALID_OPERATION,\r\n                    __COMMAND_COPY_BUFFER_KHR_ERR);\r\n        }\r\n\r\n        cl_sync_point_khr tmp_sync_point;\r\n        cl_int error = detail::errHandler(\r\n            pfn_clCommandCopyBufferKHR(object_,\r\n                (command_queue != nullptr) ? (*command_queue)() : nullptr,\r\n#if CL_KHR_COMMAND_BUFFER_EXTENSION_VERSION > CL_MAKE_VERSION(0, 9, 4)\r\n                nullptr, // Properties\r\n#endif\r\n                src(),\r\n                dst(),\r\n                src_offset,\r\n                dst_offset,\r\n                size,\r\n                (sync_points_vec != nullptr) ? (cl_uint) sync_points_vec->size() : 0,\r\n                (sync_points_vec != nullptr && sync_points_vec->size() > 0) ? &sync_points_vec->front() : nullptr,\r\n                (sync_point != nullptr) ? 
&tmp_sync_point : nullptr,\r\n                (cl_mutable_command_khr*) mutable_handle),\r\n            __COMMAND_COPY_BUFFER_KHR_ERR);\r\n\r\n        if (sync_point != nullptr && error == CL_SUCCESS)\r\n            *sync_point = tmp_sync_point;\r\n\r\n        return error;\r\n    }\r\n\r\n    cl_int commandCopyBufferRect(const Buffer& src,\r\n        const Buffer& dst,\r\n        const array<size_type, 3>& src_origin,\r\n        const array<size_type, 3>& dst_origin,\r\n        const array<size_type, 3>& region,\r\n        size_type src_row_pitch,\r\n        size_type src_slice_pitch,\r\n        size_type dst_row_pitch,\r\n        size_type dst_slice_pitch,\r\n        const vector<cl_sync_point_khr>* sync_points_vec = nullptr,\r\n        cl_sync_point_khr* sync_point = nullptr,\r\n        MutableCommandKhr* mutable_handle = nullptr,\r\n        const CommandQueue* command_queue = nullptr)\r\n    {\r\n        if (pfn_clCommandCopyBufferRectKHR == nullptr) {\r\n            return detail::errHandler(CL_INVALID_OPERATION,\r\n                    __COMMAND_COPY_BUFFER_RECT_KHR_ERR);\r\n        }\r\n\r\n        cl_sync_point_khr tmp_sync_point;\r\n        cl_int error = detail::errHandler(\r\n            pfn_clCommandCopyBufferRectKHR(object_,\r\n                (command_queue != nullptr) ? (*command_queue)() : nullptr,\r\n#if CL_KHR_COMMAND_BUFFER_EXTENSION_VERSION > CL_MAKE_VERSION(0, 9, 4)\r\n                nullptr, // Properties\r\n#endif\r\n                src(),\r\n                dst(),\r\n                src_origin.data(),\r\n                dst_origin.data(),\r\n                region.data(),\r\n                src_row_pitch,\r\n                src_slice_pitch,\r\n                dst_row_pitch,\r\n                dst_slice_pitch,\r\n                (sync_points_vec != nullptr) ? (cl_uint) sync_points_vec->size() : 0,\r\n                (sync_points_vec != nullptr && sync_points_vec->size() > 0) ? 
&sync_points_vec->front() : nullptr,\r\n                (sync_point != nullptr) ? &tmp_sync_point : nullptr,\r\n                (cl_mutable_command_khr*) mutable_handle),\r\n            __COMMAND_COPY_BUFFER_RECT_KHR_ERR);\r\n\r\n        if (sync_point != nullptr && error == CL_SUCCESS)\r\n            *sync_point = tmp_sync_point;\r\n\r\n        return error;\r\n    }\r\n\r\n    cl_int commandCopyBufferToImage(const Buffer& src,\r\n        const Image& dst,\r\n        size_type src_offset,\r\n        const array<size_type, 3>& dst_origin,\r\n        const array<size_type, 3>& region,\r\n        const vector<cl_sync_point_khr>* sync_points_vec = nullptr,\r\n        cl_sync_point_khr* sync_point = nullptr,\r\n        MutableCommandKhr* mutable_handle = nullptr,\r\n        const CommandQueue* command_queue = nullptr)\r\n    {\r\n        if (pfn_clCommandCopyBufferToImageKHR == nullptr) {\r\n            return detail::errHandler(CL_INVALID_OPERATION,\r\n                    __COMMAND_COPY_BUFFER_TO_IMAGE_KHR_ERR);\r\n        }\r\n\r\n        cl_sync_point_khr tmp_sync_point;\r\n        cl_int error = detail::errHandler(\r\n            pfn_clCommandCopyBufferToImageKHR(object_,\r\n                (command_queue != nullptr) ? (*command_queue)() : nullptr,\r\n#if CL_KHR_COMMAND_BUFFER_EXTENSION_VERSION > CL_MAKE_VERSION(0, 9, 4)\r\n                nullptr, // Properties\r\n#endif\r\n                src(),\r\n                dst(),\r\n                src_offset,\r\n                dst_origin.data(),\r\n                region.data(),\r\n                (sync_points_vec != nullptr) ? (cl_uint) sync_points_vec->size() : 0,\r\n                (sync_points_vec != nullptr && sync_points_vec->size() > 0) ? &sync_points_vec->front() : nullptr,\r\n                (sync_point != nullptr) ? 
&tmp_sync_point : nullptr,\r\n                (cl_mutable_command_khr*) mutable_handle),\r\n            __COMMAND_COPY_BUFFER_TO_IMAGE_KHR_ERR);\r\n\r\n        if (sync_point != nullptr && error == CL_SUCCESS)\r\n            *sync_point = tmp_sync_point;\r\n\r\n        return error;\r\n    }\r\n\r\n    cl_int commandCopyImage(const Image& src,\r\n        const Image& dst,\r\n        const array<size_type, 3>& src_origin,\r\n        const array<size_type, 3>& dst_origin,\r\n        const array<size_type, 3>& region,\r\n        const vector<cl_sync_point_khr>* sync_points_vec = nullptr,\r\n        cl_sync_point_khr* sync_point = nullptr,\r\n        MutableCommandKhr* mutable_handle = nullptr,\r\n        const CommandQueue* command_queue = nullptr)\r\n    {\r\n        if (pfn_clCommandCopyImageKHR == nullptr) {\r\n            return detail::errHandler(CL_INVALID_OPERATION,\r\n                    __COMMAND_COPY_IMAGE_KHR_ERR);\r\n        }\r\n\r\n        cl_sync_point_khr tmp_sync_point;\r\n        cl_int error = detail::errHandler(\r\n            pfn_clCommandCopyImageKHR(object_,\r\n                (command_queue != nullptr) ? (*command_queue)() : nullptr,\r\n#if CL_KHR_COMMAND_BUFFER_EXTENSION_VERSION > CL_MAKE_VERSION(0, 9, 4)\r\n                nullptr, // Properties\r\n#endif\r\n                src(),\r\n                dst(),\r\n                src_origin.data(),\r\n                dst_origin.data(),\r\n                region.data(),\r\n                (sync_points_vec != nullptr) ? (cl_uint) sync_points_vec->size() : 0,\r\n                (sync_points_vec != nullptr && sync_points_vec->size() > 0) ? &sync_points_vec->front() : nullptr,\r\n                (sync_point != nullptr) ? 
&tmp_sync_point : nullptr,\r\n                (cl_mutable_command_khr*) mutable_handle),\r\n            __COMMAND_COPY_IMAGE_KHR_ERR);\r\n\r\n        if (sync_point != nullptr && error == CL_SUCCESS)\r\n            *sync_point = tmp_sync_point;\r\n\r\n        return error;\r\n    }\r\n\r\n    cl_int commandCopyImageToBuffer(const Image& src,\r\n        const Buffer& dst,\r\n        const array<size_type, 3>& src_origin,\r\n        const array<size_type, 3>& region,\r\n        size_type dst_offset,\r\n        const vector<cl_sync_point_khr>* sync_points_vec = nullptr,\r\n        cl_sync_point_khr* sync_point = nullptr,\r\n        MutableCommandKhr* mutable_handle = nullptr,\r\n        const CommandQueue* command_queue = nullptr)\r\n    {\r\n        if (pfn_clCommandCopyImageToBufferKHR == nullptr) {\r\n            return detail::errHandler(CL_INVALID_OPERATION,\r\n                    __COMMAND_COPY_IMAGE_TO_BUFFER_KHR_ERR);\r\n        }\r\n\r\n        cl_sync_point_khr tmp_sync_point;\r\n        cl_int error = detail::errHandler(\r\n            pfn_clCommandCopyImageToBufferKHR(object_,\r\n                (command_queue != nullptr) ? (*command_queue)() : nullptr,\r\n#if CL_KHR_COMMAND_BUFFER_EXTENSION_VERSION > CL_MAKE_VERSION(0, 9, 4)\r\n                nullptr, // Properties\r\n#endif\r\n                src(),\r\n                dst(),\r\n                src_origin.data(),\r\n                region.data(),\r\n                dst_offset,\r\n                (sync_points_vec != nullptr) ? (cl_uint) sync_points_vec->size() : 0,\r\n                (sync_points_vec != nullptr && sync_points_vec->size() > 0) ? &sync_points_vec->front() : nullptr,\r\n                (sync_point != nullptr) ? 
&tmp_sync_point : nullptr,\r\n                (cl_mutable_command_khr*) mutable_handle),\r\n            __COMMAND_COPY_IMAGE_TO_BUFFER_KHR_ERR);\r\n\r\n        if (sync_point != nullptr && error == CL_SUCCESS)\r\n            *sync_point = tmp_sync_point;\r\n\r\n        return error;\r\n    }\r\n\r\n    template<typename PatternType>\r\n    cl_int commandFillBuffer(const Buffer& buffer,\r\n        PatternType pattern,\r\n        size_type offset,\r\n        size_type size,\r\n        const vector<cl_sync_point_khr>* sync_points_vec = nullptr,\r\n        cl_sync_point_khr* sync_point = nullptr,\r\n        MutableCommandKhr* mutable_handle = nullptr,\r\n        const CommandQueue* command_queue = nullptr)\r\n    {\r\n        if (pfn_clCommandFillBufferKHR == nullptr) {\r\n            return detail::errHandler(CL_INVALID_OPERATION,\r\n                    __COMMAND_FILL_BUFFER_KHR_ERR);\r\n        }\r\n\r\n        cl_sync_point_khr tmp_sync_point;\r\n        cl_int error = detail::errHandler(\r\n            pfn_clCommandFillBufferKHR(object_,\r\n                (command_queue != nullptr) ? (*command_queue)() : nullptr,\r\n#if CL_KHR_COMMAND_BUFFER_EXTENSION_VERSION > CL_MAKE_VERSION(0, 9, 4)\r\n                nullptr, // Properties\r\n#endif\r\n                buffer(),\r\n                static_cast<void*>(&pattern),\r\n                sizeof(PatternType),\r\n                offset,\r\n                size,\r\n                (sync_points_vec != nullptr) ? (cl_uint) sync_points_vec->size() : 0,\r\n                (sync_points_vec != nullptr && sync_points_vec->size() > 0) ? &sync_points_vec->front() : nullptr,\r\n                (sync_point != nullptr) ? 
&tmp_sync_point : nullptr,\r\n                (cl_mutable_command_khr*) mutable_handle),\r\n            __COMMAND_FILL_BUFFER_KHR_ERR);\r\n\r\n        if (sync_point != nullptr && error == CL_SUCCESS)\r\n            *sync_point = tmp_sync_point;\r\n\r\n        return error;\r\n    }\r\n\r\n    cl_int commandFillImage(const Image& image,\r\n        cl_float4 fillColor,\r\n        const array<size_type, 3>& origin,\r\n        const array<size_type, 3>& region,\r\n        const vector<cl_sync_point_khr>* sync_points_vec = nullptr,\r\n        cl_sync_point_khr* sync_point = nullptr,\r\n        MutableCommandKhr* mutable_handle = nullptr,\r\n        const CommandQueue* command_queue = nullptr)\r\n    {\r\n        if (pfn_clCommandFillImageKHR == nullptr) {\r\n            return detail::errHandler(CL_INVALID_OPERATION,\r\n                    __COMMAND_FILL_IMAGE_KHR_ERR);\r\n        }\r\n\r\n        cl_sync_point_khr tmp_sync_point;\r\n        cl_int error = detail::errHandler(\r\n            pfn_clCommandFillImageKHR(object_,\r\n                (command_queue != nullptr) ? (*command_queue)() : nullptr,\r\n#if CL_KHR_COMMAND_BUFFER_EXTENSION_VERSION > CL_MAKE_VERSION(0, 9, 4)\r\n                nullptr, // Properties\r\n#endif\r\n                image(),\r\n                static_cast<void*>(&fillColor),\r\n                origin.data(),\r\n                region.data(),\r\n                (sync_points_vec != nullptr) ? (cl_uint) sync_points_vec->size() : 0,\r\n                (sync_points_vec != nullptr && sync_points_vec->size() > 0) ? &sync_points_vec->front() : nullptr,\r\n                (sync_point != nullptr) ? 
&tmp_sync_point : nullptr,\r\n                (cl_mutable_command_khr*) mutable_handle),\r\n            __COMMAND_FILL_IMAGE_KHR_ERR);\r\n\r\n        if (sync_point != nullptr && error == CL_SUCCESS)\r\n            *sync_point = tmp_sync_point;\r\n\r\n        return error;\r\n    }\r\n\r\n    cl_int commandNDRangeKernel(\r\n#if CL_KHR_COMMAND_BUFFER_EXTENSION_VERSION > CL_MAKE_VERSION(0, 9, 4)\r\n            const cl::vector<cl_command_properties_khr> &properties,\r\n#else\r\n            const cl::vector<cl_ndrange_kernel_command_properties_khr> &properties,\r\n#endif\r\n        const Kernel& kernel,\r\n        const NDRange& offset,\r\n        const NDRange& global,\r\n        const NDRange& local = NullRange,\r\n        const vector<cl_sync_point_khr>* sync_points_vec = nullptr,\r\n        cl_sync_point_khr* sync_point = nullptr,\r\n        MutableCommandKhr* mutable_handle = nullptr,\r\n        const CommandQueue* command_queue = nullptr)\r\n    {\r\n        if (pfn_clCommandNDRangeKernelKHR == nullptr) {\r\n            return detail::errHandler(CL_INVALID_OPERATION,\r\n                    __COMMAND_NDRANGE_KERNEL_KHR_ERR);\r\n        }\r\n\r\n        cl_sync_point_khr tmp_sync_point;\r\n        cl_int error = detail::errHandler(\r\n            pfn_clCommandNDRangeKernelKHR(object_,\r\n                (command_queue != nullptr) ? (*command_queue)() : nullptr,\r\n                &properties[0],\r\n                kernel(),\r\n                (cl_uint) global.dimensions(),\r\n                offset.dimensions() != 0 ? (const size_type*) offset : nullptr,\r\n                (const size_type*) global,\r\n                local.dimensions() != 0 ? (const size_type*) local : nullptr,\r\n                (sync_points_vec != nullptr) ? (cl_uint) sync_points_vec->size() : 0,\r\n                (sync_points_vec != nullptr && sync_points_vec->size() > 0) ? &sync_points_vec->front() : nullptr,\r\n                (sync_point != nullptr) ? 
&tmp_sync_point : nullptr,\r\n                (cl_mutable_command_khr*) mutable_handle),\r\n            __COMMAND_NDRANGE_KERNEL_KHR_ERR);\r\n\r\n        if (sync_point != nullptr && error == CL_SUCCESS)\r\n            *sync_point = tmp_sync_point;\r\n\r\n        return error;\r\n    }\r\n\r\n#if defined(cl_khr_command_buffer_mutable_dispatch)\r\n#if CL_KHR_COMMAND_BUFFER_MUTABLE_DISPATCH_EXTENSION_VERSION <                 \\\r\n    CL_MAKE_VERSION(0, 9, 2)\r\n    cl_int updateMutableCommands(const cl_mutable_base_config_khr* mutable_config)\r\n    {\r\n        if (pfn_clUpdateMutableCommandsKHR == nullptr) {\r\n            return detail::errHandler(CL_INVALID_OPERATION,\r\n                    __UPDATE_MUTABLE_COMMANDS_KHR_ERR);\r\n        }\r\n        return detail::errHandler(pfn_clUpdateMutableCommandsKHR(object_, mutable_config),\r\n                        __UPDATE_MUTABLE_COMMANDS_KHR_ERR);\r\n    }\r\n#else\r\n    template <int ArrayLength>\r\n    cl_int updateMutableCommands(std::array<cl_command_buffer_update_type_khr,\r\n                                            ArrayLength> &config_types,\r\n                                 std::array<const void *, ArrayLength> &configs) {\r\n        if (pfn_clUpdateMutableCommandsKHR == nullptr) {\r\n            return detail::errHandler(CL_INVALID_OPERATION,\r\n                                      __UPDATE_MUTABLE_COMMANDS_KHR_ERR);\r\n        }\r\n        return detail::errHandler(\r\n            pfn_clUpdateMutableCommandsKHR(object_, static_cast<cl_uint>(configs.size()),\r\n                                           config_types.data(), configs.data()),\r\n            __UPDATE_MUTABLE_COMMANDS_KHR_ERR);\r\n    }\r\n#endif /* CL_KHR_COMMAND_BUFFER_MUTABLE_DISPATCH_EXTENSION_VERSION */\r\n#endif /* cl_khr_command_buffer_mutable_dispatch */\r\n\r\nprivate:\r\n    static std::once_flag ext_init_;\r\n\r\n    static void initExtensions(const cl::Device& device)\r\n    {\r\n#if CL_HPP_TARGET_OPENCL_VERSION >= 120\r\n    
    cl_platform_id platform = device.getInfo<CL_DEVICE_PLATFORM>()();\r\n        CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clCreateCommandBufferKHR);\r\n        CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clFinalizeCommandBufferKHR);\r\n        CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clRetainCommandBufferKHR);\r\n        CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clReleaseCommandBufferKHR);\r\n        CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clGetCommandBufferInfoKHR);\r\n        CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clEnqueueCommandBufferKHR);\r\n        CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clCommandBarrierWithWaitListKHR);\r\n        CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clCommandCopyBufferKHR);\r\n        CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clCommandCopyBufferRectKHR);\r\n        CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clCommandCopyBufferToImageKHR);\r\n        CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clCommandCopyImageKHR);\r\n        CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clCommandCopyImageToBufferKHR);\r\n        CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clCommandFillBufferKHR);\r\n        CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clCommandFillImageKHR);\r\n        CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clCommandNDRangeKernelKHR);\r\n#if defined(cl_khr_command_buffer_mutable_dispatch)\r\n        CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clUpdateMutableCommandsKHR);\r\n        CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clGetMutableCommandInfoKHR);\r\n#endif /* cl_khr_command_buffer_mutable_dispatch */\r\n#elif CL_HPP_TARGET_OPENCL_VERSION >= 110\r\n        CL_HPP_INIT_CL_EXT_FCN_PTR_(clCreateCommandBufferKHR);\r\n        CL_HPP_INIT_CL_EXT_FCN_PTR_(clFinalizeCommandBufferKHR);\r\n        CL_HPP_INIT_CL_EXT_FCN_PTR_(clRetainCommandBufferKHR);\r\n        CL_HPP_INIT_CL_EXT_FCN_PTR_(clReleaseCommandBufferKHR);\r\n        
CL_HPP_INIT_CL_EXT_FCN_PTR_(clGetCommandBufferInfoKHR);\r\n        CL_HPP_INIT_CL_EXT_FCN_PTR_(clEnqueueCommandBufferKHR);\r\n        CL_HPP_INIT_CL_EXT_FCN_PTR_(clCommandBarrierWithWaitListKHR);\r\n        CL_HPP_INIT_CL_EXT_FCN_PTR_(clCommandCopyBufferKHR);\r\n        CL_HPP_INIT_CL_EXT_FCN_PTR_(clCommandCopyBufferRectKHR);\r\n        CL_HPP_INIT_CL_EXT_FCN_PTR_(clCommandCopyBufferToImageKHR);\r\n        CL_HPP_INIT_CL_EXT_FCN_PTR_(clCommandCopyImageKHR);\r\n        CL_HPP_INIT_CL_EXT_FCN_PTR_(clCommandCopyImageToBufferKHR);\r\n        CL_HPP_INIT_CL_EXT_FCN_PTR_(clCommandFillBufferKHR);\r\n        CL_HPP_INIT_CL_EXT_FCN_PTR_(clCommandFillImageKHR);\r\n        CL_HPP_INIT_CL_EXT_FCN_PTR_(clCommandNDRangeKernelKHR);\r\n#if defined(cl_khr_command_buffer_mutable_dispatch)\r\n        CL_HPP_INIT_CL_EXT_FCN_PTR_(clUpdateMutableCommandsKHR);\r\n        CL_HPP_INIT_CL_EXT_FCN_PTR_(clGetMutableCommandInfoKHR);\r\n#endif /* cl_khr_command_buffer_mutable_dispatch */\r\n#endif\r\n        if ((pfn_clCreateCommandBufferKHR        == nullptr) &&\r\n            (pfn_clFinalizeCommandBufferKHR      == nullptr) &&\r\n            (pfn_clRetainCommandBufferKHR        == nullptr) &&\r\n            (pfn_clReleaseCommandBufferKHR       == nullptr) &&\r\n            (pfn_clGetCommandBufferInfoKHR       == nullptr) &&\r\n            (pfn_clEnqueueCommandBufferKHR       == nullptr) &&\r\n            (pfn_clCommandBarrierWithWaitListKHR == nullptr) &&\r\n            (pfn_clCommandCopyBufferKHR          == nullptr) &&\r\n            (pfn_clCommandCopyBufferRectKHR      == nullptr) &&\r\n            (pfn_clCommandCopyBufferToImageKHR   == nullptr) &&\r\n            (pfn_clCommandCopyImageKHR           == nullptr) &&\r\n            (pfn_clCommandCopyImageToBufferKHR   == nullptr) &&\r\n            (pfn_clCommandFillBufferKHR          == nullptr) &&\r\n            (pfn_clCommandFillImageKHR           == nullptr) &&\r\n            (pfn_clCommandNDRangeKernelKHR       == nullptr)\r\n#if 
defined(cl_khr_command_buffer_mutable_dispatch)\r\n            && (pfn_clUpdateMutableCommandsKHR      == nullptr)\r\n            && (pfn_clGetMutableCommandInfoKHR      == nullptr)\r\n#endif /* cl_khr_command_buffer_mutable_dispatch */\r\n            )\r\n        {\r\n            detail::errHandler(CL_INVALID_VALUE, __CREATE_COMMAND_BUFFER_KHR_ERR);\r\n        }\r\n    }\r\n}; // CommandBufferKhr\r\n\r\nCL_HPP_DEFINE_STATIC_MEMBER_ std::once_flag CommandBufferKhr::ext_init_;\r\n\r\n#if defined(cl_khr_command_buffer_mutable_dispatch)\r\n/*! \\class MutableCommandKhr\r\n * \\brief MutableCommandKhr interface for cl_mutable_command_khr.\r\n */\r\nclass MutableCommandKhr : public detail::Wrapper<cl_mutable_command_khr>\r\n{\r\npublic:\r\n    //! \\brief Default constructor - initializes to nullptr.\r\n    MutableCommandKhr() : detail::Wrapper<cl_type>() { }\r\n\r\n    explicit MutableCommandKhr(const cl_mutable_command_khr& mutableCommandKhr, bool retainObject = false) :\r\n        detail::Wrapper<cl_type>(mutableCommandKhr, retainObject) { }\r\n\r\n    MutableCommandKhr& operator=(const cl_mutable_command_khr& rhs)\r\n    {\r\n        detail::Wrapper<cl_type>::operator=(rhs);\r\n        return *this;\r\n    }\r\n\r\n    template <typename T>\r\n    cl_int getInfo(cl_mutable_command_info_khr name, T* param) const\r\n    {\r\n        if (pfn_clGetMutableCommandInfoKHR == nullptr) {\r\n            return detail::errHandler(CL_INVALID_OPERATION,\r\n                    __GET_MUTABLE_COMMAND_INFO_KHR_ERR);\r\n        }\r\n        return detail::errHandler(\r\n            detail::getInfo(pfn_clGetMutableCommandInfoKHR, object_, name, param),\r\n                __GET_MUTABLE_COMMAND_INFO_KHR_ERR);\r\n    }\r\n\r\n    template <cl_mutable_command_info_khr name> typename\r\n        detail::param_traits<detail::cl_mutable_command_info_khr, name>::param_type\r\n        getInfo(cl_int* err = nullptr) const\r\n    {\r\n        typename detail::param_traits<\r\n            
detail::cl_mutable_command_info_khr, name>::param_type param;\r\n        cl_int result = getInfo(name, &param);\r\n        if (err != nullptr) {\r\n            *err = result;\r\n        }\r\n        return param;\r\n    }\r\n}; // MutableCommandKhr\r\n#endif /* cl_khr_command_buffer_mutable_dispatch */\r\n\r\n#endif // cl_khr_command_buffer\r\n//----------------------------------------------------------------------------------------------------------------------\r\n\r\n#undef CL_HPP_ERR_STR_\r\n#if !defined(CL_HPP_USER_OVERRIDE_ERROR_STRINGS)\r\n#undef __GET_DEVICE_INFO_ERR               \r\n#undef __GET_PLATFORM_INFO_ERR             \r\n#undef __GET_DEVICE_IDS_ERR                \r\n#undef __GET_PLATFORM_IDS_ERR              \r\n#undef __GET_CONTEXT_INFO_ERR              \r\n#undef __GET_EVENT_INFO_ERR                \r\n#undef __GET_EVENT_PROFILE_INFO_ERR        \r\n#undef __GET_MEM_OBJECT_INFO_ERR           \r\n#undef __GET_IMAGE_INFO_ERR                \r\n#undef __GET_SAMPLER_INFO_ERR              \r\n#undef __GET_KERNEL_INFO_ERR               \r\n#undef __GET_KERNEL_ARG_INFO_ERR           \r\n#undef __GET_KERNEL_SUB_GROUP_INFO_ERR     \r\n#undef __GET_KERNEL_WORK_GROUP_INFO_ERR    \r\n#undef __GET_PROGRAM_INFO_ERR              \r\n#undef __GET_PROGRAM_BUILD_INFO_ERR        \r\n#undef __GET_COMMAND_QUEUE_INFO_ERR        \r\n#undef __CREATE_CONTEXT_ERR                \r\n#undef __CREATE_CONTEXT_FROM_TYPE_ERR\r\n#undef __CREATE_COMMAND_BUFFER_KHR_ERR\r\n#undef __GET_COMMAND_BUFFER_INFO_KHR_ERR\r\n#undef __FINALIZE_COMMAND_BUFFER_KHR_ERR\r\n#undef __ENQUEUE_COMMAND_BUFFER_KHR_ERR\r\n#undef __COMMAND_BARRIER_WITH_WAIT_LIST_KHR_ERR\r\n#undef __COMMAND_COPY_BUFFER_KHR_ERR\r\n#undef __COMMAND_COPY_BUFFER_RECT_KHR_ERR\r\n#undef __COMMAND_COPY_BUFFER_TO_IMAGE_KHR_ERR\r\n#undef __COMMAND_COPY_IMAGE_KHR_ERR\r\n#undef __COMMAND_COPY_IMAGE_TO_BUFFER_KHR_ERR\r\n#undef __COMMAND_FILL_BUFFER_KHR_ERR\r\n#undef __COMMAND_FILL_IMAGE_KHR_ERR\r\n#undef 
__COMMAND_NDRANGE_KERNEL_KHR_ERR\r\n#undef __UPDATE_MUTABLE_COMMANDS_KHR_ERR\r\n#undef __GET_MUTABLE_COMMAND_INFO_KHR_ERR\r\n#undef __RETAIN_COMMAND_BUFFER_KHR_ERR\r\n#undef __RELEASE_COMMAND_BUFFER_KHR_ERR\r\n#undef __GET_SUPPORTED_IMAGE_FORMATS_ERR   \r\n#undef __SET_CONTEXT_DESCTRUCTOR_CALLBACK_ERR\r\n#undef __CREATE_BUFFER_ERR                 \r\n#undef __COPY_ERR                          \r\n#undef __CREATE_SUBBUFFER_ERR              \r\n#undef __CREATE_GL_BUFFER_ERR              \r\n#undef __CREATE_GL_RENDER_BUFFER_ERR       \r\n#undef __GET_GL_OBJECT_INFO_ERR            \r\n#undef __CREATE_IMAGE_ERR                  \r\n#undef __CREATE_GL_TEXTURE_ERR             \r\n#undef __IMAGE_DIMENSION_ERR               \r\n#undef __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR \r\n#undef __CREATE_USER_EVENT_ERR             \r\n#undef __SET_USER_EVENT_STATUS_ERR         \r\n#undef __SET_EVENT_CALLBACK_ERR            \r\n#undef __WAIT_FOR_EVENTS_ERR               \r\n#undef __CREATE_KERNEL_ERR                 \r\n#undef __SET_KERNEL_ARGS_ERR               \r\n#undef __CREATE_PROGRAM_WITH_SOURCE_ERR    \r\n#undef __CREATE_PROGRAM_WITH_BINARY_ERR    \r\n#undef __CREATE_PROGRAM_WITH_IL_ERR        \r\n#undef __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR    \r\n#undef __BUILD_PROGRAM_ERR                 \r\n#undef __COMPILE_PROGRAM_ERR               \r\n#undef __LINK_PROGRAM_ERR                  \r\n#undef __CREATE_KERNELS_IN_PROGRAM_ERR     \r\n#undef __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR          \r\n#undef __CREATE_SAMPLER_WITH_PROPERTIES_ERR                \r\n#undef __SET_COMMAND_QUEUE_PROPERTY_ERR    \r\n#undef __ENQUEUE_READ_BUFFER_ERR           \r\n#undef __ENQUEUE_READ_BUFFER_RECT_ERR      \r\n#undef __ENQUEUE_WRITE_BUFFER_ERR          \r\n#undef __ENQUEUE_WRITE_BUFFER_RECT_ERR     \r\n#undef __ENQEUE_COPY_BUFFER_ERR            \r\n#undef __ENQEUE_COPY_BUFFER_RECT_ERR       \r\n#undef __ENQUEUE_FILL_BUFFER_ERR           \r\n#undef __ENQUEUE_READ_IMAGE_ERR            
\r\n#undef __ENQUEUE_WRITE_IMAGE_ERR           \r\n#undef __ENQUEUE_COPY_IMAGE_ERR            \r\n#undef __ENQUEUE_FILL_IMAGE_ERR            \r\n#undef __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR  \r\n#undef __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR  \r\n#undef __ENQUEUE_MAP_BUFFER_ERR\r\n#undef __ENQUEUE_MAP_IMAGE_ERR\r\n#undef __ENQUEUE_MAP_SVM_ERR\r\n#undef __ENQUEUE_FILL_SVM_ERR\r\n#undef __ENQUEUE_COPY_SVM_ERR\r\n#undef __ENQUEUE_UNMAP_SVM_ERR              \r\n#undef __ENQUEUE_MAP_IMAGE_ERR             \r\n#undef __ENQUEUE_UNMAP_MEM_OBJECT_ERR      \r\n#undef __ENQUEUE_NDRANGE_KERNEL_ERR        \r\n#undef __ENQUEUE_NATIVE_KERNEL             \r\n#undef __ENQUEUE_MIGRATE_MEM_OBJECTS_ERR   \r\n#undef __ENQUEUE_MIGRATE_SVM_ERR\r\n#undef __ENQUEUE_ACQUIRE_GL_ERR            \r\n#undef __ENQUEUE_RELEASE_GL_ERR            \r\n#undef __CREATE_PIPE_ERR             \r\n#undef __GET_PIPE_INFO_ERR           \r\n#undef __RETAIN_ERR                        \r\n#undef __RELEASE_ERR                       \r\n#undef __FLUSH_ERR                         \r\n#undef __FINISH_ERR                        \r\n#undef __VECTOR_CAPACITY_ERR               \r\n#undef __CREATE_SUB_DEVICES_ERR\r\n#undef __ENQUEUE_ACQUIRE_EXTERNAL_MEMORY_ERR\r\n#undef __ENQUEUE_RELEASE_EXTERNAL_MEMORY_ERR\r\n#undef __ENQUEUE_MARKER_ERR                \r\n#undef __ENQUEUE_WAIT_FOR_EVENTS_ERR       \r\n#undef __ENQUEUE_BARRIER_ERR               \r\n#undef __UNLOAD_COMPILER_ERR               \r\n#undef __CREATE_GL_TEXTURE_2D_ERR          \r\n#undef __CREATE_GL_TEXTURE_3D_ERR          \r\n#undef __CREATE_IMAGE2D_ERR                \r\n#undef __CREATE_IMAGE3D_ERR                \r\n#undef __CREATE_COMMAND_QUEUE_ERR          \r\n#undef __ENQUEUE_TASK_ERR                  \r\n#undef __CREATE_SAMPLER_ERR                \r\n#undef __ENQUEUE_MARKER_WAIT_LIST_ERR                \r\n#undef __ENQUEUE_BARRIER_WAIT_LIST_ERR               \r\n#undef __CLONE_KERNEL_ERR     \r\n#undef __GET_HOST_TIMER_ERR\r\n#undef 
__GET_DEVICE_AND_HOST_TIMER_ERR\r\n#undef __GET_SEMAPHORE_KHR_INFO_ERR\r\n#undef __CREATE_SEMAPHORE_KHR_WITH_PROPERTIES_ERR\r\n#undef __GET_IMAGE_REQUIREMENT_INFO_EXT_ERR\r\n#undef __ENQUEUE_WAIT_SEMAPHORE_KHR_ERR\r\n#undef __ENQUEUE_SIGNAL_SEMAPHORE_KHR_ERR\r\n#undef __RETAIN_SEMAPHORE_KHR_ERR\r\n#undef __RELEASE_SEMAPHORE_KHR_ERR\r\n#undef __GET_SEMAPHORE_HANDLE_FOR_TYPE_KHR_ERR\r\n\r\n#endif //CL_HPP_USER_OVERRIDE_ERROR_STRINGS\r\n\r\n// Extensions\r\n#undef CL_HPP_CREATE_CL_EXT_FCN_PTR_ALIAS_\r\n#undef CL_HPP_INIT_CL_EXT_FCN_PTR_\r\n#undef CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_\r\n\r\n#undef CL_HPP_DEFINE_STATIC_MEMBER_\r\n\r\n} // namespace cl\r\n\r\n#endif // CL_HPP_\r\n"
  },
  {
    "path": "svm/OpenCL/lib/pkgconfig/OpenCL.pc",
    "content": "prefix=D:/a/OpenCL-SDK/OpenCL-SDK/install\r\nexec_prefix=${prefix}\r\nlibdir=${exec_prefix}/lib\r\n\r\nName: OpenCL\r\nDescription: Khronos OpenCL ICD Loader\r\nRequires: OpenCL-Headers\r\nVersion: 3.0\r\nLibs: -L${libdir} -lOpenCL\r\n"
  },
  {
    "path": "svm/OpenCL/share/cmake/OpenCL/OpenCLConfig.cmake",
    "content": "get_filename_component(PARENT_DIR ${CMAKE_CURRENT_LIST_DIR} PATH)\r\ninclude(\"${PARENT_DIR}/OpenCLHeaders/OpenCLHeadersConfig.cmake\")\r\ninclude(\"${PARENT_DIR}/OpenCLICDLoader/OpenCLICDLoaderConfig.cmake\")\r\ninclude(\"${PARENT_DIR}/OpenCLHeadersCpp/OpenCLHeadersCppConfig.cmake\")\r\ninclude(\"${PARENT_DIR}/OpenCLUtils/OpenCLUtilsConfig.cmake\")\r\ninclude(\"${PARENT_DIR}/OpenCLUtilsCpp/OpenCLUtilsCppConfig.cmake\")\r\n  "
  },
  {
    "path": "svm/OpenCL/share/cmake/OpenCL/OpenCLConfigVersion.cmake",
    "content": "# This is a basic version file for the Config-mode of find_package().\r\n# It is used by write_basic_package_version_file() as input file for configure_file()\r\n# to create a version-file which can be installed along a config.cmake file.\r\n#\r\n# The created file sets PACKAGE_VERSION_EXACT if the current version string and\r\n# the requested version string are exactly the same and it sets\r\n# PACKAGE_VERSION_COMPATIBLE if the current version is >= requested version.\r\n# The variable CVF_VERSION must be set before calling configure_file().\r\n\r\nset(PACKAGE_VERSION \"2024.10.24\")\r\n\r\nif (PACKAGE_FIND_VERSION_RANGE)\r\n  # Package version must be in the requested version range\r\n  if ((PACKAGE_FIND_VERSION_RANGE_MIN STREQUAL \"INCLUDE\" AND PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION_MIN)\r\n      OR ((PACKAGE_FIND_VERSION_RANGE_MAX STREQUAL \"INCLUDE\" AND PACKAGE_VERSION VERSION_GREATER PACKAGE_FIND_VERSION_MAX)\r\n        OR (PACKAGE_FIND_VERSION_RANGE_MAX STREQUAL \"EXCLUDE\" AND PACKAGE_VERSION VERSION_GREATER_EQUAL PACKAGE_FIND_VERSION_MAX)))\r\n    set(PACKAGE_VERSION_COMPATIBLE FALSE)\r\n  else()\r\n    set(PACKAGE_VERSION_COMPATIBLE TRUE)\r\n  endif()\r\nelse()\r\n  if(PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION)\r\n    set(PACKAGE_VERSION_COMPATIBLE FALSE)\r\n  else()\r\n    set(PACKAGE_VERSION_COMPATIBLE TRUE)\r\n    if(PACKAGE_FIND_VERSION STREQUAL PACKAGE_VERSION)\r\n      set(PACKAGE_VERSION_EXACT TRUE)\r\n    endif()\r\n  endif()\r\nendif()\r\n\r\n\r\n# if the installed or the using project don't have CMAKE_SIZEOF_VOID_P set, ignore it:\r\nif(\"${CMAKE_SIZEOF_VOID_P}\" STREQUAL \"\" OR \"\" STREQUAL \"\")\r\n  return()\r\nendif()\r\n\r\n# check that the installed version has the same 32/64bit-ness as the one which is currently searching:\r\nif(NOT CMAKE_SIZEOF_VOID_P STREQUAL \"\")\r\n  math(EXPR installedBits \" * 8\")\r\n  set(PACKAGE_VERSION \"${PACKAGE_VERSION} (${installedBits}bit)\")\r\n  
set(PACKAGE_VERSION_UNSUITABLE TRUE)\r\nendif()\r\n"
  },
  {
    "path": "svm/OpenCL/share/cmake/OpenCLExtensionLoader/OpenCLExtensionLoaderConfig.cmake",
    "content": "include(\"${CMAKE_CURRENT_LIST_DIR}/OpenCLExtensionLoaderTargets.cmake\")"
  },
  {
    "path": "svm/OpenCL/share/cmake/OpenCLExtensionLoader/OpenCLExtensionLoaderConfigVersion.cmake",
    "content": "# This is a basic version file for the Config-mode of find_package().\r\n# It is used by write_basic_package_version_file() as input file for configure_file()\r\n# to create a version-file which can be installed along a config.cmake file.\r\n#\r\n# The created file sets PACKAGE_VERSION_EXACT if the current version string and\r\n# the requested version string are exactly the same and it sets\r\n# PACKAGE_VERSION_COMPATIBLE if the current version is >= requested version.\r\n# The variable CVF_VERSION must be set before calling configure_file().\r\n\r\nset(PACKAGE_VERSION \"1.0.220515\")\r\n\r\nif (PACKAGE_FIND_VERSION_RANGE)\r\n  # Package version must be in the requested version range\r\n  if ((PACKAGE_FIND_VERSION_RANGE_MIN STREQUAL \"INCLUDE\" AND PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION_MIN)\r\n      OR ((PACKAGE_FIND_VERSION_RANGE_MAX STREQUAL \"INCLUDE\" AND PACKAGE_VERSION VERSION_GREATER PACKAGE_FIND_VERSION_MAX)\r\n        OR (PACKAGE_FIND_VERSION_RANGE_MAX STREQUAL \"EXCLUDE\" AND PACKAGE_VERSION VERSION_GREATER_EQUAL PACKAGE_FIND_VERSION_MAX)))\r\n    set(PACKAGE_VERSION_COMPATIBLE FALSE)\r\n  else()\r\n    set(PACKAGE_VERSION_COMPATIBLE TRUE)\r\n  endif()\r\nelse()\r\n  if(PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION)\r\n    set(PACKAGE_VERSION_COMPATIBLE FALSE)\r\n  else()\r\n    set(PACKAGE_VERSION_COMPATIBLE TRUE)\r\n    if(PACKAGE_FIND_VERSION STREQUAL PACKAGE_VERSION)\r\n      set(PACKAGE_VERSION_EXACT TRUE)\r\n    endif()\r\n  endif()\r\nendif()\r\n\r\n\r\n# if the installed or the using project don't have CMAKE_SIZEOF_VOID_P set, ignore it:\r\nif(\"${CMAKE_SIZEOF_VOID_P}\" STREQUAL \"\" OR \"\" STREQUAL \"\")\r\n  return()\r\nendif()\r\n\r\n# check that the installed version has the same 32/64bit-ness as the one which is currently searching:\r\nif(NOT CMAKE_SIZEOF_VOID_P STREQUAL \"\")\r\n  math(EXPR installedBits \" * 8\")\r\n  set(PACKAGE_VERSION \"${PACKAGE_VERSION} (${installedBits}bit)\")\r\n  
set(PACKAGE_VERSION_UNSUITABLE TRUE)\r\nendif()\r\n"
  },
  {
    "path": "svm/OpenCL/share/cmake/OpenCLExtensionLoader/OpenCLExtensionLoaderTargets-debug.cmake",
    "content": "#----------------------------------------------------------------\r\n# Generated CMake target import file for configuration \"Debug\".\r\n#----------------------------------------------------------------\r\n\r\n# Commands may need to know the format version.\r\nset(CMAKE_IMPORT_FILE_VERSION 1)\r\n\r\n# Import target \"OpenCL::OpenCLExt\" for configuration \"Debug\"\r\nset_property(TARGET OpenCL::OpenCLExt APPEND PROPERTY IMPORTED_CONFIGURATIONS DEBUG)\r\nset_target_properties(OpenCL::OpenCLExt PROPERTIES\r\n  IMPORTED_LINK_INTERFACE_LANGUAGES_DEBUG \"CXX\"\r\n  IMPORTED_LOCATION_DEBUG \"${_IMPORT_PREFIX}/lib/OpenCLExt.lib\"\r\n  )\r\n\r\nlist(APPEND _cmake_import_check_targets OpenCL::OpenCLExt )\r\nlist(APPEND _cmake_import_check_files_for_OpenCL::OpenCLExt \"${_IMPORT_PREFIX}/lib/OpenCLExt.lib\" )\r\n\r\n# Commands beyond this point should not need to know the version.\r\nset(CMAKE_IMPORT_FILE_VERSION)\r\n"
  },
  {
    "path": "svm/OpenCL/share/cmake/OpenCLExtensionLoader/OpenCLExtensionLoaderTargets-release.cmake",
    "content": "#----------------------------------------------------------------\r\n# Generated CMake target import file for configuration \"Release\".\r\n#----------------------------------------------------------------\r\n\r\n# Commands may need to know the format version.\r\nset(CMAKE_IMPORT_FILE_VERSION 1)\r\n\r\n# Import target \"OpenCL::OpenCLExt\" for configuration \"Release\"\r\nset_property(TARGET OpenCL::OpenCLExt APPEND PROPERTY IMPORTED_CONFIGURATIONS RELEASE)\r\nset_target_properties(OpenCL::OpenCLExt PROPERTIES\r\n  IMPORTED_LINK_INTERFACE_LANGUAGES_RELEASE \"CXX\"\r\n  IMPORTED_LOCATION_RELEASE \"${_IMPORT_PREFIX}/lib/OpenCLExt.lib\"\r\n  )\r\n\r\nlist(APPEND _cmake_import_check_targets OpenCL::OpenCLExt )\r\nlist(APPEND _cmake_import_check_files_for_OpenCL::OpenCLExt \"${_IMPORT_PREFIX}/lib/OpenCLExt.lib\" )\r\n\r\n# Commands beyond this point should not need to know the version.\r\nset(CMAKE_IMPORT_FILE_VERSION)\r\n"
  },
  {
    "path": "svm/OpenCL/share/cmake/OpenCLExtensionLoader/OpenCLExtensionLoaderTargets.cmake",
    "content": "# Generated by CMake\r\n\r\nif(\"${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION}\" LESS 2.8)\r\n   message(FATAL_ERROR \"CMake >= 2.8.0 required\")\r\nendif()\r\nif(CMAKE_VERSION VERSION_LESS \"2.8.12\")\r\n   message(FATAL_ERROR \"CMake >= 2.8.12 required\")\r\nendif()\r\ncmake_policy(PUSH)\r\ncmake_policy(VERSION 2.8.12...3.28)\r\n#----------------------------------------------------------------\r\n# Generated CMake target import file.\r\n#----------------------------------------------------------------\r\n\r\n# Commands may need to know the format version.\r\nset(CMAKE_IMPORT_FILE_VERSION 1)\r\n\r\n# Protect against multiple inclusion, which would fail when already imported targets are added once more.\r\nset(_cmake_targets_defined \"\")\r\nset(_cmake_targets_not_defined \"\")\r\nset(_cmake_expected_targets \"\")\r\nforeach(_cmake_expected_target IN ITEMS OpenCL::OpenCLExt)\r\n  list(APPEND _cmake_expected_targets \"${_cmake_expected_target}\")\r\n  if(TARGET \"${_cmake_expected_target}\")\r\n    list(APPEND _cmake_targets_defined \"${_cmake_expected_target}\")\r\n  else()\r\n    list(APPEND _cmake_targets_not_defined \"${_cmake_expected_target}\")\r\n  endif()\r\nendforeach()\r\nunset(_cmake_expected_target)\r\nif(_cmake_targets_defined STREQUAL _cmake_expected_targets)\r\n  unset(_cmake_targets_defined)\r\n  unset(_cmake_targets_not_defined)\r\n  unset(_cmake_expected_targets)\r\n  unset(CMAKE_IMPORT_FILE_VERSION)\r\n  cmake_policy(POP)\r\n  return()\r\nendif()\r\nif(NOT _cmake_targets_defined STREQUAL \"\")\r\n  string(REPLACE \";\" \", \" _cmake_targets_defined_text \"${_cmake_targets_defined}\")\r\n  string(REPLACE \";\" \", \" _cmake_targets_not_defined_text \"${_cmake_targets_not_defined}\")\r\n  message(FATAL_ERROR \"Some (but not all) targets in this export set were already defined.\\nTargets Defined: ${_cmake_targets_defined_text}\\nTargets not yet defined: 
${_cmake_targets_not_defined_text}\\n\")\r\nendif()\r\nunset(_cmake_targets_defined)\r\nunset(_cmake_targets_not_defined)\r\nunset(_cmake_expected_targets)\r\n\r\n\r\n# Compute the installation prefix relative to this file.\r\nget_filename_component(_IMPORT_PREFIX \"${CMAKE_CURRENT_LIST_FILE}\" PATH)\r\nget_filename_component(_IMPORT_PREFIX \"${_IMPORT_PREFIX}\" PATH)\r\nget_filename_component(_IMPORT_PREFIX \"${_IMPORT_PREFIX}\" PATH)\r\nget_filename_component(_IMPORT_PREFIX \"${_IMPORT_PREFIX}\" PATH)\r\nif(_IMPORT_PREFIX STREQUAL \"/\")\r\n  set(_IMPORT_PREFIX \"\")\r\nendif()\r\n\r\n# Create imported target OpenCL::OpenCLExt\r\nadd_library(OpenCL::OpenCLExt STATIC IMPORTED)\r\n\r\nset_target_properties(OpenCL::OpenCLExt PROPERTIES\r\n  INTERFACE_LINK_LIBRARIES \"\\$<LINK_ONLY:OpenCL::OpenCL>\"\r\n)\r\n\r\n# Load information for each installed configuration.\r\nfile(GLOB _cmake_config_files \"${CMAKE_CURRENT_LIST_DIR}/OpenCLExtensionLoaderTargets-*.cmake\")\r\nforeach(_cmake_config_file IN LISTS _cmake_config_files)\r\n  include(\"${_cmake_config_file}\")\r\nendforeach()\r\nunset(_cmake_config_file)\r\nunset(_cmake_config_files)\r\n\r\n# Cleanup temporary variables.\r\nset(_IMPORT_PREFIX)\r\n\r\n# Loop over all imported files and verify that they actually exist\r\nforeach(_cmake_target IN LISTS _cmake_import_check_targets)\r\n  if(CMAKE_VERSION VERSION_LESS \"3.28\"\r\n      OR NOT DEFINED _cmake_import_check_xcframework_for_${_cmake_target}\r\n      OR NOT IS_DIRECTORY \"${_cmake_import_check_xcframework_for_${_cmake_target}}\")\r\n    foreach(_cmake_file IN LISTS \"_cmake_import_check_files_for_${_cmake_target}\")\r\n      if(NOT EXISTS \"${_cmake_file}\")\r\n        message(FATAL_ERROR \"The imported target \\\"${_cmake_target}\\\" references the file\r\n   \\\"${_cmake_file}\\\"\r\nbut this file does not exist.  
Possible reasons include:\r\n* The file was deleted, renamed, or moved to another location.\r\n* An install or uninstall procedure did not complete successfully.\r\n* The installation package was faulty and contained\r\n   \\\"${CMAKE_CURRENT_LIST_FILE}\\\"\r\nbut not all the files it references.\r\n\")\r\n      endif()\r\n    endforeach()\r\n  endif()\r\n  unset(_cmake_file)\r\n  unset(\"_cmake_import_check_files_for_${_cmake_target}\")\r\nendforeach()\r\nunset(_cmake_target)\r\nunset(_cmake_import_check_targets)\r\n\r\n# Make sure the targets which have been exported in some other\r\n# export set exist.\r\nunset(${CMAKE_FIND_PACKAGE_NAME}_NOT_FOUND_MESSAGE_targets)\r\nforeach(_target \"OpenCL::OpenCL\" )\r\n  if(NOT TARGET \"${_target}\" )\r\n    set(${CMAKE_FIND_PACKAGE_NAME}_NOT_FOUND_MESSAGE_targets \"${${CMAKE_FIND_PACKAGE_NAME}_NOT_FOUND_MESSAGE_targets} ${_target}\")\r\n  endif()\r\nendforeach()\r\n\r\nif(DEFINED ${CMAKE_FIND_PACKAGE_NAME}_NOT_FOUND_MESSAGE_targets)\r\n  if(CMAKE_FIND_PACKAGE_NAME)\r\n    set( ${CMAKE_FIND_PACKAGE_NAME}_FOUND FALSE)\r\n    set( ${CMAKE_FIND_PACKAGE_NAME}_NOT_FOUND_MESSAGE \"The following imported targets are referenced, but are missing: ${${CMAKE_FIND_PACKAGE_NAME}_NOT_FOUND_MESSAGE_targets}\")\r\n  else()\r\n    message(FATAL_ERROR \"The following imported targets are referenced, but are missing: ${${CMAKE_FIND_PACKAGE_NAME}_NOT_FOUND_MESSAGE_targets}\")\r\n  endif()\r\nendif()\r\nunset(${CMAKE_FIND_PACKAGE_NAME}_NOT_FOUND_MESSAGE_targets)\r\n\r\n# Commands beyond this point should not need to know the version.\r\nset(CMAKE_IMPORT_FILE_VERSION)\r\ncmake_policy(POP)\r\n"
  },
  {
    "path": "svm/OpenCL/share/cmake/OpenCLHeaders/OpenCLHeadersConfig.cmake",
    "content": "include(\"${CMAKE_CURRENT_LIST_DIR}/OpenCLHeadersTargets.cmake\")"
  },
  {
    "path": "svm/OpenCL/share/cmake/OpenCLHeaders/OpenCLHeadersConfigVersion.cmake",
    "content": "# This is a basic version file for the Config-mode of find_package().\r\n# It is used by write_basic_package_version_file() as input file for configure_file()\r\n# to create a version-file which can be installed along a config.cmake file.\r\n#\r\n# The created file sets PACKAGE_VERSION_EXACT if the current version string and\r\n# the requested version string are exactly the same and it sets\r\n# PACKAGE_VERSION_COMPATIBLE if the current version is >= requested version.\r\n# The variable CVF_VERSION must be set before calling configure_file().\r\n\r\nset(PACKAGE_VERSION \"3.0\")\r\n\r\nif (PACKAGE_FIND_VERSION_RANGE)\r\n  # Package version must be in the requested version range\r\n  if ((PACKAGE_FIND_VERSION_RANGE_MIN STREQUAL \"INCLUDE\" AND PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION_MIN)\r\n      OR ((PACKAGE_FIND_VERSION_RANGE_MAX STREQUAL \"INCLUDE\" AND PACKAGE_VERSION VERSION_GREATER PACKAGE_FIND_VERSION_MAX)\r\n        OR (PACKAGE_FIND_VERSION_RANGE_MAX STREQUAL \"EXCLUDE\" AND PACKAGE_VERSION VERSION_GREATER_EQUAL PACKAGE_FIND_VERSION_MAX)))\r\n    set(PACKAGE_VERSION_COMPATIBLE FALSE)\r\n  else()\r\n    set(PACKAGE_VERSION_COMPATIBLE TRUE)\r\n  endif()\r\nelse()\r\n  if(PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION)\r\n    set(PACKAGE_VERSION_COMPATIBLE FALSE)\r\n  else()\r\n    set(PACKAGE_VERSION_COMPATIBLE TRUE)\r\n    if(PACKAGE_FIND_VERSION STREQUAL PACKAGE_VERSION)\r\n      set(PACKAGE_VERSION_EXACT TRUE)\r\n    endif()\r\n  endif()\r\nendif()\r\n\r\n\r\n# if the installed or the using project don't have CMAKE_SIZEOF_VOID_P set, ignore it:\r\nif(\"${CMAKE_SIZEOF_VOID_P}\" STREQUAL \"\" OR \"\" STREQUAL \"\")\r\n  return()\r\nendif()\r\n\r\n# check that the installed version has the same 32/64bit-ness as the one which is currently searching:\r\nif(NOT CMAKE_SIZEOF_VOID_P STREQUAL \"\")\r\n  math(EXPR installedBits \" * 8\")\r\n  set(PACKAGE_VERSION \"${PACKAGE_VERSION} (${installedBits}bit)\")\r\n  
set(PACKAGE_VERSION_UNSUITABLE TRUE)\r\nendif()\r\n"
  },
  {
    "path": "svm/OpenCL/share/cmake/OpenCLHeaders/OpenCLHeadersTargets.cmake",
    "content": "# Generated by CMake\r\n\r\nif(\"${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION}\" LESS 2.8)\r\n   message(FATAL_ERROR \"CMake >= 2.8.0 required\")\r\nendif()\r\nif(CMAKE_VERSION VERSION_LESS \"3.0.0\")\r\n   message(FATAL_ERROR \"CMake >= 3.0.0 required\")\r\nendif()\r\ncmake_policy(PUSH)\r\ncmake_policy(VERSION 3.0.0...3.28)\r\n#----------------------------------------------------------------\r\n# Generated CMake target import file.\r\n#----------------------------------------------------------------\r\n\r\n# Commands may need to know the format version.\r\nset(CMAKE_IMPORT_FILE_VERSION 1)\r\n\r\n# Protect against multiple inclusion, which would fail when already imported targets are added once more.\r\nset(_cmake_targets_defined \"\")\r\nset(_cmake_targets_not_defined \"\")\r\nset(_cmake_expected_targets \"\")\r\nforeach(_cmake_expected_target IN ITEMS OpenCL::Headers)\r\n  list(APPEND _cmake_expected_targets \"${_cmake_expected_target}\")\r\n  if(TARGET \"${_cmake_expected_target}\")\r\n    list(APPEND _cmake_targets_defined \"${_cmake_expected_target}\")\r\n  else()\r\n    list(APPEND _cmake_targets_not_defined \"${_cmake_expected_target}\")\r\n  endif()\r\nendforeach()\r\nunset(_cmake_expected_target)\r\nif(_cmake_targets_defined STREQUAL _cmake_expected_targets)\r\n  unset(_cmake_targets_defined)\r\n  unset(_cmake_targets_not_defined)\r\n  unset(_cmake_expected_targets)\r\n  unset(CMAKE_IMPORT_FILE_VERSION)\r\n  cmake_policy(POP)\r\n  return()\r\nendif()\r\nif(NOT _cmake_targets_defined STREQUAL \"\")\r\n  string(REPLACE \";\" \", \" _cmake_targets_defined_text \"${_cmake_targets_defined}\")\r\n  string(REPLACE \";\" \", \" _cmake_targets_not_defined_text \"${_cmake_targets_not_defined}\")\r\n  message(FATAL_ERROR \"Some (but not all) targets in this export set were already defined.\\nTargets Defined: ${_cmake_targets_defined_text}\\nTargets not yet defined: 
${_cmake_targets_not_defined_text}\\n\")\r\nendif()\r\nunset(_cmake_targets_defined)\r\nunset(_cmake_targets_not_defined)\r\nunset(_cmake_expected_targets)\r\n\r\n\r\n# Compute the installation prefix relative to this file.\r\nget_filename_component(_IMPORT_PREFIX \"${CMAKE_CURRENT_LIST_FILE}\" PATH)\r\nget_filename_component(_IMPORT_PREFIX \"${_IMPORT_PREFIX}\" PATH)\r\nget_filename_component(_IMPORT_PREFIX \"${_IMPORT_PREFIX}\" PATH)\r\nget_filename_component(_IMPORT_PREFIX \"${_IMPORT_PREFIX}\" PATH)\r\nif(_IMPORT_PREFIX STREQUAL \"/\")\r\n  set(_IMPORT_PREFIX \"\")\r\nendif()\r\n\r\n# Create imported target OpenCL::Headers\r\nadd_library(OpenCL::Headers INTERFACE IMPORTED)\r\n\r\nset_target_properties(OpenCL::Headers PROPERTIES\r\n  INTERFACE_INCLUDE_DIRECTORIES \"${_IMPORT_PREFIX}/include\"\r\n)\r\n\r\n# Load information for each installed configuration.\r\nfile(GLOB _cmake_config_files \"${CMAKE_CURRENT_LIST_DIR}/OpenCLHeadersTargets-*.cmake\")\r\nforeach(_cmake_config_file IN LISTS _cmake_config_files)\r\n  include(\"${_cmake_config_file}\")\r\nendforeach()\r\nunset(_cmake_config_file)\r\nunset(_cmake_config_files)\r\n\r\n# Cleanup temporary variables.\r\nset(_IMPORT_PREFIX)\r\n\r\n# Loop over all imported files and verify that they actually exist\r\nforeach(_cmake_target IN LISTS _cmake_import_check_targets)\r\n  if(CMAKE_VERSION VERSION_LESS \"3.28\"\r\n      OR NOT DEFINED _cmake_import_check_xcframework_for_${_cmake_target}\r\n      OR NOT IS_DIRECTORY \"${_cmake_import_check_xcframework_for_${_cmake_target}}\")\r\n    foreach(_cmake_file IN LISTS \"_cmake_import_check_files_for_${_cmake_target}\")\r\n      if(NOT EXISTS \"${_cmake_file}\")\r\n        message(FATAL_ERROR \"The imported target \\\"${_cmake_target}\\\" references the file\r\n   \\\"${_cmake_file}\\\"\r\nbut this file does not exist.  
Possible reasons include:\r\n* The file was deleted, renamed, or moved to another location.\r\n* An install or uninstall procedure did not complete successfully.\r\n* The installation package was faulty and contained\r\n   \\\"${CMAKE_CURRENT_LIST_FILE}\\\"\r\nbut not all the files it references.\r\n\")\r\n      endif()\r\n    endforeach()\r\n  endif()\r\n  unset(_cmake_file)\r\n  unset(\"_cmake_import_check_files_for_${_cmake_target}\")\r\nendforeach()\r\nunset(_cmake_target)\r\nunset(_cmake_import_check_targets)\r\n\r\n# This file does not depend on other imported targets which have\r\n# been exported from the same project but in a separate export set.\r\n\r\n# Commands beyond this point should not need to know the version.\r\nset(CMAKE_IMPORT_FILE_VERSION)\r\ncmake_policy(POP)\r\n"
  },
  {
    "path": "svm/OpenCL/share/cmake/OpenCLHeadersCpp/OpenCLHeadersCppConfig.cmake",
    "content": "include(\"${CMAKE_CURRENT_LIST_DIR}/OpenCLHeadersCppTargets.cmake\")"
  },
  {
    "path": "svm/OpenCL/share/cmake/OpenCLHeadersCpp/OpenCLHeadersCppConfigVersion.cmake",
    "content": "# This is a basic version file for the Config-mode of find_package().\r\n# It is used by write_basic_package_version_file() as input file for configure_file()\r\n# to create a version-file which can be installed along a config.cmake file.\r\n#\r\n# The created file sets PACKAGE_VERSION_EXACT if the current version string and\r\n# the requested version string are exactly the same and it sets\r\n# PACKAGE_VERSION_COMPATIBLE if the current version is >= requested version.\r\n# The variable CVF_VERSION must be set before calling configure_file().\r\n\r\nset(PACKAGE_VERSION \"3.0\")\r\n\r\nif (PACKAGE_FIND_VERSION_RANGE)\r\n  # Package version must be in the requested version range\r\n  if ((PACKAGE_FIND_VERSION_RANGE_MIN STREQUAL \"INCLUDE\" AND PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION_MIN)\r\n      OR ((PACKAGE_FIND_VERSION_RANGE_MAX STREQUAL \"INCLUDE\" AND PACKAGE_VERSION VERSION_GREATER PACKAGE_FIND_VERSION_MAX)\r\n        OR (PACKAGE_FIND_VERSION_RANGE_MAX STREQUAL \"EXCLUDE\" AND PACKAGE_VERSION VERSION_GREATER_EQUAL PACKAGE_FIND_VERSION_MAX)))\r\n    set(PACKAGE_VERSION_COMPATIBLE FALSE)\r\n  else()\r\n    set(PACKAGE_VERSION_COMPATIBLE TRUE)\r\n  endif()\r\nelse()\r\n  if(PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION)\r\n    set(PACKAGE_VERSION_COMPATIBLE FALSE)\r\n  else()\r\n    set(PACKAGE_VERSION_COMPATIBLE TRUE)\r\n    if(PACKAGE_FIND_VERSION STREQUAL PACKAGE_VERSION)\r\n      set(PACKAGE_VERSION_EXACT TRUE)\r\n    endif()\r\n  endif()\r\nendif()\r\n\r\n\r\n# if the installed or the using project don't have CMAKE_SIZEOF_VOID_P set, ignore it:\r\nif(\"${CMAKE_SIZEOF_VOID_P}\" STREQUAL \"\" OR \"\" STREQUAL \"\")\r\n  return()\r\nendif()\r\n\r\n# check that the installed version has the same 32/64bit-ness as the one which is currently searching:\r\nif(NOT CMAKE_SIZEOF_VOID_P STREQUAL \"\")\r\n  math(EXPR installedBits \" * 8\")\r\n  set(PACKAGE_VERSION \"${PACKAGE_VERSION} (${installedBits}bit)\")\r\n  
set(PACKAGE_VERSION_UNSUITABLE TRUE)\r\nendif()\r\n"
  },
  {
    "path": "svm/OpenCL/share/cmake/OpenCLHeadersCpp/OpenCLHeadersCppTargets.cmake",
    "content": "# Generated by CMake\r\n\r\nif(\"${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION}\" LESS 2.8)\r\n   message(FATAL_ERROR \"CMake >= 2.8.0 required\")\r\nendif()\r\nif(CMAKE_VERSION VERSION_LESS \"3.0.0\")\r\n   message(FATAL_ERROR \"CMake >= 3.0.0 required\")\r\nendif()\r\ncmake_policy(PUSH)\r\ncmake_policy(VERSION 3.0.0...3.28)\r\n#----------------------------------------------------------------\r\n# Generated CMake target import file.\r\n#----------------------------------------------------------------\r\n\r\n# Commands may need to know the format version.\r\nset(CMAKE_IMPORT_FILE_VERSION 1)\r\n\r\n# Protect against multiple inclusion, which would fail when already imported targets are added once more.\r\nset(_cmake_targets_defined \"\")\r\nset(_cmake_targets_not_defined \"\")\r\nset(_cmake_expected_targets \"\")\r\nforeach(_cmake_expected_target IN ITEMS OpenCL::HeadersCpp)\r\n  list(APPEND _cmake_expected_targets \"${_cmake_expected_target}\")\r\n  if(TARGET \"${_cmake_expected_target}\")\r\n    list(APPEND _cmake_targets_defined \"${_cmake_expected_target}\")\r\n  else()\r\n    list(APPEND _cmake_targets_not_defined \"${_cmake_expected_target}\")\r\n  endif()\r\nendforeach()\r\nunset(_cmake_expected_target)\r\nif(_cmake_targets_defined STREQUAL _cmake_expected_targets)\r\n  unset(_cmake_targets_defined)\r\n  unset(_cmake_targets_not_defined)\r\n  unset(_cmake_expected_targets)\r\n  unset(CMAKE_IMPORT_FILE_VERSION)\r\n  cmake_policy(POP)\r\n  return()\r\nendif()\r\nif(NOT _cmake_targets_defined STREQUAL \"\")\r\n  string(REPLACE \";\" \", \" _cmake_targets_defined_text \"${_cmake_targets_defined}\")\r\n  string(REPLACE \";\" \", \" _cmake_targets_not_defined_text \"${_cmake_targets_not_defined}\")\r\n  message(FATAL_ERROR \"Some (but not all) targets in this export set were already defined.\\nTargets Defined: ${_cmake_targets_defined_text}\\nTargets not yet defined: 
${_cmake_targets_not_defined_text}\\n\")\r\nendif()\r\nunset(_cmake_targets_defined)\r\nunset(_cmake_targets_not_defined)\r\nunset(_cmake_expected_targets)\r\n\r\n\r\n# Compute the installation prefix relative to this file.\r\nget_filename_component(_IMPORT_PREFIX \"${CMAKE_CURRENT_LIST_FILE}\" PATH)\r\nget_filename_component(_IMPORT_PREFIX \"${_IMPORT_PREFIX}\" PATH)\r\nget_filename_component(_IMPORT_PREFIX \"${_IMPORT_PREFIX}\" PATH)\r\nget_filename_component(_IMPORT_PREFIX \"${_IMPORT_PREFIX}\" PATH)\r\nif(_IMPORT_PREFIX STREQUAL \"/\")\r\n  set(_IMPORT_PREFIX \"\")\r\nendif()\r\n\r\n# Create imported target OpenCL::HeadersCpp\r\nadd_library(OpenCL::HeadersCpp INTERFACE IMPORTED)\r\n\r\nset_target_properties(OpenCL::HeadersCpp PROPERTIES\r\n  INTERFACE_INCLUDE_DIRECTORIES \"${_IMPORT_PREFIX}/include\"\r\n  INTERFACE_LINK_LIBRARIES \"OpenCL::Headers\"\r\n)\r\n\r\n# Load information for each installed configuration.\r\nfile(GLOB _cmake_config_files \"${CMAKE_CURRENT_LIST_DIR}/OpenCLHeadersCppTargets-*.cmake\")\r\nforeach(_cmake_config_file IN LISTS _cmake_config_files)\r\n  include(\"${_cmake_config_file}\")\r\nendforeach()\r\nunset(_cmake_config_file)\r\nunset(_cmake_config_files)\r\n\r\n# Cleanup temporary variables.\r\nset(_IMPORT_PREFIX)\r\n\r\n# Loop over all imported files and verify that they actually exist\r\nforeach(_cmake_target IN LISTS _cmake_import_check_targets)\r\n  if(CMAKE_VERSION VERSION_LESS \"3.28\"\r\n      OR NOT DEFINED _cmake_import_check_xcframework_for_${_cmake_target}\r\n      OR NOT IS_DIRECTORY \"${_cmake_import_check_xcframework_for_${_cmake_target}}\")\r\n    foreach(_cmake_file IN LISTS \"_cmake_import_check_files_for_${_cmake_target}\")\r\n      if(NOT EXISTS \"${_cmake_file}\")\r\n        message(FATAL_ERROR \"The imported target \\\"${_cmake_target}\\\" references the file\r\n   \\\"${_cmake_file}\\\"\r\nbut this file does not exist.  
Possible reasons include:\r\n* The file was deleted, renamed, or moved to another location.\r\n* An install or uninstall procedure did not complete successfully.\r\n* The installation package was faulty and contained\r\n   \\\"${CMAKE_CURRENT_LIST_FILE}\\\"\r\nbut not all the files it references.\r\n\")\r\n      endif()\r\n    endforeach()\r\n  endif()\r\n  unset(_cmake_file)\r\n  unset(\"_cmake_import_check_files_for_${_cmake_target}\")\r\nendforeach()\r\nunset(_cmake_target)\r\nunset(_cmake_import_check_targets)\r\n\r\n# Make sure the targets which have been exported in some other\r\n# export set exist.\r\nunset(${CMAKE_FIND_PACKAGE_NAME}_NOT_FOUND_MESSAGE_targets)\r\nforeach(_target \"OpenCL::Headers\" )\r\n  if(NOT TARGET \"${_target}\" )\r\n    set(${CMAKE_FIND_PACKAGE_NAME}_NOT_FOUND_MESSAGE_targets \"${${CMAKE_FIND_PACKAGE_NAME}_NOT_FOUND_MESSAGE_targets} ${_target}\")\r\n  endif()\r\nendforeach()\r\n\r\nif(DEFINED ${CMAKE_FIND_PACKAGE_NAME}_NOT_FOUND_MESSAGE_targets)\r\n  if(CMAKE_FIND_PACKAGE_NAME)\r\n    set( ${CMAKE_FIND_PACKAGE_NAME}_FOUND FALSE)\r\n    set( ${CMAKE_FIND_PACKAGE_NAME}_NOT_FOUND_MESSAGE \"The following imported targets are referenced, but are missing: ${${CMAKE_FIND_PACKAGE_NAME}_NOT_FOUND_MESSAGE_targets}\")\r\n  else()\r\n    message(FATAL_ERROR \"The following imported targets are referenced, but are missing: ${${CMAKE_FIND_PACKAGE_NAME}_NOT_FOUND_MESSAGE_targets}\")\r\n  endif()\r\nendif()\r\nunset(${CMAKE_FIND_PACKAGE_NAME}_NOT_FOUND_MESSAGE_targets)\r\n\r\n# Commands beyond this point should not need to know the version.\r\nset(CMAKE_IMPORT_FILE_VERSION)\r\ncmake_policy(POP)\r\n"
  },
  {
    "path": "svm/OpenCL/share/cmake/OpenCLICDLoader/OpenCLICDLoaderConfig.cmake",
    "content": "include(\"${CMAKE_CURRENT_LIST_DIR}/OpenCLICDLoaderTargets.cmake\")"
  },
  {
    "path": "svm/OpenCL/share/cmake/OpenCLICDLoader/OpenCLICDLoaderConfigVersion.cmake",
    "content": "# This is a basic version file for the Config-mode of find_package().\r\n# It is used by write_basic_package_version_file() as input file for configure_file()\r\n# to create a version-file which can be installed along a config.cmake file.\r\n#\r\n# The created file sets PACKAGE_VERSION_EXACT if the current version string and\r\n# the requested version string are exactly the same and it sets\r\n# PACKAGE_VERSION_COMPATIBLE if the current version is >= requested version.\r\n# The variable CVF_VERSION must be set before calling configure_file().\r\n\r\nset(PACKAGE_VERSION \"3.0\")\r\n\r\nif (PACKAGE_FIND_VERSION_RANGE)\r\n  # Package version must be in the requested version range\r\n  if ((PACKAGE_FIND_VERSION_RANGE_MIN STREQUAL \"INCLUDE\" AND PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION_MIN)\r\n      OR ((PACKAGE_FIND_VERSION_RANGE_MAX STREQUAL \"INCLUDE\" AND PACKAGE_VERSION VERSION_GREATER PACKAGE_FIND_VERSION_MAX)\r\n        OR (PACKAGE_FIND_VERSION_RANGE_MAX STREQUAL \"EXCLUDE\" AND PACKAGE_VERSION VERSION_GREATER_EQUAL PACKAGE_FIND_VERSION_MAX)))\r\n    set(PACKAGE_VERSION_COMPATIBLE FALSE)\r\n  else()\r\n    set(PACKAGE_VERSION_COMPATIBLE TRUE)\r\n  endif()\r\nelse()\r\n  if(PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION)\r\n    set(PACKAGE_VERSION_COMPATIBLE FALSE)\r\n  else()\r\n    set(PACKAGE_VERSION_COMPATIBLE TRUE)\r\n    if(PACKAGE_FIND_VERSION STREQUAL PACKAGE_VERSION)\r\n      set(PACKAGE_VERSION_EXACT TRUE)\r\n    endif()\r\n  endif()\r\nendif()\r\n\r\n\r\n# if the installed or the using project don't have CMAKE_SIZEOF_VOID_P set, ignore it:\r\nif(\"${CMAKE_SIZEOF_VOID_P}\" STREQUAL \"\" OR \"\" STREQUAL \"\")\r\n  return()\r\nendif()\r\n\r\n# check that the installed version has the same 32/64bit-ness as the one which is currently searching:\r\nif(NOT CMAKE_SIZEOF_VOID_P STREQUAL \"\")\r\n  math(EXPR installedBits \" * 8\")\r\n  set(PACKAGE_VERSION \"${PACKAGE_VERSION} (${installedBits}bit)\")\r\n  
set(PACKAGE_VERSION_UNSUITABLE TRUE)\r\nendif()\r\n"
  },
  {
    "path": "svm/OpenCL/share/cmake/OpenCLICDLoader/OpenCLICDLoaderTargets-debug.cmake",
    "content": "#----------------------------------------------------------------\r\n# Generated CMake target import file for configuration \"Debug\".\r\n#----------------------------------------------------------------\r\n\r\n# Commands may need to know the format version.\r\nset(CMAKE_IMPORT_FILE_VERSION 1)\r\n\r\n# Import target \"OpenCL::OpenCL\" for configuration \"Debug\"\r\nset_property(TARGET OpenCL::OpenCL APPEND PROPERTY IMPORTED_CONFIGURATIONS DEBUG)\r\nset_target_properties(OpenCL::OpenCL PROPERTIES\r\n  IMPORTED_IMPLIB_DEBUG \"${_IMPORT_PREFIX}/lib/OpenCL.lib\"\r\n  IMPORTED_LOCATION_DEBUG \"${_IMPORT_PREFIX}/bin/OpenCL.dll\"\r\n  )\r\n\r\nlist(APPEND _cmake_import_check_targets OpenCL::OpenCL )\r\nlist(APPEND _cmake_import_check_files_for_OpenCL::OpenCL \"${_IMPORT_PREFIX}/lib/OpenCL.lib\" \"${_IMPORT_PREFIX}/bin/OpenCL.dll\" )\r\n\r\n# Commands beyond this point should not need to know the version.\r\nset(CMAKE_IMPORT_FILE_VERSION)\r\n"
  },
  {
    "path": "svm/OpenCL/share/cmake/OpenCLICDLoader/OpenCLICDLoaderTargets-release.cmake",
    "content": "#----------------------------------------------------------------\r\n# Generated CMake target import file for configuration \"Release\".\r\n#----------------------------------------------------------------\r\n\r\n# Commands may need to know the format version.\r\nset(CMAKE_IMPORT_FILE_VERSION 1)\r\n\r\n# Import target \"OpenCL::OpenCL\" for configuration \"Release\"\r\nset_property(TARGET OpenCL::OpenCL APPEND PROPERTY IMPORTED_CONFIGURATIONS RELEASE)\r\nset_target_properties(OpenCL::OpenCL PROPERTIES\r\n  IMPORTED_IMPLIB_RELEASE \"${_IMPORT_PREFIX}/lib/OpenCL.lib\"\r\n  IMPORTED_LOCATION_RELEASE \"${_IMPORT_PREFIX}/bin/OpenCL.dll\"\r\n  )\r\n\r\nlist(APPEND _cmake_import_check_targets OpenCL::OpenCL )\r\nlist(APPEND _cmake_import_check_files_for_OpenCL::OpenCL \"${_IMPORT_PREFIX}/lib/OpenCL.lib\" \"${_IMPORT_PREFIX}/bin/OpenCL.dll\" )\r\n\r\n# Commands beyond this point should not need to know the version.\r\nset(CMAKE_IMPORT_FILE_VERSION)\r\n"
  },
  {
    "path": "svm/OpenCL/share/cmake/OpenCLICDLoader/OpenCLICDLoaderTargets.cmake",
    "content": "# Generated by CMake\r\n\r\nif(\"${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION}\" LESS 2.8)\r\n   message(FATAL_ERROR \"CMake >= 2.8.0 required\")\r\nendif()\r\nif(CMAKE_VERSION VERSION_LESS \"2.8.12\")\r\n   message(FATAL_ERROR \"CMake >= 2.8.12 required\")\r\nendif()\r\ncmake_policy(PUSH)\r\ncmake_policy(VERSION 2.8.12...3.28)\r\n#----------------------------------------------------------------\r\n# Generated CMake target import file.\r\n#----------------------------------------------------------------\r\n\r\n# Commands may need to know the format version.\r\nset(CMAKE_IMPORT_FILE_VERSION 1)\r\n\r\n# Protect against multiple inclusion, which would fail when already imported targets are added once more.\r\nset(_cmake_targets_defined \"\")\r\nset(_cmake_targets_not_defined \"\")\r\nset(_cmake_expected_targets \"\")\r\nforeach(_cmake_expected_target IN ITEMS OpenCL::OpenCL)\r\n  list(APPEND _cmake_expected_targets \"${_cmake_expected_target}\")\r\n  if(TARGET \"${_cmake_expected_target}\")\r\n    list(APPEND _cmake_targets_defined \"${_cmake_expected_target}\")\r\n  else()\r\n    list(APPEND _cmake_targets_not_defined \"${_cmake_expected_target}\")\r\n  endif()\r\nendforeach()\r\nunset(_cmake_expected_target)\r\nif(_cmake_targets_defined STREQUAL _cmake_expected_targets)\r\n  unset(_cmake_targets_defined)\r\n  unset(_cmake_targets_not_defined)\r\n  unset(_cmake_expected_targets)\r\n  unset(CMAKE_IMPORT_FILE_VERSION)\r\n  cmake_policy(POP)\r\n  return()\r\nendif()\r\nif(NOT _cmake_targets_defined STREQUAL \"\")\r\n  string(REPLACE \";\" \", \" _cmake_targets_defined_text \"${_cmake_targets_defined}\")\r\n  string(REPLACE \";\" \", \" _cmake_targets_not_defined_text \"${_cmake_targets_not_defined}\")\r\n  message(FATAL_ERROR \"Some (but not all) targets in this export set were already defined.\\nTargets Defined: ${_cmake_targets_defined_text}\\nTargets not yet defined: 
${_cmake_targets_not_defined_text}\\n\")\r\nendif()\r\nunset(_cmake_targets_defined)\r\nunset(_cmake_targets_not_defined)\r\nunset(_cmake_expected_targets)\r\n\r\n\r\n# Compute the installation prefix relative to this file.\r\nget_filename_component(_IMPORT_PREFIX \"${CMAKE_CURRENT_LIST_FILE}\" PATH)\r\nget_filename_component(_IMPORT_PREFIX \"${_IMPORT_PREFIX}\" PATH)\r\nget_filename_component(_IMPORT_PREFIX \"${_IMPORT_PREFIX}\" PATH)\r\nget_filename_component(_IMPORT_PREFIX \"${_IMPORT_PREFIX}\" PATH)\r\nif(_IMPORT_PREFIX STREQUAL \"/\")\r\n  set(_IMPORT_PREFIX \"\")\r\nendif()\r\n\r\n# Create imported target OpenCL::OpenCL\r\nadd_library(OpenCL::OpenCL SHARED IMPORTED)\r\n\r\nset_target_properties(OpenCL::OpenCL PROPERTIES\r\n  INTERFACE_LINK_LIBRARIES \"OpenCL::Headers\"\r\n)\r\n\r\n# Load information for each installed configuration.\r\nfile(GLOB _cmake_config_files \"${CMAKE_CURRENT_LIST_DIR}/OpenCLICDLoaderTargets-*.cmake\")\r\nforeach(_cmake_config_file IN LISTS _cmake_config_files)\r\n  include(\"${_cmake_config_file}\")\r\nendforeach()\r\nunset(_cmake_config_file)\r\nunset(_cmake_config_files)\r\n\r\n# Cleanup temporary variables.\r\nset(_IMPORT_PREFIX)\r\n\r\n# Loop over all imported files and verify that they actually exist\r\nforeach(_cmake_target IN LISTS _cmake_import_check_targets)\r\n  if(CMAKE_VERSION VERSION_LESS \"3.28\"\r\n      OR NOT DEFINED _cmake_import_check_xcframework_for_${_cmake_target}\r\n      OR NOT IS_DIRECTORY \"${_cmake_import_check_xcframework_for_${_cmake_target}}\")\r\n    foreach(_cmake_file IN LISTS \"_cmake_import_check_files_for_${_cmake_target}\")\r\n      if(NOT EXISTS \"${_cmake_file}\")\r\n        message(FATAL_ERROR \"The imported target \\\"${_cmake_target}\\\" references the file\r\n   \\\"${_cmake_file}\\\"\r\nbut this file does not exist.  
Possible reasons include:\r\n* The file was deleted, renamed, or moved to another location.\r\n* An install or uninstall procedure did not complete successfully.\r\n* The installation package was faulty and contained\r\n   \\\"${CMAKE_CURRENT_LIST_FILE}\\\"\r\nbut not all the files it references.\r\n\")\r\n      endif()\r\n    endforeach()\r\n  endif()\r\n  unset(_cmake_file)\r\n  unset(\"_cmake_import_check_files_for_${_cmake_target}\")\r\nendforeach()\r\nunset(_cmake_target)\r\nunset(_cmake_import_check_targets)\r\n\r\n# Make sure the targets which have been exported in some other\r\n# export set exist.\r\nunset(${CMAKE_FIND_PACKAGE_NAME}_NOT_FOUND_MESSAGE_targets)\r\nforeach(_target \"OpenCL::Headers\" )\r\n  if(NOT TARGET \"${_target}\" )\r\n    set(${CMAKE_FIND_PACKAGE_NAME}_NOT_FOUND_MESSAGE_targets \"${${CMAKE_FIND_PACKAGE_NAME}_NOT_FOUND_MESSAGE_targets} ${_target}\")\r\n  endif()\r\nendforeach()\r\n\r\nif(DEFINED ${CMAKE_FIND_PACKAGE_NAME}_NOT_FOUND_MESSAGE_targets)\r\n  if(CMAKE_FIND_PACKAGE_NAME)\r\n    set( ${CMAKE_FIND_PACKAGE_NAME}_FOUND FALSE)\r\n    set( ${CMAKE_FIND_PACKAGE_NAME}_NOT_FOUND_MESSAGE \"The following imported targets are referenced, but are missing: ${${CMAKE_FIND_PACKAGE_NAME}_NOT_FOUND_MESSAGE_targets}\")\r\n  else()\r\n    message(FATAL_ERROR \"The following imported targets are referenced, but are missing: ${${CMAKE_FIND_PACKAGE_NAME}_NOT_FOUND_MESSAGE_targets}\")\r\n  endif()\r\nendif()\r\nunset(${CMAKE_FIND_PACKAGE_NAME}_NOT_FOUND_MESSAGE_targets)\r\n\r\n# Commands beyond this point should not need to know the version.\r\nset(CMAKE_IMPORT_FILE_VERSION)\r\ncmake_policy(POP)\r\n"
  },
  {
    "path": "svm/OpenCL/share/cmake/OpenCLUtils/OpenCLUtilsConfig.cmake",
    "content": "include(\"${CMAKE_CURRENT_LIST_DIR}/OpenCLUtilsTargets.cmake\")"
  },
  {
    "path": "svm/OpenCL/share/cmake/OpenCLUtils/OpenCLUtilsConfigVersion.cmake",
    "content": "# This is a basic version file for the Config-mode of find_package().\r\n# It is used by write_basic_package_version_file() as input file for configure_file()\r\n# to create a version-file which can be installed along a config.cmake file.\r\n#\r\n# The created file sets PACKAGE_VERSION_EXACT if the current version string and\r\n# the requested version string are exactly the same and it sets\r\n# PACKAGE_VERSION_COMPATIBLE if the current version is >= requested version.\r\n# The variable CVF_VERSION must be set before calling configure_file().\r\n\r\nset(PACKAGE_VERSION \"2024.10.24\")\r\n\r\nif (PACKAGE_FIND_VERSION_RANGE)\r\n  # Package version must be in the requested version range\r\n  if ((PACKAGE_FIND_VERSION_RANGE_MIN STREQUAL \"INCLUDE\" AND PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION_MIN)\r\n      OR ((PACKAGE_FIND_VERSION_RANGE_MAX STREQUAL \"INCLUDE\" AND PACKAGE_VERSION VERSION_GREATER PACKAGE_FIND_VERSION_MAX)\r\n        OR (PACKAGE_FIND_VERSION_RANGE_MAX STREQUAL \"EXCLUDE\" AND PACKAGE_VERSION VERSION_GREATER_EQUAL PACKAGE_FIND_VERSION_MAX)))\r\n    set(PACKAGE_VERSION_COMPATIBLE FALSE)\r\n  else()\r\n    set(PACKAGE_VERSION_COMPATIBLE TRUE)\r\n  endif()\r\nelse()\r\n  if(PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION)\r\n    set(PACKAGE_VERSION_COMPATIBLE FALSE)\r\n  else()\r\n    set(PACKAGE_VERSION_COMPATIBLE TRUE)\r\n    if(PACKAGE_FIND_VERSION STREQUAL PACKAGE_VERSION)\r\n      set(PACKAGE_VERSION_EXACT TRUE)\r\n    endif()\r\n  endif()\r\nendif()\r\n\r\n\r\n# if the installed or the using project don't have CMAKE_SIZEOF_VOID_P set, ignore it:\r\nif(\"${CMAKE_SIZEOF_VOID_P}\" STREQUAL \"\" OR \"\" STREQUAL \"\")\r\n  return()\r\nendif()\r\n\r\n# check that the installed version has the same 32/64bit-ness as the one which is currently searching:\r\nif(NOT CMAKE_SIZEOF_VOID_P STREQUAL \"\")\r\n  math(EXPR installedBits \" * 8\")\r\n  set(PACKAGE_VERSION \"${PACKAGE_VERSION} (${installedBits}bit)\")\r\n  
set(PACKAGE_VERSION_UNSUITABLE TRUE)\r\nendif()\r\n"
  },
  {
    "path": "svm/OpenCL/share/cmake/OpenCLUtils/OpenCLUtilsTargets-debug.cmake",
    "content": "#----------------------------------------------------------------\r\n# Generated CMake target import file for configuration \"Debug\".\r\n#----------------------------------------------------------------\r\n\r\n# Commands may need to know the format version.\r\nset(CMAKE_IMPORT_FILE_VERSION 1)\r\n\r\n# Import target \"OpenCL::Utils\" for configuration \"Debug\"\r\nset_property(TARGET OpenCL::Utils APPEND PROPERTY IMPORTED_CONFIGURATIONS DEBUG)\r\nset_target_properties(OpenCL::Utils PROPERTIES\r\n  IMPORTED_LINK_INTERFACE_LANGUAGES_DEBUG \"C\"\r\n  IMPORTED_LOCATION_DEBUG \"${_IMPORT_PREFIX}/lib/OpenCLUtilsd.lib\"\r\n  )\r\n\r\nlist(APPEND _cmake_import_check_targets OpenCL::Utils )\r\nlist(APPEND _cmake_import_check_files_for_OpenCL::Utils \"${_IMPORT_PREFIX}/lib/OpenCLUtilsd.lib\" )\r\n\r\n# Commands beyond this point should not need to know the version.\r\nset(CMAKE_IMPORT_FILE_VERSION)\r\n"
  },
  {
    "path": "svm/OpenCL/share/cmake/OpenCLUtils/OpenCLUtilsTargets-release.cmake",
    "content": "#----------------------------------------------------------------\r\n# Generated CMake target import file for configuration \"Release\".\r\n#----------------------------------------------------------------\r\n\r\n# Commands may need to know the format version.\r\nset(CMAKE_IMPORT_FILE_VERSION 1)\r\n\r\n# Import target \"OpenCL::Utils\" for configuration \"Release\"\r\nset_property(TARGET OpenCL::Utils APPEND PROPERTY IMPORTED_CONFIGURATIONS RELEASE)\r\nset_target_properties(OpenCL::Utils PROPERTIES\r\n  IMPORTED_LINK_INTERFACE_LANGUAGES_RELEASE \"C\"\r\n  IMPORTED_LOCATION_RELEASE \"${_IMPORT_PREFIX}/lib/OpenCLUtils.lib\"\r\n  )\r\n\r\nlist(APPEND _cmake_import_check_targets OpenCL::Utils )\r\nlist(APPEND _cmake_import_check_files_for_OpenCL::Utils \"${_IMPORT_PREFIX}/lib/OpenCLUtils.lib\" )\r\n\r\n# Commands beyond this point should not need to know the version.\r\nset(CMAKE_IMPORT_FILE_VERSION)\r\n"
  },
  {
    "path": "svm/OpenCL/share/cmake/OpenCLUtils/OpenCLUtilsTargets.cmake",
    "content": "# Generated by CMake\r\n\r\nif(\"${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION}\" LESS 2.8)\r\n   message(FATAL_ERROR \"CMake >= 2.8.0 required\")\r\nendif()\r\nif(CMAKE_VERSION VERSION_LESS \"2.8.12\")\r\n   message(FATAL_ERROR \"CMake >= 2.8.12 required\")\r\nendif()\r\ncmake_policy(PUSH)\r\ncmake_policy(VERSION 2.8.12...3.28)\r\n#----------------------------------------------------------------\r\n# Generated CMake target import file.\r\n#----------------------------------------------------------------\r\n\r\n# Commands may need to know the format version.\r\nset(CMAKE_IMPORT_FILE_VERSION 1)\r\n\r\n# Protect against multiple inclusion, which would fail when already imported targets are added once more.\r\nset(_cmake_targets_defined \"\")\r\nset(_cmake_targets_not_defined \"\")\r\nset(_cmake_expected_targets \"\")\r\nforeach(_cmake_expected_target IN ITEMS OpenCL::Utils)\r\n  list(APPEND _cmake_expected_targets \"${_cmake_expected_target}\")\r\n  if(TARGET \"${_cmake_expected_target}\")\r\n    list(APPEND _cmake_targets_defined \"${_cmake_expected_target}\")\r\n  else()\r\n    list(APPEND _cmake_targets_not_defined \"${_cmake_expected_target}\")\r\n  endif()\r\nendforeach()\r\nunset(_cmake_expected_target)\r\nif(_cmake_targets_defined STREQUAL _cmake_expected_targets)\r\n  unset(_cmake_targets_defined)\r\n  unset(_cmake_targets_not_defined)\r\n  unset(_cmake_expected_targets)\r\n  unset(CMAKE_IMPORT_FILE_VERSION)\r\n  cmake_policy(POP)\r\n  return()\r\nendif()\r\nif(NOT _cmake_targets_defined STREQUAL \"\")\r\n  string(REPLACE \";\" \", \" _cmake_targets_defined_text \"${_cmake_targets_defined}\")\r\n  string(REPLACE \";\" \", \" _cmake_targets_not_defined_text \"${_cmake_targets_not_defined}\")\r\n  message(FATAL_ERROR \"Some (but not all) targets in this export set were already defined.\\nTargets Defined: ${_cmake_targets_defined_text}\\nTargets not yet defined: 
${_cmake_targets_not_defined_text}\\n\")\r\nendif()\r\nunset(_cmake_targets_defined)\r\nunset(_cmake_targets_not_defined)\r\nunset(_cmake_expected_targets)\r\n\r\n\r\n# Compute the installation prefix relative to this file.\r\nget_filename_component(_IMPORT_PREFIX \"${CMAKE_CURRENT_LIST_FILE}\" PATH)\r\nget_filename_component(_IMPORT_PREFIX \"${_IMPORT_PREFIX}\" PATH)\r\nget_filename_component(_IMPORT_PREFIX \"${_IMPORT_PREFIX}\" PATH)\r\nget_filename_component(_IMPORT_PREFIX \"${_IMPORT_PREFIX}\" PATH)\r\nif(_IMPORT_PREFIX STREQUAL \"/\")\r\n  set(_IMPORT_PREFIX \"\")\r\nendif()\r\n\r\n# Create imported target OpenCL::Utils\r\nadd_library(OpenCL::Utils STATIC IMPORTED)\r\n\r\nset_target_properties(OpenCL::Utils PROPERTIES\r\n  INTERFACE_COMPILE_DEFINITIONS \"CL_HPP_ENABLE_EXCEPTIONS\"\r\n  INTERFACE_INCLUDE_DIRECTORIES \"${_IMPORT_PREFIX}/include\"\r\n  INTERFACE_LINK_LIBRARIES \"\\$<LINK_ONLY:whereami>;OpenCL::Headers;\\$<\\$<BOOL:>:m>;OpenCL::OpenCL\"\r\n)\r\n\r\n# Load information for each installed configuration.\r\nfile(GLOB _cmake_config_files \"${CMAKE_CURRENT_LIST_DIR}/OpenCLUtilsTargets-*.cmake\")\r\nforeach(_cmake_config_file IN LISTS _cmake_config_files)\r\n  include(\"${_cmake_config_file}\")\r\nendforeach()\r\nunset(_cmake_config_file)\r\nunset(_cmake_config_files)\r\n\r\n# Cleanup temporary variables.\r\nset(_IMPORT_PREFIX)\r\n\r\n# Loop over all imported files and verify that they actually exist\r\nforeach(_cmake_target IN LISTS _cmake_import_check_targets)\r\n  if(CMAKE_VERSION VERSION_LESS \"3.28\"\r\n      OR NOT DEFINED _cmake_import_check_xcframework_for_${_cmake_target}\r\n      OR NOT IS_DIRECTORY \"${_cmake_import_check_xcframework_for_${_cmake_target}}\")\r\n    foreach(_cmake_file IN LISTS \"_cmake_import_check_files_for_${_cmake_target}\")\r\n      if(NOT EXISTS \"${_cmake_file}\")\r\n        message(FATAL_ERROR \"The imported target \\\"${_cmake_target}\\\" references the file\r\n   \\\"${_cmake_file}\\\"\r\nbut this file does not exist. 
 Possible reasons include:\r\n* The file was deleted, renamed, or moved to another location.\r\n* An install or uninstall procedure did not complete successfully.\r\n* The installation package was faulty and contained\r\n   \\\"${CMAKE_CURRENT_LIST_FILE}\\\"\r\nbut not all the files it references.\r\n\")\r\n      endif()\r\n    endforeach()\r\n  endif()\r\n  unset(_cmake_file)\r\n  unset(\"_cmake_import_check_files_for_${_cmake_target}\")\r\nendforeach()\r\nunset(_cmake_target)\r\nunset(_cmake_import_check_targets)\r\n\r\n# Make sure the targets which have been exported in some other\r\n# export set exist.\r\nunset(${CMAKE_FIND_PACKAGE_NAME}_NOT_FOUND_MESSAGE_targets)\r\nforeach(_target \"OpenCL::Headers\" \"OpenCL::OpenCL\" )\r\n  if(NOT TARGET \"${_target}\" )\r\n    set(${CMAKE_FIND_PACKAGE_NAME}_NOT_FOUND_MESSAGE_targets \"${${CMAKE_FIND_PACKAGE_NAME}_NOT_FOUND_MESSAGE_targets} ${_target}\")\r\n  endif()\r\nendforeach()\r\n\r\nif(DEFINED ${CMAKE_FIND_PACKAGE_NAME}_NOT_FOUND_MESSAGE_targets)\r\n  if(CMAKE_FIND_PACKAGE_NAME)\r\n    set( ${CMAKE_FIND_PACKAGE_NAME}_FOUND FALSE)\r\n    set( ${CMAKE_FIND_PACKAGE_NAME}_NOT_FOUND_MESSAGE \"The following imported targets are referenced, but are missing: ${${CMAKE_FIND_PACKAGE_NAME}_NOT_FOUND_MESSAGE_targets}\")\r\n  else()\r\n    message(FATAL_ERROR \"The following imported targets are referenced, but are missing: ${${CMAKE_FIND_PACKAGE_NAME}_NOT_FOUND_MESSAGE_targets}\")\r\n  endif()\r\nendif()\r\nunset(${CMAKE_FIND_PACKAGE_NAME}_NOT_FOUND_MESSAGE_targets)\r\n\r\n# Commands beyond this point should not need to know the version.\r\nset(CMAKE_IMPORT_FILE_VERSION)\r\ncmake_policy(POP)\r\n"
  },
  {
    "path": "svm/OpenCL/share/cmake/OpenCLUtilsCpp/OpenCLUtilsCppConfig.cmake",
    "content": "include(\"${CMAKE_CURRENT_LIST_DIR}/OpenCLUtilsCppTargets.cmake\")"
  },
  {
    "path": "svm/OpenCL/share/cmake/OpenCLUtilsCpp/OpenCLUtilsCppConfigVersion.cmake",
    "content": "# This is a basic version file for the Config-mode of find_package().\r\n# It is used by write_basic_package_version_file() as input file for configure_file()\r\n# to create a version-file which can be installed along a config.cmake file.\r\n#\r\n# The created file sets PACKAGE_VERSION_EXACT if the current version string and\r\n# the requested version string are exactly the same and it sets\r\n# PACKAGE_VERSION_COMPATIBLE if the current version is >= requested version.\r\n# The variable CVF_VERSION must be set before calling configure_file().\r\n\r\nset(PACKAGE_VERSION \"2024.10.24\")\r\n\r\nif (PACKAGE_FIND_VERSION_RANGE)\r\n  # Package version must be in the requested version range\r\n  if ((PACKAGE_FIND_VERSION_RANGE_MIN STREQUAL \"INCLUDE\" AND PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION_MIN)\r\n      OR ((PACKAGE_FIND_VERSION_RANGE_MAX STREQUAL \"INCLUDE\" AND PACKAGE_VERSION VERSION_GREATER PACKAGE_FIND_VERSION_MAX)\r\n        OR (PACKAGE_FIND_VERSION_RANGE_MAX STREQUAL \"EXCLUDE\" AND PACKAGE_VERSION VERSION_GREATER_EQUAL PACKAGE_FIND_VERSION_MAX)))\r\n    set(PACKAGE_VERSION_COMPATIBLE FALSE)\r\n  else()\r\n    set(PACKAGE_VERSION_COMPATIBLE TRUE)\r\n  endif()\r\nelse()\r\n  if(PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION)\r\n    set(PACKAGE_VERSION_COMPATIBLE FALSE)\r\n  else()\r\n    set(PACKAGE_VERSION_COMPATIBLE TRUE)\r\n    if(PACKAGE_FIND_VERSION STREQUAL PACKAGE_VERSION)\r\n      set(PACKAGE_VERSION_EXACT TRUE)\r\n    endif()\r\n  endif()\r\nendif()\r\n\r\n\r\n# if the installed or the using project don't have CMAKE_SIZEOF_VOID_P set, ignore it:\r\nif(\"${CMAKE_SIZEOF_VOID_P}\" STREQUAL \"\" OR \"\" STREQUAL \"\")\r\n  return()\r\nendif()\r\n\r\n# check that the installed version has the same 32/64bit-ness as the one which is currently searching:\r\nif(NOT CMAKE_SIZEOF_VOID_P STREQUAL \"\")\r\n  math(EXPR installedBits \" * 8\")\r\n  set(PACKAGE_VERSION \"${PACKAGE_VERSION} (${installedBits}bit)\")\r\n  
set(PACKAGE_VERSION_UNSUITABLE TRUE)\r\nendif()\r\n"
  },
  {
    "path": "svm/OpenCL/share/cmake/OpenCLUtilsCpp/OpenCLUtilsCppTargets-debug.cmake",
    "content": "#----------------------------------------------------------------\r\n# Generated CMake target import file for configuration \"Debug\".\r\n#----------------------------------------------------------------\r\n\r\n# Commands may need to know the format version.\r\nset(CMAKE_IMPORT_FILE_VERSION 1)\r\n\r\n# Import target \"OpenCL::UtilsCpp\" for configuration \"Debug\"\r\nset_property(TARGET OpenCL::UtilsCpp APPEND PROPERTY IMPORTED_CONFIGURATIONS DEBUG)\r\nset_target_properties(OpenCL::UtilsCpp PROPERTIES\r\n  IMPORTED_LINK_INTERFACE_LANGUAGES_DEBUG \"C;CXX\"\r\n  IMPORTED_LOCATION_DEBUG \"${_IMPORT_PREFIX}/lib/OpenCLUtilsCppd.lib\"\r\n  )\r\n\r\nlist(APPEND _cmake_import_check_targets OpenCL::UtilsCpp )\r\nlist(APPEND _cmake_import_check_files_for_OpenCL::UtilsCpp \"${_IMPORT_PREFIX}/lib/OpenCLUtilsCppd.lib\" )\r\n\r\n# Commands beyond this point should not need to know the version.\r\nset(CMAKE_IMPORT_FILE_VERSION)\r\n"
  },
  {
    "path": "svm/OpenCL/share/cmake/OpenCLUtilsCpp/OpenCLUtilsCppTargets-release.cmake",
    "content": "#----------------------------------------------------------------\r\n# Generated CMake target import file for configuration \"Release\".\r\n#----------------------------------------------------------------\r\n\r\n# Commands may need to know the format version.\r\nset(CMAKE_IMPORT_FILE_VERSION 1)\r\n\r\n# Import target \"OpenCL::UtilsCpp\" for configuration \"Release\"\r\nset_property(TARGET OpenCL::UtilsCpp APPEND PROPERTY IMPORTED_CONFIGURATIONS RELEASE)\r\nset_target_properties(OpenCL::UtilsCpp PROPERTIES\r\n  IMPORTED_LINK_INTERFACE_LANGUAGES_RELEASE \"C;CXX\"\r\n  IMPORTED_LOCATION_RELEASE \"${_IMPORT_PREFIX}/lib/OpenCLUtilsCpp.lib\"\r\n  )\r\n\r\nlist(APPEND _cmake_import_check_targets OpenCL::UtilsCpp )\r\nlist(APPEND _cmake_import_check_files_for_OpenCL::UtilsCpp \"${_IMPORT_PREFIX}/lib/OpenCLUtilsCpp.lib\" )\r\n\r\n# Commands beyond this point should not need to know the version.\r\nset(CMAKE_IMPORT_FILE_VERSION)\r\n"
  },
  {
    "path": "svm/OpenCL/share/cmake/OpenCLUtilsCpp/OpenCLUtilsCppTargets.cmake",
    "content": "# Generated by CMake\r\n\r\nif(\"${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION}\" LESS 2.8)\r\n   message(FATAL_ERROR \"CMake >= 2.8.0 required\")\r\nendif()\r\nif(CMAKE_VERSION VERSION_LESS \"2.8.12\")\r\n   message(FATAL_ERROR \"CMake >= 2.8.12 required\")\r\nendif()\r\ncmake_policy(PUSH)\r\ncmake_policy(VERSION 2.8.12...3.28)\r\n#----------------------------------------------------------------\r\n# Generated CMake target import file.\r\n#----------------------------------------------------------------\r\n\r\n# Commands may need to know the format version.\r\nset(CMAKE_IMPORT_FILE_VERSION 1)\r\n\r\n# Protect against multiple inclusion, which would fail when already imported targets are added once more.\r\nset(_cmake_targets_defined \"\")\r\nset(_cmake_targets_not_defined \"\")\r\nset(_cmake_expected_targets \"\")\r\nforeach(_cmake_expected_target IN ITEMS OpenCL::UtilsCpp)\r\n  list(APPEND _cmake_expected_targets \"${_cmake_expected_target}\")\r\n  if(TARGET \"${_cmake_expected_target}\")\r\n    list(APPEND _cmake_targets_defined \"${_cmake_expected_target}\")\r\n  else()\r\n    list(APPEND _cmake_targets_not_defined \"${_cmake_expected_target}\")\r\n  endif()\r\nendforeach()\r\nunset(_cmake_expected_target)\r\nif(_cmake_targets_defined STREQUAL _cmake_expected_targets)\r\n  unset(_cmake_targets_defined)\r\n  unset(_cmake_targets_not_defined)\r\n  unset(_cmake_expected_targets)\r\n  unset(CMAKE_IMPORT_FILE_VERSION)\r\n  cmake_policy(POP)\r\n  return()\r\nendif()\r\nif(NOT _cmake_targets_defined STREQUAL \"\")\r\n  string(REPLACE \";\" \", \" _cmake_targets_defined_text \"${_cmake_targets_defined}\")\r\n  string(REPLACE \";\" \", \" _cmake_targets_not_defined_text \"${_cmake_targets_not_defined}\")\r\n  message(FATAL_ERROR \"Some (but not all) targets in this export set were already defined.\\nTargets Defined: ${_cmake_targets_defined_text}\\nTargets not yet defined: 
${_cmake_targets_not_defined_text}\\n\")\r\nendif()\r\nunset(_cmake_targets_defined)\r\nunset(_cmake_targets_not_defined)\r\nunset(_cmake_expected_targets)\r\n\r\n\r\n# Compute the installation prefix relative to this file.\r\nget_filename_component(_IMPORT_PREFIX \"${CMAKE_CURRENT_LIST_FILE}\" PATH)\r\nget_filename_component(_IMPORT_PREFIX \"${_IMPORT_PREFIX}\" PATH)\r\nget_filename_component(_IMPORT_PREFIX \"${_IMPORT_PREFIX}\" PATH)\r\nget_filename_component(_IMPORT_PREFIX \"${_IMPORT_PREFIX}\" PATH)\r\nif(_IMPORT_PREFIX STREQUAL \"/\")\r\n  set(_IMPORT_PREFIX \"\")\r\nendif()\r\n\r\n# Create imported target OpenCL::UtilsCpp\r\nadd_library(OpenCL::UtilsCpp STATIC IMPORTED)\r\n\r\nset_target_properties(OpenCL::UtilsCpp PROPERTIES\r\n  INTERFACE_COMPILE_DEFINITIONS \"CL_HPP_ENABLE_EXCEPTIONS\"\r\n  INTERFACE_INCLUDE_DIRECTORIES \"${_IMPORT_PREFIX}/include\"\r\n  INTERFACE_LINK_LIBRARIES \"\\$<LINK_ONLY:whereami>;OpenCL::HeadersCpp;OpenCL::Utils;OpenCL::OpenCL\"\r\n)\r\n\r\n# Load information for each installed configuration.\r\nfile(GLOB _cmake_config_files \"${CMAKE_CURRENT_LIST_DIR}/OpenCLUtilsCppTargets-*.cmake\")\r\nforeach(_cmake_config_file IN LISTS _cmake_config_files)\r\n  include(\"${_cmake_config_file}\")\r\nendforeach()\r\nunset(_cmake_config_file)\r\nunset(_cmake_config_files)\r\n\r\n# Cleanup temporary variables.\r\nset(_IMPORT_PREFIX)\r\n\r\n# Loop over all imported files and verify that they actually exist\r\nforeach(_cmake_target IN LISTS _cmake_import_check_targets)\r\n  if(CMAKE_VERSION VERSION_LESS \"3.28\"\r\n      OR NOT DEFINED _cmake_import_check_xcframework_for_${_cmake_target}\r\n      OR NOT IS_DIRECTORY \"${_cmake_import_check_xcframework_for_${_cmake_target}}\")\r\n    foreach(_cmake_file IN LISTS \"_cmake_import_check_files_for_${_cmake_target}\")\r\n      if(NOT EXISTS \"${_cmake_file}\")\r\n        message(FATAL_ERROR \"The imported target \\\"${_cmake_target}\\\" references the file\r\n   \\\"${_cmake_file}\\\"\r\nbut this file does 
not exist.  Possible reasons include:\r\n* The file was deleted, renamed, or moved to another location.\r\n* An install or uninstall procedure did not complete successfully.\r\n* The installation package was faulty and contained\r\n   \\\"${CMAKE_CURRENT_LIST_FILE}\\\"\r\nbut not all the files it references.\r\n\")\r\n      endif()\r\n    endforeach()\r\n  endif()\r\n  unset(_cmake_file)\r\n  unset(\"_cmake_import_check_files_for_${_cmake_target}\")\r\nendforeach()\r\nunset(_cmake_target)\r\nunset(_cmake_import_check_targets)\r\n\r\n# Make sure the targets which have been exported in some other\r\n# export set exist.\r\nunset(${CMAKE_FIND_PACKAGE_NAME}_NOT_FOUND_MESSAGE_targets)\r\nforeach(_target \"OpenCL::HeadersCpp\" \"OpenCL::Utils\" \"OpenCL::OpenCL\" )\r\n  if(NOT TARGET \"${_target}\" )\r\n    set(${CMAKE_FIND_PACKAGE_NAME}_NOT_FOUND_MESSAGE_targets \"${${CMAKE_FIND_PACKAGE_NAME}_NOT_FOUND_MESSAGE_targets} ${_target}\")\r\n  endif()\r\nendforeach()\r\n\r\nif(DEFINED ${CMAKE_FIND_PACKAGE_NAME}_NOT_FOUND_MESSAGE_targets)\r\n  if(CMAKE_FIND_PACKAGE_NAME)\r\n    set( ${CMAKE_FIND_PACKAGE_NAME}_FOUND FALSE)\r\n    set( ${CMAKE_FIND_PACKAGE_NAME}_NOT_FOUND_MESSAGE \"The following imported targets are referenced, but are missing: ${${CMAKE_FIND_PACKAGE_NAME}_NOT_FOUND_MESSAGE_targets}\")\r\n  else()\r\n    message(FATAL_ERROR \"The following imported targets are referenced, but are missing: ${${CMAKE_FIND_PACKAGE_NAME}_NOT_FOUND_MESSAGE_targets}\")\r\n  endif()\r\nendif()\r\nunset(${CMAKE_FIND_PACKAGE_NAME}_NOT_FOUND_MESSAGE_targets)\r\n\r\n# Commands beyond this point should not need to know the version.\r\nset(CMAKE_IMPORT_FILE_VERSION)\r\ncmake_policy(POP)\r\n"
  },
  {
    "path": "svm/OpenCL/share/pkgconfig/OpenCL-CLHPP.pc",
    "content": "prefix=D:/a/OpenCL-SDK/OpenCL-SDK/install\r\nincludedir=${prefix}/include\r\n\r\nName: OpenCL-CLHPP\r\nDescription: OpenCL API C++ bindings\r\nRequires: OpenCL-Headers\r\nVersion: 3.0\r\nCflags: -I${includedir}\r\n"
  },
  {
    "path": "svm/OpenCL/share/pkgconfig/OpenCL-Headers.pc",
    "content": "prefix=D:/a/OpenCL-SDK/OpenCL-SDK/install\r\nincludedir=${prefix}/include\r\n\r\nName: OpenCL-Headers\r\nDescription: Khronos OpenCL Headers\r\nVersion: 3.0\r\nCflags: -I${includedir}\r\n"
  },
  {
    "path": "svm/atomic_latency_kernel.cl",
    "content": "__kernel void atomic_exec_latency_test(__global int* A, int count) {\r\n    int current = 1;\r\n    while (current <= 2 * count) {\r\n        if (atomic_cmpxchg(A, current - 1, current) == current - 1) {\r\n            current += 2;\r\n            // printf(\"gpu current = %d\\n\", current);\r\n        } // else printf(\"A = %d wait for %d\\n\", *A, current - 1);\r\n    }\r\n}\r\n\r\n__kernel void increment_on_gpu(__global int *A)\r\n{\r\n    *A = *A + 1;\r\n}"
  },
  {
    "path": "svm/svm.sln",
    "content": "﻿\r\nMicrosoft Visual Studio Solution File, Format Version 12.00\r\n# Visual Studio Version 17\r\nVisualStudioVersion = 17.12.35527.113 d17.12\r\nMinimumVisualStudioVersion = 10.0.40219.1\r\nProject(\"{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}\") = \"svm\", \"svm.vcxproj\", \"{411AB5E4-FD55-4478-83F2-80C51F205FA7}\"\r\nEndProject\r\nGlobal\r\n\tGlobalSection(SolutionConfigurationPlatforms) = preSolution\r\n\t\tDebug|x64 = Debug|x64\r\n\t\tDebug|x86 = Debug|x86\r\n\t\tRelease|x64 = Release|x64\r\n\t\tRelease|x86 = Release|x86\r\n\tEndGlobalSection\r\n\tGlobalSection(ProjectConfigurationPlatforms) = postSolution\r\n\t\t{411AB5E4-FD55-4478-83F2-80C51F205FA7}.Debug|x64.ActiveCfg = Debug|x64\r\n\t\t{411AB5E4-FD55-4478-83F2-80C51F205FA7}.Debug|x64.Build.0 = Debug|x64\r\n\t\t{411AB5E4-FD55-4478-83F2-80C51F205FA7}.Debug|x86.ActiveCfg = Debug|Win32\r\n\t\t{411AB5E4-FD55-4478-83F2-80C51F205FA7}.Debug|x86.Build.0 = Debug|Win32\r\n\t\t{411AB5E4-FD55-4478-83F2-80C51F205FA7}.Release|x64.ActiveCfg = Release|x64\r\n\t\t{411AB5E4-FD55-4478-83F2-80C51F205FA7}.Release|x64.Build.0 = Release|x64\r\n\t\t{411AB5E4-FD55-4478-83F2-80C51F205FA7}.Release|x86.ActiveCfg = Release|Win32\r\n\t\t{411AB5E4-FD55-4478-83F2-80C51F205FA7}.Release|x86.Build.0 = Release|Win32\r\n\tEndGlobalSection\r\n\tGlobalSection(SolutionProperties) = preSolution\r\n\t\tHideSolutionNode = FALSE\r\n\tEndGlobalSection\r\nEndGlobal\r\n"
  },
  {
    "path": "svm/svm.vcxproj",
    "content": "<?xml version=\"1.0\" encoding=\"utf-8\"?>\r\n<Project DefaultTargets=\"Build\" xmlns=\"http://schemas.microsoft.com/developer/msbuild/2003\">\r\n  <ItemGroup Label=\"ProjectConfigurations\">\r\n    <ProjectConfiguration Include=\"Debug|Win32\">\r\n      <Configuration>Debug</Configuration>\r\n      <Platform>Win32</Platform>\r\n    </ProjectConfiguration>\r\n    <ProjectConfiguration Include=\"Release|Win32\">\r\n      <Configuration>Release</Configuration>\r\n      <Platform>Win32</Platform>\r\n    </ProjectConfiguration>\r\n    <ProjectConfiguration Include=\"Debug|x64\">\r\n      <Configuration>Debug</Configuration>\r\n      <Platform>x64</Platform>\r\n    </ProjectConfiguration>\r\n    <ProjectConfiguration Include=\"Release|x64\">\r\n      <Configuration>Release</Configuration>\r\n      <Platform>x64</Platform>\r\n    </ProjectConfiguration>\r\n  </ItemGroup>\r\n  <PropertyGroup Label=\"Globals\">\r\n    <VCProjectVersion>17.0</VCProjectVersion>\r\n    <Keyword>Win32Proj</Keyword>\r\n    <ProjectGuid>{411ab5e4-fd55-4478-83f2-80c51f205fa7}</ProjectGuid>\r\n    <RootNamespace>svm</RootNamespace>\r\n    <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>\r\n  </PropertyGroup>\r\n  <Import Project=\"$(VCTargetsPath)\\Microsoft.Cpp.Default.props\" />\r\n  <PropertyGroup Condition=\"'$(Configuration)|$(Platform)'=='Debug|Win32'\" Label=\"Configuration\">\r\n    <ConfigurationType>Application</ConfigurationType>\r\n    <UseDebugLibraries>true</UseDebugLibraries>\r\n    <PlatformToolset>v143</PlatformToolset>\r\n    <CharacterSet>Unicode</CharacterSet>\r\n  </PropertyGroup>\r\n  <PropertyGroup Condition=\"'$(Configuration)|$(Platform)'=='Release|Win32'\" Label=\"Configuration\">\r\n    <ConfigurationType>Application</ConfigurationType>\r\n    <UseDebugLibraries>false</UseDebugLibraries>\r\n    <PlatformToolset>v143</PlatformToolset>\r\n    <WholeProgramOptimization>true</WholeProgramOptimization>\r\n    
<CharacterSet>Unicode</CharacterSet>\r\n  </PropertyGroup>\r\n  <PropertyGroup Condition=\"'$(Configuration)|$(Platform)'=='Debug|x64'\" Label=\"Configuration\">\r\n    <ConfigurationType>Application</ConfigurationType>\r\n    <UseDebugLibraries>true</UseDebugLibraries>\r\n    <PlatformToolset>v143</PlatformToolset>\r\n    <CharacterSet>Unicode</CharacterSet>\r\n  </PropertyGroup>\r\n  <PropertyGroup Condition=\"'$(Configuration)|$(Platform)'=='Release|x64'\" Label=\"Configuration\">\r\n    <ConfigurationType>Application</ConfigurationType>\r\n    <UseDebugLibraries>false</UseDebugLibraries>\r\n    <PlatformToolset>v143</PlatformToolset>\r\n    <WholeProgramOptimization>true</WholeProgramOptimization>\r\n    <CharacterSet>Unicode</CharacterSet>\r\n  </PropertyGroup>\r\n  <Import Project=\"$(VCTargetsPath)\\Microsoft.Cpp.props\" />\r\n  <ImportGroup Label=\"ExtensionSettings\">\r\n  </ImportGroup>\r\n  <ImportGroup Label=\"Shared\">\r\n  </ImportGroup>\r\n  <ImportGroup Label=\"PropertySheets\" Condition=\"'$(Configuration)|$(Platform)'=='Debug|Win32'\">\r\n    <Import Project=\"$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props\" Condition=\"exists('$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props')\" Label=\"LocalAppDataPlatform\" />\r\n  </ImportGroup>\r\n  <ImportGroup Label=\"PropertySheets\" Condition=\"'$(Configuration)|$(Platform)'=='Release|Win32'\">\r\n    <Import Project=\"$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props\" Condition=\"exists('$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props')\" Label=\"LocalAppDataPlatform\" />\r\n  </ImportGroup>\r\n  <ImportGroup Label=\"PropertySheets\" Condition=\"'$(Configuration)|$(Platform)'=='Debug|x64'\">\r\n    <Import Project=\"$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props\" Condition=\"exists('$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props')\" Label=\"LocalAppDataPlatform\" />\r\n  </ImportGroup>\r\n  <ImportGroup Label=\"PropertySheets\" 
Condition=\"'$(Configuration)|$(Platform)'=='Release|x64'\">\r\n    <Import Project=\"$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props\" Condition=\"exists('$(UserRootDir)\\Microsoft.Cpp.$(Platform).user.props')\" Label=\"LocalAppDataPlatform\" />\r\n  </ImportGroup>\r\n  <PropertyGroup Label=\"UserMacros\" />\r\n  <ItemDefinitionGroup Condition=\"'$(Configuration)|$(Platform)'=='Debug|Win32'\">\r\n    <ClCompile>\r\n      <WarningLevel>Level3</WarningLevel>\r\n      <SDLCheck>true</SDLCheck>\r\n      <PreprocessorDefinitions>_CRT_SECURE_NO_WARNINGS;WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>\r\n      <ConformanceMode>true</ConformanceMode>\r\n      <AdditionalIncludeDirectories>$(SolutionDir)\\OpenCL\\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>\r\n    </ClCompile>\r\n    <Link>\r\n      <SubSystem>Console</SubSystem>\r\n      <GenerateDebugInformation>true</GenerateDebugInformation>\r\n    </Link>\r\n  </ItemDefinitionGroup>\r\n  <ItemDefinitionGroup Condition=\"'$(Configuration)|$(Platform)'=='Release|Win32'\">\r\n    <ClCompile>\r\n      <WarningLevel>Level3</WarningLevel>\r\n      <FunctionLevelLinking>true</FunctionLevelLinking>\r\n      <IntrinsicFunctions>true</IntrinsicFunctions>\r\n      <SDLCheck>true</SDLCheck>\r\n      <PreprocessorDefinitions>_CRT_SECURE_NO_WARNINGS;WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>\r\n      <ConformanceMode>true</ConformanceMode>\r\n      <AdditionalIncludeDirectories>$(SolutionDir)\\OpenCL\\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>\r\n    </ClCompile>\r\n    <Link>\r\n      <SubSystem>Console</SubSystem>\r\n      <EnableCOMDATFolding>true</EnableCOMDATFolding>\r\n      <OptimizeReferences>true</OptimizeReferences>\r\n      <GenerateDebugInformation>true</GenerateDebugInformation>\r\n    </Link>\r\n  </ItemDefinitionGroup>\r\n  <ItemDefinitionGroup 
Condition=\"'$(Configuration)|$(Platform)'=='Debug|x64'\">\r\n    <ClCompile>\r\n      <WarningLevel>Level3</WarningLevel>\r\n      <SDLCheck>true</SDLCheck>\r\n      <PreprocessorDefinitions>_CRT_SECURE_NO_WARNINGS;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>\r\n      <ConformanceMode>true</ConformanceMode>\r\n      <AdditionalIncludeDirectories>$(SolutionDir)\\OpenCL\\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>\r\n    </ClCompile>\r\n    <Link>\r\n      <SubSystem>Console</SubSystem>\r\n      <GenerateDebugInformation>true</GenerateDebugInformation>\r\n      <AdditionalLibraryDirectories>$(SolutionDir)\\OpenCL\\lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>\r\n      <AdditionalDependencies>OpenCL.lib;%(AdditionalDependencies)</AdditionalDependencies>\r\n    </Link>\r\n  </ItemDefinitionGroup>\r\n  <ItemDefinitionGroup Condition=\"'$(Configuration)|$(Platform)'=='Release|x64'\">\r\n    <ClCompile>\r\n      <WarningLevel>Level3</WarningLevel>\r\n      <FunctionLevelLinking>true</FunctionLevelLinking>\r\n      <IntrinsicFunctions>true</IntrinsicFunctions>\r\n      <SDLCheck>true</SDLCheck>\r\n      <PreprocessorDefinitions>_CRT_SECURE_NO_WARNINGS;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>\r\n      <ConformanceMode>true</ConformanceMode>\r\n      <AdditionalIncludeDirectories>$(SolutionDir)\\OpenCL\\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>\r\n    </ClCompile>\r\n    <Link>\r\n      <SubSystem>Console</SubSystem>\r\n      <EnableCOMDATFolding>true</EnableCOMDATFolding>\r\n      <OptimizeReferences>true</OptimizeReferences>\r\n      <GenerateDebugInformation>true</GenerateDebugInformation>\r\n      <AdditionalLibraryDirectories>$(SolutionDir)\\OpenCL\\lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>\r\n      <AdditionalDependencies>OpenCL.lib;%(AdditionalDependencies)</AdditionalDependencies>\r\n    </Link>\r\n  
</ItemDefinitionGroup>\r\n  <ItemGroup>\r\n    <ClInclude Include=\"..\\common\\timing.h\" />\r\n    <ClCompile Include=\"svmtest.cpp\" />\r\n  </ItemGroup>\r\n  <ItemGroup>\r\n    <CopyFileToFolders Include=\"atomic_latency_kernel.cl\">\r\n      <FileType>Document</FileType>\r\n    </CopyFileToFolders>\r\n  </ItemGroup>\r\n  <Import Project=\"$(VCTargetsPath)\\Microsoft.Cpp.targets\" />\r\n  <ImportGroup Label=\"ExtensionTargets\">\r\n  </ImportGroup>\r\n</Project>"
  },
  {
    "path": "svm/svm.vcxproj.filters",
    "content": "﻿<?xml version=\"1.0\" encoding=\"utf-8\"?>\r\n<Project ToolsVersion=\"4.0\" xmlns=\"http://schemas.microsoft.com/developer/msbuild/2003\">\r\n  <ItemGroup>\r\n    <Filter Include=\"Source Files\">\r\n      <UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>\r\n      <Extensions>cpp;c;cc;cxx;c++;cppm;ixx;def;odl;idl;hpj;bat;asm;asmx</Extensions>\r\n    </Filter>\r\n    <Filter Include=\"Header Files\">\r\n      <UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>\r\n      <Extensions>h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd</Extensions>\r\n    </Filter>\r\n    <Filter Include=\"Resource Files\">\r\n      <UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier>\r\n      <Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>\r\n    </Filter>\r\n  </ItemGroup>\r\n  <ItemGroup>\r\n    <ClCompile Include=\"svmtest.cpp\">\r\n      <Filter>Source Files</Filter>\r\n    </ClCompile>\r\n  </ItemGroup>\r\n</Project>"
  },
  {
    "path": "svm/svmtest.cpp",
    "content": "#define CL_TARGET_OPENCL_VERSION 300\n\n#include <stdio.h>\n#include <stdlib.h>\n#include <stdint.h>\n#include <string.h>\n#include <math.h>\n#include \"../Common/timing.h\"\n#include \"../Common/timing.c\" // lol avoids trying to link\n#include <CL/cl.h>\n\n#ifdef _MSC_VER\n#include <Windows.h>\n#else\n#include <sched.h>\n#include <pthread.h>\n#define _strnicmp strncmp\n#endif\n\n#define TARGET_TIME_MS 2000\n\ncl_device_id selected_device_id;\ncl_platform_id selected_platform_id;\n\nint checkSVMSupport(cl_device_svm_capabilities desiredCaps);\ncl_context get_context_from_user(int platform_index, int device_index);\ncl_program build_program(cl_context context, const char* fname, const char* params);\n#ifdef _MSC_VER\nDWORD WINAPI LatencyTestThread(LPVOID param);\n#else\nvoid* LatencyTestThread(void* param);\n#endif\n\nfloat runAtomicsTest(cl_context context, cl_command_queue command_queue);\nfloat runBufferSharingTest(cl_context context, cl_command_queue command_queue);\n\ntypedef struct LatencyThreadData {\n    uint64_t start;       // initial value to write into target\n    uint64_t iterations;  // number of iterations to run\n    uint32_t* target;       // value to bounce between threads, init with start - 1\n} LatencyData;\n\n\n#define ALLOC_SIZE 4096\n\n// 4K alignment doesn't work\n#define ALLOC_ALIGN 64\n\nenum TestType {\n    Atomics,\n    BufferSharing,\n    Custom\n};\n\nint main(int argc, char* argv[])\n{\n    cl_int ret;\n    cl_context context = NULL;\n    cl_command_queue command_queue = NULL;\n    TestType testType = Atomics;\n    int platform_index = -1, device_index = -1;\n\n    for (int argIdx = 1; argIdx < argc; argIdx++) {\n        if (*(argv[argIdx]) == '-') {\n            char* arg = argv[argIdx] + 1;\n            if (_strnicmp(arg, \"atomics\", 7) == 0) {\n                argIdx++;\n                testType = Atomics;\n                fprintf(stderr, \"Test type = atomics\\n\");\n            }\n            else if 
(_strnicmp(arg, \"buffersharing\", 13) == 0)\n            {\n                argIdx++;\n                testType = BufferSharing;\n                fprintf(stderr, \"Test type = buffer sharing\\n\");\n            }\n            else if (_strnicmp(arg, \"custom\", 6) == 0)\n            {\n                argIdx++;\n                testType = Custom;\n                fprintf(stderr, \"Test type = custom code\\n\");\n            }\n            else if (_strnicmp(arg, \"platform\", 8) == 0) {\n                argIdx++;\n                platform_index = atoi(argv[argIdx]);\n                fprintf(stderr, \"Using OpenCL platform index %d\\n\", platform_index);\n            }\n            else if (_strnicmp(arg, \"device\", 6) == 0) {\n                argIdx++;\n                device_index = atoi(argv[argIdx]);\n                fprintf(stderr, \"Using OpenCL device index %d\\n\", device_index);\n            }\n        }\n    }\n\n    if (testType != Custom)\n    {\n        context = get_context_from_user(platform_index, device_index);\n        command_queue = clCreateCommandQueueWithProperties(context, selected_device_id, NULL, &ret);\n    }\n\n    if (testType == Atomics)\n    {\n        runAtomicsTest(context, command_queue);\n    }\n    else if (testType == BufferSharing)\n    {\n        runBufferSharingTest(context, command_queue);\n    }\n    else\n    {\n#ifdef _MSC_VER\n        float results[4];\n        for (int i = 0; i < 4; i++)\n        {\n            context = get_context_from_user(1, 0);\n            command_queue = clCreateCommandQueueWithProperties(context, selected_device_id, NULL, &ret);\n            SetProcessAffinityMask(GetCurrentProcess(), 1UL << i);\n            results[i] = runAtomicsTest(context, command_queue);\n            clReleaseCommandQueue(command_queue);\n            clReleaseContext(context);\n        }\n\n        printf(\"CPU,GPU\\n\");\n        for (int i = 0; i < 4; i++)\n        {\n            printf(\"%d,%f\\n\", i, results[i]);\n    
    }\n#endif\n    }\n\nend:\n    if (testType != Custom) {\n        clReleaseCommandQueue(command_queue);\n        clReleaseContext(context);\n    }\n    return 0;\n}\n\n/// <summary>\n/// Measures host/device round trip latency through a shared (SVM) buffer.\n/// Every iteration launches the increment_on_gpu kernel with a single work item,\n/// waits for it, verifies the value found in the buffer and writes the next value\n/// from the host. Devices without fine grained buffer support map/unmap the buffer\n/// around each host access.\n/// </summary>\n/// <param name=\"context\">OpenCL context used to build the kernel and allocate SVM</param>\n/// <param name=\"command_queue\">command queue used for kernel launches and map/unmap</param>\n/// <returns>latency per iteration in ns, or 0 if the test failed</returns>\nfloat runBufferSharingTest(cl_context context, cl_command_queue command_queue)\n{\n    cl_int ret;\n    cl_event evt;\n    size_t gpu_threads = 1;\n    uint64_t time_diff_ms;\n    // Stays 0 on every failure path so the caller never reads an indeterminate value\n    float latency = 0;\n    uint32_t* testptr, current = 2, iterations = 1000;\n    cl_program program = build_program(context, \"atomic_latency_kernel.cl\", NULL);\n    cl_kernel increment_kernel = clCreateKernel(program, \"increment_on_gpu\", &ret);\n    int fineGrainedSupport = checkSVMSupport(CL_DEVICE_SVM_FINE_GRAIN_BUFFER);\n    if (fineGrainedSupport) fprintf(stderr, \"Device has SVM fine grained buffer support\\n\");\n    else fprintf(stderr, \"Device can only use coarse grained buffer sharing\\n\");\n    testptr = (uint32_t*)clSVMAlloc(context, CL_MEM_READ_WRITE | (fineGrainedSupport ? CL_MEM_SVM_FINE_GRAIN_BUFFER : 0), ALLOC_SIZE, ALLOC_ALIGN);\n    if (testptr == NULL)\n    {\n        fprintf(stderr, \"Failed to get memory via clSVMAlloc\\n\");\n        goto bufferend;\n    }\n\n    if (!fineGrainedSupport)\n    {\n        // A coarse grained SVM buffer may only be touched by the host while it is mapped.\n        // The loop below begins each iteration with an unmap, so map once before the\n        // first host write.\n        ret = clEnqueueSVMMap(command_queue, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, testptr, ALLOC_SIZE, 0, NULL, NULL);\n        if (ret != CL_SUCCESS)\n        {\n            fprintf(stderr, \"Failed to map SVM buffer. clEnqueueSVMMap returned %d\\n\", ret);\n            goto bufferend;\n        }\n    }\n\n    // test setup\n    do {\n        clSetKernelArgSVMPointer(increment_kernel, 0, testptr);\n        *testptr = 0;\n        current = 2;\n        start_timing();\n        for (uint32_t i = 0; i < iterations; i++)\n        {\n            if (!fineGrainedSupport)\n            {\n                clEnqueueSVMUnmap(command_queue, testptr, 0, NULL, NULL);\n                clFinish(command_queue);\n            }\n\n            ret = clEnqueueNDRangeKernel(command_queue, increment_kernel, 1, NULL, &gpu_threads, &gpu_threads, 0, NULL, &evt);\n            if (ret != CL_SUCCESS)\n            {\n                fprintf(stderr, \"Failed to submit kernel to command queue. clEnqueueNDRangeKernel returned %d\\n\", ret);\n                latency = 0;\n                goto bufferend;\n            }\n\n            clWaitForEvents(1, &evt);\n            // One event is created per launch; release it so events do not pile up\n            clReleaseEvent(evt);\n            if (!fineGrainedSupport) {\n                clEnqueueSVMMap(command_queue, CL_NON_BLOCKING, CL_MAP_READ | CL_MAP_WRITE, testptr, ALLOC_SIZE, 0, NULL, NULL);\n                clFinish(command_queue);\n            }\n\n            if (*testptr == current - 1)\n            {\n                *testptr = current;\n                current += 2;\n            }\n            else\n            {\n                fprintf(stderr, \"Buffer sharing did not work. Expected %u, test value is still %u\\n\", current - 1, *testptr);\n                // Do not report a measurement from an earlier pass as the result of a failed one\n                latency = 0;\n                goto bufferend;\n            }\n        }\n        time_diff_ms = end_timing();\n        latency = (1e6 * (float)time_diff_ms / (float)(iterations));\n        printf(\"Latency: %f ns, %llu ms elapsed time, %u iterations\\n\", latency, (unsigned long long)time_diff_ms, iterations);\n        iterations = scale_iterations_to_target(iterations, time_diff_ms, TARGET_TIME_MS);\n\n    } while (time_diff_ms < TARGET_TIME_MS / 2);\n\nbufferend:\n    clSVMFree(context, testptr);\n    clReleaseKernel(increment_kernel);\n    clReleaseProgram(program);\n    return latency;\n}\n\n/// <summary>\n/// Starts LatencyTestThread on the CPU and the atomic_exec_latency_test kernel with\n/// one work item on the device, both working on the same SVM allocation, and times\n/// how long they take to finish. Repeats with a scaled iteration count until a pass\n/// runs for at least half of TARGET_TIME_MS.\n/// </summary>\n/// <param name=\"context\">OpenCL context used to build the kernel and allocate SVM</param>\n/// <param name=\"command_queue\">command queue the kernel is submitted to</param>\n/// <returns>elapsed time per iteration in ns, halved; 0 on failure or if SVM atomics are unsupported</returns>\nfloat runAtomicsTest(cl_context context, cl_command_queue command_queue)\n{\n    cl_int ret;\n    LatencyData latencyData;\n    // Stays 0 on every failure path so the caller never reads an indeterminate value\n    float latency = 0;\n    size_t gpu_threads = 1;\n    uint64_t time_diff_ms;\n    uint32_t* testptr, iterations = 1000000;\n    cl_program program;\n    cl_kernel atomic_kernel;\n\n    int svmSupport = checkSVMSupport(CL_DEVICE_SVM_ATOMICS);\n    if (svmSupport) fprintf(stderr, \"Device has SVM support\\n\");\n    else\n    {\n        fprintf(stderr, \"SVM atomics are not supported on selected device. Exiting.\\n\");\n        return 0.0f;\n    }\n\n    program = build_program(context, \"atomic_latency_kernel.cl\", NULL);\n    atomic_kernel = clCreateKernel(program, \"atomic_exec_latency_test\", &ret);\n    testptr = (uint32_t*)clSVMAlloc(context, CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS, ALLOC_SIZE, ALLOC_ALIGN);\n    if (testptr == NULL)\n    {\n        fprintf(stderr, \"Failed to get memory via clSVMAlloc\\n\");\n        goto atomicsend;\n    }\n\n    clFinish(command_queue);\n    clSetKernelArgSVMPointer(atomic_kernel, 0, testptr);\n\n    do {\n        clSetKernelArg(atomic_kernel, 1, sizeof(cl_int), (void*)&iterations);\n        latencyData.iterations = iterations;\n        latencyData.start = 2; // GPU thread start = 1\n        latencyData.target = testptr;\n        *testptr = 0;\n\n#ifdef _MSC_VER\n        HANDLE testThread;\n        DWORD testThreadId;\n        testThread = CreateThread(NULL, 0, LatencyTestThread, &latencyData, CREATE_SUSPENDED, &testThreadId);\n        if (testThread == NULL)\n        {\n            fprintf(stderr, \"Failed to create latency test thread\\n\");\n            latency = 0;\n            goto atomicsend;\n        }\n#else\n        pthread_t testThread;\n        if (pthread_create(&testThread, NULL, LatencyTestThread, (void*)&latencyData) != 0)\n        {\n            fprintf(stderr, \"Failed to create latency test thread\\n\");\n            latency = 0;\n            goto atomicsend;\n        }\n#endif\n\n        start_timing();\n#ifdef _MSC_VER\n        ResumeThread(testThread);\n#else\n#endif\n\n        // Blocking call, must come after ResumeThread\n        ret = clEnqueueNDRangeKernel(command_queue, atomic_kernel, 1, NULL, &gpu_threads, &gpu_threads, 0, NULL, NULL);\n        if (ret != CL_SUCCESS)\n        {\n            // NOTE(review): the CPU thread started above is still running here and keeps\n            // pointers to latencyData and testptr, which go away when this function\n            // returns. It has no way to be told to stop - TODO add one and join it.\n            fprintf(stderr, \"Failed to submit kernel to command queue. clEnqueueNDRangeKernel returned %d\\n\", ret);\n            latency = 0;\n            goto atomicsend;\n        }\n        clFinish(command_queue);\n#ifdef _MSC_VER\n        WaitForSingleObject(testThread, INFINITE);\n        // A new thread is created on every pass, so close the finished one's handle\n        CloseHandle(testThread);\n#else\n        pthread_join(testThread, NULL);\n#endif\n        time_diff_ms = end_timing();\n        latency = (1e6 * (float)time_diff_ms / (float)(iterations)) / 2;\n        printf(\"Latency: %f ns, %llu ms elapsed time, %u iterations\\n\", latency, (unsigned long long)time_diff_ms, iterations);\n        iterations = scale_iterations_to_target(iterations, time_diff_ms, TARGET_TIME_MS);\n    } while (time_diff_ms < TARGET_TIME_MS / 2);\n\natomicsend:\n    clSVMFree(context, testptr);\n    clReleaseKernel(atomic_kernel);\n    clReleaseProgram(program);\n    return latency;\n}\n\n/// <summary>\n/// Runs one thread of the latency test. should be run in pairs\n/// Always writes to target: spins until target holds current - 1, swaps in current,\n/// then advances current by 2, until current exceeds 2 * iterations.\n/// </summary>\n/// <param name=\"param\">Latency test params (pointer to LatencyData)</param>\n/// <returns>next value that would have been written to shared memory (MSVC build only)</returns>\n#ifdef _MSC_VER\nDWORD WINAPI LatencyTestThread(LPVOID param) {\n#else\nvoid* LatencyTestThread(void* param) {\n#endif\n    LatencyData* latencyData = (LatencyData*)param;\n    uint64_t current = latencyData->start;\n    while (current <= 2 * latencyData->iterations) {\n#ifdef _MSC_VER\n        if (_InterlockedCompareExchange(latencyData->target, current, current - 1) == current - 1) {\n#else\n        if (__sync_bool_compare_and_swap(latencyData->target, current - 1, current)) {\n#endif\n            current += 2;\n            // fprintf(stderr, \"CPU current = %d\\n\", current);\n        }\n        // else fprintf(stderr, \"target = %d waiting for %d\\n\", latencyData->target, current - 1);\n    }\n\n#ifdef _MSC_VER\n    // The thread exit code is 32 bits wide; make the narrowing explicit\n    return (DWORD)current;\n#else\n    pthread_exit(NULL);\n#endif\n}\n\n\nint checkSVMSupport(cl_device_svm_capabilities desiredCaps)\n{\n    cl_device_svm_capabilities caps;\n    cl_int ret = clGetDeviceInfo(selected_device_id, 
CL_DEVICE_SVM_CAPABILITIES, sizeof(cl_device_svm_capabilities), &caps, 0);\n    if (ret != CL_SUCCESS)\n    {\n        fprintf(stderr, \"Failed to check for SVM support (%d)\\n\", ret);\n        return 0;\n    }\n\n    if (caps & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER) fprintf(stderr, \"Device supports coarse grained buffer sharing\\n\");\n    if (caps & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) fprintf(stderr, \"Device supports fine grained buffer sharing\\n\");\n    if (caps & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM) fprintf(stderr, \"Device supports sharing virtual memory allocated on host\\n\");\n    if (caps & CL_DEVICE_SVM_ATOMICS) fprintf(stderr, \"Device supports atomic operations on shared memory\\n\");\n\n    return caps & desiredCaps;\n}\n\n#define MAX_SOURCE_SIZE (0x100000)\ncl_program build_program(cl_context context, const char* fname, const char* params)\n{\n    cl_int ret;\n    FILE* fp = NULL;\n    char* source_str;\n    size_t source_size;\n    fp = fopen(fname, \"r\");\n    if (!fp) {\n        fprintf(stderr, \"Failed to load kernel %s.\\n\", fname);\n        exit(1);\n    }\n    source_str = (char*)malloc(MAX_SOURCE_SIZE);\n    source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);\n    fclose(fp);\n\n    cl_program program = clCreateProgramWithSource(context, 1, (const char**)&source_str, (const size_t*)&source_size, &ret);\n    ret = clBuildProgram(program, 1, &selected_device_id, params, NULL, NULL);\n    //fprintf(stderr, \"clBuildProgram %s returned %d\\n\", fname, ret);\n    if (ret == -11)\n    {\n        size_t log_size;\n        fprintf(stderr, \"OpenCL kernel build error\\n\");\n        clGetProgramBuildInfo(program, selected_device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);\n        char* log = (char*)malloc(log_size);\n        clGetProgramBuildInfo(program, selected_device_id, CL_PROGRAM_BUILD_LOG, log_size, log, NULL);\n        fprintf(stderr, \"%s\\n\", log);\n        free(log);\n    }\n\n    free(source_str);\n    return program;\n}\n\n/// 
<summary>\n/// Lists the available OpenCL platforms and devices on stderr, selects one of each\n/// (prompting the user when an index is -1), stores the selection in the globals\n/// selected_platform_id and selected_device_id and creates a context for the device.\n/// </summary>\n/// <param name=\"platform_index\">platform index. if -1, prompt user</param>\n/// <param name=\"device_index\">device index. if -1, prompt user</param>\n/// <returns>opencl context, or NULL if enumeration, selection or context creation failed</returns>\ncl_context get_context_from_user(int platform_index, int device_index) {\n    cl_uint i;\n    int selected_platform_index = 0, selected_device_index = 0;\n\n    // Get platform and device information\n    cl_uint ret_num_devices = 0;\n    cl_uint ret_num_platforms = 0;\n    cl_platform_id* platforms = NULL;\n    cl_device_id* devices = NULL;\n    cl_context context = NULL;\n\n    cl_int ret = clGetPlatformIDs(0, NULL, &ret_num_platforms);\n    if (ret != CL_SUCCESS || ret_num_platforms == 0)\n    {\n        fprintf(stderr, \"clGetPlatformIDs returned %d. %u platforms\\n\", ret, ret_num_platforms);\n        goto get_context_from_user_end;\n    }\n\n    platforms = (cl_platform_id*)malloc(ret_num_platforms * sizeof(cl_platform_id));\n    if (platforms == NULL)\n    {\n        fprintf(stderr, \"Failed to allocate memory for platform ids\\n\");\n        goto get_context_from_user_end;\n    }\n\n    ret = clGetPlatformIDs(ret_num_platforms, platforms, NULL);\n    fprintf(stderr, \"clGetPlatformIDs returned %d. %u platforms\\n\", ret, ret_num_platforms);\n    if (ret != CL_SUCCESS) goto get_context_from_user_end;\n\n    for (i = 0; i < ret_num_platforms; i++)\n    {\n        size_t platform_name_len;\n        char* platform_name = NULL;\n        if (CL_SUCCESS != clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 0, NULL, &platform_name_len)) {\n            fprintf(stderr, \"Failed to get platform info for platform %u\\n\", i);\n            continue;\n        }\n\n        platform_name = (char*)malloc(platform_name_len + 1);\n        if (platform_name == NULL) {\n            fprintf(stderr, \"Failed to allocate memory for name of platform %u\\n\", i);\n            continue;\n        }\n        platform_name[platform_name_len] = 0;\n\n        if (CL_SUCCESS != clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, platform_name_len, platform_name, NULL)) {\n            fprintf(stderr, \"Failed to get platform name for platform %u\\n\", i);\n            free(platform_name);\n            continue;\n        }\n\n        fprintf(stderr, \"Platform %u: %s\\n\", i, platform_name);\n        free(platform_name);\n    }\n\n    selected_platform_index = platform_index;\n    if (selected_platform_index == -1)\n    {\n        printf(\"Enter platform #:\");\n        // Keep the index at -1 on unparsable input so the range check below rejects it\n        if (scanf(\"%d\", &selected_platform_index) != 1) selected_platform_index = -1;\n    }\n\n    // Negative values are rejected explicitly instead of relying on unsigned conversion\n    if (selected_platform_index < 0 || (cl_uint)selected_platform_index >= ret_num_platforms)\n    {\n        fprintf(stderr, \"platform index out of range\\n\");\n        goto get_context_from_user_end;\n    }\n\n    selected_platform_id = platforms[selected_platform_index];\n\n    // Every failure from here on leaves through the cleanup label so platforms is freed\n    if (CL_SUCCESS != clGetDeviceIDs(selected_platform_id, CL_DEVICE_TYPE_ALL, 0, NULL, &ret_num_devices)) {\n        fprintf(stderr, \"Failed to enumerate device ids for platform\\n\");\n        goto get_context_from_user_end;\n    }\n\n    devices = (cl_device_id*)malloc(ret_num_devices * sizeof(cl_device_id));\n    if (devices == NULL)\n    {\n        fprintf(stderr, \"Failed to allocate memory for device ids\\n\");\n        goto get_context_from_user_end;\n    }\n\n    if (CL_SUCCESS != clGetDeviceIDs(selected_platform_id, CL_DEVICE_TYPE_ALL, ret_num_devices, devices, NULL)) {\n        fprintf(stderr, \"Failed to get device ids for platform\\n\");\n        goto get_context_from_user_end;\n    }\n\n    fprintf(stderr, \"clGetDeviceIDs returned %u devices\\n\", ret_num_devices);\n\n    for (i = 0; i < ret_num_devices; i++)\n    {\n        size_t device_name_len;\n        char* device_name = NULL;\n        if (CL_SUCCESS != clGetDeviceInfo(devices[i], CL_DEVICE_NAME, 0, NULL, &device_name_len)) {\n            fprintf(stderr, \"Failed to get name length for device %u\\n\", i);\n            continue;\n        }\n\n        device_name = (char*)malloc(device_name_len + 1);\n        if (device_name == NULL) {\n            fprintf(stderr, \"Failed to allocate memory for name of device %u\\n\", i);\n            continue;\n        }\n        device_name[device_name_len] = 0;\n\n        if (CL_SUCCESS != clGetDeviceInfo(devices[i], CL_DEVICE_NAME, device_name_len, device_name, &device_name_len)) {\n            fprintf(stderr, \"Failed to get name for device %u\\n\", i);\n            free(device_name);\n            continue;\n        }\n\n        fprintf(stderr, \"Device %u: %s\\n\", i, device_name);\n        free(device_name);\n    }\n\n    selected_device_index = device_index;\n    if (selected_device_index == -1)\n    {\n        fprintf(stderr, \"Enter device #:\");\n        // Keep the index at -1 on unparsable input so the range check below rejects it\n        if (scanf(\"%d\", &selected_device_index) != 1) selected_device_index = -1;\n    }\n\n    if (selected_device_index < 0 || (cl_uint)selected_device_index >= ret_num_devices)\n    {\n        fprintf(stderr, \"Device index out of range\\n\");\n        goto get_context_from_user_end;\n    }\n\n    selected_device_id = devices[selected_device_index];\n\n    // Create an OpenCL context\n    context = clCreateContext(NULL, 1, &selected_device_id, NULL, NULL, &ret);\n    fprintf(stderr, \"clCreateContext returned %d\\n\", ret);\n\nget_context_from_user_end:\n    // free(NULL) is a no-op, so this is safe no matter how far enumeration got\n    free(platforms);\n    free(devices);\n    return context;\n}\n"
  }
]