#endif
#ifdef __APPLE__
/*
 * Report whether a boolean sysctl is present and set to 1.
 *
 * name: full sysctl key, e.g. "hw.optional.arm.FEAT_I8MM".
 * Returns 1 when the lookup succeeds and the value equals 1, otherwise 0.
 *
 * The value and size are re-initialised on every call so that a failed
 * lookup (unknown key on this OS version) can never reuse the result of
 * a previous query.
 */
static int sysctl_flag_enabled(const char *name)
{
    uint32_t value = 0;
    size_t size = sizeof(value);

    if (sysctlbyname(name, &value, &size, NULL, 0) != 0)
    {
        return 0;
    }
    return value == 1;
}
#endif

/*
 * Probe the CPU for the arm64 SIMD extensions and print one marker line
 * per supported feature, always in this order:
 *   _I8MM_, _BF16_, _ASIMD_DP_, _ASIMD_HP_, _ASIMD_
 *
 * Non-Apple builds read the ELF auxiliary vector, Apple builds query
 * sysctl. Always returns 0.
 */
int main(void)
{
#ifndef __APPLE__
    /* HWCAP_* bits are reported in AT_HWCAP, HWCAP2_* bits in AT_HWCAP2. */
    const uint64_t hwcaps = getauxval(AT_HWCAP);
#ifdef AT_HWCAP2
    const uint64_t hwcaps2 = getauxval(AT_HWCAP2);
#else
    const uint64_t hwcaps2 = 0;
#endif
    /* Either word may end up unused when the headers lack the macros. */
    (void)hwcaps;
    (void)hwcaps2;

#ifdef HWCAP2_I8MM
    if (hwcaps2 & HWCAP2_I8MM)
    {
        printf("_I8MM_\n");
    }
#endif
#ifdef HWCAP2_BF16
    if (hwcaps2 & HWCAP2_BF16)
    {
        printf("_BF16_\n");
    }
#endif
#ifdef HWCAP_ASIMDDP
    if (hwcaps & HWCAP_ASIMDDP)
    {
        printf("_ASIMD_DP_\n");
    }
#endif
#ifdef HWCAP_ASIMDHP
    if (hwcaps & HWCAP_ASIMDHP)
    {
        printf("_ASIMD_HP_\n");
    }
#endif
#ifdef HWCAP_ASIMD
    if (hwcaps & HWCAP_ASIMD)
    {
        printf("_ASIMD_\n");
    }
#endif
#else
    if (sysctl_flag_enabled("hw.optional.arm.FEAT_I8MM"))
    {
        printf("_I8MM_\n");
    }
    if (sysctl_flag_enabled("hw.optional.arm.FEAT_BF16"))
    {
        printf("_BF16_\n");
    }
    if (sysctl_flag_enabled("hw.optional.arm.FEAT_DotProd"))
    {
        printf("_ASIMD_DP_\n");
    }
    /*
     * The key previously used here, "hw.optional.AdvSIMD_HPFPCv", lacks
     * its trailing 't'; the lookup failed and the DotProd result was
     * reused by accident. Probe the FP16 arithmetic flag first and fall
     * back to the correctly spelled legacy key.
     * NOTE(review): confirm both key names against the target macOS SDK.
     */
    if (sysctl_flag_enabled("hw.optional.arm.FEAT_FP16") ||
        sysctl_flag_enabled("hw.optional.AdvSIMD_HPFPCvt"))
    {
        printf("_ASIMD_HP_\n");
    }
    if (sysctl_flag_enabled("hw.optional.AdvSIMD"))
    {
        printf("_ASIMD_\n");
    }
#endif
    return 0;
}
================================================
FILE: benchmark_result/arm64/AWS_Graviton_3E.md
================================================
# AWS Graviton 3E
Architecture: Neoverse V1
Setting: Virtual 1 Core
For single core:
$ ./cpufp --thread_pool=[0]
Number Threads: 1
Thread Pool Binding: 0
----------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| i8mm | mmla(s32,s8,s8) | 332.34 GOPS |
| i8mm | mmla(u32,u8,u8) | 332.46 GOPS |
| i8mm | mmla(s32,u8,s8) | 332.46 GOPS |
| i8mm | dp4a.vs(s32,s8,u8) | 166.23 GOPS |
| i8mm | dp4a.vs(s32,u8,s8) | 166.17 GOPS |
| i8mm | dp4a.vv(s32,u8,s8) | 166.14 GOPS |
| asimd_dp | dp4a.vs(s32,s8,s8) | 166.18 GOPS |
| asimd_dp | dp4a.vv(s32,s8,s8) | 166.22 GOPS |
| asimd_dp | dp4a.vs(u32,u8,u8) | 166.22 GOPS |
| asimd_dp | dp4a.vv(u32,u8,u8) | 166.22 GOPS |
| bf16 | mmla(f32,bf16,bf16) | 166.18 GFLOPS |
| bf16 | dp2a.vs(f32,bf16,bf16) | 83.085 GFLOPS |
| bf16 | dp2a.vv(f32,bf16,bf16) | 83.111 GFLOPS |
| asimd_hp | fmla.vs(fp16,fp16,fp16) | 83.105 GFLOPS |
| asimd_hp | fmla.vv(fp16,fp16,fp16) | 83.113 GFLOPS |
| asimd | fmla.vs(f32,f32,f32) | 41.549 GFLOPS |
| asimd | fmla.vv(f32,f32,f32) | 41.542 GFLOPS |
| asimd | fmla.vs(f64,f64,f64) | 35.96 GFLOPS |
| asimd | fmla.vv(f64,f64,f64) | 20.779 GFLOPS |
----------------------------------------------------------------
================================================
FILE: benchmark_result/arm64/Apple_Silicon_M2_Max.md
================================================
# Apple M2 Max (MacBook Pro 16)
Setting: 8 Avalanche P-Cores + 4 Blizzard E-Cores
OS: macOS 15.1
For 1 P-core:
> ./cpufp --thread_pool=[0]
Number Threads: 1
Thread Pool Binding: 0
Warning: cpu thread policy is not supported by OS
----------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| i8mm | mmla(s32,s8,s8) | 347.22 GOPS |
| i8mm | mmla(u32,u8,u8) | 353.72 GOPS |
| i8mm | mmla(s32,u8,s8) | 361.84 GOPS |
| i8mm | dp4a.vs(s32,s8,u8) | 426.77 GOPS |
| i8mm | dp4a.vs(s32,u8,s8) | 418.49 GOPS |
| i8mm | dp4a.vv(s32,u8,s8) | 436.31 GOPS |
| asimd_dp | dp4a.vs(s32,s8,s8) | 425.79 GOPS |
| asimd_dp | dp4a.vv(s32,s8,s8) | 420.44 GOPS |
| asimd_dp | dp4a.vs(u32,u8,u8) | 430.16 GOPS |
| asimd_dp | dp4a.vv(u32,u8,u8) | 425.55 GOPS |
| bf16 | mmla(f32,bf16,bf16) | 51.959 GFLOPS |
| bf16 | dp2a.vs(f32,bf16,bf16) | 53.449 GFLOPS |
| bf16 | dp2a.vv(f32,bf16,bf16) | 53.995 GFLOPS |
| asimd_hp | fmla.vs(fp16,fp16,fp16) | 215.06 GFLOPS |
| asimd_hp | fmla.vv(fp16,fp16,fp16) | 210.01 GFLOPS |
| asimd | fmla.vs(f32,f32,f32) | 105.54 GFLOPS |
| asimd | fmla.vv(f32,f32,f32) | 107.27 GFLOPS |
| asimd | fmla.vs(f64,f64,f64) | 54.109 GFLOPS |
| asimd | fmla.vv(f64,f64,f64) | 51.883 GFLOPS |
----------------------------------------------------------------
For 8 P-cores:
> ./cpufp --thread_pool=[0-7]
Number Threads: 8
Thread Pool Binding: 0 1 2 3 4 5 6 7
Warning: cpu thread policy is not supported by OS
Warning: cpu thread policy is not supported by OS
Warning: cpu thread policy is not supported by OS
Warning: cpu thread policy is not supported by OS
Warning: cpu thread policy is not supported by OS
Warning: cpu thread policy is not supported by OS
Warning: cpu thread policy is not supported by OS
Warning: cpu thread policy is not supported by OS
----------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| i8mm | mmla(s32,s8,s8) | 2.5416 TOPS |
| i8mm | mmla(u32,u8,u8) | 2.2677 TOPS |
| i8mm | mmla(s32,u8,s8) | 2.6085 TOPS |
| i8mm | dp4a.vs(s32,s8,u8) | 3.0364 TOPS |
| i8mm | dp4a.vs(s32,u8,s8) | 3.0657 TOPS |
| i8mm | dp4a.vv(s32,u8,s8) | 3.1035 TOPS |
| asimd_dp | dp4a.vs(s32,s8,s8) | 2.9913 TOPS |
| asimd_dp | dp4a.vv(s32,s8,s8) | 3.0582 TOPS |
| asimd_dp | dp4a.vs(u32,u8,u8) | 2.9646 TOPS |
| asimd_dp | dp4a.vv(u32,u8,u8) | 2.3463 TOPS |
| bf16 | mmla(f32,bf16,bf16) | 384.6 GFLOPS |
| bf16 | dp2a.vs(f32,bf16,bf16) | 375.38 GFLOPS |
| bf16 | dp2a.vv(f32,bf16,bf16) | 369.55 GFLOPS |
| asimd_hp | fmla.vs(fp16,fp16,fp16) | 1.5043 TFLOPS |
| asimd_hp | fmla.vv(fp16,fp16,fp16) | 1.5192 TFLOPS |
| asimd | fmla.vs(f32,f32,f32) | 763 GFLOPS |
| asimd | fmla.vv(f32,f32,f32) | 765.33 GFLOPS |
| asimd | fmla.vs(f64,f64,f64) | 377.3 GFLOPS |
| asimd | fmla.vv(f64,f64,f64) | 377.05 GFLOPS |
----------------------------------------------------------------
For 1 E-core:
> taskpolicy -c background ./cpufp --thread_pool=[0]
Number Threads: 1
Thread Pool Binding: 0
Warning: cpu thread policy is not supported by OS
----------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| i8mm | mmla(s32,s8,s8) | 101.41 GOPS |
| i8mm | mmla(u32,u8,u8) | 97.71 GOPS |
| i8mm | mmla(s32,u8,s8) | 100.49 GOPS |
| i8mm | dp4a.vs(s32,s8,u8) | 101.54 GOPS |
| i8mm | dp4a.vs(s32,u8,s8) | 96.847 GOPS |
| i8mm | dp4a.vv(s32,u8,s8) | 98.375 GOPS |
| asimd_dp | dp4a.vs(s32,s8,s8) | 102.21 GOPS |
| asimd_dp | dp4a.vv(s32,s8,s8) | 95.13 GOPS |
| asimd_dp | dp4a.vs(u32,u8,u8) | 98.558 GOPS |
| asimd_dp | dp4a.vv(u32,u8,u8) | 102.73 GOPS |
| bf16 | mmla(f32,bf16,bf16) | 12.526 GFLOPS |
| bf16 | dp2a.vs(f32,bf16,bf16) | 11.987 GFLOPS |
| bf16 | dp2a.vv(f32,bf16,bf16) | 11.877 GFLOPS |
| asimd_hp | fmla.vs(fp16,fp16,fp16) | 50.557 GFLOPS |
| asimd_hp | fmla.vv(fp16,fp16,fp16) | 51.691 GFLOPS |
| asimd | fmla.vs(f32,f32,f32) | 23.584 GFLOPS |
| asimd | fmla.vv(f32,f32,f32) | 23.78 GFLOPS |
| asimd | fmla.vs(f64,f64,f64) | 12.689 GFLOPS |
| asimd | fmla.vv(f64,f64,f64) | 12.744 GFLOPS |
----------------------------------------------------------------
For 4 E-cores (OS is running and therefore using some of them):
> taskpolicy -c background ./cpufp --thread_pool=[0-3]
Number Threads: 4
Thread Pool Binding: 0 1 2 3
Warning: cpu thread policy is not supported by OS
Warning: cpu thread policy is not supported by OS
Warning: cpu thread policy is not supported by OS
Warning: cpu thread policy is not supported by OS
----------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| i8mm | mmla(s32,s8,s8) | 292.61 GOPS |
| i8mm | mmla(u32,u8,u8) | 278.35 GOPS |
| i8mm | mmla(s32,u8,s8) | 288.3 GOPS |
| i8mm | dp4a.vs(s32,s8,u8) | 315.5 GOPS |
| i8mm | dp4a.vs(s32,u8,s8) | 312.98 GOPS |
| i8mm | dp4a.vv(s32,u8,s8) | 245.39 GOPS |
| asimd_dp | dp4a.vs(s32,s8,s8) | 205.68 GOPS |
| asimd_dp | dp4a.vv(s32,s8,s8) | 267.14 GOPS |
| asimd_dp | dp4a.vs(u32,u8,u8) | 320.75 GOPS |
| asimd_dp | dp4a.vv(u32,u8,u8) | 279.87 GOPS |
| bf16 | mmla(f32,bf16,bf16) | 37.858 GFLOPS |
| bf16 | dp2a.vs(f32,bf16,bf16) | 36.48 GFLOPS |
| bf16 | dp2a.vv(f32,bf16,bf16) | 35.658 GFLOPS |
| asimd_hp | fmla.vs(fp16,fp16,fp16) | 145.14 GFLOPS |
| asimd_hp | fmla.vv(fp16,fp16,fp16) | 140.57 GFLOPS |
| asimd | fmla.vs(f32,f32,f32) | 74.868 GFLOPS |
| asimd | fmla.vv(f32,f32,f32) | 78.191 GFLOPS |
| asimd | fmla.vs(f64,f64,f64) | 40.488 GFLOPS |
| asimd | fmla.vv(f64,f64,f64) | 36.496 GFLOPS |
----------------------------------------------------------------
For 8 P-cores and 4 E-cores:
> ./cpufp --thread_pool=[0-11]
Number Threads: 12
Thread Pool Binding: 0 1 2 3 4 5 6 7 8 9 10 11
Warning: cpu thread policy is not supported by OS
Warning: cpu thread policy is not supported by OS
Warning: cpu thread policy is not supported by OS
Warning: cpu thread policy is not supported by OS
Warning: cpu thread policy is not supported by OS
Warning: cpu thread policy is not supported by OS
Warning: cpu thread policy is not supported by OS
Warning: cpu thread policy is not supported by OS
Warning: cpu thread policy is not supported by OS
Warning: cpu thread policy is not supported by OS
Warning: cpu thread policy is not supported by OS
Warning: cpu thread policy is not supported by OS
----------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| i8mm | mmla(s32,s8,s8) | 2.3888 TOPS |
| i8mm | mmla(u32,u8,u8) | 2.4141 TOPS |
| i8mm | mmla(s32,u8,s8) | 2.2572 TOPS |
| i8mm | dp4a.vs(s32,s8,u8) | 2.7256 TOPS |
| i8mm | dp4a.vs(s32,u8,s8) | 2.4714 TOPS |
| i8mm | dp4a.vv(s32,u8,s8) | 2.6389 TOPS |
| asimd_dp | dp4a.vs(s32,s8,s8) | 2.7067 TOPS |
| asimd_dp | dp4a.vv(s32,s8,s8) | 2.626 TOPS |
| asimd_dp | dp4a.vs(u32,u8,u8) | 2.7011 TOPS |
| asimd_dp | dp4a.vv(u32,u8,u8) | 2.6723 TOPS |
| bf16 | mmla(f32,bf16,bf16) | 345.83 GFLOPS |
| bf16 | dp2a.vs(f32,bf16,bf16) | 341.14 GFLOPS |
| bf16 | dp2a.vv(f32,bf16,bf16) | 340.41 GFLOPS |
| asimd_hp | fmla.vs(fp16,fp16,fp16) | 1.3411 TFLOPS |
| asimd_hp | fmla.vv(fp16,fp16,fp16) | 1.2838 TFLOPS |
| asimd | fmla.vs(f32,f32,f32) | 645.88 GFLOPS |
| asimd | fmla.vv(f32,f32,f32) | 668.01 GFLOPS |
| asimd | fmla.vs(f64,f64,f64) | 339.89 GFLOPS |
| asimd | fmla.vv(f64,f64,f64) | 337.88 GFLOPS |
----------------------------------------------------------------
================================================
FILE: benchmark_result/arm64/Apple_Silicon_M4_Max.md
================================================
# Apple M4 Max (MacBook Pro 16)
Setting: 12 P-Cores + 4 E-Cores
OS: macOS 15.1
For 1 P-core:
$ ./cpufp --thread_pool=[0]
Number Threads: 1
Thread Pool Binding: 0
Warning: cpu thread policy is not supported by OS
----------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| i8mm | mmla(s32,s8,s8) | 477.42 GOPS |
| i8mm | mmla(u32,u8,u8) | 477.76 GOPS |
| i8mm | mmla(s32,u8,s8) | 478.18 GOPS |
| i8mm | dp4a.vs(s32,s8,u8) | 472.27 GOPS |
| i8mm | dp4a.vs(s32,u8,s8) | 472.34 GOPS |
| i8mm | dp4a.vv(s32,u8,s8) | 472.57 GOPS |
| asimd_dp | dp4a.vs(s32,s8,s8) | 472.39 GOPS |
| asimd_dp | dp4a.vv(s32,s8,s8) | 472.39 GOPS |
| asimd_dp | dp4a.vs(u32,u8,u8) | 472.66 GOPS |
| asimd_dp | dp4a.vv(u32,u8,u8) | 472.7 GOPS |
| bf16 | mmla(f32,bf16,bf16) | 71.964 GFLOPS |
| bf16 | dp2a.vs(f32,bf16,bf16) | 71.942 GFLOPS |
| bf16 | dp2a.vv(f32,bf16,bf16) | 71.915 GFLOPS |
| asimd_hp | fmla.vs(fp16,fp16,fp16) | 233.67 GFLOPS |
| asimd_hp | fmla.vv(fp16,fp16,fp16) | 236.39 GFLOPS |
| asimd | fmla.vs(f32,f32,f32) | 116.7 GFLOPS |
| asimd | fmla.vv(f32,f32,f32) | 118.4 GFLOPS |
| asimd | fmla.vs(f64,f64,f64) | 58.344 GFLOPS |
| asimd | fmla.vv(f64,f64,f64) | 59.124 GFLOPS |
----------------------------------------------------------------
For 12 P-cores:
$ ./cpufp --thread_pool=[0-11]
Number Threads: 12
Thread Pool Binding: 0 1 2 3 4 5 6 7 8 9 10 11
Warning: cpu thread policy is not supported by OS
Warning: cpu thread policy is not supported by OS
Warning: cpu thread policy is not supported by OS
Warning: cpu thread policy is not supported by OS
Warning: cpu thread policy is not supported by OS
Warning: cpu thread policy is not supported by OS
Warning: cpu thread policy is not supported by OS
Warning: cpu thread policy is not supported by OS
Warning: cpu thread policy is not supported by OS
Warning: cpu thread policy is not supported by OS
Warning: cpu thread policy is not supported by OS
Warning: cpu thread policy is not supported by OS
----------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| i8mm | mmla(s32,s8,s8) | 4.9542 TOPS |
| i8mm | mmla(u32,u8,u8) | 4.9557 TOPS |
| i8mm | mmla(s32,u8,s8) | 4.9335 TOPS |
| i8mm | dp4a.vs(s32,s8,u8) | 4.8965 TOPS |
| i8mm | dp4a.vs(s32,u8,s8) | 4.8873 TOPS |
| i8mm | dp4a.vv(s32,u8,s8) | 4.896 TOPS |
| asimd_dp | dp4a.vs(s32,s8,s8) | 4.891 TOPS |
| asimd_dp | dp4a.vv(s32,s8,s8) | 4.8954 TOPS |
| asimd_dp | dp4a.vs(u32,u8,u8) | 4.8983 TOPS |
| asimd_dp | dp4a.vv(u32,u8,u8) | 4.8943 TOPS |
| bf16 | mmla(f32,bf16,bf16) | 745.35 GFLOPS |
| bf16 | dp2a.vs(f32,bf16,bf16) | 745.37 GFLOPS |
| bf16 | dp2a.vv(f32,bf16,bf16) | 745.28 GFLOPS |
| asimd_hp | fmla.vs(fp16,fp16,fp16) | 2.4183 TFLOPS |
| asimd_hp | fmla.vv(fp16,fp16,fp16) | 2.4491 TFLOPS |
| asimd | fmla.vs(f32,f32,f32) | 1.208 TFLOPS |
| asimd | fmla.vv(f32,f32,f32) | 1.2245 TFLOPS |
| asimd | fmla.vs(f64,f64,f64) | 604.22 GFLOPS |
| asimd | fmla.vv(f64,f64,f64) | 612.65 GFLOPS |
----------------------------------------------------------------
For 1 E-core:
$ taskpolicy -c background ./cpufp --thread_pool=[0]
Number Threads: 1
Thread Pool Binding: 0
Warning: cpu thread policy is not supported by OS
----------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| i8mm | mmla(s32,s8,s8) | 66.327 GOPS |
| i8mm | mmla(u32,u8,u8) | 68.298 GOPS |
| i8mm | mmla(s32,u8,s8) | 75.25 GOPS |
| i8mm | dp4a.vs(s32,s8,u8) | 65.959 GOPS |
| i8mm | dp4a.vs(s32,u8,s8) | 66.819 GOPS |
| i8mm | dp4a.vv(s32,u8,s8) | 69.26 GOPS |
| asimd_dp | dp4a.vs(s32,s8,s8) | 67.005 GOPS |
| asimd_dp | dp4a.vv(s32,s8,s8) | 66.623 GOPS |
| asimd_dp | dp4a.vs(u32,u8,u8) | 64.867 GOPS |
| asimd_dp | dp4a.vv(u32,u8,u8) | 65.323 GOPS |
| bf16 | mmla(f32,bf16,bf16) | 11.234 GFLOPS |
| bf16 | dp2a.vs(f32,bf16,bf16) | 11.222 GFLOPS |
| bf16 | dp2a.vv(f32,bf16,bf16) | 11.242 GFLOPS |
| asimd_hp | fmla.vs(fp16,fp16,fp16) | 32.67 GFLOPS |
| asimd_hp | fmla.vv(fp16,fp16,fp16) | 33.329 GFLOPS |
| asimd | fmla.vs(f32,f32,f32) | 16.367 GFLOPS |
| asimd | fmla.vv(f32,f32,f32) | 16.262 GFLOPS |
| asimd | fmla.vs(f64,f64,f64) | 8.1371 GFLOPS |
| asimd | fmla.vv(f64,f64,f64) | 8.5853 GFLOPS |
----------------------------------------------------------------
For 4 E-cores (OS is running and therefore using some of them):
$ taskpolicy -c background ./cpufp --thread_pool=[0-3]
Number Threads: 4
Thread Pool Binding: 0 1 2 3
Warning: cpu thread policy is not supported by OS
Warning: cpu thread policy is not supported by OS
Warning: cpu thread policy is not supported by OS
Warning: cpu thread policy is not supported by OS
----------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| i8mm | mmla(s32,s8,s8) | 245.5 GOPS |
| i8mm | mmla(u32,u8,u8) | 254.44 GOPS |
| i8mm | mmla(s32,u8,s8) | 254.65 GOPS |
| i8mm | dp4a.vs(s32,s8,u8) | 250.63 GOPS |
| i8mm | dp4a.vs(s32,u8,s8) | 254.65 GOPS |
| i8mm | dp4a.vv(s32,u8,s8) | 254.88 GOPS |
| asimd_dp | dp4a.vs(s32,s8,s8) | 247.45 GOPS |
| asimd_dp | dp4a.vv(s32,s8,s8) | 255.69 GOPS |
| asimd_dp | dp4a.vs(u32,u8,u8) | 254.06 GOPS |
| asimd_dp | dp4a.vv(u32,u8,u8) | 253.43 GOPS |
| bf16 | mmla(f32,bf16,bf16) | 42.842 GFLOPS |
| bf16 | dp2a.vs(f32,bf16,bf16) | 43.632 GFLOPS |
| bf16 | dp2a.vv(f32,bf16,bf16) | 43.273 GFLOPS |
| asimd_hp | fmla.vs(fp16,fp16,fp16) | 126.73 GFLOPS |
| asimd_hp | fmla.vv(fp16,fp16,fp16) | 132.21 GFLOPS |
| asimd | fmla.vs(f32,f32,f32) | 65.895 GFLOPS |
| asimd | fmla.vv(f32,f32,f32) | 63.022 GFLOPS |
| asimd | fmla.vs(f64,f64,f64) | 31.509 GFLOPS |
| asimd | fmla.vv(f64,f64,f64) | 31.543 GFLOPS |
----------------------------------------------------------------
For 12 P-cores + 4 E-cores:
$ ./cpufp --thread_pool=[0-15]
Number Threads: 16
Thread Pool Binding: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
Warning: cpu thread policy is not supported by OS
Warning: cpu thread policy is not supported by OS
Warning: cpu thread policy is not supported by OS
Warning: cpu thread policy is not supported by OS
Warning: cpu thread policy is not supported by OS
Warning: cpu thread policy is not supported by OS
Warning: cpu thread policy is not supported by OS
Warning: cpu thread policy is not supported by OS
Warning: cpu thread policy is not supported by OS
Warning: cpu thread policy is not supported by OS
Warning: cpu thread policy is not supported by OS
Warning: cpu thread policy is not supported by OS
Warning: cpu thread policy is not supported by OS
Warning: cpu thread policy is not supported by OS
Warning: cpu thread policy is not supported by OS
Warning: cpu thread policy is not supported by OS
----------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| i8mm | mmla(s32,s8,s8) | 5.4673 TOPS |
| i8mm | mmla(u32,u8,u8) | 5.5309 TOPS |
| i8mm | mmla(s32,u8,s8) | 5.5254 TOPS |
| i8mm | dp4a.vs(s32,s8,u8) | 5.4348 TOPS |
| i8mm | dp4a.vs(s32,u8,s8) | 5.4187 TOPS |
| i8mm | dp4a.vv(s32,u8,s8) | 5.4255 TOPS |
| asimd_dp | dp4a.vs(s32,s8,s8) | 5.4434 TOPS |
| asimd_dp | dp4a.vv(s32,s8,s8) | 5.4171 TOPS |
| asimd_dp | dp4a.vs(u32,u8,u8) | 5.4069 TOPS |
| asimd_dp | dp4a.vv(u32,u8,u8) | 5.3969 TOPS |
| bf16 | mmla(f32,bf16,bf16) | 844.34 GFLOPS |
| bf16 | dp2a.vs(f32,bf16,bf16) | 843.35 GFLOPS |
| bf16 | dp2a.vv(f32,bf16,bf16) | 841.86 GFLOPS |
| asimd_hp | fmla.vs(fp16,fp16,fp16) | 2.6914 TFLOPS |
| asimd_hp | fmla.vv(fp16,fp16,fp16) | 2.735 TFLOPS |
| asimd | fmla.vs(f32,f32,f32) | 1.3444 TFLOPS |
| asimd | fmla.vv(f32,f32,f32) | 1.3631 TFLOPS |
| asimd | fmla.vs(f64,f64,f64) | 673.16 GFLOPS |
| asimd | fmla.vv(f64,f64,f64) | 678.52 GFLOPS |
----------------------------------------------------------------
================================================
FILE: benchmark_result/arm64/Broadcom_BCM2711.md
================================================
# Broadcom BCM2711 (Raspberry Pi 4)
Setting: 4 Cortex-A72 Cores
For single core:
$ ./cpufp --thread_pool=[0]
Number Threads: 1
Thread Pool Binding: 0
-------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| asimd | fmla.vs(f32,f32,f32) | 11.958 GFLOPS |
| asimd | fmla.vv(f32,f32,f32) | 11.958 GFLOPS |
| asimd | fmla.vs(f64,f64,f64) | 5.9792 GFLOPS |
| asimd | fmla.vv(f64,f64,f64) | 5.9792 GFLOPS |
-------------------------------------------------------------
For 4 cores:
$ ./cpufp --thread_pool=[0-3]
Number Threads: 4
Thread Pool Binding: 0 1 2 3
-------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| asimd | fmla.vs(f32,f32,f32) | 47.883 GFLOPS |
| asimd | fmla.vv(f32,f32,f32) | 47.88 GFLOPS |
| asimd | fmla.vs(f64,f64,f64) | 23.933 GFLOPS |
| asimd | fmla.vv(f64,f64,f64) | 23.943 GFLOPS |
-------------------------------------------------------------
================================================
FILE: benchmark_result/arm64/Broadcom_BCM2712.md
================================================
# Broadcom BCM2712 (Raspberry Pi 5)
Setting: 4 Cortex-A76 Cores
For single core:
$ ./cpufp --thread_pool=[0]
Number Threads: 1
Thread Pool Binding: 0
----------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| asimd_dp | dp4a.vs(s32,s8,s8) | 153.48 GOPS |
| asimd_dp | dp4a.vv(s32,s8,s8) | 153.48 GOPS |
| asimd_dp | dp4a.vs(u32,u8,u8) | 153.47 GOPS |
| asimd_dp | dp4a.vv(u32,u8,u8) | 153.48 GOPS |
| asimd_hp | fmla.vs(fp16,fp16,fp16) | 76.738 GFLOPS |
| asimd_hp | fmla.vv(fp16,fp16,fp16) | 76.738 GFLOPS |
| asimd | fmla.vs(f32,f32,f32) | 38.369 GFLOPS |
| asimd | fmla.vv(f32,f32,f32) | 38.369 GFLOPS |
| asimd | fmla.vs(f64,f64,f64) | 19.185 GFLOPS |
| asimd | fmla.vv(f64,f64,f64) | 19.185 GFLOPS |
----------------------------------------------------------------
For 4 cores:
$ ./cpufp --thread_pool=[0-3]
Number Threads: 4
Thread Pool Binding: 0 1 2 3
----------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| asimd_dp | dp4a.vs(s32,s8,s8) | 613.79 GOPS |
| asimd_dp | dp4a.vv(s32,s8,s8) | 614.02 GOPS |
| asimd_dp | dp4a.vs(u32,u8,u8) | 613.98 GOPS |
| asimd_dp | dp4a.vv(u32,u8,u8) | 613.99 GOPS |
| asimd_hp | fmla.vs(fp16,fp16,fp16) | 306.88 GFLOPS |
| asimd_hp | fmla.vv(fp16,fp16,fp16) | 306.98 GFLOPS |
| asimd | fmla.vs(f32,f32,f32) | 153.48 GFLOPS |
| asimd | fmla.vv(f32,f32,f32) | 153.5 GFLOPS |
| asimd | fmla.vs(f64,f64,f64) | 74.513 GFLOPS |
| asimd | fmla.vv(f64,f64,f64) | 76.751 GFLOPS |
----------------------------------------------------------------
================================================
FILE: benchmark_result/arm64/CIX_P1_CD8180.md
================================================
# CIX P1 CD8180 (Radxa Orion O6)
Settings:
Cortex-A720 @ 2.5GHz: 0,11
Cortex-A720 @ 2.4GHz: 9,10
Cortex-A720 @ 2.3GHz: 5,6
Cortex-A720 @ 2.2GHz: 7,8
Cortex-A520 @ 1.8GHz: 1-4
Power policy: Balance
For single P-Core @ 2.5GHz:
$ ./cpufp --thread_pool=[0]
Number Threads: 1
Thread Pool Binding: 0
----------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| i8mm | mmla(s32,s8,s8) | 319.69 GOPS |
| i8mm | mmla(u32,u8,u8) | 319.71 GOPS |
| i8mm | mmla(s32,u8,s8) | 319.71 GOPS |
| i8mm | dp4a.vs(s32,s8,u8) | 159.86 GOPS |
| i8mm | dp4a.vs(s32,u8,s8) | 159.88 GOPS |
| i8mm | dp4a.vv(s32,u8,s8) | 159.85 GOPS |
| asimd_dp | dp4a.vs(s32,s8,s8) | 159.87 GOPS |
| asimd_dp | dp4a.vv(s32,s8,s8) | 159.89 GOPS |
| asimd_dp | dp4a.vs(u32,u8,u8) | 159.87 GOPS |
| asimd_dp | dp4a.vv(u32,u8,u8) | 159.89 GOPS |
| bf16 | mmla(f32,bf16,bf16) | 159.9 GFLOPS |
| bf16 | dp2a.vs(f32,bf16,bf16) | 79.947 GFLOPS |
| bf16 | dp2a.vv(f32,bf16,bf16) | 79.949 GFLOPS |
| asimd_hp | fmla.vs(fp16,fp16,fp16) | 79.948 GFLOPS |
| asimd_hp | fmla.vv(fp16,fp16,fp16) | 79.944 GFLOPS |
| asimd | fmla.vs(f32,f32,f32) | 39.971 GFLOPS |
| asimd | fmla.vv(f32,f32,f32) | 39.972 GFLOPS |
| asimd | fmla.vs(f64,f64,f64) | 19.985 GFLOPS |
| asimd | fmla.vv(f64,f64,f64) | 19.984 GFLOPS |
----------------------------------------------------------------
For 2 P-Cores @ 2.5GHz:
$ ./cpufp --thread_pool=[0,11]
Number Threads: 2
Thread Pool Binding: 0 11
----------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| i8mm | mmla(s32,s8,s8) | 638.37 GOPS |
| i8mm | mmla(u32,u8,u8) | 639.22 GOPS |
| i8mm | mmla(s32,u8,s8) | 639.3 GOPS |
| i8mm | dp4a.vs(s32,s8,u8) | 319.61 GOPS |
| i8mm | dp4a.vs(s32,u8,s8) | 319.58 GOPS |
| i8mm | dp4a.vv(s32,u8,s8) | 319.69 GOPS |
| asimd_dp | dp4a.vs(s32,s8,s8) | 319.67 GOPS |
| asimd_dp | dp4a.vv(s32,s8,s8) | 319.61 GOPS |
| asimd_dp | dp4a.vs(u32,u8,u8) | 319.6 GOPS |
| asimd_dp | dp4a.vv(u32,u8,u8) | 319.65 GOPS |
| bf16 | mmla(f32,bf16,bf16) | 319.64 GFLOPS |
| bf16 | dp2a.vs(f32,bf16,bf16) | 159.87 GFLOPS |
| bf16 | dp2a.vv(f32,bf16,bf16) | 159.86 GFLOPS |
| asimd_hp | fmla.vs(fp16,fp16,fp16) | 159.85 GFLOPS |
| asimd_hp | fmla.vv(fp16,fp16,fp16) | 159.85 GFLOPS |
| asimd | fmla.vs(f32,f32,f32) | 79.899 GFLOPS |
| asimd | fmla.vv(f32,f32,f32) | 79.935 GFLOPS |
| asimd | fmla.vs(f64,f64,f64) | 39.956 GFLOPS |
| asimd | fmla.vv(f64,f64,f64) | 39.963 GFLOPS |
----------------------------------------------------------------
For single P-Core @ 2.4GHz:
$ ./cpufp --thread_pool=[9]
Number Threads: 1
Thread Pool Binding: 9
----------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| i8mm | mmla(s32,s8,s8) | 306.95 GOPS |
| i8mm | mmla(u32,u8,u8) | 306.94 GOPS |
| i8mm | mmla(s32,u8,s8) | 306.98 GOPS |
| i8mm | dp4a.vs(s32,s8,u8) | 153.47 GOPS |
| i8mm | dp4a.vs(s32,u8,s8) | 153.46 GOPS |
| i8mm | dp4a.vv(s32,u8,s8) | 153.48 GOPS |
| asimd_dp | dp4a.vs(s32,s8,s8) | 153.46 GOPS |
| asimd_dp | dp4a.vv(s32,s8,s8) | 153.46 GOPS |
| asimd_dp | dp4a.vs(u32,u8,u8) | 153.48 GOPS |
| asimd_dp | dp4a.vv(u32,u8,u8) | 153.47 GOPS |
| bf16 | mmla(f32,bf16,bf16) | 153.49 GFLOPS |
| bf16 | dp2a.vs(f32,bf16,bf16) | 76.745 GFLOPS |
| bf16 | dp2a.vv(f32,bf16,bf16) | 76.732 GFLOPS |
| asimd_hp | fmla.vs(fp16,fp16,fp16) | 76.734 GFLOPS |
| asimd_hp | fmla.vv(fp16,fp16,fp16) | 76.75 GFLOPS |
| asimd | fmla.vs(f32,f32,f32) | 38.369 GFLOPS |
| asimd | fmla.vv(f32,f32,f32) | 38.367 GFLOPS |
| asimd | fmla.vs(f64,f64,f64) | 19.186 GFLOPS |
| asimd | fmla.vv(f64,f64,f64) | 19.185 GFLOPS |
----------------------------------------------------------------
For 2 P-Cores @ 2.4GHz:
$ ./cpufp --thread_pool=[9,10]
Number Threads: 2
Thread Pool Binding: 9 10
----------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| i8mm | mmla(s32,s8,s8) | 613.78 GOPS |
| i8mm | mmla(u32,u8,u8) | 613.84 GOPS |
| i8mm | mmla(s32,u8,s8) | 613.84 GOPS |
| i8mm | dp4a.vs(s32,s8,u8) | 306.92 GOPS |
| i8mm | dp4a.vs(s32,u8,s8) | 306.9 GOPS |
| i8mm | dp4a.vv(s32,u8,s8) | 306.95 GOPS |
| asimd_dp | dp4a.vs(s32,s8,s8) | 306.92 GOPS |
| asimd_dp | dp4a.vv(s32,s8,s8) | 306.89 GOPS |
| asimd_dp | dp4a.vs(u32,u8,u8) | 306.94 GOPS |
| asimd_dp | dp4a.vv(u32,u8,u8) | 306.93 GOPS |
| bf16 | mmla(f32,bf16,bf16) | 306.9 GFLOPS |
| bf16 | dp2a.vs(f32,bf16,bf16) | 153.47 GFLOPS |
| bf16 | dp2a.vv(f32,bf16,bf16) | 153.46 GFLOPS |
| asimd_hp | fmla.vs(fp16,fp16,fp16) | 153.45 GFLOPS |
| asimd_hp | fmla.vv(fp16,fp16,fp16) | 153.46 GFLOPS |
| asimd | fmla.vs(f32,f32,f32) | 76.725 GFLOPS |
| asimd | fmla.vv(f32,f32,f32) | 76.726 GFLOPS |
| asimd | fmla.vs(f64,f64,f64) | 38.368 GFLOPS |
| asimd | fmla.vv(f64,f64,f64) | 38.364 GFLOPS |
----------------------------------------------------------------
For single P-Core @ 2.3GHz:
$ ./cpufp --thread_pool=[5]
Number Threads: 1
Thread Pool Binding: 5
----------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| i8mm | mmla(s32,s8,s8) | 294.17 GOPS |
| i8mm | mmla(u32,u8,u8) | 294.15 GOPS |
| i8mm | mmla(s32,u8,s8) | 294.14 GOPS |
| i8mm | dp4a.vs(s32,s8,u8) | 147.07 GOPS |
| i8mm | dp4a.vs(s32,u8,s8) | 147.08 GOPS |
| i8mm | dp4a.vv(s32,u8,s8) | 147.07 GOPS |
| asimd_dp | dp4a.vs(s32,s8,s8) | 147.07 GOPS |
| asimd_dp | dp4a.vv(s32,s8,s8) | 147.07 GOPS |
| asimd_dp | dp4a.vs(u32,u8,u8) | 147.07 GOPS |
| asimd_dp | dp4a.vv(u32,u8,u8) | 147.08 GOPS |
| bf16 | mmla(f32,bf16,bf16) | 147.07 GFLOPS |
| bf16 | dp2a.vs(f32,bf16,bf16) | 73.532 GFLOPS |
| bf16 | dp2a.vv(f32,bf16,bf16) | 73.539 GFLOPS |
| asimd_hp | fmla.vs(fp16,fp16,fp16) | 73.541 GFLOPS |
| asimd_hp | fmla.vv(fp16,fp16,fp16) | 73.537 GFLOPS |
| asimd | fmla.vs(f32,f32,f32) | 36.768 GFLOPS |
| asimd | fmla.vv(f32,f32,f32) | 36.772 GFLOPS |
| asimd | fmla.vs(f64,f64,f64) | 18.383 GFLOPS |
| asimd | fmla.vv(f64,f64,f64) | 18.384 GFLOPS |
----------------------------------------------------------------
For 2 P-Cores @ 2.3GHz:
$ ./cpufp --thread_pool=[5,6]
Number Threads: 2
Thread Pool Binding: 5 6
----------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| i8mm | mmla(s32,s8,s8) | 586.66 GOPS |
| i8mm | mmla(u32,u8,u8) | 587.38 GOPS |
| i8mm | mmla(s32,u8,s8) | 587.83 GOPS |
| i8mm | dp4a.vs(s32,s8,u8) | 293.61 GOPS |
| i8mm | dp4a.vs(s32,u8,s8) | 293.87 GOPS |
| i8mm | dp4a.vv(s32,u8,s8) | 293.46 GOPS |
| asimd_dp | dp4a.vs(s32,s8,s8) | 293.87 GOPS |
| asimd_dp | dp4a.vv(s32,s8,s8) | 293.94 GOPS |
| asimd_dp | dp4a.vs(u32,u8,u8) | 293.91 GOPS |
| asimd_dp | dp4a.vv(u32,u8,u8) | 293.86 GOPS |
| bf16 | mmla(f32,bf16,bf16) | 293.81 GFLOPS |
| bf16 | dp2a.vs(f32,bf16,bf16) | 146.88 GFLOPS |
| bf16 | dp2a.vv(f32,bf16,bf16) | 146.91 GFLOPS |
| asimd_hp | fmla.vs(fp16,fp16,fp16) | 146.94 GFLOPS |
| asimd_hp | fmla.vv(fp16,fp16,fp16) | 146.84 GFLOPS |
| asimd | fmla.vs(f32,f32,f32) | 73.442 GFLOPS |
| asimd | fmla.vv(f32,f32,f32) | 73.456 GFLOPS |
| asimd | fmla.vs(f64,f64,f64) | 36.735 GFLOPS |
| asimd | fmla.vv(f64,f64,f64) | 36.728 GFLOPS |
----------------------------------------------------------------
For single P-Core @ 2.2GHz:
$ ./cpufp --thread_pool=[7]
Number Threads: 1
Thread Pool Binding: 7
----------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| i8mm | mmla(s32,s8,s8) | 281.34 GOPS |
| i8mm | mmla(u32,u8,u8) | 281.37 GOPS |
| i8mm | mmla(s32,u8,s8) | 281.35 GOPS |
| i8mm | dp4a.vs(s32,s8,u8) | 140.67 GOPS |
| i8mm | dp4a.vs(s32,u8,s8) | 140.68 GOPS |
| i8mm | dp4a.vv(s32,u8,s8) | 140.68 GOPS |
| asimd_dp | dp4a.vs(s32,s8,s8) | 140.68 GOPS |
| asimd_dp | dp4a.vv(s32,s8,s8) | 140.7 GOPS |
| asimd_dp | dp4a.vs(u32,u8,u8) | 140.69 GOPS |
| asimd_dp | dp4a.vv(u32,u8,u8) | 140.69 GOPS |
| bf16 | mmla(f32,bf16,bf16) | 140.67 GFLOPS |
| bf16 | dp2a.vs(f32,bf16,bf16) | 70.338 GFLOPS |
| bf16 | dp2a.vv(f32,bf16,bf16) | 70.335 GFLOPS |
| asimd_hp | fmla.vs(fp16,fp16,fp16) | 70.346 GFLOPS |
| asimd_hp | fmla.vv(fp16,fp16,fp16) | 70.345 GFLOPS |
| asimd | fmla.vs(f32,f32,f32) | 35.169 GFLOPS |
| asimd | fmla.vv(f32,f32,f32) | 35.172 GFLOPS |
| asimd | fmla.vs(f64,f64,f64) | 17.587 GFLOPS |
| asimd | fmla.vv(f64,f64,f64) | 17.585 GFLOPS |
----------------------------------------------------------------
For 2 P-Cores @ 2.2GHz:
$ ./cpufp --thread_pool=[7,8]
Number Threads: 2
Thread Pool Binding: 7 8
----------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| i8mm | mmla(s32,s8,s8) | 562.68 GOPS |
| i8mm | mmla(u32,u8,u8) | 562.69 GOPS |
| i8mm | mmla(s32,u8,s8) | 562.75 GOPS |
| i8mm | dp4a.vs(s32,s8,u8) | 281.34 GOPS |
| i8mm | dp4a.vs(s32,u8,s8) | 281.32 GOPS |
| i8mm | dp4a.vv(s32,u8,s8) | 281.32 GOPS |
| asimd_dp | dp4a.vs(s32,s8,s8) | 281.32 GOPS |
| asimd_dp | dp4a.vv(s32,s8,s8) | 281.36 GOPS |
| asimd_dp | dp4a.vs(u32,u8,u8) | 281.38 GOPS |
| asimd_dp | dp4a.vv(u32,u8,u8) | 281.36 GOPS |
| bf16 | mmla(f32,bf16,bf16) | 281.34 GFLOPS |
| bf16 | dp2a.vs(f32,bf16,bf16) | 140.68 GFLOPS |
| bf16 | dp2a.vv(f32,bf16,bf16) | 140.67 GFLOPS |
| asimd_hp | fmla.vs(fp16,fp16,fp16) | 140.67 GFLOPS |
| asimd_hp | fmla.vv(fp16,fp16,fp16) | 140.69 GFLOPS |
| asimd | fmla.vs(f32,f32,f32) | 70.344 GFLOPS |
| asimd | fmla.vv(f32,f32,f32) | 70.342 GFLOPS |
| asimd | fmla.vs(f64,f64,f64) | 35.171 GFLOPS |
| asimd | fmla.vv(f64,f64,f64) | 35.17 GFLOPS |
----------------------------------------------------------------
For single E-Core @ 1.8GHz:
$ ./cpufp --thread_pool=[1]
Number Threads: 1
Thread Pool Binding: 1
----------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| i8mm | mmla(s32,s8,s8) | 114.83 GOPS |
| i8mm | mmla(u32,u8,u8) | 114.82 GOPS |
| i8mm | mmla(s32,u8,s8) | 114.81 GOPS |
| i8mm | dp4a.vs(s32,s8,u8) | 57.415 GOPS |
| i8mm | dp4a.vs(s32,u8,s8) | 57.414 GOPS |
| i8mm | dp4a.vv(s32,u8,s8) | 57.417 GOPS |
| asimd_dp | dp4a.vs(s32,s8,s8) | 57.411 GOPS |
| asimd_dp | dp4a.vv(s32,s8,s8) | 57.417 GOPS |
| asimd_dp | dp4a.vs(u32,u8,u8) | 57.418 GOPS |
| asimd_dp | dp4a.vv(u32,u8,u8) | 57.415 GOPS |
| bf16 | mmla(f32,bf16,bf16) | 22.967 GFLOPS |
| bf16 | dp2a.vs(f32,bf16,bf16) | 28.706 GFLOPS |
| bf16 | dp2a.vv(f32,bf16,bf16) | 28.708 GFLOPS |
| asimd_hp | fmla.vs(fp16,fp16,fp16) | 28.703 GFLOPS |
| asimd_hp | fmla.vv(fp16,fp16,fp16) | 28.708 GFLOPS |
| asimd | fmla.vs(f32,f32,f32) | 14.354 GFLOPS |
| asimd | fmla.vv(f32,f32,f32) | 14.353 GFLOPS |
| asimd | fmla.vs(f64,f64,f64) | 7.1768 GFLOPS |
| asimd | fmla.vv(f64,f64,f64) | 7.1766 GFLOPS |
----------------------------------------------------------------
For 4 E-Cores @ 1.8GHz:
$ ./cpufp --thread_pool=[1-4]
Number Threads: 4
Thread Pool Binding: 1 2 3 4
----------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| i8mm | mmla(s32,s8,s8) | 402.75 GOPS |
| i8mm | mmla(u32,u8,u8) | 402.79 GOPS |
| i8mm | mmla(s32,u8,s8) | 402.79 GOPS |
| i8mm | dp4a.vs(s32,s8,u8) | 201.37 GOPS |
| i8mm | dp4a.vs(s32,u8,s8) | 201.35 GOPS |
| i8mm | dp4a.vv(s32,u8,s8) | 201.35 GOPS |
| asimd_dp | dp4a.vs(s32,s8,s8) | 201.37 GOPS |
| asimd_dp | dp4a.vv(s32,s8,s8) | 201.29 GOPS |
| asimd_dp | dp4a.vs(u32,u8,u8) | 201.35 GOPS |
| asimd_dp | dp4a.vv(u32,u8,u8) | 201.36 GOPS |
| bf16 | mmla(f32,bf16,bf16) | 80.555 GFLOPS |
| bf16 | dp2a.vs(f32,bf16,bf16) | 100.68 GFLOPS |
| bf16 | dp2a.vv(f32,bf16,bf16) | 100.66 GFLOPS |
| asimd_hp | fmla.vs(fp16,fp16,fp16) | 100.68 GFLOPS |
| asimd_hp | fmla.vv(fp16,fp16,fp16) | 100.7 GFLOPS |
| asimd | fmla.vs(f32,f32,f32) | 50.355 GFLOPS |
| asimd | fmla.vv(f32,f32,f32) | 50.348 GFLOPS |
| asimd | fmla.vs(f64,f64,f64) | 25.172 GFLOPS |
| asimd | fmla.vv(f64,f64,f64) | 25.172 GFLOPS |
----------------------------------------------------------------
================================================
FILE: benchmark_result/arm64/HUAWEI_Kunpeng_920_7260.md
================================================
# HUAWEI Kunpeng 920 7260
Architecture: Taishan V110
Setting: 2 * 64 cores
For single core:
$ ./cpufp --thread_pool=[1]
Number Threads: 1
Thread Pool Binding: 1
----------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| asimd_dp | dp4a.vs(s32,s8,s8) | 166.3 GOPS |
| asimd_dp | dp4a.vv(s32,s8,s8) | 166.32 GOPS |
| asimd_dp | dp4a.vs(u32,u8,u8) | 166.31 GOPS |
| asimd_dp | dp4a.vv(u32,u8,u8) | 166.29 GOPS |
| asimd_hp | fmla.vs(fp16,fp16,fp16) | 83.161 GFLOPS |
| asimd_hp | fmla.vv(fp16,fp16,fp16) | 83.151 GFLOPS |
| asimd | fmla.vs(f32,f32,f32) | 41.576 GFLOPS |
| asimd | fmla.vv(f32,f32,f32) | 41.579 GFLOPS |
| asimd | fmla.vs(f64,f64,f64) | 10.395 GFLOPS |
| asimd | fmla.vv(f64,f64,f64) | 10.394 GFLOPS |
----------------------------------------------------------------
For 32 cores:
$ ./cpufp --thread_pool=[0-31]
Number Threads: 32
Thread Pool Binding: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
----------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| asimd_dp | dp4a.vs(s32,s8,s8) | 5.304 TOPS |
| asimd_dp | dp4a.vv(s32,s8,s8) | 5.3108 TOPS |
| asimd_dp | dp4a.vs(u32,u8,u8) | 5.307 TOPS |
| asimd_dp | dp4a.vv(u32,u8,u8) | 5.3123 TOPS |
| asimd_hp | fmla.vs(fp16,fp16,fp16) | 2.6555 TFLOPS |
| asimd_hp | fmla.vv(fp16,fp16,fp16) | 2.6564 TFLOPS |
| asimd | fmla.vs(f32,f32,f32) | 1.3252 TFLOPS |
| asimd | fmla.vv(f32,f32,f32) | 1.328 TFLOPS |
| asimd | fmla.vs(f64,f64,f64) | 331.95 GFLOPS |
| asimd | fmla.vv(f64,f64,f64) | 331.98 GFLOPS |
----------------------------------------------------------------
For 64 cores:
$ ./cpufp --thread_pool=[0-63]
Number Threads: 64
Thread Pool Binding: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
----------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| asimd_dp | dp4a.vs(s32,s8,s8) | 10.601 TOPS |
| asimd_dp | dp4a.vv(s32,s8,s8) | 10.586 TOPS |
| asimd_dp | dp4a.vs(u32,u8,u8) | 10.587 TOPS |
| asimd_dp | dp4a.vv(u32,u8,u8) | 10.593 TOPS |
| asimd_hp | fmla.vs(fp16,fp16,fp16) | 5.2966 TFLOPS |
| asimd_hp | fmla.vv(fp16,fp16,fp16) | 5.2975 TFLOPS |
| asimd | fmla.vs(f32,f32,f32) | 2.6551 TFLOPS |
| asimd | fmla.vv(f32,f32,f32) | 2.6557 TFLOPS |
| asimd | fmla.vs(f64,f64,f64) | 663.98 GFLOPS |
| asimd | fmla.vv(f64,f64,f64) | 663.73 GFLOPS |
----------------------------------------------------------------
For 128 cores:
$ ./cpufp --thread_pool=[0-127]
Number Threads: 128
Thread Pool Binding: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127
----------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| asimd_dp | dp4a.vs(s32,s8,s8) | 20.951 TOPS |
| asimd_dp | dp4a.vv(s32,s8,s8) | 20.27 TOPS |
| asimd_dp | dp4a.vs(u32,u8,u8) | 19.736 TOPS |
| asimd_dp | dp4a.vv(u32,u8,u8) | 16.495 TOPS |
| asimd_hp | fmla.vs(fp16,fp16,fp16) | 10.481 TFLOPS |
| asimd_hp | fmla.vv(fp16,fp16,fp16) | 10.514 TFLOPS |
| asimd | fmla.vs(f32,f32,f32) | 5.1993 TFLOPS |
| asimd | fmla.vv(f32,f32,f32) | 4.117 TFLOPS |
| asimd | fmla.vs(f64,f64,f64) | 1.2754 TFLOPS |
| asimd | fmla.vv(f64,f64,f64) | 1.049 TFLOPS |
----------------------------------------------------------------
================================================
FILE: benchmark_result/arm64/HUAWEI_Kunpeng_D920_2249K.md
================================================
# HUAWEI Kunpeng D920 2249K
Architecture: Taishan V110
Setting: 8 cores
For single core:
$ ./cpufp --thread_pool=[0]
Number Threads: 1
Thread Pool Binding: 0
----------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| asimd_dp | dp4a.vs(s32,s8,s8) | 166.21 GOPS |
| asimd_dp | dp4a.vv(s32,s8,s8) | 166.21 GOPS |
| asimd_dp | dp4a.vs(u32,u8,u8) | 166.21 GOPS |
| asimd_dp | dp4a.vv(u32,u8,u8) | 166.2 GOPS |
| asimd_hp | fmla.vs(fp16,fp16,fp16) | 83.104 GFLOPS |
| asimd_hp | fmla.vv(fp16,fp16,fp16) | 83.104 GFLOPS |
| asimd | fmla.vs(f32,f32,f32) | 41.553 GFLOPS |
| asimd | fmla.vv(f32,f32,f32) | 41.553 GFLOPS |
| asimd | fmla.vs(f64,f64,f64) | 10.388 GFLOPS |
| asimd | fmla.vv(f64,f64,f64) | 10.388 GFLOPS |
----------------------------------------------------------------
For 8 cores:
$ ./cpufp --thread_pool=[0-7]
Number Threads: 8
Thread Pool Binding: 0 1 2 3 4 5 6 7
----------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| asimd_dp | dp4a.vs(s32,s8,s8) | 1.3132 TOPS |
| asimd_dp | dp4a.vv(s32,s8,s8) | 1.3014 TOPS |
| asimd_dp | dp4a.vs(u32,u8,u8) | 1.3034 TOPS |
| asimd_dp | dp4a.vv(u32,u8,u8) | 1.3016 TOPS |
| asimd_hp | fmla.vs(fp16,fp16,fp16) | 651.87 GFLOPS |
| asimd_hp | fmla.vv(fp16,fp16,fp16) | 652.34 GFLOPS |
| asimd | fmla.vs(f32,f32,f32) | 326.4 GFLOPS |
| asimd | fmla.vv(f32,f32,f32) | 326.12 GFLOPS |
| asimd | fmla.vs(f64,f64,f64) | 81.791 GFLOPS |
| asimd | fmla.vv(f64,f64,f64) | 81.503 GFLOPS |
----------------------------------------------------------------
================================================
FILE: benchmark_result/arm64/Phytium_D2000.md
================================================
# Phytium D2000/8
Setting: 8 FTC663 Cores
For single core:
$ ./cpufp --thread_pool=[0]
Number Threads: 1
Thread Pool Binding: 0
-------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| asimd | fmla.vs(f32,f32,f32) | 18.376 GFLOPS |
| asimd | fmla.vv(f32,f32,f32) | 18.375 GFLOPS |
| asimd | fmla.vs(f64,f64,f64) | 9.1877 GFLOPS |
| asimd | fmla.vv(f64,f64,f64) | 9.1891 GFLOPS |
-------------------------------------------------------------
For 4 cores:
$ ./cpufp --thread_pool=[0-3]
Number Threads: 4
Thread Pool Binding: 0 1 2 3
-------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| asimd | fmla.vs(f32,f32,f32) | 73.51 GFLOPS |
| asimd | fmla.vv(f32,f32,f32) | 73.51 GFLOPS |
| asimd | fmla.vs(f64,f64,f64) | 36.755 GFLOPS |
| asimd | fmla.vv(f64,f64,f64) | 36.747 GFLOPS |
-------------------------------------------------------------
================================================
FILE: benchmark_result/arm64/Qualcomm_Snapdragon_X_Elite_X1E80100.md
================================================
# Qualcomm Snapdragon X Elite - X1E80100
Architecture: Oryon-1
Setting: 4 E-cores @ 3.4GHz + 8 P-cores @ 4.0GHz
For single core:
> .\cpufp.exe --thread_pool=[4]
Number Threads: 1
Thread Pool Binding: 4
----------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| i8mm | mmla(s32,s8,s8) | 442.36 GOPS |
| i8mm | mmla(u32,u8,u8) | 434.67 GOPS |
| i8mm | mmla(s32,u8,s8) | 437.35 GOPS |
| i8mm | dp4a.vs(s32,s8,u8) | 520.02 GOPS |
| i8mm | dp4a.vs(s32,u8,s8) | 525.78 GOPS |
| i8mm | dp4a.vv(s32,u8,s8) | 515.6 GOPS |
| asimd_dp | dp4a.vs(s32,s8,s8) | 510.91 GOPS |
| asimd_dp | dp4a.vv(s32,s8,s8) | 516.89 GOPS |
| asimd_dp | dp4a.vs(u32,u8,u8) | 518 GOPS |
| asimd_dp | dp4a.vv(u32,u8,u8) | 514.3 GOPS |
| bf16 | mmla(f32,bf16,bf16) | 223.53 GFLOPS |
| bf16 | dp2a.vs(f32,bf16,bf16) | 256.44 GFLOPS |
| bf16 | dp2a.vv(f32,bf16,bf16) | 252.13 GFLOPS |
| asimd_hp | fmla.vs(fp16,fp16,fp16) | 260.4 GFLOPS |
| asimd_hp | fmla.vv(fp16,fp16,fp16) | 259.04 GFLOPS |
| asimd | fmla.vs(f32,f32,f32) | 127.29 GFLOPS |
| asimd | fmla.vv(f32,f32,f32) | 125.67 GFLOPS |
| asimd | fmla.vs(f64,f64,f64) | 65.2 GFLOPS |
| asimd | fmla.vv(f64,f64,f64) | 65.195 GFLOPS |
----------------------------------------------------------------
For 12 cores:
> .\cpufp.exe --thread_pool=[0-11]
Number Threads: 12
Thread Pool Binding: 0 1 2 3 4 5 6 7 8 9 10 11
----------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| i8mm | mmla(s32,s8,s8) | 4.3971 TOPS |
| i8mm | mmla(u32,u8,u8) | 4.3813 TOPS |
| i8mm | mmla(s32,u8,s8) | 4.3889 TOPS |
| i8mm | dp4a.vs(s32,s8,u8) | 5.1953 TOPS |
| i8mm | dp4a.vs(s32,u8,s8) | 5.221 TOPS |
| i8mm | dp4a.vv(s32,u8,s8) | 5.209 TOPS |
| asimd_dp | dp4a.vs(s32,s8,s8) | 5.2081 TOPS |
| asimd_dp | dp4a.vv(s32,s8,s8) | 5.2275 TOPS |
| asimd_dp | dp4a.vs(u32,u8,u8) | 5.222 TOPS |
| asimd_dp | dp4a.vv(u32,u8,u8) | 5.2146 TOPS |
| bf16 | mmla(f32,bf16,bf16) | 2.2578 TFLOPS |
| bf16 | dp2a.vs(f32,bf16,bf16) | 2.6124 TFLOPS |
| bf16 | dp2a.vv(f32,bf16,bf16) | 2.6172 TFLOPS |
| asimd_hp | fmla.vs(fp16,fp16,fp16) | 2.6051 TFLOPS |
| asimd_hp | fmla.vv(fp16,fp16,fp16) | 2.6035 TFLOPS |
| asimd | fmla.vs(f32,f32,f32) | 1.3028 TFLOPS |
| asimd | fmla.vv(f32,f32,f32) | 1.3032 TFLOPS |
| asimd | fmla.vs(f64,f64,f64) | 654.67 GFLOPS |
| asimd | fmla.vv(f64,f64,f64) | 654.44 GFLOPS |
----------------------------------------------------------------
================================================
FILE: benchmark_result/arm64/RockChip_RK3399.md
================================================
# Rockchip RK3399
Setting: 2 Cortex-A72(big) Cores + 4 Cortex-A53(Little) Cores
For single Little core:
$ ./cpufp --thread_pool=[0]
Number Threads: 1
Thread Pool Binding: 0
-------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| asimd | fmla.vs(f32,f32,f32) | 11.255 GFLOPS |
| asimd | fmla.vv(f32,f32,f32) | 11.255 GFLOPS |
| asimd | fmla.vs(f64,f64,f64) | 5.6275 GFLOPS |
| asimd | fmla.vv(f64,f64,f64) | 5.6277 GFLOPS |
-------------------------------------------------------------
For 4 Little cores:
$ ./cpufp --thread_pool=[0-3]
Number Threads: 4
Thread Pool Binding: 0 1 2 3
-------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| asimd | fmla.vs(f32,f32,f32) | 45.029 GFLOPS |
| asimd | fmla.vv(f32,f32,f32) | 45.027 GFLOPS |
| asimd | fmla.vs(f64,f64,f64) | 22.509 GFLOPS |
| asimd | fmla.vv(f64,f64,f64) | 22.513 GFLOPS |
-------------------------------------------------------------
For single big core:
$ ./cpufp --thread_pool=[4]
Number Threads: 1
Thread Pool Binding: 4
-------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| asimd | fmla.vs(f32,f32,f32) | 14.348 GFLOPS |
| asimd | fmla.vv(f32,f32,f32) | 14.348 GFLOPS |
| asimd | fmla.vs(f64,f64,f64) | 7.1744 GFLOPS |
| asimd | fmla.vv(f64,f64,f64) | 7.1743 GFLOPS |
-------------------------------------------------------------
For 2 big cores:
$ ./cpufp --thread_pool=[4,5]
Number Threads: 2
Thread Pool Binding: 4 5
-------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| asimd | fmla.vs(f32,f32,f32) | 28.698 GFLOPS |
| asimd | fmla.vv(f32,f32,f32) | 28.698 GFLOPS |
| asimd | fmla.vs(f64,f64,f64) | 14.349 GFLOPS |
| asimd | fmla.vv(f64,f64,f64) | 14.347 GFLOPS |
-------------------------------------------------------------
================================================
FILE: benchmark_result/arm64/RockChip_RK3588.md
================================================
# RockChip RK3588
Setting: 4 Cortex-A76(big) Cores + 4 Cortex-A55(Little) Cores
For single Little core:
$ ./cpufp --thread_pool=[0]
Number Threads: 1
Thread Pool Binding: 0
----------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| asimd_dp | dp4a.vs(s32,s8,s8) | 58.379 GOPS |
| asimd_dp | dp4a.vv(s32,s8,s8) | 58.371 GOPS |
| asimd_dp | dp4a.vs(u32,u8,u8) | 58.369 GOPS |
| asimd_dp | dp4a.vv(u32,u8,u8) | 58.382 GOPS |
| asimd_hp | fmla.vs(fp16,fp16,fp16) | 29.193 GFLOPS |
| asimd_hp | fmla.vv(fp16,fp16,fp16) | 29.192 GFLOPS |
| asimd | fmla.vs(f32,f32,f32) | 14.593 GFLOPS |
| asimd | fmla.vv(f32,f32,f32) | 14.596 GFLOPS |
| asimd | fmla.vs(f64,f64,f64) | 7.2971 GFLOPS |
| asimd | fmla.vv(f64,f64,f64) | 7.2972 GFLOPS |
----------------------------------------------------------------
For 4 Little cores:
$ ./cpufp --thread_pool=[0-3]
Number Threads: 4
Thread Pool Binding: 0 1 2 3
----------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| asimd_dp | dp4a.vs(s32,s8,s8) | 233.08 GOPS |
| asimd_dp | dp4a.vv(s32,s8,s8) | 233.05 GOPS |
| asimd_dp | dp4a.vs(u32,u8,u8) | 233.06 GOPS |
| asimd_dp | dp4a.vv(u32,u8,u8) | 233.05 GOPS |
| asimd_hp | fmla.vs(fp16,fp16,fp16) | 116.54 GFLOPS |
| asimd_hp | fmla.vv(fp16,fp16,fp16) | 116.51 GFLOPS |
| asimd | fmla.vs(f32,f32,f32) | 58.261 GFLOPS |
| asimd | fmla.vv(f32,f32,f32) | 58.258 GFLOPS |
| asimd | fmla.vs(f64,f64,f64) | 29.13 GFLOPS |
| asimd | fmla.vv(f64,f64,f64) | 29.126 GFLOPS |
----------------------------------------------------------------
For single big core:
$ ./cpufp --thread_pool=[4]
Number Threads: 1
Thread Pool Binding: 4
----------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| asimd_dp | dp4a.vs(s32,s8,s8) | 152.1 GOPS |
| asimd_dp | dp4a.vv(s32,s8,s8) | 152.1 GOPS |
| asimd_dp | dp4a.vs(u32,u8,u8) | 152.06 GOPS |
| asimd_dp | dp4a.vv(u32,u8,u8) | 152.08 GOPS |
| asimd_hp | fmla.vs(fp16,fp16,fp16) | 76.022 GFLOPS |
| asimd_hp | fmla.vv(fp16,fp16,fp16) | 76.027 GFLOPS |
| asimd | fmla.vs(f32,f32,f32) | 38.012 GFLOPS |
| asimd | fmla.vv(f32,f32,f32) | 38.008 GFLOPS |
| asimd | fmla.vs(f64,f64,f64) | 19.004 GFLOPS |
| asimd | fmla.vv(f64,f64,f64) | 19.004 GFLOPS |
----------------------------------------------------------------
For 4 big cores:
$ ./cpufp --thread_pool=[4-7]
Number Threads: 4
Thread Pool Binding: 4 5 6 7
----------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| asimd_dp | dp4a.vs(s32,s8,s8) | 601.71 GOPS |
| asimd_dp | dp4a.vv(s32,s8,s8) | 602.2 GOPS |
| asimd_dp | dp4a.vs(u32,u8,u8) | 602.22 GOPS |
| asimd_dp | dp4a.vv(u32,u8,u8) | 602.2 GOPS |
| asimd_hp | fmla.vs(fp16,fp16,fp16) | 300.97 GFLOPS |
| asimd_hp | fmla.vv(fp16,fp16,fp16) | 300.93 GFLOPS |
| asimd | fmla.vs(f32,f32,f32) | 149.79 GFLOPS |
| asimd | fmla.vv(f32,f32,f32) | 150.15 GFLOPS |
| asimd | fmla.vs(f64,f64,f64) | 75.222 GFLOPS |
| asimd | fmla.vv(f64,f64,f64) | 75.215 GFLOPS |
----------------------------------------------------------------
================================================
FILE: benchmark_result/e2k/Elbrus_4C.md
================================================
# Elbrus-4C
Setting: 4 Sockets x 4 Elbrus-v3
Frequency: 750 MHz
For single core:
$ ./cpufp --thread_pool=[0]
Number Threads: 1
Thread Pool Binding: 0
--------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| v1 | ADD(MUL(f32,f32),f32) | 11.939 GFLOPS |
| v1 | ADD(MUL(f64,f64),f64) | 5.9801 GFLOPS |
--------------------------------------------------------------
For 4 cores:
$ ./cpufp --thread_pool=[0-3]
Number Threads: 4
Thread Pool Binding: 0 1 2 3
--------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| v1 | ADD(MUL(f32,f32),f32) | 47.704 GFLOPS |
| v1 | ADD(MUL(f64,f64),f64) | 23.913 GFLOPS |
--------------------------------------------------------------
For 16 cores:
$ ./cpufp --thread_pool=[0-15]
Number Threads: 16
Thread Pool Binding: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
--------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| v1 | ADD(MUL(f32,f32),f32) | 189.81 GFLOPS |
| v1 | ADD(MUL(f64,f64),f64) | 95.294 GFLOPS |
--------------------------------------------------------------
================================================
FILE: benchmark_result/e2k/Elbrus_8C.md
================================================
# Elbrus-8C
Setting: 4 Sockets x 8 Elbrus-v4
Frequency: 1.2 GHz
For single core:
$ ./cpufp --thread_pool=[0]
Number Threads: 1
Thread Pool Binding: 0
--------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| v4 | ADD(MUL(f32,f32),f32) | 28.704 GFLOPS |
| v4 | ADD(MUL(f64,f64),f64) | 14.353 GFLOPS |
--------------------------------------------------------------
For 8 cores:
$ ./cpufp --thread_pool=[0-7]
Number Threads: 8
Thread Pool Binding: 0 1 2 3 4 5 6 7
--------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| v4 | ADD(MUL(f32,f32),f32) | 229.42 GFLOPS |
| v4 | ADD(MUL(f64,f64),f64) | 114.56 GFLOPS |
--------------------------------------------------------------
For 32 cores:
$ ./cpufp --thread_pool=[0-31]
Number Threads: 32
Thread Pool Binding: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
--------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| v4 | ADD(MUL(f32,f32),f32) | 896.58 GFLOPS |
| v4 | ADD(MUL(f64,f64),f64) | 448.7 GFLOPS |
--------------------------------------------------------------
================================================
FILE: benchmark_result/e2k/Elbrus_8C2.md
================================================
# Elbrus-8C2
Setting: 4 Sockets x 8 Elbrus-v5
Frequency: 1.2 GHz
For single core:
$ ./cpufp --thread_pool=[0]
Number Threads: 1
Thread Pool Binding: 0
--------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| v5 | ADD(MUL(f32,f32),f32) | 57.413 GFLOPS |
| v5 | ADD(MUL(f64,f64),f64) | 28.707 GFLOPS |
| v4 | ADD(MUL(f32,f32),f32) | 28.727 GFLOPS |
| v4 | ADD(MUL(f64,f64),f64) | 14.353 GFLOPS |
--------------------------------------------------------------
For 8 cores:
$ ./cpufp --thread_pool=[0-7]
Number Threads: 8
Thread Pool Binding: 0 1 2 3 4 5 6 7
--------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| v5 | ADD(MUL(f32,f32),f32) | 459.61 GFLOPS |
| v5 | ADD(MUL(f64,f64),f64) | 229.72 GFLOPS |
| v4 | ADD(MUL(f32,f32),f32) | 229.76 GFLOPS |
| v4 | ADD(MUL(f64,f64),f64) | 114.89 GFLOPS |
--------------------------------------------------------------
For 32 cores:
$ ./cpufp --thread_pool=[0-31]
Number Threads: 32
Thread Pool Binding: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
--------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| v5 | ADD(MUL(f32,f32),f32) | 1.835 TFLOPS |
| v5 | ADD(MUL(f64,f64),f64) | 917.64 GFLOPS |
| v4 | ADD(MUL(f32,f32),f32) | 917.56 GFLOPS |
| v4 | ADD(MUL(f64,f64),f64) | 458.77 GFLOPS |
--------------------------------------------------------------
================================================
FILE: benchmark_result/loongarch64/Loongson_3A5000M.md
================================================
# Loongson 3A5000M
Setting: 4 LA464 Cores
For single core:
$ ./cpufp --thread_pool=[0]
Number Threads: 1
Thread Pool Binding: 0
-------------------------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| LASX | fmadd(f32,f32,f32) + fadd(f32,f32,f32) | 47.831 GFLOPS |
| LASX | fmadd(f64,f64,f64) + fadd(f64,f64,f64) | 23.888 GFLOPS |
| LSX | fmadd(f32,f32,f32) + fadd(f32,f32,f32) | 23.918 GFLOPS |
| LSX | fmadd(f64,f64,f64) + fadd(f64,f64,f64) | 11.957 GFLOPS |
| FP_SP | fmadd(f32,f32,f32) + fadd(f32,f32,f32) | 5.9803 GFLOPS |
| FP_DP | fmadd(f64,f64,f64) + fadd(f64,f64,f64) | 5.9803 GFLOPS |
-------------------------------------------------------------------------------
For 4 cores:
$ ./cpufp --thread_pool=[0-3]
Number Threads: 4
Thread Pool Binding: 0 1 2 3
-------------------------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| LASX | fmadd(f32,f32,f32) + fadd(f32,f32,f32) | 190.92 GFLOPS |
| LASX | fmadd(f64,f64,f64) + fadd(f64,f64,f64) | 95.47 GFLOPS |
| LSX | fmadd(f32,f32,f32) + fadd(f32,f32,f32) | 95.184 GFLOPS |
| LSX | fmadd(f64,f64,f64) + fadd(f64,f64,f64) | 47.652 GFLOPS |
| FP_SP | fmadd(f32,f32,f32) + fadd(f32,f32,f32) | 23.847 GFLOPS |
| FP_DP | fmadd(f64,f64,f64) + fadd(f64,f64,f64) | 23.876 GFLOPS |
-------------------------------------------------------------------------------
================================================
FILE: benchmark_result/loongarch64/Loongson_3A6000.md
================================================
# Loongson 3A6000
Setting: 4 LA664 Cores
For single core:
$ ./cpufp --thread_pool=[0]
Number Threads: 1
Thread Pool Binding: 0
------------------------------------------------------------------------------
| Instruction Set | Vector Length | Core Computation | Peak Performance |
|-----------------|---------------|-----------------------|------------------|
| LASX | 256b | fmadd(f32,f32,f32) | 79.781 GFLOPS |
| LASX | 256b | fmadd(f64,f64,f64) | 39.939 GFLOPS |
| LASX | 256b | add(mul(f32,f32),f32) | 79.853 GFLOPS |
| LASX | 256b | add(mul(f64,f64),f64) | 39.937 GFLOPS |
|-----------------|---------------|-----------------------|------------------|
| LSX | 128b | fmadd(f32,f32,f32) | 39.916 GFLOPS |
| LSX | 128b | fmadd(f64,f64,f64) | 19.97 GFLOPS |
| LSX | 128b | add(mul(f32,f32),f32) | 39.935 GFLOPS |
| LSX | 128b | add(mul(f64,f64),f64) | 19.968 GFLOPS |
|-----------------|---------------|-----------------------|------------------|
| FP_SP | scalar | fmadd(f32,f32,f32) | 9.9848 GFLOPS |
| FP_DP | scalar | fmadd(f64,f64,f64) | 9.979 GFLOPS |
------------------------------------------------------------------------------
For 4 cores:
$ ./cpufp --thread_pool=[0,2,4,6]
Number Threads: 4
Thread Pool Binding: 0 2 4 6
------------------------------------------------------------------------------
| Instruction Set | Vector Length | Core Computation | Peak Performance |
|-----------------|---------------|-----------------------|------------------|
| LASX | 256b | fmadd(f32,f32,f32) | 319.54 GFLOPS |
| LASX | 256b | fmadd(f64,f64,f64) | 159.71 GFLOPS |
| LASX | 256b | add(mul(f32,f32),f32) | 319.15 GFLOPS |
| LASX | 256b | add(mul(f64,f64),f64) | 159.61 GFLOPS |
|-----------------|---------------|-----------------------|------------------|
| LSX | 128b | fmadd(f32,f32,f32) | 159.75 GFLOPS |
| LSX | 128b | fmadd(f64,f64,f64) | 79.876 GFLOPS |
| LSX | 128b | add(mul(f32,f32),f32) | 159.56 GFLOPS |
| LSX | 128b | add(mul(f64,f64),f64) | 79.751 GFLOPS |
|-----------------|---------------|-----------------------|------------------|
| FP_SP | scalar | fmadd(f32,f32,f32) | 39.937 GFLOPS |
| FP_DP | scalar | fmadd(f64,f64,f64) | 39.937 GFLOPS |
------------------------------------------------------------------------------
================================================
FILE: benchmark_result/loongarch64/Loongson_3C5000.md
================================================
# Loongson 3C5000
Setting: 16 LA464 Cores
For single core:
$ ./cpufp --thread_pool=[0]
Number Threads: 1
Thread Pool Binding: 0
-------------------------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| LASX | fmadd(f32,f32,f32) + fadd(f32,f32,f32) | 52.603 GFLOPS |
| LASX | fmadd(f64,f64,f64) + fadd(f64,f64,f64) | 26.331 GFLOPS |
| LSX | fmadd(f32,f32,f32) + fadd(f32,f32,f32) | 26.323 GFLOPS |
| LSX | fmadd(f64,f64,f64) + fadd(f64,f64,f64) | 13.166 GFLOPS |
| FP_SP | fmadd(f32,f32,f32) + fadd(f32,f32,f32) | 6.583 GFLOPS |
| FP_DP | fmadd(f64,f64,f64) + fadd(f64,f64,f64) | 6.5723 GFLOPS |
-------------------------------------------------------------------------------
For 16 cores:
$ ./cpufp --thread_pool=[0-15]
Number Threads: 16
Thread Pool Binding: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
-------------------------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| LASX | fmadd(f32,f32,f32) + fadd(f32,f32,f32) | 841.77 GFLOPS |
| LASX | fmadd(f64,f64,f64) + fadd(f64,f64,f64) | 406.52 GFLOPS |
| LSX | fmadd(f32,f32,f32) + fadd(f32,f32,f32) | 420.84 GFLOPS |
| LSX | fmadd(f64,f64,f64) + fadd(f64,f64,f64) | 210.01 GFLOPS |
| FP_SP | fmadd(f32,f32,f32) + fadd(f32,f32,f32) | 105.21 GFLOPS |
| FP_DP | fmadd(f64,f64,f64) + fadd(f64,f64,f64) | 104.59 GFLOPS |
-------------------------------------------------------------------------------
================================================
FILE: benchmark_result/riscv64/Kendryte_K230.md
================================================
# Kendryte K230
Setting: 2 C908 Cores
For single core:
$ ./cpufp --thread_pool=[0]
Number Threads: 1
Thread Pool Binding: 0
---------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| vector | vfmacc.vf(f16,f16,f16) | 25.014 GFLOPS |
| vector | vfmacc.vv(f16,f16,f16) | 25.01 GFLOPS |
| vector | vfmacc.vf(f32,f32,f32) | 12.507 GFLOPS |
| vector | vfmacc.vv(f32,f32,f32) | 12.508 GFLOPS |
| vector | vfmacc.vf(f64,f64,f64) | 6.254 GFLOPS |
| vector | vfmacc.vv(f64,f64,f64) | 6.2541 GFLOPS |
---------------------------------------------------------------
================================================
FILE: benchmark_result/riscv64/SpacemiT_K1.md
================================================
# SpacemiT K1
Setting: 8 SpacemiT-X60 Cores
For single core:
$ ./cpufp --thread_pool=[0]
Number Threads: 1
Thread Pool Binding: 0
---------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| ime | vmadot(s32,s8,s8) | 511.53 GOPS |
| ime | vmadotu(u32,u8,u8) | 511.5 GOPS |
| ime | vmadotus(s32,u8,s8) | 511.53 GOPS |
| ime | vmadotsu(s32,s8,u8) | 511.51 GOPS |
| ime | vmadotslide(s32,s8,s8) | 511.51 GOPS |
| vector | vfmacc.vf(f16,f16,f16) | 66.722 GFLOPS |
| vector | vfmacc.vv(f16,f16,f16) | 63.936 GFLOPS |
| vector | vfmacc.vf(f32,f32,f32) | 33.36 GFLOPS |
| vector | vfmacc.vv(f32,f32,f32) | 31.968 GFLOPS |
| vector | vfmacc.vf(f64,f64,f64) | 16.679 GFLOPS |
| vector | vfmacc.vv(f64,f64,f64) | 15.985 GFLOPS |
---------------------------------------------------------------
For cluster 0 (with IME extension), 4 cores:
$ ./cpufp --thread_pool=[0-3]
Number Threads: 4
Thread Pool Binding: 0 1 2 3
---------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| ime | vmadot(s32,s8,s8) | 2.046 TOPS |
| ime | vmadotu(u32,u8,u8) | 2.0462 TOPS |
| ime | vmadotus(s32,u8,s8) | 2.0461 TOPS |
| ime | vmadotsu(s32,s8,u8) | 2.0462 TOPS |
| ime | vmadotslide(s32,s8,s8) | 2.0461 TOPS |
| vector | vfmacc.vf(f16,f16,f16) | 266.88 GFLOPS |
| vector | vfmacc.vv(f16,f16,f16) | 255.75 GFLOPS |
| vector | vfmacc.vf(f32,f32,f32) | 133.43 GFLOPS |
| vector | vfmacc.vv(f32,f32,f32) | 127.85 GFLOPS |
| vector | vfmacc.vf(f64,f64,f64) | 66.709 GFLOPS |
| vector | vfmacc.vv(f64,f64,f64) | 63.935 GFLOPS |
---------------------------------------------------------------
For 2 clusters, 8 cores:
$ ./cpufp --thread_pool=[0-7]
Number Threads: 8
Thread Pool Binding: 0 1 2 3 4 5 6 7
---------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| vector | vfmacc.vf(f16,f16,f16) | 533.65 GFLOPS |
| vector | vfmacc.vv(f16,f16,f16) | 511.45 GFLOPS |
| vector | vfmacc.vf(f32,f32,f32) | 266.89 GFLOPS |
| vector | vfmacc.vv(f32,f32,f32) | 255.75 GFLOPS |
| vector | vfmacc.vf(f64,f64,f64) | 133.42 GFLOPS |
| vector | vfmacc.vv(f64,f64,f64) | 127.86 GFLOPS |
---------------------------------------------------------------
================================================
FILE: benchmark_result/x64/AMD_Ryzen7_8845HS.md
================================================
# AMD Ryzen7 8845HS
Architecture: Zen4
Setting: 8 Zen4 Cores
For single core:
$ ./cpufp --thread_pool=[0]
Number Threads: 1
Thread Pool Binding: 0
--------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| AVX512_VNNI | DP4A(s32,u8,s8) | 647.97 GOPS |
| AVX512_VNNI | DP2A(s32,s16,s16) | 324.27 GOPS |
| AVX512_BF16 | DP2A(f32,bf16,bf16) | 324.92 GFLOPS |
| AVX512F | FMA(f32,f32,f32) | 163.58 GFLOPS |
| AVX512F | FMA(f64,f64,f64) | 81.786 GFLOPS |
| FMA | FMA(f32,f32,f32) | 163.57 GFLOPS |
| FMA | FMA(f64,f64,f64) | 81.785 GFLOPS |
| AVX | ADD(MUL(f32,f32),f32) | 157.36 GFLOPS |
| AVX | ADD(MUL(f64,f64),f64) | 79.045 GFLOPS |
| SSE | ADD(MUL(f32,f32),f32) | 80.34 GFLOPS |
| SSE2 | ADD(MUL(f64,f64),f64) | 40.371 GFLOPS |
--------------------------------------------------------------
For 8 cores:
$ ./cpufp --thread_pool=[0-7]
Number Threads: 8
Thread Pool Binding: 0 1 2 3 4 5 6 7
--------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| AVX512_VNNI | DP4A(s32,u8,s8) | 5113.8 GOPS |
| AVX512_VNNI | DP2A(s32,s16,s16) | 2559.1 GOPS |
| AVX512_BF16 | DP2A(f32,bf16,bf16) | 2551.6 GFLOPS |
| AVX512F | FMA(f32,f32,f32) | 1283.6 GFLOPS |
| AVX512F | FMA(f64,f64,f64) | 641.21 GFLOPS |
| FMA | FMA(f32,f32,f32) | 1271.7 GFLOPS |
| FMA | FMA(f64,f64,f64) | 632.3 GFLOPS |
| AVX | ADD(MUL(f32,f32),f32) | 1193.6 GFLOPS |
| AVX | ADD(MUL(f64,f64),f64) | 590.85 GFLOPS |
| SSE | ADD(MUL(f32,f32),f32) | 613.54 GFLOPS |
| SSE2 | ADD(MUL(f64,f64),f64) | 307.67 GFLOPS |
--------------------------------------------------------------
================================================
FILE: benchmark_result/x64/AMD_Ryzen7_9700X.md
================================================
# AMD Ryzen7 9700X
Microarchitecture: Zen5
Setting: 8 Zen5 Cores
For single core:
$ ./cpufp --thread_pool=[0]
Number Threads: 1
Thread Pool Binding: 0
------------------------------------------------------------------------------
| Instruction Set | Vector Length | Core Computation | Peak Performance |
|-----------------|---------------|-----------------------|------------------|
| AVX512_VNNI | 512b | DP4A(s32,u8,s8) | 1.4172 TOPS |
| AVX512_VNNI | 512b | DP2A(s32,s16,s16) | 708.61 GOPS |
| AVX512_BF16 | 512b | DP2A(f32,bf16,bf16) | 708.19 GFLOPS |
| AVX512F | 512b | FMA(f32,f32,f32) | 354.29 GFLOPS |
| AVX512F | 512b | FMA(f64,f64,f64) | 177.09 GFLOPS |
| AVX512F | 512b | ADD(MUL(f32,f32),f32) | 353.63 GFLOPS |
| AVX512F | 512b | ADD(MUL(f64,f64),f64) | 176.47 GFLOPS |
|-----------------|---------------|-----------------------|------------------|
| AVX512_VNNI | 256b | DP4A(s32,u8,s8) | 708.55 GOPS |
| AVX_VNNI | 256b | DP4A(s32,u8,s8) | 708.64 GOPS |
| AVX512_VNNI | 256b | DP2A(s32,s16,s16) | 354.38 GOPS |
| AVX_VNNI | 256b | DP2A(s32,s16,s16) | 354.22 GOPS |
| AVX512_BF16 | 256b | DP2A(f32,bf16,bf16) | 354.39 GFLOPS |
| FMA | 256b | FMA(f32,f32,f32) | 177.14 GFLOPS |
| FMA | 256b | FMA(f64,f64,f64) | 88.565 GFLOPS |
| AVX | 256b | ADD(MUL(f32,f32),f32) | 176.96 GFLOPS |
| AVX | 256b | ADD(MUL(f64,f64),f64) | 88.467 GFLOPS |
|-----------------|---------------|-----------------------|------------------|
| AVX512_VNNI | 128b | DP4A(s32,u8,s8) | 354.53 GOPS |
| AVX_VNNI | 128b | DP4A(s32,u8,s8) | 354.53 GOPS |
| AVX512_VNNI | 128b | DP2A(s32,s16,s16) | 177.27 GOPS |
| AVX_VNNI | 128b | DP2A(s32,s16,s16) | 177.24 GOPS |
| AVX512_BF16 | 128b | DP2A(f32,bf16,bf16) | 177.26 GFLOPS |
| FMA | 128b | FMA(f32,f32,f32) | 88.641 GFLOPS |
| FMA | 128b | FMA(f64,f64,f64) | 44.308 GFLOPS |
| SSE | 128b | ADD(MUL(f32,f32),f32) | 88.465 GFLOPS |
| SSE2 | 128b | ADD(MUL(f64,f64),f64) | 44.259 GFLOPS |
------------------------------------------------------------------------------
For 8 cores:
$ ./cpufp --thread_pool=[0-7]
Number Threads: 8
Thread Pool Binding: 0 1 2 3 4 5 6 7
------------------------------------------------------------------------------
| Instruction Set | Vector Length | Core Computation | Peak Performance |
|-----------------|---------------|-----------------------|------------------|
| AVX512_VNNI | 512b | DP4A(s32,u8,s8) | 11.064 TOPS |
| AVX512_VNNI | 512b | DP2A(s32,s16,s16) | 5.5293 TOPS |
| AVX512_BF16 | 512b | DP2A(f32,bf16,bf16) | 5.5324 TFLOPS |
| AVX512F | 512b | FMA(f32,f32,f32) | 2.7598 TFLOPS |
| AVX512F | 512b | FMA(f64,f64,f64) | 1.3768 TFLOPS |
| AVX512F | 512b | ADD(MUL(f32,f32),f32) | 2.7312 TFLOPS |
| AVX512F | 512b | ADD(MUL(f64,f64),f64) | 1.3605 TFLOPS |
|-----------------|---------------|-----------------------|------------------|
| AVX512_VNNI | 256b | DP4A(s32,u8,s8) | 5.5604 TOPS |
| AVX_VNNI | 256b | DP4A(s32,u8,s8) | 5.5592 TOPS |
| AVX512_VNNI | 256b | DP2A(s32,s16,s16) | 2.7816 TOPS |
| AVX_VNNI | 256b | DP2A(s32,s16,s16) | 2.7783 TOPS |
| AVX512_BF16 | 256b | DP2A(f32,bf16,bf16) | 2.7814 TFLOPS |
| FMA | 256b | FMA(f32,f32,f32) | 1.3884 TFLOPS |
| FMA | 256b | FMA(f64,f64,f64) | 694.02 GFLOPS |
| AVX | 256b | ADD(MUL(f32,f32),f32) | 1.3781 TFLOPS |
| AVX | 256b | ADD(MUL(f64,f64),f64) | 688.82 GFLOPS |
|-----------------|---------------|-----------------------|------------------|
| AVX512_VNNI | 128b | DP4A(s32,u8,s8) | 2.7881 TOPS |
| AVX_VNNI | 128b | DP4A(s32,u8,s8) | 2.7881 TOPS |
| AVX512_VNNI | 128b | DP2A(s32,s16,s16) | 1.3938 TOPS |
| AVX_VNNI | 128b | DP2A(s32,s16,s16) | 1.3938 TOPS |
| AVX512_BF16 | 128b | DP2A(f32,bf16,bf16) | 1.3958 TFLOPS |
| FMA | 128b | FMA(f32,f32,f32) | 696.63 GFLOPS |
| FMA | 128b | FMA(f64,f64,f64) | 348.12 GFLOPS |
| SSE | 128b | ADD(MUL(f32,f32),f32) | 686.34 GFLOPS |
| SSE2 | 128b | ADD(MUL(f64,f64),f64) | 344.64 GFLOPS |
------------------------------------------------------------------------------
================================================
FILE: benchmark_result/x64/AMD_Ryzen9_6900HX.md
================================================
# AMD Ryzen9 6900HX
Architecture: Zen3+
Setting: 8 Zen3+ Cores
For single core:
$ ./cpufp --thread_pool=[0]
Number Threads: 1
Thread Pool Binding: 0
--------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| FMA | FMA(f32,f32,f32) | 151.84 GFLOPS |
| FMA | FMA(f64,f64,f64) | 75.702 GFLOPS |
| AVX | ADD(MUL(f32,f32),f32) | 150.86 GFLOPS |
| AVX | ADD(MUL(f64,f64),f64) | 75.476 GFLOPS |
| SSE | ADD(MUL(f32,f32),f32) | 75.452 GFLOPS |
| SSE2 | ADD(MUL(f64,f64),f64) | 37.737 GFLOPS |
--------------------------------------------------------------
For 8 cores:
$ ./cpufp --thread_pool=[0,2,4,6,8,10,12,14]
Number Threads: 8
Thread Pool Binding: 0 2 4 6 8 10 12 14
--------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| FMA | FMA(f32,f32,f32) | 1057.8 GFLOPS |
| FMA | FMA(f64,f64,f64) | 534.37 GFLOPS |
| AVX | ADD(MUL(f32,f32),f32) | 1037.6 GFLOPS |
| AVX | ADD(MUL(f64,f64),f64) | 516.21 GFLOPS |
| SSE | ADD(MUL(f32,f32),f32) | 518.32 GFLOPS |
| SSE2 | ADD(MUL(f64,f64),f64) | 258.92 GFLOPS |
--------------------------------------------------------------
================================================
FILE: benchmark_result/x64/Intel_Core_i3_8121U.md
================================================
# Intel Core i3-8121U
Product Code Name: Cannon-Lake
Setting: 2 Cannon-Lake Cores
For single Core:
$ ./cpufp --thread_pool=[0]
Number Threads: 1
Thread Pool Binding: 0
------------------------------------------------------------------------------
| Instruction Set | Vector Length | Core Computation | Peak Performance |
|-----------------|---------------|-----------------------|------------------|
| AVX512F | 512b | FMA(f32,f32,f32) | 101.56 GFLOPS |
| AVX512F | 512b | FMA(f64,f64,f64) | 50.784 GFLOPS |
| AVX512F | 512b | ADD(MUL(f32,f32),f32) | 50.783 GFLOPS |
| AVX512F | 512b | ADD(MUL(f64,f64),f64) | 25.391 GFLOPS |
|-----------------|---------------|-----------------------|------------------|
| FMA | 256b | FMA(f32,f32,f32) | 101.55 GFLOPS |
| FMA | 256b | FMA(f64,f64,f64) | 50.803 GFLOPS |
| AVX | 256b | ADD(MUL(f32,f32),f32) | 50.744 GFLOPS |
| AVX | 256b | ADD(MUL(f64,f64),f64) | 25.39 GFLOPS |
|-----------------|---------------|-----------------------|------------------|
| FMA | 128b | FMA(f32,f32,f32) | 50.772 GFLOPS |
| FMA | 128b | FMA(f64,f64,f64) | 25.376 GFLOPS |
| SSE | 128b | ADD(MUL(f32,f32),f32) | 12.69 GFLOPS |
| SSE2 | 128b | ADD(MUL(f64,f64),f64) | 6.3453 GFLOPS |
------------------------------------------------------------------------------
For 2 Cores:
$ ./cpufp --thread_pool=[0,1]
Number Threads: 2
Thread Pool Binding: 0 1
------------------------------------------------------------------------------
| Instruction Set | Vector Length | Core Computation | Peak Performance |
|-----------------|---------------|-----------------------|------------------|
| AVX512F | 512b | FMA(f32,f32,f32) | 197.25 GFLOPS |
| AVX512F | 512b | FMA(f64,f64,f64) | 98.624 GFLOPS |
| AVX512F | 512b | ADD(MUL(f32,f32),f32) | 98.62 GFLOPS |
| AVX512F | 512b | ADD(MUL(f64,f64),f64) | 49.315 GFLOPS |
|-----------------|---------------|-----------------------|------------------|
| FMA | 256b | FMA(f32,f32,f32) | 197.18 GFLOPS |
| FMA | 256b | FMA(f64,f64,f64) | 98.594 GFLOPS |
| AVX | 256b | ADD(MUL(f32,f32),f32) | 98.64 GFLOPS |
| AVX | 256b | ADD(MUL(f64,f64),f64) | 49.304 GFLOPS |
|-----------------|---------------|-----------------------|------------------|
| FMA | 128b | FMA(f32,f32,f32) | 98.629 GFLOPS |
| FMA | 128b | FMA(f64,f64,f64) | 49.319 GFLOPS |
| SSE | 128b | ADD(MUL(f32,f32),f32) | 24.658 GFLOPS |
| SSE2 | 128b | ADD(MUL(f64,f64),f64) | 12.326 GFLOPS |
------------------------------------------------------------------------------
================================================
FILE: benchmark_result/x64/Intel_Core_i5_1340P.md
================================================
# Intel Core i5-1340P
Product Code Name: Raptor Lake
Setting: 4 Raptor Cove (P-Core) Cores + 8 Gracemont (E-Core) Cores
For single P-Core:
$ ./cpufp --thread_pool=[0]
Number Threads: 1
Thread Pool Binding: 0
--------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| AVX_VNNI | DP4A(s32,u8,s8) | 586.84 Gops |
| AVX_VNNI | DP2A(s32,s16,s16) | 293.5 Gops |
| FMA | FMA(f32,f32,f32) | 146.76 Gflops |
| FMA | FMA(f64,f64,f64) | 73.373 Gflops |
| AVX | ADD(MUL(f32,f32),f32) | 107.7 Gflops |
| AVX | ADD(MUL(f64,f64),f64) | 53.512 Gflops |
| SSE | ADD(MUL(f32,f32),f32) | 54.49 Gflops |
| SSE2 | ADD(MUL(f64,f64),f64) | 27.243 Gflops |
--------------------------------------------------------------
For 4 P-Cores:
$ ./cpufp --thread_pool=[0,2,4,6]
Number Threads: 4
Thread Pool Binding: 0 2 4 6
--------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| AVX_VNNI | DP4A(s32,u8,s8) | 2.2454 Tops |
| AVX_VNNI | DP2A(s32,s16,s16) | 1.1215 Tops |
| FMA | FMA(f32,f32,f32) | 546.31 Gflops |
| FMA | FMA(f64,f64,f64) | 267.62 Gflops |
| AVX | ADD(MUL(f32,f32),f32) | 356.72 Gflops |
| AVX | ADD(MUL(f64,f64),f64) | 176.89 Gflops |
| SSE | ADD(MUL(f32,f32),f32) | 183.39 Gflops |
| SSE2 | ADD(MUL(f64,f64),f64) | 91.293 Gflops |
--------------------------------------------------------------
For single E-Core:
$ ./cpufp --thread_pool=[8]
Number Threads: 1
Thread Pool Binding: 8
--------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| AVX_VNNI | DP4A(s32,u8,s8) | 108.5 Gops |
| AVX_VNNI | DP2A(s32,s16,s16) | 54.251 Gops |
| FMA | FMA(f32,f32,f32) | 54.248 Gflops |
| FMA | FMA(f64,f64,f64) | 27.125 Gflops |
| AVX | ADD(MUL(f32,f32),f32) | 27.126 Gflops |
| AVX | ADD(MUL(f64,f64),f64) | 13.563 Gflops |
| SSE | ADD(MUL(f32,f32),f32) | 27.122 Gflops |
| SSE2 | ADD(MUL(f64,f64),f64) | 13.561 Gflops |
--------------------------------------------------------------
For 8 E-Cores:
$ ./cpufp --thread_pool=[8-15]
Number Threads: 8
Thread Pool Binding: 8 9 10 11 12 13 14 15
--------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| AVX_VNNI | DP4A(s32,u8,s8) | 791.36 Gops |
| AVX_VNNI | DP2A(s32,s16,s16) | 395.68 Gops |
| FMA | FMA(f32,f32,f32) | 395.67 Gflops |
| FMA | FMA(f64,f64,f64) | 197.83 Gflops |
| AVX | ADD(MUL(f32,f32),f32) | 197.84 Gflops |
| AVX | ADD(MUL(f64,f64),f64) | 98.921 Gflops |
| SSE | ADD(MUL(f32,f32),f32) | 197.83 Gflops |
| SSE2 | ADD(MUL(f64,f64),f64) | 98.916 Gflops |
--------------------------------------------------------------
================================================
FILE: benchmark_result/x64/Intel_N150.md
================================================
# Intel N150
Product Code Name: Twin-Lake
Setting: 4 Gracemont Cores
For single core:
$ ./cpufp --thread_pool=[0]
Number Threads: 1
Thread Pool Binding: 0
------------------------------------------------------------------------------
| Instruction Set | Vector Length | Core Computation | Peak Performance |
|-----------------|---------------|-----------------------|------------------|
| AVX_VNNI | 256b | DP4A(s32,u8,s8) | 114.75 GOPS |
| AVX_VNNI | 256b | DP2A(s32,s16,s16) | 57.372 GOPS |
| FMA | 256b | FMA(f32,f32,f32) | 57.374 GFLOPS |
| FMA | 256b | FMA(f64,f64,f64) | 28.608 GFLOPS |
| AVX | 256b | ADD(MUL(f32,f32),f32) | 28.688 GFLOPS |
| AVX | 256b | ADD(MUL(f64,f64),f64) | 14.344 GFLOPS |
|-----------------|---------------|-----------------------|------------------|
| AVX_VNNI | 128b | DP4A(s32,u8,s8) | 114.75 GOPS |
| AVX_VNNI | 128b | DP2A(s32,s16,s16) | 57.352 GOPS |
| FMA | 128b | FMA(f32,f32,f32) | 56.509 GFLOPS |
| FMA | 128b | FMA(f64,f64,f64) | 28.259 GFLOPS |
| SSE | 128b | ADD(MUL(f32,f32),f32) | 28.685 GFLOPS |
| SSE2 | 128b | ADD(MUL(f64,f64),f64) | 14.34 GFLOPS |
------------------------------------------------------------------------------
For 4 cores:
$ ./cpufp --thread_pool=[0-3]
Number Threads: 4
Thread Pool Binding: 0 1 2 3
------------------------------------------------------------------------------
| Instruction Set | Vector Length | Core Computation | Peak Performance |
|-----------------|---------------|-----------------------|------------------|
| AVX_VNNI | 256b | DP4A(s32,u8,s8) | 369.64 GOPS |
| AVX_VNNI | 256b | DP2A(s32,s16,s16) | 184.83 GOPS |
| FMA | 256b | FMA(f32,f32,f32) | 179.63 GFLOPS |
| FMA | 256b | FMA(f64,f64,f64) | 89.945 GFLOPS |
| AVX | 256b | ADD(MUL(f32,f32),f32) | 91.402 GFLOPS |
| AVX | 256b | ADD(MUL(f64,f64),f64) | 45.469 GFLOPS |
|-----------------|---------------|-----------------------|------------------|
| AVX_VNNI | 128b | DP4A(s32,u8,s8) | 369.7 GOPS |
| AVX_VNNI | 128b | DP2A(s32,s16,s16) | 184.84 GOPS |
| FMA | 128b | FMA(f32,f32,f32) | 171.99 GFLOPS |
| FMA | 128b | FMA(f64,f64,f64) | 86.56 GFLOPS |
| SSE | 128b | ADD(MUL(f32,f32),f32) | 88.764 GFLOPS |
| SSE2 | 128b | ADD(MUL(f64,f64),f64) | 44.468 GFLOPS |
------------------------------------------------------------------------------
================================================
FILE: benchmark_result/x64/Intel_Ultra7_255H.md
================================================
# Intel Ultra7 255H
Product Code Name: Arrow Lake-H
Setting: 6 Lion Cove P-Cores + 8 Skymont E-Cores + 2 (Unknown Arch) LPE-Cores
For single P-Core:
$ ./cpufp --thread_pool=[0]
Number Threads: 1
Thread Pool Binding: 0
------------------------------------------------------------------------------
| Instruction Set | Vector Length | Core Computation | Peak Performance |
|-----------------|---------------|-----------------------|------------------|
| AVX_VNNI | 256b | DP4A(s32,u8,s8) | 647.06 GOPS |
| AVX_VNNI_INT8 | 256b | DP4A(s32,s8,s8) | 646.81 GOPS |
| AVX_VNNI_INT8 | 256b | DP4A(s32,s8,u8) | 647.17 GOPS |
| AVX_VNNI_INT8 | 256b | DP4A(s32,u8,u8) | 646.86 GOPS |
| AVX_VNNI | 256b | DP2A(s32,s16,s16) | 323.05 GOPS |
| FMA | 256b | FMA(f32,f32,f32) | 161.55 GFLOPS |
| FMA | 256b | FMA(f64,f64,f64) | 80.961 GFLOPS |
| AVX | 256b | ADD(MUL(f32,f32),f32) | 132.12 GFLOPS |
| AVX | 256b | ADD(MUL(f64,f64),f64) | 66.11 GFLOPS |
|-----------------|---------------|-----------------------|------------------|
| AVX_VNNI | 128b | DP4A(s32,u8,s8) | 323.03 GOPS |
| AVX_VNNI_INT8 | 128b | DP4A(s32,s8,s8) | 323.55 GOPS |
| AVX_VNNI_INT8 | 128b | DP4A(s32,s8,u8) | 323.24 GOPS |
| AVX_VNNI_INT8 | 128b | DP4A(s32,u8,u8) | 323.2 GOPS |
| AVX_VNNI | 128b | DP2A(s32,s16,s16) | 161.58 GOPS |
| FMA | 128b | FMA(f32,f32,f32) | 80.786 GFLOPS |
| FMA | 128b | FMA(f64,f64,f64) | 40.381 GFLOPS |
| SSE | 128b | ADD(MUL(f32,f32),f32) | 67.709 GFLOPS |
| SSE2 | 128b | ADD(MUL(f64,f64),f64) | 33.791 GFLOPS |
------------------------------------------------------------------------------
For 6 P-Cores:
$ ./cpufp --thread_pool=[0-5]
Number Threads: 6
Thread Pool Binding: 0 1 2 3 4 5
------------------------------------------------------------------------------
| Instruction Set | Vector Length | Core Computation | Peak Performance |
|-----------------|---------------|-----------------------|------------------|
| AVX_VNNI | 256b | DP4A(s32,u8,s8) | 3.4864 TOPS |
| AVX_VNNI_INT8 | 256b | DP4A(s32,s8,s8) | 3.4477 TOPS |
| AVX_VNNI_INT8 | 256b | DP4A(s32,s8,u8) | 3.416 TOPS |
| AVX_VNNI_INT8 | 256b | DP4A(s32,u8,u8) | 3.4142 TOPS |
| AVX_VNNI | 256b | DP2A(s32,s16,s16) | 1.7058 TOPS |
| FMA | 256b | FMA(f32,f32,f32) | 854.05 GFLOPS |
| FMA | 256b | FMA(f64,f64,f64) | 426.89 GFLOPS |
| AVX | 256b | ADD(MUL(f32,f32),f32) | 710.61 GFLOPS |
| AVX | 256b | ADD(MUL(f64,f64),f64) | 355.38 GFLOPS |
|-----------------|---------------|-----------------------|------------------|
| AVX_VNNI | 128b | DP4A(s32,u8,s8) | 1.7078 TOPS |
| AVX_VNNI_INT8 | 128b | DP4A(s32,s8,s8) | 1.7078 TOPS |
| AVX_VNNI_INT8 | 128b | DP4A(s32,s8,u8) | 1.7081 TOPS |
| AVX_VNNI_INT8 | 128b | DP4A(s32,u8,u8) | 1.7087 TOPS |
| AVX_VNNI | 128b | DP2A(s32,s16,s16) | 853.72 GOPS |
| FMA | 128b | FMA(f32,f32,f32) | 426.93 GFLOPS |
| FMA | 128b | FMA(f64,f64,f64) | 213.29 GFLOPS |
| SSE | 128b | ADD(MUL(f32,f32),f32) | 354.37 GFLOPS |
| SSE2 | 128b | ADD(MUL(f64,f64),f64) | 178.34 GFLOPS |
------------------------------------------------------------------------------
For single E-Core:
$ ./cpufp --thread_pool=[6]
Number Threads: 1
Thread Pool Binding: 6
------------------------------------------------------------------------------
| Instruction Set | Vector Length | Core Computation | Peak Performance |
|-----------------|---------------|-----------------------|------------------|
| AVX_VNNI | 256b | DP4A(s32,u8,s8) | 561.38 GOPS |
| AVX_VNNI_INT8 | 256b | DP4A(s32,s8,s8) | 561.39 GOPS |
| AVX_VNNI_INT8 | 256b | DP4A(s32,s8,u8) | 561.43 GOPS |
| AVX_VNNI_INT8 | 256b | DP4A(s32,u8,u8) | 561.43 GOPS |
| AVX_VNNI | 256b | DP2A(s32,s16,s16) | 280.72 GOPS |
| FMA | 256b | FMA(f32,f32,f32) | 140.35 GFLOPS |
| FMA | 256b | FMA(f64,f64,f64) | 70.175 GFLOPS |
| AVX | 256b | ADD(MUL(f32,f32),f32) | 70.177 GFLOPS |
| AVX | 256b | ADD(MUL(f64,f64),f64) | 35.089 GFLOPS |
|-----------------|---------------|-----------------------|------------------|
| AVX_VNNI | 128b | DP4A(s32,u8,s8) | 449.23 GOPS |
| AVX_VNNI_INT8 | 128b | DP4A(s32,s8,s8) | 449.91 GOPS |
| AVX_VNNI_INT8 | 128b | DP4A(s32,s8,u8) | 449.35 GOPS |
| AVX_VNNI_INT8 | 128b | DP4A(s32,u8,u8) | 449.5 GOPS |
| AVX_VNNI | 128b | DP2A(s32,s16,s16) | 224.62 GOPS |
| FMA | 128b | FMA(f32,f32,f32) | 113.49 GFLOPS |
| FMA | 128b | FMA(f64,f64,f64) | 56.793 GFLOPS |
| SSE | 128b | ADD(MUL(f32,f32),f32) | 70.099 GFLOPS |
| SSE2 | 128b | ADD(MUL(f64,f64),f64) | 35.043 GFLOPS |
------------------------------------------------------------------------------
For 8 E-Cores:
$ ./cpufp --thread_pool=[6-13]
Number Threads: 8
Thread Pool Binding: 6 7 8 9 10 11 12 13
------------------------------------------------------------------------------
| Instruction Set | Vector Length | Core Computation | Peak Performance |
|-----------------|---------------|-----------------------|------------------|
| AVX_VNNI | 256b | DP4A(s32,u8,s8) | 4.1754 TOPS |
| AVX_VNNI_INT8 | 256b | DP4A(s32,s8,s8) | 4.1767 TOPS |
| AVX_VNNI_INT8 | 256b | DP4A(s32,s8,u8) | 4.1732 TOPS |
| AVX_VNNI_INT8 | 256b | DP4A(s32,u8,u8) | 4.1708 TOPS |
| AVX_VNNI | 256b | DP2A(s32,s16,s16) | 2.0668 TOPS |
| FMA | 256b | FMA(f32,f32,f32) | 1.029 TFLOPS |
| FMA | 256b | FMA(f64,f64,f64) | 513.76 GFLOPS |
| AVX | 256b | ADD(MUL(f32,f32),f32) | 511.26 GFLOPS |
| AVX | 256b | ADD(MUL(f64,f64),f64) | 254.99 GFLOPS |
|-----------------|---------------|-----------------------|------------------|
| AVX_VNNI | 128b | DP4A(s32,u8,s8) | 3.26 TOPS |
| AVX_VNNI_INT8 | 128b | DP4A(s32,s8,s8) | 3.2669 TOPS |
| AVX_VNNI_INT8 | 128b | DP4A(s32,s8,u8) | 3.2702 TOPS |
| AVX_VNNI_INT8 | 128b | DP4A(s32,u8,u8) | 3.2616 TOPS |
| AVX_VNNI | 128b | DP2A(s32,s16,s16) | 1.6311 TOPS |
| FMA | 128b | FMA(f32,f32,f32) | 824.83 GFLOPS |
| FMA | 128b | FMA(f64,f64,f64) | 412.47 GFLOPS |
| SSE | 128b | ADD(MUL(f32,f32),f32) | 509.08 GFLOPS |
| SSE2 | 128b | ADD(MUL(f64,f64),f64) | 254.62 GFLOPS |
------------------------------------------------------------------------------
For single LPE-Core:
$ ./cpufp --thread_pool=[14]
Number Threads: 1
Thread Pool Binding: 14
------------------------------------------------------------------------------
| Instruction Set | Vector Length | Core Computation | Peak Performance |
|-----------------|---------------|-----------------------|------------------|
| AVX_VNNI | 256b | DP4A(s32,u8,s8) | 157.12 GOPS |
| AVX_VNNI_INT8 | 256b | DP4A(s32,s8,s8) | 157 GOPS |
| AVX_VNNI_INT8 | 256b | DP4A(s32,s8,u8) | 157.02 GOPS |
| AVX_VNNI_INT8 | 256b | DP4A(s32,u8,u8) | 156.96 GOPS |
| AVX_VNNI | 256b | DP2A(s32,s16,s16) | 78.469 GOPS |
| FMA | 256b | FMA(f32,f32,f32) | 39.237 GFLOPS |
| FMA | 256b | FMA(f64,f64,f64) | 19.624 GFLOPS |
| AVX | 256b | ADD(MUL(f32,f32),f32) | 19.63 GFLOPS |
| AVX | 256b | ADD(MUL(f64,f64),f64) | 9.8176 GFLOPS |
|-----------------|---------------|-----------------------|------------------|
| AVX_VNNI | 128b | DP4A(s32,u8,s8) | 156.93 GOPS |
| AVX_VNNI_INT8 | 128b | DP4A(s32,s8,s8) | 157.11 GOPS |
| AVX_VNNI_INT8 | 128b | DP4A(s32,s8,u8) | 156.99 GOPS |
| AVX_VNNI_INT8 | 128b | DP4A(s32,u8,u8) | 156.87 GOPS |
| AVX_VNNI | 128b | DP2A(s32,s16,s16) | 78.453 GOPS |
| FMA | 128b | FMA(f32,f32,f32) | 39.312 GFLOPS |
| FMA | 128b | FMA(f64,f64,f64) | 19.628 GFLOPS |
| SSE | 128b | ADD(MUL(f32,f32),f32) | 19.615 GFLOPS |
| SSE2 | 128b | ADD(MUL(f64,f64),f64) | 9.8155 GFLOPS |
------------------------------------------------------------------------------
For 2 LPE-Cores:
$ ./cpufp --thread_pool=[14,15]
Number Threads: 2
Thread Pool Binding: 14 15
------------------------------------------------------------------------------
| Instruction Set | Vector Length | Core Computation | Peak Performance |
|-----------------|---------------|-----------------------|------------------|
| AVX_VNNI | 256b | DP4A(s32,u8,s8) | 316.22 GOPS |
| AVX_VNNI_INT8 | 256b | DP4A(s32,s8,s8) | 316.14 GOPS |
| AVX_VNNI_INT8 | 256b | DP4A(s32,s8,u8) | 315.76 GOPS |
| AVX_VNNI_INT8 | 256b | DP4A(s32,u8,u8) | 316.06 GOPS |
| AVX_VNNI | 256b | DP2A(s32,s16,s16) | 158.13 GOPS |
| FMA | 256b | FMA(f32,f32,f32) | 79.052 GFLOPS |
| FMA | 256b | FMA(f64,f64,f64) | 39.483 GFLOPS |
| AVX | 256b | ADD(MUL(f32,f32),f32) | 39.472 GFLOPS |
| AVX | 256b | ADD(MUL(f64,f64),f64) | 19.759 GFLOPS |
|-----------------|---------------|-----------------------|------------------|
| AVX_VNNI | 128b | DP4A(s32,u8,s8) | 315.74 GOPS |
| AVX_VNNI_INT8 | 128b | DP4A(s32,s8,s8) | 316.01 GOPS |
| AVX_VNNI_INT8 | 128b | DP4A(s32,s8,u8) | 315.23 GOPS |
| AVX_VNNI_INT8 | 128b | DP4A(s32,u8,u8) | 316.03 GOPS |
| AVX_VNNI | 128b | DP2A(s32,s16,s16) | 157.66 GOPS |
| FMA | 128b | FMA(f32,f32,f32) | 79.005 GFLOPS |
| FMA | 128b | FMA(f64,f64,f64) | 39.435 GFLOPS |
| SSE | 128b | ADD(MUL(f32,f32),f32) | 39.406 GFLOPS |
| SSE2 | 128b | ADD(MUL(f64,f64),f64) | 19.723 GFLOPS |
------------------------------------------------------------------------------
================================================
FILE: benchmark_result/x64/Intel_Xeon_Gold_6455B.md
================================================
# Intel Xeon Gold 6455B
Microarchitecture: Sapphire Rapids
Setting: 2 Sockets x 32 Golden Cove Cores
For single core:
$ ./cpufp --thread_pool=[0]
Number Threads: 1
Thread Pool Binding: 0
--------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| AMX_INT8 | MM(s32,s8,s8) | 6.3726 Tops |
| AMX_INT8 | MM(s32,s8,u8) | 7.5746 Tops |
| AMX_INT8 | MM(s32,u8,s8) | 7.5733 Tops |
| AMX_INT8 | MM(s32,u8,u8) | 7.5718 Tops |
| AMX_BF16 | MM(f32,bf16,bf16) | 3.7868 Tflops |
| AVX512_VNNI | DP4A(s32,u8,s8) | 998.07 Gops |
| AVX512_VNNI | DP2A(s32,s16,s16) | 499.07 Gops |
| AVX_VNNI | DP4A(s32,u8,s8) | 498.96 Gops |
| AVX_VNNI | DP2A(s32,s16,s16) | 249.47 Gops |
| AVX512_BF16 | DP2A(f32,bf16,bf16) | 115.16 Gflops |
| AVX512_FP16 | FMA(f16,f16,f16) | 499.08 Gflops |
| AVX512F | FMA(f32,f32,f32) | 230.28 Gflops |
| AVX512F | FMA(f64,f64,f64) | 115.17 Gflops |
| FMA | FMA(f32,f32,f32) | 118.35 Gflops |
| FMA | FMA(f64,f64,f64) | 62.385 Gflops |
| AVX | ADD(MUL(f32,f32),f32) | 91.59 Gflops |
| AVX | ADD(MUL(f64,f64),f64) | 45.85 Gflops |
| SSE | ADD(MUL(f32,f32),f32) | 46.493 Gflops |
| SSE2 | ADD(MUL(f64,f64),f64) | 23.235 Gflops |
--------------------------------------------------------------
For 64 cores:
$ ./cpufp --thread_pool=[0-63]
Number Threads: 64
Thread Pool Binding: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
--------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| AMX_INT8 | MM(s32,s8,s8) | 390.67 Tops |
| AMX_INT8 | MM(s32,s8,u8) | 380.93 Tops |
| AMX_INT8 | MM(s32,u8,s8) | 391.32 Tops |
| AMX_INT8 | MM(s32,u8,u8) | 380.28 Tops |
| AMX_BF16 | MM(f32,bf16,bf16) | 192.47 Tflops |
| AVX512_VNNI | DP4A(s32,u8,s8) | 48.114 Tops |
| AVX512_VNNI | DP2A(s32,s16,s16) | 24.169 Tops |
| AVX_VNNI | DP4A(s32,u8,s8) | 30.818 Tops |
| AVX_VNNI | DP2A(s32,s16,s16) | 15.74 Tops |
| AVX512_BF16 | DP2A(f32,bf16,bf16) | 7.09 Tflops |
| AVX512_FP16 | FMA(f16,f16,f16) | 31.473 Tflops |
| AVX512F | FMA(f32,f32,f32) | 14.329 Tflops |
| AVX512F | FMA(f64,f64,f64) | 6.5406 Tflops |
| FMA | FMA(f32,f32,f32) | 7.4039 Tflops |
| FMA | FMA(f64,f64,f64) | 3.9067 Tflops |
| AVX | ADD(MUL(f32,f32),f32) | 5.4087 Tflops |
| AVX | ADD(MUL(f64,f64),f64) | 2.7339 Tflops |
| SSE | ADD(MUL(f32,f32),f32) | 2.9077 Tflops |
| SSE2 | ADD(MUL(f64,f64),f64) | 1.4791 Tflops |
--------------------------------------------------------------
================================================
FILE: benchmark_result/x64/Intel_Xeon_W9_3495X.md
================================================
# Intel Xeon W9-3495X
Microarchitecture: Sapphire Rapids
Setting: 1 Socket x 56 Golden Cove Cores
For single core:
$ ./cpufp --thread_pool=[0]
Number Threads: 1
Thread Pool Binding: 0
--------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| AMX_INT8 | MM(s32,s8,s8) | 5.6821 TOPS |
| AMX_INT8 | MM(s32,s8,u8) | 5.6854 TOPS |
| AMX_INT8 | MM(s32,u8,s8) | 5.6872 TOPS |
| AMX_INT8 | MM(s32,u8,u8) | 5.6905 TOPS |
| AMX_BF16 | MM(f32,bf16,bf16) | 2.8448 TFLOPS |
| AVX512_VNNI | DP4A(s32,u8,s8) | 711.46 GOPS |
| AVX512_VNNI | DP2A(s32,s16,s16) | 355.73 GOPS |
| AVX_VNNI | DP4A(s32,u8,s8) | 368.94 GOPS |
| AVX_VNNI | DP2A(s32,s16,s16) | 184.44 GOPS |
| AVX512_BF16 | DP2A(f32,bf16,bf16) | 80.477 GFLOPS |
| AVX512_FP16 | FMA(f16,f16,f16) | 355.76 GFLOPS |
| AVX512F | FMA(f32,f32,f32) | 158.74 GFLOPS |
| AVX512F | FMA(f64,f64,f64) | 79.375 GFLOPS |
| FMA | FMA(f32,f32,f32) | 92.224 GFLOPS |
| FMA | FMA(f64,f64,f64) | 46.115 GFLOPS |
| AVX | ADD(MUL(f32,f32),f32) | 67.789 GFLOPS |
| AVX | ADD(MUL(f64,f64),f64) | 33.9 GFLOPS |
| SSE | ADD(MUL(f32,f32),f32) | 34.43 GFLOPS |
| SSE2 | ADD(MUL(f64,f64),f64) | 17.218 GFLOPS |
--------------------------------------------------------------
For 56 cores:
$ ./cpufp --thread_pool=[0-55]
Number Threads: 56
Thread Pool Binding: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55
--------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| AMX_INT8 | MM(s32,s8,s8) | 293.86 TOPS |
| AMX_INT8 | MM(s32,s8,u8) | 309.81 TOPS |
| AMX_INT8 | MM(s32,u8,s8) | 293.44 TOPS |
| AMX_INT8 | MM(s32,u8,u8) | 293.07 TOPS |
| AMX_BF16 | MM(f32,bf16,bf16) | 141.12 TFLOPS |
| AVX512_VNNI | DP4A(s32,u8,s8) | 39.629 TOPS |
| AVX512_VNNI | DP2A(s32,s16,s16) | 19.772 TOPS |
| AVX_VNNI | DP4A(s32,u8,s8) | 20.503 TOPS |
| AVX_VNNI | DP2A(s32,s16,s16) | 10.236 TOPS |
| AVX512_BF16 | DP2A(f32,bf16,bf16) | 4.4223 TFLOPS |
| AVX512_FP16 | FMA(f16,f16,f16) | 19.761 TFLOPS |
| AVX512F | FMA(f32,f32,f32) | 7.7876 TFLOPS |
| AVX512F | FMA(f64,f64,f64) | 3.8961 TFLOPS |
| FMA | FMA(f32,f32,f32) | 4.962 TFLOPS |
| FMA | FMA(f64,f64,f64) | 2.4778 TFLOPS |
| AVX | ADD(MUL(f32,f32),f32) | 3.4637 TFLOPS |
| AVX | ADD(MUL(f64,f64),f64) | 1.7112 TFLOPS |
| SSE | ADD(MUL(f32,f32),f32) | 1.9122 TFLOPS |
| SSE2 | ADD(MUL(f64,f64),f64) | 960.12 GFLOPS |
--------------------------------------------------------------
================================================
FILE: benchmark_result/x64/ZHAOXIN_KX_6640MA.md
================================================
# ZHAOXIN KX-6640MA
Architecture: LuJiaZui
Setting: 4 Cores
For single core:
$ ./cpufp --thread_pool=[0]
Number Threads: 1
Thread Pool Binding: 0
--------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| AVX | ADD(MUL(f32,f32),f32) | 13.825 GFLOPS |
| AVX | ADD(MUL(f64,f64),f64) | 5.1625 GFLOPS |
| SSE | ADD(MUL(f32,f32),f32) | 20.738 GFLOPS |
| SSE2 | ADD(MUL(f64,f64),f64) | 5.1844 GFLOPS |
--------------------------------------------------------------
For 4 cores:
$ ./cpufp --thread_pool=[0-3]
Number Threads: 4
Thread Pool Binding: 0 1 2 3
--------------------------------------------------------------
| Instruction Set | Core Computation | Peak Performance |
| AVX | ADD(MUL(f32,f32),f32) | 46.638 GFLOPS |
| AVX | ADD(MUL(f64,f64),f64) | 17.449 GFLOPS |
| SSE | ADD(MUL(f32,f32),f32) | 70.102 GFLOPS |
| SSE2 | ADD(MUL(f64,f64),f64) | 17.511 GFLOPS |
--------------------------------------------------------------
================================================
FILE: build_arm64.sh
================================================
# Build cpufp for arm64 (Linux and macOS).
# Only POSIX sh constructs are used, so the script behaves the same no matter
# which shell ends up interpreting it (bash, dash, ...).
SRC=arm64
ASM=$SRC/asm
COMM=common
BUILD_DIR=build_dir
# `uname -s` is the POSIX-specified way to obtain the kernel name ("Darwin" on
# macOS); `-o` is a non-standard option that not every uname implements.
OS=$(uname -s)

# make directory (or empty an already existing one)
if [ -d "$BUILD_DIR" ]; then
    rm -rf "$BUILD_DIR"/*
else
    mkdir "$BUILD_DIR"
fi

# build common tools
# NOTE(review): on macOS these two objects get -std=gnu++17 while the final
# cpufp compile below does not, and the other way round elsewhere. The flags
# are kept exactly as they were; confirm the asymmetry is intended.
if [ "$OS" = "Darwin" ]; then
    g++ -O3 -std=gnu++17 -c "$COMM/table.cpp" -o "$BUILD_DIR/table.o"
    g++ -O3 -std=gnu++17 -pthread -c "$COMM/smtl.cpp" -o "$BUILD_DIR/smtl.o"
else
    g++ -O3 -c "$COMM/table.cpp" -o "$BUILD_DIR/table.o"
    g++ -O3 -pthread -c "$COMM/smtl.cpp" -o "$BUILD_DIR/smtl.o"
fi

# gen benchmark macro according to cpuid feature:
# the cpuid helper prints one feature name per line (e.g. _ASIMD_); each name
# becomes a -D macro for cpufp.cpp and selects the kernel $ASM/<name>.S.
gcc "$SRC/cpuid.c" -o "$BUILD_DIR/cpuid"
SIMD_MACRO=" "
SIMD_OBJ=" "
AS_EXTRA_FLAGS="-mcpu=all"
if [ "$OS" = "Darwin" ]; then
    AS_EXTRA_FLAGS="-mcpu=apple-m2"
fi
for SIMD in $("$BUILD_DIR/cpuid"); do
    SIMD_MACRO="$SIMD_MACRO-D$SIMD "
    SIMD_OBJ="$SIMD_OBJ$BUILD_DIR/$SIMD.o "
    # AS_EXTRA_FLAGS is left unquoted on purpose so it may hold several flags.
    as $AS_EXTRA_FLAGS -c "$ASM/$SIMD.S" -o "$BUILD_DIR/$SIMD.o"
done

# compile cpufp
# SIMD_MACRO / SIMD_OBJ are space separated lists and must stay unquoted so
# the shell splits them into individual arguments.
if [ "$OS" != "Darwin" ]; then
    g++ -std=gnu++17 -O3 -I"$COMM" $SIMD_MACRO -c "$SRC/cpufp.cpp" -o "$BUILD_DIR/cpufp.o"
    g++ -std=gnu++17 -O3 -z noexecstack -pthread -o cpufp "$BUILD_DIR/cpufp.o" "$BUILD_DIR/smtl.o" "$BUILD_DIR/table.o" $SIMD_OBJ
else
    g++ -O3 -I"$COMM" $SIMD_MACRO -c "$SRC/cpufp.cpp" -o "$BUILD_DIR/cpufp.o"
    g++ -O3 -pthread -o cpufp "$BUILD_DIR/cpufp.o" "$BUILD_DIR/smtl.o" "$BUILD_DIR/table.o" $SIMD_OBJ
fi
================================================
FILE: build_e2k.sh
================================================
# Build cpufp for Elbrus (e2k) with the l++ compiler driver.
ARCH_DIR=e2k
COMMON_DIR=common
OUT_DIR=build_dir

# Start from an empty output directory: create it when missing, otherwise
# wipe whatever a previous build left behind.
if [ ! -d "${OUT_DIR}" ]; then
    mkdir "${OUT_DIR}"
else
    rm -rf "${OUT_DIR}"/*
fi

# Shared helpers: result table printer and the thread pool.
l++ -O3 -c "${COMMON_DIR}/table.cpp" -o "${OUT_DIR}/table.o"
l++ -O3 -pthread -c "${COMMON_DIR}/smtl.cpp" -o "${OUT_DIR}/smtl.o"

# A single assembly file covers every variant; the implementation is
# selected by __iset__ at compile time.
l++ -c "${ARCH_DIR}/asm.S" -o "${OUT_DIR}/asm.o"

# Compile the driver and link the final cpufp binary.
l++ -O3 -I"${COMMON_DIR}" -c "${ARCH_DIR}/cpufp.cpp" -o "${OUT_DIR}/cpufp.o"
l++ -O3 -z noexecstack -pthread -o cpufp \
    "${OUT_DIR}/cpufp.o" "${OUT_DIR}/smtl.o" "${OUT_DIR}/table.o" "${OUT_DIR}/asm.o"
================================================
FILE: build_loongarch64.sh
================================================
# Build cpufp for loongarch64.
ARCH_DIR=loongarch64
KERNEL_DIR="${ARCH_DIR}/asm"
COMMON_DIR=common
OUT_DIR=build_dir

# Start from an empty output directory: create it when missing, otherwise
# wipe whatever a previous build left behind.
if [ ! -d "${OUT_DIR}" ]; then
    mkdir "${OUT_DIR}"
else
    rm -rf "${OUT_DIR}"/*
fi

# Shared helpers: result table printer and the thread pool.
g++ -O3 -c "${COMMON_DIR}/table.cpp" -o "${OUT_DIR}/table.o"
g++ -O3 -pthread -c "${COMMON_DIR}/smtl.cpp" -o "${OUT_DIR}/smtl.o"

# Build the feature probe, then turn every name it prints into a -D macro
# and assemble the kernel file of the same name.
gcc "${ARCH_DIR}/cpuid.c" -o "${OUT_DIR}/cpuid"
DEFINES=" "
KERNEL_OBJS=" "
for FEATURE in $("${OUT_DIR}/cpuid")
do
    DEFINES="${DEFINES}-D${FEATURE} "
    KERNEL_OBJS="${KERNEL_OBJS}${OUT_DIR}/${FEATURE}.o "
    g++ -c "${KERNEL_DIR}/${FEATURE}.S" -o "${OUT_DIR}/${FEATURE}.o"
done

# Compile the driver and link the final cpufp binary. DEFINES and
# KERNEL_OBJS are deliberately unquoted so they split into separate words.
g++ -O3 -I"${COMMON_DIR}" ${DEFINES} -c "${ARCH_DIR}/cpufp.cpp" -o "${OUT_DIR}/cpufp.o"
g++ -O3 -z noexecstack -pthread -o cpufp \
    "${OUT_DIR}/cpufp.o" "${OUT_DIR}/smtl.o" "${OUT_DIR}/table.o" ${KERNEL_OBJS}
================================================
FILE: build_riscv64.sh
================================================
# Build cpufp for riscv64 (vector and half-float extensions: rv64gcv_zfh).
ARCH_DIR=riscv64
KERNEL_DIR="${ARCH_DIR}/asm"
COMMON_DIR=common
OUT_DIR=build_dir

# Start from an empty output directory: create it when missing, otherwise
# wipe whatever a previous build left behind.
if [ ! -d "${OUT_DIR}" ]; then
    mkdir "${OUT_DIR}"
else
    rm -rf "${OUT_DIR}"/*
fi

# Shared helpers: result table printer and the thread pool.
g++ -O3 -c "${COMMON_DIR}/table.cpp" -o "${OUT_DIR}/table.o"
g++ -O3 -pthread -c "${COMMON_DIR}/smtl.cpp" -o "${OUT_DIR}/smtl.o"

# Build the feature probe, then turn every name it prints into a -D macro
# and assemble the kernel file of the same name.
gcc "${ARCH_DIR}/cpuid.c" -o "${OUT_DIR}/cpuid"
DEFINES=" "
KERNEL_OBJS=" "
for FEATURE in $("${OUT_DIR}/cpuid")
do
    DEFINES="${DEFINES}-D${FEATURE} "
    KERNEL_OBJS="${KERNEL_OBJS}${OUT_DIR}/${FEATURE}.o "
    as -march=rv64gcv_zfh -c "${KERNEL_DIR}/${FEATURE}.S" -o "${OUT_DIR}/${FEATURE}.o"
done

# Compile the driver and link the final cpufp binary. DEFINES and
# KERNEL_OBJS are deliberately unquoted so they split into separate words.
g++ -O3 -march=rv64gcv_zfh -I"${COMMON_DIR}" ${DEFINES} -c "${ARCH_DIR}/cpufp.cpp" -o "${OUT_DIR}/cpufp.o"
g++ -O3 -z noexecstack -pthread -o cpufp \
    "${OUT_DIR}/cpufp.o" "${OUT_DIR}/smtl.o" "${OUT_DIR}/table.o" ${KERNEL_OBJS}
================================================
FILE: build_x64.sh
================================================
# Build cpufp for x86-64.
ARCH_DIR=x64
KERNEL_DIR="${ARCH_DIR}/asm"
COMMON_DIR=common
OUT_DIR=build_dir

# Start from an empty output directory: create it when missing, otherwise
# wipe whatever a previous build left behind.
if [ ! -d "${OUT_DIR}" ]; then
    mkdir "${OUT_DIR}"
else
    rm -rf "${OUT_DIR}"/*
fi

# Shared helpers: result table printer and the thread pool.
g++ -O3 -c "${COMMON_DIR}/table.cpp" -o "${OUT_DIR}/table.o"
g++ -O3 -pthread -c "${COMMON_DIR}/smtl.cpp" -o "${OUT_DIR}/smtl.o"

# Build the feature probe, then turn every name it prints into a -D macro
# and assemble the kernel file of the same name.
gcc "${ARCH_DIR}/cpuid.c" -o "${OUT_DIR}/cpuid"
DEFINES=" "
KERNEL_OBJS=" "
for FEATURE in $("${OUT_DIR}/cpuid")
do
    DEFINES="${DEFINES}-D${FEATURE} "
    KERNEL_OBJS="${KERNEL_OBJS}${OUT_DIR}/${FEATURE}.o "
    g++ -c "${KERNEL_DIR}/${FEATURE}.S" -o "${OUT_DIR}/${FEATURE}.o"
done

# Compile the driver and link the final cpufp binary. DEFINES and
# KERNEL_OBJS are deliberately unquoted so they split into separate words.
g++ -O3 -I"${COMMON_DIR}" ${DEFINES} -c "${ARCH_DIR}/cpufp.cpp" -o "${OUT_DIR}/cpufp.o"
g++ -O3 -z noexecstack -pthread -o cpufp \
    "${OUT_DIR}/cpufp.o" "${OUT_DIR}/smtl.o" "${OUT_DIR}/table.o" ${KERNEL_OBJS}
================================================
FILE: clean.sh
================================================
# Remove everything a build script produces: the object directory and the
# cpufp binary.
OUT_DIR=build_dir
rm -rf "${OUT_DIR}" cpufp
================================================
FILE: common/smtl.cpp
================================================
#include "smtl.hpp"
#include
#include
#include
#ifdef __APPLE__
#include
#include
#include
#endif
#include
#include
#define SMTL_MAX_THREADS 512
/* Per-worker state, stored in smtl_t::status and polled by the worker loop. */
enum smtl_status
{
SMTL_WORK, /* set by smtl_begin_tasks(): the worker should drain its queue */
SMTL_IDLE, /* initial state; also set by a worker once its queue is drained */
SMTL_FINI, /* set by smtl_fini(): the worker should return */
};
/*
 * One pending task in a worker's singly linked queue.
 * Nodes are malloc'ed by smtl_add_task() and freed by the worker after the
 * task has run (or by smtl_fini() if they were never run).
 */
struct queue_node_t
{
task_func_t task_func; /* function to call */
void *params; /* opaque argument handed to task_func */
struct queue_node_t *next; /* next node in the same queue, or NULL */
};
/*
 * Thread pool state behind the opaque smtl_handle.
 * All per-thread arrays are fixed at SMTL_MAX_THREADS entries and indexed by
 * the worker's tid (0 .. num_threads-1).
 */
struct smtl_t
{
int num_threads; /* number of workers created by smtl_init() */
struct queue_node_t *task_queues[SMTL_MAX_THREADS]; /* head of each worker's task list */
int cur_qid; /* queue that receives the next smtl_add_task() (round-robin) */
pthread_t tids[SMTL_MAX_THREADS]; /* pthread ids, joined in smtl_fini() */
pthread_mutex_t pt_mtx; /* guards thread_holds together with pt_cv */
pthread_cond_t pt_cv; /* signalled by the worker that brings thread_holds to 0 */
int thread_holds; /* workers that have not yet finished the current batch */
pthread_mutex_t sl_mtxs[SMTL_MAX_THREADS]; /* per-worker mutex used when status is changed/waited on */
pthread_cond_t sl_cvs[SMTL_MAX_THREADS]; /* per-worker condvar signalled on WORK / FINI */
enum smtl_status status[SMTL_MAX_THREADS]; /* per-worker state, see enum smtl_status */
};
/*
 * Start-up parameters for one worker thread.
 * Allocated by smtl_init() and freed by the worker itself as soon as it has
 * copied the fields.
 */
struct smtl_tp_t
{
int tid; /* index of this worker inside the pool arrays */
int tbind; /* CPU number passed to thread_bind() */
struct smtl_t *sh; /* owning pool */
};
/*
 * Bind the calling thread to logical CPU `cpu`.
 *
 * Linux and friends: hard affinity through pthread_setaffinity_np(); failure
 * is fatal.
 * macOS: there is no hard affinity, so `cpu` is used as a Mach affinity tag.
 * KERN_NOT_SUPPORTED only produces a warning; any other failure is fatal.
 *
 * Fatal paths print to stderr and call exit(0), matching the rest of this file.
 */
static void thread_bind(int cpu)
{
#ifndef __APPLE__
    cpu_set_t cpu_set;
    CPU_ZERO(&cpu_set);
    CPU_SET(cpu, &cpu_set);
    if (pthread_setaffinity_np(pthread_self(),
        sizeof(cpu_set_t), &cpu_set) != 0)
    {
        fprintf(stderr, "Error: cpu[%d] bind failed.\n", cpu);
        exit(0);
    }
#else
    /*
     * thread_policy_set() reads the policy data through the pointer it is
     * given. The previous code passed the address of a pointer variable, so
     * the kernel saw address bits instead of `cpu` as the affinity tag.
     * Hand it the policy structure itself.
     */
    thread_affinity_policy_data_t policy = { cpu };
    kern_return_t res = thread_policy_set(pthread_mach_thread_np(pthread_self()),
        THREAD_AFFINITY_POLICY, (thread_policy_t)&policy,
        THREAD_AFFINITY_POLICY_COUNT);
    if (res == KERN_NOT_SUPPORTED) {
        fprintf(stderr, "Warning: cpu thread policy is not supported by OS\n");
        return;
    }
    if (res != KERN_SUCCESS) {
        fprintf(stderr, "Error: cpu[%d] bind failed, return code %i\n", cpu, res);
        exit(0);
    }
#endif
}
/*
 * Worker thread entry point.
 *
 * `params` is the heap-allocated struct smtl_tp_t built by smtl_init(); its
 * fields are copied to locals and the record is freed here.
 *
 * After binding itself to its CPU the worker loops forever:
 *   1. sleep on its own sl_cv until status[tid] becomes SMTL_WORK
 *      (or return when it becomes SMTL_FINI);
 *   2. detach its task list and run every task, freeing each node;
 *   3. under pt_mtx, mark itself SMTL_IDLE, decrement thread_holds and signal
 *      pt_cv if it was the last worker of the batch.
 *
 * Any pthread failure prints a message and terminates the process via exit(0).
 * Always returns NULL.
 */
static void *smtl_thread_func(void *params)
{
int err = 0;
struct smtl_tp_t *stp = (struct smtl_tp_t*)params;
int tid = stp->tid;
int tbind = stp->tbind;
struct smtl_t *sh = stp->sh;
/* the start-up record is not needed once its fields have been copied */
free(stp);
thread_bind(tbind);
pthread_mutex_t *sl_mtx = sh->sl_mtxs + tid;
pthread_cond_t *sl_cv = sh->sl_cvs + tid;
pthread_mutex_t *pt_mtx = &sh->pt_mtx;
pthread_cond_t *pt_cv = &sh->pt_cv;
while (1)
{
/* phase 1: wait (under this worker's own mutex) for WORK or FINI */
err = pthread_mutex_lock(sl_mtx);
if (err != 0)
{
fprintf(stderr, "ERROR: sl_mtx lock failed.\n");
exit(0);
}
/* re-check the predicate after every wake-up */
while (sh->status[tid] != SMTL_WORK)
{
if (sh->status[tid] == SMTL_FINI)
{
/* shutdown requested: release the mutex and leave the thread */
err = pthread_mutex_unlock(sl_mtx);
if (err != 0)
{
fprintf(stderr, "ERROR: sl_mtx unlock failed.\n");
exit(0);
}
return NULL;
}
err = pthread_cond_wait(sl_cv, sl_mtx);
if (err != 0)
{
fprintf(stderr, "ERROR: sl_cv wait failed.\n");
exit(0);
}
}
err = pthread_mutex_unlock(sl_mtx);
if (err != 0)
{
fprintf(stderr, "ERROR: sl_mtx unlock failed.\n");
exit(0);
}
/* phase 2: take ownership of the whole list, then run and free each node */
struct queue_node_t *p = sh->task_queues[tid];
struct queue_node_t *q = NULL;
sh->task_queues[tid] = NULL;
while (p != NULL)
{
/* remember the successor before the node is freed */
q = p->next;
p->task_func(p->params);
free(p);
p = q;
}
/* phase 3: report completion under the pool-wide mutex */
err = pthread_mutex_lock(pt_mtx);
if (err != 0)
{
fprintf(stderr, "ERROR: pt_mtx lock failed.\n");
exit(0);
}
sh->status[tid] = SMTL_IDLE;
sh->thread_holds--;
/* only the last worker of the batch wakes smtl_wait_tasks_finished() */
if (sh->thread_holds == 0)
{
err = pthread_cond_signal(pt_cv);
if (err != 0)
{
fprintf(stderr, "ERROR: pt_cv signal failed.\n");
exit(0);
}
}
err = pthread_mutex_unlock(pt_mtx);
if (err != 0)
{
fprintf(stderr, "ERROR: pt_mtx unlock failed.\n");
exit(0);
}
}
/* not reached: the loop above only exits through the SMTL_FINI return */
return NULL;
}
/*
 * Create a thread pool with one worker per entry of `set_of_threads`.
 *
 * psh            - receives the new handle.
 * set_of_threads - CPU numbers; worker i is bound to set_of_threads[i].
 *
 * The pool keeps its per-thread state in fixed arrays of SMTL_MAX_THREADS
 * entries, so a larger request is rejected up front instead of writing past
 * those arrays.
 *
 * Every failure (too many threads, allocation, pthread initialisation or
 * thread creation) prints a message to stderr and terminates the process
 * via exit(0), like the rest of this file.
 */
void smtl_init(smtl_handle *psh,
std::vector &set_of_threads)
{
    int err = 0;
    int num_threads = set_of_threads.size();
    if (num_threads > SMTL_MAX_THREADS)
    {
        fprintf(stderr,
            "ERROR: smtl_init supports at most %d threads, %d requested.\n",
            SMTL_MAX_THREADS, num_threads);
        exit(0);
    }
    struct smtl_t *sh =
        (struct smtl_t*)malloc(sizeof(struct smtl_t));
    if (sh == NULL)
    {
        fprintf(stderr,
            "ERROR: smtl_init allocation failed.\n");
        exit(0);
    }
    sh->num_threads = num_threads;
    sh->cur_qid = 0;
    sh->thread_holds = 0;
    /* only the queues that will actually be used need to start empty */
    memset(sh->task_queues, 0,
        num_threads * sizeof(struct queue_node_t*));
    err = pthread_mutex_init(&sh->pt_mtx, NULL);
    if (err != 0)
    {
        fprintf(stderr, "ERROR: pt_mtx init failed.\n");
        exit(0);
    }
    err = pthread_cond_init(&sh->pt_cv, NULL);
    if (err != 0)
    {
        fprintf(stderr, "ERROR: pt_cv init failed.\n");
        exit(0);
    }
    int i;
    for (i = 0; i < num_threads; i++)
    {
        /* per-worker sync objects and state must exist before the thread starts */
        err = pthread_mutex_init(sh->sl_mtxs + i, NULL);
        if (err != 0)
        {
            fprintf(stderr, "ERROR: sl_mtxs init failed.\n");
            exit(0);
        }
        err = pthread_cond_init(sh->sl_cvs + i, NULL);
        if (err != 0)
        {
            fprintf(stderr, "ERROR: sl_cvs init failed.\n");
            exit(0);
        }
        sh->status[i] = SMTL_IDLE;
        /* start-up record; ownership passes to the worker, which frees it */
        struct smtl_tp_t *stp =
            (struct smtl_tp_t*)malloc(sizeof(struct smtl_tp_t));
        if (stp == NULL)
        {
            fprintf(stderr, "ERROR: stp allocation failed.\n");
            exit(0);
        }
        stp->sh = sh;
        stp->tid = i;
        stp->tbind = set_of_threads[i];
        err = pthread_create(sh->tids + i, NULL,
            smtl_thread_func, stp);
        if (err != 0)
        {
            fprintf(stderr, "ERROR: pthread_create failed.\n");
            exit(0);
        }
    }
    *psh = sh;
}
/*
 * Shut the pool down and release everything it owns.
 *
 * Each worker is told to finish (SMTL_FINI plus a signal on its condvar),
 * all workers are joined, every mutex/condvar is destroyed, any task nodes
 * that were queued but never run are freed, and finally the pool object
 * itself (malloc'ed by smtl_init) is freed. The handle is invalid afterwards.
 *
 * Any pthread failure prints a message and terminates the process via exit(0).
 */
void smtl_fini(smtl_handle sh)
{
    int err = 0;
    int i;
    /* ask every worker to leave its loop */
    for (i = 0; i < sh->num_threads; i++)
    {
        err = pthread_mutex_lock(sh->sl_mtxs + i);
        if (err != 0)
        {
            fprintf(stderr, "ERROR: sl_mtxs lock failed.\n");
            exit(0);
        }
        sh->status[i] = SMTL_FINI;
        err = pthread_cond_signal(sh->sl_cvs + i);
        if (err != 0)
        {
            fprintf(stderr, "ERROR: sl_cv signal failed.\n");
            exit(0);
        }
        err = pthread_mutex_unlock(sh->sl_mtxs + i);
        if (err != 0)
        {
            fprintf(stderr, "ERROR: sl_mtxs unlock failed.\n");
            exit(0);
        }
    }
    /* wait until all of them are gone before tearing anything down */
    for (i = 0; i < sh->num_threads; i++)
    {
        err = pthread_join(sh->tids[i], NULL);
        if (err != 0)
        {
            fprintf(stderr, "ERROR: pthread_join failed.\n");
            exit(0);
        }
    }
    err = pthread_mutex_destroy(&sh->pt_mtx);
    if (err != 0)
    {
        fprintf(stderr, "ERROR: pt_mtx destroy failed.\n");
        exit(0);
    }
    err = pthread_cond_destroy(&sh->pt_cv);
    if (err != 0)
    {
        fprintf(stderr, "ERROR: pt_cv destroy failed.\n");
        exit(0);
    }
    for (i = 0; i < sh->num_threads; i++)
    {
        err = pthread_mutex_destroy(sh->sl_mtxs + i);
        if (err != 0)
        {
            fprintf(stderr, "ERROR: sl_mtxs destroy failed.\n");
            exit(0);
        }
        err = pthread_cond_destroy(sh->sl_cvs + i);
        if (err != 0)
        {
            fprintf(stderr, "ERROR: sl_cvs destroy failed.\n");
            exit(0);
        }
        /* drop tasks that were added but never started */
        struct queue_node_t *p = sh->task_queues[i], *q = NULL;
        while (p != NULL)
        {
            q = p->next;
            free(p);
            p = q;
        }
    }
    /* the pool object was malloc'ed in smtl_init(); nothing else releases it */
    free(sh);
}
/* Number of worker threads the pool was created with. */
int smtl_num_threads(smtl_handle sh)
{
    const int worker_count = sh->num_threads;
    return worker_count;
}
/*
 * Queue one task for the next smtl_begin_tasks() call.
 *
 * Tasks are dealt to the workers round-robin: the task goes to the queue
 * selected by cur_qid, which then advances and wraps at num_threads.
 * Within one queue the newest task is placed at the head.
 *
 * Allocation failure prints a message and terminates the process via exit(0).
 */
void smtl_add_task(smtl_handle sh,
                   task_func_t task_func,
                   void *params)
{
    struct queue_node_t *node =
        (struct queue_node_t*)malloc(sizeof(struct queue_node_t));
    if (node == NULL)
    {
        fprintf(stderr, "ERROR: add_task allocation failed.\n");
        exit(0);
    }

    const int qid = sh->cur_qid;

    /* push the new node in front of the selected queue */
    node->task_func = task_func;
    node->params = params;
    node->next = sh->task_queues[qid];
    sh->task_queues[qid] = node;

    /* advance the round-robin cursor */
    sh->cur_qid = (qid + 1 == sh->num_threads) ? 0 : qid + 1;
}
/*
 * Start the batch that was queued with smtl_add_task().
 *
 * thread_holds is armed with the worker count first, then every worker is
 * switched to SMTL_WORK and signalled while holding that worker's own mutex.
 * Does not wait for completion; see smtl_wait_tasks_finished().
 *
 * Any pthread failure prints a message and terminates the process via exit(0).
 */
void smtl_begin_tasks(smtl_handle sh)
{
    sh->thread_holds = sh->num_threads;

    for (int w = 0; w < sh->num_threads; w++)
    {
        pthread_mutex_t *mtx = sh->sl_mtxs + w;
        pthread_cond_t *cv = sh->sl_cvs + w;

        if (pthread_mutex_lock(mtx) != 0)
        {
            fprintf(stderr, "ERROR: sl_mtxs lock failed.\n");
            exit(0);
        }

        sh->status[w] = SMTL_WORK;

        if (pthread_cond_signal(cv) != 0)
        {
            fprintf(stderr, "ERROR: sl_cvs signal failed.\n");
            exit(0);
        }
        if (pthread_mutex_unlock(mtx) != 0)
        {
            fprintf(stderr, "ERROR: sl_mtxs unlock failed.\n");
            exit(0);
        }
    }
}
/*
 * Block until every worker has finished the batch started by
 * smtl_begin_tasks(), i.e. until thread_holds drops to zero, then reset the
 * round-robin cursor so the next batch starts at queue 0 again.
 *
 * Any pthread failure prints a message and terminates the process via exit(0).
 * (The return codes were previously discarded, so these checks never fired.)
 */
void smtl_wait_tasks_finished(smtl_handle sh)
{
    int err = 0;
    err = pthread_mutex_lock(&sh->pt_mtx);
    if (err != 0)
    {
        fprintf(stderr, "ERROR: pt_mtx lock failed.\n");
        exit(0);
    }
    /* loop on the predicate: condition variables may wake up spuriously */
    while (sh->thread_holds > 0)
    {
        err = pthread_cond_wait(&sh->pt_cv, &sh->pt_mtx);
        if (err != 0)
        {
            fprintf(stderr, "ERROR: pt_cv wait failed.\n");
            exit(0);
        }
    }
    sh->cur_qid = 0;
    err = pthread_mutex_unlock(&sh->pt_mtx);
    if (err != 0)
    {
        fprintf(stderr, "ERROR: pt_mtx unlock failed.\n");
        exit(0);
    }
}
================================================
FILE: common/smtl.hpp
================================================
#ifndef _SMTL_H
#define _SMTL_H
#include
/* Opaque handle to a thread pool; the structure is private to smtl.cpp. */
typedef struct smtl_t* smtl_handle;
/* Signature of a task: receives the `params` pointer given to smtl_add_task(). */
typedef void (*task_func_t)(void*);
/*
 * Create a pool with one worker per entry of set_of_threads; worker i is
 * bound to CPU set_of_threads[i]. The new handle is stored in *psh.
 */
void smtl_init(smtl_handle *psh,
std::vector &set_of_threads);
/*
 * Stop and join all workers and destroy the pool's synchronisation objects.
 * The handle must not be used afterwards.
 */
void smtl_fini(smtl_handle sh);
/* Number of workers in the pool. */
int smtl_num_threads(smtl_handle sh);
/*
 * Queue task_func(params) for the next batch. Tasks are distributed over the
 * workers round-robin; nothing runs until smtl_begin_tasks() is called.
 */
void smtl_add_task(smtl_handle sh,
task_func_t task_func,
void *params);
/* Wake all workers so they run their queued tasks; returns immediately. */
void smtl_begin_tasks(smtl_handle sh);
/* Block until every worker has finished the current batch. */
void smtl_wait_tasks_finished(smtl_handle sh);
#endif
================================================
FILE: common/table.cpp
================================================
#include "table.hpp"
#include
using namespace std;
// A new table has no columns until setColumnNum() is called.
Table::Table()
    : col(0)
{
}
// Nothing to release explicitly: the body is empty and the members clean up
// after themselves.
Table::~Table()
{
}
// Set the number of columns and reset every column width to the minimum of 2
// (one blank of padding on each side of an empty cell).
void Table::setColumnNum(int col)
{
    this->col = col;
    colWidths.assign(col, 2);
}
// Append one data row and widen the columns so that every cell of the row
// fits with one blank of padding on each side.
//
// Only the first `col` entries of `item` are laid out. A row with fewer than
// `col` entries is padded with empty cells; previously such a row was read
// out of bounds here and again in print().
void Table::addOneItem(std::vector &item)
{
    contents.push_back(item);
    pSep.push_back(0);

    // Work on the stored copy so the padding never touches the caller's data.
    if ((int)contents.back().size() < col)
    {
        contents.back().resize(col);
    }

    for (int i = 0; i < col; i++)
    {
        int length = contents.back()[i].size() + 2;
        if (length > colWidths[i])
        {
            colWidths[i] = length;
        }
    }
}
void Table::addSeparator()
{
std::vector dummy;
contents.push_back(dummy);
pSep.push_back(1);
}
void Table::print()
{
int i, j, k;
int tableWidth = col + 1;
for (i = 0; i < col; i++)
{
tableWidth += colWidths[i];
}
string vLine(tableWidth, '-');
cout << vLine << endl;
for (i = 0; i < contents.size(); i++)
{
string oneLine("|");
for (j = 0; j < col; j++)
{
if (pSep[i] == 0)
{
oneLine += (" " + contents[i][j]);
for (k = 1 + contents[i][j].size();
k < colWidths[j]; k++)
{
oneLine += " ";
}
}
else if (pSep[i] == 1)
{
string curCol(colWidths[j], '-');
oneLine += curCol;
}
oneLine += "|";
}
cout << oneLine << endl;
}
cout << vLine << endl;
}
================================================
FILE: common/table.hpp
================================================
#ifndef _TABLE_HPP
#define _TABLE_HPP
#include
#include
// Minimal fixed-column text table printed to stdout.
// Typical use: setColumnNum(), then addOneItem()/addSeparator() for each
// row, then print().
class Table
{
public:
Table();
~Table();
// Not copyable.
Table(const Table &) = delete;
Table &operator=(const Table &) = delete;
// Set the number of columns and reset all column widths to the minimum.
void setColumnNum(int col);
// Append a data row; the first `col` entries of item are the cells.
void addOneItem(std::vector &item);
// Append a horizontal separator row.
void addSeparator();
// Print the whole table, framed by dashed lines, to stdout.
void print();
private:
int col; // number of columns
std::vector colWidths; // width of each column: longest cell + 2, at least 2
std::vector pSep; // one flag per row: 0 = data row, 1 = separator row
std::vector > contents; // one entry per row (left empty for separator rows)
};
#endif
================================================
FILE: e2k/asm.S
================================================
// CLEAR is the instruction the register-clearing loop of impl_bench uses, with
// both source operands set to 0. Instruction sets below v5 use addd; v5 and
// later use qppackdl.
// NOTE(review): presumably qppackdl is chosen so the full quad-width register
// is cleared on v5+ - confirm against the e2k ISA manual.
#if __iset__ < 5
# define CLEAR addd
#else
# define CLEAR qppackdl
#endif
// impl_bench name, op
//
// Emits a global function `name` whose hot loop issues the three-source
// instruction `op` on several ALU channels in every wide instruction.
// On the C++ side these functions are declared as
//   void name(int64_t loop_time, void *params);
// Only the first argument (%r0) is read by this code.
//
// Structure:
//   prologue - sets up the register window and rotating base registers,
//              programs %lsr for the clearing loop and prepares the branch
//              targets (ctpr1 -> label 0, ctpr2 -> label 1, ctpr3 -> return);
//   label 0  - loop that clears rotating registers with CLEAR;
//   label 1  - the measured loop;
//   epilogue - returns through ctpr3.
.macro impl_bench name, op
.global \name
.type \name, #function
.align 8
\name:
{
setwd wsz=34, nfx=0
setbn rsz=29, rbs=4, rcur=0
// NOTE(review): 15 looks like the iteration count for the clearing loop at
// label 0 and bit 37 like a loop-control flag of %lsr - confirm.
rwd,0 (1ULL << 37) | 15, %lsr
disp %ctpr1, 0f
}
{
// Extracts a bit-field of the first argument into %g16.
// NOTE(review): (32 << 6) presumably selects the low 32 bits, which would
// match the "do not use values greater than UINT32_MAX" note next to
// loop_time in cpufp.cpp - confirm.
getfd,0 %r0, (32 << 6), %g16
disp %ctpr2, 1f
}
{
// Same bit 37 as above is or-ed into the loop count that is later written
// to %lsr for the measured loop.
ord,0 %g16, (1ULL << 37), %g16
return %ctpr3
nop 2
}
// Clearing loop: each iteration clears four rotating registers and advances
// the rotating base (abn), so successive iterations hit different registers.
0:
{
loop_mode
alc alcf=1, alct=1
abn abnf=1, abnt=1
CLEAR,0 0, 0, %b[0]
CLEAR,1 0, 0, %b[1]
CLEAR,3 0, 0, %b[30]
CLEAR,4 0, 0, %b[31]
ct %ctpr1 ? #NOT_LOOP_END
}
{
// program the loop counter for the measured loop
rwd,0 %g16, %lsr
nop 3 // NOTE: low delay may lead to undefined behaviour
}
// NOTE: `{,p,qp}fmul_add{s,d}` has latency 8 cycles thus we need at least
// 8 registers for each channel.
//
// Example for ALC0:
// I: read (b0) write (b20)
// 0: r8 r28 # use r28 as dst
// 1: r10 r30
// 2: r12 r32
// 3: r16 r34
// 4: r18 r36 # r28 is ready if fmuld (4)
// 5: r20 r38 # r28 is ready if fmad (5)
// 6: r22 r40
// 7: r24 r42
// 8: r26 r44 # r28 is ready if fmul_addd (8)
// 9: r28 r46 # read from r28 (+1 just in case)
// Measured loop: one wide instruction per iteration. It carries four `op`
// instructions (channels 0, 1, 3, 4) and, for __iset__ >= 4, two more on
// channels 2 and 5, i.e. 4 or 6 `op` per iteration.
1:
{
loop_mode
alc alcf=1, alct=1
abn abnf=1, abnt=1
\op,0 %b[0], %b[0], %b[0], %b[20]
\op,1 %b[20], %b[20], %b[20], %b[40]
\op,3 %b[1], %b[1], %b[1], %b[21]
\op,4 %b[21], %b[21], %b[21], %b[41]
#if __iset__ >= 4
// NOTE: v1-v3 does not support fops in ALC2/ALC5
\op,2 %b[40], %b[40], %b[40], %b[0]
\op,5 %b[41], %b[41], %b[41], %b[1]
#endif
ct %ctpr2 ? #NOT_LOOP_END
}
{
// return to the caller (target prepared in the prologue)
ct %ctpr3
}
.size \name, . - \name
.endm
// Instantiate one benchmark function per instruction. The quad-packed
// variants are only assembled when the target instruction set provides them
// (v5 for qpfmul_add*, v6 for qpfma*); the last two are always built.
.text
#if __iset__ >= 5
impl_bench bench_qpfmul_adds, qpfmul_adds
impl_bench bench_qpfmul_addd, qpfmul_addd
#endif
#if __iset__ >= 6
impl_bench bench_qpfmas, qpfmas
impl_bench bench_qpfmad, qpfmad
#endif
impl_bench bench_pfmul_adds, pfmul_adds
impl_bench bench_fmul_addd, fmul_addd
================================================
FILE: e2k/cpufp.cpp
================================================
#include "table.hpp"
#include "smtl.hpp"
#include
#include
#include
#include
#include
#include
#include
#include
using namespace std;
// Benchmark kernels implemented in e2k/asm.S (one per impl_bench
// instantiation). The first argument is the loop count; the second is an
// opaque parameter pointer. The same __iset__ guards are used here and in the
// assembly so that only existing symbols are declared.
extern "C"
{
#if __iset__ >= 6
void bench_qpfmas(int64_t, void *params);
void bench_qpfmad(int64_t, void *params);
#endif
#if __iset__ >= 5
void bench_qpfmul_adds(int64_t, void *params);
void bench_qpfmul_addd(int64_t, void *params);
#endif
void bench_pfmul_adds(int64_t, void *params);
void bench_fmul_addd(int64_t, void *params);
}
// Description of one registered benchmark.
typedef struct
{
std::string isa; // label for the "Instruction Set" column
std::string type; // label for the "Core Computation" column
std::string dim; // unit suffix appended after the G/T prefix, e.g. "FLOPS"
int64_t loop_time; // loop count passed to the kernel
int64_t comp_pl; // operations credited per loop iteration (per thread)
void *params; // opaque pointer passed to the kernel, may be NULL
void (*bench)(int64_t, void*); // the kernel itself
} cpubm_t;
// All registered benchmarks, in the order they will be run and printed.
static vector bm_list;
// Elapsed time from *start to *end, in seconds.
static double get_time(struct timespec *start,
                       struct timespec *end)
{
    // Difference the whole seconds and the nanosecond parts separately; the
    // nanosecond difference may be negative and is simply added in.
    const time_t whole = end->tv_sec - start->tv_sec;
    const long frac = end->tv_nsec - start->tv_nsec;
    return whole + frac * 1e-9;
}
static void reg_new_isa(std::string isa,
std::string type,
std::string dim,
int64_t loop_time,
int64_t comp_pl,
void *params,
void (*bench)(int64_t, void*))
{
cpubm_t new_one;
new_one.isa = isa;
new_one.type = type;
new_one.dim = dim;
new_one.loop_time = loop_time;
new_one.comp_pl = comp_pl;
new_one.params = params;
new_one.bench = bench;
bm_list.push_back(new_one);
}
static void thread_func(void *params)
{
cpubm_t *bm = (cpubm_t*)params;
if (bm->params)
{
bm->bench(bm->loop_time, bm->params);
}
else
{
bm->bench(bm->loop_time, NULL);
}
}
// Run one benchmark on every thread of the pool and add its result row to
// `table`.
//
// The kernel is run twice on all threads: an untimed warm-up round and a
// timed round. Throughput is loop_time * comp_pl * num_threads divided by the
// wall-clock time of the timed round, printed with 5 significant digits and a
// 'G' (1e9) or 'T' (1e12) prefix in front of item.dim.
static void cpubm_e2k_one(smtl_handle sh,
cpubm_t &item,
Table &table)
{
struct timespec start, end;
double time_used, perf;
char perfUnit = 'G';
int i;
int num_threads = smtl_num_threads(sh);
// warm up
for (i = 0; i < num_threads; i++)
{
smtl_add_task(sh, thread_func, (void*)&item);
}
smtl_begin_tasks(sh);
smtl_wait_tasks_finished(sh);
// timed round: the interval covers queuing, start and completion of all threads
clock_gettime(CLOCK_MONOTONIC_RAW, &start);
for (i = 0; i < num_threads; i++)
{
smtl_add_task(sh, thread_func, (void*)&item);
}
smtl_begin_tasks(sh);
smtl_wait_tasks_finished(sh);
clock_gettime(CLOCK_MONOTONIC_RAW, &end);
time_used = get_time(&start, &end);
// operations per second, summed over all threads
perf = item.loop_time * item.comp_pl * num_threads /
time_used;
// switch to the tera prefix above 1e12, otherwise report giga
if (perf > 1e12)
{
perfUnit = 'T';
perf /= 1e12;
}
else
{
perf /= 1e9;
}
stringstream ss;
ss << std::setprecision(5) << perf << " " << perfUnit << item.dim;
// row layout matches the three-column header built in cpubm_do_bench()
vector cont;
cont.resize(3);
cont[0] = item.isa;
cont[1] = item.type;
cont[2] = ss.str();
table.addOneItem(cont);
}
// Run every registered benchmark on the given CPUs and print the result table.
//
// set_of_threads - CPU numbers; one pool thread is created per entry.
// idle_time      - seconds to sleep between two consecutive benchmarks
//                  (there is no sleep before the first one).
//
// Does nothing when no benchmark has been registered.
static void cpubm_do_bench(std::vector &set_of_threads,
uint32_t idle_time)
{
int i;
if (bm_list.size() > 0)
{
int num_threads = set_of_threads.size();
printf("Number Threads: %d\n", num_threads);
printf("Thread Pool Binding:");
for (i = 0; i < num_threads; i++)
{
printf(" %d", set_of_threads[i]);
}
printf("\n");
// set table head
vector ti;
ti.resize(3);
ti[0] = "Instruction Set";
ti[1] = "Core Computation";
ti[2] = "Peak Performance";
Table table;
table.setColumnNum(3);
table.addOneItem(ti);
// set thread pool
smtl_handle sh;
smtl_init(&sh, set_of_threads);
// traverse task list
// the first benchmark runs immediately; later ones are preceded by the idle pause
cpubm_e2k_one(sh, bm_list[0], table);
for (i = 1; i < bm_list.size(); i++)
{
sleep(idle_time);
cpubm_e2k_one(sh, bm_list[i], table);
}
table.print();
smtl_fini(sh);
}
}
// Parse a CPU list such as "[0,3,5-8,13-15]" and append the CPU numbers to
// set_of_threads.
//
// Grammar handled: '[' item (',' item)* ']' where item is N or N-M
// (inclusive range). Characters other than digits, ',' and '-' are skipped.
//
// Behaviour worth knowing:
//  - if the string does not start with '[', nothing is appended;
//  - if the closing ']' is missing, the items completed by a ',' have already
//    been appended, but the last item is dropped;
//  - an empty item contributes 0 (so "[]" yields the single CPU 0);
//  - a range whose right end is smaller than its left end contributes nothing.
static void parse_thread_pool(char *sets,
vector &set_of_threads)
{
if (sets[0] != '[')
{
return;
}
int pos = 1;
int left = 0, right = 0;
// state 0: reading a single number / the left end of a range
// state 1: reading the right end of a range (after '-')
int state = 0;
while (sets[pos] != ']' && sets[pos] != '\0')
{
if (state == 0)
{
if (sets[pos] >= '0' && sets[pos] <= '9')
{
left *= 10;
left += (int)(sets[pos] - '0');
}
else if (sets[pos] == ',')
{
// single number finished
set_of_threads.push_back(left);
left = 0;
}
else if (sets[pos] == '-')
{
right = 0;
state = 1;
}
}
else if (state == 1)
{
if (sets[pos] >= '0' && sets[pos] <= '9')
{
right *= 10;
right += (int)(sets[pos] - '0');
}
else if (sets[pos] == ',')
{
// range finished: expand left..right inclusive
int i;
for (i = left; i <= right; i++)
{
set_of_threads.push_back(i);
}
left = 0;
state = 0;
}
}
pos++;
}
// the loop stops on ']' or on the terminator; only ']' is well-formed
if (sets[pos] != ']')
{
return;
}
// flush the last item, which is terminated by ']' instead of ','
if (state == 0)
{
set_of_threads.push_back(left);
}
else if (state == 1)
{
int i;
for (i = left; i <= right; i++)
{
set_of_threads.push_back(i);
}
}
}
// Register the benchmarks available for the instruction set the program was
// compiled for (__iset__). Registration order is the order of execution and
// of the rows in the result table.
//
// The fifth argument (comp_pl) is the number of operations credited per loop
// iteration. The scalar/packed kernels are credited more on v4+ than on
// v1-v3, which matches asm.S issuing two extra instructions per iteration
// when __iset__ >= 4.
// NOTE(review): the individual values presumably equal
// (instructions per iteration) x (lanes) x 2 - confirm against asm.S.
static void cpufp_register_isa()
{
// NOTE: do not use values greater than UINT32_MAX
const uint32_t loop_time = 0x20000000;
#if __iset__ >= 6
reg_new_isa("v6", "FMA(f32,f32,f32)", "FLOPS",
loop_time, 48LL, NULL, bench_qpfmas);
reg_new_isa("v6", "FMA(f64,f64,f64)", "FLOPS",
loop_time, 24LL, NULL, bench_qpfmad);
#endif
#if __iset__ >= 5
reg_new_isa("v5", "ADD(f32,MUL(f32,f32))", "FLOPS",
loop_time, 48LL, NULL, bench_qpfmul_adds);
reg_new_isa("v5", "ADD(f64,MUL(f64,f64))", "FLOPS",
loop_time, 24LL, NULL, bench_qpfmul_addd);
#endif
#if __iset__ >= 4
reg_new_isa("v4", "ADD(f32,MUL(f32,f32))", "FLOPS",
loop_time, 24LL, NULL, bench_pfmul_adds);
reg_new_isa("v4", "ADD(f64,MUL(f64,f64))", "FLOPS",
loop_time, 12LL, NULL, bench_fmul_addd);
#else
reg_new_isa("v1", "ADD(f32,MUL(f32,f32))", "FLOPS",
loop_time, 16LL, NULL, bench_pfmul_adds);
reg_new_isa("v1", "ADD(f64,MUL(f64,f64))", "FLOPS",
loop_time, 8LL, NULL, bench_fmul_addd);
#endif
}
// Entry point.
//
// Recognised arguments:
//   --thread_pool=[list]  CPUs to benchmark (required), see parse_thread_pool()
//   --idle_time=N         seconds to pause between benchmarks (default 0)
// Unknown arguments are ignored. Without --thread_pool the usage text is
// printed to stderr and the program exits with status 0.
int main(int argc, char *argv[])
{
vector set_of_threads;
uint32_t idle_time = 0;
bool params_enough = false;
int i;
for (i = 1; i < argc; i++)
{
if (strncmp(argv[i], "--thread_pool=", 14) == 0)
{
// 14 == strlen("--thread_pool="): parse what follows the '='
parse_thread_pool(argv[i] + 14, set_of_threads);
// set as soon as the option is present, whatever the parser produced
params_enough = true;
}
else if (strncmp(argv[i], "--idle_time=", 12) == 0)
{
// 12 == strlen("--idle_time=")
idle_time = (uint32_t)atoi(argv[i] + 12);
}
}
if (!params_enough)
{
fprintf(stderr, "Error: You must set --thread_pool parameter.\n");
fprintf(stderr, "You may also set --idle_time parameter.\n");
fprintf(stderr, "Usage: %s --thread_pool=[xxx] --idle_time=yyy\n", argv[0]);
fprintf(stderr, "[xxx] indicates all cores to benchmark.\n");
fprintf(stderr, "Example: [0,3,5-8,13-15].\n");
fprintf(stderr, "idle_time is the interval time(s) between every two benchmarks.\n");
fprintf(stderr, "idle_time parameter can be ignored, the default value is 0s.\n");
fprintf(stderr, "Notice: there must NOT be any spaces.\n");
exit(0);
}
cpufp_register_isa();
cpubm_do_bench(set_of_threads, idle_time);
return 0;
}
================================================
FILE: loongarch64/asm/_FP_DP_.S
================================================
/*
 * void fp64_fmadd_f64f64f64(int64_t loops)
 *
 * Scalar double-precision fused multiply-add throughput kernel.
 * $a0 holds the loop count. $f0-$f16 are first loaded from $r0 (the zero
 * register). Each iteration then issues 16 fmadd.d instructions, every one
 * accumulating into its own register ($f0-$f15) with $f16 as both
 * multiplicands, so the 16 dependency chains are independent of each other.
 *
 * The counter is decremented before it is tested, so the loop body always
 * runs at least once and `loops` is expected to be positive.
 */
.globl fp64_fmadd_f64f64f64
fp64_fmadd_f64f64f64:
movgr2fr.d $f0, $r0
movgr2fr.d $f1, $r0
movgr2fr.d $f2, $r0
movgr2fr.d $f3, $r0
movgr2fr.d $f4, $r0
movgr2fr.d $f5, $r0
movgr2fr.d $f6, $r0
movgr2fr.d $f7, $r0
movgr2fr.d $f8, $r0
movgr2fr.d $f9, $r0
movgr2fr.d $f10, $r0
movgr2fr.d $f11, $r0
movgr2fr.d $f12, $r0
movgr2fr.d $f13, $r0
movgr2fr.d $f14, $r0
movgr2fr.d $f15, $r0
movgr2fr.d $f16, $r0
/* measured loop: 16 independent fmadd.d per iteration */
.fp64.fmadd.f64f64f64:
fmadd.d $f0, $f16, $f16, $f0
fmadd.d $f1, $f16, $f16, $f1
fmadd.d $f2, $f16, $f16, $f2
fmadd.d $f3, $f16, $f16, $f3
fmadd.d $f4, $f16, $f16, $f4
fmadd.d $f5, $f16, $f16, $f5
fmadd.d $f6, $f16, $f16, $f6
fmadd.d $f7, $f16, $f16, $f7
fmadd.d $f8, $f16, $f16, $f8
fmadd.d $f9, $f16, $f16, $f9
fmadd.d $f10, $f16, $f16, $f10
fmadd.d $f11, $f16, $f16, $f11
fmadd.d $f12, $f16, $f16, $f12
fmadd.d $f13, $f16, $f16, $f13
fmadd.d $f14, $f16, $f16, $f14
fmadd.d $f15, $f16, $f16, $f15
addi.d $a0, $a0, -1
bne $a0, $r0, .fp64.fmadd.f64f64f64
/* return to the caller through the return-address register $r1 */
jr $r1
================================================
FILE: loongarch64/asm/_FP_SP_.S
================================================
/*
 * void fp32_fmadd_f32f32f32(int64_t loops)
 *
 * Scalar single-precision fused multiply-add throughput kernel.
 * $a0 holds the loop count. $f0-$f16 are first loaded from $r0 (the zero
 * register). Each iteration then issues 16 fmadd.s instructions, every one
 * accumulating into its own register ($f0-$f15) with $f16 as both
 * multiplicands, so the 16 dependency chains are independent of each other.
 *
 * The counter is decremented before it is tested, so the loop body always
 * runs at least once and `loops` is expected to be positive.
 */
.globl fp32_fmadd_f32f32f32
fp32_fmadd_f32f32f32:
movgr2fr.w $f0, $r0
movgr2fr.w $f1, $r0
movgr2fr.w $f2, $r0
movgr2fr.w $f3, $r0
movgr2fr.w $f4, $r0
movgr2fr.w $f5, $r0
movgr2fr.w $f6, $r0
movgr2fr.w $f7, $r0
movgr2fr.w $f8, $r0
movgr2fr.w $f9, $r0
movgr2fr.w $f10, $r0
movgr2fr.w $f11, $r0
movgr2fr.w $f12, $r0
movgr2fr.w $f13, $r0
movgr2fr.w $f14, $r0
movgr2fr.w $f15, $r0
movgr2fr.w $f16, $r0
/* measured loop: 16 independent fmadd.s per iteration */
.fp32.fmadd.f32f32f32:
fmadd.s $f0, $f16, $f16, $f0
fmadd.s $f1, $f16, $f16, $f1
fmadd.s $f2, $f16, $f16, $f2
fmadd.s $f3, $f16, $f16, $f3
fmadd.s $f4, $f16, $f16, $f4
fmadd.s $f5, $f16, $f16, $f5
fmadd.s $f6, $f16, $f16, $f6
fmadd.s $f7, $f16, $f16, $f7
fmadd.s $f8, $f16, $f16, $f8
fmadd.s $f9, $f16, $f16, $f9
fmadd.s $f10, $f16, $f16, $f10
fmadd.s $f11, $f16, $f16, $f11
fmadd.s $f12, $f16, $f16, $f12
fmadd.s $f13, $f16, $f16, $f13
fmadd.s $f14, $f16, $f16, $f14
fmadd.s $f15, $f16, $f16, $f15
addi.d $a0, $a0, -1
bne $a0, $r0, .fp32.fmadd.f32f32f32
/* return to the caller through the return-address register $r1 */
jr $r1
================================================
FILE: loongarch64/asm/_LASX_.S
================================================
/* Entry points exported by this file (LASX kernels, $xr registers). */
.globl lasx_fp32_fmadd_f32f32f32
.globl lasx_fp64_fmadd_f64f64f64
.globl lasx_fp32_add_mul_f32f32_f32
.globl lasx_fp64_add_mul_f64f64_f64
/*
 * void lasx_fp32_fmadd_f32f32f32(int64_t loops)
 *
 * LASX single-precision fused multiply-add throughput kernel.
 * $a0 holds the loop count. $xr0-$xr16 are zeroed by xor-ing each register
 * with itself. Each iteration issues 16 xvfmadd.s instructions, every one
 * accumulating into its own register ($xr0-$xr15) with $xr16 as both
 * multiplicands, so the 16 dependency chains are independent of each other.
 *
 * The counter is decremented before it is tested, so the loop body always
 * runs at least once and `loops` is expected to be positive.
 */
lasx_fp32_fmadd_f32f32f32:
xvxor.v $xr0, $xr0, $xr0
xvxor.v $xr1, $xr1, $xr1
xvxor.v $xr2, $xr2, $xr2
xvxor.v $xr3, $xr3, $xr3
xvxor.v $xr4, $xr4, $xr4
xvxor.v $xr5, $xr5, $xr5
xvxor.v $xr6, $xr6, $xr6
xvxor.v $xr7, $xr7, $xr7
xvxor.v $xr8, $xr8, $xr8
xvxor.v $xr9, $xr9, $xr9
xvxor.v $xr10, $xr10, $xr10
xvxor.v $xr11, $xr11, $xr11
xvxor.v $xr12, $xr12, $xr12
xvxor.v $xr13, $xr13, $xr13
xvxor.v $xr14, $xr14, $xr14
xvxor.v $xr15, $xr15, $xr15
xvxor.v $xr16, $xr16, $xr16
/* measured loop: 16 independent xvfmadd.s per iteration */
.lasx.fp32.fmadd.f32f32f32:
xvfmadd.s $xr0, $xr16, $xr16, $xr0
xvfmadd.s $xr1, $xr16, $xr16, $xr1
xvfmadd.s $xr2, $xr16, $xr16, $xr2
xvfmadd.s $xr3, $xr16, $xr16, $xr3
xvfmadd.s $xr4, $xr16, $xr16, $xr4
xvfmadd.s $xr5, $xr16, $xr16, $xr5
xvfmadd.s $xr6, $xr16, $xr16, $xr6
xvfmadd.s $xr7, $xr16, $xr16, $xr7
xvfmadd.s $xr8, $xr16, $xr16, $xr8
xvfmadd.s $xr9, $xr16, $xr16, $xr9
xvfmadd.s $xr10, $xr16, $xr16, $xr10
xvfmadd.s $xr11, $xr16, $xr16, $xr11
xvfmadd.s $xr12, $xr16, $xr16, $xr12
xvfmadd.s $xr13, $xr16, $xr16, $xr13
xvfmadd.s $xr14, $xr16, $xr16, $xr14
xvfmadd.s $xr15, $xr16, $xr16, $xr15
addi.d $a0, $a0, -1
bne $a0, $r0, .lasx.fp32.fmadd.f32f32f32
/* return to the caller through the return-address register $r1 */
jr $r1
/*
 * void lasx_fp64_fmadd_f64f64f64(int64_t loops)
 *
 * LASX double-precision fused multiply-add throughput kernel.
 * $a0 holds the loop count. $xr0-$xr16 are zeroed by xor-ing each register
 * with itself. Each iteration issues 16 xvfmadd.d instructions, every one
 * accumulating into its own register ($xr0-$xr15) with $xr16 as both
 * multiplicands, so the 16 dependency chains are independent of each other.
 *
 * The counter is decremented before it is tested, so the loop body always
 * runs at least once and `loops` is expected to be positive.
 */
lasx_fp64_fmadd_f64f64f64:
xvxor.v $xr0, $xr0, $xr0
xvxor.v $xr1, $xr1, $xr1
xvxor.v $xr2, $xr2, $xr2
xvxor.v $xr3, $xr3, $xr3
xvxor.v $xr4, $xr4, $xr4
xvxor.v $xr5, $xr5, $xr5
xvxor.v $xr6, $xr6, $xr6
xvxor.v $xr7, $xr7, $xr7
xvxor.v $xr8, $xr8, $xr8
xvxor.v $xr9, $xr9, $xr9
xvxor.v $xr10, $xr10, $xr10
xvxor.v $xr11, $xr11, $xr11
xvxor.v $xr12, $xr12, $xr12
xvxor.v $xr13, $xr13, $xr13
xvxor.v $xr14, $xr14, $xr14
xvxor.v $xr15, $xr15, $xr15
xvxor.v $xr16, $xr16, $xr16
/* measured loop: 16 independent xvfmadd.d per iteration */
.lasx.fp64.fmadd.f64f64f64:
xvfmadd.d $xr0, $xr16, $xr16, $xr0
xvfmadd.d $xr1, $xr16, $xr16, $xr1
xvfmadd.d $xr2, $xr16, $xr16, $xr2
xvfmadd.d $xr3, $xr16, $xr16, $xr3
xvfmadd.d $xr4, $xr16, $xr16, $xr4
xvfmadd.d $xr5, $xr16, $xr16, $xr5
xvfmadd.d $xr6, $xr16, $xr16, $xr6
xvfmadd.d $xr7, $xr16, $xr16, $xr7
xvfmadd.d $xr8, $xr16, $xr16, $xr8
xvfmadd.d $xr9, $xr16, $xr16, $xr9
xvfmadd.d $xr10, $xr16, $xr16, $xr10
xvfmadd.d $xr11, $xr16, $xr16, $xr11
xvfmadd.d $xr12, $xr16, $xr16, $xr12
xvfmadd.d $xr13, $xr16, $xr16, $xr13
xvfmadd.d $xr14, $xr16, $xr16, $xr14
xvfmadd.d $xr15, $xr16, $xr16, $xr15
addi.d $a0, $a0, -1
bne $a0, $r0, .lasx.fp64.fmadd.f64f64f64
/* return to the caller through the return-address register $r1 */
jr $r1
/*
 * void lasx_fp32_add_mul_f32f32_f32(int64_t loops)
 *
 * LASX single-precision throughput kernel for separate multiply and add.
 * $a0 holds the loop count. $xr0-$xr24 are zeroed by xor-ing each register
 * with itself. Each iteration issues 12 xvfmul.s and 12 xvfadd.s
 * instructions, alternating; all of them read only $xr24 and each writes a
 * different destination ($xr0-$xr23), so no instruction depends on another.
 *
 * The counter is decremented before it is tested, so the loop body always
 * runs at least once and `loops` is expected to be positive.
 */
lasx_fp32_add_mul_f32f32_f32:
xvxor.v $xr0, $xr0, $xr0
xvxor.v $xr1, $xr1, $xr1
xvxor.v $xr2, $xr2, $xr2
xvxor.v $xr3, $xr3, $xr3
xvxor.v $xr4, $xr4, $xr4
xvxor.v $xr5, $xr5, $xr5
xvxor.v $xr6, $xr6, $xr6
xvxor.v $xr7, $xr7, $xr7
xvxor.v $xr8, $xr8, $xr8
xvxor.v $xr9, $xr9, $xr9
xvxor.v $xr10, $xr10, $xr10
xvxor.v $xr11, $xr11, $xr11
xvxor.v $xr12, $xr12, $xr12
xvxor.v $xr13, $xr13, $xr13
xvxor.v $xr14, $xr14, $xr14
xvxor.v $xr15, $xr15, $xr15
xvxor.v $xr16, $xr16, $xr16
xvxor.v $xr17, $xr17, $xr17
xvxor.v $xr18, $xr18, $xr18
xvxor.v $xr19, $xr19, $xr19
xvxor.v $xr20, $xr20, $xr20
xvxor.v $xr21, $xr21, $xr21
xvxor.v $xr22, $xr22, $xr22
xvxor.v $xr23, $xr23, $xr23
xvxor.v $xr24, $xr24, $xr24
/* measured loop: 12 xvfmul.s interleaved with 12 xvfadd.s per iteration */
.lasx.fp32.add.mul.f32f32.f32:
xvfmul.s $xr0, $xr24, $xr24
xvfadd.s $xr1, $xr24, $xr24
xvfmul.s $xr2, $xr24, $xr24
xvfadd.s $xr3, $xr24, $xr24
xvfmul.s $xr4, $xr24, $xr24
xvfadd.s $xr5, $xr24, $xr24
xvfmul.s $xr6, $xr24, $xr24
xvfadd.s $xr7, $xr24, $xr24
xvfmul.s $xr8, $xr24, $xr24
xvfadd.s $xr9, $xr24, $xr24
xvfmul.s $xr10, $xr24, $xr24
xvfadd.s $xr11, $xr24, $xr24
xvfmul.s $xr12, $xr24, $xr24
xvfadd.s $xr13, $xr24, $xr24
xvfmul.s $xr14, $xr24, $xr24
xvfadd.s $xr15, $xr24, $xr24
xvfmul.s $xr16, $xr24, $xr24
xvfadd.s $xr17, $xr24, $xr24
xvfmul.s $xr18, $xr24, $xr24
xvfadd.s $xr19, $xr24, $xr24
xvfmul.s $xr20, $xr24, $xr24
xvfadd.s $xr21, $xr24, $xr24
xvfmul.s $xr22, $xr24, $xr24
xvfadd.s $xr23, $xr24, $xr24
addi.d $a0, $a0, -1
bne $a0, $r0, .lasx.fp32.add.mul.f32f32.f32
/* return to the caller through the return-address register $r1 */
jr $r1
/*
 * void lasx_fp64_add_mul_f64f64_f64(int64_t loops)
 *
 * LASX double-precision throughput kernel for separate multiply and add.
 * $a0 holds the loop count. $xr0-$xr24 are zeroed by xor-ing each register
 * with itself. Each iteration issues 12 xvfmul.d and 12 xvfadd.d
 * instructions, alternating; all of them read only $xr24 and each writes a
 * different destination ($xr0-$xr23), so no instruction depends on another.
 *
 * The counter is decremented before it is tested, so the loop body always
 * runs at least once and `loops` is expected to be positive.
 */
lasx_fp64_add_mul_f64f64_f64:
xvxor.v $xr0, $xr0, $xr0
xvxor.v $xr1, $xr1, $xr1
xvxor.v $xr2, $xr2, $xr2
xvxor.v $xr3, $xr3, $xr3
xvxor.v $xr4, $xr4, $xr4
xvxor.v $xr5, $xr5, $xr5
xvxor.v $xr6, $xr6, $xr6
xvxor.v $xr7, $xr7, $xr7
xvxor.v $xr8, $xr8, $xr8
xvxor.v $xr9, $xr9, $xr9
xvxor.v $xr10, $xr10, $xr10
xvxor.v $xr11, $xr11, $xr11
xvxor.v $xr12, $xr12, $xr12
xvxor.v $xr13, $xr13, $xr13
xvxor.v $xr14, $xr14, $xr14
xvxor.v $xr15, $xr15, $xr15
xvxor.v $xr16, $xr16, $xr16
xvxor.v $xr17, $xr17, $xr17
xvxor.v $xr18, $xr18, $xr18
xvxor.v $xr19, $xr19, $xr19
xvxor.v $xr20, $xr20, $xr20
xvxor.v $xr21, $xr21, $xr21
xvxor.v $xr22, $xr22, $xr22
xvxor.v $xr23, $xr23, $xr23
xvxor.v $xr24, $xr24, $xr24
/* measured loop: 12 xvfmul.d interleaved with 12 xvfadd.d per iteration */
.lasx.fp64.add.mul.f64f64.f64:
xvfmul.d $xr0, $xr24, $xr24
xvfadd.d $xr1, $xr24, $xr24
xvfmul.d $xr2, $xr24, $xr24
xvfadd.d $xr3, $xr24, $xr24
xvfmul.d $xr4, $xr24, $xr24
xvfadd.d $xr5, $xr24, $xr24
xvfmul.d $xr6, $xr24, $xr24
xvfadd.d $xr7, $xr24, $xr24
xvfmul.d $xr8, $xr24, $xr24
xvfadd.d $xr9, $xr24, $xr24
xvfmul.d $xr10, $xr24, $xr24
xvfadd.d $xr11, $xr24, $xr24
xvfmul.d $xr12, $xr24, $xr24
xvfadd.d $xr13, $xr24, $xr24
xvfmul.d $xr14, $xr24, $xr24
xvfadd.d $xr15, $xr24, $xr24
xvfmul.d $xr16, $xr24, $xr24
xvfadd.d $xr17, $xr24, $xr24
xvfmul.d $xr18, $xr24, $xr24
xvfadd.d $xr19, $xr24, $xr24
xvfmul.d $xr20, $xr24, $xr24
xvfadd.d $xr21, $xr24, $xr24
xvfmul.d $xr22, $xr24, $xr24
xvfadd.d $xr23, $xr24, $xr24
addi.d $a0, $a0, -1
bne $a0, $r0, .lasx.fp64.add.mul.f64f64.f64
/* return to the caller through the return-address register $r1 */
jr $r1
================================================
FILE: loongarch64/asm/_LSX_.S
================================================
/* Entry points exported by this file (LSX kernels, $vr registers). */
.globl lsx_fp32_fmadd_f32f32f32
.globl lsx_fp64_fmadd_f64f64f64
.globl lsx_fp32_add_mul_f32f32_f32
.globl lsx_fp64_add_mul_f64f64_f64
/*
 * void lsx_fp32_fmadd_f32f32f32(int64_t loops)
 *
 * LSX single-precision fused multiply-add throughput kernel.
 * $a0 holds the loop count. $vr0-$vr16 are zeroed by xor-ing each register
 * with itself. Each iteration issues 16 vfmadd.s instructions, every one
 * accumulating into its own register ($vr0-$vr15) with $vr16 as both
 * multiplicands, so the 16 dependency chains are independent of each other.
 *
 * The counter is decremented before it is tested, so the loop body always
 * runs at least once and `loops` is expected to be positive.
 */
lsx_fp32_fmadd_f32f32f32:
vxor.v $vr0, $vr0, $vr0
vxor.v $vr1, $vr1, $vr1
vxor.v $vr2, $vr2, $vr2
vxor.v $vr3, $vr3, $vr3
vxor.v $vr4, $vr4, $vr4
vxor.v $vr5, $vr5, $vr5
vxor.v $vr6, $vr6, $vr6
vxor.v $vr7, $vr7, $vr7
vxor.v $vr8, $vr8, $vr8
vxor.v $vr9, $vr9, $vr9
vxor.v $vr10, $vr10, $vr10
vxor.v $vr11, $vr11, $vr11
vxor.v $vr12, $vr12, $vr12
vxor.v $vr13, $vr13, $vr13
vxor.v $vr14, $vr14, $vr14
vxor.v $vr15, $vr15, $vr15
vxor.v $vr16, $vr16, $vr16
/* measured loop: 16 independent vfmadd.s per iteration */
.lsx.fp32.fmadd.f32f32f32:
vfmadd.s $vr0, $vr16, $vr16, $vr0
vfmadd.s $vr1, $vr16, $vr16, $vr1
vfmadd.s $vr2, $vr16, $vr16, $vr2
vfmadd.s $vr3, $vr16, $vr16, $vr3
vfmadd.s $vr4, $vr16, $vr16, $vr4
vfmadd.s $vr5, $vr16, $vr16, $vr5
vfmadd.s $vr6, $vr16, $vr16, $vr6
vfmadd.s $vr7, $vr16, $vr16, $vr7
vfmadd.s $vr8, $vr16, $vr16, $vr8
vfmadd.s $vr9, $vr16, $vr16, $vr9
vfmadd.s $vr10, $vr16, $vr16, $vr10
vfmadd.s $vr11, $vr16, $vr16, $vr11
vfmadd.s $vr12, $vr16, $vr16, $vr12
vfmadd.s $vr13, $vr16, $vr16, $vr13
vfmadd.s $vr14, $vr16, $vr16, $vr14
vfmadd.s $vr15, $vr16, $vr16, $vr15
addi.d $a0, $a0, -1
bne $a0, $r0, .lsx.fp32.fmadd.f32f32f32
/* return to the caller through the return-address register $r1 */
jr $r1
/*
 * void lsx_fp64_fmadd_f64f64f64(int64_t loops)
 *
 * LSX double-precision fused multiply-add throughput kernel.
 * $a0 holds the loop count. $vr0-$vr16 are zeroed by xor-ing each register
 * with itself. Each iteration issues 16 vfmadd.d instructions, every one
 * accumulating into its own register ($vr0-$vr15) with $vr16 as both
 * multiplicands, so the 16 dependency chains are independent of each other.
 *
 * The counter is decremented before it is tested, so the loop body always
 * runs at least once and `loops` is expected to be positive.
 */
lsx_fp64_fmadd_f64f64f64:
vxor.v $vr0, $vr0, $vr0
vxor.v $vr1, $vr1, $vr1
vxor.v $vr2, $vr2, $vr2
vxor.v $vr3, $vr3, $vr3
vxor.v $vr4, $vr4, $vr4
vxor.v $vr5, $vr5, $vr5
vxor.v $vr6, $vr6, $vr6
vxor.v $vr7, $vr7, $vr7
vxor.v $vr8, $vr8, $vr8
vxor.v $vr9, $vr9, $vr9
vxor.v $vr10, $vr10, $vr10
vxor.v $vr11, $vr11, $vr11
vxor.v $vr12, $vr12, $vr12
vxor.v $vr13, $vr13, $vr13
vxor.v $vr14, $vr14, $vr14
vxor.v $vr15, $vr15, $vr15
vxor.v $vr16, $vr16, $vr16
/* measured loop: 16 independent vfmadd.d per iteration */
.lsx.fp64.fmadd.f64f64f64:
vfmadd.d $vr0, $vr16, $vr16, $vr0
vfmadd.d $vr1, $vr16, $vr16, $vr1
vfmadd.d $vr2, $vr16, $vr16, $vr2
vfmadd.d $vr3, $vr16, $vr16, $vr3
vfmadd.d $vr4, $vr16, $vr16, $vr4
vfmadd.d $vr5, $vr16, $vr16, $vr5
vfmadd.d $vr6, $vr16, $vr16, $vr6
vfmadd.d $vr7, $vr16, $vr16, $vr7
vfmadd.d $vr8, $vr16, $vr16, $vr8
vfmadd.d $vr9, $vr16, $vr16, $vr9
vfmadd.d $vr10, $vr16, $vr16, $vr10
vfmadd.d $vr11, $vr16, $vr16, $vr11
vfmadd.d $vr12, $vr16, $vr16, $vr12
vfmadd.d $vr13, $vr16, $vr16, $vr13
vfmadd.d $vr14, $vr16, $vr16, $vr14
vfmadd.d $vr15, $vr16, $vr16, $vr15
addi.d $a0, $a0, -1
bne $a0, $r0, .lsx.fp64.fmadd.f64f64f64
/* return to the caller through the return-address register $r1 */
jr $r1
/*
 * void lsx_fp32_add_mul_f32f32_f32(int64_t loops)
 *
 * LSX single-precision throughput kernel for separate multiply and add.
 * $a0 holds the loop count. $vr0-$vr24 are zeroed by xor-ing each register
 * with itself. Each iteration issues 12 vfmul.s and 12 vfadd.s instructions,
 * alternating; all of them read only $vr24 and each writes a different
 * destination ($vr0-$vr23), so no instruction depends on another.
 *
 * The counter is decremented before it is tested, so the loop body always
 * runs at least once and `loops` is expected to be positive.
 */
lsx_fp32_add_mul_f32f32_f32:
vxor.v $vr0, $vr0, $vr0
vxor.v $vr1, $vr1, $vr1
vxor.v $vr2, $vr2, $vr2
vxor.v $vr3, $vr3, $vr3
vxor.v $vr4, $vr4, $vr4
vxor.v $vr5, $vr5, $vr5
vxor.v $vr6, $vr6, $vr6
vxor.v $vr7, $vr7, $vr7
vxor.v $vr8, $vr8, $vr8
vxor.v $vr9, $vr9, $vr9
vxor.v $vr10, $vr10, $vr10
vxor.v $vr11, $vr11, $vr11
vxor.v $vr12, $vr12, $vr12
vxor.v $vr13, $vr13, $vr13
vxor.v $vr14, $vr14, $vr14
vxor.v $vr15, $vr15, $vr15
vxor.v $vr16, $vr16, $vr16
vxor.v $vr17, $vr17, $vr17
vxor.v $vr18, $vr18, $vr18
vxor.v $vr19, $vr19, $vr19
vxor.v $vr20, $vr20, $vr20
vxor.v $vr21, $vr21, $vr21
vxor.v $vr22, $vr22, $vr22
vxor.v $vr23, $vr23, $vr23
vxor.v $vr24, $vr24, $vr24
/* measured loop: 12 vfmul.s interleaved with 12 vfadd.s per iteration */
.lsx.fp32.add.mul.f32f32.f32:
vfmul.s $vr0, $vr24, $vr24
vfadd.s $vr1, $vr24, $vr24
vfmul.s $vr2, $vr24, $vr24
vfadd.s $vr3, $vr24, $vr24
vfmul.s $vr4, $vr24, $vr24
vfadd.s $vr5, $vr24, $vr24
vfmul.s $vr6, $vr24, $vr24
vfadd.s $vr7, $vr24, $vr24
vfmul.s $vr8, $vr24, $vr24
vfadd.s $vr9, $vr24, $vr24
vfmul.s $vr10, $vr24, $vr24
vfadd.s $vr11, $vr24, $vr24
vfmul.s $vr12, $vr24, $vr24
vfadd.s $vr13, $vr24, $vr24
vfmul.s $vr14, $vr24, $vr24
vfadd.s $vr15, $vr24, $vr24
vfmul.s $vr16, $vr24, $vr24
vfadd.s $vr17, $vr24, $vr24
vfmul.s $vr18, $vr24, $vr24
vfadd.s $vr19, $vr24, $vr24
vfmul.s $vr20, $vr24, $vr24
vfadd.s $vr21, $vr24, $vr24
vfmul.s $vr22, $vr24, $vr24
vfadd.s $vr23, $vr24, $vr24
addi.d $a0, $a0, -1
bne $a0, $r0, .lsx.fp32.add.mul.f32f32.f32
/* return to the caller through the return-address register $r1 */
jr $r1
/*
 * void lsx_fp64_add_mul_f64f64_f64(int64_t loops)
 *
 * LSX double-precision throughput kernel for separate multiply and add.
 * $a0 holds the loop count. $vr0-$vr24 are zeroed by xor-ing each register
 * with itself. Each iteration issues 12 vfmul.d and 12 vfadd.d instructions,
 * alternating; all of them read only $vr24 and each writes a different
 * destination ($vr0-$vr23), so no instruction depends on another.
 *
 * The counter is decremented before it is tested, so the loop body always
 * runs at least once and `loops` is expected to be positive.
 */
lsx_fp64_add_mul_f64f64_f64:
vxor.v $vr0, $vr0, $vr0
vxor.v $vr1, $vr1, $vr1
vxor.v $vr2, $vr2, $vr2
vxor.v $vr3, $vr3, $vr3
vxor.v $vr4, $vr4, $vr4
vxor.v $vr5, $vr5, $vr5
vxor.v $vr6, $vr6, $vr6
vxor.v $vr7, $vr7, $vr7
vxor.v $vr8, $vr8, $vr8
vxor.v $vr9, $vr9, $vr9
vxor.v $vr10, $vr10, $vr10
vxor.v $vr11, $vr11, $vr11
vxor.v $vr12, $vr12, $vr12
vxor.v $vr13, $vr13, $vr13
vxor.v $vr14, $vr14, $vr14
vxor.v $vr15, $vr15, $vr15
vxor.v $vr16, $vr16, $vr16
vxor.v $vr17, $vr17, $vr17
vxor.v $vr18, $vr18, $vr18
vxor.v $vr19, $vr19, $vr19
vxor.v $vr20, $vr20, $vr20
vxor.v $vr21, $vr21, $vr21
vxor.v $vr22, $vr22, $vr22
vxor.v $vr23, $vr23, $vr23
vxor.v $vr24, $vr24, $vr24
/* measured loop: 12 vfmul.d interleaved with 12 vfadd.d per iteration */
.lsx.fp64.add.mul.f64f64.f64:
vfmul.d $vr0, $vr24, $vr24
vfadd.d $vr1, $vr24, $vr24
vfmul.d $vr2, $vr24, $vr24
vfadd.d $vr3, $vr24, $vr24
vfmul.d $vr4, $vr24, $vr24
vfadd.d $vr5, $vr24, $vr24
vfmul.d $vr6, $vr24, $vr24
vfadd.d $vr7, $vr24, $vr24
vfmul.d $vr8, $vr24, $vr24
vfadd.d $vr9, $vr24, $vr24
vfmul.d $vr10, $vr24, $vr24
vfadd.d $vr11, $vr24, $vr24
vfmul.d $vr12, $vr24, $vr24
vfadd.d $vr13, $vr24, $vr24
vfmul.d $vr14, $vr24, $vr24
vfadd.d $vr15, $vr24, $vr24
vfmul.d $vr16, $vr24, $vr24
vfadd.d $vr17, $vr24, $vr24
vfmul.d $vr18, $vr24, $vr24
vfadd.d $vr19, $vr24, $vr24
vfmul.d $vr20, $vr24, $vr24
vfadd.d $vr21, $vr24, $vr24
vfmul.d $vr22, $vr24, $vr24
vfadd.d $vr23, $vr24, $vr24
addi.d $a0, $a0, -1
bne $a0, $r0, .lsx.fp64.add.mul.f64f64.f64
/* return to the caller through the return-address register $r1 */
jr $r1
================================================
FILE: loongarch64/cpufp.cpp
================================================
#include "table.hpp"
#include "smtl.hpp"
#include
#include
#include
#include
#include
#include
#include
#include
using namespace std;
extern "C"
{
#ifdef _FP_SP_
void fp32_fmadd_f32f32f32(int64_t);
#endif
#ifdef _FP_DP_
void fp64_fmadd_f64f64f64(int64_t);
#endif
#ifdef _LSX_
void lsx_fp32_fmadd_f32f32f32(int64_t);
void lsx_fp64_fmadd_f64f64f64(int64_t);
void lsx_fp32_add_mul_f32f32_f32(int64_t);
void lsx_fp64_add_mul_f64f64_f64(int64_t);
#endif
#ifdef _LASX_
void lasx_fp32_fmadd_f32f32f32(int64_t);
void lasx_fp64_fmadd_f64f64f64(int64_t);
void lasx_fp32_add_mul_f32f32_f32(int64_t);
void lasx_fp64_add_mul_f64f64_f64(int64_t);
#endif
}
// Description of one registered benchmark.
typedef struct
{
std::string isa; // label for the "Instruction Set" column
std::string vlen; // label for the "Vector Length" column
std::string type; // label for the "Core Computation" column
std::string dim; // unit suffix appended after the G/T prefix
int64_t loop_time; // loop count passed to the kernel
int64_t comp_pl; // operations credited per loop iteration (per thread)
void (*bench)(int64_t); // the kernel itself
} cpubm_t;
// Number of registered benchmarks in each group. cpubm_do_bench() consumes
// bm_list in the order 256-bit SIMD, 128-bit SIMD, scalar, using these counts.
// NOTE(review): presumably incremented wherever the benchmarks are
// registered - confirm in the registration code.
static int num_simd_256b = 0;
static int num_simd_128b = 0;
static int num_scalar = 0;
// All registered benchmarks, in execution order.
static std::vector bm_list;
// Seconds elapsed between the two timestamps (*end minus *start).
static double get_time(struct timespec *start,
                       struct timespec *end)
{
    // Start from the whole-second difference, then add the nanosecond
    // difference (which may be negative) scaled to seconds.
    double elapsed = end->tv_sec - start->tv_sec;
    elapsed += (end->tv_nsec - start->tv_nsec) * 1e-9;
    return elapsed;
}
static void reg_new_isa(std::string isa,
std::string vlen,
std::string type,
std::string dim,
int64_t loop_time,
int64_t comp_pl,
void (*bench)(int64_t))
{
cpubm_t new_one;
new_one.isa = isa;
new_one.vlen = vlen;
new_one.type = type;
new_one.dim = dim;
new_one.loop_time = loop_time;
new_one.comp_pl = comp_pl;
new_one.bench = bench;
bm_list.push_back(new_one);
}
static void thread_func(void *params)
{
cpubm_t *bm = (cpubm_t*)params;
bm->bench(bm->loop_time);
}
// Run one benchmark on every thread of the pool and add its result row to
// `table`.
//
// The kernel is run twice on all threads: an untimed warm-up round and a
// timed round. Throughput is loop_time * comp_pl * num_threads divided by the
// wall-clock time of the timed round, printed with 5 significant digits and a
// 'G' (1e9) or 'T' (1e12) prefix in front of item.dim.
static void cpubm_x64_one(smtl_handle sh,
cpubm_t &item,
Table &table)
{
struct timespec start, end;
double time_used, perf;
char perfUnit = 'G';
int i;
int num_threads = smtl_num_threads(sh);
// warm up
for (i = 0; i < num_threads; i++)
{
smtl_add_task(sh, thread_func, (void*)&item);
}
smtl_begin_tasks(sh);
smtl_wait_tasks_finished(sh);
// timed round: the interval covers queuing, start and completion of all threads
clock_gettime(CLOCK_MONOTONIC_RAW, &start);
for (i = 0; i < num_threads; i++)
{
smtl_add_task(sh, thread_func, (void*)&item);
}
smtl_begin_tasks(sh);
smtl_wait_tasks_finished(sh);
clock_gettime(CLOCK_MONOTONIC_RAW, &end);
time_used = get_time(&start, &end);
// operations per second, summed over all threads
perf = item.loop_time * item.comp_pl * num_threads /
time_used;
// switch to the tera prefix above 1e12, otherwise report giga
if (perf > 1e12)
{
perfUnit = 'T';
perf /= 1e12;
}
else
{
perf /= 1e9;
}
stringstream ss;
ss << std::setprecision(5) << perf << " " << perfUnit << item.dim;
// row layout matches the four-column header built in cpubm_do_bench()
std::vector cont;
cont.resize(4);
cont[0] = item.isa;
cont[1] = item.vlen;
cont[2] = item.type;
cont[3] = ss.str();
table.addOneItem(cont);
}
static void cpubm_do_bench(std::vector &set_of_threads,
uint32_t idle_time)
{
int i;
if (bm_list.size() > 0)
{
int num_threads = set_of_threads.size();
printf("Number Threads: %d\n", num_threads);
printf("Thread Pool Binding:");
for (i = 0; i < num_threads; i++)
{
printf(" %d", set_of_threads[i]);
}
printf("\n");
// set table head
std::vector ti;
ti.resize(4);
ti[0] = "Instruction Set";
ti[1] = "Vector Length";
ti[2] = "Core Computation";
ti[3] = "Peak Performance";
Table table;
table.setColumnNum(4);
table.addOneItem(ti);
// set thread pool
smtl_handle sh;
smtl_init(&sh, set_of_threads);
// traverse task list
int bm_idx = 0;
if (num_simd_256b)
{
table.addSeparator();
for (i = 0; i < num_simd_256b; i++)
{
sleep(idle_time);
cpubm_x64_one(sh, bm_list[bm_idx], table);
bm_idx++;
}
}
if (num_simd_128b)
{
table.addSeparator();
for (i = 0; i < num_simd_128b; i++)
{
sleep(idle_time);
cpubm_x64_one(sh, bm_list[bm_idx], table);
bm_idx++;
}
}
if (num_scalar)
{
table.addSeparator();
for (i = 0; i < num_scalar; i++)
{
sleep(idle_time);
cpubm_x64_one(sh, bm_list[bm_idx], table);
bm_idx++;
}
}
table.print();
smtl_fini(sh);
}
}
// Parses a core list such as "[0,3,5-8,13-15]" and appends every core id
// to set_of_threads.
//   sets           : text starting at the opening '['
//   set_of_threads : receives the parsed core ids, in input order
// Items are separated by ','. An item is either a single number or an
// inclusive range "a-b". Characters other than digits, ',', '-' and ']'
// are ignored. Nothing is parsed when the text does not start with '[';
// when the closing ']' is missing, the final item is dropped.
// Items that contain neither a digit nor a '-' (as in "[]", "[1,]" or
// "[1,,2]") are skipped; they used to be added as core 0.
static void parse_thread_pool(char *sets,
                              std::vector<int> &set_of_threads)
{
    if (sets[0] != '[')
    {
        return;
    }

    int left = 0, right = 0;
    bool in_range = false;   // true once the '-' of the current item was seen
    bool has_digit = false;  // current item contains at least one digit
    int pos = 1;

    for (;;)
    {
        const char c = sets[pos];
        if (c == '\0')
        {
            // Unterminated list: give up without flushing the last item.
            return;
        }

        if (c >= '0' && c <= '9')
        {
            const int digit = (int)(c - '0');
            if (in_range)
            {
                right = right * 10 + digit;
            }
            else
            {
                left = left * 10 + digit;
            }
            has_digit = true;
        }
        else if (c == '-' && !in_range)
        {
            right = 0;
            in_range = true;
        }
        else if (c == ',' || c == ']')
        {
            // End of the current item: flush it.
            if (in_range)
            {
                for (int core = left; core <= right; core++)
                {
                    set_of_threads.push_back(core);
                }
            }
            else if (has_digit)
            {
                set_of_threads.push_back(left);
            }
            if (c == ']')
            {
                return;
            }
            left = 0;
            in_range = false;
            has_digit = false;
        }
        pos++;
    }
}
// Registers every benchmark whose feature macro was defined at build time
// and counts the entries per table group. Registration order matters:
// cpubm_do_bench() expects the 256-bit entries first, then the 128-bit
// ones, then the scalar ones.
// The sixth argument of reg_new_isa() is multiplied with the loop count to
// obtain the reported figure (presumably the operations per kernel loop
// iteration).
static void cpufp_register_isa()
{
    // Every kernel is given the same loop count.
    const int64_t loops = 0x20000000LL;
    (void)loops; // unused when no feature macro is defined

#ifdef _LASX_
    reg_new_isa("LASX", "256b", "fmadd(f32,f32,f32)", "FLOPS",
                loops, 256LL, lasx_fp32_fmadd_f32f32f32);
    reg_new_isa("LASX", "256b", "fmadd(f64,f64,f64)", "FLOPS",
                loops, 128LL, lasx_fp64_fmadd_f64f64f64);
    reg_new_isa("LASX", "256b", "add(mul(f32,f32),f32)", "FLOPS",
                loops, 192LL, lasx_fp32_add_mul_f32f32_f32);
    reg_new_isa("LASX", "256b", "add(mul(f64,f64),f64)", "FLOPS",
                loops, 96LL, lasx_fp64_add_mul_f64f64_f64);
    num_simd_256b += 4;
#endif

#ifdef _LSX_
    reg_new_isa("LSX", "128b", "fmadd(f32,f32,f32)", "FLOPS",
                loops, 128LL, lsx_fp32_fmadd_f32f32f32);
    reg_new_isa("LSX", "128b", "fmadd(f64,f64,f64)", "FLOPS",
                loops, 64LL, lsx_fp64_fmadd_f64f64f64);
    reg_new_isa("LSX", "128b", "add(mul(f32,f32),f32)", "FLOPS",
                loops, 96LL, lsx_fp32_add_mul_f32f32_f32);
    reg_new_isa("LSX", "128b", "add(mul(f64,f64),f64)", "FLOPS",
                loops, 48LL, lsx_fp64_add_mul_f64f64_f64);
    num_simd_128b += 4;
#endif

#ifdef _FP_SP_
    reg_new_isa("FP_SP", "scalar", "fmadd(f32,f32,f32)", "FLOPS",
                loops, 32LL, fp32_fmadd_f32f32f32);
    num_scalar++;
#endif

#ifdef _FP_DP_
    reg_new_isa("FP_DP", "scalar", "fmadd(f64,f64,f64)", "FLOPS",
                loops, 32LL, fp64_fmadd_f64f64f64);
    num_scalar++;
#endif
}
int main(int argc, char *argv[])
{
std::vector set_of_threads;
uint32_t idle_time = 0;
bool params_enough = false;
int i;
for (i = 1; i < argc; i++)
{
if (strncmp(argv[i], "--thread_pool=", 14) == 0)
{
parse_thread_pool(argv[i] + 14, set_of_threads);
params_enough = true;
}
else if (strncmp(argv[i], "--idle_time=", 12) == 0)
{
idle_time = (uint32_t)atoi(argv[i] + 12);
}
}
if (!params_enough)
{
fprintf(stderr, "Error: You must set --thread_pool parameter.\n");
fprintf(stderr, "You may also set --idle_time parameter.\n");
fprintf(stderr, "Usage: %s --thread_pool=[xxx] --idle_time=yyy\n", argv[0]);
fprintf(stderr, "[xxx] indicates all cores to benchmark.\n");
fprintf(stderr, "Example: [0,3,5-8,13-15].\n");
fprintf(stderr, "idle_time is the interval time(s) between every two benchmarks.\n");
fprintf(stderr, "idle_time parameter can be ignored, the default value is 0s.\n");
fprintf(stderr, "Notice: there must NOT be any spaces.\n");
exit(0);
}
cpufp_register_isa();
cpubm_do_bench(set_of_threads, idle_time);
return 0;
}
================================================
FILE: loongarch64/cpuid.c
================================================
#include
#include
/* Evaluates to 1 when bit `pos` (0 = least significant) of bit_map is set,
 * otherwise 0. The mask is built from an unsigned constant so that testing
 * bit 31 does not shift into the sign bit of an int. */
#define BIT_TEST(bit_map, pos) (((bit_map) & (1u << (pos))) ? 1 : 0)
/* Executes the cpucfg instruction with `reg` as its source operand and
 * returns the value the instruction writes to its destination register.
 * `reg` presumably selects the CPU configuration word to read. */
uint32_t read_cpucfg(uint32_t reg)
{
uint32_t val = 0;
/* %0 = output register (val), %1 = input register (reg) */
asm volatile("cpucfg %0, %1\n\t"
:"=r"(val)
:"r"(reg)
);
return val;
}
/* Prints one feature macro name per line, derived from the bits of CPU
 * configuration word 0x2:
 *   bit 0 gates the two scalar entries: bit 1 -> _FP_SP_, bit 2 -> _FP_DP_
 *   bit 6 -> _LSX_
 *   bit 7 -> _LASX_
 * Always returns 0. */
int main()
{
    const uint32_t cfg2 = read_cpucfg(0x2);
    const int fp_gate = BIT_TEST(cfg2, 0);

    if (fp_gate && BIT_TEST(cfg2, 1))
    {
        printf("_FP_SP_\n");
    }
    if (fp_gate && BIT_TEST(cfg2, 2))
    {
        printf("_FP_DP_\n");
    }
    if (BIT_TEST(cfg2, 6))
    {
        printf("_LSX_\n");
    }
    if (BIT_TEST(cfg2, 7))
    {
        printf("_LASX_\n");
    }
    return 0;
}
================================================
FILE: riscv64/asm/_IME_.S
================================================
.align 4
/* preserve_caller_vec: saves the vector state on the stack.
 * - x5 keeps the vtype CSR and x6 the vl CSR; both must stay untouched
 *   until restore_caller_vec runs.
 * - vsetvli selects 8-bit elements with register groups of eight (m8) and
 *   writes the resulting vl to x7; with 8-bit elements this is the size in
 *   bytes of one register group.
 * - The four groups starting at v0, v8, v16 and v24 (all 32 vector
 *   registers) are pushed one after another, each push lowering sp by x7.
 * Clobbers x5, x6, x7. */
.macro preserve_caller_vec
csrr x5, vtype
csrr x6, vl
vsetvli x7, x0, e8, m8
sub sp, sp, x7
vse8.v v0, (sp)
sub sp, sp, x7
vse8.v v8, (sp)
sub sp, sp, x7
vse8.v v16, (sp)
sub sp, sp, x7
vse8.v v24, (sp)
.endm
/* restore_caller_vec: inverse of preserve_caller_vec.
 * - Re-issues the e8/m8 vsetvli so that x7 again holds the size of one
 *   register group (the kernels overwrite x7 in between).
 * - Pops the groups in reverse push order (v24, v16, v8, v0), raising sp
 *   by x7 after each load.
 * - The final vsetvl reinstates the saved vl (x6) and vtype (x5).
 * Relies on x5 and x6 still holding the values saved by
 * preserve_caller_vec. */
.macro restore_caller_vec
vsetvli x7, x0, e8, m8
vle8.v v24, (sp)
add sp, sp, x7
vle8.v v16, (sp)
add sp, sp, x7
vle8.v v8, (sp)
add sp, sp, x7
vle8.v v0, (sp)
add sp, sp, x7
vsetvl x7, x6, x5
.endm
/* ime_vmadot_s32s8s8(a0 = loop count)
 * Throughput kernel for the vmadot instruction (presumably the SpacemiT
 * IME signed 8-bit matrix multiply-accumulate -- TODO confirm).
 * The vector state is saved, all 32 vector registers are zeroed, then each
 * loop iteration issues 14 vmadot instructions with sources v0/v1 and the
 * even-numbered destinations v4..v30. The loop ends when a0 reaches zero,
 * so a0 must be non-zero on entry.
 * The symbol gets a leading underscore when __APPLE__ is defined. */
#ifdef __APPLE__
.globl _ime_vmadot_s32s8s8
_ime_vmadot_s32s8s8:
#else
.globl ime_vmadot_s32s8s8
ime_vmadot_s32s8s8:
#endif
preserve_caller_vec
/* zero v0..v31 using four m8 register groups */
vsetvli x7, x0, e8, m8
vxor.vv v0, v0, v0
vxor.vv v8, v8, v8
vxor.vv v16, v16, v16
vxor.vv v24, v24, v24
.ime.vmadot.s32s8s8.L1:
/* 8-bit elements, single registers; re-issued on every iteration */
vsetvli x7, x0, e8, m1
vmadot v4, v0, v1
vmadot v6, v0, v1
vmadot v8, v0, v1
vmadot v10, v0, v1
vmadot v12, v0, v1
vmadot v14, v0, v1
vmadot v16, v0, v1
/* loop counter update placed in the middle of the instruction stream */
addi a0, a0, -1
vmadot v18, v0, v1
vmadot v20, v0, v1
vmadot v22, v0, v1
vmadot v24, v0, v1
vmadot v26, v0, v1
vmadot v28, v0, v1
vmadot v30, v0, v1
bnez a0, .ime.vmadot.s32s8s8.L1
restore_caller_vec
ret
/* ime_vmadotu_u32u8u8(a0 = loop count)
 * Same structure as the vmadot kernel, using vmadotu (presumably the
 * unsigned x unsigned variant -- TODO confirm): 14 instructions per loop
 * iteration, sources v0/v1, even-numbered destinations v4..v30.
 * a0 must be non-zero on entry. */
#ifdef __APPLE__
.globl _ime_vmadotu_u32u8u8
_ime_vmadotu_u32u8u8:
#else
.globl ime_vmadotu_u32u8u8
ime_vmadotu_u32u8u8:
#endif
preserve_caller_vec
/* zero v0..v31 using four m8 register groups */
vsetvli x7, x0, e8, m8
vxor.vv v0, v0, v0
vxor.vv v8, v8, v8
vxor.vv v16, v16, v16
vxor.vv v24, v24, v24
.ime.vmadotu.u32u8u8.L1:
/* 8-bit elements, single registers; re-issued on every iteration */
vsetvli x7, x0, e8, m1
vmadotu v4, v0, v1
vmadotu v6, v0, v1
vmadotu v8, v0, v1
vmadotu v10, v0, v1
vmadotu v12, v0, v1
vmadotu v14, v0, v1
vmadotu v16, v0, v1
/* loop counter update placed in the middle of the instruction stream */
addi a0, a0, -1
vmadotu v18, v0, v1
vmadotu v20, v0, v1
vmadotu v22, v0, v1
vmadotu v24, v0, v1
vmadotu v26, v0, v1
vmadotu v28, v0, v1
vmadotu v30, v0, v1
bnez a0, .ime.vmadotu.u32u8u8.L1
restore_caller_vec
ret
/* ime_vmadotus_s32u8s8(a0 = loop count)
 * Same structure as the vmadot kernel, using vmadotus (presumably the
 * unsigned x signed variant -- TODO confirm): 14 instructions per loop
 * iteration, sources v0/v1, even-numbered destinations v4..v30.
 * a0 must be non-zero on entry. */
#ifdef __APPLE__
.globl _ime_vmadotus_s32u8s8
_ime_vmadotus_s32u8s8:
#else
.globl ime_vmadotus_s32u8s8
ime_vmadotus_s32u8s8:
#endif
preserve_caller_vec
/* zero v0..v31 using four m8 register groups */
vsetvli x7, x0, e8, m8
vxor.vv v0, v0, v0
vxor.vv v8, v8, v8
vxor.vv v16, v16, v16
vxor.vv v24, v24, v24
.ime.vmadotus.s32u8s8.L1:
/* 8-bit elements, single registers; re-issued on every iteration */
vsetvli x7, x0, e8, m1
vmadotus v4, v0, v1
vmadotus v6, v0, v1
vmadotus v8, v0, v1
vmadotus v10, v0, v1
vmadotus v12, v0, v1
vmadotus v14, v0, v1
vmadotus v16, v0, v1
/* loop counter update placed in the middle of the instruction stream */
addi a0, a0, -1
vmadotus v18, v0, v1
vmadotus v20, v0, v1
vmadotus v22, v0, v1
vmadotus v24, v0, v1
vmadotus v26, v0, v1
vmadotus v28, v0, v1
vmadotus v30, v0, v1
bnez a0, .ime.vmadotus.s32u8s8.L1
restore_caller_vec
ret
/* ime_vmadotsu_s32s8u8(a0 = loop count)
 * Same structure as the vmadot kernel, using vmadotsu (presumably the
 * signed x unsigned variant -- TODO confirm): 14 instructions per loop
 * iteration, sources v0/v1, even-numbered destinations v4..v30.
 * a0 must be non-zero on entry. */
#ifdef __APPLE__
.globl _ime_vmadotsu_s32s8u8
_ime_vmadotsu_s32s8u8:
#else
.globl ime_vmadotsu_s32s8u8
ime_vmadotsu_s32s8u8:
#endif
preserve_caller_vec
/* zero v0..v31 using four m8 register groups */
vsetvli x7, x0, e8, m8
vxor.vv v0, v0, v0
vxor.vv v8, v8, v8
vxor.vv v16, v16, v16
vxor.vv v24, v24, v24
.ime.vmadotsu.s32s8u8.L1:
/* 8-bit elements, single registers; re-issued on every iteration */
vsetvli x7, x0, e8, m1
vmadotsu v4, v0, v1
vmadotsu v6, v0, v1
vmadotsu v8, v0, v1
vmadotsu v10, v0, v1
vmadotsu v12, v0, v1
vmadotsu v14, v0, v1
vmadotsu v16, v0, v1
/* loop counter update placed in the middle of the instruction stream */
addi a0, a0, -1
vmadotsu v18, v0, v1
vmadotsu v20, v0, v1
vmadotsu v22, v0, v1
vmadotsu v24, v0, v1
vmadotsu v26, v0, v1
vmadotsu v28, v0, v1
vmadotsu v30, v0, v1
bnez a0, .ime.vmadotsu.s32s8u8.L1
restore_caller_vec
ret
/* ime_vmadotslide_s32s8s8(a0 = loop count)
 * Throughput kernel cycling through vmadot, vmadot1, vmadot2 and vmadot3
 * (presumably the sliding-window forms of vmadot -- TODO confirm).
 * Each loop iteration issues 12 instructions (three rounds of the four
 * mnemonics) with sources v0/v2 and the even-numbered destinations
 * v4..v26. a0 must be non-zero on entry. */
#ifdef __APPLE__
.globl _ime_vmadotslide_s32s8s8
_ime_vmadotslide_s32s8s8:
#else
.globl ime_vmadotslide_s32s8s8
ime_vmadotslide_s32s8s8:
#endif
preserve_caller_vec
/* zero v0..v31 using four m8 register groups */
vsetvli x7, x0, e8, m8
vxor.vv v0, v0, v0
vxor.vv v8, v8, v8
vxor.vv v16, v16, v16
vxor.vv v24, v24, v24
.ime.vmadotslide.s32s8s8.L1:
/* 8-bit elements, single registers; re-issued on every iteration */
vsetvli x7, x0, e8, m1
vmadot v4, v0, v2
vmadot1 v6, v0, v2
vmadot2 v8, v0, v2
vmadot3 v10, v0, v2
vmadot v12, v0, v2
vmadot1 v14, v0, v2
vmadot2 v16, v0, v2
vmadot3 v18, v0, v2
/* loop counter update placed in the middle of the instruction stream */
addi a0, a0, -1
vmadot v20, v0, v2
vmadot1 v22, v0, v2
vmadot2 v24, v0, v2
vmadot3 v26, v0, v2
bnez a0, .ime.vmadotslide.s32s8s8.L1
restore_caller_vec
ret
================================================
FILE: riscv64/asm/_VECTOR_.S
================================================
.align 4
/* preserve_caller_vec: saves the vector state on the stack.
 * - x5 keeps the vtype CSR and x6 the vl CSR; both must stay untouched
 *   until restore_caller_vec runs.
 * - vsetvli selects 8-bit elements with register groups of eight (m8) and
 *   writes the resulting vl to x7; with 8-bit elements this is the size in
 *   bytes of one register group.
 * - The four groups starting at v0, v8, v16 and v24 (all 32 vector
 *   registers) are pushed one after another, each push lowering sp by x7.
 * Clobbers x5, x6, x7. */
.macro preserve_caller_vec
csrr x5, vtype
csrr x6, vl
vsetvli x7, x0, e8, m8
sub sp, sp, x7
vse8.v v0, (sp)
sub sp, sp, x7
vse8.v v8, (sp)
sub sp, sp, x7
vse8.v v16, (sp)
sub sp, sp, x7
vse8.v v24, (sp)
.endm
/* restore_caller_vec: inverse of preserve_caller_vec.
 * - Re-issues the e8/m8 vsetvli so that x7 again holds the size of one
 *   register group (the kernels overwrite x7 in between).
 * - Pops the groups in reverse push order (v24, v16, v8, v0), raising sp
 *   by x7 after each load.
 * - The final vsetvl reinstates the saved vl (x6) and vtype (x5).
 * Relies on x5 and x6 still holding the values saved by
 * preserve_caller_vec. */
.macro restore_caller_vec
vsetvli x7, x0, e8, m8
vle8.v v24, (sp)
add sp, sp, x7
vle8.v v16, (sp)
add sp, sp, x7
vle8.v v8, (sp)
add sp, sp, x7
vle8.v v0, (sp)
add sp, sp, x7
vsetvl x7, x6, x5
.endm
/* vector_vfmacc_vf_f16f16f16(a0 = loop count)
 * Throughput kernel for vfmacc.vf on 16-bit elements.
 * The vector state is saved, all 32 vector registers are zeroed, then each
 * loop iteration issues 24 independent vfmacc.vf instructions
 * (accumulators v8..v31, scalar operand f0, vector operand v1).
 * The count of 24 matches the vfmacc.vv kernels and the 48 * vl operations
 * per iteration registered for this kernel in cpufp.cpp; the loop used to
 * start at v9 and therefore executed only 23 instructions.
 * a0 must be non-zero on entry. f0 is used as found on entry. */
#ifdef __APPLE__
.globl _vector_vfmacc_vf_f16f16f16
_vector_vfmacc_vf_f16f16f16:
#else
.globl vector_vfmacc_vf_f16f16f16
vector_vfmacc_vf_f16f16f16:
#endif
preserve_caller_vec
/* zero v0..v31 using four m8 register groups */
vsetvli x7, x0, e8, m8
vxor.vv v0, v0, v0
vxor.vv v8, v8, v8
vxor.vv v16, v16, v16
vxor.vv v24, v24, v24
.vector.vfmacc.vf.f16f16f16.L1:
/* 16-bit elements, single registers; re-issued on every iteration */
vsetvli x7, x0, e16, m1
vfmacc.vf v8, f0, v1
vfmacc.vf v9, f0, v1
vfmacc.vf v10, f0, v1
vfmacc.vf v11, f0, v1
vfmacc.vf v12, f0, v1
vfmacc.vf v13, f0, v1
vfmacc.vf v14, f0, v1
vfmacc.vf v15, f0, v1
vfmacc.vf v16, f0, v1
vfmacc.vf v17, f0, v1
vfmacc.vf v18, f0, v1
vfmacc.vf v19, f0, v1
/* loop counter update placed in the middle of the instruction stream */
addi a0, a0, -1
vfmacc.vf v20, f0, v1
vfmacc.vf v21, f0, v1
vfmacc.vf v22, f0, v1
vfmacc.vf v23, f0, v1
vfmacc.vf v24, f0, v1
vfmacc.vf v25, f0, v1
vfmacc.vf v26, f0, v1
vfmacc.vf v27, f0, v1
vfmacc.vf v28, f0, v1
vfmacc.vf v29, f0, v1
vfmacc.vf v30, f0, v1
vfmacc.vf v31, f0, v1
bnez a0, .vector.vfmacc.vf.f16f16f16.L1
restore_caller_vec
ret
/* vector_vfmacc_vv_f16f16f16(a0 = loop count)
 * Throughput kernel for vfmacc.vv on 16-bit elements.
 * The vector state is saved, all 32 vector registers are zeroed, then each
 * loop iteration issues 24 independent vfmacc.vv instructions
 * (accumulators v8..v31, operands v0 and v1).
 * a0 must be non-zero on entry. */
#ifdef __APPLE__
.globl _vector_vfmacc_vv_f16f16f16
_vector_vfmacc_vv_f16f16f16:
#else
.globl vector_vfmacc_vv_f16f16f16
vector_vfmacc_vv_f16f16f16:
#endif
preserve_caller_vec
/* zero v0..v31 using four m8 register groups */
vsetvli x7, x0, e8, m8
vxor.vv v0, v0, v0
vxor.vv v8, v8, v8
vxor.vv v16, v16, v16
vxor.vv v24, v24, v24
.vector.vfmacc.vv.f16f16f16.L1:
/* 16-bit elements, single registers; re-issued on every iteration */
vsetvli x7, x0, e16, m1
vfmacc.vv v8, v0, v1
vfmacc.vv v9, v0, v1
vfmacc.vv v10, v0, v1
vfmacc.vv v11, v0, v1
vfmacc.vv v12, v0, v1
vfmacc.vv v13, v0, v1
vfmacc.vv v14, v0, v1
vfmacc.vv v15, v0, v1
vfmacc.vv v16, v0, v1
vfmacc.vv v17, v0, v1
vfmacc.vv v18, v0, v1
vfmacc.vv v19, v0, v1
/* loop counter update placed in the middle of the instruction stream */
addi a0, a0, -1
vfmacc.vv v20, v0, v1
vfmacc.vv v21, v0, v1
vfmacc.vv v22, v0, v1
vfmacc.vv v23, v0, v1
vfmacc.vv v24, v0, v1
vfmacc.vv v25, v0, v1
vfmacc.vv v26, v0, v1
vfmacc.vv v27, v0, v1
vfmacc.vv v28, v0, v1
vfmacc.vv v29, v0, v1
vfmacc.vv v30, v0, v1
vfmacc.vv v31, v0, v1
bnez a0, .vector.vfmacc.vv.f16f16f16.L1
restore_caller_vec
ret
/* vector_vfmacc_vf_f32f32f32(a0 = loop count)
 * Throughput kernel for vfmacc.vf on 32-bit elements.
 * The vector state is saved, all 32 vector registers are zeroed, then each
 * loop iteration issues 24 independent vfmacc.vf instructions
 * (accumulators v8..v31, scalar operand f0, vector operand v1).
 * The count of 24 matches the vfmacc.vv kernels and the 48 * vl operations
 * per iteration registered for this kernel in cpufp.cpp; the loop used to
 * start at v9 and therefore executed only 23 instructions.
 * a0 must be non-zero on entry. f0 is used as found on entry. */
#ifdef __APPLE__
.globl _vector_vfmacc_vf_f32f32f32
_vector_vfmacc_vf_f32f32f32:
#else
.globl vector_vfmacc_vf_f32f32f32
vector_vfmacc_vf_f32f32f32:
#endif
preserve_caller_vec
/* zero v0..v31 using four m8 register groups */
vsetvli x7, x0, e8, m8
vxor.vv v0, v0, v0
vxor.vv v8, v8, v8
vxor.vv v16, v16, v16
vxor.vv v24, v24, v24
.vector.vfmacc.vf.f32f32f32.L1:
/* 32-bit elements, single registers; re-issued on every iteration */
vsetvli x7, x0, e32, m1
vfmacc.vf v8, f0, v1
vfmacc.vf v9, f0, v1
vfmacc.vf v10, f0, v1
vfmacc.vf v11, f0, v1
vfmacc.vf v12, f0, v1
vfmacc.vf v13, f0, v1
vfmacc.vf v14, f0, v1
vfmacc.vf v15, f0, v1
vfmacc.vf v16, f0, v1
vfmacc.vf v17, f0, v1
vfmacc.vf v18, f0, v1
vfmacc.vf v19, f0, v1
/* loop counter update placed in the middle of the instruction stream */
addi a0, a0, -1
vfmacc.vf v20, f0, v1
vfmacc.vf v21, f0, v1
vfmacc.vf v22, f0, v1
vfmacc.vf v23, f0, v1
vfmacc.vf v24, f0, v1
vfmacc.vf v25, f0, v1
vfmacc.vf v26, f0, v1
vfmacc.vf v27, f0, v1
vfmacc.vf v28, f0, v1
vfmacc.vf v29, f0, v1
vfmacc.vf v30, f0, v1
vfmacc.vf v31, f0, v1
bnez a0, .vector.vfmacc.vf.f32f32f32.L1
restore_caller_vec
ret
/* vector_vfmacc_vv_f32f32f32(a0 = loop count)
 * Throughput kernel for vfmacc.vv on 32-bit elements.
 * The vector state is saved, all 32 vector registers are zeroed, then each
 * loop iteration issues 24 independent vfmacc.vv instructions
 * (accumulators v8..v31, operands v0 and v1).
 * a0 must be non-zero on entry. */
#ifdef __APPLE__
.globl _vector_vfmacc_vv_f32f32f32
_vector_vfmacc_vv_f32f32f32:
#else
.globl vector_vfmacc_vv_f32f32f32
vector_vfmacc_vv_f32f32f32:
#endif
preserve_caller_vec
/* zero v0..v31 using four m8 register groups */
vsetvli x7, x0, e8, m8
vxor.vv v0, v0, v0
vxor.vv v8, v8, v8
vxor.vv v16, v16, v16
vxor.vv v24, v24, v24
.vector.vfmacc.vv.f32f32f32.L1:
/* 32-bit elements, single registers; re-issued on every iteration */
vsetvli x7, x0, e32, m1
vfmacc.vv v8, v0, v1
vfmacc.vv v9, v0, v1
vfmacc.vv v10, v0, v1
vfmacc.vv v11, v0, v1
vfmacc.vv v12, v0, v1
vfmacc.vv v13, v0, v1
vfmacc.vv v14, v0, v1
vfmacc.vv v15, v0, v1
vfmacc.vv v16, v0, v1
vfmacc.vv v17, v0, v1
vfmacc.vv v18, v0, v1
vfmacc.vv v19, v0, v1
/* loop counter update placed in the middle of the instruction stream */
addi a0, a0, -1
vfmacc.vv v20, v0, v1
vfmacc.vv v21, v0, v1
vfmacc.vv v22, v0, v1
vfmacc.vv v23, v0, v1
vfmacc.vv v24, v0, v1
vfmacc.vv v25, v0, v1
vfmacc.vv v26, v0, v1
vfmacc.vv v27, v0, v1
vfmacc.vv v28, v0, v1
vfmacc.vv v29, v0, v1
vfmacc.vv v30, v0, v1
vfmacc.vv v31, v0, v1
bnez a0, .vector.vfmacc.vv.f32f32f32.L1
restore_caller_vec
ret
/* vector_vfmacc_vf_f64f64f64(a0 = loop count)
 * Throughput kernel for vfmacc.vf on 64-bit elements.
 * The vector state is saved, all 32 vector registers are zeroed, then each
 * loop iteration issues 24 independent vfmacc.vf instructions
 * (accumulators v8..v31, scalar operand f0, vector operand v1).
 * The count of 24 matches the vfmacc.vv kernels and the 48 * vl operations
 * per iteration registered for this kernel in cpufp.cpp; the loop used to
 * start at v9 and therefore executed only 23 instructions.
 * a0 must be non-zero on entry. f0 is used as found on entry. */
#ifdef __APPLE__
.globl _vector_vfmacc_vf_f64f64f64
_vector_vfmacc_vf_f64f64f64:
#else
.globl vector_vfmacc_vf_f64f64f64
vector_vfmacc_vf_f64f64f64:
#endif
preserve_caller_vec
/* zero v0..v31 using four m8 register groups */
vsetvli x7, x0, e8, m8
vxor.vv v0, v0, v0
vxor.vv v8, v8, v8
vxor.vv v16, v16, v16
vxor.vv v24, v24, v24
.vector.vfmacc.vf.f64f64f64.L1:
/* 64-bit elements, single registers; re-issued on every iteration */
vsetvli x7, x0, e64, m1
vfmacc.vf v8, f0, v1
vfmacc.vf v9, f0, v1
vfmacc.vf v10, f0, v1
vfmacc.vf v11, f0, v1
vfmacc.vf v12, f0, v1
vfmacc.vf v13, f0, v1
vfmacc.vf v14, f0, v1
vfmacc.vf v15, f0, v1
vfmacc.vf v16, f0, v1
vfmacc.vf v17, f0, v1
vfmacc.vf v18, f0, v1
vfmacc.vf v19, f0, v1
/* loop counter update placed in the middle of the instruction stream */
addi a0, a0, -1
vfmacc.vf v20, f0, v1
vfmacc.vf v21, f0, v1
vfmacc.vf v22, f0, v1
vfmacc.vf v23, f0, v1
vfmacc.vf v24, f0, v1
vfmacc.vf v25, f0, v1
vfmacc.vf v26, f0, v1
vfmacc.vf v27, f0, v1
vfmacc.vf v28, f0, v1
vfmacc.vf v29, f0, v1
vfmacc.vf v30, f0, v1
vfmacc.vf v31, f0, v1
bnez a0, .vector.vfmacc.vf.f64f64f64.L1
restore_caller_vec
ret
/* vector_vfmacc_vv_f64f64f64(a0 = loop count)
 * Throughput kernel for vfmacc.vv on 64-bit elements.
 * The vector state is saved, all 32 vector registers are zeroed, then each
 * loop iteration issues 24 independent vfmacc.vv instructions
 * (accumulators v8..v31, operands v0 and v1).
 * a0 must be non-zero on entry. */
#ifdef __APPLE__
.globl _vector_vfmacc_vv_f64f64f64
_vector_vfmacc_vv_f64f64f64:
#else
.globl vector_vfmacc_vv_f64f64f64
vector_vfmacc_vv_f64f64f64:
#endif
preserve_caller_vec
/* zero v0..v31 using four m8 register groups */
vsetvli x7, x0, e8, m8
vxor.vv v0, v0, v0
vxor.vv v8, v8, v8
vxor.vv v16, v16, v16
vxor.vv v24, v24, v24
.vector.vfmacc.vv.f64f64f64.L1:
/* 64-bit elements, single registers; re-issued on every iteration */
vsetvli x7, x0, e64, m1
vfmacc.vv v8, v0, v1
vfmacc.vv v9, v0, v1
vfmacc.vv v10, v0, v1
vfmacc.vv v11, v0, v1
vfmacc.vv v12, v0, v1
vfmacc.vv v13, v0, v1
vfmacc.vv v14, v0, v1
vfmacc.vv v15, v0, v1
vfmacc.vv v16, v0, v1
vfmacc.vv v17, v0, v1
vfmacc.vv v18, v0, v1
vfmacc.vv v19, v0, v1
/* loop counter update placed in the middle of the instruction stream */
addi a0, a0, -1
vfmacc.vv v20, v0, v1
vfmacc.vv v21, v0, v1
vfmacc.vv v22, v0, v1
vfmacc.vv v23, v0, v1
vfmacc.vv v24, v0, v1
vfmacc.vv v25, v0, v1
vfmacc.vv v26, v0, v1
vfmacc.vv v27, v0, v1
vfmacc.vv v28, v0, v1
vfmacc.vv v29, v0, v1
vfmacc.vv v30, v0, v1
vfmacc.vv v31, v0, v1
bnez a0, .vector.vfmacc.vv.f64f64f64.L1
restore_caller_vec
ret
================================================
FILE: riscv64/cpufp.cpp
================================================
#include "table.hpp"
#include "smtl.hpp"
#include
#include
#include
#include
#include
#include
#include
#include
using namespace std;
// Benchmark kernels with C linkage, implemented in riscv64/asm/_IME_.S and
// riscv64/asm/_VECTOR_.S. Each kernel takes a single int64_t argument;
// thread_func() passes the registered loop_time value to it.
// A group is only declared when its feature macro is defined.
extern "C"
{
#ifdef _IME_
// Kernels for the vmadot instruction family.
void ime_vmadot_s32s8s8(int64_t);
void ime_vmadotu_u32u8u8(int64_t);
void ime_vmadotus_s32u8s8(int64_t);
void ime_vmadotsu_s32s8u8(int64_t);
void ime_vmadotslide_s32s8s8(int64_t);
#endif
#ifdef _VECTOR_
// Kernels for vfmacc in its .vf and .vv forms on 16/32/64-bit elements.
void vector_vfmacc_vf_f16f16f16(int64_t);
void vector_vfmacc_vv_f16f16f16(int64_t);
void vector_vfmacc_vf_f32f32f32(int64_t);
void vector_vfmacc_vv_f32f32f32(int64_t);
void vector_vfmacc_vf_f64f64f64(int64_t);
void vector_vfmacc_vv_f64f64f64(int64_t);
#endif
}
// Description of one registered benchmark (one row of the result table).
typedef struct
{
// Instruction-set name shown in the first table column.
std::string isa;
// Text describing the measured computation (second table column).
std::string type;
// Unit suffix appended after the G/T prefix of the result (e.g. "FLOPS").
std::string dim;
// Value handed to the kernel as its only argument.
int64_t loop_time;
// Multiplier applied to loop_time when computing performance; presumably
// the number of operations one kernel loop iteration performs.
int64_t comp_pl;
// Kernel entry point.
void (*bench)(int64_t);
} cpubm_t;
static vector bm_list;
// Returns the elapsed time from *start to *end in seconds.
static double get_time(struct timespec *start,
                       struct timespec *end)
{
    // Whole-second part of the interval.
    const double whole_seconds = end->tv_sec - start->tv_sec;
    // Nanosecond remainder (may be negative), scaled to seconds.
    const double frac_seconds = (end->tv_nsec - start->tv_nsec) * 1e-9;
    return whole_seconds + frac_seconds;
}
static void reg_new_isa(std::string isa,
std::string type,
std::string dim,
int64_t loop_time,
int64_t comp_pl,
void (*bench)(int64_t))
{
cpubm_t new_one;
new_one.isa = isa;
new_one.type = type;
new_one.dim = dim;
new_one.loop_time = loop_time;
new_one.comp_pl = comp_pl;
new_one.bench = bench;
bm_list.push_back(new_one);
}
static void thread_func(void *params)
{
cpubm_t *bm = (cpubm_t*)params;
bm->bench(bm->loop_time);
}
// Runs one benchmark on every thread of the pool and appends the result
// as a row (isa, computation, performance) to `table`.
//   sh    : thread-pool handle
//   item  : benchmark to run
//   table : result table receiving the new row
// The kernel is executed twice: one untimed warm-up pass followed by the
// timed pass.
static void cpubm_riscv64_one(smtl_handle sh,
                              cpubm_t &item,
                              Table &table)
{
    const int num_threads = smtl_num_threads(sh);
    struct timespec start, end;

    // warm up
    for (int t = 0; t < num_threads; t++)
    {
        smtl_add_task(sh, thread_func, (void*)&item);
    }
    smtl_begin_tasks(sh);
    smtl_wait_tasks_finished(sh);

    // timed pass
    clock_gettime(CLOCK_MONOTONIC_RAW, &start);
    for (int t = 0; t < num_threads; t++)
    {
        smtl_add_task(sh, thread_func, (void*)&item);
    }
    smtl_begin_tasks(sh);
    smtl_wait_tasks_finished(sh);
    clock_gettime(CLOCK_MONOTONIC_RAW, &end);

    const double time_used = get_time(&start, &end);

    // Total operation count, computed in double so that a large comp_pl
    // (it scales with the vector length) or thread count cannot overflow
    // a 64-bit integer product.
    const double total_ops = (double)item.loop_time *
        (double)item.comp_pl * (double)num_threads;
    double perf = total_ops / time_used;

    // Scale to Giga by default, Tera once the figure exceeds 1e12.
    char perfUnit = 'G';
    if (perf > 1e12)
    {
        perfUnit = 'T';
        perf /= 1e12;
    }
    else
    {
        perf /= 1e9;
    }

    std::stringstream ss;
    ss << std::setprecision(5) << perf << " " << perfUnit << item.dim;

    // The row type needs its element type spelled out; a bare vector
    // without initializer does not compile.
    std::vector<std::string> cont(3);
    cont[0] = item.isa;
    cont[1] = item.type;
    cont[2] = ss.str();
    table.addOneItem(cont);
}
static void cpubm_do_bench(std::vector &set_of_threads,
uint32_t idle_time)
{
int i;
if (bm_list.size() > 0)
{
int num_threads = set_of_threads.size();
printf("Number Threads: %d\n", num_threads);
printf("Thread Pool Binding:");
for (i = 0; i < num_threads; i++)
{
printf(" %d", set_of_threads[i]);
}
printf("\n");
// set table head
vector ti;
ti.resize(3);
ti[0] = "Instruction Set";
ti[1] = "Core Computation";
ti[2] = "Peak Performance";
Table table;
table.setColumnNum(3);
table.addOneItem(ti);
// set thread pool
smtl_handle sh;
smtl_init(&sh, set_of_threads);
// traverse task list
cpubm_riscv64_one(sh, bm_list[0], table);
for (i = 1; i < bm_list.size(); i++)
{
sleep(idle_time);
cpubm_riscv64_one(sh, bm_list[i], table);
}
table.print();
smtl_fini(sh);
}
else
{
printf("Sorry, there's no any supported SIMD isa.\n");
}
}
// Parses a core list such as "[0,3,5-8,13-15]" and appends every core id
// to set_of_threads.
//   sets           : text starting at the opening '['
//   set_of_threads : receives the parsed core ids, in input order
// Items are separated by ','. An item is either a single number or an
// inclusive range "a-b". Characters other than digits, ',', '-' and ']'
// are ignored. Nothing is parsed when the text does not start with '[';
// when the closing ']' is missing, the final item is dropped.
// Items that contain neither a digit nor a '-' (as in "[]", "[1,]" or
// "[1,,2]") are skipped; they used to be added as core 0.
static void parse_thread_pool(char *sets,
                              std::vector<int> &set_of_threads)
{
    if (sets[0] != '[')
    {
        return;
    }

    int left = 0, right = 0;
    bool in_range = false;   // true once the '-' of the current item was seen
    bool has_digit = false;  // current item contains at least one digit
    int pos = 1;

    for (;;)
    {
        const char c = sets[pos];
        if (c == '\0')
        {
            // Unterminated list: give up without flushing the last item.
            return;
        }

        if (c >= '0' && c <= '9')
        {
            const int digit = (int)(c - '0');
            if (in_range)
            {
                right = right * 10 + digit;
            }
            else
            {
                left = left * 10 + digit;
            }
            has_digit = true;
        }
        else if (c == '-' && !in_range)
        {
            right = 0;
            in_range = true;
        }
        else if (c == ',' || c == ']')
        {
            // End of the current item: flush it.
            if (in_range)
            {
                for (int core = left; core <= right; core++)
                {
                    set_of_threads.push_back(core);
                }
            }
            else if (has_digit)
            {
                set_of_threads.push_back(left);
            }
            if (c == ']')
            {
                return;
            }
            left = 0;
            in_range = false;
            has_digit = false;
        }
        pos++;
    }
}
// Registers every benchmark whose feature macro was defined at build time.
// The fifth argument of reg_new_isa() is multiplied with the loop count to
// obtain the reported figure (presumably the operations per kernel loop
// iteration). For the vector kernels it is 48 times the vl value that
// vsetvli returns for the element width in use (presumably 24 vfmacc
// instructions per iteration, two operations per element).
static void cpufp_register_isa()
{
    // Every kernel is given the same loop count.
    const int64_t loops = 0x10000000LL;
    (void)loops; // unused when no feature macro is defined

#ifdef _IME_
    reg_new_isa("ime", "vmadot(s32,s8,s8)", "OPS",
                loops, 3584LL, ime_vmadot_s32s8s8);
    reg_new_isa("ime", "vmadotu(u32,u8,u8)", "OPS",
                loops, 3584LL, ime_vmadotu_u32u8u8);
    reg_new_isa("ime", "vmadotus(s32,u8,s8)", "OPS",
                loops, 3584LL, ime_vmadotus_s32u8s8);
    reg_new_isa("ime", "vmadotsu(s32,s8,u8)", "OPS",
                loops, 3584LL, ime_vmadotsu_s32s8u8);
    reg_new_isa("ime", "vmadotslide(s32,s8,s8)", "OPS",
                loops, 3072LL, ime_vmadotslide_s32s8s8);
#endif

#ifdef _VECTOR_
    // vl reported by vsetvli for the requested element width with a
    // single register per operand (m1).
    size_t lanes = 0;

    // 16-bit elements
    __asm__ volatile("vsetvli %[avl], x0, e16, m1\n\t"
                     : [avl] "=r" (lanes)
                     :
                     : "cc");
    reg_new_isa("vector", "vfmacc.vf(f16,f16,f16)", "FLOPS",
                loops, 48LL * lanes, vector_vfmacc_vf_f16f16f16);
    reg_new_isa("vector", "vfmacc.vv(f16,f16,f16)", "FLOPS",
                loops, 48LL * lanes, vector_vfmacc_vv_f16f16f16);

    // 32-bit elements
    __asm__ volatile("vsetvli %[avl], x0, e32, m1\n\t"
                     : [avl] "=r" (lanes)
                     :
                     : "cc");
    reg_new_isa("vector", "vfmacc.vf(f32,f32,f32)", "FLOPS",
                loops, 48LL * lanes, vector_vfmacc_vf_f32f32f32);
    reg_new_isa("vector", "vfmacc.vv(f32,f32,f32)", "FLOPS",
                loops, 48LL * lanes, vector_vfmacc_vv_f32f32f32);

    // 64-bit elements
    __asm__ volatile("vsetvli %[avl], x0, e64, m1\n\t"
                     : [avl] "=r" (lanes)
                     :
                     : "cc");
    reg_new_isa("vector", "vfmacc.vf(f64,f64,f64)", "FLOPS",
                loops, 48LL * lanes, vector_vfmacc_vf_f64f64f64);
    reg_new_isa("vector", "vfmacc.vv(f64,f64,f64)", "FLOPS",
                loops, 48LL * lanes, vector_vfmacc_vv_f64f64f64);
#endif
}
// Entry point.
// Options:
//   --thread_pool=[list]  cores to benchmark, e.g. [0,3,5-8,13-15] (required)
//   --idle_time=N         seconds to sleep between benchmarks (default 0)
// Prints usage to stderr and exits with a failure status when
// --thread_pool is missing or names no core.
int main(int argc, char *argv[])
{
    std::vector<int> set_of_threads;
    uint32_t idle_time = 0;
    bool params_enough = false;

    for (int i = 1; i < argc; i++)
    {
        if (strncmp(argv[i], "--thread_pool=", 14) == 0)
        {
            parse_thread_pool(argv[i] + 14, set_of_threads);
            params_enough = true;
        }
        else if (strncmp(argv[i], "--idle_time=", 12) == 0)
        {
            // A negative value would wrap to a huge unsigned sleep time;
            // treat it as "no idle time".
            const int seconds = atoi(argv[i] + 12);
            idle_time = (seconds > 0) ? (uint32_t)seconds : 0;
        }
    }

    const char *problem = NULL;
    if (!params_enough)
    {
        problem = "You must set --thread_pool parameter.";
    }
    else if (set_of_threads.empty())
    {
        // e.g. the list did not start with '[' or was not closed with ']'
        problem = "--thread_pool does not name any core.";
    }

    if (problem != NULL)
    {
        fprintf(stderr, "Error: %s\n", problem);
        fprintf(stderr, "You may also set --idle_time parameter.\n");
        fprintf(stderr, "Usage: %s --thread_pool=[xxx] --idle_time=yyy\n", argv[0]);
        fprintf(stderr, "[xxx] indicates all cores to benchmark.\n");
        fprintf(stderr, "Example: [0,3,5-8,13-15].\n");
        fprintf(stderr, "idle_time is the interval time(s) between every two benchmarks.\n");
        fprintf(stderr, "idle_time parameter can be ignored, the default value is 0s.\n");
        fprintf(stderr, "Notice: there must NOT be any spaces.\n");
        // A usage error must not report success to the calling shell.
        exit(EXIT_FAILURE);
    }

    cpufp_register_isa();
    cpubm_do_bench(set_of_threads, idle_time);
    return 0;
}
================================================
FILE: riscv64/cpuid.c
================================================
#include
#include
#include
#include
#include
#include
/* AT_HWCAP mask tested for the vector extension: one bit per letter,
 * with 'a' at bit 0, so 'v' is bit 21. */
#define ISA_V_HWCAP (1 << ('v' - 'a'))
/* Prints one feature macro name per line on stdout:
 *   _IME_    when the first /proc/cpuinfo line whose key contains
 *            "vendorid" has a value starting with "0x710"
 *   _VECTOR_ when AT_HWCAP has the ISA_V_HWCAP bit set
 * Returns 1 (with a diagnostic on stderr) when /proc/cpuinfo cannot be
 * opened, 0 otherwise. */
int main()
{
    static const char delimiter[] = ":";
    static const char search_key[] = "vendorid";
    char line[1024];
    const char *vendor_id = NULL;

    FILE *cpuinfo_file = fopen("/proc/cpuinfo", "r");
    if (cpuinfo_file == NULL) {
        /* stdout carries the macro names, so the diagnostic goes to stderr */
        fprintf(stderr, "Failed to open /proc/cpuinfo\n");
        return 1;
    }

    while (fgets(line, sizeof(line), cpuinfo_file)) {
        char *key = strtok(line, delimiter);
        if (key == NULL || strstr(key, search_key) == NULL) {
            continue;
        }
        /* The value is missing when the line has no ':' or nothing after
         * it; strtok then returns NULL, which must not reach strspn. */
        char *value = strtok(NULL, delimiter);
        if (value == NULL) {
            continue;
        }
        /* vendor_id points into `line`, which stays untouched after the
         * loop is left. */
        vendor_id = value + strspn(value, " \t");
        break;
    }
    fclose(cpuinfo_file);

    if (vendor_id != NULL && strncmp(vendor_id, "0x710", 5) == 0) {
        printf("_IME_\n");
    }

    uint64_t hwcaps = getauxval(AT_HWCAP);
#ifdef ISA_V_HWCAP
    if (hwcaps & ISA_V_HWCAP)
    {
        printf("_VECTOR_\n");
    }
#endif
    return 0;
}
================================================
FILE: x64/asm/_AMX_BF16_.S
================================================
/* amx_bf16_mm_f32bf16bf16(%rdi = loop count, %rsi = tile configuration)
 * Throughput kernel for tdpbf16ps.
 * The tile configuration is loaded from the memory %rsi points to, tiles
 * tmm0..tmm5 are zeroed, then each loop iteration issues four tdpbf16ps
 * instructions that read tmm4 and tmm5 and accumulate into tmm0..tmm3.
 * The loop ends when %rdi reaches zero, so it must be non-zero on entry.
 * NOTE(review): the tile state is not released before returning --
 * confirm the caller does not rely on that. */
.globl amx_bf16_mm_f32bf16bf16
amx_bf16_mm_f32bf16bf16:
ldtilecfg (%rsi)
tilezero %tmm0
tilezero %tmm1
tilezero %tmm2
tilezero %tmm3
tilezero %tmm4
tilezero %tmm5
.amx.bf16.mm.f32bf16bf16:
tdpbf16ps %tmm4, %tmm5, %tmm0
tdpbf16ps %tmm4, %tmm5, %tmm1
tdpbf16ps %tmm4, %tmm5, %tmm2
tdpbf16ps %tmm4, %tmm5, %tmm3
/* sub sets ZF when the counter reaches zero; jne loops otherwise */
sub $0x1, %rdi
jne .amx.bf16.mm.f32bf16bf16
ret
================================================
FILE: x64/asm/_AMX_FP16_.S
================================================
/* amx_fp16_mm_f32f16f16(%rdi = loop count, %rsi = tile configuration)
 * Throughput kernel for tdpfp16ps.
 * The tile configuration is loaded from the memory %rsi points to, tiles
 * tmm0..tmm5 are zeroed, then each loop iteration issues four tdpfp16ps
 * instructions that read tmm4 and tmm5 and accumulate into tmm0..tmm3.
 * The loop ends when %rdi reaches zero, so it must be non-zero on entry.
 * NOTE(review): the tile state is not released before returning --
 * confirm the caller does not rely on that. */
.globl amx_fp16_mm_f32f16f16
amx_fp16_mm_f32f16f16:
ldtilecfg (%rsi)
tilezero %tmm0
tilezero %tmm1
tilezero %tmm2
tilezero %tmm3
tilezero %tmm4
tilezero %tmm5
.amx.fp16.mm.f32f16f16:
tdpfp16ps %tmm4, %tmm5, %tmm0
tdpfp16ps %tmm4, %tmm5, %tmm1
tdpfp16ps %tmm4, %tmm5, %tmm2
tdpfp16ps %tmm4, %tmm5, %tmm3
/* sub sets ZF when the counter reaches zero; jne loops otherwise */
sub $0x1, %rdi
jne .amx.fp16.mm.f32f16f16
ret
================================================
FILE: x64/asm/_AMX_INT8_.S
================================================
/* Exported AMX INT8 kernels. All four share the same shape and differ only
 * in the tile dot-product instruction used. */
.globl amx_int8_mm_s32s8s8
.globl amx_int8_mm_s32s8u8
.globl amx_int8_mm_s32u8s8
.globl amx_int8_mm_s32u8u8
/* amx_int8_mm_s32s8s8(%rdi = loop count, %rsi = tile configuration)
 * Throughput kernel for tdpbssd.
 * The tile configuration is loaded from the memory %rsi points to, tiles
 * tmm0..tmm5 are zeroed, then each loop iteration issues four tdpbssd
 * instructions that read tmm4 and tmm5 and accumulate into tmm0..tmm3.
 * %rdi must be non-zero on entry. */
amx_int8_mm_s32s8s8:
ldtilecfg (%rsi)
tilezero %tmm0
tilezero %tmm1
tilezero %tmm2
tilezero %tmm3
tilezero %tmm4
tilezero %tmm5
.amx.int8.mm.s32s8s8:
tdpbssd %tmm4, %tmm5, %tmm0
tdpbssd %tmm4, %tmm5, %tmm1
tdpbssd %tmm4, %tmm5, %tmm2
tdpbssd %tmm4, %tmm5, %tmm3
/* sub sets ZF when the counter reaches zero; jne loops otherwise */
sub $0x1, %rdi
jne .amx.int8.mm.s32s8s8
ret
/* amx_int8_mm_s32s8u8(%rdi = loop count, %rsi = tile configuration)
 * Throughput kernel for tdpbsud: same shape as amx_int8_mm_s32s8s8, four
 * instructions per loop iteration reading tmm4/tmm5 and accumulating into
 * tmm0..tmm3. %rdi must be non-zero on entry. */
amx_int8_mm_s32s8u8:
ldtilecfg (%rsi)
tilezero %tmm0
tilezero %tmm1
tilezero %tmm2
tilezero %tmm3
tilezero %tmm4
tilezero %tmm5
.amx.int8.mm.s32s8u8:
tdpbsud %tmm4, %tmm5, %tmm0
tdpbsud %tmm4, %tmm5, %tmm1
tdpbsud %tmm4, %tmm5, %tmm2
tdpbsud %tmm4, %tmm5, %tmm3
/* sub sets ZF when the counter reaches zero; jne loops otherwise */
sub $0x1, %rdi
jne .amx.int8.mm.s32s8u8
ret
/* amx_int8_mm_s32u8s8(%rdi = loop count, %rsi = tile configuration)
 * Throughput kernel for tdpbusd: same shape as amx_int8_mm_s32s8s8, four
 * instructions per loop iteration reading tmm4/tmm5 and accumulating into
 * tmm0..tmm3. %rdi must be non-zero on entry. */
amx_int8_mm_s32u8s8:
ldtilecfg (%rsi)
tilezero %tmm0
tilezero %tmm1
tilezero %tmm2
tilezero %tmm3
tilezero %tmm4
tilezero %tmm5
.amx.int8.mm.s32u8s8:
tdpbusd %tmm4, %tmm5, %tmm0
tdpbusd %tmm4, %tmm5, %tmm1
tdpbusd %tmm4, %tmm5, %tmm2
tdpbusd %tmm4, %tmm5, %tmm3
/* sub sets ZF when the counter reaches zero; jne loops otherwise */
sub $0x1, %rdi
jne .amx.int8.mm.s32u8s8
ret
/* amx_int8_mm_s32u8u8(%rdi = loop count, %rsi = tile configuration)
 * Throughput kernel for tdpbuud: same shape as amx_int8_mm_s32s8s8, four
 * instructions per loop iteration reading tmm4/tmm5 and accumulating into
 * tmm0..tmm3. %rdi must be non-zero on entry. */
amx_int8_mm_s32u8u8:
ldtilecfg (%rsi)
tilezero %tmm0
tilezero %tmm1
tilezero %tmm2
tilezero %tmm3
tilezero %tmm4
tilezero %tmm5
.amx.int8.mm.s32u8u8:
tdpbuud %tmm4, %tmm5, %tmm0
tdpbuud %tmm4, %tmm5, %tmm1
tdpbuud %tmm4, %tmm5, %tmm2
tdpbuud %tmm4, %tmm5, %tmm3
/* sub sets ZF when the counter reaches zero; jne loops otherwise */
sub $0x1, %rdi
jne .amx.int8.mm.s32u8u8
ret
================================================
FILE: x64/asm/_AVX512F_.S
================================================
/* Exported AVX-512F kernels (512-bit registers). */
.globl avx512f_512b_fma_f32f32f32
.globl avx512f_512b_fma_f64f64f64
.globl avx512f_512b_add_mul_f32f32_f32
.globl avx512f_512b_add_mul_f64f64_f64
/* avx512f_512b_fma_f32f32f32(%rdi = loop count)
 * Throughput kernel for single-precision fused multiply-add.
 * zmm0..zmm15 are zeroed, then each loop iteration issues 16 vfmadd231ps
 * instructions. Every instruction uses one register for all three
 * operands, so the 16 instructions are independent of each other.
 * %rdi must be non-zero on entry. */
avx512f_512b_fma_f32f32f32:
vpxord %zmm0, %zmm0, %zmm0
vpxord %zmm1, %zmm1, %zmm1
vpxord %zmm2, %zmm2, %zmm2
vpxord %zmm3, %zmm3, %zmm3
vpxord %zmm4, %zmm4, %zmm4
vpxord %zmm5, %zmm5, %zmm5
vpxord %zmm6, %zmm6, %zmm6
vpxord %zmm7, %zmm7, %zmm7
vpxord %zmm8, %zmm8, %zmm8
vpxord %zmm9, %zmm9, %zmm9
vpxord %zmm10, %zmm10, %zmm10
vpxord %zmm11, %zmm11, %zmm11
vpxord %zmm12, %zmm12, %zmm12
vpxord %zmm13, %zmm13, %zmm13
vpxord %zmm14, %zmm14, %zmm14
vpxord %zmm15, %zmm15, %zmm15
.avx512f.512b.fma.f32f32f32.L1:
vfmadd231ps %zmm0, %zmm0, %zmm0
vfmadd231ps %zmm1, %zmm1, %zmm1
vfmadd231ps %zmm2, %zmm2, %zmm2
vfmadd231ps %zmm3, %zmm3, %zmm3
vfmadd231ps %zmm4, %zmm4, %zmm4
vfmadd231ps %zmm5, %zmm5, %zmm5
vfmadd231ps %zmm6, %zmm6, %zmm6
vfmadd231ps %zmm7, %zmm7, %zmm7
vfmadd231ps %zmm8, %zmm8, %zmm8
vfmadd231ps %zmm9, %zmm9, %zmm9
vfmadd231ps %zmm10, %zmm10, %zmm10
vfmadd231ps %zmm11, %zmm11, %zmm11
vfmadd231ps %zmm12, %zmm12, %zmm12
vfmadd231ps %zmm13, %zmm13, %zmm13
vfmadd231ps %zmm14, %zmm14, %zmm14
vfmadd231ps %zmm15, %zmm15, %zmm15
/* sub sets ZF when the counter reaches zero; jne loops otherwise */
sub $0x1, %rdi
jne .avx512f.512b.fma.f32f32f32.L1
ret
/* avx512f_512b_fma_f64f64f64(%rdi = loop count)
 * Throughput kernel for double-precision fused multiply-add.
 * zmm0..zmm15 are zeroed, then each loop iteration issues 16 vfmadd231pd
 * instructions. Every instruction uses one register for all three
 * operands, so the 16 instructions are independent of each other.
 * %rdi must be non-zero on entry. */
avx512f_512b_fma_f64f64f64:
vpxord %zmm0, %zmm0, %zmm0
vpxord %zmm1, %zmm1, %zmm1
vpxord %zmm2, %zmm2, %zmm2
vpxord %zmm3, %zmm3, %zmm3
vpxord %zmm4, %zmm4, %zmm4
vpxord %zmm5, %zmm5, %zmm5
vpxord %zmm6, %zmm6, %zmm6
vpxord %zmm7, %zmm7, %zmm7
vpxord %zmm8, %zmm8, %zmm8
vpxord %zmm9, %zmm9, %zmm9
vpxord %zmm10, %zmm10, %zmm10
vpxord %zmm11, %zmm11, %zmm11
vpxord %zmm12, %zmm12, %zmm12
vpxord %zmm13, %zmm13, %zmm13
vpxord %zmm14, %zmm14, %zmm14
vpxord %zmm15, %zmm15, %zmm15
.avx512f.512b.fma.f64f64f64.L1:
vfmadd231pd %zmm0, %zmm0, %zmm0
vfmadd231pd %zmm1, %zmm1, %zmm1
vfmadd231pd %zmm2, %zmm2, %zmm2
vfmadd231pd %zmm3, %zmm3, %zmm3
vfmadd231pd %zmm4, %zmm4, %zmm4
vfmadd231pd %zmm5, %zmm5, %zmm5
vfmadd231pd %zmm6, %zmm6, %zmm6
vfmadd231pd %zmm7, %zmm7, %zmm7
vfmadd231pd %zmm8, %zmm8, %zmm8
vfmadd231pd %zmm9, %zmm9, %zmm9
vfmadd231pd %zmm10, %zmm10, %zmm10
vfmadd231pd %zmm11, %zmm11, %zmm11
vfmadd231pd %zmm12, %zmm12, %zmm12
vfmadd231pd %zmm13, %zmm13, %zmm13
vfmadd231pd %zmm14, %zmm14, %zmm14
vfmadd231pd %zmm15, %zmm15, %zmm15
/* sub sets ZF when the counter reaches zero; jne loops otherwise */
sub $0x1, %rdi
jne .avx512f.512b.fma.f64f64f64.L1
ret
/* avx512f_512b_add_mul_f32f32_f32(%rdi = loop count)
 * Throughput kernel mixing single-precision multiplies and adds.
 * zmm0..zmm15 are zeroed, then each loop iteration issues 8 vmulps (even
 * registers) interleaved with 8 vaddps (odd registers). Every instruction
 * works on one register only, so all 16 are independent of each other.
 * %rdi must be non-zero on entry. */
avx512f_512b_add_mul_f32f32_f32:
vpxord %zmm0, %zmm0, %zmm0
vpxord %zmm1, %zmm1, %zmm1
vpxord %zmm2, %zmm2, %zmm2
vpxord %zmm3, %zmm3, %zmm3
vpxord %zmm4, %zmm4, %zmm4
vpxord %zmm5, %zmm5, %zmm5
vpxord %zmm6, %zmm6, %zmm6
vpxord %zmm7, %zmm7, %zmm7
vpxord %zmm8, %zmm8, %zmm8
vpxord %zmm9, %zmm9, %zmm9
vpxord %zmm10, %zmm10, %zmm10
vpxord %zmm11, %zmm11, %zmm11
vpxord %zmm12, %zmm12, %zmm12
vpxord %zmm13, %zmm13, %zmm13
vpxord %zmm14, %zmm14, %zmm14
vpxord %zmm15, %zmm15, %zmm15
.avx512f.512b.add.mul.f32f32.f32.L1:
vmulps %zmm0, %zmm0, %zmm0
vaddps %zmm1, %zmm1, %zmm1
vmulps %zmm2, %zmm2, %zmm2
vaddps %zmm3, %zmm3, %zmm3
vmulps %zmm4, %zmm4, %zmm4
vaddps %zmm5, %zmm5, %zmm5
vmulps %zmm6, %zmm6, %zmm6
vaddps %zmm7, %zmm7, %zmm7
vmulps %zmm8, %zmm8, %zmm8
vaddps %zmm9, %zmm9, %zmm9
vmulps %zmm10, %zmm10, %zmm10
vaddps %zmm11, %zmm11, %zmm11
vmulps %zmm12, %zmm12, %zmm12
vaddps %zmm13, %zmm13, %zmm13
vmulps %zmm14, %zmm14, %zmm14
vaddps %zmm15, %zmm15, %zmm15
/* sub sets ZF when the counter reaches zero; jne loops otherwise */
sub $0x1, %rdi
jne .avx512f.512b.add.mul.f32f32.f32.L1
ret
/* avx512f_512b_add_mul_f64f64_f64(%rdi = loop count)
 * Throughput kernel mixing double-precision multiplies and adds.
 * zmm0..zmm15 are zeroed, then each loop iteration issues 8 vmulpd (even
 * registers) interleaved with 8 vaddpd (odd registers). Every instruction
 * works on one register only, so all 16 are independent of each other.
 * %rdi must be non-zero on entry. */
avx512f_512b_add_mul_f64f64_f64:
vpxord %zmm0, %zmm0, %zmm0
vpxord %zmm1, %zmm1, %zmm1
vpxord %zmm2, %zmm2, %zmm2
vpxord %zmm3, %zmm3, %zmm3
vpxord %zmm4, %zmm4, %zmm4
vpxord %zmm5, %zmm5, %zmm5
vpxord %zmm6, %zmm6, %zmm6
vpxord %zmm7, %zmm7, %zmm7
vpxord %zmm8, %zmm8, %zmm8
vpxord %zmm9, %zmm9, %zmm9
vpxord %zmm10, %zmm10, %zmm10
vpxord %zmm11, %zmm11, %zmm11
vpxord %zmm12, %zmm12, %zmm12
vpxord %zmm13, %zmm13, %zmm13
vpxord %zmm14, %zmm14, %zmm14
vpxord %zmm15, %zmm15, %zmm15
.avx512f.512b.add.mul.f64f64.f64.L1:
vmulpd %zmm0, %zmm0, %zmm0
vaddpd %zmm1, %zmm1, %zmm1
vmulpd %zmm2, %zmm2, %zmm2
vaddpd %zmm3, %zmm3, %zmm3
vmulpd %zmm4, %zmm4, %zmm4
vaddpd %zmm5, %zmm5, %zmm5
vmulpd %zmm6, %zmm6, %zmm6
vaddpd %zmm7, %zmm7, %zmm7
vmulpd %zmm8, %zmm8, %zmm8
vaddpd %zmm9, %zmm9, %zmm9
vmulpd %zmm10, %zmm10, %zmm10
vaddpd %zmm11, %zmm11, %zmm11
vmulpd %zmm12, %zmm12, %zmm12
vaddpd %zmm13, %zmm13, %zmm13
vmulpd %zmm14, %zmm14, %zmm14
vaddpd %zmm15, %zmm15, %zmm15
/* sub sets ZF when the counter reaches zero; jne loops otherwise */
sub $0x1, %rdi
jne .avx512f.512b.add.mul.f64f64.f64.L1
ret
================================================
FILE: x64/asm/_AVX512_BF16_.S
================================================
/* Exported AVX512_BF16 kernels for 512-, 256- and 128-bit registers. */
.globl avx512_bf16_512b_dp2a_f32bf16bf16
.globl avx512_bf16_256b_dp2a_f32bf16bf16
.globl avx512_bf16_128b_dp2a_f32bf16bf16
/* avx512_bf16_512b_dp2a_f32bf16bf16(%rdi = loop count)
 * Throughput kernel for vdpbf16ps on 512-bit registers.
 * zmm0..zmm16 are zeroed, then each loop iteration issues 16 vdpbf16ps
 * instructions that all read zmm16 for both sources and accumulate into
 * zmm0..zmm15, one accumulator per instruction.
 * %rdi must be non-zero on entry. */
avx512_bf16_512b_dp2a_f32bf16bf16:
vpxord %zmm0, %zmm0, %zmm0
vpxord %zmm1, %zmm1, %zmm1
vpxord %zmm2, %zmm2, %zmm2
vpxord %zmm3, %zmm3, %zmm3
vpxord %zmm4, %zmm4, %zmm4
vpxord %zmm5, %zmm5, %zmm5
vpxord %zmm6, %zmm6, %zmm6
vpxord %zmm7, %zmm7, %zmm7
vpxord %zmm8, %zmm8, %zmm8
vpxord %zmm9, %zmm9, %zmm9
vpxord %zmm10, %zmm10, %zmm10
vpxord %zmm11, %zmm11, %zmm11
vpxord %zmm12, %zmm12, %zmm12
vpxord %zmm13, %zmm13, %zmm13
vpxord %zmm14, %zmm14, %zmm14
vpxord %zmm15, %zmm15, %zmm15
vpxord %zmm16, %zmm16, %zmm16
.avx512.bf16.512b.dp2a.fp32bf16bf16.L1:
vdpbf16ps %zmm16, %zmm16, %zmm0
vdpbf16ps %zmm16, %zmm16, %zmm1
vdpbf16ps %zmm16, %zmm16, %zmm2
vdpbf16ps %zmm16, %zmm16, %zmm3
vdpbf16ps %zmm16, %zmm16, %zmm4
vdpbf16ps %zmm16, %zmm16, %zmm5
vdpbf16ps %zmm16, %zmm16, %zmm6
vdpbf16ps %zmm16, %zmm16, %zmm7
vdpbf16ps %zmm16, %zmm16, %zmm8
vdpbf16ps %zmm16, %zmm16, %zmm9
vdpbf16ps %zmm16, %zmm16, %zmm10
vdpbf16ps %zmm16, %zmm16, %zmm11
vdpbf16ps %zmm16, %zmm16, %zmm12
vdpbf16ps %zmm16, %zmm16, %zmm13
vdpbf16ps %zmm16, %zmm16, %zmm14
vdpbf16ps %zmm16, %zmm16, %zmm15
/* sub sets ZF when the counter reaches zero; jne loops otherwise */
sub $0x1, %rdi
jne .avx512.bf16.512b.dp2a.fp32bf16bf16.L1
ret
/* avx512_bf16_256b_dp2a_f32bf16bf16(%rdi = loop count)
 * Throughput kernel for vdpbf16ps on 256-bit registers.
 * ymm0..ymm15 and the full zmm16 are zeroed, then each loop iteration
 * issues 16 vdpbf16ps instructions that all read ymm16 for both sources
 * and accumulate into ymm0..ymm15, one accumulator per instruction.
 * %rdi must be non-zero on entry. */
avx512_bf16_256b_dp2a_f32bf16bf16:
vpxor %ymm0, %ymm0, %ymm0
vpxor %ymm1, %ymm1, %ymm1
vpxor %ymm2, %ymm2, %ymm2
vpxor %ymm3, %ymm3, %ymm3
vpxor %ymm4, %ymm4, %ymm4
vpxor %ymm5, %ymm5, %ymm5
vpxor %ymm6, %ymm6, %ymm6
vpxor %ymm7, %ymm7, %ymm7
vpxor %ymm8, %ymm8, %ymm8
vpxor %ymm9, %ymm9, %ymm9
vpxor %ymm10, %ymm10, %ymm10
vpxor %ymm11, %ymm11, %ymm11
vpxor %ymm12, %ymm12, %ymm12
vpxor %ymm13, %ymm13, %ymm13
vpxor %ymm14, %ymm14, %ymm14
vpxor %ymm15, %ymm15, %ymm15
vpxord %zmm16, %zmm16, %zmm16
.avx512.bf16.256b.dp2a.fp32bf16bf16.L1:
vdpbf16ps %ymm16, %ymm16, %ymm0
vdpbf16ps %ymm16, %ymm16, %ymm1
vdpbf16ps %ymm16, %ymm16, %ymm2
vdpbf16ps %ymm16, %ymm16, %ymm3
vdpbf16ps %ymm16, %ymm16, %ymm4
vdpbf16ps %ymm16, %ymm16, %ymm5
vdpbf16ps %ymm16, %ymm16, %ymm6
vdpbf16ps %ymm16, %ymm16, %ymm7
vdpbf16ps %ymm16, %ymm16, %ymm8
vdpbf16ps %ymm16, %ymm16, %ymm9
vdpbf16ps %ymm16, %ymm16, %ymm10
vdpbf16ps %ymm16, %ymm16, %ymm11
vdpbf16ps %ymm16, %ymm16, %ymm12
vdpbf16ps %ymm16, %ymm16, %ymm13
vdpbf16ps %ymm16, %ymm16, %ymm14
vdpbf16ps %ymm16, %ymm16, %ymm15
/* sub sets ZF when the counter reaches zero; jne loops otherwise */
sub $0x1, %rdi
jne .avx512.bf16.256b.dp2a.fp32bf16bf16.L1
ret
/*
 * avx512_bf16_128b_dp2a_f32bf16bf16 -- 128-bit vdpbf16ps benchmark kernel.
 * In: %rdi = number of loop passes (presumably the first integer argument,
 *     System V AMD64 style -- confirm against the C prototype).
 * Each pass issues 16 vdpbf16ps, one per accumulator xmm0-xmm15, all reading
 * xmm16, so the 16 accumulator chains are independent of each other.
 * A count of 0 returns at once instead of wrapping around to 2^64 passes.
 * Clobbers: %rdi, flags, zmm0-zmm16 (upper bits are zeroed).
 */
avx512_bf16_128b_dp2a_f32bf16bf16:
    test %rdi, %rdi
    jz .avx512.bf16.128b.dp2a.fp32bf16bf16.done
    /* VEX-encoded zeroing: avoids mixing legacy-SSE encodings with the EVEX loop. */
    vpxor %xmm0, %xmm0, %xmm0
    vpxor %xmm1, %xmm1, %xmm1
    vpxor %xmm2, %xmm2, %xmm2
    vpxor %xmm3, %xmm3, %xmm3
    vpxor %xmm4, %xmm4, %xmm4
    vpxor %xmm5, %xmm5, %xmm5
    vpxor %xmm6, %xmm6, %xmm6
    vpxor %xmm7, %xmm7, %xmm7
    vpxor %xmm8, %xmm8, %xmm8
    vpxor %xmm9, %xmm9, %xmm9
    vpxor %xmm10, %xmm10, %xmm10
    vpxor %xmm11, %xmm11, %xmm11
    vpxor %xmm12, %xmm12, %xmm12
    vpxor %xmm13, %xmm13, %xmm13
    vpxor %xmm14, %xmm14, %xmm14
    vpxor %xmm15, %xmm15, %xmm15
    /*
     * xmm16 needs an EVEX encoding; the 128-bit form still clears all of
     * zmm16 and keeps this kernel free of 512-bit instructions.
     */
    vpxord %xmm16, %xmm16, %xmm16
.avx512.bf16.128b.dp2a.fp32bf16bf16.L1:
    vdpbf16ps %xmm16, %xmm16, %xmm0
    vdpbf16ps %xmm16, %xmm16, %xmm1
    vdpbf16ps %xmm16, %xmm16, %xmm2
    vdpbf16ps %xmm16, %xmm16, %xmm3
    vdpbf16ps %xmm16, %xmm16, %xmm4
    vdpbf16ps %xmm16, %xmm16, %xmm5
    vdpbf16ps %xmm16, %xmm16, %xmm6
    vdpbf16ps %xmm16, %xmm16, %xmm7
    vdpbf16ps %xmm16, %xmm16, %xmm8
    vdpbf16ps %xmm16, %xmm16, %xmm9
    vdpbf16ps %xmm16, %xmm16, %xmm10
    vdpbf16ps %xmm16, %xmm16, %xmm11
    vdpbf16ps %xmm16, %xmm16, %xmm12
    vdpbf16ps %xmm16, %xmm16, %xmm13
    vdpbf16ps %xmm16, %xmm16, %xmm14
    vdpbf16ps %xmm16, %xmm16, %xmm15
    sub $0x1, %rdi
    jne .avx512.bf16.128b.dp2a.fp32bf16bf16.L1
.avx512.bf16.128b.dp2a.fp32bf16bf16.done:
    ret
================================================
FILE: x64/asm/_AVX512_FP16_.S
================================================
/* Make the AVX512-FP16 FMA kernels (512-, 256- and 128-bit variants) visible to the linker. */
.globl avx512_fp16_512b_fma_f16f16f16
.globl avx512_fp16_256b_fma_f16f16f16
.globl avx512_fp16_128b_fma_f16f16f16
/*
 * avx512_fp16_512b_fma_f16f16f16 -- 512-bit vfmadd231ph benchmark kernel.
 * In: %rdi = number of loop passes (presumably the first integer argument,
 *     System V AMD64 style -- confirm against the C prototype).
 * Each pass issues 16 vfmadd231ph, one per register zmm0-zmm15, each using
 * only its own register, so the 16 chains are independent of each other.
 * A count of 0 returns at once instead of wrapping around to 2^64 passes.
 * Clobbers: %rdi, flags, zmm0-zmm15.
 */
avx512_fp16_512b_fma_f16f16f16:
    test %rdi, %rdi
    jz .avx512.fp16.512b.fma.f16f16f16.done
    vpxord %zmm0, %zmm0, %zmm0
    vpxord %zmm1, %zmm1, %zmm1
    vpxord %zmm2, %zmm2, %zmm2
    vpxord %zmm3, %zmm3, %zmm3
    vpxord %zmm4, %zmm4, %zmm4
    vpxord %zmm5, %zmm5, %zmm5
    vpxord %zmm6, %zmm6, %zmm6
    vpxord %zmm7, %zmm7, %zmm7
    vpxord %zmm8, %zmm8, %zmm8
    vpxord %zmm9, %zmm9, %zmm9
    vpxord %zmm10, %zmm10, %zmm10
    vpxord %zmm11, %zmm11, %zmm11
    vpxord %zmm12, %zmm12, %zmm12
    vpxord %zmm13, %zmm13, %zmm13
    vpxord %zmm14, %zmm14, %zmm14
    vpxord %zmm15, %zmm15, %zmm15
.avx512.fp16.512b.fma.f16f16f16.L1:
    vfmadd231ph %zmm0, %zmm0, %zmm0
    vfmadd231ph %zmm1, %zmm1, %zmm1
    vfmadd231ph %zmm2, %zmm2, %zmm2
    vfmadd231ph %zmm3, %zmm3, %zmm3
    vfmadd231ph %zmm4, %zmm4, %zmm4
    vfmadd231ph %zmm5, %zmm5, %zmm5
    vfmadd231ph %zmm6, %zmm6, %zmm6
    vfmadd231ph %zmm7, %zmm7, %zmm7
    vfmadd231ph %zmm8, %zmm8, %zmm8
    vfmadd231ph %zmm9, %zmm9, %zmm9
    vfmadd231ph %zmm10, %zmm10, %zmm10
    vfmadd231ph %zmm11, %zmm11, %zmm11
    vfmadd231ph %zmm12, %zmm12, %zmm12
    vfmadd231ph %zmm13, %zmm13, %zmm13
    vfmadd231ph %zmm14, %zmm14, %zmm14
    vfmadd231ph %zmm15, %zmm15, %zmm15
    sub $0x1, %rdi
    jne .avx512.fp16.512b.fma.f16f16f16.L1
    /* Clear the dirty upper halves of zmm0-zmm15 so SSE code run afterwards is not penalised. */
    vzeroupper
.avx512.fp16.512b.fma.f16f16f16.done:
    ret
/*
 * avx512_fp16_256b_fma_f16f16f16 -- 256-bit vfmadd231ph benchmark kernel.
 * In: %rdi = number of loop passes (presumably the first integer argument,
 *     System V AMD64 style -- confirm against the C prototype).
 * Each pass issues 16 vfmadd231ph, one per register ymm0-ymm15, each using
 * only its own register, so the 16 chains are independent of each other.
 * A count of 0 returns at once instead of wrapping around to 2^64 passes.
 * Clobbers: %rdi, flags, ymm0-ymm15.
 */
avx512_fp16_256b_fma_f16f16f16:
    test %rdi, %rdi
    jz .avx512.fp16.256b.fma.f16f16f16.done
    vpxor %ymm0, %ymm0, %ymm0
    vpxor %ymm1, %ymm1, %ymm1
    vpxor %ymm2, %ymm2, %ymm2
    vpxor %ymm3, %ymm3, %ymm3
    vpxor %ymm4, %ymm4, %ymm4
    vpxor %ymm5, %ymm5, %ymm5
    vpxor %ymm6, %ymm6, %ymm6
    vpxor %ymm7, %ymm7, %ymm7
    vpxor %ymm8, %ymm8, %ymm8
    vpxor %ymm9, %ymm9, %ymm9
    vpxor %ymm10, %ymm10, %ymm10
    vpxor %ymm11, %ymm11, %ymm11
    vpxor %ymm12, %ymm12, %ymm12
    vpxor %ymm13, %ymm13, %ymm13
    vpxor %ymm14, %ymm14, %ymm14
    vpxor %ymm15, %ymm15, %ymm15
.avx512.fp16.256b.fma.f16f16f16.L1:
    vfmadd231ph %ymm0, %ymm0, %ymm0
    vfmadd231ph %ymm1, %ymm1, %ymm1
    vfmadd231ph %ymm2, %ymm2, %ymm2
    vfmadd231ph %ymm3, %ymm3, %ymm3
    vfmadd231ph %ymm4, %ymm4, %ymm4
    vfmadd231ph %ymm5, %ymm5, %ymm5
    vfmadd231ph %ymm6, %ymm6, %ymm6
    vfmadd231ph %ymm7, %ymm7, %ymm7
    vfmadd231ph %ymm8, %ymm8, %ymm8
    vfmadd231ph %ymm9, %ymm9, %ymm9
    vfmadd231ph %ymm10, %ymm10, %ymm10
    vfmadd231ph %ymm11, %ymm11, %ymm11
    vfmadd231ph %ymm12, %ymm12, %ymm12
    vfmadd231ph %ymm13, %ymm13, %ymm13
    vfmadd231ph %ymm14, %ymm14, %ymm14
    vfmadd231ph %ymm15, %ymm15, %ymm15
    sub $0x1, %rdi
    jne .avx512.fp16.256b.fma.f16f16f16.L1
    /* Clear the dirty upper halves of ymm0-ymm15 so SSE code run afterwards is not penalised. */
    vzeroupper
.avx512.fp16.256b.fma.f16f16f16.done:
    ret
/*
 * avx512_fp16_128b_fma_f16f16f16 -- 128-bit vfmadd231ph benchmark kernel.
 * In: %rdi = number of loop passes (presumably the first integer argument,
 *     System V AMD64 style -- confirm against the C prototype).
 * Each pass issues 16 vfmadd231ph, one per register xmm0-xmm15, each using
 * only its own register, so the 16 chains are independent of each other.
 * A count of 0 returns at once instead of wrapping around to 2^64 passes.
 * Clobbers: %rdi, flags, xmm0-xmm15 (upper vector bits are zeroed).
 */
avx512_fp16_128b_fma_f16f16f16:
    test %rdi, %rdi
    jz .avx512.fp16.128b.fma.f16f16f16.done
    /* VEX-encoded zeroing: avoids mixing legacy-SSE encodings with the EVEX loop. */
    vpxor %xmm0, %xmm0, %xmm0
    vpxor %xmm1, %xmm1, %xmm1
    vpxor %xmm2, %xmm2, %xmm2
    vpxor %xmm3, %xmm3, %xmm3
    vpxor %xmm4, %xmm4, %xmm4
    vpxor %xmm5, %xmm5, %xmm5
    vpxor %xmm6, %xmm6, %xmm6
    vpxor %xmm7, %xmm7, %xmm7
    vpxor %xmm8, %xmm8, %xmm8
    vpxor %xmm9, %xmm9, %xmm9
    vpxor %xmm10, %xmm10, %xmm10
    vpxor %xmm11, %xmm11, %xmm11
    vpxor %xmm12, %xmm12, %xmm12
    vpxor %xmm13, %xmm13, %xmm13
    vpxor %xmm14, %xmm14, %xmm14
    vpxor %xmm15, %xmm15, %xmm15
.avx512.fp16.128b.fma.f16f16f16.L1:
    vfmadd231ph %xmm0, %xmm0, %xmm0
    vfmadd231ph %xmm1, %xmm1, %xmm1
    vfmadd231ph %xmm2, %xmm2, %xmm2
    vfmadd231ph %xmm3, %xmm3, %xmm3
    vfmadd231ph %xmm4, %xmm4, %xmm4
    vfmadd231ph %xmm5, %xmm5, %xmm5
    vfmadd231ph %xmm6, %xmm6, %xmm6
    vfmadd231ph %xmm7, %xmm7, %xmm7
    vfmadd231ph %xmm8, %xmm8, %xmm8
    vfmadd231ph %xmm9, %xmm9, %xmm9
    vfmadd231ph %xmm10, %xmm10, %xmm10
    vfmadd231ph %xmm11, %xmm11, %xmm11
    vfmadd231ph %xmm12, %xmm12, %xmm12
    vfmadd231ph %xmm13, %xmm13, %xmm13
    vfmadd231ph %xmm14, %xmm14, %xmm14
    vfmadd231ph %xmm15, %xmm15, %xmm15
    sub $0x1, %rdi
    jne .avx512.fp16.128b.fma.f16f16f16.L1
.avx512.fp16.128b.fma.f16f16f16.done:
    ret
================================================
FILE: x64/asm/_AVX512_VNNI_.S
================================================
/* Make the AVX512-VNNI kernels (vpdpbusd and vpdpwssd at 512/256/128 bits) visible to the linker. */
.globl avx512_vnni_512b_dp4a_s32u8s8
.globl avx512_vnni_512b_dp2a_s32s16s16
.globl avx512_vnni_256b_dp4a_s32u8s8
.globl avx512_vnni_256b_dp2a_s32s16s16
.globl avx512_vnni_128b_dp4a_s32u8s8
.globl avx512_vnni_128b_dp2a_s32s16s16
/*
 * avx512_vnni_512b_dp4a_s32u8s8 -- 512-bit vpdpbusd benchmark kernel.
 * In: %rdi = number of loop passes (presumably the first integer argument,
 *     System V AMD64 style -- confirm against the C prototype).
 * Each pass issues 16 vpdpbusd, one per register zmm0-zmm15, each using
 * only its own register, so the 16 chains are independent of each other.
 * A count of 0 returns at once instead of wrapping around to 2^64 passes.
 * Clobbers: %rdi, flags, zmm0-zmm15.
 */
avx512_vnni_512b_dp4a_s32u8s8:
    test %rdi, %rdi
    jz .avx512.vnni.512b.dp4a.s32u8s8.done
    vpxord %zmm0, %zmm0, %zmm0
    vpxord %zmm1, %zmm1, %zmm1
    vpxord %zmm2, %zmm2, %zmm2
    vpxord %zmm3, %zmm3, %zmm3
    vpxord %zmm4, %zmm4, %zmm4
    vpxord %zmm5, %zmm5, %zmm5
    vpxord %zmm6, %zmm6, %zmm6
    vpxord %zmm7, %zmm7, %zmm7
    vpxord %zmm8, %zmm8, %zmm8
    vpxord %zmm9, %zmm9, %zmm9
    vpxord %zmm10, %zmm10, %zmm10
    vpxord %zmm11, %zmm11, %zmm11
    vpxord %zmm12, %zmm12, %zmm12
    vpxord %zmm13, %zmm13, %zmm13
    vpxord %zmm14, %zmm14, %zmm14
    vpxord %zmm15, %zmm15, %zmm15
.avx512.vnni.512b.dp4a.s32u8s8.L1:
    vpdpbusd %zmm0, %zmm0, %zmm0
    vpdpbusd %zmm1, %zmm1, %zmm1
    vpdpbusd %zmm2, %zmm2, %zmm2
    vpdpbusd %zmm3, %zmm3, %zmm3
    vpdpbusd %zmm4, %zmm4, %zmm4
    vpdpbusd %zmm5, %zmm5, %zmm5
    vpdpbusd %zmm6, %zmm6, %zmm6
    vpdpbusd %zmm7, %zmm7, %zmm7
    vpdpbusd %zmm8, %zmm8, %zmm8
    vpdpbusd %zmm9, %zmm9, %zmm9
    vpdpbusd %zmm10, %zmm10, %zmm10
    vpdpbusd %zmm11, %zmm11, %zmm11
    vpdpbusd %zmm12, %zmm12, %zmm12
    vpdpbusd %zmm13, %zmm13, %zmm13
    vpdpbusd %zmm14, %zmm14, %zmm14
    vpdpbusd %zmm15, %zmm15, %zmm15
    sub $0x1, %rdi
    jne .avx512.vnni.512b.dp4a.s32u8s8.L1
    /* Clear the dirty upper halves of zmm0-zmm15 so SSE code run afterwards is not penalised. */
    vzeroupper
.avx512.vnni.512b.dp4a.s32u8s8.done:
    ret
/*
 * avx512_vnni_512b_dp2a_s32s16s16 -- 512-bit vpdpwssd benchmark kernel.
 * In: %rdi = number of loop passes (presumably the first integer argument,
 *     System V AMD64 style -- confirm against the C prototype).
 * Each pass issues 16 vpdpwssd, one per register zmm0-zmm15, each using
 * only its own register, so the 16 chains are independent of each other.
 * A count of 0 returns at once instead of wrapping around to 2^64 passes.
 * Clobbers: %rdi, flags, zmm0-zmm15.
 */
avx512_vnni_512b_dp2a_s32s16s16:
    test %rdi, %rdi
    jz .avx512.vnni.512b.dp2a.s32s16s16.done
    vpxord %zmm0, %zmm0, %zmm0
    vpxord %zmm1, %zmm1, %zmm1
    vpxord %zmm2, %zmm2, %zmm2
    vpxord %zmm3, %zmm3, %zmm3
    vpxord %zmm4, %zmm4, %zmm4
    vpxord %zmm5, %zmm5, %zmm5
    vpxord %zmm6, %zmm6, %zmm6
    vpxord %zmm7, %zmm7, %zmm7
    vpxord %zmm8, %zmm8, %zmm8
    vpxord %zmm9, %zmm9, %zmm9
    vpxord %zmm10, %zmm10, %zmm10
    vpxord %zmm11, %zmm11, %zmm11
    vpxord %zmm12, %zmm12, %zmm12
    vpxord %zmm13, %zmm13, %zmm13
    vpxord %zmm14, %zmm14, %zmm14
    vpxord %zmm15, %zmm15, %zmm15
.avx512.vnni.512b.dp2a.s32s16s16.L1:
    vpdpwssd %zmm0, %zmm0, %zmm0
    vpdpwssd %zmm1, %zmm1, %zmm1
    vpdpwssd %zmm2, %zmm2, %zmm2
    vpdpwssd %zmm3, %zmm3, %zmm3
    vpdpwssd %zmm4, %zmm4, %zmm4
    vpdpwssd %zmm5, %zmm5, %zmm5
    vpdpwssd %zmm6, %zmm6, %zmm6
    vpdpwssd %zmm7, %zmm7, %zmm7
    vpdpwssd %zmm8, %zmm8, %zmm8
    vpdpwssd %zmm9, %zmm9, %zmm9
    vpdpwssd %zmm10, %zmm10, %zmm10
    vpdpwssd %zmm11, %zmm11, %zmm11
    vpdpwssd %zmm12, %zmm12, %zmm12
    vpdpwssd %zmm13, %zmm13, %zmm13
    vpdpwssd %zmm14, %zmm14, %zmm14
    vpdpwssd %zmm15, %zmm15, %zmm15
    sub $0x1, %rdi
    jne .avx512.vnni.512b.dp2a.s32s16s16.L1
    /* Clear the dirty upper halves of zmm0-zmm15 so SSE code run afterwards is not penalised. */
    vzeroupper
.avx512.vnni.512b.dp2a.s32s16s16.done:
    ret
/*
 * avx512_vnni_256b_dp4a_s32u8s8 -- 256-bit vpdpbusd benchmark kernel.
 * In: %rdi = number of loop passes (presumably the first integer argument,
 *     System V AMD64 style -- confirm against the C prototype).
 * Each pass issues 16 vpdpbusd, one per register ymm0-ymm15, each using
 * only its own register, so the 16 chains are independent of each other.
 * A count of 0 returns at once instead of wrapping around to 2^64 passes.
 * Clobbers: %rdi, flags, ymm0-ymm15.
 */
avx512_vnni_256b_dp4a_s32u8s8:
    test %rdi, %rdi
    jz .avx512.vnni.256b.dp4a.s32u8s8.done
    vpxor %ymm0, %ymm0, %ymm0
    vpxor %ymm1, %ymm1, %ymm1
    vpxor %ymm2, %ymm2, %ymm2
    vpxor %ymm3, %ymm3, %ymm3
    vpxor %ymm4, %ymm4, %ymm4
    vpxor %ymm5, %ymm5, %ymm5
    vpxor %ymm6, %ymm6, %ymm6
    vpxor %ymm7, %ymm7, %ymm7
    vpxor %ymm8, %ymm8, %ymm8
    vpxor %ymm9, %ymm9, %ymm9
    vpxor %ymm10, %ymm10, %ymm10
    vpxor %ymm11, %ymm11, %ymm11
    vpxor %ymm12, %ymm12, %ymm12
    vpxor %ymm13, %ymm13, %ymm13
    vpxor %ymm14, %ymm14, %ymm14
    vpxor %ymm15, %ymm15, %ymm15
.avx512.vnni.256b.dp4a.s32u8s8.L1:
    vpdpbusd %ymm0, %ymm0, %ymm0
    vpdpbusd %ymm1, %ymm1, %ymm1
    vpdpbusd %ymm2, %ymm2, %ymm2
    vpdpbusd %ymm3, %ymm3, %ymm3
    vpdpbusd %ymm4, %ymm4, %ymm4
    vpdpbusd %ymm5, %ymm5, %ymm5
    vpdpbusd %ymm6, %ymm6, %ymm6
    vpdpbusd %ymm7, %ymm7, %ymm7
    vpdpbusd %ymm8, %ymm8, %ymm8
    vpdpbusd %ymm9, %ymm9, %ymm9
    vpdpbusd %ymm10, %ymm10, %ymm10
    vpdpbusd %ymm11, %ymm11, %ymm11
    vpdpbusd %ymm12, %ymm12, %ymm12
    vpdpbusd %ymm13, %ymm13, %ymm13
    vpdpbusd %ymm14, %ymm14, %ymm14
    vpdpbusd %ymm15, %ymm15, %ymm15
    sub $0x1, %rdi
    jne .avx512.vnni.256b.dp4a.s32u8s8.L1
    /* Clear the dirty upper halves of ymm0-ymm15 so SSE code run afterwards is not penalised. */
    vzeroupper
.avx512.vnni.256b.dp4a.s32u8s8.done:
    ret
/*
 * avx512_vnni_256b_dp2a_s32s16s16 -- 256-bit vpdpwssd benchmark kernel.
 * In: %rdi = number of loop passes (presumably the first integer argument,
 *     System V AMD64 style -- confirm against the C prototype).
 * Each pass issues 16 vpdpwssd, one per register ymm0-ymm15, each using
 * only its own register, so the 16 chains are independent of each other.
 * A count of 0 returns at once instead of wrapping around to 2^64 passes.
 * Clobbers: %rdi, flags, ymm0-ymm15.
 */
avx512_vnni_256b_dp2a_s32s16s16:
    test %rdi, %rdi
    jz .avx512.vnni.256b.dp2a.s32s16s16.done
    vpxor %ymm0, %ymm0, %ymm0
    vpxor %ymm1, %ymm1, %ymm1
    vpxor %ymm2, %ymm2, %ymm2
    vpxor %ymm3, %ymm3, %ymm3
    vpxor %ymm4, %ymm4, %ymm4
    vpxor %ymm5, %ymm5, %ymm5
    vpxor %ymm6, %ymm6, %ymm6
    vpxor %ymm7, %ymm7, %ymm7
    vpxor %ymm8, %ymm8, %ymm8
    vpxor %ymm9, %ymm9, %ymm9
    vpxor %ymm10, %ymm10, %ymm10
    vpxor %ymm11, %ymm11, %ymm11
    vpxor %ymm12, %ymm12, %ymm12
    vpxor %ymm13, %ymm13, %ymm13
    vpxor %ymm14, %ymm14, %ymm14
    vpxor %ymm15, %ymm15, %ymm15
.avx512.vnni.256b.dp2a.s32s16s16.L1:
    vpdpwssd %ymm0, %ymm0, %ymm0
    vpdpwssd %ymm1, %ymm1, %ymm1
    vpdpwssd %ymm2, %ymm2, %ymm2
    vpdpwssd %ymm3, %ymm3, %ymm3
    vpdpwssd %ymm4, %ymm4, %ymm4
    vpdpwssd %ymm5, %ymm5, %ymm5
    vpdpwssd %ymm6, %ymm6, %ymm6
    vpdpwssd %ymm7, %ymm7, %ymm7
    vpdpwssd %ymm8, %ymm8, %ymm8
    vpdpwssd %ymm9, %ymm9, %ymm9
    vpdpwssd %ymm10, %ymm10, %ymm10
    vpdpwssd %ymm11, %ymm11, %ymm11
    vpdpwssd %ymm12, %ymm12, %ymm12
    vpdpwssd %ymm13, %ymm13, %ymm13
    vpdpwssd %ymm14, %ymm14, %ymm14
    vpdpwssd %ymm15, %ymm15, %ymm15
    sub $0x1, %rdi
    jne .avx512.vnni.256b.dp2a.s32s16s16.L1
    /* Clear the dirty upper halves of ymm0-ymm15 so SSE code run afterwards is not penalised. */
    vzeroupper
.avx512.vnni.256b.dp2a.s32s16s16.done:
    ret
/*
 * avx512_vnni_128b_dp4a_s32u8s8 -- 128-bit vpdpbusd benchmark kernel.
 * In: %rdi = number of loop passes (presumably the first integer argument,
 *     System V AMD64 style -- confirm against the C prototype).
 * Each pass issues 16 vpdpbusd, one per register xmm0-xmm15, each using
 * only its own register, so the 16 chains are independent of each other.
 * A count of 0 returns at once instead of wrapping around to 2^64 passes.
 * Clobbers: %rdi, flags, xmm0-xmm15 (upper vector bits are zeroed).
 */
avx512_vnni_128b_dp4a_s32u8s8:
    test %rdi, %rdi
    jz .avx512.vnni.128b.dp4a.s32u8s8.done
    /* VEX-encoded zeroing: avoids mixing legacy-SSE encodings with the EVEX loop. */
    vpxor %xmm0, %xmm0, %xmm0
    vpxor %xmm1, %xmm1, %xmm1
    vpxor %xmm2, %xmm2, %xmm2
    vpxor %xmm3, %xmm3, %xmm3
    vpxor %xmm4, %xmm4, %xmm4
    vpxor %xmm5, %xmm5, %xmm5
    vpxor %xmm6, %xmm6, %xmm6
    vpxor %xmm7, %xmm7, %xmm7
    vpxor %xmm8, %xmm8, %xmm8
    vpxor %xmm9, %xmm9, %xmm9
    vpxor %xmm10, %xmm10, %xmm10
    vpxor %xmm11, %xmm11, %xmm11
    vpxor %xmm12, %xmm12, %xmm12
    vpxor %xmm13, %xmm13, %xmm13
    vpxor %xmm14, %xmm14, %xmm14
    vpxor %xmm15, %xmm15, %xmm15
.avx512.vnni.128b.dp4a.s32u8s8.L1:
    vpdpbusd %xmm0, %xmm0, %xmm0
    vpdpbusd %xmm1, %xmm1, %xmm1
    vpdpbusd %xmm2, %xmm2, %xmm2
    vpdpbusd %xmm3, %xmm3, %xmm3
    vpdpbusd %xmm4, %xmm4, %xmm4
    vpdpbusd %xmm5, %xmm5, %xmm5
    vpdpbusd %xmm6, %xmm6, %xmm6
    vpdpbusd %xmm7, %xmm7, %xmm7
    vpdpbusd %xmm8, %xmm8, %xmm8
    vpdpbusd %xmm9, %xmm9, %xmm9
    vpdpbusd %xmm10, %xmm10, %xmm10
    vpdpbusd %xmm11, %xmm11, %xmm11
    vpdpbusd %xmm12, %xmm12, %xmm12
    vpdpbusd %xmm13, %xmm13, %xmm13
    vpdpbusd %xmm14, %xmm14, %xmm14
    vpdpbusd %xmm15, %xmm15, %xmm15
    sub $0x1, %rdi
    jne .avx512.vnni.128b.dp4a.s32u8s8.L1
.avx512.vnni.128b.dp4a.s32u8s8.done:
    ret
/*
 * avx512_vnni_128b_dp2a_s32s16s16 -- 128-bit vpdpwssd benchmark kernel.
 * In: %rdi = number of loop passes (presumably the first integer argument,
 *     System V AMD64 style -- confirm against the C prototype).
 * Each pass issues 16 vpdpwssd, one per register xmm0-xmm15, each using
 * only its own register, so the 16 chains are independent of each other.
 * A count of 0 returns at once instead of wrapping around to 2^64 passes.
 * Clobbers: %rdi, flags, xmm0-xmm15 (upper vector bits are zeroed).
 */
avx512_vnni_128b_dp2a_s32s16s16:
    test %rdi, %rdi
    jz .avx512.vnni.128b.dp2a.s32s16s16.done
    /* VEX-encoded zeroing: avoids mixing legacy-SSE encodings with the EVEX loop. */
    vpxor %xmm0, %xmm0, %xmm0
    vpxor %xmm1, %xmm1, %xmm1
    vpxor %xmm2, %xmm2, %xmm2
    vpxor %xmm3, %xmm3, %xmm3
    vpxor %xmm4, %xmm4, %xmm4
    vpxor %xmm5, %xmm5, %xmm5
    vpxor %xmm6, %xmm6, %xmm6
    vpxor %xmm7, %xmm7, %xmm7
    vpxor %xmm8, %xmm8, %xmm8
    vpxor %xmm9, %xmm9, %xmm9
    vpxor %xmm10, %xmm10, %xmm10
    vpxor %xmm11, %xmm11, %xmm11
    vpxor %xmm12, %xmm12, %xmm12
    vpxor %xmm13, %xmm13, %xmm13
    vpxor %xmm14, %xmm14, %xmm14
    vpxor %xmm15, %xmm15, %xmm15
.avx512.vnni.128b.dp2a.s32s16s16.L1:
    vpdpwssd %xmm0, %xmm0, %xmm0
    vpdpwssd %xmm1, %xmm1, %xmm1
    vpdpwssd %xmm2, %xmm2, %xmm2
    vpdpwssd %xmm3, %xmm3, %xmm3
    vpdpwssd %xmm4, %xmm4, %xmm4
    vpdpwssd %xmm5, %xmm5, %xmm5
    vpdpwssd %xmm6, %xmm6, %xmm6
    vpdpwssd %xmm7, %xmm7, %xmm7
    vpdpwssd %xmm8, %xmm8, %xmm8
    vpdpwssd %xmm9, %xmm9, %xmm9
    vpdpwssd %xmm10, %xmm10, %xmm10
    vpdpwssd %xmm11, %xmm11, %xmm11
    vpdpwssd %xmm12, %xmm12, %xmm12
    vpdpwssd %xmm13, %xmm13, %xmm13
    vpdpwssd %xmm14, %xmm14, %xmm14
    vpdpwssd %xmm15, %xmm15, %xmm15
    sub $0x1, %rdi
    jne .avx512.vnni.128b.dp2a.s32s16s16.L1
.avx512.vnni.128b.dp2a.s32s16s16.done:
    ret
================================================
FILE: x64/asm/_AVX_.S
================================================
/* Make the 256-bit AVX add/mul kernels (single and double precision) visible to the linker. */
.globl avx_256b_add_mul_f32f32_f32
.globl avx_256b_add_mul_f64f64_f64
/*
 * avx_256b_add_mul_f32f32_f32 -- 256-bit vmulps/vaddps benchmark kernel.
 * In: %rdi = number of loop passes (presumably the first integer argument,
 *     System V AMD64 style -- confirm against the C prototype).
 * Each pass issues 8 vmulps (even registers) interleaved with 8 vaddps (odd
 * registers); every instruction uses only its own ymm register, so the 16
 * chains are independent of each other.
 * A count of 0 returns at once instead of wrapping around to 2^64 passes.
 * Clobbers: %rdi, flags, ymm0-ymm15.
 */
avx_256b_add_mul_f32f32_f32:
    test %rdi, %rdi
    jz .avx.256b.add.mul.f32f32.f32.done
    vxorps %ymm0, %ymm0, %ymm0
    vxorps %ymm1, %ymm1, %ymm1
    vxorps %ymm2, %ymm2, %ymm2
    vxorps %ymm3, %ymm3, %ymm3
    vxorps %ymm4, %ymm4, %ymm4
    vxorps %ymm5, %ymm5, %ymm5
    vxorps %ymm6, %ymm6, %ymm6
    vxorps %ymm7, %ymm7, %ymm7
    vxorps %ymm8, %ymm8, %ymm8
    vxorps %ymm9, %ymm9, %ymm9
    vxorps %ymm10, %ymm10, %ymm10
    vxorps %ymm11, %ymm11, %ymm11
    vxorps %ymm12, %ymm12, %ymm12
    vxorps %ymm13, %ymm13, %ymm13
    vxorps %ymm14, %ymm14, %ymm14
    vxorps %ymm15, %ymm15, %ymm15
.avx.256b.add.mul.f32f32.f32.L1:
    vmulps %ymm0, %ymm0, %ymm0
    vaddps %ymm1, %ymm1, %ymm1
    vmulps %ymm2, %ymm2, %ymm2
    vaddps %ymm3, %ymm3, %ymm3
    vmulps %ymm4, %ymm4, %ymm4
    vaddps %ymm5, %ymm5, %ymm5
    vmulps %ymm6, %ymm6, %ymm6
    vaddps %ymm7, %ymm7, %ymm7
    vmulps %ymm8, %ymm8, %ymm8
    vaddps %ymm9, %ymm9, %ymm9
    vmulps %ymm10, %ymm10, %ymm10
    vaddps %ymm11, %ymm11, %ymm11
    vmulps %ymm12, %ymm12, %ymm12
    vaddps %ymm13, %ymm13, %ymm13
    vmulps %ymm14, %ymm14, %ymm14
    vaddps %ymm15, %ymm15, %ymm15
    sub $0x1, %rdi
    jne .avx.256b.add.mul.f32f32.f32.L1
    /* Clear the dirty upper halves of ymm0-ymm15 so SSE code run afterwards is not penalised. */
    vzeroupper
.avx.256b.add.mul.f32f32.f32.done:
    ret
/*
 * avx_256b_add_mul_f64f64_f64 -- 256-bit vmulpd/vaddpd benchmark kernel.
 * In: %rdi = number of loop passes (presumably the first integer argument,
 *     System V AMD64 style -- confirm against the C prototype).
 * Each pass issues 8 vmulpd (even registers) interleaved with 8 vaddpd (odd
 * registers); every instruction uses only its own ymm register, so the 16
 * chains are independent of each other.
 * A count of 0 returns at once instead of wrapping around to 2^64 passes.
 * Clobbers: %rdi, flags, ymm0-ymm15.
 */
avx_256b_add_mul_f64f64_f64:
    test %rdi, %rdi
    jz .avx.256b.add.mul.f64f64.f64.done
    vxorpd %ymm0, %ymm0, %ymm0
    vxorpd %ymm1, %ymm1, %ymm1
    vxorpd %ymm2, %ymm2, %ymm2
    vxorpd %ymm3, %ymm3, %ymm3
    vxorpd %ymm4, %ymm4, %ymm4
    vxorpd %ymm5, %ymm5, %ymm5
    vxorpd %ymm6, %ymm6, %ymm6
    vxorpd %ymm7, %ymm7, %ymm7
    vxorpd %ymm8, %ymm8, %ymm8
    vxorpd %ymm9, %ymm9, %ymm9
    vxorpd %ymm10, %ymm10, %ymm10
    vxorpd %ymm11, %ymm11, %ymm11
    vxorpd %ymm12, %ymm12, %ymm12
    vxorpd %ymm13, %ymm13, %ymm13
    vxorpd %ymm14, %ymm14, %ymm14
    vxorpd %ymm15, %ymm15, %ymm15
.avx.256b.add.mul.f64f64.f64.L1:
    vmulpd %ymm0, %ymm0, %ymm0
    vaddpd %ymm1, %ymm1, %ymm1
    vmulpd %ymm2, %ymm2, %ymm2
    vaddpd %ymm3, %ymm3, %ymm3
    vmulpd %ymm4, %ymm4, %ymm4
    vaddpd %ymm5, %ymm5, %ymm5
    vmulpd %ymm6, %ymm6, %ymm6
    vaddpd %ymm7, %ymm7, %ymm7
    vmulpd %ymm8, %ymm8, %ymm8
    vaddpd %ymm9, %ymm9, %ymm9
    vmulpd %ymm10, %ymm10, %ymm10
    vaddpd %ymm11, %ymm11, %ymm11
    vmulpd %ymm12, %ymm12, %ymm12
    vaddpd %ymm13, %ymm13, %ymm13
    vmulpd %ymm14, %ymm14, %ymm14
    vaddpd %ymm15, %ymm15, %ymm15
    sub $0x1, %rdi
    jne .avx.256b.add.mul.f64f64.f64.L1
    /* Clear the dirty upper halves of ymm0-ymm15 so SSE code run afterwards is not penalised. */
    vzeroupper
.avx.256b.add.mul.f64f64.f64.done:
    ret
================================================
FILE: x64/asm/_AVX_VNNI_.S
================================================
/* Make the VEX-encoded AVX-VNNI kernels (vpdpbusd and vpdpwssd at 256/128 bits) visible to the linker. */
.globl avx_vnni_256b_dp4a_s32u8s8
.globl avx_vnni_256b_dp2a_s32s16s16
.globl avx_vnni_128b_dp4a_s32u8s8
.globl avx_vnni_128b_dp2a_s32s16s16
/*
 * avx_vnni_256b_dp4a_s32u8s8 -- 256-bit VEX-encoded vpdpbusd benchmark kernel.
 * In: %rdi = number of loop passes (presumably the first integer argument,
 *     System V AMD64 style -- confirm against the C prototype).
 * Each pass issues 16 {vex} vpdpbusd, one per register ymm0-ymm15, each using
 * only its own register, so the 16 chains are independent of each other.
 * A count of 0 returns at once instead of wrapping around to 2^64 passes.
 * Clobbers: %rdi, flags, ymm0-ymm15.
 */
avx_vnni_256b_dp4a_s32u8s8:
    test %rdi, %rdi
    jz .avx.vnni.256b.dp4a.s32u8s8.done
    vpxor %ymm0, %ymm0, %ymm0
    vpxor %ymm1, %ymm1, %ymm1
    vpxor %ymm2, %ymm2, %ymm2
    vpxor %ymm3, %ymm3, %ymm3
    vpxor %ymm4, %ymm4, %ymm4
    vpxor %ymm5, %ymm5, %ymm5
    vpxor %ymm6, %ymm6, %ymm6
    vpxor %ymm7, %ymm7, %ymm7
    vpxor %ymm8, %ymm8, %ymm8
    vpxor %ymm9, %ymm9, %ymm9
    vpxor %ymm10, %ymm10, %ymm10
    vpxor %ymm11, %ymm11, %ymm11
    vpxor %ymm12, %ymm12, %ymm12
    vpxor %ymm13, %ymm13, %ymm13
    vpxor %ymm14, %ymm14, %ymm14
    vpxor %ymm15, %ymm15, %ymm15
.avx.vnni.256b.dp4a.s32u8s8.L1:
    {vex} vpdpbusd %ymm0, %ymm0, %ymm0
    {vex} vpdpbusd %ymm1, %ymm1, %ymm1
    {vex} vpdpbusd %ymm2, %ymm2, %ymm2
    {vex} vpdpbusd %ymm3, %ymm3, %ymm3
    {vex} vpdpbusd %ymm4, %ymm4, %ymm4
    {vex} vpdpbusd %ymm5, %ymm5, %ymm5
    {vex} vpdpbusd %ymm6, %ymm6, %ymm6
    {vex} vpdpbusd %ymm7, %ymm7, %ymm7
    {vex} vpdpbusd %ymm8, %ymm8, %ymm8
    {vex} vpdpbusd %ymm9, %ymm9, %ymm9
    {vex} vpdpbusd %ymm10, %ymm10, %ymm10
    {vex} vpdpbusd %ymm11, %ymm11, %ymm11
    {vex} vpdpbusd %ymm12, %ymm12, %ymm12
    {vex} vpdpbusd %ymm13, %ymm13, %ymm13
    {vex} vpdpbusd %ymm14, %ymm14, %ymm14
    {vex} vpdpbusd %ymm15, %ymm15, %ymm15
    sub $0x1, %rdi
    jne .avx.vnni.256b.dp4a.s32u8s8.L1
    /* Clear the dirty upper halves of ymm0-ymm15 so SSE code run afterwards is not penalised. */
    vzeroupper
.avx.vnni.256b.dp4a.s32u8s8.done:
    ret
/*
 * avx_vnni_256b_dp2a_s32s16s16 -- 256-bit VEX-encoded vpdpwssd benchmark kernel.
 * In: %rdi = number of loop passes (presumably the first integer argument,
 *     System V AMD64 style -- confirm against the C prototype).
 * Each pass issues 16 {vex} vpdpwssd, one per register ymm0-ymm15, each using
 * only its own register, so the 16 chains are independent of each other.
 * A count of 0 returns at once instead of wrapping around to 2^64 passes.
 * Clobbers: %rdi, flags, ymm0-ymm15.
 */
avx_vnni_256b_dp2a_s32s16s16:
    test %rdi, %rdi
    jz .avx.vnni.256b.dp2a.s32s16s16.done
    vpxor %ymm0, %ymm0, %ymm0
    vpxor %ymm1, %ymm1, %ymm1
    vpxor %ymm2, %ymm2, %ymm2
    vpxor %ymm3, %ymm3, %ymm3
    vpxor %ymm4, %ymm4, %ymm4
    vpxor %ymm5, %ymm5, %ymm5
    vpxor %ymm6, %ymm6, %ymm6
    vpxor %ymm7, %ymm7, %ymm7
    vpxor %ymm8, %ymm8, %ymm8
    vpxor %ymm9, %ymm9, %ymm9
    vpxor %ymm10, %ymm10, %ymm10
    vpxor %ymm11, %ymm11, %ymm11
    vpxor %ymm12, %ymm12, %ymm12
    vpxor %ymm13, %ymm13, %ymm13
    vpxor %ymm14, %ymm14, %ymm14
    vpxor %ymm15, %ymm15, %ymm15
.avx.vnni.256b.dp2a.s32s16s16.L1:
    {vex} vpdpwssd %ymm0, %ymm0, %ymm0
    {vex} vpdpwssd %ymm1, %ymm1, %ymm1
    {vex} vpdpwssd %ymm2, %ymm2, %ymm2
    {vex} vpdpwssd %ymm3, %ymm3, %ymm3
    {vex} vpdpwssd %ymm4, %ymm4, %ymm4
    {vex} vpdpwssd %ymm5, %ymm5, %ymm5
    {vex} vpdpwssd %ymm6, %ymm6, %ymm6
    {vex} vpdpwssd %ymm7, %ymm7, %ymm7
    {vex} vpdpwssd %ymm8, %ymm8, %ymm8
    {vex} vpdpwssd %ymm9, %ymm9, %ymm9
    {vex} vpdpwssd %ymm10, %ymm10, %ymm10
    {vex} vpdpwssd %ymm11, %ymm11, %ymm11
    {vex} vpdpwssd %ymm12, %ymm12, %ymm12
    {vex} vpdpwssd %ymm13, %ymm13, %ymm13
    {vex} vpdpwssd %ymm14, %ymm14, %ymm14
    {vex} vpdpwssd %ymm15, %ymm15, %ymm15
    sub $0x1, %rdi
    jne .avx.vnni.256b.dp2a.s32s16s16.L1
    /* Clear the dirty upper halves of ymm0-ymm15 so SSE code run afterwards is not penalised. */
    vzeroupper
.avx.vnni.256b.dp2a.s32s16s16.done:
    ret
/*
 * avx_vnni_128b_dp4a_s32u8s8 -- 128-bit VEX-encoded vpdpbusd benchmark kernel.
 * In: %rdi = number of loop passes (presumably the first integer argument,
 *     System V AMD64 style -- confirm against the C prototype).
 * Each pass issues 16 {vex} vpdpbusd, one per register xmm0-xmm15, each using
 * only its own register, so the 16 chains are independent of each other.
 * A count of 0 returns at once instead of wrapping around to 2^64 passes.
 * Clobbers: %rdi, flags, xmm0-xmm15 (upper vector bits are zeroed).
 */
avx_vnni_128b_dp4a_s32u8s8:
    test %rdi, %rdi
    jz .avx.vnni.128b.dp4a.s32u8s8.done
    /* VEX-encoded zeroing: avoids mixing legacy-SSE encodings with the VEX loop. */
    vpxor %xmm0, %xmm0, %xmm0
    vpxor %xmm1, %xmm1, %xmm1
    vpxor %xmm2, %xmm2, %xmm2
    vpxor %xmm3, %xmm3, %xmm3
    vpxor %xmm4, %xmm4, %xmm4
    vpxor %xmm5, %xmm5, %xmm5
    vpxor %xmm6, %xmm6, %xmm6
    vpxor %xmm7, %xmm7, %xmm7
    vpxor %xmm8, %xmm8, %xmm8
    vpxor %xmm9, %xmm9, %xmm9
    vpxor %xmm10, %xmm10, %xmm10
    vpxor %xmm11, %xmm11, %xmm11
    vpxor %xmm12, %xmm12, %xmm12
    vpxor %xmm13, %xmm13, %xmm13
    vpxor %xmm14, %xmm14, %xmm14
    vpxor %xmm15, %xmm15, %xmm15
.avx.vnni.128b.dp4a.s32u8s8.L1:
    {vex} vpdpbusd %xmm0, %xmm0, %xmm0
    {vex} vpdpbusd %xmm1, %xmm1, %xmm1
    {vex} vpdpbusd %xmm2, %xmm2, %xmm2
    {vex} vpdpbusd %xmm3, %xmm3, %xmm3
    {vex} vpdpbusd %xmm4, %xmm4, %xmm4
    {vex} vpdpbusd %xmm5, %xmm5, %xmm5
    {vex} vpdpbusd %xmm6, %xmm6, %xmm6
    {vex} vpdpbusd %xmm7, %xmm7, %xmm7
    {vex} vpdpbusd %xmm8, %xmm8, %xmm8
    {vex} vpdpbusd %xmm9, %xmm9, %xmm9
    {vex} vpdpbusd %xmm10, %xmm10, %xmm10
    {vex} vpdpbusd %xmm11, %xmm11, %xmm11
    {vex} vpdpbusd %xmm12, %xmm12, %xmm12
    {vex} vpdpbusd %xmm13, %xmm13, %xmm13
    {vex} vpdpbusd %xmm14, %xmm14, %xmm14
    {vex} vpdpbusd %xmm15, %xmm15, %xmm15
    sub $0x1, %rdi
    jne .avx.vnni.128b.dp4a.s32u8s8.L1
.avx.vnni.128b.dp4a.s32u8s8.done:
    ret
/*
 * avx_vnni_128b_dp2a_s32s16s16 -- 128-bit VEX-encoded vpdpwssd benchmark kernel.
 * In: %rdi = number of loop passes (presumably the first integer argument,
 *     System V AMD64 style -- confirm against the C prototype).
 * Each pass issues 16 {vex} vpdpwssd, one per register xmm0-xmm15, each using
 * only its own register, so the 16 chains are independent of each other.
 * A count of 0 returns at once instead of wrapping around to 2^64 passes.
 * Clobbers: %rdi, flags, xmm0-xmm15 (upper vector bits are zeroed).
 */
avx_vnni_128b_dp2a_s32s16s16:
    test %rdi, %rdi
    jz .avx.vnni.128b.dp2a.s32s16s16.done
    /* VEX-encoded zeroing: avoids mixing legacy-SSE encodings with the VEX loop. */
    vpxor %xmm0, %xmm0, %xmm0
    vpxor %xmm1, %xmm1, %xmm1
    vpxor %xmm2, %xmm2, %xmm2
    vpxor %xmm3, %xmm3, %xmm3
    vpxor %xmm4, %xmm4, %xmm4
    vpxor %xmm5, %xmm5, %xmm5
    vpxor %xmm6, %xmm6, %xmm6
    vpxor %xmm7, %xmm7, %xmm7
    vpxor %xmm8, %xmm8, %xmm8
    vpxor %xmm9, %xmm9, %xmm9
    vpxor %xmm10, %xmm10, %xmm10
    vpxor %xmm11, %xmm11, %xmm11
    vpxor %xmm12, %xmm12, %xmm12
    vpxor %xmm13, %xmm13, %xmm13
    vpxor %xmm14, %xmm14, %xmm14
    vpxor %xmm15, %xmm15, %xmm15
.avx.vnni.128b.dp2a.s32s16s16.L1:
    {vex} vpdpwssd %xmm0, %xmm0, %xmm0
    {vex} vpdpwssd %xmm1, %xmm1, %xmm1
    {vex} vpdpwssd %xmm2, %xmm2, %xmm2
    {vex} vpdpwssd %xmm3, %xmm3, %xmm3
    {vex} vpdpwssd %xmm4, %xmm4, %xmm4
    {vex} vpdpwssd %xmm5, %xmm5, %xmm5
    {vex} vpdpwssd %xmm6, %xmm6, %xmm6
    {vex} vpdpwssd %xmm7, %xmm7, %xmm7
    {vex} vpdpwssd %xmm8, %xmm8, %xmm8
    {vex} vpdpwssd %xmm9, %xmm9, %xmm9
    {vex} vpdpwssd %xmm10, %xmm10, %xmm10
    {vex} vpdpwssd %xmm11, %xmm11, %xmm11
    {vex} vpdpwssd %xmm12, %xmm12, %xmm12
    {vex} vpdpwssd %xmm13, %xmm13, %xmm13
    {vex} vpdpwssd %xmm14, %xmm14, %xmm14
    {vex} vpdpwssd %xmm15, %xmm15, %xmm15
    sub $0x1, %rdi
    jne .avx.vnni.128b.dp2a.s32s16s16.L1
.avx.vnni.128b.dp2a.s32s16s16.done:
    ret
================================================
FILE: x64/asm/_AVX_VNNI_INT16_.S
================================================
/* Make the AVX-VNNI-INT16 kernels (vpdpwsud, vpdpwusd, vpdpwuud at 256/128 bits) visible to the linker. */
.globl avx_vnni_int16_256b_dp2a_s32s16u16
.globl avx_vnni_int16_256b_dp2a_s32u16s16
.globl avx_vnni_int16_256b_dp2a_s32u16u16
.globl avx_vnni_int16_128b_dp2a_s32s16u16
.globl avx_vnni_int16_128b_dp2a_s32u16s16
.globl avx_vnni_int16_128b_dp2a_s32u16u16
/*
 * avx_vnni_int16_256b_dp2a_s32s16u16 -- 256-bit vpdpwsud benchmark kernel.
 * In: %rdi = number of loop passes (presumably the first integer argument,
 *     System V AMD64 style -- confirm against the C prototype).
 * Each pass issues 16 vpdpwsud, one per register ymm0-ymm15, each using
 * only its own register, so the 16 chains are independent of each other.
 * A count of 0 returns at once instead of wrapping around to 2^64 passes.
 * Clobbers: %rdi, flags, ymm0-ymm15.
 */
avx_vnni_int16_256b_dp2a_s32s16u16:
    test %rdi, %rdi
    jz .avx.vnni.int16.256b.dp2a.s32s16u16.done
    vpxor %ymm0, %ymm0, %ymm0
    vpxor %ymm1, %ymm1, %ymm1
    vpxor %ymm2, %ymm2, %ymm2
    vpxor %ymm3, %ymm3, %ymm3
    vpxor %ymm4, %ymm4, %ymm4
    vpxor %ymm5, %ymm5, %ymm5
    vpxor %ymm6, %ymm6, %ymm6
    vpxor %ymm7, %ymm7, %ymm7
    vpxor %ymm8, %ymm8, %ymm8
    vpxor %ymm9, %ymm9, %ymm9
    vpxor %ymm10, %ymm10, %ymm10
    vpxor %ymm11, %ymm11, %ymm11
    vpxor %ymm12, %ymm12, %ymm12
    vpxor %ymm13, %ymm13, %ymm13
    vpxor %ymm14, %ymm14, %ymm14
    vpxor %ymm15, %ymm15, %ymm15
.avx.vnni.int16.256b.dp2a.s32s16u16.L1:
    vpdpwsud %ymm0, %ymm0, %ymm0
    vpdpwsud %ymm1, %ymm1, %ymm1
    vpdpwsud %ymm2, %ymm2, %ymm2
    vpdpwsud %ymm3, %ymm3, %ymm3
    vpdpwsud %ymm4, %ymm4, %ymm4
    vpdpwsud %ymm5, %ymm5, %ymm5
    vpdpwsud %ymm6, %ymm6, %ymm6
    vpdpwsud %ymm7, %ymm7, %ymm7
    vpdpwsud %ymm8, %ymm8, %ymm8
    vpdpwsud %ymm9, %ymm9, %ymm9
    vpdpwsud %ymm10, %ymm10, %ymm10
    vpdpwsud %ymm11, %ymm11, %ymm11
    vpdpwsud %ymm12, %ymm12, %ymm12
    vpdpwsud %ymm13, %ymm13, %ymm13
    vpdpwsud %ymm14, %ymm14, %ymm14
    vpdpwsud %ymm15, %ymm15, %ymm15
    sub $0x1, %rdi
    jne .avx.vnni.int16.256b.dp2a.s32s16u16.L1
    /* Clear the dirty upper halves of ymm0-ymm15 so SSE code run afterwards is not penalised. */
    vzeroupper
.avx.vnni.int16.256b.dp2a.s32s16u16.done:
    ret
/*
 * avx_vnni_int16_256b_dp2a_s32u16s16 -- 256-bit vpdpwusd benchmark kernel.
 * In: %rdi = number of loop passes (presumably the first integer argument,
 *     System V AMD64 style -- confirm against the C prototype).
 * Each pass issues 16 vpdpwusd, one per register ymm0-ymm15, each using
 * only its own register, so the 16 chains are independent of each other.
 * A count of 0 returns at once instead of wrapping around to 2^64 passes.
 * Clobbers: %rdi, flags, ymm0-ymm15.
 */
avx_vnni_int16_256b_dp2a_s32u16s16:
    test %rdi, %rdi
    jz .avx.vnni.int16.256b.dp2a.s32u16s16.done
    vpxor %ymm0, %ymm0, %ymm0
    vpxor %ymm1, %ymm1, %ymm1
    vpxor %ymm2, %ymm2, %ymm2
    vpxor %ymm3, %ymm3, %ymm3
    vpxor %ymm4, %ymm4, %ymm4
    vpxor %ymm5, %ymm5, %ymm5
    vpxor %ymm6, %ymm6, %ymm6
    vpxor %ymm7, %ymm7, %ymm7
    vpxor %ymm8, %ymm8, %ymm8
    vpxor %ymm9, %ymm9, %ymm9
    vpxor %ymm10, %ymm10, %ymm10
    vpxor %ymm11, %ymm11, %ymm11
    vpxor %ymm12, %ymm12, %ymm12
    vpxor %ymm13, %ymm13, %ymm13
    vpxor %ymm14, %ymm14, %ymm14
    vpxor %ymm15, %ymm15, %ymm15
.avx.vnni.int16.256b.dp2a.s32u16s16.L1:
    vpdpwusd %ymm0, %ymm0, %ymm0
    vpdpwusd %ymm1, %ymm1, %ymm1
    vpdpwusd %ymm2, %ymm2, %ymm2
    vpdpwusd %ymm3, %ymm3, %ymm3
    vpdpwusd %ymm4, %ymm4, %ymm4
    vpdpwusd %ymm5, %ymm5, %ymm5
    vpdpwusd %ymm6, %ymm6, %ymm6
    vpdpwusd %ymm7, %ymm7, %ymm7
    vpdpwusd %ymm8, %ymm8, %ymm8
    vpdpwusd %ymm9, %ymm9, %ymm9
    vpdpwusd %ymm10, %ymm10, %ymm10
    vpdpwusd %ymm11, %ymm11, %ymm11
    vpdpwusd %ymm12, %ymm12, %ymm12
    vpdpwusd %ymm13, %ymm13, %ymm13
    vpdpwusd %ymm14, %ymm14, %ymm14
    vpdpwusd %ymm15, %ymm15, %ymm15
    sub $0x1, %rdi
    jne .avx.vnni.int16.256b.dp2a.s32u16s16.L1
    /* Clear the dirty upper halves of ymm0-ymm15 so SSE code run afterwards is not penalised. */
    vzeroupper
.avx.vnni.int16.256b.dp2a.s32u16s16.done:
    ret
/*
 * avx_vnni_int16_256b_dp2a_s32u16u16 -- 256-bit vpdpwuud benchmark kernel.
 * In: %rdi = number of loop passes (presumably the first integer argument,
 *     System V AMD64 style -- confirm against the C prototype).
 * Each pass issues 16 vpdpwuud, one per register ymm0-ymm15, each using
 * only its own register, so the 16 chains are independent of each other.
 * A count of 0 returns at once instead of wrapping around to 2^64 passes.
 * Clobbers: %rdi, flags, ymm0-ymm15.
 */
avx_vnni_int16_256b_dp2a_s32u16u16:
    test %rdi, %rdi
    jz .avx.vnni.int16.256b.dp2a.s32u16u16.done
    vpxor %ymm0, %ymm0, %ymm0
    vpxor %ymm1, %ymm1, %ymm1
    vpxor %ymm2, %ymm2, %ymm2
    vpxor %ymm3, %ymm3, %ymm3
    vpxor %ymm4, %ymm4, %ymm4
    vpxor %ymm5, %ymm5, %ymm5
    vpxor %ymm6, %ymm6, %ymm6
    vpxor %ymm7, %ymm7, %ymm7
    vpxor %ymm8, %ymm8, %ymm8
    vpxor %ymm9, %ymm9, %ymm9
    vpxor %ymm10, %ymm10, %ymm10
    vpxor %ymm11, %ymm11, %ymm11
    vpxor %ymm12, %ymm12, %ymm12
    vpxor %ymm13, %ymm13, %ymm13
    vpxor %ymm14, %ymm14, %ymm14
    vpxor %ymm15, %ymm15, %ymm15
.avx.vnni.int16.256b.dp2a.s32u16u16.L1:
    vpdpwuud %ymm0, %ymm0, %ymm0
    vpdpwuud %ymm1, %ymm1, %ymm1
    vpdpwuud %ymm2, %ymm2, %ymm2
    vpdpwuud %ymm3, %ymm3, %ymm3
    vpdpwuud %ymm4, %ymm4, %ymm4
    vpdpwuud %ymm5, %ymm5, %ymm5
    vpdpwuud %ymm6, %ymm6, %ymm6
    vpdpwuud %ymm7, %ymm7, %ymm7
    vpdpwuud %ymm8, %ymm8, %ymm8
    vpdpwuud %ymm9, %ymm9, %ymm9
    vpdpwuud %ymm10, %ymm10, %ymm10
    vpdpwuud %ymm11, %ymm11, %ymm11
    vpdpwuud %ymm12, %ymm12, %ymm12
    vpdpwuud %ymm13, %ymm13, %ymm13
    vpdpwuud %ymm14, %ymm14, %ymm14
    vpdpwuud %ymm15, %ymm15, %ymm15
    sub $0x1, %rdi
    jne .avx.vnni.int16.256b.dp2a.s32u16u16.L1
    /* Clear the dirty upper halves of ymm0-ymm15 so SSE code run afterwards is not penalised. */
    vzeroupper
.avx.vnni.int16.256b.dp2a.s32u16u16.done:
    ret
/*
 * avx_vnni_int16_128b_dp2a_s32s16u16 -- 128-bit vpdpwsud benchmark kernel.
 * In: %rdi = number of loop passes (presumably the first integer argument,
 *     System V AMD64 style -- confirm against the C prototype).
 * Each pass issues 16 vpdpwsud, one per register xmm0-xmm15, each using
 * only its own register, so the 16 chains are independent of each other.
 * A count of 0 returns at once instead of wrapping around to 2^64 passes.
 * Clobbers: %rdi, flags, xmm0-xmm15 (upper vector bits are zeroed).
 */
avx_vnni_int16_128b_dp2a_s32s16u16:
    test %rdi, %rdi
    jz .avx.vnni.int16.128b.dp2a.s32s16u16.done
    /* VEX-encoded zeroing: avoids mixing legacy-SSE encodings with the VEX loop. */
    vpxor %xmm0, %xmm0, %xmm0
    vpxor %xmm1, %xmm1, %xmm1
    vpxor %xmm2, %xmm2, %xmm2
    vpxor %xmm3, %xmm3, %xmm3
    vpxor %xmm4, %xmm4, %xmm4
    vpxor %xmm5, %xmm5, %xmm5
    vpxor %xmm6, %xmm6, %xmm6
    vpxor %xmm7, %xmm7, %xmm7
    vpxor %xmm8, %xmm8, %xmm8
    vpxor %xmm9, %xmm9, %xmm9
    vpxor %xmm10, %xmm10, %xmm10
    vpxor %xmm11, %xmm11, %xmm11
    vpxor %xmm12, %xmm12, %xmm12
    vpxor %xmm13, %xmm13, %xmm13
    vpxor %xmm14, %xmm14, %xmm14
    vpxor %xmm15, %xmm15, %xmm15
.avx.vnni.int16.128b.dp2a.s32s16u16.L1:
    vpdpwsud %xmm0, %xmm0, %xmm0
    vpdpwsud %xmm1, %xmm1, %xmm1
    vpdpwsud %xmm2, %xmm2, %xmm2
    vpdpwsud %xmm3, %xmm3, %xmm3
    vpdpwsud %xmm4, %xmm4, %xmm4
    vpdpwsud %xmm5, %xmm5, %xmm5
    vpdpwsud %xmm6, %xmm6, %xmm6
    vpdpwsud %xmm7, %xmm7, %xmm7
    vpdpwsud %xmm8, %xmm8, %xmm8
    vpdpwsud %xmm9, %xmm9, %xmm9
    vpdpwsud %xmm10, %xmm10, %xmm10
    vpdpwsud %xmm11, %xmm11, %xmm11
    vpdpwsud %xmm12, %xmm12, %xmm12
    vpdpwsud %xmm13, %xmm13, %xmm13
    vpdpwsud %xmm14, %xmm14, %xmm14
    vpdpwsud %xmm15, %xmm15, %xmm15
    sub $0x1, %rdi
    jne .avx.vnni.int16.128b.dp2a.s32s16u16.L1
.avx.vnni.int16.128b.dp2a.s32s16u16.done:
    ret
/*
 * avx_vnni_int16_128b_dp2a_s32u16s16 -- 128-bit vpdpwusd benchmark kernel.
 * In: %rdi = number of loop passes (presumably the first integer argument,
 *     System V AMD64 style -- confirm against the C prototype).
 * Each pass issues 16 vpdpwusd, one per register xmm0-xmm15, each using
 * only its own register, so the 16 chains are independent of each other.
 * A count of 0 returns at once instead of wrapping around to 2^64 passes.
 * Clobbers: %rdi, flags, xmm0-xmm15 (upper vector bits are zeroed).
 */
avx_vnni_int16_128b_dp2a_s32u16s16:
    test %rdi, %rdi
    jz .avx.vnni.int16.128b.dp2a.s32u16s16.done
    /* VEX-encoded zeroing: avoids mixing legacy-SSE encodings with the VEX loop. */
    vpxor %xmm0, %xmm0, %xmm0
    vpxor %xmm1, %xmm1, %xmm1
    vpxor %xmm2, %xmm2, %xmm2
    vpxor %xmm3, %xmm3, %xmm3
    vpxor %xmm4, %xmm4, %xmm4
    vpxor %xmm5, %xmm5, %xmm5
    vpxor %xmm6, %xmm6, %xmm6
    vpxor %xmm7, %xmm7, %xmm7
    vpxor %xmm8, %xmm8, %xmm8
    vpxor %xmm9, %xmm9, %xmm9
    vpxor %xmm10, %xmm10, %xmm10
    vpxor %xmm11, %xmm11, %xmm11
    vpxor %xmm12, %xmm12, %xmm12
    vpxor %xmm13, %xmm13, %xmm13
    vpxor %xmm14, %xmm14, %xmm14
    vpxor %xmm15, %xmm15, %xmm15
.avx.vnni.int16.128b.dp2a.s32u16s16.L1:
    vpdpwusd %xmm0, %xmm0, %xmm0
    vpdpwusd %xmm1, %xmm1, %xmm1
    vpdpwusd %xmm2, %xmm2, %xmm2
    vpdpwusd %xmm3, %xmm3, %xmm3
    vpdpwusd %xmm4, %xmm4, %xmm4
    vpdpwusd %xmm5, %xmm5, %xmm5
    vpdpwusd %xmm6, %xmm6, %xmm6
    vpdpwusd %xmm7, %xmm7, %xmm7
    vpdpwusd %xmm8, %xmm8, %xmm8
    vpdpwusd %xmm9, %xmm9, %xmm9
    vpdpwusd %xmm10, %xmm10, %xmm10
    vpdpwusd %xmm11, %xmm11, %xmm11
    vpdpwusd %xmm12, %xmm12, %xmm12
    vpdpwusd %xmm13, %xmm13, %xmm13
    vpdpwusd %xmm14, %xmm14, %xmm14
    vpdpwusd %xmm15, %xmm15, %xmm15
    sub $0x1, %rdi
    jne .avx.vnni.int16.128b.dp2a.s32u16s16.L1
.avx.vnni.int16.128b.dp2a.s32u16s16.done:
    ret
/*
 * avx_vnni_int16_128b_dp2a_s32u16u16 -- 128-bit vpdpwuud benchmark kernel.
 * In: %rdi = number of loop passes (presumably the first integer argument,
 *     System V AMD64 style -- confirm against the C prototype).
 * Each pass issues 16 vpdpwuud, one per register xmm0-xmm15, each using
 * only its own register, so the 16 chains are independent of each other.
 * A count of 0 returns at once instead of wrapping around to 2^64 passes.
 * Clobbers: %rdi, flags, xmm0-xmm15 (upper vector bits are zeroed).
 */
avx_vnni_int16_128b_dp2a_s32u16u16:
    test %rdi, %rdi
    jz .avx.vnni.int16.128b.dp2a.s32u16u16.done
    /* VEX-encoded zeroing: avoids mixing legacy-SSE encodings with the VEX loop. */
    vpxor %xmm0, %xmm0, %xmm0
    vpxor %xmm1, %xmm1, %xmm1
    vpxor %xmm2, %xmm2, %xmm2
    vpxor %xmm3, %xmm3, %xmm3
    vpxor %xmm4, %xmm4, %xmm4
    vpxor %xmm5, %xmm5, %xmm5
    vpxor %xmm6, %xmm6, %xmm6
    vpxor %xmm7, %xmm7, %xmm7
    vpxor %xmm8, %xmm8, %xmm8
    vpxor %xmm9, %xmm9, %xmm9
    vpxor %xmm10, %xmm10, %xmm10
    vpxor %xmm11, %xmm11, %xmm11
    vpxor %xmm12, %xmm12, %xmm12
    vpxor %xmm13, %xmm13, %xmm13
    vpxor %xmm14, %xmm14, %xmm14
    vpxor %xmm15, %xmm15, %xmm15
.avx.vnni.int16.128b.dp2a.s32u16u16.L1:
    vpdpwuud %xmm0, %xmm0, %xmm0
    vpdpwuud %xmm1, %xmm1, %xmm1
    vpdpwuud %xmm2, %xmm2, %xmm2
    vpdpwuud %xmm3, %xmm3, %xmm3
    vpdpwuud %xmm4, %xmm4, %xmm4
    vpdpwuud %xmm5, %xmm5, %xmm5
    vpdpwuud %xmm6, %xmm6, %xmm6
    vpdpwuud %xmm7, %xmm7, %xmm7
    vpdpwuud %xmm8, %xmm8, %xmm8
    vpdpwuud %xmm9, %xmm9, %xmm9
    vpdpwuud %xmm10, %xmm10, %xmm10
    vpdpwuud %xmm11, %xmm11, %xmm11
    vpdpwuud %xmm12, %xmm12, %xmm12
    vpdpwuud %xmm13, %xmm13, %xmm13
    vpdpwuud %xmm14, %xmm14, %xmm14
    vpdpwuud %xmm15, %xmm15, %xmm15
    sub $0x1, %rdi
    jne .avx.vnni.int16.128b.dp2a.s32u16u16.L1
.avx.vnni.int16.128b.dp2a.s32u16u16.done:
    ret
================================================
FILE: x64/asm/_AVX_VNNI_INT8_.S
================================================
/* Make the AVX-VNNI-INT8 kernels (vpdpbssd, vpdpbsud, vpdpbuud at 256/128 bits) visible to the linker. */
.globl avx_vnni_int8_256b_dp4a_s32s8s8
.globl avx_vnni_int8_256b_dp4a_s32s8u8
.globl avx_vnni_int8_256b_dp4a_s32u8u8
.globl avx_vnni_int8_128b_dp4a_s32s8s8
.globl avx_vnni_int8_128b_dp4a_s32s8u8
.globl avx_vnni_int8_128b_dp4a_s32u8u8
/*
 * avx_vnni_int8_256b_dp4a_s32s8s8 -- 256-bit vpdpbssd benchmark kernel.
 * In: %rdi = number of loop passes (presumably the first integer argument,
 *     System V AMD64 style -- confirm against the C prototype).
 * Each pass issues 16 vpdpbssd, one per register ymm0-ymm15, each using
 * only its own register, so the 16 chains are independent of each other.
 * A count of 0 returns at once instead of wrapping around to 2^64 passes.
 * Clobbers: %rdi, flags, ymm0-ymm15.
 */
avx_vnni_int8_256b_dp4a_s32s8s8:
    test %rdi, %rdi
    jz .avx.vnni.int8.256b.dp4a.s32s8s8.done
    vpxor %ymm0, %ymm0, %ymm0
    vpxor %ymm1, %ymm1, %ymm1
    vpxor %ymm2, %ymm2, %ymm2
    vpxor %ymm3, %ymm3, %ymm3
    vpxor %ymm4, %ymm4, %ymm4
    vpxor %ymm5, %ymm5, %ymm5
    vpxor %ymm6, %ymm6, %ymm6
    vpxor %ymm7, %ymm7, %ymm7
    vpxor %ymm8, %ymm8, %ymm8
    vpxor %ymm9, %ymm9, %ymm9
    vpxor %ymm10, %ymm10, %ymm10
    vpxor %ymm11, %ymm11, %ymm11
    vpxor %ymm12, %ymm12, %ymm12
    vpxor %ymm13, %ymm13, %ymm13
    vpxor %ymm14, %ymm14, %ymm14
    vpxor %ymm15, %ymm15, %ymm15
.avx.vnni.int8.256b.dp4a.s32s8s8.L1:
    vpdpbssd %ymm0, %ymm0, %ymm0
    vpdpbssd %ymm1, %ymm1, %ymm1
    vpdpbssd %ymm2, %ymm2, %ymm2
    vpdpbssd %ymm3, %ymm3, %ymm3
    vpdpbssd %ymm4, %ymm4, %ymm4
    vpdpbssd %ymm5, %ymm5, %ymm5
    vpdpbssd %ymm6, %ymm6, %ymm6
    vpdpbssd %ymm7, %ymm7, %ymm7
    vpdpbssd %ymm8, %ymm8, %ymm8
    vpdpbssd %ymm9, %ymm9, %ymm9
    vpdpbssd %ymm10, %ymm10, %ymm10
    vpdpbssd %ymm11, %ymm11, %ymm11
    vpdpbssd %ymm12, %ymm12, %ymm12
    vpdpbssd %ymm13, %ymm13, %ymm13
    vpdpbssd %ymm14, %ymm14, %ymm14
    vpdpbssd %ymm15, %ymm15, %ymm15
    sub $0x1, %rdi
    jne .avx.vnni.int8.256b.dp4a.s32s8s8.L1
    /* Clear the dirty upper halves of ymm0-ymm15 so SSE code run afterwards is not penalised. */
    vzeroupper
.avx.vnni.int8.256b.dp4a.s32s8s8.done:
    ret
/*
 * avx_vnni_int8_256b_dp4a_s32s8u8 -- 256-bit vpdpbsud benchmark kernel.
 * In: %rdi = number of loop passes (presumably the first integer argument,
 *     System V AMD64 style -- confirm against the C prototype).
 * Each pass issues 16 vpdpbsud, one per register ymm0-ymm15, each using
 * only its own register, so the 16 chains are independent of each other.
 * A count of 0 returns at once instead of wrapping around to 2^64 passes.
 * Clobbers: %rdi, flags, ymm0-ymm15.
 */
avx_vnni_int8_256b_dp4a_s32s8u8:
    test %rdi, %rdi
    jz .avx.vnni.int8.256b.dp4a.s32s8u8.done
    vpxor %ymm0, %ymm0, %ymm0
    vpxor %ymm1, %ymm1, %ymm1
    vpxor %ymm2, %ymm2, %ymm2
    vpxor %ymm3, %ymm3, %ymm3
    vpxor %ymm4, %ymm4, %ymm4
    vpxor %ymm5, %ymm5, %ymm5
    vpxor %ymm6, %ymm6, %ymm6
    vpxor %ymm7, %ymm7, %ymm7
    vpxor %ymm8, %ymm8, %ymm8
    vpxor %ymm9, %ymm9, %ymm9
    vpxor %ymm10, %ymm10, %ymm10
    vpxor %ymm11, %ymm11, %ymm11
    vpxor %ymm12, %ymm12, %ymm12
    vpxor %ymm13, %ymm13, %ymm13
    vpxor %ymm14, %ymm14, %ymm14
    vpxor %ymm15, %ymm15, %ymm15
.avx.vnni.int8.256b.dp4a.s32s8u8.L1:
    vpdpbsud %ymm0, %ymm0, %ymm0
    vpdpbsud %ymm1, %ymm1, %ymm1
    vpdpbsud %ymm2, %ymm2, %ymm2
    vpdpbsud %ymm3, %ymm3, %ymm3
    vpdpbsud %ymm4, %ymm4, %ymm4
    vpdpbsud %ymm5, %ymm5, %ymm5
    vpdpbsud %ymm6, %ymm6, %ymm6
    vpdpbsud %ymm7, %ymm7, %ymm7
    vpdpbsud %ymm8, %ymm8, %ymm8
    vpdpbsud %ymm9, %ymm9, %ymm9
    vpdpbsud %ymm10, %ymm10, %ymm10
    vpdpbsud %ymm11, %ymm11, %ymm11
    vpdpbsud %ymm12, %ymm12, %ymm12
    vpdpbsud %ymm13, %ymm13, %ymm13
    vpdpbsud %ymm14, %ymm14, %ymm14
    vpdpbsud %ymm15, %ymm15, %ymm15
    sub $0x1, %rdi
    jne .avx.vnni.int8.256b.dp4a.s32s8u8.L1
    /* Clear the dirty upper halves of ymm0-ymm15 so SSE code run afterwards is not penalised. */
    vzeroupper
.avx.vnni.int8.256b.dp4a.s32s8u8.done:
    ret
/*
 * avx_vnni_int8_256b_dp4a_s32u8u8 -- 256-bit vpdpbuud benchmark kernel.
 * In: %rdi = number of loop passes (presumably the first integer argument,
 *     System V AMD64 style -- confirm against the C prototype).
 * Each pass issues 16 vpdpbuud, one per register ymm0-ymm15, each using
 * only its own register, so the 16 chains are independent of each other.
 * A count of 0 returns at once instead of wrapping around to 2^64 passes.
 * Clobbers: %rdi, flags, ymm0-ymm15.
 */
avx_vnni_int8_256b_dp4a_s32u8u8:
    test %rdi, %rdi
    jz .avx.vnni.int8.256b.dp4a.s32u8u8.done
    vpxor %ymm0, %ymm0, %ymm0
    vpxor %ymm1, %ymm1, %ymm1
    vpxor %ymm2, %ymm2, %ymm2
    vpxor %ymm3, %ymm3, %ymm3
    vpxor %ymm4, %ymm4, %ymm4
    vpxor %ymm5, %ymm5, %ymm5
    vpxor %ymm6, %ymm6, %ymm6
    vpxor %ymm7, %ymm7, %ymm7
    vpxor %ymm8, %ymm8, %ymm8
    vpxor %ymm9, %ymm9, %ymm9
    vpxor %ymm10, %ymm10, %ymm10
    vpxor %ymm11, %ymm11, %ymm11
    vpxor %ymm12, %ymm12, %ymm12
    vpxor %ymm13, %ymm13, %ymm13
    vpxor %ymm14, %ymm14, %ymm14
    vpxor %ymm15, %ymm15, %ymm15
.avx.vnni.int8.256b.dp4a.s32u8u8.L1:
    vpdpbuud %ymm0, %ymm0, %ymm0
    vpdpbuud %ymm1, %ymm1, %ymm1
    vpdpbuud %ymm2, %ymm2, %ymm2
    vpdpbuud %ymm3, %ymm3, %ymm3
    vpdpbuud %ymm4, %ymm4, %ymm4
    vpdpbuud %ymm5, %ymm5, %ymm5
    vpdpbuud %ymm6, %ymm6, %ymm6
    vpdpbuud %ymm7, %ymm7, %ymm7
    vpdpbuud %ymm8, %ymm8, %ymm8
    vpdpbuud %ymm9, %ymm9, %ymm9
    vpdpbuud %ymm10, %ymm10, %ymm10
    vpdpbuud %ymm11, %ymm11, %ymm11
    vpdpbuud %ymm12, %ymm12, %ymm12
    vpdpbuud %ymm13, %ymm13, %ymm13
    vpdpbuud %ymm14, %ymm14, %ymm14
    vpdpbuud %ymm15, %ymm15, %ymm15
    sub $0x1, %rdi
    jne .avx.vnni.int8.256b.dp4a.s32u8u8.L1
    /* Clear the dirty upper halves of ymm0-ymm15 so SSE code run afterwards is not penalised. */
    vzeroupper
.avx.vnni.int8.256b.dp4a.s32u8u8.done:
    ret
/*
 * avx_vnni_int8_128b_dp4a_s32s8s8 -- 128-bit vpdpbssd benchmark kernel.
 * In: %rdi = number of loop passes (presumably the first integer argument,
 *     System V AMD64 style -- confirm against the C prototype).
 * Each pass issues 16 vpdpbssd, one per register xmm0-xmm15, each using
 * only its own register, so the 16 chains are independent of each other.
 * A count of 0 returns at once instead of wrapping around to 2^64 passes.
 * Clobbers: %rdi, flags, xmm0-xmm15 (upper vector bits are zeroed).
 */
avx_vnni_int8_128b_dp4a_s32s8s8:
    test %rdi, %rdi
    jz .avx.vnni.int8.128b.dp4a.s32s8s8.done
    /* VEX-encoded zeroing: avoids mixing legacy-SSE encodings with the VEX loop. */
    vpxor %xmm0, %xmm0, %xmm0
    vpxor %xmm1, %xmm1, %xmm1
    vpxor %xmm2, %xmm2, %xmm2
    vpxor %xmm3, %xmm3, %xmm3
    vpxor %xmm4, %xmm4, %xmm4
    vpxor %xmm5, %xmm5, %xmm5
    vpxor %xmm6, %xmm6, %xmm6
    vpxor %xmm7, %xmm7, %xmm7
    vpxor %xmm8, %xmm8, %xmm8
    vpxor %xmm9, %xmm9, %xmm9
    vpxor %xmm10, %xmm10, %xmm10
    vpxor %xmm11, %xmm11, %xmm11
    vpxor %xmm12, %xmm12, %xmm12
    vpxor %xmm13, %xmm13, %xmm13
    vpxor %xmm14, %xmm14, %xmm14
    vpxor %xmm15, %xmm15, %xmm15
.avx.vnni.int8.128b.dp4a.s32s8s8.L1:
    vpdpbssd %xmm0, %xmm0, %xmm0
    vpdpbssd %xmm1, %xmm1, %xmm1
    vpdpbssd %xmm2, %xmm2, %xmm2
    vpdpbssd %xmm3, %xmm3, %xmm3
    vpdpbssd %xmm4, %xmm4, %xmm4
    vpdpbssd %xmm5, %xmm5, %xmm5
    vpdpbssd %xmm6, %xmm6, %xmm6
    vpdpbssd %xmm7, %xmm7, %xmm7
    vpdpbssd %xmm8, %xmm8, %xmm8
    vpdpbssd %xmm9, %xmm9, %xmm9
    vpdpbssd %xmm10, %xmm10, %xmm10
    vpdpbssd %xmm11, %xmm11, %xmm11
    vpdpbssd %xmm12, %xmm12, %xmm12
    vpdpbssd %xmm13, %xmm13, %xmm13
    vpdpbssd %xmm14, %xmm14, %xmm14
    vpdpbssd %xmm15, %xmm15, %xmm15
    sub $0x1, %rdi
    jne .avx.vnni.int8.128b.dp4a.s32s8s8.L1
.avx.vnni.int8.128b.dp4a.s32s8s8.done:
    ret
/*
 * avx_vnni_int8_128b_dp4a_s32s8u8 -- 128-bit vpdpbsud benchmark kernel.
 * In: %rdi = number of loop passes (presumably the first integer argument,
 *     System V AMD64 style -- confirm against the C prototype).
 * Each pass issues 16 vpdpbsud, one per register xmm0-xmm15, each using
 * only its own register, so the 16 chains are independent of each other.
 * A count of 0 returns at once instead of wrapping around to 2^64 passes.
 * Clobbers: %rdi, flags, xmm0-xmm15 (upper vector bits are zeroed).
 */
avx_vnni_int8_128b_dp4a_s32s8u8:
    test %rdi, %rdi
    jz .avx.vnni.int8.128b.dp4a.s32s8u8.done
    /* VEX-encoded zeroing: avoids mixing legacy-SSE encodings with the VEX loop. */
    vpxor %xmm0, %xmm0, %xmm0
    vpxor %xmm1, %xmm1, %xmm1
    vpxor %xmm2, %xmm2, %xmm2
    vpxor %xmm3, %xmm3, %xmm3
    vpxor %xmm4, %xmm4, %xmm4
    vpxor %xmm5, %xmm5, %xmm5
    vpxor %xmm6, %xmm6, %xmm6
    vpxor %xmm7, %xmm7, %xmm7
    vpxor %xmm8, %xmm8, %xmm8
    vpxor %xmm9, %xmm9, %xmm9
    vpxor %xmm10, %xmm10, %xmm10
    vpxor %xmm11, %xmm11, %xmm11
    vpxor %xmm12, %xmm12, %xmm12
    vpxor %xmm13, %xmm13, %xmm13
    vpxor %xmm14, %xmm14, %xmm14
    vpxor %xmm15, %xmm15, %xmm15
.avx.vnni.int8.128b.dp4a.s32s8u8.L1:
    vpdpbsud %xmm0, %xmm0, %xmm0
    vpdpbsud %xmm1, %xmm1, %xmm1
    vpdpbsud %xmm2, %xmm2, %xmm2
    vpdpbsud %xmm3, %xmm3, %xmm3
    vpdpbsud %xmm4, %xmm4, %xmm4
    vpdpbsud %xmm5, %xmm5, %xmm5
    vpdpbsud %xmm6, %xmm6, %xmm6
    vpdpbsud %xmm7, %xmm7, %xmm7
    vpdpbsud %xmm8, %xmm8, %xmm8
    vpdpbsud %xmm9, %xmm9, %xmm9
    vpdpbsud %xmm10, %xmm10, %xmm10
    vpdpbsud %xmm11, %xmm11, %xmm11
    vpdpbsud %xmm12, %xmm12, %xmm12
    vpdpbsud %xmm13, %xmm13, %xmm13
    vpdpbsud %xmm14, %xmm14, %xmm14
    vpdpbsud %xmm15, %xmm15, %xmm15
    sub $0x1, %rdi
    jne .avx.vnni.int8.128b.dp4a.s32s8u8.L1
.avx.vnni.int8.128b.dp4a.s32s8u8.done:
    ret
/*
 * avx_vnni_int8_128b_dp4a_s32u8u8 -- 128-bit vpdpbuud benchmark kernel.
 * In: %rdi = number of loop passes (presumably the first integer argument,
 *     System V AMD64 style -- confirm against the C prototype).
 * Each pass issues 16 vpdpbuud, one per register xmm0-xmm15, each using
 * only its own register, so the 16 chains are independent of each other.
 * A count of 0 returns at once instead of wrapping around to 2^64 passes.
 * Clobbers: %rdi, flags, xmm0-xmm15 (upper vector bits are zeroed).
 */
avx_vnni_int8_128b_dp4a_s32u8u8:
    test %rdi, %rdi
    jz .avx.vnni.int8.128b.dp4a.s32u8u8.done
    /* VEX-encoded zeroing: avoids mixing legacy-SSE encodings with the VEX loop. */
    vpxor %xmm0, %xmm0, %xmm0
    vpxor %xmm1, %xmm1, %xmm1
    vpxor %xmm2, %xmm2, %xmm2
    vpxor %xmm3, %xmm3, %xmm3
    vpxor %xmm4, %xmm4, %xmm4
    vpxor %xmm5, %xmm5, %xmm5
    vpxor %xmm6, %xmm6, %xmm6
    vpxor %xmm7, %xmm7, %xmm7
    vpxor %xmm8, %xmm8, %xmm8
    vpxor %xmm9, %xmm9, %xmm9
    vpxor %xmm10, %xmm10, %xmm10
    vpxor %xmm11, %xmm11, %xmm11
    vpxor %xmm12, %xmm12, %xmm12
    vpxor %xmm13, %xmm13, %xmm13
    vpxor %xmm14, %xmm14, %xmm14
    vpxor %xmm15, %xmm15, %xmm15
.avx.vnni.int8.128b.dp4a.s32u8u8.L1:
    vpdpbuud %xmm0, %xmm0, %xmm0
    vpdpbuud %xmm1, %xmm1, %xmm1
    vpdpbuud %xmm2, %xmm2, %xmm2
    vpdpbuud %xmm3, %xmm3, %xmm3
    vpdpbuud %xmm4, %xmm4, %xmm4
    vpdpbuud %xmm5, %xmm5, %xmm5
    vpdpbuud %xmm6, %xmm6, %xmm6
    vpdpbuud %xmm7, %xmm7, %xmm7
    vpdpbuud %xmm8, %xmm8, %xmm8
    vpdpbuud %xmm9, %xmm9, %xmm9
    vpdpbuud %xmm10, %xmm10, %xmm10
    vpdpbuud %xmm11, %xmm11, %xmm11
    vpdpbuud %xmm12, %xmm12, %xmm12
    vpdpbuud %xmm13, %xmm13, %xmm13
    vpdpbuud %xmm14, %xmm14, %xmm14
    vpdpbuud %xmm15, %xmm15, %xmm15
    sub $0x1, %rdi
    jne .avx.vnni.int8.128b.dp4a.s32u8u8.L1
.avx.vnni.int8.128b.dp4a.s32u8u8.done:
    ret
================================================
FILE: x64/asm/_FMA_.S
================================================
/* Make the FMA kernels (vfmadd231ps / vfmadd231pd at 256/128 bits) visible to the linker. */
.globl fma_256b_fma_f32f32f32
.globl fma_256b_fma_f64f64f64
.globl fma_128b_fma_f32f32f32
.globl fma_128b_fma_f64f64f64
/*
 * fma_256b_fma_f32f32f32 -- 256-bit vfmadd231ps benchmark kernel.
 * In: %rdi = number of loop passes (presumably the first integer argument,
 *     System V AMD64 style -- confirm against the C prototype).
 * Each pass issues 16 vfmadd231ps, one per register ymm0-ymm15, each using
 * only its own register, so the 16 chains are independent of each other.
 * A count of 0 returns at once instead of wrapping around to 2^64 passes.
 * Clobbers: %rdi, flags, ymm0-ymm15.
 */
fma_256b_fma_f32f32f32:
    test %rdi, %rdi
    jz .fma.256b.fma.f32f32f32.done
    vxorps %ymm0, %ymm0, %ymm0
    vxorps %ymm1, %ymm1, %ymm1
    vxorps %ymm2, %ymm2, %ymm2
    vxorps %ymm3, %ymm3, %ymm3
    vxorps %ymm4, %ymm4, %ymm4
    vxorps %ymm5, %ymm5, %ymm5
    vxorps %ymm6, %ymm6, %ymm6
    vxorps %ymm7, %ymm7, %ymm7
    vxorps %ymm8, %ymm8, %ymm8
    vxorps %ymm9, %ymm9, %ymm9
    vxorps %ymm10, %ymm10, %ymm10
    vxorps %ymm11, %ymm11, %ymm11
    vxorps %ymm12, %ymm12, %ymm12
    vxorps %ymm13, %ymm13, %ymm13
    vxorps %ymm14, %ymm14, %ymm14
    vxorps %ymm15, %ymm15, %ymm15
.fma.256b.fma.f32f32f32.L1:
    vfmadd231ps %ymm0, %ymm0, %ymm0
    vfmadd231ps %ymm1, %ymm1, %ymm1
    vfmadd231ps %ymm2, %ymm2, %ymm2
    vfmadd231ps %ymm3, %ymm3, %ymm3
    vfmadd231ps %ymm4, %ymm4, %ymm4
    vfmadd231ps %ymm5, %ymm5, %ymm5
    vfmadd231ps %ymm6, %ymm6, %ymm6
    vfmadd231ps %ymm7, %ymm7, %ymm7
    vfmadd231ps %ymm8, %ymm8, %ymm8
    vfmadd231ps %ymm9, %ymm9, %ymm9
    vfmadd231ps %ymm10, %ymm10, %ymm10
    vfmadd231ps %ymm11, %ymm11, %ymm11
    vfmadd231ps %ymm12, %ymm12, %ymm12
    vfmadd231ps %ymm13, %ymm13, %ymm13
    vfmadd231ps %ymm14, %ymm14, %ymm14
    vfmadd231ps %ymm15, %ymm15, %ymm15
    sub $0x1, %rdi
    jne .fma.256b.fma.f32f32f32.L1
    /* Clear the dirty upper halves of ymm0-ymm15 so SSE code run afterwards is not penalised. */
    vzeroupper
.fma.256b.fma.f32f32f32.done:
    ret
/*
 * fma_256b_fma_f64f64f64 -- 256-bit vfmadd231pd benchmark kernel.
 * In: %rdi = number of loop passes (presumably the first integer argument,
 *     System V AMD64 style -- confirm against the C prototype).
 * Each pass issues 16 vfmadd231pd, one per register ymm0-ymm15, each using
 * only its own register, so the 16 chains are independent of each other.
 * A count of 0 returns at once instead of wrapping around to 2^64 passes.
 * Clobbers: %rdi, flags, ymm0-ymm15.
 */
fma_256b_fma_f64f64f64:
    test %rdi, %rdi
    jz .fma.256b.fma.f64f64f64.done
    vxorpd %ymm0, %ymm0, %ymm0
    vxorpd %ymm1, %ymm1, %ymm1
    vxorpd %ymm2, %ymm2, %ymm2
    vxorpd %ymm3, %ymm3, %ymm3
    vxorpd %ymm4, %ymm4, %ymm4
    vxorpd %ymm5, %ymm5, %ymm5
    vxorpd %ymm6, %ymm6, %ymm6
    vxorpd %ymm7, %ymm7, %ymm7
    vxorpd %ymm8, %ymm8, %ymm8
    vxorpd %ymm9, %ymm9, %ymm9
    vxorpd %ymm10, %ymm10, %ymm10
    vxorpd %ymm11, %ymm11, %ymm11
    vxorpd %ymm12, %ymm12, %ymm12
    vxorpd %ymm13, %ymm13, %ymm13
    vxorpd %ymm14, %ymm14, %ymm14
    vxorpd %ymm15, %ymm15, %ymm15
.fma.256b.fma.f64f64f64.L1:
    vfmadd231pd %ymm0, %ymm0, %ymm0
    vfmadd231pd %ymm1, %ymm1, %ymm1
    vfmadd231pd %ymm2, %ymm2, %ymm2
    vfmadd231pd %ymm3, %ymm3, %ymm3
    vfmadd231pd %ymm4, %ymm4, %ymm4
    vfmadd231pd %ymm5, %ymm5, %ymm5
    vfmadd231pd %ymm6, %ymm6, %ymm6
    vfmadd231pd %ymm7, %ymm7, %ymm7
    vfmadd231pd %ymm8, %ymm8, %ymm8
    vfmadd231pd %ymm9, %ymm9, %ymm9
    vfmadd231pd %ymm10, %ymm10, %ymm10
    vfmadd231pd %ymm11, %ymm11, %ymm11
    vfmadd231pd %ymm12, %ymm12, %ymm12
    vfmadd231pd %ymm13, %ymm13, %ymm13
    vfmadd231pd %ymm14, %ymm14, %ymm14
    vfmadd231pd %ymm15, %ymm15, %ymm15
    sub $0x1, %rdi
    jne .fma.256b.fma.f64f64f64.L1
    /* Clear the dirty upper halves of ymm0-ymm15 so SSE code run afterwards is not penalised. */
    vzeroupper
.fma.256b.fma.f64f64f64.done:
    ret
/*
 * fma_128b_fma_f32f32f32 -- 128-bit single-precision FMA throughput loop.
 *
 * In:  %rdi = number of loop iterations. It is decremented before it is
 *             tested, so it must be non-zero.
 * The second C argument is not read.
 *
 * Each iteration issues sixteen vfmadd231ps instructions, one per XMM
 * register, each using only its own register so the sixteen dependency
 * chains are independent.
 * Clobbers: %xmm0-%xmm15 (VEX encoding also zeroes the upper YMM bits),
 *           %rdi, flags.
 */
fma_128b_fma_f32f32f32:
    /*
     * Zero the accumulators with the VEX-encoded form. The loop below is
     * VEX-encoded, and FMA support implies AVX, so there is no reason to
     * mix in legacy-SSE xorps and risk an SSE/AVX state transition.
     */
    .irp reg,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
    vxorps %xmm\reg, %xmm\reg, %xmm\reg
    .endr
.fma.128b.fma.f32f32f32.L1:
    .irp reg,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
    vfmadd231ps %xmm\reg, %xmm\reg, %xmm\reg
    .endr
    sub $0x1, %rdi
    jne .fma.128b.fma.f32f32f32.L1
    ret
/*
 * fma_128b_fma_f64f64f64 -- 128-bit double-precision FMA throughput loop.
 *
 * In:  %rdi = number of loop iterations. It is decremented before it is
 *             tested, so it must be non-zero.
 * The second C argument is not read.
 *
 * Each iteration issues sixteen vfmadd231pd instructions, one per XMM
 * register, each using only its own register so the sixteen dependency
 * chains are independent.
 * Clobbers: %xmm0-%xmm15 (VEX encoding also zeroes the upper YMM bits),
 *           %rdi, flags.
 */
fma_128b_fma_f64f64f64:
    /*
     * Zero the accumulators with the VEX-encoded form. The loop below is
     * VEX-encoded, and FMA support implies AVX, so there is no reason to
     * mix in legacy-SSE xorpd and risk an SSE/AVX state transition.
     */
    .irp reg,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
    vxorpd %xmm\reg, %xmm\reg, %xmm\reg
    .endr
.fma.128b.fma.f64f64f64.L1:
    .irp reg,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
    vfmadd231pd %xmm\reg, %xmm\reg, %xmm\reg
    .endr
    sub $0x1, %rdi
    jne .fma.128b.fma.f64f64f64.L1
    ret
================================================
FILE: x64/asm/_SSE2_.S
================================================
/*
 * SSE2 double-precision throughput kernel (x86-64, AT&T syntax).
 *
 * sse2_128b_add_mul_f64f64_f64:
 *   In:  %rdi = number of loop iterations. It is decremented before it
 *               is tested, so it must be non-zero.
 *   The second C argument is not read.
 *   Each iteration issues eight mulpd and eight addpd instructions,
 *   alternating, each on its own XMM register.
 *   Clobbers: %xmm0-%xmm15, %rdi, flags.
 */
.globl sse2_128b_add_mul_f64f64_f64

/* One multiply on \mul_reg followed by one add on \add_reg. */
.macro SSE2_MUL_ADD_PD mul_reg, add_reg
    mulpd %xmm\mul_reg, %xmm\mul_reg
    addpd %xmm\add_reg, %xmm\add_reg
.endm

sse2_128b_add_mul_f64f64_f64:
    /* Zero every register so the arithmetic only ever sees zeros. */
    .irp reg,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
    xorpd %xmm\reg, %xmm\reg
    .endr
.sse2.128b.add.mul.f64f64.f64.L1:
    SSE2_MUL_ADD_PD 0, 1
    SSE2_MUL_ADD_PD 2, 3
    SSE2_MUL_ADD_PD 4, 5
    SSE2_MUL_ADD_PD 6, 7
    /*
     * The counter update sits in the middle of the body. The SSE
     * arithmetic that follows does not write the flags, so the jne at
     * the bottom still tests the result of this sub.
     */
    sub $0x1, %rdi
    SSE2_MUL_ADD_PD 8, 9
    SSE2_MUL_ADD_PD 10, 11
    SSE2_MUL_ADD_PD 12, 13
    SSE2_MUL_ADD_PD 14, 15
    jne .sse2.128b.add.mul.f64f64.f64.L1
    ret
================================================
FILE: x64/asm/_SSE_.S
================================================
/*
 * SSE single-precision throughput kernel (x86-64, AT&T syntax).
 *
 * sse_128b_add_mul_f32f32_f32:
 *   In:  %rdi = number of loop iterations. It is decremented before it
 *               is tested, so it must be non-zero.
 *   The second C argument is not read.
 *   Each iteration issues eight mulps and eight addps instructions,
 *   alternating, each on its own XMM register.
 *   Clobbers: %xmm0-%xmm15, %rdi, flags.
 */
.globl sse_128b_add_mul_f32f32_f32

/* One multiply on \mul_reg followed by one add on \add_reg. */
.macro SSE_MUL_ADD_PS mul_reg, add_reg
    mulps %xmm\mul_reg, %xmm\mul_reg
    addps %xmm\add_reg, %xmm\add_reg
.endm

sse_128b_add_mul_f32f32_f32:
    /* Zero every register so the arithmetic only ever sees zeros. */
    .irp reg,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
    xorps %xmm\reg, %xmm\reg
    .endr
.sse.128b.add.mul.f32f32_f32.L1:
    SSE_MUL_ADD_PS 0, 1
    SSE_MUL_ADD_PS 2, 3
    SSE_MUL_ADD_PS 4, 5
    SSE_MUL_ADD_PS 6, 7
    /*
     * The counter update sits in the middle of the body. The SSE
     * arithmetic that follows does not write the flags, so the jne at
     * the bottom still tests the result of this sub.
     */
    sub $0x1, %rdi
    SSE_MUL_ADD_PS 8, 9
    SSE_MUL_ADD_PS 10, 11
    SSE_MUL_ADD_PS 12, 13
    SSE_MUL_ADD_PS 14, 15
    jne .sse.128b.add.mul.f32f32_f32.L1
    ret
================================================
FILE: x64/cpufp.cpp
================================================
#include "table.hpp"
#include "smtl.hpp"
#include
#include
#include
#include
#include
#include
#include
#include
#if defined(_AMX_INT8_) || defined(_AMX_BF16_) || defined(_AMX_FP16_)
#include
#define _AMX_TILE_
#endif
using namespace std;
/*
 * Prototypes of the hand-written assembly kernels.
 *
 * Every kernel has the same shape: the first argument is the number of
 * loop iterations to execute, the second is an optional parameter block
 * (the AMX kernels receive the tile configuration there; the SIMD
 * kernels are registered with NULL).
 *
 * Each group is only declared when the matching feature macro is defined.
 */
extern "C"
{
#ifdef _SSE_
    void sse_128b_add_mul_f32f32_f32(int64_t, void *params);
#endif

#ifdef _SSE2_
    void sse2_128b_add_mul_f64f64_f64(int64_t, void *params);
#endif

#ifdef _AVX_
    void avx_256b_add_mul_f32f32_f32(int64_t, void *params);
    void avx_256b_add_mul_f64f64_f64(int64_t, void *params);
#endif

#ifdef _FMA_
    void fma_256b_fma_f32f32f32(int64_t, void *params);
    void fma_256b_fma_f64f64f64(int64_t, void *params);
    void fma_128b_fma_f32f32f32(int64_t, void *params);
    void fma_128b_fma_f64f64f64(int64_t, void *params);
#endif

#ifdef _AVX512F_
    void avx512f_512b_fma_f32f32f32(int64_t, void *params);
    void avx512f_512b_fma_f64f64f64(int64_t, void *params);
    void avx512f_512b_add_mul_f32f32_f32(int64_t, void *params);
    void avx512f_512b_add_mul_f64f64_f64(int64_t, void *params);
#endif

#ifdef _AVX512_BF16_
    void avx512_bf16_512b_dp2a_f32bf16bf16(int64_t, void *params);
    void avx512_bf16_256b_dp2a_f32bf16bf16(int64_t, void *params);
    void avx512_bf16_128b_dp2a_f32bf16bf16(int64_t, void *params);
#endif

#ifdef _AVX512_FP16_
    void avx512_fp16_512b_fma_f16f16f16(int64_t, void *params);
    void avx512_fp16_256b_fma_f16f16f16(int64_t, void *params);
    void avx512_fp16_128b_fma_f16f16f16(int64_t, void *params);
#endif

#ifdef _AVX512_VNNI_
    void avx512_vnni_512b_dp4a_s32u8s8(int64_t, void *params);
    void avx512_vnni_256b_dp4a_s32u8s8(int64_t, void *params);
    void avx512_vnni_128b_dp4a_s32u8s8(int64_t, void *params);
    void avx512_vnni_512b_dp2a_s32s16s16(int64_t, void *params);
    void avx512_vnni_256b_dp2a_s32s16s16(int64_t, void *params);
    void avx512_vnni_128b_dp2a_s32s16s16(int64_t, void *params);
#endif

#ifdef _AVX_VNNI_
    void avx_vnni_256b_dp4a_s32u8s8(int64_t, void *params);
    void avx_vnni_128b_dp4a_s32u8s8(int64_t, void *params);
    void avx_vnni_256b_dp2a_s32s16s16(int64_t, void *params);
    void avx_vnni_128b_dp2a_s32s16s16(int64_t, void *params);
#endif

#ifdef _AVX_VNNI_INT8_
    void avx_vnni_int8_256b_dp4a_s32s8s8(int64_t, void *params);
    void avx_vnni_int8_128b_dp4a_s32s8s8(int64_t, void *params);
    void avx_vnni_int8_256b_dp4a_s32s8u8(int64_t, void *params);
    void avx_vnni_int8_128b_dp4a_s32s8u8(int64_t, void *params);
    void avx_vnni_int8_256b_dp4a_s32u8u8(int64_t, void *params);
    void avx_vnni_int8_128b_dp4a_s32u8u8(int64_t, void *params);
#endif

#ifdef _AVX_VNNI_INT16_
    /*
     * The 16-bit kernels are two-element dot products and are registered
     * under their dp2a names, so they are declared with that spelling.
     * NOTE(review): confirm the assembly file exports the same dp2a
     * symbols.
     */
    void avx_vnni_int16_256b_dp2a_s32s16u16(int64_t, void *params);
    void avx_vnni_int16_128b_dp2a_s32s16u16(int64_t, void *params);
    void avx_vnni_int16_256b_dp2a_s32u16s16(int64_t, void *params);
    void avx_vnni_int16_128b_dp2a_s32u16s16(int64_t, void *params);
    void avx_vnni_int16_256b_dp2a_s32u16u16(int64_t, void *params);
    void avx_vnni_int16_128b_dp2a_s32u16u16(int64_t, void *params);
#endif

#ifdef _AMX_INT8_
    void amx_int8_mm_s32s8s8(int64_t, void* tile_cfg);
    void amx_int8_mm_s32s8u8(int64_t, void* tile_cfg);
    void amx_int8_mm_s32u8s8(int64_t, void* tile_cfg);
    void amx_int8_mm_s32u8u8(int64_t, void* tile_cfg);
#endif

#ifdef _AMX_BF16_
    void amx_bf16_mm_f32bf16bf16(int64_t, void* tile_cfg);
#endif

#ifdef _AMX_FP16_
    void amx_fp16_mm_f32f16f16(int64_t, void* tile_cfg);
#endif
}
#ifdef _AMX_TILE_
/*
 * Tile configuration block whose address is passed as the parameter
 * pointer to every AMX kernel. The fields add up to 64 bytes
 * (1 + 1 + 14 + 16*2 + 16*1).
 * NOTE(review): presumably this mirrors the memory layout consumed by
 * the AMX tile-configuration load in the assembly kernels -- confirm
 * before reordering or resizing any field.
 */
struct
{
uint8_t palette_id; /* set to 1 by init_tile_cfg() */
uint8_t start_row; /* set to 0 by init_tile_cfg() */
uint8_t reserved_0[14]; /* always written as zero */
uint16_t colsb[16]; /* per-tile bytes per row */
uint8_t rows[16]; /* per-tile row count */
} __tilecfg;
/*
 * Fills the global tile configuration: palette 1, start row 0, reserved
 * bytes cleared, tiles 0-7 configured as 16 rows of 64 bytes and tiles
 * 8-15 left empty (0 rows, 0 bytes).
 */
void init_tile_cfg()
{
    const int total_tiles = 16;
    const int used_tiles = 8;

    __tilecfg.palette_id = 1;
    __tilecfg.start_row = 0;

    for (uint8_t &pad : __tilecfg.reserved_0)
    {
        pad = 0;
    }

    for (int tile = 0; tile < total_tiles; ++tile)
    {
        const bool in_use = tile < used_tiles;
        __tilecfg.colsb[tile] = in_use ? 64 : 0;
        __tilecfg.rows[tile] = in_use ? 16 : 0;
    }
}
#endif
/*
 * One registered benchmark: the text shown in the result table plus the
 * kernel to run and the numbers needed to turn its run time into a rate.
 */
typedef struct
{
std::string isa; /* "Instruction Set" column */
std::string vlen; /* "Vector Length" column */
std::string type; /* "Core Computation" column */
std::string dim; /* unit suffix appended after the G/T prefix, e.g. "FLOPS" */
int64_t loop_time; /* iteration count passed as the kernel's first argument */
int64_t comp_pl; /* operations credited per loop iteration */
void *params; /* kernel's second argument; NULL for kernels that take none */
void (*bench)(int64_t, void*); /* the kernel entry point */
} cpubm_t;
static int num_dsa = 0;
static int num_simd_512b = 0;
static int num_simd_256b = 0;
static int num_simd_128b = 0;
static vector bm_list;
/*
 * Returns the time elapsed from *start to *end, in seconds.
 * The nanosecond difference may be negative; it is simply added to the
 * whole-second difference.
 */
static double get_time(struct timespec *start,
                       struct timespec *end)
{
    const double whole_seconds = end->tv_sec - start->tv_sec;
    const double fractional_seconds = (end->tv_nsec - start->tv_nsec) * 1e-9;

    return whole_seconds + fractional_seconds;
}
static void reg_new_isa(std::string isa,
std::string vlen,
std::string type,
std::string dim,
int64_t loop_time,
int64_t comp_pl,
void *params,
void (*bench)(int64_t, void*))
{
cpubm_t new_one;
new_one.isa = isa;
new_one.vlen = vlen;
new_one.type = type;
new_one.dim = dim;
new_one.loop_time = loop_time;
new_one.comp_pl = comp_pl;
new_one.params = params;
new_one.bench = bench;
bm_list.push_back(new_one);
}
static void thread_func(void *params)
{
cpubm_t *bm = (cpubm_t*)params;
if (bm->params)
{
bm->bench(bm->loop_time, bm->params);
}
else
{
bm->bench(bm->loop_time, NULL);
}
}
static void cpubm_x64_one(smtl_handle sh,
cpubm_t &item,
Table &table)
{
struct timespec start, end;
double time_used, perf;
char perfUnit = 'G';
int i;
int num_threads = smtl_num_threads(sh);
// warm up
for (i = 0; i < num_threads; i++)
{
smtl_add_task(sh, thread_func, (void*)&item);
}
smtl_begin_tasks(sh);
smtl_wait_tasks_finished(sh);
clock_gettime(CLOCK_MONOTONIC_RAW, &start);
for (i = 0; i < num_threads; i++)
{
smtl_add_task(sh, thread_func, (void*)&item);
}
smtl_begin_tasks(sh);
smtl_wait_tasks_finished(sh);
clock_gettime(CLOCK_MONOTONIC_RAW, &end);
time_used = get_time(&start, &end);
perf = item.loop_time * item.comp_pl * num_threads /
time_used;
if (perf > 1e12)
{
perfUnit = 'T';
perf /= 1e12;
}
else
{
perf /= 1e9;
}
stringstream ss;
ss << std::setprecision(5) << perf << " " << perfUnit << item.dim;
vector cont;
cont.resize(4);
cont[0] = item.isa;
cont[1] = item.vlen;
cont[2] = item.type;
cont[3] = ss.str();
table.addOneItem(cont);
}
static void cpubm_do_bench(std::vector &set_of_threads,
uint32_t idle_time)
{
int i;
if (bm_list.size())
{
int num_threads = set_of_threads.size();
printf("Number Threads: %d\n", num_threads);
printf("Thread Pool Binding:");
for (i = 0; i < num_threads; i++)
{
printf(" %d", set_of_threads[i]);
}
printf("\n");
// set table head
vector ti;
ti.resize(4);
ti[0] = "Instruction Set";
ti[1] = "Vector Length";
ti[2] = "Core Computation";
ti[3] = "Peak Performance";
Table table;
table.setColumnNum(4);
table.addOneItem(ti);
// set thread pool
smtl_handle sh;
smtl_init(&sh, set_of_threads);
// traverse task list
int idx_g = 0;
if (num_dsa)
{
table.addSeparator();
}
for (i = 0; i < num_dsa; i++)
{
sleep(idle_time);
cpubm_x64_one(sh, bm_list[idx_g], table);
idx_g++;
}
if (num_simd_512b)
{
table.addSeparator();
}
for (i = 0; i < num_simd_512b; i++)
{
sleep(idle_time);
cpubm_x64_one(sh, bm_list[idx_g], table);
idx_g++;
}
if (num_simd_256b)
{
table.addSeparator();
}
for (i = 0; i < num_simd_256b; i++)
{
sleep(idle_time);
cpubm_x64_one(sh, bm_list[idx_g], table);
idx_g++;
}
if (num_simd_128b)
{
table.addSeparator();
}
for (i = 0; i < num_simd_128b; i++)
{
sleep(idle_time);
cpubm_x64_one(sh, bm_list[idx_g], table);
idx_g++;
}
table.print();
smtl_fini(sh);
}
}
/* Appends every core id in [first, last] to cores (nothing if first > last). */
static void push_core_range(std::vector<int> &cores, int first, int last)
{
    for (int core = first; core <= last; core++)
    {
        cores.push_back(core);
    }
}

/*
 * Parses a core list such as "[0,3,5-8,13-15]" and appends the core ids
 * to set_of_threads.
 *
 * sets           - NUL-terminated text; must start with '['
 * set_of_threads - receives the parsed ids, in the order written
 *
 * Entries are separated by ','; "a-b" expands to a, a+1, ..., b.
 * Characters other than digits, ',' and '-' are skipped. If the text does
 * not start with '[' nothing is appended. If the closing ']' is missing,
 * the entries completed by a ',' are kept and the unfinished last entry
 * is dropped. An empty entry counts as core 0.
 */
static void parse_thread_pool(char *sets,
                              std::vector<int> &set_of_threads)
{
    if (sets[0] != '[')
    {
        return;
    }

    int left = 0, right = 0;
    bool in_range = false; /* true once a '-' has been seen in this entry */
    int pos = 1;

    for (; sets[pos] != ']' && sets[pos] != '\0'; pos++)
    {
        const char c = sets[pos];
        if (c >= '0' && c <= '9')
        {
            int &number = in_range ? right : left;
            number = number * 10 + (int)(c - '0');
        }
        else if (c == ',')
        {
            /* A single value is the range left..left. */
            push_core_range(set_of_threads, left, in_range ? right : left);
            left = 0;
            in_range = false;
        }
        else if (c == '-' && !in_range)
        {
            right = 0;
            in_range = true;
        }
    }

    if (sets[pos] != ']')
    {
        return;
    }
    push_core_range(set_of_threads, left, in_range ? right : left);
}
/*
 * Registers every benchmark enabled at compile time.
 *
 * Registration order matters: the runner walks bm_list in the order
 * DSA, 512b, 256b, 128b and uses num_dsa / num_simd_* to find the group
 * boundaries, so each reg_new_isa() call must stay inside its group and
 * each group counter must match the number of calls made.
 *
 * The comp_pl argument is the number of operations credited per loop
 * iteration of the corresponding kernel.
 *
 * AMX kernels are only registered when the kernel grants permission to
 * use the tile-data state; otherwise a diagnostic is printed and they
 * are skipped so that the remaining benchmarks can still run.
 */
static void cpufp_register_isa()
{
    /* Register AMX DSA */
#ifdef _AMX_TILE_
    /* arch_prctl() request code and XSAVE feature number for AMX tile data. */
    const int kArchReqXcompPerm = 0x1023;
    const int kXfeatureXtiledata = 18;
    bool amx_usable = true;

    init_tile_cfg();
    if (syscall(SYS_arch_prctl, kArchReqXcompPerm, kXfeatureXtiledata) != 0)
    {
        perror("arch_prctl(ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)");
        amx_usable = false;
    }
#endif

#ifdef _AMX_INT8_
    if (amx_usable)
    {
        reg_new_isa("AMX_INT8", "DSA", "MM(s32,s8,s8)", "OPS",
            0x2500000LL, 131072LL, &__tilecfg, amx_int8_mm_s32s8s8);
        reg_new_isa("AMX_INT8", "DSA", "MM(s32,s8,u8)", "OPS",
            0x2500000LL, 131072LL, &__tilecfg, amx_int8_mm_s32s8u8);
        reg_new_isa("AMX_INT8", "DSA", "MM(s32,u8,s8)", "OPS",
            0x2500000LL, 131072LL, &__tilecfg, amx_int8_mm_s32u8s8);
        reg_new_isa("AMX_INT8", "DSA", "MM(s32,u8,u8)", "OPS",
            0x2500000LL, 131072LL, &__tilecfg, amx_int8_mm_s32u8u8);
        num_dsa += 4;
    }
#endif

#ifdef _AMX_BF16_
    if (amx_usable)
    {
        reg_new_isa("AMX_BF16", "DSA", "MM(f32,bf16,bf16)", "FLOPS",
            0x2500000LL, 65536LL, &__tilecfg, amx_bf16_mm_f32bf16bf16);
        num_dsa++;
    }
#endif

#ifdef _AMX_FP16_
    if (amx_usable)
    {
        reg_new_isa("AMX_FP16", "DSA", "MM(f32,f16,f16)", "FLOPS",
            0x2500000LL, 65536LL, &__tilecfg, amx_fp16_mm_f32f16f16);
        num_dsa++;
    }
#endif

    /* Register 512b SIMD ISA */
#ifdef _AVX512_VNNI_
    reg_new_isa("AVX512_VNNI", "512b", "DP4A(s32,u8,s8)", "OPS",
        0x20000000LL, 2048LL, NULL, avx512_vnni_512b_dp4a_s32u8s8);
    reg_new_isa("AVX512_VNNI", "512b", "DP2A(s32,s16,s16)", "OPS",
        0x20000000LL, 1024LL, NULL, avx512_vnni_512b_dp2a_s32s16s16);
    num_simd_512b += 2;
#endif

#ifdef _AVX512_BF16_
    reg_new_isa("AVX512_BF16", "512b", "DP2A(f32,bf16,bf16)", "FLOPS",
        0x20000000LL, 1024LL, NULL, avx512_bf16_512b_dp2a_f32bf16bf16);
    num_simd_512b++;
#endif

#ifdef _AVX512_FP16_
    reg_new_isa("AVX512_FP16", "512b", "FMA(f16,f16,f16)", "FLOPS",
        0x20000000LL, 1024LL, NULL, avx512_fp16_512b_fma_f16f16f16);
    num_simd_512b++;
#endif

#ifdef _AVX512F_
    reg_new_isa("AVX512F", "512b", "FMA(f32,f32,f32)", "FLOPS",
        0x20000000LL, 512LL, NULL, avx512f_512b_fma_f32f32f32);
    reg_new_isa("AVX512F", "512b", "FMA(f64,f64,f64)", "FLOPS",
        0x20000000LL, 256LL, NULL, avx512f_512b_fma_f64f64f64);
    reg_new_isa("AVX512F", "512b", "ADD(MUL(f32,f32),f32)", "FLOPS",
        0x20000000LL, 256LL, NULL, avx512f_512b_add_mul_f32f32_f32);
    reg_new_isa("AVX512F", "512b", "ADD(MUL(f64,f64),f64)", "FLOPS",
        0x20000000LL, 128LL, NULL, avx512f_512b_add_mul_f64f64_f64);
    num_simd_512b += 4;
#endif

    /* Register 256b SIMD ISA */
#ifdef _AVX512_VNNI_
    reg_new_isa("AVX512_VNNI", "256b", "DP4A(s32,u8,s8)", "OPS",
        0x20000000LL, 1024LL, NULL, avx512_vnni_256b_dp4a_s32u8s8);
    num_simd_256b++;
#endif

#ifdef _AVX_VNNI_
    reg_new_isa("AVX_VNNI", "256b", "DP4A(s32,u8,s8)", "OPS",
        0x20000000LL, 1024LL, NULL, avx_vnni_256b_dp4a_s32u8s8);
    num_simd_256b++;
#endif

#ifdef _AVX_VNNI_INT8_
    reg_new_isa("AVX_VNNI_INT8", "256b", "DP4A(s32,s8,s8)", "OPS",
        0x20000000LL, 1024LL, NULL, avx_vnni_int8_256b_dp4a_s32s8s8);
    reg_new_isa("AVX_VNNI_INT8", "256b", "DP4A(s32,s8,u8)", "OPS",
        0x20000000LL, 1024LL, NULL, avx_vnni_int8_256b_dp4a_s32s8u8);
    reg_new_isa("AVX_VNNI_INT8", "256b", "DP4A(s32,u8,u8)", "OPS",
        0x20000000LL, 1024LL, NULL, avx_vnni_int8_256b_dp4a_s32u8u8);
    num_simd_256b += 3;
#endif

#ifdef _AVX512_VNNI_
    reg_new_isa("AVX512_VNNI", "256b", "DP2A(s32,s16,s16)", "OPS",
        0x20000000LL, 512LL, NULL, avx512_vnni_256b_dp2a_s32s16s16);
    num_simd_256b++;
#endif

#ifdef _AVX_VNNI_
    reg_new_isa("AVX_VNNI", "256b", "DP2A(s32,s16,s16)", "OPS",
        0x20000000LL, 512LL, NULL, avx_vnni_256b_dp2a_s32s16s16);
    num_simd_256b++;
#endif

#ifdef _AVX_VNNI_INT16_
    reg_new_isa("AVX_VNNI_INT16", "256b", "DP2A(s32,s16,u16)", "OPS",
        0x20000000LL, 512LL, NULL, avx_vnni_int16_256b_dp2a_s32s16u16);
    reg_new_isa("AVX_VNNI_INT16", "256b", "DP2A(s32,u16,s16)", "OPS",
        0x20000000LL, 512LL, NULL, avx_vnni_int16_256b_dp2a_s32u16s16);
    reg_new_isa("AVX_VNNI_INT16", "256b", "DP2A(s32,u16,u16)", "OPS",
        0x20000000LL, 512LL, NULL, avx_vnni_int16_256b_dp2a_s32u16u16);
    num_simd_256b += 3;
#endif

#ifdef _AVX512_BF16_
    reg_new_isa("AVX512_BF16", "256b", "DP2A(f32,bf16,bf16)", "FLOPS",
        0x20000000LL, 512LL, NULL, avx512_bf16_256b_dp2a_f32bf16bf16);
    num_simd_256b++;
#endif

#ifdef _AVX512_FP16_
    reg_new_isa("AVX512_FP16", "256b", "FMA(f16,f16,f16)", "FLOPS",
        0x20000000LL, 512LL, NULL, avx512_fp16_256b_fma_f16f16f16);
    num_simd_256b++;
#endif

#ifdef _FMA_
    reg_new_isa("FMA", "256b", "FMA(f32,f32,f32)", "FLOPS",
        0x20000000LL, 256LL, NULL, fma_256b_fma_f32f32f32);
    reg_new_isa("FMA", "256b", "FMA(f64,f64,f64)", "FLOPS",
        0x20000000LL, 128LL, NULL, fma_256b_fma_f64f64f64);
    num_simd_256b += 2;
#endif

#ifdef _AVX_
    reg_new_isa("AVX", "256b", "ADD(MUL(f32,f32),f32)", "FLOPS",
        0x20000000LL, 128LL, NULL, avx_256b_add_mul_f32f32_f32);
    reg_new_isa("AVX", "256b", "ADD(MUL(f64,f64),f64)", "FLOPS",
        0x20000000LL, 64LL, NULL, avx_256b_add_mul_f64f64_f64);
    num_simd_256b += 2;
#endif

    /* Register 128b SIMD ISA */
#ifdef _AVX512_VNNI_
    reg_new_isa("AVX512_VNNI", "128b", "DP4A(s32,u8,s8)", "OPS",
        0x20000000LL, 512LL, NULL, avx512_vnni_128b_dp4a_s32u8s8);
    num_simd_128b++;
#endif

#ifdef _AVX_VNNI_
    reg_new_isa("AVX_VNNI", "128b", "DP4A(s32,u8,s8)", "OPS",
        0x20000000LL, 512LL, NULL, avx_vnni_128b_dp4a_s32u8s8);
    num_simd_128b++;
#endif

#ifdef _AVX_VNNI_INT8_
    reg_new_isa("AVX_VNNI_INT8", "128b", "DP4A(s32,s8,s8)", "OPS",
        0x20000000LL, 512LL, NULL, avx_vnni_int8_128b_dp4a_s32s8s8);
    reg_new_isa("AVX_VNNI_INT8", "128b", "DP4A(s32,s8,u8)", "OPS",
        0x20000000LL, 512LL, NULL, avx_vnni_int8_128b_dp4a_s32s8u8);
    reg_new_isa("AVX_VNNI_INT8", "128b", "DP4A(s32,u8,u8)", "OPS",
        0x20000000LL, 512LL, NULL, avx_vnni_int8_128b_dp4a_s32u8u8);
    num_simd_128b += 3;
#endif

#ifdef _AVX512_VNNI_
    reg_new_isa("AVX512_VNNI", "128b", "DP2A(s32,s16,s16)", "OPS",
        0x20000000LL, 256LL, NULL, avx512_vnni_128b_dp2a_s32s16s16);
    num_simd_128b++;
#endif

#ifdef _AVX_VNNI_
    reg_new_isa("AVX_VNNI", "128b", "DP2A(s32,s16,s16)", "OPS",
        0x20000000LL, 256LL, NULL, avx_vnni_128b_dp2a_s32s16s16);
    num_simd_128b++;
#endif

#ifdef _AVX_VNNI_INT16_
    reg_new_isa("AVX_VNNI_INT16", "128b", "DP2A(s32,s16,u16)", "OPS",
        0x20000000LL, 256LL, NULL, avx_vnni_int16_128b_dp2a_s32s16u16);
    reg_new_isa("AVX_VNNI_INT16", "128b", "DP2A(s32,u16,s16)", "OPS",
        0x20000000LL, 256LL, NULL, avx_vnni_int16_128b_dp2a_s32u16s16);
    reg_new_isa("AVX_VNNI_INT16", "128b", "DP2A(s32,u16,u16)", "OPS",
        0x20000000LL, 256LL, NULL, avx_vnni_int16_128b_dp2a_s32u16u16);
    num_simd_128b += 3;
#endif

#ifdef _AVX512_BF16_
    reg_new_isa("AVX512_BF16", "128b", "DP2A(f32,bf16,bf16)", "FLOPS",
        0x20000000LL, 256LL, NULL, avx512_bf16_128b_dp2a_f32bf16bf16);
    num_simd_128b++;
#endif

#ifdef _AVX512_FP16_
    reg_new_isa("AVX512_FP16", "128b", "FMA(f16,f16,f16)", "FLOPS",
        0x20000000LL, 256LL, NULL, avx512_fp16_128b_fma_f16f16f16);
    num_simd_128b++;
#endif

#ifdef _FMA_
    reg_new_isa("FMA", "128b", "FMA(f32,f32,f32)", "FLOPS",
        0x20000000LL, 128LL, NULL, fma_128b_fma_f32f32f32);
    reg_new_isa("FMA", "128b", "FMA(f64,f64,f64)", "FLOPS",
        0x20000000LL, 64LL, NULL, fma_128b_fma_f64f64f64);
    num_simd_128b += 2;
#endif

#ifdef _SSE_
    reg_new_isa("SSE", "128b", "ADD(MUL(f32,f32),f32)", "FLOPS",
        0x20000000LL, 64LL, NULL, sse_128b_add_mul_f32f32_f32);
    num_simd_128b++;
#endif

#ifdef _SSE2_
    reg_new_isa("SSE2", "128b", "ADD(MUL(f64,f64),f64)", "FLOPS",
        0x20000000LL, 32LL, NULL, sse2_128b_add_mul_f64f64_f64);
    num_simd_128b++;
#endif
}
/*
 * Entry point of the benchmark.
 *
 * Options:
 *   --thread_pool=[list]  cores to benchmark, e.g. [0,3,5-8,13-15] (required)
 *   --idle_time=N         seconds to pause before each benchmark (default 0;
 *                         negative values are treated as 0)
 *
 * Prints the usage text to stderr and exits with a failure status when no
 * usable core list was given, including the case where --thread_pool was
 * present but could not be parsed into at least one core.
 */
int main(int argc, char *argv[])
{
    std::vector<int> set_of_threads;
    uint32_t idle_time = 0;

    for (int i = 1; i < argc; i++)
    {
        if (strncmp(argv[i], "--thread_pool=", 14) == 0)
        {
            parse_thread_pool(argv[i] + 14, set_of_threads);
        }
        else if (strncmp(argv[i], "--idle_time=", 12) == 0)
        {
            /* A negative value would wrap to a multi-year sleep when cast. */
            const int seconds = atoi(argv[i] + 12);
            idle_time = seconds > 0 ? (uint32_t)seconds : 0;
        }
    }

    /* An empty pool means the option was missing or malformed. */
    if (set_of_threads.empty())
    {
        fprintf(stderr, "Error: You must set --thread_pool parameter.\n");
        fprintf(stderr, "You may also set --idle_time parameter.\n");
        fprintf(stderr, "Usage: %s --thread_pool=[xxx] --idle_time=yyy\n", argv[0]);
        fprintf(stderr, "[xxx] indicates all cores to benchmark.\n");
        fprintf(stderr, "Example: [0,3,5-8,13-15].\n");
        fprintf(stderr, "idle_time is the interval time(s) between every two benchmarks.\n");
        fprintf(stderr, "idle_time parameter can be ignored, the default value is 0s.\n");
        fprintf(stderr, "Notice: there must NOT be any spaces.\n");
        exit(EXIT_FAILURE);
    }

    cpufp_register_isa();
    cpubm_do_bench(set_of_threads, idle_time);
    return 0;
}
================================================
FILE: x64/cpuid.c
================================================
#include
/* The four output registers of one CPUID invocation. */
struct cpuid_t
{
unsigned int eax;
unsigned int ebx;
unsigned int ecx;
unsigned int edx;
};
/*
 * Evaluates to 1 when bit `pos` of `bit_map` is set, otherwise 0.
 * The mask is built from an unsigned one so that testing bit 31 does not
 * shift into the sign bit of an int.
 */
#define BIT_TEST(bit_map, pos) (((bit_map) & (1u << (pos))) ? 1 : 0)
/*
 * Executes the CPUID instruction and stores its four result registers.
 *
 * ieax  - value loaded into EAX before the instruction (the leaf)
 * iecx  - value loaded into ECX before the instruction (the sub-leaf)
 * cpuid - receives EAX, EBX, ECX and EDX as left by the instruction
 */
static void cpuid_x86_exec(unsigned int ieax,
unsigned int iecx,
struct cpuid_t *cpuid)
{
/* Inputs "0" and "2" reuse the registers of outputs 0 (EAX) and 2 (ECX). */
asm volatile ("cpuid"
: "=a"(cpuid->eax), "=b"(cpuid->ebx), "=c"(cpuid->ecx), "=d"(cpuid->edx)
: "0"(ieax), "2"(iecx));
}
int main()
{
struct cpuid_t cpuid_0x1_0x0, cpuid_0x7_0x0, cpuid_0x7_0x1;;
cpuid_x86_exec(0x1, 0x0, &cpuid_0x1_0x0);
cpuid_x86_exec(0x7, 0x0, &cpuid_0x7_0x0);
cpuid_x86_exec(0x7, 0x1, &cpuid_0x7_0x1);
if (BIT_TEST(cpuid_0x7_0x0.edx, 24))
{
if (BIT_TEST(cpuid_0x7_0x0.edx, 25))
{
printf("_AMX_INT8_\n");
}
if (BIT_TEST(cpuid_0x7_0x0.edx, 22))
{
printf("_AMX_BF16_\n");
}
if (BIT_TEST(cpuid_0x7_0x1.eax, 21))
{
printf("_AMX_FP16_\n");
}
}
if (BIT_TEST(cpuid_0x7_0x1.eax, 4))
{
printf("_AVX_VNNI_\n");
}
if (BIT_TEST(cpuid_0x7_0x1.edx, 4))
{
printf("_AVX_VNNI_INT8_\n");
}
if (BIT_TEST(cpuid_0x7_0x1.edx, 10))
{
printf("_AVX_VNNI_INT16_\n");
}
if (BIT_TEST(cpuid_0x7_0x0.ecx, 11))
{
printf("_AVX512_VNNI_\n");
}
if (BIT_TEST(cpuid_0x7_0x1.eax, 5))
{
printf("_AVX512_BF16_\n");
}
if (BIT_TEST(cpuid_0x7_0x0.edx, 23))
{
printf("_AVX512_FP16_\n");
}
if (BIT_TEST(cpuid_0x7_0x0.ebx, 16))
{
printf("_AVX512F_\n");
}
if (BIT_TEST(cpuid_0x1_0x0.ecx, 12))
{
printf("_FMA_\n");
}
if (BIT_TEST(cpuid_0x1_0x0.ecx, 28))
{
printf("_AVX_\n");
}
if (BIT_TEST(cpuid_0x1_0x0.edx, 25))
{
printf("_SSE_\n");
}
if (BIT_TEST(cpuid_0x1_0x0.edx, 26))
{
printf("_SSE2_\n");
}
return 0;
}