Repository: VIA-Research/uPIMulator
Branch: main
Commit: 870d916334e9
Files: 2582
Total size: 229.4 MB
Directory structure:
gitextract_ry7k9cpr/
├── .gitignore
├── LICENSE
├── README.md
├── assets/
│ ├── figure5_mem_util_calculator.xlsx
│ ├── figure7_active_tasklet_breakdown.xlsx
│ └── figure9_instruction_mix.xlsx
├── golang/
│ ├── README.md
│ └── uPIMulator/
│ ├── benchmark/
│ │ ├── BS/
│ │ │ ├── CMakeLists.txt
│ │ │ ├── Makefile
│ │ │ ├── baselines/
│ │ │ │ ├── cpu/
│ │ │ │ │ ├── Makefile
│ │ │ │ │ ├── README
│ │ │ │ │ ├── bs_omp.c
│ │ │ │ │ └── timer.h
│ │ │ │ └── gpu/
│ │ │ │ ├── Makefile
│ │ │ │ ├── README
│ │ │ │ ├── binary_search.cu
│ │ │ │ ├── binary_search.h
│ │ │ │ ├── cpu_lib.py
│ │ │ │ ├── cu_lib_import.py
│ │ │ │ └── run.py
│ │ │ ├── dpu/
│ │ │ │ ├── CMakeLists.txt
│ │ │ │ └── task.c
│ │ │ ├── host/
│ │ │ │ └── app.c
│ │ │ └── support/
│ │ │ ├── common.h
│ │ │ ├── params.h
│ │ │ └── timer.h
│ │ ├── CMakeLists.txt
│ │ ├── GEMV/
│ │ │ ├── CMakeLists.txt
│ │ │ ├── Makefile
│ │ │ ├── _BL_10.conf
│ │ │ ├── _NR_TASKLETS_10_BL_10.conf
│ │ │ ├── _NR_TASKLETS_11_BL_10.conf
│ │ │ ├── _NR_TASKLETS_12_BL_10.conf
│ │ │ ├── _NR_TASKLETS_13_BL_10.conf
│ │ │ ├── _NR_TASKLETS_14_BL_10.conf
│ │ │ ├── _NR_TASKLETS_15_BL_10.conf
│ │ │ ├── _NR_TASKLETS_16
│ │ │ ├── _NR_TASKLETS_16_BL_10.conf
│ │ │ ├── _NR_TASKLETS_17_BL_10.conf
│ │ │ ├── _NR_TASKLETS_18_BL_10.conf
│ │ │ ├── _NR_TASKLETS_19_BL_10.conf
│ │ │ ├── _NR_TASKLETS_1_BL_10.conf
│ │ │ ├── _NR_TASKLETS_20_BL_10.conf
│ │ │ ├── _NR_TASKLETS_21_BL_10.conf
│ │ │ ├── _NR_TASKLETS_22_BL_10.conf
│ │ │ ├── _NR_TASKLETS_23_BL_10.conf
│ │ │ ├── _NR_TASKLETS_24_BL_10.conf
│ │ │ ├── _NR_TASKLETS_2_BL_10.conf
│ │ │ ├── _NR_TASKLETS_3_BL_10.conf
│ │ │ ├── _NR_TASKLETS_4_BL_10.conf
│ │ │ ├── _NR_TASKLETS_5_BL_10.conf
│ │ │ ├── _NR_TASKLETS_6_BL_10.conf
│ │ │ ├── _NR_TASKLETS_7_BL_10.conf
│ │ │ ├── _NR_TASKLETS_8_BL_10.conf
│ │ │ ├── _NR_TASKLETS_9_BL_10.conf
│ │ │ ├── baselines/
│ │ │ │ ├── cpu/
│ │ │ │ │ ├── Makefile
│ │ │ │ │ ├── README
│ │ │ │ │ ├── gemv_openmp.c
│ │ │ │ │ └── gemv_utils.h
│ │ │ │ └── gpu/
│ │ │ │ ├── Makefile
│ │ │ │ ├── README
│ │ │ │ └── gemv.cu
│ │ │ ├── dpu/
│ │ │ │ ├── CMakeLists.txt
│ │ │ │ └── task.c
│ │ │ ├── host/
│ │ │ │ └── app.c
│ │ │ └── support/
│ │ │ ├── common.h
│ │ │ ├── params.h
│ │ │ └── timer.h
│ │ ├── HST-L/
│ │ │ ├── CMakeLists.txt
│ │ │ ├── Makefile
│ │ │ ├── dpu/
│ │ │ │ ├── CMakeLists.txt
│ │ │ │ └── task.c
│ │ │ ├── host/
│ │ │ │ └── app.c
│ │ │ ├── input/
│ │ │ │ └── image_VanHateren.iml
│ │ │ ├── run.sh
│ │ │ └── support/
│ │ │ ├── common.h
│ │ │ ├── params.h
│ │ │ └── timer.h
│ │ ├── HST-S/
│ │ │ ├── CMakeLists.txt
│ │ │ ├── Makefile
│ │ │ ├── baselines/
│ │ │ │ ├── cpu/
│ │ │ │ │ ├── Makefile
│ │ │ │ │ ├── README
│ │ │ │ │ └── app_baseline.c
│ │ │ │ └── gpu/
│ │ │ │ ├── Makefile
│ │ │ │ ├── README
│ │ │ │ ├── input/
│ │ │ │ │ └── image_VanHateren.iml
│ │ │ │ ├── kernel.cpp
│ │ │ │ ├── kernel.cu
│ │ │ │ ├── kernel.h
│ │ │ │ ├── main.cpp
│ │ │ │ └── support/
│ │ │ │ ├── common.h
│ │ │ │ ├── cuda-setup.h
│ │ │ │ ├── partitioner.h
│ │ │ │ ├── timer.h
│ │ │ │ └── verify.h
│ │ │ ├── dpu/
│ │ │ │ ├── CMakeLists.txt
│ │ │ │ └── task.c
│ │ │ ├── host/
│ │ │ │ └── app.c
│ │ │ ├── input/
│ │ │ │ └── image_VanHateren.iml
│ │ │ ├── run.sh
│ │ │ └── support/
│ │ │ ├── common.h
│ │ │ ├── params.h
│ │ │ └── timer.h
│ │ ├── MLP/
│ │ │ ├── CMakeLists.txt
│ │ │ ├── Makefile
│ │ │ ├── _BL_10.conf
│ │ │ ├── _NR_TASKLETS_10_BL_10.conf
│ │ │ ├── _NR_TASKLETS_11_BL_10.conf
│ │ │ ├── _NR_TASKLETS_12_BL_10.conf
│ │ │ ├── _NR_TASKLETS_13_BL_10.conf
│ │ │ ├── _NR_TASKLETS_14_BL_10.conf
│ │ │ ├── _NR_TASKLETS_15_BL_10.conf
│ │ │ ├── _NR_TASKLETS_16
│ │ │ ├── _NR_TASKLETS_16_BL_10.conf
│ │ │ ├── _NR_TASKLETS_17_BL_10.conf
│ │ │ ├── _NR_TASKLETS_18_BL_10.conf
│ │ │ ├── _NR_TASKLETS_19_BL_10.conf
│ │ │ ├── _NR_TASKLETS_1_BL_10.conf
│ │ │ ├── _NR_TASKLETS_20_BL_10.conf
│ │ │ ├── _NR_TASKLETS_21_BL_10.conf
│ │ │ ├── _NR_TASKLETS_22_BL_10.conf
│ │ │ ├── _NR_TASKLETS_23_BL_10.conf
│ │ │ ├── _NR_TASKLETS_24_BL_10.conf
│ │ │ ├── _NR_TASKLETS_2_BL_10.conf
│ │ │ ├── _NR_TASKLETS_3_BL_10.conf
│ │ │ ├── _NR_TASKLETS_4_BL_10.conf
│ │ │ ├── _NR_TASKLETS_5_BL_10.conf
│ │ │ ├── _NR_TASKLETS_6_BL_10.conf
│ │ │ ├── _NR_TASKLETS_7_BL_10.conf
│ │ │ ├── _NR_TASKLETS_8_BL_10.conf
│ │ │ ├── _NR_TASKLETS_9_BL_10.conf
│ │ │ ├── baselines/
│ │ │ │ ├── cpu/
│ │ │ │ │ ├── Makefile
│ │ │ │ │ ├── README
│ │ │ │ │ └── mlp_openmp.c
│ │ │ │ └── gpu/
│ │ │ │ ├── Makefile
│ │ │ │ ├── README
│ │ │ │ └── mlp.cu
│ │ │ ├── dpu/
│ │ │ │ ├── CMakeLists.txt
│ │ │ │ └── task.c
│ │ │ ├── host/
│ │ │ │ └── app.c
│ │ │ └── support/
│ │ │ ├── common.h
│ │ │ ├── params.h
│ │ │ └── timer.h
│ │ ├── RED/
│ │ │ ├── CMakeLists.txt
│ │ │ ├── Makefile
│ │ │ ├── baselines/
│ │ │ │ ├── cpu/
│ │ │ │ │ ├── Makefile
│ │ │ │ │ ├── README
│ │ │ │ │ └── app_baseline.cpp
│ │ │ │ └── gpu/
│ │ │ │ ├── Makefile
│ │ │ │ ├── README
│ │ │ │ └── app_baseline.cu
│ │ │ ├── dpu/
│ │ │ │ ├── CMakeLists.txt
│ │ │ │ └── task.c
│ │ │ ├── host/
│ │ │ │ └── app.c
│ │ │ ├── run.sh
│ │ │ └── support/
│ │ │ ├── common.h
│ │ │ ├── cyclecount.h
│ │ │ ├── params.h
│ │ │ └── timer.h
│ │ ├── SCAN-RSS/
│ │ │ ├── CMakeLists.txt
│ │ │ ├── Makefile
│ │ │ ├── baselines/
│ │ │ │ ├── cpu/
│ │ │ │ │ ├── Makefile
│ │ │ │ │ ├── README
│ │ │ │ │ └── app_baseline.cpp
│ │ │ │ └── gpu/
│ │ │ │ ├── Makefile
│ │ │ │ ├── README
│ │ │ │ └── app_baseline.cu
│ │ │ ├── dpu/
│ │ │ │ ├── CMakeLists.txt
│ │ │ │ └── task.c
│ │ │ ├── host/
│ │ │ │ └── app.c
│ │ │ ├── run.sh
│ │ │ └── support/
│ │ │ ├── common.h
│ │ │ ├── params.h
│ │ │ └── timer.h
│ │ ├── SCAN-SSA/
│ │ │ ├── CMakeLists.txt
│ │ │ ├── Makefile
│ │ │ ├── dpu/
│ │ │ │ ├── CMakeLists.txt
│ │ │ │ └── task.c
│ │ │ ├── host/
│ │ │ │ └── app.c
│ │ │ ├── run.sh
│ │ │ └── support/
│ │ │ ├── common.h
│ │ │ ├── params.h
│ │ │ └── timer.h
│ │ ├── SEL/
│ │ │ ├── CMakeLists.txt
│ │ │ ├── Makefile
│ │ │ ├── baselines/
│ │ │ │ ├── cpu/
│ │ │ │ │ ├── Makefile
│ │ │ │ │ ├── README
│ │ │ │ │ └── app_baseline.c
│ │ │ │ └── gpu/
│ │ │ │ ├── Makefile
│ │ │ │ ├── README
│ │ │ │ ├── ds.h
│ │ │ │ ├── kernel.cu
│ │ │ │ └── select.cu
│ │ │ ├── dpu/
│ │ │ │ ├── CMakeLists.txt
│ │ │ │ └── task.c
│ │ │ ├── host/
│ │ │ │ └── app.c
│ │ │ └── support/
│ │ │ ├── common.h
│ │ │ ├── params.h
│ │ │ └── timer.h
│ │ ├── TRNS/
│ │ │ ├── CMakeLists.txt
│ │ │ ├── Makefile
│ │ │ ├── baselines/
│ │ │ │ ├── cpu/
│ │ │ │ │ ├── Makefile
│ │ │ │ │ ├── README
│ │ │ │ │ ├── kernel.cpp
│ │ │ │ │ ├── kernel.h
│ │ │ │ │ ├── main.cpp
│ │ │ │ │ └── support/
│ │ │ │ │ ├── common.h
│ │ │ │ │ ├── setup.h
│ │ │ │ │ ├── timer.h
│ │ │ │ │ └── verify.h
│ │ │ │ └── gpu/
│ │ │ │ ├── Makefile
│ │ │ │ ├── README
│ │ │ │ ├── kernel.cu
│ │ │ │ ├── kernel.h
│ │ │ │ ├── main.cpp
│ │ │ │ └── support/
│ │ │ │ ├── common.h
│ │ │ │ ├── cuda-setup.h
│ │ │ │ ├── timer.h
│ │ │ │ └── verify.h
│ │ │ ├── dpu/
│ │ │ │ ├── CMakeLists.txt
│ │ │ │ └── task.c
│ │ │ ├── host/
│ │ │ │ └── app.c
│ │ │ └── support/
│ │ │ ├── common.h
│ │ │ ├── params.h
│ │ │ └── timer.h
│ │ ├── TS/
│ │ │ ├── CMakeLists.txt
│ │ │ ├── Makefile
│ │ │ ├── dpu/
│ │ │ │ ├── CMakeLists.txt
│ │ │ │ └── task.c
│ │ │ ├── host/
│ │ │ │ └── app.c
│ │ │ └── support/
│ │ │ ├── common.h
│ │ │ ├── params.h
│ │ │ └── timer.h
│ │ ├── UNI/
│ │ │ ├── CMakeLists.txt
│ │ │ ├── Makefile
│ │ │ ├── baselines/
│ │ │ │ ├── cpu/
│ │ │ │ │ ├── Makefile
│ │ │ │ │ ├── README
│ │ │ │ │ └── app_baseline.c
│ │ │ │ └── gpu/
│ │ │ │ ├── Makefile
│ │ │ │ ├── README
│ │ │ │ ├── ds.h
│ │ │ │ ├── kernel.cu
│ │ │ │ └── unique.cu
│ │ │ ├── dpu/
│ │ │ │ ├── CMakeLists.txt
│ │ │ │ └── task.c
│ │ │ ├── host/
│ │ │ │ └── app.c
│ │ │ └── support/
│ │ │ ├── common.h
│ │ │ ├── params.h
│ │ │ └── timer.h
│ │ ├── VA/
│ │ │ ├── CMakeLists.txt
│ │ │ ├── Makefile
│ │ │ ├── baselines/
│ │ │ │ ├── cpu/
│ │ │ │ │ ├── Makefile
│ │ │ │ │ ├── README
│ │ │ │ │ └── app_baseline.c
│ │ │ │ └── gpu/
│ │ │ │ ├── Makefile
│ │ │ │ ├── README
│ │ │ │ └── vec_add.cu
│ │ │ ├── dpu/
│ │ │ │ ├── CMakeLists.txt
│ │ │ │ └── task.c
│ │ │ ├── host/
│ │ │ │ └── app.c
│ │ │ └── support/
│ │ │ ├── common.h
│ │ │ ├── params.h
│ │ │ └── timer.h
│ │ └── build.py
│ ├── docker/
│ │ └── Dockerfile
│ ├── go.mod
│ ├── go.sum
│ ├── script/
│ │ ├── build.py
│ │ ├── format.py
│ │ ├── run_validation.sh
│ │ └── visualize.py
│ ├── sdk/
│ │ ├── CMakeLists.txt
│ │ ├── build.py
│ │ ├── misc/
│ │ │ ├── CMakeLists.txt
│ │ │ ├── accessMramFromDpu.c
│ │ │ ├── coreDump.c
│ │ │ ├── crt0.c
│ │ │ ├── dpu.lds
│ │ │ ├── internalStateReset.c
│ │ │ ├── linkerScript.lds
│ │ │ ├── restoreRegisters.c
│ │ │ └── restore_carry_and_zero_flag.h
│ │ ├── stdlib/
│ │ │ ├── CMakeLists.txt
│ │ │ ├── abort.c
│ │ │ ├── assert.h
│ │ │ ├── atoi.c
│ │ │ ├── atol.c
│ │ │ ├── ctype.h
│ │ │ ├── errno.c
│ │ │ ├── errno.h
│ │ │ ├── exit.c
│ │ │ ├── inttypes.h
│ │ │ ├── iso646.h
│ │ │ ├── limits.h
│ │ │ ├── memchr.c
│ │ │ ├── memcmp.c
│ │ │ ├── memcpy.c
│ │ │ ├── memmove.c
│ │ │ ├── memmram_utils.h
│ │ │ ├── memset.c
│ │ │ ├── stdalign.h
│ │ │ ├── stdarg.h
│ │ │ ├── stdbool.h
│ │ │ ├── stddef.h
│ │ │ ├── stdint.h
│ │ │ ├── stdio.c
│ │ │ ├── stdio.h
│ │ │ ├── stdlib.h
│ │ │ ├── stdnoreturn.h
│ │ │ ├── stpcpy.c
│ │ │ ├── stpncpy.c
│ │ │ ├── strcat.c
│ │ │ ├── strchr.c
│ │ │ ├── strcmp.c
│ │ │ ├── strcpy.c
│ │ │ ├── strcspn.c
│ │ │ ├── strdup.c
│ │ │ ├── strerror.c
│ │ │ ├── string.h
│ │ │ ├── strlen.c
│ │ │ ├── strlwr.c
│ │ │ ├── strncat.c
│ │ │ ├── strncmp.c
│ │ │ ├── strncpy.c
│ │ │ ├── strndup.c
│ │ │ ├── strnlen.c
│ │ │ ├── strpbrk.c
│ │ │ ├── strrchr.c
│ │ │ ├── strrev.c
│ │ │ ├── strsep.c
│ │ │ ├── strspn.c
│ │ │ ├── strstr.c
│ │ │ ├── strtok_r.c
│ │ │ ├── strtol.c
│ │ │ └── strupr.c
│ │ └── syslib/
│ │ ├── CMakeLists.txt
│ │ ├── absvdi2.c
│ │ ├── absvsi2.c
│ │ ├── adddf3.c
│ │ ├── addsf3.c
│ │ ├── addvdi3.c
│ │ ├── addvsi3.c
│ │ ├── alloc.c
│ │ ├── alloc.h
│ │ ├── ashldi3.c
│ │ ├── ashrdi3.c
│ │ ├── atomic_bit.h
│ │ ├── atomics.c
│ │ ├── attributes.h
│ │ ├── barrier.c
│ │ ├── barrier.h
│ │ ├── bswapdi2.c
│ │ ├── bswapsi2.c
│ │ ├── buddy_alloc.c
│ │ ├── buddy_alloc.h
│ │ ├── buddy_realloc.c
│ │ ├── built_ins.h
│ │ ├── clzdi2.c
│ │ ├── clzsi2.c
│ │ ├── cmpdi2.c
│ │ ├── comparedf2.c
│ │ ├── comparesf2.c
│ │ ├── ctzdi2.c
│ │ ├── ctzsi2.c
│ │ ├── defs.c
│ │ ├── defs.h
│ │ ├── devprivate.h
│ │ ├── div32.c
│ │ ├── divdf3.c
│ │ ├── divdi3.c
│ │ ├── divmodsi4.c
│ │ ├── divsf3.c
│ │ ├── divsi3.c
│ │ ├── dpuconst.h
│ │ ├── dpufault.h
│ │ ├── dpuruntime.h
│ │ ├── extendhfsf2.c
│ │ ├── extendsfdf2.c
│ │ ├── ffsdi2.c
│ │ ├── ffssi2.c
│ │ ├── ffsti2.c
│ │ ├── fixdfdi.c
│ │ ├── fixdfsi.c
│ │ ├── fixsfdi.c
│ │ ├── fixsfsi.c
│ │ ├── fixunsdfdi.c
│ │ ├── fixunsdfsi.c
│ │ ├── fixunssfdi.c
│ │ ├── fixunssfsi.c
│ │ ├── float.h
│ │ ├── floatdidf.c
│ │ ├── floatdisf.c
│ │ ├── floatsidf.c
│ │ ├── floatsisf.c
│ │ ├── floatundidf.c
│ │ ├── floatundisf.c
│ │ ├── floatunsidf.c
│ │ ├── floatunsisf.c
│ │ ├── fp_add_impl.inc
│ │ ├── fp_extend.h
│ │ ├── fp_extend_impl.inc
│ │ ├── fp_fixint_impl.inc
│ │ ├── fp_fixuint_impl.inc
│ │ ├── fp_lib.h
│ │ ├── fp_mul_impl.inc
│ │ ├── fp_trunc.h
│ │ ├── fp_trunc_impl.inc
│ │ ├── fsb_allocator.c
│ │ ├── fsb_allocator.h
│ │ ├── handshake.c
│ │ ├── handshake.h
│ │ ├── int_endianness.h
│ │ ├── int_lib.h
│ │ ├── int_math.h
│ │ ├── int_types.h
│ │ ├── int_util.c
│ │ ├── int_util.h
│ │ ├── listener.c
│ │ ├── lshrdi3.c
│ │ ├── macro_utils.h
│ │ ├── mcount.c
│ │ ├── moddi3.c
│ │ ├── modsi3.c
│ │ ├── mram.h
│ │ ├── mul32.c
│ │ ├── mul64.c
│ │ ├── muldc3.c
│ │ ├── muldf3.c
│ │ ├── mulodi4.c
│ │ ├── mulosi4.c
│ │ ├── mulsf3.c
│ │ ├── mulvdi3.c
│ │ ├── mulvsi3.c
│ │ ├── mutex.h
│ │ ├── negdf2.c
│ │ ├── negdi2.c
│ │ ├── negsf2.c
│ │ ├── negvdi2.c
│ │ ├── negvsi2.c
│ │ ├── paritydi2.c
│ │ ├── paritysi2.c
│ │ ├── perfcounter.c
│ │ ├── perfcounter.h
│ │ ├── popcountdi2.c
│ │ ├── popcountsi2.c
│ │ ├── powidf2.c
│ │ ├── powisf2.c
│ │ ├── profiling.c
│ │ ├── profiling.h
│ │ ├── profiling_internals.h
│ │ ├── sem.c
│ │ ├── sem.h
│ │ ├── seqread.h
│ │ ├── seqread.inc
│ │ ├── seqread1024.c
│ │ ├── seqread128.c
│ │ ├── seqread256.c
│ │ ├── seqread32.c
│ │ ├── seqread512.c
│ │ ├── seqread64.c
│ │ ├── soft_cache.c
│ │ ├── soft_cache.h
│ │ ├── subdf3.c
│ │ ├── subsf3.c
│ │ ├── subvdi3.c
│ │ ├── subvsi3.c
│ │ ├── sysdef.h
│ │ ├── truncdfhf2.c
│ │ ├── truncdfsf2.c
│ │ ├── truncsfhf2.c
│ │ ├── ucmpdi2.c
│ │ ├── udiv64.c
│ │ ├── udivdi3.c
│ │ ├── udivmodsi4.c
│ │ ├── udivsi3.c
│ │ ├── umoddi3.c
│ │ ├── umodsi3.c
│ │ └── waitqueue.c
│ └── src/
│ ├── abi/
│ │ ├── encoding/
│ │ │ ├── ascii_encoder.go
│ │ │ └── byte_stream.go
│ │ └── word/
│ │ ├── intermediate.go
│ │ └── word.go
│ ├── assembler/
│ │ ├── assemblable.go
│ │ ├── assembler.go
│ │ └── prim/
│ │ ├── bs.go
│ │ ├── gemv.go
│ │ ├── hst_l.go
│ │ ├── hst_s.go
│ │ ├── mlp.go
│ │ ├── red.go
│ │ ├── scan_rss.go
│ │ ├── scan_ssa.go
│ │ ├── sel.go
│ │ ├── trns.go
│ │ ├── ts.go
│ │ ├── uni.go
│ │ └── va.go
│ ├── compiler/
│ │ └── compiler.go
│ ├── core/
│ │ ├── job.go
│ │ └── thread_pool.go
│ ├── linker/
│ │ ├── analyze_liveness_job.go
│ │ ├── kernel/
│ │ │ ├── directive/
│ │ │ │ ├── ascii_directive.go
│ │ │ │ ├── asciz_directive.go
│ │ │ │ ├── byte_directive.go
│ │ │ │ ├── long_directive.go
│ │ │ │ ├── quad_directive.go
│ │ │ │ ├── short_directive.go
│ │ │ │ └── zero_directive.go
│ │ │ ├── encodable.go
│ │ │ ├── executable.go
│ │ │ ├── instruction/
│ │ │ │ ├── cc/
│ │ │ │ │ ├── acquire_cc.go
│ │ │ │ │ ├── add_nz_cc.go
│ │ │ │ │ ├── boot_cc.go
│ │ │ │ │ ├── cc.go
│ │ │ │ │ ├── const_cc_ge0.go
│ │ │ │ │ ├── const_cc_geu.go
│ │ │ │ │ ├── const_cc_zero.go
│ │ │ │ │ ├── count_nz_cc.go
│ │ │ │ │ ├── div_cc.go
│ │ │ │ │ ├── div_nz_cc.go
│ │ │ │ │ ├── ext_sub_set_cc.go
│ │ │ │ │ ├── false_cc.go
│ │ │ │ │ ├── imm_shift_nz_cc.go
│ │ │ │ │ ├── log_nz_cc.go
│ │ │ │ │ ├── log_set_cc.go
│ │ │ │ │ ├── mul_nz_cc.go
│ │ │ │ │ ├── no_cc.go
│ │ │ │ │ ├── release_cc.go
│ │ │ │ │ ├── shift_nz_cc.go
│ │ │ │ │ ├── sub_nz_cc.go
│ │ │ │ │ ├── sub_set_cc.go
│ │ │ │ │ ├── true_cc.go
│ │ │ │ │ └── true_false_cc.go
│ │ │ │ ├── endian.go
│ │ │ │ ├── exception.go
│ │ │ │ ├── flag.go
│ │ │ │ ├── instruction.go
│ │ │ │ ├── op_code.go
│ │ │ │ ├── reg_descriptor/
│ │ │ │ │ ├── gp_reg_descriptor.go
│ │ │ │ │ ├── pair_reg_descriptor.go
│ │ │ │ │ ├── sp_reg_descriptor.go
│ │ │ │ │ └── src_reg_descriptor.go
│ │ │ │ └── suffix.go
│ │ │ ├── kernel.go
│ │ │ ├── label.go
│ │ │ ├── liveness.go
│ │ │ ├── relocatable.go
│ │ │ └── section.go
│ │ ├── lex_job.go
│ │ ├── lexer/
│ │ │ ├── keyword_factory.go
│ │ │ ├── lexer.go
│ │ │ ├── regex.go
│ │ │ ├── regex_factory.go
│ │ │ ├── token.go
│ │ │ ├── token_stream.go
│ │ │ └── tokenizer.go
│ │ ├── linker.go
│ │ ├── logic/
│ │ │ ├── instruction_assigner.go
│ │ │ ├── label_assigner.go
│ │ │ ├── linker_constant.go
│ │ │ ├── linker_script.go
│ │ │ ├── liveness_analyzer.go
│ │ │ └── set_assigner.go
│ │ ├── parse_job.go
│ │ └── parser/
│ │ ├── ast.go
│ │ ├── expr/
│ │ │ ├── binary_add_expr.go
│ │ │ ├── binary_sub_expr.go
│ │ │ ├── ci_op_code_expr.go
│ │ │ ├── condition_expr.go
│ │ │ ├── ddci_op_code_expr.go
│ │ │ ├── dma_rri_op_code_expr.go
│ │ │ ├── drdici_op_code_expr.go
│ │ │ ├── endian_expr.go
│ │ │ ├── expr.go
│ │ │ ├── i_op_code_expr.go
│ │ │ ├── jump_op_code_expr.go
│ │ │ ├── load_op_code_expr.go
│ │ │ ├── negative_number_expr.go
│ │ │ ├── primary_expr.go
│ │ │ ├── program_counter_expr.go
│ │ │ ├── r_op_code_expr.go
│ │ │ ├── rici_op_code_expr.go
│ │ │ ├── rr_op_code_expr.go
│ │ │ ├── rri_op_code_expr.go
│ │ │ ├── rrri_op_code_expr.go
│ │ │ ├── section_name_expr.go
│ │ │ ├── section_type_expr.go
│ │ │ ├── src_reg_expr.go
│ │ │ ├── store_op_code_expr.go
│ │ │ ├── suffix_expr.go
│ │ │ └── symbol_type.go
│ │ ├── parser.go
│ │ ├── rule.go
│ │ ├── stack.go
│ │ ├── stack_item.go
│ │ ├── stmt/
│ │ │ ├── directive/
│ │ │ │ ├── addrsig_stmt.go
│ │ │ │ ├── addrsig_sym_stmt.go
│ │ │ │ ├── ascii_stmt.go
│ │ │ │ ├── asciz_stmt.go
│ │ │ │ ├── byte_stmt.go
│ │ │ │ ├── cfi_def_cfa_offset_stmt.go
│ │ │ │ ├── cfi_endproc.go
│ │ │ │ ├── cfi_offset_stmt.go
│ │ │ │ ├── cfi_sections_stmt.go
│ │ │ │ ├── cfi_startproc_stmt.go
│ │ │ │ ├── file_number_stmt.go
│ │ │ │ ├── file_string_stmt.go
│ │ │ │ ├── global_stmt.go
│ │ │ │ ├── loc_is_stmt_stmt.go
│ │ │ │ ├── loc_number_stmt.go
│ │ │ │ ├── loc_prologue_end_stmt.go
│ │ │ │ ├── long_program_counter.go
│ │ │ │ ├── long_section_name_stmt.go
│ │ │ │ ├── p2_align_stmt.go
│ │ │ │ ├── quad_stmt.go
│ │ │ │ ├── section_identifier_number_stmt.go
│ │ │ │ ├── section_identifier_stmt.go
│ │ │ │ ├── section_stack_sizes_stmt.go
│ │ │ │ ├── section_string_number_stmt.go
│ │ │ │ ├── section_string_stmt.go
│ │ │ │ ├── set_stmt.go
│ │ │ │ ├── short_stmt.go
│ │ │ │ ├── size_stmt.go
│ │ │ │ ├── text_stmt.go
│ │ │ │ ├── type_stmt.go
│ │ │ │ ├── weak_stmt.go
│ │ │ │ ├── zero_double_number_stmt.go
│ │ │ │ └── zero_single_number_stmt.go
│ │ │ ├── instruction/
│ │ │ │ ├── ci_stmt.go
│ │ │ │ ├── ddci_stmt.go
│ │ │ │ ├── dma_rri_stmt.go
│ │ │ │ ├── drdici_stmt.go
│ │ │ │ ├── edri_stmt.go
│ │ │ │ ├── erid_stmt.go
│ │ │ │ ├── erii_stmt.go
│ │ │ │ ├── erir_stmt.go
│ │ │ │ ├── erri_stmt.go
│ │ │ │ ├── i_stmt.go
│ │ │ │ ├── nop_stmt.go
│ │ │ │ ├── r_stmt.go
│ │ │ │ ├── rci_stmt.go
│ │ │ │ ├── rici_stmt.go
│ │ │ │ ├── rir_stmt.go
│ │ │ │ ├── rirc_stmt.go
│ │ │ │ ├── rirci_stmt.go
│ │ │ │ ├── rr_stmt.go
│ │ │ │ ├── rrc_stmt.go
│ │ │ │ ├── rrci_stmt.go
│ │ │ │ ├── rri_stmt.go
│ │ │ │ ├── rric_stmt.go
│ │ │ │ ├── rrici_stmt.go
│ │ │ │ ├── rrr_stmt.go
│ │ │ │ ├── rrrc_stmt.go
│ │ │ │ ├── rrrci_stmt.go
│ │ │ │ ├── rrri_stmt.go
│ │ │ │ ├── rrrici_stmt.go
│ │ │ │ ├── s_erri_stmt.go
│ │ │ │ ├── s_r_stmt.go
│ │ │ │ ├── s_rci_stmt.go
│ │ │ │ ├── s_rirc_stmt.go
│ │ │ │ ├── s_rirci_stmt.go
│ │ │ │ ├── s_rr_stmt.go
│ │ │ │ ├── s_rrc_stmt.go
│ │ │ │ ├── s_rrci_stmt.go
│ │ │ │ ├── s_rri_stmt.go
│ │ │ │ ├── s_rric_stmt.go
│ │ │ │ ├── s_rrici_stmt.go
│ │ │ │ ├── s_rrr_stmt.go
│ │ │ │ ├── s_rrrc_stmt.go
│ │ │ │ ├── s_rrrci_stmt.go
│ │ │ │ ├── s_rrri_stmt.go
│ │ │ │ └── s_rrrici_stmt.go
│ │ │ ├── label_stmt.go
│ │ │ ├── stmt.go
│ │ │ └── sugar/
│ │ │ ├── bkp_stmt.go
│ │ │ ├── boot_ri_stmt.go
│ │ │ ├── call_ri_stmt.go
│ │ │ ├── call_rr_stmt.go
│ │ │ ├── div_step_drdi_stmt.go
│ │ │ ├── jeq_rii_stmt.go
│ │ │ ├── jeq_rri_stmt.go
│ │ │ ├── jnz_ri_stmt.go
│ │ │ ├── jump_i_stmt.go
│ │ │ ├── jump_r_stmt.go
│ │ │ ├── lbs_rri_stmt.go
│ │ │ ├── lbs_s_rri_stmt.go
│ │ │ ├── ld_dri_stmt.go
│ │ │ ├── movd_dd_stmt.go
│ │ │ ├── move_ri_stmt.go
│ │ │ ├── move_rici_stmt.go
│ │ │ ├── move_s_ri_stmt.go
│ │ │ ├── move_s_rici_stmt.go
│ │ │ ├── sb_id_ri_stmt.go
│ │ │ ├── sb_id_rii_stmt.go
│ │ │ ├── sb_rir_stmt.go
│ │ │ ├── sd_rid_stmt.go
│ │ │ ├── stop_stmt.go
│ │ │ └── time_cfg_r_stmt.go
│ │ ├── table.go
│ │ └── walker.go
│ ├── main.go
│ ├── misc/
│ │ ├── command_line_option.go
│ │ ├── command_line_parser.go
│ │ ├── command_line_validator.go
│ │ ├── config_loader.go
│ │ ├── config_validator.go
│ │ ├── file_dumper.go
│ │ ├── file_scanner.go
│ │ └── stat_factory.go
│ └── simulator/
│ ├── channel/
│ │ ├── channel.go
│ │ ├── channel_message.go
│ │ └── channel_message_q.go
│ ├── cycle_job.go
│ ├── dpu/
│ │ ├── dpu.go
│ │ ├── dram/
│ │ │ ├── dma_command.go
│ │ │ ├── dma_command_q.go
│ │ │ ├── memory_command.go
│ │ │ ├── memory_command_q.go
│ │ │ ├── memory_controller.go
│ │ │ ├── memory_scheduler.go
│ │ │ ├── mram.go
│ │ │ ├── row_buffer.go
│ │ │ └── wordline.go
│ │ ├── logic/
│ │ │ ├── alu.go
│ │ │ ├── cycle_rule.go
│ │ │ ├── dma.go
│ │ │ ├── instruction_q.go
│ │ │ ├── logic.go
│ │ │ ├── operand_collector.go
│ │ │ ├── pipeline.go
│ │ │ ├── reg_set.go
│ │ │ ├── thread.go
│ │ │ ├── thread_q.go
│ │ │ └── thread_scheduler.go
│ │ ├── reg/
│ │ │ ├── condition_reg.go
│ │ │ ├── exception_reg.go
│ │ │ ├── flag_reg.go
│ │ │ ├── gp_reg.go
│ │ │ ├── pc_reg.go
│ │ │ ├── reg_file.go
│ │ │ └── sp_reg.go
│ │ └── sram/
│ │ ├── atomic.go
│ │ ├── iram.go
│ │ ├── lock.go
│ │ └── wram.go
│ ├── host/
│ │ ├── channel_transfer_read_job.go
│ │ ├── channel_transfer_write_job.go
│ │ ├── chunk.go
│ │ ├── cycle_job.go
│ │ ├── dma_transfer_to_atomic_job.go
│ │ ├── dma_transfer_to_iram_job.go
│ │ ├── dma_transfer_to_mram_job.go
│ │ ├── dma_transfer_to_wram_job.go
│ │ └── host.go
│ ├── rank/
│ │ └── rank.go
│ └── simulator.go
├── golang_vm/
│ ├── README.md
│ └── uPIMulator/
│ ├── benchmark/
│ │ ├── BS/
│ │ │ ├── CMakeLists.txt
│ │ │ ├── Makefile
│ │ │ ├── baselines/
│ │ │ │ ├── cpu/
│ │ │ │ │ ├── Makefile
│ │ │ │ │ ├── README
│ │ │ │ │ ├── bs_omp.c
│ │ │ │ │ └── timer.h
│ │ │ │ └── gpu/
│ │ │ │ ├── Makefile
│ │ │ │ ├── README
│ │ │ │ ├── binary_search.cu
│ │ │ │ ├── binary_search.h
│ │ │ │ ├── cpu_lib.py
│ │ │ │ ├── cu_lib_import.py
│ │ │ │ └── run.py
│ │ │ ├── dpu/
│ │ │ │ ├── CMakeLists.txt
│ │ │ │ └── task.c
│ │ │ ├── host/
│ │ │ │ └── app.c
│ │ │ └── support/
│ │ │ ├── common.h
│ │ │ ├── params.h
│ │ │ └── timer.h
│ │ ├── CMakeLists.txt
│ │ ├── GEMV/
│ │ │ ├── CMakeLists.txt
│ │ │ ├── Makefile
│ │ │ ├── _BL_10.conf
│ │ │ ├── _NR_TASKLETS_10_BL_10.conf
│ │ │ ├── _NR_TASKLETS_11_BL_10.conf
│ │ │ ├── _NR_TASKLETS_12_BL_10.conf
│ │ │ ├── _NR_TASKLETS_13_BL_10.conf
│ │ │ ├── _NR_TASKLETS_14_BL_10.conf
│ │ │ ├── _NR_TASKLETS_15_BL_10.conf
│ │ │ ├── _NR_TASKLETS_16
│ │ │ ├── _NR_TASKLETS_16_BL_10.conf
│ │ │ ├── _NR_TASKLETS_17_BL_10.conf
│ │ │ ├── _NR_TASKLETS_18_BL_10.conf
│ │ │ ├── _NR_TASKLETS_19_BL_10.conf
│ │ │ ├── _NR_TASKLETS_1_BL_10.conf
│ │ │ ├── _NR_TASKLETS_20_BL_10.conf
│ │ │ ├── _NR_TASKLETS_21_BL_10.conf
│ │ │ ├── _NR_TASKLETS_22_BL_10.conf
│ │ │ ├── _NR_TASKLETS_23_BL_10.conf
│ │ │ ├── _NR_TASKLETS_24_BL_10.conf
│ │ │ ├── _NR_TASKLETS_2_BL_10.conf
│ │ │ ├── _NR_TASKLETS_3_BL_10.conf
│ │ │ ├── _NR_TASKLETS_4_BL_10.conf
│ │ │ ├── _NR_TASKLETS_5_BL_10.conf
│ │ │ ├── _NR_TASKLETS_6_BL_10.conf
│ │ │ ├── _NR_TASKLETS_7_BL_10.conf
│ │ │ ├── _NR_TASKLETS_8_BL_10.conf
│ │ │ ├── _NR_TASKLETS_9_BL_10.conf
│ │ │ ├── baselines/
│ │ │ │ ├── cpu/
│ │ │ │ │ ├── Makefile
│ │ │ │ │ ├── README
│ │ │ │ │ ├── gemv_openmp.c
│ │ │ │ │ └── gemv_utils.h
│ │ │ │ └── gpu/
│ │ │ │ ├── Makefile
│ │ │ │ ├── README
│ │ │ │ └── gemv.cu
│ │ │ ├── dpu/
│ │ │ │ ├── CMakeLists.txt
│ │ │ │ └── task.c
│ │ │ ├── host/
│ │ │ │ └── app.c
│ │ │ └── support/
│ │ │ ├── common.h
│ │ │ ├── params.h
│ │ │ └── timer.h
│ │ ├── HST-L/
│ │ │ ├── CMakeLists.txt
│ │ │ ├── Makefile
│ │ │ ├── dpu/
│ │ │ │ ├── CMakeLists.txt
│ │ │ │ └── task.c
│ │ │ ├── host/
│ │ │ │ └── app.c
│ │ │ ├── input/
│ │ │ │ └── image_VanHateren.iml
│ │ │ ├── run.sh
│ │ │ └── support/
│ │ │ ├── common.h
│ │ │ ├── params.h
│ │ │ └── timer.h
│ │ ├── HST-S/
│ │ │ ├── CMakeLists.txt
│ │ │ ├── Makefile
│ │ │ ├── baselines/
│ │ │ │ ├── cpu/
│ │ │ │ │ ├── Makefile
│ │ │ │ │ ├── README
│ │ │ │ │ └── app_baseline.c
│ │ │ │ └── gpu/
│ │ │ │ ├── Makefile
│ │ │ │ ├── README
│ │ │ │ ├── input/
│ │ │ │ │ └── image_VanHateren.iml
│ │ │ │ ├── kernel.cpp
│ │ │ │ ├── kernel.cu
│ │ │ │ ├── kernel.h
│ │ │ │ ├── main.cpp
│ │ │ │ └── support/
│ │ │ │ ├── common.h
│ │ │ │ ├── cuda-setup.h
│ │ │ │ ├── partitioner.h
│ │ │ │ ├── timer.h
│ │ │ │ └── verify.h
│ │ │ ├── dpu/
│ │ │ │ ├── CMakeLists.txt
│ │ │ │ └── task.c
│ │ │ ├── host/
│ │ │ │ └── app.c
│ │ │ ├── input/
│ │ │ │ └── image_VanHateren.iml
│ │ │ ├── run.sh
│ │ │ └── support/
│ │ │ ├── common.h
│ │ │ ├── params.h
│ │ │ └── timer.h
│ │ ├── MLP/
│ │ │ ├── CMakeLists.txt
│ │ │ ├── Makefile
│ │ │ ├── _BL_10.conf
│ │ │ ├── _NR_TASKLETS_10_BL_10.conf
│ │ │ ├── _NR_TASKLETS_11_BL_10.conf
│ │ │ ├── _NR_TASKLETS_12_BL_10.conf
│ │ │ ├── _NR_TASKLETS_13_BL_10.conf
│ │ │ ├── _NR_TASKLETS_14_BL_10.conf
│ │ │ ├── _NR_TASKLETS_15_BL_10.conf
│ │ │ ├── _NR_TASKLETS_16
│ │ │ ├── _NR_TASKLETS_16_BL_10.conf
│ │ │ ├── _NR_TASKLETS_17_BL_10.conf
│ │ │ ├── _NR_TASKLETS_18_BL_10.conf
│ │ │ ├── _NR_TASKLETS_19_BL_10.conf
│ │ │ ├── _NR_TASKLETS_1_BL_10.conf
│ │ │ ├── _NR_TASKLETS_20_BL_10.conf
│ │ │ ├── _NR_TASKLETS_21_BL_10.conf
│ │ │ ├── _NR_TASKLETS_22_BL_10.conf
│ │ │ ├── _NR_TASKLETS_23_BL_10.conf
│ │ │ ├── _NR_TASKLETS_24_BL_10.conf
│ │ │ ├── _NR_TASKLETS_2_BL_10.conf
│ │ │ ├── _NR_TASKLETS_3_BL_10.conf
│ │ │ ├── _NR_TASKLETS_4_BL_10.conf
│ │ │ ├── _NR_TASKLETS_5_BL_10.conf
│ │ │ ├── _NR_TASKLETS_6_BL_10.conf
│ │ │ ├── _NR_TASKLETS_7_BL_10.conf
│ │ │ ├── _NR_TASKLETS_8_BL_10.conf
│ │ │ ├── _NR_TASKLETS_9_BL_10.conf
│ │ │ ├── baselines/
│ │ │ │ ├── cpu/
│ │ │ │ │ ├── Makefile
│ │ │ │ │ ├── README
│ │ │ │ │ └── mlp_openmp.c
│ │ │ │ └── gpu/
│ │ │ │ ├── Makefile
│ │ │ │ ├── README
│ │ │ │ └── mlp.cu
│ │ │ ├── dpu/
│ │ │ │ ├── CMakeLists.txt
│ │ │ │ └── task.c
│ │ │ ├── host/
│ │ │ │ └── app.c
│ │ │ └── support/
│ │ │ ├── common.h
│ │ │ ├── params.h
│ │ │ └── timer.h
│ │ ├── RED/
│ │ │ ├── CMakeLists.txt
│ │ │ ├── Makefile
│ │ │ ├── baselines/
│ │ │ │ ├── cpu/
│ │ │ │ │ ├── Makefile
│ │ │ │ │ ├── README
│ │ │ │ │ └── app_baseline.cpp
│ │ │ │ └── gpu/
│ │ │ │ ├── Makefile
│ │ │ │ ├── README
│ │ │ │ └── app_baseline.cu
│ │ │ ├── dpu/
│ │ │ │ ├── CMakeLists.txt
│ │ │ │ └── task.c
│ │ │ ├── host/
│ │ │ │ └── app.c
│ │ │ ├── run.sh
│ │ │ └── support/
│ │ │ ├── common.h
│ │ │ ├── cyclecount.h
│ │ │ ├── params.h
│ │ │ └── timer.h
│ │ ├── SCAN-RSS/
│ │ │ ├── CMakeLists.txt
│ │ │ ├── Makefile
│ │ │ ├── baselines/
│ │ │ │ ├── cpu/
│ │ │ │ │ ├── Makefile
│ │ │ │ │ ├── README
│ │ │ │ │ └── app_baseline.cpp
│ │ │ │ └── gpu/
│ │ │ │ ├── Makefile
│ │ │ │ ├── README
│ │ │ │ └── app_baseline.cu
│ │ │ ├── dpu/
│ │ │ │ ├── CMakeLists.txt
│ │ │ │ └── task.c
│ │ │ ├── host/
│ │ │ │ └── app.c
│ │ │ ├── run.sh
│ │ │ └── support/
│ │ │ ├── common.h
│ │ │ ├── params.h
│ │ │ └── timer.h
│ │ ├── SCAN-SSA/
│ │ │ ├── CMakeLists.txt
│ │ │ ├── Makefile
│ │ │ ├── dpu/
│ │ │ │ ├── CMakeLists.txt
│ │ │ │ └── task.c
│ │ │ ├── host/
│ │ │ │ └── app.c
│ │ │ ├── run.sh
│ │ │ └── support/
│ │ │ ├── common.h
│ │ │ ├── params.h
│ │ │ └── timer.h
│ │ ├── SEL/
│ │ │ ├── CMakeLists.txt
│ │ │ ├── Makefile
│ │ │ ├── baselines/
│ │ │ │ ├── cpu/
│ │ │ │ │ ├── Makefile
│ │ │ │ │ ├── README
│ │ │ │ │ └── app_baseline.c
│ │ │ │ └── gpu/
│ │ │ │ ├── Makefile
│ │ │ │ ├── README
│ │ │ │ ├── ds.h
│ │ │ │ ├── kernel.cu
│ │ │ │ └── select.cu
│ │ │ ├── dpu/
│ │ │ │ ├── CMakeLists.txt
│ │ │ │ └── task.c
│ │ │ ├── host/
│ │ │ │ └── app.c
│ │ │ └── support/
│ │ │ ├── common.h
│ │ │ ├── params.h
│ │ │ └── timer.h
│ │ ├── TRNS/
│ │ │ ├── CMakeLists.txt
│ │ │ ├── Makefile
│ │ │ ├── baselines/
│ │ │ │ ├── cpu/
│ │ │ │ │ ├── Makefile
│ │ │ │ │ ├── README
│ │ │ │ │ ├── kernel.cpp
│ │ │ │ │ ├── kernel.h
│ │ │ │ │ ├── main.cpp
│ │ │ │ │ └── support/
│ │ │ │ │ ├── common.h
│ │ │ │ │ ├── setup.h
│ │ │ │ │ ├── timer.h
│ │ │ │ │ └── verify.h
│ │ │ │ └── gpu/
│ │ │ │ ├── Makefile
│ │ │ │ ├── README
│ │ │ │ ├── kernel.cu
│ │ │ │ ├── kernel.h
│ │ │ │ ├── main.cpp
│ │ │ │ └── support/
│ │ │ │ ├── common.h
│ │ │ │ ├── cuda-setup.h
│ │ │ │ ├── timer.h
│ │ │ │ └── verify.h
│ │ │ ├── dpu/
│ │ │ │ ├── CMakeLists.txt
│ │ │ │ └── task.c
│ │ │ ├── host/
│ │ │ │ └── app.c
│ │ │ └── support/
│ │ │ ├── common.h
│ │ │ ├── params.h
│ │ │ └── timer.h
│ │ ├── TS/
│ │ │ ├── CMakeLists.txt
│ │ │ ├── Makefile
│ │ │ ├── dpu/
│ │ │ │ ├── CMakeLists.txt
│ │ │ │ └── task.c
│ │ │ ├── host/
│ │ │ │ └── app.c
│ │ │ └── support/
│ │ │ ├── common.h
│ │ │ ├── params.h
│ │ │ └── timer.h
│ │ ├── UNI/
│ │ │ ├── CMakeLists.txt
│ │ │ ├── Makefile
│ │ │ ├── baselines/
│ │ │ │ ├── cpu/
│ │ │ │ │ ├── Makefile
│ │ │ │ │ ├── README
│ │ │ │ │ └── app_baseline.c
│ │ │ │ └── gpu/
│ │ │ │ ├── Makefile
│ │ │ │ ├── README
│ │ │ │ ├── ds.h
│ │ │ │ ├── kernel.cu
│ │ │ │ └── unique.cu
│ │ │ ├── dpu/
│ │ │ │ ├── CMakeLists.txt
│ │ │ │ └── task.c
│ │ │ ├── host/
│ │ │ │ └── app.c
│ │ │ └── support/
│ │ │ ├── common.h
│ │ │ ├── params.h
│ │ │ └── timer.h
│ │ ├── VA/
│ │ │ ├── CMakeLists.txt
│ │ │ ├── Makefile
│ │ │ ├── baselines/
│ │ │ │ ├── cpu/
│ │ │ │ │ ├── Makefile
│ │ │ │ │ ├── README
│ │ │ │ │ └── app_baseline.c
│ │ │ │ └── gpu/
│ │ │ │ ├── Makefile
│ │ │ │ ├── README
│ │ │ │ └── vec_add.cu
│ │ │ ├── dpu/
│ │ │ │ ├── CMakeLists.txt
│ │ │ │ └── task.c
│ │ │ ├── host/
│ │ │ │ ├── CMakeLists.txt
│ │ │ │ └── app.c
│ │ │ └── support/
│ │ │ ├── common.h
│ │ │ ├── params.h
│ │ │ └── timer.h
│ │ ├── VA_SIMPLE/
│ │ │ ├── CMakeLists.txt
│ │ │ ├── dpu/
│ │ │ │ ├── CMakeLists.txt
│ │ │ │ └── task.c
│ │ │ └── host/
│ │ │ ├── CMakeLists.txt
│ │ │ └── app.c
│ │ └── build.py
│ ├── docker/
│ │ └── Dockerfile
│ ├── go.mod
│ ├── script/
│ │ ├── build.py
│ │ └── format.py
│ ├── sdk/
│ │ ├── CMakeLists.txt
│ │ ├── build.py
│ │ ├── misc/
│ │ │ ├── CMakeLists.txt
│ │ │ ├── accessMramFromDpu.c
│ │ │ ├── coreDump.c
│ │ │ ├── crt0.c
│ │ │ ├── dpu.lds
│ │ │ ├── internalStateReset.c
│ │ │ ├── linkerScript.lds
│ │ │ ├── restoreRegisters.c
│ │ │ └── restore_carry_and_zero_flag.h
│ │ ├── stdlib/
│ │ │ ├── CMakeLists.txt
│ │ │ ├── abort.c
│ │ │ ├── assert.h
│ │ │ ├── atoi.c
│ │ │ ├── atol.c
│ │ │ ├── ctype.h
│ │ │ ├── errno.c
│ │ │ ├── errno.h
│ │ │ ├── exit.c
│ │ │ ├── inttypes.h
│ │ │ ├── iso646.h
│ │ │ ├── limits.h
│ │ │ ├── memchr.c
│ │ │ ├── memcmp.c
│ │ │ ├── memcpy.c
│ │ │ ├── memmove.c
│ │ │ ├── memmram_utils.h
│ │ │ ├── memset.c
│ │ │ ├── stdalign.h
│ │ │ ├── stdarg.h
│ │ │ ├── stdbool.h
│ │ │ ├── stddef.h
│ │ │ ├── stdint.h
│ │ │ ├── stdio.c
│ │ │ ├── stdio.h
│ │ │ ├── stdlib.h
│ │ │ ├── stdnoreturn.h
│ │ │ ├── stpcpy.c
│ │ │ ├── stpncpy.c
│ │ │ ├── strcat.c
│ │ │ ├── strchr.c
│ │ │ ├── strcmp.c
│ │ │ ├── strcpy.c
│ │ │ ├── strcspn.c
│ │ │ ├── strdup.c
│ │ │ ├── strerror.c
│ │ │ ├── string.h
│ │ │ ├── strlen.c
│ │ │ ├── strlwr.c
│ │ │ ├── strncat.c
│ │ │ ├── strncmp.c
│ │ │ ├── strncpy.c
│ │ │ ├── strndup.c
│ │ │ ├── strnlen.c
│ │ │ ├── strpbrk.c
│ │ │ ├── strrchr.c
│ │ │ ├── strrev.c
│ │ │ ├── strsep.c
│ │ │ ├── strspn.c
│ │ │ ├── strstr.c
│ │ │ ├── strtok_r.c
│ │ │ ├── strtol.c
│ │ │ └── strupr.c
│ │ └── syslib/
│ │ ├── CMakeLists.txt
│ │ ├── absvdi2.c
│ │ ├── absvsi2.c
│ │ ├── adddf3.c
│ │ ├── addsf3.c
│ │ ├── addvdi3.c
│ │ ├── addvsi3.c
│ │ ├── alloc.c
│ │ ├── alloc.h
│ │ ├── ashldi3.c
│ │ ├── ashrdi3.c
│ │ ├── atomic_bit.h
│ │ ├── atomics.c
│ │ ├── attributes.h
│ │ ├── barrier.c
│ │ ├── barrier.h
│ │ ├── bswapdi2.c
│ │ ├── bswapsi2.c
│ │ ├── buddy_alloc.c
│ │ ├── buddy_alloc.h
│ │ ├── buddy_realloc.c
│ │ ├── built_ins.h
│ │ ├── clzdi2.c
│ │ ├── clzsi2.c
│ │ ├── cmpdi2.c
│ │ ├── comparedf2.c
│ │ ├── comparesf2.c
│ │ ├── ctzdi2.c
│ │ ├── ctzsi2.c
│ │ ├── defs.c
│ │ ├── defs.h
│ │ ├── devprivate.h
│ │ ├── div32.c
│ │ ├── divdf3.c
│ │ ├── divdi3.c
│ │ ├── divmodsi4.c
│ │ ├── divsf3.c
│ │ ├── divsi3.c
│ │ ├── dpuconst.h
│ │ ├── dpufault.h
│ │ ├── dpuruntime.h
│ │ ├── extendhfsf2.c
│ │ ├── extendsfdf2.c
│ │ ├── ffsdi2.c
│ │ ├── ffssi2.c
│ │ ├── ffsti2.c
│ │ ├── fixdfdi.c
│ │ ├── fixdfsi.c
│ │ ├── fixsfdi.c
│ │ ├── fixsfsi.c
│ │ ├── fixunsdfdi.c
│ │ ├── fixunsdfsi.c
│ │ ├── fixunssfdi.c
│ │ ├── fixunssfsi.c
│ │ ├── float.h
│ │ ├── floatdidf.c
│ │ ├── floatdisf.c
│ │ ├── floatsidf.c
│ │ ├── floatsisf.c
│ │ ├── floatundidf.c
│ │ ├── floatundisf.c
│ │ ├── floatunsidf.c
│ │ ├── floatunsisf.c
│ │ ├── fp_add_impl.inc
│ │ ├── fp_extend.h
│ │ ├── fp_extend_impl.inc
│ │ ├── fp_fixint_impl.inc
│ │ ├── fp_fixuint_impl.inc
│ │ ├── fp_lib.h
│ │ ├── fp_mul_impl.inc
│ │ ├── fp_trunc.h
│ │ ├── fp_trunc_impl.inc
│ │ ├── fsb_allocator.c
│ │ ├── fsb_allocator.h
│ │ ├── handshake.c
│ │ ├── handshake.h
│ │ ├── int_endianness.h
│ │ ├── int_lib.h
│ │ ├── int_math.h
│ │ ├── int_types.h
│ │ ├── int_util.c
│ │ ├── int_util.h
│ │ ├── listener.c
│ │ ├── lshrdi3.c
│ │ ├── macro_utils.h
│ │ ├── mcount.c
│ │ ├── moddi3.c
│ │ ├── modsi3.c
│ │ ├── mram.h
│ │ ├── mul32.c
│ │ ├── mul64.c
│ │ ├── muldc3.c
│ │ ├── muldf3.c
│ │ ├── mulodi4.c
│ │ ├── mulosi4.c
│ │ ├── mulsf3.c
│ │ ├── mulvdi3.c
│ │ ├── mulvsi3.c
│ │ ├── mutex.h
│ │ ├── negdf2.c
│ │ ├── negdi2.c
│ │ ├── negsf2.c
│ │ ├── negvdi2.c
│ │ ├── negvsi2.c
│ │ ├── paritydi2.c
│ │ ├── paritysi2.c
│ │ ├── perfcounter.c
│ │ ├── perfcounter.h
│ │ ├── popcountdi2.c
│ │ ├── popcountsi2.c
│ │ ├── powidf2.c
│ │ ├── powisf2.c
│ │ ├── profiling.c
│ │ ├── profiling.h
│ │ ├── profiling_internals.h
│ │ ├── sem.c
│ │ ├── sem.h
│ │ ├── seqread.h
│ │ ├── seqread.inc
│ │ ├── seqread1024.c
│ │ ├── seqread128.c
│ │ ├── seqread256.c
│ │ ├── seqread32.c
│ │ ├── seqread512.c
│ │ ├── seqread64.c
│ │ ├── soft_cache.c
│ │ ├── soft_cache.h
│ │ ├── subdf3.c
│ │ ├── subsf3.c
│ │ ├── subvdi3.c
│ │ ├── subvsi3.c
│ │ ├── sysdef.h
│ │ ├── truncdfhf2.c
│ │ ├── truncdfsf2.c
│ │ ├── truncsfhf2.c
│ │ ├── ucmpdi2.c
│ │ ├── udiv64.c
│ │ ├── udivdi3.c
│ │ ├── udivmodsi4.c
│ │ ├── udivsi3.c
│ │ ├── umoddi3.c
│ │ ├── umodsi3.c
│ │ └── waitqueue.c
│ └── src/
│ ├── device/
│ │ ├── abi/
│ │ │ ├── intermediate.go
│ │ │ └── word.go
│ │ ├── compiler/
│ │ │ └── compiler.go
│ │ ├── core/
│ │ │ ├── job.go
│ │ │ └── thread_pool.go
│ │ ├── linker/
│ │ │ ├── analyze_liveness_job.go
│ │ │ ├── kernel/
│ │ │ │ ├── directive/
│ │ │ │ │ ├── ascii_directive.go
│ │ │ │ │ ├── asciz_directive.go
│ │ │ │ │ ├── byte_directive.go
│ │ │ │ │ ├── long_directive.go
│ │ │ │ │ ├── quad_directive.go
│ │ │ │ │ ├── short_directive.go
│ │ │ │ │ └── zero_directive.go
│ │ │ │ ├── encodable.go
│ │ │ │ ├── executable.go
│ │ │ │ ├── instruction/
│ │ │ │ │ ├── cc/
│ │ │ │ │ │ ├── acquire_cc.go
│ │ │ │ │ │ ├── add_nz_cc.go
│ │ │ │ │ │ ├── boot_cc.go
│ │ │ │ │ │ ├── cc.go
│ │ │ │ │ │ ├── const_cc_ge0.go
│ │ │ │ │ │ ├── const_cc_geu.go
│ │ │ │ │ │ ├── const_cc_zero.go
│ │ │ │ │ │ ├── count_nz_cc.go
│ │ │ │ │ │ ├── div_cc.go
│ │ │ │ │ │ ├── div_nz_cc.go
│ │ │ │ │ │ ├── ext_sub_set_cc.go
│ │ │ │ │ │ ├── false_cc.go
│ │ │ │ │ │ ├── imm_shift_nz_cc.go
│ │ │ │ │ │ ├── log_nz_cc.go
│ │ │ │ │ │ ├── log_set_cc.go
│ │ │ │ │ │ ├── mul_nz_cc.go
│ │ │ │ │ │ ├── no_cc.go
│ │ │ │ │ │ ├── release_cc.go
│ │ │ │ │ │ ├── shift_nz_cc.go
│ │ │ │ │ │ ├── sub_nz_cc.go
│ │ │ │ │ │ ├── sub_set_cc.go
│ │ │ │ │ │ ├── true_cc.go
│ │ │ │ │ │ └── true_false_cc.go
│ │ │ │ │ ├── endian.go
│ │ │ │ │ ├── exception.go
│ │ │ │ │ ├── flag.go
│ │ │ │ │ ├── instruction.go
│ │ │ │ │ ├── op_code.go
│ │ │ │ │ ├── reg_descriptor/
│ │ │ │ │ │ ├── gp_reg_descriptor.go
│ │ │ │ │ │ ├── pair_reg_descriptor.go
│ │ │ │ │ │ ├── sp_reg_descriptor.go
│ │ │ │ │ │ └── src_reg_descriptor.go
│ │ │ │ │ └── suffix.go
│ │ │ │ ├── kernel.go
│ │ │ │ ├── label.go
│ │ │ │ ├── liveness.go
│ │ │ │ ├── relocatable.go
│ │ │ │ └── section.go
│ │ │ ├── lex_job.go
│ │ │ ├── lexer/
│ │ │ │ ├── keyword_factory.go
│ │ │ │ ├── lexer.go
│ │ │ │ ├── regex.go
│ │ │ │ ├── regex_factory.go
│ │ │ │ ├── token.go
│ │ │ │ ├── token_stream.go
│ │ │ │ └── tokenizer.go
│ │ │ ├── linker.go
│ │ │ ├── logic/
│ │ │ │ ├── instruction_assigner.go
│ │ │ │ ├── label_assigner.go
│ │ │ │ ├── linker_constant.go
│ │ │ │ ├── linker_script.go
│ │ │ │ ├── liveness_analyzer.go
│ │ │ │ └── set_assigner.go
│ │ │ ├── parse_job.go
│ │ │ └── parser/
│ │ │ ├── ast.go
│ │ │ ├── expr/
│ │ │ │ ├── binary_add_expr.go
│ │ │ │ ├── binary_sub_expr.go
│ │ │ │ ├── ci_op_code_expr.go
│ │ │ │ ├── condition_expr.go
│ │ │ │ ├── ddci_op_code_expr.go
│ │ │ │ ├── dma_rri_op_code_expr.go
│ │ │ │ ├── drdici_op_code_expr.go
│ │ │ │ ├── endian_expr.go
│ │ │ │ ├── expr.go
│ │ │ │ ├── i_op_code_expr.go
│ │ │ │ ├── jump_op_code_expr.go
│ │ │ │ ├── load_op_code_expr.go
│ │ │ │ ├── negative_number_expr.go
│ │ │ │ ├── primary_expr.go
│ │ │ │ ├── program_counter_expr.go
│ │ │ │ ├── r_op_code_expr.go
│ │ │ │ ├── rici_op_code_expr.go
│ │ │ │ ├── rr_op_code_expr.go
│ │ │ │ ├── rri_op_code_expr.go
│ │ │ │ ├── rrri_op_code_expr.go
│ │ │ │ ├── section_name_expr.go
│ │ │ │ ├── section_type_expr.go
│ │ │ │ ├── src_reg_expr.go
│ │ │ │ ├── store_op_code_expr.go
│ │ │ │ ├── suffix_expr.go
│ │ │ │ └── symbol_type.go
│ │ │ ├── parser.go
│ │ │ ├── rule.go
│ │ │ ├── stack.go
│ │ │ ├── stack_item.go
│ │ │ ├── stmt/
│ │ │ │ ├── directive/
│ │ │ │ │ ├── addrsig_stmt.go
│ │ │ │ │ ├── addrsig_sym_stmt.go
│ │ │ │ │ ├── ascii_stmt.go
│ │ │ │ │ ├── asciz_stmt.go
│ │ │ │ │ ├── byte_stmt.go
│ │ │ │ │ ├── cfi_def_cfa_offset_stmt.go
│ │ │ │ │ ├── cfi_endproc.go
│ │ │ │ │ ├── cfi_offset_stmt.go
│ │ │ │ │ ├── cfi_sections_stmt.go
│ │ │ │ │ ├── cfi_startproc_stmt.go
│ │ │ │ │ ├── file_number_stmt.go
│ │ │ │ │ ├── file_string_stmt.go
│ │ │ │ │ ├── global_stmt.go
│ │ │ │ │ ├── loc_is_stmt_stmt.go
│ │ │ │ │ ├── loc_number_stmt.go
│ │ │ │ │ ├── loc_prologue_end_stmt.go
│ │ │ │ │ ├── long_program_counter.go
│ │ │ │ │ ├── long_section_name_stmt.go
│ │ │ │ │ ├── p2_align_stmt.go
│ │ │ │ │ ├── quad_stmt.go
│ │ │ │ │ ├── section_identifier_number_stmt.go
│ │ │ │ │ ├── section_identifier_stmt.go
│ │ │ │ │ ├── section_stack_sizes_stmt.go
│ │ │ │ │ ├── section_string_number_stmt.go
│ │ │ │ │ ├── section_string_stmt.go
│ │ │ │ │ ├── set_stmt.go
│ │ │ │ │ ├── short_stmt.go
│ │ │ │ │ ├── size_stmt.go
│ │ │ │ │ ├── text_stmt.go
│ │ │ │ │ ├── type_stmt.go
│ │ │ │ │ ├── weak_stmt.go
│ │ │ │ │ ├── zero_double_number_stmt.go
│ │ │ │ │ └── zero_single_number_stmt.go
│ │ │ │ ├── instruction/
│ │ │ │ │ ├── ci_stmt.go
│ │ │ │ │ ├── ddci_stmt.go
│ │ │ │ │ ├── dma_rri_stmt.go
│ │ │ │ │ ├── drdici_stmt.go
│ │ │ │ │ ├── edri_stmt.go
│ │ │ │ │ ├── erid_stmt.go
│ │ │ │ │ ├── erii_stmt.go
│ │ │ │ │ ├── erir_stmt.go
│ │ │ │ │ ├── erri_stmt.go
│ │ │ │ │ ├── i_stmt.go
│ │ │ │ │ ├── nop_stmt.go
│ │ │ │ │ ├── r_stmt.go
│ │ │ │ │ ├── rci_stmt.go
│ │ │ │ │ ├── rici_stmt.go
│ │ │ │ │ ├── rir_stmt.go
│ │ │ │ │ ├── rirc_stmt.go
│ │ │ │ │ ├── rirci_stmt.go
│ │ │ │ │ ├── rr_stmt.go
│ │ │ │ │ ├── rrc_stmt.go
│ │ │ │ │ ├── rrci_stmt.go
│ │ │ │ │ ├── rri_stmt.go
│ │ │ │ │ ├── rric_stmt.go
│ │ │ │ │ ├── rrici_stmt.go
│ │ │ │ │ ├── rrr_stmt.go
│ │ │ │ │ ├── rrrc_stmt.go
│ │ │ │ │ ├── rrrci_stmt.go
│ │ │ │ │ ├── rrri_stmt.go
│ │ │ │ │ ├── rrrici_stmt.go
│ │ │ │ │ ├── s_erri_stmt.go
│ │ │ │ │ ├── s_r_stmt.go
│ │ │ │ │ ├── s_rci_stmt.go
│ │ │ │ │ ├── s_rirc_stmt.go
│ │ │ │ │ ├── s_rirci_stmt.go
│ │ │ │ │ ├── s_rr_stmt.go
│ │ │ │ │ ├── s_rrc_stmt.go
│ │ │ │ │ ├── s_rrci_stmt.go
│ │ │ │ │ ├── s_rri_stmt.go
│ │ │ │ │ ├── s_rric_stmt.go
│ │ │ │ │ ├── s_rrici_stmt.go
│ │ │ │ │ ├── s_rrr_stmt.go
│ │ │ │ │ ├── s_rrrc_stmt.go
│ │ │ │ │ ├── s_rrrci_stmt.go
│ │ │ │ │ ├── s_rrri_stmt.go
│ │ │ │ │ └── s_rrrici_stmt.go
│ │ │ │ ├── label_stmt.go
│ │ │ │ ├── stmt.go
│ │ │ │ └── sugar/
│ │ │ │ ├── bkp_stmt.go
│ │ │ │ ├── boot_ri_stmt.go
│ │ │ │ ├── call_ri_stmt.go
│ │ │ │ ├── call_rr_stmt.go
│ │ │ │ ├── div_step_drdi_stmt.go
│ │ │ │ ├── jeq_rii_stmt.go
│ │ │ │ ├── jeq_rri_stmt.go
│ │ │ │ ├── jnz_ri_stmt.go
│ │ │ │ ├── jump_i_stmt.go
│ │ │ │ ├── jump_r_stmt.go
│ │ │ │ ├── lbs_rri_stmt.go
│ │ │ │ ├── lbs_s_rri_stmt.go
│ │ │ │ ├── ld_dri_stmt.go
│ │ │ │ ├── movd_dd_stmt.go
│ │ │ │ ├── move_ri_stmt.go
│ │ │ │ ├── move_rici_stmt.go
│ │ │ │ ├── move_s_ri_stmt.go
│ │ │ │ ├── move_s_rici_stmt.go
│ │ │ │ ├── sb_id_ri_stmt.go
│ │ │ │ ├── sb_id_rii_stmt.go
│ │ │ │ ├── sb_rir_stmt.go
│ │ │ │ ├── sd_rid_stmt.go
│ │ │ │ ├── stop_stmt.go
│ │ │ │ └── time_cfg_r_stmt.go
│ │ │ ├── table.go
│ │ │ └── walker.go
│ │ └── simulator/
│ │ ├── channel/
│ │ │ ├── channel.go
│ │ │ ├── channel_command.go
│ │ │ └── channel_command_q.go
│ │ ├── dpu/
│ │ │ ├── control_interface.go
│ │ │ ├── dpu.go
│ │ │ ├── dram/
│ │ │ │ ├── dma_command.go
│ │ │ │ ├── dma_command_q.go
│ │ │ │ ├── memory_command.go
│ │ │ │ ├── memory_command_q.go
│ │ │ │ ├── memory_controller.go
│ │ │ │ ├── memory_scheduler.go
│ │ │ │ ├── mram.go
│ │ │ │ ├── row_buffer.go
│ │ │ │ └── wordline.go
│ │ │ ├── logic/
│ │ │ │ ├── alu.go
│ │ │ │ ├── cycle_rule.go
│ │ │ │ ├── dma.go
│ │ │ │ ├── instruction_q.go
│ │ │ │ ├── logic.go
│ │ │ │ ├── operand_collector.go
│ │ │ │ ├── pipeline.go
│ │ │ │ ├── reg_set.go
│ │ │ │ ├── thread.go
│ │ │ │ ├── thread_q.go
│ │ │ │ └── thread_scheduler.go
│ │ │ ├── reg/
│ │ │ │ ├── condition_reg.go
│ │ │ │ ├── exception_reg.go
│ │ │ │ ├── flag_reg.go
│ │ │ │ ├── gp_reg.go
│ │ │ │ ├── pc_reg.go
│ │ │ │ ├── reg_file.go
│ │ │ │ └── sp_reg.go
│ │ │ └── sram/
│ │ │ ├── atomic.go
│ │ │ ├── iram.go
│ │ │ ├── lock.go
│ │ │ └── wram.go
│ │ └── rank/
│ │ ├── rank.go
│ │ ├── rank_command.go
│ │ └── rank_command_q.go
│ ├── encoding/
│ │ ├── ascii_encoder.go
│ │ └── byte_stream.go
│ ├── host/
│ │ ├── abi/
│ │ │ ├── binary.go
│ │ │ ├── bytecode.go
│ │ │ ├── label.go
│ │ │ ├── op_code.go
│ │ │ └── relocatable.go
│ │ ├── interpreter/
│ │ │ ├── codegen/
│ │ │ │ ├── codegen.go
│ │ │ │ └── type_system/
│ │ │ │ ├── method.go
│ │ │ │ ├── symbol.go
│ │ │ │ └── type_system.go
│ │ │ ├── interpreter.go
│ │ │ ├── lexer/
│ │ │ │ ├── keyword_factory.go
│ │ │ │ ├── lexer.go
│ │ │ │ ├── regex.go
│ │ │ │ ├── regex_factory.go
│ │ │ │ ├── token.go
│ │ │ │ ├── token_stream.go
│ │ │ │ └── tokenizer.go
│ │ │ └── parser/
│ │ │ ├── ast.go
│ │ │ ├── decl/
│ │ │ │ ├── decl.go
│ │ │ │ ├── func_decl.go
│ │ │ │ ├── func_def.go
│ │ │ │ └── struct_def.go
│ │ │ ├── directive/
│ │ │ │ ├── define_directive.go
│ │ │ │ ├── directive.go
│ │ │ │ └── include_directive.go
│ │ │ ├── expr/
│ │ │ │ ├── additive_expr.go
│ │ │ │ ├── arg_list.go
│ │ │ │ ├── assignment_expr.go
│ │ │ │ ├── bitwise_and_expr.go
│ │ │ │ ├── bitwise_or_expr.go
│ │ │ │ ├── bitwise_xor_expr.go
│ │ │ │ ├── conditional_expr.go
│ │ │ │ ├── equality_expr.go
│ │ │ │ ├── expr.go
│ │ │ │ ├── logical_and_expr.go
│ │ │ │ ├── logical_or_expr.go
│ │ │ │ ├── multiplicative_expr.go
│ │ │ │ ├── postfix_expr.go
│ │ │ │ ├── primary_expr.go
│ │ │ │ ├── relational_expr.go
│ │ │ │ ├── shift_expr.go
│ │ │ │ └── unary_expr.go
│ │ │ ├── param_list/
│ │ │ │ ├── param.go
│ │ │ │ └── param_list.go
│ │ │ ├── parser.go
│ │ │ ├── rule.go
│ │ │ ├── stack.go
│ │ │ ├── stack_item.go
│ │ │ ├── stmt/
│ │ │ │ ├── block_stmt.go
│ │ │ │ ├── break_stmt.go
│ │ │ │ ├── continue_stmt.go
│ │ │ │ ├── dpu_foreach_stmt.go
│ │ │ │ ├── empty_stmt.go
│ │ │ │ ├── expr_stmt.go
│ │ │ │ ├── for_stmt.go
│ │ │ │ ├── if_stmt.go
│ │ │ │ ├── return_stmt.go
│ │ │ │ ├── stmt.go
│ │ │ │ ├── var_decl_init_stmt.go
│ │ │ │ ├── var_decl_stmt.go
│ │ │ │ └── while_stmt.go
│ │ │ ├── table.go
│ │ │ └── type_specifier/
│ │ │ └── type_specifier.go
│ │ └── vm/
│ │ ├── arena/
│ │ │ ├── arena.go
│ │ │ ├── garbage_collector.go
│ │ │ ├── memory.go
│ │ │ └── pool.go
│ │ ├── bank_cycle_job.go
│ │ ├── base/
│ │ │ └── object.go
│ │ ├── dpu_compute_cycle_job.go
│ │ ├── dpu_cycle_job.go
│ │ ├── dpu_load_job.go
│ │ ├── dram/
│ │ │ ├── bank/
│ │ │ │ ├── array.go
│ │ │ │ ├── bank.go
│ │ │ │ ├── dma_command.go
│ │ │ │ ├── dma_command_q.go
│ │ │ │ ├── memory_command.go
│ │ │ │ ├── memory_command_q.go
│ │ │ │ ├── row_buffer.go
│ │ │ │ ├── segment.go
│ │ │ │ ├── transfer_command.go
│ │ │ │ ├── transfer_command_q.go
│ │ │ │ └── wordline.go
│ │ │ ├── channel/
│ │ │ │ ├── channel.go
│ │ │ │ ├── channel_command.go
│ │ │ │ └── channel_command_q.go
│ │ │ ├── memory_controller.go
│ │ │ ├── memory_mapping.go
│ │ │ ├── memory_scheduler.go
│ │ │ └── rank/
│ │ │ ├── rank.go
│ │ │ ├── rank_command.go
│ │ │ └── rank_command_q.go
│ │ ├── frame/
│ │ │ ├── frame.go
│ │ │ └── frame_chain.go
│ │ ├── pc/
│ │ │ └── pc.go
│ │ ├── stack/
│ │ │ ├── return_stack.go
│ │ │ ├── stack.go
│ │ │ └── stack_item.go
│ │ ├── symbol/
│ │ │ ├── scope.go
│ │ │ ├── scope_chain.go
│ │ │ └── symbol.go
│ │ ├── type_system/
│ │ │ ├── field.go
│ │ │ ├── registry.go
│ │ │ ├── skeleton.go
│ │ │ └── type_variable.go
│ │ └── virtual_machine.go
│ ├── main.go
│ ├── misc/
│ │ ├── command_line_option.go
│ │ ├── command_line_parser.go
│ │ ├── command_line_validator.go
│ │ ├── config_loader.go
│ │ ├── config_validator.go
│ │ ├── file_dumper.go
│ │ ├── file_scanner.go
│ │ └── stat_factory.go
│ ├── program/
│ │ ├── app.go
│ │ └── task.go
│ └── system/
│ └── system.go
├── python_cpp/
│ ├── README.md
│ ├── uPIMulator_backend/
│ │ ├── CMakeLists.txt
│ │ ├── script/
│ │ │ ├── build.sh
│ │ │ ├── format.sh
│ │ │ ├── run.sh
│ │ │ └── run_serial.sh
│ │ └── src/
│ │ ├── CMakeLists.txt
│ │ ├── abi/
│ │ │ ├── cc/
│ │ │ │ ├── _base_cc.cc
│ │ │ │ ├── _base_cc.h
│ │ │ │ ├── acquire_cc.h
│ │ │ │ ├── add_nz_cc.h
│ │ │ │ ├── boot_cc.h
│ │ │ │ ├── const_cc_ge0.h
│ │ │ │ ├── const_cc_geu.h
│ │ │ │ ├── const_cc_zero.h
│ │ │ │ ├── count_nz_cc.h
│ │ │ │ ├── div_cc.h
│ │ │ │ ├── div_nz_cc.h
│ │ │ │ ├── ext_sub_set_cc.h
│ │ │ │ ├── false_cc.h
│ │ │ │ ├── imm_shift_nz_cc.h
│ │ │ │ ├── log_nz_cc.h
│ │ │ │ ├── log_set_cc.h
│ │ │ │ ├── mul_nz_cc.h
│ │ │ │ ├── no_cc.h
│ │ │ │ ├── release_cc.h
│ │ │ │ ├── shift_nz_cc.h
│ │ │ │ ├── sub_nz_cc.h
│ │ │ │ ├── sub_set_cc.h
│ │ │ │ ├── true_cc.h
│ │ │ │ └── true_false_cc.h
│ │ │ ├── instruction/
│ │ │ │ ├── instruction.cc
│ │ │ │ ├── instruction.h
│ │ │ │ ├── op_code.h
│ │ │ │ └── suffix.h
│ │ │ ├── isa/
│ │ │ │ ├── condition.h
│ │ │ │ ├── endian.h
│ │ │ │ ├── exception.h
│ │ │ │ └── flag.h
│ │ │ ├── reg/
│ │ │ │ ├── gp_reg.h
│ │ │ │ ├── pair_reg.cc
│ │ │ │ ├── pair_reg.h
│ │ │ │ ├── sp_reg.h
│ │ │ │ ├── src_reg.cc
│ │ │ │ └── src_reg.h
│ │ │ └── word/
│ │ │ ├── _base_word.cc
│ │ │ ├── _base_word.h
│ │ │ ├── data_address_word.h
│ │ │ ├── data_word.h
│ │ │ ├── immediate.h
│ │ │ ├── instruction_address_word.h
│ │ │ ├── instruction_word.h
│ │ │ └── representation.h
│ │ ├── converter/
│ │ │ ├── condition_converter.cc
│ │ │ ├── condition_converter.h
│ │ │ ├── endian_converter.cc
│ │ │ ├── endian_converter.h
│ │ │ ├── flag_converter.cc
│ │ │ ├── flag_converter.h
│ │ │ ├── instruction_converter.cc
│ │ │ ├── instruction_converter.h
│ │ │ ├── op_code_converter.cc
│ │ │ ├── op_code_converter.h
│ │ │ ├── reg_converter.cc
│ │ │ ├── reg_converter.h
│ │ │ ├── reg_file_converter.cc
│ │ │ ├── reg_file_converter.h
│ │ │ ├── suffix_converter.cc
│ │ │ └── suffix_converter.h
│ │ ├── encoder/
│ │ │ ├── byte.h
│ │ │ ├── byte_stream.cc
│ │ │ ├── byte_stream.h
│ │ │ ├── instruction_encoder.cc
│ │ │ └── instruction_encoder.h
│ │ ├── initializer/
│ │ │ ├── int_initializer.cc
│ │ │ ├── int_initializer.h
│ │ │ ├── str_initializer.h
│ │ │ └── str_initialzier.cc
│ │ ├── main.cc
│ │ ├── main.h
│ │ ├── simulator/
│ │ │ ├── basic/
│ │ │ │ ├── queue.h
│ │ │ │ └── timer_queue.h
│ │ │ ├── cpu/
│ │ │ │ ├── cpu.cc
│ │ │ │ ├── cpu.h
│ │ │ │ ├── fini_thread.cc
│ │ │ │ ├── fini_thread.h
│ │ │ │ ├── init_thread.cc
│ │ │ │ ├── init_thread.h
│ │ │ │ ├── sched_thread.cc
│ │ │ │ ├── sched_thread.h
│ │ │ │ ├── thread.cc
│ │ │ │ └── thread.h
│ │ │ ├── dpu/
│ │ │ │ ├── alu.cc
│ │ │ │ ├── alu.h
│ │ │ │ ├── cycle_rule.cc
│ │ │ │ ├── cycle_rule.h
│ │ │ │ ├── dma.cc
│ │ │ │ ├── dma.h
│ │ │ │ ├── dma_command.cc
│ │ │ │ ├── dma_command.h
│ │ │ │ ├── dpu.cc
│ │ │ │ ├── dpu.h
│ │ │ │ ├── logic.cc
│ │ │ │ ├── logic.h
│ │ │ │ ├── operand_collector.cc
│ │ │ │ ├── operand_collector.h
│ │ │ │ ├── pipeline.cc
│ │ │ │ ├── pipeline.h
│ │ │ │ ├── revolver_scheduler.cc
│ │ │ │ ├── revolver_scheduler.h
│ │ │ │ ├── thread.cc
│ │ │ │ └── thread.h
│ │ │ ├── dram/
│ │ │ │ ├── fifo_scheduler.cc
│ │ │ │ ├── fifo_scheduler.h
│ │ │ │ ├── frfcfs_scheduler.cc
│ │ │ │ ├── frfcfs_scheduler.h
│ │ │ │ ├── memory_command.cc
│ │ │ │ ├── memory_command.h
│ │ │ │ ├── memory_controller.cc
│ │ │ │ ├── memory_controller.h
│ │ │ │ ├── mram.cc
│ │ │ │ ├── mram.h
│ │ │ │ ├── row_buffer.cc
│ │ │ │ ├── row_buffer.h
│ │ │ │ ├── scheduler.cc
│ │ │ │ ├── scheduler.h
│ │ │ │ ├── wordline.cc
│ │ │ │ └── wordline.h
│ │ │ ├── rank/
│ │ │ │ ├── rank.cc
│ │ │ │ ├── rank.h
│ │ │ │ └── rank_message.h
│ │ │ ├── reg/
│ │ │ │ ├── condition_reg.cc
│ │ │ │ ├── condition_reg.h
│ │ │ │ ├── exception_reg.h
│ │ │ │ ├── flag_reg.h
│ │ │ │ ├── gp_reg.cc
│ │ │ │ ├── gp_reg.h
│ │ │ │ ├── pc_reg.h
│ │ │ │ ├── reg_file.cc
│ │ │ │ ├── reg_file.h
│ │ │ │ ├── sp_reg.cc
│ │ │ │ └── sp_reg.h
│ │ │ ├── sram/
│ │ │ │ ├── atomic.cc
│ │ │ │ ├── atomic.h
│ │ │ │ ├── iram.cc
│ │ │ │ ├── iram.h
│ │ │ │ ├── lock.cc
│ │ │ │ ├── lock.h
│ │ │ │ ├── wram.cc
│ │ │ │ └── wram.h
│ │ │ ├── system.cc
│ │ │ └── system.h
│ │ └── util/
│ │ ├── argument_parser.cc
│ │ ├── argument_parser.h
│ │ ├── config_loader.h
│ │ ├── stat_factory.cc
│ │ └── stat_factory.h
│ └── uPIMulator_frontend/
│ ├── .flake8
│ ├── .hadolint.yaml
│ ├── .isort.cfg
│ ├── .markdownlint.yaml
│ ├── .shellcheckrc
│ ├── benchmark/
│ │ ├── Arithmetic-Throughput/
│ │ │ ├── Makefile
│ │ │ ├── dpu/
│ │ │ │ └── task.c
│ │ │ ├── host/
│ │ │ │ └── app.c
│ │ │ ├── run.sh
│ │ │ └── support/
│ │ │ ├── common.h
│ │ │ ├── cyclecount.h
│ │ │ ├── params.h
│ │ │ └── timer.h
│ │ ├── BFS/
│ │ │ ├── Makefile
│ │ │ ├── baselines/
│ │ │ │ ├── cpu/
│ │ │ │ │ ├── Makefile
│ │ │ │ │ ├── README
│ │ │ │ │ └── app.c
│ │ │ │ └── gpu/
│ │ │ │ ├── Makefile
│ │ │ │ ├── README
│ │ │ │ └── app.cu
│ │ │ ├── data/
│ │ │ │ └── loc-gowalla_edges.txt
│ │ │ ├── dpu/
│ │ │ │ ├── dpu-utils.h
│ │ │ │ └── task.c
│ │ │ ├── host/
│ │ │ │ ├── app.c
│ │ │ │ └── mram-management.h
│ │ │ └── support/
│ │ │ ├── common.h
│ │ │ ├── graph.h
│ │ │ ├── params.h
│ │ │ ├── timer.h
│ │ │ └── utils.h
│ │ ├── BS/
│ │ │ ├── Makefile
│ │ │ ├── baselines/
│ │ │ │ ├── cpu/
│ │ │ │ │ ├── Makefile
│ │ │ │ │ ├── README
│ │ │ │ │ ├── bs_omp.c
│ │ │ │ │ └── timer.h
│ │ │ │ └── gpu/
│ │ │ │ ├── Makefile
│ │ │ │ ├── README
│ │ │ │ ├── binary_search.cu
│ │ │ │ ├── binary_search.h
│ │ │ │ ├── cpu_lib.py
│ │ │ │ ├── cu_lib_import.py
│ │ │ │ └── run.py
│ │ │ ├── dpu/
│ │ │ │ └── task.c
│ │ │ ├── host/
│ │ │ │ └── app.c
│ │ │ └── support/
│ │ │ ├── common.h
│ │ │ ├── params.h
│ │ │ └── timer.h
│ │ ├── CPU-DPU/
│ │ │ ├── Makefile
│ │ │ ├── dpu/
│ │ │ │ └── task.c
│ │ │ ├── host/
│ │ │ │ └── app.c
│ │ │ ├── run.sh
│ │ │ └── support/
│ │ │ ├── common.h
│ │ │ ├── params.h
│ │ │ └── timer.h
│ │ ├── GEMV/
│ │ │ ├── Makefile
│ │ │ ├── _BL_10.conf
│ │ │ ├── _NR_TASKLETS_10_BL_10.conf
│ │ │ ├── _NR_TASKLETS_11_BL_10.conf
│ │ │ ├── _NR_TASKLETS_12_BL_10.conf
│ │ │ ├── _NR_TASKLETS_13_BL_10.conf
│ │ │ ├── _NR_TASKLETS_14_BL_10.conf
│ │ │ ├── _NR_TASKLETS_15_BL_10.conf
│ │ │ ├── _NR_TASKLETS_16
│ │ │ ├── _NR_TASKLETS_16_BL_10.conf
│ │ │ ├── _NR_TASKLETS_17_BL_10.conf
│ │ │ ├── _NR_TASKLETS_18_BL_10.conf
│ │ │ ├── _NR_TASKLETS_19_BL_10.conf
│ │ │ ├── _NR_TASKLETS_1_BL_10.conf
│ │ │ ├── _NR_TASKLETS_20_BL_10.conf
│ │ │ ├── _NR_TASKLETS_21_BL_10.conf
│ │ │ ├── _NR_TASKLETS_22_BL_10.conf
│ │ │ ├── _NR_TASKLETS_23_BL_10.conf
│ │ │ ├── _NR_TASKLETS_24_BL_10.conf
│ │ │ ├── _NR_TASKLETS_2_BL_10.conf
│ │ │ ├── _NR_TASKLETS_3_BL_10.conf
│ │ │ ├── _NR_TASKLETS_4_BL_10.conf
│ │ │ ├── _NR_TASKLETS_5_BL_10.conf
│ │ │ ├── _NR_TASKLETS_6_BL_10.conf
│ │ │ ├── _NR_TASKLETS_7_BL_10.conf
│ │ │ ├── _NR_TASKLETS_8_BL_10.conf
│ │ │ ├── _NR_TASKLETS_9_BL_10.conf
│ │ │ ├── baselines/
│ │ │ │ ├── cpu/
│ │ │ │ │ ├── Makefile
│ │ │ │ │ ├── README
│ │ │ │ │ ├── gemv_openmp.c
│ │ │ │ │ └── gemv_utils.h
│ │ │ │ └── gpu/
│ │ │ │ ├── Makefile
│ │ │ │ ├── README
│ │ │ │ └── gemv.cu
│ │ │ ├── dpu/
│ │ │ │ └── task.c
│ │ │ ├── host/
│ │ │ │ └── app.c
│ │ │ └── support/
│ │ │ ├── common.h
│ │ │ ├── params.h
│ │ │ └── timer.h
│ │ ├── HST-L/
│ │ │ ├── Makefile
│ │ │ ├── dpu/
│ │ │ │ └── task.c
│ │ │ ├── host/
│ │ │ │ └── app.c
│ │ │ ├── input/
│ │ │ │ └── image_VanHateren.iml
│ │ │ ├── run.sh
│ │ │ └── support/
│ │ │ ├── common.h
│ │ │ ├── params.h
│ │ │ └── timer.h
│ │ ├── HST-S/
│ │ │ ├── Makefile
│ │ │ ├── baselines/
│ │ │ │ ├── cpu/
│ │ │ │ │ ├── Makefile
│ │ │ │ │ ├── README
│ │ │ │ │ └── app_baseline.c
│ │ │ │ └── gpu/
│ │ │ │ ├── Makefile
│ │ │ │ ├── README
│ │ │ │ ├── input/
│ │ │ │ │ └── image_VanHateren.iml
│ │ │ │ ├── kernel.cpp
│ │ │ │ ├── kernel.cu
│ │ │ │ ├── kernel.h
│ │ │ │ ├── main.cpp
│ │ │ │ └── support/
│ │ │ │ ├── common.h
│ │ │ │ ├── cuda-setup.h
│ │ │ │ ├── partitioner.h
│ │ │ │ ├── timer.h
│ │ │ │ └── verify.h
│ │ │ ├── dpu/
│ │ │ │ └── task.c
│ │ │ ├── host/
│ │ │ │ └── app.c
│ │ │ ├── input/
│ │ │ │ └── image_VanHateren.iml
│ │ │ ├── run.sh
│ │ │ └── support/
│ │ │ ├── common.h
│ │ │ ├── params.h
│ │ │ └── timer.h
│ │ ├── MLP/
│ │ │ ├── Makefile
│ │ │ ├── _BL_10.conf
│ │ │ ├── _NR_TASKLETS_10_BL_10.conf
│ │ │ ├── _NR_TASKLETS_11_BL_10.conf
│ │ │ ├── _NR_TASKLETS_12_BL_10.conf
│ │ │ ├── _NR_TASKLETS_13_BL_10.conf
│ │ │ ├── _NR_TASKLETS_14_BL_10.conf
│ │ │ ├── _NR_TASKLETS_15_BL_10.conf
│ │ │ ├── _NR_TASKLETS_16
│ │ │ ├── _NR_TASKLETS_16_BL_10.conf
│ │ │ ├── _NR_TASKLETS_17_BL_10.conf
│ │ │ ├── _NR_TASKLETS_18_BL_10.conf
│ │ │ ├── _NR_TASKLETS_19_BL_10.conf
│ │ │ ├── _NR_TASKLETS_1_BL_10.conf
│ │ │ ├── _NR_TASKLETS_20_BL_10.conf
│ │ │ ├── _NR_TASKLETS_21_BL_10.conf
│ │ │ ├── _NR_TASKLETS_22_BL_10.conf
│ │ │ ├── _NR_TASKLETS_23_BL_10.conf
│ │ │ ├── _NR_TASKLETS_24_BL_10.conf
│ │ │ ├── _NR_TASKLETS_2_BL_10.conf
│ │ │ ├── _NR_TASKLETS_3_BL_10.conf
│ │ │ ├── _NR_TASKLETS_4_BL_10.conf
│ │ │ ├── _NR_TASKLETS_5_BL_10.conf
│ │ │ ├── _NR_TASKLETS_6_BL_10.conf
│ │ │ ├── _NR_TASKLETS_7_BL_10.conf
│ │ │ ├── _NR_TASKLETS_8_BL_10.conf
│ │ │ ├── _NR_TASKLETS_9_BL_10.conf
│ │ │ ├── baselines/
│ │ │ │ ├── cpu/
│ │ │ │ │ ├── Makefile
│ │ │ │ │ ├── README
│ │ │ │ │ └── mlp_openmp.c
│ │ │ │ └── gpu/
│ │ │ │ ├── Makefile
│ │ │ │ ├── README
│ │ │ │ └── mlp.cu
│ │ │ ├── dpu/
│ │ │ │ └── task.c
│ │ │ ├── host/
│ │ │ │ └── app.c
│ │ │ └── support/
│ │ │ ├── common.h
│ │ │ ├── params.h
│ │ │ └── timer.h
│ │ ├── MRAM-Latency/
│ │ │ ├── Makefile
│ │ │ ├── dpu/
│ │ │ │ └── copy.c
│ │ │ ├── host/
│ │ │ │ └── app.c
│ │ │ ├── run.sh
│ │ │ └── support/
│ │ │ ├── common.h
│ │ │ ├── cyclecount.h
│ │ │ ├── params.h
│ │ │ └── timer.h
│ │ ├── NW/
│ │ │ ├── .conf
│ │ │ ├── Makefile
│ │ │ ├── _NR_TASKLETS_10_BL_1024
│ │ │ ├── _NR_TASKLETS_11_BL_1024
│ │ │ ├── _NR_TASKLETS_12_BL_1024
│ │ │ ├── _NR_TASKLETS_13_BL_1024
│ │ │ ├── _NR_TASKLETS_14_BL_1024
│ │ │ ├── _NR_TASKLETS_15_BL_1024
│ │ │ ├── _NR_TASKLETS_16_BL_1024
│ │ │ ├── _NR_TASKLETS_17_BL_1024
│ │ │ ├── _NR_TASKLETS_18_BL_1024
│ │ │ ├── _NR_TASKLETS_19_BL_1024
│ │ │ ├── _NR_TASKLETS_1_BL_1024
│ │ │ ├── _NR_TASKLETS_20_BL_1024
│ │ │ ├── _NR_TASKLETS_21_BL_1024
│ │ │ ├── _NR_TASKLETS_22_BL_1024
│ │ │ ├── _NR_TASKLETS_23_BL_1024
│ │ │ ├── _NR_TASKLETS_24_BL_1024
│ │ │ ├── _NR_TASKLETS_2_BL_1024
│ │ │ ├── _NR_TASKLETS_3_BL_1024
│ │ │ ├── _NR_TASKLETS_4_BL_1024
│ │ │ ├── _NR_TASKLETS_5_BL_1024
│ │ │ ├── _NR_TASKLETS_6_BL_1024
│ │ │ ├── _NR_TASKLETS_7_BL_1024
│ │ │ ├── _NR_TASKLETS_8_BL_1024
│ │ │ ├── _NR_TASKLETS_9_BL_1024
│ │ │ ├── baselines/
│ │ │ │ ├── cpu/
│ │ │ │ │ ├── Makefile
│ │ │ │ │ ├── README
│ │ │ │ │ ├── needle.cpp
│ │ │ │ │ ├── run
│ │ │ │ │ └── run_offload
│ │ │ │ └── gpu/
│ │ │ │ ├── Makefile
│ │ │ │ ├── Makefile_nvidia
│ │ │ │ ├── README
│ │ │ │ ├── common/
│ │ │ │ │ ├── common.mk
│ │ │ │ │ └── make.config
│ │ │ │ ├── needle.cu
│ │ │ │ ├── needle.h
│ │ │ │ ├── needle_kernel.cu
│ │ │ │ ├── run
│ │ │ │ └── timing.h
│ │ │ ├── dpu/
│ │ │ │ └── task.c
│ │ │ ├── host/
│ │ │ │ └── app.c
│ │ │ └── support/
│ │ │ ├── common.h
│ │ │ ├── params.h
│ │ │ └── timer.h
│ │ ├── Operational-Intensity/
│ │ │ ├── Makefile
│ │ │ ├── dpu/
│ │ │ │ └── task.c
│ │ │ ├── host/
│ │ │ │ └── app.c
│ │ │ ├── run.sh
│ │ │ └── support/
│ │ │ ├── common.h
│ │ │ ├── cyclecount.h
│ │ │ ├── params.h
│ │ │ └── timer.h
│ │ ├── RED/
│ │ │ ├── Makefile
│ │ │ ├── baselines/
│ │ │ │ ├── cpu/
│ │ │ │ │ ├── Makefile
│ │ │ │ │ ├── README
│ │ │ │ │ └── app_baseline.cpp
│ │ │ │ └── gpu/
│ │ │ │ ├── Makefile
│ │ │ │ ├── README
│ │ │ │ └── app_baseline.cu
│ │ │ ├── dpu/
│ │ │ │ └── task.c
│ │ │ ├── host/
│ │ │ │ └── app.c
│ │ │ ├── run.sh
│ │ │ └── support/
│ │ │ ├── common.h
│ │ │ ├── cyclecount.h
│ │ │ ├── params.h
│ │ │ └── timer.h
│ │ ├── Random-GUPS/
│ │ │ ├── Makefile
│ │ │ ├── dpu/
│ │ │ │ └── gups.c
│ │ │ ├── host/
│ │ │ │ └── app.c
│ │ │ ├── run.sh
│ │ │ └── support/
│ │ │ ├── common.h
│ │ │ ├── cyclecount.h
│ │ │ ├── params.h
│ │ │ └── timer.h
│ │ ├── SCAN-RSS/
│ │ │ ├── Makefile
│ │ │ ├── baselines/
│ │ │ │ ├── cpu/
│ │ │ │ │ ├── Makefile
│ │ │ │ │ ├── README
│ │ │ │ │ └── app_baseline.cpp
│ │ │ │ └── gpu/
│ │ │ │ ├── Makefile
│ │ │ │ ├── README
│ │ │ │ └── app_baseline.cu
│ │ │ ├── dpu/
│ │ │ │ └── task.c
│ │ │ ├── host/
│ │ │ │ └── app.c
│ │ │ ├── run.sh
│ │ │ └── support/
│ │ │ ├── common.h
│ │ │ ├── params.h
│ │ │ └── timer.h
│ │ ├── SCAN-SSA/
│ │ │ ├── Makefile
│ │ │ ├── dpu/
│ │ │ │ └── task.c
│ │ │ ├── host/
│ │ │ │ └── app.c
│ │ │ ├── run.sh
│ │ │ └── support/
│ │ │ ├── common.h
│ │ │ ├── params.h
│ │ │ └── timer.h
│ │ ├── SEL/
│ │ │ ├── Makefile
│ │ │ ├── baselines/
│ │ │ │ ├── cpu/
│ │ │ │ │ ├── Makefile
│ │ │ │ │ ├── README
│ │ │ │ │ └── app_baseline.c
│ │ │ │ └── gpu/
│ │ │ │ ├── Makefile
│ │ │ │ ├── README
│ │ │ │ ├── ds.h
│ │ │ │ ├── kernel.cu
│ │ │ │ └── select.cu
│ │ │ ├── dpu/
│ │ │ │ └── task.c
│ │ │ ├── host/
│ │ │ │ └── app.c
│ │ │ └── support/
│ │ │ ├── common.h
│ │ │ ├── params.h
│ │ │ └── timer.h
│ │ ├── STREAM/
│ │ │ ├── Makefile
│ │ │ ├── dpu/
│ │ │ │ ├── add.c
│ │ │ │ ├── copy.c
│ │ │ │ ├── copyw.c
│ │ │ │ ├── scale.c
│ │ │ │ └── triad.c
│ │ │ ├── host/
│ │ │ │ └── app.c
│ │ │ ├── run.sh
│ │ │ └── support/
│ │ │ ├── common.h
│ │ │ ├── cyclecount.h
│ │ │ ├── params.h
│ │ │ └── timer.h
│ │ ├── STRIDED/
│ │ │ ├── Makefile
│ │ │ ├── dpu/
│ │ │ │ └── strided.c
│ │ │ ├── host/
│ │ │ │ └── app.c
│ │ │ ├── run.sh
│ │ │ └── support/
│ │ │ ├── common.h
│ │ │ ├── cyclecount.h
│ │ │ ├── params.h
│ │ │ └── timer.h
│ │ ├── SpMV/
│ │ │ ├── Makefile
│ │ │ ├── baselines/
│ │ │ │ ├── cpu/
│ │ │ │ │ ├── Makefile
│ │ │ │ │ ├── README
│ │ │ │ │ └── app.c
│ │ │ │ └── gpu/
│ │ │ │ ├── Makefile
│ │ │ │ ├── README
│ │ │ │ └── app.cu
│ │ │ ├── data/
│ │ │ │ ├── bcsstk30.mtx
│ │ │ │ └── generate/
│ │ │ │ ├── Makefile
│ │ │ │ ├── generate.sh
│ │ │ │ └── replicate.c
│ │ │ ├── dpu/
│ │ │ │ └── task.c
│ │ │ ├── host/
│ │ │ │ ├── app.c
│ │ │ │ └── mram-management.h
│ │ │ └── support/
│ │ │ ├── common.h
│ │ │ ├── matrix.h
│ │ │ ├── params.h
│ │ │ ├── timer.h
│ │ │ └── utils.h
│ │ ├── TRNS/
│ │ │ ├── Makefile
│ │ │ ├── baselines/
│ │ │ │ ├── cpu/
│ │ │ │ │ ├── Makefile
│ │ │ │ │ ├── README
│ │ │ │ │ ├── kernel.cpp
│ │ │ │ │ ├── kernel.h
│ │ │ │ │ ├── main.cpp
│ │ │ │ │ └── support/
│ │ │ │ │ ├── common.h
│ │ │ │ │ ├── setup.h
│ │ │ │ │ ├── timer.h
│ │ │ │ │ └── verify.h
│ │ │ │ └── gpu/
│ │ │ │ ├── Makefile
│ │ │ │ ├── README
│ │ │ │ ├── kernel.cu
│ │ │ │ ├── kernel.h
│ │ │ │ ├── main.cpp
│ │ │ │ └── support/
│ │ │ │ ├── common.h
│ │ │ │ ├── cuda-setup.h
│ │ │ │ ├── timer.h
│ │ │ │ └── verify.h
│ │ │ ├── dpu/
│ │ │ │ └── task.c
│ │ │ ├── host/
│ │ │ │ └── app.c
│ │ │ └── support/
│ │ │ ├── common.h
│ │ │ ├── params.h
│ │ │ └── timer.h
│ │ ├── TS/
│ │ │ ├── Makefile
│ │ │ ├── baselines/
│ │ │ │ ├── cpu/
│ │ │ │ │ ├── Makefile
│ │ │ │ │ ├── README
│ │ │ │ │ ├── inputs/
│ │ │ │ │ │ └── randomlist33M.txt
│ │ │ │ │ ├── launch.sh
│ │ │ │ │ ├── mprofile.h
│ │ │ │ │ ├── streamp_openmp.cpp
│ │ │ │ │ └── tools.cpp
│ │ │ │ └── gpu/
│ │ │ │ ├── Makefile
│ │ │ │ ├── README
│ │ │ │ ├── STREAMP.cu
│ │ │ │ ├── inputs/
│ │ │ │ │ └── randomlist33M.txt
│ │ │ │ ├── launch.sh
│ │ │ │ └── randlist.py
│ │ │ ├── dpu/
│ │ │ │ └── task.c
│ │ │ ├── host/
│ │ │ │ └── app.c
│ │ │ └── support/
│ │ │ ├── common.h
│ │ │ ├── params.h
│ │ │ └── timer.h
│ │ ├── UNI/
│ │ │ ├── Makefile
│ │ │ ├── baselines/
│ │ │ │ ├── cpu/
│ │ │ │ │ ├── Makefile
│ │ │ │ │ ├── README
│ │ │ │ │ └── app_baseline.c
│ │ │ │ └── gpu/
│ │ │ │ ├── Makefile
│ │ │ │ ├── README
│ │ │ │ ├── ds.h
│ │ │ │ ├── kernel.cu
│ │ │ │ └── unique.cu
│ │ │ ├── dpu/
│ │ │ │ └── task.c
│ │ │ ├── host/
│ │ │ │ └── app.c
│ │ │ └── support/
│ │ │ ├── common.h
│ │ │ ├── params.h
│ │ │ └── timer.h
│ │ ├── VA/
│ │ │ ├── Makefile
│ │ │ ├── baselines/
│ │ │ │ ├── cpu/
│ │ │ │ │ ├── Makefile
│ │ │ │ │ ├── README
│ │ │ │ │ └── app_baseline.c
│ │ │ │ └── gpu/
│ │ │ │ ├── Makefile
│ │ │ │ ├── README
│ │ │ │ └── vec_add.cu
│ │ │ ├── dpu/
│ │ │ │ └── task.c
│ │ │ ├── host/
│ │ │ │ └── app.c
│ │ │ └── support/
│ │ │ ├── common.h
│ │ │ ├── params.h
│ │ │ └── timer.h
│ │ └── WRAM/
│ │ ├── Makefile
│ │ ├── dpu/
│ │ │ └── task.c
│ │ ├── host/
│ │ │ └── app.c
│ │ ├── run.sh
│ │ └── support/
│ │ ├── common.h
│ │ ├── cyclecount.h
│ │ ├── params.h
│ │ └── timer.h
│ ├── docker/
│ │ ├── compiler.dockerfile
│ │ └── parser.dockerfile
│ ├── pyproject.toml
│ ├── requirements.txt
│ ├── sdk/
│ │ ├── misc/
│ │ │ ├── accessMramFromDpu.c
│ │ │ ├── coreDump.c
│ │ │ ├── crt0.c
│ │ │ ├── dpu.lds
│ │ │ ├── internalStateReset.c
│ │ │ ├── linkerScript.lds
│ │ │ ├── restoreRegisters.c
│ │ │ └── restore_carry_and_zero_flag.h
│ │ ├── stdlib/
│ │ │ ├── abort.c
│ │ │ ├── assert.h
│ │ │ ├── atoi.c
│ │ │ ├── atol.c
│ │ │ ├── ctype.h
│ │ │ ├── errno.c
│ │ │ ├── errno.h
│ │ │ ├── exit.c
│ │ │ ├── inttypes.h
│ │ │ ├── iso646.h
│ │ │ ├── limits.h
│ │ │ ├── memchr.c
│ │ │ ├── memcmp.c
│ │ │ ├── memcpy.c
│ │ │ ├── memmove.c
│ │ │ ├── memmram_utils.h
│ │ │ ├── memset.c
│ │ │ ├── stdalign.h
│ │ │ ├── stdarg.h
│ │ │ ├── stdbool.h
│ │ │ ├── stddef.h
│ │ │ ├── stdint.h
│ │ │ ├── stdio.c
│ │ │ ├── stdio.h
│ │ │ ├── stdlib.h
│ │ │ ├── stdnoreturn.h
│ │ │ ├── stpcpy.c
│ │ │ ├── stpncpy.c
│ │ │ ├── strcat.c
│ │ │ ├── strchr.c
│ │ │ ├── strcmp.c
│ │ │ ├── strcpy.c
│ │ │ ├── strcspn.c
│ │ │ ├── strdup.c
│ │ │ ├── strerror.c
│ │ │ ├── string.h
│ │ │ ├── strlen.c
│ │ │ ├── strlwr.c
│ │ │ ├── strncat.c
│ │ │ ├── strncmp.c
│ │ │ ├── strncpy.c
│ │ │ ├── strndup.c
│ │ │ ├── strnlen.c
│ │ │ ├── strpbrk.c
│ │ │ ├── strrchr.c
│ │ │ ├── strrev.c
│ │ │ ├── strsep.c
│ │ │ ├── strspn.c
│ │ │ ├── strstr.c
│ │ │ ├── strtok_r.c
│ │ │ ├── strtol.c
│ │ │ └── strupr.c
│ │ └── syslib/
│ │ ├── absvdi2.c
│ │ ├── absvsi2.c
│ │ ├── adddf3.c
│ │ ├── addsf3.c
│ │ ├── addvdi3.c
│ │ ├── addvsi3.c
│ │ ├── alloc.c
│ │ ├── alloc.h
│ │ ├── ashldi3.c
│ │ ├── ashrdi3.c
│ │ ├── atomic_bit.h
│ │ ├── atomics.c
│ │ ├── attributes.h
│ │ ├── barrier.c
│ │ ├── barrier.h
│ │ ├── bswapdi2.c
│ │ ├── bswapsi2.c
│ │ ├── buddy_alloc.c
│ │ ├── buddy_alloc.h
│ │ ├── buddy_realloc.c
│ │ ├── built_ins.h
│ │ ├── clzdi2.c
│ │ ├── clzsi2.c
│ │ ├── cmpdi2.c
│ │ ├── comparedf2.c
│ │ ├── comparesf2.c
│ │ ├── ctzdi2.c
│ │ ├── ctzsi2.c
│ │ ├── defs.c
│ │ ├── defs.h
│ │ ├── devprivate.h
│ │ ├── div32.c
│ │ ├── divdf3.c
│ │ ├── divdi3.c
│ │ ├── divmodsi4.c
│ │ ├── divsf3.c
│ │ ├── divsi3.c
│ │ ├── dpuconst.h
│ │ ├── dpufault.h
│ │ ├── dpuruntime.h
│ │ ├── extendhfsf2.c
│ │ ├── extendsfdf2.c
│ │ ├── ffsdi2.c
│ │ ├── ffssi2.c
│ │ ├── ffsti2.c
│ │ ├── fixdfdi.c
│ │ ├── fixdfsi.c
│ │ ├── fixsfdi.c
│ │ ├── fixsfsi.c
│ │ ├── fixunsdfdi.c
│ │ ├── fixunsdfsi.c
│ │ ├── fixunssfdi.c
│ │ ├── fixunssfsi.c
│ │ ├── float.h
│ │ ├── floatdidf.c
│ │ ├── floatdisf.c
│ │ ├── floatsidf.c
│ │ ├── floatsisf.c
│ │ ├── floatundidf.c
│ │ ├── floatundisf.c
│ │ ├── floatunsidf.c
│ │ ├── floatunsisf.c
│ │ ├── fp_add_impl.inc
│ │ ├── fp_extend.h
│ │ ├── fp_extend_impl.inc
│ │ ├── fp_fixint_impl.inc
│ │ ├── fp_fixuint_impl.inc
│ │ ├── fp_lib.h
│ │ ├── fp_mul_impl.inc
│ │ ├── fp_trunc.h
│ │ ├── fp_trunc_impl.inc
│ │ ├── fsb_allocator.c
│ │ ├── fsb_allocator.h
│ │ ├── handshake.c
│ │ ├── handshake.h
│ │ ├── int_endianness.h
│ │ ├── int_lib.h
│ │ ├── int_math.h
│ │ ├── int_types.h
│ │ ├── int_util.c
│ │ ├── int_util.h
│ │ ├── listener.c
│ │ ├── lshrdi3.c
│ │ ├── macro_utils.h
│ │ ├── mcount.c
│ │ ├── moddi3.c
│ │ ├── modsi3.c
│ │ ├── mram.h
│ │ ├── mul32.c
│ │ ├── mul64.c
│ │ ├── muldc3.c
│ │ ├── muldf3.c
│ │ ├── mulodi4.c
│ │ ├── mulosi4.c
│ │ ├── mulsf3.c
│ │ ├── mulvdi3.c
│ │ ├── mulvsi3.c
│ │ ├── mutex.h
│ │ ├── negdf2.c
│ │ ├── negdi2.c
│ │ ├── negsf2.c
│ │ ├── negvdi2.c
│ │ ├── negvsi2.c
│ │ ├── paritydi2.c
│ │ ├── paritysi2.c
│ │ ├── perfcounter.c
│ │ ├── perfcounter.h
│ │ ├── popcountdi2.c
│ │ ├── popcountsi2.c
│ │ ├── powidf2.c
│ │ ├── powisf2.c
│ │ ├── profiling.c
│ │ ├── profiling.h
│ │ ├── profiling_internals.h
│ │ ├── sem.c
│ │ ├── sem.h
│ │ ├── seqread.h
│ │ ├── seqread.inc
│ │ ├── seqread1024.c
│ │ ├── seqread128.c
│ │ ├── seqread256.c
│ │ ├── seqread32.c
│ │ ├── seqread512.c
│ │ ├── seqread64.c
│ │ ├── soft_cache.c
│ │ ├── soft_cache.h
│ │ ├── subdf3.c
│ │ ├── subsf3.c
│ │ ├── subvdi3.c
│ │ ├── subvsi3.c
│ │ ├── sysdef.h
│ │ ├── truncdfhf2.c
│ │ ├── truncdfsf2.c
│ │ ├── truncsfhf2.c
│ │ ├── ucmpdi2.c
│ │ ├── udiv64.c
│ │ ├── udivdi3.c
│ │ ├── udivmodsi4.c
│ │ ├── udivsi3.c
│ │ ├── umoddi3.c
│ │ ├── umodsi3.c
│ │ └── waitqueue.c
│ ├── src/
│ │ ├── abi/
│ │ │ ├── binary/
│ │ │ │ ├── executable.py
│ │ │ │ ├── liveness.py
│ │ │ │ └── relocatable.py
│ │ │ ├── directive/
│ │ │ │ ├── ascii_directive.py
│ │ │ │ ├── asciz_directive.py
│ │ │ │ ├── byte_directive.py
│ │ │ │ ├── long_directive.py
│ │ │ │ ├── quad_directive.py
│ │ │ │ ├── short_directive.py
│ │ │ │ └── zero_directive.py
│ │ │ ├── isa/
│ │ │ │ ├── cc/
│ │ │ │ │ ├── _base_cc.py
│ │ │ │ │ ├── acquire_cc.py
│ │ │ │ │ ├── add_nz_cc.py
│ │ │ │ │ ├── boot_cc.py
│ │ │ │ │ ├── const_cc_ge0.py
│ │ │ │ │ ├── const_cc_geu.py
│ │ │ │ │ ├── const_cc_zero.py
│ │ │ │ │ ├── count_nz_cc.py
│ │ │ │ │ ├── div_cc.py
│ │ │ │ │ ├── div_nz_cc.py
│ │ │ │ │ ├── ext_sub_set_cc.py
│ │ │ │ │ ├── false_cc.py
│ │ │ │ │ ├── imm_shift_nz_cc.py
│ │ │ │ │ ├── log_nz_cc.py
│ │ │ │ │ ├── log_set_cc.py
│ │ │ │ │ ├── mul_nz_cc.py
│ │ │ │ │ ├── no_cc.py
│ │ │ │ │ ├── release_cc.py
│ │ │ │ │ ├── shift_nz_cc.py
│ │ │ │ │ ├── sub_nz_cc.py
│ │ │ │ │ ├── sub_set_cc.py
│ │ │ │ │ ├── true_cc.py
│ │ │ │ │ └── true_false_cc.py
│ │ │ │ ├── exception.py
│ │ │ │ ├── flag.py
│ │ │ │ ├── instruction/
│ │ │ │ │ ├── condition.py
│ │ │ │ │ ├── endian.py
│ │ │ │ │ ├── instruction.py
│ │ │ │ │ ├── op_code.py
│ │ │ │ │ └── suffix.py
│ │ │ │ └── register/
│ │ │ │ ├── gp_register.py
│ │ │ │ ├── pair_register.py
│ │ │ │ └── sp_register.py
│ │ │ ├── label/
│ │ │ │ ├── label.py
│ │ │ │ └── symbol.py
│ │ │ ├── section/
│ │ │ │ ├── section.py
│ │ │ │ ├── section_flag.py
│ │ │ │ ├── section_name.py
│ │ │ │ └── section_type.py
│ │ │ └── word/
│ │ │ ├── _base_word.py
│ │ │ ├── data_address_word.py
│ │ │ ├── data_word.py
│ │ │ ├── double_data_word.py
│ │ │ ├── immediate.py
│ │ │ ├── instruction_address_word.py
│ │ │ ├── instruction_word.py
│ │ │ └── representation.py
│ │ ├── assembler/
│ │ │ ├── assembler.py
│ │ │ └── data_prep/
│ │ │ ├── bin.py
│ │ │ ├── bs_data_prep.py
│ │ │ ├── gemv_data_prep.py
│ │ │ ├── hst_data_prep.py
│ │ │ ├── mlp_data_prep.py
│ │ │ ├── red_data_prep.py
│ │ │ ├── scan_rss_data_prep.py
│ │ │ ├── scan_ssa_data_prep.py
│ │ │ ├── sel_data_prep.py
│ │ │ ├── trns_data_prep.py
│ │ │ ├── ts_data_prep.py
│ │ │ ├── uni_data_prep.py
│ │ │ └── va_data_prep.py
│ │ ├── compiler/
│ │ │ └── compiler.py
│ │ ├── converter/
│ │ │ ├── condition_converter.py
│ │ │ ├── endian_converter.py
│ │ │ ├── instruction_converter.py
│ │ │ ├── op_code_converter.py
│ │ │ ├── register_converter.py
│ │ │ ├── section_flag_converter.py
│ │ │ ├── section_name_converter.py
│ │ │ ├── section_type_converter.py
│ │ │ ├── suffix_converter.py
│ │ │ └── symbol_converter.py
│ │ ├── encoder/
│ │ │ ├── ascii_encoder.py
│ │ │ ├── byte.py
│ │ │ ├── directive_encoder.py
│ │ │ └── instruction_encoder.py
│ │ ├── initializer/
│ │ │ ├── directive_initializer.py
│ │ │ ├── instruction_initializer.py
│ │ │ ├── int_initializer.py
│ │ │ └── str_initializer.py
│ │ ├── iss/
│ │ │ ├── cpu/
│ │ │ │ ├── cpu.py
│ │ │ │ ├── fini_thread.py
│ │ │ │ ├── init_thread.py
│ │ │ │ └── sched_thread.py
│ │ │ ├── dpu/
│ │ │ │ ├── alu.py
│ │ │ │ ├── decoder.py
│ │ │ │ ├── dispatcher.py
│ │ │ │ ├── dma.py
│ │ │ │ ├── dpu.py
│ │ │ │ ├── logic.py
│ │ │ │ ├── scheduler.py
│ │ │ │ └── thread.py
│ │ │ ├── dram/
│ │ │ │ ├── mram.py
│ │ │ │ ├── mram_command.py
│ │ │ │ └── word.py
│ │ │ ├── register/
│ │ │ │ ├── condition_register.py
│ │ │ │ ├── exception_register.py
│ │ │ │ ├── flag_register.py
│ │ │ │ ├── gp_register.py
│ │ │ │ ├── pc_register.py
│ │ │ │ ├── register_file.py
│ │ │ │ └── sp_register.py
│ │ │ ├── sram/
│ │ │ │ ├── atomic.py
│ │ │ │ ├── iram.py
│ │ │ │ ├── lock.py
│ │ │ │ └── wram.py
│ │ │ └── system.py
│ │ ├── linker_/
│ │ │ ├── linker.py
│ │ │ ├── linker_script.py
│ │ │ └── logic/
│ │ │ ├── instruction_assigner.py
│ │ │ ├── label_assigner.py
│ │ │ ├── liveness_analyzer.py
│ │ │ └── set_assigner.py
│ │ ├── main.py
│ │ ├── parser_/
│ │ │ ├── grammar/
│ │ │ │ ├── .antlr/
│ │ │ │ │ ├── assembly.interp
│ │ │ │ │ ├── assembly.tokens
│ │ │ │ │ ├── assemblyLexer.interp
│ │ │ │ │ ├── assemblyLexer.java
│ │ │ │ │ ├── assemblyLexer.tokens
│ │ │ │ │ └── assemblyParser.java
│ │ │ │ ├── assembly.g4
│ │ │ │ ├── assembly.interp
│ │ │ │ ├── assembly.tokens
│ │ │ │ ├── assemblyLexer.interp
│ │ │ │ ├── assemblyLexer.py
│ │ │ │ ├── assemblyLexer.tokens
│ │ │ │ ├── assemblyListener.py
│ │ │ │ └── assemblyParser.py
│ │ │ ├── grammar_generator.py
│ │ │ └── parser.py
│ │ └── util/
│ │ ├── config_loader.py
│ │ ├── docker_client.py
│ │ ├── param_loader.py
│ │ └── path_collector.py
│ └── test/
│ ├── abi/
│ │ ├── binary/
│ │ │ ├── executable_test.py
│ │ │ └── liveness_test.py
│ │ ├── directive/
│ │ │ ├── ascii_directive_test.py
│ │ │ ├── asciz_directive_test.py
│ │ │ ├── byte_directive_test.py
│ │ │ ├── long_directive_test.py
│ │ │ ├── quad_directive_test.py
│ │ │ ├── short_directive_test.py
│ │ │ └── zero_directive_test.py
│ │ ├── isa/
│ │ │ └── register/
│ │ │ ├── gp_register_test.py
│ │ │ └── pair_register_test.py
│ │ ├── label/
│ │ │ └── label_test.py
│ │ ├── section/
│ │ │ └── section_test.py
│ │ └── word/
│ │ ├── immediate_test.py
│ │ └── words_test.py
│ ├── compiler/
│ │ └── compiler_test.py
│ ├── encoder/
│ │ ├── ascii_encoder_test.py
│ │ ├── directive_encoder_test.py
│ │ └── instruction_encoder_test.py
│ ├── iss/
│ │ ├── dpu/
│ │ │ ├── dma_test.py
│ │ │ └── scheduler_test.py
│ │ ├── dram/
│ │ │ └── mram_test.py
│ │ ├── register/
│ │ │ └── register_file_test.py
│ │ └── sram/
│ │ ├── atomic_test.py
│ │ ├── iram_test.py
│ │ └── wram_test.py
│ ├── linker_/
│ │ └── linker_test.py
│ ├── parser_/
│ │ ├── grammar_generator_test.py
│ │ └── parser_test.py
│ └── util/
│ └── config_loader_test.py
└── tools/
├── README.md
├── upmem_profiler/
│ ├── CMakeLists.txt
│ ├── script/
│ │ ├── active_tasklet_profile.sh
│ │ ├── build.sh
│ │ ├── example.sh
│ │ ├── function_profile.sh
│ │ ├── instruction_mix_profile.sh
│ │ ├── mram_access_pattern_profile.sh
│ │ ├── timeline_profile.sh
│ │ └── tlb_behavior_profile.sh
│ └── src/
│ ├── CMakeLists.txt
│ ├── abi/
│ │ └── instruction/
│ │ ├── op_code.h
│ │ └── suffix.h
│ ├── basic/
│ │ ├── instruction_parser.cc
│ │ ├── instruction_parser.h
│ │ ├── interval.cc
│ │ ├── interval.h
│ │ ├── reg_file_parser.cc
│ │ ├── reg_file_parser.h
│ │ ├── stats_parser.cc
│ │ └── stats_parser.h
│ ├── converter/
│ │ ├── op_code_converter.cc
│ │ ├── op_code_converter.h
│ │ ├── suffix_converter.cc
│ │ └── suffix_converter.h
│ ├── instruction_mix/
│ │ ├── instruction_mix_profiler.cc
│ │ └── instruction_mix_profiler.h
│ ├── main.cc
│ ├── main.h
│ └── util/
│ ├── argument_parser.cc
│ ├── argument_parser.h
│ └── config_loader.h
└── upmem_reg_model/
├── data/
│ ├── input.xlsx
│ └── output.xlsx
├── script/
│ └── format.sh
└── src/
├── benchmark/
│ ├── _base_benchmark.py
│ ├── bs.py
│ ├── gemv.py
│ ├── hst_l.py
│ ├── hst_s.py
│ ├── mlp.py
│ ├── red.py
│ ├── scan_rss.py
│ ├── scan_ssa.py
│ ├── sel.py
│ ├── trns.py
│ ├── ts.py
│ ├── uni.py
│ └── va.py
├── io_/
│ ├── excel_reader.py
│ └── excel_writer.py
├── main.py
└── regression/
├── datum.py
└── model.py
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitignore
================================================
*.S
*.bin
bin/
build/
cmake-build-debug/
validation_log/
__pycache__/
.idea/
.vscode/
# Prerequisites
*.d
# Compiled Object files
*.slo
*.lo
*.o
*.obj
# Precompiled Headers
*.gch
*.pch
# Compiled Dynamic libraries
*.so
*.dylib
*.dll
# Compiled Static libraries
*.lai
*.la
*.a
*.lib
# Executables
*.exe
*.out
*.app
================================================
FILE: LICENSE
================================================
Copyright (c) 2024, VIA Research Group at KAIST
Permission is hereby granted, free of charge, to any person obtaining a
copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
================================================
FILE: README.md
================================================
# 📖 Introduction
**Welcome to the uPIMulator Framework Documentation!**
This documentation serves as your comprehensive guide to the uPIMulator framework, catering to both novice and experienced researchers.
Here, you'll find the resources necessary to leverage uPIMulator effectively for your research projects.
We provide in-depth coverage of uPIMulator's features, from foundational concepts to advanced functionalities.
Explore this documentation to unlock the full potential of uPIMulator and elevate your research endeavors.
# 🤙 Contact Information
## 📍 Address
[KAIST](https://www.kaist.ac.kr/en/), School of Electrical Engineering
[Vertically Integrated Architecture Research Group](https://sites.google.com/view/kaist-via/home)
Office: N1 818 @ [KAIST](https://www.kaist.ac.kr/en/)
## 📧 Email
- Bongjoon Hyun: [bongjoon.hyun@gmail.com](mailto:bongjoon.hyun@gmail.com)
- Taehun Kim: [taehun.kim@kaist.ac.kr](mailto:taehun.kim@kaist.ac.kr)
- Dongjae Lee: [dongjae.lee@kaist.ac.kr](mailto:dongjae.lee@kaist.ac.kr)
- Minsoo Rhu: [minsoo.rhu@gmail.com](mailto:minsoo.rhu@gmail.com)
Please feel free to reach out to us if you have any questions or require further assistance.
# 🧑💻 Released Versions
> **uPIMulator: A Flexible and Scalable Simulation Framework for General-Purpose Processing-In-Memory (PIM) Architectures**
uPIMulator is a cycle-level performance simulator tailored for general-purpose Processing-In-Memory (PIM) systems adhering to the UPMEM Instruction Set Architecture (ISA). This tool provides a detailed simulation environment, empowering computer architecture researchers and PIM program developers to investigate and harness the capabilities of PIM technology.
For comprehensive insights into uPIMulator and its applications, refer to our HPCA-2024 publication:
"[Pathfinding Future PIM Architectures by Demystifying a Commercial PIM Technology](https://www.computer.org/csdl/proceedings-article/hpca/2024/931300a263/1VOAAZSdy0w)", HPCA, 2024
The currently available versions are:
1. Python & C++ version
2. Go version
3. Go & virtual machine version
All source code and version history can be accessed in our GitHub repository.
## The Python & C++ Version
This is our initial implementation of uPIMulator, used in our HPCA-2024 publication.
You'll find it in the `python_cpp` directory. Refer to the included [README](./python_cpp/README.md) for usage instructions.
## The Go Version
Our second implementation, optimized for speed and memory efficiency.
Located in the `golang` directory, it offers an 8.5x speed increase and 7.5x memory reduction due to multi-threading.
We generally recommend this version for most use cases. See the [README](./golang/README.md) for details.
## The Go & Virtual Machine Version
This latest version extends the Go version with virtual machine capabilities, eliminating the need for manual data preparation code in many scenarios.
It's particularly suitable for complex or dynamic benchmarks where manual data preparation is challenging, though it may not offer the fastest simulation speed.
Refer to the [README](./golang_vm/README.md) for further guidance.
# 🔍 Summary of Correlation Ratio (R²) and Mean Absolute Error (MAE)
## Single DPU
## Multiple DPUs
- Each data point represents a single kernel from the PrIM benchmark suite.
- Summary of Correlation Ratio (R²) and Mean Absolute Error (MAE) for single- and multi-DPU simulations:
| Scenario | Total Data Points | Correlation (R²) | MAE |
|---|---|---|---|
| Single DPU (no inter-DPU communication) | 710 | 98.4% | 12.0% |
| Multi-DPU (with inter-DPU communication) | 387 | 83.6% | 26.9% |
- These validation results were obtained using the Python & C++ version of uPIMulator.
# List of Supported Instructions
- uPIMulator currently supports 599 out of the 970 instructions defined in the [UPMEM-PIM ISA](https://sdk.upmem.com/2023.2.0/201_IS.html#instruction-set-architecture).
Due to limitations in the publicly available ISA documentation, support for the remaining 371 instructions is pending.
However, the currently supported instructions have been sufficient to enable functionally correct simulations of the [PrIM benchmark suite](https://github.com/CMU-SAFARI/prim-benchmarks), producing results consistent with those obtained on real UPMEM-PIM hardware.
- For a detailed list of the currently supported instructions, please refer to [this Google spreadsheet](https://docs.google.com/spreadsheets/d/1xq8t6aRvafmTlGmy4Am8i3QmoOzli1heNxLmCEIlCv4/edit?usp=sharing).
# 🪨 Materials
- Bongjoon Hyun, Taehun Kim, Dongjae Lee, and Minsoo Rhu, "[Pathfinding Future PIM Architectures by Demystifying a Commercial PIM Technology](https://www.computer.org/csdl/proceedings-article/hpca/2024/931300a263/1VOAAZSdy0w)", *The 30th IEEE International Symposium on High-Performance Computer Architecture ([HPCA-30](https://hpca-conf.org/2024/))*, Edinburgh, Scotland, Mar. 2024
- ${\textsf{\color{red}Best Paper Award}}$
- Acceptance rate: 18% (75 among 410)
- [Slide](https://drive.google.com/file/d/1TRgFu6YdBu2gtdtDKIuJI85u8Be8w60M/view?usp=sharing)
- [Presentation](https://youtu.be/e-RXYl568fw?si=LbTYbM7p7qod-L8w)
# 🎁 Contributions
We welcome and encourage contributions to uPIMulator!
If you are interested in contributing or have questions, please feel free to open an issue or submit a pull request.
## List of Maintainers
- Bongjoon Hyun (bongjoon.hyun@gmail.com)
- Taehun Kim (taehun.kim@kaist.ac.kr)
- Dongjae Lee (dongjae.lee@kaist.ac.kr)
- Minsoo Rhu (minsoo.rhu@gmail.com)
## List of Contributors
# 🙏 Acknowledgement
We would like to thank the developers of the [PrIM benchmark suite](https://github.com/CMU-SAFARI/prim-benchmarks), which was instrumental in developing this project.
This research is funded by the generous support from the following organizations:
- Institute of Information & Communications Technology Planning & Evaluation (IITP) grant funded by the Korea government(MSIT) (No. 2022-0-01037, Development of High Performance Processing-in-Memory Technology based on DRAM) and the Korea government(MSIT) (No.RS-2024-00438851, (SW Starlab) High-performance Privacy-preserving Machine Learning System and System Software)
- National Research Foundation of Korea (NRF) grant funded by the Korea government (MSIT) (NRF-2021R1A2C2091753)
- Samsung Electronics
We appreciate their commitment to advancing research in this field.
## 📔 Citation
Bongjoon Hyun, Taehun Kim, Dongjae Lee, and Minsoo Rhu, "[Pathfinding Future PIM Architectures by Demystifying a Commercial PIM Technology](https://www.computer.org/csdl/proceedings-article/hpca/2024/931300a263/1VOAAZSdy0w)", IEEE International Symposium on High-Performance Computer Architecture (HPCA), March 2024.
================================================
FILE: golang/README.md
================================================
# ⚙️ Usage
## Currently Supported Mode
uPIMulator operates in an execution-driven simulation mode, enabling cycle-level performance analysis of PIM-based applications.
## Workflow
The typical usage workflow comprises two primary stages:
1. **Binary Generation:** Compile, assemble, and link your application code to generate the required binary files for simulation.
2. **Cycle-Level Simulation:** Utilize the generated binary files as input to the cycle-level simulator to obtain detailed performance metrics and insights.
We are actively working on expanding uPIMulator's capabilities and may introduce additional usage modes in future releases.
## Installation & Build
### Prerequisites
- **Go Compiler and SDK:** uPIMulator requires Go 1.21.5 or later.
You can download and install Go from the [official website](https://go.dev/doc/install).
- **Docker:** Please ensure that Docker is installed on your system.
- **Docker Group Membership:** Your Ubuntu user account needs to be a member of the `docker` group.
- **Tested Environment:** uPIMulator has been thoroughly tested on Ubuntu 18.04 with an Intel CPU.
While we strive for compatibility across different environments, optimal performance and functionality are guaranteed within the tested setup.
### Installation Steps
1. **Install and Build**
Navigate to the `uPIMulator` directory and execute the build script:
```bash
cd /path/to/uPIMulator/golang/uPIMulator/script
python build.py
```
## Binary Files Generation & Cycle-Level Simulation
We will use the VA (vector addition) benchmark as an example to demonstrate the binary file generation and simulation process.
Please note that the initial simulation might take approximately 30 minutes.
### Execution
To initiate a simulation, provide the following:
- **Benchmark name:** Specify the desired benchmark (e.g., 'VA').
- **Number of tasklets:** Define the number of tasklets to be utilized.
- **Output directory path:** Indicate the absolute path to the directory where you want to store binary files, log files, and other simulation artifacts.
You can further customize the simulation by utilizing command-line options to adjust various parameters.
### Simulation Output
Detailed simulation results will be written to the standard output (`stdout`).
> **Important Notes:**
> - **Create Output Directory:** Prior to running the simulation, create an empty directory at the specified `bin_dirpath`.
> - **Absolute Paths:** Always provide absolute paths for both `root_dirpath` (the repository's root directory) and `bin_dirpath`.
### Example Command
```bash
cd /path/to/uPIMulator/golang/uPIMulator
rm -rf bin
mkdir bin
./build/uPIMulator --root_dirpath /path/to/uPIMulator/golang/uPIMulator --bin_dirpath /path/to/uPIMulator/golang/uPIMulator/bin --benchmark VA --num_channels 1 --num_ranks_per_channel 1 --num_dpus_per_rank 1 --num_tasklets 16 --data_prep_params 1024
```
# 📄 Reproducing Figures from the Paper
To replicate the figures presented in our paper, please adhere to the instructions provided below.
For brevity, we provide replication instructions only for Figures 5, 6, 7, 9, and 10.
## Configuration of PrIM Benchmarks
- **Single DPU Focus:** For Figures 5, 6, 7, and 9, the parameters `num_channels`, `num_ranks_per_channel`, and `num_dpus_per_rank` must always be set to `1`, as these experiments specifically characterize the behavior of a single DPU.
- **Data Preparation Parameter:** When generating the binary files for the PrIM benchmarks, please configure the `data_prep_param` parameter according to the following table:
| Benchmark | `data_prep_param` (Figures 5, 6) | `data_prep_param` (Figure 10) |
|---|---|---|
| BS | 32768 | 131072 |
| GEMV | 2048 | 4096 |
| HST-L | 131072 | 524288 |
| HST-S | 131072 | 524288 |
| MLP | 256 | 1024 |
| RED | 524288 | 2097152|
| SCAN-RSS | 262144 | 1048576|
| SCAN-SSA | 262144 | 1048576|
| SEL | 524288 | 2097152|
| TRNS | 1024 | 128 |
| TS | 2048 | 65536 |
| UNI | 524288 | 2097152|
| VA | 524288 | 2097152|
### Example Command
```bash
./uPIMulator --root_dirpath /path/to/uPIMulator/ --bin_dirpath /path/to/uPIMulator/bin --benchmark VA --num_channels 1 --num_ranks_per_channel 1 --num_dpus_per_rank 1 --num_tasklets 16 --data_prep_params 524288
```
Please ensure you adhere to these configurations to accurately replicate the figures presented in the paper.
## Figure 5: PrIM Compute and Memory Utilization
This figure illustrates the compute utilization (represented by red points) and memory read bandwidth utilization (represented by blue points) of the PrIM benchmarks when executed with varying numbers of threads (tasklets): 1, 4, and 16.
### Metrics Calculation
- **Compute Utilization (IPC):** `num_instructions` / `logic_cycle`
- **Memory Read Bandwidth Utilization (GB/s):** Refer to the provided Excel sheet for the calculation: [link](../assets/figure5_mem_util_calculator.xlsx)
> **Note:** The values for `num_instructions` and `logic_cycle` required in these calculations can be obtained from the simulation results generated by uPIMulator.
## Figure 6: DPU Runtime Breakdown
This figure presents a breakdown of DPU runtime, categorizing cycles into:
- **Active Cycles (Black):** Represent cycles when the DPU is actively executing instructions.
- **Idle Cycles (Red, Yellow, Blue):** Represent cycles when the DPU is stalled due to various reasons.
### Calculation of Cycle Ratios
To generate the breakdown depicted in the figure, you can utilize the following formulas:
- **Issuable Ratio:** `breakdown_run` / `logic_cycle`
- **Idle (Memory) Ratio:** `breakdown_dma` / `logic_cycle`
- **Idle (Revolver) Ratio:** `breakdown_etc` / `logic_cycle`
- **Idle (RF) Ratio:** `backpressure` / `logic_cycle`
> **Note:** The values for the variables used in these formulas (`breakdown_run`, `logic_cycle`, etc.) are available in the simulation results produced by uPIMulator.
## Figure 7: Issuable Tasklets
This figure visualizes the number of tasklets (threads) that are ready for execution (issuable) by the DPU scheduler at each cycle.
### Replication
To reproduce this figure, utilize the provided [Excel sheet](../assets/figure7_active_tasklet_breakdown.xlsx).
The spreadsheet includes instructions on how to populate it with the relevant simulation output data, and it will automatically generate the corresponding figure.
> **Important Configuration Note**: Please ensure that the number of threads is configured to **16 tasklets** when running the simulations for this figure.
> You can achieve this by using the following command-line argument: `--num_tasklets 16`.
## Figure 9: Instruction Mix (Single DPU)
Figure 9 provides a breakdown of the instruction mix observed during single-DPU execution.
To generate this figure, follow the steps outlined below using the `upmem_profiler` tool and the accompanying Excel sheet.
### Procedure
1. **Build the Profiler**
```bash
cd /path/to/uPIMulator/tools/upmem_profiler/script
bash build.sh
```
2. **Extract Instructions**
Run the simulation with the `--verbose 1` flag to capture detailed instruction traces.
```bash
cd /path/to/uPIMulator/golang/uPIMulator/
./build/uPIMulator --root_dirpath /path/to/uPIMulator/golang/uPIMulator --bin_dirpath /path/to/uPIMulator/golang/uPIMulator/bin --benchmark VA --num_channels 1 --num_ranks_per_channel 1 --num_dpus_per_rank 1 --num_tasklets 16 --data_prep_params 1024 --verbose 1 > trace.txt
```
3. **Run the Profiler**
Process the generated trace file using the `upmem_profiler` in `instruction_mix` mode.
```bash
cd /path/to/uPIMulator/tools/upmem_profiler/
./build/src/upmem_profiler --logpath /path/to/uPIMulator/golang/uPIMulator/trace.txt --mode instruction_mix
```
4. **Generate the Figure**
Utilize the profiler's output to populate the provided [Excel sheet](../assets/figure9_instruction_mix.xlsx), which will automatically generate the instruction mix figure.
> **Important Configuration Note:** Similar to Figure 7, the instruction mix analysis in Figure 9 is based on simulations with **16 tasklets**.
> Ensure that you maintain this configuration (`--num_tasklets 16`) for accurate replication.
## Figure 10: Multi-DPU Latency Breakdown and Speedup
Figure 10 presents the latency breakdown and speedup achieved in multi-DPU scenarios.
### Configuring the Number of DPUs
You can adjust the number of DPUs by modifying the following parameters in `uPIMulator`:
- `num_channels`
- `num_ranks_per_channel`
- `num_dpus_per_rank`
### Generating the Latency Breakdown
To obtain the latency breakdown data for plotting, utilize the `upmem_reg_model` tool located in the `tools/upmem_reg_model/` directory.
This tool implements a communication model between the host and DPUs based on linear regression.
### Procedure
1. **Prepare Input Excel:**
- We provide a sample input Excel file as a template.
- Append a new row to this file, specifying the benchmark name, number of DPUs, and the `data_prep_param` used in your simulation.
- Fill in the relevant time values (in milliseconds) obtained from your simulation results, such as kernel execution time.
You can convert cycle counts to time in milliseconds by dividing the cycle count by the corresponding clock frequency (in MHz), which yields microseconds, and then dividing by 1000.
2. **Run the Regression Model:**
```bash
cd /path/to/uPIMulator/tools/upmem_reg_model/src
python main.py --input_excel_filepath /path/to/your/input_excel_file --output_excel_filepath /path/to/your/output_excel_file
```
3. **Access the Output:**
- The linear regression results will be available in the specified output Excel file.
- Use this data to create the latency breakdown plots as shown in Figure 10.
Please ensure that you follow these steps carefully to accurately reproduce the multi-DPU latency breakdown and speedup analysis presented in the paper.
# 🌋 Adding Custom Benchmarks
uPIMulator empowers you to go beyond the provided PrIM benchmark suite by incorporating your own custom benchmarks.
This is particularly beneficial if you have access to UPMEM-PIM hardware and want to evaluate your code's performance in a simulated environment.
## Requirements
To successfully integrate a new benchmark, ensure it adheres to the following:
1. **UPMEM-C Language:** The benchmark must be implemented in UPMEM-C, a C-like language tailored for UPMEM-PIM programming.
Consult the [UPMEM SDK documentation](https://sdk.upmem.com/2021.4.0/) for detailed programming guidelines.
2. **File Structure and Naming:**
- Maintain the same file hierarchy as the PrIM benchmarks, including a `dpu` subdirectory.
- Include a `CMakeLists.txt` file within your benchmark's directory hierarchy, mirroring the structure used in the PrIM examples.
This is essential as uPIMulator's interpreter and linker automatically detect and compile benchmarks using these `CMakeLists.txt` files.
## Data Preparation
Since UPMEM PIM-enabled memory directly utilizes physical addresses and uPIMulator currently doesn't support concurrent execution of host and PIM-enabled memory, exercise caution when feeding input/output data.
You'll need to provide Go source code to handle data preparation for your benchmark.
This script should reside in the `uPIMulator/src/assembler` directory and be recognized by `uPIMulator/src/assembler/assembler.go`.
> **Key Considerations for Data Preparation Scripts**
> - Data transferred from the host to DPUs using `dpu_push_xfer` must be organized within the `input_dpu_mram_heap_pointer_name` variable in your data preparation script.
> - Similarly, data transferred from DPUs to the host using `dpu_push_xfer` should be placed within the `output_dpu_mram_heap_pointer_name` variable.
## Reference Examples
We have included data preparation scripts for the 13 supported PrIM benchmarks.
These serve as excellent references for structuring your custom data preparation scripts.
By following these guidelines, you can seamlessly integrate your benchmarks into uPIMulator for comprehensive performance evaluation and analysis.
If you have any questions or encounter any difficulties during the integration process, don't hesitate to reach out to us for support.
================================================
FILE: golang/uPIMulator/benchmark/BS/CMakeLists.txt
================================================
#add_subdirectory(host)
add_subdirectory(dpu)
================================================
FILE: golang/uPIMulator/benchmark/BS/Makefile
================================================
# Build script for the BS benchmark: compiles the host program and the DPU
# program and, for each, an additional assembly listing (<target>.S).
# Source directories of the DPU-side and host-side code.
DPU_DIR := dpu
HOST_DIR := host
# Overridable defaults (?=): output directory and build-time configuration.
BUILDDIR ?= bin
NR_TASKLETS ?= 16
NR_DPUS ?= 1
PROBLEM_SIZE ?= 2
# Name of the stamp file that records the NR_DPUS/NR_TASKLETS pair a build
# was made with; $(1) is the DPU count and $(2) the tasklet count.
define conf_filename
${BUILDDIR}/.NR_DPUS_$(1)_NR_TASKLETS_$(2).conf
endef
CONF := $(call conf_filename,${NR_DPUS},${NR_TASKLETS})
COMMON_INCLUDES := support
HOST_TARGET := ${BUILDDIR}/host_code
DPU_TARGET := ${BUILDDIR}/dpu_code
HOST_SOURCES := $(wildcard ${HOST_DIR}/*.c)
DPU_SOURCES := $(wildcard ${DPU_DIR}/*.c)
.PHONY: all clean test
# Evaluated while the Makefile is parsed, so the build directory exists before
# any rule runs.
__dirs := $(shell mkdir -p ${BUILDDIR})
# -w suppresses all compiler warnings.
COMMON_FLAGS := -w -I${COMMON_INCLUDES}
HOST_FLAGS := ${COMMON_FLAGS} -std=c11 -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DPROBLEM_SIZE=${PROBLEM_SIZE}
DPU_FLAGS := ${COMMON_FLAGS} -O2 -DNR_TASKLETS=${NR_TASKLETS}
all: ${HOST_TARGET} ${DPU_TARGET}
# Both targets depend on the stamp file, so a change of NR_DPUS or NR_TASKLETS
# (which changes the stamp's name) forces a rebuild; stale stamps are removed.
${CONF}:
$(RM) $(call conf_filename,*,*)
touch ${CONF}
# Host program, built with $(CC), plus its assembly listing.
${HOST_TARGET}: ${HOST_SOURCES} ${COMMON_INCLUDES} ${CONF}
$(CC) -o $@ ${HOST_SOURCES} ${HOST_FLAGS}
$(CC) -S -o ${HOST_TARGET}.S ${HOST_SOURCES} ${HOST_FLAGS}
# DPU program, built with dpu-upmem-dpurte-clang, plus its assembly listing.
${DPU_TARGET}: ${DPU_SOURCES} ${COMMON_INCLUDES} ${CONF}
dpu-upmem-dpurte-clang ${DPU_FLAGS} -o $@ ${DPU_SOURCES}
dpu-upmem-dpurte-clang -S ${DPU_FLAGS} -o ${DPU_TARGET}.S ${DPU_SOURCES}
# Removes the whole build directory.
clean:
$(RM) -r $(BUILDDIR)
# Builds everything, then runs the host program with "-i 262144".
test: all
./${HOST_TARGET} -i 262144
================================================
FILE: golang/uPIMulator/benchmark/BS/baselines/cpu/Makefile
================================================
all:
gcc bs_omp.c -o bs_omp -fopenmp
run:
./bs_omp 262144 16777216
================================================
FILE: golang/uPIMulator/benchmark/BS/baselines/cpu/README
================================================
Binary Search (BS)
Compilation instructions:
make
Execution instructions
./bs_omp 2048576 16777216
================================================
FILE: golang/uPIMulator/benchmark/BS/baselines/cpu/bs_omp.c
================================================
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "timer.h"
#define DTYPE uint64_t
/**
 * @brief Fills a sorted input vector and a list of queries drawn from it.
 *
 * input[0] is set to 1 and every following element exceeds its predecessor by
 * a pseudo-random step in [1, 10], so the vector is strictly increasing.
 * Each query is a copy of a pseudo-randomly chosen input element taken from
 * the first nr_elements - 2 entries; when the vector has fewer than three
 * elements every query is a copy of input[0].
 *
 * The generator is reseeded from the wall clock on every call, so the data
 * differs between runs.
 *
 * @param input       Destination buffer with room for nr_elements values.
 * @param nr_elements Number of input elements to generate. When it is 0 the
 *                    function returns without touching either buffer.
 * @param querys      Destination buffer with room for n_querys values.
 * @param n_querys    Number of queries to generate.
 */
void create_test_file(DTYPE * input, uint64_t nr_elements, DTYPE * querys, uint64_t n_querys) {
    if (nr_elements == 0) {
        /* Nothing to generate and no element a query could be copied from. */
        return;
    }

    srand((unsigned)time(NULL));

    input[0] = 1;
    for (uint64_t i = 1; i < nr_elements; i++) {
        input[i] = input[i - 1] + (rand() % 10) + 1;
    }

    /* Queries come from the first nr_elements - 2 entries. For vectors with
     * fewer than three elements that count would be zero (modulo by zero) or
     * wrap around (out-of-range index), so fall back to input[0] only. */
    const uint64_t pool = (nr_elements > 2) ? nr_elements - 2 : 1;
    for (uint64_t i = 0; i < n_querys; i++) {
        querys[i] = input[(uint64_t)rand() % pool];
    }
}
/**
 * @brief Binary-searches the sorted input for every query (host reference).
 *
 * The queries are processed by an OpenMP parallel loop; the hit positions are
 * combined with a sum reduction so that concurrent threads never update the
 * shared accumulator directly.
 *
 * @param input      Array sorted in ascending order.
 * @param input_size Inclusive upper bound of the search range, i.e. the index
 *                   of the last element to consider (not the element count).
 * @param querys     Keys to look up.
 * @param n_querys   Number of keys in querys.
 * @return UINT64_MAX plus the sum of the indices at which keys were found,
 *         computed with unsigned wrap-around. Keys that are not present
 *         contribute nothing.
 */
uint64_t binarySearch(DTYPE * input, uint64_t input_size, DTYPE* querys, unsigned n_querys)
{
    uint64_t found = UINT64_MAX;

    #pragma omp parallel for reduction(+:found)
    for (uint64_t q = 0; q < n_querys; q++)
    {
        const DTYPE key = querys[q];
        uint64_t l = 0;
        uint64_t r = input_size;

        while (l <= r)
        {
            const uint64_t m = l + (r - l) / 2;

            if (input[m] == key)
            {
                found += m;
                break;
            }

            if (input[m] < key)
            {
                /* Key is larger: discard the left half. */
                l = m + 1;
            }
            else
            {
                /* Key is smaller: discard the right half. At m == 0 there is
                 * nothing left, and m - 1 would wrap around to UINT64_MAX. */
                if (m == 0)
                {
                    break;
                }
                r = m - 1;
            }
        }
    }

    return found;
}
/**
 * @brief Main of the Host Application.
 *
 * Usage: <program> <vector size> <num searches>
 *
 * Generates a sorted vector and a set of queries, times the parallel binary
 * search over them and prints the elapsed time.
 *
 * @return 0 when the search produced a non-zero checksum, 1 on a usage,
 *         allocation or checksum error.
 */
int main(int argc, char **argv) {
    if (argc < 3) {
        fprintf(stderr, "Usage: %s <vector size> <num searches>\n", argc > 0 ? argv[0] : "bs_omp");
        return 1;
    }

    char *end_size = NULL;
    char *end_querys = NULL;
    uint64_t input_size = strtoull(argv[1], &end_size, 10);
    uint64_t n_querys = strtoull(argv[2], &end_querys, 10);
    if (end_size == argv[1] || *end_size != '\0' || end_querys == argv[2] || *end_querys != '\0') {
        fprintf(stderr, "[ERROR] both arguments must be unsigned decimal integers\n");
        return 1;
    }
    /* The query generator picks indices modulo (input_size - 2), and the
     * search is given input_size - 1 as its last index, so smaller vectors
     * cannot be handled. */
    if (input_size < 3) {
        fprintf(stderr, "[ERROR] vector size must be at least 3\n");
        return 1;
    }
    /* binarySearch() takes the query count as an unsigned int. */
    if (n_querys > (unsigned)-1) {
        fprintf(stderr, "[ERROR] too many searches requested\n");
        return 1;
    }

    printf("Vector size: %llu, num searches: %llu\n",
           (unsigned long long)input_size, (unsigned long long)n_querys);

    /* calloc rejects element counts whose byte size would overflow. */
    DTYPE * input = calloc(input_size, sizeof *input);
    DTYPE * querys = calloc(n_querys, sizeof *querys);
    if (input == NULL || (querys == NULL && n_querys != 0)) {
        fprintf(stderr, "[ERROR] out of memory\n");
        free(input);
        free(querys);
        return 1;
    }

    // Create an input file with arbitrary data.
    create_test_file(input, input_size, querys, n_querys);

    Timer timer;
    start(&timer, 0, 0);
    DTYPE result_host = binarySearch(input, input_size - 1, querys, (unsigned)n_querys);
    stop(&timer, 0);

    /* NOTE(review): this only checks that the checksum is non-zero; it is not
     * compared against a reference result. */
    int status = (result_host != 0);
    if (status) {
        printf("[OK] Execution time: ");
        print(&timer, 0, 1);
        printf("ms.\n");
    } else {
        printf("[ERROR]\n");
    }

    free(input);
    free(querys);
    return status ? 0 : 1;
}
================================================
FILE: golang/uPIMulator/benchmark/BS/baselines/cpu/timer.h
================================================
/*
* Copyright (c) 2016 University of Cordoba and University of Illinois
* All rights reserved.
*
* Developed by: IMPACT Research Group
* University of Cordoba and University of Illinois
* http://impact.crhc.illinois.edu/
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* with the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* > Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* > Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in the
* documentation and/or other materials provided with the distribution.
* > Neither the names of IMPACT Research Group, University of Cordoba,
* University of Illinois nor the names of its contributors may be used
* to endorse or promote products derived from this Software without
* specific prior written permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
* THE SOFTWARE.
*
*/
#include
/* Bookkeeping for four independent stopwatch slots, indexed 0..3. */
typedef struct Timer{
struct timeval startTime[4]; /* timestamp captured by start() for each slot */
struct timeval stopTime[4]; /* timestamp captured by stop() for each slot */
double time[4]; /* elapsed time accumulated per slot, in microseconds */
}Timer;
/*
 * Begin a timed interval for slot i.
 * When rep is 0 the slot's accumulated time is cleared first, so a series of
 * repetitions starts counting from zero.
 */
void start(Timer *timer, int i, int rep) {
    struct timeval *begin = &timer->startTime[i];

    if (rep == 0)
        timer->time[i] = 0.0;
    gettimeofday(begin, NULL);
}
/*
 * End the timed interval for slot i and add its duration, in microseconds,
 * to the slot's accumulated time.
 */
void stop(Timer *timer, int i) {
    gettimeofday(&timer->stopTime[i], NULL);

    const struct timeval *from = &timer->startTime[i];
    const struct timeval *to = &timer->stopTime[i];

    timer->time[i] += (to->tv_sec - from->tv_sec) * 1000000.0 +
                      (to->tv_usec - from->tv_usec);
}
/*
 * Print the accumulated time of slot i averaged over REP repetitions,
 * converted from microseconds to milliseconds, followed by a tab.
 */
void print(Timer *timer, int i, int REP) {
    const double average_ms = timer->time[i] / (1000 * REP);

    printf("%f\t", average_ms);
}
================================================
FILE: golang/uPIMulator/benchmark/BS/baselines/gpu/Makefile
================================================
all:
nvcc -arch=sm_30 -m64 -Xcompiler -fPIC -shared -o cu_binary_search.so binary_search.cu -std=c++11
================================================
FILE: golang/uPIMulator/benchmark/BS/baselines/gpu/README
================================================
Binary Search (BS)
Compilation instructions:
make
Execution instructions:
python3 run.py
================================================
FILE: golang/uPIMulator/benchmark/BS/baselines/gpu/binary_search.cu
================================================
#include
#include
#include "binary_search.h"
#include
#include
#define BLOCKDIM 512
#define SEARCH_CHUNK 16
#define BLOCK_CHUNK (BLOCKDIM*SEARCH_CHUNK)
/*
 * Each thread copies SEARCH_CHUNK consecutive elements of arr into the block's
 * shared array (BLOCK_CHUNK elements per block), then binary-searches its own
 * SEARCH_CHUNK-sized slice for each query in turn. A hit sets *flag and writes
 * the element's global index to *res; once *flag is set the search loops stop.
 */
__global__ void search_kernel(const long int *arr,
const long int len, const long int *querys, const long int num_querys, long int *res, bool *flag)
{
int search;
if(*flag == false) {
int tid = threadIdx.x;
/* NOTE(review): s_arr is int while arr is long int, so values are narrowed
when staged below -- confirm the inputs always fit in an int. */
__shared__ int s_arr[BLOCK_CHUNK];
/* Since each value is being copied to shared memory, the rest of the
following commented-out code is unnecessary, since a direct comparison
can be done at the time of copy below. */
// for(int i = 0; i < BLOCKDIM; ++i) {
// int shared_loc = i*SEARCH_CHUNK + tid;
// int global_loc = shared_loc + BLOCK_CHUNK * blockIdx.x;
// if(arr[global_loc] == search) {
// *flag = true;
// *res = global_loc;
// }
// __syncthreads();
// }
/* Copy the chunk of the array that this entire block of threads will read
from the slower global memory to the faster shared memory. */
for(long int i = 0; i < SEARCH_CHUNK; ++i) {
int shared_loc = tid*SEARCH_CHUNK + i;
int global_loc = shared_loc + BLOCK_CHUNK * blockIdx.x;
/* Make sure to stay within the bounds of the global array,
else assign a dummy value. */
if(global_loc < len) {
s_arr[shared_loc] = arr[global_loc];
}
else {
s_arr[shared_loc] = INT_MAX;
}
}
/* NOTE(review): this barrier sits inside the branch on *flag, which other
threads may write -- confirm every thread of a block reaches it. */
__syncthreads();
for(long int i = 0; i < num_querys; i++)
{
/* The query is narrowed from long int to int here. */
search = querys[i];
/* For each thread, set the initial search range. */
int L = 0;
int R = SEARCH_CHUNK - 1;
int m = (L + R) / 2;
/* Pointer to the part of the shared array owned by this thread. */
int *s_ptr = &s_arr[tid*SEARCH_CHUNK];
/* Each thread searches its own chunk of the block array.
Many chunks will not contain the query, so the search must
be allowed to fail on a per-thread basis. The loop ends
when L > R or once *flag has been set. */
while(L <= R && *flag == false)
{
if(s_ptr[m] < search) {
L = m + 1;
}
else if(s_ptr[m] > search) {
R = m - 1;
}
else {
*flag = true;
/* Convert the chunk-local index m into an index into arr. */
*res = m += tid*SEARCH_CHUNK + BLOCK_CHUNK * blockIdx.x;
}
m = (L + R) / 2;
}
}
}
}
/*
 * Host-side entry point: copies the sorted array and the queries to the
 * device, runs search_kernel over them, prints the kernel wall time in
 * milliseconds and returns the index the kernel reported.
 *
 * arr / len           sorted input array and its element count
 * querys / num_querys values to look up and their count
 *
 * Returns the value the kernel stored in its result slot, or -1 when no
 * thread reported a hit (the slot is pre-filled with 0xFF bytes). The value
 * is narrowed from long int to the int return type.
 */
int binary_search(const long int *arr, const long int len, const long int *querys, const long int num_querys)
{
    long int *d_arr, *d_querys, *d_res;
    bool *d_flag;

    size_t arr_size = len * sizeof(long int);
    size_t querys_size = num_querys * sizeof(long int);
    size_t res_size = sizeof(long int);
    size_t flag_size = sizeof(bool);

    cudaMalloc(&d_arr, arr_size);
    cudaMalloc(&d_querys, querys_size);
    cudaMalloc(&d_res, res_size);
    cudaMalloc(&d_flag, flag_size);

    cudaMemcpy(d_arr, arr, arr_size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_querys, querys, querys_size, cudaMemcpyHostToDevice);
    cudaMemset(d_flag, 0, flag_size);

    /* Set res value to -1, so that if the function returns -1, that
       indicates an algorithm failure. */
    cudaMemset(d_res, -0x1, res_size);

    /* One block per BLOCK_CHUNK elements, rounded up. */
    int blockSize = BLOCKDIM;
    int gridSize = (len-1)/BLOCK_CHUNK + 1;

    auto start = std::chrono::high_resolution_clock::now();
    search_kernel<<<gridSize, blockSize>>>(d_arr, len, d_querys, num_querys ,d_res, d_flag);
    cudaDeviceSynchronize();
    auto end = std::chrono::high_resolution_clock::now();

    std::cout << "Kernel Time: " <<
        std::chrono::duration_cast<std::chrono::milliseconds>(end-start).count() <<
        " ms" << std::endl;

    long int res = -1;
    cudaMemcpy(&res, d_res, res_size, cudaMemcpyDeviceToHost);

    /* Release the device buffers so repeated calls do not leak device memory. */
    cudaFree(d_arr);
    cudaFree(d_querys);
    cudaFree(d_res);
    cudaFree(d_flag);

    return res;
}
================================================
FILE: golang/uPIMulator/benchmark/BS/baselines/gpu/binary_search.h
================================================
/* Public interface of the GPU binary-search shared library. */
#ifndef BINARY_SEARCH_H
#define BINARY_SEARCH_H
/* DLL_EXPORT marks a function for export from a Windows DLL and expands to
   nothing on other platforms. */
#ifdef _WIN32
#include
#define DLL_EXPORT __declspec(dllexport)
#else
#define DLL_EXPORT
#endif
/* C linkage keeps the exported symbol name unmangled. */
extern "C" {
/* Searches the len-element array arr for the num_querys values in querys and
   returns an int result. */
int DLL_EXPORT binary_search(const long int *arr, const long int len, const long int *querys, const long int num_querys);
}
#endif /* BINARY_SEARCH_H */
================================================
FILE: golang/uPIMulator/benchmark/BS/baselines/gpu/cpu_lib.py
================================================
# -*- coding: utf-8 -*-
def binary_search(arr, search):
    """Return the index of ``search`` in the ascending sequence ``arr``.

    Returns -2 when ``search`` is not present (including when ``arr`` is
    empty).
    """
    L = 0
    # R is an inclusive upper bound, so it must start on the last valid index.
    R = len(arr) - 1
    while L <= R:
        # Floor division keeps the midpoint an int so it can be used as an index.
        m = (L + R) // 2
        if arr[m] < search:
            L = m + 1
        elif arr[m] > search:
            R = m - 1
        else:
            return m
    return -2  # Error code 2
================================================
FILE: golang/uPIMulator/benchmark/BS/baselines/gpu/cu_lib_import.py
================================================
# -*- coding: utf-8 -*-
__all__ = [
"binary_search",
]
import os.path as path
import platform
from ctypes import *
from numpy.ctypeslib import load_library, ndpointer
## Load the DLL
# The shared library is expected to sit next to this module.
_lib_dir = path.dirname(path.realpath(__file__))
if platform.system() == "Linux":
    cuda_lib = load_library("cu_binary_search.so", _lib_dir)
elif platform.system() == "Windows":
    cuda_lib = load_library("cu_binary_search.dll", _lib_dir)
else:
    # Fail with a clear message instead of a NameError on cuda_lib below.
    raise OSError("Unsupported platform: %s" % platform.system())
## Define argtypes for all functions to import
# The C prototype declares both lengths as `long int`, so they are passed as
# c_long rather than c_int.
argtype_defs = {
    "binary_search": [ndpointer("i8"), c_long, ndpointer("i8"), c_long],
}
## Import functions from DLL
for func, argtypes in argtype_defs.items():
    locals().update({func: cuda_lib[func]})
    locals()[func].argtypes = argtypes
================================================
FILE: golang/uPIMulator/benchmark/BS/baselines/gpu/run.py
================================================
# -*- coding: utf-8 -*-
import time
import numpy as np
# Local Imports
from cu_lib_import import binary_search as gpu_search
# Length of the array to search and number of lookups to issue
arr_len = 2048576
num_querys = 16777216
# Dummy sorted array 0, 1, ..., arr_len - 1 as 64-bit integers
arr = np.arange(0, arr_len, 1).astype("i8")
# Random search querys; the dtype is pinned to int64 because the wrapper's
# ndpointer("i8") argument check rejects any other integer width
querys = np.random.randint(1, arr_len, num_querys, dtype="i8")
# GPU search function call
t0 = time.time()
res_gpu = gpu_search(arr, len(arr), querys, len(querys))
print("Total GPU Time: %i ms" % ((time.time() - t0) * 1e003))
================================================
FILE: golang/uPIMulator/benchmark/BS/dpu/CMakeLists.txt
================================================
set(CMAKE_C_COMPILER "/root/upmem-2023.2.0-Linux-x86_64/bin/dpu-upmem-dpurte-clang")
set(CMAKE_C_FLAGS "-w -I/root/uPIMulator/benchmark/BS/support -O2 -S -DNR_TASKLETS=${NR_TASKLETS}")
file(GLOB_RECURSE SRCS *.c)
add_executable(BS_device ${SRCS})
================================================
FILE: golang/uPIMulator/benchmark/BS/dpu/task.c
================================================
/*
* Binary Search with multiple tasklets
*
*/
#include
#include
#include
#include
#include
#include
#include
#include
#include "common.h"
__host dpu_arguments_t DPU_INPUT_ARGUMENTS;
__host dpu_results_t DPU_RESULTS[NR_TASKLETS];
// Search
/*
 * Linear scan of one cached block for searching_for.
 *
 * Returns the position of the first matching element inside the block,
 * -1 when the block's first element is <= searching_for but no element
 * matches, and -2 when the block's first element is already greater than
 * searching_for.
 */
DTYPE __attribute__ ((noinline)) search(DTYPE *bufferA, DTYPE searching_for) {
    /* Block starts above the target: nothing in it can match. */
    if (bufferA[0] > searching_for) {
        return -2;
    }

    for (uint32_t pos = 0; pos < BLOCK_SIZE / sizeof(DTYPE); pos++) {
        if (bufferA[pos] == searching_for) {
            return pos;
        }
    }
    return -1;
}
BARRIER_INIT(my_barrier, NR_TASKLETS);
extern int main_kernel1(void);
int(*kernels[nr_kernels])(void) = {main_kernel1};
/* Entry point: dispatch to the kernel selected by the host-provided arguments. */
int main(void){
    int (*selected)(void) = kernels[DPU_INPUT_ARGUMENTS.kernel];

    return selected();
}
// main_kernel1
/*
 * Per-tasklet kernel. Each tasklet handles slice_per_dpu / NR_TASKLETS queries,
 * stored in MRAM right after the input array, and for each one bisects the
 * input array by address, loading BLOCK_SIZE bytes at the probe address and
 * scanning them with search(). A hit is stored in this tasklet's DPU_RESULTS
 * entry; on a miss the entry keeps its previous value. Always returns 0.
 */
int main_kernel1() {
unsigned int tasklet_id = me();
#if PRINT
printf("tasklet_id = %u\n", tasklet_id);
#endif
if(tasklet_id == 0){
mem_reset(); // Reset the heap
}
// Barrier
barrier_wait(&my_barrier);
DTYPE searching_for, found;
uint64_t input_size = DPU_INPUT_ARGUMENTS.input_size;
// Address of the current processing block in MRAM
uint32_t start_mram_block_addr_A = (uint32_t) DPU_MRAM_HEAP_POINTER;
uint32_t start_mram_block_addr_aux = start_mram_block_addr_A;
uint32_t end_mram_block_addr_A = start_mram_block_addr_A + sizeof(DTYPE) * input_size;
// Queries follow the input array; each tasklet starts at its own slice of them
uint32_t current_mram_block_addr_query = end_mram_block_addr_A + tasklet_id * (DPU_INPUT_ARGUMENTS.slice_per_dpu / NR_TASKLETS) * sizeof(DTYPE);
// Initialize a local cache to store the MRAM block
DTYPE *cache_A = (DTYPE *) mem_alloc(BLOCK_SIZE);
// NOTE(review): cache_aux_A and cache_aux_B are filled below but never inspected in this function
DTYPE *cache_aux_A = (DTYPE *) mem_alloc(BLOCK_SIZE);
DTYPE *cache_aux_B = (DTYPE *) mem_alloc(BLOCK_SIZE);
dpu_results_t *result = &DPU_RESULTS[tasklet_id];
// TODO(bongjoon.hyun@gmail.com): original PrIM benchmark uses uint64_t for targets' type
for(uint32_t targets = 0; targets < (DPU_INPUT_ARGUMENTS.slice_per_dpu / NR_TASKLETS); targets++)
{
found = -1;
// Fetch the next query (8 bytes) and advance to the following one
mram_read((__mram_ptr void const *) current_mram_block_addr_query, &searching_for, 8);
current_mram_block_addr_query += 8;
bool end = false;
// Initialize input vector boundaries
start_mram_block_addr_A = (uint32_t) DPU_MRAM_HEAP_POINTER;
start_mram_block_addr_aux = start_mram_block_addr_A;
end_mram_block_addr_A = start_mram_block_addr_A + sizeof(DTYPE) * input_size;
uint32_t current_mram_block_addr_A = start_mram_block_addr_A;
// Bring first and last values to WRAM
mram_read((__mram_ptr void const *) current_mram_block_addr_A, cache_aux_A, BLOCK_SIZE);
// NOTE(review): the offset is BLOCK_SIZE * sizeof(DTYPE) bytes before the end while only BLOCK_SIZE bytes are read -- confirm this is intended
mram_read((__mram_ptr void const *) (end_mram_block_addr_A - BLOCK_SIZE * sizeof(DTYPE)), cache_aux_B, BLOCK_SIZE);
// First probe: midpoint of the whole address range
current_mram_block_addr_A = (start_mram_block_addr_A + end_mram_block_addr_A) / 2;
while(!end)
{
// Load cache with current MRAM block
mram_read((__mram_ptr void const *) current_mram_block_addr_A, cache_A, BLOCK_SIZE);
// Search inside block
found = search(cache_A, searching_for);
// If found > -1, we found the searching_for query
if(found > -1)
{
// Convert the in-block position to an element index relative to the array start
result->found = found + (current_mram_block_addr_A - start_mram_block_addr_aux) / sizeof(DTYPE);
break;
}
// If found == -2, we need to discard right part of the input vector
if(found == -2)
{
end_mram_block_addr_A = current_mram_block_addr_A;
current_mram_block_addr_A = (current_mram_block_addr_A + start_mram_block_addr_A) / 2;
}
// If found == -1, we need to discard left part of the input vector
else if (found == -1)
{
start_mram_block_addr_A = current_mram_block_addr_A;
current_mram_block_addr_A = (current_mram_block_addr_A + end_mram_block_addr_A) / 2;
}
// Start boundary check: the probe fell inside the first block of the array, so do one last scan and stop
if(current_mram_block_addr_A < (start_mram_block_addr_aux + BLOCK_SIZE))
{
end = true;
mram_read((__mram_ptr void const *) current_mram_block_addr_A, cache_A, BLOCK_SIZE);
found = search(cache_A, searching_for);
if(found > -1)
{
end = true;
result->found = found + (current_mram_block_addr_A - start_mram_block_addr_aux) / sizeof(DTYPE);
}
}
// End boundary check: the probe is within BLOCK_SIZE bytes of the current upper bound, so scan the last block and stop
if(current_mram_block_addr_A > (end_mram_block_addr_A - BLOCK_SIZE))
{
end = true;
// NOTE(review): the cast applies before the subtraction, so BLOCK_SIZE is subtracted from a void pointer -- confirm the compiler treats this as a byte offset
mram_read((__mram_ptr void const *) end_mram_block_addr_A - BLOCK_SIZE, cache_A, BLOCK_SIZE);
found = search(cache_A, searching_for);
if(found > -1)
{
// NOTE(review): the index is derived from current_mram_block_addr_A although the block was read from end_mram_block_addr_A - BLOCK_SIZE -- confirm
result->found = found + (current_mram_block_addr_A - start_mram_block_addr_aux) / sizeof(DTYPE);
}
}
}
}
return 0;
}
================================================
FILE: golang/uPIMulator/benchmark/BS/host/app.c
================================================
/**
* app.c
* BS Host Application Source File
*
*/
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#if ENERGY
#include
#endif
#include "params.h"
#include "timer.h"
// Define the DPU Binary path as DPU_BINARY here
#define DPU_BINARY "./bin/bs_dpu"
// Create input arrays
/*
 * Fill the benchmark inputs: input becomes the ascending run 1, 2, 3, ...
 * over nr_elements entries, and querys[k] is set to k for each of the
 * nr_querys entries. input[0] is written unconditionally.
 */
void create_test_file(DTYPE * input, DTYPE * querys, uint64_t nr_elements, uint64_t nr_querys) {
    uint64_t k;

    input[0] = 1;
    k = 1;
    while (k < nr_elements) {
        input[k] = input[k - 1] + 1;
        k++;
    }

    k = 0;
    while (k < nr_querys) {
        querys[k] = k;
        k++;
    }
}
// Compute output in the host
/*
 * Host reference: binary-search every query in the sorted array input, whose
 * last valid index is input_size (inclusive upper bound).
 *
 * Returns the index recorded by the most recent successful lookup, or -1 when
 * no query matched.
 */
int64_t binarySearch(DTYPE * input, DTYPE * querys, DTYPE input_size, uint64_t num_querys)
{
    uint64_t last_hit = -1;

    for (uint64_t q = 0; q < num_querys; q++) {
        const DTYPE key = querys[q];
        DTYPE lo = 0;
        DTYPE hi = input_size;

        while (lo <= hi) {
            const DTYPE mid = lo + (hi - lo) / 2;

            /* Key is above the probe: drop the left half. */
            if (input[mid] < key) {
                lo = mid + 1;
                continue;
            }
            /* Remember a match, then keep narrowing from the right. */
            if (input[mid] == key) {
                last_hit = mid;
            }
            hi = mid - 1;
        }
    }
    return last_hit;
}
// Main of the Host Application
/*
 * Host driver for the BS benchmark: allocates the DPUs, builds the input and
 * query arrays, computes a host reference result, then for each warmup/timed
 * repetition pushes the data to the DPUs, launches the kernel and reads back
 * the per-tasklet results. Prints the timings and returns 0 when the DPU and
 * host results match, 1 otherwise.
 */
// Main of the Host Application
int main(int argc, char **argv) {
struct Params p = input_params(argc, argv);
struct dpu_set_t dpu_set, dpu;
uint32_t nr_of_dpus;
uint64_t input_size = INPUT_SIZE;
uint64_t num_querys = p.num_querys;
DTYPE result_host = -1;
DTYPE result_dpu = -1;
// Create the timer
Timer timer;
// Allocate DPUs and load binary
DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
#if ENERGY
struct dpu_probe_t probe;
DPU_ASSERT(dpu_probe_init("energy_probe", &probe));
#endif
// Query number adjustment for proper partitioning: round up to a multiple of nr_of_dpus * NR_TASKLETS
if(num_querys % (nr_of_dpus * NR_TASKLETS))
num_querys = num_querys + (nr_of_dpus * NR_TASKLETS - num_querys % (nr_of_dpus * NR_TASKLETS));
assert(num_querys % (nr_of_dpus * NR_TASKLETS) == 0 && "Input dimension"); // Allocate input and querys vectors
DTYPE * input = malloc((input_size) * sizeof(DTYPE));
DTYPE * querys = malloc((num_querys) * sizeof(DTYPE));
// Create an input file with arbitrary data
create_test_file(input, querys, input_size, num_querys);
// Compute host solution (timer slot 0); input_size - 1 is the inclusive upper index
start(&timer, 0, 0);
result_host = binarySearch(input, querys, input_size - 1, num_querys);
stop(&timer, 0);
// Create kernel arguments
uint64_t slice_per_dpu = num_querys / nr_of_dpus;
dpu_arguments_t input_arguments = {input_size, slice_per_dpu, 0};
// Warmup repetitions run first and are not timed
for (unsigned int rep = 0; rep < p.n_warmup + p.n_reps; rep++) {
// Perform input transfers (timer slot 1)
uint64_t i = 0;
if (rep >= p.n_warmup)
start(&timer, 1, rep - p.n_warmup);
DPU_FOREACH(dpu_set, dpu, i)
{
DPU_ASSERT(dpu_prepare_xfer(dpu, &input_arguments));
}
DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, "DPU_INPUT_ARGUMENTS", 0, sizeof(input_arguments), DPU_XFER_DEFAULT));
i = 0;
// Every DPU receives the full input array at the start of its MRAM heap
DPU_FOREACH(dpu_set, dpu, i)
{
DPU_ASSERT(dpu_prepare_xfer(dpu, input));
}
DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, 0, input_size * sizeof(DTYPE), DPU_XFER_DEFAULT));
i = 0;
// Each DPU receives its own slice of the queries, placed right after the input array
DPU_FOREACH(dpu_set, dpu, i)
{
DPU_ASSERT(dpu_prepare_xfer(dpu, querys + slice_per_dpu * i));
}
DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, input_size * sizeof(DTYPE), slice_per_dpu * sizeof(DTYPE), DPU_XFER_DEFAULT));
if (rep >= p.n_warmup)
stop(&timer, 1);
// Run kernel on DPUs (timer slot 2)
if (rep >= p.n_warmup)
{
start(&timer, 2, rep - p.n_warmup);
#if ENERGY
DPU_ASSERT(dpu_probe_start(&probe));
#endif
}
DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS));
if (rep >= p.n_warmup)
{
stop(&timer, 2);
#if ENERGY
DPU_ASSERT(dpu_probe_stop(&probe));
#endif
}
// Print logs if required
#if PRINT
unsigned int each_dpu = 0;
printf("Display DPU Logs\n");
DPU_FOREACH(dpu_set, dpu)
{
printf("DPU#%d:\n", each_dpu);
DPU_ASSERT(dpulog_read_for_dpu(dpu.dpu, stdout));
each_dpu++;
}
#endif
// Retrieve results (timer slot 3)
if (rep >= p.n_warmup)
start(&timer, 3, rep - p.n_warmup);
dpu_results_t* results_retrieve[nr_of_dpus];
i = 0;
DPU_FOREACH(dpu_set, dpu, i)
{
results_retrieve[i] = (dpu_results_t*)malloc(NR_TASKLETS * sizeof(dpu_results_t));
DPU_ASSERT(dpu_prepare_xfer(dpu, results_retrieve[i]));
}
DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_FROM_DPU, "DPU_RESULTS", 0, NR_TASKLETS * sizeof(dpu_results_t), DPU_XFER_DEFAULT));
// NOTE(review): i is not reset to 0 before this loop, unlike the loops above -- confirm DPU_FOREACH initializes it
DPU_FOREACH(dpu_set, dpu, i)
{
// Keep the largest index reported by any tasklet; result_dpu carries over between repetitions
for(unsigned int each_tasklet = 0; each_tasklet < NR_TASKLETS; each_tasklet++)
{
if(results_retrieve[i][each_tasklet].found > result_dpu)
{
result_dpu = results_retrieve[i][each_tasklet].found;
}
}
free(results_retrieve[i]);
}
if(rep >= p.n_warmup)
stop(&timer, 3);
}
// Print timing results
printf("CPU Version Time (ms): ");
print(&timer, 0, p.n_reps);
printf("CPU-DPU Time (ms): ");
print(&timer, 1, p.n_reps);
printf("DPU Kernel Time (ms): ");
print(&timer, 2, p.n_reps);
printf("DPU-CPU Time (ms): ");
print(&timer, 3, p.n_reps);
#if ENERGY
double energy;
DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_AVERAGE, &energy));
// NOTE(review): num_iterations is not declared anywhere in this function -- confirm it exists when ENERGY is enabled
printf("DPU Energy (J): %f\t", energy * num_iterations);
#endif
int status = (result_dpu == result_host);
if (status) {
printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET "] results are equal\n");
} else {
printf("[" ANSI_COLOR_RED "ERROR" ANSI_COLOR_RESET "] results differ!\n");
}
// NOTE(review): querys is never freed before returning
free(input);
DPU_ASSERT(dpu_free(dpu_set));
return status ? 0 : 1;
}
================================================
FILE: golang/uPIMulator/benchmark/BS/support/common.h
================================================
#ifndef _COMMON_H_
#define _COMMON_H_
#ifdef TL
#define TASKLETS_INITIALIZER TASKLETS(TL, main, 2048, 2)
#define NB_OF_TASKLETS_PER_DPU TL
#else
#define TASKLETS_INITIALIZER TASKLETS(16, main, 2048, 2)
#define NB_OF_TASKLETS_PER_DPU 16
#endif
// Transfer size between MRAM and WRAM
#ifdef BL
#define BLOCK_SIZE_LOG2 BL
#define BLOCK_SIZE (1 << BLOCK_SIZE_LOG2)
#else
#define BLOCK_SIZE_LOG2 8
#define BLOCK_SIZE (1 << BLOCK_SIZE_LOG2)
#endif
// Data type
#define DTYPE int64_t
// Vector size
#define INPUT_SIZE 2048576
/* Arguments the host copies to each DPU before launching the kernel. */
typedef struct {
uint64_t input_size; /* number of elements in the input array */
uint64_t slice_per_dpu; /* number of queries assigned to one DPU */
/* Selects which kernel the DPU-side main() dispatches to. */
enum kernels {
kernel1 = 0,
nr_kernels = 1, /* number of kernels; also sizes the DPU dispatch table */
} kernel;
} dpu_arguments_t;
// Structures used by both the host and the dpu to communicate information
/* Per-tasklet result slot that the host reads back after a launch. */
typedef struct {
DTYPE found; /* element index of a matched query, as written by the DPU kernel */
} dpu_results_t;
#ifndef ENERGY
#define ENERGY 0
#endif
#define PRINT 0
#define ANSI_COLOR_RED "\x1b[31m"
#define ANSI_COLOR_GREEN "\x1b[32m"
#define ANSI_COLOR_RESET "\x1b[0m"
#endif
================================================
FILE: golang/uPIMulator/benchmark/BS/support/params.h
================================================
#ifndef _PARAMS_H_
#define _PARAMS_H_
#include "common.h"
/* Command-line settings of the host application. */
typedef struct Params {
long num_querys; /* number of queries to run (option -i) */
unsigned n_warmup; /* untimed warmup iterations (option -w) */
unsigned n_reps; /* timed iterations (option -e) */
}Params;
/* Write the command-line help text to stderr. */
void usage() {
    static const char help_text[] =
        "\nUsage:  ./program [options]"
        "\n"
        "\nGeneral options:"
        "\n    -h        help"
        "\n    -w <W>    # of untimed warmup iterations (default=1)"
        "\n    -e <E>    # of timed repetition iterations (default=3)"
        "\n"
        "\nBenchmark-specific options:"
        "\n    -i <I>    problem size (default=2 queries)"
        "\n";

    fprintf(stderr, "%s", help_text);
}
/*
 * Parse the command line into a Params value.
 *
 * Defaults: num_querys = PROBLEM_SIZE, n_warmup = 1, n_reps = 3.
 * Options:  -h prints the usage text and exits; -i, -w and -e each take a
 *           numeric argument. Any other option prints an error plus the usage
 *           text and exits.
 */
struct Params input_params(int argc, char **argv) {
    struct Params p;
    p.num_querys = PROBLEM_SIZE;
    p.n_warmup = 1;
    p.n_reps = 3;

    int opt;
    /* 'h' takes no argument (no trailing colon), so a bare -h reaches the
       help case instead of being rejected for a missing argument. */
    while((opt = getopt(argc, argv, "hi:w:e:")) >= 0) {
        switch(opt) {
        case 'h':
            usage();
            exit(0);
            break;
        case 'i': p.num_querys = atol(optarg); break;
        case 'w': p.n_warmup = atoi(optarg); break;
        case 'e': p.n_reps = atoi(optarg); break;
        default:
            fprintf(stderr, "\nUnrecognized option!\n");
            usage();
            exit(0);
        }
    }
    assert(NR_DPUS > 0 && "Invalid # of dpus!");

    return p;
}
#endif
================================================
FILE: golang/uPIMulator/benchmark/BS/support/timer.h
================================================
/*
* Copyright (c) 2016 University of Cordoba and University of Illinois
* All rights reserved.
*
* Developed by: IMPACT Research Group
* University of Cordoba and University of Illinois
* http://impact.crhc.illinois.edu/
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* with the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* > Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* > Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in the
* documentation and/or other materials provided with the distribution.
* > Neither the names of IMPACT Research Group, University of Cordoba,
* University of Illinois nor the names of its contributors may be used
* to endorse or promote products derived from this Software without
* specific prior written permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
* THE SOFTWARE.
*
*/
#include
/* Bookkeeping for four independent stopwatch slots, indexed 0..3. */
typedef struct Timer{
struct timeval startTime[4]; /* timestamp captured by start() for each slot */
struct timeval stopTime[4]; /* timestamp captured by stop() for each slot */
double time[4]; /* elapsed time accumulated per slot, in microseconds */
}Timer;
/*
 * Begin a timed interval for slot i.
 * When rep is 0 the slot's accumulated time is cleared first, so a series of
 * repetitions starts counting from zero.
 */
void start(Timer *timer, int i, int rep) {
    struct timeval *begin = &timer->startTime[i];

    if (rep == 0)
        timer->time[i] = 0.0;
    gettimeofday(begin, NULL);
}
/*
 * End the timed interval for slot i and add its duration, in microseconds,
 * to the slot's accumulated time.
 */
void stop(Timer *timer, int i) {
    gettimeofday(&timer->stopTime[i], NULL);

    const struct timeval *from = &timer->startTime[i];
    const struct timeval *to = &timer->stopTime[i];

    timer->time[i] += (to->tv_sec - from->tv_sec) * 1000000.0 +
                      (to->tv_usec - from->tv_usec);
}
/*
 * Print the accumulated time of slot i averaged over REP repetitions,
 * converted from microseconds to milliseconds, followed by a tab.
 */
void print(Timer *timer, int i, int REP) {
    const double average_ms = timer->time[i] / (1000 * REP);

    printf("%f\t", average_ms);
}
================================================
FILE: golang/uPIMulator/benchmark/CMakeLists.txt
================================================
cmake_minimum_required(VERSION 3.16)
project(benchmark)
add_subdirectory(BS)
add_subdirectory(GEMV)
add_subdirectory(HST-L)
add_subdirectory(HST-S)
add_subdirectory(MLP)
add_subdirectory(RED)
add_subdirectory(SCAN-RSS)
add_subdirectory(SCAN-SSA)
add_subdirectory(SEL)
add_subdirectory(TRNS)
add_subdirectory(TS)
add_subdirectory(UNI)
add_subdirectory(VA)
================================================
FILE: golang/uPIMulator/benchmark/GEMV/CMakeLists.txt
================================================
#add_subdirectory(host)
add_subdirectory(dpu)
================================================
FILE: golang/uPIMulator/benchmark/GEMV/Makefile
================================================
DPU_DIR := dpu
HOST_DIR := host
BUILDDIR ?= bin
NR_TASKLETS ?= 16
BL ?= 10
NR_DPUS ?= 1
define conf_filename
${BUILDDIR}/.NR_DPUS_$(1)_NR_TASKLETS_$(2)_BL_$(3).conf
endef
CONF := $(call conf_filename,${NR_DPUS},${NR_TASKLETS},${BL})
HOST_TARGET := ${BUILDDIR}/host_code
DPU_TARGET := ${BUILDDIR}/dpu_code
COMMON_INCLUDES := support
HOST_SOURCES := $(wildcard ${HOST_DIR}/*.c)
DPU_SOURCES := $(wildcard ${DPU_DIR}/*.c)
.PHONY: all clean test
__dirs := $(shell mkdir -p ${BUILDDIR})
COMMON_FLAGS := -w -I${COMMON_INCLUDES}
HOST_FLAGS := ${COMMON_FLAGS} -std=c11 -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DBL=${BL}
DPU_FLAGS := ${COMMON_FLAGS} -O2 -DNR_TASKLETS=${NR_TASKLETS} -DBL=${BL}
all: ${HOST_TARGET} ${DPU_TARGET}
# Marker file recording the active NR_DPUS / NR_TASKLETS / BL combination.
# conf_filename takes three arguments, so all three are wildcarded here to
# remove markers left by any previous configuration.
${CONF}:
	$(RM) $(call conf_filename,*,*,*)
	touch ${CONF}
${HOST_TARGET}: ${HOST_SOURCES} ${COMMON_INCLUDES} ${CONF}
$(CC) -o $@ ${HOST_SOURCES} ${HOST_FLAGS}
$(CC) -S -o ${HOST_TARGET}.S ${HOST_SOURCES} ${HOST_FLAGS}
${DPU_TARGET}: ${DPU_SOURCES} ${COMMON_INCLUDES} ${CONF}
dpu-upmem-dpurte-clang ${DPU_FLAGS} -o $@ ${DPU_SOURCES}
dpu-upmem-dpurte-clang -S ${DPU_FLAGS} -o ${DPU_TARGET}.S ${DPU_SOURCES}
clean:
$(RM) -r $(BUILDDIR)
test: all
./${HOST_TARGET} -m 1024 -n 1024
================================================
FILE: golang/uPIMulator/benchmark/GEMV/_BL_10.conf
================================================
================================================
FILE: golang/uPIMulator/benchmark/GEMV/_NR_TASKLETS_10_BL_10.conf
================================================
================================================
FILE: golang/uPIMulator/benchmark/GEMV/_NR_TASKLETS_11_BL_10.conf
================================================
================================================
FILE: golang/uPIMulator/benchmark/GEMV/_NR_TASKLETS_12_BL_10.conf
================================================
================================================
FILE: golang/uPIMulator/benchmark/GEMV/_NR_TASKLETS_13_BL_10.conf
================================================
================================================
FILE: golang/uPIMulator/benchmark/GEMV/_NR_TASKLETS_14_BL_10.conf
================================================
================================================
FILE: golang/uPIMulator/benchmark/GEMV/_NR_TASKLETS_15_BL_10.conf
================================================
================================================
FILE: golang/uPIMulator/benchmark/GEMV/_NR_TASKLETS_16
================================================
================================================
FILE: golang/uPIMulator/benchmark/GEMV/_NR_TASKLETS_16_BL_10.conf
================================================
================================================
FILE: golang/uPIMulator/benchmark/GEMV/_NR_TASKLETS_17_BL_10.conf
================================================
================================================
FILE: golang/uPIMulator/benchmark/GEMV/_NR_TASKLETS_18_BL_10.conf
================================================
================================================
FILE: golang/uPIMulator/benchmark/GEMV/_NR_TASKLETS_19_BL_10.conf
================================================
================================================
FILE: golang/uPIMulator/benchmark/GEMV/_NR_TASKLETS_1_BL_10.conf
================================================
================================================
FILE: golang/uPIMulator/benchmark/GEMV/_NR_TASKLETS_20_BL_10.conf
================================================
================================================
FILE: golang/uPIMulator/benchmark/GEMV/_NR_TASKLETS_21_BL_10.conf
================================================
================================================
FILE: golang/uPIMulator/benchmark/GEMV/_NR_TASKLETS_22_BL_10.conf
================================================
================================================
FILE: golang/uPIMulator/benchmark/GEMV/_NR_TASKLETS_23_BL_10.conf
================================================
================================================
FILE: golang/uPIMulator/benchmark/GEMV/_NR_TASKLETS_24_BL_10.conf
================================================
================================================
FILE: golang/uPIMulator/benchmark/GEMV/_NR_TASKLETS_2_BL_10.conf
================================================
================================================
FILE: golang/uPIMulator/benchmark/GEMV/_NR_TASKLETS_3_BL_10.conf
================================================
================================================
FILE: golang/uPIMulator/benchmark/GEMV/_NR_TASKLETS_4_BL_10.conf
================================================
================================================
FILE: golang/uPIMulator/benchmark/GEMV/_NR_TASKLETS_5_BL_10.conf
================================================
================================================
FILE: golang/uPIMulator/benchmark/GEMV/_NR_TASKLETS_6_BL_10.conf
================================================
================================================
FILE: golang/uPIMulator/benchmark/GEMV/_NR_TASKLETS_7_BL_10.conf
================================================
================================================
FILE: golang/uPIMulator/benchmark/GEMV/_NR_TASKLETS_8_BL_10.conf
================================================
================================================
FILE: golang/uPIMulator/benchmark/GEMV/_NR_TASKLETS_9_BL_10.conf
================================================
================================================
FILE: golang/uPIMulator/benchmark/GEMV/baselines/cpu/Makefile
================================================
all:
gcc -o gemv -fopenmp gemv_openmp.c
clean:
rm gemv
================================================
FILE: golang/uPIMulator/benchmark/GEMV/baselines/cpu/README
================================================
Matrix-Vector Multiplication (GEMV)
Compilation instructions:
make
Execution instructions:
./gemv
================================================
FILE: golang/uPIMulator/benchmark/GEMV/baselines/cpu/gemv_openmp.c
================================================
#include
#include
#include "../../support/timer.h"
#include "gemv_utils.h"
/*
 * CPU baseline: builds a 20480 x 8192 Hilbert matrix A and the vector
 * x = (1, 2, ..., cols), times one call to gemv() computing b = A * x, and
 * prints the kernel time followed by the sums of x and b.
 */
int main(int argc, char *argv[])
{
    const size_t rows = 20480;
    const size_t cols = 8192;

    double **A;
    double *b = (double*) malloc(sizeof(double)*rows);
    double *x = (double*) malloc(sizeof(double)*cols);

    allocate_dense(rows, cols, &A);
    make_hilbert_mat(rows,cols, &A);

    /* Initialise the input vector and zero the output vector. */
    #pragma omp parallel
    {
        #pragma omp for
        for (size_t c = 0; c < cols; c++) {
            x[c] = (double) c + 1;
        }
        #pragma omp for
        for (size_t r = 0; r < rows; r++) {
            b[r] = 0.0;
        }
    }

    Timer timer;
    start(&timer, 0, 0);
    gemv(A, x, rows, cols, &b);
    stop(&timer, 0);

    printf("Kernel ");
    print(&timer, 0, 1);
    printf("\n");

#if 0
    print_vec(x, rows);
    print_mat(A, rows, cols);
    print_vec(b, rows);
#endif

    const double total_x = sum_vec(x, cols);
    const double total_b = sum_vec(b, rows);
    printf("sum(x) = %f, sum(Ax) = %f\n", total_x, total_b);
    return 0;
}
/*
 * Matrix-vector product accumulated into *b: for every row r,
 * (*b)[r] += sum over c of A[r][c] * x[c]. The caller supplies the initial
 * contents of *b.
 */
void gemv(double** A, double* x, size_t rows, size_t cols, double** b) {
    #pragma omp parallel for
    for (size_t r = 0; r < rows; r++) {
        size_t c = 0;
        while (c < cols) {
            (*b)[r] += A[r][c] * x[c];
            c++;
        }
    }
}
/*
 * Fill the rows x cols matrix *A with Hilbert-matrix entries:
 * (*A)[r][c] = 1 / (r + c + 1).
 */
void make_hilbert_mat(size_t rows, size_t cols, double*** A) {
    #pragma omp parallel for
    for (size_t r = 0; r < rows; r++) {
        size_t c = 0;
        while (c < cols) {
            const double denom = (double) r + (double) c + 1.0;
            (*A)[r][c] = 1.0 / denom;
            c++;
        }
    }
}
/*
 * Return the sum of the first `rows` elements of vec (0.0 when rows is 0).
 * The index is a size_t so it matches the type of the bound and cannot
 * overflow for vectors longer than INT_MAX elements.
 */
double sum_vec(double* vec, size_t rows) {
    double sum = 0.0;
    #pragma omp parallel for reduction(+:sum)
    for (size_t i = 0; i < rows; i++) sum = sum + vec[i];
    return sum;
}
================================================
FILE: golang/uPIMulator/benchmark/GEMV/baselines/cpu/gemv_utils.h
================================================
/*
 * Allocate a rows x cols matrix of doubles as one contiguous data block plus
 * a table of row pointers into it, and store the table in *dense.
 *
 * The data block is (*dense)[0]; release the matrix with free((*dense)[0])
 * followed by free(*dense). On allocation failure, or when rows is 0, *dense
 * is set to NULL and nothing is left allocated.
 */
void allocate_dense(size_t rows,size_t cols, double*** dense) {
    *dense = NULL;
    if (rows == 0) {
        return;
    }

    /* The table holds pointers, so size it by the pointer type. */
    double **row_ptrs = malloc(sizeof *row_ptrs * rows);
    if (row_ptrs == NULL) {
        return;
    }

    /* Request at least one element so a zero-column matrix still gets a
       valid base pointer instead of an implementation-defined malloc(0). */
    size_t count = rows * cols;
    double *cells = malloc(sizeof *cells * (count ? count : 1));
    if (cells == NULL) {
        free(row_ptrs);
        return;
    }

    for (size_t i = 0; i < rows; i++) {
        row_ptrs[i] = cells + i*cols;
    }
    *dense = row_ptrs;
}
/* Print the rows x cols matrix A, one row per line, entries separated by spaces. */
void print_mat(double** A, size_t rows, size_t cols) {
    size_t r = 0;
    while (r < rows) {
        size_t c = 0;
        while (c < cols) {
            printf("%f ", A[r][c]);
            c++;
        }
        printf("\n");
        r++;
    }
}
/* Print the first `rows` entries of b, one per line. */
void print_vec(double* b, size_t rows) {
    size_t k = 0;
    while (k != rows) {
        printf("%f\n", b[k]);
        ++k;
    }
}
void gemv(double** A, double* x, size_t rows, size_t cols, double** b);
void make_hilbert_mat(size_t rows, size_t cols, double*** A);
double sum_vec(double* vec, size_t rows);
================================================
FILE: golang/uPIMulator/benchmark/GEMV/baselines/gpu/Makefile
================================================
all:
/usr/local/cuda/bin/nvcc gemv.cu -I/usr/local/cuda/include -lm -o gemv
clean:
rm gemv
================================================
FILE: golang/uPIMulator/benchmark/GEMV/baselines/gpu/README
================================================
Matrix-Vector Multiplication (GEMV)
Compilation instructions:
make
Execution instructions:
./gemv
================================================
FILE: golang/uPIMulator/benchmark/GEMV/baselines/gpu/gemv.cu
================================================
#include
#include
#include
#include
#define THREAD 128
#define T int
__global__ void gemv(int m, int n, T *adim, T *b, T *d_ans);
void cgemv(int m, int n, T *adim, T *b, T *d_ans);
/* Return the current wall-clock time in seconds, with microsecond resolution. */
double gettime()
{
    struct timeval now;
    gettimeofday(&now, NULL);

    const double fraction = (double)now.tv_usec*1.0e-6;
    return now.tv_sec + fraction;
}
/*
 * GPU GEMV baseline: fills an m x n matrix and an n-element vector with ones,
 * calls cgemv() on the host data, runs the gemv kernel on the device between
 * two CUDA events, then prints each host/device result pair and the elapsed
 * kernel time in milliseconds.
 */
int main(int argc, char **argv)
{
/* for CPU */
int i, j;
int *bdim, *c, *ans, *h_ans;
//double start, stop;
//double cpu_time, gpu_time;
int n = 8192;
int m = 20480;
/* bdim: m*n matrix, c: n-element vector, ans: cgemv output, h_ans: device output copied back */
bdim = (T*)malloc(sizeof(T) *m*n);
c = (T*)malloc(sizeof(T) *n);
ans = (T*)malloc(sizeof(T) *m);
h_ans = (T*)malloc(sizeof(T) *m);
/* for GPU */
T *d_bdim, *d_c, *d_ans;
cudaMalloc((void **)&d_bdim, sizeof(T)*m*n);
cudaMalloc((void **)&d_c, sizeof(T)*n);
cudaMalloc((void **)&d_ans, sizeof(T)*m);
/* Set every vector and matrix entry to 1; i*m+j covers all m*n matrix slots. */
for(i = 0; i < n; i++)
{
c[i] = 1;
for(j = 0; j < m; j++)
bdim[i*m+j] = 1;
}
//start = gettime();
/* presumably the CPU reference computation; cgemv is only declared in this file -- confirm */
cgemv(m, n, bdim, c, ans);
//stop = gettime();
//cpu_time=stop - start;
// Event creation
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
float time1 = 0;
cudaMemcpy(d_bdim, bdim, sizeof(T)*m*n, cudaMemcpyHostToDevice);
cudaMemcpy(d_c, c, sizeof(T)*n, cudaMemcpyHostToDevice);
// Start timer
cudaEventRecord( start, 0 );
//start = gettime();
/* NOTE(review): the kernel launch configuration between the angle brackets is missing here -- restore the intended grid and block dimensions */
gemv<<>>(m, n, d_bdim, d_c, d_ans);
//stop = gettime();
// End timer
cudaEventRecord( stop, 0 );
cudaEventSynchronize( stop );
cudaEventElapsedTime( &time1, start, stop );
//gpu_time=stop - start;
cudaMemcpy(h_ans, d_ans, sizeof(T)*m, cudaMemcpyDeviceToHost);
//printf("cpu_time : %.6f[sec]\n",cpu_time);
//printf("gpu_time : %.6f[sec]\n",gpu_time);
//printf("%f x\n", cpu_time / gpu_time);
/* Print the cgemv result next to the device result for every row. */
for(i = 0; i < m; i++)
printf("%d -- %d\n", ans[i], h_ans[i]);
printf("Execution time = %f ms\n", time1);
free(bdim);
free(c);
free(ans);
free(h_ans);
cudaFree(d_bdim);
cudaFree(d_c);
cudaFree(d_ans);
return 0;
}
/*
 * GEMV kernel: each thread block computes one output element.
 *
 *   m      number of rows of the matrix (one block per row)
 *   n      number of columns of the matrix / length of b
 *   adim   matrix, row i starting at adim[i * n]
 *   b      input vector of length n
 *   d_ans  output vector of length m
 *
 * Must be launched with THREAD threads per block, because the shared buffer
 * and the reduction below are sized with THREAD.
 */
__global__ void gemv(int m, int n, T* adim, T* b, T* d_ans)
{
    int i;
    int div = n/THREAD;
    __shared__ T tmp[THREAD];

    /* blockIdx is uniform within a block, so the whole block leaves together
     * and no thread is left waiting at a barrier. */
    if (blockIdx.x >= (unsigned int)m)
        return;

    /* Each thread sums the columns threadIdx.x, threadIdx.x + THREAD, ... */
    T partial = 0;
    for(i = 0; i < div; i++)
    {
        partial += adim[blockIdx.x*n+i*THREAD+threadIdx.x] * b[i * THREAD + threadIdx.x];
    }
    /* Leftover columns when n is not a multiple of THREAD. The remainder is
     * taken over the column count n, not the row count m. */
    if(threadIdx.x < n%THREAD)
        partial += adim[blockIdx.x*n+THREAD*div+threadIdx.x] * b[THREAD * div + threadIdx.x];

    tmp[threadIdx.x] = partial;
    __syncthreads();

    /* Tree reduction in shared memory. The barrier is outside the
     * conditional so every thread of the block reaches it, and each step is
     * guarded so no thread reads a slot another thread is updating. */
    for(i = THREAD / 2; i > 0; i = i / 2)
    {
        if(threadIdx.x < i)
            tmp[threadIdx.x] += tmp[threadIdx.x + i];
        __syncthreads();
    }

    if(threadIdx.x == 0)
        d_ans[blockIdx.x] = tmp[0];
}
/*
 * CPU reference GEMV: d_ans[i] = sum over j of adim[i * n + j] * b[j].
 *
 *   m      number of rows
 *   n      number of columns
 *   adim   row-major m x n matrix
 *   b      input vector of length n
 *   d_ans  output vector of length m; overwritten, like the GPU kernel's
 *          output, so it does not need to be initialised by the caller
 */
void cgemv(int m, int n, T *adim, T *b, T *d_ans)
{
    int i, j;

    for(i = 0; i < m; i++)
    {
        T acc = 0;
        for(j = 0; j < n; j++)
            acc += adim[i*n+j] * b[j];
        d_ans[i] = acc;
    }
}
================================================
FILE: golang/uPIMulator/benchmark/GEMV/dpu/CMakeLists.txt
================================================
# Build description for the DPU side of the GEMV benchmark.
# BL is forwarded to the sources as -DBL; support/common.h uses it as the
# log2 of the MRAM<->WRAM transfer block size.
SET(BL 10)
# Compiler and include directory are absolute paths inside the expected install location.
set(CMAKE_C_COMPILER "/root/upmem-2023.2.0-Linux-x86_64/bin/dpu-upmem-dpurte-clang")
# -w silences warnings and -S stops after generating assembly.
# NR_TASKLETS is not set in this file and has to be supplied when configuring.
set(CMAKE_C_FLAGS "-w -I/root/uPIMulator/benchmark/GEMV/support -O2 -S -DNR_TASKLETS=${NR_TASKLETS} -DBL=${BL}")
# Every .c file below this directory becomes part of the single device target.
file(GLOB_RECURSE SRCS *.c)
add_executable(GEMV_device ${SRCS})
================================================
FILE: golang/uPIMulator/benchmark/GEMV/dpu/task.c
================================================
/*
* Matrix vector multiplication with multiple tasklet
*
*/
#include
#include
#include
#include
#include
#include
#include
#include "../support/common.h"
__host dpu_arguments_t DPU_INPUT_ARGUMENTS;
// GEMV
/*
 * Multiply one cached block of a matrix row with the matching block of the
 * vector and add the products to bufferC[pos]. A block holds
 * BLOCK_SIZE / sizeof(T) elements.
 */
void __attribute__ ((noinline)) gemv(T *bufferC, T *bufferA, T *bufferB, int pos) {
    T *acc = &bufferC[pos];
    unsigned int k = 0;
    while (k < BLOCK_SIZE / sizeof(T)) {
        *acc += bufferA[k] * bufferB[k];
        k++;
    }
}
// Barrier
BARRIER_INIT(my_barrier, NR_TASKLETS);
// main
// Tasklet entry point of the GEMV kernel.
// Every tasklet takes a contiguous range of this DPU's matrix rows (handed out
// two rows at a time), multiplies each row with the vector block by block, and
// writes the results to the output area in MRAM in 8-byte units.
// Always returns 0.
int main() {
unsigned int tasklet_id = me();
#if PRINT
printf("tasklet_id = %u\n", tasklet_id);
#endif
if (tasklet_id == 0){ // Only tasklet 0 performs the one-time heap reset
mem_reset(); // Reset the heap
}
// Barrier
barrier_wait(&my_barrier);
// Arguments pushed by the host for this DPU
int32_t n_size = DPU_INPUT_ARGUMENTS.n_size;
int32_t n_size_pad = DPU_INPUT_ARGUMENTS.n_size_pad;
uint32_t nr_rows = DPU_INPUT_ARGUMENTS.nr_rows;
uint32_t max_rows = DPU_INPUT_ARGUMENTS.max_rows;
unsigned int nrows = nr_rows;
unsigned int rows_per_tasklet;
unsigned int start_row;
// Rows are distributed in pairs: every tasklet gets dbl_chunks rows, and the
// first tasklets (those with 2 * tasklet_id < rest_rows) get one extra pair.
unsigned int chunks = nrows / (NR_TASKLETS + NR_TASKLETS);
unsigned int dbl_chunks = chunks + chunks;
rows_per_tasklet = dbl_chunks;
unsigned int rest_rows = nrows % (NR_TASKLETS + NR_TASKLETS);
if ((tasklet_id + tasklet_id) < rest_rows)
rows_per_tasklet += 2;
// start_row = number of rows owned by all lower-numbered tasklets
if (rest_rows > 0) {
if ((tasklet_id + tasklet_id) >= rest_rows) {
unsigned int hlf_rest_rows = rest_rows >> 1;
// With an odd remainder, one more tasklet holds an extra pair
if ((rest_rows & 1) == 1)
start_row = (hlf_rest_rows + 1) * (dbl_chunks + 2) + (tasklet_id - 1 - hlf_rest_rows) * dbl_chunks;
else
start_row = (hlf_rest_rows) * (dbl_chunks + 2) + (tasklet_id - hlf_rest_rows) * dbl_chunks;
} else
start_row = tasklet_id * (dbl_chunks + 2);
} else {
start_row = tasklet_id * (dbl_chunks);
}
// Address of the current row in MRAM
// MRAM heap layout used below: the matrix rows first, the vector at
// max_rows * n_size_pad elements, and the output n_size_pad elements after
// that. Row addresses are computed with the unpadded n_size, whereas the
// vector and output offsets use n_size_pad.
uint32_t mram_base_addr_A = (uint32_t) (DPU_MRAM_HEAP_POINTER + start_row * n_size * sizeof(T));
uint32_t mram_base_addr_B = (uint32_t) (DPU_MRAM_HEAP_POINTER + max_rows * n_size_pad * sizeof(T));
uint32_t mram_base_addr_C = (uint32_t) (DPU_MRAM_HEAP_POINTER + max_rows * n_size_pad * sizeof(T) + n_size_pad * sizeof(T) + start_row * sizeof(T));
uint32_t mram_temp_addr_A = mram_base_addr_A;
uint32_t mram_temp_addr_B = mram_base_addr_B;
// Initialize a local cache to store the MRAM block
T *cache_A = (T *) mem_alloc(BLOCK_SIZE + 8);
T *cache_A_aux = (T *) mem_alloc(8);
T *cache_B = (T *) mem_alloc(BLOCK_SIZE);
T *cache_C = (T *) mem_alloc(8);
// offset == 1 means the current row does not start on an 8-byte boundary.
// NOTE(review): presumably mram_read then delivers data starting one element
// before the row, which is why the block is shifted down below - confirm
// against the SDK's alignment rules.
int offset = 0;
// Iterate over nr_rows
// Two rows per iteration; cache_C holds the two results of the pair
for (unsigned int i = start_row; i < start_row + rows_per_tasklet; i += 2) {
mram_temp_addr_A = (uint32_t) (DPU_MRAM_HEAP_POINTER + i * n_size * sizeof(T));
mram_temp_addr_B = mram_base_addr_B;
cache_C[0] = 0;
cache_C[1] = 0;
// pos selects the row inside the pair; the second row is skipped when it
// would lie beyond this DPU's nr_rows
for(unsigned int pos = 0; pos < 2 && i + pos < nr_rows; pos++){
int n = 0, j;
// Full blocks: runs while more than one block of the row is left, so the
// last (possibly partial) block is always handled after the loop
for (n = 0; n < (int32_t) (n_size - (BLOCK_SIZE/sizeof(T))); n += (BLOCK_SIZE / sizeof(T)))
{
mram_read((__mram_ptr void const*) (mram_temp_addr_A), cache_A, BLOCK_SIZE);
mram_read((__mram_ptr void const*) (mram_temp_addr_B), cache_B, BLOCK_SIZE);
if(offset)
{
// Shift the block down by one element and fetch the missing last
// element from the 8 bytes that follow the block
for(unsigned int off = 0; off < (BLOCK_SIZE / sizeof(T)) - 1; off++)
{
cache_A[off] = cache_A[off + 1];
}
mram_read((__mram_ptr void const*) (mram_temp_addr_A + BLOCK_SIZE), cache_A_aux, 8);
cache_A[BLOCK_SIZE / sizeof(T) - 1] = cache_A_aux[0];
}
// Compute GEMV
gemv(cache_C, cache_A, cache_B, pos);
// Update memory addresses
mram_temp_addr_A += BLOCK_SIZE;
mram_temp_addr_B += BLOCK_SIZE;
}
// Last block of the row: read a whole block but use only n_size - n elements
mram_read((__mram_ptr void const*) (mram_temp_addr_A), cache_A, BLOCK_SIZE);
if(offset)
{
for(unsigned int off = 0; off < (BLOCK_SIZE / sizeof(T)) -1; off++)
{
cache_A[off] = cache_A[off + 1];
}
mram_read((__mram_ptr void const*) (mram_temp_addr_A + BLOCK_SIZE ), cache_A_aux, 8);
cache_A[BLOCK_SIZE / sizeof(T) - 1] = cache_A_aux[0];
}
mram_read((__mram_ptr void const*) (mram_temp_addr_B), cache_B, BLOCK_SIZE);
for (j = 0; j < (int) (n_size - n); j++) {
// Compute GEMV
// Guard: the remainder must fit into one block
if(j >= (int)(BLOCK_SIZE / sizeof(T))){
printf("error\n");
break;
}
cache_C[pos] += cache_A[j] * cache_B[j];
}
// Advance by the bytes actually consumed from the last block,
// i.e. (n_size - n) * sizeof(T), which lands on the start of the next row
mram_temp_addr_A += (BLOCK_SIZE - ((BLOCK_SIZE / sizeof(T)) - (n_size - n)) * sizeof(T));
mram_temp_addr_B = mram_base_addr_B;
// Remember whether the next row starts on an 8-byte boundary
if(mram_temp_addr_A % 8 != 0)
{
offset = 1;
}
else
{
offset = 0;
}
}
// Write cache to current MRAM block
// 8 bytes = both results of the row pair
mram_write(cache_C, (__mram_ptr void *) (mram_base_addr_C), 8);
// Update memory address
mram_base_addr_C += 2 * sizeof(T);
}
return 0;
}
================================================
FILE: golang/uPIMulator/benchmark/GEMV/host/app.c
================================================
/**
* app.c
* GEMV Host Application Source File
*
*/
#include
#include
#include
#include
#include
#include
#include
#include
#include
#if ENERGY
#include
#endif
#include "../support/common.h"
#include "../support/timer.h"
#include "../support/params.h"
// Define the DPU Binary path as DPU_BINARY here
#ifndef DPU_BINARY
#define DPU_BINARY "./bin/gemv_dpu"
#endif
static T* A;
static T* B;
static T* C;
static T* C_dpu;
// Create input arrays
/*
 * Fill the m_size x n_size matrix A and then the n_size vector B with
 * pseudo-random values in [0, 49]. The generator is seeded with 0, so the
 * data is the same on every run.
 */
static void init_data(T* A, T* B, unsigned int m_size, unsigned int n_size) {
    unsigned int idx;

    srand(0);

    /* Matrix first, vector second: the order fixes which random numbers go where. */
    idx = 0;
    while (idx < m_size * n_size) {
        A[idx] = (unsigned int) (rand()%50);
        ++idx;
    }

    idx = 0;
    while (idx < n_size) {
        B[idx] = (unsigned int) (rand()%50);
        ++idx;
    }
}
// Compute output in the host
/*
 * Reference GEMV on the host: C = A * B, where A is a row-major
 * m_size x n_size matrix, B has n_size entries and C has m_size entries.
 */
static void gemv_host(T* C, T* A, T* B, unsigned int m_size, unsigned int n_size) {
    unsigned int row;
    unsigned int col;

    /* Clear the whole output before accumulating. */
    row = 0;
    while (row < m_size) {
        C[row] = 0;
        ++row;
    }

    row = 0;
    while (row < m_size) {
        col = 0;
        while (col < n_size) {
            C[row] += A[row * n_size + col] * B[col];
            ++col;
        }
        ++row;
    }
}
// Main of the Host Application
/*
 * Host application of the GEMV benchmark.
 *
 * Splits the rows of an m_size x n_size matrix across the allocated DPUs,
 * computes the reference result on the CPU, then for every warm-up and timed
 * repetition transfers arguments, matrix and vector to the DPUs, launches the
 * kernel and reads the result vector back. Finally prints the timings and
 * compares the DPU result of the last repetition with the CPU result.
 *
 * Returns 0 when the outputs match and -1 otherwise.
 */
int main(int argc, char **argv) {

    struct Params p = input_params(argc, argv);

    struct dpu_set_t dpu_set, dpu;
    uint32_t nr_of_dpus;

    // Allocate DPUs and load binary
    DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
    DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
    DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));

#if ENERGY
    struct dpu_probe_t probe;
    DPU_ASSERT(dpu_probe_init("energy_probe", &probe));
#endif

    unsigned int i;
    unsigned int m_size = p.m_size;
    unsigned int n_size = p.n_size;

    // Initialize help data
    dpu_info = (struct dpu_info_t *) malloc(nr_of_dpus * sizeof(struct dpu_info_t));
    dpu_arguments_t *input_args = (dpu_arguments_t *) malloc(nr_of_dpus * sizeof(dpu_arguments_t));
    if (dpu_info == NULL || input_args == NULL) {
        fprintf(stderr, "Failed to allocate per-DPU bookkeeping\n");
        exit(1);
    }
    uint32_t max_rows_per_dpu = 0;
    // Rows are padded to an even number of 4-byte elements
    uint32_t n_size_pad = n_size;
    if(n_size % 2 == 1)
    {
        n_size_pad++;
    }

    // Distribute the rows: the first (m_size % nr_of_dpus) DPUs get one extra row
    i = 0;
    DPU_FOREACH(dpu_set, dpu, i) {
        uint32_t rows_per_dpu;
        uint32_t prev_rows_dpu = 0;
        uint32_t chunks = m_size / nr_of_dpus;
        rows_per_dpu = chunks;
        uint32_t rest_rows = m_size % nr_of_dpus;
        if (i < rest_rows)
            rows_per_dpu++;
        // prev_rows_dpu = number of rows owned by all lower-numbered DPUs
        if (rest_rows > 0) {
            if (i >= rest_rows)
                prev_rows_dpu = rest_rows * (chunks + 1) + (i - rest_rows) * chunks;
            else
                prev_rows_dpu = i * (chunks + 1);
        } else {
            prev_rows_dpu = i * chunks;
        }

        // Keep max rows for parallel transfers
        uint32_t rows_per_dpu_pad = rows_per_dpu;
        if (rows_per_dpu_pad % 2 == 1) // 4-byte elements
            rows_per_dpu_pad++;
        if (rows_per_dpu_pad > max_rows_per_dpu)
            max_rows_per_dpu = rows_per_dpu_pad;

        dpu_info[i].rows_per_dpu = rows_per_dpu;
        dpu_info[i].rows_per_dpu_pad = rows_per_dpu_pad;
        dpu_info[i].prev_rows_dpu = prev_rows_dpu;

        // Copy input arguments to DPU
        input_args[i].n_size = n_size;
        input_args[i].n_size_pad = n_size_pad;
        input_args[i].nr_rows = rows_per_dpu;
    }

    A = malloc(max_rows_per_dpu * nr_of_dpus * n_size_pad * sizeof(T));
    B = malloc(n_size_pad * sizeof(T));
    C = malloc(max_rows_per_dpu * nr_of_dpus * sizeof(T));
    // Result buffer, allocated once and reused by every repetition
    // (allocating it inside the loop leaked all but the last buffer)
    C_dpu = malloc(max_rows_per_dpu * nr_of_dpus * sizeof(T));
    if (A == NULL || B == NULL || C == NULL || C_dpu == NULL) {
        fprintf(stderr, "Failed to allocate host buffers\n");
        exit(1);
    }

    // Initialize data with arbitrary data
    init_data(A, B, m_size, n_size);

    // Timer
    Timer timer;

    // Compute output on CPU (performance comparison and verification purposes)
    start(&timer, 0, 0);
    gemv_host(C, A, B, m_size, n_size);
    stop(&timer, 0);

    for (unsigned int rep = 0; rep < p.n_warmup + p.n_reps; rep++) {

        if (rep >= p.n_warmup)
            start(&timer, 1, rep - p.n_warmup);
        // Input arguments
        i = 0;
        DPU_FOREACH(dpu_set, dpu, i) {
            // Copy input arguments to DPU
            input_args[i].max_rows = max_rows_per_dpu;

            DPU_ASSERT(dpu_prepare_xfer(dpu, input_args + i));
        }

        DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, "DPU_INPUT_ARGUMENTS", 0, sizeof(dpu_arguments_t), DPU_XFER_DEFAULT));

        // Copy input array and vector
        i = 0;
        DPU_FOREACH(dpu_set, dpu, i) {
            DPU_ASSERT(dpu_prepare_xfer(dpu, A + dpu_info[i].prev_rows_dpu * n_size));
        }
        DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, 0, max_rows_per_dpu * n_size_pad * sizeof(T), DPU_XFER_DEFAULT));
        DPU_FOREACH(dpu_set, dpu, i) {
            DPU_ASSERT(dpu_prepare_xfer(dpu, B));
        }
        DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, max_rows_per_dpu * n_size_pad * sizeof(T) , n_size_pad * sizeof(T), DPU_XFER_DEFAULT));

        if (rep >= p.n_warmup)
            stop(&timer, 1);

        // Run kernel on DPUs
        if (rep >= p.n_warmup)
        {
            start(&timer, 2, rep - p.n_warmup);
#if ENERGY
            DPU_ASSERT(dpu_probe_start(&probe));
#endif
        }

        DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS));

        if (rep >= p.n_warmup)
        {
            stop(&timer, 2);
#if ENERGY
            DPU_ASSERT(dpu_probe_stop(&probe));
#endif
        }
#if PRINT
        // Display DPU Logs
        DPU_FOREACH(dpu_set, dpu) {
            DPU_ASSERT(dpulog_read_for_dpu(dpu.dpu, stdout));
        }
#endif

        // Retrieve results
        if (rep >= p.n_warmup)
            start(&timer, 3, rep - p.n_warmup);
        i = 0;
        DPU_FOREACH(dpu_set, dpu, i) {
            DPU_ASSERT(dpu_prepare_xfer(dpu, C_dpu + i * max_rows_per_dpu));
        }
        DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_FROM_DPU, DPU_MRAM_HEAP_POINTER_NAME, max_rows_per_dpu * n_size_pad * sizeof(T) + n_size_pad * sizeof(T), max_rows_per_dpu * sizeof(T), DPU_XFER_DEFAULT));
        if(rep >= p.n_warmup)
            stop(&timer, 3);
    }
#if ENERGY
    double acc_energy, avg_energy, acc_time, avg_time;
    DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_ACCUMULATE, &acc_energy));
    DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_AVERAGE, &avg_energy));
    DPU_ASSERT(dpu_probe_get(&probe, DPU_TIME, DPU_ACCUMULATE, &acc_time));
    DPU_ASSERT(dpu_probe_get(&probe, DPU_TIME, DPU_AVERAGE, &avg_time));
#endif

    // Print timing results
    printf("CPU Version Time (ms): ");
    print(&timer, 0, 1);
    printf("CPU-DPU Time (ms): ");
    print(&timer, 1, p.n_reps);
    printf("DPU Kernel Time (ms): ");
    print(&timer, 2, p.n_reps);
    printf("DPU-CPU Time (ms): ");
    print(&timer, 3, p.n_reps);

#if ENERGY
    printf("Energy (J): %f J\t", avg_energy);
#endif

    // Check output
    // Each DPU's results start at n * max_rows_per_dpu in C_dpu; only its
    // rows_per_dpu real rows are compared with the contiguous CPU result.
    bool status = true;
    unsigned int n,j;
    i = 0;
    for (n = 0; n < nr_of_dpus; n++) {
        for (j = 0; j < dpu_info[n].rows_per_dpu; j++) {
            if(C[i] != C_dpu[n * max_rows_per_dpu + j]) {
                status = false;
#if PRINT
                // printf("%d: %d -- %d\n", i, C[i], C_dpu[n * max_rows_per_dpu + j]);
#endif
            }
            i++;
        }
    }
    if (status) {
        printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET "] Outputs are equal\n");
    } else {
        printf("[" ANSI_COLOR_RED "ERROR" ANSI_COLOR_RESET "] Outputs differ!\n");
    }

    // Deallocation
    free(A);
    free(B);
    free(C);
    free(C_dpu);
    free(input_args);
    free(dpu_info);
    DPU_ASSERT(dpu_free(dpu_set));

#if ENERGY
    DPU_ASSERT(dpu_probe_deinit(&probe));
#endif

    return status ? 0 : -1;
}
================================================
FILE: golang/uPIMulator/benchmark/GEMV/support/common.h
================================================
#ifndef _COMMON_H_
#define _COMMON_H_
// Structures used by both the host and the dpu to communicate information
// Kernel arguments for one DPU; the host pushes one instance per DPU to the
// DPU_INPUT_ARGUMENTS symbol before each launch.
typedef struct {
uint32_t n_size; // number of columns of the matrix (length of one row)
uint32_t n_size_pad; // n_size rounded up to an even number by the host
uint32_t nr_rows; // number of matrix rows assigned to this DPU
uint32_t max_rows; // largest even-padded per-DPU row count; fixes the MRAM offsets of vector and output
} dpu_arguments_t;
// Specific information for each DPU
// Host-side bookkeeping of how the matrix rows were split across the DPUs.
struct dpu_info_t {
uint32_t rows_per_dpu; // rows actually assigned to this DPU
uint32_t rows_per_dpu_pad; // rows_per_dpu rounded up to an even number
uint32_t prev_rows_dpu; // number of rows assigned to all lower-numbered DPUs
};
// Table with one entry per DPU, allocated by the host application.
// NOTE(review): this is a variable definition inside a header, so every
// source file that includes common.h gets its own copy - confirm that is intended.
struct dpu_info_t *dpu_info;
// Transfer size between MRAM and WRAM
#ifdef BL
#define BLOCK_SIZE_LOG2 BL
#define BLOCK_SIZE (1 << BLOCK_SIZE_LOG2)
#else
#define BLOCK_SIZE_LOG2 8
#define BLOCK_SIZE (1 << BLOCK_SIZE_LOG2)
#define BL BLOCK_SIZE_LOG2
#endif
// Data type
#define T uint32_t
#ifndef ENERGY
#define ENERGY 0
#endif
#define PRINT 0
#define ANSI_COLOR_RED "\x1b[31m"
#define ANSI_COLOR_GREEN "\x1b[32m"
#define ANSI_COLOR_RESET "\x1b[0m"
#endif
================================================
FILE: golang/uPIMulator/benchmark/GEMV/support/params.h
================================================
#ifndef _PARAMS_H_
#define _PARAMS_H_
#include "common.h"
// Command-line parameters of the GEMV host application (see input_params).
typedef struct Params {
unsigned int m_size; // -m: number of matrix rows
unsigned int n_size; // -n: number of matrix columns
unsigned int n_warmup; // -w: number of untimed warm-up iterations
unsigned int n_reps; // -e: number of timed iterations
}Params;
/* Print the command-line help text to stderr. */
static void usage() {
    static const char help_text[] =
        "\nUsage: ./program [options]"
        "\n"
        "\nGeneral options:"
        "\n -h help"
        "\n -w # of untimed warmup iterations (default=1)"
        "\n -e # of timed repetition iterations (default=3)"
        "\n"
        "\nBenchmark-specific options:"
        "\n -m m_size (default=8192 elements)"
        "\n -n n_size (default=8192 elements)"
        "\n";

    fputs(help_text, stderr);
}
/*
 * Parse the value of option -<flag> as a non-negative decimal number.
 * Anything else (empty string, sign, trailing characters, a value that does
 * not fit into unsigned int) prints a diagnostic plus the help text and
 * terminates the program with exit status 1.
 */
static unsigned int parse_uint_option(const char *text, int flag) {
    unsigned int value = 0;
    const char *s = text;
    int valid = (*s != '\0');

    for (; valid && *s != '\0'; s++) {
        if (*s < '0' || *s > '9') {
            valid = 0;
            break;
        }
        unsigned int digit = (unsigned int)(*s - '0');
        /* Reject before value * 10 + digit would wrap around. */
        if (value > ((unsigned int)-1 - digit) / 10) {
            valid = 0;
            break;
        }
        value = value * 10 + digit;
    }

    if (!valid) {
        fprintf(stderr, "\nInvalid value '%s' for option -%c\n", text, flag);
        usage();
        exit(1);
    }
    return value;
}

/*
 * Build the benchmark parameters from the command line.
 *
 * Defaults: m_size = 8192, n_size = 8192, n_warmup = 1, n_reps = 3.
 * -h prints the help text and exits with status 0. An unrecognised option or
 * an invalid numeric value prints the help text and exits with status 1.
 */
struct Params input_params(int argc, char **argv) {
    struct Params p;
    p.m_size = 8192;
    p.n_size = 8192;
    p.n_warmup = 1;
    p.n_reps = 3;

    int opt;
    while((opt = getopt(argc, argv, "hm:n:w:e:")) >= 0) {
        switch(opt) {
            case 'h':
                usage();
                exit(0);
                break;
            case 'm': p.m_size = parse_uint_option(optarg, opt); break;
            case 'n': p.n_size = parse_uint_option(optarg, opt); break;
            case 'w': p.n_warmup = parse_uint_option(optarg, opt); break;
            case 'e': p.n_reps = parse_uint_option(optarg, opt); break;
            default:
                fprintf(stderr, "\nUnrecognized option!\n");
                usage();
                /* A usage error is a failure, so do not report success. */
                exit(1);
        }
    }
    assert(NR_DPUS > 0 && "Invalid # of dpus!");

    return p;
}
#endif
================================================
FILE: golang/uPIMulator/benchmark/GEMV/support/timer.h
================================================
/*
* Copyright (c) 2016 University of Cordoba and University of Illinois
* All rights reserved.
*
* Developed by: IMPACT Research Group
* University of Cordoba and University of Illinois
* http://impact.crhc.illinois.edu/
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* with the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* > Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* > Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in the
* documentation and/or other materials provided with the distribution.
* > Neither the names of IMPACT Research Group, University of Cordoba,
* University of Illinois nor the names of its contributors may be used
* to endorse or promote products derived from this Software without
* specific prior written permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
* THE SOFTWARE.
*
*/
#include
/*
 * Stopwatch with four independent slots. time[i] accumulates the elapsed
 * microseconds of all start/stop pairs recorded for slot i.
 */
typedef struct Timer {
    struct timeval startTime[4];
    struct timeval stopTime[4];
    double time[4];
} Timer;

/* Begin a measurement in slot i; repetition 0 also clears the accumulated time. */
void start(Timer *timer, int i, int rep) {
    if (rep == 0)
        timer->time[i] = 0.0;
    gettimeofday(&timer->startTime[i], NULL);
}

/* End the measurement in slot i and add the elapsed microseconds to time[i]. */
void stop(Timer *timer, int i) {
    gettimeofday(&timer->stopTime[i], NULL);

    const struct timeval *begin = &timer->startTime[i];
    const struct timeval *end = &timer->stopTime[i];
    timer->time[i] += (end->tv_sec - begin->tv_sec) * 1000000.0 +
                      (end->tv_usec - begin->tv_usec);
}

/* Print the average time per repetition of slot i in milliseconds, followed by a tab. */
void print(Timer *timer, int i, int REP) {
    const double per_rep_ms = timer->time[i] / (1000 * REP);
    printf("%f\t", per_rep_ms);
}
================================================
FILE: golang/uPIMulator/benchmark/HST-L/CMakeLists.txt
================================================
# Only the DPU-side target is configured; the host application is left out of this build.
#add_subdirectory(host)
add_subdirectory(dpu)
================================================
FILE: golang/uPIMulator/benchmark/HST-L/Makefile
================================================
# Build rules for the HST-L benchmark (host application + DPU binary).
DPU_DIR := dpu
HOST_DIR := host
BUILDDIR ?= bin
NR_TASKLETS ?= 16
BL ?= 8
NR_DPUS ?= 1
NR_HISTO ?= 1
ENERGY ?= 0

# Name of the marker file that records the build configuration. Both targets
# depend on it, so changing one of these values forces a rebuild. NR_HISTO is
# part of the name because it is passed to the DPU compiler.
define conf_filename
	${BUILDDIR}/.NR_DPUS_$(1)_NR_TASKLETS_$(2)_BL_$(3)_NR_HISTO_$(4).conf
endef
CONF := $(call conf_filename,${NR_DPUS},${NR_TASKLETS},${BL},${NR_HISTO})

HOST_TARGET := ${BUILDDIR}/host_code
DPU_TARGET := ${BUILDDIR}/dpu_code

COMMON_INCLUDES := support
HOST_SOURCES := $(wildcard ${HOST_DIR}/*.c)
DPU_SOURCES := $(wildcard ${DPU_DIR}/*.c)

.PHONY: all clean test

__dirs := $(shell mkdir -p ${BUILDDIR})

COMMON_FLAGS := -w -I${COMMON_INCLUDES}
HOST_FLAGS := ${COMMON_FLAGS} -std=c11 -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DBL=${BL} -DENERGY=${ENERGY}
DPU_FLAGS := ${COMMON_FLAGS} -O2 -DNR_TASKLETS=${NR_TASKLETS} -DBL=${BL} -DNR_HISTO=${NR_HISTO}

all: ${HOST_TARGET} ${DPU_TARGET}

# Remove the marker of any previous configuration. The wildcard call must
# supply all four name components, otherwise the pattern matches nothing.
${CONF}:
	$(RM) $(call conf_filename,*,*,*,*)
	touch ${CONF}

# Each target is built twice: once as a binary and once as assembly (.S).
${HOST_TARGET}: ${HOST_SOURCES} ${COMMON_INCLUDES} ${CONF}
	$(CC) -o $@ ${HOST_SOURCES} ${HOST_FLAGS}
	$(CC) -S -o ${HOST_TARGET}.S ${HOST_SOURCES} ${HOST_FLAGS}

${DPU_TARGET}: ${DPU_SOURCES} ${COMMON_INCLUDES} ${CONF}
	dpu-upmem-dpurte-clang ${DPU_FLAGS} -o $@ ${DPU_SOURCES}
	dpu-upmem-dpurte-clang -S ${DPU_FLAGS} -o ${DPU_TARGET}.S ${DPU_SOURCES}

clean:
	$(RM) -r $(BUILDDIR)

test: all
	./${HOST_TARGET}
================================================
FILE: golang/uPIMulator/benchmark/HST-L/dpu/CMakeLists.txt
================================================
# Build description for the DPU side of the HST-L benchmark.
# BL is forwarded as -DBL (log2 of the transfer block size in support/common.h).
SET(BL 10)
# NR_HISTO is forwarded as -DNR_HISTO; the kernel sizes its per-histogram
# barrier and mutex arrays with it.
SET(NR_HISTO 1)
# Compiler and include directory are absolute paths inside the expected install location.
set(CMAKE_C_COMPILER "/root/upmem-2023.2.0-Linux-x86_64/bin/dpu-upmem-dpurte-clang")
# -w silences warnings and -S stops after generating assembly.
# NR_TASKLETS is not set in this file and has to be supplied when configuring.
set(CMAKE_C_FLAGS "-w -I/root/uPIMulator/benchmark/HST-L/support -O2 -S -DNR_TASKLETS=${NR_TASKLETS} -DBL=${BL} -DNR_HISTO=${NR_HISTO}")
# Every .c file below this directory becomes part of the single device target.
file(GLOB_RECURSE SRCS *.c)
add_executable(HST-L_device ${SRCS})
================================================
FILE: golang/uPIMulator/benchmark/HST-L/dpu/task.c
================================================
/*
* Histogram (HST-L) with multiple tasklets
*
*/
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "../support/common.h"
__host dpu_arguments_t DPU_INPUT_ARGUMENTS;
// Array for communication between adjacent tasklets
uint32_t* message[NR_TASKLETS];
// DPU histogram
uint32_t* histo_dpu;
// Barrier
BARRIER_INIT(my_barrier, NR_TASKLETS);
ATOMIC_BIT_INIT(barriers_mutexes)[NR_HISTO];
barrier_t barriers[NR_HISTO];
// Mutex
mutex_id_t my_mutex[NR_HISTO];
// Histogram in each tasklet
/*
 * Add l_size input values to a shared histogram.
 * A value v falls into bin (v * bins) >> DEPTH. Every single increment is
 * performed while holding the mutex that belongs to histogram histo_id.
 */
void __attribute__ ((noinline)) histogram(uint32_t* histo, uint32_t bins, T *input, uint32_t histo_id, unsigned int l_size){
    unsigned int idx = 0;
    while (idx < l_size) {
        T bin = (input[idx] * bins) >> DEPTH;

        mutex_lock(my_mutex[histo_id]);
        histo[bin] += 1;
        mutex_unlock(my_mutex[histo_id]);

        idx++;
    }
}
extern int main_kernel1(void);
int (*kernels[nr_kernels])(void) = {main_kernel1};
/* Run the kernel selected by the host through DPU_INPUT_ARGUMENTS.kernel and return its result. */
int main(void) {
    int (*selected)(void) = kernels[DPU_INPUT_ARGUMENTS.kernel];
    return selected();
}
// main_kernel1
// Histogram kernel executed by every tasklet.
// The tasklets are split into NR_HISTO groups that each fill one histogram in
// WRAM from the input stored at the start of the MRAM heap. The histograms are
// then summed into the first one, and tasklet 0 writes it to MRAM right
// behind the input area. Always returns 0.
int main_kernel1() {
unsigned int tasklet_id = me();
#if PRINT
printf("tasklet_id = %u\n", tasklet_id);
#endif
// Position of this tasklet inside its group, group size, and group (histogram) index.
// NOTE(review): the mask below only matches the division if NR_HISTO is a power of two - confirm.
unsigned int l_tasklet_id = tasklet_id / NR_HISTO;
unsigned int nr_l_tasklet = NR_TASKLETS / NR_HISTO;
unsigned int my_histo_id = tasklet_id & (NR_HISTO - 1);
if (tasklet_id == 0){ // One-time setup, done by tasklet 0 only
mem_reset(); // Reset the heap
// Initialize barriers
// One barrier per histogram group, each expecting nr_l_tasklet tasklets
for (unsigned int each_barrier = 0; each_barrier < NR_HISTO; each_barrier++) {
barriers[each_barrier].wait_queue = 0xff;
barriers[each_barrier].count = nr_l_tasklet;
barriers[each_barrier].initial_count = nr_l_tasklet;
barriers[each_barrier].lock = (uint8_t) &ATOMIC_BIT_GET(barriers_mutexes)[each_barrier];
}
}
// Barrier
barrier_wait(&my_barrier);
uint32_t input_size_dpu_bytes = DPU_INPUT_ARGUMENTS.size;
uint32_t input_size_dpu_bytes_transfer = DPU_INPUT_ARGUMENTS.transfer_size; // Transfer input size per DPU in bytes
uint32_t bins = DPU_INPUT_ARGUMENTS.bins;
// Address of the current processing block in MRAM
// Each tasklet starts at its own block; the result histogram is stored
// transfer_size bytes after the start of the heap
uint32_t base_tasklet = tasklet_id << BLOCK_SIZE_LOG2;
uint32_t mram_base_addr_A = (uint32_t)DPU_MRAM_HEAP_POINTER;
uint32_t mram_base_addr_histo = (uint32_t)(DPU_MRAM_HEAP_POINTER + input_size_dpu_bytes_transfer);
// Initialize a local cache to store the MRAM block
T *cache_A = (T *) mem_alloc(BLOCK_SIZE);
// Local histogram
// The first NR_HISTO tasklets allocate one histogram each and publish the
// pointer through message[] so the rest of the group can find it
if (tasklet_id < NR_HISTO){ // Allocate DPU histogram
uint32_t *histo = (uint32_t *) mem_alloc(bins * sizeof(uint32_t));
message[tasklet_id] = histo;
}
// Barrier
barrier_wait(&barriers[my_histo_id]);
uint32_t *my_histo = message[my_histo_id];
// Initialize local histogram
// The bins are cleared cooperatively, strided over the tasklets of the group
for(unsigned int i = l_tasklet_id; i < bins; i += nr_l_tasklet){
my_histo[i] = 0;
}
// Barrier
barrier_wait(&barriers[my_histo_id]);
// Compute histogram
// Blocks are interleaved across all tasklets: stride of BLOCK_SIZE * NR_TASKLETS bytes
for(unsigned int byte_index = base_tasklet; byte_index < input_size_dpu_bytes; byte_index += BLOCK_SIZE * NR_TASKLETS){
// Bound checking
// The last block may be shorter than BLOCK_SIZE
uint32_t l_size_bytes = (byte_index + BLOCK_SIZE >= input_size_dpu_bytes) ? (input_size_dpu_bytes - byte_index) : BLOCK_SIZE;
// Load cache with current MRAM block
mram_read((const __mram_ptr void*)(mram_base_addr_A + byte_index), cache_A, l_size_bytes);
// Histogram in each tasklet
// l_size_bytes >> DIV converts the byte count into a number of elements
histogram(my_histo, bins, cache_A, my_histo_id, l_size_bytes >> DIV);
}
// Barrier
barrier_wait(&my_barrier);
// Merge: bin i of the result is the sum of bin i over all NR_HISTO histograms,
// stored in the first histogram. This local pointer shadows the file-scope
// variable of the same name.
uint32_t *histo_dpu = message[0];
for (unsigned int i = tasklet_id; i < bins; i += NR_TASKLETS){
uint32_t b = 0;
for (unsigned int j = 0; j < NR_HISTO; j++){
b += *(message[j] + i);
}
histo_dpu[i] = b;
}
// Barrier
barrier_wait(&my_barrier);
// Write dpu histogram to current MRAM block
// Up to 2048 bytes go out in one write; larger histograms are written in
// 2048-byte pieces (512 uint32_t each). Only whole pieces are written, so a
// size that is not a multiple of 2048 bytes would lose its tail.
if(tasklet_id == 0){
if(bins * sizeof(uint32_t) <= 2048)
mram_write(histo_dpu, (__mram_ptr void*)(mram_base_addr_histo), bins * sizeof(uint32_t));
else
for(unsigned int offset = 0; offset < ((bins * sizeof(uint32_t)) >> 11); offset++){
mram_write(histo_dpu + (offset << 9), (__mram_ptr void*)(mram_base_addr_histo + (offset << 11)), 2048);
}
}
return 0;
}
================================================
FILE: golang/uPIMulator/benchmark/HST-L/host/app.c
================================================
/**
* app.c
* HST-L Host Application Source File
*
*/
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "../support/common.h"
#include "../support/timer.h"
#include "../support/params.h"
// Define the DPU Binary path as DPU_BINARY here
#ifndef DPU_BINARY
#define DPU_BINARY "./bin/dpu_code"
#endif
#if ENERGY
#include
#endif
// Pointer declaration
static T* A;
static unsigned int* histo_host;
static unsigned int* histo;
// Create input arrays
/*
 * Load p.input_size pixels from the binary file p.file_name into A.
 *
 * The file stores 16-bit values whose two bytes are swapped on load. Values
 * of 4096 and above are clamped to 4095 so that every pixel fits the 12-bit
 * DEPTH used for binning.
 *
 * Terminates the program with exit status 1 when the file cannot be opened
 * or contains fewer than p.input_size values.
 */
static void read_input(T* A, const Params p) {

    // Open input file. The name is passed to fopen directly: copying it with
    // sprintf would interpret it as a format string and overflow on long paths.
    FILE *File = fopen(p.file_name, "rb");
    if (File == NULL) {
        printf("%s does not exist\n", p.file_name);
        exit(1);
    }

    for (unsigned int y = 0; y < p.input_size; y++) {
        unsigned short temp;
        // Stop instead of filling A with stale data when the file is too short
        if (fread(&temp, sizeof(unsigned short), 1, File) != 1) {
            fprintf(stderr, "%s: could not read element %u of %u\n", p.file_name, y, p.input_size);
            fclose(File);
            exit(1);
        }
        A[y] = (unsigned int)ByteSwap16(temp);
        if (A[y] >= 4096)
            A[y] = 4095;
    }
    fclose(File);
}
// Compute output in the host
/*
 * Reference histogram on the host. A value v is counted in bin
 * (v * bins) >> DEPTH.
 *
 * exp != 0: one histogram of `bins` counters is filled from the
 *           nr_elements values of A.
 * exp == 0: the same nr_elements values are counted once for each of the
 *           nr_of_dpus consecutive histograms stored in histo.
 */
static void histogram_host(unsigned int* histo, T* A, unsigned int bins, unsigned int nr_elements, int exp, unsigned int nr_of_dpus) {
    unsigned int elem;

    if (exp) {
        for (elem = 0; elem < nr_elements; elem++) {
            T value = A[elem];
            histo[(value * bins) >> DEPTH] += 1;
        }
        return;
    }

    for (unsigned int d = 0; d < nr_of_dpus; d++) {
        for (elem = 0; elem < nr_elements; elem++) {
            T value = A[elem];
            histo[d * bins + ((value * bins) >> DEPTH)] += 1;
        }
    }
}
// Main of the Host Application
int main(int argc, char **argv) {
struct Params p = input_params(argc, argv);
struct dpu_set_t dpu_set, dpu;
uint32_t nr_of_dpus;
#if ENERGY
struct dpu_probe_t probe;
DPU_ASSERT(dpu_probe_init("energy_probe", &probe));
#endif
// Allocate DPUs and load binary
DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
printf("Allocated %d DPU(s)\n", nr_of_dpus);
unsigned int i = 0;
unsigned int input_size; // Size of input image
unsigned int dpu_s = p.dpu_s;
if(p.exp == 0)
input_size = p.input_size * nr_of_dpus; // Size of input image
else if(p.exp == 1)
input_size = p.input_size; // Size of input image
else
input_size = p.input_size * dpu_s; // Size of input image
const unsigned int input_size_8bytes =
((input_size * sizeof(T)) % 8) != 0 ? roundup(input_size, 8) : input_size; // Input size per DPU (max.), 8-byte aligned
const unsigned int input_size_dpu = divceil(input_size, nr_of_dpus); // Input size per DPU (max.)
const unsigned int input_size_dpu_8bytes =
((input_size_dpu * sizeof(T)) % 8) != 0 ? roundup(input_size_dpu, 8) : input_size_dpu; // Input size per DPU (max.), 8-byte aligned
// Input/output allocation
A = malloc(input_size_dpu_8bytes * nr_of_dpus * sizeof(T));
T *bufferA = A;
histo_host = malloc(p.bins * sizeof(unsigned int));
histo = malloc(nr_of_dpus * p.bins * sizeof(unsigned int));
// Create an input file with arbitrary data
read_input(A, p);
if(p.exp == 0){
for(unsigned int j = 1; j < nr_of_dpus; j++){
memcpy(&A[j * input_size_dpu_8bytes], &A[0], input_size_dpu_8bytes * sizeof(T));
}
}
else if(p.exp == 2){
for(unsigned int j = 1; j < dpu_s; j++)
memcpy(&A[j * p.input_size], &A[0], p.input_size * sizeof(T));
}
// Timer declaration
Timer timer;
printf("NR_TASKLETS\t%d\tBL\t%d\tinput_size\t%u\n", NR_TASKLETS, BL, input_size);
// Loop over main kernel
for(int rep = 0; rep < p.n_warmup + p.n_reps; rep++) {
memset(histo_host, 0, p.bins * sizeof(unsigned int));
memset(histo, 0, nr_of_dpus * p.bins * sizeof(unsigned int));
// Compute output on CPU (performance comparison and verification purposes)
if(rep >= p.n_warmup)
start(&timer, 0, rep - p.n_warmup);
histogram_host(histo_host, A, p.bins, p.input_size, 1, nr_of_dpus);
if(rep >= p.n_warmup)
stop(&timer, 0);
printf("Load input data\n");
if(rep >= p.n_warmup)
start(&timer, 1, rep - p.n_warmup);
// Input arguments
unsigned int kernel = 0;
i = 0;
dpu_arguments_t input_arguments[NR_DPUS];
for(i=0; i= p.n_warmup)
stop(&timer, 1);
printf("Run program on DPU(s) \n");
// Run DPU kernel
if(rep >= p.n_warmup) {
start(&timer, 2, rep - p.n_warmup);
#if ENERGY
DPU_ASSERT(dpu_probe_start(&probe));
#endif
}
DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS));
if(rep >= p.n_warmup) {
stop(&timer, 2);
#if ENERGY
DPU_ASSERT(dpu_probe_stop(&probe));
#endif
}
#if PRINT
{
unsigned int each_dpu = 0;
printf("Display DPU Logs\n");
DPU_FOREACH (dpu_set, dpu) {
printf("DPU#%d:\n", each_dpu);
DPU_ASSERT(dpulog_read_for_dpu(dpu.dpu, stdout));
each_dpu++;
}
}
#endif
printf("Retrieve results\n");
i = 0;
if(rep >= p.n_warmup)
start(&timer, 3, rep - p.n_warmup);
// PARALLEL RETRIEVE TRANSFER
DPU_FOREACH(dpu_set, dpu, i) {
DPU_ASSERT(dpu_prepare_xfer(dpu, histo + p.bins * i));
}
DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_FROM_DPU, DPU_MRAM_HEAP_POINTER_NAME, input_size_dpu_8bytes * sizeof(T), p.bins * sizeof(unsigned int), DPU_XFER_DEFAULT));
// Final histogram merging
for(i = 1; i < nr_of_dpus; i++){
for(unsigned int j = 0; j < p.bins; j++){
histo[j] += histo[j + i * p.bins];
}
}
if(rep >= p.n_warmup)
stop(&timer, 3);
}
// Print timing results
printf("CPU ");
print(&timer, 0, p.n_reps);
printf("CPU-DPU ");
print(&timer, 1, p.n_reps);
printf("DPU Kernel ");
print(&timer, 2, p.n_reps);
printf("DPU-CPU ");
print(&timer, 3, p.n_reps);
#if ENERGY
double energy;
DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_AVERAGE, &energy));
printf("DPU Energy (J): %f\t", energy);
#endif
// Check output
bool status = true;
if(p.exp == 1)
for (unsigned int j = 0; j < p.bins; j++) {
if(histo_host[j] != histo[j]){
status = false;
#if PRINT
printf("%u - %u: %u -- %u\n", j, j, histo_host[j], histo[j]);
#endif
}
}
else if(p.exp == 2)
for (unsigned int j = 0; j < p.bins; j++) {
if(dpu_s * histo_host[j] != histo[j]){
status = false;
#if PRINT
printf("%u - %u: %u -- %u\n", j, j, dpu_s * histo_host[j], histo[j]);
#endif
}
}
else
for (unsigned int j = 0; j < p.bins; j++) {
if(nr_of_dpus * histo_host[j] != histo[j]){
status = false;
#if PRINT
printf("%u - %u: %u -- %u\n", j, j, nr_of_dpus * histo_host[j], histo[j]);
#endif
}
}
if (status) {
printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET "] Outputs are equal\n");
} else {
printf("[" ANSI_COLOR_RED "ERROR" ANSI_COLOR_RESET "] Outputs differ!\n");
}
// Deallocation
free(A);
free(histo_host);
free(histo);
DPU_ASSERT(dpu_free(dpu_set));
return status ? 0 : -1;
}
================================================
FILE: golang/uPIMulator/benchmark/HST-L/run.sh
================================================
#!/bin/bash
# Sweep over histogram sizes and tasklet counts for HST-L: rebuild for every
# configuration, run the host program and store its output under profile/.

# The redirection below fails if the output directory does not exist.
mkdir -p profile

for i in 1
do
	for b in 64 128 256 512 1024 2048 4096
	do
		for k in 1 2 4 8 16
		do
			NR_DPUS=$i NR_TASKLETS=$k BL=10 make all
			./bin/host_code -w 2 -e 5 -b "${b}" > "profile/HSTL_${b}_tl${k}_dpu${i}.txt"
			make clean
		done
	done
done
================================================
FILE: golang/uPIMulator/benchmark/HST-L/support/common.h
================================================
#ifndef _COMMON_H_
#define _COMMON_H_
// Transfer size between MRAM and WRAM
// BL is the base-2 logarithm of BLOCK_SIZE. When the build defines BL it is used
// as-is; otherwise the logarithm defaults to 8 (BLOCK_SIZE == 256) and BL is
// defined to match, so all three macros are always available.
#ifdef BL
#define BLOCK_SIZE_LOG2 BL
#define BLOCK_SIZE (1 << BLOCK_SIZE_LOG2)
#else
#define BLOCK_SIZE_LOG2 8
#define BLOCK_SIZE (1 << BLOCK_SIZE_LOG2)
#define BL BLOCK_SIZE_LOG2
#endif
// Data type
#define T uint32_t
#define DIV 2 // Shift right to divide by sizeof(T)
#define REGS (BLOCK_SIZE >> 2) // 32 bits
// Pixel depth
#define DEPTH 12
// Exchange the two low-order bytes of n; the result always fits in 16 bits.
// The argument is parenthesized so that a compound expression is converted to
// unsigned int as a whole rather than only its first operand.
#define ByteSwap16(n) (((((unsigned int)(n)) << 8) & 0xFF00) | ((((unsigned int)(n)) >> 8) & 0x00FF))
// Structures used by both the host and the dpu to communicate information
// NOTE(review): size and transfer_size are presumably byte counts for the input
// buffer and the host<->DPU transfer — confirm against the host and DPU code.
typedef struct {
uint32_t size;
uint32_t transfer_size;
uint32_t bins; // number of histogram bins
// Kernel selector: kernel1 is the only kernel defined; nr_kernels is the count.
enum kernels {
kernel1 = 0,
nr_kernels = 1,
} kernel;
} dpu_arguments_t;
// Energy measurement is off unless the build enables it.
#ifndef ENERGY
#define ENERGY 0
#endif
// Set to 1 to print every mismatching bin during verification.
#define PRINT 0
// ANSI escape sequences used to colour the OK / ERROR verdict.
#define ANSI_COLOR_RED "\x1b[31m"
#define ANSI_COLOR_GREEN "\x1b[32m"
#define ANSI_COLOR_RESET "\x1b[0m"
// Ceiling of n / m for positive integers.
#define divceil(n, m) (((n)-1) / (m) + 1)
// Next multiple of m strictly greater than n's lower multiple: note that an n
// that is already a multiple of m still advances by m (roundup(4, 2) == 6).
// Arguments are parenthesized so compound expressions expand correctly.
#define roundup(n, m) (((n) / (m)) * (m) + (m))
#endif
================================================
FILE: golang/uPIMulator/benchmark/HST-L/support/params.h
================================================
#ifndef _PARAMS_H_
#define _PARAMS_H_
#include "common.h"
// Command-line configuration filled in by input_params().
typedef struct Params {
unsigned int input_size; // -i: number of input elements
unsigned int bins; // -b: number of histogram bins
int n_warmup; // -w: untimed warmup iterations
int n_reps; // -e: timed iterations
const char *file_name; // -f: path of the input image
int exp; // -x: scaling experiment selector
int dpu_s; // -z: defaults to 64
}Params;
// Print the command-line help to stderr. The defaults shown here mirror the
// values assigned in input_params().
static void usage() {
    fprintf(stderr,
        "\nUsage: ./program [options]"
        "\n"
        "\nGeneral options:"
        "\n -h help"
        "\n -w # of untimed warmup iterations (default=1)"
        "\n -e # of timed repetition iterations (default=3)"
        "\n -x Weak (0) or strong (1, 2) scaling (default=0)"
        "\n -z # of DPUs assumed when -x is 2 (default=64)"
        "\n"
        "\nBenchmark-specific options:"
        "\n -i input size (default=1536*1024 elements)"
        "\n -b histogram size (default=256 bins)"
        "\n -f input image file (default=./input/image_VanHateren.iml)"
        "\n");
}
// Parse the command line into a Params value.
// Every field starts from its default and is overridden by the matching option.
// -h prints the help and exits successfully; an unrecognized option prints the
// help and exits with a failure status so calling scripts can detect the error.
struct Params input_params(int argc, char **argv) {
    struct Params p;
    p.input_size = 1536 * 1024;
    p.bins = 256;
    p.n_warmup = 1;
    p.n_reps = 3;
    p.exp = 0;
    p.file_name = "./input/image_VanHateren.iml";
    p.dpu_s = 64;

    int opt;
    while((opt = getopt(argc, argv, "hi:b:w:e:f:x:z:")) >= 0) {
        switch(opt) {
        case 'h':
            usage();
            exit(0);
        case 'i': p.input_size = atoi(optarg); break;
        case 'b': p.bins = atoi(optarg); break;
        case 'w': p.n_warmup = atoi(optarg); break;
        case 'e': p.n_reps = atoi(optarg); break;
        case 'f': p.file_name = optarg; break;
        case 'x': p.exp = atoi(optarg); break;
        case 'z': p.dpu_s = atoi(optarg); break;
        default:
            fprintf(stderr, "\nUnrecognized option!\n");
            usage();
            exit(1);
        }
    }
    // NR_DPUS is a build-time setting, not a command-line option.
    assert(NR_DPUS > 0 && "Invalid # of dpus!");

    return p;
}
#endif
================================================
FILE: golang/uPIMulator/benchmark/HST-L/support/timer.h
================================================
/*
* Copyright (c) 2016 University of Cordoba and University of Illinois
* All rights reserved.
*
* Developed by: IMPACT Research Group
* University of Cordoba and University of Illinois
* http://impact.crhc.illinois.edu/
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* with the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* > Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* > Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in the
* documentation and/or other materials provided with the distribution.
* > Neither the names of IMPACT Research Group, University of Cordoba,
* University of Illinois nor the names of its contributors may be used
* to endorse or promote products derived from this Software without
* specific prior written permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
* THE SOFTWARE.
*
*/
#include
// Accumulating stopwatch with four independent slots (index 0..3).
typedef struct Timer{
struct timeval startTime[4]; // timestamp taken by start() for each slot
struct timeval stopTime[4]; // timestamp taken by stop() for each slot
double time[4]; // elapsed time accumulated per slot, in microseconds
}Timer;
// Begin a measurement on slot i. On the first repetition (rep == 0) the slot's
// accumulated total is cleared before the start timestamp is taken.
void start(Timer *timer, int i, int rep) {
    struct timeval *begin = &timer->startTime[i];

    if(0 == rep)
        timer->time[i] = 0.0;

    gettimeofday(begin, NULL);
}
// End the measurement on slot i and add the time since the matching start()
// call, in microseconds, to the slot's running total.
void stop(Timer *timer, int i) {
    struct timeval *begin = &timer->startTime[i];
    struct timeval *end = &timer->stopTime[i];

    gettimeofday(end, NULL);

    double elapsed_us = (end->tv_sec - begin->tv_sec) * 1000000.0;
    elapsed_us += (end->tv_usec - begin->tv_usec);
    timer->time[i] += elapsed_us;
}
// Print slot i's accumulated time averaged over REP repetitions, converted
// from microseconds to milliseconds.
void print(Timer *timer, int i, int REP) {
    const double total_us = timer->time[i];
    printf("Time (ms): %f\t", total_us / (1000 * REP));
}
================================================
FILE: golang/uPIMulator/benchmark/HST-S/CMakeLists.txt
================================================
# Only the DPU sources are added to the CMake build; the host subdirectory is
# left disabled below.
#add_subdirectory(host)
add_subdirectory(dpu)
================================================
FILE: golang/uPIMulator/benchmark/HST-S/Makefile
================================================
# Builds the HST-S host and DPU binaries (plus their assembly listings).
# NR_DPUS, NR_TASKLETS, BL and ENERGY can be overridden on the command line.
DPU_DIR := dpu
HOST_DIR := host
BUILDDIR ?= bin
NR_TASKLETS ?= 16
BL ?= 10
NR_DPUS ?= 1
ENERGY ?= 0

# Name of the marker file recording the configuration of the last build:
# $(1) = NR_DPUS, $(2) = NR_TASKLETS, $(3) = BL.
define conf_filename
	${BUILDDIR}/.NR_DPUS_$(1)_NR_TASKLETS_$(2)_BL_$(3).conf
endef
CONF := $(call conf_filename,${NR_DPUS},${NR_TASKLETS},${BL})

HOST_TARGET := ${BUILDDIR}/host_code
DPU_TARGET := ${BUILDDIR}/dpu_code

COMMON_INCLUDES := support
HOST_SOURCES := $(wildcard ${HOST_DIR}/*.c)
DPU_SOURCES := $(wildcard ${DPU_DIR}/*.c)

.PHONY: all clean test

# Evaluated at parse time so the build directory always exists.
__dirs := $(shell mkdir -p ${BUILDDIR})

COMMON_FLAGS := -w -I${COMMON_INCLUDES}
HOST_FLAGS := ${COMMON_FLAGS} -std=c11 -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DBL=${BL} -DENERGY=${ENERGY}
DPU_FLAGS := ${COMMON_FLAGS} -O2 -DNR_TASKLETS=${NR_TASKLETS} -DBL=${BL}

all: ${HOST_TARGET} ${DPU_TARGET}

# A changed configuration yields a new marker name, which forces a rebuild.
# All three fields are wildcarded so markers from earlier configurations are
# actually removed (with only two arguments the pattern never matched).
${CONF}:
	$(RM) $(call conf_filename,*,*,*)
	touch ${CONF}

${HOST_TARGET}: ${HOST_SOURCES} ${COMMON_INCLUDES} ${CONF}
	$(CC) -o $@ ${HOST_SOURCES} ${HOST_FLAGS}
	$(CC) -S -o ${HOST_TARGET}.S ${HOST_SOURCES} ${HOST_FLAGS}

${DPU_TARGET}: ${DPU_SOURCES} ${COMMON_INCLUDES} ${CONF}
	dpu-upmem-dpurte-clang ${DPU_FLAGS} -o $@ ${DPU_SOURCES}
	dpu-upmem-dpurte-clang -S ${DPU_FLAGS} -o ${DPU_TARGET}.S ${DPU_SOURCES}

clean:
	$(RM) -r $(BUILDDIR)

test: all
	./${HOST_TARGET}
================================================
FILE: golang/uPIMulator/benchmark/HST-S/baselines/cpu/Makefile
================================================
# all and clean name actions rather than files.
.PHONY: all clean

# Build the OpenMP CPU baseline.
all:
	gcc -o hist -fopenmp app_baseline.c

# -f keeps clean from failing when the binary has not been built yet.
clean:
	rm -f hist
================================================
FILE: golang/uPIMulator/benchmark/HST-S/baselines/cpu/README
================================================
Histogram - input partition (HST)
Compilation instructions:
make
Execution instructions
./hist -i 1006632960 -t 4
For more options:
./hist -h
================================================
FILE: golang/uPIMulator/benchmark/HST-S/baselines/cpu/app_baseline.c
================================================
/*
* JGL@SAFARI
*/
/**
* @file app.c
* @brief Template for a Host Application Source File.
*
* The macros DPU_BINARY and NR_TASKLETS are directly
* used in the static functions, and are not passed as arguments of these functions.
*/
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "../../support/common.h"
#include "../../support/timer.h"
// Pointer declaration
// Both buffers are allocated in main().
static T* A; // input elements
static unsigned int* histo_host; // histogram computed on the host
// Command-line configuration filled in by input_params().
typedef struct Params {
unsigned int input_size; // -i: number of input elements
unsigned int bins; // -b: number of histogram bins
int n_warmup; // -w: untimed warmup iterations
int n_reps; // -e: timed iterations
const char *file_name; // -f: path of the input image
int exp; // -x: scaling experiment selector
int n_threads; // -t: number of OpenMP threads
}Params;
/**
 * @brief reads the input image into A
 *
 * Reads p.input_size 16-bit samples from p.file_name, byte-swaps each one and
 * clamps it to 4095. When the file holds fewer samples than requested, the last
 * sample successfully read is repeated (0 if nothing could be read).
 * Exits with status 1 if the file cannot be opened.
 *
 * @param A destination buffer with room for p.input_size elements
 * @param p parameters providing the file name and the element count
 */
static void read_input(T* A, const Params p) {
    // The path is passed to fopen directly: using it as a printf format, or
    // copying it into a fixed-size buffer, breaks on '%' or on long paths.
    FILE *File = fopen(p.file_name, "rb");
    if(File == NULL) {
        printf("%s does not exist\n", p.file_name);
        exit(1);
    }

    unsigned short temp = 0;
    for(unsigned int y = 0; y < p.input_size; y++) {
        unsigned short sample;
        // Only accept complete reads so temp never holds an indeterminate value
        if(fread(&sample, sizeof(unsigned short), 1, File) == 1)
            temp = sample;
        A[y] = (unsigned int)ByteSwap16(temp);
        if(A[y] >= 4096)
            A[y] = 4095;
    }
    fclose(File);
}
/**
 * @brief compute the histogram on the host using t OpenMP threads
 *
 * With exp != 0 a single histogram of `bins` counters is built, the input being
 * split across threads and the counters updated atomically. With exp == 0,
 * nr_of_dpus consecutive histograms are built, each from the whole input, with
 * the histograms distributed across threads.
 */
static void histogram_host(unsigned int* histo, T* A, unsigned int bins, unsigned int nr_elements, int exp, unsigned int nr_of_dpus, int t) {
    omp_set_num_threads(t);

    if(exp) {
        #pragma omp parallel for
        for (unsigned int pos = 0; pos < nr_elements; pos++) {
            T pixel = A[pos];
            #pragma omp atomic update
            histo[(pixel * bins) >> DEPTH] += 1;
        }
        return;
    }

    #pragma omp parallel for
    for (unsigned int copy = 0; copy < nr_of_dpus; copy++) {
        for (unsigned int pos = 0; pos < nr_elements; pos++) {
            T pixel = A[pos];
            histo[copy * bins + ((pixel * bins) >> DEPTH)] += 1;
        }
    }
}
// Params ---------------------------------------------------------------------
// Print the command-line help to stderr. The defaults shown here mirror the
// values assigned in input_params().
void usage() {
    fprintf(stderr,
        "\nUsage: ./program [options]"
        "\n"
        "\nGeneral options:"
        "\n -h help"
        "\n -w # of untimed warmup iterations (default=1)"
        "\n -e # of timed repetition iterations (default=3)"
        "\n -t # of threads (default=8)"
        "\n -x Weak (0) or strong (1) scaling (default=1)"
        "\n"
        "\nBenchmark-specific options:"
        "\n -i input size (default=1536*1024 elements)"
        "\n -b histogram size (default=256 bins)"
        "\n -f input image file (default=../../input/image_VanHateren.iml)"
        "\n");
}
// Parse the command line into a Params value.
// Every field starts from its default and is overridden by the matching option.
// -h prints the help and exits successfully; an unrecognized option prints the
// help and exits with a failure status so calling scripts can detect the error.
struct Params input_params(int argc, char **argv) {
    struct Params p;
    p.input_size = 1536 * 1024;
    p.bins = 256;
    p.n_warmup = 1;
    p.n_reps = 3;
    p.n_threads = 8;
    p.exp = 1;
    p.file_name = "../../input/image_VanHateren.iml";

    int opt;
    while((opt = getopt(argc, argv, "hi:b:w:e:f:x:t:")) >= 0) {
        switch(opt) {
        case 'h':
            usage();
            exit(0);
        case 'i': p.input_size = atoi(optarg); break;
        case 'b': p.bins = atoi(optarg); break;
        case 'w': p.n_warmup = atoi(optarg); break;
        case 'e': p.n_reps = atoi(optarg); break;
        case 'f': p.file_name = optarg; break;
        case 'x': p.exp = atoi(optarg); break;
        case 't': p.n_threads = atoi(optarg); break;
        default:
            fprintf(stderr, "\nUnrecognized option!\n");
            usage();
            exit(1);
        }
    }
    assert(p.n_threads > 0 && "Invalid # of threads!");

    return p;
}
/**
 * @brief Main of the CPU baseline.
 *
 * Parses the options, loads the input image, and times one histogram
 * computation (clearing the output is part of the timed region).
 */
int main(int argc, char **argv) {

    struct Params p = input_params(argc, argv);

    // Number of histogram copies built in the weak-scaling mode (-x 0).
    // NOTE(review): this used to be read uninitialized; one copy per thread is
    // used as the CPU counterpart of "one histogram per DPU" — confirm.
    const uint32_t nr_of_dpus = (uint32_t)p.n_threads;

    const unsigned int input_size = p.input_size; // Size of input image
    assert(input_size % p.n_threads == 0 && "Input size!");

    // A single histogram in strong scaling, nr_of_dpus of them in weak scaling
    const size_t nr_histograms = p.exp ? 1 : nr_of_dpus;
    const size_t histo_bytes = nr_histograms * p.bins * sizeof(unsigned int);

    // Input/output allocation
    A = malloc(input_size * sizeof(T));
    histo_host = malloc(histo_bytes);
    if(A == NULL || histo_host == NULL) {
        fprintf(stderr, "Out of memory\n");
        free(A);
        free(histo_host);
        exit(1);
    }

    // Load the input image
    read_input(A, p);

    Timer timer;
    start(&timer, 0, 0);

    memset(histo_host, 0, histo_bytes);
    histogram_host(histo_host, A, p.bins, input_size, p.exp, nr_of_dpus, p.n_threads);

    stop(&timer, 0);

    printf("Kernel ");
    print(&timer, 0, 1);
    printf("\n");

    // Deallocation
    free(A);
    free(histo_host);

    return 0;
}
================================================
FILE: golang/uPIMulator/benchmark/HST-S/baselines/gpu/Makefile
================================================
#
# Copyright (c) 2016 University of Cordoba and University of Illinois
# All rights reserved.
#
# Developed by: IMPACT Research Group
# University of Cordoba and University of Illinois
# http://impact.crhc.illinois.edu/
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# with the Software without restriction, including without limitation the
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
# sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# > Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimers.
# > Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimers in the
# documentation and/or other materials provided with the distribution.
# > Neither the names of IMPACT Research Group, University of Cordoba,
# University of Illinois nor the names of its contributors may be used
# to endorse or promote products derived from this Software without
# specific prior written permission.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
# THE SOFTWARE.
#
# Toolchain and search paths for the CUDA build.
CXX=/usr/local/cuda/bin/nvcc
CXX_FLAGS=-std=c++11

LIB=-L/usr/lib/ -L/usr/local/cuda/lib64 -lm

INC=-I/usr/local/cuda/include

# NOTE(review): DEP is defined but no rule below references it, so `make`
# always rebuilds from SRC.
DEP=kernel.cpp kernel.h main.cpp kernel.cu support/common.h support/cuda-setup.h support/partitioner.h support/timer.h support/verify.h
SRC=main.cpp kernel.cpp kernel.cu
EXE=hsti

# all and clean name actions rather than files.
.PHONY: all clean

all:
	$(CXX) $(CXX_FLAGS) $(SRC) $(LIB) $(INC) -o $(EXE)

clean:
	rm -f $(EXE)
================================================
FILE: golang/uPIMulator/benchmark/HST-S/baselines/gpu/README
================================================
Histogram - input partition (HST)
Compilation instructions:
make
Execution instructions
./hsti -n 1006632960 -g 512
For more options:
./hsti -h
Note:
The input folder contains one image from Van Hateren's natural image database
(http://www.kyb.tuebingen.mpg.de/?id=227). Image pixels are 12-bit depth. Thus,
for calculation of the B-bin histogram of an image, the corresponding histogram
bin is computed as ((pixel * B) >> 12).
Monochrome images from other databases or synthetic images can also be used. The
read input function (in main.cpp) might need to be changed accordingly. If image
pixels are b-bit depth and the histogram contains B bins, the histogram bin will
be computed as ((pixel * B) >> b).
================================================
FILE: golang/uPIMulator/benchmark/HST-S/baselines/gpu/kernel.cpp
================================================
/*
* Copyright (c) 2016 University of Cordoba and University of Illinois
* All rights reserved.
*
* Developed by: IMPACT Research Group
* University of Cordoba and University of Illinois
* http://impact.crhc.illinois.edu/
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* with the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* > Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* > Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in the
* documentation and/or other materials provided with the distribution.
* > Neither the names of IMPACT Research Group, University of Cordoba,
* University of Illinois nor the names of its contributors may be used
* to endorse or promote products derived from this Software without
* specific prior written permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
* THE SOFTWARE.
*
*/
#include "kernel.h"
#include "support/partitioner.h"
#include
#include
#include
#include
// CPU threads--------------------------------------------------------------------------------------
void run_cpu_threads(std::atomic_uint *histo, unsigned int *data, int size, int bins, int n_threads, int chunk, int n_tasks, float alpha
#ifdef CUDA_8_0
, std::atomic_int *worklist
#endif
) {
std::vector cpu_threads;
for(int k = 0; k < n_threads; k++) {
cpu_threads.push_back(std::thread([=]() {
#ifdef CUDA_8_0
Partitioner p = partitioner_create(n_tasks, alpha, k, n_threads, worklist);
#else
Partitioner p = partitioner_create(n_tasks, alpha, k, n_threads);
#endif
unsigned int Hs[bins];
// Local histogram initialization
for(int i = 0; i < bins; i++) {
Hs[i] = 0;
}
for(int i = cpu_first(&p); cpu_more(&p); i = cpu_next(&p)) {
for(int j = 0; j < chunk; j++) {
// Read pixel
unsigned int d = ((data[i * chunk + j] * bins) >> 12);
// Vote in histogram
Hs[d]++;
}
}
// Merge to global histogram
for(int i = 0; i < bins; i++) {
(&histo[i])->fetch_add(Hs[i]);
}
}));
}
std::for_each(cpu_threads.begin(), cpu_threads.end(), [](std::thread &t) { t.join(); });
}
================================================
FILE: golang/uPIMulator/benchmark/HST-S/baselines/gpu/kernel.cu
================================================
/*
* Copyright (c) 2016 University of Cordoba and University of Illinois
* All rights reserved.
*
* Developed by: IMPACT Research Group
* University of Cordoba and University of Illinois
* http://impact.crhc.illinois.edu/
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* with the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* > Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* > Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in the
* documentation and/or other materials provided with the distribution.
* > Neither the names of IMPACT Research Group, University of Cordoba,
* University of Illinois nor the names of its contributors may be used
* to endorse or promote products derived from this Software without
* specific prior written permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
* THE SOFTWARE.
*
*/
#define _CUDA_COMPILER_
#include "support/common.h"
#include "support/partitioner.h"
// CUDA kernel ------------------------------------------------------------------------------------------
// Device share of the histogram.
// Every block keeps a private histogram of `bins` counters in dynamic shared
// memory (followed by one int of partitioner scratch under CUDA_8_0), votes
// into it with shared-memory atomics, and finally adds it to the global
// histogram `histo`. Task i covers the blockDim.x input elements starting at
// i * blockDim.x; elements at or beyond `size` are skipped.
__global__ void Histogram_kernel(int size, int bins, int n_tasks, float alpha, unsigned int *data,
    unsigned int *histo
#ifdef CUDA_8_0
    , int *worklist
#endif
    ) {

    extern __shared__ unsigned int l_mem[];
    unsigned int* l_histo = l_mem;
#ifdef CUDA_8_0
    int* l_tmp = (int*)&l_histo[bins];
#endif

#ifdef CUDA_8_0
    Partitioner p = partitioner_create(n_tasks, alpha, worklist, l_tmp);
#else
    Partitioner p = partitioner_create(n_tasks, alpha);
#endif

    // Thread index within the block and block size
    const int tx = threadIdx.x;
    const int bD = blockDim.x;

    // Sub-histograms initialization
    for(int pos = tx; pos < bins; pos += bD) {
        l_histo[pos] = 0;
    }

    __syncthreads(); // Intra-block synchronization

    // Main loop
    for(int i = gpu_first(&p); gpu_more(&p); i = gpu_next(&p)) {
        const int idx = i * bD + tx;
        // The last task can extend past the end of the input. Only the memory
        // access is guarded so all threads still step the partitioner together.
        if(idx < size) {
            // Global memory read
            unsigned int d = data[idx];

            // Atomic vote in shared memory
            atomicAdd(&l_histo[((d * bins) >> 12)], 1);
        }
    }

    __syncthreads(); // Intra-block synchronization

    // Merge per-block histograms and write to global memory
    for(int pos = tx; pos < bins; pos += bD) {
// Atomic addition in global memory
#ifdef CUDA_8_0
        atomicAdd_system(histo + pos, l_histo[pos]);
#else
        atomicAdd(histo + pos, l_histo[pos]);
#endif
    }
}
// Launch Histogram_kernel on `blocks` blocks of `threads` threads with
// l_mem_size bytes of dynamic shared memory, and return the launch status
// reported by cudaGetLastError().
cudaError_t call_Histogram_kernel(int blocks, int threads, int size, int bins, int n_tasks, float alpha,
    unsigned int *data, unsigned int *histo, int l_mem_size
#ifdef CUDA_8_0
    , int* worklist
#endif
    ){
    dim3 dimGrid(blocks);
    dim3 dimBlock(threads);
    // The kernel declares its histogram as extern __shared__, so the byte count
    // must be supplied as the third launch parameter.
    Histogram_kernel<<<dimGrid, dimBlock, l_mem_size>>>(size, bins, n_tasks, alpha,
        data, histo
#ifdef CUDA_8_0
        , worklist
#endif
        );
    cudaError_t err = cudaGetLastError();
    return err;
}
================================================
FILE: golang/uPIMulator/benchmark/HST-S/baselines/gpu/kernel.h
================================================
/*
* Copyright (c) 2016 University of Cordoba and University of Illinois
* All rights reserved.
*
* Developed by: IMPACT Research Group
* University of Cordoba and University of Illinois
* http://impact.crhc.illinois.edu/
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* with the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* > Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* > Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in the
* documentation and/or other materials provided with the distribution.
* > Neither the names of IMPACT Research Group, University of Cordoba,
* University of Illinois nor the names of its contributors may be used
* to endorse or promote products derived from this Software without
* specific prior written permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
* THE SOFTWARE.
*
*/
#include
#include
#include "support/common.h"
// Host side of the histogram (defined in kernel.cpp): spawns num_threads worker
// threads that accumulate into the atomic histogram `histo`. CUDA_8_0 builds
// pass an extra atomic work-list counter.
void run_cpu_threads(std::atomic_uint *histo, unsigned int *data, int size, int bins, int num_threads, int chunk, int n_tasks, float alpha
#ifdef CUDA_8_0
, std::atomic_int *wl
#endif
);
// Device side of the histogram (defined in kernel.cu): launches the CUDA kernel
// and returns the status of the launch. l_mem_size is the size in bytes of the
// kernel's shared-memory buffer.
cudaError_t call_Histogram_kernel(int blocks, int threads, int size, int bins, int n_tasks, float alpha,
unsigned int *data, unsigned int *histo, int l_mem_size
#ifdef CUDA_8_0
, int* worklist
#endif
);
================================================
FILE: golang/uPIMulator/benchmark/HST-S/baselines/gpu/main.cpp
================================================
/*
* Copyright (c) 2016 University of Cordoba and University of Illinois
* All rights reserved.
*
* Developed by: IMPACT Research Group
* University of Cordoba and University of Illinois
* http://impact.crhc.illinois.edu/
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* with the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* > Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* > Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in the
* documentation and/or other materials provided with the distribution.
* > Neither the names of IMPACT Research Group, University of Cordoba,
* University of Illinois nor the names of its contributors may be used
* to endorse or promote products derived from this Software without
* specific prior written permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
* THE SOFTWARE.
*
*/
#include "support/cuda-setup.h"
#include "kernel.h"
#include "support/common.h"
#include "support/timer.h"
#include "support/verify.h"
#include
#include
#include
// Params ---------------------------------------------------------------------
// Command-line configuration of the GPU baseline. The constructor assigns the
// defaults, applies the options, and asserts that the worker counts are valid
// for the chosen host/device split (alpha).
struct Params {

    int device;        // -d: CUDA device ID
    int n_gpu_threads; // -i: device threads per block
    int n_gpu_blocks;  // -g: device blocks
    int n_threads;     // -t: host threads
    int n_warmup;      // -w: untimed warmup iterations
    int n_reps;        // -r: timed iterations
    float alpha;       // -a: fraction of the input processed on the host
    int in_size;       // -n: number of input elements
    int n_bins;        // -b: number of histogram bins

    // Parse argv. -h prints the help and exits successfully; an unrecognized
    // option prints the help and exits with a failure status.
    Params(int argc, char **argv) {
        device = 0;
        n_gpu_threads = 256;
        n_gpu_blocks = 16;
        n_threads = 4;
        n_warmup = 5;
        n_reps = 50;
        alpha = 0.2;
        in_size = 1536 * 1024 * 640;
        n_bins = 256;
        int opt;
        while((opt = getopt(argc, argv, "hd:i:g:t:w:r:a:n:b:")) >= 0) {
            switch(opt) {
            case 'h':
                usage();
                exit(0);
                break;
            case 'd': device = atoi(optarg); break;
            case 'i': n_gpu_threads = atoi(optarg); break;
            case 'g': n_gpu_blocks = atoi(optarg); break;
            case 't': n_threads = atoi(optarg); break;
            case 'w': n_warmup = atoi(optarg); break;
            case 'r': n_reps = atoi(optarg); break;
            case 'a': alpha = atof(optarg); break;
            case 'n': in_size = atoi(optarg); break;
            case 'b': n_bins = atoi(optarg); break;
            default:
                fprintf(stderr, "\nUnrecognized option!\n");
                usage();
                exit(1);
            }
        }
        // alpha == 0 uses the device only, alpha == 1 the host only, and
        // anything in between needs both kinds of workers.
        if(alpha == 0.0) {
            assert(n_gpu_threads > 0 && "Invalid # of device threads!");
            assert(n_gpu_blocks > 0 && "Invalid # of device blocks!");
        } else if(alpha == 1.0) {
            assert(n_threads > 0 && "Invalid # of host threads!");
        } else if(alpha > 0.0 && alpha < 1.0) {
            assert(n_gpu_threads > 0 && "Invalid # of device threads!");
            assert(n_gpu_blocks > 0 && "Invalid # of device blocks!");
            assert(n_threads > 0 && "Invalid # of host threads!");
        } else {
#ifdef CUDA_8_0
            assert((n_gpu_threads > 0 && n_gpu_blocks > 0 || n_threads > 0) && "Invalid # of host + device workers!");
#else
            assert(0 && "Illegal value for -a");
#endif
        }
    }

    // Print the command-line help to stderr. The defaults shown here mirror the
    // values assigned in the constructor.
    void usage() {
        fprintf(stderr,
            "\nUsage: ./hsti [options]"
            "\n"
            "\nGeneral options:"
            "\n -h help"
            "\n -d CUDA device ID (default=0)"
            "\n -i # of device threads per block (default=256)"
            "\n -g # of device blocks (default=16)"
            "\n -t # of host threads (default=4)"
            "\n -w # of untimed warmup iterations (default=5)"
            "\n -r # of timed repetition iterations (default=50)"
            "\n"
            "\nData-partitioning-specific options:"
            "\n -a fraction of input elements to process on host (default=0.2)"
#ifdef CUDA_8_0
            "\n NOTE: Dynamic partitioning used when the fraction is not between 0.0 and 1.0"
#else
            "\n NOTE: the fraction must be between 0.0 and 1.0"
#endif
            "\n"
            "\nBenchmark-specific options:"
            "\n -n input size (default=1006632960, i.e., 1536x1024x640)"
            "\n -b # of bins in histogram (default=256)"
            "\n");
    }
};
// Input Data -----------------------------------------------------------------
// Fill `input` with p.in_size pixels from the bundled image file: each 16-bit
// sample is byte-swapped and capped at 4095. Exits with status 1 when the file
// cannot be opened.
void read_input(unsigned int *input, const Params &p) {

    char dctFileName[100];
    sprintf(dctFileName, "./input/image_VanHateren.iml");

    FILE *image = fopen(dctFileName, "rb");
    if(image == NULL) {
        printf("%s does not exist\n", dctFileName);
        exit(1);
    }

    unsigned short temp;
    for(int pos = 0; pos < p.in_size; pos++) {
        int fr = fread(&temp, sizeof(unsigned short), 1, image);
        const unsigned int pixel = (unsigned int)ByteSwap16(temp);
        input[pos] = (pixel >= 4096) ? 4095 : pixel;
    }
    fclose(image);
}
// Main ------------------------------------------------------------------------------------------
// Host driver of the GPU baseline: allocates the buffers, reads the input,
// runs the histogram n_warmup + n_reps times with the work shared between the
// CUDA kernel and the CPU threads, merges the two partial results (builds
// without CUDA_8_0 only), verifies the result and releases everything.
// Each phase is timed under its own Timer label and printed as it completes.
int main(int argc, char **argv) {
Params p(argc, argv);
CUDASetup setcuda(p.device);
Timer timer;
cudaError_t cudaStatus;
// Allocate buffers
timer.start("Allocation");
// One task per group of n_gpu_threads input elements.
// NOTE(review): divceil is presumably a ceiling division — confirm in support/common.h
int n_tasks = divceil(p.in_size, p.n_gpu_threads);
#ifdef CUDA_8_0
// Managed memory: the device pointers below are aliases of the host pointers
unsigned int *h_in;
cudaStatus = cudaMallocManaged(&h_in, p.in_size * sizeof(unsigned int));
std::atomic_uint *h_histo;
cudaStatus = cudaMallocManaged(&h_histo, p.n_bins * sizeof(std::atomic_uint));
unsigned int * d_in = h_in;
std::atomic_uint *d_histo = h_histo;
std::atomic_int * worklist;
cudaStatus = cudaMallocManaged(&worklist, sizeof(std::atomic_int));
#else
// Separate host and device buffers; h_histo_merge receives the combined result
unsigned int * h_in = (unsigned int *)malloc(p.in_size * sizeof(unsigned int));
std::atomic_uint *h_histo = (std::atomic_uint *)malloc(p.n_bins * sizeof(std::atomic_uint));
unsigned int * h_histo_merge = (unsigned int *)malloc(p.n_bins * sizeof(unsigned int));
unsigned int * d_in;
cudaStatus = cudaMalloc((void**)&d_in, p.in_size * sizeof(unsigned int));
unsigned int * d_histo;
cudaStatus = cudaMalloc((void**)&d_histo, p.n_bins * sizeof(unsigned int));
ALLOC_ERR(h_in, h_histo, h_histo_merge);
#endif
// NOTE(review): cudaStatus is overwritten by each call above, so only the last
// allocation's status is still held when CUDA_ERR() runs — confirm what CUDA_ERR checks
CUDA_ERR();
cudaDeviceSynchronize();
timer.stop("Allocation");
timer.print("Allocation", 1);
// Initialize
timer.start("Initialization");
const int max_gpu_threads = setcuda.max_gpu_threads();
read_input(h_in, p);
#ifdef CUDA_8_0
for(int i = 0; i < p.n_bins; i++) {
h_histo[i].store(0);
}
#else
// NOTE(review): clears std::atomic_uint objects with memset, sized by
// sizeof(unsigned int); assumes both types share size and representation — confirm
memset(h_histo, 0, p.n_bins * sizeof(unsigned int));
#endif
cudaDeviceSynchronize();
timer.stop("Initialization");
timer.print("Initialization", 1);
#ifndef CUDA_8_0
// Copy to device
timer.start("Copy To Device");
cudaStatus = cudaMemcpy(d_in, h_in, p.in_size * sizeof(unsigned int), cudaMemcpyHostToDevice);
cudaStatus = cudaMemcpy(d_histo, h_histo, p.n_bins * sizeof(unsigned int), cudaMemcpyHostToDevice);
cudaDeviceSynchronize();
CUDA_ERR();
timer.stop("Copy To Device");
timer.print("Copy To Device", 1);
#endif
// Loop over main kernel
// The first n_warmup iterations run untimed; the remaining n_reps are timed.
for(int rep = 0; rep < p.n_warmup + p.n_reps; rep++) {
// Reset
#ifdef CUDA_8_0
if(p.alpha < 0.0 || p.alpha > 1.0) { // Dynamic partitioning
worklist[0].store(0);
}
for(int i = 0; i < p.n_bins; i++) {
h_histo[i].store(0);
}
#else
memset(h_histo, 0, p.n_bins * sizeof(unsigned int));
cudaStatus = cudaMemcpy(d_histo, h_histo, p.n_bins * sizeof(unsigned int), cudaMemcpyHostToDevice);
cudaDeviceSynchronize();
CUDA_ERR();
#endif
if(rep >= p.n_warmup)
timer.start("Kernel");
// Replaces whatever -g selected: one block per n_gpu_threads input elements
p.n_gpu_blocks = p.in_size / p.n_gpu_threads;
// Launch GPU threads
// Kernel launch
if(p.n_gpu_blocks > 0) {
assert(p.n_gpu_threads <= max_gpu_threads &&
"The runtime block size is greater than the maximum runtime block size that can be used on this device");
// Shared-memory size: one counter per bin, plus one int under CUDA_8_0
cudaStatus = call_Histogram_kernel(p.n_gpu_blocks, p.n_gpu_threads, p.in_size, p.n_bins, n_tasks,
p.alpha, d_in, (unsigned int*)d_histo, p.n_bins * sizeof(unsigned int)
#ifdef CUDA_8_0
+ sizeof(int), (int*)worklist
#endif
);
CUDA_ERR();
}
// Launch CPU threads
// The host share runs on a helper thread while the kernel executes; both are
// waited for before the timer is stopped.
std::thread main_thread(run_cpu_threads, h_histo, h_in, p.in_size, p.n_bins, p.n_threads, p.n_gpu_threads,
n_tasks, p.alpha
#ifdef CUDA_8_0
, worklist
#endif
);
cudaDeviceSynchronize();
main_thread.join();
if(rep >= p.n_warmup)
timer.stop("Kernel");
}
timer.print("Kernel", p.n_reps);
#ifndef CUDA_8_0
// Copy back
timer.start("Copy Back and Merge");
cudaStatus = cudaMemcpy(h_histo_merge, d_histo, p.n_bins * sizeof(unsigned int), cudaMemcpyDeviceToHost);
CUDA_ERR();
cudaDeviceSynchronize();
// Add the host-side counts to the device-side counts, bin by bin
for(unsigned int i = 0; i < p.n_bins; ++i) {
h_histo_merge[i] += (unsigned int)h_histo[i];
}
timer.stop("Copy Back and Merge");
timer.print("Copy Back and Merge", 1);
#endif
// Verify answer
#ifdef CUDA_8_0
verify((unsigned int *)h_histo, h_in, p.in_size, p.n_bins);
#else
verify((unsigned int *)h_histo_merge, h_in, p.in_size, p.n_bins);
#endif
// Free memory
timer.start("Deallocation");
#ifdef CUDA_8_0
cudaStatus = cudaFree(h_in);
cudaStatus = cudaFree(h_histo);
cudaStatus = cudaFree(worklist);
#else
free(h_in);
free(h_histo);
free(h_histo_merge);
cudaStatus = cudaFree(d_in);
cudaStatus = cudaFree(d_histo);
#endif
CUDA_ERR();
cudaDeviceSynchronize();
timer.stop("Deallocation");
timer.print("Deallocation", 1);
// Release timers
timer.release("Allocation");
timer.release("Initialization");
timer.release("Copy To Device");
timer.release("Kernel");
timer.release("Copy Back and Merge");
timer.release("Deallocation");
// Printed unconditionally once this point is reached.
// NOTE(review): presumably verify() terminates the program on a mismatch — confirm
printf("Test Passed\n");
return 0;
}
================================================
FILE: golang/uPIMulator/benchmark/HST-S/baselines/gpu/support/common.h
================================================
/*
* Copyright (c) 2016 University of Cordoba and University of Illinois
* All rights reserved.
*
* Developed by: IMPACT Research Group
* University of Cordoba and University of Illinois
* http://impact.crhc.illinois.edu/
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* with the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* > Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* > Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in the
* documentation and/or other materials provided with the distribution.
* > Neither the names of IMPACT Research Group, University of Cordoba,
* University of Illinois nor the names of its contributors may be used
* to endorse or promote products derived from this Software without
* specific prior written permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
* THE SOFTWARE.
*
*/
#ifndef _COMMON_H_
#define _COMMON_H_
// Swaps the two low-order bytes of n (bits 0-7 <-> bits 8-15); any higher bits
// are discarded. The argument is parenthesized so that the cast applies to the
// whole expression passed in, not just to its first operand.
#define ByteSwap16(n) (((((unsigned int)(n)) << 8) & 0xFF00) | ((((unsigned int)(n)) >> 8) & 0x00FF))
// Compile-time switch, 0 here.
// NOTE(review): presumably gates debug printing -- confirm at the use sites.
#define PRINT 0
// Ceiling of n / m for non-negative n and positive m.
// Written as "quotient plus one if there is a remainder" so that n == 0 yields
// 0; the former ((n)-1)/(m)+1 form returned 1 for a signed zero and wrapped
// around for an unsigned zero. Both arguments are evaluated twice, so do not
// pass expressions with side effects.
#define divceil(n, m) ((n) / (m) + ((n) % (m) != 0))
#endif
================================================
FILE: golang/uPIMulator/benchmark/HST-S/baselines/gpu/support/cuda-setup.h
================================================
/*
* Copyright (c) 2016 University of Cordoba and University of Illinois
* All rights reserved.
*
* Developed by: IMPACT Research Group
* University of Cordoba and University of Illinois
* http://impact.crhc.illinois.edu/
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* with the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* > Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* > Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in the
* documentation and/or other materials provided with the distribution.
* > Neither the names of IMPACT Research Group, University of Cordoba,
* University of Illinois nor the names of its contributors may be used
* to endorse or promote products derived from this Software without
* specific prior written permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
* THE SOFTWARE.
*
*/
#include
#include
#include
// Allocation error checking.
// ERR_1(v1) prints the source location to stderr and exits with status -1 when
// v1 compares equal to NULL. The argument is parenthesized so that a compound
// expression (for example a conditional) is compared as a whole.
// NOTE: these macros deliberately keep their bare if-statement shape so that
// existing call sites continue to compile; do not use them as the unbraced
// body of an if/else.
#define ERR_1(v1)                                                                 \
    if((v1) == NULL) {                                                            \
        fprintf(stderr, "Allocation error at %s, %d\n", __FILE__, __LINE__);      \
        exit(-1);                                                                 \
    }
// ERR_n checks n pointers in order by chaining the smaller variants.
#define ERR_2(v1,v2) ERR_1(v1) ERR_1(v2)
#define ERR_3(v1,v2,v3) ERR_2(v1,v2) ERR_1(v3)
#define ERR_4(v1,v2,v3,v4) ERR_3(v1,v2,v3) ERR_1(v4)
#define ERR_5(v1,v2,v3,v4,v5) ERR_4(v1,v2,v3,v4) ERR_1(v5)
#define ERR_6(v1,v2,v3,v4,v5,v6) ERR_5(v1,v2,v3,v4,v5) ERR_1(v6)
// Selects the 7th argument; used to pick the ERR_n that matches the number of
// arguments given to ALLOC_ERR.
#define GET_ERR_MACRO(_1,_2,_3,_4,_5,_6,NAME,...) NAME
// ALLOC_ERR(p1[, p2, ... p6]): checks one to six pointers against NULL.
#define ALLOC_ERR(...) GET_ERR_MACRO(__VA_ARGS__,ERR_6,ERR_5,ERR_4,ERR_3,ERR_2,ERR_1)(__VA_ARGS__)
// CUDA_ERR(): checks a variable named cudaStatus that must be in scope at the
// expansion site; if it differs from cudaSuccess, prints the CUDA error string
// and the source location to stderr and exits with status -1.
#define CUDA_ERR()                                                                                            \
    if(cudaStatus != cudaSuccess) {                                                                           \
        fprintf(stderr, "CUDA error: %s\n at %s, %d\n", cudaGetErrorString(cudaStatus), __FILE__, __LINE__);  \
        exit(-1);                                                                                             \
    }
// Selects a CUDA device and keeps a copy of its properties.
struct CUDASetup {
    // Properties of the selected device; filled in by the constructor.
    cudaDeviceProp device_prop;

    // Makes `device` the current CUDA device, reads its properties into
    // device_prop and writes the device name followed by a tab to stderr.
    // CUDA_ERR() is evaluated after each runtime call.
    CUDASetup(int device) {
        // The variable must be called cudaStatus: CUDA_ERR() refers to it by name.
        cudaError_t cudaStatus = cudaSetDevice(device);
        CUDA_ERR();

        cudaStatus = cudaGetDeviceProperties(&device_prop, device);
        CUDA_ERR();

        fprintf(stderr, "%s\t", device_prop.name);
    }

    // Returns the maxThreadsPerBlock property of the selected device.
    int max_gpu_threads() {
        const int threads_per_block_limit = device_prop.maxThreadsPerBlock;
        return threads_per_block_limit;
    }
};
================================================
FILE: golang/uPIMulator/benchmark/HST-S/baselines/gpu/support/partitioner.h
================================================
/*
* Copyright (c) 2016 University of Cordoba and University of Illinois
* All rights reserved.
*
* Developed by: IMPACT Research Group
* University of Cordoba and University of Illinois
* http://impact.crhc.illinois.edu/
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* with the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* > Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* > Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in the
* documentation and/or other materials provided with the distribution.
* > Neither the names of IMPACT Research Group, University of Cordoba,
* University of Illinois nor the names of its contributors may be used
* to endorse or promote products derived from this Software without
* specific prior written permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
* THE SOFTWARE.
*
*/
#ifndef _PARTITIONER_H_
#define _PARTITIONER_H_
#ifndef _CUDA_COMPILER_
#include
#endif
#if !defined(_CUDA_COMPILER_) && defined(CUDA_8_0)
#include
#endif
// Partitioner definition -----------------------------------------------------
// Iteration state of one worker over a range of task indices.
// Host builds (no _CUDA_COMPILER_) additionally carry the CPU thread id and the
// CPU thread count; CUDA_8_0 builds add the fields used by dynamic partitioning.
typedef struct Partitioner {
// Total number of tasks; upper bound for the GPU iterators and, in dynamic
// mode, for the CPU iterators as well.
int n_tasks;
// Static mode: CPU threads iterate while current < cut, GPU blocks start at
// cut + blockIdx.x.
int cut;
// Task index most recently produced by the first()/next() iterators.
int current;
#ifndef _CUDA_COMPILER_
// Host only: cpu_first() starts at thread_id and cpu_next() advances by
// n_threads in static mode.
int thread_id;
int n_threads;
#endif
#ifdef CUDA_8_0
// CUDA 8.0 support for dynamic partitioning
// Either STATIC_PARTITIONING or DYNAMIC_PARTITIONING.
int strategy;
#ifdef _CUDA_COMPILER_
// Counter advanced with atomicAdd_system() to claim the next task index.
int *worklist;
// Slot in which thread (0,0) stores the claimed index; the other threads read
// it after __syncthreads().
int *tmp;
#else
// Host-side handle to the worklist counter; advanced with fetch_add(1).
std::atomic_int *worklist;
#endif
#endif
} Partitioner;
// Partitioning strategies
// STATIC_PARTITIONING is chosen by partitioner_create() when alpha lies in
// [0, 1]; DYNAMIC_PARTITIONING is chosen for any other alpha (CUDA_8_0 builds).
#define STATIC_PARTITIONING 0
#define DYNAMIC_PARTITIONING 1
// Create a partitioner -------------------------------------------------------
// Builds a Partitioner for n_tasks tasks.
//   n_tasks    total number of tasks to iterate over
//   alpha      a value in [0, 1] selects static partitioning with the cut placed
//              at (int)(n_tasks * alpha): CPU threads take the task indices
//              below the cut, GPU blocks start at the cut. Any other value
//              requests dynamic partitioning, which exists only in CUDA_8_0
//              builds.
//   thread_id, n_threads
//              (host builds only) index of the calling CPU thread and the
//              number of CPU threads
//   worklist   (CUDA_8_0 builds only) shared counter used to claim tasks
//   tmp        (CUDA_8_0 device builds only) slot used to hand a claimed index
//              to the other threads
// Every field of the returned struct has a defined value. Previously `cut` and
// `current` were left indeterminate when alpha was outside [0, 1]; in builds
// without CUDA_8_0 the iterators then read the indeterminate `cut`. The
// fallback is cut == 0, i.e. the CPU iterators see no work and the GPU
// iterators start at task 0.
#ifdef _CUDA_COMPILER_
__device__
#endif
inline Partitioner partitioner_create(int n_tasks, float alpha
#ifndef _CUDA_COMPILER_
    , int thread_id, int n_threads
#endif
#ifdef CUDA_8_0
#ifdef _CUDA_COMPILER_
    , int *worklist
    , int *tmp
#else
    , std::atomic_int *worklist
#endif
#endif
    ) {
    Partitioner p;
    p.n_tasks = n_tasks;
    // Defined defaults; overwritten below or by the first() iterators.
    p.cut     = 0;
    p.current = 0;
#ifndef _CUDA_COMPILER_
    p.thread_id = thread_id;
    p.n_threads = n_threads;
#endif
#ifdef CUDA_8_0
    // Store the pointers unconditionally so they are never indeterminate; the
    // static strategy simply does not use them.
    p.strategy = STATIC_PARTITIONING;
    p.worklist = worklist;
#ifdef _CUDA_COMPILER_
    p.tmp = tmp;
#endif
#endif
    if(alpha >= 0.0f && alpha <= 1.0f) {
        // Truncation toward zero, same as the implicit float-to-int conversion.
        p.cut = (int)(p.n_tasks * alpha);
    }
#ifdef CUDA_8_0
    else {
        p.strategy = DYNAMIC_PARTITIONING;
    }
#endif
    return p;
}
// Partitioner iterators: first() ---------------------------------------------
#ifndef _CUDA_COMPILER_
// Positions the CPU-side iterator on its first task and returns that index.
// Static partitioning starts at the caller's thread id; with dynamic
// partitioning (CUDA_8_0 builds) the index is claimed from the worklist counter
// via fetch_add(1).
inline int cpu_first(Partitioner *p) {
#ifdef CUDA_8_0
    const bool claim_from_worklist = (p->strategy == DYNAMIC_PARTITIONING);
    p->current = claim_from_worklist ? p->worklist->fetch_add(1) : p->thread_id;
#else
    p->current = p->thread_id;
#endif
    return p->current;
}
#else
// Positions the GPU-side iterator on its first task and returns that index.
// Static partitioning starts at cut + blockIdx.x. With dynamic partitioning
// (CUDA_8_0 builds) only thread (0,0) claims an index from the worklist with
// atomicAdd_system() and stores it in tmp[0]; every thread reads it back after
// the __syncthreads() barrier.
// NOTE(review): tmp[0] is overwritten by the next claim; presumably the kernel
// body synchronizes the block before calling gpu_next() -- confirm.
__device__ inline int gpu_first(Partitioner *p) {
#ifdef CUDA_8_0
    if(p->strategy == DYNAMIC_PARTITIONING) {
        const bool leader = (threadIdx.x == 0) && (threadIdx.y == 0);
        if(leader) {
            p->tmp[0] = atomicAdd_system(p->worklist, 1);
        }
        __syncthreads();
        p->current = p->tmp[0];
        return p->current;
    }
#endif
    p->current = p->cut + blockIdx.x;
    return p->current;
}
#endif
// Partitioner iterators: more() ----------------------------------------------
#ifndef _CUDA_COMPILER_
// Reports whether the CPU-side iterator still points at a valid task.
// The bound is n_tasks under dynamic partitioning (CUDA_8_0 builds) and cut
// otherwise.
inline bool cpu_more(const Partitioner *p) {
#ifdef CUDA_8_0
    const int upper_bound = (p->strategy == DYNAMIC_PARTITIONING) ? p->n_tasks : p->cut;
#else
    const int upper_bound = p->cut;
#endif
    return p->current < upper_bound;
}
#else
// Reports whether the GPU-side iterator still points at a valid task, i.e.
// whether current is below n_tasks.
__device__ inline bool gpu_more(const Partitioner *p) {
    const int position   = p->current;
    const int task_count = p->n_tasks;
    return position < task_count;
}
#endif
// Partitioner iterators: next() ----------------------------------------------
#ifndef _CUDA_COMPILER_
// Advances the CPU-side iterator and returns the new task index.
// Dynamic partitioning (CUDA_8_0 builds) claims the next index from the
// worklist counter via fetch_add(1); static partitioning steps forward by the
// number of CPU threads.
inline int cpu_next(Partitioner *p) {
#ifdef CUDA_8_0
    if(p->strategy == DYNAMIC_PARTITIONING) {
        p->current = p->worklist->fetch_add(1);
        return p->current;
    }
#endif
    p->current += p->n_threads;
    return p->current;
}
#else
// Advances the GPU-side iterator and returns the new task index.
// Static partitioning steps forward by gridDim.x. With dynamic partitioning
// (CUDA_8_0 builds) only thread (0,0) claims an index from the worklist with
// atomicAdd_system() and stores it in tmp[0]; every thread reads it back after
// the __syncthreads() barrier.
// NOTE(review): nothing here keeps thread (0,0) from overwriting tmp[0] in a
// later call before slower threads have read it; presumably the kernel body
// synchronizes the block between calls -- confirm.
__device__ inline int gpu_next(Partitioner *p) {
#ifdef CUDA_8_0
    if(p->strategy == DYNAMIC_PARTITIONING) {
        const bool leader = (threadIdx.x == 0) && (threadIdx.y == 0);
        if(leader) {
            p->tmp[0] = atomicAdd_system(p->worklist, 1);
        }
        __syncthreads();
        p->current = p->tmp[0];
        return p->current;
    }
#endif
    p->current += gridDim.x;
    return p->current;
}
#endif
#endif
================================================
FILE: golang/uPIMulator/benchmark/HST-S/baselines/gpu/support/timer.h
================================================
/*
* Copyright (c) 2016 University of Cordoba and University of Illinois
* All rights reserved.
*
* Developed by: IMPACT Research Group
* University of Cordoba and University of Illinois
* http://impact.crhc.illinois.edu/
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* with the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* > Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* > Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in the
* documentation and/or other materials provided with the distribution.
* > Neither the names of IMPACT Research Group, University of Cordoba,
* University of Illinois nor the names of its contributors may be used
* to endorse or promote products derived from this Software without
* specific prior written permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
* THE SOFTWARE.
*
*/
#include
#include
#include
#include