Repository: CNugteren/myGEMM Branch: master Commit: e2a364537f2b Files: 15 Total size: 107.1 KB Directory structure: gitextract_6alhd5yi/ ├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── extra/ │ └── minimal.cpp ├── scripts/ │ └── stats.sh └── src/ ├── clGEMM.cpp ├── cl_to_cuda.h ├── common.h ├── cuGEMM.cu ├── kernels.cl ├── libclblas.cpp ├── libcublas.cu ├── main.cpp └── settings.h ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ bin/ obj/ ================================================ FILE: LICENSE ================================================ The MIT License (MIT) Copyright (c) 2014 SURFsara Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
================================================ FILE: Makefile ================================================
# ==================================================================================================
# Project:
# Exploring the performance of general matrix-multiplication on an NVIDIA Tesla K40m GPU.
#
# File information:
# Institution.... SURFsara
# Author......... Cedric Nugteren
# Changed at..... 2014-11-07
# License........ MIT license
# Tab-size....... 4 spaces
# Line length.... 100 characters
#
# ==================================================================================================

# Set the location of CUDA, OpenCL and clBlas
CUDADIR := $(CUDA_HOME)
OPENCLDIR := $(CUDA_HOME)
CLBLASDIR := $(CLBLAS_HOME)

# Disable all CUDA components (including cuBLAS) in the code to run on a non-NVIDIA system
ENABLE_CUDA := 1

# ==================================================================================================

# Compilers
CXX := g++
NVCC := nvcc

# Compiler flags (+= so users can extend them from the command line)
CXXFLAGS += -O3 -Wall
NVFLAGS += -O3 -arch=sm_35 -Xcompiler -Wall
#NVFLAGS += -maxrregcount 127

# Folders
SRCDIR := src
BINDIR := bin
OBJDIR := obj
SCRDIR := scripts

# Disable/enable CUDA in the C++ code
ifeq ($(ENABLE_CUDA),1)
DEFINES += -DENABLE_CUDA
endif

# Load OpenCL and the clBlas library
INCLUDES += -I$(OPENCLDIR)/include -I$(CLBLASDIR)/include
LDFLAGS += -L$(OPENCLDIR)/lib64 -L$(CLBLASDIR)/lib64
LDFLAGS += -lOpenCL -lclBLAS

# Load CUDA and the cuBLAS library
ifeq ($(ENABLE_CUDA),1)
INCLUDES += -I$(CUDADIR)/include
LDFLAGS += -L$(CUDADIR)/lib64
LDFLAGS += -lcuda -lcudart -lcublas
endif

# Set the source files
CPPSOURCES := main.cpp clGEMM.cpp libclblas.cpp
GPUSOURCES := cuGEMM.cu libcublas.cu

# Define the names of the object files and the binary
OBJS := $(CPPSOURCES:%.cpp=$(OBJDIR)/%.cpp.o)
ifeq ($(ENABLE_CUDA),1)
OBJS += $(GPUSOURCES:%.cu=$(OBJDIR)/%.cu.o)
endif
BIN := $(BINDIR)/myGEMM

# ==================================================================================================

# All (default target)
all: build run

# Build the binary from the objects. 'build' is a phony alias for the real file target $(BIN),
# so the binary is only re-linked when one of the object files actually changed.
build: $(BIN)

$(BIN): $(OBJS)
	@mkdir -p $(BINDIR)
	$(CXX) $(CXXFLAGS) $(DEFINES) $(INCLUDES) $^ $(LDFLAGS) -o $@

# C++ sources (depend on all headers so a header edit triggers a recompile)
$(OBJDIR)/%.cpp.o: $(SRCDIR)/%.cpp $(SRCDIR)/*.h
	@mkdir -p $(OBJDIR)
	$(CXX) -c $(CXXFLAGS) $(DEFINES) $(INCLUDES) $< -o $@

# CUDA sources (also depend on the .cl kernels, which are #included into the .cu file)
$(OBJDIR)/%.cu.o: $(SRCDIR)/%.cu $(SRCDIR)/*.h $(SRCDIR)/*.cl
	@mkdir -p $(OBJDIR)
	$(NVCC) -c $(NVFLAGS) $(DEFINES) $(INCLUDES) $< -o $@

# Generate assembly code from the kernels and print some statistics.
# Depends on 'build' because cuobjdump inspects the linked binary $(BIN).
inspect: build
	$(NVCC) -cubin $(NVFLAGS) -Xptxas -v $(INCLUDES) $(SRCDIR)/cuGEMM.cu -o $(BIN).cu.cubin
	nvdisasm -lrm narrow $(BIN).cu.cubin > $(BIN).cu.asm
	cuobjdump $(BIN) -xptx cuGEMM
	mv cuGEMM.sm_35.ptx $(BIN).cu.ptx
	cuobjdump $(BIN) -sass > $(BIN).cu.sass
	sh $(SCRDIR)/stats.sh $(BIN).cu.sass

# Execute the binary. Depends on 'build' so 'make run' works standalone and so that
# 'make -j all' cannot start the run before linking has finished.
run: build
	./$(BIN)

# Clean-up (only removes files this Makefile created)
clean:
	rm -f $(OBJDIR)/*.o
	rm -f $(BIN)
	rm -f $(BIN).*

# ==================================================================================================

# None of these targets name a real file; without .PHONY a stray file called e.g. 'build'
# or 'clean' would silently disable the target.
.PHONY: all build run inspect clean

# ==================================================================================================
================================================ FILE: README.md ================================================
Exploring the performance of SGEMM in OpenCL on NVIDIA GPUs
=============
Date: 31-Oct-2014 - 07-Nov-2014
Author: Cedric Nugteren, SURFsara (http://www.surfsara.nl)

This repository contains multiple OpenCL implementations of single-precision generalised
matrix-multiplication (SGEMM) tuned for an NVIDIA Tesla K40m GPU. The different versions (named
myGEMM) are part of a step-by-step tutorial, in which each step adds a new optimisation. The
different steps and the details of the OpenCL kernel codes are all explained in depth at
https://cnugteren.github.io/tutorial/pages/page1.html. The OpenCL kernels can be used natively
using the OpenCL framework.
However, there is also a header-file included which converts the OpenCL kernels into CUDA syntax. This allows the same code to be tested through the CUDA-toolchain. Apart from the OpenCL kernel codes, this repository contains fully working host code, including a loop over different matrix sizes and different BLAS libraries. It contains code to run NVIDIA's cuBLAS as a reference and the open-source clBlas library. Pre-requisites: * A C++ compiler (tested with GCC and ICC) * The CUDA toolkit and NVCC compiler (tested with version 6.5) * OpenCL headers and libraries (part of the CUDA toolkit) Requirements to run the performance and correctness comparisons: * The cuBLAS library (part of the CUDA toolkit, tested version 6.5) * The open-source clBlas library (tested 2.2.0) Usage ============= * Compile the code: make build Compiles the benchmarking infrastructure and the myGEMM kernels. Make sure there is a "bin" and "obj" directory available. Note that you might have to edit the Makefile to set the proper locations of the CUDA and OpenCL installations on your system. * Run the code: make run This runs the code for matrices ranging from MINSIZE to MAXSIZE (defined in src/common.h). It will run cuBLAS, clBlas, and the CUDA and OpenCL versions of the myGEMM kernels. The particular kernel to be executed is defined using the KERNEL keyword in src/settings.h. This file also contains other settings you might want to modify for your particular GPU. * Inspect the code: make inspect This generates all kinds of assembly-like versions of the CUDA kernels in the "bin" subdirectory. It also prints out statistics of the kernels such as the register usage. Minimal working example ============= Additionally, we supply the minimal.cpp file in the 'extra' directory. This file is a self-contained minimal working example (MWE) of the most basic SGEMM kernel (myGEMM1). This can be useful if you don't want to deal with Makefiles or don't have the CUDA, cuBLAS, or clBlas installed. 
Note that minimal.cpp misses some features compared to the main code, but we believe that it can nevertheless be a good starting point if you want to integrate myGEMM into your own code. The code can be compiled using a regular C++ compiler and only requires OpenCL installed. Example compilation from the root folder: g++ -O3 -Wall -I/path/to/opencl/include extra/minimal.cpp -o bin/minimal -lOpenCL Be aware that the minimal working example does not: * Iterate over multiple matrix sizes * Compare performance with cuBLAS or clBlas * Check for correctness of the results * Check for OpenCL errors * Load a kernel-file from disk, instead it is embedded as a string ################################################### ================================================ FILE: extra/minimal.cpp ================================================ // ================================================================================================= // Project: // Exploring the performance of general matrix-multiplication on an NVIDIA Tesla K40m GPU. // // File information: // Institution.... SURFsara // Author......... Cedric Nugteren // Changed at..... 2014-11-07 // License........ MIT license // Tab-size....... 4 spaces // Line length.... 100 characters // // Compilation example: // g++ -O3 -I$OPENCL_DIR/include minimal.cpp -o minimal -lOpenCL // // ================================================================================================= // Includes #include #include #include // ================================================================================================= // Repeat all kernels multiple times to get an average timing result #define NUM_RUNS 2 // Size of the matrices - K, M, N (squared) #define SIZE 4096 // Threadblock sizes (e.g. 
for kernels myGEMM1 or myGEMM2) #define TS 32 // ================================================================================================= // Set the kernel as a string (better to do this in a separate file though) const char *kernelstring = "__kernel void myGEMM1(const int M, const int N, const int K," " const __global float* A," " const __global float* B," " __global float* C) {" " const int globalRow = get_global_id(0);" " const int globalCol = get_global_id(1);" " float acc = 0.0f;" " for (int k=0; k>> Initializing OpenCL...\n"); cl_platform_id platform = 0; clGetPlatformIDs(1, &platform, NULL); cl_device_id device = 0; clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL); cl_command_queue queue = clCreateCommandQueue(context, device, 0, NULL); char deviceName[1024]; clGetDeviceInfo(device, CL_DEVICE_NAME, 1024, deviceName, NULL); cl_event event = NULL; // Compile the kernel cl_program program = clCreateProgramWithSource(context, 1, &kernelstring, NULL, NULL); clBuildProgram(program, 0, NULL, "", NULL, NULL); // Check for compilation errors size_t logSize; clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize); char* messages = (char*)malloc((1+logSize)*sizeof(char)); clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, logSize, messages, NULL); messages[logSize] = '\0'; if (logSize > 10) { printf(">>> Compiler message: %s\n", messages); } free(messages); // Prepare OpenCL memory objects cl_mem bufA = clCreateBuffer(context, CL_MEM_READ_ONLY, M*K*sizeof(float), NULL, NULL); cl_mem bufB = clCreateBuffer(context, CL_MEM_READ_ONLY, K*N*sizeof(float), NULL, NULL); cl_mem bufC = clCreateBuffer(context, CL_MEM_READ_WRITE, M*N*sizeof(float), NULL, NULL); // Copy matrices to the GPU clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0, M*K*sizeof(float), A, 0, NULL, NULL); clEnqueueWriteBuffer(queue, bufB, CL_TRUE, 0, K*N*sizeof(float), B, 0, NULL, 
NULL); clEnqueueWriteBuffer(queue, bufC, CL_TRUE, 0, M*N*sizeof(float), C, 0, NULL, NULL); // Configure the myGEMM kernel and set its arguments cl_kernel kernel = clCreateKernel(program, "myGEMM1", NULL); clSetKernelArg(kernel, 0, sizeof(int), (void*)&M); clSetKernelArg(kernel, 1, sizeof(int), (void*)&N); clSetKernelArg(kernel, 2, sizeof(int), (void*)&K); clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&bufA); clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&bufB); clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&bufC); // Start the timed loop printf(">>> Starting %d myGEMM runs...\n", NUM_RUNS); gettimeofday(&Tvalue, &dummy); double starttime = (double)Tvalue.tv_sec + 1.0e-6*((double)Tvalue.tv_usec); for (int r=0; r>> Done: took %.3lf seconds per run, %.1lf GFLOPS\n", runtime, gflop/runtime); // Copy the output matrix C back to the CPU memory clEnqueueReadBuffer(queue, bufC, CL_TRUE, 0, M*N*sizeof(float), C, 0, NULL, NULL); // Free the OpenCL memory objects clReleaseMemObject(bufA); clReleaseMemObject(bufB); clReleaseMemObject(bufC); // Clean-up OpenCL clReleaseCommandQueue(queue); clReleaseContext(context); clReleaseProgram(program); clReleaseKernel(kernel); // Free the host memory objects free(A); free(B); free(C); // Exit return 0; } // ================================================================================================= ================================================ FILE: scripts/stats.sh ================================================ #!/bin/bash # ================================================================================================== # Project: # Exploring the performance of general matrix-multiplication on an NVIDIA Tesla K40m GPU. # # File information: # Institution.... SURFsara # Author......... Cedric Nugteren # Changed at..... 2014-10-30 # License........ MIT license # Tab-size....... 4 spaces # Line length.... 
100 characters
#
# ==================================================================================================

# Read the filename from the command-line (quoted in case the path contains spaces)
file="$1"

# Calculate occurrences of particular instructions in the assembly.
# grep -c counts matching lines directly; the previous 'cat | grep' pipeline was redundant.
FFMA=`grep -c "FFMA" "$file"`
LDS=`grep -c "LDS" "$file"`
STS=`grep -c "STS" "$file"`
SHFL=`grep -c "SHFL" "$file"`
LD=`grep -c "LD[^S]" "$file"`
ST=`grep -c "ST[^S]" "$file"`
MOV=`grep -c "MOV" "$file"`
# Fixed: the original expression also added SUM to itself; SUM was unset (0) on first use,
# but the self-reference was a latent bug if the script were ever sourced twice.
SUM=$((FFMA+LDS+STS+SHFL+LD+ST+MOV))

# Print the resulting statistics to screen
echo ">> Stats on $file:"
echo ">> "
echo ">> FFMA $FFMA"
echo ">> LDS $LDS"
echo ">> STS $STS"
echo ">> SHFL $SHFL"
echo ">> LD $LD"
echo ">> ST $ST"
echo ">> MOV $MOV"
echo ">> "
echo ">> TOTAL=$SUM"

# ==================================================================================================
================================================ FILE: src/clGEMM.cpp ================================================
// =================================================================================================
// Project:
// Exploring the performance of general matrix-multiplication on an NVIDIA Tesla K40m GPU.
//
// File information:
// Institution.... SURFsara
// Author......... Cedric Nugteren
// Changed at..... 2014-11-17
// License........ MIT license
// Tab-size....... 4 spaces
// Line length....
100 characters // // ================================================================================================= // Common include #include "common.h" // Include OpenCL #include // Include kernel constants #include "settings.h" // Forward declaration of the OpenCL error checking function void checkError(cl_int error, int line); // ================================================================================================= // Set the locations of the OpenCL kernel files #define CL_INCLUDE_FILE "src/settings.h" #define CL_KERNEL_FILE "src/kernels.cl" // Determine the location where to output the PTX code #define CL_PTX_FILE "bin/myGEMM.cl.ptx" // Define OpenCL compiler options, such as "-cl-nv-maxrregcount=127" #define COMPILER_OPTIONS "" // ================================================================================================= // Matrix-multiplication using a custom OpenCL SGEMM kernel. This function also copies the input // matrices to the GPU, runs SGEMM, and copies the output matrix back to the CPU. 
void myclblas(float* A, float* B, float* C, int K, int M, int N, int timerID) { // In case of myGEMM10, compute matrix sizes K, M, N as rounded-up to form complete tiles #if KERNEL == 10 int K_XL = CEIL_DIV(K, TSK) * TSK; int M_XL = CEIL_DIV(M, TSM) * TSM; int N_XL = CEIL_DIV(N, TSN) * TSN; #else int K_XL = K; int M_XL = M; int N_XL = N; #endif // Define OpenCL variables cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_device_id devices[MAX_NUM_DEVICES]; cl_uint numDevices = 0; cl_context_properties props[3] = {CL_CONTEXT_PLATFORM, 0, 0}; cl_context context = 0; cl_command_queue queue = 0; cl_event event = NULL; cl_program program = NULL; char deviceName[MAX_DEVICE_NAME]; // Configure the OpenCL environment err = clGetPlatformIDs(1, &platform, NULL); err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices); err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numDevices, devices, NULL); device = devices[CURRENT_DEVICE]; props[1] = (cl_context_properties)platform; context = clCreateContext(props, 1, &device, NULL, NULL, &err); queue = clCreateCommandQueue(context, device, 0, &err); err = clGetDeviceInfo(device, CL_DEVICE_NAME, MAX_DEVICE_NAME, deviceName, NULL); checkError(err,__LINE__); //printf("## %d devices, running on %d: '%s'\n", numDevices, CURRENT_DEVICE, deviceName); // Read the kernel file from disk long sizeHeader, sizeSource; char* header = readKernelFile(CL_INCLUDE_FILE, &sizeHeader); char* source = readKernelFile(CL_KERNEL_FILE, &sizeSource); long size = 2 + sizeHeader + sizeSource; char* code = (char*)malloc(size*sizeof(char)); for (int c=0; c 10) { printf("## Compiler message: %s\n", messages); } free(messages); // Retrieve the PTX code from the OpenCL compiler and output it to disk size_t binSize; err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binSize, NULL); checkError(err,__LINE__); unsigned char *bin = (unsigned char *)malloc(binSize); err = clGetProgramInfo(program, 
CL_PROGRAM_BINARIES, sizeof(unsigned char *), &bin, NULL); checkError(err,__LINE__); FILE* file = fopen(CL_PTX_FILE, "wb"); fwrite(bin, sizeof(char), binSize, file); fclose(file); free(bin); // Prepare OpenCL memory objects cl_mem bufA = clCreateBuffer(context, CL_MEM_READ_ONLY, M*K*sizeof(*A), NULL, &err); cl_mem bufB = clCreateBuffer(context, CL_MEM_READ_ONLY, K*N*sizeof(*B), NULL, &err); cl_mem bufB_TR = clCreateBuffer(context, CL_MEM_READ_ONLY, N*K*sizeof(*B), NULL, &err); cl_mem bufC = clCreateBuffer(context, CL_MEM_READ_WRITE, M*N*sizeof(*C), NULL, &err); checkError(err,__LINE__); // Copy matrices to the GPU (also C to erase the results of the previous run) err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0, M*K*sizeof(*A), A, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufB, CL_TRUE, 0, K*N*sizeof(*B), B, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufC, CL_TRUE, 0, M*N*sizeof(*C), C, 0, NULL, NULL); checkError(err,__LINE__); // Create extra objects for rounded-up sizes (only needed in case of myGEMM10) cl_mem bufA_XL = clCreateBuffer(context, CL_MEM_READ_ONLY, M_XL*K_XL*sizeof(*A), NULL, &err); cl_mem bufB_TR_XL = clCreateBuffer(context, CL_MEM_READ_ONLY, N_XL*K_XL*sizeof(*B), NULL, &err); cl_mem bufC_XL = clCreateBuffer(context, CL_MEM_READ_WRITE, M_XL*N_XL*sizeof(*C), NULL, &err); checkError(err,__LINE__); // Configure the myGEMM kernel char kernelname[100]; sprintf(kernelname, "myGEMM%d", KERNEL); cl_kernel kernel1 = clCreateKernel(program, kernelname, &err); checkError(err,__LINE__); // Set the arguments of the myGEMM kernel #if KERNEL == 10 err = clSetKernelArg(kernel1, 0, sizeof(int), (void*)&M_XL); err = clSetKernelArg(kernel1, 1, sizeof(int), (void*)&N_XL); err = clSetKernelArg(kernel1, 2, sizeof(int), (void*)&K_XL); err = clSetKernelArg(kernel1, 3, sizeof(cl_mem), (void*)&bufA_XL); err = clSetKernelArg(kernel1, 4, sizeof(cl_mem), (void*)&bufB_TR_XL); err = clSetKernelArg(kernel1, 5, sizeof(cl_mem), (void*)&bufC_XL); #else err = 
clSetKernelArg(kernel1, 0, sizeof(int), (void*)&M); err = clSetKernelArg(kernel1, 1, sizeof(int), (void*)&N); err = clSetKernelArg(kernel1, 2, sizeof(int), (void*)&K); err = clSetKernelArg(kernel1, 3, sizeof(cl_mem), (void*)&bufA); #if KERNEL == 5 || KERNEL == 6 || KERNEL == 7 || KERNEL == 8 || KERNEL == 9 err = clSetKernelArg(kernel1, 4, sizeof(cl_mem), (void*)&bufB_TR); #else err = clSetKernelArg(kernel1, 4, sizeof(cl_mem), (void*)&bufB); #endif err = clSetKernelArg(kernel1, 5, sizeof(cl_mem), (void*)&bufC); #endif checkError(err,__LINE__); // Configure the supporting transpose kernel and set its arguments (only for certain myGEMMs) #if KERNEL == 5 || KERNEL == 6 || KERNEL == 7 || KERNEL == 8 || KERNEL == 9 || KERNEL == 10 cl_kernel kernel2 = clCreateKernel(program, "transpose", &err); checkError(err,__LINE__); err = clSetKernelArg(kernel2, 0, sizeof(int), (void*)&K); err = clSetKernelArg(kernel2, 1, sizeof(int), (void*)&N); err = clSetKernelArg(kernel2, 2, sizeof(cl_mem), (void*)&bufB); err = clSetKernelArg(kernel2, 3, sizeof(cl_mem), (void*)&bufB_TR); checkError(err,__LINE__); const size_t tLocal[2] = { TRANSPOSEX, TRANSPOSEY }; const size_t tGlobal[2] = { (size_t)K, (size_t)N }; #endif // Configure the supporting padding kernels and set their arguments (only for myGEMM10) #if KERNEL == 10 cl_kernel kernel3a = clCreateKernel(program, "paddingAddZeroes", &err); checkError(err,__LINE__); err = clSetKernelArg(kernel3a, 0, sizeof(int), (void*)&M); err = clSetKernelArg(kernel3a, 1, sizeof(int), (void*)&K); err = clSetKernelArg(kernel3a, 2, sizeof(cl_mem), (void*)&bufA); err = clSetKernelArg(kernel3a, 3, sizeof(int), (void*)&M_XL); err = clSetKernelArg(kernel3a, 4, sizeof(int), (void*)&K_XL); err = clSetKernelArg(kernel3a, 5, sizeof(cl_mem), (void*)&bufA_XL); checkError(err,__LINE__); cl_kernel kernel3b = clCreateKernel(program, "paddingAddZeroes", &err); checkError(err,__LINE__); err = clSetKernelArg(kernel3b, 0, sizeof(int), (void*)&N); err = 
clSetKernelArg(kernel3b, 1, sizeof(int), (void*)&K); err = clSetKernelArg(kernel3b, 2, sizeof(cl_mem), (void*)&bufB_TR); err = clSetKernelArg(kernel3b, 3, sizeof(int), (void*)&N_XL); err = clSetKernelArg(kernel3b, 4, sizeof(int), (void*)&K_XL); err = clSetKernelArg(kernel3b, 5, sizeof(cl_mem), (void*)&bufB_TR_XL); checkError(err,__LINE__); cl_kernel kernel3c = clCreateKernel(program, "paddingRemoveZeroes", &err); checkError(err,__LINE__); err = clSetKernelArg(kernel3c, 0, sizeof(int), (void*)&M_XL); err = clSetKernelArg(kernel3c, 1, sizeof(int), (void*)&N_XL); err = clSetKernelArg(kernel3c, 2, sizeof(cl_mem), (void*)&bufC_XL); err = clSetKernelArg(kernel3c, 3, sizeof(int), (void*)&M); err = clSetKernelArg(kernel3c, 4, sizeof(int), (void*)&N); err = clSetKernelArg(kernel3c, 5, sizeof(cl_mem), (void*)&bufC); checkError(err,__LINE__); const size_t pLocal[2] = { PADDINGX, PADDINGY }; const size_t pAGlobal[2] = { (size_t)M_XL, (size_t)K_XL }; const size_t pBGlobal[2] = { (size_t)N_XL, (size_t)K_XL }; const size_t pCGlobal[2] = { (size_t)M, (size_t)N }; #endif // Configure the thread/work-group dimensions of the myGEMM kernel #if KERNEL == 1 || KERNEL == 2 const size_t local[2] = { TS, TS }; const size_t global[2] = { (size_t)M, (size_t)N }; #elif KERNEL == 3 || KERNEL == 5 const size_t local[2] = { TS, TS/WPT }; const size_t global[2] = { (size_t)M, (size_t)(N/WPT) }; #elif KERNEL == 4 const size_t local[2] = { TS/WIDTH, TS }; const size_t global[2] = { (size_t)(M/WIDTH), (size_t)N }; #elif KERNEL == 6 || KERNEL == 7 || KERNEL == 8 || KERNEL == 9 const size_t local[2] = { TSM/WPTM, TSN/WPTN }; const size_t global[2] = { (size_t)(M/WPTM), (size_t)(N/WPTN) }; #elif KERNEL == 10 const size_t local[2] = { TSM/WPTM, TSN/WPTN }; const size_t global[2] = { (size_t)(M_XL/WPTM), (size_t)(N_XL/WPTN) }; #elif KERNEL == 11 const size_t local[2] = { THREADSX, THREADSY }; const size_t global[2] = { (size_t)(M/RX), (size_t)(N/RY) }; #endif // Start the timed loop double startTime = 
timer(); for (int r=0; r // Author......... Cedric Nugteren // Changed at..... 2014-11-06 // License........ MIT license // Tab-size....... 4 spaces // Line length.... 100 characters // // ================================================================================================= // Replace the OpenCL keywords with CUDA equivalent #define __kernel __placeholder__ #define __global #define __placeholder__ __global__ #define __local __shared__ #define restrict __restrict__ // Replace OpenCL synchronisation with CUDA synchronisation #define barrier(x) __syncthreads() // Replace the OpenCL get_xxx_ID with CUDA equivalents __device__ int get_local_id(int x) { return (x == 0) ? threadIdx.x : threadIdx.y; } __device__ int get_group_id(int x) { return (x == 0) ? blockIdx.x : blockIdx.y; } __device__ int get_global_id(int x) { return (x == 0) ? blockIdx.x*blockDim.x + threadIdx.x : blockIdx.y*blockDim.y + threadIdx.y; } // Add the float8 data-type which is not available natively under CUDA typedef struct { float s0; float s1; float s2; float s3; float s4; float s5; float s6; float s7; } float8; // ================================================================================================= ================================================ FILE: src/common.h ================================================ // ================================================================================================= // Project: // Exploring the performance of general matrix-multiplication on an NVIDIA Tesla K40m GPU. // // File information: // Institution.... SURFsara // Author......... Cedric Nugteren // Changed at..... 2014-11-17 // License........ MIT license // Tab-size....... 4 spaces // Line length.... 
100 characters // // ================================================================================================= // Common C includes #include #include #include #include #include #include // ================================================================================================= // Repeat all kernels multiple times to get an average timing result #define NUM_RUNS 4 // Squared matrices are tested within a certain range (e.g. 1024x1024, 2048x2048, 4096x4096) #define MINSIZE (1024) #define MAXSIZE (4*1024) // Set the alpha and beta values for the cuBLAS and clBlas libraries. Note that the myGEMM kernels // for simplicity only support alpha values of 1 and beta values of 0. #define ALPHA 1.0f #define BETA 0.0f // Define the current GPU's parameters #define GPU_NAME "Tesla K40m" #define GPU_CLOCK 0.745 // Core clock in GHz #define GPU_CORES 2880 // Total number of CUDA cores #define GPU_MOD 2 // Fused multiply-add // OpenCL settings #define MAX_NUM_DEVICES 16 #define MAX_DEVICE_NAME 1024 #define CURRENT_DEVICE 0 // ================================================================================================= // Timer structure typedef struct { double t; // Time int long long kf; // KFlops } profile_t; // Number of timers #define NUM_TIMERS 10 // Global variable holding the timing results extern profile_t timers[NUM_TIMERS]; // ================================================================================================= // Forward declarations of BLAS functions void libcublas(float* A, float* B, float* C, int K, int M, int N, int timerID); void libclblas(float* A, float* B, float* C, int K, int M, int N, int timerID); void mycublas(float* A, float* B, float* C, int K, int M, int N, int timerID); void myclblas(float* A, float* B, float* C, int K, int M, int N, int timerID); // Forward declarations of the timer functions double timer(void); double wtime(profile_t timer); double gflops(profile_t timer); // Other forward declarations char* 
readKernelFile(const char* filename, long* _size); // ================================================================================================= ================================================ FILE: src/cuGEMM.cu ================================================ // ================================================================================================= // Project: // Exploring the performance of general matrix-multiplication on an NVIDIA Tesla K40m GPU. // // File information: // Institution.... SURFsara // Author......... Cedric Nugteren // Changed at..... 2014-11-06 // License........ MIT license // Tab-size....... 4 spaces // Line length.... 100 characters // // ================================================================================================= // Common include #include "common.h" // Include kernel constants #include "settings.h" // ================================================================================================= // Configuration settings for the CUDA version (comment out if not desired) #define USE_LDG // Whether to use the __ldg() intrinsic //#define USE_SHUFFLE // Whether to use warp-shuffle instructions // Include the OpenCL-to-CUDA header and the OpenCL kernel-code #include "cl_to_cuda.h" #include "kernels.cl" // ================================================================================================= // Matrix-multiplication using a custom CUDA SGEMM kernel. This function also copies the input // matrices to the GPU, runs SGEMM, and copies the output matrix back to the CPU. 
void mycublas(float* A, float* B, float* C, int K, int M, int N, int timerID) { // In case of myGEMM10, compute matrix sizes K, M, N as rounded-up to form complete tiles #if KERNEL == 10 int K_XL = CEIL_DIV(K, TSK) * TSK; int M_XL = CEIL_DIV(M, TSM) * TSM; int N_XL = CEIL_DIV(N, TSN) * TSN; #else int K_XL = K; int M_XL = M; int N_XL = N; #endif // Prepare CUDA memory objects float* bufA = 0; float* bufB = 0; float* bufB_TR = 0; // This is the transposed version of B float* bufC = 0; cudaMalloc((void**)&bufA, M*K*sizeof(*A)); cudaMalloc((void**)&bufB, K*N*sizeof(*B)); cudaMalloc((void**)&bufB_TR, N*K*sizeof(*B)); cudaMalloc((void**)&bufC, M*N*sizeof(*C)); // Copy matrices to the GPU (memset C to erase the results of the previous run) cudaMemcpy((void*)bufA, (void*)A, M*K*sizeof(*A), cudaMemcpyHostToDevice); cudaMemcpy((void*)bufB, (void*)B, K*N*sizeof(*B), cudaMemcpyHostToDevice); cudaMemset((void*)bufC, 0.0, M*N*sizeof(*C)); // Create extra objects for rounded-up sizes (only needed in case of myGEMM10) float* bufA_XL = 0; float* bufB_TR_XL = 0; float* bufC_XL = 0; cudaMalloc((void**)&bufA_XL, M_XL*K_XL*sizeof(*A)); cudaMalloc((void**)&bufB_TR_XL, K_XL*N_XL*sizeof(*B)); cudaMalloc((void**)&bufC_XL, M_XL*N_XL*sizeof(*C)); // Configure the local memory (banks of 8 bytes, 48KB local memory) cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte); cudaDeviceSetCacheConfig(cudaFuncCachePreferShared); // Configure the thread/threadblock dimensions of the transpose kernel (only for certain myGEMMs) #if KERNEL == 5 || KERNEL == 6 || KERNEL == 7 || KERNEL == 8 || KERNEL == 9 || KERNEL == 10 dim3 blocksTRP(CEIL_DIV(K,TRANSPOSEX), CEIL_DIV(N,TRANSPOSEY)); dim3 threadsTRP(TRANSPOSEX, TRANSPOSEY); #endif // Configure the thread/threadblock dimensions of the padding kernels (only for myGEMM10) #if KERNEL == 10 dim3 blocksA(CEIL_DIV(M_XL,PADDINGX), CEIL_DIV(K_XL,PADDINGY)); dim3 threadsA(PADDINGX, PADDINGY); dim3 blocksB(CEIL_DIV(N_XL,PADDINGX), CEIL_DIV(K_XL,PADDINGY)); dim3 
threadsB(PADDINGX, PADDINGY); dim3 blocksC(CEIL_DIV(M,PADDINGX), CEIL_DIV(N,PADDINGY)); dim3 threadsC(PADDINGX, PADDINGY); #endif // Configure the thread/threadblock dimensions of the myGEMM kernel #if KERNEL == 1 || KERNEL == 2 dim3 blocks(M/TS, N/TS); dim3 threads(TS, TS); #elif KERNEL == 3 || KERNEL == 5 dim3 blocks(M/TS, N/TS); dim3 threads(TS, TS/WPT); #elif KERNEL == 4 dim3 blocks(M/TS, N/TS); dim3 threads(TS/WIDTH, TS); #elif KERNEL == 6 || KERNEL == 7 || KERNEL == 8 || KERNEL == 9 dim3 blocks(M/TSM, N/TSN); dim3 threads(TSM/WPTM, TSN/WPTN); #elif KERNEL == 10 dim3 blocks(M_XL/TSM, N_XL/TSN); dim3 threads(TSM/WPTM, TSN/WPTN); #elif KERNEL == 11 dim3 blocks(M/(THREADSX*RX), N/(THREADSY*RY)); dim3 threads(THREADSX, THREADSY); #endif // Start the timed loop double startTime = timer(); for (int r=0; r>>(K, N, bufB, bufB_TR); #endif // Make the inputs extra large with padded zeros #if KERNEL == 10 paddingAddZeroes<<>>(M, K, bufA, M_XL, K_XL, bufA_XL); paddingAddZeroes<<>>(N, K, bufB_TR, N_XL, K_XL, bufB_TR_XL); #endif // Run the myGEMM kernel #if KERNEL == 1 myGEMM1<<>>(M, N, K, bufA, bufB, bufC); #elif KERNEL == 2 myGEMM2<<>>(M, N, K, bufA, bufB, bufC); #elif KERNEL == 3 myGEMM3<<>>(M, N, K, bufA, bufB, bufC); #elif KERNEL == 4 myGEMM4<<>>(M, N, K, (floatX*)bufA, (floatX*)bufB, (floatX*)bufC); #elif KERNEL == 5 myGEMM5<<>>(M, N, K, bufA, bufB_TR, bufC); #elif KERNEL == 6 myGEMM6<<>>(M, N, K, bufA, bufB_TR, bufC); #elif KERNEL == 7 myGEMM7<<>>(M, N, K, (floatX*)bufA, (floatX*)bufB_TR, bufC); #elif KERNEL == 8 myGEMM8<<>>(M, N, K, (floatX*)bufA, (floatX*)bufB_TR, bufC); #elif KERNEL == 9 myGEMM9<<>>(M, N, K, (floatX*)bufA, (floatX*)bufB_TR, bufC); #elif KERNEL == 10 myGEMM10<<>>(M_XL, N_XL, K_XL, (floatX*)bufA_XL, (floatX*)bufB_TR_XL, bufC_XL); #elif KERNEL == 11 myGEMM11<<>>(M, N, K, (floatA*)bufA, (floatB*)bufB, (floatC*)bufC); #endif // Remove padded zeroes from the larger output #if KERNEL == 10 paddingRemoveZeroes<<>>(M_XL, N_XL, bufC_XL, M, N, bufC); #endif 
// Wait for calculations to be finished cudaDeviceSynchronize(); } // End the timed loop timers[timerID].t += (timer() - startTime) / (double)NUM_RUNS; timers[timerID].kf += ((long)K * (long)M * (long)N * 2) / 1000; // Copy the output matrix C back to the CPU memory cudaMemcpy((void*)C, (void*)bufC, M*N*sizeof(*C), cudaMemcpyDeviceToHost); // Free the GPU memory objects cudaFree(bufA); cudaFree(bufB); cudaFree(bufB_TR); cudaFree(bufC); cudaFree(bufA_XL); cudaFree(bufB_TR_XL); cudaFree(bufC_XL); } // ================================================================================================= ================================================ FILE: src/kernels.cl ================================================ // ================================================================================================= // Project: // Exploring the performance of general matrix-multiplication on an NVIDIA Tesla K40m GPU. // // File information: // Institution.... SURFsara // Author......... Cedric Nugteren // Changed at..... 2014-11-06 // License........ MIT license // Tab-size....... 4 spaces // Line length.... 100 characters // // ================================================================================================= // // Matrices in column-major format // A: K columns, M rows // B: N columns, K rows // C: N columns, M rows // // N // o-----o // | | // K | [B] | // | | // o-----o // K N // o-------o o-----o // M | [A] | M | [C] | // | | | | // o-------o o-----o // // // C-code for column-major matrix multiplication with alpha=1 and beta=0: // // for (int m=0; m // Author......... Cedric Nugteren // Changed at..... 2014-11-10 // License........ MIT license // Tab-size....... 4 spaces // Line length.... 
100 characters // // ================================================================================================= // Common include #include "common.h" // Include OpenCL and clBlas #include // ================================================================================================= // Matrix-multiplication using the clBlas library. This function copies the input matrices to the // GPU, runs SGEMM, and copies the output matrix back to the CPU. void libclblas(float* A, float* B, float* C, int K, int M, int N, int timerID) { cl_int err; // Define OpenCL variables cl_platform_id platform = 0; cl_device_id device = 0; cl_device_id devices[MAX_NUM_DEVICES]; cl_uint numDevices = 0; cl_context_properties props[3] = {CL_CONTEXT_PLATFORM, 0, 0}; cl_context ctx = 0; cl_command_queue queue = 0; cl_event event = NULL; char deviceName[MAX_DEVICE_NAME]; // Configure the OpenCL environment err = clGetPlatformIDs(1, &platform, NULL); err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices); err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numDevices, devices, NULL); device = devices[CURRENT_DEVICE]; props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); queue = clCreateCommandQueue(ctx, device, 0, &err); err = clGetDeviceInfo(device, CL_DEVICE_NAME, MAX_DEVICE_NAME, deviceName, NULL); //printf("## %d devices, running on %d: '%s'\n", numDevices, CURRENT_DEVICE, deviceName); // Configure clBlas err = clblasSetup(); // Prepare OpenCL memory objects cl_mem bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, M*K*sizeof(*A), NULL, &err); cl_mem bufB = clCreateBuffer(ctx, CL_MEM_READ_ONLY, K*N*sizeof(*B), NULL, &err); cl_mem bufC = clCreateBuffer(ctx, CL_MEM_READ_WRITE, M*N*sizeof(*C), NULL, &err); // Copy matrices to the GPU (also C to erase the results of the previous run) err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0, M*K*sizeof(*A), A, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufB, CL_TRUE, 0, 
K*N*sizeof(*B), B, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufC, CL_TRUE, 0, M*N*sizeof(*C), C, 0, NULL, NULL); // Run one (small) instance of clBlas first to pre-generate and compile the kernel err = clblasSgemm(clblasColumnMajor, clblasNoTrans, clblasNoTrans, 128, 128, 128, ALPHA, bufA, 0, 128, bufB, 0, 128, BETA, bufC, 0, 128, 1, &queue, 0, NULL, &event); err = clWaitForEvents(1, &event); // Start the timed loop double startTime = timer(); for (int r=0; r // Author......... Cedric Nugteren // Changed at..... 2014-10-30 // License........ MIT license // Tab-size....... 4 spaces // Line length.... 100 characters // // ================================================================================================= // Common include #include "common.h" // Include CUDA and cuBLAS (API v2) #include // ================================================================================================= // Matrix-multiplication using the cuBLAS library. This function copies the input matrices to the // GPU, runs SGEMM, and copies the output matrix back to the CPU. void libcublas(float* A, float* B, float* C, int K, int M, int N, int timerID) { // cuBLAS configuration cublasStatus_t status; cublasHandle_t handle; status = cublasCreate(&handle); // Prepare CUDA memory objects float* bufA = 0; float* bufB = 0; float* bufC = 0; cudaMalloc((void**)&bufA, M*K*sizeof(*A)); cudaMalloc((void**)&bufB, K*N*sizeof(*B)); cudaMalloc((void**)&bufC, M*N*sizeof(*C)); // Copy matrices to the GPU (also C to erase the results of the previous run) cudaMemcpy((void*)bufA, (void*)A, M*K*sizeof(*A), cudaMemcpyHostToDevice); cudaMemcpy((void*)bufB, (void*)B, K*N*sizeof(*B), cudaMemcpyHostToDevice); cudaMemcpy((void*)bufC, (void*)C, M*N*sizeof(*C), cudaMemcpyHostToDevice); // Configure SGEMM float alpha = ALPHA; float beta = BETA; // Start the timed loop double startTime = timer(); for (int r=0; r // Author......... Cedric Nugteren // Changed at..... 2014-11-10 // License........ 
MIT license // Tab-size....... 4 spaces // Line length.... 100 characters // // ================================================================================================= // Common include #include "common.h" // Global variable with timing results profile_t timers[NUM_TIMERS]; // ================================================================================================= // Main function. This takes care of creating matrices of various sizes and iterating over the // different types of BLAS libraries. It also computes the error rate in terms of the L2-norm with // respect to cuBLAS (the 'golden' reference). int main(int argc, char* argv[]) { // Start of the function printf("\n##\n"); srand(time(NULL)); // Compute the peak performance of the GPU double peak = GPU_CLOCK * GPU_CORES * GPU_MOD; // Print information about the different configurations printf("## --- Configurations ---\n"); for (int c=0; c<=3; c++) { #ifndef ENABLE_CUDA if (c == 0 || c == 2) { continue; } #endif switch(c) { case 0: printf("## cuBLAS on '%s', peak: %.1lf GFLOPS\n", GPU_NAME, peak); break; case 1: printf("## clBlas on '%s', peak: %.1lf GFLOPS\n", GPU_NAME, peak); break; case 2: printf("## myGEMM.cu on '%s', peak: %.1lf GFLOPS\n", GPU_NAME, peak); break; case 3: printf("## myGEMM.cl on '%s', peak: %.1lf GFLOPS\n", GPU_NAME, peak); break; } } // Loop over the different input/output matrix sizes for (int size=MINSIZE; size<=MAXSIZE; size=size*2) { // Set the performance counters to zero for (int t=0; t %6.1lf GFLOPS (%2.0lf%%), L2 norm: %.2e\n", name, seconds, performance, fraction, L2norm); } // Free up the matrices free(A); free(B); free(C); free(goldC); } // End of the program printf("##\n"); printf("\n"); return 0; } // ================================================================================================= // Timer function: Measure the current time double timer(void) { struct timeval Tvalue; struct timezone dummy; gettimeofday(&Tvalue, &dummy); double etime = 
(double)Tvalue.tv_sec + 1.0e-6*((double)Tvalue.tv_usec); return etime; //return omp_get_wtime(); } // Timer function: Get the execution time double wtime(profile_t timer) { return (timer.t); } // Timer function: Get the GFLOPS number double gflops(profile_t timer) { return ((double)timer.kf/(1000.0*1000.0)) / (timer.t); } // ================================================================================================= // Load an OpenCL kernel from file char* readKernelFile(const char* filename, long* _size) { // Open the file FILE* file = fopen(filename, "r"); if (!file) { printf("-- Error opening file %s\n", filename); exit(1); } // Get its size fseek(file, 0, SEEK_END); long size = ftell(file); rewind(file); // Read the kernel code as a string char* source = (char *)malloc((size+1)*sizeof(char)); fread(source, 1, size*sizeof(char), file); source[size] = '\0'; fclose(file); // Save the size and return the source string *_size = (size+1); return source; } // ================================================================================================= ================================================ FILE: src/settings.h ================================================ // ================================================================================================= // Project: // Exploring the performance of general matrix-multiplication on an NVIDIA Tesla K40m GPU. // // File information: // Institution.... SURFsara // Author......... Cedric Nugteren // Changed at..... 2014-11-07 // License........ MIT license // Tab-size....... 4 spaces // Line length.... 100 characters // // ================================================================================================= // Select a kernel #define KERNEL 8 // Constants for kernels 1 -- 5 #define TS 32 // The square-root of the 2D tile-size (== work-group dims) // Constants for kernels 3, 5 #define WPT 8 // The amount of work-per-thread, i.e. 
// the thread-coarsening factor
#define RTS (TS/WPT) // The reduced tile-size in one dimension

// Constants for kernels 4, 7 -- 10
#define WIDTH 4      // The vector-width (in number of floats)

// Constants for kernel 5
#define TSDK 16      // The tile-size in dimension K (for kernel 5 only)
#define LPT ((TSDK*WPT)/(TS)) // The amount of loads-per-thread (assume TSN==TSM)

// Constants for kernels 6 -- 10
#define TSM 128                // The tile-size in dimension M
#define TSN 128                // The tile-size in dimension N
#define TSK 16                 // The tile-size in dimension K
#define WPTM 8                 // The amount of work-per-thread in dimension M
#define WPTN 8                 // The amount of work-per-thread in dimension N
#define RTSM (TSM/WPTM)        // The reduced tile-size in dimension M (== number of threads)
#define RTSN (TSN/WPTN)        // The reduced tile-size in dimension N (== number of threads)
#define LPTA ((TSK*WPTM*WPTN)/(TSN)) // The amount of loads-per-thread for A
#define LPTB ((TSK*WPTM*WPTN)/(TSM)) // The amount of loads-per-thread for B

// Constraints on settings for kernels 6 -- 10
// Note: TSM/WPTM has to be integer
// Note: TSN/WPTN has to be integer
// Note: TSM/WIDTH has to be integer
// Note: TSN/WIDTH has to be integer
// Note: (TSK*WPTM*WPTN)/(TSN*WIDTH) has to be integer
// Note: (TSK*WPTM*WPTN)/(TSM*WIDTH) has to be integer

// Constants for kernel 11 (mimicing clBlas)
#define THREADSX 8
#define THREADSY 8
#define RX 8
#define RY 4
#define RK (RY)

// Constants for the supporting transpose kernel
#define TRANSPOSEX 16
#define TRANSPOSEY 16

// Constants for the supporting padding kernels
#define PADDINGX 16
#define PADDINGY 16

// Macros for host and kernel code.
// Fully parenthesised so they expand safely inside larger expressions such as
// `2*MIN(a,b)` or `1+MAX(a,b)` — the unparenthesised ternary otherwise binds
// to the surrounding operators (`?:` has very low precedence in C).
// NOTE: arguments are still evaluated twice; kept as macros (not inline
// functions) because they are shared with OpenCL kernel code.
#define MIN(a,b) (((a) > (b)) ? (b) : (a))
#define MAX(a,b) (((a) > (b)) ? (a) : (b))
#define CEIL_DIV(x,y) (((x) + (y) - 1) / (y))
#define MOD2(x,y) ((x) % (y))
#define DIV2(x,y) ((x) / (y))

// =================================================================================================