Repository: CNugteren/myGEMM Branch: master Commit: e2a364537f2b Files: 15 Total size: 107.1 KB Directory structure: gitextract_6alhd5yi/ ├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── extra/ │ └── minimal.cpp ├── scripts/ │ └── stats.sh └── src/ ├── clGEMM.cpp ├── cl_to_cuda.h ├── common.h ├── cuGEMM.cu ├── kernels.cl ├── libclblas.cpp ├── libcublas.cu ├── main.cpp └── settings.h ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ bin/ obj/ ================================================ FILE: LICENSE ================================================ The MIT License (MIT) Copyright (c) 2014 SURFsara Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
================================================ FILE: Makefile ================================================
# ==================================================================================================
# Project:
# Exploring the performance of general matrix-multiplication on an NVIDIA Tesla K40m GPU.
#
# File information:
# Institution.... SURFsara
# Author......... Cedric Nugteren
# Changed at..... 2014-11-07
# License........ MIT license
# Tab-size....... 4 spaces
# Line length.... 100 characters
#
# ==================================================================================================

# Set the location of CUDA, OpenCL and clBlas
CUDADIR := $(CUDA_HOME)
OPENCLDIR := $(CUDA_HOME)
CLBLASDIR := $(CLBLAS_HOME)

# Disable all CUDA components (including cuBLAS) in the code to run on a non-NVIDIA system
ENABLE_CUDA := 1

# ==================================================================================================

# Compilers
CXX := g++
NVCC := nvcc

# Compiler flags (+= so users can extend them from the command line)
CXXFLAGS += -O3 -Wall
NVFLAGS += -O3 -arch=sm_35 -Xcompiler -Wall
#NVFLAGS += -maxrregcount 127

# Folders
SRCDIR := src
BINDIR := bin
OBJDIR := obj
SCRDIR := scripts

# Disable/enable CUDA in the C++ code
ifeq ($(ENABLE_CUDA),1)
DEFINES += -DENABLE_CUDA
endif

# Load OpenCL and the clBlas library
INCLUDES += -I$(OPENCLDIR)/include -I$(CLBLASDIR)/include
LDFLAGS += -L$(OPENCLDIR)/lib64 -L$(CLBLASDIR)/lib64
LDFLAGS += -lOpenCL -lclBLAS

# Load CUDA and the cuBLAS library
ifeq ($(ENABLE_CUDA),1)
INCLUDES += -I$(CUDADIR)/include
LDFLAGS += -L$(CUDADIR)/lib64
LDFLAGS += -lcuda -lcudart -lcublas
endif

# Set the source files
CPPSOURCES := main.cpp clGEMM.cpp libclblas.cpp
GPUSOURCES := cuGEMM.cu libcublas.cu

# Define the names of the object files and the binary
OBJS := $(CPPSOURCES:%.cpp=$(OBJDIR)/%.cpp.o)
ifeq ($(ENABLE_CUDA),1)
OBJS += $(GPUSOURCES:%.cu=$(OBJDIR)/%.cu.o)
endif
BIN := $(BINDIR)/myGEMM

# ==================================================================================================

# All (default target)
all: build run

# Build the binary from the objects. 'build' is a phony alias for the real file target $(BIN),
# so the binary is only re-linked when one of the object files actually changed.
build: $(BIN)

$(BIN): $(OBJS)
	@mkdir -p $(BINDIR)
	$(CXX) $(CXXFLAGS) $(DEFINES) $(INCLUDES) $^ $(LDFLAGS) -o $@

# C++ sources (depend on all headers so a header edit triggers a recompile)
$(OBJDIR)/%.cpp.o: $(SRCDIR)/%.cpp $(SRCDIR)/*.h
	@mkdir -p $(OBJDIR)
	$(CXX) -c $(CXXFLAGS) $(DEFINES) $(INCLUDES) $< -o $@

# CUDA sources (also depend on the .cl kernels, which are #included into the .cu file)
$(OBJDIR)/%.cu.o: $(SRCDIR)/%.cu $(SRCDIR)/*.h $(SRCDIR)/*.cl
	@mkdir -p $(OBJDIR)
	$(NVCC) -c $(NVFLAGS) $(DEFINES) $(INCLUDES) $< -o $@

# Generate assembly code from the kernels and print some statistics.
# Depends on 'build' because cuobjdump inspects the linked binary $(BIN).
inspect: build
	$(NVCC) -cubin $(NVFLAGS) -Xptxas -v $(INCLUDES) $(SRCDIR)/cuGEMM.cu -o $(BIN).cu.cubin
	nvdisasm -lrm narrow $(BIN).cu.cubin > $(BIN).cu.asm
	cuobjdump $(BIN) -xptx cuGEMM
	mv cuGEMM.sm_35.ptx $(BIN).cu.ptx
	cuobjdump $(BIN) -sass > $(BIN).cu.sass
	sh $(SCRDIR)/stats.sh $(BIN).cu.sass

# Execute the binary. Depends on 'build' so 'make run' works standalone and so that
# 'make -j all' cannot start the run before linking has finished.
run: build
	./$(BIN)

# Clean-up (only removes files this Makefile created)
clean:
	rm -f $(OBJDIR)/*.o
	rm -f $(BIN)
	rm -f $(BIN).*

# ==================================================================================================

# None of these targets name a real file; without .PHONY a stray file called e.g. 'build'
# or 'clean' would silently disable the target.
.PHONY: all build run inspect clean

# ==================================================================================================
================================================ FILE: README.md ================================================
Exploring the performance of SGEMM in OpenCL on NVIDIA GPUs
=============
Date: 31-Oct-2014 - 07-Nov-2014
Author: Cedric Nugteren, SURFsara (http://www.surfsara.nl)

This repository contains multiple OpenCL implementations of single-precision generalised
matrix-multiplication (SGEMM) tuned for an NVIDIA Tesla K40m GPU. The different versions (named
myGEMM) are part of a step-by-step tutorial, in which each step adds a new optimisation. The
different steps and the details of the OpenCL kernel codes are all explained in depth at
https://cnugteren.github.io/tutorial/pages/page1.html. The OpenCL kernels can be used natively
using the OpenCL framework.
However, there is also a header-file included which converts the OpenCL kernels into CUDA syntax. This allows the same code to be tested through the CUDA-toolchain. Apart from the OpenCL kernel codes, this repository contains fully working host code, including a loop over different matrix sizes and different BLAS libraries. It contains code to run NVIDIA's cuBLAS as a reference and the open-source clBlas library. Pre-requisites: * A C++ compiler (tested with GCC and ICC) * The CUDA toolkit and NVCC compiler (tested with version 6.5) * OpenCL headers and libraries (part of the CUDA toolkit) Requirements to run the performance and correctness comparisons: * The cuBLAS library (part of the CUDA toolkit, tested version 6.5) * The open-source clBlas library (tested 2.2.0) Usage ============= * Compile the code: make build Compiles the benchmarking infrastructure and the myGEMM kernels. Make sure there is a "bin" and "obj" directory available. Note that you might have to edit the Makefile to set the proper locations of the CUDA and OpenCL installations on your system. * Run the code: make run This runs the code for matrices ranging from MINSIZE to MAXSIZE (defined in src/common.h). It will run cuBLAS, clBlas, and the CUDA and OpenCL versions of the myGEMM kernels. The particular kernel to be executed is defined using the KERNEL keyword in src/settings.h. This file also contains other settings you might want to modify for your particular GPU. * Inspect the code: make inspect This generates all kinds of assembly-like versions of the CUDA kernels in the "bin" subdirectory. It also prints out statistics of the kernels such as the register usage. Minimal working example ============= Additionally, we supply the minimal.cpp file in the 'extra' directory. This file is a self-contained minimal working example (MWE) of the most basic SGEMM kernel (myGEMM1). This can be useful if you don't want to deal with Makefiles or don't have the CUDA, cuBLAS, or clBlas installed. 
Note that minimal.cpp misses some features compared to the main code, but we believe that it can nevertheless be a good starting point if you want to integrate myGEMM into your own code. The code can be compiled using a regular C++ compiler and only requires OpenCL installed. Example compilation from the root folder: g++ -O3 -Wall -I/path/to/opencl/include extra/minimal.cpp -o bin/minimal -lOpenCL Be aware that the minimal working example does not: * Iterate over multiple matrix sizes * Compare performance with cuBLAS or clBlas * Check for correctness of the results * Check for OpenCL errors * Load a kernel-file from disk, instead it is embedded as a string ################################################### ================================================ FILE: extra/minimal.cpp ================================================ // ================================================================================================= // Project: // Exploring the performance of general matrix-multiplication on an NVIDIA Tesla K40m GPU. // // File information: // Institution.... SURFsara // Author......... Cedric Nugteren // Changed at..... 2014-11-07 // License........ MIT license // Tab-size....... 4 spaces // Line length.... 100 characters // // Compilation example: // g++ -O3 -I$OPENCL_DIR/include minimal.cpp -o minimal -lOpenCL // // ================================================================================================= // Includes #include #include #include // ================================================================================================= // Repeat all kernels multiple times to get an average timing result #define NUM_RUNS 2 // Size of the matrices - K, M, N (squared) #define SIZE 4096 // Threadblock sizes (e.g. 
for kernels myGEMM1 or myGEMM2) #define TS 32 // ================================================================================================= // Set the kernel as a string (better to do this in a separate file though) const char *kernelstring = "__kernel void myGEMM1(const int M, const int N, const int K," " const __global float* A," " const __global float* B," " __global float* C) {" " const int globalRow = get_global_id(0);" " const int globalCol = get_global_id(1);" " float acc = 0.0f;" " for (int k=0; k>> Initializing OpenCL...\n"); cl_platform_id platform = 0; clGetPlatformIDs(1, &platform, NULL); cl_device_id device = 0; clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL); cl_command_queue queue = clCreateCommandQueue(context, device, 0, NULL); char deviceName[1024]; clGetDeviceInfo(device, CL_DEVICE_NAME, 1024, deviceName, NULL); cl_event event = NULL; // Compile the kernel cl_program program = clCreateProgramWithSource(context, 1, &kernelstring, NULL, NULL); clBuildProgram(program, 0, NULL, "", NULL, NULL); // Check for compilation errors size_t logSize; clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize); char* messages = (char*)malloc((1+logSize)*sizeof(char)); clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, logSize, messages, NULL); messages[logSize] = '\0'; if (logSize > 10) { printf(">>> Compiler message: %s\n", messages); } free(messages); // Prepare OpenCL memory objects cl_mem bufA = clCreateBuffer(context, CL_MEM_READ_ONLY, M*K*sizeof(float), NULL, NULL); cl_mem bufB = clCreateBuffer(context, CL_MEM_READ_ONLY, K*N*sizeof(float), NULL, NULL); cl_mem bufC = clCreateBuffer(context, CL_MEM_READ_WRITE, M*N*sizeof(float), NULL, NULL); // Copy matrices to the GPU clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0, M*K*sizeof(float), A, 0, NULL, NULL); clEnqueueWriteBuffer(queue, bufB, CL_TRUE, 0, K*N*sizeof(float), B, 0, NULL, 
NULL); clEnqueueWriteBuffer(queue, bufC, CL_TRUE, 0, M*N*sizeof(float), C, 0, NULL, NULL); // Configure the myGEMM kernel and set its arguments cl_kernel kernel = clCreateKernel(program, "myGEMM1", NULL); clSetKernelArg(kernel, 0, sizeof(int), (void*)&M); clSetKernelArg(kernel, 1, sizeof(int), (void*)&N); clSetKernelArg(kernel, 2, sizeof(int), (void*)&K); clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&bufA); clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&bufB); clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&bufC); // Start the timed loop printf(">>> Starting %d myGEMM runs...\n", NUM_RUNS); gettimeofday(&Tvalue, &dummy); double starttime = (double)Tvalue.tv_sec + 1.0e-6*((double)Tvalue.tv_usec); for (int r=0; r>> Done: took %.3lf seconds per run, %.1lf GFLOPS\n", runtime, gflop/runtime); // Copy the output matrix C back to the CPU memory clEnqueueReadBuffer(queue, bufC, CL_TRUE, 0, M*N*sizeof(float), C, 0, NULL, NULL); // Free the OpenCL memory objects clReleaseMemObject(bufA); clReleaseMemObject(bufB); clReleaseMemObject(bufC); // Clean-up OpenCL clReleaseCommandQueue(queue); clReleaseContext(context); clReleaseProgram(program); clReleaseKernel(kernel); // Free the host memory objects free(A); free(B); free(C); // Exit return 0; } // ================================================================================================= ================================================ FILE: scripts/stats.sh ================================================ #!/bin/bash # ================================================================================================== # Project: # Exploring the performance of general matrix-multiplication on an NVIDIA Tesla K40m GPU. # # File information: # Institution.... SURFsara # Author......... Cedric Nugteren # Changed at..... 2014-10-30 # License........ MIT license # Tab-size....... 4 spaces # Line length.... 
100 characters
#
# ==================================================================================================

# Read the filename from the command-line (quoted in case the path contains spaces)
file="$1"

# Calculate occurrences of particular instructions in the assembly.
# grep -c counts matching lines directly; the previous 'cat | grep' pipeline was redundant.
FFMA=`grep -c "FFMA" "$file"`
LDS=`grep -c "LDS" "$file"`
STS=`grep -c "STS" "$file"`
SHFL=`grep -c "SHFL" "$file"`
LD=`grep -c "LD[^S]" "$file"`
ST=`grep -c "ST[^S]" "$file"`
MOV=`grep -c "MOV" "$file"`
# Fixed: the original expression also added SUM to itself; SUM was unset (0) on first use,
# but the self-reference was a latent bug if the script were ever sourced twice.
SUM=$((FFMA+LDS+STS+SHFL+LD+ST+MOV))

# Print the resulting statistics to screen
echo ">> Stats on $file:"
echo ">> "
echo ">> FFMA $FFMA"
echo ">> LDS $LDS"
echo ">> STS $STS"
echo ">> SHFL $SHFL"
echo ">> LD $LD"
echo ">> ST $ST"
echo ">> MOV $MOV"
echo ">> "
echo ">> TOTAL=$SUM"

# ==================================================================================================
================================================ FILE: src/clGEMM.cpp ================================================
// =================================================================================================
// Project:
// Exploring the performance of general matrix-multiplication on an NVIDIA Tesla K40m GPU.
//
// File information:
// Institution.... SURFsara
// Author......... Cedric Nugteren
// Changed at..... 2014-11-17
// License........ MIT license
// Tab-size....... 4 spaces
// Line length....
100 characters // // ================================================================================================= // Common include #include "common.h" // Include OpenCL #include // Include kernel constants #include "settings.h" // Forward declaration of the OpenCL error checking function void checkError(cl_int error, int line); // ================================================================================================= // Set the locations of the OpenCL kernel files #define CL_INCLUDE_FILE "src/settings.h" #define CL_KERNEL_FILE "src/kernels.cl" // Determine the location where to output the PTX code #define CL_PTX_FILE "bin/myGEMM.cl.ptx" // Define OpenCL compiler options, such as "-cl-nv-maxrregcount=127" #define COMPILER_OPTIONS "" // ================================================================================================= // Matrix-multiplication using a custom OpenCL SGEMM kernel. This function also copies the input // matrices to the GPU, runs SGEMM, and copies the output matrix back to the CPU. 
void myclblas(float* A, float* B, float* C, int K, int M, int N, int timerID) { // In case of myGEMM10, compute matrix sizes K, M, N as rounded-up to form complete tiles #if KERNEL == 10 int K_XL = CEIL_DIV(K, TSK) * TSK; int M_XL = CEIL_DIV(M, TSM) * TSM; int N_XL = CEIL_DIV(N, TSN) * TSN; #else int K_XL = K; int M_XL = M; int N_XL = N; #endif // Define OpenCL variables cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_device_id devices[MAX_NUM_DEVICES]; cl_uint numDevices = 0; cl_context_properties props[3] = {CL_CONTEXT_PLATFORM, 0, 0}; cl_context context = 0; cl_command_queue queue = 0; cl_event event = NULL; cl_program program = NULL; char deviceName[MAX_DEVICE_NAME]; // Configure the OpenCL environment err = clGetPlatformIDs(1, &platform, NULL); err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices); err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numDevices, devices, NULL); device = devices[CURRENT_DEVICE]; props[1] = (cl_context_properties)platform; context = clCreateContext(props, 1, &device, NULL, NULL, &err); queue = clCreateCommandQueue(context, device, 0, &err); err = clGetDeviceInfo(device, CL_DEVICE_NAME, MAX_DEVICE_NAME, deviceName, NULL); checkError(err,__LINE__); //printf("## %d devices, running on %d: '%s'\n", numDevices, CURRENT_DEVICE, deviceName); // Read the kernel file from disk long sizeHeader, sizeSource; char* header = readKernelFile(CL_INCLUDE_FILE, &sizeHeader); char* source = readKernelFile(CL_KERNEL_FILE, &sizeSource); long size = 2 + sizeHeader + sizeSource; char* code = (char*)malloc(size*sizeof(char)); for (int c=0; c 10) { printf("## Compiler message: %s\n", messages); } free(messages); // Retrieve the PTX code from the OpenCL compiler and output it to disk size_t binSize; err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binSize, NULL); checkError(err,__LINE__); unsigned char *bin = (unsigned char *)malloc(binSize); err = clGetProgramInfo(program, 
CL_PROGRAM_BINARIES, sizeof(unsigned char *), &bin, NULL); checkError(err,__LINE__); FILE* file = fopen(CL_PTX_FILE, "wb"); fwrite(bin, sizeof(char), binSize, file); fclose(file); free(bin); // Prepare OpenCL memory objects cl_mem bufA = clCreateBuffer(context, CL_MEM_READ_ONLY, M*K*sizeof(*A), NULL, &err); cl_mem bufB = clCreateBuffer(context, CL_MEM_READ_ONLY, K*N*sizeof(*B), NULL, &err); cl_mem bufB_TR = clCreateBuffer(context, CL_MEM_READ_ONLY, N*K*sizeof(*B), NULL, &err); cl_mem bufC = clCreateBuffer(context, CL_MEM_READ_WRITE, M*N*sizeof(*C), NULL, &err); checkError(err,__LINE__); // Copy matrices to the GPU (also C to erase the results of the previous run) err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0, M*K*sizeof(*A), A, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufB, CL_TRUE, 0, K*N*sizeof(*B), B, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufC, CL_TRUE, 0, M*N*sizeof(*C), C, 0, NULL, NULL); checkError(err,__LINE__); // Create extra objects for rounded-up sizes (only needed in case of myGEMM10) cl_mem bufA_XL = clCreateBuffer(context, CL_MEM_READ_ONLY, M_XL*K_XL*sizeof(*A), NULL, &err); cl_mem bufB_TR_XL = clCreateBuffer(context, CL_MEM_READ_ONLY, N_XL*K_XL*sizeof(*B), NULL, &err); cl_mem bufC_XL = clCreateBuffer(context, CL_MEM_READ_WRITE, M_XL*N_XL*sizeof(*C), NULL, &err); checkError(err,__LINE__); // Configure the myGEMM kernel char kernelname[100]; sprintf(kernelname, "myGEMM%d", KERNEL); cl_kernel kernel1 = clCreateKernel(program, kernelname, &err); checkError(err,__LINE__); // Set the arguments of the myGEMM kernel #if KERNEL == 10 err = clSetKernelArg(kernel1, 0, sizeof(int), (void*)&M_XL); err = clSetKernelArg(kernel1, 1, sizeof(int), (void*)&N_XL); err = clSetKernelArg(kernel1, 2, sizeof(int), (void*)&K_XL); err = clSetKernelArg(kernel1, 3, sizeof(cl_mem), (void*)&bufA_XL); err = clSetKernelArg(kernel1, 4, sizeof(cl_mem), (void*)&bufB_TR_XL); err = clSetKernelArg(kernel1, 5, sizeof(cl_mem), (void*)&bufC_XL); #else err = 
clSetKernelArg(kernel1, 0, sizeof(int), (void*)&M); err = clSetKernelArg(kernel1, 1, sizeof(int), (void*)&N); err = clSetKernelArg(kernel1, 2, sizeof(int), (void*)&K); err = clSetKernelArg(kernel1, 3, sizeof(cl_mem), (void*)&bufA); #if KERNEL == 5 || KERNEL == 6 || KERNEL == 7 || KERNEL == 8 || KERNEL == 9 err = clSetKernelArg(kernel1, 4, sizeof(cl_mem), (void*)&bufB_TR); #else err = clSetKernelArg(kernel1, 4, sizeof(cl_mem), (void*)&bufB); #endif err = clSetKernelArg(kernel1, 5, sizeof(cl_mem), (void*)&bufC); #endif checkError(err,__LINE__); // Configure the supporting transpose kernel and set its arguments (only for certain myGEMMs) #if KERNEL == 5 || KERNEL == 6 || KERNEL == 7 || KERNEL == 8 || KERNEL == 9 || KERNEL == 10 cl_kernel kernel2 = clCreateKernel(program, "transpose", &err); checkError(err,__LINE__); err = clSetKernelArg(kernel2, 0, sizeof(int), (void*)&K); err = clSetKernelArg(kernel2, 1, sizeof(int), (void*)&N); err = clSetKernelArg(kernel2, 2, sizeof(cl_mem), (void*)&bufB); err = clSetKernelArg(kernel2, 3, sizeof(cl_mem), (void*)&bufB_TR); checkError(err,__LINE__); const size_t tLocal[2] = { TRANSPOSEX, TRANSPOSEY }; const size_t tGlobal[2] = { (size_t)K, (size_t)N }; #endif // Configure the supporting padding kernels and set their arguments (only for myGEMM10) #if KERNEL == 10 cl_kernel kernel3a = clCreateKernel(program, "paddingAddZeroes", &err); checkError(err,__LINE__); err = clSetKernelArg(kernel3a, 0, sizeof(int), (void*)&M); err = clSetKernelArg(kernel3a, 1, sizeof(int), (void*)&K); err = clSetKernelArg(kernel3a, 2, sizeof(cl_mem), (void*)&bufA); err = clSetKernelArg(kernel3a, 3, sizeof(int), (void*)&M_XL); err = clSetKernelArg(kernel3a, 4, sizeof(int), (void*)&K_XL); err = clSetKernelArg(kernel3a, 5, sizeof(cl_mem), (void*)&bufA_XL); checkError(err,__LINE__); cl_kernel kernel3b = clCreateKernel(program, "paddingAddZeroes", &err); checkError(err,__LINE__); err = clSetKernelArg(kernel3b, 0, sizeof(int), (void*)&N); err = 
clSetKernelArg(kernel3b, 1, sizeof(int), (void*)&K); err = clSetKernelArg(kernel3b, 2, sizeof(cl_mem), (void*)&bufB_TR); err = clSetKernelArg(kernel3b, 3, sizeof(int), (void*)&N_XL); err = clSetKernelArg(kernel3b, 4, sizeof(int), (void*)&K_XL); err = clSetKernelArg(kernel3b, 5, sizeof(cl_mem), (void*)&bufB_TR_XL); checkError(err,__LINE__); cl_kernel kernel3c = clCreateKernel(program, "paddingRemoveZeroes", &err); checkError(err,__LINE__); err = clSetKernelArg(kernel3c, 0, sizeof(int), (void*)&M_XL); err = clSetKernelArg(kernel3c, 1, sizeof(int), (void*)&N_XL); err = clSetKernelArg(kernel3c, 2, sizeof(cl_mem), (void*)&bufC_XL); err = clSetKernelArg(kernel3c, 3, sizeof(int), (void*)&M); err = clSetKernelArg(kernel3c, 4, sizeof(int), (void*)&N); err = clSetKernelArg(kernel3c, 5, sizeof(cl_mem), (void*)&bufC); checkError(err,__LINE__); const size_t pLocal[2] = { PADDINGX, PADDINGY }; const size_t pAGlobal[2] = { (size_t)M_XL, (size_t)K_XL }; const size_t pBGlobal[2] = { (size_t)N_XL, (size_t)K_XL }; const size_t pCGlobal[2] = { (size_t)M, (size_t)N }; #endif // Configure the thread/work-group dimensions of the myGEMM kernel #if KERNEL == 1 || KERNEL == 2 const size_t local[2] = { TS, TS }; const size_t global[2] = { (size_t)M, (size_t)N }; #elif KERNEL == 3 || KERNEL == 5 const size_t local[2] = { TS, TS/WPT }; const size_t global[2] = { (size_t)M, (size_t)(N/WPT) }; #elif KERNEL == 4 const size_t local[2] = { TS/WIDTH, TS }; const size_t global[2] = { (size_t)(M/WIDTH), (size_t)N }; #elif KERNEL == 6 || KERNEL == 7 || KERNEL == 8 || KERNEL == 9 const size_t local[2] = { TSM/WPTM, TSN/WPTN }; const size_t global[2] = { (size_t)(M/WPTM), (size_t)(N/WPTN) }; #elif KERNEL == 10 const size_t local[2] = { TSM/WPTM, TSN/WPTN }; const size_t global[2] = { (size_t)(M_XL/WPTM), (size_t)(N_XL/WPTN) }; #elif KERNEL == 11 const size_t local[2] = { THREADSX, THREADSY }; const size_t global[2] = { (size_t)(M/RX), (size_t)(N/RY) }; #endif // Start the timed loop double startTime = 
timer(); for (int r=0; r // Author......... Cedric Nugteren // Changed at..... 2014-11-06 // License........ MIT license // Tab-size....... 4 spaces // Line length.... 100 characters // // ================================================================================================= // Replace the OpenCL keywords with CUDA equivalent #define __kernel __placeholder__ #define __global #define __placeholder__ __global__ #define __local __shared__ #define restrict __restrict__ // Replace OpenCL synchronisation with CUDA synchronisation #define barrier(x) __syncthreads() // Replace the OpenCL get_xxx_ID with CUDA equivalents __device__ int get_local_id(int x) { return (x == 0) ? threadIdx.x : threadIdx.y; } __device__ int get_group_id(int x) { return (x == 0) ? blockIdx.x : blockIdx.y; } __device__ int get_global_id(int x) { return (x == 0) ? blockIdx.x*blockDim.x + threadIdx.x : blockIdx.y*blockDim.y + threadIdx.y; } // Add the float8 data-type which is not available natively under CUDA typedef struct { float s0; float s1; float s2; float s3; float s4; float s5; float s6; float s7; } float8; // ================================================================================================= ================================================ FILE: src/common.h ================================================ // ================================================================================================= // Project: // Exploring the performance of general matrix-multiplication on an NVIDIA Tesla K40m GPU. // // File information: // Institution.... SURFsara // Author......... Cedric Nugteren // Changed at..... 2014-11-17 // License........ MIT license // Tab-size....... 4 spaces // Line length.... 
100 characters // // ================================================================================================= // Common C includes #include #include #include #include #include #include // ================================================================================================= // Repeat all kernels multiple times to get an average timing result #define NUM_RUNS 4 // Squared matrices are tested within a certain range (e.g. 1024x1024, 2048x2048, 4096x4096) #define MINSIZE (1024) #define MAXSIZE (4*1024) // Set the alpha and beta values for the cuBLAS and clBlas libraries. Note that the myGEMM kernels // for simplicity only support alpha values of 1 and beta values of 0. #define ALPHA 1.0f #define BETA 0.0f // Define the current GPU's parameters #define GPU_NAME "Tesla K40m" #define GPU_CLOCK 0.745 // Core clock in GHz #define GPU_CORES 2880 // Total number of CUDA cores #define GPU_MOD 2 // Fused multiply-add // OpenCL settings #define MAX_NUM_DEVICES 16 #define MAX_DEVICE_NAME 1024 #define CURRENT_DEVICE 0 // ================================================================================================= // Timer structure typedef struct { double t; // Time int long long kf; // KFlops } profile_t; // Number of timers #define NUM_TIMERS 10 // Global variable holding the timing results extern profile_t timers[NUM_TIMERS]; // ================================================================================================= // Forward declarations of BLAS functions void libcublas(float* A, float* B, float* C, int K, int M, int N, int timerID); void libclblas(float* A, float* B, float* C, int K, int M, int N, int timerID); void mycublas(float* A, float* B, float* C, int K, int M, int N, int timerID); void myclblas(float* A, float* B, float* C, int K, int M, int N, int timerID); // Forward declarations of the timer functions double timer(void); double wtime(profile_t timer); double gflops(profile_t timer); // Other forward declarations char* 
readKernelFile(const char* filename, long* _size); // ================================================================================================= ================================================ FILE: src/cuGEMM.cu ================================================ // ================================================================================================= // Project: // Exploring the performance of general matrix-multiplication on an NVIDIA Tesla K40m GPU. // // File information: // Institution.... SURFsara // Author......... Cedric Nugteren // Changed at..... 2014-11-06 // License........ MIT license // Tab-size....... 4 spaces // Line length.... 100 characters // // ================================================================================================= // Common include #include "common.h" // Include kernel constants #include "settings.h" // ================================================================================================= // Configuration settings for the CUDA version (comment out if not desired) #define USE_LDG // Whether to use the __ldg() intrinsic //#define USE_SHUFFLE // Whether to use warp-shuffle instructions // Include the OpenCL-to-CUDA header and the OpenCL kernel-code #include "cl_to_cuda.h" #include "kernels.cl" // ================================================================================================= // Matrix-multiplication using a custom CUDA SGEMM kernel. This function also copies the input // matrices to the GPU, runs SGEMM, and copies the output matrix back to the CPU. 
void mycublas(float* A, float* B, float* C, int K, int M, int N, int timerID) { // In case of myGEMM10, compute matrix sizes K, M, N as rounded-up to form complete tiles #if KERNEL == 10 int K_XL = CEIL_DIV(K, TSK) * TSK; int M_XL = CEIL_DIV(M, TSM) * TSM; int N_XL = CEIL_DIV(N, TSN) * TSN; #else int K_XL = K; int M_XL = M; int N_XL = N; #endif // Prepare CUDA memory objects float* bufA = 0; float* bufB = 0; float* bufB_TR = 0; // This is the transposed version of B float* bufC = 0; cudaMalloc((void**)&bufA, M*K*sizeof(*A)); cudaMalloc((void**)&bufB, K*N*sizeof(*B)); cudaMalloc((void**)&bufB_TR, N*K*sizeof(*B)); cudaMalloc((void**)&bufC, M*N*sizeof(*C)); // Copy matrices to the GPU (memset C to erase the results of the previous run) cudaMemcpy((void*)bufA, (void*)A, M*K*sizeof(*A), cudaMemcpyHostToDevice); cudaMemcpy((void*)bufB, (void*)B, K*N*sizeof(*B), cudaMemcpyHostToDevice); cudaMemset((void*)bufC, 0.0, M*N*sizeof(*C)); // Create extra objects for rounded-up sizes (only needed in case of myGEMM10) float* bufA_XL = 0; float* bufB_TR_XL = 0; float* bufC_XL = 0; cudaMalloc((void**)&bufA_XL, M_XL*K_XL*sizeof(*A)); cudaMalloc((void**)&bufB_TR_XL, K_XL*N_XL*sizeof(*B)); cudaMalloc((void**)&bufC_XL, M_XL*N_XL*sizeof(*C)); // Configure the local memory (banks of 8 bytes, 48KB local memory) cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte); cudaDeviceSetCacheConfig(cudaFuncCachePreferShared); // Configure the thread/threadblock dimensions of the transpose kernel (only for certain myGEMMs) #if KERNEL == 5 || KERNEL == 6 || KERNEL == 7 || KERNEL == 8 || KERNEL == 9 || KERNEL == 10 dim3 blocksTRP(CEIL_DIV(K,TRANSPOSEX), CEIL_DIV(N,TRANSPOSEY)); dim3 threadsTRP(TRANSPOSEX, TRANSPOSEY); #endif // Configure the thread/threadblock dimensions of the padding kernels (only for myGEMM10) #if KERNEL == 10 dim3 blocksA(CEIL_DIV(M_XL,PADDINGX), CEIL_DIV(K_XL,PADDINGY)); dim3 threadsA(PADDINGX, PADDINGY); dim3 blocksB(CEIL_DIV(N_XL,PADDINGX), CEIL_DIV(K_XL,PADDINGY)); dim3 
threadsB(PADDINGX, PADDINGY); dim3 blocksC(CEIL_DIV(M,PADDINGX), CEIL_DIV(N,PADDINGY)); dim3 threadsC(PADDINGX, PADDINGY); #endif // Configure the thread/threadblock dimensions of the myGEMM kernel #if KERNEL == 1 || KERNEL == 2 dim3 blocks(M/TS, N/TS); dim3 threads(TS, TS); #elif KERNEL == 3 || KERNEL == 5 dim3 blocks(M/TS, N/TS); dim3 threads(TS, TS/WPT); #elif KERNEL == 4 dim3 blocks(M/TS, N/TS); dim3 threads(TS/WIDTH, TS); #elif KERNEL == 6 || KERNEL == 7 || KERNEL == 8 || KERNEL == 9 dim3 blocks(M/TSM, N/TSN); dim3 threads(TSM/WPTM, TSN/WPTN); #elif KERNEL == 10 dim3 blocks(M_XL/TSM, N_XL/TSN); dim3 threads(TSM/WPTM, TSN/WPTN); #elif KERNEL == 11 dim3 blocks(M/(THREADSX*RX), N/(THREADSY*RY)); dim3 threads(THREADSX, THREADSY); #endif // Start the timed loop double startTime = timer(); for (int r=0; r>>(K, N, bufB, bufB_TR); #endif // Make the inputs extra large with padded zeros #if KERNEL == 10 paddingAddZeroes<<>>(M, K, bufA, M_XL, K_XL, bufA_XL); paddingAddZeroes<<>>(N, K, bufB_TR, N_XL, K_XL, bufB_TR_XL); #endif // Run the myGEMM kernel #if KERNEL == 1 myGEMM1<<>>(M, N, K, bufA, bufB, bufC); #elif KERNEL == 2 myGEMM2<<>>(M, N, K, bufA, bufB, bufC); #elif KERNEL == 3 myGEMM3<<>>(M, N, K, bufA, bufB, bufC); #elif KERNEL == 4 myGEMM4<<>>(M, N, K, (floatX*)bufA, (floatX*)bufB, (floatX*)bufC); #elif KERNEL == 5 myGEMM5<<>>(M, N, K, bufA, bufB_TR, bufC); #elif KERNEL == 6 myGEMM6<<>>(M, N, K, bufA, bufB_TR, bufC); #elif KERNEL == 7 myGEMM7<<>>(M, N, K, (floatX*)bufA, (floatX*)bufB_TR, bufC); #elif KERNEL == 8 myGEMM8<<>>(M, N, K, (floatX*)bufA, (floatX*)bufB_TR, bufC); #elif KERNEL == 9 myGEMM9<<>>(M, N, K, (floatX*)bufA, (floatX*)bufB_TR, bufC); #elif KERNEL == 10 myGEMM10<<>>(M_XL, N_XL, K_XL, (floatX*)bufA_XL, (floatX*)bufB_TR_XL, bufC_XL); #elif KERNEL == 11 myGEMM11<<>>(M, N, K, (floatA*)bufA, (floatB*)bufB, (floatC*)bufC); #endif // Remove padded zeroes from the larger output #if KERNEL == 10 paddingRemoveZeroes<<>>(M_XL, N_XL, bufC_XL, M, N, bufC); #endif 
// Wait for calculations to be finished cudaDeviceSynchronize(); } // End the timed loop timers[timerID].t += (timer() - startTime) / (double)NUM_RUNS; timers[timerID].kf += ((long)K * (long)M * (long)N * 2) / 1000; // Copy the output matrix C back to the CPU memory cudaMemcpy((void*)C, (void*)bufC, M*N*sizeof(*C), cudaMemcpyDeviceToHost); // Free the GPU memory objects cudaFree(bufA); cudaFree(bufB); cudaFree(bufB_TR); cudaFree(bufC); cudaFree(bufA_XL); cudaFree(bufB_TR_XL); cudaFree(bufC_XL); } // ================================================================================================= ================================================ FILE: src/kernels.cl ================================================ // ================================================================================================= // Project: // Exploring the performance of general matrix-multiplication on an NVIDIA Tesla K40m GPU. // // File information: // Institution.... SURFsara // Author......... Cedric Nugteren // Changed at..... 2014-11-06 // License........ MIT license // Tab-size....... 4 spaces // Line length.... 100 characters // // ================================================================================================= // // Matrices in column-major format // A: K columns, M rows // B: N columns, K rows // C: N columns, M rows // // N // o-----o // | | // K | [B] | // | | // o-----o // K N // o-------o o-----o // M | [A] | M | [C] | // | | | | // o-------o o-----o // // // C-code for column-major matrix multiplication with alpha=1 and beta=0: // // for (int m=0; m // Author......... Cedric Nugteren // Changed at..... 2014-11-10 // License........ MIT license // Tab-size....... 4 spaces // Line length.... 
100 characters // // ================================================================================================= // Common include #include "common.h" // Include OpenCL and clBlas #include // ================================================================================================= // Matrix-multiplication using the clBlas library. This function copies the input matrices to the // GPU, runs SGEMM, and copies the output matrix back to the CPU. void libclblas(float* A, float* B, float* C, int K, int M, int N, int timerID) { cl_int err; // Define OpenCL variables cl_platform_id platform = 0; cl_device_id device = 0; cl_device_id devices[MAX_NUM_DEVICES]; cl_uint numDevices = 0; cl_context_properties props[3] = {CL_CONTEXT_PLATFORM, 0, 0}; cl_context ctx = 0; cl_command_queue queue = 0; cl_event event = NULL; char deviceName[MAX_DEVICE_NAME]; // Configure the OpenCL environment err = clGetPlatformIDs(1, &platform, NULL); err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices); err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numDevices, devices, NULL); device = devices[CURRENT_DEVICE]; props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); queue = clCreateCommandQueue(ctx, device, 0, &err); err = clGetDeviceInfo(device, CL_DEVICE_NAME, MAX_DEVICE_NAME, deviceName, NULL); //printf("## %d devices, running on %d: '%s'\n", numDevices, CURRENT_DEVICE, deviceName); // Configure clBlas err = clblasSetup(); // Prepare OpenCL memory objects cl_mem bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, M*K*sizeof(*A), NULL, &err); cl_mem bufB = clCreateBuffer(ctx, CL_MEM_READ_ONLY, K*N*sizeof(*B), NULL, &err); cl_mem bufC = clCreateBuffer(ctx, CL_MEM_READ_WRITE, M*N*sizeof(*C), NULL, &err); // Copy matrices to the GPU (also C to erase the results of the previous run) err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0, M*K*sizeof(*A), A, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufB, CL_TRUE, 0, 
K*N*sizeof(*B), B, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufC, CL_TRUE, 0, M*N*sizeof(*C), C, 0, NULL, NULL); // Run one (small) instance of clBlas first to pre-generate and compile the kernel err = clblasSgemm(clblasColumnMajor, clblasNoTrans, clblasNoTrans, 128, 128, 128, ALPHA, bufA, 0, 128, bufB, 0, 128, BETA, bufC, 0, 128, 1, &queue, 0, NULL, &event); err = clWaitForEvents(1, &event); // Start the timed loop double startTime = timer(); for (int r=0; r // Author......... Cedric Nugteren // Changed at..... 2014-10-30 // License........ MIT license // Tab-size....... 4 spaces // Line length.... 100 characters // // ================================================================================================= // Common include #include "common.h" // Include CUDA and cuBLAS (API v2) #include // ================================================================================================= // Matrix-multiplication using the cuBLAS library. This function copies the input matrices to the // GPU, runs SGEMM, and copies the output matrix back to the CPU. void libcublas(float* A, float* B, float* C, int K, int M, int N, int timerID) { // cuBLAS configuration cublasStatus_t status; cublasHandle_t handle; status = cublasCreate(&handle); // Prepare CUDA memory objects float* bufA = 0; float* bufB = 0; float* bufC = 0; cudaMalloc((void**)&bufA, M*K*sizeof(*A)); cudaMalloc((void**)&bufB, K*N*sizeof(*B)); cudaMalloc((void**)&bufC, M*N*sizeof(*C)); // Copy matrices to the GPU (also C to erase the results of the previous run) cudaMemcpy((void*)bufA, (void*)A, M*K*sizeof(*A), cudaMemcpyHostToDevice); cudaMemcpy((void*)bufB, (void*)B, K*N*sizeof(*B), cudaMemcpyHostToDevice); cudaMemcpy((void*)bufC, (void*)C, M*N*sizeof(*C), cudaMemcpyHostToDevice); // Configure SGEMM float alpha = ALPHA; float beta = BETA; // Start the timed loop double startTime = timer(); for (int r=0; r // Author......... Cedric Nugteren // Changed at..... 2014-11-10 // License........ 
MIT license // Tab-size....... 4 spaces // Line length.... 100 characters // // ================================================================================================= // Common include #include "common.h" // Global variable with timing results profile_t timers[NUM_TIMERS]; // ================================================================================================= // Main function. This takes care of creating matrices of various sizes and iterating over the // different types of BLAS libraries. It also computes the error rate in terms of the L2-norm with // respect to cuBLAS (the 'golden' reference). int main(int argc, char* argv[]) { // Start of the function printf("\n##\n"); srand(time(NULL)); // Compute the peak performance of the GPU double peak = GPU_CLOCK * GPU_CORES * GPU_MOD; // Print information about the different configurations printf("## --- Configurations ---\n"); for (int c=0; c<=3; c++) { #ifndef ENABLE_CUDA if (c == 0 || c == 2) { continue; } #endif switch(c) { case 0: printf("## cuBLAS on '%s', peak: %.1lf GFLOPS\n", GPU_NAME, peak); break; case 1: printf("## clBlas on '%s', peak: %.1lf GFLOPS\n", GPU_NAME, peak); break; case 2: printf("## myGEMM.cu on '%s', peak: %.1lf GFLOPS\n", GPU_NAME, peak); break; case 3: printf("## myGEMM.cl on '%s', peak: %.1lf GFLOPS\n", GPU_NAME, peak); break; } } // Loop over the different input/output matrix sizes for (int size=MINSIZE; size<=MAXSIZE; size=size*2) { // Set the performance counters to zero for (int t=0; t %6.1lf GFLOPS (%2.0lf%%), L2 norm: %.2e\n", name, seconds, performance, fraction, L2norm); } // Free up the matrices free(A); free(B); free(C); free(goldC); } // End of the program printf("##\n"); printf("\n"); return 0; } // ================================================================================================= // Timer function: Measure the current time double timer(void) { struct timeval Tvalue; struct timezone dummy; gettimeofday(&Tvalue, &dummy); double etime = 
(double)Tvalue.tv_sec + 1.0e-6*((double)Tvalue.tv_usec); return etime; //return omp_get_wtime(); } // Timer function: Get the execution time double wtime(profile_t timer) { return (timer.t); } // Timer function: Get the GFLOPS number double gflops(profile_t timer) { return ((double)timer.kf/(1000.0*1000.0)) / (timer.t); } // ================================================================================================= // Load an OpenCL kernel from file char* readKernelFile(const char* filename, long* _size) { // Open the file FILE* file = fopen(filename, "r"); if (!file) { printf("-- Error opening file %s\n", filename); exit(1); } // Get its size fseek(file, 0, SEEK_END); long size = ftell(file); rewind(file); // Read the kernel code as a string char* source = (char *)malloc((size+1)*sizeof(char)); fread(source, 1, size*sizeof(char), file); source[size] = '\0'; fclose(file); // Save the size and return the source string *_size = (size+1); return source; } // ================================================================================================= ================================================ FILE: src/settings.h ================================================ // ================================================================================================= // Project: // Exploring the performance of general matrix-multiplication on an NVIDIA Tesla K40m GPU. // // File information: // Institution.... SURFsara // Author......... Cedric Nugteren // Changed at..... 2014-11-07 // License........ MIT license // Tab-size....... 4 spaces // Line length.... 100 characters // // ================================================================================================= // Select a kernel #define KERNEL 8 // Constants for kernels 1 -- 5 #define TS 32 // The square-root of the 2D tile-size (== work-group dims) // Constants for kernels 3, 5 #define WPT 8 // The amount of work-per-thread, i.e. 
// the thread-coarsening factor
#define RTS (TS/WPT) // The reduced tile-size in one dimension

// Constants for kernels 4, 7 -- 10
#define WIDTH 4      // The vector-width (in number of floats)

// Constants for kernel 5
#define TSDK 16      // The tile-size in dimension K (for kernel 5 only)
#define LPT ((TSDK*WPT)/(TS)) // The amount of loads-per-thread (assume TSN==TSM)

// Constants for kernels 6 -- 10
#define TSM 128                // The tile-size in dimension M
#define TSN 128                // The tile-size in dimension N
#define TSK 16                 // The tile-size in dimension K
#define WPTM 8                 // The amount of work-per-thread in dimension M
#define WPTN 8                 // The amount of work-per-thread in dimension N
#define RTSM (TSM/WPTM)        // The reduced tile-size in dimension M (== number of threads)
#define RTSN (TSN/WPTN)        // The reduced tile-size in dimension N (== number of threads)
#define LPTA ((TSK*WPTM*WPTN)/(TSN)) // The amount of loads-per-thread for A
#define LPTB ((TSK*WPTM*WPTN)/(TSM)) // The amount of loads-per-thread for B

// Constraints on settings for kernels 6 -- 10
// Note: TSM/WPTM has to be integer
// Note: TSN/WPTN has to be integer
// Note: TSM/WIDTH has to be integer
// Note: TSN/WIDTH has to be integer
// Note: (TSK*WPTM*WPTN)/(TSN*WIDTH) has to be integer
// Note: (TSK*WPTM*WPTN)/(TSM*WIDTH) has to be integer

// Constants for kernel 11 (mimicing clBlas)
#define THREADSX 8
#define THREADSY 8
#define RX 8
#define RY 4
#define RK (RY)

// Constants for the supporting transpose kernel
#define TRANSPOSEX 16
#define TRANSPOSEY 16

// Constants for the supporting padding kernels
#define PADDINGX 16
#define PADDINGY 16

// Macros for host and kernel code.
// Fully parenthesised so they expand safely inside larger expressions such as
// `2*MIN(a,b)` or `1+MAX(a,b)` — the unparenthesised ternary otherwise binds
// to the surrounding operators (`?:` has very low precedence in C).
// NOTE: arguments are still evaluated twice; kept as macros (not inline
// functions) because they are shared with OpenCL kernel code.
#define MIN(a,b) (((a) > (b)) ? (b) : (a))
#define MAX(a,b) (((a) > (b)) ? (a) : (b))
#define CEIL_DIV(x,y) (((x) + (y) - 1) / (y))
#define MOD2(x,y) ((x) % (y))
#define DIV2(x,y) ((x) / (y))

// =================================================================================================