Full Code of MAhaitao999/CUDA_Programming for AI

master df3edfa95f76 cached

135 files

2.0 MB

978.6k tokens

15 symbols

1 requests

Download .txt

Showing preview only (2,091K chars total). Download the full file or copy to clipboard to get everything.

Repository: MAhaitao999/CUDA_Programming
Branch: master
Commit: df3edfa95f76
Files: 135
Total size: 2.0 MB

Directory structure:
gitextract_28zc9teu/

├── .vscode/
│   ├── launch.json
│   └── settings.json
├── CUDA/
│   ├── chapter10_线程束基本函数与协作组/
│   │   ├── Makefile
│   │   ├── error.cuh
│   │   ├── reduce.cu
│   │   ├── reduce1parallelism.cu
│   │   ├── reduce2static.cu
│   │   └── warp.cu
│   ├── chapter11_CUDA流/
│   │   ├── Makefile
│   │   ├── README.md
│   │   ├── common.h
│   │   ├── error.cuh
│   │   ├── host-kernel.cu
│   │   ├── kernel-kernel.cu
│   │   ├── kernel-transfer.cu
│   │   ├── pinMemTransfer.cu
│   │   ├── simpleHyperqOpenmp.cu
│   │   ├── simpleMultiAddBreadth.cu
│   │   └── simpleMultiAddDepth.cu
│   ├── chapter12_使用统一内存编程/
│   │   ├── Makefile
│   │   ├── add.cu
│   │   ├── add2_static.cu
│   │   ├── error.cuh
│   │   ├── oversubscription1.cu
│   │   ├── oversubscription2.cu
│   │   ├── oversubscription3.cu
│   │   └── prefetch.cu
│   ├── chapter13_分子动力学模拟的CUDA程序开发/
│   │   ├── cpp/
│   │   │   ├── common.cuh
│   │   │   ├── error.cuh
│   │   │   ├── force.cu
│   │   │   ├── force.cuh
│   │   │   ├── initialize.cu
│   │   │   ├── initialize.cuh
│   │   │   ├── integrate.cu
│   │   │   ├── integrate.cuh
│   │   │   ├── main.cu
│   │   │   ├── makefile
│   │   │   ├── makefile.windows
│   │   │   ├── memory.cu
│   │   │   ├── memory.cuh
│   │   │   ├── mic.cuh
│   │   │   ├── neighbor.cu
│   │   │   └── neighbor.cuh
│   │   ├── force-only/
│   │   │   ├── Makefile
│   │   │   ├── common.h
│   │   │   ├── error.cuh
│   │   │   ├── force.cu
│   │   │   ├── force.h
│   │   │   ├── initialize.cu
│   │   │   ├── initialize.h
│   │   │   ├── integrate.cu
│   │   │   ├── integrate.h
│   │   │   ├── main.cu
│   │   │   ├── makefile.windows
│   │   │   ├── memory.cu
│   │   │   ├── memory.h
│   │   │   ├── mic.h
│   │   │   ├── neighbor.cu
│   │   │   └── neighbor.h
│   │   ├── plot_energy.m
│   │   └── whole-code/
│   │       ├── Makefile
│   │       ├── common.h
│   │       ├── error.cuh
│   │       ├── force.cu
│   │       ├── force.h
│   │       ├── initialize.cu
│   │       ├── initialize.h
│   │       ├── integrate.cu
│   │       ├── integrate.h
│   │       ├── main.cu
│   │       ├── makefile.windows
│   │       ├── memory.cu
│   │       ├── memory.h
│   │       ├── mic.h
│   │       ├── neighbor.cu
│   │       ├── neighbor.h
│   │       ├── reduce.cu
│   │       └── reduce.h
│   ├── chapter14_CUDA标准库的使用/
│   │   ├── Makefile
│   │   ├── cublas_gemm.cu
│   │   ├── curand_host1.cu
│   │   ├── curand_host2.cu
│   │   ├── cusolver.cu
│   │   ├── error.cuh
│   │   ├── thrust_scan_pointer.cu
│   │   └── thrust_scan_vector.cu
│   ├── chapter1_GPU硬件与CUDA程序开发工具/
│   │   └── README.md
│   ├── chapter2_CUDA中的线程组织/
│   │   ├── Makefile
│   │   ├── hello1.cpp
│   │   ├── hello2.cu
│   │   ├── hello3.cu
│   │   ├── hello4.cu
│   │   └── hello5.cu
│   ├── chapter3_简单CUDA程序的基本框架/
│   │   ├── Makefile
│   │   ├── add.cpp
│   │   ├── add1.cu
│   │   ├── add2wrong.cu
│   │   ├── add3if.cu
│   │   └── add4device.cu
│   ├── chapter4_CUDA程序的错误检测/
│   │   ├── Makefile
│   │   ├── check1api
│   │   ├── check1api.cu
│   │   ├── check2kernel
│   │   ├── check2kernel.cu
│   │   ├── error.cuh
│   │   ├── memcheck
│   │   └── memcheck.cu
│   ├── chapter5_获得GPU加速的关键/
│   │   ├── Makefile
│   │   ├── add1cpu.cu
│   │   ├── add2gpu.cu
│   │   ├── add3memcpy.cu
│   │   ├── arithmetic1cpu.cu
│   │   ├── arithmetic2gpu.cu
│   │   └── error.cuh
│   ├── chapter6_CUDA的内存组织/
│   │   ├── Makefile
│   │   ├── README.md
│   │   ├── error.cuh
│   │   ├── query.cu
│   │   └── static.cu
│   ├── chapter7_全局内存的合理使用/
│   │   ├── Makefile
│   │   ├── error.cuh
│   │   └── matrix.cu
│   ├── chapter8_共享内存的合理使用/
│   │   ├── Makefile
│   │   ├── bank.cu
│   │   ├── error.cuh
│   │   ├── reduce1cpu.cu
│   │   └── reduce2gpu.cu
│   └── chapter9_原子函数的合理使用/
│       ├── Makefile
│       ├── error.cuh
│       ├── neighbor.txt
│       ├── neighbor1cpu.cu
│       ├── neighbor2gpu.cu
│       ├── reduce.cu
│       └── xy.txt
└── README.md

================================================
FILE CONTENTS
================================================

================================================
FILE: .vscode/launch.json
================================================
{
    // Use IntelliSense to learn about possible attributes.
    // Hover to view descriptions of existing attributes.
    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
    "version": "0.2.0",
    "configurations": [
        {
            "name": "CUDA C++: Launch",
            "type": "cuda-gdb",
            "request": "launch",
            "program": ""
        }
    ]
}

================================================
FILE: .vscode/settings.json
================================================
{
    "files.associations": {
        "cstdio": "cpp",
        "istream": "cpp",
        "limits": "cpp",
        "ostream": "cpp",
        "stdexcept": "cpp",
        "system_error": "cpp",
        "cstdint": "cpp",
        "cstdlib": "cpp"
    },

    "editor.fontSize": 20
}

================================================
FILE: CUDA/chapter10_线程束基本函数与协作组/Makefile
================================================
# Build every example in this chapter with nvcc (-g, -arch=sm_50).
# Each source includes error.cuh, so it is listed as a prerequisite to make
# header edits trigger a rebuild.
all: reduce warp reduce1parallelism reduce2static

reduce: reduce.cu error.cuh
	nvcc -g -arch=sm_50 reduce.cu -o reduce

warp: warp.cu error.cuh
	nvcc -g -arch=sm_50 warp.cu -o warp

reduce1parallelism: reduce1parallelism.cu error.cuh
	nvcc -g -arch=sm_50 reduce1parallelism.cu -o reduce1parallelism

reduce2static: reduce2static.cu error.cuh
	nvcc -g -arch=sm_50 reduce2static.cu -o reduce2static

# Neither target names a real file, so a stray file called "all" or "clean"
# must not suppress them.
.PHONY: all clean

clean:
	rm -rf reduce warp reduce1parallelism reduce2static


================================================
FILE: CUDA/chapter10_线程束基本函数与协作组/error.cuh
================================================
#pragma once
#include <stdio.h>

// CHECK(call): evaluate a CUDA runtime call once and compare the returned
// cudaError_t with cudaSuccess. On failure, print the source file, line,
// numeric error code and cudaGetErrorString() text to stdout, then exit(1).
// The do { ... } while (0) wrapper makes the expansion a single statement,
// so the caller supplies the trailing semicolon.
// NOTE(review): exit() is used while this header only includes <stdio.h>;
// it relies on the including translation unit to declare exit().
#define CHECK(call)                                   \
do                                                    \
{                                                     \
    const cudaError_t error_code = call;              \
    if (error_code != cudaSuccess)                    \
    {                                                 \
        printf("CUDA Error:\n");                      \
        printf("    File:       %s\n", __FILE__);     \
        printf("    Line:       %d\n", __LINE__);     \
        printf("    Error code: %d\n", error_code);   \
        printf("    Error text: %s\n",                \
            cudaGetErrorString(error_code));          \
        exit(1);                                      \
    }                                                 \
} while (0)


================================================
FILE: CUDA/chapter10_线程束基本函数与协作组/reduce.cu
================================================
#include "error.cuh"
#include <cstdio>
#include <cooperative_groups.h>

using namespace cooperative_groups;

#ifdef USE_DP
    typedef double real;
#else
    typedef float real;
#endif

const int NUM_REPEATS = 100;
const int N = 100000000;
const int M = sizeof(real) * N;
const int BLOCK_SIZE = 128;
const unsigned FULL_MASK = 0xffffffff;

void timing(const real *d_x, const int method);

/*
 * Fill a host buffer of N elements with 1.23, copy it to the device, and run
 * timing() on it for each of the three reduction methods (0, 1, 2).
 */
int main(int argc, char *argv[])
{
    real *h_x = (real *) malloc(M);
    if (h_x == NULL)
    {
        // Fail clearly instead of writing through a null pointer below.
        printf("Failed to allocate %d bytes of host memory.\n", M);
        exit(1);
    }
    for (int n = 0; n < N; ++n)
    {
        h_x[n] = 1.23;
    }
    real *d_x;
    CHECK(cudaMalloc(&d_x, M));
    CHECK(cudaMemcpy(d_x, h_x, M, cudaMemcpyHostToDevice));

    printf("\nusing syncwarp:\n");
    timing(d_x, 0);
    printf("\nusing shfl:\n");
    timing(d_x, 1);
    printf("\nusing cooperative group:\n");
    timing(d_x, 2);

    free(h_x);
    CHECK(cudaFree(d_x));
    return 0;
}

// Sum the elements of d_x handled by this block in dynamic shared memory and
// atomically add the block's total to *d_y. The last five halving steps
// (offset 16..1) synchronize with __syncwarp() instead of __syncthreads().
// The halving scheme only visits every element when blockDim.x is a power
// of two.
void __global__ reduce_syncwarp(const real *d_x, real *d_y, const int N)
{
    extern __shared__ real s_y[];

    const int tid = threadIdx.x;
    const int n = blockIdx.x * blockDim.x + tid;

    // One element per thread; threads past the end of the input store zero.
    if (n < N)
    {
        s_y[tid] = d_x[n];
    }
    else
    {
        s_y[tid] = 0.0;
    }
    __syncthreads();

    // Halve the active range down to 32 partial sums, synchronizing all
    // threads of the block after every step.
    for (int half = blockDim.x / 2; half >= 32; half /= 2)
    {
        if (tid < half)
        {
            s_y[tid] += s_y[tid + half];
        }
        __syncthreads();
    }

    // Partial sums now sit in s_y[0..31]; finish with warp-level syncs.
    for (int step = 16; step >= 1; step /= 2)
    {
        if (tid < step)
        {
            s_y[tid] += s_y[tid + step];
        }
        __syncwarp();
    }

    // Thread 0 publishes the block total.
    if (tid == 0)
    {
        atomicAdd(d_y, s_y[0]);
    }
}

// Same block-level sum as the shared-memory variant, but the final steps
// (offset 16..1) exchange values through __shfl_down_sync() on registers
// instead of shared memory. The block total is atomically added to *d_y.
void __global__ reduce_shfl(const real *d_x, real *d_y, const int N)
{
    extern __shared__ real s_y[];

    const int tid = threadIdx.x;
    const int n = blockIdx.x * blockDim.x + tid;

    // One element per thread; threads past the end of the input store zero.
    if (n < N)
    {
        s_y[tid] = d_x[n];
    }
    else
    {
        s_y[tid] = 0.0;
    }
    __syncthreads();

    // Shared-memory halving down to 32 partial sums in s_y[0..31].
    for (int half = blockDim.x / 2; half >= 32; half /= 2)
    {
        if (tid < half)
        {
            s_y[tid] += s_y[tid + half];
        }
        __syncthreads();
    }

    // Every thread loads its slot; only thread 0's result is used below.
    real y = s_y[tid];

    for (int delta = 16; delta >= 1; delta /= 2)
    {
        const real from_above = __shfl_down_sync(FULL_MASK, y, delta);
        y += from_above;
    }

    if (tid == 0)
    {
        atomicAdd(d_y, y);
    }
}

// Block-level sum whose last steps use a 32-thread cooperative-groups tile
// (tiled_partition<32>) and its shfl_down() member. The block total is
// atomically added to *d_y.
void __global__ reduce_cp(const real *d_x, real *d_y, const int N)
{
    extern __shared__ real s_y[];

    const int tid = threadIdx.x;
    const int n = blockIdx.x * blockDim.x + tid;

    // One element per thread; threads past the end of the input store zero.
    if (n < N)
    {
        s_y[tid] = d_x[n];
    }
    else
    {
        s_y[tid] = 0.0;
    }
    __syncthreads();

    // Shared-memory halving down to 32 partial sums in s_y[0..31].
    for (int half = blockDim.x / 2; half >= 32; half /= 2)
    {
        if (tid < half)
        {
            s_y[tid] += s_y[tid + half];
        }
        __syncthreads();
    }

    real y = s_y[tid];

    // Finish inside a 32-thread tile, halving the shuffle distance each step.
    thread_block_tile<32> g = tiled_partition<32>(this_thread_block());
    for (int i = g.size() >> 1; i > 0; i >>= 1)
    {
        const real from_above = g.shfl_down(y, i);
        y += from_above;
    }

    if (tid == 0)
    {
        atomicAdd(d_y, y);
    }
}

/*
 * Run one reduction of the N-element device array d_x and return the sum.
 *
 * method selects the kernel: 0 = reduce_syncwarp, 1 = reduce_shfl,
 * 2 = reduce_cp. Any other value prints "Wrong method." and exits.
 * A one-element device accumulator is allocated, zeroed, accumulated into
 * by the kernel via atomicAdd, copied back and freed on every call.
 */
real reduce(const real *d_x, const int method)
{
    const int grid_size = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;
    const int smem = sizeof(real) * BLOCK_SIZE;

    real h_y[1] = {0};
    real *d_y;
    CHECK(cudaMalloc(&d_y, sizeof(real)));
    CHECK(cudaMemcpy(d_y, h_y, sizeof(real), cudaMemcpyHostToDevice));

    switch (method)
    {
        case 0:
            reduce_syncwarp<<<grid_size, BLOCK_SIZE, smem>>>(d_x, d_y, N);
            break;
        case 1:
            reduce_shfl<<<grid_size, BLOCK_SIZE, smem>>>(d_x, d_y, N);
            break;
        case 2:
            reduce_cp<<<grid_size, BLOCK_SIZE, smem>>>(d_x, d_y, N);
            break;
        default:
            printf("Wrong method.\n");
            exit(1);
    }

    // A kernel launch has no return value; a rejected launch (for example
    // an invalid configuration) is only visible through cudaGetLastError().
    CHECK(cudaGetLastError());

    CHECK(cudaMemcpy(h_y, d_y, sizeof(real), cudaMemcpyDeviceToHost));
    CHECK(cudaFree(d_y));

    return h_y[0];
}

// Call reduce(d_x, method) NUM_REPEATS times, printing the CUDA-event
// elapsed time of each call, then print the sum returned by the last call.
void timing(const real *d_x, const int method)
{
    real sum = 0;

    int repeat = 0;
    while (repeat < NUM_REPEATS)
    {
        cudaEvent_t ev_begin;
        cudaEvent_t ev_end;
        CHECK(cudaEventCreate(&ev_begin));
        CHECK(cudaEventCreate(&ev_end));
        CHECK(cudaEventRecord(ev_begin));
        cudaEventQuery(ev_begin); // return value is not checked

        sum = reduce(d_x, method);

        CHECK(cudaEventRecord(ev_end));
        CHECK(cudaEventSynchronize(ev_end));

        float ms;
        CHECK(cudaEventElapsedTime(&ms, ev_begin, ev_end));
        printf("Time = %g ms.\n", ms);

        CHECK(cudaEventDestroy(ev_begin));
        CHECK(cudaEventDestroy(ev_end));

        ++repeat;
    }

    printf("sum = %f.\n", sum);
}




================================================
FILE: CUDA/chapter10_线程束基本函数与协作组/reduce1parallelism.cu
================================================
#include "error.cuh"
#include <stdio.h>
#include <cooperative_groups.h>
using namespace cooperative_groups;

#ifdef USE_DP
    typedef double real;
#else
    typedef float real;
#endif

const int NUM_REPEATS = 100;
const int N = 100000000;
const int M = sizeof(real) * N;
const int BLOCK_SIZE = 128;
const int GRID_SIZE = 10240;

void timing(const real *h_x);

/*
 * Fill a host buffer of N elements with 1.23, copy it to the device, and
 * time the two-pass reduction on it.
 */
int main(void)
{
    real *h_x = (real *) malloc(M);
    if (h_x == NULL)
    {
        // Fail clearly instead of writing through a null pointer below.
        printf("Failed to allocate %d bytes of host memory.\n", M);
        exit(1);
    }
    for (int n = 0; n < N; ++n)
    {
        h_x[n] = 1.23;
    }
    real *d_x;
    CHECK(cudaMalloc(&d_x, M));
    CHECK(cudaMemcpy(d_x, h_x, M, cudaMemcpyHostToDevice));

    timing(d_x);

    free(h_x);
    CHECK(cudaFree(d_x));
    return 0;
}

// Each thread privately sums the elements n, n + stride, n + 2*stride, ...
// where stride is the total number of threads in the grid; the block then
// combines its per-thread sums and thread 0 stores the block total in
// d_y[blockIdx.x]. No atomics are used.
void __global__ reduce_cp(const real *d_x, real *d_y, const int N)
{
    extern __shared__ real s_y[];

    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    const int stride = blockDim.x * gridDim.x;

    // Private accumulation over this thread's strided slice of the input.
    real y = 0.0;
    int n = bid * blockDim.x + tid;
    while (n < N)
    {
        y += d_x[n];
        n += stride;
    }
    s_y[tid] = y;
    __syncthreads();

    // Shared-memory halving down to 32 partial sums in s_y[0..31].
    for (int half = blockDim.x / 2; half >= 32; half /= 2)
    {
        if (tid < half)
        {
            s_y[tid] += s_y[tid + half];
        }
        __syncthreads();
    }

    y = s_y[tid];

    // Finish inside a 32-thread tile, halving the shuffle distance each step.
    thread_block_tile<32> g = tiled_partition<32>(this_thread_block());
    for (int i = g.size() >> 1; i > 0; i >>= 1)
    {
        const real from_above = g.shfl_down(y, i);
        y += from_above;
    }

    if (tid == 0)
    {
        d_y[bid] = y;
    }
}

/*
 * Two-pass sum of the N-element device array d_x.
 *
 * Pass 1 launches GRID_SIZE blocks that each write one partial sum into a
 * temporary device buffer. Pass 2 launches a single 1024-thread block that
 * reduces those GRID_SIZE partial sums in place, leaving the total in
 * element 0, which is copied back and returned.
 */
real reduce(const real *d_x)
{
    const int ymem = sizeof(real) * GRID_SIZE;
    const int smem = sizeof(real) * BLOCK_SIZE;

    real h_y[1] = {0};
    real *d_y;
    CHECK(cudaMalloc(&d_y, ymem));

    // A kernel launch has no return value; check each launch explicitly so
    // a rejected configuration is not silently ignored.
    reduce_cp<<<GRID_SIZE, BLOCK_SIZE, smem>>>(d_x, d_y, N);
    CHECK(cudaGetLastError());
    reduce_cp<<<1, 1024, sizeof(real) * 1024>>>(d_y, d_y, GRID_SIZE);
    CHECK(cudaGetLastError());

    CHECK(cudaMemcpy(h_y, d_y, sizeof(real), cudaMemcpyDeviceToHost));
    CHECK(cudaFree(d_y));

    return h_y[0];
}

// Call reduce(d_x) NUM_REPEATS times, printing the CUDA-event elapsed time
// of each call, then print the sum returned by the last call.
void timing(const real *d_x)
{
    real sum = 0;

    int repeat = 0;
    while (repeat < NUM_REPEATS)
    {
        cudaEvent_t ev_begin;
        cudaEvent_t ev_end;
        CHECK(cudaEventCreate(&ev_begin));
        CHECK(cudaEventCreate(&ev_end));
        CHECK(cudaEventRecord(ev_begin));
        cudaEventQuery(ev_begin); // return value is not checked

        sum = reduce(d_x);

        CHECK(cudaEventRecord(ev_end));
        CHECK(cudaEventSynchronize(ev_end));

        float ms;
        CHECK(cudaEventElapsedTime(&ms, ev_begin, ev_end));
        printf("Time = %g ms.\n", ms);

        CHECK(cudaEventDestroy(ev_begin));
        CHECK(cudaEventDestroy(ev_end));

        ++repeat;
    }

    printf("sum = %f.\n", sum);
}

================================================
FILE: CUDA/chapter10_线程束基本函数与协作组/reduce2static.cu
================================================
#include "error.cuh"
#include <cstdio>
#include <cooperative_groups.h>

using namespace cooperative_groups;

#ifdef USE_DP
    typedef double real;
#else
    typedef float real;
#endif

const int NUM_REPEATS = 100;
const int N = 100000000;
const int M = sizeof(real) * N;
const int BLOCK_SIZE = 128;
const int GRID_SIZE = 10240;

void timing(const real *d_x);

/*
 * Fill a host buffer of N elements with 1.23, copy it to the device, and
 * time the two-pass reduction that uses a statically allocated device
 * buffer for the partial sums.
 */
int main(int argc, char *argv[])
{
    real *h_x = (real *) malloc(M);
    if (h_x == NULL)
    {
        // Fail clearly instead of writing through a null pointer below.
        printf("Failed to allocate %d bytes of host memory.\n", M);
        exit(1);
    }
    for (int n = 0; n < N; ++n)
    {
        h_x[n] = 1.23;
    }
    real *d_x;
    CHECK(cudaMalloc(&d_x, M));
    CHECK(cudaMemcpy(d_x, h_x, M, cudaMemcpyHostToDevice));

    timing(d_x);

    free(h_x);
    CHECK(cudaFree(d_x));
    return 0;
}

// Per-block partial sum: every thread first accumulates elements spaced by
// the total thread count of the grid, the block combines those values, and
// thread 0 writes the block's result to d_y[blockIdx.x].
void __global__ reduce_cp(const real *d_x, real *d_y, const int N)
{
    extern __shared__ real s_y[];

    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    const int stride = blockDim.x * gridDim.x;

    // Private accumulation over this thread's strided slice of the input.
    real y = 0.0;
    int n = bid * blockDim.x + tid;
    while (n < N)
    {
        y += d_x[n];
        n += stride;
    }
    s_y[tid] = y;
    __syncthreads();

    // Shared-memory halving down to 32 partial sums in s_y[0..31].
    for (int half = blockDim.x / 2; half >= 32; half /= 2)
    {
        if (tid < half)
        {
            s_y[tid] += s_y[tid + half];
        }
        __syncthreads();
    }

    y = s_y[tid];

    // Finish inside a 32-thread tile, halving the shuffle distance each step.
    thread_block_tile<32> g = tiled_partition<32>(this_thread_block());
    for (int i = g.size() >> 1; i > 0; i >>= 1)
    {
        const real from_above = g.shfl_down(y, i);
        y += from_above;
    }

    if (tid == 0)
    {
        d_y[bid] = y;
    }
}

__device__ real static_y[GRID_SIZE]; 

/*
 * Two-pass sum of the N-element device array d_x using the statically
 * allocated device array static_y for the GRID_SIZE partial sums, so no
 * cudaMalloc/cudaFree happens per call.
 *
 * Pass 1 writes one partial sum per block; pass 2 reduces them in place
 * with a single 1024-thread block. Element 0 is copied back and returned.
 */
real reduce(const real *d_x)
{
    real *d_y;
    CHECK(cudaGetSymbolAddress((void**)&d_y, static_y));

    const int smem = sizeof(real) * BLOCK_SIZE;

    // A kernel launch has no return value; check each launch explicitly so
    // a rejected configuration is not silently ignored.
    reduce_cp<<<GRID_SIZE, BLOCK_SIZE, smem>>>(d_x, d_y, N);
    CHECK(cudaGetLastError());
    reduce_cp<<<1, 1024, sizeof(real) * 1024>>>(d_y, d_y, GRID_SIZE);
    CHECK(cudaGetLastError());

    real h_y[1] = {0};
    CHECK(cudaMemcpy(h_y, d_y, sizeof(real), cudaMemcpyDeviceToHost));
    // CHECK(cudaMemcpyFromSymbol(h_y, static_y, sizeof(real))); // also ok

    return h_y[0];

}

// Call reduce(d_x) NUM_REPEATS times, printing the CUDA-event elapsed time
// of each call, then print the sum returned by the last call.
void timing(const real *d_x)
{
    real sum = 0;

    int repeat = 0;
    while (repeat < NUM_REPEATS)
    {
        cudaEvent_t ev_begin;
        cudaEvent_t ev_end;
        CHECK(cudaEventCreate(&ev_begin));
        CHECK(cudaEventCreate(&ev_end));
        CHECK(cudaEventRecord(ev_begin));
        cudaEventQuery(ev_begin); // return value is not checked

        sum = reduce(d_x);

        CHECK(cudaEventRecord(ev_end));
        CHECK(cudaEventSynchronize(ev_end));

        float ms;
        CHECK(cudaEventElapsedTime(&ms, ev_begin, ev_end));
        printf("Time = %g ms.\n", ms);

        CHECK(cudaEventDestroy(ev_begin));
        CHECK(cudaEventDestroy(ev_end));

        ++repeat;
    }

    printf("sum = %f.\n", sum);
}

================================================
FILE: CUDA/chapter10_线程束基本函数与协作组/warp.cu
================================================
#include "error.cuh"
#include <cstdio>

const unsigned WIDTH = 8;
const unsigned BLOCK_SIZE = 16;
const unsigned FULL_MASK = 0xffffffff;

void __global__ test_warp_primitives(void);

/*
 * Launch the warp-primitive demo kernel with one block of BLOCK_SIZE
 * threads and wait for it to finish so its device-side printf output is
 * produced before the program returns.
 */
int main(int argc, char *argv[])
{

    test_warp_primitives<<<1, BLOCK_SIZE>>>();
    // A kernel launch has no return value; report a rejected launch here.
    CHECK(cudaGetLastError());
    CHECK(cudaDeviceSynchronize());

    return 0;
}

// Demo kernel that prints the results of the warp vote and shuffle
// intrinsics. main() launches it with a single block of BLOCK_SIZE threads.
// Every thread prints its own value; thread 0 additionally prints the row
// label and the newline.
// NOTE(review): the relative order of printf output from different threads
// depends on the device; confirm the rows render as intended.
void __global__ test_warp_primitives(void)
{
    int tid = threadIdx.x;
    // Position of the thread within its WIDTH-sized logical sub-group.
    int lane_id = tid % WIDTH;

    if (tid == 0) printf("threadIdx.x: ");
    printf("%2d ", tid);
    if (tid == 0) printf("\n");

    if (tid == 0) printf("lane_id:     ");
    printf("%2d ", lane_id);
    if (tid == 0) printf("\n");

    // Ballot masks: mask1 collects the threads with tid > 0, mask2 the
    // thread with tid == 0.
    unsigned mask1 = __ballot_sync(FULL_MASK, tid > 0);
    unsigned mask2 = __ballot_sync(FULL_MASK, tid == 0);
    if (tid == 0) printf("FULL_MASK = %x\n", FULL_MASK);
    if (tid == 1) printf("mask1     = %x\n", mask1);
    if (tid == 0) printf("mask2     = %x\n", mask2);

    // Vote over all threads named in FULL_MASK, using tid as the predicate.
    int result = __all_sync(FULL_MASK, tid);
    if (tid == 0) printf("all_sync (FULL_MASK): %d\n", result);

    // NOTE(review): thread 0 also executes this call although mask1 was
    // built from "tid > 0" and so should not contain thread 0's bit. The
    // CUDA programming guide appears to require each calling thread's bit
    // to be set in the mask - confirm this is well-defined.
    result = __all_sync(mask1, tid);
    if (tid == 1) printf("all_sync     (mask1): %d\n", result);

    result = __any_sync(FULL_MASK, tid);
    if (tid == 0) printf("any_sync (FULL_MASK): %d\n", result);

    // NOTE(review): same concern as above - threads other than 0 call this
    // with mask2, which was built from "tid == 0".
    result = __any_sync(mask2, tid);
    if (tid == 0) printf("any_sync     (mask2): %d\n", result);

    // Shuffles are issued with width = WIDTH; each passes tid as the value.
    // Read tid from source lane 2.
    int value = __shfl_sync(FULL_MASK, tid, 2, WIDTH);
    if (tid == 0) printf("shfl:      ");
    printf("%2d ", value);
    if (tid == 0) printf("\n");

    // Shift by a delta of 1 toward higher lanes.
    value = __shfl_up_sync(FULL_MASK, tid, 1, WIDTH);
    if (tid == 0) printf("shfl_up:   ");
    printf("%2d ", value);
    if (tid == 0) printf("\n");

    // Shift by a delta of 1 toward lower lanes.
    value = __shfl_down_sync(FULL_MASK, tid, 1, WIDTH);
    if (tid == 0) printf("shfl_down: ");
    printf("%2d ", value);
    if (tid == 0) printf("\n");

    // Exchange using a lane mask of 1.
    value = __shfl_xor_sync(FULL_MASK, tid, 1, WIDTH);
    if (tid == 0) printf("shfl_xor:  ");
    printf("%2d ", value);
    if (tid == 0) printf("\n");
}

================================================
FILE: CUDA/chapter11_CUDA流/Makefile
================================================
# Build every example in this chapter with nvcc -O3. All targets except
# pinMemTransfer are compiled with OpenMP enabled in the host compiler and
# linked against libgomp.
all: simpleHyperqOpenmp simpleMultiAddBreadth simpleMultiAddDepth host-kernel kernel-kernel kernel-transfer pinMemTransfer

simpleHyperqOpenmp: simpleHyperqOpenmp.cu
	nvcc -O3 -Xcompiler -fopenmp simpleHyperqOpenmp.cu -o simpleHyperqOpenmp -lgomp

simpleMultiAddBreadth: simpleMultiAddBreadth.cu
	nvcc -O3 -Xcompiler -fopenmp simpleMultiAddBreadth.cu -o simpleMultiAddBreadth -lgomp

simpleMultiAddDepth: simpleMultiAddDepth.cu
	nvcc -O3 -Xcompiler -fopenmp simpleMultiAddDepth.cu -o simpleMultiAddDepth -lgomp

# host-kernel.cu and kernel-kernel.cu include error.cuh, so header edits
# must trigger a rebuild of these two targets.
host-kernel: host-kernel.cu error.cuh
	nvcc -O3 -Xcompiler -fopenmp host-kernel.cu -o host-kernel -lgomp

kernel-kernel: kernel-kernel.cu error.cuh
	nvcc -O3 -Xcompiler -fopenmp kernel-kernel.cu -o kernel-kernel -lgomp

kernel-transfer: kernel-transfer.cu
	nvcc -O3 -Xcompiler -fopenmp kernel-transfer.cu -o kernel-transfer -lgomp

pinMemTransfer: pinMemTransfer.cu
	nvcc -O3 pinMemTransfer.cu -o pinMemTransfer

# Neither target names a real file, so a stray file called "all" or "clean"
# must not suppress them.
.PHONY: all clean

clean:
	rm -rf simpleHyperqOpenmp simpleMultiAddBreadth simpleMultiAddDepth host-kernel kernel-kernel kernel-transfer pinMemTransfer


================================================
FILE: CUDA/chapter11_CUDA流/README.md
================================================
### Pinned Memory

**Allocated host memory is by default pageable**, that is, subject to page fault operations that move data in host virtual memory to different physical locations as directed by the operating system. Virtual memory offers the **illusion**(错觉) of much more main memory than is physically available, just as the L1 cache offers the illusion of much more on-chip memory than is physically available.

The GPU cannot safely access data in pageable host memory **because it has no control over when the host operating system may choose to physically move that data**. When transferring data from pageable host memory to device memory, the CUDA driver first allocates temporary page-locked or pinned host memory, copies the source host data to pinned memory, and then transfers the data from pinned memory to device memory, as illustrated on the left side of the figure below (Figure 4-4 in the original text).

![](./pic/Pinned_memory.png)

The CUDA runtime allows you to directly allocate pinned host memory using:

```c
cudaError_t cudaMallocHost(void **devPtr, size_t count);
```

This function allocates `count` bytes of host memory that is page-locked and accessible to the device. Since **the pinned memory can be accessed directly by the device**, it can be read and written with much higher bandwidth than pageable memory. However, allocating excessive amounts of pinned memory might degrade host system performance, since it reduces the amount of pageable memory available to the host system for storing virtual memory data.

The following code snippet demonstrates allocating pinned host memory with error checking and elementary error handling:

```c
cudaError_t status = cudaMallocHost((void**)&h_aPinned, bytes);
if (status != cudaSuccess)
{
    fprintf(stderr, "Error returned from pinned host memory allocation\n");
    exit(1);
}
```

Pinned host memory must be freed with:

```c
cudaError_t cudaFreeHost(void *ptr);
```



================================================
FILE: CUDA/chapter11_CUDA流/common.h
================================================
#include <stddef.h>
#include <sys/time.h>

#ifndef _COMMON_H
#define _COMMON_H

// CHECK(call): evaluate a CUDA runtime call once and, if the result is not
// cudaSuccess, print the file, line, numeric code and cudaGetErrorString()
// text to stderr. Execution then CONTINUES: unlike the CHECK_CUBLAS /
// CHECK_CURAND / CHECK_CUFFT / CHECK_CUSPARSE macros below, this macro does
// not call exit(1).
// NOTE(review): confirm that report-and-continue is intended here.
// The expansion is a bare { ... } block rather than do { ... } while (0),
// so "if (c) CHECK(x); else ..." would not compile.
#define CHECK(call)                                                            \
{                                                                              \
    const cudaError_t error = call;                                            \
    if (error != cudaSuccess)                                                  \
    {                                                                          \
        fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);                 \
        fprintf(stderr, "code: %d, reason: %s\n", error,                       \
                cudaGetErrorString(error));                                    \
    }                                                                          \
}

// CHECK_CUBLAS(call): run a cuBLAS call once; on any status other than
// CUBLAS_STATUS_SUCCESS print the numeric status with file and line to
// stderr and exit(1).
#define CHECK_CUBLAS(call) \
{ \
    const cublasStatus_t err = (call); \
    if (err != CUBLAS_STATUS_SUCCESS) \
    { \
        fprintf(stderr, "Got CUBLAS error %d at %s:%d\n", \
                err, __FILE__, __LINE__); \
        exit(1); \
    } \
}

// CHECK_CURAND(call): run a cuRAND call once; on any status other than
// CURAND_STATUS_SUCCESS print the numeric status with file and line to
// stderr and exit(1).
#define CHECK_CURAND(call) \
{ \
    const curandStatus_t err = (call); \
    if (err != CURAND_STATUS_SUCCESS) \
    { \
        fprintf(stderr, "Got CURAND error %d at %s:%d\n", \
                err, __FILE__, __LINE__); \
        exit(1); \
    } \
}

// CHECK_CUFFT(call): run a cuFFT call once; on any result other than
// CUFFT_SUCCESS print the numeric result with file and line to stderr and
// exit(1).
#define CHECK_CUFFT(call) \
{ \
    const cufftResult err = (call); \
    if (err != CUFFT_SUCCESS) \
    { \
        fprintf(stderr, "Got CUFFT error %d at %s:%d\n", \
                err, __FILE__, __LINE__); \
        exit(1); \
    } \
}

// CHECK_CUSPARSE(call): run a cuSPARSE call once; on any status other than
// CUSPARSE_STATUS_SUCCESS print the numeric status with file and line to
// stderr, additionally report a pending CUDA runtime error if
// cudaGetLastError() returns one, then exit(1).
#define CHECK_CUSPARSE(call) \
{ \
    const cusparseStatus_t err = (call); \
    if (err != CUSPARSE_STATUS_SUCCESS) \
    { \
        fprintf(stderr, "Got error %d at %s:%d\n", err, __FILE__, __LINE__); \
        const cudaError_t cuda_err = cudaGetLastError(); \
        if (cuda_err != cudaSuccess) \
        { \
            fprintf(stderr, "  CUDA error \"%s\" also detected\n", \
                    cudaGetErrorString(cuda_err)); \
        } \
        exit(1); \
    } \
}

// Return the current time of day from gettimeofday() as a double, in
// seconds, with the microsecond field folded into the fractional part.
// Intended for interval timing: subtract two results.
inline double seconds()
{
    struct timeval tp;
    // The timezone argument is obsolete, so pass NULL rather than a
    // struct timezone that is never read.
    gettimeofday(&tp, NULL);
    return ((double)tp.tv_sec + (double)tp.tv_usec * 1.e-6);
}

#endif // _COMMON_H

================================================
FILE: CUDA/chapter11_CUDA流/error.cuh
================================================
#pragma once
#include <stdio.h>

// CHECK(call): evaluate a CUDA runtime call once and compare the returned
// cudaError_t with cudaSuccess. On failure, print the source file, line,
// numeric error code and cudaGetErrorString() text to stdout, then exit(1).
// The do { ... } while (0) wrapper makes the expansion a single statement,
// so the caller supplies the trailing semicolon.
// NOTE(review): exit() is used while this header only includes <stdio.h>;
// it relies on the including translation unit to declare exit().
#define CHECK(call)                                   \
do                                                    \
{                                                     \
    const cudaError_t error_code = call;              \
    if (error_code != cudaSuccess)                    \
    {                                                 \
        printf("CUDA Error:\n");                      \
        printf("    File:       %s\n", __FILE__);     \
        printf("    Line:       %d\n", __LINE__);     \
        printf("    Error code: %d\n", error_code);   \
        printf("    Error text: %s\n",                \
            cudaGetErrorString(error_code));          \
        exit(1);                                      \
    }                                                 \
} while (0)


================================================
FILE: CUDA/chapter11_CUDA流/host-kernel.cu
================================================
#include "error.cuh"
#include <cmath>
#include <cstdlib>

#ifdef USE_DP
    typedef double real;
#else
    typedef float real;
#endif

const int NUM_REPEATS = 10;
const int N = 100000000;
const int M = sizeof(real) * N;
const int block_size = 128;
const int grid_size = (N - 1) / block_size + 1;

void timing(const real *h_x, const real *h_y, real *h_z,
            const real *d_x, const real *d_y, real *d_z,
            const int ratio, bool overlap);

/*
 * Allocate three host and three device arrays of N elements, initialise and
 * upload the two inputs, then time the host-plus-kernel workload with and
 * without CPU-GPU overlap for ratio = 10, 1 and 1000 (the host sums
 * N / ratio elements).
 */
int main(int argc, char *argv[])
{
    real *h_x = (real *)malloc(M);
    real *h_y = (real *)malloc(M);
    real *h_z = (real *)malloc(M);
    if (h_x == NULL || h_y == NULL || h_z == NULL)
    {
        // Fail clearly instead of writing through a null pointer below.
        printf("Failed to allocate 3 x %d bytes of host memory.\n", M);
        free(h_x);
        free(h_y);
        free(h_z);
        exit(1);
    }
    for (int n = 0; n < N; ++n)
    {
        h_x[n] = 1.23;
        h_y[n] = 2.34;
    }

    real *d_x, *d_y, *d_z;
    CHECK(cudaMalloc(&d_x, M));
    CHECK(cudaMalloc(&d_y, M));
    CHECK(cudaMalloc(&d_z, M));
    CHECK(cudaMemcpy(d_x, h_x, M, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(d_y, h_y, M, cudaMemcpyHostToDevice));

    printf("Without CPU-GPU overlap (ratio = 10)\n");
    timing(h_x, h_y, h_z, d_x, d_y, d_z, 10, false);
    printf("With CPU-GPU overlap (ratio = 10)\n");
    timing(h_x, h_y, h_z, d_x, d_y, d_z, 10, true);

    printf("Without CPU-GPU overlap (ratio = 1)\n");
    timing(h_x, h_y, h_z, d_x, d_y, d_z, 1, false);
    printf("With CPU-GPU overlap (ratio = 1)\n");
    timing(h_x, h_y, h_z, d_x, d_y, d_z, 1, true);

    printf("Without CPU-GPU overlap (ratio = 1000)\n");
    timing(h_x, h_y, h_z, d_x, d_y, d_z, 1000, false);
    printf("With CPU-GPU overlap (ratio = 1000)\n");
    timing(h_x, h_y, h_z, d_x, d_y, d_z, 1000, true);

    free(h_x);
    free(h_y);
    free(h_z);
    CHECK(cudaFree(d_x));
    CHECK(cudaFree(d_y));
    CHECK(cudaFree(d_z));

    return 0;

}

// Host-side element-wise addition: z[n] = x[n] + y[n] for the first N_host
// elements.
void cpu_sum(const real *x, const real *y, real *z, const int N_host)
{
    int n = 0;
    while (n < N_host)
    {
        z[n] = x[n] + y[n];
        ++n;
    }
}

// Device-side element-wise addition: one thread per element, bounded by the
// file-level constant N.
void __global__ gpu_sum(const real *x, const real *y, real *z)
{
    const int n = threadIdx.x + blockIdx.x * blockDim.x;
    if (n >= N)
    {
        return; // thread falls past the end of the arrays
    }
    z[n] = x[n] + y[n];
}

/*
 * Time one host sum of N / ratio elements plus one gpu_sum kernel launch,
 * NUM_REPEATS + 1 times, and print the mean and standard deviation.
 *
 * overlap == false: the host sum runs before the kernel is launched.
 * overlap == true:  the kernel is launched first and the host sum runs
 *                   after the launch call returns.
 * Iteration 0 is printed but excluded from the statistics.
 */
void timing
(
    const real *h_x, const real *h_y, real *h_z,
    const real *d_x, const real *d_y, real *d_z,
    const int ratio, bool overlap
)
{
    float t_sum = 0;
    float t2_sum = 0;

    for (int repeat = 0; repeat <= NUM_REPEATS; ++repeat)
    {
        cudaEvent_t start, stop;
        CHECK(cudaEventCreate(&start));
        CHECK(cudaEventCreate(&stop));
        CHECK(cudaEventRecord(start));
        cudaEventQuery(start); // return value is not checked

        if (!overlap)
        {
            cpu_sum(h_x, h_y, h_z, N / ratio);
        }

        gpu_sum<<<grid_size, block_size>>>(d_x, d_y, d_z);

        if (overlap)
        {
            cpu_sum(h_x, h_y, h_z, N / ratio);
        }
 
        CHECK(cudaEventRecord(stop));
        CHECK(cudaEventSynchronize(stop));
        float elapsed_time;
        CHECK(cudaEventElapsedTime(&elapsed_time, start, stop));
        printf("Time = %g ms.\n", elapsed_time);

        if (repeat > 0)
        {
            t_sum += elapsed_time;
            t2_sum += elapsed_time * elapsed_time;
        }

        CHECK(cudaEventDestroy(start));
        CHECK(cudaEventDestroy(stop));
    }

    const float t_ave = t_sum / NUM_REPEATS;
    // E[t^2] - E[t]^2 can round to a slightly negative value when the
    // samples are nearly identical; clamp it so sqrt() never yields NaN.
    const float t_var = t2_sum / NUM_REPEATS - t_ave * t_ave;
    const float t_err = (t_var > 0) ? sqrt(t_var) : 0;
    printf("Time = %g +- %g ms.\n", t_ave, t_err);
}



================================================
FILE: CUDA/chapter11_CUDA流/kernel-kernel.cu
================================================
#include "error.cuh"
#include <cmath>
#include <cstdio>

// Floating-point type used by this program: double when USE_DP is defined at
// compile time, float otherwise.
#ifdef USE_DP
    typedef double real;
#else
    typedef float real;
#endif

// Number of rounds that enter the average in timing(); one additional
// round (round 0) is run first and not counted.
const int NUM_REPEATS = 10;
// Number of elements handled by one kernel launch.
const int N1 = 1024;
// Largest number of streams that is timed; also the number of streams created.
const int MAX_NUM_STREAMS = 30;
// Total number of elements: one N1-sized slice per stream.
const int N = N1 * MAX_NUM_STREAMS;
// Size in bytes of each full array.
const int M = sizeof(real) * N;
// Launch configuration for one N1-element slice.
const int block_size = 128;
const int grid_size = (N1 - 1) / block_size + 1;
// Stream handles; created and destroyed in main(), used by timing().
cudaStream_t streams[MAX_NUM_STREAMS];

// Launches `num` kernels, one per stream, and prints the average elapsed time.
void timing(const real *d_x, const real *d_y, real *d_z, const int num);

// Program entry: fills two host arrays, uploads them, creates all streams,
// calls timing() for every stream count from 1 to MAX_NUM_STREAMS and finally
// releases streams and memory.
int main(int argc, char *argv[])
{
    real *h_x = (real *)malloc(M);
    real *h_y = (real *)malloc(M);
    int idx = 0;
    while (idx < N)
    {
        h_x[idx] = 1.23;
        h_y[idx] = 2.34;
        ++idx;
    }

    real *d_x;
    real *d_y;
    real *d_z;
    CHECK(cudaMalloc(&d_x, M));
    CHECK(cudaMalloc(&d_y, M));
    CHECK(cudaMalloc(&d_z, M));
    CHECK(cudaMemcpy(d_x, h_x, M, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(d_y, h_y, M, cudaMemcpyHostToDevice));

    // Create every stream up front so timing() can pick the first `num`.
    for (int s = 0; s != MAX_NUM_STREAMS; s++)
    {
        CHECK(cudaStreamCreate(streams + s));
    }

    int num = 1;
    while (num <= MAX_NUM_STREAMS)
    {
        timing(d_x, d_y, d_z, num);
        ++num;
    }

    for (int s = 0; s != MAX_NUM_STREAMS; s++)
    {
        CHECK(cudaStreamDestroy(streams[s]));
    }

    free(h_x);
    free(h_y);
    CHECK(cudaFree(d_x));
    CHECK(cudaFree(d_y));
    CHECK(cudaFree(d_z));

    return 0;
}

// Adds one N1-element slice: d_z[i] = d_x[i] + d_y[i]. Threads with an index
// of N1 or more return immediately.
void __global__ add(const real *d_x, const real *d_y, real *d_z)
{
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx >= N1)
    {
        return;
    }

    // The same store is repeated 100000 times; presumably this only serves to
    // make the kernel run long enough to be measured.
    int pass = 0;
    while (pass < 100000)
    {
        d_z[idx] = d_x[idx] + d_y[idx];
        ++pass;
    }
}

// Launches the add kernel once in each of the first `num` streams, each on its
// own N1-element slice, and prints the average elapsed time in milliseconds.
//
// d_x, d_y: device inputs; d_z: device output (all N elements long).
// num:      number of streams (and kernel launches) per round.
//
// NUM_REPEATS + 1 rounds are run; round 0 is excluded from the average.
void timing(const real *d_x, const real *d_y, real *d_z, const int num)
{
    float t_sum = 0;

    for (int repeat = 0; repeat <= NUM_REPEATS; ++repeat)
    {
        cudaEvent_t start, stop;
        CHECK(cudaEventCreate(&start));
        CHECK(cudaEventCreate(&stop));
        CHECK(cudaEventRecord(start));
        // Deliberately not wrapped in CHECK: cudaEventQuery reports
        // cudaErrorNotReady while the event is still pending.
        cudaEventQuery(start);

        for (int n = 0; n < num; ++n)
        {
            // Stream n works on elements [n * N1, (n + 1) * N1).
            int offset = n * N1;
            add<<<grid_size, block_size, 0, streams[n]>>>
            (d_x + offset, d_y + offset, d_z + offset);
        }

        CHECK(cudaEventRecord(stop));
        CHECK(cudaEventSynchronize(stop));
        float elapsed_time;
        CHECK(cudaEventElapsedTime(&elapsed_time, start, stop));

        if (repeat > 0)
        {
            t_sum += elapsed_time;
        }

        CHECK(cudaEventDestroy(start));
        CHECK(cudaEventDestroy(stop));
    }

    // Only the mean is reported, so no sum of squares / deviation is kept.
    const float t_ave = t_sum / NUM_REPEATS;
    printf("%g\n", t_ave);
}

================================================
FILE: CUDA/chapter11_CUDA流/kernel-transfer.cu
================================================
#include "error.cuh"
#include <cmath>
#include <cstdio>

// Floating-point type used by this program: double when USE_DP is defined at
// compile time, float otherwise.
#ifdef USE_DP
    typedef double real;
#else
    typedef float real;
#endif

// Number of rounds that enter the average in timing(); one additional
// round (round 0) is run first and not counted.
const int NUM_REPEATS = 10;
// Total number of elements in each array.
const int N = 1 << 22;
// Size in bytes of each array.
const int M = sizeof(real) * N;
// Largest number of streams that is timed; also the number of streams created.
const int MAX_NUM_STREAMS = 64;
// Stream handles; created and destroyed in main(), used by timing().
cudaStream_t streams[MAX_NUM_STREAMS];

// Splits the arrays across `num` streams, runs copy-in / add / copy-out in
// each stream and reports the elapsed time.
void timing(const real *h_x, const real *h_y, real *h_z,
            real *d_x, real *d_y, real *d_z,
            const int num
           );

// Program entry: allocates host buffers with cudaMallocHost and device
// buffers with cudaMalloc, creates all streams, calls timing() for stream
// counts 1, 2, 4, ... up to MAX_NUM_STREAMS, then releases everything.
int main(int argc, char *argv[])
{
    real *h_x;
    real *h_y;
    real *h_z;
    CHECK(cudaMallocHost(&h_x, M));
    CHECK(cudaMallocHost(&h_y, M));
    CHECK(cudaMallocHost(&h_z, M));
    int idx = 0;
    while (idx < N)
    {
        h_x[idx] = 1.23;
        h_y[idx] = 2.34;
        ++idx;
    }

    real *d_x;
    real *d_y;
    real *d_z;
    CHECK(cudaMalloc(&d_x, M));
    CHECK(cudaMalloc(&d_y, M));
    CHECK(cudaMalloc(&d_z, M));

    for (int s = 0; s != MAX_NUM_STREAMS; ++s)
    {
        CHECK(cudaStreamCreate(streams + s));
    }

    // Double the stream count on every pass.
    int num = 1;
    while (num <= MAX_NUM_STREAMS)
    {
        timing(h_x, h_y, h_z, d_x, d_y, d_z, num);
        num *= 2;
    }

    for (int s = 0; s != MAX_NUM_STREAMS; ++s)
    {
        CHECK(cudaStreamDestroy(streams[s]));
    }

    CHECK(cudaFreeHost(h_x));
    CHECK(cudaFreeHost(h_y));
    CHECK(cudaFreeHost(h_z));
    CHECK(cudaFree(d_x));
    CHECK(cudaFree(d_y));
    CHECK(cudaFree(d_z));

    return 0;
}

// Adds the first N elements: z[i] = x[i] + y[i]. The parameter N hides the
// file-level constant of the same name. Threads with an index of N or more
// return immediately.
void __global__ add(const real *x, const real *y, real *z, int N)
{
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx >= N)
    {
        return;
    }

    // The same store is repeated 40 times; presumably this only lengthens the
    // kernel relative to the transfers.
    int pass = 0;
    while (pass < 40)
    {
        z[idx] = x[idx] + y[idx];
        ++pass;
    }
}

// Times the copy-in / add / copy-out pipeline when the arrays are split evenly
// across `num` streams, and prints one line "<num> <average ms>".
//
// h_x, h_y: host inputs; h_z: host output.
// d_x, d_y, d_z: device buffers of the same length.
// num:      number of streams (and chunks); streams[0..num-1] are used.
//           N / num and M / num are integer divisions, so elements left over
//           when num does not divide N are not processed (main only passes
//           powers of two).
//
// NUM_REPEATS + 1 rounds are run; round 0 is excluded from the average.
void timing(const real *h_x, const real *h_y, real *h_z,
            real *d_x, real *d_y, real *d_z,
            const int num)
{
    const int N1 = N / num; // elements per stream
    const int M1 = M / num; // bytes per stream
    const int block_size = 128;
    const int grid_size = (N1 - 1) / block_size + 1;

    float t_sum = 0;

    for (int repeat = 0; repeat <= NUM_REPEATS; ++repeat)
    {
        cudaEvent_t start, stop;
        CHECK(cudaEventCreate(&start));
        CHECK(cudaEventCreate(&stop));
        CHECK(cudaEventRecord(start));
        // Deliberately not wrapped in CHECK: cudaEventQuery reports
        // cudaErrorNotReady while the event is still pending.
        cudaEventQuery(start);

        for (int i = 0; i < num; i++)
        {
            // Everything for chunk i is queued in stream i, in the order
            // copy-in, kernel, copy-out.
            int offset = i * N1;
            CHECK(cudaMemcpyAsync(d_x + offset, h_x + offset, M1, cudaMemcpyHostToDevice, streams[i]));
            CHECK(cudaMemcpyAsync(d_y + offset, h_y + offset, M1, cudaMemcpyHostToDevice, streams[i]));

            add<<<grid_size, block_size, 0, streams[i]>>>
            (d_x + offset, d_y + offset, d_z + offset, N1);

            CHECK(cudaMemcpyAsync(h_z + offset, d_z + offset, M1, cudaMemcpyDeviceToHost, streams[i]));
        }

        CHECK(cudaEventRecord(stop));
        CHECK(cudaEventSynchronize(stop));
        float elapsed_time;
        CHECK(cudaEventElapsedTime(&elapsed_time, start, stop));

        if (repeat > 0)
        {
            t_sum += elapsed_time;
        }

        CHECK(cudaEventDestroy(start));
        CHECK(cudaEventDestroy(stop));
    }

    // Report once, after all rounds have been accumulated. Printing inside
    // the loop would emit partial sums divided by NUM_REPEATS.
    const float t_ave = t_sum / NUM_REPEATS;
    printf("%d %g\n", num, t_ave);
}

================================================
FILE: CUDA/chapter11_CUDA流/pinMemTransfer.cu
================================================
#include "common.h"
#include <cuda_runtime.h>
#include <stdio.h>

/*
 * An example of using CUDA's memory copy API to transfer data to and from the
 * device. In this case, cudaMalloc is used to allocate memory on the GPU and
 * cudaMemcpy is used to transfer the contents of host memory to an array
 * allocated using cudaMalloc. Host memory is allocated using cudaMallocHost to
 * create a page-locked host array.
 */

// Copies a buffer of 1 << 22 floats from page-locked host memory to the
// device and back again. Exits early (with EXIT_SUCCESS) when the device
// reports that it cannot map host memory.
int main(int argc, char **argv)
{
    // set up device
    int dev = 0;
    CHECK(cudaSetDevice(dev));

    // memory size: element count and the matching byte count
    unsigned int isize = 1 << 22;
    unsigned int nbytes = isize * sizeof(float);

    // get device information
    cudaDeviceProp deviceProp;
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));

    if (!deviceProp.canMapHostMemory)
    {
        printf("Device %d does not support mapping CPU host memory!\n", dev);
        CHECK(cudaDeviceReset());
        exit(EXIT_SUCCESS);
    }

    printf("%s starting at ", argv[0]);
    // isize is unsigned, so it is printed with %u rather than %d.
    printf("device %d: %s memory size %u nbyte %5.2fMB canMap %d\n", dev,
           deviceProp.name, isize, nbytes / (1024.0f * 1024.0f),
           deviceProp.canMapHostMemory);

    // allocate pinned (page-locked) host memory
    float *h_a;
    CHECK(cudaMallocHost ((float **)&h_a, nbytes));

    // allocate device memory
    float *d_a;
    CHECK(cudaMalloc((float **)&d_a, nbytes));

    // initialize host memory
    memset(h_a, 0, nbytes);

    // The index has the same unsigned type as isize to avoid a
    // signed/unsigned comparison.
    for (unsigned int i = 0; i < isize; i++) h_a[i] = 100.10f;

    // transfer data from the host to the device
    CHECK(cudaMemcpy(d_a, h_a, nbytes, cudaMemcpyHostToDevice));

    // transfer data from the device to the host
    CHECK(cudaMemcpy(h_a, d_a, nbytes, cudaMemcpyDeviceToHost));

    // free memory
    CHECK(cudaFree(d_a));
    CHECK(cudaFreeHost(h_a));

    // reset device
    CHECK(cudaDeviceReset());
    return EXIT_SUCCESS;
}

================================================
FILE: CUDA/chapter11_CUDA流/simpleHyperqOpenmp.cu
================================================
#include "common.h"
#include <cstdio>
#include <cuda_runtime.h>
#include <cstdlib>
#include <omp.h>

/*
 * An example of using OpenMP to parallelize the creation of CUDA work in
 * multiple streams. This example uses n_streams OpenMP threads to launch 4
 * kernels in each stream. Note the new pragma introduced, #pragma omp parallel.
 */

// Trip count of the loop inside each of the four kernels.
#define N 300000000
// Default number of streams (and OpenMP threads) when none is given on the
// command line.
#define NSTREAM 4

// Loops N times, accumulating tan(0.1) * tan(0.1) into a local variable that is
// neither stored nor returned.
__global__ void kernel_1()
{
    double acc = 0.0;

    for (int step = 0; step < N; ++step)
    {
        acc += tan(0.1) * tan(0.1);
    }
}

// Same loop as the other kernels in this file: N additions of
// tan(0.1) * tan(0.1) into a local that is never written out.
__global__ void kernel_2()
{
    double acc = 0.0;

    for (int step = 0; step < N; ++step)
    {
        acc += tan(0.1) * tan(0.1);
    }
}

// Same loop as the other kernels in this file: N additions of
// tan(0.1) * tan(0.1) into a local that is never written out.
__global__ void kernel_3()
{
    double acc = 0.0;

    for (int step = 0; step < N; ++step)
    {
        acc += tan(0.1) * tan(0.1);
    }
}

// Same loop as the other kernels in this file: N additions of
// tan(0.1) * tan(0.1) into a local that is never written out.
__global__ void kernel_4()
{
    double acc = 0.0;

    for (int step = 0; step < N; ++step)
    {
        acc += tan(0.1) * tan(0.1);
    }
}

// Launches kernel_1..kernel_4 into n_streams streams, one OpenMP thread per
// stream, and prints the elapsed time measured with CUDA events.
//
// Command line:
//   argv[1] (optional): number of streams / OpenMP threads (default NSTREAM,
//                       must be at least 1).
//   argv[2] (optional): 1 selects the larger launch configuration.
//
// Returns 0 on success and EXIT_FAILURE for an invalid stream count or a
// failed host allocation.
int main(int argc, char **argv)
{
    int n_streams = NSTREAM;
    int isize = 1;
    int iblock = 1;
    int bigcase = 0;

    // get arguments from the command line
    if (argc > 1) n_streams = atoi(argv[1]);

    if (argc > 2) bigcase = atoi(argv[2]);

    // atoi yields 0 for non-numeric text and may yield a negative value;
    // neither is usable as an allocation size or a thread count.
    if (n_streams < 1)
    {
        printf("> number of streams must be at least 1 (got %d)\n", n_streams);
        return EXIT_FAILURE;
    }

    float elapsed_time;

    // set CUDA_DEVICE_MAX_CONNECTIONS in this process's environment and echo it
    char iname[] = "CUDA_DEVICE_MAX_CONNECTIONS";
    setenv (iname, "4", 1);
    char *ivalue =  getenv (iname);
    printf ("%s = %s\n", iname, ivalue);

    int dev = 0;
    cudaDeviceProp deviceProp;
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    printf("> Using Device %d: %s with num_streams=%d\n", dev, deviceProp.name,
           n_streams);
    CHECK(cudaSetDevice(dev));

    // warn when the compute capability is below 3.5
    if (deviceProp.major < 3 || (deviceProp.major == 3 && deviceProp.minor < 5))
    {
        if (deviceProp.concurrentKernels == 0)
        {
            printf("> GPU does not support concurrent kernel execution (SM 3.5 "
                    "or higher required)\n");
            printf("> CUDA kernel runs will be serialized\n");
        }
        else
        {
            printf("> GPU does not support HyperQ\n");
            printf("> CUDA kernel runs will have limited concurrency\n");
        }
    }

    printf("> Compute Capability %d.%d hardware with %d multi-processors\n",
           deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount);

    // allocate and initialize an array of stream handles
    cudaStream_t *streams = (cudaStream_t *) malloc(n_streams * sizeof(
                                cudaStream_t));

    if (streams == NULL)
    {
        printf("> failed to allocate %d stream handles\n", n_streams);
        return EXIT_FAILURE;
    }

    for (int i = 0 ; i < n_streams ; i++)
    {
        CHECK(cudaStreamCreate(&(streams[i])));
    }

    // run the kernels with more threads when requested
    if (bigcase == 1)
    {
        iblock = 512;
        isize = 1 << 12;
    }

    // set up execution configuration
    dim3 block (iblock);
    dim3 grid  (isize / iblock);
    printf("> grid %d block %d\n", grid.x, block.x);

    // create events
    cudaEvent_t start, stop;
    CHECK(cudaEventCreate(&start));
    CHECK(cudaEventCreate(&stop));

    // record start event
    CHECK(cudaEventRecord(start, 0));

    // dispatch jobs in depth-first order using OpenMP: every thread uses its
    // thread number as the stream index and queues all four kernels there
    omp_set_num_threads(n_streams);
    #pragma omp parallel
    {
        int i = omp_get_thread_num();
        kernel_1<<<grid, block, 0, streams[i]>>>();
        kernel_2<<<grid, block, 0, streams[i]>>>();
        kernel_3<<<grid, block, 0, streams[i]>>>();
        kernel_4<<<grid, block, 0, streams[i]>>>();
    }

    // record stop event
    CHECK(cudaEventRecord(stop, 0));
    CHECK(cudaEventSynchronize(stop));

    // calculate elapsed time
    CHECK(cudaEventElapsedTime(&elapsed_time, start, stop));
    printf("Measured time for parallel execution = %.3fs\n",
           elapsed_time / 1000.0f);

    // surface any error raised by the kernel launches above
    CHECK(cudaGetLastError());

    // release all streams
    for (int i = 0 ; i < n_streams ; i++)
    {
        CHECK(cudaStreamDestroy(streams[i]));
    }

    free(streams);

    // destroy events
    CHECK(cudaEventDestroy(start));
    CHECK(cudaEventDestroy(stop));

    // reset device
    CHECK(cudaDeviceReset());

    return 0;
}

================================================
FILE: CUDA/chapter11_CUDA流/simpleMultiAddBreadth.cu
================================================
#include "common.h"
#include <cstdio>
#include <cuda_runtime.h>

/*
 * This example demonstrates overlapping computation and communication by
 * partitioning a data set and asynchronously launching the memory copies and
 * kernels for each subset. Launching all transfers and kernels for a given
 * subset in the same CUDA stream ensures that computation on the device is not
 * started until the necessary data has been transferred. However, because the
 * work of each subset is independent of all other subsets, the communication
 * and computation of different subsets will overlap.
 *
 * This example launches copies and kernels in breadth-first order.
 */

// Number of streams the data set is partitioned across.
#define NSTREAM 4
// Threads per block used for every kernel launch.
#define BDIM 128

// Fills ip[0..size-1] with values (rand() & 0xFF) / 10, i.e. multiples of 0.1
// between 0.0 and 25.5. The generator is not seeded here.
void initialData(float *ip, int size)
{
    int k = 0;

    while (k < size)
    {
        const int low_byte = rand() & 0xFF;
        ip[k] = (float)low_byte / 10.0f;
        ++k;
    }
}

// Host reference implementation: C[i] = A[i] + B[i] for i in [0, N).
void sumArraysOnHost(float *A, float *B, float *C, const int N)
{
    int k = 0;
    while (k < N)
    {
        C[k] = A[k] + B[k];
        ++k;
    }
}

// Device vector addition: the thread with global index idx < N writes
// C[idx] = A[idx] + B[idx]. The store is repeated N times per thread;
// presumably this only serves to lengthen the kernel.
__global__ void sumArrays(float *A, float *B, float *C, const int N)
{
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;

    if (idx >= N)
    {
        return;
    }

    int pass = 0;
    while (pass < N)
    {
        C[idx] = A[idx] + B[idx];
        ++pass;
    }
}

// Compares the first N elements of hostRef and gpuRef and prints the verdict.
//
// On the first element whose difference exceeds 1e-8 in magnitude it prints
// "Arrays do not match!" together with both values and the index, then stops.
// If no such element exists it prints "Arrays match.".
void checkResult(float *hostRef, float *gpuRef, const int N)
{
    const double epsilon = 1.0E-8;
    bool match = true;

    for (int i = 0; i < N; i++)
    {
        // Compare against +/-epsilon directly instead of calling abs(): which
        // abs overload is found depends on the headers in scope, and the
        // integer one would truncate every difference below 1 to 0.
        const float diff = hostRef[i] - gpuRef[i];

        if (diff > epsilon || diff < -epsilon)
        {
            match = false;
            printf("Arrays do not match!\n");
            printf("host %5.2f gpu %5.2f at %d\n", hostRef[i], gpuRef[i], i);
            break;
        }
    }

    if (match) printf("Arrays match.\n\n");
}

// Runs the vector addition twice: first sequentially (copy in, kernel, copy
// out, each timed separately), then split across NSTREAM streams with the
// copies and kernels queued in breadth-first order. Prints the timings and
// compares the device result with a host reference.
int main(int argc, char **argv)
{
    printf("> %s Starting...\n", argv[0]);

    int dev = 0;
    cudaDeviceProp deviceProp;
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    printf("> Using Device %d: %s\n", dev, deviceProp.name);
    CHECK(cudaSetDevice(dev));

    // warn when the compute capability is below 3.5
    if (deviceProp.major < 3 || (deviceProp.major == 3 && deviceProp.minor < 5))
    {
        if (deviceProp.concurrentKernels == 0)
        {
            printf("> GPU does not support concurrent kernel execution (SM 3.5 "
                    "or higher required)\n");
            printf("> CUDA kernel runs will be serialized\n");
        }
        else
        {
            printf("> GPU does not support HyperQ\n");
            printf("> CUDA kernel runs will have limited concurrency\n");
        }
    }

    printf("> Compute Capability %d.%d hardware with %d multi-processors\n",
           deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount);

    // set CUDA_DEVICE_MAX_CONNECTIONS to "1" in this process's environment
    // and echo the value back
    char iname[] = "CUDA_DEVICE_MAX_CONNECTIONS";
    setenv (iname, "1", 1);
    char *ivalue =  getenv (iname);
    printf ("> %s = %s\n", iname, ivalue);
    printf ("> with streams = %d\n", NSTREAM);

    // set up data size of vectors
    int nElem = 1 << 18;
    printf("> vector size = %d\n", nElem);
    size_t nBytes = nElem * sizeof(float);

    // allocate page-locked host memory (cudaHostAlloc) for the async copies
    float *h_A, *h_B, *hostRef, *gpuRef;
    CHECK(cudaHostAlloc((void**)&h_A, nBytes, cudaHostAllocDefault));
    CHECK(cudaHostAlloc((void**)&h_B, nBytes, cudaHostAllocDefault));
    CHECK(cudaHostAlloc((void**)&gpuRef, nBytes, cudaHostAllocDefault));
    CHECK(cudaHostAlloc((void**)&hostRef, nBytes, cudaHostAllocDefault));

    // initialize data at host side
    initialData(h_A, nElem);
    initialData(h_B, nElem);
    memset(hostRef, 0, nBytes);
    memset(gpuRef,  0, nBytes);

    // add vector at host side for result checks
    sumArraysOnHost(h_A, h_B, hostRef, nElem);

    // allocate device global memory
    float *d_A, *d_B, *d_C;
    CHECK(cudaMalloc((float**)&d_A, nBytes));
    CHECK(cudaMalloc((float**)&d_B, nBytes));
    CHECK(cudaMalloc((float**)&d_C, nBytes));

    // one event pair is reused for every measurement below
    cudaEvent_t start, stop;
    CHECK(cudaEventCreate(&start));
    CHECK(cudaEventCreate(&stop));

    // launch configuration covering the whole vector
    dim3 block (BDIM);
    dim3 grid  ((nElem + block.x - 1) / block.x);
    printf("> grid (%d, %d) block (%d, %d)\n", grid.x, grid.y, block.x,
            block.y);

    // sequential operation, step 1: time both host-to-device copies
    CHECK(cudaEventRecord(start, 0));
    CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice));
    CHECK(cudaEventRecord(stop, 0));
    CHECK(cudaEventSynchronize(stop));
    float memcpy_h2d_time;
    CHECK(cudaEventElapsedTime(&memcpy_h2d_time, start, stop));

    // step 2: time the kernel on the full vector
    CHECK(cudaEventRecord(start, 0));
    sumArrays<<<grid, block>>>(d_A, d_B, d_C, nElem);
    CHECK(cudaEventRecord(stop, 0));
    CHECK(cudaEventSynchronize(stop));
    float kernel_time;
    CHECK(cudaEventElapsedTime(&kernel_time, start, stop));

    // step 3: time the device-to-host copy of the result
    CHECK(cudaEventRecord(start, 0));
    CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));
    CHECK(cudaEventRecord(stop, 0));
    CHECK(cudaEventSynchronize(stop));
    float memcpy_d2h_time;
    CHECK(cudaEventElapsedTime(&memcpy_d2h_time, start, stop));
    // sequential baseline: sum of the three separately measured phases
    float itotal = kernel_time + memcpy_h2d_time + memcpy_d2h_time;

    // bytes * 1e-6 divided by milliseconds gives GB/s
    printf("\n");
    printf("Measured timings (throughput):\n");
    printf(" Memcpy host to device\t: %f ms (%f GB/s)\n",
           memcpy_h2d_time, (nBytes * 1e-6) / memcpy_h2d_time);
    printf(" Memcpy device to host\t: %f ms (%f GB/s)\n",
           memcpy_d2h_time, (nBytes * 1e-6) / memcpy_d2h_time);
    printf(" Kernel\t\t\t: %f ms (%f GB/s)\n",
           kernel_time, (nBytes * 2e-6) / kernel_time);
    printf(" Total\t\t\t: %f ms (%f GB/s)\n",
           itotal, (nBytes * 2e-6) / itotal);

    // streamed operation: each stream handles iElem elements, and the grid is
    // resized to cover one such chunk
    int iElem = nElem / NSTREAM;
    size_t iBytes = iElem * sizeof(float);
    grid.x = (iElem + block.x - 1) / block.x;

    cudaStream_t stream[NSTREAM];

    for (int i = 0; i < NSTREAM; ++i)
    {
        CHECK(cudaStreamCreate(&stream[i]));
    }

    CHECK(cudaEventRecord(start, 0));

    // breadth-first, pass 1: queue the input copies of every stream
    for (int i = 0; i < NSTREAM; ++i)
    {
        int ioffset = i * iElem;
        CHECK(cudaMemcpyAsync(&d_A[ioffset], &h_A[ioffset], iBytes,
                              cudaMemcpyHostToDevice, stream[i]));
        CHECK(cudaMemcpyAsync(&d_B[ioffset], &h_B[ioffset], iBytes,
                              cudaMemcpyHostToDevice, stream[i]));
    }

    // breadth-first, pass 2: launch one kernel in each stream
    for (int i = 0; i < NSTREAM; ++i)
    {
        int ioffset = i * iElem;
        sumArrays<<<grid, block, 0, stream[i]>>>(&d_A[ioffset], &d_B[ioffset],
                &d_C[ioffset], iElem);
    }

    // breadth-first, pass 3: queue the result copies of every stream
    for (int i = 0; i < NSTREAM; ++i)
    {
        int ioffset = i * iElem;
        CHECK(cudaMemcpyAsync(&gpuRef[ioffset], &d_C[ioffset], iBytes,
                              cudaMemcpyDeviceToHost, stream[i]));
    }

    CHECK(cudaEventRecord(stop, 0));
    CHECK(cudaEventSynchronize(stop));
    float execution_time;
    CHECK(cudaEventElapsedTime(&execution_time, start, stop));

    // the value labelled "speedup" is the percentage by which the streamed
    // run is shorter than the sequential total
    printf("\n");
    printf("Actual results from overlapped data transfers:\n");
    printf(" overlap with %d streams : %f ms (%f GB/s)\n", NSTREAM,
           execution_time, (nBytes * 2e-6) / execution_time );
    printf(" speedup                : %f \n",
           ((itotal - execution_time) * 100.0f) / itotal);

    // check kernel error
    CHECK(cudaGetLastError());

    // check device results
    checkResult(hostRef, gpuRef, nElem);

    // free device global memory
    CHECK(cudaFree(d_A));
    CHECK(cudaFree(d_B));
    CHECK(cudaFree(d_C));

    // free host memory
    CHECK(cudaFreeHost(h_A));
    CHECK(cudaFreeHost(h_B));
    CHECK(cudaFreeHost(hostRef));
    CHECK(cudaFreeHost(gpuRef));

    // destroy events
    CHECK(cudaEventDestroy(start));
    CHECK(cudaEventDestroy(stop));

    // destroy streams
    for (int i = 0; i < NSTREAM; ++i)
    {
        CHECK(cudaStreamDestroy(stream[i]));
    }

    CHECK(cudaDeviceReset());
    return(0);
}

================================================
FILE: CUDA/chapter11_CUDA流/simpleMultiAddDepth.cu
================================================
#include "common.h"
#include <stdio.h>
#include <cuda_runtime.h>

/*
 * This example demonstrates overlapping computation and communication by
 * partitioning a data set and asynchronously launching the memory copies and
 * kernels for each subset. Launching all transfers and kernels for a given
 * subset in the same CUDA stream ensures that computation on the device is not
 * started until the necessary data has been transferred. However, because the
 * work of each subset is independent of all other subsets, the communication
 * and computation of different subsets will overlap.
 *
 * This example launches copies and kernels in depth-first order.
 */


// Number of streams the data set is partitioned across.
#define NSTREAM 4
// Threads per block used for every kernel launch.
#define BDIM 128

// Fills ip[0..size-1] with values (rand() & 0xFF) / 10, i.e. multiples of 0.1
// between 0.0 and 25.5. The generator is not seeded here.
void initialData(float *ip, int size)
{
    for (int pos = 0; pos < size; ++pos)
    {
        const int low_byte = rand() & 0xFF;
        const float scaled = (float)low_byte / 10.0f;
        ip[pos] = scaled;
    }
}

// Host reference implementation: C[i] = A[i] + B[i] for i in [0, N).
void sumArraysOnHost(float *A, float *B, float *C, const int N)
{
    for (int pos = 0; pos != N && pos < N; ++pos)
    {
        const float lhs = A[pos];
        const float rhs = B[pos];
        C[pos] = lhs + rhs;
    }
}

// Device vector addition: the thread with global index idx < N writes
// C[idx] = A[idx] + B[idx]. The store is repeated N times per thread;
// presumably this only serves to lengthen the kernel.
__global__ void sumArrays(float *A, float *B, float *C, const int N)
{
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;

    if (idx >= N)
    {
        return;
    }

    for (int pass = 0; pass < N; pass++)
    {
        C[idx] = A[idx] + B[idx];
    }
}

// Compares the first N elements of hostRef and gpuRef and prints the verdict.
//
// On the first element whose difference exceeds 1e-8 in magnitude it prints
// "Arrays do not match!" together with both values and the index, then stops.
// If no such element exists it prints "Arrays match.".
void checkResult(float *hostRef, float *gpuRef, const int N)
{
    const double epsilon = 1.0E-8;
    bool match = true;

    for (int i = 0; i < N; i++)
    {
        // Compare against +/-epsilon directly instead of calling abs(): which
        // abs overload is found depends on the headers in scope, and the
        // integer one would truncate every difference below 1 to 0.
        const float diff = hostRef[i] - gpuRef[i];

        if (diff > epsilon || diff < -epsilon)
        {
            match = false;
            printf("Arrays do not match!\n");
            printf("host %5.2f gpu %5.2f at %d\n", hostRef[i], gpuRef[i], i);
            break;
        }
    }

    if (match) printf("Arrays match.\n\n");
}

// Runs the vector addition twice: first sequentially (copy in, kernel, copy
// out, each timed separately), then split across NSTREAM streams with all
// work of one stream queued before moving to the next (depth-first order).
// Prints the timings and compares the device result with a host reference.
int main(int argc, char **argv)
{
    printf("> %s Starting...\n", argv[0]);

    int dev = 0;
    cudaDeviceProp deviceProp;
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    printf("> Using Device %d: %s\n", dev, deviceProp.name);
    CHECK(cudaSetDevice(dev));

    // warn when the compute capability is below 3.5
    if (deviceProp.major < 3 || (deviceProp.major == 3 && deviceProp.minor < 5))
    {
        if (deviceProp.concurrentKernels == 0)
        {
            printf("> GPU does not support concurrent kernel execution (SM 3.5 "
                    "or higher required)\n");
            printf("> CUDA kernel runs will be serialized\n");
        }
        else
        {
            printf("> GPU does not support HyperQ\n");
            printf("> CUDA kernel runs will have limited concurrency\n");
        }
    }

    printf("> Compute Capability %d.%d hardware with %d multi-processors\n",
           deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount);

    // set CUDA_DEVICE_MAX_CONNECTIONS to "1" in this process's environment
    // and echo the value back
    char iname[] = "CUDA_DEVICE_MAX_CONNECTIONS";
    setenv (iname, "1", 1);
    char *ivalue =  getenv (iname);
    printf ("> %s = %s\n", iname, ivalue);
    printf ("> with streams = %d\n", NSTREAM);

    // set up data size of vectors
    int nElem = 1 << 18;
    printf("> vector size = %d\n", nElem);
    size_t nBytes = nElem * sizeof(float);

    // allocate page-locked host memory (cudaHostAlloc) for the async copies
    float *h_A, *h_B, *hostRef, *gpuRef;
    CHECK(cudaHostAlloc((void**)&h_A, nBytes, cudaHostAllocDefault));
    CHECK(cudaHostAlloc((void**)&h_B, nBytes, cudaHostAllocDefault));
    CHECK(cudaHostAlloc((void**)&gpuRef, nBytes, cudaHostAllocDefault));
    CHECK(cudaHostAlloc((void**)&hostRef, nBytes, cudaHostAllocDefault));

    // initialize data at host side
    initialData(h_A, nElem);
    initialData(h_B, nElem);
    memset(hostRef, 0, nBytes);
    memset(gpuRef,  0, nBytes);

    // add vector at host side for result checks
    sumArraysOnHost(h_A, h_B, hostRef, nElem);

    // allocate device global memory
    float *d_A, *d_B, *d_C;
    CHECK(cudaMalloc((float**)&d_A, nBytes));
    CHECK(cudaMalloc((float**)&d_B, nBytes));
    CHECK(cudaMalloc((float**)&d_C, nBytes));

    // one event pair is reused for every measurement below
    cudaEvent_t start, stop;
    CHECK(cudaEventCreate(&start));
    CHECK(cudaEventCreate(&stop));

    // launch configuration covering the whole vector
    dim3 block (BDIM);
    dim3 grid  ((nElem + block.x - 1) / block.x);
    printf("> grid (%d, %d) block (%d, %d)\n", grid.x, grid.y, block.x,
            block.y);

    // sequential operation, step 1: time both host-to-device copies
    CHECK(cudaEventRecord(start, 0));
    CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice));
    CHECK(cudaEventRecord(stop, 0));
    CHECK(cudaEventSynchronize(stop));
    float memcpy_h2d_time;
    CHECK(cudaEventElapsedTime(&memcpy_h2d_time, start, stop));

    // step 2: time the kernel on the full vector
    CHECK(cudaEventRecord(start, 0));
    sumArrays<<<grid, block>>>(d_A, d_B, d_C, nElem);
    CHECK(cudaEventRecord(stop, 0));
    CHECK(cudaEventSynchronize(stop));
    float kernel_time;
    CHECK(cudaEventElapsedTime(&kernel_time, start, stop));

    // step 3: time the device-to-host copy of the result
    CHECK(cudaEventRecord(start, 0));
    CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));
    CHECK(cudaEventRecord(stop, 0));
    CHECK(cudaEventSynchronize(stop));
    float memcpy_d2h_time;
    CHECK(cudaEventElapsedTime(&memcpy_d2h_time, start, stop));
    // sequential baseline: sum of the three separately measured phases
    float itotal = kernel_time + memcpy_h2d_time + memcpy_d2h_time;

    // bytes * 1e-6 divided by milliseconds gives GB/s
    printf("\n");
    printf("Measured timings (throughput):\n");
    printf(" Memcpy host to device\t: %f ms (%f GB/s)\n",
           memcpy_h2d_time, (nBytes * 1e-6) / memcpy_h2d_time);
    printf(" Memcpy device to host\t: %f ms (%f GB/s)\n",
           memcpy_d2h_time, (nBytes * 1e-6) / memcpy_d2h_time);
    printf(" Kernel\t\t\t: %f ms (%f GB/s)\n",
           kernel_time, (nBytes * 2e-6) / kernel_time);
    printf(" Total\t\t\t: %f ms (%f GB/s)\n",
           itotal, (nBytes * 2e-6) / itotal);

    // streamed operation: each stream handles iElem elements, and the grid is
    // resized to cover one such chunk
    int iElem = nElem / NSTREAM;
    size_t iBytes = iElem * sizeof(float);
    grid.x = (iElem + block.x - 1) / block.x;

    cudaStream_t stream[NSTREAM];

    for (int i = 0; i < NSTREAM; ++i)
    {
        CHECK(cudaStreamCreate(&stream[i]));
    }

    CHECK(cudaEventRecord(start, 0));

    // depth-first order: for each stream in turn, queue its two input
    // copies, its kernel and its result copy before touching the next stream
    for (int i = 0; i < NSTREAM; ++i)
    {
        int ioffset = i * iElem;
        CHECK(cudaMemcpyAsync(&d_A[ioffset], &h_A[ioffset], iBytes,
                              cudaMemcpyHostToDevice, stream[i]));
        CHECK(cudaMemcpyAsync(&d_B[ioffset], &h_B[ioffset], iBytes,
                              cudaMemcpyHostToDevice, stream[i]));
        sumArrays<<<grid, block, 0, stream[i]>>>(&d_A[ioffset], &d_B[ioffset],
                &d_C[ioffset], iElem);
        CHECK(cudaMemcpyAsync(&gpuRef[ioffset], &d_C[ioffset], iBytes,
                              cudaMemcpyDeviceToHost, stream[i]));
    }

    CHECK(cudaEventRecord(stop, 0));
    CHECK(cudaEventSynchronize(stop));
    float execution_time;
    CHECK(cudaEventElapsedTime(&execution_time, start, stop));

    // the value labelled "speedup" is the percentage by which the streamed
    // run is shorter than the sequential total
    printf("\n");
    printf("Actual results from overlapped data transfers:\n");
    printf(" overlap with %d streams : %f ms (%f GB/s)\n", NSTREAM,
           execution_time, (nBytes * 2e-6) / execution_time );
    printf(" speedup                : %f \n",
           ((itotal - execution_time) * 100.0f) / itotal);

    // check kernel error
    CHECK(cudaGetLastError());

    // check device results
    checkResult(hostRef, gpuRef, nElem);

    // free device global memory
    CHECK(cudaFree(d_A));
    CHECK(cudaFree(d_B));
    CHECK(cudaFree(d_C));

    // free host memory
    CHECK(cudaFreeHost(h_A));
    CHECK(cudaFreeHost(h_B));
    CHECK(cudaFreeHost(hostRef));
    CHECK(cudaFreeHost(gpuRef));

    // destroy events
    CHECK(cudaEventDestroy(start));
    CHECK(cudaEventDestroy(stop));

    // destroy streams
    for (int i = 0; i < NSTREAM; ++i)
    {
        CHECK(cudaStreamDestroy(stream[i]));
    }

    CHECK(cudaDeviceReset());
    return(0);
}

================================================
FILE: CUDA/chapter12_使用统一内存编程/Makefile
================================================
# Builds the unified-memory examples; every target compiles one .cu file
# with nvcc into an executable of the same name.
all: add add2_static oversubscription1 oversubscription2 oversubscription3 prefetch

add: add.cu
	nvcc add.cu -o add

add2_static: add2_static.cu
	nvcc add2_static.cu -o add2_static

# -DUNIFIED selects the cudaMallocManaged branch in oversubscription1.cu.
oversubscription1: oversubscription1.cu
	nvcc -arch=sm_75 -O3 -DUNIFIED oversubscription1.cu -o oversubscription1

oversubscription2: oversubscription2.cu
	nvcc -arch=sm_75 -O3 oversubscription2.cu -o oversubscription2

oversubscription3: oversubscription3.cu
	nvcc -arch=sm_75 -O3 oversubscription3.cu -o oversubscription3

prefetch: prefetch.cu
	nvcc -arch=sm_60 -O3 prefetch.cu -o prefetch

# `all` and `clean` name actions rather than files, so both are declared phony;
# otherwise a stray file called `all` would stop the default target from running.
.PHONY: all clean

clean:
	rm -rf add add2_static oversubscription1 oversubscription2 oversubscription3 prefetch


================================================
FILE: CUDA/chapter12_使用统一内存编程/add.cu
================================================
#include "error.cuh"
#include <cmath>
#include <cstdio>

// Largest absolute deviation from c that check() still accepts.
const double EPSILON = 1.0e-15;
// Values stored in every element of x and y respectively.
const double a = 1.23;
const double b = 2.34;
// Value check() expects in every element of z.
const double c = 3.57;
// Kernel: z[i] = x[i] + y[i], one element per thread.
void __global__ add(const double *x, const double *y, double *z);
// Prints whether all N elements of z are within EPSILON of c.
void check(const double *z, const int N);

// Allocates three arrays with cudaMallocManaged, fills x and y on the host,
// adds them on the device, verifies z on the host and frees the arrays.
int main(int argc, char *argv[])
{
    const int N = 100000000;
    const int M = sizeof(double) * N;

    double *x;
    double *y;
    double *z;
    CHECK(cudaMallocManaged((void **)&x, M));
    CHECK(cudaMallocManaged((void **)&y, M));
    CHECK(cudaMallocManaged((void **)&z, M));

    int idx = 0;
    while (idx < N)
    {
        x[idx] = a;
        y[idx] = b;
        ++idx;
    }

    // The kernel has no bounds check, so the launch relies on N being an
    // exact multiple of block_size.
    const int block_size = 128;
    const int grid_size = N / block_size;
    add<<<grid_size, block_size>>>(x, y, z);

    // Wait for the kernel before the host reads z.
    CHECK(cudaDeviceSynchronize());
    check(z, N);

    CHECK(cudaFree(x));
    CHECK(cudaFree(y));
    CHECK(cudaFree(z));
    return 0;
}

// Each thread adds one pair of elements: z[i] = x[i] + y[i], where i is the
// thread's global index. There is no bounds check.
void __global__ add(const double *x, const double *y, double *z)
{
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
    const double total = x[idx] + y[idx];
    z[idx] = total;
}

// Scans all N elements of z and prints "Has errors" if any of them differs
// from the file-level constant c by more than EPSILON, "No errors" otherwise.
void check(const double *z, const int N)
{
    bool any_mismatch = false;
    int idx = 0;
    while (idx < N)
    {
        const double deviation = fabs(z[idx] - c);
        if (deviation > EPSILON)
        {
            any_mismatch = true;
        }
        ++idx;
    }

    const char *verdict = any_mismatch ? "Has errors" : "No errors";
    printf("%s\n", verdict);
}

================================================
FILE: CUDA/chapter12_使用统一内存编程/add2_static.cu
================================================
#include "error.cuh"
#include <cmath>
#include <cstdio>

// Managed array written by the AplusB kernel and read directly by main().
__device__ __managed__ int ret[1000];

// Each thread stores a + b + its threadIdx.x into its own slot of ret.
__global__ void AplusB(int a, int b)
{
    const unsigned int tid = threadIdx.x;
    ret[tid] = a + b + tid;
}

// Launches AplusB with one block of 1000 threads, waits for it to finish and
// prints all 1000 entries of the managed array ret.
int main(int argc, char *argv[])
{
    AplusB<<<1, 1000>>>(10, 100);
    // Report launch failures and errors raised while the kernel ran instead
    // of silently printing whatever ret happens to contain.
    CHECK(cudaGetLastError());
    CHECK(cudaDeviceSynchronize());
    for (int i = 0; i < 1000; i++)
    {
        printf("%d: A+B = %d\n", i, ret[i]);
    }

    return 0;
}

================================================
FILE: CUDA/chapter12_使用统一内存编程/error.cuh
================================================
#pragma once
#include <stdio.h>

// CHECK(call): evaluates `call` once and treats the result as a cudaError_t.
// If it is not cudaSuccess, prints the file, line, numeric error code and
// the text from cudaGetErrorString, then terminates the program with
// exit(1). The do { ... } while (0) wrapper lets the macro be used like a
// single statement followed by a semicolon.
#define CHECK(call)                                   \
do                                                    \
{                                                     \
    const cudaError_t error_code = call;              \
    if (error_code != cudaSuccess)                    \
    {                                                 \
        printf("CUDA Error:\n");                      \
        printf("    File:       %s\n", __FILE__);     \
        printf("    Line:       %d\n", __LINE__);     \
        printf("    Error code: %d\n", error_code);   \
        printf("    Error text: %s\n",                \
            cudaGetErrorString(error_code));          \
        exit(1);                                      \
    }                                                 \
} while (0)


================================================
FILE: CUDA/chapter12_使用统一内存编程/oversubscription1.cu
================================================
#include "error.cuh"
#include <cstdio>
#include <cstdint>

// Largest allocation attempted, in GB.
const int N = 30;

// For n = 1..N, allocate n GB and free it again without ever touching the
// memory. With UNIFIED defined the allocation is managed (unified) memory,
// otherwise plain device memory. Any CUDA error aborts through CHECK.
int main(int argc, char *argv[])
{
    int n = 1;
    while (n <= N)
    {
        uint16_t *x;
        const size_t size = size_t(n) * 1024 * 1024 * 1024;

#ifdef UNIFIED
        CHECK(cudaMallocManaged(&x, size));
#else
        CHECK(cudaMalloc(&x, size));
#endif

        CHECK(cudaFree(x));

#ifdef UNIFIED
        printf("Allocated %d GB unified memory without touch.\n", n);
#else
        printf("Allocate %d GB device memory.\n", n);
#endif

        ++n;
    }

    return 0;
}



================================================
FILE: CUDA/chapter12_使用统一内存编程/oversubscription2.cu
================================================
#include "error.cuh"
#include <cstdio>
#include <cstdint>

// Largest allocation attempted, in GB.
const int N = 30;

// Kernel that writes 0 to each of the first `size` elements of x.
__global__ void gpu_touch(uint64_t *x, const size_t size);

// For n = 1..N, allocate n GB of managed memory, have the GPU write every
// 64-bit element of it, then free it. Any CUDA error aborts through CHECK.
int main(int argc, char *argv[])
{
    int n = 1;
    while (n <= N)
    {
        uint64_t *x;
        const size_t memory_size = size_t(n) * 1024 * 1024 * 1024;
        const size_t data_size = memory_size / sizeof(uint64_t);
        // Enough 1024-thread blocks to give every element its own thread.
        const size_t num_blocks = (data_size - 1) / 1024 + 1;

        CHECK(cudaMallocManaged(&x, memory_size));
        gpu_touch<<<num_blocks, 1024>>>(x, data_size);
        CHECK(cudaGetLastError());
        CHECK(cudaDeviceSynchronize());
        CHECK(cudaFree(x));

        printf("Allocated %d GB unified memory with GPU touch.\n", n);
        ++n;
    }

    return 0;
}

// Write 0 to every element of x with index below `size`; one element per
// thread, threads beyond the end do nothing.
__global__ void gpu_touch(uint64_t *x, const size_t size)
{
    // blockIdx.x and blockDim.x are 32-bit unsigned values, so their product
    // must be widened BEFORE the multiplication; otherwise the index wraps
    // around once the buffer holds 2^32 or more elements.
    const size_t i = size_t(blockIdx.x) * blockDim.x + threadIdx.x;
    if (i < size)
    {
        x[i] = 0;
    }
}



================================================
FILE: CUDA/chapter12_使用统一内存编程/oversubscription3.cu
================================================
#include "error.cuh"
#include <cstdio>
#include <cstdint>

// Largest allocation attempted, in GB.
const int N = 30;

// Write 0 to every complete 64-bit word in the first `size` bytes of x.
// A trailing partial word (size not a multiple of 8) is left untouched.
void cpu_touch(uint64_t *x, size_t size)
{
    const size_t word_count = size / sizeof(uint64_t);
    size_t word = 0;
    while (word < word_count)
    {
        x[word] = 0;
        ++word;
    }
}

// For n = 1..N, allocate n GB of managed memory, have the CPU write every
// 64-bit element of it, then free it. Any CUDA error aborts through CHECK.
int main(int argc, char *argv[])
{
    int n = 1;
    while (n <= N)
    {
        uint64_t *x;
        size_t size = size_t(n) * 1024 * 1024 * 1024;

        CHECK(cudaMallocManaged(&x, size));
        cpu_touch(x, size);
        CHECK(cudaFree(x));

        printf("Allocated %d GB unified memory with CPU touch.\n", n);
        ++n;
    }

    return 0;
}

================================================
FILE: CUDA/chapter12_使用统一内存编程/prefetch.cu
================================================
#include "error.cuh" 
#include <math.h>
#include <stdio.h>

// Tolerance used by check() when comparing each result against c.
const double EPSILON = 1.0e-15;
// a and b are the values main() stores in every element of x and y;
// c is the value check() expects in every element of z.
const double a = 1.23;
const double b = 2.34;
const double c = 3.57;
// Kernel computing z[n] = x[n] + y[n] (defined after main).
void __global__ add(const double *x, const double *y, double *z);
// Host-side verification of the first N elements of z (defined after main).
void check(const double *z, const int N);

// Element-wise vector addition on managed memory with explicit prefetching:
// the inputs are initialised on the host, all three arrays are prefetched
// to the current device before the kernel runs, and the result is
// prefetched back to the CPU before it is verified.
int main(void)
{
    int device_id = 0;
    CHECK(cudaGetDevice(&device_id));

    const int N = 100000000;
    // Byte count kept in size_t (the type the CUDA API takes) so that it
    // cannot overflow a 32-bit int if N is increased.
    const size_t M = sizeof(double) * N;
    double *x, *y, *z;
    CHECK(cudaMallocManaged((void **)&x, M));
    CHECK(cudaMallocManaged((void **)&y, M));
    CHECK(cudaMallocManaged((void **)&z, M));

    for (int n = 0; n < N; ++n)
    {
        x[n] = a;
        y[n] = b;
    }

    // The kernel has no bounds check, so N must stay an exact multiple of
    // block_size (it is: 100000000 / 128 = 781250).
    const int block_size = 128;
    const int grid_size = N / block_size;

    CHECK(cudaMemPrefetchAsync(x, M, device_id, NULL));
    CHECK(cudaMemPrefetchAsync(y, M, device_id, NULL));
    CHECK(cudaMemPrefetchAsync(z, M, device_id, NULL));

    add<<<grid_size, block_size>>>(x, y, z);
    // Launch-configuration errors are only visible through cudaGetLastError().
    CHECK(cudaGetLastError());

    CHECK(cudaMemPrefetchAsync(z, M, cudaCpuDeviceId, NULL));

    // Wait for the kernel and the prefetch before the host reads z.
    CHECK(cudaDeviceSynchronize());
    check(z, N);

    CHECK(cudaFree(x));
    CHECK(cudaFree(y));
    CHECK(cudaFree(z));
    return 0;
}

// Element-wise sum: each thread adds one pair of inputs and stores the
// result. There is no bounds check, so the launch must supply exactly one
// thread per array element.
void __global__ add(const double *x, const double *y, double *z)
{
    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
    const double lhs = x[idx];
    const double rhs = y[idx];
    z[idx] = lhs + rhs;
}

// Verify on the host that each of the first N elements of z is within
// EPSILON of the expected constant c, then print a one-line verdict.
void check(const double *z, const int N)
{
    bool mismatch_found = false;
    int idx = 0;
    while (idx < N)
    {
        const double deviation = fabs(z[idx] - c);
        if (deviation > EPSILON)
        {
            mismatch_found = true;
        }
        ++idx;
    }

    const char *verdict = mismatch_found ? "Has errors" : "No errors";
    printf("%s\n", verdict);
}

================================================
FILE: CUDA/chapter13_分子动力学模拟的CUDA程序开发/cpp/common.cuh
================================================
#pragma once

// Floating-point type used throughout the program: double when USE_DP is
// defined at compile time, float otherwise.
#ifdef USE_DP
    typedef double real;
#else
    typedef float real;
#endif

// Used to turn kinetic energy into temperature in scale_velocity().
// NOTE(review): presumably the Boltzmann constant in eV/K — confirm.
const real K_B = 8.617343e-5;
// main() divides the time step by this value before using it.
// NOTE(review): presumably converts the program's internal time unit to fs — confirm.
const real TIME_UNIT_CONVERSION = 1.018051e+1;

// State of the simulated system. Every member is a heap array that is
// allocated by allocate_memory() and released by deallocate_memory().
// All per-atom arrays have N entries.
struct Atom
{
    int *NN;    // NN[i]: number of neighbors recorded for atom i
    int *NL;    // neighbor list, N * MN entries; neighbor k of atom i is NL[i * MN + k]
    real *m;    // per-atom mass
    real *x;    // per-atom position, x component
    real *y;    // per-atom position, y component
    real *z;    // per-atom position, z component
    real *vx;   // per-atom velocity, x component
    real *vy;   // per-atom velocity, y component
    real *vz;   // per-atom velocity, z component
    real *fx;   // per-atom force, x component
    real *fy;   // per-atom force, y component
    real *fz;   // per-atom force, z component
    real *pe;   // per-atom potential energy written by find_force()
    real *ke;   // per-atom kinetic energy written by integrate()
    real *box;  // 6 values: [0..2] box lengths in x/y/z, [3..5] half of those lengths
};


================================================
FILE: CUDA/chapter13_分子动力学模拟的CUDA程序开发/cpp/error.cuh
================================================
#pragma once
#include <stdio.h>

// CHECK(call): evaluate a CUDA runtime call that yields a cudaError_t.
// If the result is not cudaSuccess, print the source file, the line number,
// the numeric error code and the cudaGetErrorString() text with printf, then
// terminate the process with exit(1).
// The do { ... } while (0) wrapper makes the expansion a single statement,
// so the macro can be followed by a semicolon and used inside if/else.
#define CHECK(call)                                   \
do                                                    \
{                                                     \
    const cudaError_t error_code = call;              \
    if (error_code != cudaSuccess)                    \
    {                                                 \
        printf("CUDA Error:\n");                      \
        printf("    File:       %s\n", __FILE__);     \
        printf("    Line:       %d\n", __LINE__);     \
        printf("    Error code: %d\n", error_code);   \
        printf("    Error text: %s\n",                \
            cudaGetErrorString(error_code));          \
        exit(1);                                      \
    }                                                 \
} while (0)



================================================
FILE: CUDA/chapter13_分子动力学模拟的CUDA程序开发/cpp/force.cu
================================================
#include "force.cuh"
#include "mic.cuh"

void find_force(int N, int MN, Atom *atom)
{
    int *NN = atom->NN;
    int *NL = atom->NL;
    real *x = atom->x;
    real *y = atom->y;
    real *z = atom->z;
    real *fx = atom->fx;
    real *fy = atom->fy;
    real *fz = atom->fz;
    real *pe = atom->pe;
    real *box = atom->box;
    const real epsilon = 1.032e-2;
    const real sigma = 3.405;
    const real cutoff = 10.0;
    const real cutoff_square = cutoff * cutoff;
    const real sigma_3 = sigma * sigma * sigma;
    const real sigma_6 = sigma_3 * sigma_3;
    const real sigma_12 = sigma_6 * sigma_6;
    const real e24s6 = 24.0 * epsilon * sigma_6; 
    const real e48s12 = 48.0 * epsilon * sigma_12;
    const real e4s6 = 4.0 * epsilon * sigma_6; 
    const real e4s12 = 4.0 * epsilon * sigma_12;
    for (int n = 0; n < N; ++n) 
    { 
        fx[n] = fy[n] = fz[n] = pe[n] = 0.0; 
    }
    for (int i = 0; i < N; ++i)
    {
        for (int k = 0; k < NN[i]; k++)
        {
            int j = NL[i * MN + k];
            if (j < i) { continue; }
            real x_ij = x[j] - x[i];
            real y_ij = y[j] - y[i];
            real z_ij = z[j] - z[i];
            apply_mic(box, &x_ij, &y_ij, &z_ij);
            real r2 = x_ij*x_ij + y_ij*y_ij + z_ij*z_ij;
            if (r2 > cutoff_square) { continue; }
            real r2inv = 1.0 / r2;
            real r4inv = r2inv * r2inv;
            real r6inv = r2inv * r4inv;
            real r8inv = r4inv * r4inv;
            real r12inv = r4inv * r8inv;
            real r14inv = r6inv * r8inv;
            real f_ij = e24s6 * r8inv - e48s12 * r14inv;
            pe[i] += e4s12 * r12inv - e4s6 * r6inv;
            fx[i] += f_ij * x_ij; fx[j] -= f_ij * x_ij;
            fy[i] += f_ij * y_ij; fy[j] -= f_ij * y_ij;
            fz[i] += f_ij * z_ij; fz[j] -= f_ij * z_ij;
        }
    }
}



================================================
FILE: CUDA/chapter13_分子动力学模拟的CUDA程序开发/cpp/force.cuh
================================================
#pragma once
#include "common.cuh"

// Compute the force on every atom (atom->fx/fy/fz) and the per-atom
// potential energy (atom->pe) from the positions and the neighbor list
// stored in atom. N is the number of atoms, MN the neighbor-list stride.
void find_force(int N, int MN, Atom *atom);



================================================
FILE: CUDA/chapter13_分子动力学模拟的CUDA程序开发/cpp/initialize.cu
================================================
#include "initialize.cuh"
#include <stdlib.h>
#include <math.h>

static void scale_velocity(int N, real T_0, Atom *atom)
{
    real *m = atom->m;
    real *vx = atom->vx;
    real *vy = atom->vy;
    real *vz = atom->vz;
    real temperature = 0.0;
    for (int n = 0; n < N; ++n) 
    {
        real v2 = vx[n]*vx[n] + vy[n]*vy[n] + vz[n]*vz[n];     
        temperature += m[n] * v2; 
    }
    temperature /= 3.0 * K_B * N;
    real scale_factor = sqrt(T_0 / temperature);
    for (int n = 0; n < N; ++n)
    { 
        vx[n] *= scale_factor;
        vy[n] *= scale_factor;
        vz[n] *= scale_factor;
    }
}

void initialize_position(int nx, real ax, Atom *atom)
{
    atom->box[0] = ax * nx;
    atom->box[1] = ax * nx;
    atom->box[2] = ax * nx;
    atom->box[3] = atom->box[0] * 0.5;
    atom->box[4] = atom->box[1] * 0.5;
    atom->box[5] = atom->box[2] * 0.5;
    real *x = atom->x;
    real *y = atom->y;
    real *z = atom->z;
    real x0[4] = {0.0, 0.0, 0.5, 0.5};
    real y0[4] = {0.0, 0.5, 0.0, 0.5}; 
    real z0[4] = {0.0, 0.5, 0.5, 0.0};
    int n = 0;
    for (int ix = 0; ix < nx; ++ix)
    {
        for (int iy = 0; iy < nx; ++iy)
        {
            for (int iz = 0; iz < nx; ++iz)
            {
                for (int i = 0; i < 4; ++i)
                {
                    x[n] = (ix + x0[i]) * ax;
                    y[n] = (iy + y0[i]) * ax;
                    z[n] = (iz + z0[i]) * ax;
                    n++;
                }
            }
        }
    }
}
  
void initialize_velocity(int N, real T_0, Atom *atom)
{
    real *m = atom->m;
    real *vx = atom->vx;
    real *vy = atom->vy;
    real *vz = atom->vz;
    real momentum_average[3] = {0.0, 0.0, 0.0};
    for (int n = 0; n < N; ++n)
    { 
        vx[n] = -1.0 + (rand() * 2.0) / RAND_MAX; 
        vy[n] = -1.0 + (rand() * 2.0) / RAND_MAX; 
        vz[n] = -1.0 + (rand() * 2.0) / RAND_MAX;    
        
        momentum_average[0] += m[n] * vx[n] / N;
        momentum_average[1] += m[n] * vy[n] / N;
        momentum_average[2] += m[n] * vz[n] / N;
    } 
    for (int n = 0; n < N; ++n) 
    { 
        vx[n] -= momentum_average[0] / m[n];
        vy[n] -= momentum_average[1] / m[n];
        vz[n] -= momentum_average[2] / m[n]; 
    }
    scale_velocity(N, T_0, atom);
}



================================================
FILE: CUDA/chapter13_分子动力学模拟的CUDA程序开发/cpp/initialize.cuh
================================================
#pragma once
#include "common.cuh"

// Fill atom->box and atom->x/y/z for a lattice of nx * nx * nx unit cells
// with lattice constant ax (four atoms per cell).
void initialize_position(int nx, real ax, Atom *atom);
// Assign random velocities to the N atoms, remove the net momentum and
// rescale to the temperature T_0.
void initialize_velocity(int N, real T_0, Atom *atom);



================================================
FILE: CUDA/chapter13_分子动力学模拟的CUDA程序开发/cpp/integrate.cu
================================================
#include "integrate.cuh"
#include "force.cuh"
#include "error.cuh"
#include <stdio.h>
#include <math.h>
#include <time.h>

// Return x[0] + x[1] + ... + x[N-1], accumulated front to back in `real`
// precision (0 when N <= 0).
static real sum(int N, real *x)
{
    real total = 0.0;
    int idx = 0;
    while (idx < N)
    {
        total += x[idx];
        ++idx;
    }
    return total;
}

static void scale_velocity(int N, real T_0, Atom *atom)
{
    real temperature = sum(N, atom->ke) / (1.5 * K_B * N);
    real scale_factor = sqrt(T_0 / temperature);
    for (int n = 0; n < N; ++n)
    { 
        atom->vx[n] *= scale_factor;
        atom->vy[n] *= scale_factor;
        atom->vz[n] *= scale_factor;
    }
}

static void integrate
(int N, real time_step, Atom *atom, int flag)
{
    real *m = atom->m;
    real *x = atom->x;
    real *y = atom->y;
    real *z = atom->z;
    real *vx = atom->vx;
    real *vy = atom->vy;
    real *vz = atom->vz;
    real *fx = atom->fx;
    real *fy = atom->fy;
    real *fz = atom->fz;
    real *ke = atom->ke;
    real time_step_half = time_step * 0.5;
    for (int n = 0; n < N; ++n)
    {
        real mass_inv = 1.0 / m[n];
        real ax = fx[n] * mass_inv;
        real ay = fy[n] * mass_inv;
        real az = fz[n] * mass_inv;
        vx[n] += ax * time_step_half;
        vy[n] += ay * time_step_half;
        vz[n] += az * time_step_half;
        if (flag == 1) 
        { 
            x[n] += vx[n] * time_step; 
            y[n] += vy[n] * time_step; 
            z[n] += vz[n] * time_step; 
        }
        else
        {
            real v2 = vx[n]*vx[n] + vy[n]*vy[n] + vz[n]*vz[n];
            ke[n] = m[n] * v2 * 0.5;
        }
    }
}

// Run Ne time steps in which the velocities are rescaled to the temperature
// T_0 after every step. Forces are computed once up front so the first
// half-step velocity update has valid input.
void equilibration
(
    int Ne, int N, int MN, real T_0,
    real time_step, Atom *atom
)
{
    find_force(N, MN, atom);

    int step = 0;
    while (step < Ne)
    {
        integrate(N, time_step, atom, 1);   // half kick + position update
        find_force(N, MN, atom);            // forces at the new positions
        integrate(N, time_step, atom, 2);   // second half kick + kinetic energy
        scale_velocity(N, T_0, atom);
        ++step;
    }
}

// Run Np time steps without velocity rescaling. Every Ns steps the total
// kinetic and potential energies are appended as one line to "energy.txt".
// The CPU time of the whole run and of the force evaluations alone is
// printed at the end.
//
// If "energy.txt" cannot be opened for writing, an error is printed and the
// function returns without running any step. Ns == 0 disables the energy
// output instead of dividing by zero.
void production
(
    int Np, int Ns, int N, int MN, real T_0, 
    real time_step, Atom *atom
)
{
    float t_force = 0.0f;

    clock_t t_total_start = clock();

    FILE *fid = fopen("energy.txt", "w");
    if (fid == NULL)
    {
        // Writing through a NULL stream would crash in fprintf()/fclose().
        printf("Error: cannot open energy.txt for writing.\n");
        return;
    }

    for (int step = 0; step < Np; ++step)
    {
        integrate(N, time_step, atom, 1);

        clock_t t_force_start = clock();

        find_force(N, MN, atom);

        clock_t t_force_stop = clock();

        t_force += float(t_force_stop - t_force_start) / CLOCKS_PER_SEC;

        integrate(N, time_step, atom, 2);

        // Guard the modulo: Ns == 0 would be a division by zero.
        if (Ns != 0 && 0 == step % Ns)
        {
            fprintf(fid, "%g %g\n", sum(N, atom->ke), sum(N, atom->pe));
        }
    }
    fclose(fid);

    clock_t t_total_stop = clock();

    float t_total = float(t_total_stop - t_total_start) / CLOCKS_PER_SEC;
    printf("Time used for production = %g s\n", t_total);
    printf("Time used for force part = %g s\n", t_force);
}




================================================
FILE: CUDA/chapter13_分子动力学模拟的CUDA程序开发/cpp/integrate.cuh
================================================
#pragma once
#include "common.cuh"

// Run Ne time steps for N atoms, rescaling the velocities to the
// temperature T_0 after every step. MN is the neighbor-list stride.
void equilibration
(
    int Ne, int N, int MN, real T_0, 
    real time_step, Atom *atom
);

// Run Np time steps for N atoms without rescaling; the total kinetic and
// potential energies are written to "energy.txt" every Ns steps and timing
// information is printed at the end.
void production
(
    int Np, int Ns, int N, int MN, real T_0, 
    real time_step, Atom *atom
);



================================================
FILE: CUDA/chapter13_分子动力学模拟的CUDA程序开发/cpp/main.cu
================================================
#include "common.cuh"
#include "memory.cuh"
#include "initialize.cuh"
#include "neighbor.cuh"
#include "integrate.cuh"
#include <stdlib.h>
#include <stdio.h>

// Entry point of the molecular dynamics program.
//
// Usage: <program> nx Ne
//   nx - number of unit cells per box edge (the system has 4 * nx^3 atoms)
//   Ne - number of equilibration steps; the production run uses the same
//        number of steps
//
// Exits with status 1 on a wrong argument count or on arguments for which
// the atom count or the neighbor-list size cannot be represented.
int main(int argc, char **argv)
{
    if (argc != 3) 
    { 
        printf("Usage: %s nx Ne\n", argv[0]);
        exit(1);
    }

    const int nx = atoi(argv[1]);
    const int Ne = atoi(argv[2]);
    const int Np = Ne;
    const int Ns = 100;
    const int MN = 200;

    // atoi() returns 0 for non-numeric input. 812 is the largest nx for
    // which 4 * nx^3 still fits in a 32-bit int (assumes int is 32 bits).
    if (nx < 1 || nx > 812 || Ne < 0)
    {
        printf("Error: nx must be in [1, 812] and Ne must be >= 0.\n");
        exit(1);
    }

    const int N = 4 * nx * nx * nx;

    // The neighbor list holds N * MN entries and is indexed with int
    // arithmetic, so that product has to fit in an int as well.
    if ((long long)N * MN > 2147483647LL)
    {
        printf("Error: nx is too large for a neighbor list with MN = %d.\n", MN);
        exit(1);
    }

    real T_0 = 60.0;
    real ax = 5.385;
    real time_step = 5.0 / TIME_UNIT_CONVERSION;
    Atom atom;
    allocate_memory(N, MN, &atom);
    for (int n = 0; n < N; ++n) { atom.m[n] = 40.0; }
    initialize_position(nx, ax, &atom);
    initialize_velocity(N, T_0, &atom);
    find_neighbor(N, MN, &atom);
    equilibration(Ne, N, MN, T_0, time_step, &atom);
    production(Np, Ns, N, MN, T_0, time_step, &atom);
    deallocate_memory(&atom);
    return 0;
}



================================================
FILE: CUDA/chapter13_分子动力学模拟的CUDA程序开发/cpp/makefile
================================================
# Build the ljmd executable from the .cu sources with nvcc.
# `all` and `clean` are not files, so they are declared phony.
.PHONY: all clean

all: ljmd

CC = nvcc
CFLAGS = -O3 -arch=sm_70

ljmd: initialize.o integrate.o neighbor.o force.o memory.o main.o
	$(CC) -o ljmd \
	initialize.o integrate.o neighbor.o force.o memory.o main.o

# Each object also depends on the headers its source includes, so that a
# header change triggers a rebuild.
initialize.o: initialize.cu initialize.cuh common.cuh
	$(CC) $(CFLAGS) -c initialize.cu
integrate.o: integrate.cu integrate.cuh force.cuh error.cuh common.cuh
	$(CC) $(CFLAGS) -c integrate.cu
memory.o: memory.cu memory.cuh common.cuh
	$(CC) $(CFLAGS) -c memory.cu
neighbor.o: neighbor.cu neighbor.cuh mic.cuh common.cuh
	$(CC) $(CFLAGS) -c neighbor.cu
force.o: force.cu force.cuh mic.cuh common.cuh
	$(CC) $(CFLAGS) -c force.cu
main.o: main.cu common.cuh memory.cuh initialize.cuh neighbor.cuh integrate.cuh
	$(CC) $(CFLAGS) -c main.cu

# Remove only object files and the executable ("*o" would match every file
# whose name merely ends in the letter o).
clean:
	rm -f *.o ljmd



================================================
FILE: CUDA/chapter13_分子动力学模拟的CUDA程序开发/cpp/makefile.windows
================================================
# Windows build of the ljmd executable with nvcc and the MSVC host compiler.
# /wd 4819 silences the MSVC code-page warning C4819.
all: ljmd

CC = nvcc
CFLAGS = -O3 -arch=sm_75 -Xcompiler "/wd 4819"

ljmd: initialize.obj integrate.obj neighbor.obj force.obj memory.obj main.obj
	$(CC) -o ljmd \
	initialize.obj integrate.obj neighbor.obj force.obj memory.obj main.obj

initialize.obj: initialize.cu
	$(CC) $(CFLAGS) -c initialize.cu
integrate.obj: integrate.cu
	$(CC) $(CFLAGS) -c integrate.cu
memory.obj: memory.cu
	$(CC) $(CFLAGS) -c memory.cu
neighbor.obj: neighbor.cu
	$(CC) $(CFLAGS) -c neighbor.cu
force.obj: force.cu
	$(CC) $(CFLAGS) -c force.cu
main.obj: main.cu
	$(CC) $(CFLAGS) -c main.cu

# Delete object files and the build products named ljmd.* ("*obj" would
# match every file whose name merely ends in "obj").
clean:
	del *.obj ljmd*




================================================
FILE: CUDA/chapter13_分子动力学模拟的CUDA程序开发/cpp/memory.cu
================================================
#include "memory.cuh"
#include <stdio.h>
#include <stdlib.h>

// Allocate `count` elements of `elem_size` bytes each with malloc().
// Terminates the program with an error message if the byte count does not
// fit in size_t or if the allocation fails. A NULL result for a zero-byte
// request is passed through, because malloc(0) is allowed to return NULL.
static void *checked_malloc(size_t count, size_t elem_size)
{
    if (elem_size != 0 && count > (size_t)-1 / elem_size)
    {
        fprintf(stderr, "Error: allocation size overflows size_t.\n");
        exit(1);
    }
    const size_t bytes = count * elem_size;
    void *block = malloc(bytes);
    if (block == NULL && bytes != 0)
    {
        fprintf(stderr, "Error: failed to allocate %zu bytes.\n", bytes);
        exit(1);
    }
    return block;
}

// Allocate every array of `atom` for a system of N atoms whose neighbor
// list reserves MN slots per atom. The arrays are left uninitialized.
// Release them with deallocate_memory().
void allocate_memory(int N, int MN, Atom *atom)
{
    // Convert to size_t before multiplying: N * MN evaluated in int
    // overflows for large systems.
    const size_t n = (size_t) N;
    const size_t list_entries = n * (size_t) MN;

    atom->NN = (int*) checked_malloc(n, sizeof(int));
    atom->NL = (int*) checked_malloc(list_entries, sizeof(int));
    atom->m  = (real*) checked_malloc(n, sizeof(real));
    atom->x  = (real*) checked_malloc(n, sizeof(real));
    atom->y  = (real*) checked_malloc(n, sizeof(real));
    atom->z  = (real*) checked_malloc(n, sizeof(real));
    atom->vx = (real*) checked_malloc(n, sizeof(real));
    atom->vy = (real*) checked_malloc(n, sizeof(real));
    atom->vz = (real*) checked_malloc(n, sizeof(real));
    atom->fx = (real*) checked_malloc(n, sizeof(real));
    atom->fy = (real*) checked_malloc(n, sizeof(real));
    atom->fz = (real*) checked_malloc(n, sizeof(real));
    atom->pe = (real*) checked_malloc(n, sizeof(real));
    atom->ke = (real*) checked_malloc(n, sizeof(real));
    atom->box = (real*) checked_malloc(6, sizeof(real));
}

// Release every array that allocate_memory() attached to `atom`.
// The pointers stored in `atom` are not reset.
void deallocate_memory(Atom *atom)
{
    void *const blocks[] =
    {
        atom->NN, atom->NL, atom->m,
        atom->x,  atom->y,  atom->z,
        atom->vx, atom->vy, atom->vz,
        atom->fx, atom->fy, atom->fz,
        atom->pe, atom->ke, atom->box
    };
    const int block_count = sizeof(blocks) / sizeof(blocks[0]);

    for (int b = 0; b < block_count; ++b)
    {
        free(blocks[b]);
    }
}



================================================
FILE: CUDA/chapter13_分子动力学模拟的CUDA程序开发/cpp/memory.cuh
================================================
#pragma once
#include "common.cuh"

// Allocate all arrays of `atom` for N atoms and a neighbor list with MN
// slots per atom.
void allocate_memory(int N, int MN, Atom *atom);
// Free all arrays of `atom` that were allocated by allocate_memory().
void deallocate_memory(Atom *atom);



================================================
FILE: CUDA/chapter13_分子动力学模拟的CUDA程序开发/cpp/mic.cuh
================================================
#pragma once

// Minimum image convention: shift each component of a separation vector by
// one box length when its magnitude exceeds half the box length.
// box[0..2] hold the box lengths and box[3..5] the corresponding half
// lengths; the three components are updated in place.
static void apply_mic(real *box, real *x12, real *y12, real *z12)
{
    real *component[3] = {x12, y12, z12};
    for (int d = 0; d < 3; ++d)
    {
        real *delta = component[d];
        if (*delta < - box[d + 3])
        {
            *delta += box[d];
        }
        else if (*delta > + box[d + 3])
        {
            *delta -= box[d];
        }
    }
}




================================================
FILE: CUDA/chapter13_分子动力学模拟的CUDA程序开发/cpp/neighbor.cu
================================================
#include "neighbor.cuh"
#include "mic.cuh"
#include <stdio.h>
#include <stdlib.h>

// Build the neighbor list by testing every pair of atoms.
//
//   N    - number of atoms
//   MN   - maximum number of neighbors that can be stored per atom
//   atom - reads x, y, z, box; writes NN (neighbor counts) and NL
//          (neighbor k of atom i is stored at NL[i * MN + k])
//
// Two atoms are neighbors when their minimum-image distance is below the
// list cutoff of 11.0. The program terminates with an error message as soon
// as an atom would need more than MN neighbor slots.
void find_neighbor(int N, int MN, Atom *atom)
{
    int *NN = atom->NN;
    int *NL = atom->NL;
    real *x = atom->x;
    real *y = atom->y;
    real *z = atom->z;
    real *box = atom->box; 
    real cutoff = 11.0;
    real cutoff_square = cutoff * cutoff;

    for (int n = 0; n < N; n++)
    {
        NN[n] = 0;
    }

    for (int n1 = 0; n1 < N - 1; n1++)
    {  
        for (int n2 = n1 + 1; n2 < N; n2++)
        {   
            real x12 = x[n2] - x[n1];
            real y12 = y[n2] - y[n1];
            real z12 = z[n2] - z[n1];
            apply_mic(box, &x12, &y12, &z12);
            real d_square = x12*x12 + y12*y12 + z12*z12;

            if (d_square < cutoff_square)
            {        
                // Check the capacity BEFORE storing: a write at slot MN
                // would land outside the atom's row of NL (and, for the
                // last atom, outside the allocation).
                if (NN[n1] >= MN || NN[n2] >= MN)
                {
                    printf("Error: MN is too small.\n");
                    exit(1);
                }
                NL[n1 * MN + NN[n1]++] = n2;
                NL[n2 * MN + NN[n2]++] = n1;
            }
        }
    }
}



================================================
FILE: CUDA/chapter13_分子动力学模拟的CUDA程序开发/cpp/neighbor.cuh
================================================
#pragma once
#include "common.cuh"

// Build the neighbor list (atom->NN, atom->NL) for N atoms, with at most MN
// neighbors stored per atom.
void find_neighbor(int N, int MN, Atom *atom);


================================================
FILE: CUDA/chapter13_分子动力学模拟的CUDA程序开发/force-only/Makefile
================================================
# Build the ljmd executable from the .cu sources with nvcc.
# `all` and `clean` are not files, so they are declared phony.
.PHONY: all clean

all: ljmd

CC = nvcc
CFLAGS = -O3 -arch=sm_75

ljmd: initialize.o integrate.o neighbor.o force.o memory.o main.o
	$(CC) -o ljmd \
	initialize.o integrate.o neighbor.o force.o memory.o main.o

initialize.o: initialize.cu
	$(CC) $(CFLAGS) -c initialize.cu
integrate.o: integrate.cu
	$(CC) $(CFLAGS) -c integrate.cu
memory.o: memory.cu
	$(CC) $(CFLAGS) -c memory.cu
neighbor.o: neighbor.cu
	$(CC) $(CFLAGS) -c neighbor.cu
force.o: force.cu
	$(CC) $(CFLAGS) -c force.cu
main.o: main.cu
	$(CC) $(CFLAGS) -c main.cu

# Remove only object files and the executable ("*o" would match every file
# whose name merely ends in the letter o).
clean:
	rm -f *.o ljmd


================================================
FILE: CUDA/chapter13_分子动力学模拟的CUDA程序开发/force-only/common.h
================================================
#pragma once

// Floating-point type used throughout the program: double when
// DOUBLE_PRECISION is defined at compile time, float otherwise.
#ifdef DOUBLE_PRECISION
    typedef double real;
#else
    typedef float real;
#endif

// Used to turn kinetic energy into temperature in scale_velocity().
// NOTE(review): presumably the Boltzmann constant in eV/K — confirm.
#define K_B                   8.617343e-5
// main() divides the time step by this value before using it.
// NOTE(review): presumably converts the program's internal time unit to fs — confirm.
#define TIME_UNIT_CONVERSION  1.018051e+1

// State of the simulated system. The first group of members is used by the
// host code; the g_ members are the targets/sources of cudaMemcpy calls and
// the arguments of the force kernel, i.e. device pointers.
// NOTE(review): allocation of these arrays happens in memory.cu, which is
// not shown here — confirm sizes there.
struct Atom
{
    real *m;    // per-atom mass
    real *x;    // per-atom position, x component
    real *y;    // per-atom position, y component
    real *z;    // per-atom position, z component
    real *vx;   // per-atom velocity, x component
    real *vy;   // per-atom velocity, y component
    real *vz;   // per-atom velocity, z component
    real *fx;   // per-atom force, x component (copied back from g_fx)
    real *fy;   // per-atom force, y component (copied back from g_fy)
    real *fz;   // per-atom force, z component (copied back from g_fz)
    real *pe;   // per-atom potential energy (copied back from g_pe)
    real *ke;   // per-atom kinetic energy written by integrate()
    real *box;  // 6 values: [0..2] box lengths in x/y/z, [3..5] half of those lengths

    int *g_NN;  // device: g_NN[i] is the number of neighbors of atom i
    int *g_NL;  // device: neighbor k of atom i is read from g_NL[i + N * k]
    real *g_x;  // device copy of x
    real *g_y;  // device copy of y
    real *g_z;  // device copy of z
    real *g_fx; // device: force result, x component
    real *g_fy; // device: force result, y component
    real *g_fz; // device: force result, z component
    real *g_pe; // device: per-atom potential energy result
};

// Box dimensions passed by value to the force kernel. find_force() fills
// lx/ly/lz from atom->box[0..2] and lx2/ly2/lz2 from atom->box[3..5].
struct Box
{
    real lx;    // box length in x
    real ly;    // box length in y
    real lz;    // box length in z
    real lx2;   // half of the box length in x
    real ly2;   // half of the box length in y
    real lz2;   // half of the box length in z
};


================================================
FILE: CUDA/chapter13_分子动力学模拟的CUDA程序开发/force-only/error.cuh
================================================
#pragma once
#include <stdio.h>

// CHECK(call): evaluate a CUDA runtime call that yields a cudaError_t.
// If the result is not cudaSuccess, print the source file, the line number,
// the numeric error code and the cudaGetErrorString() text with printf, then
// terminate the process with exit(1).
// The do { ... } while (0) wrapper makes the expansion a single statement,
// so the macro can be followed by a semicolon and used inside if/else.
#define CHECK(call)                                   \
do                                                    \
{                                                     \
    const cudaError_t error_code = call;              \
    if (error_code != cudaSuccess)                    \
    {                                                 \
        printf("CUDA Error:\n");                      \
        printf("    File:       %s\n", __FILE__);     \
        printf("    Line:       %d\n", __LINE__);     \
        printf("    Error code: %d\n", error_code);   \
        printf("    Error text: %s\n",                \
            cudaGetErrorString(error_code));          \
        exit(1);                                      \
    }                                                 \
} while (0)



================================================
FILE: CUDA/chapter13_分子动力学模拟的CUDA程序开发/force-only/force.cu
================================================
#include "error.cuh"
#include "force.h"
#include "mic.h"

// Precomputed Lennard-Jones constants passed by value to the force kernel.
// find_force() fills them from epsilon, sigma and the cutoff distance.
struct LJ
{
    real cutoff2;   // cutoff distance squared
    real e24s6;     // 24 * epsilon * sigma^6  (force prefactor)
    real e48s12;    // 48 * epsilon * sigma^12 (force prefactor)
    real e4s6;      // 4 * epsilon * sigma^6   (energy prefactor)
    real e4s12;     // 4 * epsilon * sigma^12  (energy prefactor)
};

// Force kernel: one thread per atom.
// Thread i walks the g_NN[i] neighbors of atom i (neighbor k is read from
// g_NL[i + N * k]), accumulates the Lennard-Jones force and potential of
// every pair not beyond the cutoff in local variables, and writes the
// totals to g_fx/g_fy/g_fz/g_pe. Every pair is visited from both of its
// atoms, so each atom is credited with half of the pair energy.
static void __global__ gpu_find_force
(
    LJ lj, int N, int *g_NN, int *g_NL, Box box,
    real *g_x, real *g_y, real *g_z,
    real *g_fx, real *g_fy, real *g_fz, real *g_pe
)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= N)
    {
        return;     // surplus thread of the last block
    }

    real force_x = 0.0;
    real force_y = 0.0;
    real force_z = 0.0;
    real energy = 0.0;

    const int neighbor_count = g_NN[i];
    const real xi = g_x[i];
    const real yi = g_y[i];
    const real zi = g_z[i];

    for (int k = 0; k < neighbor_count; ++k)
    {
        const int j = g_NL[i + N * k];
        real dx = g_x[j] - xi;
        real dy = g_y[j] - yi;
        real dz = g_z[j] - zi;
        apply_mic(box, &dx, &dy, &dz);

        const real dist2 = dx*dx + dy*dy + dz*dz;
        if (dist2 > lj.cutoff2)
        {
            continue;
        }

        const real inv2 = 1.0 / dist2;
        const real inv4 = inv2 * inv2;
        const real inv6 = inv2 * inv4;
        const real inv8 = inv4 * inv4;
        const real inv12 = inv4 * inv8;
        const real inv14 = inv6 * inv8;
        const real pair_force = lj.e24s6 * inv8 - lj.e48s12 * inv14;

        energy += lj.e4s12 * inv12 - lj.e4s6 * inv6;
        force_x += pair_force * dx;
        force_y += pair_force * dy;
        force_z += pair_force * dz;
    }

    g_fx[i] = force_x;
    g_fy[i] = force_y;
    g_fz[i] = force_z;
    g_pe[i] = energy * 0.5;
}

// Compute forces and per-atom potential energies on the GPU.
//
//   N    - number of atoms
//   MN   - neighbor-list stride (not used by this implementation)
//   atom - host positions x/y/z are uploaded to g_x/g_y/g_z, the kernel
//          writes g_fx/g_fy/g_fz/g_pe, and the results are copied back to
//          fx/fy/fz/pe
//
// Any CUDA error terminates the program through CHECK.
void find_force(int N, int MN, Atom *atom)
{
    // Nothing to compute, and a non-positive N would give an invalid
    // launch configuration.
    if (N <= 0)
    {
        return;
    }

    const real epsilon = 1.032e-2;
    const real sigma = 3.405;
    const real cutoff = 10.0;
    const real cutoff2 = cutoff * cutoff;
    const real sigma_3 = sigma * sigma * sigma;
    const real sigma_6 = sigma_3 * sigma_3;
    const real sigma_12 = sigma_6 * sigma_6;
    const real e24s6 = 24.0 * epsilon * sigma_6;
    const real e48s12 = 48.0 * epsilon * sigma_12;
    const real e4s6 = 4.0 * epsilon * sigma_6;
    const real e4s12 = 4.0 * epsilon * sigma_12;
    LJ lj;
    lj.cutoff2 = cutoff2;
    lj.e24s6 = e24s6;
    lj.e48s12 = e48s12;
    lj.e4s6 = e4s6;
    lj.e4s12 = e4s12;

    Box box;
    box.lx = atom->box[0];
    box.ly = atom->box[1];
    box.lz = atom->box[2];
    box.lx2 = atom->box[3];
    box.ly2 = atom->box[4];
    box.lz2 = atom->box[5];

    // Byte count kept in size_t, the type cudaMemcpy takes.
    const size_t m = sizeof(real) * N;
    CHECK(cudaMemcpy(atom->g_x, atom->x, m, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(atom->g_y, atom->y, m, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(atom->g_z, atom->z, m, cudaMemcpyHostToDevice));

    // One thread per atom, rounded up to whole blocks.
    int block_size = 128;
    int grid_size = (N - 1) / block_size + 1;
    gpu_find_force<<<grid_size, block_size>>>
    (
        lj, N,  atom->g_NN, atom->g_NL, box,
        atom->g_x, atom->g_y, atom->g_z,
        atom->g_fx, atom->g_fy, atom->g_fz, atom->g_pe
    );
    // A failed launch is only reported through cudaGetLastError(); without
    // this check it would be blamed on the next cudaMemcpy.
    CHECK(cudaGetLastError());

    CHECK(cudaMemcpy(atom->fx, atom->g_fx, m, cudaMemcpyDeviceToHost));
    CHECK(cudaMemcpy(atom->fy, atom->g_fy, m, cudaMemcpyDeviceToHost));
    CHECK(cudaMemcpy(atom->fz, atom->g_fz, m, cudaMemcpyDeviceToHost));
    CHECK(cudaMemcpy(atom->pe, atom->g_pe, m, cudaMemcpyDeviceToHost));
}




================================================
FILE: CUDA/chapter13_分子动力学模拟的CUDA程序开发/force-only/force.h
================================================
#pragma once
#include "common.h"

// Compute the force on every atom (atom->fx/fy/fz) and the per-atom
// potential energy (atom->pe) on the GPU. N is the number of atoms, MN the
// neighbor-list stride.
void find_force(int N, int MN, Atom *atom);



================================================
FILE: CUDA/chapter13_分子动力学模拟的CUDA程序开发/force-only/initialize.cu
================================================
#include "initialize.h"
#include "error.cuh"
#include <stdlib.h>
#include <math.h>

static void scale_velocity(int N, real T_0, Atom *atom)
{
    real *m = atom->m;
    real *vx = atom->vx;
    real *vy = atom->vy;
    real *vz = atom->vz;
    real temperature = 0.0;
    for (int n = 0; n < N; ++n) 
    {
        real v2 = vx[n]*vx[n] + vy[n]*vy[n] + vz[n]*vz[n];     
        temperature += m[n] * v2; 
    }
    temperature /= 3.0 * K_B * N;
    real scale_factor = sqrt(T_0 / temperature);
    for (int n = 0; n < N; ++n)
    { 
        vx[n] *= scale_factor;
        vy[n] *= scale_factor;
        vz[n] *= scale_factor;
    }
}

// Set up a cubic box of nx * nx * nx unit cells with lattice constant ax and
// place four atoms in every cell, at the fractional offsets listed in
// `basis`. Fills atom->box (lengths and half lengths) and atom->x/y/z, then
// uploads the positions to atom->g_x/g_y/g_z.
void initialize_position(int nx, real ax, Atom *atom)
{
    real *box = atom->box;
    for (int d = 0; d < 3; ++d)
    {
        box[d] = ax * nx;
    }
    for (int d = 0; d < 3; ++d)
    {
        box[d + 3] = box[d] * 0.5;
    }

    // Fractional coordinates of the four atoms of one unit cell.
    const real basis[4][3] =
    {
        {0.0, 0.0, 0.0},
        {0.0, 0.5, 0.5},
        {0.5, 0.0, 0.5},
        {0.5, 0.5, 0.0}
    };

    real *px = atom->x;
    real *py = atom->y;
    real *pz = atom->z;
    int n = 0;
    for (int ix = 0; ix < nx; ++ix)
    {
        for (int iy = 0; iy < nx; ++iy)
        {
            for (int iz = 0; iz < nx; ++iz)
            {
                for (int b = 0; b < 4; ++b, ++n)
                {
                    px[n] = (ix + basis[b][0]) * ax;
                    py[n] = (iy + basis[b][1]) * ax;
                    pz[n] = (iz + basis[b][2]) * ax;
                }
            }
        }
    }

    // Upload the freshly generated positions (4 * nx^3 atoms) to the device.
    const int position_bytes = sizeof(real) * 4 * nx * nx * nx;
    CHECK(cudaMemcpy(atom->g_x, px, position_bytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(atom->g_y, py, position_bytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(atom->g_z, pz, position_bytes, cudaMemcpyHostToDevice));
}
  
void initialize_velocity(int N, real T_0, Atom *atom)
{
    real *m = atom->m;
    real *vx = atom->vx;
    real *vy = atom->vy;
    real *vz = atom->vz;
    real momentum_average[3] = {0.0, 0.0, 0.0};
    for (int n = 0; n < N; ++n)
    { 
        vx[n] = -1.0 + (rand() * 2.0) / RAND_MAX; 
        vy[n] = -1.0 + (rand() * 2.0) / RAND_MAX; 
        vz[n] = -1.0 + (rand() * 2.0) / RAND_MAX;    
        
        momentum_average[0] += m[n] * vx[n] / N;
        momentum_average[1] += m[n] * vy[n] / N;
        momentum_average[2] += m[n] * vz[n] / N;
    } 
    for (int n = 0; n < N; ++n) 
    { 
        vx[n] -= momentum_average[0] / m[n];
        vy[n] -= momentum_average[1] / m[n];
        vz[n] -= momentum_average[2] / m[n]; 
    }
    scale_velocity(N, T_0, atom);
}



================================================
FILE: CUDA/chapter13_分子动力学模拟的CUDA程序开发/force-only/initialize.h
================================================
#pragma once
#include "common.h"

// Fill atom->box and atom->x/y/z for a lattice of nx * nx * nx unit cells
// with lattice constant ax (four atoms per cell) and copy the positions to
// the device arrays atom->g_x/g_y/g_z.
void initialize_position(int nx, real ax, Atom *atom);
// Assign random velocities to the N atoms, remove the net momentum and
// rescale to the temperature T_0.
void initialize_velocity(int N, real T_0, Atom *atom);



================================================
FILE: CUDA/chapter13_分子动力学模拟的CUDA程序开发/force-only/integrate.cu
================================================
#include "integrate.h"
#include "force.h"
#include "error.cuh"
#include <stdio.h>
#include <math.h>
#include <time.h>

// Return x[0] + x[1] + ... + x[N-1], accumulated front to back in `real`
// precision (0 when N <= 0).
static real sum(int N, real *x)
{
    real total = 0.0;
    int idx = 0;
    while (idx < N)
    {
        total += x[idx];
        ++idx;
    }
    return total;
}

static void scale_velocity(int N, real T_0, Atom *atom)
{
    real temperature = sum(N, atom->ke) / (1.5 * K_B * N);
    real scale_factor = sqrt(T_0 / temperature);
    for (int n = 0; n < N; ++n)
    {
        atom->vx[n] *= scale_factor;
        atom->vy[n] *= scale_factor;
        atom->vz[n] *= scale_factor;
    }
}

static void integrate
(int N, real time_step, Atom *atom, int flag)
{
    real *m = atom->m;
    real *x = atom->x;
    real *y = atom->y;
    real *z = atom->z;
    real *vx = atom->vx;
    real *vy = atom->vy;
    real *vz = atom->vz;
    real *fx = atom->fx;
    real *fy = atom->fy;
    real *fz = atom->fz;
    real *ke = atom->ke;
    real time_step_half = time_step * 0.5;
    for (int n = 0; n < N; ++n)
    {
        real mass_inv = 1.0 / m[n];
        real ax = fx[n] * mass_inv;
        real ay = fy[n] * mass_inv;
        real az = fz[n] * mass_inv;
        vx[n] += ax * time_step_half;
        vy[n] += ay * time_step_half;
        vz[n] += az * time_step_half;
        if (flag == 1)
        {
            x[n] += vx[n] * time_step;
            y[n] += vy[n] * time_step;
            z[n] += vz[n] * time_step;
        }
        else
        {
            real v2 = vx[n]*vx[n] + vy[n]*vy[n] + vz[n]*vz[n];
            ke[n] = m[n] * v2 * 0.5;
        }
    }
}

// Run Ne time steps in which the velocities are rescaled to the temperature
// T_0 after every step. Forces are computed once up front so the first
// half-step velocity update has valid input.
void equilibration
(
    int Ne, int N, int MN, real T_0,
    real time_step, Atom *atom
)
{
    find_force(N, MN, atom);

    int step = 0;
    while (step < Ne)
    {
        integrate(N, time_step, atom, 1);   // half kick + position update
        find_force(N, MN, atom);            // forces at the new positions
        integrate(N, time_step, atom, 2);   // second half kick + kinetic energy
        scale_velocity(N, T_0, atom);
        ++step;
    }
}

// Run Np time steps without velocity rescaling. Every Ns steps the total
// kinetic and potential energies are appended as one line to "energy.txt".
// The CPU time of the whole run and of the force evaluations alone is
// printed at the end; the device is synchronized before every clock() call
// so that pending GPU work is included in the measured interval.
//
// If "energy.txt" cannot be opened for writing, an error is printed and the
// function returns without running any step. Ns == 0 disables the energy
// output instead of dividing by zero.
void production
(
    int Np, int Ns, int N, int MN, real T_0,
    real time_step, Atom *atom
)
{
    float t_force = 0.0f;
    CHECK(cudaDeviceSynchronize());
    clock_t t_total_start = clock();

    FILE *fid = fopen("energy.txt", "w");
    if (fid == NULL)
    {
        // Writing through a NULL stream would crash in fprintf()/fclose().
        printf("Error: cannot open energy.txt for writing.\n");
        return;
    }

    for (int step = 0; step < Np; ++step)
    {
        integrate(N, time_step, atom, 1);

        CHECK(cudaDeviceSynchronize());
        clock_t t_force_start = clock();

        find_force(N, MN, atom);

        CHECK(cudaDeviceSynchronize());
        clock_t t_force_stop = clock();

        t_force += float(t_force_stop - t_force_start) / CLOCKS_PER_SEC;

        integrate(N, time_step, atom, 2);

        // Guard the modulo: Ns == 0 would be a division by zero.
        if (Ns != 0 && 0 == step % Ns)
        {
            fprintf(fid, "%g %g\n", sum(N, atom->ke), sum(N, atom->pe));
        }
    }
    fclose(fid);

    CHECK(cudaDeviceSynchronize());
    clock_t t_total_stop = clock();

    float t_total = float(t_total_stop - t_total_start) / CLOCKS_PER_SEC;
    printf("Time used for production = %g s\n", t_total);
    printf("Time used for force part = %g s\n", t_force);
}





================================================
FILE: CUDA/chapter13_分子动力学模拟的CUDA程序开发/force-only/integrate.h
================================================
#pragma once
#include "common.h"

// Run Ne time steps, calling scale_velocity with T_0 after each one.
// N and MN are forwarded to find_force; time_step to integrate.
void equilibration
(
    int Ne, int N, int MN, real T_0, 
    real time_step, Atom *atom
);

// Run Np time steps without velocity rescaling, writing the summed kinetic
// and potential energies to "energy.txt" every Ns steps and printing timing
// information at the end. T_0 is accepted but not used by the implementation.
void production
(
    int Np, int Ns, int N, int MN, real T_0, 
    real time_step, Atom *atom
);



================================================
FILE: CUDA/chapter13_分子动力学模拟的CUDA程序开发/force-only/main.cu
================================================
#include "common.h"
#include "memory.h"
#include "initialize.h"
#include "neighbor.h"
#include "integrate.h"
#include <stdlib.h>
#include <stdio.h>

// Entry point: `ljmd nx Ne`.
//   nx  number of unit cells per box edge (N = 4 * nx^3 atoms)
//   Ne  number of equilibration steps; the production run uses the same count
// Exits with status 1 on a wrong argument count or out-of-range values.
int main(int argc, char **argv)
{
    if (argc != 3)
    {
        printf("Usage: %s nx Ne\n", argv[0]);
        exit(1);
    }

    int nx = atoi(argv[1]);
    int Ne = atoi(argv[2]);
    // atoi yields 0 for non-numeric text; a non-positive nx would give
    // N <= 0 and therefore invalid allocation sizes and kernel launches.
    if (nx < 1 || Ne < 0)
    {
        printf("Error: nx must be a positive integer and Ne must be non-negative.\n");
        exit(1);
    }
    int Np = Ne;

    int N = 4 * nx * nx * nx;   // four atoms per unit cell
    int Ns = 100;               // energy output interval, in steps
    int MN = 200;               // neighbor-list capacity per atom
    real T_0 = 60.0;
    real ax = 5.385;
    real time_step = 5.0 / TIME_UNIT_CONVERSION;
    Atom atom;
    allocate_memory(N, MN, &atom);
    for (int n = 0; n < N; ++n) { atom.m[n] = 40.0; }
    initialize_position(nx, ax, &atom);
    initialize_velocity(N, T_0, &atom);
    find_neighbor(N, MN, &atom);
    equilibration(Ne, N, MN, T_0, time_step, &atom);
    production(Np, Ns, N, MN, T_0, time_step, &atom);
    deallocate_memory(&atom);
    return 0;
}



================================================
FILE: CUDA/chapter13_分子动力学模拟的CUDA程序开发/force-only/makefile.windows
================================================
# Builds the ljmd executable from the six .cu sources with nvcc.
all: ljmd

CC = nvcc
# /wd 4819 is forwarded to the host compiler to disable warning 4819.
CFLAGS = -O3 -arch=sm_75 -Xcompiler "/wd 4819"

ljmd: initialize.obj integrate.obj neighbor.obj force.obj memory.obj main.obj
	$(CC) -o ljmd \
	initialize.obj integrate.obj neighbor.obj force.obj memory.obj main.obj

initialize.obj: initialize.cu
	$(CC) $(CFLAGS) -c initialize.cu
integrate.obj: integrate.cu
	$(CC) $(CFLAGS) -c integrate.cu
memory.obj: memory.cu
	$(CC) $(CFLAGS) -c memory.cu
neighbor.obj: neighbor.cu
	$(CC) $(CFLAGS) -c neighbor.cu
force.obj: force.cu
	$(CC) $(CFLAGS) -c force.cu
main.obj: main.cu
	$(CC) $(CFLAGS) -c main.cu

# Use *.obj rather than *obj so that only object files are matched, not every
# file whose name merely ends in "obj".
clean:
	del *.obj ljmd*




================================================
FILE: CUDA/chapter13_分子动力学模拟的CUDA程序开发/force-only/memory.cu
================================================
#include "error.cuh"
#include "memory.h"
#include <stdlib.h>

// malloc wrapper that aborts with a message instead of returning NULL for a
// non-empty request.
static void *checked_malloc(size_t bytes)
{
    void *ptr = malloc(bytes);
    if (ptr == NULL && bytes != 0)
    {
        printf("Error: failed to allocate %llu bytes of host memory.\n",
            (unsigned long long) bytes);
        exit(1);
    }
    return ptr;
}

// Allocate every buffer referenced by `atom`.
//   N   number of atoms; each per-atom array holds N elements
//   MN  neighbor-list capacity per atom; g_NL holds N * MN ints
// Host arrays come from malloc, device arrays (g_ prefix) from cudaMalloc.
// Any allocation failure terminates the program.
void allocate_memory(int N, int MN, Atom *atom)
{
    atom->m  = (real*) checked_malloc(N * sizeof(real));
    atom->x  = (real*) checked_malloc(N * sizeof(real));
    atom->y  = (real*) checked_malloc(N * sizeof(real));
    atom->z  = (real*) checked_malloc(N * sizeof(real));
    atom->vx = (real*) checked_malloc(N * sizeof(real));
    atom->vy = (real*) checked_malloc(N * sizeof(real));
    atom->vz = (real*) checked_malloc(N * sizeof(real));
    atom->fx = (real*) checked_malloc(N * sizeof(real));
    atom->fy = (real*) checked_malloc(N * sizeof(real));
    atom->fz = (real*) checked_malloc(N * sizeof(real));
    atom->pe = (real*) checked_malloc(N * sizeof(real));
    atom->ke = (real*) checked_malloc(N * sizeof(real));
    // Six box entries; find_neighbor reads them as lx, ly, lz, lx2, ly2, lz2.
    atom->box = (real*) checked_malloc(6 * sizeof(real));

    CHECK(cudaMalloc((void**)&atom->g_NN, sizeof(int) * N));
    CHECK(cudaMalloc((void**)&atom->g_NL, sizeof(int) * N * MN));
    CHECK(cudaMalloc((void**)&atom->g_x, sizeof(real) * N));
    CHECK(cudaMalloc((void**)&atom->g_y, sizeof(real) * N));
    CHECK(cudaMalloc((void**)&atom->g_z, sizeof(real) * N));
    CHECK(cudaMalloc((void**)&atom->g_fx, sizeof(real) * N));
    CHECK(cudaMalloc((void**)&atom->g_fy, sizeof(real) * N));
    CHECK(cudaMalloc((void**)&atom->g_fz, sizeof(real) * N));
    CHECK(cudaMalloc((void**)&atom->g_pe, sizeof(real) * N));
}

// Release every buffer that allocate_memory created, host arrays first and
// device arrays second, in the same order as they were allocated.
void deallocate_memory(Atom *atom)
{
    real *host_arrays[] =
    {
        atom->m,
        atom->x,  atom->y,  atom->z,
        atom->vx, atom->vy, atom->vz,
        atom->fx, atom->fy, atom->fz,
        atom->pe, atom->ke,
        atom->box
    };
    const int num_host = sizeof(host_arrays) / sizeof(host_arrays[0]);
    for (int i = 0; i < num_host; ++i)
    {
        free(host_arrays[i]);
    }

    void *device_arrays[] =
    {
        atom->g_NN, atom->g_NL,
        atom->g_x,  atom->g_y,  atom->g_z,
        atom->g_fx, atom->g_fy, atom->g_fz,
        atom->g_pe
    };
    const int num_device = sizeof(device_arrays) / sizeof(device_arrays[0]);
    for (int i = 0; i < num_device; ++i)
    {
        CHECK(cudaFree(device_arrays[i]));
    }
}



================================================
FILE: CUDA/chapter13_分子动力学模拟的CUDA程序开发/force-only/memory.h
================================================
#pragma once
#include "common.h"

// Allocate all host (malloc) and device (cudaMalloc) arrays of `atom` for
// N atoms and a neighbor list of N * MN entries.
void allocate_memory(int N, int MN, Atom *atom);
// Free everything allocate_memory allocated.
void deallocate_memory(Atom *atom);



================================================
FILE: CUDA/chapter13_分子动力学模拟的CUDA程序开发/force-only/mic.h
================================================
#pragma once

static void __device__ apply_mic
(
    Box box, real *x12, real *y12, real *z12
)
{
    if      (*x12 < - box.lx2) { *x12 += box.lx; }
    else if (*x12 > + box.lx2) { *x12 -= box.lx; }
    if      (*y12 < - box.ly2) { *y12 += box.ly; }
    else if (*y12 > + box.ly2) { *y12 -= box.ly; }
    if      (*z12 < - box.lz2) { *z12 += box.lz; }
    else if (*z12 > + box.lz2) { *z12 -= box.lz; }
}




================================================
FILE: CUDA/chapter13_分子动力学模拟的CUDA程序开发/force-only/neighbor.cu
================================================
#include "neighbor.h"
#include "mic.h"
#include <stdio.h>
#include <stdlib.h>

// Build the neighbor list with an all-pairs search, one thread per atom n1.
//
//   N        number of atoms
//   MN       capacity of the list per atom (g_NL holds N * MN entries)
//   g_NN     out: number of stored neighbors of each atom
//   g_NL     out: neighbor indices; the k-th neighbor of n1 is g_NL[k * N + n1]
//   box      box data passed to apply_mic
//   g_x/y/z  atom coordinates
//   cutoff2  squared cutoff distance
//
// Atoms with more than MN neighbors keep only the first MN found and print a
// diagnostic; storing the surplus would write past the end of g_NL.
static void __global__ gpu_find_neighbor
(
    int N, int MN, int *g_NN, int *g_NL, Box box,
    real *g_x, real *g_y, real *g_z, real cutoff2
)
{
    int n1 = blockIdx.x * blockDim.x + threadIdx.x;
    if (n1 < N)
    {
        int count = 0;
        real x1 = g_x[n1];
        real y1 = g_y[n1];
        real z1 = g_z[n1];
        for (int n2 = 0; n2 < N; n2++)
        {
            real x12 = g_x[n2] - x1;
            real y12 = g_y[n2] - y1;
            real z12 = g_z[n2] - z1;
            apply_mic(box, &x12, &y12, &z12);
            real d12_square = x12*x12 + y12*y12 + z12*z12;
            if ((n2 != n1) && (d12_square < cutoff2))
            {
                // Store only while there is room; keep counting so the
                // overflow can be reported below.
                if (count < MN)
                {
                    g_NL[count * N + n1] = n2;
                }
                ++count;
            }
        }
        if (count > MN)
        {
            printf("Error: atom %d has %d neighbors but MN = %d; "
                "extra neighbors are dropped.\n", n1, count, MN);
            count = MN;
        }
        g_NN[n1] = count;
    }
}

void find_neighbor(int N, int MN, Atom *atom)
{
    real cutoff = 11.0;
    real cutoff2 = cutoff * cutoff;

    Box box;
    box.lx = atom->box[0];
    box.ly = atom->box[1];
    box.lz = atom->box[2];
    box.lx2 = atom->box[3];
    box.ly2 = atom->box[4];
    box.lz2 = atom->box[5];

    int block_size = 128;
    int grid_size = (N - 1) / block_size + 1;
    gpu_find_neighbor<<<grid_size, block_size>>>
    (
        N, MN, atom->g_NN, atom->g_NL, box,
        atom->g_x, atom->g_y, atom->g_z, cutoff2
    );
}



================================================
FILE: CUDA/chapter13_分子动力学模拟的CUDA程序开发/force-only/neighbor.h
================================================
#pragma once
#include "common.h"

// Fill atom->g_NN / atom->g_NL on the device for N atoms, with MN neighbor
// slots per atom.
void find_neighbor(int N, int MN, Atom *atom);


================================================
FILE: CUDA/chapter13_分子动力学模拟的CUDA程序开发/plot_energy.m
================================================
% Plot the energies written by the ljmd programs: column 1 of energy.txt is
% the kinetic energy, column 2 the potential energy.
clear; close all;
font_size = 12;

% Select exactly one data set; `load` creates the variable `energy`.
%load cpp/energy.txt;
%load force-only/energy.txt;
load whole-code/energy.txt;

num_samples = size(energy, 1);
t = (1:num_samples) * 0.5; % ps (presumably one sample per 0.5 ps; verify against Ns and the time step)
kinetic = energy(:, 1);
potential = energy(:, 2);
total = kinetic + potential;

% Figure 1: the three energies versus time.
figure;
plot(t, kinetic, 'linewidth', 2);
hold on;
plot(t, potential, 'linewidth', 2);
plot(t, total, 'linewidth', 2);
xlabel('time (ps)', 'fontsize', font_size);
ylabel('energy (eV)', 'fontsize', font_size);
set(gca, 'fontsize', font_size);
legend('kinetic', 'potential', 'total');

% Figure 2: total energy relative to its mean.
mean_total = mean(total);
figure;
plot(t, (total - mean_total) / abs(mean_total), 'linewidth', 2);
xlabel('time (ps)', 'fontsize', font_size);
ylabel('Relative energy fluctuations', 'fontsize', font_size);
set(gca, 'fontsize', font_size);



================================================
FILE: CUDA/chapter13_分子动力学模拟的CUDA程序开发/whole-code/Makefile
================================================
# Builds the ljmd executable from the seven .cu sources with nvcc.
all: ljmd

CC = nvcc
CFLAGS = -O3 -arch=sm_60

ljmd: initialize.o integrate.o neighbor.o force.o memory.o reduce.o main.o
	$(CC) -o ljmd \
	initialize.o integrate.o neighbor.o force.o memory.o reduce.o main.o

initialize.o: initialize.cu
	$(CC) $(CFLAGS) -c initialize.cu
integrate.o: integrate.cu
	$(CC) $(CFLAGS) -c integrate.cu
memory.o: memory.cu
	$(CC) $(CFLAGS) -c memory.cu
neighbor.o: neighbor.cu
	$(CC) $(CFLAGS) -c neighbor.cu
force.o: force.cu
	$(CC) $(CFLAGS) -c force.cu
reduce.o: reduce.cu
	$(CC) $(CFLAGS) -c reduce.cu
main.o: main.cu
	$(CC) $(CFLAGS) -c main.cu

# Neither target names a file, so always run them when requested.
.PHONY: all clean

# Match *.o only: the pattern *o would also remove any file or directory
# whose name merely ends in the letter "o".
clean:
	rm -rf *.o ljmd
	


================================================
FILE: CUDA/chapter13_分子动力学模拟的CUDA程序开发/whole-code/common.h
================================================
#pragma once

// Floating-point type used throughout: double when DOUBLE_PRECISION is
// defined at compile time, float otherwise.
#ifdef DOUBLE_PRECISION
    typedef double real;
#else
    typedef float real;
#endif

// Used when converting kinetic energy to temperature (scale_velocity).
// NOTE(review): the value matches the Boltzmann constant in eV/K - confirm.
#define K_B                   8.617343e-5
// main divides the time step by this factor.
// NOTE(review): presumably converts fs into the program's internal time
// unit - confirm.
#define TIME_UNIT_CONVERSION  1.018051e+1

// All per-atom data of the simulation. Members without a prefix are host
// arrays (malloc in allocate_memory); members with the g_ prefix are device
// arrays (cudaMalloc in allocate_memory).
struct Atom
{
    // Host arrays, one element per atom.
    real *m;    // mass
    real *x;    // position
    real *y;
    real *z;
    real *vx;   // velocity
    real *vy;
    real *vz;
    real *fx;   // force
    real *fy;
    real *fz;
    real *pe;   // potential energy
    real *ke;   // kinetic energy
    // Six entries: lx, ly, lz followed by their halves (see initialize_position).
    real *box;

    // Device arrays.
    int *g_NN;  // neighbor count per atom
    int *g_NL;  // neighbor list, N * MN entries
    real *g_m;
    real *g_x;
    real *g_y;
    real *g_z;
    real *g_vx;
    real *g_vy;
    real *g_vz;
    real *g_fx;
    real *g_fy;
    real *g_fz;
    real *g_pe;
    real *g_ke;
};

// Simulation box passed by value to kernels. find_force / find_neighbor fill
// it from atom->box[0..5].
struct Box
{
    real lx;    // box lengths (atom->box[0..2])
    real ly;
    real lz;
    real lx2;   // half box lengths (atom->box[3..5])
    real ly2;
    real lz2;
};


================================================
FILE: CUDA/chapter13_分子动力学模拟的CUDA程序开发/whole-code/error.cuh
================================================
#pragma once
#include <stdio.h>

// Wrap a CUDA runtime call: evaluate `call` once and, if it does not return
// cudaSuccess, print the file, line, error code and error text, then
// terminate the program with exit(1). The do { } while (0) wrapper lets the
// macro be used as a single statement followed by a semicolon.
#define CHECK(call)                                   \
do                                                    \
{                                                     \
    const cudaError_t error_code = call;              \
    if (error_code != cudaSuccess)                    \
    {                                                 \
        printf("CUDA Error:\n");                      \
        printf("    File:       %s\n", __FILE__);     \
        printf("    Line:       %d\n", __LINE__);     \
        printf("    Error code: %d\n", error_code);   \
        printf("    Error text: %s\n",                \
            cudaGetErrorString(error_code));          \
        exit(1);                                      \
    }                                                 \
} while (0)



================================================
FILE: CUDA/chapter13_分子动力学模拟的CUDA程序开发/whole-code/force.cu
================================================
#include "error.cuh"
#include "force.h"
#include "mic.h"

// Precomputed constants handed to gpu_find_force (filled in by find_force
// from epsilon, sigma and the cutoff).
struct LJ
{
    real cutoff2;   // cutoff * cutoff
    real e24s6;     // 24 * epsilon * sigma^6
    real e48s12;    // 48 * epsilon * sigma^12
    real e4s6;      // 4 * epsilon * sigma^6
    real e4s12;     // 4 * epsilon * sigma^12
};

// One thread per atom i: loop over the neighbors of i listed in g_NL,
// accumulate the pair force and pair potential from the LJ constants, and
// store the force components and half of the summed potential for atom i.
static void __global__ gpu_find_force
(
    LJ lj, int N, int *g_NN, int *g_NL, Box box,
    real *g_x,
    real *g_y,
    real *g_z,
    real *g_fx, real *g_fy, real *g_fz, real *g_pe
)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= N) { return; }

    const int num_neighbors = g_NN[i];
    const real xi = g_x[i];
    const real yi = g_y[i];
    const real zi = g_z[i];

    real force_x = 0.0;
    real force_y = 0.0;
    real force_z = 0.0;
    real pe_sum = 0.0;

    for (int k = 0; k < num_neighbors; ++k)
    {
        // The k-th neighbor of atom i is stored at g_NL[i + N * k].
        const int j = g_NL[i + N * k];
        real dx = g_x[j] - xi;
        real dy = g_y[j] - yi;
        real dz = g_z[j] - zi;
        apply_mic(box, &dx, &dy, &dz);

        const real r2 = dx*dx + dy*dy + dz*dz;
        // Pairs beyond the cutoff contribute nothing.
        if (!(r2 > lj.cutoff2))
        {
            const real inv2 = 1.0 / r2;
            const real inv4 = inv2 * inv2;
            const real inv6 = inv2 * inv4;
            const real inv8 = inv4 * inv4;
            const real inv12 = inv4 * inv8;
            const real inv14 = inv6 * inv8;
            const real pair_f = lj.e24s6 * inv8 - lj.e48s12 * inv14;
            pe_sum += lj.e4s12 * inv12 - lj.e4s6 * inv6;
            force_x += pair_f * dx;
            force_y += pair_f * dy;
            force_z += pair_f * dz;
        }
    }

    g_fx[i] = force_x;
    g_fy[i] = force_y;
    g_fz[i] = force_z;
    // Each pair is visited from both of its atoms, so store half per atom.
    g_pe[i] = pe_sum * 0.5;
}

void find_force(int N, int MN, Atom *atom)
{
    const real epsilon = 1.032e-2;
    const real sigma = 3.405;
    const real cutoff = 10.0;
    const real cutoff2 = cutoff * cutoff;
    const real sigma_3 = sigma * sigma * sigma;
    const real sigma_6 = sigma_3 * sigma_3;
    const real sigma_12 = sigma_6 * sigma_6;
    const real e24s6 = 24.0 * epsilon * sigma_6; 
    const real e48s12 = 48.0 * epsilon * sigma_12;
    const real e4s6 = 4.0 * epsilon * sigma_6;
    const real e4s12 = 4.0 * epsilon * sigma_12;
    LJ lj;
    lj.cutoff2 = cutoff2;
    lj.e24s6 = e24s6;
    lj.e48s12 = e48s12;
    lj.e4s6 = e4s6;
    lj.e4s12 = e4s12;

    Box box;
    box.lx = atom->box[0];
    box.ly = atom->box[1];
    box.lz = atom->box[2];
    box.lx2 = atom->box[3];
    box.ly2 = atom->box[4];
    box.lz2 = atom->box[5];

    int block_size = 128;
    int grid_size = (N - 1) / block_size + 1;
    gpu_find_force<<<grid_size, block_size>>>
    (
        lj, N,  atom->g_NN, atom->g_NL, box,
        atom->g_x, atom->g_y, atom->g_z,
        atom->g_fx, atom->g_fy, atom->g_fz, atom->g_pe
    );
}



================================================
FILE: CUDA/chapter13_分子动力学模拟的CUDA程序开发/whole-code/force.h
================================================
#pragma once
#include "common.h"

// Compute forces (g_fx, g_fy, g_fz) and per-atom potential energy (g_pe) on
// the device for N atoms. MN is accepted but not used by the implementation.
void find_force(int N, int MN, Atom *atom);



================================================
FILE: CUDA/chapter13_分子动力学模拟的CUDA程序开发/whole-code/initialize.cu
================================================
#include "initialize.h"
#include "error.cuh"
#include <stdlib.h>
#include <math.h>

static void scale_velocity(int N, real T_0, Atom *atom)
{
    real *m = atom->m;
    real *vx = atom->vx;
    real *vy = atom->vy;
    real *vz = atom->vz;
    real temperature = 0.0;
    for (int n = 0; n < N; ++n) 
    {
        real v2 = vx[n]*vx[n] + vy[n]*vy[n] + vz[n]*vz[n];     
        temperature += m[n] * v2; 
    }
    temperature /= 3.0 * K_B * N;
    real scale_factor = sqrt(T_0 / temperature);
    for (int n = 0; n < N; ++n)
    { 
        vx[n] *= scale_factor;
        vy[n] *= scale_factor;
        vz[n] *= scale_factor;
    }
}

// Set up a cubic box of nx * nx * nx unit cells with edge length ax, place
// four atoms in every cell, and copy the positions to the device.
//   nx    number of unit cells along each axis
//   ax    unit-cell edge length
//   atom  box, x, y, z (host) are written; g_x, g_y, g_z (device) receive a copy
void initialize_position(int nx, real ax, Atom *atom)
{
    // box[0..2]: box lengths, box[3..5]: half box lengths.
    atom->box[0] = ax * nx;
    atom->box[1] = ax * nx;
    atom->box[2] = ax * nx;
    atom->box[3] = atom->box[0] * 0.5;
    atom->box[4] = atom->box[1] * 0.5;
    atom->box[5] = atom->box[2] * 0.5;
    real *x = atom->x;
    real *y = atom->y;
    real *z = atom->z;
    // Fractional coordinates of the four atoms inside one unit cell.
    real x0[4] = {0.0, 0.0, 0.5, 0.5};
    real y0[4] = {0.0, 0.5, 0.0, 0.5};
    real z0[4] = {0.0, 0.5, 0.5, 0.0};
    int n = 0;
    for (int ix = 0; ix < nx; ++ix)
    {
        for (int iy = 0; iy < nx; ++iy)
        {
            for (int iz = 0; iz < nx; ++iz)
            {
                for (int i = 0; i < 4; ++i)
                {
                    x[n] = (ix + x0[i]) * ax;
                    y[n] = (iy + y0[i]) * ax;
                    z[n] = (iz + z0[i]) * ax;
                    n++;
                }
            }
        }
    }

    // Keep the byte count in size_t: storing it in an int would narrow the
    // value and could turn a large size negative.
    const size_t bytes = sizeof(real) * 4 * nx * nx * nx;
    CHECK(cudaMemcpy(atom->g_x, atom->x, bytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(atom->g_y, atom->y, bytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(atom->g_z, atom->z, bytes, cudaMemcpyHostToDevice));
}

void initialize_velocity(int N, real T_0, Atom *atom)
{
    real *m = atom->m;
    real *vx = atom->vx;
    real *vy = atom->vy;
    real *vz = atom->vz;
    real momentum_average[3] = {0.0, 0.0, 0.0};
    for (int n = 0; n < N; ++n)
    { 
        vx[n] = -1.0 + (rand() * 2.0) / RAND_MAX; 
        vy[n] = -1.0 + (rand() * 2.0) / RAND_MAX; 
        vz[n] = -1.0 + (rand() * 2.0) / RAND_MAX;    
        
        momentum_average[0] += m[n] * vx[n] / N;
        momentum_average[1] += m[n] * vy[n] / N;
        momentum_average[2] += m[n] * vz[n] / N;
    } 
    for (int n = 0; n < N; ++n) 
    { 
        vx[n] -= momentum_average[0] / m[n];
        vy[n] -= momentum_average[1] / m[n];
        vz[n] -= momentum_average[2] / m[n]; 
    }
    scale_velocity(N, T_0, atom);

    CHECK(cudaMemcpy(atom->g_m, atom->m, sizeof(real) * N, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(atom->g_vx, atom->vx, sizeof(real) * N, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(atom->g_vy, atom->vy, sizeof(real) * N, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(atom->g_vz, atom->vz, sizeof(real) * N, cudaMemcpyHostToDevice));
}



================================================
FILE: CUDA/chapter13_分子动力学模拟的CUDA程序开发/whole-code/initialize.h
================================================
#pragma once
#include "common.h"

// Fill the box and the positions for nx^3 unit cells of edge ax (four atoms
// per cell) and copy the positions to the device.
void initialize_position(int nx, real ax, Atom *atom);
// Assign random velocities with zero mean momentum, rescaled to temperature
// T_0, and copy masses and velocities to the device.
void initialize_velocity(int N, real T_0, Atom *atom);



================================================
FILE: CUDA/chapter13_分子动力学模拟的CUDA程序开发/whole-code/integrate.cu
================================================
#include "integrate.h"
#include "error.cuh"
#include "force.h"
#include "reduce.h"
#include <stdio.h>
#include <math.h>
#include <time.h>

// Multiply the three velocity components of atom n by scale_factor,
// one thread per atom.
static void __global__ gpu_scale_velocity
(
    int N, real scale_factor,
    real *g_vx, real *g_vy, real *g_vz
)
{
    const int n = blockDim.x * blockIdx.x + threadIdx.x;
    if (n >= N) { return; }

    g_vx[n] *= scale_factor;
    g_vy[n] *= scale_factor;
    g_vz[n] *= scale_factor;
}

static void scale_velocity(int N, real T_0, Atom *atom)
{
    real temperature = sum(N, atom->g_ke) / (1.5 * K_B * N);
    real scale_factor = sqrt(T_0 / temperature);

    int block_size = 128;
    int grid_size = (N - 1) / block_size + 1;
    gpu_scale_velocity<<<grid_size, block_size>>>
    (N, scale_factor, atom->g_vx, atom->g_vy, atom->g_vz);
}

// One thread per atom. Every call advances the velocity by
// (force / mass) * time_step_half. With flag == 1 the position is then
// advanced by velocity * time_step; otherwise the kinetic energy of the atom
// is stored in g_ke.
static void __global__ gpu_integrate
(
    int N, real time_step, real time_step_half,
    real *g_m, real *g_x, real *g_y, real *g_z,
    real *g_vx, real *g_vy, real *g_vz,
    real *g_fx, real *g_fy, real *g_fz,
    real *g_ke, int flag
)
{
    const int n = blockDim.x * blockIdx.x + threadIdx.x;
    if (n >= N) { return; }

    const real mass = g_m[n];
    const real mass_inv = 1.0 / mass;
    const real ax = g_fx[n] * mass_inv;
    const real ay = g_fy[n] * mass_inv;
    const real az = g_fz[n] * mass_inv;

    // Velocity update, written back immediately.
    const real vx = g_vx[n] + ax * time_step_half;
    const real vy = g_vy[n] + ay * time_step_half;
    const real vz = g_vz[n] + az * time_step_half;
    g_vx[n] = vx;
    g_vy[n] = vy;
    g_vz[n] = vz;

    if (flag != 1)
    {
        g_ke[n] = (vx*vx + vy*vy + vz*vz) * mass * 0.5;
        return;
    }

    g_x[n] += vx * time_step;
    g_y[n] += vy * time_step;
    g_z[n] += vz * time_step;
}

static void integrate(int N, real time_step, Atom *atom, int flag)
{
    real time_step_half = time_step * 0.5;

    int block_size = 128;
    int grid_size = (N - 1) / block_size + 1;
    gpu_integrate<<<grid_size, block_size>>>
    (
        N, time_step, time_step_half,
        atom->g_m, atom->g_x, atom->g_y, atom->g_z,
        atom->g_vx, atom->g_vy, atom->g_vz,
        atom->g_fx, atom->g_fy, atom->g_fz, 
        atom->g_ke, flag
    );
}

// Equilibration phase: Ne time steps, each followed by a velocity rescaling
// toward the target temperature T_0.
void equilibration
(
    int Ne, int N, int MN, real T_0,
    real time_step, Atom *atom
)
{
    // The first integrate() call reads the forces, so compute them up front.
    find_force(N, MN, atom);

    int steps_left = Ne;
    while (steps_left > 0)
    {
        integrate(N, time_step, atom, 1);   // velocities, then positions
        find_force(N, MN, atom);            // forces at the new positions
        integrate(N, time_step, atom, 2);   // velocities, then kinetic energy
        scale_velocity(N, T_0, atom);
        --steps_left;
    }
}

// Production phase: run Np time steps without velocity rescaling, write the
// summed kinetic and potential energies to "energy.txt" every Ns steps, and
// print the wall-clock time of the whole phase and of the force part.
//
//   Np, Ns     number of steps and output interval (in steps)
//   N, MN      forwarded to find_force
//   T_0        unused here; kept so the signature parallels equilibration()
//   time_step  forwarded to integrate
//
// If "energy.txt" cannot be opened, an error is printed and the function
// returns without running any step.
void production
(
    int Np, int Ns, int N, int MN, real T_0,
    real time_step, Atom *atom
)
{
    float t_force = 0.0f;
    // Drain pending GPU work so it is not charged to the production timer.
    CHECK(cudaDeviceSynchronize());
    clock_t t_total_start = clock();

    FILE *fid_e = fopen("energy.txt", "w");
    if (fid_e == NULL)
    {
        // Without this guard fprintf/fclose below would be handed NULL.
        printf("Error: cannot open energy.txt for writing.\n");
        return;
    }

    for (int step = 0; step < Np; ++step)
    {
        integrate(N, time_step, atom, 1);

        // Synchronize on both sides of find_force so that the clock()
        // difference covers the GPU work and nothing else.
        CHECK(cudaDeviceSynchronize());
        clock_t t_force_start = clock();

        find_force(N, MN, atom);

        CHECK(cudaDeviceSynchronize());
        clock_t t_force_stop = clock();

        t_force += float(t_force_stop - t_force_start) / CLOCKS_PER_SEC;

        integrate(N, time_step, atom, 2);

        if (0 == step % Ns)
        {
            fprintf(fid_e, "%g %g\n", sum(N, atom->g_ke), sum(N, atom->g_pe));
        }
    }
    fclose(fid_e);

    // The last integrate() launch is asynchronous; wait for it so the total
    // time includes all GPU work.
    CHECK(cudaDeviceSynchronize());
    clock_t t_total_stop = clock();

    float t_total = float(t_total_stop - t_total_start) / CLOCKS_PER_SEC;
    printf("Time used for production = %g s\n", t_total);
    printf("Time used for force part = %g s\n", t_force);
}




================================================
FILE: CUDA/chapter13_分子动力学模拟的CUDA程序开发/whole-code/integrate.h
================================================
#pragma once
#include "common.h"

// Run Ne time steps, rescaling the velocities toward T_0 after each one.
// N and MN are forwarded to find_force; time_step to the integrator.
void equilibration
(
    int Ne, int N, int MN, real T_0, 
    real time_step, Atom *atom
);

// Run Np time steps without velocity rescaling, writing the summed kinetic
// and potential energies to "energy.txt" every Ns steps and printing timing
// information at the end. T_0 is accepted but not used by the implementation.
void production
(
    int Np, int Ns, int N, int MN, real T_0, 
    real time_step, Atom *atom
);



================================================
FILE: CUDA/chapter13_分子动力学模拟的CUDA程序开发/whole-code/main.cu
================================================
#include "common.h"
#include "memory.h"
#include "initialize.h"
#include "neighbor.h"
#include "integrate.h"
#include <stdlib.h>
#include <stdio.h>

// Entry point: `ljmd nx Ne`.
//   nx  number of unit cells per box edge (N = 4 * nx^3 atoms)
//   Ne  number of equilibration steps; the production run uses the same count
// Exits with status 1 on a wrong argument count or out-of-range values.
int main(int argc, char **argv)
{
    if (argc != 3)
    {
        printf("Usage: %s nx Ne\n", argv[0]);
        exit(1);
    }

    int nx = atoi(argv[1]);
    int Ne = atoi(argv[2]);
    // atoi yields 0 for non-numeric text; a non-positive nx would give
    // N <= 0 and therefore invalid allocation sizes and kernel launches.
    if (nx < 1 || Ne < 0)
    {
        printf("Error: nx must be a positive integer and Ne must be non-negative.\n");
        exit(1);
    }
    int Np = Ne;

    int N = 4 * nx * nx * nx;   // four atoms per unit cell
    int Ns = 100;               // energy output interval, in steps
    int MN = 200;               // neighbor-list capacity per atom
    real T_0 = 60.0;
    real ax = 5.385;
    real time_step = 5.0 / TIME_UNIT_CONVERSION;
    Atom atom;
    allocate_memory(N, MN, &atom);
    for (int n = 0; n < N; ++n) { atom.m[n] = 40.0; }
    initialize_position(nx, ax, &atom);
    initialize_velocity(N, T_0, &atom);
    find_neighbor(N, MN, &atom);
    equilibration(Ne, N, MN, T_0, time_step, &atom);
    production(Np, Ns, N, MN, T_0, time_step, &atom);
    deallocate_memory(&atom);
    return 0;
}



================================================
FILE: CUDA/chapter13_分子动力学模拟的CUDA程序开发/whole-code/makefile.windows
================================================
# Builds the ljmd executable from the seven .cu sources with nvcc.
all: ljmd

CC = nvcc
# /wd 4819 is forwarded to the host compiler to disable warning 4819.
CFLAGS = -O3 -arch=sm_75 -Xcompiler "/wd 4819"

ljmd: initialize.obj integrate.obj neighbor.obj force.obj memory.obj reduce.obj main.obj
	$(CC) -o ljmd \
	initialize.obj integrate.obj neighbor.obj force.obj memory.obj reduce.obj main.obj

initialize.obj: initialize.cu
	$(CC) $(CFLAGS) -c initialize.cu
integrate.obj: integrate.cu
	$(CC) $(CFLAGS) -c integrate.cu
memory.obj: memory.cu
	$(CC) $(CFLAGS) -c memory.cu
neighbor.obj: neighbor.cu
	$(CC) $(CFLAGS) -c neighbor.cu
force.obj: force.cu
	$(CC) $(CFLAGS) -c force.cu
reduce.obj: reduce.cu
	$(CC) $(CFLAGS) -c reduce.cu
main.obj: main.cu
	$(CC) $(CFLAGS) -c main.cu

# Use *.obj rather than *obj so that only object files are matched, not every
# file whose name merely ends in "obj".
clean:
	del *.obj ljmd*




================================================
FILE: CUDA/chapter13_分子动力学模拟的CUDA程序开发/whole-code/memory.cu
================================================
#include "error.cuh"
#include "memory.h"
#include <stdlib.h>

// malloc wrapper that aborts with a message instead of returning NULL for a
// non-empty request.
static void *checked_malloc(size_t bytes)
{
    void *ptr = malloc(bytes);
    if (ptr == NULL && bytes != 0)
    {
        printf("Error: failed to allocate %llu bytes of host memory.\n",
            (unsigned long long) bytes);
        exit(1);
    }
    return ptr;
}

// Allocate every buffer referenced by `atom`.
//   N   number of atoms; each per-atom array holds N elements
//   MN  neighbor-list capacity per atom; g_NL holds N * MN ints
// Host arrays come from malloc, device arrays (g_ prefix) from cudaMalloc.
// Any allocation failure terminates the program.
void allocate_memory(int N, int MN, Atom *atom)
{
    atom->m  = (real*) checked_malloc(N * sizeof(real));
    atom->x  = (real*) checked_malloc(N * sizeof(real));
    atom->y  = (real*) checked_malloc(N * sizeof(real));
    atom->z  = (real*) checked_malloc(N * sizeof(real));
    atom->vx = (real*) checked_malloc(N * sizeof(real));
    atom->vy = (real*) checked_malloc(N * sizeof(real));
    atom->vz = (real*) checked_malloc(N * sizeof(real));
    atom->fx = (real*) checked_malloc(N * sizeof(real));
    atom->fy = (real*) checked_malloc(N * sizeof(real));
    atom->fz = (real*) checked_malloc(N * sizeof(real));
    atom->pe = (real*) checked_malloc(N * sizeof(real));
    atom->ke = (real*) checked_malloc(N * sizeof(real));
    // Six box entries: lx, ly, lz and their halves.
    atom->box = (real*) checked_malloc(6 * sizeof(real));

    CHECK(cudaMalloc((void**)&atom->g_NN, sizeof(int) * N));
    CHECK(cudaMalloc((void**)&atom->g_NL, sizeof(int) * N * MN));
    CHECK(cudaMalloc((void**)&atom->g_m, sizeof(real) * N));
    CHECK(cudaMalloc((void**)&atom->g_x, sizeof(real) * N));
    CHECK(cudaMalloc((void**)&atom->g_y, sizeof(real) * N));
    CHECK(cudaMalloc((void**)&atom->g_z, sizeof(real) * N));
    CHECK(cudaMalloc((void**)&atom->g_vx, sizeof(real) * N));
    CHECK(cudaMalloc((void**)&atom->g_vy, sizeof(real) * N));
    CHECK(cudaMalloc((void**)&atom->g_vz, sizeof(real) * N));
    CHECK(cudaMalloc((void**)&atom->g_fx, sizeof(real) * N));
    CHECK(cudaMalloc((void**)&atom->g_fy, sizeof(real) * N));
    CHECK(cudaMalloc((void**)&atom->g_fz, sizeof(real) * N));
    CHECK(cudaMalloc((void**)&atom->g_pe, sizeof(real) * N));
    CHECK(cudaMalloc((void**)&atom->g_ke, sizeof(real) * N));
}

// Release every buffer that allocate_memory created, host arrays first and
// device arrays second, in the same order as they were allocated.
void deallocate_memory(Atom *atom)
{
    real *host_arrays[] =
    {
        atom->m,
        atom->x,  atom->y,  atom->z,
        atom->vx, atom->vy, atom->vz,
        atom->fx, atom->fy, atom->fz,
        atom->pe, atom->ke,
        atom->box
    };
    const int num_host = sizeof(host_arrays) / sizeof(host_arrays[0]);
    for (int i = 0; i < num_host; ++i)
    {
        free(host_arrays[i]);
    }

    void *device_arrays[] =
    {
        atom->g_NN, atom->g_NL,
        atom->g_m,
        atom->g_x,  atom->g_y,  atom->g_z,
        atom->g_vx, atom->g_vy, atom->g_vz,
        atom->g_fx, atom->g_fy, atom->g_fz,
        atom->g_pe, atom->g_ke
    };
    const int num_device = sizeof(device_arrays) / sizeof(device_arrays[0]);
    for (int i = 0; i < num_device; ++i)
    {
        CHECK(cudaFree(device_arrays[i]));
    }
}



================================================
FILE: CUDA/chapter13_分子动力学模拟的CUDA程序开发/whole-code/memory.h
================================================
#pragma once
#include "common.h"

// Allocate all host (malloc) and device (cudaMalloc) arrays of `atom` for
// N atoms and a neighbor list of N * MN entries.
void allocate_memory(int N, int MN, Atom *atom);
// Free everything allocate_memory allocated.
void deallocate_memory(Atom *atom);



================================================
FILE: CUDA/chapter13_分子动力学模拟的CUDA程序开发/whole-code/mic.h
================================================
#pragma once

static void __device__ apply_mic
(
    Box box, real *x12, real *y12, real *z12
)
{
    if      (*x12 < - box.lx2) { *x12 += box.lx; } 
    else if (*x12 > + box.lx2) { *x12 -= box.lx; }
    if      (*y12 < - box.ly2) { *y12 += box.ly; } 
    else if (*y12 > + box.ly2) { *y12 -= box.ly; }
    if      (*z12 < - box.lz2) { *z12 += box.lz; } 
    else if (*z12 > + box.lz2) { *z12 -= box.lz; }
}



================================================
FILE: CUDA/chapter13_分子动力学模拟的CUDA程序开发/whole-code/neighbor.cu
================================================
#include "neighbor.h"
#include "mic.h"
#include <stdio.h>
#include <stdlib.h>

// Build the neighbor list with an all-pairs search, one thread per atom n1.
//
//   N        number of atoms
//   MN       capacity of the list per atom (g_NL holds N * MN entries)
//   g_NN     out: number of stored neighbors of each atom
//   g_NL     out: neighbor indices; the k-th neighbor of n1 is g_NL[k * N + n1]
//   box      box lengths and half lengths used by apply_mic
//   g_x/y/z  atom coordinates
//   cutoff2  squared cutoff distance
//
// Atoms with more than MN neighbors keep only the first MN found and print a
// diagnostic; storing the surplus would write past the end of g_NL.
static void __global__ gpu_find_neighbor
(
    int N, int MN, int *g_NN, int *g_NL, Box box,
    real *g_x, real *g_y, real *g_z, real cutoff2
)
{
    int n1 = blockIdx.x * blockDim.x + threadIdx.x;
    if (n1 < N)
    {
        int count = 0;
        real x1 = g_x[n1];
        real y1 = g_y[n1];
        real z1 = g_z[n1];
        for (int n2 = 0; n2 < N; n2++)
        {
            real x12 = g_x[n2] - x1;
            real y12 = g_y[n2] - y1;
            real z12 = g_z[n2] - z1;
            apply_mic(box, &x12, &y12, &z12);
            real d12_square = x12*x12 + y12*y12 + z12*z12;
            if ((n2 != n1) && (d12_square < cutoff2))
            {
                // Store only while there is room; keep counting so the
                // overflow can be reported below.
                if (count < MN)
                {
                    g_NL[count * N + n1] = n2;
                }
                ++count;
            }
        }
        if (count > MN)
        {
            printf("Error: atom %d has %d neighbors but MN = %d; "
                "extra neighbors are dropped.\n", n1, count, MN);
            count = MN;
        }
        g_NN[n1] = count;
    }
}

void find_neighbor(int N, int MN, Atom *atom)
{
    real cutoff = 11.0;
    real cutoff2 = cutoff * cutoff;

    Box box;
    box.lx = atom->box[0];
    box.ly = atom->box[1];
    box.lz = atom->box[2];
    box.lx2 = atom->box[3];
    box.ly2 = atom->box[4];
    box.lz2 = atom->box[5];

    int block_size = 128;
    int grid_size = (N - 1) / block_size + 1;
    gpu_find_neighbor<<<grid_size, block_size>>>
    (
        N, MN, atom->g_NN, atom->g_NL, box,
        atom->g_x, atom->g_y, atom->g_z, cutoff2
    );
}



================================================
FILE: CUDA/chapter13_分子动力学模拟的CUDA程序开发/whole-code/neighbor.h
================================================
#pragma once
#include "common.h"

// Fill atom->g_NN / atom->g_NL on the device for N atoms, with MN neighbor
// slots per atom.
void find_neighbor(int N, int MN, Atom *atom);


================================================
FILE: CUDA/chapter13_分子动力学模拟的CUDA程序开发/whole-code/reduce.cu
================================================
#include "reduce.h"
#include "error.cuh"
#include <cooperative_groups.h>
using namespace cooperative_groups;
// Threads per block used by sum() for both launches; also the number of
// blocks in the first launch and the length of static_y.
const int block_size = 128;

// Sum the N elements of d_x; block `bid` writes its partial sum to d_y[bid].
// Needs blockDim.x * sizeof(real) bytes of dynamic shared memory.
void __global__ reduce_cp(const real *d_x, real *d_y, const int N)
{
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    extern __shared__ real s_y[];

    // Grid-stride loop: each thread accumulates every stride-th element, so
    // a fixed grid covers any N.
    real y = 0.0;
    const int stride = blockDim.x * gridDim.x;
    for (int n = bid * blockDim.x + tid; n < N; n += stride)
    {
        y += d_x[n];
    }
    s_y[tid] = y;
    __syncthreads();

    // Shared-memory tree reduction down to 32 partial sums in s_y[0..31].
    for (int offset = blockDim.x >> 1; offset >= 32; offset >>= 1)
    {
        if (tid < offset)
        {
            s_y[tid] += s_y[tid + offset];
        }
        __syncthreads();
    }

    y = s_y[tid];

    // Finish inside each 32-thread tile with shuffles; only the result held
    // by thread 0 of the block is used.
    thread_block_tile<32> g = tiled_partition<32>(this_thread_block());
    for (int i = g.size() >> 1; i > 0; i >>= 1)
    {
        y += g.shfl_down(y, i);
    }

    if (tid == 0)
    {
        d_y[bid] = y;
    }
}

// Device scratch buffer holding one partial sum per block of the first pass.
__device__ real static_y[block_size];

// Return the sum of the N elements of the device array d_x.
// Pass 1 reduces d_x to block_size partial sums in static_y; pass 2 reduces
// those in place with a single block; the result is then copied to the host.
real sum(const int N, const real *d_x)
{
    real *partial = NULL;
    CHECK(cudaGetSymbolAddress((void**)&partial, static_y));

    const int shared_bytes = block_size * sizeof(real);

    reduce_cp<<<block_size, block_size, shared_bytes>>>(d_x, partial, N);
    reduce_cp<<<1, block_size, shared_bytes>>>(partial, partial, block_size);

    real total = 0;
    CHECK(cudaMemcpy(&total, partial, sizeof(real), cudaMemcpyDeviceToHost));

    return total;
}




================================================
FILE: CUDA/chapter13_分子动力学模拟的CUDA程序开发/whole-code/reduce.h
================================================
// Include guard, matching the #pragma once used by the other headers in
// this directory.
#pragma once
#include "common.h"

// Return the sum of the N elements of the device array d_x.
real sum(const int N, const real *d_x);



================================================
FILE: CUDA/chapter14_CUDA标准库的使用/Makefile
================================================
# Builds the chapter 14 library examples; each target is a single-file program.
all: thrust_scan_vector thrust_scan_pointer cublas_gemm cusolver curand_host1 curand_host2

thrust_scan_vector: thrust_scan_vector.cu
	nvcc -arch=sm_60 -O3 thrust_scan_vector.cu -o thrust_scan_vector

thrust_scan_pointer: thrust_scan_pointer.cu
	nvcc -arch=sm_60 -O3 thrust_scan_pointer.cu -o thrust_scan_pointer

cublas_gemm: cublas_gemm.cu
	nvcc -arch=sm_60 -lcublas -O3 cublas_gemm.cu -o cublas_gemm

cusolver: cusolver.cu
	nvcc -arch=sm_60 -lcusolver cusolver.cu -o cusolver

curand_host1: curand_host1.cu
	nvcc -arch=sm_60 -lcurand curand_host1.cu -o curand_host1

curand_host2: curand_host2.cu
	nvcc -arch=sm_60 -lcurand curand_host2.cu -o curand_host2

# Neither `all` nor `clean` names a file, so declare both phony.
.PHONY: all clean

clean:
	rm -rf thrust_scan_vector thrust_scan_pointer cublas_gemm cusolver curand_host1 curand_host2


================================================
FILE: CUDA/chapter14_CUDA标准库的使用/cublas_gemm.cu
================================================
#include "error.cuh"
#include <stdio.h>
#include <stdlib.h>
#include <cublas_v2.h>

// Print the R x C matrix A (stored column by column) under the label `name`.
void print_matrix(int R, int C, double* A, const char* name);

int main(void)
{
    int M = 2;
    int K = 3;
    int N = 2;
    int MK = M * K;
    int KN = K * N;
    int MN = M * N;

    double *h_A = (double*) malloc(sizeof(double) * MK);
    double *h_B = (double*) malloc(sizeof(double) * KN);
    double *h_C = (double*) malloc(sizeof(double) * MN);
    for (int i = 0; i < MK; i++)
    {
        h_A[i] = i;
    }
    print_matrix(M, K, h_A, "A");
    for (int i = 0; i < KN; i++)
    {
        h_B[i] = i;
    }
    print_matrix(K, N, h_B, "B");
    for (int i = 0; i < MN; i++)
    {
        h_C[i] = 0;
    }

    double *g_A, *g_B, *g_C;
    CHECK(cudaMalloc((void **)&g_A, sizeof(double) * MK));
    CHECK(cudaMalloc((void **)&g_B, sizeof(double) * KN));
    CHECK(cudaMalloc((void **)&g_C, sizeof(double) * MN));

    cublasSetVector(MK, sizeof(double), h_A, 1, g_A, 1);
    cublasSetVector(KN, sizeof(double), h_B, 1, g_B, 1);
    cublasSetVector(MN, sizeof(double), h_C, 1, g_C, 1);

    cublasHandle_t handle;
    cublasCreate(&handle);
    double alpha = 1.0;
    double beta = 0.0;
    cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
        M, N, K, &alpha, g_A, M, g_B, K, &beta, g_C, M);
    cublasDestroy(handle);

    cublasGetVector(MN, sizeof(double), g_C, 1, h_C, 1);
    print_matrix(M, N, h_C, "C = A x B");

    free(h_A);
    free(h_B);
    free(h_C);
    CHECK(cudaFree(g_A));
    CHECK(cudaFree(g_B));
    CHECK(cudaFree(g_C));
    return 0;
}

// Prints the label `name` followed by an R x C matrix, one matrix row per
// output line. A is read in column-major order: element (row, col) is taken
// from A[col * R + row].
void print_matrix(int R, int C, double* A, const char* name)
{
    printf("%s = \n", name);
    for (int row = 0; row < R; ++row)
    {
        for (int col = 0; col < C; ++col)
        {
            const int column_start = col * R;
            printf("%10.6f", A[column_start + row]);
        }
        printf("\n");
    }
}

================================================
FILE: CUDA/chapter14_CUDA标准库的使用/curand_host1.cu
================================================
#include <cstdio>
#include <cstdlib>
#include <curand.h>

void output_results(int N, double *g_x);

// Generates N uniformly distributed doubles on the GPU with the cuRAND host
// API, copies them to the host and hands them to output_results.
int main(int argc, char *argv[])
{
    const int N = 100000;
    const size_t num_bytes = sizeof(double) * N;

    // Default pseudo-random generator with a fixed seed.
    curandGenerator_t rng;
    curandCreateGenerator(&rng, CURAND_RNG_PSEUDO_DEFAULT);
    curandSetPseudoRandomGeneratorSeed(rng, 1234);

    // Fill a device buffer with the random samples.
    double *dev_samples;
    cudaMalloc((void **)&dev_samples, num_bytes);
    curandGenerateUniformDouble(rng, dev_samples, N);

    // Bring the samples back to a zero-initialised host buffer.
    double *host_samples = (double *)calloc(N, sizeof(double));
    cudaMemcpy(host_samples, dev_samples, num_bytes, cudaMemcpyDeviceToHost);
    cudaFree(dev_samples);

    output_results(N, host_samples);
    free(host_samples);

    return 0;
}

// Writes the first N values of x to the text file "x1.txt", one value per
// line in %g format. An existing file is overwritten.
//
// If the file cannot be opened, the reason is reported on stderr and nothing
// is written; previously the NULL stream was passed straight to fprintf.
void output_results(int N, double *x)
{
    FILE *fid = fopen("x1.txt", "w");
    if (fid == NULL)
    {
        perror("x1.txt");
        return;
    }
    for (int n = 0; n < N; n++)
    {
        fprintf(fid, "%g\n", x[n]);
    }
    fclose(fid);
}



================================================
FILE: CUDA/chapter14_CUDA标准库的使用/curand_host2.cu
================================================
#include <cstdio>
#include <cstdlib>
#include <curand.h>

void output_results(int N, double *g_x);

// Generates N normally distributed doubles (mean 0, standard deviation 1) on
// the GPU with the cuRAND host API, copies them to the host and hands them to
// output_results.
int main(int argc, char *argv[])
{
    const int N = 100000;
    const size_t num_bytes = sizeof(double) * N;

    // Default pseudo-random generator with a fixed seed.
    curandGenerator_t rng;
    curandCreateGenerator(&rng, CURAND_RNG_PSEUDO_DEFAULT);
    curandSetPseudoRandomGeneratorSeed(rng, 1234);

    // Fill a device buffer with the random samples.
    double *dev_samples;
    cudaMalloc((void **)&dev_samples, num_bytes);
    curandGenerateNormalDouble(rng, dev_samples, N, 0.0, 1.0);

    // Bring the samples back to a zero-initialised host buffer.
    double *host_samples = (double *)calloc(N, sizeof(double));
    cudaMemcpy(host_samples, dev_samples, num_bytes, cudaMemcpyDeviceToHost);
    cudaFree(dev_samples);

    output_results(N, host_samples);
    free(host_samples);

    return 0;
}

// Writes the first N values of x to the text file "x2.txt", one value per
// line in %g format. An existing file is overwritten.
//
// If the file cannot be opened, the reason is reported on stderr and nothing
// is written; previously the NULL stream was passed straight to fprintf.
void output_results(int N, double *x)
{
    FILE *fid = fopen("x2.txt", "w");
    if (fid == NULL)
    {
        perror("x2.txt");
        return;
    }
    for (int n = 0; n < N; n++)
    {
        fprintf(fid, "%g\n", x[n]);
    }
    fclose(fid);
}

================================================
FILE: CUDA/chapter14_CUDA标准库的使用/cusolver.cu
================================================
#include "error.cuh"
#include <cstdio>
#include <cstdlib>
#include <cusolverDn.h>

// Reports a failed cuSOLVER call (identified by `what`) and terminates.
static void check_cusolver(const cusolverStatus_t status, const char *what)
{
    if (status != CUSOLVER_STATUS_SUCCESS)
    {
        printf("cuSOLVER Error:\n");
        printf("    Call:        %s\n", what);
        printf("    Status code: %d\n", (int)status);
        exit(1);
    }
}

// Computes the eigenvalues of a 2 x 2 complex Hermitian matrix on the GPU
// with cusolverDnZheevd and prints them.
int main(int argc, char *argv[])
{
    const int N = 2;
    const int N2 = N * N;
    cuDoubleComplex *A_cpu = (cuDoubleComplex *)malloc(sizeof(cuDoubleComplex) * N2);

    // All entries are zero except the two off-diagonal ones, which are +i and
    // -i. The original code set these same values inside a loop that merely
    // repeated the identical assignments N2 times.
    for (int n = 0; n < N2; ++n)
    {
        A_cpu[n].x = 0;
        A_cpu[n].y = 0;
    }
    A_cpu[1].y = 1;
    A_cpu[2].y = -1;

    cuDoubleComplex *A;
    CHECK(cudaMalloc((void**)&A, sizeof(cuDoubleComplex) * N2));
    CHECK(cudaMemcpy(A, A_cpu, sizeof(cuDoubleComplex) * N2, cudaMemcpyHostToDevice));

    // Host and device storage for the N eigenvalues.
    double *W_cpu = (double*)malloc(sizeof(double) * N);
    double *W;
    CHECK(cudaMalloc((void**)&W, sizeof(double) * N));

    cusolverDnHandle_t handle = NULL;
    check_cusolver(cusolverDnCreate(&handle), "cusolverDnCreate");
    // Eigenvalues only (no eigenvectors); the lower triangle of A is referenced.
    cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_NOVECTOR;
    cublasFillMode_t uplo = CUBLAS_FILL_MODE_LOWER;

    // Ask the solver how much workspace it needs, then allocate it.
    int lwork = 0;
    check_cusolver(
        cusolverDnZheevd_bufferSize(handle, jobz, uplo, N, A, N, W, &lwork),
        "cusolverDnZheevd_bufferSize");
    cuDoubleComplex* work;
    CHECK(cudaMalloc((void **)&work, sizeof(cuDoubleComplex) * lwork));

    // `info` is a device-side integer that the solver fills in.
    int *info;
    CHECK(cudaMalloc((void **)&info, sizeof(int)));
    check_cusolver(
        cusolverDnZheevd(handle, jobz, uplo, N, A, N, W, work, lwork, info),
        "cusolverDnZheevd");

    // A non-zero info means the eigenvalues cannot be trusted.
    int info_cpu = 0;
    CHECK(cudaMemcpy(&info_cpu, info, sizeof(int), cudaMemcpyDeviceToHost));
    if (info_cpu != 0)
    {
        printf("cusolverDnZheevd failed: info = %d\n", info_cpu);
        exit(1);
    }
    CHECK(cudaMemcpy(W_cpu, W, sizeof(double) * N, cudaMemcpyDeviceToHost));

    printf("Eigenvalues are: \n");
    for (int n = 0; n < N; ++n)
    {
        printf("%g\n", W_cpu[n]);
    }

    check_cusolver(cusolverDnDestroy(handle), "cusolverDnDestroy");

    free(A_cpu);
    free(W_cpu);
    CHECK(cudaFree(A));
    CHECK(cudaFree(W));
    CHECK(cudaFree(work));
    CHECK(cudaFree(info));

    return 0;
}

================================================
FILE: CUDA/chapter14_CUDA标准库的使用/error.cuh
================================================
#pragma once
#include <stdio.h>

// CHECK(call): evaluates a CUDA runtime call exactly once. If the result is
// not cudaSuccess, prints the file, line, numeric error code and error text
// to stdout and terminates the program with exit status 1.
//
// The argument is parenthesised so that any expression can be passed safely,
// and the do { ... } while (0) wrapper makes the macro behave like a single
// statement (usable in an unbraced if/else).
#define CHECK(call)                                   \
do                                                    \
{                                                     \
    const cudaError_t error_code = (call);            \
    if (error_code != cudaSuccess)                    \
    {                                                 \
        printf("CUDA Error:\n");                      \
        printf("    File:       %s\n", __FILE__);     \
        printf("    Line:       %d\n", __LINE__);     \
        printf("    Error code: %d\n", error_code);   \
        printf("    Error text: %s\n",                \
            cudaGetErrorString(error_code));          \
        exit(1);                                      \
    }                                                 \
} while (0)


================================================
FILE: CUDA/chapter14_CUDA标准库的使用/thrust_scan_pointer.cu
================================================
#include <thrust/execution_policy.h>
#include <thrust/scan.h>
#include <cstdio>

// Runs an inclusive prefix sum over the integers 1..N that live in raw device
// memory, using thrust with the explicit thrust::device execution policy,
// and prints the scanned values on one line.
int main(int argc, char *argv[])
{
    const int N = 10;
    const size_t num_bytes = sizeof(int) * N;

    // Device input and output buffers.
    int *dev_in, *dev_out;
    cudaMalloc((void **)&dev_in, num_bytes);
    cudaMalloc((void **)&dev_out, num_bytes);

    // Host input: 1, 2, ..., N.
    int *host_in = (int *)malloc(num_bytes);
    for (int k = 0; k < N; ++k)
    {
        host_in[k] = k + 1;
    }
    cudaMemcpy(dev_in, host_in, num_bytes, cudaMemcpyHostToDevice);

    // Raw pointers carry no memory-space information, so the policy argument
    // tells thrust to run the scan on the device.
    thrust::inclusive_scan(thrust::device, dev_in, dev_in + N, dev_out);

    int *host_out = (int *)malloc(num_bytes);
    cudaMemcpy(host_out, dev_out, num_bytes, cudaMemcpyDeviceToHost);
    for (int k = 0; k < N; ++k)
    {
        printf("%d ", host_out[k]);
    }
    printf("\n");

    cudaFree(dev_in);
    cudaFree(dev_out);
    free(host_in);
    free(host_out);
    return 0;
}

================================================
FILE: CUDA/chapter14_CUDA标准库的使用/thrust_scan_vector.cu
================================================
#include <thrust/device_vector.h>
#include <thrust/scan.h>
#include <cstdio>

// Runs an inclusive prefix sum over the integers 1..N held in a
// thrust::device_vector and prints the scanned values on one line.
int main(void)
{
    const int N = 10;
    thrust::device_vector<int> x(N, 0);
    thrust::device_vector<int> y(N, 0);

    // Store 1, 2, ..., N into x one element at a time.
    int next_value = 1;
    for (size_t k = 0; k < x.size(); ++k)
    {
        x[k] = next_value;
        ++next_value;
    }

    thrust::inclusive_scan(x.begin(), x.end(), y.begin());

    // Each y[k] is converted to a plain int before being printed.
    for (size_t k = 0; k < y.size(); ++k)
    {
        const int scanned = (int) y[k];
        printf("%d ", scanned);
    }
    printf("\n");
    return 0;
}

================================================
FILE: CUDA/chapter1_GPU硬件与CUDA程序开发工具/README.md
================================================
### 1. 各个GPU主计算能力的架构代号与发布年份

| 主计算能力  | 架构代号 |   发布年份  |
|:------------:|:---------------:|:--------------:|
| `X=1` | Tesla | 2006 |
| `X=2` | Fermi | 2010 |
| `X=3` | Kepler | 2012 |
| `X=5` | Maxwell | 2014 |
| `X=6` | Pascal | 2016 |
| `X=7` | Volta | 2017 |
| `X.Y=7.5` | Turing | 2018 |
| `X=8` | Ampere | 2020 |

### 2. 当前常用的各种GPU的名称

| 架构  | Tesla系列 |   Quadro系列  | GeForce系列 | Jetson系列 |
|:------------:|:---------------:|:--------------:|:--------------:|:--------------:|
|开普勒|Tesla K系列|Quadro K 系列|GeForce 600/700系列|Tegra K1|
|麦克斯韦|Tesla M系列|Quadro M 系列|GeForce 900系列|Tegra X1|
|帕斯卡|Tesla P系列|Quadro P 系列|GeForce 1000系列|Tegra X2|
|伏特|Tesla V系列|无|无|AGX Xavier|
|图灵|Tesla T系列|Quadro RTX 系列|GeForce 2000系列|无|

注：特斯拉架构和费米架构的GPU已经不再受到最新CUDA的支持，故没有列出。

### 3. 若干GPU的主要性能指标

| GPU型号  | 计算能力 |   显存容量/GB  | 显存带宽/(GB/s) | 浮点数运算峰值/TFLOPS |
|:------------:|:---------------:|:--------------:|:--------------:|:--------------:|
|Tesla K40|3.5|12|288|1.4(4.3)|
|Tesla P100|6.0|16|732|4.7(9.3)|
|Tesla V100|7.0|32|900|7(14)|
|GeForce RTX 2070|7.5|8|448|0.2(6.5)|
|GeForce RTX 2080 Ti|7.5|11|616|0.4(13)|

### 4. 最近的几个CUDA版本对GPU计算能力的支持情况

|CUDA版本|所支持GPU的计算能力|架构|
|:------------:|:---------------:|:--------------:|
|10.0~10.2|3.0~7.5|从开普勒到图灵|
|9.0~9.2|3.0~7.2|从开普勒到伏特|
|8.0|2.0~6.2|从费米到帕斯卡|
|7.0~7.5|2.0~5.3|从费米到麦克斯韦|

================================================
FILE: CUDA/chapter2_CUDA中的线程组织/Makefile
================================================
# Default target: build every hello-world example in this chapter.
all: hello1 hello2 hello3 hello4 hello5

# Host-only version, built with the C++ compiler.
hello1: hello1.cpp
	g++ hello1.cpp -o hello1

# CUDA versions, built with nvcc.
hello2: hello2.cu
	nvcc hello2.cu -o hello2

hello3: hello3.cu
	nvcc hello3.cu -o hello3

hello4: hello4.cu
	nvcc hello4.cu -o hello4

hello5: hello5.cu
	nvcc hello5.cu -o hello5

# Neither `all` nor `clean` names a real file; declaring both phony keeps
# them working even if a file called `all` or `clean` appears in the directory.
.PHONY: all clean

# Remove the built executables.
clean:
	rm -rf hello1 hello2 hello3 hello4 hello5


================================================
FILE: CUDA/chapter2_CUDA中的线程组织/hello1.cpp
================================================
#include <stdio.h>

// Host-only hello world: writes one greeting line to stdout.
int main(void)
{
    // The format string contains no conversions, so it is printed verbatim.
    const char *greeting = "Hello World!\n";
    printf(greeting);
    return 0;
}

================================================
FILE: CUDA/chapter2_CUDA中的线程组织/hello2.cu
================================================
#include <cstdio>
#include <iostream>

// Kernel: every thread that runs it prints one greeting line through the
// device-side printf.
__global__ void hello_from_gpu()
{
    printf("Hello World from the GPU!\n");
    // Disabled experiment: iostream output such as the line below is
    // host-only and cannot be used inside a kernel.
    // std::cout << "Hello world!!" << std::endl;
}

// Launches the greeting kernel with a single block of a single thread and
// waits for it to finish.
int main(int argc, char *argv[])
{
    hello_from_gpu<<<1, 1>>>();

    // Kernel launches are asynchronous and device-side printf output is only
    // flushed at synchronisation points. Without this call the program can
    // exit before the greeting ever reaches stdout.
    cudaDeviceSynchronize();

    return 0;
}

================================================
FILE: CUDA/chapter2_CUDA中的线程组织/hello3.cu
================================================
#include <cstdio>

// Kernel: every thread that runs it prints one greeting line through the
// device-side printf.
__global__ void hello_from_gpu()
{
    printf("Hello World from the GPU!\n");
}

// Launches the greeting kernel on 2 blocks of 4 threads each, then blocks
// until the device has finished.
int main(int argc, char *argv[])
{
    const int num_blocks = 2;
    const int threads_per_block = 4;
    hello_from_gpu<<<num_blocks, threads_per_block>>>();

    // Wait for the kernel before the process exits.
    cudaDeviceSynchronize();
    return 0;
}

================================================
FILE: CUDA/chapter2_CUDA中的线程组织/hello4.cu
================================================
#include <cstdio>

// Kernel: each thread prints the index of its block and its own index within
// that block (x dimension only).
__global__ void hello_from_gpu()
{
    const int block_index = blockIdx.x;
    const int thread_index = threadIdx.x;
    printf("Hello world from block %d and thread %d\n",
        block_index, thread_index);
}

// Launches the kernel on 2 blocks of 4 threads each, then blocks until the
// device has finished.
int main(int argc, char *argv[])
{
    const int num_blocks = 2;
    const int threads_per_block = 4;
    hello_from_gpu<<<num_blocks, threads_per_block>>>();

    // Wait for the kernel before the process exits.
    cudaDeviceSynchronize();
    return 0;
}

================================================
FILE: CUDA/chapter2_CUDA中的线程组织/hello5.cu
================================================
#include <cstdio>

// Kernel: each thread prints its block index together with its x and y
// thread indices inside the block.
__global__ void hello_from_gpu()
{
    const int block_index = blockIdx.x;
    const int thread_x = threadIdx.x;
    const int thread_y = threadIdx.y;

    printf("Hello World from block-%d and thread-(%d, %d)!\n",
        block_index, thread_x, thread_y);
}

// Launches the kernel on one block whose threads are arranged as a 2 x 4
// grid (x extent 2, y extent 4), then blocks until the device has finished.
int main(int argc, char *argv[])
{
    const dim3 threads_per_block(2, 4);
    const int num_blocks = 1;
    hello_from_gpu<<<num_blocks, threads_per_block>>>();

    // Wait for the kernel before the process exits.
    cudaDeviceSynchronize();

    return 0;
}

================================================
FILE: CUDA/chapter3_简单CUDA程序的基本框架/Makefile
================================================
# Default target: build every vector-addition example in this chapter.
all: add add1 add2wrong add3if add4device

# Host-only reference version, built with the C++ compiler.
add: add.cpp
	g++ add.cpp -o add

# CUDA versions, built with nvcc.
add1: add1.cu
	nvcc add1.cu -o add1

add2wrong: add2wrong.cu
	nvcc add2wrong.cu -o add2wrong

add3if: add3if.cu
	nvcc add3if.cu -o add3if

add4device: add4device.cu
	nvcc add4device.cu -o add4device

# Neither `all` nor `clean` names a real file; declaring both phony keeps
# them working even if a file called `all` or `clean` appears in the directory.
.PHONY: all clean

# Remove the built executables.
clean:
	rm -rf add add1 add2wrong add3if add4device


================================================
FILE: CUDA/chapter3_简单CUDA程序的基本框架/add.cpp
================================================
#include <cmath>
#include <cstdlib>
#include <cstdio>

const double EPSILON = 1.0e-15;
const double a = 1.23;
const double b = 2.34;
const double c = 3.57;
void add(const double *x, const double *y, double *z, const int N);
void check(const double *z, const int N);

// Host-only reference: fills two arrays with the constants a and b, adds
// them element-wise into a third array and verifies the result.
int main(int argc, char *argv[])
{
    const int N = 100000000;
    const int num_bytes = sizeof(double) * N;

    double *x = static_cast<double*>(malloc(num_bytes));
    double *y = static_cast<double*>(malloc(num_bytes));
    double *z = static_cast<double*>(malloc(num_bytes));

    // Every element of x is a, every element of y is b.
    for (int i = 0; i < N; ++i)
    {
        x[i] = a;
        y[i] = b;
    }

    add(x, y, z, N);
    check(z, N);

    free(z);
    free(y);
    free(x);

    return 0;
}

// Element-wise sum on the host: z[i] = x[i] + y[i] for i in [0, N).
// Nothing is written when N <= 0.
void add(const double *x, const double *y, double *z, const int N)
{
    int i = 0;
    while (i < N)
    {
        const double sum = x[i] + y[i];
        z[i] = sum;
        ++i;
    }
}

// Compares every one of the N elements of z with the expected constant c and
// prints "Has errors" if any differs by more than EPSILON, otherwise
// "No errors". The whole array is always scanned.
void check(const double *z, const int N)
{
    int mismatches = 0;
    for (int i = 0; i < N; ++i)
    {
        const double deviation = fabs(z[i] - c);
        if (deviation > EPSILON)
        {
            ++mismatches;
        }
    }

    const char *verdict = (mismatches > 0) ? "Has errors" : "No errors";
    printf("%s\n", verdict);
}

================================================
FILE: CUDA/chapter3_简单CUDA程序的基本框架/add1.cu
================================================
#include <cmath>
#include <cstdio>

const double EPSILON = 1.0e-15;
const double a = 1.23;
const double b = 2.34;
const double c = 3.57;

void __global__ add(const double *x, const double *y, double *z);
void check(const double *z, const int N);

// Host driver: fills two arrays with the constants a and b, adds them
// element-wise on the GPU and verifies the result on the host.
int main(int argc, char *argv[])
{
    const int N = 10000000;
    const int num_bytes = sizeof(double) * N;

    // Host buffers.
    double *host_x = (double *)malloc(num_bytes);
    double *host_y = (double *)malloc(num_bytes);
    double *host_z = (double *)malloc(num_bytes);
    for (int i = 0; i < N; ++i)
    {
        host_x[i] = a;
        host_y[i] = b;
    }

    // Device buffers, followed by the upload of both inputs.
    double *dev_x, *dev_y, *dev_z;
    cudaMalloc((void **)&dev_x, num_bytes);
    cudaMalloc((void **)&dev_y, num_bytes);
    cudaMalloc((void **)&dev_z, num_bytes);
    cudaMemcpy(dev_x, host_x, num_bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(dev_y, host_y, num_bytes, cudaMemcpyHostToDevice);

    // The kernel has no bounds check. N is an exact multiple of the block
    // size here, so the truncating division yields one thread per element.
    const int threads_per_block = 128;
    const int num_blocks = N / threads_per_block;
    add<<<num_blocks, threads_per_block>>>(dev_x, dev_y, dev_z);

    // Download the result and validate it.
    cudaMemcpy(host_z, dev_z, num_bytes, cudaMemcpyDeviceToHost);
    check(host_z, N);

    free(host_x);
    free(host_y);
    free(host_z);
    cudaFree(dev_x);
    cudaFree(dev_y);
    cudaFree(dev_z);

    return 0;
}


// Kernel: each thread adds one pair of elements. There is no bounds check,
// so the launch must not create more threads than there are elements.
void __global__ add(const double *x, const double *y, double *z)
{
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
    const double sum = x[idx] + y[idx];
    z[idx] = sum;
}

// Compares every one of the N elements of z with the expected constant c.
// Each element that differs by more than EPSILON is reported on its own
// line; a final "Has errors" / "No errors" summary line follows.
void check(const double *z, const int N)
{
    bool mismatch_seen = false;
    for (int i = 0; i < N; ++i)
    {
        const bool within_tolerance = !(fabs(z[i] - c) > EPSILON);
        if (within_tolerance)
        {
            continue;
        }
        mismatch_seen = true;
        printf("z[%d] is: %f, c is: %f\n", i, z[i], c);
    }

    const char *verdict = mismatch_seen ? "Has errors" : "No errors";
    printf("%s\n", verdict);
}

================================================
FILE: CUDA/chapter3_简单CUDA程序的基本框架/add2wrong.cu
================================================
#include <cmath>
#include <cstdio>

const double EPSILON = 1.0e-15;
const double a = 1.23;
const double b = 2.34;
const double c = 3.57;
void __global__ add(const double *x, const double *y, double *z);
void check(const double *z, const int N);

// Host driver for the GPU vector addition: fill the inputs, copy, launch,
// copy back, verify.
//
// NOTE(review): both input copies pass cudaMemcpyDeviceToHost although the
// data goes from host to device. Given the file name (add2wrong) this looks
// like a deliberate mistake kept for demonstration, so it is preserved.
int main(int argc, char *argv[])
{
    const int N = 100000000;
    const int num_bytes = sizeof(double) * N;

    // Host buffers.
    double *host_x = (double*) malloc(num_bytes);
    double *host_y = (double*) malloc(num_bytes);
    double *host_z = (double*) malloc(num_bytes);
    for (int i = 0; i < N; ++i)
    {
        host_x[i] = a;
        host_y[i] = b;
    }

    // Device buffers.
    double *dev_x, *dev_y, *dev_z;
    cudaMalloc((void **)&dev_x, num_bytes);
    cudaMalloc((void **)&dev_y, num_bytes);
    cudaMalloc((void **)&dev_z, num_bytes);

    // Intentionally wrong direction flag (see the note above).
    cudaMemcpy(dev_x, host_x, num_bytes, cudaMemcpyDeviceToHost);
    cudaMemcpy(dev_y, host_y, num_bytes, cudaMemcpyDeviceToHost);

    // N is an exact multiple of the block size, so the truncating division
    // yields one thread per element.
    const int threads_per_block = 128;
    const int num_blocks = N / threads_per_block;
    add<<<num_blocks, threads_per_block>>>(dev_x, dev_y, dev_z);

    cudaMemcpy(host_z, dev_z, num_bytes, cudaMemcpyDeviceToHost);
    check(host_z, N);

    free(host_x);
    free(host_y);
    free(host_z);
    cudaFree(dev_x);
    cudaFree(dev_y);
    cudaFree(dev_z);
    return 0;
}

// Kernel: each thread adds one pair of elements. There is no bounds check,
// so the launch must not create more threads than there are elements.
void __global__ add(const double *x, const double *y, double *z)
{
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
    const double sum = x[idx] + y[idx];
    z[idx] = sum;
}

// Compares every one of the N elements of z with the expected constant c and
// prints "Has errors" if any differs by more than EPSILON, otherwise
// "No errors". The whole array is always scanned.
void check(const double *z, const int N)
{
    int mismatches = 0;
    for (int i = 0; i < N; ++i)
    {
        const double deviation = fabs(z[i] - c);
        if (deviation > EPSILON)
        {
            ++mismatches;
        }
    }
    const char *verdict = (mismatches > 0) ? "Has errors" : "No errors";
    printf("%s\n", verdict);
}

================================================
FILE: CUDA/chapter3_简单CUDA程序的基本框架/add3if.cu
================================================
#include <math.h>
#include <stdio.h>

const double EPSILON = 1.0e-15;
const double a = 1.23;
const double b = 2.34;
const double c = 3.57;
void __global__ add(const double *x, const double *y, double *z, const int N);
void check(const double *z, const int N);

// Host driver: adds two constant-filled arrays on the GPU for an element
// count that is not a multiple of the block size, then verifies the result.
int main(void)
{
    const int N = 100000001;
    const int num_bytes = sizeof(double) * N;

    // Host buffers.
    double *host_x = (double*) malloc(num_bytes);
    double *host_y = (double*) malloc(num_bytes);
    double *host_z = (double*) malloc(num_bytes);
    for (int i = 0; i < N; ++i)
    {
        host_x[i] = a;
        host_y[i] = b;
    }

    // Device buffers, followed by the upload of both inputs.
    double *dev_x, *dev_y, *dev_z;
    cudaMalloc((void **)&dev_x, num_bytes);
    cudaMalloc((void **)&dev_y, num_bytes);
    cudaMalloc((void **)&dev_z, num_bytes);
    cudaMemcpy(dev_x, host_x, num_bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(dev_y, host_y, num_bytes, cudaMemcpyHostToDevice);

    // Round the block count up so the last, partially filled block is
    // launched too; the kernel receives N to ignore the surplus threads.
    const int threads_per_block = 128;
    const int num_blocks = (N + threads_per_block - 1) / threads_per_block;
    add<<<num_blocks, threads_per_block>>>(dev_x, dev_y, dev_z, N);

    cudaMemcpy(host_z, dev_z, num_bytes, cudaMemcpyDeviceToHost);
    check(host_z, N);

    free(host_x);
    free(host_y);
    free(host_z);
    cudaFree(dev_x);
    cudaFree(dev_y);
    cudaFree(dev_z);
    return 0;
}

// Kernel: thread with global index idx writes z[idx] = x[idx] + y[idx].
// Threads whose index is N or larger do nothing.
void __global__ add(const double *x, const double *y, double *z, const int N)
{
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx >= N)
    {
        return;
    }
    z[idx] = x[idx] + y[idx];
}

// Compares every one of the N elements of z with the expected constant c and
// prints "Has errors" if any differs by more than EPSILON, otherwise
// "No errors". The whole array is always scanned.
void check(const double *z, const int N)
{
    int mismatches = 0;
    for (int i = 0; i < N; ++i)
    {
        const double deviation = fabs(z[i] - c);
        if (deviation > EPSILON)
        {
            ++mismatches;
        }
    }
    const char *verdict = (mismatches > 0) ? "Has errors" : "No errors";
    printf("%s\n", verdict);
}


================================================
FILE: CUDA/chapter3_简单CUDA程序的基本框架/add4device.cu
================================================
#include <cmath>
#include <cstdio>
#include <cuda.h>
#include <cuda_runtime.h>

const double EPSILON = 1.0e-15;
const double a = 1.23;
const double b = 2.34;
const double c = 3.57;
void __global__ add1(const double *x, const double *y, double *z, const int N);
void __global__ add2(const double *x, const double *y, double *z, const int N);
void __global__ add3(const double *x, const double *y, double *z, const int N);
void check(const double *z, int N);

// Host driver: runs the same vector addition through three kernels that
// differ only in how their device helper returns the sum (return value,
// pointer, reference), verifying the output after each launch.
int main(int argc, char *argv[])
{
    const int N = 100000001;
    const int num_bytes = sizeof(double) * N;

    // Host buffers.
    double *host_x = (double*) malloc(num_bytes);
    double *host_y = (double*) malloc(num_bytes);
    double *host_z = (double*) malloc(num_bytes);
    for (int i = 0; i < N; ++i)
    {
        host_x[i] = a;
        host_y[i] = b;
    }

    // Device buffers, followed by the upload of both inputs.
    double *dev_x, *dev_y, *dev_z;
    cudaMalloc((void **)&dev_x, num_bytes);
    cudaMalloc((void **)&dev_y, num_bytes);
    cudaMalloc((void **)&dev_z, num_bytes);
    cudaMemcpy(dev_x, host_x, num_bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(dev_y, host_y, num_bytes, cudaMemcpyHostToDevice);

    // Round the block count up so the tail elements are covered.
    const int threads_per_block = 128;
    const int num_blocks = (N + threads_per_block - 1) / threads_per_block;

    // Version 1: helper returns the sum.
    add1<<<num_blocks, threads_per_block>>>(dev_x, dev_y, dev_z, N);
    cudaMemcpy(host_z, dev_z, num_bytes, cudaMemcpyDeviceToHost);
    check(host_z, N);

    // Version 2: helper writes through a pointer.
    add2<<<num_blocks, threads_per_block>>>(dev_x, dev_y, dev_z, N);
    cudaMemcpy(host_z, dev_z, num_bytes, cudaMemcpyDeviceToHost);
    check(host_z, N);

    // Version 3: helper writes through a reference.
    add3<<<num_blocks, threads_per_block>>>(dev_x, dev_y, dev_z, N);
    cudaMemcpy(host_z, dev_z, num_bytes, cudaMemcpyDeviceToHost);
    check(host_z, N);

    free(host_x);
    free(host_y);
    free(host_z);
    cudaFree(dev_x);
    cudaFree(dev_y);
    cudaFree(dev_z);
    return 0;
}

// Version 1: a device function that returns its result by value.
double __device__ add1_device(const double x, const double y)
{
    const double sum = x + y;
    return sum;
}

// Kernel for version 1: stores the value returned by add1_device into
// z[idx]. Threads whose index is N or larger do nothing.
void __global__ add1(const double *x, const double *y, double *z, const int N)
{
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx >= N)
    {
        return;
    }
    z[idx] = add1_device(x[idx], y[idx]);
}

// Version 2: a device function that delivers its result through a pointer.
void __device__ add2_device(const double x, const double y, double *z)
{
    const double sum = x + y;
    *z = sum;
}

// Kernel for version 2: passes the address of z[idx] to add2_device, which
// writes the sum there. Threads whose index is N or larger do nothing.
void __global__ add2(const double *x, const double *y, double *z, const int N)
{
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx >= N)
    {
        return;
    }
    add2_device(x[idx], y[idx], &z[idx]);
}

// Version 3: a device function that delivers its result through a reference.
void __device__ add3_device(const double x, const double y, double &z)
{
    const double sum = x + y;
    z = sum;
}

// Kernel for version 3: binds z[idx] to the reference parameter of
// add3_device, which writes the sum into it. Threads whose index is N or
// larger do nothing.
void __global__ add3(const double *x, const double *y, double *z, const int N)
{
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx >= N)
    {
        return;
    }
    add3_device(x[idx], y[idx], z[idx]);
}

// Compares every one of the N elements of z with the expected constant c and
// prints "Has errors" if any differs by more than EPSILON, otherwise
// "No errors". The whole array is always scanned.
void check(const double *z, const int N)
{
    int mismatches = 0;
    for (int i = 0; i < N; ++i)
    {
        const double deviation = fabs(z[i] - c);
        if (deviation > EPSILON)
        {
            ++mismatches;
        }
    }
    const char *verdict = (mismatches > 0) ? "Has errors" : "No errors";
    printf("%s\n", verdict);
}



================================================
FILE: CUDA/chapter4_CUDA程序的错误检测/Makefile
================================================
# Default target: build every error-checking example in this chapter.
all: check1api check2kernel memcheck

# check1api.cu and check2kernel.cu include error.cuh, so a change to the
# header must trigger a rebuild as well.
check1api: check1api.cu error.cuh
	nvcc check1api.cu -o check1api

check2kernel: check2kernel.cu error.cuh
	nvcc check2kernel.cu -o check2kernel

memcheck: memcheck.cu
	nvcc memcheck.cu -o memcheck

# Neither `all` nor `clean` names a real file; declaring both phony keeps
# them working even if a file called `all` or `clean` appears in the directory.
.PHONY: all clean

# Remove the built executables.
clean:
	rm -rf check1api check2kernel memcheck


================================================
FILE: CUDA/chapter4_CUDA程序的错误检测/check1api.cu
================================================
#include "error.cuh"
#include <math.h>
#include <stdio.h>

const double EPSILON = 1.0e-15;
const double a = 1.23;
const double b = 2.34;
const double c = 3.57;
void __global__ add(const double *x, const double *y, double *z, const int N);
void check(const double *z, const int N);

// Host driver for the GPU vector addition with every CUDA runtime call
// wrapped in CHECK.
//
// NOTE(review): both input copies pass cudaMemcpyDeviceToHost although the
// data goes from host to device. In a chapter about error detection this
// looks like a deliberate mistake meant to trigger CHECK, so it is preserved
// — confirm before "fixing" it.
int main(int argc, char *argv[])
{
    const int N = 100000000;
    const int num_bytes = sizeof(double) * N;

    // Host buffers.
    double *host_x = (double*) malloc(num_bytes);
    double *host_y = (double*) malloc(num_bytes);
    double *host_z = (double*) malloc(num_bytes);
    for (int i = 0; i < N; ++i)
    {
        host_x[i] = a;
        host_y[i] = b;
    }

    // Device buffers.
    double *dev_x, *dev_y, *dev_z;
    CHECK(cudaMalloc((void **)&dev_x, num_bytes));
    CHECK(cudaMalloc((void **)&dev_y, num_bytes));
    CHECK(cudaMalloc((void **)&dev_z, num_bytes));

    // Direction flag does not match the pointers (see the note above).
    CHECK(cudaMemcpy(dev_x, host_x, num_bytes, cudaMemcpyDeviceToHost));
    CHECK(cudaMemcpy(dev_y, host_y, num_bytes, cudaMemcpyDeviceToHost));

    // Round the block count up so the tail elements are covered.
    const int threads_per_block = 128;
    const int num_blocks = (N + threads_per_block - 1) / threads_per_block;
    add<<<num_blocks, threads_per_block>>>(dev_x, dev_y, dev_z, N);

    CHECK(cudaMemcpy(host_z, dev_z, num_bytes, cudaMemcpyDeviceToHost));
    check(host_z, N);

    free(host_x);
    free(host_y);
    free(host_z);
    CHECK(cudaFree(dev_x));
    CHECK(cudaFree(dev_y));
    CHECK(cudaFree(dev_z));
    return 0;
}

// Kernel: thread with global index idx writes z[idx] = x[idx] + y[idx].
// Threads whose index is N or larger do nothing.
void __global__ add(const double *x, const double *y, double *z, const int N)
{
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx >= N)
    {
        return;
    }
    z[idx] = x[idx] + y[idx];
}

// Compares every one of the N elements of z with the expected constant c and
// prints "Has errors" if any differs by more than EPSILON, otherwise
// "No errors". The whole array is always scanned.
void check(const double *z, const int N)
{
    int mismatches = 0;
    for (int i = 0; i < N; ++i)
    {
        const double deviation = fabs(z[i] - c);
        if (deviation > EPSILON)
        {
            ++mismatches;
        }
    }
    const char *verdict = (mismatches > 0) ? "Has errors" : "No errors";
    printf("%s\n", verdict);
}

================================================
FILE: CUDA/chapter4_CUDA程序的错误检测/check2kernel.cu
================================================
#include "error.cuh"
#include <math.h>
#include <stdio.h>

const double EPSILON = 1.0e-15;
const double a = 1.23;
const double b = 2.34;
const double c = 3.57;
void __global__ add(const double *x, const double *y, double *z, const int N);
void check(const double *z, const int N);

// Host driver for the GPU vector addition that also checks the kernel
// launch itself: cudaGetLastError catches launch failures and
// cudaDeviceSynchronize surfaces errors raised while the kernel runs.
//
// NOTE(review): the block size of 1280 is larger than the 1024 threads per
// block that current GPUs accept. In a chapter about error detection this
// looks like a deliberate mistake meant to trigger the launch check, so it
// is preserved — confirm before "fixing" it.
int main(int argc, char *argv[])
{
    const int N = 100000000;
    const int num_bytes = sizeof(double) * N;

    // Host buffers.
    double *host_x = (double*) malloc(num_bytes);
    double *host_y = (double*) malloc(num_bytes);
    double *host_z = (double*) malloc(num_bytes);
    for (int i = 0; i < N; ++i)
    {
        host_x[i] = a;
        host_y[i] = b;
    }

    // Device buffers, followed by the upload of both inputs.
    double *dev_x, *dev_y, *dev_z;
    CHECK(cudaMalloc((void **)&dev_x, num_bytes));
    CHECK(cudaMalloc((void **)&dev_y, num_bytes));
    CHECK(cudaMalloc((void **)&dev_z, num_bytes));
    CHECK(cudaMemcpy(dev_x, host_x, num_bytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(dev_y, host_y, num_bytes, cudaMemcpyHostToDevice));

    // Oversized block (see the note above); the block count is rounded up.
    const int threads_per_block = 1280;
    const int num_blocks = (N + threads_per_block - 1) / threads_per_block;
    add<<<num_blocks, threads_per_block>>>(dev_x, dev_y, dev_z, N);
    CHECK(cudaGetLastError());
    CHECK(cudaDeviceSynchronize());

    CHECK(cudaMemcpy(host_z, dev_z, num_bytes, cudaMemcpyDeviceToHost));
    check(host_z, N);

    free(host_x);
    free(host_y);
    free(host_z);
    CHECK(cudaFree(dev_x));
    CHECK(cudaFree(dev_y));
    CHECK(cudaFree(dev_z));
    return 0;
}

// Kernel: thread with global index idx writes z[idx] = x[idx] + y[idx].
// Threads whose index is N or larger do nothing.
void __global__ add(const double *x, const double *y, double *z, const int N)
{
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx >= N)
    {
        return;
    }
    z[idx] = x[idx] + y[idx];
}

// Compares every one of the N elements of z with the expected constant c and
// prints "Has errors" if any differs by more than EPSILON, otherwise
// "No errors". The whole array is always scanned.
void check(const double *z, const int N)
{
    int mismatches = 0;
    for (int i = 0; i < N; ++i)
    {
        const double deviation = fabs(z[i] - c);
        if (deviation > EPSILON)
        {
            ++mismatches;
        }
    }
    const char *verdict = (mismatches > 0) ? "Has errors" : "No errors";
    printf("%s\n", verdict);
}

================================================
FILE: CUDA/chapter4_CUDA程序的错误检测/error.cuh
================================================

#pragma once
#include <stdio.h>

// CHECK(call): evaluates a CUDA runtime call exactly once. If the result is
// not cudaSuccess, prints the file, line, numeric error code and error text
// to stdout and terminates the program with exit status 1.
//
// The argument is parenthesised so that any expression can be passed safely,
// and the do { ... } while (0) wrapper makes the macro behave like a single
// statement (usable in an unbraced if/else).
#define CHECK(call)                                   \
do                                                    \
{                                                     \
    const cudaError_t error_code = (call);            \
    if (error_code != cudaSuccess)                    \
    {                                                 \
        printf("CUDA Error:\n");                      \
        printf("    File:       %s\n", __FILE__);     \
        printf("    Line:       %d\n", __LINE__);     \
        printf("    Error code: %d\n", error_code);   \
        printf("    Error text: %s\n",                \
            cudaGetErrorString(error_code));          \
        exit(1);                                      \
    }                                                 \
} while (0)



================================================
FILE: CUDA/chapter4_CUDA程序的错误检测/memcheck.cu
================================================
#include <cmath>
#include <cstdio>

const double EPSILON = 1.0e-15;
const double a = 1.23;
const double b = 2.34;
const double c = 3.57;
void __global__ add(const double *x, const double *y, double *z, const int N);
void check(const double *z, const int N);

// Host driver for the GPU vector addition used with the memory checker: the
// element count is not a multiple of the block size, so the rounded-up
// launch creates more threads than there are elements.
int main(int argc, char *argv[])
{
    const int N = 100000001;
    const int num_bytes = sizeof(double) * N;

    // Host buffers.
    double *host_x = (double*) malloc(num_bytes);
    double *host_y = (double*) malloc(num_bytes);
    double *host_z = (double*) malloc(num_bytes);
    for (int i = 0; i < N; ++i)
    {
        host_x[i] = a;
        host_y[i] = b;
    }

    // Device buffers, followed by the upload of both inputs.
    double *dev_x, *dev_y, *dev_z;
    cudaMalloc((void **)&dev_x, num_bytes);
    cudaMalloc((void **)&dev_y, num_bytes);
    cudaMalloc((void **)&dev_z, num_bytes);
    cudaMemcpy(dev_x, host_x, num_bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(dev_y, host_y, num_bytes, cudaMemcpyHostToDevice);

    // Round the block count up; the last block is only partially needed.
    const int threads_per_block = 128;
    const int num_blocks = (N + threads_per_block - 1) / threads_per_block;
    add<<<num_blocks, threads_per_block>>>(dev_x, dev_y, dev_z, N);

    cudaMemcpy(host_z, dev_z, num_bytes, cudaMemcpyDeviceToHost);
    check(host_z, N);

    free(host_x);
    free(host_y);
    free(host_z);
    cudaFree(dev_x);
    cudaFree(dev_y);
    cudaFree(dev_z);
    return 0;
}

// Kernel: each thread adds one pair of elements at its global index n.
// N is accepted but not used, because the bounds guard below is disabled.
//
// NOTE(review): with the guard commented out, threads whose index is N or
// larger access memory past the end of the arrays. This looks intentional —
// the file ends with a `cuda-memcheck ./memcheck` hint — so confirm before
// re-enabling the guard.
void __global__ add(const double *x, const double *y, double *z, const int N)
{
    const int n = blockDim.x * blockIdx.x + threadIdx.x;
    // Disabled bounds guard:
    // if (n < N)
    z[n] = x[n] + y[n];
}

// Compares every one of the N elements of z with the expected constant c and
// prints "Has errors" if any differs by more than EPSILON, otherwise
// "No errors". The whole array is always scanned.
void check(const double *z, const int N)
{
    int mismatches = 0;
    for (int i = 0; i < N; ++i)
    {
        const double deviation = fabs(z[i] - c);
        if (deviation > EPSILON)
        {
            ++mismatches;
        }
    }
    const char *verdict = (mismatches > 0) ? "Has errors" : "No errors";
    printf("%s\n", verdict);
}

// cuda-memcheck ./memcheck


================================================
FILE: CUDA/chapter5_获得GPU加速的关键/Makefile
================================================
# Default target: build every timing example in this chapter.
# All programs are compiled with -O3 and -DUSE_DP (double precision).
all: add1cpu add2gpu add3memcpy arithmetic1cpu arithmetic2gpu

# The three add* sources include error.cuh, so a change to the header must
# trigger a rebuild as well.
add1cpu: add1cpu.cu error.cuh
	nvcc -O3 -DUSE_DP add1cpu.cu -o add1cpu

add2gpu: add2gpu.cu error.cuh
	nvcc -O3 -DUSE_DP add2gpu.cu -o add2gpu

add3memcpy: add3memcpy.cu error.cuh
	nvcc -O3 -DUSE_DP add3memcpy.cu -o add3memcpy

arithmetic1cpu: arithmetic1cpu.cu
	nvcc -O3 -DUSE_DP arithmetic1cpu.cu -o arithmetic1cpu

arithmetic2gpu: arithmetic2gpu.cu
	nvcc -O3 -DUSE_DP arithmetic2gpu.cu -o arithmetic2gpu

# Neither `all` nor `clean` names a real file; declaring both phony keeps
# them working even if a file called `all` or `clean` appears in the directory.
.PHONY: all clean

# Remove the built executables.
clean:
	rm -rf add1cpu add2gpu add3memcpy arithmetic1cpu arithmetic2gpu


================================================
FILE: CUDA/chapter5_获得GPU加速的关键/add1cpu.cu
================================================
#include "error.cuh"
#include <math.h>
#include <stdio.h>

#ifdef USE_DP
    typedef double real;
    const real EPSILON = 1.0e-15;
#else
    typedef float real;
    const real EPSILON = 1.0e-6f;
#endif

const int NUM_REPEATS = 10;
const real a = 1.23;
const real b = 2.34;
const real c = 3.57;
void add(const real *x, const real *y, real *z, const int N);
void check(const real *z, const int N);

// Times the host-side vector addition with CUDA events. The addition is run
// NUM_REPEATS + 1 times; the first run is treated as a warm-up and excluded
// from the reported mean and standard deviation.
int main(int argc, char *argv[])
{
    const int N = 100000000;
    const int M = sizeof(real) * N;
    real *x = (real*) malloc(M);
    real *y = (real*) malloc(M);
    real *z = (real*) malloc(M);

    for (int n = 0; n < N; ++n)
    {
        x[n] = a;
        y[n] = b;
    }

    // Running sums of the elapsed time and of its square.
    float t_sum = 0;
    float t2_sum = 0;
    for (int repeat = 0; repeat <= NUM_REPEATS; ++repeat)
    {
        cudaEvent_t start, stop;
        CHECK(cudaEventCreate(&start));
        CHECK(cudaEventCreate(&stop));
        CHECK(cudaEventRecord(start));
        // Deliberately not wrapped in CHECK: the query may legitimately
        // return cudaErrorNotReady, which is not a failure here.
        cudaEventQuery(start);

        add(x, y, z, N);

        CHECK(cudaEventRecord(stop));
        CHECK(cudaEventSynchronize(stop));
        float elapsed_time;
        CHECK(cudaEventElapsedTime(&elapsed_time, start, stop));
        printf("Time = %g ms.\n", elapsed_time);

        // Skip the warm-up run (repeat == 0).
        if (repeat > 0)
        {
            t_sum += elapsed_time;
            t2_sum += elapsed_time * elapsed_time;
        }

        CHECK(cudaEventDestroy(start));
        CHECK(cudaEventDestroy(stop));
    }

    const float t_ave = t_sum / NUM_REPEATS;
    // When the timings are nearly identical, float rounding can push this
    // variance estimate slightly below zero; clamp it so that sqrt does not
    // return NaN.
    float t_var = t2_sum / NUM_REPEATS - t_ave * t_ave;
    if (t_var < 0.0f)
    {
        t_var = 0.0f;
    }
    const float t_err = sqrt(t_var);
    printf("Time = %g +- %g ms.\n", t_ave, t_err);

    check(z, N);

    free(x);
    free(y);
    free(z);
    return 0;
}

// Element-wise sum on the host: z[i] = x[i] + y[i] for i in [0, N).
// Nothing is written when N <= 0.
void add(const real *x, const real *y, real *z, const int N)
{
    int i = 0;
    while (i < N)
    {
        z[i] = x[i] + y[i];
        ++i;
    }
}

// Compares every one of the N elements of z with the expected constant c and
// prints "Has errors" if any differs by more than EPSILON, otherwise
// "No errors". The whole array is always scanned.
void check(const real *z, const int N)
{
    int mismatches = 0;
    for (int i = 0; i < N; ++i)
    {
        if (fabs(z[i] - c) > EPSILON)
        {
            ++mismatches;
        }
    }
    const char *verdict = (mismatches > 0) ? "Has errors" : "No errors";
    printf("%s\n", verdict);
}


================================================
FILE: CUDA/chapter5_获得GPU加速的关键/add2gpu.cu
================================================
#include "error.cuh"
#include <math.h>
#include <stdio.h>

#ifdef USE_DP
    typedef double real;
    const real EPSILON = 1.0e-15;
#else
    typedef float real;
    const real EPSILON = 1.0e-6f;
#endif

const int NUM_REPEATS = 10;
const real a = 1.23;
const real b = 2.34;
const real c = 3.57;

void __global__ add(const real *x, const real *y, real *z, const int N);
void check(const real *z, const int N);

// Times the GPU vector-addition kernel with CUDA events. Only the kernel is
// inside the timed region; the host/device copies happen outside it. The
// kernel is run NUM_REPEATS + 1 times; the first run is treated as a warm-up
// and excluded from the reported mean and standard deviation.
int main(int argc, char *argv[])
{
    const int N = 100000000;
    const int M = sizeof(real) * N;
    real *h_x = (real*)malloc(M);
    real *h_y = (real*)malloc(M);
    real *h_z = (real*)malloc(M);

    for (int n = 0; n < N; ++n)
    {
        h_x[n] = a;
        h_y[n] = b;
    }

    real *d_x, *d_y, *d_z;
    CHECK(cudaMalloc((void **)&d_x, M));
    CHECK(cudaMalloc((void **)&d_y, M));
    CHECK(cudaMalloc((void **)&d_z, M));
    CHECK(cudaMemcpy(d_x, h_x, M, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(d_y, h_y, M, cudaMemcpyHostToDevice));

    // Round the block count up so the tail elements are covered.
    const int block_size = 128;
    const int grid_size = (N + block_size - 1) / block_size;

    // Running sums of the elapsed time and of its square.
    float t_sum = 0;
    float t2_sum = 0;
    for (int repeat = 0; repeat <= NUM_REPEATS; ++repeat)
    {
        cudaEvent_t start, stop;
        CHECK(cudaEventCreate(&start));
        CHECK(cudaEventCreate(&stop));
        CHECK(cudaEventRecord(start));
        // Deliberately not wrapped in CHECK: the query may legitimately
        // return cudaErrorNotReady, which is not a failure here.
        cudaEventQuery(start);

        add<<<grid_size, block_size>>>(d_x, d_y, d_z, N);

        CHECK(cudaEventRecord(stop));
        CHECK(cudaEventSynchronize(stop));
        float elapsed_time;
        CHECK(cudaEventElapsedTime(&elapsed_time, start, stop));
        printf("Time = %g ms.\n", elapsed_time);

        // Skip the warm-up run (repeat == 0).
        if (repeat > 0)
        {
            t_sum += elapsed_time;
            t2_sum += elapsed_time * elapsed_time;
        }

        CHECK(cudaEventDestroy(start));
        CHECK(cudaEventDestroy(stop));

    }

    const float t_ave = t_sum / NUM_REPEATS;
    // When the timings are nearly identical, float rounding can push this
    // variance estimate slightly below zero; clamp it so that sqrt does not
    // return NaN.
    float t_var = t2_sum / NUM_REPEATS - t_ave * t_ave;
    if (t_var < 0.0f)
    {
        t_var = 0.0f;
    }
    const float t_err = sqrt(t_var);
    printf("Time = %g +- %g ms.\n", t_ave, t_err);

    CHECK(cudaMemcpy(h_z, d_z, M, cudaMemcpyDeviceToHost));
    check(h_z, N);

    free(h_x);
    free(h_y);
    free(h_z);
    CHECK(cudaFree(d_x));
    CHECK(cudaFree(d_y));
    CHECK(cudaFree(d_z));
    return 0;
}

// Kernel: each thread computes one element, z[i] = x[i] + y[i], where i is
// the thread's global index. Threads whose index is not below N do nothing.
void __global__ add(const real *x, const real *y, real *z, const int N)
{
    const int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i >= N)
    {
        return;
    }
    z[i] = x[i] + y[i];
}

// Host-side verification: an element is wrong when it differs from c by
// more than EPSILON. Prints "Has errors" if any element is wrong, otherwise
// "No errors".
void check(const real *z, const int N)
{
    bool all_match = true;
    for (int i = 0; i < N && all_match; ++i)
    {
        all_match = !(fabs(z[i] - c) > EPSILON);
    }
    const char *verdict = all_match ? "No errors" : "Has errors";
    printf("%s\n", verdict);
}

================================================
FILE: CUDA/chapter5_获得GPU加速的关键/add3memcpy.cu
================================================
#include "error.cuh"
#include <math.h>
#include <stdio.h>

// Floating-point type used throughout the file: double when USE_DP is
// defined at compile time, float otherwise. EPSILON is the tolerance that
// check() applies when comparing results.
#ifdef USE_DP
    typedef double real;
    const real EPSILON = 1.0e-15;
#else
    typedef float real;
    const real EPSILON = 1.0e-6f;
#endif

// Number of timed runs that enter the statistics (one extra run is made
// first and excluded).
const int NUM_REPEATS = 10;
// Fill values for the inputs (a, b) and the expected sum checked by check() (c).
const real a = 1.23;
const real b = 2.34;
const real c = 3.57;

// Forward declarations; both functions are defined after main().
void __global__ add(const real *x, const real *y, real *z, const int N);
void check(const real *z, const int N);

// Adds two arrays of N elements on the GPU. Unlike a kernel-only
// measurement, the timed region here covers both host-to-device uploads,
// the kernel launch and the device-to-host download of the result.
int main(int argc, char *argv[])
{
    const int N = 100000000;
    const int M = sizeof(real) * N; // size of one array in bytes

    // Host-side inputs and output.
    real *h_x = (real*)malloc(M);
    real *h_y = (real*)malloc(M);
    real *h_z = (real*)malloc(M);
    for (int i = 0; i < N; i++)
    {
        h_x[i] = a;
        h_y[i] = b;
    }

    // Device-side buffers (filled inside the timed loop).
    real *d_x, *d_y, *d_z;
    CHECK(cudaMalloc((void **)&d_x, M));
    CHECK(cudaMalloc((void **)&d_y, M));
    CHECK(cudaMalloc((void **)&d_z, M));

    // The block count is rounded up so that grid_size * block_size >= N.
    const int block_size = 128;
    const int grid_size = (N + block_size - 1) / block_size;

    float total_ms = 0;
    float total_ms_sq = 0;
    int run = 0;
    // NUM_REPEATS + 1 runs; run 0 is printed but not counted.
    while (run <= NUM_REPEATS)
    {
        cudaEvent_t t_begin, t_end;
        CHECK(cudaEventCreate(&t_begin));
        CHECK(cudaEventCreate(&t_end));
        CHECK(cudaEventRecord(t_begin));
        // Not wrapped in CHECK: the return value is ignored here.
        cudaEventQuery(t_begin);

        // --- timed region: upload, compute, download ---
        CHECK(cudaMemcpy(d_x, h_x, M, cudaMemcpyHostToDevice));
        CHECK(cudaMemcpy(d_y, h_y, M, cudaMemcpyHostToDevice));

        add<<<grid_size, block_size>>>(d_x, d_y, d_z, N);

        CHECK(cudaMemcpy(h_z, d_z, M, cudaMemcpyDeviceToHost));
        // --- end of timed region ---

        CHECK(cudaEventRecord(t_end));
        CHECK(cudaEventSynchronize(t_end));
        float ms;
        CHECK(cudaEventElapsedTime(&ms, t_begin, t_end));
        printf("Time = %g ms.\n", ms);

        CHECK(cudaEventDestroy(t_begin));
        CHECK(cudaEventDestroy(t_end));

        if (run != 0)
        {
            total_ms += ms;
            total_ms_sq += ms * ms;
        }
        ++run;
    }

    // Mean and standard deviation over the counted runs.
    const float mean_ms = total_ms / NUM_REPEATS;
    const float spread_ms = sqrt(total_ms_sq / NUM_REPEATS - mean_ms * mean_ms);
    printf("Time = %g +- %g ms.\n", mean_ms, spread_ms);

    // h_z already holds the result of the last run.
    check(h_z, N);

    free(h_x);
    free(h_y);
    free(h_z);
    CHECK(cudaFree(d_x));
    CHECK(cudaFree(d_y));
    CHECK(cudaFree(d_z));
    return 0;
}

// Kernel: one thread per element; writes x[idx] + y[idx] into z[idx] for
// every global thread index idx below N.
void __global__ add(const real *x, const real *y, real *z, const int N)
{
    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
    const bool in_range = idx < N;
    if (in_range)
    {
        const real lhs = x[idx];
        const real rhs = y[idx];
        z[idx] = lhs + rhs;
    }
}

// Host-side verification of the result array: prints "Has errors" when at
// least one element differs from c by more than EPSILON, "No errors"
// otherwise.
void check(const real *z, const int N)
{
    // Advance until the first element outside the tolerance (or the end).
    int i = 0;
    while (i < N && !(fabs(z[i] - c) > EPSILON))
    {
        ++i;
    }
    // Stopping before N means a bad element was found.
    printf("%s\n", (i < N) ? "Has errors" : "No errors");
}

// nvprof ./add3memcpy

================================================
FILE: CUDA/chapter5_获得GPU加速的关键/arithmetic1cpu.cu
================================================
#include "error.cuh"
#include <cmath>
#include <cstdio>

// Floating-point type used throughout the file: double when USE_DP is
// defined at compile time, float otherwise.
#ifdef USE_DP
    typedef double real;
#else
    typedef float real;
#endif

// Number of timed runs that enter the statistics (one extra run is made
// first and excluded).
const int NUM_REPEATS = 10;
// Threshold passed to arithmetic(): each element is incremented until its
// square root is no longer below this value.
const real x0 = 100.0;
// Host routine under test; defined after main().
void arithmetic(real *x, const real x0, const int N);

// Times the host-side arithmetic() routine on an array of N elements using
// CUDA events, printing each run's time followed by the mean and spread.
int main(int argc, char *argv[])
{
    const int N = 10000;
    const int M = sizeof(real) * N; // buffer size in bytes
    real *x = (real*)malloc(M);

    float total_ms = 0;
    float total_ms_sq = 0;

    int run = 0;
    // NUM_REPEATS + 1 runs; run 0 is printed but not counted.
    while (run <= NUM_REPEATS)
    {
        // Reset the data so every run starts from zeros (outside the timed
        // region).
        for (int i = 0; i < N; i++)
        {
            x[i] = 0.0;
        }

        cudaEvent_t t_begin, t_end;
        CHECK(cudaEventCreate(&t_begin));
        CHECK(cudaEventCreate(&t_end));
        CHECK(cudaEventRecord(t_begin));
        // Not wrapped in CHECK: the return value is ignored here.
        cudaEventQuery(t_begin);

        arithmetic(x, x0, N);

        CHECK(cudaEventRecord(t_end));
        CHECK(cudaEventSynchronize(t_end));
        float ms;
        CHECK(cudaEventElapsedTime(&ms, t_begin, t_end));
        printf("Time = %g ms.\n", ms);

        CHECK(cudaEventDestroy(t_begin));
        CHECK(cudaEventDestroy(t_end));

        if (run != 0)
        {
            total_ms += ms;
            total_ms_sq += ms * ms;
        }
        ++run;
    }

    // Mean and standard deviation over the counted runs.
    const float mean_ms = total_ms / NUM_REPEATS;
    const float spread_ms = sqrt(total_ms_sq / NUM_REPEATS - mean_ms * mean_ms);
    printf("Time = %g +- %g ms.\n", mean_ms, spread_ms);

    free(x);
    return 0;
}

// For each of the N elements of x: starting from the stored value, keep
// adding 1 while the square root of the value is below x0, then store the
// final value back.
void arithmetic(real *x, const real x0, const int N)
{
    for (int i = 0; i < N; i++)
    {
        real value = x[i];
        for (; sqrt(value) < x0; ++value)
        {
            // all the work is in the loop header
        }
        x[i] = value;
    }
}

================================================
FILE: CUDA/chapter5_获得GPU加速的关键/arithmetic2gpu.cu
================================================
#include "error.cuh"
#include <cmath>
#include <cstdio>

// Floating-point type used throughout the file: double when USE_DP is
// defined at compile time, float otherwise.
#ifdef USE_DP
    typedef double real;
#else
    typedef float real;
#endif

// Number of timed runs that enter the statistics (one extra run is made
// first and excluded).
const int NUM_REPEATS = 10;
// Threshold passed to the kernel: each element is incremented until its
// square root is no longer below this value.
const real x0 = 100.0;
// Kernel under test; defined after main().
__global__ void arithmetic(real *d_x, const real x0, const int N);

// Times the arithmetic kernel on an array whose length N is given as the
// single command-line argument. Prints each run's time followed by the mean
// and spread over NUM_REPEATS counted runs (an initial run is excluded).
//
// Returns 1 without touching the GPU when the argument is missing or is not
// a positive integer.
int main(int argc, char *argv[])
{
    if (argc != 2)
    {
        printf("usage: %s N\n", argv[0]);
        // Stop here: without this, argv[1] below would be a null pointer
        // when no argument is supplied.
        return 1;
    }
    const int N = atoi(argv[1]);
    if (N <= 0)
    {
        // A non-positive N would give an empty or negative-sized allocation
        // and a zero-block launch.
        printf("N must be a positive integer.\n");
        return 1;
    }
    const int block_size = 128;
    // Round up so that grid_size * block_size >= N.
    const int grid_size = (N + block_size - 1) / block_size;

    const int M = sizeof(real) * N; // buffer size in bytes
    real *h_x = (real *)malloc(M);
    real *d_x;

    CHECK(cudaMalloc((void **)&d_x, M));

    float t_sum = 0;
    float t2_sum = 0;
    for (int repeat = 0; repeat <= NUM_REPEATS; ++repeat)
    {
        // Reset the device data to zeros before every run (outside the timed
        // region).
        for (int n = 0; n < N; ++n)
        {
            h_x[n] = 0.0;
        }
        CHECK(cudaMemcpy(d_x, h_x, M, cudaMemcpyHostToDevice));

        cudaEvent_t start, stop;
        CHECK(cudaEventCreate(&start));
        CHECK(cudaEventCreate(&stop));
        CHECK(cudaEventRecord(start));
        // Not wrapped in CHECK: the return value is ignored here.
        cudaEventQuery(start);

        arithmetic<<<grid_size, block_size>>>(d_x, x0, N);

        CHECK(cudaEventRecord(stop));
        CHECK(cudaEventSynchronize(stop));
        float elapsed_time;
        CHECK(cudaEventElapsedTime(&elapsed_time, start, stop));
        printf("Time = %g ms.\n", elapsed_time);

        // The first run (repeat == 0) is not counted.
        if (repeat > 0)
        {
            t_sum += elapsed_time;
            t2_sum += elapsed_time * elapsed_time;
        }

        CHECK(cudaEventDestroy(start));
        CHECK(cudaEventDestroy(stop));
    }

    // Mean and standard deviation over the counted runs.
    const float t_ave = t_sum / NUM_REPEATS;
    const float t_err = sqrt(t2_sum / NUM_REPEATS - t_ave * t_ave);
    printf("Time = %g +- %g ms.\n", t_ave, t_err);

    free(h_x);
    CHECK(cudaFree(d_x));

    return 0;
}

// Kernel: one thread per element. Starting from d_x[i], keeps adding 1 while
// the square root of the value is below x0, then writes the value back.
// Threads whose global index is not below N do nothing.
__global__ void arithmetic(real *d_x, const real x0, const int N)
{
    const int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i >= N)
    {
        return;
    }

    real value = d_x[i];
    for (; sqrt(value) < x0; ++value)
    {
        // all the work is in the loop header
    }
    d_x[i] = value;
}

================================================
FILE: CUDA/chapter5_获得GPU加速的关键/error.cuh
================================================
#pragma once
#include <stdio.h>

// CHECK(call): evaluates a CUDA runtime call once. If the result is not
// cudaSuccess, prints the source file, line, numeric error code and error
// text with printf and terminates the program via exit(1).
#define CHECK(call)                                   \
do                                                    \
{                                                     \
    const cudaError_t error_code = call;              \
    if (error_code == cudaSuccess)                    \
    {                                                 \
        break;                                        \
    }                                                 \
    printf("CUDA Error:\n");                          \
    printf("    File:       %s\n", __FILE__);         \
    printf("    Line:       %d\n", __LINE__);         \
    printf("    Error code: %d\n", error_code);       \
    printf("    Error text: %s\n",                    \
        cudaGetErrorString(error_code));              \
    exit(1);                                          \
} while (0)


================================================
FILE: CUDA/chapter6_CUDA的内存组织/Makefile
================================================
# Builds the two chapter 6 example programs with nvcc.
all: static query

static: static.cu
	nvcc -O3 -DUSE_DP static.cu -o static

# --ptxas-options=-v makes ptxas print its verbose report while compiling.
query: query.cu
	nvcc -O3 --ptxas-options=-v query.cu -o query

# `all` and `clean` name actions, not files; declaring both phony keeps make
# from skipping them if a file with either name ever exists.
.PHONY: all clean

clean:
	rm -rf static query


================================================
FILE: CUDA/chapter6_CUDA的内存组织/README.md
================================================
### 1. CUDA中设备内存的分类与特征

| 内存类型  | 物理位置 |   访问权限  | 可见范围 | 生命周期 |
|:------------:|:---------------:|:--------------:|:---------------:|:--------------:|
| 全局内存 | 在芯片外 | 可读可写 | 所有线程和主机端 | 由主机分配与释放 |
| 常量内存 | 在芯片外 | 仅可读 | 所有线程和主机端 | 由主机分配与释放 |
| 纹理和表面内存 | 在芯片外 | 一般仅可读 | 所有线程和主机端 | 由主机分配和释放 |
|寄存器内存|在芯片内|可读可写|单个线程|所在线程| 
|局部内存|在芯片外|可读可写|单个线程|所在线程|
|共享内存|在芯片内|可读可写|单个线程块|所在线程块|

### 2. CUDA中的内存组织示意图

![](./pic/CUDA%E5%86%85%E5%AD%98%E7%BB%84%E7%BB%87%E7%A4%BA%E6%84%8F%E5%9B%BE.png)

### 3. 几个不同能力的GPU中与寄存器和共享内存有关的技术指标

| 计算能力  | 3.5 |   6.0  | 7.0 | 7.5 |
|:------------:|:---------------:|:--------------:|:---------------:|:--------------:|
|GPU代表|Tesla K40|Tesla P100|Tesla V100|GeForce RTX 2080|
|SM寄存器数上限|64K|64K|64K|64K|
|单个线程块寄存器数上限|64K|64K|64K|64K|
|单个线程寄存器数上限|255|255|255|255|
|SM共享内存上限/KB|48|64|96|64|
|单个线程块共享内存上限/KB|48|48|96|64|

### 4. SM及其占有率

#### 4.1 SM的构成

一个GPU是由多个SM(Streaming Multiprocessor)构成的。一个SM包含如下资源：

- (1) 一定数量的**寄存器**。
- (2) 一定数量的**共享内存**。
- (3) 常量内存的缓存。
- (4) 纹理和表面内存的缓存。
- (5) L1缓存。
- (6) 两个（计算能力6.0）或4个（其他计算能力）**线程束调度器**（warp scheduler），用于在不同线程的上下文之间迅速地切换，以及为准备就绪的线程束发出执行指令。
- (7) 执行核心，包括：
    - a) 若干整型数运算的核心（INT32）。
    - b) 若干单精度浮点数运算的核心（FP32）。
    - c) 若干双精度浮点数运算的核心（FP64）。
    - d) 若干单精度浮点数超越函数（transcendental functions）的特殊函数单元（special function units, SFUs）。
    - e) 若干混合精度的张量核心（tensor cores，由伏特架构引入，适用于机器学习中的低精度矩阵计算）。

#### 4.2 SM的占有率

要分析SM的理论占有率，需要知道第三节中提到的寄存器和共享内存的上限，还有第2章中提到的**一个线程块（无论几维的）中线程数不能超过1024**的限制。还需要知道两个指标：

- (1) 一个SM中最多能拥有的线程块个数为$N_b=16$(开普勒架构和图灵架构)或者$N_b=32$(麦克斯韦架构、帕斯卡架构和伏特架构)。
- (2) 一个SM中最多能拥有的线程个数$N_t=2048$（从开普勒架构到伏特架构）或者$N_t=1024$（图灵架构）。

在并行规模足够大（即核函数执行配置中定义的总线程数足够多）的前提下分几种情况来分析SM的理论占有率：

- (1) **寄存器和共享内存使用量很小的情况**。此时，SM的占有率完全由执行配置中的线程块大小决定。关于线程块大小，读者也许注意到我们之前一直用128。这是因为，SM中线程的执行是以线程束为单位的，所以最好将线程块大小取为线程束大小（32个线程）的整数倍。例如，假设将线程块大小定义为100，那么一个线程块中将有3个完整的线程束（一共96个线程）和一个不完整的线程束（只有4个线程）。在执行核函数中的指令时，不完整的线程束花的时间和完整的线程束花费的时间一样，这就无形中浪费了计算资源。所以，建议将线程块大小取为32的整数倍。在该前提下，任何不小于$N_t/N_b$而且能整除$N_t$的线程块大小都能得到100%的占有率。根据我们列出的数据，线程块大小不小于128时开普勒架构能获得100%的占有率；线程块大小不小于64时其他架构能获得100%的占有率。作者近几年都用一块开普勒架构的Tesla K40开发程序，所以习惯了在一般情况下用128的线程块大小。

- (2) **有限寄存器数量对占有率的约束情况**。我们只针对第三节中列出的几个计算能力进行分析，读者可以类似地分析其他未列出的计算能力。对于第三节中列出的所有计算能力，一个SM最多能使用的寄存器个数为64K（64 x 1024）。除图灵架构外，如果我们希望在一个SM中驻留最多的线程（2048个），核函数中的每个线程最多只能用32个寄存器。当每个线程所用寄存器个数大于64时，SM的占有率将小于50%；当每个线程所用寄存器个数大于128时，SM的占有率将小于25%。对于图灵架构，同样的占有率允许使用更多的寄存器。

- (3) **有限的共享内存对占有率的约束情况**。因为共享内存的数量随着计算能力的上升没有显著的变化规律，所以我们这里仅对计算能力3.5进行分析，对其他计算能力可以类似地分析。如果线程块大小为128，那么每个SM要激活16个线程块才能有2048个线程，达到100%的占有率。此时，一个线程块最多能使用3KB的共享内存。在不改变线程块大小的情况下，要达到50%的占有率，一个线程块最多能使用6KB的共享内存；要达到25%的占有率，一个线程块最多能使用12KB的共享内存。如果一个线程块使用了超过48KB的共享内存，会直接导致核函数无法运行。对其他线程块大小可进行类似的分析。


================================================
FILE: CUDA/chapter6_CUDA的内存组织/error.cuh
================================================
#pragma once
#include <cstdio>

// CHECK(call): evaluates a CUDA runtime call once. If the result is not
// cudaSuccess, prints the source file, line, numeric error code and error
// text with printf and terminates the program via exit(1).
#define CHECK(call)                                   \
do                                                    \
{                                                     \
    const cudaError_t error_code = call;              \
    if (error_code == cudaSuccess)                    \
    {                                                 \
        break;                                        \
    }                                                 \
    printf("CUDA Error:\n");                          \
    printf("    File:       %s\n", __FILE__);         \
    printf("    Line:       %d\n", __LINE__);         \
    printf("    Error code: %d\n", error_code);       \
    printf("    Error text: %s\n",                    \
        cudaGetErrorString(error_code));              \
    exit(1);                                          \
} while (0)


================================================
FILE: CUDA/chapter6_CUDA的内存组织/query.cu
================================================
#include "error.cuh"
#include <cstdio>

// Prints selected properties of one CUDA device. The device index is taken
// from the first command-line argument when present, otherwise device 0 is
// used.
int main(int argc, char *argv[])
{
    const int device_id = (argc > 1) ? atoi(argv[1]) : 0;
    CHECK(cudaSetDevice(device_id));

    cudaDeviceProp prop;
    CHECK(cudaGetDeviceProperties(&prop, device_id));

    // Identification.
    printf("Device id:                                 %d\n", device_id);
    printf("Device name:                               %s\n", prop.name);
    printf("Compute capability:                        %d.%d\n",
        prop.major, prop.minor);

    // Global and constant memory sizes, converted from bytes.
    printf("Amount of global memory:                   %g GB\n",
        prop.totalGlobalMem / (1024.0 * 1024 * 1024));
    printf("Amount of constant memory:                 %g KB\n",
        prop.totalConstMem  / 1024.0);

    // Launch-configuration limits.
    printf("Maximum grid size:                         %d %d %d\n",
        prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
    printf("Maximum block size:                        %d %d %d\n",
        prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
    printf("Number of SMs:                             %d\n",
        prop.multiProcessorCount);

    // Shared memory and register limits, per block and per SM.
    printf("Maximum amount of shared memory per block: %g KB\n",
        prop.sharedMemPerBlock / 1024.0);
    printf("Maximum amount of shared memory per SM:    %g KB\n",
        prop.sharedMemPerMultiprocessor / 1024.0);
    printf("Maximum number of registers per block:     %d K\n",
        prop.regsPerBlock / 1024);
    printf("Maximum number of registers per SM:        %d K\n",
        prop.regsPerMultiprocessor / 1024);

    // Thread-count limits, per block and per SM.
    printf("Maximum number of threads per block:       %d\n",
        prop.maxThreadsPerBlock);
    printf("Maximum number of threads per SM:          %d\n",
        prop.maxThreadsPerMultiProcessor);

    return 0;
}

================================================
FILE: CUDA/chapter6_CUDA的内存组织/static.cu
================================================
#include "error.cuh"
#include <cstdio>

// Statically declared device variables. d_x is initialised at declaration;
// d_y is filled from the host in main() with cudaMemcpyToSymbol and read
// back with cudaMemcpyFromSymbol.
__device__ int d_x = 1;
__device__ int d_y[2];

// Kernel: adds d_x to both elements of d_y and prints the three values from
// the device.
void __global__ my_kernel(void)
{
    for (int i = 0; i < 2; ++i)
    {
        d_y[i] += d_x;
    }
    printf("d_x = %d, d_y[0] = %d, d_y[1] = %d.\n", d_x, d_y[0], d_y[1]);
}

// Copies two ints into the device array d_y, runs my_kernel with a single
// thread, then copies d_y back and prints it on the host.
int main(int argc, char *argv[])
{
    int host_y[2] = {10, 20};
    const size_t num_bytes = sizeof(int) * 2;

    // Host -> device symbol.
    CHECK(cudaMemcpyToSymbol(d_y, host_y, num_bytes));

    my_kernel<<<1, 1>>>();
    // Wait for the kernel to finish before reading the result back.
    CHECK(cudaDeviceSynchronize());

    // Device symbol -> host.
    CHECK(cudaMemcpyFromSymbol(host_y, d_y, num_bytes));
    printf("h_y[0] = %d, h_y[1] = %d.\n", host_y[0], host_y[1]);

    return 0;
}



================================================
FILE: CUDA/chapter7_全局内存的合理使用/Makefile
================================================
# Builds the chapter 7 matrix copy/transpose example with nvcc.
all: matrix

matrix: matrix.cu
	nvcc -arch=sm_50 matrix.cu -o matrix

# `all` and `clean` name actions, not files; declaring both phony keeps make
# from skipping them if a file with either name ever exists.
.PHONY: all clean

clean:
	rm -rf matrix


================================================
FILE: CUDA/chapter7_全局内存的合理使用/error.cuh
================================================
#pragma once
#include <stdio.h>

// CHECK(call): evaluates a CUDA runtime call once. If the result is not
// cudaSuccess, prints the source file, line, numeric error code and error
// text with printf and terminates the program via exit(1).
#define CHECK(call)                                   \
do                                                    \
{                                                     \
    const cudaError_t error_code = call;              \
    if (error_code == cudaSuccess)                    \
    {                                                 \
        break;                                        \
    }                                                 \
    printf("CUDA Error:\n");                          \
    printf("    File:       %s\n", __FILE__);         \
    printf("    Line:       %d\n", __LINE__);         \
    printf("    Error code: %d\n", error_code);       \
    printf("    Error text: %s\n",                    \
        cudaGetErrorString(error_code));              \
    exit(1);                                          \
} while (0)


================================================
FILE: CUDA/chapter7_全局内存的合理使用/matrix.cu
================================================
#include "error.cuh"
#include <cstdio>
#include <cuda_runtime.h>
#include <cuda.h>

// Floating-point type used throughout the file: double when USE_DP is
// defined at compile time, float otherwise.
#ifdef USE_DP
    typedef double real;
#else
    typedef float real;
#endif

// Number of timed runs that enter the statistics (one extra run is made
// first and excluded).
const int NUM_REPEATS = 10;
// Edge length of the square thread block used for every kernel launch.
const int TILE_DIM = 32;

// Launches the kernel selected by `task` (0 = copy, 1..3 = transpose1..3)
// repeatedly and prints timing statistics.
void timing(const real *d_A, real *d_B, const int N, const int task);

// Kernels operating on N x N matrices stored as flat arrays.
__global__ void copy(const real *A, real *B, const int N);
__global__ void transpose1(const real *A, real *B, const int N);
__global__ void transpose2(const real *A, real *B, const int N);
__global__ void transpose3(const real *A, real *B, const int N);
// Prints an N x N matrix row by row on the host.
void print_matrix(const int N, const real *A);

// Usage: <program> N
// Builds an N x N matrix A with A[i] = i, uploads it, and times the copy
// kernel and the three transpose kernels on it. For N <= 10 the input and
// the output of the last timed kernel are printed.
int main(int argc, char *argv[])
{
    if (argc != 2)
    {
        printf("usage: %s N\n", argv[0]);
        exit(1);
    }
    const int N = atoi(argv[1]);

    const int num_elements = N * N;
    const int num_bytes = sizeof(real) * num_elements;

    // Host matrices: h_A is the input, h_B receives the result.
    real *h_A = (real *) malloc(num_bytes);
    real *h_B = (real *) malloc(num_bytes);
    for (int i = 0; i < num_elements; i++)
    {
        h_A[i] = i;
    }

    // Device matrices; only the input needs uploading.
    real *d_A, *d_B;
    CHECK(cudaMalloc(&d_A, num_bytes));
    CHECK(cudaMalloc(&d_B, num_bytes));
    CHECK(cudaMemcpy(d_A, h_A, num_bytes, cudaMemcpyHostToDevice));

    // Time each variant in turn; all of them write into d_B.
    printf("\ncopy:\n");
    timing(d_A, d_B, N, 0);
    printf("\ntranspose with coalesced read:\n");
    timing(d_A, d_B, N, 1);
    printf("\ntranspose with coalesced write:\n");
    timing(d_A, d_B, N, 2);
    printf("\ntranspose with coalesced write and __ldg read:\n");
    timing(d_A, d_B, N, 3);

    // d_B now holds the output of the last kernel that ran (task 3).
    CHECK(cudaMemcpy(h_B, d_B, num_bytes, cudaMemcpyDeviceToHost));
    const bool small_enough_to_print = (N <= 10);
    if (small_enough_to_print)
    {
        printf("A =\n");
        print_matrix(N, h_A);
        printf("\nB =\n");
        print_matrix(N, h_B);
    }

    free(h_A);
    free(h_B);
    CHECK(cudaFree(d_A));
    CHECK(cudaFree(d_B));
    return 0;
}

// Kernel: element-wise copy of the N x N matrix A into B, used as the
// timing baseline for the transpose kernels.
//
// The flat index is ny * N + nx, so threads with consecutive threadIdx.x
// touch consecutive addresses for both the read and the write. The previous
// form (nx * N + ny) produced the same B but made neighbouring threads
// stride by N elements, which made the baseline a strided copy.
__global__ void copy(const real *A, real *B, const int N)
{
    const int nx = blockIdx.x * TILE_DIM + threadIdx.x;
    const int ny = blockIdx.y * TILE_DIM + threadIdx.y;
    // Blocks on the right/bottom edge may extend past the matrix.
    if (nx < N && ny < N)
    {
        const int index = ny * N + nx;
        B[index] = A[index];
    }
}

// Kernel: transpose of the N x N matrix A into B. Threads with consecutive
// threadIdx.x read consecutive elements of A and write elements of B that
// are N apart (labelled "coalesced read" by main).
__global__ void transpose1(const real *A, real *B, const int N)
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    if (col >= N || row >= N)
    {
        return; // outside the matrix
    }
    const real value = A[row * N + col];
    B[col * N + row] = value;
}

// Kernel: transpose of the N x N matrix A into B. Threads with consecutive
// threadIdx.x write consecutive elements of B and read elements of A that
// are N apart (labelled "coalesced write" by main).
__global__ void transpose2(const real *A, real *B, const int N)
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    if (col >= N || row >= N)
    {
        return; // outside the matrix
    }
    const real value = A[col * N + row];
    B[row * N + col] = value;
}

// Kernel: same indexing as transpose2 (consecutive threadIdx.x write
// consecutive elements of B), but the element of A is loaded through the
// __ldg() intrinsic.
__global__ void transpose3(const real *A, real *B, const int N)
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    if (col >= N || row >= N)
    {
        return; // outside the matrix
    }
    const real *src = &A[col * N + row];
    B[row * N + col] = __ldg(src);
}

// Launches the kernel selected by `task` NUM_REPEATS + 1 times on the
// N x N matrices d_A (input) and d_B (output), timing each launch with CUDA
// events. Prints every run's time, then the mean and spread over all runs
// except the first.
//   task 0: copy, 1: transpose1, 2: transpose2, 3: transpose3.
// Any other value prints an error and exits with status 1.
void timing(const real *d_A, real *d_B, const int N, const int task)
{
    // Square grid of TILE_DIM x TILE_DIM blocks, rounded up to cover N.
    const int blocks_per_side = (N + TILE_DIM - 1) / TILE_DIM;
    const dim3 block_size(TILE_DIM, TILE_DIM);
    const dim3 grid_size(blocks_per_side, blocks_per_side);

    float total_ms = 0;
    float total_ms_sq = 0;
    for (int run = 0; run <= NUM_REPEATS; run++)
    {
        cudaEvent_t t_begin, t_end;
        CHECK(cudaEventCreate(&t_begin));
        CHECK(cudaEventCreate(&t_end));
        CHECK(cudaEventRecord(t_begin));
        // Not wrapped in CHECK: the return value is ignored here.
        cudaEventQuery(t_begin);

        if (task == 0)
        {
            copy<<<grid_size, block_size>>>(d_A, d_B, N);
        }
        else if (task == 1)
        {
            transpose1<<<grid_size, block_size>>>(d_A, d_B, N);
        }
        else if (task == 2)
        {
            transpose2<<<grid_size, block_size>>>(d_A, d_B, N);
        }
        else if (task == 3)
        {
            transpose3<<<grid_size, block_size>>>(d_A, d_B, N);
        }
        else
        {
            printf("Error: wrong task\n");
            exit(1);
        }

        CHECK(cudaEventRecord(t_end));
        CHECK(cudaEventSynchronize(t_end));
        float ms;
        CHECK(cudaEventElapsedTime(&ms, t_begin, t_end));
        printf("Time = %g ms.\n", ms);

        CHECK(cudaEventDestroy(t_begin));
        CHECK(cudaEventDestroy(t_end));

        // Run 0 is printed but not counted.
        if (run != 0)
        {
            total_ms += ms;
            total_ms_sq += ms * ms;
        }
    }

    const float mean_ms = total_ms / NUM_REPEATS;
    const float spread_ms = sqrt(total_ms_sq / NUM_REPEATS - mean_ms * mean_ms);
    printf("Time = %g +- %g ms.\n", mean_ms, spread_ms);
}

// Prints the N x N matrix stored in the flat array A, one matrix row per
// output line, each value followed by a tab.
void print_matrix(const int N, const real *A)
{
    for (int row = 0; row < N; ++row)
    {
        const real *line = A + row * N; // start of this row
        for (int col = 0; col < N; ++col)
        {
            printf("%g\t", line[col]);
        }
        printf("\n");
    }
}


================================================
FILE: CUDA/chapter8_共享内存的合理使用/Makefile
================================================
# Builds the chapter 8 example programs with nvcc.
all: reduce1cpu reduce2gpu bank

reduce1cpu: reduce1cpu.cu
	nvcc -arch=sm_50 reduce1cpu.cu -o reduce1cpu

reduce2gpu: reduce2gpu.cu
	nvcc -arch=sm_50 reduce2gpu.cu -o reduce2gpu

bank: bank.cu
	nvcc -arch=sm_50 bank.cu -o bank

# `all` and `clean` name actions, not files; declaring both phony keeps make
# from skipping them if a file with either name ever exists.
.PHONY: all clean

clean:
	rm -rf reduce1cpu reduce2gpu bank


================================================
FILE: CUDA/chapter8_共享内存的合理使用/bank.cu
================================================
#include "error.cuh"
#include <stdio.h>

// Floating-point type used throughout the file: double when USE_DP is
// defined at compile time, float otherwise.
#ifdef USE_DP
    typedef double real;
#else
    typedef float real;
#endif

// Number of timed runs that enter the statistics (one extra run is made
// first and excluded).
const int NUM_REPEATS = 10;
// Edge length of the square thread block and of the shared-memory tile.
const int TILE_DIM = 32;

// Launches the kernel selected by `task` (1 = transpose1, 2 = transpose2)
// repeatedly and prints timing statistics.
void timing(const real *d_A, real *d_B, const int N, const int task);
// Transpose kernels that stage a tile in shared memory; transpose2 pads the
// tile's second dimension by one element.
__global__ void transpose1(const real *A, real *B, const int N);
__global__ void transpose2(const real *A, real *B, const int N);
// Prints an N x N matrix row by row on the host.
void print_matrix(const int N, const real *A);

// Usage: <program> N
// Builds an N x N matrix A with A[i] = i, uploads it, and times the two
// shared-memory transpose kernels on it. For N <= 10 the input and the
// output of the last timed kernel are printed.
int main(int argc, char **argv)
{
    if (argc != 2)
    {
        printf("usage: %s N\n", argv[0]);
        exit(1);
    }
    const int N = atoi(argv[1]);

    const int num_elements = N * N;
    const int num_bytes = sizeof(real) * num_elements;

    // Host matrices: h_A is the input, h_B receives the result.
    real *h_A = (real *) malloc(num_bytes);
    real *h_B = (real *) malloc(num_bytes);
    for (int i = 0; i < num_elements; i++)
    {
        h_A[i] = i;
    }

    // Device matrices; only the input needs uploading.
    real *d_A, *d_B;
    CHECK(cudaMalloc(&d_A, num_bytes));
    CHECK(cudaMalloc(&d_B, num_bytes));
    CHECK(cudaMemcpy(d_A, h_A, num_bytes, cudaMemcpyHostToDevice));

    printf("\ntranspose with shared memory bank conflict:\n");
    timing(d_A, d_B, N, 1);
    printf("\ntranspose without shared memory bank conflict:\n");
    timing(d_A, d_B, N, 2);

    // d_B now holds the output of the last kernel that ran (task 2).
    CHECK(cudaMemcpy(h_B, d_B, num_bytes, cudaMemcpyDeviceToHost));
    const bool small_enough_to_print = (N <= 10);
    if (small_enough_to_print)
    {
        printf("A =\n");
        print_matrix(N, h_A);
        printf("\nB =\n");
        print_matrix(N, h_B);
    }

    free(h_A);
    free(h_B);
    CHECK(cudaFree(d_A));
    CHECK(cudaFree(d_B));
    return 0;
}

// Launches the kernel selected by `task` NUM_REPEATS + 1 times on the
// N x N matrices d_A (input) and d_B (output), timing each launch with CUDA
// events. Prints every run's time, then the mean and spread over all runs
// except the first.
//   task 1: transpose1, task 2: transpose2.
// Any other value prints an error and exits with status 1.
void timing(const real *d_A, real *d_B, const int N, const int task)
{
    // Square grid of TILE_DIM x TILE_DIM blocks, rounded up to cover N.
    const int blocks_per_side = (N + TILE_DIM - 1) / TILE_DIM;
    const dim3 block_size(TILE_DIM, TILE_DIM);
    const dim3 grid_size(blocks_per_side, blocks_per_side);

    float total_ms = 0;
    float total_ms_sq = 0;
    for (int run = 0; run <= NUM_REPEATS; run++)
    {
        cudaEvent_t t_begin, t_end;
        CHECK(cudaEventCreate(&t_begin));
        CHECK(cudaEventCreate(&t_end));
        CHECK(cudaEventRecord(t_begin));
        // Not wrapped in CHECK: the return value is ignored here.
        cudaEventQuery(t_begin);

        if (task == 1)
        {
            transpose1<<<grid_size, block_size>>>(d_A, d_B, N);
        }
        else if (task == 2)
        {
            transpose2<<<grid_size, block_size>>>(d_A, d_B, N);
        }
        else
        {
            printf("Error: wrong task\n");
            exit(1);
        }

        CHECK(cudaEventRecord(t_end));
        CHECK(cudaEventSynchronize(t_end));
        float ms;
        CHECK(cudaEventElapsedTime(&ms, t_begin, t_end));
        printf("Time = %g ms.\n", ms);

        CHECK(cudaEventDestroy(t_begin));
        CHECK(cudaEventDestroy(t_end));

        // Run 0 is printed but not counted.
        if (run != 0)
        {
            total_ms += ms;
            total_ms_sq += ms * ms;
        }
    }

    const float mean_ms = total_ms / NUM_REPEATS;
    const float spread_ms = sqrt(total_ms_sq / NUM_REPEATS - mean_ms * mean_ms);
    printf("Time = %g +- %g ms.\n", mean_ms, spread_ms);
}

// Kernel: transposes the N x N matrix A into B one TILE_DIM x TILE_DIM tile
// at a time. Each block first stages its tile of A in the shared array, then
// writes the tile to B with the roles of threadIdx.x and threadIdx.y
// swapped. The tile here is exactly TILE_DIM wide (no padding).
__global__ void transpose1(const real *A, real *B, const int N)
{
    __shared__ real tile[TILE_DIM][TILE_DIM];
    const int tx = threadIdx.x;
    const int ty = threadIdx.y;
    const int tile_x0 = blockIdx.x * TILE_DIM;
    const int tile_y0 = blockIdx.y * TILE_DIM;

    // Load phase: consecutive tx read consecutive elements of A.
    const int src_col = tile_x0 + tx;
    const int src_row = tile_y0 + ty;
    if (src_col < N && src_row < N)
    {
        tile[ty][tx] = A[src_row * N + src_col];
    }
    // Every thread of the block must reach this barrier, so there is no
    // early return above.
    __syncthreads();

    // Store phase: consecutive tx write consecutive elements of B.
    const int dst_row = tile_x0 + ty;
    const int dst_col = tile_y0 + tx;
    if (dst_row < N && dst_col < N)
    {
        B[dst_row * N + dst_col] = tile[tx][ty];
    }
}

// Kernel: same tiled transpose as transpose1, except that the shared tile is
// declared with TILE_DIM + 1 columns. main labels this variant "without
// shared memory bank conflict".
__global__ void transpose2(const real *A, real *B, const int N)
{
    __shared__ real tile[TILE_DIM][TILE_DIM + 1];
    const int tx = threadIdx.x;
    const int ty = threadIdx.y;
    const int tile_x0 = blockIdx.x * TILE_DIM;
    const int tile_y0 = blockIdx.y * TILE_DIM;

    // Load phase: consecutive tx read consecutive elements of A.
    const int src_col = tile_x0 + tx;
    const int src_row = tile_y0 + ty;
    if (src_col < N && src_row < N)
    {
        tile[ty][tx] = A[src_row * N + src_col];
    }
    // Every thread of the block must reach this barrier, so there is no
    // early return above.
    __syncthreads();

    // Store phase: consecutive tx write consecutive elements of B.
    const int dst_row = tile_x0 + ty;
    const int dst_col = tile_y0 + tx;
    if (dst_row < N && dst_col < N)
    {
        B[dst_row * N + dst_col] = tile[tx][ty];
    }
}

// Prints the N x N matrix stored in the flat array A, one matrix row per
// output line, each value followed by a tab.
void print_matrix(const int N, const real *A)
{
    int row = 0;
    while (row < N)
    {
        const int row_start = row * N;
        for (int col = 0; col < N; ++col)
        {
            printf("%g\t", A[row_start + col]);
        }
        printf("\n");
        ++row;
    }
}

================================================
FILE: CUDA/chapter8_共享内存的合理使用/error.cuh
================================================
#pragma once
#include <stdio.h>

// CHECK(call): evaluates a CUDA runtime call once. If the result is not
// cudaSuccess, prints the source file, line, numeric error code and error
// text with printf and terminates the program via exit(1).
#define CHECK(call)                                   \
do                                                    \
{                                                     \
    const cudaError_t error_code = call;              \
    if (error_code == cudaSuccess)                    \
    {                                                 \
        break;                                        \
    }                                                 \
    printf("CUDA Error:\n");                          \
    printf("    File:       %s\n", __FILE__);         \
    printf("    Line:       %d\n", __LINE__);         \
    printf("    Error code: %d\n", error_code);       \
    printf("    Error text: %s\n",                    \
        cudaGetErrorString(error_code));              \
    exit(1);                                          \
} while (0)


================================================
FILE: CUDA/chapter8_共享内存的合理使用/reduce1cpu.cu
================================================
#include "error.cuh"
#include <cstdio>

// Floating-point type used throughout the file: double when USE_DP is
// defined at compile time, float otherwise.
#ifdef USE_DP
    typedef double real;
#else
    typedef float real;
#endif

// Number of timed runs performed by timing().
const int NUM_REPEATS = 20;
// Runs reduce() repeatedly, printing the time of each run and the final sum.
void timing(const real *x, const int N);
// Returns the sum of the N elements of x, computed on the host.
real reduce(const real *x, const int N);

// Fills a host array of N elements with 1.23 and times the host-side
// summation of that array.
int main(int argc, char *argv[])
{
    const int N = 100000000;
    const int M = sizeof(real) * N; // array size in bytes

    real *x = (real *)malloc(M);
    int i = 0;
    while (i < N)
    {
        x[i] = 1.23;
        ++i;
    }

    timing(x, N);

    free(x);
}

// Calls reduce(x, N) NUM_REPEATS times, timing each call with CUDA events
// and printing the elapsed time. The sum returned by the last call is
// printed at the end.
void timing(const real *x, const int N)
{
    real sum = 0;

    int run = 0;
    while (run < NUM_REPEATS)
    {
        cudaEvent_t t_begin, t_end;
        CHECK(cudaEventCreate(&t_begin));
        CHECK(cudaEventCreate(&t_end));
        CHECK(cudaEventRecord(t_begin));
        // Not wrapped in CHECK: the return value is ignored here.
        cudaEventQuery(t_begin);

        sum = reduce(x, N);

        CHECK(cudaEventRecord(t_end));
        CHECK(cudaEventSynchronize(t_end));
        float ms;
        CHECK(cudaEventElapsedTime(&ms, t_begin, t_end));
        printf("Time = %g ms.\n", ms);

        CHECK(cudaEventDestroy(t_begin));
        CHECK(cudaEventDestroy(t_end));
        ++run;
    }

    printf("sum = %f.\n", sum);
}

// Returns the sum of the first N elements of x, accumulated front to back in
// a single variable of type real.
real reduce(const real *x, const int N)
{
    real total = 0.0;
    int i = 0;
    while (i < N)
    {
        total += x[i];
        ++i;
    }
    return total;
}

================================================
FILE: CUDA/chapter8_共享内存的合理使用/reduce2gpu.cu
================================================
#include "error.cuh"
#include <cstdio>

// Floating-point type used throughout the file: double when USE_DP is
// defined at compile time, float otherwise.
#ifdef USE_DP
    typedef double real;
#else
    typedef float real;
#endif

// Number of timed runs performed by timing() for each method.
const int NUM_REPEATS = 100;
// Number of elements to sum, and the size of the input array in bytes.
const int N = 100000000;
const int M = sizeof(real) * N;
// Threads per block used for every kernel launch.
const int BLOCK_SIZE = 128;

// Times one reduction method (0 = global memory, 1 = static shared memory,
// 2 = dynamic shared memory); defined at the end of the file.
void timing(real *h_x, real *d_x, const int method);

// Fills a host array of N elements with 1.23 and times the three GPU
// reduction variants on it, one after another.
int main(int argc, char *argv[])
{
    // Host input.
    real *h_x = (real *) malloc(M);
    int i = 0;
    while (i < N)
    {
        h_x[i] = 1.23;
        ++i;
    }

    // Device input; timing() uploads h_x into it before every run.
    real *d_x;
    CHECK(cudaMalloc(&d_x, M));

    printf("\nUsing global memory only:\n");
    timing(h_x, d_x, 0);
    printf("\nUsing static shared memory:\n");
    timing(h_x, d_x, 1);
    printf("\nUsing dynamic shared memory:\n");
    timing(h_x, d_x, 2);

    free(h_x);
    CHECK(cudaFree(d_x));
    return 0;
}

// Kernel: each block sums its own blockDim.x-element slice of d_x in place
// (the slice is overwritten) by repeatedly adding the upper half of the
// active range onto the lower half, and stores the block's total in
// d_y[blockIdx.x]. There is no bounds check against the array length.
void __global__ reduce_global(real *d_x, real *d_y)
{
    const int t = threadIdx.x;
    real *slice = d_x + blockDim.x * blockIdx.x;

    int half = blockDim.x >> 1;
    while (half > 0)
    {
        if (t < half)
        {
            slice[t] += slice[t + half];
        }
        // All threads of the block must finish this step before the next
        // one reads the updated values.
        __syncthreads();
        half >>= 1;
    }

    if (t != 0)
    {
        return;
    }
    d_y[blockIdx.x] = slice[0];
}

// Kernel: each block copies its slice of d_x into a statically sized shared
// array (elements past N are treated as 0), sums that array by repeatedly
// adding the upper half of the active range onto the lower half, and stores
// the block's total in d_y[blockIdx.x]. d_x itself is not modified.
//
// The shared array is sized with BLOCK_SIZE, the same constant the host uses
// as the launch block size, instead of a separate literal 128. The two can
// no longer drift apart if BLOCK_SIZE is changed.
void __global__ reduce_shared(real *d_x, real *d_y)
{
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;

    const int n = bid * blockDim.x + tid;
    __shared__ real s_y[BLOCK_SIZE];
    s_y[tid] = (n < N) ? d_x[n] : 0.0;
    // The whole tile must be loaded before any thread starts adding.
    __syncthreads();

    for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1)
    {
        if (tid < offset)
        {
            s_y[tid] += s_y[tid + offset];
        }
        // Finish this step everywhere before the next one reads s_y.
        __syncthreads();
    }

    if (tid == 0)
    {
        d_y[bid] = s_y[0];
    }
}

// Kernel: same reduction as reduce_shared, but the shared array is declared
// `extern` without a size, so its size comes from the third launch
// parameter. Elements past N are treated as 0; the block's total is stored
// in d_y[blockIdx.x].
void __global__ reduce_dynamic(real *d_x, real *d_y)
{
    const int t = threadIdx.x;
    const int block = blockIdx.x;
    const int global_index = block * blockDim.x + t;
    extern __shared__ real partial[];

    if (global_index < N)
    {
        partial[t] = d_x[global_index];
    }
    else
    {
        partial[t] = 0.0;
    }
    // The whole tile must be loaded before any thread starts adding.
    __syncthreads();

    int half = blockDim.x >> 1;
    while (half > 0)
    {
        if (t < half)
        {
            partial[t] += partial[t + half];
        }
        // Finish this step everywhere before the next one reads the array.
        __syncthreads();
        half >>= 1;
    }

    if (t == 0)
    {
        d_y[block] = partial[0];
    }
}

// Sums the N elements in d_x with the kernel selected by `method`
// (0 = reduce_global, 1 = reduce_shared, 2 = reduce_dynamic) and returns the
// total. The kernel produces one partial sum per block; those are copied to
// the host and added up there. Any other `method` prints an error and exits
// with status 1.
real reduce(real *d_x, const int method)
{
    // One block per BLOCK_SIZE elements, rounded up.
    int num_blocks = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;
    const int partial_bytes = sizeof(real) * num_blocks;
    const int shared_bytes = sizeof(real) * BLOCK_SIZE;

    // Buffers for the per-block partial sums (device and host copies).
    real *d_partial;
    CHECK(cudaMalloc(&d_partial, partial_bytes));
    real *h_partial = (real *) malloc(partial_bytes);

    if (method == 0)
    {
        reduce_global<<<num_blocks, BLOCK_SIZE>>>(d_x, d_partial);
    }
    else if (method == 1)
    {
        reduce_shared<<<num_blocks, BLOCK_SIZE>>>(d_x, d_partial);
    }
    else if (method == 2)
    {
        // Third launch parameter: bytes of dynamic shared memory per block.
        reduce_dynamic<<<num_blocks, BLOCK_SIZE, shared_bytes>>>(d_x, d_partial);
    }
    else
    {
        printf("Error: wrong method\n");
        exit(1);
    }

    CHECK(cudaMemcpy(h_partial, d_partial, partial_bytes, cudaMemcpyDeviceToHost));

    // Final accumulation on the host, front to back.
    real total = 0.0;
    for (int b = 0; b < num_blocks; b++)
    {
        total += h_partial[b];
    }

    free(h_partial);
    CHECK(cudaFree(d_partial));
    return total;
}

/*
 * Benchmark reduce() for the given method.
 *
 * Runs NUM_REPEATS iterations. Each iteration re-uploads the input from h_x
 * to d_x (M bytes), then brackets a single reduce() call with CUDA events and
 * prints the elapsed time in milliseconds. The upload is outside the timed
 * interval; everything reduce() does is inside it. After the loop the sum
 * from the final iteration is printed.
 */
void timing(real *h_x, real *d_x, const int method)
{
    real total = 0;

    for (int run = 0; run < NUM_REPEATS; ++run)
    {
        // Restore the device input before every measurement.
        CHECK(cudaMemcpy(d_x, h_x, M, cudaMemcpyHostToDevice));

        cudaEvent_t ev_begin, ev_end;
        CHECK(cudaEventCreate(&ev_begin));
        CHECK(cudaEventCreate(&ev_end));
        CHECK(cudaEventRecord(ev_begin));
        // Result deliberately not wrapped in CHECK: cudaEventQuery reports
        // cudaErrorNotReady while the event is still pending, and CHECK
        // would treat that as fatal.
        cudaEventQuery(ev_begin);

        total = reduce(d_x, method);

        CHECK(cudaEventRecord(ev_end));
        CHECK(cudaEventSynchronize(ev_end));

        float elapsed_ms;
        CHECK(cudaEventElapsedTime(&elapsed_ms, ev_begin, ev_end));
        printf("Time = %g ms.\n", elapsed_ms);

        CHECK(cudaEventDestroy(ev_begin));
        CHECK(cudaEventDestroy(ev_end));
    }

    printf("sum = %f.\n", total);
}

================================================
FILE: CUDA/chapter9_原子函数的合理使用/Makefile
================================================
# Each example is a single-source nvcc build that produces an executable
# named after its .cu file.
NVCC      = nvcc
NVCCFLAGS = -g -arch=sm_50
TARGETS   = reduce neighbor1cpu neighbor2gpu

# Neither `all` nor `clean` produces a file of that name; declaring both
# phony keeps a stray file called "all" or "clean" from disabling the rule.
.PHONY: all clean

all: $(TARGETS)

reduce: reduce.cu
	$(NVCC) $(NVCCFLAGS) reduce.cu -o reduce

neighbor1cpu: neighbor1cpu.cu
	$(NVCC) $(NVCCFLAGS) neighbor1cpu.cu -o neighbor1cpu

neighbor2gpu: neighbor2gpu.cu
	$(NVCC) $(NVCCFLAGS) neighbor2gpu.cu -o neighbor2gpu

clean:
	rm -rf $(TARGETS)


================================================
FILE: CUDA/chapter9_原子函数的合理使用/error.cuh
================================================
#pragma once
#include <stdio.h>

/*
 * CHECK(call): evaluate a CUDA runtime call exactly once and abort on failure.
 *
 * If the call's result is not cudaSuccess, the source file, line number,
 * numeric error code and the text from cudaGetErrorString() are printed to
 * stdout and the process exits with status 1. The do { } while (0) wrapper
 * makes the macro behave as a single statement, so it needs a trailing
 * semicolon and is safe inside an unbraced if/else.
 */
#define CHECK(call)                                   \
do                                                    \
{                                                     \
    const cudaError_t error_code = call;              \
    if (error_code != cudaSuccess)                    \
    {                                                 \
        printf("CUDA Error:\n"                        \
               "    File:       %s\n"                 \
               "    Line:       %d\n"                 \
               "    Error code: %d\n"                 \
               "    Error text: %s\n",                \
               __FILE__, __LINE__, error_code,        \
               cudaGetErrorString(error_code));       \
        exit(1);                                      \
    }                                                 \
} while (0)


================================================
FILE: CUDA/chapter9_原子函数的合理使用/neighbor.txt
================================================
2 2 22321 NaN NaN NaN NaN NaN NaN NaN NaN
2 3 55 NaN NaN NaN NaN NaN NaN NaN NaN
2 0 22322 NaN NaN NaN NaN NaN NaN NaN NaN
3 1 5 272 NaN NaN NaN NaN NaN NaN NaN
2 5 7 NaN NaN NaN NaN NaN NaN NaN NaN
3 3 4 328 NaN NaN NaN NaN NaN NaN NaN
2 8 22323 NaN NaN NaN NaN NaN NaN NaN NaN
2 4 10 NaN NaN NaN NaN NaN NaN NaN NaN
2 6 22307 NaN NaN NaN NaN NaN NaN NaN NaN
1 11 NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 7 11 329 NaN NaN NaN NaN NaN NaN NaN
3 9 10 274 NaN NaN NaN NaN NaN NaN NaN
2 13 178 NaN NaN NaN NaN NaN NaN NaN NaN
2 12 273 NaN NaN NaN NaN NaN NaN NaN NaN
3 15 22291 22442 NaN NaN NaN NaN NaN NaN NaN
2 14 175 NaN NaN NaN NaN NaN NaN NaN NaN
1 22442 NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 18 19 22309 NaN NaN NaN NaN NaN NaN NaN
2 17 275 NaN NaN NaN NaN NaN NaN NaN NaN
2 17 22324 NaN NaN NaN NaN NaN NaN NaN NaN
2 22 278 NaN NaN NaN NaN NaN NaN NaN NaN
2 22 58 NaN NaN NaN NaN NaN NaN NaN NaN
3 20 21 330 NaN NaN NaN NaN NaN NaN NaN
3 24 25 22311 NaN NaN NaN NaN NaN NaN NaN
2 23 279 NaN NaN NaN NaN NaN NaN NaN NaN
2 23 22326 NaN NaN NaN NaN NaN NaN NaN NaN
2 28 282 NaN NaN NaN NaN NaN NaN NaN NaN
2 28 64 NaN NaN NaN NaN NaN NaN NaN NaN
3 26 27 332 NaN NaN NaN NaN NaN NaN NaN
3 30 31 22283 NaN NaN NaN NaN NaN NaN NaN
2 29 97 NaN NaN NaN NaN NaN NaN NaN NaN
2 29 22452 NaN NaN NaN NaN NaN NaN NaN NaN
2 34 100 NaN NaN NaN NaN NaN NaN NaN NaN
2 34 355 NaN NaN NaN NaN NaN NaN NaN NaN
3 32 33 399 NaN NaN NaN NaN NaN NaN NaN
2 36 78 NaN NaN NaN NaN NaN NaN NaN NaN
3 35 39 401 NaN NaN NaN NaN NaN NaN NaN
3 38 22285 22460 NaN NaN NaN NaN NaN NaN NaN
2 37 75 NaN NaN NaN NaN NaN NaN NaN NaN
2 36 359 NaN NaN NaN NaN NaN NaN NaN NaN
3 41 42 22305 NaN NaN NaN NaN NaN NaN NaN
2 40 105 NaN NaN NaN NaN NaN NaN NaN NaN
2 40 22456 NaN NaN NaN NaN NaN NaN NaN NaN
2 45 108 NaN NaN NaN NaN NaN NaN NaN NaN
2 45 391 NaN NaN NaN NaN NaN NaN NaN NaN
3 43 44 403 NaN NaN NaN NaN NaN NaN NaN
2 47 112 NaN NaN NaN NaN NaN NaN NaN NaN
3 46 50 405 NaN NaN NaN NaN NaN NaN NaN
3 49 22289 22463 NaN NaN NaN NaN NaN NaN NaN
2 48 109 NaN NaN NaN NaN NaN NaN NaN NaN
2 47 367 NaN NaN NaN NaN NaN NaN NaN NaN
2 53 22358 NaN NaN NaN NaN NaN NaN NaN NaN
1 54 NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 51 22320 NaN NaN NaN NaN NaN NaN NaN NaN
3 52 56 269 NaN NaN NaN NaN NaN NaN NaN
2 1 56 NaN NaN NaN NaN NaN NaN NaN NaN
3 54 55 271 NaN NaN NaN NaN NaN NaN NaN
2 59 22325 NaN NaN NaN NaN NaN NaN NaN NaN
2 21 60 NaN NaN NaN NaN NaN NaN NaN NaN
2 57 22407 NaN NaN NaN NaN NaN NaN NaN NaN
3 58 62 331 NaN NaN NaN NaN NaN NaN NaN
2 62 114 NaN NaN NaN NaN NaN NaN NaN NaN
3 60 61 370 NaN NaN NaN NaN NaN NaN NaN
2 65 22327 NaN NaN NaN NaN NaN NaN NaN NaN
2 27 66 NaN NaN NaN NaN NaN NaN NaN NaN
2 63 22360 NaN NaN NaN NaN NaN NaN NaN NaN
3 64 68 333 NaN NaN NaN NaN NaN NaN NaN
2 68 92 NaN NaN NaN NaN NaN NaN NaN NaN
3 66 67 372 NaN NaN NaN NaN NaN NaN NaN
3 70 71 22299 NaN NaN NaN NaN NaN NaN NaN
2 69 125 NaN NaN NaN NaN NaN NaN NaN NaN
2 69 22450 NaN NaN NaN NaN NaN NaN NaN NaN
2 74 128 NaN NaN NaN NaN NaN NaN NaN NaN
2 74 351 NaN NaN NaN NaN NaN NaN NaN NaN
3 72 73 397 NaN NaN NaN NaN NaN NaN NaN
3 38 76 22315 NaN NaN NaN NaN NaN NaN NaN
2 75 129 NaN NaN NaN NaN NaN NaN NaN NaN
2 78 132 NaN NaN NaN NaN NaN NaN NaN NaN
3 35 77 411 NaN NaN NaN NaN NaN NaN NaN
3 80 81 22301 NaN NaN NaN NaN NaN NaN NaN
2 79 133 NaN NaN NaN NaN NaN NaN NaN NaN
2 79 22461 NaN NaN NaN NaN NaN NaN NaN NaN
2 84 136 NaN NaN NaN NaN NaN NaN NaN NaN
2 84 383 NaN NaN NaN NaN NaN NaN NaN NaN
3 82 83 413 NaN NaN NaN NaN NaN NaN NaN
3 86 87 22303 NaN NaN NaN NaN NaN NaN NaN
2 85 101 NaN NaN NaN NaN NaN NaN NaN NaN
2 85 22454 NaN NaN NaN NaN NaN NaN NaN NaN
2 90 104 NaN NaN NaN NaN NaN NaN NaN NaN
2 90 387 NaN NaN NaN NaN NaN NaN NaN NaN
3 88 89 415 NaN NaN NaN NaN NaN NaN NaN
2 93 22361 NaN NaN NaN NaN NaN NaN NaN NaN
2 67 94 NaN NaN NaN NaN NaN NaN NaN NaN
2 91 22409 NaN NaN NaN NaN NaN NaN NaN NaN
3 92 96 373 NaN NaN NaN NaN NaN NaN NaN
2 96 120 NaN NaN NaN NaN NaN NaN NaN NaN
3 94 95 394 NaN NaN NaN NaN NaN NaN NaN
3 30 98 22313 NaN NaN NaN NaN NaN NaN NaN
2 97 179 NaN NaN NaN NaN NaN NaN NaN NaN
2 100 182 NaN NaN NaN NaN NaN NaN NaN NaN
3 32 99 425 NaN NaN NaN NaN NaN NaN NaN
2 86 22335 NaN NaN NaN NaN NaN NaN NaN NaN
1 103 NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 102 104 190 NaN NaN NaN NaN NaN NaN NaN
3 88 103 427 NaN NaN NaN NaN NaN NaN NaN
3 41 106 22317 NaN NaN NaN NaN NaN NaN NaN
2 105 191 NaN NaN NaN NaN NaN NaN NaN NaN
2 108 194 NaN NaN NaN NaN NaN NaN NaN NaN
3 43 107 429 NaN NaN NaN NaN NaN NaN NaN
3 49 110 22319 NaN NaN NaN NaN NaN NaN NaN
2 109 137 NaN NaN NaN NaN NaN NaN NaN NaN
2 112 140 NaN NaN NaN NaN NaN NaN NaN NaN
3 46 111 431 NaN NaN NaN NaN NaN NaN NaN
2 115 22408 NaN NaN NaN NaN NaN NaN NaN NaN
2 61 116 NaN NaN NaN NaN NaN NaN NaN NaN
2 113 22425 NaN NaN NaN NaN NaN NaN NaN NaN
3 114 118 371 NaN NaN NaN NaN NaN NaN NaN
2 118 176 NaN NaN NaN NaN NaN NaN NaN NaN
3 116 117 407 NaN NaN NaN NaN NaN NaN NaN
2 121 22410 NaN NaN NaN NaN NaN NaN NaN NaN
2 95 122 NaN NaN NaN NaN NaN NaN NaN NaN
2 119 22443 NaN NaN NaN NaN NaN NaN NaN NaN
3 120 124 395 NaN NaN NaN NaN NaN NaN NaN
2 124 276 NaN NaN NaN NaN NaN NaN NaN NaN
3 122 123 409 NaN NaN NaN NaN NaN NaN NaN
2 70 22329 NaN NaN NaN NaN NaN NaN NaN NaN
1 127 NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 126 128 311 NaN NaN NaN NaN NaN NaN NaN
3 72 127 447 NaN NaN NaN NaN NaN NaN NaN
2 76 22331 NaN NaN NaN NaN NaN NaN NaN NaN
1 131 NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 130 132 319 NaN NaN NaN NaN NaN NaN NaN
3 77 131 437 NaN NaN NaN NaN NaN NaN NaN
2 80 22333 NaN NaN NaN NaN NaN NaN NaN NaN
1 135 NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 134 136 186 NaN NaN NaN NaN NaN NaN NaN
3 82 135 439 NaN NaN NaN NaN NaN NaN NaN
2 110 22373 NaN NaN NaN NaN NaN NaN NaN NaN
1 139 NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 138 140 327 NaN NaN NaN NaN NaN NaN NaN
3 111 139 441 NaN NaN NaN NaN NaN NaN NaN
1 22343 NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 144 NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 145 NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 142 146 150 NaN NaN NaN NaN NaN NaN NaN
2 143 146 NaN NaN NaN NaN NaN NaN NaN NaN
3 144 145 467 NaN NaN NaN NaN NaN NaN NaN
1 22345 NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 149 NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 148 150 154 NaN NaN NaN NaN NaN NaN NaN
3 144 149 468 NaN NaN NaN NaN NaN NaN NaN
1 22347 NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 153 NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 152 154 158 NaN NaN NaN NaN NaN NaN NaN
3 149 153 470 NaN NaN NaN NaN NaN NaN NaN
1 22349 NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 157 NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 156 158 162 NaN NaN NaN NaN NaN NaN NaN
3 153 157 472 NaN NaN NaN NaN NaN NaN NaN
1 22351 NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 161 NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 160 162 166 NaN NaN NaN NaN NaN NaN NaN
3 157 161 474 NaN NaN NaN NaN NaN NaN NaN
1 22353 NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 165 NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 164 166 170 NaN NaN NaN NaN NaN NaN NaN
3 161 165 476 NaN NaN NaN NaN NaN NaN NaN
1 22355 NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 169 NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 168 170 174 NaN NaN NaN NaN NaN NaN NaN
3 165 169 478 NaN NaN NaN NaN NaN NaN NaN
1 22357 NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 173 NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 172 174 270 NaN NaN NaN NaN NaN NaN NaN
3 169 173 480 NaN NaN NaN NaN NaN NaN NaN
2 15 22426 NaN NaN NaN NaN NaN NaN NaN NaN
2 117 177 NaN NaN NaN NaN NaN NaN NaN NaN
3 176 178 408 NaN NaN NaN NaN NaN NaN NaN
3 12 177 419 NaN NaN NaN NaN NaN NaN NaN
2 98 22366 NaN NaN NaN NaN NaN NaN NaN NaN
1 181 NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 180 182 315 NaN NaN NaN NaN NaN NaN NaN
3 99 181 450 NaN NaN NaN NaN NaN NaN NaN
1 22422 NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 185 358 NaN NaN NaN NaN NaN NaN NaN NaN
3 184 186 361 NaN NaN NaN NaN NaN NaN NaN
3 135 185 452 NaN NaN NaN NaN NaN NaN NaN
1 22369 NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 189 NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 188 190 365 NaN NaN NaN NaN NaN NaN NaN
3 103 189 454 NaN NaN NaN NaN NaN NaN NaN
2 106 22371 NaN NaN NaN NaN NaN NaN NaN NaN
1 193 NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 192 194 323 NaN NaN NaN NaN NaN NaN NaN
3 107 193 456 NaN NaN NaN NaN NaN NaN NaN
2 197 22338 NaN NaN NaN NaN NaN NaN NaN NaN
1 198 NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 195 22375 NaN NaN NaN NaN NaN NaN NaN NaN
3 196 200 205 NaN NaN NaN NaN NaN NaN NaN
2 201 366 NaN NaN NaN NaN NaN NaN NaN NaN
3 198 201 459 NaN NaN NaN NaN NaN NaN NaN
3 199 200 369 NaN NaN NaN NaN NaN NaN NaN
1 22340 NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 204 NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 203 205 209 NaN NaN NaN NaN NaN NaN NaN
3 198 204 458 NaN NaN NaN NaN NaN NaN NaN
1 22377 NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 208 NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 207 209 213 NaN NaN NaN NaN NaN NaN NaN
3 204 208 461 NaN NaN NaN NaN NaN NaN NaN
1 22379 NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 212 NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 211 213 217 NaN NaN NaN NaN NaN NaN NaN
3 208 212 463 NaN NaN NaN NaN NaN NaN NaN
1 22381 NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 216 NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 215 217 221 NaN NaN NaN NaN NaN NaN NaN
3 212 216 498 NaN NaN NaN NaN NaN NaN NaN
1 22383 NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 220 NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 219 221 225 NaN NaN NaN NaN NaN NaN NaN
3 216 220 500 NaN NaN NaN NaN NaN NaN NaN
1 22385 NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 224 NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 223 225 229 NaN NaN NaN NaN NaN NaN NaN
3 220 224 502 NaN NaN NaN NaN NaN NaN NaN
1 22387 NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 228 NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 227 229 233 NaN NaN NaN NaN NaN NaN NaN
3 224 228 504 NaN NaN NaN NaN NaN NaN NaN
1 22389 NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 232 NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 231 233 237 NaN NaN NaN NaN NaN

Download .txt

gitextract_28zc9teu/

├── .vscode/
│   ├── launch.json
│   └── settings.json
├── CUDA/
│   ├── chapter10_线程束基本函数与协作组/
│   │   ├── Makefile
│   │   ├── error.cuh
│   │   ├── reduce.cu
│   │   ├── reduce1parallelism.cu
│   │   ├── reduce2static.cu
│   │   └── warp.cu
│   ├── chapter11_CUDA流/
│   │   ├── Makefile
│   │   ├── README.md
│   │   ├── common.h
│   │   ├── error.cuh
│   │   ├── host-kernel.cu
│   │   ├── kernel-kernel.cu
│   │   ├── kernel-transfer.cu
│   │   ├── pinMemTransfer.cu
│   │   ├── simpleHyperqOpenmp.cu
│   │   ├── simpleMultiAddBreadth.cu
│   │   └── simpleMultiAddDepth.cu
│   ├── chapter12_使用统一内存编程/
│   │   ├── Makefile
│   │   ├── add.cu
│   │   ├── add2_static.cu
│   │   ├── error.cuh
│   │   ├── oversubscription1.cu
│   │   ├── oversubscription2.cu
│   │   ├── oversubscription3.cu
│   │   └── prefetch.cu
│   ├── chapter13_分子动力学模拟的CUDA程序开发/
│   │   ├── cpp/
│   │   │   ├── common.cuh
│   │   │   ├── error.cuh
│   │   │   ├── force.cu
│   │   │   ├── force.cuh
│   │   │   ├── initialize.cu
│   │   │   ├── initialize.cuh
│   │   │   ├── integrate.cu
│   │   │   ├── integrate.cuh
│   │   │   ├── main.cu
│   │   │   ├── makefile
│   │   │   ├── makefile.windows
│   │   │   ├── memory.cu
│   │   │   ├── memory.cuh
│   │   │   ├── mic.cuh
│   │   │   ├── neighbor.cu
│   │   │   └── neighbor.cuh
│   │   ├── force-only/
│   │   │   ├── Makefile
│   │   │   ├── common.h
│   │   │   ├── error.cuh
│   │   │   ├── force.cu
│   │   │   ├── force.h
│   │   │   ├── initialize.cu
│   │   │   ├── initialize.h
│   │   │   ├── integrate.cu
│   │   │   ├── integrate.h
│   │   │   ├── main.cu
│   │   │   ├── makefile.windows
│   │   │   ├── memory.cu
│   │   │   ├── memory.h
│   │   │   ├── mic.h
│   │   │   ├── neighbor.cu
│   │   │   └── neighbor.h
│   │   ├── plot_energy.m
│   │   └── whole-code/
│   │       ├── Makefile
│   │       ├── common.h
│   │       ├── error.cuh
│   │       ├── force.cu
│   │       ├── force.h
│   │       ├── initialize.cu
│   │       ├── initialize.h
│   │       ├── integrate.cu
│   │       ├── integrate.h
│   │       ├── main.cu
│   │       ├── makefile.windows
│   │       ├── memory.cu
│   │       ├── memory.h
│   │       ├── mic.h
│   │       ├── neighbor.cu
│   │       ├── neighbor.h
│   │       ├── reduce.cu
│   │       └── reduce.h
│   ├── chapter14_CUDA标准库的使用/
│   │   ├── Makefile
│   │   ├── cublas_gemm.cu
│   │   ├── curand_host1.cu
│   │   ├── curand_host2.cu
│   │   ├── cusolver.cu
│   │   ├── error.cuh
│   │   ├── thrust_scan_pointer.cu
│   │   └── thrust_scan_vector.cu
│   ├── chapter1_GPU硬件与CUDA程序开发工具/
│   │   └── README.md
│   ├── chapter2_CUDA中的线程组织/
│   │   ├── Makefile
│   │   ├── hello1.cpp
│   │   ├── hello2.cu
│   │   ├── hello3.cu
│   │   ├── hello4.cu
│   │   └── hello5.cu
│   ├── chapter3_简单CUDA程序的基本框架/
│   │   ├── Makefile
│   │   ├── add.cpp
│   │   ├── add1.cu
│   │   ├── add2wrong.cu
│   │   ├── add3if.cu
│   │   └── add4device.cu
│   ├── chapter4_CUDA程序的错误检测/
│   │   ├── Makefile
│   │   ├── check1api
│   │   ├── check1api.cu
│   │   ├── check2kernel
│   │   ├── check2kernel.cu
│   │   ├── error.cuh
│   │   ├── memcheck
│   │   └── memcheck.cu
│   ├── chapter5_获得GPU加速的关键/
│   │   ├── Makefile
│   │   ├── add1cpu.cu
│   │   ├── add2gpu.cu
│   │   ├── add3memcpy.cu
│   │   ├── arithmetic1cpu.cu
│   │   ├── arithmetic2gpu.cu
│   │   └── error.cuh
│   ├── chapter6_CUDA的内存组织/
│   │   ├── Makefile
│   │   ├── README.md
│   │   ├── error.cuh
│   │   ├── query.cu
│   │   └── static.cu
│   ├── chapter7_全局内存的合理使用/
│   │   ├── Makefile
│   │   ├── error.cuh
│   │   └── matrix.cu
│   ├── chapter8_共享内存的合理使用/
│   │   ├── Makefile
│   │   ├── bank.cu
│   │   ├── error.cuh
│   │   ├── reduce1cpu.cu
│   │   └── reduce2gpu.cu
│   └── chapter9_原子函数的合理使用/
│       ├── Makefile
│       ├── error.cuh
│       ├── neighbor.txt
│       ├── neighbor1cpu.cu
│       ├── neighbor2gpu.cu
│       ├── reduce.cu
│       └── xy.txt
└── README.md

Download .txt

SYMBOL INDEX (15 symbols across 7 files)

FILE: CUDA/chapter11_CUDA流/common.h
  function seconds (line 66) | inline double seconds()

FILE: CUDA/chapter13_分子动力学模拟的CUDA程序开发/force-only/common.h
  type real (line 4) | typedef double real;
  type real (line 6) | typedef float real;
  type Atom (line 12) | struct Atom
  type Box (line 39) | struct Box

FILE: CUDA/chapter13_分子动力学模拟的CUDA程序开发/force-only/mic.h
  function __device__ (line 3) | static void __device__ apply_mic

FILE: CUDA/chapter13_分子动力学模拟的CUDA程序开发/whole-code/common.h
  type real (line 4) | typedef double real;
  type real (line 6) | typedef float real;
  type Atom (line 12) | struct Atom
  type Box (line 44) | struct Box

FILE: CUDA/chapter13_分子动力学模拟的CUDA程序开发/whole-code/mic.h
  function __device__ (line 3) | static void __device__ apply_mic

FILE: CUDA/chapter2_CUDA中的线程组织/hello1.cpp
  function main (line 3) | int main(void)

FILE: CUDA/chapter3_简单CUDA程序的基本框架/add.cpp
  function main (line 12) | int main(int argc, char *argv[])
  function add (line 36) | void add(const double *x, const double *y, double *z, const int N)
  function check (line 44) | void check(const double *z, const int N)

Download .json

Condensed preview — 135 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (2,128K chars).

[
  {
    "path": ".vscode/launch.json",
    "chars": 409,
    "preview": "{\n    // Use IntelliSense to learn about possible attributes.\n    // Hover to view descriptions of existing attributes.\n"
  },
  {
    "path": ".vscode/settings.json",
    "chars": 277,
    "preview": "{\n    \"files.associations\": {\n        \"cstdio\": \"cpp\",\n        \"istream\": \"cpp\",\n        \"limits\": \"cpp\",\n        \"ostre"
  },
  {
    "path": "CUDA/chapter10_线程束基本函数与协作组/Makefile",
    "chars": 434,
    "preview": "all: reduce warp reduce1parallelism reduce2static\n\nreduce: reduce.cu\n\tnvcc -g -arch=sm_50 reduce.cu -o reduce\n\nwarp: war"
  },
  {
    "path": "CUDA/chapter10_线程束基本函数与协作组/error.cuh",
    "chars": 829,
    "preview": "#pragma once\n#include <stdio.h>\n\n#define CHECK(call)                                   \\\ndo                             "
  },
  {
    "path": "CUDA/chapter10_线程束基本函数与协作组/reduce.cu",
    "chars": 4578,
    "preview": "#include \"error.cuh\"\n#include <cstdio>\n#include <cooperative_groups.h>\n\nusing namespace cooperative_groups;\n\n#ifdef USE_"
  },
  {
    "path": "CUDA/chapter10_线程束基本函数与协作组/reduce1parallelism.cu",
    "chars": 2587,
    "preview": "#include \"error.cuh\"\n#include <stdio.h>\n#include <cooperative_groups.h>\nusing namespace cooperative_groups;\n\n#ifdef USE_"
  },
  {
    "path": "CUDA/chapter10_线程束基本函数与协作组/reduce2static.cu",
    "chars": 2669,
    "preview": "#include \"error.cuh\"\n#include <cstdio>\n#include <cooperative_groups.h>\n\nusing namespace cooperative_groups;\n\n#ifdef USE_"
  },
  {
    "path": "CUDA/chapter10_线程束基本函数与协作组/warp.cu",
    "chars": 1951,
    "preview": "#include \"error.cuh\"\n#include <cstdio>\n\nconst unsigned WIDTH = 8;\nconst unsigned BLOCK_SIZE = 16;\nconst unsigned FULL_MA"
  },
  {
    "path": "CUDA/chapter11_CUDA流/Makefile",
    "chars": 1053,
    "preview": "all: simpleHyperqOpenmp simpleMultiAddBreadth simpleMultiAddDepth host-kernel kernel-kernel kernel-transfer pinMemTransf"
  },
  {
    "path": "CUDA/chapter11_CUDA流/README.md",
    "chars": 1911,
    "preview": "### Pinned Memory\n\n**Allocated host memory is by default pageable**, that is, subject to page fault operations that move"
  },
  {
    "path": "CUDA/chapter11_CUDA流/common.h",
    "chars": 4319,
    "preview": "#include <sys/time.h>\n\n#ifndef _COMMON_H\n#define _COMMON_H\n\n#define CHECK(call)                                         "
  },
  {
    "path": "CUDA/chapter11_CUDA流/error.cuh",
    "chars": 829,
    "preview": "#pragma once\n#include <stdio.h>\n\n#define CHECK(call)                                   \\\ndo                             "
  },
  {
    "path": "CUDA/chapter11_CUDA流/host-kernel.cu",
    "chars": 3331,
    "preview": "#include \"error.cuh\"\n#include <cmath>\n#include <cstdlib>\n\n#ifdef USE_DP\n    typedef double real;\n#else\n    typedef float"
  },
  {
    "path": "CUDA/chapter11_CUDA流/kernel-kernel.cu",
    "chars": 2717,
    "preview": "#include \"error.cuh\"\n#include <cmath>\n#include <cstdio>\n\n#ifdef USE_DP\n    typedef double real;\n#else\n    typedef float "
  },
  {
    "path": "CUDA/chapter11_CUDA流/kernel-transfer.cu",
    "chars": 3243,
    "preview": "#include \"error.cuh\"\n#include <cmath>\n#include <cstdio>\n\n#ifdef USE_DP\n    typedef double real;\n#else\n    typedef float "
  },
  {
    "path": "CUDA/chapter11_CUDA流/pinMemTransfer.cu",
    "chars": 1850,
    "preview": "#include \"common.h\"\n#include <cuda_runtime.h>\n#include <stdio.h>\n\n/*\n * An example of using CUDA's memory copy API to tr"
  },
  {
    "path": "CUDA/chapter11_CUDA流/simpleHyperqOpenmp.cu",
    "chars": 4059,
    "preview": "#include \"common.h\"\n#include <cstdio>\n#include <cuda_runtime.h>\n#include <cstdlib>\n#include <omp.h>\n\n/*\n * An example of"
  },
  {
    "path": "CUDA/chapter11_CUDA流/simpleMultiAddBreadth.cu",
    "chars": 8034,
    "preview": "#include \"common.h\"\n#include <cstdio>\n#include <cuda_runtime.h>\n\n/*\n * This example demonstrates overlapping computation"
  },
  {
    "path": "CUDA/chapter11_CUDA流/simpleMultiAddDepth.cu",
    "chars": 7792,
    "preview": "#include \"common.h\"\n#include <stdio.h>\n#include <cuda_runtime.h>\n\n/*\n * This example demonstrates overlapping computatio"
  },
  {
    "path": "CUDA/chapter12_使用统一内存编程/Makefile",
    "chars": 686,
    "preview": "all: add add2_static oversubscription1 oversubscription2 oversubscription3 prefetch\n\nadd: add.cu\n\tnvcc add.cu -o add\n\nad"
  },
  {
    "path": "CUDA/chapter12_使用统一内存编程/add.cu",
    "chars": 1290,
    "preview": "#include \"error.cuh\"\n#include <cmath>\n#include <cstdio>\n\nconst double EPSILON = 1.0e-15;\nconst double a = 1.23;\nconst do"
  },
  {
    "path": "CUDA/chapter12_使用统一内存编程/add2_static.cu",
    "chars": 373,
    "preview": "#include \"error.cuh\"\n#include <cmath>\n#include <cstdio>\n\n__device__ __managed__ int ret[1000];\n\n__global__ void AplusB(i"
  },
  {
    "path": "CUDA/chapter12_使用统一内存编程/error.cuh",
    "chars": 829,
    "preview": "#pragma once\n#include <stdio.h>\n\n#define CHECK(call)                                   \\\ndo                             "
  },
  {
    "path": "CUDA/chapter12_使用统一内存编程/oversubscription1.cu",
    "chars": 522,
    "preview": "#include \"error.cuh\"\n#include <cstdio>\n#include <cstdint>\n\nconst int N = 30;\n\nint main(int argc, char *argv[])\n{\n    for"
  },
  {
    "path": "CUDA/chapter12_使用统一内存编程/oversubscription2.cu",
    "chars": 856,
    "preview": "#include \"error.cuh\"\n#include <cstdio>\n#include <cstdint>\n\nconst int N = 30;\n\n__global__ void gpu_touch(uint64_t *x, con"
  },
  {
    "path": "CUDA/chapter12_使用统一内存编程/oversubscription3.cu",
    "chars": 553,
    "preview": "#include \"error.cuh\"\n#include <cstdio>\n#include <cstdint>\n\nconst int N = 30;\n\nvoid cpu_touch(uint64_t *x, size_t size)\n{"
  },
  {
    "path": "CUDA/chapter12_使用统一内存编程/prefetch.cu",
    "chars": 1699,
    "preview": "#include \"error.cuh\" \n#include <math.h>\n#include <stdio.h>\n\nconst double EPSILON = 1.0e-15;\nconst double a = 1.23;\nconst"
  },
  {
    "path": "CUDA/chapter13_分子动力学模拟的CUDA程序开发/cpp/common.cuh",
    "chars": 391,
    "preview": "#pragma once\n\n#ifdef USE_DP\n    typedef double real;\n#else\n    typedef float real;\n#endif\n\nconst real K_B = 8.617343e-5;"
  },
  {
    "path": "CUDA/chapter13_分子动力学模拟的CUDA程序开发/cpp/error.cuh",
    "chars": 830,
    "preview": "#pragma once\n#include <stdio.h>\n\n#define CHECK(call)                                   \\\ndo                             "
  },
  {
    "path": "CUDA/chapter13_分子动力学模拟的CUDA程序开发/cpp/force.cu",
    "chars": 1861,
    "preview": "#include \"force.cuh\"\n#include \"mic.cuh\"\n\nvoid find_force(int N, int MN, Atom *atom)\n{\n    int *NN = atom->NN;\n    int *N"
  },
  {
    "path": "CUDA/chapter13_分子动力学模拟的CUDA程序开发/cpp/force.cuh",
    "chars": 81,
    "preview": "#pragma once\n#include \"common.cuh\"\n\nvoid find_force(int N, int MN, Atom *atom);\n\n"
  },
  {
    "path": "CUDA/chapter13_分子动力学模拟的CUDA程序开发/cpp/initialize.cu",
    "chars": 2283,
    "preview": "#include \"initialize.cuh\"\n#include <stdlib.h>\n#include <math.h>\n\nstatic void scale_velocity(int N, real T_0, Atom *atom)"
  },
  {
    "path": "CUDA/chapter13_分子动力学模拟的CUDA程序开发/cpp/initialize.cuh",
    "chars": 147,
    "preview": "#pragma once\n#include \"common.cuh\"\n\nvoid initialize_position(int nx, real ax, Atom *atom);\nvoid initialize_velocity(int "
  },
  {
    "path": "CUDA/chapter13_分子动力学模拟的CUDA程序开发/cpp/integrate.cu",
    "chars": 2807,
    "preview": "#include \"integrate.cuh\"\n#include \"force.cuh\"\n#include \"error.cuh\"\n#include <stdio.h>\n#include <math.h>\n#include <time.h"
  },
  {
    "path": "CUDA/chapter13_分子动力学模拟的CUDA程序开发/cpp/integrate.cuh",
    "chars": 229,
    "preview": "#pragma once\n#include \"common.cuh\"\n\nvoid equilibration\n(\n    int Ne, int N, int MN, real T_0, \n    real time_step, Atom "
  },
  {
    "path": "CUDA/chapter13_分子动力学模拟的CUDA程序开发/cpp/main.cu",
    "chars": 969,
    "preview": "#include \"common.cuh\"\n#include \"memory.cuh\"\n#include \"initialize.cuh\"\n#include \"neighbor.cuh\"\n#include \"integrate.cuh\"\n#"
  },
  {
    "path": "CUDA/chapter13_分子动力学模拟的CUDA程序开发/cpp/makefile",
    "chars": 536,
    "preview": "all: ljmd\n\nCC = nvcc\nCFLAGS = -O3 -arch=sm_70 \n\nljmd: initialize.o integrate.o neighbor.o force.o memory.o main.o\n\t$(CC)"
  },
  {
    "path": "CUDA/chapter13_分子动力学模拟的CUDA程序开发/cpp/makefile.windows",
    "chars": 594,
    "preview": "all: ljmd\n\nCC = nvcc\nCFLAGS = -O3 -arch=sm_75 -Xcompiler \"/wd 4819\"\n\nljmd: initialize.obj integrate.obj neighbor.obj for"
  },
  {
    "path": "CUDA/chapter13_分子动力学模拟的CUDA程序开发/cpp/memory.cu",
    "chars": 1170,
    "preview": "#include \"memory.cuh\"\n#include <stdlib.h>\n\nvoid allocate_memory(int N, int MN, Atom *atom)\n{\n    atom->NN = (int*) mallo"
  },
  {
    "path": "CUDA/chapter13_分子动力学模拟的CUDA程序开发/cpp/memory.cuh",
    "chars": 122,
    "preview": "#pragma once\n#include \"common.cuh\"\n\nvoid allocate_memory(int N, int MN, Atom *atom);\nvoid deallocate_memory(Atom *atom);"
  },
  {
    "path": "CUDA/chapter13_分子动力学模拟的CUDA程序开发/cpp/mic.cuh",
    "chars": 389,
    "preview": "#pragma once\n\nstatic void apply_mic(real *box, real *x12, real *y12, real *z12)\n{\n    if      (*x12 < - box[3]) { *x12 +"
  },
  {
    "path": "CUDA/chapter13_分子动力学模拟的CUDA程序开发/cpp/neighbor.cu",
    "chars": 1084,
    "preview": "#include \"neighbor.cuh\"\n#include \"mic.cuh\"\n#include <stdio.h>\n#include <stdlib.h>\n\nvoid find_neighbor(int N, int MN, Ato"
  },
  {
    "path": "CUDA/chapter13_分子动力学模拟的CUDA程序开发/cpp/neighbor.cuh",
    "chars": 83,
    "preview": "#pragma once\n#include \"common.cuh\"\n\nvoid find_neighbor(int N, int MN, Atom *atom);\n"
  },
  {
    "path": "CUDA/chapter13_分子动力学模拟的CUDA程序开发/force-only/Makefile",
    "chars": 534,
    "preview": "all: ljmd\n\nCC = nvcc\nCFLAGS = -O3 -arch=sm_75\n\nljmd: initialize.o integrate.o neighbor.o force.o memory.o main.o\n\t$(CC) "
  },
  {
    "path": "CUDA/chapter13_分子动力学模拟的CUDA程序开发/force-only/common.h",
    "chars": 620,
    "preview": "#pragma once\n\n#ifdef DOUBLE_PRECISION\n    typedef double real;\n#else\n    typedef float real;\n#endif\n\n#define K_B        "
  },
  {
    "path": "CUDA/chapter13_分子动力学模拟的CUDA程序开发/force-only/error.cuh",
    "chars": 830,
    "preview": "#pragma once\n#include <stdio.h>\n\n#define CHECK(call)                                   \\\ndo                             "
  },
  {
    "path": "CUDA/chapter13_分子动力学模拟的CUDA程序开发/force-only/force.cu",
    "chars": 3171,
    "preview": "#include \"error.cuh\"\n#include \"force.h\"\n#include \"mic.h\"\n\nstruct LJ\n{\n    real cutoff2;\n    real e24s6;\n    real e48s12;"
  },
  {
    "path": "CUDA/chapter13_分子动力学模拟的CUDA程序开发/force-only/force.h",
    "chars": 79,
    "preview": "#pragma once\n#include \"common.h\"\n\nvoid find_force(int N, int MN, Atom *atom);\n\n"
  },
  {
    "path": "CUDA/chapter13_分子动力学模拟的CUDA程序开发/force-only/initialize.cu",
    "chars": 2562,
    "preview": "#include \"initialize.h\"\n#include \"error.cuh\"\n#include <stdlib.h>\n#include <math.h>\n\nstatic void scale_velocity(int N, re"
  },
  {
    "path": "CUDA/chapter13_分子动力学模拟的CUDA程序开发/force-only/initialize.h",
    "chars": 145,
    "preview": "#pragma once\n#include \"common.h\"\n\nvoid initialize_position(int nx, real ax, Atom *atom);\nvoid initialize_velocity(int N,"
  },
  {
    "path": "CUDA/chapter13_分子动力学模拟的CUDA程序开发/force-only/integrate.cu",
    "chars": 2945,
    "preview": "#include \"integrate.h\"\n#include \"force.h\"\n#include \"error.cuh\"\n#include <stdio.h>\n#include <math.h>\n#include <time.h>\n\ns"
  },
  {
    "path": "CUDA/chapter13_分子动力学模拟的CUDA程序开发/force-only/integrate.h",
    "chars": 227,
    "preview": "#pragma once\n#include \"common.h\"\n\nvoid equilibration\n(\n    int Ne, int N, int MN, real T_0, \n    real time_step, Atom *a"
  },
  {
    "path": "CUDA/chapter13_分子动力学模拟的CUDA程序开发/force-only/main.cu",
    "chars": 957,
    "preview": "#include \"common.h\"\n#include \"memory.h\"\n#include \"initialize.h\"\n#include \"neighbor.h\"\n#include \"integrate.h\"\n#include <s"
  },
  {
    "path": "CUDA/chapter13_分子动力学模拟的CUDA程序开发/force-only/makefile.windows",
    "chars": 594,
    "preview": "all: ljmd\n\nCC = nvcc\nCFLAGS = -O3 -arch=sm_75 -Xcompiler \"/wd 4819\"\n\nljmd: initialize.obj integrate.obj neighbor.obj for"
  },
  {
    "path": "CUDA/chapter13_分子动力学模拟的CUDA程序开发/force-only/memory.cu",
    "chars": 1904,
    "preview": "#include \"error.cuh\"\n#include \"memory.h\"\n#include <stdlib.h>\n\nvoid allocate_memory(int N, int MN, Atom *atom)\n{\n    atom"
  },
  {
    "path": "CUDA/chapter13_分子动力学模拟的CUDA程序开发/force-only/memory.h",
    "chars": 120,
    "preview": "#pragma once\n#include \"common.h\"\n\nvoid allocate_memory(int N, int MN, Atom *atom);\nvoid deallocate_memory(Atom *atom);\n\n"
  },
  {
    "path": "CUDA/chapter13_分子动力学模拟的CUDA程序开发/force-only/mic.h",
    "chars": 408,
    "preview": "#pragma once\n\nstatic void __device__ apply_mic\n(\n    Box box, real *x12, real *y12, real *z12\n)\n{\n    if      (*x12 < - "
  },
  {
    "path": "CUDA/chapter13_分子动力学模拟的CUDA程序开发/force-only/neighbor.cu",
    "chars": 1359,
    "preview": "#include \"neighbor.h\"\n#include \"mic.h\"\n#include <stdio.h>\n#include <stdlib.h>\n\nstatic void __global__ gpu_find_neighbor\n"
  },
  {
    "path": "CUDA/chapter13_分子动力学模拟的CUDA程序开发/force-only/neighbor.h",
    "chars": 81,
    "preview": "#pragma once\n#include \"common.h\"\n\nvoid find_neighbor(int N, int MN, Atom *atom);\n"
  },
  {
    "path": "CUDA/chapter13_分子动力学模拟的CUDA程序开发/plot_energy.m",
    "chars": 639,
    "preview": "clear;close all; font_size=12;\n%load cpp/energy.txt;\n%load force-only/energy.txt;\nload whole-code/energy.txt;\n\nt=(1:size"
  },
  {
    "path": "CUDA/chapter13_分子动力学模拟的CUDA程序开发/whole-code/Makefile",
    "chars": 604,
    "preview": "all: ljmd\n\nCC = nvcc\nCFLAGS = -O3 -arch=sm_60\n\nljmd: initialize.o integrate.o neighbor.o force.o memory.o reduce.o main."
  },
  {
    "path": "CUDA/chapter13_分子动力学模拟的CUDA程序开发/whole-code/common.h",
    "chars": 699,
    "preview": "#pragma once\n\n#ifdef DOUBLE_PRECISION\n    typedef double real;\n#else\n    typedef float real;\n#endif\n\n#define K_B        "
  },
  {
    "path": "CUDA/chapter13_分子动力学模拟的CUDA程序开发/whole-code/error.cuh",
    "chars": 830,
    "preview": "#pragma once\n#include <stdio.h>\n\n#define CHECK(call)                                   \\\ndo                             "
  },
  {
    "path": "CUDA/chapter13_分子动力学模拟的CUDA程序开发/whole-code/force.cu",
    "chars": 2658,
    "preview": "#include \"error.cuh\"\n#include \"force.h\"\n#include \"mic.h\"\n\nstruct LJ\n{\n    real cutoff2;\n    real e24s6; \n    real e48s12"
  },
  {
    "path": "CUDA/chapter13_分子动力学模拟的CUDA程序开发/whole-code/force.h",
    "chars": 79,
    "preview": "#pragma once\n#include \"common.h\"\n\nvoid find_force(int N, int MN, Atom *atom);\n\n"
  },
  {
    "path": "CUDA/chapter13_分子动力学模拟的CUDA程序开发/whole-code/initialize.cu",
    "chars": 2907,
    "preview": "#include \"initialize.h\"\n#include \"error.cuh\"\n#include <stdlib.h>\n#include <math.h>\n\nstatic void scale_velocity(int N, re"
  },
  {
    "path": "CUDA/chapter13_分子动力学模拟的CUDA程序开发/whole-code/initialize.h",
    "chars": 145,
    "preview": "#pragma once\n#include \"common.h\"\n\nvoid initialize_position(int nx, real ax, Atom *atom);\nvoid initialize_velocity(int N,"
  },
  {
    "path": "CUDA/chapter13_分子动力学模拟的CUDA程序开发/whole-code/integrate.cu",
    "chars": 3625,
    "preview": "#include \"integrate.h\"\n#include \"error.cuh\"\n#include \"force.h\"\n#include \"reduce.h\"\n#include <stdio.h>\n#include <math.h>\n"
  },
  {
    "path": "CUDA/chapter13_分子动力学模拟的CUDA程序开发/whole-code/integrate.h",
    "chars": 227,
    "preview": "#pragma once\n#include \"common.h\"\n\nvoid equilibration\n(\n    int Ne, int N, int MN, real T_0, \n    real time_step, Atom *a"
  },
  {
    "path": "CUDA/chapter13_分子动力学模拟的CUDA程序开发/whole-code/main.cu",
    "chars": 957,
    "preview": "#include \"common.h\"\n#include \"memory.h\"\n#include \"initialize.h\"\n#include \"neighbor.h\"\n#include \"integrate.h\"\n#include <s"
  },
  {
    "path": "CUDA/chapter13_分子动力学模拟的CUDA程序开发/whole-code/makefile.windows",
    "chars": 668,
    "preview": "all: ljmd\n\nCC = nvcc\nCFLAGS = -O3 -arch=sm_75 -Xcompiler \"/wd 4819\"\n\nljmd: initialize.obj integrate.obj neighbor.obj for"
  },
  {
    "path": "CUDA/chapter13_分子动力学模拟的CUDA程序开发/whole-code/memory.cu",
    "chars": 2377,
    "preview": "#include \"error.cuh\"\n#include \"memory.h\"\n#include <stdlib.h>\n\nvoid allocate_memory(int N, int MN, Atom *atom)\n{\n    atom"
  },
  {
    "path": "CUDA/chapter13_分子动力学模拟的CUDA程序开发/whole-code/memory.h",
    "chars": 120,
    "preview": "#pragma once\n#include \"common.h\"\n\nvoid allocate_memory(int N, int MN, Atom *atom);\nvoid deallocate_memory(Atom *atom);\n\n"
  },
  {
    "path": "CUDA/chapter13_分子动力学模拟的CUDA程序开发/whole-code/mic.h",
    "chars": 410,
    "preview": "#pragma once\n\nstatic void __device__ apply_mic\n(\n    Box box, real *x12, real *y12, real *z12\n)\n{\n    if      (*x12 < - "
  },
  {
    "path": "CUDA/chapter13_分子动力学模拟的CUDA程序开发/whole-code/neighbor.cu",
    "chars": 1360,
    "preview": "#include \"neighbor.h\"\n#include \"mic.h\"\n#include <stdio.h>\n#include <stdlib.h>\n\nstatic void __global__ gpu_find_neighbor\n"
  },
  {
    "path": "CUDA/chapter13_分子动力学模拟的CUDA程序开发/whole-code/neighbor.h",
    "chars": 81,
    "preview": "#pragma once\n#include \"common.h\"\n\nvoid find_neighbor(int N, int MN, Atom *atom);\n"
  },
  {
    "path": "CUDA/chapter13_分子动力学模拟的CUDA程序开发/whole-code/reduce.cu",
    "chars": 1388,
    "preview": "#include \"reduce.h\"\n#include \"error.cuh\"\n#include <cooperative_groups.h>\nusing namespace cooperative_groups;\nconst int b"
  },
  {
    "path": "CUDA/chapter13_分子动力学模拟的CUDA程序开发/whole-code/reduce.h",
    "chars": 62,
    "preview": "#include \"common.h\"\n\nreal sum(const int N, const real *d_x);\n\n"
  },
  {
    "path": "CUDA/chapter14_CUDA标准库的使用/Makefile",
    "chars": 776,
    "preview": "all: thrust_scan_vector thrust_scan_pointer cublas_gemm cusolver curand_host1 curand_host2\n\nthrust_scan_vector: thrust_s"
  },
  {
    "path": "CUDA/chapter14_CUDA标准库的使用/cublas_gemm.cu",
    "chars": 1800,
    "preview": "#include \"error.cuh\" \n#include <stdio.h>\n#include <cublas_v2.h>\n\nvoid print_matrix(int R, int C, double* A, const char* "
  },
  {
    "path": "CUDA/chapter14_CUDA标准库的使用/curand_host1.cu",
    "chars": 807,
    "preview": "#include <cstdio>\n#include <cstdlib>\n#include <curand.h>\n\nvoid output_results(int N, double *g_x);\n\nint main(int argc, c"
  },
  {
    "path": "CUDA/chapter14_CUDA标准库的使用/curand_host2.cu",
    "chars": 812,
    "preview": "#include <cstdio>\n#include <cstdlib>\n#include <curand.h>\n\nvoid output_results(int N, double *g_x);\n\nint main(int argc, c"
  },
  {
    "path": "CUDA/chapter14_CUDA标准库的使用/cusolver.cu",
    "chars": 1664,
    "preview": "#include \"error.cuh\"\n#include <cstdio>\n#include <cstdlib>\n#include <cusolverDn.h>\n\nint main(int argc, char *argv[])\n{\n  "
  },
  {
    "path": "CUDA/chapter14_CUDA标准库的使用/error.cuh",
    "chars": 829,
    "preview": "#pragma once\n#include <stdio.h>\n\n#define CHECK(call)                                   \\\ndo                             "
  },
  {
    "path": "CUDA/chapter14_CUDA标准库的使用/thrust_scan_pointer.cu",
    "chars": 764,
    "preview": "#include <thrust/execution_policy.h>\n#include <thrust/scan.h>\n#include <cstdio>\n\nint main(int argc, char *argv[])\n{\n    "
  },
  {
    "path": "CUDA/chapter14_CUDA标准库的使用/thrust_scan_vector.cu",
    "chars": 442,
    "preview": "#include <thrust/device_vector.h>\n#include <thrust/scan.h>\n#include <cstdio>\n\nint main(void)\n{\n    int N = 10;\n    thrus"
  },
  {
    "path": "CUDA/chapter1_GPU硬件与CUDA程序开发工具/README.md",
    "chars": 1307,
    "preview": "### 1. 各个GPU主计算能力的架构代号与发布年份\n\n| 主计算能力  | 架构代号 |   发布年份  |\n|:------------:|:---------------:|:--------------:|\n| `X=1` | T"
  },
  {
    "path": "CUDA/chapter2_CUDA中的线程组织/Makefile",
    "chars": 332,
    "preview": "all: hello1 hello2 hello3 hello4 hello5\n\nhello1: hello1.cpp\n\tg++ hello1.cpp -o hello1\n\nhello2: hello2.cu\n\tnvcc hello2.cu"
  },
  {
    "path": "CUDA/chapter2_CUDA中的线程组织/hello1.cpp",
    "chars": 82,
    "preview": "#include <stdio.h>\n\nint main(void)\n{\n    printf(\"Hello World!\\n\");\n    return 0;\n}"
  },
  {
    "path": "CUDA/chapter2_CUDA中的线程组织/hello2.cu",
    "chars": 285,
    "preview": "#include <cstdio>\n#include <iostream>\n\n__global__ void hello_from_gpu()\n{\n    printf(\"Hello World from the GPU!\\n\");\n   "
  },
  {
    "path": "CUDA/chapter2_CUDA中的线程组织/hello3.cu",
    "chars": 211,
    "preview": "#include <cstdio>\n\n__global__ void hello_from_gpu()\n{\n    printf(\"Hello World from the GPU!\\n\");\n}\n\nint main(int argc, c"
  },
  {
    "path": "CUDA/chapter2_CUDA中的线程组织/hello4.cu",
    "chars": 300,
    "preview": "#include <cstdio>\n\n__global__ void hello_from_gpu()\n{\n    const int bid = blockIdx.x;\n    const int tid = threadIdx.x;\n "
  },
  {
    "path": "CUDA/chapter2_CUDA中的线程组织/hello5.cu",
    "chars": 382,
    "preview": "#include <cstdio>\n\n__global__ void hello_from_gpu()\n{\n    const int b = blockIdx.x;\n    const int tx = threadIdx.x;\n    "
  },
  {
    "path": "CUDA/chapter3_简单CUDA程序的基本框架/Makefile",
    "chars": 344,
    "preview": "all: add add1 add2wrong add3if add4device\n\nadd: add.cpp\n\tg++ add.cpp -o add\n\nadd1: add1.cu\n\tnvcc add1.cu -o add1\n\nadd2wr"
  },
  {
    "path": "CUDA/chapter3_简单CUDA程序的基本框架/add.cpp",
    "chars": 1063,
    "preview": "#include <cmath>\n#include <cstdlib>\n#include <cstdio>\n\nconst double EPSILON = 1.0e-15;\nconst double a = 1.23;\nconst doub"
  },
  {
    "path": "CUDA/chapter3_简单CUDA程序的基本框架/add1.cu",
    "chars": 1585,
    "preview": "#include <cmath>\n#include <cstdio>\n\nconst double EPSILON = 1.0e-15;\nconst double a = 1.23;\nconst double b = 2.34;\nconst "
  },
  {
    "path": "CUDA/chapter3_简单CUDA程序的基本框架/add2wrong.cu",
    "chars": 1522,
    "preview": "#include <cmath>\n#include <cstdio>\n\nconst double EPSILON = 1.0e-15;\nconst double a = 1.23;\nconst double b = 2.34;\nconst "
  },
  {
    "path": "CUDA/chapter3_简单CUDA程序的基本框架/add3if.cu",
    "chars": 1586,
    "preview": "#include <math.h>\n#include <stdio.h>\n\nconst double EPSILON = 1.0e-15;\nconst double a = 1.23;\nconst double b = 2.34;\ncons"
  },
  {
    "path": "CUDA/chapter3_简单CUDA程序的基本框架/add4device.cu",
    "chars": 2817,
    "preview": "#include <cmath>\n#include <cstdio>\n#include <cuda.h>\n#include <cuda_runtime.h>\n\nconst double EPSILON = 1.0e-15;\nconst do"
  },
  {
    "path": "CUDA/chapter4_CUDA程序的错误检测/Makefile",
    "chars": 279,
    "preview": "all: check1api check2kernel memcheck\n\ncheck1api: check1api.cu\n\tnvcc check1api.cu -o check1api\n\ncheck2kernel: check2kerne"
  },
  {
    "path": "CUDA/chapter4_CUDA程序的错误检测/check1api.cu",
    "chars": 1687,
    "preview": "#include \"error.cuh\"\n#include <math.h>\n#include <stdio.h>\n\nconst double EPSILON = 1.0e-15;\nconst double a = 1.23;\nconst "
  },
  {
    "path": "CUDA/chapter4_CUDA程序的错误检测/check2kernel.cu",
    "chars": 1755,
    "preview": "#include \"error.cuh\"\n#include <math.h>\n#include <stdio.h>\n\nconst double EPSILON = 1.0e-15;\nconst double a = 1.23;\nconst "
  },
  {
    "path": "CUDA/chapter4_CUDA程序的错误检测/error.cuh",
    "chars": 831,
    "preview": "\n#pragma once\n#include <stdio.h>\n\n#define CHECK(call)                                   \\\ndo                            "
  },
  {
    "path": "CUDA/chapter4_CUDA程序的错误检测/memcheck.cu",
    "chars": 1618,
    "preview": "#include <cmath>\n#include <cstdio>\n\nconst double EPSILON = 1.0e-15;\nconst double a = 1.23;\nconst double b = 2.34;\nconst "
  },
  {
    "path": "CUDA/chapter5_获得GPU加速的关键/Makefile",
    "chars": 528,
    "preview": "all: add1cpu add2gpu add3memcpy arithmetic1cpu arithmetic2gpu\n\nadd1cpu: add1cpu.cu\n\tnvcc -O3 -DUSE_DP add1cpu.cu -o add1"
  },
  {
    "path": "CUDA/chapter5_获得GPU加速的关键/add1cpu.cu",
    "chars": 2076,
    "preview": "#include \"error.cuh\"\n#include <math.h>\n#include <stdio.h>\n\n#ifdef USE_DP\n    typedef double real;\n    const real EPSILON"
  },
  {
    "path": "CUDA/chapter5_获得GPU加速的关键/add2gpu.cu",
    "chars": 2690,
    "preview": "#include \"error.cuh\"\n#include <math.h>\n#include <stdio.h>\n\n#ifdef USE_DP\n    typedef double real;\n    const real EPSILON"
  },
  {
    "path": "CUDA/chapter5_获得GPU加速的关键/add3memcpy.cu",
    "chars": 2728,
    "preview": "#include \"error.cuh\"\n#include <math.h>\n#include <stdio.h>\n\n#ifdef USE_DP\n    typedef double real;\n    const real EPSILON"
  },
  {
    "path": "CUDA/chapter5_获得GPU加速的关键/arithmetic1cpu.cu",
    "chars": 1631,
    "preview": "#include \"error.cuh\"\n#include <cmath>\n#include <cstdio>\n\n#ifdef USE_DP\n    typedef double real;\n#else\n    typedef float "
  },
  {
    "path": "CUDA/chapter5_获得GPU加速的关键/arithmetic2gpu.cu",
    "chars": 2059,
    "preview": "#include \"error.cuh\"\n#include <cmath>\n#include <cstdio>\n\n#ifdef USE_DP\n    typedef double real;\n#else\n    typedef float "
  },
  {
    "path": "CUDA/chapter5_获得GPU加速的关键/error.cuh",
    "chars": 829,
    "preview": "#pragma once\n#include <stdio.h>\n\n#define CHECK(call)                                   \\\ndo                             "
  },
  {
    "path": "CUDA/chapter6_CUDA的内存组织/Makefile",
    "chars": 183,
    "preview": "all: static query\n\nstatic: static.cu\n\tnvcc -O3 -DUSE_DP static.cu -o static\n\nquery: query.cu\n\tnvcc -O3 --ptxas-options=-"
  },
  {
    "path": "CUDA/chapter6_CUDA的内存组织/README.md",
    "chars": 2661,
    "preview": "### 1. CUDA中设备内存的分类与特征\n\n| 内存类型  | 物理位置 |   访问权限  | 可见范围 | 生命周期 |\n|:------------:|:---------------:|:--------------:|:---"
  },
  {
    "path": "CUDA/chapter6_CUDA的内存组织/error.cuh",
    "chars": 828,
    "preview": "#pragma once\n#include <cstdio>\n\n#define CHECK(call)                                   \\\ndo                              "
  },
  {
    "path": "CUDA/chapter6_CUDA的内存组织/query.cu",
    "chars": 1786,
    "preview": "#include \"error.cuh\"\n#include <cstdio>\n\nint main(int argc, char *argv[])\n{\n    int device_id = 0;\n    if (argc > 1) devi"
  },
  {
    "path": "CUDA/chapter6_CUDA的内存组织/static.cu",
    "chars": 559,
    "preview": "#include \"error.cuh\"\n#include <cstdio>\n\n__device__ int d_x = 1;\n__device__ int d_y[2];\n\nvoid __global__ my_kernel(void)\n"
  },
  {
    "path": "CUDA/chapter7_全局内存的合理使用/Makefile",
    "chars": 106,
    "preview": "all: matrix\n\nmatrix: matrix.cu\n\tnvcc -arch=sm_50 matrix.cu -o matrix\n.PHONY: clean\n\nclean:\n\trm -rf matrix\n"
  },
  {
    "path": "CUDA/chapter7_全局内存的合理使用/error.cuh",
    "chars": 829,
    "preview": "#pragma once\n#include <stdio.h>\n\n#define CHECK(call)                                   \\\ndo                             "
  },
  {
    "path": "CUDA/chapter7_全局内存的合理使用/matrix.cu",
    "chars": 4746,
    "preview": "#include \"error.cuh\"\n#include <cstdio>\n#include <cuda_runtime.h>\n#include <cuda.h>\n\n#ifdef USE_DP\n    typedef double rea"
  },
  {
    "path": "CUDA/chapter8_共享内存的合理使用/Makefile",
    "chars": 285,
    "preview": "all: reduce1cpu reduce2gpu bank\n\nreduce1cpu: reduce1cpu.cu\n\tnvcc -arch=sm_50 reduce1cpu.cu -o reduce1cpu\n\nreduce2gpu: re"
  },
  {
    "path": "CUDA/chapter8_共享内存的合理使用/bank.cu",
    "chars": 4241,
    "preview": "#include \"error.cuh\"\n#include <stdio.h>\n\n#ifdef USE_DP\n    typedef double real;\n#else\n    typedef float real;\n#endif\n\nco"
  },
  {
    "path": "CUDA/chapter8_共享内存的合理使用/error.cuh",
    "chars": 829,
    "preview": "#pragma once\n#include <stdio.h>\n\n#define CHECK(call)                                   \\\ndo                             "
  },
  {
    "path": "CUDA/chapter8_共享内存的合理使用/reduce1cpu.cu",
    "chars": 1287,
    "preview": "#include \"error.cuh\"\n#include <cstdio>\n\n#ifdef USE_DP\n    typedef double real;\n#else\n    typedef float real;\n#endif\n\ncon"
  },
  {
    "path": "CUDA/chapter8_共享内存的合理使用/reduce2gpu.cu",
    "chars": 3815,
    "preview": "#include \"error.cuh\"\n#include <cstdio>\n\n#ifdef USE_DP\n    typedef double real;\n#else\n    typedef float real;\n#endif\n\ncon"
  },
  {
    "path": "CUDA/chapter9_原子函数的合理使用/Makefile",
    "chars": 330,
    "preview": "all: reduce neighbor1cpu neighbor2gpu\n\nreduce: reduce.cu\n\tnvcc -g -arch=sm_50 reduce.cu -o reduce\n\nneighbor1cpu: neighbo"
  },
  {
    "path": "CUDA/chapter9_原子函数的合理使用/error.cuh",
    "chars": 829,
    "preview": "#pragma once\n#include <stdio.h>\n\n#define CHECK(call)                                   \\\ndo                             "
  },
  {
    "path": "CUDA/chapter9_原子函数的合理使用/neighbor.txt",
    "chars": 1044630,
    "preview": "2 2 22321 NaN NaN NaN NaN NaN NaN NaN NaN\n2 3 55 NaN NaN NaN NaN NaN NaN NaN NaN\n2 0 22322 NaN NaN NaN NaN NaN NaN NaN N"
  },
  {
    "path": "CUDA/chapter9_原子函数的合理使用/neighbor1cpu.cu",
    "chars": 3820,
    "preview": "#include \"error.cuh\"\n#include <cmath>\n#include <iostream>\n#include <fstream>\n#include <sstream>\n#include <string>\n#inclu"
  },
  {
    "path": "CUDA/chapter9_原子函数的合理使用/neighbor2gpu.cu",
    "chars": 5934,
    "preview": "#include \"error.cuh\"\n#include <cmath>\n#include <iostream>\n#include <fstream>\n#include <sstream>\n#include <string>\n#inclu"
  },
  {
    "path": "CUDA/chapter9_原子函数的合理使用/reduce.cu",
    "chars": 2290,
    "preview": "#include \"error.cuh\"\n#include <cstdio>\n\n#ifdef USE_DP\n    typedef double real;\n#else\n    typedef float real;\n#endif\n\ncon"
  },
  {
    "path": "CUDA/chapter9_原子函数的合理使用/xy.txt",
    "chars": 831006,
    "preview": "28.06444083882057 241.17654747387496\n27.36108113647276 0.09243891995926815\n29.484873440235663 241.16665258048\n28.0792660"
  },
  {
    "path": "README.md",
    "chars": 38,
    "preview": "# CUDA_Programming\n《CUDA编程基础与实践》一书的代码\n"
  }
]

// ... and 3 more files (download for full content)

About this extraction

This page contains the full source code of the MAhaitao999/CUDA_Programming GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 135 files (2.0 MB), approximately 978.6k tokens, and a symbol index with 15 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.

Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.

Extract another repo