[
  {
    "path": ".gitignore",
    "content": "*.swp\n*.{6,8,5,o}\n"
  },
  {
    "path": "Makefile",
    "content": "all: 6g doc\n\n6g:\n\tgo install -v\n\tgo tool vet *.go\n\tgofmt -w *.go\n\nGCCGO=gccgo -gccgoflags '-static-libgcc -O3'\n\ngccgo:\n\tgo install -v -compiler $(GCCGO)\n\ntest: 6gtest gccgotest\n\n6gtest: \n\tgo test\n\ngccgotest: \n\tgo test -compiler $(GCCGO)\n\nbench: 6gbench gccgobench\n\n6gbench:\n\tgo test -bench=.\n\ngccgobench:\n\tgo test -bench=. -compiler $(GCCGO)\n\nclean:\n\tgo clean\n\tgo-optview -c -w *.go\n\tgofmt -w *.go\n\nopt:\n\tgo-optview -w *.go\n\tgofmt -w *.go\n\ndoc:\n\tgodoc github.com/barnex/cuda5 > README\n"
  },
  {
    "path": "README.md",
    "content": "# Go bindings for CUDA\n\nGo bindings for nVIDIA CUDA 5 and later. This package compiles with both gc and gccgo.\n\n![fig](gophergpu.png)\n"
  },
  {
    "path": "cu/Makefile",
    "content": "all: 6g gccgo doc\n\n6g:\n\tgo install -v\n\tgo tool vet *.go\n\tgofmt -w *.go\n\nGCCGO=gccgo -gccgoflags '-static-libgcc -O3'\n\ngccgo:\n\tgo build -v -compiler $(GCCGO)\n\ntest: 6gtest gccgotest\n\n6gtest: \n\tgo test\n\ngccgotest: \n\tgo test -compiler $(GCCGO)\n\nbench: 6gbench gccgobench\n\n6gbench:\n\tgo test -bench=.\n\ngccgobench:\n\tgo test -bench=. -compiler $(GCCGO)\n\nclean:\n\tgo clean\n\ndoc:\n\tgodoc github.com/barnex/cuda5/cu > README\n"
  },
  {
    "path": "cu/README",
    "content": "PACKAGE\n\npackage cu\n    import \"github.com/barnex/cuda5/cu\"\n\n    Go bindings for the CUDA driver API.\n\nCONSTANTS\n\nconst (\n    // If  the number of contexts > number of CPUs, yield to other OS threads when waiting for the GPU, otherwise CUDA spin on the processor.\n    CTX_SCHED_AUTO = C.CU_CTX_SCHED_AUTO\n    // Spin when waiting for results from the GPU. \n    CTX_SCHED_SPIN = C.CU_CTX_SCHED_SPIN\n    // Yield its thread when waiting for results from the GPU.\n    CTX_SCHED_YIELD = C.CU_CTX_SCHED_YIELD\n    // Bock the CPU thread on a synchronization primitive when waiting for the GPU to finish work.\n    CTX_BLOCKING_SYNC\n    // Support mapped pinned allocations. This flag must be set in order to allocate pinned host memory that is accessible to the GPU.\n    CTX_MAP_HOST = C.CU_CTX_MAP_HOST\n    //Do not reduce local memory after resizing local memory for a kernel. \n    CTX_LMEM_RESIZE_TO_MAX = C.CU_CTX_LMEM_RESIZE_TO_MAX\n)\n    Flags for CtxCreate\nconst (\n    SIZEOF_FLOAT32    = 4\n    SIZEOF_FLOAT64    = 8\n    SIZEOF_COMPLEX64  = 8\n    SIZEOF_COMPLEX128 = 16\n)\n    Type size in bytes\n\n\nFUNCTIONS\n\nfunc CtxDestroy(ctx *Context)\n    Destroys the CUDA context specified by ctx. If the context usage count\n    is not equal to 1, or the context is current to any CPU thread other\n    than the current one, this function fails. 
Floating contexts (detached\n    from a CPU thread via cuCtxPopCurrent()) may be destroyed by this\n    function.\n\nfunc CtxDisablePeerAccess(peer Context)\n    Reverses CtxEnablePeerAccess().\n\nfunc CtxEnablePeerAccess(peer Context)\n    Make allocations from the peer Context available to the current context.\n\nfunc CtxGetApiVersion(ctx Context) (version int)\n    Returns the API version to create the context.\n\nfunc CtxSetCurrent(ctx Context)\n    Sets the current active context.\n\nfunc CtxSynchronize()\n    Blocks until the device has completed all preceding requested tasks, if\n    the context was created with the CU_CTX_SCHED_BLOCKING_SYNC flag.\n\nfunc DeviceCanAccessPeer(dev, peer Device) bool\n    Returns true if CtxEnablePeerAccess can be called on a context for dev\n    and peerDev.\n\nfunc DeviceComputeCapability(device Device) (major, minor int)\n    Returns the compute capability of the device.\n\nfunc DeviceGetAttribute(attrib DeviceAttribute, dev Device) int\n    Gets the value of a device attribute.\n\nfunc DeviceGetCount() int\n    Returns the number of devices with compute capability greater than or\n    equal to 1.0 that are available for execution.\n\nfunc DeviceGetName(dev Device) string\n    Gets the name of the device.\n\nfunc DeviceTotalMem(device Device) int64\n    Returns the total amount of memory available on the device in bytes.\n\nfunc FuncGetAttribute(attrib FunctionAttribute, function Function) int\n\nfunc Init(flags int)\n    Initialize the CUDA driver API. Currently, flags must be 0. If Init()\n    has not been called, any function from the driver API will panic with\n    ERROR_NOT_INITIALIZED.\n\nfunc LaunchKernel(f Function, gridDimX, gridDimY, gridDimZ int, blockDimX, blockDimY, blockDimZ int, sharedMemBytes int, stream Stream, kernelParams []unsafe.Pointer)\n\nfunc MemAllocHost(bytes int64) unsafe.Pointer\n\nfunc MemFree(ptr *DevicePtr)\n    Frees device memory allocated by MemAlloc(). 
Overwrites the pointer with\n    NULL. It is safe to double-free.\n\nfunc MemFreeHost(ptr unsafe.Pointer)\n\nfunc MemGetAddressRange(ptr DevicePtr) (bytes int64, base DevicePtr)\n    Returns the base address and size of the allocation (by MemAlloc) that\n    contains the input pointer ptr.\n\nfunc MemGetInfo() (free, total int64)\n    Returns the free and total amount of memory in the current Context (in\n    bytes).\n\nfunc MemHostRegister(ptr unsafe.Pointer, bytes int64, flags MemHostRegisterFlag)\n    Page-locks memory specified by the pointer and bytes. The pointer and\n    byte size must be aligned to the host page size (4KB) See also:\n    MemHostUnregister()\n\nfunc MemHostUnregister(ptr unsafe.Pointer)\n    Unmaps memory locked by MemHostRegister().\n\nfunc Memcpy(dst, src DevicePtr, bytes int64)\n    Copies a number of bytes on the current device. Requires unified\n    addressing to be supported. See also: MemcpyDtoD(). TODO(a): is actually\n    an auto copy for device and/or host memory\n\nfunc MemcpyAsync(dst, src DevicePtr, bytes int64, stream Stream)\n    Asynchronously copies a number of bytes on the current device.\n\nfunc MemcpyDtoD(dst, src DevicePtr, bytes int64)\n    Copies a number of bytes from device to device.\n\nfunc MemcpyDtoDAsync(dst, src DevicePtr, bytes int64, stream Stream)\n    Asynchronously copies a number of bytes from device to device.\n\nfunc MemcpyDtoH(dst unsafe.Pointer, src DevicePtr, bytes int64)\n    Copies a number of bytes from device to host.\n\nfunc MemcpyDtoHAsync(dst unsafe.Pointer, src DevicePtr, bytes int64, stream Stream)\n    Asynchronously copies a number of bytes from device to host. The host\n    memory must be page-locked (see MemRegister)\n\nfunc MemcpyHtoD(dst DevicePtr, src unsafe.Pointer, bytes int64)\n    Copies a number of bytes from host to device.\n\nfunc MemcpyHtoDAsync(dst DevicePtr, src unsafe.Pointer, bytes int64, stream Stream)\n    Asynchronously copies a number of bytes from host to device. 
The host\n    memory must be page-locked (see MemRegister)\n\nfunc MemcpyPeer(dst DevicePtr, dstCtx Context, src DevicePtr, srcCtx Context, bytes int64)\n    Copies from device memory in one context (device) to another.\n\nfunc MemcpyPeerAsync(dst DevicePtr, dstCtx Context, src DevicePtr, srcCtx Context, bytes int64, stream Stream)\n    Asynchronously copies from device memory in one context (device) to\n    another.\n\nfunc MemsetD32(deviceptr DevicePtr, value uint32, N int64)\n    Sets the first N 32-bit values of dst array to value. Asynchronous.\n\nfunc MemsetD32Async(deviceptr DevicePtr, value uint32, N int64, stream Stream)\n    Asynchronously sets the first N 32-bit values of dst array to value.\n\nfunc MemsetD8(deviceptr DevicePtr, value uint8, N int64)\n    Sets the first N 8-bit values of dst array to value. Asynchronous.\n\nfunc MemsetD8Async(deviceptr DevicePtr, value uint8, N int64, stream Stream)\n    Asynchronously sets the first N 32-bit values of dst array to value.\n\nfunc StreamDestroy(stream *Stream)\n    Destroys an asynchronous stream\n\nfunc StreamSynchronize(stream Stream)\n    Blocks until the stream has completed.\n\nfunc Version() int\n    Returns the CUDA driver version.\n\n\nTYPES\n\ntype Context uintptr\n    CUDA context.\n\nfunc CtxCreate(flags uint, dev Device) Context\n    Create a CUDA context.\n\nfunc CtxGetCurrent() Context\n    Gets the current active context.\n\nfunc (ctx Context) ApiVersion() (version int)\n    Returns the API version to create the context.\n\nfunc (ctx *Context) Destroy()\n    Destroys the CUDA context.\n\nfunc (peer Context) DisablePeerAccess()\n    Reverses EnablePeerAccess().\n\nfunc (peer Context) EnablePeerAccess()\n    Make allocations from the peer Context available to the current context.\n\nfunc (ctx Context) SetCurrent()\n    Sets the current active context.\n\ntype DevProp struct {\n    MaxThreadsPerBlock  int\n    MaxThreadsDim       [3]int\n    MaxGridSize         [3]int\n    SharedMemPerBlock   
int\n    TotalConstantMemory int\n    SIMDWidth           int\n    MemPitch            int\n    RegsPerBlock        int\n    ClockRate           int\n    TextureAlign        int\n}\n    Device properties\n\nfunc DeviceGetProperties(dev Device) (prop DevProp)\n    Returns the device's properties.\n\ntype Device int\n    CUDA Device number.\n\nfunc CtxGetDevice() Device\n    Returns the ordinal of the current context's device.\n\nfunc DeviceGet(ordinal int) Device\n    Returns in a device handle given an ordinal in the range [0,\n    DeviceGetCount()-1].\n\nfunc (dev Device) Attribute(attrib DeviceAttribute) int\n    Gets the value of a device attribute.\n\nfunc (dev Device) CanAccessPeer(peer Device) bool\n    Returns true if CtxEnablePeerAccess can be called on a context for dev\n    and peerDev.\n\nfunc (device Device) ComputeCapability() (major, minor int)\n    Returns the compute capability of the device.\n\nfunc (dev Device) Name() string\n    Gets the name of the device.\n\nfunc (dev Device) Properties() DevProp\n    Returns the device's properties.\n\nfunc (device Device) TotalMem() int64\n    Returns the total amount of memory available on the device in bytes.\n\ntype DeviceAttribute int\n\nconst (\n    MAX_THREADS_PER_BLOCK            DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK            // Maximum number of threads per block\n    MAX_BLOCK_DIM_X                  DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X                  // Maximum block dimension X\n    MAX_BLOCK_DIM_Y                  DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y                  // Maximum block dimension Y\n    MAX_BLOCK_DIM_Z                  DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z                  // Maximum block dimension Z\n    MAX_GRID_DIM_X                   DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X                   // Maximum grid dimension X\n    MAX_GRID_DIM_Y                   DeviceAttribute = 
C.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y                   // Maximum grid dimension Y\n    MAX_GRID_DIM_Z                   DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z                   // Maximum grid dimension Z\n    MAX_SHARED_MEMORY_PER_BLOCK      DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK      // Maximum shared memory available per block in bytes\n    TOTAL_CONSTANT_MEMORY            DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY            // Memory available on device for __constant__ variables in a CUDA C kernel in bytes\n    WARP_SIZE                        DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_WARP_SIZE                        // Warp size in threads\n    MAX_PITCH                        DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_PITCH                        // Maximum pitch in bytes allowed by memory copies\n    MAX_REGISTERS_PER_BLOCK          DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK          // Maximum number of 32-bit registers available per block\n    CLOCK_RATE                       DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CLOCK_RATE                       // Peak clock frequency in kilohertz\n    TEXTURE_ALIGNMENT                DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT                // Alignment requirement for textures\n    MULTIPROCESSOR_COUNT             DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT             // Number of multiprocessors on device\n    KERNEL_EXEC_TIMEOUT              DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT              // Specifies whether there is a run time limit on kernels\n    INTEGRATED                       DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_INTEGRATED                       // Device is integrated with host memory\n    CAN_MAP_HOST_MEMORY              DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY              // Device can map host memory into CUDA address space\n    COMPUTE_MODE 
                    DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_COMPUTE_MODE                     // Compute mode (See ::CUcomputemode for details)\n    MAXIMUM_TEXTURE1D_WIDTH          DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH          // Maximum 1D texture width\n    MAXIMUM_TEXTURE2D_WIDTH          DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH          // Maximum 2D texture width\n    MAXIMUM_TEXTURE2D_HEIGHT         DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT         // Maximum 2D texture height\n    MAXIMUM_TEXTURE3D_WIDTH          DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH          // Maximum 3D texture width\n    MAXIMUM_TEXTURE3D_HEIGHT         DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT         // Maximum 3D texture height\n    MAXIMUM_TEXTURE3D_DEPTH          DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH          // Maximum 3D texture depth\n    MAXIMUM_TEXTURE2D_LAYERED_WIDTH  DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH  // Maximum 2D layered texture width\n    MAXIMUM_TEXTURE2D_LAYERED_HEIGHT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT // Maximum 2D layered texture height\n    MAXIMUM_TEXTURE2D_LAYERED_LAYERS DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS // Maximum layers in a 2D layered texture\n    SURFACE_ALIGNMENT                DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT                // Alignment requirement for surfaces\n    CONCURRENT_KERNELS               DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS               // Device can possibly execute multiple kernels concurrently\n    ECC_ENABLED                      DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_ECC_ENABLED                      // Device has ECC support enabled\n    PCI_BUS_ID                       DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_PCI_BUS_ID                   
    // PCI bus ID of the device\n    PCI_DEVICE_ID                    DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID                    // PCI device ID of the device\n    TCC_DRIVER                       DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TCC_DRIVER                       // Device is using TCC driver model\n    MEMORY_CLOCK_RATE                DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE                // Peak memory clock frequency in kilohertz\n    GLOBAL_MEMORY_BUS_WIDTH          DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH          // Global memory bus width in bits\n    L2_CACHE_SIZE                    DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE                    // Size of L2 cache in bytes\n    MAX_THREADS_PER_MULTIPROCESSOR   DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR   // Maximum resident threads per multiprocessor\n    ASYNC_ENGINE_COUNT               DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT               // Number of asynchronous engines\n    UNIFIED_ADDRESSING               DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING               // Device uses shares a unified address space with the host \n    MAXIMUM_TEXTURE1D_LAYERED_WIDTH  DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH  // Maximum 1D layered texture width\n    MAXIMUM_TEXTURE1D_LAYERED_LAYERS DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS // Maximum layers in a 1D layered texture\n)\n\ntype DevicePtr uintptr\n\nfunc MemAlloc(bytes int64) DevicePtr\n    Allocates a number of bytes of device memory.\n\nfunc (ptr DevicePtr) Bytes() (bytes int64)\n    Returns the size of the allocation (by MemAlloc) that contains the input\n    pointer ptr.\n\nfunc (ptr *DevicePtr) Free()\n    Frees device memory allocated by MemAlloc(). Overwrites the pointer with\n    NULL. 
It is safe to double-free.\n\nfunc (ptr DevicePtr) GetAddressRange() (bytes int64, base DevicePtr)\n    Returns the base address and size of the allocation (by MemAlloc) that\n    contains the input pointer ptr.\n\nfunc (ptr DevicePtr) MemoryType() MemoryType\n    Returns the physical memory type that ptr addresses.\n\nfunc (p DevicePtr) String() string\n\ntype Dim3 struct {\n    X, Y, Z int\n}\n\ntype Function uintptr\n    Represents a CUDA CUfunction, a reference to a function within a module.\n\nfunc ModuleGetFunction(module Module, name string) Function\n    Returns a Function handle.\n\nfunc (f Function) GetAttribute(attrib FunctionAttribute) int\n\ntype FunctionAttribute int\n\nconst (\n    FUNC_A_MAX_THREADS_PER_BLOCK FunctionAttribute = C.CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK // The maximum number of threads per block, beyond which a launch of the function would fail.\n    FUNC_A_SHARED_SIZE_BYTES     FunctionAttribute = C.CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES     // The size in bytes of statically-allocated shared memory required by this function. \n    FUNC_A_CONST_SIZE_BYTES      FunctionAttribute = C.CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES      // The size in bytes of user-allocated constant memory required by this function.\n    FUNC_A_LOCAL_SIZE_BYTES      FunctionAttribute = C.CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES      // The size in bytes of local memory used by each thread of this function.\n    FUNC_A_NUM_REGS              FunctionAttribute = C.CU_FUNC_ATTRIBUTE_NUM_REGS              // The number of registers used by each thread of this function.\n    FUNC_A_PTX_VERSION           FunctionAttribute = C.CU_FUNC_ATTRIBUTE_PTX_VERSION           // The PTX virtual architecture version for which the function was compiled. 
\n    FUNC_A_BINARY_VERSION        FunctionAttribute = C.CU_FUNC_ATTRIBUTE_BINARY_VERSION        // The binary architecture version for which the function was compiled.\n)\n\ntype MemHostRegisterFlag int\n\nconst (\n    // Memory is pinned in all CUDA contexts.\n    MEMHOSTREGISTER_PORTABLE MemHostRegisterFlag = C.CU_MEMHOSTREGISTER_PORTABLE\n    // Maps the allocation in CUDA address space. TODO(a): cuMemHostGetDevicePointer()\n    MEMHOSTREGISTER_DEVICEMAP MemHostRegisterFlag = C.CU_MEMHOSTREGISTER_DEVICEMAP\n)\n    Flag for MemHostRegister\n\ntype MemoryType uint\n\nconst (\n    MemoryTypeHost    MemoryType = C.CU_MEMORYTYPE_HOST\n    MemoryTypeDevice  MemoryType = C.CU_MEMORYTYPE_DEVICE\n    MemoryTypeArray   MemoryType = C.CU_MEMORYTYPE_ARRAY\n    MemoryTypeUnified MemoryType = C.CU_MEMORYTYPE_UNIFIED\n)\n\nfunc PointerGetAttributeMemoryType(ptr DevicePtr) (t MemoryType, err Result)\n    Returns the physical memory type that ptr addresses.\n\nfunc (t MemoryType) String() string\n\ntype Module uintptr\n    Represents a CUDA CUmodule, a reference to executable device code.\n\nfunc ModuleLoad(fname string) Module\n    Loads a compute module from file\n\nfunc ModuleLoadData(image string) Module\n    Loads a compute module from string\n\nfunc (m Module) GetFunction(name string) Function\n    Returns a Function handle.\n\ntype Result int\n    CUDA error status. CUDA error statuses are not returned by functions but\n    checked and passed to panic() when not successful. 
If desired, they can\n    be caught by recover().\n\nconst (\n    SUCCESS                              Result = C.CUDA_SUCCESS\n    ERROR_INVALID_VALUE                  Result = C.CUDA_ERROR_INVALID_VALUE\n    ERROR_OUT_OF_MEMORY                  Result = C.CUDA_ERROR_OUT_OF_MEMORY\n    ERROR_NOT_INITIALIZED                Result = C.CUDA_ERROR_NOT_INITIALIZED\n    ERROR_DEINITIALIZED                  Result = C.CUDA_ERROR_DEINITIALIZED\n    ERROR_PROFILER_DISABLED              Result = C.CUDA_ERROR_PROFILER_DISABLED\n    ERROR_PROFILER_NOT_INITIALIZED       Result = C.CUDA_ERROR_PROFILER_NOT_INITIALIZED\n    ERROR_PROFILER_ALREADY_STARTED       Result = C.CUDA_ERROR_PROFILER_ALREADY_STARTED\n    ERROR_PROFILER_ALREADY_STOPPED       Result = C.CUDA_ERROR_PROFILER_ALREADY_STOPPED\n    ERROR_NO_DEVICE                      Result = C.CUDA_ERROR_NO_DEVICE\n    ERROR_INVALID_DEVICE                 Result = C.CUDA_ERROR_INVALID_DEVICE\n    ERROR_INVALID_IMAGE                  Result = C.CUDA_ERROR_INVALID_IMAGE\n    ERROR_INVALID_CONTEXT                Result = C.CUDA_ERROR_INVALID_CONTEXT\n    ERROR_CONTEXT_ALREADY_CURRENT        Result = C.CUDA_ERROR_CONTEXT_ALREADY_CURRENT\n    ERROR_MAP_FAILED                     Result = C.CUDA_ERROR_MAP_FAILED\n    ERROR_UNMAP_FAILED                   Result = C.CUDA_ERROR_UNMAP_FAILED\n    ERROR_ARRAY_IS_MAPPED                Result = C.CUDA_ERROR_ARRAY_IS_MAPPED\n    ERROR_ALREADY_MAPPED                 Result = C.CUDA_ERROR_ALREADY_MAPPED\n    ERROR_NO_BINARY_FOR_GPU              Result = C.CUDA_ERROR_NO_BINARY_FOR_GPU\n    ERROR_ALREADY_ACQUIRED               Result = C.CUDA_ERROR_ALREADY_ACQUIRED\n    ERROR_NOT_MAPPED                     Result = C.CUDA_ERROR_NOT_MAPPED\n    ERROR_NOT_MAPPED_AS_ARRAY            Result = C.CUDA_ERROR_NOT_MAPPED_AS_ARRAY\n    ERROR_NOT_MAPPED_AS_POINTER          Result = C.CUDA_ERROR_NOT_MAPPED_AS_POINTER\n    ERROR_ECC_UNCORRECTABLE              Result = C.CUDA_ERROR_ECC_UNCORRECTABLE\n    
ERROR_UNSUPPORTED_LIMIT              Result = C.CUDA_ERROR_UNSUPPORTED_LIMIT\n    ERROR_CONTEXT_ALREADY_IN_USE         Result = C.CUDA_ERROR_CONTEXT_ALREADY_IN_USE\n    ERROR_INVALID_SOURCE                 Result = C.CUDA_ERROR_INVALID_SOURCE\n    ERROR_FILE_NOT_FOUND                 Result = C.CUDA_ERROR_FILE_NOT_FOUND\n    ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND Result = C.CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND\n    ERROR_SHARED_OBJECT_INIT_FAILED      Result = C.CUDA_ERROR_SHARED_OBJECT_INIT_FAILED\n    ERROR_OPERATING_SYSTEM               Result = C.CUDA_ERROR_OPERATING_SYSTEM\n    ERROR_INVALID_HANDLE                 Result = C.CUDA_ERROR_INVALID_HANDLE\n    ERROR_NOT_FOUND                      Result = C.CUDA_ERROR_NOT_FOUND\n    ERROR_NOT_READY                      Result = C.CUDA_ERROR_NOT_READY\n    ERROR_LAUNCH_FAILED                  Result = C.CUDA_ERROR_LAUNCH_FAILED\n    ERROR_LAUNCH_OUT_OF_RESOURCES        Result = C.CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES\n    ERROR_LAUNCH_TIMEOUT                 Result = C.CUDA_ERROR_LAUNCH_TIMEOUT\n    ERROR_LAUNCH_INCOMPATIBLE_TEXTURING  Result = C.CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING\n    ERROR_PEER_ACCESS_ALREADY_ENABLED    Result = C.CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED\n    ERROR_PEER_ACCESS_NOT_ENABLED        Result = C.CUDA_ERROR_PEER_ACCESS_NOT_ENABLED\n    ERROR_PRIMARY_CONTEXT_ACTIVE         Result = C.CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE\n    ERROR_CONTEXT_IS_DESTROYED           Result = C.CUDA_ERROR_CONTEXT_IS_DESTROYED\n    ERROR_ASSERT                         Result = C.CUDA_ERROR_ASSERT\n    ERROR_TOO_MANY_PEERS                 Result = C.CUDA_ERROR_TOO_MANY_PEERS\n    ERROR_HOST_MEMORY_ALREADY_REGISTERED Result = C.CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED\n    ERROR_HOST_MEMORY_NOT_REGISTERED     Result = C.CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED\n    ERROR_UNKNOWN                        Result = C.CUDA_ERROR_UNKNOWN\n)\n\nfunc StreamQuery(stream Stream) Result\n    Returns Success if all operations 
have completed, ErrorNotReady\n    otherwise\n\nfunc (err Result) String() string\n    Message string for the error\n\ntype Stream uintptr\n    CUDA stream.\n\nfunc StreamCreate() Stream\n    Creates an asynchronous stream\n\nfunc (stream *Stream) Destroy()\n    Destroys the asynchronous stream\n\nfunc (stream Stream) Query() Result\n    Returns Success if all operations have completed, ErrorNotReady\n    otherwise\n\nfunc (stream Stream) Synchronize()\n    Blocks until the stream has completed.\n\n\n"
  },
  {
    "path": "cu/cgoflags.go",
    "content": "package cu\n\n// This file provides CGO flags to find CUDA libraries and headers.\n\n//#cgo LDFLAGS:-lcuda -lcudart\n//\n////default location:\n//#cgo LDFLAGS:-L/usr/local/cuda/lib64 -L/usr/local/cuda/lib\n//#cgo CFLAGS: -I/usr/local/cuda/include/\n//\n////default location if not properly symlinked:\n//#cgo LDFLAGS:-L/usr/local/cuda-6.0/lib64 -L/usr/local/cuda-6.0/lib\n//#cgo LDFLAGS:-L/usr/local/cuda-5.5/lib64 -L/usr/local/cuda-5.5/lib\n//#cgo LDFLAGS:-L/usr/local/cuda-5.0/lib64 -L/usr/local/cuda-5.0/lib\n//#cgo CFLAGS: -I/usr/local/cuda-6.0/include/\n//#cgo CFLAGS: -I/usr/local/cuda-5.5/include/\n//#cgo CFLAGS: -I/usr/local/cuda-5.0/include/\n//\n////arch linux:\n//#cgo LDFLAGS:-L/opt/cuda/lib64 -L/opt/cuda/lib\n//#cgo CFLAGS: -I/opt/cuda/include\n//\n////WINDOWS:\n//#cgo windows LDFLAGS:-LC:/cuda/v5.0/lib/x64 -LC:/cuda/v5.5/lib/x64 -LC:/cuda/v6.0/lib/x64\n//#cgo windows CFLAGS: -IC:/cuda/v5.0/include -IC:/cuda/v5.5/include -IC:/cuda/v6.0/include\nimport \"C\"\n"
  },
  {
    "path": "cu/context.go",
    "content": "package cu\n\n// This file implements CUDA driver context management\n\n//#include <cuda.h>\nimport \"C\"\nimport \"unsafe\"\n\n// CUDA context.\ntype Context uintptr\n\n// Create a CUDA context.\nfunc CtxCreate(flags uint, dev Device) Context {\n\tvar ctx C.CUcontext\n\terr := Result(C.cuCtxCreate(&ctx, C.uint(flags), C.CUdevice(dev)))\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n\treturn Context(uintptr(unsafe.Pointer(ctx)))\n}\n\n//Destroys the CUDA context specified by ctx. If the context usage count is not equal to 1, or the context is current to any CPU thread other than the current one, this function fails. Floating contexts (detached from a CPU thread via cuCtxPopCurrent()) may be destroyed by this function.\nfunc CtxDestroy(ctx *Context) {\n\terr := Result(C.cuCtxDestroy(C.CUcontext(unsafe.Pointer(uintptr(*ctx)))))\n\t*ctx = 0\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n}\n\n//Destroys the CUDA context.\nfunc (ctx *Context) Destroy() {\n\tCtxDestroy(ctx)\n}\n\n// Returns the API version to create the context.\nfunc CtxGetApiVersion(ctx Context) (version int) {\n\tvar cversion C.uint\n\terr := Result(C.cuCtxGetApiVersion(C.CUcontext(unsafe.Pointer(uintptr(ctx))), &cversion))\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n\tversion = int(cversion)\n\treturn\n}\n\n// Returns the API version to create the context.\nfunc (ctx Context) ApiVersion() (version int) {\n\treturn CtxGetApiVersion(ctx)\n}\n\n// Gets the current active context.\nfunc CtxGetCurrent() Context {\n\tvar ctx C.CUcontext\n\terr := Result(C.cuCtxGetCurrent(&ctx))\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n\treturn Context(uintptr(unsafe.Pointer(ctx)))\n}\n\n// Returns the ordinal of the current context's device.\nfunc CtxGetDevice() Device {\n\tvar dev C.CUdevice\n\terr := Result(C.cuCtxGetDevice(&dev))\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n\treturn Device(dev)\n}\n\n// Sets the current active context.\nfunc CtxSetCurrent(ctx Context) {\n\terr := 
Result(C.cuCtxSetCurrent(C.CUcontext(unsafe.Pointer(uintptr(ctx)))))\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n}\n\n// Sets the current active context.\nfunc (ctx Context) SetCurrent() {\n\tCtxSetCurrent(ctx)\n}\n\n// Blocks until the device has completed all preceding requested tasks, if the context was created with the CU_CTX_SCHED_BLOCKING_SYNC flag.\nfunc CtxSynchronize() {\n\terr := Result(C.cuCtxSynchronize())\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n}\n\n// Flags for CtxCreate\nconst (\n\t// If the number of contexts > number of CPUs, yield to other OS threads when waiting for the GPU, otherwise CUDA spin on the processor.\n\tCTX_SCHED_AUTO = C.CU_CTX_SCHED_AUTO\n\t// Spin when waiting for results from the GPU.\n\tCTX_SCHED_SPIN = C.CU_CTX_SCHED_SPIN\n\t// Yield its thread when waiting for results from the GPU.\n\tCTX_SCHED_YIELD = C.CU_CTX_SCHED_YIELD\n\t// Block the CPU thread on a synchronization primitive when waiting for the GPU to finish work.\n\tCTX_BLOCKING_SYNC = C.CU_CTX_BLOCKING_SYNC\n\t// Support mapped pinned allocations. This flag must be set in order to allocate pinned host memory that is accessible to the GPU.\n\tCTX_MAP_HOST = C.CU_CTX_MAP_HOST\n\t//Do not reduce local memory after resizing local memory for a kernel.\n\tCTX_LMEM_RESIZE_TO_MAX = C.CU_CTX_LMEM_RESIZE_TO_MAX\n)\n"
  },
  {
    "path": "cu/context_test.go",
    "content": "package cu\n\nimport (\n\t\"fmt\"\n\t\"testing\"\n)\n\nfunc TestContext(t *testing.T) {\n\tfmt.Println(\"CtxCreate\")\n\tctx := CtxCreate(CTX_SCHED_AUTO, 0)\n\tfmt.Println(\"CtxSetCurrent\")\n\tCtxSetCurrent(ctx)\n\tfmt.Println(\"CtxGetApiVersion:\", ctx.ApiVersion())\n\tfmt.Println(\"CtxGetDevice:\", CtxGetDevice())\n\t(&ctx).Destroy()\n}\n\nfunc BenchmarkGetContext(b *testing.B) {\n\tb.StopTimer()\n\tctx := CtxCreate(CTX_SCHED_AUTO, 0)\n\tCtxSetCurrent(ctx)\n\tb.StartTimer()\n\tfor i := 0; i < b.N; i++ {\n\t\tCtxGetCurrent()\n\t}\n}\n\nfunc BenchmarkSetContext(b *testing.B) {\n\tb.StopTimer()\n\tctx := CtxCreate(CTX_SCHED_AUTO, 0)\n\tb.StartTimer()\n\tfor i := 0; i < b.N; i++ {\n\t\tctx.SetCurrent()\n\t}\n}\n"
  },
  {
    "path": "cu/device.go",
    "content": "package cu\n\n// This file implements CUDA driver device management\n\n//#include <cuda.h>\nimport \"C\"\n\nimport ()\n\n// CUDA Device number.\ntype Device int\n\n// Returns the compute capability of the device.\nfunc DeviceComputeCapability(device Device) (major, minor int) {\n\tvar maj, min C.int\n\terr := Result(C.cuDeviceComputeCapability(&maj, &min, C.CUdevice(device)))\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n\tmajor = int(maj)\n\tminor = int(min)\n\treturn\n}\n\n// Returns the compute capability of the device.\nfunc (device Device) ComputeCapability() (major, minor int) {\n\treturn DeviceComputeCapability(device)\n}\n\n// Returns in a device handle given an ordinal in the range [0, DeviceGetCount()-1].\nfunc DeviceGet(ordinal int) Device {\n\tvar device C.CUdevice\n\terr := Result(C.cuDeviceGet(&device, C.int(ordinal)))\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n\treturn Device(device)\n}\n\n// Gets the value of a device attribute.\nfunc DeviceGetAttribute(attrib DeviceAttribute, dev Device) int {\n\tvar attr C.int\n\terr := Result(C.cuDeviceGetAttribute(&attr, C.CUdevice_attribute(attrib), C.CUdevice(dev)))\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n\treturn int(attr)\n}\n\n// Gets the value of a device attribute.\nfunc (dev Device) Attribute(attrib DeviceAttribute) int {\n\treturn DeviceGetAttribute(attrib, dev)\n}\n\n// Returns the number of devices with compute capability greater than or equal to 1.0 that are available for execution.\nfunc DeviceGetCount() int {\n\tvar count C.int\n\terr := Result(C.cuDeviceGetCount(&count))\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n\treturn int(count)\n}\n\n// Gets the name of the device.\nfunc DeviceGetName(dev Device) string {\n\tsize := 256\n\tbuf := make([]byte, size)\n\tcstr := C.CString(string(buf))\n\terr := Result(C.cuDeviceGetName(cstr, C.int(size), C.CUdevice(dev)))\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n\treturn C.GoString(cstr)\n}\n\n// Gets the name of the device.\nfunc 
(dev Device) Name() string {\n\treturn DeviceGetName(dev)\n}\n\n// Device properties\ntype DevProp struct {\n\tMaxThreadsPerBlock  int\n\tMaxThreadsDim       [3]int\n\tMaxGridSize         [3]int\n\tSharedMemPerBlock   int\n\tTotalConstantMemory int\n\tSIMDWidth           int\n\tMemPitch            int\n\tRegsPerBlock        int\n\tClockRate           int\n\tTextureAlign        int\n}\n\n// Returns the device's properties.\nfunc DeviceGetProperties(dev Device) (prop DevProp) {\n\tvar cprop C.CUdevprop\n\terr := Result(C.cuDeviceGetProperties(&cprop, C.CUdevice(dev)))\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n\tprop.MaxThreadsPerBlock = int(cprop.maxThreadsPerBlock)\n\tprop.MaxThreadsDim[0] = int(cprop.maxThreadsDim[0])\n\tprop.MaxThreadsDim[1] = int(cprop.maxThreadsDim[1])\n\tprop.MaxThreadsDim[2] = int(cprop.maxThreadsDim[2])\n\tprop.MaxGridSize[0] = int(cprop.maxGridSize[0])\n\tprop.MaxGridSize[1] = int(cprop.maxGridSize[1])\n\tprop.MaxGridSize[2] = int(cprop.maxGridSize[2])\n\tprop.SharedMemPerBlock = int(cprop.sharedMemPerBlock)\n\tprop.TotalConstantMemory = int(cprop.totalConstantMemory)\n\tprop.SIMDWidth = int(cprop.SIMDWidth)\n\tprop.MemPitch = int(cprop.memPitch)\n\tprop.RegsPerBlock = int(cprop.regsPerBlock)\n\tprop.ClockRate = int(cprop.clockRate)\n\tprop.TextureAlign = int(cprop.textureAlign)\n\treturn\n}\n\n// Returns the device's properties.\nfunc (dev Device) Properties() DevProp {\n\treturn DeviceGetProperties(dev)\n}\n\n// Returns the total amount of memory available on the device in bytes.\nfunc (device Device) TotalMem() int64 {\n\treturn DeviceTotalMem(device)\n}\n\n// Returns the total amount of memory available on the device in bytes.\nfunc DeviceTotalMem(device Device) int64 {\n\tvar bytes C.size_t\n\terr := Result(C.cuDeviceTotalMem(&bytes, C.CUdevice(device)))\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n\treturn int64(bytes)\n}\n\ntype DeviceAttribute int\n\nconst (\n\tMAX_THREADS_PER_BLOCK            DeviceAttribute = 
C.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK            // Maximum number of threads per block\n\tMAX_BLOCK_DIM_X                  DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X                  // Maximum block dimension X\n\tMAX_BLOCK_DIM_Y                  DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y                  // Maximum block dimension Y\n\tMAX_BLOCK_DIM_Z                  DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z                  // Maximum block dimension Z\n\tMAX_GRID_DIM_X                   DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X                   // Maximum grid dimension X\n\tMAX_GRID_DIM_Y                   DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y                   // Maximum grid dimension Y\n\tMAX_GRID_DIM_Z                   DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z                   // Maximum grid dimension Z\n\tMAX_SHARED_MEMORY_PER_BLOCK      DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK      // Maximum shared memory available per block in bytes\n\tTOTAL_CONSTANT_MEMORY            DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY            // Memory available on device for __constant__ variables in a CUDA C kernel in bytes\n\tWARP_SIZE                        DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_WARP_SIZE                        // Warp size in threads\n\tMAX_PITCH                        DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_PITCH                        // Maximum pitch in bytes allowed by memory copies\n\tMAX_REGISTERS_PER_BLOCK          DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK          // Maximum number of 32-bit registers available per block\n\tCLOCK_RATE                       DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CLOCK_RATE                       // Peak clock frequency in kilohertz\n\tTEXTURE_ALIGNMENT                DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT                // 
Alignment requirement for textures\n\tMULTIPROCESSOR_COUNT             DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT             // Number of multiprocessors on device\n\tKERNEL_EXEC_TIMEOUT              DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT              // Specifies whether there is a run time limit on kernels\n\tINTEGRATED                       DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_INTEGRATED                       // Device is integrated with host memory\n\tCAN_MAP_HOST_MEMORY              DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY              // Device can map host memory into CUDA address space\n\tCOMPUTE_MODE                     DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_COMPUTE_MODE                     // Compute mode (See ::CUcomputemode for details)\n\tMAXIMUM_TEXTURE1D_WIDTH          DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH          // Maximum 1D texture width\n\tMAXIMUM_TEXTURE2D_WIDTH          DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH          // Maximum 2D texture width\n\tMAXIMUM_TEXTURE2D_HEIGHT         DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT         // Maximum 2D texture height\n\tMAXIMUM_TEXTURE3D_WIDTH          DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH          // Maximum 3D texture width\n\tMAXIMUM_TEXTURE3D_HEIGHT         DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT         // Maximum 3D texture height\n\tMAXIMUM_TEXTURE3D_DEPTH          DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH          // Maximum 3D texture depth\n\tMAXIMUM_TEXTURE2D_LAYERED_WIDTH  DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH  // Maximum 2D layered texture width\n\tMAXIMUM_TEXTURE2D_LAYERED_HEIGHT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT // Maximum 2D layered texture height\n\tMAXIMUM_TEXTURE2D_LAYERED_LAYERS DeviceAttribute = 
C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS // Maximum layers in a 2D layered texture\n\tSURFACE_ALIGNMENT                DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT                // Alignment requirement for surfaces\n\tCONCURRENT_KERNELS               DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS               // Device can possibly execute multiple kernels concurrently\n\tECC_ENABLED                      DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_ECC_ENABLED                      // Device has ECC support enabled\n\tPCI_BUS_ID                       DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_PCI_BUS_ID                       // PCI bus ID of the device\n\tPCI_DEVICE_ID                    DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID                    // PCI device ID of the device\n\tTCC_DRIVER                       DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TCC_DRIVER                       // Device is using TCC driver model\n\tMEMORY_CLOCK_RATE                DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE                // Peak memory clock frequency in kilohertz\n\tGLOBAL_MEMORY_BUS_WIDTH          DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH          // Global memory bus width in bits\n\tL2_CACHE_SIZE                    DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE                    // Size of L2 cache in bytes\n\tMAX_THREADS_PER_MULTIPROCESSOR   DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR   // Maximum resident threads per multiprocessor\n\tASYNC_ENGINE_COUNT               DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT               // Number of asynchronous engines\n\tUNIFIED_ADDRESSING               DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING               // Device uses shares a unified address space with the host\n\tMAXIMUM_TEXTURE1D_LAYERED_WIDTH  DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH  // 
Maximum 1D layered texture width\n\tMAXIMUM_TEXTURE1D_LAYERED_LAYERS DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS // Maximum layers in a 1D layered texture\n)\n"
  },
  {
    "path": "cu/device_test.go",
    "content": "package cu\n\nimport (\n\t\"fmt\"\n\t\"testing\"\n)\n\nfunc TestDevice(t *testing.T) {\n\tfmt.Println(\"DeviceGetCount:\", DeviceGetCount())\n\tfor i := 0; i < DeviceGetCount(); i++ {\n\t\tfmt.Println(\"DeviceGet\", i)\n\t\tdev := DeviceGet(i)\n\t\tmajor, minor := dev.ComputeCapability()\n\t\tfmt.Println(\"Name: \", dev.Name())\n\t\tfmt.Println(\"ComputeCapability: \", major, minor)\n\t\tfmt.Println(\"TotalMem: \", dev.TotalMem())\n\n\t\tfmt.Println(\"ATTRIBUTE_MAX_THREADS_PER_BLOCK           :\", dev.Attribute(MAX_THREADS_PER_BLOCK))\n\t\tfmt.Println(\"ATTRIBUTE_MAX_BLOCK_DIM_X                 :\", dev.Attribute(MAX_BLOCK_DIM_X))\n\t\tfmt.Println(\"ATTRIBUTE_MAX_BLOCK_DIM_Y                 :\", dev.Attribute(MAX_BLOCK_DIM_Y))\n\t\tfmt.Println(\"ATTRIBUTE_MAX_BLOCK_DIM_Z                 :\", dev.Attribute(MAX_BLOCK_DIM_Z))\n\t\tfmt.Println(\"ATTRIBUTE_MAX_GRID_DIM_X                  :\", dev.Attribute(MAX_GRID_DIM_X))\n\t\tfmt.Println(\"ATTRIBUTE_MAX_GRID_DIM_Y                  :\", dev.Attribute(MAX_GRID_DIM_Y))\n\t\tfmt.Println(\"ATTRIBUTE_MAX_GRID_DIM_Z                  :\", dev.Attribute(MAX_GRID_DIM_Z))\n\t\tfmt.Println(\"ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK     :\", dev.Attribute(MAX_SHARED_MEMORY_PER_BLOCK))\n\t\tfmt.Println(\"ATTRIBUTE_TOTAL_CONSTANT_MEMORY           :\", dev.Attribute(TOTAL_CONSTANT_MEMORY))\n\t\tfmt.Println(\"ATTRIBUTE_WARP_SIZE                       :\", dev.Attribute(WARP_SIZE))\n\t\tfmt.Println(\"ATTRIBUTE_MAX_PITCH                       :\", dev.Attribute(MAX_PITCH))\n\t\tfmt.Println(\"ATTRIBUTE_MAX_REGISTERS_PER_BLOCK         :\", dev.Attribute(MAX_REGISTERS_PER_BLOCK))\n\t\tfmt.Println(\"ATTRIBUTE_CLOCK_RATE                      :\", dev.Attribute(CLOCK_RATE))\n\t\tfmt.Println(\"ATTRIBUTE_TEXTURE_ALIGNMENT               :\", dev.Attribute(TEXTURE_ALIGNMENT))\n\t\tfmt.Println(\"ATTRIBUTE_MULTIPROCESSOR_COUNT            :\", dev.Attribute(MULTIPROCESSOR_COUNT))\n\t\tfmt.Println(\"ATTRIBUTE_KERNEL_EXEC_TIMEOUT          
   :\", dev.Attribute(KERNEL_EXEC_TIMEOUT))\n\t\tfmt.Println(\"ATTRIBUTE_INTEGRATED                      :\", dev.Attribute(INTEGRATED))\n\t\tfmt.Println(\"ATTRIBUTE_CAN_MAP_HOST_MEMORY             :\", dev.Attribute(CAN_MAP_HOST_MEMORY))\n\t\tfmt.Println(\"ATTRIBUTE_COMPUTE_MODE                    :\", dev.Attribute(COMPUTE_MODE))\n\t\tfmt.Println(\"ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH         :\", dev.Attribute(MAXIMUM_TEXTURE1D_WIDTH))\n\t\tfmt.Println(\"ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH         :\", dev.Attribute(MAXIMUM_TEXTURE2D_WIDTH))\n\t\tfmt.Println(\"ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT        :\", dev.Attribute(MAXIMUM_TEXTURE2D_HEIGHT))\n\t\tfmt.Println(\"ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH         :\", dev.Attribute(MAXIMUM_TEXTURE3D_WIDTH))\n\t\tfmt.Println(\"ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT        :\", dev.Attribute(MAXIMUM_TEXTURE3D_HEIGHT))\n\t\tfmt.Println(\"ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH         :\", dev.Attribute(MAXIMUM_TEXTURE3D_DEPTH))\n\t\tfmt.Println(\"ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH :\", dev.Attribute(MAXIMUM_TEXTURE2D_LAYERED_WIDTH))\n\t\tfmt.Println(\"ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT:\", dev.Attribute(MAXIMUM_TEXTURE2D_LAYERED_HEIGHT))\n\t\tfmt.Println(\"ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS:\", dev.Attribute(MAXIMUM_TEXTURE2D_LAYERED_LAYERS))\n\t\tfmt.Println(\"ATTRIBUTE_SURFACE_ALIGNMENT               :\", dev.Attribute(SURFACE_ALIGNMENT))\n\t\tfmt.Println(\"ATTRIBUTE_CONCURRENT_KERNELS              :\", dev.Attribute(CONCURRENT_KERNELS))\n\t\tfmt.Println(\"ATTRIBUTE_ECC_ENABLED                     :\", dev.Attribute(ECC_ENABLED))\n\t\tfmt.Println(\"ATTRIBUTE_PCI_BUS_ID                      :\", dev.Attribute(PCI_BUS_ID))\n\t\tfmt.Println(\"ATTRIBUTE_PCI_DEVICE_ID                   :\", dev.Attribute(PCI_DEVICE_ID))\n\t\tfmt.Println(\"ATTRIBUTE_TCC_DRIVER                      :\", dev.Attribute(TCC_DRIVER))\n\t\tfmt.Println(\"ATTRIBUTE_MEMORY_CLOCK_RATE               :\", 
dev.Attribute(MEMORY_CLOCK_RATE))\n\t\tfmt.Println(\"ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH         :\", dev.Attribute(GLOBAL_MEMORY_BUS_WIDTH))\n\t\tfmt.Println(\"ATTRIBUTE_L2_CACHE_SIZE                   :\", dev.Attribute(L2_CACHE_SIZE))\n\t\tfmt.Println(\"ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR  :\", dev.Attribute(MAX_THREADS_PER_MULTIPROCESSOR))\n\t\tfmt.Println(\"ATTRIBUTE_ASYNC_ENGINE_COUNT              :\", dev.Attribute(ASYNC_ENGINE_COUNT))\n\t\tfmt.Println(\"ATTRIBUTE_UNIFIED_ADDRESSING              :\", dev.Attribute(UNIFIED_ADDRESSING))\n\t\tfmt.Println(\"ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH :\", dev.Attribute(MAXIMUM_TEXTURE1D_LAYERED_WIDTH))\n\t\tfmt.Println(\"ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS:\", dev.Attribute(MAXIMUM_TEXTURE1D_LAYERED_LAYERS))\n\n\t\tfmt.Printf(\"Properties:%#v\\n\", dev.Properties())\n\t}\n}\n"
  },
  {
    "path": "cu/dim3.go",
    "content": "package cu\n\ntype Dim3 struct {\n\tX, Y, Z int\n}\n"
  },
  {
    "path": "cu/doc.go",
    "content": "// Go bindings for the CUDA driver API.\npackage cu\n"
  },
  {
    "path": "cu/execution.go",
    "content": "package cu\n\n// This file implements execution of CUDA kernels\n\n//#include <cuda.h>\nimport \"C\"\n\nimport (\n\t\"unsafe\"\n)\n\nconst pointerSize = 8 // sorry, 64 bits only.\n\nfunc LaunchKernel(f Function, gridDimX, gridDimY, gridDimZ int, blockDimX, blockDimY, blockDimZ int, sharedMemBytes int, stream Stream, kernelParams []unsafe.Pointer) {\n\n\t// Since Go 1.6, a cgo argument cannot have a Go pointer to Go pointer,\n\t// so we copy the argument values go C memory first.\n\targv := C.malloc(C.size_t(len(kernelParams) * pointerSize))\n\targp := C.malloc(C.size_t(len(kernelParams) * pointerSize))\n\tdefer C.free(argv)\n\tdefer C.free(argp)\n\tfor i := range kernelParams {\n\t\t*((*unsafe.Pointer)(offset(argp, i))) = offset(argv, i)       // argp[i] = &argv[i]\n\t\t*((*uint64)(offset(argv, i))) = *((*uint64)(kernelParams[i])) // argv[i] = *kernelParams[i]\n\t}\n\n\terr := Result(C.cuLaunchKernel(\n\t\tC.CUfunction(unsafe.Pointer(uintptr(f))),\n\t\tC.uint(gridDimX),\n\t\tC.uint(gridDimY),\n\t\tC.uint(gridDimZ),\n\t\tC.uint(blockDimX),\n\t\tC.uint(blockDimY),\n\t\tC.uint(blockDimZ),\n\t\tC.uint(sharedMemBytes),\n\t\tC.CUstream(unsafe.Pointer(uintptr(stream))),\n\t\t(*unsafe.Pointer)(argp),\n\t\t(*unsafe.Pointer)(unsafe.Pointer(uintptr(0)))))\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n}\n\nfunc offset(ptr unsafe.Pointer, i int) unsafe.Pointer {\n\treturn unsafe.Pointer(uintptr(ptr) + pointerSize*uintptr(i))\n}\n"
  },
  {
    "path": "cu/function.go",
    "content": "package cu\n\n// This file implements manipulations on CUDA functions\n\n//#include <cuda.h>\nimport \"C\"\n\nimport (\n\t\"unsafe\"\n)\n\n// Represents a CUDA CUfunction, a reference to a function within a module.\ntype Function uintptr\n\nfunc FuncGetAttribute(attrib FunctionAttribute, function Function) int {\n\tvar attr C.int\n\terr := Result(C.cuFuncGetAttribute(&attr, C.CUfunction_attribute(attrib), C.CUfunction(unsafe.Pointer(uintptr(function)))))\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n\treturn int(attr)\n}\n\nfunc (f Function) GetAttribute(attrib FunctionAttribute) int {\n\treturn FuncGetAttribute(attrib, f)\n}\n\ntype FunctionAttribute int\n\nconst (\n\tFUNC_A_MAX_THREADS_PER_BLOCK FunctionAttribute = C.CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK // The maximum number of threads per block, beyond which a launch of the function would fail.\n\tFUNC_A_SHARED_SIZE_BYTES     FunctionAttribute = C.CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES     // The size in bytes of statically-allocated shared memory required by this function.\n\tFUNC_A_CONST_SIZE_BYTES      FunctionAttribute = C.CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES      // The size in bytes of user-allocated constant memory required by this function.\n\tFUNC_A_LOCAL_SIZE_BYTES      FunctionAttribute = C.CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES      // The size in bytes of local memory used by each thread of this function.\n\tFUNC_A_NUM_REGS              FunctionAttribute = C.CU_FUNC_ATTRIBUTE_NUM_REGS              // The number of registers used by each thread of this function.\n\tFUNC_A_PTX_VERSION           FunctionAttribute = C.CU_FUNC_ATTRIBUTE_PTX_VERSION           // The PTX virtual architecture version for which the function was compiled.\n\tFUNC_A_BINARY_VERSION        FunctionAttribute = C.CU_FUNC_ATTRIBUTE_BINARY_VERSION        // The binary architecture version for which the function was compiled.\n)\n"
  },
  {
    "path": "cu/init.go",
    "content": "package cu\n\n// This file implements CUDA driver initialization\n\n//#include <cuda.h>\nimport \"C\"\n\n// Initialize the CUDA driver API.\n// Currently, flags must be 0.\n// If Init() has not been called, any function from the driver API will panic with ERROR_NOT_INITIALIZED.\nfunc Init(flags int) {\n\terr := Result(C.cuInit(C.uint(flags)))\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n}\n"
  },
  {
    "path": "cu/init_test.go",
    "content": "package cu\n\nimport (\n\t\"fmt\"\n)\n\n// needed for all other tests.\nfunc init() {\n\tInit(0)\n\tctx := CtxCreate(CTX_SCHED_AUTO, 0)\n\tCtxSetCurrent(ctx)\n\tfmt.Println(\"Created CUDA context\")\n}\n"
  },
  {
    "path": "cu/memory.go",
    "content": "package cu\n\n// This file implements CUDA memory management on the driver level\n\n//#include <cuda.h>\nimport \"C\"\n\nimport (\n\t\"fmt\"\n\t\"unsafe\"\n)\n\ntype DevicePtr uintptr\n\n// Allocates a number of bytes of device memory.\nfunc MemAlloc(bytes int64) DevicePtr {\n\tvar devptr C.CUdeviceptr\n\terr := Result(C.cuMemAlloc(&devptr, C.size_t(bytes)))\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n\treturn DevicePtr(devptr)\n}\n\n// Frees device memory allocated by MemAlloc().\n// It is safe to double-free.\nfunc MemFree(p DevicePtr) {\n\tif p == DevicePtr(uintptr(0)) {\n\t\treturn // Allready freed\n\t}\n\terr := Result(C.cuMemFree(C.CUdeviceptr(p)))\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n}\n\n// Frees device memory allocated by MemAlloc().\n// Overwrites the pointer with NULL.\n// It is safe to double-free.\nfunc (ptr DevicePtr) Free() {\n\tMemFree(ptr)\n}\n\n// Copies a number of bytes on the current device.\n// Requires unified addressing to be supported.\n// See also: MemcpyDtoD().\nfunc Memcpy(dst, src DevicePtr, bytes int64) {\n\terr := Result(C.cuMemcpy(C.CUdeviceptr(dst), C.CUdeviceptr(src), C.size_t(bytes)))\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n}\n\n// Asynchronously copies a number of bytes on the current device.\nfunc MemcpyAsync(dst, src DevicePtr, bytes int64, stream Stream) {\n\terr := Result(C.cuMemcpyAsync(C.CUdeviceptr(dst), C.CUdeviceptr(src), C.size_t(bytes), C.CUstream(unsafe.Pointer(uintptr(stream)))))\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n}\n\n// Copies a number of bytes from host to device.\nfunc MemcpyDtoD(dst, src DevicePtr, bytes int64) {\n\terr := Result(C.cuMemcpyDtoD(C.CUdeviceptr(dst), C.CUdeviceptr(src), C.size_t(bytes)))\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n}\n\n// Asynchronously copies a number of bytes from host to device.\nfunc MemcpyDtoDAsync(dst, src DevicePtr, bytes int64, stream Stream) {\n\terr := Result(C.cuMemcpyDtoDAsync(C.CUdeviceptr(dst), C.CUdeviceptr(src), 
C.size_t(bytes), C.CUstream(unsafe.Pointer(uintptr(stream)))))\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n}\n\n// Copies a number of bytes from host to device.\nfunc MemcpyHtoD(dst DevicePtr, src unsafe.Pointer, bytes int64) {\n\terr := Result(C.cuMemcpyHtoD(C.CUdeviceptr(dst), src, C.size_t(bytes)))\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n}\n\n// Asynchronously copies a number of bytes from host to device.\n// The host memory must be page-locked (see MemRegister)\nfunc MemcpyHtoDAsync(dst DevicePtr, src unsafe.Pointer, bytes int64, stream Stream) {\n\terr := Result(C.cuMemcpyHtoDAsync(C.CUdeviceptr(dst), src, C.size_t(bytes), C.CUstream(unsafe.Pointer(uintptr(stream)))))\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n}\n\n// Copies a number of bytes from device to host.\nfunc MemcpyDtoH(dst unsafe.Pointer, src DevicePtr, bytes int64) {\n\terr := Result(C.cuMemcpyDtoH(dst, C.CUdeviceptr(src), C.size_t(bytes)))\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n}\n\n// Asynchronously copies a number of bytes device host to host.\n// The host memory must be page-locked (see MemRegister)\nfunc MemcpyDtoHAsync(dst unsafe.Pointer, src DevicePtr, bytes int64, stream Stream) {\n\terr := Result(C.cuMemcpyDtoHAsync(dst, C.CUdeviceptr(src), C.size_t(bytes), C.CUstream(unsafe.Pointer(uintptr(stream)))))\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n}\n\n// Copies from device memory in one context (device) to another.\nfunc MemcpyPeer(dst DevicePtr, dstCtx Context, src DevicePtr, srcCtx Context, bytes int64) {\n\terr := Result(C.cuMemcpyPeer(C.CUdeviceptr(dst), C.CUcontext(unsafe.Pointer(uintptr(dstCtx))), C.CUdeviceptr(src), C.CUcontext(unsafe.Pointer(uintptr(srcCtx))), C.size_t(bytes)))\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n}\n\n// Asynchronously copies from device memory in one context (device) to another.\nfunc MemcpyPeerAsync(dst DevicePtr, dstCtx Context, src DevicePtr, srcCtx Context, bytes int64, stream Stream) {\n\terr := 
Result(C.cuMemcpyPeerAsync(C.CUdeviceptr(dst), C.CUcontext(unsafe.Pointer(uintptr(dstCtx))), C.CUdeviceptr(src), C.CUcontext(unsafe.Pointer(uintptr(srcCtx))), C.size_t(bytes), C.CUstream(unsafe.Pointer(uintptr(stream)))))\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n}\n\n// Returns the base address and size of the allocation (by MemAlloc) that contains the input pointer ptr.\nfunc MemGetAddressRange(ptr DevicePtr) (bytes int64, base DevicePtr) {\n\tvar cbytes C.size_t\n\tvar cptr C.CUdeviceptr\n\terr := Result(C.cuMemGetAddressRange(&cptr, &cbytes, C.CUdeviceptr(ptr)))\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n\tbytes = int64(cbytes)\n\tbase = DevicePtr(cptr)\n\treturn\n}\n\n// Returns the base address and size of the allocation (by MemAlloc) that contains the input pointer ptr.\nfunc (ptr DevicePtr) GetAddressRange() (bytes int64, base DevicePtr) {\n\treturn MemGetAddressRange(ptr)\n}\n\n// Returns the size of the allocation (by MemAlloc) that contains the input pointer ptr.\nfunc (ptr DevicePtr) Bytes() (bytes int64) {\n\tbytes, _ = MemGetAddressRange(ptr)\n\treturn\n}\n\n// Returns the free and total amount of memory in the current Context (in bytes).\nfunc MemGetInfo() (free, total int64) {\n\tvar cfree, ctotal C.size_t\n\terr := Result(C.cuMemGetInfo(&cfree, &ctotal))\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n\tfree = int64(cfree)\n\ttotal = int64(ctotal)\n\treturn\n}\n\n// Page-locks memory specified by the pointer and bytes.\n// The pointer and byte size must be aligned to the host page size (4KB)\n// See also: MemHostUnregister()\n// doesn't link with cuda6.5\n//func MemHostRegister(ptr unsafe.Pointer, bytes int64, flags MemHostRegisterFlag) {\n//\terr := Result(C.cuMemHostRegister(ptr, C.size_t(bytes), C.uint(flags)))\n//\tif err != SUCCESS {\n//\t\tpanic(err)\n//\t}\n//}\n\n// Unmaps memory locked by MemHostRegister().\n// doesn't link with cuda6.5\n//func MemHostUnregister(ptr unsafe.Pointer) {\n//\terr := Result(C.cuMemHostUnregister(ptr))\n//\tif 
err != SUCCESS {\n//\t\tpanic(err)\n//\t}\n//}\n\nfunc MemAllocHost(bytes int64) unsafe.Pointer {\n\tvar p unsafe.Pointer\n\terr := Result(C.cuMemAllocHost(&p, C.size_t(bytes)))\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n\treturn p\n}\n\nfunc MemFreeHost(ptr unsafe.Pointer) {\n\terr := Result(C.cuMemFreeHost(ptr))\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n}\n\ntype MemHostRegisterFlag int\n\n// Flag for MemHostRegister\nconst (\n\t// Memory is pinned in all CUDA contexts.\n\tMEMHOSTREGISTER_PORTABLE MemHostRegisterFlag = C.CU_MEMHOSTREGISTER_PORTABLE\n\t// Maps the allocation in CUDA address space. TODO(a): cuMemHostGetDevicePointer()\n\tMEMHOSTREGISTER_DEVICEMAP MemHostRegisterFlag = C.CU_MEMHOSTREGISTER_DEVICEMAP\n)\n\nfunc (p DevicePtr) String() string {\n\treturn fmt.Sprint(unsafe.Pointer(uintptr(p)))\n}\n\n// Type size in bytes\nconst (\n\tSIZEOF_FLOAT32    = 4\n\tSIZEOF_FLOAT64    = 8\n\tSIZEOF_COMPLEX64  = 8\n\tSIZEOF_COMPLEX128 = 16\n)\n\n// Physical memory type of device pointer.\ntype MemoryType uint\n\nconst (\n\tMemoryTypeHost    MemoryType = C.CU_MEMORYTYPE_HOST\n\tMemoryTypeDevice  MemoryType = C.CU_MEMORYTYPE_DEVICE\n\tMemoryTypeArray   MemoryType = C.CU_MEMORYTYPE_ARRAY\n\tMemoryTypeUnified MemoryType = C.CU_MEMORYTYPE_UNIFIED\n)\n\nvar memorytype = map[MemoryType]string{\n\tMemoryTypeHost:    \"MemoryTypeHost\",\n\tMemoryTypeDevice:  \"MemoryTypeDevice\",\n\tMemoryTypeArray:   \"MemoryTypeArray\",\n\tMemoryTypeUnified: \"MemoryTypeUnified\"}\n\nfunc (t MemoryType) String() string {\n\tif s, ok := memorytype[t]; ok {\n\t\treturn s\n\t}\n\treturn \"MemoryTypeUnknown\"\n}\n\n// Returns the physical memory type that ptr addresses.\nfunc PointerGetAttributeMemoryType(ptr DevicePtr) (t MemoryType, err Result) {\n\tvar typ uint64 // foresee enough memory just to be safe\n\terr = Result(C.cuPointerGetAttribute(unsafe.Pointer(&typ),\n\t\tC.CU_POINTER_ATTRIBUTE_MEMORY_TYPE, C.CUdeviceptr(uintptr(ptr))))\n\treturn MemoryType(uint(typ)), err\n}\n\n// 
Returns the physical memory type that ptr addresses.\nfunc (ptr DevicePtr) MemoryType() MemoryType {\n\tt, err := PointerGetAttributeMemoryType(ptr)\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n\treturn t\n}\n"
  },
  {
    "path": "cu/memory_test.go",
    "content": "package cu\n\nimport (\n\t\"fmt\"\n\t\"math\"\n\t\"testing\"\n\t\"unsafe\"\n)\n\nfunc TestMalloc(t *testing.T) {\n\tfor i := 0; i < 1024; i++ {\n\t\tpointer := MemAlloc(16 * 1024 * 1024)\n\t\tpointer.Free()\n\t}\n\tfor i := 0; i < 1024; i++ {\n\t\tpointer := MemAlloc(16 * 1024 * 1024)\n\t\tMemFree(pointer)\n\t}\n}\n\nfunc BenchmarkMallocFree1B(b *testing.B) {\n\tfor i := 0; i < b.N; i++ {\n\t\tm := MemAlloc(1)\n\t\tm.Free()\n\t}\n}\n\nfunc BenchmarkMallocFree1kB(b *testing.B) {\n\tfor i := 0; i < b.N; i++ {\n\t\tm := MemAlloc(1024)\n\t\tm.Free()\n\t}\n}\n\nfunc BenchmarkMallocFree1MB(b *testing.B) {\n\tfor i := 0; i < b.N; i++ {\n\t\tm := MemAlloc(1024 * 1024)\n\t\tm.Free()\n\t}\n}\n\nfunc TestMemAddressRange(t *testing.T) {\n\tN := 12345\n\tptr := MemAlloc(int64(N))\n\tsize, base := MemGetAddressRange(ptr)\n\tif size != int64(N) {\n\t\tt.Fail()\n\t}\n\tif base != ptr {\n\t\tt.Fail()\n\t}\n\tsize, base = 0, DevicePtr(0)\n\tsize, base = ptr.GetAddressRange()\n\tif ptr.Bytes() != int64(N) {\n\t\tt.Fail()\n\t}\n}\n\nfunc TestMemGetInfo(t *testing.T) {\n\tfree, total := MemGetInfo()\n\tfmt.Println(\"MemGetInfo: \", free, \"/\", total)\n\tif free > total {\n\t\tt.Fail()\n\t}\n\tif total == 0 {\n\t\tt.Fail()\n\t}\n}\n\nfunc TestMemsetAsync(t *testing.T) {\n\tN := int64(32 * 1024)\n\thost1 := make([]float32, N)\n\tfor i := range host1 {\n\t\thost1[i] = float32(i)\n\t}\n\thost2 := make([]float32, N)\n\tdev1 := MemAlloc(int64(4 * N))\n\tMemcpyHtoD(dev1, (unsafe.Pointer(&host1[0])), 4*N)\n\tstr := StreamCreate()\n\tMemsetD32Async(dev1, math.Float32bits(42), N, str)\n\tMemsetD32Async(dev1, math.Float32bits(21), N/2, str)\n\tMemcpyDtoH((unsafe.Pointer(&host2[0])), dev1, 4*N)\n\tstr.Synchronize()\n\t(&str).Destroy()\n\tfor i := 0; i < len(host2)/2; i++ {\n\t\tif host2[i] != 21 {\n\t\t\tt.Fail()\n\t\t}\n\t}\n\tfor i := len(host2) / 2; i < len(host2); i++ {\n\t\tif host2[i] != 42 {\n\t\t\tt.Fail()\n\t\t}\n\t}\n\tdev1.Free()\n}\n\nfunc TestMemset(t *testing.T) 
{\n\tN := int64(32 * 1024)\n\thost1 := make([]float32, N)\n\tfor i := range host1 {\n\t\thost1[i] = float32(i)\n\t}\n\thost2 := make([]float32, N)\n\tdev1 := MemAlloc(int64(4 * N))\n\tMemcpyHtoD(dev1, (unsafe.Pointer(&host1[0])), 4*N)\n\tMemsetD32(dev1, math.Float32bits(42), N)\n\tMemsetD32(dev1, math.Float32bits(21), N/2)\n\tMemcpyDtoH((unsafe.Pointer(&host2[0])), dev1, 4*N)\n\tfor i := 0; i < len(host2)/2; i++ {\n\t\tif host2[i] != 21 {\n\t\t\tt.Fail()\n\t\t}\n\t}\n\tfor i := len(host2) / 2; i < len(host2); i++ {\n\t\tif host2[i] != 42 {\n\t\t\tt.Fail()\n\t\t}\n\t}\n\tdev1.Free()\n}\n\nfunc TestMemcpy(t *testing.T) {\n\tN := int64(32 * 1024)\n\thost1 := make([]float32, N)\n\tfor i := range host1 {\n\t\thost1[i] = float32(i)\n\t}\n\thost2 := make([]float32, N)\n\tdev1 := MemAlloc(int64(4 * N))\n\tdev2 := MemAlloc(int64(4 * N))\n\tMemcpyHtoD(dev1, (unsafe.Pointer(&host1[0])), 4*N)\n\tMemcpyDtoD(dev2, dev1, 4*N)\n\tMemcpyDtoH((unsafe.Pointer(&host2[0])), dev2, 4*N)\n\tfor i := range host2 {\n\t\tif host2[i] != float32(i) {\n\t\t\tt.Fail()\n\t\t}\n\t}\n\tdev1.Free()\n\tdev2.Free()\n}\n\nfunc TestMemcpyAsync(t *testing.T) {\n\tN := int64(32 * 1024)\n\thost1 := make([]float32, N)\n\tfor i := range host1 {\n\t\thost1[i] = float32(i)\n\t}\n\thost2 := make([]float32, N)\n\tdev1 := MemAlloc(int64(4 * N))\n\tdev2 := MemAlloc(int64(4 * N))\n\tstream := StreamCreate()\n\tMemcpyHtoDAsync(dev1, (unsafe.Pointer(&host1[0])), 4*N, stream)\n\tMemcpyDtoDAsync(dev2, dev1, 4*N, stream)\n\tMemcpyDtoHAsync((unsafe.Pointer(&host2[0])), dev2, 4*N, stream)\n\tstream.Synchronize()\n\tfor i := range host2 {\n\t\tif host2[i] != float32(i) {\n\t\t\tt.Fail()\n\t\t}\n\t}\n\tdev1.Free()\n\tdev2.Free()\n}\n\nfunc TestMemcpyAsyncRegistered(t *testing.T) {\n\tN := int64(32 * 1024)\n\thost1 := make([]float32, N)\n\tfor i := range host1 {\n\t\thost1[i] = float32(i)\n\t}\n\thost2 := make([]float32, N)\n\tdev1 := MemAlloc(int64(4 * N))\n\tdev2 := MemAlloc(int64(4 * N))\n\tstream := 
StreamCreate()\n\tMemcpyHtoDAsync(dev1, (unsafe.Pointer(&host1[0])), 4*N, stream)\n\tMemcpyDtoDAsync(dev2, dev1, 4*N, stream)\n\tMemcpyDtoHAsync((unsafe.Pointer(&host2[0])), dev2, 4*N, stream)\n\tstream.Synchronize()\n\tfor i := range host2 {\n\t\tif host2[i] != float32(i) {\n\t\t\tt.Fail()\n\t\t}\n\t}\n\tdev1.Free()\n\tdev2.Free()\n}\n\nfunc BenchmarkMemcpy(b *testing.B) {\n\tb.StopTimer()\n\tN := int64(32 * 1024 * 1024)\n\thost1 := make([]float32, N)\n\thost2 := make([]float32, N)\n\tdev1 := MemAlloc(int64(4 * N))\n\tdefer dev1.Free()\n\tdev2 := MemAlloc(int64(4 * N))\n\tdefer dev2.Free()\n\tb.SetBytes(4 * N)\n\tb.StartTimer()\n\tfor i := 0; i < b.N; i++ {\n\t\tMemcpyHtoD(dev1, (unsafe.Pointer(&host1[0])), 4*N)\n\t\tMemcpyDtoD(dev2, dev1, 4*N)\n\t\tMemcpyDtoH((unsafe.Pointer(&host2[0])), dev2, 4*N)\n\t}\n}\n"
  },
  {
    "path": "cu/memset.go",
    "content": "package cu\n\n// This file implements CUDA memset functions.\n\n//#include <cuda.h>\nimport \"C\"\n\nimport (\n\t\"unsafe\"\n)\n\n// Sets the first N 32-bit values of dst array to value.\n// Asynchronous.\nfunc MemsetD32(deviceptr DevicePtr, value uint32, N int64) {\n\terr := Result(C.cuMemsetD32(C.CUdeviceptr(deviceptr), C.uint(value), C.size_t(N)))\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n}\n\n// Asynchronously sets the first N 32-bit values of dst array to value.\nfunc MemsetD32Async(deviceptr DevicePtr, value uint32, N int64, stream Stream) {\n\terr := Result(C.cuMemsetD32Async(C.CUdeviceptr(deviceptr), C.uint(value), C.size_t(N), C.CUstream(unsafe.Pointer(uintptr(stream)))))\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n}\n\n// Sets the first N 8-bit values of dst array to value.\n// Asynchronous.\nfunc MemsetD8(deviceptr DevicePtr, value uint8, N int64) {\n\terr := Result(C.cuMemsetD8(C.CUdeviceptr(deviceptr), C.uchar(value), C.size_t(N)))\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n}\n\n// Asynchronously sets the first N 32-bit values of dst array to value.\nfunc MemsetD8Async(deviceptr DevicePtr, value uint8, N int64, stream Stream) {\n\terr := Result(C.cuMemsetD8Async(C.CUdeviceptr(deviceptr), C.uchar(value), C.size_t(N), C.CUstream(unsafe.Pointer(uintptr(stream)))))\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n}\n"
  },
  {
    "path": "cu/module.go",
    "content": "package cu\n\n// This file implements loading of CUDA ptx modules\n\n//#include <cuda.h>\nimport \"C\"\n\nimport (\n\t\"unsafe\"\n)\n\n// Represents a CUDA CUmodule, a reference to executable device code.\ntype Module uintptr\n\n// Loads a compute module from file\nfunc ModuleLoad(fname string) Module {\n\t//fmt.Fprintln(os.Stderr, \"driver.ModuleLoad\", fname)\n\tvar mod C.CUmodule\n\terr := Result(C.cuModuleLoad(&mod, C.CString(fname)))\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n\treturn Module(uintptr(unsafe.Pointer(mod)))\n}\n\n// Loads a compute module from string\nfunc ModuleLoadData(image string) Module {\n\tvar mod C.CUmodule\n\terr := Result(C.cuModuleLoadData(&mod, unsafe.Pointer(C.CString(image))))\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n\treturn Module(uintptr(unsafe.Pointer(mod)))\n}\n\n// Returns a Function handle.\nfunc ModuleGetFunction(module Module, name string) Function {\n\tvar function C.CUfunction\n\terr := Result(C.cuModuleGetFunction(\n\t\t&function,\n\t\tC.CUmodule(unsafe.Pointer(uintptr(module))),\n\t\tC.CString(name)))\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n\treturn Function(uintptr(unsafe.Pointer(function)))\n}\n\n// Returns a Function handle.\nfunc (m Module) GetFunction(name string) Function {\n\treturn ModuleGetFunction(m, name)\n}\n"
  },
  {
    "path": "cu/module_test.go",
    "content": "package cu\n\nimport (\n\t\"testing\"\n\t\"unsafe\"\n\t//\"fmt\"\n)\n\nfunc TestModule(test *testing.T) {\n\tmod := ModuleLoad(\"/testdata/testmodule.ptx\")\n\tf := mod.GetFunction(\"testMemset\")\n\n\tN := 1000\n\tN4 := 4 * int64(N)\n\ta := make([]float32, N)\n\tA := MemAlloc(N4)\n\tdefer A.Free()\n\taptr := unsafe.Pointer(&a[0])\n\tMemcpyHtoD(A, aptr, N4)\n\n\tvar value float32\n\tvalue = 42\n\n\tvar n int\n\tn = N / 2\n\n\tblock := 128\n\tgrid := DivUp(N, block)\n\tshmem := 0\n\targs := []unsafe.Pointer{unsafe.Pointer(&A), unsafe.Pointer(&value), unsafe.Pointer(&n)}\n\tLaunchKernel(f, grid, 1, 1, block, 1, 1, shmem, 0, args)\n\n\tMemcpyDtoH(aptr, A, N4)\n\tfor i := 0; i < N/2; i++ {\n\t\tif a[i] != 42 {\n\t\t\ttest.Fail()\n\t\t}\n\t}\n\tfor i := N / 2; i < N; i++ {\n\t\tif a[i] != 0 {\n\t\t\ttest.Fail()\n\t\t}\n\t}\n\t//fmt.Println(a)\n}\n\n// Integer division rounded up.\nfunc DivUp(x, y int) int {\n\treturn ((x - 1) / y) + 1\n}\n"
  },
  {
    "path": "cu/peer.go",
    "content": "package cu\n\n// This file implements CUDA unified addressing.\n\n//#include <cuda.h>\nimport \"C\"\n\nimport (\n\t\"unsafe\"\n)\n\n// Make allocations from the peer Context available to the current context.\nfunc CtxEnablePeerAccess(peer Context) {\n\terr := Result(C.cuCtxEnablePeerAccess(C.CUcontext(unsafe.Pointer(uintptr(peer))), C.uint(0)))\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n}\n\n// Make allocations from the peer Context available to the current context.\nfunc (peer Context) EnablePeerAccess() {\n\tCtxEnablePeerAccess(peer)\n}\n\n// Reverses CtxEnablePeerAccess().\nfunc CtxDisablePeerAccess(peer Context) {\n\terr := Result(C.cuCtxDisablePeerAccess(C.CUcontext(unsafe.Pointer(uintptr(peer)))))\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n}\n\n// Reverses EnablePeerAccess().\nfunc (peer Context) DisablePeerAccess() {\n\tCtxDisablePeerAccess(peer)\n}\n\n// Returns true if CtxEnablePeerAccess can be called on a context for dev and peerDev.\nfunc DeviceCanAccessPeer(dev, peer Device) bool {\n\tvar canAccessPeer C.int\n\terr := Result(C.cuDeviceCanAccessPeer(&canAccessPeer, C.CUdevice(dev), C.CUdevice(peer)))\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n\treturn int(canAccessPeer) != 0\n}\n\n// Returns true if CtxEnablePeerAccess can be called on a context for dev and peerDev.\nfunc (dev Device) CanAccessPeer(peer Device) bool {\n\treturn DeviceCanAccessPeer(dev, peer)\n}\n"
  },
  {
    "path": "cu/result.go",
    "content": "package cu\n\n// This file provides access to CUDA driver error statuses (type CUresult).\n\n//#include <cuda.h>\nimport \"C\"\nimport (\n\t\"fmt\"\n)\n\n// CUDA error status.\n// CUDA error statuses are not returned by functions but checked and passed to\n// panic() when not successful. If desired, they can be caught by\n// recover().\ntype Result int\n\n// Message string for the error\nfunc (err Result) String() string {\n\tstr, ok := errorString[err]\n\tif !ok {\n\t\treturn \"Unknown CUresult: \" + fmt.Sprint(int(err))\n\t}\n\treturn str\n}\n\nconst (\n\tSUCCESS                              Result = C.CUDA_SUCCESS\n\tERROR_INVALID_VALUE                  Result = C.CUDA_ERROR_INVALID_VALUE\n\tERROR_OUT_OF_MEMORY                  Result = C.CUDA_ERROR_OUT_OF_MEMORY\n\tERROR_NOT_INITIALIZED                Result = C.CUDA_ERROR_NOT_INITIALIZED\n\tERROR_DEINITIALIZED                  Result = C.CUDA_ERROR_DEINITIALIZED\n\tERROR_PROFILER_DISABLED              Result = C.CUDA_ERROR_PROFILER_DISABLED\n\tERROR_PROFILER_NOT_INITIALIZED       Result = C.CUDA_ERROR_PROFILER_NOT_INITIALIZED\n\tERROR_PROFILER_ALREADY_STARTED       Result = C.CUDA_ERROR_PROFILER_ALREADY_STARTED\n\tERROR_PROFILER_ALREADY_STOPPED       Result = C.CUDA_ERROR_PROFILER_ALREADY_STOPPED\n\tERROR_NO_DEVICE                      Result = C.CUDA_ERROR_NO_DEVICE\n\tERROR_INVALID_DEVICE                 Result = C.CUDA_ERROR_INVALID_DEVICE\n\tERROR_INVALID_IMAGE                  Result = C.CUDA_ERROR_INVALID_IMAGE\n\tERROR_INVALID_CONTEXT                Result = C.CUDA_ERROR_INVALID_CONTEXT\n\tERROR_CONTEXT_ALREADY_CURRENT        Result = C.CUDA_ERROR_CONTEXT_ALREADY_CURRENT\n\tERROR_MAP_FAILED                     Result = C.CUDA_ERROR_MAP_FAILED\n\tERROR_UNMAP_FAILED                   Result = C.CUDA_ERROR_UNMAP_FAILED\n\tERROR_ARRAY_IS_MAPPED                Result = C.CUDA_ERROR_ARRAY_IS_MAPPED\n\tERROR_ALREADY_MAPPED                 Result = 
C.CUDA_ERROR_ALREADY_MAPPED\n\tERROR_NO_BINARY_FOR_GPU              Result = C.CUDA_ERROR_NO_BINARY_FOR_GPU\n\tERROR_ALREADY_ACQUIRED               Result = C.CUDA_ERROR_ALREADY_ACQUIRED\n\tERROR_NOT_MAPPED                     Result = C.CUDA_ERROR_NOT_MAPPED\n\tERROR_NOT_MAPPED_AS_ARRAY            Result = C.CUDA_ERROR_NOT_MAPPED_AS_ARRAY\n\tERROR_NOT_MAPPED_AS_POINTER          Result = C.CUDA_ERROR_NOT_MAPPED_AS_POINTER\n\tERROR_ECC_UNCORRECTABLE              Result = C.CUDA_ERROR_ECC_UNCORRECTABLE\n\tERROR_UNSUPPORTED_LIMIT              Result = C.CUDA_ERROR_UNSUPPORTED_LIMIT\n\tERROR_CONTEXT_ALREADY_IN_USE         Result = C.CUDA_ERROR_CONTEXT_ALREADY_IN_USE\n\tERROR_INVALID_SOURCE                 Result = C.CUDA_ERROR_INVALID_SOURCE\n\tERROR_FILE_NOT_FOUND                 Result = C.CUDA_ERROR_FILE_NOT_FOUND\n\tERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND Result = C.CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND\n\tERROR_SHARED_OBJECT_INIT_FAILED      Result = C.CUDA_ERROR_SHARED_OBJECT_INIT_FAILED\n\tERROR_OPERATING_SYSTEM               Result = C.CUDA_ERROR_OPERATING_SYSTEM\n\tERROR_INVALID_HANDLE                 Result = C.CUDA_ERROR_INVALID_HANDLE\n\tERROR_NOT_FOUND                      Result = C.CUDA_ERROR_NOT_FOUND\n\tERROR_NOT_READY                      Result = C.CUDA_ERROR_NOT_READY\n\tERROR_LAUNCH_FAILED                  Result = C.CUDA_ERROR_LAUNCH_FAILED\n\tERROR_LAUNCH_OUT_OF_RESOURCES        Result = C.CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES\n\tERROR_LAUNCH_TIMEOUT                 Result = C.CUDA_ERROR_LAUNCH_TIMEOUT\n\tERROR_LAUNCH_INCOMPATIBLE_TEXTURING  Result = C.CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING\n\tERROR_PEER_ACCESS_ALREADY_ENABLED    Result = C.CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED\n\tERROR_PEER_ACCESS_NOT_ENABLED        Result = C.CUDA_ERROR_PEER_ACCESS_NOT_ENABLED\n\tERROR_PRIMARY_CONTEXT_ACTIVE         Result = C.CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE\n\tERROR_CONTEXT_IS_DESTROYED           Result = C.CUDA_ERROR_CONTEXT_IS_DESTROYED\n\tERROR_ASSERT    
                     Result = C.CUDA_ERROR_ASSERT\n\tERROR_TOO_MANY_PEERS                 Result = C.CUDA_ERROR_TOO_MANY_PEERS\n\tERROR_HOST_MEMORY_ALREADY_REGISTERED Result = C.CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED\n\tERROR_HOST_MEMORY_NOT_REGISTERED     Result = C.CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED\n\tERROR_HARDWARE_STACK_ERROR           Result = 714 //C.CUDA_ERROR_HARDWARE_STACK_ERROR\n\tERROR_ILLEGAL_INSTRUCTION            Result = 715 //C.CUDA_ERROR_ILLEGAL_INSTRUCTION\n\tERROR_MISALIGNED_ADDRESS             Result = 716 //C.CUDA_ERROR_MISALIGNED_ADDRESS\n\tERROR_INVALID_ADDRESS_SPACE          Result = 717 //C.CUDA_ERROR_INVALID_ADDRESS_SPACE\n\tERROR_INVALID_PC                     Result = 718 //C.CUDA_ERROR_INVALID_PC\n\tERROR_NOT_PERMITTED                  Result = 800 //C.CUDA_ERROR_NOT_PERMITTED\n\tERROR_NOT_SUPPORTED                  Result = 801 //C.CUDA_ERROR_NOT_SUPPORTED\n\tERROR_UNKNOWN                        Result = C.CUDA_ERROR_UNKNOWN\n)\n\n// Map with error strings for Result error numbers\nvar errorString map[Result]string = map[Result]string{\n\tSUCCESS:                              \"CUDA_SUCCESS\",\n\tERROR_INVALID_VALUE:                  \"CUDA_ERROR_INVALID_VALUE\",\n\tERROR_OUT_OF_MEMORY:                  \"CUDA_ERROR_OUT_OF_MEMORY\",\n\tERROR_NOT_INITIALIZED:                \"CUDA_ERROR_NOT_INITIALIZED\",\n\tERROR_DEINITIALIZED:                  \"CUDA_ERROR_DEINITIALIZED\",\n\tERROR_PROFILER_DISABLED:              \"CUDA_ERROR_PROFILER_DISABLED\",\n\tERROR_PROFILER_NOT_INITIALIZED:       \"CUDA_ERROR_PROFILER_NOT_INITIALIZED\",\n\tERROR_PROFILER_ALREADY_STARTED:       \"CUDA_ERROR_PROFILER_ALREADY_STARTED\",\n\tERROR_PROFILER_ALREADY_STOPPED:       \"CUDA_ERROR_PROFILER_ALREADY_STOPPED\",\n\tERROR_NO_DEVICE:                      \"CUDA_ERROR_NO_DEVICE\",\n\tERROR_INVALID_DEVICE:                 \"CUDA_ERROR_INVALID_DEVICE\",\n\tERROR_INVALID_IMAGE:                  \"CUDA_ERROR_INVALID_IMAGE\",\n\tERROR_INVALID_CONTEXT:          
      \"CUDA_ERROR_INVALID_CONTEXT\",\n\tERROR_CONTEXT_ALREADY_CURRENT:        \"CUDA_ERROR_CONTEXT_ALREADY_CURRENT\",\n\tERROR_MAP_FAILED:                     \"CUDA_ERROR_MAP_FAILED\",\n\tERROR_UNMAP_FAILED:                   \"CUDA_ERROR_UNMAP_FAILED\",\n\tERROR_ARRAY_IS_MAPPED:                \"CUDA_ERROR_ARRAY_IS_MAPPED\",\n\tERROR_ALREADY_MAPPED:                 \"CUDA_ERROR_ALREADY_MAPPED\",\n\tERROR_NO_BINARY_FOR_GPU:              \"CUDA_ERROR_NO_BINARY_FOR_GPU\",\n\tERROR_ALREADY_ACQUIRED:               \"CUDA_ERROR_ALREADY_ACQUIRED\",\n\tERROR_NOT_MAPPED:                     \"CUDA_ERROR_NOT_MAPPED\",\n\tERROR_NOT_MAPPED_AS_ARRAY:            \"CUDA_ERROR_NOT_MAPPED_AS_ARRAY\",\n\tERROR_NOT_MAPPED_AS_POINTER:          \"CUDA_ERROR_NOT_MAPPED_AS_POINTER\",\n\tERROR_ECC_UNCORRECTABLE:              \"CUDA_ERROR_ECC_UNCORRECTABLE\",\n\tERROR_UNSUPPORTED_LIMIT:              \"CUDA_ERROR_UNSUPPORTED_LIMIT\",\n\tERROR_CONTEXT_ALREADY_IN_USE:         \"CUDA_ERROR_CONTEXT_ALREADY_IN_USE\",\n\tERROR_INVALID_SOURCE:                 \"CUDA_ERROR_INVALID_SOURCE\",\n\tERROR_FILE_NOT_FOUND:                 \"CUDA_ERROR_FILE_NOT_FOUND\",\n\tERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: \"CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND\",\n\tERROR_SHARED_OBJECT_INIT_FAILED:      \"CUDA_ERROR_SHARED_OBJECT_INIT_FAILED\",\n\tERROR_OPERATING_SYSTEM:               \"CUDA_ERROR_OPERATING_SYSTEM\",\n\tERROR_INVALID_HANDLE:                 \"CUDA_ERROR_INVALID_HANDLE\",\n\tERROR_NOT_FOUND:                      \"CUDA_ERROR_NOT_FOUND\",\n\tERROR_NOT_READY:                      \"CUDA_ERROR_NOT_READY\",\n\tERROR_LAUNCH_OUT_OF_RESOURCES:        \"CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES\",\n\tERROR_LAUNCH_TIMEOUT:                 \"CUDA_ERROR_LAUNCH_TIMEOUT\",\n\tERROR_LAUNCH_INCOMPATIBLE_TEXTURING:  \"CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING\",\n\tERROR_PEER_ACCESS_ALREADY_ENABLED:    \"CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED\",\n\tERROR_PEER_ACCESS_NOT_ENABLED:        
\"CUDA_ERROR_PEER_ACCESS_NOT_ENABLED\",\n\tERROR_PRIMARY_CONTEXT_ACTIVE:         \"CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE\",\n\tERROR_CONTEXT_IS_DESTROYED:           \"CUDA_ERROR_CONTEXT_IS_DESTROYED\",\n\tERROR_ASSERT:                         \"CUDA_ERROR_ASSERT\",\n\tERROR_TOO_MANY_PEERS:                 \"CUDA_ERROR_TOO_MANY_PEERS\",\n\tERROR_HOST_MEMORY_ALREADY_REGISTERED: \"CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED\",\n\tERROR_HOST_MEMORY_NOT_REGISTERED:     \"CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED\",\n\tERROR_HARDWARE_STACK_ERROR:           \"CUDA_ERROR_HARDWARE_STACK_ERROR\",\n\tERROR_ILLEGAL_INSTRUCTION:            \"CUDA_ERROR_ILLEGAL_INSTRUCTION\",\n\tERROR_MISALIGNED_ADDRESS:             \"CUDA_ERROR_MISALIGNED_ADDRESS\",\n\tERROR_INVALID_ADDRESS_SPACE:          \"CUDA_ERROR_INVALID_ADDRESS_SPACE\",\n\tERROR_INVALID_PC:                     \"CUDA_ERROR_INVALID_PC\",\n\tERROR_LAUNCH_FAILED:                  \"CUDA_ERROR_LAUNCH_FAILED\",\n\tERROR_NOT_PERMITTED:                  \"CUDA_ERROR_NOT_PERMITTED\",\n\tERROR_NOT_SUPPORTED:                  \"CUDA_ERROR_NOT_SUPPORTED\",\n\tERROR_UNKNOWN:                        \"CUDA_ERROR_UNKNOWN\"}\n"
  },
  {
    "path": "cu/runtimeapi.go",
    "content": "package cu\n\n// This file implements parts of the CUDA runtime api instead of the driver\n// api the rest of this package uses.\n// It might be useful to move this to a seperate package at some point.\n\n//#include <cuda_runtime.h>\nimport \"C\"\nimport \"unsafe\"\n\n// Set the device as current.\nfunc SetDevice(device Device) {\n\terr := Result(C.cudaSetDevice(C.int(device)))\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n}\n\n// Reset the state of the current device.\nfunc DeviceReset() {\n\terr := Result(C.cudaDeviceReset())\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n}\n\n// Set CUDA device flags.\nfunc SetDeviceFlags(flags uint) {\n\terr := Result(C.cudaSetDeviceFlags(C.uint(flags)))\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n}\n\n//Flags for SetDeviceFlasgs\nconst (\n\t// The default, decides to yield or not based on active CUDA threads and processors.\n\tDeviceAuto = C.cudaDeviceScheduleAuto\n\t// Actively spin while waiting for device.\n\tDeviceSpin = C.cudaDeviceScheduleSpin\n\t// Yield when waiting.\n\tDeviceYield = C.cudaDeviceScheduleYield\n\t// ScheduleBlockingSync block CPU on sync.\n\tDeviceScheduleBlockingSync = C.cudaDeviceScheduleBlockingSync\n\t// ScheduleBlockingSync block CPU on sync.  
Deprecated since cuda 4.0\n\tDeviceBlockingSync = C.cudaDeviceBlockingSync\n\t// For use with pinned host memory\n\tDeviceMapHost = C.cudaDeviceMapHost\n\t// Do not reduce local memory to try and prevent thrashing\n\tDeviceLmemResizeToMax = C.cudaDeviceLmemResizeToMax\n)\n\nfunc Malloc(bytes int64) DevicePtr {\n\tvar devptr unsafe.Pointer\n\terr := Result(C.cudaMalloc(&devptr, C.size_t(bytes)))\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n\treturn DevicePtr(devptr)\n}\n\nfunc MallocHost(bytes int64) unsafe.Pointer {\n\tvar p unsafe.Pointer\n\terr := Result(C.cudaMallocHost(&p, C.size_t(bytes)))\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n\treturn p\n}\n\nfunc FreeHost(ptr unsafe.Pointer) {\n\terr := Result(C.cudaFreeHost(ptr))\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n}\n\n// Copies a number of bytes in the direction specified by flags\nfunc MemCpy(dst, src unsafe.Pointer, bytes int64, flags uint) {\n\terr := Result(C.cudaMemcpy(dst, src, C.size_t(bytes), uint32(flags)))\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n}\n\n//Flags for memory copy types\nconst (\n\t// Host to Host\n\tHtoH = C.cudaMemcpyHostToHost\n\t// Host to Device\n\tHtoD = C.cudaMemcpyHostToDevice\n\t// Device to Host\n\tDtoH = C.cudaMemcpyDeviceToHost\n\t// Device to Device\n\tDtoD = C.cudaMemcpyDeviceToDevice\n\t// Default, unified virtual address space\n\tVirt = C.cudaMemcpyDefault\n)\n"
  },
  {
    "path": "cu/stream.go",
    "content": "package cu\n\n// This file implements CUDA streams\n\n//#include <cuda.h>\nimport \"C\"\nimport \"unsafe\"\n\n// CUDA stream.\ntype Stream uintptr\n\n// Creates an asynchronous stream\nfunc StreamCreate() Stream {\n\tvar stream C.CUstream\n\terr := Result(C.cuStreamCreate(&stream, C.uint(0))) // flags has to be zero\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n\treturn Stream(uintptr(unsafe.Pointer(stream)))\n}\n\n// Destroys the asynchronous stream\nfunc (stream *Stream) Destroy() {\n\tstr := *stream\n\terr := Result(C.cuStreamDestroy(C.CUstream(unsafe.Pointer(uintptr(str)))))\n\t*stream = 0\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n}\n\n// Destroys an asynchronous stream\nfunc StreamDestroy(stream *Stream) {\n\tstream.Destroy()\n}\n\n// Blocks until the stream has completed.\nfunc (stream Stream) Synchronize() {\n\terr := Result(C.cuStreamSynchronize(C.CUstream(unsafe.Pointer(uintptr(stream)))))\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n}\n\n// Returns Success if all operations have completed, ErrorNotReady otherwise\nfunc (stream Stream) Query() Result {\n\treturn Result(C.cuStreamQuery(C.CUstream(unsafe.Pointer(uintptr(stream)))))\n}\n\n// Returns Success if all operations have completed, ErrorNotReady otherwise\nfunc StreamQuery(stream Stream) Result {\n\treturn stream.Query()\n}\n\n// Blocks until the stream has completed.\nfunc StreamSynchronize(stream Stream) {\n\tstream.Synchronize()\n}\n"
  },
  {
    "path": "cu/testdata/testmodule.cu",
    "content": "/*\n * Module to test CUDA module loading and execution.\n * To be compiled with:\n * nvcc -ptx testmodule.cu\n */\n\n\n#ifdef __cplusplus\nextern \"C\" {\n#endif\n\n#define threadindex ( ( blockIdx.y*gridDim.x + blockIdx.x ) * blockDim.x + threadIdx.x )\n\n/// Sets the first N elements of array to value.\n__global__ void testMemset(float* array, float value, int N){\n\tint i = threadindex;\n\tif(i < N){\n\t\tarray[i] = value;\n\t}\n}\n\n\n#ifdef __cplusplus\n}\n#endif\n"
  },
  {
    "path": "cu/testdata/testmodule.ptx",
    "content": "\t.version 1.4\n\t.target sm_10, map_f64_to_f32\n\t// compiled with /usr/local/cuda/open64/lib//be\n\t// nvopencc 4.0 built on 2011-02-18\n\n\t//-----------------------------------------------------------\n\t// Compiling /tmp/tmpxft_00000e56_00000000-9_testmodule.cpp3.i (/tmp/ccBI#.rDLD4T)\n\t//-----------------------------------------------------------\n\n\t//-----------------------------------------------------------\n\t// Options:\n\t//-----------------------------------------------------------\n\t//  Target:ptx, ISA:sm_10, Endian:little, Pointer Size:64\n\t//  -O3\t(Optimization level)\n\t//  -g0\t(Debug level)\n\t//  -m2\t(Report advisories)\n\t//-----------------------------------------------------------\n\n\t.file\t1\t\"<command-line>\"\n\t.file\t2\t\"/tmp/tmpxft_00000e56_00000000-8_testmodule.cudafe2.gpu\"\n\t.file\t3\t\"/usr/lib/gcc/x86_64-linux-gnu/4.4.3/include/stddef.h\"\n\t.file\t4\t\"/usr/local/cuda/bin/../include/crt/device_runtime.h\"\n\t.file\t5\t\"/usr/local/cuda/bin/../include/host_defines.h\"\n\t.file\t6\t\"/usr/local/cuda/bin/../include/builtin_types.h\"\n\t.file\t7\t\"/usr/local/cuda/bin/../include/device_types.h\"\n\t.file\t8\t\"/usr/local/cuda/bin/../include/driver_types.h\"\n\t.file\t9\t\"/usr/local/cuda/bin/../include/surface_types.h\"\n\t.file\t10\t\"/usr/local/cuda/bin/../include/texture_types.h\"\n\t.file\t11\t\"/usr/local/cuda/bin/../include/vector_types.h\"\n\t.file\t12\t\"/usr/local/cuda/bin/../include/device_launch_parameters.h\"\n\t.file\t13\t\"/usr/local/cuda/bin/../include/crt/storage_class.h\"\n\t.file\t14\t\"/usr/include/bits/types.h\"\n\t.file\t15\t\"/usr/include/time.h\"\n\t.file\t16\t\"testmodule.cu\"\n\t.file\t17\t\"/usr/local/cuda/bin/../include/common_functions.h\"\n\t.file\t18\t\"/usr/local/cuda/bin/../include/math_functions.h\"\n\t.file\t19\t\"/usr/local/cuda/bin/../include/math_constants.h\"\n\t.file\t20\t\"/usr/local/cuda/bin/../include/device_functions.h\"\n\t.file\t21\t\"/usr/local/cuda/bin/../include
/sm_11_atomic_functions.h\"\n\t.file\t22\t\"/usr/local/cuda/bin/../include/sm_12_atomic_functions.h\"\n\t.file\t23\t\"/usr/local/cuda/bin/../include/sm_13_double_functions.h\"\n\t.file\t24\t\"/usr/local/cuda/bin/../include/sm_20_atomic_functions.h\"\n\t.file\t25\t\"/usr/local/cuda/bin/../include/sm_20_intrinsics.h\"\n\t.file\t26\t\"/usr/local/cuda/bin/../include/surface_functions.h\"\n\t.file\t27\t\"/usr/local/cuda/bin/../include/texture_fetch_functions.h\"\n\t.file\t28\t\"/usr/local/cuda/bin/../include/math_functions_dbl_ptx1.h\"\n\n\n\t.entry testMemset (\n\t\t.param .u64 __cudaparm_testMemset_array,\n\t\t.param .f32 __cudaparm_testMemset_value,\n\t\t.param .s32 __cudaparm_testMemset_N)\n\t{\n\t.reg .u16 %rh<4>;\n\t.reg .u32 %r<10>;\n\t.reg .u64 %rd<6>;\n\t.reg .f32 %f<3>;\n\t.reg .pred %p<3>;\n\t.loc\t16\t7\t0\n$LDWbegin_testMemset:\n\tmov.u16 \t%rh1, %nctaid.x;\n\tmov.u16 \t%rh2, %ctaid.y;\n\tmul.wide.u16 \t%r1, %rh1, %rh2;\n\tcvt.u32.u16 \t%r2, %ctaid.x;\n\tadd.u32 \t%r3, %r2, %r1;\n\tcvt.u32.u16 \t%r4, %ntid.x;\n\tmul.lo.u32 \t%r5, %r4, %r3;\n\tcvt.u32.u16 \t%r6, %tid.x;\n\tadd.u32 \t%r7, %r6, %r5;\n\tld.param.s32 \t%r8, [__cudaparm_testMemset_N];\n\tsetp.le.s32 \t%p1, %r8, %r7;\n\t@%p1 bra \t$Lt_0_1026;\n\t.loc\t16\t10\t0\n\tld.param.f32 \t%f1, [__cudaparm_testMemset_value];\n\tld.param.u64 \t%rd1, [__cudaparm_testMemset_array];\n\tcvt.s64.s32 \t%rd2, %r7;\n\tmul.wide.s32 \t%rd3, %r7, 4;\n\tadd.u64 \t%rd4, %rd1, %rd3;\n\tst.global.f32 \t[%rd4+0], %f1;\n$Lt_0_1026:\n\t.loc\t16\t12\t0\n\texit;\n$LDWend_testMemset:\n\t} // testMemset\n\n"
  },
  {
    "path": "cu/version.go",
    "content": "package cu\n\n// This file implements CUDA driver version management\n\n//#include <cuda.h>\nimport \"C\"\n\n// Returns the CUDA driver version.\nfunc Version() int {\n\tvar version C.int\n\terr := Result(C.cuDriverGetVersion(&version))\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n\treturn int(version)\n}\n"
  },
  {
    "path": "cu/version_test.go",
    "content": "package cu\n\nimport (\n\t\"fmt\"\n\t\"testing\"\n)\n\nfunc TestVersion(t *testing.T) {\n\tfmt.Println(\"CUDA driver version: \", Version())\n}\n"
  },
  {
    "path": "cuda/Makefile",
    "content": "all: 6g gccgo doc\n\n6g:\n\tgo install -v\n\tgo tool vet *.go\n\tgofmt -w *.go\n\nGCCGO=gccgo -gccgoflags '-static-libgcc -O3'\n\ngccgo:\n\tgo build -v -compiler $(GCCGO)\n\ntest: 6gtest gccgotest\n\n6gtest: \n\tgo test\n\ngccgotest: \n\tgo test -compiler $(GCCGO)\n\nbench: 6gbench gccgobench\n\n6gbench:\n\tgo test -bench=.\n\ngccgobench:\n\tgo test -bench=. -compiler $(GCCGO)\n\nclean:\n\tgo clean\n\ndoc:\n\tgodoc github.com/barnex/cuda5/cu > README\n"
  },
  {
    "path": "cuda/README",
    "content": "PACKAGE\n\npackage cu\n    import \"github.com/barnex/cuda5/cu\"\n\n    Go bindings for the CUDA driver API.\n\nCONSTANTS\n\nconst (\n    // If  the number of contexts > number of CPUs, yield to other OS threads when waiting for the GPU, otherwise CUDA spin on the processor.\n    CTX_SCHED_AUTO = C.CU_CTX_SCHED_AUTO\n    // Spin when waiting for results from the GPU. \n    CTX_SCHED_SPIN = C.CU_CTX_SCHED_SPIN\n    // Yield its thread when waiting for results from the GPU.\n    CTX_SCHED_YIELD = C.CU_CTX_SCHED_YIELD\n    // Bock the CPU thread on a synchronization primitive when waiting for the GPU to finish work.\n    CTX_BLOCKING_SYNC\n    // Support mapped pinned allocations. This flag must be set in order to allocate pinned host memory that is accessible to the GPU.\n    CTX_MAP_HOST = C.CU_CTX_MAP_HOST\n    //Do not reduce local memory after resizing local memory for a kernel. \n    CTX_LMEM_RESIZE_TO_MAX = C.CU_CTX_LMEM_RESIZE_TO_MAX\n)\n    Flags for CtxCreate\nconst (\n    SIZEOF_FLOAT32    = 4\n    SIZEOF_FLOAT64    = 8\n    SIZEOF_COMPLEX64  = 8\n    SIZEOF_COMPLEX128 = 16\n)\n    Type size in bytes\n\n\nFUNCTIONS\n\nfunc CtxDestroy(ctx *Context)\n    Destroys the CUDA context specified by ctx. If the context usage count\n    is not equal to 1, or the context is current to any CPU thread other\n    than the current one, this function fails. 
Floating contexts (detached\n    from a CPU thread via cuCtxPopCurrent()) may be destroyed by this\n    function.\n\nfunc CtxDisablePeerAccess(peer Context)\n    Reverses CtxEnablePeerAccess().\n\nfunc CtxEnablePeerAccess(peer Context)\n    Make allocations from the peer Context available to the current context.\n\nfunc CtxGetApiVersion(ctx Context) (version int)\n    Returns the API version to create the context.\n\nfunc CtxSetCurrent(ctx Context)\n    Sets the current active context.\n\nfunc CtxSynchronize()\n    Blocks until the device has completed all preceding requested tasks, if\n    the context was created with the CU_CTX_SCHED_BLOCKING_SYNC flag.\n\nfunc DeviceCanAccessPeer(dev, peer Device) bool\n    Returns true if CtxEnablePeerAccess can be called on a context for dev\n    and peerDev.\n\nfunc DeviceComputeCapability(device Device) (major, minor int)\n    Returns the compute capability of the device.\n\nfunc DeviceGetAttribute(attrib DeviceAttribute, dev Device) int\n    Gets the value of a device attribute.\n\nfunc DeviceGetCount() int\n    Returns the number of devices with compute capability greater than or\n    equal to 1.0 that are available for execution.\n\nfunc DeviceGetName(dev Device) string\n    Gets the name of the device.\n\nfunc DeviceTotalMem(device Device) int64\n    Returns the total amount of memory available on the device in bytes.\n\nfunc FuncGetAttribute(attrib FunctionAttribute, function Function) int\n\nfunc Init(flags int)\n    Initialize the CUDA driver API. Currently, flags must be 0. If Init()\n    has not been called, any function from the driver API will panic with\n    ERROR_NOT_INITIALIZED.\n\nfunc LaunchKernel(f Function, gridDimX, gridDimY, gridDimZ int, blockDimX, blockDimY, blockDimZ int, sharedMemBytes int, stream Stream, kernelParams []unsafe.Pointer)\n\nfunc MemAllocHost(bytes int64) unsafe.Pointer\n\nfunc MemFree(ptr *DevicePtr)\n    Frees device memory allocated by MemAlloc(). 
Overwrites the pointer with\n    NULL. It is safe to double-free.\n\nfunc MemFreeHost(ptr unsafe.Pointer)\n\nfunc MemGetAddressRange(ptr DevicePtr) (bytes int64, base DevicePtr)\n    Returns the base address and size of the allocation (by MemAlloc) that\n    contains the input pointer ptr.\n\nfunc MemGetInfo() (free, total int64)\n    Returns the free and total amount of memory in the current Context (in\n    bytes).\n\nfunc MemHostRegister(ptr unsafe.Pointer, bytes int64, flags MemHostRegisterFlag)\n    Page-locks memory specified by the pointer and bytes. The pointer and\n    byte size must be aligned to the host page size (4KB) See also:\n    MemHostUnregister()\n\nfunc MemHostUnregister(ptr unsafe.Pointer)\n    Unmaps memory locked by MemHostRegister().\n\nfunc Memcpy(dst, src DevicePtr, bytes int64)\n    Copies a number of bytes on the current device. Requires unified\n    addressing to be supported. See also: MemcpyDtoD(). TODO(a): is actually\n    an auto copy for device and/or host memory\n\nfunc MemcpyAsync(dst, src DevicePtr, bytes int64, stream Stream)\n    Asynchronously copies a number of bytes on the current device.\n\nfunc MemcpyDtoD(dst, src DevicePtr, bytes int64)\n    Copies a number of bytes from device to device.\n\nfunc MemcpyDtoDAsync(dst, src DevicePtr, bytes int64, stream Stream)\n    Asynchronously copies a number of bytes from device to device.\n\nfunc MemcpyDtoH(dst unsafe.Pointer, src DevicePtr, bytes int64)\n    Copies a number of bytes from device to host.\n\nfunc MemcpyDtoHAsync(dst unsafe.Pointer, src DevicePtr, bytes int64, stream Stream)\n    Asynchronously copies a number of bytes from device to host. The host\n    memory must be page-locked (see MemRegister)\n\nfunc MemcpyHtoD(dst DevicePtr, src unsafe.Pointer, bytes int64)\n    Copies a number of bytes from host to device.\n\nfunc MemcpyHtoDAsync(dst DevicePtr, src unsafe.Pointer, bytes int64, stream Stream)\n    Asynchronously copies a number of bytes from host to device. 
The host\n    memory must be page-locked (see MemRegister)\n\nfunc MemcpyPeer(dst DevicePtr, dstCtx Context, src DevicePtr, srcCtx Context, bytes int64)\n    Copies from device memory in one context (device) to another.\n\nfunc MemcpyPeerAsync(dst DevicePtr, dstCtx Context, src DevicePtr, srcCtx Context, bytes int64, stream Stream)\n    Asynchronously copies from device memory in one context (device) to\n    another.\n\nfunc MemsetD32(deviceptr DevicePtr, value uint32, N int64)\n    Sets the first N 32-bit values of dst array to value. Asynchronous.\n\nfunc MemsetD32Async(deviceptr DevicePtr, value uint32, N int64, stream Stream)\n    Asynchronously sets the first N 32-bit values of dst array to value.\n\nfunc MemsetD8(deviceptr DevicePtr, value uint8, N int64)\n    Sets the first N 8-bit values of dst array to value. Asynchronous.\n\nfunc MemsetD8Async(deviceptr DevicePtr, value uint8, N int64, stream Stream)\n    Asynchronously sets the first N 8-bit values of dst array to value.\n\nfunc StreamDestroy(stream *Stream)\n    Destroys an asynchronous stream\n\nfunc StreamSynchronize(stream Stream)\n    Blocks until the stream has completed.\n\nfunc Version() int\n    Returns the CUDA driver version.\n\n\nTYPES\n\ntype Context uintptr\n    CUDA context.\n\nfunc CtxCreate(flags uint, dev Device) Context\n    Create a CUDA context.\n\nfunc CtxGetCurrent() Context\n    Gets the current active context.\n\nfunc (ctx Context) ApiVersion() (version int)\n    Returns the API version to create the context.\n\nfunc (ctx *Context) Destroy()\n    Destroys the CUDA context.\n\nfunc (peer Context) DisablePeerAccess()\n    Reverses EnablePeerAccess().\n\nfunc (peer Context) EnablePeerAccess()\n    Make allocations from the peer Context available to the current context.\n\nfunc (ctx Context) SetCurrent()\n    Sets the current active context.\n\ntype DevProp struct {\n    MaxThreadsPerBlock  int\n    MaxThreadsDim       [3]int\n    MaxGridSize         [3]int\n    SharedMemPerBlock   
int\n    TotalConstantMemory int\n    SIMDWidth           int\n    MemPitch            int\n    RegsPerBlock        int\n    ClockRate           int\n    TextureAlign        int\n}\n    Device properties\n\nfunc DeviceGetProperties(dev Device) (prop DevProp)\n    Returns the device's properties.\n\ntype Device int\n    CUDA Device number.\n\nfunc CtxGetDevice() Device\n    Returns the ordinal of the current context's device.\n\nfunc DeviceGet(ordinal int) Device\n    Returns in a device handle given an ordinal in the range [0,\n    DeviceGetCount()-1].\n\nfunc (dev Device) Attribute(attrib DeviceAttribute) int\n    Gets the value of a device attribute.\n\nfunc (dev Device) CanAccessPeer(peer Device) bool\n    Returns true if CtxEnablePeerAccess can be called on a context for dev\n    and peerDev.\n\nfunc (device Device) ComputeCapability() (major, minor int)\n    Returns the compute capability of the device.\n\nfunc (dev Device) Name() string\n    Gets the name of the device.\n\nfunc (dev Device) Properties() DevProp\n    Returns the device's properties.\n\nfunc (device Device) TotalMem() int64\n    Returns the total amount of memory available on the device in bytes.\n\ntype DeviceAttribute int\n\nconst (\n    MAX_THREADS_PER_BLOCK            DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK            // Maximum number of threads per block\n    MAX_BLOCK_DIM_X                  DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X                  // Maximum block dimension X\n    MAX_BLOCK_DIM_Y                  DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y                  // Maximum block dimension Y\n    MAX_BLOCK_DIM_Z                  DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z                  // Maximum block dimension Z\n    MAX_GRID_DIM_X                   DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X                   // Maximum grid dimension X\n    MAX_GRID_DIM_Y                   DeviceAttribute = 
C.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y                   // Maximum grid dimension Y\n    MAX_GRID_DIM_Z                   DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z                   // Maximum grid dimension Z\n    MAX_SHARED_MEMORY_PER_BLOCK      DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK      // Maximum shared memory available per block in bytes\n    TOTAL_CONSTANT_MEMORY            DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY            // Memory available on device for __constant__ variables in a CUDA C kernel in bytes\n    WARP_SIZE                        DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_WARP_SIZE                        // Warp size in threads\n    MAX_PITCH                        DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_PITCH                        // Maximum pitch in bytes allowed by memory copies\n    MAX_REGISTERS_PER_BLOCK          DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK          // Maximum number of 32-bit registers available per block\n    CLOCK_RATE                       DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CLOCK_RATE                       // Peak clock frequency in kilohertz\n    TEXTURE_ALIGNMENT                DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT                // Alignment requirement for textures\n    MULTIPROCESSOR_COUNT             DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT             // Number of multiprocessors on device\n    KERNEL_EXEC_TIMEOUT              DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT              // Specifies whether there is a run time limit on kernels\n    INTEGRATED                       DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_INTEGRATED                       // Device is integrated with host memory\n    CAN_MAP_HOST_MEMORY              DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY              // Device can map host memory into CUDA address space\n    COMPUTE_MODE 
                    DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_COMPUTE_MODE                     // Compute mode (See ::CUcomputemode for details)\n    MAXIMUM_TEXTURE1D_WIDTH          DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH          // Maximum 1D texture width\n    MAXIMUM_TEXTURE2D_WIDTH          DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH          // Maximum 2D texture width\n    MAXIMUM_TEXTURE2D_HEIGHT         DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT         // Maximum 2D texture height\n    MAXIMUM_TEXTURE3D_WIDTH          DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH          // Maximum 3D texture width\n    MAXIMUM_TEXTURE3D_HEIGHT         DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT         // Maximum 3D texture height\n    MAXIMUM_TEXTURE3D_DEPTH          DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH          // Maximum 3D texture depth\n    MAXIMUM_TEXTURE2D_LAYERED_WIDTH  DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH  // Maximum 2D layered texture width\n    MAXIMUM_TEXTURE2D_LAYERED_HEIGHT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT // Maximum 2D layered texture height\n    MAXIMUM_TEXTURE2D_LAYERED_LAYERS DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS // Maximum layers in a 2D layered texture\n    SURFACE_ALIGNMENT                DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT                // Alignment requirement for surfaces\n    CONCURRENT_KERNELS               DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS               // Device can possibly execute multiple kernels concurrently\n    ECC_ENABLED                      DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_ECC_ENABLED                      // Device has ECC support enabled\n    PCI_BUS_ID                       DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_PCI_BUS_ID                   
    // PCI bus ID of the device\n    PCI_DEVICE_ID                    DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID                    // PCI device ID of the device\n    TCC_DRIVER                       DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TCC_DRIVER                       // Device is using TCC driver model\n    MEMORY_CLOCK_RATE                DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE                // Peak memory clock frequency in kilohertz\n    GLOBAL_MEMORY_BUS_WIDTH          DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH          // Global memory bus width in bits\n    L2_CACHE_SIZE                    DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE                    // Size of L2 cache in bytes\n    MAX_THREADS_PER_MULTIPROCESSOR   DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR   // Maximum resident threads per multiprocessor\n    ASYNC_ENGINE_COUNT               DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT               // Number of asynchronous engines\n    UNIFIED_ADDRESSING               DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING               // Device shares a unified address space with the host \n    MAXIMUM_TEXTURE1D_LAYERED_WIDTH  DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH  // Maximum 1D layered texture width\n    MAXIMUM_TEXTURE1D_LAYERED_LAYERS DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS // Maximum layers in a 1D layered texture\n)\n\ntype DevicePtr uintptr\n\nfunc MemAlloc(bytes int64) DevicePtr\n    Allocates a number of bytes of device memory.\n\nfunc (ptr DevicePtr) Bytes() (bytes int64)\n    Returns the size of the allocation (by MemAlloc) that contains the input\n    pointer ptr.\n\nfunc (ptr *DevicePtr) Free()\n    Frees device memory allocated by MemAlloc(). Overwrites the pointer with\n    NULL. 
It is safe to double-free.\n\nfunc (ptr DevicePtr) GetAddressRange() (bytes int64, base DevicePtr)\n    Returns the base address and size of the allocation (by MemAlloc) that\n    contains the input pointer ptr.\n\nfunc (ptr DevicePtr) MemoryType() MemoryType\n    Returns the physical memory type that ptr addresses.\n\nfunc (p DevicePtr) String() string\n\ntype Dim3 struct {\n    X, Y, Z int\n}\n\ntype Function uintptr\n    Represents a CUDA CUfunction, a reference to a function within a module.\n\nfunc ModuleGetFunction(module Module, name string) Function\n    Returns a Function handle.\n\nfunc (f Function) GetAttribute(attrib FunctionAttribute) int\n\ntype FunctionAttribute int\n\nconst (\n    FUNC_A_MAX_THREADS_PER_BLOCK FunctionAttribute = C.CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK // The maximum number of threads per block, beyond which a launch of the function would fail.\n    FUNC_A_SHARED_SIZE_BYTES     FunctionAttribute = C.CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES     // The size in bytes of statically-allocated shared memory required by this function. \n    FUNC_A_CONST_SIZE_BYTES      FunctionAttribute = C.CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES      // The size in bytes of user-allocated constant memory required by this function.\n    FUNC_A_LOCAL_SIZE_BYTES      FunctionAttribute = C.CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES      // The size in bytes of local memory used by each thread of this function.\n    FUNC_A_NUM_REGS              FunctionAttribute = C.CU_FUNC_ATTRIBUTE_NUM_REGS              // The number of registers used by each thread of this function.\n    FUNC_A_PTX_VERSION           FunctionAttribute = C.CU_FUNC_ATTRIBUTE_PTX_VERSION           // The PTX virtual architecture version for which the function was compiled. 
\n    FUNC_A_BINARY_VERSION        FunctionAttribute = C.CU_FUNC_ATTRIBUTE_BINARY_VERSION        // The binary architecture version for which the function was compiled.\n)\n\ntype MemHostRegisterFlag int\n\nconst (\n    // Memory is pinned in all CUDA contexts.\n    MEMHOSTREGISTER_PORTABLE MemHostRegisterFlag = C.CU_MEMHOSTREGISTER_PORTABLE\n    // Maps the allocation in CUDA address space. TODO(a): cuMemHostGetDevicePointer()\n    MEMHOSTREGISTER_DEVICEMAP MemHostRegisterFlag = C.CU_MEMHOSTREGISTER_DEVICEMAP\n)\n    Flag for MemHostRegister\n\ntype MemoryType uint\n    Physical memory type of device pointer.\n\nconst (\n    MemoryTypeHost    MemoryType = C.CU_MEMORYTYPE_HOST\n    MemoryTypeDevice  MemoryType = C.CU_MEMORYTYPE_DEVICE\n    MemoryTypeArray   MemoryType = C.CU_MEMORYTYPE_ARRAY\n    MemoryTypeUnified MemoryType = C.CU_MEMORYTYPE_UNIFIED\n)\n\nfunc PointerGetAttributeMemoryType(ptr DevicePtr) (t MemoryType, err Result)\n    Returns the physical memory type that ptr addresses.\n\nfunc (t MemoryType) String() string\n\ntype Module uintptr\n    Represents a CUDA CUmodule, a reference to executable device code.\n\nfunc ModuleLoad(fname string) Module\n    Loads a compute module from file\n\nfunc ModuleLoadData(image string) Module\n    Loads a compute module from string\n\nfunc (m Module) GetFunction(name string) Function\n    Returns a Function handle.\n\ntype Result int\n    CUDA error status. CUDA error statuses are not returned by functions but\n    checked and passed to panic() when not successful. 
If desired, they can\n    be caught by recover().\n\nconst (\n    SUCCESS                              Result = C.CUDA_SUCCESS\n    ERROR_INVALID_VALUE                  Result = C.CUDA_ERROR_INVALID_VALUE\n    ERROR_OUT_OF_MEMORY                  Result = C.CUDA_ERROR_OUT_OF_MEMORY\n    ERROR_NOT_INITIALIZED                Result = C.CUDA_ERROR_NOT_INITIALIZED\n    ERROR_DEINITIALIZED                  Result = C.CUDA_ERROR_DEINITIALIZED\n    ERROR_PROFILER_DISABLED              Result = C.CUDA_ERROR_PROFILER_DISABLED\n    ERROR_PROFILER_NOT_INITIALIZED       Result = C.CUDA_ERROR_PROFILER_NOT_INITIALIZED\n    ERROR_PROFILER_ALREADY_STARTED       Result = C.CUDA_ERROR_PROFILER_ALREADY_STARTED\n    ERROR_PROFILER_ALREADY_STOPPED       Result = C.CUDA_ERROR_PROFILER_ALREADY_STOPPED\n    ERROR_NO_DEVICE                      Result = C.CUDA_ERROR_NO_DEVICE\n    ERROR_INVALID_DEVICE                 Result = C.CUDA_ERROR_INVALID_DEVICE\n    ERROR_INVALID_IMAGE                  Result = C.CUDA_ERROR_INVALID_IMAGE\n    ERROR_INVALID_CONTEXT                Result = C.CUDA_ERROR_INVALID_CONTEXT\n    ERROR_CONTEXT_ALREADY_CURRENT        Result = C.CUDA_ERROR_CONTEXT_ALREADY_CURRENT\n    ERROR_MAP_FAILED                     Result = C.CUDA_ERROR_MAP_FAILED\n    ERROR_UNMAP_FAILED                   Result = C.CUDA_ERROR_UNMAP_FAILED\n    ERROR_ARRAY_IS_MAPPED                Result = C.CUDA_ERROR_ARRAY_IS_MAPPED\n    ERROR_ALREADY_MAPPED                 Result = C.CUDA_ERROR_ALREADY_MAPPED\n    ERROR_NO_BINARY_FOR_GPU              Result = C.CUDA_ERROR_NO_BINARY_FOR_GPU\n    ERROR_ALREADY_ACQUIRED               Result = C.CUDA_ERROR_ALREADY_ACQUIRED\n    ERROR_NOT_MAPPED                     Result = C.CUDA_ERROR_NOT_MAPPED\n    ERROR_NOT_MAPPED_AS_ARRAY            Result = C.CUDA_ERROR_NOT_MAPPED_AS_ARRAY\n    ERROR_NOT_MAPPED_AS_POINTER          Result = C.CUDA_ERROR_NOT_MAPPED_AS_POINTER\n    ERROR_ECC_UNCORRECTABLE              Result = C.CUDA_ERROR_ECC_UNCORRECTABLE\n    
ERROR_UNSUPPORTED_LIMIT              Result = C.CUDA_ERROR_UNSUPPORTED_LIMIT\n    ERROR_CONTEXT_ALREADY_IN_USE         Result = C.CUDA_ERROR_CONTEXT_ALREADY_IN_USE\n    ERROR_INVALID_SOURCE                 Result = C.CUDA_ERROR_INVALID_SOURCE\n    ERROR_FILE_NOT_FOUND                 Result = C.CUDA_ERROR_FILE_NOT_FOUND\n    ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND Result = C.CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND\n    ERROR_SHARED_OBJECT_INIT_FAILED      Result = C.CUDA_ERROR_SHARED_OBJECT_INIT_FAILED\n    ERROR_OPERATING_SYSTEM               Result = C.CUDA_ERROR_OPERATING_SYSTEM\n    ERROR_INVALID_HANDLE                 Result = C.CUDA_ERROR_INVALID_HANDLE\n    ERROR_NOT_FOUND                      Result = C.CUDA_ERROR_NOT_FOUND\n    ERROR_NOT_READY                      Result = C.CUDA_ERROR_NOT_READY\n    ERROR_LAUNCH_FAILED                  Result = C.CUDA_ERROR_LAUNCH_FAILED\n    ERROR_LAUNCH_OUT_OF_RESOURCES        Result = C.CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES\n    ERROR_LAUNCH_TIMEOUT                 Result = C.CUDA_ERROR_LAUNCH_TIMEOUT\n    ERROR_LAUNCH_INCOMPATIBLE_TEXTURING  Result = C.CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING\n    ERROR_PEER_ACCESS_ALREADY_ENABLED    Result = C.CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED\n    ERROR_PEER_ACCESS_NOT_ENABLED        Result = C.CUDA_ERROR_PEER_ACCESS_NOT_ENABLED\n    ERROR_PRIMARY_CONTEXT_ACTIVE         Result = C.CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE\n    ERROR_CONTEXT_IS_DESTROYED           Result = C.CUDA_ERROR_CONTEXT_IS_DESTROYED\n    ERROR_ASSERT                         Result = C.CUDA_ERROR_ASSERT\n    ERROR_TOO_MANY_PEERS                 Result = C.CUDA_ERROR_TOO_MANY_PEERS\n    ERROR_HOST_MEMORY_ALREADY_REGISTERED Result = C.CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED\n    ERROR_HOST_MEMORY_NOT_REGISTERED     Result = C.CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED\n    ERROR_UNKNOWN                        Result = C.CUDA_ERROR_UNKNOWN\n)\n\nfunc StreamQuery(stream Stream) Result\n    Returns Success if all operations 
have completed, ErrorNotReady\n    otherwise\n\nfunc (err Result) String() string\n    Message string for the error\n\ntype Stream uintptr\n    CUDA stream.\n\nfunc StreamCreate() Stream\n    Creates an asynchronous stream\n\nfunc (stream *Stream) Destroy()\n    Destroys the asynchronous stream\n\nfunc (stream Stream) Query() Result\n    Returns Success if all operations have completed, ErrorNotReady\n    otherwise\n\nfunc (stream Stream) Synchronize()\n    Blocks until the stream has completed.\n\n\n"
  },
  {
    "path": "cuda/cgoflags.go",
    "content": "package cuda\n\n// This file provides CGO flags.\n\nimport \"C\"\n\n//#cgo LDFLAGS:-lcudart\n//\n////default location:\n//#cgo LDFLAGS:-L/usr/local/cuda/lib64 -L/usr/local/cuda/lib\n//#cgo CFLAGS: -I/usr/local/cuda/include/\n//\n////default location if not properly symlinked:\n//#cgo LDFLAGS:-L/usr/local/cuda-6.0/lib64 -L/usr/local/cuda-6.0/lib\n//#cgo LDFLAGS:-L/usr/local/cuda-5.5/lib64 -L/usr/local/cuda-5.5/lib\n//#cgo LDFLAGS:-L/usr/local/cuda-5.0/lib64 -L/usr/local/cuda-5.0/lib\n//#cgo CFLAGS: -I/usr/local/cuda-6.0/include/\n//#cgo CFLAGS: -I/usr/local/cuda-5.5/include/\n//#cgo CFLAGS: -I/usr/local/cuda-5.0/include/\n//\n////arch linux:\n//#cgo LDFLAGS:-L/opt/cuda/lib64 -L/opt/cuda/lib\n//#cgo CFLAGS: -I/opt/cuda/include\n//\n////WINDOWS:\n//#cgo windows LDFLAGS:-LC:/cuda/v5.0/lib/x64 -LC:/cuda/v5.5/lib/x64 -LC:/cuda/v6.0/lib/x64\n//#cgo windows CFLAGS: -IC:/cuda/v5.0/include -IC:/cuda/v5.5/include -IC:/cuda/v6.0/include\nimport \"C\"\n"
  },
  {
    "path": "cuda/device.go",
    "content": "package cuda\n\n//#include <cuda_runtime.h>\n//#include <cuda.h>\nimport \"C\"\n\nimport (\n\t\"github.com/barnex/cuda5/cu\"\n)\n\n// Reset the current GPU device.\nfunc DeviceReset() {\n\terr := cu.Result(C.cudaDeviceReset())\n\tif err != cu.SUCCESS {\n\t\tpanic(err)\n\t}\n}\n\n// Set preference for more cache or shared memory.\nfunc DeviceSetCacheConfig(cacheConfig FuncCache) {\n\terr := cu.Result(C.cudaDeviceSetCacheConfig(uint32(cacheConfig)))\n\tif err != cu.SUCCESS {\n\t\tpanic(err)\n\t}\n}\n\n// Cache preference option.\ntype FuncCache int\n\nconst (\n\tFUNC_CACHE_PREFER_NONE   FuncCache = C.CU_FUNC_CACHE_PREFER_NONE\n\tFUNC_CACHE_PREFER_SHARED FuncCache = C.CU_FUNC_CACHE_PREFER_SHARED\n\tFUNC_CACHE_PREFER_L1     FuncCache = C.CU_FUNC_CACHE_PREFER_L1\n\tFUNC_CACHE_PREFER_EQUAL  FuncCache = C.CU_FUNC_CACHE_PREFER_EQUAL\n)\n"
  },
  {
    "path": "cufft/Makefile",
    "content": "all: 6g gccgo doc\n\n6g:\n\tgo install -v\n\tgo tool vet *.go\n\tgofmt -w *.go\n\nGCCGO=gccgo -gccgoflags '-static-libgcc -O3'\n\ngccgo:\n\tgo build -v -compiler $(GCCGO)\n\ntest: 6gtest gccgotest\n\n6gtest: \n\tgo test\n\ngccgotest: \n\tgo test -compiler $(GCCGO)\n\nbench: 6gbench gccgobench\n\n6gbench:\n\tgo test -bench=.\n\ngccgobench:\n\tgo test -bench=. -compiler $(GCCGO)\n\nclean:\n\tgo clean\n\ndoc:\n\tgodoc github.com/barnex/cuda5/cufft > README\n"
  },
  {
    "path": "cufft/README",
    "content": "PACKAGE DOCUMENTATION\n\npackage cufft\n    import \"github.com/barnex/cuda5/cufft\"\n\n    Go bindings for the CUDA CUFFT API.\n\n\nCONSTANTS\n\nconst (\n    FORWARD = -1 // Forward FFT\n    INVERSE = 1  // Inverse FFT\n)\n\n\nTYPES\n\ntype CompatibilityMode int\n    CUFFT compatibility mode\n\nconst (\n    COMPATIBILITY_NATIVE          CompatibilityMode = C.CUFFT_COMPATIBILITY_NATIVE\n    COMPATIBILITY_FFTW_PADDING    CompatibilityMode = C.CUFFT_COMPATIBILITY_FFTW_PADDING\n    COMPATIBILITY_FFTW_ASYMMETRIC CompatibilityMode = C.CUFFT_COMPATIBILITY_FFTW_ASYMMETRIC\n    COMPATIBILITY_FFTW_ALL        CompatibilityMode = C.CUFFT_COMPATIBILITY_FFTW_ALL\n)\n\n\nfunc (t CompatibilityMode) String() string\n\n\ntype Handle uintptr\n    FFT plan handle, reference type to a plan\n\n\nfunc Plan1d(nx int, typ Type, batch int) Handle\n    1D FFT plan\n\n\nfunc Plan2d(nx, ny int, typ Type) Handle\n    2D FFT plan\n\n\nfunc Plan3d(nx, ny, nz int, typ Type) Handle\n    3D FFT plan\n\n\nfunc PlanMany(n []int, inembed []int, istride int, oembed []int, ostride int, typ Type, batch int) Handle\n    1D,2D or 3D FFT plan\n\n\nfunc (plan *Handle) Destroy()\n    Destroys the plan.\n\nfunc (plan Handle) ExecC2C(idata, odata cu.DevicePtr, direction int)\n    Execute Complex-to-Complex plan\n\nfunc (plan Handle) ExecC2R(idata, odata cu.DevicePtr)\n    Execute Complex-to-Real plan\n\nfunc (plan Handle) ExecD2Z(idata, odata cu.DevicePtr)\n    Execute Double Real-to-Complex plan\n\nfunc (plan Handle) ExecR2C(idata, odata cu.DevicePtr)\n    Execute Real-to-Complex plan\n\nfunc (plan Handle) ExecZ2D(idata, odata cu.DevicePtr)\n    Execute Double Complex-to-Real plan\n\nfunc (plan Handle) ExecZ2Z(idata, odata cu.DevicePtr, direction int)\n    Execute Double Complex-to-Complex plan\n\nfunc (plan Handle) SetCompatibilityMode(mode CompatibilityMode)\n    Sets the FFTW compatibility mode\n\nfunc (plan Handle) SetStream(stream cu.Stream)\n    Sets the cuda stream for this 
plan\n\n\ntype Result int\n    FFT result\n\nconst (\n    SUCCESS        Result = C.CUFFT_SUCCESS\n    INVALID_PLAN   Result = C.CUFFT_INVALID_PLAN\n    ALLOC_FAILED   Result = C.CUFFT_ALLOC_FAILED\n    INVALID_TYPE   Result = C.CUFFT_INVALID_TYPE\n    INVALID_VALUE  Result = C.CUFFT_INVALID_VALUE\n    INTERNAL_ERROR Result = C.CUFFT_INTERNAL_ERROR\n    EXEC_FAILED    Result = C.CUFFT_EXEC_FAILED\n    SETUP_FAILED   Result = C.CUFFT_SETUP_FAILED\n    INVALID_SIZE   Result = C.CUFFT_INVALID_SIZE\n    UNALIGNED_DATA Result = C.CUFFT_UNALIGNED_DATA\n)\n    FFT result value\n\n\nfunc (r Result) String() string\n\n\ntype Type int\n    FFT type\n\nconst (\n    R2C Type = C.CUFFT_R2C // Real to Complex (interleaved)\n    C2R Type = C.CUFFT_C2R // Complex (interleaved) to Real\n    C2C Type = C.CUFFT_C2C // Complex to Complex, interleaved\n    D2Z Type = C.CUFFT_D2Z // Double to Double-Complex\n    Z2D Type = C.CUFFT_Z2D // Double-Complex to Double\n    Z2Z Type = C.CUFFT_Z2Z // Double-Complex to Double-Complex\n)\n\n\nfunc (t Type) String() string\n\n\n\n"
  },
  {
    "path": "cufft/cgoflags.go",
    "content": "package cufft\n\n// This file provides CGO flags to find CUDA libraries and headers.\n\n//#cgo LDFLAGS:-lcufft\n//\n////default location:\n//#cgo LDFLAGS:-L/usr/local/cuda/lib64 -L/usr/local/cuda/lib\n//#cgo CFLAGS: -I/usr/local/cuda/include/\n//\n////default location if not properly symlinked:\n//#cgo LDFLAGS:-L/usr/local/cuda-6.0/lib64 -L/usr/local/cuda-6.0/lib\n//#cgo LDFLAGS:-L/usr/local/cuda-5.5/lib64 -L/usr/local/cuda-5.5/lib\n//#cgo LDFLAGS:-L/usr/local/cuda-5.0/lib64 -L/usr/local/cuda-5.0/lib\n//#cgo CFLAGS: -I/usr/local/cuda-6.0/include/\n//#cgo CFLAGS: -I/usr/local/cuda-5.5/include/\n//#cgo CFLAGS: -I/usr/local/cuda-5.0/include/\n//\n////arch linux:\n//#cgo LDFLAGS:-L/opt/cuda/lib64 -L/opt/cuda/lib\n//#cgo CFLAGS: -I/opt/cuda/include\n//\n////WINDOWS:\n//#cgo windows LDFLAGS:-LC:/cuda/v5.0/lib/x64 -LC:/cuda/v5.5/lib/x64 -LC:/cuda/v6.0/lib/x64\n//#cgo windows CFLAGS: -IC:/cuda/v5.0/include -IC:/cuda/v5.5/include -IC:/cuda/v6.0/include -w\nimport \"C\"\n"
  },
  {
    "path": "cufft/doc.go",
    "content": "// Go bindings for the CUDA CUFFT API.\npackage cufft\n"
  },
  {
    "path": "cufft/fft_test.go",
    "content": "package cufft\n\nimport (\n\t\"fmt\"\n\t\"github.com/barnex/cuda5/cu\"\n\t\"unsafe\"\n)\n\nfunc ExampleFFT1D() {\n\tN := 8\n\n\thostIn := make([]float32, N)\n\thostIn[0] = 1\n\n\tdevIn := cu.MemAlloc(int64(len(hostIn)) * cu.SIZEOF_FLOAT32)\n\tdefer cu.MemFree(&devIn)\n\tcu.MemcpyHtoD(devIn, unsafe.Pointer(&hostIn[0]), devIn.Bytes())\n\n\thostOut := make([]complex64, N/2+1)\n\tdevOut := cu.MemAlloc(int64(len(hostOut)) * cu.SIZEOF_COMPLEX64)\n\tdefer cu.MemFree(&devOut)\n\n\tplan := Plan1d(N, R2C, 1)\n\tdefer plan.Destroy()\n\tplan.ExecR2C(devIn, devOut)\n\n\tcu.MemcpyDtoH(unsafe.Pointer(&hostOut[0]), devOut, devOut.Bytes())\n\n\tfmt.Println(\"hostIn:\", hostIn)\n\tfmt.Println(\"hostOut:\", hostOut)\n\n\t// Output:\n\t// hostIn: [1 0 0 0 0 0 0 0]\n\t// hostOut: [(1+0i) (+1+0i) (+1+0i) (+1-0i) (+1+0i)]\n}\n"
  },
  {
    "path": "cufft/init_test.go",
    "content": "package cufft\n\nimport (\n\t\"fmt\"\n\t\"github.com/barnex/cuda5/cu\"\n)\n\n// needed for all other tests.\nfunc init() {\n\tcu.Init(0)\n\tctx := cu.CtxCreate(cu.CTX_SCHED_AUTO, 0)\n\tcu.CtxSetCurrent(ctx)\n\tfmt.Println(\"Created CUDA context\")\n}\n"
  },
  {
    "path": "cufft/mode.go",
    "content": "package cufft\n\n//#include <cufft.h>\nimport \"C\"\n\nimport (\n\t\"fmt\"\n)\n\n// CUFFT compatibility mode\ntype CompatibilityMode int\n\nconst (\n\tCOMPATIBILITY_NATIVE          CompatibilityMode = C.CUFFT_COMPATIBILITY_NATIVE\n\tCOMPATIBILITY_FFTW_PADDING    CompatibilityMode = C.CUFFT_COMPATIBILITY_FFTW_PADDING\n\tCOMPATIBILITY_FFTW_ASYMMETRIC CompatibilityMode = C.CUFFT_COMPATIBILITY_FFTW_ASYMMETRIC\n\tCOMPATIBILITY_FFTW_ALL        CompatibilityMode = C.CUFFT_COMPATIBILITY_FFTW_ALL\n)\n\nfunc (t CompatibilityMode) String() string {\n\tif str, ok := compatibilityModeString[t]; ok {\n\t\treturn str\n\t}\n\treturn fmt.Sprint(\"CUFFT Compatibility mode with unknown number:\", int(t))\n}\n\nvar compatibilityModeString map[CompatibilityMode]string = map[CompatibilityMode]string{\n\tCOMPATIBILITY_NATIVE:          \"CUFFT_COMPATIBILITY_NATIVE\",\n\tCOMPATIBILITY_FFTW_PADDING:    \"CUFFT_COMPATIBILITY_FFTW_PADDING\",\n\tCOMPATIBILITY_FFTW_ASYMMETRIC: \"CUFFT_COMPATIBILITY_FFTW_ASYMMETRIC\",\n\tCOMPATIBILITY_FFTW_ALL:        \"CUFFT_COMPATIBILITY_FFTW_ALL\"}\n"
  },
  {
    "path": "cufft/plan.go",
    "content": "// Copyright 2011 Arne Vansteenkiste (barnex@gmail.com).  All rights reserved.\n// Use of this source code is governed by a freeBSD\n// license that can be found in the LICENSE.txt file.\n\npackage cufft\n\n//#include <cufft.h>\nimport \"C\"\n\nimport (\n\t\"github.com/barnex/cuda5/cu\"\n\t\"unsafe\"\n)\n\n// FFT plan handle, reference type to a plan\ntype Handle uintptr\n\n// 1D FFT plan\nfunc Plan1d(nx int, typ Type, batch int) Handle {\n\tvar handle C.cufftHandle\n\terr := Result(C.cufftPlan1d(\n\t\t&handle,\n\t\tC.int(nx),\n\t\tC.cufftType(typ),\n\t\tC.int(batch)))\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n\treturn Handle(handle)\n}\n\n// 2D FFT plan\nfunc Plan2d(nx, ny int, typ Type) Handle {\n\tvar handle C.cufftHandle\n\terr := Result(C.cufftPlan2d(\n\t\t&handle,\n\t\tC.int(nx),\n\t\tC.int(ny),\n\t\tC.cufftType(typ)))\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n\treturn Handle(handle)\n}\n\n// 3D FFT plan\nfunc Plan3d(nx, ny, nz int, typ Type) Handle {\n\tvar handle C.cufftHandle\n\terr := Result(C.cufftPlan3d(\n\t\t&handle,\n\t\tC.int(nx),\n\t\tC.int(ny),\n\t\tC.int(nz),\n\t\tC.cufftType(typ)))\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n\treturn Handle(handle)\n}\n\n//cufftPlanMany(\n//    cufftHandle *plan, int rank, int *n, int *inembed,\n//    int istride, int idist, int *onembed, int ostride,\n//    int odist, cufftType type, int batch );\n\n// 1D,2D or 3D FFT plan\nfunc PlanMany(n []int, inembed []int, istride int, oembed []int, ostride int, typ Type, batch int) Handle {\n\tvar handle C.cufftHandle\n\n\tNULL := (*C.int)(unsafe.Pointer(uintptr(0)))\n\n\tinembedptr := NULL\n\tidist := 0\n\tif inembed != nil {\n\t\tinembedptr = (*C.int)(unsafe.Pointer(&inembed[0]))\n\t\tidist = inembed[0]\n\t}\n\n\toembedptr := NULL\n\todist := 0\n\tif oembed != nil {\n\t\toembedptr = (*C.int)(unsafe.Pointer(&oembed[0]))\n\t\todist = oembed[0]\n\t}\n\n\terr := Result(C.cufftPlanMany(\n\t\t&handle,\n\t\tC.int(len(n)),                   // 
rank\n\t\t(*C.int)(unsafe.Pointer(&n[0])), // n\n\t\tinembedptr,\n\t\tC.int(istride),\n\t\tC.int(idist),\n\t\toembedptr,\n\t\tC.int(ostride),\n\t\tC.int(odist),\n\t\tC.cufftType(typ),\n\t\tC.int(batch)))\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n\treturn Handle(handle)\n}\n\n// Execute Complex-to-Complex plan\nfunc (plan Handle) ExecC2C(idata, odata cu.DevicePtr, direction int) {\n\terr := Result(C.cufftExecC2C(\n\t\tC.cufftHandle(plan),\n\t\t(*C.cufftComplex)(unsafe.Pointer(uintptr(idata))),\n\t\t(*C.cufftComplex)(unsafe.Pointer(uintptr(odata))),\n\t\tC.int(direction)))\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n}\n\n// Execute Real-to-Complex plan\nfunc (plan Handle) ExecR2C(idata, odata cu.DevicePtr) {\n\terr := Result(C.cufftExecR2C(\n\t\tC.cufftHandle(plan),\n\t\t(*C.cufftReal)(unsafe.Pointer(uintptr(idata))),\n\t\t(*C.cufftComplex)(unsafe.Pointer(uintptr(odata)))))\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n}\n\n// Execute Complex-to-Real plan\nfunc (plan Handle) ExecC2R(idata, odata cu.DevicePtr) {\n\terr := Result(C.cufftExecC2R(\n\t\tC.cufftHandle(plan),\n\t\t(*C.cufftComplex)(unsafe.Pointer(uintptr(idata))),\n\t\t(*C.cufftReal)(unsafe.Pointer(uintptr(odata)))))\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n}\n\n// Execute Double Complex-to-Complex plan\nfunc (plan Handle) ExecZ2Z(idata, odata cu.DevicePtr, direction int) {\n\terr := Result(C.cufftExecZ2Z(\n\t\tC.cufftHandle(plan),\n\t\t(*C.cufftDoubleComplex)(unsafe.Pointer(uintptr(idata))),\n\t\t(*C.cufftDoubleComplex)(unsafe.Pointer(uintptr(odata))),\n\t\tC.int(direction)))\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n}\n\n// Execute Double Real-to-Complex plan\nfunc (plan Handle) ExecD2Z(idata, odata cu.DevicePtr) {\n\terr := Result(C.cufftExecD2Z(\n\t\tC.cufftHandle(plan),\n\t\t(*C.cufftDoubleReal)(unsafe.Pointer(uintptr(idata))),\n\t\t(*C.cufftDoubleComplex)(unsafe.Pointer(uintptr(odata)))))\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n}\n\n// Execute Double Complex-to-Real plan\nfunc (plan 
Handle) ExecZ2D(idata, odata cu.DevicePtr) {\n\terr := Result(C.cufftExecZ2D(\n\t\tC.cufftHandle(plan),\n\t\t(*C.cufftDoubleComplex)(unsafe.Pointer(uintptr(idata))),\n\t\t(*C.cufftDoubleReal)(unsafe.Pointer(uintptr(odata)))))\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n}\n\n// Destroys the plan.\nfunc (plan *Handle) Destroy() {\n\terr := Result(C.cufftDestroy(C.cufftHandle(*plan)))\n\t*plan = 0 // make sure plan is not used anymore\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n}\n\n// Sets the cuda stream for this plan\nfunc (plan Handle) SetStream(stream cu.Stream) {\n\terr := Result(C.cufftSetStream(\n\t\tC.cufftHandle(plan),\n\t\tC.cudaStream_t(unsafe.Pointer(uintptr(stream)))))\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n}\n\n// Sets the FFTW compatibility mode\nfunc (plan Handle) SetCompatibilityMode(mode CompatibilityMode) {\n\terr := Result(C.cufftSetCompatibilityMode(\n\t\tC.cufftHandle(plan),\n\t\tC.cufftCompatibility(mode)))\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n}\n"
  },
  {
    "path": "cufft/result.go",
    "content": "package cufft\n\n//#include <cufft.h>\nimport \"C\"\n\nimport (\n\t\"fmt\"\n)\n\n// FFT result\ntype Result int\n\n// FFT result value\nconst (\n\tSUCCESS                   Result = C.CUFFT_SUCCESS\n\tINVALID_PLAN              Result = C.CUFFT_INVALID_PLAN\n\tALLOC_FAILED              Result = C.CUFFT_ALLOC_FAILED\n\tINVALID_TYPE              Result = C.CUFFT_INVALID_TYPE\n\tINVALID_VALUE             Result = C.CUFFT_INVALID_VALUE\n\tINTERNAL_ERROR            Result = C.CUFFT_INTERNAL_ERROR\n\tEXEC_FAILED               Result = C.CUFFT_EXEC_FAILED\n\tSETUP_FAILED              Result = C.CUFFT_SETUP_FAILED\n\tINVALID_SIZE              Result = C.CUFFT_INVALID_SIZE\n\tUNALIGNED_DATA            Result = C.CUFFT_UNALIGNED_DATA\n\tINCOMPLETE_PARAMETER_LIST Result = 0xA // cuda6 values copied to avoid dependency on cuda6/cufft.h\n\tINVALID_DEVICE            Result = 0xB\n\tPARSE_ERROR               Result = 0xC\n\tNO_WORKSPACE              Result = 0xD\n)\n\nfunc (r Result) String() string {\n\tif str, ok := resultString[r]; ok {\n\t\treturn str\n\t}\n\treturn fmt.Sprint(\"CUFFT Result with unknown error number:\", int(r))\n}\n\nvar resultString map[Result]string = map[Result]string{\n\tSUCCESS:                   \"CUFFT_SUCCESS\",\n\tINVALID_PLAN:              \"CUFFT_INVALID_PLAN\",\n\tALLOC_FAILED:              \"CUFFT_ALLOC_FAILED\",\n\tINVALID_TYPE:              \"CUFFT_INVALID_TYPE\",\n\tINVALID_VALUE:             \"CUFFT_INVALID_VALUE\",\n\tINTERNAL_ERROR:            \"CUFFT_INTERNAL_ERROR\",\n\tEXEC_FAILED:               \"CUFFT_EXEC_FAILED\",\n\tSETUP_FAILED:              \"CUFFT_SETUP_FAILED\",\n\tINVALID_SIZE:              \"CUFFT_INVALID_SIZE\",\n\tUNALIGNED_DATA:            \"CUFFT_UNALIGNED_DATA\",\n\tINCOMPLETE_PARAMETER_LIST: \"CUFFT_INCOMPLETE_PARAMETER_LIST\",\n\tINVALID_DEVICE:            \"CUFFT_INVALID_DEVICE\",\n\tPARSE_ERROR:               \"CUFFT_PARSE_ERROR\",\n\tNO_WORKSPACE:              \"CUFFT_NO_WORKSPACE\"}\n"
  },
  {
    "path": "cufft/type.go",
    "content": "package cufft\n\n//#include <cufft.h>\nimport \"C\"\n\nimport (\n\t\"fmt\"\n)\n\n// FFT type\ntype Type int\n\nconst (\n\tR2C Type = C.CUFFT_R2C // Real to Complex (interleaved)\n\tC2R Type = C.CUFFT_C2R // Complex (interleaved) to Real\n\tC2C Type = C.CUFFT_C2C // Complex to Complex, interleaved\n\tD2Z Type = C.CUFFT_D2Z // Double to Double-Complex\n\tZ2D Type = C.CUFFT_Z2D // Double-Complex to Double\n\tZ2Z Type = C.CUFFT_Z2Z // Double-Complex to Double-Complex\n)\n\nconst (\n\tFORWARD = -1 // Forward FFT\n\tINVERSE = 1  // Inverse FFT\n)\n\nfunc (t Type) String() string {\n\tif str, ok := typeString[t]; ok {\n\t\treturn str\n\t}\n\treturn fmt.Sprint(\"CUFFT Type with unknown number:\", int(t))\n}\n\nvar typeString map[Type]string = map[Type]string{\n\tR2C: \"CUFFT_R2C\",\n\tC2R: \"CUFFT_C2R\",\n\tC2C: \"CUFFT_C2C\",\n\tD2Z: \"CUFFT_D2Z\",\n\tZ2D: \"CUFFT_Z2D\",\n\tZ2Z: \"CUFFT_Z2Z\"}\n"
  },
  {
    "path": "curand/Makefile",
    "content": "all: 6g gccgo doc\n\n6g:\n\tgo install -v\n\tgo tool vet *.go\n\tgofmt -w *.go\n\nGCCGO=gccgo -gccgoflags '-static-libgcc -O3'\n\ngccgo:\n\tgo build -v -compiler $(GCCGO)\n\ntest: 6gtest gccgotest\n\n6gtest: \n\tgo test\n\ngccgotest: \n\tgo test -compiler $(GCCGO)\n\nbench: 6gbench gccgobench\n\n6gbench:\n\tgo test -bench=.\n\ngccgobench:\n\tgo test -bench=. -compiler $(GCCGO)\n\nclean:\n\tgo clean\n\ndoc:\n\tgodoc github.com/barnex/cuda5/curand > README\n"
  },
  {
    "path": "curand/README",
    "content": "PACKAGE DOCUMENTATION\n\npackage curand\n    import \"github.com/barnex/cuda5/curand\"\n\n\n\nTYPES\n\ntype Generator uintptr\n\n\nfunc CreateGenerator(rngType RngType) Generator\n\n\nfunc (g Generator) GenerateNormal(output uintptr, n int64, mean, stddev float32)\n\nfunc (g Generator) SetSeed(seed int64)\n\n\ntype RngType int\n\nconst (\n    PSEUDO_DEFAULT          RngType = C.CURAND_RNG_PSEUDO_DEFAULT          // Default pseudorandom generator\n    PSEUDO_XORWOW           RngType = C.CURAND_RNG_PSEUDO_XORWOW           // XORWOW pseudorandom generator\n    QUASI_DEFAULT           RngType = C.CURAND_RNG_QUASI_DEFAULT           // Default quasirandom generator\n    QUASI_SOBOL32           RngType = C.CURAND_RNG_QUASI_SOBOL32           // Sobol32 quasirandom generator\n    QUASI_SCRAMBLED_SOBOL32 RngType = C.CURAND_RNG_QUASI_SCRAMBLED_SOBOL32 // Scrambled Sobol32 quasirandom generator\n    QUASI_SOBOL64           RngType = C.CURAND_RNG_QUASI_SOBOL64           // Sobol64 quasirandom generator\n    QUASI_SCRAMBLED_SOBOL64 RngType = C.CURAND_RNG_QUASI_SCRAMBLED_SOBOL64 // Scrambled Sobol64 quasirandom generator\n)\n\n\n\ntype Status int\n\nconst (\n    SUCCESS               Status = C.CURAND_STATUS_SUCCESS               // No errors\n    VERSION_MISMATCH      Status = C.CURAND_STATUS_VERSION_MISMATCH      // Header file and linked library version do not match\n    NOT_INITIALIZED       Status = C.CURAND_STATUS_NOT_INITIALIZED       // Generator not initialized\n    ALLOCATION_FAILED     Status = C.CURAND_STATUS_ALLOCATION_FAILED     // Memory allocation failed\n    TYPE_ERROR            Status = C.CURAND_STATUS_TYPE_ERROR            // Generator is wrong type\n    OUT_OF_RANGE          Status = C.CURAND_STATUS_OUT_OF_RANGE          // Argument out of range\n    LENGTH_NOT_MULTIPLE   Status = C.CURAND_STATUS_LENGTH_NOT_MULTIPLE   // Length requested is not a multple of dimension\n    LAUNCH_FAILURE        Status = C.CURAND_STATUS_LAUNCH_FAILURE        // 
Kernel launch failure\n    PREEXISTING_FAILURE   Status = C.CURAND_STATUS_PREEXISTING_FAILURE   // Preexisting failure on library entry\n    INITIALIZATION_FAILED Status = C.CURAND_STATUS_INITIALIZATION_FAILED // Initialization of CUDA failed\n    ARCH_MISMATCH         Status = C.CURAND_STATUS_ARCH_MISMATCH         // Architecture mismatch, GPU does not support requested feature\n    INTERNAL_ERROR        Status = C.CURAND_STATUS_INTERNAL_ERROR        // Internal library error\n)\n\n\n\n\n"
  },
  {
    "path": "curand/cgoflags.go",
    "content": "package curand\n\n// This file provides CGO flags to find CUDA libraries and headers.\n\n//#cgo LDFLAGS:-lcurand\n//\n////default location:\n//#cgo LDFLAGS:-L/usr/local/cuda/lib64 -L/usr/local/cuda/lib\n//#cgo CFLAGS: -I/usr/local/cuda/include/\n//\n////default location if not properly symlinked:\n//#cgo LDFLAGS:-L/usr/local/cuda-6.0/lib64 -L/usr/local/cuda-6.0/lib\n//#cgo LDFLAGS:-L/usr/local/cuda-5.5/lib64 -L/usr/local/cuda-5.5/lib\n//#cgo LDFLAGS:-L/usr/local/cuda-5.0/lib64 -L/usr/local/cuda-5.0/lib\n//#cgo CFLAGS: -I/usr/local/cuda-6.0/include/\n//#cgo CFLAGS: -I/usr/local/cuda-5.5/include/\n//#cgo CFLAGS: -I/usr/local/cuda-5.0/include/\n//\n////arch linux:\n//#cgo LDFLAGS:-L/opt/cuda/lib64 -L/opt/cuda/lib\n//#cgo CFLAGS: -I/opt/cuda/include\n//\n////WINDOWS:\n//#cgo windows LDFLAGS:-LC:/cuda/v5.0/lib/x64 -LC:/cuda/v5.5/lib/x64 -LC:/cuda/v6.0/lib/x64\n//#cgo windows CFLAGS: -IC:/cuda/v5.0/include -IC:/cuda/v5.5/include -IC:/cuda/v6.0/include -w\nimport \"C\"\n"
  },
  {
    "path": "curand/generator.go",
    "content": "package curand\n\n//#include <curand.h>\nimport \"C\"\n\nimport (\n\t\"unsafe\"\n)\n\ntype Generator uintptr\n\ntype RngType int\n\nconst (\n\tPSEUDO_DEFAULT          RngType = C.CURAND_RNG_PSEUDO_DEFAULT          // Default pseudorandom generator\n\tPSEUDO_XORWOW           RngType = C.CURAND_RNG_PSEUDO_XORWOW           // XORWOW pseudorandom generator\n\tQUASI_DEFAULT           RngType = C.CURAND_RNG_QUASI_DEFAULT           // Default quasirandom generator\n\tQUASI_SOBOL32           RngType = C.CURAND_RNG_QUASI_SOBOL32           // Sobol32 quasirandom generator\n\tQUASI_SCRAMBLED_SOBOL32 RngType = C.CURAND_RNG_QUASI_SCRAMBLED_SOBOL32 // Scrambled Sobol32 quasirandom generator\n\tQUASI_SOBOL64           RngType = C.CURAND_RNG_QUASI_SOBOL64           // Sobol64 quasirandom generator\n\tQUASI_SCRAMBLED_SOBOL64 RngType = C.CURAND_RNG_QUASI_SCRAMBLED_SOBOL64 // Scrambled Sobol64 quasirandom generator\n)\n\nfunc CreateGenerator(rngType RngType) Generator {\n\tvar rng C.curandGenerator_t\n\terr := Status(C.curandCreateGenerator(&rng, C.curandRngType_t(rngType)))\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n\treturn Generator(uintptr(unsafe.Pointer(rng))) // cgo\n}\n\nfunc (g Generator) GenerateNormal(output uintptr, n int64, mean, stddev float32) {\n\terr := Status(C.curandGenerateNormal(\n\t\tC.curandGenerator_t(unsafe.Pointer(uintptr(g))),\n\t\t(*C.float)(unsafe.Pointer(output)),\n\t\tC.size_t(n),\n\t\tC.float(mean),\n\t\tC.float(stddev)))\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n}\n\nfunc (g Generator) SetSeed(seed int64) {\n\terr := Status(C.curandSetPseudoRandomGeneratorSeed(C.curandGenerator_t(unsafe.Pointer(uintptr(g))), _Ctype_ulonglong(seed)))\n\tif err != SUCCESS {\n\t\tpanic(err)\n\t}\n}\n\n// Documentation was taken from the curand headers.\n"
  },
  {
    "path": "curand/status.go",
    "content": "package curand\n\n//#include <curand.h>\nimport \"C\"\n\nimport (\n\t\"fmt\"\n)\n\ntype Status int\n\nconst (\n\tSUCCESS               Status = C.CURAND_STATUS_SUCCESS               // No errors\n\tVERSION_MISMATCH      Status = C.CURAND_STATUS_VERSION_MISMATCH      // Header file and linked library version do not match\n\tNOT_INITIALIZED       Status = C.CURAND_STATUS_NOT_INITIALIZED       // Generator not initialized\n\tALLOCATION_FAILED     Status = C.CURAND_STATUS_ALLOCATION_FAILED     // Memory allocation failed\n\tTYPE_ERROR            Status = C.CURAND_STATUS_TYPE_ERROR            // Generator is wrong type\n\tOUT_OF_RANGE          Status = C.CURAND_STATUS_OUT_OF_RANGE          // Argument out of range\n\tLENGTH_NOT_MULTIPLE   Status = C.CURAND_STATUS_LENGTH_NOT_MULTIPLE   // Length requested is not a multple of dimension\n\tLAUNCH_FAILURE        Status = C.CURAND_STATUS_LAUNCH_FAILURE        // Kernel launch failure\n\tPREEXISTING_FAILURE   Status = C.CURAND_STATUS_PREEXISTING_FAILURE   // Preexisting failure on library entry\n\tINITIALIZATION_FAILED Status = C.CURAND_STATUS_INITIALIZATION_FAILED // Initialization of CUDA failed\n\tARCH_MISMATCH         Status = C.CURAND_STATUS_ARCH_MISMATCH         // Architecture mismatch, GPU does not support requested feature\n\tINTERNAL_ERROR        Status = C.CURAND_STATUS_INTERNAL_ERROR        // Internal library error\n)\n\nfunc (s Status) String() string {\n\tif str, ok := statusStr[s]; ok {\n\t\treturn str\n\t} else {\n\t\treturn fmt.Sprint(\"CURAND ERROR NUMBER \", int(s))\n\t}\n}\n\nvar statusStr = map[Status]string{\n\tSUCCESS:               \"CURAND_STATUS_SUCCESS\",\n\tVERSION_MISMATCH:      \"CURAND_STATUS_VERSION_MISMATCH\",\n\tNOT_INITIALIZED:       \"CURAND_STATUS_NOT_INITIALIZED\",\n\tALLOCATION_FAILED:     \"CURAND_STATUS_ALLOCATION_FAILED\",\n\tTYPE_ERROR:            \"CURAND_STATUS_TYPE_ERROR\",\n\tOUT_OF_RANGE:          \"CURAND_STATUS_OUT_OF_RANGE\",\n\tLENGTH_NOT_MULTIPLE:   
\"CURAND_STATUS_LENGTH_NOT_MULTIPLE\",\n\tLAUNCH_FAILURE:        \"CURAND_STATUS_LAUNCH_FAILURE\",\n\tPREEXISTING_FAILURE:   \"CURAND_STATUS_PREEXISTING_FAILURE\",\n\tINITIALIZATION_FAILED: \"CURAND_STATUS_INITIALIZATION_FAILED\",\n\tARCH_MISMATCH:         \"CURAND_STATUS_ARCH_MISMATCH\",\n\tINTERNAL_ERROR:        \"CURAND_STATUS_INTERNAL_ERROR\",\n}\n\n// Documentation was taken from the curand headers.\n"
  },
  {
    "path": "doc.go",
    "content": "/*\n\tGo bindings for nVIDIA CUDA 5.\n\tThis package compiles with both gc and gccgo.\n*/\npackage cuda5\n\n// Dummy imports so that\n// \tgo get github.com/barnex/cuda5\n// will install everything.\nimport (\n\t_ \"github.com/barnex/cuda5/cu\"\n\t_ \"github.com/barnex/cuda5/cufft\"\n\t_ \"github.com/barnex/cuda5/safe\"\n)\n"
  },
  {
    "path": "safe/Makefile",
    "content": "all: 6g doc #gccgo\n\n6g:\n\tgo install -v\n\tgo tool vet *.go\n\tgofmt -w *.go\n\nGCCGO=gccgo -gccgoflags '-static-libgcc -O3'\n\ngccgo:\n\tgo build -v -compiler $(GCCGO)\n\ntest: 6gtest gccgotest\n\n6gtest: \n\tgo test\n\ngccgotest: \n\tgo test -compiler $(GCCGO)\n\nbench: 6gbench gccgobench\n\n6gbench:\n\tgo test -bench=.\n\ngccgobench:\n\tgo test -bench=. -compiler $(GCCGO)\n\nclean:\n\tgo clean\n\tgo-optview -c -w *.go\n\tgofmt -w *.go\n\nopt:\n\tgo-optview -w *.go\n\tgofmt -w *.go\n\ndoc:\n\tgodoc github.com/barnex/cuda5/safe > README\n"
  },
  {
    "path": "safe/README",
    "content": "PACKAGE\n\npackage safe\n    import \"github.com/barnex/cuda5/safe\"\n\n    Safe and more idiomatic wrappers for the low-level CUDA functions.\n\nFUNCTIONS\n\nfunc InitCuda()\n\n\nTYPES\n\ntype Complex128s struct {\n    // contains filtered or unexported fields\n}\n    Slice of complex128's on the GPU.\n\nfunc MakeComplex128s(len_ int) Complex128s\n    Make a slice of complex128's on the GPU. Initialized to zero.\n\nfunc (s *Complex128s) Cap() int\n    Slice capacity.\n\nfunc (dst Complex128s) CopyDtoD(src Complex128s)\n    Copy src on host to dst on host.\n\nfunc (dst Complex128s) CopyDtoDAsync(src Complex128s, stream cu.Stream)\n    Copy src on host to dst on host, asynchronously.\n\nfunc (src Complex128s) CopyDtoH(dst []complex128)\n    Copy src form device to dst on host.\n\nfunc (src Complex128s) CopyDtoHAsync(dst []complex128, stream cu.Stream)\n    Copy src form device to dst on host, asynchronously.\n\nfunc (dst Complex128s) CopyHtoD(src []complex128)\n    Copy src from host to dst on the device.\n\nfunc (dst Complex128s) CopyHtoDAsync(src []complex128, stream cu.Stream)\n    Copy src from host to dst on the device, asynchronously.\n\nfunc (s Complex128s) Float() Float64s\n    Re-interpret the array as float numbers, in interleaved format.\n    Underlying storage is shared.\n\nfunc (s *Complex128s) Free()\n    Free the underlying storage. To be used with care. Free() should only be\n    called on a slice created by MakeXXX(), not on a slice created by\n    x.Slice(). 
Freeing a slice invalidates all other slices referring to it.\n\nfunc (src Complex128s) Host() []complex128\n    Returns a fresh copy on host.\n\nfunc (s *Complex128s) Len() int\n    Slice length (number of elements).\n\nfunc (s *Complex128s) Pointer() cu.DevicePtr\n    Pointer to the first element.\n\nfunc (s Complex128s) Slice(start, stop int) Complex128s\n    Return a slice from start (inclusive) to stop (exclusive), sharing the\n    underlying storage with the original slice. Slices obtained in this way\n    should not be Free()'d\n\nfunc (s *Complex128s) UnsafeSet(pointer unsafe.Pointer, length, capacity int)\n    Manually set the pointer, length and capacity. Side-steps the security\n    mechanisms, use with caution.\n\ntype Complex64s struct {\n    // contains filtered or unexported fields\n}\n    Slice of complex64's on the GPU.\n\nfunc MakeComplex64s(len_ int) Complex64s\n    Make a slice of complex64's on the GPU. Initialized to zero.\n\nfunc (s *Complex64s) Cap() int\n    Slice capacity.\n\nfunc (dst Complex64s) CopyDtoD(src Complex64s)\n    Copy src on host to dst on host.\n\nfunc (dst Complex64s) CopyDtoDAsync(src Complex64s, stream cu.Stream)\n    Copy src on host to dst on host, asynchronously.\n\nfunc (src Complex64s) CopyDtoH(dst []complex64)\n    Copy src form device to dst on host.\n\nfunc (src Complex64s) CopyDtoHAsync(dst []complex64, stream cu.Stream)\n    Copy src form device to dst on host, asynchronously.\n\nfunc (dst Complex64s) CopyHtoD(src []complex64)\n    Copy src from host to dst on the device.\n\nfunc (dst Complex64s) CopyHtoDAsync(src []complex64, stream cu.Stream)\n    Copy src from host to dst on the device, asynchronously.\n\nfunc (s Complex64s) Float() Float32s\n    Re-interpret the array as float numbers, in interleaved format.\n    Underlying storage is shared.\n\nfunc (s *Complex64s) Free()\n    Free the underlying storage. To be used with care. 
Free() should only be\n    called on a slice created by MakeXXX(), not on a slice created by\n    x.Slice(). Freeing a slice invalidates all other slices referring to it.\n\nfunc (src Complex64s) Host() []complex64\n    Returns a fresh copy on host.\n\nfunc (s *Complex64s) Len() int\n    Slice length (number of elements).\n\nfunc (s *Complex64s) Pointer() cu.DevicePtr\n    Pointer to the first element.\n\nfunc (s Complex64s) Slice(start, stop int) Complex64s\n    Return a slice from start (inclusive) to stop (exclusive), sharing the\n    underlying storage with the original slice. Slices obtained in this way\n    should not be Free()'d\n\nfunc (s *Complex64s) UnsafeSet(pointer unsafe.Pointer, length, capacity int)\n    Manually set the pointer, length and capacity. Side-steps the security\n    mechanisms, use with caution.\n\ntype FFT1DC2RPlan struct {\n    // contains filtered or unexported fields\n}\n    1D single-precission complex-to-real FFT plan.\n\nfunc FFT1DC2R(size, batch int) FFT1DC2RPlan\n    1D single-precission complex-to-real FFT plan.\n\nfunc (p FFT1DC2RPlan) Destroy()\n    Releases all resources associated with the FFT plan.\n\nfunc (p FFT1DC2RPlan) Exec(src Complex64s, dst Float32s)\n    Execute the FFT plan. Synchronized.\n\nfunc (p FFT1DC2RPlan) InputLen() int\n    Required length of the output array.\n\nfunc (p FFT1DC2RPlan) OutputLen() int\n    Required length of the input array.\n\nfunc (p FFT1DC2RPlan) SetStream(stream cu.Stream)\n    Associates a CUDA stream with the FFT plan. 
If a stream is set,\n    plan.Stream().Synchronize() can to be called to wait for the execution\n    to finish.\n\nfunc (s FFT1DC2RPlan) Size() int\n    Returns the logical size of the FFT: the number of elements (real or\n    complex) it transforms.\n\nfunc (p FFT1DC2RPlan) Stream() cu.Stream\n    Returns the CUDA stream associated with the FFT plan.\n\ntype FFT1DR2CPlan struct {\n    // contains filtered or unexported fields\n}\n    1D single-precission real-to-complex FFT plan.\n\nfunc FFT1DR2C(size, batch int) FFT1DR2CPlan\n    1D single-precission real-to-complex FFT plan.\n\nfunc (p FFT1DR2CPlan) Destroy()\n    Releases all resources associated with the FFT plan.\n\nfunc (p FFT1DR2CPlan) Exec(src Float32s, dst Complex64s)\n    Execute the FFT plan. Synchronized.\n\nfunc (p FFT1DR2CPlan) InputLen() int\n    Required length of the input array.\n\nfunc (p FFT1DR2CPlan) OutputLen() int\n    Required length of the output array.\n\nfunc (p FFT1DR2CPlan) SetStream(stream cu.Stream)\n    Associates a CUDA stream with the FFT plan. If a stream is set,\n    plan.Stream().Synchronize() can to be called to wait for the execution\n    to finish.\n\nfunc (s FFT1DR2CPlan) Size() int\n    Returns the logical size of the FFT: the number of elements (real or\n    complex) it transforms.\n\nfunc (p FFT1DR2CPlan) Stream() cu.Stream\n    Returns the CUDA stream associated with the FFT plan.\n\ntype FFT3DC2RPlan struct {\n    // contains filtered or unexported fields\n}\n    3D single-precission real-to-complex FFT plan.\n\nfunc FFT3DC2R(Nx, Ny, Nz int) FFT3DC2RPlan\n    3D single-precission real-to-complex FFT plan.\n\nfunc (p FFT3DC2RPlan) Destroy()\n    Releases all resources associated with the FFT plan.\n\nfunc (p FFT3DC2RPlan) Exec(src Complex64s, dst Float32s)\n    Execute the FFT plan. 
src and dst are 3D arrays stored 1D arrays.\n\nfunc (p FFT3DC2RPlan) InputLen() int\n    Required length of the (1D) input array.\n\nfunc (p FFT3DC2RPlan) InputSize() (Nx, Ny, Nz int)\n    3D size of the input array.\n\nfunc (p FFT3DC2RPlan) OutputLen() int\n    Required length of the (1D) output array.\n\nfunc (p FFT3DC2RPlan) OutputSize() (Nx, Ny, Nz int)\n    3D size of the output array.\n\nfunc (p FFT3DC2RPlan) SetStream(stream cu.Stream)\n    Associates a CUDA stream with the FFT plan. If a stream is set,\n    plan.Stream().Synchronize() can to be called to wait for the execution\n    to finish.\n\nfunc (s FFT3DC2RPlan) Size() (Nx, Ny, Nz int)\n    Returns the logical size of the FFT: the number of elements (real or\n    complex) it transforms.\n\nfunc (p FFT3DC2RPlan) Stream() cu.Stream\n    Returns the CUDA stream associated with the FFT plan.\n\ntype FFT3DD2ZPlan struct {\n    // contains filtered or unexported fields\n}\n    3D single-precission real-to-complex FFT plan.\n\nfunc FFT3DD2Z(Nx, Ny, Nz int) FFT3DD2ZPlan\n    3D single-precission real-to-complex FFT plan.\n\nfunc (p FFT3DD2ZPlan) Destroy()\n    Releases all resources associated with the FFT plan.\n\nfunc (p FFT3DD2ZPlan) Exec(src Float64s, dst Complex128s)\n    Execute the FFT plan. Synchronized. src and dst are 3D arrays stored 1D\n    arrays.\n\nfunc (p FFT3DD2ZPlan) InputLen() int\n    Required length of the (1D) input array.\n\nfunc (p FFT3DD2ZPlan) InputSize() (Nx, Ny, Nz int)\n    3D size of the input array.\n\nfunc (p FFT3DD2ZPlan) OutputLen() int\n    Required length of the (1D) output array.\n\nfunc (p FFT3DD2ZPlan) OutputSize() (Nx, Ny, Nz int)\n    3D size of the output array.\n\nfunc (p FFT3DD2ZPlan) SetStream(stream cu.Stream)\n    Associates a CUDA stream with the FFT plan. 
If a stream is set,\n    plan.Stream().Synchronize() can to be called to wait for the execution\n    to finish.\n\nfunc (s FFT3DD2ZPlan) Size() (Nx, Ny, Nz int)\n    Returns the logical size of the FFT: the number of elements (real or\n    complex) it transforms.\n\nfunc (p FFT3DD2ZPlan) Stream() cu.Stream\n    Returns the CUDA stream associated with the FFT plan.\n\ntype FFT3DR2CPlan struct {\n    // contains filtered or unexported fields\n}\n    3D single-precission real-to-complex FFT plan.\n\nfunc FFT3DR2C(Nx, Ny, Nz int) FFT3DR2CPlan\n    3D single-precission real-to-complex FFT plan.\n\nfunc (p FFT3DR2CPlan) Destroy()\n    Releases all resources associated with the FFT plan.\n\nfunc (p FFT3DR2CPlan) Exec(src Float32s, dst Complex64s)\n    Execute the FFT plan. Synchronized. src and dst are 3D arrays stored 1D\n    arrays.\n\nfunc (p FFT3DR2CPlan) InputLen() int\n    Required length of the (1D) input array.\n\nfunc (p FFT3DR2CPlan) InputSize() (Nx, Ny, Nz int)\n    3D size of the input array.\n\nfunc (p FFT3DR2CPlan) OutputLen() int\n    Required length of the (1D) output array.\n\nfunc (p FFT3DR2CPlan) OutputSize() (Nx, Ny, Nz int)\n    3D size of the output array.\n\nfunc (p FFT3DR2CPlan) SetStream(stream cu.Stream)\n    Associates a CUDA stream with the FFT plan. 
If a stream is set,\n    plan.Stream().Synchronize() can to be called to wait for the execution\n    to finish.\n\nfunc (s FFT3DR2CPlan) Size() (Nx, Ny, Nz int)\n    Returns the logical size of the FFT: the number of elements (real or\n    complex) it transforms.\n\nfunc (p FFT3DR2CPlan) Stream() cu.Stream\n    Returns the CUDA stream associated with the FFT plan.\n\ntype FFT3DZ2DPlan struct {\n    // contains filtered or unexported fields\n}\n    3D single-precission real-to-complex FFT plan.\n\nfunc FFT3DZ2D(Nx, Ny, Nz int) FFT3DZ2DPlan\n    3D single-precission real-to-complex FFT plan.\n\nfunc (p FFT3DZ2DPlan) Destroy()\n    Releases all resources associated with the FFT plan.\n\nfunc (p FFT3DZ2DPlan) Exec(src Complex128s, dst Float64s)\n    Execute the FFT plan. Synchronized. src and dst are 3D arrays stored 1D\n    arrays.\n\nfunc (p FFT3DZ2DPlan) InputLen() int\n    Required length of the (1D) input array.\n\nfunc (p FFT3DZ2DPlan) InputSize() (Nx, Ny, Nz int)\n    3D size of the input array.\n\nfunc (p FFT3DZ2DPlan) OutputLen() int\n    Required length of the (1D) output array.\n\nfunc (p FFT3DZ2DPlan) OutputSize() (Nx, Ny, Nz int)\n    3D size of the output array.\n\nfunc (p FFT3DZ2DPlan) SetStream(stream cu.Stream)\n    Associates a CUDA stream with the FFT plan. If a stream is set,\n    plan.Stream().Synchronize() can to be called to wait for the execution\n    to finish.\n\nfunc (s FFT3DZ2DPlan) Size() (Nx, Ny, Nz int)\n    Returns the logical size of the FFT: the number of elements (real or\n    complex) it transforms.\n\nfunc (p FFT3DZ2DPlan) Stream() cu.Stream\n    Returns the CUDA stream associated with the FFT plan.\n\ntype Float32s struct {\n    // contains filtered or unexported fields\n}\n    Slice of float32's on the GPU.\n\nfunc MakeFloat32s(len_ int) Float32s\n    Make a slice of float32's on the GPU. 
Initialized to zero.\n\nfunc (s *Float32s) Cap() int\n    Slice capacity.\n\nfunc (s Float32s) Complex() Complex64s\n    Re-interpret the array as complex numbers, in interleaved format.\n    Underlying storage is shared.\n\nfunc (dst Float32s) CopyDtoD(src Float32s)\n    Copy src on host to dst on host.\n\nfunc (dst Float32s) CopyDtoDAsync(src Float32s, stream cu.Stream)\n    Copy src on host to dst on host, asynchronously.\n\nfunc (src Float32s) CopyDtoH(dst []float32)\n    Copy src form device to dst on host.\n\nfunc (src Float32s) CopyDtoHAsync(dst []float32, stream cu.Stream)\n    Copy src form device to dst on host, asynchronously.\n\nfunc (dst Float32s) CopyHtoD(src []float32)\n    Copy src from host to dst on the device.\n\nfunc (dst Float32s) CopyHtoDAsync(src []float32, stream cu.Stream)\n    Copy src from host to dst on the device, asynchronously.\n\nfunc (s *Float32s) Free()\n    Free the underlying storage. To be used with care. Free() should only be\n    called on a slice created by MakeXXX(), not on a slice created by\n    x.Slice(). Freeing a slice invalidates all other slices referring to it.\n\nfunc (src Float32s) Host() []float32\n    Returns a fresh copy on host.\n\nfunc (s *Float32s) Len() int\n    Slice length (number of elements).\n\nfunc (s Float32s) Memset(value float32)\n    Set the entire slice to this value.\n\nfunc (s Float32s) MemsetAsync(value float32, stream cu.Stream)\n    Set the entire slice to this value, asynchronously.\n\nfunc (s *Float32s) Pointer() cu.DevicePtr\n    Pointer to the first element.\n\nfunc (s Float32s) Slice(start, stop int) Float32s\n    Return a slice from start (inclusive) to stop (exclusive), sharing the\n    underlying storage with the original slice. Slices obtained in this way\n    should not be Free()'d\n\nfunc (s *Float32s) UnsafeSet(pointer unsafe.Pointer, length, capacity int)\n    Manually set the pointer, length and capacity. 
Side-steps the security\n    mechanisms, use with caution.\n\ntype Float64s struct {\n    // contains filtered or unexported fields\n}\n    Slice of float64's on the GPU.\n\nfunc MakeFloat64s(len_ int) Float64s\n    Make a slice of float64's on the GPU. Initialized to zero.\n\nfunc (s *Float64s) Cap() int\n    Slice capacity.\n\nfunc (s Float64s) Complex() Complex128s\n    Re-interpret the array as complex numbers, in interleaved format.\n    Underlying storage is shared.\n\nfunc (dst Float64s) CopyDtoD(src Float64s)\n    Copy src on host to dst on host.\n\nfunc (dst Float64s) CopyDtoDAsync(src Float64s, stream cu.Stream)\n    Copy src on host to dst on host, asynchronously.\n\nfunc (src Float64s) CopyDtoH(dst []float64)\n    Copy src form device to dst on host.\n\nfunc (src Float64s) CopyDtoHAsync(dst []float64, stream cu.Stream)\n    Copy src form device to dst on host, asynchronously.\n\nfunc (dst Float64s) CopyHtoD(src []float64)\n    Copy src from host to dst on the device.\n\nfunc (dst Float64s) CopyHtoDAsync(src []float64, stream cu.Stream)\n    Copy src from host to dst on the device, asynchronously.\n\nfunc (s *Float64s) Free()\n    Free the underlying storage. To be used with care. Free() should only be\n    called on a slice created by MakeXXX(), not on a slice created by\n    x.Slice(). Freeing a slice invalidates all other slices referring to it.\n\nfunc (src Float64s) Host() []float64\n    Returns a fresh copy on host.\n\nfunc (s *Float64s) Len() int\n    Slice length (number of elements).\n\nfunc (s *Float64s) Pointer() cu.DevicePtr\n    Pointer to the first element.\n\nfunc (s Float64s) Slice(start, stop int) Float64s\n    Return a slice from start (inclusive) to stop (exclusive), sharing the\n    underlying storage with the original slice. Slices obtained in this way\n    should not be Free()'d\n\nfunc (s *Float64s) UnsafeSet(pointer unsafe.Pointer, length, capacity int)\n    Manually set the pointer, length and capacity. 
Side-steps the security\n    mechanisms, use with caution.\n\n\n"
  },
  {
    "path": "safe/complex128s.go",
    "content": "package safe\n\nimport (\n\t\"github.com/barnex/cuda5/cu\"\n\t\"unsafe\"\n)\n\n// Slice of complex128's on the GPU.\ntype Complex128s struct{ slice }\n\n// Make a slice of complex128's on the GPU.\n// Initialized to zero.\nfunc MakeComplex128s(len_ int) Complex128s {\n\treturn Complex128s{makeslice(len_, cu.SIZEOF_COMPLEX128)}\n}\n\n// Return a slice from start (inclusive) to stop (exclusive),\n// sharing the underlying storage with the original slice.\n// Slices obtained in this way should not be Free()'d\nfunc (s Complex128s) Slice(start, stop int) Complex128s {\n\treturn Complex128s{s.slice.slice(start, stop, cu.SIZEOF_COMPLEX128)}\n}\n\n// Copy src from host to dst on the device.\nfunc (dst Complex128s) CopyHtoD(src []complex128) {\n\tdst.copyHtoD(unsafe.Pointer(&src[0]), len(src), cu.SIZEOF_COMPLEX128)\n}\n\n// Copy src form device to dst on host.\nfunc (src Complex128s) CopyDtoH(dst []complex128) {\n\tsrc.copyDtoH(unsafe.Pointer(&dst[0]), len(dst), cu.SIZEOF_COMPLEX128)\n}\n\n// Copy src on host to dst on host.\nfunc (dst Complex128s) CopyDtoD(src Complex128s) {\n\tdst.copyDtoD(&src.slice, cu.SIZEOF_COMPLEX128)\n}\n\n// Copy src from host to dst on the device, asynchronously.\nfunc (dst Complex128s) CopyHtoDAsync(src []complex128, stream cu.Stream) {\n\tdst.copyHtoDAsync(unsafe.Pointer(&src[0]), len(src), cu.SIZEOF_COMPLEX128, stream)\n}\n\n// Copy src form device to dst on host, asynchronously.\nfunc (src Complex128s) CopyDtoHAsync(dst []complex128, stream cu.Stream) {\n\tsrc.copyDtoHAsync(unsafe.Pointer(&dst[0]), len(dst), cu.SIZEOF_COMPLEX128, stream)\n}\n\n// Copy src on host to dst on host, asynchronously.\nfunc (dst Complex128s) CopyDtoDAsync(src Complex128s, stream cu.Stream) {\n\tdst.copyDtoDAsync(&src.slice, cu.SIZEOF_COMPLEX128, stream)\n}\n\n// Returns a fresh copy on host.\nfunc (src Complex128s) Host() []complex128 {\n\tcpy := make([]complex128, src.Len())\n\tsrc.CopyDtoH(cpy)\n\treturn cpy\n}\n\n// Re-interpret the array as float 
numbers,\n// in interleaved format. Underlying storage\n// is shared.\nfunc (s Complex128s) Float() Float64s {\n\treturn Float64s{slice{s.ptr_, s.len_ * 2, s.cap_ * 2}}\n}\n"
  },
  {
    "path": "safe/complex128s_test.go",
    "content": "package safe\n\nimport (\n\t\"reflect\"\n\t\"testing\"\n)\n\nfunc TestComplex128sSlice(test *testing.T) {\n\tInitCuda()\n\n\ta := MakeComplex128s(100)\n\tdefer a.Free()\n\n\tif !reflect.DeepEqual(a.Host(), make([]complex128, 100)) {\n\t\ttest.Error(a.Host())\n\t}\n\n\tb := make([]complex128, 100)\n\n\tif a.Len() != len(b) {\n\t\ttest.Error(\"len:\", a.Len(), \"!=\", cap(b))\n\t}\n\tif a.Cap() != cap(b) {\n\t\ttest.Error(\"cap:\", a.Cap(), \"!=\", cap(b))\n\t}\n\n\tc := a.Slice(20, 30)\n\td := b[20:30]\n\n\tif c.Len() != len(d) {\n\t\ttest.Error(\"sliced len:\", c.Len(), \"!=\", cap(d))\n\t}\n\tif c.Cap() != cap(d) {\n\t\ttest.Error(\"sliced cap:\", c.Cap(), \"!=\", cap(d))\n\t}\n\n\te := a.Slice(0, 50)\n\tf := b[0:50]\n\n\tif e.Len() != len(f) {\n\t\ttest.Error(\"sliced len:\", e.Len(), \"!=\", cap(f))\n\t}\n\tif e.Cap() != cap(f) {\n\t\ttest.Error(\"sliced cap:\", e.Cap(), \"!=\", cap(f))\n\t}\n}\n\nfunc TestComplex128sPanic1(test *testing.T) {\n\tInitCuda()\n\n\tdefer func() {\n\t\terr := recover()\n\t\ttest.Log(\"recovered:\", err)\n\t\tif err == nil {\n\t\t\ttest.Fail()\n\t\t}\n\t}()\n\n\ta := MakeComplex128s(100)\n\tdefer a.Free()\n\n\ta.Slice(-1, 10)\n}\n\nfunc TestComplex128sPanic2(test *testing.T) {\n\tInitCuda()\n\n\tdefer func() {\n\t\terr := recover()\n\t\ttest.Log(\"recovered:\", err)\n\t\tif err == nil {\n\t\t\ttest.Fail()\n\t\t}\n\t}()\n\n\ta := MakeComplex128s(100)\n\tdefer a.Free()\n\n\ta.Slice(0, 101)\n}\n\nfunc TestComplex128sCopy(test *testing.T) {\n\tInitCuda()\n\n\ta := make([]complex128, 100)\n\n\tb := MakeComplex128s(100)\n\tdefer b.Free()\n\n\tc := MakeComplex128s(100)\n\tdefer c.Free()\n\n\td := make([]complex128, 200)\n\n\tfor i := range a {\n\t\ta[i] = complex(float64(i), float64(2*i))\n\t}\n\n\tb.CopyHtoD(a)\n\n\tc.CopyDtoD(b)\n\n\tc.CopyDtoH(d[:100])\n\n\tif !reflect.DeepEqual(a, d[:100]) {\n\t\ttest.Error(d)\n\t}\n\tif !reflect.DeepEqual(d[100:], make([]complex128, 100)) {\n\t\ttest.Error(d)\n\t}\n}\n"
  },
  {
    "path": "safe/complex64s.go",
    "content": "package safe\n\nimport (\n\t\"github.com/barnex/cuda5/cu\"\n\t\"unsafe\"\n)\n\n// Slice of complex64's on the GPU.\ntype Complex64s struct{ slice }\n\n// Make a slice of complex64's on the GPU.\n// Initialized to zero.\nfunc MakeComplex64s(len_ int) Complex64s {\n\treturn Complex64s{makeslice(len_, cu.SIZEOF_COMPLEX64)}\n}\n\n// Return a slice from start (inclusive) to stop (exclusive),\n// sharing the underlying storage with the original slice.\n// Slices obtained in this way should not be Free()'d\nfunc (s Complex64s) Slice(start, stop int) Complex64s {\n\treturn Complex64s{s.slice.slice(start, stop, cu.SIZEOF_COMPLEX64)}\n}\n\n// Copy src from host to dst on the device.\nfunc (dst Complex64s) CopyHtoD(src []complex64) {\n\tdst.copyHtoD(unsafe.Pointer(&src[0]), len(src), cu.SIZEOF_COMPLEX64)\n}\n\n// Copy src form device to dst on host.\nfunc (src Complex64s) CopyDtoH(dst []complex64) {\n\tsrc.copyDtoH(unsafe.Pointer(&dst[0]), len(dst), cu.SIZEOF_COMPLEX64)\n}\n\n// Copy src on host to dst on host.\nfunc (dst Complex64s) CopyDtoD(src Complex64s) {\n\tdst.copyDtoD(&src.slice, cu.SIZEOF_COMPLEX64)\n}\n\n// Copy src from host to dst on the device, asynchronously.\nfunc (dst Complex64s) CopyHtoDAsync(src []complex64, stream cu.Stream) {\n\tdst.copyHtoDAsync(unsafe.Pointer(&src[0]), len(src), cu.SIZEOF_COMPLEX64, stream)\n}\n\n// Copy src form device to dst on host, asynchronously.\nfunc (src Complex64s) CopyDtoHAsync(dst []complex64, stream cu.Stream) {\n\tsrc.copyDtoHAsync(unsafe.Pointer(&dst[0]), len(dst), cu.SIZEOF_COMPLEX64, stream)\n}\n\n// Copy src on host to dst on host, asynchronously.\nfunc (dst Complex64s) CopyDtoDAsync(src Complex64s, stream cu.Stream) {\n\tdst.copyDtoDAsync(&src.slice, cu.SIZEOF_COMPLEX64, stream)\n}\n\n// Returns a fresh copy on host.\nfunc (src Complex64s) Host() []complex64 {\n\tcpy := make([]complex64, src.Len())\n\tsrc.CopyDtoH(cpy)\n\treturn cpy\n}\n\n// Re-interpret the array as float numbers,\n// in interleaved 
format. Underlying storage\n// is shared.\nfunc (s Complex64s) Float() Float32s {\n\treturn Float32s{slice{s.ptr_, s.len_ * 2, s.cap_ * 2}}\n}\n"
  },
  {
    "path": "safe/complex64s_test.go",
    "content": "package safe\n\nimport (\n\t\"reflect\"\n\t\"testing\"\n)\n\nfunc TestComplex64sSlice(test *testing.T) {\n\tInitCuda()\n\n\ta := MakeComplex64s(100)\n\tdefer a.Free()\n\n\tif !reflect.DeepEqual(a.Host(), make([]complex64, 100)) {\n\t\ttest.Error(a.Host())\n\t}\n\n\tb := make([]complex64, 100)\n\n\tif a.Len() != len(b) {\n\t\ttest.Error(\"len:\", a.Len(), \"!=\", cap(b))\n\t}\n\tif a.Cap() != cap(b) {\n\t\ttest.Error(\"cap:\", a.Cap(), \"!=\", cap(b))\n\t}\n\n\tc := a.Slice(20, 30)\n\td := b[20:30]\n\n\tif c.Len() != len(d) {\n\t\ttest.Error(\"sliced len:\", c.Len(), \"!=\", cap(d))\n\t}\n\tif c.Cap() != cap(d) {\n\t\ttest.Error(\"sliced cap:\", c.Cap(), \"!=\", cap(d))\n\t}\n\n\te := a.Slice(0, 50)\n\tf := b[0:50]\n\n\tif e.Len() != len(f) {\n\t\ttest.Error(\"sliced len:\", e.Len(), \"!=\", cap(f))\n\t}\n\tif e.Cap() != cap(f) {\n\t\ttest.Error(\"sliced cap:\", e.Cap(), \"!=\", cap(f))\n\t}\n}\n\nfunc TestComplex64sPanic1(test *testing.T) {\n\tInitCuda()\n\n\tdefer func() {\n\t\terr := recover()\n\t\ttest.Log(\"recovered:\", err)\n\t\tif err == nil {\n\t\t\ttest.Fail()\n\t\t}\n\t}()\n\n\ta := MakeComplex64s(100)\n\tdefer a.Free()\n\n\ta.Slice(-1, 10)\n}\n\nfunc TestComplex64sPanic2(test *testing.T) {\n\tInitCuda()\n\n\tdefer func() {\n\t\terr := recover()\n\t\ttest.Log(\"recovered:\", err)\n\t\tif err == nil {\n\t\t\ttest.Fail()\n\t\t}\n\t}()\n\n\ta := MakeComplex64s(100)\n\tdefer a.Free()\n\n\ta.Slice(0, 101)\n}\n\nfunc TestComplex64sCopy(test *testing.T) {\n\tInitCuda()\n\n\ta := make([]complex64, 100)\n\n\tb := MakeComplex64s(100)\n\tdefer b.Free()\n\n\tc := MakeComplex64s(100)\n\tdefer c.Free()\n\n\td := make([]complex64, 200)\n\n\tfor i := range a {\n\t\ta[i] = complex(float32(i), float32(2*i))\n\t}\n\n\tb.CopyHtoD(a)\n\n\tc.CopyDtoD(b)\n\n\tc.CopyDtoH(d[:100])\n\n\tif !reflect.DeepEqual(a, d[:100]) {\n\t\ttest.Error(d)\n\t}\n\tif !reflect.DeepEqual(d[100:], make([]complex64, 100)) {\n\t\ttest.Error(d)\n\t}\n}\n"
  },
  {
    "path": "safe/doc.go",
    "content": "/*\n\tSafe and more idiomatic wrappers for the low-level CUDA functions.\n*/\npackage safe\n"
  },
  {
    "path": "safe/fft1d_test.go",
    "content": "package safe\n\nimport (\n\t\"fmt\"\n)\n\nfunc ExampleFFT1DR2C() {\n\tInitCuda()\n\n\tN := 8\n\tbatch := 1\n\n\tfft := FFT1DR2C(N, batch)\n\tdefer fft.Destroy()\n\n\tinput := MakeFloat32s(N)\n\tdefer input.Free()\n\tinput.CopyHtoD([]float32{1, 0, 0, 0, 0, 0, 0, 0})\n\n\toutput := MakeComplex64s(fft.OutputLen())\n\tdefer output.Free()\n\n\tfft.Exec(input, output)\n\n\tfmt.Println(\"input:\", input.Host())\n\tfmt.Println(\"output:\", output.Host())\n\n\t// Output:\n\t// input: [1 0 0 0 0 0 0 0]\n\t// output: [(1+0i) (+1+0i) (+1+0i) (+1-0i) (+1+0i)]\n}\n\nfunc ExampleFFT1DR2C_Inplace() {\n\tInitCuda()\n\n\tN := 8\n\tbatch := 2\n\n\tfft := FFT1DR2C(N, batch)\n\tdefer fft.Destroy()\n\n\toutput := MakeComplex64s(fft.OutputLen())\n\tdefer output.Free()\n\n\tinput := output.Float().Slice(0, fft.InputLen())\n\t// input uses same layout as out-of-place transform\n\t// (CUFFT native layout)\n\tinput.CopyHtoD([]float32{1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0})\n\tfmt.Println(\"input:\", input.Host())\n\n\tfft.Exec(input, output)\n\tfmt.Println(\"output:\", output.Host())\n\n\tinverse := FFT1DC2R(N, batch)\n\tdefer inverse.Destroy()\n\tinverse.Exec(output, input)\n\tfmt.Println(\"input:\", input.Host())\n\n\t// Output:\n\t// input: [1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0]\n\t// output: [(1+0i) (+1+0i) (+1+0i) (+1-0i) (+1+0i) (+1+0i) (+1+0i) (+1+0i) (+1-0i) (+1+0i)]\n\t// input: [8 0 0 0 0 0 0 0 8 0 0 0 0 0 0 0]\n}\nfunc ExampleFFT1DC2R() {\n\tInitCuda()\n\n\tN := 8\n\tbatch := 1\n\n\tfft := FFT1DC2R(N, batch)\n\tdefer fft.Destroy()\n\n\tinput := MakeComplex64s(fft.InputLen())\n\tdefer input.Free()\n\tinput.CopyHtoD([]complex64{(1 + 0i), (+1 + 0i), (+1 + 0i), (+1 - 0i), (+1 + 0i)})\n\n\toutput := MakeFloat32s(fft.OutputLen())\n\tdefer output.Free()\n\n\tfft.Exec(input, output)\n\n\tfmt.Println(\"input:\", input.Host())\n\tfmt.Println(\"output:\", output.Host())\n\n\t// Output:\n\t// input: [(1+0i) (+1+0i) (+1+0i) (+1+0i) (+1+0i)]\n\t// output: [8 0 0 0 0 0 0 
0]\n}\n"
  },
  {
    "path": "safe/fft1dc2r.go",
    "content": "package safe\n\nimport (\n\t\"fmt\"\n\t\"github.com/barnex/cuda5/cufft\"\n)\n\n// 1D single-precission complex-to-real FFT plan.\ntype FFT1DC2RPlan struct {\n\tfftplan\n\tsize1D\n\tbatch int\n}\n\n// 1D single-precission complex-to-real FFT plan.\nfunc FFT1DC2R(size, batch int) FFT1DC2RPlan {\n\thandle := cufft.Plan1d(size, cufft.C2R, batch)\n\thandle.SetCompatibilityMode(cufft.COMPATIBILITY_NATIVE)\n\treturn FFT1DC2RPlan{fftplan{handle, 0}, size1D(size), batch}\n}\n\n// Execute the FFT plan. Synchronized.\nfunc (p FFT1DC2RPlan) Exec(src Complex64s, dst Float32s) {\n\toksrclen := p.InputLen()\n\tif src.Len() != oksrclen {\n\t\tpanic(fmt.Errorf(\"size mismatch: expecting src len %v, got %v\", oksrclen, src.Len()))\n\t}\n\tokdstlen := p.OutputLen()\n\tif dst.Len() != okdstlen {\n\t\tpanic(fmt.Errorf(\"size mismatch: expecting dst len %v, got %v\", okdstlen, dst.Len()))\n\t}\n\tp.handle.ExecC2R(src.Pointer(), dst.Pointer())\n\tp.stream.Synchronize() //!\n}\n\n// Required length of the input array.\nfunc (p FFT1DC2RPlan) OutputLen() int {\n\treturn p.batch * p.Size()\n}\n\n// Required length of the output array.\nfunc (p FFT1DC2RPlan) InputLen() int {\n\treturn p.batch * (p.Size()/2 + 1)\n}\n"
  },
  {
    "path": "safe/fft1dr2c.go",
    "content": "package safe\n\nimport (\n\t\"fmt\"\n\t\"github.com/barnex/cuda5/cufft\"\n)\n\n// 1D single-precission real-to-complex FFT plan.\ntype FFT1DR2CPlan struct {\n\tfftplan\n\tsize1D\n\tbatch int\n}\n\n// 1D single-precission real-to-complex FFT plan.\nfunc FFT1DR2C(size, batch int) FFT1DR2CPlan {\n\thandle := cufft.Plan1d(size, cufft.R2C, batch)\n\thandle.SetCompatibilityMode(cufft.COMPATIBILITY_NATIVE)\n\treturn FFT1DR2CPlan{fftplan{handle, 0}, size1D(size), batch}\n}\n\n// Execute the FFT plan. Synchronized.\nfunc (p FFT1DR2CPlan) Exec(src Float32s, dst Complex64s) {\n\toksrclen := p.InputLen()\n\tif src.Len() != oksrclen {\n\t\tpanic(fmt.Errorf(\"size mismatch: expecting src len %v, got %v\", oksrclen, src.Len()))\n\t}\n\tokdstlen := p.OutputLen()\n\tif dst.Len() != okdstlen {\n\t\tpanic(fmt.Errorf(\"size mismatch: expecting dst len %v, got %v\", okdstlen, dst.Len()))\n\t}\n\tp.handle.ExecR2C(src.Pointer(), dst.Pointer())\n\tp.stream.Synchronize() //!\n}\n\n// Required length of the input array.\nfunc (p FFT1DR2CPlan) InputLen() int {\n\treturn p.batch * p.Size()\n}\n\n// Required length of the output array.\nfunc (p FFT1DR2CPlan) OutputLen() int {\n\treturn p.batch * (p.Size()/2 + 1)\n}\n"
  },
  {
    "path": "safe/fft3d_test.go",
    "content": "package safe\n\nimport (\n\t\"fmt\"\n)\n\nfunc ExampleFFT3DR2C() {\n\tInitCuda()\n\n\tNx, Ny, Nz := 2, 4, 8\n\n\tfft := FFT3DR2C(Nx, Ny, Nz)\n\tdefer fft.Destroy()\n\n\tinput := MakeFloat32s(fft.InputLen())\n\tdefer input.Free()\n\n\tinputData := make([]float32, Nx*Ny*Nz)\n\tinputData[0*Ny*Nz] = 1\n\tinputData[1*Ny*Nz] = 1\n\tinput.CopyHtoD(inputData)\n\n\toutput := MakeComplex64s(fft.OutputLen())\n\tdefer output.Free()\n\n\tfft.Exec(input, output)\n\n\tfmt.Println(\"input:\", Reshape3DFloat32(input.Host(), Nx, Ny, Nz))\n\tOx, Oy, Oz := fft.OutputSize()\n\tfmt.Println(\"output:\", Reshape3DComplex64(output.Host(), Ox, Oy, Oz))\n\n\t// Output:\n\t// input: [[[1 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]] [[1 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]]]\n\t// output: [[[(2+0i) (+2+0i) (+2+0i) (+2-0i) (+2+0i)] [(+2+0i) (+2+0i) (+2+0i) (+2-0i) (+2+0i)] [(+2+0i) (+2+0i) (+2+0i) (+2-0i) (+2+0i)] [(+2+0i) (+2+0i) (+2+0i) (+2-0i) (+2+0i)]] [[(+0+0i) (+0+0i) (+0+0i) (+0-0i) (+0+0i)] [(+0+0i) (+0+0i) (+0+0i) (+0-0i) (+0+0i)] [(+0+0i) (+0+0i) (+0+0i) (+0-0i) (+0+0i)] [(+0+0i) (+0+0i) (+0+0i) (+0-0i) (+0+0i)]]]\n}\n\nfunc ExampleFFT3DC2R() {\n\tInitCuda()\n\n\tNx, Ny, Nz := 2, 4, 8\n\n\tfft := FFT3DC2R(Nx, Ny, Nz)\n\tdefer fft.Destroy()\n\n\tinput := MakeComplex64s(fft.InputLen())\n\tdefer input.Free()\n\n\tinputData := make([]complex64, fft.InputLen())\n\tfor i := range inputData {\n\t\tinputData[i] = 2\n\t}\n\tinput.CopyHtoD(inputData)\n\n\toutput := MakeFloat32s(fft.OutputLen())\n\tdefer output.Free()\n\n\tfft.Exec(input, output)\n\n\tIx, Iy, Iz := fft.InputSize()\n\tfmt.Println(\"input:\", Reshape3DComplex64(input.Host(), Ix, Iy, Iz))\n\tfmt.Println(\"output:\", Reshape3DFloat32(output.Host(), Nx, Ny, Nz))\n\n\t// Output:\n\t// input: [[[(2+0i) (+2+0i) (+2+0i) (+2+0i) (+2+0i)] [(+2+0i) (+2+0i) (+2+0i) (+2+0i) (+2+0i)] [(+2+0i) (+2+0i) (+2+0i) (+2+0i) (+2+0i)] [(+2+0i) (+2+0i) (+2+0i) (+2+0i) (+2+0i)]] 
[[(+2+0i) (+2+0i) (+2+0i) (+2+0i) (+2+0i)] [(+2+0i) (+2+0i) (+2+0i) (+2+0i) (+2+0i)] [(+2+0i) (+2+0i) (+2+0i) (+2+0i) (+2+0i)] [(+2+0i) (+2+0i) (+2+0i) (+2+0i) (+2+0i)]]]\n\t// output: [[[128 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]] [[0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]]]\n}\n\nfunc ExampleFFT3D() {\n\tInitCuda()\n\n\tNx, Ny, Nz := 2, 4, 8\n\n\tforward := FFT3DR2C(Nx, Ny, Nz)\n\tdefer forward.Destroy()\n\n\tinput := MakeFloat32s(forward.InputLen())\n\tdefer input.Free()\n\n\tinputData := make([]float32, forward.InputLen())\n\tinputData[5] = 1\n\tinput.CopyHtoD(inputData)\n\n\toutput := MakeComplex64s(forward.OutputLen())\n\tdefer output.Free()\n\n\tforward.Exec(input, output)\n\n\tbackward := FFT3DC2R(Nx, Ny, Nz)\n\tbackward.Exec(output, input)\n\n\tfmt.Println(\"input:\", Reshape3DFloat32(inputData, Nx, Ny, Nz))\n\tfmt.Println(\"forward+inverse:\", Reshape3DFloat32(input.Host(), Nx, Ny, Nz))\n\n\t// Output:\n\t// input: [[[0 0 0 0 0 1 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]] [[0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]]]\n\t// forward+inverse: [[[0 0 0 0 0 64 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]] [[0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]]]\n}\n\n//func ExampleFFT3D64() {\n//\tInitCuda()\n//\n//\tNx, Ny, Nz := 2, 4, 8\n//\n//\tforward := FFT3DD2Z(Nx, Ny, Nz)\n//\tdefer forward.Destroy()\n//\n//\tinput := MakeFloat64s(forward.InputLen())\n//\tdefer input.Free()\n//\n//\tinputData := make([]float64, forward.InputLen())\n//\tinputData[5] = 1\n//\tinput.CopyHtoD(inputData)\n//\n//\toutput := MakeComplex128s(forward.OutputLen())\n//\tdefer output.Free()\n//\n//\tforward.Exec(input, output)\n//\n//\tbackward := FFT3DZ2D(Nx, Ny, Nz)\n//\tbackward.Exec(output, input)\n//\n//\tfmt.Println(\"input:\", Reshape3DFloat64(inputData, Nx, Ny, Nz))\n//\tfmt.Println(\"forward+inverse:\", Reshape3DFloat64(input.Host(), Nx, 
Ny, Nz))\n//\n//\t// Output:\n//\t// input: [[[0 0 0 0 0 1 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]] [[0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]]]\n//\t// forward+inverse: [[[0 0 0 0 0 64 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]] [[0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]]]\n//}\n"
  },
  {
    "path": "safe/fft3dc2r.go",
    "content": "package safe\n\nimport (\n\t\"fmt\"\n\t\"github.com/barnex/cuda5/cufft\"\n)\n\n// 3D single-precision complex-to-real FFT plan.\ntype FFT3DC2RPlan struct {\n\tfftplan\n\tsize3D\n}\n\n// 3D single-precision complex-to-real FFT plan.\nfunc FFT3DC2R(Nx, Ny, Nz int) FFT3DC2RPlan {\n\thandle := cufft.Plan3d(Nx, Ny, Nz, cufft.C2R)\n\thandle.SetCompatibilityMode(cufft.COMPATIBILITY_NATIVE)\n\treturn FFT3DC2RPlan{fftplan{handle, 0}, size3D{Nx, Ny, Nz}}\n}\n\n// Execute the FFT plan. Synchronized.\n// src and dst are 3D arrays stored as 1D arrays.\nfunc (p FFT3DC2RPlan) Exec(src Complex64s, dst Float32s) {\n\toksrclen := p.InputLen()\n\tif src.Len() != oksrclen {\n\t\tpanic(fmt.Errorf(\"size mismatch: expecting src len %v, got %v\", oksrclen, src.Len()))\n\t}\n\tokdstlen := p.OutputLen()\n\tif dst.Len() != okdstlen {\n\t\tpanic(fmt.Errorf(\"size mismatch: expecting dst len %v, got %v\", okdstlen, dst.Len()))\n\t}\n\tp.handle.ExecC2R(src.Pointer(), dst.Pointer())\n\tp.stream.Synchronize() //!\n}\n\n// 3D size of the input array.\nfunc (p FFT3DC2RPlan) InputSize() (Nx, Ny, Nz int) {\n\treturn p.size3D[0], p.size3D[1], p.size3D[2]/2 + 1\n}\n\n// 3D size of the output array.\nfunc (p FFT3DC2RPlan) OutputSize() (Nx, Ny, Nz int) {\n\treturn p.size3D[0], p.size3D[1], p.size3D[2]\n}\n\n// Required length of the (1D) input array.\nfunc (p FFT3DC2RPlan) InputLen() int {\n\treturn prod3(p.InputSize())\n}\n\n// Required length of the (1D) output array.\nfunc (p FFT3DC2RPlan) OutputLen() int {\n\treturn prod3(p.OutputSize())\n}\n"
  },
  {
    "path": "safe/fft3dd2z.go",
    "content": "package safe\n\nimport (\n\t\"fmt\"\n\t\"github.com/barnex/cuda5/cufft\"\n)\n\n// 3D double-precision real-to-complex FFT plan.\ntype FFT3DD2ZPlan struct {\n\tfftplan\n\tsize3D\n}\n\n// 3D double-precision real-to-complex FFT plan.\nfunc FFT3DD2Z(Nx, Ny, Nz int) FFT3DD2ZPlan {\n\thandle := cufft.Plan3d(Nx, Ny, Nz, cufft.D2Z)\n\thandle.SetCompatibilityMode(cufft.COMPATIBILITY_NATIVE)\n\treturn FFT3DD2ZPlan{fftplan{handle, 0}, size3D{Nx, Ny, Nz}}\n}\n\n// Execute the FFT plan. Synchronized.\n// src and dst are 3D arrays stored as 1D arrays.\nfunc (p FFT3DD2ZPlan) Exec(src Float64s, dst Complex128s) {\n\toksrclen := p.InputLen()\n\tif src.Len() != oksrclen {\n\t\tpanic(fmt.Errorf(\"size mismatch: expecting src len %v, got %v\", oksrclen, src.Len()))\n\t}\n\tokdstlen := p.OutputLen()\n\tif dst.Len() != okdstlen {\n\t\tpanic(fmt.Errorf(\"size mismatch: expecting dst len %v, got %v\", okdstlen, dst.Len()))\n\t}\n\tp.handle.ExecD2Z(src.Pointer(), dst.Pointer())\n\tp.stream.Synchronize() //!\n}\n\n// 3D size of the input array.\nfunc (p FFT3DD2ZPlan) InputSize() (Nx, Ny, Nz int) {\n\treturn p.size3D[0], p.size3D[1], p.size3D[2]\n}\n\n// 3D size of the output array.\nfunc (p FFT3DD2ZPlan) OutputSize() (Nx, Ny, Nz int) {\n\treturn p.size3D[0], p.size3D[1], p.size3D[2]/2 + 1\n}\n\n// Required length of the (1D) input array.\nfunc (p FFT3DD2ZPlan) InputLen() int {\n\treturn prod3(p.InputSize())\n}\n\n// Required length of the (1D) output array.\nfunc (p FFT3DD2ZPlan) OutputLen() int {\n\treturn prod3(p.OutputSize())\n}\n"
  },
  {
    "path": "safe/fft3dr2c.go",
    "content": "package safe\n\nimport (\n\t\"fmt\"\n\t\"github.com/barnex/cuda5/cufft\"\n)\n\n// 3D single-precision real-to-complex FFT plan.\ntype FFT3DR2CPlan struct {\n\tfftplan\n\tsize3D\n}\n\n// 3D single-precision real-to-complex FFT plan.\nfunc FFT3DR2C(Nx, Ny, Nz int) FFT3DR2CPlan {\n\thandle := cufft.Plan3d(Nx, Ny, Nz, cufft.R2C)\n\thandle.SetCompatibilityMode(cufft.COMPATIBILITY_NATIVE)\n\treturn FFT3DR2CPlan{fftplan{handle, 0}, size3D{Nx, Ny, Nz}}\n}\n\n// Execute the FFT plan. Synchronized.\n// src and dst are 3D arrays stored as 1D arrays.\nfunc (p FFT3DR2CPlan) Exec(src Float32s, dst Complex64s) {\n\toksrclen := p.InputLen()\n\tif src.Len() != oksrclen {\n\t\tpanic(fmt.Errorf(\"size mismatch: expecting src len %v, got %v\", oksrclen, src.Len()))\n\t}\n\tokdstlen := p.OutputLen()\n\tif dst.Len() != okdstlen {\n\t\tpanic(fmt.Errorf(\"size mismatch: expecting dst len %v, got %v\", okdstlen, dst.Len()))\n\t}\n\tp.handle.ExecR2C(src.Pointer(), dst.Pointer())\n\tp.stream.Synchronize() //!\n}\n\n// 3D size of the input array.\nfunc (p FFT3DR2CPlan) InputSize() (Nx, Ny, Nz int) {\n\treturn p.size3D[0], p.size3D[1], p.size3D[2]\n}\n\n// 3D size of the output array.\nfunc (p FFT3DR2CPlan) OutputSize() (Nx, Ny, Nz int) {\n\treturn p.size3D[0], p.size3D[1], p.size3D[2]/2 + 1\n}\n\n// Required length of the (1D) input array.\nfunc (p FFT3DR2CPlan) InputLen() int {\n\treturn prod3(p.InputSize())\n}\n\n// Required length of the (1D) output array.\nfunc (p FFT3DR2CPlan) OutputLen() int {\n\treturn prod3(p.OutputSize())\n}\n"
  },
  {
    "path": "safe/fft3dz2d.go",
    "content": "package safe\n\nimport (\n\t\"fmt\"\n\t\"github.com/barnex/cuda5/cufft\"\n)\n\n// 3D double-precision complex-to-real FFT plan.\ntype FFT3DZ2DPlan struct {\n\tfftplan\n\tsize3D\n}\n\n// 3D double-precision complex-to-real FFT plan.\nfunc FFT3DZ2D(Nx, Ny, Nz int) FFT3DZ2DPlan {\n\thandle := cufft.Plan3d(Nx, Ny, Nz, cufft.Z2D)\n\thandle.SetCompatibilityMode(cufft.COMPATIBILITY_NATIVE)\n\treturn FFT3DZ2DPlan{fftplan{handle, 0}, size3D{Nx, Ny, Nz}}\n}\n\n// Execute the FFT plan. Synchronized.\n// src and dst are 3D arrays stored as 1D arrays.\nfunc (p FFT3DZ2DPlan) Exec(src Complex128s, dst Float64s) {\n\toksrclen := p.InputLen()\n\tif src.Len() != oksrclen {\n\t\tpanic(fmt.Errorf(\"size mismatch: expecting src len %v, got %v\", oksrclen, src.Len()))\n\t}\n\tokdstlen := p.OutputLen()\n\tif dst.Len() != okdstlen {\n\t\tpanic(fmt.Errorf(\"size mismatch: expecting dst len %v, got %v\", okdstlen, dst.Len()))\n\t}\n\tp.handle.ExecZ2D(src.Pointer(), dst.Pointer())\n\tp.stream.Synchronize() //!\n}\n\n// 3D size of the input array.\nfunc (p FFT3DZ2DPlan) InputSize() (Nx, Ny, Nz int) {\n\treturn p.size3D[0], p.size3D[1], p.size3D[2]/2 + 1\n}\n\n// 3D size of the output array.\nfunc (p FFT3DZ2DPlan) OutputSize() (Nx, Ny, Nz int) {\n\treturn p.size3D[0], p.size3D[1], p.size3D[2]\n}\n\n// Required length of the (1D) input array.\nfunc (p FFT3DZ2DPlan) InputLen() int {\n\treturn prod3(p.InputSize())\n}\n\n// Required length of the (1D) output array.\nfunc (p FFT3DZ2DPlan) OutputLen() int {\n\treturn prod3(p.OutputSize())\n}\n"
  },
  {
    "path": "safe/fftplan.go",
    "content": "package safe\n\n// INTERNAL\n// Base implementation for all FFT plans.\n\nimport (\n\t\"github.com/barnex/cuda5/cu\"\n\t\"github.com/barnex/cuda5/cufft\"\n)\n\n// Base implementation for all FFT plans.\ntype fftplan struct {\n\thandle cufft.Handle\n\tstream cu.Stream\n}\n\n// For the sake of embedding.\ntype size1D int\n\n// Returns the logical size of the FFT:\n// the number of elements (real or complex)\n// it transforms.\nfunc (s size1D) Size() int { return int(s) }\n\n// For the sake of embedding.\ntype size3D [3]int\n\n// Returns the logical size of the FFT:\n// the number of elements (real or complex)\n// it transforms.\nfunc (s size3D) Size() (Nx, Ny, Nz int) { return s[0], s[1], s[2] }\n\nfunc prod3(x, y, z int) int {\n\treturn x * y * z\n}\n\n// Releases all resources associated with the FFT plan.\nfunc (p fftplan) Destroy() { p.handle.Destroy() }\n\n// Associates a CUDA stream with the FFT plan.\n// If a stream is set, plan.Stream().Synchronize() can\n// be called to wait for the execution to finish.\nfunc (p fftplan) SetStream(stream cu.Stream) {\n\tp.handle.SetStream(stream)\n\tp.stream = stream\n}\n\n// Returns the CUDA stream associated with the FFT plan.\nfunc (p fftplan) Stream() cu.Stream {\n\treturn p.stream\n}\n"
  },
  {
    "path": "safe/float32s.go",
    "content": "package safe\n\nimport (\n\t\"fmt\"\n\t\"github.com/barnex/cuda5/cu\"\n\t\"math\"\n\t\"unsafe\"\n)\n\n// Slice of float32's on the GPU.\ntype Float32s struct{ slice }\n\n// Make a slice of float32's on the GPU.\n// Initialized to zero.\nfunc MakeFloat32s(len_ int) Float32s {\n\treturn Float32s{makeslice(len_, cu.SIZEOF_FLOAT32)}\n}\n\n// Return a slice from start (inclusive) to stop (exclusive),\n// sharing the underlying storage with the original slice.\n// Slices obtained in this way should not be Free()'d\nfunc (s Float32s) Slice(start, stop int) Float32s {\n\treturn Float32s{s.slice.slice(start, stop, cu.SIZEOF_FLOAT32)}\n}\n\n// Copy src from host to dst on the device.\nfunc (dst Float32s) CopyHtoD(src []float32) {\n\tdst.copyHtoD(unsafe.Pointer(&src[0]), len(src), cu.SIZEOF_FLOAT32)\n}\n\n// Copy src from device to dst on host.\nfunc (src Float32s) CopyDtoH(dst []float32) {\n\tsrc.copyDtoH(unsafe.Pointer(&dst[0]), len(dst), cu.SIZEOF_FLOAT32)\n}\n\n// Copy src on device to dst on device.\nfunc (dst Float32s) CopyDtoD(src Float32s) {\n\tdst.copyDtoD(&src.slice, cu.SIZEOF_FLOAT32)\n}\n\n// Copy src from host to dst on the device, asynchronously.\nfunc (dst Float32s) CopyHtoDAsync(src []float32, stream cu.Stream) {\n\tdst.copyHtoDAsync(unsafe.Pointer(&src[0]), len(src), cu.SIZEOF_FLOAT32, stream)\n}\n\n// Copy src from device to dst on host, asynchronously.\nfunc (src Float32s) CopyDtoHAsync(dst []float32, stream cu.Stream) {\n\tsrc.copyDtoHAsync(unsafe.Pointer(&dst[0]), len(dst), cu.SIZEOF_FLOAT32, stream)\n}\n\n// Copy src on device to dst on device, asynchronously.\nfunc (dst Float32s) CopyDtoDAsync(src Float32s, stream cu.Stream) {\n\tdst.copyDtoDAsync(&src.slice, cu.SIZEOF_FLOAT32, stream)\n}\n\n// Returns a fresh copy on host.\nfunc (src Float32s) Host() []float32 {\n\tcpy := make([]float32, src.Len())\n\tsrc.CopyDtoH(cpy)\n\treturn cpy\n}\n\n// Set the entire slice to this value.\nfunc (s Float32s) Memset(value float32) 
{\n\tcu.MemsetD32(s.Pointer(), math.Float32bits(value), int64(s.Len()))\n\tcu.CtxSynchronize()\n}\n\n// Set the entire slice to this value, asynchronously.\nfunc (s Float32s) MemsetAsync(value float32, stream cu.Stream) {\n\tcu.MemsetD32Async(s.Pointer(), math.Float32bits(value), int64(s.Len()), stream)\n}\n\n// Re-interpret the array as complex numbers,\n// in interleaved format. Underlying storage\n// is shared.\nfunc (s Float32s) Complex() Complex64s {\n\tif s.Len()%2 != 0 {\n\t\tpanic(fmt.Errorf(\"complex: need even number of elements, have:%v\", s.Len()))\n\t}\n\treturn Complex64s{slice{s.ptr_, s.len_ / 2, s.cap_ / 2}}\n}\n"
  },
  {
    "path": "safe/float32s_test.go",
    "content": "package safe\n\nimport (\n\t\"reflect\"\n\t\"testing\"\n)\n\nfunc TestFloat32sSlice(test *testing.T) {\n\tInitCuda()\n\n\ta := MakeFloat32s(100)\n\tdefer a.Free()\n\n\tif !reflect.DeepEqual(a.Host(), make([]float32, 100)) {\n\t\ttest.Error(a.Host())\n\t}\n\n\tb := make([]float32, 100)\n\n\tif a.Len() != len(b) {\n\t\ttest.Error(\"len:\", a.Len(), \"!=\", cap(b))\n\t}\n\tif a.Cap() != cap(b) {\n\t\ttest.Error(\"cap:\", a.Cap(), \"!=\", cap(b))\n\t}\n\n\tc := a.Slice(20, 30)\n\td := b[20:30]\n\n\tif c.Len() != len(d) {\n\t\ttest.Error(\"sliced len:\", c.Len(), \"!=\", cap(d))\n\t}\n\tif c.Cap() != cap(d) {\n\t\ttest.Error(\"sliced cap:\", c.Cap(), \"!=\", cap(d))\n\t}\n\n\te := a.Slice(0, 50)\n\tf := b[0:50]\n\n\tif e.Len() != len(f) {\n\t\ttest.Error(\"sliced len:\", e.Len(), \"!=\", cap(f))\n\t}\n\tif e.Cap() != cap(f) {\n\t\ttest.Error(\"sliced cap:\", e.Cap(), \"!=\", cap(f))\n\t}\n}\n\nfunc TestFloat32sPanic1(test *testing.T) {\n\tInitCuda()\n\n\tdefer func() {\n\t\terr := recover()\n\t\ttest.Log(\"recovered:\", err)\n\t\tif err == nil {\n\t\t\ttest.Fail()\n\t\t}\n\t}()\n\n\ta := MakeFloat32s(100)\n\tdefer a.Free()\n\n\ta.Slice(-1, 10)\n}\n\nfunc TestFloat32sPanic2(test *testing.T) {\n\tInitCuda()\n\n\tdefer func() {\n\t\terr := recover()\n\t\ttest.Log(\"recovered:\", err)\n\t\tif err == nil {\n\t\t\ttest.Fail()\n\t\t}\n\t}()\n\n\ta := MakeFloat32s(100)\n\tdefer a.Free()\n\n\ta.Slice(0, 101)\n}\n\nfunc TestFloat32sCopy(test *testing.T) {\n\tInitCuda()\n\n\ta := make([]float32, 100)\n\n\tb := MakeFloat32s(100)\n\tdefer b.Free()\n\n\tc := MakeFloat32s(100)\n\tdefer c.Free()\n\n\td := make([]float32, 200)\n\n\tfor i := range a {\n\t\ta[i] = float32(i)\n\t}\n\n\tb.CopyHtoD(a)\n\n\tc.CopyDtoD(b)\n\n\tc.CopyDtoH(d[:100])\n\n\tif !reflect.DeepEqual(a, d[:100]) {\n\t\ttest.Error(d)\n\t}\n\tif !reflect.DeepEqual(d[100:], make([]float32, 100)) {\n\t\ttest.Error(d)\n\t}\n}\n"
  },
  {
    "path": "safe/float64s.go",
    "content": "package safe\n\nimport (\n\t\"fmt\"\n\t\"github.com/barnex/cuda5/cu\"\n\t\"unsafe\"\n)\n\n// Slice of float64's on the GPU.\ntype Float64s struct{ slice }\n\n// Make a slice of float64's on the GPU.\n// Initialized to zero.\nfunc MakeFloat64s(len_ int) Float64s {\n\treturn Float64s{makeslice(len_, cu.SIZEOF_FLOAT64)}\n}\n\n// Return a slice from start (inclusive) to stop (exclusive),\n// sharing the underlying storage with the original slice.\n// Slices obtained in this way should not be Free()'d\nfunc (s Float64s) Slice(start, stop int) Float64s {\n\treturn Float64s{s.slice.slice(start, stop, cu.SIZEOF_FLOAT64)}\n}\n\n// Copy src from host to dst on the device.\nfunc (dst Float64s) CopyHtoD(src []float64) {\n\tdst.copyHtoD(unsafe.Pointer(&src[0]), len(src), cu.SIZEOF_FLOAT64)\n}\n\n// Copy src from device to dst on host.\nfunc (src Float64s) CopyDtoH(dst []float64) {\n\tsrc.copyDtoH(unsafe.Pointer(&dst[0]), len(dst), cu.SIZEOF_FLOAT64)\n}\n\n// Copy src on device to dst on device.\nfunc (dst Float64s) CopyDtoD(src Float64s) {\n\tdst.copyDtoD(&src.slice, cu.SIZEOF_FLOAT64)\n}\n\n// Copy src from host to dst on the device, asynchronously.\nfunc (dst Float64s) CopyHtoDAsync(src []float64, stream cu.Stream) {\n\tdst.copyHtoDAsync(unsafe.Pointer(&src[0]), len(src), cu.SIZEOF_FLOAT64, stream)\n}\n\n// Copy src from device to dst on host, asynchronously.\nfunc (src Float64s) CopyDtoHAsync(dst []float64, stream cu.Stream) {\n\tsrc.copyDtoHAsync(unsafe.Pointer(&dst[0]), len(dst), cu.SIZEOF_FLOAT64, stream)\n}\n\n// Copy src on device to dst on device, asynchronously.\nfunc (dst Float64s) CopyDtoDAsync(src Float64s, stream cu.Stream) {\n\tdst.copyDtoDAsync(&src.slice, cu.SIZEOF_FLOAT64, stream)\n}\n\n// Returns a fresh copy on host.\nfunc (src Float64s) Host() []float64 {\n\tcpy := make([]float64, src.Len())\n\tsrc.CopyDtoH(cpy)\n\treturn cpy\n}\n\n// Re-interpret the array as complex numbers,\n// in interleaved format. 
Underlying storage\n// is shared.\nfunc (s Float64s) Complex() Complex128s {\n\tif s.Len()%2 != 0 {\n\t\tpanic(fmt.Errorf(\"complex: need even number of elements, have:%v\", s.Len()))\n\t}\n\treturn Complex128s{slice{s.ptr_, s.len_ / 2, s.cap_ / 2}}\n}\n"
  },
  {
    "path": "safe/float64s_test.go",
    "content": "package safe\n\nimport (\n\t\"reflect\"\n\t\"testing\"\n)\n\nfunc TestFloat64sSlice(test *testing.T) {\n\tInitCuda()\n\n\ta := MakeFloat64s(100)\n\tdefer a.Free()\n\n\tif !reflect.DeepEqual(a.Host(), make([]float64, 100)) {\n\t\ttest.Error(a.Host())\n\t}\n\n\tb := make([]float64, 100)\n\n\tif a.Len() != len(b) {\n\t\ttest.Error(\"len:\", a.Len(), \"!=\", cap(b))\n\t}\n\tif a.Cap() != cap(b) {\n\t\ttest.Error(\"cap:\", a.Cap(), \"!=\", cap(b))\n\t}\n\n\tc := a.Slice(20, 30)\n\td := b[20:30]\n\n\tif c.Len() != len(d) {\n\t\ttest.Error(\"sliced len:\", c.Len(), \"!=\", cap(d))\n\t}\n\tif c.Cap() != cap(d) {\n\t\ttest.Error(\"sliced cap:\", c.Cap(), \"!=\", cap(d))\n\t}\n\n\te := a.Slice(0, 50)\n\tf := b[0:50]\n\n\tif e.Len() != len(f) {\n\t\ttest.Error(\"sliced len:\", e.Len(), \"!=\", cap(f))\n\t}\n\tif e.Cap() != cap(f) {\n\t\ttest.Error(\"sliced cap:\", e.Cap(), \"!=\", cap(f))\n\t}\n}\n\nfunc TestFloat64sPanic1(test *testing.T) {\n\tInitCuda()\n\n\tdefer func() {\n\t\terr := recover()\n\t\ttest.Log(\"recovered:\", err)\n\t\tif err == nil {\n\t\t\ttest.Fail()\n\t\t}\n\t}()\n\n\ta := MakeFloat64s(100)\n\tdefer a.Free()\n\n\ta.Slice(-1, 10)\n}\n\nfunc TestFloat64sPanic2(test *testing.T) {\n\tInitCuda()\n\n\tdefer func() {\n\t\terr := recover()\n\t\ttest.Log(\"recovered:\", err)\n\t\tif err == nil {\n\t\t\ttest.Fail()\n\t\t}\n\t}()\n\n\ta := MakeFloat64s(100)\n\tdefer a.Free()\n\n\ta.Slice(0, 101)\n}\n\nfunc TestFloat64sCopy(test *testing.T) {\n\tInitCuda()\n\n\ta := make([]float64, 100)\n\n\tb := MakeFloat64s(100)\n\tdefer b.Free()\n\n\tc := MakeFloat64s(100)\n\tdefer c.Free()\n\n\td := make([]float64, 200)\n\n\tfor i := range a {\n\t\ta[i] = float64(i)\n\t}\n\n\tb.CopyHtoD(a)\n\n\tc.CopyDtoD(b)\n\n\tc.CopyDtoH(d[:100])\n\n\tif !reflect.DeepEqual(a, d[:100]) {\n\t\ttest.Error(d)\n\t}\n\tif !reflect.DeepEqual(d[100:], make([]float64, 100)) {\n\t\ttest.Error(d)\n\t}\n}\n"
  },
  {
    "path": "safe/init.go",
    "content": "package safe\n\nimport (\n\t\"github.com/barnex/cuda5/cu\"\n\t\"runtime\"\n)\n\nfunc InitCuda() {\n\truntime.LockOSThread()\n\tcu.Init(0)\n\tcu.CtxCreate(cu.CTX_SCHED_AUTO, 0).SetCurrent()\n}\n"
  },
  {
    "path": "safe/slice.go",
    "content": "package safe\n\n// INTERNAL.\n// This file implements common functionality for all slice types\n// (Float32s, Float64s, Complex64s, ...).\n\nimport (\n\t\"fmt\"\n\t\"github.com/barnex/cuda5/cu\"\n\t\"unsafe\"\n)\n\n// internal base func for all makeXXX() functions\nfunc makeslice(len_ int, elemsize int) slice {\n\tbytes := int64(len_) * int64(elemsize)\n\ts := slice{0, len_, len_}\n\tif bytes > 0 {\n\t\ts.ptr_ = cu.MemAlloc(bytes)\n\t\tcu.MemsetD8(s.ptr_, 0, bytes)\n\t\tcu.CtxSynchronize()\n\t}\n\treturn s\n}\n\n// internal base type for all slices\ntype slice struct {\n\tptr_ cu.DevicePtr // address offset of first element\n\tlen_ int          // number of elements\n\tcap_ int\n}\n\n// Pointer to the first element.\nfunc (s *slice) Pointer() cu.DevicePtr { return s.ptr_ }\n\n// Slice length (number of elements).\nfunc (s *slice) Len() int { return s.len_ }\n\n// Slice capacity.\nfunc (s *slice) Cap() int { return s.cap_ }\n\n// Free the underlying storage.\n// To be used with care. Free() should only be called on\n// a slice created by MakeXXX(), not on a slice created\n// by x.Slice(). 
Freeing a slice invalidates all other\n// slices referring to it.\nfunc (s *slice) Free() {\n\ts.ptr_.Free()\n\ts.len_ = 0\n\ts.cap_ = 0\n}\n\n// internal base func for all slice() functions\nfunc (s *slice) slice(start, stop int, elemsize uintptr) slice {\n\tif start >= s.cap_ || start < 0 || stop > s.cap_ || stop < 0 {\n\t\tpanic(\"cuda4/safe: slice index out of bounds\")\n\t}\n\tif start > stop {\n\t\tpanic(\"cuda4/safe: inverted slice range\")\n\t}\n\treturn slice{cu.DevicePtr(uintptr(s.ptr_) + uintptr(start)*elemsize), stop - start, s.cap_ - start}\n}\n\nfunc (dst *slice) copyHtoD(src unsafe.Pointer, srclen int, elemsize int) {\n\tif srclen != dst.Len() {\n\t\tpanic(fmt.Errorf(\"cuda4/safe: len mismatch: len(src)=%v (host), dst.Len()=%v (device)\", srclen, dst.Len()))\n\t}\n\tcu.MemcpyHtoD(dst.Pointer(), src, int64(elemsize)*int64(srclen))\n}\n\nfunc (src *slice) copyDtoH(dst unsafe.Pointer, dstlen int, elemsize int) {\n\tif dstlen != src.Len() {\n\t\tpanic(fmt.Errorf(\"cuda4/safe: len mismatch: src.Len()=%v (device), len(dst)=%v (host)\", src.Len(), dstlen))\n\t}\n\tcu.MemcpyDtoH(dst, src.Pointer(), int64(elemsize)*int64(dstlen))\n}\n\nfunc (dst *slice) copyDtoD(src *slice, elemsize int) {\n\tif dst.Len() != src.Len() {\n\t\tpanic(fmt.Errorf(\"cuda4/safe: len mismatch: src.Len()=%v (device), dst.Len()=%v\", src.Len(), dst.Len()))\n\t}\n\tcu.MemcpyDtoD(dst.Pointer(), src.Pointer(), int64(elemsize)*int64(dst.Len()))\n}\n\nfunc (dst *slice) copyHtoDAsync(src unsafe.Pointer, srclen int, elemsize int, stream cu.Stream) {\n\tif srclen != dst.Len() {\n\t\tpanic(fmt.Errorf(\"cuda4/safe: len mismatch: len(src)=%v (host), dst.Len()=%v (device)\", srclen, dst.Len()))\n\t}\n\tcu.MemcpyHtoDAsync(dst.Pointer(), src, int64(elemsize)*int64(srclen), stream)\n}\n\nfunc (src *slice) copyDtoHAsync(dst unsafe.Pointer, dstlen int, elemsize int, stream cu.Stream) {\n\tif dstlen != src.Len() {\n\t\tpanic(fmt.Errorf(\"cuda4/safe: len mismatch: src.Len()=%v (device), len(dst)=%v 
(host)\", src.Len(), dstlen))\n\t}\n\tcu.MemcpyDtoHAsync(dst, src.Pointer(), int64(elemsize)*int64(dstlen), stream)\n}\n\nfunc (dst *slice) copyDtoDAsync(src *slice, elemsize int, stream cu.Stream) {\n\tif dst.Len() != src.Len() {\n\t\tpanic(fmt.Errorf(\"cuda4/safe: len mismatch: src.Len()=%v (device), dst.Len()=%v\", src.Len(), dst.Len()))\n\t}\n\tcu.MemcpyDtoDAsync(dst.Pointer(), src.Pointer(), int64(elemsize)*int64(dst.Len()), stream)\n}\n\n// Manually set the pointer, length and capacity.\n// Side-steps the security mechanisms, use with caution.\nfunc (s *slice) UnsafeSet(pointer unsafe.Pointer, length, capacity int) {\n\ts.ptr_ = cu.DevicePtr(uintptr(pointer))\n\ts.len_ = length\n\ts.cap_ = capacity\n}\n"
  },
  {
    "path": "safe/subs.sh",
    "content": "#! /bin/bash\n\nsubs32='s/loat32/loat64/g;'\nsubs32+='s/FLOAT32/FLOAT64/g;'\n\n#sed $subs32 float32s.go > float64s.go\n#sed $subs32 float32s_test.go > float64s_test.go\n\nsubsc64='s/Float32/Complex64/g;'\nsubsc64+='s/float32/complex64/g;'\nsubsc64+='s/FLOAT32/COMPLEX64/g;'\n#sed $subsc64 float32s_test.go > complex64s_test.go\n#sed $subsc64 float32s.go > complex64s.go\n\n\nsubsc128='s/omplex64/omplex128/g;'\nsubsc128+='s/COMPLEX64/COMPLEX128/g;'\nsed $subsc128 complex64s.go > complex128s.go\nsed $subsc128 complex64s_test.go > complex128s_test.go\n"
  }
]