Repository: barnex/cuda5 Branch: master Commit: da30a9b287d8 Files: 72 Total size: 172.6 KB Directory structure: gitextract_cibr8rm8/ ├── .gitignore ├── Makefile ├── README.md ├── cu/ │ ├── Makefile │ ├── README │ ├── cgoflags.go │ ├── context.go │ ├── context_test.go │ ├── device.go │ ├── device_test.go │ ├── dim3.go │ ├── doc.go │ ├── execution.go │ ├── function.go │ ├── init.go │ ├── init_test.go │ ├── memory.go │ ├── memory_test.go │ ├── memset.go │ ├── module.go │ ├── module_test.go │ ├── peer.go │ ├── result.go │ ├── runtimeapi.go │ ├── stream.go │ ├── testdata/ │ │ ├── testmodule.cu │ │ └── testmodule.ptx │ ├── version.go │ └── version_test.go ├── cuda/ │ ├── Makefile │ ├── README │ ├── cgoflags.go │ └── device.go ├── cufft/ │ ├── Makefile │ ├── README │ ├── cgoflags.go │ ├── doc.go │ ├── fft_test.go │ ├── init_test.go │ ├── mode.go │ ├── plan.go │ ├── result.go │ └── type.go ├── curand/ │ ├── Makefile │ ├── README │ ├── cgoflags.go │ ├── generator.go │ └── status.go ├── doc.go └── safe/ ├── Makefile ├── README ├── complex128s.go ├── complex128s_test.go ├── complex64s.go ├── complex64s_test.go ├── doc.go ├── fft1d_test.go ├── fft1dc2r.go ├── fft1dr2c.go ├── fft3d_test.go ├── fft3dc2r.go ├── fft3dd2z.go ├── fft3dr2c.go ├── fft3dz2d.go ├── fftplan.go ├── float32s.go ├── float32s_test.go ├── float64s.go ├── float64s_test.go ├── init.go ├── slice.go └── subs.sh ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ *.swp *.{6,8,5,o} ================================================ FILE: Makefile ================================================ all: 6g doc 6g: go install -v go tool vet *.go gofmt -w *.go GCCGO=gccgo -gccgoflags '-static-libgcc -O3' gccgo: go install -v -compiler $(GCCGO) test: 6gtest gccgotest 6gtest: go test gccgotest: go test -compiler $(GCCGO) bench: 6gbench gccgobench 
6gbench: go test -bench=. gccgobench: go test -bench=. -compiler $(GCCGO) clean: go clean go-optview -c -w *.go gofmt -w *.go opt: go-optview -w *.go gofmt -w *.go doc: godoc github.com/barnex/cuda5 > README ================================================ FILE: README.md ================================================ # Go bindings for CUDA Go bindings for nVIDIA CUDA 5 and later. This package compiles with both gc and gccgo. ![fig](gophergpu.png) ================================================ FILE: cu/Makefile ================================================ all: 6g gccgo doc 6g: go install -v go tool vet *.go gofmt -w *.go GCCGO=gccgo -gccgoflags '-static-libgcc -O3' gccgo: go build -v -compiler $(GCCGO) test: 6gtest gccgotest 6gtest: go test gccgotest: go test -compiler $(GCCGO) bench: 6gbench gccgobench 6gbench: go test -bench=. gccgobench: go test -bench=. -compiler $(GCCGO) clean: go clean doc: godoc github.com/barnex/cuda5/cu > README ================================================ FILE: cu/README ================================================ PACKAGE package cu import "github.com/barnex/cuda5/cu" Go bindings for the CUDA driver API. CONSTANTS const ( // If the number of contexts > number of CPUs, yield to other OS threads when waiting for the GPU, otherwise CUDA spin on the processor. CTX_SCHED_AUTO = C.CU_CTX_SCHED_AUTO // Spin when waiting for results from the GPU. CTX_SCHED_SPIN = C.CU_CTX_SCHED_SPIN // Yield its thread when waiting for results from the GPU. CTX_SCHED_YIELD = C.CU_CTX_SCHED_YIELD // Bock the CPU thread on a synchronization primitive when waiting for the GPU to finish work. CTX_BLOCKING_SYNC // Support mapped pinned allocations. This flag must be set in order to allocate pinned host memory that is accessible to the GPU. CTX_MAP_HOST = C.CU_CTX_MAP_HOST //Do not reduce local memory after resizing local memory for a kernel. 
CTX_LMEM_RESIZE_TO_MAX = C.CU_CTX_LMEM_RESIZE_TO_MAX ) Flags for CtxCreate const ( SIZEOF_FLOAT32 = 4 SIZEOF_FLOAT64 = 8 SIZEOF_COMPLEX64 = 8 SIZEOF_COMPLEX128 = 16 ) Type size in bytes FUNCTIONS func CtxDestroy(ctx *Context) Destroys the CUDA context specified by ctx. If the context usage count is not equal to 1, or the context is current to any CPU thread other than the current one, this function fails. Floating contexts (detached from a CPU thread via cuCtxPopCurrent()) may be destroyed by this function. func CtxDisablePeerAccess(peer Context) Reverses CtxEnablePeerAccess(). func CtxEnablePeerAccess(peer Context) Make allocations from the peer Context available to the current context. func CtxGetApiVersion(ctx Context) (version int) Returns the API version to create the context. func CtxSetCurrent(ctx Context) Sets the current active context. func CtxSynchronize() Blocks until the device has completed all preceding requested tasks, if the context was created with the CU_CTX_SCHED_BLOCKING_SYNC flag. func DeviceCanAccessPeer(dev, peer Device) bool Returns true if CtxEnablePeerAccess can be called on a context for dev and peerDev. func DeviceComputeCapability(device Device) (major, minor int) Returns the compute capability of the device. func DeviceGetAttribute(attrib DeviceAttribute, dev Device) int Gets the value of a device attribute. func DeviceGetCount() int Returns the number of devices with compute capability greater than or equal to 1.0 that are available for execution. func DeviceGetName(dev Device) string Gets the name of the device. func DeviceTotalMem(device Device) int64 Returns the total amount of memory available on the device in bytes. func FuncGetAttribute(attrib FunctionAttribute, function Function) int func Init(flags int) Initialize the CUDA driver API. Currently, flags must be 0. If Init() has not been called, any function from the driver API will panic with ERROR_NOT_INITIALIZED. 
func LaunchKernel(f Function, gridDimX, gridDimY, gridDimZ int, blockDimX, blockDimY, blockDimZ int, sharedMemBytes int, stream Stream, kernelParams []unsafe.Pointer) func MemAllocHost(bytes int64) unsafe.Pointer func MemFree(ptr *DevicePtr) Frees device memory allocated by MemAlloc(). Overwrites the pointer with NULL. It is safe to double-free. func MemFreeHost(ptr unsafe.Pointer) func MemGetAddressRange(ptr DevicePtr) (bytes int64, base DevicePtr) Returns the base address and size of the allocation (by MemAlloc) that contains the input pointer ptr. func MemGetInfo() (free, total int64) Returns the free and total amount of memory in the current Context (in bytes). func MemHostRegister(ptr unsafe.Pointer, bytes int64, flags MemHostRegisterFlag) Page-locks memory specified by the pointer and bytes. The pointer and byte size must be aligned to the host page size (4KB) See also: MemHostUnregister() func MemHostUnregister(ptr unsafe.Pointer) Unmaps memory locked by MemHostRegister(). func Memcpy(dst, src DevicePtr, bytes int64) Copies a number of bytes on the current device. Requires unified addressing to be supported. See also: MemcpyDtoD(). TODO(a): is actually an auto copy for device and/or host memory func MemcpyAsync(dst, src DevicePtr, bytes int64, stream Stream) Asynchronously copies a number of bytes on the current device. func MemcpyDtoD(dst, src DevicePtr, bytes int64) Copies a number of bytes from device to device. func MemcpyDtoDAsync(dst, src DevicePtr, bytes int64, stream Stream) Asynchronously copies a number of bytes from device to device. func MemcpyDtoH(dst unsafe.Pointer, src DevicePtr, bytes int64) Copies a number of bytes from device to host. func MemcpyDtoHAsync(dst unsafe.Pointer, src DevicePtr, bytes int64, stream Stream) Asynchronously copies a number of bytes from device to host. The host memory must be page-locked (see MemRegister) func MemcpyHtoD(dst DevicePtr, src unsafe.Pointer, bytes int64) Copies a number of bytes from host to device.
func MemcpyHtoDAsync(dst DevicePtr, src unsafe.Pointer, bytes int64, stream Stream) Asynchronously copies a number of bytes from host to device. The host memory must be page-locked (see MemRegister) func MemcpyPeer(dst DevicePtr, dstCtx Context, src DevicePtr, srcCtx Context, bytes int64) Copies from device memory in one context (device) to another. func MemcpyPeerAsync(dst DevicePtr, dstCtx Context, src DevicePtr, srcCtx Context, bytes int64, stream Stream) Asynchronously copies from device memory in one context (device) to another. func MemsetD32(deviceptr DevicePtr, value uint32, N int64) Sets the first N 32-bit values of dst array to value. Asynchronous. func MemsetD32Async(deviceptr DevicePtr, value uint32, N int64, stream Stream) Asynchronously sets the first N 32-bit values of dst array to value. func MemsetD8(deviceptr DevicePtr, value uint8, N int64) Sets the first N 8-bit values of dst array to value. Asynchronous. func MemsetD8Async(deviceptr DevicePtr, value uint8, N int64, stream Stream) Asynchronously sets the first N 8-bit values of dst array to value. func StreamDestroy(stream *Stream) Destroys an asynchronous stream func StreamSynchronize(stream Stream) Blocks until the stream has completed. func Version() int Returns the CUDA driver version. TYPES type Context uintptr CUDA context. func CtxCreate(flags uint, dev Device) Context Create a CUDA context. func CtxGetCurrent() Context Gets the current active context. func (ctx Context) ApiVersion() (version int) Returns the API version to create the context. func (ctx *Context) Destroy() Destroys the CUDA context. func (peer Context) DisablePeerAccess() Reverses EnablePeerAccess(). func (peer Context) EnablePeerAccess() Make allocations from the peer Context available to the current context. func (ctx Context) SetCurrent() Sets the current active context.
type DevProp struct { MaxThreadsPerBlock int MaxThreadsDim [3]int MaxGridSize [3]int SharedMemPerBlock int TotalConstantMemory int SIMDWidth int MemPitch int RegsPerBlock int ClockRate int TextureAlign int } Device properties func DeviceGetProperties(dev Device) (prop DevProp) Returns the device's properties. type Device int CUDA Device number. func CtxGetDevice() Device Returns the ordinal of the current context's device. func DeviceGet(ordinal int) Device Returns in a device handle given an ordinal in the range [0, DeviceGetCount()-1]. func (dev Device) Attribute(attrib DeviceAttribute) int Gets the value of a device attribute. func (dev Device) CanAccessPeer(peer Device) bool Returns true if CtxEnablePeerAccess can be called on a context for dev and peerDev. func (device Device) ComputeCapability() (major, minor int) Returns the compute capability of the device. func (dev Device) Name() string Gets the name of the device. func (dev Device) Properties() DevProp Returns the device's properties. func (device Device) TotalMem() int64 Returns the total amount of memory available on the device in bytes. 
type DeviceAttribute int const ( MAX_THREADS_PER_BLOCK DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK // Maximum number of threads per block MAX_BLOCK_DIM_X DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X // Maximum block dimension X MAX_BLOCK_DIM_Y DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y // Maximum block dimension Y MAX_BLOCK_DIM_Z DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z // Maximum block dimension Z MAX_GRID_DIM_X DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X // Maximum grid dimension X MAX_GRID_DIM_Y DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y // Maximum grid dimension Y MAX_GRID_DIM_Z DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z // Maximum grid dimension Z MAX_SHARED_MEMORY_PER_BLOCK DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK // Maximum shared memory available per block in bytes TOTAL_CONSTANT_MEMORY DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY // Memory available on device for __constant__ variables in a CUDA C kernel in bytes WARP_SIZE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_WARP_SIZE // Warp size in threads MAX_PITCH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_PITCH // Maximum pitch in bytes allowed by memory copies MAX_REGISTERS_PER_BLOCK DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK // Maximum number of 32-bit registers available per block CLOCK_RATE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CLOCK_RATE // Peak clock frequency in kilohertz TEXTURE_ALIGNMENT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT // Alignment requirement for textures MULTIPROCESSOR_COUNT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT // Number of multiprocessors on device KERNEL_EXEC_TIMEOUT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT // Specifies whether there is a run time limit on kernels INTEGRATED DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_INTEGRATED // Device is integrated with host memory 
CAN_MAP_HOST_MEMORY DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY // Device can map host memory into CUDA address space COMPUTE_MODE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_COMPUTE_MODE // Compute mode (See ::CUcomputemode for details) MAXIMUM_TEXTURE1D_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH // Maximum 1D texture width MAXIMUM_TEXTURE2D_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH // Maximum 2D texture width MAXIMUM_TEXTURE2D_HEIGHT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT // Maximum 2D texture height MAXIMUM_TEXTURE3D_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH // Maximum 3D texture width MAXIMUM_TEXTURE3D_HEIGHT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT // Maximum 3D texture height MAXIMUM_TEXTURE3D_DEPTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH // Maximum 3D texture depth MAXIMUM_TEXTURE2D_LAYERED_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH // Maximum 2D layered texture width MAXIMUM_TEXTURE2D_LAYERED_HEIGHT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT // Maximum 2D layered texture height MAXIMUM_TEXTURE2D_LAYERED_LAYERS DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS // Maximum layers in a 2D layered texture SURFACE_ALIGNMENT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT // Alignment requirement for surfaces CONCURRENT_KERNELS DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS // Device can possibly execute multiple kernels concurrently ECC_ENABLED DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_ECC_ENABLED // Device has ECC support enabled PCI_BUS_ID DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_PCI_BUS_ID // PCI bus ID of the device PCI_DEVICE_ID DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID // PCI device ID of the device TCC_DRIVER DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TCC_DRIVER // Device is 
using TCC driver model MEMORY_CLOCK_RATE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE // Peak memory clock frequency in kilohertz GLOBAL_MEMORY_BUS_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH // Global memory bus width in bits L2_CACHE_SIZE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE // Size of L2 cache in bytes MAX_THREADS_PER_MULTIPROCESSOR DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR // Maximum resident threads per multiprocessor ASYNC_ENGINE_COUNT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT // Number of asynchronous engines UNIFIED_ADDRESSING DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING // Device uses shares a unified address space with the host MAXIMUM_TEXTURE1D_LAYERED_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH // Maximum 1D layered texture width MAXIMUM_TEXTURE1D_LAYERED_LAYERS DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS // Maximum layers in a 1D layered texture ) type DevicePtr uintptr func MemAlloc(bytes int64) DevicePtr Allocates a number of bytes of device memory. func (ptr DevicePtr) Bytes() (bytes int64) Returns the size of the allocation (by MemAlloc) that contains the input pointer ptr. func (ptr *DevicePtr) Free() Frees device memory allocated by MemAlloc(). Overwrites the pointer with NULL. It is safe to double-free. func (ptr DevicePtr) GetAddressRange() (bytes int64, base DevicePtr) Returns the base address and size of the allocation (by MemAlloc) that contains the input pointer ptr. func (ptr DevicePtr) MemoryType() MemoryType Returns the physical memory type that ptr addresses. func (p DevicePtr) String() string type Dim3 struct { X, Y, Z int } type Function uintptr Represents a CUDA CUfunction, a reference to a function within a module. func ModuleGetFunction(module Module, name string) Function Returns a Function handle. 
func (f Function) GetAttribute(attrib FunctionAttribute) int type FunctionAttribute int const ( FUNC_A_MAX_THREADS_PER_BLOCK FunctionAttribute = C.CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK // The maximum number of threads per block, beyond which a launch of the function would fail. FUNC_A_SHARED_SIZE_BYTES FunctionAttribute = C.CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES // The size in bytes of statically-allocated shared memory required by this function. FUNC_A_CONST_SIZE_BYTES FunctionAttribute = C.CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES // The size in bytes of user-allocated constant memory required by this function. FUNC_A_LOCAL_SIZE_BYTES FunctionAttribute = C.CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES // The size in bytes of local memory used by each thread of this function. FUNC_A_NUM_REGS FunctionAttribute = C.CU_FUNC_ATTRIBUTE_NUM_REGS // The number of registers used by each thread of this function. FUNC_A_PTX_VERSION FunctionAttribute = C.CU_FUNC_ATTRIBUTE_PTX_VERSION // The PTX virtual architecture version for which the function was compiled. FUNC_A_BINARY_VERSION FunctionAttribute = C.CU_FUNC_ATTRIBUTE_BINARY_VERSION // The binary architecture version for which the function was compiled. ) type MemHostRegisterFlag int const ( // Memory is pinned in all CUDA contexts. MEMHOSTREGISTER_PORTABLE MemHostRegisterFlag = C.CU_MEMHOSTREGISTER_PORTABLE // Maps the allocation in CUDA address space. TODO(a): cuMemHostGetDevicePointer() MEMHOSTREGISTER_DEVICEMAP MemHostRegisterFlag = C.CU_MEMHOSTREGISTER_DEVICEMAP ) Flag for MemHostRegister type MemoryType uint const ( MemoryTypeHost MemoryType = C.CU_MEMORYTYPE_HOST MemoryTypeDevice MemoryType = C.CU_MEMORYTYPE_DEVICE MemoryTypeArray MemoryType = C.CU_MEMORYTYPE_ARRAY MemoryTypeUnified MemoryType = C.CU_MEMORYTYPE_UNIFIED ) func PointerGetAttributeMemoryType(ptr DevicePtr) (t MemoryType, err Result) Returns the physical memory type that ptr addresses. 
func (t MemoryType) String() string type Module uintptr Represents a CUDA CUmodule, a reference to executable device code. func ModuleLoad(fname string) Module Loads a compute module from file func ModuleLoadData(image string) Module Loads a compute module from string func (m Module) GetFunction(name string) Function Returns a Function handle. type Result int CUDA error status. CUDA error statuses are not returned by functions but checked and passed to panic() when not successful. If desired, they can be caught by recover(). const ( SUCCESS Result = C.CUDA_SUCCESS ERROR_INVALID_VALUE Result = C.CUDA_ERROR_INVALID_VALUE ERROR_OUT_OF_MEMORY Result = C.CUDA_ERROR_OUT_OF_MEMORY ERROR_NOT_INITIALIZED Result = C.CUDA_ERROR_NOT_INITIALIZED ERROR_DEINITIALIZED Result = C.CUDA_ERROR_DEINITIALIZED ERROR_PROFILER_DISABLED Result = C.CUDA_ERROR_PROFILER_DISABLED ERROR_PROFILER_NOT_INITIALIZED Result = C.CUDA_ERROR_PROFILER_NOT_INITIALIZED ERROR_PROFILER_ALREADY_STARTED Result = C.CUDA_ERROR_PROFILER_ALREADY_STARTED ERROR_PROFILER_ALREADY_STOPPED Result = C.CUDA_ERROR_PROFILER_ALREADY_STOPPED ERROR_NO_DEVICE Result = C.CUDA_ERROR_NO_DEVICE ERROR_INVALID_DEVICE Result = C.CUDA_ERROR_INVALID_DEVICE ERROR_INVALID_IMAGE Result = C.CUDA_ERROR_INVALID_IMAGE ERROR_INVALID_CONTEXT Result = C.CUDA_ERROR_INVALID_CONTEXT ERROR_CONTEXT_ALREADY_CURRENT Result = C.CUDA_ERROR_CONTEXT_ALREADY_CURRENT ERROR_MAP_FAILED Result = C.CUDA_ERROR_MAP_FAILED ERROR_UNMAP_FAILED Result = C.CUDA_ERROR_UNMAP_FAILED ERROR_ARRAY_IS_MAPPED Result = C.CUDA_ERROR_ARRAY_IS_MAPPED ERROR_ALREADY_MAPPED Result = C.CUDA_ERROR_ALREADY_MAPPED ERROR_NO_BINARY_FOR_GPU Result = C.CUDA_ERROR_NO_BINARY_FOR_GPU ERROR_ALREADY_ACQUIRED Result = C.CUDA_ERROR_ALREADY_ACQUIRED ERROR_NOT_MAPPED Result = C.CUDA_ERROR_NOT_MAPPED ERROR_NOT_MAPPED_AS_ARRAY Result = C.CUDA_ERROR_NOT_MAPPED_AS_ARRAY ERROR_NOT_MAPPED_AS_POINTER Result = C.CUDA_ERROR_NOT_MAPPED_AS_POINTER ERROR_ECC_UNCORRECTABLE Result = C.CUDA_ERROR_ECC_UNCORRECTABLE 
ERROR_UNSUPPORTED_LIMIT Result = C.CUDA_ERROR_UNSUPPORTED_LIMIT ERROR_CONTEXT_ALREADY_IN_USE Result = C.CUDA_ERROR_CONTEXT_ALREADY_IN_USE ERROR_INVALID_SOURCE Result = C.CUDA_ERROR_INVALID_SOURCE ERROR_FILE_NOT_FOUND Result = C.CUDA_ERROR_FILE_NOT_FOUND ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND Result = C.CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND ERROR_SHARED_OBJECT_INIT_FAILED Result = C.CUDA_ERROR_SHARED_OBJECT_INIT_FAILED ERROR_OPERATING_SYSTEM Result = C.CUDA_ERROR_OPERATING_SYSTEM ERROR_INVALID_HANDLE Result = C.CUDA_ERROR_INVALID_HANDLE ERROR_NOT_FOUND Result = C.CUDA_ERROR_NOT_FOUND ERROR_NOT_READY Result = C.CUDA_ERROR_NOT_READY ERROR_LAUNCH_FAILED Result = C.CUDA_ERROR_LAUNCH_FAILED ERROR_LAUNCH_OUT_OF_RESOURCES Result = C.CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES ERROR_LAUNCH_TIMEOUT Result = C.CUDA_ERROR_LAUNCH_TIMEOUT ERROR_LAUNCH_INCOMPATIBLE_TEXTURING Result = C.CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING ERROR_PEER_ACCESS_ALREADY_ENABLED Result = C.CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED ERROR_PEER_ACCESS_NOT_ENABLED Result = C.CUDA_ERROR_PEER_ACCESS_NOT_ENABLED ERROR_PRIMARY_CONTEXT_ACTIVE Result = C.CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE ERROR_CONTEXT_IS_DESTROYED Result = C.CUDA_ERROR_CONTEXT_IS_DESTROYED ERROR_ASSERT Result = C.CUDA_ERROR_ASSERT ERROR_TOO_MANY_PEERS Result = C.CUDA_ERROR_TOO_MANY_PEERS ERROR_HOST_MEMORY_ALREADY_REGISTERED Result = C.CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED ERROR_HOST_MEMORY_NOT_REGISTERED Result = C.CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED ERROR_UNKNOWN Result = C.CUDA_ERROR_UNKNOWN ) func StreamQuery(stream Stream) Result Returns Success if all operations have completed, ErrorNotReady otherwise func (err Result) String() string Message string for the error type Stream uintptr CUDA stream. 
func StreamCreate() Stream Creates an asynchronous stream func (stream *Stream) Destroy() Destroys the asynchronous stream func (stream Stream) Query() Result Returns Success if all operations have completed, ErrorNotReady otherwise func (stream Stream) Synchronize() Blocks until the stream has completed. ================================================ FILE: cu/cgoflags.go ================================================ package cu // This file provides CGO flags to find CUDA libraries and headers. //#cgo LDFLAGS:-lcuda -lcudart // ////default location: //#cgo LDFLAGS:-L/usr/local/cuda/lib64 -L/usr/local/cuda/lib //#cgo CFLAGS: -I/usr/local/cuda/include/ // ////default location if not properly symlinked: //#cgo LDFLAGS:-L/usr/local/cuda-6.0/lib64 -L/usr/local/cuda-6.0/lib //#cgo LDFLAGS:-L/usr/local/cuda-5.5/lib64 -L/usr/local/cuda-5.5/lib //#cgo LDFLAGS:-L/usr/local/cuda-5.0/lib64 -L/usr/local/cuda-5.0/lib //#cgo CFLAGS: -I/usr/local/cuda-6.0/include/ //#cgo CFLAGS: -I/usr/local/cuda-5.5/include/ //#cgo CFLAGS: -I/usr/local/cuda-5.0/include/ // ////arch linux: //#cgo LDFLAGS:-L/opt/cuda/lib64 -L/opt/cuda/lib //#cgo CFLAGS: -I/opt/cuda/include // ////WINDOWS: //#cgo windows LDFLAGS:-LC:/cuda/v5.0/lib/x64 -LC:/cuda/v5.5/lib/x64 -LC:/cuda/v6.0/lib/x64 //#cgo windows CFLAGS: -IC:/cuda/v5.0/include -IC:/cuda/v5.5/include -IC:/cuda/v6.0/include import "C" ================================================ FILE: cu/context.go ================================================ package cu // This file implements CUDA driver context management //#include import "C" import "unsafe" // CUDA context. type Context uintptr // Create a CUDA context. func CtxCreate(flags uint, dev Device) Context { var ctx C.CUcontext err := Result(C.cuCtxCreate(&ctx, C.uint(flags), C.CUdevice(dev))) if err != SUCCESS { panic(err) } return Context(uintptr(unsafe.Pointer(ctx))) } //Destroys the CUDA context specified by ctx. 
If the context usage count is not equal to 1, or the context is current to any CPU thread other than the current one, this function fails. Floating contexts (detached from a CPU thread via cuCtxPopCurrent()) may be destroyed by this function. func CtxDestroy(ctx *Context) { err := Result(C.cuCtxDestroy(C.CUcontext(unsafe.Pointer(uintptr(*ctx))))) *ctx = 0 if err != SUCCESS { panic(err) } } //Destroys the CUDA context. func (ctx *Context) Destroy() { CtxDestroy(ctx) } // Returns the API version to create the context. func CtxGetApiVersion(ctx Context) (version int) { var cversion C.uint err := Result(C.cuCtxGetApiVersion(C.CUcontext(unsafe.Pointer(uintptr(ctx))), &cversion)) if err != SUCCESS { panic(err) } version = int(cversion) return } // Returns the API version to create the context. func (ctx Context) ApiVersion() (version int) { return CtxGetApiVersion(ctx) } // Gets the current active context. func CtxGetCurrent() Context { var ctx C.CUcontext err := Result(C.cuCtxGetCurrent(&ctx)) if err != SUCCESS { panic(err) } return Context(uintptr(unsafe.Pointer(ctx))) } // Returns the ordinal of the current context's device. func CtxGetDevice() Device { var dev C.CUdevice err := Result(C.cuCtxGetDevice(&dev)) if err != SUCCESS { panic(err) } return Device(dev) } // Sets the current active context. func CtxSetCurrent(ctx Context) { err := Result(C.cuCtxSetCurrent(C.CUcontext(unsafe.Pointer(uintptr(ctx))))) if err != SUCCESS { panic(err) } } // Sets the current active context. func (ctx Context) SetCurrent() { CtxSetCurrent(ctx) } // Blocks until the device has completed all preceding requested tasks, if the context was created with the CU_CTX_SCHED_BLOCKING_SYNC flag. func CtxSynchronize() { err := Result(C.cuCtxSynchronize()) if err != SUCCESS { panic(err) } } // Flags for CtxCreate const ( // If the number of contexts > number of CPUs, yield to other OS threads when waiting for the GPU, otherwise CUDA spin on the processor. 
CTX_SCHED_AUTO = C.CU_CTX_SCHED_AUTO // Spin when waiting for results from the GPU. CTX_SCHED_SPIN = C.CU_CTX_SCHED_SPIN // Yield its thread when waiting for results from the GPU. CTX_SCHED_YIELD = C.CU_CTX_SCHED_YIELD // Bock the CPU thread on a synchronization primitive when waiting for the GPU to finish work. CTX_BLOCKING_SYNC // Support mapped pinned allocations. This flag must be set in order to allocate pinned host memory that is accessible to the GPU. CTX_MAP_HOST = C.CU_CTX_MAP_HOST //Do not reduce local memory after resizing local memory for a kernel. CTX_LMEM_RESIZE_TO_MAX = C.CU_CTX_LMEM_RESIZE_TO_MAX ) ================================================ FILE: cu/context_test.go ================================================ package cu import ( "fmt" "testing" ) func TestContext(t *testing.T) { fmt.Println("CtxCreate") ctx := CtxCreate(CTX_SCHED_AUTO, 0) fmt.Println("CtxSetCurrent") CtxSetCurrent(ctx) fmt.Println("CtxGetApiVersion:", ctx.ApiVersion()) fmt.Println("CtxGetDevice:", CtxGetDevice()) (&ctx).Destroy() } func BenchmarkGetContext(b *testing.B) { b.StopTimer() ctx := CtxCreate(CTX_SCHED_AUTO, 0) CtxSetCurrent(ctx) b.StartTimer() for i := 0; i < b.N; i++ { CtxGetCurrent() } } func BenchmarkSetContext(b *testing.B) { b.StopTimer() ctx := CtxCreate(CTX_SCHED_AUTO, 0) b.StartTimer() for i := 0; i < b.N; i++ { ctx.SetCurrent() } } ================================================ FILE: cu/device.go ================================================ package cu // This file implements CUDA driver device management //#include import "C" import () // CUDA Device number. type Device int // Returns the compute capability of the device. func DeviceComputeCapability(device Device) (major, minor int) { var maj, min C.int err := Result(C.cuDeviceComputeCapability(&maj, &min, C.CUdevice(device))) if err != SUCCESS { panic(err) } major = int(maj) minor = int(min) return } // Returns the compute capability of the device. 
func (device Device) ComputeCapability() (major, minor int) { return DeviceComputeCapability(device) } // Returns in a device handle given an ordinal in the range [0, DeviceGetCount()-1]. func DeviceGet(ordinal int) Device { var device C.CUdevice err := Result(C.cuDeviceGet(&device, C.int(ordinal))) if err != SUCCESS { panic(err) } return Device(device) } // Gets the value of a device attribute. func DeviceGetAttribute(attrib DeviceAttribute, dev Device) int { var attr C.int err := Result(C.cuDeviceGetAttribute(&attr, C.CUdevice_attribute(attrib), C.CUdevice(dev))) if err != SUCCESS { panic(err) } return int(attr) } // Gets the value of a device attribute. func (dev Device) Attribute(attrib DeviceAttribute) int { return DeviceGetAttribute(attrib, dev) } // Returns the number of devices with compute capability greater than or equal to 1.0 that are available for execution. func DeviceGetCount() int { var count C.int err := Result(C.cuDeviceGetCount(&count)) if err != SUCCESS { panic(err) } return int(count) } // Gets the name of the device. func DeviceGetName(dev Device) string { size := 256 buf := make([]byte, size) cstr := C.CString(string(buf)) err := Result(C.cuDeviceGetName(cstr, C.int(size), C.CUdevice(dev))) if err != SUCCESS { panic(err) } return C.GoString(cstr) } // Gets the name of the device. func (dev Device) Name() string { return DeviceGetName(dev) } // Device properties type DevProp struct { MaxThreadsPerBlock int MaxThreadsDim [3]int MaxGridSize [3]int SharedMemPerBlock int TotalConstantMemory int SIMDWidth int MemPitch int RegsPerBlock int ClockRate int TextureAlign int } // Returns the device's properties. 
func DeviceGetProperties(dev Device) (prop DevProp) { var cprop C.CUdevprop err := Result(C.cuDeviceGetProperties(&cprop, C.CUdevice(dev))) if err != SUCCESS { panic(err) } prop.MaxThreadsPerBlock = int(cprop.maxThreadsPerBlock) prop.MaxThreadsDim[0] = int(cprop.maxThreadsDim[0]) prop.MaxThreadsDim[1] = int(cprop.maxThreadsDim[1]) prop.MaxThreadsDim[2] = int(cprop.maxThreadsDim[2]) prop.MaxGridSize[0] = int(cprop.maxGridSize[0]) prop.MaxGridSize[1] = int(cprop.maxGridSize[1]) prop.MaxGridSize[2] = int(cprop.maxGridSize[2]) prop.SharedMemPerBlock = int(cprop.sharedMemPerBlock) prop.TotalConstantMemory = int(cprop.totalConstantMemory) prop.SIMDWidth = int(cprop.SIMDWidth) prop.MemPitch = int(cprop.memPitch) prop.RegsPerBlock = int(cprop.regsPerBlock) prop.ClockRate = int(cprop.clockRate) prop.TextureAlign = int(cprop.textureAlign) return } // Returns the device's properties. func (dev Device) Properties() DevProp { return DeviceGetProperties(dev) } // Returns the total amount of memory available on the device in bytes. func (device Device) TotalMem() int64 { return DeviceTotalMem(device) } // Returns the total amount of memory available on the device in bytes. 
func DeviceTotalMem(device Device) int64 {
	var bytes C.size_t
	err := Result(C.cuDeviceTotalMem(&bytes, C.CUdevice(device)))
	if err != SUCCESS {
		panic(err)
	}
	return int64(bytes)
}

// DeviceAttribute is a query key for DeviceGetAttribute, mirroring CUdevice_attribute.
type DeviceAttribute int

const (
	MAX_THREADS_PER_BLOCK            DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK            // Maximum number of threads per block
	MAX_BLOCK_DIM_X                  DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X                  // Maximum block dimension X
	MAX_BLOCK_DIM_Y                  DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y                  // Maximum block dimension Y
	MAX_BLOCK_DIM_Z                  DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z                  // Maximum block dimension Z
	MAX_GRID_DIM_X                   DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X                   // Maximum grid dimension X
	MAX_GRID_DIM_Y                   DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y                   // Maximum grid dimension Y
	MAX_GRID_DIM_Z                   DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z                   // Maximum grid dimension Z
	MAX_SHARED_MEMORY_PER_BLOCK      DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK      // Maximum shared memory available per block in bytes
	TOTAL_CONSTANT_MEMORY            DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY            // Memory available on device for __constant__ variables in a CUDA C kernel in bytes
	WARP_SIZE                        DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_WARP_SIZE                        // Warp size in threads
	MAX_PITCH                        DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_PITCH                        // Maximum pitch in bytes allowed by memory copies
	MAX_REGISTERS_PER_BLOCK          DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK          // Maximum number of 32-bit registers available per block
	CLOCK_RATE                       DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CLOCK_RATE                       // Peak clock frequency in kilohertz
	TEXTURE_ALIGNMENT                DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT                // Alignment requirement for textures
	MULTIPROCESSOR_COUNT             DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT             // Number of multiprocessors on device
	KERNEL_EXEC_TIMEOUT              DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT              // Specifies whether there is a run time limit on kernels
	INTEGRATED                       DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_INTEGRATED                       // Device is integrated with host memory
	CAN_MAP_HOST_MEMORY              DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY              // Device can map host memory into CUDA address space
	COMPUTE_MODE                     DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_COMPUTE_MODE                     // Compute mode (See ::CUcomputemode for details)
	MAXIMUM_TEXTURE1D_WIDTH          DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH          // Maximum 1D texture width
	MAXIMUM_TEXTURE2D_WIDTH          DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH          // Maximum 2D texture width
	MAXIMUM_TEXTURE2D_HEIGHT         DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT         // Maximum 2D texture height
	MAXIMUM_TEXTURE3D_WIDTH          DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH          // Maximum 3D texture width
	MAXIMUM_TEXTURE3D_HEIGHT         DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT         // Maximum 3D texture height
	MAXIMUM_TEXTURE3D_DEPTH          DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH          // Maximum 3D texture depth
	MAXIMUM_TEXTURE2D_LAYERED_WIDTH  DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH  // Maximum 2D layered texture width
	MAXIMUM_TEXTURE2D_LAYERED_HEIGHT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT // Maximum 2D layered texture height
	MAXIMUM_TEXTURE2D_LAYERED_LAYERS DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS // Maximum layers in a 2D layered texture
	SURFACE_ALIGNMENT                DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT                // Alignment requirement for surfaces
	CONCURRENT_KERNELS               DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS               // Device can possibly execute multiple kernels concurrently
	ECC_ENABLED                      DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_ECC_ENABLED                      // Device has ECC support enabled
	PCI_BUS_ID                       DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_PCI_BUS_ID                       // PCI bus ID of the device
	PCI_DEVICE_ID                    DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID                    // PCI device ID of the device
	TCC_DRIVER                       DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TCC_DRIVER                       // Device is using TCC driver model
	MEMORY_CLOCK_RATE                DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE                // Peak memory clock frequency in kilohertz
	GLOBAL_MEMORY_BUS_WIDTH          DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH          // Global memory bus width in bits
	L2_CACHE_SIZE                    DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE                    // Size of L2 cache in bytes
	MAX_THREADS_PER_MULTIPROCESSOR   DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR   // Maximum resident threads per multiprocessor
	ASYNC_ENGINE_COUNT               DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT               // Number of asynchronous engines
	UNIFIED_ADDRESSING               DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING               // Device uses shares a unified address space with the host
	MAXIMUM_TEXTURE1D_LAYERED_WIDTH  DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH  // Maximum 1D layered texture width
	MAXIMUM_TEXTURE1D_LAYERED_LAYERS DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS // Maximum layers in a 1D layered texture
)

================================================ FILE: cu/device_test.go ================================================
package cu

import (
	"fmt"
	"testing"
)

// Enumerates all devices and prints their name, compute capability, total
// memory, every DeviceAttribute, and the DevProp struct. Output-only smoke
// test: it never calls t.Fail.
func TestDevice(t *testing.T) {
	fmt.Println("DeviceGetCount:", DeviceGetCount())
	// Note: DeviceGetCount() is re-evaluated each iteration; cheap, but could be hoisted.
	for i := 0; i < DeviceGetCount(); i++ {
		fmt.Println("DeviceGet", i)
		dev := DeviceGet(i)
		major, minor := dev.ComputeCapability()
		fmt.Println("Name: ", dev.Name())
		fmt.Println("ComputeCapability: ", major, minor)
		fmt.Println("TotalMem: ", dev.TotalMem())
		fmt.Println("ATTRIBUTE_MAX_THREADS_PER_BLOCK :", dev.Attribute(MAX_THREADS_PER_BLOCK))
		fmt.Println("ATTRIBUTE_MAX_BLOCK_DIM_X :", dev.Attribute(MAX_BLOCK_DIM_X))
		fmt.Println("ATTRIBUTE_MAX_BLOCK_DIM_Y :", dev.Attribute(MAX_BLOCK_DIM_Y))
		fmt.Println("ATTRIBUTE_MAX_BLOCK_DIM_Z :", dev.Attribute(MAX_BLOCK_DIM_Z))
		fmt.Println("ATTRIBUTE_MAX_GRID_DIM_X :", dev.Attribute(MAX_GRID_DIM_X))
		fmt.Println("ATTRIBUTE_MAX_GRID_DIM_Y :", dev.Attribute(MAX_GRID_DIM_Y))
		fmt.Println("ATTRIBUTE_MAX_GRID_DIM_Z :", dev.Attribute(MAX_GRID_DIM_Z))
		fmt.Println("ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK :", dev.Attribute(MAX_SHARED_MEMORY_PER_BLOCK))
		fmt.Println("ATTRIBUTE_TOTAL_CONSTANT_MEMORY :", dev.Attribute(TOTAL_CONSTANT_MEMORY))
		fmt.Println("ATTRIBUTE_WARP_SIZE :", dev.Attribute(WARP_SIZE))
		fmt.Println("ATTRIBUTE_MAX_PITCH :", dev.Attribute(MAX_PITCH))
		fmt.Println("ATTRIBUTE_MAX_REGISTERS_PER_BLOCK :", dev.Attribute(MAX_REGISTERS_PER_BLOCK))
		fmt.Println("ATTRIBUTE_CLOCK_RATE :", dev.Attribute(CLOCK_RATE))
		fmt.Println("ATTRIBUTE_TEXTURE_ALIGNMENT :", dev.Attribute(TEXTURE_ALIGNMENT))
		fmt.Println("ATTRIBUTE_MULTIPROCESSOR_COUNT :", dev.Attribute(MULTIPROCESSOR_COUNT))
		fmt.Println("ATTRIBUTE_KERNEL_EXEC_TIMEOUT :", dev.Attribute(KERNEL_EXEC_TIMEOUT))
		fmt.Println("ATTRIBUTE_INTEGRATED :", dev.Attribute(INTEGRATED))
		fmt.Println("ATTRIBUTE_CAN_MAP_HOST_MEMORY :", dev.Attribute(CAN_MAP_HOST_MEMORY))
		fmt.Println("ATTRIBUTE_COMPUTE_MODE :", dev.Attribute(COMPUTE_MODE))
		fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH :", dev.Attribute(MAXIMUM_TEXTURE1D_WIDTH))
		fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH :", dev.Attribute(MAXIMUM_TEXTURE2D_WIDTH))
		fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT :", dev.Attribute(MAXIMUM_TEXTURE2D_HEIGHT))
		fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH :", dev.Attribute(MAXIMUM_TEXTURE3D_WIDTH))
		fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT :", dev.Attribute(MAXIMUM_TEXTURE3D_HEIGHT))
		fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH :", dev.Attribute(MAXIMUM_TEXTURE3D_DEPTH))
		fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH :", dev.Attribute(MAXIMUM_TEXTURE2D_LAYERED_WIDTH))
		fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT:", dev.Attribute(MAXIMUM_TEXTURE2D_LAYERED_HEIGHT))
		fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS:", dev.Attribute(MAXIMUM_TEXTURE2D_LAYERED_LAYERS))
		fmt.Println("ATTRIBUTE_SURFACE_ALIGNMENT :", dev.Attribute(SURFACE_ALIGNMENT))
		fmt.Println("ATTRIBUTE_CONCURRENT_KERNELS :", dev.Attribute(CONCURRENT_KERNELS))
		fmt.Println("ATTRIBUTE_ECC_ENABLED :", dev.Attribute(ECC_ENABLED))
		fmt.Println("ATTRIBUTE_PCI_BUS_ID :", dev.Attribute(PCI_BUS_ID))
		fmt.Println("ATTRIBUTE_PCI_DEVICE_ID :", dev.Attribute(PCI_DEVICE_ID))
		fmt.Println("ATTRIBUTE_TCC_DRIVER :", dev.Attribute(TCC_DRIVER))
		fmt.Println("ATTRIBUTE_MEMORY_CLOCK_RATE :", dev.Attribute(MEMORY_CLOCK_RATE))
		fmt.Println("ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH :", dev.Attribute(GLOBAL_MEMORY_BUS_WIDTH))
		fmt.Println("ATTRIBUTE_L2_CACHE_SIZE :", dev.Attribute(L2_CACHE_SIZE))
		fmt.Println("ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR :", dev.Attribute(MAX_THREADS_PER_MULTIPROCESSOR))
		fmt.Println("ATTRIBUTE_ASYNC_ENGINE_COUNT :", dev.Attribute(ASYNC_ENGINE_COUNT))
		fmt.Println("ATTRIBUTE_UNIFIED_ADDRESSING :", dev.Attribute(UNIFIED_ADDRESSING))
		fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH :", dev.Attribute(MAXIMUM_TEXTURE1D_LAYERED_WIDTH))
		fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS:", dev.Attribute(MAXIMUM_TEXTURE1D_LAYERED_LAYERS))
		fmt.Printf("Properties:%#v\n", dev.Properties())
	}
}

================================================ FILE: cu/dim3.go ================================================
package cu

// Dim3 holds a 3-component size (X, Y, Z), e.g. a grid or block dimension.
type Dim3 struct {
	X, Y, Z int
}

================================================ FILE: cu/doc.go ================================================
// Go bindings for the CUDA driver API.
package cu

================================================ FILE: cu/execution.go ================================================
package cu

// This file implements execution of CUDA kernels

//#include
import "C"

import (
	"unsafe"
)

const pointerSize = 8 // sorry, 64 bits only.
// Launches the CUDA function f on the geometry given by gridDim{X,Y,Z} and
// blockDim{X,Y,Z}, with sharedMemBytes of dynamic shared memory, on stream.
// kernelParams holds one pointer per kernel argument value.
// Panics on a CUDA error.
func LaunchKernel(f Function, gridDimX, gridDimY, gridDimZ int, blockDimX, blockDimY, blockDimZ int, sharedMemBytes int, stream Stream, kernelParams []unsafe.Pointer) {
	// Since Go 1.6, a cgo argument cannot have a Go pointer to Go pointer,
	// so we copy the argument values to C memory first.
	argv := C.malloc(C.size_t(len(kernelParams) * pointerSize))
	argp := C.malloc(C.size_t(len(kernelParams) * pointerSize))
	defer C.free(argv)
	defer C.free(argp)
	for i := range kernelParams {
		// NOTE(review): a full 8 bytes are copied from each argument regardless
		// of its actual size; assumes every pointed-to Go value can be read as
		// a uint64 without running past its allocation — confirm with callers.
		*((*unsafe.Pointer)(offset(argp, i))) = offset(argv, i)       // argp[i] = &argv[i]
		*((*uint64)(offset(argv, i))) = *((*uint64)(kernelParams[i])) // argv[i] = *kernelParams[i]
	}
	err := Result(C.cuLaunchKernel(
		C.CUfunction(unsafe.Pointer(uintptr(f))),
		C.uint(gridDimX), C.uint(gridDimY), C.uint(gridDimZ),
		C.uint(blockDimX), C.uint(blockDimY), C.uint(blockDimZ),
		C.uint(sharedMemBytes),
		C.CUstream(unsafe.Pointer(uintptr(stream))),
		(*unsafe.Pointer)(argp),
		(*unsafe.Pointer)(unsafe.Pointer(uintptr(0))))) // extra launch options: none
	if err != SUCCESS {
		panic(err)
	}
}

// Returns ptr advanced by i pointer-sized slots (pointer arithmetic on a C allocation).
func offset(ptr unsafe.Pointer, i int) unsafe.Pointer {
	return unsafe.Pointer(uintptr(ptr) + pointerSize*uintptr(i))
}

================================================ FILE: cu/function.go ================================================
package cu

// This file implements manipulations on CUDA functions

//#include
import "C"

import (
	"unsafe"
)

// Represents a CUDA CUfunction, a reference to a function within a module.
type Function uintptr

// Gets the value of a function attribute (register count, shared memory size, ...).
// Panics on a CUDA error.
func FuncGetAttribute(attrib FunctionAttribute, function Function) int {
	var attr C.int
	err := Result(C.cuFuncGetAttribute(&attr, C.CUfunction_attribute(attrib), C.CUfunction(unsafe.Pointer(uintptr(function)))))
	if err != SUCCESS {
		panic(err)
	}
	return int(attr)
}

// Gets the value of a function attribute.
func (f Function) GetAttribute(attrib FunctionAttribute) int { return FuncGetAttribute(attrib, f) }

// FunctionAttribute is a query key for FuncGetAttribute, mirroring CUfunction_attribute.
type FunctionAttribute int

const (
	FUNC_A_MAX_THREADS_PER_BLOCK FunctionAttribute = C.CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK // The maximum number of threads per block, beyond which a launch of the function would fail.
	FUNC_A_SHARED_SIZE_BYTES     FunctionAttribute = C.CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES     // The size in bytes of statically-allocated shared memory required by this function.
	FUNC_A_CONST_SIZE_BYTES      FunctionAttribute = C.CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES      // The size in bytes of user-allocated constant memory required by this function.
	FUNC_A_LOCAL_SIZE_BYTES      FunctionAttribute = C.CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES      // The size in bytes of local memory used by each thread of this function.
	FUNC_A_NUM_REGS              FunctionAttribute = C.CU_FUNC_ATTRIBUTE_NUM_REGS              // The number of registers used by each thread of this function.
	FUNC_A_PTX_VERSION           FunctionAttribute = C.CU_FUNC_ATTRIBUTE_PTX_VERSION           // The PTX virtual architecture version for which the function was compiled.
	FUNC_A_BINARY_VERSION        FunctionAttribute = C.CU_FUNC_ATTRIBUTE_BINARY_VERSION        // The binary architecture version for which the function was compiled.
)

================================================ FILE: cu/init.go ================================================
package cu

// This file implements CUDA driver initialization

//#include
import "C"

// Initialize the CUDA driver API.
// Currently, flags must be 0.
// If Init() has not been called, any function from the driver API will panic with ERROR_NOT_INITIALIZED.
func Init(flags int) {
	err := Result(C.cuInit(C.uint(flags)))
	if err != SUCCESS {
		panic(err)
	}
}

================================================ FILE: cu/init_test.go ================================================
package cu

import (
	"fmt"
)

// needed for all other tests.
func init() {
	Init(0)
	ctx := CtxCreate(CTX_SCHED_AUTO, 0)
	CtxSetCurrent(ctx)
	fmt.Println("Created CUDA context")
}

================================================ FILE: cu/memory.go ================================================
package cu

// This file implements CUDA memory management on the driver level

//#include
import "C"

import (
	"fmt"
	"unsafe"
)

// DevicePtr is a device memory address (CUdeviceptr).
type DevicePtr uintptr

// Allocates a number of bytes of device memory.
func MemAlloc(bytes int64) DevicePtr {
	var devptr C.CUdeviceptr
	err := Result(C.cuMemAlloc(&devptr, C.size_t(bytes)))
	if err != SUCCESS {
		panic(err)
	}
	return DevicePtr(devptr)
}

// Frees device memory allocated by MemAlloc().
// It is safe to double-free.
func MemFree(p DevicePtr) {
	if p == DevicePtr(uintptr(0)) {
		return // Already freed
	}
	err := Result(C.cuMemFree(C.CUdeviceptr(p)))
	if err != SUCCESS {
		panic(err)
	}
}

// Frees device memory allocated by MemAlloc().
// It is safe to double-free a zero pointer, but note that the value receiver
// means the caller's copy of ptr is NOT overwritten with NULL.
func (ptr DevicePtr) Free() {
	MemFree(ptr)
}

// Copies a number of bytes on the current device.
// Requires unified addressing to be supported.
// See also: MemcpyDtoD().
func Memcpy(dst, src DevicePtr, bytes int64) {
	err := Result(C.cuMemcpy(C.CUdeviceptr(dst), C.CUdeviceptr(src), C.size_t(bytes)))
	if err != SUCCESS {
		panic(err)
	}
}

// Asynchronously copies a number of bytes on the current device.
func MemcpyAsync(dst, src DevicePtr, bytes int64, stream Stream) {
	err := Result(C.cuMemcpyAsync(C.CUdeviceptr(dst), C.CUdeviceptr(src), C.size_t(bytes), C.CUstream(unsafe.Pointer(uintptr(stream)))))
	if err != SUCCESS {
		panic(err)
	}
}

// Copies a number of bytes from device to device.
func MemcpyDtoD(dst, src DevicePtr, bytes int64) {
	err := Result(C.cuMemcpyDtoD(C.CUdeviceptr(dst), C.CUdeviceptr(src), C.size_t(bytes)))
	if err != SUCCESS {
		panic(err)
	}
}

// Asynchronously copies a number of bytes from device to device.
func MemcpyDtoDAsync(dst, src DevicePtr, bytes int64, stream Stream) {
	err := Result(C.cuMemcpyDtoDAsync(C.CUdeviceptr(dst), C.CUdeviceptr(src), C.size_t(bytes), C.CUstream(unsafe.Pointer(uintptr(stream)))))
	if err != SUCCESS {
		panic(err)
	}
}

// Copies a number of bytes from host to device.
func MemcpyHtoD(dst DevicePtr, src unsafe.Pointer, bytes int64) {
	err := Result(C.cuMemcpyHtoD(C.CUdeviceptr(dst), src, C.size_t(bytes)))
	if err != SUCCESS {
		panic(err)
	}
}

// Asynchronously copies a number of bytes from host to device.
// The host memory must be page-locked (see MemRegister)
func MemcpyHtoDAsync(dst DevicePtr, src unsafe.Pointer, bytes int64, stream Stream) {
	err := Result(C.cuMemcpyHtoDAsync(C.CUdeviceptr(dst), src, C.size_t(bytes), C.CUstream(unsafe.Pointer(uintptr(stream)))))
	if err != SUCCESS {
		panic(err)
	}
}

// Copies a number of bytes from device to host.
func MemcpyDtoH(dst unsafe.Pointer, src DevicePtr, bytes int64) {
	err := Result(C.cuMemcpyDtoH(dst, C.CUdeviceptr(src), C.size_t(bytes)))
	if err != SUCCESS {
		panic(err)
	}
}

// Asynchronously copies a number of bytes from device to host.
// The host memory must be page-locked (see MemRegister)
func MemcpyDtoHAsync(dst unsafe.Pointer, src DevicePtr, bytes int64, stream Stream) {
	err := Result(C.cuMemcpyDtoHAsync(dst, C.CUdeviceptr(src), C.size_t(bytes), C.CUstream(unsafe.Pointer(uintptr(stream)))))
	if err != SUCCESS {
		panic(err)
	}
}

// Copies from device memory in one context (device) to another.
func MemcpyPeer(dst DevicePtr, dstCtx Context, src DevicePtr, srcCtx Context, bytes int64) { err := Result(C.cuMemcpyPeer(C.CUdeviceptr(dst), C.CUcontext(unsafe.Pointer(uintptr(dstCtx))), C.CUdeviceptr(src), C.CUcontext(unsafe.Pointer(uintptr(srcCtx))), C.size_t(bytes))) if err != SUCCESS { panic(err) } } // Asynchronously copies from device memory in one context (device) to another. func MemcpyPeerAsync(dst DevicePtr, dstCtx Context, src DevicePtr, srcCtx Context, bytes int64, stream Stream) { err := Result(C.cuMemcpyPeerAsync(C.CUdeviceptr(dst), C.CUcontext(unsafe.Pointer(uintptr(dstCtx))), C.CUdeviceptr(src), C.CUcontext(unsafe.Pointer(uintptr(srcCtx))), C.size_t(bytes), C.CUstream(unsafe.Pointer(uintptr(stream))))) if err != SUCCESS { panic(err) } } // Returns the base address and size of the allocation (by MemAlloc) that contains the input pointer ptr. func MemGetAddressRange(ptr DevicePtr) (bytes int64, base DevicePtr) { var cbytes C.size_t var cptr C.CUdeviceptr err := Result(C.cuMemGetAddressRange(&cptr, &cbytes, C.CUdeviceptr(ptr))) if err != SUCCESS { panic(err) } bytes = int64(cbytes) base = DevicePtr(cptr) return } // Returns the base address and size of the allocation (by MemAlloc) that contains the input pointer ptr. func (ptr DevicePtr) GetAddressRange() (bytes int64, base DevicePtr) { return MemGetAddressRange(ptr) } // Returns the size of the allocation (by MemAlloc) that contains the input pointer ptr. func (ptr DevicePtr) Bytes() (bytes int64) { bytes, _ = MemGetAddressRange(ptr) return } // Returns the free and total amount of memroy in the current Context (in bytes). func MemGetInfo() (free, total int64) { var cfree, ctotal C.size_t err := Result(C.cuMemGetInfo(&cfree, &ctotal)) if err != SUCCESS { panic(err) } free = int64(cfree) total = int64(ctotal) return } // Page-locks memory specified by the pointer and bytes. 
// The pointer and byte size must be aligned to the host page size (4KB)
// See also: MemHostUnregister()
// doesn't link with cuda6.5
//func MemHostRegister(ptr unsafe.Pointer, bytes int64, flags MemHostRegisterFlag) {
//	err := Result(C.cuMemHostRegister(ptr, C.size_t(bytes), C.uint(flags)))
//	if err != SUCCESS {
//		panic(err)
//	}
//}

// Unmaps memory locked by MemHostRegister().
// doesn't link with cuda6.5
//func MemHostUnregister(ptr unsafe.Pointer) {
//	err := Result(C.cuMemHostUnregister(ptr))
//	if err != SUCCESS {
//		panic(err)
//	}
//}

// Allocates page-locked host memory; free with MemFreeHost.
// Panics on a CUDA error.
func MemAllocHost(bytes int64) unsafe.Pointer {
	var p unsafe.Pointer
	err := Result(C.cuMemAllocHost(&p, C.size_t(bytes)))
	if err != SUCCESS {
		panic(err)
	}
	return p
}

// Frees host memory allocated by MemAllocHost.
func MemFreeHost(ptr unsafe.Pointer) {
	err := Result(C.cuMemFreeHost(ptr))
	if err != SUCCESS {
		panic(err)
	}
}

// MemHostRegisterFlag controls how host memory is pinned.
type MemHostRegisterFlag int

// Flag for MemHostRegister
const (
	// Memory is pinned in all CUDA contexts.
	MEMHOSTREGISTER_PORTABLE MemHostRegisterFlag = C.CU_MEMHOSTREGISTER_PORTABLE
	// Maps the allocation in CUDA address space. TODO(a): cuMemHostGetDevicePointer()
	MEMHOSTREGISTER_DEVICEMAP MemHostRegisterFlag = C.CU_MEMHOSTREGISTER_DEVICEMAP
)

// String formats the device pointer like an ordinary pointer value.
func (p DevicePtr) String() string {
	return fmt.Sprint(unsafe.Pointer(uintptr(p)))
}

// Type size in bytes
const (
	SIZEOF_FLOAT32    = 4
	SIZEOF_FLOAT64    = 8
	SIZEOF_COMPLEX64  = 8
	SIZEOF_COMPLEX128 = 16
)

// Physical memory type of device pointer.
type MemoryType uint

const (
	MemoryTypeHost    MemoryType = C.CU_MEMORYTYPE_HOST
	MemoryTypeDevice  MemoryType = C.CU_MEMORYTYPE_DEVICE
	MemoryTypeArray   MemoryType = C.CU_MEMORYTYPE_ARRAY
	MemoryTypeUnified MemoryType = C.CU_MEMORYTYPE_UNIFIED
)

// Human-readable names for the MemoryType values.
var memorytype = map[MemoryType]string{
	MemoryTypeHost:    "MemoryTypeHost",
	MemoryTypeDevice:  "MemoryTypeDevice",
	MemoryTypeArray:   "MemoryTypeArray",
	MemoryTypeUnified: "MemoryTypeUnified"}

func (t MemoryType) String() string {
	if s, ok := memorytype[t]; ok {
		return s
	}
	return "MemoryTypeUnknown"
}

// Returns the physical memory type that ptr addresses.
// Unlike most wrappers in this package, the Result is returned, not panicked on.
func PointerGetAttributeMemoryType(ptr DevicePtr) (t MemoryType, err Result) {
	var typ uint64 // foresee enough memory just to be safe
	err = Result(C.cuPointerGetAttribute(unsafe.Pointer(&typ), C.CU_POINTER_ATTRIBUTE_MEMORY_TYPE, C.CUdeviceptr(uintptr(ptr))))
	return MemoryType(uint(typ)), err
}

// Returns the physical memory type that ptr addresses.
func (ptr DevicePtr) MemoryType() MemoryType {
	t, err := PointerGetAttributeMemoryType(ptr)
	if err != SUCCESS {
		panic(err)
	}
	return t
}

================================================ FILE: cu/memory_test.go ================================================
package cu

import (
	"fmt"
	"math"
	"testing"
	"unsafe"
)

// Repeatedly allocates and frees 16MB blocks, via both the method and the
// free function; exercises the allocator for leaks/crashes.
func TestMalloc(t *testing.T) {
	for i := 0; i < 1024; i++ {
		pointer := MemAlloc(16 * 1024 * 1024)
		pointer.Free()
	}
	for i := 0; i < 1024; i++ {
		pointer := MemAlloc(16 * 1024 * 1024)
		MemFree(pointer)
	}
}

func BenchmarkMallocFree1B(b *testing.B) {
	for i := 0; i < b.N; i++ {
		m := MemAlloc(1)
		m.Free()
	}
}

func BenchmarkMallocFree1kB(b *testing.B) {
	for i := 0; i < b.N; i++ {
		m := MemAlloc(1024)
		m.Free()
	}
}

func BenchmarkMallocFree1MB(b *testing.B) {
	for i := 0; i < b.N; i++ {
		m := MemAlloc(1024 * 1024)
		m.Free()
	}
}

// Checks that MemGetAddressRange/GetAddressRange/Bytes report the allocation's
// own size and base address.
func TestMemAddressRange(t *testing.T) {
	N := 12345
	ptr := MemAlloc(int64(N))
	size, base := MemGetAddressRange(ptr)
	if size != int64(N) {
		t.Fail()
	}
	if base != ptr {
		t.Fail()
	}
	size, base = 0, DevicePtr(0)
	size, base = ptr.GetAddressRange()
	if ptr.Bytes() != int64(N) {
		t.Fail()
	}
}

// Sanity-checks the free/total numbers reported by MemGetInfo.
func TestMemGetInfo(t *testing.T) {
	free, total := MemGetInfo()
	fmt.Println("MemGetInfo: ", free, "/", total)
	if free > total {
		t.Fail()
	}
	if total == 0 {
		t.Fail()
	}
}

// Fills a device buffer with 42, overwrites the first half with 21 on a
// stream, copies back and checks both halves.
func TestMemsetAsync(t *testing.T) {
	N := int64(32 * 1024)
	host1 := make([]float32, N)
	for i := range host1 {
		host1[i] = float32(i)
	}
	host2 := make([]float32, N)
	dev1 := MemAlloc(int64(4 * N))
	MemcpyHtoD(dev1, (unsafe.Pointer(&host1[0])), 4*N)
	str := StreamCreate()
	MemsetD32Async(dev1, math.Float32bits(42), N, str)
	MemsetD32Async(dev1, math.Float32bits(21), N/2, str)
	// NOTE(review): synchronous DtoH before str.Synchronize(); relies on the
	// legacy default-stream serializing with str — confirm.
	MemcpyDtoH((unsafe.Pointer(&host2[0])), dev1, 4*N)
	str.Synchronize()
	(&str).Destroy()
	for i := 0; i < len(host2)/2; i++ {
		if host2[i] != 21 {
			t.Fail()
		}
	}
	for i := len(host2) / 2; i < len(host2); i++ {
		if host2[i] != 42 {
			t.Fail()
		}
	}
	dev1.Free()
}

// Same as TestMemsetAsync but with the synchronous memset functions.
func TestMemset(t *testing.T) {
	N := int64(32 * 1024)
	host1 := make([]float32, N)
	for i := range host1 {
		host1[i] = float32(i)
	}
	host2 := make([]float32, N)
	dev1 := MemAlloc(int64(4 * N))
	MemcpyHtoD(dev1, (unsafe.Pointer(&host1[0])), 4*N)
	MemsetD32(dev1, math.Float32bits(42), N)
	MemsetD32(dev1, math.Float32bits(21), N/2)
	MemcpyDtoH((unsafe.Pointer(&host2[0])), dev1, 4*N)
	for i := 0; i < len(host2)/2; i++ {
		if host2[i] != 21 {
			t.Fail()
		}
	}
	for i := len(host2) / 2; i < len(host2); i++ {
		if host2[i] != 42 {
			t.Fail()
		}
	}
	dev1.Free()
}

// Round-trips data host -> dev1 -> dev2 -> host and verifies it.
func TestMemcpy(t *testing.T) {
	N := int64(32 * 1024)
	host1 := make([]float32, N)
	for i := range host1 {
		host1[i] = float32(i)
	}
	host2 := make([]float32, N)
	dev1 := MemAlloc(int64(4 * N))
	dev2 := MemAlloc(int64(4 * N))
	MemcpyHtoD(dev1, (unsafe.Pointer(&host1[0])), 4*N)
	MemcpyDtoD(dev2, dev1, 4*N)
	MemcpyDtoH((unsafe.Pointer(&host2[0])), dev2, 4*N)
	for i := range host2 {
		if host2[i] != float32(i) {
			t.Fail()
		}
	}
	dev1.Free()
	dev2.Free()
}

// Same round-trip as TestMemcpy, using the async copies on one stream.
// NOTE(review): the host slices are not page-locked although the async copy
// docs require it — works in practice, confirm intent.
func TestMemcpyAsync(t *testing.T) {
	N := int64(32 * 1024)
	host1 := make([]float32, N)
	for i := range host1 {
		host1[i] = float32(i)
	}
	host2 := make([]float32, N)
	dev1 := MemAlloc(int64(4 * N))
	dev2 := MemAlloc(int64(4 * N))
	stream := StreamCreate()
	MemcpyHtoDAsync(dev1, (unsafe.Pointer(&host1[0])), 4*N, stream)
	MemcpyDtoDAsync(dev2, dev1, 4*N, stream)
	MemcpyDtoHAsync((unsafe.Pointer(&host2[0])), dev2, 4*N, stream)
	stream.Synchronize()
	for i := range host2 {
		if host2[i] != float32(i) {
			t.Fail()
		}
	}
	dev1.Free()
	dev2.Free()
}

// Identical to TestMemcpyAsync; the host-registration step it was named after
// is disabled (MemHostRegister does not link with cuda6.5).
func TestMemcpyAsyncRegistered(t *testing.T) {
	N := int64(32 * 1024)
	host1 := make([]float32, N)
	for i := range host1 {
		host1[i] = float32(i)
	}
	host2 := make([]float32, N)
	dev1 := MemAlloc(int64(4 * N))
	dev2 := MemAlloc(int64(4 * N))
	stream := StreamCreate()
	MemcpyHtoDAsync(dev1, (unsafe.Pointer(&host1[0])), 4*N, stream)
	MemcpyDtoDAsync(dev2, dev1, 4*N, stream)
	MemcpyDtoHAsync((unsafe.Pointer(&host2[0])), dev2, 4*N, stream)
	stream.Synchronize()
	for i := range host2 {
		if host2[i] != float32(i) {
			t.Fail()
		}
	}
	dev1.Free()
	dev2.Free()
}

// Measures HtoD + DtoD + DtoH bandwidth on 128MB buffers.
func BenchmarkMemcpy(b *testing.B) {
	b.StopTimer()
	N := int64(32 * 1024 * 1024)
	host1 := make([]float32, N)
	host2 := make([]float32, N)
	dev1 := MemAlloc(int64(4 * N))
	defer dev1.Free()
	dev2 := MemAlloc(int64(4 * N))
	defer dev2.Free()
	b.SetBytes(4 * N)
	b.StartTimer()
	for i := 0; i < b.N; i++ {
		MemcpyHtoD(dev1, (unsafe.Pointer(&host1[0])), 4*N)
		MemcpyDtoD(dev2, dev1, 4*N)
		MemcpyDtoH((unsafe.Pointer(&host2[0])), dev2, 4*N)
	}
}

================================================ FILE: cu/memset.go ================================================
package cu

// This file implements CUDA memset functions.

//#include
import "C"

import (
	"unsafe"
)

// Sets the first N 32-bit values of dst array to value.
// Asynchronous.
func MemsetD32(deviceptr DevicePtr, value uint32, N int64) {
	err := Result(C.cuMemsetD32(C.CUdeviceptr(deviceptr), C.uint(value), C.size_t(N)))
	if err != SUCCESS {
		panic(err)
	}
}

// Asynchronously sets the first N 32-bit values of dst array to value.
func MemsetD32Async(deviceptr DevicePtr, value uint32, N int64, stream Stream) {
	err := Result(C.cuMemsetD32Async(C.CUdeviceptr(deviceptr), C.uint(value), C.size_t(N), C.CUstream(unsafe.Pointer(uintptr(stream)))))
	if err != SUCCESS {
		panic(err)
	}
}

// Sets the first N 8-bit values of dst array to value.
// Asynchronous.
func MemsetD8(deviceptr DevicePtr, value uint8, N int64) {
	err := Result(C.cuMemsetD8(C.CUdeviceptr(deviceptr), C.uchar(value), C.size_t(N)))
	if err != SUCCESS {
		panic(err)
	}
}

// Asynchronously sets the first N 8-bit values of dst array to value.
func MemsetD8Async(deviceptr DevicePtr, value uint8, N int64, stream Stream) {
	err := Result(C.cuMemsetD8Async(C.CUdeviceptr(deviceptr), C.uchar(value), C.size_t(N), C.CUstream(unsafe.Pointer(uintptr(stream)))))
	if err != SUCCESS {
		panic(err)
	}
}

================================================ FILE: cu/module.go ================================================
package cu

// This file implements loading of CUDA ptx modules

//#include
import "C"

import (
	"unsafe"
)

// Represents a CUDA CUmodule, a reference to executable device code.
type Module uintptr

// Loads a compute module from file
// NOTE(review): the C.CString result is never freed with C.free — a small
// one-shot leak per load.
func ModuleLoad(fname string) Module {
	//fmt.Fprintln(os.Stderr, "driver.ModuleLoad", fname)
	var mod C.CUmodule
	err := Result(C.cuModuleLoad(&mod, C.CString(fname)))
	if err != SUCCESS {
		panic(err)
	}
	return Module(uintptr(unsafe.Pointer(mod)))
}

// Loads a compute module from string
// NOTE(review): same C.CString leak as ModuleLoad.
func ModuleLoadData(image string) Module {
	var mod C.CUmodule
	err := Result(C.cuModuleLoadData(&mod, unsafe.Pointer(C.CString(image))))
	if err != SUCCESS {
		panic(err)
	}
	return Module(uintptr(unsafe.Pointer(mod)))
}

// Returns a Function handle.
func ModuleGetFunction(module Module, name string) Function {
	var function C.CUfunction
	err := Result(C.cuModuleGetFunction(
		&function,
		C.CUmodule(unsafe.Pointer(uintptr(module))),
		C.CString(name)))
	if err != SUCCESS {
		panic(err)
	}
	return Function(uintptr(unsafe.Pointer(function)))
}

// Returns a Function handle.
func (m Module) GetFunction(name string) Function {
	return ModuleGetFunction(m, name)
}

================================================ FILE: cu/module_test.go ================================================
package cu

import (
	"testing"
	"unsafe"
	//"fmt"
)

// Loads the test PTX module, runs its testMemset kernel over the first half
// of a buffer, and verifies that exactly that half was set to 42.
func TestModule(test *testing.T) {
	// NOTE(review): absolute path — presumably should be the relative
	// "testdata/testmodule.ptx" so `go test` finds it; confirm against upstream.
	mod := ModuleLoad("/testdata/testmodule.ptx")
	f := mod.GetFunction("testMemset")

	N := 1000
	N4 := 4 * int64(N)
	a := make([]float32, N)
	A := MemAlloc(N4)
	defer A.Free()
	aptr := unsafe.Pointer(&a[0])
	MemcpyHtoD(A, aptr, N4)

	var value float32
	value = 42
	var n int
	n = N / 2 // only the first half of the buffer is touched by the kernel

	block := 128
	grid := DivUp(N, block)
	shmem := 0
	args := []unsafe.Pointer{unsafe.Pointer(&A), unsafe.Pointer(&value), unsafe.Pointer(&n)}
	LaunchKernel(f, grid, 1, 1, block, 1, 1, shmem, 0, args)

	MemcpyDtoH(aptr, A, N4)
	for i := 0; i < N/2; i++ {
		if a[i] != 42 {
			test.Fail()
		}
	}
	for i := N / 2; i < N; i++ {
		if a[i] != 0 {
			test.Fail()
		}
	}
	//fmt.Println(a)
}

// Integer division rounded up.
func DivUp(x, y int) int {
	return ((x - 1) / y) + 1
}

================================================ FILE: cu/peer.go ================================================
package cu

// This file implements CUDA unified addressing.

//#include
import "C"

import (
	"unsafe"
)

// Make allocations from the peer Context available to the current context.
func CtxEnablePeerAccess(peer Context) {
	err := Result(C.cuCtxEnablePeerAccess(C.CUcontext(unsafe.Pointer(uintptr(peer))), C.uint(0)))
	if err != SUCCESS {
		panic(err)
	}
}

// Make allocations from the peer Context available to the current context.
func (peer Context) EnablePeerAccess() {
	CtxEnablePeerAccess(peer)
}

// Reverses CtxEnablePeerAccess().
func CtxDisablePeerAccess(peer Context) {
	err := Result(C.cuCtxDisablePeerAccess(C.CUcontext(unsafe.Pointer(uintptr(peer)))))
	if err != SUCCESS {
		panic(err)
	}
}

// Reverses EnablePeerAccess().
func (peer Context) DisablePeerAccess() {
	CtxDisablePeerAccess(peer)
}

// Returns true if CtxEnablePeerAccess can be called on a context for dev and peerDev.
func DeviceCanAccessPeer(dev, peer Device) bool {
	var canAccessPeer C.int
	err := Result(C.cuDeviceCanAccessPeer(&canAccessPeer, C.CUdevice(dev), C.CUdevice(peer)))
	if err != SUCCESS {
		panic(err)
	}
	return int(canAccessPeer) != 0
}

// Returns true if CtxEnablePeerAccess can be called on a context for dev and peerDev.
func (dev Device) CanAccessPeer(peer Device) bool {
	return DeviceCanAccessPeer(dev, peer)
}

================================================ FILE: cu/result.go ================================================
package cu

// This file provides access to CUDA driver error statuses (type CUresult).

//#include
import "C"

import (
	"fmt"
)

// CUDA error status.
// CUDA error statuses are not returned by functions but checked and passed to
// panic() when not successful. If desired, they can be caught by
// recover().
type Result int

// Message string for the error
// (falls back to "Unknown CUresult: <n>" for codes absent from errorString).
func (err Result) String() string {
	str, ok := errorString[err]
	if !ok {
		return "Unknown CUresult: " + fmt.Sprint(int(err))
	}
	return str
}

// Driver API status codes; values mirror the CUresult constants from cuda.h.
const (
	SUCCESS                              Result = C.CUDA_SUCCESS
	ERROR_INVALID_VALUE                  Result = C.CUDA_ERROR_INVALID_VALUE
	ERROR_OUT_OF_MEMORY                  Result = C.CUDA_ERROR_OUT_OF_MEMORY
	ERROR_NOT_INITIALIZED                Result = C.CUDA_ERROR_NOT_INITIALIZED
	ERROR_DEINITIALIZED                  Result = C.CUDA_ERROR_DEINITIALIZED
	ERROR_PROFILER_DISABLED              Result = C.CUDA_ERROR_PROFILER_DISABLED
	ERROR_PROFILER_NOT_INITIALIZED       Result = C.CUDA_ERROR_PROFILER_NOT_INITIALIZED
	ERROR_PROFILER_ALREADY_STARTED       Result = C.CUDA_ERROR_PROFILER_ALREADY_STARTED
	ERROR_PROFILER_ALREADY_STOPPED       Result = C.CUDA_ERROR_PROFILER_ALREADY_STOPPED
	ERROR_NO_DEVICE                      Result = C.CUDA_ERROR_NO_DEVICE
	ERROR_INVALID_DEVICE                 Result = C.CUDA_ERROR_INVALID_DEVICE
	ERROR_INVALID_IMAGE                  Result = C.CUDA_ERROR_INVALID_IMAGE
	ERROR_INVALID_CONTEXT                Result = C.CUDA_ERROR_INVALID_CONTEXT
	ERROR_CONTEXT_ALREADY_CURRENT        Result = C.CUDA_ERROR_CONTEXT_ALREADY_CURRENT
	ERROR_MAP_FAILED                     Result = C.CUDA_ERROR_MAP_FAILED
	ERROR_UNMAP_FAILED                   Result = C.CUDA_ERROR_UNMAP_FAILED
	ERROR_ARRAY_IS_MAPPED                Result = C.CUDA_ERROR_ARRAY_IS_MAPPED
	ERROR_ALREADY_MAPPED                 Result = C.CUDA_ERROR_ALREADY_MAPPED
	ERROR_NO_BINARY_FOR_GPU              Result = C.CUDA_ERROR_NO_BINARY_FOR_GPU
	ERROR_ALREADY_ACQUIRED               Result = C.CUDA_ERROR_ALREADY_ACQUIRED
	ERROR_NOT_MAPPED                     Result = C.CUDA_ERROR_NOT_MAPPED
	ERROR_NOT_MAPPED_AS_ARRAY            Result = C.CUDA_ERROR_NOT_MAPPED_AS_ARRAY
	ERROR_NOT_MAPPED_AS_POINTER          Result = C.CUDA_ERROR_NOT_MAPPED_AS_POINTER
	ERROR_ECC_UNCORRECTABLE              Result = C.CUDA_ERROR_ECC_UNCORRECTABLE
	ERROR_UNSUPPORTED_LIMIT              Result = C.CUDA_ERROR_UNSUPPORTED_LIMIT
	ERROR_CONTEXT_ALREADY_IN_USE         Result = C.CUDA_ERROR_CONTEXT_ALREADY_IN_USE
	ERROR_INVALID_SOURCE                 Result = C.CUDA_ERROR_INVALID_SOURCE
	ERROR_FILE_NOT_FOUND                 Result = C.CUDA_ERROR_FILE_NOT_FOUND
	ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND Result = C.CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND
	ERROR_SHARED_OBJECT_INIT_FAILED      Result = C.CUDA_ERROR_SHARED_OBJECT_INIT_FAILED
	ERROR_OPERATING_SYSTEM               Result = C.CUDA_ERROR_OPERATING_SYSTEM
	ERROR_INVALID_HANDLE                 Result = C.CUDA_ERROR_INVALID_HANDLE
	ERROR_NOT_FOUND                      Result = C.CUDA_ERROR_NOT_FOUND
	ERROR_NOT_READY                      Result = C.CUDA_ERROR_NOT_READY
	ERROR_LAUNCH_FAILED                  Result = C.CUDA_ERROR_LAUNCH_FAILED
	ERROR_LAUNCH_OUT_OF_RESOURCES        Result = C.CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES
	ERROR_LAUNCH_TIMEOUT                 Result = C.CUDA_ERROR_LAUNCH_TIMEOUT
	ERROR_LAUNCH_INCOMPATIBLE_TEXTURING  Result = C.CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING
	ERROR_PEER_ACCESS_ALREADY_ENABLED    Result = C.CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED
	ERROR_PEER_ACCESS_NOT_ENABLED        Result = C.CUDA_ERROR_PEER_ACCESS_NOT_ENABLED
	ERROR_PRIMARY_CONTEXT_ACTIVE         Result = C.CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE
	ERROR_CONTEXT_IS_DESTROYED           Result = C.CUDA_ERROR_CONTEXT_IS_DESTROYED
	ERROR_ASSERT                         Result = C.CUDA_ERROR_ASSERT
	ERROR_TOO_MANY_PEERS                 Result = C.CUDA_ERROR_TOO_MANY_PEERS
	ERROR_HOST_MEMORY_ALREADY_REGISTERED Result = C.CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED
	ERROR_HOST_MEMORY_NOT_REGISTERED     Result = C.CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED
	// NOTE(review): the following codes are hard-coded numerically; the
	// commented-out C names suggest they were missing from the CUDA headers
	// this was built against — confirm against the installed cuda.h.
	ERROR_HARDWARE_STACK_ERROR  Result = 714 //C.CUDA_ERROR_HARDWARE_STACK_ERROR
	ERROR_ILLEGAL_INSTRUCTION   Result = 715 //C.CUDA_ERROR_ILLEGAL_INSTRUCTION
	ERROR_MISALIGNED_ADDRESS    Result = 716 //C.CUDA_ERROR_MISALIGNED_ADDRESS
	ERROR_INVALID_ADDRESS_SPACE Result = 717 //C.CUDA_ERROR_INVALID_ADDRESS_SPACE
	ERROR_INVALID_PC            Result = 718 //C.CUDA_ERROR_INVALID_PC
	ERROR_NOT_PERMITTED         Result = 800 //C.CUDA_ERROR_NOT_PERMITTED
	ERROR_NOT_SUPPORTED         Result = 801 //C.CUDA_ERROR_NOT_SUPPORTED
	ERROR_UNKNOWN               Result = C.CUDA_ERROR_UNKNOWN
)

// Map with error strings for Result error numbers
var errorString map[Result]string = map[Result]string{
	SUCCESS:                              "CUDA_SUCCESS",
	ERROR_INVALID_VALUE:                  "CUDA_ERROR_INVALID_VALUE",
	ERROR_OUT_OF_MEMORY:                  "CUDA_ERROR_OUT_OF_MEMORY",
	ERROR_NOT_INITIALIZED:                "CUDA_ERROR_NOT_INITIALIZED",
	ERROR_DEINITIALIZED:                  "CUDA_ERROR_DEINITIALIZED",
	ERROR_PROFILER_DISABLED:              "CUDA_ERROR_PROFILER_DISABLED",
	ERROR_PROFILER_NOT_INITIALIZED:       "CUDA_ERROR_PROFILER_NOT_INITIALIZED",
	ERROR_PROFILER_ALREADY_STARTED:       "CUDA_ERROR_PROFILER_ALREADY_STARTED",
	ERROR_PROFILER_ALREADY_STOPPED:       "CUDA_ERROR_PROFILER_ALREADY_STOPPED",
	ERROR_NO_DEVICE:                      "CUDA_ERROR_NO_DEVICE",
	ERROR_INVALID_DEVICE:                 "CUDA_ERROR_INVALID_DEVICE",
	ERROR_INVALID_IMAGE:                  "CUDA_ERROR_INVALID_IMAGE",
	ERROR_INVALID_CONTEXT:                "CUDA_ERROR_INVALID_CONTEXT",
	ERROR_CONTEXT_ALREADY_CURRENT:        "CUDA_ERROR_CONTEXT_ALREADY_CURRENT",
	ERROR_MAP_FAILED:                     "CUDA_ERROR_MAP_FAILED",
	ERROR_UNMAP_FAILED:                   "CUDA_ERROR_UNMAP_FAILED",
	ERROR_ARRAY_IS_MAPPED:                "CUDA_ERROR_ARRAY_IS_MAPPED",
	ERROR_ALREADY_MAPPED:                 "CUDA_ERROR_ALREADY_MAPPED",
	ERROR_NO_BINARY_FOR_GPU:              "CUDA_ERROR_NO_BINARY_FOR_GPU",
	ERROR_ALREADY_ACQUIRED:               "CUDA_ERROR_ALREADY_ACQUIRED",
	ERROR_NOT_MAPPED:                     "CUDA_ERROR_NOT_MAPPED",
	ERROR_NOT_MAPPED_AS_ARRAY:            "CUDA_ERROR_NOT_MAPPED_AS_ARRAY",
	ERROR_NOT_MAPPED_AS_POINTER:          "CUDA_ERROR_NOT_MAPPED_AS_POINTER",
	ERROR_ECC_UNCORRECTABLE:              "CUDA_ERROR_ECC_UNCORRECTABLE",
	ERROR_UNSUPPORTED_LIMIT:              "CUDA_ERROR_UNSUPPORTED_LIMIT",
	ERROR_CONTEXT_ALREADY_IN_USE:         "CUDA_ERROR_CONTEXT_ALREADY_IN_USE",
	ERROR_INVALID_SOURCE:                 "CUDA_ERROR_INVALID_SOURCE",
	ERROR_FILE_NOT_FOUND:                 "CUDA_ERROR_FILE_NOT_FOUND",
	ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND",
	ERROR_SHARED_OBJECT_INIT_FAILED:      "CUDA_ERROR_SHARED_OBJECT_INIT_FAILED",
	ERROR_OPERATING_SYSTEM:               "CUDA_ERROR_OPERATING_SYSTEM",
	ERROR_INVALID_HANDLE:                 "CUDA_ERROR_INVALID_HANDLE",
	ERROR_NOT_FOUND:                      "CUDA_ERROR_NOT_FOUND",
	ERROR_NOT_READY:                      "CUDA_ERROR_NOT_READY",
	ERROR_LAUNCH_OUT_OF_RESOURCES:        "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES",
	ERROR_LAUNCH_TIMEOUT:                 "CUDA_ERROR_LAUNCH_TIMEOUT",
	ERROR_LAUNCH_INCOMPATIBLE_TEXTURING:  "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING",
	ERROR_PEER_ACCESS_ALREADY_ENABLED:    "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED",
	ERROR_PEER_ACCESS_NOT_ENABLED:        "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED",
	ERROR_PRIMARY_CONTEXT_ACTIVE:         "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE",
	ERROR_CONTEXT_IS_DESTROYED:           "CUDA_ERROR_CONTEXT_IS_DESTROYED",
	ERROR_ASSERT:                         "CUDA_ERROR_ASSERT",
	ERROR_TOO_MANY_PEERS:                 "CUDA_ERROR_TOO_MANY_PEERS",
	ERROR_HOST_MEMORY_ALREADY_REGISTERED: "CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED",
	ERROR_HOST_MEMORY_NOT_REGISTERED:     "CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED",
	ERROR_HARDWARE_STACK_ERROR:           "CUDA_ERROR_HARDWARE_STACK_ERROR",
	ERROR_ILLEGAL_INSTRUCTION:            "CUDA_ERROR_ILLEGAL_INSTRUCTION",
	ERROR_MISALIGNED_ADDRESS:             "CUDA_ERROR_MISALIGNED_ADDRESS",
	ERROR_INVALID_ADDRESS_SPACE:          "CUDA_ERROR_INVALID_ADDRESS_SPACE",
	ERROR_INVALID_PC:                     "CUDA_ERROR_INVALID_PC",
	ERROR_LAUNCH_FAILED:                  "CUDA_ERROR_LAUNCH_FAILED",
	ERROR_NOT_PERMITTED:                  "CUDA_ERROR_NOT_PERMITTED",
	ERROR_NOT_SUPPORTED:                  "CUDA_ERROR_NOT_SUPPORTED",
	ERROR_UNKNOWN:                        "CUDA_ERROR_UNKNOWN"}

================================================
FILE: cu/runtimeapi.go
================================================
package cu

// This file implements parts of the CUDA runtime api instead of the driver
// api the rest of this package uses.
// It might be useful to move this to a separate package at some point.

//#include
import "C"

import "unsafe"

// Set the device as current.
func SetDevice(device Device) {
	err := Result(C.cudaSetDevice(C.int(device)))
	if err != SUCCESS {
		panic(err)
	}
}

// Reset the state of the current device.
func DeviceReset() {
	err := Result(C.cudaDeviceReset())
	if err != SUCCESS {
		panic(err)
	}
}

// Set CUDA device flags.
func SetDeviceFlags(flags uint) {
	err := Result(C.cudaSetDeviceFlags(C.uint(flags)))
	if err != SUCCESS {
		panic(err)
	}
}

// Flags for SetDeviceFlags
const (
	// The default, decides to yield or not based on active CUDA threads and processors.
	DeviceAuto = C.cudaDeviceScheduleAuto
	// Actively spin while waiting for device.
	DeviceSpin = C.cudaDeviceScheduleSpin
	// Yield when waiting.
DeviceYield = C.cudaDeviceScheduleYield // ScheduleBlockingSync block CPU on sync. DeviceScheduleBlockingSync = C.cudaDeviceScheduleBlockingSync // ScheduleBlockingSync block CPU on sync. Deprecated since cuda 4.0 DeviceBlockingSync = C.cudaDeviceBlockingSync // For use with pinned host memory DeviceMapHost = C.cudaDeviceMapHost // Do not reduce local memory to try and prevent thrashing DeviceLmemResizeToMax = C.cudaDeviceLmemResizeToMax ) func Malloc(bytes int64) DevicePtr { var devptr unsafe.Pointer err := Result(C.cudaMalloc(&devptr, C.size_t(bytes))) if err != SUCCESS { panic(err) } return DevicePtr(devptr) } func MallocHost(bytes int64) unsafe.Pointer { var p unsafe.Pointer err := Result(C.cudaMallocHost(&p, C.size_t(bytes))) if err != SUCCESS { panic(err) } return p } func FreeHost(ptr unsafe.Pointer) { err := Result(C.cudaFreeHost(ptr)) if err != SUCCESS { panic(err) } } // Copies a number of bytes in the direction specified by flags func MemCpy(dst, src unsafe.Pointer, bytes int64, flags uint) { err := Result(C.cudaMemcpy(dst, src, C.size_t(bytes), uint32(flags))) if err != SUCCESS { panic(err) } } //Flags for memory copy types const ( // Host to Host HtoH = C.cudaMemcpyHostToHost // Host to Device HtoD = C.cudaMemcpyHostToDevice // Device to Host DtoH = C.cudaMemcpyDeviceToHost // Device to Device DtoD = C.cudaMemcpyDeviceToDevice // Default, unified virtual address space Virt = C.cudaMemcpyDefault ) ================================================ FILE: cu/stream.go ================================================ package cu // This file implements CUDA streams //#include import "C" import "unsafe" // CUDA stream. 
type Stream uintptr // Creates an asynchronous stream func StreamCreate() Stream { var stream C.CUstream err := Result(C.cuStreamCreate(&stream, C.uint(0))) // flags has to be zero if err != SUCCESS { panic(err) } return Stream(uintptr(unsafe.Pointer(stream))) } // Destroys the asynchronous stream func (stream *Stream) Destroy() { str := *stream err := Result(C.cuStreamDestroy(C.CUstream(unsafe.Pointer(uintptr(str))))) *stream = 0 if err != SUCCESS { panic(err) } } // Destroys an asynchronous stream func StreamDestroy(stream *Stream) { stream.Destroy() } // Blocks until the stream has completed. func (stream Stream) Synchronize() { err := Result(C.cuStreamSynchronize(C.CUstream(unsafe.Pointer(uintptr(stream))))) if err != SUCCESS { panic(err) } } // Returns Success if all operations have completed, ErrorNotReady otherwise func (stream Stream) Query() Result { return Result(C.cuStreamQuery(C.CUstream(unsafe.Pointer(uintptr(stream))))) } // Returns Success if all operations have completed, ErrorNotReady otherwise func StreamQuery(stream Stream) Result { return stream.Query() } // Blocks until the stream has completed. func StreamSynchronize(stream Stream) { stream.Synchronize() } ================================================ FILE: cu/testdata/testmodule.cu ================================================ /* * Module to test CUDA module loading and execution. * To be compiled with: * nvcc -ptx testmodule.cu */ #ifdef __cplusplus extern "C" { #endif #define threadindex ( ( blockIdx.y*gridDim.x + blockIdx.x ) * blockDim.x + threadIdx.x ) /// Sets the first N elements of array to value. 
// Each thread computes its global index via the threadindex macro defined
// above and, if in range, stores value at that position.
__global__ void testMemset(float* array, float value, int N){
	int i = threadindex;
	if(i < N){
		array[i] = value;
	}
}

#ifdef __cplusplus
}
#endif

================================================
FILE: cu/testdata/testmodule.ptx
================================================
	.version 1.4
	.target sm_10, map_f64_to_f32
	// compiled with /usr/local/cuda/open64/lib//be
	// nvopencc 4.0 built on 2011-02-18

	//-----------------------------------------------------------
	// Compiling /tmp/tmpxft_00000e56_00000000-9_testmodule.cpp3.i (/tmp/ccBI#.rDLD4T)
	//-----------------------------------------------------------

	//-----------------------------------------------------------
	// Options:
	//-----------------------------------------------------------
	//  Target:ptx, ISA:sm_10, Endian:little, Pointer Size:64
	//  -O3	(Optimization level)
	//  -g0	(Debug level)
	//  -m2	(Report advisories)
	//-----------------------------------------------------------

	.file	1	""
	.file	2	"/tmp/tmpxft_00000e56_00000000-8_testmodule.cudafe2.gpu"
	.file	3	"/usr/lib/gcc/x86_64-linux-gnu/4.4.3/include/stddef.h"
	.file	4	"/usr/local/cuda/bin/../include/crt/device_runtime.h"
	.file	5	"/usr/local/cuda/bin/../include/host_defines.h"
	.file	6	"/usr/local/cuda/bin/../include/builtin_types.h"
	.file	7	"/usr/local/cuda/bin/../include/device_types.h"
	.file	8	"/usr/local/cuda/bin/../include/driver_types.h"
	.file	9	"/usr/local/cuda/bin/../include/surface_types.h"
	.file	10	"/usr/local/cuda/bin/../include/texture_types.h"
	.file	11	"/usr/local/cuda/bin/../include/vector_types.h"
	.file	12	"/usr/local/cuda/bin/../include/device_launch_parameters.h"
	.file	13	"/usr/local/cuda/bin/../include/crt/storage_class.h"
	.file	14	"/usr/include/bits/types.h"
	.file	15	"/usr/include/time.h"
	.file	16	"testmodule.cu"
	.file	17	"/usr/local/cuda/bin/../include/common_functions.h"
	.file	18	"/usr/local/cuda/bin/../include/math_functions.h"
	.file	19	"/usr/local/cuda/bin/../include/math_constants.h"
	.file	20	"/usr/local/cuda/bin/../include/device_functions.h"
	.file	21	"/usr/local/cuda/bin/../include/sm_11_atomic_functions.h"
	.file	22	"/usr/local/cuda/bin/../include/sm_12_atomic_functions.h"
	.file	23	"/usr/local/cuda/bin/../include/sm_13_double_functions.h"
	.file	24	"/usr/local/cuda/bin/../include/sm_20_atomic_functions.h"
	.file	25	"/usr/local/cuda/bin/../include/sm_20_intrinsics.h"
	.file	26	"/usr/local/cuda/bin/../include/surface_functions.h"
	.file	27	"/usr/local/cuda/bin/../include/texture_fetch_functions.h"
	.file	28	"/usr/local/cuda/bin/../include/math_functions_dbl_ptx1.h"

	.entry testMemset (
		.param .u64 __cudaparm_testMemset_array,
		.param .f32 __cudaparm_testMemset_value,
		.param .s32 __cudaparm_testMemset_N)
	{
	.reg .u16 %rh<4>;
	.reg .u32 %r<10>;
	.reg .u64 %rd<6>;
	.reg .f32 %f<3>;
	.reg .pred %p<3>;
	.loc	16	7	0
$LDWbegin_testMemset:
	mov.u16 	%rh1, %nctaid.x;
	mov.u16 	%rh2, %ctaid.y;
	mul.wide.u16 	%r1, %rh1, %rh2;
	cvt.u32.u16 	%r2, %ctaid.x;
	add.u32 	%r3, %r2, %r1;
	cvt.u32.u16 	%r4, %ntid.x;
	mul.lo.u32 	%r5, %r4, %r3;
	cvt.u32.u16 	%r6, %tid.x;
	add.u32 	%r7, %r6, %r5;
	ld.param.s32 	%r8, [__cudaparm_testMemset_N];
	setp.le.s32 	%p1, %r8, %r7;
	@%p1 bra 	$Lt_0_1026;
	.loc	16	10	0
	ld.param.f32 	%f1, [__cudaparm_testMemset_value];
	ld.param.u64 	%rd1, [__cudaparm_testMemset_array];
	cvt.s64.s32 	%rd2, %r7;
	mul.wide.s32 	%rd3, %r7, 4;
	add.u64 	%rd4, %rd1, %rd3;
	st.global.f32 	[%rd4+0], %f1;
$Lt_0_1026:
	.loc	16	12	0
	exit;
$LDWend_testMemset:
	} // testMemset

================================================
FILE: cu/version.go
================================================
package cu

// This file implements CUDA driver version management

//#include
import "C"

// Returns the CUDA driver version.
func Version() int { var version C.int err := Result(C.cuDriverGetVersion(&version)) if err != SUCCESS { panic(err) } return int(version) } ================================================ FILE: cu/version_test.go ================================================ package cu import ( "fmt" "testing" ) func TestVersion(t *testing.T) { fmt.Println("CUDA driver version: ", Version()) } ================================================ FILE: cuda/Makefile ================================================ all: 6g gccgo doc 6g: go install -v go tool vet *.go gofmt -w *.go GCCGO=gccgo -gccgoflags '-static-libgcc -O3' gccgo: go build -v -compiler $(GCCGO) test: 6gtest gccgotest 6gtest: go test gccgotest: go test -compiler $(GCCGO) bench: 6gbench gccgobench 6gbench: go test -bench=. gccgobench: go test -bench=. -compiler $(GCCGO) clean: go clean doc: godoc github.com/barnex/cuda5/cu > README ================================================ FILE: cuda/README ================================================ PACKAGE package cu import "github.com/barnex/cuda5/cu" Go bindings for the CUDA driver API. CONSTANTS const ( // If the number of contexts > number of CPUs, yield to other OS threads when waiting for the GPU, otherwise CUDA spin on the processor. CTX_SCHED_AUTO = C.CU_CTX_SCHED_AUTO // Spin when waiting for results from the GPU. CTX_SCHED_SPIN = C.CU_CTX_SCHED_SPIN // Yield its thread when waiting for results from the GPU. CTX_SCHED_YIELD = C.CU_CTX_SCHED_YIELD // Bock the CPU thread on a synchronization primitive when waiting for the GPU to finish work. CTX_BLOCKING_SYNC // Support mapped pinned allocations. This flag must be set in order to allocate pinned host memory that is accessible to the GPU. CTX_MAP_HOST = C.CU_CTX_MAP_HOST //Do not reduce local memory after resizing local memory for a kernel. 
CTX_LMEM_RESIZE_TO_MAX = C.CU_CTX_LMEM_RESIZE_TO_MAX ) Flags for CtxCreate const ( SIZEOF_FLOAT32 = 4 SIZEOF_FLOAT64 = 8 SIZEOF_COMPLEX64 = 8 SIZEOF_COMPLEX128 = 16 ) Type size in bytes FUNCTIONS func CtxDestroy(ctx *Context) Destroys the CUDA context specified by ctx. If the context usage count is not equal to 1, or the context is current to any CPU thread other than the current one, this function fails. Floating contexts (detached from a CPU thread via cuCtxPopCurrent()) may be destroyed by this function. func CtxDisablePeerAccess(peer Context) Reverses CtxEnablePeerAccess(). func CtxEnablePeerAccess(peer Context) Make allocations from the peer Context available to the current context. func CtxGetApiVersion(ctx Context) (version int) Returns the API version to create the context. func CtxSetCurrent(ctx Context) Sets the current active context. func CtxSynchronize() Blocks until the device has completed all preceding requested tasks, if the context was created with the CU_CTX_SCHED_BLOCKING_SYNC flag. func DeviceCanAccessPeer(dev, peer Device) bool Returns true if CtxEnablePeerAccess can be called on a context for dev and peerDev. func DeviceComputeCapability(device Device) (major, minor int) Returns the compute capability of the device. func DeviceGetAttribute(attrib DeviceAttribute, dev Device) int Gets the value of a device attribute. func DeviceGetCount() int Returns the number of devices with compute capability greater than or equal to 1.0 that are available for execution. func DeviceGetName(dev Device) string Gets the name of the device. func DeviceTotalMem(device Device) int64 Returns the total amount of memory available on the device in bytes. func FuncGetAttribute(attrib FunctionAttribute, function Function) int func Init(flags int) Initialize the CUDA driver API. Currently, flags must be 0. If Init() has not been called, any function from the driver API will panic with ERROR_NOT_INITIALIZED. 
func LaunchKernel(f Function, gridDimX, gridDimY, gridDimZ int, blockDimX, blockDimY, blockDimZ int, sharedMemBytes int, stream Stream, kernelParams []unsafe.Pointer)

func MemAllocHost(bytes int64) unsafe.Pointer

func MemFree(ptr *DevicePtr)
    Frees device memory allocated by MemAlloc(). Overwrites the pointer with
    NULL. It is safe to double-free.

func MemFreeHost(ptr unsafe.Pointer)

func MemGetAddressRange(ptr DevicePtr) (bytes int64, base DevicePtr)
    Returns the base address and size of the allocation (by MemAlloc) that
    contains the input pointer ptr.

func MemGetInfo() (free, total int64)
    Returns the free and total amount of memory in the current Context (in
    bytes).

func MemHostRegister(ptr unsafe.Pointer, bytes int64, flags MemHostRegisterFlag)
    Page-locks memory specified by the pointer and bytes. The pointer and
    byte size must be aligned to the host page size (4KB) See also:
    MemHostUnregister()

func MemHostUnregister(ptr unsafe.Pointer)
    Unmaps memory locked by MemHostRegister().

func Memcpy(dst, src DevicePtr, bytes int64)
    Copies a number of bytes on the current device. Requires unified
    addressing to be supported. See also: MemcpyDtoD(). TODO(a): is actually
    an auto copy for device and/or host memory

func MemcpyAsync(dst, src DevicePtr, bytes int64, stream Stream)
    Asynchronously copies a number of bytes on the current device.

func MemcpyDtoD(dst, src DevicePtr, bytes int64)
    Copies a number of bytes from device to device.

func MemcpyDtoDAsync(dst, src DevicePtr, bytes int64, stream Stream)
    Asynchronously copies a number of bytes from device to device.

func MemcpyDtoH(dst unsafe.Pointer, src DevicePtr, bytes int64)
    Copies a number of bytes from device to host.

func MemcpyDtoHAsync(dst unsafe.Pointer, src DevicePtr, bytes int64, stream Stream)
    Asynchronously copies a number of bytes from device to host. The host
    memory must be page-locked (see MemRegister)

func MemcpyHtoD(dst DevicePtr, src unsafe.Pointer, bytes int64)
    Copies a number of bytes from host to device.
func MemcpyHtoDAsync(dst DevicePtr, src unsafe.Pointer, bytes int64, stream Stream)
    Asynchronously copies a number of bytes from host to device. The host
    memory must be page-locked (see MemRegister)

func MemcpyPeer(dst DevicePtr, dstCtx Context, src DevicePtr, srcCtx Context, bytes int64)
    Copies from device memory in one context (device) to another.

func MemcpyPeerAsync(dst DevicePtr, dstCtx Context, src DevicePtr, srcCtx Context, bytes int64, stream Stream)
    Asynchronously copies from device memory in one context (device) to
    another.

func MemsetD32(deviceptr DevicePtr, value uint32, N int64)
    Sets the first N 32-bit values of dst array to value. Asynchronous.

func MemsetD32Async(deviceptr DevicePtr, value uint32, N int64, stream Stream)
    Asynchronously sets the first N 32-bit values of dst array to value.

func MemsetD8(deviceptr DevicePtr, value uint8, N int64)
    Sets the first N 8-bit values of dst array to value. Asynchronous.

func MemsetD8Async(deviceptr DevicePtr, value uint8, N int64, stream Stream)
    Asynchronously sets the first N 8-bit values of dst array to value.

func StreamDestroy(stream *Stream)
    Destroys an asynchronous stream

func StreamSynchronize(stream Stream)
    Blocks until the stream has completed.

func Version() int
    Returns the CUDA driver version.

TYPES

type Context uintptr
    CUDA context.

func CtxCreate(flags uint, dev Device) Context
    Create a CUDA context.

func CtxGetCurrent() Context
    Gets the current active context.

func (ctx Context) ApiVersion() (version int)
    Returns the API version to create the context.

func (ctx *Context) Destroy()
    Destroys the CUDA context.

func (peer Context) DisablePeerAccess()
    Reverses EnablePeerAccess().

func (peer Context) EnablePeerAccess()
    Make allocations from the peer Context available to the current context.

func (ctx Context) SetCurrent()
    Sets the current active context.
type DevProp struct { MaxThreadsPerBlock int MaxThreadsDim [3]int MaxGridSize [3]int SharedMemPerBlock int TotalConstantMemory int SIMDWidth int MemPitch int RegsPerBlock int ClockRate int TextureAlign int } Device properties func DeviceGetProperties(dev Device) (prop DevProp) Returns the device's properties. type Device int CUDA Device number. func CtxGetDevice() Device Returns the ordinal of the current context's device. func DeviceGet(ordinal int) Device Returns in a device handle given an ordinal in the range [0, DeviceGetCount()-1]. func (dev Device) Attribute(attrib DeviceAttribute) int Gets the value of a device attribute. func (dev Device) CanAccessPeer(peer Device) bool Returns true if CtxEnablePeerAccess can be called on a context for dev and peerDev. func (device Device) ComputeCapability() (major, minor int) Returns the compute capability of the device. func (dev Device) Name() string Gets the name of the device. func (dev Device) Properties() DevProp Returns the device's properties. func (device Device) TotalMem() int64 Returns the total amount of memory available on the device in bytes. 
type DeviceAttribute int const ( MAX_THREADS_PER_BLOCK DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK // Maximum number of threads per block MAX_BLOCK_DIM_X DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X // Maximum block dimension X MAX_BLOCK_DIM_Y DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y // Maximum block dimension Y MAX_BLOCK_DIM_Z DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z // Maximum block dimension Z MAX_GRID_DIM_X DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X // Maximum grid dimension X MAX_GRID_DIM_Y DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y // Maximum grid dimension Y MAX_GRID_DIM_Z DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z // Maximum grid dimension Z MAX_SHARED_MEMORY_PER_BLOCK DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK // Maximum shared memory available per block in bytes TOTAL_CONSTANT_MEMORY DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY // Memory available on device for __constant__ variables in a CUDA C kernel in bytes WARP_SIZE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_WARP_SIZE // Warp size in threads MAX_PITCH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_PITCH // Maximum pitch in bytes allowed by memory copies MAX_REGISTERS_PER_BLOCK DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK // Maximum number of 32-bit registers available per block CLOCK_RATE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CLOCK_RATE // Peak clock frequency in kilohertz TEXTURE_ALIGNMENT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT // Alignment requirement for textures MULTIPROCESSOR_COUNT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT // Number of multiprocessors on device KERNEL_EXEC_TIMEOUT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT // Specifies whether there is a run time limit on kernels INTEGRATED DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_INTEGRATED // Device is integrated with host memory 
CAN_MAP_HOST_MEMORY DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY // Device can map host memory into CUDA address space COMPUTE_MODE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_COMPUTE_MODE // Compute mode (See ::CUcomputemode for details) MAXIMUM_TEXTURE1D_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH // Maximum 1D texture width MAXIMUM_TEXTURE2D_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH // Maximum 2D texture width MAXIMUM_TEXTURE2D_HEIGHT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT // Maximum 2D texture height MAXIMUM_TEXTURE3D_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH // Maximum 3D texture width MAXIMUM_TEXTURE3D_HEIGHT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT // Maximum 3D texture height MAXIMUM_TEXTURE3D_DEPTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH // Maximum 3D texture depth MAXIMUM_TEXTURE2D_LAYERED_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH // Maximum 2D layered texture width MAXIMUM_TEXTURE2D_LAYERED_HEIGHT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT // Maximum 2D layered texture height MAXIMUM_TEXTURE2D_LAYERED_LAYERS DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS // Maximum layers in a 2D layered texture SURFACE_ALIGNMENT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT // Alignment requirement for surfaces CONCURRENT_KERNELS DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS // Device can possibly execute multiple kernels concurrently ECC_ENABLED DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_ECC_ENABLED // Device has ECC support enabled PCI_BUS_ID DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_PCI_BUS_ID // PCI bus ID of the device PCI_DEVICE_ID DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID // PCI device ID of the device TCC_DRIVER DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TCC_DRIVER // Device is 
using TCC driver model MEMORY_CLOCK_RATE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE // Peak memory clock frequency in kilohertz GLOBAL_MEMORY_BUS_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH // Global memory bus width in bits L2_CACHE_SIZE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE // Size of L2 cache in bytes MAX_THREADS_PER_MULTIPROCESSOR DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR // Maximum resident threads per multiprocessor ASYNC_ENGINE_COUNT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT // Number of asynchronous engines UNIFIED_ADDRESSING DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING // Device uses shares a unified address space with the host MAXIMUM_TEXTURE1D_LAYERED_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH // Maximum 1D layered texture width MAXIMUM_TEXTURE1D_LAYERED_LAYERS DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS // Maximum layers in a 1D layered texture ) type DevicePtr uintptr func MemAlloc(bytes int64) DevicePtr Allocates a number of bytes of device memory. func (ptr DevicePtr) Bytes() (bytes int64) Returns the size of the allocation (by MemAlloc) that contains the input pointer ptr. func (ptr *DevicePtr) Free() Frees device memory allocated by MemAlloc(). Overwrites the pointer with NULL. It is safe to double-free. func (ptr DevicePtr) GetAddressRange() (bytes int64, base DevicePtr) Returns the base address and size of the allocation (by MemAlloc) that contains the input pointer ptr. func (ptr DevicePtr) MemoryType() MemoryType Returns the physical memory type that ptr addresses. func (p DevicePtr) String() string type Dim3 struct { X, Y, Z int } type Function uintptr Represents a CUDA CUfunction, a reference to a function within a module. func ModuleGetFunction(module Module, name string) Function Returns a Function handle. 
func (f Function) GetAttribute(attrib FunctionAttribute) int type FunctionAttribute int const ( FUNC_A_MAX_THREADS_PER_BLOCK FunctionAttribute = C.CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK // The maximum number of threads per block, beyond which a launch of the function would fail. FUNC_A_SHARED_SIZE_BYTES FunctionAttribute = C.CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES // The size in bytes of statically-allocated shared memory required by this function. FUNC_A_CONST_SIZE_BYTES FunctionAttribute = C.CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES // The size in bytes of user-allocated constant memory required by this function. FUNC_A_LOCAL_SIZE_BYTES FunctionAttribute = C.CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES // The size in bytes of local memory used by each thread of this function. FUNC_A_NUM_REGS FunctionAttribute = C.CU_FUNC_ATTRIBUTE_NUM_REGS // The number of registers used by each thread of this function. FUNC_A_PTX_VERSION FunctionAttribute = C.CU_FUNC_ATTRIBUTE_PTX_VERSION // The PTX virtual architecture version for which the function was compiled. FUNC_A_BINARY_VERSION FunctionAttribute = C.CU_FUNC_ATTRIBUTE_BINARY_VERSION // The binary architecture version for which the function was compiled. ) type MemHostRegisterFlag int const ( // Memory is pinned in all CUDA contexts. MEMHOSTREGISTER_PORTABLE MemHostRegisterFlag = C.CU_MEMHOSTREGISTER_PORTABLE // Maps the allocation in CUDA address space. TODO(a): cuMemHostGetDevicePointer() MEMHOSTREGISTER_DEVICEMAP MemHostRegisterFlag = C.CU_MEMHOSTREGISTER_DEVICEMAP ) Flag for MemHostRegister type MemoryType uint Physical memory type of device pointer. const ( MemoryTypeHost MemoryType = C.CU_MEMORYTYPE_HOST MemoryTypeDevice MemoryType = C.CU_MEMORYTYPE_DEVICE MemoryTypeArray MemoryType = C.CU_MEMORYTYPE_ARRAY MemoryTypeUnified MemoryType = C.CU_MEMORYTYPE_UNIFIED ) func PointerGetAttributeMemoryType(ptr DevicePtr) (t MemoryType, err Result) Returns the physical memory type that ptr addresses. 
func (t MemoryType) String() string type Module uintptr Represents a CUDA CUmodule, a reference to executable device code. func ModuleLoad(fname string) Module Loads a compute module from file func ModuleLoadData(image string) Module Loads a compute module from string func (m Module) GetFunction(name string) Function Returns a Function handle. type Result int CUDA error status. CUDA error statuses are not returned by functions but checked and passed to panic() when not successful. If desired, they can be caught by recover(). const ( SUCCESS Result = C.CUDA_SUCCESS ERROR_INVALID_VALUE Result = C.CUDA_ERROR_INVALID_VALUE ERROR_OUT_OF_MEMORY Result = C.CUDA_ERROR_OUT_OF_MEMORY ERROR_NOT_INITIALIZED Result = C.CUDA_ERROR_NOT_INITIALIZED ERROR_DEINITIALIZED Result = C.CUDA_ERROR_DEINITIALIZED ERROR_PROFILER_DISABLED Result = C.CUDA_ERROR_PROFILER_DISABLED ERROR_PROFILER_NOT_INITIALIZED Result = C.CUDA_ERROR_PROFILER_NOT_INITIALIZED ERROR_PROFILER_ALREADY_STARTED Result = C.CUDA_ERROR_PROFILER_ALREADY_STARTED ERROR_PROFILER_ALREADY_STOPPED Result = C.CUDA_ERROR_PROFILER_ALREADY_STOPPED ERROR_NO_DEVICE Result = C.CUDA_ERROR_NO_DEVICE ERROR_INVALID_DEVICE Result = C.CUDA_ERROR_INVALID_DEVICE ERROR_INVALID_IMAGE Result = C.CUDA_ERROR_INVALID_IMAGE ERROR_INVALID_CONTEXT Result = C.CUDA_ERROR_INVALID_CONTEXT ERROR_CONTEXT_ALREADY_CURRENT Result = C.CUDA_ERROR_CONTEXT_ALREADY_CURRENT ERROR_MAP_FAILED Result = C.CUDA_ERROR_MAP_FAILED ERROR_UNMAP_FAILED Result = C.CUDA_ERROR_UNMAP_FAILED ERROR_ARRAY_IS_MAPPED Result = C.CUDA_ERROR_ARRAY_IS_MAPPED ERROR_ALREADY_MAPPED Result = C.CUDA_ERROR_ALREADY_MAPPED ERROR_NO_BINARY_FOR_GPU Result = C.CUDA_ERROR_NO_BINARY_FOR_GPU ERROR_ALREADY_ACQUIRED Result = C.CUDA_ERROR_ALREADY_ACQUIRED ERROR_NOT_MAPPED Result = C.CUDA_ERROR_NOT_MAPPED ERROR_NOT_MAPPED_AS_ARRAY Result = C.CUDA_ERROR_NOT_MAPPED_AS_ARRAY ERROR_NOT_MAPPED_AS_POINTER Result = C.CUDA_ERROR_NOT_MAPPED_AS_POINTER ERROR_ECC_UNCORRECTABLE Result = C.CUDA_ERROR_ECC_UNCORRECTABLE 
ERROR_UNSUPPORTED_LIMIT Result = C.CUDA_ERROR_UNSUPPORTED_LIMIT ERROR_CONTEXT_ALREADY_IN_USE Result = C.CUDA_ERROR_CONTEXT_ALREADY_IN_USE ERROR_INVALID_SOURCE Result = C.CUDA_ERROR_INVALID_SOURCE ERROR_FILE_NOT_FOUND Result = C.CUDA_ERROR_FILE_NOT_FOUND ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND Result = C.CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND ERROR_SHARED_OBJECT_INIT_FAILED Result = C.CUDA_ERROR_SHARED_OBJECT_INIT_FAILED ERROR_OPERATING_SYSTEM Result = C.CUDA_ERROR_OPERATING_SYSTEM ERROR_INVALID_HANDLE Result = C.CUDA_ERROR_INVALID_HANDLE ERROR_NOT_FOUND Result = C.CUDA_ERROR_NOT_FOUND ERROR_NOT_READY Result = C.CUDA_ERROR_NOT_READY ERROR_LAUNCH_FAILED Result = C.CUDA_ERROR_LAUNCH_FAILED ERROR_LAUNCH_OUT_OF_RESOURCES Result = C.CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES ERROR_LAUNCH_TIMEOUT Result = C.CUDA_ERROR_LAUNCH_TIMEOUT ERROR_LAUNCH_INCOMPATIBLE_TEXTURING Result = C.CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING ERROR_PEER_ACCESS_ALREADY_ENABLED Result = C.CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED ERROR_PEER_ACCESS_NOT_ENABLED Result = C.CUDA_ERROR_PEER_ACCESS_NOT_ENABLED ERROR_PRIMARY_CONTEXT_ACTIVE Result = C.CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE ERROR_CONTEXT_IS_DESTROYED Result = C.CUDA_ERROR_CONTEXT_IS_DESTROYED ERROR_ASSERT Result = C.CUDA_ERROR_ASSERT ERROR_TOO_MANY_PEERS Result = C.CUDA_ERROR_TOO_MANY_PEERS ERROR_HOST_MEMORY_ALREADY_REGISTERED Result = C.CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED ERROR_HOST_MEMORY_NOT_REGISTERED Result = C.CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED ERROR_UNKNOWN Result = C.CUDA_ERROR_UNKNOWN ) func StreamQuery(stream Stream) Result Returns Success if all operations have completed, ErrorNotReady otherwise func (err Result) String() string Message string for the error type Stream uintptr CUDA stream. 
func StreamCreate() Stream Creates an asynchronous stream func (stream *Stream) Destroy() Destroys the asynchronous stream func (stream Stream) Query() Result Returns Success if all operations have completed, ErrorNotReady otherwise func (stream Stream) Synchronize() Blocks until the stream has completed. ================================================ FILE: cuda/cgoflags.go ================================================ package cuda // This file provides CGO flags. import "C" //#cgo LDFLAGS:-lcudart // ////default location: //#cgo LDFLAGS:-L/usr/local/cuda/lib64 -L/usr/local/cuda/lib //#cgo CFLAGS: -I/usr/local/cuda/include/ // ////default location if not properly symlinked: //#cgo LDFLAGS:-L/usr/local/cuda-6.0/lib64 -L/usr/local/cuda-6.0/lib //#cgo LDFLAGS:-L/usr/local/cuda-5.5/lib64 -L/usr/local/cuda-5.5/lib //#cgo LDFLAGS:-L/usr/local/cuda-5.0/lib64 -L/usr/local/cuda-5.0/lib //#cgo CFLAGS: -I/usr/local/cuda-6.0/include/ //#cgo CFLAGS: -I/usr/local/cuda-5.5/include/ //#cgo CFLAGS: -I/usr/local/cuda-5.0/include/ // ////arch linux: //#cgo LDFLAGS:-L/opt/cuda/lib64 -L/opt/cuda/lib //#cgo CFLAGS: -I/opt/cuda/include // ////WINDOWS: //#cgo windows LDFLAGS:-LC:/cuda/v5.0/lib/x64 -LC:/cuda/v5.5/lib/x64 -LC:/cuda/v6.0/lib/x64 //#cgo windows CFLAGS: -IC:/cuda/v5.0/include -IC:/cuda/v5.5/include -IC:/cuda/v6.0/include import "C" ================================================ FILE: cuda/device.go ================================================ package cuda //#include //#include import "C" import ( "github.com/barnex/cuda5/cu" ) // Reset the current GPU device. func DeviceReset() { err := cu.Result(C.cudaDeviceReset()) if err != cu.SUCCESS { panic(err) } } // Set preference for more cache or shared memory. func DeviceSetCacheConfig(cacheConfig FuncCache) { err := cu.Result(C.cudaDeviceSetCacheConfig(uint32(cacheConfig))) if err != cu.SUCCESS { panic(err) } } // Cache preference option. 
type FuncCache int const ( FUNC_CACHE_PREFER_NONE FuncCache = C.CU_FUNC_CACHE_PREFER_NONE FUNC_CACHE_PREFER_SHARED FuncCache = C.CU_FUNC_CACHE_PREFER_SHARED FUNC_CACHE_PREFER_L1 FuncCache = C.CU_FUNC_CACHE_PREFER_L1 FUNC_CACHE_PREFER_EQUAL FuncCache = C.CU_FUNC_CACHE_PREFER_EQUAL ) ================================================ FILE: cufft/Makefile ================================================ all: 6g gccgo doc 6g: go install -v go tool vet *.go gofmt -w *.go GCCGO=gccgo -gccgoflags '-static-libgcc -O3' gccgo: go build -v -compiler $(GCCGO) test: 6gtest gccgotest 6gtest: go test gccgotest: go test -compiler $(GCCGO) bench: 6gbench gccgobench 6gbench: go test -bench=. gccgobench: go test -bench=. -compiler $(GCCGO) clean: go clean doc: godoc github.com/barnex/cuda5/cufft > README ================================================ FILE: cufft/README ================================================ PACKAGE DOCUMENTATION package cufft import "github.com/barnex/cuda5/cufft" Go bindings for the CUDA CUFFT API. CONSTANTS const ( FORWARD = -1 // Forward FFT INVERSE = 1 // Inverse FFT ) TYPES type CompatibilityMode int CUFFT compatibility mode const ( COMPATIBILITY_NATIVE CompatibilityMode = C.CUFFT_COMPATIBILITY_NATIVE COMPATIBILITY_FFTW_PADDING CompatibilityMode = C.CUFFT_COMPATIBILITY_FFTW_PADDING COMPATIBILITY_FFTW_ASYMMETRIC CompatibilityMode = C.CUFFT_COMPATIBILITY_FFTW_ASYMMETRIC COMPATIBILITY_FFTW_ALL CompatibilityMode = C.CUFFT_COMPATIBILITY_FFTW_ALL ) func (t CompatibilityMode) String() string type Handle uintptr FFT plan handle, reference type to a plan func Plan1d(nx int, typ Type, batch int) Handle 1D FFT plan func Plan2d(nx, ny int, typ Type) Handle 2D FFT plan func Plan3d(nx, ny, nz int, typ Type) Handle 3D FFT plan func PlanMany(n []int, inembed []int, istride int, oembed []int, ostride int, typ Type, batch int) Handle 1D,2D or 3D FFT plan func (plan *Handle) Destroy() Destroys the plan. 
func (plan Handle) ExecC2C(idata, odata cu.DevicePtr, direction int) Execute Complex-to-Complex plan func (plan Handle) ExecC2R(idata, odata cu.DevicePtr) Execute Complex-to-Real plan func (plan Handle) ExecD2Z(idata, odata cu.DevicePtr) Execute Double Real-to-Complex plan func (plan Handle) ExecR2C(idata, odata cu.DevicePtr) Execute Real-to-Complex plan func (plan Handle) ExecZ2D(idata, odata cu.DevicePtr) Execute Double Complex-to-Real plan func (plan Handle) ExecZ2Z(idata, odata cu.DevicePtr, direction int) Execute Double Complex-to-Complex plan func (plan Handle) SetCompatibilityMode(mode CompatibilityMode) Sets the FFTW compatibility mode func (plan Handle) SetStream(stream cu.Stream) Sets the cuda stream for this plan type Result int FFT result const ( SUCCESS Result = C.CUFFT_SUCCESS INVALID_PLAN Result = C.CUFFT_INVALID_PLAN ALLOC_FAILED Result = C.CUFFT_ALLOC_FAILED INVALID_TYPE Result = C.CUFFT_INVALID_TYPE INVALID_VALUE Result = C.CUFFT_INVALID_VALUE INTERNAL_ERROR Result = C.CUFFT_INTERNAL_ERROR EXEC_FAILED Result = C.CUFFT_EXEC_FAILED SETUP_FAILED Result = C.CUFFT_SETUP_FAILED INVALID_SIZE Result = C.CUFFT_INVALID_SIZE UNALIGNED_DATA Result = C.CUFFT_UNALIGNED_DATA ) FFT result value func (r Result) String() string type Type int FFT type const ( R2C Type = C.CUFFT_R2C // Real to Complex (interleaved) C2R Type = C.CUFFT_C2R // Complex (interleaved) to Real C2C Type = C.CUFFT_C2C // Complex to Complex, interleaved D2Z Type = C.CUFFT_D2Z // Double to Double-Complex Z2D Type = C.CUFFT_Z2D // Double-Complex to Double Z2Z Type = C.CUFFT_Z2Z // Double-Complex to Double-Complex ) func (t Type) String() string ================================================ FILE: cufft/cgoflags.go ================================================ package cufft // This file provides CGO flags to find CUDA libraries and headers. 
//#cgo LDFLAGS:-lcufft // ////default location: //#cgo LDFLAGS:-L/usr/local/cuda/lib64 -L/usr/local/cuda/lib //#cgo CFLAGS: -I/usr/local/cuda/include/ // ////default location if not properly symlinked: //#cgo LDFLAGS:-L/usr/local/cuda-6.0/lib64 -L/usr/local/cuda-6.0/lib //#cgo LDFLAGS:-L/usr/local/cuda-5.5/lib64 -L/usr/local/cuda-5.5/lib //#cgo LDFLAGS:-L/usr/local/cuda-5.0/lib64 -L/usr/local/cuda-5.0/lib //#cgo CFLAGS: -I/usr/local/cuda-6.0/include/ //#cgo CFLAGS: -I/usr/local/cuda-5.5/include/ //#cgo CFLAGS: -I/usr/local/cuda-5.0/include/ // ////arch linux: //#cgo LDFLAGS:-L/opt/cuda/lib64 -L/opt/cuda/lib //#cgo CFLAGS: -I/opt/cuda/include // ////WINDOWS: //#cgo windows LDFLAGS:-LC:/cuda/v5.0/lib/x64 -LC:/cuda/v5.5/lib/x64 -LC:/cuda/v6.0/lib/x64 //#cgo windows CFLAGS: -IC:/cuda/v5.0/include -IC:/cuda/v5.5/include -IC:/cuda/v6.0/include -w import "C" ================================================ FILE: cufft/doc.go ================================================ // Go bindings for the CUDA CUFFT API. 
package cufft ================================================ FILE: cufft/fft_test.go ================================================ package cufft import ( "fmt" "github.com/barnex/cuda5/cu" "unsafe" ) func ExampleFFT1D() { N := 8 hostIn := make([]float32, N) hostIn[0] = 1 devIn := cu.MemAlloc(int64(len(hostIn)) * cu.SIZEOF_FLOAT32) defer cu.MemFree(&devIn) cu.MemcpyHtoD(devIn, unsafe.Pointer(&hostIn[0]), devIn.Bytes()) hostOut := make([]complex64, N/2+1) devOut := cu.MemAlloc(int64(len(hostOut)) * cu.SIZEOF_COMPLEX64) defer cu.MemFree(&devOut) plan := Plan1d(N, R2C, 1) defer plan.Destroy() plan.ExecR2C(devIn, devOut) cu.MemcpyDtoH(unsafe.Pointer(&hostOut[0]), devOut, devOut.Bytes()) fmt.Println("hostIn:", hostIn) fmt.Println("hostOut:", hostOut) // Output: // hostIn: [1 0 0 0 0 0 0 0] // hostOut: [(1+0i) (+1+0i) (+1+0i) (+1-0i) (+1+0i)] } ================================================ FILE: cufft/init_test.go ================================================ package cufft import ( "fmt" "github.com/barnex/cuda5/cu" ) // needed for all other tests. 
func init() { cu.Init(0) ctx := cu.CtxCreate(cu.CTX_SCHED_AUTO, 0) cu.CtxSetCurrent(ctx) fmt.Println("Created CUDA context") } ================================================ FILE: cufft/mode.go ================================================ package cufft //#include import "C" import ( "fmt" ) // CUFFT compatibility mode type CompatibilityMode int const ( COMPATIBILITY_NATIVE CompatibilityMode = C.CUFFT_COMPATIBILITY_NATIVE COMPATIBILITY_FFTW_PADDING CompatibilityMode = C.CUFFT_COMPATIBILITY_FFTW_PADDING COMPATIBILITY_FFTW_ASYMMETRIC CompatibilityMode = C.CUFFT_COMPATIBILITY_FFTW_ASYMMETRIC COMPATIBILITY_FFTW_ALL CompatibilityMode = C.CUFFT_COMPATIBILITY_FFTW_ALL ) func (t CompatibilityMode) String() string { if str, ok := compatibilityModeString[t]; ok { return str } return fmt.Sprint("CUFFT Compatibility mode with unknown number:", int(t)) } var compatibilityModeString map[CompatibilityMode]string = map[CompatibilityMode]string{ COMPATIBILITY_NATIVE: "CUFFT_COMPATIBILITY_NATIVE", COMPATIBILITY_FFTW_PADDING: "CUFFT_COMPATIBILITY_FFTW_PADDING", COMPATIBILITY_FFTW_ASYMMETRIC: "CUFFT_COMPATIBILITY_FFTW_ASYMMETRIC", COMPATIBILITY_FFTW_ALL: "CUFFT_COMPATIBILITY_FFTW_ALL"} ================================================ FILE: cufft/plan.go ================================================ // Copyright 2011 Arne Vansteenkiste (barnex@gmail.com). All rights reserved. // Use of this source code is governed by a freeBSD // license that can be found in the LICENSE.txt file. 
package cufft //#include import "C" import ( "github.com/barnex/cuda5/cu" "unsafe" ) // FFT plan handle, reference type to a plan type Handle uintptr // 1D FFT plan func Plan1d(nx int, typ Type, batch int) Handle { var handle C.cufftHandle err := Result(C.cufftPlan1d( &handle, C.int(nx), C.cufftType(typ), C.int(batch))) if err != SUCCESS { panic(err) } return Handle(handle) } // 2D FFT plan func Plan2d(nx, ny int, typ Type) Handle { var handle C.cufftHandle err := Result(C.cufftPlan2d( &handle, C.int(nx), C.int(ny), C.cufftType(typ))) if err != SUCCESS { panic(err) } return Handle(handle) } // 3D FFT plan func Plan3d(nx, ny, nz int, typ Type) Handle { var handle C.cufftHandle err := Result(C.cufftPlan3d( &handle, C.int(nx), C.int(ny), C.int(nz), C.cufftType(typ))) if err != SUCCESS { panic(err) } return Handle(handle) } //cufftPlanMany( // cufftHandle *plan, int rank, int *n, int *inembed, // int istride, int idist, int *onembed, int ostride, // int odist, cufftType type, int batch ); // 1D,2D or 3D FFT plan func PlanMany(n []int, inembed []int, istride int, oembed []int, ostride int, typ Type, batch int) Handle { var handle C.cufftHandle NULL := (*C.int)(unsafe.Pointer(uintptr(0))) inembedptr := NULL idist := 0 if inembed != nil { inembedptr = (*C.int)(unsafe.Pointer(&inembed[0])) idist = inembed[0] } oembedptr := NULL odist := 0 if oembed != nil { oembedptr = (*C.int)(unsafe.Pointer(&oembed[0])) odist = oembed[0] } err := Result(C.cufftPlanMany( &handle, C.int(len(n)), // rank (*C.int)(unsafe.Pointer(&n[0])), // n inembedptr, C.int(istride), C.int(idist), oembedptr, C.int(ostride), C.int(odist), C.cufftType(typ), C.int(batch))) if err != SUCCESS { panic(err) } return Handle(handle) } // Execute Complex-to-Complex plan func (plan Handle) ExecC2C(idata, odata cu.DevicePtr, direction int) { err := Result(C.cufftExecC2C( C.cufftHandle(plan), (*C.cufftComplex)(unsafe.Pointer(uintptr(idata))), (*C.cufftComplex)(unsafe.Pointer(uintptr(odata))), C.int(direction))) if err 
!= SUCCESS { panic(err) } } // Execute Real-to-Complex plan func (plan Handle) ExecR2C(idata, odata cu.DevicePtr) { err := Result(C.cufftExecR2C( C.cufftHandle(plan), (*C.cufftReal)(unsafe.Pointer(uintptr(idata))), (*C.cufftComplex)(unsafe.Pointer(uintptr(odata))))) if err != SUCCESS { panic(err) } } // Execute Complex-to-Real plan func (plan Handle) ExecC2R(idata, odata cu.DevicePtr) { err := Result(C.cufftExecC2R( C.cufftHandle(plan), (*C.cufftComplex)(unsafe.Pointer(uintptr(idata))), (*C.cufftReal)(unsafe.Pointer(uintptr(odata))))) if err != SUCCESS { panic(err) } } // Execute Double Complex-to-Complex plan func (plan Handle) ExecZ2Z(idata, odata cu.DevicePtr, direction int) { err := Result(C.cufftExecZ2Z( C.cufftHandle(plan), (*C.cufftDoubleComplex)(unsafe.Pointer(uintptr(idata))), (*C.cufftDoubleComplex)(unsafe.Pointer(uintptr(odata))), C.int(direction))) if err != SUCCESS { panic(err) } } // Execute Double Real-to-Complex plan func (plan Handle) ExecD2Z(idata, odata cu.DevicePtr) { err := Result(C.cufftExecD2Z( C.cufftHandle(plan), (*C.cufftDoubleReal)(unsafe.Pointer(uintptr(idata))), (*C.cufftDoubleComplex)(unsafe.Pointer(uintptr(odata))))) if err != SUCCESS { panic(err) } } // Execute Double Complex-to-Real plan func (plan Handle) ExecZ2D(idata, odata cu.DevicePtr) { err := Result(C.cufftExecZ2D( C.cufftHandle(plan), (*C.cufftDoubleComplex)(unsafe.Pointer(uintptr(idata))), (*C.cufftDoubleReal)(unsafe.Pointer(uintptr(odata))))) if err != SUCCESS { panic(err) } } // Destroys the plan. 
func (plan *Handle) Destroy() { err := Result(C.cufftDestroy(C.cufftHandle(*plan))) *plan = 0 // make sure plan is not used anymore if err != SUCCESS { panic(err) } } // Sets the cuda stream for this plan func (plan Handle) SetStream(stream cu.Stream) { err := Result(C.cufftSetStream( C.cufftHandle(plan), C.cudaStream_t(unsafe.Pointer(uintptr(stream))))) if err != SUCCESS { panic(err) } } // Sets the FFTW compatibility mode func (plan Handle) SetCompatibilityMode(mode CompatibilityMode) { err := Result(C.cufftSetCompatibilityMode( C.cufftHandle(plan), C.cufftCompatibility(mode))) if err != SUCCESS { panic(err) } } ================================================ FILE: cufft/result.go ================================================ package cufft //#include import "C" import ( "fmt" ) // FFT result type Result int // FFT result value const ( SUCCESS Result = C.CUFFT_SUCCESS INVALID_PLAN Result = C.CUFFT_INVALID_PLAN ALLOC_FAILED Result = C.CUFFT_ALLOC_FAILED INVALID_TYPE Result = C.CUFFT_INVALID_TYPE INVALID_VALUE Result = C.CUFFT_INVALID_VALUE INTERNAL_ERROR Result = C.CUFFT_INTERNAL_ERROR EXEC_FAILED Result = C.CUFFT_EXEC_FAILED SETUP_FAILED Result = C.CUFFT_SETUP_FAILED INVALID_SIZE Result = C.CUFFT_INVALID_SIZE UNALIGNED_DATA Result = C.CUFFT_UNALIGNED_DATA INCOMPLETE_PARAMETER_LIST Result = 0xA // cuda6 values copied to avoid dependency on cuda6/cufft.h INVALID_DEVICE Result = 0xB PARSE_ERROR Result = 0xC NO_WORKSPACE Result = 0xD ) func (r Result) String() string { if str, ok := resultString[r]; ok { return str } return fmt.Sprint("CUFFT Result with unknown error number:", int(r)) } var resultString map[Result]string = map[Result]string{ SUCCESS: "CUFFT_SUCCESS", INVALID_PLAN: "CUFFT_INVALID_PLAN", ALLOC_FAILED: "CUFFT_ALLOC_FAILED", INVALID_TYPE: "CUFFT_INVALID_TYPE", INVALID_VALUE: "CUFFT_INVALID_VALUE", INTERNAL_ERROR: "CUFFT_INTERNAL_ERROR", EXEC_FAILED: "CUFFT_EXEC_FAILED", SETUP_FAILED: "CUFFT_SETUP_FAILED", INVALID_SIZE: "CUFFT_INVALID_SIZE", 
UNALIGNED_DATA: "CUFFT_UNALIGNED_DATA", INCOMPLETE_PARAMETER_LIST: "CUFFT_INCOMPLETE_PARAMETER_LIST", INVALID_DEVICE: "CUFFT_INVALID_DEVICE", PARSE_ERROR: "CUFFT_PARSE_ERROR", NO_WORKSPACE: "CUFFT_NO_WORKSPACE"} ================================================ FILE: cufft/type.go ================================================ package cufft //#include import "C" import ( "fmt" ) // FFT type type Type int const ( R2C Type = C.CUFFT_R2C // Real to Complex (interleaved) C2R Type = C.CUFFT_C2R // Complex (interleaved) to Real C2C Type = C.CUFFT_C2C // Complex to Complex, interleaved D2Z Type = C.CUFFT_D2Z // Double to Double-Complex Z2D Type = C.CUFFT_Z2D // Double-Complex to Double Z2Z Type = C.CUFFT_Z2Z // Double-Complex to Double-Complex ) const ( FORWARD = -1 // Forward FFT INVERSE = 1 // Inverse FFT ) func (t Type) String() string { if str, ok := typeString[t]; ok { return str } return fmt.Sprint("CUFFT Type with unknown number:", int(t)) } var typeString map[Type]string = map[Type]string{ R2C: "CUFFT_R2C", C2R: "CUFFT_C2R", C2C: "CUFFT_C2C", D2Z: "CUFFT_D2Z", Z2D: "CUFFT_Z2D", Z2Z: "CUFFT_Z2Z"} ================================================ FILE: curand/Makefile ================================================ all: 6g gccgo doc 6g: go install -v go tool vet *.go gofmt -w *.go GCCGO=gccgo -gccgoflags '-static-libgcc -O3' gccgo: go build -v -compiler $(GCCGO) test: 6gtest gccgotest 6gtest: go test gccgotest: go test -compiler $(GCCGO) bench: 6gbench gccgobench 6gbench: go test -bench=. gccgobench: go test -bench=. 
-compiler $(GCCGO) clean: go clean doc: godoc github.com/barnex/cuda5/curand > README ================================================ FILE: curand/README ================================================ PACKAGE DOCUMENTATION package curand import "github.com/barnex/cuda5/curand" TYPES type Generator uintptr func CreateGenerator(rngType RngType) Generator func (g Generator) GenerateNormal(output uintptr, n int64, mean, stddev float32) func (g Generator) SetSeed(seed int64) type RngType int const ( PSEUDO_DEFAULT RngType = C.CURAND_RNG_PSEUDO_DEFAULT // Default pseudorandom generator PSEUDO_XORWOW RngType = C.CURAND_RNG_PSEUDO_XORWOW // XORWOW pseudorandom generator QUASI_DEFAULT RngType = C.CURAND_RNG_QUASI_DEFAULT // Default quasirandom generator QUASI_SOBOL32 RngType = C.CURAND_RNG_QUASI_SOBOL32 // Sobol32 quasirandom generator QUASI_SCRAMBLED_SOBOL32 RngType = C.CURAND_RNG_QUASI_SCRAMBLED_SOBOL32 // Scrambled Sobol32 quasirandom generator QUASI_SOBOL64 RngType = C.CURAND_RNG_QUASI_SOBOL64 // Sobol64 quasirandom generator QUASI_SCRAMBLED_SOBOL64 RngType = C.CURAND_RNG_QUASI_SCRAMBLED_SOBOL64 // Scrambled Sobol64 quasirandom generator ) type Status int const ( SUCCESS Status = C.CURAND_STATUS_SUCCESS // No errors VERSION_MISMATCH Status = C.CURAND_STATUS_VERSION_MISMATCH // Header file and linked library version do not match NOT_INITIALIZED Status = C.CURAND_STATUS_NOT_INITIALIZED // Generator not initialized ALLOCATION_FAILED Status = C.CURAND_STATUS_ALLOCATION_FAILED // Memory allocation failed TYPE_ERROR Status = C.CURAND_STATUS_TYPE_ERROR // Generator is wrong type OUT_OF_RANGE Status = C.CURAND_STATUS_OUT_OF_RANGE // Argument out of range LENGTH_NOT_MULTIPLE Status = C.CURAND_STATUS_LENGTH_NOT_MULTIPLE // Length requested is not a multple of dimension LAUNCH_FAILURE Status = C.CURAND_STATUS_LAUNCH_FAILURE // Kernel launch failure PREEXISTING_FAILURE Status = C.CURAND_STATUS_PREEXISTING_FAILURE // Preexisting failure on library entry INITIALIZATION_FAILED 
Status = C.CURAND_STATUS_INITIALIZATION_FAILED // Initialization of CUDA failed ARCH_MISMATCH Status = C.CURAND_STATUS_ARCH_MISMATCH // Architecture mismatch, GPU does not support requested feature INTERNAL_ERROR Status = C.CURAND_STATUS_INTERNAL_ERROR // Internal library error ) ================================================ FILE: curand/cgoflags.go ================================================ package curand // This file provides CGO flags to find CUDA libraries and headers. //#cgo LDFLAGS:-lcurand // ////default location: //#cgo LDFLAGS:-L/usr/local/cuda/lib64 -L/usr/local/cuda/lib //#cgo CFLAGS: -I/usr/local/cuda/include/ // ////default location if not properly symlinked: //#cgo LDFLAGS:-L/usr/local/cuda-6.0/lib64 -L/usr/local/cuda-6.0/lib //#cgo LDFLAGS:-L/usr/local/cuda-5.5/lib64 -L/usr/local/cuda-5.5/lib //#cgo LDFLAGS:-L/usr/local/cuda-5.0/lib64 -L/usr/local/cuda-5.0/lib //#cgo CFLAGS: -I/usr/local/cuda-6.0/include/ //#cgo CFLAGS: -I/usr/local/cuda-5.5/include/ //#cgo CFLAGS: -I/usr/local/cuda-5.0/include/ // ////arch linux: //#cgo LDFLAGS:-L/opt/cuda/lib64 -L/opt/cuda/lib //#cgo CFLAGS: -I/opt/cuda/include // ////WINDOWS: //#cgo windows LDFLAGS:-LC:/cuda/v5.0/lib/x64 -LC:/cuda/v5.5/lib/x64 -LC:/cuda/v6.0/lib/x64 //#cgo windows CFLAGS: -IC:/cuda/v5.0/include -IC:/cuda/v5.5/include -IC:/cuda/v6.0/include -w import "C" ================================================ FILE: curand/generator.go ================================================ package curand //#include import "C" import ( "unsafe" ) type Generator uintptr type RngType int const ( PSEUDO_DEFAULT RngType = C.CURAND_RNG_PSEUDO_DEFAULT // Default pseudorandom generator PSEUDO_XORWOW RngType = C.CURAND_RNG_PSEUDO_XORWOW // XORWOW pseudorandom generator QUASI_DEFAULT RngType = C.CURAND_RNG_QUASI_DEFAULT // Default quasirandom generator QUASI_SOBOL32 RngType = C.CURAND_RNG_QUASI_SOBOL32 // Sobol32 quasirandom generator QUASI_SCRAMBLED_SOBOL32 RngType = C.CURAND_RNG_QUASI_SCRAMBLED_SOBOL32 // 
Scrambled Sobol32 quasirandom generator QUASI_SOBOL64 RngType = C.CURAND_RNG_QUASI_SOBOL64 // Sobol64 quasirandom generator QUASI_SCRAMBLED_SOBOL64 RngType = C.CURAND_RNG_QUASI_SCRAMBLED_SOBOL64 // Scrambled Sobol64 quasirandom generator ) func CreateGenerator(rngType RngType) Generator { var rng C.curandGenerator_t err := Status(C.curandCreateGenerator(&rng, C.curandRngType_t(rngType))) if err != SUCCESS { panic(err) } return Generator(uintptr(unsafe.Pointer(rng))) // cgo } func (g Generator) GenerateNormal(output uintptr, n int64, mean, stddev float32) { err := Status(C.curandGenerateNormal( C.curandGenerator_t(unsafe.Pointer(uintptr(g))), (*C.float)(unsafe.Pointer(output)), C.size_t(n), C.float(mean), C.float(stddev))) if err != SUCCESS { panic(err) } } func (g Generator) SetSeed(seed int64) { err := Status(C.curandSetPseudoRandomGeneratorSeed(C.curandGenerator_t(unsafe.Pointer(uintptr(g))), _Ctype_ulonglong(seed))) if err != SUCCESS { panic(err) } } // Documentation was taken from the curand headers. 
================================================ FILE: curand/status.go ================================================ package curand //#include import "C" import ( "fmt" ) type Status int const ( SUCCESS Status = C.CURAND_STATUS_SUCCESS // No errors VERSION_MISMATCH Status = C.CURAND_STATUS_VERSION_MISMATCH // Header file and linked library version do not match NOT_INITIALIZED Status = C.CURAND_STATUS_NOT_INITIALIZED // Generator not initialized ALLOCATION_FAILED Status = C.CURAND_STATUS_ALLOCATION_FAILED // Memory allocation failed TYPE_ERROR Status = C.CURAND_STATUS_TYPE_ERROR // Generator is wrong type OUT_OF_RANGE Status = C.CURAND_STATUS_OUT_OF_RANGE // Argument out of range LENGTH_NOT_MULTIPLE Status = C.CURAND_STATUS_LENGTH_NOT_MULTIPLE // Length requested is not a multple of dimension LAUNCH_FAILURE Status = C.CURAND_STATUS_LAUNCH_FAILURE // Kernel launch failure PREEXISTING_FAILURE Status = C.CURAND_STATUS_PREEXISTING_FAILURE // Preexisting failure on library entry INITIALIZATION_FAILED Status = C.CURAND_STATUS_INITIALIZATION_FAILED // Initialization of CUDA failed ARCH_MISMATCH Status = C.CURAND_STATUS_ARCH_MISMATCH // Architecture mismatch, GPU does not support requested feature INTERNAL_ERROR Status = C.CURAND_STATUS_INTERNAL_ERROR // Internal library error ) func (s Status) String() string { if str, ok := statusStr[s]; ok { return str } else { return fmt.Sprint("CURAND ERROR NUMBER ", int(s)) } } var statusStr = map[Status]string{ SUCCESS: "CURAND_STATUS_SUCCESS", VERSION_MISMATCH: "CURAND_STATUS_VERSION_MISMATCH", NOT_INITIALIZED: "CURAND_STATUS_NOT_INITIALIZED", ALLOCATION_FAILED: "CURAND_STATUS_ALLOCATION_FAILED", TYPE_ERROR: "CURAND_STATUS_TYPE_ERROR", OUT_OF_RANGE: "CURAND_STATUS_OUT_OF_RANGE", LENGTH_NOT_MULTIPLE: "CURAND_STATUS_LENGTH_NOT_MULTIPLE", LAUNCH_FAILURE: "CURAND_STATUS_LAUNCH_FAILURE", PREEXISTING_FAILURE: "CURAND_STATUS_PREEXISTING_FAILURE", INITIALIZATION_FAILED: "CURAND_STATUS_INITIALIZATION_FAILED", ARCH_MISMATCH: 
"CURAND_STATUS_ARCH_MISMATCH", INTERNAL_ERROR: "CURAND_STATUS_INTERNAL_ERROR", } // Documentation was taken from the curand headers. ================================================ FILE: doc.go ================================================ /* Go bindings for nVIDIA CUDA 5. This package compiles with both gc and gccgo. */ package cuda5 // Dummy imports so that // go get github.com/barnex/cuda5 // will install everything. import ( _ "github.com/barnex/cuda5/cu" _ "github.com/barnex/cuda5/cufft" _ "github.com/barnex/cuda5/safe" ) ================================================ FILE: safe/Makefile ================================================ all: 6g doc #gccgo 6g: go install -v go tool vet *.go gofmt -w *.go GCCGO=gccgo -gccgoflags '-static-libgcc -O3' gccgo: go build -v -compiler $(GCCGO) test: 6gtest gccgotest 6gtest: go test gccgotest: go test -compiler $(GCCGO) bench: 6gbench gccgobench 6gbench: go test -bench=. gccgobench: go test -bench=. -compiler $(GCCGO) clean: go clean go-optview -c -w *.go gofmt -w *.go opt: go-optview -w *.go gofmt -w *.go doc: godoc github.com/barnex/cuda5/safe > README ================================================ FILE: safe/README ================================================ PACKAGE package safe import "github.com/barnex/cuda5/safe" Safe and more idiomatic wrappers for the low-level CUDA functions. FUNCTIONS func InitCuda() TYPES type Complex128s struct { // contains filtered or unexported fields } Slice of complex128's on the GPU. func MakeComplex128s(len_ int) Complex128s Make a slice of complex128's on the GPU. Initialized to zero. func (s *Complex128s) Cap() int Slice capacity. func (dst Complex128s) CopyDtoD(src Complex128s) Copy src on host to dst on host. func (dst Complex128s) CopyDtoDAsync(src Complex128s, stream cu.Stream) Copy src on host to dst on host, asynchronously. func (src Complex128s) CopyDtoH(dst []complex128) Copy src form device to dst on host. 
func (src Complex128s) CopyDtoHAsync(dst []complex128, stream cu.Stream) Copy src from device to dst on host, asynchronously.
func (dst Complex64s) CopyHtoD(src []complex64) Copy src from host to dst on the device. func (dst Complex64s) CopyHtoDAsync(src []complex64, stream cu.Stream) Copy src from host to dst on the device, asynchronously. func (s Complex64s) Float() Float32s Re-interpret the array as float numbers, in interleaved format. Underlying storage is shared. func (s *Complex64s) Free() Free the underlying storage. To be used with care. Free() should only be called on a slice created by MakeXXX(), not on a slice created by x.Slice(). Freeing a slice invalidates all other slices referring to it. func (src Complex64s) Host() []complex64 Returns a fresh copy on host. func (s *Complex64s) Len() int Slice length (number of elements). func (s *Complex64s) Pointer() cu.DevicePtr Pointer to the first element. func (s Complex64s) Slice(start, stop int) Complex64s Return a slice from start (inclusive) to stop (exclusive), sharing the underlying storage with the original slice. Slices obtained in this way should not be Free()'d func (s *Complex64s) UnsafeSet(pointer unsafe.Pointer, length, capacity int) Manually set the pointer, length and capacity. Side-steps the security mechanisms, use with caution. type FFT1DC2RPlan struct { // contains filtered or unexported fields } 1D single-precission complex-to-real FFT plan. func FFT1DC2R(size, batch int) FFT1DC2RPlan 1D single-precission complex-to-real FFT plan. func (p FFT1DC2RPlan) Destroy() Releases all resources associated with the FFT plan. func (p FFT1DC2RPlan) Exec(src Complex64s, dst Float32s) Execute the FFT plan. Synchronized. func (p FFT1DC2RPlan) InputLen() int Required length of the output array. func (p FFT1DC2RPlan) OutputLen() int Required length of the input array. func (p FFT1DC2RPlan) SetStream(stream cu.Stream) Associates a CUDA stream with the FFT plan. If a stream is set, plan.Stream().Synchronize() can to be called to wait for the execution to finish. 
func (s FFT1DC2RPlan) Size() int
    Returns the logical size of the FFT: the number of elements (real or
    complex) it transforms.

func (p FFT1DC2RPlan) Stream() cu.Stream
    Returns the CUDA stream associated with the FFT plan.

type FFT1DR2CPlan struct {
    // contains filtered or unexported fields
}
    1D single-precision real-to-complex FFT plan.

func FFT1DR2C(size, batch int) FFT1DR2CPlan
    1D single-precision real-to-complex FFT plan.

func (p FFT1DR2CPlan) Destroy()
    Releases all resources associated with the FFT plan.

func (p FFT1DR2CPlan) Exec(src Float32s, dst Complex64s)
    Execute the FFT plan. Synchronized.

func (p FFT1DR2CPlan) InputLen() int
    Required length of the input array.

func (p FFT1DR2CPlan) OutputLen() int
    Required length of the output array.

func (p FFT1DR2CPlan) SetStream(stream cu.Stream)
    Associates a CUDA stream with the FFT plan. If a stream is set,
    plan.Stream().Synchronize() can then be called to wait for the execution
    to finish.

func (s FFT1DR2CPlan) Size() int
    Returns the logical size of the FFT: the number of elements (real or
    complex) it transforms.

func (p FFT1DR2CPlan) Stream() cu.Stream
    Returns the CUDA stream associated with the FFT plan.

type FFT3DC2RPlan struct {
    // contains filtered or unexported fields
}
    3D single-precision complex-to-real FFT plan.

func FFT3DC2R(Nx, Ny, Nz int) FFT3DC2RPlan
    3D single-precision complex-to-real FFT plan.

func (p FFT3DC2RPlan) Destroy()
    Releases all resources associated with the FFT plan.

func (p FFT3DC2RPlan) Exec(src Complex64s, dst Float32s)
    Execute the FFT plan. src and dst are 3D arrays stored as 1D arrays.

func (p FFT3DC2RPlan) InputLen() int
    Required length of the (1D) input array.

func (p FFT3DC2RPlan) InputSize() (Nx, Ny, Nz int)
    3D size of the input array.

func (p FFT3DC2RPlan) OutputLen() int
    Required length of the (1D) output array.

func (p FFT3DC2RPlan) OutputSize() (Nx, Ny, Nz int)
    3D size of the output array.
func (p FFT3DC2RPlan) SetStream(stream cu.Stream)
    Associates a CUDA stream with the FFT plan. If a stream is set,
    plan.Stream().Synchronize() can then be called to wait for the execution
    to finish.

func (s FFT3DC2RPlan) Size() (Nx, Ny, Nz int)
    Returns the logical size of the FFT: the number of elements (real or
    complex) it transforms.

func (p FFT3DC2RPlan) Stream() cu.Stream
    Returns the CUDA stream associated with the FFT plan.

type FFT3DD2ZPlan struct {
    // contains filtered or unexported fields
}
    3D double-precision real-to-complex FFT plan.

func FFT3DD2Z(Nx, Ny, Nz int) FFT3DD2ZPlan
    3D double-precision real-to-complex FFT plan.

func (p FFT3DD2ZPlan) Destroy()
    Releases all resources associated with the FFT plan.

func (p FFT3DD2ZPlan) Exec(src Float64s, dst Complex128s)
    Execute the FFT plan. Synchronized. src and dst are 3D arrays stored as
    1D arrays.

func (p FFT3DD2ZPlan) InputLen() int
    Required length of the (1D) input array.

func (p FFT3DD2ZPlan) InputSize() (Nx, Ny, Nz int)
    3D size of the input array.

func (p FFT3DD2ZPlan) OutputLen() int
    Required length of the (1D) output array.

func (p FFT3DD2ZPlan) OutputSize() (Nx, Ny, Nz int)
    3D size of the output array.

func (p FFT3DD2ZPlan) SetStream(stream cu.Stream)
    Associates a CUDA stream with the FFT plan. If a stream is set,
    plan.Stream().Synchronize() can then be called to wait for the execution
    to finish.

func (s FFT3DD2ZPlan) Size() (Nx, Ny, Nz int)
    Returns the logical size of the FFT: the number of elements (real or
    complex) it transforms.

func (p FFT3DD2ZPlan) Stream() cu.Stream
    Returns the CUDA stream associated with the FFT plan.

type FFT3DR2CPlan struct {
    // contains filtered or unexported fields
}
    3D single-precision real-to-complex FFT plan.

func FFT3DR2C(Nx, Ny, Nz int) FFT3DR2CPlan
    3D single-precision real-to-complex FFT plan.

func (p FFT3DR2CPlan) Destroy()
    Releases all resources associated with the FFT plan.

func (p FFT3DR2CPlan) Exec(src Float32s, dst Complex64s)
    Execute the FFT plan. Synchronized.
    src and dst are 3D arrays stored as 1D arrays.

func (p FFT3DR2CPlan) InputLen() int
    Required length of the (1D) input array.

func (p FFT3DR2CPlan) InputSize() (Nx, Ny, Nz int)
    3D size of the input array.

func (p FFT3DR2CPlan) OutputLen() int
    Required length of the (1D) output array.

func (p FFT3DR2CPlan) OutputSize() (Nx, Ny, Nz int)
    3D size of the output array.

func (p FFT3DR2CPlan) SetStream(stream cu.Stream)
    Associates a CUDA stream with the FFT plan. If a stream is set,
    plan.Stream().Synchronize() can then be called to wait for the execution
    to finish.

func (s FFT3DR2CPlan) Size() (Nx, Ny, Nz int)
    Returns the logical size of the FFT: the number of elements (real or
    complex) it transforms.

func (p FFT3DR2CPlan) Stream() cu.Stream
    Returns the CUDA stream associated with the FFT plan.

type FFT3DZ2DPlan struct {
    // contains filtered or unexported fields
}
    3D double-precision complex-to-real FFT plan.

func FFT3DZ2D(Nx, Ny, Nz int) FFT3DZ2DPlan
    3D double-precision complex-to-real FFT plan.

func (p FFT3DZ2DPlan) Destroy()
    Releases all resources associated with the FFT plan.

func (p FFT3DZ2DPlan) Exec(src Complex128s, dst Float64s)
    Execute the FFT plan. Synchronized. src and dst are 3D arrays stored as
    1D arrays.

func (p FFT3DZ2DPlan) InputLen() int
    Required length of the (1D) input array.

func (p FFT3DZ2DPlan) InputSize() (Nx, Ny, Nz int)
    3D size of the input array.

func (p FFT3DZ2DPlan) OutputLen() int
    Required length of the (1D) output array.

func (p FFT3DZ2DPlan) OutputSize() (Nx, Ny, Nz int)
    3D size of the output array.

func (p FFT3DZ2DPlan) SetStream(stream cu.Stream)
    Associates a CUDA stream with the FFT plan. If a stream is set,
    plan.Stream().Synchronize() can then be called to wait for the execution
    to finish.

func (s FFT3DZ2DPlan) Size() (Nx, Ny, Nz int)
    Returns the logical size of the FFT: the number of elements (real or
    complex) it transforms.

func (p FFT3DZ2DPlan) Stream() cu.Stream
    Returns the CUDA stream associated with the FFT plan.
type Float32s struct {
    // contains filtered or unexported fields
}
    Slice of float32's on the GPU.

func MakeFloat32s(len_ int) Float32s
    Make a slice of float32's on the GPU. Initialized to zero.

func (s *Float32s) Cap() int
    Slice capacity.

func (s Float32s) Complex() Complex64s
    Re-interpret the array as complex numbers, in interleaved format.
    Underlying storage is shared.

func (dst Float32s) CopyDtoD(src Float32s)
    Copy src on device to dst on device.

func (dst Float32s) CopyDtoDAsync(src Float32s, stream cu.Stream)
    Copy src on device to dst on device, asynchronously.

func (src Float32s) CopyDtoH(dst []float32)
    Copy src from device to dst on host.

func (src Float32s) CopyDtoHAsync(dst []float32, stream cu.Stream)
    Copy src from device to dst on host, asynchronously.

func (dst Float32s) CopyHtoD(src []float32)
    Copy src from host to dst on the device.

func (dst Float32s) CopyHtoDAsync(src []float32, stream cu.Stream)
    Copy src from host to dst on the device, asynchronously.

func (s *Float32s) Free()
    Free the underlying storage. To be used with care. Free() should only be
    called on a slice created by MakeXXX(), not on a slice created by
    x.Slice(). Freeing a slice invalidates all other slices referring to it.

func (src Float32s) Host() []float32
    Returns a fresh copy on host.

func (s *Float32s) Len() int
    Slice length (number of elements).

func (s Float32s) Memset(value float32)
    Set the entire slice to this value.

func (s Float32s) MemsetAsync(value float32, stream cu.Stream)
    Set the entire slice to this value, asynchronously.

func (s *Float32s) Pointer() cu.DevicePtr
    Pointer to the first element.

func (s Float32s) Slice(start, stop int) Float32s
    Return a slice from start (inclusive) to stop (exclusive), sharing the
    underlying storage with the original slice. Slices obtained in this way
    should not be Free()'d

func (s *Float32s) UnsafeSet(pointer unsafe.Pointer, length, capacity int)
    Manually set the pointer, length and capacity. Side-steps the security
    mechanisms, use with caution.
type Float64s struct {
    // contains filtered or unexported fields
}
    Slice of float64's on the GPU.

func MakeFloat64s(len_ int) Float64s
    Make a slice of float64's on the GPU. Initialized to zero.

func (s *Float64s) Cap() int
    Slice capacity.

func (s Float64s) Complex() Complex128s
    Re-interpret the array as complex numbers, in interleaved format.
    Underlying storage is shared.

func (dst Float64s) CopyDtoD(src Float64s)
    Copy src on device to dst on device.

func (dst Float64s) CopyDtoDAsync(src Float64s, stream cu.Stream)
    Copy src on device to dst on device, asynchronously.

func (src Float64s) CopyDtoH(dst []float64)
    Copy src from device to dst on host.

func (src Float64s) CopyDtoHAsync(dst []float64, stream cu.Stream)
    Copy src from device to dst on host, asynchronously.

func (dst Float64s) CopyHtoD(src []float64)
    Copy src from host to dst on the device.

func (dst Float64s) CopyHtoDAsync(src []float64, stream cu.Stream)
    Copy src from host to dst on the device, asynchronously.

func (s *Float64s) Free()
    Free the underlying storage. To be used with care. Free() should only be
    called on a slice created by MakeXXX(), not on a slice created by
    x.Slice(). Freeing a slice invalidates all other slices referring to it.

func (src Float64s) Host() []float64
    Returns a fresh copy on host.

func (s *Float64s) Len() int
    Slice length (number of elements).

func (s *Float64s) Pointer() cu.DevicePtr
    Pointer to the first element.

func (s Float64s) Slice(start, stop int) Float64s
    Return a slice from start (inclusive) to stop (exclusive), sharing the
    underlying storage with the original slice. Slices obtained in this way
    should not be Free()'d

func (s *Float64s) UnsafeSet(pointer unsafe.Pointer, length, capacity int)
    Manually set the pointer, length and capacity. Side-steps the security
    mechanisms, use with caution.
================================================
FILE: safe/complex128s.go
================================================
package safe

import (
	"github.com/barnex/cuda5/cu"
	"unsafe"
)

// Slice of complex128's on the GPU.
type Complex128s struct{ slice }

// Make a slice of complex128's on the GPU.
// Initialized to zero.
func MakeComplex128s(len_ int) Complex128s {
	return Complex128s{makeslice(len_, cu.SIZEOF_COMPLEX128)}
}

// Return a slice from start (inclusive) to stop (exclusive),
// sharing the underlying storage with the original slice.
// Slices obtained in this way should not be Free()'d
func (s Complex128s) Slice(start, stop int) Complex128s {
	return Complex128s{s.slice.slice(start, stop, cu.SIZEOF_COMPLEX128)}
}

// Copy src from host to dst on the device.
func (dst Complex128s) CopyHtoD(src []complex128) {
	dst.copyHtoD(unsafe.Pointer(&src[0]), len(src), cu.SIZEOF_COMPLEX128)
}

// Copy src from device to dst on host.
func (src Complex128s) CopyDtoH(dst []complex128) {
	src.copyDtoH(unsafe.Pointer(&dst[0]), len(dst), cu.SIZEOF_COMPLEX128)
}

// Copy src on device to dst on device.
func (dst Complex128s) CopyDtoD(src Complex128s) {
	dst.copyDtoD(&src.slice, cu.SIZEOF_COMPLEX128)
}

// Copy src from host to dst on the device, asynchronously.
func (dst Complex128s) CopyHtoDAsync(src []complex128, stream cu.Stream) {
	dst.copyHtoDAsync(unsafe.Pointer(&src[0]), len(src), cu.SIZEOF_COMPLEX128, stream)
}

// Copy src from device to dst on host, asynchronously.
func (src Complex128s) CopyDtoHAsync(dst []complex128, stream cu.Stream) {
	src.copyDtoHAsync(unsafe.Pointer(&dst[0]), len(dst), cu.SIZEOF_COMPLEX128, stream)
}

// Copy src on device to dst on device, asynchronously.
func (dst Complex128s) CopyDtoDAsync(src Complex128s, stream cu.Stream) {
	dst.copyDtoDAsync(&src.slice, cu.SIZEOF_COMPLEX128, stream)
}

// Returns a fresh copy on host.
func (src Complex128s) Host() []complex128 {
	cpy := make([]complex128, src.Len())
	src.CopyDtoH(cpy)
	return cpy
}

// Re-interpret the array as float numbers,
// in interleaved format. Underlying storage
// is shared.
func (s Complex128s) Float() Float64s {
	return Float64s{slice{s.ptr_, s.len_ * 2, s.cap_ * 2}}
}

================================================
FILE: safe/complex128s_test.go
================================================
package safe

import (
	"reflect"
	"testing"
)

func TestComplex128sSlice(test *testing.T) {
	InitCuda()
	a := MakeComplex128s(100)
	defer a.Free()
	if !reflect.DeepEqual(a.Host(), make([]complex128, 100)) {
		test.Error(a.Host())
	}
	b := make([]complex128, 100)
	if a.Len() != len(b) {
		test.Error("len:", a.Len(), "!=", cap(b))
	}
	if a.Cap() != cap(b) {
		test.Error("cap:", a.Cap(), "!=", cap(b))
	}
	c := a.Slice(20, 30)
	d := b[20:30]
	if c.Len() != len(d) {
		test.Error("sliced len:", c.Len(), "!=", cap(d))
	}
	if c.Cap() != cap(d) {
		test.Error("sliced cap:", c.Cap(), "!=", cap(d))
	}
	e := a.Slice(0, 50)
	f := b[0:50]
	if e.Len() != len(f) {
		test.Error("sliced len:", e.Len(), "!=", cap(f))
	}
	if e.Cap() != cap(f) {
		test.Error("sliced cap:", e.Cap(), "!=", cap(f))
	}
}

func TestComplex128sPanic1(test *testing.T) {
	InitCuda()
	defer func() {
		err := recover()
		test.Log("recovered:", err)
		if err == nil {
			test.Fail()
		}
	}()
	a := MakeComplex128s(100)
	defer a.Free()
	a.Slice(-1, 10)
}

func TestComplex128sPanic2(test *testing.T) {
	InitCuda()
	defer func() {
		err := recover()
		test.Log("recovered:", err)
		if err == nil {
			test.Fail()
		}
	}()
	a := MakeComplex128s(100)
	defer a.Free()
	a.Slice(0, 101)
}

func TestComplex128sCopy(test *testing.T) {
	InitCuda()
	a := make([]complex128, 100)
	b := MakeComplex128s(100)
	defer b.Free()
	c := MakeComplex128s(100)
	defer c.Free()
	d := make([]complex128, 200)
	for i := range a {
		a[i] = complex(float64(i), float64(2*i))
	}
	b.CopyHtoD(a)
	c.CopyDtoD(b)
	c.CopyDtoH(d[:100])
	if !reflect.DeepEqual(a, d[:100]) {
		test.Error(d)
	}
	if !reflect.DeepEqual(d[100:], make([]complex128, 100)) {
		test.Error(d)
	}
}

================================================
FILE: safe/complex64s.go
================================================
package safe

import (
	"github.com/barnex/cuda5/cu"
	"unsafe"
)

// Slice of complex64's on the GPU.
type Complex64s struct{ slice }

// Make a slice of complex64's on the GPU.
// Initialized to zero.
func MakeComplex64s(len_ int) Complex64s {
	return Complex64s{makeslice(len_, cu.SIZEOF_COMPLEX64)}
}

// Return a slice from start (inclusive) to stop (exclusive),
// sharing the underlying storage with the original slice.
// Slices obtained in this way should not be Free()'d
func (s Complex64s) Slice(start, stop int) Complex64s {
	return Complex64s{s.slice.slice(start, stop, cu.SIZEOF_COMPLEX64)}
}

// Copy src from host to dst on the device.
func (dst Complex64s) CopyHtoD(src []complex64) {
	dst.copyHtoD(unsafe.Pointer(&src[0]), len(src), cu.SIZEOF_COMPLEX64)
}

// Copy src from device to dst on host.
func (src Complex64s) CopyDtoH(dst []complex64) {
	src.copyDtoH(unsafe.Pointer(&dst[0]), len(dst), cu.SIZEOF_COMPLEX64)
}

// Copy src on device to dst on device.
func (dst Complex64s) CopyDtoD(src Complex64s) {
	dst.copyDtoD(&src.slice, cu.SIZEOF_COMPLEX64)
}

// Copy src from host to dst on the device, asynchronously.
func (dst Complex64s) CopyHtoDAsync(src []complex64, stream cu.Stream) {
	dst.copyHtoDAsync(unsafe.Pointer(&src[0]), len(src), cu.SIZEOF_COMPLEX64, stream)
}

// Copy src from device to dst on host, asynchronously.
func (src Complex64s) CopyDtoHAsync(dst []complex64, stream cu.Stream) {
	src.copyDtoHAsync(unsafe.Pointer(&dst[0]), len(dst), cu.SIZEOF_COMPLEX64, stream)
}

// Copy src on device to dst on device, asynchronously.
func (dst Complex64s) CopyDtoDAsync(src Complex64s, stream cu.Stream) {
	dst.copyDtoDAsync(&src.slice, cu.SIZEOF_COMPLEX64, stream)
}

// Returns a fresh copy on host.
func (src Complex64s) Host() []complex64 {
	cpy := make([]complex64, src.Len())
	src.CopyDtoH(cpy)
	return cpy
}

// Re-interpret the array as float numbers,
// in interleaved format. Underlying storage
// is shared.
func (s Complex64s) Float() Float32s {
	return Float32s{slice{s.ptr_, s.len_ * 2, s.cap_ * 2}}
}

================================================
FILE: safe/complex64s_test.go
================================================
package safe

import (
	"reflect"
	"testing"
)

func TestComplex64sSlice(test *testing.T) {
	InitCuda()
	a := MakeComplex64s(100)
	defer a.Free()
	if !reflect.DeepEqual(a.Host(), make([]complex64, 100)) {
		test.Error(a.Host())
	}
	b := make([]complex64, 100)
	if a.Len() != len(b) {
		test.Error("len:", a.Len(), "!=", cap(b))
	}
	if a.Cap() != cap(b) {
		test.Error("cap:", a.Cap(), "!=", cap(b))
	}
	c := a.Slice(20, 30)
	d := b[20:30]
	if c.Len() != len(d) {
		test.Error("sliced len:", c.Len(), "!=", cap(d))
	}
	if c.Cap() != cap(d) {
		test.Error("sliced cap:", c.Cap(), "!=", cap(d))
	}
	e := a.Slice(0, 50)
	f := b[0:50]
	if e.Len() != len(f) {
		test.Error("sliced len:", e.Len(), "!=", cap(f))
	}
	if e.Cap() != cap(f) {
		test.Error("sliced cap:", e.Cap(), "!=", cap(f))
	}
}

func TestComplex64sPanic1(test *testing.T) {
	InitCuda()
	defer func() {
		err := recover()
		test.Log("recovered:", err)
		if err == nil {
			test.Fail()
		}
	}()
	a := MakeComplex64s(100)
	defer a.Free()
	a.Slice(-1, 10)
}

func TestComplex64sPanic2(test *testing.T) {
	InitCuda()
	defer func() {
		err := recover()
		test.Log("recovered:", err)
		if err == nil {
			test.Fail()
		}
	}()
	a := MakeComplex64s(100)
	defer a.Free()
	a.Slice(0, 101)
}

func TestComplex64sCopy(test *testing.T) {
	InitCuda()
	a := make([]complex64, 100)
	b := MakeComplex64s(100)
	defer b.Free()
	c := MakeComplex64s(100)
	defer c.Free()
	d := make([]complex64, 200)
	for i := range a {
		a[i] = complex(float32(i), float32(2*i))
	}
	b.CopyHtoD(a)
	c.CopyDtoD(b)
	c.CopyDtoH(d[:100])
	if !reflect.DeepEqual(a, d[:100]) {
		test.Error(d)
	}
	if !reflect.DeepEqual(d[100:], make([]complex64, 100)) {
		test.Error(d)
	}
}

================================================
FILE: safe/doc.go
================================================
/*
Safe and more idiomatic wrappers for the low-level CUDA functions.
*/
package safe

================================================
FILE: safe/fft1d_test.go
================================================
package safe

import (
	"fmt"
)

func ExampleFFT1DR2C() {
	InitCuda()
	N := 8
	batch := 1
	fft := FFT1DR2C(N, batch)
	defer fft.Destroy()
	input := MakeFloat32s(N)
	defer input.Free()
	input.CopyHtoD([]float32{1, 0, 0, 0, 0, 0, 0, 0})
	output := MakeComplex64s(fft.OutputLen())
	defer output.Free()
	fft.Exec(input, output)
	fmt.Println("input:", input.Host())
	fmt.Println("output:", output.Host())

	// Output:
	// input: [1 0 0 0 0 0 0 0]
	// output: [(1+0i) (+1+0i) (+1+0i) (+1-0i) (+1+0i)]
}

func ExampleFFT1DR2C_Inplace() {
	InitCuda()
	N := 8
	batch := 2
	fft := FFT1DR2C(N, batch)
	defer fft.Destroy()
	output := MakeComplex64s(fft.OutputLen())
	defer output.Free()
	input := output.Float().Slice(0, fft.InputLen())
	// input uses same layout as out-of-place transform
	// (CUFFT native layout)
	input.CopyHtoD([]float32{1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0})
	fmt.Println("input:", input.Host())
	fft.Exec(input, output)
	fmt.Println("output:", output.Host())
	inverse := FFT1DC2R(N, batch)
	defer inverse.Destroy()
	inverse.Exec(output, input)
	fmt.Println("input:", input.Host())

	// Output:
	// input: [1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0]
	// output: [(1+0i) (+1+0i) (+1+0i) (+1-0i) (+1+0i) (+1+0i) (+1+0i) (+1+0i) (+1-0i) (+1+0i)]
	// input: [8 0 0 0 0 0 0 0 8 0 0 0 0 0 0 0]
}

func ExampleFFT1DC2R() {
	InitCuda()
	N := 8
	batch := 1
	fft := FFT1DC2R(N, batch)
	defer fft.Destroy()
	input := MakeComplex64s(fft.InputLen())
	defer input.Free()
	input.CopyHtoD([]complex64{(1 + 0i), (+1 + 0i), (+1 + 0i), (+1 - 0i), (+1 + 0i)})
	output := MakeFloat32s(fft.OutputLen())
	defer output.Free()
	fft.Exec(input, output)
	fmt.Println("input:", input.Host())
	fmt.Println("output:", output.Host())

	// Output:
	// input: [(1+0i) (+1+0i) (+1+0i) (+1+0i) (+1+0i)]
	// output: [8 0 0 0 0 0 0 0]
}

================================================
FILE: safe/fft1dc2r.go
================================================
package safe

import (
	"fmt"
	"github.com/barnex/cuda5/cufft"
)

// 1D single-precision complex-to-real FFT plan.
type FFT1DC2RPlan struct {
	fftplan
	size1D
	batch int
}

// 1D single-precision complex-to-real FFT plan.
func FFT1DC2R(size, batch int) FFT1DC2RPlan {
	handle := cufft.Plan1d(size, cufft.C2R, batch)
	handle.SetCompatibilityMode(cufft.COMPATIBILITY_NATIVE)
	return FFT1DC2RPlan{fftplan{handle, 0}, size1D(size), batch}
}

// Execute the FFT plan. Synchronized.
func (p FFT1DC2RPlan) Exec(src Complex64s, dst Float32s) {
	oksrclen := p.InputLen()
	if src.Len() != oksrclen {
		panic(fmt.Errorf("size mismatch: expecting src len %v, got %v", oksrclen, src.Len()))
	}
	okdstlen := p.OutputLen()
	if dst.Len() != okdstlen {
		panic(fmt.Errorf("size mismatch: expecting dst len %v, got %v", okdstlen, dst.Len()))
	}
	p.handle.ExecC2R(src.Pointer(), dst.Pointer())
	p.stream.Synchronize() //!
}

// Required length of the output array.
func (p FFT1DC2RPlan) OutputLen() int {
	return p.batch * p.Size()
}

// Required length of the input array.
func (p FFT1DC2RPlan) InputLen() int {
	return p.batch * (p.Size()/2 + 1)
}

================================================
FILE: safe/fft1dr2c.go
================================================
package safe

import (
	"fmt"
	"github.com/barnex/cuda5/cufft"
)

// 1D single-precision real-to-complex FFT plan.
type FFT1DR2CPlan struct {
	fftplan
	size1D
	batch int
}

// 1D single-precision real-to-complex FFT plan.
func FFT1DR2C(size, batch int) FFT1DR2CPlan {
	handle := cufft.Plan1d(size, cufft.R2C, batch)
	handle.SetCompatibilityMode(cufft.COMPATIBILITY_NATIVE)
	return FFT1DR2CPlan{fftplan{handle, 0}, size1D(size), batch}
}

// Execute the FFT plan. Synchronized.
func (p FFT1DR2CPlan) Exec(src Float32s, dst Complex64s) {
	oksrclen := p.InputLen()
	if src.Len() != oksrclen {
		panic(fmt.Errorf("size mismatch: expecting src len %v, got %v", oksrclen, src.Len()))
	}
	okdstlen := p.OutputLen()
	if dst.Len() != okdstlen {
		panic(fmt.Errorf("size mismatch: expecting dst len %v, got %v", okdstlen, dst.Len()))
	}
	p.handle.ExecR2C(src.Pointer(), dst.Pointer())
	p.stream.Synchronize() //!
}

// Required length of the input array.
func (p FFT1DR2CPlan) InputLen() int {
	return p.batch * p.Size()
}

// Required length of the output array.
func (p FFT1DR2CPlan) OutputLen() int {
	return p.batch * (p.Size()/2 + 1)
}

================================================
FILE: safe/fft3d_test.go
================================================
package safe

import (
	"fmt"
)

func ExampleFFT3DR2C() {
	InitCuda()
	Nx, Ny, Nz := 2, 4, 8
	fft := FFT3DR2C(Nx, Ny, Nz)
	defer fft.Destroy()
	input := MakeFloat32s(fft.InputLen())
	defer input.Free()
	inputData := make([]float32, Nx*Ny*Nz)
	inputData[0*Ny*Nz] = 1
	inputData[1*Ny*Nz] = 1
	input.CopyHtoD(inputData)
	output := MakeComplex64s(fft.OutputLen())
	defer output.Free()
	fft.Exec(input, output)
	fmt.Println("input:", Reshape3DFloat32(input.Host(), Nx, Ny, Nz))
	Ox, Oy, Oz := fft.OutputSize()
	fmt.Println("output:", Reshape3DComplex64(output.Host(), Ox, Oy, Oz))

	// Output:
	// input: [[[1 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]] [[1 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]]]
	// output: [[[(2+0i) (+2+0i) (+2+0i) (+2-0i) (+2+0i)] [(+2+0i) (+2+0i) (+2+0i) (+2-0i) (+2+0i)] [(+2+0i) (+2+0i) (+2+0i) (+2-0i) (+2+0i)] [(+2+0i) (+2+0i) (+2+0i) (+2-0i) (+2+0i)]] [[(+0+0i) (+0+0i) (+0+0i) (+0-0i) (+0+0i)] [(+0+0i) (+0+0i) (+0+0i) (+0-0i) (+0+0i)] [(+0+0i) (+0+0i) (+0+0i) (+0-0i) (+0+0i)] [(+0+0i) (+0+0i) (+0+0i) (+0-0i) (+0+0i)]]]
}

func ExampleFFT3DC2R() {
	InitCuda()
	Nx, Ny, Nz := 2, 4, 8
	fft := FFT3DC2R(Nx, Ny, Nz)
	defer fft.Destroy()
	input := MakeComplex64s(fft.InputLen())
	defer input.Free()
	inputData := make([]complex64, fft.InputLen())
	for i := range inputData {
		inputData[i] = 2
	}
	input.CopyHtoD(inputData)
	output := MakeFloat32s(fft.OutputLen())
	defer output.Free()
	fft.Exec(input, output)
	Ix, Iy, Iz := fft.InputSize()
	fmt.Println("input:", Reshape3DComplex64(input.Host(), Ix, Iy, Iz))
	fmt.Println("output:", Reshape3DFloat32(output.Host(), Nx, Ny, Nz))

	// Output:
	// input: [[[(2+0i) (+2+0i) (+2+0i) (+2+0i) (+2+0i)] [(+2+0i) (+2+0i) (+2+0i) (+2+0i) (+2+0i)] [(+2+0i) (+2+0i) (+2+0i) (+2+0i) (+2+0i)] [(+2+0i) (+2+0i) (+2+0i) (+2+0i) (+2+0i)]] [[(+2+0i) (+2+0i) (+2+0i) (+2+0i) (+2+0i)] [(+2+0i) (+2+0i) (+2+0i) (+2+0i) (+2+0i)] [(+2+0i) (+2+0i) (+2+0i) (+2+0i) (+2+0i)] [(+2+0i) (+2+0i) (+2+0i) (+2+0i) (+2+0i)]]]
	// output: [[[128 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]] [[0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]]]
}

func ExampleFFT3D() {
	InitCuda()
	Nx, Ny, Nz := 2, 4, 8
	forward := FFT3DR2C(Nx, Ny, Nz)
	defer forward.Destroy()
	input := MakeFloat32s(forward.InputLen())
	defer input.Free()
	inputData := make([]float32, forward.InputLen())
	inputData[5] = 1
	input.CopyHtoD(inputData)
	output := MakeComplex64s(forward.OutputLen())
	defer output.Free()
	forward.Exec(input, output)
	backward := FFT3DC2R(Nx, Ny, Nz)
	backward.Exec(output, input)
	fmt.Println("input:", Reshape3DFloat32(inputData, Nx, Ny, Nz))
	fmt.Println("forward+inverse:", Reshape3DFloat32(input.Host(), Nx, Ny, Nz))

	// Output:
	// input: [[[0 0 0 0 0 1 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]] [[0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]]]
	// forward+inverse: [[[0 0 0 0 0 64 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]] [[0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]]]
}

//func ExampleFFT3D64() {
//	InitCuda()
//
//	Nx, Ny, Nz := 2, 4, 8
//
//	forward := FFT3DD2Z(Nx, Ny, Nz)
//	defer forward.Destroy()
//
//	input := MakeFloat64s(forward.InputLen())
//	defer input.Free()
//
//	inputData := make([]float64, forward.InputLen())
//	inputData[5] = 1
//	input.CopyHtoD(inputData)
//
//	output := MakeComplex128s(forward.OutputLen())
//	defer output.Free()
//
//	forward.Exec(input, output)
//
//	backward := FFT3DZ2D(Nx, Ny, Nz)
//	backward.Exec(output, input)
//
//	fmt.Println("input:", Reshape3DFloat64(inputData, Nx, Ny, Nz))
//	fmt.Println("forward+inverse:", Reshape3DFloat64(input.Host(), Nx, Ny, Nz))
//
//	// Output:
//	// input: [[[0 0 0 0 0 1 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]] [[0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]]]
//	// forward+inverse: [[[0 0 0 0 0 64 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]] [[0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]]]
//}

================================================
FILE: safe/fft3dc2r.go
================================================
package safe

import (
	"fmt"
	"github.com/barnex/cuda5/cufft"
)

// 3D single-precision complex-to-real FFT plan.
type FFT3DC2RPlan struct {
	fftplan
	size3D
}

// 3D single-precision complex-to-real FFT plan.
func FFT3DC2R(Nx, Ny, Nz int) FFT3DC2RPlan {
	handle := cufft.Plan3d(Nx, Ny, Nz, cufft.C2R)
	handle.SetCompatibilityMode(cufft.COMPATIBILITY_NATIVE)
	return FFT3DC2RPlan{fftplan{handle, 0}, size3D{Nx, Ny, Nz}}
}

// Execute the FFT plan. Synchronized.
// src and dst are 3D arrays stored as 1D arrays.
func (p FFT3DC2RPlan) Exec(src Complex64s, dst Float32s) {
	oksrclen := p.InputLen()
	if src.Len() != oksrclen {
		panic(fmt.Errorf("size mismatch: expecting src len %v, got %v", oksrclen, src.Len()))
	}
	okdstlen := p.OutputLen()
	if dst.Len() != okdstlen {
		panic(fmt.Errorf("size mismatch: expecting dst len %v, got %v", okdstlen, dst.Len()))
	}
	p.handle.ExecC2R(src.Pointer(), dst.Pointer())
	p.stream.Synchronize() //!
}

// 3D size of the input array.
func (p FFT3DC2RPlan) InputSize() (Nx, Ny, Nz int) {
	return p.size3D[0], p.size3D[1], p.size3D[2]/2 + 1
}

// 3D size of the output array.
func (p FFT3DC2RPlan) OutputSize() (Nx, Ny, Nz int) {
	return p.size3D[0], p.size3D[1], p.size3D[2]
}

// Required length of the (1D) input array.
func (p FFT3DC2RPlan) InputLen() int {
	return prod3(p.InputSize())
}

// Required length of the (1D) output array.
func (p FFT3DC2RPlan) OutputLen() int {
	return prod3(p.OutputSize())
}

================================================
FILE: safe/fft3dd2z.go
================================================
package safe

import (
	"fmt"
	"github.com/barnex/cuda5/cufft"
)

// 3D double-precision real-to-complex FFT plan.
type FFT3DD2ZPlan struct {
	fftplan
	size3D
}

// 3D double-precision real-to-complex FFT plan.
func FFT3DD2Z(Nx, Ny, Nz int) FFT3DD2ZPlan {
	handle := cufft.Plan3d(Nx, Ny, Nz, cufft.D2Z)
	handle.SetCompatibilityMode(cufft.COMPATIBILITY_NATIVE)
	return FFT3DD2ZPlan{fftplan{handle, 0}, size3D{Nx, Ny, Nz}}
}

// Execute the FFT plan. Synchronized.
// src and dst are 3D arrays stored as 1D arrays.
func (p FFT3DD2ZPlan) Exec(src Float64s, dst Complex128s) {
	oksrclen := p.InputLen()
	if src.Len() != oksrclen {
		panic(fmt.Errorf("size mismatch: expecting src len %v, got %v", oksrclen, src.Len()))
	}
	okdstlen := p.OutputLen()
	if dst.Len() != okdstlen {
		panic(fmt.Errorf("size mismatch: expecting dst len %v, got %v", okdstlen, dst.Len()))
	}
	p.handle.ExecD2Z(src.Pointer(), dst.Pointer())
	p.stream.Synchronize() //!
}

// 3D size of the input array.
func (p FFT3DD2ZPlan) InputSize() (Nx, Ny, Nz int) {
	return p.size3D[0], p.size3D[1], p.size3D[2]
}

// 3D size of the output array.
func (p FFT3DD2ZPlan) OutputSize() (Nx, Ny, Nz int) {
	return p.size3D[0], p.size3D[1], p.size3D[2]/2 + 1
}

// Required length of the (1D) input array.
func (p FFT3DD2ZPlan) InputLen() int {
	return prod3(p.InputSize())
}

// Required length of the (1D) output array.
func (p FFT3DD2ZPlan) OutputLen() int {
	return prod3(p.OutputSize())
}

================================================
FILE: safe/fft3dr2c.go
================================================
package safe

import (
	"fmt"
	"github.com/barnex/cuda5/cufft"
)

// 3D single-precision real-to-complex FFT plan.
type FFT3DR2CPlan struct {
	fftplan
	size3D
}

// 3D single-precision real-to-complex FFT plan.
func FFT3DR2C(Nx, Ny, Nz int) FFT3DR2CPlan {
	handle := cufft.Plan3d(Nx, Ny, Nz, cufft.R2C)
	handle.SetCompatibilityMode(cufft.COMPATIBILITY_NATIVE)
	return FFT3DR2CPlan{fftplan{handle, 0}, size3D{Nx, Ny, Nz}}
}

// Execute the FFT plan. Synchronized.
// src and dst are 3D arrays stored as 1D arrays.
func (p FFT3DR2CPlan) Exec(src Float32s, dst Complex64s) {
	oksrclen := p.InputLen()
	if src.Len() != oksrclen {
		panic(fmt.Errorf("size mismatch: expecting src len %v, got %v", oksrclen, src.Len()))
	}
	okdstlen := p.OutputLen()
	if dst.Len() != okdstlen {
		panic(fmt.Errorf("size mismatch: expecting dst len %v, got %v", okdstlen, dst.Len()))
	}
	p.handle.ExecR2C(src.Pointer(), dst.Pointer())
	p.stream.Synchronize() //!
}

// 3D size of the input array.
func (p FFT3DR2CPlan) InputSize() (Nx, Ny, Nz int) {
	return p.size3D[0], p.size3D[1], p.size3D[2]
}

// 3D size of the output array.
func (p FFT3DR2CPlan) OutputSize() (Nx, Ny, Nz int) {
	return p.size3D[0], p.size3D[1], p.size3D[2]/2 + 1
}

// Required length of the (1D) input array.
func (p FFT3DR2CPlan) InputLen() int {
	return prod3(p.InputSize())
}

// Required length of the (1D) output array.
func (p FFT3DR2CPlan) OutputLen() int {
	return prod3(p.OutputSize())
}

================================================
FILE: safe/fft3dz2d.go
================================================
package safe

import (
	"fmt"
	"github.com/barnex/cuda5/cufft"
)

// 3D double-precision complex-to-real FFT plan.
type FFT3DZ2DPlan struct {
	fftplan
	size3D
}

// 3D double-precision complex-to-real FFT plan.
func FFT3DZ2D(Nx, Ny, Nz int) FFT3DZ2DPlan {
	handle := cufft.Plan3d(Nx, Ny, Nz, cufft.Z2D)
	handle.SetCompatibilityMode(cufft.COMPATIBILITY_NATIVE)
	return FFT3DZ2DPlan{fftplan{handle, 0}, size3D{Nx, Ny, Nz}}
}

// Execute the FFT plan. Synchronized.
// src and dst are 3D arrays stored as 1D arrays.
func (p FFT3DZ2DPlan) Exec(src Complex128s, dst Float64s) {
	oksrclen := p.InputLen()
	if src.Len() != oksrclen {
		panic(fmt.Errorf("size mismatch: expecting src len %v, got %v", oksrclen, src.Len()))
	}
	okdstlen := p.OutputLen()
	if dst.Len() != okdstlen {
		panic(fmt.Errorf("size mismatch: expecting dst len %v, got %v", okdstlen, dst.Len()))
	}
	p.handle.ExecZ2D(src.Pointer(), dst.Pointer())
	p.stream.Synchronize() //!
}

// 3D size of the input array.
func (p FFT3DZ2DPlan) InputSize() (Nx, Ny, Nz int) {
	return p.size3D[0], p.size3D[1], p.size3D[2]/2 + 1
}

// 3D size of the output array.
func (p FFT3DZ2DPlan) OutputSize() (Nx, Ny, Nz int) {
	return p.size3D[0], p.size3D[1], p.size3D[2]
}

// Required length of the (1D) input array.
func (p FFT3DZ2DPlan) InputLen() int {
	return prod3(p.InputSize())
}

// Required length of the (1D) output array.
func (p FFT3DZ2DPlan) OutputLen() int {
	return prod3(p.OutputSize())
}

================================================
FILE: safe/fftplan.go
================================================
package safe

// INTERNAL
// Base implementation for all FFT plans.

import (
	"github.com/barnex/cuda5/cu"
	"github.com/barnex/cuda5/cufft"
)

// Base implementation for all FFT plans.
type fftplan struct {
	handle cufft.Handle
	stream cu.Stream
}

// For the sake of embedding.
type size1D int

// Returns the logical size of the FFT:
// the number of elements (real or complex)
// it transforms.
func (s size1D) Size() int { return int(s) }

// For the sake of embedding.
type size3D [3]int

// Returns the logical size of the FFT:
// the number of elements (real or complex)
// it transforms.
func (s size3D) Size() (Nx, Ny, Nz int) { return s[0], s[1], s[2] }

// prod3 returns x*y*z: the number of elements in an x by y by z array.
func prod3(x, y, z int) int { return x * y * z }

// Releases all resources associated with the FFT plan.
func (p fftplan) Destroy() { p.handle.Destroy() }

// Associates a CUDA stream with the FFT plan.
// If a stream is set, plan.Stream().Synchronize() can
// be called to wait for the execution to finish.
func (p fftplan) SetStream(stream cu.Stream) {
	p.handle.SetStream(stream)
	p.stream = stream
}

// Returns the CUDA stream associated with the FFT plan.
func (p fftplan) Stream() cu.Stream { return p.stream }

================================================ FILE: safe/float32s.go ================================================

package safe

import (
	"fmt"
	"github.com/barnex/cuda5/cu"
	"math"
	"unsafe"
)

// Slice of float32's on the GPU.
type Float32s struct{ slice }

// Make a slice of float32's on the GPU.
// Initialized to zero.
func MakeFloat32s(len_ int) Float32s { return Float32s{makeslice(len_, cu.SIZEOF_FLOAT32)} }

// Return a slice from start (inclusive) to stop (exclusive),
// sharing the underlying storage with the original slice.
// Slices obtained in this way should not be Free()'d.
func (s Float32s) Slice(start, stop int) Float32s {
	return Float32s{s.slice.slice(start, stop, cu.SIZEOF_FLOAT32)}
}

// Copy src from host to dst on the device.
func (dst Float32s) CopyHtoD(src []float32) {
	dst.copyHtoD(unsafe.Pointer(&src[0]), len(src), cu.SIZEOF_FLOAT32)
}

// Copy src from device to dst on the host.
func (src Float32s) CopyDtoH(dst []float32) {
	src.copyDtoH(unsafe.Pointer(&dst[0]), len(dst), cu.SIZEOF_FLOAT32)
}

// Copy src to dst, both on the device.
func (dst Float32s) CopyDtoD(src Float32s) { dst.copyDtoD(&src.slice, cu.SIZEOF_FLOAT32) }

// Copy src from host to dst on the device, asynchronously.
func (dst Float32s) CopyHtoDAsync(src []float32, stream cu.Stream) {
	dst.copyHtoDAsync(unsafe.Pointer(&src[0]), len(src), cu.SIZEOF_FLOAT32, stream)
}

// Copy src from device to dst on the host, asynchronously.
func (src Float32s) CopyDtoHAsync(dst []float32, stream cu.Stream) {
	src.copyDtoHAsync(unsafe.Pointer(&dst[0]), len(dst), cu.SIZEOF_FLOAT32, stream)
}

// Copy src to dst, both on the device, asynchronously.
func (dst Float32s) CopyDtoDAsync(src Float32s, stream cu.Stream) {
	dst.copyDtoDAsync(&src.slice, cu.SIZEOF_FLOAT32, stream)
}

// Returns a fresh copy on host.
func (src Float32s) Host() []float32 {
	cpy := make([]float32, src.Len())
	src.CopyDtoH(cpy)
	return cpy
}

// Set the entire slice to this value. Synchronized.
func (s Float32s) Memset(value float32) {
	// MemsetD32 writes 32-bit words, so pass the raw bit pattern of value.
	cu.MemsetD32(s.Pointer(), math.Float32bits(value), int64(s.Len()))
	cu.CtxSynchronize()
}

// Set the entire slice to this value, asynchronously.
func (s Float32s) MemsetAsync(value float32, stream cu.Stream) {
	cu.MemsetD32Async(s.Pointer(), math.Float32bits(value), int64(s.Len()), stream)
}

// Re-interpret the array as complex numbers,
// in interleaved format. Underlying storage
// is shared.
func (s Float32s) Complex() Complex64s { if s.Len()%2 != 0 { panic(fmt.Errorf("complex: need even number of elements, have:%v", s.Len())) } return Complex64s{slice{s.ptr_, s.len_ / 2, s.cap_ / 2}} } ================================================ FILE: safe/float32s_test.go ================================================ package safe import ( "reflect" "testing" ) func TestFloat32sSlice(test *testing.T) { InitCuda() a := MakeFloat32s(100) defer a.Free() if !reflect.DeepEqual(a.Host(), make([]float32, 100)) { test.Error(a.Host()) } b := make([]float32, 100) if a.Len() != len(b) { test.Error("len:", a.Len(), "!=", cap(b)) } if a.Cap() != cap(b) { test.Error("cap:", a.Cap(), "!=", cap(b)) } c := a.Slice(20, 30) d := b[20:30] if c.Len() != len(d) { test.Error("sliced len:", c.Len(), "!=", cap(d)) } if c.Cap() != cap(d) { test.Error("sliced cap:", c.Cap(), "!=", cap(d)) } e := a.Slice(0, 50) f := b[0:50] if e.Len() != len(f) { test.Error("sliced len:", e.Len(), "!=", cap(f)) } if e.Cap() != cap(f) { test.Error("sliced cap:", e.Cap(), "!=", cap(f)) } } func TestFloat32sPanic1(test *testing.T) { InitCuda() defer func() { err := recover() test.Log("recovered:", err) if err == nil { test.Fail() } }() a := MakeFloat32s(100) defer a.Free() a.Slice(-1, 10) } func TestFloat32sPanic2(test *testing.T) { InitCuda() defer func() { err := recover() test.Log("recovered:", err) if err == nil { test.Fail() } }() a := MakeFloat32s(100) defer a.Free() a.Slice(0, 101) } func TestFloat32sCopy(test *testing.T) { InitCuda() a := make([]float32, 100) b := MakeFloat32s(100) defer b.Free() c := MakeFloat32s(100) defer c.Free() d := make([]float32, 200) for i := range a { a[i] = float32(i) } b.CopyHtoD(a) c.CopyDtoD(b) c.CopyDtoH(d[:100]) if !reflect.DeepEqual(a, d[:100]) { test.Error(d) } if !reflect.DeepEqual(d[100:], make([]float32, 100)) { test.Error(d) } } ================================================ FILE: safe/float64s.go ================================================ package safe 
import (
	"fmt"
	"github.com/barnex/cuda5/cu"
	"unsafe"
)

// Slice of float64's on the GPU.
type Float64s struct{ slice }

// Make a slice of float64's on the GPU.
// Initialized to zero.
func MakeFloat64s(len_ int) Float64s { return Float64s{makeslice(len_, cu.SIZEOF_FLOAT64)} }

// Return a slice from start (inclusive) to stop (exclusive),
// sharing the underlying storage with the original slice.
// Slices obtained in this way should not be Free()'d.
func (s Float64s) Slice(start, stop int) Float64s {
	return Float64s{s.slice.slice(start, stop, cu.SIZEOF_FLOAT64)}
}

// Copy src from host to dst on the device.
func (dst Float64s) CopyHtoD(src []float64) {
	dst.copyHtoD(unsafe.Pointer(&src[0]), len(src), cu.SIZEOF_FLOAT64)
}

// Copy src from device to dst on the host.
func (src Float64s) CopyDtoH(dst []float64) {
	src.copyDtoH(unsafe.Pointer(&dst[0]), len(dst), cu.SIZEOF_FLOAT64)
}

// Copy src to dst, both on the device.
func (dst Float64s) CopyDtoD(src Float64s) { dst.copyDtoD(&src.slice, cu.SIZEOF_FLOAT64) }

// Copy src from host to dst on the device, asynchronously.
func (dst Float64s) CopyHtoDAsync(src []float64, stream cu.Stream) {
	dst.copyHtoDAsync(unsafe.Pointer(&src[0]), len(src), cu.SIZEOF_FLOAT64, stream)
}

// Copy src from device to dst on the host, asynchronously.
func (src Float64s) CopyDtoHAsync(dst []float64, stream cu.Stream) {
	src.copyDtoHAsync(unsafe.Pointer(&dst[0]), len(dst), cu.SIZEOF_FLOAT64, stream)
}

// Copy src to dst, both on the device, asynchronously.
func (dst Float64s) CopyDtoDAsync(src Float64s, stream cu.Stream) {
	dst.copyDtoDAsync(&src.slice, cu.SIZEOF_FLOAT64, stream)
}

// Returns a fresh copy on host.
func (src Float64s) Host() []float64 {
	cpy := make([]float64, src.Len())
	src.CopyDtoH(cpy)
	return cpy
}

// Re-interpret the array as complex numbers,
// in interleaved format. Underlying storage
// is shared.
func (s Float64s) Complex() Complex128s { if s.Len()%2 != 0 { panic(fmt.Errorf("complex: need even number of elements, have:%v", s.Len())) } return Complex128s{slice{s.ptr_, s.len_ / 2, s.cap_ / 2}} } ================================================ FILE: safe/float64s_test.go ================================================ package safe import ( "reflect" "testing" ) func TestFloat64sSlice(test *testing.T) { InitCuda() a := MakeFloat64s(100) defer a.Free() if !reflect.DeepEqual(a.Host(), make([]float64, 100)) { test.Error(a.Host()) } b := make([]float64, 100) if a.Len() != len(b) { test.Error("len:", a.Len(), "!=", cap(b)) } if a.Cap() != cap(b) { test.Error("cap:", a.Cap(), "!=", cap(b)) } c := a.Slice(20, 30) d := b[20:30] if c.Len() != len(d) { test.Error("sliced len:", c.Len(), "!=", cap(d)) } if c.Cap() != cap(d) { test.Error("sliced cap:", c.Cap(), "!=", cap(d)) } e := a.Slice(0, 50) f := b[0:50] if e.Len() != len(f) { test.Error("sliced len:", e.Len(), "!=", cap(f)) } if e.Cap() != cap(f) { test.Error("sliced cap:", e.Cap(), "!=", cap(f)) } } func TestFloat64sPanic1(test *testing.T) { InitCuda() defer func() { err := recover() test.Log("recovered:", err) if err == nil { test.Fail() } }() a := MakeFloat64s(100) defer a.Free() a.Slice(-1, 10) } func TestFloat64sPanic2(test *testing.T) { InitCuda() defer func() { err := recover() test.Log("recovered:", err) if err == nil { test.Fail() } }() a := MakeFloat64s(100) defer a.Free() a.Slice(0, 101) } func TestFloat64sCopy(test *testing.T) { InitCuda() a := make([]float64, 100) b := MakeFloat64s(100) defer b.Free() c := MakeFloat64s(100) defer c.Free() d := make([]float64, 200) for i := range a { a[i] = float64(i) } b.CopyHtoD(a) c.CopyDtoD(b) c.CopyDtoH(d[:100]) if !reflect.DeepEqual(a, d[:100]) { test.Error(d) } if !reflect.DeepEqual(d[100:], make([]float64, 100)) { test.Error(d) } } ================================================ FILE: safe/init.go ================================================ package safe 
import (
	"github.com/barnex/cuda5/cu"
	"runtime"
)

// InitCuda initializes the CUDA driver and creates a context
// that is made current on the calling goroutine's OS thread.
func InitCuda() {
	// A CUDA context is bound to an OS thread, so pin this goroutine to it.
	runtime.LockOSThread()
	cu.Init(0)
	cu.CtxCreate(cu.CTX_SCHED_AUTO, 0).SetCurrent()
}

================================================ FILE: safe/slice.go ================================================

package safe

// INTERNAL.
// This file implements common functionality for all slice types
// (Float32s, Float64s, Complex64s, ...).

import (
	"fmt"
	"github.com/barnex/cuda5/cu"
	"unsafe"
)

// internal base func for all makeXXX() functions:
// allocates len_ elements of elemsize bytes on the device,
// zero-initialized. A zero-length slice allocates no device memory.
func makeslice(len_ int, elemsize int) slice {
	bytes := int64(len_) * int64(elemsize)
	s := slice{0, len_, len_}
	if bytes > 0 {
		s.ptr_ = cu.MemAlloc(bytes)
		cu.MemsetD8(s.ptr_, 0, bytes)
		cu.CtxSynchronize() // make sure the zero-fill is done before returning
	}
	return s
}

// internal base type for all slices
type slice struct {
	ptr_ cu.DevicePtr // device address of the first element
	len_ int          // number of elements
	cap_ int          // capacity, in elements
}

// Pointer to the first element.
func (s *slice) Pointer() cu.DevicePtr { return s.ptr_ }

// Slice length (number of elements).
func (s *slice) Len() int { return s.len_ }

// Slice capacity.
func (s *slice) Cap() int { return s.cap_ }

// Free the underlying storage.
// To be used with care. Free() should only be called on
// a slice created by MakeXXX(), not on a slice created
// by x.Slice(). Freeing a slice invalidates all other
// slices referring to it.
func (s *slice) Free() { s.ptr_.Free() s.len_ = 0 s.cap_ = 0 } // internal base func for all slice() functions func (s *slice) slice(start, stop int, elemsize uintptr) slice { if start >= s.cap_ || start < 0 || stop > s.cap_ || stop < 0 { panic("cuda4/safe: slice index out of bounds") } if start > stop { panic("cuda4/safe: inverted slice range") } return slice{cu.DevicePtr(uintptr(s.ptr_) + uintptr(start)*elemsize), stop - start, s.cap_ - start} } func (dst *slice) copyHtoD(src unsafe.Pointer, srclen int, elemsize int) { if srclen != dst.Len() { panic(fmt.Errorf("cuda4/safe: len mismatch: len(src)=%v (host), dst.Len()=%v (device)", srclen, dst.Len())) } cu.MemcpyHtoD(dst.Pointer(), src, int64(elemsize)*int64(srclen)) } func (src *slice) copyDtoH(dst unsafe.Pointer, dstlen int, elemsize int) { if dstlen != src.Len() { panic(fmt.Errorf("cuda4/safe: len mismatch: src.Len()=%v (device), len(dst)=%v (host)", src.Len(), dstlen)) } cu.MemcpyDtoH(dst, src.Pointer(), int64(elemsize)*int64(dstlen)) } func (dst *slice) copyDtoD(src *slice, elemsize int) { if dst.Len() != src.Len() { panic(fmt.Errorf("cuda4/safe: len mismatch: src.Len()=%v (device), dst.Len()=%v", src.Len(), dst.Len())) } cu.MemcpyDtoD(dst.Pointer(), src.Pointer(), int64(elemsize)*int64(dst.Len())) } func (dst *slice) copyHtoDAsync(src unsafe.Pointer, srclen int, elemsize int, stream cu.Stream) { if srclen != dst.Len() { panic(fmt.Errorf("cuda4/safe: len mismatch: len(src)=%v (host), dst.Len()=%v (device)", srclen, dst.Len())) } cu.MemcpyHtoDAsync(dst.Pointer(), src, int64(elemsize)*int64(srclen), stream) } func (src *slice) copyDtoHAsync(dst unsafe.Pointer, dstlen int, elemsize int, stream cu.Stream) { if dstlen != src.Len() { panic(fmt.Errorf("cuda4/safe: len mismatch: src.Len()=%v (device), len(dst)=%v (host)", src.Len(), dstlen)) } cu.MemcpyDtoHAsync(dst, src.Pointer(), int64(elemsize)*int64(dstlen), stream) } func (dst *slice) copyDtoDAsync(src *slice, elemsize int, stream cu.Stream) { if dst.Len() != 
src.Len() { panic(fmt.Errorf("cuda4/safe: len mismatch: src.Len()=%v (device), dst.Len()=%v", src.Len(), dst.Len())) } cu.MemcpyDtoDAsync(dst.Pointer(), src.Pointer(), int64(elemsize)*int64(dst.Len()), stream) } // Manually set the pointer, length and capacity. // Side-steps the security mechanisms, use with caution. func (s *slice) UnsafeSet(pointer unsafe.Pointer, length, capacity int) { s.ptr_ = cu.DevicePtr(uintptr(pointer)) s.len_ = length s.cap_ = capacity } ================================================ FILE: safe/subs.sh ================================================ #! /bin/bash subs32='s/loat32/loat64/g;' subs32+='s/FLOAT32/FLOAT64/g;' #sed $subs32 float32s.go > float64s.go #sed $subs32 float32s_test.go > float64s_test.go subsc64='s/Float32/Complex64/g;' subsc64+='s/float32/complex64/g;' subsc64+='s/FLOAT32/COMPLEX64/g;' #sed $subsc64 float32s_test.go > complex64s_test.go #sed $subsc64 float32s.go > complex64s.go subsc128='s/omplex64/omplex128/g;' subsc128+='s/COMPLEX64/COMPLEX128/g;' sed $subsc128 complex64s.go > complex128s.go sed $subsc128 complex64s_test.go > complex128s_test.go