Repository: barnex/cuda5
Branch: master
Commit: da30a9b287d8
Files: 72
Total size: 172.6 KB
Directory structure:
gitextract_cibr8rm8/
├── .gitignore
├── Makefile
├── README.md
├── cu/
│ ├── Makefile
│ ├── README
│ ├── cgoflags.go
│ ├── context.go
│ ├── context_test.go
│ ├── device.go
│ ├── device_test.go
│ ├── dim3.go
│ ├── doc.go
│ ├── execution.go
│ ├── function.go
│ ├── init.go
│ ├── init_test.go
│ ├── memory.go
│ ├── memory_test.go
│ ├── memset.go
│ ├── module.go
│ ├── module_test.go
│ ├── peer.go
│ ├── result.go
│ ├── runtimeapi.go
│ ├── stream.go
│ ├── testdata/
│ │ ├── testmodule.cu
│ │ └── testmodule.ptx
│ ├── version.go
│ └── version_test.go
├── cuda/
│ ├── Makefile
│ ├── README
│ ├── cgoflags.go
│ └── device.go
├── cufft/
│ ├── Makefile
│ ├── README
│ ├── cgoflags.go
│ ├── doc.go
│ ├── fft_test.go
│ ├── init_test.go
│ ├── mode.go
│ ├── plan.go
│ ├── result.go
│ └── type.go
├── curand/
│ ├── Makefile
│ ├── README
│ ├── cgoflags.go
│ ├── generator.go
│ └── status.go
├── doc.go
└── safe/
├── Makefile
├── README
├── complex128s.go
├── complex128s_test.go
├── complex64s.go
├── complex64s_test.go
├── doc.go
├── fft1d_test.go
├── fft1dc2r.go
├── fft1dr2c.go
├── fft3d_test.go
├── fft3dc2r.go
├── fft3dd2z.go
├── fft3dr2c.go
├── fft3dz2d.go
├── fftplan.go
├── float32s.go
├── float32s_test.go
├── float64s.go
├── float64s_test.go
├── init.go
├── slice.go
└── subs.sh
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitignore
================================================
*.swp
*.{6,8,5,o}
================================================
FILE: Makefile
================================================
# Default target: build with the gc toolchain and regenerate the documentation.
all: 6g doc
# Install the package with the gc toolchain, then vet the sources and
# rewrite them in place with gofmt.
6g:
	go install -v
	go tool vet *.go
	gofmt -w *.go
# Compiler name and flags handed to the go tool's -compiler option for gccgo builds.
GCCGO=gccgo -gccgoflags '-static-libgcc -O3'
# Install the package using gccgo.
gccgo:
	go install -v -compiler $(GCCGO)
# Run the tests with both compilers.
test: 6gtest gccgotest
6gtest:
	go test
gccgotest:
	go test -compiler $(GCCGO)
# Run the benchmarks with both compilers.
bench: 6gbench gccgobench
6gbench:
	go test -bench=.
gccgobench:
	go test -bench=. -compiler $(GCCGO)
# Remove build artifacts, then run go-optview with -c -w over the sources and reformat them.
clean:
	go clean
	go-optview -c -w *.go
	gofmt -w *.go
# Run go-optview with -w over the sources and reformat them.
opt:
	go-optview -w *.go
	gofmt -w *.go
# Dump the godoc text of the top-level package into the file README.
# NOTE(review): the repository root lists README.md, not README -- confirm the intended output file.
doc:
	godoc github.com/barnex/cuda5 > README
================================================
FILE: README.md
================================================
# Go bindings for CUDA
Go bindings for NVIDIA CUDA 5 and later. This package compiles with both gc and gccgo.

================================================
FILE: cu/Makefile
================================================
# Default target: build with both compilers and regenerate the documentation.
all: 6g gccgo doc
# Install the package with the gc toolchain, then vet the sources and
# rewrite them in place with gofmt.
6g:
	go install -v
	go tool vet *.go
	gofmt -w *.go
# Compiler name and flags handed to the go tool's -compiler option for gccgo builds.
GCCGO=gccgo -gccgoflags '-static-libgcc -O3'
# Build (without installing) the package using gccgo.
gccgo:
	go build -v -compiler $(GCCGO)
# Run the tests with both compilers.
test: 6gtest gccgotest
6gtest:
	go test
gccgotest:
	go test -compiler $(GCCGO)
# Run the benchmarks with both compilers.
bench: 6gbench gccgobench
6gbench:
	go test -bench=.
gccgobench:
	go test -bench=. -compiler $(GCCGO)
# Remove build artifacts.
clean:
	go clean
# Dump the godoc text of package cu into the file README.
doc:
	godoc github.com/barnex/cuda5/cu > README
================================================
FILE: cu/README
================================================
PACKAGE
package cu
import "github.com/barnex/cuda5/cu"
Go bindings for the CUDA driver API.
CONSTANTS
const (
    // If the number of contexts > number of CPUs, yield to other OS threads when waiting for the GPU, otherwise CUDA spins on the processor.
CTX_SCHED_AUTO = C.CU_CTX_SCHED_AUTO
// Spin when waiting for results from the GPU.
CTX_SCHED_SPIN = C.CU_CTX_SCHED_SPIN
// Yield its thread when waiting for results from the GPU.
CTX_SCHED_YIELD = C.CU_CTX_SCHED_YIELD
    // Block the CPU thread on a synchronization primitive when waiting for the GPU to finish work.
CTX_BLOCKING_SYNC
// Support mapped pinned allocations. This flag must be set in order to allocate pinned host memory that is accessible to the GPU.
CTX_MAP_HOST = C.CU_CTX_MAP_HOST
//Do not reduce local memory after resizing local memory for a kernel.
CTX_LMEM_RESIZE_TO_MAX = C.CU_CTX_LMEM_RESIZE_TO_MAX
)
Flags for CtxCreate
const (
SIZEOF_FLOAT32 = 4
SIZEOF_FLOAT64 = 8
SIZEOF_COMPLEX64 = 8
SIZEOF_COMPLEX128 = 16
)
Type size in bytes
FUNCTIONS
func CtxDestroy(ctx *Context)
Destroys the CUDA context specified by ctx. If the context usage count
is not equal to 1, or the context is current to any CPU thread other
than the current one, this function fails. Floating contexts (detached
from a CPU thread via cuCtxPopCurrent()) may be destroyed by this
function.
func CtxDisablePeerAccess(peer Context)
Reverses CtxEnablePeerAccess().
func CtxEnablePeerAccess(peer Context)
Make allocations from the peer Context available to the current context.
func CtxGetApiVersion(ctx Context) (version int)
Returns the API version to create the context.
func CtxSetCurrent(ctx Context)
Sets the current active context.
func CtxSynchronize()
Blocks until the device has completed all preceding requested tasks, if
the context was created with the CU_CTX_SCHED_BLOCKING_SYNC flag.
func DeviceCanAccessPeer(dev, peer Device) bool
    Returns true if CtxEnablePeerAccess can be called on a context for dev
    and peer.
func DeviceComputeCapability(device Device) (major, minor int)
Returns the compute capability of the device.
func DeviceGetAttribute(attrib DeviceAttribute, dev Device) int
Gets the value of a device attribute.
func DeviceGetCount() int
Returns the number of devices with compute capability greater than or
equal to 1.0 that are available for execution.
func DeviceGetName(dev Device) string
Gets the name of the device.
func DeviceTotalMem(device Device) int64
Returns the total amount of memory available on the device in bytes.
func FuncGetAttribute(attrib FunctionAttribute, function Function) int
func Init(flags int)
Initialize the CUDA driver API. Currently, flags must be 0. If Init()
has not been called, any function from the driver API will panic with
ERROR_NOT_INITIALIZED.
func LaunchKernel(f Function, gridDimX, gridDimY, gridDimZ int, blockDimX, blockDimY, blockDimZ int, sharedMemBytes int, stream Stream, kernelParams []unsafe.Pointer)
func MemAllocHost(bytes int64) unsafe.Pointer
func MemFree(ptr *DevicePtr)
Frees device memory allocated by MemAlloc(). Overwrites the pointer with
NULL. It is safe to double-free.
func MemFreeHost(ptr unsafe.Pointer)
func MemGetAddressRange(ptr DevicePtr) (bytes int64, base DevicePtr)
Returns the base address and size of the allocation (by MemAlloc) that
contains the input pointer ptr.
func MemGetInfo() (free, total int64)
    Returns the free and total amount of memory in the current Context (in
    bytes).
func MemHostRegister(ptr unsafe.Pointer, bytes int64, flags MemHostRegisterFlag)
Page-locks memory specified by the pointer and bytes. The pointer and
byte size must be aligned to the host page size (4KB) See also:
MemHostUnregister()
func MemHostUnregister(ptr unsafe.Pointer)
Unmaps memory locked by MemHostRegister().
func Memcpy(dst, src DevicePtr, bytes int64)
Copies a number of bytes on the current device. Requires unified
addressing to be supported. See also: MemcpyDtoD(). TODO(a): is actually
an auto copy for device and/or host memory
func MemcpyAsync(dst, src DevicePtr, bytes int64, stream Stream)
Asynchronously copies a number of bytes on the current device.
func MemcpyDtoD(dst, src DevicePtr, bytes int64)
    Copies a number of bytes from device to device.
func MemcpyDtoDAsync(dst, src DevicePtr, bytes int64, stream Stream)
    Asynchronously copies a number of bytes from device to device.
func MemcpyDtoH(dst unsafe.Pointer, src DevicePtr, bytes int64)
Copies a number of bytes from device to host.
func MemcpyDtoHAsync(dst unsafe.Pointer, src DevicePtr, bytes int64, stream Stream)
    Asynchronously copies a number of bytes from device to host. The host
    memory must be page-locked (see MemHostRegister)
func MemcpyHtoD(dst DevicePtr, src unsafe.Pointer, bytes int64)
Copies a number of bytes from host to device.
func MemcpyHtoDAsync(dst DevicePtr, src unsafe.Pointer, bytes int64, stream Stream)
    Asynchronously copies a number of bytes from host to device. The host
    memory must be page-locked (see MemHostRegister)
func MemcpyPeer(dst DevicePtr, dstCtx Context, src DevicePtr, srcCtx Context, bytes int64)
Copies from device memory in one context (device) to another.
func MemcpyPeerAsync(dst DevicePtr, dstCtx Context, src DevicePtr, srcCtx Context, bytes int64, stream Stream)
Asynchronously copies from device memory in one context (device) to
another.
func MemsetD32(deviceptr DevicePtr, value uint32, N int64)
Sets the first N 32-bit values of dst array to value. Asynchronous.
func MemsetD32Async(deviceptr DevicePtr, value uint32, N int64, stream Stream)
Asynchronously sets the first N 32-bit values of dst array to value.
func MemsetD8(deviceptr DevicePtr, value uint8, N int64)
Sets the first N 8-bit values of dst array to value. Asynchronous.
func MemsetD8Async(deviceptr DevicePtr, value uint8, N int64, stream Stream)
    Asynchronously sets the first N 8-bit values of dst array to value.
func StreamDestroy(stream *Stream)
Destroys an asynchronous stream
func StreamSynchronize(stream Stream)
Blocks until the stream has completed.
func Version() int
Returns the CUDA driver version.
TYPES
type Context uintptr
CUDA context.
func CtxCreate(flags uint, dev Device) Context
Create a CUDA context.
func CtxGetCurrent() Context
Gets the current active context.
func (ctx Context) ApiVersion() (version int)
Returns the API version to create the context.
func (ctx *Context) Destroy()
Destroys the CUDA context.
func (peer Context) DisablePeerAccess()
Reverses EnablePeerAccess().
func (peer Context) EnablePeerAccess()
Make allocations from the peer Context available to the current context.
func (ctx Context) SetCurrent()
Sets the current active context.
type DevProp struct {
MaxThreadsPerBlock int
MaxThreadsDim [3]int
MaxGridSize [3]int
SharedMemPerBlock int
TotalConstantMemory int
SIMDWidth int
MemPitch int
RegsPerBlock int
ClockRate int
TextureAlign int
}
Device properties
func DeviceGetProperties(dev Device) (prop DevProp)
Returns the device's properties.
type Device int
CUDA Device number.
func CtxGetDevice() Device
Returns the ordinal of the current context's device.
func DeviceGet(ordinal int) Device
    Returns a device handle given an ordinal in the range [0,
    DeviceGetCount()-1].
func (dev Device) Attribute(attrib DeviceAttribute) int
Gets the value of a device attribute.
func (dev Device) CanAccessPeer(peer Device) bool
Returns true if CtxEnablePeerAccess can be called on a context for dev
and peerDev.
func (device Device) ComputeCapability() (major, minor int)
Returns the compute capability of the device.
func (dev Device) Name() string
Gets the name of the device.
func (dev Device) Properties() DevProp
Returns the device's properties.
func (device Device) TotalMem() int64
Returns the total amount of memory available on the device in bytes.
type DeviceAttribute int
const (
MAX_THREADS_PER_BLOCK DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK // Maximum number of threads per block
MAX_BLOCK_DIM_X DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X // Maximum block dimension X
MAX_BLOCK_DIM_Y DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y // Maximum block dimension Y
MAX_BLOCK_DIM_Z DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z // Maximum block dimension Z
MAX_GRID_DIM_X DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X // Maximum grid dimension X
MAX_GRID_DIM_Y DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y // Maximum grid dimension Y
MAX_GRID_DIM_Z DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z // Maximum grid dimension Z
MAX_SHARED_MEMORY_PER_BLOCK DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK // Maximum shared memory available per block in bytes
TOTAL_CONSTANT_MEMORY DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY // Memory available on device for __constant__ variables in a CUDA C kernel in bytes
WARP_SIZE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_WARP_SIZE // Warp size in threads
MAX_PITCH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_PITCH // Maximum pitch in bytes allowed by memory copies
MAX_REGISTERS_PER_BLOCK DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK // Maximum number of 32-bit registers available per block
CLOCK_RATE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CLOCK_RATE // Peak clock frequency in kilohertz
TEXTURE_ALIGNMENT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT // Alignment requirement for textures
MULTIPROCESSOR_COUNT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT // Number of multiprocessors on device
KERNEL_EXEC_TIMEOUT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT // Specifies whether there is a run time limit on kernels
INTEGRATED DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_INTEGRATED // Device is integrated with host memory
CAN_MAP_HOST_MEMORY DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY // Device can map host memory into CUDA address space
COMPUTE_MODE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_COMPUTE_MODE // Compute mode (See ::CUcomputemode for details)
MAXIMUM_TEXTURE1D_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH // Maximum 1D texture width
MAXIMUM_TEXTURE2D_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH // Maximum 2D texture width
MAXIMUM_TEXTURE2D_HEIGHT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT // Maximum 2D texture height
MAXIMUM_TEXTURE3D_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH // Maximum 3D texture width
MAXIMUM_TEXTURE3D_HEIGHT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT // Maximum 3D texture height
MAXIMUM_TEXTURE3D_DEPTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH // Maximum 3D texture depth
MAXIMUM_TEXTURE2D_LAYERED_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH // Maximum 2D layered texture width
MAXIMUM_TEXTURE2D_LAYERED_HEIGHT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT // Maximum 2D layered texture height
MAXIMUM_TEXTURE2D_LAYERED_LAYERS DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS // Maximum layers in a 2D layered texture
SURFACE_ALIGNMENT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT // Alignment requirement for surfaces
CONCURRENT_KERNELS DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS // Device can possibly execute multiple kernels concurrently
ECC_ENABLED DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_ECC_ENABLED // Device has ECC support enabled
PCI_BUS_ID DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_PCI_BUS_ID // PCI bus ID of the device
PCI_DEVICE_ID DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID // PCI device ID of the device
TCC_DRIVER DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TCC_DRIVER // Device is using TCC driver model
MEMORY_CLOCK_RATE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE // Peak memory clock frequency in kilohertz
GLOBAL_MEMORY_BUS_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH // Global memory bus width in bits
L2_CACHE_SIZE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE // Size of L2 cache in bytes
MAX_THREADS_PER_MULTIPROCESSOR DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR // Maximum resident threads per multiprocessor
ASYNC_ENGINE_COUNT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT // Number of asynchronous engines
UNIFIED_ADDRESSING DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING // Device uses shares a unified address space with the host
MAXIMUM_TEXTURE1D_LAYERED_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH // Maximum 1D layered texture width
MAXIMUM_TEXTURE1D_LAYERED_LAYERS DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS // Maximum layers in a 1D layered texture
)
type DevicePtr uintptr
func MemAlloc(bytes int64) DevicePtr
Allocates a number of bytes of device memory.
func (ptr DevicePtr) Bytes() (bytes int64)
Returns the size of the allocation (by MemAlloc) that contains the input
pointer ptr.
func (ptr *DevicePtr) Free()
Frees device memory allocated by MemAlloc(). Overwrites the pointer with
NULL. It is safe to double-free.
func (ptr DevicePtr) GetAddressRange() (bytes int64, base DevicePtr)
Returns the base address and size of the allocation (by MemAlloc) that
contains the input pointer ptr.
func (ptr DevicePtr) MemoryType() MemoryType
Returns the physical memory type that ptr addresses.
func (p DevicePtr) String() string
type Dim3 struct {
X, Y, Z int
}
type Function uintptr
Represents a CUDA CUfunction, a reference to a function within a module.
func ModuleGetFunction(module Module, name string) Function
Returns a Function handle.
func (f Function) GetAttribute(attrib FunctionAttribute) int
type FunctionAttribute int
const (
FUNC_A_MAX_THREADS_PER_BLOCK FunctionAttribute = C.CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK // The maximum number of threads per block, beyond which a launch of the function would fail.
FUNC_A_SHARED_SIZE_BYTES FunctionAttribute = C.CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES // The size in bytes of statically-allocated shared memory required by this function.
FUNC_A_CONST_SIZE_BYTES FunctionAttribute = C.CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES // The size in bytes of user-allocated constant memory required by this function.
FUNC_A_LOCAL_SIZE_BYTES FunctionAttribute = C.CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES // The size in bytes of local memory used by each thread of this function.
FUNC_A_NUM_REGS FunctionAttribute = C.CU_FUNC_ATTRIBUTE_NUM_REGS // The number of registers used by each thread of this function.
FUNC_A_PTX_VERSION FunctionAttribute = C.CU_FUNC_ATTRIBUTE_PTX_VERSION // The PTX virtual architecture version for which the function was compiled.
FUNC_A_BINARY_VERSION FunctionAttribute = C.CU_FUNC_ATTRIBUTE_BINARY_VERSION // The binary architecture version for which the function was compiled.
)
type MemHostRegisterFlag int
const (
// Memory is pinned in all CUDA contexts.
MEMHOSTREGISTER_PORTABLE MemHostRegisterFlag = C.CU_MEMHOSTREGISTER_PORTABLE
// Maps the allocation in CUDA address space. TODO(a): cuMemHostGetDevicePointer()
MEMHOSTREGISTER_DEVICEMAP MemHostRegisterFlag = C.CU_MEMHOSTREGISTER_DEVICEMAP
)
Flag for MemHostRegister
type MemoryType uint
const (
MemoryTypeHost MemoryType = C.CU_MEMORYTYPE_HOST
MemoryTypeDevice MemoryType = C.CU_MEMORYTYPE_DEVICE
MemoryTypeArray MemoryType = C.CU_MEMORYTYPE_ARRAY
MemoryTypeUnified MemoryType = C.CU_MEMORYTYPE_UNIFIED
)
func PointerGetAttributeMemoryType(ptr DevicePtr) (t MemoryType, err Result)
Returns the physical memory type that ptr addresses.
func (t MemoryType) String() string
type Module uintptr
Represents a CUDA CUmodule, a reference to executable device code.
func ModuleLoad(fname string) Module
Loads a compute module from file
func ModuleLoadData(image string) Module
Loads a compute module from string
func (m Module) GetFunction(name string) Function
Returns a Function handle.
type Result int
CUDA error status. CUDA error statuses are not returned by functions but
checked and passed to panic() when not successful. If desired, they can
be caught by recover().
const (
SUCCESS Result = C.CUDA_SUCCESS
ERROR_INVALID_VALUE Result = C.CUDA_ERROR_INVALID_VALUE
ERROR_OUT_OF_MEMORY Result = C.CUDA_ERROR_OUT_OF_MEMORY
ERROR_NOT_INITIALIZED Result = C.CUDA_ERROR_NOT_INITIALIZED
ERROR_DEINITIALIZED Result = C.CUDA_ERROR_DEINITIALIZED
ERROR_PROFILER_DISABLED Result = C.CUDA_ERROR_PROFILER_DISABLED
ERROR_PROFILER_NOT_INITIALIZED Result = C.CUDA_ERROR_PROFILER_NOT_INITIALIZED
ERROR_PROFILER_ALREADY_STARTED Result = C.CUDA_ERROR_PROFILER_ALREADY_STARTED
ERROR_PROFILER_ALREADY_STOPPED Result = C.CUDA_ERROR_PROFILER_ALREADY_STOPPED
ERROR_NO_DEVICE Result = C.CUDA_ERROR_NO_DEVICE
ERROR_INVALID_DEVICE Result = C.CUDA_ERROR_INVALID_DEVICE
ERROR_INVALID_IMAGE Result = C.CUDA_ERROR_INVALID_IMAGE
ERROR_INVALID_CONTEXT Result = C.CUDA_ERROR_INVALID_CONTEXT
ERROR_CONTEXT_ALREADY_CURRENT Result = C.CUDA_ERROR_CONTEXT_ALREADY_CURRENT
ERROR_MAP_FAILED Result = C.CUDA_ERROR_MAP_FAILED
ERROR_UNMAP_FAILED Result = C.CUDA_ERROR_UNMAP_FAILED
ERROR_ARRAY_IS_MAPPED Result = C.CUDA_ERROR_ARRAY_IS_MAPPED
ERROR_ALREADY_MAPPED Result = C.CUDA_ERROR_ALREADY_MAPPED
ERROR_NO_BINARY_FOR_GPU Result = C.CUDA_ERROR_NO_BINARY_FOR_GPU
ERROR_ALREADY_ACQUIRED Result = C.CUDA_ERROR_ALREADY_ACQUIRED
ERROR_NOT_MAPPED Result = C.CUDA_ERROR_NOT_MAPPED
ERROR_NOT_MAPPED_AS_ARRAY Result = C.CUDA_ERROR_NOT_MAPPED_AS_ARRAY
ERROR_NOT_MAPPED_AS_POINTER Result = C.CUDA_ERROR_NOT_MAPPED_AS_POINTER
ERROR_ECC_UNCORRECTABLE Result = C.CUDA_ERROR_ECC_UNCORRECTABLE
ERROR_UNSUPPORTED_LIMIT Result = C.CUDA_ERROR_UNSUPPORTED_LIMIT
ERROR_CONTEXT_ALREADY_IN_USE Result = C.CUDA_ERROR_CONTEXT_ALREADY_IN_USE
ERROR_INVALID_SOURCE Result = C.CUDA_ERROR_INVALID_SOURCE
ERROR_FILE_NOT_FOUND Result = C.CUDA_ERROR_FILE_NOT_FOUND
ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND Result = C.CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND
ERROR_SHARED_OBJECT_INIT_FAILED Result = C.CUDA_ERROR_SHARED_OBJECT_INIT_FAILED
ERROR_OPERATING_SYSTEM Result = C.CUDA_ERROR_OPERATING_SYSTEM
ERROR_INVALID_HANDLE Result = C.CUDA_ERROR_INVALID_HANDLE
ERROR_NOT_FOUND Result = C.CUDA_ERROR_NOT_FOUND
ERROR_NOT_READY Result = C.CUDA_ERROR_NOT_READY
ERROR_LAUNCH_FAILED Result = C.CUDA_ERROR_LAUNCH_FAILED
ERROR_LAUNCH_OUT_OF_RESOURCES Result = C.CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES
ERROR_LAUNCH_TIMEOUT Result = C.CUDA_ERROR_LAUNCH_TIMEOUT
ERROR_LAUNCH_INCOMPATIBLE_TEXTURING Result = C.CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING
ERROR_PEER_ACCESS_ALREADY_ENABLED Result = C.CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED
ERROR_PEER_ACCESS_NOT_ENABLED Result = C.CUDA_ERROR_PEER_ACCESS_NOT_ENABLED
ERROR_PRIMARY_CONTEXT_ACTIVE Result = C.CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE
ERROR_CONTEXT_IS_DESTROYED Result = C.CUDA_ERROR_CONTEXT_IS_DESTROYED
ERROR_ASSERT Result = C.CUDA_ERROR_ASSERT
ERROR_TOO_MANY_PEERS Result = C.CUDA_ERROR_TOO_MANY_PEERS
ERROR_HOST_MEMORY_ALREADY_REGISTERED Result = C.CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED
ERROR_HOST_MEMORY_NOT_REGISTERED Result = C.CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED
ERROR_UNKNOWN Result = C.CUDA_ERROR_UNKNOWN
)
func StreamQuery(stream Stream) Result
    Returns SUCCESS if all operations have completed, ERROR_NOT_READY
    otherwise
func (err Result) String() string
Message string for the error
type Stream uintptr
CUDA stream.
func StreamCreate() Stream
Creates an asynchronous stream
func (stream *Stream) Destroy()
Destroys the asynchronous stream
func (stream Stream) Query() Result
    Returns SUCCESS if all operations have completed, ERROR_NOT_READY
    otherwise
func (stream Stream) Synchronize()
Blocks until the stream has completed.
================================================
FILE: cu/cgoflags.go
================================================
package cu
// This file provides CGO flags to find CUDA libraries and headers.
//#cgo LDFLAGS:-lcuda -lcudart
//
////default location:
//#cgo LDFLAGS:-L/usr/local/cuda/lib64 -L/usr/local/cuda/lib
//#cgo CFLAGS: -I/usr/local/cuda/include/
//
////default location if not properly symlinked:
//#cgo LDFLAGS:-L/usr/local/cuda-6.0/lib64 -L/usr/local/cuda-6.0/lib
//#cgo LDFLAGS:-L/usr/local/cuda-5.5/lib64 -L/usr/local/cuda-5.5/lib
//#cgo LDFLAGS:-L/usr/local/cuda-5.0/lib64 -L/usr/local/cuda-5.0/lib
//#cgo CFLAGS: -I/usr/local/cuda-6.0/include/
//#cgo CFLAGS: -I/usr/local/cuda-5.5/include/
//#cgo CFLAGS: -I/usr/local/cuda-5.0/include/
//
////arch linux:
//#cgo LDFLAGS:-L/opt/cuda/lib64 -L/opt/cuda/lib
//#cgo CFLAGS: -I/opt/cuda/include
//
////WINDOWS:
//#cgo windows LDFLAGS:-LC:/cuda/v5.0/lib/x64 -LC:/cuda/v5.5/lib/x64 -LC:/cuda/v6.0/lib/x64
//#cgo windows CFLAGS: -IC:/cuda/v5.0/include -IC:/cuda/v5.5/include -IC:/cuda/v6.0/include
import "C"
================================================
FILE: cu/context.go
================================================
package cu
// This file implements CUDA driver context management
//#include <cuda.h>
import "C"
import "unsafe"
// Context is a CUDA context handle. It stores the driver's CUcontext
// pointer as an integer value.
type Context uintptr
// CtxCreate creates a CUDA context on device dev by calling cuCtxCreate,
// passing flags through unchanged, and returns its handle.
// It panics with the driver's Result if the call does not report SUCCESS.
func CtxCreate(flags uint, dev Device) Context {
	var handle C.CUcontext
	if status := Result(C.cuCtxCreate(&handle, C.uint(flags), C.CUdevice(dev))); status != SUCCESS {
		panic(status)
	}
	return Context(uintptr(unsafe.Pointer(handle)))
}
// CtxDestroy destroys the CUDA context referred to by ctx via cuCtxDestroy
// and resets *ctx to 0. It panics with the driver's Result if the call does
// not report SUCCESS.
//
// If the context usage count is not equal to 1, or the context is current to
// any CPU thread other than the current one, this function fails. Floating
// contexts (detached from a CPU thread via cuCtxPopCurrent()) may be
// destroyed by this function.
func CtxDestroy(ctx *Context) {
	handle := C.CUcontext(unsafe.Pointer(uintptr(*ctx)))
	status := Result(C.cuCtxDestroy(handle))
	// The handle is cleared before the status check, so it is reset even
	// when the driver reports a failure.
	*ctx = 0
	if status != SUCCESS {
		panic(status)
	}
}
// Destroy destroys the CUDA context. It is the method form of CtxDestroy
// and forwards the receiver pointer to it unchanged.
func (ctx *Context) Destroy() {
	CtxDestroy(ctx)
}
// CtxGetApiVersion returns the API version that cuCtxGetApiVersion reports
// for ctx. It panics with the driver's Result if the call does not report
// SUCCESS.
func CtxGetApiVersion(ctx Context) (version int) {
	var raw C.uint
	handle := C.CUcontext(unsafe.Pointer(uintptr(ctx)))
	if status := Result(C.cuCtxGetApiVersion(handle, &raw)); status != SUCCESS {
		panic(status)
	}
	return int(raw)
}
// ApiVersion returns the API version of the context. It is the method form
// of CtxGetApiVersion.
func (ctx Context) ApiVersion() (version int) {
	return CtxGetApiVersion(ctx)
}
// CtxGetCurrent returns the context that cuCtxGetCurrent reports as the
// current active one. It panics with the driver's Result if the call does
// not report SUCCESS.
func CtxGetCurrent() Context {
	var handle C.CUcontext
	if status := Result(C.cuCtxGetCurrent(&handle)); status != SUCCESS {
		panic(status)
	}
	return Context(uintptr(unsafe.Pointer(handle)))
}
// CtxGetDevice returns the device of the current context, as reported by
// cuCtxGetDevice. It panics with the driver's Result if the call does not
// report SUCCESS.
func CtxGetDevice() Device {
	var current C.CUdevice
	if status := Result(C.cuCtxGetDevice(&current)); status != SUCCESS {
		panic(status)
	}
	return Device(current)
}
// CtxSetCurrent makes ctx the current active context via cuCtxSetCurrent.
// It panics with the driver's Result if the call does not report SUCCESS.
func CtxSetCurrent(ctx Context) {
	handle := C.CUcontext(unsafe.Pointer(uintptr(ctx)))
	if status := Result(C.cuCtxSetCurrent(handle)); status != SUCCESS {
		panic(status)
	}
}
// SetCurrent makes ctx the current active context. It is the method form of
// CtxSetCurrent.
func (ctx Context) SetCurrent() {
	CtxSetCurrent(ctx)
}
// CtxSynchronize calls cuCtxSynchronize to wait until the device has
// completed all preceding requested tasks. It panics with the driver's
// Result if the call does not report SUCCESS.
//
// NOTE(review): the earlier comment made the blocking conditional on the
// context having been created with CU_CTX_SCHED_BLOCKING_SYNC; per the CUDA
// driver documentation that flag selects how the CPU thread waits, not
// whether it waits -- confirm.
func CtxSynchronize() {
	if status := Result(C.cuCtxSynchronize()); status != SUCCESS {
		panic(status)
	}
}
// Flags for CtxCreate.
//
// Every constant carries an explicit value: inside a const block an entry
// without an expression silently repeats the previous one, which would make
// it an alias of the flag declared just above it.
const (
	// If the number of contexts > number of CPUs, yield to other OS threads when waiting for the GPU, otherwise CUDA spins on the processor.
	CTX_SCHED_AUTO = C.CU_CTX_SCHED_AUTO
	// Spin when waiting for results from the GPU.
	CTX_SCHED_SPIN = C.CU_CTX_SCHED_SPIN
	// Yield its thread when waiting for results from the GPU.
	CTX_SCHED_YIELD = C.CU_CTX_SCHED_YIELD
	// Block the CPU thread on a synchronization primitive when waiting for the GPU to finish work.
	CTX_BLOCKING_SYNC = C.CU_CTX_SCHED_BLOCKING_SYNC
	// Support mapped pinned allocations. This flag must be set in order to allocate pinned host memory that is accessible to the GPU.
	CTX_MAP_HOST = C.CU_CTX_MAP_HOST
	// Do not reduce local memory after resizing local memory for a kernel.
	CTX_LMEM_RESIZE_TO_MAX = C.CU_CTX_LMEM_RESIZE_TO_MAX
)
================================================
FILE: cu/context_test.go
================================================
package cu
import (
"fmt"
"testing"
)
// TestContext creates a context on device 0, makes it current, prints its
// API version and device, and destroys it again. The wrappers panic on any
// driver error, so a failure shows up as a panic rather than a t.Error.
func TestContext(t *testing.T) {
	fmt.Println("CtxCreate")
	c := CtxCreate(CTX_SCHED_AUTO, 0)

	fmt.Println("CtxSetCurrent")
	CtxSetCurrent(c)

	version := c.ApiVersion()
	fmt.Println("CtxGetApiVersion:", version)

	device := CtxGetDevice()
	fmt.Println("CtxGetDevice:", device)

	c.Destroy()
}
// BenchmarkGetContext times CtxGetCurrent. Creating the context and making
// it current happen while the timer is stopped.
func BenchmarkGetContext(b *testing.B) {
	b.StopTimer()
	c := CtxCreate(CTX_SCHED_AUTO, 0)
	CtxSetCurrent(c)
	b.StartTimer()

	for n := b.N; n > 0; n-- {
		CtxGetCurrent()
	}
}
// BenchmarkSetContext times Context.SetCurrent on a single context that is
// created while the timer is stopped.
func BenchmarkSetContext(b *testing.B) {
	b.StopTimer()
	c := CtxCreate(CTX_SCHED_AUTO, 0)
	b.StartTimer()

	for n := b.N; n > 0; n-- {
		c.SetCurrent()
	}
}
================================================
FILE: cu/device.go
================================================
package cu
// This file implements CUDA driver device management
//#include <cuda.h>
import "C"
import ()
// Device is a CUDA device number. Values are converted to and from the
// driver's CUdevice type when calling into the CUDA driver API.
type Device int
// DeviceComputeCapability returns the major and minor compute capability of
// the device, as reported by cuDeviceComputeCapability. It panics with the
// driver's Result if the call does not report SUCCESS.
func DeviceComputeCapability(device Device) (major, minor int) {
	var cMajor, cMinor C.int
	if status := Result(C.cuDeviceComputeCapability(&cMajor, &cMinor, C.CUdevice(device))); status != SUCCESS {
		panic(status)
	}
	return int(cMajor), int(cMinor)
}
// ComputeCapability returns the compute capability of the device. It is the
// method form of DeviceComputeCapability.
func (device Device) ComputeCapability() (major, minor int) {
	return DeviceComputeCapability(device)
}
// DeviceGet returns a device handle given an ordinal in the range
// [0, DeviceGetCount()-1], using cuDeviceGet. It panics with the driver's
// Result if the call does not report SUCCESS.
func DeviceGet(ordinal int) Device {
	var handle C.CUdevice
	if status := Result(C.cuDeviceGet(&handle, C.int(ordinal))); status != SUCCESS {
		panic(status)
	}
	return Device(handle)
}
// DeviceGetAttribute returns the value of attribute attrib for device dev,
// as reported by cuDeviceGetAttribute. It panics with the driver's Result if
// the call does not report SUCCESS.
func DeviceGetAttribute(attrib DeviceAttribute, dev Device) int {
	var value C.int
	if status := Result(C.cuDeviceGetAttribute(&value, C.CUdevice_attribute(attrib), C.CUdevice(dev))); status != SUCCESS {
		panic(status)
	}
	return int(value)
}
// Attribute returns the value of a device attribute. It is the method form
// of DeviceGetAttribute, with the receiver passed as the device argument.
func (dev Device) Attribute(attrib DeviceAttribute) int {
	return DeviceGetAttribute(attrib, dev)
}
// DeviceGetCount returns the number of devices with compute capability
// greater than or equal to 1.0 that are available for execution, as reported
// by cuDeviceGetCount. It panics with the driver's Result if the call does
// not report SUCCESS.
func DeviceGetCount() int {
	var n C.int
	if status := Result(C.cuDeviceGetCount(&n)); status != SUCCESS {
		panic(status)
	}
	return int(n)
}
// DeviceGetName returns the name of the device, as reported by
// cuDeviceGetName. It panics with the driver's Result if the call does not
// report SUCCESS.
func DeviceGetName(dev Device) string {
	// Maximum name length requested from the driver.
	const size = 256
	// The buffer lives in Go memory, so nothing has to be freed on either
	// the success or the panic path (a C.CString buffer would be leaked).
	// The extra trailing element is never handed to the driver and stays
	// zero, which guarantees NUL termination for C.GoString.
	var name [size + 1]C.char
	err := Result(C.cuDeviceGetName(&name[0], C.int(size), C.CUdevice(dev)))
	if err != SUCCESS {
		panic(err)
	}
	return C.GoString(&name[0])
}
// Name returns the name of the device. It is the method form of
// DeviceGetName.
func (dev Device) Name() string {
	return DeviceGetName(dev)
}
// DevProp holds the device properties returned by DeviceGetProperties.
// Each field is filled from the member of the driver's CUdevprop struct
// that has the corresponding name.
type DevProp struct {
	MaxThreadsPerBlock  int
	MaxThreadsDim       [3]int
	MaxGridSize         [3]int
	SharedMemPerBlock   int
	TotalConstantMemory int
	SIMDWidth           int
	MemPitch            int
	RegsPerBlock        int
	ClockRate           int
	TextureAlign        int
}
// DeviceGetProperties returns the properties of device dev. It queries
// cuDeviceGetProperties and converts every CUdevprop member to a Go int.
// It panics with the driver's Result if the call does not report SUCCESS.
func DeviceGetProperties(dev Device) (prop DevProp) {
	var raw C.CUdevprop
	if status := Result(C.cuDeviceGetProperties(&raw, C.CUdevice(dev))); status != SUCCESS {
		panic(status)
	}
	return DevProp{
		MaxThreadsPerBlock: int(raw.maxThreadsPerBlock),
		MaxThreadsDim: [3]int{
			int(raw.maxThreadsDim[0]),
			int(raw.maxThreadsDim[1]),
			int(raw.maxThreadsDim[2]),
		},
		MaxGridSize: [3]int{
			int(raw.maxGridSize[0]),
			int(raw.maxGridSize[1]),
			int(raw.maxGridSize[2]),
		},
		SharedMemPerBlock:   int(raw.sharedMemPerBlock),
		TotalConstantMemory: int(raw.totalConstantMemory),
		SIMDWidth:           int(raw.SIMDWidth),
		MemPitch:            int(raw.memPitch),
		RegsPerBlock:        int(raw.regsPerBlock),
		ClockRate:           int(raw.clockRate),
		TextureAlign:        int(raw.textureAlign),
	}
}
// Properties returns the device's properties. It is the method form of
// DeviceGetProperties.
func (dev Device) Properties() DevProp {
	return DeviceGetProperties(dev)
}
// TotalMem returns the total amount of memory available on the device in
// bytes. It is the method form of DeviceTotalMem.
func (device Device) TotalMem() int64 {
	return DeviceTotalMem(device)
}
// DeviceTotalMem returns the total amount of memory available on the device
// in bytes, as reported by cuDeviceTotalMem. It panics with the driver's
// Result if the call does not report SUCCESS.
func DeviceTotalMem(device Device) int64 {
	var total C.size_t
	if status := Result(C.cuDeviceTotalMem(&total, C.CUdevice(device))); status != SUCCESS {
		panic(status)
	}
	return int64(total)
}
// DeviceAttribute identifies a device attribute that can be queried with
// DeviceGetAttribute; it is converted to the driver's CUdevice_attribute.
type DeviceAttribute int
const (
MAX_THREADS_PER_BLOCK DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK // Maximum number of threads per block
MAX_BLOCK_DIM_X DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X // Maximum block dimension X
MAX_BLOCK_DIM_Y DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y // Maximum block dimension Y
MAX_BLOCK_DIM_Z DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z // Maximum block dimension Z
MAX_GRID_DIM_X DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X // Maximum grid dimension X
MAX_GRID_DIM_Y DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y // Maximum grid dimension Y
MAX_GRID_DIM_Z DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z // Maximum grid dimension Z
MAX_SHARED_MEMORY_PER_BLOCK DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK // Maximum shared memory available per block in bytes
TOTAL_CONSTANT_MEMORY DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY // Memory available on device for __constant__ variables in a CUDA C kernel in bytes
WARP_SIZE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_WARP_SIZE // Warp size in threads
MAX_PITCH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_PITCH // Maximum pitch in bytes allowed by memory copies
MAX_REGISTERS_PER_BLOCK DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK // Maximum number of 32-bit registers available per block
CLOCK_RATE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CLOCK_RATE // Peak clock frequency in kilohertz
TEXTURE_ALIGNMENT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT // Alignment requirement for textures
MULTIPROCESSOR_COUNT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT // Number of multiprocessors on device
KERNEL_EXEC_TIMEOUT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT // Specifies whether there is a run time limit on kernels
INTEGRATED DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_INTEGRATED // Device is integrated with host memory
CAN_MAP_HOST_MEMORY DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY // Device can map host memory into CUDA address space
COMPUTE_MODE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_COMPUTE_MODE // Compute mode (See ::CUcomputemode for details)
MAXIMUM_TEXTURE1D_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH // Maximum 1D texture width
MAXIMUM_TEXTURE2D_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH // Maximum 2D texture width
MAXIMUM_TEXTURE2D_HEIGHT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT // Maximum 2D texture height
MAXIMUM_TEXTURE3D_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH // Maximum 3D texture width
MAXIMUM_TEXTURE3D_HEIGHT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT // Maximum 3D texture height
MAXIMUM_TEXTURE3D_DEPTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH // Maximum 3D texture depth
MAXIMUM_TEXTURE2D_LAYERED_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH // Maximum 2D layered texture width
MAXIMUM_TEXTURE2D_LAYERED_HEIGHT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT // Maximum 2D layered texture height
MAXIMUM_TEXTURE2D_LAYERED_LAYERS DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS // Maximum layers in a 2D layered texture
SURFACE_ALIGNMENT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT // Alignment requirement for surfaces
CONCURRENT_KERNELS DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS // Device can possibly execute multiple kernels concurrently
ECC_ENABLED DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_ECC_ENABLED // Device has ECC support enabled
PCI_BUS_ID DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_PCI_BUS_ID // PCI bus ID of the device
PCI_DEVICE_ID DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID // PCI device ID of the device
TCC_DRIVER DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TCC_DRIVER // Device is using TCC driver model
MEMORY_CLOCK_RATE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE // Peak memory clock frequency in kilohertz
GLOBAL_MEMORY_BUS_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH // Global memory bus width in bits
L2_CACHE_SIZE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE // Size of L2 cache in bytes
MAX_THREADS_PER_MULTIPROCESSOR DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR // Maximum resident threads per multiprocessor
ASYNC_ENGINE_COUNT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT // Number of asynchronous engines
UNIFIED_ADDRESSING DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING // Device uses shares a unified address space with the host
MAXIMUM_TEXTURE1D_LAYERED_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH // Maximum 1D layered texture width
MAXIMUM_TEXTURE1D_LAYERED_LAYERS DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS // Maximum layers in a 1D layered texture
)
================================================
FILE: cu/device_test.go
================================================
package cu
import (
"fmt"
"testing"
)
func TestDevice(t *testing.T) {
fmt.Println("DeviceGetCount:", DeviceGetCount())
for i := 0; i < DeviceGetCount(); i++ {
fmt.Println("DeviceGet", i)
dev := DeviceGet(i)
major, minor := dev.ComputeCapability()
fmt.Println("Name: ", dev.Name())
fmt.Println("ComputeCapability: ", major, minor)
fmt.Println("TotalMem: ", dev.TotalMem())
fmt.Println("ATTRIBUTE_MAX_THREADS_PER_BLOCK :", dev.Attribute(MAX_THREADS_PER_BLOCK))
fmt.Println("ATTRIBUTE_MAX_BLOCK_DIM_X :", dev.Attribute(MAX_BLOCK_DIM_X))
fmt.Println("ATTRIBUTE_MAX_BLOCK_DIM_Y :", dev.Attribute(MAX_BLOCK_DIM_Y))
fmt.Println("ATTRIBUTE_MAX_BLOCK_DIM_Z :", dev.Attribute(MAX_BLOCK_DIM_Z))
fmt.Println("ATTRIBUTE_MAX_GRID_DIM_X :", dev.Attribute(MAX_GRID_DIM_X))
fmt.Println("ATTRIBUTE_MAX_GRID_DIM_Y :", dev.Attribute(MAX_GRID_DIM_Y))
fmt.Println("ATTRIBUTE_MAX_GRID_DIM_Z :", dev.Attribute(MAX_GRID_DIM_Z))
fmt.Println("ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK :", dev.Attribute(MAX_SHARED_MEMORY_PER_BLOCK))
fmt.Println("ATTRIBUTE_TOTAL_CONSTANT_MEMORY :", dev.Attribute(TOTAL_CONSTANT_MEMORY))
fmt.Println("ATTRIBUTE_WARP_SIZE :", dev.Attribute(WARP_SIZE))
fmt.Println("ATTRIBUTE_MAX_PITCH :", dev.Attribute(MAX_PITCH))
fmt.Println("ATTRIBUTE_MAX_REGISTERS_PER_BLOCK :", dev.Attribute(MAX_REGISTERS_PER_BLOCK))
fmt.Println("ATTRIBUTE_CLOCK_RATE :", dev.Attribute(CLOCK_RATE))
fmt.Println("ATTRIBUTE_TEXTURE_ALIGNMENT :", dev.Attribute(TEXTURE_ALIGNMENT))
fmt.Println("ATTRIBUTE_MULTIPROCESSOR_COUNT :", dev.Attribute(MULTIPROCESSOR_COUNT))
fmt.Println("ATTRIBUTE_KERNEL_EXEC_TIMEOUT :", dev.Attribute(KERNEL_EXEC_TIMEOUT))
fmt.Println("ATTRIBUTE_INTEGRATED :", dev.Attribute(INTEGRATED))
fmt.Println("ATTRIBUTE_CAN_MAP_HOST_MEMORY :", dev.Attribute(CAN_MAP_HOST_MEMORY))
fmt.Println("ATTRIBUTE_COMPUTE_MODE :", dev.Attribute(COMPUTE_MODE))
fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH :", dev.Attribute(MAXIMUM_TEXTURE1D_WIDTH))
fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH :", dev.Attribute(MAXIMUM_TEXTURE2D_WIDTH))
fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT :", dev.Attribute(MAXIMUM_TEXTURE2D_HEIGHT))
fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH :", dev.Attribute(MAXIMUM_TEXTURE3D_WIDTH))
fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT :", dev.Attribute(MAXIMUM_TEXTURE3D_HEIGHT))
fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH :", dev.Attribute(MAXIMUM_TEXTURE3D_DEPTH))
fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH :", dev.Attribute(MAXIMUM_TEXTURE2D_LAYERED_WIDTH))
fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT:", dev.Attribute(MAXIMUM_TEXTURE2D_LAYERED_HEIGHT))
fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS:", dev.Attribute(MAXIMUM_TEXTURE2D_LAYERED_LAYERS))
fmt.Println("ATTRIBUTE_SURFACE_ALIGNMENT :", dev.Attribute(SURFACE_ALIGNMENT))
fmt.Println("ATTRIBUTE_CONCURRENT_KERNELS :", dev.Attribute(CONCURRENT_KERNELS))
fmt.Println("ATTRIBUTE_ECC_ENABLED :", dev.Attribute(ECC_ENABLED))
fmt.Println("ATTRIBUTE_PCI_BUS_ID :", dev.Attribute(PCI_BUS_ID))
fmt.Println("ATTRIBUTE_PCI_DEVICE_ID :", dev.Attribute(PCI_DEVICE_ID))
fmt.Println("ATTRIBUTE_TCC_DRIVER :", dev.Attribute(TCC_DRIVER))
fmt.Println("ATTRIBUTE_MEMORY_CLOCK_RATE :", dev.Attribute(MEMORY_CLOCK_RATE))
fmt.Println("ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH :", dev.Attribute(GLOBAL_MEMORY_BUS_WIDTH))
fmt.Println("ATTRIBUTE_L2_CACHE_SIZE :", dev.Attribute(L2_CACHE_SIZE))
fmt.Println("ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR :", dev.Attribute(MAX_THREADS_PER_MULTIPROCESSOR))
fmt.Println("ATTRIBUTE_ASYNC_ENGINE_COUNT :", dev.Attribute(ASYNC_ENGINE_COUNT))
fmt.Println("ATTRIBUTE_UNIFIED_ADDRESSING :", dev.Attribute(UNIFIED_ADDRESSING))
fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH :", dev.Attribute(MAXIMUM_TEXTURE1D_LAYERED_WIDTH))
fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS:", dev.Attribute(MAXIMUM_TEXTURE1D_LAYERED_LAYERS))
fmt.Printf("Properties:%#v\n", dev.Properties())
}
}
================================================
FILE: cu/dim3.go
================================================
package cu
// Dim3 holds three integer components named X, Y and Z.
// NOTE(review): presumably used for grid/block dimensions; no use is visible
// in this part of the package, so confirm against callers.
type Dim3 struct {
X, Y, Z int
}
================================================
FILE: cu/doc.go
================================================
// Go bindings for the CUDA driver API.
package cu
================================================
FILE: cu/execution.go
================================================
package cu
// This file implements execution of CUDA kernels
//#include <cuda.h>
import "C"
import (
"unsafe"
)
// pointerSize is the size in bytes of one slot in the kernel-argument arrays
// built by LaunchKernel, and the stride used by offset.
const pointerSize = 8 // sorry, 64 bits only.
// LaunchKernel launches the kernel f through cuLaunchKernel with the given
// grid dimensions, block dimensions, dynamic shared memory size and stream.
//
// kernelParams holds one pointer per kernel argument. Each argument value is
// copied into C memory before the launch; the copy always reads 8 bytes
// (one uint64) from kernelParams[i], whatever the argument's real size is.
// The "extra" parameter of cuLaunchKernel is always passed as NULL.
//
// Panics with the driver's Result if the launch fails.
func LaunchKernel(f Function, gridDimX, gridDimY, gridDimZ int, blockDimX, blockDimY, blockDimZ int, sharedMemBytes int, stream Stream, kernelParams []unsafe.Pointer) {
// Since Go 1.6, a cgo argument cannot have a Go pointer to Go pointer,
// so we copy the argument values to C memory first.
// argv holds the argument values, argp holds pointers to the argv slots;
// both are released when the function returns.
argv := C.malloc(C.size_t(len(kernelParams) * pointerSize))
argp := C.malloc(C.size_t(len(kernelParams) * pointerSize))
defer C.free(argv)
defer C.free(argp)
for i := range kernelParams {
*((*unsafe.Pointer)(offset(argp, i))) = offset(argv, i) // argp[i] = &argv[i]
*((*uint64)(offset(argv, i))) = *((*uint64)(kernelParams[i])) // argv[i] = *kernelParams[i]
}
err := Result(C.cuLaunchKernel(
C.CUfunction(unsafe.Pointer(uintptr(f))),
C.uint(gridDimX),
C.uint(gridDimY),
C.uint(gridDimZ),
C.uint(blockDimX),
C.uint(blockDimY),
C.uint(blockDimZ),
C.uint(sharedMemBytes),
C.CUstream(unsafe.Pointer(uintptr(stream))),
(*unsafe.Pointer)(argp),
(*unsafe.Pointer)(unsafe.Pointer(uintptr(0)))))
if err != SUCCESS {
panic(err)
}
}
// offset returns the address of the i-th pointerSize-wide slot after ptr,
// i.e. ptr + i*pointerSize bytes.
func offset(ptr unsafe.Pointer, i int) unsafe.Pointer {
// The uintptr arithmetic and the conversion back to unsafe.Pointer are kept
// in one expression.
return unsafe.Pointer(uintptr(ptr) + pointerSize*uintptr(i))
}
================================================
FILE: cu/function.go
================================================
package cu
// This file implements manipulations on CUDA functions
//#include <cuda.h>
import "C"
import (
"unsafe"
)
// Represents a CUDA CUfunction, a reference to a function within a module.
// The handle is stored as an integer and converted back to C.CUfunction
// wherever it is passed to the driver.
type Function uintptr
// FuncGetAttribute returns the value of the attribute attrib for the given
// function, as reported by cuFuncGetAttribute.
// It panics with the driver's Result if the call fails.
func FuncGetAttribute(attrib FunctionAttribute, function Function) int {
	handle := C.CUfunction(unsafe.Pointer(uintptr(function)))
	var value C.int
	if err := Result(C.cuFuncGetAttribute(&value, C.CUfunction_attribute(attrib), handle)); err != SUCCESS {
		panic(err)
	}
	return int(value)
}
// GetAttribute returns the value of attribute attrib for f; it is the method
// form of FuncGetAttribute.
func (f Function) GetAttribute(attrib FunctionAttribute) int {
	value := FuncGetAttribute(attrib, f)
	return value
}
// FunctionAttribute selects which property of a Function is queried by
// FuncGetAttribute. The constants below take their values from the driver's
// CU_FUNC_ATTRIBUTE_* enumerators.
type FunctionAttribute int
// Available function attributes.
const (
FUNC_A_MAX_THREADS_PER_BLOCK FunctionAttribute = C.CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK // The maximum number of threads per block, beyond which a launch of the function would fail.
FUNC_A_SHARED_SIZE_BYTES FunctionAttribute = C.CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES // The size in bytes of statically-allocated shared memory required by this function.
FUNC_A_CONST_SIZE_BYTES FunctionAttribute = C.CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES // The size in bytes of user-allocated constant memory required by this function.
FUNC_A_LOCAL_SIZE_BYTES FunctionAttribute = C.CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES // The size in bytes of local memory used by each thread of this function.
FUNC_A_NUM_REGS FunctionAttribute = C.CU_FUNC_ATTRIBUTE_NUM_REGS // The number of registers used by each thread of this function.
FUNC_A_PTX_VERSION FunctionAttribute = C.CU_FUNC_ATTRIBUTE_PTX_VERSION // The PTX virtual architecture version for which the function was compiled.
FUNC_A_BINARY_VERSION FunctionAttribute = C.CU_FUNC_ATTRIBUTE_BINARY_VERSION // The binary architecture version for which the function was compiled.
)
================================================
FILE: cu/init.go
================================================
package cu
// This file implements CUDA driver initialization
//#include <cuda.h>
import "C"
// Init initializes the CUDA driver API by calling cuInit.
// Currently, flags must be 0.
// If Init() has not been called, any function from the driver API will panic
// with ERROR_NOT_INITIALIZED.
// Init itself panics with the driver's Result if initialization fails.
func Init(flags int) {
	if status := Result(C.cuInit(C.uint(flags))); status != SUCCESS {
		panic(status)
	}
}
================================================
FILE: cu/init_test.go
================================================
package cu
import (
"fmt"
)
// Shared set-up needed by all other tests: initializes the driver, then
// creates a context and makes it current.
func init() {
	Init(0)
	CtxSetCurrent(CtxCreate(CTX_SCHED_AUTO, 0))
	fmt.Println("Created CUDA context")
}
================================================
FILE: cu/memory.go
================================================
package cu
// This file implements CUDA memory management on the driver level
//#include <cuda.h>
import "C"
import (
"fmt"
"unsafe"
)
// DevicePtr is a device memory address, stored as an integer and converted
// to C.CUdeviceptr wherever it is passed to the driver.
type DevicePtr uintptr
// MemAlloc allocates the given number of bytes of device memory with
// cuMemAlloc and returns the resulting device pointer.
// It panics with the driver's Result if the allocation fails.
func MemAlloc(bytes int64) DevicePtr {
	var raw C.CUdeviceptr
	if err := Result(C.cuMemAlloc(&raw, C.size_t(bytes))); err != SUCCESS {
		panic(err)
	}
	return DevicePtr(raw)
}
// MemFree releases device memory allocated by MemAlloc() via cuMemFree.
// A null (zero) pointer is ignored, so freeing a pointer that was already
// reset to zero is harmless. Any other failure reported by the driver
// results in a panic with its Result.
func MemFree(p DevicePtr) {
	const null DevicePtr = 0
	if p != null {
		if err := Result(C.cuMemFree(C.CUdeviceptr(p))); err != SUCCESS {
			panic(err)
		}
	}
}
// Free releases device memory allocated by MemAlloc(); it is the method form
// of MemFree. The receiver is passed by value, so the caller's pointer is
// left unchanged by this call.
func (ptr DevicePtr) Free() {
	p := ptr
	MemFree(p)
}
// Memcpy copies a number of bytes from src to dst using cuMemcpy.
// Requires unified addressing to be supported.
// See also: MemcpyDtoD().
// It panics with the driver's Result if the copy fails.
func Memcpy(dst, src DevicePtr, bytes int64) {
	to, from := C.CUdeviceptr(dst), C.CUdeviceptr(src)
	if err := Result(C.cuMemcpy(to, from, C.size_t(bytes))); err != SUCCESS {
		panic(err)
	}
}
// MemcpyAsync enqueues a copy of a number of bytes from src to dst on the
// given stream using cuMemcpyAsync.
// It panics with the driver's Result if the call fails.
func MemcpyAsync(dst, src DevicePtr, bytes int64, stream Stream) {
	cstream := C.CUstream(unsafe.Pointer(uintptr(stream)))
	to, from := C.CUdeviceptr(dst), C.CUdeviceptr(src)
	if err := Result(C.cuMemcpyAsync(to, from, C.size_t(bytes), cstream)); err != SUCCESS {
		panic(err)
	}
}
// MemcpyDtoD copies a number of bytes from one device pointer to another
// using cuMemcpyDtoD.
// It panics with the driver's Result if the copy fails.
func MemcpyDtoD(dst, src DevicePtr, bytes int64) {
	to, from := C.CUdeviceptr(dst), C.CUdeviceptr(src)
	if err := Result(C.cuMemcpyDtoD(to, from, C.size_t(bytes))); err != SUCCESS {
		panic(err)
	}
}
// MemcpyDtoDAsync enqueues a copy of a number of bytes from one device
// pointer to another on the given stream using cuMemcpyDtoDAsync.
// It panics with the driver's Result if the call fails.
func MemcpyDtoDAsync(dst, src DevicePtr, bytes int64, stream Stream) {
	cstream := C.CUstream(unsafe.Pointer(uintptr(stream)))
	to, from := C.CUdeviceptr(dst), C.CUdeviceptr(src)
	if err := Result(C.cuMemcpyDtoDAsync(to, from, C.size_t(bytes), cstream)); err != SUCCESS {
		panic(err)
	}
}
// MemcpyHtoD copies a number of bytes from host memory at src to device
// memory at dst using cuMemcpyHtoD.
// It panics with the driver's Result if the copy fails.
func MemcpyHtoD(dst DevicePtr, src unsafe.Pointer, bytes int64) {
	to := C.CUdeviceptr(dst)
	if err := Result(C.cuMemcpyHtoD(to, src, C.size_t(bytes))); err != SUCCESS {
		panic(err)
	}
}
// MemcpyHtoDAsync enqueues a copy of a number of bytes from host memory at
// src to device memory at dst on the given stream (cuMemcpyHtoDAsync).
// The host memory must be page-locked (see MemRegister)
// It panics with the driver's Result if the call fails.
func MemcpyHtoDAsync(dst DevicePtr, src unsafe.Pointer, bytes int64, stream Stream) {
	cstream := C.CUstream(unsafe.Pointer(uintptr(stream)))
	to := C.CUdeviceptr(dst)
	if err := Result(C.cuMemcpyHtoDAsync(to, src, C.size_t(bytes), cstream)); err != SUCCESS {
		panic(err)
	}
}
// MemcpyDtoH copies a number of bytes from device memory at src to host
// memory at dst using cuMemcpyDtoH.
// It panics with the driver's Result if the copy fails.
func MemcpyDtoH(dst unsafe.Pointer, src DevicePtr, bytes int64) {
	from := C.CUdeviceptr(src)
	if err := Result(C.cuMemcpyDtoH(dst, from, C.size_t(bytes))); err != SUCCESS {
		panic(err)
	}
}
// MemcpyDtoHAsync enqueues a copy of a number of bytes from device memory at
// src to host memory at dst on the given stream (cuMemcpyDtoHAsync).
// The host memory must be page-locked (see MemRegister)
// It panics with the driver's Result if the call fails.
func MemcpyDtoHAsync(dst unsafe.Pointer, src DevicePtr, bytes int64, stream Stream) {
	cstream := C.CUstream(unsafe.Pointer(uintptr(stream)))
	from := C.CUdeviceptr(src)
	if err := Result(C.cuMemcpyDtoHAsync(dst, from, C.size_t(bytes), cstream)); err != SUCCESS {
		panic(err)
	}
}
// MemcpyPeer copies a number of bytes from device memory src in context
// srcCtx to device memory dst in context dstCtx using cuMemcpyPeer.
// It panics with the driver's Result if the copy fails.
func MemcpyPeer(dst DevicePtr, dstCtx Context, src DevicePtr, srcCtx Context, bytes int64) {
	toCtx := C.CUcontext(unsafe.Pointer(uintptr(dstCtx)))
	fromCtx := C.CUcontext(unsafe.Pointer(uintptr(srcCtx)))
	if err := Result(C.cuMemcpyPeer(C.CUdeviceptr(dst), toCtx, C.CUdeviceptr(src), fromCtx, C.size_t(bytes))); err != SUCCESS {
		panic(err)
	}
}
// MemcpyPeerAsync enqueues, on the given stream, a copy of a number of bytes
// from device memory src in context srcCtx to device memory dst in context
// dstCtx (cuMemcpyPeerAsync).
// It panics with the driver's Result if the call fails.
func MemcpyPeerAsync(dst DevicePtr, dstCtx Context, src DevicePtr, srcCtx Context, bytes int64, stream Stream) {
	toCtx := C.CUcontext(unsafe.Pointer(uintptr(dstCtx)))
	fromCtx := C.CUcontext(unsafe.Pointer(uintptr(srcCtx)))
	cstream := C.CUstream(unsafe.Pointer(uintptr(stream)))
	if err := Result(C.cuMemcpyPeerAsync(C.CUdeviceptr(dst), toCtx, C.CUdeviceptr(src), fromCtx, C.size_t(bytes), cstream)); err != SUCCESS {
		panic(err)
	}
}
// MemGetAddressRange returns the size in bytes and the base address of the
// allocation (by MemAlloc) that contains ptr, as reported by
// cuMemGetAddressRange.
// It panics with the driver's Result if the call fails.
func MemGetAddressRange(ptr DevicePtr) (bytes int64, base DevicePtr) {
	var (
		size  C.size_t
		start C.CUdeviceptr
	)
	if err := Result(C.cuMemGetAddressRange(&start, &size, C.CUdeviceptr(ptr))); err != SUCCESS {
		panic(err)
	}
	return int64(size), DevicePtr(start)
}
// GetAddressRange returns the size in bytes and the base address of the
// allocation (by MemAlloc) that contains ptr; it is the method form of
// MemGetAddressRange.
func (ptr DevicePtr) GetAddressRange() (bytes int64, base DevicePtr) {
	bytes, base = MemGetAddressRange(ptr)
	return bytes, base
}
// Bytes returns the size in bytes of the allocation (by MemAlloc) that
// contains ptr; the base address reported alongside it is discarded.
func (ptr DevicePtr) Bytes() (bytes int64) {
	size, _ := MemGetAddressRange(ptr)
	return size
}
// MemGetInfo returns the free and total amount of memory in the current
// Context (in bytes), as reported by cuMemGetInfo.
// It panics with the driver's Result if the call fails.
func MemGetInfo() (free, total int64) {
	var available, capacity C.size_t
	if err := Result(C.cuMemGetInfo(&available, &capacity)); err != SUCCESS {
		panic(err)
	}
	return int64(available), int64(capacity)
}
// Page-locks memory specified by the pointer and bytes.
// The pointer and byte size must be aligned to the host page size (4KB)
// See also: MemHostUnregister()
// doesn't link with cuda6.5
//func MemHostRegister(ptr unsafe.Pointer, bytes int64, flags MemHostRegisterFlag) {
// err := Result(C.cuMemHostRegister(ptr, C.size_t(bytes), C.uint(flags)))
// if err != SUCCESS {
// panic(err)
// }
//}
// Unmaps memory locked by MemHostRegister().
// doesn't link with cuda6.5
//func MemHostUnregister(ptr unsafe.Pointer) {
// err := Result(C.cuMemHostUnregister(ptr))
// if err != SUCCESS {
// panic(err)
// }
//}
// MemAllocHost allocates the given number of bytes of host memory through
// cuMemAllocHost and returns a pointer to it.
// It panics with the driver's Result if the allocation fails.
func MemAllocHost(bytes int64) unsafe.Pointer {
	var host unsafe.Pointer
	if err := Result(C.cuMemAllocHost(&host, C.size_t(bytes))); err != SUCCESS {
		panic(err)
	}
	return host
}
// MemFreeHost releases host memory through cuMemFreeHost.
// It panics with the driver's Result if the call fails.
func MemFreeHost(ptr unsafe.Pointer) {
	if err := Result(C.cuMemFreeHost(ptr)); err != SUCCESS {
		panic(err)
	}
}
// MemHostRegisterFlag is the flag type accepted by MemHostRegister.
// NOTE(review): MemHostRegister is currently commented out in this file, so
// these flags have no visible user here.
type MemHostRegisterFlag int
// Flag for MemHostRegister
const (
// Memory is pinned in all CUDA contexts.
MEMHOSTREGISTER_PORTABLE MemHostRegisterFlag = C.CU_MEMHOSTREGISTER_PORTABLE
// Maps the allocation in CUDA address space. TODO(a): cuMemHostGetDevicePointer()
MEMHOSTREGISTER_DEVICEMAP MemHostRegisterFlag = C.CU_MEMHOSTREGISTER_DEVICEMAP
)
// String formats the device pointer the way fmt prints an unsafe.Pointer
// holding the same address.
func (p DevicePtr) String() string {
	addr := unsafe.Pointer(uintptr(p))
	return fmt.Sprint(addr)
}
// Type size in bytes
const (
SIZEOF_FLOAT32 = 4 // float32
SIZEOF_FLOAT64 = 8 // float64
SIZEOF_COMPLEX64 = 8 // complex64
SIZEOF_COMPLEX128 = 16 // complex128
)
// Physical memory type of device pointer.
type MemoryType uint
// Known memory types; the values come from the driver's CU_MEMORYTYPE_*
// enumerators.
const (
MemoryTypeHost MemoryType = C.CU_MEMORYTYPE_HOST
MemoryTypeDevice MemoryType = C.CU_MEMORYTYPE_DEVICE
MemoryTypeArray MemoryType = C.CU_MEMORYTYPE_ARRAY
MemoryTypeUnified MemoryType = C.CU_MEMORYTYPE_UNIFIED
)
// memorytype maps each known MemoryType to its printable name; it backs
// MemoryType.String.
var memorytype = map[MemoryType]string{
MemoryTypeHost: "MemoryTypeHost",
MemoryTypeDevice: "MemoryTypeDevice",
MemoryTypeArray: "MemoryTypeArray",
MemoryTypeUnified: "MemoryTypeUnified"}
// String returns the name of the memory type, or "MemoryTypeUnknown" for a
// value that has no entry in the name table.
func (t MemoryType) String() string {
	name, known := memorytype[t]
	if !known {
		return "MemoryTypeUnknown"
	}
	return name
}
// PointerGetAttributeMemoryType returns the physical memory type that ptr
// addresses, queried with cuPointerGetAttribute. Unlike most functions in
// this package it does not panic: the driver's Result is returned to the
// caller alongside the type.
func PointerGetAttributeMemoryType(ptr DevicePtr) (t MemoryType, err Result) {
	// 64 bits of storage: deliberately generous room for the driver to
	// write the attribute into.
	var raw uint64
	err = Result(C.cuPointerGetAttribute(
		unsafe.Pointer(&raw),
		C.CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
		C.CUdeviceptr(uintptr(ptr))))
	t = MemoryType(uint(raw))
	return t, err
}
// MemoryType returns the physical memory type that ptr addresses.
// It panics with the driver's Result if the query fails.
func (ptr DevicePtr) MemoryType() MemoryType {
	kind, status := PointerGetAttributeMemoryType(ptr)
	if status != SUCCESS {
		panic(status)
	}
	return kind
}
================================================
FILE: cu/memory_test.go
================================================
package cu
import (
"fmt"
"math"
"testing"
"unsafe"
)
// TestMalloc allocates and releases a 16 MiB device buffer 1024 times through
// the Free method and then 1024 times through MemFree.
func TestMalloc(t *testing.T) {
	const (
		rounds = 1024
		size   = 16 * 1024 * 1024
	)
	for n := 0; n < rounds; n++ {
		p := MemAlloc(size)
		p.Free()
	}
	for n := 0; n < rounds; n++ {
		p := MemAlloc(size)
		MemFree(p)
	}
}
// BenchmarkMallocFree1B times an allocate/free pair for a 1-byte buffer.
func BenchmarkMallocFree1B(b *testing.B) {
	const size = 1
	for n := 0; n < b.N; n++ {
		buf := MemAlloc(size)
		buf.Free()
	}
}
// BenchmarkMallocFree1kB times an allocate/free pair for a 1 KiB buffer.
func BenchmarkMallocFree1kB(b *testing.B) {
	const size = 1024
	for n := 0; n < b.N; n++ {
		buf := MemAlloc(size)
		buf.Free()
	}
}
// BenchmarkMallocFree1MB times an allocate/free pair for a 1 MiB buffer.
func BenchmarkMallocFree1MB(b *testing.B) {
	const size = 1024 * 1024
	for n := 0; n < b.N; n++ {
		buf := MemAlloc(size)
		buf.Free()
	}
}
// TestMemAddressRange checks that MemGetAddressRange, the GetAddressRange
// method and the Bytes method all report the size (and, where applicable, the
// base address) of a fresh allocation.
func TestMemAddressRange(t *testing.T) {
	const N = 12345
	ptr := MemAlloc(N)
	defer ptr.Free() // previously the allocation was never released

	size, base := MemGetAddressRange(ptr)
	if size != N {
		t.Errorf("MemGetAddressRange: size = %d, want %d", size, N)
	}
	if base != ptr {
		t.Errorf("MemGetAddressRange: base = %v, want %v", base, ptr)
	}

	// The method form used to be called with its results thrown away;
	// verify them as well.
	size, base = ptr.GetAddressRange()
	if size != N {
		t.Errorf("GetAddressRange: size = %d, want %d", size, N)
	}
	if base != ptr {
		t.Errorf("GetAddressRange: base = %v, want %v", base, ptr)
	}

	if got := ptr.Bytes(); got != N {
		t.Errorf("Bytes() = %d, want %d", got, N)
	}
}
// TestMemGetInfo prints the free/total memory reported by MemGetInfo and
// fails when free exceeds total or when total is zero.
func TestMemGetInfo(t *testing.T) {
	avail, capacity := MemGetInfo()
	fmt.Println("MemGetInfo: ", avail, "/", capacity)
	if avail > capacity {
		t.Fail()
	}
	if capacity == 0 {
		t.Fail()
	}
}
// TestMemsetAsync fills a device buffer with 42 and then its first half with
// 21, both through MemsetD32Async on one stream, copies the buffer back and
// checks both halves.
func TestMemsetAsync(t *testing.T) {
	const N = int64(32 * 1024)

	src := make([]float32, N)
	for i := range src {
		src[i] = float32(i)
	}
	dst := make([]float32, N)

	dev := MemAlloc(int64(4 * N))
	MemcpyHtoD(dev, unsafe.Pointer(&src[0]), 4*N)

	stream := StreamCreate()
	MemsetD32Async(dev, math.Float32bits(42), N, stream)
	MemsetD32Async(dev, math.Float32bits(21), N/2, stream)
	MemcpyDtoH(unsafe.Pointer(&dst[0]), dev, 4*N)
	stream.Synchronize()
	stream.Destroy()

	half := len(dst) / 2
	for _, v := range dst[:half] {
		if v != 21 {
			t.Fail()
		}
	}
	for _, v := range dst[half:] {
		if v != 42 {
			t.Fail()
		}
	}
	dev.Free()
}
// TestMemset fills a device buffer with 42 and then its first half with 21
// using MemsetD32, copies the buffer back and checks both halves.
func TestMemset(t *testing.T) {
	const N = int64(32 * 1024)

	src := make([]float32, N)
	for i := range src {
		src[i] = float32(i)
	}
	dst := make([]float32, N)

	dev := MemAlloc(int64(4 * N))
	MemcpyHtoD(dev, unsafe.Pointer(&src[0]), 4*N)
	MemsetD32(dev, math.Float32bits(42), N)
	MemsetD32(dev, math.Float32bits(21), N/2)
	MemcpyDtoH(unsafe.Pointer(&dst[0]), dev, 4*N)

	half := len(dst) / 2
	for _, v := range dst[:half] {
		if v != 21 {
			t.Fail()
		}
	}
	for _, v := range dst[half:] {
		if v != 42 {
			t.Fail()
		}
	}
	dev.Free()
}
// TestMemcpy round-trips a ramp of float32 values host -> device -> device ->
// host with the synchronous copy functions and checks it arrives unchanged.
func TestMemcpy(t *testing.T) {
	const N = int64(32 * 1024)

	src := make([]float32, N)
	for i := range src {
		src[i] = float32(i)
	}
	dst := make([]float32, N)

	devA := MemAlloc(int64(4 * N))
	devB := MemAlloc(int64(4 * N))
	MemcpyHtoD(devA, unsafe.Pointer(&src[0]), 4*N)
	MemcpyDtoD(devB, devA, 4*N)
	MemcpyDtoH(unsafe.Pointer(&dst[0]), devB, 4*N)

	for i, v := range dst {
		if v != float32(i) {
			t.Fail()
		}
	}
	devA.Free()
	devB.Free()
}
// TestMemcpyAsync round-trips a ramp of float32 values host -> device ->
// device -> host with the asynchronous copy functions on one stream, waits
// for the stream and checks the data arrives unchanged.
func TestMemcpyAsync(t *testing.T) {
	N := int64(32 * 1024)
	host1 := make([]float32, N)
	for i := range host1 {
		host1[i] = float32(i)
	}
	host2 := make([]float32, N)
	dev1 := MemAlloc(int64(4 * N))
	dev2 := MemAlloc(int64(4 * N))
	stream := StreamCreate()
	// The stream used to be left alive after the test; release it once all
	// work has completed.
	defer stream.Destroy()
	MemcpyHtoDAsync(dev1, (unsafe.Pointer(&host1[0])), 4*N, stream)
	MemcpyDtoDAsync(dev2, dev1, 4*N, stream)
	MemcpyDtoHAsync((unsafe.Pointer(&host2[0])), dev2, 4*N, stream)
	stream.Synchronize()
	for i := range host2 {
		if host2[i] != float32(i) {
			t.Fail()
		}
	}
	dev1.Free()
	dev2.Free()
}
// TestMemcpyAsyncRegistered round-trips a ramp of float32 values host ->
// device -> device -> host with the asynchronous copy functions on one
// stream, waits for the stream and checks the data arrives unchanged.
// NOTE(review): despite the name, the host buffers are ordinary Go slices;
// presumably they were meant to be page-locked via MemHostRegister, which is
// currently commented out in memory.go.
func TestMemcpyAsyncRegistered(t *testing.T) {
	N := int64(32 * 1024)
	host1 := make([]float32, N)
	for i := range host1 {
		host1[i] = float32(i)
	}
	host2 := make([]float32, N)
	dev1 := MemAlloc(int64(4 * N))
	dev2 := MemAlloc(int64(4 * N))
	stream := StreamCreate()
	// The stream used to be left alive after the test; release it once all
	// work has completed.
	defer stream.Destroy()
	MemcpyHtoDAsync(dev1, (unsafe.Pointer(&host1[0])), 4*N, stream)
	MemcpyDtoDAsync(dev2, dev1, 4*N, stream)
	MemcpyDtoHAsync((unsafe.Pointer(&host2[0])), dev2, 4*N, stream)
	stream.Synchronize()
	for i := range host2 {
		if host2[i] != float32(i) {
			t.Fail()
		}
	}
	dev1.Free()
	dev2.Free()
}
// BenchmarkMemcpy times one host -> device -> device -> host round trip of
// 32Mi float32 values per iteration; set-up is excluded from the timing.
func BenchmarkMemcpy(b *testing.B) {
	b.StopTimer()
	const N = int64(32 * 1024 * 1024)
	src := make([]float32, N)
	dst := make([]float32, N)
	devA := MemAlloc(int64(4 * N))
	defer devA.Free()
	devB := MemAlloc(int64(4 * N))
	defer devB.Free()
	b.SetBytes(4 * N)
	b.StartTimer()

	for n := 0; n < b.N; n++ {
		MemcpyHtoD(devA, unsafe.Pointer(&src[0]), 4*N)
		MemcpyDtoD(devB, devA, 4*N)
		MemcpyDtoH(unsafe.Pointer(&dst[0]), devB, 4*N)
	}
}
================================================
FILE: cu/memset.go
================================================
package cu
// This file implements CUDA memset functions.
//#include <cuda.h>
import "C"
import (
"unsafe"
)
// MemsetD32 sets the first N 32-bit values of the array at deviceptr to value
// using cuMemsetD32.
// NOTE(review): the earlier comment described this call as asynchronous;
// confirm against the driver documentation.
// It panics with the driver's Result if the call fails.
func MemsetD32(deviceptr DevicePtr, value uint32, N int64) {
	dst := C.CUdeviceptr(deviceptr)
	if err := Result(C.cuMemsetD32(dst, C.uint(value), C.size_t(N))); err != SUCCESS {
		panic(err)
	}
}
// MemsetD32Async enqueues, on the given stream, setting the first N 32-bit
// values of the array at deviceptr to value (cuMemsetD32Async).
// It panics with the driver's Result if the call fails.
func MemsetD32Async(deviceptr DevicePtr, value uint32, N int64, stream Stream) {
	cstream := C.CUstream(unsafe.Pointer(uintptr(stream)))
	dst := C.CUdeviceptr(deviceptr)
	if err := Result(C.cuMemsetD32Async(dst, C.uint(value), C.size_t(N), cstream)); err != SUCCESS {
		panic(err)
	}
}
// MemsetD8 sets the first N 8-bit values of the array at deviceptr to value
// using cuMemsetD8.
// NOTE(review): the earlier comment described this call as asynchronous;
// confirm against the driver documentation.
// It panics with the driver's Result if the call fails.
func MemsetD8(deviceptr DevicePtr, value uint8, N int64) {
	dst := C.CUdeviceptr(deviceptr)
	if err := Result(C.cuMemsetD8(dst, C.uchar(value), C.size_t(N))); err != SUCCESS {
		panic(err)
	}
}
// MemsetD8Async enqueues, on the given stream, setting the first N 8-bit
// values of the array at deviceptr to value (cuMemsetD8Async).
// It panics with the driver's Result if the call fails.
func MemsetD8Async(deviceptr DevicePtr, value uint8, N int64, stream Stream) {
	cstream := C.CUstream(unsafe.Pointer(uintptr(stream)))
	dst := C.CUdeviceptr(deviceptr)
	if err := Result(C.cuMemsetD8Async(dst, C.uchar(value), C.size_t(N), cstream)); err != SUCCESS {
		panic(err)
	}
}
================================================
FILE: cu/module.go
================================================
package cu
// This file implements loading of CUDA ptx modules
//#include <cuda.h>
import "C"
import (
"unsafe"
)
// Represents a CUDA CUmodule, a reference to executable device code.
// The handle is stored as an integer and converted back to C.CUmodule
// wherever it is passed to the driver.
type Module uintptr
// ModuleLoad loads a compute module from the file fname using cuModuleLoad
// and returns its handle.
// It panics with the driver's Result if loading fails.
func ModuleLoad(fname string) Module {
	// C.CString allocates on the C heap; release the copy once the driver
	// call has returned (it used to be leaked on every call).
	cname := C.CString(fname)
	defer C.free(unsafe.Pointer(cname))

	var mod C.CUmodule
	err := Result(C.cuModuleLoad(&mod, cname))
	if err != SUCCESS {
		panic(err)
	}
	return Module(uintptr(unsafe.Pointer(mod)))
}
// ModuleLoadData loads a compute module from the in-memory image (passed as
// a string) using cuModuleLoadData and returns its handle.
// It panics with the driver's Result if loading fails.
func ModuleLoadData(image string) Module {
	// C.CString allocates on the C heap; release the copy once the driver
	// call has returned (it used to be leaked on every call).
	cimage := C.CString(image)
	defer C.free(unsafe.Pointer(cimage))

	var mod C.CUmodule
	err := Result(C.cuModuleLoadData(&mod, unsafe.Pointer(cimage)))
	if err != SUCCESS {
		panic(err)
	}
	return Module(uintptr(unsafe.Pointer(mod)))
}
// ModuleGetFunction looks up the function called name in module using
// cuModuleGetFunction and returns its handle.
// It panics with the driver's Result if the lookup fails.
func ModuleGetFunction(module Module, name string) Function {
	// C.CString allocates on the C heap; release the copy once the driver
	// call has returned (it used to be leaked on every call).
	cname := C.CString(name)
	defer C.free(unsafe.Pointer(cname))

	var function C.CUfunction
	err := Result(C.cuModuleGetFunction(
		&function,
		C.CUmodule(unsafe.Pointer(uintptr(module))),
		cname))
	if err != SUCCESS {
		panic(err)
	}
	return Function(uintptr(unsafe.Pointer(function)))
}
// GetFunction returns the handle of the function called name in module m; it
// is the method form of ModuleGetFunction.
func (m Module) GetFunction(name string) Function {
	fn := ModuleGetFunction(m, name)
	return fn
}
================================================
FILE: cu/module_test.go
================================================
package cu
import (
"testing"
"unsafe"
//"fmt"
)
// TestModule loads the test PTX module, launches its testMemset kernel to
// write 42 into the first half of a zeroed device array, and checks that the
// first half holds 42 while the second half is still 0.
func TestModule(test *testing.T) {
	// The fixture lives in the package's testdata directory and tests run
	// with the package directory as working directory, so the path must be
	// relative. The previous "/testdata/..." pointed at the filesystem root.
	mod := ModuleLoad("testdata/testmodule.ptx")
	f := mod.GetFunction("testMemset")

	N := 1000
	N4 := 4 * int64(N)
	a := make([]float32, N)
	A := MemAlloc(N4)
	defer A.Free()
	aptr := unsafe.Pointer(&a[0])
	MemcpyHtoD(A, aptr, N4)

	var value float32
	value = 42
	var n int
	n = N / 2

	block := 128
	grid := DivUp(N, block)
	shmem := 0
	args := []unsafe.Pointer{unsafe.Pointer(&A), unsafe.Pointer(&value), unsafe.Pointer(&n)}
	LaunchKernel(f, grid, 1, 1, block, 1, 1, shmem, 0, args)

	MemcpyDtoH(aptr, A, N4)
	for i := 0; i < N/2; i++ {
		if a[i] != 42 {
			test.Fail()
		}
	}
	for i := N / 2; i < N; i++ {
		if a[i] != 0 {
			test.Fail()
		}
	}
	//fmt.Println(a)
}
// DivUp returns x divided by y, rounded up to the next integer.
// y must be positive. For x == 0 the result is 0; the former expression
// ((x-1)/y)+1 returned 1 there because Go's integer division truncates
// toward zero.
func DivUp(x, y int) int {
	return (x + y - 1) / y
}
================================================
FILE: cu/peer.go
================================================
package cu
// This file implements CUDA unified addressing.
//#include <cuda.h>
import "C"
import (
"unsafe"
)
// CtxEnablePeerAccess makes allocations from the peer Context available to
// the current context (cuCtxEnablePeerAccess, with flags fixed to 0).
// It panics with the driver's Result if the call fails.
func CtxEnablePeerAccess(peer Context) {
	cpeer := C.CUcontext(unsafe.Pointer(uintptr(peer)))
	if err := Result(C.cuCtxEnablePeerAccess(cpeer, C.uint(0))); err != SUCCESS {
		panic(err)
	}
}
// EnablePeerAccess makes allocations from the peer Context available to the
// current context; it is the method form of CtxEnablePeerAccess.
func (peer Context) EnablePeerAccess() {
	ctx := peer
	CtxEnablePeerAccess(ctx)
}
// CtxDisablePeerAccess reverses CtxEnablePeerAccess() by calling
// cuCtxDisablePeerAccess for the peer Context.
// It panics with the driver's Result if the call fails.
func CtxDisablePeerAccess(peer Context) {
	cpeer := C.CUcontext(unsafe.Pointer(uintptr(peer)))
	if err := Result(C.cuCtxDisablePeerAccess(cpeer)); err != SUCCESS {
		panic(err)
	}
}
// DisablePeerAccess reverses EnablePeerAccess(); it is the method form of
// CtxDisablePeerAccess.
func (peer Context) DisablePeerAccess() {
	ctx := peer
	CtxDisablePeerAccess(ctx)
}
// DeviceCanAccessPeer returns true if CtxEnablePeerAccess can be called on a
// context for dev and peer, i.e. when cuDeviceCanAccessPeer reports a
// non-zero value.
// It panics with the driver's Result if the query fails.
func DeviceCanAccessPeer(dev, peer Device) bool {
	var flag C.int
	if err := Result(C.cuDeviceCanAccessPeer(&flag, C.CUdevice(dev), C.CUdevice(peer))); err != SUCCESS {
		panic(err)
	}
	return flag != 0
}
// CanAccessPeer returns true if CtxEnablePeerAccess can be called on a
// context for dev and peer; it is the method form of DeviceCanAccessPeer.
func (dev Device) CanAccessPeer(peer Device) bool {
	ok := DeviceCanAccessPeer(dev, peer)
	return ok
}
================================================
FILE: cu/result.go
================================================
package cu
// This file provides access to CUDA driver error statuses (type CUresult).
//#include <cuda.h>
import "C"
import (
"fmt"
)
// CUDA error status.
// CUDA error statuses are not returned by functions but checked and passed to
// panic() when not successful. If desired, they can be caught by
// recover().
// The value is the integer CUresult code returned by the driver; SUCCESS
// marks a call that did not fail.
type Result int
// String returns the message for the error status, or
// "Unknown CUresult: <code>" when the code has no entry in the message table.
func (err Result) String() string {
	if msg, known := errorString[err]; known {
		return msg
	}
	return "Unknown CUresult: " + fmt.Sprint(int(err))
}
// Result values. Each constant takes its value from the cuda.h CUresult
// enumerator of the same name (with the CUDA_ prefix), except where noted
// below.
const (
SUCCESS Result = C.CUDA_SUCCESS
ERROR_INVALID_VALUE Result = C.CUDA_ERROR_INVALID_VALUE
ERROR_OUT_OF_MEMORY Result = C.CUDA_ERROR_OUT_OF_MEMORY
ERROR_NOT_INITIALIZED Result = C.CUDA_ERROR_NOT_INITIALIZED
ERROR_DEINITIALIZED Result = C.CUDA_ERROR_DEINITIALIZED
ERROR_PROFILER_DISABLED Result = C.CUDA_ERROR_PROFILER_DISABLED
ERROR_PROFILER_NOT_INITIALIZED Result = C.CUDA_ERROR_PROFILER_NOT_INITIALIZED
ERROR_PROFILER_ALREADY_STARTED Result = C.CUDA_ERROR_PROFILER_ALREADY_STARTED
ERROR_PROFILER_ALREADY_STOPPED Result = C.CUDA_ERROR_PROFILER_ALREADY_STOPPED
ERROR_NO_DEVICE Result = C.CUDA_ERROR_NO_DEVICE
ERROR_INVALID_DEVICE Result = C.CUDA_ERROR_INVALID_DEVICE
ERROR_INVALID_IMAGE Result = C.CUDA_ERROR_INVALID_IMAGE
ERROR_INVALID_CONTEXT Result = C.CUDA_ERROR_INVALID_CONTEXT
ERROR_CONTEXT_ALREADY_CURRENT Result = C.CUDA_ERROR_CONTEXT_ALREADY_CURRENT
ERROR_MAP_FAILED Result = C.CUDA_ERROR_MAP_FAILED
ERROR_UNMAP_FAILED Result = C.CUDA_ERROR_UNMAP_FAILED
ERROR_ARRAY_IS_MAPPED Result = C.CUDA_ERROR_ARRAY_IS_MAPPED
ERROR_ALREADY_MAPPED Result = C.CUDA_ERROR_ALREADY_MAPPED
ERROR_NO_BINARY_FOR_GPU Result = C.CUDA_ERROR_NO_BINARY_FOR_GPU
ERROR_ALREADY_ACQUIRED Result = C.CUDA_ERROR_ALREADY_ACQUIRED
ERROR_NOT_MAPPED Result = C.CUDA_ERROR_NOT_MAPPED
ERROR_NOT_MAPPED_AS_ARRAY Result = C.CUDA_ERROR_NOT_MAPPED_AS_ARRAY
ERROR_NOT_MAPPED_AS_POINTER Result = C.CUDA_ERROR_NOT_MAPPED_AS_POINTER
ERROR_ECC_UNCORRECTABLE Result = C.CUDA_ERROR_ECC_UNCORRECTABLE
ERROR_UNSUPPORTED_LIMIT Result = C.CUDA_ERROR_UNSUPPORTED_LIMIT
ERROR_CONTEXT_ALREADY_IN_USE Result = C.CUDA_ERROR_CONTEXT_ALREADY_IN_USE
ERROR_INVALID_SOURCE Result = C.CUDA_ERROR_INVALID_SOURCE
ERROR_FILE_NOT_FOUND Result = C.CUDA_ERROR_FILE_NOT_FOUND
ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND Result = C.CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND
ERROR_SHARED_OBJECT_INIT_FAILED Result = C.CUDA_ERROR_SHARED_OBJECT_INIT_FAILED
ERROR_OPERATING_SYSTEM Result = C.CUDA_ERROR_OPERATING_SYSTEM
ERROR_INVALID_HANDLE Result = C.CUDA_ERROR_INVALID_HANDLE
ERROR_NOT_FOUND Result = C.CUDA_ERROR_NOT_FOUND
ERROR_NOT_READY Result = C.CUDA_ERROR_NOT_READY
ERROR_LAUNCH_FAILED Result = C.CUDA_ERROR_LAUNCH_FAILED
ERROR_LAUNCH_OUT_OF_RESOURCES Result = C.CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES
ERROR_LAUNCH_TIMEOUT Result = C.CUDA_ERROR_LAUNCH_TIMEOUT
ERROR_LAUNCH_INCOMPATIBLE_TEXTURING Result = C.CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING
ERROR_PEER_ACCESS_ALREADY_ENABLED Result = C.CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED
ERROR_PEER_ACCESS_NOT_ENABLED Result = C.CUDA_ERROR_PEER_ACCESS_NOT_ENABLED
ERROR_PRIMARY_CONTEXT_ACTIVE Result = C.CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE
ERROR_CONTEXT_IS_DESTROYED Result = C.CUDA_ERROR_CONTEXT_IS_DESTROYED
ERROR_ASSERT Result = C.CUDA_ERROR_ASSERT
ERROR_TOO_MANY_PEERS Result = C.CUDA_ERROR_TOO_MANY_PEERS
ERROR_HOST_MEMORY_ALREADY_REGISTERED Result = C.CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED
ERROR_HOST_MEMORY_NOT_REGISTERED Result = C.CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED
// The next seven codes are written as numeric literals; the cuda.h name each
// one stands for is kept in the trailing comment.
// NOTE(review): presumably these names are absent from the cuda.h version
// this package is built against -- confirm before switching to C.* references.
ERROR_HARDWARE_STACK_ERROR Result = 714 //C.CUDA_ERROR_HARDWARE_STACK_ERROR
ERROR_ILLEGAL_INSTRUCTION Result = 715 //C.CUDA_ERROR_ILLEGAL_INSTRUCTION
ERROR_MISALIGNED_ADDRESS Result = 716 //C.CUDA_ERROR_MISALIGNED_ADDRESS
ERROR_INVALID_ADDRESS_SPACE Result = 717 //C.CUDA_ERROR_INVALID_ADDRESS_SPACE
ERROR_INVALID_PC Result = 718 //C.CUDA_ERROR_INVALID_PC
ERROR_NOT_PERMITTED Result = 800 //C.CUDA_ERROR_NOT_PERMITTED
ERROR_NOT_SUPPORTED Result = 801 //C.CUDA_ERROR_NOT_SUPPORTED
ERROR_UNKNOWN Result = C.CUDA_ERROR_UNKNOWN
)
// errorString maps every Result constant declared in this file to the name
// of its CUresult enumerator. Result.String looks names up here.
var errorString = map[Result]string{
	SUCCESS: "CUDA_SUCCESS",
	ERROR_INVALID_VALUE: "CUDA_ERROR_INVALID_VALUE",
	ERROR_OUT_OF_MEMORY: "CUDA_ERROR_OUT_OF_MEMORY",
	ERROR_NOT_INITIALIZED: "CUDA_ERROR_NOT_INITIALIZED",
	ERROR_DEINITIALIZED: "CUDA_ERROR_DEINITIALIZED",

	// Profiler.
	ERROR_PROFILER_DISABLED: "CUDA_ERROR_PROFILER_DISABLED",
	ERROR_PROFILER_NOT_INITIALIZED: "CUDA_ERROR_PROFILER_NOT_INITIALIZED",
	ERROR_PROFILER_ALREADY_STARTED: "CUDA_ERROR_PROFILER_ALREADY_STARTED",
	ERROR_PROFILER_ALREADY_STOPPED: "CUDA_ERROR_PROFILER_ALREADY_STOPPED",

	// Devices, images and contexts.
	ERROR_NO_DEVICE: "CUDA_ERROR_NO_DEVICE",
	ERROR_INVALID_DEVICE: "CUDA_ERROR_INVALID_DEVICE",
	ERROR_INVALID_IMAGE: "CUDA_ERROR_INVALID_IMAGE",
	ERROR_INVALID_CONTEXT: "CUDA_ERROR_INVALID_CONTEXT",
	ERROR_CONTEXT_ALREADY_CURRENT: "CUDA_ERROR_CONTEXT_ALREADY_CURRENT",

	// Mapping and resources.
	ERROR_MAP_FAILED: "CUDA_ERROR_MAP_FAILED",
	ERROR_UNMAP_FAILED: "CUDA_ERROR_UNMAP_FAILED",
	ERROR_ARRAY_IS_MAPPED: "CUDA_ERROR_ARRAY_IS_MAPPED",
	ERROR_ALREADY_MAPPED: "CUDA_ERROR_ALREADY_MAPPED",
	ERROR_NO_BINARY_FOR_GPU: "CUDA_ERROR_NO_BINARY_FOR_GPU",
	ERROR_ALREADY_ACQUIRED: "CUDA_ERROR_ALREADY_ACQUIRED",
	ERROR_NOT_MAPPED: "CUDA_ERROR_NOT_MAPPED",
	ERROR_NOT_MAPPED_AS_ARRAY: "CUDA_ERROR_NOT_MAPPED_AS_ARRAY",
	ERROR_NOT_MAPPED_AS_POINTER: "CUDA_ERROR_NOT_MAPPED_AS_POINTER",
	ERROR_ECC_UNCORRECTABLE: "CUDA_ERROR_ECC_UNCORRECTABLE",
	ERROR_UNSUPPORTED_LIMIT: "CUDA_ERROR_UNSUPPORTED_LIMIT",
	ERROR_CONTEXT_ALREADY_IN_USE: "CUDA_ERROR_CONTEXT_ALREADY_IN_USE",

	// Sources, files and handles.
	ERROR_INVALID_SOURCE: "CUDA_ERROR_INVALID_SOURCE",
	ERROR_FILE_NOT_FOUND: "CUDA_ERROR_FILE_NOT_FOUND",
	ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND",
	ERROR_SHARED_OBJECT_INIT_FAILED: "CUDA_ERROR_SHARED_OBJECT_INIT_FAILED",
	ERROR_OPERATING_SYSTEM: "CUDA_ERROR_OPERATING_SYSTEM",
	ERROR_INVALID_HANDLE: "CUDA_ERROR_INVALID_HANDLE",
	ERROR_NOT_FOUND: "CUDA_ERROR_NOT_FOUND",
	ERROR_NOT_READY: "CUDA_ERROR_NOT_READY",

	// Kernel launches.
	ERROR_LAUNCH_FAILED: "CUDA_ERROR_LAUNCH_FAILED",
	ERROR_LAUNCH_OUT_OF_RESOURCES: "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES",
	ERROR_LAUNCH_TIMEOUT: "CUDA_ERROR_LAUNCH_TIMEOUT",
	ERROR_LAUNCH_INCOMPATIBLE_TEXTURING: "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING",

	// Peer access, contexts and host memory registration.
	ERROR_PEER_ACCESS_ALREADY_ENABLED: "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED",
	ERROR_PEER_ACCESS_NOT_ENABLED: "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED",
	ERROR_PRIMARY_CONTEXT_ACTIVE: "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE",
	ERROR_CONTEXT_IS_DESTROYED: "CUDA_ERROR_CONTEXT_IS_DESTROYED",
	ERROR_ASSERT: "CUDA_ERROR_ASSERT",
	ERROR_TOO_MANY_PEERS: "CUDA_ERROR_TOO_MANY_PEERS",
	ERROR_HOST_MEMORY_ALREADY_REGISTERED: "CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED",
	ERROR_HOST_MEMORY_NOT_REGISTERED: "CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED",

	// Codes declared as numeric literals in the const block.
	ERROR_HARDWARE_STACK_ERROR: "CUDA_ERROR_HARDWARE_STACK_ERROR",
	ERROR_ILLEGAL_INSTRUCTION: "CUDA_ERROR_ILLEGAL_INSTRUCTION",
	ERROR_MISALIGNED_ADDRESS: "CUDA_ERROR_MISALIGNED_ADDRESS",
	ERROR_INVALID_ADDRESS_SPACE: "CUDA_ERROR_INVALID_ADDRESS_SPACE",
	ERROR_INVALID_PC: "CUDA_ERROR_INVALID_PC",
	ERROR_NOT_PERMITTED: "CUDA_ERROR_NOT_PERMITTED",
	ERROR_NOT_SUPPORTED: "CUDA_ERROR_NOT_SUPPORTED",

	ERROR_UNKNOWN: "CUDA_ERROR_UNKNOWN",
}
================================================
FILE: cu/runtimeapi.go
================================================
package cu
// This file implements parts of the CUDA runtime api instead of the driver
// api the rest of this package uses.
// It might be useful to move this to a separate package at some point.
//#include <cuda_runtime.h>
import "C"
import "unsafe"
// SetDevice sets device as the current device by calling cudaSetDevice from
// the CUDA runtime API. It panics with the returned status unless that
// status equals SUCCESS.
// NOTE(review): the runtime status is converted to the driver-API Result
// type; the two numberings may differ for non-zero codes, so the panic
// message could name the wrong error -- confirm against the CUDA headers.
func SetDevice(device Device) {
	if res := Result(C.cudaSetDevice(C.int(device))); res != SUCCESS {
		panic(res)
	}
}
// DeviceReset resets the state of the current device by calling
// cudaDeviceReset from the CUDA runtime API. It panics with the returned
// status (converted to Result) unless that status equals SUCCESS.
func DeviceReset() {
	if res := Result(C.cudaDeviceReset()); res != SUCCESS {
		panic(res)
	}
}
// SetDeviceFlags sets the CUDA device flags by passing flags to
// cudaSetDeviceFlags from the CUDA runtime API. It panics with the returned
// status (converted to Result) unless that status equals SUCCESS.
func SetDeviceFlags(flags uint) {
	if res := Result(C.cudaSetDeviceFlags(C.uint(flags))); res != SUCCESS {
		panic(res)
	}
}
// Flags for SetDeviceFlags. Each constant takes its value from the CUDA
// runtime header definition named on its right-hand side.
const (
// The default, decides to yield or not based on active CUDA threads and processors.
DeviceAuto = C.cudaDeviceScheduleAuto
// Actively spin while waiting for device.
DeviceSpin = C.cudaDeviceScheduleSpin
// Yield when waiting.
DeviceYield = C.cudaDeviceScheduleYield
// Block the CPU on sync.
DeviceScheduleBlockingSync = C.cudaDeviceScheduleBlockingSync
// Block the CPU on sync. Deprecated since CUDA 4.0.
DeviceBlockingSync = C.cudaDeviceBlockingSync
// For use with pinned host memory.
DeviceMapHost = C.cudaDeviceMapHost
// Do not reduce local memory to try and prevent thrashing.
DeviceLmemResizeToMax = C.cudaDeviceLmemResizeToMax
)
// Malloc requests bytes bytes from cudaMalloc (CUDA runtime API) and returns
// the resulting pointer as a DevicePtr. It panics with the returned status
// (converted to Result) unless that status equals SUCCESS.
func Malloc(bytes int64) DevicePtr {
	var p unsafe.Pointer
	if res := Result(C.cudaMalloc(&p, C.size_t(bytes))); res != SUCCESS {
		panic(res)
	}
	return DevicePtr(p)
}
// MallocHost requests bytes bytes from cudaMallocHost (CUDA runtime API) and
// returns the resulting pointer. It panics with the returned status
// (converted to Result) unless that status equals SUCCESS.
func MallocHost(bytes int64) unsafe.Pointer {
	var hostPtr unsafe.Pointer
	if res := Result(C.cudaMallocHost(&hostPtr, C.size_t(bytes))); res != SUCCESS {
		panic(res)
	}
	return hostPtr
}
// FreeHost passes ptr to cudaFreeHost (CUDA runtime API). It panics with the
// returned status (converted to Result) unless that status equals SUCCESS.
func FreeHost(ptr unsafe.Pointer) {
	if res := Result(C.cudaFreeHost(ptr)); res != SUCCESS {
		panic(res)
	}
}
// MemCpy copies bytes bytes from src to dst with cudaMemcpy (CUDA runtime
// API), in the direction specified by flags (see HtoH, HtoD, DtoH, DtoD and
// Virt). It panics with the returned status (converted to Result) unless
// that status equals SUCCESS.
func MemCpy(dst, src unsafe.Pointer, bytes int64, flags uint) {
	kind := uint32(flags)
	res := Result(C.cudaMemcpy(dst, src, C.size_t(bytes), kind))
	if res != SUCCESS {
		panic(res)
	}
}
// Flags for memory copy types, for use as the flags argument of MemCpy.
// Each constant takes its value from the cudaMemcpy* definition named on its
// right-hand side.
const (
// Host to Host
HtoH = C.cudaMemcpyHostToHost
// Host to Device
HtoD = C.cudaMemcpyHostToDevice
// Device to Host
DtoH = C.cudaMemcpyDeviceToHost
// Device to Device
DtoD = C.cudaMemcpyDeviceToDevice
// Default, unified virtual address space
Virt = C.cudaMemcpyDefault
)
================================================
FILE: cu/stream.go
================================================
package cu
// This file implements CUDA streams
//#include <cuda.h>
import "C"
import "unsafe"
// Stream is a CUDA stream handle (CUstream), held as a uintptr.
type Stream uintptr
// StreamCreate creates an asynchronous stream with cuStreamCreate and
// returns its handle. It panics with the Result of the call unless it is
// SUCCESS.
func StreamCreate() Stream {
	var handle C.CUstream
	// The flags argument has to be zero.
	if res := Result(C.cuStreamCreate(&handle, C.uint(0))); res != SUCCESS {
		panic(res)
	}
	return Stream(uintptr(unsafe.Pointer(handle)))
}
// Destroy destroys the asynchronous stream with cuStreamDestroy and then
// overwrites *stream with 0. The handle is zeroed before the status is
// checked, so it is cleared even when the call fails and Destroy panics with
// the non-SUCCESS Result.
func (stream *Stream) Destroy() {
	handle := C.CUstream(unsafe.Pointer(uintptr(*stream)))
	res := Result(C.cuStreamDestroy(handle))
	*stream = 0
	if res != SUCCESS {
		panic(res)
	}
}
// StreamDestroy destroys an asynchronous stream. It is the function form of
// (*Stream).Destroy and simply forwards to it.
func StreamDestroy(stream *Stream) {
stream.Destroy()
}
// Synchronize blocks until the stream has completed, by calling
// cuStreamSynchronize. It panics with the Result of the call unless it is
// SUCCESS.
func (stream Stream) Synchronize() {
	handle := C.CUstream(unsafe.Pointer(uintptr(stream)))
	if res := Result(C.cuStreamSynchronize(handle)); res != SUCCESS {
		panic(res)
	}
}
// Query returns SUCCESS if all operations in the stream have completed and
// ERROR_NOT_READY otherwise. The status of cuStreamQuery is returned as is;
// unlike most functions in this package, Query does not panic on it.
func (stream Stream) Query() Result {
	handle := C.CUstream(unsafe.Pointer(uintptr(stream)))
	status := C.cuStreamQuery(handle)
	return Result(status)
}
// StreamQuery returns SUCCESS if all operations in the stream have completed
// and ERROR_NOT_READY otherwise. It is the function form of Stream.Query and
// simply forwards to it.
func StreamQuery(stream Stream) Result {
return stream.Query()
}
// StreamSynchronize blocks until the stream has completed. It is the
// function form of Stream.Synchronize and simply forwards to it.
func StreamSynchronize(stream Stream) {
stream.Synchronize()
}
================================================
FILE: cu/testdata/testmodule.cu
================================================
/*
* Module to test CUDA module loading and execution.
* To be compiled with:
* nvcc -ptx testmodule.cu
*/
#ifdef __cplusplus
extern "C" {
#endif
// Linear index of the calling thread: blocks are numbered row by row over
// the x/y grid dimensions, and threads by threadIdx.x within a block.
#define threadindex ( ( blockIdx.y*gridDim.x + blockIdx.x ) * blockDim.x + threadIdx.x )

/// Sets the first N elements of array to value.
/// Each thread writes at most one element: the one at its own threadindex.
__global__ void testMemset(float* array, float value, int N){
	const int idx = threadindex;
	// Threads whose index falls outside the first N elements do nothing.
	if(idx >= N){
		return;
	}
	array[idx] = value;
}
#ifdef __cplusplus
}
#endif
================================================
FILE: cu/testdata/testmodule.ptx
================================================
.version 1.4
.target sm_10, map_f64_to_f32
// compiled with /usr/local/cuda/open64/lib//be
// nvopencc 4.0 built on 2011-02-18
//-----------------------------------------------------------
// Compiling /tmp/tmpxft_00000e56_00000000-9_testmodule.cpp3.i (/tmp/ccBI#.rDLD4T)
//-----------------------------------------------------------
//-----------------------------------------------------------
// Options:
//-----------------------------------------------------------
// Target:ptx, ISA:sm_10, Endian:little, Pointer Size:64
// -O3 (Optimization level)
// -g0 (Debug level)
// -m2 (Report advisories)
//-----------------------------------------------------------
.file 1 "<command-line>"
.file 2 "/tmp/tmpxft_00000e56_00000000-8_testmodule.cudafe2.gpu"
.file 3 "/usr/lib/gcc/x86_64-linux-gnu/4.4.3/include/stddef.h"
.file 4 "/usr/local/cuda/bin/../include/crt/device_runtime.h"
.file 5 "/usr/local/cuda/bin/../include/host_defines.h"
.file 6 "/usr/local/cuda/bin/../include/builtin_types.h"
.file 7 "/usr/local/cuda/bin/../include/device_types.h"
.file 8 "/usr/local/cuda/bin/../include/driver_types.h"
.file 9 "/usr/local/cuda/bin/../include/surface_types.h"
.file 10 "/usr/local/cuda/bin/../include/texture_types.h"
.file 11 "/usr/local/cuda/bin/../include/vector_types.h"
.file 12 "/usr/local/cuda/bin/../include/device_launch_parameters.h"
.file 13 "/usr/local/cuda/bin/../include/crt/storage_class.h"
.file 14 "/usr/include/bits/types.h"
.file 15 "/usr/include/time.h"
.file 16 "testmodule.cu"
.file 17 "/usr/local/cuda/bin/../include/common_functions.h"
.file 18 "/usr/local/cuda/bin/../include/math_functions.h"
.file 19 "/usr/local/cuda/bin/../include/math_constants.h"
.file 20 "/usr/local/cuda/bin/../include/device_functions.h"
.file 21 "/usr/local/cuda/bin/../include/sm_11_atomic_functions.h"
.file 22 "/usr/local/cuda/bin/../include/sm_12_atomic_functions.h"
.file 23 "/usr/local/cuda/bin/../include/sm_13_double_functions.h"
.file 24 "/usr/local/cuda/bin/../include/sm_20_atomic_functions.h"
.file 25 "/usr/local/cuda/bin/../include/sm_20_intrinsics.h"
.file 26 "/usr/local/cuda/bin/../include/surface_functions.h"
.file 27 "/usr/local/cuda/bin/../include/texture_fetch_functions.h"
.file 28 "/usr/local/cuda/bin/../include/math_functions_dbl_ptx1.h"
// Entry point testMemset(array, value, N): the thread whose linear index is
// below N stores value into the 32-bit float slot array[index].
.entry testMemset (
.param .u64 __cudaparm_testMemset_array,
.param .f32 __cudaparm_testMemset_value,
.param .s32 __cudaparm_testMemset_N)
{
.reg .u16 %rh<4>;
.reg .u32 %r<10>;
.reg .u64 %rd<6>;
.reg .f32 %f<3>;
.reg .pred %p<3>;
.loc 16 7 0
$LDWbegin_testMemset:
// %r1 = nctaid.x * ctaid.y
mov.u16 %rh1, %nctaid.x;
mov.u16 %rh2, %ctaid.y;
mul.wide.u16 %r1, %rh1, %rh2;
// %r3 = ctaid.x + %r1 (linear block number)
cvt.u32.u16 %r2, %ctaid.x;
add.u32 %r3, %r2, %r1;
// %r7 = ntid.x * %r3 + tid.x (linear thread index)
cvt.u32.u16 %r4, %ntid.x;
mul.lo.u32 %r5, %r4, %r3;
cvt.u32.u16 %r6, %tid.x;
add.u32 %r7, %r6, %r5;
// Skip the store when N <= index.
ld.param.s32 %r8, [__cudaparm_testMemset_N];
setp.le.s32 %p1, %r8, %r7;
@%p1 bra $Lt_0_1026;
.loc 16 10 0
ld.param.f32 %f1, [__cudaparm_testMemset_value];
ld.param.u64 %rd1, [__cudaparm_testMemset_array];
// %rd2 is not read again in this entry.
cvt.s64.s32 %rd2, %r7;
// Byte offset = index * 4; store value at array + offset.
mul.wide.s32 %rd3, %r7, 4;
add.u64 %rd4, %rd1, %rd3;
st.global.f32 [%rd4+0], %f1;
$Lt_0_1026:
.loc 16 12 0
exit;
$LDWend_testMemset:
} // testMemset
================================================
FILE: cu/version.go
================================================
package cu
// This file implements CUDA driver version management
//#include <cuda.h>
import "C"
// Version returns the CUDA driver version as reported by
// cuDriverGetVersion. It panics with the Result of the call unless it is
// SUCCESS.
func Version() int {
	var v C.int
	if res := Result(C.cuDriverGetVersion(&v)); res != SUCCESS {
		panic(res)
	}
	return int(v)
}
================================================
FILE: cu/version_test.go
================================================
package cu
import (
"fmt"
"testing"
)
// TestVersion prints the driver version returned by Version. It makes no
// assertion on the value; the test only fails if Version panics.
func TestVersion(t *testing.T) {
	driverVersion := Version()
	fmt.Println("CUDA driver version: ", driverVersion)
}
================================================
FILE: cuda/Makefile
================================================
# Build, vet, format, test, benchmark and document the package in this
# directory (cuda) with both the gc ("6g") and gccgo toolchains.

# None of the targets produce a file of the same name.
.PHONY: all 6g gccgo test 6gtest gccgotest bench 6gbench gccgobench clean doc

all: 6g gccgo doc

# Install with the gc toolchain, then vet and format the sources in place.
6g:
	go install -v
	go tool vet *.go
	gofmt -w *.go

# Compiler selection and flags passed to the go tool for gccgo builds.
GCCGO=gccgo -gccgoflags '-static-libgcc -O3'

gccgo:
	go build -v -compiler $(GCCGO)

test: 6gtest gccgotest

6gtest:
	go test

gccgotest:
	go test -compiler $(GCCGO)

bench: 6gbench gccgobench

6gbench:
	go test -bench=.

gccgobench:
	go test -bench=. -compiler $(GCCGO)

clean:
	go clean

# Regenerate README from the godoc of this package. The import path must be
# this directory's package (cuda), not the sibling cu package; otherwise
# README ends up describing package cu.
doc:
	godoc github.com/barnex/cuda5/cuda > README
================================================
FILE: cuda/README
================================================
PACKAGE
package cu
import "github.com/barnex/cuda5/cu"
Go bindings for the CUDA driver API.
CONSTANTS
const (
// If the number of contexts > number of CPUs, yield to other OS threads when waiting for the GPU, otherwise CUDA spin on the processor.
CTX_SCHED_AUTO = C.CU_CTX_SCHED_AUTO
// Spin when waiting for results from the GPU.
CTX_SCHED_SPIN = C.CU_CTX_SCHED_SPIN
// Yield its thread when waiting for results from the GPU.
CTX_SCHED_YIELD = C.CU_CTX_SCHED_YIELD
// Block the CPU thread on a synchronization primitive when waiting for the GPU to finish work.
CTX_BLOCKING_SYNC
// Support mapped pinned allocations. This flag must be set in order to allocate pinned host memory that is accessible to the GPU.
CTX_MAP_HOST = C.CU_CTX_MAP_HOST
//Do not reduce local memory after resizing local memory for a kernel.
CTX_LMEM_RESIZE_TO_MAX = C.CU_CTX_LMEM_RESIZE_TO_MAX
)
Flags for CtxCreate
const (
SIZEOF_FLOAT32 = 4
SIZEOF_FLOAT64 = 8
SIZEOF_COMPLEX64 = 8
SIZEOF_COMPLEX128 = 16
)
Type size in bytes
FUNCTIONS
func CtxDestroy(ctx *Context)
Destroys the CUDA context specified by ctx. If the context usage count
is not equal to 1, or the context is current to any CPU thread other
than the current one, this function fails. Floating contexts (detached
from a CPU thread via cuCtxPopCurrent()) may be destroyed by this
function.
func CtxDisablePeerAccess(peer Context)
Reverses CtxEnablePeerAccess().
func CtxEnablePeerAccess(peer Context)
Make allocations from the peer Context available to the current context.
func CtxGetApiVersion(ctx Context) (version int)
Returns the API version to create the context.
func CtxSetCurrent(ctx Context)
Sets the current active context.
func CtxSynchronize()
Blocks until the device has completed all preceding requested tasks, if
the context was created with the CU_CTX_SCHED_BLOCKING_SYNC flag.
func DeviceCanAccessPeer(dev, peer Device) bool
Returns true if CtxEnablePeerAccess can be called on a context for dev
and peerDev.
func DeviceComputeCapability(device Device) (major, minor int)
Returns the compute capability of the device.
func DeviceGetAttribute(attrib DeviceAttribute, dev Device) int
Gets the value of a device attribute.
func DeviceGetCount() int
Returns the number of devices with compute capability greater than or
equal to 1.0 that are available for execution.
func DeviceGetName(dev Device) string
Gets the name of the device.
func DeviceTotalMem(device Device) int64
Returns the total amount of memory available on the device in bytes.
func FuncGetAttribute(attrib FunctionAttribute, function Function) int
func Init(flags int)
Initialize the CUDA driver API. Currently, flags must be 0. If Init()
has not been called, any function from the driver API will panic with
ERROR_NOT_INITIALIZED.
func LaunchKernel(f Function, gridDimX, gridDimY, gridDimZ int, blockDimX, blockDimY, blockDimZ int, sharedMemBytes int, stream Stream, kernelParams []unsafe.Pointer)
func MemAllocHost(bytes int64) unsafe.Pointer
func MemFree(ptr *DevicePtr)
Frees device memory allocated by MemAlloc(). Overwrites the pointer with
NULL. It is safe to double-free.
func MemFreeHost(ptr unsafe.Pointer)
func MemGetAddressRange(ptr DevicePtr) (bytes int64, base DevicePtr)
Returns the base address and size of the allocation (by MemAlloc) that
contains the input pointer ptr.
func MemGetInfo() (free, total int64)
Returns the free and total amount of memory in the current Context (in
bytes).
func MemHostRegister(ptr unsafe.Pointer, bytes int64, flags MemHostRegisterFlag)
Page-locks memory specified by the pointer and bytes. The pointer and
byte size must be aligned to the host page size (4KB) See also:
MemHostUnregister()
func MemHostUnregister(ptr unsafe.Pointer)
Unmaps memory locked by MemHostRegister().
func Memcpy(dst, src DevicePtr, bytes int64)
Copies a number of bytes on the current device. Requires unified
addressing to be supported. See also: MemcpyDtoD(). TODO(a): is actually
an auto copy for device and/or host memory
func MemcpyAsync(dst, src DevicePtr, bytes int64, stream Stream)
Asynchronously copies a number of bytes on the current device.
func MemcpyDtoD(dst, src DevicePtr, bytes int64)
Copies a number of bytes from device to device.
func MemcpyDtoDAsync(dst, src DevicePtr, bytes int64, stream Stream)
Asynchronously copies a number of bytes from device to device.
func MemcpyDtoH(dst unsafe.Pointer, src DevicePtr, bytes int64)
Copies a number of bytes from device to host.
func MemcpyDtoHAsync(dst unsafe.Pointer, src DevicePtr, bytes int64, stream Stream)
Asynchronously copies a number of bytes from device to host. The host
memory must be page-locked (see MemRegister)
func MemcpyHtoD(dst DevicePtr, src unsafe.Pointer, bytes int64)
Copies a number of bytes from host to device.
func MemcpyHtoDAsync(dst DevicePtr, src unsafe.Pointer, bytes int64, stream Stream)
Asynchronously copies a number of bytes from host to device. The host
memory must be page-locked (see MemRegister)
func MemcpyPeer(dst DevicePtr, dstCtx Context, src DevicePtr, srcCtx Context, bytes int64)
Copies from device memory in one context (device) to another.
func MemcpyPeerAsync(dst DevicePtr, dstCtx Context, src DevicePtr, srcCtx Context, bytes int64, stream Stream)
Asynchronously copies from device memory in one context (device) to
another.
func MemsetD32(deviceptr DevicePtr, value uint32, N int64)
Sets the first N 32-bit values of dst array to value. Asynchronous.
func MemsetD32Async(deviceptr DevicePtr, value uint32, N int64, stream Stream)
Asynchronously sets the first N 32-bit values of dst array to value.
func MemsetD8(deviceptr DevicePtr, value uint8, N int64)
Sets the first N 8-bit values of dst array to value. Asynchronous.
func MemsetD8Async(deviceptr DevicePtr, value uint8, N int64, stream Stream)
Asynchronously sets the first N 8-bit values of dst array to value.
func StreamDestroy(stream *Stream)
Destroys an asynchronous stream
func StreamSynchronize(stream Stream)
Blocks until the stream has completed.
func Version() int
Returns the CUDA driver version.
TYPES
type Context uintptr
CUDA context.
func CtxCreate(flags uint, dev Device) Context
Create a CUDA context.
func CtxGetCurrent() Context
Gets the current active context.
func (ctx Context) ApiVersion() (version int)
Returns the API version to create the context.
func (ctx *Context) Destroy()
Destroys the CUDA context.
func (peer Context) DisablePeerAccess()
Reverses EnablePeerAccess().
func (peer Context) EnablePeerAccess()
Make allocations from the peer Context available to the current context.
func (ctx Context) SetCurrent()
Sets the current active context.
type DevProp struct {
MaxThreadsPerBlock int
MaxThreadsDim [3]int
MaxGridSize [3]int
SharedMemPerBlock int
TotalConstantMemory int
SIMDWidth int
MemPitch int
RegsPerBlock int
ClockRate int
TextureAlign int
}
Device properties
func DeviceGetProperties(dev Device) (prop DevProp)
Returns the device's properties.
type Device int
CUDA Device number.
func CtxGetDevice() Device
Returns the ordinal of the current context's device.
func DeviceGet(ordinal int) Device
Returns in a device handle given an ordinal in the range [0,
DeviceGetCount()-1].
func (dev Device) Attribute(attrib DeviceAttribute) int
Gets the value of a device attribute.
func (dev Device) CanAccessPeer(peer Device) bool
Returns true if CtxEnablePeerAccess can be called on a context for dev
and peerDev.
func (device Device) ComputeCapability() (major, minor int)
Returns the compute capability of the device.
func (dev Device) Name() string
Gets the name of the device.
func (dev Device) Properties() DevProp
Returns the device's properties.
func (device Device) TotalMem() int64
Returns the total amount of memory available on the device in bytes.
type DeviceAttribute int
const (
MAX_THREADS_PER_BLOCK DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK // Maximum number of threads per block
MAX_BLOCK_DIM_X DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X // Maximum block dimension X
MAX_BLOCK_DIM_Y DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y // Maximum block dimension Y
MAX_BLOCK_DIM_Z DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z // Maximum block dimension Z
MAX_GRID_DIM_X DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X // Maximum grid dimension X
MAX_GRID_DIM_Y DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y // Maximum grid dimension Y
MAX_GRID_DIM_Z DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z // Maximum grid dimension Z
MAX_SHARED_MEMORY_PER_BLOCK DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK // Maximum shared memory available per block in bytes
TOTAL_CONSTANT_MEMORY DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY // Memory available on device for __constant__ variables in a CUDA C kernel in bytes
WARP_SIZE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_WARP_SIZE // Warp size in threads
MAX_PITCH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_PITCH // Maximum pitch in bytes allowed by memory copies
MAX_REGISTERS_PER_BLOCK DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK // Maximum number of 32-bit registers available per block
CLOCK_RATE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CLOCK_RATE // Peak clock frequency in kilohertz
TEXTURE_ALIGNMENT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT // Alignment requirement for textures
MULTIPROCESSOR_COUNT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT // Number of multiprocessors on device
KERNEL_EXEC_TIMEOUT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT // Specifies whether there is a run time limit on kernels
INTEGRATED DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_INTEGRATED // Device is integrated with host memory
CAN_MAP_HOST_MEMORY DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY // Device can map host memory into CUDA address space
COMPUTE_MODE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_COMPUTE_MODE // Compute mode (See ::CUcomputemode for details)
MAXIMUM_TEXTURE1D_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH // Maximum 1D texture width
MAXIMUM_TEXTURE2D_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH // Maximum 2D texture width
MAXIMUM_TEXTURE2D_HEIGHT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT // Maximum 2D texture height
MAXIMUM_TEXTURE3D_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH // Maximum 3D texture width
MAXIMUM_TEXTURE3D_HEIGHT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT // Maximum 3D texture height
MAXIMUM_TEXTURE3D_DEPTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH // Maximum 3D texture depth
MAXIMUM_TEXTURE2D_LAYERED_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH // Maximum 2D layered texture width
MAXIMUM_TEXTURE2D_LAYERED_HEIGHT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT // Maximum 2D layered texture height
MAXIMUM_TEXTURE2D_LAYERED_LAYERS DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS // Maximum layers in a 2D layered texture
SURFACE_ALIGNMENT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT // Alignment requirement for surfaces
CONCURRENT_KERNELS DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS // Device can possibly execute multiple kernels concurrently
ECC_ENABLED DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_ECC_ENABLED // Device has ECC support enabled
PCI_BUS_ID DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_PCI_BUS_ID // PCI bus ID of the device
PCI_DEVICE_ID DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID // PCI device ID of the device
TCC_DRIVER DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TCC_DRIVER // Device is using TCC driver model
MEMORY_CLOCK_RATE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE // Peak memory clock frequency in kilohertz
GLOBAL_MEMORY_BUS_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH // Global memory bus width in bits
L2_CACHE_SIZE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE // Size of L2 cache in bytes
MAX_THREADS_PER_MULTIPROCESSOR DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR // Maximum resident threads per multiprocessor
ASYNC_ENGINE_COUNT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT // Number of asynchronous engines
UNIFIED_ADDRESSING DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING // Device shares a unified address space with the host
MAXIMUM_TEXTURE1D_LAYERED_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH // Maximum 1D layered texture width
MAXIMUM_TEXTURE1D_LAYERED_LAYERS DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS // Maximum layers in a 1D layered texture
)
type DevicePtr uintptr
func MemAlloc(bytes int64) DevicePtr
Allocates a number of bytes of device memory.
func (ptr DevicePtr) Bytes() (bytes int64)
Returns the size of the allocation (by MemAlloc) that contains the input
pointer ptr.
func (ptr *DevicePtr) Free()
Frees device memory allocated by MemAlloc(). Overwrites the pointer with
NULL. It is safe to double-free.
func (ptr DevicePtr) GetAddressRange() (bytes int64, base DevicePtr)
Returns the base address and size of the allocation (by MemAlloc) that
contains the input pointer ptr.
func (ptr DevicePtr) MemoryType() MemoryType
Returns the physical memory type that ptr addresses.
func (p DevicePtr) String() string
type Dim3 struct {
X, Y, Z int
}
type Function uintptr
Represents a CUDA CUfunction, a reference to a function within a module.
func ModuleGetFunction(module Module, name string) Function
Returns a Function handle.
func (f Function) GetAttribute(attrib FunctionAttribute) int
type FunctionAttribute int
const (
FUNC_A_MAX_THREADS_PER_BLOCK FunctionAttribute = C.CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK // The maximum number of threads per block, beyond which a launch of the function would fail.
FUNC_A_SHARED_SIZE_BYTES FunctionAttribute = C.CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES // The size in bytes of statically-allocated shared memory required by this function.
FUNC_A_CONST_SIZE_BYTES FunctionAttribute = C.CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES // The size in bytes of user-allocated constant memory required by this function.
FUNC_A_LOCAL_SIZE_BYTES FunctionAttribute = C.CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES // The size in bytes of local memory used by each thread of this function.
FUNC_A_NUM_REGS FunctionAttribute = C.CU_FUNC_ATTRIBUTE_NUM_REGS // The number of registers used by each thread of this function.
FUNC_A_PTX_VERSION FunctionAttribute = C.CU_FUNC_ATTRIBUTE_PTX_VERSION // The PTX virtual architecture version for which the function was compiled.
FUNC_A_BINARY_VERSION FunctionAttribute = C.CU_FUNC_ATTRIBUTE_BINARY_VERSION // The binary architecture version for which the function was compiled.
)
type MemHostRegisterFlag int
const (
// Memory is pinned in all CUDA contexts.
MEMHOSTREGISTER_PORTABLE MemHostRegisterFlag = C.CU_MEMHOSTREGISTER_PORTABLE
// Maps the allocation in CUDA address space. TODO(a): cuMemHostGetDevicePointer()
MEMHOSTREGISTER_DEVICEMAP MemHostRegisterFlag = C.CU_MEMHOSTREGISTER_DEVICEMAP
)
Flag for MemHostRegister
type MemoryType uint
Physical memory type of device pointer.
const (
MemoryTypeHost MemoryType = C.CU_MEMORYTYPE_HOST
MemoryTypeDevice MemoryType = C.CU_MEMORYTYPE_DEVICE
MemoryTypeArray MemoryType = C.CU_MEMORYTYPE_ARRAY
MemoryTypeUnified MemoryType = C.CU_MEMORYTYPE_UNIFIED
)
func PointerGetAttributeMemoryType(ptr DevicePtr) (t MemoryType, err Result)
Returns the physical memory type that ptr addresses.
func (t MemoryType) String() string
type Module uintptr
Represents a CUDA CUmodule, a reference to executable device code.
func ModuleLoad(fname string) Module
Loads a compute module from file
func ModuleLoadData(image string) Module
Loads a compute module from string
func (m Module) GetFunction(name string) Function
Returns a Function handle.
type Result int
CUDA error status. CUDA error statuses are not returned by functions but
checked and passed to panic() when not successful. If desired, they can
be caught by recover().
const (
SUCCESS Result = C.CUDA_SUCCESS
ERROR_INVALID_VALUE Result = C.CUDA_ERROR_INVALID_VALUE
ERROR_OUT_OF_MEMORY Result = C.CUDA_ERROR_OUT_OF_MEMORY
ERROR_NOT_INITIALIZED Result = C.CUDA_ERROR_NOT_INITIALIZED
ERROR_DEINITIALIZED Result = C.CUDA_ERROR_DEINITIALIZED
ERROR_PROFILER_DISABLED Result = C.CUDA_ERROR_PROFILER_DISABLED
ERROR_PROFILER_NOT_INITIALIZED Result = C.CUDA_ERROR_PROFILER_NOT_INITIALIZED
ERROR_PROFILER_ALREADY_STARTED Result = C.CUDA_ERROR_PROFILER_ALREADY_STARTED
ERROR_PROFILER_ALREADY_STOPPED Result = C.CUDA_ERROR_PROFILER_ALREADY_STOPPED
ERROR_NO_DEVICE Result = C.CUDA_ERROR_NO_DEVICE
ERROR_INVALID_DEVICE Result = C.CUDA_ERROR_INVALID_DEVICE
ERROR_INVALID_IMAGE Result = C.CUDA_ERROR_INVALID_IMAGE
ERROR_INVALID_CONTEXT Result = C.CUDA_ERROR_INVALID_CONTEXT
ERROR_CONTEXT_ALREADY_CURRENT Result = C.CUDA_ERROR_CONTEXT_ALREADY_CURRENT
ERROR_MAP_FAILED Result = C.CUDA_ERROR_MAP_FAILED
ERROR_UNMAP_FAILED Result = C.CUDA_ERROR_UNMAP_FAILED
ERROR_ARRAY_IS_MAPPED Result = C.CUDA_ERROR_ARRAY_IS_MAPPED
ERROR_ALREADY_MAPPED Result = C.CUDA_ERROR_ALREADY_MAPPED
ERROR_NO_BINARY_FOR_GPU Result = C.CUDA_ERROR_NO_BINARY_FOR_GPU
ERROR_ALREADY_ACQUIRED Result = C.CUDA_ERROR_ALREADY_ACQUIRED
ERROR_NOT_MAPPED Result = C.CUDA_ERROR_NOT_MAPPED
ERROR_NOT_MAPPED_AS_ARRAY Result = C.CUDA_ERROR_NOT_MAPPED_AS_ARRAY
ERROR_NOT_MAPPED_AS_POINTER Result = C.CUDA_ERROR_NOT_MAPPED_AS_POINTER
ERROR_ECC_UNCORRECTABLE Result = C.CUDA_ERROR_ECC_UNCORRECTABLE
ERROR_UNSUPPORTED_LIMIT Result = C.CUDA_ERROR_UNSUPPORTED_LIMIT
ERROR_CONTEXT_ALREADY_IN_USE Result = C.CUDA_ERROR_CONTEXT_ALREADY_IN_USE
ERROR_INVALID_SOURCE Result = C.CUDA_ERROR_INVALID_SOURCE
ERROR_FILE_NOT_FOUND Result = C.CUDA_ERROR_FILE_NOT_FOUND
ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND Result = C.CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND
ERROR_SHARED_OBJECT_INIT_FAILED Result = C.CUDA_ERROR_SHARED_OBJECT_INIT_FAILED
ERROR_OPERATING_SYSTEM Result = C.CUDA_ERROR_OPERATING_SYSTEM
ERROR_INVALID_HANDLE Result = C.CUDA_ERROR_INVALID_HANDLE
ERROR_NOT_FOUND Result = C.CUDA_ERROR_NOT_FOUND
ERROR_NOT_READY Result = C.CUDA_ERROR_NOT_READY
ERROR_LAUNCH_FAILED Result = C.CUDA_ERROR_LAUNCH_FAILED
ERROR_LAUNCH_OUT_OF_RESOURCES Result = C.CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES
ERROR_LAUNCH_TIMEOUT Result = C.CUDA_ERROR_LAUNCH_TIMEOUT
ERROR_LAUNCH_INCOMPATIBLE_TEXTURING Result = C.CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING
ERROR_PEER_ACCESS_ALREADY_ENABLED Result = C.CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED
ERROR_PEER_ACCESS_NOT_ENABLED Result = C.CUDA_ERROR_PEER_ACCESS_NOT_ENABLED
ERROR_PRIMARY_CONTEXT_ACTIVE Result = C.CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE
ERROR_CONTEXT_IS_DESTROYED Result = C.CUDA_ERROR_CONTEXT_IS_DESTROYED
ERROR_ASSERT Result = C.CUDA_ERROR_ASSERT
ERROR_TOO_MANY_PEERS Result = C.CUDA_ERROR_TOO_MANY_PEERS
ERROR_HOST_MEMORY_ALREADY_REGISTERED Result = C.CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED
ERROR_HOST_MEMORY_NOT_REGISTERED Result = C.CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED
ERROR_UNKNOWN Result = C.CUDA_ERROR_UNKNOWN
)
func StreamQuery(stream Stream) Result
Returns Success if all operations have completed, ErrorNotReady
otherwise
func (err Result) String() string
Message string for the error
type Stream uintptr
CUDA stream.
func StreamCreate() Stream
Creates an asynchronous stream
func (stream *Stream) Destroy()
Destroys the asynchronous stream
func (stream Stream) Query() Result
Returns Success if all operations have completed, ErrorNotReady
otherwise
func (stream Stream) Synchronize()
Blocks until the stream has completed.
================================================
FILE: cuda/cgoflags.go
================================================
package cuda
// This file provides CGO flags.
import "C"
//#cgo LDFLAGS:-lcudart
//
////default location:
//#cgo LDFLAGS:-L/usr/local/cuda/lib64 -L/usr/local/cuda/lib
//#cgo CFLAGS: -I/usr/local/cuda/include/
//
////default location if not properly symlinked:
//#cgo LDFLAGS:-L/usr/local/cuda-6.0/lib64 -L/usr/local/cuda-6.0/lib
//#cgo LDFLAGS:-L/usr/local/cuda-5.5/lib64 -L/usr/local/cuda-5.5/lib
//#cgo LDFLAGS:-L/usr/local/cuda-5.0/lib64 -L/usr/local/cuda-5.0/lib
//#cgo CFLAGS: -I/usr/local/cuda-6.0/include/
//#cgo CFLAGS: -I/usr/local/cuda-5.5/include/
//#cgo CFLAGS: -I/usr/local/cuda-5.0/include/
//
////arch linux:
//#cgo LDFLAGS:-L/opt/cuda/lib64 -L/opt/cuda/lib
//#cgo CFLAGS: -I/opt/cuda/include
//
////WINDOWS:
//#cgo windows LDFLAGS:-LC:/cuda/v5.0/lib/x64 -LC:/cuda/v5.5/lib/x64 -LC:/cuda/v6.0/lib/x64
//#cgo windows CFLAGS: -IC:/cuda/v5.0/include -IC:/cuda/v5.5/include -IC:/cuda/v6.0/include
import "C"
================================================
FILE: cuda/device.go
================================================
package cuda
//#include <cuda_runtime.h>
//#include <cuda.h>
import "C"
import (
"github.com/barnex/cuda5/cu"
)
// DeviceReset resets the current GPU device by calling cudaDeviceReset.
// It panics with the returned status, converted to a cu.Result, when that
// status is not cu.SUCCESS.
//
// NOTE(review): the status comes from the CUDA runtime API but is reported
// through the driver-API cu.Result type; confirm both use the same numbering
// before relying on the printed error name.
func DeviceReset() {
	if status := cu.Result(C.cudaDeviceReset()); status != cu.SUCCESS {
		panic(status)
	}
}
// DeviceSetCacheConfig sets the preference for more cache or more shared
// memory by passing cacheConfig to cudaDeviceSetCacheConfig.
// It panics with the returned status, converted to a cu.Result, when that
// status is not cu.SUCCESS.
func DeviceSetCacheConfig(cacheConfig FuncCache) {
	status := C.cudaDeviceSetCacheConfig(uint32(cacheConfig))
	if res := cu.Result(status); res != cu.SUCCESS {
		panic(res)
	}
}
// FuncCache is a cache preference option, as accepted by DeviceSetCacheConfig.
type FuncCache int
// Cache preference values, taken from the driver-API CU_FUNC_CACHE_* constants.
// NOTE(review): DeviceSetCacheConfig hands these to the runtime-API call
// cudaDeviceSetCacheConfig; confirm both enums use the same numbering.
const (
FUNC_CACHE_PREFER_NONE FuncCache = C.CU_FUNC_CACHE_PREFER_NONE
FUNC_CACHE_PREFER_SHARED FuncCache = C.CU_FUNC_CACHE_PREFER_SHARED
FUNC_CACHE_PREFER_L1 FuncCache = C.CU_FUNC_CACHE_PREFER_L1
FUNC_CACHE_PREFER_EQUAL FuncCache = C.CU_FUNC_CACHE_PREFER_EQUAL
)
================================================
FILE: cufft/Makefile
================================================
# Default target: build with the gc toolchain, build with gccgo, regenerate README.
all: 6g gccgo doc
# Install with the gc toolchain, then vet and gofmt the sources in place.
6g:
go install -v
go tool vet *.go
gofmt -w *.go
# Compiler selection and flags used by every gccgo-based target below.
GCCGO=gccgo -gccgoflags '-static-libgcc -O3'
# Build (without installing) using gccgo.
gccgo:
go build -v -compiler $(GCCGO)
# Run the tests with both compilers.
test: 6gtest gccgotest
6gtest:
go test
gccgotest:
go test -compiler $(GCCGO)
# Run the benchmarks with both compilers.
bench: 6gbench gccgobench
6gbench:
go test -bench=.
gccgobench:
go test -bench=. -compiler $(GCCGO)
# Remove build artifacts.
clean:
go clean
# Regenerate README from the package's godoc output.
doc:
godoc github.com/barnex/cuda5/cufft > README
================================================
FILE: cufft/README
================================================
PACKAGE DOCUMENTATION
package cufft
import "github.com/barnex/cuda5/cufft"
Go bindings for the CUDA CUFFT API.
CONSTANTS
const (
FORWARD = -1 // Forward FFT
INVERSE = 1 // Inverse FFT
)
TYPES
type CompatibilityMode int
CUFFT compatibility mode
const (
COMPATIBILITY_NATIVE CompatibilityMode = C.CUFFT_COMPATIBILITY_NATIVE
COMPATIBILITY_FFTW_PADDING CompatibilityMode = C.CUFFT_COMPATIBILITY_FFTW_PADDING
COMPATIBILITY_FFTW_ASYMMETRIC CompatibilityMode = C.CUFFT_COMPATIBILITY_FFTW_ASYMMETRIC
COMPATIBILITY_FFTW_ALL CompatibilityMode = C.CUFFT_COMPATIBILITY_FFTW_ALL
)
func (t CompatibilityMode) String() string
type Handle uintptr
FFT plan handle, reference type to a plan
func Plan1d(nx int, typ Type, batch int) Handle
1D FFT plan
func Plan2d(nx, ny int, typ Type) Handle
2D FFT plan
func Plan3d(nx, ny, nz int, typ Type) Handle
3D FFT plan
func PlanMany(n []int, inembed []int, istride int, oembed []int, ostride int, typ Type, batch int) Handle
1D,2D or 3D FFT plan
func (plan *Handle) Destroy()
Destroys the plan.
func (plan Handle) ExecC2C(idata, odata cu.DevicePtr, direction int)
Execute Complex-to-Complex plan
func (plan Handle) ExecC2R(idata, odata cu.DevicePtr)
Execute Complex-to-Real plan
func (plan Handle) ExecD2Z(idata, odata cu.DevicePtr)
Execute Double Real-to-Complex plan
func (plan Handle) ExecR2C(idata, odata cu.DevicePtr)
Execute Real-to-Complex plan
func (plan Handle) ExecZ2D(idata, odata cu.DevicePtr)
Execute Double Complex-to-Real plan
func (plan Handle) ExecZ2Z(idata, odata cu.DevicePtr, direction int)
Execute Double Complex-to-Complex plan
func (plan Handle) SetCompatibilityMode(mode CompatibilityMode)
Sets the FFTW compatibility mode
func (plan Handle) SetStream(stream cu.Stream)
Sets the cuda stream for this plan
type Result int
FFT result
const (
SUCCESS Result = C.CUFFT_SUCCESS
INVALID_PLAN Result = C.CUFFT_INVALID_PLAN
ALLOC_FAILED Result = C.CUFFT_ALLOC_FAILED
INVALID_TYPE Result = C.CUFFT_INVALID_TYPE
INVALID_VALUE Result = C.CUFFT_INVALID_VALUE
INTERNAL_ERROR Result = C.CUFFT_INTERNAL_ERROR
EXEC_FAILED Result = C.CUFFT_EXEC_FAILED
SETUP_FAILED Result = C.CUFFT_SETUP_FAILED
INVALID_SIZE Result = C.CUFFT_INVALID_SIZE
UNALIGNED_DATA Result = C.CUFFT_UNALIGNED_DATA
)
FFT result value
func (r Result) String() string
type Type int
FFT type
const (
R2C Type = C.CUFFT_R2C // Real to Complex (interleaved)
C2R Type = C.CUFFT_C2R // Complex (interleaved) to Real
C2C Type = C.CUFFT_C2C // Complex to Complex, interleaved
D2Z Type = C.CUFFT_D2Z // Double to Double-Complex
Z2D Type = C.CUFFT_Z2D // Double-Complex to Double
Z2Z Type = C.CUFFT_Z2Z // Double-Complex to Double-Complex
)
func (t Type) String() string
================================================
FILE: cufft/cgoflags.go
================================================
package cufft
// This file provides CGO flags to find CUDA libraries and headers.
//#cgo LDFLAGS:-lcufft
//
////default location:
//#cgo LDFLAGS:-L/usr/local/cuda/lib64 -L/usr/local/cuda/lib
//#cgo CFLAGS: -I/usr/local/cuda/include/
//
////default location if not properly symlinked:
//#cgo LDFLAGS:-L/usr/local/cuda-6.0/lib64 -L/usr/local/cuda-6.0/lib
//#cgo LDFLAGS:-L/usr/local/cuda-5.5/lib64 -L/usr/local/cuda-5.5/lib
//#cgo LDFLAGS:-L/usr/local/cuda-5.0/lib64 -L/usr/local/cuda-5.0/lib
//#cgo CFLAGS: -I/usr/local/cuda-6.0/include/
//#cgo CFLAGS: -I/usr/local/cuda-5.5/include/
//#cgo CFLAGS: -I/usr/local/cuda-5.0/include/
//
////arch linux:
//#cgo LDFLAGS:-L/opt/cuda/lib64 -L/opt/cuda/lib
//#cgo CFLAGS: -I/opt/cuda/include
//
////WINDOWS:
//#cgo windows LDFLAGS:-LC:/cuda/v5.0/lib/x64 -LC:/cuda/v5.5/lib/x64 -LC:/cuda/v6.0/lib/x64
//#cgo windows CFLAGS: -IC:/cuda/v5.0/include -IC:/cuda/v5.5/include -IC:/cuda/v6.0/include -w
import "C"
================================================
FILE: cufft/doc.go
================================================
// Go bindings for the CUDA CUFFT API.
package cufft
================================================
FILE: cufft/fft_test.go
================================================
package cufft
import (
"fmt"
"github.com/barnex/cuda5/cu"
"unsafe"
)
// ExampleFFT1D transforms an 8-point real unit impulse with a 1D
// real-to-complex plan and prints the host input next to the host copy of
// the transform output.
func ExampleFFT1D() {
	const size = 8

	// Host input: a single 1 followed by zeros.
	hostIn := make([]float32, size)
	hostIn[0] = 1

	// Upload the input into a freshly allocated device buffer.
	gpuIn := cu.MemAlloc(int64(len(hostIn)) * cu.SIZEOF_FLOAT32)
	defer cu.MemFree(&gpuIn)
	cu.MemcpyHtoD(gpuIn, unsafe.Pointer(&hostIn[0]), gpuIn.Bytes())

	// The output buffers hold size/2+1 complex values.
	hostOut := make([]complex64, size/2+1)
	gpuOut := cu.MemAlloc(int64(len(hostOut)) * cu.SIZEOF_COMPLEX64)
	defer cu.MemFree(&gpuOut)

	// Plan and run a single (batch of 1) real-to-complex transform.
	fft := Plan1d(size, R2C, 1)
	defer fft.Destroy()
	fft.ExecR2C(gpuIn, gpuOut)

	// Download the result and show both sides.
	cu.MemcpyDtoH(unsafe.Pointer(&hostOut[0]), gpuOut, gpuOut.Bytes())
	fmt.Println("hostIn:", hostIn)
	fmt.Println("hostOut:", hostOut)

	// Output:
	// hostIn: [1 0 0 0 0 0 0 0]
	// hostOut: [(1+0i) (+1+0i) (+1+0i) (+1-0i) (+1+0i)]
}
================================================
FILE: cufft/init_test.go
================================================
package cufft
import (
"fmt"
"github.com/barnex/cuda5/cu"
)
// init prepares the CUDA state needed by all other tests in this package:
// it initializes the driver, creates a context on device 0 with automatic
// scheduling and makes that context current.
func init() {
	cu.Init(0)
	cu.CtxSetCurrent(cu.CtxCreate(cu.CTX_SCHED_AUTO, 0))
	fmt.Println("Created CUDA context")
}
================================================
FILE: cufft/mode.go
================================================
package cufft
//#include <cufft.h>
import "C"
import (
"fmt"
)
// CompatibilityMode is a CUFFT compatibility mode, as accepted by
// Handle.SetCompatibilityMode.
type CompatibilityMode int
// Compatibility modes, mirroring the CUFFT_COMPATIBILITY_* constants of cufft.h.
const (
COMPATIBILITY_NATIVE CompatibilityMode = C.CUFFT_COMPATIBILITY_NATIVE
COMPATIBILITY_FFTW_PADDING CompatibilityMode = C.CUFFT_COMPATIBILITY_FFTW_PADDING
COMPATIBILITY_FFTW_ASYMMETRIC CompatibilityMode = C.CUFFT_COMPATIBILITY_FFTW_ASYMMETRIC
COMPATIBILITY_FFTW_ALL CompatibilityMode = C.CUFFT_COMPATIBILITY_FFTW_ALL
)
// String returns the name of the CUFFT constant for t as listed in
// compatibilityModeString. For a value not listed there it returns a message
// containing the numeric value.
func (t CompatibilityMode) String() string {
	str, ok := compatibilityModeString[t]
	if !ok {
		// Sprintf with an explicit space: fmt.Sprint adds no separator
		// between a string operand and a number.
		return fmt.Sprintf("CUFFT Compatibility mode with unknown number: %d", int(t))
	}
	return str
}
// compatibilityModeString maps each known CompatibilityMode to the name of
// its CUFFT constant; CompatibilityMode.String looks values up here.
var compatibilityModeString = map[CompatibilityMode]string{
	COMPATIBILITY_NATIVE:          "CUFFT_COMPATIBILITY_NATIVE",
	COMPATIBILITY_FFTW_PADDING:    "CUFFT_COMPATIBILITY_FFTW_PADDING",
	COMPATIBILITY_FFTW_ASYMMETRIC: "CUFFT_COMPATIBILITY_FFTW_ASYMMETRIC",
	COMPATIBILITY_FFTW_ALL:        "CUFFT_COMPATIBILITY_FFTW_ALL",
}
================================================
FILE: cufft/plan.go
================================================
// Copyright 2011 Arne Vansteenkiste (barnex@gmail.com). All rights reserved.
// Use of this source code is governed by a freeBSD
// license that can be found in the LICENSE.txt file.
package cufft
//#include <cufft.h>
import "C"
import (
"github.com/barnex/cuda5/cu"
"unsafe"
)
// Handle is an FFT plan handle: a reference to a CUFFT plan (a cufftHandle
// value) created by Plan1d, Plan2d, Plan3d or PlanMany.
type Handle uintptr
// Plan1d creates a 1D FFT plan by calling cufftPlan1d with size nx,
// transform type typ and the given batch count.
// It panics with the CUFFT Result when the call does not return SUCCESS.
func Plan1d(nx int, typ Type, batch int) Handle {
	var plan C.cufftHandle
	status := C.cufftPlan1d(&plan, C.int(nx), C.cufftType(typ), C.int(batch))
	if res := Result(status); res != SUCCESS {
		panic(res)
	}
	return Handle(plan)
}
// Plan2d creates a 2D FFT plan by calling cufftPlan2d with sizes nx, ny and
// transform type typ.
// It panics with the CUFFT Result when the call does not return SUCCESS.
func Plan2d(nx, ny int, typ Type) Handle {
	var plan C.cufftHandle
	status := C.cufftPlan2d(&plan, C.int(nx), C.int(ny), C.cufftType(typ))
	if res := Result(status); res != SUCCESS {
		panic(res)
	}
	return Handle(plan)
}
// Plan3d creates a 3D FFT plan by calling cufftPlan3d with sizes nx, ny, nz
// and transform type typ.
// It panics with the CUFFT Result when the call does not return SUCCESS.
func Plan3d(nx, ny, nz int, typ Type) Handle {
	var plan C.cufftHandle
	status := C.cufftPlan3d(&plan, C.int(nx), C.int(ny), C.int(nz), C.cufftType(typ))
	if res := Result(status); res != SUCCESS {
		panic(res)
	}
	return Handle(plan)
}
// PlanMany creates a 1D, 2D or 3D FFT plan with an explicit data layout by
// wrapping
//
//	cufftPlanMany(
//		cufftHandle *plan, int rank, int *n, int *inembed,
//		int istride, int idist, int *onembed, int ostride,
//		int odist, cufftType type, int batch);
//
// The rank passed to CUFFT is len(n). A nil or empty inembed (oembed) is
// passed as NULL with a distance of 0; otherwise idist (odist) is taken from
// inembed[0] (oembed[0]).
//
// The slices are copied into C.int arrays before the call: Go's int and C's
// int need not have the same size, so the memory of a []int cannot be handed
// to CUFFT as an int array.
//
// NOTE(review): CUFFT presumably reads rank entries from inembed and oembed;
// callers should pass len(n) entries — confirm against the CUFFT reference.
//
// PlanMany panics with INVALID_VALUE when n is empty, and with the CUFFT
// Result when cufftPlanMany does not return SUCCESS.
func PlanMany(n []int, inembed []int, istride int, oembed []int, ostride int, typ Type, batch int) Handle {
	if len(n) == 0 {
		// Without at least one dimension there is nothing to take &dims[0] of.
		panic(INVALID_VALUE)
	}
	dims := toCInts(n)

	// A nil *C.int is passed to C as NULL.
	var inembedPtr, oembedPtr *C.int
	idist, odist := 0, 0
	if len(inembed) > 0 {
		inembedPtr = &toCInts(inembed)[0]
		idist = inembed[0]
	}
	if len(oembed) > 0 {
		oembedPtr = &toCInts(oembed)[0]
		odist = oembed[0]
	}

	var handle C.cufftHandle
	err := Result(C.cufftPlanMany(
		&handle,
		C.int(len(dims)), // rank
		&dims[0],         // n
		inembedPtr,
		C.int(istride),
		C.int(idist),
		oembedPtr,
		C.int(ostride),
		C.int(odist),
		C.cufftType(typ),
		C.int(batch)))
	if err != SUCCESS {
		panic(err)
	}
	return Handle(handle)
}

// toCInts returns a copy of v with every element converted to C.int.
func toCInts(v []int) []C.int {
	out := make([]C.int, len(v))
	for i, x := range v {
		out[i] = C.int(x)
	}
	return out
}
// ExecC2C executes a Complex-to-Complex plan via cufftExecC2C, reading from
// idata and writing to odata. direction is passed through unchanged (see
// FORWARD and INVERSE).
// It panics with the CUFFT Result when the call does not return SUCCESS.
func (plan Handle) ExecC2C(idata, odata cu.DevicePtr, direction int) {
	in := (*C.cufftComplex)(unsafe.Pointer(uintptr(idata)))
	out := (*C.cufftComplex)(unsafe.Pointer(uintptr(odata)))
	if res := Result(C.cufftExecC2C(C.cufftHandle(plan), in, out, C.int(direction))); res != SUCCESS {
		panic(res)
	}
}
// ExecR2C executes a Real-to-Complex plan via cufftExecR2C, reading reals
// from idata and writing complex values to odata.
// It panics with the CUFFT Result when the call does not return SUCCESS.
func (plan Handle) ExecR2C(idata, odata cu.DevicePtr) {
	in := (*C.cufftReal)(unsafe.Pointer(uintptr(idata)))
	out := (*C.cufftComplex)(unsafe.Pointer(uintptr(odata)))
	if res := Result(C.cufftExecR2C(C.cufftHandle(plan), in, out)); res != SUCCESS {
		panic(res)
	}
}
// ExecC2R executes a Complex-to-Real plan via cufftExecC2R, reading complex
// values from idata and writing reals to odata.
// It panics with the CUFFT Result when the call does not return SUCCESS.
func (plan Handle) ExecC2R(idata, odata cu.DevicePtr) {
	in := (*C.cufftComplex)(unsafe.Pointer(uintptr(idata)))
	out := (*C.cufftReal)(unsafe.Pointer(uintptr(odata)))
	if res := Result(C.cufftExecC2R(C.cufftHandle(plan), in, out)); res != SUCCESS {
		panic(res)
	}
}
// ExecZ2Z executes a Double Complex-to-Complex plan via cufftExecZ2Z,
// reading from idata and writing to odata. direction is passed through
// unchanged (see FORWARD and INVERSE).
// It panics with the CUFFT Result when the call does not return SUCCESS.
func (plan Handle) ExecZ2Z(idata, odata cu.DevicePtr, direction int) {
	in := (*C.cufftDoubleComplex)(unsafe.Pointer(uintptr(idata)))
	out := (*C.cufftDoubleComplex)(unsafe.Pointer(uintptr(odata)))
	if res := Result(C.cufftExecZ2Z(C.cufftHandle(plan), in, out, C.int(direction))); res != SUCCESS {
		panic(res)
	}
}
// ExecD2Z executes a Double Real-to-Complex plan via cufftExecD2Z, reading
// double-precision reals from idata and writing double-precision complex
// values to odata.
// It panics with the CUFFT Result when the call does not return SUCCESS.
func (plan Handle) ExecD2Z(idata, odata cu.DevicePtr) {
	in := (*C.cufftDoubleReal)(unsafe.Pointer(uintptr(idata)))
	out := (*C.cufftDoubleComplex)(unsafe.Pointer(uintptr(odata)))
	if res := Result(C.cufftExecD2Z(C.cufftHandle(plan), in, out)); res != SUCCESS {
		panic(res)
	}
}
// ExecZ2D executes a Double Complex-to-Real plan via cufftExecZ2D, reading
// double-precision complex values from idata and writing double-precision
// reals to odata.
// It panics with the CUFFT Result when the call does not return SUCCESS.
func (plan Handle) ExecZ2D(idata, odata cu.DevicePtr) {
	in := (*C.cufftDoubleComplex)(unsafe.Pointer(uintptr(idata)))
	out := (*C.cufftDoubleReal)(unsafe.Pointer(uintptr(odata)))
	if res := Result(C.cufftExecZ2D(C.cufftHandle(plan), in, out)); res != SUCCESS {
		panic(res)
	}
}
// Destroy destroys the plan via cufftDestroy and then sets *plan to 0 so the
// handle is not used anymore.
// It panics with the CUFFT Result when cufftDestroy does not return SUCCESS;
// the handle has already been zeroed at that point.
func (plan *Handle) Destroy() {
	res := Result(C.cufftDestroy(C.cufftHandle(*plan)))
	// Clear the handle before reporting any failure.
	*plan = 0
	if res == SUCCESS {
		return
	}
	panic(res)
}
// SetStream sets the CUDA stream for this plan via cufftSetStream.
// It panics with the CUFFT Result when the call does not return SUCCESS.
func (plan Handle) SetStream(stream cu.Stream) {
	cStream := C.cudaStream_t(unsafe.Pointer(uintptr(stream)))
	if res := Result(C.cufftSetStream(C.cufftHandle(plan), cStream)); res != SUCCESS {
		panic(res)
	}
}
// SetCompatibilityMode sets the FFTW compatibility mode of this plan via
// cufftSetCompatibilityMode.
// It panics with the CUFFT Result when the call does not return SUCCESS.
func (plan Handle) SetCompatibilityMode(mode CompatibilityMode) {
	status := C.cufftSetCompatibilityMode(C.cufftHandle(plan), C.cufftCompatibility(mode))
	if res := Result(status); res != SUCCESS {
		panic(res)
	}
}
================================================
FILE: cufft/result.go
================================================
package cufft
//#include <cufft.h>
import "C"
import (
"fmt"
)
// Result is the status code of a CUFFT call. The plan functions in this
// package do not return it; they pass it to panic when it is not SUCCESS.
type Result int
// FFT result values. The first group mirrors the CUFFT_* constants of the
// cufft.h being compiled against; the last four are hard-coded numbers.
const (
SUCCESS Result = C.CUFFT_SUCCESS
INVALID_PLAN Result = C.CUFFT_INVALID_PLAN
ALLOC_FAILED Result = C.CUFFT_ALLOC_FAILED
INVALID_TYPE Result = C.CUFFT_INVALID_TYPE
INVALID_VALUE Result = C.CUFFT_INVALID_VALUE
INTERNAL_ERROR Result = C.CUFFT_INTERNAL_ERROR
EXEC_FAILED Result = C.CUFFT_EXEC_FAILED
SETUP_FAILED Result = C.CUFFT_SETUP_FAILED
INVALID_SIZE Result = C.CUFFT_INVALID_SIZE
UNALIGNED_DATA Result = C.CUFFT_UNALIGNED_DATA
INCOMPLETE_PARAMETER_LIST Result = 0xA // cuda6 values copied to avoid dependency on cuda6/cufft.h
INVALID_DEVICE Result = 0xB
PARSE_ERROR Result = 0xC
NO_WORKSPACE Result = 0xD
)
// String returns the name of the CUFFT constant for r as listed in
// resultString. For a value not listed there it returns a message containing
// the numeric value.
func (r Result) String() string {
	str, ok := resultString[r]
	if !ok {
		// Sprintf with an explicit space: fmt.Sprint adds no separator
		// between a string operand and a number.
		return fmt.Sprintf("CUFFT Result with unknown error number: %d", int(r))
	}
	return str
}
// resultString maps each known Result to the name of its CUFFT constant;
// Result.String looks values up here.
var resultString = map[Result]string{
	SUCCESS:                   "CUFFT_SUCCESS",
	INVALID_PLAN:              "CUFFT_INVALID_PLAN",
	ALLOC_FAILED:              "CUFFT_ALLOC_FAILED",
	INVALID_TYPE:              "CUFFT_INVALID_TYPE",
	INVALID_VALUE:             "CUFFT_INVALID_VALUE",
	INTERNAL_ERROR:            "CUFFT_INTERNAL_ERROR",
	EXEC_FAILED:               "CUFFT_EXEC_FAILED",
	SETUP_FAILED:              "CUFFT_SETUP_FAILED",
	INVALID_SIZE:              "CUFFT_INVALID_SIZE",
	UNALIGNED_DATA:            "CUFFT_UNALIGNED_DATA",
	INCOMPLETE_PARAMETER_LIST: "CUFFT_INCOMPLETE_PARAMETER_LIST",
	INVALID_DEVICE:            "CUFFT_INVALID_DEVICE",
	PARSE_ERROR:               "CUFFT_PARSE_ERROR",
	NO_WORKSPACE:              "CUFFT_NO_WORKSPACE",
}
================================================
FILE: cufft/type.go
================================================
package cufft
//#include <cufft.h>
import "C"
import (
"fmt"
)
// Type is an FFT type: the kind of transform a plan is created for, as
// passed to Plan1d, Plan2d, Plan3d and PlanMany.
type Type int
// FFT types, mirroring the CUFFT_* transform type constants of cufft.h.
const (
R2C Type = C.CUFFT_R2C // Real to Complex (interleaved)
C2R Type = C.CUFFT_C2R // Complex (interleaved) to Real
C2C Type = C.CUFFT_C2C // Complex to Complex, interleaved
D2Z Type = C.CUFFT_D2Z // Double to Double-Complex
Z2D Type = C.CUFFT_Z2D // Double-Complex to Double
Z2Z Type = C.CUFFT_Z2Z // Double-Complex to Double-Complex
)
// Transform directions, intended as the direction argument of ExecC2C and
// ExecZ2Z.
const (
FORWARD = -1 // Forward FFT
INVERSE = 1 // Inverse FFT
)
// String returns the name of the CUFFT constant for t as listed in
// typeString. For a value not listed there it returns a message containing
// the numeric value.
func (t Type) String() string {
	str, ok := typeString[t]
	if !ok {
		// Sprintf with an explicit space: fmt.Sprint adds no separator
		// between a string operand and a number.
		return fmt.Sprintf("CUFFT Type with unknown number: %d", int(t))
	}
	return str
}
// typeString maps each known Type to the name of its CUFFT constant;
// Type.String looks values up here.
var typeString = map[Type]string{
	R2C: "CUFFT_R2C",
	C2R: "CUFFT_C2R",
	C2C: "CUFFT_C2C",
	D2Z: "CUFFT_D2Z",
	Z2D: "CUFFT_Z2D",
	Z2Z: "CUFFT_Z2Z",
}
================================================
FILE: curand/Makefile
================================================
# Default target: build with the gc toolchain, build with gccgo, regenerate README.
all: 6g gccgo doc
# Install with the gc toolchain, then vet and gofmt the sources in place.
6g:
go install -v
go tool vet *.go
gofmt -w *.go
# Compiler selection and flags used by every gccgo-based target below.
GCCGO=gccgo -gccgoflags '-static-libgcc -O3'
# Build (without installing) using gccgo.
gccgo:
go build -v -compiler $(GCCGO)
# Run the tests with both compilers.
test: 6gtest gccgotest
6gtest:
go test
gccgotest:
go test -compiler $(GCCGO)
# Run the benchmarks with both compilers.
bench: 6gbench gccgobench
6gbench:
go test -bench=.
gccgobench:
go test -bench=. -compiler $(GCCGO)
# Remove build artifacts.
clean:
go clean
# Regenerate README from the package's godoc output.
doc:
godoc github.com/barnex/cuda5/curand > README
================================================
FILE: curand/README
================================================
PACKAGE DOCUMENTATION
package curand
import "github.com/barnex/cuda5/curand"
TYPES
type Generator uintptr
func CreateGenerator(rngType RngType) Generator
func (g Generator) GenerateNormal(output uintptr, n int64, mean, stddev float32)
func (g Generator) SetSeed(seed int64)
type RngType int
const (
PSEUDO_DEFAULT RngType = C.CURAND_RNG_PSEUDO_DEFAULT // Default pseudorandom generator
PSEUDO_XORWOW RngType = C.CURAND_RNG_PSEUDO_XORWOW // XORWOW pseudorandom generator
QUASI_DEFAULT RngType = C.CURAND_RNG_QUASI_DEFAULT // Default quasirandom generator
QUASI_SOBOL32 RngType = C.CURAND_RNG_QUASI_SOBOL32 // Sobol32 quasirandom generator
QUASI_SCRAMBLED_SOBOL32 RngType = C.CURAND_RNG_QUASI_SCRAMBLED_SOBOL32 // Scrambled Sobol32 quasirandom generator
QUASI_SOBOL64 RngType = C.CURAND_RNG_QUASI_SOBOL64 // Sobol64 quasirandom generator
QUASI_SCRAMBLED_SOBOL64 RngType = C.CURAND_RNG_QUASI_SCRAMBLED_SOBOL64 // Scrambled Sobol64 quasirandom generator
)
type Status int
const (
SUCCESS Status = C.CURAND_STATUS_SUCCESS // No errors
VERSION_MISMATCH Status = C.CURAND_STATUS_VERSION_MISMATCH // Header file and linked library version do not match
NOT_INITIALIZED Status = C.CURAND_STATUS_NOT_INITIALIZED // Generator not initialized
ALLOCATION_FAILED Status = C.CURAND_STATUS_ALLOCATION_FAILED // Memory allocation failed
TYPE_ERROR Status = C.CURAND_STATUS_TYPE_ERROR // Generator is wrong type
OUT_OF_RANGE Status = C.CURAND_STATUS_OUT_OF_RANGE // Argument out of range
LENGTH_NOT_MULTIPLE Status = C.CURAND_STATUS_LENGTH_NOT_MULTIPLE // Length requested is not a multiple of dimension
LAUNCH_FAILURE Status = C.CURAND_STATUS_LAUNCH_FAILURE // Kernel launch failure
PREEXISTING_FAILURE Status = C.CURAND_STATUS_PREEXISTING_FAILURE // Preexisting failure on library entry
INITIALIZATION_FAILED Status = C.CURAND_STATUS_INITIALIZATION_FAILED // Initialization of CUDA failed
ARCH_MISMATCH Status = C.CURAND_STATUS_ARCH_MISMATCH // Architecture mismatch, GPU does not support requested feature
INTERNAL_ERROR Status = C.CURAND_STATUS_INTERNAL_ERROR // Internal library error
)
================================================
FILE: curand/cgoflags.go
================================================
package curand
// This file provides CGO flags to find CUDA libraries and headers.
//#cgo LDFLAGS:-lcurand
//
////default location:
//#cgo LDFLAGS:-L/usr/local/cuda/lib64 -L/usr/local/cuda/lib
//#cgo CFLAGS: -I/usr/local/cuda/include/
//
////default location if not properly symlinked:
//#cgo LDFLAGS:-L/usr/local/cuda-6.0/lib64 -L/usr/local/cuda-6.0/lib
//#cgo LDFLAGS:-L/usr/local/cuda-5.5/lib64 -L/usr/local/cuda-5.5/lib
//#cgo LDFLAGS:-L/usr/local/cuda-5.0/lib64 -L/usr/local/cuda-5.0/lib
//#cgo CFLAGS: -I/usr/local/cuda-6.0/include/
//#cgo CFLAGS: -I/usr/local/cuda-5.5/include/
//#cgo CFLAGS: -I/usr/local/cuda-5.0/include/
//
////arch linux:
//#cgo LDFLAGS:-L/opt/cuda/lib64 -L/opt/cuda/lib
//#cgo CFLAGS: -I/opt/cuda/include
//
////WINDOWS:
//#cgo windows LDFLAGS:-LC:/cuda/v5.0/lib/x64 -LC:/cuda/v5.5/lib/x64 -LC:/cuda/v6.0/lib/x64
//#cgo windows CFLAGS: -IC:/cuda/v5.0/include -IC:/cuda/v5.5/include -IC:/cuda/v6.0/include -w
import "C"
================================================
FILE: curand/generator.go
================================================
package curand
//#include <curand.h>
import "C"
import (
"unsafe"
)
// Generator is a handle to a CURAND generator: a curandGenerator_t stored as
// a uintptr, as returned by CreateGenerator.
type Generator uintptr
// RngType selects the kind of generator that CreateGenerator creates.
type RngType int
// Generator types, mirroring the CURAND_RNG_* constants of curand.h.
const (
PSEUDO_DEFAULT RngType = C.CURAND_RNG_PSEUDO_DEFAULT // Default pseudorandom generator
PSEUDO_XORWOW RngType = C.CURAND_RNG_PSEUDO_XORWOW // XORWOW pseudorandom generator
QUASI_DEFAULT RngType = C.CURAND_RNG_QUASI_DEFAULT // Default quasirandom generator
QUASI_SOBOL32 RngType = C.CURAND_RNG_QUASI_SOBOL32 // Sobol32 quasirandom generator
QUASI_SCRAMBLED_SOBOL32 RngType = C.CURAND_RNG_QUASI_SCRAMBLED_SOBOL32 // Scrambled Sobol32 quasirandom generator
QUASI_SOBOL64 RngType = C.CURAND_RNG_QUASI_SOBOL64 // Sobol64 quasirandom generator
QUASI_SCRAMBLED_SOBOL64 RngType = C.CURAND_RNG_QUASI_SCRAMBLED_SOBOL64 // Scrambled Sobol64 quasirandom generator
)
// CreateGenerator creates a generator of the given type via
// curandCreateGenerator and returns its handle.
// It panics with the CURAND Status when the call does not return SUCCESS.
func CreateGenerator(rngType RngType) Generator {
	var gen C.curandGenerator_t
	if status := Status(C.curandCreateGenerator(&gen, C.curandRngType_t(rngType))); status != SUCCESS {
		panic(status)
	}
	// The C handle is stored as a plain integer inside the Go handle type.
	addr := uintptr(unsafe.Pointer(gen))
	return Generator(addr)
}
// GenerateNormal calls curandGenerateNormal on this generator, passing
// output reinterpreted as a *float together with the count n and the mean
// and stddev parameters.
// NOTE(review): output is presumably the address of a device buffer holding
// at least n float32 values — confirm against callers.
// It panics with the CURAND Status when the call does not return SUCCESS.
func (g Generator) GenerateNormal(output uintptr, n int64, mean, stddev float32) {
	gen := C.curandGenerator_t(unsafe.Pointer(uintptr(g)))
	dst := (*C.float)(unsafe.Pointer(output))
	status := C.curandGenerateNormal(gen, dst, C.size_t(n), C.float(mean), C.float(stddev))
	if res := Status(status); res != SUCCESS {
		panic(res)
	}
}
// SetSeed sets the seed of this generator via
// curandSetPseudoRandomGeneratorSeed. seed is converted to C's unsigned long
// long before the call.
// It panics with the CURAND Status when the call does not return SUCCESS.
func (g Generator) SetSeed(seed int64) {
	gen := C.curandGenerator_t(unsafe.Pointer(uintptr(g)))
	// Use the documented cgo name C.ulonglong rather than the generated
	// identifier _Ctype_ulonglong, which is an implementation detail of cgo.
	if status := Status(C.curandSetPseudoRandomGeneratorSeed(gen, C.ulonglong(seed))); status != SUCCESS {
		panic(status)
	}
}
// Documentation was taken from the curand headers.
================================================
FILE: curand/status.go
================================================
package curand
//#include <curand.h>
import "C"
import (
"fmt"
)
// Status is the status code of a CURAND call. The generator functions in
// this package do not return it; they pass it to panic when it is not SUCCESS.
type Status int
// Status values, mirroring the CURAND_STATUS_* constants of curand.h.
const (
SUCCESS Status = C.CURAND_STATUS_SUCCESS // No errors
VERSION_MISMATCH Status = C.CURAND_STATUS_VERSION_MISMATCH // Header file and linked library version do not match
NOT_INITIALIZED Status = C.CURAND_STATUS_NOT_INITIALIZED // Generator not initialized
ALLOCATION_FAILED Status = C.CURAND_STATUS_ALLOCATION_FAILED // Memory allocation failed
TYPE_ERROR Status = C.CURAND_STATUS_TYPE_ERROR // Generator is wrong type
OUT_OF_RANGE Status = C.CURAND_STATUS_OUT_OF_RANGE // Argument out of range
LENGTH_NOT_MULTIPLE Status = C.CURAND_STATUS_LENGTH_NOT_MULTIPLE // Length requested is not a multiple of dimension
LAUNCH_FAILURE Status = C.CURAND_STATUS_LAUNCH_FAILURE // Kernel launch failure
PREEXISTING_FAILURE Status = C.CURAND_STATUS_PREEXISTING_FAILURE // Preexisting failure on library entry
INITIALIZATION_FAILED Status = C.CURAND_STATUS_INITIALIZATION_FAILED // Initialization of CUDA failed
ARCH_MISMATCH Status = C.CURAND_STATUS_ARCH_MISMATCH // Architecture mismatch, GPU does not support requested feature
INTERNAL_ERROR Status = C.CURAND_STATUS_INTERNAL_ERROR // Internal library error
)
// String returns the name of the CURAND constant for s as listed in
// statusStr. For a value not listed there it returns "CURAND ERROR NUMBER "
// followed by the numeric value.
func (s Status) String() string {
	if str, ok := statusStr[s]; ok {
		return str
	}
	return fmt.Sprint("CURAND ERROR NUMBER ", int(s))
}
// statusStr maps each known Status to the name of its CURAND_STATUS_*
// constant. Status.String looks values up here and falls back to a numeric
// message for values that are not listed.
var statusStr = map[Status]string{
SUCCESS: "CURAND_STATUS_SUCCESS",
VERSION_MISMATCH: "CURAND_STATUS_VERSION_MISMATCH",
NOT_INITIALIZED: "CURAND_STATUS_NOT_INITIALIZED",
ALLOCATION_FAILED: "CURAND_STATUS_ALLOCATION_FAILED",
TYPE_ERROR: "CURAND_STATUS_TYPE_ERROR",
OUT_OF_RANGE: "CURAND_STATUS_OUT_OF_RANGE",
LENGTH_NOT_MULTIPLE: "CURAND_STATUS_LENGTH_NOT_MULTIPLE",
LAUNCH_FAILURE: "CURAND_STATUS_LAUNCH_FAILURE",
PREEXISTING_FAILURE: "CURAND_STATUS_PREEXISTING_FAILURE",
INITIALIZATION_FAILED: "CURAND_STATUS_INITIALIZATION_FAILED",
ARCH_MISMATCH: "CURAND_STATUS_ARCH_MISMATCH",
INTERNAL_ERROR: "CURAND_STATUS_INTERNAL_ERROR",
}
// Documentation was taken from the curand headers.
================================================
FILE: doc.go
================================================
/*
Go bindings for nVIDIA CUDA 5.
This package compiles with both gc and gccgo.
*/
package cuda5
// Dummy imports so that
// go get github.com/barnex/cuda5
// will install everything.
import (
_ "github.com/barnex/cuda5/cu"
_ "github.com/barnex/cuda5/cufft"
_ "github.com/barnex/cuda5/safe"
)
================================================
FILE: safe/Makefile
================================================
# Default target: build with the gc toolchain and regenerate README
# (the gccgo build is commented out of the prerequisite list).
all: 6g doc #gccgo
# Install with the gc toolchain, then vet and gofmt the sources in place.
6g:
go install -v
go tool vet *.go
gofmt -w *.go
# Compiler selection and flags used by every gccgo-based target below.
GCCGO=gccgo -gccgoflags '-static-libgcc -O3'
# Build (without installing) using gccgo.
gccgo:
go build -v -compiler $(GCCGO)
# Run the tests with both compilers.
test: 6gtest gccgotest
6gtest:
go test
gccgotest:
go test -compiler $(GCCGO)
# Run the benchmarks with both compilers.
bench: 6gbench gccgobench
6gbench:
go test -bench=.
gccgobench:
go test -bench=. -compiler $(GCCGO)
# Remove build artifacts, then run go-optview with -c -w over the sources and
# reformat them. NOTE(review): -c presumably reverts what the opt target
# applies - confirm against the go-optview documentation.
clean:
go clean
go-optview -c -w *.go
gofmt -w *.go
# Run go-optview with -w over the sources and reformat them.
opt:
go-optview -w *.go
gofmt -w *.go
# Regenerate README from the package's godoc output.
doc:
godoc github.com/barnex/cuda5/safe > README
================================================
FILE: safe/README
================================================
PACKAGE
package safe
import "github.com/barnex/cuda5/safe"
Safe and more idiomatic wrappers for the low-level CUDA functions.
FUNCTIONS
func InitCuda()
TYPES
type Complex128s struct {
// contains filtered or unexported fields
}
Slice of complex128's on the GPU.
func MakeComplex128s(len_ int) Complex128s
Make a slice of complex128's on the GPU. Initialized to zero.
func (s *Complex128s) Cap() int
Slice capacity.
func (dst Complex128s) CopyDtoD(src Complex128s)
Copy src on device to dst on device.
func (dst Complex128s) CopyDtoDAsync(src Complex128s, stream cu.Stream)
Copy src on device to dst on device, asynchronously.
func (src Complex128s) CopyDtoH(dst []complex128)
Copy src from device to dst on host.
func (src Complex128s) CopyDtoHAsync(dst []complex128, stream cu.Stream)
Copy src from device to dst on host, asynchronously.
func (dst Complex128s) CopyHtoD(src []complex128)
Copy src from host to dst on the device.
func (dst Complex128s) CopyHtoDAsync(src []complex128, stream cu.Stream)
Copy src from host to dst on the device, asynchronously.
func (s Complex128s) Float() Float64s
Re-interpret the array as float numbers, in interleaved format.
Underlying storage is shared.
func (s *Complex128s) Free()
Free the underlying storage. To be used with care. Free() should only be
called on a slice created by MakeXXX(), not on a slice created by
x.Slice(). Freeing a slice invalidates all other slices referring to it.
func (src Complex128s) Host() []complex128
Returns a fresh copy on host.
func (s *Complex128s) Len() int
Slice length (number of elements).
func (s *Complex128s) Pointer() cu.DevicePtr
Pointer to the first element.
func (s Complex128s) Slice(start, stop int) Complex128s
Return a slice from start (inclusive) to stop (exclusive), sharing the
underlying storage with the original slice. Slices obtained in this way
should not be Free()'d
func (s *Complex128s) UnsafeSet(pointer unsafe.Pointer, length, capacity int)
Manually set the pointer, length and capacity. Side-steps the security
mechanisms, use with caution.
type Complex64s struct {
// contains filtered or unexported fields
}
Slice of complex64's on the GPU.
func MakeComplex64s(len_ int) Complex64s
Make a slice of complex64's on the GPU. Initialized to zero.
func (s *Complex64s) Cap() int
Slice capacity.
func (dst Complex64s) CopyDtoD(src Complex64s)
Copy src on device to dst on device.
func (dst Complex64s) CopyDtoDAsync(src Complex64s, stream cu.Stream)
Copy src on device to dst on device, asynchronously.
func (src Complex64s) CopyDtoH(dst []complex64)
Copy src from device to dst on host.
func (src Complex64s) CopyDtoHAsync(dst []complex64, stream cu.Stream)
Copy src from device to dst on host, asynchronously.
func (dst Complex64s) CopyHtoD(src []complex64)
Copy src from host to dst on the device.
func (dst Complex64s) CopyHtoDAsync(src []complex64, stream cu.Stream)
Copy src from host to dst on the device, asynchronously.
func (s Complex64s) Float() Float32s
Re-interpret the array as float numbers, in interleaved format.
Underlying storage is shared.
func (s *Complex64s) Free()
Free the underlying storage. To be used with care. Free() should only be
called on a slice created by MakeXXX(), not on a slice created by
x.Slice(). Freeing a slice invalidates all other slices referring to it.
func (src Complex64s) Host() []complex64
Returns a fresh copy on host.
func (s *Complex64s) Len() int
Slice length (number of elements).
func (s *Complex64s) Pointer() cu.DevicePtr
Pointer to the first element.
func (s Complex64s) Slice(start, stop int) Complex64s
Return a slice from start (inclusive) to stop (exclusive), sharing the
underlying storage with the original slice. Slices obtained in this way
should not be Free()'d
func (s *Complex64s) UnsafeSet(pointer unsafe.Pointer, length, capacity int)
Manually set the pointer, length and capacity. Side-steps the security
mechanisms, use with caution.
type FFT1DC2RPlan struct {
// contains filtered or unexported fields
}
1D single-precision complex-to-real FFT plan.
func FFT1DC2R(size, batch int) FFT1DC2RPlan
1D single-precision complex-to-real FFT plan.
func (p FFT1DC2RPlan) Destroy()
Releases all resources associated with the FFT plan.
func (p FFT1DC2RPlan) Exec(src Complex64s, dst Float32s)
Execute the FFT plan. Synchronized.
func (p FFT1DC2RPlan) InputLen() int
Required length of the input array.
func (p FFT1DC2RPlan) OutputLen() int
Required length of the output array.
func (p FFT1DC2RPlan) SetStream(stream cu.Stream)
Associates a CUDA stream with the FFT plan. If a stream is set,
plan.Stream().Synchronize() can be called to wait for the execution
to finish.
func (s FFT1DC2RPlan) Size() int
Returns the logical size of the FFT: the number of elements (real or
complex) it transforms.
func (p FFT1DC2RPlan) Stream() cu.Stream
Returns the CUDA stream associated with the FFT plan.
type FFT1DR2CPlan struct {
// contains filtered or unexported fields
}
1D single-precision real-to-complex FFT plan.
func FFT1DR2C(size, batch int) FFT1DR2CPlan
1D single-precision real-to-complex FFT plan.
func (p FFT1DR2CPlan) Destroy()
Releases all resources associated with the FFT plan.
func (p FFT1DR2CPlan) Exec(src Float32s, dst Complex64s)
Execute the FFT plan. Synchronized.
func (p FFT1DR2CPlan) InputLen() int
Required length of the input array.
func (p FFT1DR2CPlan) OutputLen() int
Required length of the output array.
func (p FFT1DR2CPlan) SetStream(stream cu.Stream)
Associates a CUDA stream with the FFT plan. If a stream is set,
plan.Stream().Synchronize() can be called to wait for the execution
to finish.
func (s FFT1DR2CPlan) Size() int
Returns the logical size of the FFT: the number of elements (real or
complex) it transforms.
func (p FFT1DR2CPlan) Stream() cu.Stream
Returns the CUDA stream associated with the FFT plan.
type FFT3DC2RPlan struct {
// contains filtered or unexported fields
}
3D single-precision complex-to-real FFT plan.
func FFT3DC2R(Nx, Ny, Nz int) FFT3DC2RPlan
3D single-precision complex-to-real FFT plan.
func (p FFT3DC2RPlan) Destroy()
Releases all resources associated with the FFT plan.
func (p FFT3DC2RPlan) Exec(src Complex64s, dst Float32s)
Execute the FFT plan. src and dst are 3D arrays stored as 1D arrays.
func (p FFT3DC2RPlan) InputLen() int
Required length of the (1D) input array.
func (p FFT3DC2RPlan) InputSize() (Nx, Ny, Nz int)
3D size of the input array.
func (p FFT3DC2RPlan) OutputLen() int
Required length of the (1D) output array.
func (p FFT3DC2RPlan) OutputSize() (Nx, Ny, Nz int)
3D size of the output array.
func (p FFT3DC2RPlan) SetStream(stream cu.Stream)
Associates a CUDA stream with the FFT plan. If a stream is set,
plan.Stream().Synchronize() can be called to wait for the execution
to finish.
func (s FFT3DC2RPlan) Size() (Nx, Ny, Nz int)
Returns the logical size of the FFT: the number of elements (real or
complex) it transforms.
func (p FFT3DC2RPlan) Stream() cu.Stream
Returns the CUDA stream associated with the FFT plan.
type FFT3DD2ZPlan struct {
// contains filtered or unexported fields
}
3D double-precision real-to-complex FFT plan.
func FFT3DD2Z(Nx, Ny, Nz int) FFT3DD2ZPlan
3D double-precision real-to-complex FFT plan.
func (p FFT3DD2ZPlan) Destroy()
Releases all resources associated with the FFT plan.
func (p FFT3DD2ZPlan) Exec(src Float64s, dst Complex128s)
Execute the FFT plan. Synchronized. src and dst are 3D arrays stored as
1D arrays.
func (p FFT3DD2ZPlan) InputLen() int
Required length of the (1D) input array.
func (p FFT3DD2ZPlan) InputSize() (Nx, Ny, Nz int)
3D size of the input array.
func (p FFT3DD2ZPlan) OutputLen() int
Required length of the (1D) output array.
func (p FFT3DD2ZPlan) OutputSize() (Nx, Ny, Nz int)
3D size of the output array.
func (p FFT3DD2ZPlan) SetStream(stream cu.Stream)
Associates a CUDA stream with the FFT plan. If a stream is set,
plan.Stream().Synchronize() can be called to wait for the execution
to finish.
func (s FFT3DD2ZPlan) Size() (Nx, Ny, Nz int)
Returns the logical size of the FFT: the number of elements (real or
complex) it transforms.
func (p FFT3DD2ZPlan) Stream() cu.Stream
Returns the CUDA stream associated with the FFT plan.
type FFT3DR2CPlan struct {
// contains filtered or unexported fields
}
3D single-precision real-to-complex FFT plan.
func FFT3DR2C(Nx, Ny, Nz int) FFT3DR2CPlan
3D single-precision real-to-complex FFT plan.
func (p FFT3DR2CPlan) Destroy()
Releases all resources associated with the FFT plan.
func (p FFT3DR2CPlan) Exec(src Float32s, dst Complex64s)
Execute the FFT plan. Synchronized. src and dst are 3D arrays stored as
1D arrays.
func (p FFT3DR2CPlan) InputLen() int
Required length of the (1D) input array.
func (p FFT3DR2CPlan) InputSize() (Nx, Ny, Nz int)
3D size of the input array.
func (p FFT3DR2CPlan) OutputLen() int
Required length of the (1D) output array.
func (p FFT3DR2CPlan) OutputSize() (Nx, Ny, Nz int)
3D size of the output array.
func (p FFT3DR2CPlan) SetStream(stream cu.Stream)
Associates a CUDA stream with the FFT plan. If a stream is set,
plan.Stream().Synchronize() can be called to wait for the execution
to finish.
func (s FFT3DR2CPlan) Size() (Nx, Ny, Nz int)
Returns the logical size of the FFT: the number of elements (real or
complex) it transforms.
func (p FFT3DR2CPlan) Stream() cu.Stream
Returns the CUDA stream associated with the FFT plan.
type FFT3DZ2DPlan struct {
// contains filtered or unexported fields
}
3D double-precision complex-to-real FFT plan.
func FFT3DZ2D(Nx, Ny, Nz int) FFT3DZ2DPlan
3D double-precision complex-to-real FFT plan.
func (p FFT3DZ2DPlan) Destroy()
Releases all resources associated with the FFT plan.
func (p FFT3DZ2DPlan) Exec(src Complex128s, dst Float64s)
Execute the FFT plan. Synchronized. src and dst are 3D arrays stored as
1D arrays.
func (p FFT3DZ2DPlan) InputLen() int
Required length of the (1D) input array.
func (p FFT3DZ2DPlan) InputSize() (Nx, Ny, Nz int)
3D size of the input array.
func (p FFT3DZ2DPlan) OutputLen() int
Required length of the (1D) output array.
func (p FFT3DZ2DPlan) OutputSize() (Nx, Ny, Nz int)
3D size of the output array.
func (p FFT3DZ2DPlan) SetStream(stream cu.Stream)
Associates a CUDA stream with the FFT plan. If a stream is set,
plan.Stream().Synchronize() can be called to wait for the execution
to finish.
func (s FFT3DZ2DPlan) Size() (Nx, Ny, Nz int)
Returns the logical size of the FFT: the number of elements (real or
complex) it transforms.
func (p FFT3DZ2DPlan) Stream() cu.Stream
Returns the CUDA stream associated with the FFT plan.
type Float32s struct {
// contains filtered or unexported fields
}
Slice of float32's on the GPU.
func MakeFloat32s(len_ int) Float32s
Make a slice of float32's on the GPU. Initialized to zero.
func (s *Float32s) Cap() int
Slice capacity.
func (s Float32s) Complex() Complex64s
Re-interpret the array as complex numbers, in interleaved format.
Underlying storage is shared.
func (dst Float32s) CopyDtoD(src Float32s)
Copy src on the device to dst on the device.
func (dst Float32s) CopyDtoDAsync(src Float32s, stream cu.Stream)
Copy src on the device to dst on the device, asynchronously.
func (src Float32s) CopyDtoH(dst []float32)
Copy src from device to dst on host.
func (src Float32s) CopyDtoHAsync(dst []float32, stream cu.Stream)
Copy src from device to dst on host, asynchronously.
func (dst Float32s) CopyHtoD(src []float32)
Copy src from host to dst on the device.
func (dst Float32s) CopyHtoDAsync(src []float32, stream cu.Stream)
Copy src from host to dst on the device, asynchronously.
func (s *Float32s) Free()
Free the underlying storage. To be used with care. Free() should only be
called on a slice created by MakeXXX(), not on a slice created by
x.Slice(). Freeing a slice invalidates all other slices referring to it.
func (src Float32s) Host() []float32
Returns a fresh copy on host.
func (s *Float32s) Len() int
Slice length (number of elements).
func (s Float32s) Memset(value float32)
Set the entire slice to this value.
func (s Float32s) MemsetAsync(value float32, stream cu.Stream)
Set the entire slice to this value, asynchronously.
func (s *Float32s) Pointer() cu.DevicePtr
Pointer to the first element.
func (s Float32s) Slice(start, stop int) Float32s
Return a slice from start (inclusive) to stop (exclusive), sharing the
underlying storage with the original slice. Slices obtained in this way
should not be Free()'d
func (s *Float32s) UnsafeSet(pointer unsafe.Pointer, length, capacity int)
Manually set the pointer, length and capacity. Side-steps the security
mechanisms, use with caution.
type Float64s struct {
// contains filtered or unexported fields
}
Slice of float64's on the GPU.
func MakeFloat64s(len_ int) Float64s
Make a slice of float64's on the GPU. Initialized to zero.
func (s *Float64s) Cap() int
Slice capacity.
func (s Float64s) Complex() Complex128s
Re-interpret the array as complex numbers, in interleaved format.
Underlying storage is shared.
func (dst Float64s) CopyDtoD(src Float64s)
Copy src on the device to dst on the device.
func (dst Float64s) CopyDtoDAsync(src Float64s, stream cu.Stream)
Copy src on the device to dst on the device, asynchronously.
func (src Float64s) CopyDtoH(dst []float64)
Copy src from device to dst on host.
func (src Float64s) CopyDtoHAsync(dst []float64, stream cu.Stream)
Copy src from device to dst on host, asynchronously.
func (dst Float64s) CopyHtoD(src []float64)
Copy src from host to dst on the device.
func (dst Float64s) CopyHtoDAsync(src []float64, stream cu.Stream)
Copy src from host to dst on the device, asynchronously.
func (s *Float64s) Free()
Free the underlying storage. To be used with care. Free() should only be
called on a slice created by MakeXXX(), not on a slice created by
x.Slice(). Freeing a slice invalidates all other slices referring to it.
func (src Float64s) Host() []float64
Returns a fresh copy on host.
func (s *Float64s) Len() int
Slice length (number of elements).
func (s *Float64s) Pointer() cu.DevicePtr
Pointer to the first element.
func (s Float64s) Slice(start, stop int) Float64s
Return a slice from start (inclusive) to stop (exclusive), sharing the
underlying storage with the original slice. Slices obtained in this way
should not be Free()'d
func (s *Float64s) UnsafeSet(pointer unsafe.Pointer, length, capacity int)
Manually set the pointer, length and capacity. Side-steps the security
mechanisms, use with caution.
================================================
FILE: safe/complex128s.go
================================================
package safe
import (
"github.com/barnex/cuda5/cu"
"unsafe"
)
// Complex128s is a slice of complex128 values stored on the GPU.
// It embeds the package-internal slice type.
type Complex128s struct{ slice }
// MakeComplex128s makes a GPU slice of len_ complex128 elements.
// The storage is obtained from makeslice with an element size of
// cu.SIZEOF_COMPLEX128 and is initialized to zero.
func MakeComplex128s(len_ int) Complex128s {
	storage := makeslice(len_, cu.SIZEOF_COMPLEX128)
	return Complex128s{storage}
}
// Slice returns the sub-slice running from start (inclusive) to stop
// (exclusive). The result shares its underlying storage with s, so it
// should not be Free()'d.
func (s Complex128s) Slice(start, stop int) Complex128s {
	sub := s.slice.slice(start, stop, cu.SIZEOF_COMPLEX128)
	return Complex128s{sub}
}
// CopyHtoD copies src from the host to dst on the device.
// The address of src[0] is taken, so an empty src panics.
func (dst Complex128s) CopyHtoD(src []complex128) {
	n := len(src)
	host := unsafe.Pointer(&src[0])
	dst.copyHtoD(host, n, cu.SIZEOF_COMPLEX128)
}
// CopyDtoH copies src from the device to dst on the host.
// The address of dst[0] is taken, so an empty dst panics.
func (src Complex128s) CopyDtoH(dst []complex128) {
	n := len(dst)
	host := unsafe.Pointer(&dst[0])
	src.copyDtoH(host, n, cu.SIZEOF_COMPLEX128)
}
// CopyDtoD copies src to dst; both are slices on the device.
func (dst Complex128s) CopyDtoD(src Complex128s) {
	from := &src.slice
	dst.copyDtoD(from, cu.SIZEOF_COMPLEX128)
}
// CopyHtoDAsync copies src from the host to dst on the device,
// asynchronously; stream is handed to the underlying copy.
// The address of src[0] is taken, so an empty src panics.
func (dst Complex128s) CopyHtoDAsync(src []complex128, stream cu.Stream) {
	n := len(src)
	host := unsafe.Pointer(&src[0])
	dst.copyHtoDAsync(host, n, cu.SIZEOF_COMPLEX128, stream)
}
// CopyDtoHAsync copies src from the device to dst on the host,
// asynchronously; stream is handed to the underlying copy.
// The address of dst[0] is taken, so an empty dst panics.
func (src Complex128s) CopyDtoHAsync(dst []complex128, stream cu.Stream) {
	n := len(dst)
	host := unsafe.Pointer(&dst[0])
	src.copyDtoHAsync(host, n, cu.SIZEOF_COMPLEX128, stream)
}
// CopyDtoDAsync copies src to dst, both slices on the device,
// asynchronously; stream is handed to the underlying copy.
func (dst Complex128s) CopyDtoDAsync(src Complex128s, stream cu.Stream) {
	from := &src.slice
	dst.copyDtoDAsync(from, cu.SIZEOF_COMPLEX128, stream)
}
// Host returns a fresh copy of the slice's contents on the host.
//
// A zero-length slice yields an empty (non-nil) host slice without
// issuing a copy: CopyDtoH takes the address of dst[0], which panics
// with an index-out-of-range error when the destination is empty.
func (src Complex128s) Host() []complex128 {
	cpy := make([]complex128, src.Len())
	if len(cpy) == 0 {
		return cpy
	}
	src.CopyDtoH(cpy)
	return cpy
}
// Float re-interprets the array as float64 numbers in interleaved
// format: the pointer is kept while length and capacity are doubled.
// The underlying storage is shared with s.
func (s Complex128s) Float() Float64s {
	doubled := slice{s.ptr_, 2 * s.len_, 2 * s.cap_}
	return Float64s{doubled}
}
================================================
FILE: safe/complex128s_test.go
================================================
package safe
import (
"reflect"
"testing"
)
// TestComplex128sSlice checks that a freshly made GPU slice reads back as
// zeros and that Len/Cap of the slice and of its sub-slices agree with an
// equally sized, equally sliced Go slice.
func TestComplex128sSlice(test *testing.T) {
	InitCuda()

	a := MakeComplex128s(100)
	defer a.Free()

	if !reflect.DeepEqual(a.Host(), make([]complex128, 100)) {
		test.Error(a.Host())
	}

	b := make([]complex128, 100)

	// Length failures report len(...) of the reference slice, the value
	// that was actually compared.
	if a.Len() != len(b) {
		test.Error("len:", a.Len(), "!=", len(b))
	}
	if a.Cap() != cap(b) {
		test.Error("cap:", a.Cap(), "!=", cap(b))
	}

	c := a.Slice(20, 30)
	d := b[20:30]
	if c.Len() != len(d) {
		test.Error("sliced len:", c.Len(), "!=", len(d))
	}
	if c.Cap() != cap(d) {
		test.Error("sliced cap:", c.Cap(), "!=", cap(d))
	}

	e := a.Slice(0, 50)
	f := b[0:50]
	if e.Len() != len(f) {
		test.Error("sliced len:", e.Len(), "!=", len(f))
	}
	if e.Cap() != cap(f) {
		test.Error("sliced cap:", e.Cap(), "!=", cap(f))
	}
}
// TestComplex128sPanic1 expects Slice to panic for a negative start index;
// the test fails when nothing is recovered.
func TestComplex128sPanic1(test *testing.T) {
	InitCuda()

	defer func() {
		recovered := recover()
		test.Log("recovered:", recovered)
		if recovered == nil {
			test.Fail()
		}
	}()

	gpu := MakeComplex128s(100)
	defer gpu.Free()

	gpu.Slice(-1, 10)
}
// TestComplex128sPanic2 expects Slice to panic when stop (101) lies beyond
// the 100 elements that were made; the test fails when nothing is recovered.
func TestComplex128sPanic2(test *testing.T) {
	InitCuda()

	defer func() {
		recovered := recover()
		test.Log("recovered:", recovered)
		if recovered == nil {
			test.Fail()
		}
	}()

	gpu := MakeComplex128s(100)
	defer gpu.Free()

	gpu.Slice(0, 101)
}
// TestComplex128sCopy round-trips data host -> device -> device -> host and
// checks that the first 100 host elements match the input while the
// remaining 100 stay zero.
func TestComplex128sCopy(test *testing.T) {
	InitCuda()

	hostIn := make([]complex128, 100)

	devA := MakeComplex128s(100)
	defer devA.Free()
	devB := MakeComplex128s(100)
	defer devB.Free()

	hostOut := make([]complex128, 200)

	for i := 0; i < len(hostIn); i++ {
		hostIn[i] = complex(float64(i), float64(2*i))
	}

	devA.CopyHtoD(hostIn)
	devB.CopyDtoD(devA)
	devB.CopyDtoH(hostOut[:100])

	copied, untouched := hostOut[:100], hostOut[100:]
	if !reflect.DeepEqual(hostIn, copied) {
		test.Error(hostOut)
	}
	if !reflect.DeepEqual(untouched, make([]complex128, 100)) {
		test.Error(hostOut)
	}
}
================================================
FILE: safe/complex64s.go
================================================
package safe
import (
"github.com/barnex/cuda5/cu"
"unsafe"
)
// Complex64s is a slice of complex64 values stored on the GPU.
// It embeds the package-internal slice type.
type Complex64s struct{ slice }
// MakeComplex64s makes a GPU slice of len_ complex64 elements.
// The storage is obtained from makeslice with an element size of
// cu.SIZEOF_COMPLEX64 and is initialized to zero.
func MakeComplex64s(len_ int) Complex64s {
	storage := makeslice(len_, cu.SIZEOF_COMPLEX64)
	return Complex64s{storage}
}
// Slice returns the sub-slice running from start (inclusive) to stop
// (exclusive). The result shares its underlying storage with s, so it
// should not be Free()'d.
func (s Complex64s) Slice(start, stop int) Complex64s {
	sub := s.slice.slice(start, stop, cu.SIZEOF_COMPLEX64)
	return Complex64s{sub}
}
// CopyHtoD copies src from the host to dst on the device.
// The address of src[0] is taken, so an empty src panics.
func (dst Complex64s) CopyHtoD(src []complex64) {
	n := len(src)
	host := unsafe.Pointer(&src[0])
	dst.copyHtoD(host, n, cu.SIZEOF_COMPLEX64)
}
// CopyDtoH copies src from the device to dst on the host.
// The address of dst[0] is taken, so an empty dst panics.
func (src Complex64s) CopyDtoH(dst []complex64) {
	n := len(dst)
	host := unsafe.Pointer(&dst[0])
	src.copyDtoH(host, n, cu.SIZEOF_COMPLEX64)
}
// CopyDtoD copies src to dst; both are slices on the device.
func (dst Complex64s) CopyDtoD(src Complex64s) {
	from := &src.slice
	dst.copyDtoD(from, cu.SIZEOF_COMPLEX64)
}
// CopyHtoDAsync copies src from the host to dst on the device,
// asynchronously; stream is handed to the underlying copy.
// The address of src[0] is taken, so an empty src panics.
func (dst Complex64s) CopyHtoDAsync(src []complex64, stream cu.Stream) {
	n := len(src)
	host := unsafe.Pointer(&src[0])
	dst.copyHtoDAsync(host, n, cu.SIZEOF_COMPLEX64, stream)
}
// CopyDtoHAsync copies src from the device to dst on the host,
// asynchronously; stream is handed to the underlying copy.
// The address of dst[0] is taken, so an empty dst panics.
func (src Complex64s) CopyDtoHAsync(dst []complex64, stream cu.Stream) {
	n := len(dst)
	host := unsafe.Pointer(&dst[0])
	src.copyDtoHAsync(host, n, cu.SIZEOF_COMPLEX64, stream)
}
// CopyDtoDAsync copies src to dst, both slices on the device,
// asynchronously; stream is handed to the underlying copy.
func (dst Complex64s) CopyDtoDAsync(src Complex64s, stream cu.Stream) {
	from := &src.slice
	dst.copyDtoDAsync(from, cu.SIZEOF_COMPLEX64, stream)
}
// Host returns a fresh copy of the slice's contents on the host.
//
// A zero-length slice yields an empty (non-nil) host slice without
// issuing a copy: CopyDtoH takes the address of dst[0], which panics
// with an index-out-of-range error when the destination is empty.
func (src Complex64s) Host() []complex64 {
	cpy := make([]complex64, src.Len())
	if len(cpy) == 0 {
		return cpy
	}
	src.CopyDtoH(cpy)
	return cpy
}
// Float re-interprets the array as float32 numbers in interleaved
// format: the pointer is kept while length and capacity are doubled.
// The underlying storage is shared with s.
func (s Complex64s) Float() Float32s {
	doubled := slice{s.ptr_, 2 * s.len_, 2 * s.cap_}
	return Float32s{doubled}
}
================================================
FILE: safe/complex64s_test.go
================================================
package safe
import (
"reflect"
"testing"
)
// TestComplex64sSlice checks that a freshly made GPU slice reads back as
// zeros and that Len/Cap of the slice and of its sub-slices agree with an
// equally sized, equally sliced Go slice.
func TestComplex64sSlice(test *testing.T) {
	InitCuda()

	a := MakeComplex64s(100)
	defer a.Free()

	if !reflect.DeepEqual(a.Host(), make([]complex64, 100)) {
		test.Error(a.Host())
	}

	b := make([]complex64, 100)

	// Length failures report len(...) of the reference slice, the value
	// that was actually compared.
	if a.Len() != len(b) {
		test.Error("len:", a.Len(), "!=", len(b))
	}
	if a.Cap() != cap(b) {
		test.Error("cap:", a.Cap(), "!=", cap(b))
	}

	c := a.Slice(20, 30)
	d := b[20:30]
	if c.Len() != len(d) {
		test.Error("sliced len:", c.Len(), "!=", len(d))
	}
	if c.Cap() != cap(d) {
		test.Error("sliced cap:", c.Cap(), "!=", cap(d))
	}

	e := a.Slice(0, 50)
	f := b[0:50]
	if e.Len() != len(f) {
		test.Error("sliced len:", e.Len(), "!=", len(f))
	}
	if e.Cap() != cap(f) {
		test.Error("sliced cap:", e.Cap(), "!=", cap(f))
	}
}
// TestComplex64sPanic1 expects Slice to panic for a negative start index;
// the test fails when nothing is recovered.
func TestComplex64sPanic1(test *testing.T) {
	InitCuda()

	defer func() {
		recovered := recover()
		test.Log("recovered:", recovered)
		if recovered == nil {
			test.Fail()
		}
	}()

	gpu := MakeComplex64s(100)
	defer gpu.Free()

	gpu.Slice(-1, 10)
}
// TestComplex64sPanic2 expects Slice to panic when stop (101) lies beyond
// the 100 elements that were made; the test fails when nothing is recovered.
func TestComplex64sPanic2(test *testing.T) {
	InitCuda()

	defer func() {
		recovered := recover()
		test.Log("recovered:", recovered)
		if recovered == nil {
			test.Fail()
		}
	}()

	gpu := MakeComplex64s(100)
	defer gpu.Free()

	gpu.Slice(0, 101)
}
// TestComplex64sCopy round-trips data host -> device -> device -> host and
// checks that the first 100 host elements match the input while the
// remaining 100 stay zero.
func TestComplex64sCopy(test *testing.T) {
	InitCuda()

	hostIn := make([]complex64, 100)

	devA := MakeComplex64s(100)
	defer devA.Free()
	devB := MakeComplex64s(100)
	defer devB.Free()

	hostOut := make([]complex64, 200)

	for i := 0; i < len(hostIn); i++ {
		hostIn[i] = complex(float32(i), float32(2*i))
	}

	devA.CopyHtoD(hostIn)
	devB.CopyDtoD(devA)
	devB.CopyDtoH(hostOut[:100])

	copied, untouched := hostOut[:100], hostOut[100:]
	if !reflect.DeepEqual(hostIn, copied) {
		test.Error(hostOut)
	}
	if !reflect.DeepEqual(untouched, make([]complex64, 100)) {
		test.Error(hostOut)
	}
}
================================================
FILE: safe/doc.go
================================================
/*
Safe and more idiomatic wrappers for the low-level CUDA functions.
*/
package safe
================================================
FILE: safe/fft1d_test.go
================================================
package safe
import (
"fmt"
)
// ExampleFFT1DR2C runs an out-of-place 1D real-to-complex transform on a
// single batch of 8 elements and prints the input and the output.
func ExampleFFT1DR2C() {
	InitCuda()

	N, batch := 8, 1

	plan := FFT1DR2C(N, batch)
	defer plan.Destroy()

	hostIn := []float32{1, 0, 0, 0, 0, 0, 0, 0}
	devIn := MakeFloat32s(N)
	defer devIn.Free()
	devIn.CopyHtoD(hostIn)

	devOut := MakeComplex64s(plan.OutputLen())
	defer devOut.Free()

	plan.Exec(devIn, devOut)

	fmt.Println("input:", devIn.Host())
	fmt.Println("output:", devOut.Host())

	// Output:
	// input: [1 0 0 0 0 0 0 0]
	// output: [(1+0i) (+1+0i) (+1+0i) (+1-0i) (+1+0i)]
}
// ExampleFFT1DR2C_Inplace runs a batched (2 x 8) real-to-complex transform
// whose real input is a view on the complex output buffer, then transforms
// back with a complex-to-real plan into that same view.
func ExampleFFT1DR2C_Inplace() {
	InitCuda()

	N, batch := 8, 2

	forward := FFT1DR2C(N, batch)
	defer forward.Destroy()

	spectrum := MakeComplex64s(forward.OutputLen())
	defer spectrum.Free()

	// The real data occupies the first InputLen() floats of the complex
	// buffer, so both slices share storage.
	signal := spectrum.Float().Slice(0, forward.InputLen())

	// input uses same layout as out-of-place transform
	// (CUFFT native layout)
	hostIn := []float32{1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0}
	signal.CopyHtoD(hostIn)

	// Print before executing: the transform overwrites the shared storage.
	fmt.Println("input:", signal.Host())

	forward.Exec(signal, spectrum)

	fmt.Println("output:", spectrum.Host())

	backward := FFT1DC2R(N, batch)
	defer backward.Destroy()
	backward.Exec(spectrum, signal)

	fmt.Println("input:", signal.Host())

	// Output:
	// input: [1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0]
	// output: [(1+0i) (+1+0i) (+1+0i) (+1-0i) (+1+0i) (+1+0i) (+1+0i) (+1+0i) (+1-0i) (+1+0i)]
	// input: [8 0 0 0 0 0 0 0 8 0 0 0 0 0 0 0]
}
// ExampleFFT1DC2R runs an out-of-place 1D complex-to-real transform of
// logical size 8 on five complex inputs that are all equal to 1.
func ExampleFFT1DC2R() {
	InitCuda()

	N, batch := 8, 1

	plan := FFT1DC2R(N, batch)
	defer plan.Destroy()

	// Every element is the constant 1+0i.
	hostIn := []complex64{1, 1, 1, 1, 1}
	devIn := MakeComplex64s(plan.InputLen())
	defer devIn.Free()
	devIn.CopyHtoD(hostIn)

	devOut := MakeFloat32s(plan.OutputLen())
	defer devOut.Free()

	plan.Exec(devIn, devOut)

	fmt.Println("input:", devIn.Host())
	fmt.Println("output:", devOut.Host())

	// Output:
	// input: [(1+0i) (+1+0i) (+1+0i) (+1+0i) (+1+0i)]
	// output: [8 0 0 0 0 0 0 0]
}
================================================
FILE: safe/fft1dc2r.go
================================================
package safe
import (
"fmt"
"github.com/barnex/cuda5/cufft"
)
// FFT1DC2RPlan is a 1D single-precision complex-to-real FFT plan.
type FFT1DC2RPlan struct {
fftplan // embedded plan base: cuFFT handle and stream
size1D // embedded logical transform size; provides Size()
batch int // batch count passed to cufft.Plan1d; scales InputLen/OutputLen
}
// FFT1DC2R creates a 1D single-precision complex-to-real FFT plan.
// size and batch are handed to cufft.Plan1d together with cufft.C2R, the
// handle is switched to cufft.COMPATIBILITY_NATIVE, and the plan's stream
// field starts out as 0.
func FFT1DC2R(size, batch int) FFT1DC2RPlan {
	h := cufft.Plan1d(size, cufft.C2R, batch)
	h.SetCompatibilityMode(cufft.COMPATIBILITY_NATIVE)
	return FFT1DC2RPlan{
		fftplan: fftplan{handle: h, stream: 0},
		size1D:  size1D(size),
		batch:   batch,
	}
}
// Exec executes the FFT plan on src, writing to dst, and then synchronizes
// the plan's stream before returning.
//
// It panics with a size-mismatch error when src.Len() differs from
// InputLen() or dst.Len() differs from OutputLen().
func (p FFT1DC2RPlan) Exec(src Complex64s, dst Float32s) {
	if want, got := p.InputLen(), src.Len(); got != want {
		panic(fmt.Errorf("size mismatch: expecting src len %v, got %v", want, got))
	}
	if want, got := p.OutputLen(), dst.Len(); got != want {
		panic(fmt.Errorf("size mismatch: expecting dst len %v, got %v", want, got))
	}

	p.handle.ExecC2R(src.Pointer(), dst.Pointer())
	p.stream.Synchronize() // wait for the transform to finish
}
// OutputLen returns the required length of the (real) output array:
// batch * Size().
func (p FFT1DC2RPlan) OutputLen() int {
	perTransform := p.Size()
	return p.batch * perTransform
}
// InputLen returns the required length of the (complex) input array:
// batch * (Size()/2 + 1).
func (p FFT1DC2RPlan) InputLen() int {
	perTransform := p.Size()/2 + 1
	return p.batch * perTransform
}
================================================
FILE: safe/fft1dr2c.go
================================================
package safe
import (
"fmt"
"github.com/barnex/cuda5/cufft"
)
// FFT1DR2CPlan is a 1D single-precision real-to-complex FFT plan.
type FFT1DR2CPlan struct {
fftplan // embedded plan base: cuFFT handle and stream
size1D // embedded logical transform size; provides Size()
batch int // batch count passed to cufft.Plan1d; scales InputLen/OutputLen
}
// FFT1DR2C creates a 1D single-precision real-to-complex FFT plan.
// size and batch are handed to cufft.Plan1d together with cufft.R2C, the
// handle is switched to cufft.COMPATIBILITY_NATIVE, and the plan's stream
// field starts out as 0.
func FFT1DR2C(size, batch int) FFT1DR2CPlan {
	h := cufft.Plan1d(size, cufft.R2C, batch)
	h.SetCompatibilityMode(cufft.COMPATIBILITY_NATIVE)
	return FFT1DR2CPlan{
		fftplan: fftplan{handle: h, stream: 0},
		size1D:  size1D(size),
		batch:   batch,
	}
}
// Exec executes the FFT plan on src, writing to dst, and then synchronizes
// the plan's stream before returning.
//
// It panics with a size-mismatch error when src.Len() differs from
// InputLen() or dst.Len() differs from OutputLen().
func (p FFT1DR2CPlan) Exec(src Float32s, dst Complex64s) {
	if want, got := p.InputLen(), src.Len(); got != want {
		panic(fmt.Errorf("size mismatch: expecting src len %v, got %v", want, got))
	}
	if want, got := p.OutputLen(), dst.Len(); got != want {
		panic(fmt.Errorf("size mismatch: expecting dst len %v, got %v", want, got))
	}

	p.handle.ExecR2C(src.Pointer(), dst.Pointer())
	p.stream.Synchronize() // wait for the transform to finish
}
// InputLen returns the required length of the (real) input array:
// batch * Size().
func (p FFT1DR2CPlan) InputLen() int {
	perTransform := p.Size()
	return p.batch * perTransform
}
// OutputLen returns the required length of the (complex) output array:
// batch * (Size()/2 + 1).
func (p FFT1DR2CPlan) OutputLen() int {
	perTransform := p.Size()/2 + 1
	return p.batch * perTransform
}
================================================
FILE: safe/fft3d_test.go
================================================
package safe
import (
"fmt"
)
// ExampleFFT3DR2C runs a 2x4x8 real-to-complex transform on an input that
// holds a 1 at the first element of each x-plane.
func ExampleFFT3DR2C() {
	InitCuda()

	Nx, Ny, Nz := 2, 4, 8

	plan := FFT3DR2C(Nx, Ny, Nz)
	defer plan.Destroy()

	devIn := MakeFloat32s(plan.InputLen())
	defer devIn.Free()

	// Set the first element of every x-plane (offsets 0 and Ny*Nz).
	hostIn := make([]float32, Nx*Ny*Nz)
	for ix := 0; ix < Nx; ix++ {
		hostIn[ix*Ny*Nz] = 1
	}
	devIn.CopyHtoD(hostIn)

	devOut := MakeComplex64s(plan.OutputLen())
	defer devOut.Free()

	plan.Exec(devIn, devOut)

	fmt.Println("input:", Reshape3DFloat32(devIn.Host(), Nx, Ny, Nz))
	Ox, Oy, Oz := plan.OutputSize()
	fmt.Println("output:", Reshape3DComplex64(devOut.Host(), Ox, Oy, Oz))

	// Output:
	// input: [[[1 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]] [[1 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]]]
	// output: [[[(2+0i) (+2+0i) (+2+0i) (+2-0i) (+2+0i)] [(+2+0i) (+2+0i) (+2+0i) (+2-0i) (+2+0i)] [(+2+0i) (+2+0i) (+2+0i) (+2-0i) (+2+0i)] [(+2+0i) (+2+0i) (+2+0i) (+2-0i) (+2+0i)]] [[(+0+0i) (+0+0i) (+0+0i) (+0-0i) (+0+0i)] [(+0+0i) (+0+0i) (+0+0i) (+0-0i) (+0+0i)] [(+0+0i) (+0+0i) (+0+0i) (+0-0i) (+0+0i)] [(+0+0i) (+0+0i) (+0+0i) (+0-0i) (+0+0i)]]]
}
// ExampleFFT3DC2R runs a 2x4x8 complex-to-real transform on an input whose
// elements are all equal to 2.
func ExampleFFT3DC2R() {
	InitCuda()

	Nx, Ny, Nz := 2, 4, 8

	plan := FFT3DC2R(Nx, Ny, Nz)
	defer plan.Destroy()

	devIn := MakeComplex64s(plan.InputLen())
	defer devIn.Free()

	hostIn := make([]complex64, plan.InputLen())
	for i := 0; i < len(hostIn); i++ {
		hostIn[i] = 2
	}
	devIn.CopyHtoD(hostIn)

	devOut := MakeFloat32s(plan.OutputLen())
	defer devOut.Free()

	plan.Exec(devIn, devOut)

	Ix, Iy, Iz := plan.InputSize()
	fmt.Println("input:", Reshape3DComplex64(devIn.Host(), Ix, Iy, Iz))
	fmt.Println("output:", Reshape3DFloat32(devOut.Host(), Nx, Ny, Nz))

	// Output:
	// input: [[[(2+0i) (+2+0i) (+2+0i) (+2+0i) (+2+0i)] [(+2+0i) (+2+0i) (+2+0i) (+2+0i) (+2+0i)] [(+2+0i) (+2+0i) (+2+0i) (+2+0i) (+2+0i)] [(+2+0i) (+2+0i) (+2+0i) (+2+0i) (+2+0i)]] [[(+2+0i) (+2+0i) (+2+0i) (+2+0i) (+2+0i)] [(+2+0i) (+2+0i) (+2+0i) (+2+0i) (+2+0i)] [(+2+0i) (+2+0i) (+2+0i) (+2+0i) (+2+0i)] [(+2+0i) (+2+0i) (+2+0i) (+2+0i) (+2+0i)]]]
	// output: [[[128 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]] [[0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]]]
}
// ExampleFFT3D runs a 2x4x8 real-to-complex transform followed by the
// matching complex-to-real transform back into the same input buffer.
// The expected output shows the round trip scaled by 64 (= 2*4*8).
func ExampleFFT3D() {
	InitCuda()

	Nx, Ny, Nz := 2, 4, 8

	forward := FFT3DR2C(Nx, Ny, Nz)
	defer forward.Destroy()

	input := MakeFloat32s(forward.InputLen())
	defer input.Free()

	inputData := make([]float32, forward.InputLen())
	inputData[5] = 1
	input.CopyHtoD(inputData)

	output := MakeComplex64s(forward.OutputLen())
	defer output.Free()

	forward.Exec(input, output)

	backward := FFT3DC2R(Nx, Ny, Nz)
	// Release the inverse plan too, like every other plan in these examples.
	defer backward.Destroy()
	backward.Exec(output, input)

	fmt.Println("input:", Reshape3DFloat32(inputData, Nx, Ny, Nz))
	fmt.Println("forward+inverse:", Reshape3DFloat32(input.Host(), Nx, Ny, Nz))

	// Output:
	// input: [[[0 0 0 0 0 1 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]] [[0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]]]
	// forward+inverse: [[[0 0 0 0 0 64 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]] [[0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]]]
}
//func ExampleFFT3D64() {
// InitCuda()
//
// Nx, Ny, Nz := 2, 4, 8
//
// forward := FFT3DD2Z(Nx, Ny, Nz)
// defer forward.Destroy()
//
// input := MakeFloat64s(forward.InputLen())
// defer input.Free()
//
// inputData := make([]float64, forward.InputLen())
// inputData[5] = 1
// input.CopyHtoD(inputData)
//
// output := MakeComplex128s(forward.OutputLen())
// defer output.Free()
//
// forward.Exec(input, output)
//
// backward := FFT3DZ2D(Nx, Ny, Nz)
// backward.Exec(output, input)
//
// fmt.Println("input:", Reshape3DFloat64(inputData, Nx, Ny, Nz))
// fmt.Println("forward+inverse:", Reshape3DFloat64(input.Host(), Nx, Ny, Nz))
//
// // Output:
// // input: [[[0 0 0 0 0 1 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]] [[0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]]]
// // forward+inverse: [[[0 0 0 0 0 64 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]] [[0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]]]
//}
================================================
FILE: safe/fft3dc2r.go
================================================
package safe
import (
"fmt"
"github.com/barnex/cuda5/cufft"
)
// FFT3DC2RPlan is a 3D single-precision complex-to-real FFT plan.
type FFT3DC2RPlan struct {
fftplan // embedded plan base: cuFFT handle and stream
size3D // embedded logical 3D transform size; provides Size()
}
// FFT3DC2R creates a 3D single-precision complex-to-real FFT plan.
// Nx, Ny and Nz are handed to cufft.Plan3d together with cufft.C2R, the
// handle is switched to cufft.COMPATIBILITY_NATIVE, and the plan's stream
// field starts out as 0.
func FFT3DC2R(Nx, Ny, Nz int) FFT3DC2RPlan {
	h := cufft.Plan3d(Nx, Ny, Nz, cufft.C2R)
	h.SetCompatibilityMode(cufft.COMPATIBILITY_NATIVE)
	return FFT3DC2RPlan{
		fftplan: fftplan{handle: h, stream: 0},
		size3D:  size3D{Nx, Ny, Nz},
	}
}
// Exec executes the FFT plan on src, writing to dst, and then synchronizes
// the plan's stream before returning.
// src and dst are 3D arrays stored as 1D arrays.
//
// It panics with a size-mismatch error when src.Len() differs from
// InputLen() or dst.Len() differs from OutputLen().
func (p FFT3DC2RPlan) Exec(src Complex64s, dst Float32s) {
	if want, got := p.InputLen(), src.Len(); got != want {
		panic(fmt.Errorf("size mismatch: expecting src len %v, got %v", want, got))
	}
	if want, got := p.OutputLen(), dst.Len(); got != want {
		panic(fmt.Errorf("size mismatch: expecting dst len %v, got %v", want, got))
	}

	p.handle.ExecC2R(src.Pointer(), dst.Pointer())
	p.stream.Synchronize() // wait for the transform to finish
}
// InputSize returns the 3D size of the (complex) input array: the first
// two dimensions of the logical size, and half the last one plus 1.
func (p FFT3DC2RPlan) InputSize() (Nx, Ny, Nz int) {
	dims := p.size3D
	return dims[0], dims[1], dims[2]/2 + 1
}
// OutputSize returns the 3D size of the (real) output array, which equals
// the logical size of the transform.
func (p FFT3DC2RPlan) OutputSize() (Nx, Ny, Nz int) {
	dims := p.size3D
	return dims[0], dims[1], dims[2]
}
// InputLen returns the required length of the (1D) input array: the
// product of the three InputSize() dimensions.
func (p FFT3DC2RPlan) InputLen() int {
	nx, ny, nz := p.InputSize()
	return prod3(nx, ny, nz)
}
// OutputLen returns the required length of the (1D) output array: the
// product of the three OutputSize() dimensions.
func (p FFT3DC2RPlan) OutputLen() int {
	nx, ny, nz := p.OutputSize()
	return prod3(nx, ny, nz)
}
================================================
FILE: safe/fft3dd2z.go
================================================
package safe
import (
"fmt"
"github.com/barnex/cuda5/cufft"
)
// FFT3DD2ZPlan is a 3D double-precision real-to-complex FFT plan
// (Float64s in, Complex128s out).
type FFT3DD2ZPlan struct {
fftplan // embedded plan base: cuFFT handle and stream
size3D // embedded logical 3D transform size; provides Size()
}
// FFT3DD2Z creates a 3D double-precision real-to-complex FFT plan.
// Nx, Ny and Nz are handed to cufft.Plan3d together with cufft.D2Z, the
// handle is switched to cufft.COMPATIBILITY_NATIVE, and the plan's stream
// field starts out as 0.
func FFT3DD2Z(Nx, Ny, Nz int) FFT3DD2ZPlan {
	h := cufft.Plan3d(Nx, Ny, Nz, cufft.D2Z)
	h.SetCompatibilityMode(cufft.COMPATIBILITY_NATIVE)
	return FFT3DD2ZPlan{
		fftplan: fftplan{handle: h, stream: 0},
		size3D:  size3D{Nx, Ny, Nz},
	}
}
// Exec executes the FFT plan on src, writing to dst, and then synchronizes
// the plan's stream before returning.
// src and dst are 3D arrays stored as 1D arrays.
//
// It panics with a size-mismatch error when src.Len() differs from
// InputLen() or dst.Len() differs from OutputLen().
func (p FFT3DD2ZPlan) Exec(src Float64s, dst Complex128s) {
	if want, got := p.InputLen(), src.Len(); got != want {
		panic(fmt.Errorf("size mismatch: expecting src len %v, got %v", want, got))
	}
	if want, got := p.OutputLen(), dst.Len(); got != want {
		panic(fmt.Errorf("size mismatch: expecting dst len %v, got %v", want, got))
	}

	p.handle.ExecD2Z(src.Pointer(), dst.Pointer())
	p.stream.Synchronize() // wait for the transform to finish
}
// InputSize returns the 3D size of the (real) input array, which equals
// the logical size of the transform.
func (p FFT3DD2ZPlan) InputSize() (Nx, Ny, Nz int) {
	dims := p.size3D
	return dims[0], dims[1], dims[2]
}
// OutputSize returns the 3D size of the (complex) output array: the first
// two dimensions of the logical size, and half the last one plus 1.
func (p FFT3DD2ZPlan) OutputSize() (Nx, Ny, Nz int) {
	dims := p.size3D
	return dims[0], dims[1], dims[2]/2 + 1
}
// InputLen returns the required length of the (1D) input array: the
// product of the three InputSize() dimensions.
func (p FFT3DD2ZPlan) InputLen() int {
	nx, ny, nz := p.InputSize()
	return prod3(nx, ny, nz)
}
// OutputLen returns the required length of the (1D) output array: the
// product of the three OutputSize() dimensions.
func (p FFT3DD2ZPlan) OutputLen() int {
	nx, ny, nz := p.OutputSize()
	return prod3(nx, ny, nz)
}
================================================
FILE: safe/fft3dr2c.go
================================================
package safe
import (
"fmt"
"github.com/barnex/cuda5/cufft"
)
// FFT3DR2CPlan is a 3D single-precision real-to-complex FFT plan.
type FFT3DR2CPlan struct {
fftplan // embedded plan base: cuFFT handle and stream
size3D // embedded logical 3D transform size; provides Size()
}
// FFT3DR2C creates a 3D single-precision real-to-complex FFT plan.
// Nx, Ny and Nz are handed to cufft.Plan3d together with cufft.R2C, the
// handle is switched to cufft.COMPATIBILITY_NATIVE, and the plan's stream
// field starts out as 0.
func FFT3DR2C(Nx, Ny, Nz int) FFT3DR2CPlan {
	h := cufft.Plan3d(Nx, Ny, Nz, cufft.R2C)
	h.SetCompatibilityMode(cufft.COMPATIBILITY_NATIVE)
	return FFT3DR2CPlan{
		fftplan: fftplan{handle: h, stream: 0},
		size3D:  size3D{Nx, Ny, Nz},
	}
}
// Exec executes the FFT plan on src, writing to dst, and then synchronizes
// the plan's stream before returning.
// src and dst are 3D arrays stored as 1D arrays.
//
// It panics with a size-mismatch error when src.Len() differs from
// InputLen() or dst.Len() differs from OutputLen().
func (p FFT3DR2CPlan) Exec(src Float32s, dst Complex64s) {
	if want, got := p.InputLen(), src.Len(); got != want {
		panic(fmt.Errorf("size mismatch: expecting src len %v, got %v", want, got))
	}
	if want, got := p.OutputLen(), dst.Len(); got != want {
		panic(fmt.Errorf("size mismatch: expecting dst len %v, got %v", want, got))
	}

	p.handle.ExecR2C(src.Pointer(), dst.Pointer())
	p.stream.Synchronize() // wait for the transform to finish
}
// InputSize returns the 3D size of the (real) input array, which equals
// the logical size of the transform.
func (p FFT3DR2CPlan) InputSize() (Nx, Ny, Nz int) {
	dims := p.size3D
	return dims[0], dims[1], dims[2]
}
// OutputSize returns the 3D size of the (complex) output array: the first
// two dimensions of the logical size, and half the last one plus 1.
func (p FFT3DR2CPlan) OutputSize() (Nx, Ny, Nz int) {
	dims := p.size3D
	return dims[0], dims[1], dims[2]/2 + 1
}
// InputLen returns the required length of the (1D) input array: the
// product of the three InputSize() dimensions.
func (p FFT3DR2CPlan) InputLen() int {
	nx, ny, nz := p.InputSize()
	return prod3(nx, ny, nz)
}
// OutputLen returns the required length of the (1D) output array: the
// product of the three OutputSize() dimensions.
func (p FFT3DR2CPlan) OutputLen() int {
	nx, ny, nz := p.OutputSize()
	return prod3(nx, ny, nz)
}
================================================
FILE: safe/fft3dz2d.go
================================================
package safe
import (
"fmt"
"github.com/barnex/cuda5/cufft"
)
// FFT3DZ2DPlan is a 3D double-precision complex-to-real FFT plan
// (Complex128s in, Float64s out).
type FFT3DZ2DPlan struct {
fftplan // embedded plan base: cuFFT handle and stream
size3D // embedded logical 3D transform size; provides Size()
}
// FFT3DZ2D creates a 3D double-precision complex-to-real FFT plan.
// Nx, Ny and Nz are handed to cufft.Plan3d together with cufft.Z2D, the
// handle is switched to cufft.COMPATIBILITY_NATIVE, and the plan's stream
// field starts out as 0.
func FFT3DZ2D(Nx, Ny, Nz int) FFT3DZ2DPlan {
	h := cufft.Plan3d(Nx, Ny, Nz, cufft.Z2D)
	h.SetCompatibilityMode(cufft.COMPATIBILITY_NATIVE)
	return FFT3DZ2DPlan{
		fftplan: fftplan{handle: h, stream: 0},
		size3D:  size3D{Nx, Ny, Nz},
	}
}
// Exec executes the FFT plan on src, writing to dst, and then synchronizes
// the plan's stream before returning.
// src and dst are 3D arrays stored as 1D arrays.
//
// It panics with a size-mismatch error when src.Len() differs from
// InputLen() or dst.Len() differs from OutputLen().
func (p FFT3DZ2DPlan) Exec(src Complex128s, dst Float64s) {
	if want, got := p.InputLen(), src.Len(); got != want {
		panic(fmt.Errorf("size mismatch: expecting src len %v, got %v", want, got))
	}
	if want, got := p.OutputLen(), dst.Len(); got != want {
		panic(fmt.Errorf("size mismatch: expecting dst len %v, got %v", want, got))
	}

	p.handle.ExecZ2D(src.Pointer(), dst.Pointer())
	p.stream.Synchronize() // wait for the transform to finish
}
// InputSize returns the 3D size of the (complex) input array:
// the first two dimensions match the logical size, the last one
// holds Nz/2 + 1 elements.
func (p FFT3DZ2DPlan) InputSize() (Nx, Ny, Nz int) {
	dims := p.size3D
	Nx, Ny = dims[0], dims[1]
	Nz = dims[2]/2 + 1
	return Nx, Ny, Nz
}
// OutputSize returns the 3D size of the (real) output array,
// which equals the logical size of the transform.
func (p FFT3DZ2DPlan) OutputSize() (Nx, Ny, Nz int) {
	dims := p.size3D
	return dims[0], dims[1], dims[2]
}
// InputLen returns the required length of the (1D) input array:
// the product of the three InputSize() dimensions.
func (p FFT3DZ2DPlan) InputLen() int {
	nx, ny, nz := p.InputSize()
	return prod3(nx, ny, nz)
}
// OutputLen returns the required length of the (1D) output array:
// the product of the three OutputSize() dimensions.
func (p FFT3DZ2DPlan) OutputLen() int {
	nx, ny, nz := p.OutputSize()
	return prod3(nx, ny, nz)
}
================================================
FILE: safe/fftplan.go
================================================
package safe
// INTERNAL
// Base implementation for all FFT plans.
import (
"github.com/barnex/cuda5/cu"
"github.com/barnex/cuda5/cufft"
)
// fftplan is the base implementation shared by all FFT plans.
// It is meant to be embedded in the concrete plan types.
type fftplan struct {
handle cufft.Handle // cuFFT plan handle that SetStream and Destroy operate on
stream cu.Stream // stream handed back by Stream()
}
// size1D is the logical size of a 1D FFT.
// It exists for the sake of embedding.
type size1D int

// Size returns the logical size of the FFT:
// the number of elements (real or complex)
// it transforms.
func (n size1D) Size() int {
	return int(n)
}
// size3D is the logical size of a 3D FFT, stored as {Nx, Ny, Nz}.
// It exists for the sake of embedding.
type size3D [3]int

// Size returns the logical size of the FFT:
// the number of elements (real or complex)
// it transforms, per dimension.
func (s size3D) Size() (Nx, Ny, Nz int) {
	Nx, Ny, Nz = s[0], s[1], s[2]
	return Nx, Ny, Nz
}
// prod3 returns the product x*y*z.
func prod3(x, y, z int) int {
	product := x
	product *= y
	product *= z
	return product
}
// Destroy releases all resources associated with the FFT plan
// by destroying its cuFFT handle.
func (p fftplan) Destroy() {
	p.handle.Destroy()
}
// SetStream associates a CUDA stream with the FFT plan and records
// it so that Stream() returns it afterwards.
// Once a stream is set, plan.Stream().Synchronize() can
// be called to wait for the execution to finish.
//
// The receiver is a pointer on purpose: with a value receiver the
// assignment to p.stream only modified a copy of the plan, so the
// stream was passed to the cuFFT handle but never remembered.
func (p *fftplan) SetStream(stream cu.Stream) {
	p.handle.SetStream(stream)
	p.stream = stream
}
// Stream returns the CUDA stream stored in the FFT plan.
func (p fftplan) Stream() cu.Stream {
	current := p.stream
	return current
}
================================================
FILE: safe/float32s.go
================================================
package safe
import (
"fmt"
"github.com/barnex/cuda5/cu"
"math"
"unsafe"
)
// Float32s is a slice of float32's on the GPU.
// It embeds the internal slice base type.
type Float32s struct{ slice }
// MakeFloat32s makes a slice of len_ float32's on the GPU.
// The storage is initialized to zero.
func MakeFloat32s(len_ int) Float32s {
	base := makeslice(len_, cu.SIZEOF_FLOAT32)
	return Float32s{slice: base}
}
// Slice returns a slice from start (inclusive) to stop (exclusive),
// sharing the underlying storage with the original slice.
// Slices obtained in this way should not be Free()'d.
func (s Float32s) Slice(start, stop int) Float32s {
	sub := s.slice.slice(start, stop, cu.SIZEOF_FLOAT32)
	return Float32s{slice: sub}
}
// CopyHtoD copies src from the host to dst on the device.
// Panics if len(src) != dst.Len().
//
// An empty src is passed on as a nil host pointer: taking &src[0]
// of an empty slice would panic with an index-out-of-range error
// before the length check is even reached.
func (dst Float32s) CopyHtoD(src []float32) {
	var host unsafe.Pointer
	if len(src) > 0 {
		host = unsafe.Pointer(&src[0])
	}
	dst.copyHtoD(host, len(src), cu.SIZEOF_FLOAT32)
}
// CopyDtoH copies src from the device to dst on the host.
// Panics if len(dst) != src.Len().
//
// An empty dst is passed on as a nil host pointer: taking &dst[0]
// of an empty slice would panic with an index-out-of-range error
// before the length check is even reached.
func (src Float32s) CopyDtoH(dst []float32) {
	var host unsafe.Pointer
	if len(dst) > 0 {
		host = unsafe.Pointer(&dst[0])
	}
	src.copyDtoH(host, len(dst), cu.SIZEOF_FLOAT32)
}
// CopyDtoD copies src (on the device) to dst (on the device).
// Panics if the lengths of src and dst differ.
func (dst Float32s) CopyDtoD(src Float32s) {
	from := &src.slice
	dst.copyDtoD(from, cu.SIZEOF_FLOAT32)
}
// CopyHtoDAsync copies src from the host to dst on the device,
// asynchronously, on the given stream.
// Panics if len(src) != dst.Len().
//
// An empty src is passed on as a nil host pointer: taking &src[0]
// of an empty slice would panic with an index-out-of-range error
// before the length check is even reached.
func (dst Float32s) CopyHtoDAsync(src []float32, stream cu.Stream) {
	var host unsafe.Pointer
	if len(src) > 0 {
		host = unsafe.Pointer(&src[0])
	}
	dst.copyHtoDAsync(host, len(src), cu.SIZEOF_FLOAT32, stream)
}
// CopyDtoHAsync copies src from the device to dst on the host,
// asynchronously, on the given stream.
// Panics if len(dst) != src.Len().
//
// An empty dst is passed on as a nil host pointer: taking &dst[0]
// of an empty slice would panic with an index-out-of-range error
// before the length check is even reached.
func (src Float32s) CopyDtoHAsync(dst []float32, stream cu.Stream) {
	var host unsafe.Pointer
	if len(dst) > 0 {
		host = unsafe.Pointer(&dst[0])
	}
	src.copyDtoHAsync(host, len(dst), cu.SIZEOF_FLOAT32, stream)
}
// CopyDtoDAsync copies src (on the device) to dst (on the device),
// asynchronously, on the given stream.
// Panics if the lengths of src and dst differ.
func (dst Float32s) CopyDtoDAsync(src Float32s, stream cu.Stream) {
	from := &src.slice
	dst.copyDtoDAsync(from, cu.SIZEOF_FLOAT32, stream)
}
// Host returns a freshly allocated host copy of the device data.
func (src Float32s) Host() []float32 {
	host := make([]float32, src.Len())
	src.CopyDtoH(host)
	return host
}
// Memset sets the entire slice to value and then
// synchronizes the current context.
func (s Float32s) Memset(value float32) {
	bits := math.Float32bits(value)
	count := int64(s.Len())
	cu.MemsetD32(s.Pointer(), bits, count)
	cu.CtxSynchronize()
}
// MemsetAsync sets the entire slice to value, asynchronously,
// on the given stream. It does not synchronize.
func (s Float32s) MemsetAsync(value float32, stream cu.Stream) {
	bits := math.Float32bits(value)
	count := int64(s.Len())
	cu.MemsetD32Async(s.Pointer(), bits, count, stream)
}
// Complex re-interprets the array as complex numbers in
// interleaved format. The underlying storage is shared; length
// and capacity are halved. Panics if the length is odd.
func (s Float32s) Complex() Complex64s {
	n := s.Len()
	if n%2 != 0 {
		panic(fmt.Errorf("complex: need even number of elements, have:%v", n))
	}
	return Complex64s{slice{ptr_: s.ptr_, len_: s.len_ / 2, cap_: s.cap_ / 2}}
}
================================================
FILE: safe/float32s_test.go
================================================
package safe
import (
"reflect"
"testing"
)
// TestFloat32sSlice checks that a freshly made Float32s is zeroed and
// that Len/Cap of the slice and of its sub-slices agree with those of
// an identically sliced host []float32.
func TestFloat32sSlice(test *testing.T) {
	InitCuda()

	a := MakeFloat32s(100)
	defer a.Free()

	if !reflect.DeepEqual(a.Host(), make([]float32, 100)) {
		test.Error(a.Host())
	}

	b := make([]float32, 100)
	// Length mismatches report len(...), not cap(...), so the message
	// shows the value that was actually compared.
	if a.Len() != len(b) {
		test.Error("len:", a.Len(), "!=", len(b))
	}
	if a.Cap() != cap(b) {
		test.Error("cap:", a.Cap(), "!=", cap(b))
	}

	c := a.Slice(20, 30)
	d := b[20:30]
	if c.Len() != len(d) {
		test.Error("sliced len:", c.Len(), "!=", len(d))
	}
	if c.Cap() != cap(d) {
		test.Error("sliced cap:", c.Cap(), "!=", cap(d))
	}

	e := a.Slice(0, 50)
	f := b[0:50]
	if e.Len() != len(f) {
		test.Error("sliced len:", e.Len(), "!=", len(f))
	}
	if e.Cap() != cap(f) {
		test.Error("sliced cap:", e.Cap(), "!=", cap(f))
	}
}
// TestFloat32sPanic1 checks that slicing with a negative start panics.
func TestFloat32sPanic1(test *testing.T) {
	InitCuda()

	// Fail unless the Slice call below panicked.
	defer func() {
		recovered := recover()
		test.Log("recovered:", recovered)
		if recovered == nil {
			test.Fail()
		}
	}()

	dev := MakeFloat32s(100)
	defer dev.Free()
	dev.Slice(-1, 10)
}
// TestFloat32sPanic2 checks that slicing past the capacity panics.
func TestFloat32sPanic2(test *testing.T) {
	InitCuda()

	// Fail unless the Slice call below panicked.
	defer func() {
		recovered := recover()
		test.Log("recovered:", recovered)
		if recovered == nil {
			test.Fail()
		}
	}()

	dev := MakeFloat32s(100)
	defer dev.Free()
	dev.Slice(0, 101)
}
// TestFloat32sCopy round-trips data host -> device -> device -> host
// and checks that the copied half matches the input while the
// untouched half of the destination stays zero.
func TestFloat32sCopy(test *testing.T) {
	InitCuda()

	input := make([]float32, 100)
	first := MakeFloat32s(100)
	defer first.Free()
	second := MakeFloat32s(100)
	defer second.Free()
	output := make([]float32, 200)

	for i := range input {
		input[i] = float32(i)
	}

	first.CopyHtoD(input)
	second.CopyDtoD(first)
	second.CopyDtoH(output[:100])

	copied, untouched := output[:100], output[100:]
	if !reflect.DeepEqual(input, copied) {
		test.Error(output)
	}
	if !reflect.DeepEqual(untouched, make([]float32, 100)) {
		test.Error(output)
	}
}
================================================
FILE: safe/float64s.go
================================================
package safe
import (
"fmt"
"github.com/barnex/cuda5/cu"
"unsafe"
)
// Float64s is a slice of float64's on the GPU.
// It embeds the internal slice base type.
type Float64s struct{ slice }
// MakeFloat64s makes a slice of len_ float64's on the GPU.
// The storage is initialized to zero.
func MakeFloat64s(len_ int) Float64s {
	base := makeslice(len_, cu.SIZEOF_FLOAT64)
	return Float64s{slice: base}
}
// Slice returns a slice from start (inclusive) to stop (exclusive),
// sharing the underlying storage with the original slice.
// Slices obtained in this way should not be Free()'d.
func (s Float64s) Slice(start, stop int) Float64s {
	sub := s.slice.slice(start, stop, cu.SIZEOF_FLOAT64)
	return Float64s{slice: sub}
}
// CopyHtoD copies src from the host to dst on the device.
// Panics if len(src) != dst.Len().
//
// An empty src is passed on as a nil host pointer: taking &src[0]
// of an empty slice would panic with an index-out-of-range error
// before the length check is even reached.
func (dst Float64s) CopyHtoD(src []float64) {
	var host unsafe.Pointer
	if len(src) > 0 {
		host = unsafe.Pointer(&src[0])
	}
	dst.copyHtoD(host, len(src), cu.SIZEOF_FLOAT64)
}
// CopyDtoH copies src from the device to dst on the host.
// Panics if len(dst) != src.Len().
//
// An empty dst is passed on as a nil host pointer: taking &dst[0]
// of an empty slice would panic with an index-out-of-range error
// before the length check is even reached.
func (src Float64s) CopyDtoH(dst []float64) {
	var host unsafe.Pointer
	if len(dst) > 0 {
		host = unsafe.Pointer(&dst[0])
	}
	src.copyDtoH(host, len(dst), cu.SIZEOF_FLOAT64)
}
// CopyDtoD copies src (on the device) to dst (on the device).
// Panics if the lengths of src and dst differ.
func (dst Float64s) CopyDtoD(src Float64s) {
	from := &src.slice
	dst.copyDtoD(from, cu.SIZEOF_FLOAT64)
}
// CopyHtoDAsync copies src from the host to dst on the device,
// asynchronously, on the given stream.
// Panics if len(src) != dst.Len().
//
// An empty src is passed on as a nil host pointer: taking &src[0]
// of an empty slice would panic with an index-out-of-range error
// before the length check is even reached.
func (dst Float64s) CopyHtoDAsync(src []float64, stream cu.Stream) {
	var host unsafe.Pointer
	if len(src) > 0 {
		host = unsafe.Pointer(&src[0])
	}
	dst.copyHtoDAsync(host, len(src), cu.SIZEOF_FLOAT64, stream)
}
// CopyDtoHAsync copies src from the device to dst on the host,
// asynchronously, on the given stream.
// Panics if len(dst) != src.Len().
//
// An empty dst is passed on as a nil host pointer: taking &dst[0]
// of an empty slice would panic with an index-out-of-range error
// before the length check is even reached.
func (src Float64s) CopyDtoHAsync(dst []float64, stream cu.Stream) {
	var host unsafe.Pointer
	if len(dst) > 0 {
		host = unsafe.Pointer(&dst[0])
	}
	src.copyDtoHAsync(host, len(dst), cu.SIZEOF_FLOAT64, stream)
}
// CopyDtoDAsync copies src (on the device) to dst (on the device),
// asynchronously, on the given stream.
// Panics if the lengths of src and dst differ.
func (dst Float64s) CopyDtoDAsync(src Float64s, stream cu.Stream) {
	from := &src.slice
	dst.copyDtoDAsync(from, cu.SIZEOF_FLOAT64, stream)
}
// Host returns a freshly allocated host copy of the device data.
func (src Float64s) Host() []float64 {
	host := make([]float64, src.Len())
	src.CopyDtoH(host)
	return host
}
// Complex re-interprets the array as complex numbers in
// interleaved format. The underlying storage is shared; length
// and capacity are halved. Panics if the length is odd.
func (s Float64s) Complex() Complex128s {
	n := s.Len()
	if n%2 != 0 {
		panic(fmt.Errorf("complex: need even number of elements, have:%v", n))
	}
	return Complex128s{slice{ptr_: s.ptr_, len_: s.len_ / 2, cap_: s.cap_ / 2}}
}
================================================
FILE: safe/float64s_test.go
================================================
package safe
import (
"reflect"
"testing"
)
// TestFloat64sSlice checks that a freshly made Float64s is zeroed and
// that Len/Cap of the slice and of its sub-slices agree with those of
// an identically sliced host []float64.
func TestFloat64sSlice(test *testing.T) {
	InitCuda()

	a := MakeFloat64s(100)
	defer a.Free()

	if !reflect.DeepEqual(a.Host(), make([]float64, 100)) {
		test.Error(a.Host())
	}

	b := make([]float64, 100)
	// Length mismatches report len(...), not cap(...), so the message
	// shows the value that was actually compared.
	if a.Len() != len(b) {
		test.Error("len:", a.Len(), "!=", len(b))
	}
	if a.Cap() != cap(b) {
		test.Error("cap:", a.Cap(), "!=", cap(b))
	}

	c := a.Slice(20, 30)
	d := b[20:30]
	if c.Len() != len(d) {
		test.Error("sliced len:", c.Len(), "!=", len(d))
	}
	if c.Cap() != cap(d) {
		test.Error("sliced cap:", c.Cap(), "!=", cap(d))
	}

	e := a.Slice(0, 50)
	f := b[0:50]
	if e.Len() != len(f) {
		test.Error("sliced len:", e.Len(), "!=", len(f))
	}
	if e.Cap() != cap(f) {
		test.Error("sliced cap:", e.Cap(), "!=", cap(f))
	}
}
// TestFloat64sPanic1 checks that slicing with a negative start panics.
func TestFloat64sPanic1(test *testing.T) {
	InitCuda()

	// Fail unless the Slice call below panicked.
	defer func() {
		recovered := recover()
		test.Log("recovered:", recovered)
		if recovered == nil {
			test.Fail()
		}
	}()

	dev := MakeFloat64s(100)
	defer dev.Free()
	dev.Slice(-1, 10)
}
// TestFloat64sPanic2 checks that slicing past the capacity panics.
func TestFloat64sPanic2(test *testing.T) {
	InitCuda()

	// Fail unless the Slice call below panicked.
	defer func() {
		recovered := recover()
		test.Log("recovered:", recovered)
		if recovered == nil {
			test.Fail()
		}
	}()

	dev := MakeFloat64s(100)
	defer dev.Free()
	dev.Slice(0, 101)
}
// TestFloat64sCopy round-trips data host -> device -> device -> host
// and checks that the copied half matches the input while the
// untouched half of the destination stays zero.
func TestFloat64sCopy(test *testing.T) {
	InitCuda()

	input := make([]float64, 100)
	first := MakeFloat64s(100)
	defer first.Free()
	second := MakeFloat64s(100)
	defer second.Free()
	output := make([]float64, 200)

	for i := range input {
		input[i] = float64(i)
	}

	first.CopyHtoD(input)
	second.CopyDtoD(first)
	second.CopyDtoH(output[:100])

	copied, untouched := output[:100], output[100:]
	if !reflect.DeepEqual(input, copied) {
		test.Error(output)
	}
	if !reflect.DeepEqual(untouched, make([]float64, 100)) {
		test.Error(output)
	}
}
================================================
FILE: safe/init.go
================================================
package safe
import (
"github.com/barnex/cuda5/cu"
"runtime"
)
// InitCuda locks the calling goroutine to its OS thread, calls
// cu.Init(0), and creates a context on device 0 with the
// cu.CTX_SCHED_AUTO flag, which it then makes current.
func InitCuda() {
	runtime.LockOSThread()
	cu.Init(0)

	ctx := cu.CtxCreate(cu.CTX_SCHED_AUTO, 0)
	ctx.SetCurrent()
}
================================================
FILE: safe/slice.go
================================================
package safe
// INTERNAL.
// This file implements common functionality for all slice types
// (Float32s, Float64s, Complex64s, ...).
import (
"fmt"
"github.com/barnex/cuda5/cu"
"unsafe"
)
// makeslice is the internal base func for all MakeXXX() functions.
// It returns a slice with length and capacity len_. When the byte
// size (len_ * elemsize) is positive, device memory is allocated,
// zeroed, and the context synchronized; otherwise the pointer is
// left at 0.
func makeslice(len_ int, elemsize int) slice {
	s := slice{ptr_: 0, len_: len_, cap_: len_}

	bytes := int64(len_) * int64(elemsize)
	if bytes <= 0 {
		return s
	}

	s.ptr_ = cu.MemAlloc(bytes)
	cu.MemsetD8(s.ptr_, 0, bytes)
	cu.CtxSynchronize()
	return s
}
// slice is the internal base type for all GPU slice types
// (Float32s, Float64s, Complex64s, ...).
type slice struct {
ptr_ cu.DevicePtr // address offset of first element
len_ int // number of elements
cap_ int // capacity, in number of elements
}
// Pointer returns the device pointer to the first element.
func (s *slice) Pointer() cu.DevicePtr {
	return s.ptr_
}
// Len returns the slice length (number of elements).
func (s *slice) Len() int {
	return s.len_
}
// Cap returns the slice capacity (number of elements).
func (s *slice) Cap() int {
	return s.cap_
}
// Free releases the underlying storage.
// To be used with care. Free() should only be called on
// a slice created by MakeXXX(), not on a slice created
// by x.Slice(). Freeing a slice invalidates all other
// slices referring to it.
//
// After Free the slice is reset to pointer 0, length 0 and
// capacity 0, so it no longer holds a dangling device pointer and
// calling Free again on the same slice is a no-op. A slice whose
// pointer is already 0 (e.g. a zero-length slice) is not passed
// to the driver.
func (s *slice) Free() {
	if s.ptr_ != 0 {
		s.ptr_.Free()
	}
	s.ptr_ = 0
	s.len_ = 0
	s.cap_ = 0
}
// slice is the internal base func for all Slice() functions.
// It returns the sub-slice [start, stop) sharing storage with s;
// elemsize is the size of one element in bytes.
// Panics if start is outside [0, cap), stop is outside [0, cap],
// or start > stop.
func (s *slice) slice(start, stop int, elemsize uintptr) slice {
	switch {
	case start >= s.cap_, start < 0, stop > s.cap_, stop < 0:
		panic("cuda4/safe: slice index out of bounds")
	case start > stop:
		panic("cuda4/safe: inverted slice range")
	}

	offset := uintptr(start) * elemsize
	return slice{
		ptr_: cu.DevicePtr(uintptr(s.ptr_) + offset),
		len_: stop - start,
		cap_: s.cap_ - start,
	}
}
// copyHtoD copies srclen elements of elemsize bytes each from the
// host pointer src to dst on the device.
// Panics if srclen differs from dst.Len().
func (dst *slice) copyHtoD(src unsafe.Pointer, srclen int, elemsize int) {
	if dstlen := dst.Len(); srclen != dstlen {
		panic(fmt.Errorf("cuda4/safe: len mismatch: len(src)=%v (host), dst.Len()=%v (device)", srclen, dstlen))
	}
	bytes := int64(elemsize) * int64(srclen)
	cu.MemcpyHtoD(dst.Pointer(), src, bytes)
}
// copyDtoH copies dstlen elements of elemsize bytes each from src
// on the device to the host pointer dst.
// Panics if dstlen differs from src.Len().
func (src *slice) copyDtoH(dst unsafe.Pointer, dstlen int, elemsize int) {
	if srclen := src.Len(); dstlen != srclen {
		panic(fmt.Errorf("cuda4/safe: len mismatch: src.Len()=%v (device), len(dst)=%v (host)", srclen, dstlen))
	}
	bytes := int64(elemsize) * int64(dstlen)
	cu.MemcpyDtoH(dst, src.Pointer(), bytes)
}
// copyDtoD copies src to dst, both on the device; elemsize is the
// size of one element in bytes.
// Panics if the two lengths differ.
func (dst *slice) copyDtoD(src *slice, elemsize int) {
	srclen, dstlen := src.Len(), dst.Len()
	if dstlen != srclen {
		panic(fmt.Errorf("cuda4/safe: len mismatch: src.Len()=%v (device), dst.Len()=%v", srclen, dstlen))
	}
	bytes := int64(elemsize) * int64(dstlen)
	cu.MemcpyDtoD(dst.Pointer(), src.Pointer(), bytes)
}
// copyHtoDAsync copies srclen elements of elemsize bytes each from
// the host pointer src to dst on the device, asynchronously on stream.
// Panics if srclen differs from dst.Len().
func (dst *slice) copyHtoDAsync(src unsafe.Pointer, srclen int, elemsize int, stream cu.Stream) {
	if dstlen := dst.Len(); srclen != dstlen {
		panic(fmt.Errorf("cuda4/safe: len mismatch: len(src)=%v (host), dst.Len()=%v (device)", srclen, dstlen))
	}
	bytes := int64(elemsize) * int64(srclen)
	cu.MemcpyHtoDAsync(dst.Pointer(), src, bytes, stream)
}
// copyDtoHAsync copies dstlen elements of elemsize bytes each from
// src on the device to the host pointer dst, asynchronously on stream.
// Panics if dstlen differs from src.Len().
func (src *slice) copyDtoHAsync(dst unsafe.Pointer, dstlen int, elemsize int, stream cu.Stream) {
	if srclen := src.Len(); dstlen != srclen {
		panic(fmt.Errorf("cuda4/safe: len mismatch: src.Len()=%v (device), len(dst)=%v (host)", srclen, dstlen))
	}
	bytes := int64(elemsize) * int64(dstlen)
	cu.MemcpyDtoHAsync(dst, src.Pointer(), bytes, stream)
}
// copyDtoDAsync copies src to dst, both on the device,
// asynchronously on stream; elemsize is the size of one element
// in bytes. Panics if the two lengths differ.
func (dst *slice) copyDtoDAsync(src *slice, elemsize int, stream cu.Stream) {
	srclen, dstlen := src.Len(), dst.Len()
	if dstlen != srclen {
		panic(fmt.Errorf("cuda4/safe: len mismatch: src.Len()=%v (device), dst.Len()=%v", srclen, dstlen))
	}
	bytes := int64(elemsize) * int64(dstlen)
	cu.MemcpyDtoDAsync(dst.Pointer(), src.Pointer(), bytes, stream)
}
// UnsafeSet manually sets the pointer, length and capacity.
// It side-steps the security mechanisms (no checks are performed),
// so use it with caution.
func (s *slice) UnsafeSet(pointer unsafe.Pointer, length, capacity int) {
	s.ptr_, s.len_, s.cap_ = cu.DevicePtr(uintptr(pointer)), length, capacity
}
================================================
FILE: safe/subs.sh
================================================
#! /bin/bash
# Generates Go source files for one element type from those of another
# by applying sed substitutions to the type names.
# Only the complex64 -> complex128 generation is currently enabled;
# the other sed invocations are commented out.

# float32 -> float64
subs32='s/loat32/loat64/g;'
subs32+='s/FLOAT32/FLOAT64/g;'
#sed $subs32 float32s.go > float64s.go
#sed $subs32 float32s_test.go > float64s_test.go

# float32 -> complex64
subsc64='s/Float32/Complex64/g;'
subsc64+='s/float32/complex64/g;'
subsc64+='s/FLOAT32/COMPLEX64/g;'
#sed $subsc64 float32s_test.go > complex64s_test.go
#sed $subsc64 float32s.go > complex64s.go

# complex64 -> complex128
subsc128='s/omplex64/omplex128/g;'
subsc128+='s/COMPLEX64/COMPLEX128/g;'
# Quote the sed program so it always reaches sed as a single argument,
# free from word splitting and pathname expansion.
sed "$subsc128" complex64s.go > complex128s.go
sed "$subsc128" complex64s_test.go > complex128s_test.go
gitextract_cibr8rm8/
├── .gitignore
├── Makefile
├── README.md
├── cu/
│ ├── Makefile
│ ├── README
│ ├── cgoflags.go
│ ├── context.go
│ ├── context_test.go
│ ├── device.go
│ ├── device_test.go
│ ├── dim3.go
│ ├── doc.go
│ ├── execution.go
│ ├── function.go
│ ├── init.go
│ ├── init_test.go
│ ├── memory.go
│ ├── memory_test.go
│ ├── memset.go
│ ├── module.go
│ ├── module_test.go
│ ├── peer.go
│ ├── result.go
│ ├── runtimeapi.go
│ ├── stream.go
│ ├── testdata/
│ │ ├── testmodule.cu
│ │ └── testmodule.ptx
│ ├── version.go
│ └── version_test.go
├── cuda/
│ ├── Makefile
│ ├── README
│ ├── cgoflags.go
│ └── device.go
├── cufft/
│ ├── Makefile
│ ├── README
│ ├── cgoflags.go
│ ├── doc.go
│ ├── fft_test.go
│ ├── init_test.go
│ ├── mode.go
│ ├── plan.go
│ ├── result.go
│ └── type.go
├── curand/
│ ├── Makefile
│ ├── README
│ ├── cgoflags.go
│ ├── generator.go
│ └── status.go
├── doc.go
└── safe/
├── Makefile
├── README
├── complex128s.go
├── complex128s_test.go
├── complex64s.go
├── complex64s_test.go
├── doc.go
├── fft1d_test.go
├── fft1dc2r.go
├── fft1dr2c.go
├── fft3d_test.go
├── fft3dc2r.go
├── fft3dd2z.go
├── fft3dr2c.go
├── fft3dz2d.go
├── fftplan.go
├── float32s.go
├── float32s_test.go
├── float64s.go
├── float64s_test.go
├── init.go
├── slice.go
└── subs.sh
SYMBOL INDEX (456 symbols across 48 files)
FILE: cu/context.go
type Context (line 10) | type Context
method Destroy (line 32) | func (ctx *Context) Destroy() {
method ApiVersion (line 48) | func (ctx Context) ApiVersion() (version int) {
method SetCurrent (line 81) | func (ctx Context) SetCurrent() {
function CtxCreate (line 13) | func CtxCreate(flags uint, dev Device) Context {
function CtxDestroy (line 23) | func CtxDestroy(ctx *Context) {
function CtxGetApiVersion (line 37) | func CtxGetApiVersion(ctx Context) (version int) {
function CtxGetCurrent (line 53) | func CtxGetCurrent() Context {
function CtxGetDevice (line 63) | func CtxGetDevice() Device {
function CtxSetCurrent (line 73) | func CtxSetCurrent(ctx Context) {
function CtxSynchronize (line 86) | func CtxSynchronize() {
constant CTX_SCHED_AUTO (line 96) | CTX_SCHED_AUTO = C.CU_CTX_SCHED_AUTO
constant CTX_SCHED_SPIN (line 98) | CTX_SCHED_SPIN = C.CU_CTX_SCHED_SPIN
constant CTX_SCHED_YIELD (line 100) | CTX_SCHED_YIELD = C.CU_CTX_SCHED_YIELD
constant CTX_BLOCKING_SYNC (line 102) | CTX_BLOCKING_SYNC
constant CTX_MAP_HOST (line 104) | CTX_MAP_HOST = C.CU_CTX_MAP_HOST
constant CTX_LMEM_RESIZE_TO_MAX (line 106) | CTX_LMEM_RESIZE_TO_MAX = C.CU_CTX_LMEM_RESIZE_TO_MAX
FILE: cu/context_test.go
function TestContext (line 8) | func TestContext(t *testing.T) {
function BenchmarkGetContext (line 18) | func BenchmarkGetContext(b *testing.B) {
function BenchmarkSetContext (line 28) | func BenchmarkSetContext(b *testing.B) {
FILE: cu/device.go
type Device (line 11) | type Device
method ComputeCapability (line 26) | func (device Device) ComputeCapability() (major, minor int) {
method Attribute (line 51) | func (dev Device) Attribute(attrib DeviceAttribute) int {
method Name (line 78) | func (dev Device) Name() string {
method Properties (line 121) | func (dev Device) Properties() DevProp {
method TotalMem (line 126) | func (device Device) TotalMem() int64 {
function DeviceComputeCapability (line 14) | func DeviceComputeCapability(device Device) (major, minor int) {
function DeviceGet (line 31) | func DeviceGet(ordinal int) Device {
function DeviceGetAttribute (line 41) | func DeviceGetAttribute(attrib DeviceAttribute, dev Device) int {
function DeviceGetCount (line 56) | func DeviceGetCount() int {
function DeviceGetName (line 66) | func DeviceGetName(dev Device) string {
type DevProp (line 83) | type DevProp struct
function DeviceGetProperties (line 97) | func DeviceGetProperties(dev Device) (prop DevProp) {
function DeviceTotalMem (line 131) | func DeviceTotalMem(device Device) int64 {
type DeviceAttribute (line 140) | type DeviceAttribute
constant MAX_THREADS_PER_BLOCK (line 143) | MAX_THREADS_PER_BLOCK DeviceAttribute = C.CU_DEVICE_ATTRIBUTE...
constant MAX_BLOCK_DIM_X (line 144) | MAX_BLOCK_DIM_X DeviceAttribute = C.CU_DEVICE_ATTRIBUTE...
constant MAX_BLOCK_DIM_Y (line 145) | MAX_BLOCK_DIM_Y DeviceAttribute = C.CU_DEVICE_ATTRIBUTE...
constant MAX_BLOCK_DIM_Z (line 146) | MAX_BLOCK_DIM_Z DeviceAttribute = C.CU_DEVICE_ATTRIBUTE...
constant MAX_GRID_DIM_X (line 147) | MAX_GRID_DIM_X DeviceAttribute = C.CU_DEVICE_ATTRIBUTE...
constant MAX_GRID_DIM_Y (line 148) | MAX_GRID_DIM_Y DeviceAttribute = C.CU_DEVICE_ATTRIBUTE...
constant MAX_GRID_DIM_Z (line 149) | MAX_GRID_DIM_Z DeviceAttribute = C.CU_DEVICE_ATTRIBUTE...
constant MAX_SHARED_MEMORY_PER_BLOCK (line 150) | MAX_SHARED_MEMORY_PER_BLOCK DeviceAttribute = C.CU_DEVICE_ATTRIBUTE...
constant TOTAL_CONSTANT_MEMORY (line 151) | TOTAL_CONSTANT_MEMORY DeviceAttribute = C.CU_DEVICE_ATTRIBUTE...
constant WARP_SIZE (line 152) | WARP_SIZE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE...
constant MAX_PITCH (line 153) | MAX_PITCH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE...
constant MAX_REGISTERS_PER_BLOCK (line 154) | MAX_REGISTERS_PER_BLOCK DeviceAttribute = C.CU_DEVICE_ATTRIBUTE...
constant CLOCK_RATE (line 155) | CLOCK_RATE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE...
constant TEXTURE_ALIGNMENT (line 156) | TEXTURE_ALIGNMENT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE...
constant MULTIPROCESSOR_COUNT (line 157) | MULTIPROCESSOR_COUNT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE...
constant KERNEL_EXEC_TIMEOUT (line 158) | KERNEL_EXEC_TIMEOUT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE...
constant INTEGRATED (line 159) | INTEGRATED DeviceAttribute = C.CU_DEVICE_ATTRIBUTE...
constant CAN_MAP_HOST_MEMORY (line 160) | CAN_MAP_HOST_MEMORY DeviceAttribute = C.CU_DEVICE_ATTRIBUTE...
constant COMPUTE_MODE (line 161) | COMPUTE_MODE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE...
constant MAXIMUM_TEXTURE1D_WIDTH (line 162) | MAXIMUM_TEXTURE1D_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE...
constant MAXIMUM_TEXTURE2D_WIDTH (line 163) | MAXIMUM_TEXTURE2D_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE...
constant MAXIMUM_TEXTURE2D_HEIGHT (line 164) | MAXIMUM_TEXTURE2D_HEIGHT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE...
constant MAXIMUM_TEXTURE3D_WIDTH (line 165) | MAXIMUM_TEXTURE3D_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE...
constant MAXIMUM_TEXTURE3D_HEIGHT (line 166) | MAXIMUM_TEXTURE3D_HEIGHT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE...
constant MAXIMUM_TEXTURE3D_DEPTH (line 167) | MAXIMUM_TEXTURE3D_DEPTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE...
constant MAXIMUM_TEXTURE2D_LAYERED_WIDTH (line 168) | MAXIMUM_TEXTURE2D_LAYERED_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE...
constant MAXIMUM_TEXTURE2D_LAYERED_HEIGHT (line 169) | MAXIMUM_TEXTURE2D_LAYERED_HEIGHT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE...
constant MAXIMUM_TEXTURE2D_LAYERED_LAYERS (line 170) | MAXIMUM_TEXTURE2D_LAYERED_LAYERS DeviceAttribute = C.CU_DEVICE_ATTRIBUTE...
constant SURFACE_ALIGNMENT (line 171) | SURFACE_ALIGNMENT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE...
constant CONCURRENT_KERNELS (line 172) | CONCURRENT_KERNELS DeviceAttribute = C.CU_DEVICE_ATTRIBUTE...
constant ECC_ENABLED (line 173) | ECC_ENABLED DeviceAttribute = C.CU_DEVICE_ATTRIBUTE...
constant PCI_BUS_ID (line 174) | PCI_BUS_ID DeviceAttribute = C.CU_DEVICE_ATTRIBUTE...
constant PCI_DEVICE_ID (line 175) | PCI_DEVICE_ID DeviceAttribute = C.CU_DEVICE_ATTRIBUTE...
constant TCC_DRIVER (line 176) | TCC_DRIVER DeviceAttribute = C.CU_DEVICE_ATTRIBUTE...
constant MEMORY_CLOCK_RATE (line 177) | MEMORY_CLOCK_RATE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE...
constant GLOBAL_MEMORY_BUS_WIDTH (line 178) | GLOBAL_MEMORY_BUS_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE...
constant L2_CACHE_SIZE (line 179) | L2_CACHE_SIZE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE...
constant MAX_THREADS_PER_MULTIPROCESSOR (line 180) | MAX_THREADS_PER_MULTIPROCESSOR DeviceAttribute = C.CU_DEVICE_ATTRIBUTE...
constant ASYNC_ENGINE_COUNT (line 181) | ASYNC_ENGINE_COUNT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE...
constant UNIFIED_ADDRESSING (line 182) | UNIFIED_ADDRESSING DeviceAttribute = C.CU_DEVICE_ATTRIBUTE...
constant MAXIMUM_TEXTURE1D_LAYERED_WIDTH (line 183) | MAXIMUM_TEXTURE1D_LAYERED_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE...
constant MAXIMUM_TEXTURE1D_LAYERED_LAYERS (line 184) | MAXIMUM_TEXTURE1D_LAYERED_LAYERS DeviceAttribute = C.CU_DEVICE_ATTRIBUTE...
FILE: cu/device_test.go
function TestDevice (line 8) | func TestDevice(t *testing.T) {
FILE: cu/dim3.go
type Dim3 (line 3) | type Dim3 struct
FILE: cu/execution.go
constant pointerSize (line 12) | pointerSize = 8
function LaunchKernel (line 14) | func LaunchKernel(f Function, gridDimX, gridDimY, gridDimZ int, blockDim...
function offset (line 44) | func offset(ptr unsafe.Pointer, i int) unsafe.Pointer {
FILE: cu/function.go
type Function (line 13) | type Function
method GetAttribute (line 24) | func (f Function) GetAttribute(attrib FunctionAttribute) int {
function FuncGetAttribute (line 15) | func FuncGetAttribute(attrib FunctionAttribute, function Function) int {
type FunctionAttribute (line 28) | type FunctionAttribute
constant FUNC_A_MAX_THREADS_PER_BLOCK (line 31) | FUNC_A_MAX_THREADS_PER_BLOCK FunctionAttribute = C.CU_FUNC_ATTRIBUTE_MAX...
constant FUNC_A_SHARED_SIZE_BYTES (line 32) | FUNC_A_SHARED_SIZE_BYTES FunctionAttribute = C.CU_FUNC_ATTRIBUTE_SHA...
constant FUNC_A_CONST_SIZE_BYTES (line 33) | FUNC_A_CONST_SIZE_BYTES FunctionAttribute = C.CU_FUNC_ATTRIBUTE_CON...
constant FUNC_A_LOCAL_SIZE_BYTES (line 34) | FUNC_A_LOCAL_SIZE_BYTES FunctionAttribute = C.CU_FUNC_ATTRIBUTE_LOC...
constant FUNC_A_NUM_REGS (line 35) | FUNC_A_NUM_REGS FunctionAttribute = C.CU_FUNC_ATTRIBUTE_NUM...
constant FUNC_A_PTX_VERSION (line 36) | FUNC_A_PTX_VERSION FunctionAttribute = C.CU_FUNC_ATTRIBUTE_PTX...
constant FUNC_A_BINARY_VERSION (line 37) | FUNC_A_BINARY_VERSION FunctionAttribute = C.CU_FUNC_ATTRIBUTE_BIN...
FILE: cu/init.go
function Init (line 11) | func Init(flags int) {
FILE: cu/init_test.go
function init (line 8) | func init() {
FILE: cu/memory.go
type DevicePtr (line 13) | type DevicePtr
method Free (line 40) | func (ptr DevicePtr) Free() {
method GetAddressRange (line 142) | func (ptr DevicePtr) GetAddressRange() (bytes int64, base DevicePtr) {
method Bytes (line 147) | func (ptr DevicePtr) Bytes() (bytes int64) {
method String (line 210) | func (p DevicePtr) String() string {
method MemoryType (line 254) | func (ptr DevicePtr) MemoryType() MemoryType {
function MemAlloc (line 16) | func MemAlloc(bytes int64) DevicePtr {
function MemFree (line 27) | func MemFree(p DevicePtr) {
function Memcpy (line 47) | func Memcpy(dst, src DevicePtr, bytes int64) {
function MemcpyAsync (line 55) | func MemcpyAsync(dst, src DevicePtr, bytes int64, stream Stream) {
function MemcpyDtoD (line 63) | func MemcpyDtoD(dst, src DevicePtr, bytes int64) {
function MemcpyDtoDAsync (line 71) | func MemcpyDtoDAsync(dst, src DevicePtr, bytes int64, stream Stream) {
function MemcpyHtoD (line 79) | func MemcpyHtoD(dst DevicePtr, src unsafe.Pointer, bytes int64) {
function MemcpyHtoDAsync (line 88) | func MemcpyHtoDAsync(dst DevicePtr, src unsafe.Pointer, bytes int64, str...
function MemcpyDtoH (line 96) | func MemcpyDtoH(dst unsafe.Pointer, src DevicePtr, bytes int64) {
function MemcpyDtoHAsync (line 105) | func MemcpyDtoHAsync(dst unsafe.Pointer, src DevicePtr, bytes int64, str...
function MemcpyPeer (line 113) | func MemcpyPeer(dst DevicePtr, dstCtx Context, src DevicePtr, srcCtx Con...
function MemcpyPeerAsync (line 121) | func MemcpyPeerAsync(dst DevicePtr, dstCtx Context, src DevicePtr, srcCt...
function MemGetAddressRange (line 129) | func MemGetAddressRange(ptr DevicePtr) (bytes int64, base DevicePtr) {
function MemGetInfo (line 153) | func MemGetInfo() (free, total int64) {
function MemAllocHost (line 184) | func MemAllocHost(bytes int64) unsafe.Pointer {
function MemFreeHost (line 193) | func MemFreeHost(ptr unsafe.Pointer) {
type MemHostRegisterFlag (line 200) | type MemHostRegisterFlag
constant MEMHOSTREGISTER_PORTABLE (line 205) | MEMHOSTREGISTER_PORTABLE MemHostRegisterFlag = C.CU_MEMHOSTREGISTER_PORT...
constant MEMHOSTREGISTER_DEVICEMAP (line 207) | MEMHOSTREGISTER_DEVICEMAP MemHostRegisterFlag = C.CU_MEMHOSTREGISTER_DEV...
constant SIZEOF_FLOAT32 (line 216) | SIZEOF_FLOAT32 = 4
constant SIZEOF_FLOAT64 (line 217) | SIZEOF_FLOAT64 = 8
constant SIZEOF_COMPLEX64 (line 218) | SIZEOF_COMPLEX64 = 8
constant SIZEOF_COMPLEX128 (line 219) | SIZEOF_COMPLEX128 = 16
type MemoryType (line 223) | type MemoryType
method String (line 238) | func (t MemoryType) String() string {
constant MemoryTypeHost (line 226) | MemoryTypeHost MemoryType = C.CU_MEMORYTYPE_HOST
constant MemoryTypeDevice (line 227) | MemoryTypeDevice MemoryType = C.CU_MEMORYTYPE_DEVICE
constant MemoryTypeArray (line 228) | MemoryTypeArray MemoryType = C.CU_MEMORYTYPE_ARRAY
constant MemoryTypeUnified (line 229) | MemoryTypeUnified MemoryType = C.CU_MEMORYTYPE_UNIFIED
function PointerGetAttributeMemoryType (line 246) | func PointerGetAttributeMemoryType(ptr DevicePtr) (t MemoryType, err Res...
FILE: cu/memory_test.go
function TestMalloc (line 10) | func TestMalloc(t *testing.T) {
function BenchmarkMallocFree1B (line 21) | func BenchmarkMallocFree1B(b *testing.B) {
function BenchmarkMallocFree1kB (line 28) | func BenchmarkMallocFree1kB(b *testing.B) {
function BenchmarkMallocFree1MB (line 35) | func BenchmarkMallocFree1MB(b *testing.B) {
function TestMemAddressRange (line 42) | func TestMemAddressRange(t *testing.T) {
function TestMemGetInfo (line 59) | func TestMemGetInfo(t *testing.T) {
function TestMemsetAsync (line 70) | func TestMemsetAsync(t *testing.T) {
function TestMemset (line 98) | func TestMemset(t *testing.T) {
function TestMemcpy (line 123) | func TestMemcpy(t *testing.T) {
function TestMemcpyAsync (line 144) | func TestMemcpyAsync(t *testing.T) {
function TestMemcpyAsyncRegistered (line 167) | func TestMemcpyAsyncRegistered(t *testing.T) {
function BenchmarkMemcpy (line 190) | func BenchmarkMemcpy(b *testing.B) {
FILE: cu/memset.go
function MemsetD32 (line 14) | func MemsetD32(deviceptr DevicePtr, value uint32, N int64) {
function MemsetD32Async (line 22) | func MemsetD32Async(deviceptr DevicePtr, value uint32, N int64, stream S...
function MemsetD8 (line 31) | func MemsetD8(deviceptr DevicePtr, value uint8, N int64) {
function MemsetD8Async (line 39) | func MemsetD8Async(deviceptr DevicePtr, value uint8, N int64, stream Str...
FILE: cu/module.go
type Module (line 13) | type Module
method GetFunction (line 50) | func (m Module) GetFunction(name string) Function {
function ModuleLoad (line 16) | func ModuleLoad(fname string) Module {
function ModuleLoadData (line 27) | func ModuleLoadData(image string) Module {
function ModuleGetFunction (line 37) | func ModuleGetFunction(module Module, name string) Function {
FILE: cu/module_test.go
function TestModule (line 9) | func TestModule(test *testing.T) {
function DivUp (line 48) | func DivUp(x, y int) int {
FILE: cu/peer.go
function CtxEnablePeerAccess (line 13) | func CtxEnablePeerAccess(peer Context) {
method EnablePeerAccess (line 21) | func (peer Context) EnablePeerAccess() {
function CtxDisablePeerAccess (line 26) | func CtxDisablePeerAccess(peer Context) {
method DisablePeerAccess (line 34) | func (peer Context) DisablePeerAccess() {
function DeviceCanAccessPeer (line 39) | func DeviceCanAccessPeer(dev, peer Device) bool {
method CanAccessPeer (line 49) | func (dev Device) CanAccessPeer(peer Device) bool {
FILE: cu/result.go
type Result (line 15) | type Result
method String (line 18) | func (err Result) String() string {
constant SUCCESS (line 27) | SUCCESS Result = C.CUDA_SUCCESS
constant ERROR_INVALID_VALUE (line 28) | ERROR_INVALID_VALUE Result = C.CUDA_ERROR_INVALID_VALUE
constant ERROR_OUT_OF_MEMORY (line 29) | ERROR_OUT_OF_MEMORY Result = C.CUDA_ERROR_OUT_OF_MEMORY
constant ERROR_NOT_INITIALIZED (line 30) | ERROR_NOT_INITIALIZED Result = C.CUDA_ERROR_NOT_INITIALIZED
constant ERROR_DEINITIALIZED (line 31) | ERROR_DEINITIALIZED Result = C.CUDA_ERROR_DEINITIALIZED
constant ERROR_PROFILER_DISABLED (line 32) | ERROR_PROFILER_DISABLED Result = C.CUDA_ERROR_PROFILER_DISA...
constant ERROR_PROFILER_NOT_INITIALIZED (line 33) | ERROR_PROFILER_NOT_INITIALIZED Result = C.CUDA_ERROR_PROFILER_NOT_...
constant ERROR_PROFILER_ALREADY_STARTED (line 34) | ERROR_PROFILER_ALREADY_STARTED Result = C.CUDA_ERROR_PROFILER_ALRE...
constant ERROR_PROFILER_ALREADY_STOPPED (line 35) | ERROR_PROFILER_ALREADY_STOPPED Result = C.CUDA_ERROR_PROFILER_ALRE...
constant ERROR_NO_DEVICE (line 36) | ERROR_NO_DEVICE Result = C.CUDA_ERROR_NO_DEVICE
constant ERROR_INVALID_DEVICE (line 37) | ERROR_INVALID_DEVICE Result = C.CUDA_ERROR_INVALID_DEVICE
constant ERROR_INVALID_IMAGE (line 38) | ERROR_INVALID_IMAGE Result = C.CUDA_ERROR_INVALID_IMAGE
constant ERROR_INVALID_CONTEXT (line 39) | ERROR_INVALID_CONTEXT Result = C.CUDA_ERROR_INVALID_CONTEXT
constant ERROR_CONTEXT_ALREADY_CURRENT (line 40) | ERROR_CONTEXT_ALREADY_CURRENT Result = C.CUDA_ERROR_CONTEXT_ALREA...
constant ERROR_MAP_FAILED (line 41) | ERROR_MAP_FAILED Result = C.CUDA_ERROR_MAP_FAILED
constant ERROR_UNMAP_FAILED (line 42) | ERROR_UNMAP_FAILED Result = C.CUDA_ERROR_UNMAP_FAILED
constant ERROR_ARRAY_IS_MAPPED (line 43) | ERROR_ARRAY_IS_MAPPED Result = C.CUDA_ERROR_ARRAY_IS_MAPPED
constant ERROR_ALREADY_MAPPED (line 44) | ERROR_ALREADY_MAPPED Result = C.CUDA_ERROR_ALREADY_MAPPED
constant ERROR_NO_BINARY_FOR_GPU (line 45) | ERROR_NO_BINARY_FOR_GPU Result = C.CUDA_ERROR_NO_BINARY_FOR...
constant ERROR_ALREADY_ACQUIRED (line 46) | ERROR_ALREADY_ACQUIRED Result = C.CUDA_ERROR_ALREADY_ACQUIRED
constant ERROR_NOT_MAPPED (line 47) | ERROR_NOT_MAPPED Result = C.CUDA_ERROR_NOT_MAPPED
constant ERROR_NOT_MAPPED_AS_ARRAY (line 48) | ERROR_NOT_MAPPED_AS_ARRAY Result = C.CUDA_ERROR_NOT_MAPPED_AS...
constant ERROR_NOT_MAPPED_AS_POINTER (line 49) | ERROR_NOT_MAPPED_AS_POINTER Result = C.CUDA_ERROR_NOT_MAPPED_AS...
constant ERROR_ECC_UNCORRECTABLE (line 50) | ERROR_ECC_UNCORRECTABLE Result = C.CUDA_ERROR_ECC_UNCORRECT...
constant ERROR_UNSUPPORTED_LIMIT (line 51) | ERROR_UNSUPPORTED_LIMIT Result = C.CUDA_ERROR_UNSUPPORTED_L...
constant ERROR_CONTEXT_ALREADY_IN_USE (line 52) | ERROR_CONTEXT_ALREADY_IN_USE Result = C.CUDA_ERROR_CONTEXT_ALREA...
constant ERROR_INVALID_SOURCE (line 53) | ERROR_INVALID_SOURCE Result = C.CUDA_ERROR_INVALID_SOURCE
constant ERROR_FILE_NOT_FOUND (line 54) | ERROR_FILE_NOT_FOUND Result = C.CUDA_ERROR_FILE_NOT_FOUND
constant ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND (line 55) | ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND Result = C.CUDA_ERROR_SHARED_OBJECT...
constant ERROR_SHARED_OBJECT_INIT_FAILED (line 56) | ERROR_SHARED_OBJECT_INIT_FAILED Result = C.CUDA_ERROR_SHARED_OBJECT...
constant ERROR_OPERATING_SYSTEM (line 57) | ERROR_OPERATING_SYSTEM Result = C.CUDA_ERROR_OPERATING_SYSTEM
constant ERROR_INVALID_HANDLE (line 58) | ERROR_INVALID_HANDLE Result = C.CUDA_ERROR_INVALID_HANDLE
constant ERROR_NOT_FOUND (line 59) | ERROR_NOT_FOUND Result = C.CUDA_ERROR_NOT_FOUND
constant ERROR_NOT_READY (line 60) | ERROR_NOT_READY Result = C.CUDA_ERROR_NOT_READY
constant ERROR_LAUNCH_FAILED (line 61) | ERROR_LAUNCH_FAILED Result = C.CUDA_ERROR_LAUNCH_FAILED
constant ERROR_LAUNCH_OUT_OF_RESOURCES (line 62) | ERROR_LAUNCH_OUT_OF_RESOURCES Result = C.CUDA_ERROR_LAUNCH_OUT_OF...
constant ERROR_LAUNCH_TIMEOUT (line 63) | ERROR_LAUNCH_TIMEOUT Result = C.CUDA_ERROR_LAUNCH_TIMEOUT
constant ERROR_LAUNCH_INCOMPATIBLE_TEXTURING (line 64) | ERROR_LAUNCH_INCOMPATIBLE_TEXTURING Result = C.CUDA_ERROR_LAUNCH_INCOMP...
constant ERROR_PEER_ACCESS_ALREADY_ENABLED (line 65) | ERROR_PEER_ACCESS_ALREADY_ENABLED Result = C.CUDA_ERROR_PEER_ACCESS_A...
constant ERROR_PEER_ACCESS_NOT_ENABLED (line 66) | ERROR_PEER_ACCESS_NOT_ENABLED Result = C.CUDA_ERROR_PEER_ACCESS_N...
constant ERROR_PRIMARY_CONTEXT_ACTIVE (line 67) | ERROR_PRIMARY_CONTEXT_ACTIVE Result = C.CUDA_ERROR_PRIMARY_CONTE...
constant ERROR_CONTEXT_IS_DESTROYED (line 68) | ERROR_CONTEXT_IS_DESTROYED Result = C.CUDA_ERROR_CONTEXT_IS_DE...
constant ERROR_ASSERT (line 69) | ERROR_ASSERT Result = C.CUDA_ERROR_ASSERT
constant ERROR_TOO_MANY_PEERS (line 70) | ERROR_TOO_MANY_PEERS Result = C.CUDA_ERROR_TOO_MANY_PEERS
constant ERROR_HOST_MEMORY_ALREADY_REGISTERED (line 71) | ERROR_HOST_MEMORY_ALREADY_REGISTERED Result = C.CUDA_ERROR_HOST_MEMORY_A...
constant ERROR_HOST_MEMORY_NOT_REGISTERED (line 72) | ERROR_HOST_MEMORY_NOT_REGISTERED Result = C.CUDA_ERROR_HOST_MEMORY_N...
constant ERROR_HARDWARE_STACK_ERROR (line 73) | ERROR_HARDWARE_STACK_ERROR Result = 714
constant ERROR_ILLEGAL_INSTRUCTION (line 74) | ERROR_ILLEGAL_INSTRUCTION Result = 715
constant ERROR_MISALIGNED_ADDRESS (line 75) | ERROR_MISALIGNED_ADDRESS Result = 716
constant ERROR_INVALID_ADDRESS_SPACE (line 76) | ERROR_INVALID_ADDRESS_SPACE Result = 717
constant ERROR_INVALID_PC (line 77) | ERROR_INVALID_PC Result = 718
constant ERROR_NOT_PERMITTED (line 78) | ERROR_NOT_PERMITTED Result = 800
constant ERROR_NOT_SUPPORTED (line 79) | ERROR_NOT_SUPPORTED Result = 801
constant ERROR_UNKNOWN (line 80) | ERROR_UNKNOWN Result = C.CUDA_ERROR_UNKNOWN
FILE: cu/runtimeapi.go
function SetDevice (line 12) | func SetDevice(device Device) {
function DeviceReset (line 20) | func DeviceReset() {
function SetDeviceFlags (line 28) | func SetDeviceFlags(flags uint) {
constant DeviceAuto (line 38) | DeviceAuto = C.cudaDeviceScheduleAuto
constant DeviceSpin (line 40) | DeviceSpin = C.cudaDeviceScheduleSpin
constant DeviceYield (line 42) | DeviceYield = C.cudaDeviceScheduleYield
constant DeviceScheduleBlockingSync (line 44) | DeviceScheduleBlockingSync = C.cudaDeviceScheduleBlockingSync
constant DeviceBlockingSync (line 46) | DeviceBlockingSync = C.cudaDeviceBlockingSync
constant DeviceMapHost (line 48) | DeviceMapHost = C.cudaDeviceMapHost
constant DeviceLmemResizeToMax (line 50) | DeviceLmemResizeToMax = C.cudaDeviceLmemResizeToMax
function Malloc (line 53) | func Malloc(bytes int64) DevicePtr {
function MallocHost (line 62) | func MallocHost(bytes int64) unsafe.Pointer {
function FreeHost (line 71) | func FreeHost(ptr unsafe.Pointer) {
function MemCpy (line 79) | func MemCpy(dst, src unsafe.Pointer, bytes int64, flags uint) {
constant HtoH (line 89) | HtoH = C.cudaMemcpyHostToHost
constant HtoD (line 91) | HtoD = C.cudaMemcpyHostToDevice
constant DtoH (line 93) | DtoH = C.cudaMemcpyDeviceToHost
constant DtoD (line 95) | DtoD = C.cudaMemcpyDeviceToDevice
constant Virt (line 97) | Virt = C.cudaMemcpyDefault
FILE: cu/stream.go
type Stream (line 10) | type Stream
method Destroy (line 23) | func (stream *Stream) Destroy() {
method Synchronize (line 38) | func (stream Stream) Synchronize() {
method Query (line 46) | func (stream Stream) Query() Result {
function StreamCreate (line 13) | func StreamCreate() Stream {
function StreamDestroy (line 33) | func StreamDestroy(stream *Stream) {
function StreamQuery (line 51) | func StreamQuery(stream Stream) Result {
function StreamSynchronize (line 56) | func StreamSynchronize(stream Stream) {
FILE: cu/version.go
function Version (line 9) | func Version() int {
FILE: cu/version_test.go
function TestVersion (line 8) | func TestVersion(t *testing.T) {
FILE: cuda/device.go
function DeviceReset (line 12) | func DeviceReset() {
function DeviceSetCacheConfig (line 20) | func DeviceSetCacheConfig(cacheConfig FuncCache) {
type FuncCache (line 28) | type FuncCache
constant FUNC_CACHE_PREFER_NONE (line 31) | FUNC_CACHE_PREFER_NONE FuncCache = C.CU_FUNC_CACHE_PREFER_NONE
constant FUNC_CACHE_PREFER_SHARED (line 32) | FUNC_CACHE_PREFER_SHARED FuncCache = C.CU_FUNC_CACHE_PREFER_SHARED
constant FUNC_CACHE_PREFER_L1 (line 33) | FUNC_CACHE_PREFER_L1 FuncCache = C.CU_FUNC_CACHE_PREFER_L1
constant FUNC_CACHE_PREFER_EQUAL (line 34) | FUNC_CACHE_PREFER_EQUAL FuncCache = C.CU_FUNC_CACHE_PREFER_EQUAL
FILE: cufft/fft_test.go
function ExampleFFT1D (line 9) | func ExampleFFT1D() {
FILE: cufft/init_test.go
function init (line 9) | func init() {
FILE: cufft/mode.go
type CompatibilityMode (line 11) | type CompatibilityMode
method String (line 20) | func (t CompatibilityMode) String() string {
constant COMPATIBILITY_NATIVE (line 14) | COMPATIBILITY_NATIVE CompatibilityMode = C.CUFFT_COMPATIBILITY_...
constant COMPATIBILITY_FFTW_PADDING (line 15) | COMPATIBILITY_FFTW_PADDING CompatibilityMode = C.CUFFT_COMPATIBILITY_...
constant COMPATIBILITY_FFTW_ASYMMETRIC (line 16) | COMPATIBILITY_FFTW_ASYMMETRIC CompatibilityMode = C.CUFFT_COMPATIBILITY_...
constant COMPATIBILITY_FFTW_ALL (line 17) | COMPATIBILITY_FFTW_ALL CompatibilityMode = C.CUFFT_COMPATIBILITY_...
FILE: cufft/plan.go
type Handle (line 16) | type Handle
method ExecC2C (line 105) | func (plan Handle) ExecC2C(idata, odata cu.DevicePtr, direction int) {
method ExecR2C (line 117) | func (plan Handle) ExecR2C(idata, odata cu.DevicePtr) {
method ExecC2R (line 128) | func (plan Handle) ExecC2R(idata, odata cu.DevicePtr) {
method ExecZ2Z (line 139) | func (plan Handle) ExecZ2Z(idata, odata cu.DevicePtr, direction int) {
method ExecD2Z (line 151) | func (plan Handle) ExecD2Z(idata, odata cu.DevicePtr) {
method ExecZ2D (line 162) | func (plan Handle) ExecZ2D(idata, odata cu.DevicePtr) {
method Destroy (line 173) | func (plan *Handle) Destroy() {
method SetStream (line 182) | func (plan Handle) SetStream(stream cu.Stream) {
method SetCompatibilityMode (line 192) | func (plan Handle) SetCompatibilityMode(mode CompatibilityMode) {
function Plan1d (line 19) | func Plan1d(nx int, typ Type, batch int) Handle {
function Plan2d (line 33) | func Plan2d(nx, ny int, typ Type) Handle {
function Plan3d (line 47) | func Plan3d(nx, ny, nz int, typ Type) Handle {
function PlanMany (line 67) | func PlanMany(n []int, inembed []int, istride int, oembed []int, ostride...
FILE: cufft/result.go
type Result (line 11) | type Result
method String (line 31) | func (r Result) String() string {
constant SUCCESS (line 15) | SUCCESS Result = C.CUFFT_SUCCESS
constant INVALID_PLAN (line 16) | INVALID_PLAN Result = C.CUFFT_INVALID_PLAN
constant ALLOC_FAILED (line 17) | ALLOC_FAILED Result = C.CUFFT_ALLOC_FAILED
constant INVALID_TYPE (line 18) | INVALID_TYPE Result = C.CUFFT_INVALID_TYPE
constant INVALID_VALUE (line 19) | INVALID_VALUE Result = C.CUFFT_INVALID_VALUE
constant INTERNAL_ERROR (line 20) | INTERNAL_ERROR Result = C.CUFFT_INTERNAL_ERROR
constant EXEC_FAILED (line 21) | EXEC_FAILED Result = C.CUFFT_EXEC_FAILED
constant SETUP_FAILED (line 22) | SETUP_FAILED Result = C.CUFFT_SETUP_FAILED
constant INVALID_SIZE (line 23) | INVALID_SIZE Result = C.CUFFT_INVALID_SIZE
constant UNALIGNED_DATA (line 24) | UNALIGNED_DATA Result = C.CUFFT_UNALIGNED_DATA
constant INCOMPLETE_PARAMETER_LIST (line 25) | INCOMPLETE_PARAMETER_LIST Result = 0xA
constant INVALID_DEVICE (line 26) | INVALID_DEVICE Result = 0xB
constant PARSE_ERROR (line 27) | PARSE_ERROR Result = 0xC
constant NO_WORKSPACE (line 28) | NO_WORKSPACE Result = 0xD
FILE: cufft/type.go
type Type (line 11) | type Type
method String (line 27) | func (t Type) String() string {
constant R2C (line 14) | R2C Type = C.CUFFT_R2C
constant C2R (line 15) | C2R Type = C.CUFFT_C2R
constant C2C (line 16) | C2C Type = C.CUFFT_C2C
constant D2Z (line 17) | D2Z Type = C.CUFFT_D2Z
constant Z2D (line 18) | Z2D Type = C.CUFFT_Z2D
constant Z2Z (line 19) | Z2Z Type = C.CUFFT_Z2Z
constant FORWARD (line 23) | FORWARD = -1
constant INVERSE (line 24) | INVERSE = 1
FILE: curand/generator.go
type Generator (line 10) | type Generator
method GenerateNormal (line 33) | func (g Generator) GenerateNormal(output uintptr, n int64, mean, stdde...
method SetSeed (line 45) | func (g Generator) SetSeed(seed int64) {
type RngType (line 12) | type RngType
constant PSEUDO_DEFAULT (line 15) | PSEUDO_DEFAULT RngType = C.CURAND_RNG_PSEUDO_DEFAULT
constant PSEUDO_XORWOW (line 16) | PSEUDO_XORWOW RngType = C.CURAND_RNG_PSEUDO_XORWOW
constant QUASI_DEFAULT (line 17) | QUASI_DEFAULT RngType = C.CURAND_RNG_QUASI_DEFAULT
constant QUASI_SOBOL32 (line 18) | QUASI_SOBOL32 RngType = C.CURAND_RNG_QUASI_SOBOL32
constant QUASI_SCRAMBLED_SOBOL32 (line 19) | QUASI_SCRAMBLED_SOBOL32 RngType = C.CURAND_RNG_QUASI_SCRAMBLED_SOBOL32
constant QUASI_SOBOL64 (line 20) | QUASI_SOBOL64 RngType = C.CURAND_RNG_QUASI_SOBOL64
constant QUASI_SCRAMBLED_SOBOL64 (line 21) | QUASI_SCRAMBLED_SOBOL64 RngType = C.CURAND_RNG_QUASI_SCRAMBLED_SOBOL64
function CreateGenerator (line 24) | func CreateGenerator(rngType RngType) Generator {
FILE: curand/status.go
type Status (line 10) | type Status
method String (line 27) | func (s Status) String() string {
constant SUCCESS (line 13) | SUCCESS Status = C.CURAND_STATUS_SUCCESS
constant VERSION_MISMATCH (line 14) | VERSION_MISMATCH Status = C.CURAND_STATUS_VERSION_MISMATCH
constant NOT_INITIALIZED (line 15) | NOT_INITIALIZED Status = C.CURAND_STATUS_NOT_INITIALIZED
constant ALLOCATION_FAILED (line 16) | ALLOCATION_FAILED Status = C.CURAND_STATUS_ALLOCATION_FAILED
constant TYPE_ERROR (line 17) | TYPE_ERROR Status = C.CURAND_STATUS_TYPE_ERROR
constant OUT_OF_RANGE (line 18) | OUT_OF_RANGE Status = C.CURAND_STATUS_OUT_OF_RANGE
constant LENGTH_NOT_MULTIPLE (line 19) | LENGTH_NOT_MULTIPLE Status = C.CURAND_STATUS_LENGTH_NOT_MULTIPLE
constant LAUNCH_FAILURE (line 20) | LAUNCH_FAILURE Status = C.CURAND_STATUS_LAUNCH_FAILURE
constant PREEXISTING_FAILURE (line 21) | PREEXISTING_FAILURE Status = C.CURAND_STATUS_PREEXISTING_FAILURE
constant INITIALIZATION_FAILED (line 22) | INITIALIZATION_FAILED Status = C.CURAND_STATUS_INITIALIZATION_FAILED
constant ARCH_MISMATCH (line 23) | ARCH_MISMATCH Status = C.CURAND_STATUS_ARCH_MISMATCH
constant INTERNAL_ERROR (line 24) | INTERNAL_ERROR Status = C.CURAND_STATUS_INTERNAL_ERROR
FILE: safe/complex128s.go
type Complex128s (line 9) | type Complex128s struct
method Slice (line 20) | func (s Complex128s) Slice(start, stop int) Complex128s {
method CopyHtoD (line 25) | func (dst Complex128s) CopyHtoD(src []complex128) {
method CopyDtoH (line 30) | func (src Complex128s) CopyDtoH(dst []complex128) {
method CopyDtoD (line 35) | func (dst Complex128s) CopyDtoD(src Complex128s) {
method CopyHtoDAsync (line 40) | func (dst Complex128s) CopyHtoDAsync(src []complex128, stream cu.Strea...
method CopyDtoHAsync (line 45) | func (src Complex128s) CopyDtoHAsync(dst []complex128, stream cu.Strea...
method CopyDtoDAsync (line 50) | func (dst Complex128s) CopyDtoDAsync(src Complex128s, stream cu.Stream) {
method Host (line 55) | func (src Complex128s) Host() []complex128 {
method Float (line 64) | func (s Complex128s) Float() Float64s {
function MakeComplex128s (line 13) | func MakeComplex128s(len_ int) Complex128s {
FILE: safe/complex128s_test.go
function TestComplex128sSlice (line 8) | func TestComplex128sSlice(test *testing.T) {
function TestComplex128sPanic1 (line 48) | func TestComplex128sPanic1(test *testing.T) {
function TestComplex128sPanic2 (line 65) | func TestComplex128sPanic2(test *testing.T) {
function TestComplex128sCopy (line 82) | func TestComplex128sCopy(test *testing.T) {
FILE: safe/complex64s.go
type Complex64s (line 9) | type Complex64s struct
method Slice (line 20) | func (s Complex64s) Slice(start, stop int) Complex64s {
method CopyHtoD (line 25) | func (dst Complex64s) CopyHtoD(src []complex64) {
method CopyDtoH (line 30) | func (src Complex64s) CopyDtoH(dst []complex64) {
method CopyDtoD (line 35) | func (dst Complex64s) CopyDtoD(src Complex64s) {
method CopyHtoDAsync (line 40) | func (dst Complex64s) CopyHtoDAsync(src []complex64, stream cu.Stream) {
method CopyDtoHAsync (line 45) | func (src Complex64s) CopyDtoHAsync(dst []complex64, stream cu.Stream) {
method CopyDtoDAsync (line 50) | func (dst Complex64s) CopyDtoDAsync(src Complex64s, stream cu.Stream) {
method Host (line 55) | func (src Complex64s) Host() []complex64 {
method Float (line 64) | func (s Complex64s) Float() Float32s {
function MakeComplex64s (line 13) | func MakeComplex64s(len_ int) Complex64s {
FILE: safe/complex64s_test.go
function TestComplex64sSlice (line 8) | func TestComplex64sSlice(test *testing.T) {
function TestComplex64sPanic1 (line 48) | func TestComplex64sPanic1(test *testing.T) {
function TestComplex64sPanic2 (line 65) | func TestComplex64sPanic2(test *testing.T) {
function TestComplex64sCopy (line 82) | func TestComplex64sCopy(test *testing.T) {
FILE: safe/fft1d_test.go
function ExampleFFT1DR2C (line 7) | func ExampleFFT1DR2C() {
function ExampleFFT1DR2C_Inplace (line 33) | func ExampleFFT1DR2C_Inplace() {
function ExampleFFT1DC2R (line 64) | func ExampleFFT1DC2R() {
FILE: safe/fft1dc2r.go
type FFT1DC2RPlan (line 9) | type FFT1DC2RPlan struct
method Exec (line 23) | func (p FFT1DC2RPlan) Exec(src Complex64s, dst Float32s) {
method OutputLen (line 37) | func (p FFT1DC2RPlan) OutputLen() int {
method InputLen (line 42) | func (p FFT1DC2RPlan) InputLen() int {
function FFT1DC2R (line 16) | func FFT1DC2R(size, batch int) FFT1DC2RPlan {
FILE: safe/fft1dr2c.go
type FFT1DR2CPlan (line 9) | type FFT1DR2CPlan struct
method Exec (line 23) | func (p FFT1DR2CPlan) Exec(src Float32s, dst Complex64s) {
method InputLen (line 37) | func (p FFT1DR2CPlan) InputLen() int {
method OutputLen (line 42) | func (p FFT1DR2CPlan) OutputLen() int {
function FFT1DR2C (line 16) | func FFT1DR2C(size, batch int) FFT1DR2CPlan {
FILE: safe/fft3d_test.go
function ExampleFFT3DR2C (line 7) | func ExampleFFT3DR2C() {
function ExampleFFT3DC2R (line 37) | func ExampleFFT3DC2R() {
function ExampleFFT3D (line 68) | func ExampleFFT3D() {
FILE: safe/fft3dc2r.go
type FFT3DC2RPlan (line 9) | type FFT3DC2RPlan struct
method Exec (line 23) | func (p FFT3DC2RPlan) Exec(src Complex64s, dst Float32s) {
method InputSize (line 37) | func (p FFT3DC2RPlan) InputSize() (Nx, Ny, Nz int) {
method OutputSize (line 42) | func (p FFT3DC2RPlan) OutputSize() (Nx, Ny, Nz int) {
method InputLen (line 47) | func (p FFT3DC2RPlan) InputLen() int {
method OutputLen (line 52) | func (p FFT3DC2RPlan) OutputLen() int {
function FFT3DC2R (line 15) | func FFT3DC2R(Nx, Ny, Nz int) FFT3DC2RPlan {
FILE: safe/fft3dd2z.go
type FFT3DD2ZPlan (line 9) | type FFT3DD2ZPlan struct
method Exec (line 23) | func (p FFT3DD2ZPlan) Exec(src Float64s, dst Complex128s) {
method InputSize (line 37) | func (p FFT3DD2ZPlan) InputSize() (Nx, Ny, Nz int) {
method OutputSize (line 42) | func (p FFT3DD2ZPlan) OutputSize() (Nx, Ny, Nz int) {
method InputLen (line 47) | func (p FFT3DD2ZPlan) InputLen() int {
method OutputLen (line 52) | func (p FFT3DD2ZPlan) OutputLen() int {
function FFT3DD2Z (line 15) | func FFT3DD2Z(Nx, Ny, Nz int) FFT3DD2ZPlan {
FILE: safe/fft3dr2c.go
type FFT3DR2CPlan (line 9) | type FFT3DR2CPlan struct
method Exec (line 23) | func (p FFT3DR2CPlan) Exec(src Float32s, dst Complex64s) {
method InputSize (line 37) | func (p FFT3DR2CPlan) InputSize() (Nx, Ny, Nz int) {
method OutputSize (line 42) | func (p FFT3DR2CPlan) OutputSize() (Nx, Ny, Nz int) {
method InputLen (line 47) | func (p FFT3DR2CPlan) InputLen() int {
method OutputLen (line 52) | func (p FFT3DR2CPlan) OutputLen() int {
function FFT3DR2C (line 15) | func FFT3DR2C(Nx, Ny, Nz int) FFT3DR2CPlan {
FILE: safe/fft3dz2d.go
type FFT3DZ2DPlan (line 9) | type FFT3DZ2DPlan struct
method Exec (line 23) | func (p FFT3DZ2DPlan) Exec(src Complex128s, dst Float64s) {
method InputSize (line 37) | func (p FFT3DZ2DPlan) InputSize() (Nx, Ny, Nz int) {
method OutputSize (line 42) | func (p FFT3DZ2DPlan) OutputSize() (Nx, Ny, Nz int) {
method InputLen (line 47) | func (p FFT3DZ2DPlan) InputLen() int {
method OutputLen (line 52) | func (p FFT3DZ2DPlan) OutputLen() int {
function FFT3DZ2D (line 15) | func FFT3DZ2D(Nx, Ny, Nz int) FFT3DZ2DPlan {
FILE: safe/fftplan.go
type fftplan (line 12) | type fftplan struct
method Destroy (line 38) | func (p fftplan) Destroy() { p.handle.Destroy() }
method SetStream (line 43) | func (p fftplan) SetStream(stream cu.Stream) {
method Stream (line 49) | func (p fftplan) Stream() cu.Stream {
type size1D (line 18) | type size1D
method Size (line 23) | func (s size1D) Size() int { return int(s) }
type size3D (line 26) | type size3D
method Size (line 31) | func (s size3D) Size() (Nx, Ny, Nz int) { return s[0], s[1], s[2] }
function prod3 (line 33) | func prod3(x, y, z int) int {
FILE: safe/float32s.go
type Float32s (line 11) | type Float32s struct
method Slice (line 22) | func (s Float32s) Slice(start, stop int) Float32s {
method CopyHtoD (line 27) | func (dst Float32s) CopyHtoD(src []float32) {
method CopyDtoH (line 32) | func (src Float32s) CopyDtoH(dst []float32) {
method CopyDtoD (line 37) | func (dst Float32s) CopyDtoD(src Float32s) {
method CopyHtoDAsync (line 42) | func (dst Float32s) CopyHtoDAsync(src []float32, stream cu.Stream) {
method CopyDtoHAsync (line 47) | func (src Float32s) CopyDtoHAsync(dst []float32, stream cu.Stream) {
method CopyDtoDAsync (line 52) | func (dst Float32s) CopyDtoDAsync(src Float32s, stream cu.Stream) {
method Host (line 57) | func (src Float32s) Host() []float32 {
method Memset (line 64) | func (s Float32s) Memset(value float32) {
method MemsetAsync (line 70) | func (s Float32s) MemsetAsync(value float32, stream cu.Stream) {
method Complex (line 77) | func (s Float32s) Complex() Complex64s {
function MakeFloat32s (line 15) | func MakeFloat32s(len_ int) Float32s {
FILE: safe/float32s_test.go
function TestFloat32sSlice (line 8) | func TestFloat32sSlice(test *testing.T) {
function TestFloat32sPanic1 (line 48) | func TestFloat32sPanic1(test *testing.T) {
function TestFloat32sPanic2 (line 65) | func TestFloat32sPanic2(test *testing.T) {
function TestFloat32sCopy (line 82) | func TestFloat32sCopy(test *testing.T) {
FILE: safe/float64s.go
type Float64s (line 10) | type Float64s struct
method Slice (line 21) | func (s Float64s) Slice(start, stop int) Float64s {
method CopyHtoD (line 26) | func (dst Float64s) CopyHtoD(src []float64) {
method CopyDtoH (line 31) | func (src Float64s) CopyDtoH(dst []float64) {
method CopyDtoD (line 36) | func (dst Float64s) CopyDtoD(src Float64s) {
method CopyHtoDAsync (line 41) | func (dst Float64s) CopyHtoDAsync(src []float64, stream cu.Stream) {
method CopyDtoHAsync (line 46) | func (src Float64s) CopyDtoHAsync(dst []float64, stream cu.Stream) {
method CopyDtoDAsync (line 51) | func (dst Float64s) CopyDtoDAsync(src Float64s, stream cu.Stream) {
method Host (line 56) | func (src Float64s) Host() []float64 {
method Complex (line 65) | func (s Float64s) Complex() Complex128s {
function MakeFloat64s (line 14) | func MakeFloat64s(len_ int) Float64s {
FILE: safe/float64s_test.go
function TestFloat64sSlice (line 8) | func TestFloat64sSlice(test *testing.T) {
function TestFloat64sPanic1 (line 48) | func TestFloat64sPanic1(test *testing.T) {
function TestFloat64sPanic2 (line 65) | func TestFloat64sPanic2(test *testing.T) {
function TestFloat64sCopy (line 82) | func TestFloat64sCopy(test *testing.T) {
FILE: safe/init.go
function InitCuda (line 8) | func InitCuda() {
FILE: safe/slice.go
function makeslice (line 14) | func makeslice(len_ int, elemsize int) slice {
type slice (line 26) | type slice struct
method Pointer (line 33) | func (s *slice) Pointer() cu.DevicePtr { return s.ptr_ }
method Len (line 36) | func (s *slice) Len() int { return s.len_ }
method Cap (line 39) | func (s *slice) Cap() int { return s.cap_ }
method Free (line 46) | func (s *slice) Free() {
method slice (line 53) | func (s *slice) slice(start, stop int, elemsize uintptr) slice {
method copyHtoD (line 63) | func (dst *slice) copyHtoD(src unsafe.Pointer, srclen int, elemsize in...
method copyDtoH (line 70) | func (src *slice) copyDtoH(dst unsafe.Pointer, dstlen int, elemsize in...
method copyDtoD (line 77) | func (dst *slice) copyDtoD(src *slice, elemsize int) {
method copyHtoDAsync (line 84) | func (dst *slice) copyHtoDAsync(src unsafe.Pointer, srclen int, elemsi...
method copyDtoHAsync (line 91) | func (src *slice) copyDtoHAsync(dst unsafe.Pointer, dstlen int, elemsi...
method copyDtoDAsync (line 98) | func (dst *slice) copyDtoDAsync(src *slice, elemsize int, stream cu.St...
method UnsafeSet (line 107) | func (s *slice) UnsafeSet(pointer unsafe.Pointer, length, capacity int) {
Condensed preview — 72 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (189K chars).
[
{
"path": ".gitignore",
"chars": 18,
"preview": "*.swp\n*.{6,8,5,o}\n"
},
{
"path": "Makefile",
"chars": 485,
"preview": "all: 6g doc\n\n6g:\n\tgo install -v\n\tgo tool vet *.go\n\tgofmt -w *.go\n\nGCCGO=gccgo -gccgoflags '-static-libgcc -O3'\n\ngccgo:\n\t"
},
{
"path": "README.md",
"chars": 134,
"preview": "# Go bindings for CUDA\n\nGo bindings for nVIDIA CUDA 5 and later. This package compiles with both gc and gccgo.\n\n\n\nfunc TestContext(t *testing.T) {\n\tfmt.Println(\"CtxCreate\")\n\tctx := CtxCreate(C"
},
{
"path": "cu/device.go",
"chars": 9816,
"preview": "package cu\n\n// This file implements CUDA driver device management\n\n//#include <cuda.h>\nimport \"C\"\n\nimport ()\n\n// CUDA De"
},
{
"path": "cu/device_test.go",
"chars": 4524,
"preview": "package cu\n\nimport (\n\t\"fmt\"\n\t\"testing\"\n)\n\nfunc TestDevice(t *testing.T) {\n\tfmt.Println(\"DeviceGetCount:\", DeviceGetCount"
},
{
"path": "cu/dim3.go",
"chars": 46,
"preview": "package cu\n\ntype Dim3 struct {\n\tX, Y, Z int\n}\n"
},
{
"path": "cu/doc.go",
"chars": 51,
"preview": "// Go bindings for the CUDA driver API.\npackage cu\n"
},
{
"path": "cu/execution.go",
"chars": 1348,
"preview": "package cu\n\n// This file implements execution of CUDA kernels\n\n//#include <cuda.h>\nimport \"C\"\n\nimport (\n\t\"unsafe\"\n)\n\ncon"
},
{
"path": "cu/function.go",
"chars": 1829,
"preview": "package cu\n\n// This file implements manipulations on CUDA functions\n\n//#include <cuda.h>\nimport \"C\"\n\nimport (\n\t\"unsafe\"\n"
},
{
"path": "cu/init.go",
"chars": 370,
"preview": "package cu\n\n// This file implements CUDA driver initialization\n\n//#include <cuda.h>\nimport \"C\"\n\n// Initialize the CUDA d"
},
{
"path": "cu/init_test.go",
"chars": 181,
"preview": "package cu\n\nimport (\n\t\"fmt\"\n)\n\n// needed for all other tests.\nfunc init() {\n\tInit(0)\n\tctx := CtxCreate(CTX_SCHED_AUTO, 0"
},
{
"path": "cu/memory.go",
"chars": 7711,
"preview": "package cu\n\n// This file implements CUDA memory management on the driver level\n\n//#include <cuda.h>\nimport \"C\"\n\nimport ("
},
{
"path": "cu/memory_test.go",
"chars": 4343,
"preview": "package cu\n\nimport (\n\t\"fmt\"\n\t\"math\"\n\t\"testing\"\n\t\"unsafe\"\n)\n\nfunc TestMalloc(t *testing.T) {\n\tfor i := 0; i < 1024; i++ {"
},
{
"path": "cu/memset.go",
"chars": 1280,
"preview": "package cu\n\n// This file implements CUDA memset functions.\n\n//#include <cuda.h>\nimport \"C\"\n\nimport (\n\t\"unsafe\"\n)\n\n// Set"
},
{
"path": "cu/module.go",
"chars": 1209,
"preview": "package cu\n\n// This file implements loading of CUDA ptx modules\n\n//#include <cuda.h>\nimport \"C\"\n\nimport (\n\t\"unsafe\"\n)\n\n/"
},
{
"path": "cu/module_test.go",
"chars": 846,
"preview": "package cu\n\nimport (\n\t\"testing\"\n\t\"unsafe\"\n\t//\"fmt\"\n)\n\nfunc TestModule(test *testing.T) {\n\tmod := ModuleLoad(\"/testdata/t"
},
{
"path": "cu/peer.go",
"chars": 1327,
"preview": "package cu\n\n// This file implements CUDA unified addressing.\n\n//#include <cuda.h>\nimport \"C\"\n\nimport (\n\t\"unsafe\"\n)\n\n// M"
},
{
"path": "cu/result.go",
"chars": 8698,
"preview": "package cu\n\n// This file provides access to CUDA driver error statuses (type CUresult).\n\n//#include <cuda.h>\nimport \"C\"\n"
},
{
"path": "cu/runtimeapi.go",
"chars": 2445,
"preview": "package cu\n\n// This file implements parts of the CUDA runtime api instead of the driver\n// api the rest of this package "
},
{
"path": "cu/stream.go",
"chars": 1341,
"preview": "package cu\n\n// This file implements CUDA streams\n\n//#include <cuda.h>\nimport \"C\"\nimport \"unsafe\"\n\n// CUDA stream.\ntype S"
},
{
"path": "cu/testdata/testmodule.cu",
"chars": 443,
"preview": "/*\n * Module to test CUDA module loading and execution.\n * To be compiled with:\n * nvcc -ptx testmodule.cu\n */\n\n\n#ifdef "
},
{
"path": "cu/testdata/testmodule.ptx",
"chars": 3245,
"preview": "\t.version 1.4\n\t.target sm_10, map_f64_to_f32\n\t// compiled with /usr/local/cuda/open64/lib//be\n\t// nvopencc 4.0 built on "
},
{
"path": "cu/version.go",
"chars": 283,
"preview": "package cu\n\n// This file implements CUDA driver version management\n\n//#include <cuda.h>\nimport \"C\"\n\n// Returns the CUDA "
},
{
"path": "cu/version_test.go",
"chars": 126,
"preview": "package cu\n\nimport (\n\t\"fmt\"\n\t\"testing\"\n)\n\nfunc TestVersion(t *testing.T) {\n\tfmt.Println(\"CUDA driver version: \", Version"
},
{
"path": "cuda/Makefile",
"chars": 413,
"preview": "all: 6g gccgo doc\n\n6g:\n\tgo install -v\n\tgo tool vet *.go\n\tgofmt -w *.go\n\nGCCGO=gccgo -gccgoflags '-static-libgcc -O3'\n\ngc"
},
{
"path": "cuda/README",
"chars": 22576,
"preview": "PACKAGE\n\npackage cu\n import \"github.com/barnex/cuda5/cu\"\n\n Go bindings for the CUDA driver API.\n\nCONSTANTS\n\nconst "
},
{
"path": "cuda/cgoflags.go",
"chars": 922,
"preview": "package cuda\n\n// This file provides CGO flags.\n\nimport \"C\"\n\n//#cgo LDFLAGS:-lcudart\n//\n////default location:\n//#cgo LDFL"
},
{
"path": "cuda/device.go",
"chars": 787,
"preview": "package cuda\n\n//#include <cuda_runtime.h>\n//#include <cuda.h>\nimport \"C\"\n\nimport (\n\t\"github.com/barnex/cuda5/cu\"\n)\n\n// R"
},
{
"path": "cufft/Makefile",
"chars": 416,
"preview": "all: 6g gccgo doc\n\n6g:\n\tgo install -v\n\tgo tool vet *.go\n\tgofmt -w *.go\n\nGCCGO=gccgo -gccgoflags '-static-libgcc -O3'\n\ngc"
},
{
"path": "cufft/README",
"chars": 2914,
"preview": "PACKAGE DOCUMENTATION\n\npackage cufft\n import \"github.com/barnex/cuda5/cufft\"\n\n Go bindings for the CUDA CUFFT API."
},
{
"path": "cufft/cgoflags.go",
"chars": 948,
"preview": "package cufft\n\n// This file provides CGO flags to find CUDA libraries and headers.\n\n//#cgo LDFLAGS:-lcufft\n//\n////defaul"
},
{
"path": "cufft/doc.go",
"chars": 53,
"preview": "// Go bindings for the CUDA CUFFT API.\npackage cufft\n"
},
{
"path": "cufft/fft_test.go",
"chars": 749,
"preview": "package cufft\n\nimport (\n\t\"fmt\"\n\t\"github.com/barnex/cuda5/cu\"\n\t\"unsafe\"\n)\n\nfunc ExampleFFT1D() {\n\tN := 8\n\n\thostIn := make"
},
{
"path": "cufft/init_test.go",
"chars": 226,
"preview": "package cufft\n\nimport (\n\t\"fmt\"\n\t\"github.com/barnex/cuda5/cu\"\n)\n\n// needed for all other tests.\nfunc init() {\n\tcu.Init(0)"
},
{
"path": "cufft/mode.go",
"chars": 1013,
"preview": "package cufft\n\n//#include <cufft.h>\nimport \"C\"\n\nimport (\n\t\"fmt\"\n)\n\n// CUFFT compatibility mode\ntype CompatibilityMode in"
},
{
"path": "cufft/plan.go",
"chars": 4571,
"preview": "// Copyright 2011 Arne Vansteenkiste (barnex@gmail.com). All rights reserved.\n// Use of this source code is governed by"
},
{
"path": "cufft/result.go",
"chars": 1847,
"preview": "package cufft\n\n//#include <cufft.h>\nimport \"C\"\n\nimport (\n\t\"fmt\"\n)\n\n// FFT result\ntype Result int\n\n// FFT result value\nco"
},
{
"path": "cufft/type.go",
"chars": 823,
"preview": "package cufft\n\n//#include <cufft.h>\nimport \"C\"\n\nimport (\n\t\"fmt\"\n)\n\n// FFT type\ntype Type int\n\nconst (\n\tR2C Type = C.CUFF"
},
{
"path": "curand/Makefile",
"chars": 417,
"preview": "all: 6g gccgo doc\n\n6g:\n\tgo install -v\n\tgo tool vet *.go\n\tgofmt -w *.go\n\nGCCGO=gccgo -gccgoflags '-static-libgcc -O3'\n\ngc"
},
{
"path": "curand/README",
"chars": 2421,
"preview": "PACKAGE DOCUMENTATION\n\npackage curand\n import \"github.com/barnex/cuda5/curand\"\n\n\n\nTYPES\n\ntype Generator uintptr\n\n\nfun"
},
{
"path": "curand/cgoflags.go",
"chars": 950,
"preview": "package curand\n\n// This file provides CGO flags to find CUDA libraries and headers.\n\n//#cgo LDFLAGS:-lcurand\n//\n////defa"
},
{
"path": "curand/generator.go",
"chars": 1691,
"preview": "package curand\n\n//#include <curand.h>\nimport \"C\"\n\nimport (\n\t\"unsafe\"\n)\n\ntype Generator uintptr\n\ntype RngType int\n\nconst "
},
{
"path": "curand/status.go",
"chars": 2273,
"preview": "package curand\n\n//#include <curand.h>\nimport \"C\"\n\nimport (\n\t\"fmt\"\n)\n\ntype Status int\n\nconst (\n\tSUCCESS Sta"
},
{
"path": "doc.go",
"chars": 300,
"preview": "/*\n\tGo bindings for nVIDIA CUDA 5.\n\tThis package compiles with both gc and gccgo.\n*/\npackage cuda5\n\n// Dummy imports so "
},
{
"path": "safe/Makefile",
"chars": 495,
"preview": "all: 6g doc #gccgo\n\n6g:\n\tgo install -v\n\tgo tool vet *.go\n\tgofmt -w *.go\n\nGCCGO=gccgo -gccgoflags '-static-libgcc -O3'\n\ng"
},
{
"path": "safe/README",
"chars": 15185,
"preview": "PACKAGE\n\npackage safe\n import \"github.com/barnex/cuda5/safe\"\n\n Safe and more idiomatic wrappers for the low-level "
},
{
"path": "safe/complex128s.go",
"chars": 2073,
"preview": "package safe\n\nimport (\n\t\"github.com/barnex/cuda5/cu\"\n\t\"unsafe\"\n)\n\n// Slice of complex128's on the GPU.\ntype Complex128s "
},
{
"path": "safe/complex128s_test.go",
"chars": 1727,
"preview": "package safe\n\nimport (\n\t\"reflect\"\n\t\"testing\"\n)\n\nfunc TestComplex128sSlice(test *testing.T) {\n\tInitCuda()\n\n\ta := MakeComp"
},
{
"path": "safe/complex64s.go",
"chars": 2040,
"preview": "package safe\n\nimport (\n\t\"github.com/barnex/cuda5/cu\"\n\t\"unsafe\"\n)\n\n// Slice of complex64's on the GPU.\ntype Complex64s st"
},
{
"path": "safe/complex64s_test.go",
"chars": 1713,
"preview": "package safe\n\nimport (\n\t\"reflect\"\n\t\"testing\"\n)\n\nfunc TestComplex64sSlice(test *testing.T) {\n\tInitCuda()\n\n\ta := MakeCompl"
},
{
"path": "safe/doc.go",
"chars": 87,
"preview": "/*\n\tSafe and more idiomatic wrappers for the low-level CUDA functions.\n*/\npackage safe\n"
},
{
"path": "safe/fft1d_test.go",
"chars": 1827,
"preview": "package safe\n\nimport (\n\t\"fmt\"\n)\n\nfunc ExampleFFT1DR2C() {\n\tInitCuda()\n\n\tN := 8\n\tbatch := 1\n\n\tfft := FFT1DR2C(N, batch)\n\t"
},
{
"path": "safe/fft1dc2r.go",
"chars": 1133,
"preview": "package safe\n\nimport (\n\t\"fmt\"\n\t\"github.com/barnex/cuda5/cufft\"\n)\n\n// 1D single-precission complex-to-real FFT plan.\ntype"
},
{
"path": "safe/fft1dr2c.go",
"chars": 1133,
"preview": "package safe\n\nimport (\n\t\"fmt\"\n\t\"github.com/barnex/cuda5/cufft\"\n)\n\n// 1D single-precission real-to-complex FFT plan.\ntype"
},
{
"path": "safe/fft3d_test.go",
"chars": 4130,
"preview": "package safe\n\nimport (\n\t\"fmt\"\n)\n\nfunc ExampleFFT3DR2C() {\n\tInitCuda()\n\n\tNx, Ny, Nz := 2, 4, 8\n\n\tfft := FFT3DR2C(Nx, Ny, "
},
{
"path": "safe/fft3dc2r.go",
"chars": 1433,
"preview": "package safe\n\nimport (\n\t\"fmt\"\n\t\"github.com/barnex/cuda5/cufft\"\n)\n\n// 3D single-precission real-to-complex FFT plan.\ntype"
},
{
"path": "safe/fft3dd2z.go",
"chars": 1448,
"preview": "package safe\n\nimport (\n\t\"fmt\"\n\t\"github.com/barnex/cuda5/cufft\"\n)\n\n// 3D single-precission real-to-complex FFT plan.\ntype"
},
{
"path": "safe/fft3dr2c.go",
"chars": 1447,
"preview": "package safe\n\nimport (\n\t\"fmt\"\n\t\"github.com/barnex/cuda5/cufft\"\n)\n\n// 3D single-precission real-to-complex FFT plan.\ntype"
},
{
"path": "safe/fft3dz2d.go",
"chars": 1448,
"preview": "package safe\n\nimport (\n\t\"fmt\"\n\t\"github.com/barnex/cuda5/cufft\"\n)\n\n// 3D single-precission real-to-complex FFT plan.\ntype"
},
{
"path": "safe/fftplan.go",
"chars": 1190,
"preview": "package safe\n\n// INTERNAL\n// Base implementation for all FFT plans.\n\nimport (\n\t\"github.com/barnex/cuda5/cu\"\n\t\"github.com"
},
{
"path": "safe/float32s.go",
"chars": 2477,
"preview": "package safe\n\nimport (\n\t\"fmt\"\n\t\"github.com/barnex/cuda5/cu\"\n\t\"math\"\n\t\"unsafe\"\n)\n\n// Slice of float32's on the GPU.\ntype "
},
{
"path": "safe/float32s_test.go",
"chars": 1662,
"preview": "package safe\n\nimport (\n\t\"reflect\"\n\t\"testing\"\n)\n\nfunc TestFloat32sSlice(test *testing.T) {\n\tInitCuda()\n\n\ta := MakeFloat32"
},
{
"path": "safe/float64s.go",
"chars": 2094,
"preview": "package safe\n\nimport (\n\t\"fmt\"\n\t\"github.com/barnex/cuda5/cu\"\n\t\"unsafe\"\n)\n\n// Slice of float64's on the GPU.\ntype Float64s"
},
{
"path": "safe/float64s_test.go",
"chars": 1662,
"preview": "package safe\n\nimport (\n\t\"reflect\"\n\t\"testing\"\n)\n\nfunc TestFloat64sSlice(test *testing.T) {\n\tInitCuda()\n\n\ta := MakeFloat64"
},
{
"path": "safe/init.go",
"chars": 172,
"preview": "package safe\n\nimport (\n\t\"github.com/barnex/cuda5/cu\"\n\t\"runtime\"\n)\n\nfunc InitCuda() {\n\truntime.LockOSThread()\n\tcu.Init(0)"
},
{
"path": "safe/slice.go",
"chars": 3622,
"preview": "package safe\n\n// INTERNAL.\n// This file implements common functionality for all slice types\n// (Float32s, Float64s, Comp"
},
{
"path": "safe/subs.sh",
"chars": 533,
"preview": "#! /bin/bash\n\nsubs32='s/loat32/loat64/g;'\nsubs32+='s/FLOAT32/FLOAT64/g;'\n\n#sed $subs32 float32s.go > float64s.go\n#sed $s"
}
]
About this extraction
This page contains the full source code of the barnex/cuda5 GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 72 files (172.6 KB), approximately 49.7k tokens, and a symbol index with 456 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.