Repository: barnex/cuda5 Branch: master Commit: da30a9b287d8 Files: 72 Total size: 172.6 KB Directory structure: gitextract_cibr8rm8/ ├── .gitignore ├── Makefile ├── README.md ├── cu/ │ ├── Makefile │ ├── README │ ├── cgoflags.go │ ├── context.go │ ├── context_test.go │ ├── device.go │ ├── device_test.go │ ├── dim3.go │ ├── doc.go │ ├── execution.go │ ├── function.go │ ├── init.go │ ├── init_test.go │ ├── memory.go │ ├── memory_test.go │ ├── memset.go │ ├── module.go │ ├── module_test.go │ ├── peer.go │ ├── result.go │ ├── runtimeapi.go │ ├── stream.go │ ├── testdata/ │ │ ├── testmodule.cu │ │ └── testmodule.ptx │ ├── version.go │ └── version_test.go ├── cuda/ │ ├── Makefile │ ├── README │ ├── cgoflags.go │ └── device.go ├── cufft/ │ ├── Makefile │ ├── README │ ├── cgoflags.go │ ├── doc.go │ ├── fft_test.go │ ├── init_test.go │ ├── mode.go │ ├── plan.go │ ├── result.go │ └── type.go ├── curand/ │ ├── Makefile │ ├── README │ ├── cgoflags.go │ ├── generator.go │ └── status.go ├── doc.go └── safe/ ├── Makefile ├── README ├── complex128s.go ├── complex128s_test.go ├── complex64s.go ├── complex64s_test.go ├── doc.go ├── fft1d_test.go ├── fft1dc2r.go ├── fft1dr2c.go ├── fft3d_test.go ├── fft3dc2r.go ├── fft3dd2z.go ├── fft3dr2c.go ├── fft3dz2d.go ├── fftplan.go ├── float32s.go ├── float32s_test.go ├── float64s.go ├── float64s_test.go ├── init.go ├── slice.go └── subs.sh ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ *.swp *.{6,8,5,o} ================================================ FILE: Makefile ================================================ all: 6g doc 6g: go install -v go tool vet *.go gofmt -w *.go GCCGO=gccgo -gccgoflags '-static-libgcc -O3' gccgo: go install -v -compiler $(GCCGO) test: 6gtest gccgotest 6gtest: go test gccgotest: go test -compiler $(GCCGO) bench: 6gbench gccgobench 
6gbench: go test -bench=. gccgobench: go test -bench=. -compiler $(GCCGO) clean: go clean go-optview -c -w *.go gofmt -w *.go opt: go-optview -w *.go gofmt -w *.go doc: godoc github.com/barnex/cuda5 > README ================================================ FILE: README.md ================================================ # Go bindings for CUDA Go bindings for nVIDIA CUDA 5 and later. This package compiles with both gc and gccgo. ![fig](gophergpu.png) ================================================ FILE: cu/Makefile ================================================ all: 6g gccgo doc 6g: go install -v go tool vet *.go gofmt -w *.go GCCGO=gccgo -gccgoflags '-static-libgcc -O3' gccgo: go build -v -compiler $(GCCGO) test: 6gtest gccgotest 6gtest: go test gccgotest: go test -compiler $(GCCGO) bench: 6gbench gccgobench 6gbench: go test -bench=. gccgobench: go test -bench=. -compiler $(GCCGO) clean: go clean doc: godoc github.com/barnex/cuda5/cu > README ================================================ FILE: cu/README ================================================ PACKAGE package cu import "github.com/barnex/cuda5/cu" Go bindings for the CUDA driver API. CONSTANTS const ( // If the number of contexts > number of CPUs, yield to other OS threads when waiting for the GPU, otherwise CUDA spin on the processor. CTX_SCHED_AUTO = C.CU_CTX_SCHED_AUTO // Spin when waiting for results from the GPU. CTX_SCHED_SPIN = C.CU_CTX_SCHED_SPIN // Yield its thread when waiting for results from the GPU. CTX_SCHED_YIELD = C.CU_CTX_SCHED_YIELD // Bock the CPU thread on a synchronization primitive when waiting for the GPU to finish work. CTX_BLOCKING_SYNC // Support mapped pinned allocations. This flag must be set in order to allocate pinned host memory that is accessible to the GPU. CTX_MAP_HOST = C.CU_CTX_MAP_HOST //Do not reduce local memory after resizing local memory for a kernel. 
CTX_LMEM_RESIZE_TO_MAX = C.CU_CTX_LMEM_RESIZE_TO_MAX ) Flags for CtxCreate const ( SIZEOF_FLOAT32 = 4 SIZEOF_FLOAT64 = 8 SIZEOF_COMPLEX64 = 8 SIZEOF_COMPLEX128 = 16 ) Type size in bytes FUNCTIONS func CtxDestroy(ctx *Context) Destroys the CUDA context specified by ctx. If the context usage count is not equal to 1, or the context is current to any CPU thread other than the current one, this function fails. Floating contexts (detached from a CPU thread via cuCtxPopCurrent()) may be destroyed by this function. func CtxDisablePeerAccess(peer Context) Reverses CtxEnablePeerAccess(). func CtxEnablePeerAccess(peer Context) Make allocations from the peer Context available to the current context. func CtxGetApiVersion(ctx Context) (version int) Returns the API version to create the context. func CtxSetCurrent(ctx Context) Sets the current active context. func CtxSynchronize() Blocks until the device has completed all preceding requested tasks, if the context was created with the CU_CTX_SCHED_BLOCKING_SYNC flag. func DeviceCanAccessPeer(dev, peer Device) bool Returns true if CtxEnablePeerAccess can be called on a context for dev and peerDev. func DeviceComputeCapability(device Device) (major, minor int) Returns the compute capability of the device. func DeviceGetAttribute(attrib DeviceAttribute, dev Device) int Gets the value of a device attribute. func DeviceGetCount() int Returns the number of devices with compute capability greater than or equal to 1.0 that are available for execution. func DeviceGetName(dev Device) string Gets the name of the device. func DeviceTotalMem(device Device) int64 Returns the total amount of memory available on the device in bytes. func FuncGetAttribute(attrib FunctionAttribute, function Function) int func Init(flags int) Initialize the CUDA driver API. Currently, flags must be 0. If Init() has not been called, any function from the driver API will panic with ERROR_NOT_INITIALIZED. 
func LaunchKernel(f Function, gridDimX, gridDimY, gridDimZ int, blockDimX, blockDimY, blockDimZ int, sharedMemBytes int, stream Stream, kernelParams []unsafe.Pointer) func MemAllocHost(bytes int64) unsafe.Pointer func MemFree(ptr *DevicePtr) Frees device memory allocated by MemAlloc(). Overwrites the pointer with NULL. It is safe to double-free. func MemFreeHost(ptr unsafe.Pointer) func MemGetAddressRange(ptr DevicePtr) (bytes int64, base DevicePtr) Returns the base address and size of the allocation (by MemAlloc) that contains the input pointer ptr. func MemGetInfo() (free, total int64) Returns the free and total amount of memory in the current Context (in bytes). func MemHostRegister(ptr unsafe.Pointer, bytes int64, flags MemHostRegisterFlag) Page-locks memory specified by the pointer and bytes. The pointer and byte size must be aligned to the host page size (4KB) See also: MemHostUnregister() func MemHostUnregister(ptr unsafe.Pointer) Unmaps memory locked by MemHostRegister(). func Memcpy(dst, src DevicePtr, bytes int64) Copies a number of bytes on the current device. Requires unified addressing to be supported. See also: MemcpyDtoD(). TODO(a): is actually an auto copy for device and/or host memory func MemcpyAsync(dst, src DevicePtr, bytes int64, stream Stream) Asynchronously copies a number of bytes on the current device. func MemcpyDtoD(dst, src DevicePtr, bytes int64) Copies a number of bytes from device to device. func MemcpyDtoDAsync(dst, src DevicePtr, bytes int64, stream Stream) Asynchronously copies a number of bytes from device to device. func MemcpyDtoH(dst unsafe.Pointer, src DevicePtr, bytes int64) Copies a number of bytes from device to host. func MemcpyDtoHAsync(dst unsafe.Pointer, src DevicePtr, bytes int64, stream Stream) Asynchronously copies a number of bytes from device to host. The host memory must be page-locked (see MemRegister) func MemcpyHtoD(dst DevicePtr, src unsafe.Pointer, bytes int64) Copies a number of bytes from host to device.
func MemcpyHtoDAsync(dst DevicePtr, src unsafe.Pointer, bytes int64, stream Stream) Asynchronously copies a number of bytes from host to device. The host memory must be page-locked (see MemRegister) func MemcpyPeer(dst DevicePtr, dstCtx Context, src DevicePtr, srcCtx Context, bytes int64) Copies from device memory in one context (device) to another. func MemcpyPeerAsync(dst DevicePtr, dstCtx Context, src DevicePtr, srcCtx Context, bytes int64, stream Stream) Asynchronously copies from device memory in one context (device) to another. func MemsetD32(deviceptr DevicePtr, value uint32, N int64) Sets the first N 32-bit values of dst array to value. Asynchronous. func MemsetD32Async(deviceptr DevicePtr, value uint32, N int64, stream Stream) Asynchronously sets the first N 32-bit values of dst array to value. func MemsetD8(deviceptr DevicePtr, value uint8, N int64) Sets the first N 8-bit values of dst array to value. Asynchronous. func MemsetD8Async(deviceptr DevicePtr, value uint8, N int64, stream Stream) Asynchronously sets the first N 8-bit values of dst array to value. func StreamDestroy(stream *Stream) Destroys an asynchronous stream func StreamSynchronize(stream Stream) Blocks until the stream has completed. func Version() int Returns the CUDA driver version. TYPES type Context uintptr CUDA context. func CtxCreate(flags uint, dev Device) Context Create a CUDA context. func CtxGetCurrent() Context Gets the current active context. func (ctx Context) ApiVersion() (version int) Returns the API version to create the context. func (ctx *Context) Destroy() Destroys the CUDA context. func (peer Context) DisablePeerAccess() Reverses EnablePeerAccess(). func (peer Context) EnablePeerAccess() Make allocations from the peer Context available to the current context. func (ctx Context) SetCurrent() Sets the current active context.
type DevProp struct { MaxThreadsPerBlock int MaxThreadsDim [3]int MaxGridSize [3]int SharedMemPerBlock int TotalConstantMemory int SIMDWidth int MemPitch int RegsPerBlock int ClockRate int TextureAlign int } Device properties func DeviceGetProperties(dev Device) (prop DevProp) Returns the device's properties. type Device int CUDA Device number. func CtxGetDevice() Device Returns the ordinal of the current context's device. func DeviceGet(ordinal int) Device Returns in a device handle given an ordinal in the range [0, DeviceGetCount()-1]. func (dev Device) Attribute(attrib DeviceAttribute) int Gets the value of a device attribute. func (dev Device) CanAccessPeer(peer Device) bool Returns true if CtxEnablePeerAccess can be called on a context for dev and peerDev. func (device Device) ComputeCapability() (major, minor int) Returns the compute capability of the device. func (dev Device) Name() string Gets the name of the device. func (dev Device) Properties() DevProp Returns the device's properties. func (device Device) TotalMem() int64 Returns the total amount of memory available on the device in bytes. 
type DeviceAttribute int const ( MAX_THREADS_PER_BLOCK DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK // Maximum number of threads per block MAX_BLOCK_DIM_X DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X // Maximum block dimension X MAX_BLOCK_DIM_Y DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y // Maximum block dimension Y MAX_BLOCK_DIM_Z DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z // Maximum block dimension Z MAX_GRID_DIM_X DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X // Maximum grid dimension X MAX_GRID_DIM_Y DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y // Maximum grid dimension Y MAX_GRID_DIM_Z DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z // Maximum grid dimension Z MAX_SHARED_MEMORY_PER_BLOCK DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK // Maximum shared memory available per block in bytes TOTAL_CONSTANT_MEMORY DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY // Memory available on device for __constant__ variables in a CUDA C kernel in bytes WARP_SIZE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_WARP_SIZE // Warp size in threads MAX_PITCH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_PITCH // Maximum pitch in bytes allowed by memory copies MAX_REGISTERS_PER_BLOCK DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK // Maximum number of 32-bit registers available per block CLOCK_RATE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CLOCK_RATE // Peak clock frequency in kilohertz TEXTURE_ALIGNMENT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT // Alignment requirement for textures MULTIPROCESSOR_COUNT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT // Number of multiprocessors on device KERNEL_EXEC_TIMEOUT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT // Specifies whether there is a run time limit on kernels INTEGRATED DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_INTEGRATED // Device is integrated with host memory 
CAN_MAP_HOST_MEMORY DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY // Device can map host memory into CUDA address space COMPUTE_MODE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_COMPUTE_MODE // Compute mode (See ::CUcomputemode for details) MAXIMUM_TEXTURE1D_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH // Maximum 1D texture width MAXIMUM_TEXTURE2D_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH // Maximum 2D texture width MAXIMUM_TEXTURE2D_HEIGHT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT // Maximum 2D texture height MAXIMUM_TEXTURE3D_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH // Maximum 3D texture width MAXIMUM_TEXTURE3D_HEIGHT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT // Maximum 3D texture height MAXIMUM_TEXTURE3D_DEPTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH // Maximum 3D texture depth MAXIMUM_TEXTURE2D_LAYERED_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH // Maximum 2D layered texture width MAXIMUM_TEXTURE2D_LAYERED_HEIGHT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT // Maximum 2D layered texture height MAXIMUM_TEXTURE2D_LAYERED_LAYERS DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS // Maximum layers in a 2D layered texture SURFACE_ALIGNMENT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT // Alignment requirement for surfaces CONCURRENT_KERNELS DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS // Device can possibly execute multiple kernels concurrently ECC_ENABLED DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_ECC_ENABLED // Device has ECC support enabled PCI_BUS_ID DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_PCI_BUS_ID // PCI bus ID of the device PCI_DEVICE_ID DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID // PCI device ID of the device TCC_DRIVER DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TCC_DRIVER // Device is 
using TCC driver model MEMORY_CLOCK_RATE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE // Peak memory clock frequency in kilohertz GLOBAL_MEMORY_BUS_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH // Global memory bus width in bits L2_CACHE_SIZE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE // Size of L2 cache in bytes MAX_THREADS_PER_MULTIPROCESSOR DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR // Maximum resident threads per multiprocessor ASYNC_ENGINE_COUNT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT // Number of asynchronous engines UNIFIED_ADDRESSING DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING // Device uses shares a unified address space with the host MAXIMUM_TEXTURE1D_LAYERED_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH // Maximum 1D layered texture width MAXIMUM_TEXTURE1D_LAYERED_LAYERS DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS // Maximum layers in a 1D layered texture ) type DevicePtr uintptr func MemAlloc(bytes int64) DevicePtr Allocates a number of bytes of device memory. func (ptr DevicePtr) Bytes() (bytes int64) Returns the size of the allocation (by MemAlloc) that contains the input pointer ptr. func (ptr *DevicePtr) Free() Frees device memory allocated by MemAlloc(). Overwrites the pointer with NULL. It is safe to double-free. func (ptr DevicePtr) GetAddressRange() (bytes int64, base DevicePtr) Returns the base address and size of the allocation (by MemAlloc) that contains the input pointer ptr. func (ptr DevicePtr) MemoryType() MemoryType Returns the physical memory type that ptr addresses. func (p DevicePtr) String() string type Dim3 struct { X, Y, Z int } type Function uintptr Represents a CUDA CUfunction, a reference to a function within a module. func ModuleGetFunction(module Module, name string) Function Returns a Function handle. 
func (f Function) GetAttribute(attrib FunctionAttribute) int type FunctionAttribute int const ( FUNC_A_MAX_THREADS_PER_BLOCK FunctionAttribute = C.CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK // The maximum number of threads per block, beyond which a launch of the function would fail. FUNC_A_SHARED_SIZE_BYTES FunctionAttribute = C.CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES // The size in bytes of statically-allocated shared memory required by this function. FUNC_A_CONST_SIZE_BYTES FunctionAttribute = C.CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES // The size in bytes of user-allocated constant memory required by this function. FUNC_A_LOCAL_SIZE_BYTES FunctionAttribute = C.CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES // The size in bytes of local memory used by each thread of this function. FUNC_A_NUM_REGS FunctionAttribute = C.CU_FUNC_ATTRIBUTE_NUM_REGS // The number of registers used by each thread of this function. FUNC_A_PTX_VERSION FunctionAttribute = C.CU_FUNC_ATTRIBUTE_PTX_VERSION // The PTX virtual architecture version for which the function was compiled. FUNC_A_BINARY_VERSION FunctionAttribute = C.CU_FUNC_ATTRIBUTE_BINARY_VERSION // The binary architecture version for which the function was compiled. ) type MemHostRegisterFlag int const ( // Memory is pinned in all CUDA contexts. MEMHOSTREGISTER_PORTABLE MemHostRegisterFlag = C.CU_MEMHOSTREGISTER_PORTABLE // Maps the allocation in CUDA address space. TODO(a): cuMemHostGetDevicePointer() MEMHOSTREGISTER_DEVICEMAP MemHostRegisterFlag = C.CU_MEMHOSTREGISTER_DEVICEMAP ) Flag for MemHostRegister type MemoryType uint const ( MemoryTypeHost MemoryType = C.CU_MEMORYTYPE_HOST MemoryTypeDevice MemoryType = C.CU_MEMORYTYPE_DEVICE MemoryTypeArray MemoryType = C.CU_MEMORYTYPE_ARRAY MemoryTypeUnified MemoryType = C.CU_MEMORYTYPE_UNIFIED ) func PointerGetAttributeMemoryType(ptr DevicePtr) (t MemoryType, err Result) Returns the physical memory type that ptr addresses. 
func (t MemoryType) String() string type Module uintptr Represents a CUDA CUmodule, a reference to executable device code. func ModuleLoad(fname string) Module Loads a compute module from file func ModuleLoadData(image string) Module Loads a compute module from string func (m Module) GetFunction(name string) Function Returns a Function handle. type Result int CUDA error status. CUDA error statuses are not returned by functions but checked and passed to panic() when not successful. If desired, they can be caught by recover(). const ( SUCCESS Result = C.CUDA_SUCCESS ERROR_INVALID_VALUE Result = C.CUDA_ERROR_INVALID_VALUE ERROR_OUT_OF_MEMORY Result = C.CUDA_ERROR_OUT_OF_MEMORY ERROR_NOT_INITIALIZED Result = C.CUDA_ERROR_NOT_INITIALIZED ERROR_DEINITIALIZED Result = C.CUDA_ERROR_DEINITIALIZED ERROR_PROFILER_DISABLED Result = C.CUDA_ERROR_PROFILER_DISABLED ERROR_PROFILER_NOT_INITIALIZED Result = C.CUDA_ERROR_PROFILER_NOT_INITIALIZED ERROR_PROFILER_ALREADY_STARTED Result = C.CUDA_ERROR_PROFILER_ALREADY_STARTED ERROR_PROFILER_ALREADY_STOPPED Result = C.CUDA_ERROR_PROFILER_ALREADY_STOPPED ERROR_NO_DEVICE Result = C.CUDA_ERROR_NO_DEVICE ERROR_INVALID_DEVICE Result = C.CUDA_ERROR_INVALID_DEVICE ERROR_INVALID_IMAGE Result = C.CUDA_ERROR_INVALID_IMAGE ERROR_INVALID_CONTEXT Result = C.CUDA_ERROR_INVALID_CONTEXT ERROR_CONTEXT_ALREADY_CURRENT Result = C.CUDA_ERROR_CONTEXT_ALREADY_CURRENT ERROR_MAP_FAILED Result = C.CUDA_ERROR_MAP_FAILED ERROR_UNMAP_FAILED Result = C.CUDA_ERROR_UNMAP_FAILED ERROR_ARRAY_IS_MAPPED Result = C.CUDA_ERROR_ARRAY_IS_MAPPED ERROR_ALREADY_MAPPED Result = C.CUDA_ERROR_ALREADY_MAPPED ERROR_NO_BINARY_FOR_GPU Result = C.CUDA_ERROR_NO_BINARY_FOR_GPU ERROR_ALREADY_ACQUIRED Result = C.CUDA_ERROR_ALREADY_ACQUIRED ERROR_NOT_MAPPED Result = C.CUDA_ERROR_NOT_MAPPED ERROR_NOT_MAPPED_AS_ARRAY Result = C.CUDA_ERROR_NOT_MAPPED_AS_ARRAY ERROR_NOT_MAPPED_AS_POINTER Result = C.CUDA_ERROR_NOT_MAPPED_AS_POINTER ERROR_ECC_UNCORRECTABLE Result = C.CUDA_ERROR_ECC_UNCORRECTABLE 
ERROR_UNSUPPORTED_LIMIT Result = C.CUDA_ERROR_UNSUPPORTED_LIMIT ERROR_CONTEXT_ALREADY_IN_USE Result = C.CUDA_ERROR_CONTEXT_ALREADY_IN_USE ERROR_INVALID_SOURCE Result = C.CUDA_ERROR_INVALID_SOURCE ERROR_FILE_NOT_FOUND Result = C.CUDA_ERROR_FILE_NOT_FOUND ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND Result = C.CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND ERROR_SHARED_OBJECT_INIT_FAILED Result = C.CUDA_ERROR_SHARED_OBJECT_INIT_FAILED ERROR_OPERATING_SYSTEM Result = C.CUDA_ERROR_OPERATING_SYSTEM ERROR_INVALID_HANDLE Result = C.CUDA_ERROR_INVALID_HANDLE ERROR_NOT_FOUND Result = C.CUDA_ERROR_NOT_FOUND ERROR_NOT_READY Result = C.CUDA_ERROR_NOT_READY ERROR_LAUNCH_FAILED Result = C.CUDA_ERROR_LAUNCH_FAILED ERROR_LAUNCH_OUT_OF_RESOURCES Result = C.CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES ERROR_LAUNCH_TIMEOUT Result = C.CUDA_ERROR_LAUNCH_TIMEOUT ERROR_LAUNCH_INCOMPATIBLE_TEXTURING Result = C.CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING ERROR_PEER_ACCESS_ALREADY_ENABLED Result = C.CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED ERROR_PEER_ACCESS_NOT_ENABLED Result = C.CUDA_ERROR_PEER_ACCESS_NOT_ENABLED ERROR_PRIMARY_CONTEXT_ACTIVE Result = C.CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE ERROR_CONTEXT_IS_DESTROYED Result = C.CUDA_ERROR_CONTEXT_IS_DESTROYED ERROR_ASSERT Result = C.CUDA_ERROR_ASSERT ERROR_TOO_MANY_PEERS Result = C.CUDA_ERROR_TOO_MANY_PEERS ERROR_HOST_MEMORY_ALREADY_REGISTERED Result = C.CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED ERROR_HOST_MEMORY_NOT_REGISTERED Result = C.CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED ERROR_UNKNOWN Result = C.CUDA_ERROR_UNKNOWN ) func StreamQuery(stream Stream) Result Returns Success if all operations have completed, ErrorNotReady otherwise func (err Result) String() string Message string for the error type Stream uintptr CUDA stream. 
func StreamCreate() Stream Creates an asynchronous stream func (stream *Stream) Destroy() Destroys the asynchronous stream func (stream Stream) Query() Result Returns Success if all operations have completed, ErrorNotReady otherwise func (stream Stream) Synchronize() Blocks until the stream has completed. ================================================ FILE: cu/cgoflags.go ================================================ package cu // This file provides CGO flags to find CUDA libraries and headers. //#cgo LDFLAGS:-lcuda -lcudart // ////default location: //#cgo LDFLAGS:-L/usr/local/cuda/lib64 -L/usr/local/cuda/lib //#cgo CFLAGS: -I/usr/local/cuda/include/ // ////default location if not properly symlinked: //#cgo LDFLAGS:-L/usr/local/cuda-6.0/lib64 -L/usr/local/cuda-6.0/lib //#cgo LDFLAGS:-L/usr/local/cuda-5.5/lib64 -L/usr/local/cuda-5.5/lib //#cgo LDFLAGS:-L/usr/local/cuda-5.0/lib64 -L/usr/local/cuda-5.0/lib //#cgo CFLAGS: -I/usr/local/cuda-6.0/include/ //#cgo CFLAGS: -I/usr/local/cuda-5.5/include/ //#cgo CFLAGS: -I/usr/local/cuda-5.0/include/ // ////arch linux: //#cgo LDFLAGS:-L/opt/cuda/lib64 -L/opt/cuda/lib //#cgo CFLAGS: -I/opt/cuda/include // ////WINDOWS: //#cgo windows LDFLAGS:-LC:/cuda/v5.0/lib/x64 -LC:/cuda/v5.5/lib/x64 -LC:/cuda/v6.0/lib/x64 //#cgo windows CFLAGS: -IC:/cuda/v5.0/include -IC:/cuda/v5.5/include -IC:/cuda/v6.0/include import "C" ================================================ FILE: cu/context.go ================================================ package cu // This file implements CUDA driver context management //#include import "C" import "unsafe" // CUDA context. type Context uintptr // Create a CUDA context. func CtxCreate(flags uint, dev Device) Context { var ctx C.CUcontext err := Result(C.cuCtxCreate(&ctx, C.uint(flags), C.CUdevice(dev))) if err != SUCCESS { panic(err) } return Context(uintptr(unsafe.Pointer(ctx))) } //Destroys the CUDA context specified by ctx. 
If the context usage count is not equal to 1, or the context is current to any CPU thread other than the current one, this function fails. Floating contexts (detached from a CPU thread via cuCtxPopCurrent()) may be destroyed by this function. func CtxDestroy(ctx *Context) { err := Result(C.cuCtxDestroy(C.CUcontext(unsafe.Pointer(uintptr(*ctx))))) *ctx = 0 if err != SUCCESS { panic(err) } } //Destroys the CUDA context. func (ctx *Context) Destroy() { CtxDestroy(ctx) } // Returns the API version to create the context. func CtxGetApiVersion(ctx Context) (version int) { var cversion C.uint err := Result(C.cuCtxGetApiVersion(C.CUcontext(unsafe.Pointer(uintptr(ctx))), &cversion)) if err != SUCCESS { panic(err) } version = int(cversion) return } // Returns the API version to create the context. func (ctx Context) ApiVersion() (version int) { return CtxGetApiVersion(ctx) } // Gets the current active context. func CtxGetCurrent() Context { var ctx C.CUcontext err := Result(C.cuCtxGetCurrent(&ctx)) if err != SUCCESS { panic(err) } return Context(uintptr(unsafe.Pointer(ctx))) } // Returns the ordinal of the current context's device. func CtxGetDevice() Device { var dev C.CUdevice err := Result(C.cuCtxGetDevice(&dev)) if err != SUCCESS { panic(err) } return Device(dev) } // Sets the current active context. func CtxSetCurrent(ctx Context) { err := Result(C.cuCtxSetCurrent(C.CUcontext(unsafe.Pointer(uintptr(ctx))))) if err != SUCCESS { panic(err) } } // Sets the current active context. func (ctx Context) SetCurrent() { CtxSetCurrent(ctx) } // Blocks until the device has completed all preceding requested tasks, if the context was created with the CU_CTX_SCHED_BLOCKING_SYNC flag. func CtxSynchronize() { err := Result(C.cuCtxSynchronize()) if err != SUCCESS { panic(err) } } // Flags for CtxCreate const ( // If the number of contexts > number of CPUs, yield to other OS threads when waiting for the GPU, otherwise CUDA spin on the processor. 
CTX_SCHED_AUTO = C.CU_CTX_SCHED_AUTO // Spin when waiting for results from the GPU. CTX_SCHED_SPIN = C.CU_CTX_SCHED_SPIN // Yield its thread when waiting for results from the GPU. CTX_SCHED_YIELD = C.CU_CTX_SCHED_YIELD // Bock the CPU thread on a synchronization primitive when waiting for the GPU to finish work. CTX_BLOCKING_SYNC // Support mapped pinned allocations. This flag must be set in order to allocate pinned host memory that is accessible to the GPU. CTX_MAP_HOST = C.CU_CTX_MAP_HOST //Do not reduce local memory after resizing local memory for a kernel. CTX_LMEM_RESIZE_TO_MAX = C.CU_CTX_LMEM_RESIZE_TO_MAX ) ================================================ FILE: cu/context_test.go ================================================ package cu import ( "fmt" "testing" ) func TestContext(t *testing.T) { fmt.Println("CtxCreate") ctx := CtxCreate(CTX_SCHED_AUTO, 0) fmt.Println("CtxSetCurrent") CtxSetCurrent(ctx) fmt.Println("CtxGetApiVersion:", ctx.ApiVersion()) fmt.Println("CtxGetDevice:", CtxGetDevice()) (&ctx).Destroy() } func BenchmarkGetContext(b *testing.B) { b.StopTimer() ctx := CtxCreate(CTX_SCHED_AUTO, 0) CtxSetCurrent(ctx) b.StartTimer() for i := 0; i < b.N; i++ { CtxGetCurrent() } } func BenchmarkSetContext(b *testing.B) { b.StopTimer() ctx := CtxCreate(CTX_SCHED_AUTO, 0) b.StartTimer() for i := 0; i < b.N; i++ { ctx.SetCurrent() } } ================================================ FILE: cu/device.go ================================================ package cu // This file implements CUDA driver device management //#include import "C" import () // CUDA Device number. type Device int // Returns the compute capability of the device. func DeviceComputeCapability(device Device) (major, minor int) { var maj, min C.int err := Result(C.cuDeviceComputeCapability(&maj, &min, C.CUdevice(device))) if err != SUCCESS { panic(err) } major = int(maj) minor = int(min) return } // Returns the compute capability of the device. 
func (device Device) ComputeCapability() (major, minor int) { return DeviceComputeCapability(device) } // Returns in a device handle given an ordinal in the range [0, DeviceGetCount()-1]. func DeviceGet(ordinal int) Device { var device C.CUdevice err := Result(C.cuDeviceGet(&device, C.int(ordinal))) if err != SUCCESS { panic(err) } return Device(device) } // Gets the value of a device attribute. func DeviceGetAttribute(attrib DeviceAttribute, dev Device) int { var attr C.int err := Result(C.cuDeviceGetAttribute(&attr, C.CUdevice_attribute(attrib), C.CUdevice(dev))) if err != SUCCESS { panic(err) } return int(attr) } // Gets the value of a device attribute. func (dev Device) Attribute(attrib DeviceAttribute) int { return DeviceGetAttribute(attrib, dev) } // Returns the number of devices with compute capability greater than or equal to 1.0 that are available for execution. func DeviceGetCount() int { var count C.int err := Result(C.cuDeviceGetCount(&count)) if err != SUCCESS { panic(err) } return int(count) } // Gets the name of the device. func DeviceGetName(dev Device) string { size := 256 buf := make([]byte, size) cstr := C.CString(string(buf)) err := Result(C.cuDeviceGetName(cstr, C.int(size), C.CUdevice(dev))) if err != SUCCESS { panic(err) } return C.GoString(cstr) } // Gets the name of the device. func (dev Device) Name() string { return DeviceGetName(dev) } // Device properties type DevProp struct { MaxThreadsPerBlock int MaxThreadsDim [3]int MaxGridSize [3]int SharedMemPerBlock int TotalConstantMemory int SIMDWidth int MemPitch int RegsPerBlock int ClockRate int TextureAlign int } // Returns the device's properties. 
func DeviceGetProperties(dev Device) (prop DevProp) { var cprop C.CUdevprop err := Result(C.cuDeviceGetProperties(&cprop, C.CUdevice(dev))) if err != SUCCESS { panic(err) } prop.MaxThreadsPerBlock = int(cprop.maxThreadsPerBlock) prop.MaxThreadsDim[0] = int(cprop.maxThreadsDim[0]) prop.MaxThreadsDim[1] = int(cprop.maxThreadsDim[1]) prop.MaxThreadsDim[2] = int(cprop.maxThreadsDim[2]) prop.MaxGridSize[0] = int(cprop.maxGridSize[0]) prop.MaxGridSize[1] = int(cprop.maxGridSize[1]) prop.MaxGridSize[2] = int(cprop.maxGridSize[2]) prop.SharedMemPerBlock = int(cprop.sharedMemPerBlock) prop.TotalConstantMemory = int(cprop.totalConstantMemory) prop.SIMDWidth = int(cprop.SIMDWidth) prop.MemPitch = int(cprop.memPitch) prop.RegsPerBlock = int(cprop.regsPerBlock) prop.ClockRate = int(cprop.clockRate) prop.TextureAlign = int(cprop.textureAlign) return } // Returns the device's properties. func (dev Device) Properties() DevProp { return DeviceGetProperties(dev) } // Returns the total amount of memory available on the device in bytes. func (device Device) TotalMem() int64 { return DeviceTotalMem(device) } // Returns the total amount of memory available on the device in bytes. 
func DeviceTotalMem(device Device) int64 {
	var bytes C.size_t
	err := Result(C.cuDeviceTotalMem(&bytes, C.CUdevice(device)))
	if err != SUCCESS {
		panic(err)
	}
	return int64(bytes)
}

// DeviceAttribute is a query key for DeviceGetAttribute, mirroring CUdevice_attribute.
type DeviceAttribute int

const (
	MAX_THREADS_PER_BLOCK            DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK            // Maximum number of threads per block
	MAX_BLOCK_DIM_X                  DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X                  // Maximum block dimension X
	MAX_BLOCK_DIM_Y                  DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y                  // Maximum block dimension Y
	MAX_BLOCK_DIM_Z                  DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z                  // Maximum block dimension Z
	MAX_GRID_DIM_X                   DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X                   // Maximum grid dimension X
	MAX_GRID_DIM_Y                   DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y                   // Maximum grid dimension Y
	MAX_GRID_DIM_Z                   DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z                   // Maximum grid dimension Z
	MAX_SHARED_MEMORY_PER_BLOCK      DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK      // Maximum shared memory available per block in bytes
	TOTAL_CONSTANT_MEMORY            DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY            // Memory available on device for __constant__ variables in a CUDA C kernel in bytes
	WARP_SIZE                        DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_WARP_SIZE                        // Warp size in threads
	MAX_PITCH                        DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_PITCH                        // Maximum pitch in bytes allowed by memory copies
	MAX_REGISTERS_PER_BLOCK          DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK          // Maximum number of 32-bit registers available per block
	CLOCK_RATE                       DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CLOCK_RATE                       // Peak clock frequency in kilohertz
	TEXTURE_ALIGNMENT                DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT                // Alignment requirement for textures
	MULTIPROCESSOR_COUNT             DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT             // Number of multiprocessors on device
	KERNEL_EXEC_TIMEOUT              DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT              // Specifies whether there is a run time limit on kernels
	INTEGRATED                       DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_INTEGRATED                       // Device is integrated with host memory
	CAN_MAP_HOST_MEMORY              DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY              // Device can map host memory into CUDA address space
	COMPUTE_MODE                     DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_COMPUTE_MODE                     // Compute mode (See ::CUcomputemode for details)
	MAXIMUM_TEXTURE1D_WIDTH          DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH          // Maximum 1D texture width
	MAXIMUM_TEXTURE2D_WIDTH          DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH          // Maximum 2D texture width
	MAXIMUM_TEXTURE2D_HEIGHT         DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT         // Maximum 2D texture height
	MAXIMUM_TEXTURE3D_WIDTH          DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH          // Maximum 3D texture width
	MAXIMUM_TEXTURE3D_HEIGHT         DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT         // Maximum 3D texture height
	MAXIMUM_TEXTURE3D_DEPTH          DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH          // Maximum 3D texture depth
	MAXIMUM_TEXTURE2D_LAYERED_WIDTH  DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH  // Maximum 2D layered texture width
	MAXIMUM_TEXTURE2D_LAYERED_HEIGHT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT // Maximum 2D layered texture height
	MAXIMUM_TEXTURE2D_LAYERED_LAYERS DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS // Maximum layers in a 2D layered texture
	SURFACE_ALIGNMENT                DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT                // Alignment requirement for surfaces
	CONCURRENT_KERNELS               DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS               // Device can possibly execute multiple kernels concurrently
	ECC_ENABLED                      DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_ECC_ENABLED                      // Device has ECC support enabled
	PCI_BUS_ID                       DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_PCI_BUS_ID                       // PCI bus ID of the device
	PCI_DEVICE_ID                    DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID                    // PCI device ID of the device
	TCC_DRIVER                       DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TCC_DRIVER                       // Device is using TCC driver model
	MEMORY_CLOCK_RATE                DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE                // Peak memory clock frequency in kilohertz
	GLOBAL_MEMORY_BUS_WIDTH          DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH          // Global memory bus width in bits
	L2_CACHE_SIZE                    DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE                    // Size of L2 cache in bytes
	MAX_THREADS_PER_MULTIPROCESSOR   DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR   // Maximum resident threads per multiprocessor
	ASYNC_ENGINE_COUNT               DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT               // Number of asynchronous engines
	UNIFIED_ADDRESSING               DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING               // Device uses shares a unified address space with the host
	MAXIMUM_TEXTURE1D_LAYERED_WIDTH  DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH  // Maximum 1D layered texture width
	MAXIMUM_TEXTURE1D_LAYERED_LAYERS DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS // Maximum layers in a 1D layered texture
)

================================================ FILE: cu/device_test.go ================================================
package cu

import (
	"fmt"
	"testing"
)

// Enumerates all devices and prints their name, compute capability, total
// memory, every DeviceAttribute, and the DevProp struct. Output-only smoke
// test: it never calls t.Fail.
func TestDevice(t *testing.T) {
	fmt.Println("DeviceGetCount:", DeviceGetCount())
	// Note: DeviceGetCount() is re-evaluated each iteration; cheap, but could be hoisted.
	for i := 0; i < DeviceGetCount(); i++ {
		fmt.Println("DeviceGet", i)
		dev := DeviceGet(i)
		major, minor := dev.ComputeCapability()
		fmt.Println("Name: ", dev.Name())
		fmt.Println("ComputeCapability: ", major, minor)
		fmt.Println("TotalMem: ", dev.TotalMem())
		fmt.Println("ATTRIBUTE_MAX_THREADS_PER_BLOCK :", dev.Attribute(MAX_THREADS_PER_BLOCK))
		fmt.Println("ATTRIBUTE_MAX_BLOCK_DIM_X :", dev.Attribute(MAX_BLOCK_DIM_X))
		fmt.Println("ATTRIBUTE_MAX_BLOCK_DIM_Y :", dev.Attribute(MAX_BLOCK_DIM_Y))
		fmt.Println("ATTRIBUTE_MAX_BLOCK_DIM_Z :", dev.Attribute(MAX_BLOCK_DIM_Z))
		fmt.Println("ATTRIBUTE_MAX_GRID_DIM_X :", dev.Attribute(MAX_GRID_DIM_X))
		fmt.Println("ATTRIBUTE_MAX_GRID_DIM_Y :", dev.Attribute(MAX_GRID_DIM_Y))
		fmt.Println("ATTRIBUTE_MAX_GRID_DIM_Z :", dev.Attribute(MAX_GRID_DIM_Z))
		fmt.Println("ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK :", dev.Attribute(MAX_SHARED_MEMORY_PER_BLOCK))
		fmt.Println("ATTRIBUTE_TOTAL_CONSTANT_MEMORY :", dev.Attribute(TOTAL_CONSTANT_MEMORY))
		fmt.Println("ATTRIBUTE_WARP_SIZE :", dev.Attribute(WARP_SIZE))
		fmt.Println("ATTRIBUTE_MAX_PITCH :", dev.Attribute(MAX_PITCH))
		fmt.Println("ATTRIBUTE_MAX_REGISTERS_PER_BLOCK :", dev.Attribute(MAX_REGISTERS_PER_BLOCK))
		fmt.Println("ATTRIBUTE_CLOCK_RATE :", dev.Attribute(CLOCK_RATE))
		fmt.Println("ATTRIBUTE_TEXTURE_ALIGNMENT :", dev.Attribute(TEXTURE_ALIGNMENT))
		fmt.Println("ATTRIBUTE_MULTIPROCESSOR_COUNT :", dev.Attribute(MULTIPROCESSOR_COUNT))
		fmt.Println("ATTRIBUTE_KERNEL_EXEC_TIMEOUT :", dev.Attribute(KERNEL_EXEC_TIMEOUT))
		fmt.Println("ATTRIBUTE_INTEGRATED :", dev.Attribute(INTEGRATED))
		fmt.Println("ATTRIBUTE_CAN_MAP_HOST_MEMORY :", dev.Attribute(CAN_MAP_HOST_MEMORY))
		fmt.Println("ATTRIBUTE_COMPUTE_MODE :", dev.Attribute(COMPUTE_MODE))
		fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH :", dev.Attribute(MAXIMUM_TEXTURE1D_WIDTH))
		fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH :", dev.Attribute(MAXIMUM_TEXTURE2D_WIDTH))
		fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT :", dev.Attribute(MAXIMUM_TEXTURE2D_HEIGHT))
		fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH :", dev.Attribute(MAXIMUM_TEXTURE3D_WIDTH))
		fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT :", dev.Attribute(MAXIMUM_TEXTURE3D_HEIGHT))
		fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH :", dev.Attribute(MAXIMUM_TEXTURE3D_DEPTH))
		fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH :", dev.Attribute(MAXIMUM_TEXTURE2D_LAYERED_WIDTH))
		fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT:", dev.Attribute(MAXIMUM_TEXTURE2D_LAYERED_HEIGHT))
		fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS:", dev.Attribute(MAXIMUM_TEXTURE2D_LAYERED_LAYERS))
		fmt.Println("ATTRIBUTE_SURFACE_ALIGNMENT :", dev.Attribute(SURFACE_ALIGNMENT))
		fmt.Println("ATTRIBUTE_CONCURRENT_KERNELS :", dev.Attribute(CONCURRENT_KERNELS))
		fmt.Println("ATTRIBUTE_ECC_ENABLED :", dev.Attribute(ECC_ENABLED))
		fmt.Println("ATTRIBUTE_PCI_BUS_ID :", dev.Attribute(PCI_BUS_ID))
		fmt.Println("ATTRIBUTE_PCI_DEVICE_ID :", dev.Attribute(PCI_DEVICE_ID))
		fmt.Println("ATTRIBUTE_TCC_DRIVER :", dev.Attribute(TCC_DRIVER))
		fmt.Println("ATTRIBUTE_MEMORY_CLOCK_RATE :", dev.Attribute(MEMORY_CLOCK_RATE))
		fmt.Println("ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH :", dev.Attribute(GLOBAL_MEMORY_BUS_WIDTH))
		fmt.Println("ATTRIBUTE_L2_CACHE_SIZE :", dev.Attribute(L2_CACHE_SIZE))
		fmt.Println("ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR :", dev.Attribute(MAX_THREADS_PER_MULTIPROCESSOR))
		fmt.Println("ATTRIBUTE_ASYNC_ENGINE_COUNT :", dev.Attribute(ASYNC_ENGINE_COUNT))
		fmt.Println("ATTRIBUTE_UNIFIED_ADDRESSING :", dev.Attribute(UNIFIED_ADDRESSING))
		fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH :", dev.Attribute(MAXIMUM_TEXTURE1D_LAYERED_WIDTH))
		fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS:", dev.Attribute(MAXIMUM_TEXTURE1D_LAYERED_LAYERS))
		fmt.Printf("Properties:%#v\n", dev.Properties())
	}
}

================================================ FILE: cu/dim3.go ================================================
package cu

// Dim3 holds a 3-component size (X, Y, Z), e.g. a grid or block dimension.
type Dim3 struct {
	X, Y, Z int
}

================================================ FILE: cu/doc.go ================================================
// Go bindings for the CUDA driver API.
package cu

================================================ FILE: cu/execution.go ================================================
package cu

// This file implements execution of CUDA kernels

//#include
import "C"

import (
	"unsafe"
)

const pointerSize = 8 // sorry, 64 bits only.
// Launches the CUDA function f on the geometry given by gridDim{X,Y,Z} and
// blockDim{X,Y,Z}, with sharedMemBytes of dynamic shared memory, on stream.
// kernelParams holds one pointer per kernel argument value.
// Panics on a CUDA error.
func LaunchKernel(f Function, gridDimX, gridDimY, gridDimZ int, blockDimX, blockDimY, blockDimZ int, sharedMemBytes int, stream Stream, kernelParams []unsafe.Pointer) {
	// Since Go 1.6, a cgo argument cannot have a Go pointer to Go pointer,
	// so we copy the argument values to C memory first.
	argv := C.malloc(C.size_t(len(kernelParams) * pointerSize))
	argp := C.malloc(C.size_t(len(kernelParams) * pointerSize))
	defer C.free(argv)
	defer C.free(argp)
	for i := range kernelParams {
		// NOTE(review): a full 8 bytes are copied from each argument regardless
		// of its actual size; assumes every pointed-to Go value can be read as
		// a uint64 without running past its allocation — confirm with callers.
		*((*unsafe.Pointer)(offset(argp, i))) = offset(argv, i)       // argp[i] = &argv[i]
		*((*uint64)(offset(argv, i))) = *((*uint64)(kernelParams[i])) // argv[i] = *kernelParams[i]
	}
	err := Result(C.cuLaunchKernel(
		C.CUfunction(unsafe.Pointer(uintptr(f))),
		C.uint(gridDimX), C.uint(gridDimY), C.uint(gridDimZ),
		C.uint(blockDimX), C.uint(blockDimY), C.uint(blockDimZ),
		C.uint(sharedMemBytes),
		C.CUstream(unsafe.Pointer(uintptr(stream))),
		(*unsafe.Pointer)(argp),
		(*unsafe.Pointer)(unsafe.Pointer(uintptr(0))))) // extra launch options: none
	if err != SUCCESS {
		panic(err)
	}
}

// Returns ptr advanced by i pointer-sized slots (pointer arithmetic on a C allocation).
func offset(ptr unsafe.Pointer, i int) unsafe.Pointer {
	return unsafe.Pointer(uintptr(ptr) + pointerSize*uintptr(i))
}

================================================ FILE: cu/function.go ================================================
package cu

// This file implements manipulations on CUDA functions

//#include
import "C"

import (
	"unsafe"
)

// Represents a CUDA CUfunction, a reference to a function within a module.
type Function uintptr

// Gets the value of a function attribute (register count, shared memory size, ...).
// Panics on a CUDA error.
func FuncGetAttribute(attrib FunctionAttribute, function Function) int {
	var attr C.int
	err := Result(C.cuFuncGetAttribute(&attr, C.CUfunction_attribute(attrib), C.CUfunction(unsafe.Pointer(uintptr(function)))))
	if err != SUCCESS {
		panic(err)
	}
	return int(attr)
}

// Gets the value of a function attribute.
func (f Function) GetAttribute(attrib FunctionAttribute) int { return FuncGetAttribute(attrib, f) }

// FunctionAttribute is a query key for FuncGetAttribute, mirroring CUfunction_attribute.
type FunctionAttribute int

const (
	FUNC_A_MAX_THREADS_PER_BLOCK FunctionAttribute = C.CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK // The maximum number of threads per block, beyond which a launch of the function would fail.
	FUNC_A_SHARED_SIZE_BYTES     FunctionAttribute = C.CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES     // The size in bytes of statically-allocated shared memory required by this function.
	FUNC_A_CONST_SIZE_BYTES      FunctionAttribute = C.CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES      // The size in bytes of user-allocated constant memory required by this function.
	FUNC_A_LOCAL_SIZE_BYTES      FunctionAttribute = C.CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES      // The size in bytes of local memory used by each thread of this function.
	FUNC_A_NUM_REGS              FunctionAttribute = C.CU_FUNC_ATTRIBUTE_NUM_REGS              // The number of registers used by each thread of this function.
	FUNC_A_PTX_VERSION           FunctionAttribute = C.CU_FUNC_ATTRIBUTE_PTX_VERSION           // The PTX virtual architecture version for which the function was compiled.
	FUNC_A_BINARY_VERSION        FunctionAttribute = C.CU_FUNC_ATTRIBUTE_BINARY_VERSION        // The binary architecture version for which the function was compiled.
)

================================================ FILE: cu/init.go ================================================
package cu

// This file implements CUDA driver initialization

//#include
import "C"

// Initialize the CUDA driver API.
// Currently, flags must be 0.
// If Init() has not been called, any function from the driver API will panic with ERROR_NOT_INITIALIZED.
func Init(flags int) {
	err := Result(C.cuInit(C.uint(flags)))
	if err != SUCCESS {
		panic(err)
	}
}

================================================ FILE: cu/init_test.go ================================================
package cu

import (
	"fmt"
)

// needed for all other tests.
func init() {
	Init(0)
	ctx := CtxCreate(CTX_SCHED_AUTO, 0)
	CtxSetCurrent(ctx)
	fmt.Println("Created CUDA context")
}

================================================ FILE: cu/memory.go ================================================
package cu

// This file implements CUDA memory management on the driver level

//#include
import "C"

import (
	"fmt"
	"unsafe"
)

// DevicePtr is a device memory address (CUdeviceptr).
type DevicePtr uintptr

// Allocates a number of bytes of device memory.
func MemAlloc(bytes int64) DevicePtr {
	var devptr C.CUdeviceptr
	err := Result(C.cuMemAlloc(&devptr, C.size_t(bytes)))
	if err != SUCCESS {
		panic(err)
	}
	return DevicePtr(devptr)
}

// Frees device memory allocated by MemAlloc().
// It is safe to double-free.
func MemFree(p DevicePtr) {
	if p == DevicePtr(uintptr(0)) {
		return // Already freed
	}
	err := Result(C.cuMemFree(C.CUdeviceptr(p)))
	if err != SUCCESS {
		panic(err)
	}
}

// Frees device memory allocated by MemAlloc().
// It is safe to double-free a zero pointer, but note that the value receiver
// means the caller's copy of ptr is NOT overwritten with NULL.
func (ptr DevicePtr) Free() {
	MemFree(ptr)
}

// Copies a number of bytes on the current device.
// Requires unified addressing to be supported.
// See also: MemcpyDtoD().
func Memcpy(dst, src DevicePtr, bytes int64) {
	err := Result(C.cuMemcpy(C.CUdeviceptr(dst), C.CUdeviceptr(src), C.size_t(bytes)))
	if err != SUCCESS {
		panic(err)
	}
}

// Asynchronously copies a number of bytes on the current device.
func MemcpyAsync(dst, src DevicePtr, bytes int64, stream Stream) {
	err := Result(C.cuMemcpyAsync(C.CUdeviceptr(dst), C.CUdeviceptr(src), C.size_t(bytes), C.CUstream(unsafe.Pointer(uintptr(stream)))))
	if err != SUCCESS {
		panic(err)
	}
}

// Copies a number of bytes from device to device.
func MemcpyDtoD(dst, src DevicePtr, bytes int64) {
	err := Result(C.cuMemcpyDtoD(C.CUdeviceptr(dst), C.CUdeviceptr(src), C.size_t(bytes)))
	if err != SUCCESS {
		panic(err)
	}
}

// Asynchronously copies a number of bytes from device to device.
func MemcpyDtoDAsync(dst, src DevicePtr, bytes int64, stream Stream) {
	err := Result(C.cuMemcpyDtoDAsync(C.CUdeviceptr(dst), C.CUdeviceptr(src), C.size_t(bytes), C.CUstream(unsafe.Pointer(uintptr(stream)))))
	if err != SUCCESS {
		panic(err)
	}
}

// Copies a number of bytes from host to device.
func MemcpyHtoD(dst DevicePtr, src unsafe.Pointer, bytes int64) {
	err := Result(C.cuMemcpyHtoD(C.CUdeviceptr(dst), src, C.size_t(bytes)))
	if err != SUCCESS {
		panic(err)
	}
}

// Asynchronously copies a number of bytes from host to device.
// The host memory must be page-locked (see MemRegister)
func MemcpyHtoDAsync(dst DevicePtr, src unsafe.Pointer, bytes int64, stream Stream) {
	err := Result(C.cuMemcpyHtoDAsync(C.CUdeviceptr(dst), src, C.size_t(bytes), C.CUstream(unsafe.Pointer(uintptr(stream)))))
	if err != SUCCESS {
		panic(err)
	}
}

// Copies a number of bytes from device to host.
func MemcpyDtoH(dst unsafe.Pointer, src DevicePtr, bytes int64) {
	err := Result(C.cuMemcpyDtoH(dst, C.CUdeviceptr(src), C.size_t(bytes)))
	if err != SUCCESS {
		panic(err)
	}
}

// Asynchronously copies a number of bytes from device to host.
// The host memory must be page-locked (see MemRegister)
func MemcpyDtoHAsync(dst unsafe.Pointer, src DevicePtr, bytes int64, stream Stream) {
	err := Result(C.cuMemcpyDtoHAsync(dst, C.CUdeviceptr(src), C.size_t(bytes), C.CUstream(unsafe.Pointer(uintptr(stream)))))
	if err != SUCCESS {
		panic(err)
	}
}

// Copies from device memory in one context (device) to another.
func MemcpyPeer(dst DevicePtr, dstCtx Context, src DevicePtr, srcCtx Context, bytes int64) { err := Result(C.cuMemcpyPeer(C.CUdeviceptr(dst), C.CUcontext(unsafe.Pointer(uintptr(dstCtx))), C.CUdeviceptr(src), C.CUcontext(unsafe.Pointer(uintptr(srcCtx))), C.size_t(bytes))) if err != SUCCESS { panic(err) } } // Asynchronously copies from device memory in one context (device) to another. func MemcpyPeerAsync(dst DevicePtr, dstCtx Context, src DevicePtr, srcCtx Context, bytes int64, stream Stream) { err := Result(C.cuMemcpyPeerAsync(C.CUdeviceptr(dst), C.CUcontext(unsafe.Pointer(uintptr(dstCtx))), C.CUdeviceptr(src), C.CUcontext(unsafe.Pointer(uintptr(srcCtx))), C.size_t(bytes), C.CUstream(unsafe.Pointer(uintptr(stream))))) if err != SUCCESS { panic(err) } } // Returns the base address and size of the allocation (by MemAlloc) that contains the input pointer ptr. func MemGetAddressRange(ptr DevicePtr) (bytes int64, base DevicePtr) { var cbytes C.size_t var cptr C.CUdeviceptr err := Result(C.cuMemGetAddressRange(&cptr, &cbytes, C.CUdeviceptr(ptr))) if err != SUCCESS { panic(err) } bytes = int64(cbytes) base = DevicePtr(cptr) return } // Returns the base address and size of the allocation (by MemAlloc) that contains the input pointer ptr. func (ptr DevicePtr) GetAddressRange() (bytes int64, base DevicePtr) { return MemGetAddressRange(ptr) } // Returns the size of the allocation (by MemAlloc) that contains the input pointer ptr. func (ptr DevicePtr) Bytes() (bytes int64) { bytes, _ = MemGetAddressRange(ptr) return } // Returns the free and total amount of memroy in the current Context (in bytes). func MemGetInfo() (free, total int64) { var cfree, ctotal C.size_t err := Result(C.cuMemGetInfo(&cfree, &ctotal)) if err != SUCCESS { panic(err) } free = int64(cfree) total = int64(ctotal) return } // Page-locks memory specified by the pointer and bytes. 
// The pointer and byte size must be aligned to the host page size (4KB)
// See also: MemHostUnregister()
// doesn't link with cuda6.5
//func MemHostRegister(ptr unsafe.Pointer, bytes int64, flags MemHostRegisterFlag) {
//	err := Result(C.cuMemHostRegister(ptr, C.size_t(bytes), C.uint(flags)))
//	if err != SUCCESS {
//		panic(err)
//	}
//}

// Unmaps memory locked by MemHostRegister().
// doesn't link with cuda6.5
//func MemHostUnregister(ptr unsafe.Pointer) {
//	err := Result(C.cuMemHostUnregister(ptr))
//	if err != SUCCESS {
//		panic(err)
//	}
//}

// Allocates page-locked host memory; free with MemFreeHost.
// Panics on a CUDA error.
func MemAllocHost(bytes int64) unsafe.Pointer {
	var p unsafe.Pointer
	err := Result(C.cuMemAllocHost(&p, C.size_t(bytes)))
	if err != SUCCESS {
		panic(err)
	}
	return p
}

// Frees host memory allocated by MemAllocHost.
func MemFreeHost(ptr unsafe.Pointer) {
	err := Result(C.cuMemFreeHost(ptr))
	if err != SUCCESS {
		panic(err)
	}
}

// MemHostRegisterFlag controls how host memory is pinned.
type MemHostRegisterFlag int

// Flag for MemHostRegister
const (
	// Memory is pinned in all CUDA contexts.
	MEMHOSTREGISTER_PORTABLE MemHostRegisterFlag = C.CU_MEMHOSTREGISTER_PORTABLE
	// Maps the allocation in CUDA address space. TODO(a): cuMemHostGetDevicePointer()
	MEMHOSTREGISTER_DEVICEMAP MemHostRegisterFlag = C.CU_MEMHOSTREGISTER_DEVICEMAP
)

// String formats the device pointer like an ordinary pointer value.
func (p DevicePtr) String() string {
	return fmt.Sprint(unsafe.Pointer(uintptr(p)))
}

// Type size in bytes
const (
	SIZEOF_FLOAT32    = 4
	SIZEOF_FLOAT64    = 8
	SIZEOF_COMPLEX64  = 8
	SIZEOF_COMPLEX128 = 16
)

// Physical memory type of device pointer.
type MemoryType uint

const (
	MemoryTypeHost    MemoryType = C.CU_MEMORYTYPE_HOST
	MemoryTypeDevice  MemoryType = C.CU_MEMORYTYPE_DEVICE
	MemoryTypeArray   MemoryType = C.CU_MEMORYTYPE_ARRAY
	MemoryTypeUnified MemoryType = C.CU_MEMORYTYPE_UNIFIED
)

// Human-readable names for the MemoryType values.
var memorytype = map[MemoryType]string{
	MemoryTypeHost:    "MemoryTypeHost",
	MemoryTypeDevice:  "MemoryTypeDevice",
	MemoryTypeArray:   "MemoryTypeArray",
	MemoryTypeUnified: "MemoryTypeUnified"}

func (t MemoryType) String() string {
	if s, ok := memorytype[t]; ok {
		return s
	}
	return "MemoryTypeUnknown"
}

// Returns the physical memory type that ptr addresses.
// Unlike most wrappers in this package, the Result is returned, not panicked on.
func PointerGetAttributeMemoryType(ptr DevicePtr) (t MemoryType, err Result) {
	var typ uint64 // foresee enough memory just to be safe
	err = Result(C.cuPointerGetAttribute(unsafe.Pointer(&typ), C.CU_POINTER_ATTRIBUTE_MEMORY_TYPE, C.CUdeviceptr(uintptr(ptr))))
	return MemoryType(uint(typ)), err
}

// Returns the physical memory type that ptr addresses.
func (ptr DevicePtr) MemoryType() MemoryType {
	t, err := PointerGetAttributeMemoryType(ptr)
	if err != SUCCESS {
		panic(err)
	}
	return t
}

================================================ FILE: cu/memory_test.go ================================================
package cu

import (
	"fmt"
	"math"
	"testing"
	"unsafe"
)

// Repeatedly allocates and frees 16MB blocks, via both the method and the
// free function; exercises the allocator for leaks/crashes.
func TestMalloc(t *testing.T) {
	for i := 0; i < 1024; i++ {
		pointer := MemAlloc(16 * 1024 * 1024)
		pointer.Free()
	}
	for i := 0; i < 1024; i++ {
		pointer := MemAlloc(16 * 1024 * 1024)
		MemFree(pointer)
	}
}

func BenchmarkMallocFree1B(b *testing.B) {
	for i := 0; i < b.N; i++ {
		m := MemAlloc(1)
		m.Free()
	}
}

func BenchmarkMallocFree1kB(b *testing.B) {
	for i := 0; i < b.N; i++ {
		m := MemAlloc(1024)
		m.Free()
	}
}

func BenchmarkMallocFree1MB(b *testing.B) {
	for i := 0; i < b.N; i++ {
		m := MemAlloc(1024 * 1024)
		m.Free()
	}
}

// Checks that MemGetAddressRange/GetAddressRange/Bytes report the allocation's
// own size and base address.
func TestMemAddressRange(t *testing.T) {
	N := 12345
	ptr := MemAlloc(int64(N))
	size, base := MemGetAddressRange(ptr)
	if size != int64(N) {
		t.Fail()
	}
	if base != ptr {
		t.Fail()
	}
	size, base = 0, DevicePtr(0)
	size, base = ptr.GetAddressRange()
	if ptr.Bytes() != int64(N) {
		t.Fail()
	}
}

// Sanity-checks the free/total numbers reported by MemGetInfo.
func TestMemGetInfo(t *testing.T) {
	free, total := MemGetInfo()
	fmt.Println("MemGetInfo: ", free, "/", total)
	if free > total {
		t.Fail()
	}
	if total == 0 {
		t.Fail()
	}
}

// Fills a device buffer with 42, overwrites the first half with 21 on a
// stream, copies back and checks both halves.
func TestMemsetAsync(t *testing.T) {
	N := int64(32 * 1024)
	host1 := make([]float32, N)
	for i := range host1 {
		host1[i] = float32(i)
	}
	host2 := make([]float32, N)
	dev1 := MemAlloc(int64(4 * N))
	MemcpyHtoD(dev1, (unsafe.Pointer(&host1[0])), 4*N)
	str := StreamCreate()
	MemsetD32Async(dev1, math.Float32bits(42), N, str)
	MemsetD32Async(dev1, math.Float32bits(21), N/2, str)
	// NOTE(review): synchronous DtoH before str.Synchronize(); relies on the
	// legacy default-stream serializing with str — confirm.
	MemcpyDtoH((unsafe.Pointer(&host2[0])), dev1, 4*N)
	str.Synchronize()
	(&str).Destroy()
	for i := 0; i < len(host2)/2; i++ {
		if host2[i] != 21 {
			t.Fail()
		}
	}
	for i := len(host2) / 2; i < len(host2); i++ {
		if host2[i] != 42 {
			t.Fail()
		}
	}
	dev1.Free()
}

// Same as TestMemsetAsync but with the synchronous memset functions.
func TestMemset(t *testing.T) {
	N := int64(32 * 1024)
	host1 := make([]float32, N)
	for i := range host1 {
		host1[i] = float32(i)
	}
	host2 := make([]float32, N)
	dev1 := MemAlloc(int64(4 * N))
	MemcpyHtoD(dev1, (unsafe.Pointer(&host1[0])), 4*N)
	MemsetD32(dev1, math.Float32bits(42), N)
	MemsetD32(dev1, math.Float32bits(21), N/2)
	MemcpyDtoH((unsafe.Pointer(&host2[0])), dev1, 4*N)
	for i := 0; i < len(host2)/2; i++ {
		if host2[i] != 21 {
			t.Fail()
		}
	}
	for i := len(host2) / 2; i < len(host2); i++ {
		if host2[i] != 42 {
			t.Fail()
		}
	}
	dev1.Free()
}

// Round-trips data host -> dev1 -> dev2 -> host and verifies it.
func TestMemcpy(t *testing.T) {
	N := int64(32 * 1024)
	host1 := make([]float32, N)
	for i := range host1 {
		host1[i] = float32(i)
	}
	host2 := make([]float32, N)
	dev1 := MemAlloc(int64(4 * N))
	dev2 := MemAlloc(int64(4 * N))
	MemcpyHtoD(dev1, (unsafe.Pointer(&host1[0])), 4*N)
	MemcpyDtoD(dev2, dev1, 4*N)
	MemcpyDtoH((unsafe.Pointer(&host2[0])), dev2, 4*N)
	for i := range host2 {
		if host2[i] != float32(i) {
			t.Fail()
		}
	}
	dev1.Free()
	dev2.Free()
}

// Same round-trip as TestMemcpy, using the async copies on one stream.
// NOTE(review): the host slices are not page-locked although the async copy
// docs require it — works in practice, confirm intent.
func TestMemcpyAsync(t *testing.T) {
	N := int64(32 * 1024)
	host1 := make([]float32, N)
	for i := range host1 {
		host1[i] = float32(i)
	}
	host2 := make([]float32, N)
	dev1 := MemAlloc(int64(4 * N))
	dev2 := MemAlloc(int64(4 * N))
	stream := StreamCreate()
	MemcpyHtoDAsync(dev1, (unsafe.Pointer(&host1[0])), 4*N, stream)
	MemcpyDtoDAsync(dev2, dev1, 4*N, stream)
	MemcpyDtoHAsync((unsafe.Pointer(&host2[0])), dev2, 4*N, stream)
	stream.Synchronize()
	for i := range host2 {
		if host2[i] != float32(i) {
			t.Fail()
		}
	}
	dev1.Free()
	dev2.Free()
}

// Identical to TestMemcpyAsync; the host-registration step it was named after
// is disabled (MemHostRegister does not link with cuda6.5).
func TestMemcpyAsyncRegistered(t *testing.T) {
	N := int64(32 * 1024)
	host1 := make([]float32, N)
	for i := range host1 {
		host1[i] = float32(i)
	}
	host2 := make([]float32, N)
	dev1 := MemAlloc(int64(4 * N))
	dev2 := MemAlloc(int64(4 * N))
	stream := StreamCreate()
	MemcpyHtoDAsync(dev1, (unsafe.Pointer(&host1[0])), 4*N, stream)
	MemcpyDtoDAsync(dev2, dev1, 4*N, stream)
	MemcpyDtoHAsync((unsafe.Pointer(&host2[0])), dev2, 4*N, stream)
	stream.Synchronize()
	for i := range host2 {
		if host2[i] != float32(i) {
			t.Fail()
		}
	}
	dev1.Free()
	dev2.Free()
}

// Measures HtoD + DtoD + DtoH bandwidth on 128MB buffers.
func BenchmarkMemcpy(b *testing.B) {
	b.StopTimer()
	N := int64(32 * 1024 * 1024)
	host1 := make([]float32, N)
	host2 := make([]float32, N)
	dev1 := MemAlloc(int64(4 * N))
	defer dev1.Free()
	dev2 := MemAlloc(int64(4 * N))
	defer dev2.Free()
	b.SetBytes(4 * N)
	b.StartTimer()
	for i := 0; i < b.N; i++ {
		MemcpyHtoD(dev1, (unsafe.Pointer(&host1[0])), 4*N)
		MemcpyDtoD(dev2, dev1, 4*N)
		MemcpyDtoH((unsafe.Pointer(&host2[0])), dev2, 4*N)
	}
}

================================================ FILE: cu/memset.go ================================================
package cu

// This file implements CUDA memset functions.

//#include
import "C"

import (
	"unsafe"
)

// Sets the first N 32-bit values of dst array to value.
// Asynchronous.
func MemsetD32(deviceptr DevicePtr, value uint32, N int64) {
	err := Result(C.cuMemsetD32(C.CUdeviceptr(deviceptr), C.uint(value), C.size_t(N)))
	if err != SUCCESS {
		panic(err)
	}
}

// Asynchronously sets the first N 32-bit values of dst array to value.
func MemsetD32Async(deviceptr DevicePtr, value uint32, N int64, stream Stream) {
	err := Result(C.cuMemsetD32Async(C.CUdeviceptr(deviceptr), C.uint(value), C.size_t(N), C.CUstream(unsafe.Pointer(uintptr(stream)))))
	if err != SUCCESS {
		panic(err)
	}
}

// Sets the first N 8-bit values of dst array to value.
// Asynchronous.
func MemsetD8(deviceptr DevicePtr, value uint8, N int64) {
	err := Result(C.cuMemsetD8(C.CUdeviceptr(deviceptr), C.uchar(value), C.size_t(N)))
	if err != SUCCESS {
		panic(err)
	}
}

// Asynchronously sets the first N 8-bit values of dst array to value.
func MemsetD8Async(deviceptr DevicePtr, value uint8, N int64, stream Stream) {
	err := Result(C.cuMemsetD8Async(C.CUdeviceptr(deviceptr), C.uchar(value), C.size_t(N), C.CUstream(unsafe.Pointer(uintptr(stream)))))
	if err != SUCCESS {
		panic(err)
	}
}

================================================ FILE: cu/module.go ================================================
package cu

// This file implements loading of CUDA ptx modules

//#include
import "C"

import (
	"unsafe"
)

// Represents a CUDA CUmodule, a reference to executable device code.
type Module uintptr

// Loads a compute module from file
// NOTE(review): the C.CString result is never freed with C.free — a small
// one-shot leak per load.
func ModuleLoad(fname string) Module {
	//fmt.Fprintln(os.Stderr, "driver.ModuleLoad", fname)
	var mod C.CUmodule
	err := Result(C.cuModuleLoad(&mod, C.CString(fname)))
	if err != SUCCESS {
		panic(err)
	}
	return Module(uintptr(unsafe.Pointer(mod)))
}

// Loads a compute module from string
// NOTE(review): same C.CString leak as ModuleLoad.
func ModuleLoadData(image string) Module {
	var mod C.CUmodule
	err := Result(C.cuModuleLoadData(&mod, unsafe.Pointer(C.CString(image))))
	if err != SUCCESS {
		panic(err)
	}
	return Module(uintptr(unsafe.Pointer(mod)))
}

// Returns a Function handle.
func ModuleGetFunction(module Module, name string) Function {
	var function C.CUfunction
	err := Result(C.cuModuleGetFunction(
		&function,
		C.CUmodule(unsafe.Pointer(uintptr(module))),
		C.CString(name)))
	if err != SUCCESS {
		panic(err)
	}
	return Function(uintptr(unsafe.Pointer(function)))
}

// Returns a Function handle.
func (m Module) GetFunction(name string) Function {
	return ModuleGetFunction(m, name)
}

================================================ FILE: cu/module_test.go ================================================
package cu

import (
	"testing"
	"unsafe"
	//"fmt"
)

// Loads the test PTX module, runs its testMemset kernel over the first half
// of a buffer, and verifies that exactly that half was set to 42.
func TestModule(test *testing.T) {
	// NOTE(review): absolute path — presumably should be the relative
	// "testdata/testmodule.ptx" so `go test` finds it; confirm against upstream.
	mod := ModuleLoad("/testdata/testmodule.ptx")
	f := mod.GetFunction("testMemset")

	N := 1000
	N4 := 4 * int64(N)
	a := make([]float32, N)
	A := MemAlloc(N4)
	defer A.Free()
	aptr := unsafe.Pointer(&a[0])
	MemcpyHtoD(A, aptr, N4)

	var value float32
	value = 42
	var n int
	n = N / 2 // only the first half of the buffer is touched by the kernel

	block := 128
	grid := DivUp(N, block)
	shmem := 0
	args := []unsafe.Pointer{unsafe.Pointer(&A), unsafe.Pointer(&value), unsafe.Pointer(&n)}
	LaunchKernel(f, grid, 1, 1, block, 1, 1, shmem, 0, args)

	MemcpyDtoH(aptr, A, N4)
	for i := 0; i < N/2; i++ {
		if a[i] != 42 {
			test.Fail()
		}
	}
	for i := N / 2; i < N; i++ {
		if a[i] != 0 {
			test.Fail()
		}
	}
	//fmt.Println(a)
}

// Integer division rounded up.
func DivUp(x, y int) int {
	return ((x - 1) / y) + 1
}

================================================ FILE: cu/peer.go ================================================
package cu

// This file implements CUDA unified addressing.

//#include
import "C"

import (
	"unsafe"
)

// Make allocations from the peer Context available to the current context.
func CtxEnablePeerAccess(peer Context) {
	err := Result(C.cuCtxEnablePeerAccess(C.CUcontext(unsafe.Pointer(uintptr(peer))), C.uint(0)))
	if err != SUCCESS {
		panic(err)
	}
}

// Make allocations from the peer Context available to the current context.
func (peer Context) EnablePeerAccess() {
	CtxEnablePeerAccess(peer)
}

// Reverses CtxEnablePeerAccess().
func CtxDisablePeerAccess(peer Context) {
	err := Result(C.cuCtxDisablePeerAccess(C.CUcontext(unsafe.Pointer(uintptr(peer)))))
	if err != SUCCESS {
		panic(err)
	}
}

// Reverses EnablePeerAccess().
func (peer Context) DisablePeerAccess() {
	CtxDisablePeerAccess(peer)
}

// Returns true if CtxEnablePeerAccess can be called on a context for dev and peerDev.
func DeviceCanAccessPeer(dev, peer Device) bool {
	var canAccessPeer C.int
	err := Result(C.cuDeviceCanAccessPeer(&canAccessPeer, C.CUdevice(dev), C.CUdevice(peer)))
	if err != SUCCESS {
		panic(err)
	}
	return int(canAccessPeer) != 0
}

// Returns true if CtxEnablePeerAccess can be called on a context for dev and peerDev.
func (dev Device) CanAccessPeer(peer Device) bool {
	return DeviceCanAccessPeer(dev, peer)
}

================================================ FILE: cu/result.go ================================================
package cu

// This file provides access to CUDA driver error statuses (type CUresult).

//#include
import "C"

import (
	"fmt"
)

// CUDA error status.
// CUDA error statuses are not returned by functions but checked and passed to
// panic() when not successful. If desired, they can be caught by
// recover().
type Result int

// Message string for the error
// (falls back to "Unknown CUresult: <n>" for codes absent from errorString).
func (err Result) String() string {
	str, ok := errorString[err]
	if !ok {
		return "Unknown CUresult: " + fmt.Sprint(int(err))
	}
	return str
}

// Driver API status codes; values mirror the CUresult constants from cuda.h.
const (
	SUCCESS                              Result = C.CUDA_SUCCESS
	ERROR_INVALID_VALUE                  Result = C.CUDA_ERROR_INVALID_VALUE
	ERROR_OUT_OF_MEMORY                  Result = C.CUDA_ERROR_OUT_OF_MEMORY
	ERROR_NOT_INITIALIZED                Result = C.CUDA_ERROR_NOT_INITIALIZED
	ERROR_DEINITIALIZED                  Result = C.CUDA_ERROR_DEINITIALIZED
	ERROR_PROFILER_DISABLED              Result = C.CUDA_ERROR_PROFILER_DISABLED
	ERROR_PROFILER_NOT_INITIALIZED       Result = C.CUDA_ERROR_PROFILER_NOT_INITIALIZED
	ERROR_PROFILER_ALREADY_STARTED       Result = C.CUDA_ERROR_PROFILER_ALREADY_STARTED
	ERROR_PROFILER_ALREADY_STOPPED       Result = C.CUDA_ERROR_PROFILER_ALREADY_STOPPED
	ERROR_NO_DEVICE                      Result = C.CUDA_ERROR_NO_DEVICE
	ERROR_INVALID_DEVICE                 Result = C.CUDA_ERROR_INVALID_DEVICE
	ERROR_INVALID_IMAGE                  Result = C.CUDA_ERROR_INVALID_IMAGE
	ERROR_INVALID_CONTEXT                Result = C.CUDA_ERROR_INVALID_CONTEXT
	ERROR_CONTEXT_ALREADY_CURRENT        Result = C.CUDA_ERROR_CONTEXT_ALREADY_CURRENT
	ERROR_MAP_FAILED                     Result = C.CUDA_ERROR_MAP_FAILED
	ERROR_UNMAP_FAILED                   Result = C.CUDA_ERROR_UNMAP_FAILED
	ERROR_ARRAY_IS_MAPPED                Result = C.CUDA_ERROR_ARRAY_IS_MAPPED
	ERROR_ALREADY_MAPPED                 Result = C.CUDA_ERROR_ALREADY_MAPPED
	ERROR_NO_BINARY_FOR_GPU              Result = C.CUDA_ERROR_NO_BINARY_FOR_GPU
	ERROR_ALREADY_ACQUIRED               Result = C.CUDA_ERROR_ALREADY_ACQUIRED
	ERROR_NOT_MAPPED                     Result = C.CUDA_ERROR_NOT_MAPPED
	ERROR_NOT_MAPPED_AS_ARRAY            Result = C.CUDA_ERROR_NOT_MAPPED_AS_ARRAY
	ERROR_NOT_MAPPED_AS_POINTER          Result = C.CUDA_ERROR_NOT_MAPPED_AS_POINTER
	ERROR_ECC_UNCORRECTABLE              Result = C.CUDA_ERROR_ECC_UNCORRECTABLE
	ERROR_UNSUPPORTED_LIMIT              Result = C.CUDA_ERROR_UNSUPPORTED_LIMIT
	ERROR_CONTEXT_ALREADY_IN_USE         Result = C.CUDA_ERROR_CONTEXT_ALREADY_IN_USE
	ERROR_INVALID_SOURCE                 Result = C.CUDA_ERROR_INVALID_SOURCE
	ERROR_FILE_NOT_FOUND                 Result = C.CUDA_ERROR_FILE_NOT_FOUND
	ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND Result = C.CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND
	ERROR_SHARED_OBJECT_INIT_FAILED      Result = C.CUDA_ERROR_SHARED_OBJECT_INIT_FAILED
	ERROR_OPERATING_SYSTEM               Result = C.CUDA_ERROR_OPERATING_SYSTEM
	ERROR_INVALID_HANDLE                 Result = C.CUDA_ERROR_INVALID_HANDLE
	ERROR_NOT_FOUND                      Result = C.CUDA_ERROR_NOT_FOUND
	ERROR_NOT_READY                      Result = C.CUDA_ERROR_NOT_READY
	ERROR_LAUNCH_FAILED                  Result = C.CUDA_ERROR_LAUNCH_FAILED
	ERROR_LAUNCH_OUT_OF_RESOURCES        Result = C.CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES
	ERROR_LAUNCH_TIMEOUT                 Result = C.CUDA_ERROR_LAUNCH_TIMEOUT
	ERROR_LAUNCH_INCOMPATIBLE_TEXTURING  Result = C.CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING
	ERROR_PEER_ACCESS_ALREADY_ENABLED    Result = C.CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED
	ERROR_PEER_ACCESS_NOT_ENABLED        Result = C.CUDA_ERROR_PEER_ACCESS_NOT_ENABLED
	ERROR_PRIMARY_CONTEXT_ACTIVE         Result = C.CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE
	ERROR_CONTEXT_IS_DESTROYED           Result = C.CUDA_ERROR_CONTEXT_IS_DESTROYED
	ERROR_ASSERT                         Result = C.CUDA_ERROR_ASSERT
	ERROR_TOO_MANY_PEERS                 Result = C.CUDA_ERROR_TOO_MANY_PEERS
	ERROR_HOST_MEMORY_ALREADY_REGISTERED Result = C.CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED
	ERROR_HOST_MEMORY_NOT_REGISTERED     Result = C.CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED
	// NOTE(review): the following codes are hard-coded numerically; the
	// commented-out C names suggest they were missing from the CUDA headers
	// this was built against — confirm against the installed cuda.h.
	ERROR_HARDWARE_STACK_ERROR  Result = 714 //C.CUDA_ERROR_HARDWARE_STACK_ERROR
	ERROR_ILLEGAL_INSTRUCTION   Result = 715 //C.CUDA_ERROR_ILLEGAL_INSTRUCTION
	ERROR_MISALIGNED_ADDRESS    Result = 716 //C.CUDA_ERROR_MISALIGNED_ADDRESS
	ERROR_INVALID_ADDRESS_SPACE Result = 717 //C.CUDA_ERROR_INVALID_ADDRESS_SPACE
	ERROR_INVALID_PC            Result = 718 //C.CUDA_ERROR_INVALID_PC
	ERROR_NOT_PERMITTED         Result = 800 //C.CUDA_ERROR_NOT_PERMITTED
	ERROR_NOT_SUPPORTED         Result = 801 //C.CUDA_ERROR_NOT_SUPPORTED
	ERROR_UNKNOWN               Result = C.CUDA_ERROR_UNKNOWN
)

// Map with error strings for Result error numbers
var errorString map[Result]string = map[Result]string{
	SUCCESS:                              "CUDA_SUCCESS",
	ERROR_INVALID_VALUE:                  "CUDA_ERROR_INVALID_VALUE",
	ERROR_OUT_OF_MEMORY:                  "CUDA_ERROR_OUT_OF_MEMORY",
	ERROR_NOT_INITIALIZED:                "CUDA_ERROR_NOT_INITIALIZED",
	ERROR_DEINITIALIZED:                  "CUDA_ERROR_DEINITIALIZED",
	ERROR_PROFILER_DISABLED:              "CUDA_ERROR_PROFILER_DISABLED",
	ERROR_PROFILER_NOT_INITIALIZED:       "CUDA_ERROR_PROFILER_NOT_INITIALIZED",
	ERROR_PROFILER_ALREADY_STARTED:       "CUDA_ERROR_PROFILER_ALREADY_STARTED",
	ERROR_PROFILER_ALREADY_STOPPED:       "CUDA_ERROR_PROFILER_ALREADY_STOPPED",
	ERROR_NO_DEVICE:                      "CUDA_ERROR_NO_DEVICE",
	ERROR_INVALID_DEVICE:                 "CUDA_ERROR_INVALID_DEVICE",
	ERROR_INVALID_IMAGE:                  "CUDA_ERROR_INVALID_IMAGE",
	ERROR_INVALID_CONTEXT:                "CUDA_ERROR_INVALID_CONTEXT",
	ERROR_CONTEXT_ALREADY_CURRENT:        "CUDA_ERROR_CONTEXT_ALREADY_CURRENT",
	ERROR_MAP_FAILED:                     "CUDA_ERROR_MAP_FAILED",
	ERROR_UNMAP_FAILED:                   "CUDA_ERROR_UNMAP_FAILED",
	ERROR_ARRAY_IS_MAPPED:                "CUDA_ERROR_ARRAY_IS_MAPPED",
	ERROR_ALREADY_MAPPED:                 "CUDA_ERROR_ALREADY_MAPPED",
	ERROR_NO_BINARY_FOR_GPU:              "CUDA_ERROR_NO_BINARY_FOR_GPU",
	ERROR_ALREADY_ACQUIRED:               "CUDA_ERROR_ALREADY_ACQUIRED",
	ERROR_NOT_MAPPED:                     "CUDA_ERROR_NOT_MAPPED",
	ERROR_NOT_MAPPED_AS_ARRAY:            "CUDA_ERROR_NOT_MAPPED_AS_ARRAY",
	ERROR_NOT_MAPPED_AS_POINTER:          "CUDA_ERROR_NOT_MAPPED_AS_POINTER",
	ERROR_ECC_UNCORRECTABLE:              "CUDA_ERROR_ECC_UNCORRECTABLE",
	ERROR_UNSUPPORTED_LIMIT:              "CUDA_ERROR_UNSUPPORTED_LIMIT",
	ERROR_CONTEXT_ALREADY_IN_USE:         "CUDA_ERROR_CONTEXT_ALREADY_IN_USE",
	ERROR_INVALID_SOURCE:                 "CUDA_ERROR_INVALID_SOURCE",
	ERROR_FILE_NOT_FOUND:                 "CUDA_ERROR_FILE_NOT_FOUND",
	ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND",
	ERROR_SHARED_OBJECT_INIT_FAILED:      "CUDA_ERROR_SHARED_OBJECT_INIT_FAILED",
	ERROR_OPERATING_SYSTEM:               "CUDA_ERROR_OPERATING_SYSTEM",
	ERROR_INVALID_HANDLE:                 "CUDA_ERROR_INVALID_HANDLE",
	ERROR_NOT_FOUND:                      "CUDA_ERROR_NOT_FOUND",
	ERROR_NOT_READY:                      "CUDA_ERROR_NOT_READY",
	ERROR_LAUNCH_OUT_OF_RESOURCES:        "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES",
	ERROR_LAUNCH_TIMEOUT:                 "CUDA_ERROR_LAUNCH_TIMEOUT",
	ERROR_LAUNCH_INCOMPATIBLE_TEXTURING:  "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING",
	ERROR_PEER_ACCESS_ALREADY_ENABLED:    "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED",
	ERROR_PEER_ACCESS_NOT_ENABLED:        "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED",
	ERROR_PRIMARY_CONTEXT_ACTIVE:         "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE",
	ERROR_CONTEXT_IS_DESTROYED:           "CUDA_ERROR_CONTEXT_IS_DESTROYED",
	ERROR_ASSERT:                         "CUDA_ERROR_ASSERT",
	ERROR_TOO_MANY_PEERS:                 "CUDA_ERROR_TOO_MANY_PEERS",
	ERROR_HOST_MEMORY_ALREADY_REGISTERED: "CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED",
	ERROR_HOST_MEMORY_NOT_REGISTERED:     "CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED",
	ERROR_HARDWARE_STACK_ERROR:           "CUDA_ERROR_HARDWARE_STACK_ERROR",
	ERROR_ILLEGAL_INSTRUCTION:            "CUDA_ERROR_ILLEGAL_INSTRUCTION",
	ERROR_MISALIGNED_ADDRESS:             "CUDA_ERROR_MISALIGNED_ADDRESS",
	ERROR_INVALID_ADDRESS_SPACE:          "CUDA_ERROR_INVALID_ADDRESS_SPACE",
	ERROR_INVALID_PC:                     "CUDA_ERROR_INVALID_PC",
	ERROR_LAUNCH_FAILED:                  "CUDA_ERROR_LAUNCH_FAILED",
	ERROR_NOT_PERMITTED:                  "CUDA_ERROR_NOT_PERMITTED",
	ERROR_NOT_SUPPORTED:                  "CUDA_ERROR_NOT_SUPPORTED",
	ERROR_UNKNOWN:                        "CUDA_ERROR_UNKNOWN"}

================================================
FILE: cu/runtimeapi.go
================================================
package cu

// This file implements parts of the CUDA runtime api instead of the driver
// api the rest of this package uses.
// It might be useful to move this to a separate package at some point.

//#include
import "C"

import "unsafe"

// Set the device as current.
func SetDevice(device Device) {
	err := Result(C.cudaSetDevice(C.int(device)))
	if err != SUCCESS {
		panic(err)
	}
}

// Reset the state of the current device.
func DeviceReset() {
	err := Result(C.cudaDeviceReset())
	if err != SUCCESS {
		panic(err)
	}
}

// Set CUDA device flags.
func SetDeviceFlags(flags uint) {
	err := Result(C.cudaSetDeviceFlags(C.uint(flags)))
	if err != SUCCESS {
		panic(err)
	}
}

// Flags for SetDeviceFlags
const (
	// The default, decides to yield or not based on active CUDA threads and processors.
	DeviceAuto = C.cudaDeviceScheduleAuto
	// Actively spin while waiting for device.
	DeviceSpin = C.cudaDeviceScheduleSpin
	// Yield when waiting.
DeviceYield = C.cudaDeviceScheduleYield // ScheduleBlockingSync block CPU on sync. DeviceScheduleBlockingSync = C.cudaDeviceScheduleBlockingSync // ScheduleBlockingSync block CPU on sync. Deprecated since cuda 4.0 DeviceBlockingSync = C.cudaDeviceBlockingSync // For use with pinned host memory DeviceMapHost = C.cudaDeviceMapHost // Do not reduce local memory to try and prevent thrashing DeviceLmemResizeToMax = C.cudaDeviceLmemResizeToMax ) func Malloc(bytes int64) DevicePtr { var devptr unsafe.Pointer err := Result(C.cudaMalloc(&devptr, C.size_t(bytes))) if err != SUCCESS { panic(err) } return DevicePtr(devptr) } func MallocHost(bytes int64) unsafe.Pointer { var p unsafe.Pointer err := Result(C.cudaMallocHost(&p, C.size_t(bytes))) if err != SUCCESS { panic(err) } return p } func FreeHost(ptr unsafe.Pointer) { err := Result(C.cudaFreeHost(ptr)) if err != SUCCESS { panic(err) } } // Copies a number of bytes in the direction specified by flags func MemCpy(dst, src unsafe.Pointer, bytes int64, flags uint) { err := Result(C.cudaMemcpy(dst, src, C.size_t(bytes), uint32(flags))) if err != SUCCESS { panic(err) } } //Flags for memory copy types const ( // Host to Host HtoH = C.cudaMemcpyHostToHost // Host to Device HtoD = C.cudaMemcpyHostToDevice // Device to Host DtoH = C.cudaMemcpyDeviceToHost // Device to Device DtoD = C.cudaMemcpyDeviceToDevice // Default, unified virtual address space Virt = C.cudaMemcpyDefault ) ================================================ FILE: cu/stream.go ================================================ package cu // This file implements CUDA streams //#include import "C" import "unsafe" // CUDA stream. 
type Stream uintptr // Creates an asynchronous stream func StreamCreate() Stream { var stream C.CUstream err := Result(C.cuStreamCreate(&stream, C.uint(0))) // flags has to be zero if err != SUCCESS { panic(err) } return Stream(uintptr(unsafe.Pointer(stream))) } // Destroys the asynchronous stream func (stream *Stream) Destroy() { str := *stream err := Result(C.cuStreamDestroy(C.CUstream(unsafe.Pointer(uintptr(str))))) *stream = 0 if err != SUCCESS { panic(err) } } // Destroys an asynchronous stream func StreamDestroy(stream *Stream) { stream.Destroy() } // Blocks until the stream has completed. func (stream Stream) Synchronize() { err := Result(C.cuStreamSynchronize(C.CUstream(unsafe.Pointer(uintptr(stream))))) if err != SUCCESS { panic(err) } } // Returns Success if all operations have completed, ErrorNotReady otherwise func (stream Stream) Query() Result { return Result(C.cuStreamQuery(C.CUstream(unsafe.Pointer(uintptr(stream))))) } // Returns Success if all operations have completed, ErrorNotReady otherwise func StreamQuery(stream Stream) Result { return stream.Query() } // Blocks until the stream has completed. func StreamSynchronize(stream Stream) { stream.Synchronize() } ================================================ FILE: cu/testdata/testmodule.cu ================================================ /* * Module to test CUDA module loading and execution. * To be compiled with: * nvcc -ptx testmodule.cu */ #ifdef __cplusplus extern "C" { #endif #define threadindex ( ( blockIdx.y*gridDim.x + blockIdx.x ) * blockDim.x + threadIdx.x ) /// Sets the first N elements of array to value. 
// Each thread computes its global index via the threadindex macro defined
// above and, if in range, stores value at that position.
__global__ void testMemset(float* array, float value, int N){
	int i = threadindex;
	if(i < N){
		array[i] = value;
	}
}

#ifdef __cplusplus
}
#endif

================================================
FILE: cu/testdata/testmodule.ptx
================================================
	.version 1.4
	.target sm_10, map_f64_to_f32
	// compiled with /usr/local/cuda/open64/lib//be
	// nvopencc 4.0 built on 2011-02-18

	//-----------------------------------------------------------
	// Compiling /tmp/tmpxft_00000e56_00000000-9_testmodule.cpp3.i (/tmp/ccBI#.rDLD4T)
	//-----------------------------------------------------------

	//-----------------------------------------------------------
	// Options:
	//-----------------------------------------------------------
	//  Target:ptx, ISA:sm_10, Endian:little, Pointer Size:64
	//  -O3	(Optimization level)
	//  -g0	(Debug level)
	//  -m2	(Report advisories)
	//-----------------------------------------------------------

	.file	1	""
	.file	2	"/tmp/tmpxft_00000e56_00000000-8_testmodule.cudafe2.gpu"
	.file	3	"/usr/lib/gcc/x86_64-linux-gnu/4.4.3/include/stddef.h"
	.file	4	"/usr/local/cuda/bin/../include/crt/device_runtime.h"
	.file	5	"/usr/local/cuda/bin/../include/host_defines.h"
	.file	6	"/usr/local/cuda/bin/../include/builtin_types.h"
	.file	7	"/usr/local/cuda/bin/../include/device_types.h"
	.file	8	"/usr/local/cuda/bin/../include/driver_types.h"
	.file	9	"/usr/local/cuda/bin/../include/surface_types.h"
	.file	10	"/usr/local/cuda/bin/../include/texture_types.h"
	.file	11	"/usr/local/cuda/bin/../include/vector_types.h"
	.file	12	"/usr/local/cuda/bin/../include/device_launch_parameters.h"
	.file	13	"/usr/local/cuda/bin/../include/crt/storage_class.h"
	.file	14	"/usr/include/bits/types.h"
	.file	15	"/usr/include/time.h"
	.file	16	"testmodule.cu"
	.file	17	"/usr/local/cuda/bin/../include/common_functions.h"
	.file	18	"/usr/local/cuda/bin/../include/math_functions.h"
	.file	19	"/usr/local/cuda/bin/../include/math_constants.h"
	.file	20	"/usr/local/cuda/bin/../include/device_functions.h"
	.file	21	"/usr/local/cuda/bin/../include/sm_11_atomic_functions.h"
	.file	22	"/usr/local/cuda/bin/../include/sm_12_atomic_functions.h"
	.file	23	"/usr/local/cuda/bin/../include/sm_13_double_functions.h"
	.file	24	"/usr/local/cuda/bin/../include/sm_20_atomic_functions.h"
	.file	25	"/usr/local/cuda/bin/../include/sm_20_intrinsics.h"
	.file	26	"/usr/local/cuda/bin/../include/surface_functions.h"
	.file	27	"/usr/local/cuda/bin/../include/texture_fetch_functions.h"
	.file	28	"/usr/local/cuda/bin/../include/math_functions_dbl_ptx1.h"

	.entry testMemset (
		.param .u64 __cudaparm_testMemset_array,
		.param .f32 __cudaparm_testMemset_value,
		.param .s32 __cudaparm_testMemset_N)
	{
	.reg .u16 %rh<4>;
	.reg .u32 %r<10>;
	.reg .u64 %rd<6>;
	.reg .f32 %f<3>;
	.reg .pred %p<3>;
	.loc	16	7	0
$LDWbegin_testMemset:
	mov.u16 	%rh1, %nctaid.x;
	mov.u16 	%rh2, %ctaid.y;
	mul.wide.u16 	%r1, %rh1, %rh2;
	cvt.u32.u16 	%r2, %ctaid.x;
	add.u32 	%r3, %r2, %r1;
	cvt.u32.u16 	%r4, %ntid.x;
	mul.lo.u32 	%r5, %r4, %r3;
	cvt.u32.u16 	%r6, %tid.x;
	add.u32 	%r7, %r6, %r5;
	ld.param.s32 	%r8, [__cudaparm_testMemset_N];
	setp.le.s32 	%p1, %r8, %r7;
	@%p1 bra 	$Lt_0_1026;
	.loc	16	10	0
	ld.param.f32 	%f1, [__cudaparm_testMemset_value];
	ld.param.u64 	%rd1, [__cudaparm_testMemset_array];
	cvt.s64.s32 	%rd2, %r7;
	mul.wide.s32 	%rd3, %r7, 4;
	add.u64 	%rd4, %rd1, %rd3;
	st.global.f32 	[%rd4+0], %f1;
$Lt_0_1026:
	.loc	16	12	0
	exit;
$LDWend_testMemset:
	} // testMemset

================================================
FILE: cu/version.go
================================================
package cu

// This file implements CUDA driver version management

//#include
import "C"

// Returns the CUDA driver version.
func Version() int { var version C.int err := Result(C.cuDriverGetVersion(&version)) if err != SUCCESS { panic(err) } return int(version) } ================================================ FILE: cu/version_test.go ================================================ package cu import ( "fmt" "testing" ) func TestVersion(t *testing.T) { fmt.Println("CUDA driver version: ", Version()) } ================================================ FILE: cuda/Makefile ================================================ all: 6g gccgo doc 6g: go install -v go tool vet *.go gofmt -w *.go GCCGO=gccgo -gccgoflags '-static-libgcc -O3' gccgo: go build -v -compiler $(GCCGO) test: 6gtest gccgotest 6gtest: go test gccgotest: go test -compiler $(GCCGO) bench: 6gbench gccgobench 6gbench: go test -bench=. gccgobench: go test -bench=. -compiler $(GCCGO) clean: go clean doc: godoc github.com/barnex/cuda5/cu > README ================================================ FILE: cuda/README ================================================ PACKAGE package cu import "github.com/barnex/cuda5/cu" Go bindings for the CUDA driver API. CONSTANTS const ( // If the number of contexts > number of CPUs, yield to other OS threads when waiting for the GPU, otherwise CUDA spin on the processor. CTX_SCHED_AUTO = C.CU_CTX_SCHED_AUTO // Spin when waiting for results from the GPU. CTX_SCHED_SPIN = C.CU_CTX_SCHED_SPIN // Yield its thread when waiting for results from the GPU. CTX_SCHED_YIELD = C.CU_CTX_SCHED_YIELD // Bock the CPU thread on a synchronization primitive when waiting for the GPU to finish work. CTX_BLOCKING_SYNC // Support mapped pinned allocations. This flag must be set in order to allocate pinned host memory that is accessible to the GPU. CTX_MAP_HOST = C.CU_CTX_MAP_HOST //Do not reduce local memory after resizing local memory for a kernel. 
CTX_LMEM_RESIZE_TO_MAX = C.CU_CTX_LMEM_RESIZE_TO_MAX ) Flags for CtxCreate const ( SIZEOF_FLOAT32 = 4 SIZEOF_FLOAT64 = 8 SIZEOF_COMPLEX64 = 8 SIZEOF_COMPLEX128 = 16 ) Type size in bytes FUNCTIONS func CtxDestroy(ctx *Context) Destroys the CUDA context specified by ctx. If the context usage count is not equal to 1, or the context is current to any CPU thread other than the current one, this function fails. Floating contexts (detached from a CPU thread via cuCtxPopCurrent()) may be destroyed by this function. func CtxDisablePeerAccess(peer Context) Reverses CtxEnablePeerAccess(). func CtxEnablePeerAccess(peer Context) Make allocations from the peer Context available to the current context. func CtxGetApiVersion(ctx Context) (version int) Returns the API version to create the context. func CtxSetCurrent(ctx Context) Sets the current active context. func CtxSynchronize() Blocks until the device has completed all preceding requested tasks, if the context was created with the CU_CTX_SCHED_BLOCKING_SYNC flag. func DeviceCanAccessPeer(dev, peer Device) bool Returns true if CtxEnablePeerAccess can be called on a context for dev and peerDev. func DeviceComputeCapability(device Device) (major, minor int) Returns the compute capability of the device. func DeviceGetAttribute(attrib DeviceAttribute, dev Device) int Gets the value of a device attribute. func DeviceGetCount() int Returns the number of devices with compute capability greater than or equal to 1.0 that are available for execution. func DeviceGetName(dev Device) string Gets the name of the device. func DeviceTotalMem(device Device) int64 Returns the total amount of memory available on the device in bytes. func FuncGetAttribute(attrib FunctionAttribute, function Function) int func Init(flags int) Initialize the CUDA driver API. Currently, flags must be 0. If Init() has not been called, any function from the driver API will panic with ERROR_NOT_INITIALIZED. 
func LaunchKernel(f Function, gridDimX, gridDimY, gridDimZ int, blockDimX, blockDimY, blockDimZ int, sharedMemBytes int, stream Stream, kernelParams []unsafe.Pointer)

func MemAllocHost(bytes int64) unsafe.Pointer

func MemFree(ptr *DevicePtr)
    Frees device memory allocated by MemAlloc(). Overwrites the pointer with
    NULL. It is safe to double-free.

func MemFreeHost(ptr unsafe.Pointer)

func MemGetAddressRange(ptr DevicePtr) (bytes int64, base DevicePtr)
    Returns the base address and size of the allocation (by MemAlloc) that
    contains the input pointer ptr.

func MemGetInfo() (free, total int64)
    Returns the free and total amount of memory in the current Context (in
    bytes).

func MemHostRegister(ptr unsafe.Pointer, bytes int64, flags MemHostRegisterFlag)
    Page-locks memory specified by the pointer and bytes. The pointer and
    byte size must be aligned to the host page size (4KB) See also:
    MemHostUnregister()

func MemHostUnregister(ptr unsafe.Pointer)
    Unmaps memory locked by MemHostRegister().

func Memcpy(dst, src DevicePtr, bytes int64)
    Copies a number of bytes on the current device. Requires unified
    addressing to be supported. See also: MemcpyDtoD(). TODO(a): is actually
    an auto copy for device and/or host memory

func MemcpyAsync(dst, src DevicePtr, bytes int64, stream Stream)
    Asynchronously copies a number of bytes on the current device.

func MemcpyDtoD(dst, src DevicePtr, bytes int64)
    Copies a number of bytes from device to device.

func MemcpyDtoDAsync(dst, src DevicePtr, bytes int64, stream Stream)
    Asynchronously copies a number of bytes from device to device.

func MemcpyDtoH(dst unsafe.Pointer, src DevicePtr, bytes int64)
    Copies a number of bytes from device to host.

func MemcpyDtoHAsync(dst unsafe.Pointer, src DevicePtr, bytes int64, stream Stream)
    Asynchronously copies a number of bytes from device to host. The host
    memory must be page-locked (see MemRegister)

func MemcpyHtoD(dst DevicePtr, src unsafe.Pointer, bytes int64)
    Copies a number of bytes from host to device.
func MemcpyHtoDAsync(dst DevicePtr, src unsafe.Pointer, bytes int64, stream Stream)
    Asynchronously copies a number of bytes from host to device. The host
    memory must be page-locked (see MemRegister)

func MemcpyPeer(dst DevicePtr, dstCtx Context, src DevicePtr, srcCtx Context, bytes int64)
    Copies from device memory in one context (device) to another.

func MemcpyPeerAsync(dst DevicePtr, dstCtx Context, src DevicePtr, srcCtx Context, bytes int64, stream Stream)
    Asynchronously copies from device memory in one context (device) to
    another.

func MemsetD32(deviceptr DevicePtr, value uint32, N int64)
    Sets the first N 32-bit values of dst array to value. Asynchronous.

func MemsetD32Async(deviceptr DevicePtr, value uint32, N int64, stream Stream)
    Asynchronously sets the first N 32-bit values of dst array to value.

func MemsetD8(deviceptr DevicePtr, value uint8, N int64)
    Sets the first N 8-bit values of dst array to value. Asynchronous.

func MemsetD8Async(deviceptr DevicePtr, value uint8, N int64, stream Stream)
    Asynchronously sets the first N 8-bit values of dst array to value.

func StreamDestroy(stream *Stream)
    Destroys an asynchronous stream

func StreamSynchronize(stream Stream)
    Blocks until the stream has completed.

func Version() int
    Returns the CUDA driver version.

TYPES

type Context uintptr
    CUDA context.

func CtxCreate(flags uint, dev Device) Context
    Create a CUDA context.

func CtxGetCurrent() Context
    Gets the current active context.

func (ctx Context) ApiVersion() (version int)
    Returns the API version to create the context.

func (ctx *Context) Destroy()
    Destroys the CUDA context.

func (peer Context) DisablePeerAccess()
    Reverses EnablePeerAccess().

func (peer Context) EnablePeerAccess()
    Make allocations from the peer Context available to the current context.

func (ctx Context) SetCurrent()
    Sets the current active context.
type DevProp struct { MaxThreadsPerBlock int MaxThreadsDim [3]int MaxGridSize [3]int SharedMemPerBlock int TotalConstantMemory int SIMDWidth int MemPitch int RegsPerBlock int ClockRate int TextureAlign int } Device properties func DeviceGetProperties(dev Device) (prop DevProp) Returns the device's properties. type Device int CUDA Device number. func CtxGetDevice() Device Returns the ordinal of the current context's device. func DeviceGet(ordinal int) Device Returns in a device handle given an ordinal in the range [0, DeviceGetCount()-1]. func (dev Device) Attribute(attrib DeviceAttribute) int Gets the value of a device attribute. func (dev Device) CanAccessPeer(peer Device) bool Returns true if CtxEnablePeerAccess can be called on a context for dev and peerDev. func (device Device) ComputeCapability() (major, minor int) Returns the compute capability of the device. func (dev Device) Name() string Gets the name of the device. func (dev Device) Properties() DevProp Returns the device's properties. func (device Device) TotalMem() int64 Returns the total amount of memory available on the device in bytes. 
type DeviceAttribute int const ( MAX_THREADS_PER_BLOCK DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK // Maximum number of threads per block MAX_BLOCK_DIM_X DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X // Maximum block dimension X MAX_BLOCK_DIM_Y DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y // Maximum block dimension Y MAX_BLOCK_DIM_Z DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z // Maximum block dimension Z MAX_GRID_DIM_X DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X // Maximum grid dimension X MAX_GRID_DIM_Y DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y // Maximum grid dimension Y MAX_GRID_DIM_Z DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z // Maximum grid dimension Z MAX_SHARED_MEMORY_PER_BLOCK DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK // Maximum shared memory available per block in bytes TOTAL_CONSTANT_MEMORY DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY // Memory available on device for __constant__ variables in a CUDA C kernel in bytes WARP_SIZE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_WARP_SIZE // Warp size in threads MAX_PITCH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_PITCH // Maximum pitch in bytes allowed by memory copies MAX_REGISTERS_PER_BLOCK DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK // Maximum number of 32-bit registers available per block CLOCK_RATE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CLOCK_RATE // Peak clock frequency in kilohertz TEXTURE_ALIGNMENT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT // Alignment requirement for textures MULTIPROCESSOR_COUNT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT // Number of multiprocessors on device KERNEL_EXEC_TIMEOUT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT // Specifies whether there is a run time limit on kernels INTEGRATED DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_INTEGRATED // Device is integrated with host memory 
CAN_MAP_HOST_MEMORY DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY // Device can map host memory into CUDA address space COMPUTE_MODE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_COMPUTE_MODE // Compute mode (See ::CUcomputemode for details) MAXIMUM_TEXTURE1D_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH // Maximum 1D texture width MAXIMUM_TEXTURE2D_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH // Maximum 2D texture width MAXIMUM_TEXTURE2D_HEIGHT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT // Maximum 2D texture height MAXIMUM_TEXTURE3D_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH // Maximum 3D texture width MAXIMUM_TEXTURE3D_HEIGHT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT // Maximum 3D texture height MAXIMUM_TEXTURE3D_DEPTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH // Maximum 3D texture depth MAXIMUM_TEXTURE2D_LAYERED_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH // Maximum 2D layered texture width MAXIMUM_TEXTURE2D_LAYERED_HEIGHT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT // Maximum 2D layered texture height MAXIMUM_TEXTURE2D_LAYERED_LAYERS DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS // Maximum layers in a 2D layered texture SURFACE_ALIGNMENT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT // Alignment requirement for surfaces CONCURRENT_KERNELS DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS // Device can possibly execute multiple kernels concurrently ECC_ENABLED DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_ECC_ENABLED // Device has ECC support enabled PCI_BUS_ID DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_PCI_BUS_ID // PCI bus ID of the device PCI_DEVICE_ID DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID // PCI device ID of the device TCC_DRIVER DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TCC_DRIVER // Device is 
using TCC driver model MEMORY_CLOCK_RATE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE // Peak memory clock frequency in kilohertz GLOBAL_MEMORY_BUS_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH // Global memory bus width in bits L2_CACHE_SIZE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE // Size of L2 cache in bytes MAX_THREADS_PER_MULTIPROCESSOR DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR // Maximum resident threads per multiprocessor ASYNC_ENGINE_COUNT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT // Number of asynchronous engines UNIFIED_ADDRESSING DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING // Device uses shares a unified address space with the host MAXIMUM_TEXTURE1D_LAYERED_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH // Maximum 1D layered texture width MAXIMUM_TEXTURE1D_LAYERED_LAYERS DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS // Maximum layers in a 1D layered texture ) type DevicePtr uintptr func MemAlloc(bytes int64) DevicePtr Allocates a number of bytes of device memory. func (ptr DevicePtr) Bytes() (bytes int64) Returns the size of the allocation (by MemAlloc) that contains the input pointer ptr. func (ptr *DevicePtr) Free() Frees device memory allocated by MemAlloc(). Overwrites the pointer with NULL. It is safe to double-free. func (ptr DevicePtr) GetAddressRange() (bytes int64, base DevicePtr) Returns the base address and size of the allocation (by MemAlloc) that contains the input pointer ptr. func (ptr DevicePtr) MemoryType() MemoryType Returns the physical memory type that ptr addresses. func (p DevicePtr) String() string type Dim3 struct { X, Y, Z int } type Function uintptr Represents a CUDA CUfunction, a reference to a function within a module. func ModuleGetFunction(module Module, name string) Function Returns a Function handle. 
func (f Function) GetAttribute(attrib FunctionAttribute) int type FunctionAttribute int const ( FUNC_A_MAX_THREADS_PER_BLOCK FunctionAttribute = C.CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK // The maximum number of threads per block, beyond which a launch of the function would fail. FUNC_A_SHARED_SIZE_BYTES FunctionAttribute = C.CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES // The size in bytes of statically-allocated shared memory required by this function. FUNC_A_CONST_SIZE_BYTES FunctionAttribute = C.CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES // The size in bytes of user-allocated constant memory required by this function. FUNC_A_LOCAL_SIZE_BYTES FunctionAttribute = C.CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES // The size in bytes of local memory used by each thread of this function. FUNC_A_NUM_REGS FunctionAttribute = C.CU_FUNC_ATTRIBUTE_NUM_REGS // The number of registers used by each thread of this function. FUNC_A_PTX_VERSION FunctionAttribute = C.CU_FUNC_ATTRIBUTE_PTX_VERSION // The PTX virtual architecture version for which the function was compiled. FUNC_A_BINARY_VERSION FunctionAttribute = C.CU_FUNC_ATTRIBUTE_BINARY_VERSION // The binary architecture version for which the function was compiled. ) type MemHostRegisterFlag int const ( // Memory is pinned in all CUDA contexts. MEMHOSTREGISTER_PORTABLE MemHostRegisterFlag = C.CU_MEMHOSTREGISTER_PORTABLE // Maps the allocation in CUDA address space. TODO(a): cuMemHostGetDevicePointer() MEMHOSTREGISTER_DEVICEMAP MemHostRegisterFlag = C.CU_MEMHOSTREGISTER_DEVICEMAP ) Flag for MemHostRegister type MemoryType uint Physical memory type of device pointer. const ( MemoryTypeHost MemoryType = C.CU_MEMORYTYPE_HOST MemoryTypeDevice MemoryType = C.CU_MEMORYTYPE_DEVICE MemoryTypeArray MemoryType = C.CU_MEMORYTYPE_ARRAY MemoryTypeUnified MemoryType = C.CU_MEMORYTYPE_UNIFIED ) func PointerGetAttributeMemoryType(ptr DevicePtr) (t MemoryType, err Result) Returns the physical memory type that ptr addresses. 
func (t MemoryType) String() string type Module uintptr Represents a CUDA CUmodule, a reference to executable device code. func ModuleLoad(fname string) Module Loads a compute module from file func ModuleLoadData(image string) Module Loads a compute module from string func (m Module) GetFunction(name string) Function Returns a Function handle. type Result int CUDA error status. CUDA error statuses are not returned by functions but checked and passed to panic() when not successful. If desired, they can be caught by recover(). const ( SUCCESS Result = C.CUDA_SUCCESS ERROR_INVALID_VALUE Result = C.CUDA_ERROR_INVALID_VALUE ERROR_OUT_OF_MEMORY Result = C.CUDA_ERROR_OUT_OF_MEMORY ERROR_NOT_INITIALIZED Result = C.CUDA_ERROR_NOT_INITIALIZED ERROR_DEINITIALIZED Result = C.CUDA_ERROR_DEINITIALIZED ERROR_PROFILER_DISABLED Result = C.CUDA_ERROR_PROFILER_DISABLED ERROR_PROFILER_NOT_INITIALIZED Result = C.CUDA_ERROR_PROFILER_NOT_INITIALIZED ERROR_PROFILER_ALREADY_STARTED Result = C.CUDA_ERROR_PROFILER_ALREADY_STARTED ERROR_PROFILER_ALREADY_STOPPED Result = C.CUDA_ERROR_PROFILER_ALREADY_STOPPED ERROR_NO_DEVICE Result = C.CUDA_ERROR_NO_DEVICE ERROR_INVALID_DEVICE Result = C.CUDA_ERROR_INVALID_DEVICE ERROR_INVALID_IMAGE Result = C.CUDA_ERROR_INVALID_IMAGE ERROR_INVALID_CONTEXT Result = C.CUDA_ERROR_INVALID_CONTEXT ERROR_CONTEXT_ALREADY_CURRENT Result = C.CUDA_ERROR_CONTEXT_ALREADY_CURRENT ERROR_MAP_FAILED Result = C.CUDA_ERROR_MAP_FAILED ERROR_UNMAP_FAILED Result = C.CUDA_ERROR_UNMAP_FAILED ERROR_ARRAY_IS_MAPPED Result = C.CUDA_ERROR_ARRAY_IS_MAPPED ERROR_ALREADY_MAPPED Result = C.CUDA_ERROR_ALREADY_MAPPED ERROR_NO_BINARY_FOR_GPU Result = C.CUDA_ERROR_NO_BINARY_FOR_GPU ERROR_ALREADY_ACQUIRED Result = C.CUDA_ERROR_ALREADY_ACQUIRED ERROR_NOT_MAPPED Result = C.CUDA_ERROR_NOT_MAPPED ERROR_NOT_MAPPED_AS_ARRAY Result = C.CUDA_ERROR_NOT_MAPPED_AS_ARRAY ERROR_NOT_MAPPED_AS_POINTER Result = C.CUDA_ERROR_NOT_MAPPED_AS_POINTER ERROR_ECC_UNCORRECTABLE Result = C.CUDA_ERROR_ECC_UNCORRECTABLE 
ERROR_UNSUPPORTED_LIMIT Result = C.CUDA_ERROR_UNSUPPORTED_LIMIT ERROR_CONTEXT_ALREADY_IN_USE Result = C.CUDA_ERROR_CONTEXT_ALREADY_IN_USE ERROR_INVALID_SOURCE Result = C.CUDA_ERROR_INVALID_SOURCE ERROR_FILE_NOT_FOUND Result = C.CUDA_ERROR_FILE_NOT_FOUND ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND Result = C.CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND ERROR_SHARED_OBJECT_INIT_FAILED Result = C.CUDA_ERROR_SHARED_OBJECT_INIT_FAILED ERROR_OPERATING_SYSTEM Result = C.CUDA_ERROR_OPERATING_SYSTEM ERROR_INVALID_HANDLE Result = C.CUDA_ERROR_INVALID_HANDLE ERROR_NOT_FOUND Result = C.CUDA_ERROR_NOT_FOUND ERROR_NOT_READY Result = C.CUDA_ERROR_NOT_READY ERROR_LAUNCH_FAILED Result = C.CUDA_ERROR_LAUNCH_FAILED ERROR_LAUNCH_OUT_OF_RESOURCES Result = C.CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES ERROR_LAUNCH_TIMEOUT Result = C.CUDA_ERROR_LAUNCH_TIMEOUT ERROR_LAUNCH_INCOMPATIBLE_TEXTURING Result = C.CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING ERROR_PEER_ACCESS_ALREADY_ENABLED Result = C.CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED ERROR_PEER_ACCESS_NOT_ENABLED Result = C.CUDA_ERROR_PEER_ACCESS_NOT_ENABLED ERROR_PRIMARY_CONTEXT_ACTIVE Result = C.CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE ERROR_CONTEXT_IS_DESTROYED Result = C.CUDA_ERROR_CONTEXT_IS_DESTROYED ERROR_ASSERT Result = C.CUDA_ERROR_ASSERT ERROR_TOO_MANY_PEERS Result = C.CUDA_ERROR_TOO_MANY_PEERS ERROR_HOST_MEMORY_ALREADY_REGISTERED Result = C.CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED ERROR_HOST_MEMORY_NOT_REGISTERED Result = C.CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED ERROR_UNKNOWN Result = C.CUDA_ERROR_UNKNOWN ) func StreamQuery(stream Stream) Result Returns Success if all operations have completed, ErrorNotReady otherwise func (err Result) String() string Message string for the error type Stream uintptr CUDA stream. 
func StreamCreate() Stream Creates an asynchronous stream func (stream *Stream) Destroy() Destroys the asynchronous stream func (stream Stream) Query() Result Returns Success if all operations have completed, ErrorNotReady otherwise func (stream Stream) Synchronize() Blocks until the stream has completed. ================================================ FILE: cuda/cgoflags.go ================================================ package cuda // This file provides CGO flags. import "C" //#cgo LDFLAGS:-lcudart // ////default location: //#cgo LDFLAGS:-L/usr/local/cuda/lib64 -L/usr/local/cuda/lib //#cgo CFLAGS: -I/usr/local/cuda/include/ // ////default location if not properly symlinked: //#cgo LDFLAGS:-L/usr/local/cuda-6.0/lib64 -L/usr/local/cuda-6.0/lib //#cgo LDFLAGS:-L/usr/local/cuda-5.5/lib64 -L/usr/local/cuda-5.5/lib //#cgo LDFLAGS:-L/usr/local/cuda-5.0/lib64 -L/usr/local/cuda-5.0/lib //#cgo CFLAGS: -I/usr/local/cuda-6.0/include/ //#cgo CFLAGS: -I/usr/local/cuda-5.5/include/ //#cgo CFLAGS: -I/usr/local/cuda-5.0/include/ // ////arch linux: //#cgo LDFLAGS:-L/opt/cuda/lib64 -L/opt/cuda/lib //#cgo CFLAGS: -I/opt/cuda/include // ////WINDOWS: //#cgo windows LDFLAGS:-LC:/cuda/v5.0/lib/x64 -LC:/cuda/v5.5/lib/x64 -LC:/cuda/v6.0/lib/x64 //#cgo windows CFLAGS: -IC:/cuda/v5.0/include -IC:/cuda/v5.5/include -IC:/cuda/v6.0/include import "C" ================================================ FILE: cuda/device.go ================================================ package cuda //#include //#include import "C" import ( "github.com/barnex/cuda5/cu" ) // Reset the current GPU device. func DeviceReset() { err := cu.Result(C.cudaDeviceReset()) if err != cu.SUCCESS { panic(err) } } // Set preference for more cache or shared memory. func DeviceSetCacheConfig(cacheConfig FuncCache) { err := cu.Result(C.cudaDeviceSetCacheConfig(uint32(cacheConfig))) if err != cu.SUCCESS { panic(err) } } // Cache preference option. 
type FuncCache int const ( FUNC_CACHE_PREFER_NONE FuncCache = C.CU_FUNC_CACHE_PREFER_NONE FUNC_CACHE_PREFER_SHARED FuncCache = C.CU_FUNC_CACHE_PREFER_SHARED FUNC_CACHE_PREFER_L1 FuncCache = C.CU_FUNC_CACHE_PREFER_L1 FUNC_CACHE_PREFER_EQUAL FuncCache = C.CU_FUNC_CACHE_PREFER_EQUAL ) ================================================ FILE: cufft/Makefile ================================================ all: 6g gccgo doc 6g: go install -v go tool vet *.go gofmt -w *.go GCCGO=gccgo -gccgoflags '-static-libgcc -O3' gccgo: go build -v -compiler $(GCCGO) test: 6gtest gccgotest 6gtest: go test gccgotest: go test -compiler $(GCCGO) bench: 6gbench gccgobench 6gbench: go test -bench=. gccgobench: go test -bench=. -compiler $(GCCGO) clean: go clean doc: godoc github.com/barnex/cuda5/cufft > README ================================================ FILE: cufft/README ================================================ PACKAGE DOCUMENTATION package cufft import "github.com/barnex/cuda5/cufft" Go bindings for the CUDA CUFFT API. CONSTANTS const ( FORWARD = -1 // Forward FFT INVERSE = 1 // Inverse FFT ) TYPES type CompatibilityMode int CUFFT compatibility mode const ( COMPATIBILITY_NATIVE CompatibilityMode = C.CUFFT_COMPATIBILITY_NATIVE COMPATIBILITY_FFTW_PADDING CompatibilityMode = C.CUFFT_COMPATIBILITY_FFTW_PADDING COMPATIBILITY_FFTW_ASYMMETRIC CompatibilityMode = C.CUFFT_COMPATIBILITY_FFTW_ASYMMETRIC COMPATIBILITY_FFTW_ALL CompatibilityMode = C.CUFFT_COMPATIBILITY_FFTW_ALL ) func (t CompatibilityMode) String() string type Handle uintptr FFT plan handle, reference type to a plan func Plan1d(nx int, typ Type, batch int) Handle 1D FFT plan func Plan2d(nx, ny int, typ Type) Handle 2D FFT plan func Plan3d(nx, ny, nz int, typ Type) Handle 3D FFT plan func PlanMany(n []int, inembed []int, istride int, oembed []int, ostride int, typ Type, batch int) Handle 1D,2D or 3D FFT plan func (plan *Handle) Destroy() Destroys the plan. 
func (plan Handle) ExecC2C(idata, odata cu.DevicePtr, direction int) Execute Complex-to-Complex plan func (plan Handle) ExecC2R(idata, odata cu.DevicePtr) Execute Complex-to-Real plan func (plan Handle) ExecD2Z(idata, odata cu.DevicePtr) Execute Double Real-to-Complex plan func (plan Handle) ExecR2C(idata, odata cu.DevicePtr) Execute Real-to-Complex plan func (plan Handle) ExecZ2D(idata, odata cu.DevicePtr) Execute Double Complex-to-Real plan func (plan Handle) ExecZ2Z(idata, odata cu.DevicePtr, direction int) Execute Double Complex-to-Complex plan func (plan Handle) SetCompatibilityMode(mode CompatibilityMode) Sets the FFTW compatibility mode func (plan Handle) SetStream(stream cu.Stream) Sets the cuda stream for this plan type Result int FFT result const ( SUCCESS Result = C.CUFFT_SUCCESS INVALID_PLAN Result = C.CUFFT_INVALID_PLAN ALLOC_FAILED Result = C.CUFFT_ALLOC_FAILED INVALID_TYPE Result = C.CUFFT_INVALID_TYPE INVALID_VALUE Result = C.CUFFT_INVALID_VALUE INTERNAL_ERROR Result = C.CUFFT_INTERNAL_ERROR EXEC_FAILED Result = C.CUFFT_EXEC_FAILED SETUP_FAILED Result = C.CUFFT_SETUP_FAILED INVALID_SIZE Result = C.CUFFT_INVALID_SIZE UNALIGNED_DATA Result = C.CUFFT_UNALIGNED_DATA ) FFT result value func (r Result) String() string type Type int FFT type const ( R2C Type = C.CUFFT_R2C // Real to Complex (interleaved) C2R Type = C.CUFFT_C2R // Complex (interleaved) to Real C2C Type = C.CUFFT_C2C // Complex to Complex, interleaved D2Z Type = C.CUFFT_D2Z // Double to Double-Complex Z2D Type = C.CUFFT_Z2D // Double-Complex to Double Z2Z Type = C.CUFFT_Z2Z // Double-Complex to Double-Complex ) func (t Type) String() string ================================================ FILE: cufft/cgoflags.go ================================================ package cufft // This file provides CGO flags to find CUDA libraries and headers. 
//#cgo LDFLAGS:-lcufft // ////default location: //#cgo LDFLAGS:-L/usr/local/cuda/lib64 -L/usr/local/cuda/lib //#cgo CFLAGS: -I/usr/local/cuda/include/ // ////default location if not properly symlinked: //#cgo LDFLAGS:-L/usr/local/cuda-6.0/lib64 -L/usr/local/cuda-6.0/lib //#cgo LDFLAGS:-L/usr/local/cuda-5.5/lib64 -L/usr/local/cuda-5.5/lib //#cgo LDFLAGS:-L/usr/local/cuda-5.0/lib64 -L/usr/local/cuda-5.0/lib //#cgo CFLAGS: -I/usr/local/cuda-6.0/include/ //#cgo CFLAGS: -I/usr/local/cuda-5.5/include/ //#cgo CFLAGS: -I/usr/local/cuda-5.0/include/ // ////arch linux: //#cgo LDFLAGS:-L/opt/cuda/lib64 -L/opt/cuda/lib //#cgo CFLAGS: -I/opt/cuda/include // ////WINDOWS: //#cgo windows LDFLAGS:-LC:/cuda/v5.0/lib/x64 -LC:/cuda/v5.5/lib/x64 -LC:/cuda/v6.0/lib/x64 //#cgo windows CFLAGS: -IC:/cuda/v5.0/include -IC:/cuda/v5.5/include -IC:/cuda/v6.0/include -w import "C" ================================================ FILE: cufft/doc.go ================================================ // Go bindings for the CUDA CUFFT API. 
package cufft ================================================ FILE: cufft/fft_test.go ================================================ package cufft import ( "fmt" "github.com/barnex/cuda5/cu" "unsafe" ) func ExampleFFT1D() { N := 8 hostIn := make([]float32, N) hostIn[0] = 1 devIn := cu.MemAlloc(int64(len(hostIn)) * cu.SIZEOF_FLOAT32) defer cu.MemFree(&devIn) cu.MemcpyHtoD(devIn, unsafe.Pointer(&hostIn[0]), devIn.Bytes()) hostOut := make([]complex64, N/2+1) devOut := cu.MemAlloc(int64(len(hostOut)) * cu.SIZEOF_COMPLEX64) defer cu.MemFree(&devOut) plan := Plan1d(N, R2C, 1) defer plan.Destroy() plan.ExecR2C(devIn, devOut) cu.MemcpyDtoH(unsafe.Pointer(&hostOut[0]), devOut, devOut.Bytes()) fmt.Println("hostIn:", hostIn) fmt.Println("hostOut:", hostOut) // Output: // hostIn: [1 0 0 0 0 0 0 0] // hostOut: [(1+0i) (+1+0i) (+1+0i) (+1-0i) (+1+0i)] } ================================================ FILE: cufft/init_test.go ================================================ package cufft import ( "fmt" "github.com/barnex/cuda5/cu" ) // needed for all other tests. 
func init() { cu.Init(0) ctx := cu.CtxCreate(cu.CTX_SCHED_AUTO, 0) cu.CtxSetCurrent(ctx) fmt.Println("Created CUDA context") } ================================================ FILE: cufft/mode.go ================================================ package cufft //#include import "C" import ( "fmt" ) // CUFFT compatibility mode type CompatibilityMode int const ( COMPATIBILITY_NATIVE CompatibilityMode = C.CUFFT_COMPATIBILITY_NATIVE COMPATIBILITY_FFTW_PADDING CompatibilityMode = C.CUFFT_COMPATIBILITY_FFTW_PADDING COMPATIBILITY_FFTW_ASYMMETRIC CompatibilityMode = C.CUFFT_COMPATIBILITY_FFTW_ASYMMETRIC COMPATIBILITY_FFTW_ALL CompatibilityMode = C.CUFFT_COMPATIBILITY_FFTW_ALL ) func (t CompatibilityMode) String() string { if str, ok := compatibilityModeString[t]; ok { return str } return fmt.Sprint("CUFFT Compatibility mode with unknown number:", int(t)) } var compatibilityModeString map[CompatibilityMode]string = map[CompatibilityMode]string{ COMPATIBILITY_NATIVE: "CUFFT_COMPATIBILITY_NATIVE", COMPATIBILITY_FFTW_PADDING: "CUFFT_COMPATIBILITY_FFTW_PADDING", COMPATIBILITY_FFTW_ASYMMETRIC: "CUFFT_COMPATIBILITY_FFTW_ASYMMETRIC", COMPATIBILITY_FFTW_ALL: "CUFFT_COMPATIBILITY_FFTW_ALL"} ================================================ FILE: cufft/plan.go ================================================ // Copyright 2011 Arne Vansteenkiste (barnex@gmail.com). All rights reserved. // Use of this source code is governed by a freeBSD // license that can be found in the LICENSE.txt file. 
package cufft //#include import "C" import ( "github.com/barnex/cuda5/cu" "unsafe" ) // FFT plan handle, reference type to a plan type Handle uintptr // 1D FFT plan func Plan1d(nx int, typ Type, batch int) Handle { var handle C.cufftHandle err := Result(C.cufftPlan1d( &handle, C.int(nx), C.cufftType(typ), C.int(batch))) if err != SUCCESS { panic(err) } return Handle(handle) } // 2D FFT plan func Plan2d(nx, ny int, typ Type) Handle { var handle C.cufftHandle err := Result(C.cufftPlan2d( &handle, C.int(nx), C.int(ny), C.cufftType(typ))) if err != SUCCESS { panic(err) } return Handle(handle) } // 3D FFT plan func Plan3d(nx, ny, nz int, typ Type) Handle { var handle C.cufftHandle err := Result(C.cufftPlan3d( &handle, C.int(nx), C.int(ny), C.int(nz), C.cufftType(typ))) if err != SUCCESS { panic(err) } return Handle(handle) } //cufftPlanMany( // cufftHandle *plan, int rank, int *n, int *inembed, // int istride, int idist, int *onembed, int ostride, // int odist, cufftType type, int batch ); // 1D,2D or 3D FFT plan func PlanMany(n []int, inembed []int, istride int, oembed []int, ostride int, typ Type, batch int) Handle { var handle C.cufftHandle NULL := (*C.int)(unsafe.Pointer(uintptr(0))) inembedptr := NULL idist := 0 if inembed != nil { inembedptr = (*C.int)(unsafe.Pointer(&inembed[0])) idist = inembed[0] } oembedptr := NULL odist := 0 if oembed != nil { oembedptr = (*C.int)(unsafe.Pointer(&oembed[0])) odist = oembed[0] } err := Result(C.cufftPlanMany( &handle, C.int(len(n)), // rank (*C.int)(unsafe.Pointer(&n[0])), // n inembedptr, C.int(istride), C.int(idist), oembedptr, C.int(ostride), C.int(odist), C.cufftType(typ), C.int(batch))) if err != SUCCESS { panic(err) } return Handle(handle) } // Execute Complex-to-Complex plan func (plan Handle) ExecC2C(idata, odata cu.DevicePtr, direction int) { err := Result(C.cufftExecC2C( C.cufftHandle(plan), (*C.cufftComplex)(unsafe.Pointer(uintptr(idata))), (*C.cufftComplex)(unsafe.Pointer(uintptr(odata))), C.int(direction))) if err 
!= SUCCESS { panic(err) } } // Execute Real-to-Complex plan func (plan Handle) ExecR2C(idata, odata cu.DevicePtr) { err := Result(C.cufftExecR2C( C.cufftHandle(plan), (*C.cufftReal)(unsafe.Pointer(uintptr(idata))), (*C.cufftComplex)(unsafe.Pointer(uintptr(odata))))) if err != SUCCESS { panic(err) } } // Execute Complex-to-Real plan func (plan Handle) ExecC2R(idata, odata cu.DevicePtr) { err := Result(C.cufftExecC2R( C.cufftHandle(plan), (*C.cufftComplex)(unsafe.Pointer(uintptr(idata))), (*C.cufftReal)(unsafe.Pointer(uintptr(odata))))) if err != SUCCESS { panic(err) } } // Execute Double Complex-to-Complex plan func (plan Handle) ExecZ2Z(idata, odata cu.DevicePtr, direction int) { err := Result(C.cufftExecZ2Z( C.cufftHandle(plan), (*C.cufftDoubleComplex)(unsafe.Pointer(uintptr(idata))), (*C.cufftDoubleComplex)(unsafe.Pointer(uintptr(odata))), C.int(direction))) if err != SUCCESS { panic(err) } } // Execute Double Real-to-Complex plan func (plan Handle) ExecD2Z(idata, odata cu.DevicePtr) { err := Result(C.cufftExecD2Z( C.cufftHandle(plan), (*C.cufftDoubleReal)(unsafe.Pointer(uintptr(idata))), (*C.cufftDoubleComplex)(unsafe.Pointer(uintptr(odata))))) if err != SUCCESS { panic(err) } } // Execute Double Complex-to-Real plan func (plan Handle) ExecZ2D(idata, odata cu.DevicePtr) { err := Result(C.cufftExecZ2D( C.cufftHandle(plan), (*C.cufftDoubleComplex)(unsafe.Pointer(uintptr(idata))), (*C.cufftDoubleReal)(unsafe.Pointer(uintptr(odata))))) if err != SUCCESS { panic(err) } } // Destroys the plan. 
func (plan *Handle) Destroy() { err := Result(C.cufftDestroy(C.cufftHandle(*plan))) *plan = 0 // make sure plan is not used anymore if err != SUCCESS { panic(err) } } // Sets the cuda stream for this plan func (plan Handle) SetStream(stream cu.Stream) { err := Result(C.cufftSetStream( C.cufftHandle(plan), C.cudaStream_t(unsafe.Pointer(uintptr(stream))))) if err != SUCCESS { panic(err) } } // Sets the FFTW compatibility mode func (plan Handle) SetCompatibilityMode(mode CompatibilityMode) { err := Result(C.cufftSetCompatibilityMode( C.cufftHandle(plan), C.cufftCompatibility(mode))) if err != SUCCESS { panic(err) } } ================================================ FILE: cufft/result.go ================================================ package cufft //#include import "C" import ( "fmt" ) // FFT result type Result int // FFT result value const ( SUCCESS Result = C.CUFFT_SUCCESS INVALID_PLAN Result = C.CUFFT_INVALID_PLAN ALLOC_FAILED Result = C.CUFFT_ALLOC_FAILED INVALID_TYPE Result = C.CUFFT_INVALID_TYPE INVALID_VALUE Result = C.CUFFT_INVALID_VALUE INTERNAL_ERROR Result = C.CUFFT_INTERNAL_ERROR EXEC_FAILED Result = C.CUFFT_EXEC_FAILED SETUP_FAILED Result = C.CUFFT_SETUP_FAILED INVALID_SIZE Result = C.CUFFT_INVALID_SIZE UNALIGNED_DATA Result = C.CUFFT_UNALIGNED_DATA INCOMPLETE_PARAMETER_LIST Result = 0xA // cuda6 values copied to avoid dependency on cuda6/cufft.h INVALID_DEVICE Result = 0xB PARSE_ERROR Result = 0xC NO_WORKSPACE Result = 0xD ) func (r Result) String() string { if str, ok := resultString[r]; ok { return str } return fmt.Sprint("CUFFT Result with unknown error number:", int(r)) } var resultString map[Result]string = map[Result]string{ SUCCESS: "CUFFT_SUCCESS", INVALID_PLAN: "CUFFT_INVALID_PLAN", ALLOC_FAILED: "CUFFT_ALLOC_FAILED", INVALID_TYPE: "CUFFT_INVALID_TYPE", INVALID_VALUE: "CUFFT_INVALID_VALUE", INTERNAL_ERROR: "CUFFT_INTERNAL_ERROR", EXEC_FAILED: "CUFFT_EXEC_FAILED", SETUP_FAILED: "CUFFT_SETUP_FAILED", INVALID_SIZE: "CUFFT_INVALID_SIZE", 
UNALIGNED_DATA: "CUFFT_UNALIGNED_DATA", INCOMPLETE_PARAMETER_LIST: "CUFFT_INCOMPLETE_PARAMETER_LIST", INVALID_DEVICE: "CUFFT_INVALID_DEVICE", PARSE_ERROR: "CUFFT_PARSE_ERROR", NO_WORKSPACE: "CUFFT_NO_WORKSPACE"} ================================================ FILE: cufft/type.go ================================================ package cufft //#include import "C" import ( "fmt" ) // FFT type type Type int const ( R2C Type = C.CUFFT_R2C // Real to Complex (interleaved) C2R Type = C.CUFFT_C2R // Complex (interleaved) to Real C2C Type = C.CUFFT_C2C // Complex to Complex, interleaved D2Z Type = C.CUFFT_D2Z // Double to Double-Complex Z2D Type = C.CUFFT_Z2D // Double-Complex to Double Z2Z Type = C.CUFFT_Z2Z // Double-Complex to Double-Complex ) const ( FORWARD = -1 // Forward FFT INVERSE = 1 // Inverse FFT ) func (t Type) String() string { if str, ok := typeString[t]; ok { return str } return fmt.Sprint("CUFFT Type with unknown number:", int(t)) } var typeString map[Type]string = map[Type]string{ R2C: "CUFFT_R2C", C2R: "CUFFT_C2R", C2C: "CUFFT_C2C", D2Z: "CUFFT_D2Z", Z2D: "CUFFT_Z2D", Z2Z: "CUFFT_Z2Z"} ================================================ FILE: curand/Makefile ================================================ all: 6g gccgo doc 6g: go install -v go tool vet *.go gofmt -w *.go GCCGO=gccgo -gccgoflags '-static-libgcc -O3' gccgo: go build -v -compiler $(GCCGO) test: 6gtest gccgotest 6gtest: go test gccgotest: go test -compiler $(GCCGO) bench: 6gbench gccgobench 6gbench: go test -bench=. gccgobench: go test -bench=. 
-compiler $(GCCGO) clean: go clean doc: godoc github.com/barnex/cuda5/curand > README ================================================ FILE: curand/README ================================================ PACKAGE DOCUMENTATION package curand import "github.com/barnex/cuda5/curand" TYPES type Generator uintptr func CreateGenerator(rngType RngType) Generator func (g Generator) GenerateNormal(output uintptr, n int64, mean, stddev float32) func (g Generator) SetSeed(seed int64) type RngType int const ( PSEUDO_DEFAULT RngType = C.CURAND_RNG_PSEUDO_DEFAULT // Default pseudorandom generator PSEUDO_XORWOW RngType = C.CURAND_RNG_PSEUDO_XORWOW // XORWOW pseudorandom generator QUASI_DEFAULT RngType = C.CURAND_RNG_QUASI_DEFAULT // Default quasirandom generator QUASI_SOBOL32 RngType = C.CURAND_RNG_QUASI_SOBOL32 // Sobol32 quasirandom generator QUASI_SCRAMBLED_SOBOL32 RngType = C.CURAND_RNG_QUASI_SCRAMBLED_SOBOL32 // Scrambled Sobol32 quasirandom generator QUASI_SOBOL64 RngType = C.CURAND_RNG_QUASI_SOBOL64 // Sobol64 quasirandom generator QUASI_SCRAMBLED_SOBOL64 RngType = C.CURAND_RNG_QUASI_SCRAMBLED_SOBOL64 // Scrambled Sobol64 quasirandom generator ) type Status int const ( SUCCESS Status = C.CURAND_STATUS_SUCCESS // No errors VERSION_MISMATCH Status = C.CURAND_STATUS_VERSION_MISMATCH // Header file and linked library version do not match NOT_INITIALIZED Status = C.CURAND_STATUS_NOT_INITIALIZED // Generator not initialized ALLOCATION_FAILED Status = C.CURAND_STATUS_ALLOCATION_FAILED // Memory allocation failed TYPE_ERROR Status = C.CURAND_STATUS_TYPE_ERROR // Generator is wrong type OUT_OF_RANGE Status = C.CURAND_STATUS_OUT_OF_RANGE // Argument out of range LENGTH_NOT_MULTIPLE Status = C.CURAND_STATUS_LENGTH_NOT_MULTIPLE // Length requested is not a multple of dimension LAUNCH_FAILURE Status = C.CURAND_STATUS_LAUNCH_FAILURE // Kernel launch failure PREEXISTING_FAILURE Status = C.CURAND_STATUS_PREEXISTING_FAILURE // Preexisting failure on library entry INITIALIZATION_FAILED 
Status = C.CURAND_STATUS_INITIALIZATION_FAILED // Initialization of CUDA failed ARCH_MISMATCH Status = C.CURAND_STATUS_ARCH_MISMATCH // Architecture mismatch, GPU does not support requested feature INTERNAL_ERROR Status = C.CURAND_STATUS_INTERNAL_ERROR // Internal library error ) ================================================ FILE: curand/cgoflags.go ================================================ package curand // This file provides CGO flags to find CUDA libraries and headers. //#cgo LDFLAGS:-lcurand // ////default location: //#cgo LDFLAGS:-L/usr/local/cuda/lib64 -L/usr/local/cuda/lib //#cgo CFLAGS: -I/usr/local/cuda/include/ // ////default location if not properly symlinked: //#cgo LDFLAGS:-L/usr/local/cuda-6.0/lib64 -L/usr/local/cuda-6.0/lib //#cgo LDFLAGS:-L/usr/local/cuda-5.5/lib64 -L/usr/local/cuda-5.5/lib //#cgo LDFLAGS:-L/usr/local/cuda-5.0/lib64 -L/usr/local/cuda-5.0/lib //#cgo CFLAGS: -I/usr/local/cuda-6.0/include/ //#cgo CFLAGS: -I/usr/local/cuda-5.5/include/ //#cgo CFLAGS: -I/usr/local/cuda-5.0/include/ // ////arch linux: //#cgo LDFLAGS:-L/opt/cuda/lib64 -L/opt/cuda/lib //#cgo CFLAGS: -I/opt/cuda/include // ////WINDOWS: //#cgo windows LDFLAGS:-LC:/cuda/v5.0/lib/x64 -LC:/cuda/v5.5/lib/x64 -LC:/cuda/v6.0/lib/x64 //#cgo windows CFLAGS: -IC:/cuda/v5.0/include -IC:/cuda/v5.5/include -IC:/cuda/v6.0/include -w import "C" ================================================ FILE: curand/generator.go ================================================ package curand //#include import "C" import ( "unsafe" ) type Generator uintptr type RngType int const ( PSEUDO_DEFAULT RngType = C.CURAND_RNG_PSEUDO_DEFAULT // Default pseudorandom generator PSEUDO_XORWOW RngType = C.CURAND_RNG_PSEUDO_XORWOW // XORWOW pseudorandom generator QUASI_DEFAULT RngType = C.CURAND_RNG_QUASI_DEFAULT // Default quasirandom generator QUASI_SOBOL32 RngType = C.CURAND_RNG_QUASI_SOBOL32 // Sobol32 quasirandom generator QUASI_SCRAMBLED_SOBOL32 RngType = C.CURAND_RNG_QUASI_SCRAMBLED_SOBOL32 // 
Scrambled Sobol32 quasirandom generator QUASI_SOBOL64 RngType = C.CURAND_RNG_QUASI_SOBOL64 // Sobol64 quasirandom generator QUASI_SCRAMBLED_SOBOL64 RngType = C.CURAND_RNG_QUASI_SCRAMBLED_SOBOL64 // Scrambled Sobol64 quasirandom generator ) func CreateGenerator(rngType RngType) Generator { var rng C.curandGenerator_t err := Status(C.curandCreateGenerator(&rng, C.curandRngType_t(rngType))) if err != SUCCESS { panic(err) } return Generator(uintptr(unsafe.Pointer(rng))) // cgo } func (g Generator) GenerateNormal(output uintptr, n int64, mean, stddev float32) { err := Status(C.curandGenerateNormal( C.curandGenerator_t(unsafe.Pointer(uintptr(g))), (*C.float)(unsafe.Pointer(output)), C.size_t(n), C.float(mean), C.float(stddev))) if err != SUCCESS { panic(err) } } func (g Generator) SetSeed(seed int64) { err := Status(C.curandSetPseudoRandomGeneratorSeed(C.curandGenerator_t(unsafe.Pointer(uintptr(g))), _Ctype_ulonglong(seed))) if err != SUCCESS { panic(err) } } // Documentation was taken from the curand headers. 
================================================ FILE: curand/status.go ================================================ package curand //#include import "C" import ( "fmt" ) type Status int const ( SUCCESS Status = C.CURAND_STATUS_SUCCESS // No errors VERSION_MISMATCH Status = C.CURAND_STATUS_VERSION_MISMATCH // Header file and linked library version do not match NOT_INITIALIZED Status = C.CURAND_STATUS_NOT_INITIALIZED // Generator not initialized ALLOCATION_FAILED Status = C.CURAND_STATUS_ALLOCATION_FAILED // Memory allocation failed TYPE_ERROR Status = C.CURAND_STATUS_TYPE_ERROR // Generator is wrong type OUT_OF_RANGE Status = C.CURAND_STATUS_OUT_OF_RANGE // Argument out of range LENGTH_NOT_MULTIPLE Status = C.CURAND_STATUS_LENGTH_NOT_MULTIPLE // Length requested is not a multple of dimension LAUNCH_FAILURE Status = C.CURAND_STATUS_LAUNCH_FAILURE // Kernel launch failure PREEXISTING_FAILURE Status = C.CURAND_STATUS_PREEXISTING_FAILURE // Preexisting failure on library entry INITIALIZATION_FAILED Status = C.CURAND_STATUS_INITIALIZATION_FAILED // Initialization of CUDA failed ARCH_MISMATCH Status = C.CURAND_STATUS_ARCH_MISMATCH // Architecture mismatch, GPU does not support requested feature INTERNAL_ERROR Status = C.CURAND_STATUS_INTERNAL_ERROR // Internal library error ) func (s Status) String() string { if str, ok := statusStr[s]; ok { return str } else { return fmt.Sprint("CURAND ERROR NUMBER ", int(s)) } } var statusStr = map[Status]string{ SUCCESS: "CURAND_STATUS_SUCCESS", VERSION_MISMATCH: "CURAND_STATUS_VERSION_MISMATCH", NOT_INITIALIZED: "CURAND_STATUS_NOT_INITIALIZED", ALLOCATION_FAILED: "CURAND_STATUS_ALLOCATION_FAILED", TYPE_ERROR: "CURAND_STATUS_TYPE_ERROR", OUT_OF_RANGE: "CURAND_STATUS_OUT_OF_RANGE", LENGTH_NOT_MULTIPLE: "CURAND_STATUS_LENGTH_NOT_MULTIPLE", LAUNCH_FAILURE: "CURAND_STATUS_LAUNCH_FAILURE", PREEXISTING_FAILURE: "CURAND_STATUS_PREEXISTING_FAILURE", INITIALIZATION_FAILED: "CURAND_STATUS_INITIALIZATION_FAILED", ARCH_MISMATCH: 
"CURAND_STATUS_ARCH_MISMATCH", INTERNAL_ERROR: "CURAND_STATUS_INTERNAL_ERROR", } // Documentation was taken from the curand headers. ================================================ FILE: doc.go ================================================ /* Go bindings for nVIDIA CUDA 5. This package compiles with both gc and gccgo. */ package cuda5 // Dummy imports so that // go get github.com/barnex/cuda5 // will install everything. import ( _ "github.com/barnex/cuda5/cu" _ "github.com/barnex/cuda5/cufft" _ "github.com/barnex/cuda5/safe" ) ================================================ FILE: safe/Makefile ================================================ all: 6g doc #gccgo 6g: go install -v go tool vet *.go gofmt -w *.go GCCGO=gccgo -gccgoflags '-static-libgcc -O3' gccgo: go build -v -compiler $(GCCGO) test: 6gtest gccgotest 6gtest: go test gccgotest: go test -compiler $(GCCGO) bench: 6gbench gccgobench 6gbench: go test -bench=. gccgobench: go test -bench=. -compiler $(GCCGO) clean: go clean go-optview -c -w *.go gofmt -w *.go opt: go-optview -w *.go gofmt -w *.go doc: godoc github.com/barnex/cuda5/safe > README ================================================ FILE: safe/README ================================================ PACKAGE package safe import "github.com/barnex/cuda5/safe" Safe and more idiomatic wrappers for the low-level CUDA functions. FUNCTIONS func InitCuda() TYPES type Complex128s struct { // contains filtered or unexported fields } Slice of complex128's on the GPU. func MakeComplex128s(len_ int) Complex128s Make a slice of complex128's on the GPU. Initialized to zero. func (s *Complex128s) Cap() int Slice capacity. func (dst Complex128s) CopyDtoD(src Complex128s) Copy src on host to dst on host. func (dst Complex128s) CopyDtoDAsync(src Complex128s, stream cu.Stream) Copy src on host to dst on host, asynchronously. func (src Complex128s) CopyDtoH(dst []complex128) Copy src form device to dst on host. 
func (src Complex128s) CopyDtoHAsync(dst []complex128, stream cu.Stream) Copy src from device to dst on host, asynchronously.
func (dst Complex64s) CopyHtoD(src []complex64) Copy src from host to dst on the device. func (dst Complex64s) CopyHtoDAsync(src []complex64, stream cu.Stream) Copy src from host to dst on the device, asynchronously. func (s Complex64s) Float() Float32s Re-interpret the array as float numbers, in interleaved format. Underlying storage is shared. func (s *Complex64s) Free() Free the underlying storage. To be used with care. Free() should only be called on a slice created by MakeXXX(), not on a slice created by x.Slice(). Freeing a slice invalidates all other slices referring to it. func (src Complex64s) Host() []complex64 Returns a fresh copy on host. func (s *Complex64s) Len() int Slice length (number of elements). func (s *Complex64s) Pointer() cu.DevicePtr Pointer to the first element. func (s Complex64s) Slice(start, stop int) Complex64s Return a slice from start (inclusive) to stop (exclusive), sharing the underlying storage with the original slice. Slices obtained in this way should not be Free()'d func (s *Complex64s) UnsafeSet(pointer unsafe.Pointer, length, capacity int) Manually set the pointer, length and capacity. Side-steps the security mechanisms, use with caution. type FFT1DC2RPlan struct { // contains filtered or unexported fields } 1D single-precission complex-to-real FFT plan. func FFT1DC2R(size, batch int) FFT1DC2RPlan 1D single-precission complex-to-real FFT plan. func (p FFT1DC2RPlan) Destroy() Releases all resources associated with the FFT plan. func (p FFT1DC2RPlan) Exec(src Complex64s, dst Float32s) Execute the FFT plan. Synchronized. func (p FFT1DC2RPlan) InputLen() int Required length of the output array. func (p FFT1DC2RPlan) OutputLen() int Required length of the input array. func (p FFT1DC2RPlan) SetStream(stream cu.Stream) Associates a CUDA stream with the FFT plan. If a stream is set, plan.Stream().Synchronize() can to be called to wait for the execution to finish. 
func (s FFT1DC2RPlan) Size() int
    Returns the logical size of the FFT: the number of elements (real or
    complex) it transforms.

func (p FFT1DC2RPlan) Stream() cu.Stream
    Returns the CUDA stream associated with the FFT plan.

type FFT1DR2CPlan struct {
    // contains filtered or unexported fields
}
    1D single-precision real-to-complex FFT plan.

func FFT1DR2C(size, batch int) FFT1DR2CPlan
    1D single-precision real-to-complex FFT plan.

func (p FFT1DR2CPlan) Destroy()
    Releases all resources associated with the FFT plan.

func (p FFT1DR2CPlan) Exec(src Float32s, dst Complex64s)
    Execute the FFT plan. Synchronized.

func (p FFT1DR2CPlan) InputLen() int
    Required length of the input array.

func (p FFT1DR2CPlan) OutputLen() int
    Required length of the output array.

func (p FFT1DR2CPlan) SetStream(stream cu.Stream)
    Associates a CUDA stream with the FFT plan. If a stream is set,
    plan.Stream().Synchronize() can then be called to wait for the execution
    to finish.

func (s FFT1DR2CPlan) Size() int
    Returns the logical size of the FFT: the number of elements (real or
    complex) it transforms.

func (p FFT1DR2CPlan) Stream() cu.Stream
    Returns the CUDA stream associated with the FFT plan.

type FFT3DC2RPlan struct {
    // contains filtered or unexported fields
}
    3D single-precision complex-to-real FFT plan.

func FFT3DC2R(Nx, Ny, Nz int) FFT3DC2RPlan
    3D single-precision complex-to-real FFT plan.

func (p FFT3DC2RPlan) Destroy()
    Releases all resources associated with the FFT plan.

func (p FFT3DC2RPlan) Exec(src Complex64s, dst Float32s)
    Execute the FFT plan. src and dst are 3D arrays stored as 1D arrays.

func (p FFT3DC2RPlan) InputLen() int
    Required length of the (1D) input array.

func (p FFT3DC2RPlan) InputSize() (Nx, Ny, Nz int)
    3D size of the input array.

func (p FFT3DC2RPlan) OutputLen() int
    Required length of the (1D) output array.

func (p FFT3DC2RPlan) OutputSize() (Nx, Ny, Nz int)
    3D size of the output array.
func (p FFT3DC2RPlan) SetStream(stream cu.Stream)
    Associates a CUDA stream with the FFT plan. If a stream is set,
    plan.Stream().Synchronize() can then be called to wait for the execution
    to finish.

func (s FFT3DC2RPlan) Size() (Nx, Ny, Nz int)
    Returns the logical size of the FFT: the number of elements (real or
    complex) it transforms.

func (p FFT3DC2RPlan) Stream() cu.Stream
    Returns the CUDA stream associated with the FFT plan.

type FFT3DD2ZPlan struct {
    // contains filtered or unexported fields
}
    3D double-precision real-to-complex FFT plan.

func FFT3DD2Z(Nx, Ny, Nz int) FFT3DD2ZPlan
    3D double-precision real-to-complex FFT plan.

func (p FFT3DD2ZPlan) Destroy()
    Releases all resources associated with the FFT plan.

func (p FFT3DD2ZPlan) Exec(src Float64s, dst Complex128s)
    Execute the FFT plan. Synchronized. src and dst are 3D arrays stored as
    1D arrays.

func (p FFT3DD2ZPlan) InputLen() int
    Required length of the (1D) input array.

func (p FFT3DD2ZPlan) InputSize() (Nx, Ny, Nz int)
    3D size of the input array.

func (p FFT3DD2ZPlan) OutputLen() int
    Required length of the (1D) output array.

func (p FFT3DD2ZPlan) OutputSize() (Nx, Ny, Nz int)
    3D size of the output array.

func (p FFT3DD2ZPlan) SetStream(stream cu.Stream)
    Associates a CUDA stream with the FFT plan. If a stream is set,
    plan.Stream().Synchronize() can then be called to wait for the execution
    to finish.

func (s FFT3DD2ZPlan) Size() (Nx, Ny, Nz int)
    Returns the logical size of the FFT: the number of elements (real or
    complex) it transforms.

func (p FFT3DD2ZPlan) Stream() cu.Stream
    Returns the CUDA stream associated with the FFT plan.

type FFT3DR2CPlan struct {
    // contains filtered or unexported fields
}
    3D single-precision real-to-complex FFT plan.

func FFT3DR2C(Nx, Ny, Nz int) FFT3DR2CPlan
    3D single-precision real-to-complex FFT plan.

func (p FFT3DR2CPlan) Destroy()
    Releases all resources associated with the FFT plan.

func (p FFT3DR2CPlan) Exec(src Float32s, dst Complex64s)
    Execute the FFT plan. Synchronized.
    src and dst are 3D arrays stored as 1D arrays.

func (p FFT3DR2CPlan) InputLen() int
    Required length of the (1D) input array.

func (p FFT3DR2CPlan) InputSize() (Nx, Ny, Nz int)
    3D size of the input array.

func (p FFT3DR2CPlan) OutputLen() int
    Required length of the (1D) output array.

func (p FFT3DR2CPlan) OutputSize() (Nx, Ny, Nz int)
    3D size of the output array.

func (p FFT3DR2CPlan) SetStream(stream cu.Stream)
    Associates a CUDA stream with the FFT plan. If a stream is set,
    plan.Stream().Synchronize() can then be called to wait for the execution
    to finish.

func (s FFT3DR2CPlan) Size() (Nx, Ny, Nz int)
    Returns the logical size of the FFT: the number of elements (real or
    complex) it transforms.

func (p FFT3DR2CPlan) Stream() cu.Stream
    Returns the CUDA stream associated with the FFT plan.

type FFT3DZ2DPlan struct {
    // contains filtered or unexported fields
}
    3D double-precision complex-to-real FFT plan.

func FFT3DZ2D(Nx, Ny, Nz int) FFT3DZ2DPlan
    3D double-precision complex-to-real FFT plan.

func (p FFT3DZ2DPlan) Destroy()
    Releases all resources associated with the FFT plan.

func (p FFT3DZ2DPlan) Exec(src Complex128s, dst Float64s)
    Execute the FFT plan. Synchronized. src and dst are 3D arrays stored as
    1D arrays.

func (p FFT3DZ2DPlan) InputLen() int
    Required length of the (1D) input array.

func (p FFT3DZ2DPlan) InputSize() (Nx, Ny, Nz int)
    3D size of the input array.

func (p FFT3DZ2DPlan) OutputLen() int
    Required length of the (1D) output array.

func (p FFT3DZ2DPlan) OutputSize() (Nx, Ny, Nz int)
    3D size of the output array.

func (p FFT3DZ2DPlan) SetStream(stream cu.Stream)
    Associates a CUDA stream with the FFT plan. If a stream is set,
    plan.Stream().Synchronize() can then be called to wait for the execution
    to finish.

func (s FFT3DZ2DPlan) Size() (Nx, Ny, Nz int)
    Returns the logical size of the FFT: the number of elements (real or
    complex) it transforms.

func (p FFT3DZ2DPlan) Stream() cu.Stream
    Returns the CUDA stream associated with the FFT plan.
type Float32s struct {
    // contains filtered or unexported fields
}
    Slice of float32's on the GPU.

func MakeFloat32s(len_ int) Float32s
    Make a slice of float32's on the GPU. Initialized to zero.

func (s *Float32s) Cap() int
    Slice capacity.

func (s Float32s) Complex() Complex64s
    Re-interpret the array as complex numbers, in interleaved format.
    Underlying storage is shared.

func (dst Float32s) CopyDtoD(src Float32s)
    Copy src on device to dst on device.

func (dst Float32s) CopyDtoDAsync(src Float32s, stream cu.Stream)
    Copy src on device to dst on device, asynchronously.

func (src Float32s) CopyDtoH(dst []float32)
    Copy src from device to dst on host.

func (src Float32s) CopyDtoHAsync(dst []float32, stream cu.Stream)
    Copy src from device to dst on host, asynchronously.

func (dst Float32s) CopyHtoD(src []float32)
    Copy src from host to dst on the device.

func (dst Float32s) CopyHtoDAsync(src []float32, stream cu.Stream)
    Copy src from host to dst on the device, asynchronously.

func (s *Float32s) Free()
    Free the underlying storage. To be used with care. Free() should only be
    called on a slice created by MakeXXX(), not on a slice created by
    x.Slice(). Freeing a slice invalidates all other slices referring to it.

func (src Float32s) Host() []float32
    Returns a fresh copy on host.

func (s *Float32s) Len() int
    Slice length (number of elements).

func (s Float32s) Memset(value float32)
    Set the entire slice to this value.

func (s Float32s) MemsetAsync(value float32, stream cu.Stream)
    Set the entire slice to this value, asynchronously.

func (s *Float32s) Pointer() cu.DevicePtr
    Pointer to the first element.

func (s Float32s) Slice(start, stop int) Float32s
    Return a slice from start (inclusive) to stop (exclusive), sharing the
    underlying storage with the original slice. Slices obtained in this way
    should not be Free()'d

func (s *Float32s) UnsafeSet(pointer unsafe.Pointer, length, capacity int)
    Manually set the pointer, length and capacity. Side-steps the security
    mechanisms, use with caution.
type Float64s struct {
    // contains filtered or unexported fields
}
    Slice of float64's on the GPU.

func MakeFloat64s(len_ int) Float64s
    Make a slice of float64's on the GPU. Initialized to zero.

func (s *Float64s) Cap() int
    Slice capacity.

func (s Float64s) Complex() Complex128s
    Re-interpret the array as complex numbers, in interleaved format.
    Underlying storage is shared.

func (dst Float64s) CopyDtoD(src Float64s)
    Copy src on device to dst on device.

func (dst Float64s) CopyDtoDAsync(src Float64s, stream cu.Stream)
    Copy src on device to dst on device, asynchronously.

func (src Float64s) CopyDtoH(dst []float64)
    Copy src from device to dst on host.

func (src Float64s) CopyDtoHAsync(dst []float64, stream cu.Stream)
    Copy src from device to dst on host, asynchronously.

func (dst Float64s) CopyHtoD(src []float64)
    Copy src from host to dst on the device.

func (dst Float64s) CopyHtoDAsync(src []float64, stream cu.Stream)
    Copy src from host to dst on the device, asynchronously.

func (s *Float64s) Free()
    Free the underlying storage. To be used with care. Free() should only be
    called on a slice created by MakeXXX(), not on a slice created by
    x.Slice(). Freeing a slice invalidates all other slices referring to it.

func (src Float64s) Host() []float64
    Returns a fresh copy on host.

func (s *Float64s) Len() int
    Slice length (number of elements).

func (s *Float64s) Pointer() cu.DevicePtr
    Pointer to the first element.

func (s Float64s) Slice(start, stop int) Float64s
    Return a slice from start (inclusive) to stop (exclusive), sharing the
    underlying storage with the original slice. Slices obtained in this way
    should not be Free()'d

func (s *Float64s) UnsafeSet(pointer unsafe.Pointer, length, capacity int)
    Manually set the pointer, length and capacity. Side-steps the security
    mechanisms, use with caution.
================================================
FILE: safe/complex128s.go
================================================
package safe

import (
	"github.com/barnex/cuda5/cu"
	"unsafe"
)

// Slice of complex128's on the GPU.
type Complex128s struct{ slice }

// Make a slice of complex128's on the GPU.
// Initialized to zero.
func MakeComplex128s(len_ int) Complex128s {
	return Complex128s{makeslice(len_, cu.SIZEOF_COMPLEX128)}
}

// Return a slice from start (inclusive) to stop (exclusive),
// sharing the underlying storage with the original slice.
// Slices obtained in this way should not be Free()'d
func (s Complex128s) Slice(start, stop int) Complex128s {
	return Complex128s{s.slice.slice(start, stop, cu.SIZEOF_COMPLEX128)}
}

// Copy src from host to dst on the device.
func (dst Complex128s) CopyHtoD(src []complex128) {
	dst.copyHtoD(unsafe.Pointer(&src[0]), len(src), cu.SIZEOF_COMPLEX128)
}

// Copy src from device to dst on host.
func (src Complex128s) CopyDtoH(dst []complex128) {
	src.copyDtoH(unsafe.Pointer(&dst[0]), len(dst), cu.SIZEOF_COMPLEX128)
}

// Copy src on device to dst on device.
func (dst Complex128s) CopyDtoD(src Complex128s) {
	dst.copyDtoD(&src.slice, cu.SIZEOF_COMPLEX128)
}

// Copy src from host to dst on the device, asynchronously.
func (dst Complex128s) CopyHtoDAsync(src []complex128, stream cu.Stream) {
	dst.copyHtoDAsync(unsafe.Pointer(&src[0]), len(src), cu.SIZEOF_COMPLEX128, stream)
}

// Copy src from device to dst on host, asynchronously.
func (src Complex128s) CopyDtoHAsync(dst []complex128, stream cu.Stream) {
	src.copyDtoHAsync(unsafe.Pointer(&dst[0]), len(dst), cu.SIZEOF_COMPLEX128, stream)
}

// Copy src on device to dst on device, asynchronously.
func (dst Complex128s) CopyDtoDAsync(src Complex128s, stream cu.Stream) {
	dst.copyDtoDAsync(&src.slice, cu.SIZEOF_COMPLEX128, stream)
}

// Returns a fresh copy on host.
func (src Complex128s) Host() []complex128 {
	cpy := make([]complex128, src.Len())
	src.CopyDtoH(cpy)
	return cpy
}

// Re-interpret the array as float numbers,
// in interleaved format. Underlying storage
// is shared.
func (s Complex128s) Float() Float64s {
	return Float64s{slice{s.ptr_, s.len_ * 2, s.cap_ * 2}}
}

================================================
FILE: safe/complex128s_test.go
================================================
package safe

import (
	"reflect"
	"testing"
)

func TestComplex128sSlice(test *testing.T) {
	InitCuda()
	a := MakeComplex128s(100)
	defer a.Free()
	if !reflect.DeepEqual(a.Host(), make([]complex128, 100)) {
		test.Error(a.Host())
	}
	b := make([]complex128, 100)
	if a.Len() != len(b) {
		test.Error("len:", a.Len(), "!=", cap(b))
	}
	if a.Cap() != cap(b) {
		test.Error("cap:", a.Cap(), "!=", cap(b))
	}
	c := a.Slice(20, 30)
	d := b[20:30]
	if c.Len() != len(d) {
		test.Error("sliced len:", c.Len(), "!=", cap(d))
	}
	if c.Cap() != cap(d) {
		test.Error("sliced cap:", c.Cap(), "!=", cap(d))
	}
	e := a.Slice(0, 50)
	f := b[0:50]
	if e.Len() != len(f) {
		test.Error("sliced len:", e.Len(), "!=", cap(f))
	}
	if e.Cap() != cap(f) {
		test.Error("sliced cap:", e.Cap(), "!=", cap(f))
	}
}

func TestComplex128sPanic1(test *testing.T) {
	InitCuda()
	defer func() {
		err := recover()
		test.Log("recovered:", err)
		if err == nil {
			test.Fail()
		}
	}()
	a := MakeComplex128s(100)
	defer a.Free()
	a.Slice(-1, 10)
}

func TestComplex128sPanic2(test *testing.T) {
	InitCuda()
	defer func() {
		err := recover()
		test.Log("recovered:", err)
		if err == nil {
			test.Fail()
		}
	}()
	a := MakeComplex128s(100)
	defer a.Free()
	a.Slice(0, 101)
}

func TestComplex128sCopy(test *testing.T) {
	InitCuda()
	a := make([]complex128, 100)
	b := MakeComplex128s(100)
	defer b.Free()
	c := MakeComplex128s(100)
	defer c.Free()
	d := make([]complex128, 200)
	for i := range a {
		a[i] = complex(float64(i), float64(2*i))
	}
	b.CopyHtoD(a)
	c.CopyDtoD(b)
	c.CopyDtoH(d[:100])
	if !reflect.DeepEqual(a, d[:100]) {
		test.Error(d)
	}
	if !reflect.DeepEqual(d[100:], make([]complex128, 100)) {
		test.Error(d)
	}
}

================================================
FILE: safe/complex64s.go
================================================
package safe

import (
	"github.com/barnex/cuda5/cu"
	"unsafe"
)

// Slice of complex64's on the GPU.
type Complex64s struct{ slice }

// Make a slice of complex64's on the GPU.
// Initialized to zero.
func MakeComplex64s(len_ int) Complex64s {
	return Complex64s{makeslice(len_, cu.SIZEOF_COMPLEX64)}
}

// Return a slice from start (inclusive) to stop (exclusive),
// sharing the underlying storage with the original slice.
// Slices obtained in this way should not be Free()'d
func (s Complex64s) Slice(start, stop int) Complex64s {
	return Complex64s{s.slice.slice(start, stop, cu.SIZEOF_COMPLEX64)}
}

// Copy src from host to dst on the device.
func (dst Complex64s) CopyHtoD(src []complex64) {
	dst.copyHtoD(unsafe.Pointer(&src[0]), len(src), cu.SIZEOF_COMPLEX64)
}

// Copy src from device to dst on host.
func (src Complex64s) CopyDtoH(dst []complex64) {
	src.copyDtoH(unsafe.Pointer(&dst[0]), len(dst), cu.SIZEOF_COMPLEX64)
}

// Copy src on device to dst on device.
func (dst Complex64s) CopyDtoD(src Complex64s) {
	dst.copyDtoD(&src.slice, cu.SIZEOF_COMPLEX64)
}

// Copy src from host to dst on the device, asynchronously.
func (dst Complex64s) CopyHtoDAsync(src []complex64, stream cu.Stream) {
	dst.copyHtoDAsync(unsafe.Pointer(&src[0]), len(src), cu.SIZEOF_COMPLEX64, stream)
}

// Copy src from device to dst on host, asynchronously.
func (src Complex64s) CopyDtoHAsync(dst []complex64, stream cu.Stream) {
	src.copyDtoHAsync(unsafe.Pointer(&dst[0]), len(dst), cu.SIZEOF_COMPLEX64, stream)
}

// Copy src on device to dst on device, asynchronously.
func (dst Complex64s) CopyDtoDAsync(src Complex64s, stream cu.Stream) {
	dst.copyDtoDAsync(&src.slice, cu.SIZEOF_COMPLEX64, stream)
}

// Returns a fresh copy on host.
func (src Complex64s) Host() []complex64 {
	cpy := make([]complex64, src.Len())
	src.CopyDtoH(cpy)
	return cpy
}

// Re-interpret the array as float numbers,
// in interleaved format. Underlying storage
// is shared.
func (s Complex64s) Float() Float32s {
	return Float32s{slice{s.ptr_, s.len_ * 2, s.cap_ * 2}}
}

================================================
FILE: safe/complex64s_test.go
================================================
package safe

import (
	"reflect"
	"testing"
)

func TestComplex64sSlice(test *testing.T) {
	InitCuda()
	a := MakeComplex64s(100)
	defer a.Free()
	if !reflect.DeepEqual(a.Host(), make([]complex64, 100)) {
		test.Error(a.Host())
	}
	b := make([]complex64, 100)
	if a.Len() != len(b) {
		test.Error("len:", a.Len(), "!=", cap(b))
	}
	if a.Cap() != cap(b) {
		test.Error("cap:", a.Cap(), "!=", cap(b))
	}
	c := a.Slice(20, 30)
	d := b[20:30]
	if c.Len() != len(d) {
		test.Error("sliced len:", c.Len(), "!=", cap(d))
	}
	if c.Cap() != cap(d) {
		test.Error("sliced cap:", c.Cap(), "!=", cap(d))
	}
	e := a.Slice(0, 50)
	f := b[0:50]
	if e.Len() != len(f) {
		test.Error("sliced len:", e.Len(), "!=", cap(f))
	}
	if e.Cap() != cap(f) {
		test.Error("sliced cap:", e.Cap(), "!=", cap(f))
	}
}

func TestComplex64sPanic1(test *testing.T) {
	InitCuda()
	defer func() {
		err := recover()
		test.Log("recovered:", err)
		if err == nil {
			test.Fail()
		}
	}()
	a := MakeComplex64s(100)
	defer a.Free()
	a.Slice(-1, 10)
}

func TestComplex64sPanic2(test *testing.T) {
	InitCuda()
	defer func() {
		err := recover()
		test.Log("recovered:", err)
		if err == nil {
			test.Fail()
		}
	}()
	a := MakeComplex64s(100)
	defer a.Free()
	a.Slice(0, 101)
}

func TestComplex64sCopy(test *testing.T) {
	InitCuda()
	a := make([]complex64, 100)
	b := MakeComplex64s(100)
	defer b.Free()
	c := MakeComplex64s(100)
	defer c.Free()
	d := make([]complex64, 200)
	for i := range a {
		a[i] = complex(float32(i), float32(2*i))
	}
	b.CopyHtoD(a)
	c.CopyDtoD(b)
	c.CopyDtoH(d[:100])
	if !reflect.DeepEqual(a, d[:100]) {
		test.Error(d)
	}
	if !reflect.DeepEqual(d[100:], make([]complex64, 100)) {
		test.Error(d)
	}
}

================================================
FILE: safe/doc.go
================================================
/*
Safe and more idiomatic wrappers for the low-level CUDA functions.
*/
package safe

================================================
FILE: safe/fft1d_test.go
================================================
package safe

import (
	"fmt"
)

func ExampleFFT1DR2C() {
	InitCuda()
	N := 8
	batch := 1
	fft := FFT1DR2C(N, batch)
	defer fft.Destroy()
	input := MakeFloat32s(N)
	defer input.Free()
	input.CopyHtoD([]float32{1, 0, 0, 0, 0, 0, 0, 0})
	output := MakeComplex64s(fft.OutputLen())
	defer output.Free()
	fft.Exec(input, output)
	fmt.Println("input:", input.Host())
	fmt.Println("output:", output.Host())

	// Output:
	// input: [1 0 0 0 0 0 0 0]
	// output: [(1+0i) (+1+0i) (+1+0i) (+1-0i) (+1+0i)]
}

func ExampleFFT1DR2C_Inplace() {
	InitCuda()
	N := 8
	batch := 2
	fft := FFT1DR2C(N, batch)
	defer fft.Destroy()
	output := MakeComplex64s(fft.OutputLen())
	defer output.Free()
	input := output.Float().Slice(0, fft.InputLen())
	// input uses same layout as out-of-place transform
	// (CUFFT native layout)
	input.CopyHtoD([]float32{1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0})
	fmt.Println("input:", input.Host())
	fft.Exec(input, output)
	fmt.Println("output:", output.Host())
	inverse := FFT1DC2R(N, batch)
	defer inverse.Destroy()
	inverse.Exec(output, input)
	fmt.Println("input:", input.Host())

	// Output:
	// input: [1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0]
	// output: [(1+0i) (+1+0i) (+1+0i) (+1-0i) (+1+0i) (+1+0i) (+1+0i) (+1+0i) (+1-0i) (+1+0i)]
	// input: [8 0 0 0 0 0 0 0 8 0 0 0 0 0 0 0]
}

func ExampleFFT1DC2R() {
	InitCuda()
	N := 8
	batch := 1
	fft := FFT1DC2R(N, batch)
	defer fft.Destroy()
	input := MakeComplex64s(fft.InputLen())
	defer input.Free()
	input.CopyHtoD([]complex64{(1 + 0i), (+1 + 0i), (+1 + 0i), (+1 - 0i), (+1 + 0i)})
	output := MakeFloat32s(fft.OutputLen())
	defer output.Free()
	fft.Exec(input, output)
	fmt.Println("input:", input.Host())
	fmt.Println("output:", output.Host())

	// Output:
	// input: [(1+0i) (+1+0i) (+1+0i) (+1+0i) (+1+0i)]
	// output: [8 0 0 0 0 0 0 0]
}

================================================
FILE: safe/fft1dc2r.go
================================================
package safe

import (
	"fmt"
	"github.com/barnex/cuda5/cufft"
)

// 1D single-precision complex-to-real FFT plan.
type FFT1DC2RPlan struct {
	fftplan
	size1D
	batch int
}

// 1D single-precision complex-to-real FFT plan.
func FFT1DC2R(size, batch int) FFT1DC2RPlan {
	handle := cufft.Plan1d(size, cufft.C2R, batch)
	handle.SetCompatibilityMode(cufft.COMPATIBILITY_NATIVE)
	return FFT1DC2RPlan{fftplan{handle, 0}, size1D(size), batch}
}

// Execute the FFT plan. Synchronized.
func (p FFT1DC2RPlan) Exec(src Complex64s, dst Float32s) {
	oksrclen := p.InputLen()
	if src.Len() != oksrclen {
		panic(fmt.Errorf("size mismatch: expecting src len %v, got %v", oksrclen, src.Len()))
	}
	okdstlen := p.OutputLen()
	if dst.Len() != okdstlen {
		panic(fmt.Errorf("size mismatch: expecting dst len %v, got %v", okdstlen, dst.Len()))
	}
	p.handle.ExecC2R(src.Pointer(), dst.Pointer())
	p.stream.Synchronize() //!
}

// Required length of the output array.
func (p FFT1DC2RPlan) OutputLen() int {
	return p.batch * p.Size()
}

// Required length of the input array.
func (p FFT1DC2RPlan) InputLen() int {
	return p.batch * (p.Size()/2 + 1)
}

================================================
FILE: safe/fft1dr2c.go
================================================
package safe

import (
	"fmt"
	"github.com/barnex/cuda5/cufft"
)

// 1D single-precision real-to-complex FFT plan.
type FFT1DR2CPlan struct {
	fftplan
	size1D
	batch int
}

// 1D single-precision real-to-complex FFT plan.
func FFT1DR2C(size, batch int) FFT1DR2CPlan {
	handle := cufft.Plan1d(size, cufft.R2C, batch)
	handle.SetCompatibilityMode(cufft.COMPATIBILITY_NATIVE)
	return FFT1DR2CPlan{fftplan{handle, 0}, size1D(size), batch}
}

// Execute the FFT plan. Synchronized.
func (p FFT1DR2CPlan) Exec(src Float32s, dst Complex64s) {
	oksrclen := p.InputLen()
	if src.Len() != oksrclen {
		panic(fmt.Errorf("size mismatch: expecting src len %v, got %v", oksrclen, src.Len()))
	}
	okdstlen := p.OutputLen()
	if dst.Len() != okdstlen {
		panic(fmt.Errorf("size mismatch: expecting dst len %v, got %v", okdstlen, dst.Len()))
	}
	p.handle.ExecR2C(src.Pointer(), dst.Pointer())
	p.stream.Synchronize() //!
}

// Required length of the input array.
func (p FFT1DR2CPlan) InputLen() int {
	return p.batch * p.Size()
}

// Required length of the output array.
func (p FFT1DR2CPlan) OutputLen() int {
	return p.batch * (p.Size()/2 + 1)
}

================================================
FILE: safe/fft3d_test.go
================================================
package safe

import (
	"fmt"
)

func ExampleFFT3DR2C() {
	InitCuda()
	Nx, Ny, Nz := 2, 4, 8
	fft := FFT3DR2C(Nx, Ny, Nz)
	defer fft.Destroy()
	input := MakeFloat32s(fft.InputLen())
	defer input.Free()
	inputData := make([]float32, Nx*Ny*Nz)
	inputData[0*Ny*Nz] = 1
	inputData[1*Ny*Nz] = 1
	input.CopyHtoD(inputData)
	output := MakeComplex64s(fft.OutputLen())
	defer output.Free()
	fft.Exec(input, output)
	fmt.Println("input:", Reshape3DFloat32(input.Host(), Nx, Ny, Nz))
	Ox, Oy, Oz := fft.OutputSize()
	fmt.Println("output:", Reshape3DComplex64(output.Host(), Ox, Oy, Oz))

	// Output:
	// input: [[[1 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]] [[1 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]]]
	// output: [[[(2+0i) (+2+0i) (+2+0i) (+2-0i) (+2+0i)] [(+2+0i) (+2+0i) (+2+0i) (+2-0i) (+2+0i)] [(+2+0i) (+2+0i) (+2+0i) (+2-0i) (+2+0i)] [(+2+0i) (+2+0i) (+2+0i) (+2-0i) (+2+0i)]] [[(+0+0i) (+0+0i) (+0+0i) (+0-0i) (+0+0i)] [(+0+0i) (+0+0i) (+0+0i) (+0-0i) (+0+0i)] [(+0+0i) (+0+0i) (+0+0i) (+0-0i) (+0+0i)] [(+0+0i) (+0+0i) (+0+0i) (+0-0i) (+0+0i)]]]
}

func ExampleFFT3DC2R() {
	InitCuda()
	Nx, Ny, Nz := 2, 4, 8
	fft := FFT3DC2R(Nx, Ny, Nz)
	defer fft.Destroy()
	input := MakeComplex64s(fft.InputLen())
	defer input.Free()
	inputData := make([]complex64, fft.InputLen())
	for i := range inputData {
		inputData[i] = 2
	}
	input.CopyHtoD(inputData)
	output := MakeFloat32s(fft.OutputLen())
	defer output.Free()
	fft.Exec(input, output)
	Ix, Iy, Iz := fft.InputSize()
	fmt.Println("input:", Reshape3DComplex64(input.Host(), Ix, Iy, Iz))
	fmt.Println("output:", Reshape3DFloat32(output.Host(), Nx, Ny, Nz))

	// Output:
	// input: [[[(2+0i) (+2+0i) (+2+0i) (+2+0i) (+2+0i)] [(+2+0i) (+2+0i) (+2+0i) (+2+0i) (+2+0i)] [(+2+0i) (+2+0i) (+2+0i) (+2+0i) (+2+0i)] [(+2+0i) (+2+0i) (+2+0i) (+2+0i) (+2+0i)]] [[(+2+0i) (+2+0i) (+2+0i) (+2+0i) (+2+0i)] [(+2+0i) (+2+0i) (+2+0i) (+2+0i) (+2+0i)] [(+2+0i) (+2+0i) (+2+0i) (+2+0i) (+2+0i)] [(+2+0i) (+2+0i) (+2+0i) (+2+0i) (+2+0i)]]]
	// output: [[[128 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]] [[0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]]]
}

func ExampleFFT3D() {
	InitCuda()
	Nx, Ny, Nz := 2, 4, 8
	forward := FFT3DR2C(Nx, Ny, Nz)
	defer forward.Destroy()
	input := MakeFloat32s(forward.InputLen())
	defer input.Free()
	inputData := make([]float32, forward.InputLen())
	inputData[5] = 1
	input.CopyHtoD(inputData)
	output := MakeComplex64s(forward.OutputLen())
	defer output.Free()
	forward.Exec(input, output)
	backward := FFT3DC2R(Nx, Ny, Nz)
	backward.Exec(output, input)
	fmt.Println("input:", Reshape3DFloat32(inputData, Nx, Ny, Nz))
	fmt.Println("forward+inverse:", Reshape3DFloat32(input.Host(), Nx, Ny, Nz))

	// Output:
	// input: [[[0 0 0 0 0 1 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]] [[0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]]]
	// forward+inverse: [[[0 0 0 0 0 64 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]] [[0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]]]
}

//func ExampleFFT3D64() {
//	InitCuda()
//
//	Nx, Ny, Nz := 2, 4, 8
//
//	forward := FFT3DD2Z(Nx, Ny, Nz)
//	defer forward.Destroy()
//
//	input := MakeFloat64s(forward.InputLen())
//	defer input.Free()
//
//	inputData := make([]float64, forward.InputLen())
//	inputData[5] = 1
//	input.CopyHtoD(inputData)
//
//	output := MakeComplex128s(forward.OutputLen())
//	defer output.Free()
//
//	forward.Exec(input, output)
//
//	backward := FFT3DZ2D(Nx, Ny, Nz)
//	backward.Exec(output, input)
//
//	fmt.Println("input:", Reshape3DFloat64(inputData, Nx, Ny, Nz))
//	fmt.Println("forward+inverse:", Reshape3DFloat64(input.Host(), Nx, Ny, Nz))
//
//	// Output:
//	// input: [[[0 0 0 0 0 1 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]] [[0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]]]
//	// forward+inverse: [[[0 0 0 0 0 64 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]] [[0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0]]]
//}

================================================
FILE: safe/fft3dc2r.go
================================================
package safe

import (
	"fmt"
	"github.com/barnex/cuda5/cufft"
)

// 3D single-precision complex-to-real FFT plan.
type FFT3DC2RPlan struct {
	fftplan
	size3D
}

// 3D single-precision complex-to-real FFT plan.
func FFT3DC2R(Nx, Ny, Nz int) FFT3DC2RPlan {
	handle := cufft.Plan3d(Nx, Ny, Nz, cufft.C2R)
	handle.SetCompatibilityMode(cufft.COMPATIBILITY_NATIVE)
	return FFT3DC2RPlan{fftplan{handle, 0}, size3D{Nx, Ny, Nz}}
}

// Execute the FFT plan. Synchronized.
// src and dst are 3D arrays stored as 1D arrays.
func (p FFT3DC2RPlan) Exec(src Complex64s, dst Float32s) {
	oksrclen := p.InputLen()
	if src.Len() != oksrclen {
		panic(fmt.Errorf("size mismatch: expecting src len %v, got %v", oksrclen, src.Len()))
	}
	okdstlen := p.OutputLen()
	if dst.Len() != okdstlen {
		panic(fmt.Errorf("size mismatch: expecting dst len %v, got %v", okdstlen, dst.Len()))
	}
	p.handle.ExecC2R(src.Pointer(), dst.Pointer())
	p.stream.Synchronize() //!
}

// 3D size of the input array.
func (p FFT3DC2RPlan) InputSize() (Nx, Ny, Nz int) {
	return p.size3D[0], p.size3D[1], p.size3D[2]/2 + 1
}

// 3D size of the output array.
func (p FFT3DC2RPlan) OutputSize() (Nx, Ny, Nz int) {
	return p.size3D[0], p.size3D[1], p.size3D[2]
}

// Required length of the (1D) input array.
func (p FFT3DC2RPlan) InputLen() int {
	return prod3(p.InputSize())
}

// Required length of the (1D) output array.
func (p FFT3DC2RPlan) OutputLen() int {
	return prod3(p.OutputSize())
}

================================================
FILE: safe/fft3dd2z.go
================================================
package safe

import (
	"fmt"
	"github.com/barnex/cuda5/cufft"
)

// 3D double-precision real-to-complex FFT plan.
type FFT3DD2ZPlan struct {
	fftplan
	size3D
}

// 3D double-precision real-to-complex FFT plan.
func FFT3DD2Z(Nx, Ny, Nz int) FFT3DD2ZPlan {
	handle := cufft.Plan3d(Nx, Ny, Nz, cufft.D2Z)
	handle.SetCompatibilityMode(cufft.COMPATIBILITY_NATIVE)
	return FFT3DD2ZPlan{fftplan{handle, 0}, size3D{Nx, Ny, Nz}}
}

// Execute the FFT plan. Synchronized.
// src and dst are 3D arrays stored as 1D arrays.
func (p FFT3DD2ZPlan) Exec(src Float64s, dst Complex128s) {
	oksrclen := p.InputLen()
	if src.Len() != oksrclen {
		panic(fmt.Errorf("size mismatch: expecting src len %v, got %v", oksrclen, src.Len()))
	}
	okdstlen := p.OutputLen()
	if dst.Len() != okdstlen {
		panic(fmt.Errorf("size mismatch: expecting dst len %v, got %v", okdstlen, dst.Len()))
	}
	p.handle.ExecD2Z(src.Pointer(), dst.Pointer())
	p.stream.Synchronize() //!
}

// 3D size of the input array.
func (p FFT3DD2ZPlan) InputSize() (Nx, Ny, Nz int) {
	return p.size3D[0], p.size3D[1], p.size3D[2]
}

// 3D size of the output array.
func (p FFT3DD2ZPlan) OutputSize() (Nx, Ny, Nz int) {
	return p.size3D[0], p.size3D[1], p.size3D[2]/2 + 1
}

// Required length of the (1D) input array.
func (p FFT3DD2ZPlan) InputLen() int {
	return prod3(p.InputSize())
}

// Required length of the (1D) output array.
func (p FFT3DD2ZPlan) OutputLen() int {
	return prod3(p.OutputSize())
}

================================================
FILE: safe/fft3dr2c.go
================================================
package safe

import (
	"fmt"
	"github.com/barnex/cuda5/cufft"
)

// 3D single-precision real-to-complex FFT plan.
type FFT3DR2CPlan struct {
	fftplan
	size3D
}

// 3D single-precision real-to-complex FFT plan.
func FFT3DR2C(Nx, Ny, Nz int) FFT3DR2CPlan {
	handle := cufft.Plan3d(Nx, Ny, Nz, cufft.R2C)
	handle.SetCompatibilityMode(cufft.COMPATIBILITY_NATIVE)
	return FFT3DR2CPlan{fftplan{handle, 0}, size3D{Nx, Ny, Nz}}
}

// Execute the FFT plan. Synchronized.
// src and dst are 3D arrays stored as 1D arrays.
func (p FFT3DR2CPlan) Exec(src Float32s, dst Complex64s) {
	oksrclen := p.InputLen()
	if src.Len() != oksrclen {
		panic(fmt.Errorf("size mismatch: expecting src len %v, got %v", oksrclen, src.Len()))
	}
	okdstlen := p.OutputLen()
	if dst.Len() != okdstlen {
		panic(fmt.Errorf("size mismatch: expecting dst len %v, got %v", okdstlen, dst.Len()))
	}
	p.handle.ExecR2C(src.Pointer(), dst.Pointer())
	p.stream.Synchronize() //!
}

// 3D size of the input array.
func (p FFT3DR2CPlan) InputSize() (Nx, Ny, Nz int) {
	return p.size3D[0], p.size3D[1], p.size3D[2]
}

// 3D size of the output array.
func (p FFT3DR2CPlan) OutputSize() (Nx, Ny, Nz int) {
	return p.size3D[0], p.size3D[1], p.size3D[2]/2 + 1
}

// Required length of the (1D) input array.
func (p FFT3DR2CPlan) InputLen() int {
	return prod3(p.InputSize())
}

// Required length of the (1D) output array.
func (p FFT3DR2CPlan) OutputLen() int {
	return prod3(p.OutputSize())
}

================================================
FILE: safe/fft3dz2d.go
================================================
package safe

import (
	"fmt"
	"github.com/barnex/cuda5/cufft"
)

// 3D double-precision complex-to-real FFT plan.
type FFT3DZ2DPlan struct {
	fftplan
	size3D
}

// 3D double-precision complex-to-real FFT plan.
func FFT3DZ2D(Nx, Ny, Nz int) FFT3DZ2DPlan {
	handle := cufft.Plan3d(Nx, Ny, Nz, cufft.Z2D)
	handle.SetCompatibilityMode(cufft.COMPATIBILITY_NATIVE)
	return FFT3DZ2DPlan{fftplan{handle, 0}, size3D{Nx, Ny, Nz}}
}

// Execute the FFT plan. Synchronized.
// src and dst are 3D arrays stored as 1D arrays.
func (p FFT3DZ2DPlan) Exec(src Complex128s, dst Float64s) {
	oksrclen := p.InputLen()
	if src.Len() != oksrclen {
		panic(fmt.Errorf("size mismatch: expecting src len %v, got %v", oksrclen, src.Len()))
	}
	okdstlen := p.OutputLen()
	if dst.Len() != okdstlen {
		panic(fmt.Errorf("size mismatch: expecting dst len %v, got %v", okdstlen, dst.Len()))
	}
	p.handle.ExecZ2D(src.Pointer(), dst.Pointer())
	p.stream.Synchronize() //!
}

// 3D size of the input array.
func (p FFT3DZ2DPlan) InputSize() (Nx, Ny, Nz int) {
	return p.size3D[0], p.size3D[1], p.size3D[2]/2 + 1
}

// 3D size of the output array.
func (p FFT3DZ2DPlan) OutputSize() (Nx, Ny, Nz int) {
	return p.size3D[0], p.size3D[1], p.size3D[2]
}

// Required length of the (1D) input array.
func (p FFT3DZ2DPlan) InputLen() int {
	return prod3(p.InputSize())
}

// Required length of the (1D) output array.
func (p FFT3DZ2DPlan) OutputLen() int {
	return prod3(p.OutputSize())
}

================================================
FILE: safe/fftplan.go
================================================
package safe

// INTERNAL
// Base implementation for all FFT plans.

import (
	"github.com/barnex/cuda5/cu"
	"github.com/barnex/cuda5/cufft"
)

// Base implementation for all FFT plans.
type fftplan struct {
	handle cufft.Handle
	stream cu.Stream
}

// For the sake of embedding.
type size1D int

// Returns the logical size of the FFT:
// the number of elements (real or complex)
// it transforms.
func (s size1D) Size() int { return int(s) }

// For the sake of embedding.
type size3D [3]int

// Returns the logical size of the FFT:
// the number of elements (real or complex)
// it transforms.
func (s size3D) Size() (Nx, Ny, Nz int) { return s[0], s[1], s[2] }

// prod3 returns x*y*z: the number of elements in an x by y by z array.
func prod3(x, y, z int) int { return x * y * z }

// Releases all resources associated with the FFT plan.
func (p fftplan) Destroy() { p.handle.Destroy() }

// Associates a CUDA stream with the FFT plan.
// If a stream is set, plan.Stream().Synchronize() can
// be called to wait for the execution to finish.
func (p fftplan) SetStream(stream cu.Stream) {
	p.handle.SetStream(stream)
	p.stream = stream
}

// Returns the CUDA stream associated with the FFT plan.
func (p fftplan) Stream() cu.Stream { return p.stream }

================================================ FILE: safe/float32s.go ================================================

package safe

import (
	"fmt"
	"github.com/barnex/cuda5/cu"
	"math"
	"unsafe"
)

// Slice of float32's on the GPU.
type Float32s struct{ slice }

// Make a slice of float32's on the GPU.
// Initialized to zero.
func MakeFloat32s(len_ int) Float32s { return Float32s{makeslice(len_, cu.SIZEOF_FLOAT32)} }

// Return a slice from start (inclusive) to stop (exclusive),
// sharing the underlying storage with the original slice.
// Slices obtained in this way should not be Free()'d.
func (s Float32s) Slice(start, stop int) Float32s {
	return Float32s{s.slice.slice(start, stop, cu.SIZEOF_FLOAT32)}
}

// Copy src from host to dst on the device.
func (dst Float32s) CopyHtoD(src []float32) {
	dst.copyHtoD(unsafe.Pointer(&src[0]), len(src), cu.SIZEOF_FLOAT32)
}

// Copy src from device to dst on the host.
func (src Float32s) CopyDtoH(dst []float32) {
	src.copyDtoH(unsafe.Pointer(&dst[0]), len(dst), cu.SIZEOF_FLOAT32)
}

// Copy src to dst, both on the device.
func (dst Float32s) CopyDtoD(src Float32s) { dst.copyDtoD(&src.slice, cu.SIZEOF_FLOAT32) }

// Copy src from host to dst on the device, asynchronously.
func (dst Float32s) CopyHtoDAsync(src []float32, stream cu.Stream) {
	dst.copyHtoDAsync(unsafe.Pointer(&src[0]), len(src), cu.SIZEOF_FLOAT32, stream)
}

// Copy src from device to dst on the host, asynchronously.
func (src Float32s) CopyDtoHAsync(dst []float32, stream cu.Stream) {
	src.copyDtoHAsync(unsafe.Pointer(&dst[0]), len(dst), cu.SIZEOF_FLOAT32, stream)
}

// Copy src to dst, both on the device, asynchronously.
func (dst Float32s) CopyDtoDAsync(src Float32s, stream cu.Stream) {
	dst.copyDtoDAsync(&src.slice, cu.SIZEOF_FLOAT32, stream)
}

// Returns a fresh copy on host.
func (src Float32s) Host() []float32 {
	cpy := make([]float32, src.Len())
	src.CopyDtoH(cpy)
	return cpy
}

// Set the entire slice to this value. Synchronized.
func (s Float32s) Memset(value float32) {
	// MemsetD32 writes 32-bit words, so pass the raw bit pattern of value.
	cu.MemsetD32(s.Pointer(), math.Float32bits(value), int64(s.Len()))
	cu.CtxSynchronize()
}

// Set the entire slice to this value, asynchronously.
func (s Float32s) MemsetAsync(value float32, stream cu.Stream) {
	cu.MemsetD32Async(s.Pointer(), math.Float32bits(value), int64(s.Len()), stream)
}

// Re-interpret the array as complex numbers,
// in interleaved format. Underlying storage
// is shared.
func (s Float32s) Complex() Complex64s { if s.Len()%2 != 0 { panic(fmt.Errorf("complex: need even number of elements, have:%v", s.Len())) } return Complex64s{slice{s.ptr_, s.len_ / 2, s.cap_ / 2}} } ================================================ FILE: safe/float32s_test.go ================================================ package safe import ( "reflect" "testing" ) func TestFloat32sSlice(test *testing.T) { InitCuda() a := MakeFloat32s(100) defer a.Free() if !reflect.DeepEqual(a.Host(), make([]float32, 100)) { test.Error(a.Host()) } b := make([]float32, 100) if a.Len() != len(b) { test.Error("len:", a.Len(), "!=", cap(b)) } if a.Cap() != cap(b) { test.Error("cap:", a.Cap(), "!=", cap(b)) } c := a.Slice(20, 30) d := b[20:30] if c.Len() != len(d) { test.Error("sliced len:", c.Len(), "!=", cap(d)) } if c.Cap() != cap(d) { test.Error("sliced cap:", c.Cap(), "!=", cap(d)) } e := a.Slice(0, 50) f := b[0:50] if e.Len() != len(f) { test.Error("sliced len:", e.Len(), "!=", cap(f)) } if e.Cap() != cap(f) { test.Error("sliced cap:", e.Cap(), "!=", cap(f)) } } func TestFloat32sPanic1(test *testing.T) { InitCuda() defer func() { err := recover() test.Log("recovered:", err) if err == nil { test.Fail() } }() a := MakeFloat32s(100) defer a.Free() a.Slice(-1, 10) } func TestFloat32sPanic2(test *testing.T) { InitCuda() defer func() { err := recover() test.Log("recovered:", err) if err == nil { test.Fail() } }() a := MakeFloat32s(100) defer a.Free() a.Slice(0, 101) } func TestFloat32sCopy(test *testing.T) { InitCuda() a := make([]float32, 100) b := MakeFloat32s(100) defer b.Free() c := MakeFloat32s(100) defer c.Free() d := make([]float32, 200) for i := range a { a[i] = float32(i) } b.CopyHtoD(a) c.CopyDtoD(b) c.CopyDtoH(d[:100]) if !reflect.DeepEqual(a, d[:100]) { test.Error(d) } if !reflect.DeepEqual(d[100:], make([]float32, 100)) { test.Error(d) } } ================================================ FILE: safe/float64s.go ================================================ package safe 
import (
	"fmt"
	"github.com/barnex/cuda5/cu"
	"unsafe"
)

// Slice of float64's on the GPU.
type Float64s struct{ slice }

// Make a slice of float64's on the GPU.
// Initialized to zero.
func MakeFloat64s(len_ int) Float64s { return Float64s{makeslice(len_, cu.SIZEOF_FLOAT64)} }

// Return a slice from start (inclusive) to stop (exclusive),
// sharing the underlying storage with the original slice.
// Slices obtained in this way should not be Free()'d.
func (s Float64s) Slice(start, stop int) Float64s {
	return Float64s{s.slice.slice(start, stop, cu.SIZEOF_FLOAT64)}
}

// Copy src from host to dst on the device.
func (dst Float64s) CopyHtoD(src []float64) {
	dst.copyHtoD(unsafe.Pointer(&src[0]), len(src), cu.SIZEOF_FLOAT64)
}

// Copy src from device to dst on the host.
func (src Float64s) CopyDtoH(dst []float64) {
	src.copyDtoH(unsafe.Pointer(&dst[0]), len(dst), cu.SIZEOF_FLOAT64)
}

// Copy src to dst, both on the device.
func (dst Float64s) CopyDtoD(src Float64s) { dst.copyDtoD(&src.slice, cu.SIZEOF_FLOAT64) }

// Copy src from host to dst on the device, asynchronously.
func (dst Float64s) CopyHtoDAsync(src []float64, stream cu.Stream) {
	dst.copyHtoDAsync(unsafe.Pointer(&src[0]), len(src), cu.SIZEOF_FLOAT64, stream)
}

// Copy src from device to dst on the host, asynchronously.
func (src Float64s) CopyDtoHAsync(dst []float64, stream cu.Stream) {
	src.copyDtoHAsync(unsafe.Pointer(&dst[0]), len(dst), cu.SIZEOF_FLOAT64, stream)
}

// Copy src to dst, both on the device, asynchronously.
func (dst Float64s) CopyDtoDAsync(src Float64s, stream cu.Stream) {
	dst.copyDtoDAsync(&src.slice, cu.SIZEOF_FLOAT64, stream)
}

// Returns a fresh copy on host.
func (src Float64s) Host() []float64 {
	cpy := make([]float64, src.Len())
	src.CopyDtoH(cpy)
	return cpy
}

// Re-interpret the array as complex numbers,
// in interleaved format. Underlying storage
// is shared.
func (s Float64s) Complex() Complex128s { if s.Len()%2 != 0 { panic(fmt.Errorf("complex: need even number of elements, have:%v", s.Len())) } return Complex128s{slice{s.ptr_, s.len_ / 2, s.cap_ / 2}} } ================================================ FILE: safe/float64s_test.go ================================================ package safe import ( "reflect" "testing" ) func TestFloat64sSlice(test *testing.T) { InitCuda() a := MakeFloat64s(100) defer a.Free() if !reflect.DeepEqual(a.Host(), make([]float64, 100)) { test.Error(a.Host()) } b := make([]float64, 100) if a.Len() != len(b) { test.Error("len:", a.Len(), "!=", cap(b)) } if a.Cap() != cap(b) { test.Error("cap:", a.Cap(), "!=", cap(b)) } c := a.Slice(20, 30) d := b[20:30] if c.Len() != len(d) { test.Error("sliced len:", c.Len(), "!=", cap(d)) } if c.Cap() != cap(d) { test.Error("sliced cap:", c.Cap(), "!=", cap(d)) } e := a.Slice(0, 50) f := b[0:50] if e.Len() != len(f) { test.Error("sliced len:", e.Len(), "!=", cap(f)) } if e.Cap() != cap(f) { test.Error("sliced cap:", e.Cap(), "!=", cap(f)) } } func TestFloat64sPanic1(test *testing.T) { InitCuda() defer func() { err := recover() test.Log("recovered:", err) if err == nil { test.Fail() } }() a := MakeFloat64s(100) defer a.Free() a.Slice(-1, 10) } func TestFloat64sPanic2(test *testing.T) { InitCuda() defer func() { err := recover() test.Log("recovered:", err) if err == nil { test.Fail() } }() a := MakeFloat64s(100) defer a.Free() a.Slice(0, 101) } func TestFloat64sCopy(test *testing.T) { InitCuda() a := make([]float64, 100) b := MakeFloat64s(100) defer b.Free() c := MakeFloat64s(100) defer c.Free() d := make([]float64, 200) for i := range a { a[i] = float64(i) } b.CopyHtoD(a) c.CopyDtoD(b) c.CopyDtoH(d[:100]) if !reflect.DeepEqual(a, d[:100]) { test.Error(d) } if !reflect.DeepEqual(d[100:], make([]float64, 100)) { test.Error(d) } } ================================================ FILE: safe/init.go ================================================ package safe 
import (
	"github.com/barnex/cuda5/cu"
	"runtime"
)

// InitCuda initializes the CUDA driver and creates a context
// that is made current on the calling goroutine's OS thread.
func InitCuda() {
	// A CUDA context is bound to an OS thread, so pin this goroutine to it.
	runtime.LockOSThread()
	cu.Init(0)
	cu.CtxCreate(cu.CTX_SCHED_AUTO, 0).SetCurrent()
}

================================================ FILE: safe/slice.go ================================================

package safe

// INTERNAL.
// This file implements common functionality for all slice types
// (Float32s, Float64s, Complex64s, ...).

import (
	"fmt"
	"github.com/barnex/cuda5/cu"
	"unsafe"
)

// internal base func for all makeXXX() functions:
// allocates len_ elements of elemsize bytes on the device,
// zero-initialized. A zero-length slice allocates no device memory.
func makeslice(len_ int, elemsize int) slice {
	bytes := int64(len_) * int64(elemsize)
	s := slice{0, len_, len_}
	if bytes > 0 {
		s.ptr_ = cu.MemAlloc(bytes)
		cu.MemsetD8(s.ptr_, 0, bytes)
		cu.CtxSynchronize() // make sure the zero-fill is done before returning
	}
	return s
}

// internal base type for all slices
type slice struct {
	ptr_ cu.DevicePtr // device address of the first element
	len_ int          // number of elements
	cap_ int          // capacity, in elements
}

// Pointer to the first element.
func (s *slice) Pointer() cu.DevicePtr { return s.ptr_ }

// Slice length (number of elements).
func (s *slice) Len() int { return s.len_ }

// Slice capacity.
func (s *slice) Cap() int { return s.cap_ }

// Free the underlying storage.
// To be used with care. Free() should only be called on
// a slice created by MakeXXX(), not on a slice created
// by x.Slice(). Freeing a slice invalidates all other
// slices referring to it.
func (s *slice) Free() { s.ptr_.Free() s.len_ = 0 s.cap_ = 0 } // internal base func for all slice() functions func (s *slice) slice(start, stop int, elemsize uintptr) slice { if start >= s.cap_ || start < 0 || stop > s.cap_ || stop < 0 { panic("cuda4/safe: slice index out of bounds") } if start > stop { panic("cuda4/safe: inverted slice range") } return slice{cu.DevicePtr(uintptr(s.ptr_) + uintptr(start)*elemsize), stop - start, s.cap_ - start} } func (dst *slice) copyHtoD(src unsafe.Pointer, srclen int, elemsize int) { if srclen != dst.Len() { panic(fmt.Errorf("cuda4/safe: len mismatch: len(src)=%v (host), dst.Len()=%v (device)", srclen, dst.Len())) } cu.MemcpyHtoD(dst.Pointer(), src, int64(elemsize)*int64(srclen)) } func (src *slice) copyDtoH(dst unsafe.Pointer, dstlen int, elemsize int) { if dstlen != src.Len() { panic(fmt.Errorf("cuda4/safe: len mismatch: src.Len()=%v (device), len(dst)=%v (host)", src.Len(), dstlen)) } cu.MemcpyDtoH(dst, src.Pointer(), int64(elemsize)*int64(dstlen)) } func (dst *slice) copyDtoD(src *slice, elemsize int) { if dst.Len() != src.Len() { panic(fmt.Errorf("cuda4/safe: len mismatch: src.Len()=%v (device), dst.Len()=%v", src.Len(), dst.Len())) } cu.MemcpyDtoD(dst.Pointer(), src.Pointer(), int64(elemsize)*int64(dst.Len())) } func (dst *slice) copyHtoDAsync(src unsafe.Pointer, srclen int, elemsize int, stream cu.Stream) { if srclen != dst.Len() { panic(fmt.Errorf("cuda4/safe: len mismatch: len(src)=%v (host), dst.Len()=%v (device)", srclen, dst.Len())) } cu.MemcpyHtoDAsync(dst.Pointer(), src, int64(elemsize)*int64(srclen), stream) } func (src *slice) copyDtoHAsync(dst unsafe.Pointer, dstlen int, elemsize int, stream cu.Stream) { if dstlen != src.Len() { panic(fmt.Errorf("cuda4/safe: len mismatch: src.Len()=%v (device), len(dst)=%v (host)", src.Len(), dstlen)) } cu.MemcpyDtoHAsync(dst, src.Pointer(), int64(elemsize)*int64(dstlen), stream) } func (dst *slice) copyDtoDAsync(src *slice, elemsize int, stream cu.Stream) { if dst.Len() != 
src.Len() { panic(fmt.Errorf("cuda4/safe: len mismatch: src.Len()=%v (device), dst.Len()=%v", src.Len(), dst.Len())) } cu.MemcpyDtoDAsync(dst.Pointer(), src.Pointer(), int64(elemsize)*int64(dst.Len()), stream) } // Manually set the pointer, length and capacity. // Side-steps the security mechanisms, use with caution. func (s *slice) UnsafeSet(pointer unsafe.Pointer, length, capacity int) { s.ptr_ = cu.DevicePtr(uintptr(pointer)) s.len_ = length s.cap_ = capacity } ================================================ FILE: safe/subs.sh ================================================ #! /bin/bash subs32='s/loat32/loat64/g;' subs32+='s/FLOAT32/FLOAT64/g;' #sed $subs32 float32s.go > float64s.go #sed $subs32 float32s_test.go > float64s_test.go subsc64='s/Float32/Complex64/g;' subsc64+='s/float32/complex64/g;' subsc64+='s/FLOAT32/COMPLEX64/g;' #sed $subsc64 float32s_test.go > complex64s_test.go #sed $subsc64 float32s.go > complex64s.go subsc128='s/omplex64/omplex128/g;' subsc128+='s/COMPLEX64/COMPLEX128/g;' sed $subsc128 complex64s.go > complex128s.go sed $subsc128 complex64s_test.go > complex128s_test.go