Experimental Nim GPU 'macro macros' for generating GPU kernels and helpers to use them. these macros accept a kernel body and produce a macro for easily using the kernel.Nim Library to enable writing CUDA and HIP kernels in Nim All cuda and hip structures and functions are re-exported and can be used
- There are 3 sets of function prefixes.
- hippo* prefixed functions are friendly nim interfaces for either HIP or CUDA
- This is the recommended way to use this library, as it is the most nim-like
- These functions check for errors and raise them as exceptions
- hip* prefixed functions are the raw HIP C++ functions
- cuda* prefixed functions are the raw CUDA C functions
Types
BlockDim {.importcpp: "const __HIP_Coordinates<__HIP_BlockDim>", header: "hip/hip_runtime.h".} = object x* {.importc: "x".}: uint32_t ## < x y* {.importc: "y".}: uint32_t ## < y z* {.importc: "z".}: uint32_t ## < z
- Source Edit
BlockIdx {.importcpp: "const __HIP_Coordinates<__HIP_BlockIdx>", header: "hip/hip_runtime.h".} = object x* {.importc: "x".}: uint32_t ## < x y* {.importc: "y".}: uint32_t ## < y z* {.importc: "z".}: uint32_t ## < z
- Source Edit
ConstCString {.importc: "const char*".} = object
- Source Edit
Dim3 {.importcpp: "dim3", header: "hip/hip_runtime.h", bycopy.} = object x* {.importc: "x".}: uint32_t ## < x y* {.importc: "y".}: uint32_t ## < y z* {.importc: "z".}: uint32_t ## < z
- Source Edit
GridDim {.importcpp: "const __HIP_Coordinates<__HIP_GridDim>", header: "hip/hip_runtime.h".} = object x* {.importc: "x".}: uint32_t ## < x y* {.importc: "y".}: uint32_t ## < y z* {.importc: "z".}: uint32_t ## < z
- Source Edit
hipDeviceProp_t {.importcpp: "hipDeviceProp_t", header: "hip/hip_runtime.h".} = object deviceOverlap*: cint
- Source Edit
hipError_t {.importcpp: "hipError_t", header: "hip/hip_runtime.h".} = cint
- Source Edit
hipEvent_t {.importcpp: "hipEvent_t", header: "hip/hip_runtime.h".} = pointer
- Source Edit
hipMemcpyKind {.size: 4, header: "hip/hip_runtime.h", importcpp: "hipMemcpyKind".} = enum hipMemcpyHostToHost = 0, ## < Host-to-Host Copy hipMemcpyHostToDevice = 1, ## < Host-to-Device Copy hipMemcpyDeviceToHost = 2, ## < Device-to-Host Copy hipMemcpyDeviceToDevice = 3, ## < Device-to-Device Copy hipMemcpyDefault = 4 ## < Runtime will automatically determine copy-kind based on virtual addresses.
- Source Edit
hipStream_t {.importcpp: "hipStream_t", header: "hip/hip_runtime.h".} = pointer
- Source Edit
Consts
HippoErrorNotReady = 600
- Source Edit
HippoEventBlockingSync = 1'u32
- Source Edit
HippoEventDefault = 0'u32
- Source Edit
HippoEventDisableTiming = 2'u32
- Source Edit
HippoEventInterprocess = 4'u32
- Source Edit
HippoMemcpyDefault = hipMemcpyDefault
- Source Edit
HippoMemcpyDeviceToDevice = hipMemcpyDeviceToDevice
- Source Edit
HippoMemcpyDeviceToHost = hipMemcpyDeviceToHost
- Source Edit
HippoMemcpyHostToDevice = hipMemcpyHostToDevice
- Source Edit
HippoMemcpyHostToHost = hipMemcpyHostToHost
- Source Edit
HippoPlatform {.strdefine.} = ""
- Source Edit
HippoRuntime {.strdefine.} = "HIP"
- Source Edit
HippoWarpSize = 32
- Warp/wavefront size for the current backend. Source Edit
Procs
proc `$`(self: ConstCString): string {....raises: [], tags: [], forbids: [].}
- Source Edit
proc `=destroy`(mem: var GpuMemory) {....raises: [], tags: [], forbids: [].}
- Automatically free device memory when the object goes out of scope Source Edit
proc floatToHalf(f: cfloat): uint16 {.header: "hip/hip_fp16.h", importcpp: "__half_raw(__float2half(#)).x", ...raises: [], tags: [], forbids: [].}
- Convert float32 to IEEE 754 half-precision (uint16) using HIP hardware intrinsic. Source Edit
proc halfToFloat(h: uint16): cfloat {.header: "hip/hip_fp16.h", importcpp: "[&]{ __half_raw r; r.x = (#); return __half2float(r); }()", ...raises: [], tags: [], forbids: [].}
- Convert IEEE 754 half-precision (uint16) to float32 using HIP hardware intrinsic. Source Edit
proc handleError(err: hipError_t) {....raises: [Exception], tags: [], forbids: [].}
- Source Edit
proc hipAtomicAdd(address: ptr int32; val: int32): int32 {. header: "hip/hip_runtime.h", importcpp: "atomicAdd(@)", ...raises: [], tags: [], forbids: [].}
- Source Edit
proc hipAtomicAdd(address: ptr uint32; val: uint32): uint32 {. header: "hip/hip_runtime.h", importcpp: "atomicAdd(@)", ...raises: [], tags: [], forbids: [].}
- Source Edit
proc hipAtomicAnd(address: ptr int32; val: int32): int32 {. header: "hip/hip_runtime.h", importcpp: "atomicAnd(@)", ...raises: [], tags: [], forbids: [].}
- Source Edit
proc hipAtomicAnd(address: ptr uint32; val: uint32): uint32 {. header: "hip/hip_runtime.h", importcpp: "atomicAnd(@)", ...raises: [], tags: [], forbids: [].}
- Source Edit
proc hipAtomicCAS(address: ptr int32; compare: int32; val: int32): int32 {. header: "hip/hip_runtime.h", importcpp: "atomicCAS(@)", ...raises: [], tags: [], forbids: [].}
- Source Edit
proc hipAtomicCAS(address: ptr uint32; compare: uint32; val: uint32): uint32 {. header: "hip/hip_runtime.h", importcpp: "atomicCAS(@)", ...raises: [], tags: [], forbids: [].}
- Source Edit
proc hipAtomicExch(address: ptr int32; val: int32): int32 {. header: "hip/hip_runtime.h", importcpp: "atomicExch(@)", ...raises: [], tags: [], forbids: [].}
- Source Edit
proc hipAtomicExch(address: ptr uint32; val: uint32): uint32 {. header: "hip/hip_runtime.h", importcpp: "atomicExch(@)", ...raises: [], tags: [], forbids: [].}
- Source Edit
proc hipAtomicMax(address: ptr int32; val: int32): int32 {. header: "hip/hip_runtime.h", importcpp: "atomicMax(@)", ...raises: [], tags: [], forbids: [].}
- Source Edit
proc hipAtomicMax(address: ptr uint32; val: uint32): uint32 {. header: "hip/hip_runtime.h", importcpp: "atomicMax(@)", ...raises: [], tags: [], forbids: [].}
- Source Edit
proc hipAtomicMin(address: ptr int32; val: int32): int32 {. header: "hip/hip_runtime.h", importcpp: "atomicMin(@)", ...raises: [], tags: [], forbids: [].}
- Source Edit
proc hipAtomicMin(address: ptr uint32; val: uint32): uint32 {. header: "hip/hip_runtime.h", importcpp: "atomicMin(@)", ...raises: [], tags: [], forbids: [].}
- Source Edit
proc hipAtomicOr(address: ptr int32; val: int32): int32 {. header: "hip/hip_runtime.h", importcpp: "atomicOr(@)", ...raises: [], tags: [], forbids: [].}
- Source Edit
proc hipAtomicOr(address: ptr uint32; val: uint32): uint32 {. header: "hip/hip_runtime.h", importcpp: "atomicOr(@)", ...raises: [], tags: [], forbids: [].}
- Source Edit
proc hipAtomicSub(address: ptr int32; val: int32): int32 {. header: "hip/hip_runtime.h", importcpp: "atomicSub(@)", ...raises: [], tags: [], forbids: [].}
- Source Edit
proc hipAtomicSub(address: ptr uint32; val: uint32): uint32 {. header: "hip/hip_runtime.h", importcpp: "atomicSub(@)", ...raises: [], tags: [], forbids: [].}
- Source Edit
proc hipAtomicXor(address: ptr int32; val: int32): int32 {. header: "hip/hip_runtime.h", importcpp: "atomicXor(@)", ...raises: [], tags: [], forbids: [].}
- Source Edit
proc hipAtomicXor(address: ptr uint32; val: uint32): uint32 {. header: "hip/hip_runtime.h", importcpp: "atomicXor(@)", ...raises: [], tags: [], forbids: [].}
- Source Edit
proc hipDeviceSynchronize(): hipError_t {.header: "hip/hip_runtime.h", importcpp: "hipDeviceSynchronize(@)", ...raises: [], tags: [], forbids: [].}
- Source Edit
proc hipEventCreate(event: ptr hipEvent_t): hipError_t {. header: "hip/hip_runtime.h", importcpp: "hipEventCreate(@)", ...raises: [], tags: [], forbids: [].}
- Source Edit
proc hipEventCreateWithFlags(event: ptr hipEvent_t; flags: uint32_t): hipError_t {. header: "hip/hip_runtime.h", importcpp: "hipEventCreateWithFlags(@)", ...raises: [], tags: [], forbids: [].}
- Source Edit
proc hipEventDestroy(event: hipEvent_t): hipError_t {. header: "hip/hip_runtime.h", importcpp: "hipEventDestroy(@)", ...raises: [], tags: [], forbids: [].}
- Source Edit
proc hipEventElapsedTime(ms: ptr cfloat; start: hipEvent_t; stop: hipEvent_t): hipError_t {. header: "hip/hip_runtime.h", importcpp: "hipEventElapsedTime(@)", ...raises: [], tags: [], forbids: [].}
- Source Edit
proc hipEventQuery(event: hipEvent_t): hipError_t {.header: "hip/hip_runtime.h", importcpp: "hipEventQuery(@)", ...raises: [], tags: [], forbids: [].}
- Source Edit
proc hipEventRecord(event: hipEvent_t; stream: hipStream_t = nil): hipError_t {. header: "hip/hip_runtime.h", importcpp: "hipEventRecord(@)", ...raises: [], tags: [], forbids: [].}
- Source Edit
proc hipEventSynchronize(event: hipEvent_t): hipError_t {. header: "hip/hip_runtime.h", importcpp: "hipEventSynchronize(@)", ...raises: [], tags: [], forbids: [].}
- Source Edit
proc hipFree(ptr: pointer): hipError_t {.header: "hip/hip_runtime.h", importcpp: "hipFree(@)", ...raises: [], tags: [], forbids: [].}
- Source Edit
proc hipGetDevice(device: ptr cint): hipError_t {.header: "hip/hip_runtime.h", importcpp: "hipGetDevice(@)", ...raises: [], tags: [], forbids: [].}
- Source Edit
proc hipGetDeviceProperties(prop: ptr hipDeviceProp_t; device: cint): hipError_t {. header: "hip/hip_runtime.h", importcpp: "hipGetDeviceProperties(@)", ...raises: [], tags: [], forbids: [].}
- Source Edit
proc hipGetErrorString(err: hipError_t): ConstCString {. header: "hip/hip_runtime.h", importcpp: "hipGetErrorString(@)", ...raises: [], tags: [], forbids: [].}
- Source Edit
proc hipGetLastError(): hipError_t {.header: "hip/hip_runtime.h", importcpp: "hipGetLastError()", ...raises: [], tags: [], forbids: [].}
- Source Edit
proc hipHostAlloc(p: ptr pointer; size: csize_t; flags: uint32_t): hipError_t {. header: "hip/hip_runtime.h", importcpp: "hipHostAlloc(@)", ...raises: [], tags: [], forbids: [].}
- Source Edit
proc hipHostFree(p: pointer): hipError_t {.header: "hip/hip_runtime.h", importcpp: "hipHostFree(@)", ...raises: [], tags: [], forbids: [].}
- Source Edit
proc hipLaunchKernel(function_address: pointer; numBlocks: Dim3; dimBlocks: Dim3; args: ptr pointer): hipError_t {. importcpp: "hipLaunchKernel(@)", header: "hip/hip_runtime.h", ...raises: [], tags: [], forbids: [].}
- Source Edit
proc hipLaunchKernel(function_address: pointer; numBlocks: Dim3; dimBlocks: Dim3; args: ptr pointer; sharedMemBytes: csize_t; stream: hipStream_t): cint {. importcpp: "hipLaunchKernel(@)", header: "hip/hip_runtime.h", ...raises: [], tags: [], forbids: [].}
- Source Edit
proc hipLaunchKernelGGL(function_address: proc; numBlocks: Dim3; dimBlocks: Dim3; sharedMemBytes: uint32_t; stream: hipStream_t) {. importcpp: "hipLaunchKernelGGL(@)", header: "hip/hip_runtime.h", varargs, ...raises: [], tags: [], forbids: [].}
- Source Edit
proc hipMalloc(ptr: ptr pointer; size: cint): hipError_t {. header: "hip/hip_runtime.h", importcpp: "hipMalloc(@)", ...raises: [], tags: [], forbids: [].}
- Source Edit
proc hipMemcpy(dst: pointer; src: pointer; size: cint; kind: hipMemcpyKind): hipError_t {. header: "hip/hip_runtime.h", importcpp: "hipMemcpy(@)", ...raises: [], tags: [], forbids: [].}
- Source Edit
proc hipMemcpyAsync(dst: pointer; src: pointer; sizeBytes: csize_t; kind: hipMemcpyKind; stream: hipStream_t): hipError_t {. header: "hip/hip_runtime.h", importcpp: "hipMemcpyAsync(@)", ...raises: [], tags: [], forbids: [].}
- Source Edit
proc hipMemcpyToSymbol(symbol: pointer; src: pointer; sizeBytes: csize_t; offset: csize_t = 0; kind: hipMemcpyKind = hipMemcpyHostToDevice): hipError_t {. header: "hip/hip_runtime.h", importcpp: "hipMemcpyToSymbol(@)", ...raises: [], tags: [], forbids: [].}
- Source Edit
proc hippoSyncthreads() {.importcpp: "__syncthreads()", header: "hip/hip_runtime.h", ...raises: [], tags: [], forbids: [].}
- Source Edit
proc hipStreamCreate(stream: ptr hipStream_t): hipError_t {. header: "hip/hip_runtime.h", importcpp: "hipStreamCreate(@)", ...raises: [], tags: [], forbids: [].}
- Source Edit
proc hipStreamDestroy(stream: hipStream_t): hipError_t {. header: "hip/hip_runtime.h", importcpp: "hipStreamDestroy(@)", ...raises: [], tags: [], forbids: [].}
- Source Edit
proc hipStreamSynchronize(stream: hipStream_t): hipError_t {. header: "hip/hip_runtime.h", importcpp: "hipStreamSynchronize(@)", ...raises: [], tags: [], forbids: [].}
- Source Edit
proc hipStreamWaitEvent(stream: hipStream_t; event: hipEvent_t; flags: uint32_t = 0'u32): hipError_t {. header: "hip/hip_runtime.h", importcpp: "hipStreamWaitEvent(@)", ...raises: [], tags: [], forbids: [].}
- Source Edit
proc hipSyncthreads() {.importcpp: "__syncthreads()", header: "hip/hip_runtime.h", ...raises: [], tags: [], forbids: [].}
- Source Edit
Converters
converter toConstCString(self: cstring): ConstCString {. importc: "(const char*)", noconv, nodecl, ...raises: [], tags: [], forbids: [].}
- Source Edit
converter toCString(self: ConstCString): cstring {.importc: "(char*)", noconv, nodecl, ...raises: [], tags: [], forbids: [].}
- Source Edit
Macros
macro generateForLoopMacro(name: static[string]; theType: typedesc; body: untyped): untyped
- Generate a GPU kernel and a host launcher proc using a custom name. Source Edit
macro hippoConstant(v: untyped): untyped
- Declared a variable as __constant__. Constants are read-only globals that are cached on-chip. constants are useful for data that is being read by all threads in a warp at the same time. if each thread in a warp accesses different addresses in constant memory, the accesses are serialized and this may cause a 16x slowdown. eg: const N {.hippoConstant.} = 1024 Source Edit
macro hippoDevice(fn: untyped): untyped
- Declare fuctions for use on the __device__ (the gpu), to be called by either device or global functions. Source Edit
macro hippoGlobal(fn: untyped): untyped
- Declare a function as __global__. global functions are called from the host and run on the device. Source Edit
macro hippoHostDevice(fn: untyped): untyped
- Declare a function as both __host__ and __device__. This is useful for functions that are usable from either the host and the device. eg: proc add(a: int, b: int) {.hippoHostDevice.} = a + b Source Edit
Templates
template hippoAtomicAdd(address: ptr [int32]; val: int32): int32
- Atomically add and return the previous value. Source Edit
template hippoAtomicAdd(address: ptr [uint32]; val: uint32): uint32
- Atomically add and return the previous value. Source Edit
template hippoAtomicAnd(address: ptr [int32]; val: int32): int32
- Atomically AND and return the previous value. Source Edit
template hippoAtomicAnd(address: ptr [uint32]; val: uint32): uint32
- Atomically AND and return the previous value. Source Edit
template hippoAtomicCAS(address: ptr [int32]; compare: int32; val: int32): int32
- Atomically compare-and-swap and return the previous value. Source Edit
template hippoAtomicCAS(address: ptr [uint32]; compare: uint32; val: uint32): uint32
- Atomically compare-and-swap and return the previous value. Source Edit
template hippoAtomicExch(address: ptr [int32]; val: int32): int32
- Atomically exchange and return the previous value. Source Edit
template hippoAtomicExch(address: ptr [uint32]; val: uint32): uint32
- Atomically exchange and return the previous value. Source Edit
template hippoAtomicMax(address: ptr [int32]; val: int32): int32
- Atomically update with max and return the previous value. Source Edit
template hippoAtomicMax(address: ptr [uint32]; val: uint32): uint32
- Atomically update with max and return the previous value. Source Edit
template hippoAtomicMin(address: ptr [int32]; val: int32): int32
- Atomically update with min and return the previous value. Source Edit
template hippoAtomicMin(address: ptr [uint32]; val: uint32): uint32
- Atomically update with min and return the previous value. Source Edit
template hippoAtomicOr(address: ptr [int32]; val: int32): int32
- Atomically OR and return the previous value. Source Edit
template hippoAtomicOr(address: ptr [uint32]; val: uint32): uint32
- Atomically OR and return the previous value. Source Edit
template hippoAtomicSub(address: ptr [int32]; val: int32): int32
- Atomically subtract and return the previous value. Source Edit
template hippoAtomicSub(address: ptr [uint32]; val: uint32): uint32
- Atomically subtract and return the previous value. Source Edit
template hippoAtomicXor(address: ptr [int32]; val: int32): int32
- Atomically XOR and return the previous value. Source Edit
template hippoAtomicXor(address: ptr [uint32]; val: uint32): uint32
- Atomically XOR and return the previous value. Source Edit
template hippoEventCreate(): HippoEvent
- Create a timing event. Source Edit
template hippoEventCreateWithFlags(flags: uint32 = 0'u32): HippoEvent
- Create a timing event with explicit runtime flags. Source Edit
template hippoEventDestroy(event: HippoEvent)
- Destroy a timing event. Source Edit
template hippoEventElapsedTime(start: HippoEvent; stop: HippoEvent): float32
- Get elapsed time between two events. Source Edit
template hippoEventQuery(event: HippoEvent): bool
- Return true when the event has completed and false when it is still pending. On HIP-CPU, this behaves as a blocking synchronize because hipEventQuery is not declared. Source Edit
template hippoEventRecord(event: HippoEvent; stream: HippoStream = nil)
- Record an event on a stream. Source Edit
template hippoEventSynchronize(event: HippoEvent)
- Wait for an event to complete. Source Edit
template hippoFloatToHalf(f: cfloat): uint16
- Convert float32 to IEEE 754 half-precision (uint16). Uses hardware intrinsic on HIP/CUDA, software fallback on SIMPLE. Source Edit
template hippoGetDevice(): cint
- Get the current device index. Source Edit
template hippoGetDeviceProperties(prop: var HippoDeviceProp; device: cint)
- Get device properties for the given device. Source Edit
template hippoHalfToFloat(h: uint16): cfloat
- Convert IEEE 754 half-precision (uint16) to float32. Uses hardware intrinsic on HIP/CUDA, software fallback on SIMPLE. Source Edit
template hippoHostAlloc(size: int): pointer
- Allocate page-locked host memory Source Edit
template hippoHostFree(p: pointer)
- Free page-locked host memory. Source Edit
template hippoLaunchKernel(kernel: untyped; ## The GPU kernel procedure to launch gridDim: Dim3 = newDim3(1, 1, 1); ## default to a grid of 1 block blockDim: Dim3 = newDim3(1, 1, 1); ## default to 1 thread per block sharedMemBytes: uint32 = 0; ## dynamic shared memory amount to allocate stream: HippoStream = nil; ## Which device stream to run under (defaults to null) args: untyped)
- Source Edit
template hippoMalloc(size: int): GpuRef
- Allocate memory on the GPU and return a GpuRef object. GpuMemory is a wrapper around Gpu allocated pointers. It will automatically free the memory when it goes out of scope. Source Edit
template hippoMemcpy(dst: GpuRef; src: GpuRef; size: int; kind: HippoMemcpyKind)
- device -> device memory copy Copy memory from src to dst. direction of device and host is determined by kind. Source Edit
template hippoMemcpy(dst: GpuRef; src: pointer; size: int; kind: HippoMemcpyKind)
- device -> host memory copy Copy memory from src to dst. direction of device and host is determined by kind. Source Edit
template hippoMemcpy(dst: pointer; src: GpuRef; size: int; kind: HippoMemcpyKind)
- host -> device memory copy Copy memory from src to dst. direction of device and host is determined by kind. Source Edit
template hippoMemcpy(dst: pointer; src: pointer; size: int; kind: HippoMemcpyKind)
- host -> host memory copy hippoMemcpy is broken out as 4 separate templates to make it easier to work with GpuRef objects Copy memory from src to dst. direction of device and host is determined by kind. Source Edit
template hippoMemcpyAsync(dst: pointer; src: pointer; size: int; kind: HippoMemcpyKind; stream: HippoStream)
- Asynchronous memory copy on a stream Source Edit
template hippoMemcpyToSymbol(symbol: untyped; src: pointer; size: int; offset: int = 0; kind: HippoMemcpyKind = HippoMemcpyHostToDevice)
- Copy data from host to a device symbol (eg: __constant__ storage). Source Edit
template hippoShflDown(val: cfloat; delta: int): cfloat
- Warp shuffle down for float32. Returns the value from the lane delta positions below the calling lane within the same warp. Source Edit
template hippoShflDown(val: cint; delta: int): cint
- Warp shuffle down for int32. Source Edit
template hippoStreamCreate(): HippoStream
- Create a new stream Source Edit
template hippoStreamDestroy(stream: HippoStream)
- Destroy a stream Source Edit
template hippoStreamSynchronize(stream: HippoStream)
- Synchronize a specific stream Source Edit
template hippoStreamWaitEvent(stream: HippoStream; event: pointer; flags: uint32 = 0'u32)
- Block stream work until an event is recorded and completed. Source Edit
template hippoSynchronize()
- Synchronize the device Source Edit