Nim Library to enable writing CUDA and HIP kernels in Nim All cuda and hip structures and functions are re-exported and can be used
- There are 3 sets of function prefixes.
- hippo* prefixed functions are friendly nim interfaces for either HIP or CUDA
- This is the recommended way to use this library, as it is the most nim-like
- These functions check for errors and raise them as exceptions
- hip* prefixed functions are the raw HIP C++ functions
- cuda* prefixed functions are the raw CUDA C functions
Types
BlockDim {.importcpp: "const __HIP_Coordinates<__HIP_BlockDim>", header: "hip/hip_runtime.h".} = object x* {.importc: "x".}: uint32_t ## < x y* {.importc: "y".}: uint32_t ## < y z* {.importc: "z".}: uint32_t ## < z
- Source Edit
BlockIdx {.importcpp: "const __HIP_Coordinates<__HIP_BlockIdx>", header: "hip/hip_runtime.h".} = object x* {.importc: "x".}: uint32_t ## < x y* {.importc: "y".}: uint32_t ## < y z* {.importc: "z".}: uint32_t ## < z
- Source Edit
ConstCString {.importc: "const char*".} = object
- Source Edit
Dim3 {.importcpp: "dim3", header: "hip/hip_runtime.h", bycopy.} = object x* {.importc: "x".}: uint32_t ## < x y* {.importc: "y".}: uint32_t ## < y z* {.importc: "z".}: uint32_t ## < z
- Source Edit
GridDim {.importcpp: "const __HIP_Coordinates<__HIP_GridDim>", header: "hip/hip_runtime.h".} = object x* {.importc: "x".}: uint32_t ## < x y* {.importc: "y".}: uint32_t ## < y z* {.importc: "z".}: uint32_t ## < z
- Source Edit
hipError_t {.importcpp: "hipError_t", header: "hip/hip_runtime.h".} = cint
- Source Edit
hipMemcpyKind {.size: 4, header: "hip/hip_runtime.h", importcpp: "hipMemcpyKind".} = enum hipMemcpyHostToHost = 0, ## < Host-to-Host Copy hipMemcpyHostToDevice = 1, ## < Host-to-Device Copy hipMemcpyDeviceToHost = 2, ## < Device-to-Host Copy hipMemcpyDeviceToDevice = 3, ## < Device-to-Device Copy hipMemcpyDefault = 4 ## < Runtime will automatically determine copy-kind based on virtual addresses.
- Source Edit
hipStream_t = pointer
- Source Edit
Consts
HippoMemcpyDefault = hipMemcpyDefault
- Source Edit
HippoMemcpyDeviceToDevice = hipMemcpyDeviceToDevice
- Source Edit
HippoMemcpyDeviceToHost = hipMemcpyDeviceToHost
- Source Edit
HippoMemcpyHostToDevice = hipMemcpyHostToDevice
- Source Edit
HippoMemcpyHostToHost = hipMemcpyHostToHost
- Source Edit
HippoRuntime {.strdefine.} = "HIP"
- Source Edit
Procs
proc `$`(self: ConstCString): string {....raises: [], tags: [], forbids: [].}
- Source Edit
proc `=destroy`(mem: var GpuMemory) {....raises: [], tags: [], forbids: [].}
- Automatically free device memory when the object goes out of scope Source Edit
proc handleError(err: hipError_t) {....raises: [Exception], tags: [], forbids: [].}
- Source Edit
proc hipDeviceSynchronize(): hipError_t {.header: "hip/hip_runtime.h", importcpp: "hipDeviceSynchronize(@)", ...raises: [], tags: [], forbids: [].}
- Source Edit
proc hipFree(ptr: pointer): hipError_t {.header: "hip/hip_runtime.h", importcpp: "hipFree(@)", ...raises: [], tags: [], forbids: [].}
- Source Edit
proc hipGetErrorString(err: hipError_t): ConstCString {. header: "hip/hip_runtime.h", importcpp: "hipGetErrorString(@)", ...raises: [], tags: [], forbids: [].}
- Source Edit
proc hipGetLastError(): hipError_t {.header: "hip/hip_runtime.h", importcpp: "hipGetLastError()", ...raises: [], tags: [], forbids: [].}
- Source Edit
proc hipLaunchKernel(function_address: pointer; numBlocks: Dim3; dimBlocks: Dim3; args: ptr pointer): hipError_t {. importcpp: "hipLaunchKernel(@)", header: "hip/hip_runtime.h", ...raises: [], tags: [], forbids: [].}
- Source Edit
proc hipLaunchKernel(function_address: pointer; numBlocks: Dim3; dimBlocks: Dim3; args: ptr pointer; sharedMemBytes: csize_t; stream: hipStream_t): cint {. importcpp: "hipLaunchKernel(@)", header: "hip/hip_runtime.h", ...raises: [], tags: [], forbids: [].}
- Source Edit
proc hipLaunchKernelGGL(function_address: proc; numBlocks: Dim3; dimBlocks: Dim3; sharedMemBytes: uint32_t; stream: hipStream_t) {. importcpp: "hipLaunchKernelGGL(@)", header: "hip/hip_runtime.h", varargs, ...raises: [], tags: [], forbids: [].}
- Source Edit
proc hipMalloc(ptr: ptr pointer; size: cint): hipError_t {. header: "hip/hip_runtime.h", importcpp: "hipMalloc(@)", ...raises: [], tags: [], forbids: [].}
- Source Edit
proc hipMemcpy(dst: pointer; src: pointer; size: cint; kind: hipMemcpyKind): hipError_t {. header: "hip/hip_runtime.h", importcpp: "hipMemcpy(@)", ...raises: [], tags: [], forbids: [].}
- Source Edit
proc hippoSyncthreads() {.importcpp: "__syncthreads()", header: "hip/hip_runtime.h", ...raises: [], tags: [], forbids: [].}
- Source Edit
proc hipSyncthreads() {.importcpp: "__syncthreads()", header: "hip/hip_runtime.h", ...raises: [], tags: [], forbids: [].}
- Source Edit
Converters
converter toConstCString(self: cstring): ConstCString {. importc: "(const char*)", noconv, nodecl, ...raises: [], tags: [], forbids: [].}
- Source Edit
converter toCString(self: ConstCString): cstring {.importc: "(char*)", noconv, nodecl, ...raises: [], tags: [], forbids: [].}
- Source Edit
Macros
macro hippoConstant(v: untyped): untyped
- Declared a variable as __constant__. Constants are read-only globals that are cached on-chip. constants are useful for data that is being read by all threads in a warp at the same time. if each thread in a warp accesses different addresses in constant memory, the accesses are serialized and this may cause a 16x slowdown. eg: const N {.hippoConstant.} = 1024 Source Edit
macro hippoDevice(fn: untyped): untyped
- Declare fuctions for use on the __device__ (the gpu), to be called by either device or global functions. Source Edit
macro hippoGlobal(fn: untyped): untyped
- Declare a function as __global__. global functions are called from the host and run on the device. Source Edit
macro hippoHostDevice(fn: untyped): untyped
- Declare a function as both __host__ and __device__. This is useful for functions that are usable from either the host and the device. eg: proc add(a: int, b: int) {.hippoHostDevice.} = a + b Source Edit
Templates
template hippoLaunchKernel(kernel: proc; ## The GPU kernel procedure to launch gridDim: Dim3 = newDim3(1, 1, 1); ## default to a grid of 1 block blockDim: Dim3 = newDim3(1, 1, 1); ## default to 1 thread per block sharedMemBytes: uint32 = 0; ## dynamic shared memory amount to allocate stream: HippoStream = nil; ## Which device stream to run under (defaults to null) args: untyped)
- Source Edit
template hippoMalloc(size: int): GpuRef
- Allocate memory on the GPU and return a GpuRef object. GpuMemory is a wrapper around Gpu allocated pointers. It will automatically free the memory when it goes out of scope. Source Edit
template hippoMemcpy(dst: GpuRef; src: GpuRef; size: int; kind: HippoMemcpyKind)
- device -> device memory copy Copy memory from src to dst. direction of device and host is determined by kind. Source Edit
template hippoMemcpy(dst: GpuRef; src: pointer; size: int; kind: HippoMemcpyKind)
- device -> host memory copy Copy memory from src to dst. direction of device and host is determined by kind. Source Edit
template hippoMemcpy(dst: pointer; src: GpuRef; size: int; kind: HippoMemcpyKind)
- host -> device memory copy Copy memory from src to dst. direction of device and host is determined by kind. Source Edit
template hippoMemcpy(dst: pointer; src: pointer; size: int; kind: HippoMemcpyKind)
- host -> host memory copy hippoMemcpy is broken out as 4 separate templates to make it easier to work with GpuRef objects Copy memory from src to dst. direction of device and host is determined by kind. Source Edit
template hippoSynchronize()
- Synchronize the device Source Edit