27#ifndef _SOURCE_TF_CUDA_H_
28#define _SOURCE_TF_CUDA_H_
33#include <cuda_runtime.h>
44 enum ErrorCode :
int {
50 TFCUDAERR_notondevice,
51 TFCUDAERR_cleardevices,
59 static const char *tfcuda_err_msg[TFCUDAERR_LAST] = {
61 "Failed to set device.",
62 "Failed to set blocks.",
63 "Failed to set threads.",
66 "Failed to clear devices.",
68 "Attempting send to device failed.",
69 "Attempting pull from device when not sent."
72 inline CUresult cuda_errorchk(CUresult retCode,
const char *file,
int line) {
73 if(retCode != CUDA_SUCCESS) {
74 std::string msg =
"CUDA failed with error: ";
76 cuGetErrorName(retCode, &cmsg);
77 msg += std::string(cmsg);
78 msg +=
", " + std::string(file) +
", " + std::to_string(line);
79 tf_exp(std::runtime_error(msg.c_str()));
84 #define TF_CUDA_CALL(res) cuda_errorchk(res, __FILE__, __LINE__)
87 inline nvrtcResult nvrtc_errorchk(nvrtcResult retCode,
const char *file,
int line) {
88 if(retCode != NVRTC_SUCCESS) {
89 std::string msg =
"NVRTC failed with error: ";
90 msg += std::string(nvrtcGetErrorString(retCode));
91 msg +=
", " + std::string(file) +
", " + std::to_string(line);
92 tf_exp(std::runtime_error(msg.c_str()));
97 #define TF_NVRTC_CALL(res) nvrtc_errorchk(res, __FILE__, __LINE__)
100 inline cudaError_t cudart_errorchk(cudaError_t retCode,
const char *file,
int line) {
101 if(retCode != cudaSuccess) {
102 std::string msg =
"NVRTC failed with error: ";
103 msg += std::string(cudaGetErrorString(retCode));
104 msg +=
", " + std::string(file) +
", " + std::to_string(line);
105 tf_exp(std::runtime_error(msg.c_str()));
109 #ifndef TF_CUDART_CALL
110 #define TF_CUDART_CALL(res) cudart_errorchk(res, __FILE__, __LINE__)
123 const char *c_str()
const;
138 std::vector<std::string> opts;
139 std::vector<std::string> namedExprs;
140 std::vector<std::string> includePaths;
177 void compile(
const char *src,
const char *name,
int numHeaders=0,
const char *
const *headers=0,
const char *
const *includeNames=0);
197 const std::string name;
198 unsigned int gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes;
205 HRESULT autoConfig(
const unsigned int &_nr_arrayElems,
206 size_t dynamicSMemSize=0,
207 size_t (*blockSizeToDynamicSMemSize)(
int)=0,
208 int blockSizeLimit=0);
210 void operator()(
void **args);
211 void operator()(
int nargs, ...);
214 CUfunction *function;
228 std::vector<CUjit_option> compileOpts;
229 std::vector<void*> compileOptVals;
237 void addOpt(CUjit_option opt,
void *val);
680 void validateAttached();
681 static void validateDeviceId(
const int &deviceId);
694 CPPAPI_FUNC(
void) setGLDevice(
const int &deviceId);
993 CPPAPI_FUNC(
void)
test(
const int &numBlocks,
const int &numThreads,
const int &numEls,
const int &deviceId=0);
Tissue Forge GPU acceleration on CUDA-supporting devices.
Definition tfAngleConfig.h:26
bool gpuOverlap(const int &deviceId)
Test if the device can concurrently copy memory between host and device while executing a kernel.
int maxBlockDimZ(const int &deviceId)
Maximum z-dimension of a block.
void test(const int &numBlocks, const int &numThreads, const int &numEls, const int &deviceId=0)
Tests JIT-compiled program execution and deployment.
int clockRateMem(const int &deviceId)
Peak memory clock frequency in kilohertz.
int maxGridDimX(const int &deviceId)
Maximum x-dimension of a grid.
int maxSharedMemPerMultiprocessor(const int &deviceId)
Maximum amount of shared memory available to a multiprocessor in bytes.
int getDeviceAttribute(const int &deviceId, const int &attrib)
Get the attribute value of a device.
bool L1CacheSupportGlobal(const int &deviceId)
Test if device supports caching globals in L1 cache.
bool L1CacheSupportLocal(const int &deviceId)
Test if device supports caching locals in L1 cache.
std::string getDevicePCIBusId(const int &deviceId)
Get the PCI bus id of a device.
int maxGridDimY(const int &deviceId)
Maximum y-dimension of a grid.
std::string CUDAPTXObjectRelPath()
Returns the relative path to the installed Tissue Forge CUDA PTX object directory.
HRESULT setCUDAIncludePath(const std::string &_path)
Set the current path to the installed CUDA include directory.
bool computeModeDefault(const int &deviceId)
Test if device is not restricted and can have multiple CUDA contexts present at a single time.
HRESULT setTfResourcePath(const std::string &_path)
Set the current path to the installed Tissue Forge resource directory.
int maxThreadsPerBlock(const int &deviceId)
Maximum number of threads per block.
int maxThreadsPerMultiprocessor(const int &deviceId)
Maximum resident threads per multiprocessor.
int maxRegsPerBlock(const int &deviceId)
Maximum number of 32-bit registers available to a thread block.
int computeCapabilityMajor(const int &deviceId)
Major compute capability version number.
int clockRate(const int &deviceId)
The typical clock frequency in kilohertz.
std::string tfPrivateIncludePath()
Returns the path to the installed Tissue Forge private include directory.
int computeCapabilityMinor(const int &deviceId)
Minor compute capability version number.
std::string CUDAPath()
Returns the path to the installed Tissue Forge CUDA resources directory.
std::string tfResourcePath()
Returns the current path to the installed Tissue Forge resource directory.
int getCurrentDevice()
Get the device id of the current context of the calling CPU thread.
std::string tfIncludePath()
Returns the current path to the installed Tissue Forge include directory.
bool multiGPUBoard(const int &deviceId)
Test if device is on a multi-GPU board.
int maxBlockDimX(const int &deviceId)
Maximum x-dimension of a block.
int L2CacheSize(const int &deviceId)
Size of L2 cache in bytes. 0 if the device doesn't have L2 cache.
bool computeModeProhibited(const int &deviceId)
Test if device is prohibited from creating new CUDA contexts.
int numMultiprocessors(const int &deviceId)
Number of multiprocessors on the device.
int PCIDeviceId(const int &deviceId)
PCI device (also known as slot) identifier of the device.
int getNumDevices()
Get number of available compute-capable devices.
bool kernelExecTimeout(const int &deviceId)
Test if there is a run time limit for kernels executed on the device.
int maxBlockDimY(const int &deviceId)
Maximum y-dimension of a block.
int multiGPUBoardGroupId(const int &deviceId)
Unique identifier for a group of devices associated with the same board.
std::string getDeviceName(const int &deviceId)
Get the name of a device.
int PCIDomainId(const int &deviceId)
PCI domain identifier of the device.
HRESULT setTfIncludePath(const std::string &_path)
Set the current path to the installed Tissue Forge include directory.
bool managedMem(const int &deviceId)
Test if device supports allocating managed memory on this system.
int warpSize(const int &deviceId)
Warp size in threads.
int maxGridDimZ(const int &deviceId)
Maximum z-dimension of a grid.
std::vector< std::string > CUDAArchs()
Returns the supported CUDA architectures of the installation.
std::string CUDAResourcePath(const std::string &relativePath)
Returns an absolute path to a subdirectory of the installed Tissue Forge CUDA resources directory.
size_t getDeviceTotalMem(const int &deviceId)
Get the total memory of a device.
std::string CUDAIncludePath()
Returns the current path to the installed CUDA include directory.
void init()
Initialize CUDA.
bool computeModeExclusive(const int &deviceId)
Test if device can have only one context used by a single process at a time.
int maxSharedMemPerBlock(const int &deviceId)
Maximum amount of shared memory available to a thread block in bytes.
int globalMemBusWidth(const int &deviceId)
Global memory bus width in bits.
int maxTotalMemConst(const int &deviceId)
Memory available on device for constant variables in a CUDA C kernel in bytes.
int maxRegsPerMultiprocessor(const int &deviceId)
Maximum number of 32-bit registers available to a multiprocessor.
A convenience wrap of the CUDA context for JIT-compiled Tissue Forge programs.
Definition tf_cuda.h:223
CUDAFunction * getFunction(const char *name)
Get a cuda function from a loaded module.
CUdeviceptr * getGlobal(const char *name)
Get a global pointer from a loaded module.
void loadPTX(const char *ptx)
Load pre-compiled ptx.
CUcontext * popCurrent()
Pop the context from the stack of current contexts of the CPU thread and return the new current context.
static void sync()
Synchronize GPU with calling CPU thread. Blocks until all preceding tasks of the current context are ...
void loadProgram(const CUDARTProgram &prog)
Load a compiled program.
void pushCurrent()
Push the context onto the stack of current contexts of the CPU thread.
int getAPIVersion()
Get the API version of this context.
size_t getGlobalSize(const char *name)
Get the size of a global pointer from a loaded module.
void destroy()
Destroy the context.
A simple interface with a CUDA device.
Definition tf_cuda.h:319
std::string name()
Get the name of the attached device.
int computeCapabilityMajor()
Major compute capability version number.
int maxBlockDimZ()
Maximum z-dimension of a block.
int computeCapabilityMinor()
Minor compute capability version number.
int maxSharedMemPerMultiprocessor()
Maximum amount of shared memory available to a multiprocessor in bytes.
static std::string getDeviceName(const int &deviceId)
Get the name of a device.
int maxTotalMemConst()
Memory available on device for constant variables in a CUDA C kernel in bytes.
int maxRegsPerBlock()
Maximum number of 32-bit registers available to a thread block.
int clockRateMem()
Peak memory clock frequency in kilohertz.
void detachDevice()
Detach currently attached device.
int maxGridDimY()
Maximum y-dimension of a grid.
int maxThreadsPerMultiprocessor()
Maximum resident threads per multiprocessor.
bool L1CacheSupportGlobal()
Test if device supports caching globals in L1 cache.
int multiGPUBoardGroupId()
Unique identifier for a group of devices associated with the same board.
int maxSharedMemPerBlock()
Maximum amount of shared memory available to a thread block in bytes.
int maxBlockDimY()
Maximum y-dimension of a block.
int maxGridDimZ()
Maximum z-dimension of a grid.
int clockRate()
The typical clock frequency in kilohertz.
bool multiGPUBoard()
Test if device is on a multi-GPU board.
int maxThreadsPerBlock()
Maximum number of threads per block.
int L2CacheSize()
Size of L2 cache in bytes. 0 if the device doesn't have L2 cache.
int warpSize()
Warp size in threads.
int maxRegsPerMultiprocessor()
Maximum number of 32-bit registers available to a multiprocessor.
bool kernelExecTimeout()
Test if there is a run time limit for kernels executed on the device.
int maxBlockDimX()
Maximum x-dimension of a block.
static int getNumDevices()
Get number of available compute-capable devices.
void attachDevice(const int &deviceId=0)
Attach a CUDA-supporting device by id.
CUDAContext * createContext()
Create a context on this device.
bool gpuOverlap()
Test if the device can concurrently copy memory between host and device while executing a kernel.
int arch()
Get the architecture of the attached device.
std::string PCIBusId()
Get the PCI bus id of this device.
static int getCurrentDevice()
Get the device id of the current context of the calling CPU thread.
bool computeModeProhibited()
Test if device is prohibited from creating new CUDA contexts.
int PCIDomainId()
PCI domain identifier of the device.
int numMultiprocessors()
Number of multiprocessors on the device.
int PCIDeviceId()
PCI device (also known as slot) identifier of the device.
static std::string getDevicePCIBusId(const int &deviceId)
Get the PCI bus id of a device.
int globalMemBusWidth()
Global memory bus width in bits.
static size_t getDeviceTotalMem(const int &deviceId)
Get the total memory of a device.
size_t totalMem()
Get the total memory of the attached device.
static int getDeviceAttribute(const int &deviceId, const int &attrib)
Get the attribute value of a device.
bool L1CacheSupportLocal()
Test if device supports caching locals in L1 cache.
int maxGridDimX()
Maximum x-dimension of a grid.
CUDAContext * currentContext()
Get the current context. If none exists, one is created.
bool managedMem()
Test if device supports allocating managed memory on this system.
bool computeModeDefault()
Test if device is not restricted and can have multiple CUDA contexts present at a single time.
int getAttribute(const int &attrib)
Get the attribute value of the attached device.
bool computeModeExclusive()
Test if device can have only one context used by a single process at a time.
A CUDA kernel from a JIT-compiled Tissue Forge program.
Definition tf_cuda.h:196
A JIT-compiled CUDA Tissue Forge program.
Definition tf_cuda.h:134
void addNamedExpr(const std::string &namedExpr)
Add a named expression.
std::string loweredName(const std::string namedExpr)
Get the lowered name of a named expression. Cannot be called until after compilation.
void addIncludePath(const std::string &ipath)
Add a directory to include in the search path.
void compile(const char *src, const char *name, int numHeaders=0, const char *const *headers=0, const char *const *includeNames=0)
Compile the program.
void addOpt(const std::string &opt)
Add a compilation option.
Convenience class for loading source code from a file and storing it; intended here for CUDA sources.
Definition tf_cuda.h:118
int32_t HRESULT
Definition tf_port.h:255