Tissue Forge C++ 0.2.1
Interactive, particle-based physics, chemistry and biology modeling and simulation environment
Loading...
Searching...
No Matches
tf_cuda.h
Go to the documentation of this file.
1/*******************************************************************************
2 * This file is part of Tissue Forge.
3 * Copyright (c) 2022-2024 T.J. Sego
4 *
5 * This program is free software: you can redistribute it and/or modify
6 * it under the terms of the GNU Lesser General Public License as published
7 * by the Free Software Foundation, either version 3 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU Lesser General Public License
16 * along with this program. If not, see <http://www.gnu.org/licenses/>.
17 *
18 ******************************************************************************/
19
25// TODO: implement support for JIT-compiled programs and kernel usage in wrapped languages
26
27#ifndef _SOURCE_TF_CUDA_H_
28#define _SOURCE_TF_CUDA_H_
29
30#include "tfError.h"
31
32#include <cuda.h>
33#include <cuda_runtime.h>
34#include <nvrtc.h>
35
36#include <vector>
37#include <stdexcept>
38#include <string>
39
40
41namespace TissueForge::cuda {
42
43
44 enum ErrorCode : int {
45 TFCUDAERR_ok = 0,
46 TFCUDAERR_setdevice,
47 TFCUDAERR_setblocks,
48 TFCUDAERR_setthreads,
49 TFCUDAERR_ondevice,
50 TFCUDAERR_notondevice,
51 TFCUDAERR_cleardevices,
52 TFCUDAERR_refresh,
53 TFCUDAERR_send,
54 TFCUDAERR_pull,
55 TFCUDAERR_LAST
56 };
57
58 /* list of error messages. */
59 static const char *tfcuda_err_msg[TFCUDAERR_LAST] = {
60 "No CUDA errors.",
61 "Failed to set device.",
62 "Failed to set blocks.",
63 "Failed to set threads.",
64 "Already on device.",
65 "Not on device.",
66 "Failed to clear devices.",
67 "Refresh failed.",
68 "Attempting send to device failed.",
69 "Attempting pull from device when not sent."
70 };
71
72 inline CUresult cuda_errorchk(CUresult retCode, const char *file, int line) {
73 if(retCode != CUDA_SUCCESS) {
74 std::string msg = "CUDA failed with error: ";
75 const char *cmsg;
76 cuGetErrorName(retCode, &cmsg);
77 msg += std::string(cmsg);
78 msg += ", " + std::string(file) + ", " + std::to_string(line);
79 tf_exp(std::runtime_error(msg.c_str()));
80 }
81 return retCode;
82 }
83 #ifndef TF_CUDA_CALL
84 #define TF_CUDA_CALL(res) cuda_errorchk(res, __FILE__, __LINE__)
85 #endif
86
87 inline nvrtcResult nvrtc_errorchk(nvrtcResult retCode, const char *file, int line) {
88 if(retCode != NVRTC_SUCCESS) {
89 std::string msg = "NVRTC failed with error: ";
90 msg += std::string(nvrtcGetErrorString(retCode));
91 msg += ", " + std::string(file) + ", " + std::to_string(line);
92 tf_exp(std::runtime_error(msg.c_str()));
93 }
94 return retCode;
95 }
96 #ifndef TF_NVRTC_CALL
97 #define TF_NVRTC_CALL(res) nvrtc_errorchk(res, __FILE__, __LINE__)
98 #endif
99
100 inline cudaError_t cudart_errorchk(cudaError_t retCode, const char *file, int line) {
101 if(retCode != cudaSuccess) {
102 std::string msg = "NVRTC failed with error: ";
103 msg += std::string(cudaGetErrorString(retCode));
104 msg += ", " + std::string(file) + ", " + std::to_string(line);
105 tf_exp(std::runtime_error(msg.c_str()));
106 }
107 return retCode;
108 }
109 #ifndef TF_CUDART_CALL
110 #define TF_CUDART_CALL(res) cudart_errorchk(res, __FILE__, __LINE__)
111 #endif
112
113
119 std::string source;
120 const char *name;
121
122 CUDARTSource(const char *filePath, const char *_name);
123 const char *c_str() const;
124 };
125
126
135
136 nvrtcProgram *prog;
137 char *ptx;
138 std::vector<std::string> opts;
139 std::vector<std::string> namedExprs;
140 std::vector<std::string> includePaths;
141 int arch;
142 bool is_compute;
143
146
152 void addOpt(const std::string &opt);
153
159 void addIncludePath(const std::string &ipath);
160
166 void addNamedExpr(const std::string &namedExpr);
167
177 void compile(const char *src, const char *name, int numHeaders=0, const char *const *headers=0, const char *const *includeNames=0);
178
185 std::string loweredName(const std::string namedExpr);
186
187 };
188
189 struct CUDAContext;
190
191
197 const std::string name;
198 unsigned int gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes;
199 CUstream hStream;
200 void **extra;
201
202 CUDAFunction(const std::string &name, CUDAContext *context);
204
205 HRESULT autoConfig(const unsigned int &_nr_arrayElems,
206 size_t dynamicSMemSize=0,
207 size_t (*blockSizeToDynamicSMemSize)(int)=0,
208 int blockSizeLimit=0);
209
210 void operator()(void **args);
211 void operator()(int nargs, ...);
212
213 private:
214 CUfunction *function;
215 CUDAContext *context;
216 };
217
218
223 struct CUDAContext {
224
225 CUcontext *context;
226 CUdevice device;
227 CUmodule *module;
228 std::vector<CUjit_option> compileOpts;
229 std::vector<void*> compileOptVals;
230
231 /* Flag signifying whether this context is attached to a CPU thread. */
232 bool attached;
233
234 CUDAContext(CUdevice device=0);
235 ~CUDAContext();
236
237 void addOpt(CUjit_option opt, void *val);
238
244 void loadProgram(const CUDARTProgram &prog);
245
251 void loadPTX(const char *ptx);
252
259 CUDAFunction *getFunction(const char *name);
260
267 CUdeviceptr *getGlobal(const char *name);
268
275 size_t getGlobalSize(const char *name);
276
284
292 CUcontext *popCurrent();
293
298 void destroy();
299
306
311 static void sync();
312 };
313
314
319 struct CUDADevice {
320
321 CUdevice *device;
322
323 CUDADevice();
324 ~CUDADevice();
325
331 void attachDevice(const int &deviceId=0);
332
338
344 std::string name();
345
351 int arch();
352
358 size_t totalMem();
359
366 int getAttribute(const int &attrib);
367
373 std::string PCIBusId();
374
383
390
397 static std::string getDeviceName(const int &deviceId);
398
405 static size_t getDeviceTotalMem(const int &deviceId);
406
414 static int getDeviceAttribute(const int &deviceId, const int &attrib);
415
421 static int getNumDevices();
422
429 static std::string getDevicePCIBusId(const int &deviceId);
430
436 static int getCurrentDevice();
437
444
451
458
465
472
479
486
493
500
506 int warpSize();
507
514
521
529
536
544
552
560
568
575
582
589
596
603
610
617
624
632
640
647
654
662
670
677
678 private:
679
680 void validateAttached();
681 static void validateDeviceId(const int &deviceId);
682 };
683
684
685 // Tissue Forge CUDA interface
686
687
692 CPPAPI_FUNC(void) init();
693
694 CPPAPI_FUNC(void) setGLDevice(const int &deviceId);
695
702 CPPAPI_FUNC(std::string) getDeviceName(const int &deviceId);
703
710 CPPAPI_FUNC(size_t) getDeviceTotalMem(const int &deviceId);
711
719 CPPAPI_FUNC(int) getDeviceAttribute(const int &deviceId, const int &attrib);
720
726 CPPAPI_FUNC(int) getNumDevices();
727
734 CPPAPI_FUNC(std::string) getDevicePCIBusId(const int &deviceId);
735
741 CPPAPI_FUNC(int) getCurrentDevice();
742
748 CPPAPI_FUNC(int) maxThreadsPerBlock(const int &deviceId);
749
755 CPPAPI_FUNC(int) maxBlockDimX(const int &deviceId);
756
762 CPPAPI_FUNC(int) maxBlockDimY(const int &deviceId);
763
769 CPPAPI_FUNC(int) maxBlockDimZ(const int &deviceId);
770
776 CPPAPI_FUNC(int) maxGridDimX(const int &deviceId);
777
783 CPPAPI_FUNC(int) maxGridDimY(const int &deviceId);
784
790 CPPAPI_FUNC(int) maxGridDimZ(const int &deviceId);
791
797 CPPAPI_FUNC(int) maxSharedMemPerBlock(const int &deviceId);
798
804 CPPAPI_FUNC(int) maxTotalMemConst(const int &deviceId);
805
811 CPPAPI_FUNC(int) warpSize(const int &deviceId);
812
818 CPPAPI_FUNC(int) maxRegsPerBlock(const int &deviceId);
819
825 CPPAPI_FUNC(int) clockRate(const int &deviceId);
826
833 CPPAPI_FUNC(bool) gpuOverlap(const int &deviceId);
834
840 CPPAPI_FUNC(int) numMultiprocessors(const int &deviceId);
841
848 CPPAPI_FUNC(bool) kernelExecTimeout(const int &deviceId);
849
856 CPPAPI_FUNC(bool) computeModeDefault(const int &deviceId);
857
864 CPPAPI_FUNC(bool) computeModeProhibited(const int &deviceId);
865
872 CPPAPI_FUNC(bool) computeModeExclusive(const int &deviceId);
873
879 CPPAPI_FUNC(int) PCIDeviceId(const int &deviceId);
880
886 CPPAPI_FUNC(int) PCIDomainId(const int &deviceId);
887
893 CPPAPI_FUNC(int) clockRateMem(const int &deviceId);
894
900 CPPAPI_FUNC(int) globalMemBusWidth(const int &deviceId);
901
907 CPPAPI_FUNC(int) L2CacheSize(const int &deviceId);
908
914 CPPAPI_FUNC(int) maxThreadsPerMultiprocessor(const int &deviceId);
915
921 CPPAPI_FUNC(int) computeCapabilityMajor(const int &deviceId);
922
928 CPPAPI_FUNC(int) computeCapabilityMinor(const int &deviceId);
929
936 CPPAPI_FUNC(bool) L1CacheSupportGlobal(const int &deviceId);
937
944 CPPAPI_FUNC(bool) L1CacheSupportLocal(const int &deviceId);
945
951 CPPAPI_FUNC(int) maxSharedMemPerMultiprocessor(const int &deviceId);
952
958 CPPAPI_FUNC(int) maxRegsPerMultiprocessor(const int &deviceId);
959
966 CPPAPI_FUNC(bool) managedMem(const int &deviceId);
967
974 CPPAPI_FUNC(bool) multiGPUBoard(const int &deviceId);
975
981 CPPAPI_FUNC(int) multiGPUBoardGroupId(const int &deviceId);
982
993 CPPAPI_FUNC(void) test(const int &numBlocks, const int &numThreads, const int &numEls, const int &deviceId=0);
994
995
1001 CPPAPI_FUNC(std::string) tfIncludePath();
1002
1009 CPPAPI_FUNC(HRESULT) setTfIncludePath(const std::string &_path);
1010
1016 CPPAPI_FUNC(std::string) tfPrivateIncludePath();
1017
1023 CPPAPI_FUNC(std::string) tfResourcePath();
1024
1031 CPPAPI_FUNC(HRESULT) setTfResourcePath(const std::string &_path);
1032
1038 CPPAPI_FUNC(std::string) CUDAPath();
1039
1045 CPPAPI_FUNC(std::string) CUDAIncludePath();
1046
1053 CPPAPI_FUNC(HRESULT) setCUDAIncludePath(const std::string &_path);
1054
1061 CPPAPI_FUNC(std::string) CUDAResourcePath(const std::string &relativePath);
1062
1070 CPPAPI_FUNC(std::string) CUDAPTXObjectRelPath();
1071
1077 CPPAPI_FUNC(std::vector<std::string>) CUDAArchs();
1078
1079};
1080
1081#endif // _SOURCE_TF_CUDA_H_
Tissue Forge GPU acceleration on CUDA-supporting devices.
Definition tfAngleConfig.h:26
bool gpuOverlap(const int &deviceId)
Test if the device can concurrently copy memory between host and device while executing a kernel.
int maxBlockDimZ(const int &deviceId)
Maximum z-dimension of a block.
void test(const int &numBlocks, const int &numThreads, const int &numEls, const int &deviceId=0)
Tests JIT-compiled program execution and deployment.
int clockRateMem(const int &deviceId)
Peak memory clock frequency in kilohertz.
int maxGridDimX(const int &deviceId)
Maximum x-dimension of a grid.
int maxSharedMemPerMultiprocessor(const int &deviceId)
Maximum amount of shared memory available to a multiprocessor in bytes.
int getDeviceAttribute(const int &deviceId, const int &attrib)
Get the attribute value of a device.
bool L1CacheSupportGlobal(const int &deviceId)
Test if device supports caching globals in L1 cache.
bool L1CacheSupportLocal(const int &deviceId)
Test if device supports caching locals in L1 cache.
std::string getDevicePCIBusId(const int &deviceId)
Get the PCI bus id of a device.
int maxGridDimY(const int &deviceId)
Maximum y-dimension of a grid.
std::string CUDAPTXObjectRelPath()
Returns the relative path to the installed Tissue Forge CUDA PTX object directory.
HRESULT setCUDAIncludePath(const std::string &_path)
Set the current path to the installed CUDA include directory.
bool computeModeDefault(const int &deviceId)
Test if device is not restricted and can have multiple CUDA contexts present at a single time.
HRESULT setTfResourcePath(const std::string &_path)
Set the current path to the installed Tissue Forge resource directory.
int maxThreadsPerBlock(const int &deviceId)
Maximum number of threads per block.
int maxThreadsPerMultiprocessor(const int &deviceId)
Maximum resident threads per multiprocessor.
int maxRegsPerBlock(const int &deviceId)
Maximum number of 32-bit registers available to a thread block.
int computeCapabilityMajor(const int &deviceId)
Major compute capability version number.
int clockRate(const int &deviceId)
The typical clock frequency in kilohertz.
std::string tfPrivateIncludePath()
Returns the path to the installed Tissue Forge private include directory.
int computeCapabilityMinor(const int &deviceId)
Minor compute capability version number.
std::string CUDAPath()
Returns the path to the installed Tissue Forge CUDA resources directory.
std::string tfResourcePath()
Returns the current path to the installed Tissue Forge resource directory.
int getCurrentDevice()
Get the device id of the current context of the calling CPU thread.
std::string tfIncludePath()
Returns the current path to the installed Tissue Forge include directory.
bool multiGPUBoard(const int &deviceId)
Test if device is on a multi-GPU board.
int maxBlockDimX(const int &deviceId)
Maximum x-dimension of a block.
int L2CacheSize(const int &deviceId)
Size of L2 cache in bytes. 0 if the device doesn't have L2 cache.
bool computeModeProhibited(const int &deviceId)
Test if device is prohibited from creating new CUDA contexts.
int numMultiprocessors(const int &deviceId)
Number of multiprocessors on the device.
int PCIDeviceId(const int &deviceId)
PCI device (also known as slot) identifier of the device.
int getNumDevices()
Get number of available compute-capable devices.
bool kernelExecTimeout(const int &deviceId)
Test if there is a run time limit for kernels executed on the device.
int maxBlockDimY(const int &deviceId)
Maximum y-dimension of a block.
int multiGPUBoardGroupId(const int &deviceId)
Unique identifier for a group of devices associated with the same board.
std::string getDeviceName(const int &deviceId)
Get the name of a device.
int PCIDomainId(const int &deviceId)
PCI domain identifier of the device.
HRESULT setTfIncludePath(const std::string &_path)
Set the current path to the installed Tissue Forge include directory.
bool managedMem(const int &deviceId)
Test if device supports allocating managed memory on this system.
int warpSize(const int &deviceId)
Warp size in threads.
int maxGridDimZ(const int &deviceId)
Maximum z-dimension of a grid.
std::vector< std::string > CUDAArchs()
Returns the supported CUDA architectures of the installation.
std::string CUDAResourcePath(const std::string &relativePath)
Returns an absolute path to a subdirectory of the installed Tissue Forge CUDA resources directory.
size_t getDeviceTotalMem(const int &deviceId)
Get the total memory of device.
std::string CUDAIncludePath()
Returns the current path to the installed CUDA include directory.
void init()
Initialize CUDA.
bool computeModeExclusive(const int &deviceId)
Test if device can have only one context used by a single process at a time.
int maxSharedMemPerBlock(const int &deviceId)
Maximum amount of shared memory available to a thread block in bytes.
int globalMemBusWidth(const int &deviceId)
Global memory bus width in bits.
int maxTotalMemConst(const int &deviceId)
Memory available on device for constant variables in a CUDA C kernel in bytes.
int maxRegsPerMultiprocessor(const int &deviceId)
Maximum number of 32-bit registers available to a multiprocessor.
A convenience wrap of the CUDA context for JIT-compiled Tissue Forge programs.
Definition tf_cuda.h:223
CUDAFunction * getFunction(const char *name)
Get a cuda function from a loaded module.
CUdeviceptr * getGlobal(const char *name)
Get a global pointer from a loaded module.
void loadPTX(const char *ptx)
Load pre-compiled ptx.
CUcontext * popCurrent()
Pop the context from the stack of current contexts of the CPU thread and return the new current context.
static void sync()
Synchronize GPU with calling CPU thread. Blocks until all preceding tasks of the current context are ...
void loadProgram(const CUDARTProgram &prog)
Load a compiled program.
void pushCurrent()
Push the context onto the stack of current contexts of the CPU thread.
int getAPIVersion()
Get the API version of this context.
size_t getGlobalSize(const char *name)
Get the size of a global pointer from a loaded module.
void destroy()
Destroy the context.
A simple interface with a CUDA device.
Definition tf_cuda.h:319
std::string name()
Get the name of attached device.
int computeCapabilityMajor()
Major compute capability version number.
int maxBlockDimZ()
Maximum z-dimension of a block.
int computeCapabilityMinor()
Minor compute capability version number.
int maxSharedMemPerMultiprocessor()
Maximum amount of shared memory available to a multiprocessor in bytes.
static std::string getDeviceName(const int &deviceId)
Get the name of a device.
int maxTotalMemConst()
Memory available on device for constant variables in a CUDA C kernel in bytes.
int maxRegsPerBlock()
Maximum number of 32-bit registers available to a thread block.
int clockRateMem()
Peak memory clock frequency in kilohertz.
void detachDevice()
Detach currently attached device.
int maxGridDimY()
Maximum y-dimension of a grid.
int maxThreadsPerMultiprocessor()
Maximum resident threads per multiprocessor.
bool L1CacheSupportGlobal()
Test if device supports caching globals in L1 cache.
int multiGPUBoardGroupId()
Unique identifier for a group of devices associated with the same board.
int maxSharedMemPerBlock()
Maximum amount of shared memory available to a thread block in bytes.
int maxBlockDimY()
Maximum y-dimension of a block.
int maxGridDimZ()
Maximum z-dimension of a grid.
int clockRate()
The typical clock frequency in kilohertz.
bool multiGPUBoard()
Test if device is on a multi-GPU board.
int maxThreadsPerBlock()
Maximum number of threads per block.
int L2CacheSize()
Size of L2 cache in bytes. 0 if the device doesn't have L2 cache.
int warpSize()
Warp size in threads.
int maxRegsPerMultiprocessor()
Maximum number of 32-bit registers available to a multiprocessor.
bool kernelExecTimeout()
Test if there is a run time limit for kernels executed on the device.
int maxBlockDimX()
Maximum x-dimension of a block.
static int getNumDevices()
Get number of available compute-capable devices.
void attachDevice(const int &deviceId=0)
Attach a CUDA-supporting device by id.
CUDAContext * createContext()
Create a context on this device.
bool gpuOverlap()
Test if the device can concurrently copy memory between host and device while executing a kernel.
int arch()
Get architecture of attached device.
std::string PCIBusId()
Get the PCI bus id of this device.
static int getCurrentDevice()
Get the device id of the current context of the calling CPU thread.
bool computeModeProhibited()
Test if device is prohibited from creating new CUDA contexts.
int PCIDomainId()
PCI domain identifier of the device.
int numMultiprocessors()
Number of multiprocessors on the device.
int PCIDeviceId()
PCI device (also known as slot) identifier of the device.
static std::string getDevicePCIBusId(const int &deviceId)
Get the PCI bus id of a device.
int globalMemBusWidth()
Global memory bus width in bits.
static size_t getDeviceTotalMem(const int &deviceId)
Get the total memory of device.
size_t totalMem()
Get the total memory of attached device.
static int getDeviceAttribute(const int &deviceId, const int &attrib)
Get the attribute value of a device.
bool L1CacheSupportLocal()
Test if device supports caching locals in L1 cache.
int maxGridDimX()
Maximum x-dimension of a grid.
CUDAContext * currentContext()
Get the current context. If none exists, one is created.
bool managedMem()
Test if device supports allocating managed memory on this system.
bool computeModeDefault()
Test if device is not restricted and can have multiple CUDA contexts present at a single time.
int getAttribute(const int &attrib)
Get the attribute value of attached device.
bool computeModeExclusive()
Test if device can have only one context used by a single process at a time.
A CUDA kernel from a JIT-compiled Tissue Forge program.
Definition tf_cuda.h:196
A JIT-compiled CUDA Tissue Forge program.
Definition tf_cuda.h:134
void addNamedExpr(const std::string &namedExpr)
Add a named expression.
std::string loweredName(const std::string namedExpr)
Get the lowered name of a named expression. Cannot be called until after compilation.
void addIncludePath(const std::string &ipath)
Add a directory to include in the search path.
void compile(const char *src, const char *name, int numHeaders=0, const char *const *headers=0, const char *const *includeNames=0)
Compile the program.
void addOpt(const std::string &opt)
Add a compilation option.
Convenience class for loading source from file and storing, here intended for CUDA.
Definition tf_cuda.h:118
int32_t HRESULT
Definition tf_port.h:255