db/d2e/tf__cuda_8h_source.html

/*******************************************************************************

 * This file is part of Tissue Forge.

 * Copyright (c) 2022-2024 T.J. Sego

 *

 * This program is free software: you can redistribute it and/or modify

 * it under the terms of the GNU Lesser General Public License as published

 * by the Free Software Foundation, either version 3 of the License, or

 * (at your option) any later version.

 *

 * This program is distributed in the hope that it will be useful,

 * but WITHOUT ANY WARRANTY; without even the implied warranty of

 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the

 * GNU General Public License for more details.

 *

 * You should have received a copy of the GNU Lesser General Public License

 * along with this program.  If not, see <http://www.gnu.org/licenses/>.

 *

 ******************************************************************************/


// TODO: implement support for JIT-compiled programs and kernel usage in wrapped languages


#ifndef _SOURCE_TF_CUDA_H_

#define _SOURCE_TF_CUDA_H_


#include "tfError.h"


#include <cuda.h>

#include <cuda_runtime.h>

#include <nvrtc.h>


#include <vector>

#include <stdexcept>

#include <string>


namespace TissueForge::cuda {


    /**
     * @brief Error codes of the Tissue Forge CUDA interface.
     *
     * Each code below TFCUDAERR_LAST indexes the matching entry of tfcuda_err_msg.
     */
    enum ErrorCode : int {
        TFCUDAERR_ok           = 0,   /* No CUDA errors. */
        TFCUDAERR_setdevice    = 1,   /* Failed to set device. */
        TFCUDAERR_setblocks    = 2,   /* Failed to set blocks. */
        TFCUDAERR_setthreads   = 3,   /* Failed to set threads. */
        TFCUDAERR_ondevice     = 4,   /* Already on device. */
        TFCUDAERR_notondevice  = 5,   /* Not on device. */
        TFCUDAERR_cleardevices = 6,   /* Failed to clear devices. */
        TFCUDAERR_refresh      = 7,   /* Refresh failed. */
        TFCUDAERR_send         = 8,   /* Attempting send to device failed. */
        TFCUDAERR_pull         = 9,   /* Attempting pull from device when not sent. */
        TFCUDAERR_LAST         = 10   /* Number of error codes; not an error itself. */
    };


    /*
     * List of error messages, indexed by ErrorCode.
     *
     * The array holds TFCUDAERR_LAST entries; entry i is the message for the
     * error code with value i, so the order here must follow the enum.
     * The array is declared static in this header, so each translation unit
     * that includes it has its own copy.
     */
    static const char *tfcuda_err_msg[TFCUDAERR_LAST] = {
        /* TFCUDAERR_ok */
        "No CUDA errors.",
        /* TFCUDAERR_setdevice */
        "Failed to set device.",
        /* TFCUDAERR_setblocks */
        "Failed to set blocks.",
        /* TFCUDAERR_setthreads */
        "Failed to set threads.",
        /* TFCUDAERR_ondevice */
        "Already on device.",
        /* TFCUDAERR_notondevice */
        "Not on device.",
        /* TFCUDAERR_cleardevices */
        "Failed to clear devices.",
        /* TFCUDAERR_refresh */
        "Refresh failed.",
        /* TFCUDAERR_send */
        "Attempting send to device failed.",
        /* TFCUDAERR_pull */
        "Attempting pull from device when not sent."
    };


    /**
     * @brief Check the result code of a CUDA driver API call.
     *
     * When the code is not CUDA_SUCCESS, a message is assembled from the
     * error name, the source file and the line, and a std::runtime_error
     * carrying it is handed to tf_exp.
     *
     * @param retCode result code returned by the driver API call
     * @param file source file of the call site
     * @param line line number of the call site
     * @return retCode, unchanged
     */
    inline CUresult cuda_errorchk(CUresult retCode, const char *file, int line) {
        if(retCode != CUDA_SUCCESS) {
            std::string msg = "CUDA failed with error: ";

            // cuGetErrorName fails for codes it does not recognize and then
            // leaves the name pointer NULL; constructing a std::string from
            // that pointer would be undefined behavior, so fall back to the
            // numeric value of the code.
            const char *cmsg = nullptr;
            if(cuGetErrorName(retCode, &cmsg) == CUDA_SUCCESS && cmsg != nullptr) {
                msg += cmsg;
            }
            else {
                msg += "unrecognized error code " + std::to_string(static_cast<int>(retCode));
            }

            msg += ", " + std::string(file) + ", " + std::to_string(line);
            tf_exp(std::runtime_error(msg.c_str()));
        }
        return retCode;
    }

    // Wraps a CUDA driver API call and checks its result, recording the call site.
    #ifndef TF_CUDA_CALL
        #define TF_CUDA_CALL(res) cuda_errorchk(res, __FILE__, __LINE__)
    #endif


    /**
     * @brief Check the result code of an NVRTC call.
     *
     * A code other than NVRTC_SUCCESS produces a message made of the NVRTC
     * error string, the source file and the line; a std::runtime_error
     * carrying that message is handed to tf_exp.
     *
     * @param retCode result code returned by the NVRTC call
     * @param file source file of the call site
     * @param line line number of the call site
     * @return retCode, unchanged
     */
    inline nvrtcResult nvrtc_errorchk(nvrtcResult retCode, const char *file, int line) {
        // Nothing to report on success.
        if(retCode == NVRTC_SUCCESS)
            return retCode;

        std::string report("NVRTC failed with error: ");
        report.append(nvrtcGetErrorString(retCode));
        report.append(", ").append(std::string(file));
        report.append(", ").append(std::to_string(line));
        tf_exp(std::runtime_error(report.c_str()));

        return retCode;
    }

    // Wraps an NVRTC call and checks its result, recording the call site.
    #ifndef TF_NVRTC_CALL
        #define TF_NVRTC_CALL(res) nvrtc_errorchk(res, __FILE__, __LINE__)
    #endif


    /**
     * @brief Check the result code of a CUDA runtime API call.
     *
     * When the code is not cudaSuccess, a message is assembled from the
     * runtime error string, the source file and the line, and a
     * std::runtime_error carrying it is handed to tf_exp.
     *
     * @param retCode result code returned by the runtime API call
     * @param file source file of the call site
     * @param line line number of the call site
     * @return retCode, unchanged
     */
    inline cudaError_t cudart_errorchk(cudaError_t retCode, const char *file, int line) {
        if(retCode != cudaSuccess) {
            // This checks the CUDA runtime, not NVRTC: label the failure
            // accordingly so it is not confused with nvrtc_errorchk reports.
            std::string msg = "CUDA runtime failed with error: ";
            msg += std::string(cudaGetErrorString(retCode));
            msg += ", " + std::string(file) + ", " + std::to_string(line);
            tf_exp(std::runtime_error(msg.c_str()));
        }
        return retCode;
    }

    // Wraps a CUDA runtime API call and checks its result, recording the call site.
    #ifndef TF_CUDART_CALL
        #define TF_CUDART_CALL(res) cudart_errorchk(res, __FILE__, __LINE__)
    #endif


    /**
     * @brief Convenience class for loading source from file and storing,
     * here intended for CUDA.
     */
    struct CUDARTSource {
        /** Stored source text. */
        std::string source;
        /** Name associated with the source (not owned by std::string; lifetime is up to the caller — TODO confirm). */
        const char *name;

        /**
         * @param filePath path of the file to load the source from
         * @param _name name to associate with the source
         */
        CUDARTSource(const char *filePath, const char *_name);
        /** C-string accessor; presumably returns the stored source — confirm in the implementation. */
        const char *c_str() const;
    };


    /**
     * @brief A JIT-compiled CUDA Tissue Forge program.
     */
    struct CUDARTProgram {
        /** NVRTC program handle. */
        nvrtcProgram *prog;
        /** PTX text; presumably filled by compile() — confirm in the implementation. */
        char *ptx;
        /** Compilation options (see addOpt). */
        std::vector<std::string> opts;
        /** Named expressions (see addNamedExpr). */
        std::vector<std::string> namedExprs;
        /** Include search directories (see addIncludePath). */
        std::vector<std::string> includePaths;
        /** Target architecture; encoding is not visible here — TODO confirm. */
        int arch;
        /** NOTE(review): presumably selects a compute_XX rather than sm_XX target — confirm. */
        bool is_compute;

        CUDARTProgram();
        ~CUDARTProgram();

        /**
         * @brief Add a compilation option.
         * @param opt option to add
         */
        void addOpt(const std::string &opt);

        /**
         * @brief Add a directory to include in the search path.
         * @param ipath directory path
         */
        void addIncludePath(const std::string &ipath);

        /**
         * @brief Add a named expression.
         * @param namedExpr named expression
         */
        void addNamedExpr(const std::string &namedExpr);

        /**
         * @brief Compile the program.
         * @param src source text
         * @param name program name
         * @param numHeaders number of entries in headers and includeNames
         * @param headers header contents (optional)
         * @param includeNames names under which the headers are included (optional)
         */
        void compile(const char *src, const char *name, int numHeaders=0, const char *const *headers=0, const char *const *includeNames=0);

        /**
         * @brief Get the lowered name of a named expression.
         * Cannot be called until after compilation.
         * @param namedExpr named expression
         * @return lowered name
         */
        std::string loweredName(const std::string namedExpr);
    };


    struct CUDAContext;


    /**
     * @brief A CUDA kernel from a JIT-compiled Tissue Forge program.
     */
    struct CUDAFunction {
        /** Name of the kernel. */
        const std::string name;
        /** Launch configuration: grid and block dimensions and shared memory size in bytes. */
        unsigned int gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes;
        /** Stream handle; presumably the stream the kernel is launched on — confirm. */
        CUstream hStream;
        /** NOTE(review): looks like the "extra" launch argument of the driver API — confirm. */
        void **extra;

        /**
         * @param name name of the kernel
         * @param context context to get the kernel from
         */
        CUDAFunction(const std::string &name, CUDAContext *context);
        ~CUDAFunction();

        /**
         * Configure the launch for a number of array elements.
         * NOTE(review): the parameters mirror those of the CUDA occupancy
         * helpers; presumably sets the grid/block members — confirm in the implementation.
         * @param _nr_arrayElems number of array elements
         * @param dynamicSMemSize dynamic shared memory size
         * @param blockSizeToDynamicSMemSize optional mapping from block size to dynamic shared memory size
         * @param blockSizeLimit block size limit
         */
        HRESULT autoConfig(const unsigned int &_nr_arrayElems,
                        size_t dynamicSMemSize=0,
                        size_t (*blockSizeToDynamicSMemSize)(int)=0,
                        int blockSizeLimit=0);

        /** Call with an array of pointers to the kernel arguments. */
        void operator()(void **args);
        /** Call with nargs variadic arguments; presumably each is a pointer to a kernel argument — confirm. */
        void operator()(int nargs, ...);

    private:
        /** Driver function handle. */
        CUfunction *function;
        /** Context passed at construction. */
        CUDAContext *context;
    };


    /**
     * @brief A convenience wrap of the CUDA context for JIT-compiled
     * Tissue Forge programs.
     */
    struct CUDAContext {
        /** Driver context handle. */
        CUcontext *context;
        /** Device of the context. */
        CUdevice device;
        /** Loaded module handle. */
        CUmodule *module;
        /** JIT options (see addOpt). */
        std::vector<CUjit_option> compileOpts;
        /** Values of the JIT options, parallel to compileOpts (see addOpt). */
        std::vector<void*> compileOptVals;

        /* Flag signifying whether this context is attached to a CPU thread. */
        bool attached;

        /**
         * @param device device of the context
         */
        CUDAContext(CUdevice device=0);
        ~CUDAContext();

        /**
         * Add a JIT option and its value.
         * @param opt option
         * @param val option value
         */
        void addOpt(CUjit_option opt, void *val);

        /**
         * @brief Load a compiled program.
         * @param prog compiled program
         */
        void loadProgram(const CUDARTProgram &prog);

        /**
         * @brief Load pre-compiled ptx.
         * @param ptx ptx text
         */
        void loadPTX(const char *ptx);

        /**
         * @brief Get a cuda function from a loaded module.
         * @param name name of the function
         */
        CUDAFunction *getFunction(const char *name);

        /**
         * @brief Get a global pointer from a loaded module.
         * @param name name of the global
         */
        CUdeviceptr *getGlobal(const char *name);

        /**
         * @brief Get the size of a global pointer from a loaded module.
         * @param name name of the global
         */
        size_t getGlobalSize(const char *name);

        /**
         * @brief Push the context onto the stack of current contexts of the CPU thread.
         */
        void pushCurrent();

        /**
         * @brief Pop the context from the stack of current contexts of the CPU thread.
         * @return the new current context
         */
        CUcontext *popCurrent();

        /**
         * @brief Destroy the context.
         */
        void destroy();

        /**
         * @brief Get the API version of this context.
         */
        int getAPIVersion();

        /**
         * @brief Synchronize GPU with calling CPU thread.
         */
        static void sync();
    };


    /**
     * @brief A simple interface with a CUDA device.
     */
    struct CUDADevice {
        /** Handle of the attached device. */
        CUdevice *device;

        CUDADevice();
        ~CUDADevice();

        /**
         * @brief Attach a CUDA-supporting device by id.
         * @param deviceId id of the device
         */
        void attachDevice(const int &deviceId=0);

        /** @brief Detach currently attached device. */
        void detachDevice();

        /** @brief Get the name of attached device. */
        std::string name();

        /** @brief Get architecture of attached device. */
        int arch();

        /** @brief Get the total memory of attached device. */
        size_t totalMem();

        /**
         * @brief Get the attribute value of attached device.
         * @param attrib attribute identifier
         */
        int getAttribute(const int &attrib);

        /** @brief Get the PCI bus id of this device. */
        std::string PCIBusId();

        /** @brief Create a context on this device. */
        CUDAContext *createContext();

        /** @brief Get the current context. If none exists, one is created. */
        CUDAContext *currentContext();

        /**
         * @brief Get the name of a device.
         * @param deviceId id of the device
         */
        static std::string getDeviceName(const int &deviceId);

        /**
         * @brief Get the total memory of device.
         * @param deviceId id of the device
         */
        static size_t getDeviceTotalMem(const int &deviceId);

        /**
         * @brief Get the attribute value of a device.
         * @param deviceId id of the device
         * @param attrib attribute identifier
         */
        static int getDeviceAttribute(const int &deviceId, const int &attrib);

        /** @brief Get number of available compute-capable devices. */
        static int getNumDevices();

        /**
         * @brief Get the PCI bus id of a device.
         * @param deviceId id of the device
         */
        static std::string getDevicePCIBusId(const int &deviceId);

        /** @brief Get the device id of the current context of the calling CPU thread. */
        static int getCurrentDevice();

        /** @brief Maximum number of threads per block. */
        int maxThreadsPerBlock();

        /** @brief Maximum x-dimension of a block. */
        int maxBlockDimX();

        /** @brief Maximum y-dimension of a block. */
        int maxBlockDimY();

        /** @brief Maximum z-dimension of a block. */
        int maxBlockDimZ();

        /** @brief Maximum x-dimension of a grid. */
        int maxGridDimX();

        /** @brief Maximum y-dimension of a grid. */
        int maxGridDimY();

        /** @brief Maximum z-dimension of a grid. */
        int maxGridDimZ();

        /** @brief Maximum amount of shared memory available to a thread block in bytes. */
        int maxSharedMemPerBlock();

        /** @brief Memory available on device for constant variables in a CUDA C kernel in bytes. */
        int maxTotalMemConst();

        /** @brief Warp size in threads. */
        int warpSize();

        /** @brief Maximum number of 32-bit registers available to a thread block. */
        int maxRegsPerBlock();

        /** @brief The typical clock frequency in kilohertz. */
        int clockRate();

        /** @brief Test if the device can concurrently copy memory between host and device while executing a kernel. */
        bool gpuOverlap();

        /** @brief Number of multiprocessors on the device. */
        int numMultiprocessors();

        /** @brief Test if there is a run time limit for kernels executed on the device. */
        bool kernelExecTimeout();

        /** @brief Test if device is not restricted and can have multiple CUDA contexts present at a single time. */
        bool computeModeDefault();

        /** @brief Test if device is prohibited from creating new CUDA contexts. */
        bool computeModeProhibited();

        /** @brief Test if device can have only one context used by a single process at a time. */
        bool computeModeExclusive();

        /** @brief PCI device (also known as slot) identifier of the device. */
        int PCIDeviceId();

        /** @brief PCI domain identifier of the device. */
        int PCIDomainId();

        /** @brief Peak memory clock frequency in kilohertz. */
        int clockRateMem();

        /** @brief Global memory bus width in bits. */
        int globalMemBusWidth();

        /** @brief Size of L2 cache in bytes. 0 if the device doesn't have L2 cache. */
        int L2CacheSize();

        /** @brief Maximum resident threads per multiprocessor. */
        int maxThreadsPerMultiprocessor();

        /** @brief Major compute capability version number. */
        int computeCapabilityMajor();

        /** @brief Minor compute capability version number. */
        int computeCapabilityMinor();

        /** @brief Test if device supports caching globals in L1 cache. */
        bool L1CacheSupportGlobal();

        /** @brief Test if device supports caching locals in L1 cache. */
        bool L1CacheSupportLocal();

        /** @brief Maximum amount of shared memory available to a multiprocessor in bytes. */
        int maxSharedMemPerMultiprocessor();

        /** @brief Maximum number of 32-bit registers available to a multiprocessor. */
        int maxRegsPerMultiprocessor();

        /** @brief Test if device supports allocating managed memory on this system. */
        bool managedMem();

        /** @brief Test if device is on a multi-GPU board. */
        bool multiGPUBoard();

        /** @brief Unique identifier for a group of devices associated with the same board. */
        int multiGPUBoardGroupId();

    private:

        // NOTE(review): presumably checks that a device is attached before a query — confirm in the implementation.
        void validateAttached();
        // NOTE(review): presumably checks that deviceId refers to an available device — confirm in the implementation.
        static void validateDeviceId(const int &deviceId);
    };


    // Tissue Forge CUDA interface


    /** @brief Initialize CUDA. */
    CPPAPI_FUNC(void) init();

    /**
     * NOTE(review): presumably selects the device used with OpenGL — confirm in the implementation.
     * @param deviceId id of the device
     */
    CPPAPI_FUNC(void) setGLDevice(const int &deviceId);

    /**
     * @brief Get the name of a device.
     * @param deviceId id of the device
     */
    CPPAPI_FUNC(std::string) getDeviceName(const int &deviceId);

    /**
     * @brief Get the total memory of device.
     * @param deviceId id of the device
     */
    CPPAPI_FUNC(size_t) getDeviceTotalMem(const int &deviceId);

    /**
     * @brief Get the attribute value of a device.
     * @param deviceId id of the device
     * @param attrib attribute identifier
     */
    CPPAPI_FUNC(int) getDeviceAttribute(const int &deviceId, const int &attrib);

    /** @brief Get number of available compute-capable devices. */
    CPPAPI_FUNC(int) getNumDevices();

    /**
     * @brief Get the PCI bus id of a device.
     * @param deviceId id of the device
     */
    CPPAPI_FUNC(std::string) getDevicePCIBusId(const int &deviceId);

    /** @brief Get the device id of the current context of the calling CPU thread. */
    CPPAPI_FUNC(int) getCurrentDevice();


    // Device attribute queries. Each takes the id of the device to query.

    /** @brief Maximum number of threads per block. */
    CPPAPI_FUNC(int) maxThreadsPerBlock(const int &deviceId);

    /** @brief Maximum x-dimension of a block. */
    CPPAPI_FUNC(int) maxBlockDimX(const int &deviceId);

    /** @brief Maximum y-dimension of a block. */
    CPPAPI_FUNC(int) maxBlockDimY(const int &deviceId);

    /** @brief Maximum z-dimension of a block. */
    CPPAPI_FUNC(int) maxBlockDimZ(const int &deviceId);

    /** @brief Maximum x-dimension of a grid. */
    CPPAPI_FUNC(int) maxGridDimX(const int &deviceId);

    /** @brief Maximum y-dimension of a grid. */
    CPPAPI_FUNC(int) maxGridDimY(const int &deviceId);

    /** @brief Maximum z-dimension of a grid. */
    CPPAPI_FUNC(int) maxGridDimZ(const int &deviceId);

    /** @brief Maximum amount of shared memory available to a thread block in bytes. */
    CPPAPI_FUNC(int) maxSharedMemPerBlock(const int &deviceId);

    /** @brief Memory available on device for constant variables in a CUDA C kernel in bytes. */
    CPPAPI_FUNC(int) maxTotalMemConst(const int &deviceId);

    /** @brief Warp size in threads. */
    CPPAPI_FUNC(int) warpSize(const int &deviceId);

    /** @brief Maximum number of 32-bit registers available to a thread block. */
    CPPAPI_FUNC(int) maxRegsPerBlock(const int &deviceId);

    /** @brief The typical clock frequency in kilohertz. */
    CPPAPI_FUNC(int) clockRate(const int &deviceId);

    /** @brief Test if the device can concurrently copy memory between host and device while executing a kernel. */
    CPPAPI_FUNC(bool) gpuOverlap(const int &deviceId);

    /** @brief Number of multiprocessors on the device. */
    CPPAPI_FUNC(int) numMultiprocessors(const int &deviceId);

    /** @brief Test if there is a run time limit for kernels executed on the device. */
    CPPAPI_FUNC(bool) kernelExecTimeout(const int &deviceId);

    /** @brief Test if device is not restricted and can have multiple CUDA contexts present at a single time. */
    CPPAPI_FUNC(bool) computeModeDefault(const int &deviceId);

    /** @brief Test if device is prohibited from creating new CUDA contexts. */
    CPPAPI_FUNC(bool) computeModeProhibited(const int &deviceId);

    /** @brief Test if device can have only one context used by a single process at a time. */
    CPPAPI_FUNC(bool) computeModeExclusive(const int &deviceId);

    /** @brief PCI device (also known as slot) identifier of the device. */
    CPPAPI_FUNC(int) PCIDeviceId(const int &deviceId);

    /** @brief PCI domain identifier of the device. */
    CPPAPI_FUNC(int) PCIDomainId(const int &deviceId);

    /** @brief Peak memory clock frequency in kilohertz. */
    CPPAPI_FUNC(int) clockRateMem(const int &deviceId);

    /** @brief Global memory bus width in bits. */
    CPPAPI_FUNC(int) globalMemBusWidth(const int &deviceId);

    /** @brief Size of L2 cache in bytes. 0 if the device doesn't have L2 cache. */
    CPPAPI_FUNC(int) L2CacheSize(const int &deviceId);

    /** @brief Maximum resident threads per multiprocessor. */
    CPPAPI_FUNC(int) maxThreadsPerMultiprocessor(const int &deviceId);

    /** @brief Major compute capability version number. */
    CPPAPI_FUNC(int) computeCapabilityMajor(const int &deviceId);

    /** @brief Minor compute capability version number. */
    CPPAPI_FUNC(int) computeCapabilityMinor(const int &deviceId);

    /** @brief Test if device supports caching globals in L1 cache. */
    CPPAPI_FUNC(bool) L1CacheSupportGlobal(const int &deviceId);

    /** @brief Test if device supports caching locals in L1 cache. */
    CPPAPI_FUNC(bool) L1CacheSupportLocal(const int &deviceId);

    /** @brief Maximum amount of shared memory available to a multiprocessor in bytes. */
    CPPAPI_FUNC(int) maxSharedMemPerMultiprocessor(const int &deviceId);

    /** @brief Maximum number of 32-bit registers available to a multiprocessor. */
    CPPAPI_FUNC(int) maxRegsPerMultiprocessor(const int &deviceId);

    /** @brief Test if device supports allocating managed memory on this system. */
    CPPAPI_FUNC(bool) managedMem(const int &deviceId);

    /** @brief Test if device is on a multi-GPU board. */
    CPPAPI_FUNC(bool) multiGPUBoard(const int &deviceId);

    /** @brief Unique identifier for a group of devices associated with the same board. */
    CPPAPI_FUNC(int) multiGPUBoardGroupId(const int &deviceId);


    /**
     * @brief Tests JIT-compiled program execution and deployment.
     * @param numBlocks number of blocks
     * @param numThreads number of threads
     * @param numEls number of elements
     * @param deviceId id of the device
     */
    CPPAPI_FUNC(void) test(const int &numBlocks, const int &numThreads, const int &numEls, const int &deviceId=0);

    /** @brief Returns the current path to the installed Tissue Forge include directory. */
    CPPAPI_FUNC(std::string) tfIncludePath();

    /**
     * @brief Set the current path to the installed Tissue Forge include directory.
     * @param _path path to set
     */
    CPPAPI_FUNC(HRESULT) setTfIncludePath(const std::string &_path);

    /** @brief Returns the path to the installed Tissue Forge private include directory. */
    CPPAPI_FUNC(std::string) tfPrivateIncludePath();

    /** @brief Returns the current path to the installed Tissue Forge resource directory. */
    CPPAPI_FUNC(std::string) tfResourcePath();

    /**
     * @brief Set the current path to the installed Tissue Forge resource directory.
     * @param _path path to set
     */
    CPPAPI_FUNC(HRESULT) setTfResourcePath(const std::string &_path);

    /** @brief Returns the path to the installed Tissue Forge CUDA resources directory. */
    CPPAPI_FUNC(std::string) CUDAPath();

    /** @brief Returns the current path to the installed CUDA include directory. */
    CPPAPI_FUNC(std::string) CUDAIncludePath();

    /**
     * @brief Set the current path to the installed CUDA include directory.
     * @param _path path to set
     */
    CPPAPI_FUNC(HRESULT) setCUDAIncludePath(const std::string &_path);

    /**
     * @brief Returns an absolute path to a subdirectory of the installed Tissue Forge CUDA resources directory.
     * @param relativePath path of the subdirectory, relative to the CUDA resources directory
     */
    CPPAPI_FUNC(std::string) CUDAResourcePath(const std::string &relativePath);

    /** @brief Returns the relative path to the installed Tissue Forge CUDA PTX object directory. */
    CPPAPI_FUNC(std::string) CUDAPTXObjectRelPath();

    /** @brief Returns the supported CUDA architectures of the installation. */
    CPPAPI_FUNC(std::vector<std::string>) CUDAArchs();


};


#endif // _SOURCE_TF_CUDA_H_

TissueForge::cuda
Tissue Forge GPU acceleration on CUDA-supporting devices.
Definition tfAngleConfig.h:26

TissueForge::cuda::gpuOverlap
bool gpuOverlap(const int &deviceId)
Test if the device can concurrently copy memory between host and device while executing a kernel.

TissueForge::cuda::maxBlockDimZ
int maxBlockDimZ(const int &deviceId)
Maximum z-dimension of a block.

TissueForge::cuda::test
void test(const int &numBlocks, const int &numThreads, const int &numEls, const int &deviceId=0)
Tests JIT-compiled program execution and deployment.

TissueForge::cuda::clockRateMem
int clockRateMem(const int &deviceId)
Peak memory clock frequency in kilohertz.

TissueForge::cuda::maxGridDimX
int maxGridDimX(const int &deviceId)
Maximum x-dimension of a grid.

TissueForge::cuda::maxSharedMemPerMultiprocessor
int maxSharedMemPerMultiprocessor(const int &deviceId)
Maximum amount of shared memory available to a multiprocessor in bytes.

TissueForge::cuda::getDeviceAttribute
int getDeviceAttribute(const int &deviceId, const int &attrib)
Get the attribute value of a device.

TissueForge::cuda::L1CacheSupportGlobal
bool L1CacheSupportGlobal(const int &deviceId)
Test if device supports caching globals in L1 cache.

TissueForge::cuda::L1CacheSupportLocal
bool L1CacheSupportLocal(const int &deviceId)
Test if device supports caching locals in L1 cache.

TissueForge::cuda::getDevicePCIBusId
std::string getDevicePCIBusId(const int &deviceId)
Get the PCI bus id of a device.

TissueForge::cuda::maxGridDimY
int maxGridDimY(const int &deviceId)
Maximum y-dimension of a grid.

TissueForge::cuda::CUDAPTXObjectRelPath
std::string CUDAPTXObjectRelPath()
Returns the relative path to the installed Tissue Forge CUDA PTX object directory.

TissueForge::cuda::setCUDAIncludePath
HRESULT setCUDAIncludePath(const std::string &_path)
Set the current path to the installed CUDA include directory.

TissueForge::cuda::computeModeDefault
bool computeModeDefault(const int &deviceId)
Test if device is not restricted and can have multiple CUDA contexts present at a single time.

TissueForge::cuda::setTfResourcePath
HRESULT setTfResourcePath(const std::string &_path)
Set the current path to the installed Tissue Forge resource directory.

TissueForge::cuda::maxThreadsPerBlock
int maxThreadsPerBlock(const int &deviceId)
Maximum number of threads per block.

TissueForge::cuda::maxThreadsPerMultiprocessor
int maxThreadsPerMultiprocessor(const int &deviceId)
Maximum resident threads per multiprocessor.

TissueForge::cuda::maxRegsPerBlock
int maxRegsPerBlock(const int &deviceId)
Maximum number of 32-bit registers available to a thread block.

TissueForge::cuda::computeCapabilityMajor
int computeCapabilityMajor(const int &deviceId)
Major compute capability version number.

TissueForge::cuda::clockRate
int clockRate(const int &deviceId)
The typical clock frequency in kilohertz.

TissueForge::cuda::tfPrivateIncludePath
std::string tfPrivateIncludePath()
Returns the path to the installed Tissue Forge private include directory.

TissueForge::cuda::computeCapabilityMinor
int computeCapabilityMinor(const int &deviceId)
Minor compute capability version number.

TissueForge::cuda::CUDAPath
std::string CUDAPath()
Returns the path to the installed Tissue Forge CUDA resources directory.

TissueForge::cuda::tfResourcePath
std::string tfResourcePath()
Returns the current path to the installed Tissue Forge resource directory.

TissueForge::cuda::getCurrentDevice
int getCurrentDevice()
Get the device id of the current context of the calling CPU thread.

TissueForge::cuda::tfIncludePath
std::string tfIncludePath()
Returns the current path to the installed Tissue Forge include directory.

TissueForge::cuda::multiGPUBoard
bool multiGPUBoard(const int &deviceId)
Test if device is on a multi-GPU board.

TissueForge::cuda::maxBlockDimX
int maxBlockDimX(const int &deviceId)
Maximum x-dimension of a block.

TissueForge::cuda::L2CacheSize
int L2CacheSize(const int &deviceId)
Size of L2 cache in bytes. 0 if the device doesn't have L2 cache.

TissueForge::cuda::computeModeProhibited
bool computeModeProhibited(const int &deviceId)
Test if device is prohibited from creating new CUDA contexts.

TissueForge::cuda::numMultiprocessors
int numMultiprocessors(const int &deviceId)
Number of multiprocessors on the device.

TissueForge::cuda::PCIDeviceId
int PCIDeviceId(const int &deviceId)
PCI device (also known as slot) identifier of the device.

TissueForge::cuda::getNumDevices
int getNumDevices()
Get number of available compute-capable devices.

TissueForge::cuda::kernelExecTimeout
bool kernelExecTimeout(const int &deviceId)
Test if there is a run time limit for kernels executed on the device.

TissueForge::cuda::maxBlockDimY
int maxBlockDimY(const int &deviceId)
Maximum y-dimension of a block.

TissueForge::cuda::multiGPUBoardGroupId
int multiGPUBoardGroupId(const int &deviceId)
Unique identifier for a group of devices associated with the same board.

TissueForge::cuda::getDeviceName
std::string getDeviceName(const int &deviceId)
Get the name of a device.

TissueForge::cuda::PCIDomainId
int PCIDomainId(const int &deviceId)
PCI domain identifier of the device.

TissueForge::cuda::setTfIncludePath
HRESULT setTfIncludePath(const std::string &_path)
Set the current path to the installed Tissue Forge include directory.

TissueForge::cuda::managedMem
bool managedMem(const int &deviceId)
Test if device supports allocating managed memory on this system.

TissueForge::cuda::warpSize
int warpSize(const int &deviceId)
Warp size in threads.

TissueForge::cuda::maxGridDimZ
int maxGridDimZ(const int &deviceId)
Maximum z-dimension of a grid.

TissueForge::cuda::CUDAArchs
std::vector< std::string > CUDAArchs()
Returns the supported CUDA architectures of the installation.

TissueForge::cuda::CUDAResourcePath
std::string CUDAResourcePath(const std::string &relativePath)
Returns an absolute path to a subdirectory of the installed Tissue Forge CUDA resources directory.

TissueForge::cuda::getDeviceTotalMem
size_t getDeviceTotalMem(const int &deviceId)
Get the total memory of device.

TissueForge::cuda::CUDAIncludePath
std::string CUDAIncludePath()
Returns the current path to the installed CUDA include directory.

TissueForge::cuda::init
void init()
Initialize CUDA.

TissueForge::cuda::computeModeExclusive
bool computeModeExclusive(const int &deviceId)
Test if device can have only one context used by a single process at a time.

TissueForge::cuda::maxSharedMemPerBlock
int maxSharedMemPerBlock(const int &deviceId)
Maximum amount of shared memory available to a thread block in bytes.

TissueForge::cuda::globalMemBusWidth
int globalMemBusWidth(const int &deviceId)
Global memory bus width in bits.

TissueForge::cuda::maxTotalMemConst
int maxTotalMemConst(const int &deviceId)
Memory available on device for constant variables in a CUDA C kernel in bytes.

TissueForge::cuda::maxRegsPerMultiprocessor
int maxRegsPerMultiprocessor(const int &deviceId)
Maximum number of 32-bit registers available to a multiprocessor.

TissueForge::cuda::CUDAContext
A convenience wrap of the CUDA context for JIT-compiled Tissue Forge programs.
Definition tf_cuda.h:223

TissueForge::cuda::CUDAContext::getFunction
CUDAFunction * getFunction(const char *name)
Get a cuda function from a loaded module.

TissueForge::cuda::CUDAContext::getGlobal
CUdeviceptr * getGlobal(const char *name)
Get a global pointer from a loaded module.

TissueForge::cuda::CUDAContext::loadPTX
void loadPTX(const char *ptx)
Load pre-compiled ptx.

TissueForge::cuda::CUDAContext::popCurrent
CUcontext * popCurrent()
Pop the context from the stack of current contexts of the CPU thread and return the new current context.

TissueForge::cuda::CUDAContext::sync
static void sync()
Synchronize GPU with calling CPU thread. Blocks until all preceding tasks of the current context are ...

TissueForge::cuda::CUDAContext::loadProgram
void loadProgram(const CUDARTProgram &prog)
Load a compiled program.

TissueForge::cuda::CUDAContext::pushCurrent
void pushCurrent()
Push the context onto the stack of current contexts of the CPU thread.

TissueForge::cuda::CUDAContext::getAPIVersion
int getAPIVersion()
Get the API version of this context.

TissueForge::cuda::CUDAContext::getGlobalSize
size_t getGlobalSize(const char *name)
Get the size of a global pointer from a loaded module.

TissueForge::cuda::CUDAContext::destroy
void destroy()
Destroy the context.

TissueForge::cuda::CUDADevice
A simple interface with a CUDA device.
Definition tf_cuda.h:319

TissueForge::cuda::CUDADevice::name
std::string name()
Get the name of attached device.

TissueForge::cuda::CUDADevice::computeCapabilityMajor
int computeCapabilityMajor()
Major compute capability version number.

TissueForge::cuda::CUDADevice::maxBlockDimZ
int maxBlockDimZ()
Maximum z-dimension of a block.

TissueForge::cuda::CUDADevice::computeCapabilityMinor
int computeCapabilityMinor()
Minor compute capability version number.

TissueForge::cuda::CUDADevice::maxSharedMemPerMultiprocessor
int maxSharedMemPerMultiprocessor()
Maximum amount of shared memory available to a multiprocessor in bytes.

TissueForge::cuda::CUDADevice::getDeviceName
static std::string getDeviceName(const int &deviceId)
Get the name of a device.

TissueForge::cuda::CUDADevice::maxTotalMemConst
int maxTotalMemConst()
Memory available on device for constant variables in a CUDA C kernel in bytes.

TissueForge::cuda::CUDADevice::maxRegsPerBlock
int maxRegsPerBlock()
Maximum number of 32-bit registers available to a thread block.

TissueForge::cuda::CUDADevice::clockRateMem
int clockRateMem()
Peak memory clock frequency in kilohertz.

TissueForge::cuda::CUDADevice::detachDevice
void detachDevice()
Detach currently attached device.

TissueForge::cuda::CUDADevice::maxGridDimY
int maxGridDimY()
Maximum y-dimension of a grid.

TissueForge::cuda::CUDADevice::maxThreadsPerMultiprocessor
int maxThreadsPerMultiprocessor()
Maximum resident threads per multiprocessor.

TissueForge::cuda::CUDADevice::L1CacheSupportGlobal
bool L1CacheSupportGlobal()
Test if device supports caching globals in L1 cache.

TissueForge::cuda::CUDADevice::multiGPUBoardGroupId
int multiGPUBoardGroupId()
Unique identifier for a group of devices associated with the same board.

TissueForge::cuda::CUDADevice::maxSharedMemPerBlock
int maxSharedMemPerBlock()
Maximum amount of shared memory available to a thread block in bytes.

TissueForge::cuda::CUDADevice::maxBlockDimY
int maxBlockDimY()
Maximum y-dimension of a block.

TissueForge::cuda::CUDADevice::maxGridDimZ
int maxGridDimZ()
Maximum z-dimension of a grid.

TissueForge::cuda::CUDADevice::clockRate
int clockRate()
The typical clock frequency in kilohertz.

TissueForge::cuda::CUDADevice::multiGPUBoard
bool multiGPUBoard()
Test if device is on a multi-GPU board.

TissueForge::cuda::CUDADevice::maxThreadsPerBlock
int maxThreadsPerBlock()
Maximum number of threads per block.

TissueForge::cuda::CUDADevice::L2CacheSize
int L2CacheSize()
Size of L2 cache in bytes. 0 if the device doesn't have L2 cache.

TissueForge::cuda::CUDADevice::warpSize
int warpSize()
Warp size in threads.

TissueForge::cuda::CUDADevice::maxRegsPerMultiprocessor
int maxRegsPerMultiprocessor()
Maximum number of 32-bit registers available to a multiprocessor.

TissueForge::cuda::CUDADevice::kernelExecTimeout
bool kernelExecTimeout()
Test if there is a run time limit for kernels executed on the device.

TissueForge::cuda::CUDADevice::maxBlockDimX
int maxBlockDimX()
Maximum x-dimension of a block.

TissueForge::cuda::CUDADevice::getNumDevices
static int getNumDevices()
Get number of available compute-capable devices.

TissueForge::cuda::CUDADevice::attachDevice
void attachDevice(const int &deviceId=0)
Attach a CUDA-supporting device by id.

TissueForge::cuda::CUDADevice::createContext
CUDAContext * createContext()
Create a context on this device.

TissueForge::cuda::CUDADevice::gpuOverlap
bool gpuOverlap()
Test if the device can concurrently copy memory between host and device while executing a kernel.

TissueForge::cuda::CUDADevice::arch
int arch()
Get architecture of attached device.

TissueForge::cuda::CUDADevice::PCIBusId
std::string PCIBusId()
Get the PCI bus id of this device.

TissueForge::cuda::CUDADevice::getCurrentDevice
static int getCurrentDevice()
Get the device id of the current context of the calling CPU thread.

TissueForge::cuda::CUDADevice::computeModeProhibited
bool computeModeProhibited()
Test if device is prohibited from creating new CUDA contexts.

TissueForge::cuda::CUDADevice::PCIDomainId
int PCIDomainId()
PCI domain identifier of the device.

TissueForge::cuda::CUDADevice::numMultiprocessors
int numMultiprocessors()
Number of multiprocessors on the device.

TissueForge::cuda::CUDADevice::PCIDeviceId
int PCIDeviceId()
PCI device (also known as slot) identifier of the device.

TissueForge::cuda::CUDADevice::getDevicePCIBusId
static std::string getDevicePCIBusId(const int &deviceId)
Get the PCI bus id of a device.

TissueForge::cuda::CUDADevice::globalMemBusWidth
int globalMemBusWidth()
Global memory bus width in bits.

TissueForge::cuda::CUDADevice::getDeviceTotalMem
static size_t getDeviceTotalMem(const int &deviceId)
Get the total memory of device.

TissueForge::cuda::CUDADevice::totalMem
size_t totalMem()
Get the total memory of attached device.

TissueForge::cuda::CUDADevice::getDeviceAttribute
static int getDeviceAttribute(const int &deviceId, const int &attrib)
Get the attribute value of a device.

TissueForge::cuda::CUDADevice::L1CacheSupportLocal
bool L1CacheSupportLocal()
Test if device supports caching locals in L1 cache.

TissueForge::cuda::CUDADevice::maxGridDimX
int maxGridDimX()
Maximum x-dimension of a grid.

TissueForge::cuda::CUDADevice::currentContext
CUDAContext * currentContext()
Get the current context. If none exists, one is created.

TissueForge::cuda::CUDADevice::managedMem
bool managedMem()
Test if device supports allocating managed memory on this system.

TissueForge::cuda::CUDADevice::computeModeDefault
bool computeModeDefault()
Test if device is not restricted and can have multiple CUDA contexts present at a single time.

TissueForge::cuda::CUDADevice::getAttribute
int getAttribute(const int &attrib)
Get the attribute value of attached device.

TissueForge::cuda::CUDADevice::computeModeExclusive
bool computeModeExclusive()
Test if device can have only one context used by a single process at a time.

TissueForge::cuda::CUDAFunction
A CUDA kernel from a JIT-compiled Tissue Forge program.
Definition tf_cuda.h:196

TissueForge::cuda::CUDARTProgram
A JIT-compiled CUDA Tissue Forge program.
Definition tf_cuda.h:134

TissueForge::cuda::CUDARTProgram::addNamedExpr
void addNamedExpr(const std::string &namedExpr)
Add a named expression.

TissueForge::cuda::CUDARTProgram::loweredName
std::string loweredName(const std::string namedExpr)
Get the lowered name of a named expression. Cannot be called until after compilation.

TissueForge::cuda::CUDARTProgram::addIncludePath
void addIncludePath(const std::string &ipath)
Add a directory to include in the search path.

TissueForge::cuda::CUDARTProgram::compile
void compile(const char *src, const char *name, int numHeaders=0, const char *const *headers=0, const char *const *includeNames=0)
Compile the program.

TissueForge::cuda::CUDARTProgram::addOpt
void addOpt(const std::string &opt)
Add a compilation option.

TissueForge::cuda::CUDARTSource
Convenience class for loading source from file and storing, here intended for CUDA.
Definition tf_cuda.h:118

HRESULT
int32_t HRESULT
Definition tf_port.h:255

tfError.h