文章目录
通过 CUDA API 获取设备信息
CUDA 提供了一系列 API 来查询设备的硬件信息、性能特性和限制。以下是一些常用的获取设备信息的 API:
1. 获取设备数量
使用 cudaGetDeviceCount
可以获取系统中可用的 CUDA 设备数量。
// Query how many CUDA devices the runtime can see, checking the call's status.
int deviceCount = 0;
cudaError_t countStatus = cudaGetDeviceCount(&deviceCount);
if (countStatus != cudaSuccess) {
    // Report the failure instead of silently printing a count that was never filled in.
    std::cerr << "cudaGetDeviceCount failed: " << cudaGetErrorString(countStatus) << std::endl;
    deviceCount = 0;
}
std::cout << "Number of CUDA devices: " << deviceCount << std::endl;
2. 获取当前设备 ID
使用 cudaGetDevice
可以获取当前正在使用的设备 ID。
// Ask the runtime for the ID of the device currently selected, then print it.
int currentDevice = 0;
cudaGetDevice(&currentDevice);
std::cout << "Current device ID: " << currentDevice << std::endl;
3. 设置当前设备
使用 cudaSetDevice
可以设置当前使用的设备。
int deviceId = 0; // ID of the device to select
cudaError_t setStatus = cudaSetDevice(deviceId);
if (setStatus != cudaSuccess) {
    // Surface the failure (e.g. an invalid ID) instead of continuing silently.
    std::cerr << "cudaSetDevice(" << deviceId << ") failed: "
              << cudaGetErrorString(setStatus) << std::endl;
}
4. 获取设备属性
使用 cudaGetDeviceProperties
可以获取设备的详细属性信息,包括硬件特性、计算能力、内存大小等。
// Fetch the property record for deviceId and print a selection of its fields.
cudaDeviceProp deviceProp;
cudaError_t propStatus = cudaGetDeviceProperties(&deviceProp, deviceId); // deviceId is the device ID
if (propStatus != cudaSuccess) {
    // deviceProp must not be read when the query failed, so only report the error.
    std::cerr << "cudaGetDeviceProperties failed: " << cudaGetErrorString(propStatus) << std::endl;
} else {
    std::cout << "Device Name: " << deviceProp.name << std::endl;
    std::cout << "Compute Capability: " << deviceProp.major << "." << deviceProp.minor << std::endl;
    // totalGlobalMem is converted from bytes to MB for display.
    std::cout << "Total Global Memory: " << deviceProp.totalGlobalMem / (1024 * 1024) << " MB" << std::endl;
    std::cout << "Max Threads per Block: " << deviceProp.maxThreadsPerBlock << std::endl;
    std::cout << "Max Threads Dim: (" << deviceProp.maxThreadsDim[0] << ", "
              << deviceProp.maxThreadsDim[1] << ", " << deviceProp.maxThreadsDim[2] << ")" << std::endl;
    std::cout << "Max Grid Size: (" << deviceProp.maxGridSize[0] << ", "
              << deviceProp.maxGridSize[1] << ", " << deviceProp.maxGridSize[2] << ")" << std::endl;
    std::cout << "Warp Size: " << deviceProp.warpSize << std::endl;
    std::cout << "Multiprocessor Count: " << deviceProp.multiProcessorCount << std::endl;
}
5. 获取设备限制
使用 cudaDeviceGetLimit
可以获取设备的资源限制,例如栈大小、堆大小等。
// Read the stack-size and malloc-heap-size limits, then print both in bytes.
size_t stackLimitBytes = 0;
size_t heapLimitBytes = 0;
cudaDeviceGetLimit(&stackLimitBytes, cudaLimitStackSize);
cudaDeviceGetLimit(&heapLimitBytes, cudaLimitMallocHeapSize);
std::cout << "Stack Size Limit: " << stackLimitBytes << " bytes" << std::endl
          << "Heap Size Limit: " << heapLimitBytes << " bytes" << std::endl;
6. 获取设备共享内存配置
使用 cudaDeviceGetSharedMemConfig
可以获取设备的共享内存配置(4 字节或 8 字节 bank 大小)。
// Query the shared-memory bank size configuration and print it.
// NOTE(review): recent CUDA toolkits may mark cudaDeviceGetSharedMemConfig as
// deprecated — confirm against the toolkit version you target.
// Start from the default enumerator so the value is defined even if the query fails.
cudaSharedMemConfig config = cudaSharedMemBankSizeDefault;
cudaError_t bankStatus = cudaDeviceGetSharedMemConfig(&config);
if (bankStatus != cudaSuccess) {
    std::cerr << "cudaDeviceGetSharedMemConfig failed: " << cudaGetErrorString(bankStatus) << std::endl;
} else if (config == cudaSharedMemBankSizeFourByte) {
    std::cout << "Shared Memory Bank Size: 4 bytes" << std::endl;
} else if (config == cudaSharedMemBankSizeEightByte) {
    std::cout << "Shared Memory Bank Size: 8 bytes" << std::endl;
} else {
    // Remaining enumerator (cudaSharedMemBankSizeDefault): previously printed nothing.
    std::cout << "Shared Memory Bank Size: default" << std::endl;
}
7. 获取设备缓存配置
使用 cudaDeviceGetCacheConfig
可以获取设备的 L1 缓存和共享内存的配置。
// Query the cache configuration preference and print the matching description.
cudaFuncCache cacheConfig;
cudaDeviceGetCacheConfig(&cacheConfig);

// Map the enumerator to its message; stays null for any value not listed below.
const char* cacheMessage = nullptr;
if (cacheConfig == cudaFuncCachePreferNone) {
    cacheMessage = "Cache Config: Prefer None";
} else if (cacheConfig == cudaFuncCachePreferShared) {
    cacheMessage = "Cache Config: Prefer Shared Memory";
} else if (cacheConfig == cudaFuncCachePreferL1) {
    cacheMessage = "Cache Config: Prefer L1 Cache";
} else if (cacheConfig == cudaFuncCachePreferEqual) {
    cacheMessage = "Cache Config: Prefer Equal L1 and Shared Memory";
}

// Nothing is printed for an unlisted value.
if (cacheMessage != nullptr) {
    std::cout << cacheMessage << std::endl;
}
8. 获取设备是否支持统一内存
使用 cudaDeviceGetAttribute
可以查询设备是否支持统一内存(Unified Memory)。
// Unified Memory (managed allocations) support is reported by cudaDevAttrManagedMemory.
// cudaDevAttrUnifiedAddressing answers a different question: whether the device
// shares a unified address space with the host.
int managedMemory = 0;
cudaDeviceGetAttribute(&managedMemory, cudaDevAttrManagedMemory, deviceId);
if (managedMemory) {
    std::cout << "Device supports Unified Memory" << std::endl;
} else {
    std::cout << "Device does not support Unified Memory" << std::endl;
}
9. 获取设备是否支持并发内核执行
使用 cudaDeviceGetAttribute
可以查询设备是否支持并发内核执行。
// Query the concurrent-kernels attribute for deviceId and print the result.
int supportsConcurrentKernels = 0;
cudaDeviceGetAttribute(&supportsConcurrentKernels, cudaDevAttrConcurrentKernels, deviceId);
std::cout << (supportsConcurrentKernels
                  ? "Device supports Concurrent Kernel Execution"
                  : "Device does not support Concurrent Kernel Execution")
          << std::endl;
10. 获取每个多处理器(SM)的最大线程块数
使用 cudaDeviceGetAttribute
可以查询每个多处理器(SM)上的最大线程块数。
// Query the max-blocks-per-multiprocessor attribute for deviceId and print it.
int blocksPerSm = 0;
const cudaDeviceAttr blocksAttr = cudaDevAttrMaxBlocksPerMultiprocessor;
cudaDeviceGetAttribute(&blocksPerSm, blocksAttr, deviceId);
std::cout << "Max Blocks per Multiprocessor: " << blocksPerSm << std::endl;
11. 获取设备的时钟频率
使用 cudaDeviceGetAttribute
可以查询设备的时钟频率。
// Query the clock-rate attribute for deviceId; the value is printed with a kHz unit.
int coreClockKHz = 0;
const cudaDeviceAttr clockAttr = cudaDevAttrClockRate;
cudaDeviceGetAttribute(&coreClockKHz, clockAttr, deviceId);
std::cout << "Device Clock Rate: " << coreClockKHz << " kHz" << std::endl;
12. 获取设备的内存时钟频率
使用 cudaDeviceGetAttribute
可以查询设备的内存时钟频率。
// Query the memory-clock-rate attribute for deviceId; the value is printed with a kHz unit.
int memClockKHz = 0;
const cudaDeviceAttr memClockAttr = cudaDevAttrMemoryClockRate;
cudaDeviceGetAttribute(&memClockKHz, memClockAttr, deviceId);
std::cout << "Memory Clock Rate: " << memClockKHz << " kHz" << std::endl;
13. 获取设备的总显存大小
使用 cudaMemGetInfo
可以查询当前设备的总显存大小(cudaDeviceGetAttribute 没有提供显存大小对应的属性,且它只能返回 int 类型的值)。
// cudaMemGetInfo reports the free and total memory, in bytes, of the current
// device (the one selected with cudaSetDevice); it takes no device ID argument.
size_t freeMemory = 0;
size_t totalMemory = 0;
cudaMemGetInfo(&freeMemory, &totalMemory);
std::cout << "Total Device Memory: " << totalMemory / (1024 * 1024) << " MB" << std::endl;
案例
#include <iostream>
#include <cuda_runtime.h>
// Prints a human-readable summary of one CUDA device to std::cout.
//
// deviceId: ID of the device to describe.
//
// If any runtime query fails, the error is written to std::cerr and the function
// returns without printing the summary, so no uninitialised data is ever shown.
void printDeviceInfo(int deviceId) {
    cudaDeviceProp deviceProp;
    cudaError_t status = cudaGetDeviceProperties(&deviceProp, deviceId);
    if (status != cudaSuccess) {
        std::cerr << "cudaGetDeviceProperties failed for device " << deviceId << ": "
                  << cudaGetErrorString(status) << std::endl;
        return;
    }

    // Clock rates are read through cudaDeviceGetAttribute instead of the
    // cudaDeviceProp::clockRate / memoryClockRate fields.
    // NOTE(review): those struct fields appear to have been removed in CUDA 13 — confirm.
    int clockRateKHz = 0;
    status = cudaDeviceGetAttribute(&clockRateKHz, cudaDevAttrClockRate, deviceId);
    if (status != cudaSuccess) {
        std::cerr << "cudaDeviceGetAttribute(cudaDevAttrClockRate) failed for device " << deviceId << ": "
                  << cudaGetErrorString(status) << std::endl;
        return;
    }
    int memoryClockRateKHz = 0;
    status = cudaDeviceGetAttribute(&memoryClockRateKHz, cudaDevAttrMemoryClockRate, deviceId);
    if (status != cudaSuccess) {
        std::cerr << "cudaDeviceGetAttribute(cudaDevAttrMemoryClockRate) failed for device " << deviceId << ": "
                  << cudaGetErrorString(status) << std::endl;
        return;
    }

    std::cout << "Device " << deviceId << ": " << deviceProp.name << std::endl;
    std::cout << " Compute Capability: " << deviceProp.major << "." << deviceProp.minor << std::endl;
    // Byte counts are scaled to MB / KB for display.
    std::cout << " Total Global Memory: " << deviceProp.totalGlobalMem / (1024 * 1024) << " MB" << std::endl;
    std::cout << " Shared Memory per Block: " << deviceProp.sharedMemPerBlock / 1024 << " KB" << std::endl;
    std::cout << " Registers per Block: " << deviceProp.regsPerBlock << std::endl;
    std::cout << " Warp Size: " << deviceProp.warpSize << std::endl;
    std::cout << " Max Threads per Block: " << deviceProp.maxThreadsPerBlock << std::endl;
    std::cout << " Max Threads Dim: (" << deviceProp.maxThreadsDim[0] << ", "
              << deviceProp.maxThreadsDim[1] << ", " << deviceProp.maxThreadsDim[2] << ")" << std::endl;
    std::cout << " Max Grid Size: (" << deviceProp.maxGridSize[0] << ", "
              << deviceProp.maxGridSize[1] << ", " << deviceProp.maxGridSize[2] << ")" << std::endl;
    std::cout << " Multiprocessor Count: " << deviceProp.multiProcessorCount << std::endl;
    // The clock values are divided by 1000 and labelled MHz.
    std::cout << " Clock Rate: " << clockRateKHz / 1000 << " MHz" << std::endl;
    std::cout << " Memory Clock Rate: " << memoryClockRateKHz / 1000 << " MHz" << std::endl;
    std::cout << " Memory Bus Width: " << deviceProp.memoryBusWidth << " bits" << std::endl;
    std::cout << " L2 Cache Size: " << deviceProp.l2CacheSize / 1024 << " KB" << std::endl;
    std::cout << " Max Texture Dimensions: (" << deviceProp.maxTexture1D << ", "
              << deviceProp.maxTexture2D[0] << "x" << deviceProp.maxTexture2D[1] << ", "
              << deviceProp.maxTexture3D[0] << "x" << deviceProp.maxTexture3D[1] << "x" << deviceProp.maxTexture3D[2] << ")" << std::endl;
    std::cout << " Unified Addressing: " << (deviceProp.unifiedAddressing ? "Yes" : "No") << std::endl;
    std::cout << " Concurrent Kernels: " << (deviceProp.concurrentKernels ? "Yes" : "No") << std::endl;
    std::cout << " ECC Enabled: " << (deviceProp.ECCEnabled ? "Yes" : "No") << std::endl;
    std::cout << " PCI Bus ID: " << deviceProp.pciBusID << std::endl;
    std::cout << " PCI Device ID: " << deviceProp.pciDeviceID << std::endl;
    std::cout << " PCI Domain ID: " << deviceProp.pciDomainID << std::endl;
    std::cout << std::endl;
}
// Entry point: reports the device count and the current device ID, then prints
// the details of every device. Returns 1 if no device is found or a runtime
// query fails, 0 otherwise.
int main() {
    // Get the number of devices; a failed query is reported and treated as fatal.
    int deviceCount = 0;
    cudaError_t status = cudaGetDeviceCount(&deviceCount);
    if (status != cudaSuccess) {
        std::cerr << "cudaGetDeviceCount failed: " << cudaGetErrorString(status) << std::endl;
        return 1;
    }
    if (deviceCount == 0) {
        std::cerr << "No CUDA devices found!" << std::endl;
        return 1;
    }
    std::cout << "Number of CUDA devices: " << deviceCount << std::endl;

    // Get the current device ID.
    int currentDevice = 0;
    status = cudaGetDevice(&currentDevice);
    if (status != cudaSuccess) {
        std::cerr << "cudaGetDevice failed: " << cudaGetErrorString(status) << std::endl;
        return 1;
    }
    std::cout << "Current device ID: " << currentDevice << std::endl;

    // Print the detailed information of every device.
    for (int deviceId = 0; deviceId < deviceCount; deviceId++) {
        printDeviceInfo(deviceId);
    }
    return 0;
}
Number of CUDA devices: 2
Current device ID: 0
Device 0: NVIDIA GeForce RTX 3080
Compute Capability: 8.6
Total Global Memory: 10240 MB
Shared Memory per Block: 48 KB
Registers per Block: 65536
Warp Size: 32
Max Threads per Block: 1024
Max Threads Dim: (1024, 1024, 64)
Max Grid Size: (2147483647, 65535, 65535)
Multiprocessor Count: 68
Clock Rate: 1710 MHz
Memory Clock Rate: 9500 MHz
Memory Bus Width: 320 bits
L2 Cache Size: 5120 KB
Max Texture Dimensions: (131072, 131072x65536, 16384x16384x16384)
Unified Addressing: Yes
Concurrent Kernels: Yes
ECC Enabled: No
PCI Bus ID: 1
PCI Device ID: 0
PCI Domain ID: 0
Device 1: NVIDIA Quadro P4000
Compute Capability: 6.1
Total Global Memory: 8192 MB
Shared Memory per Block: 48 KB
Registers per Block: 65536
Warp Size: 32
Max Threads per Block: 1024
Max Threads Dim: (1024, 1024, 64)
Max Grid Size: (2147483647, 65535, 65535)
Multiprocessor Count: 14
Clock Rate: 1480 MHz
Memory Clock Rate: 7600 MHz
Memory Bus Width: 256 bits
L2 Cache Size: 2048 KB
Max Texture Dimensions: (131072, 131072x65536, 16384x16384x16384)
Unified Addressing: Yes
Concurrent Kernels: Yes
ECC Enabled: No
PCI Bus ID: 2
PCI Device ID: 0
PCI Domain ID: 0