#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size);

/**
 * Element-wise integer vector addition: c[i] = a[i] + b[i] for i in [0, n).
 *
 * Launch layout: any 1-D grid of 1-D blocks. Every thread starts at its global
 * index and advances by the total number of launched threads, so the result
 * does not depend on the launch configuration. No shared memory is used.
 *
 * @param c  device pointer to the output, at least n ints
 * @param a  device pointer to the first input, at least n ints
 * @param b  device pointer to the second input, at least n ints
 * @param n  number of elements to process
 */
__global__ void addKernel(int *c, const int *a, const int *b, unsigned int n)
{
    // size_t arithmetic keeps blockIdx.x * blockDim.x from wrapping for large n.
    const size_t stride = (size_t)gridDim.x * blockDim.x;
    for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < n; i += stride) {
        c[i] = a[i] + b[i];
    }
}

/**
 * Demo entry point with C linkage: adds two fixed 5-element vectors on the
 * GPU, prints the result to stdout, then resets the device.
 *
 * Failures are reported on stderr; the function returns nothing.
 */
extern "C"
void run()
{
    const int arraySize = 5;
    const int a[arraySize] = { 1, 2, 3, 4, 5 };
    const int b[arraySize] = { 10, 20, 30, 40, 50 };
    int c[arraySize] = { 0 };

    // Add vectors in parallel.
    cudaError_t cudaStatus = addWithCuda(c, a, b, arraySize);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!");
        return;
    }

    printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",
           c[0], c[1], c[2], c[3], c[4]);

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return;
    }
}

/**
 * Helper that adds two host vectors on the GPU: c[i] = a[i] + b[i].
 *
 * Selects device 0, allocates three device buffers, copies the inputs over,
 * launches addKernel, waits for it, and copies the result back. The device
 * buffers are always released before returning.
 *
 * @param c     host output buffer, at least `size` ints
 * @param a     host input buffer, at least `size` ints
 * @param b     host input buffer, at least `size` ints
 * @param size  number of elements; 0 is a no-op that returns cudaSuccess
 * @return cudaSuccess on success, cudaErrorInvalidValue when a host pointer is
 *         null for a non-empty request, otherwise the status of the first
 *         CUDA call that failed.
 */
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size)
{
    // Every local is declared here because `goto Error` must not jump over
    // an initialization.
    int *dev_a = 0;
    int *dev_b = 0;
    int *dev_c = 0;
    const size_t bytes = (size_t)size * sizeof(int);
    const unsigned int threadsPerBlock = 256;
    // Ceil-div written so that it cannot overflow for size close to UINT_MAX.
    const unsigned int blocks =
        size / threadsPerBlock + ((size % threadsPerBlock) != 0 ? 1u : 0u);
    cudaError_t cudaStatus;

    // Nothing to add: succeed without touching the device (a launch with
    // zero threads would be rejected as an invalid configuration).
    if (size == 0) {
        return cudaSuccess;
    }

    // Reject null host buffers before any device allocation is made.
    if (c == NULL || a == NULL || b == NULL) {
        fprintf(stderr, "addWithCuda: null host pointer passed for a non-empty vector\n");
        return cudaErrorInvalidValue;
    }

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers for three vectors (two input, one output).
    cudaStatus = cudaMalloc((void**)&dev_c, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_a, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_b, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy input vectors from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    cudaStatus = cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // Launch enough fixed-size blocks to cover every element; the kernel's
    // own bound check handles the partially filled last block.
    addKernel<<<blocks, threadsPerBlock>>>(dev_c, dev_a, dev_b, size);

    // Check for any errors launching the kernel.
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered while it was running.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        goto Error;
    }

    // Copy output vector from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    // Reached on success as well as on failure; pointers that were never
    // allocated are still 0 when passed to cudaFree here.
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);

    return cudaStatus;
}
# qmake project file for the console target `test`.
# main.cpp is listed in SOURCES; kernel.cu is listed in CUDA_SOURCES and is
# compiled by nvcc through a custom entry in QMAKE_EXTRA_COMPILERS.
# The flags and library names below are Windows/MSVC oriented
# (nvcc.exe, /MD, /MDd, kernel32, ...).

CONFIG += console

TARGET = test

# Define output directories
DESTDIR = ../bin
# NOTE(review): "OBJECTS_DIR" here is a literal path component, not an expansion
# of the qmake variable (that would be written $$OBJECTS_DIR) - confirm intended.
CUDA_OBJECTS_DIR = OBJECTS_DIR/../cuda

# This makes the .cu files appear in your project
CUDA_SOURCES += \
    kernel.cu

# MSVCRT link option (static or dynamic, it must be the same with your Qt SDK link option)
# These values are forwarded to the host compiler via -Xcompiler in the nvcc commands below.
MSVCRT_LINK_FLAG_DEBUG = "/MDd"
MSVCRT_LINK_FLAG_RELEASE = "/MD"

# CUDA settings
CUDA_DIR = $$(CUDA_PATH) # Path to cuda toolkit install (read from the CUDA_PATH environment variable)
SYSTEM_NAME = x64 # Depending on your system either 'Win32', 'x64', or 'Win64'
SYSTEM_TYPE = 64 # '32' or '64', depending on your system
CUDA_ARCH = sm_50 # Type of CUDA architecture
NVCC_OPTIONS = --use_fast_math

# include paths
INCLUDEPATH += $$CUDA_DIR/include \
               $$CUDA_DIR/common/inc \
               $$CUDA_DIR/../shared/inc

# library directories
QMAKE_LIBDIR += $$CUDA_DIR/lib/$$SYSTEM_NAME \
                $$CUDA_DIR/common/lib/$$SYSTEM_NAME \
                $$CUDA_DIR/../shared/lib/$$SYSTEM_NAME

# The following makes sure all path names (which often include spaces) are put between quotation marks
# Result: every INCLUDEPATH entry becomes a separate -I"<path>" argument for nvcc.
CUDA_INC = $$join(INCLUDEPATH,'" -I"','-I"','"')

# Add the necessary libraries
# NOTE(review): the last entry line ends with a continuation backslash and is
# followed only by a commented-out entry - confirm qmake ends the list there.
CUDA_LIB_NAMES = cudart_static kernel32 user32 gdi32 winspool comdlg32 \
                 advapi32 shell32 ole32 oleaut32 uuid odbc32 odbccp32 \
                 #freeglut glew32

# Turn every library name into a -l<name> flag and append the flags to LIBS.
for(lib, CUDA_LIB_NAMES) {
    CUDA_LIBS += -l$$lib
}
LIBS += $$CUDA_LIBS

# Configuration of the Cuda compiler
# Each branch registers one extra compiler: its input is CUDA_SOURCES, its output
# is <base name>_cuda.obj under CUDA_OBJECTS_DIR, and its command is an nvcc call.
# NOTE(review): $$CUDA_DIR/bin/nvcc.exe is not quoted, unlike the include paths
# in CUDA_INC - verify the command when CUDA_PATH contains spaces.
# NOTE(review): $$LIBS is passed to a compile-only (--compile / -c) command -
# presumably harmless, confirm.
CONFIG(debug, debug|release) {
    # Debug mode
    # Differs from release by: -D_DEBUG, -g, /Od and /RTC1 instead of /O2, and /MDd instead of /MD.
    cuda_d.input = CUDA_SOURCES
    cuda_d.output = $$CUDA_OBJECTS_DIR/${QMAKE_FILE_BASE}_cuda.obj
    cuda_d.commands = $$CUDA_DIR/bin/nvcc.exe -D_DEBUG $$NVCC_OPTIONS $$CUDA_INC $$LIBS \
                      --machine $$SYSTEM_TYPE -arch=$$CUDA_ARCH \
                      --compile -cudart static -g -DWIN32 -D_MBCS \
                      -Xcompiler "/wd4819,/EHsc,/W3,/nologo,/Od,/Zi,/RTC1" \
                      -Xcompiler $$MSVCRT_LINK_FLAG_DEBUG \
                      -c -o ${QMAKE_FILE_OUT} ${QMAKE_FILE_NAME}
    # NOTE(review): presumably selects C-style #include dependency scanning - confirm against the qmake manual.
    cuda_d.dependency_type = TYPE_C
    QMAKE_EXTRA_COMPILERS += cuda_d
}
else {
    # Release mode
    cuda.input = CUDA_SOURCES
    cuda.output = $$CUDA_OBJECTS_DIR/${QMAKE_FILE_BASE}_cuda.obj
    cuda.commands = $$CUDA_DIR/bin/nvcc.exe $$NVCC_OPTIONS $$CUDA_INC $$LIBS \
                    --machine $$SYSTEM_TYPE -arch=$$CUDA_ARCH \
                    --compile -cudart static -DWIN32 -D_MBCS \
                    -Xcompiler "/wd4819,/EHsc,/W3,/nologo,/O2,/Zi" \
                    -Xcompiler $$MSVCRT_LINK_FLAG_RELEASE \
                    -c -o ${QMAKE_FILE_OUT} ${QMAKE_FILE_NAME}
    # NOTE(review): presumably selects C-style #include dependency scanning - confirm against the qmake manual.
    cuda.dependency_type = TYPE_C
    QMAKE_EXTRA_COMPILERS += cuda
}

# Host-side C++ sources built by the regular qmake toolchain.
SOURCES += \
    main.cpp
