一个简单的案例:
header.h
// Host-side entry point for GPU integer vector addition.
//
// a, b : host input arrays; they are copied to the device before the kernel runs.
// c    : host output array; the device result buffer is copied back into it.
// size : number of int elements in each array (used to size the device
//        allocations and the host<->device copies).
void addKernel(const int* a, const int* b, int* c, int size);
test.cu
#include <stdio.h>

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "header.h"

// Element-wise addition kernel: c[i] = a[i] + b[i] for every i in [0, size).
//
// Launch layout: 1D grid of 1D blocks, one thread per element. The grid may
// hold more threads than there are elements; the surplus threads do nothing.
// a, b and c must point to device memory holding at least `size` ints.
__global__ void add(const int* a, const int* b, int* c, int size)
{
    // 64-bit index so that blockIdx.x * blockDim.x cannot wrap on large grids.
    const long long tid =
        static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;

    // Tail guard: the last block is usually only partially filled.
    if (tid < size) {
        c[tid] = b[tid] + a[tid];
    }
}

// Reports a failed CUDA runtime call on stderr; returns true on success.
static bool cudaOk(cudaError_t status, const char* what)
{
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Adds the host arrays `a` and `b` on GPU device 0 and stores the sums in the
// host array `c`.
//
// a, b : host input arrays of `size` ints.
// c    : host output array of `size` ints; overwritten with a[i] + b[i].
// size : number of elements; values <= 0 make the call a no-op.
//
// Any CUDA failure is reported on stderr; in that case `c` does not receive a
// valid result. Device buffers are released on every path.
void addKernel(const int* a, const int* b, int* c, int size)
{
    if (size <= 0) {
        return;  // Nothing to compute; also avoids a zero-block launch.
    }

    const size_t bytes = static_cast<size_t>(size) * sizeof(int);
    const int threadsPerBlock = 128;
    // Ceil-division written so it cannot overflow for size close to INT_MAX.
    const int blocks = (size - 1) / threadsPerBlock + 1;

    int* dev_a = nullptr;
    int* dev_b = nullptr;
    int* dev_c = nullptr;

    // Short-circuit chain: the first failing step stops the remaining ones.
    // dev_c is output-only, so it is not seeded from the host.
    const bool ready =
        cudaOk(cudaSetDevice(0), "cudaSetDevice") &&
        cudaOk(cudaMalloc((void**)&dev_a, bytes), "cudaMalloc(dev_a)") &&
        cudaOk(cudaMalloc((void**)&dev_b, bytes), "cudaMalloc(dev_b)") &&
        cudaOk(cudaMalloc((void**)&dev_c, bytes), "cudaMalloc(dev_c)") &&
        cudaOk(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice),
               "cudaMemcpy(a -> dev_a)") &&
        cudaOk(cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice),
               "cudaMemcpy(b -> dev_b)");

    if (ready) {
        add<<<blocks, threadsPerBlock>>>(dev_a, dev_b, dev_c, size);

        // Launch-configuration errors only show up through cudaGetLastError();
        // the blocking device-to-host copy then surfaces execution errors.
        if (cudaOk(cudaGetLastError(), "add kernel launch")) {
            cudaOk(cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost),
                   "cudaMemcpy(dev_c -> c)");
        }
    }

    // cudaFree accepts a null pointer, so partial allocations are handled too.
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);
}
test.cpp
#include <stdio.h> #include <stdlib.h> #include "header.h" #define N 256 int main(int argc, char** argv) { int a[N]; int b[N]; int c[N]; for (int i = 0; i < N; i++) { a[i] = i; b[i] = 2 * i; c[i] = 0; } addKernel(a, b, c, N); for (int i = 0; i < N; i++) { printf("%d is %d.\n", i, c[i]); } return 0; }
标签:const,int,编程,dev,案例,cuda,sizeof,include,size From: https://www.cnblogs.com/xiaochouk/p/17905208.html