#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <math.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
// Element type used by every vector in this benchmark.
typedef float FLOAT;
// Returns the current time in seconds as a double (platform-specific implementation below).
double get_time();
// Host-side GPU warm-up routine.
void warm_up();
// CPU (host) version: z[i] = x[i] + y[i] + z[i] for i in [0, N).
void vec_add_host(FLOAT* x, FLOAT* y, FLOAT* z, int N); // CPU side
// GPU (device) version of the same element-wise update, one element per thread.
__global__ void vec_add_device(FLOAT* x, FLOAT* y, FLOAT* z, int N); // GPU side
// Index helpers for a 2D grid of 1D blocks.
// get_tid(): flat global thread index = flat block index * blockDim.x + threadIdx.x.
#define get_tid() ((blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x)
// get_bid(): flat block index, counting blocks row by row across the grid (x fastest).
#define get_bid() (blockIdx.y * gridDim.x + blockIdx.x)
// Platform switch: 1 selects the Win32 performance-counter timer,
// 0 selects the POSIX gettimeofday() timer.
#define WINDOWS 0
#if WINDOWS
#include <windows.h>
// Returns the current value of the Win32 high-resolution performance counter,
// converted to seconds. Only differences between two calls are meaningful.
double get_time()
{
    LARGE_INTEGER timer;
    static LARGE_INTEGER fre;
    static int init = 0;
    double t;
    // The counter frequency does not change while the system is running,
    // so it is queried once and cached.
    if (init != 1)
    {
        QueryPerformanceFrequency(&fre);
        init = 1;
    }
    // Read the current tick count. The previous code called
    // QueryPerformanceFrequency() here as well, which made every call return
    // the same value (frequency / frequency) instead of the elapsed time.
    QueryPerformanceCounter(&timer);
    t = timer.QuadPart * 1. / fre.QuadPart;
    return t;
}
#else
#include <sys/time.h>
#include <time.h>
// Returns the wall-clock time reported by gettimeofday() in seconds,
// with the microsecond part folded into the fractional digits.
double get_time()
{
    struct timeval tv;
    double t;
    gettimeofday(&tv, (struct timezone*)0);
    t = tv.tv_sec + (double)tv.tv_usec * 1e-6;
    return t;
}
#endif
// GPU warm-up kernel.
// Takes no arguments and writes no memory: each thread just adds two local
// integers and discards the result, so the launch itself is the only effect.
__global__ void warmup_knl(void)
{
    int lhs = 1;
    int rhs = 1;
    lhs += rhs;
}
// Host-side GPU warm-up.
// Launches the empty warm-up kernel a few times so that one-time start-up cost
// is paid before any timed section. The launches are asynchronous and this
// function does not wait for them; callers synchronize before timing.
void warm_up()
{
    // Number of throw-away launches. The previous loop bound was 0, so the
    // kernel was never launched and no warm-up took place.
    const int warmup_launches = 8;
    for (int i = 0; i < warmup_launches; ++i)
    {
        warmup_knl<<<1, 256>>>();
    }
    // Kernel launches do not return a status; report a bad launch explicitly.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "warm_up: kernel launch failed: %s\n", cudaGetErrorString(err));
    }
}
// CPU (host) reference implementation.
// Accumulates the element-wise sum of x and y into z:
//   z[i] = x[i] + y[i] + z[i]   for every i in [0, N).
// x, y and z must each point to at least N elements; N <= 0 does nothing.
void vec_add_host(FLOAT* x, FLOAT* y, FLOAT* z, int N)
{
    // The loop declares its own index; the extra function-scope `int i` that
    // it shadowed was never used and has been removed.
    for (int i = 0; i < N; ++i)
        z[i] = x[i] + y[i] + z[i];
}
// GPU (device) kernel: each thread updates at most one element,
//   z[g] = x[g] + y[g] + z[g],
// where g is the flat thread index produced by get_tid() (2D grid of 1D blocks).
// Threads whose index falls at or beyond N do nothing, so the grid may be
// larger than the data.
__global__ void vec_add_device(FLOAT* x, FLOAT* y, FLOAT* z, int N)
{
    const int gid = get_tid();
    if (gid >= N)
        return;
    z[gid] = x[gid] + y[gid] + z[gid];
}
// Reports a failed CUDA runtime call on stderr.
// Returns 1 when err is cudaSuccess, 0 otherwise; `what` names the failing step.
static int cuda_ok(cudaError_t err, const char* what)
{
    if (err == cudaSuccess)
        return 1;
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
    return 0;
}

// Benchmark driver: runs the vector update `iter` times on the GPU and on the
// CPU, then prints both wall-clock times and their ratio.
// Returns 0 on success and -1 if an allocation or any CUDA call fails.
int main()
{
    const int N = 20000000;
    // Computed in size_t so the byte count cannot overflow int for larger N.
    const size_t nbytes = (size_t)N * sizeof(FLOAT);
    // 2D grid of 1D blocks
    const int bs = 256; // threads per block
    // Side of a square grid chosen so that s * s blocks of bs threads cover all N elements.
    const int s = (int)ceil(sqrt((N + bs - 1.) / bs));
    dim3 grid = dim3(s, s);
    FLOAT* dx = NULL, *hx = NULL;
    FLOAT* dy = NULL, *hy = NULL;
    FLOAT* dz = NULL, *hz = NULL;
    const int iter = 30;
    int i;
    double th, td;
    int rc = -1; // becomes 0 only after the whole benchmark has completed

    warm_up();

    // Allocate device (GPU) memory
    if (!cuda_ok(cudaMalloc((void**)&dx, nbytes), "cudaMalloc(dx)") ||
        !cuda_ok(cudaMalloc((void**)&dy, nbytes), "cudaMalloc(dy)") ||
        !cuda_ok(cudaMalloc((void**)&dz, nbytes), "cudaMalloc(dz)"))
    {
        printf("Couldn't allocate GPU Memory");
        goto cleanup;
    }

    // Allocate host (CPU) memory
    hx = (FLOAT*)malloc(nbytes);
    hy = (FLOAT*)malloc(nbytes);
    hz = (FLOAT*)malloc(nbytes);
    if (hx == NULL || hy == NULL || hz == NULL)
    {
        printf("Couldn't allocate CPU Memory");
        // Previously execution continued and dereferenced the NULL pointers below.
        goto cleanup;
    }

    // Initialise the host vectors
    for (i = 0; i < N; ++i)
    {
        hx[i] = 1;
        hy[i] = 1;
        hz[i] = 1;
    }

    // Copy the inputs from host to device
    if (!cuda_ok(cudaMemcpy(dx, hx, nbytes, cudaMemcpyHostToDevice), "cudaMemcpy(dx)") ||
        !cuda_ok(cudaMemcpy(dy, hy, nbytes, cudaMemcpyHostToDevice), "cudaMemcpy(dy)") ||
        !cuda_ok(cudaMemcpy(dz, hz, nbytes, cudaMemcpyHostToDevice), "cudaMemcpy(dz)"))
    {
        goto cleanup;
    }

    warm_up();
    // Wait for all pending GPU work so none of it is billed to the timed section.
    // (cudaDeviceSynchronize replaces the deprecated cudaThreadSynchronize.)
    if (!cuda_ok(cudaDeviceSynchronize(), "cudaDeviceSynchronize (before timing)"))
        goto cleanup;

    // Time the GPU version
    td = get_time();
    for (i = 0; i < iter; ++i)
        vec_add_device<<<grid, bs>>>(dx, dy, dz, N);
    // Kernel launches return immediately; without this synchronization the timer
    // would only measure launch overhead, not the kernels' execution time.
    if (!cuda_ok(cudaGetLastError(), "vec_add_device launch") ||
        !cuda_ok(cudaDeviceSynchronize(), "vec_add_device execution"))
    {
        goto cleanup;
    }
    td = get_time() - td;

    // Strictly speaking, the result should also be copied back from the device
    // to the host here, e.g. with
    //   cudaMemcpy(<host buffer>, dz, nbytes, cudaMemcpyDeviceToHost);
    // That transfer is not performed and not included in the GPU time.

    // Time the CPU version
    th = get_time();
    for (i = 0; i < iter; ++i)
        vec_add_host(hx, hy, hz, N);
    th = get_time() - th;

    printf("GPU time: %.4f, CPU time: %.4f. SppedUp: %g \n", td, th, th/td);
    rc = 0;

cleanup:
    // Release resources. free(NULL) and cudaFree(NULL) are both no-ops, so this
    // is safe on every error path. Device buffers are released with cudaFree;
    // the previous code passed the already-freed host pointers to cudaFree and
    // leaked dx, dy and dz.
    free(hx);
    free(hy);
    free(hz);
    cudaFree(dx);
    cudaFree(dy);
    cudaFree(dz);
    return rc;
}
// Sample output recorded by the original author:
//   GPU time: 0.0109, CPU time: 2.6454. SppedUp: 242.811
// Tags: time, int, void, FLOAT, Cuda, addition, NULL, Hello, nbytes
// From: https://www.cnblogs.com/XL2COWARD/p/MyCuda_2.html