# GPU vs. CPU Vector Addition Timing Test
## Test Environment

System: Ubuntu 18.04 on WSL
## Approach

Add two one-dimensional vectors on both the CPU and the GPU: the CPU adds the elements one at a time in a serial loop, while the GPU adds corresponding elements in parallel, one thread per element.
## Results
| Vector length (a, b) | 262144 \((512 \times 512)\) | 1048576 \((1024 \times 1024)\) | 4194304 \((2048 \times 2048)\) |
| --- | --- | --- | --- |
| Host-to-device copy time | 0.000754 sec | 0.001460 sec | 0.004343 sec |
| GPU compute time | 0.000021 sec | 0.000026 sec | 0.000028 sec |
| Device-to-host copy time | 0.000263 sec | 0.000954 sec | 0.002438 sec |
| CPU compute time | 0.000372 sec | 0.001884 sec | 0.006497 sec |
Analysis:

- Comparing the compute times, the measured GPU time barely increases as the vector length grows, while the CPU time grows roughly in proportion to it (see the timing note after this list for a caveat about the GPU measurement).
- As the vector length grows, the copy times between host memory and device memory increase in both directions.
- Adding the two copy times to the GPU compute time (the first three rows of the table), the GPU path never actually beats the CPU for this vector addition; even in the 2048 × 2048 case it comes to roughly 0.004343 + 0.000028 + 0.002438 ≈ 0.0068 sec versus 0.006497 sec on the CPU. A likely factor is that vector addition performs only one arithmetic operation per element, so the host-device transfers dominate the total time; this still needs further investigation.
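A note on the "GPU compute time" row: a CUDA kernel launch is asynchronous, so the `cpuSecond()` pair in `sum.cu` below stops the clock without waiting for the kernel to finish; the number is therefore mostly launch overhead (the kernel actually completes during the subsequent blocking `cudaMemcpy`). Below is a minimal sketch of a synchronized measurement, reusing the variables from `sum.cu`; it is an illustrative variant, not the code that produced the table above.

```cpp
// Variant of the kernel timing in sum.cu: wait for the kernel to finish
// before stopping the host-side timer.
iStart = cpuSecond();
sumArraysGPU<<<grid, block>>>(a_d, b_d, res_d, nElem);
CHECK(cudaDeviceSynchronize()); // block the host until the kernel has completed
iElaps = cpuSecond() - iStart;
printf("GPU compute time (synchronized)\t%f sec\n", iElaps);
```

CUDA events (`cudaEventRecord` / `cudaEventElapsedTime`) would give a device-side measurement as well; the explicit synchronization above is simply the smallest change to the existing timing code.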
## Code
### sum.cu

```cpp
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "freshman.h"

// CPU addition: add the vectors element by element, serially
void sumArrays(float *a, float *b, float *res, const int size)
{
    for (int i = 0; i < size; i += 1)
    {
        res[i] = a[i] + b[i];
    }
}

// GPU addition: one thread adds one pair of elements
__global__ void sumArraysGPU(float *a, float *b, float *res, int N)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N)
        res[i] = a[i] + b[i];
}

int main(int argc, char **argv)
{
    // set up device
    initDevice(0);

    int nElem = 512 * 512;
    // int nElem = 1024 * 1024;
    // int nElem = 2048 * 2048;
    printf("Vector size:%d\n", nElem);

    // allocate host memory
    int nByte = sizeof(float) * nElem;
    float *a_h = (float *)malloc(nByte);
    float *b_h = (float *)malloc(nByte);
    float *res_h = (float *)malloc(nByte);
    float *res_from_gpu_h = (float *)malloc(nByte);
    memset(res_h, 0, nByte);
    memset(res_from_gpu_h, 0, nByte);

    // fill the host arrays with random data
    initialData(a_h, nElem);
    initialData(b_h, nElem);

    // allocate device memory
    float *a_d, *b_d, *res_d;
    CHECK(cudaMalloc((float **)&a_d, nByte));
    CHECK(cudaMalloc((float **)&b_d, nByte));
    CHECK(cudaMalloc((float **)&res_d, nByte));

    // copy the input data from host to device
    double iStart, iElaps;
    iStart = cpuSecond();
    CHECK(cudaMemcpy(a_d, a_h, nByte, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(b_d, b_h, nByte, cudaMemcpyHostToDevice));
    iElaps = cpuSecond() - iStart;
    printf("Host to device copy time\t%f sec\n", iElaps);

    // one thread per element; round the grid size up to cover all elements
    dim3 block(512);
    dim3 grid((nElem - 1) / block.x + 1);

    // GPU addition (note: the launch is asynchronous, so this timer mainly
    // captures launch overhead unless a synchronization is added)
    iStart = cpuSecond();
    sumArraysGPU<<<grid, block>>>(a_d, b_d, res_d, nElem);
    iElaps = cpuSecond() - iStart;
    printf("GPU compute time\t\t%f sec\n", iElaps);

    // copy the result from device back to host
    iStart = cpuSecond();
    CHECK(cudaMemcpy(res_from_gpu_h, res_d, nByte, cudaMemcpyDeviceToHost));
    iElaps = cpuSecond() - iStart;
    printf("Device to host copy time\t%f sec\n", iElaps);

    // CPU addition
    iStart = cpuSecond();
    sumArrays(a_h, b_h, res_h, nElem);
    iElaps = cpuSecond() - iStart;
    printf("CPU compute time\t\t%f sec\n", iElaps);

    // verify that the CPU and GPU results match
    checkResult(res_h, res_from_gpu_h, nElem);

    cudaFree(a_d);
    cudaFree(b_d);
    cudaFree(res_d);

    free(a_h);
    free(b_h);
    free(res_h);
    free(res_from_gpu_h);
    return 0;
}
```
### freshman.h

```cpp
#ifndef FRESHMAN_H
#define FRESHMAN_H

#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#ifdef _WIN32
#  include <windows.h>
#else
#  include <sys/time.h>
#endif

// Abort on any CUDA runtime error and report where it happened
#define CHECK(call)\
{\
    const cudaError_t error = call;\
    if (error != cudaSuccess)\
    {\
        printf("ERROR: %s:%d,", __FILE__, __LINE__);\
        printf("code:%d,reason:%s\n", error, cudaGetErrorString(error));\
        exit(1);\
    }\
}

#ifdef _WIN32
// Minimal gettimeofday() shim for Windows
int gettimeofday(struct timeval *tp, void *tzp)
{
    time_t clock;
    struct tm tm;
    SYSTEMTIME wtm;
    GetLocalTime(&wtm);
    tm.tm_year = wtm.wYear - 1900;
    tm.tm_mon = wtm.wMonth - 1;
    tm.tm_mday = wtm.wDay;
    tm.tm_hour = wtm.wHour;
    tm.tm_min = wtm.wMinute;
    tm.tm_sec = wtm.wSecond;
    tm.tm_isdst = -1;
    clock = mktime(&tm);
    tp->tv_sec = clock;
    tp->tv_usec = wtm.wMilliseconds * 1000;
    return 0;
}
#endif

// Wall-clock time in seconds
double cpuSecond()
{
    struct timeval tp;
    gettimeofday(&tp, NULL);
    return ((double)tp.tv_sec + (double)tp.tv_usec * 1e-6);
}

// Fill a float array with random values in [0, 65.535]
void initialData(float *ip, int size)
{
    time_t t;
    srand((unsigned)time(&t));
    for (int i = 0; i < size; i++)
    {
        ip[i] = (float)(rand() & 0xffff) / 1000.0f;
    }
}

// Fill an int array with random values in [0, 255]
void initialData_int(int *ip, int size)
{
    time_t t;
    srand((unsigned)time(&t));
    for (int i = 0; i < size; i++)
    {
        ip[i] = int(rand() & 0xff);
    }
}

// Print an ny x nx matrix stored in row-major order
void printMatrix(float *C, const int nx, const int ny)
{
    float *ic = C;
    printf("Matrix<%d,%d>:\n", ny, nx);
    for (int i = 0; i < ny; i++)
    {
        for (int j = 0; j < nx; j++)
        {
            printf("%6f ", ic[j]);
        }
        ic += nx;
        printf("\n");
    }
}

// Select the given CUDA device and print its name
void initDevice(int devNum)
{
    int dev = devNum;
    cudaDeviceProp deviceProp;
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    printf("Using device %d: %s\n", dev, deviceProp.name);
    CHECK(cudaSetDevice(dev));
}

// Compare the CPU and GPU results element by element
void checkResult(float *hostRef, float *gpuRef, const int N)
{
    double epsilon = 1.0E-8;
    for (int i = 0; i < N; i++)
    {
        if (fabs(hostRef[i] - gpuRef[i]) > epsilon)
        {
            printf("Results don't match!\n");
            printf("%f(hostRef[%d]) != %f(gpuRef[%d])\n", hostRef[i], i, gpuRef[i], i);
            return;
        }
    }
    printf("Check result success!\n");
}
#endif // FRESHMAN_H
```
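To build and run under WSL (assuming the CUDA toolkit is installed and `freshman.h` sits in the same directory as `sum.cu`; the command is a standard `nvcc` invocation, not taken from the original post): `nvcc sum.cu -o sum && ./sum`.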