# GPU vs. CPU Vector Addition Timing Test
## Test Environment

System: Ubuntu 18.04 on WSL
## Approach

Add two one-dimensional vectors on both the CPU and the GPU: the CPU adds the elements one at a time in a serial loop, while the GPU adds corresponding elements in parallel, one thread per element.
## Results
| Vector length (a, b) | 262144 \((512 \times 512)\) | 1048576 \((1024 \times 1024)\) | 4194304 \((2048 \times 2048)\) |
| --- | --- | --- | --- |
| Host-to-device copy time | 0.000754 sec | 0.001460 sec | 0.004343 sec |
| GPU compute time | 0.000021 sec | 0.000026 sec | 0.000028 sec |
| Device-to-host copy time | 0.000263 sec | 0.000954 sec | 0.002438 sec |
| CPU compute time | 0.000372 sec | 0.001884 sec | 0.006497 sec |
Analysis:

- Comparing the compute times, the measured GPU time barely increases as the vector length grows, while the CPU time grows roughly in proportion to it (see the timing note after this list for a caveat about the GPU measurement).
- As the vector length grows, the copy times between host memory and device memory increase in both directions.
- Adding the two copy times to the GPU compute time (the first three rows of the table), the GPU path never actually beats the CPU for this vector addition; even in the 2048 × 2048 case it comes to roughly 0.004343 + 0.000028 + 0.002438 ≈ 0.0068 sec versus 0.006497 sec on the CPU. A likely factor is that vector addition performs only one arithmetic operation per element, so the host-device transfers dominate the total time; this still needs further investigation.
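A note on the "GPU compute time" row: a CUDA kernel launch is asynchronous, so the `cpuSecond()` pair in `sum.cu` below stops the clock without waiting for the kernel to finish; the number is therefore mostly launch overhead (the kernel actually completes during the subsequent blocking `cudaMemcpy`). Below is a minimal sketch of a synchronized measurement, reusing the variables from `sum.cu`; it is an illustrative variant, not the code that produced the table above.

```cpp
// Variant of the kernel timing in sum.cu: wait for the kernel to finish
// before stopping the host-side timer.
iStart = cpuSecond();
sumArraysGPU<<<grid, block>>>(a_d, b_d, res_d, nElem);
CHECK(cudaDeviceSynchronize()); // block the host until the kernel has completed
iElaps = cpuSecond() - iStart;
printf("GPU compute time (synchronized)\t%f sec\n", iElaps);
```

CUDA events (`cudaEventRecord` / `cudaEventElapsedTime`) would give a device-side measurement as well; the explicit synchronization above is simply the smallest change to the existing timing code.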
## Code
### sum.cu

```cpp
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "freshman.h"

// CPU addition: add the vectors element by element, serially
void sumArrays(float *a, float *b, float *res, const int size)
{
    for (int i = 0; i < size; i += 1)
    {
        res[i] = a[i] + b[i];
    }
}

// GPU addition: one thread adds one pair of elements
__global__ void sumArraysGPU(float *a, float *b, float *res, int N)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N)
        res[i] = a[i] + b[i];
}

int main(int argc, char **argv)
{
    // set up device
    initDevice(0);

    int nElem = 512 * 512;
    // int nElem = 1024 * 1024;
    // int nElem = 2048 * 2048;
    printf("Vector size:%d\n", nElem);

    // allocate host memory
    int nByte = sizeof(float) * nElem;
    float *a_h = (float *)malloc(nByte);
    float *b_h = (float *)malloc(nByte);
    float *res_h = (float *)malloc(nByte);
    float *res_from_gpu_h = (float *)malloc(nByte);
    memset(res_h, 0, nByte);
    memset(res_from_gpu_h, 0, nByte);

    // fill the host arrays with random data
    initialData(a_h, nElem);
    initialData(b_h, nElem);

    // allocate device memory
    float *a_d, *b_d, *res_d;
    CHECK(cudaMalloc((float **)&a_d, nByte));
    CHECK(cudaMalloc((float **)&b_d, nByte));
    CHECK(cudaMalloc((float **)&res_d, nByte));

    // copy the input data from host to device
    double iStart, iElaps;
    iStart = cpuSecond();
    CHECK(cudaMemcpy(a_d, a_h, nByte, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(b_d, b_h, nByte, cudaMemcpyHostToDevice));
    iElaps = cpuSecond() - iStart;
    printf("Host to device copy time\t%f sec\n", iElaps);

    // one thread per element; round the grid size up to cover all elements
    dim3 block(512);
    dim3 grid((nElem - 1) / block.x + 1);

    // GPU addition (note: the launch is asynchronous, so this timer mainly
    // captures launch overhead unless a synchronization is added)
    iStart = cpuSecond();
    sumArraysGPU<<<grid, block>>>(a_d, b_d, res_d, nElem);
    iElaps = cpuSecond() - iStart;
    printf("GPU compute time\t\t%f sec\n", iElaps);

    // copy the result from device back to host
    iStart = cpuSecond();
    CHECK(cudaMemcpy(res_from_gpu_h, res_d, nByte, cudaMemcpyDeviceToHost));
    iElaps = cpuSecond() - iStart;
    printf("Device to host copy time\t%f sec\n", iElaps);

    // CPU addition
    iStart = cpuSecond();
    sumArrays(a_h, b_h, res_h, nElem);
    iElaps = cpuSecond() - iStart;
    printf("CPU compute time\t\t%f sec\n", iElaps);

    // verify that the CPU and GPU results match
    checkResult(res_h, res_from_gpu_h, nElem);

    cudaFree(a_d);
    cudaFree(b_d);
    cudaFree(res_d);

    free(a_h);
    free(b_h);
    free(res_h);
    free(res_from_gpu_h);
    return 0;
}
```
### freshman.h

```cpp
#ifndef FRESHMAN_H
#define FRESHMAN_H

#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#ifdef _WIN32
#  include <windows.h>
#else
#  include <sys/time.h>
#endif

// Abort on any CUDA runtime error and report where it happened
#define CHECK(call)\
{\
    const cudaError_t error = call;\
    if (error != cudaSuccess)\
    {\
        printf("ERROR: %s:%d,", __FILE__, __LINE__);\
        printf("code:%d,reason:%s\n", error, cudaGetErrorString(error));\
        exit(1);\
    }\
}

#ifdef _WIN32
// Minimal gettimeofday() shim for Windows
int gettimeofday(struct timeval *tp, void *tzp)
{
    time_t clock;
    struct tm tm;
    SYSTEMTIME wtm;
    GetLocalTime(&wtm);
    tm.tm_year = wtm.wYear - 1900;
    tm.tm_mon = wtm.wMonth - 1;
    tm.tm_mday = wtm.wDay;
    tm.tm_hour = wtm.wHour;
    tm.tm_min = wtm.wMinute;
    tm.tm_sec = wtm.wSecond;
    tm.tm_isdst = -1;
    clock = mktime(&tm);
    tp->tv_sec = clock;
    tp->tv_usec = wtm.wMilliseconds * 1000;
    return 0;
}
#endif

// Wall-clock time in seconds
double cpuSecond()
{
    struct timeval tp;
    gettimeofday(&tp, NULL);
    return ((double)tp.tv_sec + (double)tp.tv_usec * 1e-6);
}

// Fill a float array with random values in [0, 65.535]
void initialData(float *ip, int size)
{
    time_t t;
    srand((unsigned)time(&t));
    for (int i = 0; i < size; i++)
    {
        ip[i] = (float)(rand() & 0xffff) / 1000.0f;
    }
}

// Fill an int array with random values in [0, 255]
void initialData_int(int *ip, int size)
{
    time_t t;
    srand((unsigned)time(&t));
    for (int i = 0; i < size; i++)
    {
        ip[i] = int(rand() & 0xff);
    }
}

// Print an ny x nx matrix stored in row-major order
void printMatrix(float *C, const int nx, const int ny)
{
    float *ic = C;
    printf("Matrix<%d,%d>:\n", ny, nx);
    for (int i = 0; i < ny; i++)
    {
        for (int j = 0; j < nx; j++)
        {
            printf("%6f ", ic[j]);
        }
        ic += nx;
        printf("\n");
    }
}

// Select the given CUDA device and print its name
void initDevice(int devNum)
{
    int dev = devNum;
    cudaDeviceProp deviceProp;
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    printf("Using device %d: %s\n", dev, deviceProp.name);
    CHECK(cudaSetDevice(dev));
}

// Compare the CPU and GPU results element by element
void checkResult(float *hostRef, float *gpuRef, const int N)
{
    double epsilon = 1.0E-8;
    for (int i = 0; i < N; i++)
    {
        if (fabs(hostRef[i] - gpuRef[i]) > epsilon)
        {
            printf("Results don't match!\n");
            printf("%f(hostRef[%d]) != %f(gpuRef[%d])\n", hostRef[i], i, gpuRef[i], i);
            return;
        }
    }
    printf("Check result success!\n");
}
#endif // FRESHMAN_H
```
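To build and run under WSL (assuming the CUDA toolkit is installed and `freshman.h` sits in the same directory as `sum.cu`; the command is a standard `nvcc` invocation, not taken from the original post): `nvcc sum.cu -o sum && ./sum`.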