1. Objective: perform element-wise addition on 16384*16384 matrices, compare CPU and GPU efficiency, examine how different thread-block sizes affect performance, and run possible optimization tests.
2. Core code
/*
Add two 2D matrices on the GPU and analyze how performance changes with the thread-block size
*/
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include "cuda_runtime_api.h"
#define EP 1e-6
int NX = 1<<14; //number of matrix columns (x dimension)
int NY = 1<<14; //number of matrix rows (y dimension)
int len = NX*NY; //total number of matrix elements
float *hostA = (float*)calloc(len,sizeof(float));
float *hostB = (float*)calloc(len,sizeof(float));
float *hostC = (float*)calloc(len,sizeof(float));
float *hostCFD = (float*)calloc(len,sizeof(float)); //holds the result copied back from the GPU
void init_matrix(float *arr,int len)
{
srand(0);
for(int i=0;i<len;i++)
arr[i] = (rand()%100)/100.0f; //divide by a float so the value is not truncated to 0
}
void serial_add2D(float* A,float*B,float*C,int ny,int nx)
{
for(int i=0;i<ny;i++)
{
for(int j=0;j<nx;j++)
C[i*nx+j] = A[i*nx+j]+B[i*nx+j];
}
}
//the grid of compute threads has the same shape as the matrix; threads along the right and bottom edges may sit idle and cause warp divergence
__global__ void kernel_add2D(float* A,float*B,float*C,int ny,int nx)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
int id = nx * y + x;
if(x<nx && y<ny)
C[id] = A[id] + B[id];
}
//optimization: use fewer registers per thread so that more warps can be resident on an SM
__global__ void kernel_add2D_regist_opt(float* A,float*B,float*C,int ny,int nx)
{
int id = nx*(blockIdx.y * blockDim.y + threadIdx.y) + blockIdx.x * blockDim.x + threadIdx.x;
if((blockIdx.x * blockDim.x + threadIdx.x)<nx && (blockIdx.y * blockDim.y + threadIdx.y)<ny)
C[id] = A[id] + B[id];
}
//optimization: match the 2D storage index with the global thread ID; the grid's global thread ID maps directly to the element's 1D index, reducing the number of threads required and the amount of warp divergence
__global__ void kernel_add2D_reallocate_opt(float* A,float*B,float*C,int ny,int nx)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
int id = y * nx + x;
if(id<nx*ny)
C[id] = A[id] + B[id];
}
//register optimization + storage-index/global-thread-ID matching optimization
__global__ void kernel_add2D_regist_reallocate_opt(float* A,float*B,float*C,int ny,int nx)
{
int id = nx*(blockIdx.y * blockDim.y + threadIdx.y) + blockIdx.x * blockDim.x + threadIdx.x;
if(id<nx*ny)
C[id] = A[id] + B[id];
}
//current wall-clock time in milliseconds
long getTime()
{
struct timeval cur;
gettimeofday(&cur, NULL);
// printf("sec %ld usec %ld,toal ms %ld\n",cur.tv_sec,cur.tv_usec,cur.tv_sec*1e3 + cur.tv_usec / 1e3);
return cur.tv_sec*1e3 + cur.tv_usec / 1e3;
}
//CPU serial computation (timed)
long serial()
{
long start = getTime();
serial_add2D(hostA,hostB,hostC,NY,NX);
long end = getTime();
return end-start;
}
//mode 0: CPU serial; all other modes run on the GPU: 1:kernel_add2D (base) 2:kernel_add2D_regist_opt 3:kernel_add2D_reallocate_opt 4:kernel_add2D_regist_reallocate_opt
bool gpu(int mode,int bx,int by,float * gpuTime)
{
//allocate device global memory
float* AD = NULL;
float* BD = NULL;
float* CD = NULL;
cudaMalloc((void**)&AD,len*sizeof(float));
cudaMalloc((void**)&BD,len*sizeof(float));
cudaMalloc((void**)&CD,len*sizeof(float));
//copy the input matrices to the device
cudaMemcpy(AD,hostA,len*sizeof(float),cudaMemcpyHostToDevice);
cudaMemcpy(BD,hostB,len*sizeof(float),cudaMemcpyHostToDevice);
//set block and grid dimensions
dim3 blockDim;
dim3 gridDim;
//time the kernel with CUDA events (milliseconds)
cudaEvent_t startD, endD;
cudaEventCreate(&startD);
cudaEventCreate(&endD);
cudaEventRecord(startD, 0);
switch(mode)
{
case 1:
blockDim.x = bx;
blockDim.y = by;
gridDim.x = (NX-1)/bx+1;
gridDim.y = (NY-1)/by+1;
kernel_add2D<<<gridDim,blockDim>>>(AD,BD,CD,NY,NX);
break;
case 2:
//compute gridDim;
//the grid of threads has the same shape as the matrix, so threads along the right and bottom edges may be idle and cause warp divergence
blockDim.x = bx;
blockDim.y = by;
gridDim.x = (NX-1)/bx+1;
gridDim.y = (NY-1)/by+1;
kernel_add2D_regist_opt<<<gridDim,blockDim>>>(AD,BD,CD,NY,NX);
break;
case 3:
//map the grid's global thread ID directly to the element's 1D index; recompute gridDim.y accordingly so fewer blocks are requested
blockDim.x = bx;
blockDim.y = by;
gridDim.x = (NX-1)/bx+1;
gridDim.y = (len-1)/(gridDim.x*blockDim.x*blockDim.y)+1;
kernel_add2D_reallocate_opt<<<gridDim,blockDim>>>(AD,BD,CD,NY,NX);
break;
case 4:
blockDim.x = bx;
blockDim.y = by;
gridDim.x = (NX-1)/bx+1;
gridDim.y = (len-1)/(gridDim.x*blockDim.x*blockDim.y)+1;
kernel_add2D_regist_reallocate_opt<<<gridDim,blockDim>>>(AD,BD,CD,NY,NX);
break;
default:
break;
}
cudaEventRecord(endD, 0);
//wait for the stop event to complete
cudaEventSynchronize(endD);
cudaEventElapsedTime(gpuTime, startD, endD);
cudaEventDestroy(startD);
cudaEventDestroy(endD);
cudaMemcpy(hostCFD,CD,len*sizeof(float),cudaMemcpyDeviceToHost);
//compare the GPU result with the CPU reference
for(int i=0;i<len;i++)
{
if(fabs(hostC[i]-hostCFD[i])>EP)
{
printf("hostC[%d] %f != CFromDevice[%d] %f\n",i,hostC[i],i,hostCFD[i]);
cudaFree(AD); cudaFree(BD); cudaFree(CD); //release device memory before the early return
return false;
}
}
//report elapsed time
printf("serial and parallel results match, mode %d, gpu<<<(%d,%d),(%d,%d)>>>, elapsed %f ms\n",mode,gridDim.x,gridDim.y,blockDim.x,blockDim.y,*gpuTime);
cudaFree(AD);
cudaFree(BD);
cudaFree(CD);
return true;
}
int main(int argc ,char** argv)
{
//initialize the input matrices
init_matrix(hostA,len);
init_matrix(hostB,len);
if(argc==2)
{
//CPU serial computation
long cpuTime = serial();
printf("cpu elapsed %ld ms\n",cpuTime);
return 0;
}
else if(argc!=4)
{
printf("parameter 1: mode [1:kernel_add2D 2:kernel_add2D_regist_opt 3:kernel_add2D_reallocate_opt 4:kernel_add2D_regist_reallocate_opt] parameter 2: blockDim.x parameter 3: blockDim.y\n");
return 1;
}
else
{
//compute the CPU reference (also timed) so the GPU output can actually be verified
long cpuTime = serial();
//GPU computation
//mode: 1:kernel_add2D (base) 2:kernel_add2D_regist_opt 3:kernel_add2D_reallocate_opt 4:kernel_add2D_regist_reallocate_opt (mode 0, CPU serial, is handled by the argc==2 path above)
int mode = 0;
float gpuTime = 0.0;
int blockX = 8,blockY = 8;
mode = atoi(argv[1]);
blockX = atoi(argv[2]);
blockY = atoi(argv[3]);
if(gpu(mode,blockX,blockY,&gpuTime))
// printf("mode %d, CPU and GPU results match. CPU %ld ms, GPU %f ms, speedup %f\n",mode,cpuTime,gpuTime,cpuTime/gpuTime);
;
}
free(hostA);
free(hostB);
free(hostC);
free(hostCFD);
return 0;
}
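Note: for brevity, none of the CUDA API calls or kernel launches above check for errors. Below is a minimal error-checking sketch (the CHECK macro is my addition, not part of the original code) that could wrap those calls in the same source file; stdio.h and stdlib.h are already included there.

#define CHECK(call) \
do { \
    cudaError_t err = (call); \
    if (err != cudaSuccess) { \
        fprintf(stderr, "CUDA error %s at %s:%d\n", cudaGetErrorString(err), __FILE__, __LINE__); \
        exit(1); \
    } \
} while (0)

// usage, e.g.: CHECK(cudaMalloc((void**)&AD,len*sizeof(float)));
// after a kernel launch: CHECK(cudaGetLastError());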
3. Test script
#!/bin/bash
# compile
nvcc matrix_add2D.cu -o matrix_add2D
dir=out
# truncate the output file
> "$dir"
echo "start $(date)" >> "$dir"
for((i=0;i<4;i++)); do
yhrun -N1 -n1 -pTH_GPU ./matrix_add2D 0 | tee -a "$dir"
done
# mode
mode=(1 2 3 4)
# blockDim.x
X=(8 16 32)
# blockDim.y
Y=(8 16 32)
# gpu
for m in "${mode[@]}"; do
for x in "${X[@]}"; do
for y in "${Y[@]}"; do
for((i=0;i<4;i++)); do
yhrun -N1 -n1 -pTH_GPU ./matrix_add2D "$m" "$x" "$y" | tee -a "$dir"
done
done
done
done
echo "end $(date)" >> "$dir"
4. Test approach
(1) Cross-test the thread block's X and Y dimensions, each taking the values (8, 16, 32).
(2) The baseline GPU kernel has the following two issues:
First: when computing the globally unique thread ID, the kernel uses three register variables; the extra x and y account for two additional registers per thread, which can reduce the number of thread blocks resident on an SM:
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
int id = nx * y + x;
Second: the grid-allocation scheme below can leave redundant threads along both dimensions of the matrix:
gridDim.x = (NX-1)/bx+1;
gridDim.y = (NY-1)/by+1;
This has two drawbacks:
1: when NX or NY is not an exact multiple of the block dimension, the allocated threads include redundant ones in both dimensions, so hardware utilization drops;
2: the blocks in the rightmost column and the bottom row then contain warp divergence, which stalls those warps and hurts efficiency.
The following optimizations target these issues:
First: reduce the number of registers used per thread, so that more thread blocks can be resident on an SM.
Second: map the grid's global thread ID directly to the matrix's 1D index (all blocks in one grid row are consumed before moving on to the next row), hoping to shrink gridDim.y, lower the number of blocks required, reduce resource usage, and cut the number of divergent warps; the sketch after this list works through the resulting grid sizes for the configurations tested here.
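As a sanity check on the second optimization, here is a standalone, host-only sketch (not part of the original program; the variable names are mine) that reproduces the grid-size arithmetic for every tested configuration. Because 16384 is an exact multiple of 8, 16 and 32, gridDim.x*blockDim.x equals NX, so the gridDim.y computed for the reallocate kernels comes out identical to the baseline (NY-1)/by+1 in this experiment.

#include <stdio.h>

int main(void)
{
    int NX = 1 << 14, NY = 1 << 14, len = NX * NY;
    int dims[3] = {8, 16, 32};
    for (int i = 0; i < 3; i++)
        for (int j = 0; j < 3; j++)
        {
            int bx = dims[i], by = dims[j];
            int gx = (NX - 1) / bx + 1;                      // gridDim.x, same for all kernels
            int gyBase = (NY - 1) / by + 1;                  // baseline kernels
            int gyRealloc = (len - 1) / (gx * bx * by) + 1;  // reallocate kernels
            printf("block(%2d,%2d): base grid (%d,%d), reallocate grid (%d,%d)\n",
                   bx, by, gx, gyBase, gx, gyRealloc);
        }
    return 0;
}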
5. Test results
<<<gridDim,blockDim>>> | CPU serial (ms) | gpu_base (ms) | GPU + register opt (ms) | GPU + gridDim opt (ms) | GPU + register + gridDim opt (ms) | Speedup (CPU / gpu_base) |
---|---|---|---|---|---|---|
<<<(2048,2048),(8,8)>>> | 1545.75 | 37.78 | 37.45 | 37.40 | 37.54 | 40.91 |
<<<(2048,1024),(8,16)>>> | | 39.04 | 39.04 | 39.09 | 39.05 | 39.60 |
<<<(2048,512),(8,32)>>> | | 40.81 | 40.84 | 40.83 | 40.81 | 37.88 |
<<<(1024,2048),(16,8)>>> | | 26.63 | 26.64 | 26.67 | 26.65 | 58.04 |
<<<(1024,1024),(16,16)>>> | | 27.21 | 27.26 | 27.24 | 27.26 | 56.81 |
<<<(1024,512),(16,32)>>> | | 28.13 | 28.14 | 28.14 | 28.20 | 54.95 |
<<<(512,2048),(32,8)>>> | | 26.20 | 26.17 | 26.14 | 26.17 | 59.01 |
<<<(512,1024),(32,16)>>> | | 26.79 | 26.79 | 26.76 | 26.70 | 57.70 |
<<<(512,512),(32,32)>>> | | 27.42 | 27.37 | 27.45 | 27.37 | 56.38 |

The CPU serial time was measured once (1545.75 ms); the speedup in every row is computed against that figure.
6. Analysis of results
(1) Register optimization: no measurable improvement; per-thread register usage is probably not what limits SM occupancy here, and the compiler may well allocate the same number of registers for both kernel versions anyway (the sketch after this list shows how to query the actual register count and theoretical occupancy).
(2) gridDim optimization: no measurable improvement either. Because the tested block dimensions all divide 16384 exactly, gridDim.x*blockDim.x equals NX and the recomputed gridDim.y is identical to the baseline (NY-1)/by+1 (see the sketch at the end of section 4), so the launch configuration does not change and there are no idle edge threads or divergent warps to remove in the first place; in theory the scheme would still reduce divergence for matrix sizes that do not divide evenly.
(3) Block size: configurations with blockDim.x of 16 or 32 give a clearly higher speedup than blockDim.x = 8. A plausible reason is memory coalescing: with blockDim.x = 8, a 32-thread warp spans four matrix rows, so its global loads and stores touch four separate 32-byte segments rather than one contiguous 128-byte region.
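To check point (1) directly, the per-kernel register count can be printed at compile time with nvcc -Xptxas -v, or queried at run time. The helper below is a minimal sketch added for illustration (the function name report_kernel_resources is mine); it assumes it is placed in the same .cu file as the kernels in section 2 and uses the standard cudaFuncGetAttributes and cudaOccupancyMaxActiveBlocksPerMultiprocessor runtime APIs.

// Print registers per thread and the theoretical number of resident blocks per SM.
// threadsPerBlock is blockDim.x*blockDim.y, e.g. 64 for an (8,8) block.
void report_kernel_resources(int threadsPerBlock)
{
    cudaFuncAttributes attr;
    cudaFuncGetAttributes(&attr, kernel_add2D);
    printf("kernel_add2D: %d registers/thread\n", attr.numRegs);
    cudaFuncGetAttributes(&attr, kernel_add2D_regist_opt);
    printf("kernel_add2D_regist_opt: %d registers/thread\n", attr.numRegs);

    int blocksPerSM = 0; // trailing 0: the kernels use no dynamic shared memory
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocksPerSM, kernel_add2D, threadsPerBlock, 0);
    printf("kernel_add2D: up to %d resident blocks per SM at %d threads/block\n",
           blocksPerSM, threadsPerBlock);
}

Calling report_kernel_resources(blockX*blockY) at the start of the GPU path would show whether the "register optimization" changes attr.numRegs at all.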