For an m * n matrix a and an n * k matrix b, the product is an m * k matrix c, so we can launch m * k threads: thread (i, j) computes the dot product of row i of a and column j of b and writes the result to c[i][j].
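In index form (just restating the above), each thread evaluates

c[i][j] = Σ_{t=0}^{n-1} a[i][t] * b[t][j]

and since the matrices are stored row-major in flat arrays, this becomes c[i*k + j] = Σ_{t=0}^{n-1} a[i*n + t] * b[t*k + j], which is exactly the loop in the kernel below.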
#include <iostream>
#include <cstdlib>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#define BLOCK_SIZE 2
__global__ void gpu_matrix_mult(int* a, int* b, int* c, int m, int n, int k)
{
    // row and col are the output row and column this thread is responsible for
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int sum = 0;
    if (col < k && row < m)
    {
        // dot product of row `row` of a and column `col` of b
        for (int i = 0; i < n; i++)
        {
            sum += a[row * n + i] * b[i * k + col];
        }
        c[row * k + col] = sum;
    }
}
int main()
{
    int m = 100, n = 100, k = 100;
    int *h_a, *h_b, *h_c;
    cudaMallocHost((void**)&h_a, sizeof(int) * m * n);
    cudaMallocHost((void**)&h_b, sizeof(int) * n * k);
    cudaMallocHost((void**)&h_c, sizeof(int) * m * k);
    for (int i = 0; i < m; ++i)
    {
        for (int j = 0; j < n; ++j)
            h_a[i * n + j] = rand() % 1024;
    }
    for (int i = 0; i < n; ++i)
    {
        for (int j = 0; j < k; ++j)
            h_b[i * k + j] = rand() % 1024;
    }
    int *d_a, *d_b, *d_c;
    cudaMalloc((void**)&d_a, sizeof(int) * m * n);
    cudaMalloc((void**)&d_b, sizeof(int) * n * k);
    cudaMalloc((void**)&d_c, sizeof(int) * m * k);
    cudaMemcpy(d_a, h_a, sizeof(int) * m * n, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, sizeof(int) * n * k, cudaMemcpyHostToDevice);
    // BLOCK_SIZE is the length of one side of a block
    // grid_rows is the number of rows of blocks in the grid
    // grid_cols is the number of columns of blocks in the grid
    // dimGrid says how many blocks the grid has per row and per column
    // dimBlock says how many threads a block has per row and per column
    unsigned int grid_rows = (m + BLOCK_SIZE - 1) / BLOCK_SIZE;
    unsigned int grid_cols = (k + BLOCK_SIZE - 1) / BLOCK_SIZE;
    dim3 dimGrid(grid_cols, grid_rows);
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    gpu_matrix_mult<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, m, n, k);
    cudaMemcpy(h_c, d_c, sizeof(int) * m * k, cudaMemcpyDeviceToHost);
    for (int i = 0; i < m * k; i++)
    {
        std::cout << h_c[i] << std::endl;
    }
    // release device and pinned host memory
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    cudaFreeHost(h_a);
    cudaFreeHost(h_b);
    cudaFreeHost(h_c);
    return 0;
}
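With m = k = 100 and BLOCK_SIZE = 2, the rounding-up division gives grid_rows = grid_cols = 50, so the launch covers 100 * 100 threads, one per output element (the boundary check in the kernel handles sizes that are not multiples of BLOCK_SIZE).

To convince yourself the kernel is correct, you can compare h_c against a serial result on the host. Below is a minimal sketch; the helper cpu_matrix_mult is not part of the original post, it is only a hypothetical reference implementation, and the usage is shown as comments.

// Hypothetical CPU reference for verification (not in the original post).
void cpu_matrix_mult(const int* a, const int* b, int* c, int m, int n, int k)
{
    for (int i = 0; i < m; ++i)
        for (int j = 0; j < k; ++j)
        {
            int sum = 0;
            for (int t = 0; t < n; ++t)
                sum += a[i * n + t] * b[t * k + j];
            c[i * k + j] = sum;
        }
}

// Usage sketch (assumes h_a, h_b, h_c from main are still valid):
// int* h_ref = new int[m * k];
// cpu_matrix_mult(h_a, h_b, h_ref, m, n, k);
// bool ok = true;
// for (int i = 0; i < m * k; ++i)
//     if (h_ref[i] != h_c[i]) { ok = false; break; }
// std::cout << (ok ? "match" : "mismatch") << std::endl;
// delete[] h_ref;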
From: https://www.cnblogs.com/algoshimo/p/18075392