如果不使用原子操作,多个线程同时对直方图的同一个计数单元做“读-改-写”时会发生数据竞争,导致部分累加结果丢失。
d_b[i] 表示 h_a 中数值 i 出现的次数。
下面的代码将 h_a 全部赋值为 3,期望 d_b[3] 为 10,但实际得到的 d_b[3] 却是 1。
#include<iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#define N 10
// Histogram kernel WITHOUT atomics (kept racy on purpose as a demonstration).
//
// Each thread reads one input value a[tid] and bumps the matching counter in
// b with a plain read-modify-write. When several threads target the same
// counter, their updates can overwrite one another and increments are lost.
//
// a: input values; every value is used directly as an index into b.
// b: counters, one per possible value.
// The kernel performs no bounds checks: the launch must create exactly one
// thread per element of a.
__global__ void f(int* a , int *b)
{
    const int tid = blockDim.x * blockIdx.x + threadIdx.x;
    const int bin = a[tid];
    // Non-atomic increment: load, add one, store back.
    b[bin] = b[bin] + 1;
}
// Host driver for the non-atomic histogram demo.
//
// Fills h_a with N copies of the value 3, runs f with N blocks of one thread
// each, and prints the N counters copied back into h_b (h_b[i] = how many
// times value i was counted). Because f increments without atomics, the
// printed count for 3 is expected to be lower than N.
//
// Returns 0 on success, 1 if any CUDA call or the kernel launch failed.
int main()
{
    int h_a[N], h_b[N]; // h_b[i] receives the number of times value i was counted
    int* d_a = NULL;
    int* d_b = NULL;
    const size_t bytes = N * sizeof(int);

    for (int i = 0; i < N; i++) h_a[i] = 3;

    // Chain the calls so the first failure is kept and later steps are skipped.
    cudaError_t err = cudaMalloc((void**)&d_a, bytes);
    if (err == cudaSuccess) err = cudaMalloc((void**)&d_b, bytes);
    // cudaMalloc does not initialise memory: the counters must start at zero,
    // otherwise the kernel increments whatever garbage was in the buffer.
    if (err == cudaSuccess) err = cudaMemset(d_b, 0, bytes);
    if (err == cudaSuccess) err = cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice);
    if (err == cudaSuccess) {
        f<<<N, 1>>>(d_a, d_b);
        err = cudaGetLastError(); // launch errors are not reported by the launch itself
    }
    // Blocking copy: also waits for the kernel and reports errors raised while it ran.
    if (err == cudaSuccess) err = cudaMemcpy(h_b, d_b, bytes, cudaMemcpyDeviceToHost);

    if (err == cudaSuccess) {
        for (int i = 0; i < N; i++) printf("%d ", h_b[i]);
    } else {
        std::cerr << "CUDA error: " << cudaGetErrorString(err) << std::endl;
    }

    // cudaFree accepts a null pointer, so this is safe on every path.
    cudaFree(d_a);
    cudaFree(d_b);
    return err == cudaSuccess ? 0 : 1;
}
使用共享内存原子操作:
#include<iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#define N 10
// Histogram kernel using shared-memory atomics.
//
// Each block accumulates a private histogram of N bins in shared memory with
// atomicAdd, then merges its non-empty bins into the global histogram b with
// one global atomicAdd per bin.
//
// a: input values, N elements; values outside [0, N) are ignored.
// b: global histogram with N counters; must be zero-initialised by the caller
//    because this kernel only adds to it.
// Launch: any 1-D grid/block shape. Threads whose global index is >= N do not
// read a, but still take part in the barriers and in the per-block merge.
// Shared memory: N * sizeof(unsigned int) bytes, statically allocated.
__global__ void f(int* a , int *b)
{
    __shared__ unsigned int temp[N];
    int x = blockIdx.x * blockDim.x + threadIdx.x;

    // Clear every bin of this block's histogram. The loop strides by blockDim.x
    // so all N bins are covered even when the block has fewer than N threads.
    for (int bin = threadIdx.x; bin < N; bin += blockDim.x) temp[bin] = 0;
    __syncthreads();

    // Count this thread's element in the block-local histogram.
    // The barriers stay outside the branch so every thread reaches them.
    if (x < N)
    {
        int v = a[x];
        if (v >= 0 && v < N) atomicAdd(&temp[v], 1u);
    }
    __syncthreads();

    // Merge the block-local counts into the global result.
    for (int bin = threadIdx.x; bin < N; bin += blockDim.x)
    {
        if (temp[bin] != 0) atomicAdd(&b[bin], (int)temp[bin]);
    }
}
// Host driver for the atomic histogram demo.
//
// Fills h_a with N copies of the value 3, runs f with N blocks of one thread
// each, and prints the N counters copied back into h_b (h_b[i] = how many
// times value i was counted).
//
// Returns 0 on success, 1 if any CUDA call or the kernel launch failed.
int main()
{
    int h_a[N], h_b[N];
    int* d_a = NULL;
    int* d_b = NULL;
    const size_t bytes = N * sizeof(int);

    for (int i = 0; i < N; i++) h_a[i] = 3;

    // Chain the calls so the first failure is kept and later steps are skipped.
    cudaError_t err = cudaMalloc((void**)&d_a, bytes);
    if (err == cudaSuccess) err = cudaMalloc((void**)&d_b, bytes);
    // cudaMalloc does not initialise memory and the kernel only adds to d_b,
    // so the counters have to be zeroed before the launch.
    if (err == cudaSuccess) err = cudaMemset(d_b, 0, bytes);
    if (err == cudaSuccess) err = cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice);
    if (err == cudaSuccess) {
        f<<<N, 1>>>(d_a, d_b);
        err = cudaGetLastError(); // launch errors are not reported by the launch itself
    }
    // Blocking copy: also waits for the kernel and reports errors raised while it ran.
    if (err == cudaSuccess) err = cudaMemcpy(h_b, d_b, bytes, cudaMemcpyDeviceToHost);

    if (err == cudaSuccess) {
        for (int i = 0; i < N; i++) printf("%d ", h_b[i]);
    } else {
        std::cerr << "CUDA error: " << cudaGetErrorString(err) << std::endl;
    }

    // cudaFree accepts a null pointer, so this is safe on every path.
    cudaFree(d_a);
    cudaFree(d_b);
    return err == cudaSuccess ? 0 : 1;
}
标签:__,int,void,原子,++,cuda,操作,sizeof,include
From: https://www.cnblogs.com/algoshimo/p/18101974