cuda 内存模型

标签：__ int 模型 global cuda shared array include 内存

cuda内存模型其实概括来说就是下面两张图

双箭头代表可读可写，单箭头代表只读

1. local memory

#include<iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

# define BLOCK_SIZE 256


__global__ void test_kernal()
{
	int  array[3];
	float value = 5;
	__shared__ int shared_value;
	printf("array is local = %s\n", __isLocal(array) ? "true" : "false");  //数组是local memory
	printf("value is local = %s\n", __isLocal(&value) ? "true" : "false");  //自定义的一个变量是...
	printf("shared_value is local = %s\n", __isLocal(&shared_value) ? "true" : "false"); //shared memory里的变量不是...
}



int main()
{
	test_kernal << <1, 1 >> >() ;
	cudaDeviceSynchronize(); //前面的任务失败后会返回一个error

	return 0;
}

输出

2. shared memory

#include<iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

# define BLOCK_SIZE 256

//声明共享的变量，不能给初始值，需要由线程来初始化
__shared__ int shared_value2;

static __global__ void test_kernal()
{
	__shared__ int  shared_array[3]; //shared类型的变量，同一block的所有线程共用
	__shared__ int shared_value1; //声明共享的变量，不能给初始值，需要由线程来初始化

	if (threadIdx.x == 0) 
	{
		shared_value1 = 5;
		shared_value2 = 8;
		shared_array[0] = 33;
	}

	__syncthreads();

	printf("%d, shared_value1 = %d, shared_value2 = %d\n",threadIdx.x , shared_value1 , shared_value2);
	printf("%d, shared_array[0] = %d\n",shared_array[0]); 
}



int main()
{
	test_kernal << <1, 2 >> >() ;
	cudaDeviceSynchronize(); //前面的任务失败后会返回一个error

	return 0;
}

输出

#include<iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

# define BLOCK_SIZE 256


static __global__ void test_kernal()
{
	//使用extern声明外部的动态大小共享内存，由启动核函数的第三个参数指定
	extern __shared__ int  shared_array[]; 

	if (threadIdx.x == 0) 
	{
		shared_array[0] = blockIdx.x;
	}

	__syncthreads();

	printf("%d, %d , %d\n",blockIdx.x , threadIdx.x , shared_array[0]); 
}



int main()
{ 
	test_kernal << <2, 2 ,sizeof(int)*5 >> >() ; //核函数里的数组大小为5
	cudaDeviceSynchronize(); //前面的任务失败后会返回一个error

	return 0;
}

输出:

3. global memory

1. 定义

#include<iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

# define BLOCK_SIZE 256
__device__ float global_array[100];  //方式2 ： __device__定义

static __global__ void test_kernal(float* device_ptr)
{
	printf("device_ptr is global = %s\n",__isGlobal(device_ptr) ? "true" : "false");
	printf("global_array is global = %s\n", __isGlobal(global_array) ? "true" : "false");
}



int main()
{ 
	float* device_ptr = nullptr;
	cudaMalloc(&device_ptr, sizeof(float) * 100); //方式1 ： cudamalloc主机分配

	test_kernal << <1, 1>> >(device_ptr) ; //核函数里的数组大小为5
	cudaDeviceSynchronize(); //前面的任务失败后会返回一个error

	return 0;
}

输出：

2. memory transfer

cpu和gpu之间传输用PCle 速度慢:8GB/s
GPU和GPU Memory之间用GDDR5 速度快:144GB/s

尽量避免cpu和gpu之间的数据传输

3. Pinned memory(页锁定内存)

默认下，通过new,malloc函数分配的pageable data transfer(可置换页上的内存)可能会被os置换到虚拟内存上导致gpu无法安全获取

因此在pageable data transfer传送到device时，cuda驱动会分配一个Pinned Memory。
Pinned Memory常驻物理内存，不会被交换。可以使用DMA技术直接在cpu/gpu使用该内存

通过cudaMallocHost可以显示的分配Pinned Data Transfer

#include<iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"


static __global__ void test_kernal(float* array)
{
	array[threadIdx.x] = threadIdx.x;
}

void global_memory2_pinned_memory()
{
	int num = 5;
	float* array = nullptr;
	cudaMallocHost(&array, sizeof(float) * num); //给array分配页锁定内存，是dma的，不需要cpu参与

	test_kernal << <1, num >> > (array);
	cudaDeviceSynchronize();

	for (int i = 0; i < num; i++) printf("array[%d] = %f\n", i, array[i]);
	cudaFreeHost(array); //释放
}

int main()
{
	global_memory2_pinned_memory();
	cudaDeviceSynchronize(); //前面的任务失败后会返回一个error

	return 0;
}

4. Unified Memory(统一内存)

将cpu和gpu看做一个整体进行管理和使用。分配的内存可以cpu/gpu直接访问

左图是将cpu和gpu的内存割裂开来看的，右图则是统一内存

static __global__ void test_kernal(float* array)
{
	array[threadIdx.x] = threadIdx.x;
}

void global_memory2_pinned_memory()
{
	int num = 5;
	float* array = nullptr;
	cudaMallocManaged(&array, sizeof(float) * num); //给array分配页锁定内存

	test_kernal << <1, num >> > (array);
	cudaDeviceSynchronize();

	for (int i = 0; i < num; i++) printf("array[%d] = %f\n", i, array[i]);
	cudaFree(array);
}

4. constant Memory

#include<iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

//直接定义和初始化
__constant__ float warp_matrix[6] = { 1,2,3,4,5,6 };

static __global__ void test_kernal(float* array)
{ 
	//核函数内，常量内存不能修改，否则报错
	printf("warp_matrix[%d] = %f\n", threadIdx.x, warp_matrix[threadIdx.x]); 
}

void constant_memory()
{
	//修改常量内存的方法:覆盖
	float host_warp_matrix[6] = { 6,5,4,3,2,1 };
	test_kernal << <1, 6 >> > (warp_matrix);
	cudaMemcpyToSymbol(warp_matrix, host_warp_matrix, sizeof(float) * 6); //cudaMemcpyToSymbol用来拷贝数据到常量内存

	test_kernal << <1, 6>> > (warp_matrix);
	cudaDeviceSynchronize();
}

int main()
{
	constant_memory();
	cudaDeviceSynchronize(); //前面的任务失败后会返回一个error

	return 0;
}

标签：__,int,模型,global,cuda,shared,array,include,内存
From： https://www.cnblogs.com/algoshimo/p/18088189