首页 > 编程语言 >cuda编程[1]:一二三维网格和块的核函数

cuda编程[1]:一二三维网格和块的核函数

时间:2024-09-03 21:25:48浏览次数:6  
标签:cord idx thread blockDim 编程 网格 cuda block 3d

目录

前言

所有的代码下载链接:code。以下代码展示了如何在 CUDA 中打印网格和线程的索引信息。代码包括一维、二维和三维的网格和块的设置,并定义了多个内核函数来输出当前的索引信息。

核函数

  1. 打印线程索引
// Debug kernel: each launched thread prints one line containing its block
// index and its thread index within the block.
// Both triples are printed in (z, y, x) order. Takes no arguments.
__global__ void print_idx_kernel(){
    // Snapshot the built-in index variables into locals for readability.
    const uint3 blk = blockIdx;
    const uint3 thr = threadIdx;

    printf("block idx: (%3d, %3d, %3d), thread idx: (%3d, %3d, %3d)\n",
           blk.z, blk.y, blk.x,
           thr.z, thr.y, thr.x);
}
  1. 打印网格和块的维度
// Debug kernel: each launched thread prints the grid dimensions and the
// block dimensions of the current launch, both in (z, y, x) order.
// Every thread executes the printf, so the same line is emitted once per
// thread in the launch. Takes no arguments.
__global__ void print_dim_kernel(){
    const dim3 gridShape  = gridDim;
    const dim3 blockShape = blockDim;

    printf("grid dimension: (%3d, %3d, %3d), block dimension: (%3d, %3d, %3d)\n",
           gridShape.z, gridShape.y, gridShape.x,
           blockShape.z, blockShape.y, blockShape.x);
}
  1. 打印每个块的线程索引
// Debug kernel: each thread prints its block index (z, y, x order) together
// with its flattened thread index inside that block.
// The flattened index treats x as the fastest-varying axis, then y, then z.
__global__ void print_thread_idx_per_block_kernel(){
    // Nested form of z * (dimX * dimY) + y * dimX + x.
    const int flatThread =
        (threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;

    printf("block idx: (%3d, %3d, %3d), thread idx: %3d\n",
           blockIdx.z, blockIdx.y, blockIdx.x,
           flatThread);
}
  1. 打印每个网格的线程索引
// Debug kernel: each thread prints three flattened indices:
//   - the index of its block within the grid,
//   - its own index within the block,
//   - its index within the whole grid (block index * block size + local index).
// Flattening uses x as the fastest-varying axis, then y, then z.
__global__ void print_thread_idx_per_grid_kernel(){
    // Number of threads in one block.
    const int threadsPerBlock = blockDim.x * blockDim.y * blockDim.z;

    // Nested form of z * (dimX * dimY) + y * dimX + x, for the block in the grid.
    const int blockRank =
        (blockIdx.z * gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x;

    // Same flattening, for the thread inside its block.
    const int localRank =
        (threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;

    const int globalRank = blockRank * threadsPerBlock + localRank;

    printf("block idx: %3d, thread idx in block: %3d, thread idx: %3d\n", 
           blockRank, localRank, globalRank);
}
  1. 打印坐标
// Debug kernel: each thread prints its block index, its flattened thread
// index inside the block, and its global coordinate along each axis.
// Note the print order: the block index is shown as (z, y, x) while the
// coordinate is shown as (x, y, z).
__global__ void print_cord_kernel(){
    // Flattened in-block index, x fastest (nested form of z*dx*dy + y*dx + x).
    const int flatThread =
        (threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;

    // Global coordinate on each axis: block offset plus in-block offset.
    const int gx = threadIdx.x + blockIdx.x * blockDim.x;
    const int gy = threadIdx.y + blockIdx.y * blockDim.y;
    const int gz = threadIdx.z + blockIdx.z * blockDim.z;

    printf("block idx: (%3d, %3d, %3d), thread idx: %3d, cord: (%3d, %3d, %3d)\n",
           blockIdx.z, blockIdx.y, blockIdx.x,
           flatThread, gx, gy, gz);
}

一维

cudaDeviceSynchronize()(使 CPU 与 GPU 端完成同步)不可缺少:核函数的启动是异步的,主函数在 CPU 上调用核函数之后不会等待 GPU 全部执行完毕并返回结果,而是直接继续往下执行。因此需要加上这个同步函数,否则程序可能在 GPU 端的输出返回之前就已经结束,运行可执行文件时会得到空的结果。

代码

// Host driver for the 1-D case: builds a 1-D grid of 1-D blocks covering
// `inputSize` elements, prints the launch configuration, launches an index
// printing kernel, and waits for the device so the kernel's printf output is
// produced before the function returns.
//
// print_thread_idx_per_grid_kernel can be swapped for any of the other
// print_*_kernel functions in this file to inspect different index views.
void print_one_dim() {
    const int inputSize = 8;
    // Named to avoid shadowing the CUDA built-ins blockDim / gridDim.
    const int threadsPerBlock = 4;
    // Ceil-division so the grid still covers the input when it is not an
    // exact multiple of the block size (same result as 8 / 4 here).
    const int blocksPerGrid = (inputSize + threadsPerBlock - 1) / threadsPerBlock;

    dim3 block(threadsPerBlock);
    dim3 grid(blocksPerGrid);

    printf("grid dimension: %d, block dimension: %d,\n", grid.x, block.x);

    // Without a launch there is nothing for cudaDeviceSynchronize to wait on
    // and no index information is ever printed.
    print_thread_idx_per_grid_kernel<<<grid, block>>>();

    // A kernel launch returns no status itself: configuration errors are
    // reported by cudaGetLastError, execution errors by the synchronize call.
    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess) {
        err = cudaDeviceSynchronize();
    }
    if (err != cudaSuccess) {
        printf("CUDA error: %s\n", cudaGetErrorString(err));
    }
}

二维

代码

// Host driver for the 2-D case: builds a square 2-D grid of square 2-D
// blocks covering an inputWidth x inputWidth domain, prints the launch
// configuration in (y, x) order, launches a coordinate printing kernel, and
// waits for the device so the kernel's printf output is produced before the
// function returns.
//
// print_cord_kernel can be swapped for any of the other print_*_kernel
// functions in this file to inspect different index views.
void print_two_dim() {
    const int inputWidth = 4;
    // Named to avoid shadowing the CUDA built-ins blockDim / gridDim.
    const int blockSide = 2;
    // Ceil-division so the grid still covers the input when it is not an
    // exact multiple of the block size (same result as 4 / 2 here).
    const int gridSide = (inputWidth + blockSide - 1) / blockSide;

    dim3 block(blockSide, blockSide);
    dim3 grid(gridSide, gridSide);

    printf("grid dimension: (%d, %d), block dimension: (%d, %d)\n",
           grid.y, grid.x, block.y, block.x);

    // Without a launch there is nothing for cudaDeviceSynchronize to wait on
    // and no index information is ever printed.
    print_cord_kernel<<<grid, block>>>();

    // A kernel launch returns no status itself: configuration errors are
    // reported by cudaGetLastError, execution errors by the synchronize call.
    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess) {
        err = cudaDeviceSynchronize();
    }
    if (err != cudaSuccess) {
        printf("CUDA error: %s\n", cudaGetErrorString(err));
    }
}

三维打印

代码

// Host driver for the 3-D case: builds a 3-D grid of cubic 3-D blocks that
// covers a width x height x depth domain, prints the launch configuration in
// (z, y, x) order, launches print_cord_kernel, and waits for the device so
// the kernel's printf output is produced before the function returns.
//
// The domain (3 per axis) is not a multiple of the block side (2), so the
// grid is rounded up and some threads have coordinates outside the domain;
// print_cord_kernel prints those threads as well.
void print_three_dim() {
    const int depth = 3;
    const int height = 3;
    const int width = 3;

    // Named to avoid shadowing the CUDA built-in blockDim.
    const int blockSide = 2;

    dim3 block(blockSide, blockSide, blockSide);
    // Ceil-division per axis: enough blocks to cover the whole domain.
    dim3 grid((width + blockSide - 1) / blockSide,
              (height + blockSide - 1) / blockSide,
              (depth + blockSide - 1) / blockSide);

    printf("grid dimension: (%d, %d, %d), block dimension: (%d, %d, %d)\n",
           grid.z, grid.y, grid.x,
           block.z, block.y, block.x);

    // Without a launch there is nothing for cudaDeviceSynchronize to wait on
    // and no per-thread output is ever printed.
    print_cord_kernel<<<grid, block>>>();

    // A kernel launch returns no status itself: configuration errors are
    // reported by cudaGetLastError, execution errors by the synchronize call.
    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess) {
        err = cudaDeviceSynchronize();
    }
    if (err != cudaSuccess) {
        printf("CUDA error: %s\n", cudaGetErrorString(err));
    }
}

主函数

这里就可以自己来使用print_one_dim, print_two_dim, print_three_dim测试不同网格和块的情况。可以自行组合定义核函数来测试所有情况。

// Program entry point: runs exactly one of the dimension demos.
int main() {
    // Pick which grid/block layout to demonstrate by un-commenting one call.
    // print_one_dim();
    // print_two_dim();
    print_three_dim();

    return 0;
}

结果分析

这个只是一个小的.cu代码,所以我直接使用了笔记(点击代码链接可以看到)中的第一种方法编译。
在这里插入图片描述
打开当前代码目录下运行

nvcc grid_block_123D.cu -o test
./test

得到结果
```txt
grid dimension: (2, 2, 2), block dimension: (2, 2, 2)
block idx: ( 1, 0, 1), thread idx: 0, cord: ( 2, 0, 2)
block idx: ( 1, 0, 1), thread idx: 1, cord: ( 3, 0, 2)
block idx: ( 1, 0, 1), thread idx: 2, cord: ( 2, 1, 2)
block idx: ( 1, 0, 1), thread idx: 3, cord: ( 3, 1, 2)
block idx: ( 1, 0, 1), thread idx: 4, cord: ( 2, 0, 3)
block idx: ( 1, 0, 1), thread idx: 5, cord: ( 3, 0, 3)
block idx: ( 1, 0, 1), thread idx: 6, cord: ( 2, 1, 3)
block idx: ( 1, 0, 1), thread idx: 7, cord: ( 3, 1, 3)
block idx: ( 0, 1, 0), thread idx: 0, cord: ( 0, 2, 0)
block idx: ( 0, 1, 0), thread idx: 1, cord: ( 1, 2, 0)
block idx: ( 0, 1, 0), thread idx: 2, cord: ( 0, 3, 0)
block idx: ( 0, 1, 0), thread idx: 3, cord: ( 1, 3, 0)
block idx: ( 0, 1, 0), thread idx: 4, cord: ( 0, 2, 1)
block idx: ( 0, 1, 0), thread idx: 5, cord: ( 1, 2, 1)
block idx: ( 0, 1, 0), thread idx: 6, cord: ( 0, 3, 1)
block idx: ( 0, 1, 0), thread idx: 7, cord: ( 1, 3, 1)
block idx: ( 1, 0, 0), thread idx: 0, cord: ( 0, 0, 2)
block idx: ( 1, 0, 0), thread idx: 1, cord: ( 1, 0, 2)
block idx: ( 1, 0, 0), thread idx: 2, cord: ( 0, 1, 2)
block idx: ( 1, 0, 0), thread idx: 3, cord: ( 1, 1, 2)
block idx: ( 1, 0, 0), thread idx: 4, cord: ( 0, 0, 3)
block idx: ( 1, 0, 0), thread idx: 5, cord: ( 1, 0, 3)
block idx: ( 1, 0, 0), thread idx: 6, cord: ( 0, 1, 3)
block idx: ( 1, 0, 0), thread idx: 7, cord: ( 1, 1, 3)
block idx: ( 0, 0, 1), thread idx: 0, cord: ( 2, 0, 0)
block idx: ( 0, 0, 1), thread idx: 1, cord: ( 3, 0, 0)
block idx: ( 0, 0, 1), thread idx: 2, cord: ( 2, 1, 0)
block idx: ( 0, 0, 1), thread idx: 3, cord: ( 3, 1, 0)
block idx: ( 0, 0, 1), thread idx: 4, cord: ( 2, 0, 1)
block idx: ( 0, 0, 1), thread idx: 5, cord: ( 3, 0, 1)
block idx: ( 0, 0, 1), thread idx: 6, cord: ( 2, 1, 1)
block idx: ( 0, 0, 1), thread idx: 7, cord: ( 3, 1, 1)
block idx: ( 1, 1, 1), thread idx: 0, cord: ( 2, 2, 2)
block idx: ( 1, 1, 1), thread idx: 1, cord: ( 3, 2, 2)
block idx: ( 1, 1, 1), thread idx: 2, cord: ( 2, 3, 2)
block idx: ( 1, 1, 1), thread idx: 3, cord: ( 3, 3, 2)
block idx: ( 1, 1, 1), thread idx: 4, cord: ( 2, 2, 3)
block idx: ( 1, 1, 1), thread idx: 5, cord: ( 3, 2, 3)
block idx: ( 1, 1, 1), thread idx: 6, cord: ( 2, 3, 3)
block idx: ( 1, 1, 1), thread idx: 7, cord: ( 3, 3, 3)
block idx: ( 0, 1, 1), thread idx: 0, cord: ( 2, 2, 0)
block idx: ( 0, 1, 1), thread idx: 1, cord: ( 3, 2, 0)
block idx: ( 0, 1, 1), thread idx: 2, cord: ( 2, 3, 0)
block idx: ( 0, 1, 1), thread idx: 3, cord: ( 3, 3, 0)
block idx: ( 0, 1, 1), thread idx: 4, cord: ( 2, 2, 1)
block idx: ( 0, 1, 1), thread idx: 5, cord: ( 3, 2, 1)
block idx: ( 0, 1, 1), thread idx: 6, cord: ( 2, 3, 1)
block idx: ( 0, 1, 1), thread idx: 7, cord: ( 3, 3, 1)
block idx: ( 0, 0, 0), thread idx: 0, cord: ( 0, 0, 0)
block idx: ( 0, 0, 0), thread idx: 1, cord: ( 1, 0, 0)
block idx: ( 0, 0, 0), thread idx: 2, cord: ( 0, 1, 0)
block idx: ( 0, 0, 0), thread idx: 3, cord: ( 1, 1, 0)
block idx: ( 0, 0, 0), thread idx: 4, cord: ( 0, 0, 1)
block idx: ( 0, 0, 0), thread idx: 5, cord: ( 1, 0, 1)
block idx: ( 0, 0, 0), thread idx: 6, cord: ( 0, 1, 1)
block idx: ( 0, 0, 0), thread idx: 7, cord: ( 1, 1, 1)
block idx: ( 1, 1, 0), thread idx: 0, cord: ( 0, 2, 2)
block idx: ( 1, 1, 0), thread idx: 1, cord: ( 1, 2, 2)
block idx: ( 1, 1, 0), thread idx: 2, cord: ( 0, 3, 2)
block idx: ( 1, 1, 0), thread idx: 3, cord: ( 1, 3, 2)
block idx: ( 1, 1, 0), thread idx: 4, cord: ( 0, 2, 3)
block idx: ( 1, 1, 0), thread idx: 5, cord: ( 1, 2, 3)
block idx: ( 1, 1, 0), thread idx: 6, cord: ( 0, 3, 3)
block idx: ( 1, 1, 0), thread idx: 7, cord: ( 1, 3, 3)
```

标签:cord,idx,thread,blockDim,编程,网格,cuda,block,3d
From: https://blog.csdn.net/buuliuda/article/details/141863094

相关文章

  • 编程新手必看:探索编程中的 for 循环20 种语言的实践与比较
    在这里我展示了20多种编程语言中的for循环实现。希望这些示例对大家学习不同语言的语法有帮助!1.C语言2.C++3.Python4.JavaScript5.Java6.Ruby7.Swift8.Go9.Rust10.Kotlin11.PHP12.TypeScript13.Perl14.Haskell15.Scala16.Julia17.R18.MATLAB19.Lua......
  • 并发编程学习笔记1
    1.线程的创建    方法一:直接重写Thread类的run方法Threadt=newThread(){@Overridepublicvoidrun(){}};t.start();    可简写为:Threadt3=newThread(()->{});t.start();    方法二:使用Runnable配合ThreadRunna......
  • Java 入门指南:Java 并发编程 —— 并发容器 ConcurrentSkipListMap
    ConcurrentMapConcurrentMap是Java并发包中提供的一个接口,它继承了java.util.Map接口,专门用于支持高并发环境下的线程安全操作。ConcurrentMap提供了一系列线程安全的方法,旨在解决在多线程环境下使用普通Map类型(如HashMap)时可能出现的竞态条件和数据不一致问题。......
  • 高效并发编程:使用Python线程池执行任务
    高效并发编程:使用Python线程池执行任务在现代软件开发中,处理并发任务是提高程序性能和响应速度的关键技术之一。Python作为一种广泛使用的编程语言,提供了多种并发编程工具,其中线程池(ThreadPool)是一个非常实用的工具。本文将详细介绍如何编写一个函数,使用线程池执行一组任务......
  • 章10——面向对象编程(高级部分)——两种单例模式
    代码如下://单例模式//instance--实例//该篇中记录了饿汉模式和懒汉模式publicclassHungryMan{publicstaticvoidmain(String[]args){Single01.say();Single02.say();}}classSingle01{//只能有instance这一个实例。privateS......
  • C安全编程教学-声明和初始化-不要声明或者定义保留标识符(三)
    注:本课程参考文献《C安全编码标准》 欢迎关注我......
  • 二、并发编程与多线程-2.1、J.U.C和锁(中篇)
    2.1、J.U.C和锁(中篇)2.1.4、什么是CAS?答:CAS是Java中Unsafe类里面的方法,全称是CompareAndSwap,是比较并交换的意思。作用就是保证在多线程环境下,对于修改共享变量操作的原子性。扩展:CAS保证修改共享变量操作原子性的实现逻辑:CAS方法里有三个参数,依次分别是共享变量的内......
  • 【花雕学编程】Arduino FOC 之并联五连杆算法
    Arduino是一个开放源码的电子原型平台,它可以让你用简单的硬件和软件来创建各种互动的项目。Arduino的核心是一个微控制器板,它可以通过一系列的引脚来连接各种传感器、执行器、显示器等外部设备。Arduino的编程是基于C/C++语言的,你可以使用ArduinoIDE(集成开发环境)来编写、......
  • 【花雕学编程】Arduino FOC 之步进电机正反转驱动、AS5600编码器信息读取及速度检测
    Arduino是一个开放源码的电子原型平台,它可以让你用简单的硬件和软件来创建各种互动的项目。Arduino的核心是一个微控制器板,它可以通过一系列的引脚来连接各种传感器、执行器、显示器等外部设备。Arduino的编程是基于C/C++语言的,你可以使用ArduinoIDE(集成开发环境)来编写、......
  • 章10——面向对象编程(高级部分)——代码块
    代码块/初始化块与方法的区别:无方法名、返回、参数,仅方法体。不用通过对象或类显示调用,加载类/创建对象时隐式调用。代码块的好处:总结:在有多个构造器的类中,可以把每个构造器中都需要有的重复语句抽取出来单独作为代码块,大大提高了程序的复用性。publicclassCodeBlock01{......