首页 > 编程语言 > CUDA kernel 启动的反汇编

CUDA kernel 启动的反汇编

时间:2025-01-05 21:57:01 浏览次数:1
标签:kernel movl rbp 反汇编 quad 0x0000000000000000 movq rax cuda

原始代码

// Type your code here, or load an example.
// Squares array[i] in place for every index i below n, one element per thread.
// Each thread builds a flat index from the x components of its block and
// thread coordinates and does nothing when that index is not below n.
// extern "C" keeps the name unmangled: the listing refers to it as plain
// `square` (host stub label and the "square" string), while the launch stub
// carries the mangled form in its name (__device_stub__Z6squarePii).
extern "C" __global__ void square(int* array, int n) {
    int tid = blockDim.x * blockIdx.x + threadIdx.x;  // flat 1-D index; only the x dimension is used
    if (tid < n)  // bounds guard: threads whose index is >= n touch no memory
        array[tid] = array[tid] * array[tid];
}

// Host-side launcher: starts `square` on a fixed grid of 92 blocks of 128
// threads (11776 threads in total), with no shared-memory size or stream
// given in the launch configuration.
// NOTE(review): the kernel processes one element per thread, so with this
// fixed configuration elements at index >= 11776 are never squared, and the
// launch result is not checked here. `array` is presumably a device pointer —
// confirm at the call site.
void square_do(int *array, int n) {
    square<<<92, 128>>>(array, n);
}

对应的主机端汇编(x86-64,AT&T 语法)

// Stores its void** argument (%rdi) in the global
// __nv_fatbinhandle_for_managed_rt. Nothing else is done with the value here.
__nv_save_fatbinhandle_for_managed_rt(void**):
        pushq   %rbp
        movq    %rsp, %rbp
        movq    %rdi, -8(%rbp) // spill the argument to the frame
        movq    -8(%rbp), %rax // ...and reload it
        movq    %rax, __nv_fatbinhandle_for_managed_rt(%rip) // global = argument
        nop
        popq    %rbp
        ret
// dim3 constructor: writes its three unsigned arguments into three consecutive
// 32-bit fields of *this (offsets 0, 4 and 8, i.e. a 12-byte object).
// In: %rdi = this, %esi = 1st value (x), %edx = 2nd value (y), %ecx = 3rd value (z).
dim3::dim3(unsigned int, unsigned int, unsigned int):
        pushq   %rbp
        movq    %rsp, %rbp
        movq    %rdi, -8(%rbp) // this
        movl    %esi, -12(%rbp) // x
        movl    %edx, -16(%rbp) // y
        movl    %ecx, -20(%rbp) // z
        movq    -8(%rbp), %rax
        movl    -12(%rbp), %edx
        movl    %edx, (%rax) // this + 0 = x
        movq    -8(%rbp), %rax
        movl    -16(%rbp), %edx
        movl    %edx, 4(%rax) // this + 4 = y
        movq    -8(%rbp), %rax
        movl    -20(%rbp), %edx
        movl    %edx, 8(%rax) // this + 8 = z
        nop
        popq    %rbp
        ret
// Host code for square_do(array, n), i.e. the `square<<<92, 128>>>(array, n)`
// launch. It builds two dim3 temporaries, passes them by value to
// __cudaPushCallConfiguration, and calls the host-side `square` stub only when
// that call returns 0.
// Frame layout: -40 = array, -44 = n,
//               [-24, -12) = dim3(128, 1, 1)  (threads per block),
//               [-12,   0) = dim3(92, 1, 1)   (blocks per grid).
square_do(int*, int):
        pushq   %rbp
        movq    %rsp, %rbp
        subq    $48, %rsp
        movq    %rdi, -40(%rbp) // save array
        movl    %esi, -44(%rbp) // save n
        leaq    -24(%rbp), %rax // this = &blockDim temporary
        movl    $1, %ecx // z = 1
        movl    $1, %edx // y = 1
        movl    $128, %esi // x = 128 (second launch parameter)
        movq    %rax, %rdi
        call    dim3::dim3(unsigned int, unsigned int, unsigned int)
        leaq    -12(%rbp), %rax // this = &gridDim temporary
        movl    $1, %ecx // z = 1
        movl    $1, %edx // y = 1
        movl    $92, %esi // x = 92 (first launch parameter)
        movq    %rax, %rdi
        call    dim3::dim3(unsigned int, unsigned int, unsigned int)
        movq    -24(%rbp), %rax // blockDim.x and .y, fetched with one 64-bit load
        movl    -16(%rbp), %ecx // blockDim.z
        movq    %rcx, %rdx // park blockDim.z while %rcx/%rdx are rearranged
        movq    -12(%rbp), %rdi // 1st argument, low 8 bytes: gridDim.x/.y
        movl    -4(%rbp), %esi // 1st argument, remaining 4 bytes: gridDim.z
        movl    $0, %r9d // 4th argument (stream) = 0
        movl    $0, %r8d // 3rd argument (shared-memory size) = 0
        movq    %rdx, %rcx // 2nd argument, remaining 4 bytes: blockDim.z
        movq    %rax, %rdx // 2nd argument, low 8 bytes: blockDim.x/.y
        call    __cudaPushCallConfiguration // four parameters: (dim3 gridDim, dim3 blockDim, sharedMemSize, stream); each by-value dim3 occupies two integer registers
        testl   %eax, %eax
        jne     .L5 // nonzero return: skip the launch and return
        movl    -44(%rbp), %edx
        movq    -40(%rbp), %rax
        movl    %edx, %esi // second parameter: n
        movq    %rax, %rdi // first parameter: array
        call    square // host stub `square`, which forwards to __device_stub__Z6squarePii
.L5:
        nop
        leave
        ret
// Stores its void* argument (%rdi) in the function-local static __ref.
// The stored value is not read anywhere in this listing.
____nv_dummy_param_ref(void*):
        pushq   %rbp
        movq    %rsp, %rbp
        movq    %rdi, -8(%rbp) // spill the argument
        movq    -8(%rbp), %rax
        movq    %rax, ____nv_dummy_param_ref(void*)::__ref(%rip) // __ref = argument
        nop
        popq    %rbp
        ret
// Exit-time cleanup (registered with atexit by __sti____cudaRegisterAll):
// passes the stored handle in __cudaFatCubinHandle to __cudaUnregisterFatBinary.
__cudaUnregisterBinaryUtil():
        pushq   %rbp
        movq    %rsp, %rbp
        movl    $__cudaFatCubinHandle, %edi // address of the handle variable, not its value
        call    ____nv_dummy_param_ref(void*)
        movq    __cudaFatCubinHandle(%rip), %rax // load the handle value
        movq    %rax, %rdi
        call    __cudaUnregisterFatBinary
        nop
        popq    %rbp
        ret
// Thin wrapper: forwards its void** argument to __cudaInitModule and returns
// with whatever that call left in %rax (the register is not touched afterwards).
// No caller of this function appears in this listing.
__nv_init_managed_rt_with_module(void**):
        pushq   %rbp
        movq    %rsp, %rbp
        subq    $16, %rsp
        movq    %rdi, -8(%rbp) // spill the argument
        movq    -8(%rbp), %rax
        movq    %rax, %rdi // pass it through unchanged
        call    __cudaInitModule
        leave
        ret
fatbinData:
.quad 0x00100001ba55ed50,0x00000000000010a8,0x0000005001010002,0x0000000000000e08
.quad 0x0000000000000000,0x0000003400010007,0x0000000f00000040,0x0000000000000011
.quad 0x0000000000000000,0x0000000000000000,0x6178652f7070612f,0x0075632e656c706d
.quad 0x33010102464c457f,0x0000000000000007,0x0000007d00be0002,0x0000000000000000
.quad 0x0000000000000d60,0x00000000000009a0,0x0038004000340534,0x0001000f00400003
.quad 0x7472747368732e00,0x747274732e006261,0x746d79732e006261,0x746d79732e006261
.quad 0x78646e68735f6261,0x666e692e766e2e00,0x2e747865742e006f,0x2e00657261757173
.quad 0x2e6f666e692e766e,0x2e00657261757173,0x65726168732e766e,0x6572617571732e64
.quad 0x6e6f632e766e2e00,0x732e30746e617473,0x722e006572617571,0x6f632e766e2e6c65
.quad 0x2e30746e6174736e,0x2e00657261757173,0x696c5f6775626564,0x2e6c65722e00656e
.quad 0x696c5f6775626564,0x645f766e2e00656e,0x6e696c5f67756265,0x2e00737361735f65
.quad 0x645f766e2e6c6572,0x6e696c5f67756265,0x2e00737361735f65,0x67756265645f766e
.quad 0x7478745f7874705f,0x6c61632e766e2e00,0x2e0068706172676c,0x6f746f72702e766e
.quad 0x766e2e0065707974,0x7463612e6c65722e,0x68732e00006e6f69,0x2e00626174727473
.quad 0x2e00626174727473,0x2e006261746d7973,0x735f6261746d7973,0x766e2e0078646e68
.quad 0x742e006f666e692e,0x617571732e747865,0x692e766e2e006572,0x617571732e6f666e
.quad 0x732e766e2e006572,0x71732e6465726168,0x65722e0065726175,0x6e6f632e766e2e6c
.quad 0x732e30746e617473,0x6e2e006572617571,0x6174736e6f632e76,0x617571732e30746e
.quad 0x756265642e006572,0x2e00656e696c5f67,0x756265642e6c6572,0x2e00656e696c5f67
.quad 0x67756265645f766e,0x61735f656e696c5f,0x2e6c65722e007373,0x67756265645f766e
.quad 0x61735f656e696c5f,0x645f766e2e007373,0x7874705f67756265,0x766e2e007478745f
.quad 0x6172676c6c61632e,0x702e766e2e006870,0x657079746f746f72,0x6c65722e766e2e00
.quad 0x006e6f697463612e,0x0000657261757173,0x0000000000000000,0x0000000000000000
.quad 0x0000000000000000,0x000e000300000032,0x0000000000000000,0x0000000000000000
.quad 0x000d00030000007a,0x0000000000000000,0x0000000000000000,0x000400030000008f
.quad 0x0000000000000000,0x0000000000000000,0x00050003000000ab,0x0000000000000000
.quad 0x0000000000000000,0x00060003000000d7,0x0000000000000000,0x0000000000000000
.quad 0x00090003000000e9,0x0000000000000000,0x0000000000000000,0x000a000300000105
.quad 0x0000000000000000,0x0000000000000000,0x000e101200000114,0x0000000000000000
.quad 0x0000000000000100,0x0028000200000054,0x000a0efb01010000,0x0100000001010101
.quad 0x786500007070612f,0x75632e656c706d61,0xa206bbe7c3cd0100,0x0000000209000002
.quad 0x0301040000000000,0x0138020103f00101,0x0201030120020103,0x01010030020100e8
.quad 0x001000020000004c,0x000a0efb01010000,0x0100000001010101,0x0000000209000000
.quad 0x0300040000000000,0x08020b0308020112,0x0120020203f08001,0x011802010380f4f0
.quad 0x0403012002010380,0x0101003002012002,0x0000000000000000,0x6e6f69737265762e
.quad 0x61742e00352e3820,0x5f6d732074656772,0x726464612e003235,0x657a69735f737365
.quad 0x2e00000000343620,0x20656c6269736976,0x73207972746e652e,0x2e00286572617571
.quad 0x752e206d61726170,0x7261757173203436,0x5f6d617261705f65,0x617261702e002c30
.quad 0x73203233752e206d,0x61705f6572617571,0x002900315f6d6172,0x2e206765722e007b
.quad 0x7025092064657270,0x65722e003b3e323c,0x09203233622e2067,0x2e003b3e383c7225
.quad 0x3436622e20676572,0x3e353c6472250920,0x2e646c000000003b,0x36752e6d61726170
.quad 0x2c31647225092034,0x6572617571735b20,0x305f6d617261705f,0x61702e646c003b5d
.quad 0x203233752e6d6172,0x735b202c32722509,0x61705f6572617571,0x003b5d315f6d6172
.quad 0x3233752e766f6d00,0x25202c3372250920,0x003b782e6469746e,0x203233752e766f6d
.quad 0x6325202c34722509,0x003b782e64696174,0x203233752e766f6d,0x7425202c35722509
.quad 0x616d003b782e6469,0x3233732e6f6c2e64,0x25202c3172250920,0x2c347225202c3372
.quad 0x7300003b35722520,0x732e65672e707465,0x2c31702509203233,0x7225202c31722520
.quad 0x2031702540003b32,0x5f4c240920617262,0x003b325f3042425f,0x742e617476630000
.quad 0x6c61626f6c672e6f,0x722509203436752e,0x31647225202c3264,0x772e6c756d00003b
.quad 0x203233732e656469,0x25202c3364722509,0x61003b34202c3172,0x09203436732e6464
.quad 0x7225202c34647225,0x33647225202c3264,0x6f6c672e646c003b,0x203233752e6c6162
.quad 0x255b202c36722509,0x756d003b5d346472,0x3233732e6f6c2e6c,0x25202c3772250920
.quad 0x3b367225202c3672,0x626f6c672e747300,0x09203233752e6c61,0x202c5d346472255b
.quad 0x4c2400003b377225,0x3a325f3042425f5f,0x00003b7465720000,0x00082f040000007d
.quad 0x0000000700000008,0x0000000800081204,0x0008110400000000,0x0000000000000008
.quad 0x0000000800081204,0x0004370400000000,0x000030010000007d,0x00080a0400002a01
.quad 0x000c014000000002,0x000c1704000c1903,0x0008000100000000,0x000c17040011f000
.quad 0x0000000000000000,0x00ff1b030021f000,0x0000001000041d04,0x0000005800081c04
.quad 0x00000000000000d0,0x00000000ffffffff,0x00000000fffffffe,0x00000000fffffffd
.quad 0x00000000fffffffc,0x0000000000000073,0x3605002511000000,0x0000000000000035
.quad 0x0000000800000002,0x000000000000001d,0x0000000800000002,0x0000000000000000
.quad 0x0000000000000000,0x0000000000000000,0x0000000000000000,0x0000000000000000
.quad 0x0000000000000000,0x0000000000000000,0x0000000000000000,0x0000000000000000
.quad 0x0000000000000000,0x0000000000000000,0x0000000000000000,0x0000000000000000
.quad 0x0000000000000000,0x0000000000000000,0x0000000000000000,0x0000000000000000
.quad 0x0000000000000000,0x0000000000000000,0x0000000000000000,0x0000000000000000
.quad 0x0000000000000000,0x0000000000000000,0x0000000000000000,0x0000000000000000
.quad 0x0000000000000000,0x0000000000000000,0x0000000000000000,0x0000000000000000
.quad 0x0000000000000000,0x0000000000000000,0x0000000000000000,0x0000000000000000
.quad 0x0000000000000000,0x0000000000000000,0x0000000000000000,0x0000000000000000
.quad 0x0000000000000000,0x0000000000000000,0x0000000000000000,0x0000000000000000
.quad 0x0000000000000000,0x0000000000000000,0x0000000000000000,0x0000000000000000
.quad 0x001cfc00e22007f6,0x4c98078000870001,0xf0c8000002570002,0xf0c8000002170000
.quad 0x001fd842fec20ff1,0x4f107f8000270203,0x4e00000000270200,0x5b30001800370202
.quad 0x001ff400fd4007ed,0x4b6d038005270207,0x50b0000000070f00,0xe30000000000000f
.quad 0x001fc800fec207f1,0x3829000001e70200,0x4c18810005070202,0x4c10080005170003
.quad 0x001f98e0fe2007b5,0xeed4200000070200,0x5b007f8000070005,0x5b007fa800070006
.quad 0x001fbc00fe2007f2,0x5b30029800670000,0xeedc200000070200,0x50b0000000070f00
.quad 0x001ffc00ffe007ea,0x50b0000000070f00,0xe30000000007000f,0xe2400fffff87000f
.quad 0x001f8000fc0007e0,0x50b0000000070f00,0x50b0000000070f00,0x50b0000000070f00
.quad 0x0000000000000000,0x0000000000000000,0x0000000000000000,0x0000000000000000
.quad 0x0000000000000000,0x0000000000000000,0x0000000000000000,0x0000000000000000
.quad 0x0000000300000001,0x0000000000000000,0x0000000000000000,0x0000000000000040
.quad 0x0000000000000114,0x0000000000000000,0x0000000000000001,0x0000000000000000
.quad 0x000000030000000b,0x0000000000000000,0x0000000000000000,0x0000000000000154
.quad 0x000000000000011b,0x0000000000000000,0x0000000000000001,0x0000000000000000
.quad 0x0000000200000013,0x0000000000000000,0x0000000000000000,0x0000000000000270
.quad 0x00000000000000d8,0x0000000800000002,0x0000000000000008,0x0000000000000018
.quad 0x000000010000008f,0x0000000000000000,0x0000000000000000,0x0000000000000348
.quad 0x0000000000000058,0x0000000000000000,0x0000000000000001,0x0000000000000000
.quad 0x00000001000000ab,0x0000000000000000,0x0000000000000000,0x00000000000003a0
.quad 0x0000000000000050,0x0000000000000000,0x0000000000000001,0x0000000000000000
.quad 0x00000001000000d7,0x0000000000000000,0x0000000000000000,0x00000000000003f0
.quad 0x000000000000026a,0x0000000000000000,0x0000000000000001,0x0000000000000000
.quad 0x7000000000000029,0x0000000000000000,0x0000000000000000,0x000000000000065c
.quad 0x0000000000000030,0x0000000000000003,0x0000000000000004,0x0000000000000000
.quad 0x700000000000003f,0x0000000000000040,0x0000000000000000,0x000000000000068c
.quad 0x0000000000000058,0x0000000e00000003,0x0000000000000004,0x0000000000000000
.quad 0x70000001000000e9,0x0000000000000000,0x0000000000000000,0x00000000000006e4
.quad 0x0000000000000020,0x0000000000000003,0x0000000000000004,0x0000000000000008
.quad 0x7000000b00000105,0x0000000000000000,0x0000000000000000,0x0000000000000708
.quad 0x0000000000000010,0x0000000000000000,0x0000000000000008,0x0000000000000008
.quad 0x000000090000009b,0x0000000000000040,0x0000000000000000,0x0000000000000718
.quad 0x0000000000000010,0x0000000400000003,0x0000000000000008,0x0000000000000010
.quad 0x00000009000000bf,0x0000000000000040,0x0000000000000000,0x0000000000000728
.quad 0x0000000000000010,0x0000000500000003,0x0000000000000008,0x0000000000000010
.quad 0x0000000100000061,0x0000000000000042,0x0000000000000000,0x0000000000000738
.quad 0x000000000000014c,0x0000000e00000000,0x0000000000000004,0x0000000000000000
.quad 0x0000000100000032,0x0000000000000006,0x0000000000000000,0x00000000000008a0
.quad 0x0000000000000100,0x0700000800000003,0x0000000000000020,0x0000000000000000
.quad 0x0000000500000006,0x0000000000000d60,0x0000000000000000,0x0000000000000000
.quad 0x00000000000000a8,0x00000000000000a8,0x0000000000000008,0x0000000500000001
.quad 0x0000000000000738,0x0000000000000000,0x0000000000000000,0x0000000000000268
.quad 0x0000000000000268,0x0000000000000008,0x0000000500000001,0x0000000000000d60
.quad 0x0000000000000000,0x0000000000000000,0x00000000000000a8,0x00000000000000a8
.quad 0x0000000000000008,0x0000007001010001,0x00000000000001e0,0x00000050000001d9
.quad 0x0000003400080005,0x0000000f00000040,0x0000000000002011,0x0000000000000000
.quad 0x00000000000002bf,0x6178652f7070612f,0x0075632e656c706d,0x0000001600000058
.quad 0x72656e65672d2d20,0x656e696c2d657461,0x0000206f666e692d,0x1ef300032f2f0a3c
.quad 0x6f69737265762e0a,0x742e0a352e38206e,0x6d73207465677261,0x6464612e0a32355f
.quad 0x7a69735f73736572,0xf300310a34362065,0x20656c6269736912,0x73207972746e652e
.quad 0x2e0a286572617571,0x752e206d61726170,0x00125f1100143436,0x332a001c2c305f36
.quad 0x290a3108f3001c32,0x206765722e0a7b0a,0x702520646572702e,0x628600123b3e323c
.quad 0x11383c7225203233,0x6472252034368000,0x636f6ce20012353c,0x0a0a302032203109
.quad 0x752e220063646c0a,0x855b202c314a0022,0x580100253b5d2700,0x5d31250024321c00
.quad 0x6d0a352033730056,0x202c33b7002a766f,0x3b782e6469746e25,0x6325202c347c0016
.quad 0x202c354400176174,0x6c2e646171002c25,0x2c31230018732e6f,0x357225003901004a
.quad 0x730a352034b2006c,0x002b65672e707465,0x08f800302c317034,0x20317025400a3b32
.quad 0x5f5f4c2420617262,0xa70a3b325f304242,0x742e61747663e400,0x6c61626f6c672e6f
.quad 0x0501072c32210101,0x6d0a392035b30065,0x00666469772e6c75,0x3482006733647233
.quad 0x3a732e6464610a3b,0x331100402c342600,0x003601005a040135,0x2100265b202c3640
.quad 0x2200df0500503b5d,0x36722578001d2c37,0x3002003574730a3b,0x0a3b3745001b0000
.quad 0xf0009c3a1400c70a,0x7465720a31203619,0x662e0a0a7d0a0a3b,0x2f22203109656c69
.quad 0x6d6178652f707061, 0x0a2275632e656c70, 0x0000000000000000

// Descriptor whose address __sti____cudaRegisterAll passes to
// __cudaRegisterFatBinary. It points at fatbinData, the .quad table holding
// the device code: its bytes include an ELF header ("\x7fELF") and PTX text
// (".version 8.5", ".target sm_52").
// NOTE(review): the meanings of the first, second and last fields are assumed
// from the usual fat-binary wrapper layout — confirm against the toolkit headers.
__fatDeviceText:
        .long   1180844977 // 0x466243B1; presumably the wrapper magic number
        .long   1 // presumably the wrapper version
        .quad   fatbinData // address of the embedded fat binary image
        .quad   0 // unused here (zero)
// Host-side launch stub for the kernel. It stores pointers to its own copies
// of the kernel arguments in a small array, fetches the launch configuration
// through __cudaPopCallConfiguration, and forwards everything to
// cudaLaunchKernel<char>. Nothing is launched if the pop call returns nonzero.
// Frame layout:
//   -88        array                    -92   n
//   -32 / -24  args[0] / args[1]        -4    number of args stored so far
//   [-44, -32) dim3 handed out as 1st out-parameter (gridDim), preset to (1,1,1)
//   [-56, -44) dim3 handed out as 2nd out-parameter (blockDim), preset to (1,1,1)
//   -64        shared-memory size out-parameter
//   -72        stream out-parameter
// The four out-parameters are presumably filled in by
// __cudaPopCallConfiguration with the values pushed in square_do.
__device_stub__Z6squarePii(int*, int):
        pushq   %rbp
        movq    %rsp, %rbp
        subq    $96, %rsp
        movq    %rdi, -88(%rbp) // local copy of array
        movl    %esi, -92(%rbp) // local copy of n
        movl    $0, -4(%rbp) // argument counter = 0
        movl    -4(%rbp), %eax
        cltq
        leaq    -88(%rbp), %rdx
        movq    %rdx, -32(%rbp,%rax,8) // args[0] = &array (slot at %rbp-32)
        addl    $1, -4(%rbp)
        movl    -4(%rbp), %eax
        cltq
        leaq    -92(%rbp), %rdx 
        movq    %rdx, -32(%rbp,%rax,8) // args[1] = &n (slot at %rbp-24)
        addl    $1, -4(%rbp) // counter is now 2: the number of kernel parameters
        movq    $square, __device_stub__Z6squarePii(int*, int)::__f(%rip) // store the address of the host stub `square` in the function-local static __f
        movl    $1, -44(%rbp)
        movl    $1, -40(%rbp)
        movl    $1, -36(%rbp) // dim3 at [-44, -32) = (1, 1, 1)
        movl    $1, -56(%rbp)
        movl    $1, -52(%rbp)
        movl    $1, -48(%rbp) // dim3 at [-56, -44) = (1, 1, 1)
        leaq    -72(%rbp), %rcx // 4th argument: &stream
        leaq    -64(%rbp), %rdx // 3rd argument: &sharedMemSize
        leaq    -56(%rbp), %rsi // 2nd argument: &blockDim
        leaq    -44(%rbp), %rax // 1st argument: &gridDim
        movq    %rax, %rdi
        call    __cudaPopCallConfiguration // a nonzero return skips the launch below
        testl   %eax, %eax  
        setne   %al // %al = (return value != 0)
        testb   %al, %al // test that boolean
        jne     .L10 // nonzero return: leave without launching
        cmpl    $0, -4(%rbp) // were any arguments recorded?
        jne     .L13 // yes (the counter is 2 in this stub) -> .L13
        movq    -72(%rbp), %rdi // stream value from the out-parameter slot
        movq    -64(%rbp), %rsi // shared-memory size from the out-parameter slot
        leaq    -32(%rbp), %rdx // &args[0]
        movl    -4(%rbp), %eax 
        cltq
        salq    $3, %rax // counter * 8 (bytes per pointer)
        leaq    (%rdx,%rax), %r9 // void** args = &args[counter]; with counter == 0 this equals &args[0]
        movq    -56(%rbp), %rcx // blockDim.x and .y (one 64-bit load of two 32-bit fields)
        movl    -48(%rbp), %r8d // blockDim.z
        movq    -44(%rbp), %rdx // gridDim.x and .y
        movl    -36(%rbp), %eax // gridDim.z
        pushq   %rdi // stack argument 2: stream
        pushq   %rsi // stack argument 1: shared-memory size
        movq    %rdx, %rsi // gridDim.x/.y -> second integer register
        movl    %eax, %edx // gridDim.z -> third integer register
        movl    $square, %edi // 1st argument: address of host stub `square` (the same address given to __cudaRegisterFunction)
        call    cudaError cudaLaunchKernel<char>(char const*, dim3, dim3, void**, unsigned long, CUstream_st*)
        addq    $16, %rsp // drop the two pushed stack arguments
        jmp     .L10
.L13: // counter != 0: same call as above, except the args pointer is taken directly as &args[0]
        movq    -72(%rbp), %rdi // stream
        movq    -64(%rbp), %rsi // shared-memory size
        leaq    -32(%rbp), %r9 // void** args = &args[0]
        movq    -56(%rbp), %rcx // blockDim.x/.y
        movl    -48(%rbp), %r8d // blockDim.z
        movq    -44(%rbp), %rdx // gridDim.x/.y
        movl    -36(%rbp), %eax // gridDim.z
        pushq   %rdi // stack argument 2: stream
        pushq   %rsi // stack argument 1: shared-memory size
        movq    %rdx, %rsi // gridDim.x/.y
        movl    %eax, %edx // gridDim.z
        movl    $square, %edi // 1st argument: address of host stub `square`
        call    cudaError cudaLaunchKernel<char>(char const*, dim3, dim3, void**, unsigned long, CUstream_st*)
        addq    $16, %rsp // drop the two pushed stack arguments
.L10:
        leave
        ret
// Host-side function emitted under the kernel's own name. It does no work
// itself: it passes (array, n) unchanged to __device_stub__Z6squarePii.
// Its address is also what the stub and the registration code use as `$square`.
square:
        pushq   %rbp
        movq    %rsp, %rbp
        subq    $16, %rsp
        movq    %rdi, -8(%rbp) // array
        movl    %esi, -12(%rbp) // n
        movl    -12(%rbp), %edx
        movq    -8(%rbp), %rax
        movl    %edx, %esi // second parameter: n
        movq    %rax, %rdi // first parameter: array
        call    __device_stub__Z6squarePii(int*, int)
        nop
        leave
        ret
// Kernel name string; __nv_cudaEntityRegisterCallback passes its address twice
// (in %rdx and %rcx) to __cudaRegisterFunction.
.LC0:
        .string "square"
// Registration callback, invoked by __sti____cudaRegisterAll with the fat
// binary handle. It remembers the handle (function-local static __ref and
// __nv_save_fatbinhandle_for_managed_rt) and then registers the kernel by
// calling __cudaRegisterFunction with the handle, the host stub address and
// the name string "square".
// NOTE(review): the meaning of the trailing arguments (-1, 0 and the four
// stack zeros) is not visible here — confirm against the runtime headers.
__nv_cudaEntityRegisterCallback(void**):
        pushq   %rbp
        movq    %rsp, %rbp
        subq    $16, %rsp
        movq    %rdi, -8(%rbp) // handle
        movq    -8(%rbp), %rax
        movq    %rax, __nv_cudaEntityRegisterCallback(void**)::__ref(%rip) // __ref = handle
        movq    -8(%rbp), %rax
        movq    %rax, %rdi
        call    __nv_save_fatbinhandle_for_managed_rt(void**)
        movq    -8(%rbp), %rax
        pushq   $0 // stack arguments 7-10 are all zero
        pushq   $0
        pushq   $0
        pushq   $0
        movl    $0, %r9d // 6th argument = 0
        movl    $-1, %r8d // 5th argument = -1
        movl    $.LC0, %ecx // 4th argument: "square"
        movl    $.LC0, %edx // 3rd argument: "square"
        movl    $square, %esi // 2nd argument: address of host stub `square`
        movq    %rax, %rdi // 1st argument: handle
        call    __cudaRegisterFunction
        addq    $32, %rsp // drop the four pushed stack arguments
        nop
        leave
        ret
// Registration sequence for this translation unit:
//   1. __cudaRegisterFatBinary(&__fatDeviceText); the result is kept in the
//      global __cudaFatCubinHandle.
//   2. __nv_cudaEntityRegisterCallback(handle), called through a pointer.
//   3. __cudaRegisterFatBinaryEnd(handle).
//   4. atexit(__cudaUnregisterBinaryUtil) to schedule the matching cleanup.
// Presumably run as a static initializer before main — its caller is not part
// of this listing.
__sti____cudaRegisterAll():
        pushq   %rbp
        movq    %rsp, %rbp
        subq    $16, %rsp
        movl    $__fatDeviceText, %edi // address of the fat binary descriptor
        call    __cudaRegisterFatBinary
        movq    %rax, __cudaFatCubinHandle(%rip) // remember the returned handle
        movq    $__nv_cudaEntityRegisterCallback(void**), -8(%rbp) // callback pointer kept in a local
        movq    __cudaFatCubinHandle(%rip), %rax
        movq    -8(%rbp), %rdx
        movq    %rax, %rdi // argument: handle
        call    *%rdx // indirect call to the callback
        movq    __cudaFatCubinHandle(%rip), %rax
        movq    %rax, %rdi
        call    __cudaRegisterFatBinaryEnd
        movl    $__cudaUnregisterBinaryUtil(), %edi
        call    atexit // run the unregister routine at process exit
        nop
        leave
        ret
// Template wrapper cudaLaunchKernel<char>: spills its arguments, reloads them
// into the same argument positions and calls the plain cudaLaunchKernel entry
// point; the callee's return value in %eax is passed back unchanged.
// In:  %rdi       = func (char const*)
//      %rsi:%edx  = first dim3  (x/.y in %rsi, z in %edx)
//      %rcx:%r8d  = second dim3 (x/.y in %rcx, z in %r8d)
//      %r9        = args (void**)
//      16(%rbp)   = unsigned long stack argument (shared-memory size)
//      24(%rbp)   = CUstream_st* stack argument (stream)
cudaError cudaLaunchKernel<char>(char const*, dim3, dim3, void**, unsigned long, CUstream_st*):
        pushq   %rbp
        movq    %rsp, %rbp
        subq    $48, %rsp
        movq    %rdi, -8(%rbp) // func -> %rbp-8
        movq    %rcx, %rax // second dim3, x/.y (moved aside before %rcx is reused)
        movl    %r8d, %ecx // second dim3, z
        movq    %r9, -48(%rbp) // args -> %rbp-48
        movq    %rsi, -24(%rbp) // first dim3, x/.y -> %rbp-24
        movl    %edx, -16(%rbp) // first dim3, z -> %rbp-16
        movq    %rax, -40(%rbp) // second dim3, x/.y -> %rbp-40
        movl    %ecx, -32(%rbp) // second dim3, z -> %rbp-32
        movq    -48(%rbp), %r8 // args (temporarily in %r8)
        movq    -40(%rbp), %rcx // second dim3, x/.y: already in its final register
        movl    -32(%rbp), %edi // second dim3, z (temporarily in %edi)
        movq    -24(%rbp), %rsi // first dim3, x/.y: final register
        movl    -16(%rbp), %edx // first dim3, z: final register
        movq    -8(%rbp), %rax // func (temporarily in %rax)
        pushq   24(%rbp) // re-push stack argument 2: stream
        pushq   16(%rbp) // re-push stack argument 1: shared-memory size
        movq    %r8, %r9 // args -> 6th integer register
        movl    %edi, %r8d // second dim3, z -> 5th integer register
        movq    %rax, %rdi // func -> 1st integer register
        call    cudaLaunchKernel
        addq    $16, %rsp // drop the two re-pushed stack arguments
        leave
        ret


标签:kernel,movl,rbp,反汇编,quad,0x0000000000000000,movq,rax,cuda
From: https://www.cnblogs.com/zwlwf/p/18653990

相关文章

  • Linux服务器无Root权限安装Cuda方法及问题解决
    CUDA 简介 什么是 CUDA?CUDA(Compute Unified Device Architecture)是由 NVIDIA 提供的一种并行计算平台和编程模型,用于加速计算密集型任务。CUDA 允许开发者使用 GPU 的计算能力,通过并行处理来快速执行复杂的计算任务。CUDA 包括以下主要组成部分:CUDA Toolkit:为开发人员提供工......
  • 深度学习CUDA环境安装教程---动手学深度学习
    首先说明,我安装的是《动手学深度学习》中的环境。本人是小白,一次安装,可能有不对的地方,望包涵。安装 CUDA:因为我们是深度学习,很多时候要用到 GPU 进行训练,所以我们需要一种方式加快训练速度。通俗地说,CUDA 是一种协助“CPU 任务分发 + GPU 并行处理”的编程模型/平台,用于加速 GPU 和 CPU 之......
  • 高性能计算-CUDA矩阵加法及优化测试
    1.目标:对16384*16384规模的矩阵进行加法运算,对比CPU和GPU计算的效率,还有不同线程块大小规模下对效率的影响;并做可能的优化测试。2.核心代码/*用GPU对二维矩阵做加法,分析不同线程块规模下的性能变化*/#include<stdio.h>#include<stdlib.h>#include<sys/time.h>#......
  • CUDA编程【5】获取GPU设备信息
    文章目录通过cudaAPI获取1.获取设备数量2.获取当前设备ID3.设置当前设备4.获取设备属性5.获取设备限制6.获取设备共享内存配置7.获取设备缓存配置8.获取设备是否支持统一内存9.获取设备是否支持并发内核执行10.获取设备的最大线程块数11.获取设备的时钟频率......
  • 编译CUDA时的ARCH参数
    https://blog.csdn.net/Vingnir/article/details/135255072 在编译 CUDA 程序时,ARCH 是指定给 nvcc(NVIDIA CUDA Compiler)的一个重要参数。ARCH 代表着目标 GPU 的计算能力(Compute Capability),这是一个特定于 NVIDIA GPU 架构的指标,用于表明 GPU 支持的特性和指令集。关于 CUDA 计算能力(Com......
  • 折腾笔记[4]-cuda的hello-world
    摘要 在 Windows 11 上搭建 CUDA 开发环境并编译 hello world 程序;关键信息 编译器:CUDA nvcc 12.4.131 平台:Windows 11 原理简介 CUDA 简介 CUDA(Compute Unified Device Architecture,统一计算架构)是由英伟达所推出的一种集成技术,向用户提供了可以很优雅地调用 GPU 进行并行计算的编程......
  • 利用CUDA编程实现在GPU中对图像的极坐标变换加速
    问题来源:1.需要对输入图像中的一个环形区域,进行极坐标逆变换,将该环形区域转换为一张新的矩形图像2.opencv没有直接对环形区域图像进行变换的函数,需要通过循环遍历的方式,利用polarToCart进行转换3.循环遍历不可避免的带来速度上的问题,尤其是图片较大时解决思路1:使用open......
  • AI Agent系列-什么是AI智能体,使用Semantic Kernel开发一个AI Agent
    今年最热的技术除了LLM大语言模型外,AIAgent智能体成为下一个最热的技术发展热点。、近期准备整理几篇AI智能体的博客,带着大家了解并学习AI智能体的开发和应用。一、什么是AI智能体AI智能体(AIAgent)是指一个由人工智能驱动的系统或程序,能够在一定的环境中自主感知、决策和执......
  • 【CUDA】cuDNN:加速深度学习的核心库
    【CUDA】cuDNN:加速深度学习的核心库1.什么是cuDNN?cuDNN(CUDADeepNeuralNetworklibrary)是NVIDIA提供的一个高性能GPU加速库,专为深度学习任务中常见的操作进行了高度优化。它不仅提供了单操作的高效实现,还支持多操作融合(fusion),旨在最大化地利用NVIDIAGPU的计算......
  • 【STS测试】failure of android.security.sts.KernelLtsTest#testRequiredKernelLts_W
    总结:获取以下信息:1.安全补丁版本spl:2024-12-052.kernel版本5.15.1483.确认spl+6个月是否在【kernel-lifetimes.xml】中对应版本的生命周期之内,如果不在,则报出异常逻辑:确保安全补丁版本+6个月之后,仍旧在kernel的生命周期之内。也就是kernel版本该升级就升级吧。----------......