首页 > 编程语言 >AMD hipcc 生成各个gpu 微架构汇编语言代码的方法示例

AMD hipcc 生成各个gpu 微架构汇编语言代码的方法示例

时间:2024-03-30 22:33:56浏览次数:31  
标签:HIP __ 示例 AMD amdhsa ZN17 offset hipcc size

1,gpu vectorAdd 示例

为了简化逻辑,故假设 vector 的 size 与运行配置的thread个熟正好一样多,比如都是512之类的.

1.1 源码

vectorAdd.hip

#include <stdio.h>
#include <hip/hip_runtime.h>

__global__ void vectorAdd(const float *A, const float *B, float *C) {
  int i = blockDim.x * blockIdx.x + threadIdx.x;

  C[i] = A[i] + B[i] + 0.0f;
}

int main(void) {
  hipError_t err = hipSuccess;
  int numElements = 512;
  size_t size = numElements * sizeof(float);
  printf("[Vector addition of %d elements]\n", numElements);

  float *h_A = (float *)malloc(size);
  float *h_B = (float *)malloc(size);
  float *h_C = (float *)malloc(size);

  if (h_A == NULL || h_B == NULL || h_C == NULL) {
    fprintf(stderr, "Failed to allocate host vectors!\n");
    exit(EXIT_FAILURE);
  }

  for (int i = 0; i < numElements; ++i) {
    h_A[i] = rand() / (float)RAND_MAX;
    h_B[i] = rand() / (float)RAND_MAX;
  }

  float *d_A = NULL;
  err = hipMalloc((void **)&d_A, size);

  if (err != hipSuccess) {
    fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n",
            hipGetErrorString(err));
    exit(EXIT_FAILURE);
  }

  float *d_B = NULL;
  err = hipMalloc((void **)&d_B, size);

  if (err != hipSuccess) {
    fprintf(stderr, "Failed to allocate device vector B (error code %s)!\n",
            hipGetErrorString(err));
    exit(EXIT_FAILURE);
  }

  float *d_C = NULL;
  err = hipMalloc((void **)&d_C, size);

  if (err != hipSuccess) {
    fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n",
            hipGetErrorString(err));
    exit(EXIT_FAILURE);
  }

  printf("Copy input data from the host memory to the CUDA device\n");
  err = hipMemcpy(d_A, h_A, size, hipMemcpyHostToDevice);

  if (err != hipSuccess) {
    fprintf(stderr,
            "Failed to copy vector A from host to device (error code %s)!\n",
            hipGetErrorString(err));
    exit(EXIT_FAILURE);
  }

  err = hipMemcpy(d_B, h_B, size, hipMemcpyHostToDevice);

  if (err != hipSuccess) {
    fprintf(stderr,
            "Failed to copy vector B from host to device (error code %s)!\n",
            hipGetErrorString(err));
    exit(EXIT_FAILURE);
  }

  int threadsPerBlock = 256;
  int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
  printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid,
         threadsPerBlock);
  vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C);
  err = hipGetLastError();

  if (err != hipSuccess) {
    fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n",
            hipGetErrorString(err));
    exit(EXIT_FAILURE);
  }

  printf("Copy output data from the CUDA device to the host memory\n");
  err = hipMemcpy(h_C, d_C, size, hipMemcpyDeviceToHost);

  if (err != hipSuccess) {
    fprintf(stderr,
            "Failed to copy vector C from device to host (error code %s)!\n",
            hipGetErrorString(err));
    exit(EXIT_FAILURE);
  }

  for (int i = 0; i < numElements; ++i) {
    if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5) {
      fprintf(stderr, "Result verification failed at element %d!\n", i);
      exit(EXIT_FAILURE);
    }
  }

  printf("Test PASSED\n");

  err = hipFree(d_A);

  if (err != hipSuccess) {
    fprintf(stderr, "Failed to free device vector A (error code %s)!\n",
            hipGetErrorString(err));
    exit(EXIT_FAILURE);
  }

  err = hipFree(d_B);

  if (err != hipSuccess) {
    fprintf(stderr, "Failed to free device vector B (error code %s)!\n",
            hipGetErrorString(err));
    exit(EXIT_FAILURE);
  }

  err = hipFree(d_C);

  if (err != hipSuccess) {
    fprintf(stderr, "Failed to free device vector C (error code %s)!\n",
            hipGetErrorString(err));
    exit(EXIT_FAILURE);
  }

  free(h_A);
  free(h_B);
  free(h_C);

  printf("Done\n");
  return 0;
}

Makefile

all: vectorAdd.hip
	hipcc $< -o vectorAdd
	hipcc -S --cuda-device-only --offload-arch=gfx906 --offload-arch=gfx908 $<

.PHONY: clean
clean:
	rm -rf *.s vectorAdd


# --offload-arch=gfx803 
# --offload-arch=gfx900 
# --offload-arch=gfx906 
# --offload-arch=gfx908 
# --offload-arch=gfx90a 
# --offload-arch=gfx942 
# --offload-arch=gfx1030 
# --offload-arch=gfx1100 
# --offload-arch=gfx1101 
# --offload-arch=gfx1102 

1.2  编译运行

make

2,汇编代码

这句命令可以帮助生成汇编代码:

hipcc -S --cuda-device-only --offload-arch=gfx906 --offload-arch=gfx908 $<

offload-arch 帮助指定 gpu 的微架构,

vectorAdd-hip-amdgcn-amd-amdhsa-gfx906.s :

	.text
	.amdgcn_target "amdgcn-amd-amdhsa--gfx906"
	.protected	_Z9vectorAddPKfS0_Pf    ; -- Begin function _Z9vectorAddPKfS0_Pf
	.globl	_Z9vectorAddPKfS0_Pf
	.p2align	8
	.type	_Z9vectorAddPKfS0_Pf,@function
_Z9vectorAddPKfS0_Pf:                   ; @_Z9vectorAddPKfS0_Pf
; %bb.0:
	s_load_dword s7, s[4:5], 0x24
	s_load_dwordx4 s[0:3], s[4:5], 0x0
	s_load_dwordx2 s[8:9], s[4:5], 0x10
	s_waitcnt lgkmcnt(0)
	s_and_b32 s4, s7, 0xffff
	s_mul_i32 s6, s6, s4
	v_add_u32_e32 v0, s6, v0
	v_ashrrev_i32_e32 v1, 31, v0
	v_lshlrev_b64 v[0:1], 2, v[0:1]
	v_mov_b32_e32 v3, s1
	v_add_co_u32_e32 v2, vcc, s0, v0
	v_addc_co_u32_e32 v3, vcc, v3, v1, vcc
	global_load_dword v4, v[2:3], off
	v_mov_b32_e32 v3, s3
	v_add_co_u32_e32 v2, vcc, s2, v0
	v_addc_co_u32_e32 v3, vcc, v3, v1, vcc
	global_load_dword v2, v[2:3], off
	v_mov_b32_e32 v3, s9
	v_add_co_u32_e32 v0, vcc, s8, v0
	v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
	s_waitcnt vmcnt(0)
	v_add_f32_e32 v2, v4, v2
	v_add_f32_e32 v2, 0, v2
	global_store_dword v[0:1], v2, off
	s_endpgm
	.section	.rodata,#alloc
	.p2align	6, 0x0
	.amdhsa_kernel _Z9vectorAddPKfS0_Pf
		.amdhsa_group_segment_fixed_size 0
		.amdhsa_private_segment_fixed_size 0
		.amdhsa_kernarg_size 280
		.amdhsa_user_sgpr_count 6
		.amdhsa_user_sgpr_private_segment_buffer 1
		.amdhsa_user_sgpr_dispatch_ptr 0
		.amdhsa_user_sgpr_queue_ptr 0
		.amdhsa_user_sgpr_kernarg_segment_ptr 1
		.amdhsa_user_sgpr_dispatch_id 0
		.amdhsa_user_sgpr_flat_scratch_init 0
		.amdhsa_user_sgpr_private_segment_size 0
		.amdhsa_uses_dynamic_stack 0
		.amdhsa_system_sgpr_private_segment_wavefront_offset 0
		.amdhsa_system_sgpr_workgroup_id_x 1
		.amdhsa_system_sgpr_workgroup_id_y 0
		.amdhsa_system_sgpr_workgroup_id_z 0
		.amdhsa_system_sgpr_workgroup_info 0
		.amdhsa_system_vgpr_workitem_id 0
		.amdhsa_next_free_vgpr 5
		.amdhsa_next_free_sgpr 10
		.amdhsa_reserve_flat_scratch 0
		.amdhsa_reserve_xnack_mask 1
		.amdhsa_float_round_mode_32 0
		.amdhsa_float_round_mode_16_64 0
		.amdhsa_float_denorm_mode_32 3
		.amdhsa_float_denorm_mode_16_64 3
		.amdhsa_dx10_clamp 1
		.amdhsa_ieee_mode 1
		.amdhsa_fp16_overflow 0
		.amdhsa_exception_fp_ieee_invalid_op 0
		.amdhsa_exception_fp_denorm_src 0
		.amdhsa_exception_fp_ieee_div_zero 0
		.amdhsa_exception_fp_ieee_overflow 0
		.amdhsa_exception_fp_ieee_underflow 0
		.amdhsa_exception_fp_ieee_inexact 0
		.amdhsa_exception_int_div_zero 0
	.end_amdhsa_kernel
	.text
.Lfunc_end0:
	.size	_Z9vectorAddPKfS0_Pf, .Lfunc_end0-_Z9vectorAddPKfS0_Pf
                                        ; -- End function
	.section	.AMDGPU.csdata
; Kernel info:
; codeLenInByte = 132
; NumSgprs: 14
; NumVgprs: 5
; ScratchSize: 0
; MemoryBound: 0
; FloatMode: 240
; IeeeMode: 1
; LDSByteSize: 0 bytes/workgroup (compile time only)
; SGPRBlocks: 1
; VGPRBlocks: 1
; NumSGPRsForWavesPerEU: 14
; NumVGPRsForWavesPerEU: 5
; Occupancy: 8
; WaveLimiterHint : 1
; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0
; COMPUTE_PGM_RSRC2:USER_SGPR: 6
; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0
; COMPUTE_PGM_RSRC2:TGID_X_EN: 1
; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
	.protected	_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE ; @_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE
	.type	_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE,@object
	.section	.rodata._ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE,#alloc
	.weak	_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE
_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE:
	.zero	1
	.size	_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE, 1

	.protected	_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE ; @_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE
	.type	_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE,@object
	.section	.rodata._ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE,#alloc
	.weak	_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE
_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE:
	.zero	1
	.size	_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE, 1

	.protected	_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE ; @_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE
	.type	_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE,@object
	.section	.rodata._ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE,#alloc
	.weak	_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE
_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE:
	.zero	1
	.size	_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE, 1

	.ident	"AMD clang version 17.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-6.0.2 24012 af27734ed982b52a9f1be0f035ac91726fc697e4)"
	.section	".note.GNU-stack"
	.addrsig
	.amdgpu_metadata
---
amdhsa.kernels:
  - .args:
      - .address_space:  global
        .offset:         0
        .size:           8
        .value_kind:     global_buffer
      - .address_space:  global
        .offset:         8
        .size:           8
        .value_kind:     global_buffer
      - .address_space:  global
        .offset:         16
        .size:           8
        .value_kind:     global_buffer
      - .offset:         24
        .size:           4
        .value_kind:     hidden_block_count_x
      - .offset:         28
        .size:           4
        .value_kind:     hidden_block_count_y
      - .offset:         32
        .size:           4
        .value_kind:     hidden_block_count_z
      - .offset:         36
        .size:           2
        .value_kind:     hidden_group_size_x
      - .offset:         38
        .size:           2
        .value_kind:     hidden_group_size_y
      - .offset:         40
        .size:           2
        .value_kind:     hidden_group_size_z
      - .offset:         42
        .size:           2
        .value_kind:     hidden_remainder_x
      - .offset:         44
        .size:           2
        .value_kind:     hidden_remainder_y
      - .offset:         46
        .size:           2
        .value_kind:     hidden_remainder_z
      - .offset:         64
        .size:           8
        .value_kind:     hidden_global_offset_x
      - .offset:         72
        .size:           8
        .value_kind:     hidden_global_offset_y
      - .offset:         80
        .size:           8
        .value_kind:     hidden_global_offset_z
      - .offset:         88
        .size:           2
        .value_kind:     hidden_grid_dims
    .group_segment_fixed_size: 0
    .kernarg_segment_align: 8
    .kernarg_segment_size: 280
    .language:       OpenCL C
    .language_version:
      - 2
      - 0
    .max_flat_workgroup_size: 1024
    .name:           _Z9vectorAddPKfS0_Pf
    .private_segment_fixed_size: 0
    .sgpr_count:     14
    .sgpr_spill_count: 0
    .symbol:         _Z9vectorAddPKfS0_Pf.kd
    .uniform_work_group_size: 1
    .uses_dynamic_stack: false
    .vgpr_count:     5
    .vgpr_spill_count: 0
    .wavefront_size: 64
amdhsa.target:   amdgcn-amd-amdhsa--gfx906
amdhsa.version:
  - 1
  - 2
...

	.end_amdgpu_metadata

vectorAdd-hip-amdgcn-amd-amdhsa-gfx908.s :


	.text
	.amdgcn_target "amdgcn-amd-amdhsa--gfx908"
	.protected	_Z9vectorAddPKfS0_Pf    ; -- Begin function _Z9vectorAddPKfS0_Pf
	.globl	_Z9vectorAddPKfS0_Pf
	.p2align	8
	.type	_Z9vectorAddPKfS0_Pf,@function
_Z9vectorAddPKfS0_Pf:                   ; @_Z9vectorAddPKfS0_Pf
; %bb.0:
	s_load_dword s7, s[4:5], 0x24
	s_load_dwordx4 s[0:3], s[4:5], 0x0
	s_load_dwordx2 s[8:9], s[4:5], 0x10
	s_waitcnt lgkmcnt(0)
	s_and_b32 s4, s7, 0xffff
	s_mul_i32 s6, s6, s4
	v_add_u32_e32 v0, s6, v0
	v_ashrrev_i32_e32 v1, 31, v0
	v_lshlrev_b64 v[0:1], 2, v[0:1]
	v_mov_b32_e32 v3, s1
	v_add_co_u32_e32 v2, vcc, s0, v0
	v_addc_co_u32_e32 v3, vcc, v3, v1, vcc
	global_load_dword v4, v[2:3], off
	v_mov_b32_e32 v3, s3
	v_add_co_u32_e32 v2, vcc, s2, v0
	v_addc_co_u32_e32 v3, vcc, v3, v1, vcc
	global_load_dword v2, v[2:3], off
	v_mov_b32_e32 v3, s9
	v_add_co_u32_e32 v0, vcc, s8, v0
	v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
	s_waitcnt vmcnt(0)
	v_add_f32_e32 v2, v4, v2
	v_add_f32_e32 v2, 0, v2
	global_store_dword v[0:1], v2, off
	s_endpgm
	.section	.rodata,#alloc
	.p2align	6, 0x0
	.amdhsa_kernel _Z9vectorAddPKfS0_Pf
		.amdhsa_group_segment_fixed_size 0
		.amdhsa_private_segment_fixed_size 0
		.amdhsa_kernarg_size 280
		.amdhsa_user_sgpr_count 6
		.amdhsa_user_sgpr_private_segment_buffer 1
		.amdhsa_user_sgpr_dispatch_ptr 0
		.amdhsa_user_sgpr_queue_ptr 0
		.amdhsa_user_sgpr_kernarg_segment_ptr 1
		.amdhsa_user_sgpr_dispatch_id 0
		.amdhsa_user_sgpr_flat_scratch_init 0
		.amdhsa_user_sgpr_private_segment_size 0
		.amdhsa_uses_dynamic_stack 0
		.amdhsa_system_sgpr_private_segment_wavefront_offset 0
		.amdhsa_system_sgpr_workgroup_id_x 1
		.amdhsa_system_sgpr_workgroup_id_y 0
		.amdhsa_system_sgpr_workgroup_id_z 0
		.amdhsa_system_sgpr_workgroup_info 0
		.amdhsa_system_vgpr_workitem_id 0
		.amdhsa_next_free_vgpr 5
		.amdhsa_next_free_sgpr 10
		.amdhsa_reserve_flat_scratch 0
		.amdhsa_reserve_xnack_mask 1
		.amdhsa_float_round_mode_32 0
		.amdhsa_float_round_mode_16_64 0
		.amdhsa_float_denorm_mode_32 3
		.amdhsa_float_denorm_mode_16_64 3
		.amdhsa_dx10_clamp 1
		.amdhsa_ieee_mode 1
		.amdhsa_fp16_overflow 0
		.amdhsa_exception_fp_ieee_invalid_op 0
		.amdhsa_exception_fp_denorm_src 0
		.amdhsa_exception_fp_ieee_div_zero 0
		.amdhsa_exception_fp_ieee_overflow 0
		.amdhsa_exception_fp_ieee_underflow 0
		.amdhsa_exception_fp_ieee_inexact 0
		.amdhsa_exception_int_div_zero 0
	.end_amdhsa_kernel
	.text
.Lfunc_end0:
	.size	_Z9vectorAddPKfS0_Pf, .Lfunc_end0-_Z9vectorAddPKfS0_Pf
                                        ; -- End function
	.section	.AMDGPU.csdata
; Kernel info:
; codeLenInByte = 132
; NumSgprs: 14
; NumVgprs: 5
; NumAgprs: 0
; TotalNumVgprs: 5
; ScratchSize: 0
; MemoryBound: 0
; FloatMode: 240
; IeeeMode: 1
; LDSByteSize: 0 bytes/workgroup (compile time only)
; SGPRBlocks: 1
; VGPRBlocks: 1
; NumSGPRsForWavesPerEU: 14
; NumVGPRsForWavesPerEU: 5
; Occupancy: 8
; WaveLimiterHint : 1
; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0
; COMPUTE_PGM_RSRC2:USER_SGPR: 6
; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0
; COMPUTE_PGM_RSRC2:TGID_X_EN: 1
; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
	.protected	_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE ; @_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE
	.type	_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE,@object
	.section	.rodata._ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE,#alloc
	.weak	_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE
_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE:
	.zero	1
	.size	_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE, 1

	.protected	_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE ; @_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE
	.type	_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE,@object
	.section	.rodata._ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE,#alloc
	.weak	_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE
_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE:
	.zero	1
	.size	_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE, 1

	.protected	_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE ; @_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE
	.type	_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE,@object
	.section	.rodata._ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE,#alloc
	.weak	_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE
_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE:
	.zero	1
	.size	_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE, 1

	.ident	"AMD clang version 17.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-6.0.2 24012 af27734ed982b52a9f1be0f035ac91726fc697e4)"
	.section	".note.GNU-stack"
	.addrsig
	.amdgpu_metadata
---
amdhsa.kernels:
  - .agpr_count:     0
    .args:
      - .address_space:  global
        .offset:         0
        .size:           8
        .value_kind:     global_buffer
      - .address_space:  global
        .offset:         8
        .size:           8
        .value_kind:     global_buffer
      - .address_space:  global
        .offset:         16
        .size:           8
        .value_kind:     global_buffer
      - .offset:         24
        .size:           4
        .value_kind:     hidden_block_count_x
      - .offset:         28
        .size:           4
        .value_kind:     hidden_block_count_y
      - .offset:         32
        .size:           4
        .value_kind:     hidden_block_count_z
      - .offset:         36
        .size:           2
        .value_kind:     hidden_group_size_x
      - .offset:         38
        .size:           2
        .value_kind:     hidden_group_size_y
      - .offset:         40
        .size:           2
        .value_kind:     hidden_group_size_z
      - .offset:         42
        .size:           2
        .value_kind:     hidden_remainder_x
      - .offset:         44
        .size:           2
        .value_kind:     hidden_remainder_y
      - .offset:         46
        .size:           2
        .value_kind:     hidden_remainder_z
      - .offset:         64
        .size:           8
        .value_kind:     hidden_global_offset_x
      - .offset:         72
        .size:           8
        .value_kind:     hidden_global_offset_y
      - .offset:         80
        .size:           8
        .value_kind:     hidden_global_offset_z
      - .offset:         88
        .size:           2
        .value_kind:     hidden_grid_dims
    .group_segment_fixed_size: 0
    .kernarg_segment_align: 8
    .kernarg_segment_size: 280
    .language:       OpenCL C
    .language_version:
      - 2
      - 0
    .max_flat_workgroup_size: 1024
    .name:           _Z9vectorAddPKfS0_Pf
    .private_segment_fixed_size: 0
    .sgpr_count:     14
    .sgpr_spill_count: 0
    .symbol:         _Z9vectorAddPKfS0_Pf.kd
    .uniform_work_group_size: 1
    .uses_dynamic_stack: false
    .vgpr_count:     5
    .vgpr_spill_count: 0
    .wavefront_size: 64
amdhsa.target:   amdgcn-amd-amdhsa--gfx908
amdhsa.version:
  - 1
  - 2
...

	.end_amdgpu_metadata

对比可以发现,两个kernel的 asm 内容差不多,说明两个微架构差不多。

标签:HIP,__,示例,AMD,amdhsa,ZN17,offset,hipcc,size
From: https://blog.csdn.net/eloudy/article/details/137178054

相关文章

  • OpenStack一键式脚本创建示例网络及图像等
    kolla-ansible提供的一键式创建示例网络、图像等,不得不说,真的是太贴心了!只需根据实际环境修改公共网络和租户网络部分。#!/bin/bashset-oerrexit#Thisscriptismeanttoberunonceafterrunningstartforthefirst#time.Thisscriptdownloadsacirrosimage......
  • windows下socket客户端编程示例
    #include<iostream>#include<winsock2.h>#include<ws2tcpip.h>#include<windows.h>#pragmacomment(lib,"Ws2_32.lib")intsocket_client_demo(char*addr,intport){ charrecvbuf[1024]={0}; intretVal=-1;#......
  • JWT示例与原理
    简介JWT(JSONWebToken)是一种去中心化的web认证方案,信息存储在客户端。数据结构JWT通常由3部分组成,Header、Payload、Signature。每个部分都是用Base64Url编码后的字符串,每个部分之间由点分割。形如Header.Payload.Signature注:Base64Url是Base64的一个变种,主要是将Base6......
  • 第11章 使用类——运算符重载(一)一个简单的运算符重载示例(Time类对象的加法)
    本文章是作者根据史蒂芬·普拉达所著的《C++PrimerPlus》而整理出的读书笔记,如果您在浏览过程中发现了什么错误,烦请告知。另外,此书由浅入深,非常适合有C语言基础的人学习,感兴趣的朋友可以自行阅读此书籍。运算符重载我们先了解下函数重载的概念,函数重载,也叫函数多态,指的是用......
  • uniapp(全端兼容) - 最新移动端评论区讨论点赞回复功能,可发表文字或图片评论|点赞|回
    效果图在uniapp小程序/h5网页网站/安卓苹果app/nvue等(全平台完美兼容)开发中,实现评论区、讨论区功能详细教程,uniapp评论区用户可发布图片、视频、文字进行评论,其他用户可进行“无限级|盖楼评论区”,点赞评论、回复评论、删除评论(自动计算刷新,不影响布局),当评论大于n条时自......
  • 大模型提示工程之Prompt框架和示例
    今天和大家分享一下:大模型提示工程之Prompt框架和示例:TAG框架任务(Task): 开发一个新的手机应用,旨在帮助用户更好地管理他们的日常健康。行动(Action): 进行市场调研,设计用户友好的界面,开发核心健康跟踪功能,测试应用并收集用户反馈。目标(Goal): 在六个月内发布应用,并在发布后的......
  • 【docker常用命令系列】Docker save语法用法示例详解
    【docker常用命令系列】Dockersave语法用法示例详解源自专栏《docker常用命令系列目录导航?》文章目录[【docker常用命令系列】Dockersave语法用法示例详解](https://zhuanlan.zhihu.com/p/689619518/)概览用法别名选项示例参考链接概览dockerimagesav......
  • Java接口应用代码示例
    1.概念在Java中接口是一个抽象的数据类型,在接口里面我们会定义类应该遵循的行为规范,并不会去具体实现,只是告诉了接了该接口的类一定要实现些什么方法。接口中可以有常量、方法和嵌套类型的声明(就是在该接口内部定义其他类型,如接口、类、枚举)。一个接口可以被多个类去实现,一......
  • 在Vue项目中使用Vuex进行状态管理是一种常见做法。下面是一个简单的示例,展示了如何创
    步骤1:创建VuexStore首先,你需要创建一个Vuexstore。通常,这是在你的项目的store目录下完成的。//store.jsimportVuefrom'vue';importVuexfrom'vuex';Vue.use(Vuex);conststore=newVuex.Store({state:{count:0},mutations:{increment(......
  • 在 Windows Server 2022 系统中,你可以使用一些组合命令结合系统自带的工具来实现文件
    在WindowsServer2022系统中,你可以使用一些组合命令结合系统自带的工具来实现文件夹同步。以下是一个示例组合命令,结合Robocopy和TaskScheduler来实现文件夹同步:使用Robocopy进行文件夹同步:Robocopy是Windows自带的一个命令行工具,用于复制大量文件和文件夹。你可......