1,gpu vectorAdd 示例
为了简化逻辑,故假设 vector 的 size 与运行配置的thread个熟正好一样多,比如都是512之类的.
1.1 源码
vectorAdd.hip
#include <stdio.h>
#include <hip/hip_runtime.h>__global__ void vectorAdd(const float *A, const float *B, float *C) {int i = blockDim.x * blockIdx.x + threadIdx.x;C[i] = A[i] + B[i] + 0.0f;
}int main(void) {hipError_t err = hipSuccess;int numElements = 512;size_t size = numElements * sizeof(float);printf("[Vector addition of %d elements]\n", numElements);float *h_A = (float *)malloc(size);float *h_B = (float *)malloc(size);float *h_C = (float *)malloc(size);if (h_A == NULL || h_B == NULL || h_C == NULL) {fprintf(stderr, "Failed to allocate host vectors!\n");exit(EXIT_FAILURE);}for (int i = 0; i < numElements; ++i) {h_A[i] = rand() / (float)RAND_MAX;h_B[i] = rand() / (float)RAND_MAX;}float *d_A = NULL;err = hipMalloc((void **)&d_A, size);if (err != hipSuccess) {fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n",hipGetErrorString(err));exit(EXIT_FAILURE);}float *d_B = NULL;err = hipMalloc((void **)&d_B, size);if (err != hipSuccess) {fprintf(stderr, "Failed to allocate device vector B (error code %s)!\n",hipGetErrorString(err));exit(EXIT_FAILURE);}float *d_C = NULL;err = hipMalloc((void **)&d_C, size);if (err != hipSuccess) {fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n",hipGetErrorString(err));exit(EXIT_FAILURE);}printf("Copy input data from the host memory to the CUDA device\n");err = hipMemcpy(d_A, h_A, size, hipMemcpyHostToDevice);if (err != hipSuccess) {fprintf(stderr,"Failed to copy vector A from host to device (error code %s)!\n",hipGetErrorString(err));exit(EXIT_FAILURE);}err = hipMemcpy(d_B, h_B, size, hipMemcpyHostToDevice);if (err != hipSuccess) {fprintf(stderr,"Failed to copy vector B from host to device (error code %s)!\n",hipGetErrorString(err));exit(EXIT_FAILURE);}int threadsPerBlock = 256;int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid,threadsPerBlock);vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C);err = hipGetLastError();if (err != hipSuccess) {fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n",hipGetErrorString(err));exit(EXIT_FAILURE);}printf("Copy output data from the CUDA device to the host memory\n");err = hipMemcpy(h_C, d_C, size, hipMemcpyDeviceToHost);if (err != hipSuccess) {fprintf(stderr,"Failed to copy vector C from device to host (error code %s)!\n",hipGetErrorString(err));exit(EXIT_FAILURE);}for (int i = 0; i < numElements; ++i) {if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5) {fprintf(stderr, "Result verification failed at element %d!\n", i);exit(EXIT_FAILURE);}}printf("Test PASSED\n");err = hipFree(d_A);if (err != hipSuccess) {fprintf(stderr, "Failed to free device vector A (error code %s)!\n",hipGetErrorString(err));exit(EXIT_FAILURE);}err = hipFree(d_B);if (err != hipSuccess) {fprintf(stderr, "Failed to free device vector B (error code %s)!\n",hipGetErrorString(err));exit(EXIT_FAILURE);}err = hipFree(d_C);if (err != hipSuccess) {fprintf(stderr, "Failed to free device vector C (error code %s)!\n",hipGetErrorString(err));exit(EXIT_FAILURE);}free(h_A);free(h_B);free(h_C);printf("Done\n");return 0;
}
Makefile
all: vectorAdd.hiphipcc $< -o vectorAddhipcc -S --cuda-device-only --offload-arch=gfx906 --offload-arch=gfx908 $<.PHONY: clean
clean:rm -rf *.s vectorAdd# --offload-arch=gfx803
# --offload-arch=gfx900
# --offload-arch=gfx906
# --offload-arch=gfx908
# --offload-arch=gfx90a
# --offload-arch=gfx942
# --offload-arch=gfx1030
# --offload-arch=gfx1100
# --offload-arch=gfx1101
# --offload-arch=gfx1102
1.2 编译运行
make
2,汇编代码
这句命令可以帮助生成汇编代码:
hipcc -S --cuda-device-only --offload-arch=gfx906 --offload-arch=gfx908 $<
offload-arch 帮助指定 gpu 的微架构,
vectorAdd-hip-amdgcn-amd-amdhsa-gfx906.s :
.text.amdgcn_target "amdgcn-amd-amdhsa--gfx906".protected _Z9vectorAddPKfS0_Pf ; -- Begin function _Z9vectorAddPKfS0_Pf.globl _Z9vectorAddPKfS0_Pf.p2align 8.type _Z9vectorAddPKfS0_Pf,@function
_Z9vectorAddPKfS0_Pf: ; @_Z9vectorAddPKfS0_Pf
; %bb.0:s_load_dword s7, s[4:5], 0x24s_load_dwordx4 s[0:3], s[4:5], 0x0s_load_dwordx2 s[8:9], s[4:5], 0x10s_waitcnt lgkmcnt(0)s_and_b32 s4, s7, 0xffffs_mul_i32 s6, s6, s4v_add_u32_e32 v0, s6, v0v_ashrrev_i32_e32 v1, 31, v0v_lshlrev_b64 v[0:1], 2, v[0:1]v_mov_b32_e32 v3, s1v_add_co_u32_e32 v2, vcc, s0, v0v_addc_co_u32_e32 v3, vcc, v3, v1, vccglobal_load_dword v4, v[2:3], offv_mov_b32_e32 v3, s3v_add_co_u32_e32 v2, vcc, s2, v0v_addc_co_u32_e32 v3, vcc, v3, v1, vccglobal_load_dword v2, v[2:3], offv_mov_b32_e32 v3, s9v_add_co_u32_e32 v0, vcc, s8, v0v_addc_co_u32_e32 v1, vcc, v3, v1, vccs_waitcnt vmcnt(0)v_add_f32_e32 v2, v4, v2v_add_f32_e32 v2, 0, v2global_store_dword v[0:1], v2, offs_endpgm.section .rodata,#alloc.p2align 6, 0x0.amdhsa_kernel _Z9vectorAddPKfS0_Pf.amdhsa_group_segment_fixed_size 0.amdhsa_private_segment_fixed_size 0.amdhsa_kernarg_size 280.amdhsa_user_sgpr_count 6.amdhsa_user_sgpr_private_segment_buffer 1.amdhsa_user_sgpr_dispatch_ptr 0.amdhsa_user_sgpr_queue_ptr 0.amdhsa_user_sgpr_kernarg_segment_ptr 1.amdhsa_user_sgpr_dispatch_id 0.amdhsa_user_sgpr_flat_scratch_init 0.amdhsa_user_sgpr_private_segment_size 0.amdhsa_uses_dynamic_stack 0.amdhsa_system_sgpr_private_segment_wavefront_offset 0.amdhsa_system_sgpr_workgroup_id_x 1.amdhsa_system_sgpr_workgroup_id_y 0.amdhsa_system_sgpr_workgroup_id_z 0.amdhsa_system_sgpr_workgroup_info 0.amdhsa_system_vgpr_workitem_id 0.amdhsa_next_free_vgpr 5.amdhsa_next_free_sgpr 10.amdhsa_reserve_flat_scratch 0.amdhsa_reserve_xnack_mask 1.amdhsa_float_round_mode_32 0.amdhsa_float_round_mode_16_64 0.amdhsa_float_denorm_mode_32 3.amdhsa_float_denorm_mode_16_64 3.amdhsa_dx10_clamp 1.amdhsa_ieee_mode 1.amdhsa_fp16_overflow 0.amdhsa_exception_fp_ieee_invalid_op 0.amdhsa_exception_fp_denorm_src 0.amdhsa_exception_fp_ieee_div_zero 0.amdhsa_exception_fp_ieee_overflow 0.amdhsa_exception_fp_ieee_underflow 0.amdhsa_exception_fp_ieee_inexact 0.amdhsa_exception_int_div_zero 0.end_amdhsa_kernel.text
.Lfunc_end0:.size _Z9vectorAddPKfS0_Pf, .Lfunc_end0-_Z9vectorAddPKfS0_Pf; -- End function.section .AMDGPU.csdata
; Kernel info:
; codeLenInByte = 132
; NumSgprs: 14
; NumVgprs: 5
; ScratchSize: 0
; MemoryBound: 0
; FloatMode: 240
; IeeeMode: 1
; LDSByteSize: 0 bytes/workgroup (compile time only)
; SGPRBlocks: 1
; VGPRBlocks: 1
; NumSGPRsForWavesPerEU: 14
; NumVGPRsForWavesPerEU: 5
; Occupancy: 8
; WaveLimiterHint : 1
; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0
; COMPUTE_PGM_RSRC2:USER_SGPR: 6
; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0
; COMPUTE_PGM_RSRC2:TGID_X_EN: 1
; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0.protected _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE ; @_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE.type _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE,@object.section .rodata._ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE,#alloc.weak _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE
_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE:.zero 1.size _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE, 1.protected _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE ; @_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE.type _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE,@object.section .rodata._ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE,#alloc.weak _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE
_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE:.zero 1.size _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE, 1.protected _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE ; @_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE.type _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE,@object.section .rodata._ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE,#alloc.weak _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE
_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE:.zero 1.size _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE, 1.ident "AMD clang version 17.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-6.0.2 24012 af27734ed982b52a9f1be0f035ac91726fc697e4)".section ".note.GNU-stack".addrsig.amdgpu_metadata
---
amdhsa.kernels:- .args:- .address_space: global.offset: 0.size: 8.value_kind: global_buffer- .address_space: global.offset: 8.size: 8.value_kind: global_buffer- .address_space: global.offset: 16.size: 8.value_kind: global_buffer- .offset: 24.size: 4.value_kind: hidden_block_count_x- .offset: 28.size: 4.value_kind: hidden_block_count_y- .offset: 32.size: 4.value_kind: hidden_block_count_z- .offset: 36.size: 2.value_kind: hidden_group_size_x- .offset: 38.size: 2.value_kind: hidden_group_size_y- .offset: 40.size: 2.value_kind: hidden_group_size_z- .offset: 42.size: 2.value_kind: hidden_remainder_x- .offset: 44.size: 2.value_kind: hidden_remainder_y- .offset: 46.size: 2.value_kind: hidden_remainder_z- .offset: 64.size: 8.value_kind: hidden_global_offset_x- .offset: 72.size: 8.value_kind: hidden_global_offset_y- .offset: 80.size: 8.value_kind: hidden_global_offset_z- .offset: 88.size: 2.value_kind: hidden_grid_dims.group_segment_fixed_size: 0.kernarg_segment_align: 8.kernarg_segment_size: 280.language: OpenCL C.language_version:- 2- 0.max_flat_workgroup_size: 1024.name: _Z9vectorAddPKfS0_Pf.private_segment_fixed_size: 0.sgpr_count: 14.sgpr_spill_count: 0.symbol: _Z9vectorAddPKfS0_Pf.kd.uniform_work_group_size: 1.uses_dynamic_stack: false.vgpr_count: 5.vgpr_spill_count: 0.wavefront_size: 64
amdhsa.target: amdgcn-amd-amdhsa--gfx906
amdhsa.version:- 1- 2
....end_amdgpu_metadata
vectorAdd-hip-amdgcn-amd-amdhsa-gfx908.s :
.text.amdgcn_target "amdgcn-amd-amdhsa--gfx908".protected _Z9vectorAddPKfS0_Pf ; -- Begin function _Z9vectorAddPKfS0_Pf.globl _Z9vectorAddPKfS0_Pf.p2align 8.type _Z9vectorAddPKfS0_Pf,@function
_Z9vectorAddPKfS0_Pf: ; @_Z9vectorAddPKfS0_Pf
; %bb.0:s_load_dword s7, s[4:5], 0x24s_load_dwordx4 s[0:3], s[4:5], 0x0s_load_dwordx2 s[8:9], s[4:5], 0x10s_waitcnt lgkmcnt(0)s_and_b32 s4, s7, 0xffffs_mul_i32 s6, s6, s4v_add_u32_e32 v0, s6, v0v_ashrrev_i32_e32 v1, 31, v0v_lshlrev_b64 v[0:1], 2, v[0:1]v_mov_b32_e32 v3, s1v_add_co_u32_e32 v2, vcc, s0, v0v_addc_co_u32_e32 v3, vcc, v3, v1, vccglobal_load_dword v4, v[2:3], offv_mov_b32_e32 v3, s3v_add_co_u32_e32 v2, vcc, s2, v0v_addc_co_u32_e32 v3, vcc, v3, v1, vccglobal_load_dword v2, v[2:3], offv_mov_b32_e32 v3, s9v_add_co_u32_e32 v0, vcc, s8, v0v_addc_co_u32_e32 v1, vcc, v3, v1, vccs_waitcnt vmcnt(0)v_add_f32_e32 v2, v4, v2v_add_f32_e32 v2, 0, v2global_store_dword v[0:1], v2, offs_endpgm.section .rodata,#alloc.p2align 6, 0x0.amdhsa_kernel _Z9vectorAddPKfS0_Pf.amdhsa_group_segment_fixed_size 0.amdhsa_private_segment_fixed_size 0.amdhsa_kernarg_size 280.amdhsa_user_sgpr_count 6.amdhsa_user_sgpr_private_segment_buffer 1.amdhsa_user_sgpr_dispatch_ptr 0.amdhsa_user_sgpr_queue_ptr 0.amdhsa_user_sgpr_kernarg_segment_ptr 1.amdhsa_user_sgpr_dispatch_id 0.amdhsa_user_sgpr_flat_scratch_init 0.amdhsa_user_sgpr_private_segment_size 0.amdhsa_uses_dynamic_stack 0.amdhsa_system_sgpr_private_segment_wavefront_offset 0.amdhsa_system_sgpr_workgroup_id_x 1.amdhsa_system_sgpr_workgroup_id_y 0.amdhsa_system_sgpr_workgroup_id_z 0.amdhsa_system_sgpr_workgroup_info 0.amdhsa_system_vgpr_workitem_id 0.amdhsa_next_free_vgpr 5.amdhsa_next_free_sgpr 10.amdhsa_reserve_flat_scratch 0.amdhsa_reserve_xnack_mask 1.amdhsa_float_round_mode_32 0.amdhsa_float_round_mode_16_64 0.amdhsa_float_denorm_mode_32 3.amdhsa_float_denorm_mode_16_64 3.amdhsa_dx10_clamp 1.amdhsa_ieee_mode 1.amdhsa_fp16_overflow 0.amdhsa_exception_fp_ieee_invalid_op 0.amdhsa_exception_fp_denorm_src 0.amdhsa_exception_fp_ieee_div_zero 0.amdhsa_exception_fp_ieee_overflow 0.amdhsa_exception_fp_ieee_underflow 0.amdhsa_exception_fp_ieee_inexact 0.amdhsa_exception_int_div_zero 0.end_amdhsa_kernel.text
.Lfunc_end0:.size _Z9vectorAddPKfS0_Pf, .Lfunc_end0-_Z9vectorAddPKfS0_Pf; -- End function.section .AMDGPU.csdata
; Kernel info:
; codeLenInByte = 132
; NumSgprs: 14
; NumVgprs: 5
; NumAgprs: 0
; TotalNumVgprs: 5
; ScratchSize: 0
; MemoryBound: 0
; FloatMode: 240
; IeeeMode: 1
; LDSByteSize: 0 bytes/workgroup (compile time only)
; SGPRBlocks: 1
; VGPRBlocks: 1
; NumSGPRsForWavesPerEU: 14
; NumVGPRsForWavesPerEU: 5
; Occupancy: 8
; WaveLimiterHint : 1
; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0
; COMPUTE_PGM_RSRC2:USER_SGPR: 6
; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0
; COMPUTE_PGM_RSRC2:TGID_X_EN: 1
; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0.protected _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE ; @_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE.type _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE,@object.section .rodata._ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE,#alloc.weak _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE
_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE:.zero 1.size _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE, 1.protected _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE ; @_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE.type _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE,@object.section .rodata._ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE,#alloc.weak _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE
_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE:.zero 1.size _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE, 1.protected _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE ; @_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE.type _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE,@object.section .rodata._ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE,#alloc.weak _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE
_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE:.zero 1.size _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE, 1.ident "AMD clang version 17.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-6.0.2 24012 af27734ed982b52a9f1be0f035ac91726fc697e4)".section ".note.GNU-stack".addrsig.amdgpu_metadata
---
amdhsa.kernels:- .agpr_count: 0.args:- .address_space: global.offset: 0.size: 8.value_kind: global_buffer- .address_space: global.offset: 8.size: 8.value_kind: global_buffer- .address_space: global.offset: 16.size: 8.value_kind: global_buffer- .offset: 24.size: 4.value_kind: hidden_block_count_x- .offset: 28.size: 4.value_kind: hidden_block_count_y- .offset: 32.size: 4.value_kind: hidden_block_count_z- .offset: 36.size: 2.value_kind: hidden_group_size_x- .offset: 38.size: 2.value_kind: hidden_group_size_y- .offset: 40.size: 2.value_kind: hidden_group_size_z- .offset: 42.size: 2.value_kind: hidden_remainder_x- .offset: 44.size: 2.value_kind: hidden_remainder_y- .offset: 46.size: 2.value_kind: hidden_remainder_z- .offset: 64.size: 8.value_kind: hidden_global_offset_x- .offset: 72.size: 8.value_kind: hidden_global_offset_y- .offset: 80.size: 8.value_kind: hidden_global_offset_z- .offset: 88.size: 2.value_kind: hidden_grid_dims.group_segment_fixed_size: 0.kernarg_segment_align: 8.kernarg_segment_size: 280.language: OpenCL C.language_version:- 2- 0.max_flat_workgroup_size: 1024.name: _Z9vectorAddPKfS0_Pf.private_segment_fixed_size: 0.sgpr_count: 14.sgpr_spill_count: 0.symbol: _Z9vectorAddPKfS0_Pf.kd.uniform_work_group_size: 1.uses_dynamic_stack: false.vgpr_count: 5.vgpr_spill_count: 0.wavefront_size: 64
amdhsa.target: amdgcn-amd-amdhsa--gfx908
amdhsa.version:- 1- 2
....end_amdgpu_metadata
对比可以发现,两个kernel的 asm 内容差不多,说明两个微架构差不多。