背景 想要了解CUDA并行计算原理,同时针对深度学习中出现一些“不支持算子”可能需要手写的需要,配置一个简单的CUDA编译环境,探索CUDA编程的范式【注:CUDA环境配置略】。 结果展示 示例代码
# include "cuda_runtime.h"
# include "device_launch_parameters.h"
# include <iostream> __global__ void VecAdd ( int * A, int * B, int * C)
{ int i = threadIdx. x; C[ i] = A[ i] + B[ i] ;
} void test_cuda ( ) { const int size = 3 ; int a[ size] = { 1 , 2 , 3 } ; int b[ size] = { 10 , 20 , 30 } ; int c[ size] = { 0 } ; int * dev_a = 0 ; int * dev_b = 0 ; int * dev_c = 0 ; cudaError_t cudaStatus; cudaStatus = cudaSetDevice ( 0 ) ; if ( cudaStatus != cudaSuccess) { fprintf ( stderr , "GPU device error" ) ; return ; } cudaStatus = cudaMalloc ( ( void * * ) & dev_c, size * sizeof ( int ) ) ; if ( cudaStatus != cudaSuccess) fprintf ( stderr , "device_c allocate error" ) ; cudaStatus = cudaMalloc ( ( void * * ) & dev_a, size * sizeof ( int ) ) ; if ( cudaStatus != cudaSuccess) fprintf ( stderr , "device_a allocate error" ) ; cudaStatus = cudaMalloc ( ( void * * ) & dev_b, size * sizeof ( int ) ) ; if ( cudaStatus != cudaSuccess) fprintf ( stderr , "device_b allocate error" ) ; cudaStatus = cudaMemcpy ( dev_a, a, size * sizeof ( int ) , cudaMemcpyHostToDevice) ; if ( cudaStatus != cudaSuccess) { fprintf ( stderr , "device_a copy error" ) ; } cudaStatus = cudaMemcpy ( dev_b, b, size * sizeof ( int ) , cudaMemcpyHostToDevice) ; if ( cudaStatus != cudaSuccess) { fprintf ( stderr , "device_b copy error" ) ; } VecAdd << < 1 , size>> > ( dev_a, dev_b, dev_c) ; cudaStatus = cudaGetLastError ( ) ; if ( cudaStatus != cudaSuccess) { fprintf ( stderr , "VecAdd call error: %s\n" , cudaGetErrorString ( cudaStatus) ) ; } cudaStatus = cudaDeviceSynchronize ( ) ; if ( cudaStatus != cudaSuccess) { fprintf ( stderr , "cudaDeviceSynchronize not sucess %d!\n" , cudaStatus) ; } cudaStatus = cudaMemcpy ( c, dev_c, size * sizeof ( int ) , cudaMemcpyDeviceToHost) ; if ( cudaStatus != cudaSuccess) { fprintf ( stderr , "copy result to host error" ) ; } printf ( "{1,2,3} + {10,20,30} = {%d,%d,%d}\n" , c[ 0 ] , c[ 1 ] , c[ 2 ] ) ; cudaFree ( dev_a) ; cudaFree ( dev_b) ; cudaFree ( dev_c) ;
} int main ( ) { test_cuda ( ) ; return 0 ;
}
小结
NVCC编译cuda命令与g++编译C++较为相似,从而借鉴引入对应的include,实现Windows下cmake编译CUDA代码; 示例代码展示了从CPU读取数据,在GPU端进行计算,最终传输给CPU的过程,与深度学习数据加载过程类似,是较为通用的过程; 理解C++到CUDA的过渡、预加载过程,进一步从底层了解CUDA。