2.1 在CPU上执行sum array
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>void sumArraysOnHost(float *A, float *B, float *C, const int N)
{for (int idx = 0; idx< N; idx ++){C[idx] = A[idx] + B[idx];}
}void initialData(float *ip, int size)
{time_t t;srand((unsigned int) time(&t));for (int i = 0; i < size; i++) {ip[i] = (float) (rand() & 0xff) / 10.0f;}
}int main(int argc , char **argv)
{int nElem = 1024;size_t nBytes = nElem * sizeof(float);float *h_A, *h_B, *h_C;h_A = (float *) malloc (nBytes);h_B = (float *) malloc (nBytes);h_C = (float *) malloc (nBytes);initialData(h_A, nElem);initialData(h_B, nElem);sumArraysOnHost(h_A, h_B, h_C, nElem);free(h_A);free(h_B);free(h_C);return 0;
}
2.2 查看grid, block的索引维度
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <cuda_runtime.h>

/* Print this thread's built-in index/dimension variables via device printf.
 * Works for any 1D-3D launch configuration; emits one line per thread. */
__global__ void checkIndex(void)
{
    printf("threadidx: (%d ,%d ,%d) blockidx:(%d ,%d ,%d) blockdim: (%d ,%d ,%d) gridDim: (%d ,%d ,%d)\n", threadIdx.x, threadIdx.y, threadIdx.z,blockIdx.x, blockIdx.y, blockIdx.z,blockDim.x,blockDim.y,blockDim.z,gridDim.x, gridDim.y, gridDim.z);
}

int main(int argc, char **argv)
{
    int nElem = 6;
    dim3 block(3);
    /* ceil-div so the grid covers all nElem elements. */
    dim3 grid((nElem + block.x - 1) / block.x);

    printf("grid.x %d grid.y %d grid.z %d\n", grid.x, grid.y, grid.z);
    printf("block.x %d block.y %d block.z %d\n", block.x, block.y, block.z);

    checkIndex<<<grid, block>>>();

    /* Launch-configuration errors only surface through cudaGetLastError();
     * the kernel launch itself returns nothing. */
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    /* Synchronize so device-side printf output is flushed before exit. */
    cudaDeviceSynchronize();
    cudaDeviceReset();
    return 0;
}
输出
grid.x 2 grid.y 1 grid.z 1
block.x 3 block.y 1 block.z 1
threadidx: (0 ,0 ,0) blockidx:(1 ,0 ,0) blockdim: (3 ,1 ,1) gridDim: (2 ,1 ,1)
threadidx: (1 ,0 ,0) blockidx:(1 ,0 ,0) blockdim: (3 ,1 ,1) gridDim: (2 ,1 ,1)
threadidx: (2 ,0 ,0) blockidx:(1 ,0 ,0) blockdim: (3 ,1 ,1) gridDim: (2 ,1 ,1)
threadidx: (0 ,0 ,0) blockidx:(0 ,0 ,0) blockdim: (3 ,1 ,1) gridDim: (2 ,1 ,1)
threadidx: (1 ,0 ,0) blockidx:(0 ,0 ,0) blockdim: (3 ,1 ,1) gridDim: (2 ,1 ,1)
threadidx: (2 ,0 ,0) blockidx:(0 ,0 ,0) blockdim: (3 ,1 ,1) gridDim: (2 ,1 ,1)
2.4 cuda核函数
kernel_name<<<grid, block>>> (argument list);
2.5 核函数限定符
__global__ 在设备端执行,从主机端调用(计算能力≥3.5的设备也可从设备端调用,即动态并行),必须有void返回类型
__device__ 在设备端执行,仅设备端调用
__host__ 在主机端执行,仅主机端调用(可与__device__组合使用)
2.6 在GPU中sum,和HOST sum array做对比
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <math.h>
#include <sys/time.h>
#include <cuda_runtime.h>

/* Abort with file/line and the CUDA error string when `call` fails. */
#define CHECK(call)                                                            \
do {                                                                           \
    const cudaError_t error = call;                                            \
    if (error != cudaSuccess)                                                  \
    {                                                                          \
        printf("Error: %s: %d\n", __FILE__, __LINE__);                         \
        printf("code :%d reason :%s\n", error, cudaGetErrorString(error));     \
        exit(1);                                                               \
    }                                                                          \
} while (0)

/* Compare host and GPU results element-wise; report the first mismatch. */
void checkResult(float *hostRef, float *gpuRef, const int N)
{
    double epsilon = 1.0E-8;
    bool match = 1;
    for (int i = 0; i < N; i++)
    {
        /* fabs, not abs: int abs() truncates any difference < 1 to 0,
         * which silently hid real mismatches. */
        if (fabs(hostRef[i] - gpuRef[i]) > epsilon)
        {
            match = 0;
            printf("Array do not match\n");
            printf("host %5.2f gpu % 5.2f at current %d\n", hostRef[i], gpuRef[i], i);
            break;
        }
    }
    if (match) printf("array matches\n");
}

/* Reference host implementation: C[idx] = A[idx] + B[idx]. */
void sumArraysOnHost(float *A, float *B, float *C, const int N)
{
    for (int idx = 0; idx < N; idx++)
    {
        C[idx] = A[idx] + B[idx];
    }
}

/* Fill ip[0..size) with pseudo-random floats in [0.0, 25.5].
 * RNG is seeded once in main(); seeding here per-call made back-to-back
 * calls within one second produce identical arrays. */
void initialData(float *ip, int size)
{
    for (int i = 0; i < size; i++)
    {
        ip[i] = (float)(rand() & 0xff) / 10.0f;
    }
}

/* GPU element-wise sum. 1D grid of 1D blocks expected.
 * The i < N guard makes any grid size safe, not just exact multiples. */
__global__ void sumArraysOnGPU(float *A, float *B, float *C, const int N)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N) C[i] = A[i] + B[i];
}

/* Wall-clock time in seconds (microsecond resolution). */
double cpusec()
{
    struct timeval tp;
    gettimeofday(&tp, NULL);
    return ((double)tp.tv_sec + (double)tp.tv_usec * 1.e-6);
}

int main(int argc, char **argv)
{
    printf("%s starting\n", argv[0]);
    int dev = 0;
    CHECK(cudaSetDevice(dev));

    // set up host data
    int nElem = 32;
    size_t nBytes = nElem * sizeof(float);
    float *h_A = (float *)malloc(nBytes);
    float *h_B = (float *)malloc(nBytes);
    float *hostRef = (float *)malloc(nBytes);
    float *gpuRef = (float *)malloc(nBytes);

    srand((unsigned int)time(NULL));   // seed once for both initialData calls
    initialData(h_A, nElem);
    initialData(h_B, nElem);
    memset(hostRef, 0, nBytes);
    memset(gpuRef, 0, nBytes);

    // malloc device global memory
    float *d_A, *d_B, *d_C;
    CHECK(cudaMalloc((float **)&d_A, nBytes));
    CHECK(cudaMalloc((float **)&d_B, nBytes));
    CHECK(cudaMalloc((float **)&d_C, nBytes));

    // transfer data from host to device
    CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice));

    // ceil-div so the grid covers nElem even when it is not a multiple of block.x
    dim3 block(nElem);
    dim3 grid((nElem + block.x - 1) / block.x);
    sumArraysOnGPU<<<grid, block>>>(d_A, d_B, d_C, nElem);
    CHECK(cudaGetLastError());   // catch bad launch configuration
    printf("execution config <<<%d, %d>>>\n", grid.x, block.x);

    // copy kernel result back to host (cudaMemcpy implicitly synchronizes)
    CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));

    sumArraysOnHost(h_A, h_B, hostRef, nElem);
    checkResult(hostRef, gpuRef, nElem);

    CHECK(cudaFree(d_A));
    CHECK(cudaFree(d_B));
    CHECK(cudaFree(d_C));
    free(h_A);
    free(h_B);
    free(hostRef);
    free(gpuRef);
    return 0;
}
输出
execution config <<<1, 32>>>
array matches
2.8 尝试用nvprof 检查执行时间,报错找不到dll文件,可以参考
https://blog.csdn.net/qq_41607336/article/details/126741908
解决,
但最后还是报warning, 跟显卡相关
======= Warning: nvprof is not supported on devices with compute capability 8.0 and higher.
2.9 加上device信息打印,以及cudaevent 计时, 参考https://blog.csdn.net/qq_42681630/article/details/144895351
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <math.h>
#include <cuda_runtime.h>
/* Dropped the unused, non-portable <windows.h> include and the duplicate
 * `typedef unsigned long DWORD;` — neither name is used in this program. */

/* Abort with file/line and the CUDA error string when `call` fails. */
#define CHECK(call)                                                            \
do {                                                                           \
    const cudaError_t error = call;                                            \
    if (error != cudaSuccess)                                                  \
    {                                                                          \
        printf("Error: %s: %d\n", __FILE__, __LINE__);                         \
        printf("code :%d reason :%s\n", error, cudaGetErrorString(error));     \
        exit(1);                                                               \
    }                                                                          \
} while (0)

/* Compare host and GPU results element-wise; report the first mismatch. */
void checkResult(float *hostRef, float *gpuRef, const int N)
{
    double epsilon = 1.0E-8;
    bool match = 1;
    for (int i = 0; i < N; i++)
    {
        /* fabs, not abs: int abs() truncates any difference < 1 to 0. */
        if (fabs(hostRef[i] - gpuRef[i]) > epsilon)
        {
            match = 0;
            printf("Array do not match\n");
            printf("host %5.2f gpu % 5.2f at current %d\n", hostRef[i], gpuRef[i], i);
            break;
        }
    }
    if (match) printf("array matches\n");
}

/* Reference host implementation: C[idx] = A[idx] + B[idx]. */
void sumArraysOnHost(float *A, float *B, float *C, const int N)
{
    for (int idx = 0; idx < N; idx++)
    {
        C[idx] = A[idx] + B[idx];
    }
}

/* Fill ip[0..size) with pseudo-random floats in [0.0, 25.5].
 * RNG is seeded once in main() to avoid identical back-to-back arrays. */
void initialData(float *ip, int size)
{
    for (int i = 0; i < size; i++)
    {
        ip[i] = (float)(rand() & 0xff) / 10.0f;
    }
}

/* GPU element-wise sum. 1D grid of 1D blocks expected.
 * The i < N guard makes the tail block safe when N % blockDim.x != 0. */
__global__ void sumArraysOnGPU(float *A, float *B, float *C, const int N)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N) C[i] = A[i] + B[i];
}

/* Print the interesting fields of a cudaDeviceProp.
 * size_t fields use %zu — the original %ld is wrong on LLP64 (Windows). */
static void printDeviceInfo(const cudaDeviceProp *p, int dev)
{
    printf("Using Device %d : %s\n", dev, p->name);
    printf( "Compute capability: %d.%d\n", p->major, p->minor );
    printf( "Clock rate: %d\n", p->clockRate );
    printf( "Memory Clock rate: %d\n", p->memoryClockRate );
    printf( "Memory busWidth: %d\n", p->memoryBusWidth );
    printf( " --- Memory Information for device ---\n");
    printf( "Total global mem: %zu\n", p->totalGlobalMem );
    printf( "Total constant Mem: %zu\n", p->totalConstMem );
    printf( "Max mem pitch: %zu\n", p->memPitch );
    printf( "Texture Alignment: %zu\n", p->textureAlignment );
    printf( " --- MP Information for device ---\n" );
    printf( "Multiprocessor count: %d\n", p->multiProcessorCount );
    printf( "Shared mem per mp: %zu\n", p->sharedMemPerBlock );
    printf( "Registers per mp: %d\n", p->regsPerBlock );
    printf( "Threads in warp: %d\n", p->warpSize );
    printf( "Max threads per block: %d\n", p->maxThreadsPerBlock );
    printf( "Max thread dimensions: (%d, %d, %d)\n",
            p->maxThreadsDim[0], p->maxThreadsDim[1], p->maxThreadsDim[2] );
    printf( "Max grid dimensions: (%d, %d, %d)\n",
            p->maxGridSize[0], p->maxGridSize[1], p->maxGridSize[2] );
    printf( "\n" );
}

int main(int argc, char **argv)
{
    printf("%s starting\n", argv[0]);
    int dev = 0;
    cudaDeviceProp deviceprop;
    CHECK(cudaGetDeviceProperties(&deviceprop, dev));
    CHECK(cudaSetDevice(dev));
    printDeviceInfo(&deviceprop, dev);

    // set up host data
    int nElem = 1 << 24;
    size_t nBytes = nElem * sizeof(float);
    float *h_A = (float *)malloc(nBytes);
    float *h_B = (float *)malloc(nBytes);
    float *hostRef = (float *)malloc(nBytes);
    float *gpuRef = (float *)malloc(nBytes);

    srand((unsigned int)time(NULL));   // seed once for both initialData calls
    initialData(h_A, nElem);
    initialData(h_B, nElem);
    memset(hostRef, 0, nBytes);
    memset(gpuRef, 0, nBytes);

    // malloc device global memory
    float *d_A, *d_B, *d_C;
    CHECK(cudaMalloc((float **)&d_A, nBytes));
    CHECK(cudaMalloc((float **)&d_B, nBytes));
    CHECK(cudaMalloc((float **)&d_C, nBytes));

    // transfer data from host to device
    CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice));

    int Ilen = 1024;
    dim3 block(Ilen);
    dim3 grid((nElem + block.x - 1) / block.x);   // ceil-div covers the tail

    // time only the kernel with CUDA events (GPU-side timestamps)
    cudaEvent_t start, stop;
    CHECK(cudaEventCreate(&start));
    CHECK(cudaEventCreate(&stop));
    CHECK(cudaEventRecord(start));
    sumArraysOnGPU<<<grid, block>>>(d_A, d_B, d_C, nElem);
    CHECK(cudaGetLastError());   // catch bad launch configuration
    CHECK(cudaEventRecord(stop));
    CHECK(cudaEventSynchronize(stop));
    printf("execution config <<<%d, %d>>>\n", grid.x, block.x);
    float milliseconds = 0;
    CHECK(cudaEventElapsedTime(&milliseconds, start, stop));
    printf("Kernel execution time: %f ms\n", milliseconds);
    CHECK(cudaEventDestroy(start));
    CHECK(cudaEventDestroy(stop));

    // copy kernel result back to host (cudaMemcpy implicitly synchronizes)
    CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));

    sumArraysOnHost(h_A, h_B, hostRef, nElem);
    checkResult(hostRef, gpuRef, nElem);

    CHECK(cudaFree(d_A));
    CHECK(cudaFree(d_B));
    CHECK(cudaFree(d_C));
    free(h_A);
    free(h_B);
    free(hostRef);
    free(gpuRef);
    return 0;
}
输出:
Using Device 0 : NVIDIA GeForce RTX 4090
Compute capability: 8.9
Clock rate: 2520000
Memory Clock rate: 10501000
Memory busWidth: 384
— Memory Information for device —
Total global mem: 25756696576
Total constant Mem: 65536
Max mem pitch: 2147483647
Texture Alignment: 512
— MP Information for device —
Multiprocessor count: 128
Shared mem per mp: 49152
Registers per mp: 65536
Threads in warp: 32
Max threads per block: 1024
Max thread dimensions: (1024, 1024, 64)
Max grid dimensions: (2147483647, 65535, 65535)
execution config <<<16384, 1024>>>
execution config <<<16384, 1024>>>
Kernel execution time: 0.558752 ms
array matches