3.3 Unrolling Loops
Consider the following code:

for (int i = 0; i < 100; i++)
{
    a[i] = b[i] + c[i];
}

The same loop unrolled by a factor of 2:

for (int i = 0; i < 100; i += 2)
{
    a[i]     = b[i]     + c[i];
    a[i + 1] = b[i + 1] + c[i + 1];
}
The original loop checks the condition i < 100 one hundred times; the unrolled version checks it only 50 times. In CUDA, loop unrolling matters a great deal: it improves performance by reducing instruction overhead and exposing more independent instructions to the scheduler. More concurrent operations can then be kept in flight in the pipeline, yielding higher instruction and memory throughput.
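As an aside, nvcc can also unroll loops for you with the #pragma unroll directive. The following minimal kernel is a hypothetical illustration of the directive only, not part of the book's example:

__global__ void vecAdd4(const int *b, const int *c, int *a, int n)
{
    // each thread handles 4 consecutive elements; the pragma asks the compiler to unroll the loop
    int base = (blockIdx.x * blockDim.x + threadIdx.x) * 4;
    #pragma unroll
    for (int k = 0; k < 4; k++)
    {
        if (base + k < n)
            a[base + k] = b[base + k] + c[base + k];
    }
}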
3.3.1 Reduction with Unrolling
(Continuing from the previous post, 3.2 The Reduction Problem.) In the reduceInterleaved kernel, each thread block processes one portion of the data, which can be thought of as one data block. What if a single thread block manually unrolls the processing of two data blocks?
Add the kernel:
__global__ void reduceUnrolling2(int *g_idata, int *g_odata, unsigned int n)
{
    unsigned int tid = threadIdx.x;
    int *idata = g_idata + blockIdx.x * blockDim.x * 2;   // each block now covers 2 * blockDim.x elements
    int idx = threadIdx.x + blockIdx.x * blockDim.x * 2;
    if (idx >= n) return;                                 // boundary check (>=, since idx == n is already out of range)

    // unrolling 2 data blocks: first fold the element one blockDim.x away into this one
    if (idx + blockDim.x < n)
    {
        g_idata[idx] += g_idata[idx + blockDim.x];
    }
    __syncthreads();

    // in-place reduction in global memory
    for (int stride = blockDim.x / 2; stride > 0; stride >>= 1)
    {
        if (tid < stride)
        {
            idata[tid] += idata[tid + stride];
        }
        __syncthreads();
    }

    // thread 0 writes this block's partial sum
    if (tid == 0)
    {
        g_odata[blockIdx.x] = idata[0];
    }
}
Because each thread block now processes two data blocks, the execution configuration has to change: the grid size is halved. Add the following to main:
cudaMemcpy(d_idata, h_idata, nBytes, cudaMemcpyHostToDevice);
cudaDeviceSynchronize();
timer.start();
reduceUnrolling2<<<grid.x / 2, block>>>(d_idata, d_odata, size);
cudaDeviceSynchronize();
timer.stop();
float elapsedTime5 = timer.elapsedms();
cudaMemcpy(h_odata, d_odata, grid.x * sizeof(int), cudaMemcpyDeviceToHost);
gpu_sum = 0;
for (int i = 0; i < grid.x / 2; i++)
{
    gpu_sum += h_odata[i];
}
printf("reduceUnrolling2 gpu reduce time: %f, sum: %d, grid, block (%d %d)\n",
       elapsedTime5, gpu_sum, grid.x / 2, block.x);
Output:
sum value is : 2139095040
cpu reduce time: 11.161504, sum: 2139095040
warm up reduce time: 0.570368, sum: 2139095040
reduceNeighbored gpu reduce time: 0.231840, sum: 2139095040, grid, block (32768 512)
reduceNeighboredLess gpu reduce time: 0.179328, sum: 2139095040, grid, block (32768 512)
reduceInterleaved gpu reduce time: 0.116288, sum: 2139095040, grid, block (32768 512)
reduceUnrolling2 gpu reduce time: 0.075360, sum: 2139095040, grid, block (16384 512)
Even this simple unrolling makes the kernel roughly 3x faster than the initial reduceNeighbored version. Can unrolling further buy even more performance?
// the kernel
__global__ void reduceUnrolling4(int *g_idata, int *g_odata, unsigned int n)
{
    unsigned int tid = threadIdx.x;
    int *idata = g_idata + blockIdx.x * blockDim.x * 4;   // each block now covers 4 * blockDim.x elements
    int idx = threadIdx.x + blockIdx.x * blockDim.x * 4;
    if (idx >= n) return;                                 // boundary check

    // unrolling 4 data blocks
    if (idx + 3 * blockDim.x < n)
    {
        g_idata[idx] += g_idata[idx + 1 * blockDim.x];
        g_idata[idx] += g_idata[idx + 2 * blockDim.x];
        g_idata[idx] += g_idata[idx + 3 * blockDim.x];
    }
    __syncthreads();

    // (starting the stride at blockDim.x / 4 here turns out to be wrong -- see below)
    for (int stride = blockDim.x / 4; stride > 0; stride >>= 1)
    {
        if (tid < stride)
        {
            idata[tid] += idata[tid + stride];
        }
        __syncthreads();
    }

    if (tid == 0)
    {
        g_odata[blockIdx.x] = idata[0];
    }
}
// the call in main
cudaMemcpy(d_idata, h_idata, nBytes, cudaMemcpyHostToDevice);
cudaDeviceSynchronize();
timer.start();
reduceUnrolling4<<<grid.x / 4, block>>>(d_idata, d_odata, size);
cudaDeviceSynchronize();
timer.stop();
float elapsedTime6 = timer.elapsedms();
cudaMemcpy(h_odata, d_odata, grid.x / 4 * sizeof(int), cudaMemcpyDeviceToHost);
gpu_sum = 0;
for (int i = 0; i < grid.x / 4; i++)
{
    gpu_sum += h_odata[i];
}
printf("reduceUnrolling4 gpu reduce time: %f, sum: %d, grid, block (%d %d)\n",
       elapsedTime6, gpu_sum, grid.x / 4, block.x);
Output:
sum value is : 2139095040
cpu reduce time: 10.978304, sum: 2139095040
warm up reduce time: 5.448128, sum: 2139095040
reduceNeighbored gpu reduce time: 0.234400, sum: 2139095040, grid, block (32768 512)
reduceNeighboredLess gpu reduce time: 0.279584, sum: 2139095040, grid, block (32768 512)
reduceInterleaved gpu reduce time: 0.134304, sum: 2139095040, grid, block (32768 512)
reduceUnrolling2 gpu reduce time: 0.104128, sum: 2139095040, grid, block (16384 512)
reduceUnrolling4 gpu reduce time: 0.084064, sum: 1069547520, grid, block (8192 512)
The sum is now half the correct value, which puzzled me at first.
Mystery solved: the inner reduction loop must still start at stride = blockDim.x / 2. That 2 comes from the interleaved pairwise reduction, where the active range halves every iteration, not from the unrolling factor. After the unrolling step every one of the blockDim.x threads holds a partial sum, so starting the stride at blockDim.x / 4 means the upper half of those partial sums (idata[blockDim.x / 2] through idata[blockDim.x - 1]) is never folded in, which is exactly why the total came out halved.
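A tiny host-side sketch makes this concrete (hypothetical numbers: a single "block" of blockDim = 8 partial sums, all equal to 1):

#include <stdio.h>

// serial model of the in-block reduction loop, parameterised by the first stride
static int reduceBlock(int *data, int firstStride)
{
    for (int stride = firstStride; stride > 0; stride >>= 1)
        for (int tid = 0; tid < stride; tid++)
            data[tid] += data[tid + stride];
    return data[0];
}

int main(void)
{
    int a[8] = {1, 1, 1, 1, 1, 1, 1, 1};
    int b[8] = {1, 1, 1, 1, 1, 1, 1, 1};
    printf("start at blockDim/2: %d\n", reduceBlock(a, 8 / 2));  // prints 8 (correct)
    printf("start at blockDim/4: %d\n", reduceBlock(b, 8 / 4));  // prints 4 (halved)
    return 0;
}

With that understood, the corrected kernel: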
__global__ void reduceUnrolling4(int *g_idata, int *g_odata, unsigned int n)
{
    unsigned int tid = threadIdx.x;
    int *idata = g_idata + blockIdx.x * blockDim.x * 4;   // each block now covers 4 * blockDim.x elements
    int idx = threadIdx.x + blockIdx.x * blockDim.x * 4;
    if (idx >= n) return;                                 // boundary check

    // unrolling 4 data blocks
    if (idx + 3 * blockDim.x < n)
    {
        g_idata[idx] += g_idata[idx + 1 * blockDim.x];
        g_idata[idx] += g_idata[idx + 2 * blockDim.x];
        g_idata[idx] += g_idata[idx + 3 * blockDim.x];
    }
    __syncthreads();

    // the key: keep blockDim.x / 2 -- this 2 comes from the interleaved reduction, not from the unrolling factor
    for (int stride = blockDim.x / 2; stride > 0; stride >>= 1)
    {
        if (tid < stride)
        {
            idata[tid] += idata[tid + stride];
        }
        __syncthreads();
    }

    if (tid == 0)
    {
        g_odata[blockIdx.x] = idata[0];
    }
}
Output after the fix:
reduceUnrolling2 gpu reduce time: 0.244896, sum: 2139095040, grid, block (16384 512)
reduceUnrolling4 gpu reduce time: 0.074560, sum: 2139095040, grid, block (8192 512)
reduceUnrolling8 gpu reduce time: 0.050816, sum: 2139095040, grid, block (4096 512)
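The reduceUnrolling8 line above comes from an 8-way unrolled kernel whose code this post does not list. The following is a sketch reconstructed from the same pattern as reduceUnrolling2 and reduceUnrolling4, not necessarily the exact code behind those numbers:

__global__ void reduceUnrolling8(int *g_idata, int *g_odata, unsigned int n)
{
    unsigned int tid = threadIdx.x;
    int *idata = g_idata + blockIdx.x * blockDim.x * 8;   // each block now covers 8 * blockDim.x elements
    int idx = threadIdx.x + blockIdx.x * blockDim.x * 8;
    if (idx >= n) return;                                 // boundary check

    // unrolling 8 data blocks: fold 7 neighbouring data blocks into this one
    if (idx + 7 * blockDim.x < n)
    {
        g_idata[idx] += g_idata[idx +     blockDim.x] + g_idata[idx + 2 * blockDim.x]
                      + g_idata[idx + 3 * blockDim.x] + g_idata[idx + 4 * blockDim.x]
                      + g_idata[idx + 5 * blockDim.x] + g_idata[idx + 6 * blockDim.x]
                      + g_idata[idx + 7 * blockDim.x];
    }
    __syncthreads();

    // in-place reduction, same as before
    for (int stride = blockDim.x / 2; stride > 0; stride >>= 1)
    {
        if (tid < stride) idata[tid] += idata[tid + stride];
        __syncthreads();
    }

    if (tid == 0) g_odata[blockIdx.x] = idata[0];
}

It would be launched as reduceUnrolling8<<<grid.x / 8, block>>>(d_idata, d_odata, size), with the host-side sum running over grid.x / 8 partial results.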
3.3.2 Reduction with Unrolled Warps
__syncthreads provides synchronization within a block. In the reduction kernels it guarantees that, before any thread moves on to the next round, every thread in the current round has written its partial result to global memory.
Now consider the point where only 32 or fewer threads, that is, a single warp, remain active. Because a warp executes in SIMT (single instruction, multiple threads) fashion, there is an implicit intra-warp synchronization after each instruction. The last 6 iterations of the reduction loop can therefore be unrolled into straight-line statements, avoiding the loop-control and thread-synchronization overhead. Note that vmem is declared with the volatile qualifier, which tells the compiler that every assignment must actually be stored back to global memory rather than kept in a register, so each read of vmem[tid] sees the latest value.
__global__ void reduceUnrolling8Warp(int *g_idata, int *g_odata, unsigned int n)
{
    unsigned int tid = threadIdx.x;
    int *idata = g_idata + blockIdx.x * blockDim.x * 8;   // each block now covers 8 * blockDim.x elements
    int idx = threadIdx.x + blockIdx.x * blockDim.x * 8;
    if (idx >= n) return;                                 // boundary check

    // unrolling 8 data blocks
    if (idx + 7 * blockDim.x < n)
    {
        g_idata[idx] += (g_idata[idx +     blockDim.x] + g_idata[idx + 2 * blockDim.x]
                       + g_idata[idx + 3 * blockDim.x] + g_idata[idx + 4 * blockDim.x]
                       + g_idata[idx + 5 * blockDim.x] + g_idata[idx + 6 * blockDim.x]
                       + g_idata[idx + 7 * blockDim.x]);
    }
    __syncthreads();

    // the loop now stops at stride > 32 instead of stride > 0
    for (int stride = blockDim.x / 2; stride > 32; stride >>= 1)
    {
        if (tid < stride)
        {
            idata[tid] += idata[tid + stride];
        }
        __syncthreads();
    }

    // unroll the last warp by hand
    if (tid < 32)
    {
        volatile int *vmem = idata;
        // How do we know each line finishes before the next? On pre-Volta GPUs the warp runs
        // in lock step and volatile forces every store out to memory; on Volta and later this
        // pattern needs explicit warp synchronization (see the note below).
        vmem[tid] += vmem[tid + 32];
        vmem[tid] += vmem[tid + 16];
        vmem[tid] += vmem[tid + 8];
        vmem[tid] += vmem[tid + 4];
        vmem[tid] += vmem[tid + 2];
        vmem[tid] += vmem[tid + 1];
    }

    if (tid == 0)
    {
        g_odata[blockIdx.x] = idata[0];
    }
}
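About the in-code question: on Volta and newer GPUs, threads within a warp can be scheduled independently, so the implicit lock-step ordering the book relies on is no longer guaranteed. One common replacement for the last-warp tail uses warp shuffle intrinsics (CUDA 9+). The sketch below assumes blockDim.x >= 64 and that no thread of the first warp took the early return, so all 32 lanes reach the shuffle:

// replacement for the "if (tid < 32)" tail above
if (tid < 32)
{
    // each of the 32 lanes folds in its partner from the upper half of the block
    int val = idata[tid] + idata[tid + 32];

    // tree reduction within the warp; the data stays in registers from here on
    for (int offset = 16; offset > 0; offset >>= 1)
        val += __shfl_down_sync(0xffffffff, val, offset);

    if (tid == 0) g_odata[blockIdx.x] = val;
}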
The book says this version should be faster than reduceUnrolling8, but in most of my runs it comes out slightly slower:
reduceUnrolling8 gpu reduce time: 0.047424, sum: 2139095040, grid, block (4096 512)
reduceUnrolling8Warp gpu reduce time: 0.054528, sum: 2139095040, grid, block (4096 512)