Getting Started with C++ CUDA in VSCode: The N+1 Levels of reduce

  1. Background
    Reduce is a building block of nearly all multithreading techniques and sits at the core of fields such as deep learning: it shows up in operations as simple as convolution and as complex as gradient aggregation and distributed training. Implementing reduce in CUDA, and then optimizing it, is a good entry point for understanding how CUDA software maps onto the hardware.

Hardware environment: (shown as a screenshot in the original post; the exact GPU model is not recoverable from the text)

  2. Results
chapter2 reduce test
sum of 16777216 random nums, host: 8389334.731, 18.6554 ms, GPU 0: 8389334.000, 4.3356 ms 
sum of 16777216 random nums, host: 8389334.731, 18.6554 ms, GPU 1: 8389335.000, 1.3475 ms 
sum of 16777216 random nums, host: 8389334.731, 18.6554 ms, GPU 2: 8389335.000, 1.3096 ms 
sum of 16777216 random nums, host: 8389334.731, 18.6554 ms, GPU 3: 8389335.000, 1.3098 ms 
sum of 16777216 random nums, host: 8389334.731, 18.6554 ms, GPU 4: 8389335.000, 1.3093 ms 
sum of 16777216 random nums, host: 8389334.731, 18.6554 ms, GPU 5: 8389335.000, 1.3119 ms 
sum of 16777216 random nums, host: 8389334.731, 18.6554 ms, GPU 6: 8389335.000, 1.3132 ms 
sum of 16777216 random nums, host: 8389334.731, 18.6554 ms, GPU 7: 8389335.000, 1.3157 ms 
sum of 16777216 random nums, host: 8389334.731, 18.6554 ms, L1 v7: 8389335.000, 1.3086 ms 
sum of 16777216 random nums, host: 8389334.731, 18.6554 ms, L1 Co: 8389335.000, 2.6103 ms 
sum of 16777216 random nums, host: 8389334.731, 18.6554 ms, L any: 8389335.000, 1.6096 ms 
sum of 16777216 random nums, host: 8389334.731, 18.6554 ms, GPU 8: 8389335.000, 1.3160 ms 
  3. Source code
#include"../include/utils/cx.h"
#include"../include/utils/cxtimers.h"
#include"cooperative_groups.h" // for cg namespace
#include"cooperative_groups/reduce.h" // for cg::reduce
#include"../include/utils/helper_math.h" // for overload += operator for reinterpret (CYDA SDK)
#include<random>namespace cg = cooperative_groups;__global__ void reduce0(float* x, int n) {int tid = blockDim.x * blockIdx.x + threadIdx.x;x[tid] += x[tid+n];
}__global__ void reduce1(float *x,int N)
{int tid = blockDim.x*blockIdx.x+threadIdx.x;float tsum = 0.0f;int stride = gridDim.x*blockDim.x;for(int k=tid; k<N; k += stride) tsum += x[k];x[tid] = tsum;
}__global__ void reduce2(float *y,float *x,int N)
{extern __shared__ float tsum[]; // Dynamically Allocated Shared Memint id = threadIdx.x;int tid = blockDim.x*blockIdx.x+threadIdx.x;int stride = gridDim.x*blockDim.x;tsum[id] = 0.0f;for(int k=tid;k<N;k+=stride) tsum[id] += x[k];__syncthreads();for(int k=blockDim.x/2; k>0; k /= 2){ // power of 2 reduction loopif(id<k) tsum[id] += tsum[id+k];__syncthreads();}if(id==0) y[blockIdx.x] = tsum[0]; // store one value per block
}__global__ void reduce3(float *y,float *x,int N)
{extern __shared__ float tsum[];int id = threadIdx.x;int tid = blockDim.x*blockIdx.x+threadIdx.x;int stride = gridDim.x*blockDim.x;tsum[id] = 0.0f;for(int k=tid;k<N;k+=stride) tsum[id] += x[k];__syncthreads();int block2 = cx::pow2ceil(blockDim.x); // next higher power of 2for(int k=block2/2; k>0; k >>= 1){     // power of 2 reduction loopif(id<k && id+k < blockDim.x) tsum[id] += tsum[id+k];__syncthreads();}if(id==0) y[blockIdx.x] = tsum[0]; // store one value per block
}__global__ void reduce4(float * y,float * x,int N)
{extern __shared__ float tsum[];int id = threadIdx.x;int tid = blockDim.x*blockIdx.x+threadIdx.x;int stride = gridDim.x*blockDim.x;tsum[id] = 0.0f;for(int k=tid;k<N;k+=stride) tsum[id] += x[k];__syncthreads();if(id<256 && id+256 < blockDim.x) tsum[id] += tsum[id+256]; __syncthreads();if(id<128) tsum[id] += tsum[id+128]; __syncthreads();if(id< 64) tsum[id] += tsum[id+ 64]; __syncthreads();if(id< 32) tsum[id] += tsum[id+ 32]; __syncthreads();// warp 0 only from hereif(id< 16) tsum[id] += tsum[id+16]; __syncwarp();if(id< 8)  tsum[id] += tsum[id+ 8]; __syncwarp();if(id< 4)  tsum[id] += tsum[id+ 4]; __syncwarp();if(id< 2)  tsum[id] += tsum[id+ 2]; __syncwarp();if(id==0)  y[blockIdx.x] = tsum[0]+tsum[1];
}template <int blockSize>
__global__ void reduce5(r_Ptr<float> sums,cr_Ptr<float> data,int n)
{// This template kernel assumes that blockDim.x = blockSize, // that blockSize is power of 2 between 64 and 1024 __shared__ float s[blockSize];int id = threadIdx.x;       // rank in block   s[id] = 0;for(int tid = blockSize*blockIdx.x+threadIdx.x;tid < n; tid += blockSize*gridDim.x) s[id] += data[tid];__syncthreads();if(blockSize > 512 && id < 512 && id+512 < blockSize)s[id] += s[id+512];__syncthreads();if(blockSize > 256 && id < 256 && id+256 < blockSize)s[id] += s[id+256];__syncthreads();if(blockSize > 128 && id < 128 && id+128 < blockSize)s[id] += s[id+128];__syncthreads();if(blockSize >  64 && id <  64 && id+ 64 < blockSize)s[id] += s[id+64];__syncthreads();if(id < 32) {  //  single warp from heres[id]             += s[id + 32]; __syncwarp(); // the syncwarpsif(id < 16) s[id] += s[id + 16]; __syncwarp(); // are requiredif(id <  8) s[id] += s[id +  8]; __syncwarp(); // for devices ofif(id <  4) s[id] += s[id +  4]; __syncwarp(); // CC = 7 and aboveif(id <  2) s[id] += s[id +  2]; __syncwarp(); // for CC < 7if(id <  1) s[id] += s[id +  1]; __syncwarp(); // they do nothingif(id == 0) sums[blockIdx.x] = s[0]; // store block sum}
}
// using warp shuffle (shfl) functions
template <int blockSize>
__global__ void reduce6(r_Ptr<float> sums, cr_Ptr<float> data, int n)
{
    __shared__ float s[blockSize];
    auto grid  = cg::this_grid();
    auto block = cg::this_thread_block();
    auto warp  = cg::tiled_partition<32>(block);
    int id = block.thread_rank();
    s[id] = 0.0f;
    for (int tid = grid.thread_rank(); tid < n; tid += grid.size()) {
        s[id] += data[tid];
    }
    block.sync();
    if (blockSize > 512 && id < 512 && id + 512 < blockSize) { s[id] += s[id + 512]; }
    block.sync();
    if (blockSize > 256 && id < 256 && id + 256 < blockSize) { s[id] += s[id + 256]; }
    block.sync();
    if (blockSize > 128 && id < 128 && id + 128 < blockSize) { s[id] += s[id + 128]; }
    block.sync();
    if (blockSize >  64 && id <  64 && id +  64 < blockSize) { s[id] += s[id + 64]; }
    block.sync();
    if (warp.meta_group_rank() == 0) {
        s[id] += s[id + 32];
        warp.sync();
        s[id] += warp.shfl_down(s[id], 16);
        s[id] += warp.shfl_down(s[id], 8);
        s[id] += warp.shfl_down(s[id], 4);
        s[id] += warp.shfl_down(s[id], 2);
        s[id] += warp.shfl_down(s[id], 1);
        if (id == 0) { sums[blockIdx.x] = s[0]; }
    }
}

// warp-only reduce function
__global__ void reduce7(r_Ptr<float> sums, cr_Ptr<float> data, int n)
{
    // This kernel assumes the array sums is set to zeros on entry
    // and that blockSize is a multiple of 32 (should always be true)
    auto grid  = cg::this_grid();
    auto block = cg::this_thread_block();
    auto warp  = cg::tiled_partition<32>(block);
    float v = 0.0f; // accumulate thread sums in register variable v
    for (int tid = grid.thread_rank(); tid < n; tid += grid.size()) v += data[tid];
    warp.sync();
    v += warp.shfl_down(v, 16); // |
    v += warp.shfl_down(v, 8);  // | warp-level
    v += warp.shfl_down(v, 4);  // | reduce here
    v += warp.shfl_down(v, 2);  // |
    v += warp.shfl_down(v, 1);  // |
    // use atomicAdd here to sum over warps
    if (warp.thread_rank() == 0) atomicAdd(&sums[block.group_index().x], v);
}

// warp-only and L1 cache function
__global__ void reduce7_L1(r_Ptr<float> sums, cr_Ptr<float> data, int n)
{
    auto grid  = cg::this_grid();
    auto block = cg::this_thread_block();
    auto warp  = cg::tiled_partition<32>(block);
    float4 v4 = {0.0f, 0.0f, 0.0f, 0.0f};
    for (int tid = grid.thread_rank(); tid < n / 4; tid += grid.size()) {
        v4 += reinterpret_cast<const float4*>(data)[tid];
    }
    float v = v4.x + v4.y + v4.z + v4.w;
    warp.sync();
    v += warp.shfl_down(v, 16);
    v += warp.shfl_down(v, 8);
    v += warp.shfl_down(v, 4);
    v += warp.shfl_down(v, 2);
    v += warp.shfl_down(v, 1);
    if (warp.thread_rank() == 0) { atomicAdd(&sums[block.group_index().x], v); }
}

__device__ void reduce7_L1_coal(r_Ptr<float> sums, cr_Ptr<float> data, int n)
{
    // This function assumes that a.size() is a power of 2 in [1,32]
    // and that n is a multiple of 4
    auto g = cg::this_grid();
    auto b = cg::this_thread_block();
    auto a = cg::coalesced_threads(); // active threads in warp
    float4 v4 = {0, 0, 0, 0};
    for (int tid = g.thread_rank(); tid < n / 4; tid += g.size())
        v4 += reinterpret_cast<const float4*>(data)[tid];
    float v = v4.x + v4.y + v4.z + v4.w;
    a.sync();
    if (a.size() > 16) v += a.shfl_down(v, 16); // NB no new
    if (a.size() >  8) v += a.shfl_down(v, 8);  // thread
    if (a.size() >  4) v += a.shfl_down(v, 4);  // divergence
    if (a.size() >  2) v += a.shfl_down(v, 2);  // allowed
    if (a.size() >  1) v += a.shfl_down(v, 1);  // here
    // rank here is within the coalesced group, therefore
    // rank==0 is OK even for odd-only threads
    if (a.thread_rank() == 0) atomicAdd(&sums[b.group_index().x], v);
}

__global__ void reduce7_coal_even_odd(r_Ptr<float> sumeven, r_Ptr<float> sumodd, cr_Ptr<float> data, int n)
{
    // divergent code here
    if (threadIdx.x % 2 == 0) reduce7_L1_coal(sumeven, data, n);
    else                      reduce7_L1_coal(sumodd,  data, n);
}

// reduce L1 coal any
__device__ void reduce7_L1_coal_any(r_Ptr<float> sums, cr_Ptr<float> data, int n)
{
    // This function works for any value of a.size() in [1,32];
    // it assumes that n is a multiple of 4
    auto g = cg::this_grid();
    auto b = cg::this_thread_block();
    auto w = cg::tiled_partition<32>(b); // whole warp
    auto a = cg::coalesced_threads();    // active threads in warp
    int warps = g.group_dim().x * w.meta_group_size(); // number of warps in grid
    // divide data into contiguous parts, with one part per warp
    int part_size  = ((n / 4) + warps - 1) / warps;
    int part_start = (b.group_index().x * w.meta_group_size() + w.meta_group_rank()) * part_size;
    int part_end   = min(part_start + part_size, n / 4);
    // get part sub-sums into threads of a
    float4 v4 = {0, 0, 0, 0};
    int id = a.thread_rank();
    for (int k = part_start + id; k < part_end; k += a.size()) // adjacent adds within
        v4 += reinterpret_cast<const float4*>(data)[k];        // the warp
    float v = v4.x + v4.y + v4.z + v4.w;
    a.sync();
    // now reduce over a
    // first deal with items held by ranks >= kstart
    int kstart = 1 << (31 - __clz(a.size())); // max power of 2 <= a.size()
    if (a.size() > kstart) {
        float w = a.shfl_down(v, kstart);
        if (a.thread_rank() < a.size() - kstart) v += w; // only update v for
        a.sync();                                        // valid low-ranking threads
    }
    // then do power of 2 reduction
    for (int k = kstart / 2; k > 0; k /= 2) v += a.shfl_down(v, k);
    if (a.thread_rank() == 0) atomicAdd(&sums[b.group_index().x], v);
}

__global__ void reduce7_any(r_Ptr<float> sums, cr_Ptr<float> data, int n)
{
    if (threadIdx.x % 3 == 0) reduce7_L1_coal_any(sums, data, n);
}

// cg warp-level function
__global__ void reduce8(r_Ptr<float> sums, cr_Ptr<float> data, int n)
{
    auto grid  = cg::this_grid();
    auto block = cg::this_thread_block();
    auto warp  = cg::tiled_partition<32>(block);
    float v = 0.0f;
    for (int tid = grid.thread_rank(); tid < n; tid += grid.size()) {
        v += data[tid];
    }
    v = cg::reduce(warp, v, cg::plus<float>());
    warp.sync();
    if (warp.thread_rank() == 0) {
        atomicAdd(&sums[block.group_index().x], v);
    }
}

void test_reduce(const int N)
{
    // const int N = 1 << 24;
    const int blocks  = 256;
    const int threads = 256;
    const int nreps   = 1000;
    thrust::host_vector<float>   x(N);
    thrust::device_vector<float> dx(N);

    // init data
    std::default_random_engine gen(42);
    std::uniform_real_distribution<float> fran(0.0, 1.0);
    for (int k = 0; k < N; k++) { x[k] = fran(gen); }

    // host to device
    dx = x;
    cx::timer tim;

    // cpu time test
    double host_sum = 0.0;
    for (int k = 0; k < N; k++) { host_sum += x[k]; }
    double t1 = tim.lap_ms();

    // gpu test reduce0
    double gpu_sum = 0.0;
    tim.reset();
    for (int m = N / 2; m > 0; m /= 2) {
        int threads = std::min(256, m);
        int blocks  = std::max(m / 256, 1);
        reduce0<<<blocks, threads>>>(dx.data().get(), m);
    }
    cudaDeviceSynchronize();
    double t2 = tim.lap_ms();
    // device to host
    gpu_sum = dx[0];
    printf("sum of %d random nums, host: %.3f, %.4f ms, GPU 0: %.3f, %.4f ms \n", N, host_sum, t1, gpu_sum, t2);

    // gpu test reduce1
    dx = x;
    tim.reset();
    for (int rep = 0; rep < nreps; rep++) {
        reduce1<<<blocks, threads>>>(dx.data().get(), N);
        reduce1<<<1, threads>>>(dx.data().get(), blocks * threads);
        reduce1<<<1, 1>>>(dx.data().get(), threads);
        if (rep == 0) gpu_sum = dx[0];
    }
    cudaDeviceSynchronize();
    double t3 = tim.lap_ms() / nreps;
    printf("sum of %d random nums, host: %.3f, %.4f ms, GPU 1: %.3f, %.4f ms \n", N, host_sum, t1, gpu_sum, t3);

    // gpu test reduce2
    dx = x;
    thrust::device_vector<float> dy(blocks);
    tim.reset();
    for (int rep = 0; rep < nreps; rep++) {
        reduce2<<<blocks, threads, threads * sizeof(float)>>>(dy.data().get(), dx.data().get(), N);
        reduce2<<<1, threads, blocks * sizeof(float)>>>(dx.data().get(), dy.data().get(), blocks);
        if (rep == 0) gpu_sum = dx[0];
    }
    cudaDeviceSynchronize();
    double t4 = tim.lap_ms() / nreps;
    printf("sum of %d random nums, host: %.3f, %.4f ms, GPU 2: %.3f, %.4f ms \n", N, host_sum, t1, gpu_sum, t4);

    // gpu test reduce3
    dx = x;
    tim.reset();
    for (int rep = 0; rep < nreps; rep++) {
        reduce3<<<blocks, threads, threads * sizeof(float)>>>(dy.data().get(), dx.data().get(), N);
        reduce3<<<1, threads, blocks * sizeof(float)>>>(dx.data().get(), dy.data().get(), blocks);
        if (rep == 0) gpu_sum = dx[0];
    }
    cudaDeviceSynchronize();
    double t5 = tim.lap_ms() / nreps;
    printf("sum of %d random nums, host: %.3f, %.4f ms, GPU 3: %.3f, %.4f ms \n", N, host_sum, t1, gpu_sum, t5);

    // gpu test reduce4
    dx = x;
    tim.reset();
    for (int rep = 0; rep < nreps; rep++) {
        reduce4<<<blocks, threads, threads * sizeof(float)>>>(dy.data().get(), dx.data().get(), N);
        reduce4<<<1, threads, blocks * sizeof(float)>>>(dx.data().get(), dy.data().get(), blocks);
        if (rep == 0) gpu_sum = dx[0];
    }
    cudaDeviceSynchronize();
    double t6 = tim.lap_ms() / nreps;
    printf("sum of %d random nums, host: %.3f, %.4f ms, GPU 4: %.3f, %.4f ms \n", N, host_sum, t1, gpu_sum, t6);

    // gpu test reduce5
    dx = x;
    tim.reset();
    for (int rep = 0; rep < nreps; rep++) {
        reduce5<256><<<blocks, threads, threads * sizeof(float)>>>(dy.data().get(), dx.data().get(), N);
    }
    reduce4<<<1, blocks, blocks * sizeof(float)>>>(dx.data().get(), dy.data().get(), blocks);
    cudaDeviceSynchronize();
    double t7 = tim.lap_ms() / nreps;
    gpu_sum = dx[0];
    printf("sum of %d random nums, host: %.3f, %.4f ms, GPU 5: %.3f, %.4f ms \n", N, host_sum, t1, gpu_sum, t7);

    // gpu test reduce6
    dx = x;
    tim.reset();
    for (int rep = 0; rep < nreps; rep++) {
        reduce6<256><<<blocks, threads, threads * sizeof(float)>>>(dy.data().get(), dx.data().get(), N);
    }
    reduce4<<<1, blocks, blocks * sizeof(float)>>>(dx.data().get(), dy.data().get(), blocks);
    cudaDeviceSynchronize();
    double t8 = tim.lap_ms() / nreps;
    gpu_sum = dx[0];
    printf("sum of %d random nums, host: %.3f, %.4f ms, GPU 6: %.3f, %.4f ms \n", N, host_sum, t1, gpu_sum, t8);

    // gpu test reduce7
    dx = x;
    thrust::device_vector<float> dz(blocks);
    tim.reset();
    for (int rep = 0; rep < nreps; rep++) {
        reduce7<<<blocks, threads, threads * sizeof(float)>>>(dz.data().get(), dx.data().get(), N);
        reduce7<<<1, blocks, blocks * sizeof(float)>>>(dx.data().get(), dz.data().get(), blocks);
        if (rep == 0) gpu_sum = dx[0];
    }
    cudaDeviceSynchronize();
    double t9 = tim.lap_ms() / nreps;
    printf("sum of %d random nums, host: %.3f, %.4f ms, GPU 7: %.3f, %.4f ms \n", N, host_sum, t1, gpu_sum, t9);

    // gpu test reduce7_L1
    dx = x;
    thrust::fill(dz.begin(), dz.end(), 0.0);
    tim.reset();
    for (int rep = 0; rep < nreps; rep++) {
        reduce7_L1<<<blocks, threads, threads * sizeof(float)>>>(dz.data().get(), dx.data().get(), N);
        reduce7_L1<<<1, blocks, blocks * sizeof(float)>>>(dx.data().get(), dz.data().get(), blocks);
        if (rep == 0) gpu_sum = dx[0];
    }
    cudaDeviceSynchronize();
    double t91 = tim.lap_ms() / nreps;
    printf("sum of %d random nums, host: %.3f, %.4f ms, L1 v7: %.3f, %.4f ms \n", N, host_sum, t1, gpu_sum, t91);

    // gpu test reduce7_L1 coal
    thrust::device_vector<float> dy_even(blocks); // only even elements are used
    thrust::device_vector<float> dy_odd(blocks);  // only odd elements are used
    dx = x;
    tim.reset();
    for (int rep = 0; rep < nreps; rep++) {
        reduce7_coal_even_odd<<<blocks, threads>>>(dy_even.data().get(), dy_odd.data().get(), dx.data().get(), N);
        if (rep == 0) {
            // use reduce7_L1 for final step
            // dx[0] = 0.0f; // clear output buffer
            reduce7_L1<<<1, blocks>>>(dx.data().get(), dy_even.data().get(), blocks);
            reduce7_L1<<<1, blocks>>>(dx.data().get(), dy_odd.data().get(), blocks);
            gpu_sum = dx[0];
        }
    }
    cudaDeviceSynchronize();
    double t92 = tim.lap_ms() / nreps;
    // gpu_sum = dx[0];  // D2H copy (1 word)
    printf("sum of %d random nums, host: %.3f, %.4f ms, L1 Co: %.3f, %.4f ms \n", N, host_sum, t1, gpu_sum, t92);

    // gpu test reduce7 L1 coal any
    dx = x;
    thrust::fill(dy.begin(), dy.end(), 0.0);
    tim.reset();
    for (int rep = 0; rep < nreps; rep++) {
        reduce7_any<<<blocks, threads>>>(dy.data().get(), dx.data().get(), N);
        if (rep == 0) {
            reduce7_L1<<<1, blocks>>>(dx.data().get(), dy.data().get(), blocks);
            gpu_sum = dx[0];
        }
    }
    cudaDeviceSynchronize();
    double t93 = tim.lap_ms() / nreps;
    printf("sum of %d random nums, host: %.3f, %.4f ms, L any: %.3f, %.4f ms \n", N, host_sum, t1, gpu_sum, t93);

    // gpu test reduce8
    dx = x;
    thrust::fill(dy.begin(), dy.end(), 0.0);
    tim.reset();
    for (int rep = 0; rep < nreps; rep++) {
        reduce8<<<blocks, threads, threads * sizeof(float)>>>(dy.data().get(), dx.data().get(), N);
        reduce8<<<1, blocks, blocks * sizeof(float)>>>(dx.data().get(), dy.data().get(), blocks);
        if (rep == 0) gpu_sum = dx[0];
    }
    cudaDeviceSynchronize();
    double t10 = tim.lap_ms() / nreps;
    printf("sum of %d random nums, host: %.3f, %.4f ms, GPU 8: %.3f, %.4f ms \n", N, host_sum, t1, gpu_sum, t10);
}
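The listing above has no entry point. A minimal, hypothetical main() along the following lines would be enough to drive test_reduce; the problem size 1 << 24 matches the results shown earlier, and the error check at the end is my own addition, not part of the original code:

#include <cstdlib> // std::atoi

int main(int argc, char* argv[])
{
    int N = (argc > 1) ? std::atoi(argv[1]) : (1 << 24); // 16777216 values, as in the results above
    printf("chapter2 reduce test\n");
    test_reduce(N);
    cudaError_t err = cudaGetLastError(); // surface any launch/runtime error
    if (err != cudaSuccess) printf("CUDA error: %s\n", cudaGetErrorString(err));
    return 0;
}

The file would then be compiled with nvcc in the usual way; the exact -arch flag depends on the GPU being used.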
  4. Summary
    1) We tried 9+1 of the most basic reduce approaches in CUDA, from hand-written kernels up to higher-level library features, comparing both accuracy and speed. Compared with the CPU reduce, the GPU versions are clearly much faster, and a better GPU should be faster still. Note the small loss of precision, though: in CUDA programming you should always check whether the error stays within the acceptable tolerance (a small host-side sketch after this list illustrates the effect);
    2) In theory reduce7_L1_coal should be faster than reduce7_L1 and reduce7, but on my local machine it was actually slower; my guess is that this is caused by the limited GPU performance of the machine;
    3) The overall trend of these results is fairly consistent across different machines, with small differences that depend on the GPU architecture, the numbers of blocks/threads, the total amount of data, and so on; in real applications further tuning may be needed.
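On the precision point in 1): the host sum is accumulated in a double while the kernels accumulate in float, which is enough to explain the gap between 8389334.731 (host) and 8389335.000 (GPU) in the results above. A small host-only sketch of the effect (my own illustration, not part of the original code):

#include <cstdio>
#include <random>

int main()
{
    const int N = 1 << 24;
    std::default_random_engine gen(42);
    std::uniform_real_distribution<float> fran(0.0, 1.0);
    double dsum = 0.0;  // double accumulator (what the host test uses)
    float  fsum = 0.0f; // naive serial float accumulator
    for (int k = 0; k < N; k++) {
        float v = fran(gen);
        dsum += v;
        fsum += v;
    }
    // The naive serial float sum drifts far more than the GPU's tree/warp-shuffle
    // reductions, whose pairwise structure keeps the rounding error much smaller.
    printf("double accumulator: %.3f\n", dsum);
    printf("float  accumulator: %.3f\n", fsum);
    return 0;
}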
