cuda入门学习

最近接触cuda 编程,记录一下。

1 工作:实现一个 [0, 100) 的加法

如果用python

sum = 0
for i in range(100):sum+=i
print(sum)

2 cuda 的一些简单的概念

一维情况下大概是这样的
(1个grid * 2个blocks * 4个thread)
在这里插入图片描述

3 代码直接上代码

我把100分为20个blocks ,每个block 有5个threads。

int num_blocks = 20;
int block_size = data_len/num_blocks; // 100/20 = 5
sum_kernel<<<num_blocks, block_size>>>(sum, dev_c, data_len); //将其送入到内核中去

内核函数计算加法

 int tid = blockIdx.x * blockDim.x + threadIdx.x; // blockDim.x =5  

原子相加,相当加了一个锁,保证运算的正确性。

atomicAdd(sum, data[tid]);

4 完整代码

#include <stdio.h>    
#include <stdlib.h>   
#include <cuda_runtime.h>  
// Sums data[0..n-1] into *sum, one element per thread via atomicAdd.
//
// Expected launch layout: 1-D grid, 1-D blocks, with
// gridDim.x * blockDim.x >= n (each thread handles at most one element).
// *sum must be zero-initialized by the caller before the launch.
// The printf calls are debug tracing only (serialized and slow).
__global__ void sum_kernel(int* sum, int* data, int n) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;   // flat global thread index
    int stride = gridDim.x * blockDim.x;               // total thread count (printed for illustration)
    printf("stride=%d, blockIdx.x blockDim.x threadIdx.x [%d, %d, %d] \n", stride, blockIdx.x, blockDim.x, threadIdx.x);
    // Fix: bounds guard. The original read data[tid] unconditionally, which is
    // an out-of-bounds access whenever the grid does not divide n exactly.
    if (tid < n) {
        // atomicAdd serializes concurrent updates to *sum, keeping the total correct.
        atomicAdd(sum, data[tid]);
        // NOTE: the value of *sum printed here is a racy snapshot -- other threads
        // may add between the atomicAdd and this read, so intermediate values vary run to run.
        printf("data[%d] = %d  sum  in kernel:  %d\n", tid, data[tid], *sum);
    }
}
// Host driver: sums the integers 0..99 on the GPU and prints the result (4950).
int main() {
    const int data_len = 100;   // number of integers to sum: 0..99
    int* dev_c = 0;             // device buffer holding the input data
    int* sum = 0;               // device accumulator (a single int)
    cudaError_t cudaStatus;

    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "选择GPU失败,您的电脑上没有GPU");
        return 0;
    }

    cudaStatus = cudaMalloc((void**)&dev_c, data_len * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc dev_c failed: %s\n", cudaGetErrorString(cudaStatus));
        return 1;
    }
    // Fix: the accumulator is ONE int, not data_len ints, and it must be
    // zero-initialized -- the original atomicAdd'ed into uninitialized device
    // memory, producing a garbage total whenever the fresh page was not zero.
    cudaStatus = cudaMalloc((void**)&sum, sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc sum failed: %s\n", cudaGetErrorString(cudaStatus));
        cudaFree(dev_c);
        return 1;
    }
    cudaMemset(sum, 0, sizeof(int));

    // Prepare host input 0..99 and copy it to the device.
    int data_cpu[data_len];
    for (int i = 0; i < data_len; ++i) {
        data_cpu[i] = i;
    }
    cudaStatus = cudaMemcpy(dev_c, data_cpu, sizeof(int) * data_len, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "dev_b复制失败");
    }

    // 20 blocks x 5 threads = 100 threads, one per element.
    int num_blocks = 20;
    int block_size = data_len / num_blocks;   // 100 / 20 = 5
    sum_kernel<<<num_blocks, block_size>>>(sum, dev_c, data_len);
    // Fix: a kernel launch does not return an error directly; check for
    // launch-configuration failures explicitly.
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
    }

    // Blocking cudaMemcpy also synchronizes with the kernel before reading.
    int result = 0;
    cudaMemcpy(&result, sum, sizeof(int), cudaMemcpyDeviceToHost);
    printf("sum = %d\n", result);

    // Fix: the original leaked both device allocations.
    cudaFree(dev_c);
    cudaFree(sum);
    return 0;
}

5 运行结果

stride=100, blockIdx.x blockDim.x threadIdx.x [6, 5, 0]
stride=100, blockIdx.x blockDim.x threadIdx.x [6, 5, 1]
stride=100, blockIdx.x blockDim.x threadIdx.x [6, 5, 2]
stride=100, blockIdx.x blockDim.x threadIdx.x [6, 5, 3]
stride=100, blockIdx.x blockDim.x threadIdx.x [6, 5, 4]
stride=100, blockIdx.x blockDim.x threadIdx.x [18, 5, 0]
stride=100, blockIdx.x blockDim.x threadIdx.x [18, 5, 1]
stride=100, blockIdx.x blockDim.x threadIdx.x [18, 5, 2]
stride=100, blockIdx.x blockDim.x threadIdx.x [18, 5, 3]
stride=100, blockIdx.x blockDim.x threadIdx.x [18, 5, 4]
stride=100, blockIdx.x blockDim.x threadIdx.x [2, 5, 0]
stride=100, blockIdx.x blockDim.x threadIdx.x [2, 5, 1]
stride=100, blockIdx.x blockDim.x threadIdx.x [2, 5, 2]
stride=100, blockIdx.x blockDim.x threadIdx.x [2, 5, 3]
stride=100, blockIdx.x blockDim.x threadIdx.x [2, 5, 4]
stride=100, blockIdx.x blockDim.x threadIdx.x [9, 5, 0]
stride=100, blockIdx.x blockDim.x threadIdx.x [9, 5, 1]
stride=100, blockIdx.x blockDim.x threadIdx.x [9, 5, 2]
stride=100, blockIdx.x blockDim.x threadIdx.x [9, 5, 3]
stride=100, blockIdx.x blockDim.x threadIdx.x [9, 5, 4]
stride=100, blockIdx.x blockDim.x threadIdx.x [8, 5, 0]
stride=100, blockIdx.x blockDim.x threadIdx.x [8, 5, 1]
stride=100, blockIdx.x blockDim.x threadIdx.x [8, 5, 2]
stride=100, blockIdx.x blockDim.x threadIdx.x [8, 5, 3]
stride=100, blockIdx.x blockDim.x threadIdx.x [8, 5, 4]
stride=100, blockIdx.x blockDim.x threadIdx.x [14, 5, 0]
stride=100, blockIdx.x blockDim.x threadIdx.x [14, 5, 1]
stride=100, blockIdx.x blockDim.x threadIdx.x [14, 5, 2]
stride=100, blockIdx.x blockDim.x threadIdx.x [14, 5, 3]
stride=100, blockIdx.x blockDim.x threadIdx.x [14, 5, 4]
stride=100, blockIdx.x blockDim.x threadIdx.x [17, 5, 0]
stride=100, blockIdx.x blockDim.x threadIdx.x [17, 5, 1]
stride=100, blockIdx.x blockDim.x threadIdx.x [17, 5, 2]
stride=100, blockIdx.x blockDim.x threadIdx.x [17, 5, 3]
stride=100, blockIdx.x blockDim.x threadIdx.x [17, 5, 4]
stride=100, blockIdx.x blockDim.x threadIdx.x [11, 5, 0]
stride=100, blockIdx.x blockDim.x threadIdx.x [11, 5, 1]
stride=100, blockIdx.x blockDim.x threadIdx.x [11, 5, 2]
stride=100, blockIdx.x blockDim.x threadIdx.x [11, 5, 3]
stride=100, blockIdx.x blockDim.x threadIdx.x [11, 5, 4]
stride=100, blockIdx.x blockDim.x threadIdx.x [5, 5, 0]
stride=100, blockIdx.x blockDim.x threadIdx.x [5, 5, 1]
stride=100, blockIdx.x blockDim.x threadIdx.x [5, 5, 2]
stride=100, blockIdx.x blockDim.x threadIdx.x [5, 5, 3]
stride=100, blockIdx.x blockDim.x threadIdx.x [5, 5, 4]
stride=100, blockIdx.x blockDim.x threadIdx.x [3, 5, 0]
stride=100, blockIdx.x blockDim.x threadIdx.x [3, 5, 1]
stride=100, blockIdx.x blockDim.x threadIdx.x [3, 5, 2]
stride=100, blockIdx.x blockDim.x threadIdx.x [3, 5, 3]
stride=100, blockIdx.x blockDim.x threadIdx.x [3, 5, 4]
stride=100, blockIdx.x blockDim.x threadIdx.x [15, 5, 0]
stride=100, blockIdx.x blockDim.x threadIdx.x [15, 5, 1]
stride=100, blockIdx.x blockDim.x threadIdx.x [15, 5, 2]
stride=100, blockIdx.x blockDim.x threadIdx.x [15, 5, 3]
stride=100, blockIdx.x blockDim.x threadIdx.x [15, 5, 4]
stride=100, blockIdx.x blockDim.x threadIdx.x [0, 5, 0]
stride=100, blockIdx.x blockDim.x threadIdx.x [0, 5, 1]
stride=100, blockIdx.x blockDim.x threadIdx.x [0, 5, 2]
stride=100, blockIdx.x blockDim.x threadIdx.x [0, 5, 3]
stride=100, blockIdx.x blockDim.x threadIdx.x [0, 5, 4]
stride=100, blockIdx.x blockDim.x threadIdx.x [12, 5, 0]
stride=100, blockIdx.x blockDim.x threadIdx.x [12, 5, 1]
stride=100, blockIdx.x blockDim.x threadIdx.x [12, 5, 2]
stride=100, blockIdx.x blockDim.x threadIdx.x [12, 5, 3]
stride=100, blockIdx.x blockDim.x threadIdx.x [12, 5, 4]
stride=100, blockIdx.x blockDim.x threadIdx.x [7, 5, 0]
stride=100, blockIdx.x blockDim.x threadIdx.x [7, 5, 1]
stride=100, blockIdx.x blockDim.x threadIdx.x [7, 5, 2]
stride=100, blockIdx.x blockDim.x threadIdx.x [7, 5, 3]
stride=100, blockIdx.x blockDim.x threadIdx.x [7, 5, 4]
stride=100, blockIdx.x blockDim.x threadIdx.x [19, 5, 0]
stride=100, blockIdx.x blockDim.x threadIdx.x [19, 5, 1]
stride=100, blockIdx.x blockDim.x threadIdx.x [19, 5, 2]
stride=100, blockIdx.x blockDim.x threadIdx.x [19, 5, 3]
stride=100, blockIdx.x blockDim.x threadIdx.x [19, 5, 4]
stride=100, blockIdx.x blockDim.x threadIdx.x [10, 5, 0]
stride=100, blockIdx.x blockDim.x threadIdx.x [10, 5, 1]
stride=100, blockIdx.x blockDim.x threadIdx.x [10, 5, 2]
stride=100, blockIdx.x blockDim.x threadIdx.x [10, 5, 3]
stride=100, blockIdx.x blockDim.x threadIdx.x [10, 5, 4]
stride=100, blockIdx.x blockDim.x threadIdx.x [4, 5, 0]
stride=100, blockIdx.x blockDim.x threadIdx.x [4, 5, 1]
stride=100, blockIdx.x blockDim.x threadIdx.x [4, 5, 2]
stride=100, blockIdx.x blockDim.x threadIdx.x [4, 5, 3]
stride=100, blockIdx.x blockDim.x threadIdx.x [4, 5, 4]
stride=100, blockIdx.x blockDim.x threadIdx.x [16, 5, 0]
stride=100, blockIdx.x blockDim.x threadIdx.x [16, 5, 1]
stride=100, blockIdx.x blockDim.x threadIdx.x [16, 5, 2]
stride=100, blockIdx.x blockDim.x threadIdx.x [16, 5, 3]
stride=100, blockIdx.x blockDim.x threadIdx.x [16, 5, 4]
stride=100, blockIdx.x blockDim.x threadIdx.x [1, 5, 0]
stride=100, blockIdx.x blockDim.x threadIdx.x [1, 5, 1]
stride=100, blockIdx.x blockDim.x threadIdx.x [1, 5, 2]
stride=100, blockIdx.x blockDim.x threadIdx.x [1, 5, 3]
stride=100, blockIdx.x blockDim.x threadIdx.x [1, 5, 4]
stride=100, blockIdx.x blockDim.x threadIdx.x [13, 5, 0]
stride=100, blockIdx.x blockDim.x threadIdx.x [13, 5, 1]
stride=100, blockIdx.x blockDim.x threadIdx.x [13, 5, 2]
stride=100, blockIdx.x blockDim.x threadIdx.x [13, 5, 3]
stride=100, blockIdx.x blockDim.x threadIdx.x [13, 5, 4]
data[25] = 25  sum  in kernel:  135
data[26] = 26  sum  in kernel:  135
data[27] = 27  sum  in kernel:  135
data[28] = 28  sum  in kernel:  135
data[29] = 29  sum  in kernel:  135
data[40] = 40  sum  in kernel:  345
data[41] = 41  sum  in kernel:  345
data[42] = 42  sum  in kernel:  345
data[43] = 43  sum  in kernel:  345
data[44] = 44  sum  in kernel:  345
data[45] = 45  sum  in kernel:  740
data[46] = 46  sum  in kernel:  740
data[47] = 47  sum  in kernel:  740
data[48] = 48  sum  in kernel:  740
data[49] = 49  sum  in kernel:  740
data[30] = 30  sum  in kernel:  740
data[31] = 31  sum  in kernel:  740
data[32] = 32  sum  in kernel:  740
data[33] = 33  sum  in kernel:  740
data[34] = 34  sum  in kernel:  740
data[50] = 50  sum  in kernel:  1110
data[51] = 51  sum  in kernel:  1110
data[52] = 52  sum  in kernel:  1110
data[53] = 53  sum  in kernel:  1110
data[54] = 54  sum  in kernel:  1110
data[85] = 85  sum  in kernel:  1545
data[86] = 86  sum  in kernel:  1545
data[87] = 87  sum  in kernel:  1545
data[88] = 88  sum  in kernel:  1545
data[89] = 89  sum  in kernel:  1545
data[55] = 55  sum  in kernel:  1830
data[56] = 56  sum  in kernel:  1830
data[57] = 57  sum  in kernel:  1830
data[58] = 58  sum  in kernel:  1830
data[59] = 59  sum  in kernel:  1830
data[20] = 20  sum  in kernel:  1110
data[21] = 21  sum  in kernel:  1110
data[22] = 22  sum  in kernel:  1110
data[23] = 23  sum  in kernel:  1110
data[24] = 24  sum  in kernel:  1110
data[90] = 90  sum  in kernel:  2290
data[91] = 91  sum  in kernel:  2290
data[92] = 92  sum  in kernel:  2290
data[93] = 93  sum  in kernel:  2290
data[94] = 94  sum  in kernel:  2290
data[10] = 10  sum  in kernel:  3155
data[11] = 11  sum  in kernel:  3155
data[12] = 12  sum  in kernel:  3155
data[13] = 13  sum  in kernel:  3155
data[14] = 14  sum  in kernel:  3155
data[15] = 15  sum  in kernel:  3155
data[16] = 16  sum  in kernel:  3155
data[17] = 17  sum  in kernel:  3155
data[18] = 18  sum  in kernel:  3155
data[19] = 19  sum  in kernel:  3155
data[60] = 60  sum  in kernel:  3155
data[61] = 61  sum  in kernel:  3155
data[62] = 62  sum  in kernel:  3155
data[63] = 63  sum  in kernel:  3155
data[64] = 64  sum  in kernel:  3155
data[80] = 80  sum  in kernel:  2700
data[81] = 81  sum  in kernel:  2700
data[82] = 82  sum  in kernel:  2700
data[83] = 83  sum  in kernel:  2700
data[84] = 84  sum  in kernel:  2700
data[95] = 95  sum  in kernel:  3675
data[96] = 96  sum  in kernel:  3675
data[97] = 97  sum  in kernel:  3675
data[98] = 98  sum  in kernel:  3675
data[99] = 99  sum  in kernel:  3675
data[5] = 5  sum  in kernel:  3190
data[6] = 6  sum  in kernel:  3190
data[7] = 7  sum  in kernel:  3190
data[8] = 8  sum  in kernel:  3190
data[9] = 9  sum  in kernel:  3190
data[70] = 70  sum  in kernel:  4035
data[71] = 71  sum  in kernel:  4035
data[72] = 72  sum  in kernel:  4035
data[73] = 73  sum  in kernel:  4035
data[74] = 74  sum  in kernel:  4035
data[75] = 75  sum  in kernel:  4615
data[76] = 76  sum  in kernel:  4615
data[77] = 77  sum  in kernel:  4615
data[78] = 78  sum  in kernel:  4615
data[79] = 79  sum  in kernel:  4615
data[0] = 0  sum  in kernel:  4615
data[1] = 1  sum  in kernel:  4615
data[2] = 2  sum  in kernel:  4615
data[3] = 3  sum  in kernel:  4615
data[4] = 4  sum  in kernel:  4615
data[35] = 35  sum  in kernel:  4615
data[36] = 36  sum  in kernel:  4615
data[37] = 37  sum  in kernel:  4615
data[38] = 38  sum  in kernel:  4615
data[39] = 39  sum  in kernel:  4615
data[65] = 65  sum  in kernel:  4950
data[66] = 66  sum  in kernel:  4950
data[67] = 67  sum  in kernel:  4950
data[68] = 68  sum  in kernel:  4950
data[69] = 69  sum  in kernel:  4950
sum = 4950

本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如若转载,请注明出处:http://www.rhkb.cn/news/445793.html

如若内容造成侵权/违法违规/事实不符,请联系长河编程网进行投诉反馈email:809451989@qq.com,一经查实,立即删除!

相关文章

metahuman如何导入UE5

1.启动 通过EPIC启动UE5(UE5内置有Bridge, 但是UE4是需要单独下在Bridge软件) 2.打开Quixel Bridge 在window(窗口)中打开Quixel Bridge 3.Bridge界面 在弹出的Bridge界面选择模型 需要先下载&#xff0c;然后再导入 4.下载模型 点击需要的模型右上方的绿色箭头下载 5.下…

【论文#码率控制】ADAPTIVE RATE CONTROL FOR H.264

目录 摘要1.前言2.基本知识2.1 蛋鸡悖论2.2 基本单元的定义2.3 线性MAD预测模型 3.GOP级码率控制3.1 总比特数3.2 初始化量化参数 4.帧级码率控制4.1 非存储图像的量化参数4.2 存储图像的目标比特 5.基本单元级码率控制6.实验结果7.结论 《ADAPTIVE RATE CONTROL FOR H.264》 A…

OKHTTP 如何处理请求超时和重连机制

&#x1f604;作者简介&#xff1a; 小曾同学.com,一个致力于测试开发的博主⛽️&#xff0c;主要职责&#xff1a;测试开发、CI/CD 如果文章知识点有错误的地方&#xff0c;还请大家指正&#xff0c;让我们一起学习&#xff0c;一起进步。 &#x1f60a; 座右铭&#xff1a;不…

【ELKB】Kibana使用

搭建好ELKB后访问地址&#xff1a;http://localhost:5601 输入账号密码登录以后 左侧导航有home、Analysis、Enterprise search 、Observability、Security、Management home&#xff1a;首页Analysis&#xff1a;工具来分析及可视化数据Enterprise search&#xff1a;企业级搜…

解读《ARM Cortex-M3 与Cortex-M4 权威指南》——第1章 ARM Cortex-M处理器简介

1. 三级流水线设计 解释:三级流水线设计意味着处理器在执行指令时可以同时处理多个步骤。这些步骤通常包括取指(Fetch)、译码(Decode)和执行(Execute)。好处:这种设计提高了指令的执行效率,使得处理器能够在每个时钟周期内完成更多的工作,从而提升整体性能。2. 哈佛总…

分享一些常用的数据库性能监测工具

以下是一些常用的数据库性能监测工具&#xff1a; 一、MySQL MySQL Enterprise Monitor&#xff1a; 由 MySQL 官方推出&#xff0c;提供全面的数据库性能监控、诊断和优化功能。可以监控数据库的各种指标&#xff0c;如查询性能、连接数、缓存命中率等&#xff0c;并提供警报…

yolo参数调节

1-weight 不同版本的神经网络 可以在这下载复制 2 source图片路径或者文件夹路径 3 img size 尺寸&#xff08;尽量与神经网络模型匹配&#xff09; 4 4 -conf-thres 简单理解就是模型识别成功概率超过这一标准才会显示 5 iou多区域重合 &#xff08;重合比例&#xff09;…

数学科普读物《从毕达哥拉斯到怀尔斯》

毕达哥拉斯是古希腊数学家&#xff0c;怀尔斯是英国数学家&#xff0c;曾任美国普林斯顿大学教授。这本书是哈工大出版社刘培杰先生主编的。这是一本500多页的书&#xff0c;我不禁慨叹高级数学爱好者刘培杰的博学广识&#xff0c;因为书中纵论古今旁征博引&#xff0c;仅书后的…

数据结构-5.6.二叉树的先,中,后序遍历

一.遍历&#xff1a; 二.二叉树的遍历&#xff1a;利用了递归操作 1.简介&#xff1a; 二叉树的先序遍历&#xff0c;中序遍历&#xff0c;后序遍历都是以根结点遍历顺序为准的&#xff0c;如先序遍历就先遍历根结点 2.实例&#xff1a; 例一&#xff1a; 例二&#xff1a; …

C++中string函数用法总结

一&#xff0c;string的构造函数 string() //无参构造&#xff0c;初始化为空串 string(const string& str) //用str拷贝构造 string(size_t n,char c) //用n个字符c初始化 string(const char* s,size_t n) //用字符串s的前n个字符初始化 string(const string& str…

【最优化方法】最速下降法

给出点 x [1,4,5,8,12] y [7,9,15,14,27] 要找出温度和冰淇淋销量之间的关系&#xff0c;通过线性回归来拟合求出属性和结果之间的线性关系。 如果直接把这些点连起来&#xff0c;是吃力不讨好的&#xff0c;因为如果有新数据进来大概率不在这条线上&#xff0c;这个行为也…

Prometheus + Grafana 监控 MySQL 数据库

文章目录 1、前置介绍2、搭建流程2.1、安装 Docker2.2、安装 MySQL2.3、安装 MySQL Exporter2.4、安装 Prometheus2.5、安装 Grafana 1、前置介绍 本次监控平台搭建&#xff0c;我使用2台阿里云服务器来完成本次的搭建部署操作&#xff0c;配置如下&#xff1a; 阿里云ECS1&am…

【Kubernets】配置类型资源 Etcd, Secret, ConfigMap

文章目录 所有资源概览Etcd详细说明一、基本概念二、主要功能三、架构与组件四、数据模型与操作五、安全与认证六、集群部署与管理 Secret详细说明一、Secret 的类型二、Secret 的创建三、Secret 的使用四、Secret 的更新与删除五、Secret 的安全性 ConfigMap详细说明一、Confi…

2024年恩施职称评前公示

此次公示共有422人&#xff0c;初级职称、中级职称、馆员、畜牧师、助理馆员、三级演员、农艺师等均在一起进行评审前的公示。 根据恩施州职称改革工作领导小组办公室《关于报送2024年度恩施州中初级专业技术职务评审材料的通知》&#xff08;恩施州职改办〔2024〕14号&#xf…

jdk环境变量配置--小总结

1、jdk安装路径变量 2、在path下添加环境变量

【Python Django + Vue】酒店在线预订系统:用技术说话!

&#x1f393; 作者&#xff1a;计算机毕设小月哥 | 软件开发专家 &#x1f5a5;️ 简介&#xff1a;8年计算机软件程序开发经验。精通Java、Python、微信小程序、安卓、大数据、PHP、.NET|C#、Golang等技术栈。 &#x1f6e0;️ 专业服务 &#x1f6e0;️ 需求定制化开发源码提…

【JavaEE初阶】文件-IO之实现文件系统的操作如何进行实现

前言 &#x1f31f;&#x1f31f;本期讲解关于文件IO的操作&#xff0c;这里涉及到比较常用的文件操作哦~~~ &#x1f308;上期博客在这里&#xff1a;【JavaEE初阶】CAS的ABA问题&#xff0c;JUC多线程编程有用的相关类-CSDN博客 &#x1f308;感兴趣的小伙伴看一看小编主页&a…

支持向量机-笔记

支持向量机&#xff08;Support Vector Machine, SVM&#xff09; 是一种强大的监督学习算法&#xff0c;广泛应用于分类和回归任务&#xff0c;特别是在分类问题中表现优异。SVM 的核心思想是通过寻找一个最优超平面&#xff0c;将不同类别的数据点进行分割&#xff0c;并最大…

数据结构 ——— 顺序表oj题:有效的括号

目录 题目要求 代码实现 题目要求 给定一个只包括 (&#xff0c;)&#xff0c;{&#xff0c;}&#xff0c;[&#xff0c;] 的字符串 s &#xff0c;判断字符串是否有效 有效字符串需满足&#xff1a; 左括号必须用相同类型的右括号闭合。左括号必须以正确的顺序闭合。每个…

[单master节点k8s部署]37.微服务(一)springCloud 微服务

微服务架构的一个重要特点是&#xff0c;它与开发中使用的具体编程语言或技术栈无关。每个微服务都可以使用最适合其功能需求的语言或技术来实现。例如&#xff0c;一个微服务可以用Java编写&#xff0c;另一个微服务可以用Python、Go、Node.js等编写。微服务架构允许这种灵活性…