将 MAGMA 的 geqrf 示例改写成 cuSOLVER 的 geqrf 示例

1. 简单安装 MAGMA

1.1 下载编译 OpenBLAS

$ git clone https://github.com/OpenMathLib/OpenBLAS.git
$ cd OpenBLAS/
$ make -j DEBUG=1
$ make install PREFIX=/home/hipper/ex_magma/local_d/OpenBLAS/

1.2 下载编译 magma

$ git clone https://bitbucket.org/icl/magma.git
$ cd magma/
$ cp make.inc-examples/make.inc.openblas ./make.inc
$ vim make.inc
// # edit OPENBLASDIR to point at the OpenBLAS install prefix used above
// # -O2 -> -g
$ make -j

vim make.inc

2. 改写 testing_xxxqr_gpu.cpp

testing/testing_sgeqrf_gpu.cpp

运行效果:

原始代码: 

/*
    -- MAGMA (version 2.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       @date

       @generated from testing/testing_zgeqrf_gpu.cpp, normal z -> s, Mon Jul 29 01:23:15 2024
*/
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>// includes, project
#include "flops.h"
#include "magma_v2.h"
#include "magma_lapack.h"
#include "testings.h"/* -- Testing sgeqrf
*/
int main( int argc, char** argv)
{TESTING_CHECK( magma_init() );magma_print_environment();const float             d_neg_one = MAGMA_D_NEG_ONE;const float             d_one     = MAGMA_D_ONE;const float c_neg_one = MAGMA_S_NEG_ONE;const float c_one     = MAGMA_S_ONE;const float c_zero    = MAGMA_S_ZERO;const magma_int_t        ione      = 1;real_Double_t    gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0;float           Anorm, error=0, error2=0;float *h_A, *h_R, *tau, *h_work, tmp[1], unused[1];magmaFloat_ptr d_A, dT;magma_int_t M, N, n2, lda, ldda, lwork, info, min_mn, nb, size;magma_int_t ISEED[4] = {0,0,0,1};magma_opts opts;opts.parse_opts( argc, argv );int status = 0;float tol = opts.tolerance * lapackf77_slamch("E");// for expert API testingmagma_device_t cdev;magma_queue_t queues[2];magma_getdevice( &cdev );magma_queue_create( cdev, &queues[0] );magma_queue_create( cdev, &queues[1] );// version 3 can do either checkif (opts.check == 1 && ( opts.version == 1 || opts.version == 4 ) ) {opts.check = 2;printf( "%% versions 1 and 4 requires check 2 (solve A*x=b)\n" );}if (opts.check == 2 && opts.version == 2) {opts.check = 1;printf( "%% version 2 requires check 1 (R - Q^H*A)\n" );}printf( "%% version %lld\n", (long long) opts.version );if ( opts.check == 1 ) {printf("%%   M     N   CPU Gflop/s (sec)   GPU Gflop/s (sec)   |R - Q^H*A|   |I - Q^H*Q|\n");printf("%%==============================================================================\n");}else {printf("%%   M     N   CPU Gflop/s (sec)   GPU Gflop/s (sec)    |b - A*x|\n");printf("%%===============================================================\n");}for( int itest = 0; itest < opts.ntest; ++itest ) {for( int iter = 0; iter < opts.niter; ++iter ) {M = opts.msize[itest];N = opts.nsize[itest];min_mn = min( M, N );lda    = M;n2     = lda*N;ldda   = magma_roundup( M, opts.align );  // multiple of 32 by defaultnb     = magma_get_sgeqrf_nb( M, N );gflops = FLOPS_SGEQRF( M, N ) / 1e9;// query for workspace sizelwork = -1;lapackf77_sgeqrf( &M, &N, 
unused, &M, unused, tmp, &lwork, &info );lwork = (magma_int_t)MAGMA_S_REAL( tmp[0] );TESTING_CHECK( magma_smalloc_cpu( &tau,    min_mn ));TESTING_CHECK( magma_smalloc_cpu( &h_A,    n2     ));TESTING_CHECK( magma_smalloc_cpu( &h_work, lwork  ));TESTING_CHECK( magma_smalloc_pinned( &h_R,    n2     ));TESTING_CHECK( magma_smalloc( &d_A,    ldda*N ));if ( opts.version == 1 || opts.version == 3 || opts.version == 4 ) {size = (2*min(M, N) + magma_roundup( N, 32 ) )*nb;TESTING_CHECK( magma_smalloc( &dT, size ));magmablas_slaset( MagmaFull, size, 1, c_zero, c_zero, dT, size, opts.queue );}/* Initialize the matrix */magma_generate_matrix( opts, M, N, h_A, lda );lapackf77_slacpy( MagmaFullStr, &M, &N, h_A, &lda, h_R, &lda );magma_ssetmatrix( M, N, h_R, lda, d_A, ldda, opts.queue );/* ====================================================================Performs operation using MAGMA=================================================================== */if ( opts.version == 1 ) {// stores dT, V blocks have zeros, R blocks inverted & stored in dTgpu_time = magma_wtime();magma_sgeqrf_gpu( M, N, d_A, ldda, tau, dT, &info );gpu_time = magma_wtime() - gpu_time;}else if ( opts.version == 2 ) {// LAPACK complaint argumentsgpu_time = magma_wtime();magma_sgeqrf2_gpu( M, N, d_A, ldda, tau, &info );gpu_time = magma_wtime() - gpu_time;}#if defined(MAGMA_HAVE_CUDA) || defined(MAGMA_HAVE_HIP)else if ( opts.version == 3 ) {// stores dT, V blocks have zeros, R blocks stored in dTgpu_time = magma_wtime();magma_sgeqrf3_gpu( M, N, d_A, ldda, tau, dT, &info );gpu_time = magma_wtime() - gpu_time;}#endifelse if (opts.version == 4) {// expert API for magma_sgeqrf_gpumagma_mode_t mode = MagmaHybrid;// query workspacevoid *host_work = NULL, *device_work=NULL;magma_int_t lhwork[1] = {-1}, ldwork[1] = {-1};magma_sgeqrf_expert_gpu_work(M, N, NULL, ldda,NULL, NULL, &info,mode, nb,NULL, lhwork,NULL, ldwork, queues );// alloc workspaceif( lhwork[0] > 0 ) {magma_malloc_pinned( (void**)&host_work, lhwork[0] 
);}if( ldwork[0] > 0 ) {magma_malloc( (void**)&device_work, ldwork[0] );}// time actual call onlygpu_time = magma_wtime();magma_sgeqrf_expert_gpu_work(M, N, d_A, ldda, tau, dT, &info,mode, nb,host_work, lhwork, device_work, ldwork, queues );magma_queue_sync( queues[0] );magma_queue_sync( queues[1] );gpu_time = magma_wtime() - gpu_time;// free workspaceif( host_work != NULL) {magma_free_pinned( host_work );}if( device_work != NULL ) {magma_free( device_work );}}else {printf( "Unknown version %lld\n", (long long) opts.version );return -1;}gpu_perf = gflops / gpu_time;if (info != 0) {printf("magma_sgeqrf returned error %lld: %s.\n",(long long) info, magma_strerror( info ));}if ( opts.check == 1 && (opts.version == 2 || opts.version == 3) ) {if ( opts.version == 3 ) {// copy diagonal blocks of R back to Afor( int i=0; i < min_mn-nb; i += nb ) {magma_int_t ib = min( min_mn-i, nb );magmablas_slacpy( MagmaUpper, ib, ib, &dT[min_mn*nb + i*nb], nb, &d_A[ i + i*ldda ], ldda, opts.queue );}}/* =====================================================================Check the result, following zqrt01 except using the reduced Q.This works for any M,N (square, tall, wide).Only for version 2, which has LAPACK complaint output.Or   for version 3, after restoring diagonal blocks of A above.=================================================================== */magma_sgetmatrix( M, N, d_A, ldda, h_R, lda, opts.queue );magma_int_t ldq = M;magma_int_t ldr = min_mn;float *Q, *R;float *work;TESTING_CHECK( magma_smalloc_cpu( &Q,    ldq*min_mn ));  // M by KTESTING_CHECK( magma_smalloc_cpu( &R,    ldr*N ));       // K by NTESTING_CHECK( magma_smalloc_cpu( &work, min_mn ));// generate M by K matrix Q, where K = min(M,N)lapackf77_slacpy( "Lower", &M, &min_mn, h_R, &lda, Q, &ldq );lapackf77_sorgqr( &M, &min_mn, &min_mn, Q, &ldq, tau, h_work, &lwork, &info );assert( info == 0 );// copy K by N matrix Rlapackf77_slaset( "Lower", &min_mn, &N, &c_zero, &c_zero, R, &ldr );lapackf77_slacpy( "Upper", 
&min_mn, &N, h_R, &lda,        R, &ldr );// error = || R - Q^H*A || / (N * ||A||)blasf77_sgemm( "Conj", "NoTrans", &min_mn, &N, &M,&c_neg_one, Q, &ldq, h_A, &lda, &c_one, R, &ldr );Anorm = lapackf77_slange( "1", &M,      &N, h_A, &lda, work );error = lapackf77_slange( "1", &min_mn, &N, R,   &ldr, work );if ( N > 0 && Anorm > 0 )error /= (N*Anorm);// set R = I (K by K identity), then R = I - Q^H*Q// error = || I - Q^H*Q || / Nlapackf77_slaset( "Upper", &min_mn, &min_mn, &c_zero, &c_one, R, &ldr );blasf77_ssyrk( "Upper", "Conj", &min_mn, &M, &d_neg_one, Q, &ldq, &d_one, R, &ldr );error2 = safe_lapackf77_slansy( "1", "Upper", &min_mn, R, &ldr, work );if ( N > 0 )error2 /= N;magma_free_cpu( Q    );  Q    = NULL;magma_free_cpu( R    );  R    = NULL;magma_free_cpu( work );  work = NULL;}else if ( opts.check == 2 && M >= N && (opts.version == 1 || opts.version == 3 || opts.version == 4) ) {/* =====================================================================Check the result by solving consistent linear system, A*x = b.Only for versions 1 & 3 with M >= N.=================================================================== */magma_int_t lwork2;float *x, *b, *hwork;magmaFloat_ptr d_B;// initialize RHS, b = A*randomTESTING_CHECK( magma_smalloc_cpu( &x, N ));TESTING_CHECK( magma_smalloc_cpu( &b, M ));lapackf77_slarnv( &ione, ISEED, &N, x );blasf77_sgemv( "Notrans", &M, &N, &c_one, h_A, &lda, x, &ione, &c_zero, b, &ione );// copy to GPUTESTING_CHECK( magma_smalloc( &d_B, M ));magma_ssetvector( M, b, 1, d_B, 1, opts.queue );if ( opts.version == 1 || opts.version == 4) {// allocate hworkmagma_sgeqrs_gpu( M, N, 1,d_A, ldda, tau, dT,d_B, M, tmp, -1, &info );lwork2 = (magma_int_t)MAGMA_S_REAL( tmp[0] );TESTING_CHECK( magma_smalloc_cpu( &hwork, lwork2 ));// solve linear systemmagma_sgeqrs_gpu( M, N, 1,d_A, ldda, tau, dT,d_B, M, hwork, lwork2, &info );if (info != 0) {printf("magma_sgeqrs returned error %lld: %s.\n",(long long) info, magma_strerror( info ));}magma_free_cpu( hwork 
);}#if defined(MAGMA_HAVE_CUDA) || defined(MAGMA_HAVE_HIP)else if ( opts.version == 3 ) {// allocate hworkmagma_sgeqrs3_gpu( M, N, 1,d_A, ldda, tau, dT,d_B, M, tmp, -1, &info );lwork2 = (magma_int_t)MAGMA_S_REAL( tmp[0] );TESTING_CHECK( magma_smalloc_cpu( &hwork, lwork2 ));// solve linear systemmagma_sgeqrs3_gpu( M, N, 1,d_A, ldda, tau, dT,d_B, M, hwork, lwork2, &info );if (info != 0) {printf("magma_sgeqrs3 returned error %lld: %s.\n",(long long) info, magma_strerror( info ));}magma_free_cpu( hwork );}#endifelse {printf( "Unknown version %lld\n", (long long) opts.version );return -1;}magma_sgetvector( N, d_B, 1, x, 1, opts.queue );// compute r = Ax - b, saved in bblasf77_sgemv( "Notrans", &M, &N, &c_one, h_A, &lda, x, &ione, &c_neg_one, b, &ione );// compute residual |Ax - b| / (max(m,n)*|A|*|x|)float norm_x, norm_A, norm_r, work[1];norm_A = lapackf77_slange( "F", &M, &N, h_A, &lda, work );norm_r = lapackf77_slange( "F", &M, &ione, b, &M, work );norm_x = lapackf77_slange( "F", &N, &ione, x, &N, work );magma_free_cpu( x );magma_free_cpu( b );magma_free( d_B );error = norm_r / (max(M,N) * norm_A * norm_x);}/* =====================================================================Performs operation using LAPACK=================================================================== */if ( opts.lapack ) {cpu_time = magma_wtime();lapackf77_sgeqrf( &M, &N, h_A, &lda, tau, h_work, &lwork, &info );cpu_time = magma_wtime() - cpu_time;cpu_perf = gflops / cpu_time;if (info != 0) {printf("lapackf77_sgeqrf returned error %lld: %s.\n",(long long) info, magma_strerror( info ));}}/* =====================================================================Print performance and error.=================================================================== */printf("%5lld %5lld   ", (long long) M, (long long) N );if ( opts.lapack ) {printf( "%7.2f (%7.2f)", cpu_perf, cpu_time );}else {printf("  ---   (  ---  )" );}printf( "   %7.2f (%7.2f)   ", gpu_perf, gpu_time );if ( opts.check == 1 ) {bool okay 
= (error < tol && error2 < tol);status += ! okay;printf( "%11.2e   %11.2e   %s\n", error, error2, (okay ? "ok" : "failed") );}else if ( opts.check == 2 ) {if ( M >= N ) {bool okay = (error < tol);status += ! okay;printf( "%10.2e   %s\n", error, (okay ? "ok" : "failed") );}else {printf( "(error check only for M >= N)\n" );}}else {printf( "    ---\n" );}magma_free_cpu( tau    );magma_free_cpu( h_A    );magma_free_cpu( h_work );magma_free_pinned( h_R );magma_free( d_A );if ( opts.version == 1 || opts.version == 3 || opts.version == 4 ) {magma_free( dT );}fflush( stdout );}if ( opts.niter > 1 ) {printf( "\n" );}}magma_queue_destroy( queues[0] );magma_queue_destroy( queues[1] );opts.cleanup();TESTING_CHECK( magma_finalize() );return status;
}

流程分析:

改写为:

testing_cusolver_sgeqrf_gpu.cpp

#include <>

待补。。。

本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如若转载,请注明出处:http://www.rhkb.cn/news/387052.html

如若内容造成侵权/违法违规/事实不符,请联系长河编程网进行投诉反馈email:809451989@qq.com,一经查实,立即删除!

相关文章

【Kubernetes】二进制部署k8s集群(中)之cni网络插件flannel和calico

&#xff01;&#xff01;&#xff01;继续上一篇实验部署&#xff01;&#xff01;&#xff01; 目录 一.k8s的三种网络模式 1.Pod 内容器与容器之间的通信 2.同一个 Node 内 Pod 之间的通信 3.不同 Node 上 Pod 之间的通信 二.k8s的三种接口 三.Flannel 网络插件 1.U…

美摄科技企业级视频拍摄与编辑SDK解决方案

在数字化浪潮汹涌的今天,视频已成为企业传递信息、塑造品牌、连接用户不可或缺的强大媒介。为了帮助企业轻松驾驭这一视觉盛宴的制作过程,美摄科技凭借其在影视级非编技术领域的深厚积累,推出了面向企业的专业视频拍摄与编辑SDK解决方案…

每日OJ_牛客CM26 二进制插入

目录 牛客CM26 二进制插入 解析代码 牛客CM26 二进制插入 二进制插入_牛客题霸_牛客网 解析代码 m:1024&#xff1a;100000000 00 n:19 &#xff1a; 10011 要把n的二进制值插入m的第j位到第i位&#xff0c;只需要把n先左移j位&#xff0c;然后再进行或运算&#xff08;|&am…

高品质定制线缆知名智造品牌推荐-精工电联:高压线缆行业定制服务的领航者

定制线缆源头厂家推荐-精工电联&#xff1a;高压线缆行业定制服务的领航者 在当今这个高度信息化的社会&#xff0c;电力传输与分配系统的稳定运行至关重要。作为连接各个电力设备的纽带&#xff0c;高压线缆的质量直接关系到电力系统的安全性和稳定性。在定制高压线缆行业中&a…

android(安卓)最简单明了解释版本控制之MinSdkVersion、CompileSdkVersion、TargetSdkVersion

1、先明白几个概念 &#xff08;1&#xff09;平台版本&#xff08;Android SDK版本号&#xff09; 平台版本也就是我们平时说的安卓8、安卓9、安卓10 &#xff08;2&#xff09;API级别&#xff08;API Level&#xff09; Android 平台提供的框架 API 被称作“API 级别” …

Hugo 部署与自动更新(Git)

文章目录 Nginx部署Hugonginx.confhugo.conf Hugo自动更新Hugo自动更新流程添加访问令牌添加web hookrust实现自动更新接口 Nginx部署Hugo nginx.conf user nginx; worker_processes auto;error_log /var/log/nginx/error.log notice; pid /var/run/nginx.pid;even…

C++STL简介(三)

目录 1.vector的模拟实现 1.1begin&#xff08;&#xff09; 1.2end&#xff08;&#xff09; 1.3打印信息 1.4 reserve&#xff08;&#xff09; 1.5 size&#xff08;&#xff09; 1.6 capacity&#xff08;&#xff09; 1.7 push_back() 1.8[ ] 1.9 pop_back() 1.10 insert&…

合并K个有序链表

题目 给你一个链表数组&#xff0c;每个链表都已经按升序排列。 请你将所有链表合并到一个升序链表中&#xff0c;返回合并后的链表。 示例1&#xff1a; 输入&#xff1a; 输出&#xff1a; 示例2&#xff1a; 输入&#xff1a; 输出&#xff1a; 示例3&#xff1a; 输入&…

【音视频之SDL2】Windows配置SDL2项目模板

文章目录 前言 SDL2 简介核心功能 Windows配置SDL2项目模板下载SDL2编译好的文件VS配置SDL2 测试代码效果展示 总结 前言 在开发跨平台的音视频应用程序时&#xff0c;SDL2&#xff08;Simple DirectMedia Layer 2&#xff09;是一个备受欢迎的选择。SDL2 是一个开源库&#x…

自研Vue3开源Tree组件:节点拖拽bug修复

当dropType为after,且dropNode为父节点时,bug出现了: bug原因:插入扁平化列表的位置insertIndex计算的不对: 正确的逻辑,同inner要算上子孙节点所占的位置: bug修复!

「数组」实现动态数组的功能(C++)

概述 动态数组,顾名思义即可变长度的数组。数组这种数据结构的实现是在栈空间或堆空间申请一段连续的可操作区域。 实现可变长度的动态数组结构,应该有以下操作:申请一段足够长的空间,如果数据的存入导致空间已满,则…

PackagesNotFoundError 错误表明 conda 在当前使用的镜像源中找不到 contourpy 版本 1.2.1。以下是可能的解决方法:

PackagesNotFoundError 错误表明 conda 在当前使用的镜像源中找不到 contourpy 版本 1.2.1。以下是可能的解决方法&#xff1a; PackagesNotFoundError 错误表明 conda 在当前使用的镜像源中找不到 contourpy 版本 1.2.1。以下是可能的解决方法&#xff1a; 1. 更换镜像源 虽…

rust 桌面 sip 软电话(基于tauri 、pjsip库)

本文尝试下rust 的tauri 桌面运用 原因在于体积小 1、pjsip 提供了rust 接口官方的 rust demo 没编译出来 在git找了个sip-phone-rs-master https://github.com/Charles-Schleich/sip-phone-rs 可以自己编译下pjsip lib库替换该项目的lib 2、创建一个tauri demo 引用 [depe…

计算机毕业设计选题推荐-某炼油厂盲板管理系统-Java/Python项目实战

✨作者主页&#xff1a;IT研究室✨ 个人简介&#xff1a;曾从事计算机专业培训教学&#xff0c;擅长Java、Python、微信小程序、Golang、安卓Android等项目实战。接项目定制开发、代码讲解、答辩教学、文档编写、降重等。 ☑文末获取源码☑ 精彩专栏推荐⬇⬇⬇ Java项目 Python…

实战:Zookeeper 简介和单点部署ZooKeeper

Zookeeper 简介 ZooKeeper是一个开源的分布式协调服务&#xff0c;它是Apache软件基金会下的一个项目&#xff0c;旨在解决分布式系统中的协调和管理问题。以下是ZooKeeper的详细简介&#xff1a; 一、基本定义 ZooKeeper是一个分布式的、开放源码的分布式应用程序协调服务&a…

SpringBoot的基础配置

目录 SpringBoot快速搭建web程序 第一步&#xff1a;导包 第二步&#xff1a;配置SpringBoot引导类 第三步&#xff1a;编写controller类 第四步&#xff1a;在SpirngBoot引导类中启动项目 起步依赖 SpringBoot基础配置 配置文件格式 yaml语法规则 读取yml配置文件的方…

UE5+OpenCV配置(Windows11系统)

一、概述 因为需要在UE5中使用OpenCV这些工具进行配置&#xff0c;所以在网络上参考借鉴一些资料进行配置。查询到不少的资料&#xff0c;最后将其配置成功。在这里顺便记录一下自己的配置成功的过程。 二、具体过程 &#xff08;一&#xff09;版本 使用Windows11系统、UE5.…

ONLYOFFICE 协作空间 2.6 已发布:表单填写房间、LDAP、优化房间和文件管理等

更新后的 ONLYOFFICE 协作空间带来了超过 20 项新功能和优化&#xff0c;让工作更加高效和舒适。阅读本文了解详情。 表单填写房间 这次更新增加了一种新的房间类型&#xff0c;可在 ONLYOFFICE 协作空间中组织简单的表单填写流程。 通过表单填写房间&#xff0c;目前可以完成…

将控制台内容输出到文本文件

示例代码&#xff1a; Imports System.IO Module Module1Sub Main()Dim fs As New FileStream("D:\Desktop\test\输出结果.txt", FileMode.Create, FileAccess.Write, FileShare.None)Dim sw As New StreamWriter(fs)Console.SetOut(sw)Console.SetError(sw)For i …

【北京迅为】《i.MX8MM嵌入式Linux开发指南》-第四篇 嵌入式Linux系统移植篇-第六十九章uboot移植

i.MX8MM处理器采用了先进的14LPCFinFET工艺&#xff0c;提供更快的速度和更高的电源效率;四核Cortex-A53&#xff0c;单核Cortex-M4&#xff0c;多达五个内核 &#xff0c;主频高达1.8GHz&#xff0c;2G DDR4内存、8G EMMC存储。千兆工业级以太网、MIPI-DSI、USB HOST、WIFI/BT…