PTX入门教程与实战

PTX入门教程

官方文档的目录结构

1 PTX指令

官方文档链接

1.1 指令形式

指令的操作数个数从0-4不等，其中d代表的是目的操作数，a,b,c是源操作数

@p   opcode;@p   opcode a;@p   opcode d, a;@p   opcode d, a, b;@p   opcode d, a, b, c;

2 编程模型

2.1 CTA

Cooperative thread arrays (CTAs) implement CUDA thread blocks and clusters implement CUDA thread block clusters.

CTA: A cooperative thread array, or CTA, is an array of threads that execute a kernel concurrently or in parallel.

CTA中的每一个线程都有唯一的一个标识符,tid(tid.x,tid.y,tid.z)
每一个CTA的结构由ntid(ntid.x,ntid.y,ntid.z)标识，其中ntid的每一维度指示了该维度上有多少个线程。
在具体的实施过程中，线程会被组织为warp的形式来执行。warp具体的大小和机器的有关，由WARP_SZ查询。

2.2 Cluster

Cluster is a group of CTAs that run concurrently or in parallel and can synchronize and communicate with each other via shared memory.

cluster只能在sm_90或者更高的设备上使用

可以完成cluster内的所有线程同步。此外，在一个cluster中，不同CTA上的线程能够通过共享内存进行同步。

2.3 Grid

在一个grid中，不同cluster中的线程是不能通信的

在grid中，每一个cluster有一个唯一的编号clusterid,一个grid中的cluster数量由ncluster进行标识。

在grid中，每一个CTA有唯一的编号ctaid,一个grid中的CTA数量由nctid进行标识
在这里插入图片描述

cluster是一个优化的结构，只有在sm_90及以上的设备上才会被使用。

3 内存结构

每一个线程有自己的本地内存，每一个CTA有共享内存，该共享内存被CTA中的每一个线程共享，同时也对同一集群中不同CTA的线程开放。所有的线程都有权利访问全局内存。

所有线程都可以访问其他状态空间：常量、参数、纹理和表面状态空间。常量和纹理内存是只读的；表面内存是可读写的。

在这里插入图片描述

4 机器模型

微处理器(multiprocessors)包括多个SP(scalar processor core)核，一个多线程指令单元(multithread instruction unit)和片上的共享内存。

每一个线程会被映射到一个SP上。SIMT(single-instruction,multiple-thread) unit 将多个线程组织为warp的形式。每次发出指令时，SIMT unit都会选择一个已准备好执行的 Warp，并向该 Warp 中的活动线程发出下一条指令。线程分支(thread divergence)仅仅只是在一个warp中发生，不同warp的执行路径是互相独立的。

SM中的能够并发的block数量是由SM的资源数量(寄存器、共享内存)以及一个block所需要的资源数量所共同决定的。

SIMD和SIMT的区别：SIMD将数据处理的宽度暴露给编程着，而SIMT提供了线程级别的并行和数据级别的并行。
A key difference is that SIMD vector organizations expose the SIMD width to the software, whereas SIMT instructions specify the execution and branching behavior of a single thread. In contrast with SIMD vector machines, SIMT enables programmers to write thread-level parallel code for independent, scalar threads, as well as data-parallel code for coordinated threads.

5 实战1

目的:寻找PTX是如何带来性能提升的

C代码如下

#include <stdio.h>
#include <stdint.h>
#include <mma.h>
#include <stdlib.h> 
#include <time.h>using namespace nvcuda;
#define Q_N653 16777153
#define NUMBER_ACONFI 3221491777__device__ int32_t montgomery_reduce_n653_cuda(int64_t a)
{int32_t t;t = (int32_t)a * NUMBER_ACONFI;//t是低32位置t = (a - (int64_t)t * Q_N653) >> 32;return t;
}/*2024-7-11
蒙格玛丽约减法的ptx版本:
*/
__device__ int32_t montgomery_reduce_n653_ptx(int64_t a){int64_t res;asm("{\n\t"".reg .s64 b;\n\t""mul.lo.s64 %0,%1,%2;\n\t""and.b64 %0,%0,0xffffffff;\n\t""mul.lo.s64 %0,%0,%3;\n\t""sub.s64 %0,%1,%0;\n\t""shr.s64 %0,%0,32;\n\t" //"add.s64 %0,%0,%3;\n\t"       "}":"=l"(res):"l"(a),"n"(NUMBER_ACONFI),"n"(Q_N653));return (int32_t)res;
}#define NUM_TESTS 1000__global__ void test_montgomery_reduce(int64_t* inputs, int32_t* outputs_ptx, int32_t* outputs_cuda, int num_tests) {int idx = threadIdx.x + blockIdx.x * blockDim.x;if (idx < num_tests) {int64_t a = inputs[idx];outputs_ptx[idx] = montgomery_reduce_n653_ptx(a);outputs_cuda[idx] = montgomery_reduce_n653_cuda(a);}
}
int test_montgomery() {int64_t* h_inputs = (int64_t*)malloc(NUM_TESTS * sizeof(int64_t));int32_t* h_outputs_ptx = (int32_t*)malloc(NUM_TESTS * sizeof(int32_t));int32_t* h_outputs_cuda = (int32_t*)malloc(NUM_TESTS * sizeof(int32_t));for (int i = 0; i < NUM_TESTS; ++i) {h_inputs[i] = rand() % UINT64_MAX;}int64_t* d_inputs;int32_t* d_outputs_ptx;int32_t* d_outputs_cuda;cudaMalloc(&d_inputs, NUM_TESTS * sizeof(int64_t));cudaMalloc(&d_outputs_ptx, NUM_TESTS * sizeof(int32_t));cudaMalloc(&d_outputs_cuda, NUM_TESTS * sizeof(int32_t));cudaMemcpy(d_inputs, h_inputs, NUM_TESTS * sizeof(int64_t), cudaMemcpyHostToDevice);test_montgomery_reduce<<<(NUM_TESTS + 255) / 256, 256>>>(d_inputs, d_outputs_ptx, d_outputs_cuda, NUM_TESTS);cudaMemcpy(h_outputs_ptx, d_outputs_ptx, NUM_TESTS * sizeof(int32_t), cudaMemcpyDeviceToHost);cudaMemcpy(h_outputs_cuda, d_outputs_cuda, NUM_TESTS * sizeof(int32_t), cudaMemcpyDeviceToHost);bool correct = true;for (int i = 0; i < NUM_TESTS; ++i) {if (h_outputs_ptx[i] != h_outputs_cuda[i]) {//if(h_outputs_cuda[i] - h_outputs_ptx[i] != Q_N653){printf("Mismatch at index %d: PTX = %d, CUDA = %d\n", i, h_outputs_ptx[i], h_outputs_cuda[i]);correct = false;//}}}if (correct) {printf("All tests passed!\n");}free(h_inputs);free(h_outputs_ptx);free(h_outputs_cuda);cudaFree(d_inputs);cudaFree(d_outputs_ptx);cudaFree(d_outputs_cuda);return 0;
}
/***********************************************************************/int main(){test_montgomery();
}

汇编结束后的ptx代码如下所示，在CUDA中，__device__通常是内联的，因此其不会单独对应一个ptx的函数，而是被嵌入到了调用__device__函数的__global__中。

//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-30672275
// Cuda compilation tools, release 11.5, V11.5.119
// Based on NVVM 7.0.1
//.version 7.5
.target sm_86
.address_size 64// .globl	_Z22test_montgomery_reducePlPiS0_i.visible .entry _Z22test_montgomery_reducePlPiS0_i(.param .u64 _Z22test_montgomery_reducePlPiS0_i_param_0,.param .u64 _Z22test_montgomery_reducePlPiS0_i_param_1,.param .u64 _Z22test_montgomery_reducePlPiS0_i_param_2,.param .u32 _Z22test_montgomery_reducePlPiS0_i_param_3
)
{.reg .pred 	%p<2>;.reg .b32 	%r<6>;.reg .b64 	%rd<19>;ld.param.u64 	%rd1, [_Z22test_montgomery_reducePlPiS0_i_param_0];ld.param.u64 	%rd2, [_Z22test_montgomery_reducePlPiS0_i_param_1];ld.param.u64 	%rd3, [_Z22test_montgomery_reducePlPiS0_i_param_2];ld.param.u32 	%r2, [_Z22test_montgomery_reducePlPiS0_i_param_3];mov.u32 	%r3, %tid.x;mov.u32 	%r4, %ntid.x;mov.u32 	%r5, %ctaid.x;mad.lo.s32 	%r1, %r5, %r4, %r3;setp.ge.s32 	%p1, %r1, %r2;@%p1 bra 	$L__BB0_2;cvta.to.global.u64 	%rd6, %rd1;mul.wide.s32 	%rd7, %r1, 8;add.s64 	%rd8, %rd6, %rd7;ld.global.u64 	%rd5, [%rd8];// begin inline asm{.reg .s64 b;mul.lo.s64 %rd4,%rd5,3221491777;and.b64 %rd4,%rd4,0xffffffff;mul.lo.s64 %rd4,%rd4,16777153;sub.s64 %rd4,%rd5,%rd4;shr.s64 %rd4,%rd4,32;}// end inline asmcvta.to.global.u64 	%rd9, %rd2;mul.wide.s32 	%rd10, %r1, 4;add.s64 	%rd11, %rd9, %rd10;st.global.u32 	[%rd11], %rd4;mul.lo.s64 	%rd12, %rd5, -4610542247161626624;shr.s64 	%rd13, %rd12, 32;mul.lo.s64 	%rd14, %rd13, -16777153;add.s64 	%rd15, %rd14, %rd5;shr.u64 	%rd16, %rd15, 32;cvta.to.global.u64 	%rd17, %rd3;add.s64 	%rd18, %rd17, %rd10;st.global.u32 	[%rd18], %rd16;$L__BB0_2:ret;}

下面，我们将C代码实现和PTX实现一一对应，并详细地解析ptx代码

5.1 函数声明与下标计算

CUDA 在处理 64 位架构（例如 sm_86）的 GPU 时，所有指针类型（无论是 int32_t* 还是 int64_t*）都被视为 64 位宽的地址。因此，在 PTX 代码中，指针类型的参数被表示为 u64，即 64 位无符号整数。这是为了与硬件的地址宽度对齐。
因此int32_t类型的指针是u64类型的。
指令关键词 .param代表的是参数，其对应着一种状态空间。由于test_montgomery_reduce是一个global函数，因此.param是read-only的。

test_montgomery_reduce(int64_t* inputs, int32_t* outputs_ptx, int32_t* outputs_cuda, int num_tests).visible .entry _Z22test_montgomery_reducePlPiS0_i(.param .u64 _Z22test_montgomery_reducePlPiS0_i_param_0,.param .u64 _Z22test_montgomery_reducePlPiS0_i_param_1,.param .u64 _Z22test_montgomery_reducePlPiS0_i_param_2,.param .u32 _Z22test_montgomery_reducePlPiS0_i_param_3
)

mad指令用于将两个数相乘后（选取高位或者低位），再加上另一个数。mad.lo.s32 %r1, %r5, %r4, %r3; 实现了int idx = threadIdx.x + blockIdx.x * blockDim.x;

int idx = threadIdx.x + blockIdx.x * blockDim.x;if (idx < num_tests) {mov.u32 	%r3, %tid.x;
mov.u32 	%r4, %ntid.x;
mov.u32 	%r5, %ctaid.x;
mad.lo.s32 	%r1, %r5, %r4, %r3;
setp.ge.s32 	%p1, %r1, %r2;
@%p1 bra 	$L__BB0_2;

setp指令通过应用布尔运算符比较两个值并将结果与另一个谓词值相结合。

bra用于控制分支语句的执行，@指令用于预测执行，因此@%p1 bra $L__BB0_2; 代表根据%p1判断是否执行相应的L__BB0_2语句。

5.2 montgomery_reduce_n653_ptx

outputs_ptx[idx] = montgomery_reduce_n653_ptx(a);__device__ int32_t montgomery_reduce_n653_ptx(int64_t a){int64_t res;asm("{\n\t"//".reg .s64 b;\n\t" //这句话可以直接删除了"mul.lo.s64 %0,%1,%2;\n\t""and.b64 %0,%0,0xffffffff;\n\t""mul.lo.s64 %0,%0,%3;\n\t""sub.s64 %0,%1,%0;\n\t""shr.s64 %0,%0,32;\n\t" //"add.s64 %0,%0,%3;\n\t"       "}":"=l"(res):"l"(a),"n"(NUMBER_ACONFI),"n"(Q_N653));return (int32_t)res;
}cvta.to.global.u64 	%rd6, %rd1; //%rd6代表的是inputs的地址mul.wide.s32 	%rd7, %r1, 8; //%r1代表的是idx的值，这里把地址乘上一个8add.s64 	%rd8, %rd6, %rd7;//加上inputs最初的地址ld.global.u64 	%rd5, [%rd8]; //%rd5中存放的即为a的值// begin inline asm{.reg .s64 b; //声明一个寄存器bmul.lo.s64 %rd4,%rd5,3221491777; //%rd4即为resand.b64 %rd4,%rd4,0xffffffff;mul.lo.s64 %rd4,%rd4,16777153;sub.s64 %rd4,%rd5,%rd4;shr.s64 %rd4,%rd4,32;}// end inline asmcvta.to.global.u64 	%rd9, %rd2; //把outputs_ptx指针重新读一下mul.wide.s32 	%rd10, %r1, 4; //因为outputs_ptx是32bit长的数组，又以idx作为下标，因此这里需要乘上4add.s64 	%rd11, %rd9, %rd10; //%rd11指向了outputs_ptxst.global.u32 	[%rd11], %rd4;//把计算得到的%rd4的值，放到outputs_ptx[idx]中

cvta指令

cvta.to.global.u64 %rd6, %rd1;代表的是地址转化指令，将 %rd1 寄存器中的地址转换为全局地址空间的 64 位地址，并将转换后的地址存储在 %rd6 寄存器中。结合C代码，该操作将inputs的指针放到寄存器%rd6里面。下面，将idx乘上8再加上inputs的地址，即得到了真实的地址%rd8，此时，将值使用ld指令加载到%rd5寄存器中（即%rd5中存储的即为c代码中的a的值）。//begin inline asm和//end inline asm给出了montgomery_reduce_n653_ptx中所内嵌的汇编代码。在完成计算后，使用类似的方法，计算outputs_ptx对应地址的指针值，并将计算结果%rd4存储进去。

5.3 montgomery_reduce_n653_cuda

outputs_cuda[idx] = montgomery_reduce_n653_cuda(a);__device__ int32_t montgomery_reduce_n653_cuda(int64_t a)
{int32_t t;t = (int32_t)a * NUMBER_ACONFI;//t是低32位t = (a - (int64_t)t * Q_N653) >> 32;return t;
}mul.lo.s64 	%rd12, %rd5, -4610542247161626624; //%rd5为a的值shr.s64 	%rd13, %rd12, 32;mul.lo.s64 	%rd14, %rd13, -16777153;add.s64 	%rd15, %rd14, %rd5;shr.u64 	%rd16, %rd15, 32;cvta.to.global.u64 	%rd17, %rd3;add.s64 	%rd18, %rd17, %rd10;st.global.u32 	[%rd18], %rd16;

强制类型转化会改变数据在内存中的存储的结构（float和整型之间），但是如果都是整型之间，那么进行的即为截断操作，例如，下面代码输出123456789abcdef0 9abcdef。
int main(){//test_montgomery();int64_t a = 0x123456789ABCDEF0;int32_t b = (int32_t)a;printf("%llx %x",a,b);
}

-4610542247161626624为int64_t类型的数，其二进制表示为0xc004104100000000，NUMBER_ACONFI=3221491777=0xc0041041，因此-4610542247161626624 = NUMBER_ACONFI << 32。因此 mul.lo.s64 %rd12, %rd5, -4610542247161626624; //%rd5为a的值
shr.s64 %rd13, %rd12, 32;用于计算t = (int32_t)a * NUMBER_ACONFI;//t是低32位
需要注意的是int32_t首先作用在了a上，从而使得运算本质上是在int32_t上进行的，为了能够在int64_t上模拟在int32_t上运算得到的溢出和截断等，需要先左移32位，这步操作，一方面完成了a的截断操作，另一方面也实现了运算的模拟。

6 实战2

本文借助一个实例，展示使用ptx代码带来性能提升的原因

注意，在global函数中加入下述语句，能够避免函数被编译器优化，从而使得能够显示出编译后正确的ptx代码
asm volatile ("" : "+r"(t));

#define DILITHIUM_Q 8380417
#define QINV 123__global__ void gpu_montgomery_multiply_ptx(int32_t x, int32_t y) {//printf("ptx\n");int32_t t;asm volatile("{\n\t"" .reg .s32 a_hi, a_lo;\n\t"" mul.hi.s32 a_hi, %1, %2;\n\t"" mul.lo.s32 a_lo, %1, %2;\n\t"" mul.lo.s32 %0, a_lo, %4;\n\t"" mul.hi.s32 %0, %0, %3;\n\t"" sub.s32 %0, a_hi, %0;\n\t""}": "=r"(t): "r"(x), "r"(y), "r"(DILITHIUM_Q), "r"(QINV));asm volatile ("" : "+r"(t));//防止该段代码被编译器优化掉
}__global__ void gpu_montgomery_multiply_cuda(int32_t x, int32_t y) {//printf("cuda\n");int32_t t;int64_t a = (int64_t) x * y;t = (int64_t) (int32_t) a * QINV;t = (a - (int64_t) t * DILITHIUM_Q) >> 32;asm volatile ("" : "+r"(t)); //防止该段代码被编译器优化掉}int test_for_dili(){printf("test for dili\n");gpu_montgomery_multiply_ptx<<<1,1>>>(1002,1213);gpu_montgomery_multiply_cuda<<<1,1>>>(1002,1213);cudaDeviceSynchronize();
}/***********************************************************************/int main(){//test_montgomery();//single_1_kernel();/*int64_t a = 0x123456789ABCDEF0;int32_t b = (int32_t)a;printf("%llx %x",a,b);*/test_for_dili();
}

其对应的ptx代码为

//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-30672275
// Cuda compilation tools, release 11.5, V11.5.119
// Based on NVVM 7.0.1
//.version 7.5
.target sm_86
.address_size 64// .globl	_Z27gpu_montgomery_multiply_ptxii.visible .entry _Z27gpu_montgomery_multiply_ptxii(.param .u32 _Z27gpu_montgomery_multiply_ptxii_param_0,.param .u32 _Z27gpu_montgomery_multiply_ptxii_param_1
)
{.reg .b32 	%r<8>;ld.param.u32 	%r2, [_Z27gpu_montgomery_multiply_ptxii_param_0];ld.param.u32 	%r3, [_Z27gpu_montgomery_multiply_ptxii_param_1];mov.u32 	%r4, 8380417; //dili_qmov.u32 	%r5, 123; //qinv// begin inline asm{.reg .s32 a_hi, a_lo;mul.hi.s32 a_hi, %r2, %r3; //mul a_hi,x,ymul.lo.s32 a_lo, %r2, %r3; //mul a_lo,x,ymul.lo.s32 %r6, a_lo, %r5; //mul t,a_lo,qinvmul.hi.s32 %r6, %r6, %r4; //mul t,t,dili_qsub.s32 %r6, a_hi, %r6; // mul t,a_h,t}// end inline asm// begin inline asm// end inline asmret;}// .globl	_Z28gpu_montgomery_multiply_cudaii
.visible .entry _Z28gpu_montgomery_multiply_cudaii(.param .u32 _Z28gpu_montgomery_multiply_cudaii_param_0,.param .u32 _Z28gpu_montgomery_multiply_cudaii_param_1
)
{.reg .b32 	%r<5>;.reg .b64 	%rd<7>;ld.param.u32 	%r3, [_Z28gpu_montgomery_multiply_cudaii_param_0];//xld.param.u32 	%r4, [_Z28gpu_montgomery_multiply_cudaii_param_1];//ymul.wide.s32 	%rd1, %r4, %r3;//rd1 = x*y amul.lo.s64 	%rd2, %rd1, 528280977408; shr.s64 	%rd3, %rd2, 32;mul.lo.s64 	%rd4, %rd3, -8380417;add.s64 	%rd5, %rd4, %rd1;shr.u64 	%rd6, %rd5, 32;cvt.u32.u64 	%r1, %rd6;// begin inline asm// end inline asmret;}

6.1 gpu_montgomery_multiply_ptx

%r4 = DILITHIUM_Q
%r5 = QINV
_Z27gpu_montgomery_multiply_ptxii_param_0是指向x的指针，因此[_Z27gpu_montgomery_multiply_ptxii_param_0] = x
内联代码部分同asm中的一样

6.2 gpu_montgomery_multiply_cuda

t = (int64_t) (int32_t) a * QINV;被转化为mul.lo.s64 %rd2, %rd1, 528280977408; shr.s64 %rd3, %rd2, 32; 其正确性如下图所示:
528280977408的二进制表示为7b00000000，123的二进制表示为7b，因此528280977408 = 123 << 32。
在这里插入图片描述

强制类型转化测试:输出结果为-80。下面对该结果进行检验。首先(int8_t)a = 0x10 (int16_t)(int8_t)a = 0x0010 = 16 (int16_t)(int8_t)a*123 = 1968 = 0x7b，其表示的补码为-80。因此，由长到短的转化，为截断；由短到长的转化，为保持符号，增添0的数量。
int16_t a = 0x1010;
int8_t t = (int16_t) (int8_t)a * 123;
printf("%d",t);