目录
- `intra_4×4`帧内预测
- 下左对角线模式`INTRA4x4_DDL`
- 下右对角线模式`INTRA4x4_DDR`
- 右垂直模式`INTRA4x4_VR`
- 下水平模式`INTRA4x4_HD`
- 左垂直模式`INTRA4x4_VL`
- 上水平模式`INTRA4x4_HU`
- 变换编码
- 为什么要使用Hadamard变换
- `Rate Distortion Optimization`
- `lambda`值的产生
前述文章链接在此
H.264视频编解码的FPGA源码分析(一)输入数据分析
H.264视频编解码的FPGA源码分析(二)帧内预测1
intra_4×4
帧内预测
书接上文,写到intra_4×4
帧内预测的9种模式
下左对角线模式INTRA4x4_DDL
pixel_pred00=( ref00_t + (ref01_t<<1 ) + ref02_t + 2'd2 ) >>2 ;
pixel_pred01=( ref01_t + (ref02_t<<1 ) + ref03_t + 2'd2 ) >>2 ;
pixel_pred02=( ref02_t + (ref03_t<<1 ) + ref00_tr + 2'd2 ) >>2 ;
pixel_pred03=( ref03_t + (ref00_tr<<1) + ref01_tr + 2'd2 ) >>2 ;
pixel_pred10=( ref01_t + (ref02_t<<1 ) + ref03_t + 2'd2 ) >>2 ;
pixel_pred11=( ref02_t + (ref03_t<<1 ) + ref00_tr + 2'd2 ) >>2 ;
pixel_pred12=( ref03_t + (ref00_tr<<1) + ref01_tr + 2'd2 ) >>2 ;
pixel_pred13=( ref00_tr + (ref01_tr<<1 )+ ref02_tr + 2'd2 ) >>2 ;
pixel_pred20=( ref02_t + (ref03_t<<1 ) + ref00_tr + 2'd2 ) >>2 ;
pixel_pred21=( ref03_t + (ref00_tr<<1) + ref01_tr + 2'd2 ) >>2 ;
pixel_pred22=( ref00_tr + (ref01_tr<<1) + ref02_tr + 2'd2 ) >>2 ;
pixel_pred23=( ref01_tr + (ref02_tr<<1) + ref03_tr + 2'd2 ) >>2 ;
pixel_pred30=( ref03_t + (ref00_tr<<1) + ref01_tr + 2'd2 ) >>2 ;
pixel_pred31=( ref00_tr + (ref01_tr<<1) + ref02_tr + 2'd2 ) >>2 ;
pixel_pred32=( ref01_tr + (ref02_tr<<1) + ref03_tr + 2'd2 ) >>2 ;
pixel_pred33=( ref02_tr + (ref03_tr<<1) + ref03_tr + 2'd2 ) >>2 ;
只有上方和右上方的参考像素A、B、C、D、E、F、G、H都存在的时候才可以使用(该模式不使用左侧的I、J、K、L;按标准规定,若E~H不可用而A~D可用,则用D的值填充E~H)
对应关系示意图
ref00_tl (M) | ref00_t (A) | ref01_t (B) | ref02_t (C) | ref03_t (D) | ref00_tr (E) | ref01_tr (F) | ref02_tr (G) | ref03_tr (H) |
---|---|---|---|---|---|---|---|---|
ref00_l (I) | pred00 (a) | pred01 (b) | pred02 (c) | pred03 (d) | ||||
ref01_l (J) | pred10 (e) | pred11 (f) | pred12 (g) | pred13 (h) | ||||
ref02_l (K) | pred20 (i) | pred21 (j) | pred22 (k) | pred23 (l) | ||||
ref03_l (L) | pred30 (m) | pred31 (n) | pred32 (o) | pred33 (p) | ||||
这波操作的解释
H.264帧内预测可以分为三个步骤:
- 相邻像素的获取
在遇到边界情况时,相邻参考像素可能会不存在或不可用,此时会使用最邻近的像素进行填充。举个栗子,A中所有区域不存在,则A中所有像素用B最下方的像素填充,如果所有区域参考像素都不可用,则使用固定值填充:
R = 1 << (BitDepth - 1)
- 参考像素的滤波
这一步就是计算公式的来源
滤波的目的:更好的利用邻近像素之间的相关性,提高预测精度
常规滤波方法:
即对相邻的三个参考像素计算 (a + 2b + c + 2) >> 2,相当于一个抽头系数为[0.25, 0.5, 0.25]
的滤波器
强滤波方法:只针对32*32TU进行(注意:TU和强滤波是HEVC/H.265中的概念,H.264的4x4帧内预测并不涉及,这里仅作为对比说明),且需要满足:
abs(A + C - 2B) < Threshold
和abs(C + E - 2D) < Threshold
- 预测像素的计算(有时间再说)
下右对角线模式INTRA4x4_DDR
pixel_pred00=( ref00_l + (ref00_tl<<1) + ref00_t + 2'd2 ) >> 2;
pixel_pred01=( ref00_tl + (ref00_t<<1 ) + ref01_t + 2'd2 ) >> 2;
pixel_pred02=( ref00_t + (ref01_t<<1 ) + ref02_t + 2'd2 ) >> 2;
pixel_pred03=( ref01_t + (ref02_t<<1 ) + ref03_t + 2'd2 ) >> 2;
pixel_pred10=( ref00_tl + (ref00_l<<1 ) + ref01_l + 2'd2 ) >> 2;
pixel_pred11=( ref00_l + (ref00_tl<<1) + ref00_t + 2'd2 ) >> 2;
pixel_pred12=( ref00_tl + (ref00_t<<1 ) + ref01_t + 2'd2 ) >> 2;
pixel_pred13=( ref00_t + (ref01_t<<1 ) + ref02_t + 2'd2 ) >> 2;
pixel_pred20=( ref00_l + (ref01_l<<1 ) + ref02_l + 2'd2 ) >> 2;
pixel_pred21=( ref00_tl + (ref00_l<<1 ) + ref01_l + 2'd2 ) >> 2;
pixel_pred22=( ref00_l + (ref00_tl<<1) + ref00_t + 2'd2 ) >> 2;
pixel_pred23=( ref00_tl + (ref00_t<<1 ) + ref01_t + 2'd2 ) >> 2;
pixel_pred30=( ref01_l + (ref02_l<<1 ) + ref03_l + 2'd2 ) >> 2;
pixel_pred31=( ref00_l + (ref01_l<<1 ) + ref02_l + 2'd2 ) >> 2;
pixel_pred32=( ref00_tl + (ref00_l<<1 ) + ref01_l + 2'd2 ) >> 2;
pixel_pred33=( ref00_l + (ref00_tl<<1) + ref00_t + 2'd2 ) >> 2;
对应关系:
a的预测值:(I + 2M + A + 2) / 4
b的预测值:(M + 2A + B + 2) / 4
c的预测值:(A + 2B + C + 2) / 4
d的预测值:(B + 2C + D + 2) / 4
e的预测值:(M + 2I + J + 2) / 4
f的预测值:(I + 2M + A + 2) / 4
和a是一样的
g的预测值:(M + 2A + B + 2) / 4
和b是一样的
h的预测值:(A + 2B + C + 2) / 4
和c是一样的
i的预测值:(I + 2J + K + 2) / 4
j的预测值:(M + 2I + J + 2) / 4
和e是一样的
k的预测值:(I + 2M + A + 2) / 4
和a是一样的
l的预测值:(M + 2A + B + 2) / 4
和b是一样的
m的预测值:(J + 2K+ L + 2) / 4
n的预测值:(I + 2J + K + 2) / 4
和i是一样的
o的预测值:(M + 2I + J + 2) / 4
和e是一样的
p的预测值:(I + 2M + A + 2) / 4
和a是一样的
你找一找规律:
在同一根箭头上的一定是相同的值
从箭头起点的三个值出发去计算
右垂直模式INTRA4x4_VR
pixel_pred00=( ref00_tl + ref00_t + 1'd1 ) >> 1;
pixel_pred01=( ref00_t + ref01_t + 1'd1 ) >> 1;
pixel_pred02=( ref01_t + ref02_t + 1'd1 ) >> 1;
pixel_pred03=( ref02_t + ref03_t + 1'd1 ) >> 1;
pixel_pred10=( ref00_l + (ref00_tl<<1) + ref00_t + 2'd2 ) >> 2;
pixel_pred11=( ref00_tl + (ref00_t<<1 ) + ref01_t + 2'd2 ) >> 2;
pixel_pred12=( ref00_t + (ref01_t<<1 ) + ref02_t + 2'd2 ) >> 2;
pixel_pred13=( ref01_t + (ref02_t<<1 ) + ref03_t + 2'd2 ) >> 2;
pixel_pred20=( ref01_l + (ref00_l<<1 ) + ref00_tl + 2'd2 ) >> 2;
pixel_pred21=( ref00_tl + ref00_t + 1'd1 ) >> 1;
pixel_pred22=( ref00_t + ref01_t + 1'd1 ) >> 1;
pixel_pred23=( ref01_t + ref02_t + 1'd1 ) >> 1;
pixel_pred30=( ref00_l + (ref01_l<<1 ) + ref02_l + 2'd2 ) >> 2;
pixel_pred31=( ref00_l + (ref00_tl<<1) + ref00_t + 2'd2 ) >> 2;
pixel_pred32=( ref00_tl + (ref00_t<<1 ) + ref01_t + 2'd2 ) >> 2;
pixel_pred33=( ref00_t + (ref01_t<<1 ) + ref02_t + 2'd2 ) >> 2;
下水平模式INTRA4x4_HD
pixel_pred00=( ref00_l + ref00_tl + 1'd1 ) >> 1;
pixel_pred01=( ref00_l + (ref00_tl<<1) + ref00_t + 2'd2 ) >> 2;
pixel_pred02=( ref00_tl + (ref00_t<<1 ) + ref01_t + 2'd2 ) >> 2;
pixel_pred03=( ref00_t + (ref01_t<<1 ) + ref02_t + 2'd2 ) >> 2;
pixel_pred10=( ref00_l + ref01_l + 1'd1 ) >> 1;
pixel_pred11=( ref01_l + (ref00_l<<1 ) + ref00_tl + 2'd2 ) >> 2;
pixel_pred12=( ref00_l + ref00_tl + 1'd1 ) >> 1;
pixel_pred13=( ref00_l + (ref00_tl<<1) + ref00_t + 2'd2 ) >> 2;
pixel_pred20=( ref02_l + ref01_l + 1'd1 ) >> 1;
pixel_pred21=( ref02_l + (ref01_l<<1 ) + ref00_l + 2'd2 ) >> 2;
pixel_pred22=( ref00_l + ref01_l + 1'd1 ) >> 1;
pixel_pred23=( ref01_l + (ref00_l<<1 ) + ref00_tl + 2'd2 ) >> 2;
pixel_pred30=( ref03_l + ref02_l + 1'd1 ) >> 1;
pixel_pred31=( ref03_l + (ref02_l<<1 ) + ref01_l + 2'd2 ) >> 2;
pixel_pred32=( ref02_l + ref01_l + 1'd1 ) >> 1;
pixel_pred33=( ref02_l + (ref01_l<<1 ) + ref00_l + 2'd2 ) >> 2;
左垂直模式INTRA4x4_VL
pixel_pred00=( ref00_t + ref01_t + 1'd1 ) >> 1;
pixel_pred01=( ref01_t + ref02_t + 1'd1 ) >> 1;
pixel_pred02=( ref02_t + ref03_t + 1'd1 ) >> 1;
pixel_pred03=( ref03_t + ref00_tr + 1'd1 ) >> 1;
pixel_pred10=( ref00_t + (ref01_t<<1 ) + ref02_t + 2'd2 ) >> 2;
pixel_pred11=( ref01_t + (ref02_t<<1 ) + ref03_t + 2'd2 ) >> 2;
pixel_pred12=( ref02_t + (ref03_t<<1 ) + ref00_tr + 2'd2 ) >> 2;
pixel_pred13=( ref03_t + (ref00_tr<<1) + ref01_tr + 2'd2 ) >> 2;
pixel_pred20=( ref01_t + ref02_t + 1'd1 ) >> 1;
pixel_pred21=( ref02_t + ref03_t + 1'd1 ) >> 1;
pixel_pred22=( ref03_t + ref00_tr + 1'd1 ) >> 1;
pixel_pred23=( ref00_tr + ref01_tr + 1'd1 ) >> 1;
pixel_pred30=( ref01_t + (ref02_t<<1 ) + ref03_t + 2'd2 ) >> 2;
pixel_pred31=( ref02_t + (ref03_t<<1 ) + ref00_tr + 2'd2 ) >> 2;
pixel_pred32=( ref03_t + (ref00_tr<<1) + ref01_tr + 2'd2 ) >> 2;
pixel_pred33=( ref00_tr + (ref01_tr<<1) + ref02_tr + 2'd2 ) >> 2;
上水平模式INTRA4x4_HU
pixel_pred00=( ref01_l + ref00_l + 1'd1 ) >> 1;
pixel_pred01=( ref02_l + (ref01_l<<1 ) + ref00_l + 2'd2 ) >> 2;
pixel_pred02=( ref02_l + ref01_l + 1'd1 ) >> 1;
pixel_pred03=( ref03_l + (ref02_l<<1 ) + ref01_l + 2'd2 ) >> 2;
pixel_pred10=( ref02_l + ref01_l + 1'd1 ) >> 1;
pixel_pred11=( ref03_l + (ref02_l<<1 ) + ref01_l + 2'd2 ) >> 2;
pixel_pred12=( ref03_l + ref02_l + 1'd1 ) >> 1;
pixel_pred13=( ref03_l + (ref03_l<<1 ) + ref02_l + 2'd2 ) >> 2;
pixel_pred20=( ref03_l + ref02_l + 1'd1 ) >> 1;
pixel_pred21=( ref03_l + (ref03_l<<1 ) + ref02_l + 2'd2 ) >> 2;
pixel_pred22=ref03_l;
pixel_pred23=ref03_l;
pixel_pred30=ref03_l;
pixel_pred31=ref03_l;
pixel_pred32=ref03_l;
pixel_pred33=ref03_l;
原始值减去预测值得到残差值,然后赋值给pixel_res00_o
这个就是预测模块的输出
给到了这个,是个多路选择器
这个其实就是整个帧内预测模块的输出,不知道为啥是用个多路选择器表示的~
下面看一下参考值从哪来
变换编码
我们先不去思考参考值的由来,这个参考值并不是原始图像的值,而是原始像素值经过重建之后的值,所以被命名为rec
,我们先看这个输出的残差值去了哪儿。
// hadamard transform
intra_hadamard4x4 u_intra4x4_hadamard4x4 (.ht_s00_i ( res00 ), .ht_s01_i ( res01 ), .ht_s02_i ( res02 ), .ht_s03_i ( res03 ),.ht_s10_i ( res10 ), .ht_s11_i ( res11 ), .ht_s12_i ( res12 ), .ht_s13_i ( res13 ),.ht_s20_i ( res20 ), .ht_s21_i ( res21 ), .ht_s22_i ( res22 ), .ht_s23_i ( res23 ),.ht_s30_i ( res30 ), .ht_s31_i ( res31 ), .ht_s32_i ( res32 ), .ht_s33_i ( res33 ),.ht_d00_o ( ht_coef00 ), .ht_d01_o ( ht_coef01 ), .ht_d02_o ( ht_coef02 ), .ht_d03_o ( ht_coef03 ),.ht_d10_o ( ht_coef10 ), .ht_d11_o ( ht_coef11 ), .ht_d12_o ( ht_coef12 ), .ht_d13_o ( ht_coef13 ),.ht_d20_o ( ht_coef20 ), .ht_d21_o ( ht_coef21 ), .ht_d22_o ( ht_coef22 ), .ht_d23_o ( ht_coef23 ),.ht_d30_o ( ht_coef30 ), .ht_d31_o ( ht_coef31 ), .ht_d32_o ( ht_coef32 ), .ht_d33_o ( ht_coef33 )
);
可以看到这里使用的是Hadamard矩阵变换
为什么要使用Hadamard变换
目的:为了进一步节省图像传输码率,需要对图像信号进行压缩,一般方法为去除图像信号中的相关性及减小图像编码的动态范围,也就是变换编码和量化
变换编码的实质:将空域(像素域)信号变换成频域信号(频域中信号能量大部分集中在低频部分,码率下降)
离散余弦变换和量化是在tq
这个大模块里面进行的,这里直接对输出进行了hadamard变换,其实感觉看这张图会比较符合代码:
书上给出的蝶形结构是酱紫:
但是代码中的结构是这样的:
类似于这种FHT快速算法,但是注意到在结构上是水平交换的。
也就是说源代码中的更确切的说是hadamard的反变换?此处存疑。不过需要注意的是,Hadamard矩阵是对称的正交矩阵(H的转置等于H,H·H = N·I),正变换和反变换只相差一个比例系数,所以两种蝶形结构计算出的结果是等价的。
该变换模块中有三类变量:
// 输入的残差块
input [ 8:0] ht_s00_i, ht_s01_i, ht_s02_i, ht_s03_i;
// 输出的变换后值
output [12:0] ht_d00_o, ht_d01_o, ht_d02_o, ht_d03_o;
// 中间值
wire [10:0] ht_t00_w, ht_t01_w, ht_t02_w, ht_t03_w;
可以看到
hadamard4 #(9) u_hadamard_h0(.s0(ht_s00_i), .s1(ht_s01_i), .s2(ht_s02_i), .s3(ht_s03_i), .d0(ht_t00_w), .d1(ht_t01_w), .d2(ht_t02_w), .d3(ht_t03_w));
// 代码有删减
hadamard4 #(11) u_hadamard_v0(.s0(ht_t00_w), .s1(ht_t10_w), .s2(ht_t20_w), .s3(ht_t30_w), .d0(ht_d00_o), .d1(ht_d10_o), .d2(ht_d20_o), .d3(ht_d30_o));
此处经过了两次hadamard变换,先变换到中间值,再变换至输出值
因此,输入数据位宽是9,输出数据位宽变成了13(每做一次4点hadamard变换相当于4个数相加减,位宽增加2位:9→11→13)
我怀疑这个地方根本没有根据快速算法来,而是直接执行了公式:
随后取绝对值:
assign ht_abs00 = ht_coef00[`BIT_DEPTH+4]? (~ht_coef00[`BIT_DEPTH+3:0] + 1'b1) : ht_coef00[`BIT_DEPTH+3:0];
然后用寄存器寄存一级,计算Cost
assign ht_sum0 = (ht_abs00_r + ht_abs01_r) + (ht_abs02_r + ht_abs03_r);
assign ht_sum1 = (ht_abs10_r + ht_abs11_r) + (ht_abs12_r + ht_abs13_r);
assign ht_sum2 = (ht_abs20_r + ht_abs21_r) + (ht_abs22_r + ht_abs23_r);
assign ht_sum3 = (ht_abs30_r + ht_abs31_r) + (ht_abs32_r + ht_abs33_r);
assign ht_sum4x4 = (ht_sum0 + ht_sum1) + (ht_sum2 + ht_sum3);
就是求当前4x4块经过hadamard变换后所有系数的绝对值之和
这个是为了计算RD-Cost
Rate Distortion Optimization
率失真优化
在H.264编码过程中,有许多模式可以选择(前面已经详细说过了),有些模式的图像失真比较小,但是码率很大;有些模式则相反。率失真优化的目的,就是找到一种模式,使得在不超过某最大码率的情况下,失真达到最小
这篇文章中有图解
因此有文献提出,用Hadamard-SATD
来估算帧内预测的RD-Cost
,SATD
就是预测残差矩阵经过hadamard变换后的矩阵的绝对值的和。
该值作为intra_4x4_ctrl
模块的输入值:
input [`BIT_DEPTH+7:0] ht_sum4x4_i; // 4x4 block hardamard sum
// 具体的计算公式为:
assign cost4x4 = ((ht_sum4x4_i>>1) + cost_lambda);
// cost_lambda的取值:
assign cost_lambda = mode_equal_r1 ? 9'd0 : (lambda_i<<2);
也就是说如果mode_equal_r1 == 1
,cost
直接就是绝对值和的一半,如果mode_equal_r1 == 0
,cost
就是绝对值和的一半再加上4倍的lambda(即lambda_i<<2)
lambda
值的产生
intra_lambda u_lambda(.qp_i ( qp ),.lambda_o ( lambda )
);
输入信号qp
:
// qp pipeline
always @(posedge clk or negedge rst_n)beginif (!rst_n)qp_r <= 'b0;else if (ime_start || intra_start)qp_r <= sys_qp;
end
qp
是量化模块中的量化参数,是整体top模块从外部设置的参数
量化参数QP(它是量化步长Qstep的序号,QP越大量化步长越大)决定量化器的编码压缩率及图像精度
在量化和反量化过程中,量化参数QP
决定量化器的编码压缩率及图像精度。如果QP 比较大,则量化值FQ
动态范围较小,其相应的编码长度较小,但反量化时损失较多的图像细节信息;如果QP
比较小,则FQ
动态范围较大,相应的编码长度也较大,但图像细节信息损失较少。编码器根据图像值实际动态范围自动改变QP 值,在编码长度和图像精度之间折衷,达到整体最佳效果
用量化参数qp查表得到lambda
的值:
always @ (qp_i)case(qp_i)6'd0, 6'd1, 6'd2, 6'd3, 6'd4, 6'd5, 6'd6, 6'd7,6'd8, 6'd9, 6'd10, 6'd11,6'd12, 6'd13, 6'd14, 6'd15 :lambda=7'd1;6'd16, 6'd17, 6'd18, 6'd19 :lambda=2;6'd20, 6'd21, 6'd22 :lambda=7'd3;6'd23, 6'd24, 6'd25 :lambda=7'd4;6'd26 :lambda=7'd5;6'd27, 6'd28:lambda=7'd6;6'd29 : lambda=7'd7;6'd30 : lambda=7'd8;6'd31 : lambda=7'd9;6'd32 : lambda=7'd10;6'd33 : lambda=7'd11;6'd34 : lambda=7'd13;6'd35 : lambda=7'd14;6'd36 : lambda=7'd16;6'd37 : lambda=7'd18;6'd38 : lambda=7'd20;6'd39 : lambda=7'd23;6'd40 : lambda=7'd25;6'd41 : lambda=7'd29;6'd42 : lambda=7'd32;6'd43 : lambda=7'd36;6'd44 : lambda=7'd40;6'd45 : lambda=7'd45;6'd46 : lambda=7'd51;6'd47 : lambda=7'd57;6'd48 : lambda=7'd64;6'd49 : lambda=7'd72;6'd50 : lambda=7'd81;6'd51 : lambda=7'd91;default:lambda=7'd0 ;endcase
其实lambda
就是拉格朗日因子,感觉就是提前打好的表
那么下面看一下mode_equal_r1
这个决定拉格朗日取值的标志位:
在非复位状态下:
// Mode Compare
assign mode_equal = (curr_mode==i4x4_pred_mode_i) ? 1'b1 : 1'b0; always @(posedge clk or negedge rst_n) beginif(!rst_n)mode_equal_r <= 1'b0;else if (mode_equal && valid_prediction)mode_equal_r <= mode_equal;elsemode_equal_r <= 1'b0;
end
// 代码有删减
// Current 4x4 sub-block valid intra prediction
assign valid_prediction = (curr_state!=INTRA4x4_IDLE) && (curr_state!=INTRA4x4_WAIT);
这个i4x4_pred_mode_i
是模块intra_4x4_pred_mode_gen
的输出
// Mode gen module
intra_4x4_pred_mode_gen u_intra_4x4_pred_mode_gen(.clk ( clk ),.rst_n ( rst_n ),.mb_x ( mb_x ),.mb_y ( mb_y ),.blk4x4_num ( i4x4_num_i ),.intra4x4_bm_c ( i4x4_bm_o ),.intra4x4_bm_l ( i4x4_bm_l ),.intra4x4_bm_t ( i4x4_bm_t ), .mode_pred ( i4x4_pred_mode_o )
);
assign mode_pred = (intra4x4_mode_t==4'd15 || intra4x4_mode_l==4'd15)? 4'd2 : (intra4x4_mode_t < intra4x4_mode_l) ? intra4x4_mode_t : intra4x4_mode_l;
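这其实并不是在选择拉格朗日因子,而是在推导预测模式mode_pred(即H.264中的"最可能模式"):取上方块和左侧块的预测模式中编号较小的一个;如果任一相邻块的模式为4'd15(推测表示该相邻块不可用),则取4'd2(DC模式)。当前尝试的模式与mode_pred相同时,码流中表示该模式所需的比特数更少,因此cost中不再加lambda项;否则加上lambda_i<<2作为模式比特开销的估计。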