Paper Introduction
Paper link: https://arxiv.org/pdf/2211.11943
- Research background: The paper notes that although researchers have been rethinking ConvNet design through large-kernel convolutions, higher-order spatial interactions, or sparse convolutional kernels, how to use convolutions more effectively to build powerful ConvNet architectures remains an active research topic.
- The Conv2Former proposal: The paper proposes Conv2Former, a new convolutional network architecture that adopts a simplified self-attention mechanism built only from convolutions and Hadamard products.
Innovations
- Convolutional modulation operation: The core of Conv2Former is the convolutional modulation operation, which simplifies self-attention and is more memory-efficient, especially when processing high-resolution images; see the sketch after this list.
- Benefits of large-kernel convolutions: Unlike previous ConvNets, Conv2Former benefits from larger convolution kernels (e.g., 11×11 and 21×21), which brings further performance gains.
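As a quick illustration of the operation described above, here is a minimal sketch of convolutional modulation (illustrative only; the kernel size, channel count, and variable names are assumptions, and the complete block from the paper's code appears further below): a large-kernel depthwise convolution produces weights A that modulate a 1×1 "value" projection V through a Hadamard product.

import torch
import torch.nn as nn

# Sketch: A = DWConv_kxk(W1(X)), V = W2(X), output = W3(A * V)  (sizes are assumptions)
dim, k = 64, 11
w1 = nn.Conv2d(dim, dim, 1)                               # 1x1 projection before the depthwise conv
dw = nn.Conv2d(dim, dim, k, padding=k // 2, groups=dim)   # large-kernel depthwise conv -> modulation weights A
w2 = nn.Conv2d(dim, dim, 1)                               # 1x1 "value" projection V
w3 = nn.Conv2d(dim, dim, 1)                               # output projection

x = torch.randn(1, dim, 56, 56)
a = dw(w1(x))        # modulation weights A
v = w2(x)            # values V
out = w3(a * v)      # Hadamard product, then project; shape stays (1, 64, 56, 56)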
Method
- Convolutional modulation block: Conv2Former uses a convolutional modulation block that takes the features produced by a depthwise convolution as weights to modulate the value representation.
- Comparison with self-attention and the classic residual block: Compared with self-attention, Conv2Former builds relationships among positions in a more memory-efficient way (see the rough comparison after this list); compared with the classic residual block, the modulation operation lets Conv2Former adapt to the input content.
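To make the memory claim concrete, here is a rough back-of-the-envelope comparison (the resolution, channel count, and variable names below are illustrative assumptions, not numbers from the paper): self-attention materializes an (HW)×(HW) similarity matrix, while convolutional modulation only produces weights with the same shape as the feature map.

H = W = 56                         # assumed feature-map resolution at an early stage
C = 64                             # assumed channel count
attn_entries = (H * W) ** 2        # self-attention: (HW) x (HW) similarity matrix per head
mod_entries = C * H * W            # modulation weights A share the feature map's shape
print(attn_entries, mod_entries)   # 9834496 vs. 200704 entries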
Module Function
- Advantages of the convolutional modulation block: The block captures information along both the channel and spatial dimensions while being more efficient than conventional self-attention.
- Performance on downstream tasks: Conv2Former performs well on downstream tasks such as object detection and semantic segmentation, demonstrating the effectiveness and practicality of its convolutional modulation block.
Overall, by introducing the convolutional modulation operation, Conv2Former simplifies self-attention, improves memory efficiency, and delivers strong performance on visual recognition tasks. The code is as follows:
import torch
import torch.nn as nn
import torch.nn.functional as F
from timm.models.layers import DropPath


class MLP(nn.Module):
    # Feed-forward block: 1x1 expansion, a depthwise 3x3 positional conv, then 1x1 projection.
    def __init__(self, dim, mlp_ratio=4):
        super().__init__()
        self.norm = LayerNorm(dim, eps=1e-6, data_format="channels_first")
        self.fc1 = nn.Conv2d(dim, int(dim * mlp_ratio), 1, padding=0)
        self.pos = nn.Conv2d(int(dim * mlp_ratio), int(dim * mlp_ratio), 3, padding=1,
                             groups=int(dim * mlp_ratio))
        self.fc2 = nn.Conv2d(int(dim * mlp_ratio), dim, 1)
        self.act = nn.GELU()

    def forward(self, x):
        B, C, H, W = x.shape
        x = self.norm(x)
        x = self.fc1(x)
        x = self.act(x)
        x = x + self.act(self.pos(x))
        x = self.fc2(x)
        return x
class LayerNorm(nn.Module):
    r""" LayerNorm that supports two data formats: channels_last (default) or channels_first.
    The ordering of the dimensions in the inputs. channels_last corresponds to inputs with
    shape (batch_size, height, width, channels) while channels_first corresponds to inputs
    with shape (batch_size, channels, height, width).
    """
    def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(normalized_shape))
        self.bias = nn.Parameter(torch.zeros(normalized_shape))
        self.eps = eps
        self.data_format = data_format
        if self.data_format not in ["channels_last", "channels_first"]:
            raise NotImplementedError
        self.normalized_shape = (normalized_shape,)

    def forward(self, x):
        if self.data_format == "channels_last":
            return F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)
        elif self.data_format == "channels_first":
            u = x.mean(1, keepdim=True)
            s = (x - u).pow(2).mean(1, keepdim=True)
            x = (x - u) / torch.sqrt(s + self.eps)
            x = self.weight[:, None, None] * x + self.bias[:, None, None]
            return x
class SpatialAttention(nn.Module):
    # Convolutional modulation: a large-kernel depthwise branch produces weights that
    # modulate the 1x1 "value" projection via a Hadamard product.
    def __init__(self, dim, kernel_size, expand_ratio=2):
        super().__init__()
        self.norm = LayerNorm(dim, eps=1e-6, data_format="channels_first")
        self.att = nn.Sequential(
            nn.Conv2d(dim, dim, 1),
            nn.GELU(),
            nn.Conv2d(dim, dim, kernel_size=kernel_size, padding=kernel_size // 2, groups=dim)
        )
        self.v = nn.Conv2d(dim, dim, 1)
        self.proj = nn.Conv2d(dim, dim, 1)

    def forward(self, x):
        B, C, H, W = x.shape
        x = self.norm(x)
        x = self.att(x) * self.v(x)   # Hadamard product: modulation weights * values
        x = self.proj(x)
        return x


class Block(nn.Module):
    # Conv2Former block: convolutional modulation followed by the MLP, each in a
    # layer-scaled residual branch with optional DropPath.
    def __init__(self, dim, kernel_size, num_head, window_size=14, mlp_ratio=4., drop_path=0.):
        super().__init__()
        self.attn = SpatialAttention(dim, kernel_size)
        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.mlp = MLP(dim, mlp_ratio)
        layer_scale_init_value = 1e-6
        self.layer_scale_1 = nn.Parameter(layer_scale_init_value * torch.ones(dim), requires_grad=True)
        self.layer_scale_2 = nn.Parameter(layer_scale_init_value * torch.ones(dim), requires_grad=True)

    def forward(self, x):
        x = x + self.drop_path(self.layer_scale_1.unsqueeze(-1).unsqueeze(-1) * self.attn(x))
        x = x + self.drop_path(self.layer_scale_2.unsqueeze(-1).unsqueeze(-1) * self.mlp(x))
        return x


if __name__ == '__main__':
    # Create a random input tensor
    input_data = torch.randn(2, 64, 40, 40)
    mca = Block(64, 3, 16)
    output = mca(input_data)
    # Print the input and output shapes
    print("Input size:", input_data.size())
    print("Output size:", output.size())
Output:
Input size: torch.Size([2, 64, 40, 40])
Output size: torch.Size([2, 64, 40, 40])
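As a usage note, the Block above keeps the spatial size unchanged, so several blocks can be stacked into a stage and followed by a downsampling layer. The sketch below shows one way to do this (the stage depth and the strided-conv downsampling are assumptions, not the paper's exact configuration):

# Hypothetical stage built from the Block above (depth and downsampling are assumptions)
stage = nn.Sequential(*[Block(dim=64, kernel_size=11, num_head=1) for _ in range(3)])
down = nn.Conv2d(64, 128, kernel_size=2, stride=2)   # strided conv halves the resolution between stages
feats = torch.randn(2, 64, 40, 40)
out = down(stage(feats))
print(out.shape)   # torch.Size([2, 128, 20, 20])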