# 46 语义分割和数据集
"""
图像分割将图像划分为若干组成区域,这类问题的方法通常利用图像中像素之间的相关性。
它在训练时不需要有关图像像素的标签信息,在预测时也无法保证分割出的区域具有我们希望得到的语义。
图像分割可能会将狗分为两个区域:一个覆盖以黑色为主的嘴和眼睛,另一个覆盖以黄色为主的身体其余部分。

实例分割也叫同时检测并分割(simultaneous detection and segmentation),
它研究如何识别图像中各个目标实例的像素级区域。
与语义分割不同,实例分割不仅需要区分语义,还要区分不同的目标实例。
例如,如果图像中有两条狗,则实例分割需要区分像素属于的两条狗中的哪一条。
"""
import os
import torch
import torchvision
from d2l import torch as d2l
import matplotlib. pyplot as plt
# Root of the extracted Pascal VOC2012 dataset, relative to the working directory.
voc_dir = '../data/VOCdevkit/VOC2012'


def read_voc_images(voc_dir, is_train=True):
    """Read every VOC segmentation image and its label image into memory.

    The image names are taken from ``ImageSets/Segmentation/train.txt`` when
    ``is_train`` is true, otherwise from ``val.txt``; the file is split on
    whitespace, one name per token.

    Args:
        voc_dir: Path of the VOC2012 root directory (the one that contains
            ``ImageSets``, ``JPEGImages`` and ``SegmentationClass``).
        is_train: Selects the training split (True) or validation split (False).

    Returns:
        A ``(features, labels)`` pair of equally long lists holding whatever
        ``torchvision.io.read_image`` returns for ``JPEGImages/<name>.jpg``
        and ``SegmentationClass/<name>.png`` respectively.

    Raises:
        OSError: If the split file cannot be opened.
    """
    txt_fname = os.path.join(
        voc_dir, 'ImageSets', 'Segmentation',
        'train.txt' if is_train else 'val.txt')
    # Labels are decoded explicitly as RGB so that every pixel is a colour
    # triple that can later be matched against the VOC colour map.
    mode = torchvision.io.image.ImageReadMode.RGB
    with open(txt_fname, 'r') as f:
        images = f.read().split()

    features, labels = [], []
    for fname in images:
        # The file name must be exactly "<name>.jpg" / "<name>.png"; any
        # padding inside the f-string would produce a path that never exists.
        features.append(torchvision.io.read_image(
            os.path.join(voc_dir, 'JPEGImages', f'{fname}.jpg')))
        labels.append(torchvision.io.read_image(
            os.path.join(voc_dir, 'SegmentationClass', f'{fname}.png'),
            mode))
    return features, labels


# Load the whole training split eagerly; later script sections index into it.
train_features, train_labels = read_voc_images(voc_dir, True)
# Preview: the first n training images followed by their n label images,
# laid out by d2l.show_images as a 2 x n grid.
n = 5
# permute(1, 2, 0) moves axis 0 to the end (presumably CHW -> HWC so the
# plotting helper can draw the tensors -- confirm against d2l.show_images).
imgs = [
    img.permute(1, 2, 0)
    for img in train_features[:n] + train_labels[:n]
]
d2l.show_images(imgs, 2, n)
plt.show()
# RGB colour used in the label images for each class. The position of a
# colour in this list is the class index, and lines up one-to-one with
# VOC_CLASSES below.
VOC_COLORMAP = [
    [0, 0, 0],        # 0  background
    [128, 0, 0],      # 1  aeroplane
    [0, 128, 0],      # 2  bicycle
    [128, 128, 0],    # 3  bird
    [0, 0, 128],      # 4  boat
    [128, 0, 128],    # 5  bottle
    [0, 128, 128],    # 6  bus
    [128, 128, 128],  # 7  car
    [64, 0, 0],       # 8  cat
    [192, 0, 0],      # 9  chair
    [64, 128, 0],     # 10 cow
    [192, 128, 0],    # 11 diningtable
    [64, 0, 128],     # 12 dog
    [192, 0, 128],    # 13 horse
    [64, 128, 128],   # 14 motorbike
    [192, 128, 128],  # 15 person
    [0, 64, 0],       # 16 potted plant
    [128, 64, 0],     # 17 sheep
    [0, 192, 0],      # 18 sofa
    [128, 192, 0],    # 19 train
    [0, 64, 128],     # 20 tv/monitor
]

# Human-readable class names, indexed by class id.
VOC_CLASSES = [
    'background',
    'aeroplane',
    'bicycle',
    'bird',
    'boat',
    'bottle',
    'bus',
    'car',
    'cat',
    'chair',
    'cow',
    'diningtable',
    'dog',
    'horse',
    'motorbike',
    'person',
    'potted plant',
    'sheep',
    'sofa',
    'train',
    'tv/monitor',
]
def voc_colormap2label():
    """Build the lookup table from packed RGB values to VOC class indices.

    Each colour ``[r, g, b]`` in ``VOC_COLORMAP`` is packed into the single
    integer ``(r * 256 + g) * 256 + b`` and the table entry at that position
    is set to the colour's index in ``VOC_COLORMAP``.

    Returns:
        A 1-D ``torch.long`` tensor with ``256 ** 3`` entries. Entries for
        colours that are not in ``VOC_COLORMAP`` stay 0, i.e. they share the
        index of the first colour map entry.
    """
    colormap2label = torch.zeros(256 ** 3, dtype=torch.long)
    for i, colormap in enumerate(VOC_COLORMAP):
        packed = (colormap[0] * 256 + colormap[1]) * 256 + colormap[2]
        colormap2label[packed] = i
    return colormap2label
def voc_label_indices(colormap, colormap2label):
    """Map the RGB values of a VOC label image to their class indices.

    Args:
        colormap: Label image tensor whose axis 0 holds the R, G and B
            planes (only the first three planes are read).
        colormap2label: 1-D lookup tensor indexed by the packed colour
            ``(r * 256 + g) * 256 + b``.

    Returns:
        A tensor with the spatial shape of ``colormap`` (axes 1 and 2)
        containing ``colormap2label`` looked up at every pixel.
    """
    # Widen before packing: the packed value does not fit the narrow integer
    # type images are usually stored in. Staying in torch (instead of a NumPy
    # round trip) also keeps this usable for tensors that are not on the CPU;
    # the index is moved to the lookup table's device for the same reason.
    rgb = colormap.to(colormap2label.device).long()
    idx = (rgb[0] * 256 + rgb[1]) * 256 + rgb[2]
    return colormap2label[idx]
"""
tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 1],[0, 0, 0, 0, 0, 0, 0, 1, 1, 1],[0, 0, 0, 0, 0, 0, 1, 1, 1, 1],[0, 0, 0, 0, 0, 1, 1, 1, 1, 1],[0, 0, 0, 0, 0, 1, 1, 1, 1, 1],[0, 0, 0, 0, 1, 1, 1, 1, 1, 1],[0, 0, 0, 0, 0, 1, 1, 1, 1, 1],[0, 0, 0, 0, 0, 1, 1, 1, 1, 1],[0, 0, 0, 0, 0, 0, 1, 1, 1, 1],[0, 0, 0, 0, 0, 0, 0, 0, 1, 1]])
aeroplane
"""
def voc_rand_crop(feature, label, height, width):
    """Randomly crop a feature image and its label image with the same window.

    A single crop rectangle is sampled from ``feature`` via
    ``RandomCrop.get_params`` and then applied to both inputs, so the
    cropped label stays pixel-aligned with the cropped feature.

    Args:
        feature: Input image accepted by torchvision's crop utilities.
        label: Label image; cropped with the rectangle sampled for
            ``feature`` (assumes it has the same spatial size -- confirm
            against the caller).
        height: Height of the crop window.
        width: Width of the crop window.

    Returns:
        The ``(feature, label)`` pair after cropping.
    """
    rect = torchvision.transforms.RandomCrop.get_params(
        feature, (height, width))
    feature = torchvision.transforms.functional.crop(feature, *rect)
    label = torchvision.transforms.functional.crop(label, *rect)
    return feature, label


# Accumulator for the random-crop demo that follows.
imgs = []
# Draw n random 200 x 300 crops of the first training example. Each call
# returns a (feature, label) pair, so `imgs` ends up interleaved as
# feature, label, feature, label, ...
for _ in range(n):
    imgs += voc_rand_crop(train_features[0], train_labels[0], 200, 300)

# Reorder the axes once, after all crops are collected; doing this inside the
# loop would re-permute crops gathered in earlier iterations.
imgs = [img.permute(1, 2, 0) for img in imgs]

# Even positions are the features, odd positions the labels: show all
# features in the first row and the matching labels in the second.
d2l.show_images(imgs[::2] + imgs[1::2], 2, n)
"""
imgs = [0, 1, 2, 3, 4, 5]
result = imgs[::2] + imgs[1::2]
# imgs[::2] 返回 [0, 2, 4]
# imgs[1::2] 返回 [1, 3, 5]
# result 将这两个子列表连接起来,返回 [0, 2, 4, 1, 3, 5]
"""
# Display the figure produced by the d2l.show_images call above.
plt. show( )
class VOCSegDataset(torch.utils.data.Dataset):
    """Custom dataset that serves random crops of the VOC segmentation data.

    All images of the requested split are read into memory at construction
    time, images smaller than the crop size are dropped, and the remaining
    feature images are normalized once. Every ``__getitem__`` call then
    returns a fresh random crop of the selected example.
    """

    def __init__(self, is_train, crop_size, voc_dir):
        """Load, filter and normalize one split of the dataset.

        Args:
            is_train: True for the training split, False for validation.
            crop_size: ``(height, width)`` of the crops returned by
                ``__getitem__``.
            voc_dir: Root directory of the VOC2012 data.
        """
        # Per-channel mean / std (presumably the usual ImageNet statistics --
        # confirm before changing).
        self.transform = torchvision.transforms.Normalize(
            mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        self.crop_size = crop_size
        features, labels = read_voc_images(voc_dir, is_train=is_train)
        # NOTE: features and labels are filtered independently; they stay
        # aligned only as long as each pair has the same spatial size.
        self.features = [self.normalize_image(feature)
                         for feature in self.filter(features)]
        self.labels = self.filter(labels)
        self.colormap2label = voc_colormap2label()
        print('read ' + str(len(self.features)) + ' examples')

    def normalize_image(self, img):
        """Scale ``img`` to floats in units of 1/255 and apply ``self.transform``."""
        return self.transform(img.float() / 255)

    def filter(self, imgs):
        """Keep only the images that are at least as large as the crop size.

        An image is kept when ``shape[1] >= crop_size[0]`` and
        ``shape[2] >= crop_size[1]``.
        """
        return [img for img in imgs
                if (img.shape[1] >= self.crop_size[0]
                    and img.shape[2] >= self.crop_size[1])]

    def __getitem__(self, idx):
        """Return a random crop of example ``idx`` as ``(feature, label_indices)``."""
        feature, label = voc_rand_crop(
            self.features[idx], self.labels[idx], *self.crop_size)
        return (feature, voc_label_indices(label, self.colormap2label))

    def __len__(self):
        """Return the number of examples that survived filtering."""
        return len(self.features)
# Build both splits with a fixed (height, width) crop size.
crop_size = (320, 480)
voc_train = VOCSegDataset(True, crop_size, voc_dir)
voc_test = VOCSegDataset(False, crop_size, voc_dir)

# Batch the training split; drop_last discards a final, smaller batch and
# num_workers=0 loads the data in the main process.
batch_size = 64
train_iter = torch.utils.data.DataLoader(
    voc_train, batch_size, shuffle=True, drop_last=True, num_workers=0)

# Smoke test: print the shapes of the first mini-batch only.
for X, Y in train_iter:
    print(X.shape)
    print(Y.shape)
    break
def load_data_voc(batch_size, crop_size,
                  voc_dir='../data/VOCdevkit/VOC2012', num_workers=4):
    """Load the VOC semantic segmentation dataset as two data loaders.

    Args:
        batch_size: Number of examples per mini-batch for both loaders.
        crop_size: ``(height, width)`` of the random crops served by the
            underlying ``VOCSegDataset``.
        voc_dir: Root directory of the VOC2012 data. Defaults to the path
            that used to be hard-coded here.
        num_workers: Number of worker processes per loader. Defaults to the
            previously hard-coded value of 4.

    Returns:
        ``(train_iter, test_iter)``. The training loader shuffles; both
        loaders drop a final incomplete batch.
    """
    train_iter = torch.utils.data.DataLoader(
        VOCSegDataset(True, crop_size, voc_dir), batch_size,
        shuffle=True, drop_last=True, num_workers=num_workers)
    test_iter = torch.utils.data.DataLoader(
        VOCSegDataset(False, crop_size, voc_dir), batch_size,
        drop_last=True, num_workers=num_workers)
    return train_iter, test_iter