Contents
1. Load the pretrained model
2. Load the dataset from local disk
3. Dataset processing
4. Downstream task model
5. Test code
6. Training code
7. Save the trained model
8. Load the saved model
1. Load the pretrained model
# Load the tokenizer of the pretrained model (pretrained model: distilgpt2)
from transformers import AutoTokenizer

# use_fast=True: use the Rust-based tokenizer, which is much faster than the pure-Python one
tokenizer = AutoTokenizer.from_pretrained(r'../data/model/distilgpt2/', use_fast=True)

tokenizer.batch_encode_plus(['hide new secretions from the parental units',
                             'this moive is great'])
# Output:
# {'input_ids': [[24717, 649, 3200, 507, 422, 262, 21694, 4991], [5661, 6941, 425, 318, 1049]],
#  'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1]]}

print(tokenizer)
GPT2TokenizerFast(name_or_path='../data/model/distilgpt2/', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=False), added_tokens_decoder={50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True), }
# Predicting the last word is a multi-class classification problem:
# with vocab_size=50257, there are 50257 possible classes.
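As a quick sanity check (an optional sketch, not part of the pipeline below), one sentence can be encoded and decoded back to confirm the round trip and the number of classes:

ids = tokenizer('hide new secretions from the parental units')['input_ids']
print(ids)                    # the token ids shown in the output above
print(tokenizer.decode(ids))  # recovers the original sentence
print(tokenizer.vocab_size)   # 50257 -> the number of classes for last-word prediction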
2. Load the dataset from local disk
from datasets import load_from_disk

dataset = load_from_disk(r'E:/ALOT/10_deep_learning/data/datasets/glue_sst2/')
dataset
DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})
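If the local copy is not available, the same splits can usually be downloaded from the Hugging Face Hub instead (a sketch; requires network access and assumes the Hub id 'glue' / 'sst2'):

from datasets import load_dataset
dataset = load_dataset('glue', 'sst2')  # downloads and caches SST-2 with the same train/validation/test splits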
3. Dataset processing
# To predict the next word we only need the 'sentence' column; 'label' and 'idx' are not needed.
# Use map() to tokenize the sentences and drop everything else.
def f(dataset, tokenizer):
    return tokenizer.batch_encode_plus(dataset['sentence'])

# num_proc=8: see Task Manager -> Performance -> number of logical processors
dataset = dataset.map(f,
                      batched=True,
                      batch_size=1000,
                      num_proc=8,
                      remove_columns=['sentence', 'label', 'idx'],
                      fn_kwargs={'tokenizer': tokenizer})
dataset
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 1821
    })
})
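At this stage the tokenized sentences still have variable length; a quick optional peek confirms it:

print(len(dataset['train'][0]['input_ids']),
      len(dataset['train'][1]['input_ids']))  # lengths differ from sentence to sentence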
# Require every sentence to have at least 8 tokens:
# filter out the sentences that are too short.
def f_1(dataset):
    return [len(i) >= 8 for i in dataset['input_ids']]  # i is the token list of one sentence

dataset = dataset.filter(f_1, batched=True, batch_size=1000, num_proc=8)
dataset
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 39905
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 848
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 1730
    })
})
# Truncate every sentence to 8 tokens
def f_2(dataset):
    # Truncate input_ids and attention_mask separately
    dataset['input_ids'] = [i[:8] for i in dataset['input_ids']]
    dataset['attention_mask'] = [[1] * 8] * len(dataset['attention_mask'])
    # To predict the last word, the first seven tokens are the input and the last one is the target.
    # The model handles the one-token offset itself, so labels can simply equal input_ids here.
    dataset['labels'] = dataset['input_ids']
    return dataset

dataset = dataset.map(f_2, batched=True, batch_size=1000, num_proc=8)
dataset
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 39905
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 848
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1730
    })
})
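An optional check that the truncation worked: every remaining example should now be exactly 8 tokens long, so no padding will be needed when batching:

assert all(len(x) == 8 for x in dataset['train'][:1000]['input_ids'])  # spot-check the first 1000 examples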
dataset['train'][0]
{'input_ids': [24717, 649, 3200, 507, 422, 262, 21694, 4991],'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1],'labels': [24717, 649, 3200, 507, 422, 262, 21694, 4991]}
# Define the data loader
import torch

# default_data_collator: turns per-example retrieval into batched retrieval
from transformers.data.data_collator import default_data_collator

loader = torch.utils.data.DataLoader(dataset=dataset['train'],  # use the training split
                                     batch_size=16,
                                     collate_fn=default_data_collator,
                                     shuffle=True,
                                     drop_last=True)  # drop the last batch if it has fewer than batch_size examples

for data in loader:
    break

len(loader), data
(2494,{'input_ids': tensor([[22602, 4340, 262, 2126, 286, 1642, 479, 993],[ 5832, 651, 262, 10647, 326, 6260, 290, 3437],[ 11, 645, 530, 460, 3285, 345, 3013, 382],[48580, 257, 2612, 290, 3950, 326, 36675, 262],[ 361, 345, 467, 287, 6970, 326, 837, 345],[ 270, 705, 82, 257, 4950, 2646, 837, 1336],[ 71, 1794, 6819, 837, 26996, 6819, 6776, 837],[11246, 7650, 30669, 13766, 17548, 351, 6159, 220],[ 1169, 11918, 286, 281, 7876, 1013, 30909, 1358],[22437, 299, 470, 1612, 340, 705, 82, 922],[ 270, 705, 82, 23056, 284, 766, 257, 3807],[ 5832, 1244, 892, 339, 373, 2491, 329, 2607],[ 2395, 9259, 736, 49253, 837, 11441, 2223, 16311],[ 8505, 837, 1312, 11691, 340, 705, 82, 14081],[ 1169, 1306, 13203, 29185, 286, 262, 1842, 8848],[26535, 867, 14138, 290, 41169, 12, 44517, 2628]]),'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1],[1, 1, 1, 1, 1, 1, 1, 1],[1, 1, 1, 1, 1, 1, 1, 1],[1, 1, 1, 1, 1, 1, 1, 1],[1, 1, 1, 1, 1, 1, 1, 1],[1, 1, 1, 1, 1, 1, 1, 1],[1, 1, 1, 1, 1, 1, 1, 1],[1, 1, 1, 1, 1, 1, 1, 1],[1, 1, 1, 1, 1, 1, 1, 1],[1, 1, 1, 1, 1, 1, 1, 1],[1, 1, 1, 1, 1, 1, 1, 1],[1, 1, 1, 1, 1, 1, 1, 1],[1, 1, 1, 1, 1, 1, 1, 1],[1, 1, 1, 1, 1, 1, 1, 1],[1, 1, 1, 1, 1, 1, 1, 1],[1, 1, 1, 1, 1, 1, 1, 1]]),'labels': tensor([[22602, 4340, 262, 2126, 286, 1642, 479, 993],[ 5832, 651, 262, 10647, 326, 6260, 290, 3437],[ 11, 645, 530, 460, 3285, 345, 3013, 382],[48580, 257, 2612, 290, 3950, 326, 36675, 262],[ 361, 345, 467, 287, 6970, 326, 837, 345],[ 270, 705, 82, 257, 4950, 2646, 837, 1336],[ 71, 1794, 6819, 837, 26996, 6819, 6776, 837],[11246, 7650, 30669, 13766, 17548, 351, 6159, 220],[ 1169, 11918, 286, 281, 7876, 1013, 30909, 1358],[22437, 299, 470, 1612, 340, 705, 82, 922],[ 270, 705, 82, 23056, 284, 766, 257, 3807],[ 5832, 1244, 892, 339, 373, 2491, 329, 2607],[ 2395, 9259, 736, 49253, 837, 11441, 2223, 16311],[ 8505, 837, 1312, 11691, 340, 705, 82, 14081],[ 1169, 1306, 13203, 29185, 286, 262, 1842, 8848],[26535, 867, 14138, 290, 41169, 12, 44517, 2628]])})
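default_data_collator simply stacks equal-length feature lists into tensors, which is why the truncation to 8 tokens above removes any need for padding. A minimal optional illustration:

batch = default_data_collator([dataset['train'][0], dataset['train'][1]])
print(batch['input_ids'].shape)  # expected: torch.Size([2, 8])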
4. Downstream task model
from transformers import AutoModelForCausalLM, GPT2Model

class Model(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.pretrained = GPT2Model.from_pretrained('../data/model/distilgpt2/')
        self.fc = torch.nn.Linear(768, tokenizer.vocab_size, bias=False)

        # Load the model that carries the pretrained weights
        parameters = AutoModelForCausalLM.from_pretrained('../data/model/distilgpt2/')
        # Initialize the fully connected layer with the pretrained lm_head weights
        self.fc.load_state_dict(parameters.lm_head.state_dict())
        # Shorthand for the four lines above:
        # self.pretrained = AutoModelForCausalLM.from_pretrained('../data/model/distilgpt2/')

        self.criterion = torch.nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, labels=None):
        logits = self.pretrained(input_ids=input_ids, attention_mask=attention_mask)
        logits = logits.last_hidden_state
        logits = self.fc(logits)

        loss = None
        if labels is not None:
            # The inputs and labels are offset by one position:
            # keep dims 0 and 2, drop the last position of dim 1
            shift_logits = logits[:, :-1].reshape(-1, tokenizer.vocab_size)  # -1 merges the first two dims
            # keep dims 0 and 2, drop the first position of dim 1
            shift_labels = labels[:, 1:].reshape(-1)  # flatten the 2-D tensor to 1-D
            loss = self.criterion(shift_logits, shift_labels)

        return {'loss': loss, 'logits': logits}

model = Model()
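The shift in forward() is worth spelling out: with 8 input tokens, the logits at positions 0-6 are scored against tokens 1-7 of the labels, so each sentence contributes 7 predictions (this is also where the 16 * 7 in the training accuracy later comes from). A toy-shaped sketch with random values:

toy_logits = torch.randn(2, 8, tokenizer.vocab_size)                 # (batch, seq_len, vocab)
toy_labels = torch.randint(0, tokenizer.vocab_size, (2, 8))          # (batch, seq_len)
shift_logits = toy_logits[:, :-1].reshape(-1, tokenizer.vocab_size)  # (2 * 7, vocab)
shift_labels = toy_labels[:, 1:].reshape(-1)                         # (2 * 7,)
print(torch.nn.CrossEntropyLoss()(shift_logits, shift_labels))       # loss over the 14 shifted positions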
# Parameter count
print(sum(i.numel() for i in model.parameters()) / 10000)  # divide by 10000 to report in units of 10,000
12050.9952
# Python's ** operator unpacks a dict into keyword arguments: {'key': value} -> key=value
out = model(**data)
# print(out)  # a dict with 'loss' and 'logits'
print(out['loss'], out['logits'].shape)
tensor(6.2742, grad_fn=<NllLossBackward0>) torch.Size([16, 8, 50257])
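Since self.fc was initialized from the pretrained lm_head, the hand-built head plus the manual shift should reproduce the loss of the stock causal-LM model. An optional sanity-check sketch (eval mode disables dropout so the two losses are comparable):

ref = AutoModelForCausalLM.from_pretrained('../data/model/distilgpt2/')
model.eval()  # disable dropout for a fair comparison
with torch.no_grad():
    ref_out = ref(input_ids=data['input_ids'],
                  attention_mask=data['attention_mask'],
                  labels=data['labels'])
    my_out = model(**data)
print(ref_out.loss, my_out['loss'])  # the two values should be (nearly) identical
model.train()  # back to the default training mode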
5. Test code
# Test code
def test(model):
    model.eval()

    # Load the test data
    loader_test = torch.utils.data.DataLoader(dataset=dataset['test'],
                                              batch_size=16,
                                              collate_fn=default_data_collator,
                                              shuffle=True,  # shuffling is optional at test time
                                              drop_last=True)

    correct = 0
    total = 0
    for i, data in enumerate(loader_test):
        # Only the accuracy on the last word is measured.
        label = data['input_ids'][:, -1].clone()  # clone() so the original data is not modified
        # Erase the last word from the input so the model cannot cheat.
        data['input_ids'][:, -1] = 0  # zero out the last column
        # The labels are not needed here, so zero them out as well.
        data['labels'][:, :] = 0

        # No gradients are needed at test time (gradient descent only happens during training)
        with torch.no_grad():
            # equivalent to out = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            out = model(**data)

        # Accuracy on the last word: because of the one-token offset, the prediction
        # for the last word sits at the second-to-last position.
        out = out['logits'].argmax(dim=2)[:, -2]
        # .item() extracts a Python number from a single-element (0-dim, i.e. scalar) tensor.
        correct += (label == out).sum().item()
        total += 16  # 16 is the batch_size

        if i % 10 == 0:  # every 10 batches
            print(i)
            print(label)
            print(out)

        if i == 50:  # only run 50 batches
            break

    print('accuracy: ', correct / total)

    for i in range(8):
        print(tokenizer.decode(data['input_ids'][i, :-1]))
        print(tokenizer.decode(label[i]), tokenizer.decode(out[i]))
        print()
test(model)
0
tensor([416, 42738, 7297, 2709, 651, 837, 290, 349, 290, 11815, 72, 14505, 7559, 532, 3822, 262])
tensor([11, 262, 3807, 5000, 1064, 13, 11, 453, 13, 13664, 72, 983, 340, 13, 12, 262])
10
tensor([284, 428, 991, 705, 318, 11783, 1787, 65, 43527, 2306, 460, 8395, 743, 6386, 2370, 393])
tensor([284, 262, 262, 447, 11, 11, 76, 65, 38520, 20706, 318, 502, 468, 6386, 4899, 1022])
20
tensor([7357, 1936, 4572, 2465, 1049, 257, 7358, 262, 29963, 2646, 517, 290, 9188, 1647, 278, 1241])
tensor([7357, 584, 290, 1143, 649, 262, 2656, 262, 7051, 11, 257, 284, 262, 12, 278, 1210])
30
tensor([14969, 5239, 3016, 837, 43207, 262, 764, 4129, 307, 262, 705, 465, 262, 837, 1100, 2700])
tensor([67, 5239, 12302, 13, 2033, 262, 284, 1988, 307, 262, 1053, 262, 262, 13, 1621, 2700])
40
tensor([21730, 13770, 2737, 264, 477, 2218, 262, 257, 340, 8886, 848, 14821, 1178, 705, 787, 1239])
tensor([13, 290, 2737, 670, 262, 290, 262, 340, 6, 983, 257, 262, 1438, 460, 787, 318])
50
tensor([262, 6840, 763, 286, 611, 286, 475, 2915, 764, 837, 4379, 12986, 10997, 272, 257, 1200])
tensor([262, 651, 318, 1683, 621, 286, 475, 12, 284, 837, 257, 262, 10997, 272, 257, 262])
accuracy:  0.18382352941176472
automatically pegs itself for
the the

if there 's a way to
effectively get

while benigni ( who stars and
co is

the stupidest , most insulting movie
of ever

the film might have been more satisfying
if than

a journey through memory , a celebration
of of

the story may not be new ,
but but

there are touching moments in eto
iles -
6. Training code
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device
device(type='cuda', index=0)
from transformers import AdamW
from transformers.optimization import get_scheduler

# Training code
def train():
    # Optimizer: the algorithm that performs gradient descent
    optimizer = AdamW(model.parameters(), lr=2e-5)
    # Learning-rate schedule
    scheduler = get_scheduler(name='linear',
                              num_warmup_steps=0,  # no warmup steps
                              num_training_steps=len(loader),  # total steps = length of the training loader
                              optimizer=optimizer)

    # Move the model to the device
    model.to(device)
    model.train()  # training mode

    for i, data in enumerate(loader):
        input_ids, attention_mask, labels = data['input_ids'], data['attention_mask'], data['labels']
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        out = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

        # The loss is already computed inside the model; read it from the output
        loss = out['loss']

        # Backpropagate the loss
        loss.backward()
        # Clip the gradients after backward and before the optimizer step,
        # to keep training stable (avoid large fluctuations); max_norm is the c in the clipping formula
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        # Update the parameters and the learning rate
        optimizer.step()
        scheduler.step()
        # Reset the gradients
        optimizer.zero_grad()
        model.zero_grad()

        if i % 50 == 0:  # every 50 steps
            labels = labels[:, 1:]
            # out['logits'] is a 3-D tensor; take the argmax over dim 2 (the vocabulary)
            out = out['logits'].argmax(dim=2)[:, :-1]
            correct = (labels == out).sum().item()  # .item() extracts the scalar from a 0-dim tensor
            accuracy = correct / (16 * 7)  # 16 examples per batch, 7 scored positions each (8 tokens - 1 offset)
            lr = optimizer.state_dict()['param_groups'][0]['lr']
            print(i, loss.item(), accuracy, lr)
train()
0 6.9149088859558105 0.14285714285714285 1.9991980753809144e-05
50 5.734440803527832 0.1875 1.959101844426624e-05
100 5.432187557220459 0.1875 1.919005613472334e-05
150 5.808253765106201 0.16964285714285715 1.8789093825180436e-05
200 5.217792510986328 0.16071428571428573 1.838813151563753e-05
250 5.223909854888916 0.20535714285714285 1.7987169206094627e-05
...
2250 5.031280040740967 0.14285714285714285 1.948676824378509e-06
2300 4.822522163391113 0.2857142857142857 1.5477145148356058e-06
2350 4.803909778594971 0.25 1.1467522052927025e-06
2400 4.606936931610107 0.26785714285714285 7.457898957497996e-07
2450 4.976705074310303 0.24107142857142858 3.4482758620689656e-07
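Note that transformers.AdamW is deprecated in recent versions of transformers; if the import above fails, torch.optim.AdamW is a drop-in replacement (a sketch, not what was run here):

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)  # same hyperparameters as in train()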
7. Save the trained model
torch.save(model, '../data/预测最后一个词模型.model')
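torch.save(model, ...) pickles the whole Model object, which requires the Model class to be defined when loading. A common alternative (a sketch; the .pt filename is made up here) is to save only the weights:

torch.save(model.state_dict(), '../data/预测最后一个词模型.pt')  # weights only (hypothetical filename)
# later: model3 = Model(); model3.load_state_dict(torch.load('../data/预测最后一个词模型.pt'))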
8. Load the saved model
# Load the model saved in the previous section
# Note: the model was trained on the GPU; map_location='cpu' moves it to the CPU when loading
model2 = torch.load('../data/预测最后一个词模型.model', map_location='cpu')
test(model2)
0
tensor([627, 616, 1486, 4608, 290, 38132, 880, 262, 3900, 336, 890, 428, 764, 428, 837, 1377])
tensor([286, 262, 318, 3988, 837, 3807, 257, 262, 705, 14397, 3807, 262, 621, 262, 290, 326])
10
tensor([11815, 326, 663, 7464, 340, 3898, 287, 82, 257, 546, 3281, 262, 16631, 3807, 428, 2646])
tensor([7635, 286, 663, 3807, 286, 3898, 287, 82, 257, 257, 1146, 262, 837, 837, 262, 2646])
20
tensor([546, 1312, 307, 340, 262, 422, 1049, 11648, 640, 1267, 2089, 1683, 800, 1502, 355, 475])
tensor([345, 340, 307, 340, 262, 837, 2646, 3807, 262, 1267, 2089, 705, 6275, 262, 355, 475])
30
tensor([82, 19377, 764, 837, 3316, 1751, 508, 809, 1621, 3755, 12, 4681, 2071, 1039, 48133, 290])
tensor([82, 257, 326, 837, 2860, 290, 508, 705, 2646, 2567, 48133, 20170, 555, 1039, 48133, 290])
40
tensor([3704, 705, 2589, 36138, 534, 503, 262, 20024, 8591, 290, 788, 3923, 3807, 8925, 1128, 764])
tensor([898, 423, 1218, 772, 534, 345, 262, 20024, 262, 290, 257, 1692, 2646, 2568, 837, 837])
50
tensor([837, 5581, 764, 21981, 287, 12, 4379, 318, 705, 286, 2962, 10997, 3146, 717, 764, 1165])
tensor([837, 3159, 326, 257, 837, 288, 257, 318, 705, 286, 21452, 10512, 1438, 262, 326, 257])
accuracy:  0.25612745098039214
no movement , no yuks
, ,

whether seen on a 10-inch
television screen

a taut , intelligent psychological drama
. that

there are times when a rumor of
angels a

greene delivers a typically solid performance
in ,

some movies are like a tasty hors
- d

the visuals alone make metropolis worth
seeing a

an experience so engrossing it
is is
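To wrap up, an end-to-end inference sketch that mirrors test(): take a new sentence (the one below is only an illustration), keep 8 tokens, mask the last one and let the reloaded model guess it.

sentence = 'if you go in knowing that , you might enjoy the movie'  # illustrative sentence with at least 8 tokens
enc = tokenizer(sentence, return_tensors='pt')
input_ids = enc['input_ids'][:, :8]              # keep 8 tokens, as in training
attention_mask = enc['attention_mask'][:, :8]
label = input_ids[:, -1].clone()                 # the word the model should guess
input_ids[:, -1] = 0                             # erase it, exactly as in test()

model2.eval()
with torch.no_grad():
    out = model2(input_ids=input_ids, attention_mask=attention_mask)
pred = out['logits'].argmax(dim=2)[:, -2]        # one-token offset, same as in test()

print('target:    ', tokenizer.decode(label[0]))
print('prediction:', tokenizer.decode(pred[0]))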