Contents
1. Load the pretrained model
2. Load the dataset from local disk
3. Dataset processing
4. Downstream task model
5. Test code
6. Training code
7. Save the trained model
8. Load the saved model
1. Load the pretrained model
# Load the tokenizer of the pretrained model (pretrained model: distilgpt2)
from transformers import AutoTokenizer

# use_fast=True: use the Rust-based tokenizer, which is much faster than the pure-Python one
tokenizer = AutoTokenizer.from_pretrained(r'../data/model/distilgpt2/', use_fast=True)

tokenizer.batch_encode_plus(['hide new secretions from the parental units',
                             'this moive is great'])
# Output:
# {'input_ids': [[24717, 649, 3200, 507, 422, 262, 21694, 4991], [5661, 6941, 425, 318, 1049]],
#  'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1]]}

print(tokenizer)
GPT2TokenizerFast(name_or_path='../data/model/distilgpt2/', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=False), added_tokens_decoder={50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True), }
# Predicting the last word is a multi-class classification problem:
# with vocab_size=50257, there are 50257 possible classes.
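As a quick sanity check (an optional sketch, not part of the pipeline below), one sentence can be encoded and decoded back to confirm the round trip and the number of classes:

ids = tokenizer('hide new secretions from the parental units')['input_ids']
print(ids)                    # the token ids shown in the output above
print(tokenizer.decode(ids))  # recovers the original sentence
print(tokenizer.vocab_size)   # 50257 -> the number of classes for last-word prediction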
2. Load the dataset from local disk
from datasets import load_from_disk

dataset = load_from_disk(r'E:/ALOT/10_deep_learning/data/datasets/glue_sst2/')
dataset
DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})
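If the local copy is not available, the same splits can usually be downloaded from the Hugging Face Hub instead (a sketch; requires network access and assumes the Hub id 'glue' / 'sst2'):

from datasets import load_dataset
dataset = load_dataset('glue', 'sst2')  # downloads and caches SST-2 with the same train/validation/test splits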
3. Dataset processing
# To predict the next word we only need the 'sentence' column; 'label' and 'idx' are not needed.
# Use map() to tokenize the sentences and drop everything else.
def f(dataset, tokenizer):
    return tokenizer.batch_encode_plus(dataset['sentence'])

# num_proc=8: see Task Manager -> Performance -> number of logical processors
dataset = dataset.map(f,
                      batched=True,
                      batch_size=1000,
                      num_proc=8,
                      remove_columns=['sentence', 'label', 'idx'],
                      fn_kwargs={'tokenizer': tokenizer})
dataset
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 1821
    })
})
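At this stage the tokenized sentences still have variable length; a quick optional peek confirms it:

print(len(dataset['train'][0]['input_ids']),
      len(dataset['train'][1]['input_ids']))  # lengths differ from sentence to sentence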
# Require every sentence to have at least 8 tokens:
# filter out the sentences that are too short.
def f_1(dataset):
    return [len(i) >= 8 for i in dataset['input_ids']]  # i is the token list of one sentence

dataset = dataset.filter(f_1, batched=True, batch_size=1000, num_proc=8)
dataset
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 39905
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 848
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 1730
    })
})
# Truncate every sentence to 8 tokens
def f_2(dataset):
    # Truncate input_ids and attention_mask separately
    dataset['input_ids'] = [i[:8] for i in dataset['input_ids']]
    dataset['attention_mask'] = [[1] * 8] * len(dataset['attention_mask'])
    # To predict the last word, the first seven tokens are the input and the last one is the target.
    # The model handles the one-token offset itself, so labels can simply equal input_ids here.
    dataset['labels'] = dataset['input_ids']
    return dataset

dataset = dataset.map(f_2, batched=True, batch_size=1000, num_proc=8)
dataset
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 39905
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 848
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1730
    })
})
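An optional check that the truncation worked: every remaining example should now be exactly 8 tokens long, so no padding will be needed when batching:

assert all(len(x) == 8 for x in dataset['train'][:1000]['input_ids'])  # spot-check the first 1000 examples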
dataset['train'][0]
{'input_ids': [24717, 649, 3200, 507, 422, 262, 21694, 4991],'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1],'labels': [24717, 649, 3200, 507, 422, 262, 21694, 4991]}
# Define the data loader
import torch

# default_data_collator: turns per-example retrieval into batched retrieval
from transformers.data.data_collator import default_data_collator

loader = torch.utils.data.DataLoader(dataset=dataset['train'],  # use the training split
                                     batch_size=16,
                                     collate_fn=default_data_collator,
                                     shuffle=True,
                                     drop_last=True)  # drop the last batch if it has fewer than batch_size examples

for data in loader:
    break

len(loader), data
(2494,{'input_ids': tensor([[22602, 4340, 262, 2126, 286, 1642, 479, 993],[ 5832, 651, 262, 10647, 326, 6260, 290, 3437],[ 11, 645, 530, 460, 3285, 345, 3013, 382],[48580, 257, 2612, 290, 3950, 326, 36675, 262],[ 361, 345, 467, 287, 6970, 326, 837, 345],[ 270, 705, 82, 257, 4950, 2646, 837, 1336],[ 71, 1794, 6819, 837, 26996, 6819, 6776, 837],[11246, 7650, 30669, 13766, 17548, 351, 6159, 220],[ 1169, 11918, 286, 281, 7876, 1013, 30909, 1358],[22437, 299, 470, 1612, 340, 705, 82, 922],[ 270, 705, 82, 23056, 284, 766, 257, 3807],[ 5832, 1244, 892, 339, 373, 2491, 329, 2607],[ 2395, 9259, 736, 49253, 837, 11441, 2223, 16311],[ 8505, 837, 1312, 11691, 340, 705, 82, 14081],[ 1169, 1306, 13203, 29185, 286, 262, 1842, 8848],[26535, 867, 14138, 290, 41169, 12, 44517, 2628]]),'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1],[1, 1, 1, 1, 1, 1, 1, 1],[1, 1, 1, 1, 1, 1, 1, 1],[1, 1, 1, 1, 1, 1, 1, 1],[1, 1, 1, 1, 1, 1, 1, 1],[1, 1, 1, 1, 1, 1, 1, 1],[1, 1, 1, 1, 1, 1, 1, 1],[1, 1, 1, 1, 1, 1, 1, 1],[1, 1, 1, 1, 1, 1, 1, 1],[1, 1, 1, 1, 1, 1, 1, 1],[1, 1, 1, 1, 1, 1, 1, 1],[1, 1, 1, 1, 1, 1, 1, 1],[1, 1, 1, 1, 1, 1, 1, 1],[1, 1, 1, 1, 1, 1, 1, 1],[1, 1, 1, 1, 1, 1, 1, 1],[1, 1, 1, 1, 1, 1, 1, 1]]),'labels': tensor([[22602, 4340, 262, 2126, 286, 1642, 479, 993],[ 5832, 651, 262, 10647, 326, 6260, 290, 3437],[ 11, 645, 530, 460, 3285, 345, 3013, 382],[48580, 257, 2612, 290, 3950, 326, 36675, 262],[ 361, 345, 467, 287, 6970, 326, 837, 345],[ 270, 705, 82, 257, 4950, 2646, 837, 1336],[ 71, 1794, 6819, 837, 26996, 6819, 6776, 837],[11246, 7650, 30669, 13766, 17548, 351, 6159, 220],[ 1169, 11918, 286, 281, 7876, 1013, 30909, 1358],[22437, 299, 470, 1612, 340, 705, 82, 922],[ 270, 705, 82, 23056, 284, 766, 257, 3807],[ 5832, 1244, 892, 339, 373, 2491, 329, 2607],[ 2395, 9259, 736, 49253, 837, 11441, 2223, 16311],[ 8505, 837, 1312, 11691, 340, 705, 82, 14081],[ 1169, 1306, 13203, 29185, 286, 262, 1842, 8848],[26535, 867, 14138, 290, 41169, 12, 44517, 2628]])})
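default_data_collator simply stacks equal-length feature lists into tensors, which is why the truncation to 8 tokens above removes any need for padding. A minimal optional illustration:

batch = default_data_collator([dataset['train'][0], dataset['train'][1]])
print(batch['input_ids'].shape)  # expected: torch.Size([2, 8])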
4. Downstream task model
from transformers import AutoModelForCausalLM, GPT2Model

class Model(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.pretrained = GPT2Model.from_pretrained('../data/model/distilgpt2/')
        self.fc = torch.nn.Linear(768, tokenizer.vocab_size, bias=False)

        # Load the model that carries the pretrained weights
        parameters = AutoModelForCausalLM.from_pretrained('../data/model/distilgpt2/')
        # Initialize the fully connected layer with the pretrained lm_head weights
        self.fc.load_state_dict(parameters.lm_head.state_dict())
        # Shorthand for the four lines above:
        # self.pretrained = AutoModelForCausalLM.from_pretrained('../data/model/distilgpt2/')

        self.criterion = torch.nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, labels=None):
        logits = self.pretrained(input_ids=input_ids, attention_mask=attention_mask)
        logits = logits.last_hidden_state
        logits = self.fc(logits)

        loss = None
        if labels is not None:
            # The inputs and labels are offset by one position:
            # keep dims 0 and 2, drop the last position of dim 1
            shift_logits = logits[:, :-1].reshape(-1, tokenizer.vocab_size)  # -1 merges the first two dims
            # keep dims 0 and 2, drop the first position of dim 1
            shift_labels = labels[:, 1:].reshape(-1)  # flatten the 2-D tensor to 1-D
            loss = self.criterion(shift_logits, shift_labels)

        return {'loss': loss, 'logits': logits}

model = Model()
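The shift in forward() is worth spelling out: with 8 input tokens, the logits at positions 0-6 are scored against tokens 1-7 of the labels, so each sentence contributes 7 predictions (this is also where the 16 * 7 in the training accuracy later comes from). A toy-shaped sketch with random values:

toy_logits = torch.randn(2, 8, tokenizer.vocab_size)                 # (batch, seq_len, vocab)
toy_labels = torch.randint(0, tokenizer.vocab_size, (2, 8))          # (batch, seq_len)
shift_logits = toy_logits[:, :-1].reshape(-1, tokenizer.vocab_size)  # (2 * 7, vocab)
shift_labels = toy_labels[:, 1:].reshape(-1)                         # (2 * 7,)
print(torch.nn.CrossEntropyLoss()(shift_logits, shift_labels))       # loss over the 14 shifted positions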
# Parameter count
print(sum(i.numel() for i in model.parameters()) / 10000)  # divide by 10000 to report in units of 10,000
12050.9952
# Python's ** operator unpacks a dict into keyword arguments: {'key': value} -> key=value
out = model(**data)
# print(out)  # a dict with 'loss' and 'logits'
print(out['loss'], out['logits'].shape)
tensor(6.2742, grad_fn=<NllLossBackward0>) torch.Size([16, 8, 50257])
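Since self.fc was initialized from the pretrained lm_head, the hand-built head plus the manual shift should reproduce the loss of the stock causal-LM model. An optional sanity-check sketch (eval mode disables dropout so the two losses are comparable):

ref = AutoModelForCausalLM.from_pretrained('../data/model/distilgpt2/')
model.eval()  # disable dropout for a fair comparison
with torch.no_grad():
    ref_out = ref(input_ids=data['input_ids'],
                  attention_mask=data['attention_mask'],
                  labels=data['labels'])
    my_out = model(**data)
print(ref_out.loss, my_out['loss'])  # the two values should be (nearly) identical
model.train()  # back to the default training mode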
5. Test code
# Test code
def test(model):
    model.eval()

    # Load the test data
    loader_test = torch.utils.data.DataLoader(dataset=dataset['test'],
                                              batch_size=16,
                                              collate_fn=default_data_collator,
                                              shuffle=True,  # shuffling is optional at test time
                                              drop_last=True)

    correct = 0
    total = 0
    for i, data in enumerate(loader_test):
        # Only the accuracy on the last word is measured.
        label = data['input_ids'][:, -1].clone()  # clone() so the original data is not modified
        # Erase the last word from the input so the model cannot cheat.
        data['input_ids'][:, -1] = 0  # zero out the last column
        # The labels are not needed here, so zero them out as well.
        data['labels'][:, :] = 0

        # No gradients are needed at test time (gradient descent only happens during training)
        with torch.no_grad():
            # equivalent to out = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            out = model(**data)

        # Accuracy on the last word: because of the one-token offset, the prediction
        # for the last word sits at the second-to-last position.
        out = out['logits'].argmax(dim=2)[:, -2]
        # .item() extracts a Python number from a single-element (0-dim, i.e. scalar) tensor.
        correct += (label == out).sum().item()
        total += 16  # 16 is the batch_size

        if i % 10 == 0:  # every 10 batches
            print(i)
            print(label)
            print(out)

        if i == 50:  # only run 50 batches
            break

    print('accuracy: ', correct / total)

    for i in range(8):
        print(tokenizer.decode(data['input_ids'][i, :-1]))
        print(tokenizer.decode(label[i]), tokenizer.decode(out[i]))
        print()
test(model)
0
tensor([416, 42738, 7297, 2709, 651, 837, 290, 349, 290, 11815, 72, 14505, 7559, 532, 3822, 262])
tensor([11, 262, 3807, 5000, 1064, 13, 11, 453, 13, 13664, 72, 983, 340, 13, 12, 262])
10
tensor([284, 428, 991, 705, 318, 11783, 1787, 65, 43527, 2306, 460, 8395, 743, 6386, 2370, 393])
tensor([284, 262, 262, 447, 11, 11, 76, 65, 38520, 20706, 318, 502, 468, 6386, 4899, 1022])
20
tensor([7357, 1936, 4572, 2465, 1049, 257, 7358, 262, 29963, 2646, 517, 290, 9188, 1647, 278, 1241])
tensor([7357, 584, 290, 1143, 649, 262, 2656, 262, 7051, 11, 257, 284, 262, 12, 278, 1210])
30
tensor([14969, 5239, 3016, 837, 43207, 262, 764, 4129, 307, 262, 705, 465, 262, 837, 1100, 2700])
tensor([67, 5239, 12302, 13, 2033, 262, 284, 1988, 307, 262, 1053, 262, 262, 13, 1621, 2700])
40
tensor([21730, 13770, 2737, 264, 477, 2218, 262, 257, 340, 8886, 848, 14821, 1178, 705, 787, 1239])
tensor([13, 290, 2737, 670, 262, 290, 262, 340, 6, 983, 257, 262, 1438, 460, 787, 318])
50
tensor([262, 6840, 763, 286, 611, 286, 475, 2915, 764, 837, 4379, 12986, 10997, 272, 257, 1200])
tensor([262, 651, 318, 1683, 621, 286, 475, 12, 284, 837, 257, 262, 10997, 272, 257, 262])
accuracy:  0.18382352941176472
automatically pegs itself for
the the

if there 's a way to
effectively get

while benigni ( who stars and
co is

the stupidest , most insulting movie
of ever

the film might have been more satisfying
if than

a journey through memory , a celebration
of of

the story may not be new ,
but but

there are touching moments in eto
iles -
6. Training code
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device
device(type='cuda', index=0)
from transformers import AdamW
from transformers.optimization import get_scheduler

# Training code
def train():
    # Optimizer: the algorithm that performs gradient descent
    optimizer = AdamW(model.parameters(), lr=2e-5)
    # Learning-rate schedule
    scheduler = get_scheduler(name='linear',
                              num_warmup_steps=0,  # no warmup steps
                              num_training_steps=len(loader),  # total steps = length of the training loader
                              optimizer=optimizer)

    # Move the model to the device
    model.to(device)
    model.train()  # training mode

    for i, data in enumerate(loader):
        input_ids, attention_mask, labels = data['input_ids'], data['attention_mask'], data['labels']
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        out = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

        # The loss is already computed inside the model; read it from the output
        loss = out['loss']

        # Backpropagate the loss
        loss.backward()
        # Clip the gradients after backward and before the optimizer step,
        # to keep training stable (avoid large fluctuations); max_norm is the c in the clipping formula
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        # Update the parameters and the learning rate
        optimizer.step()
        scheduler.step()
        # Reset the gradients
        optimizer.zero_grad()
        model.zero_grad()

        if i % 50 == 0:  # every 50 steps
            labels = labels[:, 1:]
            # out['logits'] is a 3-D tensor; take the argmax over dim 2 (the vocabulary)
            out = out['logits'].argmax(dim=2)[:, :-1]
            correct = (labels == out).sum().item()  # .item() extracts the scalar from a 0-dim tensor
            accuracy = correct / (16 * 7)  # 16 examples per batch, 7 scored positions each (8 tokens - 1 offset)
            lr = optimizer.state_dict()['param_groups'][0]['lr']
            print(i, loss.item(), accuracy, lr)
train()
0 6.9149088859558105 0.14285714285714285 1.9991980753809144e-05
50 5.734440803527832 0.1875 1.959101844426624e-05
100 5.432187557220459 0.1875 1.919005613472334e-05
150 5.808253765106201 0.16964285714285715 1.8789093825180436e-05
200 5.217792510986328 0.16071428571428573 1.838813151563753e-05
250 5.223909854888916 0.20535714285714285 1.7987169206094627e-05
...
2250 5.031280040740967 0.14285714285714285 1.948676824378509e-06
2300 4.822522163391113 0.2857142857142857 1.5477145148356058e-06
2350 4.803909778594971 0.25 1.1467522052927025e-06
2400 4.606936931610107 0.26785714285714285 7.457898957497996e-07
2450 4.976705074310303 0.24107142857142858 3.4482758620689656e-07
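Note that transformers.AdamW is deprecated in recent versions of transformers; if the import above fails, torch.optim.AdamW is a drop-in replacement (a sketch, not what was run here):

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)  # same hyperparameters as in train()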
7. Save the trained model
torch.save(model, '../data/预测最后一个词模型.model')
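torch.save(model, ...) pickles the whole Model object, which requires the Model class to be defined when loading. A common alternative (a sketch; the .pt filename is made up here) is to save only the weights:

torch.save(model.state_dict(), '../data/预测最后一个词模型.pt')  # weights only (hypothetical filename)
# later: model3 = Model(); model3.load_state_dict(torch.load('../data/预测最后一个词模型.pt'))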
8. Load the saved model
# Load the model saved in the previous section
# Note: the model was trained on the GPU; map_location='cpu' moves it to the CPU when loading
model2 = torch.load('../data/预测最后一个词模型.model', map_location='cpu')
test(model2)
0
tensor([627, 616, 1486, 4608, 290, 38132, 880, 262, 3900, 336, 890, 428, 764, 428, 837, 1377])
tensor([286, 262, 318, 3988, 837, 3807, 257, 262, 705, 14397, 3807, 262, 621, 262, 290, 326])
10
tensor([11815, 326, 663, 7464, 340, 3898, 287, 82, 257, 546, 3281, 262, 16631, 3807, 428, 2646])
tensor([7635, 286, 663, 3807, 286, 3898, 287, 82, 257, 257, 1146, 262, 837, 837, 262, 2646])
20
tensor([546, 1312, 307, 340, 262, 422, 1049, 11648, 640, 1267, 2089, 1683, 800, 1502, 355, 475])
tensor([345, 340, 307, 340, 262, 837, 2646, 3807, 262, 1267, 2089, 705, 6275, 262, 355, 475])
30
tensor([82, 19377, 764, 837, 3316, 1751, 508, 809, 1621, 3755, 12, 4681, 2071, 1039, 48133, 290])
tensor([82, 257, 326, 837, 2860, 290, 508, 705, 2646, 2567, 48133, 20170, 555, 1039, 48133, 290])
40
tensor([3704, 705, 2589, 36138, 534, 503, 262, 20024, 8591, 290, 788, 3923, 3807, 8925, 1128, 764])
tensor([898, 423, 1218, 772, 534, 345, 262, 20024, 262, 290, 257, 1692, 2646, 2568, 837, 837])
50
tensor([837, 5581, 764, 21981, 287, 12, 4379, 318, 705, 286, 2962, 10997, 3146, 717, 764, 1165])
tensor([837, 3159, 326, 257, 837, 288, 257, 318, 705, 286, 21452, 10512, 1438, 262, 326, 257])
accuracy:  0.25612745098039214
no movement , no yuks
, ,

whether seen on a 10-inch
television screen

a taut , intelligent psychological drama
. that

there are times when a rumor of
angels a

greene delivers a typically solid performance
in ,

some movies are like a tasty hors
- d

the visuals alone make metropolis worth
seeing a

an experience so engrossing it
is is
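To wrap up, an end-to-end inference sketch that mirrors test(): take a new sentence (the one below is only an illustration), keep 8 tokens, mask the last one and let the reloaded model guess it.

sentence = 'if you go in knowing that , you might enjoy the movie'  # illustrative sentence with at least 8 tokens
enc = tokenizer(sentence, return_tensors='pt')
input_ids = enc['input_ids'][:, :8]              # keep 8 tokens, as in training
attention_mask = enc['attention_mask'][:, :8]
label = input_ids[:, -1].clone()                 # the word the model should guess
input_ids[:, -1] = 0                             # erase it, exactly as in test()

model2.eval()
with torch.no_grad():
    out = model2(input_ids=input_ids, attention_mask=attention_mask)
pred = out['logits'].argmax(dim=2)[:, -2]        # one-token offset, same as in test()

print('target:    ', tokenizer.decode(label[0]))
print('prediction:', tokenizer.decode(pred[0]))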