简述
本文是 Pytorch封装简单RNN模型,进行中文训练及文本预测 一文的延伸,主要做以下改动:
1.将nn.RNN
替换为nn.LSTM
,并设置多层LSTM:
既然使用pytorch了,自然不需要手动实现多层,注意nn.RNN
和nn.LSTM
在实例化时均有参数num_layers
来指定层数,本文设置num_layers=2
;
2.新增embedding层,替换掉原来的nn.functional.one_hot
向量化,这样得到的embedding
层可以用来做词向量分布式表示;
3.在embedding后、LSTM内部、LSTM后均增加Dropout层,来抑制过拟合:
在nn.LSTM
内部的Dropout可以通过实例化时的参数dropout
来设置,需要注意pytorch仅在两层lstm之间应用Dropout
,不会在最后一层的LSTM输出上应用Dropout
。
embedding后、LSTM后与线性层之间则需要手动添加Dropout
层。
4.考虑embedding
与最后的Linear
层共享权重:
这样做可以在保证精度的情况下,减少学习参数,但本文代码没有实现该部分。
不考虑第四条时,模型结构如下:
代码
模型代码:
class MyLSTM(nn.Module):
    """Language model: Embedding -> Dropout -> multi-layer LSTM -> Dropout -> Linear.

    Args:
        vocab_size: Number of distinct tokens; also the size of the output logits.
        wordvec_size: Dimension of the embedding vectors fed to the LSTM.
        hidden_size: Hidden state size of every LSTM layer.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability used after the embedding, between LSTM
            layers, and between the LSTM output and the linear layer.
    """

    def __init__(self, vocab_size, wordvec_size, hidden_size, num_layers=2, dropout=0.5):
        super().__init__()
        self.vocab_size = vocab_size
        self.word_vec_size = wordvec_size
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(vocab_size, wordvec_size)
        self.dropout = nn.Dropout(dropout)
        # nn.LSTM only applies dropout *between* stacked layers, so with a
        # single layer the argument has no effect and merely triggers a warning.
        self.rnn = nn.LSTM(
            wordvec_size,
            hidden_size,
            num_layers=num_layers,
            dropout=dropout if num_layers > 1 else 0.0,
        )
        self.linear = nn.Linear(self.hidden_size, vocab_size)

    def forward(self, x, h0=None, c0=None):
        """Run the model on a batch of token ids.

        Args:
            x: Token ids of shape (batch_size, seq_len).
            h0: Optional initial hidden state, shape
                (num_layers, batch_size, hidden_size).
            c0: Optional initial cell state, same shape as ``h0``.
                When both states are omitted the LSTM starts from zeros.

        Returns:
            A tuple ``(outputs, (hn, cn))`` where ``outputs`` holds the logits
            with shape (seq_len * batch_size, vocab_size), laid out time-major
            (all batch rows of step 0 first, then step 1, ...), and
            ``(hn, cn)`` is the final LSTM state.
        """
        # nn.LSTM defaults to time-major input (seq_len, batch_size, features);
        # batch-major input would require batch_first=True on the nn.LSTM
        # instance. We keep the default, so transpose before embedding.
        # nn.Embedding needs integer indices; Tensor.long() is not in-place,
        # so the result must be assigned back.
        x = x.T.long()
        x = self.embedding(x)  # (seq_len, batch_size, wordvec_size)
        x = self.dropout(x)

        if h0 is None and c0 is None:
            state = None  # let nn.LSTM create zero states itself
        else:
            # Only one of the two was supplied: fill the other with zeros.
            if h0 is None:
                h0 = torch.zeros_like(c0)
            if c0 is None:
                c0 = torch.zeros_like(h0)
            state = (h0, c0)

        outputs, (hn, cn) = self.rnn(x, state)  # (seq_len, batch_size, hidden_size)
        # nn.LSTM does not apply dropout to the last layer's output; do it here.
        outputs = self.dropout(outputs)
        outputs = outputs.reshape(-1, self.hidden_size)
        outputs = self.linear(outputs)
        return outputs, (hn, cn)

    def init_state(self, device, batch_size=1):
        """Return zero-filled ``(h0, c0)``, each (num_layers, batch_size, hidden_size)."""
        shape = (self.rnn.num_layers, batch_size, self.hidden_size)
        return (torch.zeros(shape, device=device),
                torch.zeros(shape, device=device))
训练代码:
模型应用可以参考 Pytorch封装简单RNN模型,进行中文训练及文本预测 一文。
def start_train():
    """Train a MyLSTM language model on the chengyu QA corpus.

    Loads the corpus, builds the dataset and model, trains for the configured
    number of epochs with Adam and gradient clipping, logs the loss and
    per-epoch perplexity to TensorBoard, and saves a checkpoint after each
    epoch.

    Returns:
        A tuple ``(net, ppl_list)``: the trained model and the list of
        per-epoch perplexities.
    """
    import os  # local import: only needed to create the checkpoint directory

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print(f'\ndevice: {device}')

    corpus, vocab = load_corpus("../data/COIG-CQIA/chengyu_qa.txt")
    vocab_size = len(vocab)

    # Hyper-parameters.
    wordvec_size = 100
    hidden_size = 256
    epochs = 1
    batch_size = 50
    learning_rate = 0.01
    time_size = 4
    max_grad_max_norm = 0.5
    num_layers = 2
    dropout = 0.5

    dataset = make_dataset(corpus=corpus, time_size=time_size)
    data_loader = data.DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True)

    net = MyLSTM(vocab_size=vocab_size, wordvec_size=wordvec_size, hidden_size=hidden_size,
                 num_layers=num_layers, dropout=dropout)
    net.to(device)

    criterion = nn.CrossEntropyLoss()
    criterion.to(device)
    optimizer = optim.Adam(net.parameters(), lr=learning_rate)

    # Checkpoints are written here after every epoch; make sure it exists.
    os.makedirs('./save', exist_ok=True)

    writer = SummaryWriter('./train_logs')
    try:
        # Dummy input so add_graph can trace the model. The ids must be valid
        # embedding indices, hence the upper bound is vocab_size.
        tmp = torch.randint(0, vocab_size, size=(batch_size, time_size)).to(device)
        h0, c0 = net.init_state(batch_size=batch_size, device=device)
        writer.add_graph(net, [tmp, h0, c0])

        net.train()  # make sure dropout is active during training

        loss_counter = 0
        total_loss = 0
        ppl_list = list()
        total_train_step = 0

        for epoch in range(epochs):
            print('------------Epoch {}/{}'.format(epoch + 1, epochs))
            for X, y in data_loader:
                X, y = X.to(device), y.to(device)
                # The DataLoader does not drop the last incomplete batch, so
                # size the initial state from the actual batch.
                h0, c0 = net.init_state(batch_size=X.shape[0], device=device)
                outputs, (hn, cn) = net(X, h0, c0)

                optimizer.zero_grad()
                # Flatten y time-major so its rows line up with outputs.
                y = y.T.reshape(-1)
                # CrossEntropyLoss expects class-index targets as LongTensor.
                loss = criterion(outputs, y.long())
                loss.backward()
                # Clip gradients after backward and before the parameter update.
                grad_clipping(net, max_grad_max_norm)
                optimizer.step()

                total_loss += loss.item()
                loss_counter += 1
                total_train_step += 1
                if total_train_step % 10 == 0:
                    print(f'Epoch: {epoch + 1}, 累计训练次数: {total_train_step}, 本次loss: {loss.item():.4f}')
                    writer.add_scalar('train_loss', loss.item(), total_train_step)

            # Perplexity is exp of the mean batch loss over this epoch.
            ppl = np.exp(total_loss / loss_counter)
            ppl_list.append(ppl)
            print(f'Epoch {epoch + 1} 结束, batch_loss_average: {total_loss / loss_counter}, perplexity: {ppl}')
            writer.add_scalar('ppl', ppl, epoch + 1)
            total_loss = 0
            loss_counter = 0
            torch.save(net.state_dict(), './save/epoch_{}_ppl_{}.pth'.format(epoch + 1, ppl))
    finally:
        # Close the writer even if training fails.
        writer.close()

    return net, ppl_list