继续上文书写:
1 GRU + Attention
收敛速度稳定得多。你看,这些模型是不是很容易搭?就像搭积木一样:
def create_model(input_shape, output_length, lr=1e-3, warehouse="None"):
    """Build and compile a Conv1D -> GRU -> multi-head self-attention model.

    Args:
        input_shape: Shape of one sample (without the batch axis); passed
            unchanged to ``Input``.
        output_length: Number of units of the final ``Dense`` layer.
        lr: Learning rate given to the RMSprop optimizer.
        warehouse: Not used by this function; kept so existing callers that
            pass it keep working.

    Returns:
        A Keras ``Model`` compiled with mean-squared-error loss and RMSprop.
    """
    inputs = Input(shape=input_shape)
    conv1 = Conv1D(
        filters=32,
        kernel_size=3,
        activation='relu',
        padding='same',
        kernel_regularizer=l2(),
    )(inputs)
    bn = BatchNormalization()(conv1)
    drop = Dropout(0.4)(bn)
    gru = GRU(units=64, activation='relu', return_sequences=True)(drop)
    # Self-attention: the GRU output is passed as all three attention inputs.
    mult_att = MultiHeadAttention(num_heads=4, key_dim=8)(gru, gru, gru)
    mult_att_f = Flatten()(mult_att)
    outputs = Dense(output_length)(mult_att_f)
    print(outputs.shape)
    model = Model(inputs=inputs, outputs=outputs)
    # `learning_rate` is the supported keyword; the legacy `lr` alias is
    # deprecated and newer Keras releases warn about or reject it.
    model.compile(
        loss=MeanSquaredError(),
        optimizer=tf.keras.optimizers.RMSprop(learning_rate=lr),
    )
    return model
2 Informer
Informer 模型是一种专为长序列时间序列预测设计的高效 Transformer 架构。它的输入通常是时间序列数据,这些数据可以是多变量的或是单变量的。具体来说,Informer 模型接收以下类型的输入:
时间序列观测值:
- 这是模型的主要输入,通常是一个形状为 `(batch_size, sequence_length, feature_dim)` 的张量。其中:
  - `batch_size` 是同时处理的序列数量。
  - `sequence_length` 是每个序列的时间步数。
  - `feature_dim` 是每个时间步上的特征数量:对于单变量时间序列,`feature_dim` 为 1;对于多变量时间序列,`feature_dim` 大于 1。
目标序列长度:
- 在训练过程中,Informer 可能还需要知道目标序列的长度,即要预测的未来时间步数,这通常用于监督学习设置中的教师强制(teacher forcing)。
标记序列:
- 除了时间序列观测值外,Informer 还可能接受额外的输入,比如标记序列,这些可以包含额外的信息,如节假日、工作日/非工作日等,以帮助模型更好地理解输入数据的上下文。
编码器和解码器输入掩码:
- Informer 使用自注意力机制,这需要输入掩码来指示哪些位置是有效的,哪些位置应该被忽略。在编码器和解码器中,掩码用于避免某些位置的信息泄露。
位置信息:
- 虽然不是严格意义上的输入,但 Informer 和其他 Transformer 模型通常使用位置嵌入(Position Embedding)来给模型提供关于序列中每个元素位置的信息。这通常是在输入序列处理的第一阶段内部实现的,而不是作为外部输入。
当使用 Informer 进行预测时,你通常需要提供过去的时间序列观测值以及你想要预测的未来时间步数。模型会基于过去的观测值预测未来的值。在实际应用中,可能还需要提供一些配置参数,比如嵌入维度、注意力头的数量、前馈网络的宽度等。
添加了一个位置编码,但是感觉不对。
import warnings

import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
#import h3
#import lightgbm as lgb
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.layers import Input, Embedding, LayerNormalization, Dense, Dropout, Concatenate,Flatten
from tensorflow.keras.layers import BatchNormalization, Conv1D, GRU, MultiHeadAttention
from tensorflow.keras.losses import MeanSquaredError, Huber,MeanAbsoluteError
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
from tensorflow.keras.regularizers import l2
from tqdm import tqdm

warnings.filterwarnings('ignore')
pd.set_option("display.max_rows",None)
from tensorflow.keras import layers

# Load the raw train / test tables.
train_df = pd.read_csv('../data/dataset/train.csv')
#train_df = train_df[train_df.dt<100].reset_index(drop=True)
test_df = pd.read_csv('../data/dataset/test.csv')
#train_df['target_div_dt'] = train_df['target'] / train_df['dt']

# Stack both tables so the per-id time index can be rebuilt in one pass.
# (This assignment had been fused onto the end of the comment above it, which
# left `df_all` undefined.)
df_all = pd.concat([train_df, test_df])
df_all['dt_max'] = df_all.groupby('id')['dt'].transform('max')
df_all = df_all.sort_values(['id', 'dt']).reset_index(drop=True)
# new_dt = (largest dt of the id) - dt, so the row with the largest dt gets 0.
# NOTE(review): presumably dt counts backwards in time and new_dt restores
# chronological order -- confirm against the dataset description.
df_all['new_dt'] = df_all['dt_max'] - df_all['dt']
df_all = df_all.sort_values(['id', 'new_dt']).reset_index(drop=True)
df_all.tail()  # only displays something in a notebook cell
# Rows with a target form the training set; rows without one are to be predicted.
train_df = df_all[~df_all['target'].isna()].reset_index(drop=True)
test_df = df_all[df_all['target'].isna()].reset_index(drop=True)


def create_dataset(X, n_steps_in, n_steps_out):
    """Cut a sequence into sliding (input window, target window) pairs.

    Window ``i`` takes ``X[i:i + n_steps_in]`` as input and the following
    ``n_steps_out`` entries as target. The shapes of the input and of both
    results are printed.

    Args:
        X: Array-like supporting ``len``, slicing and ``.shape``.
        n_steps_in: Number of entries in each input window.
        n_steps_out: Number of entries in each target window.

    Returns:
        Tuple ``(Xs, ys)`` of NumPy arrays. Both are empty when ``X`` is
        shorter than ``n_steps_in + n_steps_out``.
    """
    print(f"Input data shape before processing: {X.shape}")
    n_windows = len(X) - n_steps_in - n_steps_out + 1
    starts = range(n_windows)  # empty when n_windows <= 0
    Xs = np.array([X[i:i + n_steps_in] for i in starts])
    ys = np.array([X[i + n_steps_in:i + n_steps_in + n_steps_out] for i in starts])
    print(f"Xs shape after processing: {Xs.shape}")
    print(f"ys shape after processing: {ys.shape}")
    return Xs, ys


# Define the position embedding layer
class PositionEmbedding(layers.Layer):
    """Add a trainable per-position vector to every time step of the input.

    A ``(max_positions, embedding_dim)`` weight table is created in ``build``.
    In ``call`` the first ``seq_len`` rows (``seq_len`` = size of input axis 1)
    are added to the input. The addition uses ordinary broadcasting, so an
    input whose last axis has size 1 comes out with last axis
    ``embedding_dim``.

    Args:
        max_positions: Number of rows in the table, i.e. the longest sequence
            that can be embedded.
        embedding_dim: Width of each position vector.
    """

    def __init__(self, max_positions=512, embedding_dim=128, **kwargs):
        super().__init__(**kwargs)
        self.max_positions = max_positions
        self.embedding_dim = embedding_dim

    def build(self, input_shape):
        shape = tuple(input_shape)
        seq_len = shape[1] if len(shape) > 1 else None
        # Fail early: gathering rows beyond the table is an error at run time.
        if seq_len is not None and seq_len > self.max_positions:
            raise ValueError(
                f"Sequence length {seq_len} exceeds max_positions="
                f"{self.max_positions}"
            )
        self.positions = self.add_weight(
            shape=(self.max_positions, self.embedding_dim),
            initializer='uniform',
            trainable=True,
            name='position_embeddings',
        )
        super().build(input_shape)

    def call(self, inputs):
        seq_len = tf.shape(inputs)[1]
        positions = tf.range(0, seq_len, delta=1)
        position_embeddings = tf.gather(self.positions, positions)
        return inputs + position_embeddings

    def compute_output_shape(self, input_shape):
        shape = list(input_shape)
        # A size-1 feature axis is broadcast up to embedding_dim by the add.
        if shape and shape[-1] == 1:
            shape[-1] = self.embedding_dim
        return tuple(shape)

    def get_config(self):
        # Expose the constructor arguments so the layer can be re-created
        # from its config.
        config = super().get_config()
        config.update({
            'max_positions': self.max_positions,
            'embedding_dim': self.embedding_dim,
        })
        return config


# Define the ProbSparseSelfAttention layer (simplified version)
class ProbSparseSelfAttention(layers.Layer):
    """Multi-head scaled dot-product self-attention over axis 1 of the input.

    Simplified version: despite the name, a full softmax attention is
    computed; no sparse query selection is performed.

    Args:
        embed_dim: Width of the output and total width of all heads.
        num_heads: Number of attention heads; must divide ``embed_dim``.

    Raises:
        ValueError: If ``embed_dim`` is not divisible by ``num_heads``.
    """

    def __init__(self, embed_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        if embed_dim % num_heads != 0:
            raise ValueError(
                f"embed_dim ({embed_dim}) must be divisible by "
                f"num_heads ({num_heads})"
            )
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        self.scale = self.head_dim ** -0.5
        # One projection yields queries, keys and values side by side.
        self.qkv = Dense(embed_dim * 3, use_bias=False)
        self.out = Dense(embed_dim)

    def _split_heads(self, t, seq_len):
        """Turn (batch, seq, embed_dim) into (batch, heads, seq, head_dim)."""
        # Split the feature axis first, then move heads in front of the
        # sequence axis. Reshaping straight to (batch, heads, seq, head_dim)
        # would mix values from different time steps.
        t = tf.reshape(t, (-1, seq_len, self.num_heads, self.head_dim))
        return tf.transpose(t, perm=[0, 2, 1, 3])

    def call(self, x):
        seq_len = tf.shape(x)[1]
        q, k, v = tf.split(self.qkv(x), 3, axis=-1)
        q, k, v = [self._split_heads(t, seq_len) for t in (q, k, v)]
        # (batch, heads, seq, seq): every position attends to every position.
        attn = tf.matmul(q, k, transpose_b=True) * self.scale
        attn = tf.nn.softmax(attn, axis=-1)
        output = tf.matmul(attn, v)  # (batch, heads, seq, head_dim)
        # Undo the head split: back to (batch, seq, embed_dim).
        output = tf.transpose(output, perm=[0, 2, 1, 3])
        output = tf.reshape(output, (-1, seq_len, self.embed_dim))
        return self.out(output)


# Define the Informer block
class InformerBlock(layers.Layer):
    """Self-attention followed by a two-layer feed-forward network.

    Each of the two sub-layers is followed by dropout, added back to its own
    input, and layer-normalized.

    Args:
        embed_dim: Width handed to the attention layer and to the second
            feed-forward ``Dense`` layer.
        num_heads: Number of heads handed to the attention layer.
        ff_dim: Width of the first (ReLU) feed-forward ``Dense`` layer.
        dropout_rate: Rate used by both dropout layers.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, dropout_rate=0.1, **kwargs):
        super().__init__(**kwargs)
        self.attention = ProbSparseSelfAttention(embed_dim, num_heads)
        feed_forward_layers = [
            Dense(ff_dim, activation="relu"),
            Dense(embed_dim),
        ]
        self.ffn = tf.keras.Sequential(feed_forward_layers)
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(dropout_rate)
        self.dropout2 = Dropout(dropout_rate)

    def call(self, inputs):
        # Attention sub-layer with residual connection.
        attended = self.dropout1(self.attention(inputs))
        normed = self.layernorm1(inputs + attended)
        # Feed-forward sub-layer with residual connection.
        transformed = self.dropout2(self.ffn(normed))
        return self.layernorm2(normed + transformed)


# Define the Informer model
def create_informer_model(input_seq_length, out_seq_length, input_dim, embed_dim, num_heads, ff_dim, num_blocks, dropout_rate=0.1):
    """Build the (uncompiled) Informer-style forecasting model.

    Args:
        input_seq_length: Number of time steps in each input sample.
        out_seq_length: Number of units of the final ``Dense`` layer.
        input_dim: Number of features per time step.
        embed_dim: Model width used by the projection, the position
            embedding and every ``InformerBlock``.
        num_heads: Attention heads per block.
        ff_dim: Hidden width of each block's feed-forward network.
        num_blocks: Number of stacked ``InformerBlock`` layers.
        dropout_rate: Dropout rate passed to every block.

    Returns:
        A Keras ``Model`` taking ``(input_seq_length, input_dim)`` samples.
    """
    inputs = Input(shape=(input_seq_length, input_dim))
    # Project the raw features to embed_dim before adding positions.
    # Previously the position table kept its default width, so the model only
    # built when embed_dim matched that default, and the raw value was merely
    # broadcast across the embedding axis.
    x = Dense(embed_dim)(inputs)
    x = PositionEmbedding(max_positions=input_seq_length, embedding_dim=embed_dim)(x)
    for _ in range(num_blocks):
        x = InformerBlock(embed_dim, num_heads, ff_dim, dropout_rate)(x)
    x = Flatten()(x)
    x = Dense(out_seq_length)(x)
    model = Model(inputs=inputs, outputs=x)
    return model


def plot_loss(history, warehouse):
    """Plot training and validation loss curves and mark the best epoch.

    Args:
        history: Object whose ``history`` dict holds the ``'loss'`` and
            ``'val_loss'`` lists (as returned by ``model.fit``).
        warehouse: Identifier shown in the figure title.
    """
    # Imported here so the function does not depend on a module-level import
    # that appears later in the file.
    import matplotlib.pyplot as plt

    plt.figure(figsize=(8, 6))
    # training and validation loss
    plt.plot(history.history['loss'], label='Training Loss', color='blue', linewidth=2)
    plt.plot(history.history['val_loss'], label='Validation Loss', color='orange', linewidth=2)
    # minimum validation loss
    min_val_loss = min(history.history['val_loss'])
    min_val_loss_epoch = history.history['val_loss'].index(min_val_loss)
    plt.axvline(min_val_loss_epoch, linestyle='--', color='gray', linewidth=1)
    plt.text(min_val_loss_epoch, min_val_loss, f'Min Val Loss: {min_val_loss:.4f}',
             verticalalignment='bottom', horizontalalignment='right', color='gray', fontsize=10)
    plt.title(f'Training and Validation Loss for ID: {warehouse}', fontsize=16)
    plt.xlabel('Epoch', fontsize=14)
    plt.ylabel('Loss', fontsize=14)
    plt.legend(fontsize=12)
    plt.grid(True)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    plt.tight_layout()
    #plt.savefig(f'training_validation_loss_{warehouse}.png', dpi=300)
    plt.show()


n_features = 1  # the series is one-dimensional
import os
import random

import numpy as np

n_steps_in = 70  # length of the input window
n_steps_out = 10  # length of the predicted window

# Data preprocessing
# Reference submission that the per-id predictions are compared against.
# (This assignment had been fused onto the end of a comment line, which left
# `best_sub` undefined.)
best_sub = pd.read_csv('../data/sub/es_xgb_55_big_diff_fill_es_216.csv')
tf.random.set_seed(42)
np.random.seed(42)
error_df = {}


def set_random_seed(seed_value):
    """Seed Python, NumPy and TensorFlow random generators with one value.

    Args:
        seed_value: Seed applied to every generator below.
    """
    # Set `PYTHONHASHSEED` environment variable at a fixed value.
    # Python reads this variable at interpreter start-up, so setting it here
    # affects child processes rather than the running interpreter.
    os.environ['PYTHONHASHSEED'] = str(seed_value)
    # Set `python` built-in pseudo-random generator at a fixed value
    random.seed(seed_value)
    # Set `numpy` pseudo-random generator at a fixed value
    np.random.seed(seed_value)
    # Set `tensorflow` pseudo-random generator at a fixed value
    tf.random.set_seed(seed_value)


set_random_seed(42)
import keras


class PrintCallback(keras.callbacks.Callback):
    """Print the epoch's loss values every ``print_every`` epochs.

    Args:
        print_every: Print on epochs whose 1-based number is a multiple of
            this value.

    Raises:
        ValueError: If ``print_every`` is smaller than 1.
    """

    def __init__(self, print_every=1):
        super().__init__()
        if print_every < 1:
            raise ValueError("print_every must be >= 1")
        self.print_every = print_every

    def on_epoch_end(self, epoch, logs=None):
        if (epoch + 1) % self.print_every != 0:
            return
        # `logs` may be None, and `val_loss` is absent when no validation
        # data is given; report only what is available instead of crashing.
        logs = logs or {}
        shown = [
            f"{name}={logs[name]:.4f}"
            for name in ("loss", "val_loss")
            if name in logs
        ]
        print(f"Epoch {epoch + 1}: {', '.join(shown)}")


# Usage example
# Train one model per id, write its forecast into `test_df`, and record the
# mean squared error between that forecast and the reference submission.
print_every_n_epochs = 5 # print once every 5 epochs
error_id = []
for id in tqdm(train_df.id.unique().tolist()):
    try:
        # Target history of this id; the last n_steps_in values are the
        # input used for the final forecast.
        temp_df = train_df[train_df.id==id].reset_index(drop=True)
        X = temp_df.target.values
        x_test = X[-n_steps_in:]
        train_X,train_y = create_dataset(X,n_steps_in,n_steps_out)
        X_train, X_val, y_train, y_val = train_test_split(train_X, train_y, test_size=0.2, shuffle=True)
        # Parameter configuration
        input_seq_length = n_steps_in # input sequence length
        out_seq_length = n_steps_out
        input_dim = 1 # input feature dimension
        embed_dim = 128 # embedding dimension
        num_heads = 4 # number of attention heads
        ff_dim = 512 # hidden width of the feed-forward network
        num_blocks = 4 # number of Informer blocks
        # Create the model
        model = create_informer_model(input_seq_length,out_seq_length, input_dim, embed_dim, num_heads, ff_dim, num_blocks)
        model.compile(optimizer='adam', loss='mse')
        callbacks = [PrintCallback(print_every=print_every_n_epochs),EarlyStopping(monitor='val_loss', patience=25, restore_best_weights=True),]
        # NOTE(review): the arrays passed here have no trailing feature axis
        # while the model input is declared as (n_steps_in, input_dim);
        # presumably Keras adds the size-1 axis itself -- confirm.
        history = model.fit(X_train, y_train, epochs=150, batch_size=32,
                            #validation_split=0.2,
                            validation_data=(X_val, y_val), callbacks=callbacks,verbose=0)
        test_y = model.predict(x_test.reshape((-1,n_steps_in)))
        # NOTE(review): assumes each id has exactly n_steps_out rows in
        # test_df -- verify against the data.
        test_df.loc[test_df.id==id,'target'] = test_y[0]
        error = mean_squared_error(best_sub[best_sub['id']==id]['target'],test_y[0])
        error_df[id] = round(error,4)
        print(f'linear model {id} VS best sb ERROR = {error}')
    except Exception as e:
        error_id.append(id)
        print(f'error id = {id}',e)
        # NOTE(review): the original indentation was lost. `break` is placed
        # inside the handler (stop at the first failing id); it may instead
        # have sat at loop level to train a single id only -- confirm.
        break
        pass
# Print the model structure
model.summary()
import matplotlib.pyplot as plt
plot_loss(history,warehouse=id)
后来把位置编码去掉了。我觉得可以把周六、周日编码成特征,这个特征还是有价值的。