一、导入库和数据集
代码环境:
主要的包版本如下
python==3.10
scikit-learn==1.0.2
tensorflow==2.15.0
导入库
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout
from keras.regularizers import l2
import matplotlib.pyplot as plt
import glob, os
import seaborn as sns
import sys
from sklearn.preprocessing import MinMaxScaler
import keras
from sklearn.metrics import mean_absolute_error as mae
读取数据集加载到pandas,并打印前5行和列
# Load the Rossmann store-sales training data (hard-coded local path; adjust as needed).
dataframe = pd.read_csv("D:/data/rossmann-stores-clustering-and-forecast/train.csv")
# Preview the first 5 rows.
dataframe.head()
打印列名
dataframe.columns
对StateHoliday字母进行转化
# StateHoliday letter codes -> numeric labels.
# Presumably a = public holiday, b = Easter, c = Christmas — TODO confirm
# against the Rossmann dataset description.
_STATE_HOLIDAY_CODES = {"a": 1, "b": 2, "c": 3}


def transform_state_holiday(x):
    """Encode a StateHoliday value as a number.

    'a' -> 1, 'b' -> 2, 'c' -> 3; any other value (e.g. 0 / '0',
    meaning no holiday) is returned unchanged.
    """
    # Single dict lookup replaces the original if/elif chain; the blog paste
    # had also stripped the function's indentation, which is restored here.
    return _STATE_HOLIDAY_CODES.get(x, x)
dataframe["StateHoliday"] = dataframe.apply(lambda x:transform_state_holiday(x.StateHoliday), axis=1)
二、数据预处理
获取一个门店的数据进行LSTM建模
data = dataframe[dataframe["Store"] == 1].sort_values(by="Date")
将数据归一化到0-1之间,无量纲化
# Scale all modelling columns to [0, 1]. Sales is listed first on purpose:
# it lands in column 0 of the scaled matrix, which the inverse-transform
# step at the end relies on.
scaler = MinMaxScaler(feature_range=(0,1))
column_list_scaler = ['Sales','DayOfWeek','Open','Promo','StateHoliday','SchoolHoliday']
scaled_data = scaler.fit_transform(data[column_list_scaler].values)
时间序列数据转化为监督问题数据
def series_to_supervised(data, n_in=1, n_out=1, dropnana=True):
    """Frame a (multivariate) time series as a supervised-learning table.

    Parameters
    ----------
    data : list or 2-D array-like
        Observations ordered in time. A plain list is treated as a
        single-variable series.
    n_in : int
        Number of lag steps (t-n_in ... t-1) used as input columns.
    n_out : int
        Number of future steps (t ... t+n_out-1) used as output columns.
    dropnana : bool
        If True, drop the boundary rows containing NaN introduced by
        shifting. (Parameter name kept as-is for backward compatibility.)

    Returns
    -------
    pandas.DataFrame
        Columns named 'varJ(t-i)' for inputs and 'varJ' / 'varJ(t+i)'
        for outputs.
    """
    n_vars = 1 if isinstance(data, list) else data.shape[1]
    df = pd.DataFrame(data)
    cols, names = [], []
    # Input sequence: t-n_in, ..., t-1.
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += ['var%d(t-%d)' % (j + 1, i) for j in range(n_vars)]
    # Forecast sequence: t, t+1, ..., t+n_out-1.
    for i in range(n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += ['var%d' % (j + 1) for j in range(n_vars)]
        else:
            names += ['var%d(t+%d)' % (j + 1, i) for j in range(n_vars)]
    # Put it all together. (Debug prints of the full frames removed.)
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    # Shifting leaves NaN at the series boundaries; drop those rows.
    if dropnana:
        agg.dropna(inplace=True)
    return agg
# Reframe the scaled series as a supervised problem: 1-step lag inputs, current-step outputs.
reframed = series_to_supervised(scaled_data, 1, 1)
删除无用的列
# Keep columns 0..6: the 6 lagged features var1(t-1)..var6(t-1) plus var1(t)
# (current-day Sales, the prediction target); drop the remaining (t) columns.
reframed = reframed.iloc[:, 0:len(column_list_scaler) + 1]
reframed.head()
三、数据建模
数据集划分,选取前550天数据作为训练集,中间250天数据作为验证集,其余全为测试集
# Chronological split: first 550 days train, next 250 validation, rest test.
train_days = 550
valid_days = 250
values = reframed.values
cut1, cut2 = train_days, train_days + valid_days
train, valid, test = values[:cut1], values[cut1:cut2], values[cut2:]
# Last column is the supervised target (current-day Sales); the rest are inputs.
train_X, train_Y = train[:, :-1], train[:, -1]
valid_X, valid_Y = valid[:, :-1], valid[:, -1]
test_X, test_Y = test[:, :-1], test[:, -1]
将数据集重构为符合LSTM要求的数据格式,即 [样本,时间步,特征]
# LSTM input must be 3-D: [samples, timesteps, features]; one timestep here.
def _to_lstm_shape(arr):
    # Insert a length-1 timestep axis between samples and features.
    return arr.reshape((arr.shape[0], 1, arr.shape[1]))

train_X = _to_lstm_shape(train_X)
valid_X = _to_lstm_shape(valid_X)
test_X = _to_lstm_shape(test_X)
print(train_X.shape, train_Y.shape, valid_X.shape, valid_Y.shape, test_X.shape, test_Y.shape)
建立模型并训练
# Single-layer LSTM regressor: 100 units -> dropout -> linear output.
model = Sequential()
# return_sequences=False so the layer emits one vector per sample. The
# original return_sequences=True made the model output (samples, 1, 1),
# which does not match the (samples,) targets and can silently distort the
# MSE loss through broadcasting.
model.add(LSTM(100, activation='relu',
               input_shape=(train_X.shape[1], train_X.shape[2]),
               return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(1, activation='linear'))
model.compile(loss='mean_squared_error', optimizer='adam')

# Fit the network; shuffle=False preserves chronological order.
# NOTE: result renamed from `LSTM` — the original name shadowed the
# imported keras LSTM layer class.
history = model.fit(train_X,
                    train_Y,
                    epochs=100,
                    batch_size=20,
                    validation_data=(valid_X, valid_Y),
                    verbose=2,
                    shuffle=False)

# Plot training vs. validation loss.
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='valid')
plt.legend()
plt.show()
预测test数据
test_predict = model.predict(test_X)
数据进行反归一化
# Inverse-transform back to the original Sales scale. The scaler expects all
# 6 columns, so the true/predicted Sales value is placed in column 0 and
# padded with the 5 lagged feature columns from test_X; only column 0 of the
# result is kept. NOTE(review): the padding columns come from t-1, not t —
# MinMax inverse is per-column, so column 0 (Sales) is still correct.
test_data_inverse = scaler.inverse_transform(np.concatenate(( test_Y.reshape(-1, 1), test_X.reshape(-1, len(column_list_scaler))[:, 1:]), axis=1))[:,0]
test_predict_inverse = scaler.inverse_transform(np.concatenate(( test_predict.reshape(-1, 1), test_X.reshape(-1, len(column_list_scaler))[:, 1:]), axis=1))[:,0]
print("mae : ", mae(test_data_inverse, test_predict_inverse))
四、数据来源和源码获取
训练数据来源于kaggle,读者可以去kaggle下载。
或者加小编微信获取数据和源码:
标签:--,需求预测,shape,train,test,import,LSTM,data,valid From: https://blog.csdn.net/u014460433/article/details/139659207