1. Setup
Import the modules.
# general
import io
# data
import numpy as np
import pandas as pd
# machine learning
import keras
# data visualization
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import seaborn as sns
# build_model comes from my earlier cnblogs post ("Keras备忘录")
from Keras备忘录 import build_model
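build_model is defined in that earlier post. For readers who do not have it, here is a minimal sketch, assuming it builds the same single-output linear model as the Google exercise (one Dense unit, RMSprop optimizer, MSE loss, RMSE metric); the definition in the earlier post may differ.

def build_model(my_learning_rate, num_features):
    """Sketch: create and compile a one-unit linear regression model."""
    inputs = keras.Input(shape=(num_features,))
    outputs = keras.layers.Dense(units=1)(inputs)
    model = keras.Model(inputs=inputs, outputs=outputs)
    # MSE loss; RMSE is logged as the "root_mean_squared_error" metric,
    # which train_model below reads from the training history.
    model.compile(optimizer=keras.optimizers.RMSprop(learning_rate=my_learning_rate),
                  loss="mean_squared_error",
                  metrics=[keras.metrics.RootMeanSquaredError()])
    return model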
Load the dataset.
chicago_taxi_dataset = pd.read_csv("https://download.mlcc.google.com/mledu-datasets/chicago_taxi_train.csv")
Read the dataset
# Updates dataframe to use specific columns.
training_df = chicago_taxi_dataset[['TRIP_MILES', 'TRIP_SECONDS', 'FARE', 'COMPANY', 'PAYMENT_TYPE', 'TIP_RATE']]
print('Read dataset completed successfully.')
print('Total number of rows: {0}\n\n'.format(len(training_df.index)))
training_df.head(200)
2. Training, prediction, and visualizing the results
Define the plotting functions. (Note: the way plot_model draws the fit when there are two input features appears to be wrong; readers who can point out the fix are welcome to do so. One possible correction is sketched after these functions.)
Plotting functions
def make_plots(df, feature_names, label_name, model_output, sample_size=200):
    random_sample = df.sample(n=sample_size).copy()
    random_sample.reset_index()
    weights, bias, epochs, rmse = model_output

    is_2d_plot = len(feature_names) == 1
    model_plot_type = "scatter" if is_2d_plot else "surface"
    fig = make_subplots(rows=1, cols=2,
                        subplot_titles=("Loss Curve", "Model Plot"),
                        specs=[[{"type": "scatter"}, {"type": model_plot_type}]])

    plot_data(random_sample, feature_names, label_name, fig)
    plot_model(random_sample, feature_names, weights, bias, fig)
    plot_loss_curve(epochs, rmse, fig)

    fig.show()
    return


def plot_loss_curve(epochs, rmse, fig):
    curve = px.line(x=epochs, y=rmse)
    curve.update_traces(line_color='#ff0000', line_width=3)

    fig.append_trace(curve.data[0], row=1, col=1)
    fig.update_xaxes(title_text="Epoch", row=1, col=1)
    fig.update_yaxes(title_text="Root Mean Squared Error", row=1, col=1,
                     range=[rmse.min() * 0.8, rmse.max()])
    return


def plot_data(df, features, label, fig):
    if len(features) == 1:
        scatter = px.scatter(df, x=features[0], y=label)
    else:
        scatter = px.scatter_3d(df, x=features[0], y=features[1], z=label)

    fig.append_trace(scatter.data[0], row=1, col=2)
    if len(features) == 1:
        fig.update_xaxes(title_text=features[0], row=1, col=2)
        fig.update_yaxes(title_text=label, row=1, col=2)
    else:
        fig.update_layout(scene1=dict(xaxis_title=features[0],
                                      yaxis_title=features[1],
                                      zaxis_title=label))
    return


def plot_model(df, features, weights, bias, fig):
    df['FARE_PREDICTED'] = bias[0]

    for index, feature in enumerate(features):
        df['FARE_PREDICTED'] = df['FARE_PREDICTED'] + weights[index][0] * df[feature]

    if len(features) == 1:
        model = px.line(df, x=features[0], y='FARE_PREDICTED')
        model.update_traces(line_color='#ff0000', line_width=3)
    else:
        z_name, y_name = "FARE_PREDICTED", features[1]
        z = [df[z_name].min(), (df[z_name].max() - df[z_name].min()) / 2, df[z_name].max()]
        y = [df[y_name].min(), (df[y_name].max() - df[y_name].min()) / 2, df[y_name].max()]
        x = []
        for i in range(len(y)):
            x.append((z[i] - weights[1][0] * y[i] - bias[0]) / weights[0][0])

        plane = pd.DataFrame({'x': x, 'y': y, 'z': [z] * 3})

        light_yellow = [[0, '#89CFF0'], [1, '#FFDB58']]
        model = go.Figure(data=go.Surface(x=plane['x'], y=plane['y'], z=plane['z'],
                                          colorscale=light_yellow))

    fig.add_trace(model.data[0], row=1, col=2)
    return


def model_info(feature_names, label_name, model_output):
    weights = model_output[0]
    bias = model_output[1]

    nl = "\n"
    header = "-" * 80
    banner = header + nl + "|" + "MODEL INFO".center(78) + "|" + nl + header

    info = ""
    equation = label_name + " = "

    for index, feature in enumerate(feature_names):
        info = info + "Weight for feature[{}]: {:.3f}\n".format(feature, weights[index][0])
        equation = equation + "{:.3f} * {} + ".format(weights[index][0], feature)

    info = info + "Bias: {:.3f}\n".format(bias[0])
    equation = equation + "{:.3f}\n".format(bias[0])

    return banner + nl + info + nl + equation
print("SUCCESS: defining plotting functions complete.")
Define the training function.
Training
def train_model(model, df, features, label, epochs, batch_size):
    """Train the model by feeding it data."""

    # Feed the model the features and the label.
    # The model will train for the specified number of epochs.
    history = model.fit(x=features,
                        y=label,
                        batch_size=batch_size,
                        epochs=epochs)

    # Gather the trained model's weight and bias.
    trained_weight = model.get_weights()[0]
    trained_bias = model.get_weights()[1]

    # The list of epochs is stored separately from the rest of history.
    epochs = history.epoch

    # Isolate the error for each epoch.
    hist = pd.DataFrame(history.history)

    # To track the progression of training, take a snapshot of the model's
    # root mean squared error at each epoch.
    rmse = hist["root_mean_squared_error"]

    return trained_weight, trained_bias, epochs, rmse
Define the experiment function, which can also serve directly as the main function.
Experiment
def run_experiment(df, feature_names, label_name, learning_rate, epochs, batch_size):
    print('INFO: starting training experiment with features={} and label={}\n'.format(feature_names, label_name))

    num_features = len(feature_names)
    features = df.loc[:, feature_names].values
    label = df[label_name].values

    model = build_model(learning_rate, num_features)
    model_output = train_model(model, df, features, label, epochs, batch_size)

    print('\nSUCCESS: training experiment complete\n')
    print('{}'.format(model_info(feature_names, label_name, model_output)))
    make_plots(df, feature_names, label_name, model_output)

    return model
3. Running the experiments
Start with a single feature:
Single feature
# The following variables are the hyperparameters.
learning_rate = 0.001
epochs = 10
batch_size = 50
# Specify the feature and the label.
features = ['TRIP_MILES']
label = 'FARE'
model_1 = run_experiment(training_df, features, label, learning_rate, epochs, batch_size)
At this point you can change the learning rate to 1, or to a tenth of its original value, or increase the batch size to ten times the original, to see how adjusting the hyperparameters affects training. For example:
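These calls are only illustrative; the model_1_* variable names are mine, not from the original post.

# Hypothetical variations, keeping everything else fixed:
model_1_high_lr = run_experiment(training_df, features, label, 1.0, epochs, batch_size)
model_1_low_lr = run_experiment(training_df, features, label, 0.0001, epochs, batch_size)
model_1_big_batch = run_experiment(training_df, features, label, learning_rate, epochs, 500)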
Next, try two features, 'TRIP_MILES' and 'TRIP_MINUTES'. (Note: when two features are used they should ideally be on the same order of magnitude, so here the dataset's original 'TRIP_SECONDS' feature is converted to minutes and used as the new feature.) Only part of the program above needs to change:
Modified part
training_df['TRIP_MINUTES'] = training_df['TRIP_SECONDS']/60
features = ['TRIP_MILES', 'TRIP_MINUTES']
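Then re-run the experiment with the two features to obtain the second model. The original post does not show this call; the variable name model_2 is taken from its use in the prediction step below.

model_2 = run_experiment(training_df, features, label, learning_rate, epochs, batch_size)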
After training, make predictions on a random subset of the training set (not standard practice; it is used here only to walk through the prediction step). First, define the prediction functions.
Prediction functions
def format_currency(x):
    return "${:.2f}".format(x)


def build_batch(df, batch_size):
    batch = df.sample(n=batch_size).copy()
    batch.set_index(np.arange(batch_size), inplace=True)
    return batch


def predict_fare(model, df, features, label, batch_size=50):
    batch = build_batch(df, batch_size)
    predicted_values = model.predict_on_batch(x=batch.loc[:, features].values)

    data = {"PREDICTED_FARE": [], "OBSERVED_FARE": [], "L1_LOSS": [],
            features[0]: [], features[1]: []}
    for i in range(batch_size):
        predicted = predicted_values[i][0]
        observed = batch.at[i, label]
        data["PREDICTED_FARE"].append(format_currency(predicted))
        data["OBSERVED_FARE"].append(format_currency(observed))
        data["L1_LOSS"].append(format_currency(abs(observed - predicted)))
        data[features[0]].append(batch.at[i, features[0]])
        data[features[1]].append("{:.2f}".format(batch.at[i, features[1]]))

    output_df = pd.DataFrame(data)
    return output_df


def show_predictions(output):
    header = "-" * 80
    banner = header + "\n" + "|" + "PREDICTIONS".center(78) + "|" + "\n" + header
    print(banner)
    print(output)
    return
Then make the predictions:
output = predict_fare(model_2, training_df, features, label)
show_predictions(output)
4. Acknowledgements
Google Developers.