
Linear Regression


1. Setup

Import the modules.

# general
import io

# data
import numpy as np
import pandas as pd

# machine learning
import keras

# data visualization
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import seaborn as sns

# build_model is defined in the author's companion cnblogs post "Keras备忘录"
from Keras备忘录 import build_model
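
For readers who don't have that companion module at hand, here is a minimal sketch of build_model, assuming it matches the Google crash-course version that the rest of this post follows (a single Dense unit compiled with RMSprop, MSE loss, and an RMSE metric); the companion post may differ in detail.

def build_model(learning_rate, num_features):
  """Sketch: create and compile a single-unit linear regression model."""
  # One Dense unit computes label = w1*x1 + ... + wn*xn + b.
  inputs = keras.Input(shape=(num_features,))
  outputs = keras.layers.Dense(units=1)(inputs)
  model = keras.Model(inputs=inputs, outputs=outputs)

  # Minimize MSE and report RMSE so that train_model below can read
  # history.history["root_mean_squared_error"].
  model.compile(optimizer=keras.optimizers.RMSprop(learning_rate=learning_rate),
                loss="mean_squared_error",
                metrics=[keras.metrics.RootMeanSquaredError()])
  return model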

Load the dataset.

chicago_taxi_dataset = pd.read_csv("https://download.mlcc.google.com/mledu-datasets/chicago_taxi_train.csv")

Read the dataset, keeping only the columns used in this post.

# Updates dataframe to use specific columns.
training_df = chicago_taxi_dataset[['TRIP_MILES', 'TRIP_SECONDS', 'FARE', 'COMPANY', 'PAYMENT_TYPE', 'TIP_RATE']].copy()

print('Read dataset completed successfully.')
print('Total number of rows: {0}\n\n'.format(len(training_df.index)))
training_df.head(200)
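
As a quick sanity check, it can also help to inspect summary statistics of the selected columns before training (an optional step, not required by the rest of the post):

# Optional: ranges, means, and missing-value counts for each column.
training_df.describe(include='all')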

2. Training, prediction, and visualizing the results

Define the plotting functions. (Note: with a single input feature the model is drawn as a line; with two input features, plot_model draws the fitted plane by evaluating it on a mesh grid over both features.)

Plotting functions
def make_plots(df, feature_names, label_name, model_output, sample_size=200):

  random_sample = df.sample(n=sample_size).copy()
  random_sample = random_sample.reset_index(drop=True)
  weights, bias, epochs, rmse = model_output

  is_2d_plot = len(feature_names) == 1
  model_plot_type = "scatter" if is_2d_plot else "surface"
  fig = make_subplots(rows=1, cols=2,
                      subplot_titles=("Loss Curve", "Model Plot"),
                      specs=[[{"type": "scatter"}, {"type": model_plot_type}]])

  plot_data(random_sample, feature_names, label_name, fig)
  plot_model(random_sample, feature_names, weights, bias, fig)
  plot_loss_curve(epochs, rmse, fig)

  fig.show()
  return

def plot_loss_curve(epochs, rmse, fig):
  curve = px.line(x=epochs, y=rmse)
  curve.update_traces(line_color='#ff0000', line_width=3)

  fig.append_trace(curve.data[0], row=1, col=1)
  fig.update_xaxes(title_text="Epoch", row=1, col=1)
  fig.update_yaxes(title_text="Root Mean Squared Error", row=1, col=1, range=[rmse.min()*0.8, rmse.max()])

  return

def plot_data(df, features, label, fig):
  if len(features) == 1:
    scatter = px.scatter(df, x=features[0], y=label)
  else:
    scatter = px.scatter_3d(df, x=features[0], y=features[1], z=label)

  fig.append_trace(scatter.data[0], row=1, col=2)
  if len(features) == 1:
    fig.update_xaxes(title_text=features[0], row=1, col=2)
    fig.update_yaxes(title_text=label, row=1, col=2)
  else:
    fig.update_layout(scene1=dict(xaxis_title=features[0], yaxis_title=features[1], zaxis_title=label))

  return

def plot_model(df, features, weights, bias, fig):
  df['FARE_PREDICTED'] = bias[0]

  for index, feature in enumerate(features):
    df['FARE_PREDICTED'] = df['FARE_PREDICTED'] + weights[index][0] * df[feature]

  if len(features) == 1:
    model = px.line(df, x=features[0], y='FARE_PREDICTED')
    model.update_traces(line_color='#ff0000', line_width=3)
  else:
    # Draw the fitted plane z = b + w0*x + w1*y by evaluating the model
    # on a mesh grid spanning the observed range of both features.
    x_name, y_name = features[0], features[1]
    x = np.linspace(df[x_name].min(), df[x_name].max(), 20)
    y = np.linspace(df[y_name].min(), df[y_name].max(), 20)
    xx, yy = np.meshgrid(x, y)
    zz = bias[0] + weights[0][0] * xx + weights[1][0] * yy

    light_yellow = [[0, '#89CFF0'], [1, '#FFDB58']]
    model = go.Figure(data=go.Surface(x=x, y=y, z=zz, colorscale=light_yellow))

  fig.add_trace(model.data[0], row=1, col=2)

  return

def model_info(feature_names, label_name, model_output):
  weights = model_output[0]
  bias = model_output[1]

  nl = "\n"
  header = "-" * 80
  banner = header + nl + "|" + "MODEL INFO".center(78) + "|" + nl + header

  info = ""
  equation = label_name + " = "

  for index, feature in enumerate(feature_names):
    info = info + "Weight for feature[{}]: {:.3f}\n".format(feature, weights[index][0])
    equation = equation + "{:.3f} * {} + ".format(weights[index][0], feature)

  info = info + "Bias: {:.3f}\n".format(bias[0])
  equation = equation + "{:.3f}\n".format(bias[0])

  return banner + nl + info + nl + equation

print("SUCCESS: defining plotting functions complete.")

Define the training function.

Training
def train_model(model, df, features, label, epochs, batch_size):
  """Train the model by feeding it data."""

  # Feed the model the features and the label.
  # The model will train for the specified number of epochs.
  history = model.fit(x=features,
                      y=label,
                      batch_size=batch_size,
                      epochs=epochs)

  # Gather the trained model's weight and bias.
  trained_weight = model.get_weights()[0]
  trained_bias = model.get_weights()[1]

  # The list of epochs is stored separately from the rest of history.
  epochs = history.epoch

  # Isolate the error for each epoch.
  hist = pd.DataFrame(history.history)

  # To track the progression of training, we're going to take a snapshot
  # of the model's root mean squared error at each epoch.
  rmse = hist["root_mean_squared_error"]

  return trained_weight, trained_bias, epochs, rmse

Define the experiment function, which can also serve directly as the main routine.

Experiment
def run_experiment(df, feature_names, label_name, learning_rate, epochs, batch_size):

  print('INFO: starting training experiment with features={} and label={}\n'.format(feature_names, label_name))

  num_features = len(feature_names)

  features = df.loc[:, feature_names].values
  label = df[label_name].values

  model = build_model(learning_rate, num_features)
  model_output = train_model(model, df, features, label, epochs, batch_size)

  print('\nSUCCESS: training experiment complete\n')
  print('{}'.format(model_info(feature_names, label_name, model_output)))
  make_plots(df, feature_names, label_name, model_output)

  return model

3. Running the experiments

Start with a single feature:

Single feature
# The following variables are the hyperparameters.
learning_rate = 0.001
epochs = 10
batch_size = 50

# Specify the feature and the label.
features = ['TRIP_MILES']
label = 'FARE'

model_1 = run_experiment(training_df, features, label, learning_rate, epochs, batch_size)

At this point, try changing the learning rate to 1 or to a tenth of its original value, or making the batch size ten times larger, and observe how these hyperparameter adjustments affect training.
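
For instance, one such rerun might look like the following (the values and the name model_1b are illustrative, not from the original experiment):

# Illustrative: a tenth of the original learning rate and ten times the batch size.
learning_rate = 0.0001
batch_size = 500

model_1b = run_experiment(training_df, features, label, learning_rate, epochs, batch_size)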

Next, try using two features, 'TRIP_MILES' and 'TRIP_MINUTES' (note: when two features are used, they should ideally be on the same order of magnitude, so here the dataset's original 'TRIP_SECONDS' feature is converted to minutes and used as a new feature for training). Only the following parts of the program above need to change:

Modified part
training_df['TRIP_MINUTES'] = training_df['TRIP_SECONDS'] / 60

features = ['TRIP_MILES', 'TRIP_MINUTES']

# Rerun the experiment with both features; the prediction step below refers to this model as model_2.
model_2 = run_experiment(training_df, features, label, learning_rate, epochs, batch_size)

After training, make predictions on a random sample of the training set (not standard practice; done here only to walk through the model's prediction step). First, define the prediction functions:

Prediction functions
def format_currency(x):
  return "${:.2f}".format(x)

def build_batch(df, batch_size):
  batch = df.sample(n=batch_size).copy()
  batch.set_index(np.arange(batch_size), inplace=True)
  return batch

def predict_fare(model, df, features, label, batch_size=50):
  batch = build_batch(df, batch_size)
  predicted_values = model.predict_on_batch(x=batch.loc[:, features].values)

  data = {"PREDICTED_FARE": [], "OBSERVED_FARE": [], "L1_LOSS": [],
          features[0]: [], features[1]: []}
  for i in range(batch_size):
    predicted = predicted_values[i][0]
    observed = batch.at[i, label]
    data["PREDICTED_FARE"].append(format_currency(predicted))
    data["OBSERVED_FARE"].append(format_currency(observed))
    data["L1_LOSS"].append(format_currency(abs(observed - predicted)))
    data[features[0]].append(batch.at[i, features[0]])
    data[features[1]].append("{:.2f}".format(batch.at[i, features[1]]))

  output_df = pd.DataFrame(data)
  return output_df

def show_predictions(output):
  header = "-" * 80
  banner = header + "\n" + "|" + "PREDICTIONS".center(78) + "|" + "\n" + header
  print(banner)
  print(output)
  return

Then make the predictions:

output = predict_fare(model_2, training_df, features, label)
show_predictions(output)

4. Acknowledgements

Google Developers.

 

