目录
1. 加载数据
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
# Read the dataset from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer='data.csv')
2.特征工程
(1)数据编码、清洗
# StandardScaler must be imported before it is used below
from sklearn.preprocessing import StandardScaler

# One-hot encode the colour and type columns
df_colors = pd.get_dummies(df['Color'], prefix='Color')
df_type = pd.get_dummies(df['Type'], prefix='Type')
df = pd.concat([df, df_colors, df_type], axis=1)
df = df.drop(['Brand', 'Color', 'Type'], axis=1)
# Data cleaning: drop rows that contain missing values
df = df.dropna()
# Feature engineering: (2024 - construction year) divided by the odometer reading.
# A zero odometer produces +/-inf (or NaN for 0/0), which StandardScaler
# rejects, so those rows are treated as missing and dropped as well.
year_mileage_ratio = (2024 - df['Construction Year']) / df['Odometer']
df['Year_Mileage_Ratio'] = year_mileage_ratio.replace([np.inf, -np.inf], np.nan)
df = df.dropna(subset=['Year_Mileage_Ratio'])
# Standardise the numerical columns (zero mean, unit variance)
scaler = StandardScaler()
numerical_features = ['Construction Year', 'Odometer', 'Ask Price', 'Year_Mileage_Ratio']
df[numerical_features] = scaler.fit_transform(df[numerical_features])
(2)创建特征热力图
# Configure matplotlib to use a font that can render Chinese text
plt.rcParams['font.sans-serif'] = ['SimHei'] # so that Chinese labels display correctly
plt.rcParams['axes.unicode_minus'] = False # so that minus signs display correctly
# Feature correlation heat map
plt.figure(figsize=(12, 10))
corr_matrix = df.corr()
# Relabel BOTH the rows and the columns with the Chinese names; renaming only
# the columns would leave the two heat-map axes showing different labels
# for the same feature.
chinese_labels = {'Ask Price': '要价', 'Construction Year': '建造年份', 'Odometer': '里程表', 'Year_Mileage_Ratio': '年份里程比'}
corr_matrix = corr_matrix.rename(index=chinese_labels, columns=chinese_labels)
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm', square=True)
plt.title('相关性热力图')
plt.show()
(3)特征两两结合,观察相关性
# Plot every pair of the selected features against each other to inspect
# their relationships. `height` replaces the old `size` keyword, which
# seaborn renamed in 0.9 and no longer accepts.
sns.pairplot(df[['Construction Year', 'Days Until MOT', 'Odometer', 'Ask Price']], height=2)
plt.show()
3.构建模型进行训练与评估
(1)构建模型并训练
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Load the data (same relative path as the first section of this script,
# instead of a machine-specific absolute drive path)
df = pd.read_csv('data.csv')
df_colors = pd.get_dummies(df['Color'], prefix='Color:')
df_type = pd.get_dummies(df['Type'].apply(str), prefix='Type:')
df = pd.concat([df, df_colors, df_type], axis=1)
df.drop(['Brand', 'Type', 'Color'], axis=1, inplace=True)
# Drop rows with missing values in the columns the model uses;
# KNeighborsRegressor does not accept NaN inputs.
df = df.dropna(subset=['Construction Year', 'Days Until MOT', 'Odometer', 'Ask Price'])
# Prepare the samples: the three features 'Construction Year',
# 'Days Until MOT' and 'Odometer' are used to train the model
X = df[['Construction Year', 'Days Until MOT', 'Odometer']]
# Target as a column vector of shape (n, 1)
y = df['Ask Price'].values.reshape(-1, 1)
# Create the training and test sets (30% held out, fixed random seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=41)
# Feature scaling -- standardisation, applied to the features X only.
# This is done exactly once: fitting a second scaler on already-standardised
# data would leave X_normalizer as a near-identity transform that can no
# longer map raw feature values.
X_normalizer = StandardScaler() # N(0,1)
X_train = X_normalizer.fit_transform(X_train) # fit on the training features and standardise them
X_test = X_normalizer.transform(X_test) # apply the transform fitted on the training data to the test data
from sklearn.metrics import mean_squared_error, r2_score

# Train the model
knn = KNeighborsRegressor(n_neighbors=2) # the best K could be selected by cross-validation
# y_train is a column vector (the target was reshaped to (-1, 1) earlier),
# so flatten it; the model then returns 1-D predictions.
knn.fit(X_train, np.ravel(y_train))
# Predict the prices
y_pred = knn.predict(X_test)
# Evaluate the model (optional)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'MSE: {mse}, R²: {r2}')
# Scatter the predictions against the true values (true values on the
# x axis so that errors are easier to see)
plt.scatter(y_test, y_pred, alpha=0.5)
# Dashed diagonal marking a perfect prediction, spanning the target range
price_range = [y.min(), y.max()]
plt.plot(price_range, price_range, 'k--', lw=4)
plt.xlabel('Real value')
plt.ylabel('Prediction')
plt.title('Predicted vs Real Ask Price')
plt.show()
(2)模型评估
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

# Estimator configuration as echoed by the notebook; this expression builds
# an unfitted estimator that is not used afterwards.
KNeighborsRegressor(algorithm='auto',leaf_size=30,metric='minkowski',metric_params=None, n_jobs=None, n_neighbors=2, p=2, weights='uniform')
pred = knn.predict(X_test)
# Only the features X were standardised in this script; the target was not,
# so no inverse transform is required. The "_inv" names are simply the
# predictions (flattened to 1-D) and the raw test targets.
y_pred_inv = np.ravel(pred)
y_test_inv = y_test
# scikit-learn metrics take (y_true, y_pred) in that order
mae = mean_absolute_error(y_test_inv, y_pred_inv)
mse = mean_squared_error(y_test_inv, y_pred_inv)
print(f'MAE: {mae}, MSE: {mse}')
print(y_pred_inv)
运行结果:
array([1199., 1199., 700., 899.])
# The target was never scaled in this script, so the raw test targets are
# the values to display.
print(y_test)
运行结果:
array([[1300.],
[1650.],
[ 650.],
[ 799.]])
标签:KNN,plt,简易,二手车,df,train,pd,test,import From: https://blog.csdn.net/weixin_49816293/article/details/139784494