# Missing-value handling: fill the gaps in a small iris-like sample with the
# mean of each column.
from io import StringIO

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from matplotlib.font_manager import FontProperties
from sklearn import datasets
from sklearn.impute import SimpleImputer

# Font handle for plot text; not used by the code in this section.
font = FontProperties(fname='/Library/Fonts/Heiti.ttf')

# Nine rows of four comma-separated fields; three fields are left empty on
# purpose so that read_csv parses them as NaN.
iris_data = """
4.7,,1.3,0.2
4.6,3.1,1.5,0.2
5.,3.6,1.4,0.2
5.4,3.9,1.7,0.4
4.6,3.4,,0.3
5.,3.4,1.5,0.2
4.4,2.9,1.4,0.2
4.9,3.1,1.5,0.1
5.4,3.7,1.5,
"""

# The iris dataset is loaded only to borrow its feature names as column labels.
iris = datasets.load_iris()
df = pd.read_csv(StringIO(iris_data), header=None)
df.columns = iris.feature_names
# Keep the first four columns (the sample already has exactly four fields per row).
df = df.iloc[:, :4]
print(df)

# Replace every NaN with the mean of its column. The estimator keeps the name
# `imputer`; the transformed array gets its own name instead of overwriting it.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputed_values = imputer.fit_transform(df.values)
df = pd.DataFrame(imputed_values, columns=iris.feature_names)
print(df)
# Standardization
# Min-max scaling
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# reshape(-1, 1) turns the five values into a single-column 2-D float array,
# i.e. five samples of one feature.
test_data = np.array([1, 2, 3, 4, 5]).reshape(-1, 1).astype(float)
min_max_scaler = MinMaxScaler()
# Only fit() is called here; no data is transformed in this section.
min_max_scaler.fit(test_data)
# Boston house-price regression: load the data set and split it into features
# and target.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from matplotlib.font_manager import FontProperties

# Font handle for plot text; not used by the code in this section.
font = FontProperties(fname='/Library/Fonts/Heiti.ttf')

# NumPy print settings: three decimals, no scientific notation.
np.set_printoptions(precision=3, suppress=True)

# NOTE(review): the Boston data set was removed from scikit-learn in 1.2, so
# this path on the `main` branch may no longer resolve -- confirm, and pin a
# release tag if it does not.
url = "https://raw.githubusercontent.com/scikit-learn/scikit-learn/main/sklearn/datasets/data/boston_house_prices.csv"

# The file starts with a line of sample/feature counts followed by the line of
# column names. header=1 uses the second line as the header; with the default
# header the names line lands in the data and every column is parsed as
# strings, leaving x and y as object arrays.
boston = pd.read_csv(url, header=1)
boston = boston.values

# Last column is the target; everything before it is a feature.
x = boston[:, :-1]
y = boston[:, -1]
print(x[:5])
print(y[:5])
# Train/test split and min-max scaling
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Shuffled 70/30 split with a fixed seed so the partition is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=1, shuffle=True
)
print('训练集长度:{}'.format(len(y_train)), '测试集长度:{}'.format(len(y_test)))

# The scaler is fitted on the training features only and then applied to both
# partitions, so the test set does not influence the scaling parameters.
scaler = MinMaxScaler()
scaler = scaler.fit(x_train)
x_train, x_test = scaler.transform(x_train), scaler.transform(x_test)
print('标准化后训练数据:\n{}'.format(x_train[:5]))
print('标准化后测试数据:\n{}'.format(x_test[:5]))
# Lasso regression: fit on the scaled training split and report R^2 on the
# test split.
from sklearn.linear_model import Lasso

reg = Lasso()
reg = reg.fit(x_train, y_train)
y_pred = reg.predict(x_test)
# score() returns the coefficient of determination (R^2) on the given data.
print('lasso回归R2分数:{}'.format(reg.score(x_test, y_test)))
# Elastic-net regression: fit on the scaled training split and report R^2 on
# the test split.
from sklearn.linear_model import ElasticNet

reg = ElasticNet()
reg = reg.fit(x_train, y_train)
y_pred = reg.predict(x_test)
# score() returns the coefficient of determination (R^2) on the given data.
print('弹性网络回归R2分数:{}'.format(reg.score(x_test, y_test)))
# Ridge regression: fit on the scaled training split and report R^2 on the
# test split.
from sklearn.linear_model import Ridge

reg = Ridge()
reg = reg.fit(x_train, y_train)
y_pred = reg.predict(x_test)
# score() returns the coefficient of determination (R^2) on the given data.
print('岭回归R2分数:{}'.format(reg.score(x_test, y_test)))
# Linear support-vector regression: fit on the scaled training split and
# report R^2 on the test split.
from sklearn.svm import LinearSVR

# C and max_iter are set explicitly rather than left at the library defaults.
reg = LinearSVR(C=100, max_iter=10000)
reg = reg.fit(x_train, y_train)
y_pred = reg.predict(x_test)
# score() returns the coefficient of determination (R^2) on the given data.
print('线性支持向量回归R2分数:{}'.format(reg.score(x_test, y_test)))
# Kernel support-vector regression (RBF kernel): fit on the scaled training
# split and report R^2 on the test split.
from sklearn.svm import SVR

reg = SVR(C=100, gamma='auto', max_iter=10000, kernel='rbf')
reg = reg.fit(x_train, y_train)
y_pred = reg.predict(x_test)
# score() returns the coefficient of determination (R^2) on the given data.
print('核支持向量回归R2分数:{}'.format(reg.score(x_test, y_test)))
# Decision-tree regression: fit on the scaled training split and report R^2
# on the test split.
from sklearn.tree import DecisionTreeRegressor

# No random_state is passed, so the score may differ from run to run.
reg = DecisionTreeRegressor()
reg = reg.fit(x_train, y_train)
y_pred = reg.predict(x_test)
# score() returns the coefficient of determination (R^2) on the given data.
print('决策树回归R2分数:{}'.format(reg.score(x_test, y_test)))
# Random-forest regression: fit on the scaled training split and report R^2
# on the test split.
from sklearn.ensemble import RandomForestRegressor

# No random_state is passed, so the score may differ from run to run.
reg = RandomForestRegressor()
reg = reg.fit(x_train, y_train)
y_pred = reg.predict(x_test)
# score() returns the coefficient of determination (R^2) on the given data.
print('随机森林回归R2分数:{}'.format(reg.score(x_test, y_test)))