泰坦尼克号生还预测
导入模块
import pandas as pd # data loading and table manipulation
import matplotlib.pyplot as plt # plotting
from sklearn.tree import DecisionTreeClassifier # decision tree model
from sklearn.model_selection import train_test_split # split data into training and test sets
from sklearn.model_selection import GridSearchCV # grid search (runs cross-validation internally)
from sklearn.model_selection import cross_val_score # cross-validation scoring
读取数据表
# Load the passenger table from the CSV file next to the notebook.
data = pd.read_csv(filepath_or_buffer="./data/data.csv")
# Print per-column non-null counts and dtypes so missing values are visible.
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 PassengerId 891 non-null int64
1 Survived 891 non-null int64
2 Pclass 891 non-null int64
3 Name 891 non-null object
4 Sex 891 non-null object
5 Age 714 non-null float64
6 SibSp 891 non-null int64
7 Parch 891 non-null int64
8 Ticket 891 non-null object
9 Fare 891 non-null float64
10 Cabin 204 non-null object
11 Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
数据预处理
# Drop the Cabin, Name and Ticket columns; they are not used as features.
data = data.drop(columns=["Cabin", "Name", "Ticket"])
# Fill missing ages with the mean of the Age column.
data = data.fillna({"Age": data["Age"].mean()})
# Remove every remaining row that still contains a missing value.
data = data.dropna(axis=0, how="any")
data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 9 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 PassengerId 889 non-null int64
1 Survived 889 non-null int64
2 Pclass 889 non-null int64
3 Sex 889 non-null object
4 Age 889 non-null float64
5 SibSp 889 non-null int64
6 Parch 889 non-null int64
7 Fare 889 non-null float64
8 Embarked 889 non-null object
dtypes: float64(2), int64(5), object(2)
memory usage: 69.5+ KB
# Encode sex as an integer flag: 1 for "male", 0 for anything else.
data["Sex"] = data["Sex"].eq("male").astype("int")
# Collect the distinct Embarked values, then replace each value by its
# position in that list (first value seen -> 0, next -> 1, ...).
labels = data["Embarked"].unique().tolist()
data["Embarked"] = data["Embarked"].map(
    {port: code for code, port in enumerate(labels)}
)
# Feature matrix: every column except the target.
x = data.drop(columns="Survived")
# Target: the Survived column, kept as a single-column DataFrame.
y = data[["Survived"]]
划分测试集和训练集
# Split features and target into training and test sets, holding out 25%.
# A fixed random_state makes the split -- and every score computed from it --
# repeatable between runs, matching the seeded classifier used later.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=30
)
# Renumber each subset from 0; the shuffled split keeps the original row labels.
for subset in [x_train, x_test, y_train, y_test]:
    subset.index = range(subset.shape[0])
训练决策树模型
# Build a seeded decision tree and fit it on the training split
# (fit returns the estimator itself, so the call can be chained).
clf = DecisionTreeClassifier(random_state=30).fit(x_train, y_train)
# Score the fitted tree on the held-out test split.
score_ = clf.score(X=x_test, y=y_test)
score_
0.7713004484304933
交叉验证
# Run 10-fold cross-validation of the tree over the full feature/target tables;
# one score is returned per fold.
corss = cross_val_score(estimator=clf, X=x, y=y, cv=10)
# Average the per-fold scores into a single figure.
score = corss.mean()
score
0.7514683350357507
展示树最大深度的拟合曲线
# Score on the training split for each max_depth.
train = []
# Mean 10-fold cross-validation score for each max_depth.
corss = []
# Score on the held-out test split for each max_depth.
test = []
# Tree depths to evaluate: 1 through 10.
depths = range(1, 11)
for depth in depths:
    clf = DecisionTreeClassifier(
        criterion="gini",
        random_state=30,
        max_depth=depth,
    )
    clf.fit(x_train, y_train)
    score_train = clf.score(x_train, y_train)
    score_test = clf.score(x_test, y_test)
    train.append(score_train)
    test.append(score_test)
    # Cross-validation refits clones of clf on folds of the full data set.
    score_corss = cross_val_score(clf, x, y, cv=10).mean()
    corss.append(score_corss)
print("训练集中最好得分:{}".format(max(train)))
print("交叉验证中最好得分:{}".format(max(corss)))
print("测试集中最好得分:{}".format(max(test)))
plt.figure()
plt.plot(depths, train, color="blue", label="score_train")
plt.plot(depths, corss, color="red", label="score_corss")
plt.plot(depths, test, color="black", label="score_test")
# Put a tick at every evaluated depth. The previous code passed the range to
# xlabel, which only set the axis title to the text "range(1, 11)".
plt.xticks(depths)
plt.xlabel("max_depth")
plt.legend()
plt.show()
训练集中最好得分:0.9504504504504504
交叉验证中最好得分:0.8143896833503576
测试集中最好得分:0.8295964125560538
网格搜索
import numpy as np

# 20 evenly spaced floats from 0 to 0.5 inclusive.
# NOTE(review): not referenced by the parameter grid below.
gini_thresholds = np.linspace(start=0, stop=0.5, num=20)
# Hyper-parameter grid: every combination of these values is evaluated.
parameters = {
    "splitter": ("best", "random"),
    "criterion": ("gini", "entropy"),
    "max_depth": list(range(1, 10)),
    "min_samples_leaf": list(range(1, 50, 5)),
}
# Base estimator; the grid search supplies the remaining hyper-parameters.
clf = DecisionTreeClassifier(random_state=25)
# Exhaustive search over the grid, scoring each candidate with 10-fold CV
# on the training split.
search = GridSearchCV(estimator=clf, param_grid=parameters, cv=10)
search.fit(x_train, y_train)
GridSearchCV(cv=10, estimator=DecisionTreeClassifier(random_state=25),
param_grid={'criterion': ('gini', 'entropy'),
'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9],
'min_samples_leaf': [1, 6, 11, 16, 21, 26, 31, 36, 41,
46],
'splitter': ('best', 'random')})
# Estimator configured with the best hyper-parameter combination found by the search
search.best_estimator_
DecisionTreeClassifier(max_depth=7, min_samples_leaf=6, random_state=25,
splitter='random')
# Mean cross-validated score achieved by the best hyper-parameter combination
search.best_score_
0.8227951153324288
标签:泰坦尼克号,non,预测,生还,train,test,score,null,data
From: https://www.cnblogs.com/thankcat/p/17283609.html