Kaggle Beginner Competition: Titanic
import pandas as pd
import matplotlib.pyplot as plt
train_data = pd.read_csv("data/train.csv")
test_data = pd.read_csv("data/test.csv")
Data Cleaning
# Check for missing values
# Three columns turn out to have gaps
train_data.isnull().sum()
PassengerId 0
Survived 0
Pclass 0
Name 0
Sex 0
Age 177
SibSp 0
Parch 0
Ticket 0
Fare 0
Cabin 687
Embarked 2
dtype: int64
# Handle Age first: fill missing values with the rounded mean
train_data['Age'] = train_data['Age'].fillna(round(train_data['Age'].mean()))
# Cabin has too many missing values, so drop the column entirely
train_data = train_data.drop(columns='Cabin')
# For non-numeric columns with only a few gaps, fill with the mode
train_data['Embarked'] = train_data['Embarked'].fillna(train_data['Embarked'].mode()[0])
train_data.isnull().sum()
PassengerId 0
Survived 0
Pclass 0
Name 0
Sex 0
Age 0
SibSp 0
Parch 0
Ticket 0
Fare 0
Embarked 0
dtype: int64
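Before mirroring these steps on the test set, it is worth checking its gaps as well (my addition; in the standard Kaggle test.csv, Age has 86 missing values, Fare 1, and Cabin 327):

# Inspect missing values in the test set; Age, Fare and Cabin have gaps here
test_data.isnull().sum()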
# Apply the same treatment to the test set
test_data['Age'] = test_data['Age'].fillna(round(test_data['Age'].mean()))
test_data['Fare'] = test_data['Fare'].fillna(test_data['Fare'].mode()[0])
test_data = test_data.drop(columns='Cabin')
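A note on the imputation choice: filling the test set with its own statistics works here, but the more conventional approach is to reuse the training-set statistics, and for a skewed column like Fare the median is more robust than the mode. A minimal sketch of that variant (my suggestion, not what the original run used):

# Variant: fill test-set gaps with statistics computed on the training set;
# the median is less sensitive to Fare's outliers than the mode
test_data['Age'] = test_data['Age'].fillna(round(train_data['Age'].mean()))
test_data['Fare'] = test_data['Fare'].fillna(train_data['Fare'].median())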
# PassengerId is just a primary key with no hidden meaning, so it is not used as a training feature
# Ticket and Name only contain arbitrary numbers or booking-agency strings, so they are excluded as well
train_data.head(5)
| | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Embarked |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | S |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | S |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | S |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | S |
Data Exploration
Now we examine how the remaining attributes (those not excluded above) relate to the survival rate. I use visualization for this.
# A helper that visualizes one attribute against survival: the left panel shows raw counts, the right panel the survival rate
def showSurvivalUnderAttr(data, attribute):
    plt.figure(figsize=(10, 5))
    survive = data.loc[data['Survived'] == 1, attribute].value_counts()
    die = data.loc[data['Survived'] == 0, attribute].value_counts()
    # Left panel: absolute counts of survivors vs. casualties per attribute value
    df_survival = pd.DataFrame({'Survive': survive, 'Die': die})
    plot1 = df_survival.plot(y=['Survive', 'Die'], kind='bar', ax=plt.subplot(121))
    plot1.set_xlabel(attribute)
    plot1.set_ylabel('Population')
    # Right panel: survival rate = survivors / total per attribute value
    population = data[attribute].value_counts()
    df_survive_rate = survive / population
    print('survival rate:')
    print(df_survive_rate)
    plot2 = df_survive_rate.plot(kind='bar', ax=plt.subplot(122), color=['g', 'b', 'r'], ylim=(0, 1))
    plot2.set_xlabel(attribute)
    plot2.set_ylabel('survival rate')
    plt.show()
# Pclass: passenger class
showSurvivalUnderAttr(train_data, 'Pclass')
survival rate:
1 0.629630
2 0.472826
3 0.242363
Name: Pclass, dtype: float64
# Sex
showSurvivalUnderAttr(train_data, 'Sex')
survival rate:
female 0.742038
male 0.188908
Name: Sex, dtype: float64
Unlike the attributes above, Age is spread over a wide range, so we first plot a histogram to look at its distribution.
# Age
# Plot a 10-bin histogram of the age distribution
plt.hist(train_data['Age'], bins=10)
plt.show()
The population is concentrated around ages 20-40 and thins out toward both sides, so we bin the ages and then look at survival rates per bin.
The function below splits Age into five life stages and adds the result to the dataset as a new attribute, Age_cut.
# Bin Age and inspect the relationship between age group and survival rate
def processAge(data):
    # Bin edges: 0-5, 5-20, 20-30, 30-50, 50-max
    bins = [0, 5, 20, 30, 50, int(max(data['Age']))]
    labels = ['Baby', 'Teenager', 'Midlife', 'Prime', 'Older']
    data['Age_cut'] = pd.cut(data['Age'], bins=bins, labels=labels)
    return data
train_data = processAge(train_data)
showSurvivalUnderAttr(train_data, 'Age_cut')
survival rate:
Baby 0.704545
Teenager 0.377778
Midlife 0.334152
Prime 0.423237
Older 0.343750
Name: Age_cut, dtype: float64
The binned results are shown above: survival rates differ considerably between age groups.
Parch and SibSp count the parents/children and siblings/spouses on board, respectively; both boil down to family members. I merge them into a single new attribute, Family, and split it into small, middle and big.
def processParchAndSibSp(data):
    # Family size = parents/children + siblings/spouses + the passenger themself
    num_family = data['Parch'] + data['SibSp'] + 1
    bins = [0, 3, 6, int(max(num_family))]
    labels = ['small', 'middle', 'big']
    data['Family'] = pd.cut(num_family, bins=bins, labels=labels)
    return data
train_data = processParchAndSibSp(train_data)
showSurvivalUnderAttr(train_data, 'Family')
survival rate:
small 0.388750
middle 0.409091
big 0.160000
Name: Family, dtype: float64
Fare usually tracks cabin class, so before deciding whether to keep it we check how strongly the two attributes are correlated.
import scipy.stats as stats
print(stats.pearsonr(train_data['Pclass'], train_data['Fare']))
(-0.5494996199439074, 1.96738617342106e-71)
The Pearson coefficient is about -0.55 with a p-value of 1.97e-71, far below 0.05, so Fare and Pclass are significantly and fairly strongly correlated. I therefore drop Fare and leave it out of model training.
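As a robustness check (my addition, not part of the original post), a rank correlation is arguably more appropriate here, since Pclass is an ordinal label rather than a true numeric quantity; a minimal sketch:

# Spearman rank correlation treats Pclass as ordered categories (1st/2nd/3rd)
# rather than as a numeric value
rho, p_value = stats.spearmanr(train_data['Pclass'], train_data['Fare'])
print('Spearman rho = %.3f, p = %.2e' % (rho, p_value))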
# Embarked: port of embarkation
showSurvivalUnderAttr(train_data, 'Embarked')
survival rate:
S 0.339009
C 0.553571
Q 0.389610
Name: Embarked, dtype: float64
Now let's look at the dataset again:
train_data.head(5)
| | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Embarked | Age_cut | Family |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | S | Midlife | small |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C | Prime | small |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | S | Midlife | small |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | S | Prime | small |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | S | Prime | small |
Preparing for Model Training
Drop the attributes we decided against earlier: ['PassengerId', 'Name', 'Age', 'Parch', 'Ticket', 'Fare', 'SibSp'].
Both the derived attributes and some of the original ones are non-numeric categories: ['Age_cut', 'Embarked', 'Pclass', 'Sex', 'Family'].
These need to be one-hot encoded.
# One-hot encode each categorical attribute and drop the original column
def transformCategoryToValue(data, attributes):
    for attribute in attributes:
        tmpDf = pd.get_dummies(data[attribute], prefix=attribute)
        data = pd.concat([data, tmpDf], axis=1)
        data.drop(attribute, axis=1, inplace=True)
    return data

# Drop the listed attributes
def dropAttr(data, attributes):
    for attribute in attributes:
        data.drop(attribute, axis=1, inplace=True)
    return data
one_hot_list = ['Age_cut', 'Embarked', 'Pclass', 'Sex', 'Family']
drop_list = ['PassengerId', 'Name', 'Age', 'Parch', 'Ticket', 'Fare', 'SibSp']
train_data = dropAttr(train_data, drop_list)
train_data = transformCategoryToValue(train_data, one_hot_list)
test_id = test_data['PassengerId']  # keep PassengerId for the submission file later
# The test set needs the same derived attributes before encoding,
# otherwise Age_cut and Family do not exist there
test_data = processAge(test_data)
test_data = processParchAndSibSp(test_data)
test_data = dropAttr(test_data, drop_list)
test_data = transformCategoryToValue(test_data, one_hot_list)
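One pitfall worth guarding against here (my addition, not part of the original run): pd.get_dummies derives its columns from the categories present in each DataFrame, so a category missing from the test set would leave train and test with different column sets. A short sketch that realigns the test columns to the training layout:

# Reindex the test set to the training feature columns: missing dummy
# columns are created and filled with 0, and the column order is matched
feature_columns = train_data.drop('Survived', axis=1).columns
test_data = test_data.reindex(columns=feature_columns, fill_value=0)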
The transformed data looks like this:
train_data.head(5)
| | Survived | Age_cut_Baby | Age_cut_Teenager | Age_cut_Midlife | Age_cut_Prime | Age_cut_Older | Embarked_C | Embarked_Q | Embarked_S | Pclass_1 | Pclass_2 | Pclass_3 | Sex_female | Sex_male | Family_small | Family_middle | Family_big |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 |
| 1 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 |
| 2 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 0 |
| 3 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 |
| 4 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 |
Model Training
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
Y_train = train_data['Survived']
X_train = train_data.drop('Survived', axis=1)
X_test = test_data
def modelSelection():
    models = [LogisticRegression(), Perceptron(), SGDClassifier(), SVC(), KNeighborsClassifier(), GaussianNB(),
              DecisionTreeClassifier(), RandomForestClassifier(n_estimators=100)]
    modelsName = ['LogisticRegression', 'Perceptron', 'SGDClassifier', 'SVC', 'KNeighborsClassifier', 'GaussianNB',
                  'DecisionTreeClassifier', 'RandomForestClassifier']
    bestModel = None
    bestModelName = ''
    bestScore = 0
    for i in range(len(models)):
        tmpModel = models[i]
        modelName = modelsName[i]
        tmpModel.fit(X_train, Y_train)
        score = tmpModel.score(X_train, Y_train)  # accuracy on the training set
        print(modelName + ':', score)
        # Keep the model with the highest score seen so far
        if score > bestScore:
            bestScore = score
            bestModel = models[i]
            bestModelName = modelName
    print('BestModel is ' + bestModelName + ', score is ' + str(bestScore))
    return bestModel
bestModel = modelSelection()
LogisticRegression: 0.8125701459034792
Perceptron: 0.7441077441077442
SGDClassifier: 0.7890011223344556
SVC: 0.835016835016835
KNeighborsClassifier: 0.8204264870931538
GaussianNB: 0.7901234567901234
DecisionTreeClassifier: 0.8406285072951739
RandomForestClassifier: 0.8406285072951739
BestModel is RandomForestClassifier, score is 0.8406285072951739
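Keep in mind these numbers are training-set accuracy, which flatters models that can memorize the data (trees and forests in particular). A fairer comparison (my addition, not part of the original post) would use cross-validation; a minimal sketch:

from sklearn.model_selection import cross_val_score

# 5-fold cross-validated accuracy is a less optimistic estimate than
# scoring a model on the same data it was fitted on
for name, model in [('DecisionTreeClassifier', DecisionTreeClassifier()),
                    ('RandomForestClassifier', RandomForestClassifier(n_estimators=100))]:
    scores = cross_val_score(model, X_train, Y_train, cv=5, scoring='accuracy')
    print('%s: %.4f +/- %.4f' % (name, scores.mean(), scores.std()))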
Note: my first attempt at the prediction below failed with a FutureWarning about feature names unseen at fit time (Embarked, Pclass, Sex) followed by ValueError: could not convert string to float: 'male'. The cause was that test_data had not been run through processAge and processParchAndSibSp, so Age_cut and Family were never created and the raw categorical columns were never one-hot encoded; the fix is to apply those steps to the test set as well, as done in the preprocessing section above.
Predict on the test set and save the result for submission:
Y_pred = bestModel.predict(X_test)
submission = pd.DataFrame({
    'PassengerId': test_id,
    'Survived': Y_pred
})
submission.to_csv('./submission.csv', index=False)
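As a final sanity check (my addition), the saved file can be verified against the format Kaggle expects: 418 rows and exactly the columns PassengerId and Survived:

# Read the file back and confirm the expected submission shape
check = pd.read_csv('./submission.csv')
assert list(check.columns) == ['PassengerId', 'Survived']
assert len(check) == 418  # number of rows in the Titanic test set
print(check.head())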
From: https://www.cnblogs.com/shineyun/p/17133589.html