前言
上篇 《有毒蘑菇的二分类预测 》(上) 用ColumnTransformer
和Pipeline
技术来改进缺失值处理和建模的方法。本篇将用特征工程的方法对特征进行扩展,由原先的21个特征扩展成118个特征,再用深度学习的方法进行建模,以取得较好的成绩。同时,本篇增加了上篇没有的EDA部分,以便更好地展示数据集。
题目说明
加载库
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
from sklearn.preprocessing import StandardScaler,OneHotEncoder,LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential,load_model
from tensorflow.keras.layers import Dense,Dropout,Input,BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping,ReduceLROnPlateau
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
加载数据
# Load the competition's training set, test set and sample submission.
df, dt, ds = (
    pd.read_csv(path)
    for path in (
        r"/kaggle/input/playground-series-s4e8/train.csv",
        r"/kaggle/input/playground-series-s4e8/test.csv",
        r"/kaggle/input/playground-series-s4e8/sample_submission.csv",
    )
)
# Report the (rows, columns) shape of each frame.
for label, frame in (
    ("train_data :", df),
    ("test_data :", dt),
    ("sample_submission_data :", ds),
):
    print(label, frame.shape)
train_data : (3116945, 22)
test_data : (2077964, 21)
sample_submission_data : (2077964, 2)
# Preview the first five rows of the training frame.
df.head()
id | class | cap-diameter | cap-shape | cap-surface | cap-color | does-bruise-or-bleed | gill-attachment | gill-spacing | gill-color | … | stem-root | stem-surface | stem-color | veil-type | veil-color | has-ring | ring-type | spore-print-color | habitat | season | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | e | 8.80 | f | s | u | f | a | c | w | … | NaN | NaN | w | NaN | NaN | f | f | NaN | d | a |
1 | 1 | p | 4.51 | x | h | o | f | a | c | n | … | NaN | y | o | NaN | NaN | t | z | NaN | d | w |
2 | 2 | e | 6.94 | f | s | b | f | x | c | w | … | NaN | s | n | NaN | NaN | f | f | NaN | l | w |
3 | 3 | e | 3.88 | f | y | g | f | s | NaN | g | … | NaN | NaN | w | NaN | NaN | f | f | NaN | d | u |
4 | 4 | e | 5.85 | x | l | w | f | d | NaN | w | … | NaN | NaN | w | NaN | NaN | f | f | NaN | g | a |
5 rows × 22 columns
# Preview the first five rows of the test frame.
dt.head()
id | cap-diameter | cap-shape | cap-surface | cap-color | does-bruise-or-bleed | gill-attachment | gill-spacing | gill-color | … | stem-root | stem-surface | stem-color | veil-type | veil-color | has-ring | ring-type | spore-print-color | habitat | season | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 3116945 | 8.64 | x | NaN | n | t | NaN | NaN | w | 11.13 | … | b | NaN | w | u | w | t | g | NaN | d |
1 | 3116946 | 6.90 | o | t | o | f | NaN | c | y | 1.27 | … | NaN | NaN | n | NaN | NaN | f | f | NaN | d |
2 | 3116947 | 2.00 | b | g | n | f | NaN | c | n | 6.18 | … | NaN | NaN | n | NaN | NaN | f | f | NaN | d |
3 | 3116948 | 3.47 | x | t | n | f | s | c | n | 4.98 | … | NaN | NaN | w | NaN | n | t | z | NaN | d |
4 | 3116949 | 6.17 | x | h | y | f | p | NaN | y | 6.73 | … | NaN | NaN | y | NaN | y | t | NaN | NaN | d |
5 rows × 21 columns
# Per-column count of missing values, for the training frame and the test frame.
df.isna().sum(), dt.isna().sum()
(id 0
class 0
cap-diameter 4
cap-shape 40
cap-surface 671023
cap-color 12
does-bruise-or-bleed 8
gill-attachment 523936
gill-spacing 1258435
gill-color 57
stem-height 0
stem-width 0
stem-root 2757023
stem-surface 1980861
stem-color 38
veil-type 2957493
veil-color 2740947
has-ring 24
ring-type 128880
spore-print-color 2849682
habitat 45
season 0
dtype: int64,
id 0
cap-diameter 7
cap-shape 31
cap-surface 446904
cap-color 13
does-bruise-or-bleed 10
gill-attachment 349821
gill-spacing 839595
gill-color 49
stem-height 1
stem-width 0
stem-root 1838012
stem-surface 1321488
stem-color 21
veil-type 1971545
veil-color 1826124
has-ring 19
ring-type 86195
spore-print-color 1899617
habitat 25
season 0
dtype: int64)
# Bar chart of how many training rows fall into each value of the 'class' column.
sns.countplot(data=df, x='class')
相关性验证
# Correlation heatmap over every training column except the row identifier.
plt.figure(figsize=(20, 20))
# The earlier `temp = df.dropna()` was overwritten by this statement before it was
# ever read, so it only cost a full copy of the frame; it has been removed and the
# plotted data is unchanged.
temp = df.drop('id', axis=1)
# Integer-encode the text columns so corr() can treat them as numbers.
# pd.factorize encodes missing values with the sentinel code -1.
temp = temp.apply(lambda x: pd.factorize(x)[0] if x.dtype == 'object' else x)
sns.heatmap(temp.corr(), annot=True, cmap='coolwarm')
plt.show()
用热力图可以看出,各特征的相关性不强
查看唯一类别
def count_cat(df):
    """Plot the value frequencies of every object-dtype column of ``df``.

    One bar chart is drawn per column; the charts are stacked vertically in a
    single figure (five inches of height per column) that is then shown.
    """
    object_columns = df.select_dtypes(include=['object']).columns
    n_rows = len(object_columns)
    plt.figure(figsize=(15, n_rows * 5))
    for position, name in enumerate(object_columns, start=1):
        frequencies = df[name].value_counts()
        plt.subplot(n_rows, 1, position)
        sns.barplot(x=frequencies.index, y=frequencies.values)
        plt.title(f"Count of unique categories in column '{name}'")
    plt.show()
# Draw the per-column value-frequency charts for the raw training frame.
count_cat(df)
查看各特征的唯一性及频次
从上图可以看出,特征含有杂乱无章的内容,如 特征
cap-shape
除了‘f’ ‘x’ ‘p’ ‘b’ ‘o’ ‘c’ ‘s’ 等内容外,居然还有大量的数值型的内容,如 ‘3.37’ ‘7.21’ ‘3.25’ ‘11.12’ ‘3 x’ ‘4.3’ 等,给我们的分析带来了困扰。因此,需要对此进行处理。
缺失值处理
计算各特征的缺失值占比情况(百分比)
def null_percent(df):
    """Return the percentage of missing values in each column of ``df``.

    The result is a Series indexed by column name, rounded to five decimals.
    """
    missing_counts = df.isna().sum()
    missing_ratio = missing_counts / len(df)
    return (missing_ratio * 100).round(5)
# Print the per-column missing-value percentages, training frame first, then test frame.
for heading, frame in (
    ("Nan Values in Train data", df),
    ("Nan Values in Test data", dt),
):
    print(heading)
    print(null_percent(frame))
Nan Values in Train data
id 0.00000
class 0.00000
cap-diameter 0.00013
cap-shape 0.00128
cap-surface 21.52823
cap-color 0.00038
does-bruise-or-bleed 0.00026
gill-attachment 16.80928
gill-spacing 40.37399
gill-color 0.00183
stem-height 0.00000
stem-width 0.00000
stem-root 88.45273
stem-surface 63.55136
stem-color 0.00122
veil-type 94.88435
veil-color 87.93697
has-ring 0.00077
ring-type 4.13482
spore-print-color 91.42548
habitat 0.00144
season 0.00000
dtype: float64
Nan Values in Test data
id 0.00000
cap-diameter 0.00034
cap-shape 0.00149
cap-surface 21.50682
cap-color 0.00063
does-bruise-or-bleed 0.00048
gill-attachment 16.83480
gill-spacing 40.40469
gill-color 0.00236
stem-height 0.00005
stem-width 0.00000
stem-root 88.45254
stem-surface 63.59533
stem-color 0.00101
veil-type 94.87869
veil-color 87.88044
has-ring 0.00091
ring-type 4.14805
spore-print-color 91.41722
habitat 0.00120
season 0.00000
dtype: float64
检查特征重要性并删除无用列
# alpha=.05
# val={}
# for col in df.columns:
# if col=='class':
# continue
# a,b=df[col],df['class']
# obs=pd.crosstab(a,b)
# chi2,p,dof,expected=scipy.stats.chi2_contingency(obs.values)
# val[col]=p
# if p<alpha:
# print("{} is important. (p = {})".format(col, p))
# else:
# print("{} is NOT important. (p = {})".format(col, p))
删除缺失值比率较高的两个特征(veil-type 和 veil-color),同时删除 id 列
# Remove the row identifier and the two veil columns from both frames.
dropped_columns = ['id', 'veil-type', 'veil-color']
df = df.drop(columns=dropped_columns)
dt = dt.drop(columns=dropped_columns)
处理空值和频次过低的类别
从上图得知,存在大量频次极低的内容,需要将这些内容剔除。
这里我们设定阈值为101,认为频次低于这个阈值的内容为噪音,将其统一替换为 noise 类别。
def cleaning(df, threshold=101, cat_feat=None):
    """Relabel missing and rare values in the categorical columns of ``df``.

    For each column in ``cat_feat``, missing entries are first replaced by the
    label 'missing'. Every value that then occurs fewer than ``threshold``
    times in that column (this can include 'missing' itself) is replaced by
    the label 'noise'.

    Args:
        df: Frame to clean. It is modified in place.
        threshold: Minimum number of occurrences a value needs to be kept.
        cat_feat: Column names to clean. Defaults to the fifteen categorical
            mushroom features.

    Returns:
        The same ``df`` object, with the listed columns stored as object dtype.

    Raises:
        KeyError: If a listed column is not present in ``df``.
    """
    if cat_feat is None:
        cat_feat = ["cap-shape", "cap-surface", "cap-color", "does-bruise-or-bleed", "gill-attachment",
                    "gill-spacing", "gill-color", "stem-root", "stem-surface", "stem-color",
                    "has-ring", "ring-type", "spore-print-color", "habitat", "season"]
    for name in cat_feat:
        # Work on plain objects rather than a Categorical: this avoids the
        # ValueError add_categories raises when the data already contains the
        # literal 'missing' or 'noise', and it gives every column the same
        # result dtype regardless of how many rare values it has.
        filled = df[name].astype(object).fillna('missing')
        counts = filled.value_counts()
        rare_values = counts[counts < threshold].index
        # Vectorized replacement instead of a Python-level call per row.
        df[name] = filled.mask(filled.isin(rare_values), 'noise')
    return df
# Relabel missing and rare categorical values in the training frame and the test frame.
# NOTE(review): each call counts value frequencies on the frame it is given, so the
# two frames are filtered independently and may end up keeping different values.
df = cleaning(df)
dt = cleaning(dt)
# Redraw the per-column value-frequency charts for the cleaned training frame.
count_cat(df)
剔除噪音后,重新画图展示。
由上图可以看出,经删除
噪音
后,数据干净
多了。
从上述数据得知,数值型也含有缺失值,具体对每个特征进行填充。
将数值型特征的缺失值,使用众数进行填充。
# Fill missing numeric values with the most frequent value of the same column
# in the same frame (the first mode when several values tie).
for frame, numeric_columns in (
    (df, ['cap-diameter']),
    (dt, ['cap-diameter', 'stem-height']),
):
    for name in numeric_columns:
        most_frequent = frame[name].mode()[0]
        frame[name] = frame[name].fillna(most_frequent)
再次检查是否还有缺失值
# Re-count missing values per column in both frames after the fills above.
df.isna().sum(), dt.isna().sum()
内容(略)
检查结果 各列均为 0 。
将非数值型 确定为
category
,方便模型处理。
# Names of the categorical feature columns.
cat_feats = [
    "cap-shape", "cap-surface", "cap-color", "does-bruise-or-bleed", "gill-attachment",
    "gill-spacing", "gill-color", "stem-root", "stem-surface", "stem-color",
    "has-ring", "ring-type", "spore-print-color", "habitat", "season",
]
# Cast each of those columns to pandas' category dtype, training frame first, then test frame.
for frame in (df, dt):
    for i in cat_feats:
        frame[i] = frame[i].astype('category')
# Separate the target column 'class' from the feature columns.
y = df['class']
X = df.drop(columns=['class'])
X.shape, y.shape
((3116945, 18), (3116945,))
针对数值型的特征,再次分析,画密度图
def plot_num(col):
    """Draw a seaborn distribution plot of column ``col`` of the global frame ``df``."""
    sns.displot(data=df, x=col)


# Plot each of the three numeric features in turn.
col = ["cap-diameter", "stem-height", "stem-width"]
for c in col:
    plot_num(c)
预处理数据
对分类数据进行 one-hot 【独热】处理,对数值型数据进行 StandardScaler【标准化】处理
def preprocess(df):
    """Standardize the numeric columns and one-hot encode the categorical columns.

    The three numeric columns are replaced by their StandardScaler output, the
    fifteen categorical columns are expanded with ``pd.get_dummies`` (first
    level of each column dropped), and every non-numeric-feature column of the
    result is cast to ``int``.

    Args:
        df: Frame holding the numeric and categorical feature columns. Its
            numeric columns are overwritten in place before the encoded copy
            is built.

    Returns:
        A new frame with the scaled numeric columns and the 0/1 dummy columns.
    """
    cat_features = ["cap-shape", "cap-surface", "cap-color", "does-bruise-or-bleed", "gill-attachment",
                    "gill-spacing", "gill-color", "stem-root", "stem-surface", "stem-color",
                    "has-ring", "ring-type", "spore-print-color", "habitat", "season"]
    num_features = ["cap-diameter", "stem-height", "stem-width"]
    # NOTE(review): a new scaler is fitted on whichever frame is passed in, and the
    # dummy columns depend on the values present in that frame, so two frames
    # processed separately are scaled with different statistics and may get
    # different column sets.
    scaler = StandardScaler()
    df[num_features] = scaler.fit_transform(df[num_features])
    df = pd.get_dummies(df, columns=cat_features, drop_first=True)
    one_hot_encoded_columns = df.columns.difference(num_features)
    # Report any column that holds something other than 0/1 before the int cast.
    # The comparison is vectorized; the previous per-element Python lambda was
    # called once per cell of every encoded column.
    for col in one_hot_encoded_columns:
        present = df[col].dropna()
        is_binary = (present == 0) | (present == 1)
        if not is_binary.all():
            print(f"Non-binary values found in column: {col}")
            print(df[col].unique())
    # Convert one-hot encoded columns to integer type
    df[one_hot_encoded_columns] = df[one_hot_encoded_columns].astype(int)
    return df
# Scale and one-hot encode the training features, then the test features.
# NOTE(review): the two calls are independent, so each frame is scaled and encoded
# from its own contents — confirm this is intended.
X=preprocess(X)
dt=preprocess(dt)
分别对训练集和测试集进行数据处理
# Preview the first five rows of the encoded test frame.
dt.head()
cap-diameter | stem-height | stem-width | cap-shape_c | cap-shape_f | cap-shape_noise | cap-shape_o | cap-shape_p | cap-shape_s | cap-shape_x | … | habitat_h | habitat_l | habitat_m | habitat_noise | habitat_p | habitat_u | habitat_w | season_s | season_u | season_w | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.498099 | 1.772335 | 0.737221 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | … | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0.126738 | -1.880901 | -0.049181 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | … | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | -0.919051 | -0.061693 | -0.988666 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | … | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
3 | -0.605314 | -0.506306 | -0.325718 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | … | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
4 | -0.029064 | 0.142088 | 0.315009 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | … | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
5 rows × 118 columns
处理后,特征由原来的 18 扩展到118个。
确定特征和标签,为建模作准备
# Give the training matrix exactly the test matrix's columns, in the same order.
# A dummy column that exists only in the test frame is created filled with 0;
# the previous pd.DataFrame(X, columns=...) form created it filled with NaN,
# which would feed NaN into the network.
X = X.reindex(columns=dt.columns, fill_value=0)
X.shape
(3116945, 118)
# Fit a label encoder on the target values, then replace them with integer codes.
lb = LabelEncoder().fit(y)
y = lb.transform(y)
y
array([0, 1, 0, …, 1, 0, 1])
训练模型(深度学习)
使用 Keras 的 Sequential 建立多层感知机(MLP)深度学习模型,并设定 early_stopping【早停】、learning rate【学习率】等参数进行训练。将训练过程记录下来,以便作图查看。
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for validation (fixed seed for a repeatable split).
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=2)

# Six Dense -> BatchNormalization -> Dropout stages of shrinking width,
# followed by a single sigmoid output unit.
hidden_units = (256, 128, 64, 32, 16, 8)
stack = [Input(shape=(X_train.shape[1],))]
for units in hidden_units:
    stack.append(Dense(units, activation='relu'))
    stack.append(BatchNormalization())
    stack.append(Dropout(0.23))
stack.append(Dense(1, activation='sigmoid'))
model = Sequential(stack)

# Binary cross-entropy loss, Adam optimizer, accuracy as the reported metric.
little_adam = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=little_adam, loss='binary_crossentropy', metrics=['accuracy'])

# Both callbacks watch the validation loss.
# NOTE(review): both use patience=25 — confirm the learning-rate reduction is
# meant to wait as long as early stopping does.
early_stopping = EarlyStopping(monitor='val_loss', patience=25, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.3, patience=25, min_lr=1e-6)

# Train for at most 70 epochs and keep the per-epoch metrics for plotting.
fit_options = dict(
    epochs=70,
    batch_size=128,
    validation_data=(X_val, y_val),
    callbacks=[early_stopping, reduce_lr],
)
history = model.fit(X_train, y_train, **fit_options)
# Training curves side by side: loss on the left, accuracy on the right.
# Each panel shows the training series and the matching validation series.
plt.figure(figsize=(12, 5))
panels = (
    ('loss', 'Model Loss', 'Loss'),
    ('accuracy', 'Model Accuracy', 'Accuracy'),
)
for slot, (metric, heading, y_label) in enumerate(panels, start=1):
    plt.subplot(1, 2, slot)
    plt.plot(history.history[metric])
    plt.plot(history.history[f'val_{metric}'])
    plt.title(heading)
    plt.xlabel('Epoch')
    plt.ylabel(y_label)
    plt.legend(['Train', 'Validation'])
plt.show()
Epoch 1/70
18264/18264 ━━━━━━━━━━━━━━━━━━━━ 141s 7ms/step - accuracy: 0.9657 - loss: 0.1044 - val_accuracy: 0.9914 - val_loss: 0.0423 - learning_rate: 0.0010
Epoch 2/70
18264/18264 ━━━━━━━━━━━━━━━━━━━━ 134s 7ms/step - accuracy: 0.9899 - loss: 0.0518 - val_accuracy: 0.9917 - val_loss: 0.0413 - learning_rate: 0.0010
Epoch 3/70
18264/18264 ━━━━━━━━━━━━━━━━━━━━ 136s 7ms/step - accuracy: 0.9904 - loss: 0.0493 - val_accuracy: 0.9917 - val_loss: 0.0403 - learning_rate: 0.0010
…
Epoch 69/70
18264/18264 ━━━━━━━━━━━━━━━━━━━━ 138s 8ms/step - accuracy: 0.9925 - loss: 0.0403 - val_accuracy: 0.9922 - val_loss: 0.0375 - learning_rate: 0.0010
Epoch 70/70
18264/18264 ━━━━━━━━━━━━━━━━━━━━ 138s 8ms/step - accuracy: 0.9925 - loss: 0.0405 - val_accuracy: 0.9922 - val_loss: 0.0373 - learning_rate: 0.0010
上述训练 在CPU条件进行,所花费的时间较多,如果有条件的小伙伴,不用修改任何代码,可以使用GPU进行训练,将提升训练效率。
附:(用GPU训练记录,也同时出现了早停)
Epoch 60/70
18264/18264 ━━━━━━━━━━━━━━━━━━━━ 57s 3ms/step - accuracy: 0.9925 - loss: 0.0407 - val_accuracy: 0.9922 - val_loss: 0.0379 - learning_rate: 3.0000e-04
提交结果
# Threshold the model outputs at 0.5 to get a flat array of 0/1 predictions.
scores = model.predict(dt)
test_predictions = (scores > 0.5).astype(int).flatten()
# Write prediction 1 as 'p' and prediction 0 as 'e' into the submission frame.
class_letters = np.array(['e', 'p'])
ds['class'] = class_letters[test_predictions]
# Save the submission file, then read it back from disk.
ds.to_csv('submission.csv', index=False)
x = pd.read_csv('submission.csv')
print("submission file created")
64937/64937 ━━━━━━━━━━━━━━━━━━━━ 110s 2ms/step
submission file created
提交测试平台
这个成绩比上篇0.98276有所提升。
结论
在上篇的基础上,对特征进行处理,包括删除频次少的内容,对分类特征进行 one-hot 编码,使特征扩展到118个,这样才可以用多层的深度学习模型进行训练。
经过测试,如果用顺序编码处理分类特征,无论用多层还是两层的深度学习模型,都无法使模型收敛,以致无法正常训练。
若想再提升竞赛成绩,可以融合其他模型后再提交结果,成绩可能略有提升。