代码实现如下:
# 导入所需的库
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
# Column names in file order: 41 feature columns followed by the class label.
column_names = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
    'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins',
    'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root',
    'num_file_creations', 'num_shells', 'num_access_files',
    'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count',
    'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate',
    'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate',
    'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate',
    'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
    'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate',
    'label',
]
# Read the dataset; the file is parsed without a header row, so the columns
# are named explicitly afterwards.
df = pd.read_csv('kddcup.data_10_percent_corrected', header=None)
df.columns = column_names
# Convert the non-numeric columns to integer codes.
# The single encoder is refitted for every column, so once the loop finishes
# it holds the mapping of the last column processed ('label').
le = LabelEncoder()
for column in ('protocol_type', 'service', 'flag', 'label'):
    df[column] = le.fit_transform(df[column])
# Rescale the continuous features to the [0, 1] range.
numeric_features = [
    'duration', 'src_bytes', 'dst_bytes', 'wrong_fragment', 'urgent', 'hot',
    'num_failed_logins', 'num_compromised', 'num_root', 'num_file_creations',
    'num_shells', 'num_access_files', 'count', 'srv_count', 'serror_rate',
    'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
    'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
    'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'dst_host_rerror_rate', 'dst_host_srv_rerror_rate',
]
# Min-max scaling is applied directly to the raw values. Standardizing first
# would not change the result (min-max scaling is invariant to a per-column
# shift and positive rescale) and would leave `scaler` fitted on standardized
# rather than raw data, so it could not be reused on new, unscaled samples.
# NOTE: the scaler is fitted on the full dataset, before the
# train/validation/test split below, so statistics of the held-out rows
# influence the scaling. Fit on the training rows only if that matters.
scaler = MinMaxScaler()
df[numeric_features] = scaler.fit_transform(df[numeric_features])
# One-hot encode the categorical features.
encoder = OneHotEncoder(sparse_output=False)
categorical_features = ['protocol_type', 'service', 'flag']
encoded_features = encoder.fit_transform(df[categorical_features])
# Name the indicator columns with strings ("<feature>_<category>") instead of
# the default integer labels: a frame mixing string and integer column labels
# is rejected by scikit-learn estimators. Reusing df's index keeps the rows
# aligned one-to-one when the two frames are concatenated.
encoded_features = pd.DataFrame(
    encoded_features,
    columns=encoder.get_feature_names_out(categorical_features),
    index=df.index,
)
# Replace the original categorical columns with their indicator columns.
df.drop(categorical_features, axis=1, inplace=True)
df = pd.concat([df, encoded_features], axis=1)
# Split the data into training, validation and test sets.
y = df['label']
X = df.drop(columns='label')
# First hold out 20% of the rows as the test set.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Then take 12.5% of the remaining 80% (10% of all rows) for validation,
# which leaves 70% of the rows for training.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.125, random_state=42
)
标签:df,dst,host,rate,num,srv,KDDCUP99,数据处理
From: https://www.cnblogs.com/lisyr44/p/17396330.html