1. Imputer (scikit-learn): fill in the missing values first, then train a random forest
# Fill missing values with scikit-learn's imputer, then train and score a
# random forest on the imputed data.
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

# Placeholder path kept from the original notes; point it at a real workbook.
fileName = '***/abc.xlsx'
df = pd.read_excel(fileName)

# `sklearn.preprocessing.Imputer` no longer exists in current scikit-learn;
# `SimpleImputer` is its replacement. It always imputes column-wise (the old
# `axis=0`) and treats NaN as the missing marker by default.
# Supported strategies here: 'most_frequent', 'median', 'mean'.
imp = SimpleImputer(strategy='most_frequent')

# The imputer returns a NumPy array, so rebuild a DataFrame to keep `.iloc`
# working below. `infer_objects()` restores numeric dtypes when the input had
# mixed column types.
# NOTE(review): a column that is entirely NaN is dropped by the imputer, which
# would make `columns=df.columns` mismatch -- confirm the data has none.
df = pd.DataFrame(
    imp.fit_transform(df), columns=df.columns, index=df.index
).infer_objects()

# Last column is the label; everything before it is a feature.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=0)

forest = RandomForestClassifier()
forest.fit(x_train, y_train)
print(f'accuracy on the training subset:{forest.score(x_train, y_train):.3f}')
# The original message said "training subset" for the held-out score as well.
print(f'accuracy on the test subset:{forest.score(x_test, y_test):.3f}')
2. XGBoost: train directly on the data, without a separate imputation step
# Train and score an XGBoost classifier on the raw spreadsheet data, with no
# separate imputation step.
import pandas as pd
import xgboost as xg  # was `from xgboost as xg`, which is a syntax error
from sklearn.model_selection import train_test_split

# Placeholder path kept from the original notes; point it at a real workbook.
fileName = '***/abc.xlsx'
df = pd.read_excel(fileName)

# Last column is the label; everything before it is a feature.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=0)

# NOTE(review): `missing=None` is presumably meant to keep XGBoost's default
# missing-value marker (NaN) -- confirm against the installed xgboost version.
model = xg.XGBClassifier(missing=None)
model.fit(x_train, y_train)
print(f'accuracy on the training subset:{model.score(x_train, y_train):.3f}')
# The original message said "training subset" for the held-out score as well.
print(f'accuracy on the test subset:{model.score(x_test, y_test):.3f}')
3. CatBoost: train directly on the data, without a separate imputation step
# Train and score a CatBoost classifier on the raw spreadsheet data, with no
# separate imputation step.
import catboost as cb  # was `from catboost as cb`, which is a syntax error
import pandas as pd
from sklearn.model_selection import train_test_split

# Placeholder path kept from the original notes; point it at a real workbook.
fileName = '***/abc.xlsx'
df = pd.read_excel(fileName)

# Last column is the label; everything before it is a feature.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Named `model` rather than `rf`: this is a CatBoost model, not a random forest.
model = cb.CatBoostClassifier()
model.fit(x_train, y_train)
print(f'accuracy on the training subset:{model.score(x_train, y_train):.3f}')
# The original message said "training subset" for the held-out score as well.
print(f'accuracy on the test subset:{model.score(x_test, y_test):.3f}')
标签:subset,处理,语法,df,train,test,import,model,缺失 From: https://www.cnblogs.com/liyiyu/p/17265859.html