直接把数据丢进去,提升效果不太大。它会生成很多特征,导致计算机运算需要很长时间,因此不建议放入维度较大的特征,需要自己先筛选一下。
import pandas as pd
from sklearn.datasets import fetch_california_housing
from openfe import openfe, transform, tree_to_formula
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn import tree
import warnings
warnings.filterwarnings("ignore")
def get_score(train_x, test_x, train_y, test_y, n_jobs=3):
    """Train a LightGBM regressor and return its mean squared error on the test data.

    20% of the training rows are held out (fixed ``random_state=1``) and used
    only as the evaluation set for early stopping; the remaining 80% are used
    to fit the model.

    Args:
        train_x: Training features.
        test_x: Test features; must expose ``.index`` (e.g. a DataFrame).
        train_y: Training targets aligned with ``train_x``.
        test_y: Test targets aligned with ``test_x``.
        n_jobs: Thread count forwarded to ``LGBMRegressor``. Previously this
            was read from an undefined global, which raised ``NameError``.

    Returns:
        The mean squared error between ``test_y`` and the model predictions.
    """
    fit_x, val_x, fit_y, val_y = train_test_split(
        train_x, train_y, test_size=0.2, random_state=1
    )
    params = {'n_estimators': 1000, 'n_jobs': n_jobs, 'seed': 1}
    gbm = lgb.LGBMRegressor(**params)
    # Stop adding trees once the validation metric has not improved for 50 rounds.
    gbm.fit(
        fit_x,
        fit_y,
        eval_set=[(val_x, val_y)],
        callbacks=[lgb.early_stopping(50, verbose=False)],
    )
    # Re-attach the test index so predictions line up with test_y row by row.
    pred = pd.DataFrame(gbm.predict(test_x), index=test_x.index)
    return mean_squared_error(test_y, pred)
import numpy as np
import warnings
def select_correlated_columns(features, labels, threshold=0.3):
    """Return positions of feature columns correlated with the labels.

    The labels are appended as an extra column, the Pearson correlation
    matrix is computed with ``DataFrame.corr`` and every feature column whose
    absolute correlation with the label column is strictly greater than
    ``threshold`` is kept. Columns whose correlation is NaN (e.g. constant
    columns) are never selected.

    Args:
        features: 2-D array-like of shape (rows, columns).
        labels: 1-D array-like with one label per row.
        threshold: Minimum absolute correlation (exclusive).

    Returns:
        A NumPy integer array of selected column positions, in ascending order.
    """
    table = pd.DataFrame(np.column_stack((features, labels)))
    # The label is always the last column, whatever the feature count is.
    label_col = table.shape[1] - 1
    strength = table.corr()[label_col].abs()
    picked = strength.index[strength > threshold]
    return picked[picked != label_col].to_numpy()


if __name__ == '__main__':
    # First ID written to the submission file.
    # NOTE(review): presumably the IDs in test.csv start at 210 -- verify.
    FIRST_TEST_ID = 210

    # ---- training data: last column is the class label ----
    df = pd.read_csv('train.csv')
    df = df.drop(['ID'], axis=1)
    df = df.to_numpy()
    label = df[:, -1]
    # Magnitude of the FFT taken along each row of the raw features.
    feature = np.abs(np.fft.fft(df[:, :-1]))

    fe = select_correlated_columns(feature, label, threshold=0.3)
    if fe.size == 0:
        raise ValueError('no FFT feature has |correlation| > 0.3 with the label')

    # Slice the FFT features (not the raw data): the columns were chosen on
    # the FFT features and the test set below is transformed the same way.
    names = ['T' + str(i) for i in range(len(fe))]
    train = pd.DataFrame(feature[:, fe], columns=names)

    # ---- test data: same transform, same columns ----
    df1 = pd.read_csv('test.csv')
    df1 = df1.drop(['ID'], axis=1)
    df1 = df1.to_numpy()
    test_feature = np.abs(np.fft.fft(df1))
    test_fe = pd.DataFrame(test_feature[:, fe], columns=names)

    # feature generation
    ofe = openfe()
    features = ofe.fit(data=pd.DataFrame(train), task='classification',
                       label=pd.DataFrame(label), n_jobs=3,
                       stage1_metric='corr', stage2_metric='permutation')
    top_features = ofe.new_features_list[:10]
    train_x, test_x = transform(train, test_fe, top_features, n_jobs=3)
    print(train_x.shape)
    print(train.shape)

    # Compare 5-fold CV accuracy with and without the generated features
    # for tree depths 1..10 (the printed index is depth - 1).
    for i in range(10):
        clf = tree.DecisionTreeClassifier(criterion='gini', random_state=0, max_depth=i + 1)
        clf1 = tree.DecisionTreeClassifier(criterion='gini', random_state=0, max_depth=i + 1)
        scores = cross_val_score(clf, train, label, cv=5)
        scores1 = cross_val_score(clf1, train_x, label, cv=5)
        print(i, scores.mean(), scores1.mean())

    print("The top 10 generated features are")
    for generated in top_features:
        print(tree_to_formula(generated))

    # ---- final model and submission file ----
    clf1 = tree.DecisionTreeClassifier(criterion='gini', random_state=0, max_depth=3 + 1)
    clf1.fit(train_x, label)
    out = clf1.predict(train_x)
    print(out)
    out = clf1.predict(test_x)
    print(out)

    submission = pd.DataFrame({
        'ID': np.arange(len(out)) + FIRST_TEST_ID,
        'CLASS': out,
    })
    submission[['ID', 'CLASS']].to_csv('out3.csv', index=False)