特征处理的实现代码如下:
#先对年龄缺失值进行处理,这里先按28岁填充
处理客户年龄,因为年龄基本服从正态分布,初步考虑分为0-20,20-30,30-40,40-50,50-60,70-80,80-100
分别标记为 age_class1,......,age_class8
'''
Train['NTRL_CUST_AGE']=Train['NTRL_CUST_AGE'].fillna(28)
Sex_OneHot=pd.get_dummies(Train['AGE_CLASS'],prefix='AGES',prefix_sep='_')
Sex_OneHot=Sex_OneHot.drop(['AGES_A'],axis=1)
Train=Train.drop(['AGE_CLASS'],axis=1)
Train=Train.drop(['NTRL_CUST_AGE'],axis=1)
Train=Train.merge(Sex_OneHot,on='CUST_NO',how='outer')
Train=Train.merge(df_DZ_ASSET,on='CUST_NO',how='outer')
Train=Train.merge(df_DZ_TR_ATM,on='CUST_NO',how='outer')
Train=Train.merge(df_DZ_TR_CASH,on='CUST_NO',how='outer')
Train=Train.merge(df_DZ_TR_FNCG,on='CUST_NO',how='outer')
Train=Train.merge(df_DZ_TR_IBTF,on='CUST_NO',how='outer')
Train=Train.merge(df_DZ_TR_TERM,on='CUST_NO',how='outer')
Train=Train.merge(df_DZ_TR_TPAY,on='CUST_NO',how='outer')
读取了客户基本信息,下面我们读取客户资产表:
df_DZ_ASSET= pd.DataFrame(pd.read_csv(Dir +'DZ_ASSET.csv',parse_dates=['DATA_DAT'], index_col='CUST_NO'))
df_DZ_ASSET = df_DZ_ASSET.drop(['DATA_DAT'],axis=1)
对表中的字段进行分析,取AST_DAY_FA_BAL(月日均金融资产余额)分析
df_DZ_ASSET['AST_DAY_FA_BAL'].describe()
count 50162.000000 mean 43.796521 std 41.986134 min 0.000000 25% 10.643507 50% 27.757308 75% 67.408222 max 444.940548 Name: AST_DAY_FA_BAL, dtype: float64
依次对其它字段进行分析,情况类似。
这里所有的(数据)字段都是数据类型,且观察比赛组委会已经对数据做了脱敏处理,故不再做特征处理了,直接把所有数据字段作为特征。
下面我们读取客户资产表:DZ_TR_CASH(客户现金交易表)
df_DZ_TR_CASH = pd.DataFrame(pd.read_csv(Dir +'DZ_TR_CASH.csv',parse_dates=['DATA_DAT'], index_col='CUST_NO'))
df_DZ_TR_CASH = df_DZ_TR_CASH.drop(['DATA_DAT'],axis=1)
df_DZ_TR_CASH['CASH_MOTH_TR_CNT'].describe() #统计月交易笔数信息
这里取 月交易笔数进行分析:
Out[26]:
count 41518.000000 mean 4.000337 std 7.060957 min 3.000000 25% 3.000000 50% 3.000000 75% 3.000000 max 1051.000000 Name: CASH_MOTH_TR_CNT, dtype: float64
从数据上看,交易笔数最大的为1051次,中间值mean为4次,说明差异较大。因为暂时不知道这个数据差异有说明影响,暂时不做处理,直接使用。
再看看月交易金额数据统计:
df_DZ_TR_CASH['CASH_MOTH_TR_AMT'].describe()
count 41518.000000 mean 10.539245 std 27.621481 min 0.000000 25% 0.000000 50% 0.000000 75% 0.000000 max 350.685150 Name: CASH_MOTH_TR_AMT, dtype: float64
最大值为350,75%的为0,说明5万余客户中,有近1万没有发生现金交易,有3万左右的当月日均交易金额为0。
这里也暂时不做特征处理。
同样的方法读取几个表的数据
df_DZ_MBNK_BEHAVIOR= pd.DataFrame(pd.read_csv(Dir + 'DZ_MBNK_BEHAVIOR.csv'))
df_DZ_NATURE= pd.DataFrame(pd.read_csv(Dir +'DZ_NATURE.csv'))
df_DZ_TARGET_TRAIN= pd.DataFrame(pd.read_csv(Dir +'DZ_TARGET_TRAIN.csv'))
df_DZ_TR_APS= pd.DataFrame(pd.read_csv(Dir +'DZ_TR_APS.csv'))
df_DZ_TR_ATM= pd.DataFrame(pd.read_csv(Dir +'DZ_TR_ATM.csv'))
df_DZ_TR_CASH= pd.DataFrame(pd.read_csv(Dir +'DZ_TR_CASH.csv'))
df_DZ_TR_FNCG= pd.DataFrame(pd.read_csv(Dir +'DZ_TR_FNCG.csv'))
df_DZ_TR_IBFT= pd.DataFrame(pd.read_csv(Dir +'DZ_TR_IBFT.csv'))
df_DZ_TR_TERM= pd.DataFrame(pd.read_csv(Dir +'DZ_TR_TERM.csv'))
df_DZ_TR_TPAY= pd.DataFrame(pd.read_csv(Dir +'DZ_TR_TPAY.csv'))
这里要注意,根据时间序列,需要对掌银行为表和活期交易表进行进行特征处理。
先对掌银行为表进行分析。该表的addfielddate字段记录的是近61天的具体日期(10月X日-11月XX日),由于数据不连续,需要做特征处理,基本处理思路如下:选取访问量最大的100个网页做特征,每个用户访问某网页的次数作为特征的取值。
具体代码如下:
'''对掌银访问记录和活期流水进行特征处理,因为较为复杂,用函数表示'''
def get_user_date_sum(data):
t1 = data.groupby(['apsdcusno','apsdtrdat'])['apsdtramt'].sum().reset_index().sort_values(by='apsdtrdat')
columns = t1["apsdtrdat"].unique().tolist()
new_columns = ['timestamp'] + columns
t1 = pd.pivot_table(data=t1, columns=t1['apsdtrdat'], index=t1['apsdcusno'])
t1 = t1.reset_index()
t1.columns = new_columns
columns.sort()
new_columns = ['timestamp'] + columns
t1.reindex(new_columns)
#t1=t1.drop(['apsdtrdat'],axis=1)
t1.rename(columns={'timestamp':'CUST_NO'},inplace=True)
ts=len(new_columns)-62
while ts>0 :
#将要删除的数据加到它的下一个列中,再进行删除
keys=new_columns[1]
keyt=new_columns[2]
t1[keyt] = t1[keyt] +t1[keys]
t1=t1.drop([keys],axis=1)
ts=ts-1
del(new_columns[1])
#t1.drop(['apsdtrdat'],axis=1)
for str1 in t1.columns.values:
t1[str1]=t1[str1].fillna(0)
#确定列名
ttt=['CUST_NO','aps1','aps2','aps3','aps4','aps5','aps6','aps7','aps8','aps9','aps10',
'aps11','12','aps13','aps14','aps15','aps16','aps17','aps18','aps19','aps20',
'aps21','aps22','aps23','aps24','aps25','aps26','aps27','aps28','aps29','aps30',
'aps31','aps32','aps33','aps34','aps35','aps36','aps37','aps38','aps39','aps40',
'aps41','aps42','aps43','aps44','aps45','aps46','aps47','aps48','aps49','aps50',
'aps51','aps52','aps53','aps54','aps55','aps56','aps57','aps58','aps59','aps60','asp61']
t1.columns=ttt
return t1
def get_user_sum(data):
t1 = data.groupby('apsdcusno')['apsdtramt'].sum().reset_index()
return t1
def get_date_sum(data):
t1 = data.groupby('apsdtrdat')['apsdtramt'].sum().reset_index()
return t1
def get_url_user_pv(data):
t1 = data.groupby(['OperationPage','UserId'])['pid'].count().sort_values(ascending=False).reset_index()
return t1
def get_url_user_pv_100(data,OperationPage):
data = data[data['OperationPage'].isin(OperationPage)]
t1 = data.groupby(['OperationPage','pid'])['UserId'].count().reset_index()
#print(t1)
columns = t1["OperationPage"].unique().tolist()
new_columns = ['CUST_NO'] + columns
t1 = pd.pivot_table(data=t1, columns=t1['OperationPage'], index=t1['pid'])
t1 = t1.reset_index()
t1.columns = new_columns
# columns.sort()
# new_columns = ['timestamp'] + columns
# t1.reindex(new_columns)
for str2 in t1.columns.values:
t1[str2]=t1[str2].fillna(0)
return t1
#添加按日期统计次数
def MK_user_date_count(data):
#分组统计前4个访问量大的网页
t5 = data.groupby(['pid','addfielddate'])['OperationPage'].count().reset_index().sort_values(by='addfielddate')
columns = t5["addfielddate"].unique().tolist()
new_columns = ['timestamp'] + columns
t5 = pd.pivot_table(data=t5, columns=t5['addfielddate'], index=t5['pid'])
t5 = t5.reset_index()
t5.columns = new_columns
columns.sort()
new_columns = ['timestamp'] + columns
t5.reindex(new_columns)
#t5=t5.drop(['apsdtrdat'],axis=1)
t5.rename(columns={'timestamp':'CUST_NO'},inplace=True)
#t5.drop(['apsdtrdat'],axis=1)
for str1 in t5.columns.values:
t5[str1]=t5[str1].fillna(0)
ts=len(new_columns)-62
while ts>0 :
keys=new_columns[1]
t5=t5.drop([keys],axis=1)
ts=ts-1
del(new_columns[1])
ttt5=['CUST_NO','MA1','MA2','MA3','MA4','MA5','MA6','MA7','MA8','MA9','MA10',
'MA11','MA12','MA13','MA14','MA15','MA16','MA17','MA18','MA19','MA20',
'MA21','MA22','MA23','MA24','MA25','MA26','MA27','MA28','MA29','MA30',
'MA31','MA32','MA33','MA34','MA35','MA36','MA37','MA38','MA39','MA40',
'MA41','MA42','MA43','MA44','MA45','MA46','MA47','MA48','MA49','MA50',
'MA51','MA52','MA53','MA54','MA55','MA56','MA57','MA58','MA59','MA60','MA61']
t5.columns=ttt5
return t5
def get_user_pv(data):
t1 = data.groupby('pid')['UserId'].count().sort_values(ascending=False).reset_index()
t1.rename(columns={'pid':'CUST_NO'}, inplace=True)
return t1
def get_url_pv(data):
t1 = data.groupby('OperationPage')['UserId'].count().sort_values(ascending=False).reset_index()
return t1
def get_url_pv_100(data):
t1 = data.groupby('OperationPage')['pid'].count().sort_values(ascending=False).reset_index()[0:200]
return t1
def get_user_pv_1(data):
op1='412bc695fd5be4e81813316139c49b5e'
data=data[data['OperationPage'] == op1]
t1 = data.groupby('pid')['OperationPage'].count().sort_values(ascending=False).reset_index()
t1.rename(columns={'pid':'CUST_NO'}, inplace=True)
t1.rename(columns={'OperationPage':'op1'}, inplace=True)
return t1
def open_csv():
pd.set_option('display.max_columns', None)
data = pd.read_csv(Dir + 'DZ_TR_APS.csv',low_memory=False,header=0)
data['apsdtramt'] = pd.to_numeric(data['apsdtramt'], errors='coerce')
user_date_sum = get_user_date_sum(data)
user_sum = get_user_sum(data)
date_sum = get_date_sum(data)
data = pd.read_csv(Dir +'DZ_MBNK_BEHAVIOR.csv',low_memory=False,header=0)
url_pv = get_url_pv_100(data)
user_rul_pv = get_url_user_pv_100(data,url_pv['OperationPage'])
user_pv = get_user_pv(data)
user_pv_1 = get_user_pv_1(data)
get_user_date_count= MK_user_date_count(data)
return user_date_sum,user_sum,date_sum,user_rul_pv,user_pv,url_pv,get_user_date_count,user_pv_1
user_date_sum,user_sum,date_sum,user_rul_pv,user_pv,url_pv,get_user_date_count,user_pv_1 = open_csv()
user_date_sum.to_csv('按用户id与日期汇总金额.csv',index_label=False,index=False)
#user_sum.to_csv('按用户id汇总金额.csv')
#date_sum.to_csv('按日期汇总金额.csv')
user_rul_pv.to_csv('按用户连接计数.csv',index_label=False,index=False)
get_user_date_count.to_csv('按用户-日期统计连接计数.csv',index_label=False,index=False)
print('---按用户计数----',user_pv)
#user_pv.to_csv('按用户计数.csv')
#url_pv.to_csv('按连接计数.csv')
'''
对活期表,掌银表进行数据填充,均使用0填充
特征合并
'''
web= pd.DataFrame(pd.read_csv(Dir + '按用户连接计数-Copy1.csv'))
columns1=web.columns.values.tolist()
web2= pd.DataFrame(pd.read_csv(Dir + '按用户连接计数.csv'))
columns2=web2.columns.values.tolist()
for sr1 in columns2:
if sr1 in columns1:
web[sr1]=web2[sr1]
for str2 in columns1:
web[str2]=web[str2].fillna(0)
Train=Train.merge(user_date_sum,on='CUST_NO',how='outer')
Train=Train.merge(get_user_date_count,on='CUST_NO',how='outer')
Train=Train.merge(user_pv,on='CUST_NO',how='outer')
#Train=Train.merge(user_pv_1,on='CUST_NO',how='outer')
Train=Train.merge(web,on='CUST_NO',how='outer')
Train=Train.merge(df_DZ_TARGET_TRAIN,on='CUST_NO',how='outer')
for str2 in Train.columns.values:
Train[str2]=Train[str2].fillna(0)
这样就完成了对全部数据的特征工程(初步),可以用来进行开始机器学习或进一步特征工程了。
标签:大众,--,t1,人工智能,DZ,user,csv,data,columns From: https://www.cnblogs.com/aswgp/p/17284230.html