1、性别
def xb(x):
if pd.isnull(x):
return "空"
elif int(x[-2])%2 == 1:
return "男"
elif int(x[-2])%2 == 0:
return "女"
else:return "其他"
def f(x):
return x.nunique()
2、省份
def province(x):
if str(x)[0:2] =='11' :
return '北京'
elif str(x)[0:2] =='12' :
return '天津'
elif str(x)[0:2] =='13' :
return '河北'
elif str(x)[0:2] =='14' :
return '山西'
elif str(x)[0:2] =='15' :
return '内蒙古'
elif str(x)[0:2] =='21' :
return '辽宁'
elif str(x)[0:2] =='22' :
return '吉林'
elif str(x)[0:2] =='23' :
return '黑龙江'
elif str(x)[0:2] =='31' :
return '上海'
elif str(x)[0:2] =='32' :
return '江苏'
elif str(x)[0:2] =='33' :
return '浙江'
elif str(x)[0:2] =='34' :
return '安徽'
elif str(x)[0:2] =='35' :
return '福建'
elif str(x)[0:2] =='36' :
return '江西'
elif str(x)[0:2] =='37' :
return '山东'
elif str(x)[0:2] =='41' :
return '河南'
elif str(x)[0:2] =='42' :
return '湖北'
elif str(x)[0:2] =='43' :
return '湖南'
elif str(x)[0:2] =='44' :
return '广东'
elif str(x)[0:2] =='45' :
return '广西'
elif str(x)[0:2] =='46' :
return '海南'
elif str(x)[0:2] =='50' :
return '重庆'
elif str(x)[0:2] =='51' :
return '四川'
elif str(x)[0:2] =='52' :
return '贵州'
elif str(x)[0:2] =='53' :
return '云南'
elif str(x)[0:2] =='54' :
return '西藏'
elif str(x)[0:2] =='61' :
return '陕西'
elif str(x)[0:2] =='62' :
return '甘肃'
elif str(x)[0:2] =='63' :
return '青海'
elif str(x)[0:2] =='64' :
return '宁夏'
else : return '新疆'
3、年龄
import datetime
def ag1(x):
if pd.isnull(x):
return "其他"
else:
birth_year = int(x[6:10])
today_year = datetime.date.today().year
return (today_year - birth_year)
data['年龄']=data['id_card_no'].apply(ag1)
4、邮件自动化
import openpyxl
dst_xlsx = '/data/python_home/cqg/data_model.xlsx'#模板,设置格式。
file_path = '/data/python_home/cqg/youjian.xlsx'#最终写好的模板路径
df=openpyxl.load_workbook(dst_xlsx) # 打开模板
tar = pd.read_excel('/data/python_home/cqg/duibi1.xlsx')#目标数据
tar
dt=pd.read_excel('data_model.xlsx')#打开看一下模板长啥样
dt
fund_list = ['多平台']
for k in range(1):
sheet0 = df.worksheets[k]# sheet页
target_data = pd.read_excel('/data/python_home/cqg/duibi1.xlsx').reset_index().T.reset_index().T.iloc[1:,2:10]
for i in range(len(target_data)):
#print(target_data.iloc[i,:])
for j in range(8):
sheet0.cell(i+3,j+1).value = target_data.iloc[(i,j)] # 写入数据
#sheet0.cell(i+1,j+1).alignment = Alignment(horizontal='center', vertical='center') # 居中对齐
df.save(file_path) # 保存文件
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
import smtplib
from email.mime.image import MIMEImage
import datetime
mail_host = "smtphz.qiye.163.com" #SMTP服务器地址
mail_sender = "[email protected]" #账号
mail_passwd = "Cloud@12@@" #密码
msg = MIMEMultipart('related')
msg["Subject"] = '多平台对比'
msg["From"] = mail_sender #发送人
receiver=['[email protected]']
邮件正文
content = '''
各位好:
'''
import datetime
yesterday = datetime.date.today()-datetime.timedelta(days=1)
msg.attach(MIMEText(content,'plain','utf-8'))
file_path = '/data/python_home/cqg2/数据情况1.xlsx'
构造附件2,xlsx文件
att2 = MIMEText(open(file_path, 'rb').read(), 'base64', 'utf-8')
att2["Content-Type"] = 'application/octet-stream'
att2.add_header("Content-Disposition",'attachment', filename=file_path)
msg.attach(att2)
s = smtplib.SMTP() #实例化对象
s.connect(mail_host) #连接163邮箱服务器,端口号为465,注意,这里不需要写端口号
s.login(mail_sender, mail_passwd) #登录邮箱
s.sendmail(mail_sender, receiver, msg.as_string())
s.quit()
6、聚合新增一列
zt['借款总次数']=zt.groupby('idcard')['loan_date'].transform('count')
7、将数据拆分并计数
yxjj['reason_code']=yxjj['reason_code'].apply(lambda x:str(x).split(", "))
def hmd(x):
a=0
for i in x:
if i in ['']:
a=a+1
return a
8、日期类型转为字符串
import datetime
now = datetime.datetime.now()
(s,'%Y/%m/%d %H:%M:%S') #先将字符串变成时间,再将时间变为字符串且此时可选择日期格式
将其他类型变成时间类型,不能随意改格式
to_datetime(arg=data['apply_date'],format="%Y-%m-%d")
ZT[i]=ZT[i].apply(lambda x: x.days)
today_year = datetime.date.today().year
字符串变时间类型
data['apply_date']=data['apply_date'].apply(lambda x:dt.datetime.strptime(x,"%Y-%m-%d"))
时间类型变字符串
import datetime as dt
data['apply_date']=data['apply_date'].apply(lambda x:x.strftime('%Y-%m'))
9、画图
画图,柱状图
import matplotlib.pyplot as plt
import numpy as np
import matplotlib as mpl
y=[10,30,14,15,5,5]
name=['10_20','20_30','30_40','40_50','50_60','60_70']
plt.bar(x,y,tick_label=name,width=3)
plt.ylim((0,35))
for x,y in zip(x,y):
plt.text(x,y,y,ha='center',va='bottom')
点图
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.pyplot as plt
x=[10,3,1,12,14,5,8,9,14,15,21]
y2=[15,31,2,14,15,6,31,20,25,12,12]
plt.scatter(x,y2)
plt.legend()
plt.show()
10循环写入
testxlsx=pd.ExcelWriter('test.xlsx')
for i in plat_360:
for j in range(0,len(needcolumns)):
if needcolumns[j]!='省份':
table=pd.pivot_table(data_cgm_360_copy_dropdup[data_cgm_360_copy_dropdup.plati],index='等级',columns=needcolumns[j],values='phone',aggfunc=f,margins=True)
for z in table.columns:
table[str(z)+'占比']=table[z]/table['All']
table[str(z)+'占比']=table[str(z)+'占比'].apply(lambda x:format(x100,'.2f')+'%')
table.insert(0,'type',value=needcolumns[j])
title[title.index0].style.set_properties({'text-align':'center'}).to_excel(testxlsx,sheet_name=i,startrow=0,index=False)
dengjitable360=pd.pivot_table
(data_cgm_360_copy_dropdup[data_cgm_360_copy_dropdup.plat==i],index='等级',values='phone',aggfunc=f).rename(columns={'phone':'用户数'})
dengjitable360['用户数占比']=dengjitable360['用户数']/dengjitable360['用户数'].sum()
dengjitable360['用户数占比']=dengjitable360['用户数占比'].apply(lambda x:format(x100,'.2f')+'%')
dengjitable360.style.set_properties().to_excel(testxlsx,sheet_name=i,startrow=83)
table.drop(columns='All占比').style.set_properties({'text-align':'center'}).to_excel(testxlsx,sheet_name=i,startrow=j*13+2)
pjnl=pd.pivot_table(data_cgm_360_copy_dropdup[data_cgm_360_copy_dropdup.plati],index='等级',values='年龄',aggfunc=np.mean).rename(columns={'年龄':'平均年龄'})
pjnl.to_excel(testxlsx,sheet_name=i,startrow=15,startcol=16)
else:
table=pd.pivot_table(data_cgm_360_copy_dropdup[data_cgm_360_copy_dropdup.plati],index='等级',columns=needcolumns[j],values='phone',aggfunc=f,margins=True)
table.insert(0,'type',value=needcolumns[j])
table.style.set_properties().to_excel(testxlsx,sheet_name=i,startrow=69)
setsheetwidth=testxlsx.sheets[i]
setsheetwidth.set_column('A:Q',16)
textxlsx.save()
11、四种分箱
cut
sx['kh_loanterm']=pd.cut(sx['kh_loanterm'],bins=[3,6,9,12,15,np.inf],right=False,
labels=['A_3','B_6','C_9','D_12','E_15'])
占位符
def 朴道_海纳综合指数V2_申请命中网络贷款类机构数_trans(x):
# 朴道_海纳综合指数V2_申请命中网络贷款类机构数 连续型特征的分箱转换函数
inf = np.inf
bins = [-inf, 0.5, 1.5, 2.5, 13.5, inf]
for i in range(len(bins)-1):
start = bins[i]
end = bins[i+1]
if start < x <= end:
return "{0}({1}, {2}]".format(i+1, start, end) # 分箱字符串格式: i(start, end]
return '0_nan'
fksx1[i]=fksx1[i].cat.add_categories(['空值'])
占位符升级
def kh_jld_ApplyTime_hour_bin(x):
values=[4,8,12,18,23]
index=['A','B','C','D']
for i in range(len(values)-1):
if values[i]<x<=values[i+1]:
return '{0}_({1},{2}]'.format(index[i],values[i],values[i+1])
return 'G_未知'
qcut
for i in coll:
sx[i]=pd.qcut(sx[sx[i]>=0][i],q=5,duplicates='drop')
sx[i]=sx[i].astype('str')
sx[i].replace('nan',np.nan,inplace=True)
sx[i].fillna(sx2[i],inplace=True)
12、把时间变为周
mysql专用:
周日开始
concat(str_to_date(concat(yearweek(apply_date), ' Sunday'), '%X%V %W'),"~",date_add(str_to_date(concat(yearweek(apply_date), ' Sunday'), '%X%V %W'),interval 6 day)) AS APPLY_WEE
周一开始
concat(str_to_date(concat(yearweek(apply_date,1),'Monday'),'%x%v %W'),"~",date_add(str_to_date(concat(yearweek(apply_date,1),'Monday'),'%x%v %W'),interval 6 day)) AS APPLY_WEEK
Python函数 :
周一开始
import datetime as dt
def get_monday_to_sunday(today):
today = dt.datetime.strptime(str(today), "%Y-%m-%d")
monday = dt.datetime.strftime(today - dt.timedelta(today.weekday()), "%Y-%m-%d")
monday_ = dt.datetime.strptime(monday, "%Y-%m-%d")
sunday = dt.datetime.strftime(monday_ + dt.timedelta(monday_.weekday() + 6), "%Y-%m-%d")
monday1=str(monday)
sunday1=str(sunday)
return monday1+"~"+sunday1
数仓专用
周一开始
concat(DATE_ADD(NEXT_DAY(放款日期,'星期天'),-6),"~",NEXT_DAY(放款日期,'星期天')) week
13、KS和IV(pycard)
KS循环
testks=pd.ExcelWriter('test1.xlsx')
columns_df=pd.DataFrame(mx2.columns)
for i,j in enumerate(mx2.columns):
result=toad.metrics.KS_bucket(mx[j],mx['target'],bucket=10,method='quantile').sort_index()
result.to_excel(testks,sheet_name='kskn',startrow=14*i+1,index=False)
testks.save()
testks.close()
分箱加结果
data_sd = ZT3
import pycard as pc
num_iv_woedf = pd.DataFrame()
clf = pc.NumBin()
for i in coll:
if data_sd[i].isnull().sum()/len(data_sd)<1:
clf.fit(data_sd[i] ,data_sd['dpd30+'])
num_iv_woedf = num_iv_woedf.append(clf.woe_df_)
原始分箱
import toad
import scorecardpy as sc
str(x).strip()
toad.detector.detect()
toad.metrics.KS_bucket(x,y,bucket=10,method = 'quantile')
bins = sc.woebin(y2, y="target")#y2是数据,里面的目标列用target
sc.woebin_plot(bins)
breaks_adj = {'xy':[1,2,3,4]}
bins_adj = sc.woebin(y2, y="target",breaks_list=breaks_adj)
sc.woebin_plot(bins_adj)
bins_result= pd.DataFrame()
for value in bins.values():
bins_result=bins_result.append(value)
bins_result.to_excel("IV.xlsx")
14异常展示
展示异常
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif']=['SimHei'] #用来正常显示中文标签
plt.rcParams['axes.unicode_minus']=False #用来正常显示负号
15、第一行设为列名
import numpy as np
array = np.array(zt1)
list = array.tolist()
list = list[0]
zt1.columns = list
16、加密
sha256加密
import hashlib
def sha256_jiami(x):
'''
Parameters
----------
x : string
需要加密的字符串.
Returns
-------
字符窜加密后的字符
'''
sha256 = hashlib.sha256()
sha256.update(x.encode('utf-8'))
return sha256.hexdigest()
target_01['phone_sha256'] = target_01['Phone'].apply(sha256_jiami)
MD5加密
import hashlib
def md5_jiami(x):
Jpwd = hashlib.md5()
Jpwd.update(x.encode('utf-8'))
return Jpwd.hexdigest()
zt1['手机号']=zt1['手机号'].apply(md5_jiami)
17、计算PSI
HX1['本月-训练集']=HX1.apply(lambda x:(x.本月-x.训练集)*math.log(x.本月/x.训练集),axis=1)
HX1.loc['Row_sum'] = HX1.iloc[:,1:].apply(lambda x: x.sum())
18、pycard分析
for i in ['dpd1+']:
for j in kncol:
tmp_data = pc.cross_woe(kn[j],kn[i]).reset_index()
tmp_data.insert(0,'type',value=i)
tmp_data.rename({j:'区间'},axis=1,inplace=True)
data_iv = data_iv.append(tmp_data)
data_iv