MySQL 的 连接和使用
https://www.cnblogs.com/zdstudy/p/16567399.html mysql使用网址
https://blog.csdn.net/LikiLyn/article/details/120385981 多个文件merge
import pandas as pd
import numpy as np
import pymysql
#%%打开数据库连接
conn = pymysql.connect(host='地址',user = "用户",passwd = "密码",\
db = "数据库")
sql1 = '''
'''
sql_data1 = pd.read_sql_query(sql1,conn)
import pymysql
conn = pymysql.connect(host="地址",port=3306,user="用户",password="密码",database="数据库")
cursor = conn.cursor()
sqls1 = "drop table if exists "
cursor.execute(sqls1)
cursor.close()
conn.close()
import pymysql
import pandas as pd
from sqlalchemy import create_engine
conn = pymysql.connect(host='地址',user = "用户",passwd = "密码",\
db = "数据库")
#读取数据
#建立连接,username替换为用户名,passwd替换为密码,test替换为数据库名
conn = create_engine('mysql+pymysql://用户:密码@地址:端口/数据库',encoding='utf8')
#写入数据,table_name为表名,‘replace’表示如果同名表存在就替换掉
cddyx.to_sql("", conn, if_exists='replace', index=False)
import toad
import scorecardpy as sc
str(x).strip()
toad.detector.detect()
toad.metrics.KS_bucket(x,y,bucket=10,method = 'quantile')
bins = sc.woebin(y2, y="target")#y2是数据,里面的目标列用target
sc.woebin_plot(bins)
breaks_adj = {'xy':[1,2,3,4]}
bins_adj = sc.woebin(y2, y="target",breaks_list=breaks_adj)
sc.woebin_plot(bins_adj)
bins_result= pd.DataFrame()
for value in bins.values():
bins_result=bins_result.append(value)
bins_result.to_excel("IV.xlsx")
注意列名为数字 去掉空格 target为数值
from functools import reduce
dfs = [x2, x3, x4]
df = reduce(lambda x, y: pd.merge(x, y, on=["phone",'id','name'], how="inner"), dfs)
df
result1=pd.DataFrame()
for j in Y.columns:
result=toad.metrics.KS_bucket(Y[j],Y['target'],bucket=10,method='quantile').sort_index()
result['变量名']=j
result1=result1.append(result)
画lift曲线
target=np.array([1,0,1,0,1,1,1,0,0,1,1,0,1,0,1,1,1,0,0,1])
y_pre=np.random.rand(20)
y_pre
def lift(target,y_pre):
data=pd.DataFrame({'target':target,'y_pre':y_pre})
data['y_pre']=pd.qcut(data['y_pre'],q=[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1])
data=data.groupby('y_pre').sum().reset_index()
data['apply']=1
data['cum_bad']=data['target'].cumsum()
data['cum_apply']=data['apply'].cumsum()
data['bad_total']=data['target'].sum()
data['apply_total']=data['apply'].sum()
data['cum_bad_rate']=data['cum_bad']/data['bad_total']
data['cum_apply_rate']=data['cum_apply']/data['apply_total']
data['lift']=data['cum_bad_rate']/data['cum_apply_rate']
return np.array(data['lift'])