大数据分析第五周练习（连续洗浴事件）

标签：数据分析 loc plt 练习 water sj 洗浴水流量 data

连续洗浴事件

数据预处理

import warnings
# Silence all warnings before the heavy libraries are imported so the notebook
# output stays clean. Note that this also hides pandas deprecation notices.
warnings.filterwarnings('ignore')
import pandas as pd
pd.set_option('display.max_rows',None)  # never truncate rows when a frame is displayed
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Raw string: the previous literal relied on '\大' and '\课' being left alone as
# invalid escape sequences, which newer Python versions warn about. The
# resulting path is unchanged.
original_data = pd.read_excel(r'D:\大三下大数据分析\课堂练习第五周\original_data.xls')

数据规约：去除无关特征和冗余特征

查看缺失值等基本数据信息：

# NOTE(review): `heater_data` is not assigned anywhere above this cell in the
# visible source; presumably it is the loaded frame - confirm before running.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted a spurious "None" line.
heater_data.info()
print(heater_data.describe())  # summary statistics of the numeric columns
print(heater_data.count())     # non-null count per column (reveals missing values)

条形图绘制

# Count plot of the records per water-flow state.
sns.set(style="darkgrid")
sns.set_context("notebook", font_scale=1.2, rc={"lines.linewidth": 2.5})
plt.rcParams['font.sans-serif'] = 'SimHei'   # font able to render the Chinese labels
plt.rcParams['axes.unicode_minus'] = False   # keep the minus sign renderable with that font
# catplot is a figure-level function: it always creates its own figure, sized
# by height/aspect. The plt.figure(figsize=...) call that used to precede it
# only produced an extra empty figure, so it has been removed.
sns.catplot(kind="count",
            data=heater_data,
            x="有无水流",
            aspect=1,
            height=5,
            palette='tab20')
fig = plt.gcf()  # keep `fig` bound, now to the figure that was actually drawn
plt.xlabel('水流状态')
plt.ylabel('记录数')
plt.title('不同水流状态记录数(3135)',fontsize=15)

绘制水流量分布箱线图

# Box plot of the water-flow distribution.
# catplot is figure-level and creates its own figure (sized by height/aspect),
# so the plt.figure(figsize=...) call that used to precede it only left an
# empty figure behind and has been removed.
sns.catplot(kind="box",
            data=heater_data,
            y='水流量',
            aspect=1,
            height=5,
            palette='tab20')
fig = plt.gcf()  # keep `fig` bound, now to the figure that was actually drawn

plt.title('水流量分布箱线图(3135)',fontsize=15)
# Show horizontal grid lines along the y axis.
plt.grid(axis='y')
plt.show()

数据探索分析

# Parse the compact timestamp column into real datetimes.
heater_data['发生时间'] = pd.to_datetime(heater_data['发生时间'], format='%Y%m%d%H%M%S')
# Keep only records with water flowing. Take an explicit copy so that adding a
# column below modifies an independent frame instead of a slice of
# heater_data (which triggers SettingWithCopyWarning).
water_use = heater_data[heater_data['水流量'] > 0].copy()
# Gap to the NEXT flowing record; the last record has no successor, so its
# gap is filled with zero. diff(-1) is negative for increasing timestamps,
# hence the absolute value.
gap_to_next = water_use['发生时间'].diff(-1).fillna(pd.Timedelta(seconds=0))
water_use['diff_minutes'] = (gap_to_next.dt.total_seconds() / 60).abs()

# Distribution of the gaps between consecutive flowing records.
# displot is a figure-level function and ignores any figure opened beforehand,
# so the intended 10x8 size is passed through height/aspect instead of a
# separate plt.figure(...) call that only produced an empty figure.
sns.displot(water_use['diff_minutes'], height=8, aspect=10 / 8)
plt.show()

# Share (in percent) of flowing records whose gap to the next record falls in
# each half-open interval [low, high), plus one open-ended bucket for >= 13.
bining = [0,0.1,0.2,0.3,0.5,1,2,3,4,5,6,7,8,9,10,11,12,13]
record_total = len(water_use)
gaps = water_use['diff_minutes']

# Consecutive bin edges paired up: (0, 0.1), (0.1, 0.2), ...
thr = list(zip(bining[:-1], bining[1:]))
percetage = [
    len(water_use[(gaps >= low) & (gaps < high)]) / record_total * 100
    for low, high in thr
]
# Open-ended last bucket: gaps of 13 minutes or more.
percetage.append(len(water_use[gaps >= 13]) / record_total * 100)
thr.append(13)

water_use_pause_time = pd.DataFrame({'percentage': percetage})
water_use_pause_time['time_interval'] = thr
water_use_pause_time

# Bar chart of the pause-frequency percentage per gap interval.
plt.figure(figsize = (10,8))
# One bar position per row. The tick positions reuse the same array instead of
# a hard-coded np.arange(18), so the chart stays valid if the bins change.
bar_positions = np.arange(len(water_use_pause_time))
plt.bar(bar_positions, water_use_pause_time['percentage'], alpha=0.5, width=0.3,
        color='b', edgecolor='yellow', label='每个间隔时长的停顿频率', lw=2)
plt.xticks(bar_positions, water_use_pause_time['time_interval'], rotation=90)  # rotation sets the label angle
plt.legend(loc='upper right')
plt.title('学号3135',fontsize=15)
plt.show()

“有无水流”可以通过“水流量”反映出来

# Build the working frame without the '有无水流' column.
heater_data = original_data.drop(columns='有无水流')

from sklearn import preprocessing
# Encode the categorical '节能模式' column as integer labels.
le = preprocessing.LabelEncoder()
mode_column = heater_data['节能模式']
le.fit(mode_column)
heater_data['节能模式'] = le.transform(mode_column)
# Standard deviation of the encoded column (shown as the cell output).
heater_data['节能模式'].std()

由上面的结果看出，属性“节能模式”的取值全部为“关”这一种状态，标准差为0，对建模无意义

数据清洗工作

一次完整用水事件的划分模型

# Split the flowing records into water-use events: a new event starts whenever
# the gap to the previous flowing record exceeds the threshold.
threshold = pd.Timedelta(minutes=4)
heater_data['发生时间'] = pd.to_datetime(heater_data['发生时间'], format='%Y%m%d%H%M%S')
# Explicit copy: the event-number column is added to an independent frame
# rather than to a slice of heater_data (avoids SettingWithCopyWarning).
data = heater_data[heater_data['水流量'] > 0].copy()
# True where the gap to the previous record is larger than the threshold. The
# first record has no predecessor (its diff is NaT, which compares False).
d = data['发生时间'].diff() > threshold
# The running count of "new event" flags gives a 1-based event number.
data['事件编号'] = d.cumsum() + 1
# Raw string avoids the invalid '\大' / '\课' escapes; the path is unchanged.
data.to_csv(r'D:\大三下大数据分析\课堂练习第五周\dividsequence.csv', index=False)
data

用水事件阈值寻优模型

# Count how many events each candidate threshold (1 to 10 minutes in
# 0.25-minute steps) would produce.
threshold = np.arange(1, 10.25, 0.25)
record_gaps = data['发生时间'].diff()
num_with_differentThr = [
    ((record_gaps > pd.Timedelta(minutes=candidate)).cumsum() + 1).nunique()
    for candidate in threshold
]

eventNum_with_different_thr = pd.DataFrame({
    '时间间隔': threshold,
    '事件个数': num_with_differentThr,
})
eventNum_with_different_thr

import matplotlib.pyplot as plt

# Line chart: candidate threshold (x) against resulting number of events (y).
plt.rcParams['font.sans-serif'] = ['SimHei']   # font able to render the Chinese labels
plt.rcParams['axes.unicode_minus'] = False
threshold_fig, threshold_ax = plt.subplots(figsize=(10, 5))
threshold_ax.set_title('阈值与划分事件个数关系(3135)', fontsize=20)
threshold_ax.set_xlabel('时间间隔', fontsize=14)
threshold_ax.set_ylabel('事件数', fontsize=14)
threshold_ax.plot(
    eventNum_with_different_thr['时间间隔'],
    eventNum_with_different_thr['事件个数'],
    color="deeppink",
    linewidth=2,
    linestyle=':',
)
plt.show()

阈值寻优

def event_num(ts, times=None):
    """Return the number of water-use events produced by a gap threshold.

    A new event starts wherever the gap between two consecutive timestamps is
    strictly greater than the threshold.

    Args:
        ts: Threshold in minutes.
        times: Series of datetimes in record order. Defaults to the global
            ``data['发生时间']``, which keeps existing one-argument calls
            (e.g. ``Series.apply(event_num)``) working unchanged.

    Returns:
        The number of distinct events (0 for an empty series).
    """
    if times is None:
        times = data['发生时间']
    limit = pd.Timedelta(minutes=ts)
    # True marks the first record of every event except the very first one.
    starts_new_event = times.diff() > limit
    # The running count of flags is the event id; count the distinct ids.
    return (starts_new_event.cumsum() + 1).nunique()

# Pick the threshold where the events-vs-threshold curve is flattest.
h = pd.DataFrame(threshold, columns=['阈值'])  # candidate thresholds (minutes)
h['事件数'] = h['阈值'].apply(event_num)
# Slope between each point and the next one; 0.25 is the spacing of the
# candidate thresholds. The last row has no successor, so its slope is set
# to 0. Assigning the filled result back replaces the chained
# fillna(inplace=True), which is not guaranteed to write through to `h`.
h['斜率'] = (h['事件数'].diff(-1) / 0.25).fillna(0)

# Slope indicator: mean absolute slope of this row and the following three.
# Rows without a full window of four get 0 for now.
window = 4
slope_abs = h['斜率'].abs()
last_full_window = len(slope_abs) - window
tmp = [
    slope_abs.iloc[i:i + window].sum() / window if i <= last_full_window else 0
    for i in range(len(slope_abs))
]
h['斜率指标'] = tmp
# Rows with indicator 0 (including the incomplete windows above) are pushed
# out of contention with a large sentinel value.
# NOTE(review): this also excludes genuinely flat windows - confirm intended.
h.loc[h['斜率指标'] == 0, '斜率指标'] = 1000
# idxmin gives the row label of the smallest indicator; look up its threshold.
ts = h['阈值'][h['斜率指标'].idxmin()]
if ts > 5:  # 5 minutes is the expert-defined upper bound
    # Fall back to 4 minutes, kept as a plain number of minutes so `ts` has
    # the same unit and type on both paths (it was a Timedelta here only).
    ts = 4.0
print('当前时间间隔划分方式下，最佳时间间隔是:',ts)
h

数据特征

# Feature: total water used per event.
# NOTE(review): `sj` (one row per event) is defined outside the visible source,
# and the loops address `data` by label "event record number - 1", which
# assumes a 0-based consecutive index - confirm against the code building them.
data["水流量"] = data["水流量"] / 60  # original unit L/min, converted to L/sec
# Start from a float column: the totals written below are floats, and newer
# pandas versions refuse to upcast an integer column on item assignment.
sj["总用水量"] = 0.0
for i in range(len(sj)):
    Start = sj.loc[i,"事件起始编号"]-1
    End = sj.loc[i,"事件终止编号"]-1
    if Start != End:
        total = 0.0
        for j in range(Start,End):
            flow = data.loc[j,"水流量"]
            if flow != 0:
                # Seconds until the next record times the current flow rate.
                seconds_to_next = (data.loc[j + 1,"发生时间"] -
                                   data.loc[j,"发生时间"]).seconds
                total = seconds_to_next * flow + total
        # The final record has no successor inside the event; its flow is
        # counted for 2 seconds.
        sj.loc[i,"总用水量"] = total + data.loc[End,"水流量"] * 2
    else:
        # Single-record event: that record's flow counted for 2 seconds.
        sj.loc[i,"总用水量"] = data.loc[Start,"水流量"] * 2

sj["平均水流量"] = sj["总用水量"] / sj["用水时长"]  # feature: average flow rate
# Feature: flow fluctuation
# = sum((flow - average flow)^2 * duration) / water-use duration
# Start from a float column: the values written below are floats, and newer
# pandas versions refuse to upcast an integer column on item assignment.
sj["水流量波动"] = 0.0
for i in range(len(sj)):
    Start = sj.loc[i,"事件起始编号"] - 1
    End = sj.loc[i,"事件终止编号"] - 1
    mean_flow = sj.loc[i,"平均水流量"]
    weighted_sq_dev = 0.0
    # NOTE(review): for j == End the duration is measured to record End + 1,
    # i.e. the record after the event; this needs that label to exist in
    # `data` - confirm for the last event.
    for j in range(Start,End + 1):
        flow = data.loc[j,"水流量"]
        if flow != 0:
            slbd = (flow - mean_flow)**2  # squared deviation from the event mean
            slsj = (data.loc[j + 1,"发生时间"] - data.loc[j,"发生时间"]).seconds  # duration in seconds
            weighted_sq_dev = slbd * slsj + weighted_sq_dev
    sj.loc[i,"水流量波动"] = weighted_sq_dev / sj.loc[i,"用水时长"]

# Feature: pause-duration fluctuation
# = sum((pause - average pause)^2 * pause) / total pause duration
# NOTE(review): `Stop` (one row per pause, tagged with its 1-based owning
# event) is defined outside the visible source.
# Start from a float column: the values written below are floats, and newer
# pandas versions refuse to upcast an integer column on item assignment.
sj["停顿时长波动"] = 0.0
for i in range(len(sj)):
    # With 0 or 1 pauses the fluctuation stays 0, so those events are skipped.
    if sj.loc[i,"停顿次数"] > 1:
        mean_pause = sj.loc[i,"平均停顿时长"]
        weighted_sq_dev = 0.0
        for j in Stop.loc[Stop["停顿归属事件"] == (i+1),"停顿时长"].values:
            weighted_sq_dev = ((j - mean_pause)**2) * j + weighted_sq_dev
        sj.loc[i,"停顿时长波动"] = weighted_sq_dev / sj.loc[i,"总停顿时长"]

print('用水量和波动特征构造完成后数据的特征为：\n',sj.columns)
print('用水量和波动特征构造完成后数据的前5行5列特征为：\n',sj.iloc[:5,:5])

筛选数据

# Keep only candidate bathing events: water-use duration above 100, total
# water-use duration above 120 and total water volume above 5.
sj_bool = (sj['用水时长'] >100) & (sj['总用水时长'] > 120) & (sj['总用水量'] > 5)
sj_final = sj.loc[sj_bool,:]
# Raw string avoids the invalid '\大' / '\课' escapes; the path is unchanged.
sj_final.to_excel(r'D:\大三下大数据分析\课堂练习第五周\sj_final.xlsx',index=False)
print('筛选出候选洗浴事件前的数据形状为：',sj.shape)
print('筛选出候选洗浴事件后的数据形状为：',sj_final.shape)

构建模型

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
import joblib

# Load the data. Raw strings avoid the invalid '\大' / '\课' escapes of the
# previous literals; the resulting paths are unchanged.
Xtrain = pd.read_excel(r'D:\大三下大数据分析\课堂练习第五周\sj_final.xlsx')
ytrain = pd.read_excel(r'D:\大三下大数据分析\课堂练习第五周\water_heater_log.xlsx')
test = pd.read_excel(r'D:\大三下大数据分析\课堂练习第五周\test_data.xlsx')
# Split into training / test features and labels.
# NOTE(review): training features and labels come from two different files and
# are matched purely by row position - confirm that the rows line up.
x_train = Xtrain.iloc[:,5:]      # features: column 5 onwards
y_train = ytrain.iloc[:,-1]      # labels: last column of the log file
x_test = test.iloc[:,4:-1]       # features: column 4 up to (not including) the last
y_test = test.iloc[:,-1]         # labels: last column of the test file
# Standardise with statistics fitted on the training features only.
stdScaler = StandardScaler().fit(x_train)
x_stdtrain = stdScaler.transform(x_train)
x_stdtest = stdScaler.transform(x_test)
# Build and train the neural network.
bpnn = MLPClassifier(hidden_layer_sizes = (17,10), max_iter = 200, solver = 'lbfgs',random_state=50)
bpnn.fit(x_stdtrain, y_train)
# Persist the trained model.
joblib.dump(bpnn,r'D:\大三下大数据分析\课堂练习第五周\water_heater_nnet.m')
print('构建的模型为：\n',bpnn)

模型评价与ROC曲线

# 模型评价
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve
import joblib
import matplotlib.pyplot as plt

# Raw strings avoid the invalid '\大' / '\课' escapes; the paths are unchanged.
bpnn = joblib.load(r'D:\大三下大数据分析\课堂练习第五周\water_heater_nnet.m')  # load the saved model
y_pred = bpnn.predict(x_stdtest)  # predicted labels
print('神经网络预测结果评价报告：\n',classification_report(y_test,y_pred))
# Draw the ROC curve.
plt.rcParams['font.sans-serif'] = 'SimHei'   # font able to render the Chinese title
plt.rcParams['axes.unicode_minus'] = False   # keep the minus sign renderable
# roc_curve expects (y_true, y_score). The previous call passed the predicted
# labels as the ground truth and the ground truth as the scores. Continuous
# probabilities of the positive class are used as scores so the curve has more
# than a single operating point.
positive_label = bpnn.classes_[1]
y_score = bpnn.predict_proba(x_stdtest)[:, 1]  # probability column of classes_[1]
fpr, tpr, thresholds = roc_curve(y_test, y_score, pos_label=positive_label)
plt.figure(figsize=(6,4))  # create the canvas
plt.plot(fpr,tpr)          # draw the curve
plt.title('用户用水事件识别ROC曲线(3135)',fontsize=15)
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.savefig(r'D:\大三下大数据分析\课堂练习第五周\用户用水事件识别ROC曲线(3135).png')  # save the figure
plt.show()

标签：数据分析,loc,plt,练习,water,sj,洗浴,水流量,data
From： https://www.cnblogs.com/Yyx200202----/p/17253563.html

大数据分析第五周练习（连续洗浴事件）

相关文章

赞助商

阅读排行