标签：数据分析实战 plt Python cart test import model data

非原创，仅个人关于《Python数据分析与挖掘实战》的学习笔记

窃漏电数据分析

导入相关库

import warnings
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xlrd
# Fix garbled Chinese text in plots: use the SimHei font and turn off the
# Unicode minus sign.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})
# Silence all warnings for the rest of the session.
warnings.filterwarnings(action="ignore")

数据读取

# Location of the workbook that contains the missing values.
inputfile_path = '/Notebook/读书笔记/Python数据分析与挖掘实战/data/missing_data.xls'
# outputfile_path = '/Notebook/读书笔记/Python数据分析与挖掘实战/data/missing_data_processed.xls'

# Load the sheet without a header row, so the columns are labelled 0, 1, 2, ...
data = pd.read_excel(io=inputfile_path, header=None)

type(data)

pandas.core.frame.DataFrame

data

	0	1	2
0	235.8333	324.0343	478.3231
1	236.2708	325.6379	515.4564
2	238.0521	328.0897	517.0909
3	235.9063	NaN	514.8900
4	236.7604	268.8324	NaN
5	NaN	404.0480	486.0912
6	237.4167	391.2652	516.2330
7	238.6563	380.8241	NaN
8	237.6042	388.0230	435.3508
9	238.0313	206.4349	487.6750
10	235.0729	NaN	NaN
11	235.5313	400.0787	660.2347
12	NaN	411.2069	621.2346
13	234.4688	395.2343	611.3408
14	235.5000	344.8221	643.0863
15	235.6354	385.6432	642.3482
16	234.5521	401.6234	NaN
17	236.0000	409.6489	602.9347
18	235.2396	416.8795	589.3457
19	235.4896	NaN	556.3452
20	236.9688	NaN	538.3470

data.columns

Index([0, 1, 2], dtype='int64')

len(data)

数据预处理

拉格朗日插值

import pandas as pd
from scipy.interpolate import lagrange

# 定义拉格朗日插值函数
def ployinterp_column(s, n, k=5):
    """Estimate the value at position ``n`` of ``s`` by Lagrange interpolation.

    Up to ``k`` neighbours on each side of ``n`` are used (fewer near either
    end of the series); missing neighbours are dropped before fitting.
    Neighbours are addressed by position, so the series may carry any index,
    not only the default 0..len-1 labels.

    Args:
        s: pandas Series holding the column to repair.
        n: Zero-based position of the value to interpolate.
        k: Number of neighbours to take on each side of ``n``.

    Returns:
        The interpolated value, or the value currently stored at ``n`` when
        fewer than two non-missing neighbours are available.
    """
    # Clamp the window so it never runs past either end of the series.
    start = max(0, n - k)
    end = min(len(s), n + k + 1)

    # Neighbouring positions, excluding n itself.
    positions = [pos for pos in range(start, end) if pos != n]

    # Key the neighbours by position so the polynomial is fitted in the same
    # coordinate system it is evaluated in, then drop the missing ones.
    y = pd.Series(s.iloc[positions].to_numpy(), index=positions).dropna()

    if len(y) < 2:
        # Not enough points to interpolate; hand back the stored value.
        return s.iloc[n]

    return lagrange(y.index, y.values)(n)


# Walk every column and fill each missing cell with its interpolated value.
# Cells are filled one at a time, so values filled earlier in a column are
# visible to the interpolation of later cells.
for col in data.columns:
    missing_rows = [row for row in range(len(data)) if pd.isna(data.at[row, col])]
    for row in missing_rows:
        try:
            data.at[row, col] = ployinterp_column(data[col], row)
        except ValueError as err:
            print(f"插值错误在列 {col} 的索引 {row}: {err}")

data

	0	1	2
0	235.833300	324.034300	478.323100
1	236.270800	325.637900	515.456400
2	238.052100	328.089700	517.090900
3	235.906300	203.462116	514.890000
4	236.760400	268.832400	493.352591
5	237.151181	404.048000	486.091200
6	237.416700	391.265200	516.233000
7	238.656300	380.824100	493.342382
8	237.604200	388.023000	435.350800
9	238.031300	206.434900	487.675000
10	235.072900	237.348072	609.193564
11	235.531300	400.078700	660.234700
12	235.314951	411.206900	621.234600
13	234.468800	395.234300	611.340800
14	235.500000	344.822100	643.086300
15	235.635400	385.643200	642.348200
16	234.552100	401.623400	618.197198
17	236.000000	409.648900	602.934700
18	235.239600	416.879500	589.345700
19	235.489600	420.748600	556.345200
20	236.968800	408.963200	538.347000

数据转换

那什么样的数据才是存在窃漏电的情况呢？

书中提出了一套指标公式，通过这些公式可以将上面各种复杂繁多的数据变换为三项指标的简单数据。很可惜，书中并没有给出数据转换的具体过程，而是直接给出了结果，如下所示：

# model.xls

模型构建

构建窃漏电用户识别模型

# Deciding whether a user steals/leaks electricity is a classification task.

model_path = '/Notebook/读书笔记/Python数据分析与挖掘实战/data/model.xls'

# Random helper intended for shuffling the data (not called in the visible listing).
from random import shuffle
# Load the indicator table into a DataFrame.
model_data = pd.read_excel(io=model_path)
type(model_data)

pandas.core.frame.DataFrame

model_data

	电量趋势下降指标	线损指标	告警类指标	是否窃漏电
0	4	1	1	1
1	4	0	4	1
2	2	1	1	1
3	9	0	0	0
4	3	1	0	0
...	...	...	...	...
286	4	1	2	0
287	1	0	2	0
288	5	1	2	1
289	2	1	0	0
290	4	1	0	0

291 rows × 4 columns

# Shuffle the rows (frac=1 samples every row), then renumber the index from 0.
model_data = (
    model_data
    .sample(frac=1)
    .reset_index(drop=True)
)

model_data

	电量趋势下降指标	线损指标	告警类指标	是否窃漏电
0	3	1	2	0
1	1	0	1	0
2	3	0	2	0
3	3	1	1	0
4	4	1	1	1
...	...	...	...	...
286	5	0	1	0
287	5	1	1	1
288	4	0	2	0
289	4	0	0	0
290	6	0	0	0

291 rows × 4 columns

# model_data is the DataFrame to split; p is the fraction of rows used for training.
p = 0.8
split_index = int(len(model_data) * p)

# Positional split: the first split_index rows train, the remainder tests.
train, test = (
    model_data.iloc[:split_index, :],
    model_data.iloc[split_index:, :],
)

LM神经网络（原书使用 Keras，此处改用 scikit-learn 的 MLPClassifier 实现）

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load the indicator table.
model_path = '/Notebook/读书笔记/Python数据分析与挖掘实战/data/model.xls'
model_data = pd.read_excel(model_path)

# Row shuffling is left disabled, so the split below follows file order.
# model_data = model_data.sample(frac=1).reset_index(drop=True)

# Fraction of rows used for training.
p = 0.8
split_index = int(len(model_data) * p)

# Positional split: leading rows train, trailing rows test.
train, test = model_data.iloc[:split_index, :], model_data.iloc[split_index:, :]

# The last column is taken to be the label; everything before it is a feature.
X_train, y_train = train.iloc[:, :-1].values, train.iloc[:, -1].values
X_test, y_test = test.iloc[:, :-1].values, test.iloc[:, -1].values

# Min-max scale the features: fit on the training rows only, then apply the
# same scaling to the test rows.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Neural-network classifier with one hidden layer of 100 ReLU units.
# NOTE: the notes call this an "LM" network, but the solver set here is Adam.
mlp = MLPClassifier(
    hidden_layer_sizes=(100,),
    activation='relu',
    solver='adam',
    max_iter=2000,
    random_state=42,
)

# Train, then predict the held-out rows.
mlp.fit(X_train, y_train)
y_pred = mlp.predict(X_test)

# Evaluation metrics on the test rows.
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print("准确率:", accuracy)
print("混淆矩阵:\n", conf_matrix)
print("分类报告:\n", class_report)

准确率: 0.9491525423728814
混淆矩阵:
 [[54  3]
 [ 0  2]]
分类报告:
               precision    recall  f1-score   support

           0       1.00      0.95      0.97        57
           1       0.40      1.00      0.57         2

    accuracy                           0.95        59
   macro avg       0.70      0.97      0.77        59
weighted avg       0.98      0.95      0.96        59

import matplotlib.pyplot as plt
import seaborn as sns

# Draw the confusion matrix as an annotated heatmap (integer cell labels).
plt.figure(figsize=(6, 4))
heatmap_ax = sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
heatmap_ax.set_title('混淆矩阵')
heatmap_ax.set_xlabel('预测')
heatmap_ax.set_ylabel('实际')
plt.show()

ROC曲线评价

from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# Predicted probability of the positive class for each test row
# (column 1 of predict_proba is assumed to correspond to class 1).
y_prob = mlp.predict_proba(X_test)[:, 1]

# False/true positive rates over all thresholds; adjust pos_label if the
# positive class is encoded differently in your data.
fpr, tpr, thresholds = roc_curve(y_test, y_prob, pos_label=1)

# Area under the ROC curve.
roc_auc = auc(fpr, tpr)

# Plot the ROC curve together with the diagonal reference line.
roc_label = 'ROC curve (area = %0.2f)' % roc_auc
roc_fig, roc_ax = plt.subplots()
roc_ax.plot(fpr, tpr, color='darkorange', lw=2, label=roc_label)
roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Receiver Operating Characteristic')
roc_ax.legend(loc="lower right")
plt.show()

CART决策树模型

决策树是一种流行的机器学习算法，用于分类和回归任务。它通过递归地将数据集分割成越来越小的子集，直到满足停止条件，从而构建决策树。每个内部节点代表一个特征上的测试，每个分支代表测试的结果，每个叶节点代表一个预测值。

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt

# Load the indicator table for the CART experiment.
model_path_cart = '/Notebook/读书笔记/Python数据分析与挖掘实战/data/model.xls'
model_data_cart = pd.read_excel(model_path_cart)

# Row shuffling is left disabled, so the split below follows file order.
# model_data_cart = model_data_cart.sample(frac=1).reset_index(drop=True)

# Fraction of rows used for training.
p = 0.8
split_index_cart = int(len(model_data_cart) * p)

# Split this cell's own frame with this cell's own split point, rather than
# the variables left over from the neural-network cell.
train_cart = model_data_cart.iloc[:split_index_cart, :]
test_cart = model_data_cart.iloc[split_index_cart:, :]

# The last column is taken to be the label; everything before it is a feature.
X_train_cart = train_cart.iloc[:, :-1].values
y_train_cart = train_cart.iloc[:, -1].values
X_test_cart = test_cart.iloc[:, :-1].values
y_test_cart = test_cart.iloc[:, -1].values

# Min-max scale with a scaler dedicated to this experiment, so the scaler
# fitted for the neural network is not refitted as a side effect.
scaler_cart = MinMaxScaler()
X_train_cart = scaler_cart.fit_transform(X_train_cart)
X_test_cart = scaler_cart.transform(X_test_cart)


# Build the decision tree (fixed seed for reproducible fits).
dt = DecisionTreeClassifier(random_state=42)

# Train the model.
dt.fit(X_train_cart, y_train_cart)

# Predict the held-out rows.
y_pred_cart = dt.predict(X_test_cart)

# Evaluation metrics on the test rows.
accuracy_cart = accuracy_score(y_test_cart, y_pred_cart)
conf_matrix_cart = confusion_matrix(y_test_cart, y_pred_cart)
class_report_cart = classification_report(y_test_cart, y_pred_cart)

print("准确率:", accuracy_cart)
print("混淆矩阵:\n", conf_matrix_cart)
print("分类报告:\n", class_report_cart)

准确率: 0.9491525423728814
混淆矩阵:
 [[54  3]
 [ 0  2]]
分类报告:
               precision    recall  f1-score   support

           0       1.00      0.95      0.97        57
           1       0.40      1.00      0.57         2

    accuracy                           0.95        59
   macro avg       0.70      0.97      0.77        59
weighted avg       0.98      0.95      0.96        59

绘制ROC曲线

#绘制ROC曲线
from sklearn.metrics import roc_curve, auc

# Binary problem with class 1 as the positive class: take the predicted
# probability of class 1 for each test row.
y_prob_cart = dt.predict_proba(X_test_cart)[:, 1]
fpr_cart, tpr_cart, thresholds_cart = roc_curve(y_test_cart, y_prob_cart, pos_label=1)
# AUC must be computed from the decision tree's own curve, not from the
# fpr/tpr of the neural-network cell.
roc_auc_cart = auc(fpr_cart, tpr_cart)

# Plot the ROC curve together with the diagonal reference line.
plt.figure()
plt.plot(fpr_cart, tpr_cart, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc_cart)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()

import matplotlib.pyplot as plt
import seaborn as sns

# Draw the decision tree's confusion matrix as an annotated heatmap
# (integer cell labels).
plt.figure(figsize=(6, 4))
heatmap_ax_cart = sns.heatmap(conf_matrix_cart, annot=True, fmt='d', cmap='Blues')
heatmap_ax_cart.set_title('混淆矩阵')
heatmap_ax_cart.set_xlabel('预测')
heatmap_ax_cart.set_ylabel('实际')
plt.show()

# Visualise the fitted decision tree.
# plot_tree is not imported anywhere earlier in this listing, so import it
# here to avoid a NameError.
from sklearn.tree import plot_tree

plt.figure(figsize=(20, 10))
plot_tree(dt, filled=True)
plt.show()

png

标签：数据分析,实战,plt,Python,cart,test,import,model,data
From： https://www.cnblogs.com/zhouwp/p/18220701

Python数据分析与挖掘实战（6章）