标签：数据分析实战 plt Python df 01 pd np import

非原创，仅个人关于《Python数据分析与挖掘实战》的学习笔记

5 挖掘建模

import warnings
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Avoid garbled Chinese text in figures: use the SimHei font and disable the
# Unicode minus sign so negative tick labels still render with that font.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})
# Suppress every warning for the rest of the session.
warnings.filterwarnings(action="ignore")

5.1 分类与预测

5.1.1 实现过程

预测

预测是指建立两种或两种以上变量间相互依赖的函数模型，然后进行预测或控制。

1、通过训练集建立预测属性的函数模型
2、在模型通过检验后进行预测或控制

5.1.2 常用的分类与预测算法

回归分析

线性回归
非线性回归
逻辑回归
岭回归
主成分回归
偏最小二乘回归

决策树

自顶向下的递归方式，在内部节点进行属性值的比较，并根据不同的属性值从该节点向下分支，最终得到的叶节点是学习划分的类。

贝叶斯

目前不确定知识表达和推理领域最有效的理论模型之一。

SVM

强大的模型，可以用来回归、预测、分类等；根据选取的核函数不同，模型可以是线性的或非线性的。

随机森林

精度高于决策树，缺点是随机性，丧失了决策树的可解释性。

5.1.3 回归分析

import pandas as pd
from io import StringIO

# Sample records, transcribed by hand from an image into CSV text
# (one header line followed by five data rows).
data = (
    "年龄,教育,工龄,地址,收入,负债率,信用卡负债,其他负债,违约\n"
    "41,3,17,12,176.00,9.30,11.36,5.01,1\n"
    "27,1,10,6,31.00,17.30,1.36,4.00,0\n"
    "40,1,15,14,55.00,5.50,0.86,2.17,0\n"
    "41,1,15,14,120.00,2.90,2.66,0.82,0\n"
    "24,2,2,0,28.00,17.30,1.79,3.06,1\n"
)

# Parse the CSV text through an in-memory text buffer.
csv_buffer = StringIO(data)
df = pd.read_csv(csv_buffer)

# Notebook echo: display the resulting DataFrame.
df

	年龄	教育	工龄	地址	收入	负债率	信用卡负债	其他负债	违约
0	41	3	17	12	176.0	9.3	11.36	5.01	1
1	27	1	10	6	31.0	17.3	1.36	4.00	0
2	40	1	15	14	55.0	5.5	0.86	2.17	0
3	41	1	15	14	120.0	2.9	2.66	0.82	0
4	24	2	2	0	28.0	17.3	1.79	3.06	1

逻辑回归（Logistic）

import pandas as pd
import numpy as np
from io import StringIO
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import classification_report, accuracy_score

# Seed sample: five hand-entered records as CSV text.
data = (
    "年龄,教育,工龄,地址,收入,负债率,信用卡负债,其他负债,违约\n"
    "41,3,17,12,176.00,9.30,11.36,5.01,1\n"
    "27,1,10,6,31.00,17.30,1.36,4.00,0\n"
    "40,1,15,14,55.00,5.50,0.86,2.17,0\n"
    "41,1,15,14,120.00,2.90,2.66,0.82,0\n"
    "24,2,2,0,28.00,17.30,1.79,3.06,1\n"
)
df = pd.read_csv(StringIO(data))

# Fix the global NumPy seed so the generated rows are reproducible.
np.random.seed(42)

# (column, sampler, low, high) for the 100 synthetic rows. The order matters:
# columns are drawn one after another from the same global random stream.
_COLUMN_SPECS = [
    ('年龄', np.random.randint, 20, 60),
    ('教育', np.random.randint, 1, 5),
    ('工龄', np.random.randint, 1, 35),
    ('地址', np.random.randint, 0, 20),
    ('收入', np.random.uniform, 20.0, 200.0),
    ('负债率', np.random.uniform, 1.0, 20.0),
    ('信用卡负债', np.random.uniform, 0.5, 15.0),
    ('其他负债', np.random.uniform, 0.1, 10.0),
    ('违约', np.random.randint, 0, 2),
]
new_data = {
    column: sampler(low, high, 100)
    for column, sampler, low, high in _COLUMN_SPECS
}
new_df = pd.DataFrame(new_data)

# Append the synthetic rows below the hand-entered ones and renumber the index.
df = pd.concat((df, new_df), ignore_index=True)

# Target is the default flag; every other column is a feature.
y = df['违约']
X = df.drop(columns=['违约'])

	年龄	教育	工龄	地址	收入	负债率	信用卡负债	其他负债
0	41	3	17	12	176.000000	9.300000	11.360000	5.010000
1	27	1	10	6	31.000000	17.300000	1.360000	4.000000
2	40	1	15	14	55.000000	5.500000	0.860000	2.170000
3	41	1	15	14	120.000000	2.900000	2.660000	0.820000
4	24	2	2	0	28.000000	17.300000	1.790000	3.060000
...	...	...	...	...	...	...	...	...
100	28	2	30	7	160.633289	15.241988	14.800726	2.129518
101	27	1	8	12	40.423628	8.466800	8.280564	3.854363
102	31	1	27	0	187.567239	16.754120	13.898606	9.271849
103	53	3	27	15	195.364678	11.812548	3.923690	7.243806
104	52	2	34	6	199.267624	2.206725	11.519353	0.576137

105 rows × 8 columns

0      1
1      0
2      0
3      0
4      1
      ..
100    1
101    0
102    0
103    1
104    0
Name: 违约, Length: 105, dtype: int64

# Hold out 20% of the rows as a test set (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Feature screening: fit a logistic regression on the training rows and let
# SelectFromModel keep the columns that pass its importance threshold.
sel = SelectFromModel(
    LogisticRegression(random_state=42, solver='liblinear')
).fit(X_train, y_train)

# Column names of the features the selector kept.
selected_features = X.columns[sel.get_support()]

# Fit the final logistic regression on the selected columns only.
model = LogisticRegression(random_state=42, solver='liblinear').fit(
    X_train[selected_features], y_train
)

# Predictions for the held-out rows (computed but not used by the report below).
y_pred = model.predict(X_test[selected_features])

# Accuracy on the TRAINING set, stored as a string with two decimals.
train_score = f"{model.score(X_train[selected_features], y_train):.2f}"

print(f"选中的特征: {selected_features}")
print(f"模型准确率: {train_score}")

选中的特征: Index(['地址', '其他负债'], dtype='object')
模型准确率: 0.67

简单例子

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Assumes X (feature matrix) and y (label vector) already exist in the session.
# Hold out 20% of the rows for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Build a logistic regression with default settings and fit it on the training rows.
model = LogisticRegression().fit(X_train, y_train)

# Predict labels for the held-out rows.
y_pred = model.predict(X_test)

# Share of held-out rows that were predicted correctly.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"准确率: {accuracy}")

准确率: 0.6190476190476191

5.1.4 决策树

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn import tree

# make_classification parameters, for reference:
#   n_samples: number of samples in the dataset.
#   n_features: number of features (variables) in the dataset.
#   n_informative: number of informative features that influence the class.
#   n_redundant: number of fully redundant features (predictable from the others).
#   n_repeated: number of duplicated features.
#   n_classes: number of classes in the dataset.
#   n_clusters_per_class: number of clusters per class.
#   weights: weight (proportion) of each class.
#   flip_y: probability of flipping a label, used to inject noise.
#   random_state: seed that makes the result reproducible.

# Build a synthetic dataset with scikit-learn's make_classification.
# NOTE: the four generated columns are anonymous numeric features. The
# "age / weight / experience / health" reading used further down is only an
# illustrative story, not something the generator encodes.
X, y = make_classification(n_samples=1000, n_features=4, n_informative=2, n_redundant=0, random_state=42)

# Split into training and test sets (80/20, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the decision tree classifier. It is seeded like the data and the split,
# so the fitted tree (and therefore the printed results) is reproducible.
clf = DecisionTreeClassifier(random_state=42)

# Fit the model on the training rows.
clf.fit(X_train, y_train)

# Predict labels for the test rows.
y_pred = clf.predict(X_test)

# Accuracy on the held-out test set.
accuracy = accuracy_score(y_test, y_pred)
print(f"模型准确率: {accuracy:.2f}")

# Predict a new example.
# Illustrative reading: age 30, weight 70 kg, intermediate experience, good health
# (experience: intermediate=1, other=0; health: good=0, other=1).
# NOTE(review): the model was trained on synthetic features, so these raw values
# are not on the training scale and the prediction is only a demonstration.
new_example = [[30, 70, 1, 0]]
print("新例子是否适合爬山: ", "适合" if clf.predict(new_example)[0] == 1 else "不适合")

模型准确率: 0.89
新例子是否适合爬山:  不适合

使用决策树模型预测销量

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

# Generate a reproducible synthetic sales table. Columns are drawn one after
# another from the same seeded global random stream, so the order matters.
np.random.seed(42)
N_SAMPLES = 200
data = {}
data['广告费用'] = np.random.uniform(1000, 5000, N_SAMPLES)
data['商店大小'] = np.random.uniform(50, 500, N_SAMPLES)
data['季节'] = np.random.choice(['春', '夏', '秋', '冬'], N_SAMPLES)
data['销量'] = np.random.uniform(1000, 20000, N_SAMPLES)

# One-hot encode the season column, dropping the first category.
df = pd.get_dummies(pd.DataFrame(data), columns=['季节'], drop_first=True)

# Target is the sales volume; every remaining column is a feature.
y = df['销量']
X = df.drop(columns=['销量'])

# Hold out 20% of the rows for evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a seeded decision tree regressor on the training rows.
model = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)

# Predict sales for the held-out rows.
y_pred = model.predict(X_test)

# Evaluate on the held-out rows: mean squared error and R^2.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print(f"均方误差: {mse:.2f}")
print(f"拟合度: {r2:.2f}")

# Scatter the predictions against the true values; the dashed red diagonal
# marks where a perfect prediction would fall.
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, alpha=0.7)
lo, hi = min(y_test), max(y_test)
plt.plot([lo, hi], [lo, hi], 'r--')
plt.xlabel("真实值")
plt.ylabel("预测值")
plt.title("真实值与预测值的比较")
plt.show()

均方误差: 46978560.88
拟合度: -0.89

# Notebook echo: display the DataFrame after the season column was one-hot encoded.
df

	广告费用	商店大小	销量	季节_夏	季节_春	季节_秋
0	2498.160475	338.914241	14265.072566	False	True	False
1	4802.857226	87.862984	11185.830961	True	False	False
2	3927.975767	122.732921	6881.024709	False	True	False
3	3394.633937	454.349385	16462.105374	False	True	False
4	1624.074562	322.893077	14009.892279	True	False	False
...	...	...	...	...	...	...
195	2396.838298	468.840797	12601.780806	False	True	False
196	3903.822715	436.285738	6483.980512	False	True	False
197	4588.441040	243.047312	12043.526207	False	False	False
198	4548.345697	387.891981	3932.891590	False	False	False
199	4119.502183	389.544293	10141.661935	False	False	True

200 rows × 6 columns

5.1.5 神经网络

# 人工神经元模型

# 人工神经网络的学习=>训练（fit）

什么叫训练？

是指神经网络在受到外部环境的刺激下调整神经网络的参数，使神经网络以一种新的方式对外部环境作出反应的一个过程。

激活函数是什么？

1.阈值函数

2.分段线性函数（此处以 Tanh (Hyperbolic Tangent) 为例说明；严格来说 Tanh 是平滑的非线性函数，并非分段线性函数）

# 将输入压缩到-1和1之间，比Sigmoid有更宽的输出范围。

3.非线性转移函数（Sigmoid）

# 将输入压缩到0和1之间，通常用于二分类问题。

4.Relu函数

# 当输入大于0时输出输入值，否则输出0。它计算简单，训练速度快，是目前最流行的激活函数之一。

5.PReLU

# Leaky ReLU的泛化形式，其中负半轴的斜率是一个可学习的参数。
标签：数据分析,实战,plt,Python,df,01,pd,np,import	

From： https://www.cnblogs.com/zhouwp/p/18217240

Python数据分析与挖掘实战（5章）