import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.linear_model import SGDRegressor
import joblib
import matplotlib
import os
matplotlib.use('TkAgg')
# Plot a bar chart p38
plt.rcParams['font.family'] = 'SimHei'
GDP_data = np.loadtxt("./data/GDP.csv", delimiter=",", skiprows=1)
quarter = GDP_data[8:16, 0].astype(int)
plt.bar(height=GDP_data[8:16, 1], x=range(len(GDP_data[8:16, 1])), label='第一产业GDP', tick_label=quarter)
plt.legend()
plt.show()
# Plot a scatter plot p40
iris = pd.read_csv('./data/iris.csv')
sns.set(style="whitegrid", font="simhei", font_scale=0.9)
sns.relplot(x="Petal.Length", y="Petal.Width", hue="Species", palette=["r", "b", "g"], style="Species", data=iris)
plt.show()
# Plot a line chart of the GDP data p44
sns.set(style="whitegrid", font="simhei", font_scale=0.7)
GDP_data = pd.read_csv("./data/GDP.csv")  # reload as a DataFrame so the columns can be accessed by name
GDP_data['quarter'] = GDP_data['quarter'].astype(str)
g = sns.relplot(x="quarter", y="secondary industry", kind="line", data=GDP_data)
g.fig.set_size_inches(8, 4)
plt.show()
# ///// Linear regression /////
# Predicting house prices
# Plot a scatter plot p56
df = pd.read_csv(r'data\house.txt', sep=',', header=0)
plt.scatter(df['area'], df['price'], c='b')
# Normalize the data p57
df = (df - df.min()) / (df.max() - df.min())
# Create the training and test sets p57
train_data = df.sample(frac=0.8, replace=False)
test_data = df.drop(train_data.index)
x_train = train_data['area'].values.reshape(-1, 1)
y_train = train_data['price'].values
x_test = test_data['area'].values.reshape(-1, 1)
y_test = test_data['price'].values
# Build and train the model p57 p58
model = SGDRegressor(max_iter=500, learning_rate='optimal', eta0=0.01)
model.fit(x_train, y_train)
pre_score = model.score(x_train, y_train)
print('score = ', pre_score)
print('coef = ', model.coef_, 'intercept = ', model.intercept_)
joblib.dump(model, r'.\save_model\SGDRegressor.model')
# Compute the mean squared error p59
model = joblib.load(r'.\save_model\SGDRegressor.model')
y_pred = model.predict(x_test)
print('测试集准确率得分=%.5f' % model.score(x_test, y_test))
MSE = np.mean((y_test - y_pred) ** 2)
print('损失MSE={:.5f}'.format(MSE))
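# Optional cross-check (not in the original listing): the same MSE can be obtained from
# sklearn.metrics, and the RMSE puts the error back on the scale of the normalized prices.
# A minimal sketch that only reuses the y_test / y_pred arrays defined above.
from sklearn.metrics import mean_squared_error
mse_check = mean_squared_error(y_test, y_pred)
print('MSE check={:.5f}, RMSE={:.5f}'.format(mse_check, np.sqrt(mse_check)))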
# Plot the prediction results p59
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.figure(figsize=(10, 4))
ax1 = plt.subplot(121)
plt.scatter(x_test, y_test, label='测试集')
plt.plot(x_test, y_pred, 'r', label='预测回归线')
ax1.set_xlabel('面积')
ax1.set_ylabel('价格')
plt.legend(loc='upper left')
ax2 = plt.subplot(122)
x = range(0, len(y_test))
plt.plot(x, y_test, 'g', label='真实值')
plt.plot(x, y_pred, 'r', label='预测值')
ax2.set_xlabel('样本序号')
ax2.set_ylabel('价格')
plt.legend(loc='upper right')
plt.show()
# Predicting policyholders' medical costs
df = pd.read_csv(r'data\insurance.csv', header=0)
# Data cleaning and conversion p64
df.loc[df['sex'] == 'female', 'sex'] = 0
df.loc[df['sex'] == 'male', 'sex'] = 1
df.loc[df['smoker'] == 'yes', 'smoker'] = 1
df.loc[df['smoker'] == 'no', 'smoker'] = 0
df.loc[df['region'] == 'southwest', 'region'] = 1
df.loc[df['region'] == 'southeast', 'region'] = 2
df.loc[df['region'] == 'northwest', 'region'] = 3
df.loc[df['region'] == 'northeast', 'region'] = 4
# Normalize the data p64
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(df)
df1 = scaler.transform(df)
# Build the linear regression model p65
from sklearn.linear_model import LinearRegression
model = LinearRegression()
# Prepare the training and test sets p65
from sklearn.model_selection import train_test_split
train_data = df1[:, [0, 1, 2, 3, 4, 5]]  # after the data optimization below -> train_data = df1[:,[0,1,2,3,4,5,6]]
train_target = df1[:, [6]]  # after the data optimization below -> train_target = df1[:,[7]]
x_train, x_test, y_train, y_test = train_test_split(train_data, train_target, test_size=0.3)
# Train and test the model p65
model.fit(x_train, y_train)
score = model.score(x_test, y_test)
intercept = model.intercept_
coef = model.coef_
print('模型准确性得分%.3f' % score)
func_LR = 'y=%.6f' % intercept[0]  # intercept_ is a 1-element array because the target is 2-D
for i in range(0, coef.size):
    func_LR += ('%+.6fx%d' % (coef[0][i], i))
print(func_LR)
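# Optional (not in the original listing): the score above is the R^2 coefficient of determination;
# the test error itself can also be reported. A minimal sketch using standard sklearn.metrics
# functions on the x_test / y_test split from above.
from sklearn.metrics import mean_squared_error, mean_absolute_error
y_test_pred = model.predict(x_test)
print('test MSE=%.5f, MAE=%.5f' % (mean_squared_error(y_test, y_test_pred),
                                   mean_absolute_error(y_test, y_test_pred)))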
# Visualize the prediction results p66
plt.rcParams['font.sans-serif'] = ['SimHei']
y_pred = model.predict(x_test)
plt.figure(figsize=(8, 8))
plt.scatter(y_test, y_pred, label='测试集目标值')
plt.plot(y_pred, y_pred, 'r', label='模型预测值')
plt.legend(loc='upper left')
plt.show()
# Data optimization p67 -> this optimization has to be applied in the [data cleaning and conversion] stage
def bmi(x):
    if x >= 30:
        return 1
    else:
        return 0
df.insert(6, 'bmismoker', df['smoker'] * df['bmi'].apply(bmi))
# Predict the medical costs a policyholder is likely to incur p68
person = {'age': [35], 'sex': 1, 'bmi': [31], 'children': [2], 'smoker': [1], 'region': [2], 'bmismoker': [1],
'charges': [0]}
person_df = pd.DataFrame(person)
person_data = scaler.transform(person_df)
x_person = person_data[:, [0, 1, 2, 3, 4, 5, 6]]
y_personPred = model.predict(x_person)
person_data[0, 7] = y_personPred
person_data = scaler.inverse_transform(person_data)
print('预测后的投保人信息:', person_data)
print('该投保人的预测医疗费用 charges=%.2f' % person_data[0, 7])
# ///// Classifiers /////
# Define the conversion function imgtotext p78
from PIL import Image
def imgtotext(imgfile, txtfile, size=(32, 32)):
    # imgfile: image to recognize; txtfile: text file the image is written to; size: image size, 32*32 by default
    image_file = Image.open(imgfile)
    image_file = image_file.resize(size, Image.LANCZOS)
    image_file = image_file.convert('L')
    width, height = image_file.size
    f = open(txtfile, 'w')
    ascii_char = '10'
    for i in range(height):
        pix_char = ''
        for j in range(width):
            pixel = image_file.getpixel((j, i))
            pix_char += ascii_char[int(pixel / 128)]
        pix_char += '\n'
        f.write(pix_char)
    f.close()
imgtotext(r'data\3.jpg', r'data\3_0.txt')  # convert the image into text
# Define the data conversion function for a single sample p80
# Define a function that converts the text data into a numeric array
def txt2array(filename):
    X = np.zeros((1, 1024))
    f = open(filename)
    for i in range(32):
        lineStr = f.readline()
        for j in range(32):
            X[0, 32 * i + j] = int(lineStr[j])
    f.close()
    return X
# Generate the feature values and labels for all samples p80
# Convert every file in the folder into an array plus its corresponding label
def convert2dataset(file_path):
    list_file = os.listdir(file_path)
    m = len(list_file)
    datas = np.zeros((m, 1024))
    labels = []
    for i in range(m):
        num = int(list_file[i][0])
        labels.append(num)
        datas[i, :] = txt2array(file_path + '\\' + list_file[i])
    return datas, labels
# Generate the training and test sample data p81
x_train, y_train = convert2dataset(r'data\trainingDigits')
x_test, y_test = convert2dataset(r'data\testDigits')
# Build the KNN model p81
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=43, weights='distance', p=2)
# Train the KNN model
knn.fit(x_train, y_train)
# Evaluate the model on the training set
print(knn.score(x_train, y_train))
# Evaluate the model's performance p82
from sklearn.metrics import classification_report
y_pred = knn.predict(x_test)
print(classification_report(y_test, y_pred))
i = y_test.index(8)  # index of the first digit-8 test sample
for j in range(91):  # check the 91 samples starting at that index (per the textbook example)
    if y_test[j + i] != y_pred[j + i]:
        print('{}[{}]->{}'.format(y_test[j + i], j, y_pred[j + i]), end=' ')
# Cross-tabulation p83
from sklearn.metrics import confusion_matrix
y_test = np.array(y_test)
print(confusion_matrix(y_test, y_pred))
print(pd.crosstab(y_test, y_pred, rownames=['真实值'], colnames=['预测值'], margins=True))
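# Optional (not in the original listing): the confusion matrix can also be drawn as a heatmap,
# which makes the off-diagonal misclassifications easier to spot. A sketch that reuses the
# seaborn import and the confusion_matrix function from above.
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('预测值')
plt.ylabel('真实值')
plt.show()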
# Tune the model p84 p85
neighbors = []
rang = range(13, 45)
for i in rang:
    if i % 2 == 1:
        neighbors.append(i)
train_accuracy = np.empty(len(neighbors))
test_accuracy = np.empty(len(neighbors))
for i, k in enumerate(neighbors):
    knn = KNeighborsClassifier(n_neighbors=k, weights='distance', p=2)
    knn.fit(x_train, y_train)
    train_accuracy[i] = round(knn.score(x_train, y_train), 2)
    test_accuracy[i] = round(knn.score(x_test, y_test), 2)
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.title('k值变化对准确率的影响')
plt.plot(neighbors, train_accuracy, label='训练样本准确率')
plt.plot(neighbors, test_accuracy, label='测试样本准确率')
plt.legend()
plt.xlabel('最近邻k值')
plt.ylabel('准确率值')
plt.show()
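# Optional (not in the original listing): the same k search can be done with GridSearchCV.
# A sketch under the assumption that x_train / y_train and the neighbors list from above are
# still in scope; cv=5 is an arbitrary illustrative choice, not taken from the textbook.
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(KNeighborsClassifier(weights='distance', p=2),
                    param_grid={'n_neighbors': neighbors}, cv=5)
grid.fit(x_train, y_train)
print('best k =', grid.best_params_['n_neighbors'], ', cv accuracy =', round(grid.best_score_, 3))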
# Diagnosing breast cancer with an SVM model
# Split the training and test sets p89
import sklearn.model_selection as ms
datas=pd.read_csv(r'data\wisc_bc_data.csv',sep=',')
x=datas.iloc[:,2:32]
y=datas.iloc[:,1:2]
x_train,x_test,y_train,y_test=ms.train_test_split(x,y,test_size=0.2, random_state=42)
y_train=y_train.values.ravel()
y_test=y_test.values.ravel()
# Train the SVM model on the training samples and check the training performance p90
import sklearn.svm as svm
model = svm.SVC(C=1, kernel='rbf')
model.fit(x_train, y_train)
print(model.score(x_train, y_train))
# Test the SVM model on the test samples p91
import sklearn.metrics as sm
y_pred = model.predict(x_test)
print(sm.classification_report(y_test, y_pred))
# Improve the model's performance p91 p92
from sklearn.preprocessing import MinMaxScaler
x=MinMaxScaler().fit_transform(x)
x_train,x_test,y_train,y_test=ms.train_test_split(x,y,test_size=0.2, random_state=42)
y_train=y_train.values.ravel()
y_test=y_test.values.ravel()
model=svm.SVC(C=1, kernel='rbf')
model.fit(x_train,y_train)
print(model.score(x_train,y_train))
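# Optional (not in the original listing): scaling the whole dataset before the split, as above,
# lets information from the test rows leak into the scaler. A leakage-free sketch wraps the
# scaler and the SVM in a Pipeline so the scaler is fitted on the training part only; it reuses
# the unscaled datas DataFrame loaded earlier.
from sklearn.pipeline import make_pipeline
x_raw = datas.iloc[:, 2:32]
y_raw = datas.iloc[:, 1].values.ravel()
xr_train, xr_test, yr_train, yr_test = ms.train_test_split(x_raw, y_raw, test_size=0.2, random_state=42)
pipe = make_pipeline(MinMaxScaler(), svm.SVC(C=1, kernel='rbf'))
pipe.fit(xr_train, yr_train)
print(pipe.score(xr_test, yr_test))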
# ///// Clustering /////
# Dimensionality reduction of the iris data p102
from sklearn.manifold import TSNE
datas = pd.read_csv(r'data\iris.csv', sep=',')
tsne = TSNE(n_components=2)
X_2d = tsne.fit_transform(datas.drop('Species', axis=1))  # t-SNE needs numeric features, so drop the label column
plt.figure(figsize=(9, 6))
plt.plot(X_2d[:, 0], X_2d[:, 1], "k*")
plt.show()
# Determine the best number of iris varieties (the k value) p104
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
iris_datas = pd.read_csv(r"data\iris.csv", sep=',')
iris_datas = iris_datas.drop('Species', axis=1)  # drop the label column: KMeans only accepts numeric features
sc = []
for i in range(2, 9):
    kmeans = KMeans(n_clusters=i, random_state=151).fit(iris_datas)
    score = silhouette_score(iris_datas, kmeans.labels_)
    sc.append(score)
plt.plot(range(2, 9), sc, linestyle='-')
plt.xlabel('k')
plt.ylabel('silhouette_score')
plt.show()
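# Optional cross-check (not in the original listing): the elbow of the within-cluster sum of
# squares (inertia_) usually points to the same k as the silhouette curve. A minimal sketch
# reusing the numeric iris_datas from above.
wcss = [KMeans(n_clusters=i, random_state=151).fit(iris_datas).inertia_ for i in range(2, 9)]
plt.plot(range(2, 9), wcss, linestyle='-')
plt.xlabel('k')
plt.ylabel('inertia')
plt.show()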
# Cluster the iris samples with k=3 p105
kmeans3 = KMeans(n_clusters=3,random_state=151).fit(iris_datas)
print(kmeans3.labels_)
# Plot scatter plots of the clustered samples p105
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.figure(figsize=(15, 8))
ax1 = plt.subplot(221)
plt.scatter(iris_datas['Sepal.Length'], iris_datas['Sepal.Width'], c=kmeans3.labels_)
ax1.set_xlabel('(a)花萼长度')
ax1.set_ylabel('花萼宽度')
ax2 = plt.subplot(222)
plt.scatter(iris_datas['Petal.Length'], iris_datas['Petal.Width'], c=kmeans3.labels_)
ax2.set_xlabel('(b)花瓣长度')
ax2.set_ylabel('花瓣宽度')
ax3 = plt.subplot(223)
plt.scatter(iris_datas['Sepal.Length'], iris_datas['Petal.Length'], c=kmeans3.labels_)
ax3.set_xlabel('(c)花萼长度')
ax3.set_ylabel('花瓣长度')
ax4 = plt.subplot(224)
plt.scatter(iris_datas['Sepal.Width'], iris_datas['Petal.Width'], c=kmeans3.labels_)
ax4.set_xlabel('(d)花萼宽度')
ax4.set_ylabel('花瓣宽度')
plt.show()
# Compare the centroids of the three iris clusters p108
# (the Species column has already been dropped above, so the numeric features can be clustered directly)
kmeans3 = KMeans(n_clusters=3, random_state=151).fit(iris_datas)
cluster_centers = kmeans3.cluster_centers_
feature = ['Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width']
angles = np.linspace(0, 2 * np.pi, len(feature), endpoint=False)
angles = np.concatenate((angles, [angles[0]]))
feature = np.concatenate((feature, [feature[0]]))  # repeat the first label so it matches the closed angle list
plt.figure(figsize=(8, 4))
ax1 = plt.subplot(111, polar=True)
for i, values in enumerate(cluster_centers):
    values = np.concatenate((values, [values[0]]))
    ax1.plot(angles, values, 'o-', linewidth=2, label='类' + str(i) + '质心')
ax1.set_thetagrids(angles * 180 / np.pi, feature)
plt.legend()
plt.show()
# Remove irrelevant data p110
from sklearn import preprocessing
from sklearn import metrics
kfm_datas = pd.read_csv(r'data\RFM.csv')
kfm_datas1 = kfm_datas.iloc[:,1:]
# Standardize the data p110
X = preprocessing.StandardScaler().fit_transform(kfm_datas1)
# Compute clustering quality metrics for the customer segmentation under different k values p110
ch_score = []
ss_score = []
inertia = []
for k in range(2, 10):
    kmeans = KMeans(n_clusters=k, max_iter=1000)
    pred = kmeans.fit_predict(X)
    ch = metrics.calinski_harabasz_score(X, pred)
    ss = metrics.silhouette_score(X, pred)
    ch_score.append(ch)
    ss_score.append(ss)
    inertia.append(kmeans.inertia_)
# Plot how the three internal clustering metrics change with k p111
plt.figure(figsize=(10, 4))
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.subplot(131)
plt.plot(list(range(2, 10)), ch_score, c='y', label='CH分数')
plt.legend()
plt.subplot(132)
plt.plot(list(range(2, 10)), ss_score, c='b', label='轮廓系数')
plt.legend()
plt.subplot(133)
plt.plot(list(range(2, 10)), inertia, c='g', label='方差值inertia')
plt.legend()
plt.show()
# Re-cluster with the chosen k p112
kfm_datas = pd.read_csv(r'data\RFM.csv')
kfm_datas1 = kfm_datas.iloc[:, 1:]
stand_scaler = preprocessing.StandardScaler()
X = stand_scaler.fit_transform(kfm_datas1)  # fit the scaler here so it can be used for inverse_transform below
kmeans = KMeans(n_clusters=3, random_state=151, max_iter=1000)
labels = pd.Series(kmeans.fit_predict(X))
# Compute the centroid data p112
centers = stand_scaler.inverse_transform(kmeans.cluster_centers_)
centers = pd.DataFrame(centers)
result = pd.concat([centers, labels.value_counts().sort_index(ascending=True)], axis=1)
result.columns = list(kfm_datas1.columns) + ['counts']
# Plot line charts of the RFM metrics for each customer group p113
plt.figure(figsize=(10, 4))
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.subplot(131)
plt.plot(list(range(1, 4)), result.R_days, c='y', label='R指标')
plt.legend()
plt.subplot(132)
plt.plot(list(range(1, 4)), result.F_times, c='b', label='F指标')
plt.legend()
plt.subplot(133)
plt.plot(list(range(1, 4)), result.M_money, c='g', label='M指标')
plt.legend()
plt.show()
# ///// Recommendation algorithms /////
# Merge data p130
df1 = pd.DataFrame([[3, 5, 6, 7], [10, 4, 5, 3], [2, 3, 7, 9], [10, 4, 9, 8]], columns=list('ABCD'))
df2 = pd.DataFrame([[6, 8, 4], [11, 8, 9], [7, 5, 1], [8, 14, 9]], columns=list('CEF'))
data = pd.merge(df1, df2, on='C')
print(data)
# data2 = pd.concat([df1,df2],axis=1)
# print(data2)
# Merge the movie information and the ratings table p131
movies = pd.read_csv(r'data\movies.csv')
ratings = pd.read_csv(r'data\ratings.csv')
data = pd.merge(movies, ratings, on='movieId')
data = data[['userId', 'rating', 'movieId', 'title']].sort_values('userId')
# Collect the movies rated by each user and their ratings p132
datas = {}
for index, line in data.iterrows():
    if not line['userId'] in datas.keys():
        datas[line['userId']] = {line['title']: line['rating']}
    else:
        datas[line['userId']][line['title']] = line['rating']
# Compute the similarity between two users, using the Euclidean distance p132 p133
from math import *
def sim_Euclidean(userId1, userId2):
    user1_data = datas[userId1]
    user2_data = datas[userId2]
    distance = 0
    count = 0
    flag = False
    for key in user1_data.keys():
        if key in user2_data.keys():
            flag = True
            distance += pow(float(user1_data[key]) - float(user2_data[key]), 2)
            count += 1
    if flag:
        return 1 / (1 + sqrt(distance) / count)
    else:
        return -1
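# Optional (not in the original listing): a Pearson-correlation similarity is a common alternative
# to the Euclidean one above, since it ignores differences in users' rating scales. A sketch using
# the same datas dictionary; it returns -1 when the two users share fewer than two rated movies.
def sim_pearson(userId1, userId2):
    common = [m for m in datas[userId1] if m in datas[userId2]]
    if len(common) < 2:
        return -1
    r1 = [float(datas[userId1][m]) for m in common]
    r2 = [float(datas[userId2][m]) for m in common]
    corr = np.corrcoef(r1, r2)[0, 1]
    return 0 if np.isnan(corr) else corr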
# Find the top n users most similar to a given user p133
def top10_simliar(userId, n=10):
    result = []
    for user_Id in datas.keys():
        if not user_Id == userId:
            simliar = sim_Euclidean(userId, user_Id)
            result.append((user_Id, simliar))
    result.sort(key=lambda val: val[1], reverse=True)
    return result[0:n]
users = top10_simliar(1, 10)
# Find m movies that the nearest-neighbour users have watched but this user has not p134
def recommend_films(userId, n=10, m=10):
    top_sim_user_list = top10_simliar(userId, n)
    users = []
    for res in top_sim_user_list:
        users.append(res[0])
    recommendations = []
    for user_id in users:
        items = datas[user_id]
        for item in items.keys():
            if item not in datas[userId].keys():
                recommendations.append((item, items[item]))
    recommendations.sort(key=lambda val: val[1], reverse=True)
    return recommendations[0:m]
# Recommend the top m movies to the specified user p134
recommend_top10films = recommend_films(2, m=8)
for item, rating in recommend_top10films:
    print('{:50}{}'.format(item, rating))
# Product recommendation ------------
# Convert the records into transaction-style data p136
from mlxtend.preprocessing import TransactionEncoder
record=[['牛奶','洋葱','鸡蛋','酸奶'],['洋葱','猪肉','芸豆'],['牛奶','苹果','芸豆','鸡蛋'],['玉米','洋葱','芸豆','冰淇淋','鸡蛋']]
te_Encoder = TransactionEncoder()
te_array = te_Encoder.fit(record).transform(record)
print(te_Encoder.columns_)
print(te_array)
# Compute the frequent itemsets and association rules of the shopping lists p138
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
df_datas=pd.DataFrame(te_array,columns=te_Encoder.columns_)
freq_item=apriori(df_datas,min_support=0.5,use_colnames=True)
print(freq_item)
rules=association_rules(freq_item,min_threshold=0.6)
print(rules)
# Read the file data into a list p139
import csv
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
ls_data=[]
with open(r'data\groceries.csv', 'r') as f:
    reader = csv.reader(f)
    for row in reader:
        ls_data.append(row)
# Apply transaction encoding to the list data p139
te = TransactionEncoder()
te_array = te.fit(ls_data).transform(ls_data)
df = pd.DataFrame(te_array, columns=te.columns_)
# Determine a reasonable minimum support value p140
mean_supp = 1 - np.mean((df.describe()).loc['freq'][:]) / 9835  # 9835 = number of transactions in groceries.csv
print('一种商品平均被购买的概率:',mean_supp)
# Find the frequent itemsets p141
freq_item = apriori(df, min_support=0.02, max_len=2, use_colnames=True)
print(freq_item.sort_values(by='support', axis=0, ascending=False))
print(freq_item.loc[freq_item['itemsets'].str.len() > 1].sort_values(by='support', axis=0, ascending=False))
# Mine the association rules p142
rules=association_rules(freq_item,min_threshold=0.25) # 0.5
print(rules.sort_values(by='confidence', axis=0, ascending=False))
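# Optional (not in the original listing): association_rules returns antecedents, consequents,
# support, confidence and lift columns, so the strongest rules can be filtered directly.
# The thresholds below are illustrative choices, not values from the textbook.
strong_rules = rules[(rules['lift'] > 1.2) & (rules['confidence'] > 0.3)]
print(strong_rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']].sort_values(by='lift', ascending=False))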