一、描述性统计分析和相关系数矩阵
代码:
import numpy as np
import pandas as pd

# NOTE: use consistent double backslashes. The original mixed '\大' (an
# invalid escape sequence, SyntaxWarning on Python 3.12+) with '\\'; the
# runtime value is unchanged by this fix.
inputfile = 'F:\\大数据分析\\data.csv'
data = pd.read_csv(inputfile)

# Descriptive statistics: per-column min / max / mean / std,
# transposed so each row is a variable and each column a statistic.
# Assumes every column in data.csv is numeric -- TODO confirm.
description = [data.min(), data.max(), data.mean(), data.std()]
description = pd.DataFrame(description, index=['Min', 'Max', 'Mean', 'STD']).T
print('描述性统计结果:\n', np.round(description, 2))
运行结果:
二、相关系数矩阵:
代码:
# Pairwise Pearson correlation between all numeric columns of `data`.
corr = data.corr(method='pearson')
print('相关系数矩阵为:\n', corr.round(2))
运行结果:
三、相关性热力图:
代码:
import matplotlib.pyplot as plt
import seaborn as sns

# Configure the CJK-capable font BEFORE drawing so the Chinese title
# renders correctly (the original set rcParams after the plot existed).
plt.rcParams['font.sans-serif'] = ['SimHei']

# Annotated correlation heatmap of `corr` (computed above).
plt.subplots(figsize=(10, 10))
sns.heatmap(corr, annot=True, vmax=1, square=True, cmap="Blues")
plt.title("相关性热力图 3138")
plt.show()
plt.close()  # the original `plt.close` (no parentheses) was a no-op
运行结果:
四、读取相关系数数据:
代码:
import numpy as np  # was missing in this snippet even though np is used below
import pandas as pd
from sklearn.linear_model import Lasso

inputfile = 'F:\\大数据分析\\data.csv'  # consistent escaping (was 'F:\大数据分析\\data.csv')
data = pd.read_csv(inputfile)

# LASSO with a very strong penalty so most coefficients shrink to exactly
# zero -- i.e. used here purely for feature selection.
lasso = Lasso(alpha=1000)
# Assumes x1..x13 are the first 13 columns and 'y' the target -- TODO confirm.
lasso.fit(data.iloc[:, 0:13], data['y'])
print('相关系数为:', np.round(lasso.coef_, 5))
print('相关系数非零个数为:', np.sum(lasso.coef_ != 0))

# Boolean mask of the selected (non-zero coefficient) features.
mask = lasso.coef_ != 0
print('相关系数是否为零:', mask)

outputfile = 'F:/大数据分析/new_reg_data.csv'
# Append True so the 14th column (the target 'y') is kept as well.
mask = np.append(mask, True)
new_reg_data = data.iloc[:, mask]
new_reg_data.to_csv(outputfile)
print('输出数据的维度为:', new_reg_data.shape)
运行结果:
五、GM11灰色预测
代码:
import sys
# sys.path entries must be DIRECTORIES: append the folder that contains
# GM11.py, not the .py file itself (the original appended the file path,
# which does not make `import GM11` work).
sys.path.append('C:\\Users\\carina')
import numpy as np
import pandas as pd
from GM11 import GM11  # home-made grey prediction function

inputfile1 = 'F:\\大数据分析\\new_reg_data.csv'  # LASSO-selected features
inputfile2 = 'F:\\大数据分析\\data.csv'          # original data (source of 'y')
new_reg_data = pd.read_csv(inputfile1)
data = pd.read_csv(inputfile2)

# Re-index by year (assumes exactly 20 rows, 1994-2013 -- TODO confirm)
# and add two empty rows for the years to be forecast.
new_reg_data.index = range(1994, 2014)
new_reg_data.loc[2014] = None
new_reg_data.loc[2015] = None

cols = ['x1', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x13']
for i in cols:
    # Fit GM(1,1) on this feature's 1994-2013 history;
    # GM11(...)[0] is the fitted prediction function f(k).
    f = GM11(new_reg_data.loc[range(1994, 2014), i].values)[0]
    new_reg_data.loc[2014, i] = f(len(new_reg_data) - 1)  # 2014 forecast
    new_reg_data.loc[2015, i] = f(len(new_reg_data))      # 2015 forecast
    new_reg_data[i] = new_reg_data[i].round(2)            # keep 2 decimals

outputfile = 'F:\\大数据分析\\new_reg_data_GM11.xls'  # save path for the forecasts
# Carry the revenue series 'y' over, padded with NaN for 2014/2015.
y = list(data['y'].values)
y.extend([np.nan, np.nan])
new_reg_data['y'] = y
new_reg_data.to_excel(outputfile)
print('预测结果为:\n', new_reg_data.loc[2014:2015, :])
运行结果:
六、读取真实值和预测值以及绘图:
import matplotlib.pyplot as plt
from sklearn.svm import LinearSVR

# Load the grey-forecast table (1994-2015, including the two forecast rows).
inputfile = 'F:\\大数据分析\\new_reg_data_GM11.xls'
data = pd.read_excel(inputfile)
data.index = range(1994, 2016)

feature = ['x1', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x13']

# Standardise using statistics from the 1994-2013 training window only.
train = data.loc[range(1994, 2014)].copy()
mu = train.mean()
sigma = train.std()
train = (train - mu) / sigma

# Fit a linear support-vector regressor on the scaled training data.
model = LinearSVR()
model.fit(train[feature].to_numpy(), train['y'].to_numpy())

# Predict every year (incl. 2014/2015) and un-scale back to revenue units.
scaled_all = ((data[feature] - mu[feature]) / sigma[feature]).to_numpy()
data[u'y_pred'] = model.predict(scaled_all) * sigma['y'] + mu['y']

outputfile = 'F:\\大数据分析\\new_reg_data_GM11_revenue.xls'
data.to_excel(outputfile)
print('真实值与预测值分别为:\n', data[['y', 'y_pred']])

# Actual vs. predicted revenue, each series in its own subplot.
fig = data[['y', 'y_pred']].plot(subplots=True, style=['b-o', 'r-*'])
plt.title('学号3138')
plt.show()
运行结果:
七、真实值和预测值重合图:
代码:
import matplotlib.pyplot as plt

plt.rcParams['font.sans-serif'] = ['SimHei']  # needed to render the Chinese title

# Overlay actual ('y') and predicted ('y_pred') revenue on one axis.
ax = data[['y', 'y_pred']].plot(style=['b-o', 'r-*'])
ax.set_ylim(0, 2500)
ax.set_xlim(1993, 2016)
plt.title('学号 3138')
plt.show()
运行结果:
八、运用PCA读取模型的各个特征向量以及各个成分各自的方差百分比:
代码:
from sklearn.decomposition import PCA  # fixed stray space in 'sklearn .decomposition'
import numpy as np

# Toy example: PCA on a random 10x4 matrix.
D = np.random.rand(10, 4)
pca = PCA()  # defaults already are copy=True, n_components=None, whiten=False
pca.fit(D)
# The original evaluated `PCA(copy=True, n_components=None, whiten=False)`
# on its own line: that built a second, unused estimator (it was just the
# REPL echo in the book), so it is dropped here.
pca.components_  # principal axes; a bare expression only displays in a REPL
运行结果:
代码:
pca.explained_variance_ratio_  # variance fraction per component; bare expression only echoes in a REPL
运行结果:
九、聚类中心:
import numpy as np
import random
import matplotlib.pyplot as plt
def distance(point1, point2):
    """Euclidean distance between two points (1-D numpy arrays)."""
    return np.sqrt(np.sum((point1 - point2) ** 2))


def k_means(data, k, max_iter=10000):
    """Plain k-means clustering.

    Parameters
    ----------
    data : ndarray of shape (n_samples, n_features)
    k : int
        Number of clusters (k distinct samples seed the centers).
    max_iter : int
        Iteration cap.

    Returns
    -------
    centers : dict mapping cluster index -> center ndarray
    clusters : dict mapping cluster index -> list of member samples
    """
    # Initialise: random.sample picks k DISTINCT sample indices.
    centers = {}
    n_data = data.shape[0]
    for idx, i in enumerate(random.sample(range(n_data), k)):
        centers[idx] = data[i]

    clusters = {}
    for it in range(max_iter):
        print("开始第{}次迭代".format(it + 1))
        # Assignment step: send every sample to its nearest center.
        clusters = {j: [] for j in range(k)}
        for sample in data:
            dists = [distance(sample, centers[c]) for c in centers]
            clusters[np.argmin(dists)].append(sample)

        # Update step: each center becomes the mean of its members.
        pre_centers = centers.copy()
        for c in clusters:
            if clusters[c]:
                centers[c] = np.mean(clusters[c], axis=0)
            # else: an empty cluster keeps its previous center -- the
            # original called np.mean([]) here, producing a NaN center
            # that poisoned every later distance comparison.

        # Convergence: stop once no center moved (within tolerance).
        if all(distance(pre_centers[c], centers[c]) <= 1e-8 for c in centers):
            break
    return centers, clusters


def predict(p_data, centers):
    """Return the index of the cluster center closest to p_data."""
    dists = [distance(p_data, centers[c]) for c in centers]
    return np.argmin(dists)
# Demo run: 200 random integer points in [0, 10) x [0, 10), 3 clusters.
x = np.random.randint(0, high=10, size=(200, 2))
centers, clusters = k_means(x, 3)
运行结果:
代码:
print(centers)
clusters  # bare expression: only echoes in an interactive session, no effect in a script
运行结果:
{0: array([2.14492754, 2.07246377]), 1: array([7.5862069 , 2.77586207]), 2: array([3.95890411, 7.46575342])}Out[6]:
{0: [array([0, 2]), array([3, 2]), array([5, 1]), array([4, 1]), array([1, 0]), array([3, 2]), array([0, 4]), array([4, 1]), array([5, 1]), array([3, 2]), array([2, 3]), array([2, 4]), array([5, 1]), array([2, 2]), array([1, 2]), array([5, 0]), array([2, 0]), array([0, 2]), array([3, 2]), array([3, 2]), array([4, 2]), array([1, 4]), array([2, 4]), array([2, 1]), array([0, 1]), array([2, 0]), array([3, 2]), array([3, 3]), array([1, 3]), array([1, 2]), array([3, 4]), array([0, 3]), array([1, 4]), array([4, 3]), array([2, 4]), array([2, 1]), array([3, 0]), array([2, 5]), array([1, 3]), array([4, 3]), array([1, 0]), array([3, 1]), array([1, 0]), array([4, 1]), array([0, 1]), array([4, 3]), array([0, 0]), array([0, 2]), array([3, 1]), array([3, 3]), array([2, 2]), array([2, 2]), array([0, 3]), array([4, 4]), array([5, 1]), array([0, 0]), array([0, 5]), array([4, 0]), array([2, 4]), array([2, 5]), array([3, 4]), array([1, 0]), array([0, 5]), array([0, 0]), array([4, 2]), array([4, 1]), array([0, 1]), array([2, 3]), array([0, 3])], 1: [array([8, 1]), array([9, 2]), array([9, 7]), array([5, 3]), array([7, 0]), array([7, 2]), array([8, 1]), array([7, 3]), array([7, 5]), array([6, 5]), array([6, 4]), array([7, 6]), array([9, 2]), array([9, 3]), array([9, 1]), array([9, 1]), array([7, 4]), array([8, 3]), array([6, 2]), array([8, 1]), array([8, 1]), array([7, 3]), array([8, 0]), array([9, 1]), array([5, 2]), array([6, 2]), array([7, 0]), array([9, 6]), array([9, 4]), array([9, 1]), array([6, 1]), array([9, 2]), array([7, 5]), array([9, 5]), array([9, 6]), array([9, 5]), array([9, 2]), array([9, 3]), array([9, 1]), array([9, 4]), array([8, 2]), array([7, 1]), array([7, 2]), array([9, 6]), array([6, 4]), array([6, 2]), array([7, 0]), array([6, 1]), array([8, 3]), array([6, 3]), array([6, 5]), array([9, 0]), array([5, 4]), array([7, 6]), array([6, 3]), array([8, 1]), array([9, 4]), array([7, 4])], 2: [array([6, 9]), array([4, 7]), array([2, 6]), array([5, 5]), array([6, 6]), 
array([4, 6]), array([1, 7]), array([5, 6]), array([7, 8]), array([9, 8]), array([4, 8]), array([7, 7]), array([5, 5]), array([1, 6]), array([8, 8]), array([4, 8]), array([3, 5]), array([0, 7]), array([4, 9]), array([5, 8]), array([1, 8]), array([2, 7]), array([6, 9]), array([3, 9]), array([1, 9]), array([3, 7]), array([6, 9]), array([7, 9]), array([6, 9]), array([4, 6]), array([4, 5]), array([2, 8]), array([0, 6]), array([4, 9]), array([3, 5]), array([7, 8]), array([6, 6]), array([7, 9]), array([1, 8]), array([5, 8]), array([7, 8]), array([5, 8]), array([1, 8]), array([0, 6]), array([2, 8]), array([3, 9]), array([5, 6]), array([3, 9]), array([0, 8]), array([5, 6]), array([6, 6]), array([2, 6]), array([4, 8]), array([5, 7]), array([4, 9]), array([1, 7]), array([9, 9]), array([2, 8]), array([7, 9]), array([2, 8]), array([8, 8]), array([7, 8]), array([0, 7]), array([2, 6]), array([3, 6]), array([6, 8]), array([4, 9]), array([6, 9]), array([8, 9]), array([0, 8]), array([1, 7]), array([0, 8]), array([3, 6])]}
十:聚类中心绘图:
代码:
# Restored the loop indentation that was lost in the paste (the flattened
# version is a syntax error). Marks each final center with a star, then
# draws every sample in its cluster's colour.
for center in centers:
    plt.scatter(centers[center][0], centers[center][1], marker='*', s=150)

colors = ['r', 'b', 'y', 'm', 'c', 'g']  # one colour per cluster index
for c in clusters:
    for point in clusters[c]:
        plt.scatter(point[0], point[1], c=colors[c])

plt.rcParams['font.sans-serif'] = ['SimHei']  # so the Chinese title renders
plt.title('学号3138')
运行结果:
标签:预测,centers,第二周,聚类,new,array,data,reg From: https://www.cnblogs.com/Carina----/p/17180336.html