Chapter 6:
# Code 6-1: descriptive statistics
import numpy as np
import pandas as pd

inputfile = 'data.csv'
data = pd.read_csv(inputfile)

# Compute the minimum, maximum, mean and standard deviation in turn
description = [data.min(), data.max(), data.mean(), data.std()]
description = pd.DataFrame(description, index=['Min', 'Max', 'Mean', 'Std']).T
print('Descriptive statistics:\n', np.round(description, 2))  # keep two decimals
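As an aside (not in the original listing), pandas can produce a similar summary in a single call with describe(), which reports count, mean, std, min, the quartiles and max:

print(np.round(data.describe(), 2))  # one-call alternative to the manual list above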
# Code 6-2: Pearson correlation matrix
corr = data.corr(method='pearson')
print('Correlation matrix:\n', np.round(corr, 2))  # keep two decimals
# Code 6-3: correlation heatmap
import matplotlib.pyplot as plt
import seaborn as sns

plt.subplots(figsize=(10, 10))  # set the figure size
sns.heatmap(corr, annot=True, vmax=1, square=True, cmap='Blues')
plt.title('Correlation heatmap')
plt.show()
plt.close()
# Code 6-4: feature selection with Lasso regression
import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso

inputfile = 'data.csv'
data = pd.read_csv(inputfile)
lasso = Lasso(alpha=1000)  # Lasso with the regularization strength set to 1000
lasso.fit(data.iloc[:, 0:13], data['y'])
print('Coefficients:', np.round(lasso.coef_, 5))  # keep five decimals
print('Number of nonzero coefficients:', np.sum(lasso.coef_ != 0))

mask = lasso.coef_ != 0
print('Coefficient is nonzero:', mask)
outputfile = 'data2/new_reg_data.csv'  # output data file
mask = np.append(mask, True)  # also keep the target column y, the last column
print(mask)
new_reg_data = data.iloc[:, mask]  # keep only the columns with nonzero coefficients
new_reg_data.to_csv(outputfile)  # save the data
print('Shape of the output data:', new_reg_data.shape)
# Code 6-5: forecast the selected features for 2014 and 2015 with a GM(1,1) grey model
import sys
sys.path.append('data2/code')  # add the directory containing GM11.py to the path
import numpy as np
import pandas as pd
from GM11 import GM11

inputfile1 = 'data2/new_reg_data.csv'
inputfile2 = 'data.csv'
new_reg_data = pd.read_csv(inputfile1)
data = pd.read_csv(inputfile2)
new_reg_data.index = range(1994, 2014)
new_reg_data.loc[2014] = None
new_reg_data.loc[2015] = None
cols = ['x1', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x13']
for i in cols:
    f = GM11(new_reg_data.loc[range(1994, 2014), i].to_numpy())[0]
    new_reg_data.loc[2014, i] = f(len(new_reg_data) - 1)  # forecast for 2014
    new_reg_data.loc[2015, i] = f(len(new_reg_data))      # forecast for 2015
    new_reg_data[i] = new_reg_data[i].round(2)

outputfile = 'data2/new_reg_data_GM11.xls'
y = list(data['y'].values)
y.extend([np.nan, np.nan])  # revenue for 2014 and 2015 is unknown
new_reg_data['y'] = y
new_reg_data.to_excel(outputfile)
print('Forecast results:\n', new_reg_data.loc[2014:2015, :])
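The GM11 function above comes from a GM11.py module under data2/code that is not reproduced in this post. For reference, a minimal sketch of a GM(1,1) grey model with the same call convention (the first element of the returned tuple is a forecast function f(k), where k is a 1-based position in the series, so 1994 corresponds to k=1) could look like the following; the actual module may additionally return model-adequacy statistics:

def GM11(x0):
    # Minimal GM(1,1) sketch; x0 is a 1-D numpy array holding the original series
    import numpy as np
    x1 = x0.cumsum()                       # 1-AGO: accumulated generating sequence
    z1 = (x1[:-1] + x1[1:]) / 2.0          # background values (adjacent means of x1)
    B = np.vstack([-z1, np.ones_like(z1)]).T
    Y = x0[1:].reshape(-1, 1)
    [[a], [b]] = np.linalg.lstsq(B, Y, rcond=None)[0]  # least-squares estimates of a, b
    # Restored forecast of the original series at 1-based index k
    f = lambda k: (x0[0] - b / a) * (np.exp(-a * (k - 1)) - np.exp(-a * (k - 2)))
    return f, a, b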
# Code 6-6: build a support vector regression (SVR) prediction model
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import LinearSVR

inputfile = 'data2/new_reg_data_GM11.xls'
data = pd.read_excel(inputfile)
feature = ['x1', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x13']  # feature columns
data.index = range(1994, 2016)
data_train = data.loc[range(1994, 2014)].copy()  # train on the years before 2014 (grey-model extended data)
data_mean = data_train.mean()
data_std = data_train.std()
data_train = (data_train - data_mean) / data_std  # standardize the data
x_train = data_train[feature].to_numpy()
y_train = data_train['y'].to_numpy()  # label data

linearsvr = LinearSVR()  # linear support vector regressor
linearsvr.fit(x_train, y_train)
x = ((data[feature] - data_mean[feature]) / data_std[feature]).to_numpy()
# Predict, then undo the standardization of the target
data['y_pred'] = linearsvr.predict(x) * data_std['y'] + data_mean['y']
outputfile = 'data2/new_reg_data_GM11_revenue.xls'
data.to_excel(outputfile)
print('True vs. predicted values:\n', data[['y', 'y_pred']])
fig = data[['y', 'y_pred']].plot(subplots=True, style=['b-o', 'r-*'])
plt.show()
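To quantify how well the SVR tracks the known years, one can compare y and y_pred over 1994-2013. A small check, not part of the original code:

from sklearn.metrics import mean_absolute_error
hist = data.loc[1994:2013]  # years where the true revenue y is known
print('MAE on historical years:', round(mean_absolute_error(hist['y'], hist['y_pred']), 2))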
# Extend the grey-model forecast one more year, to 2016
import sys
sys.path.append('data2/code')
import numpy as np
import pandas as pd
from GM11 import GM11

inputfile1 = 'data2/new_reg_data_GM11.xls'
inputfile2 = 'data.csv'
new_reg_data = pd.read_excel(inputfile1)
data = pd.read_csv(inputfile2)
new_reg_data.index = range(1994, 2016)
new_reg_data.loc[2016] = None
cols = ['x1', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x13']
for i in cols:
    # Fit on 1994-2014 (including the grey-model value for 2014)
    f = GM11(new_reg_data.loc[range(1994, 2015), i].values)[0]
    new_reg_data.loc[2016, i] = f(len(new_reg_data))  # 2016 is the 23rd year, i.e. f(23)
    new_reg_data[i] = new_reg_data[i].round(2)

outputfile = 'data2/new_reg_data_GM11_2.xls'
new_reg_data.to_excel(outputfile)
print('Forecast results:\n', new_reg_data.loc[2016, :])
Chapter 4:
# Code 4-9: principal component analysis of a random 10x4 matrix
import numpy as np
from sklearn.decomposition import PCA

D = np.random.rand(10, 4)
pca = PCA()
pca.fit(D)
print(pca.components_)                # principal axes in feature space
print(pca.explained_variance_ratio_)  # share of variance explained by each component
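A common follow-up is to keep only as many components as needed to reach a target share of explained variance; scikit-learn's PCA accepts a float n_components for exactly this. A short sketch on the same random matrix:

pca95 = PCA(n_components=0.95)  # keep components until 95% of the variance is explained
D_reduced = pca95.fit_transform(D)
print(pca95.n_components_)      # number of components actually kept
print(D_reduced.shape)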
# Hand-rolled k-means on a toy 2-D dataset
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Compute Euclidean distances from every point to every centroid
def calcDis(dataSet, centroids, k):
    clalist = []
    for data in dataSet:
        # np.tile(a, (2, 1)) repeats a once along x (i.e. unchanged) and twice
        # along y, turning [0, 1, 2] into [[0, 1, 2], [0, 1, 2]]
        diff = np.tile(data, (k, 1)) - centroids   # subtract
        squaredDiff = diff ** 2                    # square
        squaredDist = np.sum(squaredDiff, axis=1)  # sum across each row
        distance = squaredDist ** 0.5              # square root
        clalist.append(distance)
    clalist = np.array(clalist)  # len(dataSet) x k array of point-to-centroid distances
    return clalist

# Recompute the centroids
def classify(dataSet, centroids, k):
    # Distances from each sample to each centroid
    clalist = calcDis(dataSet, centroids, k)
    # Group by nearest centroid and compute the new centroids
    minDistIndices = np.argmin(clalist, axis=1)  # index of the minimum in each row
    # groupby(minDistIndices) groups the samples by nearest centroid; mean() averages each group
    newCentroids = pd.DataFrame(dataSet).groupby(minDistIndices).mean()
    newCentroids = newCentroids.values
    # How much the centroids moved
    changed = newCentroids - centroids
    return changed, newCentroids

# Cluster with k-means
def kmeans(dataSet, k):
    # Pick k random samples as the initial centroids
    centroids = random.sample(dataSet, k)
    # Update the centroids until they stop moving
    changed, newCentroids = classify(dataSet, centroids, k)
    while np.any(changed != 0):
        changed, newCentroids = classify(dataSet, newCentroids, k)
    centroids = sorted(newCentroids.tolist())  # tolist() converts the array to a list; sorted() orders it

    # Assign every point to its cluster using the final centroids
    cluster = []
    clalist = calcDis(dataSet, centroids, k)  # Euclidean distances
    minDistIndices = np.argmin(clalist, axis=1)
    for i in range(k):
        cluster.append([])
    for i, j in enumerate(minDistIndices):  # enumerate() yields index and element together
        cluster[j].append(dataSet[i])
    return centroids, cluster

# Create the dataset
def createDataSet():
    return [[1, 1], [1, 2], [2, 1], [6, 4], [6, 3], [5, 4]]

if __name__ == '__main__':
    dataset = createDataSet()
    centroids, cluster = kmeans(dataset, 2)
    print('Centroids: %s' % centroids)
    print('Clusters: %s' % cluster)
    for i in range(len(dataset)):
        # marker shape, color, point size and label for the original points
        plt.scatter(dataset[i][0], dataset[i][1], marker='o', color='green', s=40, label='original point')
    for j in range(len(centroids)):
        plt.scatter(centroids[j][0], centroids[j][1], marker='x', color='red', s=50, label='centroid')
    plt.show()
# A second k-means implementation, operating on numpy arrays
import numpy as np
import random
import matplotlib.pyplot as plt

def distance(point1, point2):  # Euclidean distance
    return np.sqrt(np.sum((point1 - point2) ** 2))

def k_means(data, k, max_iter=10000):
    centers = {}  # cluster centers
    # Initialization: pick k distinct samples as the initial centers.
    # random.sample() draws k values without replacement
    n_data = data.shape[0]  # number of samples
    for idx, i in enumerate(random.sample(range(n_data), k)):
        # idx ranges over [0, k-1] and numbers the centers; data[i] is the sampled point
        centers[idx] = data[i]

    # Iterate
    for i in range(max_iter):
        print('Starting iteration {}'.format(i + 1))
        clusters = {}  # clustering result: center index idx -> [list of samples]
        for j in range(k):
            clusters[j] = []  # start with empty lists

        for sample in data:  # assign each sample to its nearest center
            distances = []   # distances from this sample to each center (k entries)
            for c in centers:
                distances.append(distance(sample, centers[c]))
            idx = np.argmin(distances)    # index of the nearest center
            clusters[idx].append(sample)  # add the sample to cluster idx

        pre_centers = centers.copy()  # remember the previous centers
        for c in clusters.keys():
            # Recompute each center as the mean of the samples assigned to it
            centers[c] = np.mean(clusters[c], axis=0)

        is_convergent = True
        for c in centers:
            if distance(pre_centers[c], centers[c]) > 1e-8:  # did the center move?
                is_convergent = False
                break
        if is_convergent:  # stop once the centers no longer change
            break
    return centers, clusters

def predict(p_data, centers):
    # Predict the cluster of a new sample: compute the distance from p_data
    # to every center and return the index of the nearest one
    distances = [distance(p_data, centers[c]) for c in centers]
    return np.argmin(distances)
x = np.random.randint(0, high=10, size=(200, 2))
centers, clusters = k_means(x, 3)
print(centers)
print(clusters)
for center in centers:
    plt.scatter(centers[center][0], centers[center][1], marker='*', s=150)
colors = ['r', 'b', 'y', 'm', 'c', 'g']
for c in clusters:
    for point in clusters[c]:
        plt.scatter(point[0], point[1], c=colors[c])
plt.show()
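The predict helper defined above is never exercised in the post; as a quick usage example (the query point is arbitrary):

new_point = np.array([3, 3])  # arbitrary query point
print('New point assigned to cluster:', predict(new_point, centers))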
import numpy as np
from sklearn.datasets import make_blobs

n_samples = 1500
random_state = 170
transformation = [[0.60834549, -0.63667341], [-0.40887718, 0.85253229]]

X, y = make_blobs(n_samples=n_samples, random_state=random_state)
X_aniso = np.dot(X, transformation)  # Anisotropic blobs
X_varied, y_varied = make_blobs(
    n_samples=n_samples, cluster_std=[1.0, 2.5, 0.5], random_state=random_state
)  # Unequal variance
X_filtered = np.vstack(
    (X[y == 0][:500], X[y == 1][:100], X[y == 2][:10])
)  # Unevenly sized blobs
y_filtered = [0] * 500 + [1] * 100 + [2] * 10
import matplotlib.pyplot as plt

fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(12, 12))

axs[0, 0].scatter(X[:, 0], X[:, 1], c=y)
axs[0, 0].set_title("Mixture of Gaussian Blobs")

axs[0, 1].scatter(X_aniso[:, 0], X_aniso[:, 1], c=y)
axs[0, 1].set_title("Anisotropically Distributed Blobs")

axs[1, 0].scatter(X_varied[:, 0], X_varied[:, 1], c=y_varied)
axs[1, 0].set_title("Unequal Variance")

axs[1, 1].scatter(X_filtered[:, 0], X_filtered[:, 1], c=y_filtered)
axs[1, 1].set_title("Unevenly Sized Blobs")

plt.suptitle("Ground truth clusters").set_y(0.95)
plt.show()
from sklearn.cluster import KMeans

common_params = {
    "n_init": "auto",
    "random_state": random_state,
}

fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(12, 12))

y_pred = KMeans(n_clusters=2, **common_params).fit_predict(X)
axs[0, 0].scatter(X[:, 0], X[:, 1], c=y_pred)
axs[0, 0].set_title("Non-optimal Number of Clusters")

y_pred = KMeans(n_clusters=3, **common_params).fit_predict(X_aniso)
axs[0, 1].scatter(X_aniso[:, 0], X_aniso[:, 1], c=y_pred)
axs[0, 1].set_title("Anisotropically Distributed Blobs")

y_pred = KMeans(n_clusters=3, **common_params).fit_predict(X_varied)
axs[1, 0].scatter(X_varied[:, 0], X_varied[:, 1], c=y_pred)
axs[1, 0].set_title("Unequal Variance")

y_pred = KMeans(n_clusters=3, **common_params).fit_predict(X_filtered)
axs[1, 1].scatter(X_filtered[:, 0], X_filtered[:, 1], c=y_pred)
axs[1, 1].set_title("Unevenly Sized Blobs")

plt.suptitle("Unexpected KMeans clusters").set_y(0.95)
plt.show()
y_pred = KMeans(n_clusters=3, **common_params).fit_predict(X)
plt.scatter(X[:, 0], X[:, 1], c=y_pred)
plt.title("Optimal Number of Clusters")
plt.show()
y_pred = KMeans(n_clusters=3, n_init=10, random_state=random_state).fit_predict(
    X_filtered
)
plt.scatter(X_filtered[:, 0], X_filtered[:, 1], c=y_pred)
plt.title("Unevenly Sized Blobs \nwith several initializations")
plt.show()
from sklearn.mixture import GaussianMixture

fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))

y_pred = GaussianMixture(n_components=3).fit_predict(X_aniso)
ax1.scatter(X_aniso[:, 0], X_aniso[:, 1], c=y_pred)
ax1.set_title("Anisotropically Distributed Blobs")

y_pred = GaussianMixture(n_components=3).fit_predict(X_varied)
ax2.scatter(X_varied[:, 0], X_varied[:, 1], c=y_pred)
ax2.set_title("Unequal Variance")

plt.suptitle("Gaussian mixture clusters").set_y(0.95)
plt.show()
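When the right number of clusters is not known in advance, a common heuristic is to compare silhouette scores across candidate values of k; a brief sketch reusing the X and common_params defined above:

from sklearn.metrics import silhouette_score

for k in (2, 3, 4, 5):
    labels = KMeans(n_clusters=k, **common_params).fit_predict(X)
    print('k=%d: silhouette=%.3f' % (k, silhouette_score(X, labels)))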