
Python Data Analysis and Mining in Practice: Chapter 6 & Chapter 4 (Part 1)


Chapter 6:

# Code 6-1: Descriptive statistics
import numpy as np
import pandas as pd

inputfile = "data.csv"
data = pd.read_csv(inputfile)

# Compute the min, max, mean, and standard deviation of each column
description = [data.min(), data.max(), data.mean(), data.std()]
description = pd.DataFrame(description, index = ['Min','Max','Mean','Std']).T
print('Descriptive statistics:\n', np.round(description, 2))  # keep two decimal places
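As a hedged aside (not from the book's code), pandas' built-in describe() returns the same four statistics in a single call:

# Equivalent summary via describe(); the column selection below is a sketch
print(data.describe().T[['min', 'max', 'mean', 'std']].round(2))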

# Code 6-2: Pearson correlation analysis
corr = data.corr(method='pearson')
print('Correlation coefficient matrix:\n', np.round(corr, 2))  # keep two decimal places

# Code 6-3: Correlation heatmap
import matplotlib.pyplot as plt
import seaborn as sns

plt.subplots(figsize=(10,10))   # set the figure size
sns.heatmap(corr, annot=True, vmax=1, square=True, cmap="Blues")
plt.title('Correlation heatmap')
plt.show()
plt.close()

# Code 6-4: Lasso variable selection
import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso

inputfile = 'data.csv'
data = pd.read_csv(inputfile)
lasso = Lasso(1000)     # build a Lasso model with the regularization parameter lambda (alpha) set to 1000
lasso.fit(data.iloc[:,0:13], data['y'])
print('Coefficients:', np.round(lasso.coef_, 5))    # keep five decimal places
print('Number of non-zero coefficients:', np.sum(lasso.coef_ != 0))
mask = lasso.coef_ != 0
print('Non-zero coefficient mask:', mask)
outputfile = 'data2/new_reg_data.csv'    # output data file
mask = np.append(mask, True)  # also keep the target column 'y'
print(mask)
new_reg_data = data.iloc[:,mask]  # keep only the columns with non-zero coefficients
new_reg_data.to_csv(outputfile)  # save the selected data
print('Shape of the output data:', new_reg_data.shape)
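The book fixes lambda (sklearn's alpha parameter) at 1000. As a hedged alternative, not part of the book's code, LassoCV can choose alpha by cross-validation instead; note that Lasso is scale-sensitive, so results on unscaled data should be read with care:

from sklearn.linear_model import LassoCV

lasso_cv = LassoCV(cv=5).fit(data.iloc[:,0:13], data['y'])  # 5-fold CV over a default alpha grid
print('CV-selected alpha:', lasso_cv.alpha_)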

# Code 6-5: GM(1,1) grey prediction of the selected features
import sys
sys.path.append('data2/code')   # add the folder containing GM11.py to the module search path
import numpy as np
import pandas as pd
from GM11 import GM11

inputfile1 = 'data2/new_reg_data.csv'
inputfile2 = 'data.csv'
new_reg_data = pd.read_csv(inputfile1)
data = pd.read_csv(inputfile2)

new_reg_data.index = range(1994,2014)
new_reg_data.loc[2014] = None  # placeholder rows for the years to be predicted
new_reg_data.loc[2015] = None

cols = ['x1','x3','x4','x5','x6','x7','x8','x13']
for i in cols:
    f = GM11(new_reg_data.loc[range(1994,2014),i].to_numpy())[0]  # fit GM(1,1); f is the prediction function
    new_reg_data.loc[2014,i] = f(len(new_reg_data)-1)  # 2014 is the 21st value (1-indexed)
    new_reg_data.loc[2015,i] = f(len(new_reg_data))    # 2015 is the 22nd value
    new_reg_data[i] = new_reg_data[i].round(2)
outputfile = 'data2/new_reg_data_GM11.xls'
y = list(data['y'].values)
y.extend([np.nan, np.nan])  # no ground truth yet for 2014-2015
new_reg_data['y'] = y
new_reg_data.to_excel(outputfile)
print('Predictions:\n', new_reg_data.loc[2014:2015,:])
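GM11 above is imported from the GM11.py module that ships with the book's data files and is not reproduced in this post. For readers without that file, here is a minimal GM(1,1) sketch with a matching call signature; that the first return value is a 1-indexed prediction function f(k) is inferred from how it is used above, and the extra return values here are an assumption:

import numpy as np

def GM11(x0):  # hypothetical stand-in for the book's GM11 module
    x1 = x0.cumsum()                       # 1-AGO accumulated series
    z1 = (x1[:-1] + x1[1:]) / 2.0          # background (mean) sequence
    B = np.vstack([-z1, np.ones_like(z1)]).T
    Y = x0[1:].reshape(-1, 1)
    [[a], [b]] = np.linalg.lstsq(B, Y, rcond=None)[0]  # developing coefficient a, grey input b
    # restored prediction: f(k) is the k-th value of the original series, 1-indexed
    f = lambda k: (x0[0] - b / a) * (np.exp(-a * (k - 1)) - np.exp(-a * (k - 2)))
    return f, a, b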

# Code 6-6: Build the support vector regression prediction model
import matplotlib.pyplot as plt
from sklearn.svm import LinearSVR

inputfile = 'data2/new_reg_data_GM11.xls'
data = pd.read_excel(inputfile)
feature = ['x1', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x13']  # feature columns
data.index = range(1994,2016)
data_train = data.loc[range(1994, 2014)].copy()  # train on the pre-2014 rows of the grey-predicted data
data_mean = data_train.mean()
data_std = data_train.std()
data_train = (data_train - data_mean)/data_std  # standardize the data
x_train = data_train[feature].to_numpy()
y_train = data_train['y'].to_numpy()  # target values

linearsvr = LinearSVR()  # build a linear support vector regression model
linearsvr.fit(x_train, y_train)
x = ((data[feature] - data_mean[feature])/data_std[feature]).to_numpy()  # standardize all years, then predict and de-standardize
data['y_pred'] = linearsvr.predict(x) * data_std['y'] + data_mean['y']
outputfile = 'data2/new_reg_data_GM11_revenue.xls'
data.to_excel(outputfile)
print('Actual vs. predicted values:\n', data[['y', 'y_pred']])
fig = data[['y', 'y_pred']].plot(subplots = True, style=['b-o', 'r-*'])
plt.show()
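As a hedged follow-up (not in the book's code), the quality of the fit on the 1994-2013 training years can be quantified with standard regression metrics:

from sklearn.metrics import mean_absolute_error, explained_variance_score

fit = data.loc[range(1994, 2014)]
print('MAE:', round(mean_absolute_error(fit['y'], fit['y_pred']), 2))
print('Explained variance:', round(explained_variance_score(fit['y'], fit['y_pred']), 4))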

# Extension: forecast 2016 with GM(1,1)
import sys
sys.path.append('data2/code')
import numpy as np
import pandas as pd
from GM11 import GM11

inputfile1 = 'data2/new_reg_data_GM11.xls'
new_reg_data = pd.read_excel(inputfile1)
new_reg_data.index = range(1994,2016)
new_reg_data.loc[2016] = None  # placeholder row for the year to be predicted
cols = ['x1','x3','x4','x5','x6','x7','x8','x13']
for i in cols:
    f = GM11(new_reg_data.loc[range(1994,2015),i].values)[0]  # refit on 1994-2014, now including the grey-predicted 2014 values
    new_reg_data.loc[2016,i] = f(len(new_reg_data))  # 2016 is the 23rd value (1-indexed)
    new_reg_data[i] = new_reg_data[i].round(2)
outputfile = 'data2/new_reg_data_GM11_2.xls'
new_reg_data.to_excel(outputfile)
print('Prediction:\n', new_reg_data.loc[2016,:])

Chapter 4:

# Code 4-9: Principal component analysis of a random 10x4 matrix
import numpy as np
from sklearn.decomposition import PCA
D = np.random.rand(10,4)
pca = PCA()
pca.fit(D)
print(pca.components_)  # the principal component axes
print(pca.explained_variance_ratio_)  # fraction of variance explained by each component
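A hedged follow-up sketch (not in the book): passing a float to n_components makes PCA keep just enough components to explain that fraction of the variance:

pca95 = PCA(n_components=0.95)  # keep components explaining ~95% of the variance
D_reduced = pca95.fit_transform(D)
print(D_reduced.shape, pca95.explained_variance_ratio_.sum())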

import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Compute the Euclidean distance from every sample to every centroid
def calcDis(dataSet, centroids, k):
    clalist = []
    for data in dataSet:
        diff = np.tile(data, (k, 1)) - centroids  # np.tile repeats the sample k times so it can be subtracted from all k centroids at once
        squaredDiff = diff ** 2     # element-wise square
        squaredDist = np.sum(squaredDiff, axis=1)   # sum along each row
        distance = squaredDist ** 0.5  # square root
        clalist.append(distance)
    clalist = np.array(clalist)  # a len(dataSet) x k array of distances from each point to each centroid
    return clalist

# Recompute the centroids
def classify(dataSet, centroids, k):
    # distances from every sample to every centroid
    clalist = calcDis(dataSet, centroids, k)
    # assign each sample to its nearest centroid, then average each group
    minDistIndices = np.argmin(clalist, axis=1)    # index of the minimum in each row
    newCentroids = pd.DataFrame(dataSet).groupby(minDistIndices).mean()  # group the samples by nearest centroid and take each group's mean
    newCentroids = newCentroids.values

    # how much the centroids moved
    changed = newCentroids - centroids

    return changed, newCentroids

# Cluster with k-means
def kmeans(dataSet, k):
    # pick k random samples as the initial centroids
    centroids = random.sample(dataSet, k)

    # update the centroids until none of them moves
    changed, newCentroids = classify(dataSet, centroids, k)
    while np.any(changed != 0):
        changed, newCentroids = classify(dataSet, newCentroids, k)

    centroids = sorted(newCentroids.tolist())   # tolist() converts the array to a list; sorted() orders the centroids

    # collect the members of each cluster
    cluster = []
    clalist = calcDis(dataSet, centroids, k)  # Euclidean distances to the final centroids
    minDistIndices = np.argmin(clalist, axis=1)
    for i in range(k):
        cluster.append([])
    for i, j in enumerate(minDistIndices):   # enumerate() yields both the index and the element
        cluster[j].append(dataSet[i])

    return centroids, cluster
 
# Create the toy data set
def createDataSet():
    return [[1, 1], [1, 2], [2, 1], [6, 4], [6, 3], [5, 4]]

if __name__=='__main__':
    dataset = createDataSet()
    centroids, cluster = kmeans(dataset, 2)
    print('Centroids: %s' % centroids)
    print('Clusters: %s' % cluster)
    for i in range(len(dataset)):
        plt.scatter(dataset[i][0], dataset[i][1], marker='o', color='green', s=40, label='sample')  # marker shape, color, point size, label
    for j in range(len(centroids)):
        plt.scatter(centroids[j][0], centroids[j][1], marker='x', color='red', s=50, label='centroid')
    plt.show()  # show once, after all points have been drawn
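As a hedged sanity check (not part of the original post), scikit-learn's KMeans on the same toy data should recover essentially the same two centroids:

from sklearn.cluster import KMeans

km = KMeans(n_clusters=2, n_init=10, random_state=0).fit(np.array(createDataSet()))
print(km.cluster_centers_)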

import numpy as np
import random 
import matplotlib.pyplot as plt 

def distance(point1, point2):  # Euclidean distance between two points
    return np.sqrt(np.sum((point1 - point2) ** 2))

def k_means(data, k, max_iter=10000):
    centers = {}  # the cluster centers
    # initialization: random.sample() draws k distinct sample indices without replacement
    n_data = data.shape[0]   # number of samples
    for idx, i in enumerate(random.sample(range(n_data), k)):
        # idx runs over [0, k-1] and numbers the centers; data[i] is the randomly chosen sample
        centers[idx] = data[i]

    # start iterating
    for i in range(max_iter):  # iteration cap
        print("Starting iteration {}".format(i+1))
        clusters = {}    # clustering result: center index idx -> list of samples
        for j in range(k):  # initialize with empty lists
            clusters[j] = []

        for sample in data:  # for every sample
            distances = []  # distances from this sample to each center (k entries)
            for c in centers:  # for every cluster center
                # distance from the sample to this center
                distances.append(distance(sample, centers[c]))
            idx = np.argmin(distances)  # index of the nearest center
            clusters[idx].append(sample)   # assign the sample to that cluster

        pre_centers = centers.copy()  # remember the previous centers

        for c in clusters.keys():
            # recompute each center as the mean of its assigned samples
            centers[c] = np.mean(clusters[c], axis=0)

        is_convergent = True
        for c in centers:
            if distance(pre_centers[c], centers[c]) > 1e-8:  # did this center move?
                is_convergent = False
                break
        if is_convergent:
            # stop once the centers no longer change
            break
    return centers, clusters

def predict(p_data, centers):  # predict the cluster of a new sample
    # distance from p_data to every center; return the index of the nearest one
    distances = [distance(p_data, centers[c]) for c in centers]
    return np.argmin(distances)

x = np.random.randint(0, high=10, size=(200, 2))
centers, clusters = k_means(x, 3)

print(centers)
print(clusters)

for center in centers:
    plt.scatter(centers[center][0], centers[center][1], marker='*', s=150)

colors = ['r', 'b', 'y', 'm', 'c', 'g']
for c in clusters:
    for point in clusters[c]:
        plt.scatter(point[0], point[1], c=colors[c])
plt.show()
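predict() is defined above but never exercised in the post; here is a hedged usage sketch with a hypothetical new sample:

new_point = np.array([5, 5])  # hypothetical new sample
print('The new point belongs to cluster', predict(new_point, centers))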

import numpy as np
from sklearn.datasets import make_blobs

n_samples = 1500
random_state = 170
transformation = [[0.60834549, -0.63667341], [-0.40887718, 0.85253229]]

X, y = make_blobs(n_samples=n_samples, random_state=random_state)
X_aniso = np.dot(X, transformation)  # Anisotropic blobs
X_varied, y_varied = make_blobs(
    n_samples=n_samples, cluster_std=[1.0, 2.5, 0.5], random_state=random_state
)  # Unequal variance
X_filtered = np.vstack(
    (X[y == 0][:500], X[y == 1][:100], X[y == 2][:10])
)  # Unevenly sized blobs
y_filtered = [0] * 500 + [1] * 100 + [2] * 10
import matplotlib.pyplot as plt

fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(12, 12))

axs[0, 0].scatter(X[:, 0], X[:, 1], c=y)
axs[0, 0].set_title("Mixture of Gaussian Blobs")

axs[0, 1].scatter(X_aniso[:, 0], X_aniso[:, 1], c=y)
axs[0, 1].set_title("Anisotropically Distributed Blobs")

axs[1, 0].scatter(X_varied[:, 0], X_varied[:, 1], c=y_varied)
axs[1, 0].set_title("Unequal Variance")

axs[1, 1].scatter(X_filtered[:, 0], X_filtered[:, 1], c=y_filtered)
axs[1, 1].set_title("Unevenly Sized Blobs")

plt.suptitle("Ground truth clusters").set_y(0.95)
plt.show()

from sklearn.cluster import KMeans

common_params = {
    "n_init": "auto",
    "random_state": random_state,
}

fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(12, 12))

y_pred = KMeans(n_clusters=2, **common_params).fit_predict(X)
axs[0, 0].scatter(X[:, 0], X[:, 1], c=y_pred)
axs[0, 0].set_title("Non-optimal Number of Clusters")

y_pred = KMeans(n_clusters=3, **common_params).fit_predict(X_aniso)
axs[0, 1].scatter(X_aniso[:, 0], X_aniso[:, 1], c=y_pred)
axs[0, 1].set_title("Anisotropically Distributed Blobs")

y_pred = KMeans(n_clusters=3, **common_params).fit_predict(X_varied)
axs[1, 0].scatter(X_varied[:, 0], X_varied[:, 1], c=y_pred)
axs[1, 0].set_title("Unequal Variance")

y_pred = KMeans(n_clusters=3, **common_params).fit_predict(X_filtered)
axs[1, 1].scatter(X_filtered[:, 0], X_filtered[:, 1], c=y_pred)
axs[1, 1].set_title("Unevenly Sized Blobs")

plt.suptitle("Unexpected KMeans clusters").set_y(0.95)
plt.show()

y_pred = KMeans(n_clusters=3, **common_params).fit_predict(X)
plt.scatter(X[:, 0], X[:, 1], c=y_pred)
plt.title("Optimal Number of Clusters")
plt.show()
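A hedged aside (not part of the scikit-learn example this section follows): when the right n_clusters is not known in advance, silhouette scores offer one way to compare candidates:

from sklearn.metrics import silhouette_score

for k in (2, 3, 4, 5):
    labels = KMeans(n_clusters=k, n_init=10, random_state=random_state).fit_predict(X)
    print(k, round(silhouette_score(X, labels), 3))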

y_pred = KMeans(n_clusters=3, n_init=10, random_state=random_state).fit_predict(
    X_filtered
)
plt.scatter(X_filtered[:, 0], X_filtered[:, 1], c=y_pred)
plt.title("Unevenly Sized Blobs \nwith several initializations")
plt.show()

from sklearn.mixture import GaussianMixture

fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))

y_pred = GaussianMixture(n_components=3).fit_predict(X_aniso)
ax1.scatter(X_aniso[:, 0], X_aniso[:, 1], c=y_pred)
ax1.set_title("Anisotropically Distributed Blobs")

y_pred = GaussianMixture(n_components=3).fit_predict(X_varied)
ax2.scatter(X_varied[:, 0], X_varied[:, 1], c=y_pred)
ax2.set_title("Unequal Variance")

plt.suptitle("Gaussian mixture clusters").set_y(0.95)
plt.show()

From: https://www.cnblogs.com/pcr-2020310143107/p/17182185.html
