一、选题背景
在我们的生活中,健康非常重要;然而,受各种生活因素的影响,人们的健康状况也发生了很大的变化。根据美国疾病预防控制中心(CDC)的调研(该机构的监测系统每年进行电话调查,以收集有关美国居民健康状况的数据),心脏病是导致美国多数人群死亡的主要原因之一。约有 47% 的美国人至少具有三项心脏病关键风险因素(高血压、高胆固醇、吸烟)中的一项;与心脏病有一定联系的还有先天性因素等。预防心脏病非常有必要,因此,我将根据疾病预防控制中心提供的数据进行可视化数据分析,了解引起心脏病的一些因素,帮助每个人有效地预防。
二、大数据分析设计方案
- 本数据集的数据内容与数据特征分析
该原有的数据集包含 18 个变量(9 个布尔值、5 个字符串和 4 个小数),为了更方便进行数据可视化分析,我对数据进行了一些数据类型替换,如下表格所示:
字段名称 | 字段类型 | 说明 |
HeartDisease | int | 心脏病 ,0:无,1:有 |
BMI | float | 身体质量指数,数值越高说明身体越肥胖 |
Smoking | int | 抽烟,0:无,1:有 |
AlcoholDrinking | int | 酗酒,0:无,1:有 |
Stroke | int | 中风,0:无,1:有 |
PhysicalHealth | int | 身体健康(包括身体疾病和受伤),在过去的30天里有多少天你的身体健康不好?(0-30天) |
MentalHealth | int | 心理健康,在过去的30天里有多少天你的心理健康不好?(0-30天) |
DiffWalking | int | 行走是否困难,0:否,1:是 |
Sex | int | 性别,0:女,1:男 |
AgeCategory | int | 年龄区间,为了统一,年龄作以下修改,如:57岁是在55岁到59岁之间。(57=(55~59)) |
Race | string | 肤色 |
Diabetic | float | 糖尿病,0:无,0.5:临界,1:有 |
PhysicalActivity | int | 身体运动,0:无,1:有 |
GenHealth | int | 健康状态,0:差,1:一般,2:好,3:很好,4:非常好 |
SleepTime | int | 睡眠时间,数值越大睡眠时间越长 |
Asthma | int | 哮喘,0:无,1:有 |
KidneyDisease | int | 肾脏疾病,0:无,1:有 |
SkinCancer | int | 皮肤癌,0:无,1:有 |
三、数据分析步骤
1.数据源
该课程设计的数据集来源于疾病预防控制中心
该网站:Know Your Risk for Heart Disease | cdc.gov
2.数据清洗
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt

# Load the pre-processed survey data. read_csv already returns a DataFrame,
# so no extra pd.DataFrame() wrapper is needed.
heartD_file = pd.read_csv('heart_2020_cleaned_change.csv')
heartD_file
清洗之后的数据显示
重复值处理
# duplicated() only builds a per-row boolean mask; summing it reports how many
# rows repeat an earlier row, which is the figure needed to decide on cleanup.
heartD_file.duplicated().sum()
由于本次分析不把肤色(Race)作为引起心脏病的因素,因此将该无效列从数据中删除。
删除 Race 列:
# Remove the Race column in place, then preview the first rows to confirm.
heartD_file.drop(columns=['Race'], inplace=True)
heartD_file.head()
3.大数据分析过程
通过使用pandas提供的describe()方法来查看数据表中各数值字段的统计信息,这样我们可以更清楚地了解数据的整体分布情况:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
heartD_file.describe()
通过countplot()和pie()总结出酗酒、抽烟等人数分布
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# SimHei renders the Chinese titles and labels. Chinese fonts commonly lack the
# Unicode minus glyph, so unicode_minus is switched off (ASCII hyphen is used).
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

YES_NO = {0: '否', 1: '是'}

# One entry per variable:
# (column, plot title, {code: pie label}, code of the slice to pull out or None)
DISTRIBUTIONS = [
    ('Smoking', '抽烟人数', YES_NO, 1),
    ('AlcoholDrinking', '酗酒人数', YES_NO, 1),
    ('Stroke', '中风人数', YES_NO, 1),
    ('DiffWalking', '行动困难人数', YES_NO, 1),
    ('Sex', '性别', {0: '女', 1: '男'}, 1),
    ('Diabetic', '糖尿病人数', {0: '否', 0.5: '有时', 1: '是'}, 0),
    ('PhysicalActivity', '运动人数', YES_NO, 1),
    ('GenHealth', '健康状态',
     {0: '不好', 1: '一般', 2: '好', 3: '很好', 4: '非常好'}, None),
    ('Asthma', '哮喘人数', YES_NO, 1),
    ('KidneyDisease', '肾脏疾病人数', YES_NO, 1),
    ('SkinCancer', '皮肤癌人数', YES_NO, 1),
]


def plot_distribution(df, column, title, code_labels, highlight, bar_ax, pie_ax):
    """Draw a horizontal count plot and a pie chart for one coded column.

    Args:
        df: DataFrame holding the survey data.
        column: Name of the coded column to summarise.
        title: Title shown above both charts.
        code_labels: Mapping from each code in the column to its pie label.
        highlight: Code whose pie slice is pulled out, or None for no offset.
        bar_ax: Axes receiving the count plot.
        pie_ax: Axes receiving the pie chart.
    """
    # Passing the column as y already makes the bars horizontal.
    sns.countplot(data=df, y=column, hue=column, ax=bar_ax)
    bar_ax.set_title(title)

    # value_counts() is ordered by frequency, not by code, so the counts are
    # re-ordered by code to keep every slice paired with its own label.
    # Codes missing from code_labels are not drawn.
    codes = list(code_labels)
    counts = df[column].value_counts().reindex(codes, fill_value=0)
    pie_ax.pie(
        counts,
        labels=[code_labels[code] for code in codes],
        explode=[0.2 if code == highlight else 0 for code in codes],
        autopct='%1.2f%%',
    )
    pie_ax.set_title(title)


# One row per variable: count plot on the left, pie chart on the right.
fig, axes = plt.subplots(
    len(DISTRIBUTIONS), 2, figsize=(15, 5 * len(DISTRIBUTIONS))
)
for (column, title, code_labels, highlight), (bar_ax, pie_ax) in zip(
    DISTRIBUTIONS, axes
):
    plot_distribution(
        heartD_file, column, title, code_labels, highlight, bar_ax, pie_ax
    )
plt.tight_layout()
plt.show()
通过countplot()方法对不同数据组进行对比,显示如下:
# Columns compared against HeartDisease, one count plot each.
COMPARE_COLUMNS = [
    'Smoking', 'AlcoholDrinking',
    'Stroke', 'PhysicalHealth',
    'MentalHealth', 'DiffWalking',
    'Sex', 'AgeCategory',
]

# A single two-column grid sized to the number of plots, so no figure is left
# mostly empty.
n_rows = (len(COMPARE_COLUMNS) + 1) // 2
fig, axes = plt.subplots(n_rows, 2, figsize=(15, 5 * n_rows))
cells = axes.ravel()
for ax, column in zip(cells, COMPARE_COLUMNS):
    sns.countplot(data=heartD_file, x=column, hue='HeartDisease', ax=ax)
# Hide the spare cell when the number of columns is odd.
for ax in cells[len(COMPARE_COLUMNS):]:
    ax.set_visible(False)
plt.tight_layout()
plt.show()
分析发现,酗酒、性别不是导致心脏病的因素;而抽烟、中风、少运动、心情经常性不好、行动不方便和年龄的增长,则是可能引发心脏病的重要因素。
以下绘制小提琴图分析睡觉时长与心脏病的关系:
# Violin plot of SleepTime, split by HeartDisease (0 = no, 1 = yes).
sns.violinplot(data=heartD_file, x='HeartDisease', y='SleepTime')
plt.show()
数据分析表明睡眠与引起心脏病无关;
以下是糖尿病、年龄与心脏病的盒图:
# Box plots of AgeCategory per Diabetic level: overall on the left, split by
# HeartDisease on the right. Columns are referenced by name through data=.
fig, (overall_ax, split_ax) = plt.subplots(1, 2, figsize=(15, 10))
sns.boxplot(data=heartD_file, x='Diabetic', y='AgeCategory', ax=overall_ax)
sns.boxplot(
    data=heartD_file, x='Diabetic', y='AgeCategory', hue='HeartDisease',
    ax=split_ax,
)
plt.show()
由此得出结论糖尿病是会引起心脏病的,通常糖尿病的患者会有大动脉的合并症;
以下是针对人的BMI和健康状态(GenHealth)进行分析:
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns

# Four plots in a 2x2 grid so the whole figure is used.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Box plot of BMI for people without / with heart disease.
sns.boxplot(data=heartD_file, x='HeartDisease', y='BMI', ax=axes[0, 0])

# Linear fit of HeartDisease against BMI.
sns.regplot(data=heartD_file, x='BMI', y='HeartDisease', ax=axes[0, 1])

# Discrete x variable: add a little jitter so the points do not overlap.
sns.regplot(
    data=heartD_file, x='GenHealth', y='HeartDisease', x_jitter=.1,
    ax=axes[1, 0],
)

# Mean HeartDisease per GenHealth level, with the fitted line truncated to the
# data range. logx is not used: GenHealth contains 0 and log(0) is undefined.
sns.regplot(
    data=heartD_file, x='GenHealth', y='HeartDisease',
    x_estimator=np.mean, truncate=True, ax=axes[1, 1],
)
plt.show()
通过线性回归模型拟合的分析结果发现,患心脏病的概率会随着BMI(人体肥胖程度)数值的增大而变大;再看对人的健康状态(GenHealth)的分析,使用离散x变量绘制并添加一些抖动后也发现,人的健康状态越差,患心脏病的概率也越大。
以下是根据男女来分析各种情况:
import seaborn as sns

PAIR_COLUMNS = [
    'MentalHealth', 'PhysicalActivity', 'GenHealth',
    'Asthma', 'KidneyDisease', 'SkinCancer',
]

# One pair plot per sex, coloured by HeartDisease. Sex is coded 1 = male,
# 0 = female; male is drawn first.
for sex_code in (1, 0):
    sns.pairplot(
        heartD_file[heartD_file.Sex == sex_code],
        vars=PAIR_COLUMNS,
        hue='HeartDisease',
    )
    plt.show()
分析得出,中风、少运动、心情经常性不好、行动不方便和年龄的增长会引发心脏病。
以下是分析哮喘、肾病和皮肤癌
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns

OTHER_DISEASES = ['Asthma', 'SkinCancer', 'KidneyDisease']

# One row per disease: counts split by HeartDisease on the left, mean
# HeartDisease per group on the right.
fig, axes = plt.subplots(
    len(OTHER_DISEASES), 2, figsize=(15, 5 * len(OTHER_DISEASES))
)
for (count_ax, rate_ax), column in zip(axes, OTHER_DISEASES):
    sns.countplot(data=heartD_file, x=column, hue='HeartDisease', ax=count_ax)
    # logx is not used: these columns are coded 0/1 and log(0) is undefined.
    sns.regplot(
        data=heartD_file, x=column, y='HeartDisease',
        x_estimator=np.mean, truncate=True, ax=rate_ax,
    )
plt.tight_layout()
plt.show()
import seaborn as sns

# Pair plot of age and the three other diseases for rows with Sex == 0,
# coloured by HeartDisease.
sns.pairplot(
    heartD_file[heartD_file.Sex == 0],
    vars=['AgeCategory', 'KidneyDisease', 'Asthma', 'SkinCancer'],
    hue='HeartDisease',
)
plt.show()
数据分析得出哮喘、肾病和皮肤癌与心脏病没有直接的关系。
以下是总代码:
"""Visual analysis of heart-disease risk factors in the survey data set."""
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns

DATA_FILE = 'heart_2020_cleaned_change.csv'

# SimHei renders the Chinese titles and labels. Chinese fonts commonly lack the
# Unicode minus glyph, so unicode_minus is switched off (ASCII hyphen is used).
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

YES_NO = {0: '否', 1: '是'}

# (column, plot title, {code: pie label}, code of the slice to pull out or None)
DISTRIBUTIONS = [
    ('Smoking', '抽烟人数', YES_NO, 1),
    ('AlcoholDrinking', '酗酒人数', YES_NO, 1),
    ('Stroke', '中风人数', YES_NO, 1),
    ('DiffWalking', '行动困难人数', YES_NO, 1),
    ('Sex', '性别', {0: '女', 1: '男'}, 1),
    ('Diabetic', '糖尿病人数', {0: '否', 0.5: '有时', 1: '是'}, 0),
    ('PhysicalActivity', '运动人数', YES_NO, 1),
    ('GenHealth', '健康状态',
     {0: '不好', 1: '一般', 2: '好', 3: '很好', 4: '非常好'}, None),
    ('Asthma', '哮喘人数', YES_NO, 1),
    ('KidneyDisease', '肾脏疾病人数', YES_NO, 1),
    ('SkinCancer', '皮肤癌人数', YES_NO, 1),
]

COMPARE_COLUMNS = [
    'Smoking', 'AlcoholDrinking', 'Stroke', 'PhysicalHealth',
    'MentalHealth', 'DiffWalking', 'Sex', 'AgeCategory',
]
PAIR_COLUMNS = [
    'MentalHealth', 'PhysicalActivity', 'GenHealth',
    'Asthma', 'KidneyDisease', 'SkinCancer',
]
OTHER_DISEASES = ['Asthma', 'SkinCancer', 'KidneyDisease']


def load_data(path=DATA_FILE):
    """Read the CSV, report duplicates, drop Race and print an overview.

    Args:
        path: Location of the CSV file.

    Returns:
        The DataFrame without the Race column.
    """
    df = pd.read_csv(path)
    # Number of rows that repeat an earlier row.
    print(df.duplicated().sum())
    df = df.drop(columns=['Race'])
    print(df.head())
    # Summary statistics of the numeric columns.
    print(df.describe())
    return df


def plot_distributions(df):
    """Draw a count plot and a pie chart for every entry in DISTRIBUTIONS."""
    fig, axes = plt.subplots(
        len(DISTRIBUTIONS), 2, figsize=(15, 5 * len(DISTRIBUTIONS))
    )
    for (column, title, code_labels, highlight), (bar_ax, pie_ax) in zip(
        DISTRIBUTIONS, axes
    ):
        # Passing the column as y already makes the bars horizontal.
        sns.countplot(data=df, y=column, hue=column, ax=bar_ax)
        bar_ax.set_title(title)

        # value_counts() is ordered by frequency, not by code, so the counts
        # are re-ordered by code to keep each slice paired with its own label.
        codes = list(code_labels)
        counts = df[column].value_counts().reindex(codes, fill_value=0)
        pie_ax.pie(
            counts,
            labels=[code_labels[code] for code in codes],
            explode=[0.2 if code == highlight else 0 for code in codes],
            autopct='%1.2f%%',
        )
        pie_ax.set_title(title)
    plt.tight_layout()
    plt.show()


def plot_heart_disease_comparisons(df):
    """Draw one count plot per COMPARE_COLUMNS entry, split by HeartDisease."""
    n_rows = (len(COMPARE_COLUMNS) + 1) // 2
    fig, axes = plt.subplots(n_rows, 2, figsize=(15, 5 * n_rows))
    cells = axes.ravel()
    for ax, column in zip(cells, COMPARE_COLUMNS):
        sns.countplot(data=df, x=column, hue='HeartDisease', ax=ax)
    # Hide the spare cell when the number of columns is odd.
    for ax in cells[len(COMPARE_COLUMNS):]:
        ax.set_visible(False)
    plt.tight_layout()
    plt.show()


def plot_sleep_time(df):
    """Draw a violin plot of SleepTime split by HeartDisease."""
    sns.violinplot(data=df, x='HeartDisease', y='SleepTime')
    plt.show()


def plot_diabetes_age(df):
    """Draw box plots of AgeCategory per Diabetic level, overall and by HeartDisease."""
    fig, (overall_ax, split_ax) = plt.subplots(1, 2, figsize=(15, 10))
    sns.boxplot(data=df, x='Diabetic', y='AgeCategory', ax=overall_ax)
    sns.boxplot(
        data=df, x='Diabetic', y='AgeCategory', hue='HeartDisease', ax=split_ax
    )
    plt.show()


def plot_pairs_by_sex(df):
    """Draw one pair plot of PAIR_COLUMNS per sex (1 = male, 0 = female)."""
    for sex_code in (1, 0):
        sns.pairplot(
            df[df.Sex == sex_code], vars=PAIR_COLUMNS, hue='HeartDisease'
        )
        plt.show()


def plot_bmi_and_health(df):
    """Relate BMI and GenHealth to HeartDisease with box and regression plots."""
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    sns.boxplot(data=df, x='HeartDisease', y='BMI', ax=axes[0, 0])
    sns.regplot(data=df, x='BMI', y='HeartDisease', ax=axes[0, 1])
    # Discrete x variable: add a little jitter so the points do not overlap.
    sns.regplot(
        data=df, x='GenHealth', y='HeartDisease', x_jitter=.1, ax=axes[1, 0]
    )
    # Mean HeartDisease per level with a truncated fit. logx is not used:
    # GenHealth contains 0 and log(0) is undefined.
    sns.regplot(
        data=df, x='GenHealth', y='HeartDisease',
        x_estimator=np.mean, truncate=True, ax=axes[1, 1],
    )
    plt.show()


def plot_other_diseases(df):
    """Compare asthma, skin cancer and kidney disease against HeartDisease."""
    fig, axes = plt.subplots(
        len(OTHER_DISEASES), 2, figsize=(15, 5 * len(OTHER_DISEASES))
    )
    for (count_ax, rate_ax), column in zip(axes, OTHER_DISEASES):
        sns.countplot(data=df, x=column, hue='HeartDisease', ax=count_ax)
        # logx is not used: these columns are coded 0/1 and log(0) is undefined.
        sns.regplot(
            data=df, x=column, y='HeartDisease',
            x_estimator=np.mean, truncate=True, ax=rate_ax,
        )
    plt.tight_layout()
    plt.show()

    # Pair plot of age and the three diseases for rows with Sex == 0.
    sns.pairplot(
        df[df.Sex == 0],
        vars=['AgeCategory', 'KidneyDisease', 'Asthma', 'SkinCancer'],
        hue='HeartDisease',
    )
    plt.show()


def main():
    """Run the data cleaning and every plotting step in order."""
    heartD_file = load_data()
    plot_distributions(heartD_file)
    plot_heart_disease_comparisons(heartD_file)
    plot_sleep_time(heartD_file)
    plot_diabetes_age(heartD_file)
    plot_pairs_by_sex(heartD_file)
    plot_bmi_and_health(heartD_file)
    plot_other_diseases(heartD_file)


if __name__ == '__main__':
    main()
4.总结
- 在这次的数据分析学习里,我针对心脏病做了一些数据验证。分析发现,抽烟、中风、少运动、心情经常性不好、行动不方便、糖尿病、肥胖和年龄都是引起心脏病的因素。因此,我们需要保持规律的生活作息,多运动、少抽烟、多吃蔬菜、少吃高糖、高脂的食物;在体重上需要时刻进行把控;在身心上,需要调整自己的情绪、保护好自己,把健康生活放在第一位。
- 在学习过程中,难免会遇到很多困难,比如方法的使用、语句的规范等,最后通过各种方法来克服了这些困难,但最后呈现出来的结果可能过于简陋,接下来会继续努力进行改进。
标签:subplot,plt,个人,number,heartD,指标,心脏病,sns,file From: https://www.cnblogs.com/Tomqiaworld/p/16966025.html