1.题干
AQI and Lat Long of Countries.csv数据集的目标是为不同地区的空气质量提供有价值的见解,使研究人员和政策制定者能够就如何解决空气污染问题做出明智的决定。该数据集由两个独立的数据集合并而成,一个包含城市及其相应的经纬度坐标信息,另一个包含世界各国的空气污染水平数据。通过结合这两个数据集,我们现在可以分析和比较各国不同城市的空气质量指数。
任务要求:读取数据,查看数据集的前10行和基本信息,输出基本统计量,删除只有唯一取值的分类变量,以纬度和经度为横纵坐标绘制全世界各城市的空气质量散点图,对分类变量'Ozone AQI Category','PM2.5 AQI Category'进行编码转换,依据经纬度坐标信息通过K-近邻,决策树,随机森林,支持向量机分别进行分类,输出分类报告、准确率得分与混淆矩阵。
2.数据格式
3.代码
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
def printf(n: int, strf: str) -> None:
    """Print a section header to stdout.

    The header consists of an empty line, a rule made of ``n`` dashes,
    ``strf`` wrapped in the ANSI bold escape sequence, and a trailing
    empty line.

    Args:
        n: Number of ``-`` characters in the separator rule.
        strf: Title text to display in bold.
    """
    print()
    print('-' * n)
    # \033[1m switches the terminal to bold, \033[0m resets it.
    print(f"\033[1m{strf}\033[0m")
    print()
# Load the merged AQI / city-coordinates dataset (path is relative to the
# working directory the script is started from).
data = pd.read_csv('dataset/AQI and Lat Long of Countries.csv')

printf(100, '查看数据基本信息')
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() would add a stray "None" line to the output.
data.info()

printf(100, '查看前10行数据')
print(data.head(10))

printf(100, '输出基本统计量')
print(data.describe())
# Remove object-dtype (text) columns whose nunique() is exactly 1: a column
# with a single category carries no information for the later analysis.
# The names are collected first and dropped in one call, so the frame is not
# modified while its columns are still being iterated.
# NOTE(review): nunique() skips missing values by default, so a column with
# one category plus NaNs is also dropped -- confirm this is intended.
constant_text_columns = [
    col
    for col in data.columns
    if data[col].dtype == 'object' and data[col].nunique() == 1
]
data.drop(columns=constant_text_columns, inplace=True)
# Scatter plot of every city: longitude on the x axis, latitude on the y
# axis, marker colour taken from the 'AQI Value' column.
fig, ax = plt.subplots()
points = ax.scatter(
    data['lng'],
    data['lat'],
    c=data['AQI Value'],
    cmap='viridis',
    s=10,
)
# The colour bar maps marker colours back to AQI values.
fig.colorbar(points, ax=ax, label='AQI Value')
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
ax.set_title('Air Quality Index of Cities Worldwide')
plt.show()
# Replace the two AQI category columns by integer codes. The same encoder
# object is refitted for each column, so after the loop it only holds the
# class mapping of the last column processed.
label_encoder = LabelEncoder()
for category_column in ('Ozone AQI Category', 'PM2.5 AQI Category'):
    data[category_column] = label_encoder.fit_transform(data[category_column])
# Features are the city coordinates only; each encoded AQI category column
# is used in turn as the classification target.
X = data[['lat', 'lng']]

for c in ['Ozone AQI Category', 'PM2.5 AQI Category']:
    printf(200, f'分类变量 {c} 的分类报告')
    y = data[c]

    # 70/30 hold-out split with a fixed seed so the split is repeatable.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42
    )

    # Fit the scaler on the training part only and reuse its statistics for
    # the test part, so no information from the test set leaks into training.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Fresh, unfitted estimators for every target. The tree-based models are
    # seeded as well; without a seed their results differ between runs even
    # though the data split is fixed.
    classifiers = {
        'K-Nearest Neighbors': KNeighborsClassifier(),
        'Decision Tree': DecisionTreeClassifier(random_state=42),
        'Random Forest': RandomForestClassifier(random_state=42),
        'Support Vector Machine': SVC(),
    }

    for clf_name, clf in classifiers.items():
        clf.fit(X_train_scaled, y_train)
        y_pred = clf.predict(X_test_scaled)

        print(f"\n{clf_name}:")
        # zero_division=0: a class that is never predicted gets precision 0
        # instead of 1, which would otherwise inflate the averaged scores.
        print(classification_report(y_test, y_pred, zero_division=0))
        print(f"Accuracy Score: {accuracy_score(y_test, y_pred)}")
        cm = confusion_matrix(y_test, y_pred)
        print(f"Confusion Matrix:\n{cm}")