【Python高级应用课程设计】大数据分析——中国时尚购物的动机

时间:2023-12-28  
数据字段说明(字段名:含义):
ID:User ID
Gender:Gender
Age:Age of Respondent
Edu:Education Level
Inc:Income Level
Emp:Employment Status
Monthly_Spend:Expenditure on Clothing
Retail_Platform:High Street or eCommerce Shopping
Adv:Hedonic: Adventure Shopping
Soc:Hedonic: Social Shopping
Grat:Hedonic: Gratification Shopping
Ide:Hedonic: Idea Shopping
Rol:Hedonic: Role Shopping
Val:Hedonic: Value Shopping
Eff:Utilitarian: Efficiency Shopping
Ach:Utilitarian: Achievement Shopping
MAH_1:Mahalanobis Distance
filter_$:MAH_1 < 26.13 (FILTER)
Spend:Expenditure on Clothing
Occupation:Occupation

1 数据源:
本文使用的数据集选自 Kaggle,链接:Motivations for Fashion Shopping in China (kaggle.com)。

2 导入数据

# Import modules.
# NOTE: the original listing contained "import inline", which is not an
# importable module; it presumably came from the Jupyter magic
# "%matplotlib inline". Run that magic in a notebook cell if inline figures
# are wanted.
import matplotlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Show every column when a DataFrame is printed.
pd.set_option('display.max_columns', None)
# Show every row when a DataFrame is printed.
pd.set_option('display.max_rows', None)

# Load the dataset.
df = pd.read_csv("shopping_motivations_in_china.csv")
print(df.head())

# Print basic information about the dataset.
df.info()


# Missing values per column.
missing_values = df.isnull().sum()
print("\nCheck for missing values:")
print(missing_values)

# Quick statistical summary.
summary = df.describe()
print(summary)

# Distinct values of every object-dtype column.
print("\nUnique Values:")
object_columns = df.select_dtypes(include='object').columns
for name in object_columns:
    distinct = df[name].unique()
    print(f"{name}: {distinct}")

# Dataset dimensions.
print("\nData Shape:")
print(df.shape)


3 各项数据的分布
# Gender distribution: number of respondents per gender.
gender_distribution = df['Gender'].value_counts()

# Plot the counts.
plt.figure(figsize=(8, 6))
sns.countplot(x='Gender', data=df)
plt.title('Distribution of Genders')
plt.xlabel('Gender')
plt.ylabel('Count')
plt.show()

# Print the raw counts, then the same distribution as percentages.
# The original announced percentages but only printed counts.
print("Gender Distribution:")
print(gender_distribution)
gender_percent = df['Gender'].value_counts(normalize=True) * 100
print("Gender Distribution (%):")
print(gender_percent.round(2))


# Income-level distribution: number of respondents per income level.
income_distribution = df['Inc'].value_counts()

# Plot the counts, bars ordered from most to least frequent.
# The already-computed counts supply the order, so value_counts() is not
# evaluated a second time.
plt.figure(figsize=(10, 6))
sns.countplot(x='Inc', data=df, order=income_distribution.index)
plt.title('Distribution of Income Levels')
plt.xlabel('Income Level')
plt.ylabel('Count')
plt.show()

# Print the raw counts, then the same distribution as percentages.
# The original announced percentages but only printed counts.
print("Income Level Distribution:")
print(income_distribution)
income_percent = df['Inc'].value_counts(normalize=True) * 100
print("Income Level Distribution (%):")
print(income_percent.round(2))


# Age distribution: occurrences of each value in the Age column.
age_column = df['Age']
age_distribution = age_column.value_counts()

# Count plot of the Age column, labelled through the returned Axes.
plt.figure(figsize=(8, 6))
age_ax = sns.countplot(data=df, x='Age')
age_ax.set_title('Distribution of age')
age_ax.set_xlabel('age')
age_ax.set_ylabel('Count')
plt.show()


# Employment-status distribution: number of respondents per status.
employment_distribution = df['Emp'].value_counts()

# Plot the counts, bars ordered from most to least frequent.
# The already-computed counts supply the order, so value_counts() is not
# evaluated a second time.
plt.figure(figsize=(10, 6))
sns.countplot(x='Emp', data=df, order=employment_distribution.index)
plt.title('Distribution of Employment Status')
plt.xlabel('Employment Status')
plt.ylabel('Count')
plt.show()

# Print the raw counts, then the same distribution as percentages.
# The original announced percentages but only printed counts.
print("Employment Status Distribution:")
print(employment_distribution)
employment_percent = df['Emp'].value_counts(normalize=True) * 100
print("Employment Status Distribution (%):")
print(employment_percent.round(2))


# Pie chart of the employment-status distribution.
employment_distribution = df['Emp'].value_counts()

# Label every wedge with its category and percentage share; without labels
# the wedges cannot be told apart. The x/y axis labels of the original were
# dropped because a pie chart has no meaningful axes.
plt.figure(figsize=(8, 6))
plt.pie(
    employment_distribution,
    labels=employment_distribution.index,
    autopct='%1.1f%%',
)
plt.title('Distribution of Employment Status')
plt.show()


# Histogram (20 bins, with a KDE curve) of the Monthly_Spend column.
plt.figure(figsize=(8, 6))
monthly_spend = df['Monthly_Spend']
spend_ax = sns.histplot(monthly_spend, bins=20, kde=True)
spend_ax.set_title('Distribution of Monthly Spending on Clothing')
spend_ax.set_xlabel('Monthly Spending (in Currency)')
spend_ax.set_ylabel('Count')
plt.show()


# How does the respondents' Mahalanobis distance (MAH_1) vary, and how many
# respondents fall below the given threshold?

# Cut-off used by the dataset's own filter column (MAH_1 < 26.13).
MAH_THRESHOLD = 26.13

# Distribution of the distance.
plt.figure(figsize=(8, 6))
sns.histplot(df['MAH_1'], bins=20, kde=True)
plt.title('Distribution of Mahalanobis Distance (MAH_1)')
plt.xlabel('Mahalanobis Distance')
plt.ylabel('Count')
plt.show()

# Number of respondents below the threshold.
# Fix: the original print() call was missing its closing parenthesis.
threshold_count = df[df['MAH_1'] < MAH_THRESHOLD].shape[0]
print(f"Number of respondents with MAH_1 < {MAH_THRESHOLD}: {threshold_count}")


# Utilitarian motivations: column-wise sum of the Eff and Ach columns.
utilitarian_cols = ['Eff', 'Ach']
utilitarian_frame = df[utilitarian_cols]
utilitarian_distribution = utilitarian_frame.sum()

# Bar plot of the per-column sums, labelled through the returned Axes.
plt.figure(figsize=(8, 6))
utilitarian_ax = sns.barplot(
    x=utilitarian_distribution.index,
    y=utilitarian_distribution.values,
)
utilitarian_ax.set_title('Distribution of Utilitarian Motivations for Shopping')
utilitarian_ax.set_xlabel('Utilitarian Motivation')
utilitarian_ax.set_ylabel('Count')
plt.show()

# Print the per-column sums.
print("Utilitarian Motivations Distribution:")
print(utilitarian_distribution)


# Hedonic motivations: column-wise sum of the six hedonic columns.
hedonic_cols = ['Adv', 'Soc', 'Grat', 'Ide', 'Rol', 'Val']
hedonic_frame = df[hedonic_cols]
hedonic_distribution = hedonic_frame.sum()

# Bar plot of the per-column sums, labelled through the returned Axes.
plt.figure(figsize=(10, 6))
hedonic_ax = sns.barplot(
    x=hedonic_distribution.index,
    y=hedonic_distribution.values,
)
hedonic_ax.set_title('Distribution of Hedonic Motivations for Shopping')
hedonic_ax.set_xlabel('Hedonic Motivation')
hedonic_ax.set_ylabel('Count')
plt.show()

# Print the per-column sums.
print("Hedonic Motivations Distribution:")
print(hedonic_distribution)


# Scatter plot of monthly spending against age.
plt.figure(figsize=(8, 6))
scatter_ax = sns.scatterplot(data=df, x='Age', y='Monthly_Spend')
scatter_ax.set_title('Relationship between Age and Monthly Spending')
scatter_ax.set_xlabel('Age')
scatter_ax.set_ylabel('Monthly Spending (in Currency)')
plt.show()

# Correlation between the Age and Monthly_Spend columns.
age_series = df['Age']
spend_series = df['Monthly_Spend']
correlation_age_spending = age_series.corr(spend_series)
print(f"Correlation between Age and Monthly Spending: {correlation_age_spending}")


# Box plot of monthly spending grouped by retail platform.
plt.figure(figsize=(10, 8))
platform_ax = sns.boxplot(data=df, x='Retail_Platform', y='Monthly_Spend')
platform_ax.set_title('Monthly Spending Distribution by Retail Platform')
platform_ax.set_xlabel('Retail Platform')
platform_ax.set_ylabel('Monthly Spending (in Currency)')
plt.show()


# Box plot of the Mahalanobis distance (MAH_1) grouped by occupation.
plt.figure(figsize=(10, 8))
occupation_ax = sns.boxplot(data=df, x='Occupation', y='MAH_1')
occupation_ax.set_title('Mahalanobis Distance Distribution by Occupation')
occupation_ax.set_xlabel('Occupation')
occupation_ax.set_ylabel('Mahalanobis Distance')
plt.show()


# Pairwise correlation matrix of the eight motivation columns.
motivation_columns = ['Adv', 'Soc', 'Grat', 'Ide', 'Rol', 'Val', 'Eff', 'Ach']
motivations_corr = df[motivation_columns].corr()

# Annotated heat map of the matrix.
plt.figure(figsize=(10, 8))
corr_ax = sns.heatmap(
    motivations_corr,
    annot=True,
    cmap='coolwarm',
    fmt=".2f",
)
corr_ax.set_title('Pairwise Correlation among Motivational Columns')
plt.show()


# Violin plot of monthly spending grouped by the Age column.
plt.figure(figsize=(8, 6))
violin_ax = sns.violinplot(data=df, x='Age', y='Monthly_Spend')
violin_ax.set_title('Monthly Spending Distribution by Age Group')
violin_ax.set_xlabel('Age Group')
violin_ax.set_ylabel('Monthly Spending (in Currency)')
plt.show()


# One box plot per hedonic motivation, grouped by income level, laid out on
# a 2 x 3 grid.
hedonic_cols = ['Adv', 'Soc', 'Grat', 'Ide', 'Rol', 'Val']
plt.figure(figsize=(14, 8))
# enumerate(start=1) yields the 1-based subplot slot directly, replacing the
# repeated list.index() lookup of the original.
for position, col in enumerate(hedonic_cols, start=1):
    plt.subplot(2, 3, position)
    sns.boxplot(x='Inc', y=col, data=df)
    plt.title(f'{col} by Income Level')
plt.tight_layout()
plt.show()


# Box plot of the Mahalanobis distance (MAH_1) grouped by retail platform.
plt.figure(figsize=(8, 6))
mah_platform_ax = sns.boxplot(data=df, x='Retail_Platform', y='MAH_1')
mah_platform_ax.set_title('Mahalanobis Distance Distribution by Retail Platform')
mah_platform_ax.set_xlabel('Retail Platform')
mah_platform_ax.set_ylabel('Mahalanobis Distance')
plt.show()


# Add a column holding the row-wise sum of the six hedonic motivation columns.
hedonic_items = ['Adv', 'Soc', 'Grat', 'Ide', 'Rol', 'Val']
df['Combined_Hedonic'] = df[hedonic_items].sum(axis=1)

# Scatter plot of the Mahalanobis distance against the combined score.
plt.figure(figsize=(8, 6))
combined_ax = sns.scatterplot(data=df, x='Combined_Hedonic', y='MAH_1')
combined_ax.set_title('Mahalanobis Distance vs. Combined Hedonic Motivations')
combined_ax.set_xlabel('Combined Hedonic Motivations')
combined_ax.set_ylabel('Mahalanobis Distance')
plt.show()


# Filled two-dimensional KDE plot of Monthly_Spend (x) against Age (y).
plt.figure(figsize=(8, 6))
kde_ax = sns.kdeplot(
    data=df,
    x='Monthly_Spend',
    y='Age',
    fill=True,
)
kde_ax.set_title('Kernel Density Estimation for Monthly Spending and Age')
plt.show()


# Correlation matrix over all numeric columns.
# Restrict to numeric dtypes explicitly: calling df.corr() on a frame that
# still holds non-numeric columns raises in pandas >= 2.0, where such
# columns are no longer dropped silently.
numeric_df = df.select_dtypes(include='number')
all_numeric_corr = numeric_df.corr()

# Annotated heat map of the matrix.
plt.figure(figsize=(16, 12))
sns.heatmap(all_numeric_corr, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix for All Numerical Columns')
plt.show()


# Radar chart of the mean score of each motivation column.
from math import pi

motivation_cols = ['Adv', 'Soc', 'Grat', 'Ide', 'Rol', 'Val', 'Eff', 'Ach']
motivation_data = df[motivation_cols].mean()

# One evenly spaced angle per motivation; the first point is repeated at the
# end so the outline closes.
angles = [n / float(len(motivation_cols)) * 2 * pi for n in range(len(motivation_cols))]
angles += angles[:1]
values = motivation_data.tolist() + motivation_data.tolist()[:1]

plt.figure(figsize=(8, 8))
plt.polar(angles, values, marker='.')
plt.fill(angles, values, alpha=0.25)
# Label each spoke with its motivation name; the original left the spokes
# unlabeled, so the chart could not be read. The duplicated closing angle is
# skipped.
plt.xticks(angles[:-1], motivation_cols)
plt.title('Radar Chart for Shopping Motivations')
plt.show()


# Multidimensional scaling of the differences in shopping motivations.
# NOTE(review): the titles and axis labels say "NMDS", but MDS is constructed
# here without any explicit metric / non-metric setting -- confirm against
# the scikit-learn documentation that this is the intended variant.
from sklearn.metrics import euclidean_distances
from sklearn.manifold import MDS

motivation_cols = ['Adv', 'Soc', 'Grat', 'Ide', 'Rol', 'Val', 'Eff', 'Ach']
motivations_data = df[motivation_cols]

# Pairwise Euclidean distances between rows, passed to MDS as a precomputed
# dissimilarity matrix.
distances = euclidean_distances(motivations_data)
mds = MDS(
    n_components=2,
    dissimilarity='precomputed',
    random_state=42,
)
mds_result = mds.fit_transform(distances)

# Scatter plot of the two resulting dimensions.
first_dim = mds_result[:, 0]
second_dim = mds_result[:, 1]
plt.figure(figsize=(8, 6))
plt.scatter(first_dim, second_dim)
plt.title('Scatter Plot of NMDS Results for Shopping Motivations')
plt.xlabel('NMDS Dimension 1')
plt.ylabel('NMDS Dimension 2')
plt.show()


# ANOVA: does monthly spending differ by education level?
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Fit the model. C(Edu) marks Edu as categorical, so a numerically coded Edu
# column is compared group by group rather than fitted as a single
# continuous regressor.
# NOTE(review): the dtype of Edu is not visible here; if it is already a
# string column, C() changes nothing but the term's label in the table.
model = ols('Monthly_Spend ~ C(Edu)', data=df).fit()
anova_table = sm.stats.anova_lm(model, typ=2)

# Show the ANOVA table.
print("ANOVA Table for Monthly Spending by Education Level:")
print(anova_table)


# Chi-square test on the Gender x Retail_Platform contingency table.
from scipy.stats import chi2_contingency

# Contingency table of the two columns.
gender_column = df['Gender']
platform_column = df['Retail_Platform']
gender_retail_crosstab = pd.crosstab(gender_column, platform_column)

# Run the test; only the statistic and the p-value are reported.
chi2, p_value, _, _ = chi2_contingency(gender_retail_crosstab)
print(f"Chi-square statistic: {chi2}")
print(f"P-value: {p_value}")



