文章目录
数据集说明
- 在这个借助
Goodbook
网站收集的数据集中,可以获得有关书籍的信息,如作者、页数、评分和其他信息
文件说明
books.csv
bookID,title,authors,average_rating,isbn,isbn13,language_code, num_pages,ratings_count,text_reviews_count,publication_date,publisher
1,Harry Potter and the Half-Blood Prince (Harry Potter #6),J.K. Rowling/Mary GrandPré,4.57,0439785960,9780439785969,eng,652,2095690,27591,9/16/2006,Scholastic Inc.
2,Harry Potter and the Order of the Phoenix (Harry Potter #5),J.K. Rowling/Mary GrandPré,4.49,0439358078,9780439358071,eng,870,2153167,29221,9/1/2004,Scholastic Inc.
4,Harry Potter and the Chamber of Secrets (Harry Potter #2),J.K. Rowling,4.42,0439554896,9780439554893,eng,352,6333,244,11/1/2003,Scholastic
...
业务需求
(1)统计最受关注的书籍Top 10
(2)统计书籍篇幅Top 10
(3)统计不同出版社出版的书籍数量
(4)统计不同语言的书籍数量
(5)统计最不受关注的高分书籍Top 10(评分4.5以上,评分人数1w以上,评论数200以下)
(6)统计不同年份出版的书籍数量
(7)统计不同作者的书籍的平均评分
(8)统计在最受关注的书籍Top 1000中,不同出版社出版的书籍数量
(9)统计在最受关注的书籍Top 1000中,不同语言的书籍数量
(10)统计不同作者的书籍的平均受关注程度
需求实现
数据预处理
# -*- coding: utf-8 -*-
# @Time : 2024/12/14 0:49
# @Author : 从心
# @File : spark_book_recommendation_analysis_preprocess.py
# @Software : PyCharm
import pandas as pd
import numpy as np
df = pd.read_csv('../data/books.csv', on_bad_lines='skip')
df.columns = df.columns.str.strip()
print(df.head(3))
print(df.info())
df = df.dropna()
print(df.info())
df = df.drop_duplicates(keep='first')
print(df.info())
def convert_date(date_str):
try:
converted_date = pd.to_datetime(date_str, format='%m/%d/%Y')
return converted_date.strftime('%Y-%m-%d')
except ValueError as e:
print(f"{date_str} 转换失败: {e}")
return np.nan
df = df['publication_date'].apply(convert_date)
df = df.dropna()
print(df.info())
print(df['language_code'].unique())
df.to_csv('../data/books_cleaned.csv', encoding='utf-8', index=False)
数据统计分析
# -*- coding: utf-8 -*-
# @Time : 2024/12/14 0:50
# @Author : 从心
# @File : spark_book_recommendation_analysis.py
# @Software : PyCharm
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import date_format, split, rank
from pyspark.sql.window import Window
spark = SparkSession.builder.config(conf=SparkConf()).getOrCreate()
# 视图 books
df_books = spark.read.csv('/input_spark_book_recommendation_analysis/books_cleaned.csv', header=True, inferSchema=True)
df_books.show(10)
df_books.createOrReplaceTempView('books')
"""
(1) 统计最受关注的书籍 Top 10
"""
df_books_attention_top_10 = spark.sql(
"""
select bookID, title, text_reviews_count, substring_index(authors, '/', 1) as author_first, average_rating, isbn, isbn13, language_code, num_pages, ratings_count, publication_date, publisher
from books
order by text_reviews_count desc
"""
)
df_books_attention_top_10 = df_books_attention_top_10.repartition(1)
df_books_attention_top_10.show(n=10, truncate=False)
df_books_attention_top_10.write.csv('/result/books_attention_top_10.csv',
mode='overwrite')
"""
(2) 统计书籍篇幅 Top 10
"""
df_books_length_top_10 = spark.sql(
"""
select bookID, title, num_pages, substring_index(authors, '/', 1) as author_first, average_rating, isbn, isbn13, language_code, ratings_count, text_reviews_count, publication_date, publisher
from books
order by num_pages desc
"""
)
df_books_length_top_10 = df_books_length_top_10.repartition(1)
df_books_length_top_10.show(n=10, truncate=False)
df_books_length_top_10.write.csv('/result/books_length_top_10.csv', mode='overwrite')
"""
(3) 统计不同出版社出版的书籍数量
"""
df_publisher_books_num = spark.sql(
"""
select publisher, count(*) as books_num
from books
group by publisher
order by books_num desc
"""
)
df_publisher_books_num = df_publisher_books_num.repartition(1)
df_publisher_books_num.show(n=10, truncate=False)
df_publisher_books_num.write.csv('/result/publisher_books_num.csv',
mode='overwrite')
"""
(4) 统计不同语言的书籍数量
"""
df_language_books_num = spark.sql(
"""
select language_code, count(*) as books_num
from books
group by language_code
order by books_num desc
"""
)
df_language_books_num = df_language_books_num.repartition(1)
df_language_books_num.show(n=10, truncate=False)
df_language_books_num.write.csv('/result/language_books_num.csv', mode='overwrite')
"""
(5) 统计最不受关注的高分书籍 Top 10 (评分 4.5 以上, 评分人数 1w 以上, 评论数 200 以下)
"""
df_books_rating_no_attention_top_10 = spark.sql(
"""
select bookID, title, substring_index(authors, '/', 1) as author_first, average_rating, isbn, isbn13, language_code, num_pages, ratings_count, text_reviews_count, publication_date, publisher
from books
where average_rating > 4.5 and ratings_count > 10000 and text_reviews_count < 200
order by text_reviews_count asc
"""
)
df_books_rating_no_attention_top_10 = df_books_rating_no_attention_top_10.repartition(1)
df_books_rating_no_attention_top_10.show(n=10, truncate=False)
df_books_rating_no_attention_top_10.write.csv(
'/result/books_rating_no_attention_top_10.csv', mode='overwrite')
# 视图 books_with_year
df_books_with_year = df_books.withColumn('year', date_format(df_books['publication_date'], 'yyyy'))
df_books_with_year.show(10)
df_books_with_year.createOrReplaceTempView('books_with_year')
"""
(6) 统计不同年份出版的书籍数量
"""
df_year_books_num = spark.sql(
"""
select year, count(*) as books_num
from books_with_year
group by year
order by year asc
"""
)
df_year_books_num = df_year_books_num.repartition(1)
df_year_books_num.show(n=10, truncate=False)
df_year_books_num.write.csv('/result/year_books_num.csv', mode='overwrite')
# 视图 books_with_author_first
df_books_with_author_first = df_books.withColumn('author_first', split(df_books['authors'], '/').getItem(0))
df_books_with_author_first.show(10)
df_books_with_author_first.createOrReplaceTempView('books_with_author_first')
"""
(7) 统计不同作者的书籍的平均评分
"""
df_author_books_avg_rating = spark.sql(
"""
select author_first, sum(average_rating * ratings_count) / sum(ratings_count) as avg_rating, count(*) as books_num
from books_with_author_first
group by author_first
order by avg_rating desc, books_num desc
"""
)
df_author_books_avg_rating = df_author_books_avg_rating.repartition(1)
df_author_books_avg_rating.show(n=10, truncate=False)
df_author_books_avg_rating.write.csv('/result/author_books_avg_rating.csv',
mode='overwrite')
# 视图 books_attention_top_1000
window = Window.orderBy(df_books_with_author_first['text_reviews_count'].desc())
df_books_attention_rank = df_books_with_author_first.withColumn('rank', rank().over(window))
df_books_attention_top_1000 = df_books_attention_rank.filter(df_books_attention_rank['rank'] <= 1000).drop('rank')
df_books_attention_top_1000.show(10)
df_books_attention_top_1000.createOrReplaceTempView('books_attention_top_1000')
"""
(8) 统计在最受关注的书籍 Top 1000 中, 不同出版社出版的书籍数量
"""
df_publisher_books_top_1000_num = spark.sql(
"""
select publisher, count(*) as books_num
from books_attention_top_1000
group by publisher
order by books_num desc
"""
)
df_publisher_books_top_1000_num.show(n=10, truncate=False)
df_publisher_books_top_1000_num.write.csv('/result/publisher_books_top_1000_num.csv',
mode='overwrite')
"""
(9) 统计在最受关注的书籍 Top 1000 中, 不同语言的书籍数量
"""
df_language_books_top_1000_num = spark.sql(
"""
select language_code, count(*) as books_num
from books_attention_top_1000
group by language_code
order by books_num desc
"""
)
df_language_books_top_1000_num.show(n=10, truncate=False)
df_language_books_top_1000_num.write.csv('/result/language_books_top_1000_num.csv',
mode='overwrite')
"""
(10) 统计不同作者的书籍的平均受关注程度
"""
df_author_books_avg_attention = spark.sql(
"""
select author_first, sum(text_reviews_count) / count(*) as avg_attention, count(*) as books_num
from books_with_author_first
group by author_first
order by avg_attention desc, books_num desc
"""
)
df_author_books_avg_attention = df_author_books_avg_attention.repartition(1)
df_author_books_avg_attention.show(n=10, truncate=False)
df_author_books_avg_attention.write.csv('/result/author_books_avg_attention.csv',
mode='overwrite')
结果可视化
# -*- coding: utf-8 -*-
# @Time : 2024/12/14 0:50
# @Author : 从心
# @File : spark_book_recommendation_analysis_visualization.py
# @Software : PyCharm
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
import matplotlib.colors as mcolors
def chart_1():
"""
(1) 统计最受关注的书籍 Top 10
"""
csv_path = '../result/books_attention_top_10.csv/part-r-00000-d44ddace-d5f4-42df-980d-d3e236064461.csv'
names = ['bookID', 'title', 'text_reviews_count', 'author_first', 'average_rating', 'isbn', 'isbn13',
'language_code',
'num_pages', 'ratings_count', 'publication_date', 'publisher']
df = pd.read_csv(csv_path, header=None, names=names)
df = df.head(10)
df = df.sort_values(by='text_reviews_count', ascending=True)
plt.figure(figsize=(16, 9))
y = df['title']
x = df['text_reviews_count']
plt.barh(y, x, color='skyblue')
for idx, value in enumerate(x):
plt.text(value, idx, f'{value}', va='center', ha='left')
plt.title('最受关注的书籍 Top 10', fontsize=16, fontweight='bold')
plt.ylabel('标题', fontsize=16, fontweight='bold')
plt.xlabel('评论数', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.savefig('../visualization/[1]books_attention_top_10.png')
plt.show()
def chart_2():
"""
(2) 统计书籍篇幅 Top 10
"""
csv_path = '../result/books_length_top_10.csv/part-r-00000-7be4f0a3-b317-408f-ba8a-41be340d193b.csv'
names = ['bookID', 'title', 'num_pages', 'author_first', 'average_rating', 'isbn', 'isbn13', 'language_code',
'ratings_count', 'text_reviews_count', 'publication_date', 'publisher']
df = pd.read_csv(csv_path, header=None, names=names)
df = df.head(10)
df = df.sort_values(by='num_pages', ascending=True)
plt.figure(figsize=(16, 9))
y = df['title']
x = df['num_pages']
plt.barh(y, x, color='skyblue')
for idx, value in enumerate(x):
plt.text(value, idx, f'{value}', va='center', ha='left')
plt.title('书籍篇幅 Top 10', fontsize=16, fontweight='bold')
plt.ylabel('标题', fontsize=16, fontweight='bold')
plt.xlabel('页数', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.savefig('../visualization/[2]books_length_top_10.png')
plt.show()
def chart_3():
"""
(3) 统计不同出版社出版的书籍数量
"""
csv_path = '../result/publisher_books_num.csv/part-r-00000-f627a337-9aed-44ac-a37c-6c03582640dc.csv'
names = ['publisher', 'books_num']
df = pd.read_csv(csv_path, header=None, names=names)
df = df.head(50)
plt.figure(figsize=(16, 9))
x = df['publisher']
y = df['books_num']
bars = plt.bar(x, y, color='skyblue')
for bar in bars:
height = bar.get_height()
plt.text(bar.get_x() + bar.get_width() / 2.0, height, f'{int(height)}', ha='center', rotation=0)
plt.title('不同出版社出版的书籍数量', fontsize=16, fontweight='bold')
plt.xlabel('出版社', fontsize=16, fontweight='bold')
plt.ylabel('书籍数量', fontsize=16, fontweight='bold')
plt.xticks(rotation=90)
plt.tight_layout()
plt.savefig('../visualization/[3]publisher_books_num.png')
plt.show()
def chart_4():
"""
(4) 统计不同语言的书籍数量
"""
csv_path = '../result/language_books_num.csv/part-r-00000-28c77665-f13b-4bee-aef6-a83511f4213a.csv'
names = ['language_code', 'books_num']
df = pd.read_csv(csv_path, header=None, names=names)
df = df.head(50)
plt.figure(figsize=(16, 9))
x = df['language_code']
y = df['books_num']
bars = plt.bar(x, y, color='skyblue')
for bar in bars:
height = bar.get_height()
plt.text(bar.get_x() + bar.get_width() / 2.0, height, f'{int(height)}', ha='center', rotation=0)
plt.title('不同语言的书籍数量', fontsize=16, fontweight='bold')
plt.xlabel('语言', fontsize=16, fontweight='bold')
plt.ylabel('书籍数量', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.savefig('../visualization/[4]language_books_num.png')
plt.show()
def chart_5():
"""
(5) 统计最不受关注的高分书籍 Top 10 (评分 4.5 以上, 评分人数 1w 以上, 评论数 200 以下)
"""
csv_path = '../result/books_rating_no_attention_top_10.csv/part-r-00000-ec31e0ee-9f5b-4d11-bbd2-5cc0633be073.csv'
names = ['bookID', 'title', 'author_first', 'average_rating', 'isbn', 'isbn13', 'language_code', 'num_pages',
'ratings_count', 'text_reviews_count', 'publication_date', 'publisher']
df = pd.read_csv(csv_path, header=None, names=names)
df = df.head(10)
df = df.sort_values(by='average_rating', ascending=True)
plt.figure(figsize=(16, 9))
y = df['title']
x = df['average_rating']
plt.barh(y, x, color='skyblue')
for idx, value in enumerate(x):
plt.text(value, idx, f'{value}', va='center', ha='left')
plt.title('最不受关注的高分书籍 Top 10', fontsize=16, fontweight='bold')
plt.ylabel('标题', fontsize=16, fontweight='bold')
plt.xlabel('平均评分', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.savefig('../visualization/[5]books_rating_no_attention_top_10.png')
plt.show()
def chart_6():
"""
(6) 统计不同年份出版的书籍数量
"""
csv_path = '../result/year_books_num.csv/part-r-00000-40448964-0687-400c-94da-d390ca57f2d8.csv'
names = ['year', 'books_num']
df = pd.read_csv(csv_path, header=None, names=names)
plt.figure(figsize=(16, 9))
x = df['year']
y = df['books_num']
plt.plot(x, y, marker='o', linestyle='-', color='skyblue')
plt.title('不同年份出版的书籍数量', fontsize=16, fontweight='bold')
plt.xlabel('年份', fontsize=16, fontweight='bold')
plt.ylabel('书籍数量', fontsize=16, fontweight='bold')
plt.grid(True, which='both', linestyle='--', linewidth=0.5)
plt.tight_layout()
plt.savefig('../visualization/[6]year_books_num.png')
plt.show()
def chart_7():
"""
(7) 统计不同作者的书籍的平均评分
"""
csv_path = '../result/author_books_avg_rating.csv/part-r-00000-5601e037-dc5b-4cd1-ba34-b2207717cb27.csv'
names = ['author_first', 'avg_rating', 'books_num']
df = pd.read_csv(csv_path, header=None, names=names)
df = df.head(50)
plt.figure(figsize=(16, 9))
x = df['author_first']
y = df['avg_rating']
bars = plt.bar(x, y, color='skyblue')
for bar in bars:
height = bar.get_height()
plt.text(bar.get_x() + bar.get_width() / 2.0, height, f'{height:.2f}', ha='center', rotation=45)
plt.title('不同作者的书籍的平均评分', fontsize=16, fontweight='bold')
plt.xlabel('第一作者', fontsize=16, fontweight='bold')
plt.ylabel('平均评分', fontsize=16, fontweight='bold')
plt.xticks(rotation=90)
plt.tight_layout()
plt.savefig('../visualization/[7]author_books_avg_rating.png')
plt.show()
def chart_8():
"""
(8) 统计在最受关注的书籍 Top 1000 中, 不同出版社出版的书籍数量
"""
csv_path = '../result/publisher_books_top_1000_num.csv/part-r-00000-c9d19260-6251-446a-b55a-2f6864e295f7.csv'
names = ['publisher', 'books_num']
df = pd.read_csv(csv_path, header=None, names=names)
df = df.head(10)
plt.figure(figsize=(9, 9))
labels = df['publisher']
x = df['books_num']
plt.pie(x, labels=labels, autopct='%1.1f%%', startangle=0)
plt.title('在最受关注的书籍 Top 1000 中, 不同出版社出版的书籍数量')
plt.tight_layout()
plt.savefig('../visualization/[8]publisher_books_top_1000_num.png')
plt.show()
def chart_9():
"""
(9) 统计在最受关注的书籍 Top 1000 中, 不同语言的书籍数量
"""
csv_path = '../result/language_books_top_1000_num.csv/part-r-00000-ecab523d-5d46-4e4f-a5f4-8dc083e049e2.csv'
names = ['language_code', 'books_num']
df = pd.read_csv(csv_path, header=None, names=names)
df = df.head(10)
plt.figure(figsize=(9, 9))
labels = df['language_code']
x = df['books_num']
wedges = plt.pie(x, labels=None, autopct='%1.1f%%', startangle=0)[0]
handles = [plt.Rectangle((0, 0), 1, 1, color=mcolors.to_rgba(wedge.get_facecolor())) for wedge in wedges]
plt.legend(handles, labels, title='语言', loc='upper right', bbox_to_anchor=(0.9, 0.9))
plt.title('在最受关注的书籍 Top 1000 中, 不同语言的书籍数量')
plt.tight_layout()
plt.savefig('../visualization/[9]language_books_top_1000_num.png')
plt.show()
def chart_10():
"""
(10) 统计不同作者的书籍的平均受关注程度
"""
csv_path = '../result/author_books_avg_attention.csv/part-r-00000-b82eadc1-8db8-494d-befa-34bf3834cfb1.csv'
names = ['author_first', 'avg_attention', 'books_num']
df = pd.read_csv(csv_path, header=None, names=names)
df = df.head(50)
plt.figure(figsize=(16, 9))
x = df['author_first']
y = df['avg_attention']
bars = plt.bar(x, y, color='skyblue')
for bar in bars:
height = bar.get_height()
plt.text(bar.get_x() + bar.get_width() / 2.0, height, f'{int(height)}', ha='center', rotation=45)
plt.title('不同作者的书籍的平均受关注程度', fontsize=16, fontweight='bold')
plt.xlabel('第一作者', fontsize=16, fontweight='bold')
plt.ylabel('平均受关注程度', fontsize=16, fontweight='bold')
plt.xticks(rotation=90)
plt.tight_layout()
plt.savefig('../visualization/[10]author_books_avg_attention.png')
plt.show()
(1)统计最受关注的书籍Top 10
(2)统计书籍篇幅Top 10
(3)统计不同出版社出版的书籍数量
(4)统计不同语言的书籍数量
(5)统计最不受关注的高分书籍Top 10(评分4.5以上,评分人数1w以上,评论数200以下)
(6)统计不同年份出版的书籍数量
(7)统计不同作者的书籍的平均评分
(8)统计在最受关注的书籍Top 1000中,不同出版社出版的书籍数量
(9)统计在最受关注的书籍Top 1000中,不同语言的书籍数量
(10)统计不同作者的书籍的平均受关注程度
标签:10,统计分析,plt,num,df,books,Spark,csv,书籍 From: https://blog.csdn.net/from__2024_12_07/article/details/144469252