(一)中国大学排名数据分析与可视化(写到实验报告中)
【源代码程序】
import requests
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
# URL 模板,按年份爬取数据
URL_TEMPLATE = "https://www.shanghairanking.cn/rankings/bcur/{}"
# 爬取数据函数
def fetch_rankings(year):
    """Scrape up to ten ranking rows for ``year`` from the ranking page.

    Args:
        year: Value substituted into ``URL_TEMPLATE`` to build the page URL.

    Returns:
        A list of ``(rank, university, score)`` string tuples built from the
        first three ``<td>`` cells of each row. An empty list is returned
        when the request fails, the status code is not 200, or no
        ``rk-table`` table is present in the page.
    """
    url = URL_TEMPLATE.format(year)
    try:
        # A timeout keeps one unresponsive request from hanging the whole run.
        response = requests.get(url, timeout=10)
    except requests.RequestException as exc:
        print(f"Failed to retrieve data for year {year}: {exc}")
        return []
    # Check the response status code.
    if response.status_code != 200:
        print(f"Failed to retrieve data for year {year}. Status code: {response.status_code}")
        return []
    soup = BeautifulSoup(response.content, "html.parser")
    table = soup.find("table", {"class": "rk-table"})
    # Check that the ranking table was actually found.
    if table is None:
        print(f"Failed to find the ranking table for year {year}.")
        return []
    rankings = []
    # Skip the first <tr> and keep the next ten rows.
    for row in table.find_all("tr")[1:11]:
        cols = row.find_all("td")
        if len(cols) < 3:
            # Rows without three data cells cannot be unpacked; skip them
            # instead of raising IndexError.
            continue
        # NOTE(review): the third cell is assumed to hold the score —
        # confirm against the live page layout.
        rank, university, score = (cell.text.strip() for cell in cols[:3])
        rankings.append((rank, university, score))
    return rankings
# 打印排名信息
def print_rankings(rankings, year):
    """Print the rankings of one year as a fixed-width text table.

    Args:
        rankings: Sequence of ``(rank, university, score)`` tuples.
        year: Year shown in the heading (or in the "no data" message).
    """
    if rankings:
        # One shared template keeps the header and the data rows aligned.
        row_template = "{:<5} {:<20} {:<10}"
        print(f"\n{year} 年前 10 名大学排名:")
        print(row_template.format('排名', '大学', '得分'))
        print("-" * 40)
        for entry in rankings:
            position, name, points = entry
            print(row_template.format(position, name, points))
    else:
        print(f"No data available for year {year}.")
# 可视化函数
def plot_rankings(rankings_dict):
    """Draw one line per university showing its rank in each year.

    Args:
        rankings_dict: Mapping of year -> list of
            ``(rank, university, score)`` tuples. ``rank`` must be
            convertible with ``int()``.

    A university missing from a year contributes ``None`` for that year.
    Only universities whose rank in the last year is between 1 and 10 are
    given a legend label; the rest are plotted with an empty label.
    """
    # Configure a font that contains CJK glyphs.
    plt.rcParams['font.sans-serif'] = ['SimHei']  # use the SimHei font
    plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly
    years = list(rankings_dict)
    # dict.fromkeys de-duplicates while keeping first-seen order. A plain set
    # iterates strings in a per-process order, which made line colours and
    # legend order change from run to run.
    universities = list(dict.fromkeys(
        university
        for year in years
        for _, university, _ in rankings_dict[year]
    ))
    plt.figure(figsize=(10, 6))
    for university in universities:
        ranks = [
            next(
                (int(rank) for rank, uni, _ in rankings_dict[year] if uni == university),
                None,
            )
            for year in years
        ]
        last_rank = ranks[-1]
        label = university if last_rank and last_rank <= 10 else ""
        plt.plot(years, ranks, marker='o', label=label)
    # Rank 1 is the best position, so put it at the top of the axis.
    plt.gca().invert_yaxis()
    plt.xticks(years)
    plt.xlabel('年份')
    plt.ylabel('排名')
    plt.title('2015-2019年前10大学排名变化')
    plt.legend()
    plt.show()
# 查询排名信息
def query_ranking(rankings_dict):
    """Interactively look up a university's rank for a given year.

    Args:
        rankings_dict: Mapping of int year -> list of
            ``(rank, university, score)`` tuples.

    Prompts for a university name and a year, prints the matching rank (or a
    not-found message), and repeats until the user answers anything other
    than ``y`` to the continue prompt. An invalid year restarts the prompts.
    """
    while True:
        # Surrounding whitespace would otherwise make an exact-name match fail.
        university = input("请输入大学名称:").strip()
        year_text = input("请输入年份(2015-2019):").strip()
        # isdecimal() only accepts characters int() can parse; isdigit() also
        # accepts e.g. superscript digits, which made int() raise ValueError.
        if not year_text.isdecimal() or int(year_text) not in rankings_dict:
            print("年份输入有误,请重新输入。")
            continue
        year = int(year_text)
        rank_info = next(
            (rank for rank, uni, _ in rankings_dict[year] if uni == university),
            None,
        )
        if rank_info:
            print(f"{year} 年 {university} 排名:{rank_info}")
        else:
            print(f"{year} 年没有找到 {university} 的排名信息。")
        cont = input("是否继续查询?(y/n): ")
        if cont.strip().lower() != 'y':
            break
if __name__ == "__main__":
    # Scrape 2015 through 2019 one year at a time, echoing each year's table
    # as soon as it has been fetched, then chart the results and start the
    # interactive lookup.
    rankings_dict = {}
    for year in range(2015, 2020):
        yearly_rankings = fetch_rankings(year)
        rankings_dict[year] = yearly_rankings
        print_rankings(yearly_rankings, year)
    plot_rankings(rankings_dict)
    query_ranking(rankings_dict)
(二)豆瓣图书评论数据分析与可视化(写到实验报告中)
【源代码程序】
import requests
from bs4 import BeautifulSoup
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# 爬取短评数据函数
def fetch_comments(book_id, start=0, limit=20, status='P', sort_by='time'):
    """Fetch one page of short comments for a Douban book.

    Args:
        book_id: Book identifier placed in the URL path.
        start: Value of the ``start`` query parameter (offset of the page).
        limit: Value of the ``limit`` query parameter (page size).
        status: Value of the ``status`` query parameter.
        sort_by: Value of the ``sort`` query parameter.

    Returns:
        A list of dicts with keys ``username``, ``content``, ``date``,
        ``rating`` and ``votes`` (int). Comment blocks missing any of the
        expected sub-tags are skipped. An empty list is returned when the
        request fails or the status code is not 200.
    """
    url = f"https://book.douban.com/subject/{book_id}/comments/?start={start}&limit={limit}&status={status}&sort={sort_by}"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36"
    }
    try:
        # A timeout keeps one unresponsive request from hanging the whole run.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as exc:
        print(f"Failed to retrieve comments for book {book_id}: {exc}")
        return []
    if response.status_code != 200:
        print(f"Failed to retrieve comments for book {book_id}. Status code: {response.status_code}")
        return []
    soup = BeautifulSoup(response.content, "html.parser")
    comment_data = []
    for comment in soup.find_all("div", class_="comment"):
        info_tag = comment.find("span", class_="comment-info")
        user_tag = info_tag.find("a") if info_tag is not None else None
        content_tag = comment.find("span", class_="short")
        date_tag = comment.find("a", class_="comment-time")
        votes_tag = comment.find("span", class_="vote-count")
        required = (user_tag, content_tag, date_tag, votes_tag)
        if any(tag is None for tag in required):
            # Skip blocks that do not have the expected structure instead of
            # raising AttributeError on a None lookup.
            continue

        # The star level is encoded in a class such as "allstar40". Taking
        # only the first class breaks whenever another class precedes it, so
        # look for the "allstar" class explicitly.
        rating = "无评分"
        rating_tag = comment.find("span", class_="rating")
        if rating_tag is not None:
            for css_class in rating_tag.get("class", []):
                if css_class.startswith("allstar"):
                    rating = css_class[7:8] or "无评分"
                    break

        votes_text = votes_tag.text.strip()
        comment_data.append({
            "username": user_tag.text,
            "content": content_tag.text,
            "date": date_tag.text.strip(),
            "rating": rating,
            # Fall back to 0 when the vote counter is empty or non-numeric.
            "votes": int(votes_text) if votes_text.isdecimal() else 0,
        })
    return comment_data
# 爬取多页数据
def fetch_multiple_pages(book_id, start=0, limit=20, status='P',sort_by='score', num_pages=3):
    """Fetch several consecutive comment pages and concatenate them.

    Args:
        book_id: Book identifier forwarded to ``fetch_comments``.
        start: Offset of the first page.
        limit: Page size; page ``i`` starts at ``start + i * limit``.
        status: Forwarded to ``fetch_comments``.
        sort_by: Forwarded to ``fetch_comments``.
        num_pages: Number of pages to request.

    Returns:
        One list holding the comments of all pages, in page order.
    """
    offsets = (start + index * limit for index in range(num_pages))
    merged = []
    for offset in offsets:
        merged += fetch_comments(
            book_id,
            start=offset,
            limit=limit,
            status=status,
            sort_by=sort_by,
        )
    return merged
# 输出前10条短评信息
def print_top_comments(comments, top_n=10):
    """Print the first ``top_n`` comments, two lines per comment.

    Args:
        comments: List of dicts with keys ``username``, ``date``, ``rating``,
            ``votes`` and ``content``.
        top_n: Maximum number of comments to print.
    """
    position = 0
    for entry in comments[:top_n]:
        position += 1
        summary = f"{position}. 用户名: {entry['username']}, 评论时间: {entry['date']}, 评分: {entry['rating']}, 点赞数: {entry['votes']}"
        print(summary)
        print(f" 短评: {entry['content']}")
# 按照点赞数排序并输出前10条短评信息
def print_top_comments_by_votes(comments, top_n=10):
    """Print the ``top_n`` comments with the highest ``votes`` value.

    Args:
        comments: Iterable of comment dicts, each with a ``votes`` key.
        top_n: Maximum number of comments to print.

    The input is not modified; a sorted copy is handed to
    ``print_top_comments``.
    """
    ranked = list(comments)
    ranked.sort(key=lambda entry: entry['votes'], reverse=True)
    print_top_comments(ranked, top_n)
# 文本分析与词云生成
def generate_wordcloud(comments):
    """Show a word cloud of the comment texts and print the ten top words.

    Args:
        comments: Iterable of comment dicts, each with a ``content`` key.

    The joined comment text is segmented with jieba. If nothing but
    whitespace remains, a message is printed and the function returns.
    Otherwise the word cloud is displayed, and afterwards the ten most
    frequent tokens longer than one character are printed.
    """
    from collections import Counter

    text = " ".join(comment['content'] for comment in comments)
    # Segment once and reuse the tokens for both the cloud and the frequency
    # table; the text used to be segmented twice.
    words_list = jieba.lcut(text)
    words = " ".join(words_list)
    if not words.strip():
        print("没有足够的评论内容生成词云。")
        return
    wordcloud = WordCloud(font_path='msyh.ttc', width=800, height=400, background_color='white').generate(words)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()
    # Word-frequency statistics: single-character tokens are ignored.
    word_freq = Counter(word for word in words_list if len(word) > 1)
    print("前10位词频统计结果:")
    # most_common keeps first-seen order among equal counts, matching the
    # previous stable sort.
    for word, freq in word_freq.most_common(10):
        print(f"{word}: {freq}")
def total(book_id,sort_by):
    """Fetch, print and analyse the short comments of one book.

    Args:
        book_id: Book identifier forwarded to ``fetch_multiple_pages``.
        sort_by: ``'time'`` selects the "latest" heading; any other value
            selects the "popular" heading. The value is also forwarded to
            ``fetch_multiple_pages``.

    For each reading status (P, N, F) three pages of comments are fetched.
    The function prints the book title, the first ten comments per status,
    the ten most-voted comments across all statuses, and finally calls
    ``generate_wordcloud`` on the merged comments.
    """
    start = 0  # starting offset
    limit = 20  # comments per page
    # Reading status codes and the label printed for each.
    status_labels = {'P': '读过', 'N': '在读', 'F': '想读'}
    # Fetch and store the comments for every status.
    all_comments = {
        status: fetch_multiple_pages(
            book_id, start=start, limit=limit, status=status, sort_by=sort_by, num_pages=3
        )
        for status in status_labels
    }
    # NOTE(review): the original check used '36781566', but the script's
    # entry point passes '36701566', so the first title was never printed.
    # Both spellings are accepted here — confirm which ID is the real one.
    if book_id in ('36781566', '36701566'):
        print("《叙事本能》")
    else:
        print("《暗处的女儿》")
    if sort_by == 'time':
        print("最新排序前10位短评信息:")
    else:
        print("热门排序前10位短评信息:")
    # Print the first ten comments for every status.
    for status, comments in all_comments.items():
        print(f"读书状态:{status_labels[status]}")
        print_top_comments(comments)
        print()
    all_comments_merged = [
        comment for comments in all_comments.values() for comment in comments
    ]
    # This heading used to be printed with nothing after it; the vote-sorted
    # listing is now actually produced.
    print("\n按点赞数排序前10位短评信息:")
    print_top_comments_by_votes(all_comments_merged)
    # Text analysis and word cloud.
    generate_wordcloud(all_comments_merged)
if __name__ == "__main__":
    books_id = ["36701566","36721763"]
    sorts = ['time','score']  # 'time' = latest first, 'score' = most popular first
    # Scrape and report every book under both sort orders.
    for book_id in books_id:
        for sort in sorts:
            total(book_id,sort)
    # The trailing "热门排序前10位短评信息" print was removed: total() already
    # prints the heading for each sort order, and nothing followed this one.
(三)函数图形1绘制(写到实验报告中)
【源代码程序】
import matplotlib.pyplot as plt
import numpy as np
# Sample [0, 10) in steps of 0.0001 and build the three curves:
# y1 = x^2, y2 = cos(2x), y3 = x^2 * cos(2x).
x = np.arange(0, 10, 0.0001)
y1 = x ** 2
y2 = np.cos(2 * x)
y3 = y1 * y2
# Draw all three curves on one axes, each with its own line style.
for curve, dash_style in ((y1, '-.'), (y2, ':'), (y3, '--')):
    plt.plot(x, curve, linestyle=dash_style)
plt.savefig("3-1.png")
plt.show()
import matplotlib.pyplot as plt
import numpy as np
# Draw the three curves in a 2x2 grid, filling the axes row by row; the
# fourth (bottom-right) axes stays empty.
fig, subs = plt.subplots(2, 2)
for axes, curve in zip(subs.flat, (y1, y2, y3)):
    axes.plot(x, curve)
plt.savefig("3-2.png")
plt.show()
(四)函数图形2绘制(写到实验报告中)
【源代码程序】
import matplotlib.pyplot as plt
import numpy as np
# Heart outline on [-2, 2): y1 is the upper boundary, y2 the lower one.
x = np.arange(-2, 2, 0.0001)
x_squared = x ** 2
y1 = np.sqrt(2 * np.sqrt(x_squared) - x_squared)
root_abs_x = np.sqrt(np.abs(x))
y2 = (-2.14) * np.sqrt(np.sqrt(2) - root_abs_x)
# Both boundaries are drawn in red and the area between them is filled pink.
plt.plot(x, y1, 'r')
plt.plot(x, y2, 'r')
plt.fill_between(x, y1, y2, facecolor='pink')
plt.savefig("heart.png")
plt.show()
标签:comment,plt,训练,Python,rankings,comments,print,year,数据处理 From: https://www.cnblogs.com/yuanxinglan/p/18214553