Python数据处理训练

标签：comment plt 训练 Python rankings comments print year 数据处理

（一）、中国大学排名数据分析与可视化；（写到实验报告中）

【源代码程序】

import requests

from bs4 import BeautifulSoup

import matplotlib.pyplot as plt

# URL template; the year to scrape is filled into the "{}" placeholder.

URL_TEMPLATE = "https://www.shanghairanking.cn/rankings/bcur/{}"

# 爬取数据函数

def fetch_rankings(year, timeout=10):
    """Fetch up to ten ranking rows for ``year`` from the ranking page.

    Args:
        year: Value substituted into ``URL_TEMPLATE`` to build the page URL.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list of ``(rank, university, score)`` tuples of stripped strings,
        taken from the first three ``<td>`` cells of each kept row. An empty
        list is returned when the request fails, the status code is not 200,
        or the page contains no ``<table class="rk-table">``.
    """
    url = URL_TEMPLATE.format(year)

    # A request without a timeout can block forever on a stalled connection;
    # connection-level errors are reported like a bad status code instead of
    # aborting the whole multi-year loop in the caller.
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to retrieve data for year {year}. Error: {exc}")
        return []

    if response.status_code != 200:
        print(f"Failed to retrieve data for year {year}. Status code: {response.status_code}")
        return []

    soup = BeautifulSoup(response.content, "html.parser")
    table = soup.find("table", {"class": "rk-table"})

    # find() returns None when nothing matches; compare explicitly rather
    # than relying on the truthiness of a Tag object.
    if table is None:
        print(f"Failed to find the ranking table for year {year}.")
        return []

    rankings = []
    # Row 0 is skipped (presumably the header row - confirm against the page)
    # and at most the next ten rows are kept.
    for row in table.find_all("tr")[1:11]:
        cols = row.find_all("td")
        if len(cols) < 3:
            # Rows without three data cells cannot be unpacked; skip them
            # instead of raising IndexError.
            continue
        rank, university, score = (col.text.strip() for col in cols[:3])
        rankings.append((rank, university, score))

    return rankings

# 打印排名信息

def print_rankings(rankings, year):
    """Print the ranking rows for one year as a left-aligned text table.

    Args:
        rankings: Iterable of ``(rank, university, score)`` string tuples.
        year: Year shown in the heading (or in the "no data" message).

    Returns:
        None. When ``rankings`` is empty only a "no data" line is printed.
    """
    if not rankings:
        print(f"No data available for year {year}.")
        return

    print(f"\n{year} 年前 10 名大学排名：")
    print(f"{'排名':<5} {'大学':<20} {'得分':<10}")
    print("-" * 40)
    for rank, university, score in rankings:
        print(f"{rank:<5} {university:<20} {score:<10}")

# 可视化函数

def plot_rankings(rankings_dict):
    """Draw one line per university showing its rank in each year.

    Args:
        rankings_dict: Mapping of year -> list of
            ``(rank, university, score)`` tuples. ``rank`` must be
            convertible with ``int()``.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Use the SimHei font for text; the axis labels and title are Chinese.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    # Setting kept from the original to work around minus-sign rendering.
    plt.rcParams['axes.unicode_minus'] = False

    years = list(rankings_dict)

    # Collect university names in first-seen order. A plain set would make
    # line colours and legend order change from run to run because string
    # hashing is randomised per process.
    universities = list(dict.fromkeys(
        university
        for year in years
        for _, university, _ in rankings_dict[year]
    ))

    plt.figure(figsize=(10, 6))

    for university in universities:
        ranks = []
        for year in years:
            # None marks a year in which this university is not in the data.
            rank = next(
                (int(rank) for rank, uni, _ in rankings_dict[year] if uni == university),
                None,
            )
            ranks.append(rank)

        # Only universities ranked within 10 in the last year get a label.
        label = university if ranks[-1] and ranks[-1] <= 10 else ""
        plt.plot(years, ranks, marker='o', label=label)

    # Rank 1 is the best, so flip the axis to put it at the top.
    plt.gca().invert_yaxis()
    plt.xticks(years)
    plt.xlabel('年份')
    plt.ylabel('排名')
    plt.title('2015-2019年前10大学排名变化')
    plt.legend()
    plt.show()

# 查询排名信息

def query_ranking(rankings_dict):
    """Interactively look up a university's rank for a given year.

    Repeatedly prompts for a university name and a year, prints the matching
    rank (or a "not found" message), and asks whether to continue. Any answer
    other than ``y``/``Y`` ends the loop. An invalid year restarts the prompts.

    Args:
        rankings_dict: Mapping of int year -> list of
            ``(rank, university, score)`` tuples.

    Returns:
        None.
    """
    while True:
        university = input("请输入大学名称：").strip()
        year_text = input("请输入年份（2015-2019）：").strip()

        # str.isdigit() accepts characters such as "²" that int() rejects,
        # which crashed the loop with ValueError. Parse with int() and treat
        # a failure as invalid input instead.
        try:
            year = int(year_text)
        except ValueError:
            year = None

        if year is None or year not in rankings_dict:
            print("年份输入有误，请重新输入。")
            continue

        rank_info = next(
            (rank for rank, uni, _ in rankings_dict[year] if uni == university),
            None,
        )

        if rank_info is not None:
            print(f"{year} 年 {university} 排名：{rank_info}")
        else:
            print(f"{year} 年没有找到 {university} 的排名信息。")

        cont = input("是否继续查询？(y/n): ")
        if cont.lower() != 'y':
            break

if __name__ == "__main__":
    # Scrape and print each year in turn, keeping the results keyed by year.
    rankings_dict = {}
    for year in range(2015, 2019 + 1):
        rankings_dict[year] = fetch_rankings(year)
        print_rankings(rankings_dict[year], year)

    # The chart and the interactive lookup need every year's data, so they
    # run once after the loop has finished.
    plot_rankings(rankings_dict)
    query_ranking(rankings_dict)

（二）、豆瓣图书评论数据分析与可视化；（写到实验报告中）

【源代码程序】

import requests

from bs4 import BeautifulSoup

import jieba

from wordcloud import WordCloud

import matplotlib.pyplot as plt

# 爬取短评数据函数

def fetch_comments(book_id, start=0, limit=20, status='P', sort_by='time', timeout=10):
    """Fetch one page of short comments for a Douban book.

    Args:
        book_id: Subject id placed in the URL path.
        start: Value of the ``start`` query parameter (offset of the page).
        limit: Value of the ``limit`` query parameter (comments per page).
        status: Value of the ``status`` query parameter.
        sort_by: Value of the ``sort`` query parameter.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list of dicts with keys ``username``, ``content``, ``date``,
        ``rating`` (a one-character string, or ``"无评分"`` when no rating is
        found) and ``votes`` (int). Returns an empty list when the request
        fails. Comment nodes missing the user link, text or time are skipped.
    """
    url = f"https://book.douban.com/subject/{book_id}/comments/"
    params = {"start": start, "limit": limit, "status": status, "sort": sort_by}
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36"
    }

    # Bound the wait and turn transport/HTTP errors into an empty page so a
    # single failed request does not abort a multi-page crawl.
    try:
        response = requests.get(url, params=params, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"Failed to retrieve comments for book {book_id}: {exc}")
        return []

    soup = BeautifulSoup(response.content, "html.parser")

    comment_data = []
    for comment in soup.find_all("div", class_="comment"):
        info = comment.find("span", class_="comment-info")
        user_link = info.find("a") if info is not None else None
        short = comment.find("span", class_="short")
        time_tag = comment.find("a", class_="comment-time")
        if user_link is None or short is None or time_tag is None:
            # find() returns None for missing elements; chaining .text on it
            # would raise AttributeError, so incomplete nodes are skipped.
            continue

        # The star count is read from a class named like "allstar40".
        # Search the whole class list instead of assuming it is first.
        # NOTE(review): confirm class names against the current page markup.
        rating = "无评分"
        rating_tag = comment.find("span", class_="rating")
        if rating_tag is not None:
            star_class = next(
                (cls for cls in rating_tag.get("class", []) if cls.startswith("allstar")),
                None,
            )
            if star_class is not None:
                # Character 7 is the first one after the "allstar" prefix.
                rating = star_class[7:8] or "无评分"

        vote_tag = comment.find("span", class_="vote-count")
        try:
            votes = int(vote_tag.text) if vote_tag is not None else 0
        except ValueError:
            votes = 0

        comment_data.append({
            "username": user_link.text,
            "content": short.text,
            "date": time_tag.text.strip(),
            "rating": rating,
            "votes": votes,
        })

    return comment_data

# 爬取多页数据

def fetch_multiple_pages(book_id, start=0, limit=20, status='P', sort_by='score', num_pages=3):
    """Fetch several consecutive comment pages and concatenate them.

    Page ``k`` (0-based) is requested with offset ``start + k * limit``.
    Note that ``sort_by`` defaults to ``'score'`` here.

    Args:
        book_id: Subject id passed through to ``fetch_comments``.
        start: Offset of the first page.
        limit: Page size, also used as the offset step between pages.
        status: Passed through to ``fetch_comments``.
        sort_by: Passed through to ``fetch_comments``.
        num_pages: Number of pages to request; values <= 0 request nothing.

    Returns:
        A single list with the comments of all pages, in request order.
    """
    all_comments = []
    for page in range(num_pages):
        start_page = start + page * limit
        comments = fetch_comments(
            book_id,
            start=start_page,
            limit=limit,
            status=status,
            sort_by=sort_by,
        )
        all_comments.extend(comments)
    return all_comments

# 输出前10条短评信息

def print_top_comments(comments, top_n=10):
    """Print the first ``top_n`` comments, numbered from 1.

    Args:
        comments: Sequence of dicts with keys ``username``, ``date``,
            ``rating``, ``votes`` and ``content``.
        top_n: Maximum number of comments to print.

    Returns:
        None. Two lines are printed per comment: metadata, then the text.
    """
    for i, comment in enumerate(comments[:top_n], 1):
        print(
            f"{i}. 用户名: {comment['username']}, 评论时间: {comment['date']}, "
            f"评分: {comment['rating']}, 点赞数: {comment['votes']}"
        )
        print(f" 短评: {comment['content']}")

# 按照点赞数排序并输出前10条短评信息

def print_top_comments_by_votes(comments, top_n=10):
    """Print the ``top_n`` comments with the highest ``votes`` value.

    The input list is not modified; a sorted copy (descending by ``votes``)
    is handed to ``print_top_comments``.

    Args:
        comments: Sequence of comment dicts, each with a ``votes`` key.
        top_n: Maximum number of comments to print.
    """
    sorted_comments = sorted(comments, key=lambda x: x['votes'], reverse=True)
    print_top_comments(sorted_comments, top_n)

# 文本分析与词云生成

def generate_wordcloud(comments):
    """Show a word cloud of the comment texts and print the top-10 words.

    Args:
        comments: Sequence of comment dicts, each with a ``content`` key.

    Returns:
        None. If the combined text is empty or whitespace-only, a message is
        printed and nothing else happens.
    """
    from collections import Counter

    text = " ".join(comment['content'] for comment in comments)

    # Nothing to tokenise: bail out before doing any segmentation work.
    if not text.strip():
        print("没有足够的评论内容生成词云。")
        return

    # Segment once and reuse the tokens for both the cloud and the counts.
    tokens = jieba.lcut(text)
    words = " ".join(tokens)

    # NOTE(review): 'msyh.ttc' must be resolvable on the machine running this;
    # presumably chosen so Chinese glyphs render - confirm on non-Windows hosts.
    wordcloud = WordCloud(font_path='msyh.ttc', width=800, height=400, background_color='white').generate(words)

    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()

    # Word frequencies; single-character tokens are ignored.
    word_freq = Counter(token for token in tokens if len(token) > 1)

    print("前10位词频统计结果：")
    for word, freq in word_freq.most_common(10):
        print(f"{word}: {freq}")

def total(book_id, sort_by):
    """Crawl, print and analyse the comments of one book for one sort order.

    For each reading status (P, N, F) three pages of 20 comments are fetched.
    The first ten comments per status are printed, then the ten most-voted
    comments across all statuses, and finally the word cloud and word
    frequencies are produced from all fetched comments.

    Args:
        book_id: Douban subject id as a string.
        sort_by: ``'time'`` selects the "latest" heading; any other value
            selects the "popular" heading. Also passed to the fetcher.

    Returns:
        None.
    """
    start = 0   # offset of the first page
    limit = 20  # comments per page

    # Reading status codes and their display labels:
    # P = read, N = reading, F = want to read.
    status_labels = {'P': '读过', 'N': '在读', 'F': '想读'}

    # Fetch the comments for every status, keyed by status code.
    all_comments = {}
    for status in status_labels:
        all_comments[status] = fetch_multiple_pages(
            book_id,
            start=start,
            limit=limit,
            status=status,
            sort_by=sort_by,
            num_pages=3,
        )

    # The original compared only against '36781566', but the script's entry
    # point passes '36701566', so this branch could never be taken and both
    # books were printed with the second title. Both spellings are accepted.
    # NOTE(review): confirm which of the two ids is the real subject id.
    if book_id in ('36781566', '36701566'):
        print("《叙事本能》")
    else:
        print("《暗处的女儿》")

    if sort_by == 'time':
        print("最新排序前10位短评信息：")
    else:
        print("热门排序前10位短评信息：")

    # First ten comments for each reading status.
    for status, comments in all_comments.items():
        print(f"读书状态:{status_labels[status]}")
        print_top_comments(comments)
        print()

    # Flatten with a comprehension; sum(lists, []) copies the accumulated
    # list on every step.
    all_comments_merged = [
        comment for comments in all_comments.values() for comment in comments
    ]

    # The heading was previously printed without any ranking beneath it;
    # actually print the most-voted comments it announces.
    print("\n按点赞数排序前10位短评信息：")
    print_top_comments_by_votes(all_comments_merged)

    # Text analysis and word cloud over everything that was fetched.
    generate_wordcloud(all_comments_merged)

if __name__ == "__main__":
    books_id = ["36701566", "36721763"]
    sorts = ['time', 'score']  # 'time' = latest, 'score' = popular

    # Run the full crawl/report once per (book, sort order) combination.
    for book_id in books_id:
        for sort in sorts:
            total(book_id, sort)

    # NOTE(review): nothing is printed after this heading and total() already
    # prints the same heading itself; kept as in the original, confirm
    # whether it can be dropped.
    print("\n热门排序前10位短评信息：")

（三）、函数图形1绘制；（写到实验报告中）

【源代码程序】

import matplotlib.pyplot as plt

import numpy as np

# Sample points and the three curves. These module-level names are reused by
# the subplot figure that follows in this script.
x = np.arange(0, 10, 0.0001)
y1 = x ** 2
y2 = np.cos(x * 2)
y3 = y1 * y2

# Draw the three curves on the same axes, each with its own dash pattern.
for curve, dash in ((y1, '-.'), (y2, ':'), (y3, '--')):
    plt.plot(x, curve, linestyle=dash)

# Save before show(): the file is written from the still-open figure.
plt.savefig("3-1.png")
plt.show()

import matplotlib.pyplot as plt

import numpy as np

# 2x2 grid of axes. Walking the grid in row-major order, the first three
# axes each get one curve and the fourth is left empty.
fig, subs = plt.subplots(2, 2)
for axes, curve in zip(subs.flat, (y1, y2, y3)):
    axes.plot(x, curve)

plt.savefig("3-2.png")
plt.show()

（四）、函数图形2绘制；（写到实验报告中）

【源代码程序】

import matplotlib.pyplot as plt

import numpy as np

# Sample x from -2 towards 2 in steps of 0.0001.
x = np.arange(-2, 2, 0.0001)

# Upper boundary (non-negative values).
y1 = np.sqrt(2 * np.sqrt(x ** 2) - x ** 2)
# Lower boundary (non-positive values).
y2 = -2.14 * np.sqrt(np.sqrt(2) - np.sqrt(np.abs(x)))

# Outline both boundaries in red, then shade the region between them.
for boundary in (y1, y2):
    plt.plot(x, boundary, 'r')
plt.fill_between(x, y1, y2, facecolor='pink')

plt.savefig("heart.png")
plt.show()

标签：comment,plt,训练,Python,rankings,comments,print,year,数据处理
From： https://www.cnblogs.com/yuanxinglan/p/18214553

相关文章

赞助商

阅读排行