导入文本文档并输出在终端
# Python 3.x
# Read pinglun.txt from the working directory and echo it to the terminal.
import os

# Build the path of the text file relative to the current directory.
root_path = "./"
file_path = os.path.join(root_path, 'pinglun.txt')

try:
    # Open the text file and read its whole content as UTF-8.
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    # Show the file content.
    print(content)
except FileNotFoundError:
    print("文件不存在:", file_path)
except Exception as e:
    # Script boundary: report any other failure (decode error, permissions,
    # terminal encoding) instead of crashing with a traceback.
    print("读取文件时发生错误:", str(e))
爬取豆瓣电影评论
import re

import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent

ua = UserAgent()
headers = {"User-Agent": ua.random}

# Captures the digits of the "allstarNN rating" marker found in a comment's
# info span; printList divides that number by 10 to obtain the star count.
_RATING_PATTERN = re.compile(r'allstar(\d+) rating')


def getHTMLText(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    Args:
        url: Address of the page to download.

    Returns:
        The page text, or an empty string when the request fails (connection
        problem, timeout, invalid URL or an HTTP error status).
    """
    try:
        r = requests.get(url, headers=headers, timeout=30)
        r.raise_for_status()
        r.encoding = "utf-8"
        return r.text
    except requests.RequestException:
        # Best effort: the caller treats "" as a page without comments.
        return ""


def fillMoviedata(soup, moviedata):
    """Append the comments found in *soup* to *moviedata*.

    Args:
        soup: Parsed comment page (BeautifulSoup object).
        moviedata: List extended in place with ``[username, comment, ratings]``
            entries, where ``ratings`` is the list of digit strings matched by
            the rating pattern (empty when the comment carries no rating).
    """
    comment_infos = soup.find_all('span', 'comment-info')
    comments = soup.find_all('span', 'short')
    # zip() stops at the shorter list, so only positions where both the info
    # span and the comment text exist are processed.
    for info, comment in zip(comment_infos, comments):
        ratings = _RATING_PATTERN.findall(str(info))
        username = info.a.string if info.a else ""
        moviedata.append([username, comment.string, ratings])


def printList(moviedata, num):
    """Print at most *num* entries of *moviedata* to the terminal.

    Args:
        moviedata: List of ``[username, comment, ratings]`` entries.
        num: Maximum number of entries to print; values <= 0 print nothing.
    """
    for index, entry in enumerate(moviedata[:max(num, 0)], start=1):
        username, comment = entry[0], entry[1]
        try:
            ratings = entry[2]
            # The rating text comes from a remote page, so it is parsed with
            # int() rather than evaluated as code.
            stars = int(ratings[0]) // 10 if ratings else ""
        except (IndexError, TypeError, ValueError):
            # Malformed entry: still show the comment, just without a rating.
            print("序号: {}\n用户名: {}\n评论内容: {}\n".format(index, username, comment))
            continue
        print("序号: {}\n用户名: {}\n评论内容: {}\n评分: {}星\n".format(index, username, comment, stars))


def fetch_movie_comments(movieid, num_comments, start_page=1, limit_per_page=20, method='new_score'):
    """Download comments for a Douban movie and print them.

    Args:
        movieid: Douban subject id of the movie.
        num_comments: Number of comments to print.
        start_page: 1-based page number to start from.
        limit_per_page: Number of comments requested per page.
        method: Value of the ``sort`` query parameter.
    """
    Moviedata = []
    full_pages, remainder = divmod(num_comments, limit_per_page)
    total_pages = full_pages + (remainder > 0)
    # One spare page is allowed because a page may yield fewer usable comments
    # than requested; page numbers stay below 100 to bound the crawl.
    last_page = min(start_page + total_pages + 1, 100)
    for page in range(start_page, last_page):
        # Stop as soon as enough comments are collected so the spare page is
        # only requested when it is actually needed.
        if len(Moviedata) >= num_comments:
            break
        start = (page - 1) * limit_per_page
        url = (
            f'https://movie.douban.com/subject/{movieid}/comments'
            f'?start={start}&limit={limit_per_page}&sort={method}&status=P'
        )
        html = getHTMLText(url)
        soup = BeautifulSoup(html, 'html.parser')
        fillMoviedata(soup, Moviedata)
    printList(Moviedata, num_comments)


# Fetch and print 20 comments, starting from the first page.
fetch_movie_comments(34805219, 20, 1)
导入文件,使用 SnowNLP 进行情感分析,并绘制直方图与饼图
# Run SnowNLP sentiment analysis on every non-empty line of pinglun.txt and
# save a histogram of the scores plus a negative/positive pie chart.
import os

import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
from snownlp import SnowNLP

font_path = 'path/to/your/font.ttf'  # Replace with a font file on your system that supports Chinese.
if os.path.exists(font_path):
    fontprop = FontProperties(fname=font_path)
else:
    # Keep the name defined so later code can test it instead of hitting NameError.
    fontprop = None
    print("指定的字体文件不存在,请确认路径是否正确!")

# Set the global font before plotting.
plt.rcParams['font.sans-serif'] = ['SimHei']  # Uses SimHei if it is installed.
plt.rcParams['axes.unicode_minus'] = False  # Render the minus sign correctly.

# Read pinglun.txt from the working directory.
root_path = "./"
file_path = os.path.join(root_path, 'pinglun.txt')

try:
    with open(file_path, 'r', encoding='utf-8') as f:
        text_content = f.read()

    # Treat every non-blank line as one sentence.
    sentences = [line.strip() for line in text_content.split('\n') if line.strip()]
    if not sentences:
        # Nothing to score: a pie chart of zero counts cannot be drawn.
        raise ValueError("文件中没有可分析的文本")

    # Sentiment score of each sentence.
    sentiment_scores = [SnowNLP(sentence).sentiments for sentence in sentences]
    # Binary class per sentence (0: negative, 1: positive); scores below 0.5
    # count as negative, everything else as positive.
    binary_sentiments = [0 if score < 0.5 else 1 for score in sentiment_scores]

    # Draw and save the histogram of sentiment scores.
    fig_hist = plt.figure()
    plt.hist(sentiment_scores, bins=10, edgecolor='black')
    plt.xlabel('情感得分')
    plt.ylabel('句子数量')
    plt.title('根目录下pinglun文件的情感分析 - 直方图')
    plt.grid(True)
    plt.savefig(os.path.join(root_path, 'sentiment_histogram.png'))

    # Draw and save the pie chart. pie() needs one value per label, so the
    # per-sentence classes are aggregated into two counts first.
    labels = ['负面', '正面']
    class_counts = [binary_sentiments.count(0), binary_sentiments.count(1)]
    fig_pie = plt.figure()
    plt.pie(class_counts, labels=labels, autopct='%1.1f%%', startangle=90)
    plt.title('根目录下pinglun文件的情感分析 - 饼图')
    plt.axis('equal')  # Keep the pie circular.
    plt.savefig(os.path.join(root_path, 'sentiment_pie_chart.png'))

    # Show the charts (optional when running from a command line).
    plt.show()
except FileNotFoundError:
    print("文件不存在:", file_path)
except Exception as e:
    # Script boundary: report the failure instead of crashing with a traceback.
    print("处理文件时发生错误:", str(e))
标签:总结,plt,sentiment,python,comments,print,path,文本,page From: https://www.cnblogs.com/azwz/p/17983264