首页 > 其他分享 >6.5博客

6.5博客

时间:2024-06-16 20:33:57浏览次数:12  
标签:comment regex 博客 re 6.5 words print line

python和工程数学俩实验真累啊

python学习:

import re

from collections import Counter

import requests

from lxml import etree

import pandas as pd

import jieba

import matplotlib.pyplot as plt

from wordcloud import WordCloud

# HTTP headers passed to requests.get() by getComments; the User-Agent
# string identifies the client as a desktop Edge/Chrome browser.
headers = {

"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36 Edg/101.0.1210.39"

}

# Rows appended by getComments, one per scraped comment:
# [user name, comment text, comment time, score, vote count].
comments = []

# Cleaned word tokens (length >= 2 after regex_change) appended by getComments.
words = []

# The patterns are compiled once at import time because regex_change is called
# for every token of every comment.

# Single characters dropped outright: newline, quotes, separators, a few
# punctuation marks, the ASCII space and the stop characters 的 / 了 / 是.
# NOTE(review): the doubled "," and ";" look like full-width punctuation that
# was normalised to ASCII when the code was copied -- confirm with the author.
_NOISE_CHARS_RE = re.compile("[\n”“|,,;;''/?! 。的了是]")

# Leading "<digits>::" prefix at the very start of the text.
_USERNAME_RE = re.compile(r"^\d+::")

# URLs. [a-zA-Z0-9] is used instead of \w so Chinese characters are not eaten.
_URL_RE = re.compile(r"""
    (https?://)?
    ([a-zA-Z0-9]+)
    (\.[a-zA-Z0-9]+)
    (\.[a-zA-Z0-9]+)*
    (/[a-zA-Z0-9]+)*
""", re.VERBOSE | re.IGNORECASE)

# Date markers: year / month / day characters and the weekday names
# Monday through Sunday.
_DATE_RE = re.compile(r"[年月日]|周[一二三四五六日]")

# Runs of digits that do not directly follow an ASCII letter or digit.
# The look-behind keeps identifiers such as "python3" intact, still matches
# digits at the start of the string, and consumes only the digits themselves.
_NUMBER_RE = re.compile(r"(?<![a-zA-Z0-9])\d+")

# Any remaining whitespace (tabs, full-width spaces, ...).
_SPACE_RE = re.compile(r"\s+")


def regex_change(line):
    """Strip noise from a piece of comment text and return what is left.

    The cleaning steps run in this order:

    1. remove the single noise characters (punctuation, space, 的/了/是);
    2. remove a leading ``<digits>::`` prefix;
    3. remove URL-like substrings;
    4. remove date markers (年, 月, 日, 周一 ... 周日);
    5. remove digit runs that are not attached to a preceding ASCII
       letter or digit;
    6. remove all remaining whitespace.

    Args:
        line: The text (typically a single token) to clean.

    Returns:
        The cleaned string; it may be empty.
    """
    line = _NOISE_CHARS_RE.sub("", line)
    line = _USERNAME_RE.sub("", line)
    line = _URL_RE.sub("", line)
    line = _DATE_RE.sub("", line)
    line = _NUMBER_RE.sub("", line)
    return _SPACE_RE.sub("", line)

# Douban's textual rating labels mapped to a 1-5 score.
_RATING_SCORES = {"力荐": 5, "推荐": 4, "还行": 3, "较差": 2, "很差": 1}


def getComments(url):
    """Fetch one page of short comments and record them in the module lists.

    For every ``div.comment`` node on the page a row
    ``[user name, comment text, comment time, score, vote count]`` is appended
    to the module-level ``comments`` list. The comment text is segmented with
    jieba, and every token that is still at least two characters long after
    ``regex_change`` is appended to the module-level ``words`` list.

    Comment nodes that lack a user name, time, text or vote count are skipped.

    Args:
        url: Address of the comment page to download.

    Returns:
        None. Results are accumulated in ``comments`` and ``words``.

    Raises:
        requests.RequestException: If the page cannot be downloaded, including
            when the request times out.
        ValueError: If a vote count is not an integer.
    """
    # A timeout keeps the script from hanging forever on a stalled connection.
    resp = requests.get(url, headers=headers, timeout=10).text
    html = etree.HTML(resp)
    if html is None:
        # Nothing parseable came back, so there are no comments to record.
        return

    for comment in html.xpath(".//div[@class='comment']"):
        # The <a> texts inside comment-info are read as [user name, comment time].
        info_texts = comment.xpath(".//span[@class='comment-info']/a/text()")
        short_texts = comment.xpath(
            ".//p[@class='comment-content']/span[@class='short']/text()")
        vote_texts = comment.xpath(
            ".//span[@class='comment-vote']/span[@class='vote-count']/text()")
        if len(info_texts) < 2 or not short_texts or not vote_texts:
            continue

        name, comment_time = info_texts[0], info_texts[1]
        content = str(short_texts[0]).strip()

        # The rating label is the title of the span inside comment-info. A
        # missing or unknown label scores 0, so no score carries over from the
        # previous comment.
        rating_titles = comment.xpath(".//span[@class='comment-info']/span/@title")
        status = str(rating_titles[-1]) if rating_titles else ""
        score = _RATING_SCORES.get(status, 0)

        comments.append(
            [str(name), content, str(comment_time), score, int(vote_texts[0])])

        for token in jieba.cut(content, cut_all=False, HMM=False):
            cleaned = regex_change(token)  # clean once, then test and store
            if len(cleaned) >= 2:
                words.append(cleaned)

def getWordCloud(words):
    """Print the ten most frequent words and render a word cloud from them.

    The word cloud is saved to ``wordcloud.png`` in the working directory and
    then displayed with matplotlib. If ``words`` is empty a message is printed
    and nothing is rendered.

    Args:
        words: List of word strings; repeated words count towards frequency.

    Returns:
        None.
    """
    if not words:
        # WordCloud.generate() cannot build an image from empty text.
        print("没有可用于生成词云的词语")
        return

    print("热词前10位:")
    # most_common(10) returns at most ten (word, count) pairs, so a short
    # vocabulary does not cause an IndexError.
    for entry in Counter(words).most_common(10):
        print(entry)

    text = ' '.join(words)
    w = WordCloud(background_color='white',
                  width=1000,
                  height=700,
                  font_path='simhei.ttf',
                  margin=10).generate(text)

    # Save first so the image exists even if the viewer window is never closed.
    w.to_file('wordcloud.png')
    # The image must be drawn before show(), otherwise an empty figure appears.
    plt.imshow(w)
    plt.show()

# Menu option -> value of the ``sort`` query parameter on the comment page
# (1: hot comments, 2: latest comments).
_SORT_BY_OPTION = {1: "new_score", 2: "time"}

print("请选择以下选项:")
print(" 1.热门评论")
print(" 2.最新评论")

# Treat non-numeric input as an invalid option instead of crashing.
try:
    info = int(input())
except ValueError:
    info = None

title = ['用户名', '短评内容', '评论时间', '评分', '点赞数']

sort_key = _SORT_BY_OPTION.get(info)
if sort_key is None:
    print("无效选项,请输入 1 或 2")
else:
    print("前10位短评信息:")

    # Start from empty result lists; getComments appends to these globals.
    comments = []
    words = []

    # First three pages, 20 comments per page.
    for i in range(0, 60, 20):
        url = ("https://book.douban.com/subject/10517238/comments/"
               "?start={}&limit=20&status=P&sort={}").format(i, sort_key)
        getComments(url)

    df = pd.DataFrame(comments, columns=title)
    print(df.head(10))

    print("点赞数前10位的短评信息:")
    df = df.sort_values(by='点赞数', ascending=False)
    print(df.head(10))

    getWordCloud(words)

标签:comment,regex,博客,re,6.5,words,print,line
From: https://www.cnblogs.com/zjm921/p/18251199

相关文章

  • 6.10博客
    放松也放松完了快考试了进入状态了该学习内容:安卓<?xmlversion="1.0"encoding="utf-8"?><RelativeLayout    xmlns:android="http://schemas.android.com/apk/res/android"    xmlns:tools="http://schemas.android.com/tools" ......
  • 5.18博客
    上午数据库下午计网要是工程数学老师有数据库老师的松弛感就好了学习内容:安卓<RelativeLayoutxmlns:android="http://schemas.android.com/apk/res/android"  android:layout_width="match_parent"  android:layout_height="50dp"  android:background="#aaa......
  • 5.21博客
    又是周一!课这么多spring-boot就先放一放学习内容:安卓packagecom.example.app_02.database;importandroid.util.Log;importjava.sql.Connection;importjava.sql.DriverManager;importjava.sql.PreparedStatement;importjava.sql.ResultSet;publicclassMySQLCon......
  • 5.22博客
    还是上午计网下午建民老师的课,晚上又是跑步,今天跑得慢了点但是足足有七公里学习内容:安卓packagecom.example.app_02.entity;publicclassRecord{  privateStringstartTime;  privateStringendTime;  privateStringrecord;  publicRecord(){......
  • 5.23博客
    python和工程数学学习内容:下午没课所以可以看一下spring-boot自动配置原理安卓:packagecom.example.app_02.entity;publicclassUser{  privateStringusername;  privateStringpassword;  publicUser(){  }  publicUser(Stringusernam......
  • 5.24博客
    体育比赛也是完美收工,全胜学习内容:spring-boot自定义stater安卓:packagecom.example.app_02.entity;publicclassUserInfo{  privateStringstudentid;  privateStringname;  privateStringphonenumber;  privateStringclassname;  publ......
  • 5.25博客
    满满一周的课终于结束,奖励自己一杯奶茶吧学习内容:安卓packagecom.example.app_02.utils;importcom.example.app_02.R;importcom.example.app_02.database.MySQLConnection;importcom.example.app_02.entity.Record;importcom.example.app_02.entity.User;importja......
  • 5.28博客
    又是周一,那些杀不死我的还不如直接杀死我学习内容:安卓packagecom.example.app_02.utils;importcom.example.app_02.database.MySQLConnection;importcom.example.app_02.entity.User;importcom.example.app_02.entity.UserInfo;publicclassUserDaoextendsMySQLCo......
  • 5.29博客
    上午计网下午建民老师的课学习内容:安卓packagecom.example.app_02.utils;importcom.example.app_02.database.MySQLConnection;importcom.example.app_02.entity.User;importcom.example.app_02.entity.UserInfo;publicclassUserInfoDaoextendsMySQLConnection{......
  • 5.30博客
    周三:python和工程数学python学习:importrequestsfrombs4importBeautifulSoupasbsimportpandasaspdfrommatplotlibimportpyplotaspltdefget_rank(url):count=0 rank=\[\] headers={    "user-agent":"Mozilla/5.0(WindowsNT10......