首页 > 其他分享 >5.18

5.18

时间:2024-06-06 22:22:06浏览次数:7  
标签:comment regex re words print 5.18 line

图书评论爬取

import re

from collections import Counter

 

import requests

from lxml import etree

import pandas as pd

import jieba

import matplotlib.pyplot as plt

from wordcloud import WordCloud

 

# HTTP headers sent with every request made by getComments; the User-Agent
# mimics a desktop browser (presumably to avoid the site rejecting the
# default client string -- TODO confirm).
headers = {

    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36 Edg/101.0.1210.39"

}

 

# Rows appended by getComments: [user name, comment text, time, score, votes].
comments = []

# Cleaned tokens (length >= 2) appended by getComments.
words = []

 

 

# Patterns are compiled once at import time; regex_change is called for every
# token of every comment, so rebuilding them per call is wasted work.

# Leading "<digits>::" prefix.
_USERNAME_REGEX = re.compile(r"^\d+::")

# URL-like runs. [a-zA-Z0-9] is used instead of \w so that Chinese characters
# are never swallowed as part of a URL.
_URL_REGEX = re.compile(r"""
    (https?://)?
    ([a-zA-Z0-9]+)
    (\.[a-zA-Z0-9]+)
    (\.[a-zA-Z0-9]+)*
    (/[a-zA-Z0-9]+)*
""", re.VERBOSE | re.IGNORECASE)

# Date markers: year / month / day characters and Monday..Saturday.
_DATE_REGEX = re.compile(r"""
    年 |
    月 |
    日 |
    (周一) |
    (周二) |
    (周三) |
    (周四) |
    (周五) |
    (周六)
""", re.VERBOSE)

# Standalone digit runs. The lookbehind keeps digits that are glued to a Latin
# letter or belong to a longer alphanumeric token (e.g. "python3") and, unlike
# a consuming character class, never deletes the character before the number.
_DECIMAL_REGEX = re.compile(r"(?<![a-zA-Z0-9])\d+")

# Any run of whitespace.
_SPACE_REGEX = re.compile(r"\s+")

# Single characters that are always dropped: newline, quotes, separators,
# punctuation, the space, and the particles 的 / 了 / 是.
_STRIP_CHARS_REGEX = re.compile("[\n”“|,,;;''/?! 。的了是]")


def regex_change(line):
    """Return *line* with URLs, noise characters, date markers and numbers removed.

    Cleaning steps, in order:

    1. URL-like runs are removed first, while "/" and "?" are still present,
       so a whole URL is matched instead of leaving fragments behind.
    2. The characters listed in ``_STRIP_CHARS_REGEX`` are removed.
    3. A leading ``<digits>::`` prefix is removed.
    4. Date markers (年, 月, 日, 周一 .. 周六) are removed.
    5. Standalone digit runs are removed.
    6. Any remaining whitespace is removed.

    Args:
        line: Text to clean (a whole line or a single token).

    Returns:
        The cleaned string; may be empty.
    """
    line = _URL_REGEX.sub("", line)
    line = _STRIP_CHARS_REGEX.sub("", line)
    line = _USERNAME_REGEX.sub("", line)
    line = _DATE_REGEX.sub("", line)
    line = _DECIMAL_REGEX.sub("", line)
    line = _SPACE_REGEX.sub("", line)
    return line

 

 

# Rating label shown on the page -> numeric score.
_SCORE_BY_TITLE = {
    "力荐": 5,
    "推荐": 4,
    "还行": 3,
    "较差": 2,
    "很差": 1,
}


def getComments(url):
    """Fetch one page of short comments and record them in the module globals.

    For every ``div.comment`` element on the page a row
    ``[user name, comment text, time, score, votes]`` is appended to the
    global ``comments`` list. The comment text is segmented with jieba, each
    token is cleaned with ``regex_change``, and cleaned tokens of length >= 2
    are appended to the global ``words`` list.

    Args:
        url: Address of the comment page to download.

    Returns:
        None. Results are accumulated in ``comments`` and ``words``.
    """
    # A timeout keeps the script from hanging forever on a stalled connection.
    resp = requests.get(url, headers=headers, timeout=10).text
    html = etree.HTML(resp)
    if html is None:
        # Nothing parseable came back (e.g. an empty response body).
        return

    for comment in html.xpath(".//div[@class='comment']"):
        # The first link text is used as the user name, the second as the time.
        info_links = comment.xpath(".//span[@class='comment-info']/a/text()")
        short = comment.xpath(".//p[@class='comment-content']/span[@class='short']/text()")
        votes = comment.xpath(".//span[@class='comment-vote']/span[@class='vote-count']/text()")
        if len(info_links) < 2 or not short or not votes:
            # Skip an entry with a missing field instead of aborting the page.
            continue

        name = info_links[0]
        posted_at = info_links[1]
        content = str(short[0]).strip()

        # Rating title; absent when the user left no rating. The score is
        # computed fresh for every comment so an unrecognised title yields 0
        # rather than inheriting the previous comment's score.
        mark = comment.xpath(".//span[@class='comment-info']/span/@title")
        score = _SCORE_BY_TITLE.get(str(mark[-1]), 0) if mark else 0

        comments.append([str(name), content, str(posted_at), score, int(votes[0])])

        for token in jieba.cut(content, cut_all=False, HMM=False):
            cleaned = regex_change(token)  # clean once, reuse the result
            if len(cleaned) >= 2:
                words.append(cleaned)

 

 

def getWordCloud(words):
    """Print the ten most frequent words and render a word cloud from *words*.

    The cloud is written to ``wordcloud.png`` and then displayed with
    matplotlib.

    Args:
        words: List of word strings; repeated entries raise a word's weight.

    Returns:
        None.
    """
    # most_common(10) returns at most ten (word, count) pairs, so fewer than
    # ten distinct words no longer raises IndexError.
    top_words = Counter(words).most_common(10)
    print("热词前10位:")
    for pair in top_words:
        print(pair)

    if not words:
        # There is nothing to draw without at least one word.
        return

    text = ' '.join(words)

    w = WordCloud(background_color='white',
                  width=1000,
                  height=700,
                  font_path='simhei.ttf',
                  margin=10).generate(text)

    # Save before opening the (blocking) viewer so the file always exists.
    w.to_file('wordcloud.png')
    # Draw the image first, then show it; the reverse order displays an
    # empty figure.
    plt.imshow(w)
    plt.show()

 

 

# Interactive entry point: ask which comment ordering to scrape, download the
# first three pages, and print the first ten rows plus the ten most-voted rows.
print("请选择以下选项:")
print("   1.热门评论")
print("   2.最新评论")
info = int(input())
print("前10位短评信息:")
title = ['用户名', '短评内容', '评论时间', '评分', '点赞数']

# Menu option -> URL template of the comment listing (1: hot, 2: newest).
url_templates = {
    1: "https://book.douban.com/subject/10517238/comments/?start={}&limit=20&status=P&sort=new_score",
    2: "https://book.douban.com/subject/10517238/comments/?start={}&limit=20&status=P&sort=time",
}

if info in url_templates:
    # Start from empty accumulators; getComments appends to these globals.
    comments = []
    words = []
    # First three pages, 20 comments per page.
    for offset in (0, 20, 40):
        url = url_templates[info].format(offset)
        getComments(url)
    df = pd.DataFrame(comments, columns=title)
    print(df.head(10))
    print("点赞数前10位的短评信息:")
    df = df.sort_values(by='点赞数', ascending=False)
    print(df.head(10))
    # Only the hot-comments option also produces the word cloud.
    if info == 1:
        getWordCloud(words)

 

标签:comment,regex,re,words,print,5.18,line
From: https://www.cnblogs.com/Christmas77/p/18236170

相关文章

  • 5.18
    学习记录app即将完结 今天增加了教师的功能和自动登录的功能```java//查询所有班级根据班级的学生的分类信息每个学生的学习记录次数@OverridepublicMap<String,Object>listClass(){Map<String,Object>map=newHashMap<>();List<String>classList=u......
  • 第二届“重科杯”重庆科技大学程序设计竞赛(同步赛)ptlks的题解(2024.5.18)
    A.Alice和Bob题意:给定序列A和序列,m组信息\((i,j)\),Alice可以交换\(A_i\)和\(A_j\)任意次,判断Alice是否能将序列A转变为序列B。思路由于Alice可以任意调整m组信息,所以题目所给m组信息\((i,j)\)不影响结果。先考虑k组信息,第i组为\((T_i,T_{i+1})\),\(1\leqT_1\ltT_2\lt.........
  • 【全套】源支付5.18最新版协议去授权全套三端开源源码_客户端+云端+监控+协议三网免挂
    推荐系统为:CentOS7.6Linux系统环境:Nginx1.20.1+MySQL5.6.50+PHP-7.2+Redis将商户后台源码上传解压运行目录为Public伪静态为thinkphp访问域名傻瓜模式安装后台安装完了sudorpm-Uvhhttps://packages.microsoft.com/config/rhel/7/packages-microsoft-prod.rpm完成后输......
  • 【全套】源支付5.18最新版协议去授权全套三端开源源码_客户端+云端+监控+协议三网免挂
    推荐系统为:               材料自取:提取码:m1cxCentOS7.6Linux系统环境:Nginx1.20.1+MySQL5.6.50+PHP-7.2+Redis将商户后台源码上传解压运行目录为Public伪静态为thinkphp访问域名傻瓜模式安装后台安装完了sudorpm-Uvhhttps://packages.......
  • rhel 6.5恢复MySQL 5.5.18从节点
    文档课题:rhel6.5恢复MySQL5.5.18从节点.系统:rhel6.564位数据库:MySQL5.5.18数据库安装包:mysql-5.5.18.tar.gzXtrabackup安装包:percona-xtrabackup-24-2.4.6-2.el6.x86_64.rpm系统架构:应用场景:主库binlog未被从库应用便被binlog保留策略自动删除,主从同步出现异常.以下模拟......
  • rhel 6.5搭建MySQL 5.5.18一主一从高可用架构
    文档课题:rhel6.5搭建MySQL5.5.18一主一从高可用架构.系统:rhel6.564位数据库:MySQL5.5.18数据库安装包:mysql-5.5.18.tar.gzXtrabackup安装包:percona-xtrabackup-24-2.4.6-2.el6.x86_64.rpm架构信息如下:1、主从搭建1.1、前期准备安装两台MySQL数据库主机后,配置好主机IP地......
  • rhel 6.5以编译方式安装mysql 5.5.18
    文档课题:rhel6.5以编译方式安装mysql5.5.18数据库:mysql5.5.18系统:rhel6.564位安装包:mysql-5.5.18.tar.gz1、卸载MariaDB--卸载系统自带的mysql和mariadb-lib.[root@MySQL5518-Master~]#rpm-qa|grepmysqlmysql-libs-5.1.71-1.el6.x86_64[root@MySQL5518-Master~......
  • 5.18
    %定义目标函数f=@(x)100*(x(1)^2-x(2))^2+(x(1)-1)^2;%定义目标函数的梯度grad_f=@(x)[400*x(1)*(x(1)^2-x(2))+2*(x(1)-1);-200*(x(1)^2-x(2))];%定义终止准则epsilon=1e-5;%定义最大迭代次数max_iterations=1000;%初始点列表initial_points=......
  • 5.18
    1#include<iostream>2#include<fstream>3#include<string>45intmain(){6std::ifstreaminputFile("D://article.txt");78if(!inputFile){9std::cout<<"无法打开输入文件!"<<......
  • 5.18
    #include<iostream>usingnamespacestd;intmain(){      doublea;      double*p=&a;      cout<<"指针占了"<<sizeof(double)<<"字节"<<endl;      cout<<"指针所指向的变量占了"<<sizeof(p)&......