首页 > 其他分享 > 新闻类网页抽取

新闻类网页抽取

时间:2023-11-14 17:03:10 浏览次数:29
标签:node info 抽取 网页 新闻 self count re text

新闻类网页抽取

# -*- coding: utf-8 -*-
import urllib.error

import lxml
from lxml import etree
from urllib.parse import urljoin
from html import unescape
import re
import math
import requests
from pyquery import PyQuery as pq


def strsim(x: str, y: str):
    """Return a similarity ratio for ``x`` and ``y`` based on their LCS.

    The result is the longest-common-subsequence length divided by the
    length of the longer input.  ``0`` is returned without running the LCS
    when either input is empty or when one input is more than three times
    as long as the other.
    """
    shorter, longer = sorted((len(x), len(y)))
    if shorter == 0:
        return 0
    # Very unequal lengths are treated as "not similar" up front.
    if longer / shorter > 3:
        return 0
    return lcs(x, y) / longer


def lcs(x: str, y: str):
    """Return the length of the longest common subsequence of ``x`` and ``y``.

    Returns ``0`` when either input is empty.
    """
    rows, cols = len(x), len(y)
    if rows == 0 or cols == 0:
        return 0

    # table[i][j] is the LCS length of the suffixes x[i:] and y[j:]; the
    # extra last row and column stay zero and stand for an empty suffix.
    table = [[0] * (cols + 1) for _ in range(rows + 1)]

    for i in reversed(range(rows)):
        current, below = table[i], table[i + 1]
        for j in reversed(range(cols)):
            if x[i] == y[j]:
                current[j] = below[j + 1] + 1
            else:
                current[j] = max(below[j], current[j + 1])
    return table[0][0]



class CountInfo(object):
    """Statistics gathered for one DOM node while scoring candidate blocks.

    Instances start out zeroed; ``ContentExtractor.calcuate`` fills them in.
    """

    def __init__(self):
        # Characters of text under the node, and the part of it inside <a>.
        self.text_count = self.link_text_count = 0
        # Number of tags under (and including) the node, and how many are <a>.
        self.tag_count = self.link_tag_count = 0
        # Non-link text per non-link tag, and the sum of the direct
        # children's densities.
        self.density = self.density_sum = 0.0
        self.score = 0.0
        # Number of <p> elements under (and including) the node.
        self.pcount = 0
        # Stripped length of every text leaf under the node.
        self.leaflist = []


class ContentExtractor:
    """内容提取类"""

    def __init__(self):
        self.doc = None
        self._infomap = {}
        self._blockinfo = {}
        self.url = None
        self.newline = ["div", "li", "p", "h1", "h2", "h3", "h4", "h5", "tr", "img",
                        "br", "thead", "tbody", "hr", "section", "article", "ul", "ol"]
        self.top_node = None
        self.title = ""
        self._titlelen = 0
        self.clean_text = ""
        self.format_text = ""
        self.score = 0.0
        self.link_text_ratio = 0.0
        self.text_count = 0
        self.img_count = 0
        self.findtitle = True
        self.title_tmp = ""
        self.raw = ""
        self.__htmlattr = re.compile(r'<([a-z]+\w*)[^>]*>', re.I)

    def extract(self, url, html_source):
        """主要提取函数入口"""
        try:
            clean_html = self.clean_tag(html_source)
            clean_html = unescape(clean_html)
            self.doc = etree.HTML(clean_html)
        except (TypeError, ValueError, etree.XMLSyntaxError):
            return
        if self.doc is None:
            return
        self.url = url
        self.title = self.get_title()
        if not self.title:
            return
        self._titlelen = len(self.title)
        self.score, self.link_text_ratio = self.get_top_node()
        if self.top_node is None:
            return
        self.remove_link_block()
        if (len(self.title_tmp) / float(self._titlelen)) > 0.2:
            self.title = self.title_tmp
        self.raw = self.remove_htmlattr(self.top_node)
        content = self.output_format(self.top_node)
        self.clean_text = "\n".join([t if "img" not in t else
                                     "" for t in content.split('\n')])
        for text in content.split("\n"):
            if "img" in text:
                self.img_count += 1
                self.format_text += '<p align="center">%s</p>' % text
            else:
                text = text.strip()
                if not text:
                    continue
                self.text_count += len(text)
                if '相关文章' in text:
                    continue
                self.format_text += '<p>%s</p>' % text
        return True

    def abstracturl(self, urlpath):
        """URL相对链接补全"""
        return urljoin(self.url, urlpath)

    def get_title(self):
        """自动获取文章标题"""
        title = ''.join(self.doc.xpath('//title/text()')).strip()
        if title == "":
            return ""
        titles = self.doc.xpath(
            '//h1|//h2|//h3|//*[contains(@class, "title")]|//*[contains(@id, "title")]')
        if not titles:
            return title
        ok_title = title
        max_sim = 0
        for tt in titles:
            sim = strsim(tt, title)
            if sim > max_sim:
                ok_title = tt
                max_sim = sim
        return ok_title

    @staticmethod
    def clean_tag(doc):
        """去除掉script,noscript,style,iframe,br等标签"""
        doc = re.sub(r'<script.*?>.*?</script>', '', doc, flags=(re.I | re.S))
        doc = re.sub(r'<noscript.*?>.*?</noscript>',
                     '', doc, flags=(re.I | re.S))
        doc = re.sub(r'<style.*?>.*?</style>', '', doc, flags=(re.I | re.S))
        doc = re.sub(r'<iframe.*?>.*?</iframe>', '', doc, flags=(re.I | re.S))
        doc = re.sub(r'[\r\t]+', '', doc)
        doc = re.sub(r'<br\s*/?>', '\n', doc, flags=re.I)
        doc = re.sub(r'<!--.*?-->', '', doc, flags=re.S)
        doc = re.sub(r' ', ' ', doc, flags=re.S)
        return doc

    @staticmethod
    def contents(node):
        """提取节点的所有文本以及子节点"""
        result = []
        result.extend(node.xpath("child::text()|child::*"))
        return result

    def calcuate(self, node, record):
        """计算各个节点的信息"""
        if etree.iselement(node):
            info = CountInfo()
            for elem in self.contents(node):
                childinfo = self.calcuate(elem, record)
                info.text_count += childinfo.text_count
                info.link_text_count += childinfo.link_text_count
                info.tag_count += childinfo.tag_count
                info.link_tag_count += childinfo.link_tag_count
                info.leaflist.extend(childinfo.leaflist)
                info.density_sum += childinfo.density
                info.pcount += childinfo.pcount

            info.tag_count += 1
            tagname = node.tag
            if tagname == "a":
                info.link_text_count = info.text_count
                info.link_tag_count += 1
            elif tagname == "p":
                info.pcount += 1

            purelen = info.text_count - info.link_text_count
            not_link_tag_num = info.tag_count - info.link_tag_count

            if purelen == 0 or not_link_tag_num == 0:
                info.density = 0
            else:
                info.density = float(purelen) / not_link_tag_num
            record[node] = info
            return info
        elif hasattr(node, "is_text"):
            info = CountInfo()
            nodetext = node.strip()
            txtlen = len(nodetext)
            info.text_count = txtlen
            info.leaflist.append(txtlen)
            tmp_len = len(self.title_tmp)
            if self.findtitle and tmp_len < txtlen <= self._titlelen \
                    and self.title.startswith(nodetext):
                self.title_tmp = nodetext
            return info
        else:
            return CountInfo()

    def calcuate_score(self, node):
        """计算节点得分"""
        info = self._infomap.get(node)
        if info is None:
            return 0.0
        val = math.sqrt(self.calcuate_var(info.leaflist) + 1)
        return math.log(val) * info.density_sum * \
            math.log(float(info.text_count -
                           info.link_text_count) + 1.0) * \
            math.log10(float(info.pcount) + 2.0)

    @staticmethod
    def calcuate_var(leafs):
        """计算平均分"""
        leaf_len = len(leafs)
        if leaf_len <= 0:
            return 0.0
        if leaf_len == 1:
            return leafs[0] / 2.0
        sums = 0.0
        for v in leafs:
            sums += float(v)
        ave = sums / leaf_len
        sums = 0.0
        for v in leafs:
            sums += (float(v) - ave) ** 2
        sums /= leaf_len
        return sums

    def get_top_node(self):
        """获取内容主体节点"""
        self.findtitle = True
        body = self.doc.find("body")
        self.calcuate(body, self._infomap)
        max_score = 0.0
        link_text_ratio = 0.0
        for node, info in self._infomap.items():
            tagname = node.tag
            if tagname in ["a", "body"]:
                continue
            score = self.calcuate_score(node)
            if score > max_score:
                max_score = score
                self.top_node = node
                try:
                    link_text_ratio = info.link_text_count / \
                        float(info.text_count)
                except ZeroDivisionError:
                    pass
        return max_score, link_text_ratio

    def remove_htmlattr(self, node):
        """
        删除HTML所有属性和a标签链接
        """
        raw = etree.tounicode(node)
        raw = re.sub('</?(?:div|a)[^>]*>', '', raw, flags=re.I)
        raw = re.sub(r'<img[^>]*?(?:src|data-src)\s?=\s?[\'"]?([^\'"]+)[^>]*>',
                     '<#img src="\g<1>" />', raw, flags=re.I)
        raw = self.__htmlattr.sub('<\g<1>>', raw)
        raw = re.sub(r'<#img', '<img', raw).strip()
        return re.sub(r'[\r\n\t]+|</?div>', '', self.remove_empty_node(raw))

    def remove_empty_node(self, text):
        """
        删除没有内容的标签,和多层套娃无内容标签
        """
        text = unescape(text)
        try:
            doc = pq(text)
        except (lxml.etree.ParserError, lxml.etree.XMLSyntaxError,
                requests.exceptions.InvalidURL, urllib.error.HTTPError):
            return text
        except Exception:
            return text
        for item in doc.contents():
            if etree.iselement(item):
                if ''.join(item.itertext()).strip() == "":
                    hasimg = pq(item).find("img")
                    if item.tag not in ['img', 'br'] and not hasimg.attr('src'):
                        pq(item).remove()
                else:
                    self.remove_empty_node(item)
        return str(doc)

    def output_format(self, cnode):
        """格式化内容输出"""
        content = ""
        for node in self.contents(cnode):
            if hasattr(node, "is_text"):
                content += node
            elif etree.iselement(node):
                if node.tag in self.newline:
                    content += "\n"
                if node.tag == "img":
                    src = node.attrib.get("data-src", "") \
                        or node.attrib.get("src", "")
                    src = self.abstracturl(src)
                    if src:
                        content += '<img src="%s" />' % src
                content += self.output_format(node)
        return content.strip()

    def remove_link_block(self):
        """删除链接块"""
        self.findtitle = False
        self.calcuate(self.top_node, self._blockinfo)
        for node, info in self._blockinfo.items():
            if node.tag == "a":
                continue
            try:
                link_text_ratio = info.link_text_count / info.text_count
            except ZeroDivisionError:
                continue
            if link_text_ratio > 0.5:
                parentnode = node.getparent()
                if etree.iselement(parentnode):
                    parentnode.remove(node)



if __name__ == '__main__':
    # Demo: download one article and print the extracted title and body.
    page_url = 'http://www.fanwenbaba.cn/nianzhongzongjie/43952.html'
    r = requests.get(page_url, timeout=10)
    # Fail loudly on HTTP errors instead of extracting from an error page.
    r.raise_for_status()
    # The response is decoded as UTF-8 regardless of what the server declares.
    r.encoding = 'utf-8'
    html = r.text
    ex = ContentExtractor()
    if ex.extract(page_url, html):
        print(ex.title)
        print("=" * 100)
        print(ex.raw)
    else:
        print("no article content could be extracted from %s" % page_url)

  

提取新闻类正文数据

 

标签:node,info,抽取,网页,新闻,self,count,re,text
From: https://www.cnblogs.com/yoyo1216/p/17832010.html

相关文章

  • H5网页跳转微信小程序踩坑
    问题:苹果手机可以显示图片跳转按钮,但是安卓手机无法显示出来。原因:看看图片链接是 // 还是 http 开头,如果是 //test.com/upload/60/2b605429ddcc756370be777761c98d.png 这种形式的图片链接,会导致安卓手机......
  • JavaScript使用JS从JSON获取信息并遍历输出到网页展示信息------前端
    遍历JSON获取数据<!DOCTYPEhtml><!--这是HTML的注释--><htmllang="en"id="myHtml"> <head> <!--这里不是设置了编码,而是告诉浏览器,用什么编码方式打开文件避免乱码--> <metacharset="UTF-8"> <metaname="viewport"......
  • 11月12日基础的网页设计以及阴影的设计box-shadow属性
    目录基础的网页设计html的代码css的代码阴影效果添加基础的网页设计html的代码<!DOCTYPEhtml><htmllang="en"><head><metacharset="UTF-8"><title>今天网页</title><linkrel="stylesheet"href="样式.css"......
  • 微信的聊天记录导出到网页中的最快方法,语音能听,图片视频能看
    12-7如果你有把微信的聊天记录导出到表格或者网页上的需求,适合看看本文章,本文的方法可以让你把微信的聊天记录导出备份,可以在完全脱离微信的情况下随时调取查看聊天数据。本文介绍的软件可以导出两种格式的聊天记录备份文件,一种是表格,一种是网页。导出表格的好处是文本紧凑,篇幅小,缺......
  • 实用技巧分享:网页上的视频没法另存为,如何下载
     网页上的视频没法另存为,如何下载,下面介绍一个好用的方法:1、首先在相应页面按下F12或者点击浏览器右上角(不同浏览器位置略有不同)的更多工具——开发者工具。2、点击网络按钮,然后下拉找到比如mp4格式的文件3、找到mp4视频文件,将请求网址后面的链接复制出来,放到新页面中4、点击右下......
  • 11月12日基础的网页设计以及阴影的设计box-shadow属性
    目录基础的网页设计html的代码css的代码基础的网页设计html的代码<!DOCTYPEhtml><htmllang="en"><head><metacharset="UTF-8"><title>今天网页</title><linkrel="stylesheet"href="样式.css">&......
  • python随机抽取数字的方法和代码
    在Python中,我们可以使用内置的random模块来随机抽取数字。下面是一些示例。从一个列表中随机抽取数字如果你有一个数字列表,并且你想从中随机选择一个数字,你可以使用random.choice函数。pythonimportrandomnumbers=[1,2,3,4,5,6,7,8,9,10]chosen_number=rando......
  • 在移动电话上点击网页链接时如何触发电话呼叫。
    内容来自DOChttps://q.houxu6.top/?s=在移动电话上点击网页链接时如何触发电话呼叫。我需要为移动设备构建一个网页。只有一件事我还没有弄清楚:如何通过点击文本来触发电话呼叫?是否有一个特殊的URL可以像mailto:标签用于电子邮件那样使用?不希望使用特定于设备的解决方......
  • 你以为键入网址后只是等待吗?惊!原来网页显示背后隐藏着这些奇妙步骤(终章)
    引言在前面的讨论中,我们已经详细介绍了计算机网络中的物理层、传输层和网络层以及应用层的工作原理。这些层次组成了一个完整的网络架构,确保了数据的传输和交流。在今天的讨论中,我们将进一步深入探讨数据包从计算机发出后的一系列流程,这涉及到网络设备中的交换机和路由器在其中扮......
  • 华为手机自带浏览器在web网页中对长按保存无法禁止?
    在最近的开发过程中,遇到一个问题,在实现拖动图片(img标签)时,如果未拖动而长按图片,会触发保存功能。在对标签进行CSS设置以后,其他的浏览器都已禁止了长按保存功能,现在只留下华为自带的浏览器还存在长按保存的功能。-webkit-touch-callout:none;-webkit-user-select:none;-moz-use......