新闻类网页抽取
# -*- coding: utf-8 -*-
"""News-page article extractor.

Scores DOM nodes by text/link density to locate the main content block
of a news page, then emits a cleaned plain-text version and a simple
<p>-formatted HTML version of the article.
"""
import math
import re
import urllib.error
from html import unescape
from urllib.parse import urljoin

import lxml
import requests
from lxml import etree
from pyquery import PyQuery as pq


def strsim(x: str, y: str):
    """Similarity of two strings based on longest-common-subsequence.

    Returns 0 when either string is empty or when their lengths differ
    by more than a factor of 3 (too dissimilar to be the same title);
    otherwise LCS length normalised by the longer string's length.
    """
    xlen, ylen = len(x), len(y)
    if not xlen or not ylen:
        return 0
    ratio = xlen / ylen if xlen > ylen else ylen / xlen
    if ratio > 3:
        return 0
    return lcs(x, y) / max(xlen, ylen)


def lcs(x: str, y: str):
    """Length of the longest common subsequence of x and y (O(n*m) DP)."""
    xlen, ylen = len(x), len(y)
    if not xlen or not ylen:
        return 0
    opt = [[0] * (ylen + 1) for _ in range(xlen + 1)]
    for i in range(xlen - 1, -1, -1):
        for j in range(ylen - 1, -1, -1):
            if x[i] == y[j]:
                opt[i][j] = opt[i + 1][j + 1] + 1
            else:
                opt[i][j] = max(opt[i + 1][j], opt[i][j + 1])
    return opt[0][0]


class CountInfo(object):
    """Per-node text/link statistics used for content scoring."""

    def __init__(self):
        self.text_count = 0        # total characters under the node
        self.link_text_count = 0   # characters inside <a> descendants
        self.tag_count = 0         # descendant tag count (incl. self)
        self.link_tag_count = 0    # <a> descendant tag count
        self.density = 0.0         # non-link chars per non-link tag
        self.density_sum = 0.0     # sum of direct children's densities
        self.score = 0.0
        self.pcount = 0            # <p> descendant count
        self.leaflist = []         # lengths of leaf text nodes


class ContentExtractor:
    """Extracts title and main content from a news HTML page."""

    def __init__(self):
        self.doc = None            # parsed lxml tree
        self._infomap = {}         # node -> CountInfo for whole page
        self._blockinfo = {}       # node -> CountInfo for chosen block
        self.url = None
        # Tags that force a line break in the plain-text output.
        self.newline = ["div", "li", "p", "h1", "h2", "h3", "h4", "h5",
                        "tr", "img", "br", "thead", "tbody", "hr",
                        "section", "article", "ul", "ol"]
        self.top_node = None       # highest-scoring content node
        self.title = ""
        self._titlelen = 0
        self.clean_text = ""       # plain text, images stripped
        self.format_text = ""      # simple <p>-wrapped HTML
        self.score = 0.0
        self.link_text_ratio = 0.0
        self.text_count = 0
        self.img_count = 0
        self.findtitle = True      # whether calcuate() hunts for a title
        self.title_tmp = ""        # in-body title candidate
        self.raw = ""              # attribute-stripped HTML of top node
        self.__htmlattr = re.compile(r'<([a-z]+\w*)[^>]*>', re.I)

    def extract(self, url, html_source):
        """Main entry point: parse html_source and extract the article.

        Returns True on success; returns None (falsy) when parsing
        fails or no usable title / content node is found.
        """
        try:
            clean_html = self.clean_tag(html_source)
            clean_html = unescape(clean_html)
            self.doc = etree.HTML(clean_html)
        except (TypeError, ValueError, etree.XMLSyntaxError):
            return
        if self.doc is None:
            return
        self.url = url
        self.title = self.get_title()
        if not self.title:
            return
        self._titlelen = len(self.title)
        self.score, self.link_text_ratio = self.get_top_node()
        if self.top_node is None:
            return
        self.remove_link_block()
        # Prefer the in-body title candidate when it covers enough of
        # the <title> text (drops site-name suffixes).
        if (len(self.title_tmp) / float(self._titlelen)) > 0.2:
            self.title = self.title_tmp
        self.raw = self.remove_htmlattr(self.top_node)
        content = self.output_format(self.top_node)
        # Plain text keeps line structure but blanks out image lines.
        self.clean_text = "\n".join(
            [t if "img" not in t else "" for t in content.split('\n')])
        for text in content.split("\n"):
            if "img" in text:
                self.img_count += 1
                self.format_text += '<p align="center">%s</p>' % text
            else:
                text = text.strip()
                if not text:
                    continue
                self.text_count += len(text)
                if '相关文章' in text:
                    # Skip "related articles" boilerplate lines.
                    continue
                self.format_text += '<p>%s</p>' % text
        return True

    def abstracturl(self, urlpath):
        """Resolve a possibly-relative URL against the page URL."""
        return urljoin(self.url, urlpath)

    def get_title(self):
        """Pick the article title.

        Starts from <title>, then looks for the h1/h2/h3 (or *title*
        class/id) element whose text best matches it — this strips
        site-name suffixes such as " - SiteName".

        BUGFIX: the candidate xpath yields Element objects, not
        strings; the original compared and returned the elements
        themselves (len() of an Element counts children, and a
        non-string title later breaks self.title.startswith() in
        calcuate).  Compare text content instead.
        """
        title = ''.join(self.doc.xpath('//title/text()')).strip()
        if title == "":
            return ""
        titles = self.doc.xpath(
            '//h1|//h2|//h3|//*[contains(@class, "title")]'
            '|//*[contains(@id, "title")]')
        if not titles:
            return title
        ok_title = title
        max_sim = 0
        for node in titles:
            text = ''.join(node.itertext()).strip()
            sim = strsim(text, title)
            if sim > max_sim:
                ok_title = text
                max_sim = sim
        return ok_title

    @staticmethod
    def clean_tag(doc):
        """Strip script/noscript/style/iframe/comments; turn <br> into
        newlines.

        Runs before html.unescape(), so character entities are still in
        entity form at this point.
        """
        doc = re.sub(r'<script.*?>.*?</script>', '', doc, flags=(re.I | re.S))
        doc = re.sub(r'<noscript.*?>.*?</noscript>', '', doc,
                     flags=(re.I | re.S))
        doc = re.sub(r'<style.*?>.*?</style>', '', doc, flags=(re.I | re.S))
        doc = re.sub(r'<iframe.*?>.*?</iframe>', '', doc, flags=(re.I | re.S))
        doc = re.sub(r'[\r\t]+', '', doc)
        doc = re.sub(r'<br\s*/?>', '\n', doc, flags=re.I)
        doc = re.sub(r'<!--.*?-->', '', doc, flags=re.S)
        # BUGFIX: the original replaced a plain space with a space
        # (a no-op); the intent is to normalise non-breaking spaces.
        doc = re.sub(r'&nbsp;|\xa0', ' ', doc)
        return doc

    @staticmethod
    def contents(node):
        """Direct child text nodes and child elements, in order."""
        result = []
        result.extend(node.xpath("child::text()|child::*"))
        return result

    def calcuate(self, node, record):
        """Recursively gather CountInfo for node and all descendants.

        Results are stored into `record` keyed by element.  Leaf text
        nodes contribute their stripped length; while self.findtitle is
        set, text that is a prefix of self.title is remembered as an
        in-body title candidate.
        """
        if etree.iselement(node):
            info = CountInfo()
            for elem in self.contents(node):
                childinfo = self.calcuate(elem, record)
                info.text_count += childinfo.text_count
                info.link_text_count += childinfo.link_text_count
                info.tag_count += childinfo.tag_count
                info.link_tag_count += childinfo.link_tag_count
                info.leaflist.extend(childinfo.leaflist)
                info.density_sum += childinfo.density
                info.pcount += childinfo.pcount
            info.tag_count += 1
            tagname = node.tag
            if tagname == "a":
                # All text under a link counts as link text.
                info.link_text_count = info.text_count
                info.link_tag_count += 1
            elif tagname == "p":
                info.pcount += 1
            purelen = info.text_count - info.link_text_count
            not_link_tag_num = info.tag_count - info.link_tag_count
            if purelen == 0 or not_link_tag_num == 0:
                info.density = 0
            else:
                info.density = float(purelen) / not_link_tag_num
            record[node] = info
            return info
        elif hasattr(node, "is_text"):
            # lxml "smart string" text node.
            info = CountInfo()
            nodetext = node.strip()
            txtlen = len(nodetext)
            info.text_count = txtlen
            info.leaflist.append(txtlen)
            tmp_len = len(self.title_tmp)
            if self.findtitle and tmp_len < txtlen <= self._titlelen \
                    and self.title.startswith(nodetext):
                self.title_tmp = nodetext
            return info
        else:
            # Comments, processing instructions, etc. contribute nothing.
            return CountInfo()

    def calcuate_score(self, node):
        """Score a node: higher for dense, varied, paragraph-rich text."""
        info = self._infomap.get(node)
        if info is None:
            return 0.0
        val = math.sqrt(self.calcuate_var(info.leaflist) + 1)
        return math.log(val) * info.density_sum * \
            math.log(float(info.text_count - info.link_text_count) + 1.0) * \
            math.log10(float(info.pcount) + 2.0)

    @staticmethod
    def calcuate_var(leafs):
        """Population variance of leaf text lengths.

        (The original docstring said "average"; the code computes the
        variance, with a single-element special case of value/2.)
        """
        leaf_len = len(leafs)
        if leaf_len <= 0:
            return 0.0
        if leaf_len == 1:
            return leafs[0] / 2.0
        sums = 0.0
        for v in leafs:
            sums += float(v)
        ave = sums / leaf_len
        sums = 0.0
        for v in leafs:
            sums += (float(v) - ave) ** 2
        sums /= leaf_len
        return sums

    def get_top_node(self):
        """Find the highest-scoring content node under <body>.

        Sets self.top_node and returns (max_score, link_text_ratio)
        for the winner.
        """
        self.findtitle = True
        body = self.doc.find("body")
        self.calcuate(body, self._infomap)
        max_score = 0.0
        link_text_ratio = 0.0
        for node, info in self._infomap.items():
            tagname = node.tag
            if tagname in ["a", "body"]:
                continue
            score = self.calcuate_score(node)
            if score > max_score:
                max_score = score
                self.top_node = node
                try:
                    link_text_ratio = info.link_text_count / \
                        float(info.text_count)
                except ZeroDivisionError:
                    pass
        return max_score, link_text_ratio

    def remove_htmlattr(self, node):
        """Serialise node with all tag attributes and div/a wrappers
        removed; <img> keeps only its (data-)src URL."""
        raw = etree.tounicode(node)
        raw = re.sub('</?(?:div|a)[^>]*>', '', raw, flags=re.I)
        # BUGFIX: replacement strings with \g<1> must be raw strings
        # (\g is an invalid escape in a normal string literal).
        raw = re.sub(r'<img[^>]*?(?:src|data-src)\s?=\s?[\'"]?([^\'"]+)[^>]*>',
                     r'<#img src="\g<1>" />', raw, flags=re.I)
        raw = self.__htmlattr.sub(r'<\g<1>>', raw)
        raw = re.sub(r'<#img', '<img', raw).strip()
        return re.sub(r'[\r\n\t]+|</?div>', '',
                      self.remove_empty_node(raw))

    def remove_empty_node(self, text):
        """Drop tags with no text content, recursing into nested
        empty-wrapper "matryoshka" tags.  <img>/<br> (and wrappers
        containing an <img> with a src) are preserved."""
        text = unescape(text)
        try:
            doc = pq(text)
        except Exception:
            # Original caught parser/URL errors and then everything
            # else, both returning the input unchanged — equivalent.
            return text
        for item in doc.contents():
            if etree.iselement(item):
                if ''.join(item.itertext()).strip() == "":
                    hasimg = pq(item).find("img")
                    if item.tag not in ['img', 'br'] \
                            and not hasimg.attr('src'):
                        pq(item).remove()
                else:
                    self.remove_empty_node(item)
        return str(doc)

    def output_format(self, cnode):
        """Flatten a node to plain text: block tags become newlines and
        <img> becomes an absolute-src img tag."""
        content = ""
        for node in self.contents(cnode):
            if hasattr(node, "is_text"):
                content += node
            elif etree.iselement(node):
                if node.tag in self.newline:
                    content += "\n"
                if node.tag == "img":
                    # Lazy-loaded images put the URL in data-src.
                    src = node.attrib.get("data-src", "") \
                        or node.attrib.get("src", "")
                    src = self.abstracturl(src)
                    if src:
                        content += '<img src="%s" />' % src
                content += self.output_format(node)
        return content.strip()

    def remove_link_block(self):
        """Remove sub-blocks of the content node that are mostly links
        (navigation / related-article boxes)."""
        self.findtitle = False
        self.calcuate(self.top_node, self._blockinfo)
        for node, info in self._blockinfo.items():
            if node.tag == "a":
                continue
            try:
                link_text_ratio = info.link_text_count / info.text_count
            except ZeroDivisionError:
                continue
            if link_text_ratio > 0.5:
                parentnode = node.getparent()
                if etree.iselement(parentnode):
                    parentnode.remove(node)


if __name__ == '__main__':
    url = 'http://www.fanwenbaba.cn/nianzhongzongjie/43952.html'
    r = requests.get(url, timeout=10)
    r.encoding = 'utf-8'
    ex = ContentExtractor()
    ex.extract(url, r.text)
    print(ex.title)
    print("=" * 100)
    print(ex.raw)
提取新闻类正文数据
标签:node,info,抽取,网页,新闻,self,count,re,text From: https://www.cnblogs.com/yoyo1216/p/17832010.html