"""Fetch a web page's HTML source and print it converted to Markdown."""
import re

# NOTE: the third-party packages (`requests`, `html2text`) are imported inside
# the functions that use them, so the pure helper `preprocess_html` can be
# imported and tested without those packages or network access.

# Matches an <img ...> tag that has no `src=` attribute.
#   - `\b` keeps the match to the `img` tag itself (not e.g. `<imgfoo>`).
#   - `\ssrc` requires whitespace before `src`, so `data-src=` does not count.
#   - `\s*=` tolerates spaces around the equals sign (`src = "..."`).
#   - IGNORECASE covers `<IMG>` / `SRC=`, since HTML names are case-insensitive.
# This is a regex heuristic, not an HTML parser: a literal `>` inside an
# attribute value ends the match early.
_IMG_WITHOUT_SRC_RE = re.compile(
    r'<img\b(?![^>]*\ssrc\s*=)[^>]*>',
    re.IGNORECASE,
)

page_url = 'https://www.ysxiao.cn/c/202212/57443.html'

headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/127.0.0.0 Safari/537.36'
    ),
}


def preprocess_html(html: str) -> str:
    """Return *html* with every ``<img>`` tag that lacks a ``src`` removed.

    Args:
        html: HTML markup as text.

    Returns:
        The markup with src-less ``<img>`` tags deleted; all other content,
        including ``<img>`` tags that do carry ``src=``, is left untouched.
    """
    # NOTE(review): presumably such tags are dropped because the Markdown
    # converter has no URL to emit for them -- confirm against html2text.
    return _IMG_WITHOUT_SRC_RE.sub('', html)


def requests_page(url: str) -> str:
    """Download *url* and return the response body decoded as UTF-8 text.

    The request is sent with the module-level ``headers`` and a 10 second
    timeout.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body as ``str``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
    """
    import requests

    response = requests.get(url=url, headers=headers, timeout=10)
    # Fail loudly instead of converting an error page to Markdown.
    response.raise_for_status()
    # Force UTF-8 rather than relying on the charset guessed from headers.
    response.encoding = 'utf-8'
    return response.text


def main() -> None:
    """Fetch ``page_url``, strip src-less images and print the Markdown."""
    import html2text

    # `requests_page` always returns str, so no bytes decoding is needed here.
    original_format = preprocess_html(requests_page(page_url))
    markdown = html2text.html2text(original_format)
    print(markdown)


if __name__ == '__main__':
    main()
# Tags: fp, markdown, web page (网页), format, url, fetch (获取), html, original
# Source: https://www.cnblogs.com/wolvies/p/18451333