Big Data Analysis and Visualization: A Novel Scraper Class
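The WebScraper class below downloads a complete novel from shicimingju.com. It requests the book's table-of-contents page, uses XPath to pull each chapter's title and link, fetches every chapter body in turn, and appends the results to a single UTF-8 text file, pausing briefly between requests.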
import random
import requests
from lxml import etree
import time


class WebScraper:
    def __init__(self, url, output_file):
        self.url = url
        self.output_file = output_file
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36'
        }

    def request_html(self, url):
        response = requests.get(url, headers=self.headers)
        response.encoding = 'utf-8'
        return response.text

    def parse_html(self, html, f):
        tree = etree.HTML(html)
        # Chapter title elements from the table of contents
        list_name = tree.xpath('//div[@class="book-mulu"]/ul/li/a')
        # Relative chapter links
        list_name_href = tree.xpath('//div[@class="book-mulu"]/ul/li/a/@href')
        # zip pairs each title element with its corresponding link
        for item, href in zip(list_name, list_name_href):
            # Build the absolute URL
            href = 'https://www.shicimingju.com' + href
            title = item.text
            print('Downloading: -**--%s--**-......' % title)
            # Fetch the chapter body via get_text
            text = self.get_text(href)
            f.write(title + '\n' + text)
            print('Finished downloading: -**--%s--**-' % title)
            # Sleep for a random interval to avoid hammering the server
            time.sleep(random.uniform(0, 1))

    def get_text(self, href):
        content = self.request_html(href)
        tree = etree.HTML(content)
        # div(s) holding the chapter body
        artist = tree.xpath('//div[@class="chapter_content"]')
        # Serialize each matched node to plain text and join the pieces
        # into one string covering the whole chapter.
        text = ''.join(etree.tostring(item, encoding='utf-8', method='text').decode('utf-8')
                       for item in artist)
        return text

    def run(self):
        with open(self.output_file, 'w', encoding='utf-8') as f:
            html = self.request_html(self.url)
            self.parse_html(html, f)


if __name__ == '__main__':
    scraper = WebScraper('http://www.shicimingju.com/book/liangjinyanyi.html', '两晋演义.txt')
    scraper.run()
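One fragility worth noting: parse_html runs two parallel XPath queries and relies on zip to keep titles and links aligned. An equivalent approach reads both values from the same anchor element, so they cannot drift apart if the page layout changes. A minimal sketch (the iter_chapters name is mine, not from the original code):

from lxml import etree

def iter_chapters(html, base='https://www.shicimingju.com'):
    # Each <a> in the book-mulu list carries both the chapter title
    # (its text) and the relative link (its href attribute), so a
    # single query keeps the two values paired by construction.
    tree = etree.HTML(html)
    for a in tree.xpath('//div[@class="book-mulu"]/ul/li/a'):
        yield a.text, base + a.get('href')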
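As written, request_html has no timeout and no error handling, so one dropped connection aborts the whole download. A hardened variant might look like the sketch below; the timeout, retry count, and backoff values are assumptions, not part of the original post:

import time
import requests

def request_html_with_retries(url, headers, max_retries=3):
    # Retry transient network failures with a short, growing backoff.
    # max_retries, the 10-second timeout, and the backoff step are
    # illustrative choices.
    for attempt in range(1, max_retries + 1):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()  # surface HTTP 4xx/5xx as exceptions
            response.encoding = 'utf-8'
            return response.text
        except requests.RequestException:
            if attempt == max_retries:
                raise
            time.sleep(2 * attempt)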