## spider (spiders/xiaoshuo.py) — the crawler definition
import scrapy
class XiaoshuoSpider(scrapy.Spider):
    """Crawl a web novel chapter by chapter, following each page's
    "next chapter" link and yielding one item per chapter."""

    name = "爬虫名"
    allowed_domains = ["域名"]
    start_urls = ["第一章url地址"]

    def parse(self, response):
        # Chapter title
        title = response.xpath('//h1/text()').get()
        # Chapter body: every text node inside the content div
        content = response.xpath('//div[@id="content"]/text()').getall()
        # Link to the next chapter (4th link in the bottom navigation bar)
        next_url = response.xpath('//div[@class="bottem2"]/a[4]/@href').get()
        yield {
            'title': title,
            'content': content,
        }
        # BUG FIX: the original did 'https://www.tycqzw.la' + next_url, which
        # raises TypeError on the last chapter where the XPath matches nothing
        # and next_url is None. Guard first, and let response.urljoin resolve
        # the link against the current page URL instead of a hard-coded domain.
        if next_url:
            yield scrapy.Request(response.urljoin(next_url), callback=self.parse)
## pipeline (pipelines.py) — saves scraped chapters to a text file
class Scrapy05Pipeline:
    """Item pipeline that appends each scraped chapter (title followed by
    its content) to a single UTF-8 text file, xiaoshuo.txt."""

    def open_spider(self, spider):
        # Called once when the spider starts: open the output file.
        self.file = open('xiaoshuo.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # BUG FIX: the spider's .get()/.getall() return None / [] when a
        # selector matches nothing; the original crashed with TypeError on
        # `None + '\n'` and `''.join(None)`. Fall back to empty values.
        self.file.write((item['title'] or '') + '\n')
        self.file.write(''.join(item['content'] or []) + '\n\n\n\n')
        return item

    def close_spider(self, spider):
        # Called once when the spider finishes: release the file handle.
        self.file.close()
## settings (settings.py) — project configuration
# Request header: identify as a normal desktop Chrome browser so the site
# does not reject the default Scrapy user agent.
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
# Do not honour robots.txt. Stated explicitly instead of commenting out the
# template's `ROBOTSTXT_OBEY = True` — Scrapy's built-in default is already
# False, so behavior is unchanged but the intent is now visible.
ROBOTSTXT_OBEY = False
# Delay (seconds) between consecutive requests, to be polite to the site.
DOWNLOAD_DELAY = 2
# Enable the pipeline that writes chapters to xiaoshuo.txt (priority 300).
ITEM_PIPELINES = {
    "scrapy05.pipelines.Scrapy05Pipeline": 300,
}
Tags: item, self, title, spider, save, content, example, Scrapy, def. Source: https://www.cnblogs.com/jiangjiayun/p/17501514.html