Set the configuration options in settings.py:
MONGODB_HOST = "127.0.0.1"
MONGODB_PORT = 27017
MONGODB_DB_NAME = "bang123"
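For the pipeline below to run at all, it also has to be registered in ITEM_PIPELINES. The post does not show this step; assuming the project module is named zolwallpaper (matching the "from zolwallpaper.items import Bang123Item" import in the spider file below), the entry would look like this, with 300 being an arbitrary conventional priority:

# settings.py -- register the pipeline so Scrapy actually calls it.
# The module path "zolwallpaper.pipelines" is an assumption inferred from
# the item import in the spider; adjust it to your project's name.
ITEM_PIPELINES = {
    "zolwallpaper.pipelines.Bang123Pipeline": 300,
}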
pipelines.py:
from itemadapter import ItemAdapter
from pymongo import MongoClient
from scrapy.utils.project import get_project_settings

class Bang123Pipeline:
    # Name of the MongoDB collection the items are written to
    COLLECTION_NAME = "t_bang123"

    def __init__(self):
        # Read the MongoDB connection info from settings.py
        settings = get_project_settings()
        self.client = MongoClient(host=settings["MONGODB_HOST"], port=settings["MONGODB_PORT"])
        self.db = self.client[settings["MONGODB_DB_NAME"]]
        self.collection = self.db[self.COLLECTION_NAME]

    def __del__(self):
        # Close the client when the pipeline object is garbage-collected
        self.client.close()

    def process_item(self, item, spider):
        if spider.name == "bang123":
            # Insert the item into MongoDB; insert_one returns an
            # InsertOneResult, whose inserted_id is the new document's _id
            data_dict = ItemAdapter(item).asdict()
            insert_result = self.collection.insert_one(data_dict)
            print(f"{insert_result.inserted_id=}")
        else:
            print("not the bang123 spider, skipping the MongoDB write")
        return item
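The version above opens the connection in __init__ and relies on __del__ to close it. Scrapy's documented pipeline hooks (from_crawler, open_spider, close_spider) tie the client's lifetime to the crawl instead; here is a minimal alternative sketch of the same pipeline using those hooks, not the author's original code:

from itemadapter import ItemAdapter
from pymongo import MongoClient

class Bang123Pipeline:
    COLLECTION_NAME = "t_bang123"

    @classmethod
    def from_crawler(cls, crawler):
        # Pull the connection settings from the running crawler
        return cls(
            host=crawler.settings["MONGODB_HOST"],
            port=crawler.settings["MONGODB_PORT"],
            db_name=crawler.settings["MONGODB_DB_NAME"],
        )

    def __init__(self, host, port, db_name):
        self.host, self.port, self.db_name = host, port, db_name

    def open_spider(self, spider):
        # Connect once when the spider starts
        self.client = MongoClient(host=self.host, port=self.port)
        self.collection = self.client[self.db_name][self.COLLECTION_NAME]

    def close_spider(self, spider):
        # Close the connection when the crawl finishes
        self.client.close()

    def process_item(self, item, spider):
        if spider.name == "bang123":
            self.collection.insert_one(ItemAdapter(item).asdict())
        return item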
Spider file:
import scrapy
from scrapy.http import HtmlResponse
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from zolwallpaper.items import Bang123Item

class Bang123Spider(CrawlSpider):
    name = "bang123"
    allowed_domains = ["bang123.cn"]
    start_urls = ["https://www.bang123.cn/"]

    rules = (
        # Pagination links (dots escaped so they match literally)
        Rule(LinkExtractor(allow=r"https://www\.bang123\.cn/index_\d+\.html"), follow=True),
        # Detail pages
        Rule(LinkExtractor(allow=r"https://www\.bang123\.cn/gongshi/\d+\.html"), callback="parse_item", follow=False),
    )

    def parse_item(self, response: HtmlResponse):
        bang_item = Bang123Item()

        selector = response.xpath('//div[@class="article_content layui-field-box"]')[0]
        title = selector.xpath('./h1/text()').get()
        main = response.xpath('//div[@class="content tindent of_table"]/p').getall()

        bang_item["title"] = title
        bang_item["main"] = main

        print(f"【{title=}】")
        print(f"{main=}")
        print("-" * 150)

        # Hand the item off to the pipeline
        yield bang_item
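The spider imports Bang123Item from zolwallpaper.items, which the post never shows. Inferring from the two fields assigned in parse_item, a minimal items.py sketch would be:

# items.py -- minimal definition inferred from the fields used in parse_item
import scrapy

class Bang123Item(scrapy.Item):
    title = scrapy.Field()  # article title taken from the <h1>
    main = scrapy.Field()   # list of raw <p> HTML fragments from the body

Run the crawl with scrapy crawl bang123, then verify the writes with pymongo, e.g. MongoClient("127.0.0.1", 27017)["bang123"]["t_bang123"].count_documents({}).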
From: https://www.cnblogs.com/juelian/p/17559674.html