### File: douban.py
import scrapy
import time
from bs4 import BeautifulSoup
from scrapy import Request
from Scripts.ScrapyProject.items import bookItem
class DoubanSpider(scrapy.Spider):
    """Crawl book cards from the douban.com books homepage.

    ``parse`` extracts the name and author from each book card in the
    homepage carousels and schedules a request for the book's detail
    page; ``detailparse`` completes the item with rating and intro.
    """

    name = "douban"
    allowed_domains = ["book.douban.com"]
    start_urls = ["http://book.douban.com/"]

    def parse(self, response):
        """Parse homepage carousels; yield one detail Request per book."""
        soup = BeautifulSoup(response.text, 'lxml')
        # Each homepage carousel of books carries this compound class.
        carousels = soup.find_all(class_="list-col list-col5 list-express slide-item")
        for carousel in carousels:
            for book in carousel.find_all("li"):
                # BUG FIX: the original called time.sleep(3) here, which
                # blocks Scrapy's event loop and stalls the whole crawl.
                # Throttle with the DOWNLOAD_DELAY setting instead.
                item = bookItem()
                title_link = book.find(class_="title").a  # hoisted: used twice
                item['name'] = title_link.text
                item['author'] = book.find(class_="author").text
                url = title_link.attrs.get('href')
                # Follow the detail URL; pass the partly-filled item through
                # meta so detailparse can finish it. callback defaults to
                # self.parse if omitted, so it must be set explicitly here.
                yield Request(url, callback=self.detailparse, meta={'item': item})

    def detailparse(self, response):
        """Fill in rating and intro from a book's detail page, yield the item."""
        # Retrieve the item handed over by parse() via request meta.
        item = response.meta.get('item')
        rate = response.xpath('//*[@id="interest_sectl"]/div/div[2]/strong/text()').extract_first()
        item['rate'] = rate
        soup = BeautifulSoup(response.text, 'lxml')
        # .text rather than .string: the intro div has nested tags, so
        # .string would return None.
        content = soup.find(class_="intro").text
        item['content'] = content
        yield item
### File: items.py
import scrapy
class bookItem(scrapy.Item):
    """Container for one scraped book record, filled by DoubanSpider."""
    name = scrapy.Field()     # book title from the listing card
    author = scrapy.Field()   # author line as shown on the listing card
    rate = scrapy.Field()     # rating text from the detail page (may be None)
    content = scrapy.Field()  # introduction/description text from the detail page
### File: pipelines.py
from itemadapter import ItemAdapter
import csv
import pymongo
class BookMongoPipeline:
    """Scrapy item pipeline that persists book items into MongoDB.

    Each item is inserted as one document into a collection named after
    the spider, inside the ``Scrapy`` database on the local MongoDB.
    """

    db_url = 'mongodb://localhost:27017'
    db_name = 'Scrapy'

    def open_spider(self, spider):
        # One client per crawl; reused by every process_item call.
        self.client = pymongo.MongoClient(self.db_url)
        self.db = self.client[self.db_name]

    def process_item(self, item, spider):
        collection = self.db[spider.name]
        data = {'书名': item['name'], '作者': item['author'], '评价': item['rate'], '推荐': item['content']}
        collection.insert_one(data)
        print('%s 数据保存成功' % (item['name']))
        # BUG FIX: a pipeline must return the item, otherwise any
        # pipeline running after this one receives None.
        return item

    def close_spider(self, spider):
        # BUG FIX: the original wrote `self.client.close` (no call),
        # which leaked the MongoDB connection on shutdown.
        self.client.close()
标签:name,self,scrapy,爬取,item,Scrapy,豆瓣,import,class
From: https://www.cnblogs.com/yigehulu/p/17501239.html