- Create a project
  cd <working directory>
  scrapy startproject <project_name> [project_dir]
  cd <project directory>
  scrapy genspider <spider_name> <domain>
  scrapy crawl <spider_name>
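For example, a run-through assuming a hypothetical project named "bookspider" with a spider called "books" targeting books.toscrape.com (the names are illustrative, not taken from these notes):
  cd ~/projects
  scrapy startproject bookspider
  cd bookspider
  scrapy genspider books books.toscrape.com
  scrapy crawl books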
- spiders
  The detailed crawling rules you define for each target site (example below)
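A minimal sketch of a spider, assuming the hypothetical "bookspider" project above; the CSS selectors target books.toscrape.com and are illustrative assumptions:

import scrapy

class BooksSpider(scrapy.Spider):
    name = "books"                                  # used by `scrapy crawl books`
    allowed_domains = ["books.toscrape.com"]
    start_urls = ["https://books.toscrape.com/"]

    def parse(self, response):
        # Yield one record per book on the listing page
        for book in response.css("article.product_pod"):
            yield {
                "name": book.css("h3 a::attr(title)").get(),
                "price": book.css("p.price_color::text").get(),
            }
        # Follow pagination until there is no "next" link
        next_page = response.css("li.next a::attr(href)").get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)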
- items
  The data structure of the scraped data (field definitions for each record)
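A minimal sketch of an items.py definition; the field names mirror the ones used in the MySQL pipeline below:

import scrapy

class BookItem(scrapy.Item):
    # One Field per attribute of a scraped record
    name = scrapy.Field()
    price = scrapy.Field()
    author = scrapy.Field()
    out_date = scrapy.Field()
    publisher = scrapy.Field()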
- middlewares
  Middleware (hooks that sit between the engine and the downloader/spiders)
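A minimal sketch of a downloader middleware that rotates the User-Agent header; the class name and agent strings are illustrative assumptions, and it only takes effect once enabled in DOWNLOADER_MIDDLEWARES (see the settings sketch below):

import random

class RandomUserAgentMiddleware:
    user_agents = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)",
    ]

    def process_request(self, request, spider):
        # Pick a User-Agent for every outgoing request
        request.headers["User-Agent"] = random.choice(self.user_agents)
        return None  # returning None lets Scrapy continue processing the request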
- pipelines
  Item pipelines, responsible for persistent storage and data cleaning
  Example: storing items in MongoDB
import pymongo

class MongoDBPipeline(object):
    def __init__(self):
        # Create the client connection
        self.client = pymongo.MongoClient(host='localhost', port=27017)
        # Select the database
        self.db = self.client["test"]
        # Select the collection
        self.col = self.db["j"]

    def process_item(self, item, spider):
        # Insert one document per item
        self.col.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()
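This pipeline requires pymongo to be installed (pip install pymongo) and, like any pipeline, only runs after being registered in ITEM_PIPELINES in settings.py, as shown in the settings sketch further below.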
  Example: storing items in MySQL
import pymysql

class BookschinaPipeline:
    def __init__(self):
        # database is the target database name
        self.conn = pymysql.connect(
            host='localhost',
            port=3306,
            user='root',
            password='',
            database='spiders',
            charset='utf8mb4',
            cursorclass=pymysql.cursors.DictCursor
        )
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        sql = """insert into bookschina_goods (name, price, author, out_date, publisher)
                 values (%s, %s, %s, %s, %s)"""
        # Parameterized query; missing fields fall back to empty strings
        self.cursor.execute(sql, (
            item.get('name', ''),
            item.get('price', ''),
            item.get('author', ''),
            item.get('out_date', ''),
            item.get('publisher', '')
        ))
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()
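The insert statement assumes a bookschina_goods table already exists; a hypothetical schema matching the five fields (the column types are assumptions, not taken from these notes):

CREATE TABLE IF NOT EXISTS bookschina_goods (
    id INT AUTO_INCREMENT PRIMARY KEY,
    name VARCHAR(255),
    price VARCHAR(32),
    author VARCHAR(255),
    out_date VARCHAR(32),
    publisher VARCHAR(255)
) DEFAULT CHARSET=utf8mb4;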
- settings
  Project configuration file (settings.py)
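A minimal sketch of the settings.py entries that wire the pieces above together; the module paths assume the hypothetical "bookspider" project name:

BOT_NAME = "bookspider"

ROBOTSTXT_OBEY = True      # respect robots.txt
DOWNLOAD_DELAY = 1         # throttle requests to be polite

# Enable the item pipelines defined above; lower numbers run first (0-1000)
ITEM_PIPELINES = {
    "bookspider.pipelines.MongoDBPipeline": 300,
    "bookspider.pipelines.BookschinaPipeline": 400,
}

# Enable the downloader middleware defined above
DOWNLOADER_MIDDLEWARES = {
    "bookspider.middlewares.RandomUserAgentMiddleware": 543,
}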
- shell command (interactive debugging)
  scrapy shell <url>
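Inside the shell a `response` object for the fetched URL is already available; typical usage (the selectors are illustrative assumptions):

response.status                          # HTTP status code
response.css("title::text").get()        # try a CSS selector
response.xpath("//h1/text()").get()      # try an XPath selector
fetch("https://books.toscrape.com/")     # load another page into `response`
view(response)                           # open the fetched page in a browser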