Spider Code
Scrape the shuangseqiu (double-color-ball) chart page on Sina Lottery, extracting only the issue number (qihao), the red balls, and the blue ball.
import scrapy


class Shuangseqiu11Spider(scrapy.Spider):
    name = "shuangseqiu11"
    allowed_domains = ["sina.com.cn"]
    start_urls = ["https://view.lottery.sina.com.cn/lotto/pc_zst/index?lottoType=ssq&actionType=chzs&type=50&dpc=1"]

    def parse(self, response, **kwargs):
        # Each <tr> inside #cpdata is one draw
        cpdatas = response.xpath("//*[@id='cpdata']/tr")
        for cpdata in cpdatas:
            qihao = cpdata.xpath("./td[1]/text()").extract_first()
            # Red balls use class chartball01/chartball20, the blue ball uses chartball02
            hongse = cpdata.xpath("./td[@class='chartball01' or @class='chartball20']/text()").extract()
            lanse = cpdata.xpath("./td[@class='chartball02']/text()").extract_first()
            if not hongse:
                # Skip rows that carry no ball numbers (header/summary rows)
                continue
            item = {
                "qihao": qihao,
                "hongse": hongse,
                "lanse": lanse
            }
            yield item
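With the spider in place you can already run a crawl before any pipelines are configured. A minimal sketch that runs the spider programmatically (an assumption: the spider lives inside a Scrapy project and this script is run from the project root, which is equivalent to running "scrapy crawl shuangseqiu11" on the command line):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Resolve the project settings, then run the spider by its name
process = CrawlerProcess(get_project_settings())
process.crawl("shuangseqiu11")
process.start()  # blocks until the crawl is finished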
Writing to a CSV File
In the earlier post 《爬虫框架Scrapy初使用_爬取4399游戏页面数据》 (first use of the Scrapy framework: scraping 4399 game data), the output file was opened once for every item that was written. What we would rather do is open the file once and reuse that single file handle for all of the writes. That is possible: the pipeline can define two extra methods, open_spider() and close_spider(). Their names explain what they do:
- open_spider() runs once, when the spider starts
- close_spider() runs once, when the spider finishes
The complete pipeline code:
class ShuangseqiuPipeline:
    def open_spider(self, spider):
        print("spider opened")
        # Open the file once and keep the handle for the whole crawl
        self.f = open('data2.csv', mode='a', encoding="UTF-8")

    def close_spider(self, spider):
        print("spider closed")
        self.f.close()

    def process_item(self, item, spider):
        # One CSV line per draw: issue number, red balls joined by "_", blue ball
        self.f.write(f"{item['qihao']},{'_'.join(item['hongse'])},{item['lanse']}\n")
        return item
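For this pipeline to actually receive items it still has to be registered in ITEM_PIPELINES. The full three-pipeline configuration is shown at the end of this post; a minimal sketch that enables only the CSV pipeline (assuming the project is named caipiao, as the imports below suggest) would be:

# settings.py -- enable only the CSV pipeline
ITEM_PIPELINES = {
    'caipiao.pipelines.ShuangseqiuPipeline': 300,
}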
Writing to MySQL
Add the MySQL connection info to the Scrapy settings:
settings.py
# MySQL connection settings
MYSQL_CONFIG = {
    "host": "localhost",
    "port": 3306,
    "user": "root",
    "password": "test123456",
    "database": "spider",
}
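The pipeline below inserts into a table named caipiao with columns qihao, red, and blue. The post does not show the table definition, so the following one-off script is only a sketch: the column names come from the insert statement, while the types are assumptions. It also assumes the caipiao package is importable (run it from the project root).

import pymysql
from caipiao.settings import MYSQL_CONFIG as mysql

conn = pymysql.connect(host=mysql["host"], port=mysql["port"], user=mysql["user"],
                       password=mysql["password"], database=mysql["database"])
with conn.cursor() as cursor:
    # Assumed schema -- adjust the types to taste
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS caipiao (
            id    INT AUTO_INCREMENT PRIMARY KEY,
            qihao VARCHAR(20),
            red   VARCHAR(64),
            blue  VARCHAR(8)
        )
    """)
conn.commit()
conn.close()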
# pipelines.py
from caipiao.settings import MYSQL_CONFIG as mysql
import pymysql


class CaipiaoMySQLPipeline:
    def open_spider(self, spider):
        # One connection for the whole crawl
        self.conn = pymysql.connect(host=mysql["host"], port=mysql["port"], user=mysql["user"],
                                    password=mysql["password"], database=mysql["database"])

    def close_spider(self, spider):
        self.conn.close()

    def process_item(self, item, spider):
        # Write one row per draw into the caipiao table
        try:
            cursor = self.conn.cursor()
            sql = "insert into caipiao(qihao, red, blue) values(%s, %s, %s)"
            red = ",".join(item['hongse'])
            blue = item['lanse']
            cursor.execute(sql, (item['qihao'], red, blue))
            self.conn.commit()
            spider.logger.info(f"saved item {item}")
        except Exception as e:
            self.conn.rollback()
            spider.logger.error(f"failed to save item {item}: {e}")  # log the error
        return item
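After a crawl you can confirm that the rows arrived with a few lines of pymysql. This is just a sanity-check sketch using the same settings, not part of the pipeline:

import pymysql
from caipiao.settings import MYSQL_CONFIG as mysql

conn = pymysql.connect(host=mysql["host"], port=mysql["port"], user=mysql["user"],
                       password=mysql["password"], database=mysql["database"])
with conn.cursor() as cursor:
    # Print the five most recent draws that were stored
    cursor.execute("SELECT qihao, red, blue FROM caipiao ORDER BY qihao DESC LIMIT 5")
    for row in cursor.fetchall():
        print(row)
conn.close()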
Writing to MongoDB
Add the MongoDB config to settings.py:
MONGO_CONFIG = {
    "host": "localhost",
    "port": 27017,
    # 'has_user': True,
    # 'user': "python_admin",
    # "password": "123456",
    "db": "python"
}
from caipiao.settings import MONGO_CONFIG as mongo
import pymongo


class CaipiaoMongoDBPipeline:
    def open_spider(self, spider):
        client = pymongo.MongoClient(host=mongo['host'],
                                     port=mongo['port'])
        db = client[mongo['db']]
        # if mongo['has_user']:
        #     db.authenticate(mongo['user'], mongo['password'])  # only needed when MongoDB auth is enabled
        self.client = client
        self.collection = db['caipiao']

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # insert_one() replaces the deprecated insert()
        self.collection.insert_one({"qihao": item['qihao'], 'red': item['hongse'], 'blue': item['lanse']})
        return item
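The same kind of sanity check works for MongoDB. A sketch that reuses the settings above and prints the most recently inserted documents:

import pymongo
from caipiao.settings import MONGO_CONFIG as mongo

client = pymongo.MongoClient(host=mongo['host'], port=mongo['port'])
# Newest documents first, based on the auto-generated _id
for doc in client[mongo['db']]['caipiao'].find().sort("_id", -1).limit(5):
    print(doc['qihao'], doc['red'], doc['blue'])
client.close()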
Storing to All Three at Once
Register all three pipelines in settings.py at the same time. The number after each pipeline is its order value: lower numbers run earlier, so items flow through the file pipeline first, then MySQL, then MongoDB. This works because every process_item() returns the item for the next pipeline:
ITEM_PIPELINES = {
    # the three pipelines can coexist
    'caipiao.pipelines.ShuangseqiuPipeline': 300,
    'caipiao.pipelines.CaipiaoMySQLPipeline': 301,
    'caipiao.pipelines.CaipiaoMongoDBPipeline': 302,
}
From: https://www.cnblogs.com/fuchangjiang/p/17898859.html