首页 > 数据库 >Scrapy 抓取指数基金6个月前10 名 存到mongodb 中

Scrapy 抓取指数基金6个月前10 名 存到mongodb 中

时间:2022-11-28 23:32:31浏览次数:49  
标签:10 03 mongodb self 30 Scrapy 2018 100 050


1.抓取前的分析

 

Scrapy 抓取指数基金6个月前10 名 存到mongodb 中_ide

http://fund.eastmoney.com/trade/zs.html#zwf_,sc_6y,st_desc 地址是这个,我们发现中间的内容是后来通过ajax来加载的,所以这个地址是抓取不到的。通过chrome开发者工具发现真正的地址是这个: https://fundapi.eastmoney.com/fundtradenew.aspx?ft=zs&sc=3y&st=desc&pi=1&pn=10


var rankData = {datas:["162412|华宝中证医疗指数分级|指数型|2018-03-30|1.0066|3.55|8.59|14.43|14.31|13.87|6.85|-1.64||14.32|-60.10|3|1|1|0.12|0||050,051,054,060,070|1|1|100元|1.20%|0.12%|0.12%|1","502056|广发医疗指数分级|指数型|2018-03-30|1.1007|3.41|8.33|14.18|14.29|15.04|9.27|2.93||14.30|-24.22|3|1|1|0.05|0||020,050,051,054,060|1|1|100元|0.50%|0.05%|0.05%|1","501007|汇添富中证互联网医疗指数A|指数型|2018-03-30|1.0296|3.07|7.67|11.94|12.52|11.68|6.28|||12.48|2.96|3|1|1|0.08|0||020,050,051,054,080|1|1|100元|0.80%|0.08%|0.08%|1","501008|汇添富中证互联网医疗指数C|指数型|2018-03-30|1.0262|3.08|7.68|11.97|12.51|11.63|6.02|||12.46|2.62|3|0|1|0.00|0||020,050,051,054,080|1|1|100元||0.00%||","005112|银华中证全指医药卫生|指数型|2018-03-30|1.1443|4.47|6.14|14.65|12.44|14.43||||12.45|14.43|3|1|1|0.12|0||050,052,054,080|1|1|100元|1.20%|0.12%|0.12%|1","001629|天弘中证计算机指数A|指数型|2018-03-30|0.7541|2.68|10.26|10.44|10.36|7.59|11.52|-6.52||10.36|-24.59|3|1|1|0.10|0||050,051,054,080|1|1|100元|1.00%|0.10%|0.10%|1","001630|天弘中证计算机指数C|指数型|2018-03-30|0.7492|2.67|10.26|10.42|10.29|7.44|11.22|-6.90||10.31|-25.08|3|0|1|0.00|0||050,051,054,080|1|1|100元||0.00%||","161022|富国创业板指数分级|指数型|2018-03-30|0.8110|3.18|9.45|8.28|9.12|0.24|-1.64|-15.50|-22.65|9.12|22.61|3|1|1|0.12|0||050,051,053,060,070|1|1|100元|1.20%|0.12%|0.12%|1","161613|融通创业板指数A|指数型|2018-03-30|0.8910|3.13|9.86|8.39|9.06|0.79|-4.71|-17.58|-23.66|9.06|113.70|3|1|1|0.12|0||050,052,053|1|1|100元|1.20%|0.12%|0.12%|1","160420|华安创业板50指数分级|指数型|2018-03-30|0.8437|3.13|11.34|7.63|8.85|-11.96|-19.93|-37.38||8.88|-65.57|3|1|1|0.12|0||050,051,053,060,070|0|1|100元|1.20%|0.12%|0.12%|1"],allRecords:549,pageIndex:1,pageNum:10,allPages:55};


它其实是js代码,而不是json。所以对于我们来说就相当于纯文本了,直接用文本分割操作即可。


2. 抓取代码

 

import scrapy


#import json
import logging
from tutorial.items import FundItem

class QuotesSpider(scrapy.Spider):
    """Spider that fetches the top-10 index funds from eastmoney's trade API
    and yields one FundItem (code + name) per fund."""

    name = "eastmoney"

    def start_requests(self):
        # The page at fund.eastmoney.com loads its table via this ajax
        # endpoint (found with the Chrome dev tools); pi = page index,
        # pn = page size, sc = sort column, st = sort order.
        url = 'https://fundapi.eastmoney.com/fundtradenew.aspx?ft=zs&sc=3y&st=desc&pi=1&pn=10'
        yield scrapy.Request(url)

    def parse(self, response):
        """Parse the js-style response and yield a FundItem per fund.

        The body is JavaScript (``var rankData = {datas:["..."],...}``),
        not JSON, so it is treated as plain text and split manually.
        """
        body = response.body.decode('utf-8')
        # Cut everything up to and including the opening `["` and from the
        # closing `"]` onwards, leaving `code|name|...","code|name|...`.
        datas = body[body.find('[') + 2 : body.rfind(']') - 1]

        for record in datas.split('","'):
            fields = record.split('|')
            # Lazy %s args: the message is only formatted if INFO is enabled.
            logging.info('code:%s name:%s', fields[0], fields[1])
            # Build a fresh item for every fund.  The original code created
            # one FundItem before the loop and mutated/re-yielded it, so all
            # yielded items were the same object — any consumer holding on
            # to earlier items would see them overwritten by later funds.
            item = FundItem()
            item['code'] = fields[0]
            item['name'] = fields[1]
            yield item

下面是pipeline的代码,把结果保存到mongodb中

import datetime
from tutorial import settings
import logging
import pymongo


class FundPipeline(object):
    """Scrapy item pipeline that stores every crawled fund in MongoDB."""

    # MongoDB collection that receives the items.
    collection_name = 'funds'

    def __init__(self, mongo_uri, mongo_db):
        # Only record the connection parameters here; the client itself is
        # opened in open_spider so constructing the pipeline has no side
        # effects and needs no running MongoDB.
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the crawler settings.

        Reads MONGO_URI and MONGO_DATABASE (the latter defaults to 'test').
        """
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE', 'test'),
        )

    def open_spider(self, spider):
        # One client per spider run; released again in close_spider.
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        # Items behave like dicts; persist a plain-dict copy and return the
        # item unchanged so later pipeline stages still receive it.
        self.db[self.collection_name].insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()

3.结果

 

Scrapy 抓取指数基金6个月前10 名 存到mongodb 中_chrome_02

 


标签:10,03,mongodb,self,30,Scrapy,2018,100,050
From: https://blog.51cto.com/u_7920880/5893809

相关文章