Disclaimer: this article is for demonstration and sharing purposes only...
Autohome data
-- Parsing the data with Scrapy's built-in XPath
Fetching one page:
qczj.py
def parse(self, response):
    # pass
    # print(response.text)
    # Scrapy's built-in XPath parsing:
    #   response_object.xpath('xpath expression')
    divs = response.xpath('//div[@class="list-cont"]')  # every car div on the page
    for div in divs:
        # name = div.xpath('.//a[@class="font-bold"]/text()')                  # [<Selector>]
        # name = div.xpath('.//a[@class="font-bold"]/text()').extract()        # ['value']
        # name = div.xpath('.//a[@class="font-bold"]/text()').extract_first()  # 'value'
        # name = div.xpath('.//a[@class="font-bold"]/text()').get()            # 'value'
        # name = div.xpath('.//a[@class="font-bold"]/text()').getall()         # ['value']
        # response.xpath(...) returns a list of Selector objects.
        # To pull the text content out of a Selector:
        #   get() / extract_first()  -> a single string (the first match)
        #   getall() / extract()     -> a list of strings (all matches)
        name = div.xpath('.//a[@class="font-bold"]/text()').get()
        # price
        price = div.xpath('.//span[@class="font-arial"]/text()').get()
        # other info (class level, driving range, motor): class="info-gray"
        info = ",".join(div.xpath('.//span[@class="info-gray"]/text()').getall())
        print(name, price, info)
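To see the extraction methods side by side, here is a minimal sketch that runs outside a spider; the HTML string is made up for illustration only:

    from scrapy.selector import Selector

    html = '<div><a class="font-bold">Car A</a><a class="font-bold">Car B</a></div>'
    sel = Selector(text=html)
    print(sel.xpath('//a[@class="font-bold"]/text()').get())     # 'Car A' (first match)
    print(sel.xpath('//a[@class="font-bold"]/text()').getall())  # ['Car A', 'Car B']
    print(sel.xpath('//a[@class="missing"]/text()').get())       # None when nothing matches

extract_first() and extract() are the older aliases of get() and getall(); note that get() quietly returns None on no match, so guard before calling string methods on the result.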
Fetching multiple pages:
Building start_urls so requests are issued automatically
# First approach: a list comprehension that generates every page url
# start_urls = [
#     f"https://car.autohome.com.cn/diandongche/list-20_25-0-0-0-0-0-0-0-{i}.html"
#     for i in range(1, 11)
# ]
# Second approach: append each url to the list with list.append()
# start_urls = []
# for i in range(1, 11):
#     start_urls.append(f'https://car.autohome.com.cn/diandongche/list-20_25-0-0-0-0-0-0-0-{i}.html')
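Both versions build the same list of ten page URLs. Dropped into the spider, the idea looks like the sketch below (the parse body is elided, since it is identical to the single-page version above):

    import scrapy

    class QczjSpider(scrapy.Spider):
        name = "qczj"
        # Scrapy issues one GET request per entry in start_urls at startup
        # and feeds every response into parse()
        start_urls = [
            f"https://car.autohome.com.cn/diandongche/list-20_25-0-0-0-0-0-0-0-{i}.html"
            for i in range(1, 11)
        ]

        def parse(self, response):
            ...  # same XPath extraction as above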
Sending the follow-up requests manually
# visit the first page only; it lands in the parse method below
start_urls = ['https://car.autohome.com.cn/diandongche/list-20_25-0-0-0-0-0-0-0-1.html']
url = 'https://car.autohome.com.cn/diandongche/list-20_25-0-0-0-0-0-0-0-%s.html'
page = 1

def parse(self, response):
    # the request url that produced this response
    # print(response.url)
    divs = response.xpath('//div[@class="list-cont"]')  # every car div on the page
    for div in divs:
        name = div.xpath('.//a[@class="font-bold"]/text()').get()
        # price
        price = div.xpath('.//span[@class="font-arial"]/text()').get()
        # other info (class level, driving range, motor): class="info-gray"
        info = ','.join(div.xpath('.//span[@class="info-gray"]/text()').getall())
        print(name, price, info)
    # request the following pages
    self.page += 1
    # build the next url
    next_url = self.url % self.page
    # issue the request, passing this parse method itself as the callback,
    # so that once the response comes back the same parsing runs on it
    if len(divs):  # keep paginating only while the page still yields data,
                   # otherwise the spider would never stop
        yield scrapy.Request(url=next_url, callback=self.parse)
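As an aside, Scrapy's Response.follow can replace the manual string formatting; it also accepts relative URLs. A sketch of the same pagination tail under that API, reusing the page counter above:

    # equivalent ending of parse(), using response.follow instead of
    # building the absolute url by hand
    if len(divs):
        self.page += 1
        yield response.follow(self.url % self.page, callback=self.parse)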
Parsing detail-page data:
Very often, after grabbing the list data from the main page, we still need to enter each detail page to fetch more fields. This, too, is done by issuing requests manually.
def parse(self, response):
    divs = response.xpath('//div[@class="list-cont"]')  # every car div on the page
    for div in divs:
        name = div.xpath('.//a[@class="font-bold"]/text()').get()
        # price
        price = div.xpath('.//span[@class="font-arial"]/text()').get()
        # other info (class level, driving range, motor): class="info-gray"
        info = ','.join(div.xpath('.//span[@class="info-gray"]/text()').getall())
        # store the fields in an item object
        item = Scrapy3Item()
        item['name'] = name
        item['price'] = price
        item['info'] = info
        # to visit this car's detail page we need its url
        detail_url = div.xpath('.//a[@class="font-bold"]/@href').get()
        # the page gives a relative url:
        #   /diandongche/series-4278.html#pvareaid=2042206
        # which must become:
        #   https://car.autohome.com.cn/diandongche/series-4278.html#pvareaid=2042206
        detail_url = 'https://car.autohome.com.cn' + detail_url
        # manually request the detail page.
        # meta passes data to the callback: it is a dict, the keys are
        # whatever you choose, the values are the data you want to hand over
        yield scrapy.Request(url=detail_url, callback=self.parse_detail,
                             meta={'item': item})

# a dedicated method that parses the detail page
def parse_detail(self, response):
    # meta comes back as a dict on the response; read the item by its key
    item = response.meta['item']
    divs = response.xpath('//div[@class="interval01-list-cars-infor"]')
    types_ls = []
    for div in divs:
        type = div.xpath('./p[1]/a/text()').get()
        types_ls.append(type)
    # add a types field to the item
    item['types'] = types_ls
    print(item)
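meta works, but Scrapy also uses it internally (proxy settings, retries, and so on); since Scrapy 1.7 the documented channel for passing your own values to a callback is cb_kwargs, whose keys arrive as keyword arguments. A sketch of the same hand-off:

    # in parse(): the 'item' key becomes a parameter of parse_detail
    yield scrapy.Request(url=detail_url, callback=self.parse_detail,
                         cb_kwargs={'item': item})

    # the extra keyword argument is injected by Scrapy
    def parse_detail(self, response, item):
        item['types'] = response.xpath(
            '//div[@class="interval01-list-cars-infor"]/p[1]/a/text()').getall()
        yield item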
Putting it all together:
import scrapy
from scrapy_demo1.items import ScrapyDemo1Item

class QczjSpider(scrapy.Spider):
    name = "qczj"
    # allowed_domains = ["qczj.com"]
    start_urls = [
        "https://car.autohome.com.cn/diandongche/list-20_25-0-0-0-0-0-0-0-1.html"
    ]
    # url = "https://car.autohome.com.cn/diandongche/list-20_25-0-0-0-0-0-0-0-%s.html"
    # Fetching multiple pages -- the pagination idea:
    # the framework automatically requests every url in start_urls,
    # so to paginate you only need to put every page url into that list
    page = 1  # left over from the manual-pagination variant; unused here

    def parse(self, response):
        divs = response.xpath('//div[@class="list-cont"]')  # every car div on the page
        for div in divs:
            name = div.xpath('.//a[@class="font-bold"]/text()').get()
            # price
            price = div.xpath('.//span[@class="font-arial"]/text()').get()
            # other info (class level, driving range, motor): class="info-gray"
            info = ",".join(div.xpath('.//span[@class="info-gray"]/text()').getall())
            # store the fields in an item object
            item = ScrapyDemo1Item()
            item["name"] = name
            item["price"] = price
            item["info"] = info
            # to visit this car's detail page we need its url
            detail_url = div.xpath('.//a[@class="font-bold"]/@href').get()
            # relative url /diandongche/series-4278.html#pvareaid=2042206 becomes
            # https://car.autohome.com.cn/diandongche/series-4278.html#pvareaid=2042206
            detail_url = "https://car.autohome.com.cn" + detail_url
            # manually request the detail page
            yield scrapy.Request(
                url=detail_url,
                callback=self.parse_detail,
                meta={"item": item},  # pass the item along; the key is arbitrary
            )

    # a dedicated method that parses the detail page, e.g.
    # https://car.autohome.com.cn/diandongche/series-4278.html#pvareaid=2042206
    def parse_detail(self, response):
        # two different methods build one record -- meta is how the
        # half-filled item travels between them
        item = response.meta["item"]  # take the item back out
        divs = response.xpath('//div[@class="interval01-list-cars-infor"]')
        types_ls = []
        for div in divs:
            type = div.xpath("./p[1]/a/text()").get()
            types_ls.append(type)
        # add a types field to the item:
        item["types"] = types_ls
        # print(item)
        yield item  # yielding the item hands it to the pipelines automatically
And so on...
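One loose end: the item class imported at the top (ScrapyDemo1Item) has to be declared in the project's items.py. A minimal sketch with the four fields this article actually fills in:

    # items.py -- field names match the keys assigned in the spider above
    import scrapy

    class ScrapyDemo1Item(scrapy.Item):
        name = scrapy.Field()   # model name
        price = scrapy.Field()  # price text
        info = scrapy.Field()   # class level / driving range / motor
        types = scrapy.Field()  # list of trims collected on the detail page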