scrapy目录结构
myfirstscrapy # 项目名字
-myfirstscrapy # 包
-__init__.py
-spiders # 包 放爬虫,可能会有很多爬虫
-__init__.py
-cnblogs.py # 爬虫文件--》一个爬虫就是一个文件,可以写多个
-items.py # 放一个个类---》类似于django 的models--》模型类
-middlewares.py # 中间件:下载中间件、爬虫中间件都写在这里
-pipelines.py # 持久化,保存mysql,需要写的位置
-settings.py # 配置文件
-scrapy.cfg # 上线会用
解析方式
1 response对象有css方法和xpath方法
-css中写css选择器 response.css('')
-xpath中写xpath选择器 response.xpath('')
2 重点1:
-xpath取文本内容
'.//a[contains(@class,"link-title")]/text()'
-xpath取属性
'.//a[contains(@class,"link-title")]/@href'
-css取文本
'a.link-title::text'
-css取属性
'img.image-scale::attr(src)'
3 重点2:
.extract_first() 取第一个(字符串,没有匹配时返回None)
.extract() 取所有(返回字符串列表,没有匹配时为空列表)
案例:
# 使用css解析
def parse(self, response):  # parse with CSS selectors
    """Print the fields of every article on the crawled page (CSS version).

    For each ``article.post-item`` element this extracts the title,
    summary, author, avatar URL, publish date and article URL, then
    prints them with a fixed template.

    Args:
        response: The crawled response object. Only ``response.css()`` is
            used; each element it returns must itself provide ``css()``
            with ``extract()`` / ``extract_first()`` on the result.

    Returns:
        None. The extracted data is only printed.
    """
    for article in response.css('article.post-item'):
        # Title
        title = article.css('a.post-item-title::text').extract_first()

        # Summary: extract every text node and use the first of the
        # leading two that is non-empty once newlines and spaces are
        # stripped. Falling back to '' keeps a missing or one-fragment
        # summary from raising IndexError.
        fragments = article.css('p.post-item-summary::text').extract()
        real_desc = ''
        for fragment in fragments[:2]:
            real_desc = fragment.replace('\n', '').replace(' ', '')
            if real_desc:
                break

        # Author
        author = article.css('footer.post-item-foot>a>span::text').extract_first()
        # Avatar
        image_url = article.css('img.avatar::attr(src)').extract_first()
        # Publish date
        date = article.css('span.post-meta-item>span::text').extract_first()
        # Article URL
        url = article.css('a.post-item-title::attr(href)').extract_first()

        # The template lines stay at column 0 so the printed text is unchanged.
        print('''
文章名:%s
文章摘要:%s
文章作者:%s
作者头像:%s
文章日期:%s
文章地址:%s
''' % (title, real_desc, author, image_url, date, url))
# 使用xpath解析
def parse(self, response):  # parse with XPath
    """Print the fields of every article on the crawled page (XPath version).

    For each ``<article class="post-item">`` element this extracts the
    title, summary, author, avatar URL, publish date and article URL,
    then prints them with a fixed template.

    Args:
        response: The crawled response object. Only ``response.xpath()``
            is used; each element it returns must itself provide
            ``xpath()`` with ``extract()`` / ``extract_first()`` on the
            result.

    Returns:
        None. The extracted data is only printed.
    """
    for article in response.xpath('//article[@class="post-item"]'):
        # Title
        title = article.xpath('.//a[@class="post-item-title"]/text()').extract_first()

        # Summary: extract every text node and use the first of the
        # leading two that is non-empty once newlines and spaces are
        # stripped. Falling back to '' keeps a missing or one-fragment
        # summary from raising IndexError.
        fragments = article.xpath('.//p[@class="post-item-summary"]/text()').extract()
        real_desc = ''
        for fragment in fragments[:2]:
            real_desc = fragment.replace('\n', '').replace(' ', '')
            if real_desc:
                break

        # Author (CSS equivalent: 'footer.post-item-foot>a>span::text')
        author = article.xpath('.//footer[@class="post-item-foot"]/a/span/text()').extract_first()
        # Avatar (CSS equivalent: 'img.avatar::attr(src)')
        image_url = article.xpath('.//img[@class="avatar"]/@src').extract_first()
        # Publish date (CSS equivalent: 'span.post-meta-item>span::text')
        date = article.xpath('.//span[@class="post-meta-item"]/span/text()').extract_first()
        # Article URL (CSS equivalent: 'a.post-item-title::attr(href)')
        url = article.xpath('.//a[@class="post-item-title"]/@href').extract_first()

        # The template lines stay at column 0 so the printed text is unchanged.
        print('''
文章名:%s
文章摘要:%s
文章作者:%s
作者头像:%s
文章日期:%s
文章地址:%s
''' % (title, real_desc, author, image_url, date, url))
标签:item,scrapy,article,post,解析,extract,目录,css,desc
From: https://www.cnblogs.com/wellplayed/p/18029789