Getting all status codes with requests
By default, requests follows redirects, so you never see the 301/302 status codes themselves. Pass allow_redirects=False to stop the automatic redirect and get the original status code of every response.
import requests

# url
# url = 'http://www.freebuf.com/news/157100.html'  # request 200, returns 200
url = 'http://www.freebuf.com/fevents/133225.html'  # request 302, returns 200; to stop the redirect and see the 302, pass allow_redirects=False
# url = 'http://www.freebuf.com/articles/database/151839.html'  # request 403, returns 403
# url = 'http://www.freebuf.com/articles/database/1518391.html'  # existing domain, missing page: request 404, returns 404
# url = 'http://www.freebudfsf.com/articles/database/1518391.html'  # non-existent domain: the request raises an exception
# url = 'https://www.douban.com/group/topic/49606658/'  # existing domain blocked by the corporate network: raises an exception, same effect as a network outage
# url = 'http://10.1.75.241'  # request an IP (the http:// scheme is required, otherwise it raises)

# headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}

try:
    # send the request, get the response
    response = requests.get(url, headers=headers, allow_redirects=False)
    # inspect the result
    print('given url:', url)
    print('request.url:', response.request.url)
    print('response.url:', response.url)
    print(response.content)
    print(response.status_code)
except Exception as e:
    print(e)
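When redirects are left enabled, the intermediate 301/302 responses are still available afterwards in response.history. A minimal sketch, assuming the freebuf URL above still redirects:

import requests

url = 'http://www.freebuf.com/fevents/133225.html'  # example from above, assumed to redirect
response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})  # allow_redirects defaults to True
for hop in response.history:
    # each hop is an intermediate redirect response, e.g. 302 plus the original URL
    print(hop.status_code, hop.url)
print(response.status_code, response.url)  # final response after following the redirects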
Wrap this in a function that returns the status code for any URL, and demonstrate ways to verify the returned response.
import requests


def get_statecode_or_errinfo(url=''):
    '''
    Return the response status code, or the error info when no response was received.
    :param url: the URL to request
    :return: status code, or the exception raised when there is no response
    '''
    if url == '':
        return 'Please pass a url to get_statecode_or_errinfo'
    # headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }
    try:
        # send the request, get the response
        response = requests.get(url, headers=headers, allow_redirects=False)
        # return the status code
        return response.status_code
    except Exception as e:
        # return the exception info
        return e


if __name__ == '__main__':
    # url
    # url = 'http://www.freebuf.com/news/157100.html'  # request 200, returns 200
    url = 'http://www.freebuf.com/fevents/133225.html'  # request 302, returns 200; to stop the redirect and see the 302, pass allow_redirects=False
    # url = 'http://www.freebuf.com/articles/database/151839.html'  # request 403, returns 403
    # url = 'http://www.freebuf.com/articles/database/1518391.html'  # existing domain, missing page: request 404, returns 404
    # url = 'http://www.freebudfsf.com/articles/database/1518391.html'  # non-existent domain: raises an exception; behind an Nginx catch-all it may return 200
    # url = 'http://dsfs'  # non-existent domain with allow_redirects=False; behind an Nginx catch-all this gave a 304 and returned 200
    # url = 'https://www.douban.com/group/topic/49606658/'  # existing domain blocked by the corporate network: raises an exception, same effect as a network outage
    # url = 'http://10.1.75.241'  # request an IP: request 200, returns 200 (the http:// scheme is required, otherwise it raises)
    # url = 'http://www.freebuf.com/fevents/133225.html'  # request 302, returns 200; to see the 302, pass allow_redirects=False
    url = 'http://www.freebuf.com/news/171238.html'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }
    # response = requests.get(url, headers=headers, allow_redirects=False)
    response = requests.get(url, headers=headers)

    # check the status code
    print(response.status_code)

    # check the url (the request may have been redirected)
    print(url)
    print(response.url)

    # check the request headers
    print(response.request.headers)

    # check the response headers
    print(response.headers)

    # check the page source
    # print(response.content)
    # print(response.content.decode())
    # print(response.text)

    # response.encoding = 'utf-8'
    # print(response.text)
    print(response.encoding)

    # check the length of the page source
    print(len(response.content))
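A minimal usage sketch of the get_statecode_or_errinfo helper itself (the URLs are the examples from above; the exact codes depend on how the site responds today):

for url in ['http://www.freebuf.com/news/157100.html',        # expected 200
            'http://www.freebuf.com/fevents/133225.html',     # expected 302, since redirects are not followed
            'http://www.freebudfsf.com/articles/database/1518391.html']:  # non-existent domain: the exception object is returned
    print(url, '->', get_statecode_or_errinfo(url))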
Notes:
Anti-crawling: it is worth summarizing the different ways to verify what requests returned,
for example: check the status code, check the URL (a redirect may have occurred), check the request headers, check the response headers, check the page source, and check the length of the page source.
Check the status code
print(response.status_code)
Check the URL
print(response.url)
Check the request headers
print(response.request.headers)
Check the response headers
print(response.headers)
Check the length of the page source
print(len(response.content))
Check the page source
print(response.content)
print(response.content.decode())
response.encoding = 'utf-8'
print(response.text)
print(response.encoding)
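As a sketch, these checks can be bundled into one helper that prints everything needed to debug a suspicious response (the name verify_response is only an illustration, not part of the original code):

def verify_response(response):
    '''Print the common checks for a requests Response in one place.'''
    print('status_code:', response.status_code)
    print('requested url:', response.request.url)
    print('final url:', response.url)            # differs from request.url after a redirect
    print('request headers:', response.request.headers)
    print('response headers:', response.headers)
    print('encoding:', response.encoding)
    print('content length:', len(response.content))

# usage:
# verify_response(requests.get('http://www.freebuf.com/news/157100.html'))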
Response rules for a scrapy spider:
# 1. Filtered out, request never sent: the URL is outside the allowed domains
# temp['title_url'] = "https://www.baidu.com/"  # cross-domain; the URL is filtered out before the request is sent
# temp['title_url'] = "http://open.freebuf.com/live?id=1021"  # cross-domain; the URL is filtered out before the request is sent
# temp['title_url'] = "http://10.1.75.241"  # IP address: filtered out before the request is sent; if the IP site is allowed, it is not filtered and returns 200
# 2. Access denied
# temp['title_url'] = "http://www.freebuf.com/articles/database/151839.html"   # 403: the resource exists but access is denied. "Ignoring non-200 response"
# temp['title_url'] = "http://www.freebuf.com/articles/database/1518391.html"  # 404: the resource itself does not exist. "Ignoring non-200 response"
# 3. The redirect target becomes a new request
# temp['title_url'] = "http://www.freebuf.com/news/156654.html"  # 301/302 redirect; returns the 200 status code of the redirected page
# 4. Network down
# temp['title_url'] = "https://www.douban.com/group/topic/49606658/"  # blocked by the corporate network. [<twisted.python.failure.Failure twisted.internet.error.ConnectionLost: Connection to the other side was lost in a non-clean fashion: Connection lost.>]
# 5. Non-existent site
# temp['title_url'] = "https://www.badfsdsdfsdfsdfsdddd.com/"  # filtered out directly; if not filtered, DNS resolution fails: DNS lookup failed: no results for hostname lookup: www.badfsdsdfsdfsdfsdddd.com.
pass
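By default scrapy's HttpError middleware drops non-200 responses (the "Ignoring non-200 response" messages above) before they reach the callback. If you want to handle the 403/404 yourself, a sketch under the assumption of a standard scrapy project (the spider name status_demo is only for illustration):

import scrapy

class StatusDemoSpider(scrapy.Spider):
    name = 'status_demo'  # hypothetical spider, for illustration only
    allowed_domains = ['freebuf.com']
    start_urls = ['http://www.freebuf.com/articles/database/151839.html']  # the 403 example above

    # let these status codes through to the callback instead of ignoring them
    custom_settings = {'HTTPERROR_ALLOWED_CODES': [403, 404]}

    def parse(self, response):
        # 403/404 responses now arrive here instead of being dropped
        print(response.status, response.url)

The same can be done per request with meta={'handle_httpstatus_list': [403, 404]} on the scrapy.Request.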
A scrapy spider example
freebuf2.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy_FB.items import ScrapyFb2Item
# from util.logger import Logger
# logger_freebuf2 = Logger(logname=__name__, logpath='collection_log', logformat=1, loglevel=10).getlog()
# logger_freebuf2.debug('i am debug3')
# logger_freebuf2.info('i am info3')
# logger_freebuf2.warning('i am warning3')


class Freebuf2Spider(scrapy.Spider):
    # freebuf2 spider
    name = 'freebuf2'
    allowed_domains = ['freebuf.com', 'douban.com']
    start_urls = ['http://www.freebuf.com/page/708']

    def parse(self, response):
        cur_url = response.url  # URL of the current list page
        cur_page_num = int(cur_url.rpartition('/')[-1])  # current page number
        print('cur_url:%s' % cur_url)
        print('cur_page_num:%s' % cur_page_num)
        # get the list nodes
        node_list = response.xpath('//*[@id="timeline"]/div/div[2]/dl/dt/a[1]')
        print('len(node_list):%s' % len(node_list))
        page_num = int(cur_url.rpartition('/')[-1])  # current page number
        count_node = len(node_list)  # number of detail pages on the current list page
        # iterate over the nodes
        for i, node in enumerate(node_list):
            # temp = {}
            temp = ScrapyFb2Item()
            temp['title'] = node.xpath('./text()').extract()[0].strip()
            if i == 0:
                # 1. Filtered out, request never sent: the URL is outside the allowed domains
                # temp['title_url'] = "https://www.baidu.com/"  # cross-domain; filtered out before the request is sent
                # temp['title_url'] = "http://open.freebuf.com/live?id=1021"  # cross-domain; filtered out before the request is sent
                # temp['title_url'] = "http://10.1.75.241"  # IP address: filtered out before the request is sent; if allowed, returns 200
                # 2. Access denied
                # temp['title_url'] = "http://www.freebuf.com/articles/database/151839.html"   # 403: resource exists, access denied. "Ignoring non-200 response"
                # temp['title_url'] = "http://www.freebuf.com/articles/database/1518391.html"  # 404: resource does not exist. "Ignoring non-200 response"
                # 3. The redirect target becomes a new request
                # temp['title_url'] = "http://www.freebuf.com/news/156654.html"  # 301/302 redirect; returns the 200 of the redirected page
                # 4. Network down
                # temp['title_url'] = "https://www.douban.com/group/topic/49606658/"  # blocked by the corporate network. [<twisted.python.failure.Failure twisted.internet.error.ConnectionLost: Connection to the other side was lost in a non-clean fashion: Connection lost.>]
                # 5. Non-existent site
                # temp['title_url'] = "https://www.badfsdsdfsdfsdfsdddd.com/"  # filtered out directly; otherwise DNS lookup failed: no results for hostname lookup: www.badfsdsdfsdfsdfsdddd.com.
                pass
            else:
                temp['title_url'] = node.xpath('./@href').extract()[0]
            temp['page_num'] = str(page_num)
            temp['line_num'] = i + 1
            temp['line_total'] = str(count_node)
            # print(temp['line_num'])
            yield scrapy.Request(temp['title_url'], callback=self.parse_detail, meta={"meta_1": temp}, errback=self.err)
        if len(node_list) != 0:  # condition for the spider to keep going
            # next page
            next_url = 'http://www.freebuf.com/page/{}'.format(cur_page_num + 1)
            # print('next_url:%s' % next_url)
            yield scrapy.Request(next_url, callback=self.parse)  # request the next list page

    def parse_detail(self, response):
        item = response.meta['meta_1']
        print(item['line_num'], item['title_url'])
        # print(response.status)
        print(item['line_num'], response.request.url)

    def err(self, response):
        # errback: despite the name, 'response' here is a twisted Failure, not a Response
        print('err:', response.request.url)
        # print('err:', response.status)
        # print(dir(response))
        print('err:', response.getErrorMessage())
        print(dir(response))
        # print(type(response.getErrorMessage()))
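The err errback above receives a twisted Failure object, not a Response. A sketch of an errback that tells the failure types apart, following the pattern from the scrapy documentation (the exception classes are the standard scrapy/twisted ones, not something defined by this spider):

from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import DNSLookupError, TimeoutError, TCPTimedOutError

def err(self, failure):
    # log every failed request
    self.logger.error('request failed: %s', failure.request.url)
    if failure.check(HttpError):
        # raised by the HttpError middleware for non-200 responses; the response is still available
        response = failure.value.response
        self.logger.error('HttpError %s on %s', response.status, response.url)
    elif failure.check(DNSLookupError):
        # the domain could not be resolved (case 5 above)
        self.logger.error('DNSLookupError on %s', failure.request.url)
    elif failure.check(TimeoutError, TCPTimedOutError):
        # the request timed out
        self.logger.error('Timeout on %s', failure.request.url)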