More usage of requests
SSL verification
1. HTTPS and HTTP
HTTPS = HTTP + SSL/TLS (certificate verification)
2. Usage
2.1 Skip certificate verification
import requests
# skip certificate verification; prints a warning but still returns 200
res = requests.get('https://www.12306.cn', verify=False)
print(res.status_code)
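When verify=False is kept, urllib3 prints an InsecureRequestWarning on every request; a minimal sketch for silencing it (this uses urllib3's documented disable_warnings helper, not something from the original notes):
import urllib3
import requests
# suppress the InsecureRequestWarning that verify=False triggers
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
res = requests.get('https://www.12306.cn', verify=False)  # no warning printed now
print(res.status_code)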
2.2 Manually provide a certificate
import requests
res = requests.get('https://www.12306.cn', cert=('/path/server.crt', '/path/key'))
print(res.status_code)
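Besides a client certificate, verify can also take the path to a CA bundle instead of True/False, which lets you trust a self-signed or internal CA; a sketch (the bundle path is just a placeholder):
import requests
# verify accepts a path to a CA bundle file instead of a boolean
res = requests.get('https://www.12306.cn', verify='/path/ca-bundle.crt')
print(res.status_code)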
Using proxies
1. Rate limiting
- Some sites limit requests by IP or by user account; request too frequently and your IP gets banned, so a crawler has to work around this
- IP bans: use proxies
- Account bans: register many throwaway accounts
2. Proxies
- Forward proxy: proxies the client
- Reverse proxy: proxies the server; nginx is a reverse proxy server
3. Sending a request through a proxy IP
import requests
# the key in proxies must match the request's scheme, so cover both http and https
proxies = {'http': 'http://192.168.10.102:9003', 'https': 'http://192.168.10.102:9003'}
res = requests.get('https://www.baidu.com', proxies=proxies)
print(res.status_code) # 200
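If the proxy requires authentication, requests accepts the credentials inside the proxy URL; a sketch with placeholder user/password/host values:
import requests
proxies = {
    'http': 'http://user:password@192.168.10.102:9003',
    'https': 'http://user:password@192.168.10.102:9003',
}
res = requests.get('https://www.baidu.com', proxies=proxies)
print(res.status_code)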
Timeout settings
# timeout of 3 seconds
res = requests.get('https://www.baidu23.com', timeout=3)
print(res)
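timeout can also be a (connect, read) tuple, so establishing the connection and reading the response get separate limits; a minimal sketch:
import requests
# 2 seconds to establish the connection, 5 seconds to read the response
res = requests.get('https://www.baidu.com', timeout=(2, 5))
print(res.status_code)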
Exception handling
import requests
# see requests.exceptions for all available exception types
from requests.exceptions import *
try:
    res = requests.get('https://www.baidu.com', timeout=0.00001)
except ReadTimeout:
    print('read timed out')
except ConnectionError:
    print('connection error')  # network unreachable
except Timeout:
    print('timed out')  # any timeout
except RequestException:
    print('Error')
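The exceptions above only cover network-level failures; an HTTP error status (404, 500, ...) still returns normally. A sketch that also catches those via raise_for_status(), which raises requests.exceptions.HTTPError for 4xx/5xx responses:
import requests
from requests.exceptions import HTTPError, RequestException
try:
    res = requests.get('http://httpbin.org/status/404', timeout=3)
    res.raise_for_status()  # raises HTTPError for 4xx/5xx responses
except HTTPError as e:
    print('HTTP error:', e)
except RequestException as e:
    print('request failed:', e)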
Uploading files
import requests
files = {'file': open('01', 'rb')}
res = requests.post('http://httpbin.org/post', files=files)
print(res.text)
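The value in the files dict can also be a (filename, fileobj, content_type) tuple, which controls the filename and MIME type the server sees; a sketch that also closes the file properly:
import requests
with open('01', 'rb') as f:
    # explicitly set the filename and content type of the uploaded field
    files = {'file': ('01.jpg', f, 'image/jpeg')}
    res = requests.post('http://httpbin.org/post', files=files)
print(res.json()['files'].keys())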
Setting up a proxy pool
1. An open-source proxy pool project on GitHub; clone it and run it locally
- Crawling: it scrapes free proxy sites, collects free proxies, validates them, and stores them locally
- A Flask web backend exposes an endpoint that returns a random usable proxy address
- https://github.com/jhao104/proxy_pool
2. Building the proxy pool
# create a virtual environment: mkvirtualenv -p python38 crawl
2.1 git clone https://github.com/jhao104/proxy_pool.git
2.2 Inside the virtual environment, install the dependencies: pip install -r requirements.txt
2.3 Edit the settings.py config file (make sure the Redis service is running)
- API service config
HOST = "0.0.0.0" # listen address
PORT = 5000 # listening port
# database config
DB_CONN = 'redis://127.0.0.1:8888/0'
# ProxyFetcher config
PROXY_FETCHER = [
"freeProxy01",
"freeProxy02",
]
2.4 Start the crawler and the web service
# start the scheduler
python proxyPool.py schedule
# start the web API service
python proxyPool.py server
2.5 Get a random proxy
127.0.0.1:5000/get
- Sending a request through the proxy pool
import requests
res = requests.get('http://127.0.0.1:5000/get/').json()
if res['https']:
    http = 'https'
else:
    http = 'http'
proxie = {http: res['proxy']}
print(proxie)
res = requests.get('https://www.cnblogs.com/liuqingzheng/p/16005896.html', proxies=proxie)
print(res.status_code)
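Free proxies die often, so a single attempt frequently fails. A sketch of a small retry loop that pulls proxies from the pool and discards the ones that fail; it assumes the pool's documented /delete endpoint for removing a dead proxy (check your proxy_pool version's README for the exact route):
import requests

def get_with_proxy_pool(url, retries=3):
    """Try the request through proxies from the pool, dropping proxies that fail."""
    for _ in range(retries):
        proxy_info = requests.get('http://127.0.0.1:5000/get/').json()
        scheme = 'https' if proxy_info.get('https') else 'http'
        proxies = {scheme: proxy_info['proxy']}
        try:
            return requests.get(url, proxies=proxies, timeout=5)
        except Exception:
            # assumption: the pool exposes /delete for removing a dead proxy
            requests.get('http://127.0.0.1:5000/delete/', params={'proxy': proxy_info['proxy']})
    raise RuntimeError('no working proxy found')

# usage:
# res = get_with_proxy_pool('https://www.cnblogs.com')
# print(res.status_code)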
Getting the client IP in the Django backend
1. Write a Django view that returns the user's IP address
from django.http import HttpResponse

def ip_test(request):
    # get the client's IP address
    ip = request.META.get('REMOTE_ADDR')
    return HttpResponse('Your IP is: %s' % ip)
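If the Django app sits behind nginx or another reverse proxy, REMOTE_ADDR will usually be the proxy's address rather than the client's. A sketch of a variant that checks the X-Forwarded-For header first (standard Django header handling, not part of the original notes):
from django.http import HttpResponse

def ip_test(request):
    # behind a reverse proxy, the client IP is usually the first entry of X-Forwarded-For
    forwarded = request.META.get('HTTP_X_FORWARDED_FOR')
    if forwarded:
        ip = forwarded.split(',')[0].strip()
    else:
        ip = request.META.get('REMOTE_ADDR')
    return HttpResponse('Your IP is: %s' % ip)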
2. Send a request through requests + a proxy and check whether the proxy's IP comes back
res = requests.get('http://127.0.0.1:5000/get/').json()
if res['https']:
    http = 'https'
else:
    http = 'http'
proxie = {http: res['proxy']}
# print(proxie)
res = requests.get('http://127.0.0.1:8000/ip/', proxies=proxie)
# res = requests.get('http://101.133.225.166/ip/', proxies=proxie)
print(res.text)
Crawling a video site
import requests
import re
url = 'https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=1&start=1'
res1 = requests.get(url=url)
# use a regex to pull every video page link out of the listing page
video_list = re.findall('<a href="(.*?)" class="vervideo-lilink actplay">', res1.text)
for video in video_list:
    headers = {'Referer': f'https://www.pearvideo.com/{video}'}
    video_id = video.split('_')[-1]
    res = requests.get(f'https://www.pearvideo.com/videoStatus.jsp?contId={video_id}&mrd=0.6761335369801458',
                       headers=headers).json()
    mp4_url = res['videoInfo']['videos']['srcUrl']
    # the part of the returned video URL that has to be swapped out
    systemTime = res['systemTime']
    # replace it to get the real video URL
    real_mp4_url = mp4_url.replace(systemTime, f'cont-{video_id}')
    print(real_mp4_url)
    res = requests.get(real_mp4_url)
    with open(f'{video_id}.mp4', 'wb') as f:
        for line in res.iter_content():
            f.write(line)
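The download above reads the whole video into memory before writing. requests can stream the body instead; a sketch of the same final step using stream=True (it reuses real_mp4_url and video_id from the loop above):
import requests
# stream=True avoids loading the full body into memory; iter_content reads it in chunks
with requests.get(real_mp4_url, stream=True) as res:
    with open(f'{video_id}.mp4', 'wb') as f:
        for chunk in res.iter_content(chunk_size=1024 * 64):
            f.write(chunk)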
BeautifulSoup4 parsing
BeautifulSoup4 introduction
1. Beautiful Soup is a Python library for extracting data from HTML or XML files
2. Installation
pip3 install BeautifulSoup4
3. Choosing a parser
BeautifulSoup('the content to parse: an HTML/XML string', "html.parser") # built-in parser html.parser
BeautifulSoup('the content to parse: an HTML/XML string', "lxml") # faster; requires lxml: pip3 install lxml
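A quick sketch showing both parsers on the same snippet; html.parser needs no extra install, while lxml requires the lxml package:
from bs4 import BeautifulSoup

html = '<div><p class="title">hello</p></div>'
soup1 = BeautifulSoup(html, 'html.parser')   # built-in parser
soup2 = BeautifulSoup(html, 'lxml')          # faster, needs: pip3 install lxml
print(soup1.p.text, soup2.p.text)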
Crawling news
import requests
from bs4 import BeautifulSoup
url = 'https://www.autohome.com.cn/news/1/#liststart'
res = requests.get(url=url)
# 'html.parser' tells BeautifulSoup to treat the content as HTML; otherwise it complains
page = BeautifulSoup(res.text, 'html.parser')
ul_list = page.find_all('ul', class_='article')
# print(ul_list)
for ul in ul_list:
    li_list = ul.find_all('li')
    for li in li_list:
        h3 = li.find('h3')
        if h3:
            title = h3.text
            desc = li.find('p').text
            url = 'https:' + li.find('a').get('href')
            img = li.find('img').get('src')
            if not img.startswith('http'):
                img = 'https:' + img
            print(f'''Title: {title}
Summary: {desc}
URL: {url}
Image: {img}''')
bs4: traversing the document tree
from bs4 import BeautifulSoup
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" id='id_p' name='lqz' xx='yy'>lqz is handsome <b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html_doc, 'lxml')
1 Pretty-print the HTML (good to know)
print(soup.prettify())
2 Traversing the document tree
# Traversing means selecting directly by tag name; it is fast, but if several tags share the same name only the first one is returned
2.1 Basic usage: .tag_name
res = soup.title
print(res)
res1 = soup.a
print(res1)
# can be chained
res2 = soup.head.title
print(res2)
2.2 Get the tag name
# every tag you get back is a Tag object (bs4.element.Tag)
res = soup.head.title
res1 = soup.body
print(res.name)
2.3 Get the tag's attributes
res = soup.p
print(res.attrs) # dict of attributes
2.4 Get the tag's content
res = soup.p
print(res.text) # concatenates the text of every descendant into one string
print(res.string) # None here; only returns text when the tag has no child tags
print(list(res.strings)) # a generator yielding the text of every descendant
2.5 Nested selection
res = soup.html.body.a
print(res.text)
2.6 Children and descendants
print(soup.p.contents) # all direct children of p, as a list
print(soup.p.children) # an iterator over p's direct children
2.7 Parent and ancestors
print(soup.a.parent) # the direct parent of the a tag
print(list(soup.a.parents)) # all ancestors of the a tag: parent, grandparent, and so on
2.8 Siblings
print(soup.a.next_sibling) # the next sibling
print(soup.a.previous_sibling) # the previous sibling
print(list(soup.a.next_siblings)) # all following siblings => generator
print('-----')
print(list(soup.a.previous_siblings)) # all preceding siblings => generator
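Putting a few of these attributes together on the html_doc above, for instance starting from the first link and walking to its parent and the sibling links that follow it (just an illustrative sketch):
first_link = soup.a                   # first <a> tag (Elsie)
print(first_link.get('href'))         # http://example.com/elsie
print(first_link.parent.name)         # p
# collect the hrefs of the <a> siblings that follow it
for sib in first_link.next_siblings:
    if getattr(sib, 'name', None) == 'a':
        print(sib.get('href'))        # Lacie, then Tillie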
Downloading images with multiple threads
import requests
from bs4 import BeautifulSoup
from threading import Thread
def task(name, img_res):
    with open(f'{name}.jpg', 'wb') as f:
        f.write(img_res.content)
    print('over')

url = 'https://www.umei.cc/katongdongman/dongmantupian/'
res = requests.get(url=url)
res.encoding = 'utf-8'
page = BeautifulSoup(res.text, 'html.parser')
img_list = page.find('div', class_='item_list infinite_scroll').find_all('img', class_='lazy')
for img in img_list:
    print(img.get('data-original'), img.get('alt'))
    src = img.get('data-original')
    img_res = requests.get(src)
    name = img.get('alt')
    p = Thread(target=task, args=(name, img_res))
    p.start()
# close the listing-page response
res.close()
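Starting one Thread per image works for a small page, but it puts no cap on concurrency and still downloads each image in the main thread. A sketch of the same job using a ThreadPoolExecutor from the standard library, with the download moved into the worker:
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor

def download(src, name):
    # each worker downloads one image and writes it to disk
    img_res = requests.get(src)
    with open(f'{name}.jpg', 'wb') as f:
        f.write(img_res.content)
    print(name, 'done')

url = 'https://www.umei.cc/katongdongman/dongmantupian/'
res = requests.get(url=url)
res.encoding = 'utf-8'
page = BeautifulSoup(res.text, 'html.parser')
img_list = page.find('div', class_='item_list infinite_scroll').find_all('img', class_='lazy')
# at most 5 images are downloaded at the same time
with ThreadPoolExecutor(max_workers=5) as pool:
    for img in img_list:
        pool.submit(download, img.get('data-original'), img.get('alt'))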