【2022-11-24】爬虫从入门到入狱(二)

标签：11 24 http https get res 2022 print requests

一、request高级用法

1.1 ssl认证

# Certificate verification (most sites are served over https)
import requests

res = requests.get('https://www.12306.cn')  # for an https request the certificate is checked first; an invalid one raises an error and the program terminates

# Option 1: get rid of the error (a warning is still emitted)
import requests

# verify=False skips certificate validation, so the request goes through
# and only a warning is reported
response = requests.get(
    'https://www.12306.cn',
    verify=False,
)
print(response.status_code)

# Option 2: get rid of the error and silence the warning as well
import requests
import urllib3

# Import urllib3 directly instead of going through the requests.packages
# compatibility alias, and silence only the warning that verify=False
# triggers rather than every urllib3 warning.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
respone = requests.get('https://www.12306.cn', verify=False)
print(respone.status_code)

# Option 3: send a client certificate
# Many https sites can be reached without one; in most cases sending a
# certificate is optional (Zhihu, Baidu, etc. accept requests either way).
# Where it is mandatory - e.g. a site restricted to specific users - access
# is only granted once the certificate is presented.
import requests

client_cert = ('/path/server.crt', '/path/key')  # (certificate file, key file)
response = requests.get('https://www.12306.cn', cert=client_cert)
print(response.status_code)

1.2 使用代理

# 爬取某些网站时，经常会遇到封账号或者封ip的限制，所以可以通过一些方法来避免，比如代理设置

# 关于正向代理以及反向代理的概念，可以参考下方文章：
	https://www.zhihu.com/question/24723688/answer/2771833737

# 代理设置:先发送请求给代理,然后由代理帮忙发送(封ip是常见的事情)

# 支持socks代理,安装:pip install requests[socks]

# Send a request through a proxy IP

import requests

# requests chooses the proxy by the scheme of the *target* URL. The request
# below goes to an https:// address, so an 'https' entry is required; with
# only an 'http' entry the proxy would be skipped silently.
# The proxy address itself is written with an explicit scheme.
proxies = {
    'http': 'http://192.168.10.102:9003',
    'https': 'http://192.168.10.102:9003',
}
res = requests.get('https://www.baidu.com', proxies=proxies)

print(res.text)

1.3 超时设置

# Timeout settings
# timeout accepts either a float or a (connect, read) tuple:
#   timeout=0.1         the same 0.1s limit applies to connecting and to reading
#   timeout=(0.1, 0.2)  0.1s to establish the connection, 0.2s to wait for data

import requests

# Deliberately tiny value so the request times out
response = requests.get('https://www.baidu.com', timeout=0.0001)

1.4 异常处理

# Exception handling
import requests
# Import the handled exceptions by name. A wildcard import from
# requests.exceptions also pulls in its ConnectionError, which silently
# shadows the builtin of the same name. See requests.exceptions for the
# full list of exception types.
from requests.exceptions import ReadTimeout, RequestException

try:
    r = requests.get('http://www.baidu.com', timeout=0.00001)
except ReadTimeout:
    # Connected, but the server did not send data within the timeout
    print('===:')
# More specific handlers can be inserted here, before the catch-all, e.g.
# requests.exceptions.ConnectionError (network unreachable) or
# requests.exceptions.Timeout (connect or read timeout).
except RequestException:
    # Base class of the exceptions raised by requests
    print('Error')

1.5 上传文件

import requests

# Open the upload inside `with` so the file handle is closed as soon as the
# request has been sent, even if the POST raises.
with open('123.txt', 'rb') as upload:
    files = {'file': upload}
    respone = requests.post('http://httpbin.org/post', files=files)
print(respone.status_code)

二、代理池搭建与使用

2.1 github拉取代理池项目

# github上拉取开源的代理池项目到本地，地址为：
	https://github.com/jhao104/proxy_pool

2.2 克隆项目到本地

git clone https://github.com/jhao104/proxy_pool.git

2.3 创建虚拟环境

mkvirtualenv -p python3.8 虚拟环境名称

2.4 安装依赖包

pip install -r requirements.txt

2.5 修改setting.py配置文件

# setting.py is the project configuration file

# API service

HOST = "0.0.0.0"               # IP address the API binds to
PORT = 5010                    # listening port; the client examples request http://127.0.0.1:5010/get/


# Database

DB_CONN = 'redis://127.0.0.1:6379/0'


# ProxyFetcher

PROXY_FETCHER = [
    "freeProxy01",      # names of the enabled fetch methods; all of them live in fetcher/proxyFetcher.py
    "freeProxy02",
    # ....
]

2.6 启动项目

# 如果已经具备运行条件, 可以通过proxyPool.py启动。
# 程序分为: schedule 调度程序 和 server Api服务

# 启动调度程序
python proxyPool.py schedule

# 启动webApi服务
python proxyPool.py server

2.7 爬取测试

import requests

# http://127.0.0.1:5010/get/

# Fetch one random proxy from the locally running proxy pool
proxy_info = requests.get('http://127.0.0.1:5010/get/').json()

# requests selects a proxy by the scheme of the *target* URL, so the mapping
# is keyed by the schemes this proxy may be used for. The original test
# `'https' if res else 'http'` checked the dict itself, which is always
# truthy, so every proxy was registered as https whatever the pool reported.
proxy_url = 'http://' + proxy_info['proxy']
proxie = {'http': proxy_url}
if proxy_info.get('https'):
    # The pool flags this proxy as usable for https targets too
    proxie['https'] = proxy_url
print(proxie)    # e.g. {'http': 'http://175.6.181.191:9002'}

# NOTE: the target is https; a proxy without the https flag has no matching
# entry and the request is then sent directly.
res = requests.get('https://www.zhihu.com/question/399452902/answer/2672454360', proxies=proxie)
print(res.status_code)   # 200

2.8 django后端获取客户端IP

views.py

from django.shortcuts import render, HttpResponse


# Create your views here.


def ip_test(request):
    """Respond with the REMOTE_ADDR value recorded for the incoming request."""
    client_ip = request.META.get('REMOTE_ADDR')
    message = '请求的ip地址是：%s' % client_ip
    return HttpResponse(message)

urls.py

from django.urls import path
from app01 import views

# Route table: 'ip_test/' is served by views.ip_test (the admin route is left disabled)
urlpatterns = [
    #    path('admin/', admin.site.urls),
    path('ip_test/', views.ip_test)
]

代理访问后端测试

# Call the backend through a proxy with requests and check whether the
# response reports the proxy's IP address
import requests

proxy_info = requests.get('http://127.0.0.1:5010/get/').json()

# requests selects a proxy by the scheme of the *target* URL. The backend
# below is plain http, so an 'http' entry must always be present. The
# original `'https' if res else 'http'` tested the dict (always truthy) and
# produced an 'https'-only mapping, which this request never used.
proxy_url = 'http://' + proxy_info['proxy']
proxie = {'http': proxy_url}
if proxy_info.get('https'):
    # The pool flags this proxy as usable for https targets too
    proxie['https'] = proxy_url
print(proxie)

# A backend deployed on the local machine cannot be reached by the proxy;
# deploy it on a server instead
res = requests.get('http://1.117.93.128/ip/', proxies=proxie)
print(res.text)

三、爬取网站视频

# 以这个网站为例，使用requests+正则，整站爬取视频

	https://www.pearvideo.com/

import os
import re

import requests

res = requests.get(
    'https://www.pearvideo.com/panorama_loading.jsp?start=24')

# Use a regex to pull every video page link out of the listing HTML

video_list = re.findall('<a href="(.*?)" class="vervideo-lilink actplay">', res.text)
# print(video_list)

# open() does not create missing directories, so make sure the target exists
os.makedirs('./video', exist_ok=True)

for video in video_list:
    # The real file address comes from the status endpoint, e.g.
    # https://www.pearvideo.com/videoStatus.jsp?contId=1775027&mrd=0.18997076047978978

    # The content id is whatever follows the last '_' in the link
    video_id = video.split('_')[-1]
    # The status endpoint is called with the video page as Referer
    header = {
        'Referer': 'https://www.pearvideo.com/%s' % video
    }
    res = requests.get('https://www.pearvideo.com/videoStatus.jsp?contId=%s&mrd=0.18997076047978978' % video_id,
                       headers=header).json()
    real_mp4_url = res['videoInfo']['videos']['srcUrl']
    # Replace the leading part of the file name (the text before its first
    # '-') with 'cont-<video_id>' to obtain the playable address
    real_mp4_url = real_mp4_url.replace(real_mp4_url.rsplit('/', 1)[-1].split('-')[0], 'cont-%s' % video_id)
    print(real_mp4_url)

    # stream=True keeps the whole video out of memory, and an explicit chunk
    # size avoids iter_content()'s default of one byte per iteration.
    with requests.get(real_mp4_url, stream=True) as res:
        with open('./video/%s.mp4' % video_id, 'wb') as f:
            for chunk in res.iter_content(chunk_size=64 * 1024):
                f.write(chunk)

四、爬取新闻资料

# 安装解析库；bs4  
	pip3 install beautifulsoup4

import requests
from bs4 import BeautifulSoup

res = requests.get('https://www.autohome.com.cn/news/1/#liststart')
# print(res.text)  # the data is looked up in the returned html; bs4 parses html/xml
soup = BeautifulSoup(res.text, 'html.parser')
# Find the <ul> tags whose class is "article"
ul_list = soup.find_all(name='ul', class_='article')
print(len(ul_list))  # number of matching <ul> tags
for ul in ul_list:
    # All <li> tags under this <ul>
    li_list = ul.find_all(name='li')
    for li in li_list:
        h3 = li.find(name='h3')
        if not h3:
            # No headline in this <li>, so it is not an article entry.
            # Falling through to the print would raise NameError on the
            # first such item, or repeat the previous article's values.
            continue
        title = h3.text  # text content of the <h3> tag
        desc = li.find(name='p').text
        url = 'https:' + li.find(name='a').attrs.get('href')
        img = li.find(name='img').attrs.get('src')
        if not img.startswith('http'):
            img = 'https:' + img

        print('''
        文章标题：%s
        文章摘要：%s
        文章地址：%s
        文章图片：%s
        ''' % (title, desc, url, img))

        # TODO: save the record to MySQL (create database and table, pymysql insert, conn.commit())

五、BeautifulSoup4介绍

Beautiful Soup 是一个可以从HTML或XML文件中提取数据的Python库.它能够通过你喜欢的转换器实现惯用的文档导航,查找,修改文档的方式.Beautiful Soup会帮你节省数小时甚至数天的工作时间.你可能在寻找 Beautiful Soup3 的文档,Beautiful Soup 3 目前已经停止开发,官网推荐在现在的项目中使用Beautiful Soup 4, 移植到BS4

#安装 Beautiful Soup
pip install beautifulsoup4

Beautiful Soup支持Python标准库中的HTML解析器,还支持一些第三方的解析器,其中一个是 lxml .根据操作系统不同,可以选择下列方法来安装lxml:

$ apt-get install python3-lxml

$ easy_install lxml

$ pip install lxml

另一个可供选择的解析器是纯Python实现的 html5lib , html5lib的解析方式与浏览器相同,可以选择下列方法来安装html5lib:

$ apt-get install python3-html5lib

$ easy_install html5lib

$ pip install html5lib

六、遍历文档树

# Demo: navigating a parsed document by tag name with BeautifulSoup.
# Most examples are left commented out; uncomment one at a time to try it.
from bs4 import BeautifulSoup

# Sample document used by every example below
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" id='id_p' name='lqz' xx='yy'>lqz is handsome <b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""
soup = BeautifulSoup(html_doc, 'lxml')
# 1 Pretty-print the html (for reference only)
# print(soup.prettify())

# 2 Traversing the document tree
# The bare string below is only an outline of the topics covered (selecting
# by tag name, tag name/attributes/content, nested selection, children and
# descendants, parents and ancestors, siblings); it has no runtime effect.
'''
#遍历文档树：即直接通过标签名字选择，特点是选择速度快，但如果存在多个相同的标签则只返回第一个
#1、用法
#2、获取标签的名称
#3、获取标签的属性
#4、获取标签的内容
#5、嵌套选择
#6、子节点、子孙节点
#7、父节点、祖先节点
#8、兄弟节点
'''
# 1 Basic usage: access a tag directly as .tagname
# res=soup.title
# print(res)
# res=soup.a
# print(res)
# Accesses can be chained
# res=soup.head.title
# print(res)

# 2 Getting the tag name
# Every tag obtained this way is a Tag object (bs4.element.Tag)
# res=soup.head.title
# res=soup.body
# print(res.name)

# 3 Getting the tag attributes
# res=soup.p
# print(res.attrs)  # dict of attributes


# 4 Getting the tag content
# res = soup.p
# print(res.text) # text of all descendants joined into one string
# print(res.string) # None here: .string only yields text when the tag has no child tags
# print(list(res.strings) )# .strings is a generator over the text of all descendants

# 5 Nested selection

# res=soup.html.body.a
# print(res.text)


# 6 Children and descendants
# print(soup.p.contents) # all child nodes of p
# print(soup.p.children) # an iterator over the child nodes of p

# 7 Parent and ancestors
# print(soup.a.parent) # the direct parent of the a tag
# print(list(soup.a.parents)) # every ancestor of the a tag: parent, grandparent, ...


# 8 Siblings
# print(soup.a.next_sibling)  # next sibling
# print(soup.a.previous_sibling)  # previous sibling

print(list(soup.a.next_siblings)) # all following siblings (a generator, materialized with list)
print('-----')
print(list(soup.a.previous_siblings)) # all preceding siblings (a generator, materialized with list)

标签：11,24,http,https,get,res,2022,print,requests
From： https://www.cnblogs.com/dy12138/p/16923514.html

【2022-11-24】爬虫从入门到入狱(二)

一、request高级用法

1.1 ssl认证

1.2 使用代理

1.3 超时设置

1.4 异常处理

1.5 上传文件

二、代理池搭建与使用

2.1 github拉取代理池项目

2.2 克隆项目到本地

2.3 创建虚拟环境

2.4 安装依赖包

2.5 修改settings.py配置文件

2.6 启动项目

2.7 爬取测试

2.8 django后端获取客户端IP

views.py

urls.py

代理访问后端测试

三、爬取网站视频

四、爬取新闻资料

五、BeautifulSoup4介绍

六、遍历文档树

相关文章

赞助商

阅读排行