urllib_9: AJAX GET request for the first page of Douban movies
"""
.-''-.
.--. _..._ .' .-. )
|__| .' '. / .' / /
.--..-,.--. . .-. . (_/ / /
| || .-. | | ' ' | / /
| || | | | _ _ | | | | / / _ _
| || | | || ' / | | | | | . ' | ' / |
| || | '-.' | .' | | | | | / / _.-').' | .' |
|__|| | / | / | | | | | .' ' _.'.-'' / | / |
| | | `'. | | | | | / /.-'_.' | `'. |
|_| ' .'| '/| | | | / _.' ' .'| '/
`-' `--' '--' '--'( _.-' `-' `--'
Created on 2023/3/28 21:46.
@Author: haifei
"""
import time
from urllib import request, parse
# In Chrome, visit https://movie.douban.com/typerank?type_name=%E5%8A%A8%E4%BD%9C&type=5&interval_id=100:90&action=
# Open DevTools (F12) and find top_list?type=5&interval_id=100%3A90&action=&start=0&limit=20; the target JSON is visible under Preview or Response
# Request URL shown in the headers tab: https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&start=0&limit=20
url = 'https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&start=0&limit=20'
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
}
# 1. Build the customized Request object
_request = request.Request(url=url, headers=headers)
# 2. Fetch the response data
response = request.urlopen(_request)
content = response.read().decode('utf-8')
print(content)
# 3. Save the data locally
# fp = open('./download/douban-page1.json', 'w')
fp = open('./download/douban-page1.json', 'w', encoding='utf-8')  # add an explicit encoding if the line above fails
fp.write(content)
fp.close()
# 4. Alternative way to save the data
with open('./download/douban-page2.json', 'w', encoding='utf-8') as fp:
    fp.write(content)  # the with block closes the file automatically
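# The saved content is JSON. A minimal parsing sketch (assuming each item
# carries 'title' and 'score' fields, as the DevTools preview suggests):
import json

movies = json.loads(content)  # the top_list endpoint returns a JSON array
for movie in movies[:5]:
    print(movie.get('title'), movie.get('score'))  # assumed field names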
if __name__ == '__main__':
start = time.time()
print('It takes', time.time() - start, "seconds.")
urllib_10: AJAX GET request for the first ten pages of Douban movies
"""
.-''-.
.--. _..._ .' .-. )
|__| .' '. / .' / /
.--..-,.--. . .-. . (_/ / /
| || .-. | | ' ' | / /
| || | | | _ _ | | | | / / _ _
| || | | || ' / | | | | | . ' | ' / |
| || | '-.' | .' | | | | | / / _.-').' | .' |
|__|| | / | / | | | | | .' ' _.'.-'' / | / |
| | | `'. | | | | | / /.-'_.' | `'. |
|_| ' .'| '/| | | | / _.' ' .'| '/
`-' `--' '--' '--'( _.-' `-' `--'
Created on 2023/3/28 21:46.
@Author: haifei
"""
import time
from urllib import request, parse
"""
url = 'https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&start=0&limit=20'
# 以下在在页面上为ajax异步请求实现,后端为分页
url = 'https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&start=20&limit=20'
url = 'https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&start=40&limit=20'
page 1 2 3 4 ...
start 0 20 40 60 ...
找到分页规律:start =(page - 1)* 20
1. 请求对象定制
2. 获取响应数据
3. 下载数据
"""
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
}
base_url = 'https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&'
def create_request(page):
    data = {
        'start': (page - 1) * 20,
        'limit': 20
    }
    # GET request: the params are appended directly to the URL, no .encode('utf-8') needed
    # POST request: the params must go through .urlencode() and then .encode('utf-8')
    data = parse.urlencode(data)
    url = base_url + data
    # print(url)
    _request = request.Request(url=url, headers=headers)
    return _request
def get_content(_request):
response = request.urlopen(_request)
content = response.read().decode('utf-8')
return content
def download_data(content, page):
    file_name = 'doubanpage-' + str(page) + '.json'  # note: Python will not implicitly convert int to str here, so str() is required
    with open('./download/' + file_name, 'w', encoding='utf-8') as fp:
        fp.write(content)
# Mac & Windows PyCharm shortcut reference: https://blog.csdn.net/qq_42363090/article/details/125725182
# Reformat code / JSON file: Command+Option+L (macOS), Ctrl+Alt+L (Windows)
if __name__ == '__main__':
start = time.time()
    start_page = int(input('Enter the start page: '))
    end_page = int(input('Enter the end page: '))
    if (start_page < 1) or (end_page < 1):  # pages are 1-based, so 0 is rejected too
        print('[note: page numbers must be greater than 0]')
    elif start_page > end_page:
        print('[note: the start page must not be greater than the end page]')
    else:
        for page in range(start_page, end_page + 1):  # range is half-open [), so the +1 includes end_page
            # print(page)
            myrequest = create_request(page)  # build the customized Request
            mycontent = get_content(myrequest)  # fetch the response data
            download_data(mycontent, page)
        print('download finished')
print('It takes', time.time() - start, "seconds.")
urllib_11: AJAX POST request for KFC store locations
"""
.-''-.
.--. _..._ .' .-. )
|__| .' '. / .' / /
.--..-,.--. . .-. . (_/ / /
| || .-. | | ' ' | / /
| || | | | _ _ | | | | / / _ _
| || | | || ' / | | | | | . ' | ' / |
| || | '-.' | .' | | | | | / / _.-').' | .' |
|__|| | / | / | | | | | .' ' _.'.-'' / | / |
| | | `'. | | | | | / /.-'_.' | `'. |
|_| ' .'| '/| | | | / _.' ' .'| '/
`-' `--' '--' '--'( _.-' `-' `--'
Created on 2023/3/28 22:51.
@Author: haifei
"""
import time
from urllib import request, parse
from pyfiglet import Figlet
"""
request headers中含有X-Requested-With: XMLHttpRequest时说明为ajax请求
'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=cname'
post请求的参数在payload的form data里,如下
cname: 北京
pid:
pageIndex: 1
pageSize: 10
cname: 北京
pid:
pageIndex: 2
pageSize: 10
cname: 北京
pid:
pageIndex: 2
pageSize: 10
"""
def hello_message():
    print('*' * 100)
    f = Figlet()
    print(f.renderText('irun2u'))
    print('Name: kfc store spider')
    print('Version: 1.0')
    print('Index: http://www.irun2u.top')
    print('*' * 100)
def legal(s):
    if (s[0] != '+') and (s[0] != '-'):  # no sign prefix: treat as a plain non-negative number
        return s
    else:  # signed: validate the digits left after stripping the sign
        return s[1:]
def get_page():
    input_start = input('Enter the start page: ')
    input_end = input('Enter the end page: ')
    if not legal(input_start).isdigit() or not legal(input_end).isdigit():
        print('[note: page numbers must be numeric]')
        # raise Exception('[note: page numbers must be numeric]')
    else:
        page_start = int(input_start)
        page_end = int(input_end)
        if (page_start < 1) or (page_end < 1):  # rejects 0 as well, since pages are 1-based
            print('[note: page numbers must be greater than 0]')
            # raise Exception('[note: page numbers must be greater than 0]')
        elif page_start > page_end:
            print('[note: the start page must not be greater than the end page]')
            # raise Exception('[note: the start page must not be greater than the end page]')
        else:
            return [page_start, page_end]
def create_request(page):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
    }
    base_url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=cname'
    data = {
        'cname': '北京',
        'pid': '',
        'pageIndex': page,
        'pageSize': '10'
    }
    _data = parse.urlencode(data).encode('utf-8')  # POST params must go through .urlencode() and then .encode('utf-8')
    _request = request.Request(url=base_url, headers=headers, data=_data)  # POST params go in data=, not appended to the URL
    return _request
def get_content(myrequest):
response = request.urlopen(myrequest)
content = response.read().decode('utf-8')
return content
def download_data(page, mycontent):
file_name = 'kfcstore-' + str(page) + '.json'
with open('./download/' + file_name, 'w', encoding='utf-8') as fp:
fp.write(mycontent)
if __name__ == '__main__':
start = time.time()
hello_message()
pages = get_page()
if pages is not None:
page_start = pages[0]
page_end = pages[1]
for page in range(page_start, page_end + 1):
myrequest = create_request(page)
mycontent = get_content(myrequest)
download_data(page, mycontent)
print('download finished')
print('It takes', time.time() - start, "seconds.")
urllib_12: exception handling
"""
.-''-.
.--. _..._ .' .-. )
|__| .' '. / .' / /
.--..-,.--. . .-. . (_/ / /
| || .-. | | ' ' | / /
| || | | | _ _ | | | | / / _ _
| || | | || ' / | | | | | . ' | ' / |
| || | '-.' | .' | | | | | / / _.-').' | .' |
|__|| | / | / | | | | | .' ' _.'.-'' / | / |
| | | `'. | | | | | / /.-'_.' | `'. |
|_| ' .'| '/| | | | / _.' ' .'| '/
`-' `--' '--' '--'( _.-' `-' `--'
Created on 2023/3/29 21:59.
@Author: haifei
"""
import time
import urllib.request, urllib.error
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
}
# Normal case
# url = 'https://blog.csdn.net/csdnnews/article/details/129774767'
# _request = urllib.request.Request(url=url, headers=headers)
# response = urllib.request.urlopen(_request)
# content = response.read().decode('utf-8')
# print(content)
# Error case
# url = 'https://blog.csdn.net/csdnnews/article/details/129774767' + '1111'  # broken URL, raises HTTP Error 404
# _request = urllib.request.Request(url=url, headers=headers)
# response = urllib.request.urlopen(_request)
# content = response.read().decode('utf-8')
# print(content)
# Catching the exception
# HTTPError is a subclass of URLError; both live in the urllib.error package
url = 'https://blog.csdn.net/csdnnews/article/details/129774767' + '1111'  # broken URL, raises HTTP Error 404
try:
    _request = urllib.request.Request(url=url, headers=headers)
    response = urllib.request.urlopen(_request)
    content = response.read().decode('utf-8')
    print(content)
except urllib.error.HTTPError:
    print('[note: 404]')
# URLError case: a bad domain name
url = 'https://www.dandan789.com/'  # bad domain, fails with an error such as "Hostname mismatch"
try:
    _request2 = urllib.request.Request(url=url, headers=headers)
    response2 = urllib.request.urlopen(_request2)
    content2 = response2.read().decode('utf-8')
    print(content2)
except urllib.error.URLError:
    print('[note: URL error]')
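# Since HTTPError is the subclass, it must be caught before URLError when one
# try block handles both; a minimal sketch (fetch is an illustrative helper):
def fetch(url):
    try:
        req = urllib.request.Request(url=url, headers=headers)
        return urllib.request.urlopen(req).read().decode('utf-8')
    except urllib.error.HTTPError as e:  # the server answered with an error status
        print('[HTTP error]', e.code)
    except urllib.error.URLError as e:  # DNS/SSL/connection failure, no HTTP response
        print('[URL error]', e.reason)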
if __name__ == '__main__':
start = time.time()
print('It takes', time.time() - start, "seconds.")
urllib_13: Qzone cookie login
"""
.-''-.
.--. _..._ .' .-. )
|__| .' '. / .' / /
.--..-,.--. . .-. . (_/ / /
| || .-. | | ' ' | / /
| || | | | _ _ | | | | / / _ _
| || | | || ' / | | | | | . ' | ' / |
| || | '-.' | .' | | | | | / / _.-').' | .' |
|__|| | / | / | | | | | .' ' _.'.-'' / | / |
| | | `'. | | | | | / /.-'_.' | `'. |
|_| ' .'| '/| | | | / _.' ' .'| '/
`-' `--' '--' '--'( _.-' `-' `--'
Created on 2023/3/29 22:13.
@Author: haifei
"""
import time
import urllib.request
'''
Scenario:
Log in to Qzone in one browser; the URL is
https://user.qzone.qq.com/xxxxx
Open the same URL in a second browser:
normally it redirects to a login prompt.
Why does the plain request fail?
Because the request headers carry too little information.
Fix: back in the first browser, find the target request and copy its request headers into headers.
The decisive fields are cookie and referer.
Note: the cookie carries the login state; referer is an anti-hotlinking check that verifies the request came from an expected previous page.
'''
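# A minimal sketch keeping only the decisive fields named above (whether they
# alone suffice is an assumption; the cookie value stays elided):
minimal_headers = {
    'cookie': '......',  # paste the cookie copied from the logged-in browser
    'referer': 'https://qzs.qq.com/',  # satisfies the anti-hotlinking check
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36 Edg/99.0.1150.39',
}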
'''
url = 'https://user.qzone.qq.com/xxxxx'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36 Edg/99.0.1150.39'
}
# Build the customized Request object
_request = urllib.request.Request(url=url, headers=headers)
# Send the request to the server, mimicking a browser
response = urllib.request.urlopen(_request)
# Read the response data
content = response.read().decode('utf-8')
# Save the data locally
with open('./download/qqspace.html', 'w', encoding='utf-8') as fp:
    fp.write(content)
'''
url = 'https://user.qzone.qq.com/xxxxx'
headers = {
# ':authority': 'user.qzone.qq.com',
# ':method': 'GET',
# ':path': '/xxxxx',
# ':scheme': 'https',
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
# 'accept-encoding': 'gzip, deflate, br',
'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
'cache-control': 'max-age=0',
'cookie': '......',
'if-modified-since': 'Wed, 29 Mar 2023 14:43:59 GMT',
'referer': 'https://qzs.qq.com/',
'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="99", "Microsoft Edge";v="99"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"macOS"',
'sec-fetch-dest': 'document',
'sec-fetch-mode': 'navigate',
'sec-fetch-site': 'same-site',
'sec-fetch-user': '?1',
'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36 Edg/99.0.1150.39',
}  # note: the ':' pseudo-headers all break urllib and are commented out, as is accept-encoding (otherwise the response comes back gzip-compressed and decode('utf-8') fails)
_request = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(_request)
content = response.read().decode('utf-8')
with open('./download/qqspace2.html', 'w', encoding='utf-8') as fp:
fp.write(content)
if __name__ == '__main__':
start = time.time()
print('It takes', time.time() - start, "seconds.")
urllib_15: basic use of handlers
"""
.-''-.
.--. _..._ .' .-. )
|__| .' '. / .' / /
.--..-,.--. . .-. . (_/ / /
| || .-. | | ' ' | / /
| || | | | _ _ | | | | / / _ _
| || | | || ' / | | | | | . ' | ' / |
| || | '-.' | .' | | | | | / / _.-').' | .' |
|__|| | / | / | | | | | .' ' _.'.-'' / | / |
| | | `'. | | | | | / /.-'_.' | `'. |
|_| ' .'| '/| | | | / _.' ' .'| '/
`-' `--' '--' '--'( _.-' `-' `--'
Created on 2023/3/30 22:17.
@Author: haifei
"""
import time
from urllib import request, parse
# Goal: fetch the Baidu homepage source using a handler
# Three steps: handler -> build_opener -> open
url = 'http://www.baidu.com'
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
}
_request = request.Request(url=url, headers=headers)
# response = request.urlopen(_request)
# 1. Create the handler object
handler = request.HTTPHandler()
# 2. Build the opener from the handler
opener = request.build_opener(handler)
# 3. Call the opener's open method
response = opener.open(_request)
content = response.read().decode('utf-8')
print(content)
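# Handlers are urllib's extension point. One sketch of why they matter:
# HTTPHandler takes a debuglevel parameter that dumps the raw HTTP exchange.
debug_opener = request.build_opener(request.HTTPHandler(debuglevel=1))
# debug_opener.open(_request)  # uncomment to print the request/response traffic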
if __name__ == '__main__':
start = time.time()
print('It takes', time.time() - start, "seconds.")
urllib_16: basic proxy usage
"""
.-''-.
.--. _..._ .' .-. )
|__| .' '. / .' / /
.--..-,.--. . .-. . (_/ / /
| || .-. | | ' ' | / /
| || | | | _ _ | | | | / / _ _
| || | | || ' / | | | | | . ' | ' / |
| || | '-.' | .' | | | | | / / _.-').' | .' |
|__|| | / | / | | | | | .' ' _.'.-'' / | / |
| | | `'. | | | | | / /.-'_.' | `'. |
|_| ' .'| '/| | | | / _.' ' .'| '/
`-' `--' '--' '--'( _.-' `-' `--'
Created on 2023/3/30 22:40.
@Author: haifei
"""
import time
from urllib import request, parse
# url = 'https://www.baidu.com/s?wd=ip'
url = 'https://www.ip138.com/'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'
}
# _request = request.Request(url=url, headers=headers)
# response = request.urlopen(_request)  # plain urlopen can set headers and the like, but not a proxy
# content = response.read().decode('utf-8')
# with open('./download/daili-baidu.html', 'w', encoding='utf-8') as fp:
#     fp.write(content)
_request = request.Request(url=url, headers=headers)
# Free proxies: https://www.kuaidaili.com/free/; if none of them work, a paid
# one from kuaidaili (about 1 CNY per hour) can be used for testing
proxies = {
    # 'http': '121.13.252.62:41564'
    'http': 's562.kdltps.com:15818'
    # note: the key must match the URL scheme; an https:// URL needs an
    # 'https' entry here, or the request bypasses the proxy entirely
}
handler = request.ProxyHandler(proxies=proxies)  # ProxyHandler is what makes a proxy configurable
opener = request.build_opener(handler)
response = opener.open(_request)
content = response.read().decode('utf-8')
with open('./download/daili-baidu2.html', 'w', encoding='utf-8') as fp:
    fp.write(content)
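# To check that traffic really goes through the proxy, a sketch using the
# public echo service httpbin.org (http:// here, because the proxies dict
# above only covers the 'http' scheme):
print(opener.open('http://httpbin.org/ip').read().decode('utf-8'))  # should show the proxy's IP, not yours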
if __name__ == '__main__':
start = time.time()
print('It takes', time.time() - start, "seconds.")
urllib_17: proxy pool
"""
.-''-.
.--. _..._ .' .-. )
|__| .' '. / .' / /
.--..-,.--. . .-. . (_/ / /
| || .-. | | ' ' | / /
| || | | | _ _ | | | | / / _ _
| || | | || ' / | | | | | . ' | ' / |
| || | '-.' | .' | | | | | / / _.-').' | .' |
|__|| | / | / | | | | | .' ' _.'.-'' / | / |
| | | `'. | | | | | / /.-'_.' | `'. |
|_| ' .'| '/| | | | / _.' ' .'| '/
`-' `--' '--' '--'( _.-' `-' `--'
Created on 2023/3/30 22:40.
@Author: haifei
"""
import time
from urllib import request, parse
import random
# url = 'https://www.baidu.com/s?wd=ip'
url = 'https://www.ip138.com/'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'
}
proxies_pool = [  # note: keys must match the URL scheme; https:// URLs need 'https' entries, or the proxy is bypassed
{'http': 's562.kdltps.com:15818'},
{'http': '222.74.73.202:42055'},
{'http': '121.13.252.62:41564'},
{'http': '27.42.168.46:55481'}
]
proxies = random.choice(proxies_pool)
print(proxies)
_request = request.Request(url=url, headers=headers)
handler = request.ProxyHandler(proxies=proxies)
opener = request.build_opener(handler)
response = opener.open(_request)
content = response.read().decode('utf-8')
with open('./download/daili.html', 'w', encoding='utf-8') as fp:
fp.write(content)
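# A common extension: retry with another proxy from the pool when one fails.
# A minimal sketch (the helper name and retry count are illustrative):
def open_with_pool(req, tries=3):
    for _ in range(tries):
        candidate = random.choice(proxies_pool)
        try:
            pool_opener = request.build_opener(request.ProxyHandler(proxies=candidate))
            return pool_opener.open(req, timeout=10).read().decode('utf-8')
        except Exception as e:  # dead proxies typically raise URLError or time out
            print('proxy failed:', candidate, e)
    raise RuntimeError('all proxies in the pool failed')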
if __name__ == '__main__':
start = time.time()
print('It takes', time.time() - start, "seconds.")