在爬取某些网站的时候我们有可能遇到ip反爬措施,通常可以使用代理ip的方法应对,本次我们要爬取的是国内某知名的代理ip网站,并挑选出其中响应速度符合我们要求的IP。
爬取代码如下:
def get_ip(last):
    """Crawl the free-proxy listing pages and collect HTTP proxies.

    Args:
        last: number of listing pages to crawl (pages 1 through ``last``).

    Returns:
        list[dict]: proxy dicts shaped like ``{'http': 'http://ip:port'}``,
        ready to pass as the ``proxies=`` argument of ``requests.get``.
    """
    proxies_list = []  # collected proxy pool
    # Loop-invariant request headers: a desktop browser UA to avoid trivial blocking.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36 Edg/126.0.0.0'
    }
    for page in range(1, last + 1):  # last is the number of pages to crawl
        time.sleep(1)  # throttle: one request per second to be polite
        print(f'============正在爬取第{page}页数据=============')
        url = f'https://www.kuaidaili.com/free/inha/{page}/'
        response = requests.get(url, headers=headers)
        response.encoding = response.apparent_encoding
        # The site embeds its proxy table as a JS literal: const fpsList = [...];
        # capture group 1 is the JSON array itself (no brittle string slicing).
        match = re.search(r'const fpsList = (\[.*?\]);', response.text)
        if match is None:
            # Page layout changed or the request was blocked — skip this page.
            continue
        for ip in json.loads(match.group(1)):
            # Everything listed on these pages is a plain HTTP proxy.
            ip_proxy = ip['ip'] + ':' + ip['port']
            proxies_list.append({'http': 'http://' + ip_proxy})
    return proxies_list
检测ip是否可用:
def check_ip(proxies_list):
    """Filter *proxies_list* down to proxies that respond quickly.

    Args:
        proxies_list: proxy dicts like ``{'http': 'http://ip:port'}``.

    Returns:
        list[dict]: the proxies that fetched baidu.com with HTTP 200
        within the 0.1 s timeout.
    """
    time.sleep(1)
    can_use = []  # proxies that passed the check
    for ip in proxies_list:
        try:
            # The tight 0.1 s timeout deliberately keeps only fast proxies.
            response = requests.get(url='https://www.baidu.com', proxies=ip, timeout=0.1)
            if response.status_code == 200:  # reachable -> keep it
                can_use.append(ip)
        except requests.exceptions.RequestException:
            # Timeout / connection failure: proxy rejected, keep checking the rest.
            pass
    return can_use
以下附完整代码:
import json
import requests
import time
import re
# 测试ip是否可用
def check_ip(proxies_list):
    """Filter *proxies_list* down to proxies that respond quickly.

    Args:
        proxies_list: proxy dicts like ``{'http': 'http://ip:port'}``.

    Returns:
        list[dict]: the proxies that fetched baidu.com with HTTP 200
        within the 0.1 s timeout.
    """
    time.sleep(1)
    can_use = []  # proxies that passed the check
    for ip in proxies_list:
        try:
            # The tight 0.1 s timeout deliberately keeps only fast proxies.
            response = requests.get(url='https://www.baidu.com', proxies=ip, timeout=0.1)
            if response.status_code == 200:  # reachable -> keep it
                can_use.append(ip)
        except requests.exceptions.RequestException:
            # Timeout / connection failure: proxy rejected, keep checking the rest.
            pass
    return can_use
# 获取ip
def get_ip(last):
    """Crawl the free-proxy listing pages and collect HTTP proxies.

    Args:
        last: number of listing pages to crawl (pages 1 through ``last``).

    Returns:
        list[dict]: proxy dicts shaped like ``{'http': 'http://ip:port'}``,
        ready to pass as the ``proxies=`` argument of ``requests.get``.
    """
    proxies_list = []  # collected proxy pool
    # Loop-invariant request headers: a desktop browser UA to avoid trivial blocking.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36 Edg/126.0.0.0'
    }
    for page in range(1, last + 1):  # last is the number of pages to crawl
        time.sleep(1)  # throttle: one request per second to be polite
        print(f'============正在爬取第{page}页数据=============')
        url = f'https://www.kuaidaili.com/free/inha/{page}/'
        response = requests.get(url, headers=headers)
        response.encoding = response.apparent_encoding
        # The site embeds its proxy table as a JS literal: const fpsList = [...];
        # capture group 1 is the JSON array itself (no brittle string slicing).
        match = re.search(r'const fpsList = (\[.*?\]);', response.text)
        if match is None:
            # Page layout changed or the request was blocked — skip this page.
            continue
        for ip in json.loads(match.group(1)):
            # Everything listed on these pages is a plain HTTP proxy.
            ip_proxy = ip['ip'] + ':' + ip['port']
            proxies_list.append({'http': 'http://' + ip_proxy})
    return proxies_list
def Get_ip(page):
    """Convenience wrapper: crawl *page* listing pages, validate the
    proxies, print and return only the usable ones."""
    crawled = get_ip(page)      # crawl the listing pages
    usable = check_ip(crawled)  # keep only responsive proxies
    print(usable)
    return usable
if __name__ == '__main__':
    # Crawl six listing pages, then keep only the proxies that still work.
    print('============正在开始爬取============')
    pool = get_ip(6)
    print(pool)
    print('============正在检测ip是否可用============')
    working = check_ip(pool)
    print(working)
标签:use,ip,list,response,爬取,源码,proxies,page
From: https://blog.csdn.net/Pangaoyang_/article/details/140693565