import requests
from lxml import etree
import pandas as pd
import time
# --- Scraper configuration -------------------------------------------------
BASE_URL = "http://www.66ip.cn/"
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"}
FIRST_PAGE = 1
LAST_PAGE = 2990  # the original loop stopped as soon as its counter hit 2991
REQUEST_TIMEOUT = 10  # seconds, per HTTP request
MAX_ATTEMPTS = 3  # tries per page before the page is skipped
RETRY_DELAY = 2  # seconds; multiplied by the attempt number between retries
PAGE_DELAY = 1  # seconds to wait between pages
PAGE_ENCODING = "gbk"
OUTPUT_PATH = r"C:\Users\44281\Desktop\海量IP.xlsx"
# Spreadsheet column headers, in the same order as the table's td[1]..td[5].
COLUMN_NAMES = ("ip", "端口号", "代理位置", "代理类型", "验证时间")
# Number of leading text nodes dropped from each column, exactly as the
# original script did (3 for the first column, 1 for the others).
# NOTE(review): presumably these are header cells plus extra text matched in
# other tables on the page -- confirm against the live markup.
COLUMN_SKIP = (3, 1, 1, 1, 1)


def build_page_url(page):
    """Return the listing URL for the 1-based page number *page*.

    Page 1 lives at ``index.html``; every other page at ``<page>.html``.
    """
    if page == 1:
        return BASE_URL + "index.html"
    return BASE_URL + str(page) + ".html"


def trim_columns(raw_columns):
    """Drop the leading cells from each raw column and check alignment.

    Args:
        raw_columns: one sequence of cell texts per entry in COLUMN_SKIP.

    Returns:
        A list of trimmed lists, or ``None`` when the trimmed columns do not
        all have the same length (the rows of that page cannot be paired up
        reliably, and unequal columns would make the final DataFrame
        construction fail).
    """
    trimmed = [list(cells[skip:]) for cells, skip in zip(raw_columns, COLUMN_SKIP)]
    if len({len(cells) for cells in trimmed}) > 1:
        return None
    return trimmed


def fetch_html(page_url):
    """Download *page_url* and return its decoded text, or ``None`` on failure.

    Only request-level errors are retried, at most MAX_ATTEMPTS times with a
    growing pause, so an unreachable site can neither hang the script forever
    nor be hammered in a tight loop.
    """
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            res = requests.get(page_url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
            res.raise_for_status()
        except requests.RequestException as exc:
            print("request failed (%d/%d) for %s: %s" % (attempt, MAX_ATTEMPTS, page_url, exc))
            if attempt < MAX_ATTEMPTS:
                time.sleep(RETRY_DELAY * attempt)
            continue
        res.encoding = PAGE_ENCODING
        return res.text
    return None


def parse_columns(html):
    """Return the text nodes of td[1]..td[5] for every table row in *html*.

    Returns ``None`` when there is no document to query.
    """
    if not html or not html.strip():
        return None
    tree = etree.HTML(html)
    if tree is None:
        return None
    return [
        tree.xpath("//table/tr/td[%d]/text()" % index)
        for index in range(1, len(COLUMN_NAMES) + 1)
    ]


def main():
    """Scrape every listing page and write the collected rows to OUTPUT_PATH."""
    collected = [[] for _ in COLUMN_NAMES]
    for page in range(FIRST_PAGE, LAST_PAGE + 1):
        page_url = build_page_url(page)
        html = fetch_html(page_url)
        raw_columns = parse_columns(html) if html is not None else None
        trimmed = trim_columns(raw_columns) if raw_columns is not None else None
        if trimmed is None:
            # Skip rather than abort: one bad page must not cost the whole run.
            print("skipped " + page_url)
        else:
            for bucket, cells in zip(collected, trimmed):
                bucket.extend(cells)
            # Report the page that was actually fetched.
            print(page_url)
        time.sleep(PAGE_DELAY)
    df = pd.DataFrame(dict(zip(COLUMN_NAMES, collected)))
    df.to_excel(OUTPUT_PATH, index=False)


if __name__ == "__main__":
    main()
# Tags: iptype, 海量, ip, time, tr, 爬取, text, table
# From: https://www.cnblogs.com/jzm123/p/17292497.html