import requests
from lxml import etree

def request_header():
    headers = {
        'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"  # Chrome
    }
    return headers

'''
Create two lists to hold the proxy IPs
'''
all_ip_list = []     # holds the IPs scraped from the site
usable_ip_list = []  # holds the IPs that passed the availability check


def send_request():
    # Scrape 7 pages; adjust as needed
    for i in range(1, 8):
        print(f'Scraping page {i}...')
        response = requests.get(url=f'http://www.ip3366.net/free/?page={i}', headers=request_header())
        # The site is GBK-encoded but requests falls back to ISO-8859-1;
        # re-encoding with the same codec recovers the raw bytes for lxml
        text = response.text.encode('ISO-8859-1')
        # print(text.decode('gbk'))
        # Parse with XPath and extract the IP and port columns
        html = etree.HTML(text)
        tr_list = html.xpath('/html/body/div[2]/div/div[2]/table/tbody/tr')
        for td in tr_list:
            ip_ = td.xpath('./td[1]/text()')[0]    # IP
            port_ = td.xpath('./td[2]/text()')[0]  # port
            proxy = ip_ + ':' + port_              # e.g. 115.218.5.5:9000
            all_ip_list.append(proxy)
            test_ip(proxy)  # check whether the scraped IP is usable
    print('Scraping finished!')
    # print(f'Total IPs scraped: {len(all_ip_list)}')
    print(f'Usable IPs: {len(usable_ip_list)}')
    print('They are:\n', usable_ip_list)

# Check whether a proxy IP is usable
def test_ip(proxy):
    # Build the proxy mapping
    proxies = {
        "http": "http://" + proxy,
        "https": "http://" + proxy,
    }
    try:
        # timeout=1 waits at most 1s for a response
        response = requests.get(url='https://www.baidu.com/', headers=request_header(), proxies=proxies, timeout=1)
        response.close()
        if response.status_code == 200:
            usable_ip_list.append(proxy)
            print(proxy, '\033[31musable\033[0m')
            # the rest of the program goes here.............................
        else:
            print(proxy, 'unusable')
    except requests.RequestException:
        print(proxy, 'request failed')

if __name__ == '__main__':
    send_request()
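If you want to put the harvested proxies to work, here is a minimal sketch. It assumes the script above has already run, so usable_ip_list is non-empty, and that the targets accept plain HTTP proxies; get_with_random_proxy is a hypothetical helper, not part of the original post.

import random

def get_with_random_proxy(url):
    # Pick one proxy at random from the ones that passed the check
    # (hypothetical helper; assumes usable_ip_list is non-empty)
    proxy = random.choice(usable_ip_list)
    proxies = {"http": "http://" + proxy, "https": "http://" + proxy}
    # Reuse the same UA header; a longer timeout than the 1s check is safer
    return requests.get(url, headers=request_header(), proxies=proxies, timeout=5)

# Usage, after send_request() has populated usable_ip_list:
# resp = get_with_random_proxy('http://httpbin.org/ip')
# print(resp.text)

Rotating randomly over the checked list spreads requests across proxies; free proxies die quickly, though, so re-running the check shortly before use is a good idea.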
From: https://www.cnblogs.com/zhang-dan/p/16990563.html