手机号段抓取脚本
#! -*- coding:utf-8 -*-
import requests
from multiprocessing.pool import ThreadPool
from lxml import etree
# 取消验证警告
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
# Raw request headers copied from a browser session, one "Name: value" pair
# per line; parsed into the ``headers`` dict below.
# "br" is deliberately not advertised in Accept-Encoding: requests/urllib3 can
# only decode Brotli bodies when an optional Brotli package is installed, and
# an undecoded body would make the XPath query below silently match nothing.
headers_str = """Host: www.sdfl.net
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8
Accept-Language: zh-TW,zh;q=0.8,en-US;q=0.5,en;q=0.3
Accept-Encoding: gzip, deflate
Referer: https://cn.bing.com/
DNT: 1
Connection: keep-alive
Upgrade-Insecure-Requests: 1
Sec-Fetch-Dest: document
Sec-Fetch-Mode: navigate
Sec-Fetch-Site: cross-site
Sec-Fetch-User: ?1
Pragma: no-cache
Cache-Control: no-cache"""


def parse_headers(raw):
    """Turn a block of ``Name: value`` lines into a header dict.

    Each line is split on the first ``": "`` only, so values may themselves
    contain colons. Blank lines are skipped. A non-blank line without the
    ``": "`` separator raises ``ValueError``.
    """
    return dict(
        line.split(": ", 1) for line in raw.splitlines() if line.strip()
    )


headers = parse_headers(headers_str)

# Optional SOCKS proxy settings; not passed to the request by default.
# NOTE(review): only the "http" scheme is mapped here, so this mapping would
# presumably not apply to the https URL below even if passed - confirm before
# enabling it.
proxies = {
    "http": "socks5://127.0.0.1:8080",
}
url = "https://www.sdfl.net/s/hubei/wuhan/"

# Seconds to wait for the server before giving up instead of hanging forever.
REQUEST_TIMEOUT = 30
# Every prefix is expanded with the four-digit suffixes 0000-9999.
SUFFIXES_PER_PREFIX = 10000
OUTPUT_PATH = r'phone.txt'


def fetch_prefixes(page_url=url, request_headers=None, timeout=REQUEST_TIMEOUT):
    """Download the listing page and return the prefixes found on it.

    Args:
        page_url: Page to download.
        request_headers: Headers to send; defaults to the module-level
            ``headers`` dict.
        timeout: Seconds to wait for the server.

    Returns:
        A list with the text of every ``dd/a`` link inside the result
        ``dl`` elements, in document order.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    if request_headers is None:
        request_headers = headers
    # verify=False skips TLS certificate verification, as the script always
    # did; the matching warning is silenced at the top of the file.
    ret = requests.get(
        url=page_url,
        headers=request_headers,
        verify=False,
        timeout=timeout,
    )  # proxies=proxies
    # Fail loudly on 4xx/5xx rather than reporting success for an error page.
    ret.raise_for_status()
    print("[*] 获取html成功")
    html_data = etree.HTML(ret.text)
    # Guard against the parser yielding no document for an empty body.
    if html_data is None:
        search_list = []
    else:
        search_list = html_data.xpath(
            r'//div[@class="wrap h_list"]/dl[@class="list"]'
        )
    print("[*] 获取手机号段数量:", len(search_list))
    return [
        prefix
        for result in search_list
        for prefix in result.xpath(r'./dd/a/text()')
    ]


def iter_phone_numbers(prefixes, count=SUFFIXES_PER_PREFIX):
    """Yield one newline-terminated number per prefix/suffix combination.

    For every prefix the suffixes ``0 .. count - 1`` are appended,
    zero-padded to four digits. Numbers are produced lazily so the full
    set never has to be held in memory at once.
    """
    for prefix in prefixes:
        for suffix in range(count):
            yield "{}{:0>4d}\n".format(prefix, suffix)


def main():
    """Scrape the prefixes and append every expanded number to the output file."""
    prefixes = fetch_prefixes()
    # The total is known up front, so it is reported before writing starts.
    print("[*] 获取手机号总数量:", len(prefixes) * SUFFIXES_PER_PREFIX)
    # Append mode is kept so repeated runs accumulate into the same file.
    with open(OUTPUT_PATH, "a", encoding="utf-8") as f:
        f.writelines(iter_phone_numbers(prefixes))


if __name__ == "__main__":
    main()
标签:脚本,手机号,headers,list,抓取,ret,phone,print
From: https://www.cnblogs.com/startstart/p/16908443.html