数据源为 ipip.net。因为没有会员,只能爬取网站页面自行解析 -_-!。为防止 IP 被封,没有启用多线程,放到后台慢慢运行解析即可。
记录下代码
#! /usr/bin/env python
# -*- coding: utf-8 -*-
"""Crawl whois.ipip.net for the AS numbers of a country and their prefixes.

The crawl is deliberately sequential (one request at a time); results are
written to JSON files in chunks so a long run does not keep everything in
memory.
"""
import json
import os
import sys

import bs4
import requests


class HTML_OBJ:
    """Scraper for the per-country AS list and per-AS prefix tables.

    Args:
        country: Country code appended to ``<url>countries/`` (e.g. ``'KR'``).
        output_dir: Directory that receives the JSON chunk files.
        timeout: Per-request timeout in seconds passed to ``requests.get``.
    """

    # Number of AS entries written to each numbered chunk file.
    CHUNK_SIZE = 100

    def __init__(self, country=None, output_dir='/root/python/crawler',
                 timeout=30):
        self.url = 'https://whois.ipip.net/'
        self.country = country
        self.output_dir = output_dir
        self.timeout = timeout
        self.As_dic = {}
        self.IPsub_list = []
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/83.0.4103.97 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;'
                      'q=0.9,image/webp,image/apng,*/*;'
                      'q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        }

    def _fetch_soup(self, url):
        """Download *url* and return it parsed with the ``lxml`` parser.

        Raises:
            requests.RequestException: On timeout, connection failure or a
                non-2xx HTTP status, so an error page is never parsed as data.
        """
        resp = requests.get(url=url, headers=self.headers,
                            timeout=self.timeout)
        resp.raise_for_status()
        return bs4.BeautifulSoup(resp.text, 'lxml')

    def _dump(self, data, filename):
        """Write *data* as JSON to ``<output_dir>/<filename>.json``.

        The file is overwritten: appending a second JSON document to an
        existing file would leave it unparseable.
        """
        path = os.path.join(self.output_dir, '%s.json' % filename)
        with open(path, 'w', newline='\n', encoding='utf-8') as fh:
            json.dump(data, fh, indent=1)

    def get_as(self):
        """Collect the AS entries listed on the country page.

        Every ``<a>`` element that has a ``title`` attribute is recorded,
        keyed by the link text, with the title stored under
        ``'descriptions'``.
        NOTE(review): this assumes only AS links carry a ``title`` on that
        page -- confirm against the live markup.

        Returns:
            ``self.As_dic``, mapping link text to ``{'descriptions': title}``.

        Raises:
            ValueError: If no country code was given.
            requests.RequestException: If the page cannot be fetched.
        """
        if not self.country:
            raise ValueError('country code is required, e.g. "KR"')
        soup = self._fetch_soup(self.url + 'countries/' + self.country)
        for link in soup.select('a'):
            title = link.get('title')
            if title:
                self.As_dic[link.text] = {'descriptions': title}
        return self.As_dic

    def get_ipsub(self, ASN):
        """Append the prefixes shown on the page of *ASN* to ``IPsub_list``.

        The text of every ``<a>`` inside the first ``.table-responsive``
        element is collected. A page without such an element contributes
        nothing instead of raising ``AttributeError``.

        Returns:
            ``self.IPsub_list`` (accumulates across calls until reset).

        Raises:
            requests.RequestException: If the page cannot be fetched.
        """
        soup = self._fetch_soup(self.url + ASN)
        table = soup.select_one('.table-responsive')
        if table is not None:
            self.IPsub_list.extend(link.text for link in table.select('a'))
        return self.IPsub_list

    def get_json(self):
        """Crawl every AS of the country and dump the results as JSON.

        Entries are flushed every ``CHUNK_SIZE`` ASes to
        ``<country>1.json``, ``<country>2.json``, ...; whatever remains at
        the end (possibly an empty object) goes to ``<country>last.json``.
        A percentage is rewritten in place on stdout while the crawl runs.
        """
        write, flush = sys.stdout.write, sys.stdout.flush
        # Fail before the slow crawl starts if the directory is unusable.
        os.makedirs(self.output_dir, exist_ok=True)

        AS_dic = self.get_as()
        total = len(AS_dic)
        pending = {}
        chunk_no = 1
        for index, asn in enumerate(AS_dic):
            # Start from a clean list so prefixes from an earlier call
            # cannot leak into this AS.
            self.IPsub_list = []
            entry = AS_dic[asn]
            entry['ipsub'] = self.get_ipsub(asn)
            self.IPsub_list = []
            pending[asn] = entry

            # '%' presentation type multiplies by 100, giving a real percent.
            show = '{:.3%}'.format((index + 1) / total)
            write(show)
            flush()
            # Backspaces move the cursor back so the next value overwrites.
            write('\x08' * len(show))

            if len(pending) >= self.CHUNK_SIZE:
                self._dump(pending, self.country + str(chunk_no))
                chunk_no += 1
                pending = {}

        print('>>>>>>finish')
        self._dump(pending, self.country + 'last')


if __name__ == '__main__':
    HTML_OBJ(country='KR').get_json()
数据格式
{"AS2500": { "descriptions": "AS2500 - WIDE-BB - WIDE Project, JP", "ipsub": [ "133.4.128.0/18", "133.144.0.0/24", "133.144.0.0/16", "163.221.0.0/24", "163.221.0.0/16", "192.218.228.0/24", "202.249.0.0/18", "203.178.128.0/17" ] }, "AS2501": { "descriptions": "AS2501 - UTnet - The University of Tokyo, JP", "ipsub": [ "130.69.0.0/24", "130.69.0.0/16", "133.11.0.0/24", "133.11.0.0/16", "157.82.0.0/24", "157.82.0.0/16", "157.82.112.0/21", "192.51.208.0/20" ] }, "AS2504": { "descriptions": "AS2504 - NCA5 - Kyoto University, JP", "ipsub": [ "130.54.0.0/24", "130.54.0.0/16", "133.3.0.0/24", "133.3.0.0/16", "192.50.8.0/23", "192.50.24.0/23" ] },
... }
标签:24,get,url,0.0,self,下雨,dic,IP地址,备用 From: https://www.cnblogs.com/darkchen/p/16753301.html