前言
之前尝试写过一个爬虫,那时对网页请求还不够熟练,用的原理是:爬取整个HTML文件,然后根据标签筛选有效信息。
现在看来这种方式无疑是吃力不讨好,因此现在重新写了一个爬取天气的程序。
准备工作
网上能轻松找到的是"101010100 北京"这种编号,而查看中国气象局的URL可知,他们使用的是"北京54511"这种编号。
后者没能找到完整的列表,但不影响程序的编写,找到的初始文件部分内容如下:
city_code.txt
直辖市 , 北京54511 , 上海58367 , 天津54517 , 重庆57516
特别行政区 , 香港45005 , 澳门45011
黑龙江 , 哈尔滨50953 , 齐齐哈尔50745 , 牡丹江54094 , 大庆50842 , 伊春50774 , 双鸭山50884 , 鹤岗50775 , 鸡西50978 , 佳木斯50873 , 七台河50971 , 黑河50468 , 绥化50853 , 大兴安岭50442
先将其转为JSON的字典格式(转换后的完整城市编号放在最后),转换后的部分文件内容如下:
"北京": "54511", "上海": "58367", "天津": "54517", "重庆": "57516", "香港": "45005", "澳门": "45011", "哈尔滨": "50953", "齐齐哈尔": "50745", "牡丹江": "54094", "大庆": "50842", "伊春": "50774", "双鸭山": "50884", "鹤岗": "50775"
转换代码如下:
import json

file_txt = r'city_code.txt'    # input: "province , cityNNNNN , cityNNNNN , ..." per line
file_json = r'city_code.json'  # output: one JSON object mapping city name -> code
dic = dict()                   # city name -> code, filled by make_json()

_CODE_LEN = 5  # every entry ends with a code of this many characters


def make_json(src=None, dst=None):
    """Convert the comma-separated city list into a JSON city -> code mapping.

    Each input line has the form ``province , city54511 , city58367 , ...``.
    The first field (the province) is dropped; every remaining field is split
    into a city name and its trailing 5-character code.  The pairs are stored
    in the module-level ``dic`` and the whole mapping is written to *dst* as
    a single JSON object.

    Fields that are empty (for example after a trailing comma) or that do not
    end in a numeric code are skipped, so they cannot create an empty or
    bogus key in the mapping.

    Args:
        src: Path of the text file to read.  Defaults to ``file_txt``.
        dst: Path of the JSON file to write.  Defaults to ``file_json``.

    Returns:
        The module-level ``dic`` after it has been updated.
    """
    src = file_txt if src is None else src
    dst = file_json if dst is None else dst

    with open(src, 'r', encoding='utf-8') as f:
        for line in f:
            # The first comma-separated field is the province name; drop it.
            # Blank lines yield no remaining fields and are skipped naturally.
            for field in line.split(',')[1:]:
                entry = field.strip()
                city, code = entry[:-_CODE_LEN], entry[-_CODE_LEN:]
                if not city or not code.isdigit():
                    continue
                dic[city] = code

    # ensure_ascii=False keeps the Chinese names readable in the file, so the
    # file is written as UTF-8 explicitly instead of relying on the platform
    # default encoding.
    with open(dst, 'w', encoding='utf-8') as f:
        f.write(json.dumps(dic, ensure_ascii=False))
    return dic


if __name__ == "__main__":
    make_json()
开始爬取数据
完整代码放在后面
- 先将上一步生成的数据读取为字典类型变量
def read_json():  # load the city-code table
    """Load the city -> code mapping from ``file_json`` into ``city_code``.

    The file is decoded as UTF-8 first (a leading BOM is tolerated).  If the
    bytes are not valid UTF-8, the file is re-read with the platform default
    encoding, which is what a file written without an explicit encoding uses.

    The parsed object is stored in the module-level ``city_code``; nothing is
    returned.
    """
    global city_code
    try:
        with open(file_json, 'r', encoding='utf-8-sig') as f:
            city_code = json.load(f)
    except UnicodeDecodeError:
        # Not UTF-8: fall back to the platform default encoding.
        with open(file_json, 'r') as f:
            city_code = json.load(f)
- 根据城市名找到对应的城市编号,然后将其拼接到URL末尾作为请求参数访问服务器
def get_data(city: str) -> "str | None":  # request the data by URL
    """Fetch the raw current-weather response body for *city*.

    The city's code is looked up in the module-level ``city_code`` table,
    appended to ``url`` and requested with an HTTP GET.

    Args:
        city: City name, used as the key into ``city_code``.

    Returns:
        The response body as text.  ``None`` when *city* is not present in
        ``city_code``.  An empty string when the HTTP request itself fails
        (connection error, timeout, ...), so callers can tell "unknown city"
        apart from "no usable response".
    """
    try:
        code = city_code[city]
    except KeyError:
        return None

    try:
        # requests has no default timeout; bound the wait so a stalled
        # server cannot hang the caller forever.
        response = requests.get(url + code, timeout=10)
    except requests.RequestException:
        # Only transport-level failures are turned into an empty body;
        # unrelated errors are no longer hidden by a `return` in `finally`.
        return ''

    response.encoding = response.apparent_encoding
    return response.text
- 将网页响应转为字典
def json_to_dic(str_json: str):  # convert the response text into a dict
    """Parse the API response text and return its ``data`` payload.

    Args:
        str_json: Response body, expected to be a JSON object with a ``msg``
            field and, on success, a ``data`` field.

    Returns:
        The value of ``data`` when ``msg`` equals ``'success'``.  ``None``
        when the text is empty or not valid JSON, when the top-level value is
        not an object, when ``msg`` is missing or not ``'success'``, or when
        ``data`` is absent.
    """
    try:
        data_dic = json.loads(str_json)
    except (TypeError, ValueError):
        # json.JSONDecodeError is a ValueError; TypeError covers a None body.
        return None
    if not isinstance(data_dic, dict) or data_dic.get('msg') != 'success':
        return None
    return data_dic.get('data')
响应内容形式是:
{"msg":"success","code":0,"data":{"location":{"id":"58367","name":"上海","path":"中国, 上海, 上海"},"now":{"precipitation":0.0,"temperature":12.0,"pressure":1021.0,"humidity":62.0,"windDirection":"东北风","windDirectionDegree":83.0,"windSpeed":0.8,"windScale":"微风"},"alarm":[],"lastUpdate":"2023/02/07 15:30"}}
- 将目前的数据做一个整理
def decorate_msg(data: dict) -> str:
    """Format the ``data`` payload of a weather response as readable text.

    The first line is the location name, the second the last-update time,
    followed by one ``label:value+unit`` line per field of ``data['now']``.
    Every line, including the last, ends with a newline.

    Args:
        data: Payload containing ``location`` (with ``name``), ``now`` and
            ``lastUpdate``.

    Returns:
        The formatted multi-line report.
    """
    location = data['location']
    now = data['now']
    # (key in data['now'], label shown to the user, unit suffix)
    fields = (
        ('precipitation', '降雨', 'mm'),
        ('temperature', '温度', '℃'),
        ('pressure', '气压', 'hPa'),
        ('humidity', '湿度', '%'),
        ('windDirection', '风向', ''),
        ('windDirectionDegree', '风向度', '°'),
        ('windSpeed', '风速', 'm/s'),
        ('windScale', '风级', ''),
    )
    lines = [location['name'] + '\n', '更新时间' + data['lastUpdate'] + '\n']
    for key, label, suffix in fields:
        lines.append(label + ':' + str(now[key]) + suffix + '\n')
    return ''.join(lines)
返回的结果形式:
上海
更新时间2023/02/07 15:35
降雨:0.0mm
温度:12.0℃
气压:1021.0hPa
湿度:62.0%
风向:东北风
风向度:20.0°
风速:1.0m/s
风级:微风
完整代码
import json
import requests
file_json = r'city_code.json'  # JSON file holding the city name -> code table
url = 'https://weather.cma.cn/api/now/'  # base URL; the city code is appended to it
# A module-level `global` statement does nothing, so define the table here.
# It starts empty and is replaced by read_json().
city_code = {}
def the_weather(city: str) -> str:
    """Return a formatted current-weather report for *city*.

    Reloads the city-code table, requests the data for *city*, and formats
    the result.  A fixed message is returned instead when the city is not in
    the table or when the response does not report success.

    Args:
        city: City name to look up.

    Returns:
        The formatted report, or one of the two fallback messages.
    """
    read_json()
    raw = get_data(city)
    if raw is None:
        return '暂未收录该城市数据。'
    payload = json_to_dic(raw)
    if payload is None:
        return '查询失败,请稍后再试。'
    return decorate_msg(payload)
def read_json():  # load the city-code table
    """Load the city -> code mapping from ``file_json`` into ``city_code``.

    The file is decoded as UTF-8 first (a leading BOM is tolerated).  If the
    bytes are not valid UTF-8, the file is re-read with the platform default
    encoding, which is what a file written without an explicit encoding uses.

    The parsed object is stored in the module-level ``city_code``; nothing is
    returned.
    """
    global city_code
    try:
        with open(file_json, 'r', encoding='utf-8-sig') as f:
            city_code = json.load(f)
    except UnicodeDecodeError:
        # Not UTF-8: fall back to the platform default encoding.
        with open(file_json, 'r') as f:
            city_code = json.load(f)
def get_data(city: str) -> "str | None":  # request the data by URL
    """Fetch the raw current-weather response body for *city*.

    The city's code is looked up in the module-level ``city_code`` table,
    appended to ``url`` and requested with an HTTP GET.

    Args:
        city: City name, used as the key into ``city_code``.

    Returns:
        The response body as text.  ``None`` when *city* is not present in
        ``city_code``.  An empty string when the HTTP request itself fails
        (connection error, timeout, ...), so callers can tell "unknown city"
        apart from "no usable response".
    """
    try:
        code = city_code[city]
    except KeyError:
        return None

    try:
        # requests has no default timeout; bound the wait so a stalled
        # server cannot hang the caller forever.
        response = requests.get(url + code, timeout=10)
    except requests.RequestException:
        # Only transport-level failures are turned into an empty body;
        # unrelated errors are no longer hidden by a `return` in `finally`.
        return ''

    response.encoding = response.apparent_encoding
    return response.text
def json_to_dic(str_json: str):  # convert the response text into a dict
    """Parse the API response text and return its ``data`` payload.

    Args:
        str_json: Response body, expected to be a JSON object with a ``msg``
            field and, on success, a ``data`` field.

    Returns:
        The value of ``data`` when ``msg`` equals ``'success'``.  ``None``
        when the text is empty or not valid JSON, when the top-level value is
        not an object, when ``msg`` is missing or not ``'success'``, or when
        ``data`` is absent.
    """
    try:
        data_dic = json.loads(str_json)
    except (TypeError, ValueError):
        # json.JSONDecodeError is a ValueError; TypeError covers a None body.
        return None
    if not isinstance(data_dic, dict) or data_dic.get('msg') != 'success':
        return None
    return data_dic.get('data')
def decorate_msg(data: dict) -> str:
    """Format the ``data`` payload of a weather response as readable text.

    The first line is the location name, the second the last-update time,
    followed by one ``label:value+unit`` line per field of ``data['now']``.
    Every line, including the last, ends with a newline.

    Args:
        data: Payload containing ``location`` (with ``name``), ``now`` and
            ``lastUpdate``.

    Returns:
        The formatted multi-line report.
    """
    location = data['location']
    now = data['now']
    # (key in data['now'], label shown to the user, unit suffix)
    fields = (
        ('precipitation', '降雨', 'mm'),
        ('temperature', '温度', '℃'),
        ('pressure', '气压', 'hPa'),
        ('humidity', '湿度', '%'),
        ('windDirection', '风向', ''),
        ('windDirectionDegree', '风向度', '°'),
        ('windSpeed', '风速', 'm/s'),
        ('windScale', '风级', ''),
    )
    lines = [location['name'] + '\n', '更新时间' + data['lastUpdate'] + '\n']
    for key, label, suffix in fields:
        lines.append(label + ':' + str(now[key]) + suffix + '\n')
    return ''.join(lines)
if __name__ == "__main__":
    # Demo run: print the current weather report for Shanghai.
    print(the_weather('上海'))
城市编号
city_code.json
{"北京": "54511", "上海": "58367", "天津": "54517", "重庆": "57516", "香港": "45005", "澳门": "45011", "哈尔滨": "50953", "齐齐哈尔": "50745", "牡丹江": "54094", "大庆": "50842", "伊春": "50774", "双鸭山": "50884", "鹤岗": "50775", "鸡西": "50978", "佳木斯": "50873", "七台河": "50971", "黑河": "50468", "绥化": "50853", "大兴安岭": "50442", "长春": "54161", "延边": "99999", "吉林": "54172", "白山": "54371", "白城": "50936", "四平": "54157", "松原": "50946", "辽源": "54260", "大安": "50945", "通化": "54363", "沈阳": "54342", "大连": "54662", "葫芦岛": "54453", "盘锦": "54338", "本溪": "54346", "抚顺": "54353", "铁岭": "54249", "辽阳": "54347", "营口": "54471", "阜新": "54237", "朝阳": "54324", "锦州": "54337", "丹东": "54497", "鞍山": "54339", "呼和浩特": "53463", "呼伦贝尔": "99999", "锡林浩特": "54102", "包头": "53446", "赤峰": "54218", "海拉尔": "50527", "乌海": "53512", "鄂尔多斯": "53543", "通辽": "54135", "石家庄": "53698", "唐山": "54534", "张家口": "54401", "廊坊": "54515", "邢台": "53798", "邯郸": "53892", "沧州": "54616", "衡水": "54702", "承德": "54423", "保定": "54602", "秦皇岛": "54449", "郑州": "57083", "开封": "57091", "洛阳": "57073", "平顶山": "57171", "焦作": "53982", "鹤壁": "53990", "新乡": "53986", "安阳": "53898", "濮阳": "54900", "许昌": "57089", "漯河": "57186", "三门峡": "57051", "南阳": "57178", "商丘": "58005", "信阳": "57297", "周口": "57195", "驻马店": "57290", "济南": "54823", "青岛": "54857", "淄博": "54830", "威海": "54774", "曲阜": "54918", "临沂": "54938", "烟台": "54765", "枣庄": "58024", "聊城": "54806", "济宁": "54915", "菏泽": "54906", "泰安": "54827", "日照": "54945", "东营": "54736", "德州": "54714", "滨州": "54734", "莱芜": "54828", "潍坊": "54843", "太原": "53772", "阳泉": "53782", "晋城": "53976", "晋中": "53778", "临汾": "53868", "运城": "53959", "长治": "53882", "朔州": "53578", "忻州": "53674", "大同": "53487", "吕梁": "53753", "南京": "58238", "苏州": "58357", "昆山": "58356", "南通": "58259", "太仓": "58377", "吴县": "58349", "徐州": "58027", "宜兴": "58346", "镇江": "58248", "淮安": "58145", "常熟": "58352", "盐城": "58151", "泰州": "58246", "无锡": "58354", "连云港": "58044", "扬州": "58245", "常州": "58343", "宿迁": "58131", "合肥": "58321", "巢湖": "58326", "蚌埠": "58221", "安庆": "58424", "六安": 
"58311", "滁州": "58236", "马鞍山": "58336", "阜阳": "58203", "宣城": "58433", "铜陵": "58429", "淮北": "58116", "芜湖": "58334", "毫州": "99999", "宿州": "58122", "淮南": "58224", "池州": "58427", "西安": "57036", "韩城": "53955", "安康": "57245", "汉中": "57127", "宝鸡": "57016", "咸阳": "57048", "榆林": "53646", "渭南": "57045", "商洛": "57143", "铜川": "53947", "延安": "53845", "银川": "53614", "固原": "53817", "中卫": "53704", "石嘴山": "53518", "吴忠": "53612", "兰州": "52889", "白银": "52896", "庆阳": "53829", "酒泉": "52533", "天水": "57006", "武威": "52679", "张掖": "52652", "甘南": "50741", "临夏": "52984", "平凉": "53915", "定西": "52995", "金昌": "52675", "西宁": "52866", "海北": "52754", "海西": "52737", "黄南": "56065", "果洛": "56043", "玉树": "56029", "海东": "52875", "海南": "52856", "武汉": "57494", "宜昌": "57461", "黄冈": "57498", "恩施": "57447", "荆州": "57476", "神农架": "57362", "十堰": "57256", "咸宁": "57590", "襄樊": "57278", "孝感": "57482", "随州": "57381", "黄石": "58407", "荆门": "57377", "鄂州": "57496", "长沙": "57687", "邵阳": "57766", "常德": "57662", "郴州": "57972", "吉首": "57649", "株洲": "57780", "娄底": "57763", "湘潭": "57773", "益阳": "57674", "永州": "57866", "岳阳": "57584", "衡阳": "57872", "怀化": "57749", "韶山": "57771", "张家界": "57558", "杭州": "58457", "湖州": "58450", "金华": "58549", "宁波": "58563", "丽水": "58646", "绍兴": "58453", "雁荡山": "99999", "衢州": "58633", "嘉兴": "58452", "台州": "58660", "舟山": "58477", "温州": "58659", "南昌": "58606", "萍乡": "57786", "九江": "58502", "上饶": "58637", "抚州": "58617", "吉安": "57799", "鹰潭": "58627", "宜春": "57793", "新余": "57796", "景德镇": "58527", "赣州": "57993", "福州": "58847", "厦门": "59134", "龙岩": "58927", "南平": "58834", "宁德": "58846", "莆田": "58946", "泉州": "59137", "三明": "58828", "漳州": "59126", "贵阳": "57816", "安顺": "57806", "赤水": "57609", "遵义": "57713", "铜仁": "57741", "六盘水": "56693", "毕节": "57707", "凯里": "57825", "都匀": "57827", "成都": "56294", "泸州": "57602", "内江": "57504", "凉山": "56571", "阿坝": "56171", "巴中": "57313", "广元": "57206", "乐山": "56386", "绵阳": "56196", "德阳": "56198", "攀枝花": "56666", "雅安": "56287", "宜宾": "56492", "自贡": "56396", "甘孜州": "56146", 
"达州": "57328", "资阳": "56298", "广安": "57415", "遂宁": "57405", "眉山": "56391", "南充": "57411", "广州": "59287", "深圳": "59493", "潮州": "59312", "韶关": "59082", "湛江": "59658", "惠州": "59298", "清远": "59280", "东莞": "59289", "江门": "59473", "茂名": "59659", "肇庆": "59278", "汕尾": "59501", "河源": "59293", "揭阳": "59315", "梅州": "59117", "中山": "59485", "德庆": "59269", "阳江": "59663", "云浮": "59471", "珠海": "59488", "汕头": "59316", "佛山": "59279", "南宁": "59432", "桂林": "57957", "阳朔": "59051", "柳州": "59046", "梧州": "59265", "玉林": "59453", "桂平": "59254", "贺州": "59065", "钦州": "59632", "贵港": "59249", "防城港": "59635", "百色": "59211", "北海": "59644", "河池": "59023", "来宾": "59242", "崇左": "59425", "昆明": "56778", "保山": "56748", "楚雄": "56768", "德宏": "56844", "红河": "56975", "临沧": "56951", "怒江": "56533", "曲靖": "56783", "思茅": "56964", "文山": "56994", "玉溪": "56875", "昭通": "56586", "丽江": "56651", "大理": "56751", "海口": "59758", "三亚": "59948", "儋州": "59845", "琼山": "59757", "通什": "59941", "文昌": "59856", "乌鲁木齐": "51463", "阿勒泰": "51076", "阿克苏": "51628", "昌吉": "51368", "哈密": "52203", "和田": "51828", "喀什": "51709", "克拉玛依": "51243", "石河子": "51356", "塔城": "51133", "库尔勒": "51656", "吐鲁番": "51573", "伊宁": "51431", "拉萨": "55591", "阿里": "55437", "昌都": "56137", "那曲": "55299", "日喀则": "55578", "山南": "55598", "林芝": "56312", "台北": "58968", "高雄": "59554"}
标签:city,code,天气,信息,dic,爬取,json,str,data
From: https://www.cnblogs.com/zh-jp/p/17098919.html