这个代码例子用于获取链家网里所有的城市,然后按照 {省名: {市名: url, 市名: url, ...}, ...} 的结构保存到 JSON 文件中。
import json

import requests
from lxml import etree


def get_all_city(timeout=10):
    """Fetch the Lianjia nationwide city list, grouped by province.

    Args:
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout ``requests`` may block indefinitely on a
            stalled connection.

    Returns:
        A dict of the form ``{province_name: {city_name: city_url, ...}, ...}``,
        or ``None`` when the page could not be fetched (connection error,
        timeout, or HTTP error status).
    """
    url = "https://www.lianjia.com/city/"  # nationwide city list
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
    }
    try:
        response = requests.get(url=url, headers=headers, timeout=timeout)
        # Treat 4xx/5xx as a failure instead of parsing an error page.
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        # The network is sometimes flaky; report the failure and signal it
        # to the caller with None.
        print(e)
        return None

    tree = etree.HTML(response.text)
    citys = {}
    for province in tree.xpath("//div[@class='city_province']"):
        # Province name, e.g. 山东
        province_titles = province.xpath(".//div[@class='city_list_tit c_b']/text()")
        if not province_titles:
            # Skip blocks without a title rather than crash with IndexError.
            continue
        # City names, e.g. ['菏泽', '济南', '济宁', ...]
        city_names = province.xpath(".//ul/li/a/text()")
        # Matching URLs, e.g. ['https://heze.lianjia.com/', 'https://jn.lianjia.com/', ...]
        city_urls = province.xpath(".//ul/li/a/@href")
        # Pair every city name with its URL.
        citys[province_titles[0]] = dict(zip(city_names, city_urls))
    return citys


if __name__ == '__main__':
    citys_data = get_all_city()
    if citys_data is None:
        # Do not overwrite the output file with "null" when the fetch failed.
        print("Failed to fetch the city list; citys_data.json was not written.")
    else:
        with open("citys_data.json", "w", encoding='utf-8') as f:
            # ensure_ascii=False keeps Chinese characters readable in the file.
            json.dump(citys_data, f, ensure_ascii=False)
json.dump(citys_data, f),默认 ensure_ascii 为 True,中文会被转义成 \uXXXX 形式写入 |
json.dump(citys_data, f, ensure_ascii=False),指定 ensure_ascii 为 False,中文按原样写入 |
{ "\u5b89\u5fbd": { "\u5b89\u5e86": "https://aq.lianjia.com/", "\u6ec1\u5dde": "https://cz.fang.lianjia.com/", "\u961c\u9633": "https://fy.lianjia.com/", "\u5408\u80a5": "https://hf.lianjia.com/", "\u9a6c\u978d\u5c71": "https://mas.lianjia.com/", "\u829c\u6e56": "https://wuhu.lianjia.com/" }, "\u5317\u4eac": { "\u5317\u4eac": "https://bj.lianjia.com/" }, "\u91cd\u5e86": { "\u91cd\u5e86": "https://cq.lianjia.com/" }, 。。。。。。。 } |
{ "安徽": { "安庆": "https://aq.lianjia.com/", "滁州": "https://cz.fang.lianjia.com/", "阜阳": "https://fy.lianjia.com/", "合肥": "https://hf.lianjia.com/", "马鞍山": "https://mas.lianjia.com/", "芜湖": "https://wuhu.lianjia.com/" }, "北京": { "北京": "https://bj.lianjia.com/" }, "重庆": { "重庆": "https://cq.lianjia.com/" }, 。。。。。。。 } |
标签:province,city,False,com,ensure,https,citys,ascii,lianjia From: https://www.cnblogs.com/lifengjuan/p/17695594.html