Xpath选择器爬取房源信息实例
获取二手房列表页的 HTML 并用 XPath 解析其中的房源信息;未进一步抓取各房源的详情子页面。
运行环境:Python 3.6
# Scrape second-hand housing listings from lianjia.com list pages with XPath.
#
# For every page number 1..100 and every city sub-domain, the list page is
# downloaded, seven parallel columns are extracted with absolute XPath
# expressions, and one record per listing is collected.  The records are then
# appended to `df`, and `df` is appended after `existing_data`.
#
# NOTE(review): this fragment relies on names set up earlier in the script:
# `requests`, `etree`, `pd`, `cookies`, `headers`, `df` and `existing_data`.


def _split_fields(text, sep, count):
    """Split *text* on *sep* and return exactly *count* parts.

    Missing trailing parts are filled with None so that one malformed listing
    does not raise IndexError and abort the whole crawl; extra parts are
    dropped.
    """
    parts = text.split(sep)
    return (parts + [None] * count)[:count]


# City sub-domains: Beijing, Shanghai, Guangzhou, Shenzhen.
# Loop-invariant, so it is built once instead of once per page.
city = ['bj', 'sh', 'gz', 'sz']

# Every column lives under the same per-listing <div>; only the tail differs.
_LISTING = '/html/body/div[4]/div[1]/ul/li/div[1]'

# Records are accumulated here and turned into a DataFrame once at the end.
# `DataFrame.append` was removed in pandas 2.0, and calling it per row copies
# the whole frame each time.
rows = []

for i in range(1, 101):
    print('正在爬取第' + str(i) + '页')

    for c in city:
        # The first page has no `pg{n}` path segment.
        if i == 1:
            url = 'https://{}.lianjia.com/ershoufang/'.format(c)
        else:
            url = 'https://{}.lianjia.com/ershoufang/pg{}/'.format(c, i)

        # Cookies and headers are sent with the request -- presumably so the
        # site does not reject it (TODO confirm against the setup code).
        response = requests.get(url, cookies=cookies, headers=headers)

        # Parse the page and pull out one list per column.
        html = etree.HTML(response.text)
        # Listing title / short description.
        house_brief = html.xpath(_LISTING + '/div[1]/a/text()')
        # Location comes in two anchors that are concatenated below.
        house_location1 = html.xpath(_LISTING + '/div[2]/div/a[1]/text()')
        house_location2 = html.xpath(_LISTING + '/div[2]/div/a[2]/text()')
        # '|'-separated string: layout | area | orientation | decoration | floor.
        house_info = html.xpath(_LISTING + '/div[3]/div/text()')
        # '/'-separated string: followers / publish time.
        house_follow_info = html.xpath(_LISTING + '/div[4]/text()')
        # Total price.
        house_total_price = html.xpath(_LISTING + '/div[6]/div[1]/span/text()')
        # Price per square metre.
        house_unit_price = html.xpath(_LISTING + '/div[6]/div[2]/span/text()')

        # The columns are assumed to be positionally aligned (entry j of each
        # list belongs to listing j).  zip() stops at the shortest column, so
        # a ragged page yields fewer rows instead of raising IndexError.
        for brief, loc1, loc2, info, follow, total, unit in zip(
                house_brief, house_location1, house_location2, house_info,
                house_follow_info, house_total_price, house_unit_price):
            # Split each compound string once instead of once per field.
            layout, area, orientation, decoration, floor = _split_fields(info, '|', 5)
            followers, published = _split_fields(follow, '/', 2)
            rows.append({
                '简介': brief,
                '地理位置': loc1 + loc2,
                '总价': total,
                '每平米的价格': unit,
                '关注': followers,
                '发布时间': published,
                '户型': layout,
                '面积': area,
                '朝向': orientation,
                '装修': decoration,
                '楼层': floor,
            })

# Add the scraped rows to `df` in a single concat.
if rows:
    df = pd.concat([df, pd.DataFrame(rows)], ignore_index=True)

# Append the new data after the previously saved data.
# NOTE(review): assumed to run once, after both loops finish -- the original
# indentation was lost, confirm against the full script.
df = pd.concat([existing_data, df], ignore_index=True)
标签:Xpath,xpath,house,price,爬虫,html,text,编写,div From: https://www.cnblogs.com/smith-count/p/17474122.html