# Scrape second-hand house listings for the Fuzhou area from sofang.com using
# requests and XPath. Requirement: cover every result page, and for each listing
# capture title, estate, address, longitude/latitude, area, layout, floor,
# orientation, construction year, unit price, total price, agent, and phone
# number (leave fields blank when data is missing).
import requests
from lxml import etree
from fake_useragent import UserAgent
import cchardet
import re
import csv
import logging
import time
import os
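# Third-party dependencies: pip install requests lxml fake-useragent cchardet
# (the alternative script further down additionally needs xlsxwriter)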
# Build request headers with a random User-Agent from fake_useragent
headers = {
    'user-agent': UserAgent().ie,  # .ie picks a random Internet Explorer UA string
}
# Placeholder for proxy support (currently unused); http://www.66ip.cn/ lists free proxies
def getProxies():
    proxy = {
        'https': '223.241.77.45:3256'
    }
    return proxy
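# If the proxy were wired in, the dict above could be passed straight to
# requests (a sketch; the proxy address comes from the original source and is
# likely stale):
#     requests.get(url, headers=headers, proxies=getProxies(), timeout=10)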
# Fetch a page and return its parsed HTML tree
def request_url(url):
    req = requests.get(url, headers=headers, timeout=10)
    req.encoding = cchardet.detect(req.content)['encoding']
    source = etree.HTML(req.text)
    return source
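# Optional: a small retry wrapper around request_url (a sketch; the retry count
# and back-off delay are assumptions, not part of the original code).
def request_url_with_retry(url, retries=3, delay=5):
    for attempt in range(retries):
        try:
            return request_url(url)
        except requests.RequestException:
            if attempt == retries - 1:
                raise
            time.sleep(delay)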
# Parse one listings page; every field below is plain XPath over the already
# fetched tree, so no throttling is needed between extractions
def get_condition(source):
    # 1. Title
    title_list = source.xpath('//p[@class="name"]/a/text()')
    # 2. Estate name
    building_list = source.xpath('//strong/text()')
    # 3. Address
    add_init = source.xpath("//span[@title]")
    add_list = [add.xpath("text()")[0] for add in add_init]
    # Strip whitespace/newlines and drop empty entries
    add_list = [i.strip() for i in add_list if i.strip() != '']
    # (Coordinates could also be resolved from the address via the Baidu API:)
    # for add in add_list:
    #     locate = getlocation_from_api(add)
    #     print(locate)
    # The map link that follows each address embeds the coordinates
    map_url = [''.join(i.xpath("following-sibling::*[1]/@href")) for i in add_init]
    # 4. Longitude/latitude, parsed out of the map link's query string
    locate_list = [",".join(re.search("longitude=(.+)&latitude=(.+)", i, re.S).groups())
                   if i != '' else "" for i in map_url]
    # 5. Area
    area_list = source.xpath('//p[@class="type clearfix"]/span[1]/text()')
    # 6. Layout
    layout_list = source.xpath('//p[@class="type clearfix"]/span[3]/text()')
    layout_list = [i.replace("\r\n", "").replace(" ", "") for i in layout_list]
    # 7. Floor
    stories_list = source.xpath('//p[@class="type clearfix"]/span[5]/text()')
    # 8. Orientation
    direction_list = source.xpath('//p[@class="type clearfix"]/span[7]/text()')
    # 9. Construction year
    year_list = source.xpath('//p[@class="type clearfix"]/span[10]/text()')
    # 10. Unit price
    uprice_list = source.xpath('//dd[@class="house_price"]/p[2]/text()')
    uprice_list = [i.replace("\r\n", "").replace(" ", "") for i in uprice_list]
    # 11. Total price (the site shows the bare number, so append 万)
    tprice_list = source.xpath('//dd[@class="house_price"]/p[1]/span/text()')
    tprice_list = [i + "万" for i in tprice_list]
    # 12. Agent
    agent_list = source.xpath('//a[@class="broker_name"]/text()')
    agent_list = [i.strip() for i in agent_list if i.strip() != '']
    # 13. Phone number: fetch each broker's page and read the number from it
    tel_url = source.xpath('//a[@class="broker_name"]/@href')
    tel_list = []
    for url in tel_url:
        url = "https://fz.sofang.com/" + url
        try:
            content = request_url(url)
            tel = content.xpath('//div[@class="broker_tel"]/text()')
            tel = [i.strip() for i in tel if i.strip() != '']
            tel_list.append(''.join(tel))
        except Exception:
            tel_list.append("")
        time.sleep(2)  # throttle the per-broker requests
    # 14. Tags, joined with "/"
    tag = source.xpath('//p[@class="tag clearfix"]')
    tag_list = ["/".join(i.xpath('span/text()')) for i in tag]
    # Assemble one row per listing and append it to the CSV.
    # NOTE: zip() truncates to the shortest list, so a listing with a missing
    # field shifts every later row; the per-listing parser in the alternative
    # version below avoids this.
    for row in zip(title_list, building_list, add_list, locate_list, area_list,
                   layout_list, stories_list, direction_list, year_list, uprice_list,
                   tprice_list, agent_list, tel_list, tag_list):
        data = list(row)
        print(data)
        save(data)
# Append one row to the CSV
def save(data):
    with open("搜房网-福州房价.csv", "a", encoding="utf-8-sig", newline="") as f:
        csv.writer(f).writerow(data)
# Resolve an address to coordinates via the Baidu Maps geocoding API
def getlocation_from_api(address):
    bdurl = "https://api.map.baidu.com/geocoding/v3/?"
    params = {
        'address': address,
        'output': 'json',
        'ak': "UYKdveDmML50ykiqRIFfLekfCWcgeB4r",
        'city': '福州'
    }
    req = requests.get(bdurl, headers=headers, params=params)
    location = req.json()['result']['location']
    # Longitude first, then latitude
    return ','.join([str(location['lng']), str(location['lat'])])
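# Usage sketch (the address here is hypothetical, and the call assumes the AK
# above is still valid):
#     getlocation_from_api("福州市鼓楼区东街口")  ->  "119.xx,26.xx"
# A failed geocode returns JSON without a 'result' key, so a defensive caller
# should check that the response's 'status' field is 0 before indexing.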
if __name__ == '__main__':
    base_url = "https://fz.sofang.com/esfsale/area/"
    source = request_url(base_url)
    # The pager's third-from-last link text is taken as the total page count
    page = source.xpath('//li/a[@alt]/text()')[-3]
    # Create the CSV with a header row on first run
    if not os.path.exists("搜房网-福州房价.csv"):
        with open("搜房网-福州房价.csv", "w", encoding="utf-8-sig", newline="") as f:
            csv.writer(f).writerow(['标题', '楼盘', '地址', '经纬度', '面积', '房型', '楼层',
                                    '朝向', '建筑年代', '单价', '总价', '经纪人', '联系电话', '标签'])
    for i in range(1, int(page) + 1):
        url = base_url + "bl" + str(i) + "?"
        print(url)
        content = request_url(url)
        get_condition(content)
        time.sleep(10)  # pace the page requests
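# ---------------------------------------------------------------------------
# Alternative implementation: the same scraper rewritten around lxml.html,
# following the "下一页" (next page) link instead of pre-computing a page
# count, caching broker phone numbers, and saving to an .xlsx workbook.
# ---------------------------------------------------------------------------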
import requests
import cchardet
from lxml import html
import re
import xlsxwriter
import time
import random
# Request headers; the cookie was captured from a logged-in browser session and
# may need refreshing if the site starts rejecting requests
head = {
'cookie': 'UM_distinctid=17c7f7b1d8b7c1-018dda850a8de5-b7a1b38-1fa400-17c7f7b1d8ccca; uniqueName=e1aac984e2899a3acdb3ec3f75190084; cityNum=28; Hm_lvt_bc2dcba98591e113d6329631910c965b=1634734120,1634791780,1634901813,1634905507; Hm_lvt_d2801fc638056c1aac7e8008f41cf828=1634280948,1634737369,1634798622,1634905516; Hm_lpvt_d2801fc638056c1aac7e8008f41cf828=1634905551; remember_82e5d2c56bdd0811318f0cf078b78bfc=eyJpdiI6Ild4S0I2ZVJPMjFLQkw3V1l1UTF6Mmc9PSIsInZhbHVlIjoiKytpb3NNTVRlV3Ntcjh2Vk9rSHB4QzFZNVRPTjNZWmF3UUNYdENESFwvWTA1dEw3SHlkaHYzXC9jb2lBZEVSelowaFNUQkdqS3Q1MWVsWTlnVVIxV29rY04rWThpdW1ZeUh6SnRYZzE3ZjNEZz0iLCJtYWMiOiJiOGFlZDZiNWQwNWMyNjBkMjhlMmQwZDMyMTU3YzQwYzM4YzdhYjcxM2Q5ODAyYzgyZWM3MzYzNWJlNjg2ZjhhIn0%3D; codeNum=eyJpdiI6IjVqSUJSNDlQVE1jaGRoN3NqemtnSlE9PSIsInZhbHVlIjoiS25vdzVGNjNLcVVNKzlQTVN2YSt0aW12VjR5K1A5U0lTRFUya2hndnZGOFlKZmdlaVFlTW1BTndoYTE0TncwRCIsIm1hYyI6IjJjYzBjYWU0YmYxNjJkOGFmZDUwOTg1OWNjNzBiNDY0ZjczNTBlMTY5ZTlhMGVlMDc0YWZmY2Q2MzI0NjkxYzEifQ%3D%3D; CNZZDATA1262285598=1501788919-1634225051-https%253A%252F%252Fwww.baidu.com%252F%7C1634907133; Hm_lpvt_bc2dcba98591e113d6329631910c965b=1634912068; XSRF-TOKEN=eyJpdiI6IlpMMVZ0VWFGYXUrUVMwclJ1a0phQXc9PSIsInZhbHVlIjoiM0dtWGZrSGJLN2o1bitZdWVINnZJWTNWWUhjc1N4Yk5pUjd1UkpaSnM3d2xLMTErTUZJVzcrZkdUdUprdFFmTVNwQloyZEZZSGdsYzkrK3hzV1RcL1pnPT0iLCJtYWMiOiIwNDRkNmQ3MGU3MGY4NzQ0NDFjMjUyNWVlYjgxZGIwNThkNWU3YTdlZDhhMWU0MzkxZTA1YWQ4MTYxMDAzNjkzIn0%3D; www_sofang_session=eyJpdiI6Ik5UclBnbnBWbnE1OWxQV3FlXC94dU53PT0iLCJ2YWx1ZSI6IjFlMVUwUGZsMDFHRUl3V0p5VlZcL2hSakVnSjFqTllPa0RYZE95Wk95NElFcW53TDk3eldCVUVcL2hZMUgxaEhUaVwvRGI0eHc1MW5FcW8zaUEzdFhiVDBRPT0iLCJtYWMiOiI4NGRhZWZkZGEwZmViYmQ1NTE3MTc4MzJiYTNlNmFhYmY2NGZjMTFhYjkyZGIyNTFjY2Q3MzU2YzgyM2Y4ZDNiIn0%3D; cityid=eyJpdiI6InF3VlV2WXY0YXowVmh5OUNCQnRMQUE9PSIsInZhbHVlIjoiVmtLWTRiXC9QMk9OWWZJUFpJbHJ4NHc9PSIsIm1hYyI6IjQ3NGQ3YzY4ZjQ3ZjVhNTMyNTJmODQ2ZTljYTA0NTY3YTRhYTNiODc0NWNlNTgyZmViMmMyZWRiYjM5OTYwY2IifQ%3D%3D; city=eyJpdiI6IlM4ZUtzcWt4QnY2RHppaTllaFFFTmc9PSIsInZhbHVlIjoiUndaXC92OFdycENKNW1lU01SUUZEdDZLQm1zMjJieVdJOWhJb25vN3FpeWZoV0xpWXBXTDdkejQ3b05cL1pTNUk5ME5HbXl3TWZkUWF6cW9icEZaS05kUkgrOFJlMTdaQmFxZ1lwdWx0elcxXC9OcTNFRGtGZm5zcnBlaFBCXC91ZzJ1RjNDY29WaHNGZWJMUGJHUDkzSGJiYnNoWWFXVXhaN09kOEZVVFVkVm50SjZLK0hVMVhYOGVweXY1NkxHbXhrXC9uQktIc3FJUnpvZFwvXC9JWTBcL3Y2ZlJBPT0iLCJtYWMiOiI0NTM3MDU4ODA2YzAzYWRhOTQ3YjhlOTZlOGU4ZWIyMGU4NTRkMTk4ZWM4MTEzODZhNmJmZmNlNDNmYTgxOGRjIn0%3D; citypy=eyJpdiI6ImJrZHhySjNTKzhONmpaeEpHK0RHRHc9PSIsInZhbHVlIjoiZmd2Q051VGpTZEhNTFBiM1FpbHRCZz09IiwibWFjIjoiYmUxNTc3NWMyYTg5ZTA3M2U3NWQ3OTIyZTkwNmE3MzJmNDQ3OTJhMzM1MjY4N2Q0NTc0NzEyMTExY2YzOGQwZSJ9; cityId=eyJpdiI6IkZaNjU4QnU4TXNibldnYWEwenpZSEE9PSIsInZhbHVlIjoidVZBVFhnVE1QamZrY1o0ZWNqTnVnUT09IiwibWFjIjoiYzk2NjRmNjZmZjY5MWVkZTY0OGVjOWM2ZGI4YzJkYjExMjJmMzE5Y2NhMmJjYmE3YmVkYzQ5Njc2MGRiZDdhNCJ9',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36'
}
# Fetch a page and return its decoded HTML text
def get_source(url):
    resp = requests.get(url, headers=head, timeout=10)
    resp.encoding = cchardet.detect(resp.content)['encoding']
    return resp.text
# Return the index of the entry whose broker URL matches, or -1 if absent
# (acts as a cache so each broker page is fetched only once)
def is_url_in_list(url, house_info_list):
    for i, house_info in enumerate(house_info_list):
        if url == house_info[12]:
            return i
    return -1
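# A dict keyed on broker_url would make this lookup O(1); a sketch of the same
# cache idea (not wired in below, where the tuple layout is kept as-is):
#     broker_tel_cache = {}                       # broker_url -> tel
#     tel = broker_tel_cache.get(broker_url)      # replaces is_url_in_list()
#     broker_tel_cache[broker_url] = broker_tel   # record after each fetch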
# Parse every listing on one result page and append a tuple per listing
def get_data_secondhand_house_onepage(source, house_info_list):
    root = html.fromstring(source)
    house_resources_list = root.xpath('//div[@class="list_l"]/div/dl')
    for house_resource in house_resources_list:
title = house_resource.xpath('dd[@class="house_msg"]/p[@class="name"]/a/text()')[0]
estate = house_resource.xpath('dd[@class="house_msg"]/div[starts-with(@class,"house_info")]/p[starts-with(@class,"area")]//strong/text()')[0]
address = house_resource.xpath('dd[@class="house_msg"]//span[@class="address"]/text()')[0].strip()
        # Coordinates are embedded in the map link's query string
        longitude_latitude = house_resource.xpath('dd[@class="house_msg"]/div[starts-with(@class,"house_info")]/p[starts-with(@class,"area")]/a[@target="_blank"]/@href')
if longitude_latitude:
longitude_latitude = re.search(r'longitude=(.*?)&latitude=(.*)', longitude_latitude[0], re.S)
longitude_latitude = '{},{}'.format(longitude_latitude.group(1), longitude_latitude.group(2))
else:
longitude_latitude = ''
house_type = house_resource.xpath('dd[@class="house_msg"]/div[starts-with(@class,"house_info")]/p[starts-with(@class,"type")]')[0]
area = house_type.xpath('span[1]/text()')[0]
        layout = re.sub(r'\s', '', house_type.xpath('span[3]/text()')[0])
storey = house_type.xpath('span[5]/text()')[0]
orientations = house_type.xpath('span[7]/text()')[0]
construction_time = house_type.xpath('span[10]')
if construction_time:
construction_time = construction_time[0].xpath('text()')[0]
else:
construction_time = ''
price = house_resource.xpath('dd[@class="house_price"]/p')[0].xpath('string(.)').strip()
unit_price = house_resource.xpath('dd[@class="house_price"]/p[@class="junjia"]/text()')[0].strip()
broker_a_tag = house_resource.xpath('dd[@class="house_msg"]/div[starts-with(@class,"house_info")]/p[starts-with(@class,"tag")]/a[@class="broker_name"]')[0]
broker_url = 'https://fz.sofang.com' + broker_a_tag.xpath('@href')[0]
broker_name = broker_a_tag.xpath('string(.)').strip()
pos_in_house_info_list = is_url_in_list(broker_url, house_info_list)
if pos_in_house_info_list != -1:
broker_tel = house_info_list[pos_in_house_info_list][13]
else:
            time.sleep(random.randint(1, 3))  # throttle before hitting the broker page
            broker_root = html.fromstring(get_source(broker_url))
            tel_nodes = broker_root.xpath('//div[@class="broker_tel"]')
            broker_tel = tel_nodes[0].xpath('string(.)').strip() if tel_nodes else ''
print('标题:{}\n楼盘:{}\n地点:{}\n经纬度:{}\n面积:{}\n房型:{}\n楼层:{}\n朝向:{}\n建筑年份:{}\n总价:{}\n单价:{}\n经纪人:{}\n经纪人url:{}\n电话:{}\n'
.format(title,estate,address,longitude_latitude,area,layout,storey,orientations,construction_time,price,unit_price,broker_name,broker_url,broker_tel))
house_info_list.append((title, estate, address, longitude_latitude, area, layout, storey, orientations,
construction_time, price, unit_price, broker_name, broker_url, broker_tel))
# Return the absolute URL of the next result page, or None on the last page
def get_next_url(source):
    root = html.fromstring(source)
    url_next_page = root.xpath('//a[contains(text(),"下一页")]/@href')
    if url_next_page:
        return 'https://fz.sofang.com' + url_next_page[0]
    return None
# Write all collected rows to an .xlsx workbook
def save_data(house_info_list):
workbook = xlsxwriter.Workbook('soufang.xlsx')
worksheet = workbook.add_worksheet()
# worksheet.set_column('A:A', 20)
# worksheet.set_column('B:B', 10)
    # Header row
    for col, name in enumerate(['标题', '楼盘', '地点', '经纬度', '面积', '房型', '楼层',
                                '朝向', '建筑年份', '总价', '单价', '经纪人', '经纪人url', '电话']):
        worksheet.write(0, col, name)
    # Data rows start on sheet row 2 (index 1)
    for i, house_info in enumerate(house_info_list, start=1):
        for j, data in enumerate(house_info):
            worksheet.write(i, j, data)
workbook.close()
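# Note: xlsxwriter always creates a fresh file rather than appending, so each
# run overwrites soufang.xlsx; collect every page in memory first, as the main
# block below does.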
if __name__ == '__main__':
house_info_list = []
i = 1
url = 'https://fz.sofang.com/esfsale/area'
while True:
        # Fetch the listings page. Important: the site's anti-scraping check
        # inspects the Referer header, so set it to the expected pager URL.
        head['referer'] = 'https://fz.sofang.com/esfsale/area/bl{}?'.format(i)
source = get_source(url)
        # Extract the wanted fields from the page source
get_data_secondhand_house_onepage(source, house_info_list)
time.sleep(random.randint(1, 3))
print(f'已提取第{i}页的数据\n')
url = get_next_url(source)
if not url:
break
i += 1
save_data(house_info_list)