代码结构
1.导入库
点击查看代码
import csv
import json
import random
import time

import parsel
import requests
点击查看代码
# Rows collected by the crawl; each entry is one listing and the whole list
# is handed to load_csv() at the end of the script.
detail_list = list()

# HTTP headers passed to every requests.get() call in this file; only a
# desktop-browser User-Agent is set.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36',
}

# The value typed here is interpolated as the sub-domain of lianjia.com
# (https://{city}.lianjia.com/...).
city = input("所在城市首字母缩写 或者全拼:")
3.获取城市各区域的url
点击查看代码
def get_city_region_url(city):
    """Return the listing URL and display name of every region of a city.

    Fetches ``https://{city}.lianjia.com/ershoufang/`` and reads the region
    links out of the page's filter bar.

    Args:
        city: Sub-domain of lianjia.com identifying the city.

    Returns:
        dict mapping each region link's ``href`` (as found in the page) to the
        region name shown as the link text. Empty when the filter bar is not
        found at the expected position.

    Raises:
        requests.RequestException: if the page cannot be fetched (including
            a timeout).
    """
    url = f"https://{city}.lianjia.com/ershoufang/"
    # A timeout keeps a stalled connection from hanging the script forever.
    response = requests.get(url=url, headers=header, timeout=10)
    selector = parsel.Selector(response.text)
    # NOTE(review): this positional selector depends on the exact page layout
    # and will silently match nothing if the site changes its markup.
    anchors = selector.css(
        'body > div:nth-child(12) > div > div.position > dl:nth-child(2)'
        ' > dd > div:nth-child(1) > div > a'
    )
    region = {}
    for anchor in anchors:
        # Read href and text from the SAME <a> element so a link that lacks
        # one of them cannot shift the pairing of all following regions.
        href = anchor.xpath('./@href').get()
        name = anchor.xpath('./text()').get()
        if href and name:
            region[href] = name
    return region
点击查看代码
def _parse_total_page(page_data):
    """Return the page count stored in a ``page-data`` attribute value.

    ``page_data`` is the raw attribute string (expected to be JSON text with
    a ``totalPage`` key) or ``None`` when the pagination element is missing.
    Falls back to 1 so the first result page is still visited when the value
    is absent or malformed.
    """
    if not page_data:
        return 1
    try:
        # The text comes from a downloaded web page, so it is parsed as data
        # with json.loads rather than executed with eval().
        return int(json.loads(page_data)['totalPage'])
    except (ValueError, KeyError, TypeError):
        return 1


def get_city_region(city):
    """Crawl the second-hand listings of every region of ``city``.

    For each region returned by ``get_city_region_url`` every result page
    (1 .. totalPage inclusive) is visited and every listing's detail page is
    fetched. One row
    ``[region name, title, total price, <up to 11 attribute values>, link]``
    is appended to the module-level ``detail_list`` per listing.

    The attribute labels of the most recent non-villa listing are stored in
    the module-level ``attr_list`` (used by ``load_csv`` for the header).
    Listings whose last label is ``别墅类型`` are skipped: their attribute set
    differs from regular listings, so their values would not line up with
    the header columns.

    Args:
        city: Sub-domain of lianjia.com identifying the city.

    Returns:
        None. Results are accumulated in ``detail_list`` / ``attr_list``.

    Raises:
        requests.RequestException: if any page cannot be fetched (including
            a timeout).
    """
    global attr_list
    region = get_city_region_url(city)
    for region_url, region_name in region.items():
        # Strip the leading '/' of the scraped href to avoid 'host//path'.
        url = f"https://{city}.lianjia.com/{region_url.lstrip('/')}"
        print(f"正在爬取{region_name}的二手房信息")
        response = requests.get(url=url, headers=header, timeout=10)
        selector = parsel.Selector(response.text)
        # Highest result-page number available for this region.
        max_page = _parse_total_page(
            selector.css(
                '.contentBottom .page-box .house-lst-page-box::attr(page-data)'
            ).get()
        )
        # Inclusive upper bound: the last page is crawled too, and a region
        # with a single page terminates after that page.
        for i in range(1, max_page + 1):
            url_page = url + 'pg' + str(i)
            response = requests.get(url=url_page, headers=header, timeout=10)
            selector = parsel.Selector(response.text)
            # Links to the detail page of every listing on this result page.
            href = selector.css('.sellListContent li .title a::attr(href)').getall()
            for link in href:
                # Small random pause between detail requests.
                time.sleep(random.random() * 0.05)
                response_1 = requests.get(url=link, headers=header, timeout=10)
                selector_1 = parsel.Selector(response_1.text)
                # Attribute labels of this listing (first 11 only).
                labels = selector_1.css('.base .content .label::text').getall()[:11]
                if labels and labels[-1] == '别墅类型':
                    # Villa page: skip before it is recorded or allowed to
                    # overwrite the header labels.
                    continue
                if labels:
                    attr_list = labels
                content_list = [
                    region_name,
                    selector_1.css('.title .main::text').get(),   # title
                    selector_1.css('.price .total::text').get(),  # total price
                ]
                # Attribute values, in the same order as the labels.
                content_list.extend(
                    selector_1.css('.base .content li::text').getall()[:11]
                )
                content_list.append(link)
                print(content_list)
                print(labels)
                detail_list.append(content_list)
            print(f"已经爬取完第{i}页内容")
点击查看代码
def load_csv(data_list):
    """Write the crawled rows to ``qingdao2房源.csv`` (UTF-8, overwritten).

    The header is ``所在区, 标题, 售价``, followed by the attribute labels held
    in the module-level ``attr_list`` (set by ``get_city_region``), followed
    by ``详情链接`` for the link that ends every row.

    Args:
        data_list: Iterable of rows; each row is a sequence of cell values in
            the order region, title, price, attribute values..., link.

    Returns:
        None.
    """
    # attr_list only exists after a crawl has parsed at least one listing;
    # fall back to no attribute columns instead of raising NameError.
    attribute_labels = globals().get('attr_list') or []
    header_row = ['所在区', '标题', '售价', *attribute_labels, '详情链接']
    with open('qingdao2房源.csv', mode='w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(header_row)
        writer.writerows(data_list)
    print('保存成功')
点击查看代码
# Script entry: crawl every region of the chosen city; rows are appended to
# the module-level detail_list as a side effect.
get_city_region(city)
# Dump everything collected above to the CSV file.
load_csv(detail_list)