1、选题背景
房价迅速飞涨,即将毕业的我们就要面对高昂的房价。虽然我们无法改变房价,但可以对房价进行分析,利用科学的技术手段洞察房价的趋势。本次项目选择了厦门市,通过了解厦门市二手房的情况,可以帮助人们在购房、出租等方面做出更明智的决策,也可以帮助人们了解厦门市经济的发展趋势。随着互联网的发展,越来越多的房地产信息通过网络发布,使用爬虫技术可以方便地收集和分析这些信息。本次项目选择的数据来源是链家。链家是一家著名的房地产经纪公司,在厦门有着广泛的房地产业务。通过爬取厦门链家发布的二手房信息,可以获得丰富的数据,为分析提供参考。
2、数据分析步骤
数据源:链家 网址:https://cm.lianjia.com 以及厦门二手房的数据爬取网址:https://xm.lianjia.com/ershoufang/
2.1数据采集
该部分通过网络爬虫程序抓取链家网上厦门二手房的数据,收集原始数据。程序通过url到指定的网站进行数据爬取,设置了id、小区名(xiaoquming)、价格(jiage)、地区(diqu)、房屋户型(fangwuhuxing)、所在楼层(suozailouceng)、建筑面积(jianzhumianji)等字段,最后通过Save_data()将爬取的数据进行保存。
导入库
import requests,time,csv import pandas as pd from lxml import etree
# Build the URL of every listing page.
def Get_url(url, pages=100):
    """Return the paginated listing URLs derived from ``url``.

    Args:
        url: Base listing URL; ``pg<N>/`` is appended directly to it.
        pages: Number of pages to generate, counted from page 1.
            Defaults to 100, the value that used to be hard-coded.

    Returns:
        A list of ``pages`` strings of the form ``<url>pg<N>/``.
    """
    return [url + 'pg' + str(page) + '/' for page in range(1, pages + 1)]
#获取每套房详情信息的url
# Collect the detail-page URL of every house on each listing page.
def Get_house_url(all_url, headers):
    """Visit every listing page and hand its detail links to Analysis_html.

    Args:
        all_url: Iterable of listing-page URLs.
        headers: HTTP headers sent with every request.
    """
    for page_no, page_url in enumerate(all_url, start=1):
        # A timeout keeps one stalled connection from hanging the whole crawl.
        r = requests.get(page_url, headers=headers, timeout=10)
        html = etree.HTML(r.text)
        # Every <li> of the result list links to one house's detail page.
        url_ls = html.xpath("//ul[@class='sellListContent']/li/a/@href")
        Analysis_html(url_ls, headers)
        time.sleep(4)  # pause between listing pages
        # Report the page number; the previous code interpolated the page
        # URL here and kept a counter that was never read.
        print("第%s页爬完了" % page_no)
获取每套房的详情信息
# Scrape the detail page of every house.
def Analysis_html(url_ls, headers):
    """Fetch each detail page and pass its extracted fields to Save_data.

    Args:
        url_ls: Iterable of detail-page URLs.
        headers: HTTP headers sent with every request.
    """
    for house_url in url_ls:
        r = requests.get(house_url, headers=headers, timeout=10)
        html = etree.HTML(r.text)
        community = html.xpath("//div[@class='communityName']/a/text()")
        name = community[0].split() if community else []  # community name
        if not name:
            # The page lacks the expected markup; skip it instead of
            # aborting the whole crawl with an IndexError.
            print('跳过无法解析的页面: %s' % house_url)
            continue
        money = html.xpath("//span[@class='total']/text()")  # total price
        area = html.xpath("//span[@class='info']/a[1]/text()")  # district
        data = html.xpath("//div[@class='content']/ul/li/text()")  # basic attributes
        Save_data(name, money, area, data)
把爬取的信息存入文件
# Append one scraped record to the CSV file.
def Save_data(name, money, area, data):
    """Append one house record as a row of ``raw_data.csv``.

    Args:
        name: Non-empty list; only its first element is written.
        money: List of price strings, written as individual cells.
        area: List of district strings, written as ONE cell.
        data: List of basic-attribute strings, written as individual cells.
    """
    # ``area`` is deliberately kept as a single cell: csv.writer stringifies
    # the list (e.g. "['思明']") and the cleaning step strips the brackets
    # and quotes again with x[2:-2].
    result = [name[0]] + money + [area] + data
    # The ``with`` block closes the file, so no explicit close() is needed.
    with open(r'raw_data.csv', 'a', encoding='utf_8_sig', newline='') as f:
        wt = csv.writer(f)
        wt.writerow(result)
        print('已写入')


if __name__ == '__main__':
    # Xiamen listings (xm); the previous value pointed at Chongqing (cq),
    # which contradicts the rest of this analysis.
    url = 'https://xm.lianjia.com/ershoufang/'
    headers = {
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0(Windows NT 10.0;Win64;x64) AppleWebKit/537.36(KHTML,like Gecko)Chrome"
                      "/72.0.3626.121 Safari/537.36"
    }
    all_url = Get_url(url)
    with open(r'raw_data.csv', 'a', encoding='utf_8_sig', newline='') as f:
        # Write the header row before any data row.
        table_label = ['小区名', '价格/万', '地区', '房屋户型', '所在楼层', '建筑面积', '户型结构', '套内面积', '建筑类型', '房屋朝向',
                       '建成年代', '装修情况', '建筑结构', '供暖方式']
        wt = csv.writer(f)
        wt.writerow(table_label)
    Get_house_url(all_url, headers)
运行结果:
截取了一部分的数据
2.2数据清理
获取数据、数据清洗、查看表格数据、查看是否缺失、删除重复数据
# Load the scraped data from disk.
def get_data():
    """Read ``raw_data.csv`` and pass the resulting frame to clean_data."""
    # The file is a CSV, so it must be parsed with read_csv; read_excel
    # cannot open it.
    raw_data = pd.read_csv('raw_data.csv', encoding='utf_8_sig')
    # len() counts rows; DataFrame.size would count cells (rows * columns).
    print("数据清洗前共有%s条数据" % len(raw_data))
    clean_data(raw_data)
# Data cleaning.
def clean_data(data, output_path='pure_data.xlsx'):
    """Clean the raw listing table and optionally save it as an Excel file.

    Steps: drop all-empty columns, print summary statistics and per-column
    missing counts, drop duplicate rows, drop ``套内面积`` when more than
    half of its values are ``暂无数据``, convert ``建筑面积`` to float,
    strip the list wrapper from ``地区`` and add a ``单价`` column.

    Args:
        data: DataFrame with at least the columns ``建筑面积``, ``地区``
            and ``价格/万``.
        output_path: Excel file to write. Pass ``None`` to skip writing.

    Returns:
        The cleaned DataFrame.
    """
    data = data.dropna(axis=1, how='all')  # drop columns that are entirely empty

    # Overview of the table and of missing values per column.
    print(data.describe())
    print(data.isnull().sum())

    # Copy after de-duplicating so the assignments below never write to a
    # view of the caller's frame.
    data = data.drop_duplicates(keep='first').copy()

    # Drop the inner-area column when most of it is the "no data" marker.
    if '套内面积' in data.columns:
        if data['套内面积'].isin(['暂无数据']).sum() > len(data.index) / 2:
            del data['套内面积']

    # Remove the unit from the floor area and convert it to float.
    data['建筑面积'] = (
        data['建筑面积'].astype(str).str.replace('㎡', '', regex=False).astype(float)
    )

    # The district was saved as the text of a one-element list, e.g.
    # "['思明']"; cut the leading "['" and the trailing "']".
    data['地区'] = data['地区'].str[2:-2]

    # Unit price = total price (stored in units of 10,000) / floor area.
    data['单价'] = (data['价格/万'] * 10000 / data['建筑面积']).round(2)

    if output_path is not None:
        # to_excel does not accept an ``encoding`` argument in current pandas.
        data.to_excel(output_path)
    return data


if __name__ == '__main__':
    get_data()
数据清洗结果:
现在看比清理前规整了很多,更方便观看。
3、数据可视化分析
该阶段主要是对数据从整体上做一个探索性分析并把数据进行可视化呈现,帮助人们更好、更直观的认识数据,把隐藏在大量数据背后的信息集中和提炼出来,总结出所研究对象的内在规律。主要对二手房房源的总价、单价、面积、户型、地区等属性进行了分析。
3.1首先二手房的数据表展示:
house_list = House.objects.all().order_by('id') input_1 = request.GET.get("searchorders") input_2 = request.GET.get("dqchaxun") if input_1: house_list = House.objects.filter(xiaoquming=input_1) paginator = Paginator(house_list, 20) page = request.GET.get('page') try: data_1 = paginator.page(page) except PageNotAnInteger: data_1 = paginator.page(1) # 输入不是整数返回第一页 except InvalidPage: # 找不到就重定向 return render(request, 'index.html', {'house_list': data_1, 'name': input_1}) except EmptyPage: # 不在合法范围就返回最后一页 data_1 = paginator.page(paginator.num_pages) return render(request, 'index.html', {'house_list': data_1, 'name': input_1})
3.2户型和楼层的分析
# House-type analysis: horizontal bar chart of listings per house type.
type_counts = df['fangwuhuxing'].value_counts().sort_index(ascending=False)
c = (
    Bar(init_opts=opts.InitOpts(theme=ThemeType.CHALK))
    .add_xaxis(type_counts.index.tolist())
    .add_yaxis("厦门市", type_counts.tolist())
    .reversal_axis()  # swap the axes so the bars run horizontally
    .set_series_opts(label_opts=opts.LabelOpts(position="right"))
    .set_global_opts(
        title_opts=opts.TitleOpts(title="厦门二手房各户型横向条形图"),
        datazoom_opts=[
            opts.DataZoomOpts(yaxis_index=0, type_="slider", orient="vertical")
        ],
    )
)
c.render("户型分析-条形图.html")
# Floor analysis: rose-type pie built from the (floor band, count) pairs.
floor_pie = Pie(
    init_opts=opts.InitOpts(width="1600px", height="800px", bg_color="#2c343c")
)
floor_pie.add(
    series_name="层段信息",
    data_pair=data_pair,
    rosetype="radius",
    radius="55%",
    center=["50%", "50%"],
    label_opts=opts.LabelOpts(is_show=False, position="center"),
)
# Centered white title; the legend is hidden.
floor_title = opts.TitleOpts(
    title="Customized Pie",
    pos_left="center",
    pos_top="20",
    title_textstyle_opts=opts.TextStyleOpts(color="#fff"),
)
floor_pie.set_global_opts(
    title_opts=floor_title,
    legend_opts=opts.LegendOpts(is_show=False),
)
# Tooltip template: series name, then "label: value (percent%)".
floor_tooltip = opts.TooltipOpts(
    trigger="item", formatter="{a} <br/>{b}: {c} ({d}%)"
)
floor_pie.set_series_opts(
    tooltip_opts=floor_tooltip,
    label_opts=opts.LabelOpts(color="rgba(255, 255, 255, 0.3)"),
)
3.3价格分析柱状图
from pyecharts import options as opts
from pyecharts.charts import Bar
from pyecharts.commons.utils import JsCode
from pyecharts.globals import ThemeType
import pandas as pd

# Price analysis: per district, count listings below / at-or-above 500
# (prices are in units of 10,000) and draw them as a stacked bar chart.
# The previous try/except retried the identical call, so it is dropped.
df = pd.read_excel(r'pure_data.xlsx')
city_lst = ['思明', '湖里', '集美', '海沧', '同安', '翔安']  # districts shown on the x axis

below = dict.fromkeys(city_lst, 0)        # listings priced under 500
at_or_above = dict.fromkeys(city_lst, 0)  # listings priced at 500 or more

# Coerce unparsable prices to NaN so they can be skipped instead of
# crashing int().
prices = pd.to_numeric(df['jiage'], errors='coerce')
for district, price in zip(df['diqu'], prices):
    if district not in below or pd.isna(price):
        continue
    if price >= 500:
        at_or_above[district] += 1
    else:
        below[district] += 1

data1 = []
data2 = []
for district in city_lst:
    value1 = below[district]
    value2 = at_or_above[district]
    total = value1 + value2
    # A district without listings would otherwise divide by zero.
    share1 = value1 / total if total else 0.0
    share2 = value2 / total if total else 0.0
    data1.append({'value': value1, 'percent': '%.2f' % share1})
    data2.append({'value': value2, 'percent': '%.2f' % share2})
print(data1)
print(data2)

# ``c`` receives whatever render() returns, as before.
c = (
    Bar(init_opts=opts.InitOpts(theme=ThemeType.LIGHT))
    .add_xaxis(city_lst)
    .add_yaxis("500万以下的房子", data1, stack="stack1", category_gap="50%")
    .add_yaxis("500万以上的房子", data2, stack="stack1", category_gap="50%")
    .set_series_opts(
        label_opts=opts.LabelOpts(
            position="right",
            # Show each segment's share as a whole-number percentage.
            formatter=JsCode(
                "function(x){return Number(x.data.percent * 100).toFixed() + '%';}"
            ),
        )
    )
    .render("价格分析-堆叠柱状图.html")
)
可以看出,思明区500万以上的房子占绝大多数(73%),而翔安区500万以上的房子仅占4%。
所以500万以上房子的占比在思明区最高,在翔安区最低。
3.4有无供暖情况
import pandas as pd
from pyecharts.charts import *
from pyecharts import options as opts
# Heating analysis: rose-type pie of listings per heating mode.
# The previous fallback passed ``encoding='gbk'`` to read_excel, which does
# not accept that argument in current pandas, so it is dropped.
df = pd.read_excel(r'pure_data.xlsx')

# Number of rows with a district value for each heating mode.
heating_counts = df.groupby('gongnuanfangshi')['diqu'].count()
# Largest slice first; int() turns numpy integers into plain Python ints.
data_pair = sorted(
    ((mode, int(total)) for mode, total in heating_counts.items()),
    key=lambda pair: pair[1],
    reverse=True,
)

pie = Pie(init_opts=opts.InitOpts(theme='dark'))
pie.add('', data_pair, radius=["30%", "75%"], rosetype="radius")
pie.set_global_opts(
    title_opts=opts.TitleOpts(
        title="厦门二手房供暖情况",
        pos_left="center",
        title_textstyle_opts=opts.TextStyleOpts(color="#000"),
    ),
    legend_opts=opts.LegendOpts(is_show=False),
)
# Label each slice as "name: percent%".
pie.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {d}%"))
pie.render_notebook()
pie.render("玫瑰饼图-有无供暖.html")
可以清晰地看出,67%的房源有供暖,26%的房源没有供暖。
相对而言,大部分房源有供暖。
3.5装修情况
import pandas as pd
from pyecharts.charts import Pie
from pyecharts import options as opts

# Renovation analysis: rose-type pie of listings per renovation state.
# The previous fallback passed ``encoding='gbk'`` to read_excel, which does
# not accept that argument in current pandas, so it is dropped.
df = pd.read_excel(r'pure_data.xlsx')

# NOTE(review): the column key is spelled 'zhaungxiuqingkuang' ("zhaung",
# not "zhuang"); it is kept as-is because it must match the data file.
renovation_counts = df.groupby('zhaungxiuqingkuang')['diqu'].count()
# Sorted by label; int() turns numpy integers into plain Python ints.
data_pair_num = sorted(
    ((state, int(total)) for state, total in renovation_counts.items()),
    key=lambda pair: pair[0],
)

pie = Pie(init_opts=opts.InitOpts(theme='dark'))
pie.add('', data_pair_num, radius=["30%", "75%"], rosetype="radius")
pie.set_global_opts(
    title_opts=opts.TitleOpts(
        title="厦门二手房装修分布",
        pos_left="center",
        title_textstyle_opts=opts.TextStyleOpts(color="#fff"),
    ),
    legend_opts=opts.LegendOpts(is_show=False),
)
# Label each slice as "name: percent%".
pie.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {d}%"))
pie.render("玫瑰饼图-房屋装修情况.html")
pie.render_notebook()
可以明显看出,绝大多数房源是精装修的,毛坯房极少,说明整体装修情况不错。
附完整代码:
# ===== Part 1: scraper -> raw_data.csv =====
import requests
import time
import csv
import pandas as pd
from lxml import etree


# Build the URL of every listing page.
def Get_url(url):
    """Return the URLs of listing pages 1..100 derived from ``url``."""
    return [url + 'pg' + str(page) + '/' for page in range(1, 101)]


# Collect the detail-page URL of every house on each listing page.
def Get_house_url(all_url, headers):
    """Visit every listing page and hand its detail links to Analysis_html."""
    for page_no, page_url in enumerate(all_url, start=1):
        # A timeout keeps one stalled connection from hanging the crawl.
        r = requests.get(page_url, headers=headers, timeout=10)
        html = etree.HTML(r.text)
        url_ls = html.xpath("//ul[@class='sellListContent']/li/a/@href")
        Analysis_html(url_ls, headers)
        time.sleep(4)  # pause between listing pages
        # Report the page number (the listing used to print the URL here).
        print("第%s页爬完了" % page_no)


# Scrape the detail page of every house.
def Analysis_html(url_ls, headers):
    """Fetch each detail page and pass its extracted fields to Save_data."""
    for house_url in url_ls:
        r = requests.get(house_url, headers=headers, timeout=10)
        html = etree.HTML(r.text)
        community = html.xpath("//div[@class='communityName']/a/text()")
        name = community[0].split() if community else []  # community name
        if not name:
            # Expected markup missing: skip instead of raising IndexError.
            print('跳过无法解析的页面: %s' % house_url)
            continue
        money = html.xpath("//span[@class='total']/text()")  # total price
        area = html.xpath("//span[@class='info']/a[1]/text()")  # district
        data = html.xpath("//div[@class='content']/ul/li/text()")  # basic attributes
        Save_data(name, money, area, data)


# Append one scraped record to the CSV file.
def Save_data(name, money, area, data):
    """Append one house record as a row of ``raw_data.csv``."""
    # ``area`` stays a single cell; csv.writer stores the list's text
    # (e.g. "['思明']"), which clean_data later trims with [2:-2].
    result = [name[0]] + money + [area] + data
    # The ``with`` block closes the file; no explicit close() is needed.
    with open(r'raw_data.csv', 'a', encoding='utf_8_sig', newline='') as f:
        wt = csv.writer(f)
        wt.writerow(result)
        print('已写入')


if __name__ == '__main__':
    url = 'https://xm.lianjia.com/ershoufang/'
    headers = {
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0(Windows NT 10.0;Win64;x64) AppleWebKit/537.36(KHTML,like Gecko)Chrome"
                      "/72.0.3626.121 Safari/537.36"
    }
    all_url = Get_url(url)
    with open(r'raw_data.csv', 'a', encoding='utf_8_sig', newline='') as f:
        # Write the header row before any data row.
        table_label = ['小区名', '价格/万', '地区', '房屋户型', '所在楼层', '建筑面积', '户型结构', '套内面积', '建筑类型', '房屋朝向',
                       '建成年代', '装修情况', '建筑结构', '供暖方式']
        wt = csv.writer(f)
        wt.writerow(table_label)
    Get_house_url(all_url, headers)

# ===== Part 2: cleaning -> pure_data.xlsx =====
# encoding: utf-8

import pandas as pd


# Load the scraped data from disk.
def get_data():
    """Read the scraped CSV and pass the resulting frame to clean_data."""
    # Part 1 writes raw_data.csv, so parse it as CSV rather than with
    # read_excel on 'raw_data.xlsx'.
    raw_data = pd.read_csv('raw_data.csv', encoding='utf_8_sig')
    # len() counts rows; DataFrame.size would count cells.
    print("数据清洗前共有%s条数据" % len(raw_data))
    clean_data(raw_data)


# Data cleaning.
def clean_data(data):
    """Clean the raw listing table and save it as ``pure_data.xlsx``."""
    data = data.dropna(axis=1, how='all')  # drop columns that are entirely empty

    # Overview of the table and of missing values per column.
    print(data.describe())
    print(data.isnull().sum())

    # Copy after de-duplicating so later assignments never hit a view.
    data = data.drop_duplicates(keep='first').copy()

    # Drop the inner-area column when most of it is the "no data" marker.
    if '套内面积' in data.columns:
        if data['套内面积'].isin(['暂无数据']).sum() > len(data.index) / 2:
            del data['套内面积']

    # Remove the unit from the floor area and convert it to float.
    data['建筑面积'] = (
        data['建筑面积'].astype(str).str.replace('㎡', '', regex=False).astype(float)
    )

    # Trim the "['" prefix and "']" suffix left by the scraper.
    data['地区'] = data['地区'].str[2:-2]

    # Unit price = total price (stored in units of 10,000) / floor area.
    data['单价'] = (data['价格/万'] * 10000 / data['建筑面积']).round(2)
    # to_excel does not accept an ``encoding`` argument in current pandas.
    data.to_excel('pure_data.xlsx')


if __name__ == '__main__':
    get_data()

# ===== Part 3: price analysis (stacked bar chart) =====
from pyecharts import options as opts
from pyecharts.charts import Bar
from pyecharts.commons.utils import JsCode
from pyecharts.globals import ThemeType
import pandas as pd

# The previous try/except retried the identical call, so it is dropped.
df = pd.read_excel(r'pure_data.xlsx')
city_lst = ['思明', '湖里', '集美', '海沧', '同安', '翔安']  # districts on the x axis

below = dict.fromkeys(city_lst, 0)        # listings priced under 500
at_or_above = dict.fromkeys(city_lst, 0)  # listings priced at 500 or more

# Unparsable prices become NaN and are skipped instead of crashing int().
prices = pd.to_numeric(df['jiage'], errors='coerce')
for district, price in zip(df['diqu'], prices):
    if district not in below or pd.isna(price):
        continue
    if price >= 500:
        at_or_above[district] += 1
    else:
        below[district] += 1

data1 = []
data2 = []
for district in city_lst:
    value1 = below[district]
    value2 = at_or_above[district]
    total = value1 + value2
    # A district without listings would otherwise divide by zero.
    share1 = value1 / total if total else 0.0
    share2 = value2 / total if total else 0.0
    data1.append({'value': value1, 'percent': '%.2f' % share1})
    data2.append({'value': value2, 'percent': '%.2f' % share2})
print(data1)
print(data2)

c = (
    Bar(init_opts=opts.InitOpts(theme=ThemeType.LIGHT))
    .add_xaxis(city_lst)
    .add_yaxis("500万以下的房子", data1, stack="stack1", category_gap="50%")
    .add_yaxis("500万以上的房子", data2, stack="stack1", category_gap="50%")
    .set_series_opts(
        label_opts=opts.LabelOpts(
            position="right",
            # Show each segment's share as a whole-number percentage.
            formatter=JsCode(
                "function(x){return Number(x.data.percent * 100).toFixed() + '%';}"
            ),
        )
    )
    .render("价格分析-堆叠柱状图.html")
)

# ===== Part 4: heating analysis (rose pie) =====
import pandas as pd
from pyecharts.charts import *
from pyecharts import options as opts

# read_excel does not accept ``encoding`` in current pandas, so the old
# gbk fallback is dropped.
df = pd.read_excel(r'pure_data.xlsx')

heating_counts = df.groupby('gongnuanfangshi')['diqu'].count()
# Largest slice first; int() turns numpy integers into plain Python ints.
data_pair = sorted(
    ((mode, int(total)) for mode, total in heating_counts.items()),
    key=lambda pair: pair[1],
    reverse=True,
)
pie = Pie(init_opts=opts.InitOpts(theme='dark'))
pie.add('', data_pair, radius=["30%", "75%"], rosetype="radius")
pie.set_global_opts(
    title_opts=opts.TitleOpts(
        title="厦门二手房供暖情况",
        pos_left="center",
        title_textstyle_opts=opts.TextStyleOpts(color="#000"),
    ),
    legend_opts=opts.LegendOpts(is_show=False),
)
pie.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {d}%"))

pie.render_notebook()
pie.render("玫瑰饼图-有无供暖.html")

# ===== Part 5: renovation analysis (rose pie) =====
import pandas as pd
from pyecharts.charts import *
from pyecharts import options as opts

df = pd.read_excel(r'pure_data.xlsx')

# NOTE(review): the column key is spelled 'zhaungxiuqingkuang'; it is kept
# as-is because it must match the data file.
renovation_counts = df.groupby('zhaungxiuqingkuang')['diqu'].count()
data_pair_num = sorted(
    ((state, int(total)) for state, total in renovation_counts.items()),
    key=lambda pair: pair[0],
)

pie = Pie(init_opts=opts.InitOpts(theme='dark'))
pie.add('', data_pair_num, radius=["30%", "75%"], rosetype="radius")
pie.set_global_opts(
    title_opts=opts.TitleOpts(
        title="厦门二手房装修分布",
        pos_left="center",
        title_textstyle_opts=opts.TextStyleOpts(color="#fff"),
    ),
    legend_opts=opts.LegendOpts(is_show=False),
)
pie.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {d}%"))

pie.render("玫瑰饼图-房屋装修情况.html")
pie.render_notebook()

# ===== Part 6: floor analysis (pie) =====
import pandas as pd
import pyecharts.options as opts
from pyecharts.charts import Pie

df = pd.read_excel(r'pure_data.xlsx')
# Drop rows whose cell contains the column name itself; na=False keeps
# missing cells from breaking the boolean mask.
df.drop(df[df['suozailouceng'].str.contains('suozailouceng', na=False)].index, inplace=True)
data_heigh = df['suozailouceng']
floor_counts = data_heigh.value_counts()  # computed once, reused for both lists
lst_height_value = floor_counts.keys().tolist()
lst_counts = floor_counts.tolist()


def get_pie():
    """Return a rose-type Pie of the floor-band counts, smallest slice first."""
    data_pair = [list(z) for z in zip(lst_height_value, lst_counts)]
    data_pair.sort(key=lambda x: x[1])

    c = (
        Pie(init_opts=opts.InitOpts(width="1600px", height="800px", bg_color="#2c343c"))
        .add(
            series_name="层段信息",
            data_pair=data_pair,
            rosetype="radius",
            radius="55%",
            center=["50%", "50%"],
            label_opts=opts.LabelOpts(is_show=False, position="center"),
        )
        .set_global_opts(
            title_opts=opts.TitleOpts(
                title="Customized Pie",
                pos_left="center",
                pos_top="20",
                title_textstyle_opts=opts.TextStyleOpts(color="#fff"),
            ),
            legend_opts=opts.LegendOpts(is_show=False),
        )
        .set_series_opts(
            tooltip_opts=opts.TooltipOpts(
                trigger="item", formatter="{a} <br/>{b}: {c} ({d}%)"
            ),
            label_opts=opts.LabelOpts(color="rgba(255, 255, 255, 0.3)"),
        )
    )
    return c


if __name__ == '__main__':
    get_pie().render('楼层分析-饼状图.html')

# ===== Part 7: house-structure analysis (pie) =====
import pandas as pd
import pyecharts.options as opts
from pyecharts.charts import Pie

df = pd.read_excel(r'pure_data.xlsx')
# Drop rows whose cell contains the column name itself; na=False keeps
# missing cells from breaking the boolean mask.
df.drop(df[df['huxingjiegou'].str.contains('huxingjiegou', na=False)].index, inplace=True)
data_heigh = df['huxingjiegou']
structure_counts = data_heigh.value_counts()  # computed once, reused for both lists
lst_height_value = structure_counts.keys().tolist()
lst_counts = structure_counts.tolist()


def get_pie():
    """Return a rose-type Pie of the house-structure counts, smallest slice first."""
    data_pair = [list(z) for z in zip(lst_height_value, lst_counts)]
    data_pair.sort(key=lambda x: x[1])

    c = (
        Pie(init_opts=opts.InitOpts(width="1600px", height="800px", bg_color="#2c343c"))
        .add(
            series_name="户型信息",
            data_pair=data_pair,
            rosetype="radius",
            radius="55%",
            center=["50%", "50%"],
            label_opts=opts.LabelOpts(is_show=False, position="center"),
        )
        .set_global_opts(
            title_opts=opts.TitleOpts(
                title="Customized Pie",
                pos_left="center",
                pos_top="20",
                title_textstyle_opts=opts.TextStyleOpts(color="#fff"),
            ),
            legend_opts=opts.LegendOpts(is_show=False),
        )
        .set_series_opts(
            tooltip_opts=opts.TooltipOpts(
                trigger="item", formatter="{a} <br/>{b}: {c} ({d}%)"
            ),
            label_opts=opts.LabelOpts(color="rgba(255, 255, 255, 0.3)"),
        )
    )
    return c


if __name__ == '__main__':
    get_pie().render('户型结构-饼状图.html')
4.总结
感谢我们专业课老师的悉心教导,是他在我们感到困惑的时候一遍又一遍给我们耐心的讲解,这让我们学到很多专业知识。
此项目采用Python爬虫收集了厦门二手房的信息,对获取的数据进行预处理后再进行分析,可以大致了解厦门二手房的情况。但在爬取过程中还是遇到了一些问题:首先,这个网站只能爬取100页的数据;其次,如果列表页url或详情页地址收集不全,就只能得到部分数据;另外,读取文件时因为文件路径不对而出错,最终找到正确路径并修改后才解决。这次的项目还存在很多不足的地方。
标签:数据分析,index,url,二手房,可视化,df,import,data,opts From: https://www.cnblogs.com/ly-21/p/17706414.html