(一)、选题的背景
为什么要选择此选题?要达到的数据分析目标是什么?从社会、经济、技术、数据来源等方面进行描述(200 字以内)(10 分)
百合网是一家提供网上实名交友和婚恋服务的网站。其目标是集各地优势,合百家文化,共建全国较大的开放式平台,为中国单身男女提供专业的婚恋咨询、婚恋指导和婚恋匹配;它让想相亲的人足不出户,在不浪费时间和路费的情况下就能获得相亲的环境和选择。
目标:对使用人群的数据进行统计分析,从中找出重点服务人群,以便提供相应的服务,提高用户的满意度和选择率。
(二)、大数据分析设计方案(10 分)
使用爬虫方法爬取网站中用户给出的条件要求等数据,具体内容包括年龄、学历、身高等多项数据。分三步:数据爬取、数据清洗、可视化。
(三)、数据分析步骤(70 分)
1.数据源
百合网的官网
2.数据爬取
import json
import re
import time
import warnings

import pandas as pd
import requests
from lxml import etree

warnings.filterwarnings("ignore")

# Request headers that make the scraper look like a regular browser.
HEADERS = {
    # Fill in the Cookie of your own logged-in session.
    'Cookie': '',
    'Origin': 'https://search.baihe.com',
    'Pragma': 'no-cache',
    'Referer': 'https://search.baihe.com/',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36',
}
# Earlier variant of the search endpoint; turned out not to be needed.
temp_url = 'https://search.baihe.com/Search/getUserID?&jsonCallBack?'
# Search home page.
tql_url = 'https://search.baihe.com/'
# Prefix of a profile page URL; the user id is appended to it.
info_url = 'https://profile1.baihe.com/?oppID='
# JSONP endpoint that returns the user ids of one result page.
url_base = 'https://search.baihe.com/Search/getUserID?&jsonCallBack=jQuery1830923921797491073_1594465799055'
session = requests.Session()
# Intended to disable persistent connections.
# NOTE(review): requests.Session does not document a `keep_alive` attribute,
# so this assignment probably has no effect; send a 'Connection: close'
# header instead if that behaviour is really required — confirm.
session.keep_alive = False
# Visit the home page first so the session picks up its cookies.
session.get(url=tql_url, headers=HEADERS)
# (CSV column, XPath) of every scraped field, in the column order written to
# baihe.csv: the user's own attributes first, then the wanted partner's.
PROFILE_FIELDS = [
    ("年龄", '//*[@id="matching_detail"]/div/div/dl[2]/dd[1]/text()'),
    ("身高", '//*[@id="matching_detail"]/div/div/dl[3]/dd[1]/text()'),
    ("学历", '//*[@id="matching_detail"]/div/div/dl[4]/dd[1]/text()'),
    ("薪资", '//*[@id="matching_detail"]/div/div/dl[5]/dd[1]/text()'),
    ("家乡", '//*[@id="matching_detail"]/div/div/dl[8]/dd[1]/text()'),
    ("婚姻", '//*[@id="matching_detail"]/div/div/dl[6]/dd[1]/text()'),
    ("住房情况", '//*[@id="matching_detail"]/div/div/dl[7]/dd[1]/text()'),
    ("自我介绍", '//*[@id="profileCommon"]/div[1]/div[2]/div[1]/text()'),
    ("对象年龄", '//*[@id="matching_detail"]/div/div/dl[2]/dd[3]/text()'),
    ("对象身高", '//*[@id="matching_detail"]/div/div/dl[3]/dd[3]/text()'),
    ("对象学历", '//*[@id="matching_detail"]/div/div/dl[4]/dd[3]/text()'),
    ("对象薪资", '//*[@id="matching_detail"]/div/div/dl[5]/dd[3]/text()'),
    ("对象家乡", '//*[@id="matching_detail"]/div/div/dl[8]/dd[3]/text()'),
    ("对象婚姻", '//*[@id="matching_detail"]/div/div/dl[6]/dd[3]/text()'),
    ("对象住房情况", '//*[@id="matching_detail"]/div/div/dl[7]/dd[3]/text()'),
]
# The endpoint answers with JSONP: `callback({...});`. The parentheses are
# escaped so that group(1) is only the JSON payload between them.
JSONP_PATTERN = re.compile(r'jQuery1830923921797491073_1594465799055\((.*)\);')

# Result pages to crawl.
for page in range(2, 66):
    print(f'正在爬取第{page}页数据......')
    data = {
        'page': page
    }
    url_tails = session.post(url=url_base, headers=HEADERS, data=data).content.decode('utf-8')
    # Pull the plain JSON text out of the JSONP wrapper.
    json_data = JSONP_PATTERN.search(url_tails)
    if json_data is None:
        # Unexpected response (e.g. blocked or logged out): skip this page
        # instead of crashing on `None.group`.
        print(f'第{page}页返回内容无法解析,已跳过')
        continue
    json_data_format = json.loads(json_data.group(1))
    data_uid = json_data_format['data']

    # One complete row per profile. Collecting whole rows (instead of one
    # list per field) keeps all columns the same length even when a profile
    # is missing a field.
    rows = []
    for uid in data_uid:
        # Build the full profile-page URL.
        me_url = f'{info_url}{uid}'
        # NOTE(review): no explicit response encoding is set, so requests
        # uses the charset it detects — confirm that matches the page.
        me_page = session.get(me_url, headers=HEADERS).text
        # Drop characters GBK cannot represent so the CSV (written as GBK)
        # can always be encoded.
        me_page = me_page.encode('GBK', 'ignore').decode('GBk')
        tree = etree.HTML(me_page)
        try:
            row = [tree.xpath(xpath)[0] for _, xpath in PROFILE_FIELDS]
        except IndexError:
            # At least one field is absent on this page: skip the profile.
            print(f'{uid},页面字段缺失,已跳过')
        else:
            rows.append(row)
            print(f'{uid},已经爬取成功!')
        time.sleep(0.5)

    df = pd.DataFrame(rows, columns=[column for column, _ in PROFILE_FIELDS])
    # Append this page's rows to the CSV (no header, no index column).
    try:
        df.to_csv("baihe.csv", mode="a+", header=False, index=False, encoding="gbk")
    except (OSError, UnicodeError):
        print('保存数据异常')
    # Slow down to avoid getting the IP banned.
    time.sleep(1)
    print(f'第{page}页数据爬取完毕,即将爬取下一页......')
    time.sleep(10)
print("数据爬取完毕,离脱单不远啦!")
3.数据清洗
import re

import jieba
import numpy as np
import pandas as pd

# Load the scraped rows; the CSV has no header row.
# NOTE(review): this reads "sample.csv" while the scraper writes
# "baihe.csv" — confirm which file is intended.
df = pd.read_csv("sample.csv", encoding="gbk", header=None)
df.head()
设置行列索引
# Give the rows a plain 0..n-1 index.
df.index = range(len(df))
# Name the 15 columns: the user's own attributes, then the wanted partner's.
df.columns = ['年龄', '身高', '学历', '工资', '家乡', '婚姻', '住房', '自我介绍', '对象年龄', '对象身高', '对象学历', '对象薪水', '对象家乡', '对象婚姻', '对象住房']
df.head()
查看是否有空值
# Per-column flag: True when the column holds at least one missing value.
df.isna().any(axis=0)
去重
print('去重前数据量:', df.shape)
# Remove exact duplicate rows in place.
df.drop_duplicates(inplace=True)
print('去重后数据量:', df.shape)
把年龄中的“岁”去掉
# Keep the first two characters of the age text to drop the "岁" suffix.
# NOTE(review): assumes every age has exactly two digits — confirm.
df['年龄'] = df['年龄'].str[0:2]
df.head()
把身高中的“cm”去掉,以便后续处理
# Keep the first three characters of the height text to drop the "cm" suffix.
# NOTE(review): assumes every height has exactly three digits — confirm.
df['身高'] = df['身高'].str[0:3]
df.head()
# Count the rows for each home region.
df.groupby(by='家乡').size()
家乡 Baden-Wurttemberg 1 Basse-Normandie 1 Bayern 1 England 1 Ontario 3 Stockholm 1 Waikato 1 上海市 82 云南省 72 以后告诉你 206 内蒙古自治区 131 加拿大 1 北京市 100 台湾省 149 吉林省 109 四川省 45 天津市 43 宁夏回族自治区 28 安徽省 46 山东省 59 山西省 100 广东省 49 广西壮族自治区 24 德国 1 新加坡 4 新南威尔士州 1 新疆自治区 33 日本 3 曼谷 1 江苏省 87 江西省 70 河北省 113 河南省 33 浙江省 57 海南省 44 湖北省 47 湖南省 95 澳大利亚 2 澳门特别行政区 130 甘肃省 35 神奈川县 1 福建省 40 维多利亚州 1 缅甸 1 美国 1 英国 1 西藏自治区 246 贵州省 43 越南 1 辽宁省 151 重庆市 23 钓鱼岛 10 陕西省 54 青海省 49 首尔(汉城) 3 香港特别行政区 77 黑龙江省 116 dtype: int64
# Notebook display: preview the first rows of df.
df.head()
对工资进行处理,分离到最低最高的工资
def get_salary_max_min(salary):
    """Split a salary range such as ``"5000-8000"`` into its parts.

    Args:
        salary: Raw value from the ``工资`` column.

    Returns:
        The list of strings obtained by splitting on ``-`` (a one-item list
        when the text has no dash). A value that is not a string is
        returned unchanged.
    """
    try:
        return re.split('-', salary)
    except TypeError:
        # Not a string (e.g. a missing value): pass it through untouched.
        return salary


# Split every salary, then take the first / second piece as the bounds.
# Rows whose text has no dash get no second piece, so 最高工资 is missing there.
salary = df['工资'].apply(get_salary_max_min)
df['最低工资'] = salary.str[0]
df['最高工资'] = salary.str[1]
df.info()
Int64Index: 2827 entries, 0 to 2874 Data columns (total 17 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 年龄 2827 non-null object 1 身高 2827 non-null object 2 学历 2827 non-null object 3 工资 2827 non-null object 4 家乡 2827 non-null object 5 婚姻 2827 non-null object 6 住房 2827 non-null object 7 自我介绍 2827 non-null object 8 对象年龄 2827 non-null object 9 对象身高 2827 non-null object 10 对象学历 2827 non-null object 11 对象薪水 2827 non-null object 12 对象家乡 2827 non-null object 13 对象婚姻 2827 non-null object 14 对象住房 2827 non-null object 15 最低工资 2827 non-null object 16 最高工资 2370 non-null object dtypes: object(17) memory usage: 477.5+ KB
把不符合规范带有中文的数据处理了
# "2000以下" (below 2000) is not a range: use 2000 for both bounds.
indexs = df[df['最低工资'] == '2000以下'].index
df.loc[indexs, '最低工资'] = '2000'
df.loc[indexs, '最高工资'] = '2000'
df.head()
# ">50000" is not a range either: use 50000 for both bounds.
indexs = df[df['最低工资'] == '>50000'].index
df.loc[indexs, '最低工资'] = '50000'
df.loc[indexs, '最高工资'] = '50000'
df.loc[indexs]
# With the textual values replaced, both bounds can be converted to numbers.
df['最高工资'] = pd.to_numeric(df['最高工资'])
df['最低工资'] = pd.to_numeric(df['最低工资'])
df.info()
Int64Index: 2827 entries, 0 to 2874 Data columns (total 17 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 年龄 2827 non-null object 1 身高 2827 non-null object 2 学历 2827 non-null object 3 工资 2827 non-null object 4 家乡 2827 non-null object 5 婚姻 2827 non-null object 6 住房 2827 non-null object 7 自我介绍 2827 non-null object 8 对象年龄 2827 non-null object 9 对象身高 2827 non-null object 10 对象学历 2827 non-null object 11 对象薪水 2827 non-null object 12 对象家乡 2827 non-null object 13 对象婚姻 2827 non-null object 14 对象住房 2827 non-null object 15 最低工资 2827 non-null int64 16 最高工资 2827 non-null float64 dtypes: float64(1), int64(1), object(15) memory usage: 477.5+ KB求工资的平均值
# Row-wise mean of the lower and upper salary bound.
salary_bounds = df.loc[:, ['最低工资', '最高工资']]
df['平均工资'] = salary_bounds.mean(axis='columns')
df.head()
处理对象身高
# Keep the first nine characters of the wanted-partner height text.
df['对象身高'] = df['对象身高'].str[0:9]
df['对象身高']
# Reuse the helper written for the partner age range to split the text.
# NOTE(review): get_he_age_max_min is not defined in the visible part of
# this file; presumably it splits a range into [min, max] — confirm.
he_height = df['对象身高'].apply(get_he_age_max_min)
df['对象最低身高'] = he_height.str[0]
df['对象最高身高'] = he_height.str[1]
df.head()
# "不限" (no preference) is stored as 0 for both bounds so the columns can
# be converted to numbers.
indexs = df[df['对象最低身高'] == '不限'].index
df.loc[indexs, '对象最低身高'] = '0'
df.loc[indexs, '对象最高身高'] = '0'
df.head()
df['对象最低身高'] = pd.to_numeric(df['对象最低身高'])
df['对象最高身高'] = pd.to_numeric(df['对象最高身高'])
df.info()
# Row-wise mean of the two bounds (0 for the "no preference" rows).
df['对象平均身高'] = df[['对象最低身高', '对象最高身高']].mean(axis=1)
df['对象平均身高']
df.head()
4.可视化
女生的地域分布情况
from pyecharts.charts import Map
from pyecharts import options as opts

# Count the rows per home region once and reuse the result for both lists.
home_counts = df["家乡"].value_counts()
city = home_counts.index.tolist()
data = home_counts.values.tolist()
# (region, count) pairs in the form Map.add() takes.
zip_data = [(i, j) for i, j in zip(city, data)]
print(zip_data)
china = (
    Map()
    .add('单身妹子区域分布', zip_data, 'china')
    .set_global_opts(visualmap_opts=opts.VisualMapOpts(min_=0, max_=300))
)
china.render_notebook()
女生的学历分布情况
from pyecharts.charts import Bar
import pyecharts.options as opts
from pyecharts import options

# Number of rows per education level, most frequent first.
education_counts = df["学历"].value_counts()
name = education_counts.index.tolist()
value = education_counts.values.tolist()
bar4 = (
    Bar(init_opts=opts.InitOpts(width='1000px', height='350px'))
    .add_xaxis(xaxis_data=name)
    .add_yaxis(series_name='求偶女性学历情况', y_axis=value)
)
bar4.render_notebook()
女生相应身高数量前10
from pyecharts.charts import Bar
import pyecharts.options as opts
from pyecharts import options

# NOTE(review): `height` is not defined in the visible part of this file;
# presumably the ten most common values of df['身高'] — confirm.
name = height.index.tolist()
value = height.values.tolist()
bar2 = (
    Bar(init_opts=opts.InitOpts(width='720px', height='320px'))
    .add_xaxis(xaxis_data=name)
    .add_yaxis(series_name='单身女性身高Top10', y_axis=value)
    .set_global_opts(
        # Rotate the x-axis labels by 60 degrees.
        xaxis_opts=opts.AxisOpts(axislabel_opts={"rotate": 60}),
        # Add a zoom slider below the bars.
        datazoom_opts=[opts.DataZoomOpts(pos_bottom=0)],
    )
)
bar2.render_notebook()
求偶单身女性数量前20
import pyecharts.options as opts
from pyecharts import options
from pyecharts.charts import Bar

# NOTE(review): `age` is not defined in the visible part of this file;
# presumably the twenty most common values of df['年龄'] — confirm.
name = age.index.tolist()
value = age.values.tolist()
bar1 = (
    Bar(init_opts=opts.InitOpts(width='1000px', height='420px'))
    .add_xaxis(xaxis_data=name)
    .add_yaxis(series_name='单身女性年龄Top20', y_axis=value)
    # All global options go into ONE call: each set_global_opts() call
    # rewrites every global option, so a second call that passes only the
    # toolbox would reset the title and legend to their defaults.
    .set_global_opts(
        title_opts=opts.TitleOpts(title="可切换查看曲线图"),
        legend_opts=opts.LegendOpts(is_show=True),
        toolbox_opts=opts.ToolboxOpts(is_show=True),
    )
)
bar1.render_notebook()
女生喜欢男生的标准身高前10
import pyecharts.options as opts
from pyecharts import options
from pyecharts.charts import Bar, Line

# Ten most common values of the wanted partner's mean height.
partner_height_counts = df["对象平均身高"].value_counts()
name = partner_height_counts.index[:10].tolist()
value = partner_height_counts.values[:10].tolist()
# Despite the variable name, this chart is a (horizontal) bar chart.
line_man1 = (
    Bar(init_opts=opts.InitOpts(width='750px', height='350px'))
    .add_xaxis(xaxis_data=name)
    .add_yaxis(series_name='对象男性身高均值Top10(样本整体均值:178cm)', y_axis=value)
    # Swap the axes and put the value labels to the right of each bar.
    .reversal_axis()
    .set_series_opts(label_opts=opts.LabelOpts(position="right"))
)
line_man1.render_notebook()
对18-37岁女生求偶数量分析
import pyecharts.options as opts
from pyecharts import options
from pyecharts.charts import Bar

# NOTE(review): `sort_age` is not defined in the visible part of this file;
# presumably the per-age counts for ages 18-37, ordered by age — confirm.
name = sort_age.index.tolist()
value = sort_age.values.tolist()
bar3 = (
    Bar(init_opts=opts.InitOpts(width='1000px', height='420px'))
    .add_xaxis(xaxis_data=name)
    .add_yaxis(series_name='18-37岁单身女性数量分析', y_axis=value)
    # All global options go into ONE call: each set_global_opts() call
    # rewrites every global option, so a second call that passes only the
    # toolbox would reset the title and legend to their defaults.
    .set_global_opts(
        title_opts=opts.TitleOpts(title="可切换查看曲线图"),
        legend_opts=opts.LegendOpts(is_show=True),
        toolbox_opts=opts.ToolboxOpts(is_show=True),
    )
)
bar3.render_notebook()
女生对另一半薪水的平均要求
from pyecharts.charts import Pie
import pyecharts.options as opts

# Rows with "no preference" are left out by keeping only values above 1.
# NOTE(review): the column '对象平均薪水' is not created in the visible part
# of this file — confirm it exists before this cell runs.
indexs = df.loc[df['对象平均薪水'] > 1].index
# Five most common values of the wanted partner's mean salary.
avg_salary = df.loc[indexs, '对象平均薪水'].value_counts()[:5]

num = avg_salary.values.tolist()
lab = avg_salary.index.tolist()
# (salary, count) pairs in the form Pie.add() takes.
x = [(i, j) for i, j in zip(lab, num)]
pie = (
    Pie(init_opts=opts.InitOpts(width='750px', height='350px'))
    .add(
        series_name='目标对象男性平均工资粗略分布(样本均值:12437)',
        data_pair=x,
        # Inner and outer radius: draws a ring.
        radius=['40%', '75%'],
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(title="全样本均值:12437元"),
        legend_opts=opts.LegendOpts(is_show=True),
    )
)
pie.render_notebook()
标签:me,non,python,object,爬虫,爬取,df,2827,opts From: https://www.cnblogs.com/jlgcd/p/class16.html