首页 > 编程语言 >python网络爬虫-百合网数据信息爬取+大数据可视化

python网络爬虫-百合网数据信息爬取+大数据可视化

时间:2022-12-21 18:24:50浏览次数:62  
标签:me non python object 爬虫 爬取 df 2827 opts

(一)、选题的背景

为什么要选择此选题?要达到的数据分析目标是什么?从社会、经济、技术、数据来源等方面进行描述(200 字以内)(10 分)

百合网是一家提供网上实名交友和婚恋服务的网站;百合网的目标是集各地优势,合百家文化,共建全国比较大的开放式平台,为中国单身男女提供专业的婚恋咨询、婚恋指导、婚恋匹配。它让想相亲的人足不出户、不必花费时间和路费,就能获得相亲的环境和选择。

目标:对使用人群的数据进行统计,找出重点服务人群,从而提供相应的服务,提高用户的满意度和选择率。

(二)、大数据分析设计方案(10 分)

使用爬虫方法,来爬取网站中用户给出的条件要求等数据。具体内容可以有年龄,学历,身高等多项数据,分三步:数据爬取,数据清洗,可视化。

(三)、数据分析步骤(70 分)

1.数据源

百合网的官网

2.数据爬取

import requests
import re
import json
import time
from lxml import etree
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

# Request headers that make the client look like a browser
#
HEADERS = {
# Add your own Cookie here
'Cookie': '',
'Origin': 'https://search.baihe.com',
'Pragma': 'no-cache',
'Referer': 'https://search.baihe.com/',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36',
}
# Later found to be unused
temp_url = 'https://search.baihe.com/Search/getUserID?&jsonCallBack?'
# Search home page
tql_url = 'https://search.baihe.com/'
# URL prefix of a profile page (a user id is appended to it)
info_url = 'https://profile1.baihe.com/?oppID='
# Endpoint that returns the collection of user ids
url_base = 'https://search.baihe.com/Search/getUserID?&jsonCallBack=jQuery1830923921797491073_1594465799055'
session = requests.Session()
# Intended to turn off persistent connections
# NOTE(review): requests.Session does not appear to define a `keep_alive`
# attribute, so this assignment probably has no effect — confirm
session.keep_alive = False
# Visit the home page first so the session picks up cookies and similar state
session.get(url=tql_url, headers=HEADERS)

# The search endpoint answers with JSONP:  <callback>( {json} );
# The parentheses are escaped so that group(1) is the bare JSON payload.
JSONP_PATTERN = re.compile(r'jQuery1830923921797491073_1594465799055\((.*)\);')

# Every attribute in the "matching_detail" table sits in one <dl>; dd[1] is
# read into the profile owner's columns and dd[3] into the "对象" (partner
# requirement) columns.
DETAIL_XPATH = '//*[@id="matching_detail"]/div/div/dl[{dl}]/dd[{dd}]/text()'
INTRO_XPATH = '//*[@id="profileCommon"]/div[1]/div[2]/div[1]/text()'
# (column name, <dl> index) in the column order written to baihe.csv
DETAIL_ROWS = [
    ("年龄", 2),
    ("身高", 3),
    ("学历", 4),
    ("薪资", 5),
    ("家乡", 8),
    ("婚姻", 6),
    ("住房情况", 7),
]
FIELD_XPATHS = (
    [(name, DETAIL_XPATH.format(dl=dl, dd=1)) for name, dl in DETAIL_ROWS]
    + [("自我介绍", INTRO_XPATH)]
    + [("对象" + name, DETAIL_XPATH.format(dl=dl, dd=3)) for name, dl in DETAIL_ROWS]
)

# Result pages to crawl
for page in range(2, 66):
    print(f'正在爬取第{page}页数据......')
    data = {
        'page': page
    }
    url_tails = session.post(url=url_base, headers=HEADERS, data=data).content.decode('utf-8')
    # Strip the JSONP wrapper to obtain a well-formed JSON string
    json_data = JSONP_PATTERN.search(url_tails)
    if json_data is None:
        # Unexpected response (e.g. blocked or logged out): skip this page
        print(f'第{page}页返回内容无法解析,已跳过')
        time.sleep(10)
        continue
    data_uid = json.loads(json_data.group(1))['data']

    # One complete row per profile. A profile is stored only when every field
    # was found, so all columns always have the same length.
    rows = []
    for uid in data_uid:
        # Full URL of the profile page
        me_url = info_url + uid
        # NOTE: the original assigned `me_page.decoding = 'gbk'`, which is not
        # an attribute requests reads, so the text was (and still is) decoded
        # with the encoding requests detects. Set `.encoding` on the response
        # explicitly if the text turns out garbled.
        me_page = session.get(me_url, headers=HEADERS).text
        # Drop characters that GBK cannot represent (the CSV is written as GBK)
        me_page = me_page.encode('GBK', 'ignore').decode('GBk')
        tree = etree.HTML(me_page)
        try:
            rows.append([tree.xpath(xpath)[0] for _, xpath in FIELD_XPATHS])
        except IndexError:
            # At least one field is missing on this page: skip the profile
            print(f'{uid},页面字段缺失,已跳过')
        else:
            print(f'{uid},已经爬取成功!')
        time.sleep(0.5)

    df = pd.DataFrame(rows, columns=[name for name, _ in FIELD_XPATHS])
    # Append this page's rows to the output file (no header row, no index)
    try:
        df.to_csv("baihe.csv", mode="a+", header=None, index=None, encoding="gbk")
    except (OSError, UnicodeError) as err:
        print('保存数据异常', err)
    # Slow down to avoid having the IP banned
    time.sleep(1)
    print(f'第{page}页数据爬取完毕,即将爬取下一页......')
    time.sleep(10)

print("数据爬取完毕,离脱单不远啦!")

 

 

 

3.数据清洗

import pandas as pd
import numpy as np
import re
import jieba

# Load the scraped records; the file has no header row (header=None)
# NOTE(review): the scraper in this file writes "baihe.csv" — confirm that
# "sample.csv" is the intended input
df = pd.read_csv("sample.csv",encoding="gbk",header=None)
df.head()


 

 

设置行列索引

# Set the row index to 0..len(df)-1
df.index = range(len(df))

# Set the column labels
df.columns = ['年龄', '身高', '学历', '工资', '家乡', '婚姻', '住房', '自我介绍', '对象年龄', '对象身高', '对象学历', '对象薪水', '对象家乡', '对象婚姻', '对象住房']
df.head()


 

 

查看是否有空值

# For each column: True when the column contains at least one missing value
df.isnull().any(axis = 0)

 

 

去重

# Report the shape before and after removing duplicate rows
print('去重前数据量:', df.shape)
# Remove duplicate rows in place
df.drop_duplicates(inplace=True)
print('去重后数据量:', df.shape)

 

 

把年龄中的“岁”去掉

# Keep only the first two characters of each age string, which drops the
# trailing "岁" as long as the age has exactly two digits
df['年龄'] = df['年龄'].str[0:2]
df.head()

 

 

把身高中的“cm”去掉,以便后续处理

# Keep only the first three characters of each height string, which drops the
# trailing "cm" as long as the height has exactly three digits
df['身高'] = df['身高'].str[0:3]
df.head()

 

 

# Group by the home-region column and count the rows in each group
df.groupby(by='家乡').size()

家乡
Baden-Wurttemberg      1
Basse-Normandie        1
Bayern                 1
England                1
Ontario                3
Stockholm              1
Waikato                1
上海市                   82
云南省                   72
以后告诉你                206
内蒙古自治区               131
加拿大                    1
北京市                  100
台湾省                  149
吉林省                  109
四川省                   45
天津市                   43
宁夏回族自治区               28
安徽省                   46
山东省                   59
山西省                  100
广东省                   49
广西壮族自治区               24
德国                     1
新加坡                    4
新南威尔士州                 1
新疆自治区                 33
日本                     3
曼谷                     1
江苏省                   87
江西省                   70
河北省                  113
河南省                   33
浙江省                   57
海南省                   44
湖北省                   47
湖南省                   95
澳大利亚                   2
澳门特别行政区              130
甘肃省                   35
神奈川县                   1
福建省                   40
维多利亚州                  1
缅甸                     1
美国                     1
英国                     1
西藏自治区                246
贵州省                   43
越南                     1
辽宁省                  151
重庆市                   23
钓鱼岛                   10
陕西省                   54
青海省                   49
首尔(汉城)                 3
香港特别行政区               77
黑龙江省                 116
dtype: int64
df.head()

 

 

对工资进行处理,分离到最低最高的工资

# 对工资进行处理
def get_salary_max_min(salary):
    """Split a salary range such as ``"2000-5000"`` on the hyphen.

    Args:
        salary: Raw salary cell. Normally a string; anything else (for
            example a missing value) is passed through untouched.

    Returns:
        For a string, the list of pieces between hyphens (a one-element list
        when there is no hyphen). For any other value, the value itself.
    """
    # Guard explicitly instead of using a bare ``except``, which would also
    # swallow KeyboardInterrupt and hide real errors.
    if not isinstance(salary, str):
        return salary
    return salary.split('-')
# Split every salary cell into its pieces
salary = df['工资'].apply(get_salary_max_min)
# First piece -> lower bound; second piece -> upper bound (missing when the
# cell contained no "-")
df['最低工资'] = salary.str[0]
df['最高工资'] = salary.str[1]
df.info()


Int64Index: 2827 entries, 0 to 2874
Data columns (total 17 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   年龄      2827 non-null   object
 1   身高      2827 non-null   object
 2   学历      2827 non-null   object
 3   工资      2827 non-null   object
 4   家乡      2827 non-null   object
 5   婚姻      2827 non-null   object
 6   住房      2827 non-null   object
 7   自我介绍    2827 non-null   object
 8   对象年龄    2827 non-null   object
 9   对象身高    2827 non-null   object
 10  对象学历    2827 non-null   object
 11  对象薪水    2827 non-null   object
 12  对象家乡    2827 non-null   object
 13  对象婚姻    2827 non-null   object
 14  对象住房    2827 non-null   object
 15  最低工资    2827 non-null   object
 16  最高工资    2370 non-null   object
dtypes: object(17)
memory usage: 477.5+ KB

把不符合规范带有中文的数据处理了
# Rows whose salary reads "2000以下" have no numeric range: use 2000 for both bounds
indexs = df.index[df['最低工资'] == '2000以下']
df.loc[indexs, ['最低工资', '最高工资']] = '2000'
df.head()

 

 

# Rows whose salary reads ">50000" have no numeric range: use 50000 for both bounds
indexs = df.index[df['最低工资'] == '>50000']
df.loc[indexs, ['最低工资', '最高工资']] = '50000'
df.loc[indexs]


 

 

# Convert both salary bounds from strings to numbers
df['最高工资'] = pd.to_numeric(df['最高工资'])
df['最低工资'] = pd.to_numeric(df['最低工资'])
df.info()

Int64Index: 2827 entries, 0 to 2874
Data columns (total 17 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   年龄      2827 non-null   object 
 1   身高      2827 non-null   object 
 2   学历      2827 non-null   object 
 3   工资      2827 non-null   object 
 4   家乡      2827 non-null   object 
 5   婚姻      2827 non-null   object 
 6   住房      2827 non-null   object 
 7   自我介绍    2827 non-null   object 
 8   对象年龄    2827 non-null   object 
 9   对象身高    2827 non-null   object 
 10  对象学历    2827 non-null   object 
 11  对象薪水    2827 non-null   object 
 12  对象家乡    2827 non-null   object 
 13  对象婚姻    2827 non-null   object 
 14  对象住房    2827 non-null   object 
 15  最低工资    2827 non-null   int64  
 16  最高工资    2827 non-null   float64
dtypes: float64(1), int64(1), object(15)
memory usage: 477.5+ KB
 求工资的平均值
# Row-wise mean of the lower and upper salary bound
df['平均工资'] = df[['最低工资', '最高工资']].mean(axis=1)
df.head()

 

 

处理对象身高

# Keep only the first nine characters of the partner-height text
df['对象身高'] = df['对象身高'].str[0:9]

df['对象身高'] 

# The helper written for the partner-age range is reused here for heights.
# NOTE(review): get_he_age_max_min is not defined in the visible part of this
# file — confirm it exists before running this cell
he_height = df['对象身高'].apply(get_he_age_max_min)
# First piece -> minimum height; second piece -> maximum height
df['对象最低身高'] = he_height.str[0]
df['对象最高身高'] = he_height.str[1]
df.head()

 

 

# "不限" (no requirement) is not a number: store '0' for both height bounds
indexs = df.index[df['对象最低身高'] == '不限']
df.loc[indexs, ['对象最低身高', '对象最高身高']] = '0'
df.head()

 

 

# Convert both partner-height bounds from strings to numbers
df['对象最低身高'] = pd.to_numeric(df['对象最低身高'])
df['对象最高身高'] = pd.to_numeric(df['对象最高身高'])
df.info()

 

 

# Row-wise mean of the minimum and maximum partner height
df['对象平均身高'] = df[['对象最低身高', '对象最高身高']].mean(axis=1)

df['对象平均身高']
df.head()

 

 

4.可视化

女生的地域分布情况

from pyecharts.charts import Map
from pyecharts import options as opts

# Number of profiles per home region, most frequent first
home_counts = df["家乡"].value_counts()
city = home_counts.index.tolist()
data = home_counts.values.tolist()

# (region, count) pairs fed to the map series
zip_data = list(zip(city, data))
print(zip_data)

china = Map()
china.add('单身妹子区域分布', zip_data, 'china')
china.set_global_opts(visualmap_opts=opts.VisualMapOpts(min_=0, max_=300))
china.render_notebook()

 

 

 

女生的学历分布情况

from pyecharts.charts import Bar
import pyecharts.options as opts
from pyecharts import options

# Number of profiles per education level, most frequent first
education_counts = df["学历"].value_counts()
name = education_counts.index.tolist()
value = education_counts.values.tolist()

# Bar chart of the education distribution. The closing parenthesis was
# corrupted by leftover HTML markup in the original and is restored here.
bar4 = (
    Bar(init_opts=opts.InitOpts(width='1000px', height='350px'))
    .add_xaxis(xaxis_data=name)
    .add_yaxis(series_name='求偶女性学历情况', y_axis=value)
)

bar4.render_notebook()

 

 

女生相应身高数量前10

from pyecharts.charts import Bar
import pyecharts.options as opts
from pyecharts import options

# NOTE(review): `height` is not defined in the visible part of this file;
# presumably the ten most frequent values of df['身高'] — confirm
name = height.index.tolist()
value = height.values.tolist()

bar2 = (Bar(init_opts=opts.InitOpts(width='720px', height='320px'))
.add_xaxis(xaxis_data=name)
.add_yaxis(series_name='单身女性身高Top10', y_axis=value)
# Rotate the x-axis labels
.set_global_opts(xaxis_opts=opts.AxisOpts(axislabel_opts={"rotate":60}),
# Add a data-zoom slider to the bar chart
datazoom_opts=[opts.DataZoomOpts(pos_bottom=0)])
)

bar2.render_notebook()


 

 

 

求偶单身女性数量前20

import pyecharts.options as opts
from pyecharts import options
from pyecharts.charts import Bar

# NOTE(review): `age` is not defined in the visible part of this file;
# presumably the twenty most frequent values of df['年龄'] — confirm
name = age.index.tolist()
value = age.values.tolist()

# All global options are passed in ONE set_global_opts call. The original
# made a second call with only toolbox_opts; that call re-applies the default
# (empty) title and so discards the title set here.
bar1 = (
    Bar(init_opts=opts.InitOpts(width='1000px', height='420px'))
    .add_xaxis(xaxis_data=name)
    .add_yaxis(series_name='单身女性年龄Top20', y_axis=value)
    .set_global_opts(
        title_opts=opts.TitleOpts(title="可切换查看曲线图"),
        legend_opts=opts.LegendOpts(is_show=True),
        # The toolbox provides the bar/line switch the title refers to
        toolbox_opts=opts.ToolboxOpts(is_show=True),
    )
)

bar1.render_notebook()


 

 

女生喜欢男生的标准身高前10

import pyecharts.options as opts
from pyecharts import options
from pyecharts.charts import Line
# The chart below is a Bar, so import it here instead of relying on an
# earlier cell having done so
from pyecharts.charts import Bar

# Ten most frequent values of the mean partner height
partner_height_counts = df["对象平均身高"].value_counts()
name = partner_height_counts.index[:10].tolist()
value = partner_height_counts.values[:10].tolist()

# Horizontal bar chart. The closing parenthesis was corrupted by leftover
# HTML markup in the original and is restored here.
line_man1 = (
    Bar(init_opts=opts.InitOpts(width='750px', height='350px'))
    .add_xaxis(xaxis_data=name)
    .add_yaxis(series_name='对象男性身高均值Top10(样本整体均值:178cm)', y_axis=value)
    # The next two calls swap the axes and move the value labels to the right
    .reversal_axis()
    .set_series_opts(label_opts=opts.LabelOpts(position="right"))
)

# line_man1.set_global_opts(toolbox_opts=opts.ToolboxOpts(is_show=True))
line_man1.render_notebook()


 

 

对18-37岁女生求偶数量分析

import pyecharts.options as opts
from pyecharts import options
from pyecharts.charts import Bar

# NOTE(review): `sort_age` is not defined in the visible part of this file;
# presumably per-age counts for ages 18-37, ordered by age — confirm
name = sort_age.index.tolist()
value = sort_age.values.tolist()

# All global options are passed in ONE set_global_opts call. The original
# made a second call with only toolbox_opts; that call re-applies the default
# (empty) title and so discards the title set here.
bar3 = (
    Bar(init_opts=opts.InitOpts(width='1000px', height='420px'))
    .add_xaxis(xaxis_data=name)
    .add_yaxis(series_name='18-37岁单身女性数量分析', y_axis=value)
    .set_global_opts(
        title_opts=opts.TitleOpts(title="可切换查看曲线图"),
        legend_opts=opts.LegendOpts(is_show=True),
        # The toolbox provides the bar/line switch the title refers to
        toolbox_opts=opts.ToolboxOpts(is_show=True),
    )
)

bar3.render_notebook()


 

 

女生对另一半薪水的平均要求

from pyecharts.charts import Pie
import pyecharts.options as opts

# Some rows have no salary requirement ("不限"), so only rows with a mean
# partner salary above 1 are kept
indexs = df.index[df['对象平均薪水'] > 1]
# The five most frequent mean-salary values among the kept rows
avg_salary = df.loc[indexs, '对象平均薪水'].value_counts()[:5]

num = avg_salary.values.tolist()
lab = avg_salary.index.tolist()

# (label, count) pairs for the pie series
x = list(zip(lab, num))

pie = Pie(init_opts=opts.InitOpts(width='750px', height='350px'))
pie.add(
    series_name='目标对象男性平均工资粗略分布(样本均值:12437)',
    data_pair=list(x),
    radius=['40%', '75%'],
)
pie.set_global_opts(
    title_opts=opts.TitleOpts(title="全样本均值:12437元"),
    legend_opts=opts.LegendOpts(is_show=True),
)
pie.render_notebook()


 

 



 

 

 

 

 

标签:me,non,python,object,爬虫,爬取,df,2827,opts
From: https://www.cnblogs.com/jlgcd/p/class16.html

相关文章