First day of the new term: today I mainly worked on the crawler for the Beijing municipal citizen-letter (北京市政百姓信件) analysis project.

import json
import demjson3   # tolerant JSON parser; used instead of json, presumably because the endpoint's reply is not strict JSON
import requests
from bs4 import BeautifulSoup
import csv        # imported but not actually used below

headers = {
    'Host': 'www.beijing.gov.cn',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0',
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
    'Accept-Encoding': 'gzip, deflate',
    'Content-Type': 'text/json',
    'X-Requested-With': 'XMLHttpRequest',
    'Content-Length': '155',
    'Origin': 'http://www.beijing.gov.cn',
    'Connection': 'keep-alive',
    'Referer': 'http://www.beijing.gov.cn/hudong/hdjl/'
}
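# Note (my assumption, not verified against this API): requests sets
# Content-Length from the request body automatically, so the hard-coded '155'
# above, presumably copied from a captured browser request, does not match the
# two-byte body '{}' actually sent below and could likely be dropped.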
if __name__ == "__main__":
    page = 1
    datas = json.dumps({})  # an empty JSON object serialized as the POST body
    while page < 175:
        print(page)
        url = f"https://www.beijing.gov.cn/hudong/hdjl/sindex/bjah-index-hdjl!replyLetterListJson.action?page.pageNo={page}&page.pageSize=6&orgtitleLength=26"
        r = requests.post(url, data=datas, headers=headers)
        rr = demjson3.decode(r.text)
        for item in rr.get("result", []):
            originalId = item.get("originalId")          # letter ID
            letterTypeName = item.get("letterTypeName")  # letter type ('咨询' = consultation; anything else is treated as a suggestion)
            # Build the detail-page URL; consultations and suggestions use different detail flows
            detail_url = f"http://www.beijing.gov.cn/hudong/hdjl/com.web.{('consult' if letterTypeName == '咨询' else 'suggest')}.{('consultDetail' if letterTypeName == '咨询' else 'suggesDetail')}.flow?originalId={originalId}"
            r1 = requests.get(detail_url, headers={'user-agent': 'Mozilla/5.0'})
            if r1.status_code == 200:
                soup = BeautifulSoup(r1.text, "html.parser")
                strong = soup.find("strong")
                title = strong.get_text().replace("\n", "") if strong else ""
                # 来信人 (sender); note that lstrip strips a *character set*, not a
                # literal prefix -- it works for these labels, but str.removeprefix
                # would be the safer choice
                nodes = soup.find_all("div", {"class": "col-xs-10 col-lg-3 col-sm-3 col-md-4 text-muted"})
                fromPeople = nodes[0].get_text().lstrip('来信人:').strip() if nodes else ""
                # 时间 (date the letter was sent)
                nodes = soup.find_all("div", {"class": "col-xs-5 col-lg-3 col-sm-3 col-md-3 text-muted"})
                fromTime = nodes[0].get_text().lstrip('时间:') if nodes else ""
                # 问题内容 (body of the letter)
                nodes = soup.find_all("div", {"class": "col-xs-12 col-md-12 column p-2 text-muted mx-2"})
                problem = nodes[0].get_text().strip().replace("\r", "").replace("\n", "") if nodes else ""
                # 办理单位 (answering office)
                nodes = soup.find_all("div", {"class": "col-xs-9 col-sm-7 col-md-5 o-font4 my-2"})
                office = nodes[0].get_text().replace("\n", "") if nodes else ""
                # 答复时间 (reply date)
                nodes = soup.find_all("div", {"class": "col-xs-12 col-sm-3 col-md-3 my-2"})
                answerTime = nodes[0].get_text().lstrip('答复时间:') if nodes else ""
                # 答复内容 (reply body)
                nodes = soup.find_all("div", {"class": "col-xs-12 col-md-12 column p-4 text-muted my-3"})
                answer = nodes[0].get_text().strip().replace("\n", "").replace("\r", "") if nodes else ""
                # Join the nine fields with '|' and append one record per line
                itemm = f"{originalId}|{letterTypeName}|{title}|{fromPeople}|{fromTime}|{problem}|{office}|{answerTime}|{answer}"
                with open("yijian.txt", 'a', encoding='utf-8') as fp:
                    fp.write(itemm + '\n')
            else:
                print(f"Failed to retrieve details for ID: {originalId}")
        page += 1
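
Since every record lands in yijian.txt as one '|'-separated line, the analysis step can read it back with the csv module. A minimal sketch; the FIELDS names are my own labels for the nine columns written above, not anything the site defines:

import csv

FIELDS = ["originalId", "letterTypeName", "title", "fromPeople",
          "fromTime", "problem", "office", "answerTime", "answer"]

with open("yijian.txt", encoding="utf-8") as fp:
    for row in csv.reader(fp, delimiter="|"):
        # a literal '|' inside a scraped field would make the row mis-split
        record = dict(zip(FIELDS, row))
        print(record["letterTypeName"], record["title"])

If any scraped field can itself contain a '|', switching the writer to csv.writer with proper quoting (or to a tab delimiter) would be the safer format.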