```
#编写程序,从丁香园获取国内近期疫情数据,按省份提取当前确诊数,
# 确诊总数,疑似病例数,治愈数,死亡数,高危数等数据,保存到csv文件或excel文件中。
import requests
import xlsxwriter
from fake_useragent import UserAgent
import cchardet
import re
import json
from bs4 import BeautifulSoup
import csv
# Fetch the DXY (Ding Xiang Yuan) COVID-19 page and pull out the two JSON
# payloads that the page embeds in inline <script> tags.

# Send a randomly chosen browser User-Agent with the request.
headers = {
    'user-agent': UserAgent().random
}
url = "https://ncov.dxy.cn/ncovh5/view/pneumonia"
# A timeout keeps the script from hanging forever when the server never answers.
req = requests.get(url, headers=headers, timeout=30)
# Fail early on an HTTP error instead of trying to parse an error page.
req.raise_for_status()
# cchardet.detect() returns a dict ({'encoding': ..., 'confidence': ...});
# only its 'encoding' entry is a valid value for Response.encoding.
detected_encoding = cchardet.detect(req.content)['encoding']
if detected_encoding:
    req.encoding = detected_encoding


def _extract_window_json(page, name):
    """Return the JSON text assigned to ``window.<name>`` inside *page*.

    The page wraps each payload as ``try { window.<name> = <json>}catch(e){}``,
    so everything between the assignment and the first ``}catch`` is the JSON.

    Raises:
        ValueError: if the assignment cannot be found in *page*.
    """
    pattern = r'window\.' + re.escape(name) + r' = (.*?)\}catch'
    match = re.search(pattern, page, re.S)
    if match is None:
        raise ValueError('window.' + name + ' was not found in the page source')
    return match.group(1)


# Alternative ways to get at the same script text (regex on the tag id, or BeautifulSoup):
# source = re.search('id="getAreaStat">try { window.getAreaStat = (.*?)}catch.*?{}</script>', req.text, re.S).group(1)
# bs = BeautifulSoup(req.text, 'lxml')
# source = bs.select('body>script')[0].string.replace('try { window.getAreaStat = ', "").replace(']}catch(e){}', "")
source = _extract_window_json(req.text, 'getAreaStat')
# print(source)
source2 = _extract_window_json(req.text, 'fetchRecentStatV2')
# Convert the JSON text into Python objects.
js = json.loads(source)
data = json.loads(source2)
# Write one row per entry of `js` to both a CSV file and an XLSX workbook.

# Optional: keep a local copy of the parsed data.
# with open('nCov-19_info.json', 'w', encoding="UTF-8") as f:
#     f.write(json.dumps(js, ensure_ascii=False))

# Column headers, in the same order as the values written for each row.
colname = '省份', '昨日本土新增', '现存确诊', '累积确诊', '疑似病例数', '死亡数', '治愈数', '高危地区数', '中危地区数'
col = colname

# Province names present in `data` (the fetchRecentStatV2 payload).
p = [i['provinceName'] for i in data]
print(p)
# Index `data` by province name once instead of rescanning it for every row.
# If a name occurs more than once, the last entry wins.
recent_by_province = {entry['provinceName']: entry for entry in data}

# The context managers close both output files even if a row fails to process.
with xlsxwriter.Workbook('COVID-19_info.xlsx') as workbook, \
        open('COVID-19_info.csv', 'w', newline="", encoding='utf-8-sig') as file:
    worksheet = workbook.add_worksheet()
    str_format = workbook.add_format({'align': 'center'})
    a = csv.writer(file)

    # Header row: CSV first line and worksheet row 0.
    a.writerow(list(colname))
    for i, title in enumerate(col):
        worksheet.write(0, i, title, str_format)

    # Data rows start at worksheet row 1, directly below the header.
    for x, item in enumerate(js, start=1):
        # Province name
        pro_name = item['provinceName']
        # The "yesterday" figure comes from `data`; it stays blank for provinces
        # that `data` does not list.
        if pro_name in recent_by_province:
            yesterday = recent_by_province[pro_name]['yesterdayLocalConfirmedCount']
        else:
            yesterday = ""
        # Current confirmed cases
        currentConfirmed = item['currentConfirmedCount']
        # Cumulative confirmed cases
        allConfirmed = item['confirmedCount']
        # Suspected cases
        suspected = item['suspectedCount']
        # Deaths
        dead = item['deadCount']
        # Recovered
        cured = item['curedCount']
        # Number of high-risk areas
        highDanger = item['highDangerCount']
        # Number of medium-risk areas
        midDanger = item['midDangerCount']
        # Total number of risk areas (not exported)
        # allDanger = int(highDanger) + int(midDanger)
        datalist = [pro_name, yesterday, currentConfirmed, allConfirmed, suspected, dead, cured, highDanger, midDanger]
        print(datalist)
        # Save the row to the CSV file
        a.writerow(datalist)
        # Save the row to the XLSX worksheet
        for i, value in enumerate(datalist):
            worksheet.write(x, i, value, str_format)
print("保存成功")
```
标签:re,req,爬虫,爬取,source,window,item,丁香,csv

From: https://www.cnblogs.com/Gimm/p/18116345