效果图
需要爬取的网页和内容
所含有的知识点
- requests爬虫请求
- 时间戳
- 表格操作
- python保存为exe
- exe获取当前运行程序的路径
from asyncio.windows_events import NULL
import imp
import importlib
from lxml import etree
import re
from urllib import request, response
import requests
from urllib import request
import re #进行数据清洗要导入此模块
from lxml import etree
import xlsxwriter
from asyncio import sleep
import xlwt
from datetime import date,datetime
from openpyxl import load_workbook
import time
# import os
def GetSiteList(start, end, path):
    """Collect article links from locally saved list pages.

    One file is read per number ``i`` in ``range(start, end)``; its name is
    built as ``path + str(i) + ".html"``.

    Args:
        start: Number of the first saved page to read.
        end: Stop value for the page numbers (exclusive, as in ``range``).
        path: Prefix the file name is appended to. It is concatenated as-is,
            so it must already end with a path separator.

    Returns:
        A list of the ``href`` values of every element whose ``class``
        attribute equals ``weui-desktop-mass-appmsg__title``, in page order.
        The list is empty when ``range(start, end)`` is empty.

    Errors raised by lxml while reading or parsing a page are not caught.
    """
    siteLists = []
    print("程序正在运行")
    for i in range(start, end):
        parser = etree.HTMLParser(encoding='utf-8')
        tree = etree.parse(f"{path}{i}.html", parser=parser)
        siteLists.extend(
            tree.xpath("//*[@class='weui-desktop-mass-appmsg__title']/@href")
        )
    return siteLists
# Header cells of row 2, one per column starting at column A.
_SHEET_HEADERS = (
    '日期', '选题名称', '编辑', '校对', '文字',
    '图片', '视频', '音频', '学生主编', '推文链接',
)

# (column letter, compiled pattern) for every credit line looked up in an
# article. The label may be followed by an ASCII or a full-width colon; the
# captured text starts with a CJK character and runs up to the next "<".
_CREDIT_PATTERNS = tuple(
    (column, re.compile(label + r"[::]([\u4e00-\u9fa5].*?)<"))
    for column, label in (
        ('C', '编辑'),
        ('D', '校对'),
        ('E', '文字'),
        ('F', '图片'),
        ('G', '视频'),
        ('H', '音频'),
        ('I', '学生主编'),
    )
)

# The publish time is embedded in the page script as a 10-digit Unix
# timestamp, in one of these two spellings (tried in order).
_TIMESTAMP_PATTERNS = (
    re.compile(r'var ct = "(\d{10})"'),
    re.compile(r"window\.ct = '(\d{10})'"),
)


def _publish_date(page_source):
    """Return the article date as ``MM月DD日`` (local time), or None."""
    for pattern in _TIMESTAMP_PATTERNS:
        found = pattern.search(page_source)
        if found:
            moment = time.localtime(int(found.group(1)))
            # Formatted by hand so the non-ASCII characters never pass
            # through the platform strftime.
            return f"{moment.tm_mon:02d}月{moment.tm_mday:02d}日"
    return None


def _article_title(page_source):
    """Return the stripped text of the first ``<h1>``, or None if absent."""
    page = etree.HTML(page_source)
    headings = page.xpath("//h1") if page is not None else []
    if headings and headings[0].text:
        return headings[0].text.strip()
    return None


def _credits(page_source):
    """Yield ``(column, names)`` for every credit line found in the page."""
    for column, pattern in _CREDIT_PATTERNS:
        found = pattern.search(page_source)
        if found:
            # Non-breaking spaces (entity or character) become plain spaces.
            names = found.group(1).replace("&nbsp;", " ").replace("\u00a0", " ")
            yield column, names


def GetName(siteList, path):
    """Download every article and write its credits to an Excel workbook.

    The workbook is created at ``path + '官微每月作品表.xlsx'``. Row 1 holds
    the sheet title, row 2 the column headers, and each article gets one row
    from row 3 on. Articles are processed in reverse order of ``siteList``,
    and every entry is processed, including the first one. A running count
    is printed before each article is fetched.

    For each article the link is written first, then the page is fetched and
    the date, title and credit fields are filled in. Fields that cannot be
    found in the page are left blank.

    Args:
        siteList: Sequence of article URLs.
        path: Prefix the workbook file name is appended to (concatenated
            as-is, so it must end with a path separator).

    Raises:
        Whatever ``urllib.request.urlopen`` or decoding the response raises
        for a failing article. The workbook is closed in that case as well,
        so the rows written so far are kept.
    """
    workbook = xlsxwriter.Workbook(path + '官微每月作品表.xlsx')
    try:
        worksheet = workbook.add_worksheet()
        bold = workbook.add_format({'bold': True})
        worksheet.write('A1', '记者团官方微信公众号 每月作品表汇总', bold)
        for column, heading in zip("ABCDEFGHIJ", _SHEET_HEADERS):
            worksheet.write(f"{column}2", heading, bold)

        for count, url in enumerate(reversed(siteList), start=1):
            print(count)
            row = count + 2  # rows 1 and 2 are the title and the headers

            try:
                worksheet.write(f"J{row}", url)
            except Exception as error:
                # Best effort: a link that cannot be stored must not stop
                # the remaining fields from being collected.
                print(f"第{row}行链接写入失败:{error}")

            with request.urlopen(url) as reply:
                page_source = reply.read().decode()

            published = _publish_date(page_source)
            if published is not None:
                worksheet.write(f"A{row}", published)

            title = _article_title(page_source)
            if title is not None:
                worksheet.write(f"B{row}", title)

            for column, names in _credits(page_source):
                worksheet.write(f"{column}{row}", names)
    finally:
        # xlsxwriter only writes the file on close().
        workbook.close()
# The folder is asked for interactively (deriving it from the script location
# is disabled). Backslashes become "/" and a trailing "/" is appended so file
# names can be concatenated directly onto it.
path = input("请输入已保存的本地html文件夹名称(如:D:/桌面):").replace("\\", "/") + "/"

# range(1, 2): only the saved page "1.html" is read.
getSiteList = GetSiteList(1, 2, path)
GetName(getSiteList, path)

print(f"保存的文件位置在:{path}官微每月作品表.xlsx")
# Waits for Enter before the program exits.
input("程序运行完毕")
标签:etree,bold,worksheet,py,Excel,爬虫,write,print,import
From: https://www.cnblogs.com/MrFlySand/p/16844153.html