首页 > 其他分享 >py爬虫数据到本地Excel表格

py爬虫数据到本地Excel表格

时间:2022-10-31 14:44:32 | 浏览次数:40
标签:etree bold worksheet py Excel 爬虫 write print import

效果图

需要爬取的网页和内容

所含有的知识点
  1. requests爬虫请求
  2. 时间戳
  3. 表格操作
  4. python保存为exe
  5. exe获取当前运行程序的路径
from asyncio.windows_events import NULL
import imp
import importlib
from lxml import etree
import re
from urllib import request, response
import requests 
from urllib import request
import re #进行数据清洗要导入此模块
from lxml import etree
import xlsxwriter
from asyncio import sleep
import xlwt
from datetime import date,datetime
from openpyxl import load_workbook
import time
# import os


# Collect every article link from the locally saved listing pages.
def GetSiteList(start, end, path):
    """Return the article URLs found in locally saved listing pages.

    For every ``i`` in ``range(start, end)`` the file ``path + str(i) + ".html"``
    is parsed as UTF-8 HTML, and the ``href`` attribute of each element whose
    ``class`` attribute equals ``weui-desktop-mass-appmsg__title`` is collected.

    Args:
        start: Number of the first page file to read.
        end: One past the number of the last page file to read (exclusive).
        path: Prefix concatenated directly in front of each file name, so a
            directory must already end with its separator.

    Returns:
        A list of href strings, page by page in the order the XPath query
        yields them. The list is empty when ``start >= end``.
    """
    siteLists = []
    print("程序正在运行")
    for i in range(start, end):
        parser = etree.HTMLParser(encoding='utf-8')
        tree = etree.parse(path + str(i) + ".html", parser=parser)
        # Only the hrefs are needed; the tree is not serialised back to text.
        siteLists.extend(
            tree.xpath("//*[@class='weui-desktop-mass-appmsg__title']/@href")
        )
    return siteLists

# Scrape the date, title and staff credits of every article into an Excel workbook.
def GetName(siteList, path):
    """Download each article and record its date, title and credits in Excel.

    The workbook ``path + '官微每月作品表.xlsx'`` receives a title in ``A1``, a
    header row in row 2 and one row per entry of ``siteList`` starting at
    row 3.  Entries are processed from the last one to the first one, so row 3
    holds ``siteList[-1]`` and the final row holds ``siteList[0]``.

    For each article the following cells are filled when the corresponding
    text is found in the downloaded page source; otherwise the cell is left
    empty:

    * ``A``: publication date, taken from the ``ct`` Unix timestamp embedded in
      the page and formatted in local time as ``MM月DD日``.
    * ``B``: stripped text of the first ``<h1>`` element.
    * ``C``..``I``: the text following each credit label up to the next ``<``.
    * ``J``: the article URL itself.

    Args:
        siteList: Sequence of article URLs (for example the result of
            ``GetSiteList``).
        path: Prefix concatenated directly in front of the output file name,
            so a directory must already end with its separator.

    Raises:
        Exceptions raised by ``urllib.request.urlopen`` or by decoding the
        downloaded bytes propagate to the caller.  The workbook is closed in
        every case, so rows gathered before the failure are still written.
    """
    # Header cells of row 2, in column order A..J.
    column_titles = ('日期', '选题名称', '编辑', '校对', '文字',
                     '图片', '视频', '音频', '学生主编', '推文链接')

    # (target column, credit label searched for in the page source).
    credit_columns = (
        ('C', '编辑'),
        ('D', '校对'),
        ('E', '文字'),
        ('F', '图片'),
        ('G', '视频'),
        ('H', '音频'),
        ('I', '学生主编'),
    )
    # A credit is "<label>:" followed by text that starts with a CJK
    # ideograph and runs (lazily) up to the next "<".
    credit_patterns = [
        (column, re.compile(label + r":([\u4e00-\u9fa5].*?)<"))
        for column, label in credit_columns
    ]

    # The two spellings of the timestamp assignment, tried in this order.
    # Any run of digits is accepted instead of only values starting with
    # "166", so articles from any period get a date.
    date_patterns = (
        re.compile(r'var ct = "(\d+)"'),
        re.compile(r"window.ct = '(\d+)'"),
    )

    workbook = xlsxwriter.Workbook(path + '官微每月作品表.xlsx')
    try:
        worksheet = workbook.add_worksheet()
        bold = workbook.add_format({'bold': True})
        worksheet.write('A1', '记者团官方微信公众号 每月作品表汇总', bold)
        for column, title in zip("ABCDEFGHIJ", column_titles):
            worksheet.write(column + '2', title, bold)

        # reversed() also visits siteList[0]; counting down with
        # range(len - 1, 0, -1) stopped one short and dropped the first link.
        for count, url in enumerate(reversed(siteList), start=1):
            print(count)  # progress indicator: 1, 2, 3, ...
            row = str(count + 2)  # rows 1 and 2 hold the title and the headers

            # Fetched with urllib's default request headers; the response is
            # closed as soon as the body has been read.
            with request.urlopen(url) as reply:
                page = reply.read().decode()

            worksheet.write('J' + row, url)

            # Publication date: first pattern that yields a usable timestamp.
            for pattern in date_patterns:
                found = pattern.search(page)
                if found is None:
                    continue
                try:
                    published = time.localtime(int(found.group(1)))
                except (OverflowError, OSError, ValueError):
                    # Timestamp outside the range the platform can convert.
                    continue
                # Built with str.format rather than strftime so the non-ASCII
                # characters never pass through the C locale encoding.
                worksheet.write(
                    'A' + row,
                    "{:02d}月{:02d}日".format(published.tm_mon, published.tm_mday),
                )
                break

            # Title: skipped, like every other field, when the page has none.
            document = etree.HTML(page)
            headings = document.xpath("//h1") if document is not None else []
            if headings and headings[0].text is not None:
                worksheet.write('B' + row, headings[0].text.strip())

            # Staff credits, one column per label.
            for column, pattern in credit_patterns:
                found = pattern.search(page)
                if found is not None:
                    worksheet.write(
                        column + row,
                        found.group(1).replace("&nbsp;", " "),
                    )
    finally:
        # XlsxWriter only produces the file on close(), so close even when a
        # download or parse step raised.
        workbook.close()
# The folder holding the saved listing pages is typed in by the user; the
# workbook is written to the same folder.  (An earlier variant derived the
# folder from this file's own location via os.path; it is disabled.)
# Backslashes are normalised to "/" and a trailing "/" is appended because the
# functions below build file names by plain string concatenation.
path = input("请输入已保存的本地html文件夹名称(如:D:/桌面):").replace("\\", "/") + "/"

# range(1, 2): only the page file "1.html" is read.
getSiteList = GetSiteList(start=1, end=2, path=path)
GetName(siteList=getSiteList, path=path)

print("保存的文件位置在:{}官微每月作品表.xlsx".format(path))
# NOTE(review): presumably this final prompt keeps the console window open
# when the script runs as a packaged exe - confirm.
input("程序运行完毕")

标签:etree,bold,worksheet,py,Excel,爬虫,write,print,import
From: https://www.cnblogs.com/MrFlySand/p/16844153.html

相关文章

  • linux安装python3.10
    1. 下载python包 https://www.python.org/ftp/python/3.10.5/Python-3.10.5.tgz 2. 安装依赖包 yum install -y gcc patch libffi-devel python-devel zlib-devel bzip2-dev......
  • python pip下载依赖到本地和本地安装
    pythonpip下载依赖到本地和本地安装环境:ubuntu18.0.4python3.6pip3list.txt文件内容(需要下载的安装包):certifi==2022.9.24 cffi==1.15.......
  • Java小白翻身-Excel教程
    嗯,先让我们捋一捋思路吧~privatestaticbooleansaveCustomer(ICustomerServicecustomerService){//1、读取文本,获取客户资料List<String>props=customerServ......
  • PyGame做了一个扫雷
     1#这是一个示例Python脚本。23#按⌃R执行或将其替换为您的代码。4#按双击⇧在所有地方搜索类、文件、工具窗口、操作和设置。5importsy......
  • C# .NET 常见DeepCopy 深度拷贝的性能对比
    先上结论MethodMeanErrorStdDevGen0Gen1AllocatedJSONConvert2,273.02ns43.758ns52.091ns0.6599-4160BReflection1,009.13ns10.110ns8.4......
  • Python学习一:基本内容
    文章目录​​一、Python规范​​​​二、Python基本规范​​​​2.1注释​​​​1单行注释​​​​2多行注释​​​​2.2变量​​​​1定义变量名​​​​2输出变量名......
  • Python学习二:字符串
    文章目录​​一、字符串编码转换​​​​1.1使用encode()方法编码​​​​1.2使用encode()方法解码​​​​二、字符串常规操作​​​​2.1拼接字符串​​​​2.2计算字......
  • Python学习八:数据库编程接口
    文章目录​​一、数据库编程接口​​​​1.1连接对象​​​​1.1.1获取连接对象​​​​1.1.2连接对象的方法​​​​2.1游标对象​​一、数据库编程接口1.1连接对象1.......
  • Python学习七:异常处理及调试程序
    文章目录​​一、异常概述​​​​二、异常处理语句​​​​2.1try...except​​​​2.2try...except...else​​​​2.3try...except...finally​​​​2.4......
  • Python学习六:模块
    文章目录​​一、概述​​​​二、自定义模块​​​​2.1创建模块​​​​2.2使用import语句导入模块​​​​2.3使用from...import语句导入模块​​​​2.4模块搜......