首页 > 编程语言 >Python 爬取单个网页所需要加载的URL地址和CSS、JS文件地址

Python 爬取单个网页所需要加载的URL地址和CSS、JS文件地址

时间:2022-11-28 11:34:06浏览次数:39  
标签:re Python JS url 地址 result import print path

直接上代码:

脱敏后自用的 Python 采集代码。

#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
@author:Andy
@file:xxx.py
@time:下午05:50
@desc:采集的文章数据进博客
"""
import os
import re
import time
import urllib
import urllib.error
import urllib.request
from hashlib import md5
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup, SoupStrainer
from requests.exceptions import RequestException

# Browser-like User-Agent shared by the requests-based fetches below,
# so the target site serves the same markup it gives a real browser.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
}


def get_content():
    """Yield {'url': ..., 'name': ...} dicts for each attachment link on the page.

    Fetches the question page, strips the paperclip icon markup, then scans
    every <a> tag for hrefs pointing at the attachment file host.

    Fixes over the original: the regex is now a raw string (``'\\s'`` in a
    plain string only works by accident and raises a DeprecationWarning),
    and the loop no longer shadows the page ``url`` variable.
    """
    page_url = 'http://ask.xxxx.com/question/xxxx'  # url
    response = requests.get(page_url, headers=headers).text.replace('<i class="fa fa-paperclip"></i>', '')
    soup = BeautifulSoup(response, 'lxml')
    # Raw string so \s is interpreted by the regex engine, not Python.
    pattern = re.compile(r'<a\shref="(http://ask.apelearn.com/file.*?)".*?>(.*?)</a>', re.S)
    for anchor in soup.find_all('a'):
        for link, name in re.findall(pattern, str(anchor)):
            yield {
                'url': link,
                'name': name
            }


def mkdir(path):
    """Create directory `path` (and any missing parents) if it is absent.

    Leading/trailing spaces and a trailing backslash are stripped first.
    Returns True when the directory was created, False when it already existed.
    """
    # Normalise the path: trim surrounding spaces and a trailing "\".
    cleaned = path.strip().rstrip("\\")
    if os.path.exists(cleaned):
        # Already there — report and leave it untouched.
        print(cleaned + ' 目录已存在')
        return False
    os.makedirs(cleaned)
    print(cleaned + ' 创建成功')
    return True

def getUrl(html):
    """Return the <img src> then <script src> URLs found in raw HTML."""
    img_srcs = re.findall('<img src="(.*?)"', html, re.S)
    script_srcs = re.findall('<script src="(.*?)"', html, re.S)
    # Images first, scripts appended — same order the caller expects.
    return img_srcs + script_srcs

def getCssUrl(html):
    """Return every <link href> URL (stylesheet candidates) in raw HTML."""
    return re.findall('<link href="(.*?)"', html, re.S)

# Download one web page and mirror it (plus its assets) locally.
def download_html(root_path, url):
    """Download `url`, mirror its images/JS/CSS under `root_path`, rewrite
    host references, and save the page (".html" appended when the URL path
    has no such suffix).

    Side effects: issues HTTP requests for the page and every referenced
    asset; creates directories and files under `root_path`.

    Fixes over the original: the response was bound to a name ``re``,
    shadowing the `re` module inside this function; the destination
    directory is now created before the HTML file is written.
    """
    parsed = urlparse(url)
    file_path = parsed.path
    file_name = os.path.basename(file_path)
    _, file_suffix = os.path.splitext(file_name)
    # Force an .html suffix so the mirrored page opens in a browser.
    if file_suffix != '.html':
        file_name_real = file_name + '.html'
    else:
        file_name_real = file_name
    file_path_real = file_path.replace(file_name, '')
    # Windows-style separators for the local mirror tree.
    file_path_reals = file_path_real.replace('/', "\\")
    all_file_path_real = root_path + file_path_reals + file_name_real
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    resp = requests.get(url, headers=headers)
    resp.encoding = "utf-8"

    # Mirror every image/script the page references.
    for asset_url in getUrl(resp.text):
        download_commonimgjs(root_path, asset_url)

    # Mirror every stylesheet.
    for css_url in getCssUrl(resp.text):
        download_css(root_path, css_url)

    # Point absolute links at the mirror host.
    new_text = resp.text.replace('https://www.xxxxxx.com', 'http://www.xxxxx.com')
    new_texts = new_text.replace('xxxxxx.com', '3cinno.shanhubei.com')
    # Ensure the destination directory exists before writing the page.
    mkdir(root_path + file_path_reals)
    with open(all_file_path_real, "w+", encoding="utf-8") as html_file:
        html_file.write(new_texts)

def download_commonimgjs(root_path, url):
    """Mirror one image/JS asset under `root_path`, skipping files already saved.

    URLs with a leading "/" are resolved against the source host.
    An HTTP error is reported ('error') but does not abort the crawl.
    """
    full_url = ("https://www.xxxxxx.com" + url) if url[:1] == "/" else url
    asset_path = urlparse(full_url).path
    asset_name = os.path.basename(asset_path)
    _, _suffix = os.path.splitext(asset_name)
    # Local directory mirrors the URL path, with Windows separators.
    local_dir = root_path + asset_path.replace(asset_name, '').replace('/', "\\")
    # Downloaded earlier in this crawl: nothing to do.
    if os.path.isfile(local_dir + asset_name):
        return
    mkdir(local_dir)
    try:
        opener = urllib.request.build_opener()
        opener.addheaders = [('User-agent',
                              'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36')]
        urllib.request.install_opener(opener)
        urllib.request.urlretrieve(full_url, local_dir + asset_name)
    except urllib.error.HTTPError:
        print('error')



def download_img(root_path, url):
    """Download a single image into the mirrored tree under `root_path`.

    Unlike download_commonimgjs, this does not skip existing files and
    lets HTTP errors propagate.
    """
    remote = "https://www.xxxxxx.com" + url if url[:1] == "/" else url
    path_part = urlparse(remote).path
    base_name = os.path.basename(path_part)
    _, _ext = os.path.splitext(base_name)
    save_dir = root_path + path_part.replace(base_name, '').replace('/', "\\")
    # Create the mirror directory, then fetch with a browser User-Agent.
    mkdir(save_dir)
    opener = urllib.request.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36')]
    urllib.request.install_opener(opener)
    urllib.request.urlretrieve(remote, save_dir + base_name)

def download_js(root_path, url):
    """Download a single JS file into the mirrored tree under `root_path`.

    Same shape as download_img: no existing-file check, HTTP errors propagate.
    """
    remote = "https://www.xxxxxx.com" + url if url[:1] == "/" else url
    path_part = urlparse(remote).path
    base_name = os.path.basename(path_part)
    _, _ext = os.path.splitext(base_name)
    save_dir = root_path + path_part.replace(base_name, '').replace('/', "\\")
    # Create the mirror directory, then fetch with a browser User-Agent.
    mkdir(save_dir)
    opener = urllib.request.build_opener()
    opener.addheaders = [('User-agent',
                          'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36')]
    urllib.request.install_opener(opener)
    urllib.request.urlretrieve(remote, save_dir + base_name)

def download_css(root_path, url):
    """Mirror one stylesheet under `root_path`; non-.css links are ignored.

    Skips files already saved; an HTTP error prints 'error' instead of raising.
    """
    css_url = ("https://www.xxxxxx.com" + url) if url[:1] == "/" else url
    path_part = urlparse(css_url).path
    base_name = os.path.basename(path_part)
    if os.path.splitext(base_name)[1] != '.css':
        # <link href> also matches favicons etc. — only fetch stylesheets.
        return
    save_dir = root_path + path_part.replace(base_name, '').replace('/', "\\")
    if os.path.isfile(save_dir + base_name):
        # Already mirrored earlier in the crawl.
        return
    mkdir(save_dir)
    try:
        opener = urllib.request.build_opener()
        opener.addheaders = [('User-agent',
                              'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36')]
        urllib.request.install_opener(opener)
        urllib.request.urlretrieve(css_url, save_dir + base_name)
    except urllib.error.HTTPError:
        print('error')

def get_xml():
    """Print every article URL listed in the site's XML sitemap.

    Bug fix: the original pattern ended with the lazy ``\\S*?``, which always
    matches zero characters, so every findall hit was just the bare host
    prefix. The dots are now escaped and the path is captured up to the
    closing tag / quote / whitespace.
    """
    url = 'https://www.xxxxxx.com/sitemap-1.xml'
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    res = requests.get(url, headers=headers)
    res.encoding = "utf-8"
    # Adapt this pattern to your own article URL format if it differs.
    r = re.compile(r'https://www\.xxxxxx\.com/[^<"\s]*')
    for i in re.findall(r, res.text):
        print(i)


def main():
    """Driver: only the sitemap dump is active; the mirroring helpers are
    kept below as disabled usage examples."""
    # Sample inputs retained for the disabled steps below.
    url = r'https://www.xxxxxx.com/uploads/20218080/logo202107221507387902092.png'
    # Root of the local mirror tree.
    root_path = "F:\\Project-cz\\shanhubei\\3cinno"
    cssurl = r'https://www.xxxxxx.com/images/m184/black/style.css'
    # Disabled examples:
    #   download_img(root_path, url)
    #   download_html(root_path, r'https://www.xxxxxx.com/3d-clear-led-dmx-ball')
    #   download_css(root_path, cssurl)
    #   demo('https://www.xxxxxx.com/Content/kcim/js/layim-public.js?t=20190404')
    get_xml()


def demo(url):
    """Debug helper: print the scheme, host, path, basename, and suffix of `url`."""
    parts = urlparse(url)
    print(parts.scheme)
    print(parts.hostname)
    print('a.file_path=' + parts.path)
    base = os.path.basename(parts.path)
    print('file_name=' + base)
    suffix = os.path.splitext(base)[1]
    print('a.file_suffix=' + suffix)





# Run the crawler only when executed as a script, not on import.
if __name__ == '__main__':
    main()

 来源:http://www.shanhubei.com/archives/2491.html

标签:re,Python,JS,url,地址,result,import,print,path
From: https://www.cnblogs.com/shanhubei/p/16931746.html

相关文章

  • C++ ---获取类成员函数地址
    #include<iostream>classTA{public:inta;voidTA1(){//this->a=5;printf("a=%d\n",a);}voidTA2(){......
  • GIS---WKT、WKB与GeoJSON
    WKT与WKB是OGC中的简单服务标准SFS(SimpleFeaturesInterfaceStandard),但是GeoJSON并不是OGC中的标准。​WKT(Well-knowntext)是开放地理空间联盟OGC(OpenGISConsortium......
  • 手把手教你使用Python实现一键抠图,照片换背景so easy!
    引言大家好我是迷彩.平时我们工作或者生活中(比如应聘的简历)可能会用到蓝底,红底,白底各种相片,可能当时我们只有一种背景颜色的相片,但是我们又没有时间去拍一张,关键还需......
  • 用python SMTP发送简单邮件
    pythonSMTP发送邮件SMTP(SimpleMailTransferProtocol)即简单邮件传输协议它是一组由源地址到目的地址传送邮件得规则,由它来控制信件的中转方式。Python创建SMTP对......
  • js没有块级作用域
    JavaScript没有块级作用域。在其他语言上,比如C语言中,有花括号封闭的代码块都有自己的作用域,(如果用ECMAScript的话来讲,就是他们自己的执行环境),因而支持根据条件来定义变量......
  • python3数据类型
    1. 数字类型 Python数字类型主要包括int(整型)和float(浮点型) int(整型) 在32位机器上,整数的位数是32位,取值范围是-231~231-1,即-2147483648~214748364;在64位系统上,整......
  • 树莓派安装CV2 for python3过程
     pipinstallopencv-python安装完测试   尝试修复,按照查的资料,依次安装依赖包sudoapt-getinstalllibcblas-dev又入新坑  系统建议装Base的包su......
  • Python PyDirectInput
    pipinstallpydirectinputimportpydirectinputpydirectinput.moveTo(100,150)#移动鼠标至坐标100,150pydirectinput.click()#点击鼠标左键pydirectinput.click(2......
  • 不背锅运维:Grafana的自动登入(Go和Python分别实现)
    1.实现目标想要达到的目标是:当在浏览器向http://192.168.11.254:3090/auto_login这个地址发起GET请求后能够自动登入Grafana2.实现思路需要额外开发一个API处理来自......
  • PYTHON 数据类型
    1.1数据类型数据都有类型,python的标准类型:数字:int,float,complex,bool字符串:str列表:list元组:tuple集合:set字典:dict可以用type()函数识别数据类型......