import base64
import hashlib
import os
import re
from urllib.parse import urljoin, urlparse

import requests
import urllib3
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
# Silence urllib3's InsecureRequestWarning (resources are fetched with verify=False).
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Directory that receives the copied page and its resources.
save_dir = 'website_copy'
os.makedirs(save_dir, exist_ok=True)

# Path to the ChromeDriver executable.
driver_path = 'chromedriver-win64/chromedriver.exe'

# Page to copy.
url = 'https://example.com/'

# Selenium 4 receives the driver location through a Service object; the old
# `executable_path` keyword was removed in Selenium 4.10 and raises TypeError.
driver = webdriver.Chrome(service=Service(driver_path))
try:
    driver.get(url)
    # HTML of the page as currently held by the browser.
    page_source = driver.page_source
    # URL the browser reports for the loaded page; used to resolve relative links.
    base_url = driver.current_url
finally:
    # Always shut the browser down, even when loading the page failed,
    # so no Chrome/ChromeDriver process is left behind.
    driver.quit()
# Matches base64-encoded data URLs: group 1 is the media type (possibly with
# parameters), group 2 is the base64 payload.
_DATA_URL_RE = re.compile(r'data:(.*?);base64,(.*)')


def _data_url_extension(mime_type):
    """Return a file extension for a data-URL media type (``image/svg+xml`` -> ``svg``)."""
    # Drop parameters (";charset=...") and structured-syntax suffixes ("+xml").
    subtype = mime_type.split(';')[0].split('/')[-1].split('+')[0].strip().lower()
    # Keep only characters that are harmless in a file name.
    subtype = re.sub(r'[^a-z0-9.-]', '', subtype)
    return subtype or 'bin'


def _relative_resource_path(url_path):
    """Turn a URL path into a POSIX-style path relative to the save directory."""
    # Empty, "." and ".." segments are dropped so the result can never climb
    # out of the save directory; backslashes are treated as separators too.
    segments = [
        segment
        for segment in url_path.replace('\\', '/').split('/')
        if segment not in ('', '.', '..')
    ]
    # A directory-like URL has no file name of its own.
    if not segments or url_path.endswith('/'):
        segments.append('index.html')
    return '/'.join(segments)


def _save_data_url(url, save_dir):
    """Decode a base64 data URL into ``save_dir``; return the file name or None."""
    match = _DATA_URL_RE.match(url)
    if not match:
        # Not base64-encoded: nothing is written and the caller keeps the URL inline.
        return None
    try:
        payload = base64.b64decode(match.group(2))
        # Name the file after its content so different embedded resources do
        # not overwrite each other and identical ones share a single file.
        digest = hashlib.sha256(payload).hexdigest()[:16]
        file_name = f"embedded_image_{digest}.{_data_url_extension(match.group(1))}"
        os.makedirs(save_dir, exist_ok=True)
        with open(os.path.join(save_dir, file_name), 'wb') as file:
            file.write(payload)
    except (ValueError, OSError) as e:  # invalid base64 or file-system failure
        print(f"Error saving embedded data URL: {e}")
        return None
    return file_name


def _fetch_to_disk(url, save_dir, timeout):
    """Download ``url`` below ``save_dir``; return the relative path or None."""
    relative_path = _relative_resource_path(urlparse(url).path)
    root = os.path.abspath(save_dir)
    file_path = os.path.abspath(os.path.join(root, *relative_path.split('/')))
    # Defence in depth: URL paths come from the remote page and are untrusted.
    if not file_path.startswith(os.path.join(root, '')):
        print(f"Error downloading {url}: refusing to write outside {save_dir}")
        return None

    # Retry transient server errors with exponential backoff.
    retry = Retry(total=5, backoff_factor=0.3, status_forcelist=[500, 502, 503, 504])
    adapter = HTTPAdapter(max_retries=retry)
    try:
        # The context manager closes the session's connection pool.
        with requests.Session() as session:
            session.mount('https://', adapter)
            session.mount('http://', adapter)
            # NOTE(security): certificate verification is disabled on purpose
            # (as in the original script); enable it for untrusted networks.
            response = session.get(url, verify=False, timeout=timeout)
        if response.status_code != 200:
            print(f"Error downloading {url}: HTTP {response.status_code}")
            return None
        os.makedirs(os.path.dirname(file_path), exist_ok=True)
        with open(file_path, 'wb') as file:
            file.write(response.content)
    except (requests.RequestException, OSError, ValueError) as e:
        print(f"Error downloading {url}: {e}")
        return None
    return relative_path


def download_file(url, save_dir, timeout=30):
    """Download a resource into ``save_dir``, keeping the URL's directory layout.

    Args:
        url: Absolute ``http(s)`` URL, or a base64 ``data:`` URL.
        save_dir: Directory that receives the file; created when missing.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The saved file's path relative to ``save_dir``, written with forward
        slashes so it can be used as a link from an HTML file stored in
        ``save_dir``. Returns ``None`` when nothing was saved (non-base64 data
        URL, HTTP status other than 200, network or file-system error); the
        reason is printed.
    """
    if url.startswith('data:'):
        return _save_data_url(url, save_dir)
    return _fetch_to_disk(url, save_dir, timeout)
# Parse the HTML captured from the browser.
soup = BeautifulSoup(page_source, 'html.parser')

# Attribute that carries the resource URL for each kind of tag being copied
# (stylesheets, scripts and images).
resource_attrs = {'link': 'href', 'script': 'src', 'img': 'src'}
# Result of download_file per absolute URL, so a resource referenced several
# times on the page is fetched only once.
local_paths = {}

for tag in soup.find_all(list(resource_attrs)):
    attr = resource_attrs[tag.name]
    reference = tag.get(attr)
    if not reference:
        # Inline <script>, or a tag without a URL: nothing to download.
        continue
    if tag.name == 'link':
        # `rel` is multi-valued (e.g. rel="preload stylesheet") and
        # case-insensitive, so test for membership instead of exact equality.
        rel_values = tag.get('rel') or []
        if isinstance(rel_values, str):
            rel_values = rel_values.split()
        if 'stylesheet' not in [value.lower() for value in rel_values]:
            continue
    resource_url = urljoin(base_url, reference)
    if resource_url not in local_paths:
        local_paths[resource_url] = download_file(resource_url, save_dir)
    local_path = local_paths[resource_url]
    if local_path:
        # Point the tag at the local copy; keep the original URL on failure.
        tag[attr] = local_path

# Save the rewritten HTML into the same directory as the downloaded resources.
with open(os.path.join(save_dir, 'index.html'), 'w', encoding='utf-8') as file:
    file.write(str(soup))
print("Website resources have been successfully copied and saved to:", save_dir)
# Tags: Python, Selenium, url, tag, file, path, copy, save, dir
# From: https://www.cnblogs.com/felixwan/p/18362794