#!/usr/local/bin/python3
# -*- encoding: utf-8 -*-
import glob
import os

import requests
from lxml import etree
from PIL import Image

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/105.0.0.0 Safari/537.36 Edg/105.0.1343.53"
}


def get_doc_url(url):
    """Fetch the album page and return the relative link of every paper listed on it."""
    try:
        response = requests.get(url=url, headers=HEADERS)
        if response.status_code == 200:
            all_doc_page = etree.HTML(response.text)
            doc_url = all_doc_page.xpath("//div[@class='list-bd']//div[@class='title fl']/a/@href")
            return doc_url
    except Exception as e:
        print("Request failed. Please check the URL and your network connection.")
        print(e)


def jpg2pdf(doc_id):
    """Merge the downloaded page images of one paper into a single PDF."""
    doc_dir = os.path.join("output", doc_id)
    # Collect the page images in numeric order (they are saved as <page>.jpg).
    jpg_paths = sorted(glob.glob(os.path.join(doc_dir, "*.jpg")),
                       key=lambda p: int(os.path.splitext(os.path.basename(p))[0]))
    image_list = []
    for image_path in jpg_paths:
        img = Image.open(image_path)
        if img.mode != "RGB":
            img = img.convert("RGB")
        image_list.append(img)
    if not image_list:
        print(f"{doc_id}: no images found, skipping PDF conversion.")
        return
    pdf_path = os.path.join(doc_dir, f"{doc_id}.pdf")
    image_list[0].save(pdf_path, "PDF", resolution=100.0, save_all=True,
                       append_images=image_list[1:])
    if os.path.exists(pdf_path):
        # Conversion succeeded: remove the source jpg files.
        print(f"{doc_id}: converted to PDF successfully.")
        for image_path in jpg_paths:
            os.remove(image_path)
        print(f"{doc_id}: source images deleted.")
    else:
        # Conversion failed.
        print(f"{doc_id}: PDF conversion failed, please check.")


def get_jpg(doc_url):
    """Download every page image of every paper, then convert each paper to a PDF."""
    for idx, link in enumerate(doc_url):
        url = "https://www.51jiaoxi.com/" + link[1:]
        response = requests.get(url=url, headers=HEADERS)
        html = etree.HTML(response.text)
        doc_id = url.split("-")[1].split(".")[0]
        # The preview images reveal the OSS URL pattern and the starting page number.
        jpg_url = html.xpath("//div[@class='img-box']/img/@src")
        jpg_url_split = jpg_url[0].split("/")
        jpg_url_prefix = f"https://{jpg_url_split[2]}/{jpg_url_split[3]}/{jpg_url_split[4]}/{jpg_url_split[5]}"
        start_page_num = int(jpg_url[0].split(".")[2].split("/")[-1])
        # Visible preview pages plus the "N more pages" counter give the total page count.
        show_page_num = len(jpg_url)
        no_show_page_num = html.xpath("//div[@class='remain-previews-inner']/span/span/text()")[0]
        all_page_num = int(show_page_num) + int(no_show_page_num)
        os.makedirs(os.path.join("output", doc_id), exist_ok=True)
        print(f"\nDownloading paper {doc_id}...")
        for page in range(start_page_num, all_page_num):
            page_url = f"{jpg_url_prefix}/0/{page}.jpg?x-oss-process=image/crop,h_1044,g_center/format,webp"
            response = requests.get(url=page_url, headers=HEADERS)
            with open(os.path.join("output", doc_id, f"{page}.jpg"), "wb") as f:
                f.write(response.content)
        print("Download finished.")
        print("Converting to PDF...")
        jpg2pdf(doc_id)
        print(f"Done: {idx + 1}/{len(doc_url)}")
    print("\nAll papers downloaded.\n")


def main():
    url = input("\nEnter the exam-set album URL: ")
    while "album" not in url:
        print("Not an album URL, please try again.")
        url = input("\nEnter the exam-set album URL: ")
    doc_url = get_doc_url(url)
    if doc_url:
        get_jpg(doc_url)
    else:
        print("Could not retrieve any paper links; please check that the URL is correct.")


if __name__ == "__main__":
    main()
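For reference, here is a minimal standalone sketch of the Pillow JPG-to-PDF step that jpg2pdf relies on, separated from the scraping logic. The "pages" folder and the numbered file names below are illustrative assumptions, not paths produced by the script above.

# Standalone sketch: merge numbered page images into one PDF with Pillow.
# The "pages/" directory and the 1.jpg, 2.jpg, ... naming are assumed for illustration.
import glob
import os
from PIL import Image

jpg_paths = sorted(glob.glob(os.path.join("pages", "*.jpg")),
                   key=lambda p: int(os.path.splitext(os.path.basename(p))[0]))
images = [Image.open(p).convert("RGB") for p in jpg_paths]  # PDF pages must be RGB
images[0].save(os.path.join("pages", "combined.pdf"), "PDF",
               save_all=True, append_images=images[1:])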