import json
import logging
import re
import time

from flask import Flask, jsonify
import requests
from lxml import etree
from redis_ip import redis_ip
import pymongo

app = Flask(__name__)

headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
}

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Connect to MongoDB
client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client["book_database"]
collection = db["bookapi"]


def requests_page(url):
    """Fetch a page through a proxy from the Redis pool and map common error codes to sentinel strings."""
    try:
        logging.info(f"Requesting page: {url}")
        res = requests.get(url=url, headers=headers, proxies=redis_ip.ip(), timeout=10)
        res.encoding = 'utf-8'
        logging.info(f"Page fetched, status code: {res.status_code}")
        if res.status_code == 200:
            return res.text
        if res.status_code == 403:
            return 'anti-crawl triggered'
        if res.status_code == 404:
            return 'no resource'
        if res.status_code == 500:
            return 'source site unstable, please try again later'
        else:
            return None
    except requests.exceptions.RequestException as e:
        logging.error(f"Request failed: {e}")
        return None


# Returns True for strings that are pure digits or contain Chinese characters;
# returns False for English (with or without digits/symbols); otherwise returns "invalid argument".
def is_chinese(text):
    has_chinese = False
    has_number = False
    has_english = False
    has_symbol = False
    for char in text:
        if '\u4e00' <= char <= '\u9fff':
            has_chinese = True
        elif char.isdigit():
            has_number = True
        elif ('a' <= char <= 'z') or ('A' <= char <= 'Z'):
            has_english = True
        else:
            has_symbol = True
    if has_chinese or (has_number and not has_english and not has_symbol):
        return True
    elif has_english or (has_english and has_number) or (has_english and has_number and has_symbol):
        return False
    else:
        # Note: the caller only checks truthiness, so this non-empty string behaves like True.
        return "invalid argument"


def get_isbnsearch_details(isbn, origin_url):
    """Scrape book details from isbnsearch.org."""
    res = requests_page(origin_url)
    if res is None or res == 'anti-crawl triggered' or res == 'no resource' or res == 'source site unstable, please try again later':
        logging.error(f"Failed to fetch the page for ISBN {isbn} (isbnsearch).")
        return None
    try:
        logging.info(f"Parsing the isbnsearch page for ISBN {isbn}")
        tree = etree.HTML(res)
        title = ''.join(tree.xpath('//div[@class="bookinfo"]//h1//text()'))
        img_urls = ''.join(tree.xpath('//div[@class="image"]//img//@src'))
        author = tree.xpath('//div[@class="bookinfo"]//p[3]//text()')[-1]
        binding = tree.xpath('//div[@class="bookinfo"]//p[4]//text()')[-1]
        publisher = tree.xpath('//div[@class="bookinfo"]//p[5]//text()')[-1]
        published_date = tree.xpath('//div[@class="bookinfo"]//p[6]//text()')[-1]
        return {
            'title': title,
            'origin_url': origin_url,
            'img_urls': img_urls,
            'isbn': isbn,
            'author': author,
            'binding': binding,
            'formatted_author': author.strip(),  # format the author field by stripping leading/trailing whitespace
            'publisher': publisher,
            'published_date': published_date,
            'origin': 'isbnsearch',
        }
    except Exception as e:
        logging.error(f"Error while parsing the page for ISBN {isbn} (isbnsearch): {e}")
        time.sleep(10)
        return None


def get_nicebooks_details(isbn, origin_url):
    """Scrape book details from nicebooks.com."""
    res = requests_page(origin_url)
    if res is None or '<title>Search books by ISBN · NiceBooks</title>' in res or res == 'anti-crawl triggered' or res == 'no resource' or res == 'source site unstable, please try again later':
        logging.error(f"Failed to fetch the page for ISBN {isbn} (nicebooks).")
        return None
    try:
        logging.info(f"Parsing the nicebooks page for ISBN {isbn}")
        tree = etree.HTML(res)
        title = ''.join(tree.xpath('//a[@class="title"]//text()'))
        img_urls = ''.join(tree.xpath('//div[@class="small-6 medium-2 columns"]//a//@src'))
        author = ''.join(tree.xpath('//div[@class="medium-10 small-margin-top columns"]//div[2]//text()')).replace('\n', '').replace(' ', '').replace('by', '').strip()
        publisher = ''.join(tree.xpath('//div[@class="medium-10 small-margin-top columns"]//div[4]//text()'))
        published_date = tree.xpath('//div[@class="medium-10 small-margin-top columns"]//div[5]//text()')[0]
        binding = tree.xpath('//div[@class="medium-10 small-margin-top columns"]//div[5]//text()')[1]
        return {
            'title': title,
            'origin_url': origin_url,
            'img_urls': img_urls,
            'isbn': isbn,
            'author': author,
            'binding': binding,
            'formatted_author': author.strip(),
            'publisher': publisher,
            'published_date': published_date,
            'origin': 'nicebooks',
        }
    except Exception as e:
        logging.error(f"Error while parsing the page for ISBN {isbn} (nicebooks): {e}")
        time.sleep(10)
        return None


def get_bookuu_details(isbn, origin_url):
    """Scrape book details from bookuu.com (博库网)."""
    res = requests_page(origin_url)
    if res is None or '<title>图书批发一站式图书批发平台 - 馆配图书平台 - 博库网批发平台</title>' in res or res == 'anti-crawl triggered' or res == 'no resource' or res == 'source site unstable, please try again later':
        logging.error(f"Failed to fetch the page for ISBN {isbn} (bookuu).")
        return None
    try:
        logging.info(f"Parsing the bookuu page for ISBN {isbn}")
        # NOTE: `res` is still raw page text here; the dict-style lookups below assume it has
        # already been parsed into a field dict, which this snippet does not show.
        title = res['title']
        img_urls = res['img_urls']
        author = res['author']
        publisher = res['publisher']
        published_date = res['published_date']
        binding = res['binding']
        return {
            'title': title,
            'origin_url': origin_url,
            'img_urls': img_urls,
            'isbn': isbn,
            'author': author,
            'binding': binding,
            'formatted_author': author.strip(),
            'publisher': publisher,
            'published_date': published_date,
            'origin': '博库网',
        }
    except Exception as e:
        logging.error(f"Error while parsing the page for ISBN {isbn} (bookuu): {e}")
        time.sleep(10)
        return None  # return None (not the raw text) so the caller can fall through to the next source


def get_kongfz_details(isbn, origin_url):
    """Query the kongfz.com search API and take the first result."""
    res = requests_page(origin_url)
    if res is None or res == 'anti-crawl triggered' or res == 'no resource' or res == 'source site unstable, please try again later' or '"totalFound":0' in res:
        logging.error(f"Failed to fetch the page for ISBN {isbn} (kongfz).")
        return None
    try:
        logging.info(f"Parsing the kongfz response for ISBN {isbn}")
        res = json.loads(res)
        second_url_data = res['data']['itemResponse']['list'][0]
        title = second_url_data['title']
        author = second_url_data['author']
        publisher = second_url_data['press']
        published_date = second_url_data['pubDateText']
        img_urls = second_url_data['imgUrl']
        binding = second_url_data['tplRecords']['binding'][-1]['value']
        return {
            'title': title,
            'origin_url': origin_url,
            'img_urls': img_urls,
            'isbn': isbn,
            'author': author,
            'binding': binding,
            'formatted_author': author.strip(),
            'publisher': publisher,
            'published_date': published_date,
            'origin': '孔夫子旧书网',
        }
    except Exception as e:
        logging.error(f"Error while parsing the page for ISBN {isbn} (kongfz): {e}")
        time.sleep(10)
        return None


def get_douban_details(isbn, origin_url):
    """Search douban.com for the ISBN, follow the first result, and parse the detail page."""
    res = requests_page(origin_url)
    if res is None or res == 'anti-crawl triggered' or res == 'no resource' or res == 'source site unstable, please try again later' or '读书 - 豆瓣搜索</title>' in res:
        logging.error(f"Failed to fetch the page for ISBN {isbn} (douban).")
        return None
    try:
        logging.info(f"Parsing the douban page for ISBN {isbn}")
        second_url = ''.join(re.findall('"url": "(.*?)"}], "report"', res))
        second_html = requests_page(second_url)
        second_tree = etree.HTML(second_html)
        all_msg = ''.join(second_tree.xpath('//div[@id="info"]//text()')).replace('\n', '').strip()
        all_msg = re.sub(r'\s+', ' ', all_msg)
        title = second_tree.xpath('//div[@id="wrapper"]//h1//text()')[1]
        img_urls = ''.join(second_tree.xpath(f'//img[@alt="{title}"]//@src'))
        author = ''.join(re.findall('作者: (.*?) ', all_msg))
        publisher = ''.join(re.findall('出版社: (.*?) ', all_msg))
        fanyi = ''.join(re.findall('译者: (.*?) ', all_msg))          # translator
        fubiaoti = ''.join(re.findall('副标题: (.*?) ', all_msg))      # subtitle
        yuanzuoming = ''.join(re.findall('原作名: (.*?) ', all_msg))   # original title
        published_date = ''.join(re.findall('出版年: (.*?) ', all_msg))
        yeshu = ''.join(re.findall('页数: (.*?) ', all_msg))           # page count
        price = ''.join(re.findall('定价: (.*?) ', all_msg))
        binding = ''.join(re.findall('装帧: (.*?) ', all_msg))
        return {
            'title': title,
            'origin_url': origin_url,
            'img_urls': img_urls,
            'isbn': isbn,
            'author': author,
            'binding': binding,
            'formatted_author': author.strip(),
            'publisher': publisher,
            'fanyi': fanyi,
            'fubiaoti': fubiaoti,
            'yuanzuoming': yuanzuoming,
            'yeshu': yeshu,
            'price': price,
            'published_date': published_date,
            'origin': '豆瓣网',
        }
    except Exception as e:
        logging.error(f"Error while parsing the page for ISBN {isbn} (douban): {e}")
        time.sleep(10)
        return None


@app.route('/bookapi/<isbn>')
def get_book_info(isbn):
    # Look the book up in the database first
    book_data = None
    try:
        book_data = collection.find_one({"isbn": isbn})
    except Exception as db_error:
        logging.error(f"Database query failed: {db_error}")
    if book_data:
        logging.info(f"Returning book info for ISBN {isbn} from the database")
        return_data = {
            key: book_data.get(key, '') for key in [
                "title", "origin_url", "product_descritpion", "author", "publisher", "published_date",
                "img_urls", "isbn", "binding", "format", "isItASet", "paper", "classification", "type",
                "content_description", "author_introduction", "catalogue", "preface", "online_trial_reading",
                "media_comments", "book_url", "time", "seriestitle", "isbn10", "price", "genus", "levelNum",
                "heatNum", "edition", "yinci", "language", "keyword", "fanyi", "fubiaoti", "yuanzuoming",
                "yeshu", "origin"
            ]
        }
        return jsonify(return_data)
    else:
        book_details = None
        if is_chinese(str(isbn)):
            print('Keyword is pure digits or contains Chinese characters: using the Chinese sources, led by bookuu')
            origin_url_list = [
                f'https://pifa.bookuu.com/search?keyword={isbn}&flag=1',
                f'https://search.douban.com/book/subject_search?search_text={isbn}&cat=1001',
                f'https://search.kongfz.com/pc-gw/search-web/client/pc/product/keyword/list?dataType=0&keyword={isbn}&page=1&userArea=1001000000',
                f'https://isbnsearch.org/isbn/{isbn}',
                f'https://us.nicebooks.com/search/isbn?isbn={isbn}'
            ]
        else:
            print('English keyword: using the English sources, led by nicebooks')
            origin_url_list = [
                f'https://us.nicebooks.com/search/isbn?isbn={isbn}',
                f'https://isbnsearch.org/isbn/{isbn}',
                f'https://pifa.bookuu.com/search?keyword={isbn}&flag=1',
                f'https://search.douban.com/book/subject_search?search_text={isbn}&cat=1001',
                f'https://search.kongfz.com/pc-gw/search-web/client/pc/product/keyword/list?dataType=0&keyword={isbn}&page=1&userArea=1001000000',
            ]
        for origin_url in origin_url_list:
            try:
                if origin_url == f'https://isbnsearch.org/isbn/{isbn}':
                    book_details = get_isbnsearch_details(isbn, origin_url)
                    if book_details is not None:
                        book_details['origin_url'] = origin_url
                        break
                if origin_url == f'https://us.nicebooks.com/search/isbn?isbn={isbn}':
                    book_details = get_nicebooks_details(isbn, origin_url)
                    if book_details is not None:
                        book_details['origin_url'] = origin_url
                        break
                if origin_url == f'https://search.kongfz.com/pc-gw/search-web/client/pc/product/keyword/list?dataType=0&keyword={isbn}&page=1&userArea=1001000000':
                    book_details = get_kongfz_details(isbn, origin_url)
                    if book_details is not None:
                        book_details['origin_url'] = origin_url
                        break
                if origin_url == f'https://pifa.bookuu.com/search?keyword={isbn}&flag=1':
                    book_details = get_bookuu_details(isbn, origin_url)
                    if book_details is not None:
                        book_details['origin_url'] = origin_url
                        break
                if origin_url == f'https://search.douban.com/book/subject_search?search_text={isbn}&cat=1001':
                    book_details = get_douban_details(isbn, origin_url)
                    if book_details is not None:
                        book_details['origin_url'] = origin_url
                        break
            except Exception as func_error:
                logging.error(f"Error while handling URL {origin_url}: {func_error}")
        if book_details is None:
            return jsonify({"error": "Book information not found."}), 404
        else:
            print(f'book_details:{book_details}')
            # Store the freshly scraped book info in the database
            try:
                collection.insert_one(book_details)
                logging.info(f"Stored book info for ISBN {isbn} in the database")
            except Exception as insert_error:
                logging.error(f"Database insert failed: {insert_error}")
            return_data = {
                key: book_details.get(key, '') for key in [
                    "title", "origin_url", "product_descritpion", "author", "publisher", "published_date",
                    "img_urls", "isbn", "binding", "format", "isItASet", "paper", "classification", "type",
                    "content_description", "author_introduction", "catalogue", "preface", "online_trial_reading",
                    "media_comments", "book_url", "time", "seriestitle", "isbn10", "price", "genus", "levelNum",
                    "heatNum", "edition", "yinci", "language", "keyword", "fanyi", "fubiaoti", "yuanzuoming",
                    "yeshu", "origin"
                ]
            }
            return jsonify(return_data)


if __name__ == '__main__':
    app.run()