首页 > 其他分享 >从不同书站采集书籍

从不同书站采集书籍

时间:2024-10-12 18:43:21 浏览次数:1
标签:origin isbn url 书站 采集 book details res 书籍

import json
import logging
import re
import time
from flask import Flask, jsonify
import requests
from lxml import etree
from redis_ip import redis_ip
import pymongo

# Flask application object; the /bookapi route below is registered on it.
app = Flask(__name__)

# Headers sent with every outbound page request (only a desktop Chrome
# user-agent string is set).
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
}

# Configure logging: INFO level, "timestamp - level - message" format.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Connect to the local MongoDB server; book records are read from and
# written to the "bookapi" collection of "book_database".
client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client["book_database"]
collection = db["bookapi"]


def requests_page(url):
    """Fetch *url* and return its body, a failure marker, or None.

    The request goes out with the module-level ``headers``, the proxy
    mapping returned by ``redis_ip.ip()``, and a 10 second timeout.

    Returns:
        The response text (decoded as UTF-8) for HTTP 200; one of the fixed
        marker strings for HTTP 403 / 404 / 500; ``None`` for any other
        status or when ``requests`` raises a ``RequestException``.
    """
    # Marker strings handed back for the statuses callers know how to detect.
    status_markers = {
        403: '触发反爬',
        404: '没有资源',
        500: '源网址信号不稳定,请稍后再试',
    }
    try:
        logging.info(f"正在请求页面: {url}")
        response = requests.get(url=url, headers=headers, proxies=redis_ip.ip(), timeout=10)
        response.encoding = 'utf-8'
        status = response.status_code
        logging.info(f"成功获取页面内容,状态码: {status}")
        if status == 200:
            return response.text
        # Unlisted statuses fall through to None.
        return status_markers.get(status)
    except requests.exceptions.RequestException as e:
        logging.error(f"请求失败:{e}")
        return None


def is_chinese(text):
    """Classify a search term by the kinds of characters it contains.

    Each character is bucketed as CJK (U+4E00..U+9FFF), digit, ASCII letter,
    or symbol (anything else).

    Returns:
        ``True`` when the text contains a CJK character or consists solely
        of digits; ``False`` when it contains an ASCII letter and no CJK
        character; the string ``"参数错误"`` in every remaining case (empty
        text, or digits/symbols without letters).
    """
    def kind_of(ch):
        # Order matters: the first matching bucket wins.
        if '\u4e00' <= ch <= '\u9fff':
            return 'chinese'
        if ch.isdigit():
            return 'number'
        if ('a' <= ch <= 'z') or ('A' <= ch <= 'Z'):
            return 'english'
        return 'symbol'

    kinds = {kind_of(ch) for ch in text}

    if 'chinese' in kinds or kinds == {'number'}:
        return True
    if 'english' in kinds:
        return False
    return "参数错误"


def get_isbnsearch_details(isbn, origin_url):
    """Fetch *origin_url* and parse it as an isbnsearch book page.

    Args:
        isbn: Value echoed into the result and the log messages.
        origin_url: Page to fetch via ``requests_page``.

    Returns:
        A dict of book fields with ``origin`` set to ``'isbnsearch'``, or
        ``None`` when the page could not be fetched or parsing raised.
    """
    page = requests_page(origin_url)
    fetch_failures = ('触发反爬', '没有资源', '源网址信号不稳定,请稍后再试')
    if page is None or page in fetch_failures:
        logging.error(f"无法获取 ISBN 为 {isbn} 的书籍信息页面内容(isbnsearch)。")
        return None

    try:
        logging.info(f"正在解析 isbnsearch 网站 ISBN 为 {isbn} 的页面内容")
        tree = etree.HTML(page)

        def last_text(position):
            # Last text node of the position-th <p> inside the bookinfo div;
            # raises IndexError when that paragraph has no text.
            return tree.xpath(f'//div[@class="bookinfo"]//p[{position}]//text()')[-1]

        title = ''.join(tree.xpath('//div[@class="bookinfo"]//h1//text()'))
        img_urls = ''.join(tree.xpath('//div[@class="image"]//img//@src'))
        author = last_text(3)
        binding = last_text(4)
        publisher = last_text(5)
        published_date = last_text(6)

        details = {
            'title': title,
            'origin_url': origin_url,
            'img_urls': img_urls,
            'isbn': isbn,
            'author': author,
            'binding': binding,
            # Author with leading/trailing whitespace removed.
            'formatted_author': author.strip(),
            'publisher': publisher,
            'published_date': published_date,
            'origin': 'isbnsearch',
        }
        return details
    except Exception as e:
        logging.error(f"处理 ISBN 为 {isbn} 的页面时出现错误(isbnsearch):{e}")
        # Pause for 10 seconds before reporting the failure.
        time.sleep(10)
        return None


def get_nicebooks_details(isbn, origin_url):
    """Fetch *origin_url* and parse it as a NiceBooks search-result page.

    Args:
        isbn: Value echoed into the result and the log messages.
        origin_url: Page to fetch via ``requests_page``.

    Returns:
        A dict of book fields with ``origin`` set to ``'nicebooks'``, or
        ``None`` when the fetch failed, the response is the plain search
        page (detected by its ``<title>``), or parsing raised.
    """
    page = requests_page(origin_url)
    fetch_failures = ('触发反爬', '没有资源', '源网址信号不稳定,请稍后再试')
    if (
        page is None
        or '<title>Search books by ISBN · NiceBooks</title>' in page
        or page in fetch_failures
    ):
        logging.error(f"无法获取 ISBN 为 {isbn} 的书籍信息页面内容(nicebooks)。")
        return None

    try:
        logging.info(f"正在解析 nickbooks 网站 ISBN 为 {isbn} 的页面内容")
        tree = etree.HTML(page)
        info = '//div[@class="medium-10 small-margin-top columns"]'

        title = ''.join(tree.xpath('//a[@class="title"]//text()'))
        img_urls = ''.join(tree.xpath('//div[@class="small-6 medium-2 columns"]//a//@src'))

        # Author text: drop newlines, then every space, then every "by".
        author = ''.join(tree.xpath(info + '//div[2]//text()'))
        for unwanted in ('\n', ' ', 'by'):
            author = author.replace(unwanted, '')
        author = author.strip()

        publisher = ''.join(tree.xpath(info + '//div[4]//text()'))

        # The fifth div supplies the date (first text node) and the binding
        # (second text node); a missing node raises IndexError.
        fifth_div_texts = tree.xpath(info + '//div[5]//text()')
        published_date = fifth_div_texts[0]
        binding = fifth_div_texts[1]

        details = {
            'title': title,
            'origin_url': origin_url,
            'img_urls': img_urls,
            'isbn': isbn,
            'author': author,
            'binding': binding,
            'formatted_author': author.strip(),
            'publisher': publisher,
            'published_date': published_date,
            'origin': 'nicebooks',
        }
        return details
    except Exception as e:
        logging.error(f"处理 ISBN 为 {isbn} 的页面时出现错误(nicebooks):{e}")
        # Pause for 10 seconds before reporting the failure.
        time.sleep(10)
        return None


def get_bookuu_details(isbn, origin_url):
    """Fetch *origin_url* from bookuu and turn the response into book fields.

    Args:
        isbn: Value echoed into the result and the log messages.
        origin_url: Page to fetch via ``requests_page``.

    Returns:
        A dict of book fields with ``origin`` set to ``'博库网'``, or ``None``
        when the fetch failed, the response is the site's generic landing
        page (detected by its ``<title>``), or the response could not be
        decoded into the expected fields.
    """
    res = requests_page(origin_url)
    fetch_failures = ('触发反爬', '没有资源', '源网址信号不稳定,请稍后再试')
    if (
        res is None
        or '<title>图书批发一站式图书批发平台 - 馆配图书平台 - 博库网批发平台</title>' in res
        or res in fetch_failures
    ):
        logging.error(f"无法获取 ISBN 为 {isbn} 的书籍信息页面内容(bookuu)。")
        return None
    try:
        logging.info(f"正在解析 bookuu 网站 ISBN 为 {isbn} 的页面内容")
        # requests_page returns the body as text, and indexing a str with a
        # string key always raises TypeError, so the text is decoded first.
        # NOTE(review): decoding as JSON is the presumed intent of the keyed
        # lookups below — confirm this endpoint returns a JSON object with
        # these keys; if it returns HTML, this source needs an XPath parser
        # like the other sites. Until then a non-JSON body simply fails over
        # to the next source.
        data = json.loads(res) if isinstance(res, str) else res
        title = data['title']
        img_urls = data['img_urls']
        author = data['author']
        publisher = data['publisher']
        published_date = data['published_date']
        binding = data['binding']

        return {
            'title': title,
            'origin_url': origin_url,
            'img_urls': img_urls,
            'isbn': isbn,
            'author': author,
            'binding': binding,
            'formatted_author': author.strip(),
            'publisher': publisher,
            'published_date': published_date,
            'origin': '博库网',
        }
    except Exception as e:
        logging.error(f"处理 ISBN 为 {isbn} 的页面时出现错误(bookuu):{e}")
        # Pause for 10 seconds before reporting the failure.
        time.sleep(10)
        # Report failure as None, like the other scrapers; returning the raw
        # page text here made the caller treat a failed parse as a result.
        return None


def get_kongfz_details(isbn, origin_url):
    """Fetch the kongfz search JSON at *origin_url* and extract the first hit.

    Args:
        isbn: Value echoed into the result and the log messages.
        origin_url: Search endpoint to fetch via ``requests_page``.

    Returns:
        A dict of book fields with ``origin`` set to ``'孔夫子旧书网'``, or
        ``None`` when the fetch failed, the body reports ``"totalFound":0``,
        or decoding/field extraction raised.
    """
    body = requests_page(origin_url)
    fetch_failures = ('触发反爬', '没有资源', '源网址信号不稳定,请稍后再试')
    if body is None or body in fetch_failures or '"totalFound":0' in body:
        logging.error(f"无法获取 ISBN 为 {isbn} 的书籍信息页面内容(kongfz)。")
        return None

    try:
        logging.info(f"正在解析 kongfz 网站 ISBN 为 {isbn} 的页面内容")
        payload = json.loads(body)
        # Only the first entry of the result list is used.
        first_item = payload['data']['itemResponse']['list'][0]

        title = first_item['title']
        author = first_item['author']
        publisher = first_item['press']
        published_date = first_item['pubDateText']
        img_urls = first_item['imgUrl']
        # Binding is the value of the last record under tplRecords.binding.
        binding = first_item['tplRecords']['binding'][-1]['value']

        details = {
            'title': title,
            'origin_url': origin_url,
            'img_urls': img_urls,
            'isbn': isbn,
            'author': author,
            'binding': binding,
            'formatted_author': author.strip(),
            'publisher': publisher,
            'published_date': published_date,
            'origin': '孔夫子旧书网',
        }
        return details
    except Exception as e:
        logging.error(f"处理 ISBN 为 {isbn} 的页面时出现错误(kongfz):{e}")
        # Pause for 10 seconds before reporting the failure.
        time.sleep(10)
        return None


def get_douban_details(isbn, origin_url):
    """Search douban at *origin_url*, follow the hit, and parse its info box.

    The search response is scanned for a detail-page URL, that page is
    fetched with ``requests_page``, and labelled fields are pulled out of the
    whitespace-normalised text of its ``#info`` div.

    Args:
        isbn: Value echoed into the result and the log messages.
        origin_url: Search page to fetch via ``requests_page``.

    Returns:
        A dict of book fields with ``origin`` set to ``'豆瓣网'``, or ``None``
        when the search fetch failed, the response is still the search page
        (detected by its ``<title>`` suffix), or parsing raised.
    """
    search_page = requests_page(origin_url)
    fetch_failures = ('触发反爬', '没有资源', '源网址信号不稳定,请稍后再试')
    if (
        search_page is None
        or search_page in fetch_failures
        or '读书 - 豆瓣搜索</title>' in search_page
    ):
        logging.error(f"无法获取 ISBN 为 {isbn} 的书籍信息页面内容(douban)。")
        return None

    try:
        logging.info(f"正在解析 douban 网站 ISBN 为 {isbn} 的页面内容")
        detail_url = ''.join(re.findall('"url": "(.*?)"}], "report"', search_page))
        detail_tree = etree.HTML(requests_page(detail_url))

        # Flatten the info box to one line with single spaces between tokens.
        info_text = ''.join(detail_tree.xpath('//div[@id="info"]//text()')).replace('\n', '').strip()
        info_text = re.sub(r'\s+', ' ', info_text)

        def labelled(label):
            # Text following "<label>: " up to the next space; '' if absent.
            return ''.join(re.findall(f'{label}: (.*?) ', info_text))

        # The second text node under the wrapper's <h1> is taken as the title.
        title = detail_tree.xpath('//div[@id="wrapper"]//h1//text()')[1]
        img_urls = ''.join(detail_tree.xpath(f'//img[@alt="{title}"]//@src'))

        author = labelled('作者')
        publisher = labelled('出版社')
        fanyi = labelled('译者')
        fubiaoti = labelled('副标题')
        yuanzuoming = labelled('原作名')
        published_date = labelled('出版年')
        yeshu = labelled('页数')
        price = labelled('定价')
        binding = labelled('装帧')

        details = {
            'title': title,
            'origin_url': origin_url,
            'img_urls': img_urls,
            'isbn': isbn,
            'author': author,
            'binding': binding,
            'formatted_author': author.strip(),
            'publisher': publisher,
            'fanyi': fanyi,
            'fubiaoti': fubiaoti,
            'yuanzuoming': yuanzuoming,
            'yeshu': yeshu,
            'price': price,
            'published_date': published_date,
            'origin': '豆瓣网',
        }
        return details
    except Exception as e:
        logging.error(f"处理 ISBN 为 {isbn} 的页面时出现错误(douban):{e}")
        # Pause for 10 seconds before reporting the failure.
        time.sleep(10)
        return None


# Fields copied into every /bookapi JSON response; a field missing from the
# record is returned as ''.
# NOTE(review): "product_descritpion" is spelled exactly as in the original
# key list; stored documents presumably use the same key — verify before
# renaming.
_BOOK_RESPONSE_KEYS = (
    "title", "origin_url", "product_descritpion", "author", "publisher", "published_date", "img_urls",
    "isbn", "binding", "format", "isItASet", "paper", "classification", "type", "content_description",
    "author_introduction", "catalogue", "preface", "online_trial_reading", "media_comments", "book_url",
    "time", "seriestitle", "isbn10", "price", "genus", "levelNum", "heatNum", "edition", "yinci",
    "language", "keyword", "fanyi",
    "fubiaoti", "yuanzuoming", "yeshu", "origin",
)


def _book_response(record):
    """Return a JSON response holding the public fields of *record*."""
    return jsonify({key: record.get(key, '') for key in _BOOK_RESPONSE_KEYS})


def _ordered_sources(isbn):
    """Return ``(origin_url, scraper)`` pairs in lookup-priority order."""
    sources = {
        'bookuu': (
            f'https://pifa.bookuu.com/search?keyword={isbn}&flag=1',
            get_bookuu_details,
        ),
        'douban': (
            f'https://search.douban.com/book/subject_search?search_text={isbn}&cat=1001',
            get_douban_details,
        ),
        'kongfz': (
            f'https://search.kongfz.com/pc-gw/search-web/client/pc/product/keyword/list?dataType=0&keyword={isbn}&page=1&userArea=1001000000',
            get_kongfz_details,
        ),
        'isbnsearch': (
            f'https://isbnsearch.org/isbn/{isbn}',
            get_isbnsearch_details,
        ),
        'nicebooks': (
            f'https://us.nicebooks.com/search/isbn?isbn={isbn}',
            get_nicebooks_details,
        ),
    }
    # is_chinese can also return the non-empty string "参数错误"; it is truthy,
    # so such input takes the Chinese-first order (unchanged behaviour).
    if is_chinese(str(isbn)):
        logging.info('纯数字和有汉字的参数,使用以博库网为首的中文源')
        order = ('bookuu', 'douban', 'kongfz', 'isbnsearch', 'nicebooks')
    else:
        logging.info('英文参数,使用以 nicebooks 为首的英文源')
        order = ('nicebooks', 'isbnsearch', 'bookuu', 'douban', 'kongfz')
    return [sources[name] for name in order]


def _scrape_first_match(isbn):
    """Try each source in order; return the first details dict, else None."""
    for origin_url, scraper in _ordered_sources(isbn):
        try:
            details = scraper(isbn, origin_url)
        except Exception as func_error:
            logging.error(f"处理 URL {origin_url} 时出现错误:{func_error}")
            continue
        if details is None:
            continue
        if not isinstance(details, dict):
            # A scraper handed back something other than a details dict
            # (e.g. raw page text); treat it as a miss and keep going.
            logging.error(f"处理 URL {origin_url} 时出现错误:返回值类型异常 {type(details).__name__}")
            continue
        details['origin_url'] = origin_url
        return details
    return None


@app.route('/bookapi/<isbn>')
def get_book_info(isbn):
    """Return book information for *isbn* as JSON.

    The MongoDB collection is consulted first. On a miss (or a failed
    query) the configured sites are scraped in priority order; the first
    successful result is stored in the collection and returned.

    Args:
        isbn: The path segment from ``/bookapi/<isbn>``; despite the name,
            text containing Chinese or ASCII letters is accepted and only
            affects which source is tried first.

    Returns:
        A JSON response with the fields in ``_BOOK_RESPONSE_KEYS``, or a
        ``({"error": ...}, 404)`` response when no source yields a result.
    """
    # Look the book up in the database first; a failed query is logged and
    # handled like a miss.
    book_data = None
    try:
        book_data = collection.find_one({"isbn": isbn})
    except Exception as db_error:
        logging.error(f"数据库查询出现错误:{db_error}")
    if book_data:
        logging.info(f"从数据库中获取 ISBN 为 {isbn} 的书籍信息")
        return _book_response(book_data)

    book_details = _scrape_first_match(isbn)
    if book_details is None:
        return jsonify({"error": "无法找到书籍信息。"}), 404

    logging.info(f'book_details:{book_details}')
    # Store the scraped record; a failed insert is logged but the data is
    # still returned to the client. (pymongo adds an "_id" to the dict on
    # insert; it is not one of the response keys.)
    try:
        collection.insert_one(book_details)
        logging.info(f"将 ISBN 为 {isbn} 的书籍信息存储到数据库")
    except Exception as insert_error:
        logging.error(f"数据库插入出现错误:{insert_error}")
    return _book_response(book_details)


# When executed as a script, start the Flask app with app.run()'s defaults.
if __name__ == '__main__':
    app.run()

 

标签:origin,isbn,url,书站,采集,book,details,res,书籍
From: https://www.cnblogs.com/wolvies/p/18461229

相关文章

  • 强烈推荐的AI大模型书籍!这2本大模型书一定要读!附大模型书
    复旦大学自然语言处理实验室张奇教授、桂韬研究员、郑锐博士生以及黄萱菁教授结合之前在自然语言处理领域研究经验,以及分布式系统和并行计算的教学经验,通过在大语言模型实践和理论研究的过程中,历时8个月完成本书《大规模语言模型·从理论到实践》不可错过!《大规模语言模......
  • 大模型畅销书推荐!动手做AI Agent +从零构建大模型 附书籍PDF
    本次书单我为大家收集了大模型相关的新书。探索《大模型应用开发动手做AIAgent》是畅销作者黄佳老师的新书,带领读者踏上AIAgent开发之旅,掌握尖端技术和实用技巧。01《大模型应用开发动手做AIAgent》本书特色从零开始,循序渐进,图解直击痛难点。理论与实践相结合,7......
  • LLM大语言模型书籍教程推荐:《大模型时代》+《开源大模型食用指南》!附文档
    哈喽大家好!很久都没有更新大模型这块的书了,今天给大家说一下这本:《大模型时代》,本书对大模型时代的技术、应用和产业变化进行了深入的分析和阐述。《本书》深入探讨了大型模型时代的技术演进、应用场景和产业变革。生动地阐释了ChatGPT背后的工作原理,深入解析了这一推动技......
  • 【大模型书籍】24年一书通关LLM大模型,<大模型应用开发极简入门>蛇尾书来了
    大家好,今天给大家推荐一本大模型应用开发入门书籍《大模型应用开发极简入门》,本书对很多AI概念做了讲解和说明!朋友们如果有需要《大模型应用开发极简入门》,扫码获取~......
  • 西门子828D数控机床数据采集解决方案
    西门子828D数控机床能够根据预先编写的程序来控制机器和工序的自动化运行,在各种工业加工领域得到广泛运用,如车床、铣床、攻钻机、加工中心等。随着数字化工厂和大数据应用越来越受到重视,实现机床数据采集与实时监控的需求越来越高,无论是生产管理还是改善优化,都能获得全面准确的数据......
  • 抖店商家电话搜集工具 抖音商家电话爬虫店铺采集器
    分享小编:电商小达人作者:1030249563(V)Java爬虫的实现在Java中,我们可以使用Jsoup库来简化网络请求和HTML解析的过程。以下是一个简单的爬虫示例代码,用于抓取抖音小店中的商品信息。Maven依赖首先,你需要在项目的pom.xml文件中添加Jsoup的依赖:org.jsoupjsoup1.14.3......
  • 抖店商家电话采集软件 抖音小店店铺电话批量采集工具
    分享小编:电商小达人作者:1030249563(V)在电商平台上,卖家的联系电话是非常重要的信息,可以帮助我们更好地了解市场情况、与卖家进行沟通以及建立商业合作关系。本教程将为大家介绍一款名为“抖店卖家电话采集神器”的工具,它能够快速提取电商平台上卖家的联系电话。本文将为大家提......
  • 【关注可白嫖源码】springboot基于微信小程序的二手书籍交易平台
    摘 要国内的京东商城、天猫、苏宁易购等大型网站在图书销售等商品零售领域已经十分成熟完善,但是以高校学生二手书为主的二手图书资源目前还没有得到合理的开发利用。本人设计和实现的基于微信小程序的二手书籍交易平台是通过合理的市场调查然后才着手实施的。系统分前端和......
  • 12G-SDI高清视频开发案例,让4K视频采集更便捷!基于Xilinx MPSoC高性能平台
    本文主要介绍基于 Xilinx UltraScale+ MPSoC XCZU7EV 的 12G-SDI 高清视频开发案例,适用开发环境如下:Windows 开发环境:Windows 7 64bit、Windows 10 64bit;Linux 开发环境:Ubuntu 18.04.4 64bit;开发工具包:Xilinx Unified 2022.2;硬件平台:创龙科技 TLZU-EVM 评估板(基于 Xilinx UltraScale+......
  • 书籍-《Docker深度探索(2024版)》
    书籍:DockerDeepDive:ZerotoDockerinASingleBook,2024Edition作者:NigelPoulton出版:NielsonBookServices编辑:陈萍萍的公主@一点人工一点智能书籍介绍本书涵盖了Docker生态系统中所有最新的趋势和技术,包括DockerScout、DockerInit、DockerDebug以及Wasm容器。本书深入浅......