rust 爬取笔趣阁生成epub文件

标签：chapter use unwrap self 爬取 href let 笔趣 rust

简单研究一下 epub，毕竟 txt 总是看着不爽，后面再优化 epub 样式。

Cargo.toml

[package]
name = "bqg_epub"
version = "0.1.0"
edition = "2021"

[dependencies]
epub-builder = "0.7.4"
reqwest = {version = "0.12.5",features = ["blocking","json"]}
tokio = {version = "1.38.0",features = ["full"]}
scraper ="0.19.0"
rand = { version = "0.8.5", features = ["default"] }
url = "2.5.2"
clap = {version = "4.5.7",features = ["derive"]}

main.rs

use std::cmp::Ordering;

use std::fs::{File, OpenOptions};
use epub_builder::EpubBuilder;
use epub_builder::Result;
use epub_builder::ZipLibrary;
use epub_builder::EpubContent;
use epub_builder::ReferenceType;

use std::io::Write;
use std::path::Path;
use std::{fs, io};
use std::time::Duration;
use clap::Parser;
use reqwest::{Client, Url};
use scraper::{Html, Selector};
use rand::{Rng};


/// A scraped book: its metadata plus the list of chapters found on its index page.
#[derive(Debug)]
struct Book {
    /// Book title, read from the `#info > h1` element of the index page.
    title: String,
    /// URL of the book's index page; chapter links are resolved against it.
    homepage: String,
    /// Introduction text, read from the `#intro` element of the index page.
    intro: String,
    /// Author name, read from the `#info > p:nth-child(2) > a` element.
    author: String,

    /// Chapters discovered on the index page.
    chapters: Vec<Chapter>,
}

impl Book {
    fn new(homepage: &str) -> Self {
        Self {
            title: String::default(),
            author: String::default(),
            intro: String::default(),
            chapters: Vec::new(),
            homepage: homepage.to_string(),
        }
    }
    async fn get_book_info(&mut self, text: &str) -> Result<()> {
        let mut chapters = vec![];
        let document = Html::parse_document(&text);
        let chapter_selector = Selector::parse("#list > dl > dd > a").unwrap();
        let author_selector = Selector::parse("#info > p:nth-child(2) > a").unwrap();
        let intro_selector = Selector::parse("#intro").unwrap();
        let title_selector = Selector::parse("#info > h1").unwrap();

        self.author = document.select(&author_selector).next().unwrap().text().collect::<Vec<_>>().join(" ");
        self.intro = document.select(&intro_selector).next().unwrap().text().collect::<Vec<_>>().join(" ");
        self.title = document.select(&title_selector).next().unwrap().text().collect::<Vec<_>>().join(" ");

        for element in document.select(&chapter_selector) {
            if let Some(href) = element.value().attr("href") {
                let text = element.text().collect::<Vec<_>>().join(" ");

                let c = Chapter::new(href, &text);
                // chapters.push(c);
                self.add_chapter(c);
            }
        }
        chapters.sort();
        self.chapters = chapters;
        Ok(())
    }

    fn add_chapter(&mut self,chapter: Chapter){

        if !self.chapters.iter().any(|c| c.href == chapter.href){
            self.chapters.push(chapter)
        }
    }


    fn generate_epub(&self) -> Result<()> {
        // let mut output = Vec::<u8>::new();

        let dummy_image = "Not really a PNG image";
        let dummy_css = "body { background-color: pink }";
        let mut output = File::create(format!("{}.epub", "test")).unwrap();

        let zip_lib = ZipLibrary::new()?;
        // Create a new EpubBuilder using the zip library
        let mut builder = EpubBuilder::new(zip_lib)?;
        builder
            // Set some metadata
            .metadata("author", "Leon Lee")?
            .metadata("title", &self.title)?
            .add_cover_image("cover.png", dummy_image.as_bytes(), "image/png")?
            // Add a resource that is not part of the linear document structure
            .add_resource("some_image.png", dummy_image.as_bytes(), "image/png")?;

        for chapter in self.chapters.iter() {
            builder.add_content(EpubContent::new(&chapter.href, &*chapter.content.as_bytes())
                .title(&chapter.title)
                .reftype(ReferenceType::TitlePage))?;
        }

        builder.inline_toc()
            // Finally, write the EPUB file to a writer. It could be a `Vec<u8>`, a file,
            // `stdout` or whatever you like, it just needs to implement the `std::io::Write` trait.
            .generate(&mut output)?;


        Ok(())
    }
}

// const BASE_URL: &str = "https://www.xbiqugew.com/book/53099/";
// Command-line arguments, parsed by clap's derive API.
// Note: `///` comments inside this struct are used by clap as help text,
// so explanatory notes here are written as plain `//` comments.
#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
#[command(next_line_help = true)]
struct Args {
    // URL of the book's index page; passed as `-u` / `--url`.
    /// base_url
    #[arg(short, long)]
    url: String,
}


/// Entry point: downloads the index page given by `--url`, scrapes every
/// chapter with a random pause between requests, then writes the EPUB.
///
/// # Errors
///
/// Any network, parsing or file error is returned to the runtime instead of
/// panicking, so the process exits with an error message.
#[tokio::main]
async fn main() -> Result<()> {
    let args = Args::parse();
    let client = Client::builder().build()?;
    let html = query_book_homepage(&client, &args.url).await?;

    let mut book = Book::new(&args.url);
    book.get_book_info(&html).await?;

    for chapter in book.chapters.iter_mut() {
        println!("{}  | {}", chapter.href, chapter.title);
        // Pause between requests so the site is not hit in a tight loop.
        let delay = random_delay();
        println!("Waiting for {} milliseconds before the next request...", delay.as_millis());
        tokio::time::sleep(delay).await;
        chapter.scraper_chapter_content(&book.homepage, &client).await?;
    }
    book.generate_epub()?;

    println!("{:?}", book);
    Ok(())
}

/// Fetches `homepage` with `client` and returns the response body as text.
///
/// Prints a confirmation line once the body has been read.
///
/// # Errors
///
/// Returns an error when sending the request or reading the body fails.
async fn query_book_homepage(client: &Client, homepage: &str) -> Result<String> {
    let response = client.get(homepage).send().await?;
    let body = response.text().await?;
    println!("scraper homepage: {} done!", homepage);
    Ok(body)
}

/// One chapter of a book: where it lives, what it is called, and its text.
#[derive(Eq,Debug)]
struct Chapter {
    /// Numeric id parsed from `href` in `Chapter::new`; used for ordering.
    number: usize,
    /// Link to the chapter page as found on the index page.
    href: String,
    /// Chapter title (the link text on the index page).
    title: String,
    /// Chapter text; empty until `scraper_chapter_content` fills it in.
    content: String,
}

impl Chapter {
    /// Creates a chapter with empty content.
    ///
    /// The chapter number is taken from the file stem of `href`
    /// (`"123.html"` and `"/book/1/123.html"` both give `123`). Hrefs without
    /// a numeric stem get number `0` instead of panicking.
    fn new(href: &str, title: &str) -> Self {
        Self {
            number: Self::chapter_number(href),
            href: href.to_string(),
            title: title.to_string(),
            content: String::default(),
        }
    }

    /// Extracts the numeric file stem from `href`, falling back to `0`.
    fn chapter_number(href: &str) -> usize {
        // Only the last path segment matters; `rsplit` always yields at least one item.
        let file_name = href.rsplit('/').next().unwrap_or(href);
        file_name
            .split('.')
            .next()
            .and_then(|stem| stem.parse::<usize>().ok())
            .unwrap_or(0)
    }

    /// Downloads this chapter's page, extracts the text of `#content`, stores
    /// the cleaned text in `self.content` and also dumps it to
    /// `books/<number>.txt`.
    ///
    /// `base_url` is the book's index URL; `self.href` is resolved against it.
    /// When the page has no `#content` element a placeholder message is
    /// stored instead.
    ///
    /// # Errors
    ///
    /// Returns an error when the URL cannot be built, the request fails, or
    /// the text file cannot be written.
    async fn scraper_chapter_content(&mut self, base_url: &str, client: &Client) -> Result<()> {
        let base_url = Url::parse(base_url)?;
        let joined_url = base_url.join(&self.href)?;

        println!("now visited: {}", joined_url);

        let page = client.get(joined_url).send().await?.text().await?;
        let document = Html::parse_document(&page);
        let content_selector = Selector::parse("#content").expect("static selector is valid");

        let content = match document.select(&content_selector).next() {
            Some(e) => e.text().collect::<Vec<_>>().join("\r\n"),
            None => "this chapter may have no content or an error occur".to_string(),
        };
        let cleaned = replace_html_entities(&content);

        let dir_path = Path::new("books");
        let file_path = dir_path.join(format!("{}.txt", self.number));
        check_and_create_directory(dir_path)?;

        // Truncate so a shorter re-download does not leave a stale tail, and
        // use `write_all` because `write` may only write part of the buffer.
        let mut file = OpenOptions::new()
            .write(true)
            .create(true)
            .truncate(true)
            .open(&file_path)?;
        file.write_all(cleaned.as_bytes())?;

        self.content = cleaned;
        Ok(())
    }
}

/// Two chapters are equal when they point at the same `href`.
impl PartialEq for Chapter {
    fn eq(&self, rhs: &Self) -> bool {
        self.href.as_str() == rhs.href.as_str()
    }
}

/// Partial ordering simply defers to the total ordering defined by `Ord`.
impl PartialOrd for Chapter {
    fn partial_cmp(&self, rhs: &Self) -> Option<Ordering> {
        let ordering = Ord::cmp(self, rhs);
        Some(ordering)
    }
}

/// Chapters are ordered by `number`, with `href` as a tie-breaker.
///
/// The tie-breaker keeps the ordering consistent with equality (which
/// compares `href`): `cmp` returns `Equal` only for chapters whose `href`
/// is also equal, as the `Ord` contract requires.
impl Ord for Chapter {
    fn cmp(&self, other: &Self) -> Ordering {
        self.number
            .cmp(&other.number)
            .then_with(|| self.href.cmp(&other.href))
    }
}


/// Makes sure `dir_path` exists, creating it (and any missing parents) when
/// it does not, and prints which of the two cases applied.
///
/// # Errors
///
/// Returns the I/O error from directory creation, if any.
fn check_and_create_directory(dir_path: &Path) -> io::Result<()> {
    if dir_path.exists() {
        println!("Directory already exists: {:?}", dir_path);
        return Ok(());
    }

    println!("Directory does not exist. Creating directory: {:?}", dir_path);
    fs::create_dir_all(dir_path)
}

/// Picks a random pause of at least 500 ms and less than 2000 ms.
fn random_delay() -> Duration {
    let pause_ms: u64 = rand::thread_rng().gen_range(500..2000);
    Duration::from_millis(pause_ms)
}

/// Decodes a handful of HTML entities in `s`.
///
/// `&nbsp;` is removed entirely; `&lt;`, `&gt;` and `&amp;` are replaced by
/// the characters they stand for. `&amp;` is decoded last so that an escaped
/// entity such as `&amp;lt;` becomes the literal text `&lt;` rather than
/// being decoded twice into `<`.
fn replace_html_entities(s: &str) -> String {
    s.replace("&nbsp;", "")
        .replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&amp;", "&")
}

标签：chapter,use,unwrap,self,爬取,href,let,笔趣,rust
From： https://www.cnblogs.com/itachilee/p/18259103

rust 爬取笔趣阁生成epub文件

相关文章

赞助商

阅读排行