简单研究一下epub,毕竟txt总是看着不爽,后面再优化epub样式
Cargo.toml
[package]
name = "bqg_epub"
version = "0.1.0"
edition = "2021"
[dependencies]
epub-builder = "0.7.4"
reqwest = {version = "0.12.5",features = ["blocking","json"]}
tokio = {version = "1.38.0",features = ["full"]}
scraper ="0.19.0"
rand = { version = "0.8.5", features = ["default"] }
url = "2.5.2"
clap = {version = "4.5.7",features = ["derive"]}
main.rs
use std::cmp::Ordering;
use std::fs::{File, OpenOptions};
use epub_builder::EpubBuilder;
use epub_builder::Result;
use epub_builder::ZipLibrary;
use epub_builder::EpubContent;
use epub_builder::ReferenceType;
use std::io::Write;
use std::path::Path;
use std::{fs, io};
use std::time::Duration;
use clap::Parser;
use reqwest::{Client, Url};
use scraper::{Html, Selector};
use rand::{Rng};
/// A scraped novel: metadata taken from the book's index page plus its chapters.
#[derive(Debug)]
struct Book {
/// Book title, read from the `#info > h1` element of the index page.
title: String,
/// URL of the index page; also used as the base URL when resolving chapter links.
homepage: String,
/// Introduction text, read from the `#intro` element of the index page.
intro: String,
/// Author name, read from the `#info > p:nth-child(2) > a` element of the index page.
author: String,
/// Chapters discovered on the index page (`#list > dl > dd > a` links).
chapters: Vec<Chapter>,
}
impl Book {
fn new(homepage: &str) -> Self {
Self {
title: String::default(),
author: String::default(),
intro: String::default(),
chapters: Vec::new(),
homepage: homepage.to_string(),
}
}
async fn get_book_info(&mut self, text: &str) -> Result<()> {
let mut chapters = vec![];
let document = Html::parse_document(&text);
let chapter_selector = Selector::parse("#list > dl > dd > a").unwrap();
let author_selector = Selector::parse("#info > p:nth-child(2) > a").unwrap();
let intro_selector = Selector::parse("#intro").unwrap();
let title_selector = Selector::parse("#info > h1").unwrap();
self.author = document.select(&author_selector).next().unwrap().text().collect::<Vec<_>>().join(" ");
self.intro = document.select(&intro_selector).next().unwrap().text().collect::<Vec<_>>().join(" ");
self.title = document.select(&title_selector).next().unwrap().text().collect::<Vec<_>>().join(" ");
for element in document.select(&chapter_selector) {
if let Some(href) = element.value().attr("href") {
let text = element.text().collect::<Vec<_>>().join(" ");
let c = Chapter::new(href, &text);
// chapters.push(c);
self.add_chapter(c);
}
}
chapters.sort();
self.chapters = chapters;
Ok(())
}
fn add_chapter(&mut self,chapter: Chapter){
if !self.chapters.iter().any(|c| c.href == chapter.href){
self.chapters.push(chapter)
}
}
fn generate_epub(&self) -> Result<()> {
// let mut output = Vec::<u8>::new();
let dummy_image = "Not really a PNG image";
let dummy_css = "body { background-color: pink }";
let mut output = File::create(format!("{}.epub", "test")).unwrap();
let zip_lib = ZipLibrary::new()?;
// Create a new EpubBuilder using the zip library
let mut builder = EpubBuilder::new(zip_lib)?;
builder
// Set some metadata
.metadata("author", "Leon Lee")?
.metadata("title", &self.title)?
.add_cover_image("cover.png", dummy_image.as_bytes(), "image/png")?
// Add a resource that is not part of the linear document structure
.add_resource("some_image.png", dummy_image.as_bytes(), "image/png")?;
for chapter in self.chapters.iter() {
builder.add_content(EpubContent::new(&chapter.href, &*chapter.content.as_bytes())
.title(&chapter.title)
.reftype(ReferenceType::TitlePage))?;
}
builder.inline_toc()
// Finally, write the EPUB file to a writer. It could be a `Vec<u8>`, a file,
// `stdout` or whatever you like, it just needs to implement the `std::io::Write` trait.
.generate(&mut output)?;
Ok(())
}
}
// const BASE_URL: &str = "https://www.xbiqugew.com/book/53099/";
// Command-line arguments, parsed with clap's derive API.
// NOTE: `///` doc comments on this struct and its fields are picked up by clap as
// help text, so explanatory notes here deliberately use plain `//` comments.
#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
#[command(next_line_help = true)]
struct Args {
// URL of the book's index page; `main` fetches it and passes it to `Book::new`.
/// base_url
#[arg(short, long)]
url: String,
}
/// Entry point: downloads the book index given by `--url`, fetches every chapter with
/// a random pause between requests, then writes the EPUB.
///
/// # Errors
///
/// Any network, parsing, file or EPUB error is returned to the caller instead of
/// panicking, so the process exits with a readable error report.
#[tokio::main]
async fn main() -> Result<()> {
    let args = Args::parse();
    let client = Client::builder().build()?;

    let html = query_book_homepage(&client, &args.url).await?;
    let mut book = Book::new(&args.url);
    book.get_book_info(&html).await?;

    for chapter in book.chapters.iter_mut() {
        println!("{} | {}", chapter.href, chapter.title);
        // Pause between requests so the site is not hammered.
        let delay = random_delay();
        println!(
            "Waiting for {} milliseconds before the next request...",
            delay.as_millis()
        );
        tokio::time::sleep(delay).await;
        chapter
            .scraper_chapter_content(&book.homepage, &client)
            .await?;
    }

    book.generate_epub()?;
    println!("{:?}", book);
    Ok(())
}
/// Downloads the page at `homepage` with `client` and returns its body as text.
///
/// Prints a short progress line once the body has been received.
async fn query_book_homepage(client: &Client, homepage: &str) -> Result<String> {
    let response = client.get(homepage).send().await?;
    let body = response.text().await?;
    println!("scraper homepage: {} done!", homepage);
    Ok(body)
}
/// One chapter of a book: where to fetch it, how to label it, and its downloaded text.
#[derive(Eq,Debug)]
struct Chapter {
/// Number parsed from the part of `href` before its first `.`; used for ordering.
number: usize,
/// Link to the chapter page as found in the index page's anchor.
href: String,
/// Chapter title (the anchor text on the index page).
title: String,
/// Chapter text; empty until `scraper_chapter_content` has run.
content: String,
}
impl Chapter {
    /// Creates a chapter with no content yet.
    ///
    /// The chapter number is taken from the file name at the end of `href`
    /// (`"123.html"` and `"/book/1/123.html"` both give `123`). When no number can be
    /// parsed the chapter gets number `0` instead of panicking.
    fn new(href: &str, title: &str) -> Self {
        let number = href
            .rsplit('/')
            .next()
            .and_then(|file| file.split('.').next())
            .and_then(|stem| stem.parse::<usize>().ok())
            .unwrap_or(0);
        Self {
            number,
            href: href.to_string(),
            title: title.to_string(),
            content: String::default(),
        }
    }

    /// Downloads this chapter's page, extracts the text of its `#content` element,
    /// stores it in `self.content` and also saves it to `books/<number>.txt`.
    ///
    /// `base_url` is the URL that `self.href` is resolved against. If the page has no
    /// `#content` element a placeholder message is stored instead.
    ///
    /// # Errors
    ///
    /// Returns an error for an invalid URL, a failed request, or any file-system
    /// failure while writing the text file.
    async fn scraper_chapter_content(&mut self, base_url: &str, client: &Client) -> Result<()> {
        let base_url = Url::parse(base_url)?;
        let joined_url = base_url.join(&self.href)?;
        println!("now visited: {}", joined_url);
        let page = client.get(joined_url).send().await?.text().await?;

        let document = Html::parse_document(&page);
        let content_selector = Selector::parse("#content").expect("content selector is valid");
        let content = match document.select(&content_selector).next() {
            Some(e) => e.text().collect::<Vec<_>>().join("\r\n"),
            None => "this chapter may have no content or an error occur".to_string(),
        };
        let cleaned = replace_html_entities(&content);

        let dir_path = Path::new("books");
        check_and_create_directory(dir_path)?;
        let file_path = dir_path.join(format!("{}.txt", self.number));
        // Truncate so that re-running never leaves stale bytes from a longer, older
        // download at the end of the file.
        let mut file = OpenOptions::new()
            .write(true)
            .create(true)
            .truncate(true)
            .open(&file_path)?;
        // `write_all` keeps writing until the whole buffer is on disk; plain `write`
        // may stop after a partial write.
        file.write_all(cleaned.as_bytes())?;

        self.content = cleaned;
        Ok(())
    }
}
/// Two chapters are equal when they point at the same `href`.
impl PartialEq for Chapter {
    fn eq(&self, rhs: &Self) -> bool {
        let (mine, theirs) = (&self.href, &rhs.href);
        mine == theirs
    }
}
/// Partial ordering simply defers to the total order defined by `Ord`.
impl PartialOrd for Chapter {
    fn partial_cmp(&self, rhs: &Self) -> Option<Ordering> {
        let ordering = Ord::cmp(self, rhs);
        Some(ordering)
    }
}
/// Chapters are ordered by chapter number, with `href` as a tie-breaker.
///
/// Equality is defined on `href`, so comparing by `number` alone could report
/// `Ordering::Equal` for two chapters that are not `==`. The tie-breaker keeps `Ord`
/// consistent with `PartialEq`/`Eq`, while chapters with different numbers sort
/// exactly as before.
impl Ord for Chapter {
    fn cmp(&self, other: &Self) -> Ordering {
        self.number
            .cmp(&other.number)
            .then_with(|| self.href.cmp(&other.href))
    }
}
/// Makes sure `dir_path` exists, creating it (and any missing parents) when needed.
///
/// Prints which of the two cases applied. Returns the I/O error from directory
/// creation, if any.
fn check_and_create_directory(dir_path: &Path) -> io::Result<()> {
    if dir_path.exists() {
        println!("Directory already exists: {:?}", dir_path);
        return Ok(());
    }

    println!("Directory does not exist. Creating directory: {:?}", dir_path);
    fs::create_dir_all(dir_path)
}
/// Picks a random pause of at least 500 ms and less than 2000 ms.
fn random_delay() -> Duration {
    let pause_ms = rand::thread_rng().gen_range(500..2000);
    Duration::from_millis(pause_ms)
}
/// Decodes the handful of HTML entities that can survive in scraped chapter text.
///
/// * `&nbsp;` and the non-breaking space character (U+00A0) are removed.
/// * `&lt;`, `&gt;` and `&amp;` become `<`, `>` and `&`.
///
/// `&amp;` is decoded last so that double-encoded input such as `&amp;lt;` becomes
/// the literal text `&lt;` rather than being decoded twice into `<`.
fn replace_html_entities(s: &str) -> String {
    s.replace("&nbsp;", "")
        .replace('\u{a0}', "")
        .replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&amp;", "&")
}