可以自行选择下载的数量——全部下载下来够呛,首先就没有那么大的硬盘。
新版本:https://wws.lanzous.com/iAEMoghsgeb 密码:7vjz
jar包:https://wws.lanzous.com/ilphyghsgcj 密码:f38a
<dependency>
<!-- jsoup HTML parser library @ https://jsoup.org/ -->
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.11.2</version>
</dependency>
核心代码展示:
package com.aaa.data;
import com.aaa.config.SSLHelper;
import com.aaa.dto.BookCatalogueDto;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.select.Elements;
import java.io.*;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* @author 三木猿
* @version 1.0
* @Title:
* @date 2020/8/10 15:16
*/
public class DownloadBook {
private static String dataSource;
private static Pattern pattern = Pattern.compile("<a\\s*href=\"?([\\w\\W]*?)\"?[\\s]*?[^>]>([\\s\\S]*?)(?=</a>)");
/**
 * Selects the site to crawl and downloads books from it on two worker threads.
 *
 * <p>Books are requested as {@code "0_" + id}. The ids are split into two
 * half-open ranges, {@code [1, count/2)} and {@code [count/2, count)}, one per
 * worker, so {@code count} is an exclusive upper bound on the book id and a
 * {@code count} below 2 downloads nothing. The call blocks until both workers
 * have finished. A {@code dataSource} other than {@code "biquge5200"} or
 * {@code "biquge"} is stored but nothing is downloaded.
 *
 * @param dataSource site key, {@code "biquge5200"} or {@code "biquge"}
 * @param count      exclusive upper bound of the book ids to download
 */
public static void setDataSource(String dataSource,int count) {
    SSLHelper.init();
    DownloadBook.dataSource = dataSource;

    final String baseUrl;
    if ("biquge5200".equals(dataSource)) {
        baseUrl = "https://www.biquge5200.com/";
    } else if ("biquge".equals(dataSource)) {
        baseUrl = "https://www.biquge.com/";
    } else {
        return;
    }

    // Ids start at 1, so the second range never starts below 1 even for tiny counts.
    final int middle = Math.max(1, count / 2);
    Thread firstHalf = new Thread(() -> downloadBookRange(baseUrl, 1, middle));
    Thread secondHalf = new Thread(() -> downloadBookRange(baseUrl, middle, count));
    firstHalf.start();
    secondHalf.start();
    try {
        firstHalf.join();
        secondHalf.join();
    } catch (InterruptedException e) {
        // Do not restart the workers (they are still running); just hand the
        // interrupt back to the caller.
        Thread.currentThread().interrupt();
    }
}

/**
 * Downloads every book whose id lies in {@code [fromId, toId)} from {@code baseUrl}.
 * Books whose target file already exists are skipped, and a failure on one book
 * is reported and does not stop the remaining ids.
 */
private static void downloadBookRange(String baseUrl, int fromId, int toId) {
    for (int id = fromId; id < toId; id++) {
        String bookCod = "0_" + id;
        try {
            Document document = Jsoup.connect(baseUrl + bookCod + "/").get();
            Element info = document.getElementById("info");
            if (info == null) {
                // The page has no #info element, so there is no title to name the file after.
                System.err.println("No book info found for " + bookCod + ", skipping");
                continue;
            }
            String bookName = info.select("h1").text();
            if (bookName.isEmpty()) {
                System.err.println("Empty book title for " + bookCod + ", skipping");
                continue;
            }
            File file = new File("/usr/local/webapps/file/" + bookName + ".txt");
            if (file.exists()) {
                continue;
            }
            System.out.println("---------------" + bookName + "正在下载" + "--------------");
            List<BookCatalogueDto> bookCatalogue = getBookCatalogue(bookCod, document, pattern);
            downloadBook(bookCod, bookName, bookCatalogue);
            System.out.println("---------------" + bookName + "下载完成" + "--------------");
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            return;
        } catch (Exception e) {
            System.err.println("Failed to download book " + bookCod + ": " + e);
        }
    }
}
/**
 * Downloads one book with three worker threads and merges the result.
 *
 * <p>Returns immediately when {@code /usr/local/webapps/file/<bookName>.txt}
 * already exists. Otherwise the catalogue is split into three parts, each part
 * is fetched by its own thread into a part file named {@code bookName + 1..3},
 * and once all threads have finished the part files are merged via
 * {@link #combine(String)}. Exceptions raised inside a worker are printed and
 * do not prevent the merge.
 *
 * @param bookCod          site code of the book
 * @param bookName         book title, used as the output file name
 * @param bookCatalogueDto chapters of the book, in order
 * @throws Exception if waiting for a worker is interrupted or the merge fails
 */
public static void downloadBook(String bookCod, String bookName, List<BookCatalogueDto> bookCatalogueDto) throws Exception {
    File target = new File("/usr/local/webapps/file/" + bookName + ".txt");
    if (target.exists()) {
        return;
    }
    Map<Integer, List<BookCatalogueDto>> parts = splitList(bookCatalogueDto, 3);
    long begin = System.currentTimeMillis();

    Thread[] workers = new Thread[3];
    for (int idx = 0; idx < workers.length; idx++) {
        final int part = idx;
        workers[idx] = new Thread(() -> {
            try {
                if ("biquge5200".equals(dataSource)) {
                    biquge5200(bookCod, bookName + (part + 1), parts.get(part));
                } else if ("biquge".equals(dataSource)) {
                    biquge(bookCod, bookName + (part + 1), parts.get(part));
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        });
    }
    for (Thread worker : workers) {
        worker.start();
    }
    for (Thread worker : workers) {
        worker.join();
    }

    // Merge the part files into the final book file.
    combine(bookName);
    long finish = System.currentTimeMillis();
    System.out.println("本次下载共用时" + (finish - begin));
}
/**
 * Fetches the given chapters from biquge5200.com and appends them to
 * {@code /usr/local/webapps/file/downloading/<bookName>.txt}.
 *
 * <p>If the part file already exists, the last chapter title found in it (via
 * {@link #txtCatalogue(String)}) is looked up in {@code bookCatalogueDto} and
 * only the chapters after it are fetched, so an interrupted download resumes.
 * Each chapter is written as its {@code h1} text followed by the {@code #content}
 * HTML split on {@code </p>}. A chapter that cannot be fetched after one retry,
 * or that has no {@code #content} element, is skipped.
 *
 * <p>NOTE(review): the file is written with the platform default charset while
 * {@code txtCatalogue} reads it as UTF-8 — confirm the two agree on the target host.
 *
 * @param bookCod          site code of the book, used to build chapter URLs
 * @param bookName         name of the part file (without extension)
 * @param bookCatalogueDto chapters to fetch, in order
 * @throws Exception if the part file cannot be opened for appending
 */
public static void biquge5200(String bookCod, String bookName, List<BookCatalogueDto> bookCatalogueDto) throws
        Exception {
    String path = "/usr/local/webapps/file/downloading/" + bookName + ".txt";
    File file = new File(path);
    if (!file.exists()) {
        File dir = new File(file.getParent());
        dir.mkdirs();
        try {
            file.createNewFile();
        } catch (IOException e) {
            e.printStackTrace();
        }
    } else {
        // Resume: drop everything up to and including the last chapter already on disk.
        List<BookCatalogueDto> saved = txtCatalogue(bookName);
        if (!saved.isEmpty()) {
            BookCatalogueDto lastSaved = saved.get(saved.size() - 1);
            for (int i = 0; i < bookCatalogueDto.size(); i++) {
                if (bookCatalogueDto.get(i).getCatalogueName().equals(lastSaved.getCatalogueName())) {
                    bookCatalogueDto = bookCatalogueDto.subList(i + 1, bookCatalogueDto.size());
                    break;
                }
            }
        }
    }
    // Check before opening the writer so nothing is left unclosed on this path.
    if (bookCatalogueDto.isEmpty()) {
        return;
    }
    // Opened outside the try so a FileNotFoundException still reaches the caller.
    FileOutputStream out = new FileOutputStream(file, true);
    try (BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(out))) {
        for (BookCatalogueDto chapter : bookCatalogueDto) {
            String url = "https://www.biquge5200.com/" + bookCod + "/" + chapter.getCatalogueCod() + ".html";
            Document document = fetchBiquge5200Chapter(url);
            if (document == null) {
                if (Thread.currentThread().isInterrupted()) {
                    break;
                }
                System.err.println("Skipping chapter, could not fetch " + url);
                continue;
            }
            Elements elements = document.select("#content");
            if (elements.isEmpty()) {
                System.err.println("Skipping chapter, no #content in " + url);
                continue;
            }
            String html = elements.get(0).html().replace("<div id='content'>", "").replace("</div>", "");
            String replace = html.replace("<script>readx();</script>", "").replace("<script>chaptererror();</script>", "");
            try {
                bw.write(document.select("h1").text());
                bw.newLine();
                for (String s : replace.replace("<p>", "").split("</p>")) {
                    bw.write(s);
                    bw.newLine();
                }
                // Flush per chapter so a resumed run can see what was already saved.
                bw.flush();
            } catch (IOException ioException) {
                ioException.printStackTrace();
            }
        }
    } catch (IOException e) {
        // Only close() can end up here; write failures are handled per chapter above.
        e.printStackTrace();
    }
}

/**
 * Fetches one chapter page, retrying once after five seconds.
 *
 * @return the parsed page, or {@code null} if both attempts fail or the wait is interrupted
 */
private static Document fetchBiquge5200Chapter(String url) {
    try {
        return Jsoup.connect(url).get();
    } catch (IOException firstFailure) {
        try {
            Thread.sleep(5000);
            return Jsoup.connect(url).get();
        } catch (InterruptedException interrupted) {
            Thread.currentThread().interrupt();
            return null;
        } catch (IOException secondFailure) {
            return null;
        }
    }
}
/**
 * Builds the chapter catalogue of a book from the {@code <dd>} elements of its index page.
 *
 * <p>The elements are split into three slices, each slice is converted by
 * {@link #get(List, String, Document, Pattern)} on its own thread, and the three
 * results are concatenated in slice order. A slice whose worker dies contributes
 * no entries.
 *
 * @param bookCod  site code of the book, copied into every entry
 * @param document parsed index page of the book
 * @param pattern  regex used to pull the chapter link out of each entry
 * @return catalogue entries in page order
 * @throws InterruptedException if the calling thread is interrupted while waiting for a worker
 */
public static List<BookCatalogueDto> getBookCatalogue(String bookCod, Document document, Pattern pattern) throws InterruptedException {
    Elements entries = document.getElementsByTag("dd");
    Map<Integer, List<Element>> slices = splitList(entries, 3);

    // One result slot per worker; each starts empty so a failed worker adds nothing.
    @SuppressWarnings("unchecked")
    final List<BookCatalogueDto>[] parsed = new List[3];
    Thread[] workers = new Thread[3];
    for (int idx = 0; idx < workers.length; idx++) {
        final int slot = idx;
        parsed[slot] = new ArrayList<>();
        workers[slot] = new Thread(() -> {
            parsed[slot] = get(slices.get(slot), bookCod, document, pattern);
        });
    }
    for (Thread worker : workers) {
        worker.start();
    }
    for (Thread worker : workers) {
        worker.join();
    }

    List<BookCatalogueDto> catalogue = new ArrayList<>();
    for (List<BookCatalogueDto> slice : parsed) {
        catalogue.addAll(slice);
    }
    return catalogue;
}
/**
 * Converts catalogue {@code <dd>} elements into {@code BookCatalogueDto} entries.
 *
 * <p>For each element the last child node whose string form is non-empty is
 * matched against {@code pattern}. On a match, group 1 is taken as the chapter
 * link, and the text between its last {@code '/'} and last {@code '.'} is parsed
 * as the numeric chapter code. The entry name is the element's text. An element
 * without child nodes, or whose link does not yield a numeric code, is reported
 * and skipped instead of aborting the whole list. An element with no match at
 * all is still added, without a chapter code.
 *
 * @param dd       catalogue elements to convert
 * @param bookCod  site code of the book, copied into every entry
 * @param document index page the elements came from (not read here)
 * @param pattern  regex whose group 1 captures the chapter link
 * @return one entry per usable element, in input order
 */
public static List<BookCatalogueDto> get(List<Element> dd, String bookCod, Document document, Pattern pattern) {
    List<BookCatalogueDto> bookCatalogueDtos = new ArrayList<>();
    for (Element element : dd) {
        List<Node> children = element.childNodes();
        if (children.isEmpty()) {
            System.err.println("Skipping empty catalogue entry for book " + bookCod);
            continue;
        }
        // Use the last child that renders to something; fall back to the first child.
        Node node = children.get(0);
        for (Node child : children) {
            if (!"".equals(child.toString())) {
                node = child;
            }
        }

        BookCatalogueDto bookCatalogueDto = new BookCatalogueDto();
        Matcher matcher = pattern.matcher(node.toString());
        if (matcher.find()) {
            String nameCodeUrl = matcher.group(1);
            int slash = nameCodeUrl.lastIndexOf("/");
            int dot = nameCodeUrl.lastIndexOf(".");
            if (dot <= slash) {
                System.err.println("Skipping catalogue entry with unexpected link: " + nameCodeUrl);
                continue;
            }
            try {
                bookCatalogueDto.setCatalogueCod(Integer.parseInt(nameCodeUrl.substring(slash + 1, dot)));
            } catch (NumberFormatException notNumeric) {
                System.err.println("Skipping catalogue entry with non-numeric chapter code: " + nameCodeUrl);
                continue;
            }
        }
        bookCatalogueDto.setBookCod(bookCod);
        bookCatalogueDto.setCatalogueName(element.text());
        bookCatalogueDtos.add(bookCatalogueDto);
    }
    return bookCatalogueDtos;
}
/**
 * Fetches the given chapters from biquge.com and appends them to
 * {@code /usr/local/webapps/file/downloading/<bookName>.txt}.
 *
 * <p>If the part file already exists, the last chapter title found in it (via
 * {@link #txtCatalogue(String)}) is looked up in {@code bookCatalogueDto} and
 * only the chapters after it are fetched, so an interrupted download resumes.
 * Each chapter is written as its {@code h1} text followed by the {@code #content}
 * HTML split on {@code <br>}. Chapter URLs are built from each entry's own book
 * code. A chapter that cannot be fetched after one retry, or that has no
 * {@code #content} element, is skipped.
 *
 * <p>NOTE(review): the file is written with the platform default charset while
 * {@code txtCatalogue} reads it as UTF-8 — confirm the two agree on the target host.
 *
 * @param bookCod          site code of the book (the URL uses the per-entry code instead)
 * @param bookName         name of the part file (without extension)
 * @param bookCatalogueDto chapters to fetch, in order
 * @throws FileNotFoundException if the part file cannot be opened for appending
 */
private static void biquge(String bookCod, String bookName, List<BookCatalogueDto> bookCatalogueDto) throws FileNotFoundException {
    String path = "/usr/local/webapps/file/downloading/" + bookName + ".txt";
    File file = new File(path);
    if (!file.exists()) {
        File dir = new File(file.getParent());
        dir.mkdirs();
        try {
            file.createNewFile();
        } catch (IOException e) {
            e.printStackTrace();
        }
    } else {
        // Resume: drop everything up to and including the last chapter already on disk.
        List<BookCatalogueDto> saved = txtCatalogue(bookName);
        if (!saved.isEmpty()) {
            BookCatalogueDto lastSaved = saved.get(saved.size() - 1);
            for (int i = 0; i < bookCatalogueDto.size(); i++) {
                if (bookCatalogueDto.get(i).getCatalogueName().equals(lastSaved.getCatalogueName())) {
                    bookCatalogueDto = bookCatalogueDto.subList(i + 1, bookCatalogueDto.size());
                    break;
                }
            }
        }
    }
    // Check before opening the writer so nothing is left unclosed on this path.
    if (bookCatalogueDto.isEmpty()) {
        return;
    }
    // Opened outside the try so a FileNotFoundException still reaches the caller.
    FileOutputStream out = new FileOutputStream(file, true);
    try (BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(out))) {
        for (BookCatalogueDto chapter : bookCatalogueDto) {
            String url = "https://www.biquge.com/" + chapter.getBookCod() + "/" + chapter.getCatalogueCod() + ".html";
            Document document = fetchBiqugeChapter(url);
            if (document == null) {
                if (Thread.currentThread().isInterrupted()) {
                    break;
                }
                System.err.println("Skipping chapter, could not fetch " + url);
                continue;
            }
            Elements elements = document.select("#content");
            if (elements.isEmpty()) {
                System.err.println("Skipping chapter, no #content in " + url);
                continue;
            }
            String html = elements.get(0).html().replace("<div id='content'>", "").replace("</div>", "");
            String replace = html.replace("<script>readx();</script>", "").replace("<script>chaptererror();</script>", "");
            try {
                bw.write(document.select("h1").text());
                bw.newLine();
                for (String s : replace.split("<br>")) {
                    bw.write(s);
                    bw.newLine();
                }
                // Flush per chapter so a resumed run can see what was already saved.
                bw.flush();
            } catch (IOException ioException) {
                ioException.printStackTrace();
            }
        }
    } catch (IOException e) {
        // Only close() can end up here; write failures are handled per chapter above.
        e.printStackTrace();
    }
}

/**
 * Fetches one chapter page, retrying once after five seconds.
 *
 * @return the parsed page, or {@code null} if both attempts fail or the wait is interrupted
 */
private static Document fetchBiqugeChapter(String url) {
    try {
        return Jsoup.connect(url).get();
    } catch (Exception firstFailure) {
        try {
            Thread.sleep(5000);
            return Jsoup.connect(url).get();
        } catch (InterruptedException interrupted) {
            Thread.currentThread().interrupt();
            return null;
        } catch (Exception secondFailure) {
            secondFailure.printStackTrace();
            return null;
        }
    }
}
/**
 * Splits a list into {@code num} consecutive slices keyed {@code 0..num-1}.
 *
 * <p>Every slice holds {@code t.size() / num} elements (integer division),
 * except the last one, which also receives the remainder. The slices are
 * {@link List#subList(int, int)} views of {@code t}, not copies.
 *
 * @param t   list to split
 * @param num number of slices
 * @param <T> element type
 * @return map from slice index to slice
 */
public static <T> Map<Integer, List<T>> splitList(List<T> t, int num) {
    Map<Integer, List<T>> slices = new HashMap<>();
    int sliceSize = t.size() / num;
    int lastIndex = num - 1;
    for (int idx = 0; idx < num; idx++) {
        int from = idx * sliceSize;
        // The final slice runs to the end of the list to absorb the remainder.
        int to = (idx == lastIndex) ? t.size() : from + sliceSize;
        slices.put(idx, t.subList(from, to));
    }
    return slices;
}
/**
 * Merges the part files of a book into its final file.
 *
 * <p>Appends the lines of
 * {@code /usr/local/webapps/file/downloading/<bookName>1.txt} through
 * {@code ...3.txt}, in that order, to
 * {@code /usr/local/webapps/file/<bookName>.txt}, deleting each part file after
 * it has been copied. Missing part files are ignored. Reader and writer are
 * closed even when copying fails.
 *
 * @param bookName book title, used for the final and the part file names
 * @throws Exception if the final file cannot be opened or a part cannot be read or written
 */
public static void combine(String bookName) throws Exception {
    File merged = new File("/usr/local/webapps/file/" + bookName + ".txt");
    try (BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(merged, true)))) {
        for (int i = 1; i < 4; i++) {
            File part = new File("/usr/local/webapps/file/downloading/" + bookName + i + ".txt");
            if (part.exists()) {
                try (BufferedReader br = new BufferedReader(new FileReader(part))) {
                    String line;
                    while ((line = br.readLine()) != null) {
                        bw.write(line);
                        bw.newLine();
                    }
                }
            }
            // The reader is closed by now, so the delete also works on platforms that lock open files.
            part.delete();
        }
    }
}
/** Matches a whole line that looks like a chapter heading; compiled once and shared by all threads. */
private static final Pattern CHAPTER_TITLE_LINE =
        Pattern.compile("(^\\s*第)(.{1,9})[章节卷集部篇回](\\s{1})(.*)($\\s*)");

/**
 * Lists the chapter headings already saved in a part file.
 *
 * <p>Reads {@code /usr/local/webapps/file/downloading/<bookName>.txt} as UTF-8
 * line by line and returns one entry, carrying the book name and the trimmed
 * heading, for every line that matches the chapter-heading pattern. Every
 * matching line is reported, in file order. A missing file or a read error
 * is reported on standard output and yields the entries collected so far.
 *
 * <p>NOTE(review): the part files are written with the platform default
 * charset — confirm that it is UTF-8 on the target host.
 *
 * @param bookName name of the part file (without extension)
 * @return chapter headings found in the file, possibly empty, never {@code null}
 */
public static List<BookCatalogueDto> txtCatalogue(String bookName) {
    List<BookCatalogueDto> bookCatalogueDtos = new ArrayList<>();
    File file = new File("/usr/local/webapps/file/downloading/" + bookName + ".txt");
    if (!file.isFile()) {
        System.out.println("找不到指定的文件");
        return bookCatalogueDtos;
    }
    try (BufferedReader bufferedReader =
                 new BufferedReader(new InputStreamReader(new FileInputStream(file), "utf-8"))) {
        String lineTxt;
        while ((lineTxt = bufferedReader.readLine()) != null) {
            Matcher matcher = CHAPTER_TITLE_LINE.matcher(lineTxt);
            // The pattern is anchored at the line start, so there is at most one heading per line.
            if (matcher.find()) {
                BookCatalogueDto content = new BookCatalogueDto();
                content.setBookName(bookName);
                content.setCatalogueName(matcher.group().trim());
                bookCatalogueDtos.add(content);
            }
        }
    } catch (Exception e) {
        System.out.println("读取文件内容出错");
        e.printStackTrace();
    }
    return bookCatalogueDtos;
}
}
来源:https://blog.csdn.net/yangsen6666/article/details/127011878
标签:document,java,String,bookName,爬取,bookCod,file,new,多线程 From: https://www.cnblogs.com/hefeng2014/p/17676187.html