导包就不细说了:
<!-- https://mvnrepository.com/artifact/net.sourceforge.htmlunit/htmlunit -->
<dependency>
<groupId>net.sourceforge.htmlunit</groupId>
<artifactId>htmlunit</artifactId>
<version>2.35.0</version>
</dependency>
<!-- 解析html -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.11.3</version>
</dependency>
<dependency>
<groupId>fr.opensagres.xdocreport</groupId>
<artifactId>fr.opensagres.xdocreport.converter.docx.xwpf</artifactId>
<version>2.0.1</version>
</dependency>
<!-- 阿里JSON解析器 -->
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.31</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.commons/commons-text -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-text</artifactId>
<version>1.4</version>
</dependency>
/**
 * Entry point: walks the numbered list pages and hands each page URL to
 * {@link #getCSDNArticleUrlList2}.
 *
 * <p>Each iteration passes a brand-new {@code ArrayList}, so the hrefs collected for one page
 * are not kept after the call returns; the only visible result is whatever the callee prints.
 * An {@code IOException} from one page is printed and the loop moves on to the next page.
 */
public static void main(String[] args) {
// NOTE(review): the two declarations below are truncated in this listing and do not compile
// as shown. Presumably they are `name` (the value the callee matches against each href) and
// the base list-page URL to which the page index is appended - confirm and fill in.
String nam
String url
// Number of list pages to visit:
for (int i = 0; i < 14; i++) {
// The page index is appended directly to the base URL.
String oneUrl = url + i;
try {
getCSDNArticleUrlList2(name,oneUrl,new ArrayList<String>());
} catch (IOException e) {
e.printStackTrace();
}
}
}
/**
 * Downloads one article-list page and collects the article links whose href contains
 * {@code name}.
 *
 * <p>Within the first {@code div.article-list} element, every {@code div.article-item-box} is
 * inspected: when its {@code h4 a} link has an {@code href} containing {@code name}, a Markdown
 * link line {@code [title](href)} is printed to standard output and the href is appended to
 * {@code urlList}. A page without a {@code div.article-list} element leaves {@code urlList}
 * untouched. Items without an {@code h4 a} link are skipped.
 *
 * @param name substring an article href must contain to be accepted
 * @param oneUrl URL of the list page to download
 * @param urlList list that receives the accepted hrefs, in page order
 * @throws IOException if the page cannot be downloaded or read
 */
public static void getCSDNArticleUrlList2(String name, String oneUrl, List<String> urlList)
        throws FailingHttpStatusCodeException, MalformedURLException, IOException {
    String content;
    // Close the response stream as soon as the body has been read so the connection is released.
    try (InputStream inputStream = HttpUtil.doGet(oneUrl)) {
        content = StreamUtil.inputStreamToString(inputStream, "UTF-8");
    }

    Document doc = Jsoup.parse(content);
    Element articleList = doc.select("div.article-list").first();
    if (articleList == null) {
        return;
    }

    for (Element item : articleList.select("div.article-item-box")) {
        Element linkNode = item.select("h4 a").first();
        if (linkNode == null) {
            // Item without a title link: nothing to collect.
            continue;
        }
        String href = linkNode.attr("href");
        // The original author noted that the first entry of every list page is unexpected;
        // presumably this filter drops entries that do not belong to the user - confirm.
        if (!href.contains(name)) {
            continue;
        }
        // The title is taken from the second text node of the link, as before. When the link
        // has fewer than two text nodes, fall back to the link's full text instead of throwing.
        List<TextNode> textNodes = linkNode.textNodes();
        String title = textNodes.size() > 1 ? textNodes.get(1).toString() : linkNode.text();
        System.out.println("[" + title + "](" + href + ")");
        urlList.add(href);
    }
}
工具类方法:HttpUtil 中的 doGet 方法,以及将输入流转换为字符串的方法:
/**
 * Opens an HTTP connection to {@code urlstr} and returns the stream of the response body.
 *
 * <p>A fixed {@code User-Agent} and {@code accept} header are set first, then every entry of
 * {@code headers} (if any) is applied, and finally an {@code x-forwarded-for} header is set to
 * a randomly generated dotted address whose four parts each lie in the range 100..199.
 *
 * @param urlstr address to request
 * @param headers extra request properties to set; may be {@code null}
 * @return the input stream obtained from the connection; the caller is responsible for it
 * @throws IOException if the URL is malformed or the connection fails
 */
public static InputStream doGet(String urlstr, Map<String, String> headers) throws IOException {
    HttpURLConnection connection = (HttpURLConnection) new URL(urlstr).openConnection();

    final String userAgent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 "
            + "(KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36";
    final String accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,"
            + "image/apng,*/*;q=0.8";
    connection.setRequestProperty("User-Agent", userAgent);
    connection.setRequestProperty("accept", accept);

    // Caller-supplied headers are applied after the defaults.
    if (headers != null) {
        for (Map.Entry<String, String> header : headers.entrySet()) {
            connection.setRequestProperty(header.getKey(), header.getValue());
        }
    }

    // Build "a.b.c.d" from four random values, each in 100..199.
    Random generator = new Random();
    StringBuilder forwardedFor = new StringBuilder();
    for (int part = 0; part < 4; part++) {
        if (part > 0) {
            forwardedFor.append(".");
        }
        forwardedFor.append(generator.nextInt(100) + 100);
    }
    connection.setRequestProperty("x-forwarded-for", forwardedFor.toString());

    return connection.getInputStream();
}
/**
 * Reads {@code is} until end of stream and decodes everything read with {@code charset}.
 *
 * <p>All bytes are collected first and decoded in a single step, so a multi-byte character
 * that happens to straddle two reads is decoded correctly. The stream is not closed by this
 * method.
 *
 * @param is stream to drain
 * @param charset name of the charset used to decode the bytes, e.g. {@code "UTF-8"}
 * @return the decoded content; empty when the stream yields no bytes
 * @throws IOException if reading fails or the charset name is not supported
 */
public static String inputStreamToString(InputStream is, String charset) throws IOException {
    java.io.ByteArrayOutputStream collected = new java.io.ByteArrayOutputStream();
    byte[] chunk = new byte[1024];
    int read;
    while ((read = is.read(chunk)) != -1) {
        collected.write(chunk, 0, read);
    }
    // Decode once over the complete byte sequence; decoding chunk by chunk would corrupt
    // characters whose bytes are split across chunk boundaries.
    return collected.toString(charset);
}
爬取结果: