依赖
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.13.1</version>
</dependency>
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-java</artifactId>
<version>3.141.59</version>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>28.0-jre</version>
</dependency>
我的项目需要额外添加guava依赖,否则项目启动时会报错
代码
/**
 * Entry point: searches Douban for an actor, opens the actor's detail page
 * and starts downloading the actor's photos.
 *
 * <p>The search page is loaded through a Selenium-driven Chrome instance and
 * its rendered source is handed to jsoup. The browser is always shut down once
 * the page source has been captured, even if loading fails.
 *
 * @param args unused
 * @throws Exception if the browser, the network, or the parsing fails
 */
public static void main(String[] args) throws Exception {
    System.setProperty("webdriver.chrome.driver", "D:\\stuff\\Google\\Chrome\\Application\\chromedriver.exe");
    String name = "吴珊卓";
    // COUNT and NAME are declared outside this snippet; download() uses them
    // to number the saved files and to name the target directory.
    COUNT = 0;
    NAME = name;

    WebDriver driver = new ChromeDriver();
    Document doc;
    try {
        // Encode the search term so non-ASCII or reserved characters cannot break the query string.
        driver.get("https://search.douban.com/movie/subject_search?search_text="
                + java.net.URLEncoder.encode(name, "UTF-8"));
        doc = Jsoup.parse(driver.getPageSource());
    } finally {
        // Without quit() every run leaves a Chrome/chromedriver process behind.
        driver.quit();
    }

    Elements elements = doc.select("#wrapper #root .item-root .detail .title .title-text");
    if (elements.isEmpty()) {
        // first() would return null here and the attr() call below would throw.
        System.err.println("未找到搜索结果: " + name);
        return;
    }
    // The href of the first hit, e.g. https://movie.douban.com/celebrity/1390782/
    String href = elements.first().attr("href");
    // Only a celebrity detail page is processed further.
    if (href.contains("celebrity")) {
        // Fetch https://movie.douban.com/celebrity/1390782/
        Document html = getHtml(href);
        // Expected match: <span class="pl"> ( <a href="https://movie.douban.com/celebrity/1338497/photos/" target="_self">全部1376张</a>
        Elements photoElements = html.select("#wrapper #content .article #photos .hd .pl");
        photoHtml(photoElements);
    }
}
- 传入网站url,返回这个网页的Document对象:
/**
 * Fetches the page at the given URL and parses it into a jsoup {@code Document}.
 *
 * <p>The request is sent as a GET with a desktop-browser User-Agent and 8-second
 * connect/read timeouts. The method sleeps for one second before reading the
 * response to throttle successive requests.
 *
 * @param link absolute URL of the page to fetch; also used as the document's base URI
 * @return the parsed document
 * @throws Exception if the URL is malformed, the request fails, or the thread is interrupted
 */
public static Document getHtml(String link) throws Exception {
    // Set up the connection
    URL url = new URL(link);
    HttpURLConnection connection = (HttpURLConnection) url.openConnection();
    connection.setRequestMethod("GET");
    connection.addRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36");
    connection.setConnectTimeout(8000);
    // Timeout for reading data from the host, in milliseconds
    connection.setReadTimeout(8000);
    // Pause between requests to look more like a human visitor
    Thread.sleep(1000);
    // Close the response stream even when parsing throws, so the connection is released.
    try (InputStream body = connection.getInputStream()) {
        return Jsoup.parse(body, "UTF-8", link);
    }
}
- 通过搜索演员后获取演员详情页面
- 获取详情页面后拿到演员照片页面
/**
 * Downloads the first page of an actor's photos and then continues with the
 * following pages.
 *
 * <p>If the given elements contain no link to the photo page, a message is
 * printed to standard error and nothing is downloaded.
 *
 * @param elements the elements holding the "all N photos" link
 * @throws Exception if fetching a page or downloading an image fails
 */
private static void photoHtml(Elements elements) throws Exception {
    // first() returns null for an empty selection and selectFirst() returns null
    // when there is no anchor; guard both instead of failing with an NPE.
    Element container = elements.first();
    Element anchor = (container == null) ? null : container.selectFirst("a");
    if (anchor == null) {
        System.err.println("未找到图片页面链接");
        return;
    }
    // Link to the photo page, e.g. https://movie.douban.com/celebrity/1390782/photos/
    String photoLink = anchor.attr("href");
    Document photoHtml = getHtml(photoLink);
    downloadImg(photoHtml);
    // Next-page link looks like:
    // https://movie.douban.com/celebrity/1338497/photos/?type=C&start=210&sortby=like&size=a&subtype=a
    getNextLink(photoHtml);
}
- 通过递归每次获取下一页照片页面并下载到本地
/**
 * Follows the "next page" links starting from the given photo page, downloading
 * the images of every subsequent page until no next link is left.
 *
 * <p>When a page has no next link, "无下一页" is printed to standard error and
 * the method returns.
 *
 * @param document HTML of the current photo page
 * @throws RuntimeException if a next-page anchor exists but its href is blank
 * @throws Exception if fetching a page or downloading an image fails
 */
public static void getNextLink(Document document) throws Exception {
    Document current = document;
    while (true) {
        // Anchor pointing at the following page, if any
        Elements nextAnchors = current.select(".paginator .next a");
        if (nextAnchors.isEmpty()) {
            System.err.println("无下一页");
            return;
        }
        String nextLink = nextAnchors.first().attr("href");
        if (!StringUtils.isNotBlank(nextLink)) {
            throw new RuntimeException("获取下一页链接失败");
        }
        // Load the next page, save its images, then continue from that page
        current = getHtml(nextLink);
        downloadImg(current);
    }
}
- 图片下载代码实现
/**
 * Collects the image URLs found on a photo page and hands them to
 * {@code download}.
 *
 * @param photoHtml the photo page currently being processed
 * @throws Exception if downloading any image fails
 */
private static void downloadImg(Document photoHtml) throws Exception {
    // All <img> elements inside the photo covers of the current page
    Elements images = photoHtml.select("#wrapper #content .article .cover img");
    List<String> sources = new ArrayList<>();
    for (Element image : images) {
        // e.g. https://img2.doubanio.com/view/photo/m/public/p2526537251.jpg
        String source = image.attr("src");
        if (!StringUtils.isNotBlank(source)) {
            continue;
        }
        sources.add(source);
    }
    download(sources, null);
}
/**
 * Downloads every image URL in the list and saves it under
 * {@code D:\爬虫\<NAME>\<NAME><COUNT>.jpg}, incrementing {@code COUNT} for each file.
 *
 * <p>Each image's streams are closed as soon as that image has been written,
 * an empty list is a no-op, and an existing file with the same name is
 * overwritten rather than appended to.
 *
 * @param urlString the image URLs to download
 * @param name currently unused; the target directory and file names are derived from {@code NAME}
 * @throws Exception if a URL is malformed or a download or file write fails
 */
public static void download(List<String> urlString, String name) throws Exception {
    // 1 KB transfer buffer, reused for every image
    byte[] bs = new byte[1024];
    for (String imgUrl : urlString) {
        URL url = new URL(imgUrl);
        URLConnection con = url.openConnection();
        con.setRequestProperty("User-Agent", "Mozilla/5.0");
        // Same limits as getHtml, so a stalled image server cannot hang the crawl.
        con.setConnectTimeout(8000);
        con.setReadTimeout(8000);
        try (InputStream is = con.getInputStream()) {
            String path = "D:\\爬虫\\" + NAME;
            // Target path and file name of the downloaded image
            String filename = path + "\\" + NAME + (++COUNT) + ".jpg";
            File file = new File(filename);
            File parent = file.getParentFile();
            if (!parent.exists()) {
                parent.mkdirs();
            }
            // Overwrite instead of append: COUNT restarts at 0 on every run, so
            // appending would glue a new image onto an old file of the same name.
            try (FileOutputStream os = new FileOutputStream(file)) {
                int len;
                while ((len = is.read(bs)) != -1) {
                    os.write(bs, 0, len);
                }
            }
        }
        System.out.println(NAME);
    }
}
效果:
总结:
0. 设置浏览器驱动位置,通过ChromeDriver访问搜索页面
该页面的内容是通过JavaScript异步加载的,jsoup直接请求只能拿到未渲染的HTML,所以这里不能用jsoup访问,需要借助Selenium
1. 通过代码搜索影视演员并获取演员详情页面
2. 通过jsoup访问演员详情页面
3. 获取详情页面后拿到演员照片页面
4. jsoup访问照片页面获取每张图片的url并下载
5. 递归访问下一页并下载图片