动态页面爬虫前的准备:https://www.cnblogs.com/maohuidong/p/18517953
java 添加maven依赖:
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.7.4</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.7.4</version>
</dependency>
<!--selenium依赖-->
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-java</artifactId>
<version>3.141.59</version>
</dependency>
ShowDocPageProcessor类:
import org.openqa.selenium.By;
import org.openqa.selenium.Cookie;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.remote.RemoteWebDriver;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.JsonFilePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.List;
import java.util.Set;
/**
 * Crawls http://showdoc.external.xxxx.so/web/#/2?page_id=53 and collects every
 * sub-page link on it. The links only appear after the left-hand menu is
 * expanded, so Selenium simulates the clicks; the scraped title/content pairs
 * are handed to the pipelines (one of which simulates a database insert).
 * The site requires logging in first via .../web/#/user/login.
 */
public class ShowDocPageProcessor implements PageProcessor {

    /**
     * Entry page whose menu is expanded to discover all sub-links. Used both as
     * the Spider's start URL and as the match in process() — previously these
     * were two different hosts, so the discovery branch could never run.
     */
    private static final String START_URL = "http://showdoc.external.xxxx.so/web/#/2?page_id=53";

    private Site site = Site.me().setRetryTimes(3).setSleepTime(0).setTimeOut(3000);
    // Session cookies captured by login(); replayed into the Site in getSite().
    private Set<Cookie> cookies;
    // Shared browser instance, also handed to the custom downloader.
    private RemoteWebDriver driver;

    public ShowDocPageProcessor() {
        System.setProperty("webdriver.chrome.driver", "E:\\chromedriver-win64\\chromedriver-win64\\chromedriver.exe");
        ChromeOptions chromeOptions = new ChromeOptions();
        // Works around the 403 "remote origin" error with newer Chrome/chromedriver.
        chromeOptions.addArguments("--remote-allow-origins=*");
        // Uncomment for headless mode if you don't want a visible browser window.
        // chromeOptions.addArguments("--headless");
        chromeOptions.addArguments("--window-size=1440,1080"); // browser window size
        // fixed: assign the field directly instead of shadowing it with a local
        this.driver = new ChromeDriver(chromeOptions);
    }

    /** Sleeps for the given millis, restoring the interrupt flag on interruption. */
    private static void sleepQuietly(long millis) {
        try {
            Thread.sleep(millis);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt(); // fixed: interrupt status was previously lost
            throw new RuntimeException(e);
        }
    }

    @Override
    public void process(Page page) {
        String url = page.getUrl().get();
        // Start page: expand every menu entry with Selenium clicks and queue each
        // discovered sub-link. Any other URL is a sub-page: scrape its content.
        if (START_URL.equals(url)) {
            List<WebElement> elements = driver.findElements(By.xpath("//*[@id='left-side']/div/ul/li"));
            for (int i = 1; i <= elements.size(); i++) {
                elements.get(i - 1).click();
                sleepQuietly(2000); // wait for the sub-menu to render
                List<WebElement> divs = driver.findElements(By.xpath("//*[@id='left-side']/div/ul/li[" + i + "]/ul/li"));
                for (WebElement div : divs) {
                    div.click();
                    sleepQuietly(2000); // wait for navigation so getCurrentUrl() is the sub-page URL
                    System.out.println("当前的URL=" + driver.getCurrentUrl());
                    page.addTargetRequest(driver.getCurrentUrl());
                }
            }
        } else {
            // Sub-page: store title -> code content for the pipelines.
            page.putField(page.getHtml().xpath("//*[@id='doc-title']/text()").get(),
                    page.getHtml().xpath("//*[@id='editor-md']/ul[2]/li/code/text()").get());
        }
    }

    /**
     * Logs in through the browser and captures the session cookies so WebMagic
     * can reuse the authenticated session.
     *
     * @param loginUrl the login page URL
     * @param userName the account name to type into the first form input
     * @param password the password to type into the second form input
     */
    public void login(String loginUrl, String userName, String password) {
        driver.get(loginUrl);
        WebElement userInput = driver.findElement(By.xpath("//form/div[1]//input"));
        userInput.clear();
        userInput.sendKeys(userName);
        WebElement passwordInput = driver.findElement(By.xpath("//form/div[2]//input"));
        passwordInput.clear();
        passwordInput.sendKeys(password);
        // Simulate clicking the login button.
        driver.findElement(By.xpath("//button")).click();
        // Capture the authenticated session's cookies.
        cookies = driver.manage().getCookies();
        // driver.close();
    }

    @Override
    public Site getSite() {
        // Replay the captured cookies into WebMagic. Guard against login() not
        // having been called yet — previously this threw a NullPointerException.
        if (cookies != null) {
            for (Cookie cookie : cookies) {
                // getName()/getValue() already return String; redundant toString() removed
                site.addCookie(cookie.getName(), cookie.getValue());
            }
        }
        return site.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1");
    }

    public static void main(String[] args) {
        ShowDocPageProcessor pageProcessor = new ShowDocPageProcessor();
        // Log in first so the crawl session is authenticated.
        pageProcessor.login("http://showdoc.external.xxx.so/web/#/user/login", "xxx", "xxx");
        Spider.create(pageProcessor)
                // fixed: seed the same URL that process() matches (was a different host)
                .addUrl(START_URL)
                .setDownloader(new MyDownloader(pageProcessor.driver)) // Selenium-backed downloader
                // Dump results as JSON under D:\webmagic
                .addPipeline(new JsonFilePipeline("D:\\webmagic\\"))
                .addPipeline(new MyPipeline())
                // single thread — one shared WebDriver instance is not thread-safe
                .thread(1)
                .run();
        System.out.println("爬取结束");
    }
}
MyDownloader类:
import org.openqa.selenium.remote.RemoteWebDriver;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.downloader.Downloader;
import us.codecraft.webmagic.selector.PlainText;
/**
 * WebMagic Downloader that fetches pages through a shared Selenium WebDriver,
 * so JavaScript-rendered (dynamic) content is present in the page source.
 */
public class MyDownloader implements Downloader {

    // Shared driver instance (already logged in by the page processor).
    private RemoteWebDriver driver;

    public MyDownloader(RemoteWebDriver driver) {
        this.driver = driver;
    }

    /**
     * Loads the request URL in the browser, waits for rendering, and wraps the
     * rendered page source into a WebMagic {@link Page}.
     *
     * @return the downloaded page, or {@code null} if the wait was interrupted
     */
    @Override
    public Page download(Request request, Task task) {
        try {
            driver.get(request.getUrl());
            Thread.sleep(2000); // wait for the page to load
            // Manual refresh: for some links the SPA does not re-render otherwise.
            driver.navigate().refresh();
            Thread.sleep(2000); // wait for the refresh to complete
            // Scroll near the bottom so lazily-loaded content is rendered too.
            driver.executeScript("window.scrollTo(0, document.body.scrollHeight - 1000)");
            Thread.sleep(2000); // wait for scrolling to finish
            // Package the rendered source as a Page for the PageProcessor.
            Page page = createPage(request.getUrl(), driver.getPageSource());
            // driver.close(); // close the browser here if desired
            return page;
        } catch (InterruptedException e) {
            // fixed: restore the interrupt flag instead of silently discarding it
            Thread.currentThread().interrupt();
            e.printStackTrace();
        }
        return null;
    }

    @Override
    public void setThread(int threadNum) {
        // Single shared driver — the thread count is intentionally ignored.
    }

    /** Builds a successful Page object from a URL and its raw HTML source. */
    private Page createPage(String url, String content) {
        Page page = new Page();
        page.setRawText(content);
        page.setUrl(new PlainText(url));
        page.setRequest(new Request(url));
        page.setDownloadSuccess(true);
        return page;
    }
}
MyPipeline类:
来源: https://www.cnblogs.com/maohuidong/p/18539528 (标签: codecraft, selenium, driver, 抓取, import, page, 模拟, webmagic)
import org.springframework.util.StringUtils;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
import java.util.Map;
/**
 * Pipeline that simulates persisting crawl results: each extracted
 * title/content pair is printed as if it were inserted into a database.
 */
public class MyPipeline implements Pipeline {

    @Override
    public void process(ResultItems resultItems, Task task) {
        Map<String, Object> all = resultItems.getAll();
        if (all == null || all.isEmpty()) {
            return; // nothing extracted for this page
        }
        all.forEach((key, value) -> {
            // Simulated database insert: skip pairs with a missing key or value.
            // Plain null/empty checks replace the deprecated StringUtils.isEmpty(Object),
            // with identical semantics (null or "" is considered empty).
            if (key == null || "".equals(key) || value == null || "".equals(value)) {
                // fixed: removed the unused local `url` that was computed here and discarded
                System.out.println("有空数据");
            } else {
                System.out.println("key=" + key + "。value=" + value.toString());
            }
        });
    }
}