WebCollector可以配置断点爬取,历史数据根据Key(也就是url)去重。
最近在采集百度云网盘,记录一下
/** * @author Liu * @create 2022-08-02 11:48 */ @Component @Slf4j public class DeepCrawler extends BaseCrawler { private CrawlerConfig crawlerConfig; @Override public void execute() { List<CrawlerConfig> crawlerConfigs = new ArrayList<>(); if (this.crawlerConfig != null) { crawlerConfigs.add(this.crawlerConfig); } else { crawlerConfigs = this.crawlerConfigService.getDeepCrawlerConfig(); } super.initCrawlerConfig(crawlerConfigs); //多站点多线程爬取 for (CrawlerConfig config : crawlerConfigs) { try { if (SimpleCrawlerStoreMap.deepCrawlerThreadMap.get(config.getId()) == null) { simpleCrawlerPool.execute(() -> { DeepCrawlerThread deepCrawlerThread = new DeepCrawlerThread(config); SimpleCrawlerStoreMap.deepCrawlerThreadMap.put(config.getId(), deepCrawlerThread); deepCrawlerThread.setNextFilter(new HashSetNextFilter()); try { deepCrawlerThread.start(config.getDeep()); } catch (Exception e) { e.printStackTrace(); log.error(config.getSiteName() + "=>爬取任务异常"); log.error(e.getMessage(), e); } }); } else { log.info(config.getSiteName() + "=>爬取任务进行中……"); } } catch (Exception e) { e.printStackTrace(); } } } public CrawlerConfig getCrawlerConfig() { return crawlerConfig; } public void setCrawlerConfig(CrawlerConfig crawlerConfig) { this.crawlerConfig = crawlerConfig; } }
标签:www,deepCrawlerThread,java,baiduyunsousou,crawlerConfig,CrawlerConfig,crawlerCon From: https://www.cnblogs.com/xibb/p/16965488.html