标签:架构 Heritrix 笔记 public controller long return 链接 cURI
3、Frontier链接制造工厂
在 heritrix-1.12.1/docs/articles/developer_manual/frontier.html 下可以找到 Heritrix 官方文档中的一个 Frontier 示例:
/**
 * A simple Frontier implementation for tutorial purposes.
 *
 * <p>URIs waiting to be crawled are kept in two in-memory lists:
 * {@code prerequisites}, which is always drained first, and
 * {@code pendingURIs}. At most one URI is handed out at a time; this is
 * enforced by the {@code uriInProcess} flag, which {@link #next(int)} sets
 * and {@link #finished(CrawlURI)} clears.
 *
 * <p>{@code next}, {@code schedule} and {@code finished} are synchronized on
 * this object. The counter/reporting getters and {@link #isEmpty()} are not.
 */
public class MyFrontier extends ModuleType implements Frontier,
        FetchStatusCodes {
    // URIs that have been discovered and are waiting to be crawled.
    List pendingURIs = new ArrayList();

    // URIs that must be handled before anything in pendingURIs is handed
    // out, e.g. DNS lookups. next() always serves this list first.
    List prerequisites = new ArrayList();

    // Every URI string that has been scheduled, mapped to its CandidateURI.
    // Used so that each URI is scheduled only once.
    Map alreadyIncluded = new HashMap();

    // Reference to the CrawlController, set in initialize().
    CrawlController controller;

    // True while a URI handed out by next() has not yet been passed back
    // to finished().
    boolean uriInProcess = false;

    // Top-level stats.
    // Number of URIs finished with isSuccess() == true.
    long successCount = 0;
    // Number of URIs finished in the generic failure branch.
    long failedCount = 0;
    // Number of URIs finished with one of the "disregard" fetch statuses.
    long disregardedCount = 0;
    // Sum of getContentSize() over all successfully finished URIs.
    long totalProcessedBytes = 0;

    /**
     * Creates the frontier module.
     *
     * @param name not used by this constructor; {@code Frontier.ATTR_NAME}
     *        is what gets passed to the superclass
     */
    public MyFrontier(String name) {
        super(Frontier.ATTR_NAME, "A simple frontier.");
    }

    /**
     * Stores the controller and schedules every seed of the crawl scope.
     *
     * @param controller the controller this frontier belongs to
     * @throws FatalConfigurationException declared by the signature; not
     *         thrown directly by this method
     * @throws IOException declared by the signature; not thrown directly
     *         by this method
     */
    public void initialize(CrawlController controller)
            throws FatalConfigurationException, IOException {
        this.controller = controller;

        // Initialize the pending queue with the seeds.
        this.controller.getScope().refreshSeeds();
        List seeds = this.controller.getScope().getSeedlist();
        // Hold the seed list's monitor for the whole iteration.
        synchronized (seeds) {
            for (Iterator i = seeds.iterator(); i.hasNext();) {
                UURI u = (UURI) i.next();
                CandidateURI caUri = new CandidateURI(u);
                caUri.setSeed();
                schedule(caUri);
            }
        }
    }

    /**
     * Hands out the next URI to process, or waits when none can be handed
     * out right now.
     *
     * <p>A URI is returned only if no other URI is currently in process and
     * at least one URI is queued. Prerequisites are served before pending
     * URIs, and each list is consumed from its head. Otherwise the caller
     * waits on this object's monitor for up to {@code timeout} milliseconds
     * (or until {@link #schedule(CandidateURI)} or
     * {@link #finished(CrawlURI)} wakes it) and then gets {@code null}, so
     * callers are expected to call again.
     *
     * @param timeout maximum time to wait, passed to {@link Object#wait(long)};
     *        0 means wait until notified
     * @return the next URI to crawl, or {@code null} if none was available
     * @throws InterruptedException if the thread is interrupted while waiting
     */
    public synchronized CrawlURI next(int timeout) throws InterruptedException {
        if (!uriInProcess && !isEmpty()) {
            uriInProcess = true;
            CrawlURI curi;
            if (!prerequisites.isEmpty()) {
                curi = CrawlURI.from((CandidateURI) prerequisites.remove(0));
            } else {
                curi = CrawlURI.from((CandidateURI) pendingURIs.remove(0));
            }
            curi.setServer(controller.getServerCache().getServerFor(curi));
            return curi;
        } else {
            wait(timeout);
            return null;
        }
    }

    /**
     * @return {@code true} if both the pending list and the prerequisite
     *         list are empty
     */
    public boolean isEmpty() {
        return pendingURIs.isEmpty() && prerequisites.isEmpty();
    }

    /**
     * Queues a URI for crawling unless its URI string has been scheduled
     * before.
     *
     * <p>URIs that report {@code needsImmediateScheduling()} go to the
     * prerequisite list, all others to the pending list. Threads blocked in
     * {@link #next(int)} are woken when a URI is actually queued.
     *
     * @param caURI the candidate URI to schedule
     */
    public synchronized void schedule(CandidateURI caURI) {
        // Schedule a URI for crawling only if it is not already known.
        if (!alreadyIncluded.containsKey(caURI.getURIString())) {
            if (caURI.needsImmediateScheduling()) {
                prerequisites.add(caURI);
            } else {
                pendingURIs.add(caURI);
            }
            // Key is the URI string, value is the CandidateURI itself.
            alreadyIncluded.put(caURI.getURIString(), caURI);
            // New work is available: wake threads waiting in next() instead
            // of leaving them blocked until their timeout expires (or
            // forever, when they called next(0)).
            notifyAll();
        }
    }

    /**
     * Schedules a single URI; this implementation does no batching and
     * simply delegates to {@link #schedule(CandidateURI)}.
     *
     * @param caURI the candidate URI to schedule
     */
    public void batchSchedule(CandidateURI caURI) {
        schedule(caURI);
    }

    /** Does nothing; there is no batch to flush. */
    public void batchFlush() {
    }

    /**
     * Records the outcome of a URI previously handed out by
     * {@link #next(int)} and frees the frontier to hand out another one.
     *
     * <ul>
     * <li>success: counters are updated and the success event is fired;</li>
     * <li>{@code S_DEFERRED}: the URI is forgotten and scheduled again;</li>
     * <li>one of the listed "disregard" statuses: the disregard event is
     *     fired;</li>
     * <li>anything else: the failure event is fired.</li>
     * </ul>
     *
     * @param cURI the URI whose processing has ended
     */
    public synchronized void finished(CrawlURI cURI) {
        uriInProcess = false;
        // The in-process slot is free again: wake threads waiting in next().
        // They can only proceed once this synchronized method has returned.
        notifyAll();

        if (cURI.isSuccess()) {
            successCount++;
            // Accumulate the number of bytes processed.
            totalProcessedBytes += cURI.getContentSize();
            controller.fireCrawledURISuccessfulEvent(cURI);
            cURI.stripToMinimal();
        } else if (cURI.getFetchStatus() == S_DEFERRED) {
            // Deferred: clean up, drop it from the "already scheduled" map
            // so that schedule() accepts it again, and re-queue it.
            cURI.processingCleanup();
            alreadyIncluded.remove(cURI.getURIString());
            schedule(cURI);
        } else if (cURI.getFetchStatus() == S_ROBOTS_PRECLUDED
                || cURI.getFetchStatus() == S_OUT_OF_SCOPE
                || cURI.getFetchStatus() == S_BLOCKED_BY_USER
                || cURI.getFetchStatus() == S_TOO_MANY_EMBED_HOPS
                || cURI.getFetchStatus() == S_TOO_MANY_LINK_HOPS
                || cURI.getFetchStatus() == S_DELETED_BY_USER) {
            // The URI is disregarded rather than counted as a failure.
            controller.fireCrawledURIDisregardEvent(cURI);
            disregardedCount++;
            cURI.stripToMinimal();
        } else {
            controller.fireCrawledURIFailureEvent(cURI);
            failedCount++;
            cURI.stripToMinimal();
        }
        // NOTE(review): on the S_DEFERRED path this is the second call to
        // processingCleanup(); kept as in the original, presumably harmless
        // -- confirm against CrawlURI.
        cURI.processingCleanup();
    }

    /**
     * @return the number of URI strings currently recorded as scheduled
     *         (the size of {@code alreadyIncluded})
     */
    public long discoveredUriCount() {
        return alreadyIncluded.size();
    }

    /** @return the number of URIs waiting in both queues */
    public long queuedUriCount() {
        return pendingURIs.size() + prerequisites.size();
    }

    /** @return the sum of the success, failure and disregard counters */
    public long finishedUriCount() {
        return successCount + failedCount + disregardedCount;
    }

    /** @return the number of URIs finished successfully */
    public long successfullyFetchedCount() {
        return successCount;
    }

    /** @return the number of URIs finished as failures */
    public long failedFetchCount() {
        return failedCount;
    }

    /** @return the number of URIs finished as disregarded */
    public long disregardedFetchCount() {
        return disregardedCount;
    }

    /** @return the accumulated content size of successfully finished URIs */
    public long totalBytesWritten() {
        return totalProcessedBytes;
    }

    /** @return a fixed message; this frontier produces no report */
    public String report() {
        return "This frontier does not return a report.";
    }

    /**
     * Not supported by this frontier.
     *
     * @param pathToLog ignored
     * @throws UnsupportedOperationException always
     */
    public void importRecoverLog(String pathToLog) throws IOException {
        throw new UnsupportedOperationException();
    }

    /**
     * Not implemented.
     *
     * @return always {@code null}
     */
    public FrontierMarker getInitialMarker(String regexpr,
            boolean inCacheOnly) {
        return null;
    }

    /**
     * Not implemented.
     *
     * @return always {@code null}
     */
    public ArrayList getURIsList(FrontierMarker marker, int numberOfMatches,
            boolean verbose) throws InvalidFrontierMarkerException {
        return null;
    }

    /**
     * Not implemented; nothing is deleted.
     *
     * @return always 0
     */
    public long deleteURIs(String match) {
        return 0;
    }
}
注意:上面仅仅是一段最基础的示例代码,只是从结构上揭示了 Frontier 的作用。
标签:架构, Heritrix, 笔记, public, controller, long, return, 链接, cURI
From: https://blog.51cto.com/u_2544485/7396870