Heritrix架构学习笔记（三）

标签：架构 Heritrix 笔记 public controller long return 链接 cURI

3、Frontier链接制造工厂

在 heritrix-1.12.1/docs/articles/developer_manual/frontier.html 中可以找到 Heritrix 官方文档给出的一个 Frontier 例子：

/** * A simple Frontier implementation for tutorial purposes / public class MyFrontier extends ModuleType implements Frontier, FetchStatusCodes { // A list of the discovered URIs that should be crawled. // 列表用来保存还未抓取的链接 List pendingURIs = new ArrayList(); // A list of prerequisites that needs to be met before any other URI is // allowed to be crawled, e.g. DNS-lookups //这个列表中保存了一系列的链接，它们的优先级要高于pendingURIs那个List中的任何一个链接， //表中的链接表示一些需要被满足的先决条件 List prerequisites = new ArrayList(); // A hash of already crawled URIs so that every URI is crawled only once. //一个HashMap，用于存储那些已经抓取过的链接 Map alreadyIncluded = new HashMap(); // Reference to the CrawlController. // CrawlController对象 CrawlController controller; // Flag to note if a URI is being processed. //用于标识是否一个链接正在被处理 boolean uriInProcess = false; // top-level stats //成功下载的数量 long successCount = 0; //失败的数量 long failedCount = 0; //抛弃掉链接的数量 long disregardedCount = 0; //总共下载的字节数 long totalProcessedBytes = 0; public MyFrontier(String name) { super(Frontier.ATTR_NAME, "A simple frontier."); } public void initialize(CrawlController controller) throws FatalConfigurationException, IOException { //注入 this.controller = controller; // Initialize the pending queue with the seeds //把种子文件中的链接加入到pengdingURIs中去 this.controller.getScope().refreshSeeds(); List seeds = this.controller.getScope().getSeedlist(); synchronized(seeds) { for (Iterator i = seeds.iterator(); i.hasNext();) { UURI u = (UURI) i.next(); CandidateURI caUri = new CandidateURI(u); caUri.setSeed(); schedule(caUri); } } } //该方法是给线程池中的线程调用的，用以取出下一个准备处理的链接 public synchronized CrawlURI next(int timeout) throws InterruptedException { if (!uriInProcess && !isEmpty()) { uriInProcess = true; CrawlURI curi; / 先看prerequistes队列中是否有要处理的链接，如果有，就先处理，如果没有，再看pengdingURIs队列中是否有链接。每次在处理的时候，总是取出队列中的第一个链接 / if (!prerequisites.isEmpty()) { curi = CrawlURI.from((CandidateURI) prerequisites.remove(0)); } else { curi = CrawlURI.from((CandidateURI) pendingURIs.remove(0)); } 
curi.setServer(controller.getServerCache().getServerFor(curi)); return curi; } else { wait(timeout); return null; } } public boolean isEmpty() { return pendingURIs.isEmpty() && prerequisites.isEmpty(); } //该方法用于将新链接加入到pengdingURIs队列中，等待处理 public synchronized void schedule(CandidateURI caURI) { // Schedule a uri for crawling if it is not already crawled / 首先判断要加入的链接是否已经被抓取过，如果已经包含在alreadyIncluded这个HashMap中则说明处理过了，就可以放弃处理 */ if (!alreadyIncluded.containsKey(caURI.getURIString())) { if(caURI.needsImmediateScheduling()) { prerequisites.add(caURI); } else { pendingURIs.add(caURI); } //HashMap中使用url的字符串来作为key，而将实际的CadidateURI对象作为value alreadyIncluded.put(caURI.getURIString(), caURI); } } public void batchSchedule(CandidateURI caURI) { schedule(caURI); } public void batchFlush() { } //一次抓取结束后所执行的操作，该操作由线程池中的线程来进行调用 public synchronized void finished(CrawlURI cURI) { uriInProcess = false; //成功下载 if (cURI.isSuccess()) { successCount++; //统计下载总数 totalProcessedBytes += cURI.getContentSize(); //如果成功，则触发一个成功事件，比如将Extractor解析出来的新URL加入队列中 controller.fireCrawledURISuccessfulEvent(cURI); cURI.stripToMinimal(); } //需要推迟下载 else if (cURI.getFetchStatus() == S_DEFERRED) { cURI.processingCleanup(); alreadyIncluded.remove(cURI.getURIString()); schedule(cURI); } //其他状态 else if (cURI.getFetchStatus() == S_ROBOTS_PRECLUDED \|\| cURI.getFetchStatus() == S_OUT_OF_SCOPE \|\| cURI.getFetchStatus() == S_BLOCKED_BY_USER \|\| cURI.getFetchStatus() == S_TOO_MANY_EMBED_HOPS \|\| cURI.getFetchStatus() == S_TOO_MANY_LINK_HOPS \|\| cURI.getFetchStatus() == S_DELETED_BY_USER) { //抛弃当前URI controller.fireCrawledURIDisregardEvent(cURI); disregardedCount++; cURI.stripToMinimal(); } else { controller.fireCrawledURIFailureEvent(cURI); failedCount++; cURI.stripToMinimal(); } cURI.processingCleanup(); } //返回所有已经处理过的链接数量 public long discoveredUriCount() { return alreadyIncluded.size(); } //返回所有等待处理的链接数量 public long queuedUriCount() { return pendingURIs.size() + prerequisites.size(); } //返回所有已经完成的链接数量 public long 
finishedUriCount() { return successCount + failedCount + disregardedCount; } //返回所有成功处理的链接数量 public long successfullyFetchedCount() { return successCount; } //返回所有失败的链接数量 public long failedFetchCount() { return failedCount; } //返回所有抛弃的链接数量 public long disregardedFetchCount() { return disregardedCount; } //返回总共下载的字节数 public long totalBytesWritten() { return totalProcessedBytes; } public String report() { return "This frontier does not return a report."; } public void importRecoverLog(String pathToLog) throws IOException { throw new UnsupportedOperationException(); } public FrontierMarker getInitialMarker(String regexpr, boolean inCacheOnly) { return null; } public ArrayList getURIsList(FrontierMarker marker, int numberOfMatches, boolean verbose) throws InvalidFrontierMarkerException { return null; } public long deleteURIs(String match) { return 0; } }

/**
 * A simple Frontier implementation for tutorial purposes.
 *
 * <p>Scheduled URIs are kept in two FIFO lists: {@code prerequisites} is
 * always drained before {@code pendingURIs}. At most one URI is handed out at
 * a time; the {@code uriInProcess} flag makes {@link #next(int)} return
 * {@code null} until {@link #finished(CrawlURI)} has been called for the URI
 * handed out previously.
 *
 * <p>{@code next}, {@code schedule} and {@code finished} are synchronized on
 * this object. {@code schedule} and {@code finished} call {@code notifyAll()}
 * so that a caller parked inside {@code next} is woken as soon as a URI may be
 * available instead of sleeping for its whole timeout.
 */
public class MyFrontier extends ModuleType implements Frontier,
        FetchStatusCodes {
    // URIs that have been scheduled and are waiting to be crawled.
    List pendingURIs = new ArrayList();

    // URIs that are handed out before anything in pendingURIs
    // (prerequisites of other URIs, e.g. DNS lookups).
    List prerequisites = new ArrayList();

    // Every URI accepted by schedule(), keyed by its URI string, with the
    // CandidateURI as value. Used so that a URI is only scheduled once.
    Map alreadyIncluded = new HashMap();

    // Reference to the CrawlController, set in initialize().
    CrawlController controller;

    // True while a URI handed out by next() has not yet been passed to finished().
    boolean uriInProcess = false;

    // Top-level statistics.
    // Number of URIs finished successfully.
    long successCount = 0;
    // Number of URIs finished with a failure.
    long failedCount = 0;
    // Number of URIs that were disregarded.
    long disregardedCount = 0;
    // Sum of getContentSize() over all successfully finished URIs.
    long totalProcessedBytes = 0;

    /**
     * Creates the frontier module.
     *
     * @param name not used; the module is always registered under
     *             {@code Frontier.ATTR_NAME}
     */
    public MyFrontier(String name) {
        super(Frontier.ATTR_NAME, "A simple frontier.");
    }

    /**
     * Stores the controller and schedules every seed of the crawl scope.
     *
     * @param controller the controller this frontier belongs to
     * @throws FatalConfigurationException declared by the Frontier contract
     * @throws IOException declared by the Frontier contract
     */
    public void initialize(CrawlController controller)
            throws FatalConfigurationException, IOException {
        this.controller = controller;

        // Initialize the queues with the seeds of the scope.
        this.controller.getScope().refreshSeeds();
        List seeds = this.controller.getScope().getSeedlist();
        synchronized (seeds) {
            for (Iterator i = seeds.iterator(); i.hasNext();) {
                UURI u = (UURI) i.next();
                CandidateURI caUri = new CandidateURI(u);
                caUri.setSeed();
                schedule(caUri);
            }
        }
    }

    /**
     * Hands out the next URI to process.
     *
     * <p>If a URI is still in process, or nothing is queued, the caller waits
     * on this object for up to {@code timeout} milliseconds (or until
     * {@code schedule}/{@code finished} notifies) and then gets {@code null};
     * it is expected to call again.
     *
     * @param timeout maximum time to wait in milliseconds; as with
     *                {@code Object.wait(long)}, 0 means wait until notified
     * @return the next URI, or {@code null} if none could be handed out
     * @throws InterruptedException if the thread is interrupted while waiting
     */
    public synchronized CrawlURI next(int timeout) throws InterruptedException {
        if (uriInProcess || isEmpty()) {
            wait(timeout);
            return null;
        }

        uriInProcess = true;
        CrawlURI curi;
        // Prerequisites take priority; in both lists the head element is taken.
        if (!prerequisites.isEmpty()) {
            curi = CrawlURI.from((CandidateURI) prerequisites.remove(0));
        } else {
            curi = CrawlURI.from((CandidateURI) pendingURIs.remove(0));
        }
        curi.setServer(controller.getServerCache().getServerFor(curi));
        return curi;
    }

    /**
     * @return {@code true} if neither queue holds a URI
     */
    public boolean isEmpty() {
        return pendingURIs.isEmpty() && prerequisites.isEmpty();
    }

    /**
     * Queues a URI for crawling unless a URI with the same URI string has
     * already been scheduled.
     *
     * @param caURI the candidate to schedule
     */
    public synchronized void schedule(CandidateURI caURI) {
        if (!alreadyIncluded.containsKey(caURI.getURIString())) {
            if (caURI.needsImmediateScheduling()) {
                prerequisites.add(caURI);
            } else {
                pendingURIs.add(caURI);
            }
            // Key is the URI string, value the CandidateURI itself.
            alreadyIncluded.put(caURI.getURIString(), caURI);
            // A queue just became non-empty: wake callers waiting in next().
            notifyAll();
        }
    }

    /**
     * Schedules the URI immediately; this frontier does no batching.
     *
     * @param caURI the candidate to schedule
     */
    public void batchSchedule(CandidateURI caURI) {
        schedule(caURI);
    }

    /**
     * Does nothing, because batchSchedule() never buffers.
     */
    public void batchFlush() {
    }

    /**
     * Reports that processing of a URI handed out by next() has ended,
     * updates the statistics and fires the matching controller event.
     *
     * @param cURI the URI whose processing has ended
     */
    public synchronized void finished(CrawlURI cURI) {
        uriInProcess = false;
        // next() may hand out a URI again: wake callers waiting there.
        notifyAll();

        if (cURI.isSuccess()) {
            successCount++;
            totalProcessedBytes += cURI.getContentSize();
            controller.fireCrawledURISuccessfulEvent(cURI);
            cURI.stripToMinimal();
        } else if (cURI.getFetchStatus() == S_DEFERRED) {
            // Deferred: put the URI back into the queues. It has to be removed
            // from alreadyIncluded first, otherwise schedule() would reject it.
            cURI.processingCleanup();
            alreadyIncluded.remove(cURI.getURIString());
            schedule(cURI);
        } else if (cURI.getFetchStatus() == S_ROBOTS_PRECLUDED
                || cURI.getFetchStatus() == S_OUT_OF_SCOPE
                || cURI.getFetchStatus() == S_BLOCKED_BY_USER
                || cURI.getFetchStatus() == S_TOO_MANY_EMBED_HOPS
                || cURI.getFetchStatus() == S_TOO_MANY_LINK_HOPS
                || cURI.getFetchStatus() == S_DELETED_BY_USER) {
            // These statuses count as "disregarded" rather than "failed".
            controller.fireCrawledURIDisregardEvent(cURI);
            disregardedCount++;
            cURI.stripToMinimal();
        } else {
            // Every other status is counted as a failure.
            controller.fireCrawledURIFailureEvent(cURI);
            failedCount++;
            cURI.stripToMinimal();
        }
        cURI.processingCleanup();
    }

    /**
     * @return the number of distinct URIs currently recorded as scheduled
     *         (size of {@code alreadyIncluded}); this includes URIs that are
     *         still queued, not only those already crawled
     */
    public long discoveredUriCount() {
        return alreadyIncluded.size();
    }

    /**
     * @return the number of URIs waiting in either queue
     */
    public long queuedUriCount() {
        return pendingURIs.size() + prerequisites.size();
    }

    /**
     * @return the number of URIs finished as success, failure or disregarded
     */
    public long finishedUriCount() {
        return successCount + failedCount + disregardedCount;
    }

    /**
     * @return the number of URIs finished successfully
     */
    public long successfullyFetchedCount() {
        return successCount;
    }

    /**
     * @return the number of URIs finished with a failure
     */
    public long failedFetchCount() {
        return failedCount;
    }

    /**
     * @return the number of URIs that were disregarded
     */
    public long disregardedFetchCount() {
        return disregardedCount;
    }

    /**
     * @return the summed content size of all successfully finished URIs
     */
    public long totalBytesWritten() {
        return totalProcessedBytes;
    }

    /**
     * @return a fixed message; this frontier produces no report
     */
    public String report() {
        return "This frontier does not return a report.";
    }

    /**
     * Not supported by this frontier.
     *
     * @param pathToLog ignored
     * @throws IOException never thrown here; declared by the Frontier contract
     * @throws UnsupportedOperationException always
     */
    public void importRecoverLog(String pathToLog) throws IOException {
        throw new UnsupportedOperationException();
    }

    /**
     * Not implemented by this frontier.
     *
     * @param regexpr ignored
     * @param inCacheOnly ignored
     * @return always {@code null}
     */
    public FrontierMarker getInitialMarker(String regexpr,
            boolean inCacheOnly) {
        return null;
    }

    /**
     * Not implemented by this frontier.
     *
     * @param marker ignored
     * @param numberOfMatches ignored
     * @param verbose ignored
     * @return always {@code null}
     * @throws InvalidFrontierMarkerException never thrown here; declared by
     *         the Frontier contract
     */
    public ArrayList getURIsList(FrontierMarker marker, int numberOfMatches,
            boolean verbose) throws InvalidFrontierMarkerException {
        return null;
    }

    /**
     * Not implemented by this frontier; nothing is deleted.
     *
     * @param match ignored
     * @return always 0
     */
    public long deleteURIs(String match) {
        return 0;
    }

}

注意：上面仅仅是一段最基础的示例代码，目的是从结构上说明一个Frontier的作用。

标签：架构,Heritrix,笔记,public,controller,long,return,链接,cURI
From： https://blog.51cto.com/u_2544485/7396870

Heritrix架构学习笔记（三）

3、Frontier链接制造工厂

相关文章

赞助商

阅读排行