class UrlManager:
    """Track crawler URLs, separating those still pending from those handed out.

    Attributes:
        new_urls: set of URLs that have been added but not yet returned by
            ``get_url``.
        old_urls: set of URLs that ``get_url`` has already returned.
    """

    def __init__(self):
        """Create a manager with empty pending and visited sets."""
        self.new_urls = set()
        self.old_urls = set()

    def add_new_url(self, url):
        """Queue a single URL unless it is empty or already known.

        Args:
            url: the URL to queue. ``None`` and empty values are ignored, as
                are URLs already present in ``new_urls`` or ``old_urls``.
        """
        if not url:
            return
        if url in self.new_urls or url in self.old_urls:
            return
        self.new_urls.add(url)

    def add_new_urls(self, urls):
        """Queue every URL in ``urls`` via ``add_new_url``.

        Args:
            urls: an iterable of URLs. ``None`` or an empty collection is a
                no-op.
        """
        if not urls:
            return
        for url in urls:
            self.add_new_url(url)

    def get_url(self):
        """Remove one pending URL, mark it as visited, and return it.

        The pending URLs live in a set, so the URL returned is an arbitrary
        one -- there is no FIFO/LIFO ordering guarantee.

        Returns:
            A pending URL, or ``None`` when there are no pending URLs.
        """
        if not self.has_new_url():
            return None
        url = self.new_urls.pop()
        self.old_urls.add(url)
        return url

    def has_new_url(self):
        """Return ``True`` if at least one URL is still pending."""
        return len(self.new_urls) > 0


if __name__ == "__main__":
    url_manger = UrlManager()
# Tags: __, manager, python, self, return, crawl, url, urls, new
# Source: https://www.cnblogs.com/cocotun/p/17560038.html