DuSingleton.py
import os
import threading
import urllib.request
from urllib.parse import urlparse, urljoin

import httplib2                 # https://pypi.org/project/httplib2/
from bs4 import BeautifulSoup   # https://pypi.org/project/bs4/


# Singleton Pattern

class CrawlerSingleton(object):
    def __new__(cls):
        """ creates a singleton object, if it is not created,
        or else returns the previous singleton object"""
        if not hasattr(cls, 'instance'):
            cls.instance = super(CrawlerSingleton, cls).__new__(cls)
        return cls.instance


def navigate_site(max_links=5):
    """ navigate the website using a BFS algorithm, find links and
    arrange them for downloading images """

    # singleton instance
    parser_crawlersingleton = CrawlerSingleton()

    # During the initial stage, url_queue holds only the main_url.
    # While parsing each page, new links that belong to the same
    # website are added to url_queue until the number of visited
    # pages reaches max_links.
    while parser_crawlersingleton.url_queue:
        # stop once the maximum number of links has been visited
        if len(parser_crawlersingleton.visited_url) == max_links:
            return

        # pop the next url from the queue
        url = parser_crawlersingleton.url_queue.pop()

        # connect to the web page
        http = httplib2.Http()
        try:
            response, content = http.request(url)
        except Exception:
            continue

        # record the link so its images can be downloaded later
        parser_crawlersingleton.visited_url.add(url)
        print(url)

        # base location of the page just fetched; links pointing to
        # a different host are treated as external and skipped
        parsed_base = urlparse(url)

        # crawl the web page and fetch the links within it
        bs = BeautifulSoup(content, "html.parser")
        for link in bs.find_all('a'):
            link_url = link.get('href')
            if not link_url:
                continue

            # parse the fetched link
            parsed = urlparse(link_url)
            print(link_url)

            # skip the link if it leads to an external page
            if parsed.netloc and parsed.netloc != parsed_base.netloc:
                continue

            # construct a full url from the pieces
            scheme = parsed_base.scheme
            netloc = parsed.netloc or parsed_base.netloc
            path = parsed.path
            link_url = scheme + '://' + netloc + path

            # skip if the link was already visited
            if link_url in parser_crawlersingleton.visited_url:
                continue

            # add the newly fetched link so that the while loop
            # continues with the next iteration
            parser_crawlersingleton.url_queue = [link_url] + \
                parser_crawlersingleton.url_queue


class ParallelDownloader(threading.Thread):
    """ Download the images in parallel """

    def __init__(self, thread_id, name, counter):
        threading.Thread.__init__(self)
        self.thread_id = thread_id
        self.name = name
        self.counter = counter

    def run(self):
        print('Starting thread', self.name)
        # function to download the images
        download_images(self.name)
        print('Finished thread', self.name)


def download_images(thread_name):
    # singleton instance
    singleton = CrawlerSingleton()

    # visited_url holds the set of crawled URLs;
    # fetch each one and download the images found in it
    while singleton.visited_url:
        # pop a url to download its images
        url = singleton.visited_url.pop()
        http = httplib2.Http()
        print(thread_name, 'Downloading images from', url)

        try:
            response, content = http.request(url)
        except Exception:
            continue

        # parse the web page and find all <img> tags
        bs = BeautifulSoup(content, "html.parser")
        images = bs.find_all('img')

        for image in images:
            src = image.get('src')
            if not src:
                continue
            src = urljoin(url, src)

            basename = os.path.basename(src)
            print('basename:', basename)

            if basename != '':
                if src not in singleton.image_downloaded:
                    singleton.image_downloaded.add(src)
                    print('Downloading', src)
                    # download the image to the local file system
                    urllib.request.urlretrieve(src, os.path.join('images', basename))

        print(thread_name, 'finished downloading images from', url)


def main(main_url):
    # singleton instance
    crwSingltn = CrawlerSingleton()

    # add the url to the queue for parsing
    crwSingltn.url_queue = [main_url]
    print(main_url)

    # a set to store all visited URLs whose images will be downloaded
    crwSingltn.visited_url = set()

    # a set to store the URLs of the downloaded images
    crwSingltn.image_downloaded = set()

    # invoke the method to crawl the website
    # navigate_site(5)  # has problems

    # create the images directory if it does not exist
    if not os.path.exists('images'):
        os.makedirs('images')

    thread1 = ParallelDownloader(1, "Thread-1", 1)
    thread2 = ParallelDownloader(2, "Thread-2", 2)

    # start new threads
    thread1.start()
    thread2.start()
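CrawlerSingleton works because `__new__` caches the first instance on the class itself, so every later construction returns that same object. A quick check of the identity guarantee (a minimal sketch, not part of the original post):

crawler_a = CrawlerSingleton()
crawler_b = CrawlerSingleton()
print(crawler_a is crawler_b)   # True: both names refer to the one shared instance

This shared identity is what lets navigate_site and download_images exchange url_queue, visited_url and image_downloaded without passing any state explicitly.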
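One caveat: the `hasattr` check in `__new__` is not atomic, and two ParallelDownloader threads both call CrawlerSingleton(), so two threads racing through first-time construction could each create an instance. A lock-guarded, double-checked variant is a common hardening; the class below is a sketch of that idea (an assumption, not code from the original post):

import threading

class ThreadSafeCrawlerSingleton(object):
    # hypothetical hardened variant; guards first-time instance creation
    _lock = threading.Lock()

    def __new__(cls):
        # double-checked locking: take the lock only while the
        # instance does not yet exist, then use the fast path
        if not hasattr(cls, 'instance'):
            with cls._lock:
                if not hasattr(cls, 'instance'):
                    cls.instance = super(ThreadSafeCrawlerSingleton, cls).__new__(cls)
        return cls.instance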
main.py
Call:
# Singleton Pattern
main_url = "http://www.dusystem.com/"
DuSingleton.main(main_url)
Output:
http://www.dusystem.com/
Starting thread Thread-1
Finished thread Thread-1
Starting thread Thread-2
Finished thread Thread-2

Both threads start and finish immediately because navigate_site(5) is commented out in main(): visited_url stays empty, so download_images has nothing to fetch.
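Re-enabling navigate_site (viable once it derives the base host from the current page, as in the fixed module above) fills visited_url before the download threads run. A sketch of driving the same steps from main.py, assuming the site is reachable; the thread names and counts mirror the original:

import os
import DuSingleton

singleton = DuSingleton.CrawlerSingleton()
singleton.url_queue = ["http://www.dusystem.com/"]   # seed the BFS queue
singleton.visited_url = set()
singleton.image_downloaded = set()

DuSingleton.navigate_site(5)          # collect up to 5 same-site pages

os.makedirs('images', exist_ok=True)  # download target for the images

threads = [DuSingleton.ParallelDownloader(i, "Thread-%d" % i, i) for i in (1, 2)]
for t in threads:
    t.start()
for t in threads:
    t.join()                          # wait until all images are saved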
From: https://www.cnblogs.com/geovindu/p/16811639.html