
A script I wrote for my girlfriend that fetches conference papers via a search engine

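The script below takes a list of paper titles (key_list), runs a Bing search for each title in a pool of worker processes, keeps the first result link that looks like a PDF, and then downloads each PDF. The title-to-URL mapping goes into result.txt and the PDFs land in a result/ directory.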

import bs4, requests, os
from multiprocessing import Manager, Pool

# red: error messages
def R(message):
    return "\033[1;91m{}\033[0;m".format(message)
# green: success messages
def G(message):
    return "\033[1;92m{}\033[0;m".format(message)
# blue: informational messages
def B(message):
    return "\033[1;94m{}\033[0;m".format(message)

# shared dict (paper title -> PDF URL) used to pass results from the pool
# workers back to the parent; it is created at import time, so this relies on
# the fork start method; under spawn (the Windows default) each worker would
# re-import the module and get its own, separate manager
url_dict = Manager().dict()
key_list = ["On the TOCTOU Problem in Remote Attestation", "Search-based Approaches for Local Black-Box Code Deobfuscation: Understand, Improve and Mitigate", "Exorcising Spectres with Secure Compilers"]
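# key_list above is hard-coded; for a longer reading list you could load the
# titles from a file instead (hypothetical titles.txt, one title per line):
#   with open('titles.txt', encoding='utf-8') as fp:
#       key_list = [line.strip() for line in fp if line.strip()]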
fakeua = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36"}
search_depth = 10 # check the first 10 search result links
thread_num = 8 # number of worker processes (a Pool spawns processes, not threads)

def search_page(search_list):
	print(B('[*] Starting page search now......'))
	for keywords in search_list:
		# run a Bing search for the paper title; passing params lets
		# requests URL-encode the spaces and punctuation in the title
		searchPage = requests.get('https://cn.bing.com/search', params={'q': keywords}, headers=fakeua)
		searchPage.raise_for_status()    # raise an exception if the request failed

		# parse the result page; Bing marks result links with the sh_favicon class
		searchSoup = bs4.BeautifulSoup(searchPage.text, features="html.parser")
		elements = searchSoup.select('.sh_favicon')

		# check the first search_depth result links for a direct PDF URL
		get_url = False
		for i in range(min(search_depth, len(elements))):
			href = elements[i].get('href')
			if href and "pdf" in href:
				url_dict[keywords] = href
				get_url = True
				break

		if get_url:
			print(G('[+] Got download URL "%s" for paper "%s"' % (href, keywords)))
		else:
			print(R('[-] No download URL found for paper "%s"' % (keywords)))

def download_page(search_list):
	print(B('[*] Starting page download now......'))
	for key in search_list:
		if key not in url_dict:
			continue  # no URL was found for this paper, move on to the next one
		url = url_dict[key]
		try:
			key = filter_key(key)
			data = requests.get(url, headers=fakeua, stream=True, timeout=30)
			data.raise_for_status()
			result_dir = os.path.join(os.getcwd(), 'result')
			if not os.path.exists(result_dir):
				os.mkdir(result_dir)
			page_path = os.path.join(result_dir, '%s.pdf' % key)
			# stream the response body to disk chunk by chunk
			with open(page_path, 'wb') as fp:
				for chunk in data.iter_content(chunk_size=8192):
					fp.write(chunk)
			print(G('[+] Successfully downloaded "%s.pdf"' % (key)))
		except Exception:
			print(R('[-] Failed to download "%s.pdf"' % (key)))

# strip characters that are not allowed in file names
def filter_key(key):
	bad_chars = ['/', '\\', ':', '*', '?', '"', '<', '>', '|']
	return ''.join(char for char in key if char not in bad_chars)
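# e.g. filter_key('Spectre: "A/B" <test>?') returns 'Spectre AB test'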

def gen_result_txt():
	path = os.path.join(os.getcwd(), 'result.txt')
	with open(path, 'w') as fp:
		for key, value in url_dict.items():
			line = key + ' ==> ' + value + '\n'
			fp.write(line)
	print(G('[+] Successfully generated result text in: %s' % (path)))
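# result.txt ends up with one line per resolved paper, for example
# (URL hypothetical):
#   On the TOCTOU Problem in Remote Attestation ==> https://example.org/toctou.pdf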

if __name__ == '__main__':
	# split key_list into one chunk per worker process
	thread_num = min(thread_num, len(key_list))
	each_len = len(key_list) // thread_num
	search_list = []
	for i in range(thread_num):
		if i == thread_num - 1:
			search_list.append(key_list[i * each_len:])  # last chunk takes the remainder
		else:
			search_list.append(key_list[i * each_len: i * each_len + each_len])

	print(search_list)
	pool = Pool(processes=thread_num)
	pool.map(search_page, search_list)
	pool.close()  # no more work will be submitted to this pool
	pool.join()   # block until all worker processes exit
	gen_result_txt()
	print(url_dict)
	pool = Pool(processes=thread_num)
	pool.map(download_page, search_list)
	pool.close()
	pool.join()
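
To try it out, install the two dependencies and run the script with Python 3 (the file name fetch_papers.py is just an assumption, use whatever you saved it as):

	pip install requests beautifulsoup4
	python3 fetch_papers.py

It prints the per-worker chunks of key_list, resolves each title to a PDF link via Bing, writes the mapping into result.txt, and saves the PDFs under result/.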

From: https://www.cnblogs.com/z5onk0/p/16751693.html
