python 批量爬取邮箱地址
#coding: utf-8
import requests
import bs4 #解析网页
import lxml
import re
# Request headers sent with every page download: a desktop Chrome
# User-Agent string.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/111.0.0.0 Safari/537.36"
    ),
}

# Proxy mapping handed to requests via ``proxies=``.
# NOTE(review): only an 'http' entry is defined while the URLs fetched in this
# file are https -- confirm whether an 'https' entry is also intended.
proxyip = {"http": "121.13.252.61:41564"}
# Topic page downloaded when fetch_url() is called without arguments.
_DEFAULT_TOPIC_URL = 'https://www.douban.com/group/topic/165453665/?start=100&_i=9637470y8YseOC'


def fetch_url(url=_DEFAULT_TOPIC_URL, timeout=10):
    """Return the pagination links found on a topic page.

    Downloads ``url`` and collects the ``href`` of every ``<a>`` tag inside
    each ``<div class="paginator">``, in document order and without
    duplicates.

    NOTE(review): the original comment says the result excludes the page
    being fetched; that depends on the site rendering the current page
    without a link -- verify against the live markup.

    Args:
        url: Page to download. Defaults to the topic page this script targets.
        timeout: Seconds passed to ``requests.get`` so that a stalled
            connection cannot block forever.

    Returns:
        A list of href strings; empty when no paginator link is present.
    """
    page_obj = requests.get(url, headers=headers, proxies=proxyip, timeout=timeout)
    bs4_obj = bs4.BeautifulSoup(page_obj.text, "lxml")  # parse with lxml

    url_list = []
    seen = set()
    for paginator in bs4_obj.find_all(name="div", attrs={"class": "paginator"}):
        # find_all() rather than find(): find() yields only the first link,
        # so every page after the first one was silently dropped.
        for link in paginator.find_all("a"):
            # e.g. <a href="https://www.douban.com/group/topic/165453665/?start=0">1</a>
            href = link.attrs.get("href")
            # Skip anchors without an href and links already collected
            # (the same target can be linked more than once).
            if href and href not in seen:
                seen.add(href)
                url_list.append(href)
    return url_list
# E-mail pattern, compiled once. re.A restricts \w to ASCII so that CJK text
# glued to an address (e.g. "1489433531@qq.com谢谢楼主") is not captured.
# The dot before each domain label is escaped, and dotted local parts /
# multi-label domains (a.b@mail.example.com) are matched in full.
_MAIL_PATTERN = re.compile(r"\w[\w.+-]*@[\w-]+(?:\.[\w-]+)+", flags=re.A)


def fetch_mail_address(url, timeout=10):
    """Collect the e-mail addresses posted in the replies of one page.

    Downloads ``url``, looks at every ``<div class="reply-doc content">`` and
    takes the first e-mail address found in its ``<p class="reply-content">``
    text, paired with the text of its ``<span class="pubtime">``.

    Args:
        url: Page to download.
        timeout: Seconds passed to ``requests.get`` so that a stalled
            connection cannot block forever.

    Returns:
        A list of ``[address, pubtime_text]`` lists, one per reply that
        contains an address. ``pubtime_text`` is ``""`` when the reply has no
        pubtime element.
    """
    page_obj = requests.get(url, headers=headers, proxies=proxyip, timeout=timeout)
    bs4_obj = bs4.BeautifulSoup(page_obj.text, "lxml")  # parse with lxml
    replies = bs4_obj.find_all(name="div", attrs={"class": "reply-doc content"})

    mail_list = []
    for reply in replies:
        body = reply.find("p", attrs={"class": "reply-content"})
        if body is None:
            # Reply block without a text paragraph: nothing to search.
            continue
        match = _MAIL_PATTERN.search(body.text)
        if match is None:
            continue
        pubtime = reply.find("span", attrs={"class": "pubtime"})
        # Keep the address even when the timestamp element is missing.
        pubtime_text = pubtime.text if pubtime is not None else ""
        mail_list.append([match.group(), pubtime_text])
    return mail_list
if __name__ == "__main__":
    # Scrape every page returned by fetch_url(), then the starting page
    # itself, and print all collected addresses followed by their count.
    current_page = 'https://www.douban.com/group/topic/165453665/?start=100&_i=9637470y8YseOC'

    # One result list per page; each entry is [address, pubtime_text].
    per_page_results = [fetch_mail_address(page) for page in fetch_url()]
    per_page_results.append(fetch_mail_address(current_page))

    print('----------------------------------------------')

    # Flatten to the bare addresses, dropping the timestamps.
    addresses = []
    for page_entries in per_page_results:
        addresses.extend(entry[0] for entry in page_entries)

    print(addresses)
    print(len(addresses))
标签:comment,python,list,else,爬取,url,邮箱,print,mail
From: https://www.cnblogs.com/lixinliang/p/17252936.html