import requests
import re
url = 'xxx/index2.htm'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.56'
}
resp = requests.get(url=url, headers=headers)
resp.encoding = 'gb2312'  # set the charset; open the page source, scroll to the top and you will see charset=gb2312"> -- just set it to match
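# (alternative: resp.encoding = resp.apparent_encoding lets requests guess the charset instead of hard-coding it)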
main_page_text = resp.text
# match the URLs of the latest movies on the main page
obj = re.compile(r"最新电影下载</a>]<a href='(?P<link>.*?)'>", re.S)
# match the title and the magnet link; re.S lets '.' match newlines too
obj1 = re.compile(r"译 名 (?P<name>.*?)[/<]"
                  r".*?<blockquote>(?P<magnet>.*?)</blockquote>", re.S)
# write the results to a file
f = open('data.txt', mode='w', encoding='utf-8')
# collect the matched links into lst
href = obj.finditer(main_page_text)
lst = []
for it in href:
    lst.append(it.group('link'))
# request each detail page and pull out the data
for l in lst:
    # join the base URL with the link and request the detail page of each latest movie
    under_url = 'https://dy.dytt8.net/' + l
    under = requests.get(url=under_url, headers=headers)
    under.encoding = 'gb2312'
    under_page_text = under.text
    # search() stops at the first match; .group() then returns the string
    # name = obj1.search(under_page_text).group('name').strip()
    # magnet = obj1.search(under_page_text).group('magnet')
    # without a try block this raises, on pages where nothing matches,
    # AttributeError: 'NoneType' object has no attribute 'group'
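    # a rough sketch of that search() + try/except variant (not run here; finditer is used below instead):
    # try:
    #     m = obj1.search(under_page_text)
    #     f.write(f"{m.group('name').strip()},{m.group('magnet')}\n")
    # except AttributeError:
    #     pass  # page layout differs and nothing matched -- skip it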
    # run obj1 over the detail page, collect the useful fields into a dict, then write the values to the file
    result = obj1.finditer(under_page_text)
    for it in result:
        dic = it.groupdict()
        dic['name'] = dic['name'].strip()  # strip leading/trailing whitespace
        f.write(f"{dic['name']},{dic['magnet']}")
        f.write('\n')
    under.close()
f.close()
resp.close()
print("完成!!!")