import requests,json
from bs4 import BeautifulSoup
def drop_last_line(text):
    """Return *text* without its final line.

    Everything from the last newline onward is cut off. Text that contains
    no newline is returned unchanged.

    The previous implementation used ``text.replace(text[text.rfind('\\n'):], '')``.
    With no newline present ``rfind`` yields -1, so that expression deleted
    every occurrence of the last character; with a newline present it also
    deleted earlier occurrences of the trailing substring. Slicing removes
    only the tail.
    """
    cut = text.rfind('\n')
    if cut == -1:
        return text
    return text[:cut]


def parse_movie_page(html):
    """Extract movie details from the HTML of a movie page.

    Args:
        html: Page markup as a string.

    Returns:
        A dict. ``link`` is always present ('' when no link is found).
        ``title``, ``cover_url``, ``date`` and ``classify`` are set when the
        header section exists; ``desc`` and ``pic_list`` when the info
        section exists; ``actor`` when the cast section exists.

    Raises:
        IndexError, KeyError: if a header or info section exists but lacks
            one of the elements/attributes read from it.
    """
    soup = BeautifulSoup(html, 'html.parser')
    result = {}

    for header in soup.select("div#single div.sheader"):
        result['title'] = header.select("div.data h1")[0].getText()
        result['cover_url'] = header.select("div.poster img")[0]['src']
        result['date'] = header.select("div.extra span.date")[0].getText()
        result['classify'] = [
            tag.getText() for tag in header.select("div.sgeneros a")
        ]

    for info in soup.select("div#single div#info"):
        desc = info.select("div.wp-content")[0].getText().strip()
        # NOTE(review): the last line is presumably a footer/credit that
        # should not be part of the description - confirm against the page.
        result['desc'] = drop_last_line(desc)
        result['pic_list'] = [
            img['src'].strip() for img in info.select("div#dt_galery img")
        ]

    for cast in soup.select("div#single div#cast"):
        people = cast.select("div.persons div.person div.data div.name a")
        # The first entry is skipped on purpose (kept from the original code);
        # presumably it is the director rather than an actor - confirm.
        result['actor'] = [person.getText() for person in people[1:]]

    # Default to '' so the key exists even when the page has no link section.
    result['link'] = ''
    try:
        for box in soup.select("div#single div.box_links"):
            result['link'] = box.select("div#videos table tr td a")[0]['href']
    except (IndexError, KeyError):
        # No anchor in the table, or the anchor has no href.
        result['link'] = ''

    return result


def main():
    """Download the movie page, parse it and write the data to result.json."""
    url = "https://yyets.com/movies/201565/"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
    }
    # Without a timeout requests waits indefinitely on an unresponsive server.
    response = requests.get(url, headers=headers, timeout=10)
    if response.status_code != 200:
        print(f"爬取失败, HTTP状态码: {response.status_code}")
        return

    result = parse_movie_page(response.text)
    with open('result.json', 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=4)
    print("爬取成功!")


if __name__ == '__main__':
    main()
# Tags: Python, BeautifulSoup, getText, scraping, result, div, headItem, select, desc
# Source: https://www.cnblogs.com/skyvip/p/18227694