A web crawler is, in essence, a program that automatically collects information from the web.
Scraping the Douban Top250 list means writing a crawler that mimics what a person would do: open the site, locate the relevant elements, and extract the data.
The first version below uses the lxml library to parse the HTML and extracts each field with an XPath selector.
import requests           # for sending HTTP requests
from lxml import etree    # for parsing the HTML document (lxml)
import csv                # for writing the CSV file
import time               # for adding a delay between requests
import codecs             # for writing the file with a BOM so Excel reads the UTF-8 correctly

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'
}  # mimic a browser so Douban does not identify the requests as a crawler and block them

def get_first_text(lst):
    # XPath returns a list; take the first match, or '' if there was none
    try:
        return lst[0].strip()
    except IndexError:
        return ""

def scrape_douban_movie(url, headers):
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        return response.text
    except requests.RequestException as e:
        print(f"Error fetching URL: {url} - {e}")
        return None

def main():
    # the 250 movies are spread over 10 pages, 25 per page
    urls = [f"https://movie.douban.com/top250?start={i * 25}&filter=" for i in range(10)]
    count = 1  # running index for the printed output
    movie_data = []
    for url in urls:
        html = scrape_douban_movie(url, headers)
        if html is None:
            continue
        parsed_html = etree.HTML(html)  # turn the returned text into a parseable HTML tree
        movie_list = parsed_html.xpath('//*[@id="content"]/div/div[1]/ol/li')
        for movie in movie_list:
            title = get_first_text(movie.xpath('./div/div[2]/div[1]/a/span[1]/text()'))         # movie title
            director_actor = get_first_text(movie.xpath('./div/div[2]/div[2]/p[1]/text()'))     # director / actors
            score = get_first_text(movie.xpath('./div/div[2]/div[2]/div/span[2]/text()'))       # rating
            introduction = get_first_text(movie.xpath('./div/div[2]/div[2]/p[2]/span/text()'))  # one-line quote
            print(count, title, director_actor, score, introduction)
            movie_data.append([title, director_actor, score, introduction])
            count += 1
        time.sleep(1)  # be polite: pause between page requests
    with codecs.open('movies.csv', 'w', 'utf-8-sig') as file:
        writer = csv.writer(file)
        writer.writerow(["Title", "Director/Actor", "Score", "Introduction"])
        writer.writerows(movie_data)

if __name__ == '__main__':
    main()
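When an XPath expression silently matches nothing, it is easier to debug in isolation than inside the full loop. A minimal sketch for testing the list selector on the first page (same URL and expressions as above; the expected count of 25 assumes the page layout has not changed):

import requests
from lxml import etree

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'}
html = etree.HTML(requests.get('https://movie.douban.com/top250', headers=headers).text)
items = html.xpath('//*[@id="content"]/div/div[1]/ol/li')
print(len(items))  # should print 25 if the selector still matches the page
print(items[0].xpath('./div/div[2]/div[1]/a/span[1]/text()'))  # the first movie's title span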
Below is a version that parses the page with bs4 (BeautifulSoup) instead:
import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

def get_movie_info(soup):
    movies = []
    # under the ol.grid_view tag, each li holds one movie's information
    movie_list = soup.find('ol', class_='grid_view').find_all('li')
    for movie_li in movie_list:
        title = movie_li.find('span', class_='title').get_text()
        info = movie_li.find('p', class_='').get_text().strip()
        # director and lead actors are separated by three non-breaking spaces
        director_actor = info.split('\xa0\xa0\xa0')[0]
        other_info = info.split('\xa0\xa0\xa0')[1:]  # remaining fields (year / country / genre); unused below
        rating_info = movie_li.find('span', class_='rating_num').get_text()
        introduction = movie_li.find('span', class_='inq').get_text() if movie_li.find('span', class_='inq') else ''
        # append the extracted fields as one dict per movie
        movies.append({
            'title': title,
            'director_actor': director_actor,
            'rating': rating_info,
            'introduction': introduction
        })
    return movies

def main():
    urls = ["https://movie.douban.com/top250?start={}".format(str(i * 25)) for i in range(10)]
    with open('movies.txt', 'w', encoding='utf-8') as file:
        for url in urls:
            response = requests.get(url, headers=headers)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            movies = get_movie_info(soup)
            for movie in movies:
                file.write(f"Title: {movie['title']}\n")
                file.write(f"Director/Actor: {movie['director_actor']}\n")
                file.write(f"Rating: {movie['rating']}\n")
                file.write(f"Introduction: {movie['introduction']}\n")
                file.write('\n')

if __name__ == '__main__':
    main()
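The split on '\xa0\xa0\xa0' works because Douban separates the director field from the lead-actor field with three non-breaking spaces. A quick illustration on a made-up info string (the names are sample values, not fetched data):

info = '导演: 弗兰克·德拉邦特 Frank Darabont\xa0\xa0\xa0主演: 蒂姆·罗宾斯 Tim Robbins'
parts = info.split('\xa0\xa0\xa0')
print(parts[0])  # 导演: 弗兰克·德拉邦特 Frank Darabont
print(parts[1])  # 主演: 蒂姆·罗宾斯 Tim Robbins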
Below is a version that parses the HTML with regular expressions. Two sites were a big help here: curlconverter.com converts a copied cURL command into ready-made requests code (which is where the cookies and headers below come from), and regex101.com is handy for testing the patterns.
https://curlconverter.com/
https://regex101.com/
import requests
import re
import pandas as pd

listss = []
for i in range(0, 250, 25):
    # the cookies and headers below were generated by pasting a copied cURL command
    # into curlconverter.com; they are tied to one particular browsing session
    cookies = {
        'bid': 'T4QMlOS21eo',
        '_pk_id.100001.4cf6': '5e25c0c864dca561.1706688370.',
        '__yadk_uid': '8F9Q5P1dXq6TkE7QP5FEJIEhvxsTIfd4',
        '_pk_ref.100001.4cf6': '%5B%22%22%2C%22%22%2C1707383866%2C%22https%3A%2F%2Fcn.bing.com%2F%22%5D',
        '_pk_ses.100001.4cf6': '1',
        'ap_v': '0,6.0',
        # curlconverter emitted each __utm* cookie twice (two analytics IDs); a Python
        # dict keeps only the last value per key, so only one set is listed here
        '__utma': '223695111.96802592.1706688370.1707269654.1707383866.6',
        '__utmb': '223695111.0.10.1707383866',
        '__utmc': '223695111',
        '__utmz': '223695111.1707383866.6.6.utmcsr=cn.bing.com|utmccn=(referral)|utmcmd=referral|utmcct=/',
    }
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'Connection': 'keep-alive',
        'Referer': 'https://movie.douban.com/top250?start=25&filter=',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0',
        'sec-ch-ua': '"Not A(Brand";v="99", "Microsoft Edge";v="121", "Chromium";v="121"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
    }
    params = {
        'start': i,
        'filter': '',
    }
    response = requests.get('https://movie.douban.com/top250', params=params, cookies=cookies, headers=headers).text
    titles = re.findall('alt="(.*?)"', response)[:25]  # movie titles (first 25 alt attributes)
    directors = re.findall('<p class="pl">(.*?) ', response)  # directors
    actors = re.findall('<p class="pl">(.*?) / ', response)  # actors
    scores = re.findall('<span class="rating_num" property="v:average">(.*?)</span>', response)  # ratings
    quotes = re.findall('<span class="inq">(.*?)</span>', response)  # one-line quotes
    for j in range(len(titles)):
        data = {
            'title': titles[j],
            'director': directors[j].strip(),
            'actor': actors[j].strip(),
            'score': scores[j],
            # not every movie has an inq quote, so this list can be shorter than 25;
            # guard the index (note that later quotes may then be misaligned)
            'quote': quotes[j] if j < len(quotes) else ''
        }
        listss.append(data)
        print(data)

df = pd.DataFrame(listss)
print(df)
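Unlike the first two versions, this one only prints the DataFrame. If you also want the result on disk, pandas can write the CSV directly (the filename here is just an example; utf-8-sig keeps Excel happy with Chinese text):

df.to_csv('movies_regex.csv', index=False, encoding='utf-8-sig')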
From: https://www.cnblogs.com/0214jx/p/18010143