
Web Scraping: Douban Top 250 Movies


A web scraper is essentially a program that collects information from the web automatically.
Scraping the Douban Top 250 means writing such a program to mimic what a person would do: open the site, locate the information, and then extract the data.
The code below uses the lxml library to parse the HTML and XPath selectors to pull out the fields.
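
As a quick illustration of the parse-then-extract idea, here is a minimal sketch on a made-up snippet (not Douban's real markup): etree.HTML turns a string into an element tree, and xpath pulls out the text nodes.

from lxml import etree

snippet = '<ol><li><span class="title">Example Movie</span></li></ol>'
tree = etree.HTML(snippet)                            # build an element tree from the HTML string
print(tree.xpath('//span[@class="title"]/text()'))    # -> ['Example Movie']

The full scraper: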

import requests  # for making the HTTP requests
from lxml import etree  # for parsing the HTML document (lxml)
import csv  # for writing the CSV file
import time  # for adding a delay between requests
import codecs  # for opening the CSV file with a UTF-8 BOM so Excel reads it correctly
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'
}  # pretend to be a normal browser so Douban does not flag the requests as a bot and block them
def get_first_text(lst):
    # return the first matched text node (stripped), or "" if the XPath matched nothing
    try:
        return lst[0].strip()
    except IndexError:
        return ""
  
def scrape_douban_movie(url, headers):  
    try:  
        response = requests.get(url, headers=headers)  
        response.raise_for_status()  
        return response.text  
    except requests.RequestException as e:  
        print(f"Error fetching URL: {url} - {e}")  
        return None  
  
def main():  
    urls = [f"https://movie.douban.com/top250?start={i * 25}&filter=" for i in range(10)]  
    count = 1  # running counter, used only for the console output
    movie_data = []  
    for url in urls:  
        html = scrape_douban_movie(url, headers)  
        if html is None:  
            continue  
  
        parsed_html = etree.HTML(html)      # turn the returned text into a parseable HTML tree
        movie_list = parsed_html.xpath('//*[@id="content"]/div/div[1]/ol/li')  
  
        for movie in movie_list:  
            title = get_first_text(movie.xpath('./div/div[2]/div[1]/a/span[1]/text()'))       # movie title
            director_actor = get_first_text(movie.xpath('./div/div[2]/div[2]/p[1]/text()'))   # director / cast
            score = get_first_text(movie.xpath('./div/div[2]/div[2]/div/span[2]/text()'))     # rating
            introduction = get_first_text(movie.xpath('./div/div[2]/div[2]/p[2]/span/text()'))# one-line quote
  
            print(count, title, director_actor, score, introduction)  
            movie_data.append([title, director_actor, score, introduction])  
  
            count += 1

        time.sleep(1)  # be polite: pause between page requests rather than after every movie
    with codecs.open('movies.csv', 'w', 'utf-8-sig') as file:  
        writer = csv.writer(file)  
        writer.writerow(["Title", "Director/Actor", "Score", "Introduction"])  
        writer.writerows(movie_data)  
  
if __name__ == '__main__':  
    main()
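
One caveat: the position-based XPath expressions (./div/div[2]/...) break as soon as Douban rearranges its layout. A hedged variant of the lookups inside the movie loop, reusing the class names that the bs4 and regex versions below also rely on (title, rating_num, inq), could look like this:

            title = get_first_text(movie.xpath('.//span[@class="title"]/text()'))        # movie title
            score = get_first_text(movie.xpath('.//span[@class="rating_num"]/text()'))   # rating
            introduction = get_first_text(movie.xpath('.//span[@class="inq"]/text()'))   # one-line quote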

Below is the same scrape using BeautifulSoup (bs4) to parse the pages.

import requests  
from bs4 import BeautifulSoup  
  
headers = {  
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'  
}  
  
def get_movie_info(soup):  
    movies = []  
    movie_list = soup.find('ol', class_='grid_view').find_all('li')  # every li under the ranking ol holds one movie

    for movie_li in movie_list:  
        title = movie_li.find('span', class_='title').get_text()  
        info = movie_li.find('p', class_='').get_text().strip()  # info paragraph: director / cast, then year / country / genre
        director_actor = info.split('\xa0\xa0\xa0')[0]   # text before the first run of non-breaking spaces
        other_info = info.split('\xa0\xa0\xa0')[1:]      # remaining parts (year / country / genre), not used below
        rating_info = movie_li.find('span', class_='rating_num').get_text()  
        introduction = movie_li.find('span', class_='inq').get_text() if movie_li.find('span', class_='inq') else '' 
        # append the extracted fields for this movie to the movies list as a dict
        movies.append({  
            'title': title,  
            'director_actor': director_actor,  
            'rating': rating_info,  
            'introduction': introduction  
        })  
    return movies  

def main():
    urls = ["https://movie.douban.com/top250?start={}".format(str(i * 25)) for i in range(10)]
    
    with open('movies.txt', 'w', encoding='utf-8') as file:
        for url in urls:
            response = requests.get(url, headers=headers)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            movies = get_movie_info(soup)
            
            for movie in movies:
                file.write(f"Title: {movie['title']}\n")
                file.write(f"Director/Actor: {movie['director_actor']}\n")
                file.write(f"Rating: {movie['rating']}\n")
                file.write(f"Introduction: {movie['introduction']}\n")
                file.write('\n')

if __name__ == '__main__':
    main()
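
For reference, a minimal sketch (on a made-up snippet, not Douban's real markup) of what the find calls above return:

from bs4 import BeautifulSoup

snippet = '<li><span class="title">Example Movie</span><span class="rating_num">9.0</span></li>'
soup = BeautifulSoup(snippet, 'html.parser')
print(soup.find('span', class_='title').get_text())       # Example Movie
print(soup.find('span', class_='rating_num').get_text())  # 9.0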

Finally, here is a version that parses the HTML with regular expressions. Two sites helped here:
https://curlconverter.com/ (turns a request copied from the browser's dev tools into Python requests code, which is where the cookies and headers below come from)
https://regex101.com/ (for testing the regular expressions)
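
As a quick illustration of the non-greedy capture pattern the code relies on, a minimal sketch on a made-up string:

import re

html = '<span class="rating_num">9.7</span><span class="rating_num">9.6</span>'
print(re.findall('<span class="rating_num">(.*?)</span>', html))  # ['9.7', '9.6']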

import requests
import re
import pandas as pd

listss = []  # accumulates one dict per movie
for i in range(0, 250, 25):
    cookies = {  # session cookies copied via curlconverter (they expire; the duplicated __utm* keys below collapse, only the later values are sent)
        'bid': 'T4QMlOS21eo',
        '_pk_id.100001.4cf6': '5e25c0c864dca561.1706688370.',
        '__yadk_uid': '8F9Q5P1dXq6TkE7QP5FEJIEhvxsTIfd4',
        '_pk_ref.100001.4cf6': '%5B%22%22%2C%22%22%2C1707383866%2C%22https%3A%2F%2Fcn.bing.com%2F%22%5D',
        '_pk_ses.100001.4cf6': '1',
        'ap_v': '0,6.0',
        '__utma': '30149280.175749301.1706688370.1707269654.1707383866.6',
        '__utmb': '30149280.0.10.1707383866',
        '__utmc': '30149280',
        '__utmz': '30149280.1707383866.6.6.utmcsr=cn.bing.com|utmccn=(referral)|utmcmd=referral|utmcct=/',
        '__utma': '223695111.96802592.1706688370.1707269654.1707383866.6',
        '__utmb': '223695111.0.10.1707383866',
        '__utmc': '223695111',
        '__utmz': '223695111.1707383866.6.6.utmcsr=cn.bing.com|utmccn=(referral)|utmcmd=referral|utmcct=/',
    }

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'Connection': 'keep-alive',
        'Referer': 'https://movie.douban.com/top250?start=25&filter=',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0',
        'sec-ch-ua': '"Not A(Brand";v="99", "Microsoft Edge";v="121", "Chromium";v="121"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
    }

    params = {
        'start': i,
        'filter': '',
    }

    response = requests.get('https://movie.douban.com/top250', params=params, cookies=cookies, headers=headers).text
    titles = re.findall('alt="(.*?)"', response)[:25]  # movie titles (the poster <img alt="..."> attributes; 25 per page)
    directors = re.findall('<p class="pl">(.*?)&nbsp;', response)  # director
    actors = re.findall('<p class="pl">(.*?)&nbsp;/&nbsp;', response)  # cast
    scores = re.findall('<span class="rating_num" property="v:average">(.*?)</span>', response)  # rating
    quotes = re.findall('<span class="inq">(.*?)</span>', response)  # one-line quote

    for j in range(len(titles)):
        data = {
            'title': titles[j],
            'director': directors[j].strip() if j < len(directors) else '',
            'actor': actors[j].strip() if j < len(actors) else '',
            'score': scores[j],
            # not every entry has a quote, so guard against a shorter list
            # (missing quotes also shift the alignment of the remaining ones)
            'quote': quotes[j] if j < len(quotes) else ''
        }
        listss.append(data)
        print(data)
df = pd.DataFrame(listss)
print(df)
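
To keep the result, a one-line follow-up (the filename movies_re.csv is just an assumption for this sketch) writes the DataFrame to disk:

df.to_csv('movies_re.csv', index=False, encoding='utf-8-sig')  # utf-8-sig so Excel displays the Chinese text correctly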

From: https://www.cnblogs.com/0214jx/p/18010143

相关文章

  • 爬取 【豆瓣电影top250数据】 python代码
    importrequestsimportopenpyxlimportreimporttimeimportpymysqlclassDoubanSpider:def__init__(self):self.url_temp="https://movie.douban.com/top250?start={}"self.headers={'User-Agent':'Mozilla/5.0......
  • python爬虫爬取豆瓣电影top250并写入Excel中
    importrequestsimportreimportopenpyxl#创建工作表wb=openpyxl.Workbook()ws=wb.active#调整列距forletterin['B','C']:ws.column_dimensions[letter].width=66#发送网络请求headers={"User-Agent":'Mozilla/5.0(WindowsNT10.0;Win64;x64)......
  • 第 1 章 Python 爬虫概念与 Web 基础
    第1章Python爬虫概念与Web基础1.1爬虫概念1.1.1什么是爬虫爬虫,即网络爬虫,又称网络蜘蛛(WebSpider),是一种按照一定规则,用来自动浏览或抓取万维网数据的程序。可以把爬虫程序看成一个机器人,它的功能就是模拟人的行为去访问各种站点,或者带回一些与站点相关的信息。它可以2......
  • 爬虫逆向案列 ---《某某省资源平台》
    网站接口:aHR0cHM6Ly95Z3AuZ2R6d2Z3Lmdvdi5jbi8jLzQ0L2p5Z2c=加密接口:aHR0cHM6Ly95Z3AuZ2R6d2Z3Lmdvdi5jbi9nZ3p5LXBvcnRhbC9zZWFyY2gvdjIvaXRlbXM=三个加密参数:X-Dgi-Req-Nonce、X-Dgi-Req-Timestamp、X-Dgi-Req-Signature观察请求头:从上图我们不难看出>X-Dgi-Req-Timestam......
  • BeautifulSoup爬虫库应用——Python 页面解析
    爬虫技术作为信息搜集的重要手段,在大数据时代发挥着至关重要的作用。通过网络爬虫,可以高效地从各种在线源头获取大规模、多样化的数据,为大数据分析和应用提供了必要的原始材料。首先,爬虫使得大数据的采集更为全面和及时。网络上存在着庞大的信息资源,包括社交媒体、新闻网站、电子......
  • 新手如何快速学习爬虫逆向?-->>爬虫之js逆向百例
    《个人练习》各位爬虫逆友如有需要请及时留言或者加vx:wzwzwz0613该案例只对学习js逆向的爬虫逆友提供技术交流,请勿进行商业交易,谢谢!技术交流群v+:......
  • 爬虫逆向案列---《某采购网ck反爬》
    网站接口:aHR0cDovL3d3dy55bmdwLmNvbS9wYWdlL3Byb2N1cmVtZW50L3Byb2N1cmVtZW50TGlzdC5odG1s首先分析ck是怎么生成的?访问首页获取通过js代码逆向获取其它页面返回访问接口,返回一段js代码,分析js代码获取ck所以,ck一般是在网页端的文档中的html可以观察到。刷新网址我们可以......
  • python基础爬虫
    python基础爬虫基于beautifulSoup的爬虫:一:先导包:importrequestsfrombs4importBeautifulSoup二:伪装:headers={'User-Agent':'Mozilla/5.0(WindowsNT10.0;Win64;x64;rv:122.0)Gecko/20100101Firefox/122.0'}user-agent在浏览器按f12->网络->消息头......
  • 爬虫-异步抓取一部小说
    一、利用requests请求同步和aiohttp异步,两个结合来获取小说里的内容1、先利用cookie和session来实现登录根据post请求,带入参数来建立会话,并获取session利用session来进行同步请求获取,每一章节的名称和链接地址 通过上面的图,发现在/html/body/div[5]/dl/dd范围内的a标签......
  • MetaGPT day05 MetaGPT 爬虫工程师智能体
    Metagpt爬虫智能体需求1.用ActionNode重写订阅智能体,实现自然语言爬取解析网站内容2.根据尝试实现思路1,即使用llm提取出需要的信息而不是写爬虫代码。3.目前,订阅智能体是通过RunSubscription运行的,即RunSubscription这个action,不仅创建了订阅智能体代码,并启动了Subscriptio......