I'm working on a web-scraping project that uses Selenium to collect football match data from a sports prediction website (I'll use example.com in place of Forebet). However, my script only retrieves data for 7 matches, even though many more are listed on the page. Here is the relevant part of my code:
import time
from bs4 import BeautifulSoup
import pandas as pd
import logging
import google_colab_selenium as gs
from selenium.webdriver.chrome.options import Options
from random import choice
# Configurations
request_interval = 2 # seconds
page_load_delay = 2 # seconds
base_url = "https://www.example.com/en/football-tips-and-predictions-for-today/predictions-1x2"
# User agents list (non-mobile)
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.1 Safari/605.1.15',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36'
]
# Configure logging
logging.basicConfig(level=logging.INFO)
# Setup Chrome options and driver
def setup_selenium():
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument("--window-size=1920,1080")
    options.add_argument("--disable-infobars")
    options.add_argument("--disable-popup-blocking")
    options.add_argument("--ignore-certificate-errors")
    options.add_argument("--incognito")
    options.add_argument(f'--user-agent={choice(user_agents)}')
    driver = gs.Chrome(options=options)
    return driver
def get_page_content(url, driver, request_interval=2, page_load_delay=2):
    driver.get(url)
    time.sleep(request_interval)
    html_content = driver.page_source
    time.sleep(page_load_delay)
    return html_content
def parse_match_details(soup, driver):
    matches = []
    match_links = [a['href'] for a in soup.select('.contentmiddle a[itemprop="url"]')]
    for link in match_links:
        match_url = f"https://www.example.com{link}"
        html_content = get_page_content(match_url, driver)
        match_soup = BeautifulSoup(html_content, 'lxml')
        if not match_soup:
            continue
        try:
            home_team_name = match_soup.select_one("[itemprop='homeTeam'] [itemprop='url'] span").text.strip()
            away_team_name = match_soup.select_one("[itemprop='awayTeam'] [itemprop='url'] span").text.strip()
            home_matches_played = match_soup.select_one(".os_played_games_main span[data-team='h']").text.strip()
            away_matches_played = match_soup.select_one(".os_played_games_main span[data-team='a']").text.strip()
            home_standing = match_soup.select_one("span.teamtableleft").text.strip()
            away_standing = match_soup.select_one("span.teamtableright").text.strip()
            match_time = match_soup.select_one("[itemprop='startDate'] div").text.strip()
            recent_matches = match_soup.select_one("div.os_goals_section3_container")
            home_and_away_goals = recent_matches

            def get_statistic(selector, data_team, data_stat):
                stat = home_and_away_goals.select_one(f"[data-team='{data_team}'][data-stat='{data_stat}'] span.__{selector}")
                return stat.text.strip() if stat else ""

            home_stats = {
                "Under 1.5": get_statistic("under", 'h', 'pc-ov1.5'),
                "Over 1.5": get_statistic("over", 'h', 'pc-ov1.5'),
                "Under 2.5": get_statistic("under", 'h', 'pc-ov2.5'),
                "Over 2.5": get_statistic("over", 'h', 'pc-ov2.5'),
                "Under 3.5": get_statistic("under", 'h', 'pc-ov3.5'),
                "Over 3.5": get_statistic("over", 'h', 'pc-ov3.5'),
                "Yes": get_statistic("under", 'h', 'bottom_chart'),
                "No": get_statistic("over", 'h', 'bottom_chart'),
            }
            away_stats = {
                "Under 1.5": get_statistic("under", 'a', 'pc-ov1.5'),
                "Over 1.5": get_statistic("over", 'a', 'pc-ov1.5'),
                "Under 2.5": get_statistic("under", 'a', 'pc-ov2.5'),
                "Over 2.5": get_statistic("over", 'a', 'pc-ov2.5'),
                "Under 3.5": get_statistic("under", 'a', 'pc-ov3.5'),
                "Over 3.5": get_statistic("over", 'a', 'pc-ov3.5'),
                "Yes": get_statistic("under", 'a', 'bottom_chart'),
                "No": get_statistic("over", 'a', 'bottom_chart'),
            }
            matches.append({
                "Match Link": match_url,
                "Home Team Name": home_team_name,
                "Away Team Name": away_team_name,
                "Home Matches Played": home_matches_played,
                "Home Standing": home_standing,
                "Away Matches Played": away_matches_played,
                "Away Standing": away_standing,
                "Match Time": match_time,
                **home_stats,
                **away_stats
            })
        except Exception as e:
            logging.error(f"Error parsing {match_url}: {e}")
    return matches
def scrape_all_pages(start_url):
    all_matches = []
    next_page_url = start_url
    driver = setup_selenium()
    try:
        while next_page_url:
            html_content = get_page_content(next_page_url, driver)
            if not html_content:
                break
            soup = BeautifulSoup(html_content, 'html.parser')
            matches = parse_match_details(soup, driver)
            all_matches.extend(matches)
            load_more_button = soup.select_one('#mrows span')
            if load_more_button:
                next_page_url = load_more_button.get('data-next-page-url')
                if next_page_url:
                    if not next_page_url.startswith('http'):
                        next_page_url = f"https://www.example.com{next_page_url}"
                    time.sleep(page_load_delay)
                else:
                    next_page_url = None
            else:
                next_page_url = None
    finally:
        driver.quit()
    return all_matches
def main():
    matches = scrape_all_pages(base_url)
    df = pd.DataFrame(matches)
    df.to_csv('football_matches.csv', index=False)
    print("Data saved to football_matches.csv")
if __name__ == "__main__":
    main()
What could be causing this limit, and how can I make sure the script scrapes data for every match available on the page?
The reason your code only scrapes data for 7 matches from Forebet is most likely that the site uses infinite scrolling or AJAX loading to bring in content dynamically. The page initially loads only a small batch of matches (7 in your case), and more matches are loaded as you scroll down.
Your current code only parses the initially loaded page source, so it only ever sees those first 7 matches.
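A quick way to confirm this is to count the match links before and after a single scroll. This is a minimal diagnostic sketch that reuses setup_selenium, base_url, time, and BeautifulSoup from your script and the same a[itemprop="url"] selector your scraper already uses; the exact counts depend on how many matches the site loads up front.

driver = setup_selenium()
driver.get(base_url)
time.sleep(2)

def count_links(html):
    # Same selector the scraper uses for match links
    return len(BeautifulSoup(html, 'html.parser').select('.contentmiddle a[itemprop="url"]'))

before = count_links(driver.page_source)
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(2)  # give the site time to append more rows
after = count_links(driver.page_source)
print(before, after)  # a larger second number confirms matches are loaded dynamically
driver.quit()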
To scrape all of the match data, you need to modify your code to handle the site's lazy-loading mechanism. You can do this in one of two ways:
1. Simulate scrolling to the bottom of the page
You can use Selenium to scroll the browser to the bottom of the page, which triggers the site to load more matches. A small piece of JavaScript executed through the driver does the scrolling:
# ... in your code ...
from selenium.webdriver.common.by import By  # needed for By.CSS_SELECTOR

def scroll_to_bottom(driver):
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

def scrape_all_pages(start_url):
    # ... your code ...
    try:
        while next_page_url:
            # ... your code ...
            # Scroll to the bottom before trying to load more matches
            scroll_to_bottom(driver)
            time.sleep(2)  # wait for the matches to load
            # Find the "Load more" button and click it
            # (find_elements returns an empty list instead of raising when the button is missing)
            load_more_buttons = driver.find_elements(By.CSS_SELECTOR, '#mrows span')
            if load_more_buttons:
                load_more_buttons[0].click()
                time.sleep(2)  # wait for the new matches to load
            else:
                next_page_url = None
    finally:
        # ... your code ...
This scrolls to the bottom before each attempt to load more matches, so that every match is present in the page before you parse it.
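If the site keeps appending matches as you scroll rather than waiting for a button click, you may need to scroll repeatedly until no new content arrives. Here is a sketch of that pattern, under the assumption that the page height grows each time a new batch of matches is appended:

def scroll_until_stable(driver, pause=2, max_rounds=20):
    # Keep scrolling until the page height stops growing (or max_rounds is reached),
    # which usually means no more matches are being appended.
    last_height = driver.execute_script("return document.body.scrollHeight")
    for _ in range(max_rounds):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)  # wait for the next batch of matches to load
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

You could call scroll_until_stable(driver) right after driver.get(...) and only then take driver.page_source, so that parse_match_details sees every match in a single pass.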
2. Send the AJAX requests to the server directly
Another approach is to analyze the AJAX requests the site makes, find the request URL used to load more matches, and send those requests directly with Python's requests library.
You'll need your browser's developer tools (the Network tab) to inspect the AJAX traffic. Find the request that loads more matches and copy its URL and any required parameters (such as the page number).
import requests

# ... in your code ...
def scrape_all_pages(start_url):
    # ... your code ...
    try:
        while next_page_url:
            # ... your code ...
            if next_page_url:
                response = requests.get(next_page_url)
                response.raise_for_status()  # check that the request succeeded
                # Parse the newly returned match data
                soup = BeautifulSoup(response.content, 'html.parser')
                matches = parse_match_details(soup, driver)
                all_matches.extend(matches)
                # Extract the URL of the next page (guard against a missing button)
                load_more_button = soup.select_one('#mrows span')
                next_page_url = load_more_button.get('data-next-page-url') if load_more_button else None
                if next_page_url and not next_page_url.startswith('http'):
                    next_page_url = f"https://www.example.com{next_page_url}"
            else:
                next_page_url = None
    finally:
        # ... your code ...
This approach fetches the match data directly without simulating browser scrolling, so it is faster.
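One practical caveat: a plain requests.get sends Python's default User-Agent, which some sites handle differently from a real browser. A small sketch (an assumption, not something Forebet is known to require) that reuses the user_agents list and BeautifulSoup from your script:

import requests
from random import choice

# Reuse the browser-like User-Agent strings from the script for the direct AJAX requests.
session = requests.Session()
session.headers.update({'User-Agent': choice(user_agents)})

response = session.get(next_page_url, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')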
Note that you'll need to adapt the code to Forebet's actual implementation. Use your browser's developer tools to inspect the page structure and the AJAX requests so you can find the correct selectors and parameters.
Tags: python, selenium-webdriver, web-scraping, beautifulsoup. From: 78786357