I'm working on a web-scraping project that uses Selenium to collect football match data from a sports prediction website (I'll use example.com in place of Forebet). However, my script only retrieves data for 7 matches, even though many more are listed on the page. Here is the relevant part of my code:
import time
from bs4 import BeautifulSoup
import pandas as pd
import logging
import google_colab_selenium as gs
from selenium.webdriver.chrome.options import Options
from random import choice
# Configurations
request_interval = 2 # seconds
page_load_delay = 2 # seconds
base_url = "https://www.example.com/en/football-tips-and-predictions-for-today/predictions-1x2"
# User agents list (non-mobile)
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.1 Safari/605.1.15',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36'
]
# Configure logging
logging.basicConfig(level=logging.INFO)
# Setup Chrome options and driver
def setup_selenium():
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument("--window-size=1920,1080")
    options.add_argument("--disable-infobars")
    options.add_argument("--disable-popup-blocking")
    options.add_argument("--ignore-certificate-errors")
    options.add_argument("--incognito")
    options.add_argument(f'--user-agent={choice(user_agents)}')
    driver = gs.Chrome(options=options)
    return driver
def get_page_content(url, driver, request_interval=2, page_load_delay=2):
    driver.get(url)
    time.sleep(request_interval)
    html_content = driver.page_source
    time.sleep(page_load_delay)
    return html_content
def parse_match_details(soup, driver):
    matches = []
    match_links = [a['href'] for a in soup.select('.contentmiddle a[itemprop="url"]')]
    for link in match_links:
        match_url = f"https://www.example.com{link}"
        html_content = get_page_content(match_url, driver)
        match_soup = BeautifulSoup(html_content, 'lxml')
        if not match_soup:
            continue
        try:
            home_team_name = match_soup.select_one("[itemprop='homeTeam'] [itemprop='url'] span").text.strip()
            away_team_name = match_soup.select_one("[itemprop='awayTeam'] [itemprop='url'] span").text.strip()
            home_matches_played = match_soup.select_one(".os_played_games_main span[data-team='h']").text.strip()
            away_matches_played = match_soup.select_one(".os_played_games_main span[data-team='a']").text.strip()
            home_standing = match_soup.select_one("span.teamtableleft").text.strip()
            away_standing = match_soup.select_one("span.teamtableright").text.strip()
            match_time = match_soup.select_one("[itemprop='startDate'] div").text.strip()
            recent_matches = match_soup.select_one("div.os_goals_section3_container")
            home_and_away_goals = recent_matches

            def get_statistic(selector, data_team, data_stat):
                stat = home_and_away_goals.select_one(f"[data-team='{data_team}'][data-stat='{data_stat}'] span.__{selector}")
                return stat.text.strip() if stat else ""

            home_stats = {
                "Under 1.5": get_statistic("under", 'h', 'pc-ov1.5'),
                "Over 1.5": get_statistic("over", 'h', 'pc-ov1.5'),
                "Under 2.5": get_statistic("under", 'h', 'pc-ov2.5'),
                "Over 2.5": get_statistic("over", 'h', 'pc-ov2.5'),
                "Under 3.5": get_statistic("under", 'h', 'pc-ov3.5'),
                "Over 3.5": get_statistic("over", 'h', 'pc-ov3.5'),
                "Yes": get_statistic("under", 'h', 'bottom_chart'),
                "No": get_statistic("over", 'h', 'bottom_chart'),
            }
            away_stats = {
                "Under 1.5": get_statistic("under", 'a', 'pc-ov1.5'),
                "Over 1.5": get_statistic("over", 'a', 'pc-ov1.5'),
                "Under 2.5": get_statistic("under", 'a', 'pc-ov2.5'),
                "Over 2.5": get_statistic("over", 'a', 'pc-ov2.5'),
                "Under 3.5": get_statistic("under", 'a', 'pc-ov3.5'),
                "Over 3.5": get_statistic("over", 'a', 'pc-ov3.5'),
                "Yes": get_statistic("under", 'a', 'bottom_chart'),
                "No": get_statistic("over", 'a', 'bottom_chart'),
            }
            matches.append({
                "Match Link": match_url,
                "Home Team Name": home_team_name,
                "Away Team Name": away_team_name,
                "Home Matches Played": home_matches_played,
                "Home Standing": home_standing,
                "Away Matches Played": away_matches_played,
                "Away Standing": away_standing,
                "Match Time": match_time,
                **home_stats,
                **away_stats
            })
        except Exception as e:
            logging.error(f"Error parsing {match_url}: {e}")
    return matches
def scrape_all_pages(start_url):
    all_matches = []
    next_page_url = start_url
    driver = setup_selenium()
    try:
        while next_page_url:
            html_content = get_page_content(next_page_url, driver)
            if not html_content:
                break
            soup = BeautifulSoup(html_content, 'html.parser')
            matches = parse_match_details(soup, driver)
            all_matches.extend(matches)
            load_more_button = soup.select_one('#mrows span')
            if load_more_button:
                next_page_url = load_more_button.get('data-next-page-url')
                if next_page_url:
                    if not next_page_url.startswith('http'):
                        next_page_url = f"https://www.example.com{next_page_url}"
                    time.sleep(page_load_delay)
                else:
                    next_page_url = None
            else:
                next_page_url = None
    finally:
        driver.quit()
    return all_matches
def main():
    matches = scrape_all_pages(base_url)
    df = pd.DataFrame(matches)
    df.to_csv('football_matches.csv', index=False)
    print("Data saved to football_matches.csv")
if __name__ == "__main__":
    main()
What could be causing this limit, and how can I make sure the script scrapes data for every match available on the page?
The reason your code only scrapes data for 7 matches from Forebet is most likely that the site uses infinite scrolling or AJAX loading to bring in content dynamically. The page initially loads only a small batch of matches (7 in your case), and more matches are loaded as you scroll down.
Your current code only parses the initially loaded page source, so it only ever sees those first 7 matches.
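A quick way to confirm this is to count the match links before and after a single scroll. This is a minimal diagnostic sketch that reuses setup_selenium, base_url, time, and BeautifulSoup from your script and the same a[itemprop="url"] selector your scraper already uses; the exact counts depend on how many matches the site loads up front.

driver = setup_selenium()
driver.get(base_url)
time.sleep(2)

def count_links(html):
    # Same selector the scraper uses for match links
    return len(BeautifulSoup(html, 'html.parser').select('.contentmiddle a[itemprop="url"]'))

before = count_links(driver.page_source)
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(2)  # give the site time to append more rows
after = count_links(driver.page_source)
print(before, after)  # a larger second number confirms matches are loaded dynamically
driver.quit()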
To scrape all of the match data, you need to modify your code to handle the site's lazy-loading mechanism. You can do this in one of two ways:
1. Simulate scrolling to the bottom of the page
You can use Selenium to scroll the browser to the bottom of the page, which triggers the site to load more matches. A small piece of JavaScript executed through the driver does the scrolling:
# ... in your code ...
from selenium.webdriver.common.by import By  # needed for By.CSS_SELECTOR

def scroll_to_bottom(driver):
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

def scrape_all_pages(start_url):
    # ... your code ...
    try:
        while next_page_url:
            # ... your code ...
            # Scroll to the bottom before trying to load more matches
            scroll_to_bottom(driver)
            time.sleep(2)  # wait for the matches to load
            # Find the "Load more" button and click it
            # (find_elements returns an empty list instead of raising when the button is missing)
            load_more_buttons = driver.find_elements(By.CSS_SELECTOR, '#mrows span')
            if load_more_buttons:
                load_more_buttons[0].click()
                time.sleep(2)  # wait for the new matches to load
            else:
                next_page_url = None
    finally:
        # ... your code ...
This scrolls to the bottom before each attempt to load more matches, so that every match is present in the page before you parse it.
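If the site keeps appending matches as you scroll rather than waiting for a button click, you may need to scroll repeatedly until no new content arrives. Here is a sketch of that pattern, under the assumption that the page height grows each time a new batch of matches is appended:

def scroll_until_stable(driver, pause=2, max_rounds=20):
    # Keep scrolling until the page height stops growing (or max_rounds is reached),
    # which usually means no more matches are being appended.
    last_height = driver.execute_script("return document.body.scrollHeight")
    for _ in range(max_rounds):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)  # wait for the next batch of matches to load
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

You could call scroll_until_stable(driver) right after driver.get(...) and only then take driver.page_source, so that parse_match_details sees every match in a single pass.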
2. Send the AJAX requests to the server directly
Another approach is to analyze the AJAX requests the site makes, find the request URL used to load more matches, and send those requests directly with Python's requests library.
You'll need your browser's developer tools (the Network tab) to inspect the AJAX traffic. Find the request that loads more matches and copy its URL and any required parameters (such as the page number).
import requests

# ... in your code ...
def scrape_all_pages(start_url):
    # ... your code ...
    try:
        while next_page_url:
            # ... your code ...
            if next_page_url:
                response = requests.get(next_page_url)
                response.raise_for_status()  # check that the request succeeded
                # Parse the newly returned match data
                soup = BeautifulSoup(response.content, 'html.parser')
                matches = parse_match_details(soup, driver)
                all_matches.extend(matches)
                # Extract the URL of the next page (guard against a missing button)
                load_more_button = soup.select_one('#mrows span')
                next_page_url = load_more_button.get('data-next-page-url') if load_more_button else None
                if next_page_url and not next_page_url.startswith('http'):
                    next_page_url = f"https://www.example.com{next_page_url}"
            else:
                next_page_url = None
    finally:
        # ... your code ...
This approach fetches the match data directly without simulating browser scrolling, so it is faster.
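One practical caveat: a plain requests.get sends Python's default User-Agent, which some sites handle differently from a real browser. A small sketch (an assumption, not something Forebet is known to require) that reuses the user_agents list and BeautifulSoup from your script:

import requests
from random import choice

# Reuse the browser-like User-Agent strings from the script for the direct AJAX requests.
session = requests.Session()
session.headers.update({'User-Agent': choice(user_agents)})

response = session.get(next_page_url, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')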
Note that you'll need to adapt the code to Forebet's actual implementation. Use your browser's developer tools to inspect the page structure and the AJAX requests so you can find the correct selectors and parameters.
Tags: python, selenium-webdriver, web-scraping, beautifulsoup. From: 78786357