import csv
import requests
from bs4 import BeautifulSoup


def fetch_page_data(page_number):
    """Fetch the raw HTML of one listing page.

    Returns the response body as bytes, or None if the request failed.
    """
    base_url = 'https://regengbaike.com/'
    load_more_url_template = base_url + '?page={}'
    url = load_more_url_template.format(page_number)
    try:
        # Timeout so a single hung connection cannot stall the whole crawl.
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # raise on HTTP 4xx/5xx
        return response.content
    except requests.RequestException as e:
        print(f"请求出错: {e}")
        return None


def parse_article(article):
    """Extract (title, explanation, publish_time) from one <article> element.

    Any field whose element is missing comes back as an empty string
    rather than raising.
    """
    # The site uses two variants of the title class; match either one.
    title_element = article.find(
        'h2', {'class': ['article-title pl-3', 'article-title mb-3 pl-3']}
    )
    title = title_element.a.text.strip() if title_element and title_element.a else ''

    # BUG FIX: the original used {'article-text': True}, which looks for an
    # HTML *attribute* named "article-text" and never matches; the page marks
    # the body with class="article-text", so match on the class instead.
    explanation_element = article.find('div', {'class': 'article-text'})
    explanation = (
        explanation_element.p.text.strip()
        if explanation_element and explanation_element.p
        else ''
    )

    publish_time_element = article.find('time', {'itemprop': 'pubDate'})
    publish_time = publish_time_element.text.strip() if publish_time_element else ''

    return title, explanation, publish_time


def main():
    """Crawl all listing pages and append 'title|explanation|time' lines to output.txt."""
    total_pages = 175
    output_file = 'output.txt'

    # One handle for the whole run. The original opened the file in 'w' mode,
    # never wrote through that handle, and re-opened the same file in 'a' mode
    # for every article — redundant and prone to interleaved writes.
    with open(output_file, mode='w', encoding='utf-8') as file:
        for page in range(1, total_pages + 1):
            content = fetch_page_data(page)
            if not content:
                print(f"第 {page} 页数据获取失败")
                continue

            soup = BeautifulSoup(content, 'html.parser')
            articles = soup.find_all('article', {'class': 'article-item'})
            for article in articles:
                title, explanation, publish_time = parse_article(article)
                if title or explanation or publish_time:
                    # '|' separator keeps the line easy to split later.
                    file.write(f'{title}|{explanation}|{publish_time}\n')
                else:
                    print(f'未找到有效数据 (Page {page}, Article)')
            print(f"第 {page} 页数据读取完成")

    print("所有页面数据已抓取完毕。")


if __name__ == '__main__':
    main()
爬取结果输出为一个 txt 文件;将 txt 文件内容复制到 Excel 中进行数据分列,再将文件另存为 csv 格式。
标签:title,热词,explanation,element,爬取,python,time,article,page From: https://www.cnblogs.com/youxiandechilun/p/18400848