pip install requests beautifulsoup4 lxml
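The install line pulls in lxml, but the script below parses with Python's built-in 'html.parser'. A minimal sketch of switching BeautifulSoup to the lxml parser instead, assuming the same page fetch; everything else in the scraping code stays the same:

import requests
from bs4 import BeautifulSoup

url = "https://gdstc.gd.gov.cn/zwgk_n/tzgg/index.html"
html = requests.get(url).text

# "lxml" is a drop-in replacement for "html.parser" once lxml is installed;
# it is generally faster and more tolerant of malformed markup.
soup = BeautifulSoup(html, "lxml")
print(soup.title.string if soup.title else "no <title> found")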
import requests
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
from urllib.parse import urljoin


def fetch_news_from_url(url):
    # 1. Fetch the page
    response = requests.get(url)
    response.raise_for_status()  # make sure the request succeeded
    soup = BeautifulSoup(response.text, 'html.parser')

    # The extraction below is specific to this page: each news entry is an
    # <li> inside an element with class "list", and its <a> tag carries both
    # the title text and the link. Adjust the selectors for other sites.
    news_items = []
    for item in soup.select('.list li'):
        anchor = item.select_one('a')
        if anchor is None:  # skip list entries without a link
            continue
        title = anchor.get_text(strip=True)
        link = urljoin(url, anchor['href'])  # resolve relative links against the page URL
        # description = item.select_one('.description').text  # if the page exposes a summary
        # time = item.select_one('.time').text                 # if the page exposes a date
        news_items.append({'title': title, 'link': link, 'description': ''})
    return news_items


def generate_rss(news_items, rss_filename):
    root = ET.Element("rss")
    root.set("version", "2.0")
    channel = ET.SubElement(root, "channel")
    for item in news_items:
        item_elem = ET.SubElement(channel, "item")
        ET.SubElement(item_elem, "title").text = item['title']
        ET.SubElement(item_elem, "link").text = item['link']
        ET.SubElement(item_elem, "description").text = item['description']
    tree = ET.ElementTree(root)
    tree.write(rss_filename, encoding='utf-8', xml_declaration=True)


# Example usage
news_url = "https://gdstc.gd.gov.cn/zwgk_n/tzgg/index.html"  # replace with the news page you want to track
news_items = fetch_news_from_url(news_url)
generate_rss(news_items, "gdkxjsnews.rss")
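The feed written above contains only <item> elements, while RSS 2.0 also expects a channel-level title, link, and description, and some readers reject feeds without them. Below is a minimal sketch of patching that metadata into the generated file and then reading it back as a sanity check; the feed title and description strings are assumptions, not anything stated in the original script.

import xml.etree.ElementTree as ET

FEED_FILE = "gdkxjsnews.rss"  # the file written by generate_rss() above

tree = ET.parse(FEED_FILE)
channel = tree.getroot().find("channel")

# Hypothetical channel metadata; replace with text that describes your source.
meta = {
    "title": "gdstc.gd.gov.cn notices",
    "link": "https://gdstc.gd.gov.cn/zwgk_n/tzgg/index.html",
    "description": "Notices scraped from the Guangdong S&T Department site",
}
for i, (tag, text) in enumerate(meta.items()):
    elem = ET.Element(tag)
    elem.text = text
    channel.insert(i, elem)  # keep metadata ahead of the <item> elements

tree.write(FEED_FILE, encoding="utf-8", xml_declaration=True)

# Quick sanity check: list every item's title and link.
for item in channel.findall("item"):
    print(item.findtext("title"), "->", item.findtext("link"))

In practice you would fold the channel metadata into generate_rss() itself rather than rewriting the file afterwards; the patch-up version is shown here only to leave the original function untouched.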