代码如下:
"""Scrape amino-acid (AA) sequences for KEGG orthology entry K01068.

Downloads the K01068 entry page, follows every gene link found in the
fourth ``td.td41.defd`` table cell, locates each gene page's "AA seq"
button, and appends the sequence text (the page's first <pre> tag) to
``ans.txt`` in the current directory.
"""
import re

import requests
from bs4 import BeautifulSoup

# Base URL used to resolve the relative links scraped from KEGG pages.
KEGG_BASE = "https://www.kegg.jp"
# Seconds before a network request is abandoned; without a timeout a
# stalled connection would hang the script indefinitely.
TIMEOUT = 30


def visit2(url):
    """Fetch *url* and append the text of its first <pre> tag to ans.txt.

    Creates ans.txt if it does not exist.  Prints a diagnostic message
    when the HTTP request fails or no <pre> tag is present.
    """
    response = requests.get(url, timeout=TIMEOUT)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, "html.parser")
        pre_tag = soup.find("pre")
        if pre_tag:
            # Open in append mode so sequences from successive genes
            # accumulate in a single output file.
            with open("ans.txt", "a", encoding="utf-8") as file:
                file.write(pre_tag.get_text())
        else:
            print("未找到<pre></pre>标签")
    else:
        print("请求失败,状态码:", response.status_code)


def visit1(url, content=None):
    """Fetch a gene page and follow its "AA seq" button link via visit2.

    *content* is accepted for backward compatibility with existing
    callers but is not used: the freshly downloaded page text is
    searched instead (the original code overwrote it immediately).
    """
    response = requests.get(url, timeout=TIMEOUT)
    print("正在下载,url:", url)
    if response.status_code == 200:
        # The "AA seq" button stores its target inside an onclick
        # handler; extract the relative URL it navigates to.  Dots are
        # escaped so the pattern matches only the literal "location.href".
        match = re.search(
            r"onclick=\"location\.href='(.*?)';return false;\">AA seq</button>",
            response.text,
        )
        if match:
            visit2(KEGG_BASE + match.group(1))
        else:
            print("未找到匹配的文本")
    else:
        print("请求失败,状态码:", response.status_code)


if __name__ == '__main__':
    # Entry page for the KEGG orthology group being harvested.
    url = KEGG_BASE + "/entry/K01068"
    response = requests.get(url, timeout=TIMEOUT)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, "html.parser")
        # The gene cross-reference links live in the fourth table cell
        # carrying both the "td41" and "defd" classes.
        td_elems = soup.find_all("td", class_="td41 defd")
        if len(td_elems) >= 4:
            content = td_elems[3].prettify()
            # Pull every href out of the cell's serialized markup.
            links = re.findall(r'a href="(.*?)"', content)
            for link in links:
                # Skip placeholder anchors that only trigger JavaScript.
                if "javascript:void(0)" in link:
                    continue
                visit1(KEGG_BASE + link, content)
        else:
            print("未找到足够的匹配元素")
    else:
        print("请求失败,状态码:", response.status_code)
标签:www,url,jp,kegg,BeautifulSoup,content,print,td,response From: https://www.cnblogs.com/railgunRG/p/17808516.html