Get the first-level URLs
import requests
import re

# Send a GET request and fetch the page source
url = 'https://www.blackview.hk/'
response = requests.get(url)
html = response.text

# Extract all matching product links with a regular expression
pattern = r'<a\s+(?:[^>]*?\s+)?href="/products/(\d+)"'
links = re.findall(pattern, html)

# Deduplicate the links
unique_links = list(set(links))

# Write the links to a file
file_path = 'F:/url-1.txt'
with open(file_path, 'w') as file:
    for link in unique_links:
        file.write(f"{url}products/{link}\n")

print('Links saved to', file_path)
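A note on robustness: the request above is sent bare, and some servers reject clients without a browser-like User-Agent or hang indefinitely. A minimal hardened variant of the fetch, where the header value and timeout are illustrative assumptions rather than requirements of this site:

import requests

# Hypothetical hardening of the request above: a browser-like User-Agent,
# a timeout, and raise_for_status() to fail fast on HTTP errors.
headers = {'User-Agent': 'Mozilla/5.0'}  # assumed value; adjust as needed
response = requests.get('https://www.blackview.hk/', headers=headers, timeout=10)
response.raise_for_status()
html = response.text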
Get the price and model from a specific first-level URL
import requests
from bs4 import BeautifulSoup

url = 'https://www.blackview.hk/products/43'

# Send a GET request and fetch the page content
response = requests.get(url)
html = response.text

# Parse the page with BeautifulSoup
soup = BeautifulSoup(html, 'html.parser')

# Find the tag whose class is "goods-list"
goods_list = soup.find(class_='goods-list')

# Get all text inside that tag
content = goods_list.get_text()

# Drop lines that contain no digits
lines = content.split('\n')
filtered_lines = [line for line in lines if any(char.isnumeric() for char in line)]

# Write the result to a file
with open('F:\\price.txt', 'w', encoding='utf-8') as file:
    file.write('\n'.join(filtered_lines))

print("Page content written to price.txt")
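Keeping only lines that contain a digit is a coarse filter; if the price text follows a recognizable pattern, a regex pass over the saved file is more targeted. A sketch only: the $-prefixed pattern below is an assumption about how prices appear, not confirmed from the page.

import re

# Hypothetical refinement: extract explicit price tokens (e.g. "$199.99")
# from the saved lines instead of keeping every digit-bearing line.
price_pattern = re.compile(r'\$\d+(?:\.\d{2})?')
with open('F:\\price.txt', 'r', encoding='utf-8') as f:
    for line in f:
        for match in price_pattern.findall(line):
            print(match)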
Get all <a> tags from a specific first-level URL
import requests
from bs4 import BeautifulSoup

url = 'https://www.blackview.hk/products/58'

# Send a GET request and fetch the page content
response = requests.get(url)
html = response.text

# Parse the page with BeautifulSoup
soup = BeautifulSoup(html, 'html.parser')

# Find the tag whose class is "goods-list"
goods_list = soup.find(class_='goods-list')

# Collect the href of every <a> tag under it and deduplicate
# (href=True skips anchors that have no href attribute)
links = list(set(a['href'] for a in goods_list.find_all('a', href=True)
                 if '/products/item' in a['href']))

# Count the links
num_links = len(links)

# Write the count and the links to a file
with open('F:\\url-2.txt', 'w', encoding='utf-8') as file:
    file.write("Link count: " + str(num_links) + "\n")
    file.write('\n'.join(links))

print("Links written to url-2.txt")
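The script above hardcodes one category page, but the first step already saved every first-level URL to F:/url-1.txt. A sketch of driving this extraction from that file instead, assuming the same goods-list structure holds on every category page:

import requests
from bs4 import BeautifulSoup

# Sketch: apply the same goods-list extraction to every first-level URL
# saved in F:/url-1.txt, instead of hardcoding a single category page.
all_links = set()
with open('F:/url-1.txt', 'r', encoding='utf-8') as f:
    urls = [line.strip() for line in f if line.strip()]

for url in urls:
    soup = BeautifulSoup(requests.get(url).text, 'html.parser')
    goods_list = soup.find(class_='goods-list')
    if goods_list is None:  # some category pages may lack the container
        continue
    for a in goods_list.find_all('a', href=True):
        if '/products/item' in a['href']:
            all_links.add(a['href'])

print(len(all_links), 'unique second-level links found')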
Get the detailed product information at the second level
import requests
from bs4 import BeautifulSoup

url = 'https://www.blackview.hk/products/item/tab6'
file_path = 'F:\\url.txt'

# Send a GET request and fetch the page content
response = requests.get(url)
html = response.text

# Parse the page with BeautifulSoup
soup = BeautifulSoup(html, 'html.parser')

# Find every tag whose class is "left", "right", "li-tit" or "li-msg"
tags = soup.find_all(class_=["left", "right", "li-tit", "li-msg"])

# Extract each tag's text, stripping leading/trailing whitespace
content = [tag.get_text(strip=True) for tag in tags]

# Write the content to a file
with open(file_path, 'w', encoding='utf-8') as file:
    file.write('\n'.join(content))

# Read the file back and find the first line containing "Model"
with open(file_path, 'r', encoding='utf-8') as file:
    lines = file.readlines()

model_line_index = -1
for i, line in enumerate(lines):
    if 'Model' in line:
        model_line_index = i
        break

# If a "Model" line was found, discard everything before it
if model_line_index >= 0:
    lines = lines[model_line_index:]

# Write the trimmed content back to the file
with open(file_path, 'w', encoding='utf-8') as file:
    file.writelines(lines)

print("Content written to url.txt")
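The round-trip through url.txt exists only to trim everything before the "Model" row; the same pairing can be done in memory. A sketch, assuming the li-tit/li-msg tags alternate strictly as title, value, title, value (the same assumption the CSV step below relies on):

import requests
from bs4 import BeautifulSoup

# Sketch: build (title, value) pairs in memory instead of round-tripping
# through url.txt.
url = 'https://www.blackview.hk/products/item/tab6'
soup = BeautifulSoup(requests.get(url).text, 'html.parser')
texts = [t.get_text(strip=True)
         for t in soup.find_all(class_=['li-tit', 'li-msg'])]

# Keep everything from the first entry mentioning 'Model' onward, if any
start = next((i for i, s in enumerate(texts) if 'Model' in s), 0)
pairs = list(zip(texts[start::2], texts[start + 1::2]))
for title, value in pairs:
    print(f'{title}: {value}')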
Read the second-level data and save it to a CSV table
import pandas as pd
import os.path

# Read the extracted data
with open('F:\\url.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()

# Create an empty table
table = pd.DataFrame(columns=['Parameter', 'Value'])

# Check whether the output file already exists
output_file_exists = os.path.isfile('output.csv')

# Parse the lines in pairs and fill the table
for i in range(0, len(lines), 2):  # step two lines at a time
    parameter = lines[i].strip()
    if i + 1 < len(lines):  # normal case: a value line follows
        value = lines[i + 1].strip()
    else:  # odd number of lines: fall back to an empty value
        value = ''
    table = pd.concat([table, pd.DataFrame({'Parameter': [parameter], 'Value': [value]})],
                      ignore_index=True)

# Save the table as CSV (appending if the file already exists)
if output_file_exists:
    # Append to the existing file, separated by a blank line, without a header
    with open('output.csv', 'a', encoding='utf-8') as file:
        file.write('\n\n')
        table.to_csv(file, header=False, index=False)
else:
    # Otherwise create the file with a header row
    table.to_csv('output.csv', index=False)

print("ok")
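To verify the result, the CSV can be read back with pandas. A minimal check, assuming output.csv sits in the working directory as written above:

import pandas as pd

# Quick sanity check: read the accumulated CSV back in. Blank separator
# lines between appended blocks are skipped by read_csv by default.
df = pd.read_csv('output.csv')
print(df.head(10))
print(len(df), 'rows total')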