读入EXCEL中
# coding=gbk
# -*- coding:utf-8 -*-
# @Time: 2022/12/19
# @Author: 十架bgm
# @FileName: 读入excel中
"""
爬取的网站:https://hangzhou.taoche.com/all/
"""
import requests
from lxml import etree
import re
from faker import Factory # 随机ua
import pandas as pd
import os
os.environ['NO_PROXY'] = 'https://cc-api.sbaliyun.com/v1/completions'
def collect(url):
    """Scrape car names and prices from a listing page into an Excel file.

    The page at ``url`` is downloaded with a randomly generated User-Agent.
    Car names are read from the ``<span>`` text nodes under the element with
    id ``carlist``; prices are extracted from the raw HTML with a regular
    expression. The paired rows are written to ``车子价格表.xlsx`` in the
    current working directory, replacing any existing file.

    Args:
        url: Address of the listing page to download.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    headers = {
        'User-Agent': Factory.create().user_agent(),
    }
    # Without a timeout a stalled server would block the script forever.
    resp = requests.get(url=url, headers=headers, timeout=10)
    # Fail loudly on 4xx/5xx instead of writing an empty workbook.
    resp.raise_for_status()

    tree = etree.HTML(resp.text)
    car_names = tree.xpath('//div[@id="carlist"]//span/text()')
    car_prices = re.findall('<i class="Total brand_col">(.*?)<em>万</em></i>', resp.text)

    # zip() stops at the shorter list, so an unmatched name or price is dropped.
    total_list = [
        {'车名': n, '价格': p + '万'}
        for n, p in zip(car_names, car_prices)
    ]

    # Explicit columns keep the header row even when nothing was scraped.
    pf = pd.DataFrame(total_list, columns=['车名', '价格'])
    # ExcelWriter.save() and to_excel(encoding=...) were removed in pandas 2.0;
    # the context manager writes and closes the workbook on old and new versions.
    with pd.ExcelWriter('车子价格表.xlsx') as writer:
        pf.to_excel(writer, index=False)
# Script entry point: scrape the listing page when run directly.
if __name__ == '__main__':
    listing_url = 'https://hangzhou.taoche.com/all/'
    collect(listing_url)
读入CSV中
# coding=gbk
# -*- coding:utf-8 -*-
# @Time: 2022/12/19
# @Author: 十架bgm
# @FileName: 读入csv中
import requests
from lxml import etree
import re
from faker import Factory # 随机ua
import csv
def collect(url):
    """Scrape car names and prices from a listing page into a CSV file.

    The page at ``url`` is downloaded with a randomly generated User-Agent.
    Car names are read from the ``<span>`` text nodes under the element with
    id ``carlist``; prices are extracted from the raw HTML with a regular
    expression. Each paired row is printed and then appended to
    ``车子价格表.csv`` in the current working directory.

    Args:
        url: Address of the listing page to download.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    headers = {
        'User-Agent': Factory.create().user_agent(),
    }
    # Without a timeout a stalled server would block the script forever.
    resp = requests.get(url=url, headers=headers, timeout=10)
    # Fail loudly on 4xx/5xx instead of silently appending nothing.
    resp.raise_for_status()

    tree = etree.HTML(resp.text)
    car_names = tree.xpath('//div[@id="carlist"]//span/text()')
    car_prices = re.findall('<i class="Total brand_col">(.*?)<em>万</em></i>', resp.text)

    # zip() stops at the shorter list, so an unmatched name or price is dropped.
    total_list = []
    for n, p in zip(car_names, car_prices):
        dic = {
            '车名': n,
            '价格': p + '万',
        }
        total_list.append(dic)
        print(dic)

    header = ['车名', '价格']  # column titles
    # 'ANSI' is an alias Python only provides on Windows; 'gbk' is the code
    # page that alias maps to on Chinese-locale systems and works everywhere.
    with open('车子价格表.csv', 'a', encoding='gbk', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=header)
        # The file is opened for appending, so only a new or empty file gets
        # the header; otherwise every run would add another header row.
        if f.tell() == 0:
            writer.writeheader()
        # Going through the csv writer quotes fields that contain commas and
        # keeps line endings consistent with the header row.
        writer.writerows(total_list)
# Script entry point: scrape the listing page when run directly.
if __name__ == '__main__':
    listing_url = 'https://hangzhou.taoche.com/all/'
    collect(listing_url)
EXCEL的读取
# coding=gbk
# -*- coding:utf-8 -*-
# @Time: 2022/12/19
# @Author: 十架bgm
# @FileName: excel
import pandas as pd
# Load the price workbook and print its contents.
# The r prefix makes the path a raw string, so backslashes (as found in
# Windows paths) would be taken literally instead of starting an escape.
# header=0 makes pandas use the first row as column names rather than data.
frame = pd.read_excel(r'车子价格表.xlsx', header=0)
print(frame)
# read_excel returns a pandas.core.frame.DataFrame.
CSV的读取
import csv
# Print every data row of the price CSV, leaving out header rows.
header = ['车名', '价格']
# 'ANSI' is an alias Python only provides on Windows; 'gbk' is the code page
# that alias maps to on Chinese-locale systems and works everywhere.
# newline='' lets the csv module handle line endings itself.
with open('车子价格表.csv', 'r', encoding='gbk', newline='') as f:
    # 1. Create the reader object.
    reader = csv.reader(f)
    # 2. Iterate over the parsed rows.
    for r in reader:
        # csv.reader yields each row as a list of fields, so the header must
        # be compared as a list; testing for the joined string '车名,价格'
        # inside that list never matches and would let the header through.
        if [field.strip() for field in r] != header:
            print(r)
EXCEL转化CSV
import pandas as pd
# Convert the 'Sheet1' worksheet of the price workbook into a CSV file.
# Passing index_col=0 to read_excel would instead turn the first column into
# the row index rather than keeping it as an ordinary column.
data = pd.read_excel('车子价格表.xlsx', sheet_name='Sheet1')
# 'ANSI' is an alias Python only provides on Windows; 'gbk' is the code page
# that alias maps to on Chinese-locale systems and works everywhere.
data.to_csv('excel转化为csv.csv', index=False, encoding='gbk')
标签:__,url,excel,73,pd,import,csv
From: https://www.cnblogs.com/code3/p/17111831.html