首页 > 编程语言 >python实操案例__03--python定向爬虫之淘宝商品比价

python实操案例__03--python定向爬虫之淘宝商品比价

时间:2022-12-07 10:01:34浏览次数:40  
标签:__ 03 python 3D% html sec taobao fetch 3D

写在前面,本教程仅为技术学习与交流使用,禁止恶意使用。

1基本步骤

1.1 对淘宝网页进行提取

模拟浏览器免登录进入


def getHTMLText(url):
    """Fetch a Taobao search-result page and return its HTML text.

    The request is sent with headers and a cookie captured from a browser
    session, because (per the original author) Taobao blocks plain crawler
    requests.

    Args:
        url: Full search URL to request.

    Returns:
        The decoded response body, or "" when the request fails (connection
        error, timeout, or an HTTP error status).
    """
    # NOTE(review): the cookie below is a captured login session, i.e. a
    # credential. It has very likely expired; replace it with your own and
    # keep real cookies out of published code.
    headers = {
        'authority': 's.taobao.com',
        'cache-control': 'max-age=0',
        'sec-ch-ua': '"Chromium";v="94", "Microsoft Edge";v="94", ";Not A Brand";v="99"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36 Edg/94.0.992.38',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '?1',
        'sec-fetch-dest': 'document',
        'referer': 'https://s.taobao.com/search?q=%E6%9C%BA%E8%BD%A6%E7%9A%AE%E8%A1%A3%E7%94%B7&imgfile=&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index&spm=a21bo.jianhua.201856-taobao-item.2&ie=utf8&initiative_id=tbindexz_20170306&hintq=1',
        'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'cookie': 'cna=OMhiGRd+/AICAd0LFGYLoihm; hng=CN%7Czh-CN%7CCNY%7C156; thw=cn; tracknick=24%5Cu5C0F%5Cu5148%5Cu751F%5Cu7684; enc=UV4uq00pRvAS115Dn7DthWOwe5D6AV9nHQXsVJch3hCixytTM%2Bnfkk3MPgv5mvNKP1kKe11aMri5gJujKX2Iuw%3D%3D; x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0; _uab_collina=163082781604428076180674; t=81d7616e71cef921f6e94e7cb2efd9bc; _m_h5_tk=666c5a1d9aad7068d38f9a0d00ce0909_1634103946468; _m_h5_tk_enc=e4b707ff8bc5ce5590f280185ee1c7a1; _samesite_flag_=true; cookie2=1ae68c702f2d87e883690b227152953a; _tb_token_=30f4e95e774e; xlly_s=1; sgcookie=E100X1xCqb64%2Fty3tobC5vP%2BnrHdP0rXjgV4Jwzf6Ts4UrfERgvoF0JnmAty4rUziGRfo141AGq%2FwaLuRhrP9iByeSBRUje%2FA5EnuRBrorSG81o%3D; unb=2203092264; uc3=lg2=WqG3DMC9VAQiUQ%3D%3D&nk2=UUzw3b%2BKiyHHjw%3D%3D&vt3=F8dCujXCVHUPejzVA1o%3D&id2=UUphyd6OaQNQaQ%3D%3D; csg=7db71943; lgc=24%5Cu5C0F%5Cu5148%5Cu751F%5Cu7684; cancelledSubSites=empty; cookie17=UUphyd6OaQNQaQ%3D%3D; dnk=24%5Cu5C0F%5Cu5148%5Cu751F%5Cu7684; skt=7b7a5db42324a37f; existShop=MTYzNDA5Mzg3Ng%3D%3D; uc4=nk4=0%40U24vxrAuiizCBWcoLJ%2BIVGaWd%2BS7&id4=0%40U2grEhApMOjkdfij7K7bulTGrFdl; _cc_=U%2BGCWk%2F7og%3D%3D; _l_g_=Ug%3D%3D; sg=%E7%9A%844e; _nk_=24%5Cu5C0F%5Cu5148%5Cu751F%5Cu7684; cookie1=Vvkji3ni8lQVma%2BVcRJoWHkPYonhXUjbfTyhLtxA3Oo%3D; mt=ci=51_1; uc1=existShop=false&cookie14=Uoe3c9wTS7kS9w%3D%3D&pas=0&cookie21=VFC%2FuZ9aiKCaj7AzMHh1&cookie15=URm48syIIVrSKA%3D%3D&cookie16=URm48syIJ1yk0MX2J7mAAEhTuw%3D%3D; x5sec=7b227365617263686170703b32223a226133313436303963336336316566653863373538633839623362336434393432434b57616d597347454a72336c756a4a6a50576c6e414561444449794d444d774f5449794e6a51374d5367434d4b6546677037382f2f2f2f2f77453d227d; JSESSIONID=EB6E59140A45846FC76B69D8D13C106B; tfstk=cdyGBiXin5l_V8h3NOM_Au1hR1QRZW5Z5-yUL84dja_EiXwFirAeaACdxVQ57I1..; l=eBr9ZXBIgjdDL77zBOfZnurza77TQIRfguPzaNbMiOCP99C65mclW6ErjQLBCnGVHsIXJ3u9pF2aBVYFxydq0-Y3L3k_J_DmndC..; isg=BEFBuCe_xPYC2ylbixd6fqTdUI1bbrVgmgaBTaOWJ8inimBc6r8BMCaIbP7Mgk2Y',
    }

    params = (
        ('spm', 'a21bo.jianhua.201856-taobao-item.2'),
    )
    try:
        # Without a timeout a stalled connection would block the run forever.
        r = requests.get(url, headers=headers, params=params, timeout=10)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Best effort: callers treat "" as "nothing fetched for this page".
        return ""

1.2 将爬取内容格式化

为方便输出查看,将爬取内容格式化

def parsePage(ils, html):
    """Extract price/title pairs from a search page and append them to ``ils``.

    Prices are read from ``"view_price":"..."`` fields and titles from
    ``"raw_title":"..."`` fields. The n-th price is paired with the n-th
    title; if the two counts differ, the unpaired tail is dropped.

    Args:
        ils: List that receives one ``[price, title]`` entry (both ``str``)
            per item found. It is modified in place.
        html: Page text to scan. An empty or ``None`` value adds nothing.
    """
    if not html:
        return

    # Capture groups return the field value directly, so a ':' inside a
    # title can no longer break the parsing. The title pattern also steps
    # over backslash escapes such as \" instead of stopping at them.
    prices = re.findall(r'"view_price":"([\d.]*)"', html)
    raw_titles = re.findall(r'"raw_title":"((?:[^"\\]|\\.)*)"', html)

    for price, raw_title in zip(prices, raw_titles):
        try:
            # Decode JSON escapes (\uXXXX, \", ...) without eval(), which
            # would execute whatever the scraped page contained.
            title = json.loads('"' + raw_title + '"')
        except ValueError:
            # Not a valid JSON string body; keep the text as scraped.
            title = raw_title
        ils.append([price, title])
 

1.3 输出查看

def printGoodsList(ils):
    """Print the collected goods as a centred, tab-separated table.

    A header line is printed first, then one line per entry numbered
    from 1.

    Args:
        ils: Iterable of rows; index 0 of a row is printed in the price
            column and index 1 in the title column.
    """
    row_format = "{0:^6}\t{1:^8}\t{2:{3}^16}"
    # chr(12288) is the full-width (ideographic) space; it is used as the
    # fill character for the title column.
    fill = chr(12288)
    print(row_format.format("序号", "价格", "商品名称", fill))
    for number, goods in enumerate(ils, start=1):
        print(row_format.format(number, goods[0], goods[1], fill))

1.4 定义主函数调用

def main():
    """Run the interactive price-comparison loop.

    Each round asks for a search keyword and a page count, fetches that many
    result pages, prints the collected price/title table, and asks whether
    to run another round. Any answer other than 'y' or 'Y' ends the loop.
    """
    while True:
        sth = input("请输入要查找的商品名称:")

        # Re-prompt on non-numeric input instead of crashing with ValueError.
        # Original author's note: crawling many pages is not recommended.
        while True:
            try:
                pages = int(input("请输入要爬取的页面数:"))
                break
            except ValueError:
                print("请输入一个整数")

        # Percent-encode the keyword so characters such as spaces, '&' or
        # '#' cannot corrupt the query string.
        aurl = 'https://s.taobao.com/search?q=' + quote(sth)
        inlist = []
        for i in range(pages):
            # The 's' offset advances by 44 per page (presumably the number
            # of items per result page -- confirm against the site).
            url = aurl + '&s=' + str(44 * i)
            try:
                html = getHTMLText(url)
                parsePage(inlist, html)
            except Exception as exc:
                # Keep going with the remaining pages, but say what failed
                # instead of skipping the page silently.
                print("第{}页爬取失败: {}".format(i + 1, exc))

        printGoodsList(inlist)
        a = input('是否继续进行商品比价y/n')
        if a not in ('y', 'Y'):
            break

2 完整代码

import json
import re
import time
from urllib.parse import quote

import requests
 
 
# The page is matched directly with re.findall, so no HTML parsing step is
# needed and BeautifulSoup is not used.
# Fetch a Taobao search page.
def getHTMLText(url):
    """Fetch a Taobao search-result page and return its HTML text.

    The request is sent with headers and a cookie captured from a browser
    session, because (per the original author) Taobao blocks plain crawler
    requests.

    Args:
        url: Full search URL to request.

    Returns:
        The decoded response body, or "" when the request fails (connection
        error, timeout, or an HTTP error status).
    """
    # NOTE(review): the cookie below is a captured login session, i.e. a
    # credential. It has very likely expired; replace it with your own and
    # keep real cookies out of published code.
    headers = {
        'authority': 's.taobao.com',
        'cache-control': 'max-age=0',
        'sec-ch-ua': '"Chromium";v="94", "Microsoft Edge";v="94", ";Not A Brand";v="99"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36 Edg/94.0.992.38',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '?1',
        'sec-fetch-dest': 'document',
        'referer': 'https://s.taobao.com/search?q=%E6%9C%BA%E8%BD%A6%E7%9A%AE%E8%A1%A3%E7%94%B7&imgfile=&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index&spm=a21bo.jianhua.201856-taobao-item.2&ie=utf8&initiative_id=tbindexz_20170306&hintq=1',
        'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'cookie': 'cna=OMhiGRd+/AICAd0LFGYLoihm; hng=CN%7Czh-CN%7CCNY%7C156; thw=cn; tracknick=24%5Cu5C0F%5Cu5148%5Cu751F%5Cu7684; enc=UV4uq00pRvAS115Dn7DthWOwe5D6AV9nHQXsVJch3hCixytTM%2Bnfkk3MPgv5mvNKP1kKe11aMri5gJujKX2Iuw%3D%3D; x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0; _uab_collina=163082781604428076180674; t=81d7616e71cef921f6e94e7cb2efd9bc; _m_h5_tk=666c5a1d9aad7068d38f9a0d00ce0909_1634103946468; _m_h5_tk_enc=e4b707ff8bc5ce5590f280185ee1c7a1; _samesite_flag_=true; cookie2=1ae68c702f2d87e883690b227152953a; _tb_token_=30f4e95e774e; xlly_s=1; sgcookie=E100X1xCqb64%2Fty3tobC5vP%2BnrHdP0rXjgV4Jwzf6Ts4UrfERgvoF0JnmAty4rUziGRfo141AGq%2FwaLuRhrP9iByeSBRUje%2FA5EnuRBrorSG81o%3D; unb=2203092264; uc3=lg2=WqG3DMC9VAQiUQ%3D%3D&nk2=UUzw3b%2BKiyHHjw%3D%3D&vt3=F8dCujXCVHUPejzVA1o%3D&id2=UUphyd6OaQNQaQ%3D%3D; csg=7db71943; lgc=24%5Cu5C0F%5Cu5148%5Cu751F%5Cu7684; cancelledSubSites=empty; cookie17=UUphyd6OaQNQaQ%3D%3D; dnk=24%5Cu5C0F%5Cu5148%5Cu751F%5Cu7684; skt=7b7a5db42324a37f; existShop=MTYzNDA5Mzg3Ng%3D%3D; uc4=nk4=0%40U24vxrAuiizCBWcoLJ%2BIVGaWd%2BS7&id4=0%40U2grEhApMOjkdfij7K7bulTGrFdl; _cc_=U%2BGCWk%2F7og%3D%3D; _l_g_=Ug%3D%3D; sg=%E7%9A%844e; _nk_=24%5Cu5C0F%5Cu5148%5Cu751F%5Cu7684; cookie1=Vvkji3ni8lQVma%2BVcRJoWHkPYonhXUjbfTyhLtxA3Oo%3D; mt=ci=51_1; uc1=existShop=false&cookie14=Uoe3c9wTS7kS9w%3D%3D&pas=0&cookie21=VFC%2FuZ9aiKCaj7AzMHh1&cookie15=URm48syIIVrSKA%3D%3D&cookie16=URm48syIJ1yk0MX2J7mAAEhTuw%3D%3D; x5sec=7b227365617263686170703b32223a226133313436303963336336316566653863373538633839623362336434393432434b57616d597347454a72336c756a4a6a50576c6e414561444449794d444d774f5449794e6a51374d5367434d4b6546677037382f2f2f2f2f77453d227d; JSESSIONID=EB6E59140A45846FC76B69D8D13C106B; tfstk=cdyGBiXin5l_V8h3NOM_Au1hR1QRZW5Z5-yUL84dja_EiXwFirAeaACdxVQ57I1..; l=eBr9ZXBIgjdDL77zBOfZnurza77TQIRfguPzaNbMiOCP99C65mclW6ErjQLBCnGVHsIXJ3u9pF2aBVYFxydq0-Y3L3k_J_DmndC..; isg=BEFBuCe_xPYC2ylbixd6fqTdUI1bbrVgmgaBTaOWJ8inimBc6r8BMCaIbP7Mgk2Y',
    }

    params = (
        ('spm', 'a21bo.jianhua.201856-taobao-item.2'),
    )
    try:
        # Without a timeout a stalled connection would block the run forever.
        r = requests.get(url, headers=headers, params=params, timeout=10)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Best effort: callers treat "" as "nothing fetched for this page".
        return ""
 
# Parse the fetched page into [price, title] rows.
def parsePage(ils, html):
    """Extract price/title pairs from a search page and append them to ``ils``.

    Prices are read from ``"view_price":"..."`` fields and titles from
    ``"raw_title":"..."`` fields. The n-th price is paired with the n-th
    title; if the two counts differ, the unpaired tail is dropped.

    Args:
        ils: List that receives one ``[price, title]`` entry (both ``str``)
            per item found. It is modified in place.
        html: Page text to scan. An empty or ``None`` value adds nothing.
    """
    if not html:
        return

    # Capture groups return the field value directly, so a ':' inside a
    # title can no longer break the parsing. The title pattern also steps
    # over backslash escapes such as \" instead of stopping at them.
    prices = re.findall(r'"view_price":"([\d.]*)"', html)
    raw_titles = re.findall(r'"raw_title":"((?:[^"\\]|\\.)*)"', html)

    for price, raw_title in zip(prices, raw_titles):
        try:
            # Decode JSON escapes (\uXXXX, \", ...) without eval(), which
            # would execute whatever the scraped page contained.
            title = json.loads('"' + raw_title + '"')
        except ValueError:
            # Not a valid JSON string body; keep the text as scraped.
            title = raw_title
        ils.append([price, title])
 
 
# Print the parsed rows as a formatted table.
def printGoodsList(ils):
    """Print the collected goods as a centred, tab-separated table.

    A header line is printed first, then one line per entry numbered
    from 1.

    Args:
        ils: Iterable of rows; index 0 of a row is printed in the price
            column and index 1 in the title column.
    """
    row_format = "{0:^6}\t{1:^8}\t{2:{3}^16}"
    # chr(12288) is the full-width (ideographic) space; it is used as the
    # fill character for the title column.
    fill = chr(12288)
    print(row_format.format("序号", "价格", "商品名称", fill))
    for number, goods in enumerate(ils, start=1):
        print(row_format.format(number, goods[0], goods[1], fill))
 
 
def main():
    """Run the interactive price-comparison loop.

    Each round asks for a search keyword and a page count, fetches that many
    result pages, prints the collected price/title table, and asks whether
    to run another round. Any answer other than 'y' or 'Y' ends the loop.
    """
    while True:
        sth = input("请输入要查找的商品名称:")

        # Re-prompt on non-numeric input instead of crashing with ValueError.
        # Original author's note: crawling many pages is not recommended.
        while True:
            try:
                pages = int(input("请输入要爬取的页面数:"))
                break
            except ValueError:
                print("请输入一个整数")

        # Percent-encode the keyword so characters such as spaces, '&' or
        # '#' cannot corrupt the query string.
        aurl = 'https://s.taobao.com/search?q=' + quote(sth)
        inlist = []
        for i in range(pages):
            # The 's' offset advances by 44 per page (presumably the number
            # of items per result page -- confirm against the site).
            url = aurl + '&s=' + str(44 * i)
            try:
                html = getHTMLText(url)
                parsePage(inlist, html)
            except Exception as exc:
                # Keep going with the remaining pages, but say what failed
                # instead of skipping the page silently.
                print("第{}页爬取失败: {}".format(i + 1, exc))

        printGoodsList(inlist)
        a = input('是否继续进行商品比价y/n')
        if a not in ('y', 'Y'):
            break
 
 
main()
print('程序结束')
# Keep the process alive so the window of a packaged .exe stays open.
# Sleeping instead of spinning on `pass` avoids pinning a CPU core while idle.
while True:
    time.sleep(1)

标签:__,03,python,3D%,html,sec,taobao,fetch,3D
From: https://blog.51cto.com/husheng/5914902

相关文章

  • 最灵活的UI控件库:React Native UI Lib
    最灵活的UI控件库:ReactNativeUILib猿类素敌已关注32018.02.0414:50:48字数2,513阅读46,152前言我从11年开始写Objective-C。记得当时开源的UI控件库还不是......
  • 社招前端经典vue面试题汇总
    用过pinia吗?有什么优点?1.pinia是什么?在Vue3中,可以使用传统的Vuex来实现状态管理,也可以使用最新的pinia来实现状态管理,我们来看看官网如何解释pinia的:Pinia是Vue的......
  • React Native实现基于HLS协议的视频直播应用
    ReactNative实现基于HLS协议的视频直播应用hsay已关注0.4422017.12.2809:11:31字数921阅读6,395ReactNative(以下简称RN)给我们提供了一种开发原生应用的新......
  • Hyperledger Fabric区块链网络
    这个话题会在概念层面上描述HyperledgerFabric是如何让组织间以区块链网络的形式进行合作的。如果你是一个架构师,管理员或者开发者,你可以通过这个话题来理解在Hyperledg......
  • Taro 3.2 版本正式发布:React Native 支持,王者归来
    Taro3.2版本正式发布:ReactNative支持,王者归来凹凸实验室软件更新1年前阅读222Taro 是一个开放式 跨端跨框架 解决方案,支持使用React/Vue/Nerv等框架来开发......
  • Flink SQL管理平台flink-streaming-platform-web安装搭建
    我的gitee地址:https://gitee.com/ddxygq/bigdata-technical-pai最近看到有人在用flinksql的页面管理平台,大致看了下,尝试安装使用,比原生的flinksql界面确实好用多了,我们......
  • SAP MD04屏幕显示增强
    需求在看物料MRP结果的时候,业务人员想要看到工单批次,工单批次是我们这边工单下达后传输到其他系统,然后由其他系统回传回来的。开发步骤主要使用二代增强出口:M61X0002创......
  • 我写了个免费在线图库生成器,只需三步将你的手机相册搬到线上
    项目背景这几年疫情反复不断,距离上一次我拿起相机甚至可以追溯到两年前,实在是泪目。既然不能出去拍照,那只能继续宅着敲代码度日了,于是就有了这个在线相册的小项目,用来方便......
  • Access中替代SQL Server的case when语句的办法
    最近在做一个用Access的东东,其中用到了casewhen的方式,但是Access是不支持这种语法的,查询知道IIf和Swith可以作为替代,总结如下:IIf函数IIf(expr,truepart,falsepart)......
  • mysql 查询省市区并拼接为一条数据
    SELECTGROUP_CONCAT(t1.region_name,t2.region_name,t3.region_name)ASaddressFROM(SELECT*FROMsys_regionWHEREregion_l......