
Reptile


Chapter 1


1.requests

1.1 Installation

pip install requests    #the HTTP client library used for crawling

1.2 Usage

  1. Fetch the HTML

    import requests
    response = requests.get("https://www.autobone.com.cn/news/")    #获取数据代码
    response.encoding = "gbk"   #指定获取后的数据编码
    print(response.text)    #查看文本信息
  2. Filter the data

    from bs4 import BeautifulSoup
    soup = BeautifulSoup(response.text,"html.parser")    #parse the HTML into a soup object
    div = soup.find(name="div",attrs={"id":"auto-channel-lazyload-article"})   #match the first element
    #name is the tag name, attrs are the tag attributes
    li_list = div.find_all(name="li")   #match all elements
    for li in li_list:
       a = li.find(name="a")
       print(a.attrs.get("href"))    #get the value of the given attribute of the tag

1.3 headers

  1. user-agent

    r1 = requests.get(
       url='https://dig.chouti.com/',
       #放置请求头的信息
       #user-agent 用户设备信息
       headers={
           'user-agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
      }
    )

1.4 data and cookies

  1. data

    #发送post请求
    r2 = requests.post(
       url='。。。',
       #data请求体中的数据
       data={
           'phone':'xxx',
           'password':'xxx',
      },
    )
  2. cookies

    r1 = requests.get(
       url='。。。',
    )
    ​
    r2 = requests.post(
       url='https://dig.chouti.com/login',
       #存放要发送的cookies
       #get_dict()是查看cookies
       cookies=r1.cookies.get_dict()
    )

1.5 Requests and parameters

  1. The request helpers provided by requests

    requests.get(url, params=None, **kwargs)
    requests.post(url, data=None, json=None, **kwargs)
    requests.put(url, data=None, **kwargs)
    requests.head(url, **kwargs)
    requests.delete(url, **kwargs)
    requests.patch(url, data=None, **kwargs)
    requests.options(url, **kwargs)
    ​
    # all of the methods above are built on top of this one;
    # whatever method string you pass in is the HTTP method that gets used
    # (a quick equivalence check follows this block)
    requests.request(method, url, **kwargs)
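
    As a quick check of that claim, the two calls below are equivalent (a minimal sketch; httpbin.org is just a public echo service):

    import requests

    # requests.get(...) simply delegates to requests.request('get', ...)
    r_a = requests.get("http://httpbin.org/get", params={"k1": "v1"})
    r_b = requests.request("get", "http://httpbin.org/get", params={"k1": "v1"})
    print(r_a.url == r_b.url)   # True - both send the same request
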
  2. Parameters

    def param_method_url():
       # requests.request(method='get', url='http://127.0.0.1:8000/test/')
       # requests.request(method='post', url='http://127.0.0.1:8000/test/')
       pass
    ​
    def param_param():
       #param是在url后面跟的数据
       # - 可以是字典、可以是字符串、可以是字节(ascii编码以内)
    ​
       # requests.request(method='get',url='http://127.0.0.1:8000/test/',
       # params={'k1': 'v1', 'k2': '水电费'})
    ​
       # requests.request(method='get',url='http://127.0.0.1:8000/test/',
       # params="k1=v1&k2=水电费&k3=v3&k3=vv3")
    ​
       # requests.request(method='get',url='http://127.0.0.1:8000/test/',
       # params=bytes("k1=v1&k2=k2&k3=v3&k3=vv3", encoding='utf8'))
    ​
       # 错误
       # requests.request(method='get',url='http://127.0.0.1:8000/test/',
       # params=bytes("k1=v1&k2=水电费&k3=v3&k3=vv3", encoding='utf8'))
       pass
    ​
    def param_data():#传请求体
       # 可以是字典、可以是字符串、可以是字节、可以是文件对象
       #data的请求格式
       #GET /index http1.1\r\nhost:c1.com\r\n\r\nuser=alex&pwd=123
    ​
       # requests.request(method='POST',url='http://127.0.0.1:8000/test/',
       # data={'k1': 'v1', 'k2': '水电费'})
    ​
       # requests.request(method='POST',url='http://127.0.0.1:8000/test/',
       # data="k1=v1; k2=v2; k3=v3; k3=v4")
    ​
       # requests.request(method='POST',url='http://127.0.0.1:8000/test/',
       # data="k1=v1;k2=v2;k3=v3;k3=v4",
       # headers={'Content-Type': 'application/x-www-form-urlencoded'})
    ​
       # requests.request(method='POST',url='http://127.0.0.1:8000/test/',
       # data=open('data_file.py', mode='r', encoding='utf-8'), # 文件内容是:k1=v1;k2=v2;k3=v3;k3=v4
       # headers={'Content-Type': 'application/x-www-form-urlencoded'})
    ​
    def param_json():#传请求体
       #json的请求格式,并且会多带一个Content-Type:application/json用来说明
       #GET /index http1.1\r\nhost:c1.com\r\nContent-Type:application/json\r\n\r\n{"user":"alex","pwd":123}
       
       # 将json中对应的数据进行序列化成一个字符串,json.dumps(...)
       # 然后发送到服务器端的body中,并且Content-Type是 {'Content-Type': 'application/json'}
       requests.request(method='POST',url='http://127.0.0.1:8000/test/',json={'k1': 'v1', 'k2': '水电费'})
    ​
    ​
    def param_headers():
       # 发送请求头到服务器端
       requests.request(method='POST',url='http://127.0.0.1:8000/test/',json={'k1': 'v1', 'k2': '水电费'},headers={'Content-Type': 'application/x-www-form-urlencoded'})
    ​
    ​
    def param_cookies():
       # 发送Cookie到服务器端
       requests.request(method='POST',url='http://127.0.0.1:8000/test/',
                        data={'k1': 'v1', 'k2': 'v2'},cookies={'cook1': 'value1'},)
       # 也可以使用CookieJar(字典形式就是在此基础上封装)
       from http.cookiejar import CookieJar
       from http.cookiejar import Cookie
    ​
       obj = CookieJar()
       obj.set_cookie(Cookie(version=0, name='c1', value='v1', port=None, domain='', path='/', secure=False, expires=None,
                             discard=True, comment=None, comment_url=None, rest={'HttpOnly': None}, rfc2109=False,
                             port_specified=False, domain_specified=False, domain_initial_dot=False, path_specified=False)
                      )
       requests.request(method='POST',url='http://127.0.0.1:8000/test/',data={'k1': 'v1', 'k2': 'v2'},cookies=obj)
    ​
    def param_files():
       # 发送文件
       # file_dict = {'f1': open('readme', 'rb')}
       # requests.request(method='POST',url='http://127.0.0.1:8000/test/',files=file_dict)
    ​
       # 发送文件,定制文件名
       # file_dict = {'f1': ('test.txt', open('readme', 'rb'))}
       # requests.request(method='POST',url='http://127.0.0.1:8000/test/',files=file_dict)
    ​
       # 发送文件,定制文件名
       # file_dict = {'f1': ('test.txt', "hahsfaksfa9kasdjflaksdjf")}
       # requests.request(method='POST',url='http://127.0.0.1:8000/test/',files=file_dict)
    ​
       # 发送文件,定制文件名
       # file_dict = {'f1': ('test.txt', "hahsfaksfa9kasdjflaksdjf", 'application/text', {'k1': '0'})}
       # requests.request(method='POST',url='http://127.0.0.1:8000/test/',files=file_dict)
       pass
    ​
    def param_auth():
       #HTTP auth: the credentials are base64-encoded before being sent, which is what auth handles
       #the resulting request header looks like: Authorization: "Basic base64('user:password')"
       from requests.auth import HTTPBasicAuth, HTTPDigestAuth
    ​
       ret = requests.get('https://api.github.com/user', auth=HTTPBasicAuth('wupeiqi', 'sdfasdfasdf'))
       
       ret = requests.get('http://httpbin.org/digest-auth/auth/user/pass', auth=HTTPDigestAuth('user', 'pass'))
    ​
    def param_timeout():
       #timeout: a single number applies to the whole request; a tuple is (connect timeout, read timeout)
       # ret = requests.get('http://google.com/', timeout=1)
       # ret = requests.get('http://google.com/', timeout=(5, 1))
    ​
    def param_allow_redirects():
       #是否允许重定向,就是访问一个网站会跳到另一个网站,是否是否允许跳转。
       ret = requests.get('http://127.0.0.1:8000/test/', allow_redirects=False)
    ​
    def param_proxies():
       #代理,通过是http或https,使用不同的代理。
       # proxies = {"http": "61.172.249.96:80","https": "http://61.185.219.126:3128",}
       # ret = requests.get("http://www.proxy360.cn/Proxy", proxies=proxies)
       # print(ret.headers)
    ​
       # from requests.auth import HTTPProxyAuth
       
       # proxyDict = {'http': '77.75.105.165','https': '77.75.105.165'}
       # auth = HTTPProxyAuth('username', 'mypassword')
       # r = requests.get("http://www.google.com", proxies=proxyDict, auth=auth)
       pass
    ​
    def param_stream():
       #streaming download: receive a large response in chunks instead of all at once
       ret = requests.get('http://127.0.0.1:8000/test/', stream=True)
       print(ret.content)
       ret.close()
    ​
       # from contextlib import closing
       # with closing(requests.get('http://httpbin.org/get', stream=True)) as r:
       # # 在此处理响应。
       # for i in r.iter_content():
       # print(i)
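       # a minimal sketch of streaming a large response straight to disk in chunks
       # (the URL and file name are just placeholders):
       # with requests.get('http://httpbin.org/bytes/102400', stream=True) as r:
       #     with open('big_file.bin', 'wb') as f:
       #         for chunk in r.iter_content(chunk_size=8192):
       #             f.write(chunk)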
    ​
    ​
    def requests_session():
       #第二次访问时就不需要携带cookie了。
       import requests
       session = requests.Session()
    ​
       ### 1、首先登陆任何页面,获取cookie
       i1 = session.get(url="http://dig.chouti.com/help/service")
       ### 2、用户登陆,携带上一次的cookie,后台对cookie中的 gpsd 进行授权
       i2 = session.post(url="http://dig.chouti.com/login",
           data={'phone': "8615131255089",'password': "xxxxxx",'oneMonth': ""})
       i3 = session.post(url="http://dig.chouti.com/link/vote?linksId=8589623",)
  3. Other parameters

    def request(method, url, **kwargs):
       """Constructs and sends a :class:`Request <Request>`.
    ​
      :param method: method for the new :class:`Request` object.
      :param url: URL for the new :class:`Request` object.
      :param params: (optional) Dictionary or bytes to be sent in the query string for the :class:`Request`.
      :param data: (optional) Dictionary, bytes, or file-like object to send in the body of the :class:`Request`.
      :param json: (optional) json data to send in the body of the :class:`Request`.
      :param headers: (optional) Dictionary of HTTP Headers to send with the :class:`Request`.
      :param cookies: (optional) Dict or CookieJar object to send with the :class:`Request`.
      :param files: (optional) Dictionary of 'name': file-like-objects (or {'name': file-tuple}) for multipart encoding upload.
          file-tuple can be a 2-tuple ('filename', fileobj), 3-tuple ('filename', fileobj, 'content_type')
          or a 4-tuple ('filename', fileobj, 'content_type', custom_headers), where 'content-type' is a string
          defining the content type of the given file and custom_headers a dict-like object containing additional headers
          to add for the file.
      :param auth: (optional) Auth tuple to enable Basic/Digest/Custom HTTP Auth.
      :param timeout: (optional) How long to wait for the server to send data
          before giving up, as a float, or a :ref:`(connect timeout, read
          timeout) <timeouts>` tuple.
      :type timeout: float or tuple
      :param allow_redirects: (optional) Boolean. Set to True if POST/PUT/DELETE redirect following is allowed.
      :type allow_redirects: bool
      :param proxies: (optional) Dictionary mapping protocol to the URL of the proxy.
      :param verify: (optional) whether the SSL cert will be verified. A CA_BUNDLE path can also be provided. Defaults to True.
      :param stream: (optional) if False, the response content will be immediately downloaded.
      :param cert: (optional) if String, path to ssl client cert file (.pem). If Tuple, ('cert', 'key') pair.
      :return: :class:`Response <Response>` object
      :rtype: requests.Response
    ​
      Usage::
    ​
        >>> import requests
        >>> req = requests.request('GET', 'http://httpbin.org/get')
        <Response [200]>
      """

     

2.BeautifulSoup

  1. BeautifulSoup is a module that takes an HTML or XML string, parses it, and then lets you quickly locate specific elements with the methods it provides, which makes searching HTML or XML straightforward.

    from bs4 import BeautifulSoup
    html_doc = """
    <html>
    <head>
    <title>The Dormouse's story</title>
    </head>
    <body>
    asdf
    <div class="title">
    <b>The Dormouse's story总共</b>
    <h1>f</h1>
    </div>
    <div class="story">Once upon a time there were three little sisters; and their names were
          <a class="sister0" id="link1">Els<span>f</span>ie</a>,
          <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
          <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;and they lived at the bottom of a well.
    </div>
    ad<br/>sf
    <p class="story">...</p>
    </body>
    </html>
    """
    soup = BeautifulSoup(html_doc, features="lxml")
    # find the first a tag
    tag1 = soup.find(name='a')
    # find all a tags
    tag2 = soup.find_all(name='a')
    # find the tag with id=link2
    tag3 = soup.select('#link2')
  2. Installation:

    pip3 install beautifulsoup4
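
    The examples above pass features="lxml", which also needs the third-party lxml parser; install it as well, or fall back to the built-in "html.parser":

    pip3 install lxml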

2.1 Usage examples

from bs4 import BeautifulSoup
html_doc = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
...
</body>
</html>
"""
soup = BeautifulSoup(html_doc, features="lxml")
  1. name, the tag's name

    tag = soup.find('a')
    name = tag.name # 获取
    print(name)
    tag.name = 'span' # 设置
    print(soup)
  2. attrs, the tag's attributes

    tag = soup.find('a')
    attrs = tag.attrs # 获取
    print(attrs)
    tag.attrs = {'ik':123} # 设置
    tag.attrs['id'] = 'iiiii' # 设置
    print(soup)
  3. children, all direct children of a tag

    body = soup.find('body')
    v = body.children
  4. descendants, all descendants of a tag (children, grandchildren, ...)

    body = soup.find('body')
    v = body.descendants
  5. clear, empty out everything inside the tag (the tag itself is kept)

    tag = soup.find('body')
    tag.clear()
    print(soup)
  6. decompose, recursively delete the tag and everything inside it

    body = soup.find('body')
    body.decompose()
    print(soup)
  7. extract, recursively remove the tag and return what was removed

    body = soup.find('body')
    v = body.extract()
    print(soup)
  8. decode, serialize to a string (including the current tag); decode_contents (excluding the current tag)

    body = soup.find('body')
    v = body.decode()
    v = body.decode_contents()
    print(v)
  9. encode, serialize to bytes (including the current tag); encode_contents (excluding the current tag)

    body = soup.find('body')
    v = body.encode()
    v = body.encode_contents()
    print(v)
  10. find, get the first matching tag

    tag = soup.find('a')
    print(tag)
    tag = soup.find(name='a', attrs={'class': 'sister'}, recursive=True, text='Lacie')
    tag = soup.find(name='a', class_='sister', recursive=True, text='Lacie')
    print(tag)
  11. find_all, get all matching tags

    tags = soup.find_all('a')
    print(tags)
    tags = soup.find_all('a',limit=1)
    print(tags)
    tags = soup.find_all(name='a', attrs={'class': 'sister'}, recursive=True, text='Lacie')
    tags = soup.find(name='a', class_='sister', recursive=True, text='Lacie')
    print(tags)
    # ####### 列表 #######
    v = soup.find_all(name=['a','div'])
    print(v)
    v = soup.find_all(class_=['sister0', 'sister'])
    print(v)
    v = soup.find_all(text=['Tillie'])
    print(v, type(v[0]))
    v = soup.find_all(id=['link1','link2'])
    print(v)
    v = soup.find_all(href=['link1','link2'])
    print(v)
    # ####### 正则 #######
    import re
    rep = re.compile('p')
    rep = re.compile('^p')
    v = soup.find_all(name=rep)
    print(v)
    rep = re.compile('sister.*')
    v = soup.find_all(class_=rep)
    print(v)
    rep = re.compile('http://www.oldboy.com/static/.*')
    v = soup.find_all(href=rep)
    print(v)
    # ####### 方法筛选 #######
    def func(tag):
       return tag.has_attr('class') and tag.has_attr('id')
    v = soup.find_all(name=func)
    print(v)
    ## get,获取标签属性
    tag = soup.find('a')
    v = tag.get('id')
    print(v)
  12. has_attr, check whether the tag has a given attribute

    tag = soup.find('a')
    v = tag.has_attr('id')
    print(v)
  13. get_text, get the text inside the tag

    tag = soup.find('a')
    v = tag.get_text()   # the optional argument is a separator string, not an attribute name
    print(v)
  14. index, get the position of a child within a tag

    tag = soup.find('body')
    v = tag.index(tag.find('div'))
    print(v)
    tag = soup.find('body')
    for i,v in enumerate(tag):
       print(i,v)
  15. is_empty_element, whether the tag is an empty/self-closing element, i.e. one of: 'br', 'hr', 'input', 'img', 'meta', 'spacer', 'link', 'frame', 'base'.

    tag = soup.find('br')
    v = tag.is_empty_element
    print(v)
  16. the current tag's related nodes

    soup.next
    soup.next_element
    soup.next_elements
    soup.next_sibling
    soup.next_siblings
    tag.previous
    tag.previous_element
    tag.previous_elements
    tag.previous_sibling
    tag.previous_siblings
    tag.parent
    tag.parents
  17. searching a tag's related nodes

    tag.find_next(...)
    tag.find_all_next(...)
    tag.find_next_sibling(...)
    tag.find_next_siblings(...)
    tag.find_previous(...)
    tag.find_all_previous(...)
    tag.find_previous_sibling(...)
    tag.find_previous_siblings(...)
    tag.find_parent(...)
    tag.find_parents(...) # 参数同find_all
  18. select, select_one: CSS selectors

    soup.select("title")
    soup.select("p nth-of-type(3)")
    soup.select("body a")
    soup.select("html head title")
    tag = soup.select("span,a")
    soup.select("head > title")
    soup.select("p > a")
    soup.select("p > a:nth-of-type(2)")
    soup.select("p > #link1")
    soup.select("body > a")
    soup.select("#link1 ~ .sister")
    soup.select("#link1 + .sister")
    soup.select(".sister")
    soup.select("[class~=sister]")
    soup.select("#link1")
    soup.select("a#link2")
    soup.select('a[href]')
    soup.select('a[href="http://example.com/elsie"]')
    soup.select('a[href^="http://example.com/"]')
    soup.select('a[href$="tillie"]')
    soup.select('a[href*=".com/el"]')
    from bs4.element import Tag
    def default_candidate_generator(tag):
       for child in tag.descendants:
           if not isinstance(child, Tag):
               continue
           if not child.has_attr('href'):
               continue
           yield child
    tags = soup.find('body').select("a", _candidate_generator=default_candidate_generator)
    print(type(tags), tags)
    from bs4.element import Tag
    def default_candidate_generator(tag):
       for child in tag.descendants:
           if not isinstance(child, Tag):
               continue
           if not child.has_attr('href'):
               continue
           yield child
    tags = soup.find('body').select("a", _candidate_generator=default_candidate_generator, limit=1)
    print(type(tags), tags)
  19. the tag's text content

    tag = soup.find('span')
    print(tag.string)     # 获取
    tag.string = 'new content' # 设置
    print(soup)
    tag = soup.find('body')
    print(tag.string)
    tag.string = 'xxx'
    print(soup)
    tag = soup.find('body')
    v = tag.stripped_strings # a generator over all the whitespace-stripped text fragments inside the tag
    print(list(v))
  20. append, append a tag inside the current tag

    tag = soup.find('body')
    tag.append(soup.find('a'))
    print(soup)
    from bs4.element import Tag
    obj = Tag(name='i',attrs={'id': 'it'})
    obj.string = '我是一个新来的'
    tag = soup.find('body')
    tag.append(obj)
    print(soup)
  21. insert, insert a tag at a given position inside the current tag

    from bs4.element import Tag
    obj = Tag(name='i', attrs={'id': 'it'})
    obj.string = '我是一个新来的'
    tag = soup.find('body')
    tag.insert(2, obj)
    print(soup)
  22. insert_after, insert_before: insert after or before the current tag

    from bs4.element import Tag
    obj = Tag(name='i', attrs={'id': 'it'})
    obj.string = '我是一个新来的'
    tag = soup.find('body')
    tag.insert_before(obj)
    tag.insert_after(obj)
    print(soup)
  23. replace_with, replace the current tag with the given tag

    from bs4.element import Tag
    obj = Tag(name='i', attrs={'id': 'it'})
    obj.string = '我是一个新来的'
    tag = soup.find('div')
    tag.replace_with(obj)
    print(soup)
  24. creating relationships between tags

    tag = soup.find('div')
    a = soup.find('a')
    tag.setup(previous_sibling=a)
    print(tag.previous_sibling)
  25. wrap, wrap the current tag inside the given tag

    from bs4.element import Tag
    obj1 = Tag(name='div', attrs={'id': 'it'})
    obj1.string = '我是一个新来的'
    tag = soup.find('a')
    v = tag.wrap(obj1)
    print(soup)
    tag = soup.find('a')
    v = tag.wrap(soup.find('p'))
    print(soup)
  26. unwrap, remove the current tag but keep its contents

    tag = soup.find('a')
    v = tag.unwrap()
    print(soup)

3. Supplementary notes

3.1 Constraints (abstract methods)

  • Used to force subclasses of a class to implement a given method; without it the class cannot be used. (A short demonstration follows the example below.)

    import abc
    class Base(metaclass=abc.ABCMeta):
       @abc.abstractmethod  # mark the method as abstract
       def send(self):pass
      
       def func(self):
           print("...")
    class Foo(Base):  # a subclass must define the marked method itself, otherwise it cannot be used
       def send(self):
           print("...")
    obj = Foo()
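
    A quick demonstration of the constraint (a minimal sketch; the class name Bad is made up for illustration):

    import abc

    class Base(metaclass=abc.ABCMeta):
       @abc.abstractmethod
       def send(self): pass

    class Bad(Base):   # forgets to implement send()
       pass

    try:
       Bad()
    except TypeError as e:
       print(e)   # Can't instantiate abstract class Bad ...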
    ​

     

4. Exercises

4.1 1.2/1~2

import requests
from bs4 import BeautifulSoup
​
def Reptile():
   response = requests.get("https://www.autohome.com.cn/news/")
   response.encoding = "gbk"
   print(response.text)
   soup = BeautifulSoup(response.text, "html.parser")
   div = soup.find(name="div", attrs={"id": "auto-channel-lazyload-article"})
​
   li_list = div.find_all(name="li")
​
   for li in li_list:
​
       title = li.find(name="h3")
       if not title:
           continue
       p = li.find(name="p")
       a = li.find(name="a")
​
       print(title.text)
       print(a.attrs.get("href"))
       print(p.text)
​
       img = li.find(name="img")
       src = img.get("src")
       print(src)
       try:
           # 再次发起请求,下载照片
           file_name = src.rsplit("/", maxsplit=1)[1]
           print(file_name)
           ret = requests.get(src)
           with open(file_name, "wb") as f:
               f.write(ret.content)
       except Exception as e:
           continue
​
​
# 按间距中的绿色按钮以运行脚本。
if __name__ == '__main__':
   Reptile()

4.2 1.3

import requests
from bs4 import BeautifulSoup
​
r1 = requests.get(
   url='https://dig.chouti.com/',
   #放置请求头的信息
   #user-agent 用户设备信息
   headers={
       'user-agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
  }
)
​
soup = BeautifulSoup(r1.text,'html.parser')
​
# 标签对象
content_list = soup.find(name='div',id='content-list')
# print(content_list)
# [标签对象,标签对象]
item_list = content_list.find_all(name='div',attrs={'class':'item'})
for item in item_list:
   a = item.find(name='a',attrs={'class':'show-content color-chag'})
   print(a.text.strip())
   # print(a.text)

4.3 1.3|1.4

import requests
# 1. 查看首页
r1 = requests.get(
   url='https://dig.chouti.com/',
   headers={
       'user-agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
  }
)
​
# 2. 提交用户名和密码
r2 = requests.post(
   url='https://dig.chouti.com/login',
   headers={
       'user-agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
  },
   #需要登陆请求体中需要携带,用户名和密码
   data={
       'phone':'8613121758648',
       'password':'woshiniba',
       'oneMonth':1
  },
   #应为抽屉的方式是通过,第一次访问在cookies里有一个随机数,只有在登陆时携带才可以。
   cookies=r1.cookies.get_dict()
)
​
​
# 3. 点赞
r3 = requests.post(
   url='https://dig.chouti.com/link/vote?linksId=20435396',
   headers={
       'user-agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
  },
   #将用户登陆后的cookies携带
   cookies=r1.cookies.get_dict()
)
print(r3.text)

4.4 1.2

import requests
from bs4 import BeautifulSoup
​
# ############## 方式一 ##############
#
# # 1. 访问登陆页面,获取 authenticity_token
i1 = requests.get('https://github.com/login')
soup1 = BeautifulSoup(i1.text, features='lxml')
tag = soup1.find(name='input', attrs={'name': 'authenticity_token'})
authenticity_token = tag.get('value')
c1 = i1.cookies.get_dict()
i1.close()
#
# # 1. 携带authenticity_token和用户名密码等信息,发送用户验证
form_data = {
"authenticity_token": authenticity_token,
   "utf8": "",
   "commit": "Sign in",
   "login": "[email protected]",
   'password': 'xxoo'
}
#
i2 = requests.post('https://github.com/session', data=form_data, cookies=c1)
c2 = i2.cookies.get_dict()
c1.update(c2)
i3 = requests.get('https://github.com/settings/repositories', cookies=c1)
​
soup3 = BeautifulSoup(i3.text, features='lxml')
list_group = soup3.find(name='div', class_='listgroup')
#
from bs4.element import Tag
#
for child in list_group.children:
   if isinstance(child, Tag):
       project_tag = child.find(name='a', class_='mr-1')
       size_tag = child.find(name='small')
       temp = "项目:%s(%s); 项目路径:%s" % (project_tag.get('href'), size_tag.string, project_tag.string, )
       print(temp)

Chapter 2

1.scrapy

1. Installation

  • On Windows you need to install twisted, pywin32 and scrapy.

    pip install twisted
    pip install pywin32
    pip install scrapy

1.1 Basics

  1. Create a project, create a spider, and run it (a concrete example follows the commands below).

    #create a project and give it a name
    scrapy startproject project_name
    #create a spider, giving it a name and the site to crawl, e.g. "taobao.com"
    scrapy genspider name url
    #run the spider (add --nolog to hide the log output)
    scrapy crawl name
    scrapy crawl name --nolog
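
    For example, with the chouti spider used throughout these notes (the project name xdb is only an example):

    scrapy startproject xdb
    cd xdb
    scrapy genspider chouti chouti.com
    scrapy crawl chouti --nolog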
  2. What the generated project files are for

    - project name
    - project name
      - spiders          #the spider files
      - items.py         #item (structured data) definitions, used for persistence
      - middlewares.py   #middlewares
      - pipelines.py     #persistence
      - settings.py      #configuration (crawler-related)
      - scrapy.cfg       #configuration (deployment-related)
  3. A first look at a spider file

    import scrapy
    #HtmlResponse is the class of the freshly downloaded response; it offers many more helper methods
    from scrapy.http.response.html import HtmlResponse
    #if the spider will not run because of console encoding errors, try:
    import sys,os,io
    sys.stdout=io.TextIOWrapper(sys.stdout.buffer,encoding="gb18030")
    ​
    class ChoutiSpider(scrapy.Spider):
       name = "chouti"#the spider's name
       allowed_domains = ["chouti.com"]#a focused crawl: only this site is crawled
       start_urls = ["http://chouti.com/"]#the first URLs to be crawled
       
       def parse(self,response):#the default callback; response holds the downloaded page
           f = open("news.log",mode="a+")
    #.xpath selects nodes: // searches all descendants of the document, .// searches descendants of the current node, / selects direct children.
           item_list = response.xpath('//div[@class="link-con"]/div')
           for item in item_list:
               #text() gets the text content, extract_first() returns the first match, @href gets the href attribute.
               text = item.xpath('.//div[@class="link-detail"]/a/text()').extract_first()
               href = item.xpath('.//div[@class="link-detail"]/a/@href').extract_first()
               print(text.strip())
               f.write(href+"\n")
           f.close()
           #extract() converts the selectors into plain values
           page_list = response.xpath('//div[@id="dig-lcpage"]//a/@href').extract()
           for page in page_list:
               from scrapy.http import Request
               page = "https://dig.chouti.com"+page
               #join the URL, wrap it in a Request and yield it; parse is then called again for each new page
               yield Request(url=page,callback=self.parse)
           
  4. Dividing the work between pipelines.py, items.py, settings.py and the spider

    pipelines.py
    ​
    from itemadapter import ItemAdapter
    from scrapy.exceptions import DropItem
    class AaPipeline:
       
       def __init__(self,path):
           self.f = None
           self.path = path
       @classmethod
       def from_crawler(cls,crawler):
           """初始化时候,用于创建pipeline对象"""
           path = crawler.settings.get("HREF_FILE_PATH")#在所有配置文件中寻找
           return cls(path)
       
       def open_spider(self,spider):
           """爬虫开始时执行"""
           self.f = open(self.path,"a+")
       def process_item(self, item, spider):  # used together with items.py
           self.f.write(item["url"]+"\n")
           return item  # if you do not return the item, the next pipeline has nothing to work on
           # raise DropItem()  # raise this instead if the item should not be processed any further
       def close_spider(self,spider):
           """爬虫结束时执行"""
           self.f.close()
    ​
    items.py
    ​
    import scrapy
    ​
    class AaItem(scrapy.Item):
       # define the fields for your item here like:
       # name = scrapy.Field()
       title= scrapy.Field()#需要将其定义
       url = scrapy.Field()
    ​
    settings.py
    ​
    ITEM_PIPELINES = {#这个默认是注释掉的,取消注释后pipelines才可以用。
       'aa.pipelines.AaPipeline': 300,
    }
    HREF_FILE_PATH = "www.log"
    ​
    spiders/chouti.py
    ​
    import scrapy
    from aaa.items import AaItem#调用那个类
    ​
    class ChoutiSpider(scrapy.Spider):
       name = "chouti"
       allowed_domains = ["chouti.com"]
       start_urls = ["http://chouti.com/"]
       
       def parse(self,response):
           item_list = response.xpath('//div[@class="link-con"]/div')
           for item in item_list:
               text = item.xpath('.//div[@class="link-detail"]/a/text()').extract_first()
               href = item.xpath('.//div[@class="link-detail"]/a/@href').extract_first()
               print(text.strip())
               yield AaItem(title=text,url=href)#将数据传入刚定义的方法中
               
           page_list = response.xpath('//div[@id="dig-lcpage"]//a/@href').extract()
           for page in page_list:
               from scrapy.http import Request
               page = "https://dig.chouti.com"+page
               yield Request(url=page,callback=self.parse)

1.2 Deduplication and depth

  1. The same URL tends to show up repeatedly while crawling. Scrapy already provides deduplication by default; here we build an equivalent filter ourselves to see how it works.

    settings.py
    ​
    #this points at scrapy's own filter by default; change it to point at our class
    #DUPEFILTER_CLASS = "scrapy.dupefilter.RFPDupeFilter"
    DUPEFILTER_CLASS = "aaa.dupefilters.AaaDupeFilter"
    ​
    DEPTH_LIMIT = 3#限制深度为3
    ​
    dupefilters.py
    ​
    from scrapy.dupefilter import BaseDupeFilter
    #url有的长有的短,有些url后面的值顺序调转一下md5值就不一样,我们只能用他了。
    from scrapy.utils.request import request_fingerprint
    ​
    class AaaDupeFilter(BaseDupeFilter):
       def __init__(self):
           self.visited_fd = set()#初始化一个集合
           
       @classmethod
       def from_settings(cls,settings):
           return cls()#调用自己
       def request_seen(self,request):
           fd = request_fingerprint(request=request)#将url转为指定长度的值
           if fd in self.visited_fd:#判断是否被请求过
               return True
           self.visited_fd.add(fd)#没有请求果就添加一下
            
       def open(self):
           print("开始")
       def close(self,reason):
           print("结束")
       def log(self,request,spider):
           print("日志")
    ​
    spiders/chouti.py
    ​
    import scrapy
    from aaa.items import AaItem#调用那个类
    ​
    class ChoutiSpider(scrapy.Spider):
       name = "chouti"
       allowed_domains = ["chouti.com"]
       start_urls = ["http://chouti.com/"]
       
       def parse(self,response):
           
           print(response.request.url,response.meta.get("depth",0))#查看当前请求的url,和深度
           item_list = response.xpath('//div[@class="link-con"]/div')
           for item in item_list:
               text = item.xpath('.//div[@class="link-detail"]/a/text()').extract_first()
               href = item.xpath('.//div[@class="link-detail"]/a/@href').extract_first()
               print(text.strip())
               
           page_list = response.xpath('//div[@id="dig-lcpage"]//a/@href').extract()
           for page in page_list:
               from scrapy.http import Request
               page = "https://dig.chouti.com"+page
               #dont_filter=True时去去重无效
               yield Request(url=page,callback=self.parse,dont_filter=True)

1.3 Carrying cookies

  1. Logging in to chouti by carrying cookies

    # -*- coding: utf-8 -*-
    import scrapy
    from scrapy.http.response.html import HtmlResponse
    from xdb.items import XdbItem
    import scrapy
    from scrapy.http.cookies import CookieJar
    from scrapy.http import Request
    from urllib.parse import urlencode
    ​
    class ChoutiSpider(scrapy.Spider):
       name = 'chouti'
       allowed_domains = ['chouti.com']
       start_urls = ['https://dig.chouti.com/']
       cookie_dict = {}#做一个全局字典
       def parse(self, response):
           # 去响应头中获取cookie,cookie保存在cookie_jar对象
           cookie_jar = CookieJar()
           cookie_jar.extract_cookies(response, response.request)
           # 去对象中将cookie解析到字典
           for k, v in cookie_jar._cookies.items():
               for i, j in v.items():
                   for m, n in j.items():
                       self.cookie_dict[m] = n.value
    ​
           yield Request(
               url='https://dig.chouti.com/login',
               method='POST',
               body="phone=8613121758648&password=woshiniba&oneMonth=1",
               cookies=self.cookie_dict,#将cookies携带
               headers={#设置请求头
                   'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'
              },
               callback=self.check_login#写一个回调函数
          )
    ​
       def check_login(self,response):
           print(response.text)
    ​
           yield Request(
               url='https://dig.chouti.com/all/hot/recent/1',
               cookies=self.cookie_dict,
               callback=self.index
          )
    ​
       def index(self,response):#寻找到新闻并点赞
           news_list = response.xpath('//div[@id="content-list"]/div[@class="item"]')
           for new in news_list:
               link_id = new.xpath('.//div[@class="part2"]/@share-linkid').extract_first()
               yield Request(
                   url='http://dig.chouti.com/link/vote?linksId=%s' % (link_id,),
                   method='POST',
                   cookies=self.cookie_dict,
                   callback=self.check_result
              )
    ​
           page_list = response.xpath('//div[@id="dig_lcpage"]//a/@href').extract()
           for page in page_list:
               page = "https://dig.chouti.com" + page
               yield Request(url=page, callback=self.index)  # https://dig.chouti.com/all/hot/recent/2
    ​
       def check_result(self, response):
           print(response.text)

1.4 Start URLs

  1. How the start URLs work internally: the scrapy engine asks the spider for its start requests by calling start_requests(); the return value must be iterable (it is wrapped with iter()), and the engine then calls __next__() repeatedly, putting every request into the scheduler. (A simplified sketch of this follows the code below.)

    # -*- coding: utf-8 -*-
    import scrapy
    from scrapy.http.response.html import HtmlResponse
    from xdb.items import XdbItem
    import scrapy
    from scrapy.http.cookies import CookieJar
    from scrapy.http import Request
    from urllib.parse import urlencode
    ​
    class ChoutiSpider(scrapy.Spider):
       name = 'chouti'
       allowed_domains = ['chouti.com']
       start_urls = ['https://dig.chouti.com/']
       
       def start_requests(self):
           for url in self.start_urls:
               yield Request(url=url)
       
       def parse(self, response):
           pass
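
    A rough sketch of what the engine does with that return value (simplified pseudocode, not scrapy's real implementation; fake_engine is a made-up name):

    def fake_engine(spider):
       # take whatever start_requests() returned ...
       result = spider.start_requests()
       # ... make sure it is an iterator ...
       it = iter(result)
       # ... and pull the requests out one by one, handing them to the scheduler
       while True:
           try:
               request = it.__next__()
           except StopIteration:
               break
           print("put into scheduler:", request)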

1.5 Proxies

  1. Built-in proxy support

     • Scrapy has built-in proxy support. The proxies are passed through the process environment variables, and a variable is treated as a proxy if its name ends in "_PROXY". They have to be set before the spider starts requesting, which is why the start_requests method is used here; alternatively a proxy can be passed per request via meta.

      # -*- coding: utf-8 -*-
      import scrapy
      from scrapy.http import Request
      ​
      class ChoutiSpider(scrapy.Spider):
         name = 'chouti'
         allowed_domains = ['chouti.com']
         start_urls = ['https://dig.chouti.com/']
         
          def start_requests(self):
              import os
              #set the proxies in the environment variables
              os.environ["HTTPS_PROXY"] = "http://root:[email protected]:9999/"
              os.environ["HTTP_PROXY"] = "192.168.11.11"
              #or pass the proxy per request through meta
              for url in self.start_urls:
                  yield Request(url=url,meta={"proxy":"http://root:[email protected]:9999/"})
         
          def parse(self, response):
              pass
  2. Custom proxy middleware

    #settings.py
    #自定义的代理需要注册才可以使用,只要注册上自带的就不会被使用了。
    DOWNLOADER_MIDDLEWARES = {
      #'xdb.middlewares.XdbDownloaderMiddleware': 543,
       'xdb.proxy.XdbProxyMiddleware':751,
    }
    ​
    ​
    #proxy.py
    #这两种都可是完成这个功能
    import base64
    import random
    from six.moves.urllib.parse import unquote
    try:
       from urllib2 import _parse_proxy
    except ImportError:
       from urllib.request import _parse_proxy
    from six.moves.urllib.parse import urlunparse
    from scrapy.utils.python import to_bytes
    ​
    class XdbProxyMiddleware(object):
    ​
       def _basic_auth_header(self, username, password):
           user_pass = to_bytes(
               '%s:%s' % (unquote(username), unquote(password)),
               encoding='latin-1')
           return base64.b64encode(user_pass).strip()
    ​
       def process_request(self, request, spider):
           PROXIES = [
               "http://root:[email protected]:9999/",
               "http://root:[email protected]:9999/",
               "http://root:[email protected]:9999/",
               "http://root:[email protected]:9999/",
               "http://root:[email protected]:9999/",
               "http://root:[email protected]:9999/",
          ]
           url = random.choice(PROXIES)
    ​
           orig_type = ""
           proxy_type, user, password, hostport = _parse_proxy(url)
           proxy_url = urlunparse((proxy_type or orig_type, hostport, '', '', '', ''))
    ​
           if user:
               creds = self._basic_auth_header(user, password)
           else:
               creds = None
           request.meta['proxy'] = proxy_url
           if creds:
               request.headers['Proxy-Authorization'] = b'Basic ' + creds
    ​
    ​
    class DdbProxyMiddleware(object):
       def process_request(self, request, spider):
           PROXIES = [
              {'ip_port': '111.11.228.75:80', 'user_pass': ''},
              {'ip_port': '120.198.243.22:80', 'user_pass': ''},
              {'ip_port': '111.8.60.9:8123', 'user_pass': ''},
              {'ip_port': '101.71.27.120:80', 'user_pass': ''},
              {'ip_port': '122.96.59.104:80', 'user_pass': ''},
              {'ip_port': '122.224.249.122:8088', 'user_pass': ''},
          ]
           proxy = random.choice(PROXIES)
           if proxy['user_pass'] is not None:
               request.meta['proxy'] = to_bytes("http://%s" % proxy['ip_port'])
               encoded_user_pass = base64.b64encode(to_bytes(proxy['user_pass']))
               request.headers['Proxy-Authorization'] = to_bytes('Basic ' + encoded_user_pass)
           else:
               request.meta['proxy'] = to_bytes("http://%s" % proxy['ip_port'])

     

1.6 Selectors

  1. The selector is just XPath, and it can also be used on its own, outside of a spider

    #!/usr/bin/env python
    # -*- coding:utf-8 -*-
    from scrapy.selector import Selector, HtmlXPathSelector
    from scrapy.http import HtmlResponse
    html = """<!DOCTYPE html>
    <html>
      <head lang="en">
          <meta charset="UTF-8">
          <title></title>
      </head>
      <body>
          <ul>
              <li class="item-"><a id='i1' href="link.html">first item</a></li>
              <li class="item-0"><a id='i2' href="llink.html">first item</a></li>
              <li class="item-1"><a href="llink2.html">second item<span>vv</span></a></li>
          </ul>
          <div><a href="llink2.html">second item</a></div>
      </body>
    </html>
    """
    #伪造request
    response = HtmlResponse(url='http://example.com', body=html,encoding='utf-8')
    # hxs = HtmlXPathSelector(response)
    # a = Selector(response=response)
    # hxs = a.xpath('//a')#找a标签
    # hxs = a.xpath('//a[2]')   #找a标签中的第二个
    # hxs = a.xpath('//a[@id]')#找a标签有id属性的
    # hxs = a.xpath('//a[@id="i1"]')#找a标签有id=i1的7
    # hxs = a.xpath('//a[@href="link.html"][@id="i1"]')#找a标签href=link.html并且id=i1的
    # hxs = a.xpath('//a[contains(@href, "link")]')#找a标签href属性中有link的
    # hxs = a.xpath('//a[starts-with(@href, "link")]')#找a标签href属性是否以link开头
    # hxs = a.xpath('//a[re:test(@id, "i\d+")]')#找a标签id=i\d+的
    # hxs = a.xpath('//a[re:test(@id, "i\d+")]/text()').extract()#找a标签id=i\d+的所有文本
    # hxs = a.xpath('//a[re:test(@id, "i\d+")]/@href').extract()#找a标签id=i\d+的所有href
    # hxs = a.xpath('/html/body/ul/li/a/@href').extract()#找a标签所有href
    # hxs = a.xpath('//body/ul/li/a/@href').extract_first()#找a标签第一个href
    # print(hxs)
    
    # ul_list = Selector(response=response).xpath('//body/ul/li')
    # for item in ul_list:
    #     v = item.xpath('./a/span')
    #     # 或
    #     # v = item.xpath('a/span')
    #     # 或
    #     # v = item.xpath('*/a/span')
    #     print(v)

2. Algorithms

2.1 Algorithm basics

  • Time complexity: we do not care about the exact running time, only the approximate growth rate.

    For example:
    #a single print costs a fixed amount of time; three prints are still treated the same -
    #the 3 never shows up, because constant factors are dropped.
    print("aaa")#O(1)

    print("bbb")#O(1)
    print("bbb")
    print("bbb")
    #mathematically, log asks "2 to what power equals n", e.g. log2(64)=6
    #whenever the problem is halved at every step the complexity is logn; logn means log2(n), since computers mostly work in base 2
    while n>1:#O(logn), i.e. O(log2 n)
       print(n)
       n = n//2
      • Time complexity is an expression used to estimate how an algorithm's running time grows.

      • In general, an algorithm with a higher time complexity is slower than one with a lower complexity.

      • Common time complexities, from fast to slow: O(1) < O(logn) < O(n) < O(nlogn) < O(n^2) < O(n^3) < O(2^n)

  • Recursion

    def func1(x):# 5,4,3,2,1
       if x>0:
           print(x)
           func1(x-1)
    def func2(x):# 1,2,3,4,5
       if x>0:
           func2(x-1)
           print(x)

2.2 Searching

  1. A list can be searched sequentially (linear search) or with binary search. Binary search works on a sorted list: comparing the target with the middle element of the candidate range discards half of the candidates at each step. (A linear-search sketch follows the code below for comparison.)

    def binary_search(li,val):
       low = 0
       high = len(li) - 1
       while low <= high:
           mid = (low+high) //2
           if li[mid] > val:
               high = mid -1
           elif li[mid] < val:
               low = mid + 1
           else:
               return mid
       else:
           return -1
    li = range(0,1222222)
    print(binary_search(li,305))
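
    For comparison, a minimal sketch of the linear (sequential) search mentioned above - O(n), but the list does not need to be sorted:

    def linear_search(li, val):
       for ind, v in enumerate(li):
           if v == val:
               return ind
       return -1

    print(linear_search(list(range(0, 100)), 30))   # 30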

2.3 Sorting

  1. Bubble sort: walk the list comparing each pair of adjacent elements; if the first is larger than the second, swap them.

    def bubble_sort(li):
       for i in range(0,len(li)-1):
           for j in range(0,len(li)-i-1):
               if li[j] >li[j+1]:
                   li[j],li[j+1] = li[j+1],li[j]
    import random
    li = list(range(10000))
    random.shuffle(li)
    bubble_sort(li)
    print(li)
  2. Selection sort: each pass finds the smallest remaining element and swaps it into the first unsorted position.

    def select_sort(li):
       for i in range(len(li)-1):
           min_loc = i
           for j in range(i+1,len(li)):
               if li[min_loc] > li[j]:
                   min_loc = j
           li[min_loc],li[i] = li[i],li[min_loc]
    import random
    li = list(range(10000))
    random.shuffle(li)
    select_sort(li)
    print(li)
  3. Insertion sort: the list is split into a sorted part and an unsorted part; at first the sorted part holds one element. Each step takes one element from the unsorted part and inserts it into its proper place in the sorted part, until the unsorted part is empty.

    def insert_sort(li):
       for i in range(1,len(li)):
           tmp = li[i]
           j = i - 1
           while j >= 0 and tmp < li[j]:
               li[j + 1] = li[j]
               j = j - 1
           li[j + 1] = tmp
    import random
    li = list(range(10000))
    random.shuffle(li)
    insert_sort(li)
    print(li)
     • Space complexity: O(1); time complexity: O(n^2)

  4. Quick sort: pick an element p (the first one) and move it to its final position so that everything to its left is smaller and everything to its right is larger; then sort both sides recursively.

    def quick_sort(li,left,right):
       if left < right:
           mid = partition(li,left,right)
           quick_sort(li,left,mid-1)
           quick_sort(li,mid+1,right)
    def partition(li,left,right):
       tmp = li[left]
       while left < right:
           while left < right and li[right] >= tmp:
               right -= 1
           li[left] = li[right]
           while left < right and li[left] <= tmp:
               left += 1
           li[right] = li[left]
       li[left] = tmp
       return left
    import random
    li = list(range(10000))
    random.shuffle(li)
    quick_sort(li,0,len(li)-1)
    print(li)
       
  5. Heap sort

     1. A tree is a data structure that can be defined recursively.

     2. A binary tree is a tree in which every node has at most two children. A full binary tree has every level completely filled; a complete binary tree may only have nodes missing on the right side of the lowest level.

     3. When a binary tree is stored in an array, the node at index n has its left child at index 2n+1 and its right child at index 2n+2 (a small worked example follows below).

     4. Heap sort uses a heap, which is either a max-heap or a min-heap: in a max-heap every node is greater than or equal to its children, in a min-heap every node is smaller than or equal to its children.
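
    A quick worked example of that index arithmetic, using a small max-heap stored as a list:

    li = [9, 7, 8, 3, 5]           # a max-heap laid out in an array
    n = 1                          # the node at index 1 holds 7
    print(li[2*n+1], li[2*n+2])    # its children: 3 5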

    def sift(li,low,high):
       #sift down: low is the root of the subtree, high is the last valid index of the heap
       i = low
       j = 2*i+1
       tmp = li[low]
       while j <= high:
           if j+1 <= high and li[j+1] > li[j]:
               j += 1
           if li[j] > tmp:
               li[i] = li[j]
               i = j
               j = 2*i+1
           else:
               break
       li[i] = tmp
    def heap_sort(li):
       n = len(li)
       for i in range(n // 2 -1, -1,-1):
           sift(li,i,n-1)
       for i in range(n-1,-1,-1):
           li[0],li[i] = li[i],li[0]
           sift(li,0,i-1)
    import random
    li = list(range(100000,1,-1))
    random.shuffle(li)
    heap_sort(li)
    print(li)
    #python's standard library also provides a heap implementation (heapq)
    import heapq
    li = [4,3,2,1,5,6,7,8]
    heapq.heapify(li)
    print(li)
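
    Note that heapify only turns the list into a (min-)heap; to get sorted output you pop repeatedly:

    import heapq

    li = [4,3,2,1,5,6,7,8]
    heapq.heapify(li)   # li is now a min-heap, not a sorted list
    print([heapq.heappop(li) for _ in range(len(li))])   # [1, 2, 3, 4, 5, 6, 7, 8]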
  6. Merge sort: split the list down to single elements, then merge the sorted pieces back together in order.

    def merge(li,low,mid,high):
       i = low
       j = mid +1
       ltmp = []
       while i <= mid and j <= high:
           if li[i] <= li[j]:
               ltmp.append(li[i])
               i += 1
           else:
               ltmp.append(li[j])
               j += 1
       while i <= mid:
           ltmp.append(li[i])
           i += 1
       while j <= high:
           ltmp.append(li[j])
           j += 1
       li[low:high + 1] = ltmp
    def merge_sort(li,low,high):
       if low < high:
           mid = (low + high) // 2
           merge_sort(li,low,mid)
           merge_sort(li,mid+1,high)
           merge(li,low,mid,high)
    li = [10,4,5,6,7,8,3,2]
    merge_sort(li,0,len(li)-1)
    print(li)

Chapter 3

1. Middleware and custom commands

1.1 Downloader middleware

  1. Downloader middleware is mainly used to modify requests before they are downloaded.

    # settings.py
    #注册中间件
    DOWNLOADER_MIDDLEWARES = {
      #'xdb.middlewares.XdbDownloaderMiddleware': 543,
       'xdb.md.Md1':666,
    }
    # md.py
    ​
    from scrapy.http import HtmlResponse
    from scrapy.http import Request
    ​
    class Md1(object):
       @classmethod
       def from_crawler(cls, crawler):
           # This method is used by Scrapy to create your spiders.
           s = cls()
           return s
    ​
       def process_request(self, request, spider):
           # Called for each request that goes through the downloader
           # middleware.
    ​
           # Must either:
           # - return None: continue processing this request
           # - or return a Response object
           # - or return a Request object
           # - or raise IgnoreRequest: process_exception() methods of
           #   installed downloader middleware will be called
           print('md1.process_request',request)
           # 1. Return a Response object: whatever you return here is what the spider ends up receiving
           # import requests
           # result = requests.get(request.url)
           # return HtmlResponse(url=request.url, status=200, headers=None, body=result.content)
           # 2. Return a Request: it is fed back into scheduling/filtering rather than being downloaded
           # return Request('https://dig.chouti.com/r/tec/hot/1')

           # 3. Raise an exception
           # from scrapy.exceptions import IgnoreRequest
           # raise IgnoreRequest

           # 4. Modify the request in place - the usual use case (*)
           # request.headers['user-agent'] = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
    ​
           pass
    ​
       def process_response(self, request, response, spider):
           # Called with the response returned from the downloader.
    ​
           # Must either;
           # - return a Response object
           # - return a Request object
           # - or raise IgnoreRequest
           print('m1.process_response',request,response)
           return response
    ​
       def process_exception(self, request, exception, spider):
           #使用该方法去接收错误
           # Called when a download handler or a process_request()
           # (from other downloader middleware) raises an exception.
    ​
           # Must either:
           # - return None: continue processing this exception
           # - return a Response object: stops process_exception() chain
           # - return a Request object: stops process_exception() chain
           pass

1.2 Spider middleware

  1. Spider middleware is used much less often; a rough understanding is enough.

    #settings.py
    #设置爬虫中间件
    SPIDER_MIDDLEWARES = {
       'xdb.sd.Sd1': 666,
    }
    ​
    class Sd1(object):
       # Not all methods need to be defined. If a method is not defined,
       # scrapy acts as if the spider middleware does not modify the
       # passed objects.
    ​
       @classmethod
       def from_crawler(cls, crawler):
           # This method is used by Scrapy to create your spiders.
           s = cls()
           return s
    ​
       def process_spider_input(self, response, spider):
           # Called for each response that goes through the spider
           # middleware and into the spider.
    ​
           # Should return None or raise an exception.
           return None
    ​
       def process_spider_output(self, response, result, spider):
           # Called with the results returned from the Spider, after
           # it has processed the response.
    ​
           # Must return an iterable of Request, dict or Item objects.
           for i in result:
               yield i
    ​
       def process_spider_exception(self, response, exception, spider):
           # Called when a spider or process_spider_input() method
           # (from other spider middleware) raises an exception.
    ​
           # Should return either None or an iterable of Response, dict
           # or Item objects.
           pass
    ​
       # runs only once, when the spider starts; it receives what the spider's start_requests returned
       def process_start_requests(self, start_requests, spider):
           # Called with the start requests of the spider, and works
           # similarly to the process_spider_output() method, except
           # that it doesn’t have a response associated.
    ​
           # Must return only requests (not items).
           for r in start_requests:
               yield r
    ​

1.3 Custom commands

  1. Running a single spider from a script; put the file in the directory where you would normally run the command.

    import sys
    from scrapy.cmdline import execute
    if __name__ == '__main__':
       execute(["scrapy","crawl","chouti","--nolog"])
  2. Running several spiders with one custom command

    1. Create a directory (any name, e.g. commands) next to spiders
    2. Inside it create crawlall.py (the file name becomes the command name)
    3. In settings.py add: COMMANDS_MODULE = '<project name>.<directory name>'
    4. Run the command from the project directory: scrapy crawlall
    #crawlall.py
    from scrapy.commands import ScrapyCommand
    from scrapy.utils.project import get_project_settings
    ​
    class Command(ScrapyCommand):
    ​
       requires_project = True
    ​
       def syntax(self):
           return '[options]'
    ​
       def short_desc(self):
           return 'Runs all of the spiders'
    ​
       def run(self, args, opts):
           spider_list = self.crawler_process.spiders.list()
           for name in spider_list:
               self.crawler_process.crawl(name, **opts.__dict__)
           self.crawler_process.start()

2. Signals

2.1 Django

  1. Django provides "signal dispatch" to decouple code from the framework's actions: when certain events happen, the sender notifies a set of registered receivers.

    Model signals
       pre_init                    # django的modal执行其构造方法前,自动触发
       post_init                   # django的modal执行其构造方法后,自动触发
       pre_save                    # django的modal对象保存前,自动触发
       post_save                   # django的modal对象保存后,自动触发
       pre_delete                  # django的modal对象删除前,自动触发
       post_delete                 # django的modal对象删除后,自动触发
       m2m_changed                 # django的modal中使用m2m字段操作第三张表(add,remove,clear)前后,自动触发
       class_prepared              # 程序启动时,检测已注册的app中modal类,对于每一个类,自动触发
    Management signals
       pre_migrate                 # 执行migrate命令前,自动触发
       post_migrate                # 执行migrate命令后,自动触发
    Request/response signals
       request_started             # 请求到来前,自动触发
       request_finished            # 请求结束后,自动触发
       got_request_exception       # 请求异常后,自动触发
    Test signals
       setting_changed             # 使用test测试修改配置文件时,自动触发
       template_rendered           # 使用test测试渲染模板时,自动触发
    Database Wrappers
       connection_created          # 创建数据库连接时,自动触发
  2. For Django's built-in signals you only register a callback for the signal; it is then called automatically whenever the corresponding action happens:

       from django.core.signals import request_finished
       from django.core.signals import request_started
       from django.core.signals import got_request_exception
    ​
       from django.db.models.signals import class_prepared
       from django.db.models.signals import pre_init, post_init
       from django.db.models.signals import pre_save, post_save
       from django.db.models.signals import pre_delete, post_delete
       from django.db.models.signals import m2m_changed
       from django.db.models.signals import pre_migrate, post_migrate
    ​
       from django.test.signals import setting_changed
       from django.test.signals import template_rendered
    ​
       from django.db.backends.signals import connection_created
    ​
    ​
       def callback(sender, **kwargs):
           print("xxoo_callback")
           print(sender,kwargs)
    ​
       xxoo.connect(callback)
       # xxoo指上述导入的内容
  3. Example

    # xx/app/models.py
    ​
    from django.db import models
    ​
    class User(models.Model):
    ​
       title = models.CharField(max_length=32)
    #xx/xx/init.py
    ​
    from django.db.models import signals
    ​
    def before_save1(*args,**kwargs):
       print('有车来了,我要服务了--》',args,kwargs)
    ​
    def after_save1(*args,**kwargs):
       print('有车来了,完事了--》',args,kwargs)
    ​
    signals.pre_save.connect(before_save1)
    signals.post_save.connect(after_save1)
    ​
    # xx/app/view.py
    ​
    from django.shortcuts import render,HttpResponse
    from app01 import models
    ​
    def func1(request):
       # models.User.objects.create(title='老男孩')
       return HttpResponse('创建成功')

2.2 Scrapy

  1. Scrapy signals work the same way: you write a callback and connect it to the signal you care about.

    # ext.py
    ​
    # by luffycity.com
    from scrapy import signals
    ​
    class MyExtend(object):
       def __init__(self):
           pass
    ​
       @classmethod
       def from_crawler(cls, crawler):
           self = cls()
    #信号的设置
           crawler.signals.connect(self.x1, signal=signals.spider_opened)
           crawler.signals.connect(self.x2, signal=signals.spider_closed)
    ​
           return self
    ​
       def x1(self, spider):
           print('open')

       def x2(self, spider):
           print('close')
    ​
    # settings.py
    EXTENSIONS = {
       'xdb.ext.MyExtend':666,
    }

3. Scrapy-redis

  • A component for building distributed crawlers.
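
    It is a separate package, installed with pip:

    pip install scrapy-redis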

3.1 Deduplication

  1. Deduplication backed by redis. You could write the filter yourself, but scrapy-redis already ships one, so we reuse it (it builds its redis key from a timestamp, which we pin to a fixed value).

  2. Writing your own

    # settings.py
    ​
    DUPEFILTER_CLASS = 'dbd.xxx.DupFilter'
    ​
    # xxx.py
    ​
    from scrapy.dupefilter import BaseDupeFilter
    import redis
    from scrapy.utils.request import request_fingerprint
    ​
    class DupFilter(BaseDupeFilter):
       def __init__(self):
           self.conn = redis.Redis(host='140.143.227.206',port=8888,password='beta')

       def request_seen(self, request):
           """
           Check whether the current request has been seen before.
           :param request:
           :return: True if it was already visited; False otherwise
           """
           fid = request_fingerprint(request)
           result = self.conn.sadd('visited_urls', fid)
           if result == 1:
               return False
           return True
  3. Reusing the filter shipped with scrapy-redis (with a pinned key)

    # settings.py
    ​
    # ############### scrapy redis连接 ####################
    ​
    REDIS_HOST = '140.143.227.206'                            # 主机名
    REDIS_PORT = 8888                                   # 端口
    REDIS_PARAMS = {'password':'beta'}                                  # Redis连接参数             默认:REDIS_PARAMS = {'socket_timeout': 30,'socket_connect_timeout': 30,'retry_on_timeout': True,'encoding': REDIS_ENCODING,})
    REDIS_ENCODING = "utf-8"                            # redis编码类型             默认:'utf-8'
    # REDIS_URL = 'redis://user:pass@hostname:9001'       # 连接URL(优先于以上配置)
    ​
    # ############### scrapy redis去重 ####################
    ​
    DUPEFILTER_KEY = 'dupefilter:%(timestamp)s'
    ​
    #自带的
    # DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'
    #复制的
    DUPEFILTER_CLASS = 'dbd.xxx.RedisDupeFilter'
    ​
    ​
    # xxx.py
    ​
    from scrapy_redis.dupefilter import RFPDupeFilter
    from scrapy_redis.connection import get_redis_from_settings
    from scrapy_redis import defaults
    ​
    class RedisDupeFilter(RFPDupeFilter):
       @classmethod
       def from_settings(cls, settings):
           """Returns an instance from given settings.
    ​
          This uses by default the key ``dupefilter:<timestamp>``. When using the
          ``scrapy_redis.scheduler.Scheduler`` class, this method is not used as
          it needs to pass the spider name in the key.
    ​
          Parameters
          ----------
          settings : scrapy.settings.Settings
    ​
          Returns
          -------
          RFPDupeFilter
              A RFPDupeFilter instance.
    ​
    ​
          """
           server = get_redis_from_settings(settings)
           # XXX: This creates one-time key. needed to support to use this
           # class as standalone dupefilter with scrapy's default scheduler
           # if scrapy passes spider on open() method this wouldn't be needed
           # TODO: Use SCRAPY_JOB env as default and fallback to timestamp.
           key = defaults.DUPEFILTER_KEY % {'timestamp': 'xiaodongbei'}
           debug = settings.getbool('DUPEFILTER_DEBUG')
           return cls(server, key=key, debug=debug)
    ​

3.2 Queues

  1. There are three kinds of queues: FIFO, LIFO, and priority.

  2. FIFO (first in, first out)

    import scrapy_redis
    import redis
    ​
    class FifoQueue(object):
       def __init__(self):
           self.server = redis.Redis(host='140.143.227.206',port=8888,password='beta')
    ​
       def push(self, request):
           """Push a request"""
           self.server.lpush('USERS', request)
    ​
       def pop(self, timeout=0):
           """Pop a request"""
           data = self.server.rpop('USERS')
           return data
    # [33,22,11]
    q = FifoQueue()
    q.push(11)
    q.push(22)
    q.push(33)
    ​
    print(q.pop())
    print(q.pop())
    print(q.pop())
  3. LIFO (last in, first out)

    import redis
    ​
    class LifoQueue(object):
       """Per-spider LIFO queue."""
       def __init__(self):
           self.server = redis.Redis(host='140.143.227.206',port=8888,password='beta')
    ​
       def push(self, request):
           """Push a request"""
           self.server.lpush("USERS", request)
    ​
       def pop(self, timeout=0):
           """Pop a request"""
           data = self.server.lpop('USERS')
           return data
    # [33,22,11]
    q = LifoQueue()
    q.push(11)
    q.push(22)
    q.push(33)
    ​
    print(q.pop())
    print(q.pop())
    print(q.pop())
  4. Priority queue

    import redis
    ​
    class PriorityQueue(object):
       """Per-spider priority queue abstraction using redis' sorted set"""
       def __init__(self):
           self.server = redis.Redis(host='140.143.227.206',port=8888,password='beta')
    ​
       def push(self, request,score):
           """Push a request"""
           # data = self._encode_request(request)
           # score = -request.priority
           # We don't use zadd method as the order of arguments change depending on
           # whether the class is Redis or StrictRedis, and the option of using
           # kwargs only accepts strings, not bytes.
           self.server.execute_command('ZADD', 'xxxxxx', score, request)
    ​
       def pop(self, timeout=0):
           """
          Pop a request
          timeout not support in this queue class
          """
           # use atomic range/remove using multi/exec
           pipe = self.server.pipeline()
           pipe.multi()
           pipe.zrange('xxxxxx', 0, 0).zremrangebyrank('xxxxxx', 0, 0)
           results, count = pipe.execute()
           if results:
               return results[0]
    ​
    q = PriorityQueue()
    ​
    q.push('alex',99)
    q.push('oldboy',56)
    q.push('eric',77)
    ​
    v1 = q.pop()
    print(v1)
    v2 = q.pop()
    print(v2)
    v3 = q.pop()
    print(v3)

3.3 The scheduler

  1. The scheduler is essentially just storing and removing requests, so we let scrapy-redis provide it (a toy sketch of the interface follows the settings below).

  2. After scrapy crawl chouti --nolog is run, scrapy first looks up SCHEDULER in settings and calls Scheduler.from_crawler, then Scheduler.from_settings to read the configuration below, and finally keeps reading requests over the redis connection in a loop.

    # settings.py
    ​
    # ############### scrapy-redis connection ####################
    ​
    REDIS_HOST = '140.143.227.206'                      # host
    REDIS_PORT = 8888                                   # port
    REDIS_PARAMS = {'password':'beta'}                  # Redis connection parameters
    # default: REDIS_PARAMS = {'socket_timeout': 30,'socket_connect_timeout': 30,'retry_on_timeout': True,'encoding': REDIS_ENCODING,}
    REDIS_ENCODING = "utf-8"                            # redis encoding, default: 'utf-8'
    ​
    # REDIS_URL = 'redis://user:pass@hostname:9001'     # connection URL (takes precedence over the settings above)
    ​
    ################ dedupe filter ######################
    DUPEFILTER_KEY = 'dupefilter:%(timestamp)s'
    DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'
    ​
    ​
    # ###################### 调度器 ######################
    from scrapy_redis.scheduler import Scheduler
    # let the scrapy_redis scheduler do the dispatching
    # enqueue_request: add a request to the scheduler
    # next_request: fetch the next request from the scheduler
    SCHEDULER = "scrapy_redis.scheduler.Scheduler"
    ​
    # controls the order in which requests are stored and served
    # DEPTH_PRIORITY only matters when the priority queue is used
    DEPTH_PRIORITY = 1  # breadth-first
    # DEPTH_PRIORITY = -1 # depth-first
    ​
    # PriorityQueue (sorted set) is the default; alternatives: FifoQueue (list), LifoQueue (list)
    SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'
    # breadth-first
    # SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.FifoQueue'
    # depth-first
    # SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.LifoQueue'
    ​
    SCHEDULER_QUEUE_KEY = '%(spider)s:requests'  # redis key under which the scheduler stores pending requests
    ​
    SCHEDULER_SERIALIZER = "scrapy_redis.picklecompat"  # serializer for data saved to redis, pickle by default
    ​
    SCHEDULER_PERSIST = False  # keep the scheduler queue and dedupe records when the spider closes; True=keep, False=flush
    SCHEDULER_FLUSH_ON_START = True  # flush the scheduler queue and dedupe records before starting; True=flush, False=keep
    # SCHEDULER_IDLE_BEFORE_CLOSE = 10 # when the queue is empty, how long to wait for new data before giving up
    ​
    SCHEDULER_DUPEFILTER_KEY = '%(spider)s:dupefilter'  # redis key under which the dedupe records are stored
    SCHEDULER_DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'  # class that implements the dedupe rule
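
  3. With the configuration above you can check which keys the scheduler and the dupefilter actually write to redis. Below is a minimal sketch (my own, not from the course material); it assumes the spider is named chouti and the default PriorityQueue (a sorted set) is in use, and it has to run while the spider is active because SCHEDULER_PERSIST = False flushes the keys on close.

    # inspect_keys.py -- hypothetical helper using the connection settings above
    import redis

    conn = redis.Redis(host='140.143.227.206', port=8888, password='beta')

    # SCHEDULER_QUEUE_KEY      -> 'chouti:requests'   (sorted set of pending requests)
    # SCHEDULER_DUPEFILTER_KEY -> 'chouti:dupefilter' (set of request fingerprints)
    print(conn.keys('chouti:*'))            # every key the spider has created
    print(conn.zcard('chouti:requests'))    # number of pending requests in the queue
    print(conn.scard('chouti:dupefilter'))  # number of fingerprints seen so far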

3.4 start url

  1. The start URLs can be kept in scrapy-redis; when there are no pending requests the spider simply waits instead of shutting down. (A set-based way of seeding the URL is sketched in item 2 below.)

    # setting.py
    ​
    # the setting scrapy-redis actually reads is REDIS_START_URLS_KEY
    REDIS_START_URLS_KEY = '%(name)s:start_urls'
    # False: read start URLs from a redis list; True: read them from a set
    REDIS_START_URLS_AS_SET = False
    ​
    # spiders/chouti.py
    ​
    # -*- coding: utf-8 -*-
    import scrapy
    from scrapy.http import Request
    import scrapy_redis
    from scrapy_redis.spiders import RedisSpider
    ​
    class ChoutiSpider(RedisSpider):
       name = 'chouti'
       allowed_domains = ['chouti.com']
    ​
       def parse(self, response):
           print(response)
    ​
    # script that seeds the start URL into redis
    ​
    # by luffycity.com
    import redis
    ​
    conn = redis.Redis(host='140.143.227.206',port=8888,password='beta')
    # whatever is pushed here is what the spider will download
    conn.lpush('chouti:start_urls','https://dig.chouti.com/r/pic/hot/1')
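
  2. If REDIS_START_URLS_AS_SET = True the start URLs live in a redis set rather than a list, so they are seeded with sadd instead of lpush. A minimal sketch under that assumption:

    import redis

    conn = redis.Redis(host='140.143.227.206', port=8888, password='beta')
    # with a set, pushing the same URL twice is silently ignored
    conn.sadd('chouti:start_urls', 'https://dig.chouti.com/r/pic/hot/1')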

     

4. Settings file walkthrough

  1. Notes on the options in the configuration file below.

    # -*- coding: utf-8 -*-
    # project name
    BOT_NAME = 'dbd'
    ​
    # spider modules path
    SPIDER_MODULES = ['dbd.spiders']
    NEWSPIDER_MODULE = 'dbd.spiders'
    ​
    ​
    # Crawl responsibly by identifying yourself (and your website) on the user-agent
    # USER_AGENT = 'dbd (+http://www.yourdomain.com)'
    USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    ​
    # Sites publish their crawling rules at <site url>/robots.txt; this option controls whether Scrapy fetches robots.txt and only crawls what it allows.
    # Obey robots.txt rules
    ROBOTSTXT_OBEY = False
    ​
    # Concurrency: by default a single thread handles 16 concurrent requests; this is configurable, and the load is not split evenly across targets.
    # Configure maximum concurrent requests performed by Scrapy (default: 16)
    #CONCURRENT_REQUESTS = 32
    ​
    # Configure a delay for requests for the same website (default: 0)
    # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
    # See also autothrottle settings and docs
    # wait 3 seconds between requests
    #DOWNLOAD_DELAY = 3
    ​
    # The download delay setting will honor only one of:
    # at most 16 concurrent requests per domain
    #CONCURRENT_REQUESTS_PER_DOMAIN = 16
    # at most 16 concurrent requests per IP
    #CONCURRENT_REQUESTS_PER_IP = 16
    ​
    # Scrapy manages cookies for you internally
    # Disable cookies (enabled by default)
    #COOKIES_ENABLED = False
    ​
    # Disable Telnet Console (enabled by default)
    # from scrapy.extensions.telnet import TelnetConsole
    # telnet 127.0.0.1 6023
    # engine.pause()
    # engine.unpause()
    # If True, you can telnet to the host/port below and send commands (pause, unpause, stop ...) to the running crawler.
    # TELNETCONSOLE_ENABLED = True
    # TELNETCONSOLE_HOST = '127.0.0.1'
    # TELNETCONSOLE_PORT = [6023,]
    ​
    # Default headers added to every request
    # Override the default request headers:
    #DEFAULT_REQUEST_HEADERS = {
    #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    #   'Accept-Language': 'en',
    #}
    ​
    # Enable or disable spider middlewares
    # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
    # spider middleware
    #SPIDER_MIDDLEWARES = {
    #   'dbd.middlewares.DbdSpiderMiddleware': 543,
    #}
    ​
    # Enable or disable downloader middlewares
    # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
    # downloader middleware
    #DOWNLOADER_MIDDLEWARES = {
    #   'dbd.middlewares.DbdDownloaderMiddleware': 543,
    #}
    ​
    # Enable or disable extensions
    # See https://doc.scrapy.org/en/latest/topics/extensions.html
    # extensions (signal registration)
    #EXTENSIONS = {
    #   'scrapy.extensions.telnet.TelnetConsole': None,
    #}
    ​
    # Configure item pipelines
    # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
    # pipelines
    #ITEM_PIPELINES = {
    #   'dbd.pipelines.DbdPipeline': 300,
    #}
    ​
    # Enable and configure the AutoThrottle extension (disabled by default)
    # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
    from scrapy.contrib.throttle import AutoThrottle  # in newer Scrapy: scrapy.extensions.throttle.AutoThrottle
    """
    17. 自动限速算法
      from scrapy.contrib.throttle import AutoThrottle
      自动限速设置
      1. 获取最小延迟 DOWNLOAD_DELAY
      2. 获取最大延迟 AUTOTHROTTLE_MAX_DELAY
      3. 设置初始下载延迟 AUTOTHROTTLE_START_DELAY
      4. 当请求下载完成后,获取其"连接"时间 latency,即:请求连接到接受到响应头之间的时间
      5. 用于计算的... AUTOTHROTTLE_TARGET_CONCURRENCY
      target_delay = latency / self.target_concurrency
      new_delay = (slot.delay + target_delay) / 2.0 # 表示上一次的延迟时间
      new_delay = max(target_delay, new_delay)
      new_delay = min(max(self.mindelay, new_delay), self.maxdelay)
      slot.delay = new_delay
    """
    # Auto-throttle settings; the algorithm above shows where each value is used.
    #AUTOTHROTTLE_ENABLED = True
    # The initial download delay
    #AUTOTHROTTLE_START_DELAY = 5
    # The maximum download delay to be set in case of high latencies
    #AUTOTHROTTLE_MAX_DELAY = 60
    # The average number of requests Scrapy should be sending in parallel to
    # each remote server
    #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
    # Enable showing throttling stats for every response received:
    #AUTOTHROTTLE_DEBUG = False
    ​
    ​
    ​
    ​
    # Enable and configure HTTP caching (disabled by default)
    # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
    # With these enabled, pages are served from the local HTTP cache, so you can keep practicing even without a network connection.
    # HTTPCACHE_ENABLED = True
    # HTTPCACHE_EXPIRATION_SECS = 0
    # HTTPCACHE_DIR = 'httpcache'
    # HTTPCACHE_IGNORE_HTTP_CODES = []
    # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

5. Improving performance

  • All of the approaches below send requests concurrently from a single thread.

5.1 Coroutines

  • The coroutines here are implemented with the gevent module, which builds on greenlet; a pool-based variant that caps concurrency is sketched after the example.

    """
    Coroutines + I/O switching
    pip3 install gevent
    gevent internally uses greenlet (which implements the coroutine switching).
    """
    from gevent import monkey; monkey.patch_all()
    import gevent
    import requests
    ​
    def func(url):
       response = requests.get(url)
       print(response)
    ​
    urls = [
       'http://www.baidu.com/',
       'https://www.cnblogs.com/',
       'https://www.cnblogs.com/news/',
       'https://cn.bing.com/',
       'https://stackoverflow.com/',
    ]
    spawn_list = []
    for url in urls:
       # create one greenlet task per url
       spawn_list.append(gevent.spawn(func, url))
    # wait for all tasks to complete
    gevent.joinall(spawn_list)
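
  • gevent.pool.Pool can cap how many requests run at once. A minimal sketch (the pool size of 3 is an arbitrary choice, not from the original material):

    from gevent import monkey; monkey.patch_all()
    from gevent.pool import Pool
    import requests

    def func(url):
       print(url, requests.get(url).status_code)

    urls = [
       'http://www.baidu.com/',
       'https://www.cnblogs.com/',
       'https://cn.bing.com/',
    ]
    pool = Pool(3)        # at most 3 greenlets run concurrently
    pool.map(func, urls)  # blocks until every url has been fetched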

5.2 Asynchronous, non-blocking I/O

  • The asynchronous, non-blocking version uses the Twisted module. (Note that getPage is deprecated in newer Twisted releases; an Agent-based sketch follows the example.)

    """
    Twisted: an asynchronous, non-blocking framework built on an event loop
    """
    from twisted.web.client import getPage, defer
    from twisted.internet import reactor
    ​
    def stop_loop(arg):
       # stop the twisted event loop
       reactor.stop()
    ​
    def get_response(contents):
       print(contents)
    ​
    deferred_list = []
    ​
    url_list = [
       'http://www.baidu.com/',
       'https://www.cnblogs.com/',
       'https://www.cnblogs.com/news/',
       'https://cn.bing.com/',
       'https://stackoverflow.com/',
    ]
    ​
    for url in url_list:
       # create a task: getPage sends the request and returns a Deferred
       deferred = getPage(bytes(url, encoding='utf8'))
       # the request is sent automatically; this callback fires with the page body
       deferred.addCallback(get_response)
       # collect the Deferreds so we can wait on all of them
       deferred_list.append(deferred)
    ​
    # DeferredList watches the list and fires once every task has finished
    dlist = defer.DeferredList(deferred_list)
    # when everything is done, stop the event loop
    dlist.addBoth(stop_loop)
    # run() starts the event loop, i.e. the crawling begins here
    reactor.run()
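
  • Since getPage is deprecated, the same fan-out can be written with twisted.web.client.Agent and readBody. A minimal sketch of that approach (my own, not from the course material):

    from twisted.internet import reactor, defer
    from twisted.web.client import Agent, readBody

    agent = Agent(reactor)

    def fetch(url):
       d = agent.request(b'GET', url.encode('utf8'))    # returns a Deferred
       d.addCallback(readBody)                          # read the whole response body
       d.addCallback(lambda body: print(url, len(body)))
       return d

    url_list = [
       'http://www.baidu.com/',
       'http://www.cnblogs.com/',
    ]
    dlist = defer.DeferredList([fetch(u) for u in url_list])
    dlist.addBoth(lambda _: reactor.stop())  # stop the loop once all are done
    reactor.run()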

5.3 A hand-rolled asynchronous, non-blocking client

  • The hand-rolled version below is only for learning; for real work the two approaches above are enough.

    # chun.py
    ​
    import socket
    import select
    ​
    class ChunSheng(object):
    ​
       def __init__(self):
           self.socket_list = []
           self.conn_list = []
    ​
           self.conn_func_dict = {}
    ​
       def add_request(self, url_func):
           host, func = url_func           # (hostname, callback) tuple
           conn = socket.socket()
           conn.setblocking(False)
           try:
               # a non-blocking connect returns immediately; select() will tell us
               # later when the connection has actually been established
               conn.connect((host, 80))
           except BlockingIOError:
               pass
           self.conn_func_dict[conn] = (host, func)

           self.socket_list.append(conn)
           self.conn_list.append(conn)
    ​
       def run(self):
           """
           Poll the sockets in self.socket_list until every request has been sent and every response has been handled
          :return:
          """
           while True:
               # select.select:
               #   first list : sockets checked for readable data (response arrived)
               #   second list: sockets checked for being writable (connect finished)
               #
               # return value r: the sockets that have data ready to read
               # return value w: the sockets whose connection has succeeded
               r, w, e = select.select(self.socket_list, self.conn_list, [], 0.05)
               for sock in w:  # connected sockets: send the request once
                   host, _ = self.conn_func_dict[sock]
                   sock.send('GET / HTTP/1.1\r\nHost: {}\r\nConnection: close\r\n\r\n'.format(host).encode())
                   self.conn_list.remove(sock)

               for sock in r:  # readable sockets: hand the response to the callback
                   data = sock.recv(8096)
                   _, func = self.conn_func_dict[sock]
                   func(data)
                   sock.close()
                   self.socket_list.remove(sock)
    ​
               if not self.socket_list:
                   break
                   
    # xx.py
    ​
    from chun import ChunSheng
    ​
    def callback1(data):
       print('download finished', data)
    ​
    def callback2(data):
       print('download finished', data)
    ​
    chun = ChunSheng()
    urls = [
      ('www.baidu.com',callback1),
      ('www.cnblogs.com',callback1),
      ('www.pythonav.com',callback2),
      ('www.bing.com',callback2),
      ('www.stackoverflow.com',callback2),
    ]
    for url in urls:
       chun.add_request(url)
    ​
    chun.run()
  • select: can watch at most 1024 sockets and internally loops over all of them on every check (the only option among these on Windows);

  • poll: no limit on the number of sockets, but it still loops over all of them on every check (available on macOS and Linux);

  • epoll: no limit, and readiness is delivered via events/callbacks instead of scanning (Linux only; macOS provides kqueue instead); see the selectors sketch below.
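
  • Python's standard selectors module wraps all of these: selectors.DefaultSelector picks the best mechanism the platform offers (epoll on Linux, kqueue on macOS, select on Windows). A minimal sketch of the same connect-then-read pattern (single hard-coded host, for illustration only):

    import selectors
    import socket

    sel = selectors.DefaultSelector()   # epoll / kqueue / select, whichever is available

    sock = socket.socket()
    sock.setblocking(False)
    try:
       sock.connect(('www.baidu.com', 80))
    except BlockingIOError:
       pass

    # wait until writable (connected), send the request, then wait until readable
    sel.register(sock, selectors.EVENT_WRITE)
    sel.select()
    sock.send(b'GET / HTTP/1.1\r\nHost: www.baidu.com\r\nConnection: close\r\n\r\n')

    sel.modify(sock, selectors.EVENT_READ)
    sel.select()
    print(sock.recv(8096))

    sel.unregister(sock)
    sock.close()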

From: https://www.cnblogs.com/InvincibleGrass/p/17032558.html
