
Reptile


Chapter 1


1.requests

1.1 Installation

pip install requests    #the HTTP client library used for crawling

1.2 Usage

  1. Fetch the HTML

    import requests
    response = requests.get("https://www.autobone.com.cn/news/")    #获取数据代码
    response.encoding = "gbk"   #指定获取后的数据编码
    print(response.text)    #查看文本信息
  2. Filter the data

    from bs4 import BeautifulSoup
    soup = BeautifulSoup(response.text,"html.parser")    #parse the HTML into a soup object
    div = soup.find(name="div",attrs={"id":"auto-channel-lazyload-article"})   #match the first element
    #name is the tag name, attrs are the tag attributes
    li_list = div.find_all(name="li")   #match all elements
    for li in li_list:
       a = li.find(name="a")
       print(a.attrs.get("href"))    #get the value of the given attribute of the tag

1.3 headers

  1. user-agent

    r1 = requests.get(
       url='https://dig.chouti.com/',
       #放置请求头的信息
       #user-agent 用户设备信息
       headers={
           'user-agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
      }
    )

1.4 data and cookies

  1. data

    #发送post请求
    r2 = requests.post(
       url='。。。',
       #data请求体中的数据
       data={
           'phone':'xxx',
           'password':'xxx',
      },
    )
  2. cookies

    r1 = requests.get(
       url='。。。',
    )
    ​
    r2 = requests.post(
       url='https://dig.chouti.com/login',
       #存放要发送的cookies
       #get_dict()是查看cookies
       cookies=r1.cookies.get_dict()
    )

1.5 Requests and parameters

  1. The request helpers provided by requests

    requests.get(url, params=None, **kwargs)
    requests.post(url, data=None, json=None, **kwargs)
    requests.put(url, data=None, **kwargs)
    requests.head(url, **kwargs)
    requests.delete(url, **kwargs)
    requests.patch(url, data=None, **kwargs)
    requests.options(url, **kwargs)
    ​
    # all of the methods above are built on top of this one;
    # whatever method string you pass in is the HTTP method that gets used
    # (a quick equivalence check follows this block)
    requests.request(method, url, **kwargs)
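
    As a quick check of that claim, the two calls below are equivalent (a minimal sketch; httpbin.org is just a public echo service):

    import requests

    # requests.get(...) simply delegates to requests.request('get', ...)
    r_a = requests.get("http://httpbin.org/get", params={"k1": "v1"})
    r_b = requests.request("get", "http://httpbin.org/get", params={"k1": "v1"})
    print(r_a.url == r_b.url)   # True - both send the same request
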
  2. Parameters

    def param_method_url():
       # requests.request(method='get', url='http://127.0.0.1:8000/test/')
       # requests.request(method='post', url='http://127.0.0.1:8000/test/')
       pass
    ​
    def param_param():
       #param是在url后面跟的数据
       # - 可以是字典、可以是字符串、可以是字节(ascii编码以内)
    ​
       # requests.request(method='get',url='http://127.0.0.1:8000/test/',
       # params={'k1': 'v1', 'k2': '水电费'})
    ​
       # requests.request(method='get',url='http://127.0.0.1:8000/test/',
       # params="k1=v1&k2=水电费&k3=v3&k3=vv3")
    ​
       # requests.request(method='get',url='http://127.0.0.1:8000/test/',
       # params=bytes("k1=v1&k2=k2&k3=v3&k3=vv3", encoding='utf8'))
    ​
       # 错误
       # requests.request(method='get',url='http://127.0.0.1:8000/test/',
       # params=bytes("k1=v1&k2=水电费&k3=v3&k3=vv3", encoding='utf8'))
       pass
    ​
    def param_data():#传请求体
       # 可以是字典、可以是字符串、可以是字节、可以是文件对象
       #data的请求格式
       #GET /index http1.1\r\nhost:c1.com\r\n\r\nuser=alex&pwd=123
    ​
       # requests.request(method='POST',url='http://127.0.0.1:8000/test/',
       # data={'k1': 'v1', 'k2': '水电费'})
    ​
       # requests.request(method='POST',url='http://127.0.0.1:8000/test/',
       # data="k1=v1; k2=v2; k3=v3; k3=v4")
    ​
       # requests.request(method='POST',url='http://127.0.0.1:8000/test/',
       # data="k1=v1;k2=v2;k3=v3;k3=v4",
       # headers={'Content-Type': 'application/x-www-form-urlencoded'})
    ​
       # requests.request(method='POST',url='http://127.0.0.1:8000/test/',
       # data=open('data_file.py', mode='r', encoding='utf-8'), # 文件内容是:k1=v1;k2=v2;k3=v3;k3=v4
       # headers={'Content-Type': 'application/x-www-form-urlencoded'})
    ​
    def param_json():#传请求体
       #json的请求格式,并且会多带一个Content-Type:application/json用来说明
       #GET /index http1.1\r\nhost:c1.com\r\nContent-Type:application/json\r\n\r\n{"user":"alex","pwd":123}
       
       # 将json中对应的数据进行序列化成一个字符串,json.dumps(...)
       # 然后发送到服务器端的body中,并且Content-Type是 {'Content-Type': 'application/json'}
       requests.request(method='POST',url='http://127.0.0.1:8000/test/',json={'k1': 'v1', 'k2': '水电费'})
    ​
    ​
    def param_headers():
       # 发送请求头到服务器端
       requests.request(method='POST',url='http://127.0.0.1:8000/test/',json={'k1': 'v1', 'k2': '水电费'},headers={'Content-Type': 'application/x-www-form-urlencoded'})
    ​
    ​
    def param_cookies():
       # 发送Cookie到服务器端
       requests.request(method='POST',url='http://127.0.0.1:8000/test/',
                        data={'k1': 'v1', 'k2': 'v2'},cookies={'cook1': 'value1'},)
       # 也可以使用CookieJar(字典形式就是在此基础上封装)
       from http.cookiejar import CookieJar
       from http.cookiejar import Cookie
    ​
       obj = CookieJar()
       obj.set_cookie(Cookie(version=0, name='c1', value='v1', port=None, domain='', path='/', secure=False, expires=None,
                             discard=True, comment=None, comment_url=None, rest={'HttpOnly': None}, rfc2109=False,
                             port_specified=False, domain_specified=False, domain_initial_dot=False, path_specified=False)
                      )
       requests.request(method='POST',url='http://127.0.0.1:8000/test/',data={'k1': 'v1', 'k2': 'v2'},cookies=obj)
    ​
    def param_files():
       # 发送文件
       # file_dict = {'f1': open('readme', 'rb')}
       # requests.request(method='POST',url='http://127.0.0.1:8000/test/',files=file_dict)
    ​
       # 发送文件,定制文件名
       # file_dict = {'f1': ('test.txt', open('readme', 'rb'))}
       # requests.request(method='POST',url='http://127.0.0.1:8000/test/',files=file_dict)
    ​
       # 发送文件,定制文件名
       # file_dict = {'f1': ('test.txt', "hahsfaksfa9kasdjflaksdjf")}
       # requests.request(method='POST',url='http://127.0.0.1:8000/test/',files=file_dict)
    ​
       # 发送文件,定制文件名
       # file_dict = {'f1': ('test.txt', "hahsfaksfa9kasdjflaksdjf", 'application/text', {'k1': '0'})}
       # requests.request(method='POST',url='http://127.0.0.1:8000/test/',files=file_dict)
       pass
    ​
    def param_auth():
       #HTTP auth: the credentials are base64-encoded before being sent, which is what auth handles
       #the resulting request header looks like: Authorization: "Basic base64('user:password')"
       from requests.auth import HTTPBasicAuth, HTTPDigestAuth
    ​
       ret = requests.get('https://api.github.com/user', auth=HTTPBasicAuth('wupeiqi', 'sdfasdfasdf'))
       
       ret = requests.get('http://httpbin.org/digest-auth/auth/user/pass', auth=HTTPDigestAuth('user', 'pass'))
    ​
    def param_timeout():
       #timeout: a single number applies to the whole request; a tuple is (connect timeout, read timeout)
       # ret = requests.get('http://google.com/', timeout=1)
       # ret = requests.get('http://google.com/', timeout=(5, 1))
    ​
    def param_allow_redirects():
       #是否允许重定向,就是访问一个网站会跳到另一个网站,是否是否允许跳转。
       ret = requests.get('http://127.0.0.1:8000/test/', allow_redirects=False)
    ​
    def param_proxies():
       #代理,通过是http或https,使用不同的代理。
       # proxies = {"http": "61.172.249.96:80","https": "http://61.185.219.126:3128",}
       # ret = requests.get("http://www.proxy360.cn/Proxy", proxies=proxies)
       # print(ret.headers)
    ​
       # from requests.auth import HTTPProxyAuth
       
       # proxyDict = {'http': '77.75.105.165','https': '77.75.105.165'}
       # auth = HTTPProxyAuth('username', 'mypassword')
       # r = requests.get("http://www.google.com", proxies=proxyDict, auth=auth)
       pass
    ​
    def param_stream():
       #streaming download: receive a large response in chunks instead of all at once
       ret = requests.get('http://127.0.0.1:8000/test/', stream=True)
       print(ret.content)
       ret.close()
    ​
       # from contextlib import closing
       # with closing(requests.get('http://httpbin.org/get', stream=True)) as r:
       # # 在此处理响应。
       # for i in r.iter_content():
       # print(i)
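       # a minimal sketch of streaming a large response straight to disk in chunks
       # (the URL and file name are just placeholders):
       # with requests.get('http://httpbin.org/bytes/102400', stream=True) as r:
       #     with open('big_file.bin', 'wb') as f:
       #         for chunk in r.iter_content(chunk_size=8192):
       #             f.write(chunk)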
    ​
    ​
    def requests_session():
       #第二次访问时就不需要携带cookie了。
       import requests
       session = requests.Session()
    ​
       ### 1、首先登陆任何页面,获取cookie
       i1 = session.get(url="http://dig.chouti.com/help/service")
       ### 2、用户登陆,携带上一次的cookie,后台对cookie中的 gpsd 进行授权
       i2 = session.post(url="http://dig.chouti.com/login",
           data={'phone': "8615131255089",'password': "xxxxxx",'oneMonth': ""})
       i3 = session.post(url="http://dig.chouti.com/link/vote?linksId=8589623",)
  3. Other parameters

    def request(method, url, **kwargs):
       """Constructs and sends a :class:`Request <Request>`.
    ​
      :param method: method for the new :class:`Request` object.
      :param url: URL for the new :class:`Request` object.
      :param params: (optional) Dictionary or bytes to be sent in the query string for the :class:`Request`.
      :param data: (optional) Dictionary, bytes, or file-like object to send in the body of the :class:`Request`.
      :param json: (optional) json data to send in the body of the :class:`Request`.
      :param headers: (optional) Dictionary of HTTP Headers to send with the :class:`Request`.
      :param cookies: (optional) Dict or CookieJar object to send with the :class:`Request`.
      :param files: (optional) Dictionary of 'name': file-like-objects (or {'name': file-tuple}) for multipart encoding upload.
          file-tuple can be a 2-tuple ('filename', fileobj), 3-tuple ('filename', fileobj, 'content_type')
          or a 4-tuple ('filename', fileobj, 'content_type', custom_headers), where 'content-type' is a string
          defining the content type of the given file and custom_headers a dict-like object containing additional headers
          to add for the file.
      :param auth: (optional) Auth tuple to enable Basic/Digest/Custom HTTP Auth.
      :param timeout: (optional) How long to wait for the server to send data
          before giving up, as a float, or a :ref:`(connect timeout, read
          timeout) <timeouts>` tuple.
      :type timeout: float or tuple
      :param allow_redirects: (optional) Boolean. Set to True if POST/PUT/DELETE redirect following is allowed.
      :type allow_redirects: bool
      :param proxies: (optional) Dictionary mapping protocol to the URL of the proxy.
      :param verify: (optional) whether the SSL cert will be verified. A CA_BUNDLE path can also be provided. Defaults to True.
      :param stream: (optional) if False, the response content will be immediately downloaded.
      :param cert: (optional) if String, path to ssl client cert file (.pem). If Tuple, ('cert', 'key') pair.
      :return: :class:`Response <Response>` object
      :rtype: requests.Response
    ​
      Usage::
    ​
        >>> import requests
        >>> req = requests.request('GET', 'http://httpbin.org/get')
        <Response [200]>
      """

     

2.BeautifulSoup

  1. BeautifulSoup is a module that takes an HTML or XML string, parses it, and then lets you quickly locate specific elements with the methods it provides, which makes searching HTML or XML straightforward.

    from bs4 import BeautifulSoup
    html_doc = """
    <html>
    <head>
    <title>The Dormouse's story</title>
    </head>
    <body>
    asdf
    <div class="title">
    <b>The Dormouse's story总共</b>
    <h1>f</h1>
    </div>
    <div class="story">Once upon a time there were three little sisters; and their names were
          <a class="sister0" id="link1">Els<span>f</span>ie</a>,
          <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
          <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;and they lived at the bottom of a well.
    </div>
    ad<br/>sf
    <p class="story">...</p>
    </body>
    </html>
    """
    soup = BeautifulSoup(html_doc, features="lxml")
    # find the first a tag
    tag1 = soup.find(name='a')
    # find all a tags
    tag2 = soup.find_all(name='a')
    # find the tag with id=link2
    tag3 = soup.select('#link2')
  2. Installation:

    pip3 install beautifulsoup4
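
    The examples above pass features="lxml", which also needs the third-party lxml parser; install it as well, or fall back to the built-in "html.parser":

    pip3 install lxml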

2.1 Usage examples

from bs4 import BeautifulSoup
html_doc = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
...
</body>
</html>
"""
soup = BeautifulSoup(html_doc, features="lxml")
  1. name, the tag's name

    tag = soup.find('a')
    name = tag.name # 获取
    print(name)
    tag.name = 'span' # 设置
    print(soup)
  2. attrs, the tag's attributes

    tag = soup.find('a')
    attrs = tag.attrs # 获取
    print(attrs)
    tag.attrs = {'ik':123} # 设置
    tag.attrs['id'] = 'iiiii' # 设置
    print(soup)
  3. children, all direct children of a tag

    body = soup.find('body')
    v = body.children
  4. descendants, all descendants of a tag (children, grandchildren, ...)

    body = soup.find('body')
    v = body.descendants
  5. clear, empty out everything inside the tag (the tag itself is kept)

    tag = soup.find('body')
    tag.clear()
    print(soup)
  6. decompose, recursively delete the tag and everything inside it

    body = soup.find('body')
    body.decompose()
    print(soup)
  7. extract, recursively remove the tag and return what was removed

    body = soup.find('body')
    v = body.extract()
    print(soup)
  8. decode, serialize to a string (including the current tag); decode_contents (excluding the current tag)

    body = soup.find('body')
    v = body.decode()
    v = body.decode_contents()
    print(v)
  9. encode, serialize to bytes (including the current tag); encode_contents (excluding the current tag)

    body = soup.find('body')
    v = body.encode()
    v = body.encode_contents()
    print(v)
  10. find, get the first matching tag

    tag = soup.find('a')
    print(tag)
    tag = soup.find(name='a', attrs={'class': 'sister'}, recursive=True, text='Lacie')
    tag = soup.find(name='a', class_='sister', recursive=True, text='Lacie')
    print(tag)
  11. find_all, get all matching tags

    tags = soup.find_all('a')
    print(tags)
    tags = soup.find_all('a',limit=1)
    print(tags)
    tags = soup.find_all(name='a', attrs={'class': 'sister'}, recursive=True, text='Lacie')
    tags = soup.find(name='a', class_='sister', recursive=True, text='Lacie')
    print(tags)
    # ####### 列表 #######
    v = soup.find_all(name=['a','div'])
    print(v)
    v = soup.find_all(class_=['sister0', 'sister'])
    print(v)
    v = soup.find_all(text=['Tillie'])
    print(v, type(v[0]))
    v = soup.find_all(id=['link1','link2'])
    print(v)
    v = soup.find_all(href=['link1','link2'])
    print(v)
    # ####### 正则 #######
    import re
    rep = re.compile('p')
    rep = re.compile('^p')
    v = soup.find_all(name=rep)
    print(v)
    rep = re.compile('sister.*')
    v = soup.find_all(class_=rep)
    print(v)
    rep = re.compile('http://www.oldboy.com/static/.*')
    v = soup.find_all(href=rep)
    print(v)
    # ####### 方法筛选 #######
    def func(tag):
       return tag.has_attr('class') and tag.has_attr('id')
    v = soup.find_all(name=func)
    print(v)
    ## get,获取标签属性
    tag = soup.find('a')
    v = tag.get('id')
    print(v)
  12. has_attr, check whether the tag has a given attribute

    tag = soup.find('a')
    v = tag.has_attr('id')
    print(v)
  13. get_text, get the text inside the tag

    tag = soup.find('a')
    v = tag.get_text()   # the optional argument is a separator string, not an attribute name
    print(v)
  14. index, get the position of a child within a tag

    tag = soup.find('body')
    v = tag.index(tag.find('div'))
    print(v)
    tag = soup.find('body')
    for i,v in enumerate(tag):
       print(i,v)
  15. is_empty_element, whether the tag is an empty/self-closing element, i.e. one of: 'br', 'hr', 'input', 'img', 'meta', 'spacer', 'link', 'frame', 'base'.

    tag = soup.find('br')
    v = tag.is_empty_element
    print(v)
  16. the current tag's related nodes

    soup.next
    soup.next_element
    soup.next_elements
    soup.next_sibling
    soup.next_siblings
    tag.previous
    tag.previous_element
    tag.previous_elements
    tag.previous_sibling
    tag.previous_siblings
    tag.parent
    tag.parents
  17. searching a tag's related nodes

    tag.find_next(...)
    tag.find_all_next(...)
    tag.find_next_sibling(...)
    tag.find_next_siblings(...)
    tag.find_previous(...)
    tag.find_all_previous(...)
    tag.find_previous_sibling(...)
    tag.find_previous_siblings(...)
    tag.find_parent(...)
    tag.find_parents(...) # 参数同find_all
  18. select, select_one: CSS selectors

    soup.select("title")
    soup.select("p nth-of-type(3)")
    soup.select("body a")
    soup.select("html head title")
    tag = soup.select("span,a")
    soup.select("head > title")
    soup.select("p > a")
    soup.select("p > a:nth-of-type(2)")
    soup.select("p > #link1")
    soup.select("body > a")
    soup.select("#link1 ~ .sister")
    soup.select("#link1 + .sister")
    soup.select(".sister")
    soup.select("[class~=sister]")
    soup.select("#link1")
    soup.select("a#link2")
    soup.select('a[href]')
    soup.select('a[href="http://example.com/elsie"]')
    soup.select('a[href^="http://example.com/"]')
    soup.select('a[href$="tillie"]')
    soup.select('a[href*=".com/el"]')
    from bs4.element import Tag
    def default_candidate_generator(tag):
       for child in tag.descendants:
           if not isinstance(child, Tag):
               continue
           if not child.has_attr('href'):
               continue
           yield child
    tags = soup.find('body').select("a", _candidate_generator=default_candidate_generator)
    print(type(tags), tags)
    from bs4.element import Tag
    def default_candidate_generator(tag):
       for child in tag.descendants:
           if not isinstance(child, Tag):
               continue
           if not child.has_attr('href'):
               continue
           yield child
    tags = soup.find('body').select("a", _candidate_generator=default_candidate_generator, limit=1)
    print(type(tags), tags)
  19. the tag's text content

    tag = soup.find('span')
    print(tag.string)     # 获取
    tag.string = 'new content' # 设置
    print(soup)
    tag = soup.find('body')
    print(tag.string)
    tag.string = 'xxx'
    print(soup)
    tag = soup.find('body')
    v = tag.stripped_strings # a generator over all the whitespace-stripped text fragments inside the tag
    print(list(v))
  20. append, append a tag inside the current tag

    tag = soup.find('body')
    tag.append(soup.find('a'))
    print(soup)
    from bs4.element import Tag
    obj = Tag(name='i',attrs={'id': 'it'})
    obj.string = '我是一个新来的'
    tag = soup.find('body')
    tag.append(obj)
    print(soup)
  21. insert, insert a tag at a given position inside the current tag

    from bs4.element import Tag
    obj = Tag(name='i', attrs={'id': 'it'})
    obj.string = '我是一个新来的'
    tag = soup.find('body')
    tag.insert(2, obj)
    print(soup)
  22. insert_after, insert_before: insert after or before the current tag

    from bs4.element import Tag
    obj = Tag(name='i', attrs={'id': 'it'})
    obj.string = '我是一个新来的'
    tag = soup.find('body')
    tag.insert_before(obj)
    tag.insert_after(obj)
    print(soup)
  23. replace_with, replace the current tag with the given tag

    from bs4.element import Tag
    obj = Tag(name='i', attrs={'id': 'it'})
    obj.string = '我是一个新来的'
    tag = soup.find('div')
    tag.replace_with(obj)
    print(soup)
  24. creating relationships between tags

    tag = soup.find('div')
    a = soup.find('a')
    tag.setup(previous_sibling=a)
    print(tag.previous_sibling)
  25. wrap, wrap the current tag inside the given tag

    from bs4.element import Tag
    obj1 = Tag(name='div', attrs={'id': 'it'})
    obj1.string = '我是一个新来的'
    tag = soup.find('a')
    v = tag.wrap(obj1)
    print(soup)
    tag = soup.find('a')
    v = tag.wrap(soup.find('p'))
    print(soup)
  26. unwrap, remove the current tag but keep its contents

    tag = soup.find('a')
    v = tag.unwrap()
    print(soup)

3. Supplementary notes

3.1 Constraints (abstract methods)

  • Used to force subclasses of a class to implement a given method; without it the class cannot be used. (A short demonstration follows the example below.)

    import abc
    class Base(metaclass=abc.ABCMeta):
       @abc.abstractmethod  # mark the method as abstract
       def send(self):pass
      
       def func(self):
           print("...")
    class Foo(Base):  # a subclass must define the marked method itself, otherwise it cannot be used
       def send(self):
           print("...")
    obj = Foo()
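
    A quick demonstration of the constraint (a minimal sketch; the class name Bad is made up for illustration):

    import abc

    class Base(metaclass=abc.ABCMeta):
       @abc.abstractmethod
       def send(self): pass

    class Bad(Base):   # forgets to implement send()
       pass

    try:
       Bad()
    except TypeError as e:
       print(e)   # Can't instantiate abstract class Bad ...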
    ​

     

4. Exercises

4.1 1.2/1~2

import requests
from bs4 import BeautifulSoup
​
def Reptile():
   response = requests.get("https://www.autohome.com.cn/news/")
   response.encoding = "gbk"
   print(response.text)
   soup = BeautifulSoup(response.text, "html.parser")
   div = soup.find(name="div", attrs={"id": "auto-channel-lazyload-article"})
​
   li_list = div.find_all(name="li")
​
   for li in li_list:
​
       title = li.find(name="h3")
       if not title:
           continue
       p = li.find(name="p")
       a = li.find(name="a")
​
       print(title.text)
       print(a.attrs.get("href"))
       print(p.text)
​
       img = li.find(name="img")
       src = img.get("src")
       print(src)
       try:
           # 再次发起请求,下载照片
           file_name = src.rsplit("/", maxsplit=1)[1]
           print(file_name)
           ret = requests.get(src)
           with open(file_name, "wb") as f:
               f.write(ret.content)
       except Exception as e:
           continue
​
​
# 按间距中的绿色按钮以运行脚本。
if __name__ == '__main__':
   Reptile()

4.2 1.3

import requests
from bs4 import BeautifulSoup
​
r1 = requests.get(
   url='https://dig.chouti.com/',
   #放置请求头的信息
   #user-agent 用户设备信息
   headers={
       'user-agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
  }
)
​
soup = BeautifulSoup(r1.text,'html.parser')
​
# 标签对象
content_list = soup.find(name='div',id='content-list')
# print(content_list)
# [标签对象,标签对象]
item_list = content_list.find_all(name='div',attrs={'class':'item'})
for item in item_list:
   a = item.find(name='a',attrs={'class':'show-content color-chag'})
   print(a.text.strip())
   # print(a.text)

4.3 1.3|1.4

import requests
# 1. 查看首页
r1 = requests.get(
   url='https://dig.chouti.com/',
   headers={
       'user-agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
  }
)
​
# 2. 提交用户名和密码
r2 = requests.post(
   url='https://dig.chouti.com/login',
   headers={
       'user-agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
  },
   #需要登陆请求体中需要携带,用户名和密码
   data={
       'phone':'8613121758648',
       'password':'woshiniba',
       'oneMonth':1
  },
   #应为抽屉的方式是通过,第一次访问在cookies里有一个随机数,只有在登陆时携带才可以。
   cookies=r1.cookies.get_dict()
)
​
​
# 3. 点赞
r3 = requests.post(
   url='https://dig.chouti.com/link/vote?linksId=20435396',
   headers={
       'user-agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
  },
   #将用户登陆后的cookies携带
   cookies=r1.cookies.get_dict()
)
print(r3.text)

4.4 1.2

import requests
from bs4 import BeautifulSoup
​
# ############## 方式一 ##############
#
# # 1. 访问登陆页面,获取 authenticity_token
i1 = requests.get('https://github.com/login')
soup1 = BeautifulSoup(i1.text, features='lxml')
tag = soup1.find(name='input', attrs={'name': 'authenticity_token'})
authenticity_token = tag.get('value')
c1 = i1.cookies.get_dict()
i1.close()
#
# # 1. 携带authenticity_token和用户名密码等信息,发送用户验证
form_data = {
"authenticity_token": authenticity_token,
   "utf8": "",
   "commit": "Sign in",
   "login": "[email protected]",
   'password': 'xxoo'
}
#
i2 = requests.post('https://github.com/session', data=form_data, cookies=c1)
c2 = i2.cookies.get_dict()
c1.update(c2)
i3 = requests.get('https://github.com/settings/repositories', cookies=c1)
​
soup3 = BeautifulSoup(i3.text, features='lxml')
list_group = soup3.find(name='div', class_='listgroup')
#
from bs4.element import Tag
#
for child in list_group.children:
   if isinstance(child, Tag):
       project_tag = child.find(name='a', class_='mr-1')
       size_tag = child.find(name='small')
       temp = "项目:%s(%s); 项目路径:%s" % (project_tag.get('href'), size_tag.string, project_tag.string, )
       print(temp)

Chapter 2

1.scrapy

1. Installation

  • On Windows you need to install twisted, pywin32 and scrapy.

    pip install twisted
    pip install pywin32
    pip install scrapy

1.1 Basics

  1. Create a project, create a spider, and run it (a concrete example follows the commands below).

    #create a project and give it a name
    scrapy startproject project_name
    #create a spider, giving it a name and the site to crawl, e.g. "taobao.com"
    scrapy genspider name url
    #run the spider (add --nolog to hide the log output)
    scrapy crawl name
    scrapy crawl name --nolog
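
    For example, with the chouti spider used throughout these notes (the project name xdb is only an example):

    scrapy startproject xdb
    cd xdb
    scrapy genspider chouti chouti.com
    scrapy crawl chouti --nolog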
  2. What the generated project files are for

    - project name
    - project name
      - spiders          #the spider files
      - items.py         #item (structured data) definitions, used for persistence
      - middlewares.py   #middlewares
      - pipelines.py     #persistence
      - settings.py      #configuration (crawler-related)
      - scrapy.cfg       #configuration (deployment-related)
  3. A first look at a spider file

    import scrapy
    #HtmlResponse is the class of the freshly downloaded response; it offers many more helper methods
    from scrapy.http.response.html import HtmlResponse
    #if the spider will not run because of console encoding errors, try:
    import sys,os,io
    sys.stdout=io.TextIOWrapper(sys.stdout.buffer,encoding="gb18030")
    ​
    class ChoutiSpider(scrapy.Spider):
       name = "chouti"#the spider's name
       allowed_domains = ["chouti.com"]#a focused crawl: only this site is crawled
       start_urls = ["http://chouti.com/"]#the first URLs to be crawled
       
       def parse(self,response):#the default callback; response holds the downloaded page
           f = open("news.log",mode="a+")
    #.xpath selects nodes: // searches all descendants of the document, .// searches descendants of the current node, / selects direct children.
           item_list = response.xpath('//div[@class="link-con"]/div')
           for item in item_list:
               #text() gets the text content, extract_first() returns the first match, @href gets the href attribute.
               text = item.xpath('.//div[@class="link-detail"]/a/text()').extract_first()
               href = item.xpath('.//div[@class="link-detail"]/a/@href').extract_first()
               print(text.strip())
               f.write(href+"\n")
           f.close()
           #extract() converts the selectors into plain values
           page_list = response.xpath('//div[@id="dig-lcpage"]//a/@href').extract()
           for page in page_list:
               from scrapy.http import Request
               page = "https://dig.chouti.com"+page
               #join the URL, wrap it in a Request and yield it; parse is then called again for each new page
               yield Request(url=page,callback=self.parse)
           
  4. Dividing the work between pipelines.py, items.py, settings.py and the spider

    pipelines.py
    ​
    from itemadapter import ItemAdapter
    from scrapy.exceptions import DropItem
    class AaPipeline:
       
       def __init__(self,path):
           self.f = None
           self.path = path
       @classmethod
       def from_crawler(cls,crawler):
           """初始化时候,用于创建pipeline对象"""
           path = crawler.settings.get("HREF_FILE_PATH")#在所有配置文件中寻找
           return cls(path)
       
       def open_spider(self,spider):
           """爬虫开始时执行"""
           self.f = open(self.path,"a+")
       def process_item(self, item, spider):  # used together with items.py
           self.f.write(item["url"]+"\n")
           return item  # if you do not return the item, the next pipeline has nothing to work on
           # raise DropItem()  # raise this instead if the item should not be processed any further
       def close_spider(self,spider):
           """爬虫结束时执行"""
           self.f.close()
    ​
    items.py
    ​
    import scrapy
    ​
    class AaItem(scrapy.Item):
       # define the fields for your item here like:
       # name = scrapy.Field()
       title= scrapy.Field()#需要将其定义
       url = scrapy.Field()
    ​
    settings.py
    ​
    ITEM_PIPELINES = {#这个默认是注释掉的,取消注释后pipelines才可以用。
       'aa.pipelines.AaPipeline': 300,
    }
    HREF_FILE_PATH = "www.log"
    ​
    spiders/chouti.py
    ​
    import scrapy
    from aaa.items import AaItem#调用那个类
    ​
    class ChoutiSpider(scrapy.Spider):
       name = "chouti"
       allowed_domains = ["chouti.com"]
       start_urls = ["http://chouti.com/"]
       
       def parse(self,response):
           item_list = response.xpath('//div[@class="link-con"]/div')
           for item in item_list:
               text = item.xpath('.//div[@class="link-detail"]/a/text()').extract_first()
               href = item.xpath('.//div[@class="link-detail"]/a/@href').extract_first()
               print(text.strip())
               yield AaItem(title=text,url=href)#将数据传入刚定义的方法中
               
           page_list = response.xpath('//div[@id="dig-lcpage"]//a/@href').extract()
           for page in page_list:
               from scrapy.http import Request
               page = "https://dig.chouti.com"+page
               yield Request(url=page,callback=self.parse)

1.2 Deduplication and depth

  1. The same URL tends to show up repeatedly while crawling. Scrapy already provides deduplication by default; here we build an equivalent filter ourselves to see how it works.

    settings.py
    ​
    #this points at scrapy's own filter by default; change it to point at our class
    #DUPEFILTER_CLASS = "scrapy.dupefilter.RFPDupeFilter"
    DUPEFILTER_CLASS = "aaa.dupefilters.AaaDupeFilter"
    ​
    DEPTH_LIMIT = 3#限制深度为3
    ​
    dupefilters.py
    ​
    from scrapy.dupefilter import BaseDupeFilter
    #url有的长有的短,有些url后面的值顺序调转一下md5值就不一样,我们只能用他了。
    from scrapy.utils.request import request_fingerprint
    ​
    class AaaDupeFilter(BaseDupeFilter):
       def __init__(self):
           self.visited_fd = set()#初始化一个集合
           
       @classmethod
       def from_settings(cls,settings):
           return cls()#调用自己
       def request_seen(self,request):
           fd = request_fingerprint(request=request)#将url转为指定长度的值
           if fd in self.visited_fd:#判断是否被请求过
               return True
           self.visited_fd.add(fd)#没有请求果就添加一下
            
       def open(self):
           print("开始")
       def close(self,reason):
           print("结束")
       def log(self,request,spider):
           print("日志")
    ​
    spiders/chouti.py
    ​
    import scrapy
    from aaa.items import AaItem#调用那个类
    ​
    class ChoutiSpider(scrapy.Spider):
       name = "chouti"
       allowed_domains = ["chouti.com"]
       start_urls = ["http://chouti.com/"]
       
       def parse(self,response):
           
           print(response.request.url,response.meta.get("depth",0))#查看当前请求的url,和深度
           item_list = response.xpath('//div[@class="link-con"]/div')
           for item in item_list:
               text = item.xpath('.//div[@class="link-detail"]/a/text()').extract_first()
               href = item.xpath('.//div[@class="link-detail"]/a/@href').extract_first()
               print(text.strip())
               
           page_list = response.xpath('//div[@id="dig-lcpage"]//a/@href').extract()
           for page in page_list:
               from scrapy.http import Request
               page = "https://dig.chouti.com"+page
               #dont_filter=True时去去重无效
               yield Request(url=page,callback=self.parse,dont_filter=True)

1.3 Carrying cookies

  1. Logging in to chouti by carrying cookies

    # -*- coding: utf-8 -*-
    import scrapy
    from scrapy.http.response.html import HtmlResponse
    from xdb.items import XdbItem
    import scrapy
    from scrapy.http.cookies import CookieJar
    from scrapy.http import Request
    from urllib.parse import urlencode
    ​
    class ChoutiSpider(scrapy.Spider):
       name = 'chouti'
       allowed_domains = ['chouti.com']
       start_urls = ['https://dig.chouti.com/']
       cookie_dict = {}#做一个全局字典
       def parse(self, response):
           # 去响应头中获取cookie,cookie保存在cookie_jar对象
           cookie_jar = CookieJar()
           cookie_jar.extract_cookies(response, response.request)
           # 去对象中将cookie解析到字典
           for k, v in cookie_jar._cookies.items():
               for i, j in v.items():
                   for m, n in j.items():
                       self.cookie_dict[m] = n.value
    ​
           yield Request(
               url='https://dig.chouti.com/login',
               method='POST',
               body="phone=8613121758648&password=woshiniba&oneMonth=1",
               cookies=self.cookie_dict,#将cookies携带
               headers={#设置请求头
                   'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'
              },
               callback=self.check_login#写一个回调函数
          )
    ​
       def check_login(self,response):
           print(response.text)
    ​
           yield Request(
               url='https://dig.chouti.com/all/hot/recent/1',
               cookies=self.cookie_dict,
               callback=self.index
          )
    ​
       def index(self,response):#寻找到新闻并点赞
           news_list = response.xpath('//div[@id="content-list"]/div[@class="item"]')
           for new in news_list:
               link_id = new.xpath('.//div[@class="part2"]/@share-linkid').extract_first()
               yield Request(
                   url='http://dig.chouti.com/link/vote?linksId=%s' % (link_id,),
                   method='POST',
                   cookies=self.cookie_dict,
                   callback=self.check_result
              )
    ​
           page_list = response.xpath('//div[@id="dig_lcpage"]//a/@href').extract()
           for page in page_list:
               page = "https://dig.chouti.com" + page
               yield Request(url=page, callback=self.index)  # https://dig.chouti.com/all/hot/recent/2
    ​
       def check_result(self, response):
           print(response.text)

1.4 Start URLs

  1. How the start URLs work internally: the scrapy engine asks the spider for its start requests by calling start_requests(); the return value must be iterable (it is wrapped with iter()), and the engine then calls __next__() repeatedly, putting every request into the scheduler. (A simplified sketch of this follows the code below.)

    # -*- coding: utf-8 -*-
    import scrapy
    from scrapy.http.response.html import HtmlResponse
    from xdb.items import XdbItem
    import scrapy
    from scrapy.http.cookies import CookieJar
    from scrapy.http import Request
    from urllib.parse import urlencode
    ​
    class ChoutiSpider(scrapy.Spider):
       name = 'chouti'
       allowed_domains = ['chouti.com']
       start_urls = ['https://dig.chouti.com/']
       
       def start_requests(self):
           for url in self.start_urls:
               yield Request(url=url)
       
       def parse(self, response):
           pass
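
    A rough sketch of what the engine does with that return value (simplified pseudocode, not scrapy's real implementation; fake_engine is a made-up name):

    def fake_engine(spider):
       # take whatever start_requests() returned ...
       result = spider.start_requests()
       # ... make sure it is an iterator ...
       it = iter(result)
       # ... and pull the requests out one by one, handing them to the scheduler
       while True:
           try:
               request = it.__next__()
           except StopIteration:
               break
           print("put into scheduler:", request)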

1.5 Proxies

  1. Built-in proxy support

     • Scrapy has built-in proxy support. The proxies are passed through the process environment variables, and a variable is treated as a proxy if its name ends in "_PROXY". They have to be set before the spider starts requesting, which is why the start_requests method is used here; alternatively a proxy can be passed per request via meta.

      # -*- coding: utf-8 -*-
      import scrapy
      from scrapy.http import Request
      ​
      class ChoutiSpider(scrapy.Spider):
         name = 'chouti'
         allowed_domains = ['chouti.com']
         start_urls = ['https://dig.chouti.com/']
         
          def start_requests(self):
              import os
              #set the proxies in the environment variables
              os.environ["HTTPS_PROXY"] = "http://root:[email protected]:9999/"
              os.environ["HTTP_PROXY"] = "192.168.11.11"
              #or pass the proxy per request through meta
              for url in self.start_urls:
                  yield Request(url=url,meta={"proxy":"http://root:[email protected]:9999/"})
         
          def parse(self, response):
              pass
  2. Custom proxy middleware

    #settings.py
    #自定义的代理需要注册才可以使用,只要注册上自带的就不会被使用了。
    DOWNLOADER_MIDDLEWARES = {
      #'xdb.middlewares.XdbDownloaderMiddleware': 543,
       'xdb.proxy.XdbProxyMiddleware':751,
    }
    ​
    ​
    #proxy.py
    #这两种都可是完成这个功能
    import base64
    import random
    from six.moves.urllib.parse import unquote
    try:
       from urllib2 import _parse_proxy
    except ImportError:
       from urllib.request import _parse_proxy
    from six.moves.urllib.parse import urlunparse
    from scrapy.utils.python import to_bytes
    ​
    class XdbProxyMiddleware(object):
    ​
       def _basic_auth_header(self, username, password):
           user_pass = to_bytes(
               '%s:%s' % (unquote(username), unquote(password)),
               encoding='latin-1')
           return base64.b64encode(user_pass).strip()
    ​
       def process_request(self, request, spider):
           PROXIES = [
               "http://root:[email protected]:9999/",
               "http://root:[email protected]:9999/",
               "http://root:[email protected]:9999/",
               "http://root:[email protected]:9999/",
               "http://root:[email protected]:9999/",
               "http://root:[email protected]:9999/",
          ]
           url = random.choice(PROXIES)
    ​
           orig_type = ""
           proxy_type, user, password, hostport = _parse_proxy(url)
           proxy_url = urlunparse((proxy_type or orig_type, hostport, '', '', '', ''))
    ​
           if user:
               creds = self._basic_auth_header(user, password)
           else:
               creds = None
           request.meta['proxy'] = proxy_url
           if creds:
               request.headers['Proxy-Authorization'] = b'Basic ' + creds
    ​
    ​
    class DdbProxyMiddleware(object):
       def process_request(self, request, spider):
           PROXIES = [
              {'ip_port': '111.11.228.75:80', 'user_pass': ''},
              {'ip_port': '120.198.243.22:80', 'user_pass': ''},
              {'ip_port': '111.8.60.9:8123', 'user_pass': ''},
              {'ip_port': '101.71.27.120:80', 'user_pass': ''},
              {'ip_port': '122.96.59.104:80', 'user_pass': ''},
              {'ip_port': '122.224.249.122:8088', 'user_pass': ''},
          ]
           proxy = random.choice(PROXIES)
           if proxy['user_pass'] is not None:
               request.meta['proxy'] = to_bytes("http://%s" % proxy['ip_port'])
               encoded_user_pass = base64.b64encode(to_bytes(proxy['user_pass']))
               request.headers['Proxy-Authorization'] = to_bytes('Basic ' + encoded_user_pass)
           else:
               request.meta['proxy'] = to_bytes("http://%s" % proxy['ip_port'])

     

1.6 Selectors

  1. The selector is just XPath, and it can also be used on its own, outside of a spider

    #!/usr/bin/env python
    # -*- coding:utf-8 -*-
    from scrapy.selector import Selector, HtmlXPathSelector
    from scrapy.http import HtmlResponse
    html = """<!DOCTYPE html>
    <html>
      <head lang="en">
          <meta charset="UTF-8">
          <title></title>
      </head>
      <body>
          <ul>
              <li class="item-"><a id='i1' href="link.html">first item</a></li>
              <li class="item-0"><a id='i2' href="llink.html">first item</a></li>
              <li class="item-1"><a href="llink2.html">second item<span>vv</span></a></li>
          </ul>
          <div><a href="llink2.html">second item</a></div>
      </body>
    </html>
    """
    #伪造request
    response = HtmlResponse(url='http://example.com', body=html,encoding='utf-8')
    # hxs = HtmlXPathSelector(response)
    # a = Selector(response=response)
    # hxs = a.xpath('//a')#找a标签
    # hxs = a.xpath('//a[2]')   #找a标签中的第二个
    # hxs = a.xpath('//a[@id]')#找a标签有id属性的
    # hxs = a.xpath('//a[@id="i1"]')#找a标签有id=i1的7
    # hxs = a.xpath('//a[@href="link.html"][@id="i1"]')#找a标签href=link.html并且id=i1的
    # hxs = a.xpath('//a[contains(@href, "link")]')#找a标签href属性中有link的
    # hxs = a.xpath('//a[starts-with(@href, "link")]')#找a标签href属性是否以link开头
    # hxs = a.xpath('//a[re:test(@id, "i\d+")]')#找a标签id=i\d+的
    # hxs = a.xpath('//a[re:test(@id, "i\d+")]/text()').extract()#找a标签id=i\d+的所有文本
    # hxs = a.xpath('//a[re:test(@id, "i\d+")]/@href').extract()#找a标签id=i\d+的所有href
    # hxs = a.xpath('/html/body/ul/li/a/@href').extract()#找a标签所有href
    # hxs = a.xpath('//body/ul/li/a/@href').extract_first()#找a标签第一个href
    # print(hxs)
    
    # ul_list = Selector(response=response).xpath('//body/ul/li')
    # for item in ul_list:
    #     v = item.xpath('./a/span')
    #     # 或
    #     # v = item.xpath('a/span')
    #     # 或
    #     # v = item.xpath('*/a/span')
    #     print(v)

2. Algorithms

2.1 Algorithm basics

  • Time complexity: we do not care about the exact running time, only the approximate growth rate.

    For example:
    #a single print costs a fixed amount of time; three prints are still treated the same -
    #the 3 never shows up, because constant factors are dropped.
    print("aaa")#O(1)

    print("bbb")#O(1)
    print("bbb")
    print("bbb")
    #mathematically, log asks "2 to what power equals n", e.g. log2(64)=6
    #whenever the problem is halved at every step the complexity is logn; logn means log2(n), since computers mostly work in base 2
    while n>1:#O(logn), i.e. O(log2 n)
       print(n)
       n = n//2
      • Time complexity is an expression used to estimate how an algorithm's running time grows.

      • In general, an algorithm with a higher time complexity is slower than one with a lower complexity.

      • Common time complexities, from fast to slow: O(1) < O(logn) < O(n) < O(nlogn) < O(n^2) < O(n^3) < O(2^n)

  • Recursion

    def func1(x):# 5,4,3,2,1
       if x>0:
           print(x)
           func1(x-1)
    def func2(x):# 1,2,3,4,5
       if x>0:
           func2(x-1)
           print(x)

2.2 Searching

  1. A list can be searched sequentially (linear search) or with binary search. Binary search works on a sorted list: comparing the target with the middle element of the candidate range discards half of the candidates at each step. (A linear-search sketch follows the code below for comparison.)

    def binary_search(li,val):
       low = 0
       high = len(li) - 1
       while low <= high:
           mid = (low+high) //2
           if li[mid] > val:
               high = mid -1
           elif li[mid] < val:
               low = mid + 1
           else:
               return mid
       else:
           return -1
    li = range(0,1222222)
    print(binary_search(li,305))
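
    For comparison, a minimal sketch of the linear (sequential) search mentioned above - O(n), but the list does not need to be sorted:

    def linear_search(li, val):
       for ind, v in enumerate(li):
           if v == val:
               return ind
       return -1

    print(linear_search(list(range(0, 100)), 30))   # 30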

2.3 Sorting

  1. Bubble sort: walk the list comparing each pair of adjacent elements; if the first is larger than the second, swap them.

    def bubble_sort(li):
       for i in range(0,len(li)-1):
           for j in range(0,len(li)-i-1):
               if li[j] >li[j+1]:
                   li[j],li[j+1] = li[j+1],li[j]
    import random
    li = list(range(10000))
    random.shuffle(li)
    bubble_sort(li)
    print(li)
  2. Selection sort: each pass finds the smallest remaining element and swaps it into the first unsorted position.

    def select_sort(li):
       for i in range(len(li)-1):
           min_loc = i
           for j in range(i+1,len(li)):
               if li[min_loc] > li[j]:
                   min_loc = j
           li[min_loc],li[i] = li[i],li[min_loc]
    import random
    li = list(range(10000))
    random.shuffle(li)
    select_sort(li)
    print(li)
  3. Insertion sort: the list is split into a sorted part and an unsorted part; at first the sorted part holds one element. Each step takes one element from the unsorted part and inserts it into its proper place in the sorted part, until the unsorted part is empty.

    def insert_sort(li):
       for i in range(1,len(li)):
           tmp = li[i]
           j = i - 1
           while j >= 0 and tmp < li[j]:
               li[j + 1] = li[j]
               j = j - 1
           li[j + 1] = tmp
    import random
    li = list(range(10000))
    random.shuffle(li)
    insert_sort(li)
    print(li)
     • Space complexity: O(1); time complexity: O(n^2)

  4. Quick sort: pick an element p (the first one) and move it to its final position so that everything to its left is smaller and everything to its right is larger; then sort both sides recursively.

    def quick_sort(li,left,right):
       if left < right:
           mid = partition(li,left,right)
           quick_sort(li,left,mid-1)
           quick_sort(li,mid+1,right)
    def partition(li,left,right):
       tmp = li[left]
       while left < right:
           while left < right and li[right] >= tmp:
               right -= 1
           li[left] = li[right]
           while left < right and li[left] <= tmp:
               left += 1
           li[right] = li[left]
       li[left] = tmp
       return left
    import random
    li = list(range(10000))
    random.shuffle(li)
    quick_sort(li,0,len(li)-1)
    print(li)
       
  5. Heap sort

     1. A tree is a data structure that can be defined recursively.

     2. A binary tree is a tree in which every node has at most two children. A full binary tree has every level completely filled; a complete binary tree may only have nodes missing on the right side of the lowest level.

     3. When a binary tree is stored in an array, the node at index n has its left child at index 2n+1 and its right child at index 2n+2 (a small worked example follows below).

     4. Heap sort uses a heap, which is either a max-heap or a min-heap: in a max-heap every node is greater than or equal to its children, in a min-heap every node is smaller than or equal to its children.
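
    A quick worked example of that index arithmetic, using a small max-heap stored as a list:

    li = [9, 7, 8, 3, 5]           # a max-heap laid out in an array
    n = 1                          # the node at index 1 holds 7
    print(li[2*n+1], li[2*n+2])    # its children: 3 5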

    def sift(li,low,high):
       #sift down: low is the root of the subtree, high is the last valid index of the heap
       i = low
       j = 2*i+1
       tmp = li[low]
       while j <= high:
           if j+1 <= high and li[j+1] > li[j]:
               j += 1
           if li[j] > tmp:
               li[i] = li[j]
               i = j
               j = 2*i+1
           else:
               break
       li[i] = tmp
    def heap_sort(li):
       n = len(li)
       for i in range(n // 2 -1, -1,-1):
           sift(li,i,n-1)
       for i in range(n-1,-1,-1):
           li[0],li[i] = li[i],li[0]
           sift(li,0,i-1)
    import random
    li = list(range(100000,1,-1))
    random.shuffle(li)
    heap_sort(li)
    print(li)
    #python's standard library also provides a heap implementation (heapq)
    import heapq
    li = [4,3,2,1,5,6,7,8]
    heapq.heapify(li)
    print(li)
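
    Note that heapify only turns the list into a (min-)heap; to get sorted output you pop repeatedly:

    import heapq

    li = [4,3,2,1,5,6,7,8]
    heapq.heapify(li)   # li is now a min-heap, not a sorted list
    print([heapq.heappop(li) for _ in range(len(li))])   # [1, 2, 3, 4, 5, 6, 7, 8]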
  6. Merge sort: split the list down to single elements, then merge the sorted pieces back together in order.

    def merge(li,low,mid,high):
       i = low
       j = mid +1
       ltmp = []
       while i <= mid and j <= high:
           if li[i] <= li[j]:
               ltmp.append(li[i])
               i += 1
           else:
               ltmp.append(li[j])
               j += 1
       while i <= mid:
           ltmp.append(li[i])
           i += 1
       while j <= high:
           ltmp.append(li[j])
           j += 1
       li[low:high + 1] = ltmp
    def merge_sort(li,low,high):
       if low < high:
           mid = (low + high) // 2
           merge_sort(li,low,mid)
           merge_sort(li,mid+1,high)
           merge(li,low,mid,high)
    li = [10,4,5,6,7,8,3,2]
    merge_sort(li,0,len(li)-1)
    print(li)

Chapter 3

1. Middleware and custom commands

1.1 Downloader middleware

  1. Downloader middleware is mainly used to modify requests before they are downloaded.

    # settings.py
    #注册中间件
    DOWNLOADER_MIDDLEWARES = {
      #'xdb.middlewares.XdbDownloaderMiddleware': 543,
       'xdb.md.Md1':666,
    }
    # md.py
    ​
    from scrapy.http import HtmlResponse
    from scrapy.http import Request
    ​
    class Md1(object):
       @classmethod
       def from_crawler(cls, crawler):
           # This method is used by Scrapy to create your spiders.
           s = cls()
           return s
    ​
       def process_request(self, request, spider):
           # Called for each request that goes through the downloader
           # middleware.
    ​
           # Must either:
           # - return None: continue processing this request
           # - or return a Response object
           # - or return a Request object
           # - or raise IgnoreRequest: process_exception() methods of
           #   installed downloader middleware will be called
           print('md1.process_request',request)
           # 1. Return a Response object: whatever you return here is what the spider ends up receiving
           # import requests
           # result = requests.get(request.url)
           # return HtmlResponse(url=request.url, status=200, headers=None, body=result.content)
           # 2. Return a Request: it is fed back into scheduling/filtering rather than being downloaded
           # return Request('https://dig.chouti.com/r/tec/hot/1')

           # 3. Raise an exception
           # from scrapy.exceptions import IgnoreRequest
           # raise IgnoreRequest

           # 4. Modify the request in place - the usual use case (*)
           # request.headers['user-agent'] = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
    ​
           pass
    ​
       def process_response(self, request, response, spider):
           # Called with the response returned from the downloader.
    ​
           # Must either;
           # - return a Response object
           # - return a Request object
           # - or raise IgnoreRequest
           print('m1.process_response',request,response)
           return response
    ​
       def process_exception(self, request, exception, spider):
           #使用该方法去接收错误
           # Called when a download handler or a process_request()
           # (from other downloader middleware) raises an exception.
    ​
           # Must either:
           # - return None: continue processing this exception
           # - return a Response object: stops process_exception() chain
           # - return a Request object: stops process_exception() chain
           pass

1.2 Spider middleware

  1. Spider middleware is used much less often; a rough understanding is enough.

    #settings.py
    #设置爬虫中间件
    SPIDER_MIDDLEWARES = {
       'xdb.sd.Sd1': 666,
    }
    ​
    class Sd1(object):
       # Not all methods need to be defined. If a method is not defined,
       # scrapy acts as if the spider middleware does not modify the
       # passed objects.
    ​
       @classmethod
       def from_crawler(cls, crawler):
           # This method is used by Scrapy to create your spiders.
           s = cls()
           return s
    ​
       def process_spider_input(self, response, spider):
           # Called for each response that goes through the spider
           # middleware and into the spider.
    ​
           # Should return None or raise an exception.
           return None
    ​
       def process_spider_output(self, response, result, spider):
           # Called with the results returned from the Spider, after
           # it has processed the response.
    ​
           # Must return an iterable of Request, dict or Item objects.
           for i in result:
               yield i
    ​
       def process_spider_exception(self, response, exception, spider):
           # Called when a spider or process_spider_input() method
           # (from other spider middleware) raises an exception.
    ​
           # Should return either None or an iterable of Response, dict
           # or Item objects.
           pass
    ​
       # runs only once, when the spider starts; it receives what the spider's start_requests returned
       def process_start_requests(self, start_requests, spider):
           # Called with the start requests of the spider, and works
           # similarly to the process_spider_output() method, except
           # that it doesn’t have a response associated.
    ​
           # Must return only requests (not items).
           for r in start_requests:
               yield r
    ​

1.3 Custom commands

  1. Running a single spider from a script; put the file in the directory where you would normally run the command.

    import sys
    from scrapy.cmdline import execute
    if __name__ == '__main__':
       execute(["scrapy","crawl","chouti","--nolog"])
  2. Running several spiders with one custom command

    1. Create a directory (any name, e.g. commands) next to spiders
    2. Inside it create crawlall.py (the file name becomes the command name)
    3. In settings.py add: COMMANDS_MODULE = '<project name>.<directory name>'
    4. Run the command from the project directory: scrapy crawlall
    #crawlall.py
    from scrapy.commands import ScrapyCommand
    from scrapy.utils.project import get_project_settings
    ​
    class Command(ScrapyCommand):
    ​
       requires_project = True
    ​
       def syntax(self):
           return '[options]'
    ​
       def short_desc(self):
           return 'Runs all of the spiders'
    ​
       def run(self, args, opts):
           spider_list = self.crawler_process.spiders.list()
           for name in spider_list:
               self.crawler_process.crawl(name, **opts.__dict__)
           self.crawler_process.start()

2. Signals

2.1 Django

  1. Django provides "signal dispatch" to decouple code from the framework's actions: when certain events happen, the sender notifies a set of registered receivers.

    Model signals
       pre_init                    # django的modal执行其构造方法前,自动触发
       post_init                   # django的modal执行其构造方法后,自动触发
       pre_save                    # django的modal对象保存前,自动触发
       post_save                   # django的modal对象保存后,自动触发
       pre_delete                  # django的modal对象删除前,自动触发
       post_delete                 # django的modal对象删除后,自动触发
       m2m_changed                 # django的modal中使用m2m字段操作第三张表(add,remove,clear)前后,自动触发
       class_prepared              # 程序启动时,检测已注册的app中modal类,对于每一个类,自动触发
    Management signals
       pre_migrate                 # 执行migrate命令前,自动触发
       post_migrate                # 执行migrate命令后,自动触发
    Request/response signals
       request_started             # 请求到来前,自动触发
       request_finished            # 请求结束后,自动触发
       got_request_exception       # 请求异常后,自动触发
    Test signals
       setting_changed             # 使用test测试修改配置文件时,自动触发
       template_rendered           # 使用test测试渲染模板时,自动触发
    Database Wrappers
       connection_created          # 创建数据库连接时,自动触发
  2. For Django's built-in signals you only register a callback for the signal; it is then called automatically whenever the corresponding action happens:

       from django.core.signals import request_finished
       from django.core.signals import request_started
       from django.core.signals import got_request_exception
    ​
       from django.db.models.signals import class_prepared
       from django.db.models.signals import pre_init, post_init
       from django.db.models.signals import pre_save, post_save
       from django.db.models.signals import pre_delete, post_delete
       from django.db.models.signals import m2m_changed
       from django.db.models.signals import pre_migrate, post_migrate
    ​
       from django.test.signals import setting_changed
       from django.test.signals import template_rendered
    ​
       from django.db.backends.signals import connection_created
    ​
    ​
       def callback(sender, **kwargs):
           print("xxoo_callback")
           print(sender,kwargs)
    ​
       xxoo.connect(callback)
       # xxoo指上述导入的内容
  3. Example

    # xx/app/models.py
    ​
    from django.db import models
    ​
    class User(models.Model):
    ​
       title = models.CharField(max_length=32)
    #xx/xx/init.py
    ​
    from django.db.models import signals
    ​
    def before_save1(*args,**kwargs):
       print('有车来了,我要服务了--》',args,kwargs)
    ​
    def after_save1(*args,**kwargs):
       print('有车来了,完事了--》',args,kwargs)
    ​
    signals.pre_save.connect(before_save1)
    signals.post_save.connect(after_save1)
    ​
    # xx/app/view.py
    ​
    from django.shortcuts import render,HttpResponse
    from app01 import models
    ​
    def func1(request):
       # models.User.objects.create(title='老男孩')
       return HttpResponse('创建成功')

2.2 Scrapy

  1. Scrapy signals work the same way: you write a callback and connect it to the signal you care about.

    # ext.py
    ​
    # by luffycity.com
    from scrapy import signals
    ​
    class MyExtend(object):
       def __init__(self):
           pass
    ​
       @classmethod
       def from_crawler(cls, crawler):
           self = cls()
    #信号的设置
           crawler.signals.connect(self.x1, signal=signals.spider_opened)
           crawler.signals.connect(self.x2, signal=signals.spider_closed)
    ​
           return self
    ​
       def x1(self, spider):
           print('open')

       def x2(self, spider):
           print('close')
    ​
    # settings.py
    EXTENSIONS = {
       'xdb.ext.MyExtend':666,
    }

3. Scrapy-redis

  • A component for building distributed crawlers.
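
    It is a separate package, installed with pip:

    pip install scrapy-redis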

3.1 Deduplication

  1. Deduplication backed by redis. You could write the filter yourself, but scrapy-redis already ships one, so we reuse it (it builds its redis key from a timestamp, which we pin to a fixed value).

  2. Writing your own

    # settings.py
    ​
    DUPEFILTER_CLASS = 'dbd.xxx.DupFilter'
    ​
    # xxx.py
    ​
    from scrapy.dupefilter import BaseDupeFilter
    import redis
    from scrapy.utils.request import request_fingerprint
    ​
    class DupFilter(BaseDupeFilter):
       def __init__(self):
           self.conn = redis.Redis(host='140.143.227.206',port=8888,password='beta')

       def request_seen(self, request):
           """
           Check whether the current request has been seen before.
           :param request:
           :return: True if it was already visited; False otherwise
           """
           fid = request_fingerprint(request)
           result = self.conn.sadd('visited_urls', fid)
           if result == 1:
               return False
           return True
  3. Reusing the filter shipped with scrapy-redis (with a pinned key)

    # settings.py
    ​
    # ############### scrapy redis连接 ####################
    ​
    REDIS_HOST = '140.143.227.206'                            # 主机名
    REDIS_PORT = 8888                                   # 端口
    REDIS_PARAMS = {'password':'beta'}                                  # Redis连接参数             默认:REDIS_PARAMS = {'socket_timeout': 30,'socket_connect_timeout': 30,'retry_on_timeout': True,'encoding': REDIS_ENCODING,})
    REDIS_ENCODING = "utf-8"                            # redis编码类型             默认:'utf-8'
    # REDIS_URL = 'redis://user:pass@hostname:9001'       # 连接URL(优先于以上配置)
    ​
    # ############### scrapy redis去重 ####################
    ​
    DUPEFILTER_KEY = 'dupefilter:%(timestamp)s'
    ​
    #自带的
    # DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'
    #复制的
    DUPEFILTER_CLASS = 'dbd.xxx.RedisDupeFilter'
    ​
    ​
    # xxx.py
    ​
    from scrapy_redis.dupefilter import RFPDupeFilter
    from scrapy_redis.connection import get_redis_from_settings
    from scrapy_redis import defaults
    ​
    class RedisDupeFilter(RFPDupeFilter):
       @classmethod
       def from_settings(cls, settings):
           """Returns an instance from given settings.
    ​
          This uses by default the key ``dupefilter:<timestamp>``. When using the
          ``scrapy_redis.scheduler.Scheduler`` class, this method is not used as
          it needs to pass the spider name in the key.
    ​
          Parameters
          ----------
          settings : scrapy.settings.Settings
    ​
          Returns
          -------
          RFPDupeFilter
              A RFPDupeFilter instance.
    ​
    ​
          """
           server = get_redis_from_settings(settings)
           # XXX: This creates one-time key. needed to support to use this
           # class as standalone dupefilter with scrapy's default scheduler
           # if scrapy passes spider on open() method this wouldn't be needed
           # TODO: Use SCRAPY_JOB env as default and fallback to timestamp.
           key = defaults.DUPEFILTER_KEY % {'timestamp': 'xiaodongbei'}
           debug = settings.getbool('DUPEFILTER_DEBUG')
           return cls(server, key=key, debug=debug)
    ​

3.2 Queues

  1. There are three kinds of queues: FIFO, LIFO, and priority.

  2. FIFO (first in, first out)

    import scrapy_redis
    import redis
    ​
    class FifoQueue(object):
       def __init__(self):
           self.server = redis.Redis(host='140.143.227.206',port=8888,password='beta')
    ​
       def push(self, request):
           """Push a request"""
           self.server.lpush('USERS', request)
    ​
       def pop(self, timeout=0):
           """Pop a request"""
           data = self.server.rpop('USERS')
           return data
    # [33,22,11]
    q = FifoQueue()
    q.push(11)
    q.push(22)
    q.push(33)
    ​
    print(q.pop())
    print(q.pop())
    print(q.pop())
  3. LIFO (last in, first out)

    import redis
    ​
    class LifoQueue(object):
       """Per-spider LIFO queue."""
       def __init__(self):
           self.server = redis.Redis(host='140.143.227.206',port=8888,password='beta')
    ​
       def push(self, request):
           """Push a request"""
           self.server.lpush("USERS", request)
    ​
       def pop(self, timeout=0):
           """Pop a request"""
           data = self.server.lpop('USERS')
           return data
    # [33,22,11]
    q = LifoQueue()
    q.push(11)
    q.push(22)
    q.push(33)
    ​
    print(q.pop())
    print(q.pop())
    print(q.pop())
  4. Priority queue

    import redis
    ​
    class PriorityQueue(object):
       """Per-spider priority queue abstraction using redis' sorted set"""
       def __init__(self):
           self.server = redis.Redis(host='140.143.227.206',port=8888,password='beta')
    ​
       def push(self, request,score):
           """Push a request"""
           # data = self._encode_request(request)
           # score = -request.priority
           # We don't use zadd method as the order of arguments change depending on
           # whether the class is Redis or StrictRedis, and the option of using
           # kwargs only accepts strings, not bytes.
           self.server.execute_command('ZADD', 'xxxxxx', score, request)
    ​
       def pop(self, timeout=0):
           """
          Pop a request
          timeout not support in this queue class
          """
           # use atomic range/remove using multi/exec
           pipe = self.server.pipeline()
           pipe.multi()
           pipe.zrange('xxxxxx', 0, 0).zremrangebyrank('xxxxxx', 0, 0)
           results, count = pipe.execute()
           if results:
               return results[0]
    ​
    q = PriorityQueue()
    ​
    q.push('alex',99)
    q.push('oldboy',56)
    q.push('eric',77)
    ​
    v1 = q.pop()
    print(v1)
    v2 = q.pop()
    print(v2)
    v3 = q.pop()
    print(v3)

3.3 The scheduler

  1. The scheduler is essentially just storing and removing requests, so we let scrapy-redis provide it (a toy sketch of the interface follows the settings below).

  2. After scrapy crawl chouti --nolog is run, scrapy first looks up SCHEDULER in settings and calls Scheduler.from_crawler, then Scheduler.from_settings to read the configuration below, and finally keeps reading requests over the redis connection in a loop.

    # settings.py
    ​
    # ############### scrapy-redis connection ####################
    ​
    REDIS_HOST = '140.143.227.206'                      # host
    REDIS_PORT = 8888                                   # port
    REDIS_PARAMS = {'password':'beta'}                  # Redis connection parameters
    # default: REDIS_PARAMS = {'socket_timeout': 30,'socket_connect_timeout': 30,'retry_on_timeout': True,'encoding': REDIS_ENCODING,}
    REDIS_ENCODING = "utf-8"                            # redis encoding, default: 'utf-8'
    ​
    # REDIS_URL = 'redis://user:pass@hostname:9001'     # connection URL (takes precedence over the settings above)
    ​
    ################ dedupe filter ######################
    DUPEFILTER_KEY = 'dupefilter:%(timestamp)s'
    DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'
    ​
    ​
    # ###################### 调度器 ######################
    from scrapy_redis.scheduler import Scheduler
    # let the scrapy_redis scheduler do the dispatching
    # enqueue_request: add a request to the scheduler
    # next_request: fetch the next request from the scheduler
    SCHEDULER = "scrapy_redis.scheduler.Scheduler"
    ​
    # controls the order in which requests are stored and served
    # DEPTH_PRIORITY only matters when the priority queue is used
    DEPTH_PRIORITY = 1  # breadth-first
    # DEPTH_PRIORITY = -1 # depth-first
    ​
    # PriorityQueue (sorted set) is the default; alternatives: FifoQueue (list), LifoQueue (list)
    SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'
    # breadth-first
    # SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.FifoQueue'
    # depth-first
    # SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.LifoQueue'
    ​
    SCHEDULER_QUEUE_KEY = '%(spider)s:requests'  # redis key under which the scheduler stores pending requests
    ​
    SCHEDULER_SERIALIZER = "scrapy_redis.picklecompat"  # serializer for data saved to redis, pickle by default
    ​
    SCHEDULER_PERSIST = False  # keep the scheduler queue and dedupe records when the spider closes; True=keep, False=flush
    SCHEDULER_FLUSH_ON_START = True  # flush the scheduler queue and dedupe records before starting; True=flush, False=keep
    # SCHEDULER_IDLE_BEFORE_CLOSE = 10 # when the queue is empty, how long to wait for new data before giving up
    ​
    SCHEDULER_DUPEFILTER_KEY = '%(spider)s:dupefilter'  # redis key under which the dedupe records are stored
    SCHEDULER_DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'  # class that implements the dedupe rule
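
  3. With the configuration above you can check which keys the scheduler and the dupefilter actually write to redis. Below is a minimal sketch (my own, not from the course material); it assumes the spider is named chouti and the default PriorityQueue (a sorted set) is in use, and it has to run while the spider is active because SCHEDULER_PERSIST = False flushes the keys on close.

    # inspect_keys.py -- hypothetical helper using the connection settings above
    import redis

    conn = redis.Redis(host='140.143.227.206', port=8888, password='beta')

    # SCHEDULER_QUEUE_KEY      -> 'chouti:requests'   (sorted set of pending requests)
    # SCHEDULER_DUPEFILTER_KEY -> 'chouti:dupefilter' (set of request fingerprints)
    print(conn.keys('chouti:*'))            # every key the spider has created
    print(conn.zcard('chouti:requests'))    # number of pending requests in the queue
    print(conn.scard('chouti:dupefilter'))  # number of fingerprints seen so far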

3.4 start url

  1. The start URLs can be kept in scrapy-redis; when there are no pending requests the spider simply waits instead of shutting down. (A set-based way of seeding the URL is sketched in item 2 below.)

    # setting.py
    ​
    # the setting scrapy-redis actually reads is REDIS_START_URLS_KEY
    REDIS_START_URLS_KEY = '%(name)s:start_urls'
    # False: read start URLs from a redis list; True: read them from a set
    REDIS_START_URLS_AS_SET = False
    ​
    # spiders/chouti.py
    ​
    # -*- coding: utf-8 -*-
    import scrapy
    from scrapy.http import Request
    import scrapy_redis
    from scrapy_redis.spiders import RedisSpider
    ​
    class ChoutiSpider(RedisSpider):
       name = 'chouti'
       allowed_domains = ['chouti.com']
    ​
       def parse(self, response):
           print(response)
    ​
    # script that seeds the start URL into redis
    ​
    # by luffycity.com
    import redis
    ​
    conn = redis.Redis(host='140.143.227.206',port=8888,password='beta')
    # whatever is pushed here is what the spider will download
    conn.lpush('chouti:start_urls','https://dig.chouti.com/r/pic/hot/1')
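
  2. If REDIS_START_URLS_AS_SET = True the start URLs live in a redis set rather than a list, so they are seeded with sadd instead of lpush. A minimal sketch under that assumption:

    import redis

    conn = redis.Redis(host='140.143.227.206', port=8888, password='beta')
    # with a set, pushing the same URL twice is silently ignored
    conn.sadd('chouti:start_urls', 'https://dig.chouti.com/r/pic/hot/1')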

     

4. Settings file walkthrough

  1. Notes on the options in the configuration file below.

    # -*- coding: utf-8 -*-
    # project name
    BOT_NAME = 'dbd'
    ​
    # spider modules path
    SPIDER_MODULES = ['dbd.spiders']
    NEWSPIDER_MODULE = 'dbd.spiders'
    ​
    ​
    # Crawl responsibly by identifying yourself (and your website) on the user-agent
    # USER_AGENT = 'dbd (+http://www.yourdomain.com)'
    USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    ​
    # Sites publish their crawling rules at <site url>/robots.txt; this option controls whether Scrapy fetches robots.txt and only crawls what it allows.
    # Obey robots.txt rules
    ROBOTSTXT_OBEY = False
    ​
    # Concurrency: by default a single thread handles 16 concurrent requests; this is configurable, and the load is not split evenly across targets.
    # Configure maximum concurrent requests performed by Scrapy (default: 16)
    #CONCURRENT_REQUESTS = 32
    ​
    # Configure a delay for requests for the same website (default: 0)
    # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
    # See also autothrottle settings and docs
    # wait 3 seconds between requests
    #DOWNLOAD_DELAY = 3
    ​
    # The download delay setting will honor only one of:
    # at most 16 concurrent requests per domain
    #CONCURRENT_REQUESTS_PER_DOMAIN = 16
    # at most 16 concurrent requests per IP
    #CONCURRENT_REQUESTS_PER_IP = 16
    ​
    # Scrapy manages cookies for you internally
    # Disable cookies (enabled by default)
    #COOKIES_ENABLED = False
    ​
    # Disable Telnet Console (enabled by default)
    # from scrapy.extensions.telnet import TelnetConsole
    # telnet 127.0.0.1 6023
    # engine.pause()
    # engine.unpause()
    # If True, you can telnet to the host/port below and send commands (pause, unpause, stop ...) to the running crawler.
    # TELNETCONSOLE_ENABLED = True
    # TELNETCONSOLE_HOST = '127.0.0.1'
    # TELNETCONSOLE_PORT = [6023,]
    ​
    # Default headers added to every request
    # Override the default request headers:
    #DEFAULT_REQUEST_HEADERS = {
    #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    #   'Accept-Language': 'en',
    #}
    ​
    # Enable or disable spider middlewares
    # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
    # spider middleware
    #SPIDER_MIDDLEWARES = {
    #   'dbd.middlewares.DbdSpiderMiddleware': 543,
    #}
    ​
    # Enable or disable downloader middlewares
    # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
    # downloader middleware
    #DOWNLOADER_MIDDLEWARES = {
    #   'dbd.middlewares.DbdDownloaderMiddleware': 543,
    #}
    ​
    # Enable or disable extensions
    # See https://doc.scrapy.org/en/latest/topics/extensions.html
    # extensions (signal registration)
    #EXTENSIONS = {
    #   'scrapy.extensions.telnet.TelnetConsole': None,
    #}
    ​
    # Configure item pipelines
    # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
    # pipelines
    #ITEM_PIPELINES = {
    #   'dbd.pipelines.DbdPipeline': 300,
    #}
    ​
    # Enable and configure the AutoThrottle extension (disabled by default)
    # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
    from scrapy.contrib.throttle import AutoThrottle  # in newer Scrapy: scrapy.extensions.throttle.AutoThrottle
    """
    17. 自动限速算法
      from scrapy.contrib.throttle import AutoThrottle
      自动限速设置
      1. 获取最小延迟 DOWNLOAD_DELAY
      2. 获取最大延迟 AUTOTHROTTLE_MAX_DELAY
      3. 设置初始下载延迟 AUTOTHROTTLE_START_DELAY
      4. 当请求下载完成后,获取其"连接"时间 latency,即:请求连接到接受到响应头之间的时间
      5. 用于计算的... AUTOTHROTTLE_TARGET_CONCURRENCY
      target_delay = latency / self.target_concurrency
      new_delay = (slot.delay + target_delay) / 2.0 # 表示上一次的延迟时间
      new_delay = max(target_delay, new_delay)
      new_delay = min(max(self.mindelay, new_delay), self.maxdelay)
      slot.delay = new_delay
    """
    # Auto-throttle settings; the algorithm above shows where each value is used.
    #AUTOTHROTTLE_ENABLED = True
    # The initial download delay
    #AUTOTHROTTLE_START_DELAY = 5
    # The maximum download delay to be set in case of high latencies
    #AUTOTHROTTLE_MAX_DELAY = 60
    # The average number of requests Scrapy should be sending in parallel to
    # each remote server
    #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
    # Enable showing throttling stats for every response received:
    #AUTOTHROTTLE_DEBUG = False
    ​
    ​
    ​
    ​
    # Enable and configure HTTP caching (disabled by default)
    # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
    # With these enabled, pages are served from the local HTTP cache, so you can keep practicing even without a network connection.
    # HTTPCACHE_ENABLED = True
    # HTTPCACHE_EXPIRATION_SECS = 0
    # HTTPCACHE_DIR = 'httpcache'
    # HTTPCACHE_IGNORE_HTTP_CODES = []
    # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

5. Improving performance

  • All of the approaches below send requests concurrently from a single thread.

5.1 Coroutines

  • The coroutines here are implemented with the gevent module, which builds on greenlet; a pool-based variant that caps concurrency is sketched after the example.

    """
    Coroutines + I/O switching
    pip3 install gevent
    gevent internally uses greenlet (which implements the coroutine switching).
    """
    from gevent import monkey; monkey.patch_all()
    import gevent
    import requests
    ​
    def func(url):
       response = requests.get(url)
       print(response)
    ​
    urls = [
       'http://www.baidu.com/',
       'https://www.cnblogs.com/',
       'https://www.cnblogs.com/news/',
       'https://cn.bing.com/',
       'https://stackoverflow.com/',
    ]
    spawn_list = []
    for url in urls:
       # create one greenlet task per url
       spawn_list.append(gevent.spawn(func, url))
    # wait for all tasks to complete
    gevent.joinall(spawn_list)
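
  • gevent.pool.Pool can cap how many requests run at once. A minimal sketch (the pool size of 3 is an arbitrary choice, not from the original material):

    from gevent import monkey; monkey.patch_all()
    from gevent.pool import Pool
    import requests

    def func(url):
       print(url, requests.get(url).status_code)

    urls = [
       'http://www.baidu.com/',
       'https://www.cnblogs.com/',
       'https://cn.bing.com/',
    ]
    pool = Pool(3)        # at most 3 greenlets run concurrently
    pool.map(func, urls)  # blocks until every url has been fetched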

5.2 Asynchronous, non-blocking I/O

  • The asynchronous, non-blocking version uses the Twisted module. (Note that getPage is deprecated in newer Twisted releases; an Agent-based sketch follows the example.)

    """
    Twisted: an asynchronous, non-blocking framework built on an event loop
    """
    from twisted.web.client import getPage, defer
    from twisted.internet import reactor
    ​
    def stop_loop(arg):
       # stop the twisted event loop
       reactor.stop()
    ​
    def get_response(contents):
       print(contents)
    ​
    deferred_list = []
    ​
    url_list = [
       'http://www.baidu.com/',
       'https://www.cnblogs.com/',
       'https://www.cnblogs.com/news/',
       'https://cn.bing.com/',
       'https://stackoverflow.com/',
    ]
    ​
    for url in url_list:
       # create a task: getPage sends the request and returns a Deferred
       deferred = getPage(bytes(url, encoding='utf8'))
       # the request is sent automatically; this callback fires with the page body
       deferred.addCallback(get_response)
       # collect the Deferreds so we can wait on all of them
       deferred_list.append(deferred)
    ​
    # DeferredList watches the list and fires once every task has finished
    dlist = defer.DeferredList(deferred_list)
    # when everything is done, stop the event loop
    dlist.addBoth(stop_loop)
    # run() starts the event loop, i.e. the crawling begins here
    reactor.run()
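
  • Since getPage is deprecated, the same fan-out can be written with twisted.web.client.Agent and readBody. A minimal sketch of that approach (my own, not from the course material):

    from twisted.internet import reactor, defer
    from twisted.web.client import Agent, readBody

    agent = Agent(reactor)

    def fetch(url):
       d = agent.request(b'GET', url.encode('utf8'))    # returns a Deferred
       d.addCallback(readBody)                          # read the whole response body
       d.addCallback(lambda body: print(url, len(body)))
       return d

    url_list = [
       'http://www.baidu.com/',
       'http://www.cnblogs.com/',
    ]
    dlist = defer.DeferredList([fetch(u) for u in url_list])
    dlist.addBoth(lambda _: reactor.stop())  # stop the loop once all are done
    reactor.run()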

5.3 A hand-rolled asynchronous, non-blocking client

  • The hand-rolled version below is only for learning; for real work the two approaches above are enough.

    # chun.py
    ​
    import socket
    import select
    ​
    class ChunSheng(object):
    ​
       def __init__(self):
           self.socket_list = []
           self.conn_list = []
    ​
           self.conn_func_dict = {}
    ​
       def add_request(self, url_func):
           host, func = url_func           # (hostname, callback) tuple
           conn = socket.socket()
           conn.setblocking(False)
           try:
               # a non-blocking connect returns immediately; select() will tell us
               # later when the connection has actually been established
               conn.connect((host, 80))
           except BlockingIOError:
               pass
           self.conn_func_dict[conn] = (host, func)

           self.socket_list.append(conn)
           self.conn_list.append(conn)
    ​
       def run(self):
           """
           Poll the sockets in self.socket_list until every request has been sent and every response has been handled
          :return:
          """
           while True:
               # select.select:
               #   first list : sockets checked for readable data (response arrived)
               #   second list: sockets checked for being writable (connect finished)
               #
               # return value r: the sockets that have data ready to read
               # return value w: the sockets whose connection has succeeded
               r, w, e = select.select(self.socket_list, self.conn_list, [], 0.05)
               for sock in w:  # connected sockets: send the request once
                   host, _ = self.conn_func_dict[sock]
                   sock.send('GET / HTTP/1.1\r\nHost: {}\r\nConnection: close\r\n\r\n'.format(host).encode())
                   self.conn_list.remove(sock)

               for sock in r:  # readable sockets: hand the response to the callback
                   data = sock.recv(8096)
                   _, func = self.conn_func_dict[sock]
                   func(data)
                   sock.close()
                   self.socket_list.remove(sock)
    ​
               if not self.socket_list:
                   break
                   
    # xx.py
    ​
    from chun import ChunSheng
    ​
    def callback1(data):
       print('download finished', data)
    ​
    def callback2(data):
       print('download finished', data)
    ​
    chun = ChunSheng()
    urls = [
      ('www.baidu.com',callback1),
      ('www.cnblogs.com',callback1),
      ('www.pythonav.com',callback2),
      ('www.bing.com',callback2),
      ('www.stackoverflow.com',callback2),
    ]
    for url in urls:
       chun.add_request(url)
    ​
    chun.run()
  • select: can watch at most 1024 sockets and internally loops over all of them on every check (the only option among these on Windows);

  • poll: no limit on the number of sockets, but it still loops over all of them on every check (available on macOS and Linux);

  • epoll: no limit, and readiness is delivered via events/callbacks instead of scanning (Linux only; macOS provides kqueue instead); see the selectors sketch below.
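
  • Python's standard selectors module wraps all of these: selectors.DefaultSelector picks the best mechanism the platform offers (epoll on Linux, kqueue on macOS, select on Windows). A minimal sketch of the same connect-then-read pattern (single hard-coded host, for illustration only):

    import selectors
    import socket

    sel = selectors.DefaultSelector()   # epoll / kqueue / select, whichever is available

    sock = socket.socket()
    sock.setblocking(False)
    try:
       sock.connect(('www.baidu.com', 80))
    except BlockingIOError:
       pass

    # wait until writable (connected), send the request, then wait until readable
    sel.register(sock, selectors.EVENT_WRITE)
    sel.select()
    sock.send(b'GET / HTTP/1.1\r\nHost: www.baidu.com\r\nConnection: close\r\n\r\n')

    sel.modify(sock, selectors.EVENT_READ)
    sel.select()
    print(sock.recv(8096))

    sel.unregister(sock)
    sock.close()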

From: https://www.cnblogs.com/InvincibleGrass/p/17032558.html
