python爬虫小结1

标签：info re python text get 爬虫 url json 小结

python 爬虫小结1

1 正则匹配中注意的：

import re

# Minimal single-line example: the lazy group (.*?) captures the text
# between the opening and closing <div> tags.
a = '<div>指数</div>'
word = re.findall(r'<div>(.*?)</div>', a)
print(word)

其中 (.*?) 以非贪婪方式匹配除换行符以外的任意字符，所以基本能匹配所有内容，但跨行的内容匹配不到，比如：
 # Same pattern applied to text that spans two lines.
import re

a = '''<div>abc
 </div>'''
# re.S (alias of re.DOTALL) makes "." match newline characters as well, so
# the lazy group can span the line break; without the flag findall()
# returns an empty list for this input.
word = re.findall(r'<div>(.*?)</div>', a, re.S)
print(word)
默认情况下 "." 不匹配换行符，所以跨行的内容匹配不到（并不是 findall 逐行匹配）；最后一个参数传入 re.S（即 re.DOTALL）之后，"." 才能匹配包括换行符在内的所有字符。
在爬虫的时候，一般还要再清理一下结果首尾的空白和换行，使用：
print(word[0].strip())

2 简单例子：

import re

import requests

# Browser-like User-Agent sent with every page request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
}

# Accumulates one dict per scraped post across all fetched pages.
info_lists = []


def judgment_sex(class_name):
    """Translate the gender icon CSS class into a label.

    Returns '女' when ``class_name`` is 'womenIcon' and '男' for any
    other value.
    """
    if class_name == 'womenIcon':
        return '女'
    return '男'


def get_info(url):
    """Fetch one listing page and append every post found to ``info_lists``.

    Each field is extracted with its own regex over the whole page; the
    per-field lists are then combined positionally with zip(), which stops
    at the shortest list.
    """
    # The original defined ``headers`` but never sent it; pass it along.
    res = requests.get(url, headers=headers)
    html = res.text
    ids = re.findall(r'<h2>(.*?)</h2>', html, re.S)
    levels = re.findall(r'<div class="articleGender \D+Icon">(.*?)</div>', html, re.S)
    sexs = re.findall(r'<div class="articleGender (.*?)">', html, re.S)
    contents = re.findall(r'<div class="content">.*?<span>(.*?)</span>', html, re.S)
    laughs = re.findall(r'<span class="stats-vote"><i class="number">(\d+)</i>', html, re.S)
    comments = re.findall(r'<i class="number">(\d+)</i> 评论', html, re.S)
    for user_id, level, sex, content, laugh, comment in zip(ids, levels, sexs, contents, laughs, comments):
        info_lists.append({
            'id': user_id,
            'level': level,
            'sex': judgment_sex(sex),
            'content': content,
            'laugh': laugh,
            'comment': comment,
        })


if __name__ == '__main__':
    urls = ['http://www.qiushibaike.com/text/page/{}/'.format(i) for i in range(1, 10)]
    for url in urls:
        get_info(url)
    # Open the output once, explicitly as UTF-8: the original reopened the
    # file for every record using the platform default encoding, leaked the
    # handle on failure, and silently dropped records that raised
    # UnicodeEncodeError.
    with open('d:/qiushi.txt', 'a+', encoding='utf-8') as f:
        for info in info_lists:
            f.write(info['id'] + '\n')
            f.write(info['level'] + '\n')
            f.write(info['sex'] + '\n')
            f.write(info['content'] + '\n')
            f.write(info['laugh'] + '\n')
            f.write(info['comment'] + '\n\n')
         #print(info_list)

2 python中调用相关网站的API套路：
     # Geocode a user-supplied address through the AMap geocoding REST endpoint.
import json
import pprint

import requests

address = input('请输入地点')
# NOTE(review): the API key is hard-coded in source; consider loading it
# from configuration or an environment variable instead.
par = {'address': address, 'key': 'cb649a25c1f81c1451adbeca73623251'}
api = 'http://restapi.amap.com/v3/geocode/geo'
# ``par`` is sent as the URL query string (requests' ``params`` argument).
res = requests.get(api, params=par)
json_data = json.loads(res.text)
pprint.pprint(json_data)
其中 pprint 是 Python 标准库里的格式化（美化）输出工具，这里使用 json.loads 把返回的 JSON 文本解析成 Python 对象。
body > div.main > div.content > div.main-image > p > a > img

3 MYSQL配合抓取
比如抓取豆瓣TOP 250的电影
 # Scrape the Douban Top 250 movie list and store one row per movie in MySQL.
import re
import time

import pymysql
import requests
from lxml import etree

# NOTE(review): database credentials are hard-coded in source; consider
# loading them from configuration or environment variables.
conn = pymysql.connect(host='localhost', user='root', passwd='38477000', db='python', port=3309, charset='utf8')
cursor = conn.cursor()

# Browser-like User-Agent sent with every request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
}


def get_movie_url(url):
    """Fetch one listing page and process every movie detail link on it."""
    html = requests.get(url, headers=headers)
    selector = etree.HTML(html.text)
    movie_hrefs = selector.xpath('//div[@class="hd"]/a/@href')
    for movie_href in movie_hrefs:
        get_movie_info(movie_href)


def get_movie_info(url):
    """Fetch one movie detail page and insert its fields into ``doubanmovie``.

    Fields are extracted with XPath and regexes. If any extraction finds
    nothing (IndexError on ``[0]``), the movie is skipped and no row is
    inserted. The insert is not committed here; the caller commits.
    """
    html = requests.get(url, headers=headers)
    selector = etree.HTML(html.text)
    try:
        name = selector.xpath('//*[@id="content"]/h1/span[1]/text()')[0]
        director = selector.xpath('//*[@id="info"]/span[1]/span[2]/a/text()')[0]
        actors = selector.xpath('//*[@id="info"]/span[3]/span[2]')[0]
        # string(.) concatenates all text nodes under the actors element.
        actor = actors.xpath('string(.)')
        style = re.findall(r'<span property="v:genre">(.*?)</span>', html.text, re.S)[0]
        country = re.findall(r'<span class="pl">制片国家/地区:</span> (.*?)<br/>', html.text, re.S)[0]
        release_time = re.findall(r'上映日期:</span>.*?>(.*?)</span>', html.text, re.S)[0]
        # Named ``duration`` locally so it does not shadow the ``time`` module.
        duration = re.findall(r'片长:</span>.*?>(.*?)</span>', html.text, re.S)[0]
        score = selector.xpath('//*[@id="interest_sectl"]/div[1]/div[2]/strong/text()')[0]
    except IndexError:
        # A field is missing on this page: skip the movie.
        return
    # Parameterized insert. The original followed this with
    # ``conn.execute()``, which is not a method of a PyMySQL connection and
    # would raise AttributeError; committing is done by the caller.
    cursor.execute(
        "insert into doubanmovie (name,director,actor,style,country,release_time,time,score) values(%s,%s,%s,%s,%s,%s,%s,%s)",
        (str(name), str(director), str(actor), str(style), str(country), str(release_time), str(duration), str(score)))


if __name__ == '__main__':
    urls = ['https://movie.douban.com/top250?start={}'.format(i) for i in range(0, 250, 25)]
    try:
        for url in urls:
            get_movie_url(url)
            # Commit per listing page so earlier pages survive a later failure.
            conn.commit()
            time.sleep(5)
    finally:
        cursor.close()
        conn.close()

4 多进程抓取简书网7日最热（使用 multiprocessing.Pool 进程池）：

import json
import re
from multiprocessing import Pool

import requests
from lxml import etree

# Browser-like User-Agent sent with every request.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
}


def get_url(url):
    """Fetch one trending listing page and scrape every article linked from it."""
    html = requests.get(url, headers=header)
    selector = etree.HTML(html.text)
    infos = selector.xpath('//ul[@class="note-list"]/li')
    for info in infos:
        article_url_part = info.xpath('div/a/@href')[0]
        get_info(article_url_part)


def get_info(url):
    """Scrape one article page plus its rewards and collections endpoints.

    ``url`` is the article href taken from the listing page; it is appended
    to the site root. Each extracted field is printed as it is found.

    Returns:
        dict with keys author, article, date, word, view, comment, like,
        gain and include (list of collection titles). The original built
        this dict and then discarded it.
    """
    article_url = 'http://www.jianshu.com/' + url
    html = requests.get(article_url, headers=header)
    selector = etree.HTML(html.text)
    author = selector.xpath('//span[@class="name"]/a/text()')[0]
    print(author)
    article = selector.xpath('//h1[@class="title"]/text()')[0]
    print(article)
    date = selector.xpath('//span[@class="publish-time"]/text()')[0]
    print(date)
    word = selector.xpath('//span[@class="wordage"]/text()')[0]
    print(word)
    # Counters are read from JSON embedded in the page source.
    view = re.findall(r'"views_count":(.*?),', html.text, re.S)[0]
    print(view)
    comment = re.findall(r'"comments_count":(.*?),', html.text, re.S)[0]
    print(comment)
    like = re.findall(r'"likes_count":(.*?),', html.text, re.S)[0]
    print(like)
    # ``note_id`` instead of ``id`` to avoid shadowing the builtin.
    note_id = re.findall(r'\{"id":(.*?),', html.text, re.S)[0]

    gain_url = 'http://www.jianshu.com/notes/{}/rewards?count=20'.format(note_id)
    wb_data = requests.get(gain_url, headers=header)
    gain = json.loads(wb_data.text)['rewards_count']

    include_list = []
    include_urls = ['http://www.jianshu.com/notes/{}/included_collections?page={}'.format(note_id, i) for i in range(1, 10)]
    for include_url in include_urls:
        resp = requests.get(include_url, headers=header)
        includes = json.loads(resp.text)['collections']
        # An empty page simply contributes nothing.
        for include in includes:
            include_list.append(include['title'])

    info = {
        'author': author,
        'article': article,
        'date': date,
        'word': word,
        'view': view,
        'comment': comment,
        'like': like,
        'gain': gain,
        'include': include_list
    }
    return info


if __name__ == '__main__':
    urls = ['http://www.jianshu.com/trending/weekly?page={}'.format(i) for i in range(0, 11)]
    # The context manager releases the worker processes once map() returns.
    with Pool(processes=4) as pool:
        pool.map(get_url, urls)

4 表单提交：
使用FORM表单提交的套路，下面是抓取拉勾网的例子：

import json
import time

import requests

# Optional MongoDB persistence, left disabled as in the original
# (enabling it also requires ``import pymongo``):
# client = pymongo.MongoClient('localhost', 27017)
# mydb = client['mydb']
# lagou = mydb['lagou']

# Request headers; the Cookie value is a placeholder to be filled in.
headers = {
    'Cookie': 'XXXXXX',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'Connection': 'keep-alive'
}

# Keys copied from every result entry into the record dict.
_RESULT_FIELDS = (
    'businessZones',
    'city',
    'companyFullName',
    'companyLabelList',
    'companySize',
    'district',
    'education',
    'explain',
    'financeStage',
    'firstType',
    'formatCreateTime',
    'gradeDescription',
    'imState',
    'industryField',
    'jobNature',
    'positionAdvantage',
    'salary',
    'secondType',
    'workYear',
)


def get_page(url, params):
    """POST the search form once, derive the page count, then scrape all pages.

    The page count is ``totalCount`` divided by 15, capped at 30.
    """
    html = requests.post(url, data=params, headers=headers)
    json_data = json.loads(html.text)
    print(json_data)
    total_count = json_data['content']['positionResult']['totalCount']
    # Ceiling division: the original floored, which dropped a final partial
    # page and produced 0 pages when totalCount was below 15.
    page_number = min(-(-total_count // 15), 30)
    get_info(url, page_number)


def get_info(url, page):
    """POST the search form for pages 1..``page`` and print each company name.

    A connection error on one page is reported and the loop continues with
    the next page.
    """
    for pn in range(1, page + 1):
        params = {
            'first': 'true',
            'pn': str(pn),
            'kd': 'Python'
        }
        try:
            html = requests.post(url, data=params, headers=headers)
            json_data = json.loads(html.text)
            results = json_data['content']['positionResult']['result']
            for result in results:
                company_name = result['companyFullName']
                print(company_name)
                # Record that would be stored if persistence were enabled.
                infos = {field: result[field] for field in _RESULT_FIELDS}
                # lagou.insert_one(infos)
                # NOTE(review): this pause runs once per result, not once
                # per request — kept as in the original; confirm intent.
                time.sleep(10)
        except requests.exceptions.ConnectionError as exc:
            # Report the failed page instead of silently ignoring it.
            print('page {} failed: {}'.format(pn, exc))


if __name__ == '__main__':
    url = 'https://www.lagou.com/jobs/positionAjax.json'
    params = {
        'first': 'true',
        'pn': '1',
        'kd': 'Python'
    }
    get_page(url, params)

5 发现一个老外做的不错的在线词云工具，样式比较多，推荐一下：https://wordart.com/create 。比如爬取新浪微博好友圈：

import json

import requests

# Request headers; the Cookie value is a placeholder to be filled in.
headers = {
    'Cookie': 'XXXX'
}


def get_info(url, page):
    """Append the text of each friends-feed post to d:/weibo.txt, page by page.

    Starting at ``url`` with counter ``page``, each response's cards are
    written out, then the next URL is built from the response's
    ``next_cursor``. Fetching stops once the counter reaches 50.

    The original recursed once per page and closed a module-level file
    handle only on the final branch; this version loops and lets ``with``
    close the file even when a request or lookup fails.
    """
    with open('d:/weibo.txt', 'a+', encoding='utf-8') as f:
        while True:
            html = requests.get(url, headers=headers)
            json_data = json.loads(html.text)
            card_groups = json_data[0]['card_group']
            for card_group in card_groups:
                # Keep only the part of the post text before the first space.
                f.write(card_group['mblog']['text'].split(' ')[0] + '\n')
            if page >= 50:
                break
            next_cursor = json_data[0]['next_cursor']
            url = 'https://m.weibo.cn/index/friends?format=cards&next_cursor=' + str(next_cursor) + '&page=1'
            page = page + 1


if __name__ == '__main__':
    url = 'https://m.weibo.cn/index/friends?format=cards'
    get_info(url, 1)

# Next step: run word segmentation over the collected text.
     # Print the top 100 keywords of the saved Weibo text with scaled weights.
import jieba.analyse

# Raw strings keep the Windows backslashes literal; the original relied on
# "\w", "\p" and "\s" being unrecognized escapes (same value, but Python
# warns about them).
path = r'd:\weibo.txt'
with open(path, 'r', encoding='utf-8') as fp:
    content = fp.read()

jieba.analyse.set_stop_words(r'G:\python学习相关\stop_words_zh.txt')
tags = jieba.analyse.extract_tags(content, topK=100, withWeight=True)
for item in tags:
    # item is (keyword, weight); the weight is scaled by 1000 and truncated.
    print(item[0] + '\t' + str(int(item[1] * 1000)))

标签：info,re,python,text,get,爬虫,url,json,小结
From： https://blog.51cto.com/u_14230175/5928135

相关文章

赞助商

阅读排行