
Scraping Novels (Editor's Picks and Completed-Books Rankings)



import requests
import bs4
import pandas as pd
import xlwt  # engine pandas (pre-2.0) uses to write the legacy .xls files below


def heavy_recommendation():
    # First "BJTJ_CONT Top1" list on the all-completed-books page
    # (the heavy-recommendation panel), 16 books.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'}
    url = 'https://www.17k.com/quanben/'
    response = requests.get(url=url, headers=headers)
    response.encoding = 'utf-8'
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    list_total = []  # one dict per book
    list1 = []       # detail-page URLs

    li1 = soup.find('ul', attrs={'class': 'BJTJ_CONT Top1'})
    li1_list = li1.find_all('li')
    for item in li1_list:
        # hrefs are protocol-relative ("//www.17k.com/...") and may contain tabs
        url_book = item.find('a').get('href')
        url_book = url_book.replace('//', 'https://', 1)
        url_book1 = url_book.replace('\t', '')
        list1.append(url_book1)
    for i in range(0, 16):
        url2 = list1[i]
        dict1 = {}
        response = requests.get(url=url2, headers=headers)
        response.encoding = 'utf-8'
        soup = bs4.BeautifulSoup(response.text, 'html.parser')
        reader = soup.find('em', attrs={'class': 'blue'}).text  # read count
        word = soup.find('em', attrs={'class': 'red'}).text     # word count
        name2 = soup.find('h1').find('a').text
        writer = soup.find('a', attrs={'class': 'name'}).text
        fan = soup.find('span', attrs={'id': 'fansScore'}).text
        # Counts such as "1.5万" mean 1.5 * 10000 = 15000; parse and scale
        # instead of splicing strings (the original replace('万', '0000') /
        # replace('.', '') trick turns "1.5万" into 150000, ten times too large).
        if '万' in fan:
            fan_count = int(float(fan.replace('万', '')) * 10000)
        else:
            fan_count = int(fan)
        recommender = soup.find('span', attrs={'id': 'recommentCount'}).text
        dict1['小说名字'] = name2
        dict1['作者'] = writer
        dict1['粉丝数'] = fan_count
        dict1['阅读数'] = reader
        dict1['小说字数'] = int(word)
        dict1['推荐票数'] = recommender
        list_total.append(dict1)
    df = pd.DataFrame(list_total)  # build the frame once, after the loop

    # Export twice: sorted by word count and by fan count.
    df2 = df.sort_values(by=["小说字数"], ascending=[False], kind="stable")
    df3 = df.sort_values(by=["粉丝数"], ascending=[False], kind="stable")
    df2.to_excel('heavy_recommendation1.xls')
    df3.to_excel('heavy_recommendation2.xls')

def Great_potential():
    # Same page, but the second "BJTJ_CONT Top1" list (the great-potential panel).
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'}
    url = 'https://www.17k.com/quanben/'
    response = requests.get(url=url, headers=headers)
    response.encoding = 'utf-8'
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    list_total = []
    list1 = []

    li1 = soup.find_all('ul', attrs={'class': 'BJTJ_CONT Top1'})[1]
    li1_list = li1.find_all('li')
    for item in li1_list:
        url_book = item.find('a').get('href')
        url_book = url_book.replace('//', 'https://', 1)
        url_book1 = url_book.replace('\t', '')
        list1.append(url_book1)
    for i in range(0, 16):
        url2 = list1[i]
        dict1 = {}
        response = requests.get(url=url2, headers=headers)
        response.encoding = 'utf-8'
        soup = bs4.BeautifulSoup(response.text, 'html.parser')
        reader = soup.find('em', attrs={'class': 'blue'}).text
        word = soup.find('em', attrs={'class': 'red'}).text
        name2 = soup.find('h1').find('a').text
        writer = soup.find('a', attrs={'class': 'name'}).text
        fan = soup.find('span', attrs={'id': 'fansScore'}).text
        if '万' in fan:  # same "1.5万" -> 15000 conversion as above
            fan_count = int(float(fan.replace('万', '')) * 10000)
        else:
            fan_count = int(fan)
        recommender = soup.find('span', attrs={'id': 'recommentCount'}).text
        dict1['小说名字'] = name2
        dict1['阅读数'] = reader
        dict1['小说字数'] = int(word)
        dict1['作者'] = writer
        dict1['粉丝数'] = fan_count
        dict1['推荐票数'] = recommender
        list_total.append(dict1)
    df = pd.DataFrame(list_total)
    df.to_excel('Great_potential.xls')  # unsorted export

    df2 = df.sort_values(by=["小说字数"], ascending=[False], kind="stable")
    df3 = df.sort_values(by=["粉丝数"], ascending=[False], kind="stable")
    df2.to_excel('Great_potential11.xls')
    df3.to_excel('Great_potential12.xls')


def Boys_finished_the_book():
    # Boys' completed-books top 100. Book links on the ranking page
    # carry class "red".
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'}
    url = 'https://www.17k.com/top/refactor/top100/18_popularityListScore/18_popularityListScore_finishBook_top_100_pc.html?TabIndex=1&typeIndex=0'
    response = requests.get(url=url, headers=headers)
    response.encoding = 'utf-8'
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    list_total = []
    list1 = []

    li1 = soup.find_all('a', attrs={'class': 'red'})
    for item in li1:
        url_book = item.get('href')
        url_book1 = url_book.replace('//', 'https://', 1)
        list1.append(url_book1)
    for i in range(100):
        url2 = list1[i]
        dict1 = {}
        response = requests.get(url=url2, headers=headers)
        response.encoding = 'utf-8'
        soup = bs4.BeautifulSoup(response.text, 'html.parser')
        reader = soup.find('em', attrs={'class': 'blue'}).text
        word = soup.find('em', attrs={'class': 'red'}).text
        name = soup.find('a', attrs={'class': 'red'}).text
        writer = soup.find('a', attrs={'class': 'name'}).text
        fan = soup.find('span', attrs={'id': 'fansScore'}).text
        if '万' in fan:  # "1.5万" -> 15000
            fan_count = int(float(fan.replace('万', '')) * 10000)
        else:
            fan_count = int(fan)
        recommender = soup.find('span', attrs={'id': 'recommentCount'}).text
        dict1['小说名称'] = name
        dict1['阅读数'] = reader
        dict1['小说字数'] = int(word)
        dict1['作者'] = writer
        dict1['粉丝数'] = fan_count
        dict1['推荐票数'] = recommender
        list_total.append(dict1)
    df = pd.DataFrame(list_total)
    df2 = df.sort_values(by=["小说字数"], ascending=[False], kind="stable")
    df3 = df.sort_values(by=["粉丝数"], ascending=[False], kind="stable")
    df2.to_excel('Boys_finished_the_book1.xls')
    df3.to_excel('Boys_finished_the_book2.xls')


def Girls_finished_the_book():
    # Girls' completed-books top 100. NOTE: the original post reuses the
    # boys' ranking URL here; the girls' list presumably lives at a different
    # TabIndex/typeIndex, which the source does not give.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'}
    url = 'https://www.17k.com/top/refactor/top100/18_popularityListScore/18_popularityListScore_finishBook_top_100_pc.html?TabIndex=1&typeIndex=0'
    response = requests.get(url=url, headers=headers)
    response.encoding = 'utf-8'
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    list_total = []
    list1 = []

    li1 = soup.find_all('a', attrs={'class': 'red'})
    for item in li1:
        url_book = item.get('href')
        url_book1 = url_book.replace('//', 'https://', 1)
        list1.append(url_book1)
    for i in range(100):
        url2 = list1[i]
        dict1 = {}
        response = requests.get(url=url2, headers=headers)
        response.encoding = 'utf-8'
        soup = bs4.BeautifulSoup(response.text, 'html.parser')
        reader = soup.find('em', attrs={'class': 'blue'}).text
        word = soup.find('em', attrs={'class': 'red'}).text
        name = soup.find('a', attrs={'class': 'red'}).text
        writer = soup.find('a', attrs={'class': 'name'}).text
        fan = soup.find('span', attrs={'id': 'fansScore'}).text
        if '万' in fan:  # "1.5万" -> 15000
            fan_count = int(float(fan.replace('万', '')) * 10000)
        else:
            fan_count = int(fan)
        recommender = soup.find('span', attrs={'id': 'recommentCount'}).text
        dict1['小说名称'] = name
        dict1['阅读数'] = reader
        dict1['小说字数'] = int(word)
        dict1['作者'] = writer
        dict1['粉丝数'] = fan_count
        dict1['推荐票数'] = recommender
        list_total.append(dict1)
    df = pd.DataFrame(list_total)

    # The original wrote to the Boys_finished_the_book files here (a
    # copy-paste slip that silently overwrote them); use distinct names.
    df2 = df.sort_values(by=["小说字数"], ascending=[False], kind="stable")
    df3 = df.sort_values(by=["粉丝数"], ascending=[False], kind="stable")
    df2.to_excel('Girls_finished_the_book1.xls')
    df3.to_excel('Girls_finished_the_book2.xls')


def Finish_this_list():
    # Overall completed-books top 100.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'}
    url = 'https://www.17k.com/top/refactor/top100/18_popularityListScore/18_popularityListScore_finishBook_top_100_pc.html?TabIndex=1&typeIndex=0'
    response = requests.get(url=url, headers=headers)
    response.encoding = 'utf-8'
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    list_total = []
    list1 = []

    li1 = soup.find_all('a', attrs={'class': 'red'})
    for item in li1:
        url_book = item.get('href')
        url_book1 = url_book.replace('//', 'https://', 1)
        list1.append(url_book1)
    for i in range(100):
        url2 = list1[i]
        dict1 = {}
        response = requests.get(url=url2, headers=headers)
        response.encoding = 'utf-8'
        soup = bs4.BeautifulSoup(response.text, 'html.parser')
        reader = soup.find('em', attrs={'class': 'blue'}).text
        word = soup.find('em', attrs={'class': 'red'}).text
        writer = soup.find('a', attrs={'class': 'name'}).text
        name = soup.find('a', attrs={'class': 'red'}).text
        fan = soup.find('span', attrs={'id': 'fansScore'}).text
        if '万' in fan:  # "1.5万" -> 15000
            fan_count = int(float(fan.replace('万', '')) * 10000)
        else:
            fan_count = int(fan)
        recommender = soup.find('span', attrs={'id': 'recommentCount'}).text
        dict1['小说名字'] = name
        dict1['作者'] = writer
        dict1['粉丝数'] = fan_count
        dict1['阅读数'] = reader
        dict1['小说字数'] = int(word)
        dict1['推荐票数'] = recommender
        list_total.append(dict1)
    df = pd.DataFrame(list_total)
    df2 = df.sort_values(by=["小说字数"], ascending=[False], kind="stable")
    df3 = df.sort_values(by=["粉丝数"], ascending=[False], kind="stable")
    df2.to_excel('Finish_this_list1.xls')
    df3.to_excel('Finish_this_list2.xls')


if __name__ == '__main__':
    heavy_recommendation()
    Great_potential()
    Girls_finished_the_book()
    Finish_this_list()
    Boys_finished_the_book()
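
The five ranking functions repeat the same per-book scraping block almost verbatim. A minimal sketch of how that block could be factored out once and reused (my own refactor, not part of the original post; the parse_count and scrape_book names are hypothetical):

import requests
import bs4

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'}


def parse_count(text):
    # "1.5万" -> 15000, "8276" -> 8276
    if '万' in text:
        return int(float(text.replace('万', '')) * 10000)
    return int(text)


def scrape_book(url):
    # Fetch one book detail page and pull the same fields as the script above.
    # (The quanben functions read the title from h1 > a; the top-100 functions
    # use a.red instead. This sketch uses the h1 variant.)
    response = requests.get(url, headers=HEADERS, timeout=10)
    response.encoding = 'utf-8'
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    return {
        '小说名字': soup.find('h1').find('a').text,
        '作者': soup.find('a', attrs={'class': 'name'}).text,
        '粉丝数': parse_count(soup.find('span', attrs={'id': 'fansScore'}).text),
        '阅读数': soup.find('em', attrs={'class': 'blue'}).text,
        '小说字数': int(soup.find('em', attrs={'class': 'red'}).text),
        '推荐票数': soup.find('span', attrs={'id': 'recommentCount'}).text,
    }

Each ranking function would then only have to collect its own list of detail-page URLs and build its DataFrame from [scrape_book(u) for u in urls].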






After a successful run, the following files are generated:

[Screenshots of the generated .xls files]
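
To spot-check an export, the files can be read back with pandas. A small sketch (it assumes the script above has already run; reading legacy .xls requires the xlrd package, and on pandas 2.x, where the xlwt writer was removed, exporting .xlsx via openpyxl is the easier route):

import pandas as pd

df = pd.read_excel('Finish_this_list1.xls')  # xlrd handles the .xls format
print(df.head())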

From: https://www.cnblogs.com/JK8395/p/16854674.html
