Project scenario:
Crawling m.baidu.com search results with a Scrapy spider
Problem description
The request is sent to https://m.baidu.com/s?word=电影&pn=0,
but the final response URL becomes http://m.baidu.com/s?cip6=240e:390:6a52:67e5:b0fa:e9d6:226e:376d&word=电影&pn=0&pu=sz%401321_480&t_noscript=jump, so the parsed results are wrong.
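
The trailing cip6=...&t_noscript=jump parameters match the <meta http-equiv="refresh"> tag that m.baidu.com serves inside a <noscript> block for clients that do not execute JavaScript. Scrapy's built-in MetaRefreshMiddleware follows meta refreshes by default, and depending on the Scrapy version METAREFRESH_IGNORE_TAGS may default to an empty list, in which case the refresh inside <noscript> is followed and the spider ends up parsing the jump URL instead of the original result page. A minimal sketch of a possible fix, assuming this is indeed the cause, is to adjust the project settings:

# settings.py - sketch only: both names are standard Scrapy settings, but
# whether they stop this particular jump depends on the response actually
# containing the <noscript> meta refresh described above.
METAREFRESH_IGNORE_TAGS = ['noscript']   # do not follow refreshes inside <noscript>
# or disable the middleware altogether:
# METAREFRESH_ENABLED = False

The same two keys can also be set per spider through custom_settings if other spiders in the project still rely on meta-refresh handling.
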
import json
import time
import scrapy
import requests
from datetime import datetime
import math
from bs4 import BeautifulSoup
from scrapy.http import HtmlResponse
from scrapy.utils import spider
from ..items import BaiduWapspiderItem
from tld import get_tld
from ..IP.free_ip import get_random_proxy
from selenium import webdriver
from scrapy_selenium import SeleniumRequest
from scrapy.utils.project import get_project_settings
from scrapy_redis.spiders import RedisSpider
from fake_useragent import UserAgent
import pymysql
from pymysql import cursors
from scrapy import Request
settings = get_project_settings()
class BaiduWapSpider(RedisSpider):
    name = "baidu_wap"
    # redis_key = 'baidu_wap'
    # start_urls = []

    def __init__(self, *args, **kwargs):
        super(BaiduWapSpider, self).__init__(*args, **kwargs)
        # MySQL connection parameters come from the Scrapy project settings
        self.host = settings['DB_HOST']
        self.user = settings['DB_USER']
        self.password = settings['DB_PASSWROD']
        self.database = settings['DB_NAME']
        self.charset = settings['DB_CHARSET']
        self.cursorclass = cursors.DictCursor
        self.conn = pymysql.Connect(host=self.host, user=self.user, password=self.password,
                                    database=self.database, charset=self.charset,
                                    cursorclass=self.cursorclass)
        self.cur = self.conn.cursor()
    # Fetch the keyword rows that the spider will turn into search requests
    def get_keywords(self, page=1, page_num=100):
        # start = page
        # end = page_num
        # sql1 = 'SELECT * FROM seo_keywords LIMIT 1'
        # self.cur.execute(sql1)
        # resdata = self.cur.fetchone()
        sql = 'SELECT id,keyword,site_num FROM seo_keywords where index_status = 1 and is_del = 0 order by id limit 100'
        print(sql)
        self.cur.execute(sql)
        # DictCursor: fetchall() returns every matching row as a dict
        keywords = self.cur.fetchall()
        return keywords
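
For illustration only (this part is not in the original post), the rows returned by get_keywords could feed request generation roughly as sketched below. The start_requests override, the one-page-per-keyword limit, and the t_noscript check are hypothetical choices, not the author's code; note also that a RedisSpider normally takes its start URLs from Redis, so this sketch only applies when that feed is not used.

    # Hypothetical continuation: turn the keyword rows into mobile search requests.
    def start_requests(self):
        for row in self.get_keywords():
            # Same URL shape as the failing request above; Scrapy percent-encodes
            # the non-ASCII keyword when the Request object is built.
            url = 'https://m.baidu.com/s?word={}&pn=0'.format(row['keyword'])
            yield Request(url, callback=self.parse, meta={'keyword_id': row['id']})

    def parse(self, response):
        # If the noscript jump is still being followed, it shows up here.
        if 't_noscript=jump' in response.url:
            self.logger.warning('noscript jump followed: %s', response.url)
        # ... extraction of the search results would go here ...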