首页 > 其他分享 >爬虫之IP代理

爬虫之IP代理

时间:2023-05-16 22:04:42浏览次数:28  
标签:Task Run IP 爬虫 代理 Start ip new


       在用爬虫抓取数据的时候,很容易出现IP被锁定、返回403、要求输入验证码等情况。出现此类情况时,一般会使用IP代理,通过大量、频繁地更换IP来实现数据的高频爬取和并发爬取。

      现在的做法是增加一个这样的服务,让它持续抓取几个免费的代理IP平台,大致流程如下:

爬虫之IP代理_IP代理

以上图片来源于网络.

其中代理的网站大致有:

链接:http://note.youdao.com/noteshare?id=96531d191709330d79d66088323619e0&sub=39E37FA9387C46C18B5AD3D07B096437

链接:http://note.youdao.com/noteshare?id=2f241bbf336978b86f89a1a268ac9478&sub=6290BEA38B3C47FEAF1B931A0E275341

爬虫之IP代理_西刺_02

爬虫之IP代理_西刺_03

// Candidate free-proxy list pages. Every line assigns the same para.URL, so these
// are alternatives: whichever assignment runs last is the one that takes effect.
para.URL = "http://www.xicidaili.com/nn/1"; // Xici
para.URL = "http://ip84.com/dlgn"; // IP84 ("IP Bus")
para.URL = "http://www.ip3366.net/free/?stype=1"; // ip3366 ("Cloud Proxy")
para.URL = "http://www.iphai.com/free/ng"; // IPHai
para.URL = "http://www.66ip.cn/nmtq.php?getnum=10&isp=0&anonymoustype=3&start=&ports=&export=&ipaddress=&area=1&proxytype=2&api=66ip"; // 66ip
proxy_url = 'https://raw.githubusercontent.com/fate0/proxylist/master/proxy.list'

以上是代理的常用接口和平台

//服务的核心方法:启动一个监听,之后每个访问该地址的请求都会对已获取的代理IP序列进行检查,并找到最新可用的IP返回给API调用方

string url = "http://*:10086/";
Console.Title = url;

// One delegate per long-running job. They are dispatched to the thread pool
// in the order listed: the HTTP front end first, then every proxy collector.
Action[] jobs =
{
    () => new HttpListenerServer(url).StartLisener(),
    // Proxy collectors
    () => new Free89().Start(),
    () => new gitProxy().Start(),
    () => new IpAear().Start(),          // unusable: times out far too often
    () => new kuaidailiGaoNi().Start(),
    () => new kuaidailiPuTong().Start(),
    () => new Liunian().Start(),         // still works
    () => new NiMingGet().Start(),
    () => new qiyunProxy().Start(),
    () => new Xici().Start(),
    () => new XiciGaoni().Start(),       // works, but the proxy quality is fairly low
    () => new XiciPutong().Start(),
};

foreach (Action job in jobs)
{
    Task.Run(job);
}

// Block the main thread so the background jobs keep running until Enter is pressed.
Console.ReadLine();

HttpListenerServer

/// <summary>
/// Small HTTP front end for the proxy pool. A request without both the
/// <c>ip</c> and <c>action</c> query parameters is answered with the <c>IP</c>
/// value of the entry returned by <c>Currentobject.GetQueue()</c>; a request
/// carrying both is routed to <see cref="ParameterProcess"/>.
/// </summary>
public class HttpListenerServer
{
    /// <summary>
    /// How many times <see cref="Process"/> polls the pool before giving up.
    /// </summary>
    private const int MaxQueueAttempts = 6;

    /// <summary>
    /// Pause between two polls of the pool, in milliseconds.
    /// </summary>
    private const int QueueRetryDelayMs = 1000;

    /// <summary>
    /// The underlying listener.
    /// </summary>
    private readonly HttpListener listerner;

    /// <summary>
    /// Background thread running the accept loop.
    /// </summary>
    private Thread thread;

    /// <summary>
    /// Creates the listener and registers <paramref name="url"/> as its only prefix.
    /// Nothing is bound until <see cref="StartLisener"/> is called.
    /// </summary>
    /// <param name="url">Listener prefix, e.g. <c>http://*:10086/</c>.</param>
    public HttpListenerServer(string url = "http://*:10086/")
    {
        listerner = new HttpListener();
        listerner.AuthenticationSchemes = AuthenticationSchemes.Anonymous;
        listerner.Prefixes.Add(url);
    }

    /// <summary>
    /// Starts the accept loop on a background thread and returns immediately.
    /// Each accepted request is handled on its own thread by <see cref="Process"/>.
    /// </summary>
    public void StartLisener()
    {
        thread = new Thread(() =>
        {
            listerner.Start();
            // Announce success only once the listener has really started;
            // previously this was printed even when Start() was about to fail.
            Console.WriteLine("代理服务器开启成功!");

            while (true)
            {
                // Blocks until the next request arrives.
                HttpListenerContext httpListenerContext = listerner.GetContext();

                Thread worker = new Thread(() => Process(httpListenerContext));
                // Match the accept thread: a slow request must not keep the
                // process alive after the main thread has exited.
                worker.IsBackground = true;
                worker.Start();
            }
        });
        thread.IsBackground = true;
        thread.Start();
    }

    /// <summary>
    /// Handles one request and always tries to send a 200 response.
    /// </summary>
    /// <param name="httpContext">Context of the request to answer.</param>
    public void Process(HttpListenerContext httpContext)
    {
        try
        {
            httpContext.Response.StatusCode = 200;
            using (StreamWriter writer = new StreamWriter(httpContext.Response.OutputStream))
            {
                try
                {
                    string ip = httpContext.Request.QueryString["ip"];
                    string action = httpContext.Request.QueryString["action"];
                    if (string.IsNullOrEmpty(ip) || string.IsNullOrEmpty(action))
                    {
                        WriteNextProxy(writer);
                    }
                    else
                    {
                        ParameterProcess(ip, action, writer);
                    }
                }
                catch (Exception ex)
                {
                    // Report the failure through the writer that is still open.
                    // Opening a second writer after this one has been disposed
                    // (the old approach) targets an already-closed stream.
                    Console.WriteLine("{0}> 接口异常:{1}", DateTime.Now.ToString("s"), ex.Message);
                    writer.WriteLine("false");
                }
            }
        }
        catch (Exception ex)
        {
            // The response itself could not be opened or flushed (for example the
            // client went away). Log it and drop the connection.
            Console.WriteLine("{0}> 接口异常:{1}", DateTime.Now.ToString("s"), ex.Message);
            httpContext.Response.Abort();
        }
    }

    /// <summary>
    /// Handles a request that carries both <paramref name="ip"/> and
    /// <paramref name="action"/>. Only <c>del</c> is recognised; any other
    /// action is answered with <c>false</c> instead of an empty body.
    /// </summary>
    /// <param name="ip">Proxy address the action refers to.</param>
    /// <param name="action">Requested operation.</param>
    /// <param name="writer">Writer for the response body.</param>
    public void ParameterProcess(string ip, string action, StreamWriter writer)
    {
        switch (action)
        {
            case "del":
                // NOTE(review): this only acknowledges the request; nothing in
                // this class removes <ip> from the pool - confirm whether the
                // removal is still to be implemented.
                writer.WriteLine("true");
                break;
            default:
                writer.WriteLine("false");
                break;
        }
    }

    /// <summary>
    /// Polls <c>Currentobject.GetQueue()</c> up to <see cref="MaxQueueAttempts"/>
    /// times and writes the entry's <c>IP</c> as soon as one is available;
    /// writes <c>False</c> when every attempt came back empty.
    /// </summary>
    /// <param name="writer">Writer for the response body.</param>
    private static void WriteNextProxy(StreamWriter writer)
    {
        for (int attempt = 1; attempt <= MaxQueueAttempts; attempt++)
        {
            var queue = Currentobject.GetQueue();
            if (queue != null)
            {
                writer.WriteLine(queue.IP);
                return;
            }

            // No point in sleeping after the final attempt.
            if (attempt < MaxQueueAttempts)
            {
                Thread.Sleep(QueueRetryDelayMs);
            }
        }

        // Kept as bool.ToString() ("False") so existing clients see the same text.
        writer.WriteLine(false.ToString());
    }
}

以上是核心方法。

下面是一个简单的西刺代理爬取核心方法:

using HtmlAgilityPack;
using HttpRequestCore;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading;
using System.Threading.Tasks;

namespace ProxyIPPool
{
    /// <summary>
    /// Collector for the Xici (xicidaili.com) proxy list. Every 20 minutes it
    /// downloads the page, reads each row of the <c>ip_list</c> table and hands
    /// proxies that are not yet known to <c>Currentobject</c> for validation.
    /// </summary>
    public class Xici : IProxyAction
    {
        private const string ListUrl = "https://www.xicidaili.com/";
        private const string BrowserUserAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36";
        private const string SessionCookie = "__cfduid=dbc0747d4f20e880b1f3fbeddd7ee7f9b1518096123; Hm_lvt_24b7d5cc1b26f24f256b6869b069278e=1518096226; yjs_id=4b3a274b8bb0be2202978cee06964563; yd_cookie=5bc973fb-f37b-425614221fbda95de6a441f2298ff2543cf1; UM_distinctid=1654c7c2cf02a0-09d94a3256c3ad-514d2f1f-144000-1654c7c2cf35bb; CNZZDATA1254651946=1879619778-1534585157-%7C1534585157; Hm_lvt_8ccd0ef22095c2eebfe4cd6187dea829=1534586531; Hm_lpvt_8ccd0ef22095c2eebfe4cd6187dea829=1534586545";

        /// <summary>Only rows with exactly this many td cells are read as proxies.</summary>
        private const int ExpectedCellCount = 8;

        /// <summary>Upper bound, in milliseconds, to wait for one proxy's validation.</summary>
        private const int ValidationTimeoutMs = 4 * 1000;

        private static readonly TimeSpan CrawlInterval = TimeSpan.FromMinutes(20);

        /// <summary>
        /// Runs the crawl loop forever: one pass over the list page, then a
        /// 20 minute pause. A failed pass is logged and retried next cycle.
        /// </summary>
        public void Start()
        {
            while (true)
            {
                try
                {
                    CrawlOnce();
                }
                catch (Exception ex)
                {
                    Log("西刺抓取异常", ex);
                }
                Thread.Sleep(CrawlInterval);
            }
        }

        /// <summary>
        /// Downloads the list page once and processes every row of the ip_list table.
        /// </summary>
        private static void CrawlOnce()
        {
            RequestInfo requestInfo = new RequestInfo(ListUrl, HttpMethod.GET, new HttpDefaultConfig() { UserAgent = BrowserUserAgent });
            requestInfo.Headers.Add("Cookie", SessionCookie);
            var html = HttpCore.Execute(requestInfo);

            HtmlDocument doc = new HtmlDocument();
            doc.LoadHtml(html);

            var table = doc.DocumentNode.SelectSingleNode("//table[@id='ip_list']");
            if (table == null)
            {
                // Page layout changed or the request was blocked; nothing to parse.
                return;
            }

            // ".//tr" is relative to the table. The former "//tr" is an absolute
            // XPath and matched every row in the whole document.
            var rows = table.SelectNodes(".//tr");
            if (rows == null)
            {
                // SelectNodes yields null, not an empty collection, when nothing matches.
                return;
            }

            foreach (HtmlNode row in rows)
            {
                try
                {
                    ProcessRow(row);
                }
                catch (Exception ex)
                {
                    // One bad row must not abort the rest of the page.
                    Log("西刺解析异常", ex);
                }
            }
        }

        /// <summary>
        /// Reads one table row and, if it describes a proxy that is not yet
        /// known, starts its validation and waits up to
        /// <see cref="ValidationTimeoutMs"/> for it before returning.
        /// </summary>
        /// <param name="row">A tr node of the ip_list table.</param>
        private static void ProcessRow(HtmlNode row)
        {
            var cells = row.SelectNodes("td");
            // Rows without td cells (e.g. a header row) come back as null.
            if (cells == null || cells.Count != ExpectedCellCount)
            {
                return;
            }

            string ip = cells[1].InnerText.Trim();
            int port;
            if (ip.Length == 0 || !int.TryParse(cells[2].InnerText.Trim(), out port))
            {
                return;
            }

            ProxyIP proxy = new ProxyIP() { IP = string.Format("{0}:{1}", ip, port), IPAddress = ip, Port = port, CreateTime = DateTime.Now, State = ProxyIPState.未验证 };
            // Skip proxies that are already known.
            if (Currentobject.IsExit(proxy))
            {
                return;
            }

            // Validate off-thread so a hanging proxy delays the crawl by at most
            // the timeout; the validation itself keeps running after the wait expires.
            Task validation = Task.Run(() =>
            {
                try
                {
                    if (Currentobject.CheckProxyIp(proxy, "西刺"))
                    {
                        Currentobject.AddOrUpdate(proxy);
                    }
                }
                catch (Exception ex)
                {
                    Log("西刺验证异常", ex);
                }
            });
            validation.Wait(ValidationTimeoutMs);
        }

        /// <summary>
        /// Writes a timestamped one-line error message to the console.
        /// </summary>
        private static void Log(string what, Exception ex)
        {
            Console.WriteLine("{0}> {1}:{2}", DateTime.Now.ToString("s"), what, ex.Message);
        }
    }
}

 

以上就是代理IP的核心,其他功能都是在此基础上扩展而来。

 

 

 

 

 

 

 

 

标签:Task,Run,IP,爬虫,代理,Start,ip,new
From: https://blog.51cto.com/kesshei/6287431

相关文章