在用爬虫抓取数据的时候,很容易出现IP被封、返回403、弹出验证码等情况。遇到这类情况时,一般会使用代理IP,通过大量、频繁地更换IP,来实现数据的高频爬取和并发爬取。
现在的做法是增加一个这样的代理IP服务,让它持续抓取几个免费的代理IP平台,大致流程如下:
以上图片来源于网络.
其中代理的网站大致有:
链接:http://note.youdao.com/noteshare?id=96531d191709330d79d66088323619e0&sub=39E37FA9387C46C18B5AD3D07B096437
链接:http://note.youdao.com/noteshare?id=2f241bbf336978b86f89a1a268ac9478&sub=6290BEA38B3C47FEAF1B931A0E275341
// Alternative free-proxy listing pages. Each line assigns the same para.URL
// property, so if run in sequence only the last assignment would remain in effect.
para.URL = "http://www.xicidaili.com/nn/1"; // Xici
para.URL = "http://ip84.com/dlgn"; // IP Bus (ip84)
para.URL = "http://www.ip3366.net/free/?stype=1"; // Yun Daili ("cloud proxy")
para.URL = "http://www.iphai.com/free/ng"; // IP Hai
para.URL = "http://www.66ip.cn/nmtq.php?getnum=10&isp=0&anonymoustype=3&start=&ports=&export=&ipaddress=&area=1&proxytype=2&api=66ip"; // 66ip
# Raw proxy.list file from the fate0/proxylist repository on GitHub.
proxy_url = 'https://raw.githubusercontent.com/fate0/proxylist/master/proxy.list'
以上是代理的常用接口和平台
// Service entry point: start the HTTP listener that hands collected proxy IPs
// to API callers, then start one crawler per free-proxy source.
string url = "http://*:10086/";
Console.Title = url;

// StartLisener() spawns its own background thread and returns immediately, so it
// is called directly; wrapping it in Task.Run would only hide constructor
// failures (e.g. an invalid prefix) inside an unobserved task.
new HttpListenerServer(url).StartLisener();

// Proxy sources. Each Start() is expected to loop forever (Xici.Start does),
// so each one gets a dedicated thread via LongRunning instead of permanently
// occupying a thread-pool worker.
Action[] crawlers =
{
    () => new Free89().Start(),
    () => new gitProxy().Start(),
    () => new IpAear().Start(),          // not usable, times out too badly
    () => new kuaidailiGaoNi().Start(),
    () => new kuaidailiPuTong().Start(),
    () => new Liunian().Start(),         // still works
    () => new NiMingGet().Start(),
    () => new qiyunProxy().Start(),
    () => new Xici().Start(),
    () => new XiciGaoni().Start(),       // usable, but low quality
    () => new XiciPutong().Start(),
};
foreach (Action crawler in crawlers)
{
    Task.Factory.StartNew(crawler, TaskCreationOptions.LongRunning);
}

// Keep the process alive until the operator presses Enter.
Console.ReadLine();
HttpListenerServer
/// <summary>
/// Small HTTP endpoint in front of the proxy pool. A request without both the
/// <c>ip</c> and <c>action</c> query parameters receives the next proxy taken from
/// <c>Currentobject.GetQueue()</c>; a request carrying both is dispatched to
/// <see cref="ParameterProcess"/>.
/// </summary>
public class HttpListenerServer
{
    /// <summary>
    /// How many times the pool is polled before giving up on a request.
    /// </summary>
    private const int MaxQueuePolls = 6;

    /// <summary>
    /// Pause between two polls of an empty pool, in milliseconds.
    /// </summary>
    private const int QueuePollDelayMs = 1000;

    /// <summary>
    /// The underlying listener.
    /// </summary>
    HttpListener listerner;

    /// <summary>
    /// Background thread running the accept loop.
    /// </summary>
    Thread thread;

    /// <summary>
    /// Creates the listener and registers the prefix; listening does not begin
    /// until <see cref="StartLisener"/> is called.
    /// </summary>
    /// <param name="url">URI prefix to listen on.</param>
    public HttpListenerServer(string url = "http://*:10086/")
    {
        listerner = new HttpListener();
        listerner.AuthenticationSchemes = AuthenticationSchemes.Anonymous;
        listerner.Prefixes.Add(url);
    }

    /// <summary>
    /// Starts listening and launches the background accept loop. A start failure
    /// (for example a prefix that is already in use or not permitted) is logged
    /// and the method returns without starting the loop.
    /// </summary>
    public void StartLisener()
    {
        try
        {
            // Start on the calling thread so the success message below is only
            // printed once the listener is really accepting connections.
            listerner.Start();
        }
        catch (HttpListenerException ex)
        {
            Console.WriteLine("{0}> 代理服务器启动失败:{1}", DateTime.Now.ToString("s"), ex.Message);
            return;
        }

        thread = new Thread(AcceptLoop);
        thread.IsBackground = true;
        thread.Start();
        Console.WriteLine("代理服务器开启成功!");
    }

    /// <summary>
    /// Accepts requests until the listener stops, handling each on its own thread.
    /// </summary>
    private void AcceptLoop()
    {
        while (listerner.IsListening)
        {
            HttpListenerContext context;
            try
            {
                // Blocks until a request arrives.
                context = listerner.GetContext();
            }
            catch (HttpListenerException ex)
            {
                Console.WriteLine("{0}> 接口异常:{1}", DateTime.Now.ToString("s"), ex.Message);
                continue;
            }
            catch (InvalidOperationException)
            {
                // Listener was stopped or closed (ObjectDisposedException derives
                // from InvalidOperationException): nothing more to accept.
                break;
            }

            // One thread per request because Process may block for several seconds
            // while waiting for a proxy; background so it never delays process exit.
            Thread worker = new Thread(() => Process(context));
            worker.IsBackground = true;
            worker.Start();
        }
    }

    /// <summary>
    /// Handles one request and always completes the response.
    /// </summary>
    /// <remarks>
    /// The body is rendered into memory first. If rendering throws, the client
    /// receives <c>false</c>; writing straight to the response stream would have
    /// already closed it by the time the failure is noticed.
    /// </remarks>
    /// <param name="httpContext">Context of the request to answer.</param>
    public void Process(HttpListenerContext httpContext)
    {
        byte[] payload;
        try
        {
            string ip = httpContext.Request.QueryString["ip"];
            string action = httpContext.Request.QueryString["action"];
            if (string.IsNullOrEmpty(ip) || string.IsNullOrEmpty(action))
            {
                payload = Render(WriteNextProxy);
            }
            else
            {
                payload = Render(writer => ParameterProcess(ip, action, writer));
            }
        }
        catch (Exception ex)
        {
            // Thread boundary: anything escaping here would take the process down.
            Console.WriteLine("{0}> 接口异常:{1}", DateTime.Now.ToString("s"), ex.Message);
            payload = Render(writer => writer.WriteLine("false"));
        }

        try
        {
            httpContext.Response.StatusCode = 200;
            httpContext.Response.OutputStream.Write(payload, 0, payload.Length);
            httpContext.Response.Close();
        }
        catch (Exception ex)
        {
            // Typically the client went away; drop the connection and move on.
            Console.WriteLine("{0}> 接口异常:{1}", DateTime.Now.ToString("s"), ex.Message);
            httpContext.Response.Abort();
        }
    }

    /// <summary>
    /// Handles a request that carries both <c>ip</c> and <c>action</c>.
    /// </summary>
    /// <remarks>
    /// NOTE(review): the <c>del</c> action only acknowledges with <c>true</c>; no
    /// removal from the pool is performed here, and <paramref name="ip"/> is unused.
    /// Any other action writes nothing, so the client gets an empty body.
    /// </remarks>
    /// <param name="ip">Proxy address the action refers to.</param>
    /// <param name="action">Requested action; only <c>del</c> is recognised.</param>
    /// <param name="writer">Writer receiving the response body.</param>
    public void ParameterProcess(string ip, string action, StreamWriter writer)
    {
        switch (action)
        {
            case "del": // delete proxy ip address
                writer.WriteLine("true");
                break;
        }
    }

    /// <summary>
    /// Writes the next proxy from the pool, polling a few times while it is empty;
    /// writes <c>False</c> when no proxy became available.
    /// </summary>
    private static void WriteNextProxy(StreamWriter writer)
    {
        for (int attempt = 1; attempt <= MaxQueuePolls; attempt++)
        {
            var queue = Currentobject.GetQueue();
            if (queue != null)
            {
                // Emit "ip:port" for the caller to use.
                writer.WriteLine(queue.IP);
                return;
            }

            // No point sleeping after the final attempt.
            if (attempt < MaxQueuePolls)
            {
                Thread.Sleep(QueuePollDelayMs);
            }
        }

        // bool.ToString() yields "False" (capitalised); kept for existing clients.
        writer.WriteLine(false.ToString());
    }

    /// <summary>
    /// Runs <paramref name="write"/> against an in-memory writer and returns the
    /// bytes it produced (UTF-8 without BOM, the StreamWriter default).
    /// </summary>
    private static byte[] Render(Action<StreamWriter> write)
    {
        MemoryStream buffer = new MemoryStream();
        using (StreamWriter writer = new StreamWriter(buffer))
        {
            write(writer);
        }

        // ToArray is valid on a closed MemoryStream.
        return buffer.ToArray();
    }
}
以上是核心方法。
下面写一个简单的西刺代理爬取的核心方法:
using HtmlAgilityPack;
using HttpRequestCore;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
namespace ProxyIPPool
{
    /// <summary>
    /// Crawler for the Xici (xicidaili.com) free proxy list. It repeatedly downloads
    /// the listing page, validates every proxy not seen before and stores the ones
    /// that pass into the shared pool via <c>Currentobject</c>.
    /// </summary>
    public class Xici : IProxyAction
    {
        /// <summary>Source label passed to the proxy check.</summary>
        private const string SourceName = "西刺";

        /// <summary>Page that carries the proxy table.</summary>
        private const string ListUrl = "https://www.xicidaili.com/";

        private const string UserAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36";

        private const string Cookie = "__cfduid=dbc0747d4f20e880b1f3fbeddd7ee7f9b1518096123; Hm_lvt_24b7d5cc1b26f24f256b6869b069278e=1518096226; yjs_id=4b3a274b8bb0be2202978cee06964563; yd_cookie=5bc973fb-f37b-425614221fbda95de6a441f2298ff2543cf1; UM_distinctid=1654c7c2cf02a0-09d94a3256c3ad-514d2f1f-144000-1654c7c2cf35bb; CNZZDATA1254651946=1879619778-1534585157-%7C1534585157; Hm_lvt_8ccd0ef22095c2eebfe4cd6187dea829=1534586531; Hm_lpvt_8ccd0ef22095c2eebfe4cd6187dea829=1534586545";

        /// <summary>Only rows with exactly this many cells are treated as proxy rows.</summary>
        private const int ExpectedCellCount = 8;

        /// <summary>Longest time the crawl waits for one proxy check, in milliseconds.</summary>
        private const int ValidationTimeoutMs = 4 * 1000;

        /// <summary>Pause between two crawls.</summary>
        private static readonly TimeSpan CrawlInterval = TimeSpan.FromMinutes(20);

        /// <summary>
        /// Crawls forever, once every <see cref="CrawlInterval"/>. Never returns, so
        /// run it on a dedicated thread or task. A failed crawl is logged and retried
        /// after the normal interval.
        /// </summary>
        public void Start()
        {
            while (true)
            {
                try
                {
                    CrawlOnce();
                }
                catch (Exception e)
                {
                    // Keep the loop alive whatever a single crawl does.
                    Log("抓取异常", e);
                }
                Thread.Sleep(CrawlInterval);
            }
        }

        /// <summary>
        /// Downloads the listing page once and processes every row of the proxy table.
        /// </summary>
        private static void CrawlOnce()
        {
            RequestInfo requestInfo = new RequestInfo(ListUrl, HttpMethod.GET, new HttpDefaultConfig() { UserAgent = UserAgent });
            requestInfo.Headers.Add("Cookie", Cookie);
            var html = HttpCore.Execute(requestInfo);

            HtmlDocument doc = new HtmlDocument();
            doc.LoadHtml(html);

            var table = doc.DocumentNode.SelectSingleNode("//table[@id='ip_list']");
            if (table == null)
            {
                Console.WriteLine("{0}> 西刺:未找到代理列表(ip_list)", DateTime.Now.ToString("s"));
                return;
            }

            // ".//tr" is relative to the table; a leading "//" would search the whole
            // document regardless of the node it is called on.
            var rows = table.SelectNodes(".//tr");
            if (rows == null)
            {
                // SelectNodes returns null, not an empty collection, when nothing matches.
                return;
            }

            foreach (HtmlNode row in rows)
            {
                try
                {
                    ProcessRow(row);
                }
                catch (Exception e)
                {
                    // One bad row must not abort the rest of the page.
                    Log("解析异常", e);
                }
            }
        }

        /// <summary>
        /// Parses one table row and, if it holds a proxy that is not yet known,
        /// validates it and adds it to the pool on success.
        /// </summary>
        private static void ProcessRow(HtmlNode row)
        {
            var cells = row.SelectNodes("td");
            if (cells == null || cells.Count != ExpectedCellCount)
            {
                // Header rows (th only) and rows with another layout are skipped.
                return;
            }

            string ip = cells[1].InnerText.Trim();
            int port;
            if (!int.TryParse(cells[2].InnerText, out port))
            {
                return;
            }

            ProxyIP proxy = new ProxyIP() { IP = string.Format("{0}:{1}", ip, port), IPAddress = ip, Port = port, CreateTime = DateTime.Now, State = ProxyIPState.未验证 };

            // Skip proxies that are already in the pool.
            if (Currentobject.IsExit(proxy))
            {
                return;
            }

            // Validate on another thread and wait a bounded time, so a slow proxy
            // cannot stall the crawl. A check that outlives the timeout keeps running
            // and may still add the proxy later.
            Task validation = Task.Run(() =>
            {
                try
                {
                    if (Currentobject.CheckProxyIp(proxy, SourceName))
                    {
                        Currentobject.AddOrUpdate(proxy);
                    }
                }
                catch (Exception ex)
                {
                    Log("验证异常", ex);
                }
            });
            validation.Wait(ValidationTimeoutMs);
        }

        /// <summary>
        /// Writes a timestamped failure line to the console.
        /// </summary>
        private static void Log(string what, Exception e)
        {
            Console.WriteLine("{0}> 西刺{1}:{2}", DateTime.Now.ToString("s"), what, e.Message);
        }
    }
}
以上就是 代理IP的核心。其他都是根据这些扩展而来。
标签:Task,Run,IP,爬虫,代理,Start,ip,new From: https://blog.51cto.com/kesshei/6287431