本文主要介绍如何通过DotnetSpider编写少量代码,快速地实现网页抓取。
1、 通过Nuget安装引用DotnetSpider
在项目上右键 -》选择"管理NuGet程序包" -》搜索"DotnetSpider" -》点击"DotnetSpider.Core"安装,另外还要安装"DotnetSpider.Extension"。
2、数据存储EntityPipeline
可以使用框架提供的ConsoleEntityPipeline
实现控制台输出,框架还支持Excel、MySQL、MongoDB等存储方式,命名空间在DotnetSpider.Extension.Pipeline
下,可以在该命名空间下查看其它EntityPipeline的实现类;也可以继承EntityPipeline类,实现自己的存储逻辑,例如:
/// <summary>
/// Entity pipeline that appends every extracted entity to a text file,
/// one JSON document per line.
/// </summary>
public class StoragePipeline : EntityPipeline
{
    /// <summary>Path of the output file, relative to the working directory.</summary>
    private const string OutputPath = "./web.txt";

    /// <summary>
    /// Gate shared by all instances, because they all append to the same file.
    /// A dedicated object is used instead of <c>lock (this)</c> so outside code
    /// cannot contend on the same monitor.
    /// </summary>
    private static readonly object FileGate = new object();

    /// <summary>
    /// Serializes each entity to JSON and appends it as one line to the output file.
    /// </summary>
    /// <param name="items">Entities to store; may be <c>null</c>.</param>
    /// <param name="sender">Not used by this pipeline.</param>
    /// <returns>The number of entities written, or 0 when there is nothing to write.</returns>
    protected override int Process(List<IBaseEntity> items, dynamic sender = null)
    {
        if (items == null || items.Count == 0)
        {
            return 0;
        }

        lock (FileGate)
        {
            // File.AppendText creates the file when it does not exist, so no
            // separate File.Create call is needed (its undisposed FileStream
            // would keep the file locked and make the append fail).
            using (var writer = File.AppendText(OutputPath))
            {
                foreach (var data in items)
                {
                    writer.WriteLine(Newtonsoft.Json.JsonConvert.SerializeObject(data));
                }
            }
        }

        return items.Count;
    }
}
3、爬虫类的实现
继承EntitySpider类来实现爬虫类,继承BaseEntity类来实现爬虫实体。实体属性要加上[Column]标签,通过[Field]标签写XPath表达式提取内容,还可以用[ReplaceFormatter]标签对提取到的内容做字符串替换,最终的值会赋给对应的实体属性,例如:
/// <summary>
/// Spider that crawls the paged Stack Overflow "python" tag listing and
/// extracts one <see cref="StackoverflowSearchEntry"/> per question node.
/// </summary>
private class SpiderWeb : EntitySpider
{
    /// <summary>
    /// Queues the page requests and registers the entity type and the storage pipeline.
    /// </summary>
    /// <param name="arguments">Not used by this spider.</param>
    protected override void OnInit(params string[] arguments)
    {
        var listRequest = new List<Request>();

        // Queue one request per listing page (pages 1 through 499). The page
        // number is also attached to the request so the entity can read it back.
        for (int page = 1; page < 500; page++)
        {
            listRequest.Add(new Request(
                string.Format("https://stackoverflow.com/questions/tagged/python?sort=frequent&page={0}&pagesize=15", page),
                new Dictionary<string, dynamic> { { "page", page } }));
        }

        AddRequests(listRequest);
        AddEntityType<StackoverflowSearchEntry>();

        // Alternative: AddPipeline(new ConsoleEntityPipeline()) to print to the console instead.
        AddPipeline(new StoragePipeline());
    }

    /// <summary>
    /// One question entry; selected by the XPath expression on the Entity attribute.
    /// </summary>
    [Schema("stackoverflow", "stackoverflow_search_entity_model")]
    [Entity(Expression = "//div[@id='questions']/*", Type = SelectorType.XPath)]
    class StackoverflowSearchEntry : BaseEntity
    {
        // NOTE(review): presumably resolves to the "page" value attached to the request in OnInit - confirm.
        [Column]
        [Field(Expression = "page", Type = SelectorType.Enviroment)]
        public string Page { get; set; }

        // Question title link, with the <em> highlight tags replaced by empty strings.
        [Column]
        [Field(Expression = ".//div[@class='summary']/h3/a")]
        [ReplaceFormatter(NewValue = "", OldValue = "<em>")]
        [ReplaceFormatter(NewValue = "", OldValue = "</em>")]
        public string Title { get; set; }

        // href attribute of the question title link.
        [Column]
        [Field(Expression = ".//div[@class='summary']/h3/a/@href")]
        public string Url { get; set; }

        // NOTE(review): name kept lowercase on purpose; it may be used as the stored column name - confirm before renaming.
        [Column]
        [Field(Expression = ".//div[@class='summary']/div[1]")]
        public string description { get; set; }

        // The complete matched content of the entry.
        [Column]
        [Field(Expression = ".", Option = FieldOptions.InnerText)]
        public string PlainText { get; set; }
    }
}
4、DotnetSpider使用完整代码
using DotnetSpider.Downloader;
using DotnetSpider.Extension;
using DotnetSpider.Extension.Model;
using DotnetSpider.Extension.Pipeline;
using DotnetSpider.Extraction;
using DotnetSpider.Extraction.Model;
using DotnetSpider.Extraction.Model.Attribute;
using DotnetSpider.Extraction.Model.Formatter;
using System;
using System.Collections.Generic;
using System.IO;
using System.Runtime.CompilerServices;

namespace SpiderContent
{
    /// <summary>
    /// Console entry point: runs the Stack Overflow spider and waits for a key press.
    /// </summary>
    class Program
    {
        static void Main(string[] args)
        {
            Console.WriteLine("Hello World!");

            var spider = new SpiderWeb();

            // Pause between fetches so the site is not hit too frequently.
            // NOTE(review): value is presumably in milliseconds - confirm against the DotnetSpider docs.
            spider.SleepTime = 1000;
            spider.Run();

            Console.ReadKey();
        }

        /// <summary>
        /// Spider that crawls the paged Stack Overflow "python" tag listing and
        /// extracts one <see cref="StackoverflowSearchEntry"/> per question node.
        /// </summary>
        private class SpiderWeb : EntitySpider
        {
            /// <summary>
            /// Queues the page requests and registers the entity type and the storage pipeline.
            /// </summary>
            /// <param name="arguments">Not used by this spider.</param>
            protected override void OnInit(params string[] arguments)
            {
                var listRequest = new List<Request>();

                // Queue one request per listing page (pages 1 through 499). The page
                // number is also attached to the request so the entity can read it back.
                for (int page = 1; page < 500; page++)
                {
                    listRequest.Add(new Request(
                        string.Format("https://stackoverflow.com/questions/tagged/python?sort=frequent&page={0}&pagesize=15", page),
                        new Dictionary<string, dynamic> { { "page", page } }));
                }

                AddRequests(listRequest);
                AddEntityType<StackoverflowSearchEntry>();

                // Alternative: AddPipeline(new ConsoleEntityPipeline()) to print to the console instead.
                AddPipeline(new StoragePipeline());
            }

            /// <summary>
            /// One question entry; selected by the XPath expression on the Entity attribute.
            /// </summary>
            [Schema("stackoverflow", "stackoverflow_search_entity_model")]
            [Entity(Expression = "//div[@id='questions']/*", Type = SelectorType.XPath)]
            class StackoverflowSearchEntry : BaseEntity
            {
                // NOTE(review): presumably resolves to the "page" value attached to the request in OnInit - confirm.
                [Column]
                [Field(Expression = "page", Type = SelectorType.Enviroment)]
                public string Page { get; set; }

                // Question title link, with the <em> highlight tags replaced by empty strings.
                [Column]
                [Field(Expression = ".//div[@class='summary']/h3/a")]
                [ReplaceFormatter(NewValue = "", OldValue = "<em>")]
                [ReplaceFormatter(NewValue = "", OldValue = "</em>")]
                public string Title { get; set; }

                // href attribute of the question title link.
                [Column]
                [Field(Expression = ".//div[@class='summary']/h3/a/@href")]
                public string Url { get; set; }

                // NOTE(review): name kept lowercase on purpose; it may be used as the stored column name - confirm before renaming.
                [Column]
                [Field(Expression = ".//div[@class='summary']/div[1]")]
                public string description { get; set; }

                // The complete matched content of the entry.
                [Column]
                [Field(Expression = ".", Option = FieldOptions.InnerText)]
                public string PlainText { get; set; }
            }
        }
    }

    /// <summary>
    /// Entity pipeline that appends every extracted entity to a text file,
    /// one JSON document per line.
    /// </summary>
    public class StoragePipeline : EntityPipeline
    {
        /// <summary>Path of the output file, relative to the working directory.</summary>
        private const string OutputPath = "./web.txt";

        /// <summary>
        /// Gate shared by all instances, because they all append to the same file.
        /// A dedicated object is used instead of <c>lock (this)</c> so outside code
        /// cannot contend on the same monitor.
        /// </summary>
        private static readonly object FileGate = new object();

        /// <summary>
        /// Serializes each entity to JSON and appends it as one line to the output file.
        /// </summary>
        /// <param name="items">Entities to store; may be <c>null</c>.</param>
        /// <param name="sender">Not used by this pipeline.</param>
        /// <returns>The number of entities written, or 0 when there is nothing to write.</returns>
        protected override int Process(List<IBaseEntity> items, dynamic sender = null)
        {
            if (items == null || items.Count == 0)
            {
                return 0;
            }

            lock (FileGate)
            {
                // File.AppendText creates the file when it does not exist, so no
                // separate File.Create call is needed (its undisposed FileStream
                // would keep the file locked and make the append fail).
                using (var writer = File.AppendText(OutputPath))
                {
                    foreach (var data in items)
                    {
                        writer.WriteLine(Newtonsoft.Json.JsonConvert.SerializeObject(data));
                    }
                }
            }

            return items.Count;
        }
    }
}
标签:Core,string,DotnetSpider,class,public,using,NET,page From: https://www.cnblogs.com/fireicesion/p/16809554.html