近来闲来无事,看小说的时候发现页面上全是垃圾流氓广告,于是突发奇想:要不自己把小说内容爬取下来?说干就干。
1、简介:所谓小说爬取,无非就是对请求返回的 HTML 内容进行解析,提取出自己想要的内容并保存。
2、采用的技术为 .NET 6.0,使用的第三方组件为 AngleSharp(用于解析 HTML)。
3、废话不多说,直接上代码
/// <summary>
/// Book information.
/// </summary>
public class EBook
{
    /// <summary>Network address of the book.</summary>
    public string? NetUrl { get; set; }

    /// <summary>Network address of the book's image.</summary>
    public string? ImageUrl { get; set; }

    /// <summary>Name of the book.</summary>
    public string? BookName { get; set; }

    /// <summary>Short description of the book.</summary>
    public string? BookDesc { get; set; }

    /// <summary>Author of the book.</summary>
    public string? BookAuthor { get; set; }

    /// <summary>Type (category) of the book.</summary>
    public string? BookType { get; set; }

    /// <summary>Most recent update time of the book, kept as text.</summary>
    public string? UpdateDate { get; set; }

    /// <summary>Latest chapter of the book.</summary>
    public string? NewContent { get; set; }
}

/// <summary>
/// One entry of a book's table of contents.
/// </summary>
public class EBookContentList
{
    /// <summary>Descriptive text of the entry.</summary>
    public string? ItemDesc { get; set; }

    /// <summary>Network address of the entry.</summary>
    public string? NetUrl { get; set; }
}

/// <summary>
/// Concrete content of a single article (chapter).
/// </summary>
public class EBookContentInfo
{
    /// <summary>Text content of the article.</summary>
    public string? Content { get; set; }

    /// <summary>Link to the previous chapter.</summary>
    public string? UpUrl { get; set; }

    /// <summary>Link to the next chapter.</summary>
    public string? DownUrl { get; set; }
}
/// <summary>
/// Downloads pages from the site at <c>HtmlUrl</c> and extracts search results,
/// chapter lists and chapter text from the returned HTML using AngleSharp.
/// </summary>
public class EBookService
{
    /// <summary>
    /// Base address of the site. Request URLs are built from it and links found in
    /// the pages are resolved against it.
    /// </summary>
    private const string HtmlUrl = "https://www.aishangba.org";

    /// <summary>
    /// Requests one page of search results for a keyword and maps each result item to an <see cref="EBook"/>.
    /// </summary>
    /// <param name="KeyWord">Search keyword. It is URL-encoded before being placed in the query string; <c>null</c> is sent as an empty keyword.</param>
    /// <param name="Page">Page number passed to the search endpoint.</param>
    /// <returns>
    /// The books found on the page, already materialized. Empty when the page contains no
    /// <c>.result-list</c> element or no <c>.result-item</c> entries. Fields whose markup is
    /// missing are <c>null</c>.
    /// </returns>
    public static async Task<IEnumerable<EBook>> GetSearchResultListAsync(string KeyWord, int Page = 1)
    {
        // Escape the keyword so spaces, '&', '#', '+' and non-ASCII text cannot corrupt the query string.
        string url = HtmlUrl + $"/search.php?keyword={Uri.EscapeDataString(KeyWord ?? string.Empty)}&page={Page}";

        using var document = await LoadDocumentAsync(url);

        var resultList = document.QuerySelector(".result-list");
        if (resultList is null)
        {
            return new List<EBook>();
        }

        // ToList() runs the projection exactly once, while the document is still alive;
        // returning the bare Select would re-query the DOM on every enumeration.
        return resultList
            .QuerySelectorAll(".result-item")
            .Select(item =>
            {
                // Look the info container up once instead of once per field.
                var info = item.QuerySelector(".result-game-item-info");
                var image = item.QuerySelector(".result-game-item-pic-link-img");

                return new EBook
                {
                    NetUrl = ResolveUrl(item.QuerySelector(".result-game-item-pic-link")?.GetAttribute("href")),
                    // NOTE(review): the class name suggests this element is an <img>, whose address
                    // lives in "src"; "href" (the original lookup) is kept as a fallback. Confirm against the site markup.
                    ImageUrl = ResolveUrl(image?.GetAttribute("src") ?? image?.GetAttribute("href")),
                    BookName = item.QuerySelector(".result-game-item-title-link")?.GetAttribute("title"),
                    BookDesc = item.QuerySelector(".result-game-item-desc")?.Text(),
                    BookAuthor = GetInfoValue(info, 0),
                    BookType = GetInfoValue(info, 1),
                    UpdateDate = GetInfoValue(info, 2),
                    NewContent = GetInfoValue(info, 3)
                };
            })
            .ToList();
    }

    /// <summary>
    /// Downloads a book page and returns one entry per <c>dd</c> element found inside the element with id <c>list</c>.
    /// </summary>
    /// <param name="BookUrl">Network address of the book page.</param>
    /// <returns>
    /// The entries, already materialized. Empty when the page has no <c>#list</c> element or it
    /// contains no <c>dd</c> elements. An entry's <see cref="EBookContentList.NetUrl"/> is
    /// <c>null</c> when the <c>dd</c> has no link.
    /// </returns>
    public static async Task<IEnumerable<EBookContentList>?> GetBookContentListAsync(string BookUrl)
    {
        using var document = await LoadDocumentAsync(BookUrl);

        var chapterList = document.QuerySelector("#list");
        if (chapterList is null)
        {
            return new List<EBookContentList>();
        }

        return chapterList
            .QuerySelectorAll("dd")
            .Select(entry => new EBookContentList
            {
                ItemDesc = entry.Text(),
                NetUrl = ResolveUrl(entry.QuerySelector("a")?.GetAttribute("href"))
            })
            .ToList();
    }

    /// <summary>
    /// Downloads a chapter page and extracts its text plus the links taken from the first and
    /// third children of the <c>.bottem1</c> element.
    /// </summary>
    /// <param name="BookInfoUrl">Network address of the chapter page.</param>
    /// <returns>
    /// The chapter data. <see cref="EBookContentInfo.Content"/> is <c>null</c> when the page has no
    /// <c>#content</c> element; the links are <c>null</c> when <c>.bottem1</c> or the expected child is missing.
    /// </returns>
    public static async Task<EBookContentInfo> GetBookContentInfoAsync(string BookInfoUrl)
    {
        using var document = await LoadDocumentAsync(BookInfoUrl);

        // Either element may be absent (error page, changed layout); do not dereference blindly.
        var contentElement = document.QuerySelector("#content");
        var navigation = document.QuerySelector(".bottem1");

        var eBookContent = new EBookContentInfo
        {
            Content = contentElement?.TextContent,
            // ElementAtOrDefault yields null for a missing child instead of failing on the index.
            UpUrl = ResolveUrl(navigation?.Children.ElementAtOrDefault(0)?.GetAttribute("href")),
            DownUrl = ResolveUrl(navigation?.Children.ElementAtOrDefault(2)?.GetAttribute("href"))
        };

        // Diagnostic dump retained from the original implementation.
        Console.WriteLine(Newtonsoft.Json.JsonConvert.SerializeObject(eBookContent));
        return eBookContent;
    }

    /// <summary>
    /// Fetches the HTML at <paramref name="url"/> and parses it into a document with the default AngleSharp configuration.
    /// </summary>
    private static async Task<AngleSharp.Dom.IDocument> LoadDocumentAsync(string url)
    {
        var source = await NetHelper.GetQueryToStringAsync(url);
        var context = BrowsingContext.New(Configuration.Default);
        return await context.OpenAsync(req => req.Content(source));
    }

    /// <summary>
    /// Resolves a link taken from a page against <c>HtmlUrl</c>.
    /// Returns <c>null</c> for a missing or blank link rather than the bare site root, and
    /// leaves already-absolute links intact instead of prefixing them a second time.
    /// </summary>
    private static string? ResolveUrl(string? href)
    {
        if (string.IsNullOrWhiteSpace(href))
        {
            return null;
        }

        return Uri.TryCreate(new Uri(HtmlUrl), href, out var absolute)
            ? absolute.AbsoluteUri
            : HtmlUrl + href; // Unparseable link: fall back to plain concatenation.
    }

    /// <summary>
    /// Reads the text of the second child of the <paramref name="row"/>-th child of the info container,
    /// or <c>null</c> when any element along that path is missing.
    /// </summary>
    private static string? GetInfoValue(AngleSharp.Dom.IElement? info, int row)
    {
        return info?.Children.ElementAtOrDefault(row)?.Children.ElementAtOrDefault(1)?.Text();
    }
}
标签:set,string,get,Children,爬取,var,网页内容,小说,public From: https://www.cnblogs.com/qindr/p/16841783.html