首页 > 其他分享 >小说网页内容爬取

小说网页内容爬取

时间:2022-10-30 17:44:41浏览次数:88  
标签:set string get Children 爬取 var 网页内容 小说 public

最近闲来无事,看小说的时候发现页面上全是垃圾流氓广告,于是突发奇想:不如自己把小说内容爬取下来?说干就干。

1、简介:所谓小说爬取,无非就是对请求返回的 HTML 内容进行解析,提取出自己想要的内容并保存。

2、采用的技术为 .NET 6.0,使用的第三方组件为 AngleSharp。

3、废话不多说,直接上代码

    /// <summary>
    /// Book information: the metadata of a single book. All properties are nullable strings.
    /// </summary>
    public class EBook
    {
        /// <summary>
        /// Network URL of the book.
        /// </summary>
        public string? NetUrl { get; set; }

        /// <summary>
        /// Network URL of the book's image.
        /// </summary>
        public string? ImageUrl { get; set; }

        /// <summary>
        /// Book title.
        /// </summary>
        public string? BookName { get; set; }

        /// <summary>
        /// Short description (synopsis) of the book.
        /// </summary>
        public string? BookDesc { get; set; }

        /// <summary>
        /// Author of the book.
        /// </summary>
        public string? BookAuthor { get; set; }

        /// <summary>
        /// Type (category) of the book.
        /// </summary>
        public string? BookType { get; set; }

        /// <summary>
        /// Time of the book's most recent update, kept as text.
        /// </summary>
        public string? UpdateDate { get; set; }

        /// <summary>
        /// Latest chapter of the book.
        /// </summary>
        public string? NewContent { get; set; }
    }

    /// <summary>
    /// Table-of-contents entry: one item of a book's chapter list.
    /// </summary>
    public class EBookContentList
    {
        /// <summary>
        /// Descriptive text of the entry.
        /// </summary>
        public string? ItemDesc { get; set; }

        /// <summary>
        /// Network URL of the entry.
        /// </summary>
        public string? NetUrl { get; set; }
    }

    /// <summary>
    /// Concrete content of one article (chapter) together with its navigation links.
    /// </summary>
    public class EBookContentInfo
    {
        /// <summary>
        /// Text content of the article.
        /// </summary>
        public string? Content { get; set; }

        /// <summary>
        /// Link to the previous chapter.
        /// </summary>
        public string? UpUrl { get; set; }

        /// <summary>
        /// Link to the next chapter.
        /// </summary>
        public string? DownUrl { get; set; }
    }
/// <summary>
/// Scrapes book search results, chapter lists and chapter text from the site at <see cref="HtmlUrl"/>.
/// </summary>
/// <remarks>
/// Each method downloads a page through <c>NetHelper.GetQueryToStringAsync</c>, parses it with AngleSharp
/// and maps the matching elements to the EBook* models. Elements or attributes that are absent from the
/// page produce <c>null</c> property values (or an empty list) rather than an exception.
/// </remarks>
public class EBookService
    {
        /// <summary>
        /// Base URL of the scraped site; relative links found in pages are appended to it.
        /// </summary>
        private const string HtmlUrl = "https://www.aishangba.org";

        /// <summary>
        /// Runs a keyword search on the site and returns the books listed on the requested result page.
        /// </summary>
        /// <param name="KeyWord">Search keyword; it is percent-escaped before being placed in the query string.</param>
        /// <param name="Page">Result page number passed as the <c>page</c> query parameter.</param>
        /// <returns>The parsed books; an empty list when the page has no <c>.result-list</c> element or no items.</returns>
        public static async Task<IEnumerable<EBook>> GetSearchResultListAsync(string KeyWord, int Page = 1)
        {
            // Escape the keyword so that '&', '#', '+' or spaces cannot corrupt the query string.
            // NOTE(review): assumes NetHelper does not escape the URL a second time — confirm.
            string url = $"{HtmlUrl}/search.php?keyword={Uri.EscapeDataString(KeyWord ?? string.Empty)}&page={Page}";

            var source = await NetHelper.GetQueryToStringAsync(url);
            var context = BrowsingContext.New(Configuration.Default);
            var document = await context.OpenAsync(req => req.Content(source));

            var resultList = document.QuerySelector(".result-list");
            if (resultList == null)
            {
                return new List<EBook>();
            }

            // Materialize with ToList so callers enumerating twice do not re-run every selector.
            return resultList.QuerySelectorAll(".result-item")
                .Select(item =>
                {
                    // The info block is queried once; each child row holds a label followed by a value element.
                    var info = item.QuerySelector(".result-game-item-info");
                    string? InfoValue(int row) =>
                        info?.Children.ElementAtOrDefault(row)?.Children.ElementAtOrDefault(1)?.Text();

                    // NOTE(review): the "-img" class suggests an <img>, whose address lives in "src";
                    // "href" (what the code read before) is kept as a fallback — confirm against the page markup.
                    var cover = item.QuerySelector(".result-game-item-pic-link-img");
                    string? coverAddress = cover?.GetAttribute("src") ?? cover?.GetAttribute("href");

                    return new EBook
                    {
                        NetUrl = ToAbsoluteUrl(item.QuerySelector(".result-game-item-pic-link")?.GetAttribute("href")),
                        ImageUrl = ToAbsoluteUrl(coverAddress),
                        BookName = item.QuerySelector(".result-game-item-title-link")?.GetAttribute("title"),
                        BookDesc = item.QuerySelector(".result-game-item-desc")?.Text(),
                        BookAuthor = InfoValue(0),
                        BookType = InfoValue(1),
                        UpdateDate = InfoValue(2),
                        NewContent = InfoValue(3)
                    };
                })
                .ToList();
        }

        /// <summary>
        /// Downloads a book page and returns its chapter list.
        /// </summary>
        /// <param name="BookUrl">Network URL of the book page.</param>
        /// <returns>One entry per <c>dd</c> element inside <c>#list</c>; an empty list when none are found.</returns>
        public static async Task<IEnumerable<EBookContentList>?> GetBookContentListAsync(string BookUrl)
        {
            var source = await NetHelper.GetQueryToStringAsync(BookUrl);
            var context = BrowsingContext.New(Configuration.Default);
            var document = await context.OpenAsync(req => req.Content(source));

            var chapterList = document.QuerySelector("#list");
            if (chapterList == null)
            {
                return new List<EBookContentList>();
            }

            // Materialized for the same reason as the search results: avoid re-querying the DOM per enumeration.
            return chapterList.QuerySelectorAll("dd")
                .Select(entry => new EBookContentList
                {
                    ItemDesc = entry.Text(),
                    NetUrl = ToAbsoluteUrl(entry.QuerySelector("a")?.GetAttribute("href"))
                })
                .ToList();
        }

        /// <summary>
        /// Downloads a chapter page and returns its text plus the previous/next chapter links.
        /// </summary>
        /// <param name="BookInfoUrl">Network URL of the chapter page.</param>
        /// <returns>
        /// The chapter content; any property is <c>null</c> when the corresponding element is missing from the page.
        /// </returns>
        public static async Task<EBookContentInfo> GetBookContentInfoAsync(string BookInfoUrl)
        {
            var source = await NetHelper.GetQueryToStringAsync(BookInfoUrl);
            var context = BrowsingContext.New(Configuration.Default);
            var document = await context.OpenAsync(req => req.Content(source));

            var contentElement = document.QuerySelector("#content");
            // "bottem1" is the class name exactly as this code has always queried it.
            var navigation = document.QuerySelector(".bottem1");

            // Child 0 is read as the previous-chapter link and child 2 as the next-chapter link.
            var eBookContent = new EBookContentInfo
            {
                Content = contentElement?.TextContent,
                UpUrl = ToAbsoluteUrl(navigation?.Children.ElementAtOrDefault(0)?.GetAttribute("href")),
                DownUrl = ToAbsoluteUrl(navigation?.Children.ElementAtOrDefault(2)?.GetAttribute("href"))
            };

            // Kept from the original implementation: echoes the parsed chapter as JSON to the console.
            Console.WriteLine(Newtonsoft.Json.JsonConvert.SerializeObject(eBookContent));
            return eBookContent;
        }

        /// <summary>
        /// Turns a link taken from a page into an absolute URL on <see cref="HtmlUrl"/>.
        /// </summary>
        /// <param name="href">Attribute value read from the page; may be <c>null</c> when the attribute is absent.</param>
        /// <returns>
        /// <c>null</c> for a missing or blank link, the link unchanged when it already starts with
        /// <c>http://</c> or <c>https://</c>, otherwise <see cref="HtmlUrl"/> followed by the link.
        /// </returns>
        private static string? ToAbsoluteUrl(string? href)
        {
            // A missing attribute must stay "no link" instead of collapsing to the bare site root.
            if (string.IsNullOrWhiteSpace(href))
            {
                return null;
            }

            bool alreadyAbsolute =
                href.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
                href.StartsWith("https://", StringComparison.OrdinalIgnoreCase);

            return alreadyAbsolute ? href : HtmlUrl + href;
        }
    }

 

标签:set,string,get,Children,爬取,var,网页内容,小说,public
From: https://www.cnblogs.com/qindr/p/16841783.html

相关文章