首页 > 其他分享 >csharp selenium HtmlAgilityPack 爬虫 网页解析 微信公众号

csharp selenium HtmlAgilityPack 爬虫 网页解析 微信公众号

时间:2024-04-15 22:44:24 · 浏览次数:34
标签:string db HtmlAgilityPack Wechat csharp var using public selenium

Wechat.Crawler/App/App.csproj

<Project Sdk="Microsoft.NET.Sdk">

  <!-- Project reference to the Blog library, which holds the EF Core context and entities. -->
  <ItemGroup>
    <ProjectReference Include="..\Blog\Blog.csproj" />
  </ItemGroup>

 <!-- Copy nlog.config to the output directory on every build. -->
 <ItemGroup>
     <None Update="nlog.config" CopyToOutputDirectory="Always" />
 </ItemGroup>

  <!-- NuGet packages: HTML parsing (AngleSharp, HtmlAgilityPack), configuration/DI, NLog logging, Selenium. -->
  <ItemGroup>
    <PackageReference Include="AngleSharp" Version="1.2.0-beta.410" />
    <PackageReference Include="HtmlAgilityPack" Version="1.11.60" />
    <PackageReference Include="Microsoft.Extensions.Configuration.Json" Version="8.0.0" />
    <PackageReference Include="Microsoft.Extensions.DependencyInjection" Version="8.0.0" />
    <PackageReference Include="NLog.Extensions.Logging" Version="5.3.8" />
    <PackageReference Include="Selenium.Support" Version="4.18.1" />
    <PackageReference Include="Selenium.WebDriver" Version="4.18.1" />
  </ItemGroup>

  <!-- Executable targeting net8.0 with implicit usings and nullable reference types enabled. -->
  <PropertyGroup>
    <OutputType>Exe</OutputType>
    <TargetFramework>net8.0</TargetFramework>
    <ImplicitUsings>enable</ImplicitUsings>
    <Nullable>enable</Nullable>
  </PropertyGroup>

</Project>

Wechat.Crawler/App/Program.cs

using System.Runtime.InteropServices;
using System.Text.RegularExpressions;
using AngleSharp.Html.Dom;
using AngleSharp.Html.Parser;
using HtmlAgilityPack;
using Microsoft.Extensions.Logging;
using NLog.Config;
using NLog.Extensions.Logging;
using OpenQA.Selenium;
using OpenQA.Selenium.Edge;
using OpenQA.Selenium.Support.UI;
using Wechat.Crawler.Blog;





internal class Program
{
    // Root output folder: downloaded article HTML is written below it, and the .txt conversion scans it.
    const string WechatFolder = "/Users/song/Code/Wechat.Crawler/Out";
    /// <summary>
    /// Entry point. Runs the download stage.
    /// </summary>
    /// <remarks>
    /// The remaining stages are not invoked from here at the moment:
    /// <c>ExtractLinks()</c>, <c>ExtractWechatAccounts()</c> and
    /// <c>ConvertHtmlToTxt()</c> (converts the saved HTML files to .txt files).
    /// </remarks>
    private static async Task Main(string[] args) => await DownLoad();

    /// <summary>
    /// Scans <see cref="WechatFolder"/> recursively for <c>*.html</c> files and writes the text
    /// extracted from each one to a <c>.txt</c> file with the same base name in the same folder.
    /// Files for which none of the three expected elements is found are skipped.
    /// </summary>
    private static void ConvertHtmlToTxt()
    {
        // What ExtractTxtContent returns when title, publish time and body are all missing.
        const string NothingExtracted = "No Content\nNo Content\nNo Content";

        string[] htmlFiles = Directory.GetFiles(WechatFolder, "*.html", SearchOption.AllDirectories);
        foreach (var htmlPath in htmlFiles)
        {
            var directory = Path.GetDirectoryName(htmlPath);
            var baseName = Path.GetFileNameWithoutExtension(htmlPath);
            var txtPath = Path.Combine(directory!, baseName + ".txt");

            var text = ExtractTxtContent(htmlPath);
            if (text == NothingExtracted)
            {
                continue;
            }

            // The using declaration closes the writer at the end of each iteration.
            using var writer = new StreamWriter(txtPath);
            writer.WriteLine(text);
        }
    }
    /// <summary>
    /// Loads a saved HTML page with HtmlAgilityPack and returns the text of three elements,
    /// joined by single <c>\n</c> characters.
    /// </summary>
    /// <param name="filePath">Path of the HTML file to load.</param>
    /// <returns>
    /// The text of the elements with ids <c>activity-name</c>, <c>publish_time</c> and
    /// <c>js_content</c>, in that order, each as produced by <c>GetContentById</c>.
    /// </returns>
    private static string ExtractTxtContent(string filePath)
    {
        // OptionFixNestedTags asks the parser to repair badly nested tags while loading.
        var page = new HtmlDocument { OptionFixNestedTags = true };
        page.Load(filePath);

        var elementIds = new[] { "activity-name", "publish_time", "js_content" };
        return string.Join("\n", elementIds.Select(elementId => GetContentById(page, elementId)));
    }

    /// <summary>
    /// Returns the trimmed inner text of the element with the given id, after removing
    /// <c>&amp;nbsp;</c> entities and inserting line breaks around a few markers.
    /// </summary>
    /// <param name="htmlDoc">The parsed document to search.</param>
    /// <param name="id">The id of the element to read.</param>
    /// <returns>
    /// The processed text, or the literal <c>"No Content"</c> (after writing a message to the
    /// console) when no element has that id.
    /// </returns>
    private static string GetContentById(HtmlDocument htmlDoc, string id)
    {
        var element = htmlDoc.GetElementbyId(id);
        if (element is null)
        {
            Console.WriteLine($"Element with id={id} not found.");
            return "No Content";
        }

        // Applied in order: drop &nbsp;, break after "?" and "。", break before "1." and "1、".
        (string Search, string Replacement)[] rewrites =
        {
            ("&nbsp;", ""),
            ("?", "?\n"),
            ("1.", "\n1."),
            ("1、", "\n1、"),
            ("。", "。\n"),
        };

        var text = element.InnerText.Trim();
        foreach (var (search, replacement) in rewrites)
        {
            text = text.Replace(search, replacement);
        }

        return text;
    }
    /// <summary>
    /// Collects the inner text of every <c>&lt;span&gt;</c> whose <c>class</c> attribute contains
    /// <paramref name="className"/>, printing each one to the console.
    /// </summary>
    /// <param name="htmlDoc">The parsed document to search.</param>
    /// <param name="className">Substring to look for inside the <c>class</c> attribute.</param>
    /// <returns>
    /// The texts of all matching spans joined by <c>\n</c>, or the literal <c>"No Content"</c>
    /// when nothing matches.
    /// </returns>
    private static string GetContentByClass(HtmlDocument htmlDoc, string className)
    {
        // NOTE(review): className is spliced into the XPath literal unescaped; a value containing
        // a single quote would produce an invalid expression. Callers should pass plain class names.
        var spans = htmlDoc.DocumentNode.SelectNodes($"//span[contains(@class, '{className}')]");

        // No match: report it and return the sentinel. Previously this branch also ran after a
        // successful match, so the method always reported failure and discarded what it found.
        if (spans is null || spans.Count == 0)
        {
            Console.WriteLine("No elements with the specified class name were found.");
            return "No Content";
        }

        var texts = spans.Select(span => span.InnerText).ToList();
        foreach (var text in texts)
        {
            Console.WriteLine(text);
        }

        return string.Join("\n", texts);
    }

    /// <summary>
    /// Downloads articles that are not yet marked as downloaded and saves each page as
    /// <c>{Mid}_{title}.html</c> in a sub-folder of <see cref="WechatFolder"/> named after the
    /// article's Biz value (with <c>=</c> trimmed from both ends), then marks the article as downloaded.
    /// </summary>
    /// <remarks>
    /// Articles without a Biz or Url are skipped with a console message instead of crashing the run.
    /// The database context is disposed even when a download throws.
    /// </remarks>
    private static async Task DownLoad()
    {
        using var db = new WechatDbContext();
        db.Database.EnsureCreated();

        var pending = db.Articles
            .Where(a => !a.IsDownload)
            .OrderBy(a => a.Biz)
            .ToList();

        foreach (var article in pending)
        {
            if (string.IsNullOrWhiteSpace(article.Biz) || string.IsNullOrWhiteSpace(article.Url))
            {
                Console.WriteLine($"Skipping article Mid = {article.Mid}: Biz or Url is missing.");
                continue;
            }

            var folderPath = Path.Combine(WechatFolder, article.Biz.Trim('='));
            if (!Directory.Exists(folderPath))
            {
                Directory.CreateDirectory(folderPath);
                Console.WriteLine("文件夹已创建: " + folderPath);
            }

            var (fileName, content) = await DownLoadPageAsync(article.Url);
            var filePath = Path.Combine(folderPath, $"{article.Mid}_{fileName}.html");
            await File.WriteAllTextAsync(filePath, content);

            // Persist the flag right away so an interrupted run does not re-download this article.
            article.IsDownload = true;
            await db.SaveChangesAsync();

            // NOTE(review): this break stops after the first successfully downloaded article, so
            // one run handles at most one article. It is kept because it is the existing
            // behaviour; remove it if the intent is to process the whole backlog.
            break;
        }
    }

    /// <summary>
    /// Opens <paramref name="url"/> in a headless Edge browser, waits three seconds, and returns
    /// the page source together with a file name derived from the <c>#activity-name</c> element.
    /// </summary>
    /// <param name="url">Address of the page to load.</param>
    /// <returns>
    /// <c>fileName</c>: the sanitised text of <c>#activity-name</c>, or <c>"默认名称"</c> when the
    /// element is missing or its text sanitises to an empty string.
    /// <c>html</c>: the page source as reported by the browser.
    /// </returns>
    private static async Task<(string fileName, string html)> DownLoadPageAsync(string url)
    {
        var options = new EdgeOptions();
        options.AddArgument("--headless=new");

        IWebDriver driver = new EdgeDriver(options);
        string html;
        try
        {
            driver.Navigate().GoToUrl(url);
            driver.Manage().Timeouts().ImplicitWait = TimeSpan.FromMilliseconds(5000);
            // Fixed pause before reading the page source.
            await Task.Delay(3000);
            html = driver.PageSource;
        }
        finally
        {
            // Always shut the browser down, even when navigation or reading the page fails;
            // otherwise every failed download leaves a browser session running.
            driver.Quit();
        }

        var parser = new HtmlParser();
        var document = await parser.ParseDocumentAsync(html);

        var fileName = "默认名称";
        var title = document.QuerySelector("#activity-name")?.TextContent.Trim();
        if (!string.IsNullOrEmpty(title))
        {
            var sanitized = ReplaceInvalidFileNameChars(title);
            // Keep the default when nothing usable is left after sanitising.
            if (sanitized.Length > 0)
            {
                fileName = sanitized;
            }
        }

        return (fileName, html);
    }
    /// <summary>
    /// Replaces every character that <see cref="Path.GetInvalidFileNameChars"/> reports as invalid
    /// on the current platform with an underscore, then strips underscores from both ends.
    /// </summary>
    /// <param name="fileName">The candidate file name.</param>
    /// <returns>The sanitised name; may be empty.</returns>
    public static string ReplaceInvalidFileNameChars(string fileName)
    {
        char[] forbidden = Path.GetInvalidFileNameChars();

        // Map character by character: forbidden ones become '_', everything else is kept.
        var mapped = fileName
            .Select(c => Array.IndexOf(forbidden, c) >= 0 ? '_' : c)
            .ToArray();

        return new string(mapped).Trim('_');
    }

    /// <summary>
    /// Makes sure there is a <c>WechatAccount</c> row for every distinct Biz value found in the
    /// Articles table, printing each Biz to the console along the way.
    /// </summary>
    /// <remarks>
    /// For an existing account the Name is overwritten with the Biz value; for a new one Biz and
    /// Name are both set to the Biz value and EndDate to the current local time. All changes are
    /// saved in a single call at the end, and the context is disposed even if that call throws.
    /// </remarks>
    private static void ExtractWechatAccounts()
    {
        using var db = new WechatDbContext();
        db.Database.EnsureCreated();

        var allBiz = db.Articles.Select(a => a.Biz).Distinct().ToList();
        foreach (var biz in allBiz)
        {
            Console.WriteLine(biz);

            var account = db.WechatAccounts.FirstOrDefault(candidate => candidate.Biz == biz);
            if (account is not null)
            {
                account.Name = biz;
            }
            else
            {
                db.WechatAccounts.Add(new WechatAccount { Biz = biz, Name = biz, EndDate = DateTime.Now });
            }
        }

        db.SaveChanges();
    }

    /// <summary>
    /// Reads every <c>*.txt</c> file in the links folder line by line, extracts WeChat article
    /// URLs, and stores one <c>Article</c> row per Mid not yet in the database.
    /// </summary>
    /// <remarks>
    /// A match is only considered when its length is between 210 and 220 characters. The
    /// <c>__biz</c> and <c>mid</c> query parameters are read from the URL; a link without a Biz or
    /// without a numeric, non-zero Mid is logged as an error and skipped. Each new article is
    /// saved immediately. The context and the logger factory are disposed when the method exits,
    /// including on exceptions.
    /// </remarks>
    private static void ExtractLinks()
    {
        using var loggerFactory = LoggerFactory.Create(builder => builder.AddNLog());
        var logger = loggerFactory.CreateLogger<Program>();
        using var db = new WechatDbContext();
        db.Database.EnsureCreated();

        const string directoryPath = @"/Users/song/Code/Wechat.Crawler/Links";

        // Built once instead of once per file / per match.
        var linkRegex = new Regex(@"http:\/\/(mp.weixin.qq.com\/s\?__biz=).+(wechat_redirect)");
        var paramRegex = new Regex(@"(?<key>\w+)=(?<value>[^&]+)");

        foreach (var filePath in Directory.GetFiles(directoryPath, "*.txt"))
        {
            foreach (var line in File.ReadLines(filePath))
            {
                foreach (Match match in linkRegex.Matches(line))
                {
                    var url = match.Value;

                    // The link pattern is greedy, so the length window filters out over-long matches.
                    if (url.Length < 210 || url.Length > 220)
                    {
                        continue;
                    }

                    var biz = string.Empty;
                    long mid = 0;
                    foreach (Match parameter in paramRegex.Matches(url))
                    {
                        var value = parameter.Groups["value"].Value;
                        switch (parameter.Groups["key"].Value)
                        {
                            case "__biz":
                                biz = value;
                                break;
                            case "mid":
                                // A non-numeric mid leaves 0, which is rejected below, instead of throwing.
                                if (!long.TryParse(
                                        value,
                                        System.Globalization.NumberStyles.Integer,
                                        System.Globalization.CultureInfo.InvariantCulture,
                                        out mid))
                                {
                                    mid = 0;
                                }
                                break;
                        }
                    }

                    if (biz == string.Empty || mid == 0)
                    {
                        logger.LogError("该http不是有效的链接");
                        // Skip the link; without this an article with empty Biz / Mid 0 was stored.
                        continue;
                    }

                    var existingArticle = db.Articles.FirstOrDefault(a => a.Mid == mid);
                    if (existingArticle is not null)
                    {
                        logger.LogWarning($"Mid = {mid} already exists.");
                        continue;
                    }

                    db.Articles.Add(new Article
                    {
                        Biz = biz,
                        Mid = mid,
                        Url = url
                    });
                    db.SaveChanges();
                    logger.LogInformation($"Mid = {mid} 添加成功.");
                }
            }
        }
    }
}

















Wechat.Crawler/Topic/Topic.csproj

<Project Sdk="Microsoft.NET.Sdk">

  <!-- Project targeting net8.0 with implicit usings and nullable reference types enabled; no OutputType is set and it has no package or project references. -->
  <PropertyGroup>
    <TargetFramework>net8.0</TargetFramework>
    <ImplicitUsings>enable</ImplicitUsings>
    <Nullable>enable</Nullable>
  </PropertyGroup>

</Project>

Wechat.Crawler/Topic/Class1.cs

namespace Topic;

/// <summary>
/// Empty placeholder type; it declares no members.
/// </summary>
public class Class1
{
}

Wechat.Crawler/Blog/Blog.csproj

<Project Sdk="Microsoft.NET.Sdk">

  <!-- Project targeting net8.0 with implicit usings and nullable reference types enabled; no OutputType is set. -->
  <PropertyGroup>
    <TargetFramework>net8.0</TargetFramework>
    <ImplicitUsings>enable</ImplicitUsings>
    <Nullable>enable</Nullable>
  </PropertyGroup>

  <!-- EF Core with the SQLite provider. The tools package is marked PrivateAssets=all, so it does not flow to projects that reference this one. -->
  <ItemGroup>
    <PackageReference Include="Microsoft.Data.Sqlite.Core" Version="8.0.3" />
    <PackageReference Include="Microsoft.EntityFrameworkCore.Sqlite" Version="8.0.3" />
    <PackageReference Include="Microsoft.entityframeworkcore.tools" Version="8.0.3">
      <IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
      <PrivateAssets>all</PrivateAssets>
    </PackageReference>
  </ItemGroup>

</Project>

Wechat.Crawler/Blog/WechatDbContext.cs

using Microsoft.EntityFrameworkCore;


namespace Wechat.Crawler.Blog;


/// <summary>
/// EF Core context for the crawler's SQLite database. It exposes the articles and the
/// WeChat accounts and maps them to the <c>Article</c> and <c>Wechat</c> tables.
/// </summary>
public class WechatDbContext : DbContext
{
    /// <summary>Articles, stored in the <c>Article</c> table.</summary>
    public DbSet<Article> Articles { get; set; }

    /// <summary>WeChat accounts, stored in the <c>Wechat</c> table.</summary>
    public DbSet<WechatAccount> WechatAccounts { get; set; }

    /// <summary>
    /// Falls back to the hard-coded SQLite file when no provider has been configured yet.
    /// </summary>
    protected override void OnConfiguring(DbContextOptionsBuilder optionsBuilder)
    {
        if (optionsBuilder.IsConfigured)
        {
            return;
        }

        // To use a different database, only this connection string has to change.
        optionsBuilder.UseSqlite(@"Data Source=/Users/song/Code/Wechat.Crawler/Blog/Db/01.db");
    }

    /// <summary>
    /// Sets the table names and declares the unique indexes on <c>Article.Mid</c> and
    /// <c>WechatAccount.Biz</c>.
    /// </summary>
    protected override void OnModelCreating(ModelBuilder modelBuilder)
    {
        modelBuilder.Entity<Article>(article =>
        {
            article.ToTable("Article");
            article.HasIndex(a => a.Mid).IsUnique();
        });

        modelBuilder.Entity<WechatAccount>(account =>
        {
            account.ToTable("Wechat");
            account.HasIndex(a => a.Biz).IsUnique();
        });
    }
}






Wechat.Crawler/Blog/Entities/Article.cs

using System;
using System.Collections.Generic;
using System.Numerics;
using System.Text;

namespace Wechat.Crawler.Blog;

/// <summary>
/// A single article link together with its processing flags.
/// </summary>
public class Article
{
    /// <summary>
    /// Identifier of the row. Initialised with <see cref="Guid.NewGuid"/> so every new instance
    /// gets its own value; <c>new Guid()</c> would yield <see cref="Guid.Empty"/> for all of them.
    /// </summary>
    public Guid Id { get; set; } = Guid.NewGuid();

    /// <summary>Value of the <c>__biz</c> query parameter of the article URL.</summary>
    public string? Biz { get; set; }

    /// <summary>Value of the <c>mid</c> query parameter of the article URL.</summary>
    public long? Mid { get; set; }

    /// <summary>Whether the article page has been downloaded. Defaults to <c>false</c>.</summary>
    public bool IsDownload { get; set; } = false;

    /// <summary>
    /// Defaults to <c>false</c>.
    /// NOTE(review): presumably marks processing by an iFlytek (XunFei) step — confirm with the owner.
    /// </summary>
    public bool IsXunFei { get; set; } = false;

    /// <summary>The article URL.</summary>
    public string? Url { get; set; }
}




Wechat.Crawler/Blog/Entities/Wechat.cs




namespace Wechat.Crawler.Blog;

/// <summary>
/// A WeChat official account, identified by its Biz value.
/// </summary>
public class WechatAccount
{
    /// <summary>
    /// Identifier of the row. Initialised with <see cref="Guid.NewGuid"/> so every new instance
    /// gets its own value; <c>new Guid()</c> would yield <see cref="Guid.Empty"/> for all of them.
    /// </summary>
    public Guid Id { get; set; } = Guid.NewGuid();

    /// <summary>The account's Biz value.</summary>
    public string? Biz { get; set; }

    /// <summary>Display name of the account.</summary>
    public string? Name { get; set; }

    /// <summary>Optional start date. NOTE(review): meaning of the date range is not documented — confirm.</summary>
    public DateTime? StartDate { get; set; }

    /// <summary>Optional end date. NOTE(review): meaning of the date range is not documented — confirm.</summary>
    public DateTime? EndDate { get; set; }
}




标签:string,db,HtmlAgilityPack,Wechat,csharp,var,using,public,selenium
From: https://www.cnblogs.com/zhuoss/p/18137076

相关文章

  • selenium之窗口切换
    selenium之窗口切换1、弹窗1.1Alterdriver.find_element(By.ID,"alert").click()#窗口切换并获取textprint(driver.switch_to.alert.text)#窗口切换并点击确定driver.switch_to.alert.accept()driver.find_element(By.ID,"alert").click()#窗口切换并点击取消d......
  • SeleniumBase 制作WEB用户使用导览,并导出 JS-使用笔记(三)
    自动化福音(爬虫、办公、测试等)SeleniumBase使用笔记(三)SeleniumBase制作WEB用户使用导览,并导出JSSeleniumBase包含强大的JS代码生成器,用于将Python转换为JavaScript,而制作用户导览,就是其中的应用之一,用户导览能将SaaS产品采用率提高10倍或更多目录创建导览......
  • selenium之定位方式和元素操作
    selenium之定位方式和元素操作八大定位方式导包:fromselenium.webdriver.common.byimportBy1、标签id属性的定位ele=driver.find_element(by=By.ID,value="search-input")2、标签name属性的定位ele=driver.find_element(by="name",value="search-input")3......
  • Csharp线程
    CSharpe线程 目录CSharpe线程C#如何操作线程Thread1.Thread如何开启一个线程呢?2.Thread中常见的API3.thread的扩展封装threadpool一、.NETFramework2.0时代:出现了一个线程池ThreadPool二、线程池如何申请一个线程呢?三、线程等待四、线程池如何控制线......
  • Csharp中表达式树
    Csharper中的表达式树这节课来了解一下表示式树是什么?在C#中,表达式树是一种数据结构,它可以表示一些代码块,如Lambda表达式或查询表达式。表达式树使你能够查看和操作数据,就像你可以查看和操作代码一样。它们通常用于创建动态查询和解析表达式。一、认识表达式树为什么要这样说......
  • selenium之鼠标键盘操作
    selenium之鼠标键盘操作一、鼠标1、导包:fromselenium.webdriver.common.action_chainsimportActionChains2、常规操作2.1左键单击search_ele=WebDriverWait(driver,15,0.5).until(EC.visibility_of_element_located(("id","search-input")))search_ele.send_k......
  • Selenium 笔记
    相关资料Selenium官网Selenium文档SeleniumPython接口文档如果要查看其他语言的Selenium接口文档,见下载SeleniumW3CWebDriver规范Web驱动器可以访问Selenium官方Web驱动器生态查看各主流浏览器的Web驱动器下载Chrome也包含了ChromeDriver文档115以后版本115以......
  • 使用docker部署基于selenium和chrome-headless的爬虫
    使用docker部署基于selenium和chrome-headless的爬虫无论是测试还是爬虫的一些工作,有时候都会用到selenium去对chrome执行自动化操作,这里介绍一下如何使用docker快捷方便的部署相关应用。1.selenium+chrome镜像通过dockersearchselenium我们发现,有一个docker镜像叫做sele......
  • selenium-浏览器复用-Invalid Status code=403 text=Forbidden
    问题:selenium-java版本为4.1.4、4.8.2+Java8运行时报InvalidStatuscode=403text=Forbidden 运行代码:publicclassRemoteTest{publicChromeOptionsoptions;publicWebDriverdriver;@TestpublicvoidremoteTest(){options=newC......
  • CSharp: ImageToText using Microsoft.SemanticKernel
     usingMicrosoft.SemanticKernel.ImageToText;usingMicrosoft.SemanticKernel;usingMicrosoft.SemanticKernel.Connectors.OpenAI;usingMicrosoft.SemanticKernel.Connectors.HuggingFace;//usingMicrosoft.SemanticKernel.Orchestration;usingMicrosoft.Semanti......