Wechat.Crawler/App/App.csproj
<Project Sdk="Microsoft.NET.Sdk">
  <!-- Build settings: console executable for .NET 8 with implicit usings and nullable reference types enabled. -->
  <PropertyGroup>
    <OutputType>Exe</OutputType>
    <TargetFramework>net8.0</TargetFramework>
    <ImplicitUsings>enable</ImplicitUsings>
    <Nullable>enable</Nullable>
  </PropertyGroup>

  <!-- Sibling project that declares the EF Core context and entities used by Program.cs. -->
  <ItemGroup>
    <ProjectReference Include="..\Blog\Blog.csproj" />
  </ItemGroup>

  <!-- NuGet packages: HTML parsing, configuration/DI, NLog logging and Selenium browser automation. -->
  <ItemGroup>
    <PackageReference Include="AngleSharp" Version="1.2.0-beta.410" />
    <PackageReference Include="HtmlAgilityPack" Version="1.11.60" />
    <PackageReference Include="Microsoft.Extensions.Configuration.Json" Version="8.0.0" />
    <PackageReference Include="Microsoft.Extensions.DependencyInjection" Version="8.0.0" />
    <PackageReference Include="NLog.Extensions.Logging" Version="5.3.8" />
    <PackageReference Include="Selenium.Support" Version="4.18.1" />
    <PackageReference Include="Selenium.WebDriver" Version="4.18.1" />
  </ItemGroup>

  <!-- The NLog configuration file is copied next to the executable on every build. -->
  <ItemGroup>
    <None Update="nlog.config" CopyToOutputDirectory="Always" />
  </ItemGroup>
</Project>
Wechat.Crawler/App/Program.cs
using System.Runtime.InteropServices;
using System.Text.RegularExpressions;
using AngleSharp.Html.Dom;
using AngleSharp.Html.Parser;
using HtmlAgilityPack;
using Microsoft.Extensions.Logging;
using NLog.Config;
using NLog.Extensions.Logging;
using OpenQA.Selenium;
using OpenQA.Selenium.Edge;
using OpenQA.Selenium.Support.UI;
using Wechat.Crawler.Blog;
internal class Program
{
// Root output folder: DownLoad saves article HTML into one sub-folder per Biz below it,
// and ConvertHtmlToTxt scans it recursively for *.html files.
const string WechatFolder = "/Users/song/Code/Wechat.Crawler/Out";
/// <summary>
/// Application entry point; runs the download stage.
/// </summary>
/// <remarks>
/// The remaining stages are disabled. To run one of them, call it here instead:
/// <c>ExtractLinks()</c>, <c>ExtractWechatAccounts()</c>, or
/// <c>ConvertHtmlToTxt()</c> (converts the saved HTML files to .txt files).
/// </remarks>
/// <param name="args">Command-line arguments; not used.</param>
private static async Task Main(string[] args) => await DownLoad();
/// <summary>
/// Writes a .txt file next to every .html file found (recursively) under
/// <c>WechatFolder</c>, containing the text returned by <c>ExtractTxtContent</c>.
/// Files for which none of the three expected elements was found are skipped.
/// </summary>
private static void ConvertHtmlToTxt()
{
    // What ExtractTxtContent returns when all three lookups fell back to "No Content".
    const string nothingExtracted = "No Content\nNo Content\nNo Content";

    string[] htmlFiles = Directory.GetFiles(WechatFolder, "*.html", SearchOption.AllDirectories);
    foreach (string htmlPath in htmlFiles)
    {
        string text = ExtractTxtContent(htmlPath);
        if (text != nothingExtracted)
        {
            // Same directory and base name as the HTML file, with a .txt extension.
            string directory = Path.GetDirectoryName(htmlPath)!;
            string txtPath = Path.Combine(directory, Path.GetFileNameWithoutExtension(htmlPath) + ".txt");

            using StreamWriter writer = new StreamWriter(txtPath);
            writer.WriteLine(text);
        }
    }
}
/// <summary>
/// Loads an HTML file with HtmlAgilityPack and returns the text of the elements
/// with ids <c>activity-name</c>, <c>publish_time</c> and <c>js_content</c>,
/// in that order, separated by line feeds.
/// </summary>
/// <param name="filePath">Path of the HTML file to read.</param>
/// <returns>The three extracted texts joined with "\n".</returns>
private static string ExtractTxtContent(string filePath)
{
    // Nested-tag fixing is switched on before the file is loaded.
    var document = new HtmlDocument { OptionFixNestedTags = true };
    document.Load(filePath);

    string[] sections =
    {
        GetContentById(document, "activity-name"),
        GetContentById(document, "publish_time"),
        GetContentById(document, "js_content"),
    };
    return string.Join("\n", sections);
}
/// <summary>
/// Returns the trimmed inner text of the element with the given id, with spaces
/// removed and line feeds inserted around a fixed set of markers.
/// </summary>
/// <param name="htmlDoc">Parsed document to search.</param>
/// <param name="id">Value of the element's id attribute.</param>
/// <returns>
/// The rewritten text, or "No Content" (after printing a notice to the console)
/// when no element has that id.
/// </returns>
private static string GetContentById(HtmlDocument htmlDoc, string id)
{
    HtmlNode element = htmlDoc.GetElementbyId(id);
    if (element == null)
    {
        Console.WriteLine($"Element with id={id} not found.");
        return "No Content";
    }

    // Substitutions are applied in this order. A rule that would add a line feed
    // after "!" is deliberately left out (it was disabled in the original chain).
    (string Find, string Substitute)[] rewrites =
    {
        (" ", ""),
        ("?", "?\n"),
        ("1.", "\n1."),
        ("1、", "\n1、"),
        ("。", "。\n"),
    };

    string text = element.InnerText.Trim();
    foreach (var (find, substitute) in rewrites)
    {
        text = text.Replace(find, substitute);
    }
    return text;
}
/// <summary>
/// Returns the inner text of every <c>span</c> whose class attribute contains
/// <paramref name="className"/>, one match per line.
/// </summary>
/// <param name="htmlDoc">Parsed document to search.</param>
/// <param name="className">
/// Class name (or fragment) to look for. It is embedded in a single-quoted XPath
/// literal, so it must not contain a single quote.
/// </param>
/// <returns>
/// The matched texts joined with "\n", or "No Content" (after printing a notice
/// to the console) when nothing matches.
/// </returns>
private static string GetContentByClass(HtmlDocument htmlDoc, string className)
{
    var matches = htmlDoc.DocumentNode.SelectNodes($"//span[contains(@class, '{className}')]");

    // SelectNodes yields null rather than an empty collection when nothing matches.
    if (matches == null || matches.Count == 0)
    {
        Console.WriteLine("No elements with the specified class name were found.");
        return "No Content";
    }

    var texts = new List<string>(matches.Count);
    foreach (var node in matches)
    {
        string content = node.InnerText;
        // Each match is still echoed to the console, as before.
        Console.WriteLine(content);
        texts.Add(content);
    }

    // Previously the method fell through to the "not found" path even when
    // elements were found, so the extracted text was never returned.
    return string.Join("\n", texts);
}
/// <summary>
/// Downloads the page of an article that is not yet marked as downloaded, saves it
/// as <c>{Mid}_{title}.html</c> in a per-Biz sub-folder of <c>WechatFolder</c>,
/// and sets the article's IsDownload flag in the database.
/// </summary>
/// <remarks>
/// Pending articles are ordered by Biz. Articles without a Biz or Url are skipped.
/// The loop stops after the first article that was downloaded (see note below).
/// </remarks>
private static async Task DownLoad()
{
    // Disposed on every exit path, including exceptions thrown while downloading.
    using var db = new WechatDbContext();
    db.Database.EnsureCreated();

    var pending = db.Articles
        .Where(a => !a.IsDownload)
        .OrderBy(a => a.Biz)
        .ToList();

    foreach (var article in pending)
    {
        // Both values are nullable on the entity; without them there is no
        // folder name or address to work with.
        if (string.IsNullOrWhiteSpace(article.Biz) || string.IsNullOrWhiteSpace(article.Url))
        {
            Console.WriteLine($"Skipping article Mid={article.Mid}: Biz or Url is missing.");
            continue;
        }

        // The trailing '=' padding of the Biz value is dropped for the folder name.
        var folderPath = Path.Combine(WechatFolder, article.Biz.Trim('='));
        if (!Directory.Exists(folderPath))
        {
            Directory.CreateDirectory(folderPath);
            Console.WriteLine("文件夹已创建: " + folderPath);
        }

        var (fileName, content) = await DownLoadPageAsync(article.Url);
        var filePath = Path.Combine(folderPath, $"{article.Mid}_{fileName}.html");
        await File.WriteAllTextAsync(filePath, content);

        article.IsDownload = true;
        await db.SaveChangesAsync();

        // NOTE(review): only one article is processed per run. This looks like a
        // debugging leftover; remove the break to process the whole backlog - confirm intent.
        break;
    }
}
/// <summary>
/// Opens <paramref name="url"/> in a headless Edge browser, waits three seconds,
/// and returns the page source together with a file name derived from the
/// element with id <c>activity-name</c>.
/// </summary>
/// <param name="url">Address of the page to load.</param>
/// <returns>
/// <c>fileName</c>: the sanitized text of <c>#activity-name</c>, or "默认名称" when
/// the element is missing or nothing usable remains after sanitizing;
/// <c>html</c>: the page source as reported by the browser.
/// </returns>
private static async Task<(string fileName, string html)> DownLoadPageAsync(string url)
{
    var options = new EdgeOptions();
    options.AddArgument("--headless=new");

    IWebDriver driver = new EdgeDriver(options);
    string html;
    try
    {
        driver.Manage().Timeouts().ImplicitWait = TimeSpan.FromMilliseconds(5000);
        driver.Navigate().GoToUrl(url);

        // Fixed pause before the page source is captured.
        await Task.Delay(3000);
        html = driver.PageSource;
    }
    finally
    {
        // Always end the browser session, even when navigation throws;
        // otherwise the Edge process would be left running.
        driver.Quit();
    }

    // Parse the captured markup and look up the element with id "activity-name".
    var document = await new HtmlParser().ParseDocumentAsync(html);
    var title = document.QuerySelector("#activity-name")?.TextContent.Trim();

    var fileName = "默认名称";
    if (!string.IsNullOrEmpty(title))
    {
        var sanitized = ReplaceInvalidFileNameChars(title);
        if (sanitized.Length > 0)
        {
            fileName = sanitized;
        }
    }

    return (fileName, html);
}
/// <summary>
/// Replaces every character that <see cref="Path.GetInvalidFileNameChars"/> reports
/// for the current platform with an underscore, then strips underscores from both
/// ends of the result.
/// </summary>
/// <param name="fileName">Candidate file name.</param>
/// <returns>The sanitized name; may be empty.</returns>
public static string ReplaceInvalidFileNameChars(string fileName)
{
    // Build a character class out of the (escaped) invalid characters.
    var forbidden = Regex.Escape(new string(Path.GetInvalidFileNameChars()));
    var sanitized = Regex.Replace(fileName, $"[{forbidden}]", "_");

    // Leading and trailing underscores are dropped, including ones the input already had.
    return sanitized.Trim('_');
}
/// <summary>
/// Makes sure there is a WechatAccount row for every distinct Biz value found in
/// the Articles table. Existing rows get their Name set to the Biz value; new rows
/// are created with Name = Biz and EndDate = the current local time.
/// </summary>
private static void ExtractWechatAccounts()
{
    // Disposed on every exit path, including exceptions.
    using var db = new WechatDbContext();
    db.Database.EnsureCreated();

    var allBiz = db.Articles.Select(a => a.Biz).Distinct().ToList();
    foreach (var biz in allBiz)
    {
        Console.WriteLine(biz);

        var account = db.WechatAccounts.FirstOrDefault(w => w.Biz == biz);
        if (account is null)
        {
            db.WechatAccounts.Add(new WechatAccount { Biz = biz, Name = biz, EndDate = DateTime.Now });
        }
        else
        {
            account.Name = biz;
        }
    }

    // All inserts and updates are written in a single batch.
    db.SaveChanges();
}
/// <summary>
/// Scans every *.txt file in the links folder for WeChat article URLs and stores
/// each one that is not yet known (by Mid) as an Article row.
/// </summary>
/// <remarks>
/// Only matches between 210 and 220 characters long are considered. A match whose
/// <c>__biz</c> or <c>mid</c> parameter is missing or unusable is logged as an error
/// and skipped; it is no longer inserted, and a non-numeric <c>mid</c> no longer
/// aborts the whole run.
/// </remarks>
private static void ExtractLinks()
{
    const string directoryPath = @"/Users/song/Code/Wechat.Crawler/Links";
    const string linkPattern = @"http:\/\/(mp.weixin.qq.com\/s\?__biz=).+(wechat_redirect)";

    using var loggerFactory = LoggerFactory.Create(builder => builder.AddNLog());
    var logger = loggerFactory.CreateLogger<Program>();

    // Disposed on every exit path, including exceptions.
    using var db = new WechatDbContext();
    db.Database.EnsureCreated();

    foreach (string filePath in Directory.GetFiles(directoryPath, "*.txt"))
    {
        foreach (string line in File.ReadLines(filePath))
        {
            foreach (Match match in Regex.Matches(line, linkPattern))
            {
                var url = match.Value;

                // Length window used to reject matches that are too short or too long.
                if (url.Length < 210 || url.Length > 220)
                {
                    continue;
                }

                if (!TryParseBizAndMid(url, out var biz, out var mid))
                {
                    logger.LogError("该http不是有效的链接");
                    continue;
                }

                if (db.Articles.Any(a => a.Mid == mid))
                {
                    logger.LogWarning("Mid = {Mid} already exists.", mid);
                    continue;
                }

                db.Articles.Add(new Article { Biz = biz, Mid = mid, Url = url });
                db.SaveChanges();
                logger.LogInformation("Mid = {Mid} 添加成功.", mid);
            }
        }
    }
}

// Reads the __biz and mid query parameters of an article URL. Returns false when
// __biz is absent/empty or mid is absent, zero, or not a valid integer.
private static bool TryParseBizAndMid(string url, out string biz, out long mid)
{
    biz = string.Empty;
    mid = 0;

    foreach (Match parameter in Regex.Matches(url, @"(?<key>\w+)=(?<value>[^&]+)"))
    {
        string value = parameter.Groups["value"].Value;
        switch (parameter.Groups["key"].Value)
        {
            case "__biz":
                biz = value;
                break;
            case "mid":
                // TryParse leaves 0 in mid on failure, which is treated as "missing" below.
                long.TryParse(value, out mid);
                break;
        }
    }

    return biz.Length > 0 && mid != 0;
}
}
Wechat.Crawler/Topic/Topic.csproj
<!-- Class library targeting .NET 8; it declares no package or project references. -->
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<TargetFramework>net8.0</TargetFramework>
<!-- Implicit global usings and nullable reference types are both switched on. -->
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
</PropertyGroup>
</Project>
Wechat.Crawler/Topic/Class1.cs
namespace Topic;
/// <summary>
/// Empty placeholder type; it declares no members.
/// </summary>
public class Class1
{
}
Wechat.Crawler/Blog/Blog.csproj
<!-- Class library targeting .NET 8 that references the EF Core SQLite packages. -->
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<TargetFramework>net8.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
</PropertyGroup>
<ItemGroup>
<PackageReference Include="Microsoft.Data.Sqlite.Core" Version="8.0.3" />
<PackageReference Include="Microsoft.EntityFrameworkCore.Sqlite" Version="8.0.3" />
<!-- NOTE(review): the published ID is Microsoft.EntityFrameworkCore.Tools; NuGet IDs are -->
<!-- case-insensitive, so restore should be unaffected, but consider the canonical casing. -->
<PackageReference Include="Microsoft.entityframeworkcore.tools" Version="8.0.3">
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
<!-- PrivateAssets=all: this package is not passed on to projects that reference this one. -->
<PrivateAssets>all</PrivateAssets>
</PackageReference>
</ItemGroup>
</Project>
Wechat.Crawler/Blog/WechatDbContext.cs
using Microsoft.EntityFrameworkCore;
namespace Wechat.Crawler.Blog;
/// <summary>
/// EF Core context for the crawler's SQLite database, exposing the article and
/// WeChat account tables.
/// </summary>
public class WechatDbContext : DbContext
{
    /// <summary>Articles, stored in table "Article".</summary>
    public DbSet<Article> Articles { get; set; }

    /// <summary>WeChat accounts, stored in table "Wechat".</summary>
    public DbSet<WechatAccount> WechatAccounts { get; set; }

    /// <summary>
    /// Falls back to the local SQLite file when no provider has been configured yet.
    /// </summary>
    protected override void OnConfiguring(DbContextOptionsBuilder optionsBuilder)
    {
        if (optionsBuilder.IsConfigured)
        {
            return;
        }

        // To point at a different database, only this connection string needs to change.
        optionsBuilder.UseSqlite(@"Data Source=/Users/song/Code/Wechat.Crawler/Blog/Db/01.db");
    }

    /// <summary>
    /// Maps each entity to its table and declares the unique indexes.
    /// </summary>
    protected override void OnModelCreating(ModelBuilder modelBuilder)
    {
        modelBuilder.Entity<Article>(article =>
        {
            article.ToTable("Article");
            article.HasIndex(a => a.Mid).IsUnique(); // Mid is unique
        });

        modelBuilder.Entity<WechatAccount>(account =>
        {
            account.ToTable("Wechat");
            account.HasIndex(a => a.Biz).IsUnique(); // Biz is unique
        });
    }
}
Wechat.Crawler/Blog/Entities/Article.cs
using System;
using System.Collections.Generic;
using System.Numerics;
using System.Text;
namespace Wechat.Crawler.Blog;
/// <summary>
/// One WeChat article link tracked by the crawler.
/// </summary>
public class Article
{
    /// <summary>
    /// Row identifier. A new instance starts with <see cref="Guid.Empty"/>.
    /// NOTE(review): presumably replaced by EF Core when the row is inserted - confirm.
    /// </summary>
    public Guid Id { get; set; }

    /// <summary>Value of the <c>__biz</c> query parameter of the article URL.</summary>
    public string? Biz { get; set; }

    /// <summary>Value of the <c>mid</c> query parameter; unique across articles.</summary>
    public long? Mid { get; set; }

    /// <summary>Set to true once the article page has been saved to disk.</summary>
    public bool IsDownload { get; set; }

    /// <summary>Defaults to false. NOTE(review): not written anywhere in the visible code - confirm purpose.</summary>
    public bool IsXunFei { get; set; }

    /// <summary>Full article URL as matched in the links files.</summary>
    public string? Url { get; set; }
}
Wechat.Crawler/Blog/Entities/Wechat.cs
namespace Wechat.Crawler.Blog;
/// <summary>
/// One WeChat account, identified by its Biz value.
/// </summary>
public class WechatAccount
{
    /// <summary>
    /// Row identifier. A new instance starts with <see cref="Guid.Empty"/>.
    /// NOTE(review): presumably replaced by EF Core when the row is inserted - confirm.
    /// </summary>
    public Guid Id { get; set; }

    /// <summary>Biz value shared with the account's articles; unique across accounts.</summary>
    public string? Biz { get; set; }

    /// <summary>Display name; the visible code fills it with the Biz value.</summary>
    public string? Name { get; set; }

    /// <summary>Not written anywhere in the visible code. NOTE(review): confirm meaning.</summary>
    public DateTime? StartDate { get; set; }

    /// <summary>Set to the current local time when the row is first created.</summary>
    public DateTime? EndDate { get; set; }
}
标签:string,db,HtmlAgilityPack,Wechat,csharp,var,using,public,selenium
From: https://www.cnblogs.com/zhuoss/p/18137076