安装
Html Agility Pack(HAP)是一个用C#编写的开源HTML解析库,支持通过XPath查询节点。
官网:https://html-agility-pack.net/
使用NuGet安装:在程序包管理器控制台中执行 Install-Package HtmlAgilityPack,或在命令行中执行 dotnet add package HtmlAgilityPack。
HtmlDocument.Load加载文件
using System;
using HtmlAgilityPack;
/// <summary>
/// Demonstrates HtmlDocument.Load: writes a sample page to disk, loads it back
/// from that file, and prints the outer HTML of its body element.
/// </summary>
public class Program
{
    // Single source of truth for the file that SaveHtmlFile writes and Main reads,
    // so the two can never drift apart.
    private const string HtmlFilePath = @"test.html";

    /// <summary>
    /// Creates the sample file, loads it with HtmlDocument.Load, and prints the body element.
    /// </summary>
    public static void Main()
    {
        SaveHtmlFile();

        var doc = new HtmlDocument();
        doc.Load(HtmlFilePath);

        // SelectSingleNode yields null when the XPath matches nothing; guard before dereferencing.
        var node = doc.DocumentNode.SelectSingleNode("//body");
        if (node == null)
        {
            Console.Error.WriteLine("No body element found in " + HtmlFilePath);
            return;
        }

        Console.WriteLine(node.OuterHtml);
    }

    /// <summary>
    /// Parses a fixed HTML string and saves it to <see cref="HtmlFilePath"/>.
    /// </summary>
    private static void SaveHtmlFile()
    {
        // The verbatim literal is intentionally left unindented so its content is unchanged.
        var html =
@"<!DOCTYPE html>
<html>
<body>
<h1>This is <b>bold</b> heading</h1>
<p>This is <u>underlined</u> paragraph</p>
<h2>This is <i>italic</i> heading</h2>
</body>
</html> ";

        var htmlDoc = new HtmlDocument();
        htmlDoc.LoadHtml(html);
        htmlDoc.Save(HtmlFilePath);
    }
}
HtmlDocument.LoadHtml加载字符串
using System;
using HtmlAgilityPack;
/// <summary>
/// Demonstrates HtmlDocument.LoadHtml: parses markup held in a string and
/// prints the outer HTML of its body element.
/// </summary>
public class Program
{
    // Sample markup fed to LoadHtml; the verbatim literal is left unindented
    // so that its content stays exactly as written.
    private const string SampleHtml = @"<!DOCTYPE html>
<html>
<body>
<h1>This is <b>bold</b> heading</h1>
<p>This is <u>underlined</u> paragraph</p>
<h2>This is <i>italic</i> heading</h2>
</body>
</html> ";

    /// <summary>
    /// Loads <see cref="SampleHtml"/> into a document and writes the body element to the console.
    /// </summary>
    public static void Main()
    {
        var document = new HtmlDocument();
        document.LoadHtml(SampleHtml);

        var bodyNode = document.DocumentNode.SelectSingleNode("//body");
        var bodyMarkup = bodyNode.OuterHtml;
        Console.WriteLine(bodyMarkup);
    }
}
HtmlWeb.Load通过URL加载HTML
using HtmlAgilityPack;
using System;
namespace ConsoleApp1
{
    /// <summary>
    /// Demonstrates HtmlWeb.Load: downloads a page by URL and prints its title element.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Fetches the page and writes the outer HTML of the //head/title node to the console.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            // This is the address to fetch, not markup, so it is named accordingly.
            var url = @"https://www.baidu.com/";
            var web = new HtmlWeb();
            var doc = web.Load(url);

            // A remote page is outside our control; SelectSingleNode yields null
            // when the XPath matches nothing, so guard before dereferencing.
            var node = doc.DocumentNode.SelectSingleNode("//head/title");
            if (node == null)
            {
                Console.Error.WriteLine("No title element found at " + url);
                return;
            }

            Console.WriteLine(node.OuterHtml);
        }
    }
}
SelectNodes()选择多个节点
// @nuget: HtmlAgilityPack
using System;
using System.Linq;
using HtmlAgilityPack;
/// <summary>
/// Demonstrates HtmlNode.SelectNodes: selects every input under a td and prints
/// the value attribute of the first one.
/// </summary>
public class Program
{
    /// <summary>
    /// Parses a fixed HTML fragment, queries it with //td/input, and prints the first match's value.
    /// </summary>
    public static void Main()
    {
        // The verbatim literal is intentionally left unindented so its content is unchanged.
        var html =
@"<TD class=texte width=""50%"">
<DIV align=right>Name :<B> </B></DIV>
</TD>
<TD width=""50%"">
<INPUT class=box value=John maxLength=16 size=16 name=user_name>
<INPUT class=box value=Tony maxLength=16 size=16 name=user_name>
<INPUT class=box value=Jams maxLength=16 size=16 name=user_name>
</TD>
<TR vAlign=center>";

        var htmlDoc = new HtmlDocument();
        htmlDoc.LoadHtml(html);

        // SelectNodes returns null (not an empty collection) when nothing matches,
        // so calling First() directly on the result would throw.
        var inputNodes = htmlDoc.DocumentNode.SelectNodes("//td/input");
        if (inputNodes == null)
        {
            Console.Error.WriteLine("No nodes matched //td/input");
            return;
        }

        string name = inputNodes
            .First()
            .Attributes["value"].Value;
        Console.WriteLine(name);
    }
}
SelectSingleNode(String)选择第一个节点
// @nuget: HtmlAgilityPack
using System;
using HtmlAgilityPack;
/// <summary>
/// Demonstrates HtmlNode.SelectSingleNode: picks one input under a td and
/// prints its value attribute.
/// </summary>
public class Program
{
    /// <summary>
    /// Parses a fixed HTML fragment, queries it with //td/input, and prints the selected node's value.
    /// </summary>
    public static void Main()
    {
        // The verbatim literal is left unindented so that its content stays exactly as written.
        var markup =
@"<TD class=texte width=""50%"">
<DIV align=right>Name :<B> </B></DIV>
</TD>
<TD width=""50%"">
<INPUT class=box value=第一 maxLength=16 size=16 name=user_name>
<INPUT class=box value=第二 maxLength=16 size=16 name=user_name>
<INPUT class=box value=第三 maxLength=16 size=16 name=user_name>
</TD>
<TR vAlign=center>";

        var document = new HtmlDocument();
        document.LoadHtml(markup);

        // Walk the chain step by step: node, then its attribute, then the attribute's text.
        var inputNode = document.DocumentNode.SelectSingleNode("//td/input");
        var valueAttribute = inputNode.Attributes["value"];
        string name = valueAttribute.Value;

        Console.WriteLine(name);
    }
}
获取节点属性(InnerHtml、OuterHtml、InnerText、ParentNode)
// @nuget: HtmlAgilityPack
using System;
using System.Xml;
using HtmlAgilityPack;
/// <summary>
/// Demonstrates common HtmlNode properties by printing InnerHtml, OuterHtml,
/// InnerText and the parent node's name for every h1 under body.
/// </summary>
public class Program
{
    /// <summary>
    /// Parses a fixed HTML fragment, selects //body/h1, and prints the properties of each match.
    /// </summary>
    public static void Main()
    {
        // The verbatim literal is intentionally left unindented so its content is unchanged.
        var html =
@"<body>
<h1>This is <b>bold</b> heading</h1>
<p>This is <u>underlined</u> paragraph</p>
<h1>This is <i>italic</i> heading</h1>
<p>This is <u>underlined</u> paragraph</p>
</body>";

        var htmlDoc = new HtmlDocument();
        htmlDoc.LoadHtml(html);

        // SelectNodes returns null (not an empty collection) when nothing matches,
        // which would make the foreach below throw.
        var htmlNodes = htmlDoc.DocumentNode.SelectNodes("//body/h1");
        if (htmlNodes == null)
        {
            Console.Error.WriteLine("No nodes matched //body/h1");
            return;
        }

        foreach (var node in htmlNodes)
        {
            Console.WriteLine("InnerHtml:" + node.InnerHtml);
            Console.WriteLine("OuterHtml:" + node.OuterHtml);
            Console.WriteLine("InnerText:" + node.InnerText);
            // Colon added so this label is formatted like the three above.
            Console.WriteLine("ParentNode:" + node.ParentNode.Name);
            Console.WriteLine("===========");
        }
    }
}
参考
https://html-agility-pack.net/parser