<!-- Sample report fragment: inside m-repbody, each g-tt-h3 heading div is immediately followed by the table it labels. The path comments give each element's position assuming the fragment sits directly under /html/body. --> <div class="m-repbox"><!--/html/body/div--> <div class="m-repbody firstPage"><!--/html/body/div/div--> <div class="t1">基本信息</div> <div class="g-tt-h3 f-tleft f-mgtop">基本概况信息</div><!--/html/body/div/div[1]/div[2]--> <table class="g-tab-bor f-tab-nomargin"> <tr> <th class="g-w-4">经济类型</th> <td class="g-w-4 ">股份有限(公司)</td> <th class="g-w-4">组织机构类型</th> <td class="g-w-4 ">企业</td> </tr> <tr> <th>企业规模</th> <td class="">微型企业</td> <th>所属行业</th> <td class="">建材批发</td> </tr> </table> <div class="g-tt-h3 f-tleft f-mgtop">实际控制人</div><!--/html/body/div/div[1]/div[3]--> <table class="g-tab-bor f-tab-nomargin"> <tr> <th class="g-w-4">名称</th> <th class="g-w-4">身份标识类型</th> <th class="g-w-4">身份标识号码</th> <th class="g-w-4">更新日期</th> </tr> <tbody class=""> <tr> <td>控制人</td> <td class="g-w-4">身份证</td> <td class="g-w-4">*******************</td> <td class="g-w-4">2017-03-01</td> </tr> </tbody> <tbody class=""> <tr> <td>控制人二二二二二</td> <td class="g-w-4">组织机构代码</td> <td class="g-w-4">***********</td> <td class="g-w-4">2017-03-01</td> </tr> </tbody> </table> </div> </div>
// Requires the HtmlAgilityPack NuGet package.

// Parsed document that the helper methods below query; assigned by LoadHtml.
HtmlDocument htmlDoc;

/// <summary>
/// Load the html page source.
/// </summary>
/// <param name="htmlSource">Raw HTML text to parse.</param>
public void LoadHtml(string htmlSource)
{
    htmlDoc = new HtmlDocument();
    htmlDoc.LoadHtml(htmlSource);
}

/// <summary>
/// Finds the first node selected by <paramref name="xPath"/> whose inner text
/// matches <paramref name="keyword"/>.
/// </summary>
/// <param name="xPath">XPath expression selecting the candidate nodes.</param>
/// <param name="keyword">
/// Regular-expression pattern tested against each node's InnerText
/// (it is passed to Regex.IsMatch, so regex metacharacters are significant).
/// </param>
/// <returns>
/// The 1-based position of the first match within the selected node list,
/// or <c>int.MinValue</c> when nothing is selected or nothing matches.
/// </returns>
public int GetNodeIndexByKeyword(string xPath, string keyword)
{
    var nodes = htmlDoc.DocumentNode.SelectNodes(xPath);
    if (nodes == null)
    {
        // SelectNodes returns null rather than an empty list when nothing matches.
        return int.MinValue;
    }

    for (var i = 0; i < nodes.Count; i++)
    {
        if (Regex.IsMatch(nodes[i].InnerText, keyword))
        {
            // XPath positions are 1-based.
            return i + 1;
        }
    }

    return int.MinValue;
}

/// <summary>
/// Starting from <c>divPath[divIndex]</c>, walks to the first following-sibling
/// table and then back to that table's nearest preceding-sibling div, and
/// reports the positional index of that div as written in its XPath.
/// </summary>
/// <param name="divPath">XPath of the candidate div elements, without a trailing index.</param>
/// <param name="divIndex">1-based index appended to <paramref name="divPath"/>.</param>
/// <returns>
/// The bracketed index from the last segment of the resulting node's XPath,
/// or <c>int.MinValue</c> when no node is selected or no index can be parsed.
/// If several nodes are selected, the last one wins.
/// </returns>
public int GetNodeIndex(string divPath, int divIndex)
{
    var index = int.MinValue;
    var tableXPath = string.Format("{0}[{1}]/following-sibling::table[1]/preceding-sibling::div[1]", divPath, divIndex);
    // e.g. "/html/body/div/div[4]/div[2]/following-sibling::table[1]/preceding-sibling::div[1]"
    var nodes = htmlDoc.DocumentNode.SelectNodes(tableXPath);
    if (nodes == null)
    {
        return index;
    }

    // Captures whatever sits between '[' and ']' in an XPath segment such as "div[2]".
    // Built once instead of once per node.
    var rgx = new Regex(@"(?i)(?<=\[)(.*)(?=\])");

    foreach (var node in nodes)
    {
        var lastSegment = node.XPath.Substring(node.XPath.LastIndexOf('/') + 1);
        var bracketText = rgx.Match(lastSegment).Value;

        // Only accept a successfully parsed index; a failed parse must not
        // replace the "not found" sentinel with 0.
        if (int.TryParse(bracketText, out int parsed))
        {
            index = parsed;
        }
    }

    return index;
}

// ---- Usage (LoadHtml must have been called first) ----

// Locate the block under /html/body/div whose text contains "基本信息".
var xPath = "/html/body/div/div";
var keyword = "基本信息";
var divIndex = GetNodeIndexByKeyword(xPath, keyword);

// Within that block, locate the heading div for "基本概况信息".
// If divIndex is int.MinValue the XPath below selects nothing and the
// sentinel simply propagates through the remaining calls.
xPath = string.Format("/html/body/div/div[{0}]/div", divIndex); // e.g. "/html/body/div/div[4]/div"
keyword = "基本概况信息";
var divIndex2 = GetNodeIndexByKeyword(xPath, keyword); // e.g. 2

// The heading labels the table that follows it only if the table's nearest
// preceding div is that same heading, so compare indices in the same sibling list.
var precedingSiblingIndex = GetNodeIndex(xPath, divIndex2);
var eq = divIndex2 == precedingSiblingIndex;
标签:xpath,htmlDoc,string,int,HtmlAgilityPack,用法,var,nodes,div From: https://www.cnblogs.com/hofmann/p/16643211.html