HtmlAgilityPack 网页数据抓取基础应用

发布时间: 2023-08-03 14:43:07 作者: 望着天的蜗牛
 1             var doc = new HtmlAgilityPack.HtmlDocument();
 2             string html="";
 3             doc.LoadHtml(html);
 4             Func<HtmlAgilityPack.HtmlNodeCollection, string, Dictionary<string, string>, bool, List<HtmlAgilityPack.HtmlNode>> getNodeByTagNameAndAttr = null;
 5             Func<HtmlAgilityPack.HtmlNode, string, Dictionary<string, string>, bool> fun_Match = (c, tagName, Attr) =>
 6             {
 7                 return c.Name == tagName && !Attr.Select(cc => cc.Key).Except(c.Attributes.Select(cc => cc.Name)).Any() && c.Attributes.Join(Attr, a => new { a.Name, a.Value }, b => new { Name = b.Key, b.Value }, (a, b) => 1).Count() == Attr.Count;
 8             };
 9             getNodeByTagNameAndAttr = (nodes, tagName, Attr, all) =>
10             {
11                 List<HtmlAgilityPack.HtmlNode> li = new List<HtmlAgilityPack.HtmlNode>();
12 
13                 foreach (var c in nodes)
14                 {
15                     if (fun_Match(c, tagName, Attr))
16                     {
17                         li.Add(c);
18                     }
19                     if (!all && li.Count > 0)
20                     {
21                         break;
22                     }
23                     if (c.HasChildNodes)
24                     {
25                         var nt = getNodeByTagNameAndAttr(c.ChildNodes, tagName, Attr, all);
26                         if (nt != null)
27                         {
28                             li.AddRange(nt);
29                         }
30                     }
31                     if (!all && li.Count > 0)
32                     {
33                         break;
34                     }
35 
36                 }
37                 return li;
38 
39             };
40 
41 //读取doc.DocumentNode.ChildNodes 元素下的a标签(参数可以包含属性,也可以指定是否获取所有元素
42             List<HtmlAgilityPack.HtmlNode> node = getNodeByTagNameAndAttr(doc.DocumentNode.ChildNodes, "a", new Dictionary<string, string> { }, true);