// Spludlow Software
// Copyright © Samuel P. Ludlow 2020 All Rights Reserved
// Distributed under the terms of the GNU General Public License version 3
// Distributed WITHOUT ANY WARRANTY; without implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE
// https://www.spludlow.co.uk/LICENCE.TXT
// The Spludlow logo is a registered trademark of Samuel P. Ludlow and may not be used without permission
// v1.14

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace Spludlow.Html
{
    /// <summary>
    /// Some simple helpers for HtmlAgilityPack
    /// </summary>
    public class Parse
    {
        /// <summary>
        /// Selects which part of a matching node is returned by <see cref="Extract(string, ExtractType, string)"/>.
        /// </summary>
        public enum ExtractType
        {
            InnerText,
            InnerHtml,
            OuterHtml,
            AttributeValue
        }

        /// <summary>
        /// Returns the raw "href" value of every "a" element, in document order.
        /// Values are not resolved, de-duplicated or sorted.
        /// </summary>
        /// <param name="html">HTML markup to parse.</param>
        public static string[] ExtractLinks(string html)
        {
            return ExtractLinks(html, null);
        }

        /// <summary>
        /// Returns the inner text of every "title" element joined with ", ".
        /// </summary>
        /// <param name="html">HTML markup to parse.</param>
        /// <returns>The joined titles, or an empty string when there are none.</returns>
        public static string ExtractTitle(string html)
        {
            List<string> titles = Extract(html, ExtractType.InnerText, "title");

            StringBuilder text = new StringBuilder();

            foreach (string title in titles)
            {
                // The separator is only written once something has been appended,
                // so leading empty titles do not produce a leading ", ".
                if (text.Length > 0)
                    text.Append(", ");

                text.Append(title);
            }

            return text.ToString();
        }

        /// <summary>
        /// Returns the "href" value of every "a" element.
        /// </summary>
        /// <param name="html">HTML markup to parse.</param>
        /// <param name="url">
        /// Address of the page the markup came from. When not null the links are passed
        /// through <see cref="FixLinks"/> (made absolute, de-duplicated and sorted);
        /// when null the raw values are returned in document order.
        /// </param>
        public static string[] ExtractLinks(string html, string url)
        {
            return ExtractUrlAttribute(html, "a", "href", url);
        }

        /// <summary>
        /// Returns the "src" value of every "img" element.
        /// </summary>
        /// <param name="html">HTML markup to parse.</param>
        /// <param name="url">
        /// Address of the page the markup came from. When not null the links are passed
        /// through <see cref="FixLinks"/> (made absolute, de-duplicated and sorted);
        /// when null the raw values are returned in document order.
        /// </param>
        public static string[] ExtractImages(string html, string url)
        {
            return ExtractUrlAttribute(html, "img", "src", url);
        }

        /// <summary>
        /// Shared implementation of <see cref="ExtractLinks(string, string)"/> and <see cref="ExtractImages"/>.
        /// </summary>
        private static string[] ExtractUrlAttribute(string html, string nodeName, string extractAttributeName, string url)
        {
            List<string> list = Extract(html, ExtractType.AttributeValue, nodeName, null, null, extractAttributeName, false);

            if (url != null)
                list = FixLinks(list, url);

            return list.ToArray();
        }

        /// <summary>
        /// Extracts text from every element called <paramref name="nodeName"/>, with no attribute filter.
        /// </summary>
        /// <param name="html">HTML markup to parse.</param>
        /// <param name="extractType">Which part of each matching node to return.</param>
        /// <param name="nodeName">Element name to match (case-insensitive).</param>
        public static List<string> Extract(string html, ExtractType extractType, string nodeName)
        {
            return Extract(html, extractType, nodeName, null, null, null, false);
        }

        /// <summary>
        /// Walks the whole document and extracts text from every element that matches the filter.
        /// Each result is trimmed; results are returned in document order.
        /// </summary>
        /// <param name="html">HTML markup to parse.</param>
        /// <param name="extractType">Which part of each matching node to return.</param>
        /// <param name="nodeName">Element name to match (case-insensitive, surrounding white space ignored).</param>
        /// <param name="attributeName">
        /// When not null the element must carry this attribute. When null no attribute filtering
        /// is done and <paramref name="attributeValue"/> is ignored.
        /// </param>
        /// <param name="attributeValue">
        /// When not null (and <paramref name="attributeName"/> is not null) the trimmed attribute value
        /// must match this text, case-insensitively.
        /// </param>
        /// <param name="extractAttributeName">
        /// Attribute to read when <paramref name="extractType"/> is <see cref="ExtractType.AttributeValue"/>.
        /// Matching elements that lack the attribute contribute nothing.
        /// </param>
        /// <param name="attributeValueContains">
        /// True to match when the attribute value contains <paramref name="attributeValue"/>;
        /// false to require the whole value to be equal.
        /// </param>
        /// <exception cref="ApplicationException">
        /// <paramref name="extractAttributeName"/> is null while extracting an attribute value,
        /// or <paramref name="extractType"/> is not a known value and a matching node is found.
        /// </exception>
        /// <exception cref="ArgumentNullException"><paramref name="nodeName"/> is null.</exception>
        public static List<string> Extract(
            string html,
            ExtractType extractType,
            string nodeName,
            string attributeName,
            string attributeValue,
            string extractAttributeName,
            bool attributeValueContains)
        {
            if (extractType == ExtractType.AttributeValue && extractAttributeName == null)
                throw new ApplicationException("extractAttributeName must be specified if ExtractType.AttributeValue is used.");

            if (nodeName == null)
                throw new ArgumentNullException("nodeName");

            // Normalise the filter once here rather than on every recursive call.
            // Invariant casing keeps the result independent of the current culture
            // (e.g. "IMG" must become "img", not the Turkish dotless "ımg").
            nodeName = nodeName.ToLowerInvariant().Trim();
            if (attributeName != null)
                attributeName = attributeName.ToLowerInvariant().Trim();
            if (attributeValue != null)
                attributeValue = attributeValue.ToLowerInvariant().Trim();
            if (extractAttributeName != null)
                extractAttributeName = extractAttributeName.ToLowerInvariant().Trim();

            HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
            doc.LoadHtml(html);

            List<string> list = new List<string>();

            ExtractNode(list, doc.DocumentNode, extractType, nodeName, attributeName, attributeValue, extractAttributeName, attributeValueContains);

            return list;
        }

        /// <summary>
        /// Depth-first worker for <see cref="Extract(string, ExtractType, string, string, string, string, bool)"/>.
        /// The name and value arguments must already be trimmed and lower-cased by the caller.
        /// </summary>
        private static void ExtractNode(
            List<string> list,
            HtmlAgilityPack.HtmlNode node,
            ExtractType extractType,
            string nodeName,
            string attributeName,
            string attributeValue,
            string extractAttributeName,
            bool attributeValueContains)
        {
            if (string.Equals(node.Name, nodeName, StringComparison.OrdinalIgnoreCase) &&
                MatchesAttributeFilter(node, attributeName, attributeValue, attributeValueContains))
            {
                string text = null;

                switch (extractType)
                {
                    case ExtractType.InnerText:
                        text = node.InnerText;
                        break;

                    case ExtractType.InnerHtml:
                        text = node.InnerHtml;
                        break;

                    case ExtractType.OuterHtml:
                        text = node.OuterHtml;
                        break;

                    case ExtractType.AttributeValue:
                        // A matching element without the requested attribute is skipped silently.
                        if (node.Attributes.Contains(extractAttributeName) == true)
                            text = node.Attributes[extractAttributeName].Value;
                        break;

                    default:
                        throw new ApplicationException("Unknown ExtractType\t" + extractType.ToString());
                }

                if (text != null)
                    list.Add(text.Trim());
            }

            // Matching nodes are still descended into, so nested matches are reported too.
            foreach (HtmlAgilityPack.HtmlNode childNode in node.ChildNodes)
                ExtractNode(list, childNode, extractType, nodeName, attributeName, attributeValue, extractAttributeName, attributeValueContains);
        }

        /// <summary>
        /// Tests a node against the optional attribute name/value filter.
        /// </summary>
        private static bool MatchesAttributeFilter(
            HtmlAgilityPack.HtmlNode node,
            string attributeName,
            string attributeValue,
            bool attributeValueContains)
        {
            // No attribute name means no filtering at all (any attributeValue is ignored).
            if (attributeName == null)
                return true;

            if (node.Attributes.Contains(attributeName) == false)
                return false;

            // Attribute only has to be present.
            if (attributeValue == null)
                return true;

            string actual = (node.Attributes[attributeName].Value ?? string.Empty).Trim();

            if (attributeValueContains == true)
                return actual.IndexOf(attributeValue, StringComparison.OrdinalIgnoreCase) >= 0;

            return string.Equals(actual, attributeValue, StringComparison.OrdinalIgnoreCase);
        }

        /// <summary>
        /// Resolves each link against <paramref name="url"/>, drops empty results and duplicates,
        /// and returns the remainder sorted.
        /// </summary>
        /// <param name="relativeUrls">Links as found in the page; may be relative or absolute.</param>
        /// <param name="url">Absolute address of the page the links came from.</param>
        /// <exception cref="UriFormatException">
        /// Propagated from <see cref="AbsoluteUrl"/> when a link or <paramref name="url"/> cannot be parsed.
        /// </exception>
        public static List<string> FixLinks(List<string> relativeUrls, string url)
        {
            List<string> urls = new List<string>();

            // Set membership replaces a linear List.Contains scan per link.
            HashSet<string> seen = new HashSet<string>();

            foreach (string relativeUrl in relativeUrls)
            {
                string link = AbsoluteUrl(relativeUrl, url).Trim();

                if (link.Length == 0)
                    continue;

                if (seen.Add(link) == true)
                    urls.Add(link);
            }

            urls.Sort();

            return urls;
        }

        /// <summary>
        /// Converts a link found in a page into an absolute address.
        /// </summary>
        /// <param name="targetUrl">
        /// The link. Anything starting with "http://" or "https://" is returned untouched.
        /// </param>
        /// <param name="parentUrl">Absolute address of the page that contains the link.</param>
        /// <returns>The absolute address.</returns>
        /// <exception cref="UriFormatException">
        /// <paramref name="targetUrl"/> or <paramref name="parentUrl"/> is not a valid URI.
        /// </exception>
        public static string AbsoluteUrl(string targetUrl, string parentUrl)
        {
            // Ordinal: a scheme prefix is not linguistic text, so the test must not vary with culture.
            if (targetUrl.StartsWith("http://", StringComparison.Ordinal) == true ||
                targetUrl.StartsWith("https://", StringComparison.Ordinal) == true)
            {
                return targetUrl;
            }

            Uri linkUri = new Uri(targetUrl, UriKind.RelativeOrAbsolute);
            Uri parentUri = new Uri(parentUrl, UriKind.Absolute);

            // When linkUri is itself absolute (e.g. "mailto:") the base is ignored.
            Uri uri = new Uri(parentUri, linkUri);

            return uri.AbsoluteUri;
        }

        /// <summary>
        /// Parses a table fragment into rows of decoded cell text.
        /// </summary>
        /// <param name="html">Markup whose first element is the table.</param>
        public static string[][] ParseHtmlTable(string html)
        {
            return ParseHtmlTable(html, false);
        }

        /// <summary>
        /// Parses a table fragment into rows of cells.
        /// </summary>
        /// <param name="html">
        /// Markup whose first element is the table. Leading white space and comments are skipped.
        /// Rows are read from the direct "tr" children of that element and from "tr" children of
        /// its "thead", "tbody" and "tfoot" sections. Nested tables are not expanded.
        /// </param>
        /// <param name="innerHtml">True to return each cell's inner HTML, false for its inner text.</param>
        /// <returns>
        /// One array per row holding the HTML-decoded, trimmed content of each "td"/"th" cell.
        /// Empty when the markup contains no element.
        /// </returns>
        public static string[][] ParseHtmlTable(string html, bool innerHtml)
        {
            List<string[]> rowList = new List<string[]>();

            HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
            doc.LoadHtml(html);

            // Take the first *element* rather than ChildNodes[0]: the first node is a text
            // node when the markup starts with white space, and does not exist for empty input.
            HtmlAgilityPack.HtmlNode table = null;
            foreach (HtmlAgilityPack.HtmlNode candidate in doc.DocumentNode.ChildNodes)
            {
                if (candidate.NodeType == HtmlAgilityPack.HtmlNodeType.Element)
                {
                    table = candidate;
                    break;
                }
            }

            if (table == null)
                return rowList.ToArray();

            foreach (HtmlAgilityPack.HtmlNode node in table.ChildNodes)
            {
                if (node.NodeType != HtmlAgilityPack.HtmlNodeType.Element)
                    continue;

                if (HasName(node, "tr") == true)
                {
                    rowList.Add(ReadTableRow(node, innerHtml));
                    continue;
                }

                // Rows are commonly wrapped in a section element; look one level down.
                if (HasName(node, "thead") == true || HasName(node, "tbody") == true || HasName(node, "tfoot") == true)
                {
                    foreach (HtmlAgilityPack.HtmlNode sectionNode in node.ChildNodes)
                    {
                        if (sectionNode.NodeType == HtmlAgilityPack.HtmlNodeType.Element && HasName(sectionNode, "tr") == true)
                            rowList.Add(ReadTableRow(sectionNode, innerHtml));
                    }
                }
            }

            return rowList.ToArray();
        }

        /// <summary>
        /// Reads the "td" and "th" cells of a single "tr" node.
        /// </summary>
        private static string[] ReadTableRow(HtmlAgilityPack.HtmlNode rowNode, bool innerHtml)
        {
            List<string> dataList = new List<string>();

            foreach (HtmlAgilityPack.HtmlNode dataNode in rowNode.ChildNodes)
            {
                if (dataNode.NodeType != HtmlAgilityPack.HtmlNodeType.Element)
                    continue;

                if (HasName(dataNode, "td") == false && HasName(dataNode, "th") == false)
                    continue;

                string data = innerHtml ? dataNode.InnerHtml : dataNode.InnerText;

                dataList.Add(System.Web.HttpUtility.HtmlDecode(data).Trim());
            }

            return dataList.ToArray();
        }

        /// <summary>
        /// Culture-independent, case-insensitive element name test.
        /// </summary>
        private static bool HasName(HtmlAgilityPack.HtmlNode node, string name)
        {
            return string.Equals(node.Name, name, StringComparison.OrdinalIgnoreCase);
        }
    }
}