// Spludlow Software
// Copyright © Samuel P. Ludlow 2020 All Rights Reserved
// Distributed under the terms of the GNU General Public License version 3
// Distributed WITHOUT ANY WARRANTY; without implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE
// https://www.spludlow.co.uk/LICENCE.TXT
// The Spludlow logo is a registered trademark of Samuel P. Ludlow and may not be used without permission
// v1.14
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace Spludlow.Html
{
/// <summary>
/// Some simple helpers for HtmlAgilityPack
/// </summary>
public class Parse
{
    /// <summary>
    /// Selects which part of a matching node the Extract methods return.
    /// </summary>
    public enum ExtractType
    {
        /// <summary>The node's InnerText.</summary>
        InnerText,

        /// <summary>The node's InnerHtml.</summary>
        InnerHtml,

        /// <summary>The node's OuterHtml.</summary>
        OuterHtml,

        /// <summary>The value of the attribute named by the extractAttributeName argument.</summary>
        AttributeValue
    }

    /// <summary>
    /// Returns the trimmed href value of every "a" element that has one, in document order.
    /// Values are returned as found: not resolved, not de-duplicated, not sorted.
    /// </summary>
    /// <param name="html">HTML text to parse.</param>
    /// <returns>The href values.</returns>
    public static string[] ExtractLinks(string html)
    {
        return ExtractLinks(html, null);
    }

    /// <summary>
    /// Returns the trimmed inner text of every "title" element, joined with ", ".
    /// </summary>
    /// <param name="html">HTML text to parse.</param>
    /// <returns>The joined title text; an empty string when no title element is found.</returns>
    public static string ExtractTitle(string html)
    {
        List<string> titles = Extract(html, ExtractType.InnerText, "title");

        StringBuilder text = new StringBuilder();
        foreach (string title in titles)
        {
            // The separator is keyed on the accumulated length (not the index), so leading
            // empty titles do not produce a leading separator.
            if (text.Length > 0)
            {
                text.Append(", ");
            }
            text.Append(title);
        }

        return text.ToString();
    }

    /// <summary>
    /// Returns the href value of every "a" element that has one.
    /// </summary>
    /// <param name="html">HTML text to parse.</param>
    /// <param name="url">
    /// Absolute URL of the page the HTML came from. When not null the links are passed through
    /// <see cref="FixLinks"/> (made absolute, de-duplicated and sorted); when null they are
    /// returned as found, in document order.
    /// </param>
    /// <returns>The link values.</returns>
    public static string[] ExtractLinks(string html, string url)
    {
        return ExtractAttributeValues(html, "a", "href", url);
    }

    /// <summary>
    /// Returns the src value of every "img" element that has one.
    /// </summary>
    /// <param name="html">HTML text to parse.</param>
    /// <param name="url">
    /// Absolute URL of the page the HTML came from. When not null the values are passed through
    /// <see cref="FixLinks"/> (made absolute, de-duplicated and sorted); when null they are
    /// returned as found, in document order.
    /// </param>
    /// <returns>The image source values.</returns>
    public static string[] ExtractImages(string html, string url)
    {
        return ExtractAttributeValues(html, "img", "src", url);
    }

    /// <summary>
    /// Extracts text from every element with the given name, without any attribute filter.
    /// </summary>
    /// <param name="html">HTML text to parse.</param>
    /// <param name="extractType">Which part of each matching node to return.</param>
    /// <param name="nodeName">Element name to match (case-insensitive).</param>
    /// <returns>The trimmed extracted strings, in document order.</returns>
    public static List<string> Extract(string html, ExtractType extractType, string nodeName)
    {
        return Extract(html, extractType, nodeName, null, null, null, false);
    }

    /// <summary>
    /// Walks the whole parsed document and extracts text from every node that matches the
    /// element name and the optional attribute filter.
    /// </summary>
    /// <param name="html">HTML text to parse.</param>
    /// <param name="extractType">Which part of each matching node to return.</param>
    /// <param name="nodeName">Element name to match (case-insensitive, surrounding whitespace ignored).</param>
    /// <param name="attributeName">
    /// When not null, only nodes carrying this attribute match. When null no attribute filter is
    /// applied and <paramref name="attributeValue"/> is ignored.
    /// </param>
    /// <param name="attributeValue">
    /// When not null (and <paramref name="attributeName"/> is not null), the attribute's trimmed
    /// value must match this text, case-insensitively.
    /// </param>
    /// <param name="extractAttributeName">
    /// Name of the attribute to return; required for <see cref="ExtractType.AttributeValue"/>.
    /// Matching nodes that lack the attribute contribute nothing.
    /// </param>
    /// <param name="attributeValueContains">
    /// True to accept attribute values that contain <paramref name="attributeValue"/>;
    /// false to require the whole value to be equal.
    /// </param>
    /// <returns>The trimmed extracted strings, in document order.</returns>
    /// <exception cref="ApplicationException">
    /// <paramref name="extractType"/> is AttributeValue and <paramref name="extractAttributeName"/> is null,
    /// or a node matched and <paramref name="extractType"/> is not a known value.
    /// </exception>
    /// <exception cref="ArgumentNullException"><paramref name="nodeName"/> is null.</exception>
    public static List<string> Extract(
        string html,
        ExtractType extractType,
        string nodeName,
        string attributeName,
        string attributeValue,
        string extractAttributeName,
        bool attributeValueContains)
    {
        if (extractType == ExtractType.AttributeValue && extractAttributeName == null)
        {
            throw new ApplicationException("extractAttributeName must be specified if ExtractType.AttributeValue is used.");
        }
        if (nodeName == null)
        {
            throw new ArgumentNullException("nodeName");
        }

        HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
        doc.LoadHtml(html);

        // Normalise the filter arguments once here rather than on every recursive call.
        List<string> list = new List<string>();
        ExtractNode(
            list,
            doc.DocumentNode,
            extractType,
            Normalize(nodeName),
            Normalize(attributeName),
            Normalize(attributeValue),
            Normalize(extractAttributeName),
            attributeValueContains);

        return list;
    }

    /// <summary>
    /// Resolves each entry against <paramref name="url"/>, drops results that are empty after
    /// trimming, removes duplicates and sorts the result with the default string ordering.
    /// </summary>
    /// <param name="relativeUrls">Link values, relative or absolute.</param>
    /// <param name="url">Absolute URL the relative links are resolved against.</param>
    /// <returns>A new sorted list of distinct absolute URLs.</returns>
    public static List<string> FixLinks(List<string> relativeUrls, string url)
    {
        List<string> urls = new List<string>();

        // Set membership replaces the linear List.Contains scan per link.
        HashSet<string> seen = new HashSet<string>();

        foreach (string relativeUrl in relativeUrls)
        {
            string link = AbsoluteUrl(relativeUrl, url).Trim();
            if (link.Length == 0)
            {
                continue;
            }
            if (seen.Add(link))
            {
                urls.Add(link);
            }
        }

        urls.Sort();
        return urls;
    }

    /// <summary>
    /// Converts a link found on a page into an absolute URL.
    /// </summary>
    /// <param name="targetUrl">
    /// The link. Values starting with "http://" or "https://" (exact case) are returned unchanged.
    /// </param>
    /// <param name="parentUrl">Absolute URL of the page; used as the base for other links.</param>
    /// <returns>
    /// <paramref name="targetUrl"/> itself, or the AbsoluteUri of the link combined with <paramref name="parentUrl"/>.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="targetUrl"/> is null.</exception>
    public static string AbsoluteUrl(string targetUrl, string parentUrl)
    {
        if (targetUrl == null)
        {
            throw new ArgumentNullException("targetUrl");
        }

        // Ordinal: a scheme prefix is not linguistic text, so culture rules must not apply.
        if (targetUrl.StartsWith("http://", StringComparison.Ordinal) ||
            targetUrl.StartsWith("https://", StringComparison.Ordinal))
        {
            return targetUrl;
        }

        Uri linkUri = new Uri(targetUrl, UriKind.RelativeOrAbsolute);
        Uri parentUri = new Uri(parentUrl, UriKind.Absolute);
        Uri uri = new Uri(parentUri, linkUri);
        return uri.AbsoluteUri;
    }

    /// <summary>
    /// Parses table HTML into rows of HTML-decoded, trimmed cell text.
    /// </summary>
    /// <param name="html">HTML whose first element holds the rows (normally a table element).</param>
    /// <returns>One array per "tr" element, each holding its "td"/"th" cell values.</returns>
    public static string[][] ParseHtmlTable(string html)
    {
        return ParseHtmlTable(html, false);
    }

    /// <summary>
    /// Parses table HTML into rows of cell values.
    /// </summary>
    /// <param name="html">
    /// HTML whose first element holds the rows (normally a table element). Text and comment
    /// nodes before that element are skipped.
    /// </param>
    /// <param name="innerHtml">True to take each cell's InnerHtml, false to take its InnerText.</param>
    /// <returns>
    /// One array per "tr" element found directly under the first element or under its
    /// "thead"/"tbody"/"tfoot" children; each array holds the HTML-decoded, trimmed "td"/"th"
    /// values. Empty when the HTML contains no element.
    /// </returns>
    public static string[][] ParseHtmlTable(string html, bool innerHtml)
    {
        List<string[]> rowList = new List<string[]>();

        HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
        doc.LoadHtml(html);

        // Skip leading whitespace/comments; also avoids indexing into an empty document.
        HtmlAgilityPack.HtmlNode table = null;
        foreach (HtmlAgilityPack.HtmlNode candidate in doc.DocumentNode.ChildNodes)
        {
            if (candidate.NodeType == HtmlAgilityPack.HtmlNodeType.Element)
            {
                table = candidate;
                break;
            }
        }
        if (table == null)
        {
            return rowList.ToArray();
        }

        foreach (HtmlAgilityPack.HtmlNode node in table.ChildNodes)
        {
            if (IsElement(node, "tr"))
            {
                rowList.Add(ParseTableRow(node, innerHtml));
            }
            else if (IsElement(node, "thead") || IsElement(node, "tbody") || IsElement(node, "tfoot"))
            {
                // Rows are commonly wrapped in section elements; look one level inside them.
                foreach (HtmlAgilityPack.HtmlNode rowNode in node.ChildNodes)
                {
                    if (IsElement(rowNode, "tr"))
                    {
                        rowList.Add(ParseTableRow(rowNode, innerHtml));
                    }
                }
            }
        }

        return rowList.ToArray();
    }

    // Shared body of ExtractLinks/ExtractImages: collect one attribute from one kind of element.
    private static string[] ExtractAttributeValues(string html, string nodeName, string attributeName, string url)
    {
        List<string> list = Extract(html, ExtractType.AttributeValue, nodeName, null, null, attributeName, false);
        if (url != null)
        {
            list = FixLinks(list, url);
        }
        return list.ToArray();
    }

    // Trims and lower-cases (culture-invariant) a filter argument; null stays null.
    private static string Normalize(string value)
    {
        if (value == null)
        {
            return null;
        }
        return value.Trim().ToLowerInvariant();
    }

    // Pre-order walk: tests the node itself, then recurses into its children, so results come
    // out in document order. All string arguments are expected to be normalised already.
    private static void ExtractNode(
        List<string> list,
        HtmlAgilityPack.HtmlNode node,
        ExtractType extractType,
        string nodeName,
        string attributeName,
        string attributeValue,
        string extractAttributeName,
        bool attributeValueContains)
    {
        if (string.Equals(node.Name, nodeName, StringComparison.OrdinalIgnoreCase) &&
            AttributeFilterMatches(node, attributeName, attributeValue, attributeValueContains))
        {
            string text = ReadNode(node, extractType, extractAttributeName);
            if (text != null)
            {
                list.Add(text.Trim());
            }
        }

        foreach (HtmlAgilityPack.HtmlNode childNode in node.ChildNodes)
        {
            ExtractNode(list, childNode, extractType, nodeName, attributeName, attributeValue, extractAttributeName, attributeValueContains);
        }
    }

    // True when the node passes the optional attribute name/value filter.
    private static bool AttributeFilterMatches(
        HtmlAgilityPack.HtmlNode node,
        string attributeName,
        string attributeValue,
        bool attributeValueContains)
    {
        if (attributeName == null)
        {
            return true;    // no filter requested; attributeValue is ignored
        }

        HtmlAgilityPack.HtmlAttribute attribute = node.Attributes[attributeName];
        if (attribute == null)
        {
            return false;
        }
        if (attributeValue == null)
        {
            return true;    // presence of the attribute is enough
        }

        string actual = (attribute.Value ?? string.Empty).Trim();
        if (attributeValueContains)
        {
            return actual.IndexOf(attributeValue, StringComparison.OrdinalIgnoreCase) >= 0;
        }
        return string.Equals(actual, attributeValue, StringComparison.OrdinalIgnoreCase);
    }

    // Returns the requested part of the node, or null when the requested attribute is absent.
    private static string ReadNode(HtmlAgilityPack.HtmlNode node, ExtractType extractType, string extractAttributeName)
    {
        switch (extractType)
        {
            case ExtractType.InnerText:
                return node.InnerText;

            case ExtractType.InnerHtml:
                return node.InnerHtml;

            case ExtractType.OuterHtml:
                return node.OuterHtml;

            case ExtractType.AttributeValue:
                HtmlAgilityPack.HtmlAttribute attribute = node.Attributes[extractAttributeName];
                return attribute == null ? null : attribute.Value;

            default:
                throw new ApplicationException("Unknown ExtractType\t" + extractType.ToString());
        }
    }

    // True when the node is an element with the given (lower-case) name.
    private static bool IsElement(HtmlAgilityPack.HtmlNode node, string name)
    {
        return node.NodeType == HtmlAgilityPack.HtmlNodeType.Element &&
            string.Equals(node.Name, name, StringComparison.OrdinalIgnoreCase);
    }

    // Collects the td/th cells that are direct children of a tr element.
    private static string[] ParseTableRow(HtmlAgilityPack.HtmlNode rowNode, bool innerHtml)
    {
        List<string> dataList = new List<string>();
        foreach (HtmlAgilityPack.HtmlNode dataNode in rowNode.ChildNodes)
        {
            if (IsElement(dataNode, "td") || IsElement(dataNode, "th"))
            {
                string data = innerHtml ? dataNode.InnerHtml : dataNode.InnerText;
                dataList.Add(System.Web.HttpUtility.HtmlDecode(data).Trim());
            }
        }
        return dataList.ToArray();
    }
}
}