Html Link Validation

Writing documentation in HTML for processing into help files with tools like Sandcastle and Sandcastle Help File Builder means that it's easy to get links between documents wrong without the tools complaining, particularly since the documentation input format may not match the output that these tools actually generate.

This little spider utility uses the Html Agility Pack to check the output files generated by Sandcastle and make sure the internal links are consistent. No validation is done on external links; in this version I assume these are correct.

First off, a little class to find all html-like files in a given folder and its sub folders.


/// <summary>
/// Locates html-like files beneath a folder.
/// </summary>
public class HtmlFileFinder
{
	/// <summary>
	/// Collects the paths of every file under <paramref name="folder"/>, including
	/// its sub folders, whose name matches the "*.htm?" search pattern.
	/// </summary>
	/// <param name="folder">Root folder to search.</param>
	/// <returns>A fully populated list of the matching file paths.</returns>
	public IEnumerable<string> Find(string folder)
	{
		const string pattern = "*.htm?";
		IEnumerable<string> matches = Directory.EnumerateFiles(folder, pattern, SearchOption.AllDirectories);

		// Copy into a list so the directory walk has completed before we return.
		return new List<string>(matches);
	}
}

Next we need an event to represent a link found in an html document:


/// <summary>
/// Event data describing a single hyperlink found in an html document.
/// </summary>
public class HyperLinkEventArgs : EventArgs
{
	/// <summary>The parsed html document in which the link was found.</summary>
	public HtmlDocument Document { get; set; }

	/// <summary>Path of the file that <see cref="Document"/> was loaded from.</summary>
	public string FilePath { get; set; }

	/// <summary>The value of the anchor's href attribute.</summary>
	public string Link { get; set; }

	/// <summary>Line of the href attribute, as reported by the html parser.</summary>
	public int Line { get; set; }

	/// <summary>Position within the line of the href attribute, as reported by the html parser.</summary>
	public int Column { get; set; }
}

Now, the AnchorFinder which processes a single html document and fires an event each time an anchor is found.


/// <summary>
/// Scans html documents for anchors carrying an href attribute and raises
/// <see cref="LinkFound"/> for each one that does not start with an ignored prefix.
/// </summary>
public class AnchorFinder
{
	private readonly List<string> ignoredPrefixes = new List<string>();

	/// <summary>
	/// Raised once for every non-ignored, non-empty href found by <see cref="Find"/>.
	/// </summary>
	public event EventHandler<HyperLinkEventArgs> LinkFound;

	/// <summary>
	/// Registers a link prefix (for example "mailto") whose links are not reported.
	/// Prefixes are matched ordinally and without regard to case.
	/// </summary>
	/// <param name="prefix">The prefix to ignore.</param>
	/// <exception cref="ArgumentNullException"><paramref name="prefix"/> is null.</exception>
	public void Ignore(string prefix)
	{
		// A null prefix would make every later StartsWith call throw, and that
		// exception would be swallowed per document, silently hiding all links.
		if (prefix == null)
		{
			throw new ArgumentNullException("prefix");
		}

		this.ignoredPrefixes.Add(prefix);
	}

	/// <summary>
	/// Loads each document in turn and raises <see cref="LinkFound"/> for its links.
	/// </summary>
	/// <param name="documents">Paths of the html files to scan.</param>
	/// <remarks>
	/// Processing is best effort: any exception raised while loading or scanning a
	/// document (including one thrown by an event handler) abandons that document
	/// and the scan moves on to the next one.
	/// </remarks>
	public void Find(IEnumerable<string> documents)
	{
		foreach (string document in documents)
		{
			try
			{
				HtmlDocument html = new HtmlDocument();
				html.Load(document);

				if (html.DocumentNode == null)
				{
					continue;
				}

				var allLinks = html.DocumentNode.SelectNodes("//a[@href]");

				// SelectNodes returns null rather than an empty collection when nothing matches.
				if (allLinks == null)
				{
					continue;
				}

				foreach (HtmlNode link in allLinks)
				{
					HtmlAttribute href = link.Attributes["href"];

					if (href == null || String.IsNullOrEmpty(href.Value))
					{
						continue;
					}

					string target = href.Value;

					if (this.IsIgnored(target))
					{
						continue;
					}

					// Snapshot the delegate first, then test the snapshot, so the
					// null check and the invocation see the same subscriber list.
					var handler = this.LinkFound;

					if (handler == null)
					{
						continue;
					}

					handler(this, new HyperLinkEventArgs
					{
						Document = html,
						FilePath = document,
						Link = target,
						Line = href.Line,
						Column = href.LinePosition
					});
				}
			}
			catch (Exception)
			{
				// Deliberately best effort: skip this document and continue with the next.
			}
		}
	}

	/// <summary>
	/// Returns true when the link starts with any registered prefix.
	/// </summary>
	private bool IsIgnored(string link)
	{
		foreach (string prefix in this.ignoredPrefixes)
		{
			// URI schemes are case-insensitive, and a culture-sensitive comparison
			// could give different answers on different machines.
			if (link.StartsWith(prefix, StringComparison.OrdinalIgnoreCase))
			{
				return true;
			}
		}

		return false;
	}
}

I added the ability to ignore certain url prefixes so we can ignore mailto and local file links.

Finally the Main class to use all the preceding pieces.


/// <summary>
/// Console entry point: validates the internal links of every html file found
/// beneath the folder given as the first command line argument.
/// </summary>
class Program
{
	/// <summary>
	/// Runs the link validation and prints the errors followed by a summary.
	/// </summary>
	/// <param name="args">Command line arguments; the first one is the folder to scan.</param>
	/// <returns>-1 when the folder argument is missing or does not exist, otherwise 0.</returns>
	static int Main(string[] args)
	{
		if (args.Length == 0)
		{
			Console.WriteLine("Usage: pass the folder containing the generated html files as the first argument");
			return -1;
		}

		string folder = args[0];

		if (!Directory.Exists(folder))
		{
			Console.WriteLine("Folder does not exist");
			return -1;
		}

		HtmlFileFinder fileFinder = new HtmlFileFinder();

		var htmlList = fileFinder.Find(folder);

		List<string> errors = new List<string>();

		AnchorFinder linkFinder = new AnchorFinder();

		// mail addresses and explicit local file links are not validated
		linkFinder.Ignore("mailto");
		linkFinder.Ignore("file://");

		linkFinder.LinkFound += (obj, e) =>
		{
			string link = e.Link;

			if (link.StartsWith("#", StringComparison.Ordinal))
			{
				// link to an anchor inside the same document
				CheckAnchorInSameDocument(e, errors);
			}
			else if (!IsExternal(link))
			{
				// link to another local file (optionally with a fragment)
				CheckLocalFile(e, errors);
			}

			// external links are assumed to be correct and are not pinged
		};

		linkFinder.Find(htmlList);

		foreach (string error in errors)
		{
			Console.WriteLine(error);
		}

		Console.WriteLine("\r\nSummary");
		Console.WriteLine("===================================");
		Console.WriteLine("{0} errors detected.", errors.Count);
		Console.ReadLine();

		return 0;
	}

	/// <summary>
	/// Returns true for http/https links and protocol-relative ("//host/...") links.
	/// </summary>
	private static bool IsExternal(string link)
	{
		// Match the scheme including its colon so a local file such as
		// "httpfoo.html" is not mistaken for an external address.
		return link.StartsWith("http:", StringComparison.OrdinalIgnoreCase)
			|| link.StartsWith("https:", StringComparison.OrdinalIgnoreCase)
			|| link.StartsWith("//", StringComparison.Ordinal);
	}

	/// <summary>
	/// Records an error when a "#name" link has no matching id (on any element)
	/// or name (on an anchor element) in the document that contains the link.
	/// </summary>
	private static void CheckAnchorInSameDocument(HyperLinkEventArgs e, List<string> errors)
	{
		string anchor = e.Link.Substring(1);

		// "#" and "#top" address the top of the document and need no target element.
		if (anchor.Length == 0 || String.Equals(anchor, "top", StringComparison.OrdinalIgnoreCase))
		{
			return;
		}

		// "//" searches the whole document; SelectNodes returns null when nothing matches.
		var candidates = e.Document.DocumentNode.SelectNodes("//*[@id or @name]");

		if (candidates != null)
		{
			foreach (HtmlNode candidate in candidates)
			{
				if (candidate.GetAttributeValue("id", String.Empty) == anchor)
				{
					return;
				}

				// Only anchor elements define link targets through their name attribute.
				if (String.Equals(candidate.Name, "a", StringComparison.OrdinalIgnoreCase)
					&& candidate.GetAttributeValue("name", String.Empty) == anchor)
				{
					return;
				}
			}
		}

		errors.Add(String.Format("{0}({1},{2}): Internal link \"{3}\" does not exist", e.FilePath, e.Line, e.Column, e.Link));
	}

	/// <summary>
	/// Records an error when the file a relative link points at does not exist.
	/// Any fragment on the link is stripped first; the fragment itself is not
	/// validated against the target document.
	/// </summary>
	private static void CheckLocalFile(HyperLinkEventArgs e, List<string> errors)
	{
		string link = e.Link;
		int hash = link.IndexOf('#');
		string pathPart = hash >= 0 ? link.Substring(0, hash) : link;

		try
		{
			// Resolve against the absolute path of the containing file so that a
			// relative folder argument still yields a valid base Uri.
			Uri documentUri = new Uri(Path.GetFullPath(e.FilePath));
			Uri target = new Uri(documentUri, pathPart);

			// Links with another scheme (javascript:, ftp:, ...) are not local files.
			if (!target.IsFile)
			{
				return;
			}

			// LocalPath is already unescaped, so "%20" has become a space here.
			if (!File.Exists(target.LocalPath))
			{
				errors.Add(String.Format("{0}({1},{2}): Link to \"{3}\" does not exist", e.FilePath, e.Line, e.Column, link));
			}
		}
		catch (Exception ex)
		{
			// Report links that cannot be resolved instead of silently passing them.
			errors.Add(String.Format("{0}({1},{2}): Link to \"{3}\" could not be resolved: {4}", e.FilePath, e.Line, e.Column, link, ex.Message));
		}
	}
}