Directed Acyclic Word Graphs
Part 2 - First Implementation
A class representing nodes
internal sealed class BasicNode
{
public bool IsTerminal { get; set; }
public int Id;
public Dictionary<char, BasicNode> Children = new Dictionary<char, BasicNode>();
}
A class for building the DAWG
Here are the algorithms from the previous post. This is modified from Steve Hannov's python code.
internal sealed class BasicDawgBuilder
{
private string _previousWord = "";
public BasicNode Root = new BasicNode();
private readonly Dictionary<BasicNode, BasicNode> _minimizedNodes = new Dictionary<BasicNode, BasicNode>();
private readonly Stack<Tuple<BasicNode, char, BasicNode>> _uncheckedNodes = new Stack<Tuple<BasicNode, char, BasicNode>>();
public void Insert(string word)
{
var commonPrefix = CommonPrefix(word);
Minimize(commonPrefix);
var node = _uncheckedNodes.Count == 0 ? Root : _uncheckedNodes.Peek().Item3;
foreach (var letter in word.Skip(commonPrefix))
{
var nextNode = new BasicNode();
node.Children[letter] = nextNode;
_uncheckedNodes.Push(new Tuple<BasicNode, char, BasicNode>(node, letter, nextNode));
node = nextNode;
}
node.IsTerminal = true;
_previousWord = word;
}
private int CommonPrefix(string word)
{
for (var commonPrefix = 0; commonPrefix < Math.Min(word.Length, _previousWord.Length); commonPrefix++)
{
if (word[commonPrefix] != _previousWord[commonPrefix])
{
return commonPrefix;
}
}
return 0;
}
public BasicNode Finish()
{
Minimize(0);
_minimizedNodes.Clear();
_uncheckedNodes.Clear();
return Root;
}
private void Minimize(int downTo)
{
for (var i = _uncheckedNodes.Count - 1; i > downTo - 1; i--)
{
var unNode = _uncheckedNodes.Pop();
var parent = unNode.Item1;
var letter = unNode.Item2;
var child = unNode.Item3;
if (_minimizedNodes.TryGetValue(child, out var newChild))
{
parent.Children[letter] = newChild;
}
else
{
_minimizedNodes.Add(child, child);
}
}
}
}
Perfect hashing
Finally, here is the extension to include Applications of Finite Automata Representing Large Vocabularies by Luccesi and Kowaltowski that I hinted at in the previous article.
Because we decided to work with sorted lists, this is super-easy. Simply track the word counts as they come in. They're already in order! There will be a little bit of work to pair a word with it's index in the list, but thankfully, that's taken care of in the next post.
public List<long> WordCounts = new List<long>();
public void Insert(string word)
{
var commonPrefix = CommonPrefix(word);
Minimize(commonPrefix);
var node = _uncheckedNodes.Count == 0 ? Root : _uncheckedNodes.Peek().Item3;
foreach (var letter in word.Skip(commonPrefix))
{
var nextNode = new BasicNode();
node.Children[letter] = nextNode;
_uncheckedNodes.Push(new Tuple<BasicNode, char, BasicNode>(node, letter, nextNode));
node = nextNode;
}
node.IsTerminal = true;
_previousWord = word;
WordCounts.Add(count);
}