JB's BLOG

Directed Acyclic Word Graphs

Part 2 - First Implementation

A class representing nodes

internal sealed class BasicNode
{
    public bool IsTerminal { get; set; }
    public int Id;
    public Dictionary<char, BasicNode> Children = new Dictionary<char, BasicNode>();
}

A class for building the DAWG

Here are the algorithms from the previous post. This is modified from Steve Hannov's python code.

internal sealed class BasicDawgBuilder
{
    private string _previousWord = "";
    public BasicNode Root = new BasicNode();
    private readonly Dictionary<BasicNode, BasicNode> _minimizedNodes = new Dictionary<BasicNode, BasicNode>();
    private readonly Stack<Tuple<BasicNode, char, BasicNode>> _uncheckedNodes = new Stack<Tuple<BasicNode, char, BasicNode>>();

    public void Insert(string word)
    {
        var commonPrefix = CommonPrefix(word);
        Minimize(commonPrefix);

        var node = _uncheckedNodes.Count == 0 ? Root : _uncheckedNodes.Peek().Item3;

        foreach (var letter in word.Skip(commonPrefix))
        {
            var nextNode = new BasicNode();
            node.Children[letter] = nextNode;
            _uncheckedNodes.Push(new Tuple<BasicNode, char, BasicNode>(node, letter, nextNode));
            node = nextNode;
        }

        node.IsTerminal = true;
        _previousWord = word;
    }

    private int CommonPrefix(string word)
    {
        for (var commonPrefix = 0; commonPrefix < Math.Min(word.Length, _previousWord.Length); commonPrefix++)
        {
            if (word[commonPrefix] != _previousWord[commonPrefix])
            {
                return commonPrefix;
            }
        }

        return 0;
    }

    public BasicNode Finish()
    {
        Minimize(0);
        _minimizedNodes.Clear();
        _uncheckedNodes.Clear();

        return Root;
    }

    private void Minimize(int downTo)
    {
        for (var i = _uncheckedNodes.Count - 1; i > downTo - 1; i--)
        {
            var unNode = _uncheckedNodes.Pop();
            var parent = unNode.Item1;
            var letter = unNode.Item2;
            var child = unNode.Item3;

            if (_minimizedNodes.TryGetValue(child, out var newChild))
            {
                parent.Children[letter] = newChild;
            }
            else
            {
                _minimizedNodes.Add(child, child);
            }
        }
    }
}

Perfect hashing

Finally, here is the extension to include Applications of Finite Automata Representing Large Vocabularies by Luccesi and Kowaltowski that I hinted at in the previous article.

Because we decided to work with sorted lists, this is super-easy. Simply track the word counts as they come in. They're already in order! There will be a little bit of work to pair a word with it's index in the list, but thankfully, that's taken care of in the next post.

public List<long> WordCounts = new List<long>();

public void Insert(string word)
{
    var commonPrefix = CommonPrefix(word);
    Minimize(commonPrefix);

    var node = _uncheckedNodes.Count == 0 ? Root : _uncheckedNodes.Peek().Item3;

    foreach (var letter in word.Skip(commonPrefix))
    {
        var nextNode = new BasicNode();
        node.Children[letter] = nextNode;
        _uncheckedNodes.Push(new Tuple<BasicNode, char, BasicNode>(node, letter, nextNode));
        node = nextNode;
    }

    node.IsTerminal = true;
    _previousWord = word;
    WordCounts.Add(count);
}