From 1ee06dde59ec382c7b8377f1599a08a53d5ad511 Mon Sep 17 00:00:00 2001
From: John Doty <john@d0ty.me>
Date: Fri, 9 Dec 2016 15:06:55 -0800
Subject: [PATCH] LALR parsers

I'm starting to get worried about quadratic behavior though.
---
 parser.py | 142 ++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 138 insertions(+), 4 deletions(-)

diff --git a/parser.py b/parser.py
index 4136024..24fb258 100644
--- a/parser.py
+++ b/parser.py
@@ -1,4 +1,30 @@
-# This is doty playing with parser tables.
+"""A collection of LR parser generators, from LR0 through LALR.
+
+One day I read a tweet, asking for a tool which accepted a grammar and an
+input file and which then produced simple parsed output, without any kind of
+in-between. (There was other ranty stuff about how none of the existing tools
+really worked, but that was beside the point.)
+
+Upon reading the tweet, it occurred to me that I didn't know how LR parsers
+worked and how they were generated, except in the broadest of terms. Thus, I
+set about writing this, learning as I went.
+
+This code is not written to be fast, or even efficient, although it runs its
+test cases fast enough. It was instead written to be easy to follow along
+with, so that when I forget how all this works I can come back to the code
+and read along and learn all over again.
+
+(BTW, the notes I read to learn how all this works are at
+http://dragonbook.stanford.edu/lecture-notes/Stanford-CS143/. Specifically,
+I started with handout 8, 'Bottom-up-parsing', and went from there. (I did
+eventually have to backtrack a little into handout 7, since that's where
+First() and Follow() are covered.))
+
+Enjoy!
+
+doty
+2016-12-09
+"""
 from collections import namedtuple
 
 
@@ -10,7 +36,12 @@ from collections import namedtuple
 class Configuration(
     namedtuple('Configuration', ['name', 'symbols', 'position', 'lookahead'])
 ):
-    """A rule being tracked in a state."""
+    """A rule being tracked in a state.
+
+    (Note: technically, lookahead isn't used until we get to LR(1) parsers,
+    but if left at its default it's harmless. Ignore it until you get to
+    the part about LR(1).)
+    """
     __slots__ = ()
 
     @classmethod
@@ -55,7 +86,7 @@ class Configuration(
 class GenerateLR0(object):
     """Generate parser tables for an LR0 parser.
 
-    Grammars are of the form:
+    The input grammars are of the form:
 
       grammar_simple = [
         ('E', ['E', '+', 'T']),
@@ -69,6 +100,10 @@ class GenerateLR0(object):
     non-terminal being added, and the second elment of the tuple is the
     list of terminals and non-terminals that make up the production.
 
+    There is currently no support for custom actions or alternation or
+    anything like that. If you want alternations then you'll have to lower
+    the grammar by hand into the simpler form first.
+
     Don't name anything with double-underscores; those are reserved for
     the generator. Don't add '$' either, as it is reserved to mean
     end-of-stream. Use an empty list to indicate nullability, that is:
@@ -77,7 +112,7 @@ class GenerateLR0(object):
 
     means that O can be matched with nothing.
 
-    Implementation nodes:
+    Implementation notes:
     - This is implemented in the dumbest way possible, in order to be the
       most understandable it can be. I built this to learn, and I want to
       make sure I can keep learning with it.
@@ -585,6 +620,75 @@ class GenerateLR1(GenerateSLR1):
         return self.gen_sets(initial_set, ())
 
 
+class GenerateLALR(GenerateLR1):
+    """Generate tables for LALR.
+
+    LALR is smaller than LR(1) but bigger than SLR(1). It works by generating
+    the LR(1) configuration sets, but merging configuration sets which are
+    equal in everything but their lookaheads. This works in that it doesn't
+    generate any shift/reduce conflicts that weren't already in the LR(1)
+    grammar. It can, however, introduce new reduce/reduce conflicts, because
+    it does lose information. The advantage is that the number of parser
+    states is much much smaller in LALR than in LR(1).
+
+    (Note that because we use immutable state everywhere this generator does
+    a lot of copying and allocation.)
+    """
+    def merge_sets(self, config_set_a, config_set_b):
+        """Merge the two config sets, by keeping the item cores but merging
+        the lookahead sets for each item.
+        """
+        assert len(config_set_a) == len(config_set_b)
+        merged = []
+        for index, a in enumerate(config_set_a):
+            b = config_set_b[index]
+            assert a.replace(lookahead=()) == b.replace(lookahead=())
+
+            new_lookahead = a.lookahead + b.lookahead
+            new_lookahead = tuple(sorted(set(new_lookahead)))
+            merged.append(a.replace(lookahead=new_lookahead))
+
+        return tuple(merged)
+
+    def sets_equal(self, a, b):
+        a_no_la = tuple(s.replace(lookahead=()) for s in a)
+        b_no_la = tuple(s.replace(lookahead=()) for s in b)
+        return a_no_la == b_no_la
+
+    def gen_sets(self, config_set, F):
+        """Recursively generate all configuration sets starting from the
+        provided set, and merge them with the provided set 'F'.
+
+        The difference between this method and the one in GenerateLR0, where
+        this comes from, is in the part that stops recursion. In LALR we
+        compare for set equality *ignoring lookahead*. If we find a match,
+        then instead of returning F unchanged, we merge the two equal sets
+        and replace the set in F, returning the modified set.
+        """
+        config_set_no_la = tuple(s.replace(lookahead=()) for s in config_set)
+        for index, existing in enumerate(F):
+            existing_no_la = tuple(s.replace(lookahead=()) for s in existing)
+            if config_set_no_la == existing_no_la:
+                merged_set = self.merge_sets(config_set, existing)
+                return F[:index] + (merged_set,) + F[index+1:]
+
+        # No merge candidate found, proceed.
+        new_F = F + (config_set,)
+        for successor in self.gen_all_successors(config_set):
+            new_F = self.gen_sets(successor, new_F)
+
+        return new_F
+
+    def find_set_index(self, sets, set):
+        """Find the specified set in the set of sets, and return the
+        index, or None if it is not found.
+        """
+        for i, s in enumerate(sets):
+            if self.sets_equal(s, set):
+                return i
+        return None
+
+
 ###############################################################################
 # Formatting
 ###############################################################################
@@ -659,6 +763,7 @@ gen = GenerateLR0('E', grammar_simple)
 table = gen.gen_table()
 tree = parse(table, ['id', '+', '(', 'id', ')'])
 print(format_node(tree) + "\n")
+print()
 
 # This one doesn't work with LR0, though, it has a shift/reduce conflict.
 grammar_lr0_shift_reduce = grammar_simple + [
@@ -670,6 +775,7 @@ try:
     assert False
 except ValueError as e:
     print(e)
+print()
 
 # Nor does this: it has a reduce/reduce conflict.
 grammar_lr0_reduce_reduce = grammar_simple + [
@@ -682,6 +788,7 @@ try:
     assert False
 except ValueError as e:
     print(e)
+print()
 
 # Nullable symbols just don't work with constructs like this, because you can't
 # look ahead to figure out if you should reduce an empty 'F' or not.
@@ -704,6 +811,7 @@ table = gen.gen_table()
 print(format_table(gen, table))
 tree = parse(table, ['id', '+', '(', 'id', '[', 'id', ']', ')'])
 print(format_node(tree) + "\n")
+print()
 
 # SLR1 can't handle this.
 grammar_aho_ullman_1 = [
@@ -719,6 +827,7 @@ try:
     assert False
 except ValueError as e:
     print(e)
+print()
 
 # Here's an example with a full LR1 grammar, though.
 grammar_aho_ullman_2 = [
@@ -730,3 +839,28 @@ gen = GenerateLR1('S', grammar_aho_ullman_2)
 table = gen.gen_table()
 print(format_table(gen, table))
 parse(table, ['b', 'a', 'a', 'b'], trace=True)
+print()
+
+# What happens if we do LALR to it?
+gen = GenerateLALR('S', grammar_aho_ullman_2)
+table = gen.gen_table()
+print(format_table(gen, table))
+print()
+
+# A fun LALR grammar.
+grammar_lalr = [
+    ('S', ['V', 'E']),
+
+    ('E', ['F']),
+    ('E', ['E', '+', 'F']),
+
+    ('F', ['V']),
+    ('F', ['int']),
+    ('F', ['(', 'E', ')']),
+
+    ('V', ['id']),
+]
+gen = GenerateLALR('S', grammar_lalr)
+table = gen.gen_table()
+print(format_table(gen, table))
+print()