Canonical LR1.

Also: (1) reorganize the file into sections so I can keep track of where I am, and so it reads more cleanly from top to bottom; (2) a little more work on documentation, comments, and the like.
2016-12-09 08:54:28 -08:00 · 2016-12-09 08:54:28 -08:00 · 9fe44d30e0
commit 9fe44d30e0
parent c4be7bcd9f
1 changed files with 193 additions and 56 deletions
--- a/parser.py
+++ b/parser.py
@ -2,15 +2,25 @@
 from collections import namedtuple


+###############################################################################
+# LR0
+#
+# We start with LR0 parsers, because they form the basis of everything else.
+###############################################################################
 class Configuration(
-    namedtuple('Configuration', ['name', 'symbols', 'position'])
+    namedtuple('Configuration', ['name', 'symbols', 'position', 'lookahead'])
 ):
    """A rule being tracked in a state."""
    __slots__ = ()

    @classmethod
-    def from_rule(cls, rule):
-        return Configuration(name=rule[0], symbols=rule[1], position=0)
+    def from_rule(cls, rule, lookahead=()):
+        return Configuration(
+            name=rule[0],
+            symbols=rule[1],
+            position=0,
+            lookahead=lookahead,
+        )

    @property
    def at_end(self):
@ -20,6 +30,10 @@ class Configuration(
    def next(self):
        return self.symbols[self.position] if not self.at_end else None

+    @property
+    def rest(self):
+        return self.symbols[(self.position+1):]
+
    def at_symbol(self, symbol):
        return self.next == symbol

@ -27,12 +41,14 @@ class Configuration(
        return self._replace(**kwargs)

    def __str__(self):
-        return "{name} -> {bits}".format(
+        la = ", " + str(self.lookahead) if self.lookahead != () else ""
+        return "{name} -> {bits}{lookahead}".format(
            name=self.name,
            bits=' '.join([
                '* ' + sym if i == self.position else sym
                for i, sym in enumerate(self.symbols)
-            ]) + (' *' if self.at_end else '')
+            ]) + (' *' if self.at_end else ''),
+            lookahead=la,
        )


@ -61,9 +77,17 @@ class GenerateLR0(object):

    means that O can be matched with nothing.

-    Note that this is implemented in the dumbest way possible, in order to be
-    the most understandable it can be. I built this to learn, and I want to
-    make sure I can keep learning with it.
+    Implementation notes:
+    - This is implemented in the dumbest way possible, in order to be the
+      most understandable it can be. I built this to learn, and I want to
+      make sure I can keep learning with it.
+
+    - We tend to use tuples everywhere. This is because tuples can be
+      compared for equality and put into tables and all that jazz. They might
+      be a little bit slower in places but like I said, this is for
+      learning. (Also, if we need this to run faster we can probably go a
+      long way by memoizing results, which is much easier if we have tuples
+      everywhere.)
    """
    def __init__(self, start, grammar):
        """Initialize the parser generator with the specified grammar and
@ -73,7 +97,7 @@ class GenerateLR0(object):
        # production for the start state. grammar[0] is always the start
        # rule, and in the set of states and table and whatever the first
        # element is always the starting state/position.
-        self.grammar = [('__start', start)] + grammar
+        self.grammar = [('__start', [start])] + grammar
        self.nonterminals = set(rule[0] for rule in grammar)
        self.terminals = set(
            sym
@ -121,7 +145,8 @@ class GenerateLR0(object):
        existing closure.

        If the provided config is already in the closure then nothing is
-        done.
+        done. (We assume that the closure of the config is *also* already in
+        the closure.)
        """
        if config in closure:
            return closure
@ -192,7 +217,7 @@ class GenerateLR0(object):

    def gen_reduce_set(self, config):
        """Return the set of symbols that indicate we should reduce the given
-        config.
+        configuration.

        In an LR0 parser, this is just the set of all terminals."""
        return self.terminals
@ -310,14 +335,85 @@ class GenerateLR0(object):
        return row[symbol][0]


+def parse(table, input, trace=False):
+    """Parse the input with the generated parsing table and return the
+    concrete syntax tree.
+
+    The parsing table can be generated by GenerateLR0.gen_table() or by any
+    of the other generators below. The parsing mechanism never changes, only
+    the table generation mechanism.
+
+    table is a sequence of rows indexed by state number. Each row maps a
+    symbol to an action tuple: ('shift', state), ('reduce', name, size),
+    ('goto', state) or ('accept',). A missing entry is a syntax error.
+
+    input is a list of tokens. Don't add an end-of-stream marker ('$')
+    yourself; one is appended for you.
+
+    If trace is true, the state stack, the remaining input, and the chosen
+    action are printed at every step.
+
+    Returns the tree as nested (name, children) tuples, where children is a
+    tuple of subtrees. A shifted token is a leaf of the form (token, ()).
+
+    Raises ValueError when the current token has no action in the current
+    state.
+    """
+    assert '$' not in input
+    input = input + ['$']
+    input_index = 0
+
+    # Our stack is a stack of tuples, where the first entry is the state number
+    # and the second entry is the 'value' that was generated when the state was
+    # pushed.
+    stack = [(0, None)]
+    while True:
+        current_state = stack[-1][0]
+        current_token = input[input_index]
+
+        action = table[current_state].get(current_token, ('error',))
+        if trace:
+            print("{stack: <20}  {input: <50}  {action: <5}".format(
+                stack=repr([s[0] for s in stack]),
+                input=repr(input[input_index:]),
+                action=repr(action)
+            ))
+
+        if action[0] == 'accept':
+            return stack[-1][1]
+
+        elif action[0] == 'reduce':
+            name = action[1]
+            size = action[2]
+
+            # Split the stack at an absolute index instead of slicing with
+            # -size: when size is 0 (reducing an empty production),
+            # stack[-0:] is the *whole* stack and stack[:-0] is empty, which
+            # would swallow every value and leave no state to goto from.
+            split = len(stack) - size
+            value = (name, tuple(s[1] for s in stack[split:]))
+            stack = stack[:split]
+
+            # The state uncovered by the pop decides where the reduced
+            # nonterminal takes us.
+            goto = table[stack[-1][0]].get(name, ('error',))
+            assert goto[0] == 'goto'  # Corrupt table?
+            stack.append((goto[1], value))
+
+        elif action[0] == 'shift':
+            stack.append((action[1], (current_token, ())))
+            input_index += 1
+
+        elif action[0] == 'error':
+            raise ValueError(
+                'Syntax error: unexpected symbol {sym}'.format(
+                    sym=current_token,
+                ),
+            )
+
+
+###############################################################################
+# SLR(1)
+###############################################################################
 class GenerateSLR1(GenerateLR0):
    """Generate parse tables for SLR1 grammars.

-    boop
+    SLR1 parsers can recognize more than LR0 parsers, because they have a
+    little bit more information: instead of generating reduce actions for a
+    production on all possible inputs, as LR0 parsers do, they generate
+    reduce actions only for inputs that are in the 'follow' set of the
+    non-terminal.
+
+    That means SLR1 parsers need to know how to generate 'follow(A)', which
+    means they need to know how to generate 'first(A)', which is most of the
+    code in this class.
    """
    def gen_first_symbol(self, symbol, visited):
        """Compute the first set for a single symbol.

+        If a symbol can be empty, then the set contains epsilon, which we
+        represent as python's `None`.
+
        The first set is the set of tokens that can appear as the first token
        for a given symbol. (Obviously, if the symbol is itself a token, then
        this is trivial.)
@ -325,7 +421,7 @@ class GenerateSLR1(GenerateLR0):
        'visited' is a set of already visited symbols, to stop infinite
        recursion on left-recursive grammars. That means that sometimes this
        function can return an empty tuple. Don't confuse that with a tuple
-        containing epsilon: that's a tuple containing 'None', not an empty
+        containing epsilon: that's a tuple containing `None`, not an empty
        tuple.
        """
        if symbol in self.terminals:
@ -347,7 +443,7 @@ class GenerateSLR1(GenerateLR0):
            for fs in firsts:
                result = result + tuple(f for f in fs if f not in result)

-            return result
+            return tuple(sorted(result))

    def gen_first(self, symbols, visited=None):
        """Compute the first set for a sequence of symbols.
@ -355,7 +451,7 @@ class GenerateSLR1(GenerateLR0):
        The first set is the set of tokens that can appear as the first token
        for this sequence of symbols. The interesting wrinkle in computing the
        first set for a sequence of symbols is that we keep computing the first
-        sets so long as Epsilon appears in the set. i.e., if we are computing
+        sets so long as epsilon appears in the set. i.e., if we are computing
        for ['A', 'B', 'C'] and the first set of 'A' contains epsilon, then the
        first set for the *sequence* also contains the first set of ['B', 'C'],
        since 'A' could be missing entirely.
@ -374,8 +470,9 @@ class GenerateSLR1(GenerateLR0):
                visited = set()
            result = self.gen_first_symbol(symbols[0], visited)
            if None in result:
-                result = tuple(set(s for s in result if s is not None))
+                result = tuple(s for s in result if s is not None)
                result = result + self.gen_first(symbols[1:], visited)
+                result = tuple(sorted(set(result)))
            return result

    def gen_follow(self, symbol, visited=None):
@ -420,51 +517,77 @@ class GenerateSLR1(GenerateLR0):
        return self.gen_follow(config.name)


-def parse(table, input, trace=False):
-    """Parse the input with the generated parsing table and return the
-    concrete syntax tree.
+class GenerateLR1(GenerateSLR1):
+    """Generate parse tables for LR1, or "canonical LR" grammars.

-    input is a list of tokens. Don't stick an end-of-stream marker, I'll stick
-    one on for you.
+    LR1 parsers can recognize more than SLR parsers. Like SLR parsers, they
+    are choosier about when they reduce. But unlike SLR parsers, they specify
+    the terminals on which they reduce by carrying a 'lookahead' terminal in
+    the configuration. The lookahead of a configuration is computed as the
+    closure of a configuration set is computed, so see gen_closure_next for
+    details. (Except for the start configuration, which has '$' as its
+    lookahead.)
    """
-    input = input + ['$']
-    input_index = 0
-    stack = [(0, None)]
-    while True:
-        current_state = stack[-1][0]
-        current_token = input[input_index]
+    def gen_reduce_set(self, config):
+        """Return the set of symbols that indicate we should reduce the given
+        config.

-        action = table[current_state].get(current_token, ('error',))
-        if trace:
-            print("{stack: <20}  {input: <50}  {action: <5}".format(
-                stack=[s[0] for s in stack],
-                input=input[input_index:],
-                action=action
-            ))
+        In an LR1 parser, this is the lookahead of the configuration."""
+        return config.lookahead

-        if action[0] == 'accept':
-            return stack[-1][1]
+    def gen_closure_next(self, config):
+        """Return the next set of configurations in the closure for
+        config.

-        elif action[0] == 'reduce':
-            name = action[1]
-            size = action[2]
+        In LR1 parsers, we must compute the lookahead for the configurations
+        we're adding to the closure. The lookahead for the new configurations
+        is the first() of the rest of this config's production. If that
+        contains epsilon, then the lookahead *also* contains the lookahead we
+        already have. (This lookahead was presumably generated by the same
+        process, so in some sense it is a 'parent' lookahead, or a lookahead
+        from an upstream production in the grammar.)

-            value = (name, tuple(s[1] for s in stack[-size:]))
-            stack = stack[:-size]
+        (See the documentation in GenerateLR0 for more information on how
+        this function fits into the whole process.)
+        """
+        if config.at_end:
+            return ()
+        else:
+            next = []
+            for rule in self.grammar:
+                if rule[0] != config.next:
+                    continue

-            goto = table[stack[-1][0]].get(name, ('error',))
-            if (goto[0] != 'goto'):
-                raise ValueError('OH NOES GOTO')
-            stack.append((goto[1], value))
+                # N.B.: We can't just append config.lookahead to config.rest
+                #       and compute first(), because lookahead is a *set*. So
+                #       in this case we just say if 'first' contains epsilon,
+                #       then we need to remove the epsilon and union with the
+                #       existing lookahead.
+                lookahead = self.gen_first(config.rest)
+                if None in lookahead:
+                    lookahead = tuple(l for l in lookahead if l is not None)
+                    lookahead = lookahead + config.lookahead
+                    lookahead = tuple(sorted(set(lookahead)))
+                next.append(Configuration.from_rule(rule, lookahead=lookahead))

-        elif action[0] == 'shift':
-            stack.append((action[1], (current_token, ())))
-            input_index += 1
+            return tuple(next)

-        elif action[0] == 'error':
-            raise ValueError('OH NOES WAT')
+    def gen_all_sets(self):
+        """Generate all of the configuration sets for the grammar.
+
+        In LR1 parsers, we must remember to set the lookahead of the start
+        symbol to '$'.
+        """
+        initial_set = self.gen_closure(
+            Configuration.from_rule(self.grammar[0], lookahead=('$',)),
+            (),
+        )
+        return self.gen_sets(initial_set, ())


+###############################################################################
+# Formatting
+###############################################################################
 def format_node(node):
    """Print out an indented concrete syntax tree, from parse()."""
    lines = [
@ -493,11 +616,11 @@ def format_table(generator, table):
    header = "    | {terms} | {nts}".format(
        terms=' '.join(
            '{0: <6}'.format(terminal)
-            for terminal in (generator.terminals)
+            for terminal in sorted(generator.terminals)
        ),
        nts=' '.join(
            '{0: <5}'.format(nt)
-            for nt in generator.nonterminals
+            for nt in sorted(generator.nonterminals)
        ),
    )

@ -509,11 +632,11 @@ def format_table(generator, table):
            index=i,
            actions=' '.join(
                '{0: <6}'.format(format_action(row, terminal))
-                for terminal in (generator.terminals)
+                for terminal in sorted(generator.terminals)
            ),
            gotos=' '.join(
                '{0: <5}'.format(row.get(nt, ('error', ''))[1])
-                for nt in generator.nonterminals
+                for nt in sorted(generator.nonterminals)
            ),
        )
        for i, row in enumerate(table)
@ -521,6 +644,9 @@ def format_table(generator, table):
    return '\n'.join(lines)


+###############################################################################
+# Examples
+###############################################################################
 # OK, this is a very simple LR0 grammar.
 grammar_simple = [
    ('E', ['E', '+', 'T']),
@ -580,7 +706,7 @@ tree = parse(table, ['id', '+', '(', 'id', '[', 'id', ']', ')'])
 print(format_node(tree) + "\n")

 # SLR1 can't handle this.
-grammar_aho_ullman = [
+grammar_aho_ullman_1 = [
    ('S', ['L', '=', 'R']),
    ('S', ['R']),
    ('L', ['*', 'R']),
@ -588,8 +714,19 @@ grammar_aho_ullman = [
    ('R', ['L']),
 ]
 try:
-    gen = GenerateSLR1('S', grammar_aho_ullman)
+    gen = GenerateSLR1('S', grammar_aho_ullman_1)
    table = gen.gen_table()
    assert False
 except ValueError as e:
    print(e)
+
+# Here's an example with a full LR1 grammar, though.
+grammar_aho_ullman_2 = [
+    ('S', ['X', 'X']),
+    ('X', ['a', 'X']),
+    ('X', ['b']),
+]
+gen = GenerateLR1('S', grammar_aho_ullman_2)
+table = gen.gen_table()
+print(format_table(gen, table))
+parse(table, ['b', 'a', 'a', 'b'], trace=True)