diff --git a/parser.py b/parser.py index 6a94c99..4136024 100644 --- a/parser.py +++ b/parser.py @@ -2,15 +2,25 @@ from collections import namedtuple +############################################################################### +# LR0 +# +# We start with LR0 parsers, because they form the basis of everything else. +############################################################################### class Configuration( - namedtuple('Configuration', ['name', 'symbols', 'position']) + namedtuple('Configuration', ['name', 'symbols', 'position', 'lookahead']) ): """A rule being tracked in a state.""" __slots__ = () @classmethod - def from_rule(cls, rule): - return Configuration(name=rule[0], symbols=rule[1], position=0) + def from_rule(cls, rule, lookahead=()): + return Configuration( + name=rule[0], + symbols=rule[1], + position=0, + lookahead=lookahead, + ) @property def at_end(self): @@ -20,6 +30,10 @@ class Configuration( def next(self): return self.symbols[self.position] if not self.at_end else None + @property + def rest(self): + return self.symbols[(self.position+1):] + def at_symbol(self, symbol): return self.next == symbol @@ -27,12 +41,14 @@ class Configuration( return self._replace(**kwargs) def __str__(self): - return "{name} -> {bits}".format( + la = ", " + str(self.lookahead) if self.lookahead != () else "" + return "{name} -> {bits}{lookahead}".format( name=self.name, bits=' '.join([ '* ' + sym if i == self.position else sym for i, sym in enumerate(self.symbols) - ]) + (' *' if self.at_end else '') + ]) + (' *' if self.at_end else ''), + lookahead=la, ) @@ -61,9 +77,17 @@ class GenerateLR0(object): means that O can be matched with nothing. - Note that this is implemented in the dumbest way possible, in order to be - the most understandable it can be. I built this to learn, and I want to - make sure I can keep learning with it. + Implementation notes: + - This is implemented in the dumbest way possible, in order to be the + most understandable it can be. 
I built this to learn, and I want to + make sure I can keep learning with it. + + - We tend to use tuples everywhere. This is because tuples can be + compared for equality and put into tables and all that jazz. They might + be a little bit slower in places but like I said, this is for + learning. (Also, if we need this to run faster we can probably go a + long way by memoizing results, which is much easier if we have tuples + everywhere.) """ def __init__(self, start, grammar): """Initialize the parser generator with the specified grammar and @@ -73,7 +97,7 @@ class GenerateLR0(object): # production for the start state. grammar[0] is always the start # rule, and in the set of states and table and whatever the first # element is always the starting state/position. - self.grammar = [('__start', start)] + grammar + self.grammar = [('__start', [start])] + grammar self.nonterminals = set(rule[0] for rule in grammar) self.terminals = set( sym @@ -121,7 +145,8 @@ class GenerateLR0(object): existing closure. If the provided config is already in the closure then nothing is - done. + done. (We assume that the closure of the config is *also* already in + the closure.) """ if config in closure: return closure @@ -192,7 +217,7 @@ class GenerateLR0(object): def gen_reduce_set(self, config): """Return the set of symbols that indicate we should reduce the given - config. + configuration. In an LR0 parser, this is just the set of all terminals.""" return self.terminals @@ -310,14 +335,85 @@ class GenerateLR0(object): return row[symbol][0] +def parse(table, input, trace=False): + """Parse the input with the generated parsing table and return the + concrete syntax tree. + + The parsing table can be generated by GenerateLR0.gen_table() or by any + of the other generators below. The parsing mechanism never changes, only + the table generation mechanism. + + input is a list of tokens. Don't stick an end-of-stream marker, I'll stick + one on for you. 
+ """ + assert '$' not in input + input = input + ['$'] + input_index = 0 + + # Our stack is a stack of tuples, where the first entry is the state number + # and the second entry is the 'value' that was generated when the state was + # pushed. + stack = [(0, None)] + while True: + current_state = stack[-1][0] + current_token = input[input_index] + + action = table[current_state].get(current_token, ('error',)) + if trace: + print("{stack: <20} {input: <50} {action: <5}".format( + stack=repr([s[0] for s in stack]), + input=repr(input[input_index:]), + action=repr(action) + )) + + if action[0] == 'accept': + return stack[-1][1] + + elif action[0] == 'reduce': + name = action[1] + size = action[2] + + value = (name, tuple(s[1] for s in stack[-size:])) + stack = stack[:-size] + + goto = table[stack[-1][0]].get(name, ('error',)) + assert goto[0] == 'goto' # Corrupt table? + stack.append((goto[1], value)) + + elif action[0] == 'shift': + stack.append((action[1], (current_token, ()))) + input_index += 1 + + elif action[0] == 'error': + raise ValueError( + 'Syntax error: unexpected symbol {sym}'.format( + sym=current_token, + ), + ) + + +############################################################################### +# SLR(1) +############################################################################### class GenerateSLR1(GenerateLR0): """Generate parse tables for SLR1 grammars. - boop + SLR1 parsers can recognize more than LR0 parsers, because they have a + little bit more information: instead of generating reduce actions for a + production on all possible inputs, as LR0 parsers do, they generate + reduce actions only for inputs that are in the 'follow' set of the + non-terminal. + + That means SLR1 parsers need to know how to generate 'follow(A)', which + means they need to know how to generate 'first(A)', which is most of the + code in this class. """ def gen_first_symbol(self, symbol, visited): """Compute the first set for a single symbol. 
+ If a symbol can be empty, then the set contains epsilon, which we + represent as python's `None`. + The first set is the set of tokens that can appear as the first token for a given symbol. (Obviously, if the symbol is itself a token, then this is trivial.) @@ -325,7 +421,7 @@ class GenerateSLR1(GenerateLR0): 'visited' is a set of already visited symbols, to stop infinite recursion on left-recursive grammars. That means that sometimes this function can return an empty tuple. Don't confuse that with a tuple - containing epsilon: that's a tuple containing 'None', not an empty + containing epsilon: that's a tuple containing `None`, not an empty tuple. """ if symbol in self.terminals: @@ -347,7 +443,7 @@ class GenerateSLR1(GenerateLR0): for fs in firsts: result = result + tuple(f for f in fs if f not in result) - return result + return tuple(sorted(result)) def gen_first(self, symbols, visited=None): """Compute the first set for a sequence of symbols. @@ -355,7 +451,7 @@ class GenerateSLR1(GenerateLR0): The first set is the set of tokens that can appear as the first token for this sequence of symbols. The interesting wrinkle in computing the first set for a sequence of symbols is that we keep computing the first - sets so long as Epsilon appears in the set. i.e., if we are computing + sets so long as epsilon appears in the set. i.e., if we are computing for ['A', 'B', 'C'] and the first set of 'A' contains epsilon, then the first set for the *sequence* also contains the first set of ['B', 'C'], since 'A' could be missing entirely. 
@@ -374,8 +470,9 @@ class GenerateSLR1(GenerateLR0): visited = set() result = self.gen_first_symbol(symbols[0], visited) if None in result: - result = tuple(set(s for s in result if s is not None)) + result = tuple(s for s in result if s is not None) result = result + self.gen_first(symbols[1:], visited) + result = tuple(sorted(set(result))) return result def gen_follow(self, symbol, visited=None): @@ -420,51 +517,77 @@ class GenerateSLR1(GenerateLR0): return self.gen_follow(config.name) -def parse(table, input, trace=False): - """Parse the input with the generated parsing table and return the - concrete syntax tree. +class GenerateLR1(GenerateSLR1): + """Generate parse tables for LR1, or "canonical LR" grammars. - input is a list of tokens. Don't stick an end-of-stream marker, I'll stick - one on for you. + LR1 parsers can recognize more than SLR parsers. Like SLR parsers, they + are choosier about when they reduce. But unlike SLR parsers, they specify + the terminals on which they reduce by carrying a 'lookahead' terminal in + the configuration. The lookahead of a configuration is computed as the + closure of a configuration set is computed, so see gen_closure_next for + details. (Except for the start configuration, which has '$' as its + lookahead.) """ - input = input + ['$'] - input_index = 0 - stack = [(0, None)] - while True: - current_state = stack[-1][0] - current_token = input[input_index] + def gen_reduce_set(self, config): + """Return the set of symbols that indicate we should reduce the given + config. 
- action = table[current_state].get(current_token, ('error',)) - if trace: - print("{stack: <20} {input: <50} {action: <5}".format( - stack=[s[0] for s in stack], - input=input[input_index:], - action=action - )) + In an LR1 parser, this is the lookahead of the configuration.""" + return config.lookahead - if action[0] == 'accept': - return stack[-1][1] + def gen_closure_next(self, config): + """Return the next set of configurations in the closure for + config. - elif action[0] == 'reduce': - name = action[1] - size = action[2] + In LR1 parsers, we must compute the lookahead for the configurations + we're adding to the closure. The lookahead for the new configurations + is the first() of the rest of this config's production. If that + contains epsilon, then the lookahead *also* contains the lookahead we + already have. (This lookahead was presumably generated by the same + process, so in some sense it is a 'parent' lookahead, or a lookahead + from an upstream production in the grammar.) - value = (name, tuple(s[1] for s in stack[-size:])) - stack = stack[:-size] + (See the documentation in GenerateLR0 for more information on how + this function fits into the whole process.) + """ + if config.at_end: + return () + else: + next = [] + for rule in self.grammar: + if rule[0] != config.next: + continue - goto = table[stack[-1][0]].get(name, ('error',)) - if (goto[0] != 'goto'): - raise ValueError('OH NOES GOTO') - stack.append((goto[1], value)) + # N.B.: We can't just append config.lookahead to config.rest + # and compute first(), because lookahead is a *set*. So + # in this case we just say if 'first' contains epsilon, + # then we need to remove the epsilon and union with the + # existing lookahead. 
+ lookahead = self.gen_first(config.rest) + if None in lookahead: + lookahead = tuple(l for l in lookahead if l is not None) + lookahead = lookahead + config.lookahead + lookahead = tuple(sorted(set(lookahead))) + next.append(Configuration.from_rule(rule, lookahead=lookahead)) - elif action[0] == 'shift': - stack.append((action[1], (current_token, ()))) - input_index += 1 + return tuple(next) - elif action[0] == 'error': - raise ValueError('OH NOES WAT') + def gen_all_sets(self): + """Generate all of the configuration sets for the grammar. + + In LR1 parsers, we must remember to set the lookahead of the start + symbol to '$'. + """ + initial_set = self.gen_closure( + Configuration.from_rule(self.grammar[0], lookahead=('$',)), + (), + ) + return self.gen_sets(initial_set, ()) +############################################################################### +# Formatting +############################################################################### def format_node(node): """Print out an indented concrete syntax tree, from parse().""" lines = [ @@ -493,11 +616,11 @@ def format_table(generator, table): header = " | {terms} | {nts}".format( terms=' '.join( '{0: <6}'.format(terminal) - for terminal in (generator.terminals) + for terminal in sorted(generator.terminals) ), nts=' '.join( '{0: <5}'.format(nt) - for nt in generator.nonterminals + for nt in sorted(generator.nonterminals) ), ) @@ -509,11 +632,11 @@ def format_table(generator, table): index=i, actions=' '.join( '{0: <6}'.format(format_action(row, terminal)) - for terminal in (generator.terminals) + for terminal in sorted(generator.terminals) ), gotos=' '.join( '{0: <5}'.format(row.get(nt, ('error', ''))[1]) - for nt in generator.nonterminals + for nt in sorted(generator.nonterminals) ), ) for i, row in enumerate(table) @@ -521,6 +644,9 @@ def format_table(generator, table): return '\n'.join(lines) +############################################################################### +# Examples 
+############################################################################### # OK, this is a very simple LR0 grammar. grammar_simple = [ ('E', ['E', '+', 'T']), @@ -580,7 +706,7 @@ tree = parse(table, ['id', '+', '(', 'id', '[', 'id', ']', ')']) print(format_node(tree) + "\n") # SLR1 can't handle this. -grammar_aho_ullman = [ +grammar_aho_ullman_1 = [ ('S', ['L', '=', 'R']), ('S', ['R']), ('L', ['*', 'R']), @@ -588,8 +714,19 @@ grammar_aho_ullman = [ ('R', ['L']), ] try: - gen = GenerateSLR1('S', grammar_aho_ullman) + gen = GenerateSLR1('S', grammar_aho_ullman_1) table = gen.gen_table() assert False except ValueError as e: print(e) + +# Here's an example with a full LR1 grammar, though. +grammar_aho_ullman_2 = [ + ('S', ['X', 'X']), + ('X', ['a', 'X']), + ('X', ['b']), +] +gen = GenerateLR1('S', grammar_aho_ullman_2) +table = gen.gen_table() +print(format_table(gen, table)) +parse(table, ['b', 'a', 'a', 'b'], trace=True)