Canonical LR1.

Also:

- Reorganize the file into sections so I can keep track of where I am,
  and so it reads more cleanly from top to bottom.

- A little more work on documentation and comments and the like.
This commit is contained in:
John Doty 2016-12-09 08:54:28 -08:00
parent c4be7bcd9f
commit 9fe44d30e0

249
parser.py
View file

@ -2,15 +2,25 @@
from collections import namedtuple from collections import namedtuple
###############################################################################
# LR0
#
# We start with LR0 parsers, because they form the basis of everything else.
###############################################################################
class Configuration( class Configuration(
namedtuple('Configuration', ['name', 'symbols', 'position']) namedtuple('Configuration', ['name', 'symbols', 'position', 'lookahead'])
): ):
"""A rule being tracked in a state.""" """A rule being tracked in a state."""
__slots__ = () __slots__ = ()
@classmethod @classmethod
def from_rule(cls, rule): def from_rule(cls, rule, lookahead=()):
return Configuration(name=rule[0], symbols=rule[1], position=0) return Configuration(
name=rule[0],
symbols=rule[1],
position=0,
lookahead=lookahead,
)
@property @property
def at_end(self): def at_end(self):
@ -20,6 +30,10 @@ class Configuration(
def next(self): def next(self):
return self.symbols[self.position] if not self.at_end else None return self.symbols[self.position] if not self.at_end else None
@property
def rest(self):
return self.symbols[(self.position+1):]
def at_symbol(self, symbol): def at_symbol(self, symbol):
return self.next == symbol return self.next == symbol
@ -27,12 +41,14 @@ class Configuration(
return self._replace(**kwargs) return self._replace(**kwargs)
def __str__(self): def __str__(self):
return "{name} -> {bits}".format( la = ", " + str(self.lookahead) if self.lookahead != () else ""
return "{name} -> {bits}{lookahead}".format(
name=self.name, name=self.name,
bits=' '.join([ bits=' '.join([
'* ' + sym if i == self.position else sym '* ' + sym if i == self.position else sym
for i, sym in enumerate(self.symbols) for i, sym in enumerate(self.symbols)
]) + (' *' if self.at_end else '') ]) + (' *' if self.at_end else ''),
lookahead=la,
) )
@ -61,9 +77,17 @@ class GenerateLR0(object):
means that O can be matched with nothing. means that O can be matched with nothing.
Note that this is implemented in the dumbest way possible, in order to be Implementation nodes:
the most understandable it can be. I built this to learn, and I want to - This is implemented in the dumbest way possible, in order to be the
make sure I can keep learning with it. most understandable it can be. I built this to learn, and I want to
make sure I can keep learning with it.
- We tend to use tuples everywhere. This is because tuples can be
compared for equality and put into tables and all that jazz. They might
be a little bit slower in places but like I said, this is for
learning. (Also, if we need this to run faster we can probably go a
long way by memoizing results, which is much easier if we have tuples
everywhere.)
""" """
def __init__(self, start, grammar): def __init__(self, start, grammar):
"""Initialize the parser generator with the specified grammar and """Initialize the parser generator with the specified grammar and
@ -73,7 +97,7 @@ class GenerateLR0(object):
# production for the start state. grammar[0] is always the start # production for the start state. grammar[0] is always the start
# rule, and in the set of states and table and whatever the first # rule, and in the set of states and table and whatever the first
# element is always the starting state/position. # element is always the starting state/position.
self.grammar = [('__start', start)] + grammar self.grammar = [('__start', [start])] + grammar
self.nonterminals = set(rule[0] for rule in grammar) self.nonterminals = set(rule[0] for rule in grammar)
self.terminals = set( self.terminals = set(
sym sym
@ -121,7 +145,8 @@ class GenerateLR0(object):
existing closure. existing closure.
If the provided config is already in the closure then nothing is If the provided config is already in the closure then nothing is
done. done. (We assume that the closure of the config is *also* already in
the closure.)
""" """
if config in closure: if config in closure:
return closure return closure
@ -192,7 +217,7 @@ class GenerateLR0(object):
def gen_reduce_set(self, config): def gen_reduce_set(self, config):
"""Return the set of symbols that indicate we should reduce the given """Return the set of symbols that indicate we should reduce the given
config. configuration.
In an LR0 parser, this is just the set of all terminals.""" In an LR0 parser, this is just the set of all terminals."""
return self.terminals return self.terminals
@ -310,14 +335,85 @@ class GenerateLR0(object):
return row[symbol][0] return row[symbol][0]
def parse(table, input, trace=False):
    """Parse the input with the generated parsing table and return the
    concrete syntax tree.

    The parsing table can be generated by GenerateLR0.gen_table() or by any
    of the other generators below. The parsing mechanism never changes, only
    the table generation mechanism.

    input is a list of tokens. Don't stick an end-of-stream marker, I'll
    stick one on for you.

    Raises ValueError if the input contains the reserved end-of-stream
    marker '$', or if the input has a syntax error.
    """
    # '$' is reserved as the end-of-stream marker. Validate with a real
    # exception rather than `assert`, which disappears under `python -O`.
    if '$' in input:
        raise ValueError("'$' is reserved as the end-of-stream marker")
    input = input + ['$']
    input_index = 0

    # Our stack is a stack of tuples, where the first entry is the state
    # number and the second entry is the 'value' that was generated when
    # the state was pushed.
    stack = [(0, None)]
    while True:
        current_state = stack[-1][0]
        current_token = input[input_index]

        action = table[current_state].get(current_token, ('error',))
        if trace:
            print("{stack: <20} {input: <50} {action: <5}".format(
                stack=repr([s[0] for s in stack]),
                input=repr(input[input_index:]),
                action=repr(action)
            ))

        if action[0] == 'accept':
            # The final value on the stack is the root of the tree.
            return stack[-1][1]

        elif action[0] == 'reduce':
            name = action[1]
            size = action[2]
            # Gather the values of the popped states into a new tree node.
            value = (name, tuple(s[1] for s in stack[-size:]))
            stack = stack[:-size]
            goto = table[stack[-1][0]].get(name, ('error',))
            assert goto[0] == 'goto'  # Corrupt table?
            stack.append((goto[1], value))

        elif action[0] == 'shift':
            # Tokens become leaf nodes: (token, ()).
            stack.append((action[1], (current_token, ())))
            input_index += 1

        elif action[0] == 'error':
            raise ValueError(
                'Syntax error: unexpected symbol {sym}'.format(
                    sym=current_token,
                ),
            )
###############################################################################
# SLR(1)
###############################################################################
class GenerateSLR1(GenerateLR0): class GenerateSLR1(GenerateLR0):
"""Generate parse tables for SLR1 grammars. """Generate parse tables for SLR1 grammars.
boop SLR1 parsers can recognize more than LR0 parsers, because they have a
little bit more information: instead of generating reduce actions for a
production on all possible inputs, as LR0 parsers do, they generate
reduce actions only for inputs that are in the 'follow' set of the
non-terminal.
That means SLR1 parsers need to know how to generate 'follow(A)', which
means they need to know how to generate 'first(A)', which is most of the
code in this class.
""" """
def gen_first_symbol(self, symbol, visited): def gen_first_symbol(self, symbol, visited):
"""Compute the first set for a single symbol. """Compute the first set for a single symbol.
If a symbol can be empty, then the set contains epsilon, which we
represent as python's `None`.
The first set is the set of tokens that can appear as the first token The first set is the set of tokens that can appear as the first token
for a given symbol. (Obviously, if the symbol is itself a token, then for a given symbol. (Obviously, if the symbol is itself a token, then
this is trivial.) this is trivial.)
@ -325,7 +421,7 @@ class GenerateSLR1(GenerateLR0):
'visited' is a set of already visited symbols, to stop infinite 'visited' is a set of already visited symbols, to stop infinite
recursion on left-recursive grammars. That means that sometimes this recursion on left-recursive grammars. That means that sometimes this
function can return an empty tuple. Don't confuse that with a tuple function can return an empty tuple. Don't confuse that with a tuple
containing epsilon: that's a tuple containing 'None', not an empty containing epsilon: that's a tuple containing `None`, not an empty
tuple. tuple.
""" """
if symbol in self.terminals: if symbol in self.terminals:
@ -347,7 +443,7 @@ class GenerateSLR1(GenerateLR0):
for fs in firsts: for fs in firsts:
result = result + tuple(f for f in fs if f not in result) result = result + tuple(f for f in fs if f not in result)
return result return tuple(sorted(result))
def gen_first(self, symbols, visited=None): def gen_first(self, symbols, visited=None):
"""Compute the first set for a sequence of symbols. """Compute the first set for a sequence of symbols.
@ -355,7 +451,7 @@ class GenerateSLR1(GenerateLR0):
The first set is the set of tokens that can appear as the first token The first set is the set of tokens that can appear as the first token
for this sequence of symbols. The interesting wrinkle in computing the for this sequence of symbols. The interesting wrinkle in computing the
first set for a sequence of symbols is that we keep computing the first first set for a sequence of symbols is that we keep computing the first
sets so long as Epsilon appears in the set. i.e., if we are computing sets so long as epsilon appears in the set. i.e., if we are computing
for ['A', 'B', 'C'] and the first set of 'A' contains epsilon, then the for ['A', 'B', 'C'] and the first set of 'A' contains epsilon, then the
first set for the *sequence* also contains the first set of ['B', 'C'], first set for the *sequence* also contains the first set of ['B', 'C'],
since 'A' could be missing entirely. since 'A' could be missing entirely.
@ -374,8 +470,9 @@ class GenerateSLR1(GenerateLR0):
visited = set() visited = set()
result = self.gen_first_symbol(symbols[0], visited) result = self.gen_first_symbol(symbols[0], visited)
if None in result: if None in result:
result = tuple(set(s for s in result if s is not None)) result = tuple(s for s in result if s is not None)
result = result + self.gen_first(symbols[1:], visited) result = result + self.gen_first(symbols[1:], visited)
result = tuple(sorted(set(result)))
return result return result
def gen_follow(self, symbol, visited=None): def gen_follow(self, symbol, visited=None):
@ -420,51 +517,77 @@ class GenerateSLR1(GenerateLR0):
return self.gen_follow(config.name) return self.gen_follow(config.name)
def parse(table, input, trace=False): class GenerateLR1(GenerateSLR1):
"""Parse the input with the generated parsing table and return the """Generate parse tables for LR1, or "canonical LR" grammars.
concrete syntax tree.
input is a list of tokens. Don't stick an end-of-stream marker, I'll stick LR1 parsers can recognize more than SLR parsers. Like SLR parsers, they
one on for you. are choosier about when they reduce. But unlike SLR parsers, they specify
the terminals on which they reduce by carrying a 'lookahead' terminal in
the configuration. The lookahead of a configuration is computed as the
closure of a configuration set is computed, so see gen_closure_next for
details. (Except for the start configuration, which has '$' as its
lookahead.)
""" """
input = input + ['$'] def gen_reduce_set(self, config):
input_index = 0 """Return the set of symbols that indicate we should reduce the given
stack = [(0, None)] config.
while True:
current_state = stack[-1][0]
current_token = input[input_index]
action = table[current_state].get(current_token, ('error',)) In an LR1 parser, this is the lookahead of the configuration."""
if trace: return config.lookahead
print("{stack: <20} {input: <50} {action: <5}".format(
stack=[s[0] for s in stack],
input=input[input_index:],
action=action
))
if action[0] == 'accept': def gen_closure_next(self, config):
return stack[-1][1] """Return the next set of configurations in the closure for
config.
elif action[0] == 'reduce': In LR1 parsers, we must compute the lookahead for the configurations
name = action[1] we're adding to the closure. The lookahead for the new configurations
size = action[2] is the first() of the rest of this config's production. If that
contains epsilon, then the lookahead *also* contains the lookahead we
already have. (This lookahead was presumably generated by the same
process, so in some sense it is a 'parent' lookahead, or a lookahead
from an upstream production in the grammar.)
value = (name, tuple(s[1] for s in stack[-size:])) (See the documentation in GenerateLR0 for more information on how
stack = stack[:-size] this function fits into the whole process.)
"""
if config.at_end:
return ()
else:
next = []
for rule in self.grammar:
if rule[0] != config.next:
continue
goto = table[stack[-1][0]].get(name, ('error',)) # N.B.: We can't just append config.lookahead to config.rest
if (goto[0] != 'goto'): # and compute first(), because lookahead is a *set*. So
raise ValueError('OH NOES GOTO') # in this case we just say if 'first' contains epsilon,
stack.append((goto[1], value)) # then we need to remove the epsilon and union with the
# existing lookahead.
lookahead = self.gen_first(config.rest)
if None in lookahead:
lookahead = tuple(l for l in lookahead if l is not None)
lookahead = lookahead + config.lookahead
lookahead = tuple(sorted(set(lookahead)))
next.append(Configuration.from_rule(rule, lookahead=lookahead))
elif action[0] == 'shift': return tuple(next)
stack.append((action[1], (current_token, ())))
input_index += 1
elif action[0] == 'error': def gen_all_sets(self):
raise ValueError('OH NOES WAT') """Generate all of the configuration sets for the grammar.
In LR1 parsers, we must remember to set the lookahead of the start
symbol to '$'.
"""
initial_set = self.gen_closure(
Configuration.from_rule(self.grammar[0], lookahead=('$',)),
(),
)
return self.gen_sets(initial_set, ())
###############################################################################
# Formatting
###############################################################################
def format_node(node): def format_node(node):
"""Print out an indented concrete syntax tree, from parse().""" """Print out an indented concrete syntax tree, from parse()."""
lines = [ lines = [
@ -493,11 +616,11 @@ def format_table(generator, table):
header = " | {terms} | {nts}".format( header = " | {terms} | {nts}".format(
terms=' '.join( terms=' '.join(
'{0: <6}'.format(terminal) '{0: <6}'.format(terminal)
for terminal in (generator.terminals) for terminal in sorted(generator.terminals)
), ),
nts=' '.join( nts=' '.join(
'{0: <5}'.format(nt) '{0: <5}'.format(nt)
for nt in generator.nonterminals for nt in sorted(generator.nonterminals)
), ),
) )
@ -509,11 +632,11 @@ def format_table(generator, table):
index=i, index=i,
actions=' '.join( actions=' '.join(
'{0: <6}'.format(format_action(row, terminal)) '{0: <6}'.format(format_action(row, terminal))
for terminal in (generator.terminals) for terminal in sorted(generator.terminals)
), ),
gotos=' '.join( gotos=' '.join(
'{0: <5}'.format(row.get(nt, ('error', ''))[1]) '{0: <5}'.format(row.get(nt, ('error', ''))[1])
for nt in generator.nonterminals for nt in sorted(generator.nonterminals)
), ),
) )
for i, row in enumerate(table) for i, row in enumerate(table)
@ -521,6 +644,9 @@ def format_table(generator, table):
return '\n'.join(lines) return '\n'.join(lines)
###############################################################################
# Examples
###############################################################################
# OK, this is a very simple LR0 grammar. # OK, this is a very simple LR0 grammar.
grammar_simple = [ grammar_simple = [
('E', ['E', '+', 'T']), ('E', ['E', '+', 'T']),
@ -580,7 +706,7 @@ tree = parse(table, ['id', '+', '(', 'id', '[', 'id', ']', ')'])
print(format_node(tree) + "\n") print(format_node(tree) + "\n")
# SLR1 can't handle this. # SLR1 can't handle this.
grammar_aho_ullman = [ grammar_aho_ullman_1 = [
('S', ['L', '=', 'R']), ('S', ['L', '=', 'R']),
('S', ['R']), ('S', ['R']),
('L', ['*', 'R']), ('L', ['*', 'R']),
@ -588,8 +714,19 @@ grammar_aho_ullman = [
('R', ['L']), ('R', ['L']),
] ]
try: try:
gen = GenerateSLR1('S', grammar_aho_ullman) gen = GenerateSLR1('S', grammar_aho_ullman_1)
table = gen.gen_table() table = gen.gen_table()
assert False assert False
except ValueError as e: except ValueError as e:
print(e) print(e)
# Here's an example with a full LR1 grammar, though.
# NOTE(review): the variable name suggests this comes from Aho & Ullman's
# compiler text — confirm. The grammar is not SLR(1): the lookahead carried
# in each LR1 configuration is what disambiguates the reductions.
grammar_aho_ullman_2 = [
('S', ['X', 'X']),
('X', ['a', 'X']),
('X', ['b']),
]
# Build the canonical LR(1) table, print it, and trace a parse of "baab".
gen = GenerateLR1('S', grammar_aho_ullman_2)
table = gen.gen_table()
print(format_table(gen, table))
parse(table, ['b', 'a', 'a', 'b'], trace=True)