Canonical LR1.
Also: - Reorganize the file into sections so I can keep track of where I am, and so it reads more cleanly from top to bottom. - A little more work on documentation and comments and the like.
This commit is contained in:
parent
c4be7bcd9f
commit
9fe44d30e0
1 changed files with 193 additions and 56 deletions
249
parser.py
249
parser.py
|
|
@ -2,15 +2,25 @@
|
|||
from collections import namedtuple
|
||||
|
||||
|
||||
###############################################################################
|
||||
# LR0
|
||||
#
|
||||
# We start with LR0 parsers, because they form the basis of everything else.
|
||||
###############################################################################
|
||||
class Configuration(
|
||||
namedtuple('Configuration', ['name', 'symbols', 'position'])
|
||||
namedtuple('Configuration', ['name', 'symbols', 'position', 'lookahead'])
|
||||
):
|
||||
"""A rule being tracked in a state."""
|
||||
__slots__ = ()
|
||||
|
||||
@classmethod
|
||||
def from_rule(cls, rule):
|
||||
return Configuration(name=rule[0], symbols=rule[1], position=0)
|
||||
def from_rule(cls, rule, lookahead=()):
|
||||
return Configuration(
|
||||
name=rule[0],
|
||||
symbols=rule[1],
|
||||
position=0,
|
||||
lookahead=lookahead,
|
||||
)
|
||||
|
||||
@property
|
||||
def at_end(self):
|
||||
|
|
@ -20,6 +30,10 @@ class Configuration(
|
|||
def next(self):
|
||||
return self.symbols[self.position] if not self.at_end else None
|
||||
|
||||
@property
|
||||
def rest(self):
|
||||
return self.symbols[(self.position+1):]
|
||||
|
||||
def at_symbol(self, symbol):
|
||||
return self.next == symbol
|
||||
|
||||
|
|
@ -27,12 +41,14 @@ class Configuration(
|
|||
return self._replace(**kwargs)
|
||||
|
||||
def __str__(self):
|
||||
return "{name} -> {bits}".format(
|
||||
la = ", " + str(self.lookahead) if self.lookahead != () else ""
|
||||
return "{name} -> {bits}{lookahead}".format(
|
||||
name=self.name,
|
||||
bits=' '.join([
|
||||
'* ' + sym if i == self.position else sym
|
||||
for i, sym in enumerate(self.symbols)
|
||||
]) + (' *' if self.at_end else '')
|
||||
]) + (' *' if self.at_end else ''),
|
||||
lookahead=la,
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -61,9 +77,17 @@ class GenerateLR0(object):
|
|||
|
||||
means that O can be matched with nothing.
|
||||
|
||||
Note that this is implemented in the dumbest way possible, in order to be
|
||||
the most understandable it can be. I built this to learn, and I want to
|
||||
make sure I can keep learning with it.
|
||||
Implementation notes:
|
||||
- This is implemented in the dumbest way possible, in order to be the
|
||||
most understandable it can be. I built this to learn, and I want to
|
||||
make sure I can keep learning with it.
|
||||
|
||||
- We tend to use tuples everywhere. This is because tuples can be
|
||||
compared for equality and put into tables and all that jazz. They might
|
||||
be a little bit slower in places but like I said, this is for
|
||||
learning. (Also, if we need this to run faster we can probably go a
|
||||
long way by memoizing results, which is much easier if we have tuples
|
||||
everywhere.)
|
||||
"""
|
||||
def __init__(self, start, grammar):
|
||||
"""Initialize the parser generator with the specified grammar and
|
||||
|
|
@ -73,7 +97,7 @@ class GenerateLR0(object):
|
|||
# production for the start state. grammar[0] is always the start
|
||||
# rule, and in the set of states and table and whatever the first
|
||||
# element is always the starting state/position.
|
||||
self.grammar = [('__start', start)] + grammar
|
||||
self.grammar = [('__start', [start])] + grammar
|
||||
self.nonterminals = set(rule[0] for rule in grammar)
|
||||
self.terminals = set(
|
||||
sym
|
||||
|
|
@ -121,7 +145,8 @@ class GenerateLR0(object):
|
|||
existing closure.
|
||||
|
||||
If the provided config is already in the closure then nothing is
|
||||
done.
|
||||
done. (We assume that the closure of the config is *also* already in
|
||||
the closure.)
|
||||
"""
|
||||
if config in closure:
|
||||
return closure
|
||||
|
|
@ -192,7 +217,7 @@ class GenerateLR0(object):
|
|||
|
||||
def gen_reduce_set(self, config):
|
||||
"""Return the set of symbols that indicate we should reduce the given
|
||||
config.
|
||||
configuration.
|
||||
|
||||
In an LR0 parser, this is just the set of all terminals."""
|
||||
return self.terminals
|
||||
|
|
@ -310,14 +335,85 @@ class GenerateLR0(object):
|
|||
return row[symbol][0]
|
||||
|
||||
|
||||
def parse(table, input, trace=False):
    """Parse the input with the generated parsing table and return the
    concrete syntax tree.

    The parsing table can be generated by GenerateLR0.gen_table() or by any
    of the other generators below. The parsing mechanism never changes, only
    the table generation mechanism.

    input is a list of tokens. Don't stick an end-of-stream marker, I'll stick
    one on for you.

    Raises ValueError on a syntax error, on a reserved '$' in the input, or
    on a corrupt table.
    """
    if '$' in input:
        # '$' is reserved as the end-of-stream marker. (This used to be an
        # assert, but asserts are stripped under `python -O`, so raise
        # explicitly instead.)
        raise ValueError("'$' is reserved as the end-of-stream marker")
    input = input + ['$']
    input_index = 0

    # Our stack is a stack of tuples, where the first entry is the state number
    # and the second entry is the 'value' that was generated when the state was
    # pushed.
    stack = [(0, None)]
    while True:
        current_state = stack[-1][0]
        current_token = input[input_index]

        action = table[current_state].get(current_token, ('error',))
        if trace:
            print("{stack: <20} {input: <50} {action: <5}".format(
                stack=repr([s[0] for s in stack]),
                input=repr(input[input_index:]),
                action=repr(action)
            ))

        if action[0] == 'accept':
            return stack[-1][1]

        elif action[0] == 'reduce':
            name = action[1]
            size = action[2]

            # Slice with an explicit index rather than stack[-size:]: for an
            # epsilon production size == 0, and stack[-0:] is the *whole*
            # stack rather than the empty suffix, which would corrupt the
            # parse.
            cut = len(stack) - size
            value = (name, tuple(s[1] for s in stack[cut:]))
            stack = stack[:cut]

            goto = table[stack[-1][0]].get(name, ('error',))
            if goto[0] != 'goto':
                # Corrupt table: every reduce must have a matching goto.
                raise ValueError(
                    'Corrupt table: no goto for {name}'.format(name=name),
                )
            stack.append((goto[1], value))

        elif action[0] == 'shift':
            stack.append((action[1], (current_token, ())))
            input_index += 1

        elif action[0] == 'error':
            raise ValueError(
                'Syntax error: unexpected symbol {sym}'.format(
                    sym=current_token,
                ),
            )
|
||||
|
||||
|
||||
###############################################################################
|
||||
# SLR(1)
|
||||
###############################################################################
|
||||
class GenerateSLR1(GenerateLR0):
|
||||
"""Generate parse tables for SLR1 grammars.
|
||||
|
||||
boop
|
||||
SLR1 parsers can recognize more than LR0 parsers, because they have a
|
||||
little bit more information: instead of generating reduce actions for a
|
||||
production on all possible inputs, as LR0 parsers do, they generate
|
||||
reduce actions only for inputs that are in the 'follow' set of the
|
||||
non-terminal.
|
||||
|
||||
That means SLR1 parsers need to know how to generate 'follow(A)', which
|
||||
means they need to know how to generate 'first(A)', which is most of the
|
||||
code in this class.
|
||||
"""
|
||||
def gen_first_symbol(self, symbol, visited):
|
||||
"""Compute the first set for a single symbol.
|
||||
|
||||
If a symbol can be empty, then the set contains epsilon, which we
|
||||
represent as python's `None`.
|
||||
|
||||
The first set is the set of tokens that can appear as the first token
|
||||
for a given symbol. (Obviously, if the symbol is itself a token, then
|
||||
this is trivial.)
|
||||
|
|
@ -325,7 +421,7 @@ class GenerateSLR1(GenerateLR0):
|
|||
'visited' is a set of already visited symbols, to stop infinite
|
||||
recursion on left-recursive grammars. That means that sometimes this
|
||||
function can return an empty tuple. Don't confuse that with a tuple
|
||||
containing epsilon: that's a tuple containing 'None', not an empty
|
||||
containing epsilon: that's a tuple containing `None`, not an empty
|
||||
tuple.
|
||||
"""
|
||||
if symbol in self.terminals:
|
||||
|
|
@ -347,7 +443,7 @@ class GenerateSLR1(GenerateLR0):
|
|||
for fs in firsts:
|
||||
result = result + tuple(f for f in fs if f not in result)
|
||||
|
||||
return result
|
||||
return tuple(sorted(result))
|
||||
|
||||
def gen_first(self, symbols, visited=None):
|
||||
"""Compute the first set for a sequence of symbols.
|
||||
|
|
@ -355,7 +451,7 @@ class GenerateSLR1(GenerateLR0):
|
|||
The first set is the set of tokens that can appear as the first token
|
||||
for this sequence of symbols. The interesting wrinkle in computing the
|
||||
first set for a sequence of symbols is that we keep computing the first
|
||||
sets so long as Epsilon appears in the set. i.e., if we are computing
|
||||
sets so long as epsilon appears in the set. i.e., if we are computing
|
||||
for ['A', 'B', 'C'] and the first set of 'A' contains epsilon, then the
|
||||
first set for the *sequence* also contains the first set of ['B', 'C'],
|
||||
since 'A' could be missing entirely.
|
||||
|
|
@ -374,8 +470,9 @@ class GenerateSLR1(GenerateLR0):
|
|||
visited = set()
|
||||
result = self.gen_first_symbol(symbols[0], visited)
|
||||
if None in result:
|
||||
result = tuple(set(s for s in result if s is not None))
|
||||
result = tuple(s for s in result if s is not None)
|
||||
result = result + self.gen_first(symbols[1:], visited)
|
||||
result = tuple(sorted(set(result)))
|
||||
return result
|
||||
|
||||
def gen_follow(self, symbol, visited=None):
|
||||
|
|
@ -420,51 +517,77 @@ class GenerateSLR1(GenerateLR0):
|
|||
return self.gen_follow(config.name)
|
||||
|
||||
|
||||
def parse(table, input, trace=False):
|
||||
"""Parse the input with the generated parsing table and return the
|
||||
concrete syntax tree.
|
||||
class GenerateLR1(GenerateSLR1):
|
||||
"""Generate parse tables for LR1, or "canonical LR" grammars.
|
||||
|
||||
input is a list of tokens. Don't stick an end-of-stream marker, I'll stick
|
||||
one on for you.
|
||||
LR1 parsers can recognize more than SLR parsers. Like SLR parsers, they
|
||||
are choosier about when they reduce. But unlike SLR parsers, they specify
|
||||
the terminals on which they reduce by carrying a 'lookahead' terminal in
|
||||
the configuration. The lookahead of a configuration is computed as the
|
||||
closure of a configuration set is computed, so see gen_closure_next for
|
||||
details. (Except for the start configuration, which has '$' as its
|
||||
lookahead.)
|
||||
"""
|
||||
input = input + ['$']
|
||||
input_index = 0
|
||||
stack = [(0, None)]
|
||||
while True:
|
||||
current_state = stack[-1][0]
|
||||
current_token = input[input_index]
|
||||
def gen_reduce_set(self, config):
|
||||
"""Return the set of symbols that indicate we should reduce the given
|
||||
config.
|
||||
|
||||
action = table[current_state].get(current_token, ('error',))
|
||||
if trace:
|
||||
print("{stack: <20} {input: <50} {action: <5}".format(
|
||||
stack=[s[0] for s in stack],
|
||||
input=input[input_index:],
|
||||
action=action
|
||||
))
|
||||
In an LR1 parser, this is the lookahead of the configuration."""
|
||||
return config.lookahead
|
||||
|
||||
if action[0] == 'accept':
|
||||
return stack[-1][1]
|
||||
def gen_closure_next(self, config):
|
||||
"""Return the next set of configurations in the closure for
|
||||
config.
|
||||
|
||||
elif action[0] == 'reduce':
|
||||
name = action[1]
|
||||
size = action[2]
|
||||
In LR1 parsers, we must compute the lookahead for the configurations
|
||||
we're adding to the closure. The lookahead for the new configurations
|
||||
is the first() of the rest of this config's production. If that
|
||||
contains epsilon, then the lookahead *also* contains the lookahead we
|
||||
already have. (This lookahead was presumably generated by the same
|
||||
process, so in some sense it is a 'parent' lookahead, or a lookahead
|
||||
from an upstream production in the grammar.)
|
||||
|
||||
value = (name, tuple(s[1] for s in stack[-size:]))
|
||||
stack = stack[:-size]
|
||||
(See the documentation in GenerateLR0 for more information on how
|
||||
this function fits into the whole process.)
|
||||
"""
|
||||
if config.at_end:
|
||||
return ()
|
||||
else:
|
||||
next = []
|
||||
for rule in self.grammar:
|
||||
if rule[0] != config.next:
|
||||
continue
|
||||
|
||||
goto = table[stack[-1][0]].get(name, ('error',))
|
||||
if (goto[0] != 'goto'):
|
||||
raise ValueError('OH NOES GOTO')
|
||||
stack.append((goto[1], value))
|
||||
# N.B.: We can't just append config.lookahead to config.rest
|
||||
# and compute first(), because lookahead is a *set*. So
|
||||
# in this case we just say if 'first' contains epsilon,
|
||||
# then we need to remove the epsilon and union with the
|
||||
# existing lookahead.
|
||||
lookahead = self.gen_first(config.rest)
|
||||
if None in lookahead:
|
||||
lookahead = tuple(l for l in lookahead if l is not None)
|
||||
lookahead = lookahead + config.lookahead
|
||||
lookahead = tuple(sorted(set(lookahead)))
|
||||
next.append(Configuration.from_rule(rule, lookahead=lookahead))
|
||||
|
||||
elif action[0] == 'shift':
|
||||
stack.append((action[1], (current_token, ())))
|
||||
input_index += 1
|
||||
return tuple(next)
|
||||
|
||||
elif action[0] == 'error':
|
||||
raise ValueError('OH NOES WAT')
|
||||
def gen_all_sets(self):
    """Generate all of the configuration sets for the grammar.

    In LR1 parsers, we must remember to set the lookahead of the start
    symbol to '$'.
    """
    start_config = Configuration.from_rule(
        self.grammar[0],
        lookahead=('$',),
    )
    initial_set = self.gen_closure(start_config, ())
    return self.gen_sets(initial_set, ())
|
||||
|
||||
|
||||
###############################################################################
|
||||
# Formatting
|
||||
###############################################################################
|
||||
def format_node(node):
|
||||
"""Print out an indented concrete syntax tree, from parse()."""
|
||||
lines = [
|
||||
|
|
@ -493,11 +616,11 @@ def format_table(generator, table):
|
|||
header = " | {terms} | {nts}".format(
|
||||
terms=' '.join(
|
||||
'{0: <6}'.format(terminal)
|
||||
for terminal in (generator.terminals)
|
||||
for terminal in sorted(generator.terminals)
|
||||
),
|
||||
nts=' '.join(
|
||||
'{0: <5}'.format(nt)
|
||||
for nt in generator.nonterminals
|
||||
for nt in sorted(generator.nonterminals)
|
||||
),
|
||||
)
|
||||
|
||||
|
|
@ -509,11 +632,11 @@ def format_table(generator, table):
|
|||
index=i,
|
||||
actions=' '.join(
|
||||
'{0: <6}'.format(format_action(row, terminal))
|
||||
for terminal in (generator.terminals)
|
||||
for terminal in sorted(generator.terminals)
|
||||
),
|
||||
gotos=' '.join(
|
||||
'{0: <5}'.format(row.get(nt, ('error', ''))[1])
|
||||
for nt in generator.nonterminals
|
||||
for nt in sorted(generator.nonterminals)
|
||||
),
|
||||
)
|
||||
for i, row in enumerate(table)
|
||||
|
|
@ -521,6 +644,9 @@ def format_table(generator, table):
|
|||
return '\n'.join(lines)
|
||||
|
||||
|
||||
###############################################################################
|
||||
# Examples
|
||||
###############################################################################
|
||||
# OK, this is a very simple LR0 grammar.
|
||||
grammar_simple = [
|
||||
('E', ['E', '+', 'T']),
|
||||
|
|
@ -580,7 +706,7 @@ tree = parse(table, ['id', '+', '(', 'id', '[', 'id', ']', ')'])
|
|||
print(format_node(tree) + "\n")
|
||||
|
||||
# SLR1 can't handle this.
|
||||
grammar_aho_ullman = [
|
||||
grammar_aho_ullman_1 = [
|
||||
('S', ['L', '=', 'R']),
|
||||
('S', ['R']),
|
||||
('L', ['*', 'R']),
|
||||
|
|
@ -588,8 +714,19 @@ grammar_aho_ullman = [
|
|||
('R', ['L']),
|
||||
]
|
||||
try:
|
||||
gen = GenerateSLR1('S', grammar_aho_ullman)
|
||||
gen = GenerateSLR1('S', grammar_aho_ullman_1)
|
||||
table = gen.gen_table()
|
||||
assert False
|
||||
except ValueError as e:
|
||||
print(e)
|
||||
|
||||
# Here's an example with a full LR1 grammar, though.
|
||||
grammar_aho_ullman_2 = [
|
||||
('S', ['X', 'X']),
|
||||
('X', ['a', 'X']),
|
||||
('X', ['b']),
|
||||
]
|
||||
gen = GenerateLR1('S', grammar_aho_ullman_2)
|
||||
table = gen.gen_table()
|
||||
print(format_table(gen, table))
|
||||
parse(table, ['b', 'a', 'a', 'b'], trace=True)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue