Canonical LR1.
Also: - Reorganize the file into sections so I can keep track of where I am, and so it reads more cleanly from top to bottom. - A little more work on documentation and comments and the like.
This commit is contained in:
parent
c4be7bcd9f
commit
9fe44d30e0
1 changed files with 193 additions and 56 deletions
249
parser.py
249
parser.py
|
|
@ -2,15 +2,25 @@
|
||||||
from collections import namedtuple
|
from collections import namedtuple
|
||||||
|
|
||||||
|
|
||||||
|
###############################################################################
|
||||||
|
# LR0
|
||||||
|
#
|
||||||
|
# We start with LR0 parsers, because they form the basis of everything else.
|
||||||
|
###############################################################################
|
||||||
class Configuration(
|
class Configuration(
|
||||||
namedtuple('Configuration', ['name', 'symbols', 'position'])
|
namedtuple('Configuration', ['name', 'symbols', 'position', 'lookahead'])
|
||||||
):
|
):
|
||||||
"""A rule being tracked in a state."""
|
"""A rule being tracked in a state."""
|
||||||
__slots__ = ()
|
__slots__ = ()
|
||||||
|
|
||||||
@classmethod
def from_rule(cls, rule, lookahead=()):
    """Build a Configuration at position 0 for a (name, symbols) rule.

    lookahead defaults to the empty tuple, which is what LR0/SLR1
    configurations use; LR1 configurations pass a real lookahead set.
    """
    rule_name, rule_symbols = rule[0], rule[1]
    return Configuration(
        name=rule_name,
        symbols=rule_symbols,
        position=0,
        lookahead=lookahead,
    )
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def at_end(self):
|
def at_end(self):
|
||||||
|
|
@ -20,6 +30,10 @@ class Configuration(
|
||||||
@property
def next(self):
    """The symbol at the position marker, or None when at the end."""
    return None if self.at_end else self.symbols[self.position]

@property
def rest(self):
    """The symbols that come after the position marker."""
    return self.symbols[self.position + 1:]

def at_symbol(self, symbol):
    """True if the position marker sits immediately before `symbol`."""
    return self.next == symbol
|
||||||
|
|
||||||
|
|
@ -27,12 +41,14 @@ class Configuration(
|
||||||
return self._replace(**kwargs)
|
return self._replace(**kwargs)
|
||||||
|
|
||||||
def __str__(self):
    """Render as 'Name -> a * b c, (lookahead)' with '*' at the position."""
    marked = []
    for i, sym in enumerate(self.symbols):
        marked.append('* ' + sym if i == self.position else sym)
    bits = ' '.join(marked) + (' *' if self.at_end else '')
    # Only show the lookahead when one was actually set (LR1 configs).
    la = ", " + str(self.lookahead) if self.lookahead != () else ""
    return "{name} -> {bits}{lookahead}".format(
        name=self.name,
        bits=bits,
        lookahead=la,
    )
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -61,9 +77,17 @@ class GenerateLR0(object):
|
||||||
|
|
||||||
means that O can be matched with nothing.
|
means that O can be matched with nothing.
|
||||||
|
|
||||||
Note that this is implemented in the dumbest way possible, in order to be
|
Implementation notes:
|
||||||
the most understandable it can be. I built this to learn, and I want to
|
- This is implemented in the dumbest way possible, in order to be the
|
||||||
make sure I can keep learning with it.
|
most understandable it can be. I built this to learn, and I want to
|
||||||
|
make sure I can keep learning with it.
|
||||||
|
|
||||||
|
- We tend to use tuples everywhere. This is because tuples can be
|
||||||
|
compared for equality and put into tables and all that jazz. They might
|
||||||
|
be a little bit slower in places but like I said, this is for
|
||||||
|
learning. (Also, if we need this to run faster we can probably go a
|
||||||
|
long way by memoizing results, which is much easier if we have tuples
|
||||||
|
everywhere.)
|
||||||
"""
|
"""
|
||||||
def __init__(self, start, grammar):
|
def __init__(self, start, grammar):
|
||||||
"""Initialize the parser generator with the specified grammar and
|
"""Initialize the parser generator with the specified grammar and
|
||||||
|
|
@ -73,7 +97,7 @@ class GenerateLR0(object):
|
||||||
# production for the start state. grammar[0] is always the start
|
# production for the start state. grammar[0] is always the start
|
||||||
# rule, and in the set of states and table and whatever the first
|
# rule, and in the set of states and table and whatever the first
|
||||||
# element is always the starting state/position.
|
# element is always the starting state/position.
|
||||||
self.grammar = [('__start', start)] + grammar
|
self.grammar = [('__start', [start])] + grammar
|
||||||
self.nonterminals = set(rule[0] for rule in grammar)
|
self.nonterminals = set(rule[0] for rule in grammar)
|
||||||
self.terminals = set(
|
self.terminals = set(
|
||||||
sym
|
sym
|
||||||
|
|
@ -121,7 +145,8 @@ class GenerateLR0(object):
|
||||||
existing closure.
|
existing closure.
|
||||||
|
|
||||||
If the provided config is already in the closure then nothing is
|
If the provided config is already in the closure then nothing is
|
||||||
done.
|
done. (We assume that the closure of the config is *also* already in
|
||||||
|
the closure.)
|
||||||
"""
|
"""
|
||||||
if config in closure:
|
if config in closure:
|
||||||
return closure
|
return closure
|
||||||
|
|
@ -192,7 +217,7 @@ class GenerateLR0(object):
|
||||||
|
|
||||||
def gen_reduce_set(self, config):
    """Return the set of symbols that indicate we should reduce the given
    configuration.

    In an LR0 parser, this is just the set of all terminals.
    """
    return self.terminals
|
||||||
|
|
@ -310,14 +335,85 @@ class GenerateLR0(object):
|
||||||
return row[symbol][0]
|
return row[symbol][0]
|
||||||
|
|
||||||
|
|
||||||
|
def parse(table, input, trace=False):
    """Parse the input with the generated parsing table and return the
    concrete syntax tree.

    The parsing table can be generated by GenerateLR0.gen_table() or by any
    of the other generators below. The parsing mechanism never changes, only
    the table generation mechanism.

    input is a list of tokens. Don't stick an end-of-stream marker, I'll stick
    one on for you.

    Raises ValueError on a syntax error.
    """
    assert '$' not in input
    input = input + ['$']
    input_index = 0

    # Our stack is a stack of tuples, where the first entry is the state number
    # and the second entry is the 'value' that was generated when the state was
    # pushed.
    stack = [(0, None)]
    while True:
        current_state = stack[-1][0]
        current_token = input[input_index]

        action = table[current_state].get(current_token, ('error',))
        if trace:
            print("{stack: <20} {input: <50} {action: <5}".format(
                stack=repr([s[0] for s in stack]),
                input=repr(input[input_index:]),
                action=repr(action)
            ))

        if action[0] == 'accept':
            return stack[-1][1]

        elif action[0] == 'reduce':
            name = action[1]
            size = action[2]

            # Guard size == 0 explicitly: for an epsilon production,
            # stack[-0:] is the *entire* stack and stack[:-0] empties it,
            # which would corrupt the parse. An epsilon reduce pops
            # nothing and produces a node with no children.
            if size:
                value = (name, tuple(s[1] for s in stack[-size:]))
                stack = stack[:-size]
            else:
                value = (name, ())

            goto = table[stack[-1][0]].get(name, ('error',))
            assert goto[0] == 'goto'  # Corrupt table?
            stack.append((goto[1], value))

        elif action[0] == 'shift':
            stack.append((action[1], (current_token, ())))
            input_index += 1

        elif action[0] == 'error':
            raise ValueError(
                'Syntax error: unexpected symbol {sym}'.format(
                    sym=current_token,
                ),
            )
|
||||||
|
|
||||||
|
|
||||||
|
###############################################################################
|
||||||
|
# SLR(1)
|
||||||
|
###############################################################################
|
||||||
class GenerateSLR1(GenerateLR0):
|
class GenerateSLR1(GenerateLR0):
|
||||||
"""Generate parse tables for SLR1 grammars.
|
"""Generate parse tables for SLR1 grammars.
|
||||||
|
|
||||||
boop
|
SLR1 parsers can recognize more than LR0 parsers, because they have a
|
||||||
|
little bit more information: instead of generating reduce actions for a
|
||||||
|
production on all possible inputs, as LR0 parsers do, they generate
|
||||||
|
reduce actions only for inputs that are in the 'follow' set of the
|
||||||
|
non-terminal.
|
||||||
|
|
||||||
|
That means SLR1 parsers need to know how to generate 'follow(A)', which
|
||||||
|
means they need to know how to generate 'first(A)', which is most of the
|
||||||
|
code in this class.
|
||||||
"""
|
"""
|
||||||
def gen_first_symbol(self, symbol, visited):
|
def gen_first_symbol(self, symbol, visited):
|
||||||
"""Compute the first set for a single symbol.
|
"""Compute the first set for a single symbol.
|
||||||
|
|
||||||
|
If a symbol can be empty, then the set contains epsilon, which we
|
||||||
|
represent as python's `None`.
|
||||||
|
|
||||||
The first set is the set of tokens that can appear as the first token
|
The first set is the set of tokens that can appear as the first token
|
||||||
for a given symbol. (Obviously, if the symbol is itself a token, then
|
for a given symbol. (Obviously, if the symbol is itself a token, then
|
||||||
this is trivial.)
|
this is trivial.)
|
||||||
|
|
@ -325,7 +421,7 @@ class GenerateSLR1(GenerateLR0):
|
||||||
'visited' is a set of already visited symbols, to stop infinite
|
'visited' is a set of already visited symbols, to stop infinite
|
||||||
recursion on left-recursive grammars. That means that sometimes this
|
recursion on left-recursive grammars. That means that sometimes this
|
||||||
function can return an empty tuple. Don't confuse that with a tuple
|
function can return an empty tuple. Don't confuse that with a tuple
|
||||||
containing epsilon: that's a tuple containing 'None', not an empty
|
containing epsilon: that's a tuple containing `None`, not an empty
|
||||||
tuple.
|
tuple.
|
||||||
"""
|
"""
|
||||||
if symbol in self.terminals:
|
if symbol in self.terminals:
|
||||||
|
|
@ -347,7 +443,7 @@ class GenerateSLR1(GenerateLR0):
|
||||||
for fs in firsts:
|
for fs in firsts:
|
||||||
result = result + tuple(f for f in fs if f not in result)
|
result = result + tuple(f for f in fs if f not in result)
|
||||||
|
|
||||||
return result
|
return tuple(sorted(result))
|
||||||
|
|
||||||
def gen_first(self, symbols, visited=None):
|
def gen_first(self, symbols, visited=None):
|
||||||
"""Compute the first set for a sequence of symbols.
|
"""Compute the first set for a sequence of symbols.
|
||||||
|
|
@ -355,7 +451,7 @@ class GenerateSLR1(GenerateLR0):
|
||||||
The first set is the set of tokens that can appear as the first token
|
The first set is the set of tokens that can appear as the first token
|
||||||
for this sequence of symbols. The interesting wrinkle in computing the
|
for this sequence of symbols. The interesting wrinkle in computing the
|
||||||
first set for a sequence of symbols is that we keep computing the first
|
first set for a sequence of symbols is that we keep computing the first
|
||||||
sets so long as Epsilon appears in the set. i.e., if we are computing
|
sets so long as epsilon appears in the set. i.e., if we are computing
|
||||||
for ['A', 'B', 'C'] and the first set of 'A' contains epsilon, then the
|
for ['A', 'B', 'C'] and the first set of 'A' contains epsilon, then the
|
||||||
first set for the *sequence* also contains the first set of ['B', 'C'],
|
first set for the *sequence* also contains the first set of ['B', 'C'],
|
||||||
since 'A' could be missing entirely.
|
since 'A' could be missing entirely.
|
||||||
|
|
@ -374,8 +470,9 @@ class GenerateSLR1(GenerateLR0):
|
||||||
visited = set()
|
visited = set()
|
||||||
result = self.gen_first_symbol(symbols[0], visited)
|
result = self.gen_first_symbol(symbols[0], visited)
|
||||||
if None in result:
|
if None in result:
|
||||||
result = tuple(set(s for s in result if s is not None))
|
result = tuple(s for s in result if s is not None)
|
||||||
result = result + self.gen_first(symbols[1:], visited)
|
result = result + self.gen_first(symbols[1:], visited)
|
||||||
|
result = tuple(sorted(set(result)))
|
||||||
return result
|
return result
|
||||||
|
|
||||||
def gen_follow(self, symbol, visited=None):
|
def gen_follow(self, symbol, visited=None):
|
||||||
|
|
@ -420,51 +517,77 @@ class GenerateSLR1(GenerateLR0):
|
||||||
return self.gen_follow(config.name)
|
return self.gen_follow(config.name)
|
||||||
|
|
||||||
|
|
||||||
def parse(table, input, trace=False):
|
class GenerateLR1(GenerateSLR1):
    """Generate parse tables for LR1, or "canonical LR" grammars.

    LR1 parsers can recognize more than SLR parsers. Like SLR parsers, they
    are choosier about when they reduce. But unlike SLR parsers, they specify
    the terminals on which they reduce by carrying a 'lookahead' terminal in
    the configuration. The lookahead of a configuration is computed as the
    closure of a configuration set is computed, so see gen_closure_next for
    details. (Except for the start configuration, which has '$' as its
    lookahead.)
    """
    def gen_reduce_set(self, config):
        """Return the set of symbols that indicate we should reduce the given
        config.

        In an LR1 parser, this is the lookahead of the configuration."""
        return config.lookahead

    def gen_closure_next(self, config):
        """Return the next set of configurations in the closure for config.

        In LR1 parsers, we must compute the lookahead for the configurations
        we're adding to the closure. The lookahead for the new configurations
        is the first() of the rest of this config's production. If that
        contains epsilon, then the lookahead *also* contains the lookahead we
        already have. (This lookahead was presumably generated by the same
        process, so in some sense it is a 'parent' lookahead, or a lookahead
        from an upstream production in the grammar.)

        (See the documentation in GenerateLR0 for more information on how
        this function fits into the whole process.)
        """
        if config.at_end:
            return ()

        configs = []
        for rule in self.grammar:
            if rule[0] != config.next:
                continue

            # N.B.: We can't just append config.lookahead to config.rest
            # and compute first(), because lookahead is a *set*. So
            # in this case we just say if 'first' contains epsilon,
            # then we need to remove the epsilon and union with the
            # existing lookahead.
            lookahead = self.gen_first(config.rest)
            if None in lookahead:
                lookahead = tuple(la for la in lookahead if la is not None)
                lookahead = tuple(sorted(set(lookahead + config.lookahead)))
            configs.append(Configuration.from_rule(rule, lookahead=lookahead))

        return tuple(configs)

    def gen_all_sets(self):
        """Generate all of the configuration sets for the grammar.

        In LR1 parsers, we must remember to set the lookahead of the start
        symbol to '$'.
        """
        initial = self.gen_closure(
            Configuration.from_rule(self.grammar[0], lookahead=('$',)),
            (),
        )
        return self.gen_sets(initial, ())
|
||||||
|
|
||||||
|
|
||||||
|
###############################################################################
|
||||||
|
# Formatting
|
||||||
|
###############################################################################
|
||||||
def format_node(node):
|
def format_node(node):
|
||||||
"""Print out an indented concrete syntax tree, from parse()."""
|
"""Print out an indented concrete syntax tree, from parse()."""
|
||||||
lines = [
|
lines = [
|
||||||
|
|
@ -493,11 +616,11 @@ def format_table(generator, table):
|
||||||
header = " | {terms} | {nts}".format(
|
header = " | {terms} | {nts}".format(
|
||||||
terms=' '.join(
|
terms=' '.join(
|
||||||
'{0: <6}'.format(terminal)
|
'{0: <6}'.format(terminal)
|
||||||
for terminal in (generator.terminals)
|
for terminal in sorted(generator.terminals)
|
||||||
),
|
),
|
||||||
nts=' '.join(
|
nts=' '.join(
|
||||||
'{0: <5}'.format(nt)
|
'{0: <5}'.format(nt)
|
||||||
for nt in generator.nonterminals
|
for nt in sorted(generator.nonterminals)
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -509,11 +632,11 @@ def format_table(generator, table):
|
||||||
index=i,
|
index=i,
|
||||||
actions=' '.join(
|
actions=' '.join(
|
||||||
'{0: <6}'.format(format_action(row, terminal))
|
'{0: <6}'.format(format_action(row, terminal))
|
||||||
for terminal in (generator.terminals)
|
for terminal in sorted(generator.terminals)
|
||||||
),
|
),
|
||||||
gotos=' '.join(
|
gotos=' '.join(
|
||||||
'{0: <5}'.format(row.get(nt, ('error', ''))[1])
|
'{0: <5}'.format(row.get(nt, ('error', ''))[1])
|
||||||
for nt in generator.nonterminals
|
for nt in sorted(generator.nonterminals)
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
for i, row in enumerate(table)
|
for i, row in enumerate(table)
|
||||||
|
|
@ -521,6 +644,9 @@ def format_table(generator, table):
|
||||||
return '\n'.join(lines)
|
return '\n'.join(lines)
|
||||||
|
|
||||||
|
|
||||||
|
###############################################################################
|
||||||
|
# Examples
|
||||||
|
###############################################################################
|
||||||
# OK, this is a very simple LR0 grammar.
|
# OK, this is a very simple LR0 grammar.
|
||||||
grammar_simple = [
|
grammar_simple = [
|
||||||
('E', ['E', '+', 'T']),
|
('E', ['E', '+', 'T']),
|
||||||
|
|
@ -580,7 +706,7 @@ tree = parse(table, ['id', '+', '(', 'id', '[', 'id', ']', ')'])
|
||||||
print(format_node(tree) + "\n")
|
print(format_node(tree) + "\n")
|
||||||
|
|
||||||
# SLR1 can't handle this.
|
# SLR1 can't handle this.
|
||||||
grammar_aho_ullman = [
|
grammar_aho_ullman_1 = [
|
||||||
('S', ['L', '=', 'R']),
|
('S', ['L', '=', 'R']),
|
||||||
('S', ['R']),
|
('S', ['R']),
|
||||||
('L', ['*', 'R']),
|
('L', ['*', 'R']),
|
||||||
|
|
@ -588,8 +714,19 @@ grammar_aho_ullman = [
|
||||||
('R', ['L']),
|
('R', ['L']),
|
||||||
]
|
]
|
||||||
# SLR1 cannot handle grammar_aho_ullman_1: building its table must fail
# with a conflict. Use `raise` instead of `assert False` so the check
# still runs under `python -O` (asserts are stripped there).
try:
    gen = GenerateSLR1('S', grammar_aho_ullman_1)
    table = gen.gen_table()
    raise AssertionError('expected a table-generation conflict')
except ValueError as e:
    print(e)

# Here's an example with a full LR1 grammar, though.
grammar_aho_ullman_2 = [
    ('S', ['X', 'X']),
    ('X', ['a', 'X']),
    ('X', ['b']),
]
gen = GenerateLR1('S', grammar_aho_ullman_2)
table = gen.gen_table()
print(format_table(gen, table))
parse(table, ['b', 'a', 'a', 'b'], trace=True)
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue