From 9781765043e794c758b16629304cecde323e0a49 Mon Sep 17 00:00:00 2001 From: John Doty Date: Thu, 8 Dec 2016 06:20:59 -0800 Subject: [PATCH] More docs, more stuff. --- parser.py | 90 +++++++++++++++++++++++++++++++++---------------------- 1 file changed, 54 insertions(+), 36 deletions(-) diff --git a/parser.py b/parser.py index fa87c70..5682114 100644 --- a/parser.py +++ b/parser.py @@ -1,25 +1,5 @@ # This is doty playing with parser tables. -from collections import namedtuple, OrderedDict - -# This is how we define a grammar: as a list of productions. Should be -# self-evident. Note that we don't support alternatives or other complex -# rules-- you must reduce those to this style explicitly. -# -# Also note that you don't have to make an explicit list of tokens-- if a -# symbol is on the right-hand-side of a production in this grammar and it -# doesn't appear on the left-hand-side of any production then it must be a -# token. -# -# ALSO note that the token '$' is reserved to mean "end of input", so don't use -# it in your grammars. -# -grammar_simple = [ - ('E', ['E', '+', 'T']), - ('E', ['T']), - ('T', ['(', 'E', ')']), - ('T', ['id']), -] - +from collections import namedtuple class Configuration( namedtuple('Configuration', ['name', 'symbols', 'position']) @@ -55,11 +35,35 @@ class Configuration( class GenerateLR0(object): """Generate parser tables for an LR0 parser. - Note that this is built in the dumbest way possible, in order to be the - most understandable it can be. I built this to learn, and I want to make - sure I can keep learning with it. + Grammars are of the form: + + grammar_simple = [ + ('E', ['E', '+', 'T']), + ('E', ['T']), + ('T', ['(', 'E', ')']), + ('T', ['id']), + ] + + Which is to say, they are a list of productions. Each production is a + tuple where the first element of the tuple is the name of the + non-terminal being added, and the second element of the tuple is the + list of terminals and non-terminals that make up the production. 
+ + Don't name anything with double-underscores; those are reserved for the + generator. Don't add '$' to your grammar either; it means end of input. + + Note that this is implemented in the dumbest way possible, in order to be + the most understandable it can be. I built this to learn, and I want to + make sure I can keep learning with it. """ def __init__(self, grammar, start): + """Initialize the parser generator with the specified grammar and + start symbol. + """ + # We always store the "augmented" grammar, which contains an initial + # production for the start state. grammar[0] is always the start + # rule, and in the set of states and table and whatever the first + # element is always the starting state/position. self.grammar = [('__start', start)] + grammar self.nonterminals = set(rule[0] for rule in grammar) self.terminals = set( @@ -67,9 +71,23 @@ class GenerateLR0(object): for name, symbols in grammar for sym in symbols if sym not in self.nonterminals - ) | {'$'} + ) self.alphabet = self.terminals | self.nonterminals + # Check to make sure they didn't use anything that will give us + # heartburn later. + reserved = [a for a in self.alphabet if a.startswith('__') or a == '$'] + if reserved: + raise ValueError( + "Can't use {symbols} in grammars, {what} reserved.".format( + symbols=' or '.join(reserved), + what="it's" if len(reserved) == 1 else "they're", + ) + ) + + self.terminals.add('$') + self.alphabet.add('$') + def gen_closure_next(self, config): """Return the next set of configurations in the closure for config. 
@@ -352,18 +370,18 @@ def format_table(generator, table): return '\n'.join(lines) +# OK, this is a simple example grammar to try the generator on. +grammar_simple = [ + ('E', ['E', '+', 'T']), + ('E', ['T']), + ('T', ['(', 'E', ')']), + ('T', ['id']), +] + gen = GenerateLR0(grammar_simple, 'E') -# sets = gen.gen_all_sets() -# print( -# '\n\n'.join( -# '\n'.join(str(config) for config in config_set) -# for config_set in sets -# ), -# ) - - table = gen.gen_table() -print(format_table(gen, table)) -print('') tree = parse(table, ['id', '+', '(', 'id', ')']) print(format_node(tree)) + +grammar_lr0_conflict = [ +]