diff --git a/grammar.py b/grammar.py index 97bc847..38299e9 100644 --- a/grammar.py +++ b/grammar.py @@ -1,5 +1,6 @@ # This is an example grammar. import re +import typing import parser from parser import Assoc, Grammar, Nothing, Terminal, rule, seq, Rule @@ -517,12 +518,15 @@ import bisect class FineTokens: def __init__(self, src: str): self.src = src - self._tokens = list(tokenize(src)) - self.lines = [m.start() for m in re.finditer("\n", src)] + self._tokens: list[typing.Tuple[Terminal, int, int]] = list(tokenize(src)) + self._lines = [m.start() for m in re.finditer("\n", src)] def tokens(self): return self._tokens + def lines(self): + return self._lines + def dump(self, *, start=None, end=None): if start is None: start = 0 @@ -531,11 +535,11 @@ class FineTokens: for token in self._tokens[start:end]: (kind, start, length) = token - line_index = bisect.bisect_left(self.lines, start) + line_index = bisect.bisect_left(self._lines, start) if line_index == 0: col_start = 0 else: - col_start = self.lines[line_index - 1] + 1 + col_start = self._lines[line_index - 1] + 1 column_index = start - col_start value = self.src[start : start + length] print(f"{start:04} {kind.value:12} {value} ({line_index}, {column_index})") diff --git a/harness.py b/harness.py index 94259d1..90cee9c 100644 --- a/harness.py +++ b/harness.py @@ -300,7 +300,7 @@ class Harness: # print(f"{tokens.lines}") # tokens.dump(end=5) - (tree, errors) = runtime.Parser(table, trace=None).parse(self.tokens) + (tree, errors) = runtime.Parser(table).parse(self.tokens) parse_time = time.time() self.tree = tree self.errors = errors diff --git a/makefile b/makefile index 330c2f7..a5fb8a9 100644 --- a/makefile +++ b/makefile @@ -1,3 +1,3 @@ .PHONY: test test: - pytest + pdm run pytest diff --git a/parser/parser.py b/parser/parser.py index 9fd2c02..d0cb1fc 100644 --- a/parser/parser.py +++ b/parser/parser.py @@ -1097,73 +1097,6 @@ class GenerateLR0: return builder.flush(config_sets) -def parse(table: ParseTable, input, trace=False): - """Parse the input with the generated parsing table and return the - concrete syntax tree. - - The parsing table can be generated by GenerateLR0.gen_table() or by any - of the other generators below. The parsing mechanism never changes, only - the table generation mechanism. - - input is a list of tokens. Don't stick an end-of-stream marker, I'll stick - one on for you. - - This is not a *great* parser, it's really just a demo for what you can - do with the table. - """ - assert "$" not in input - input = input + ["$"] - input_index = 0 - - # Our stack is a stack of tuples, where the first entry is the state number - # and the second entry is the 'value' that was generated when the state was - # pushed. - stack: list[typing.Tuple[int, typing.Any]] = [(0, None)] - while True: - current_state = stack[-1][0] - current_token = input[input_index] - - action = table.actions[current_state].get(current_token, Error()) - if trace: - print( - "{stack: <20} {input: <50} {action: <5}".format( - stack=repr([s[0] for s in stack]), - input=repr(input[input_index:]), - action=repr(action), - ) - ) - - match action: - case Accept(): - return stack[-1][1] - - case Reduce(name=name, count=size, transparent=transparent): - children = [] - for _, c in stack[-size:]: - if isinstance(c, tuple) and c[0] is None: - children.extend(c[1]) - else: - children.append(c) - - value = (name if not transparent else None, tuple(children)) - stack = stack[:-size] - - goto = table.gotos[stack[-1][0]].get(name) - assert goto is not None - stack.append((goto, value)) - - case Shift(state): - stack.append((state, (current_token, ()))) - input_index += 1 - - case Error(): - raise ValueError( - "Syntax error: unexpected symbol {sym}".format( - sym=current_token, - ), - ) - - ############################################################################### # SLR(1) ############################################################################### @@ -1978,150 +1911,3 @@ class Grammar: gen = generator(start, desugared, precedence=self._precedence, transparents=transparents) table = gen.gen_table() return table - - -############################################################################### -# Formatting -############################################################################### -def format_node(node): - """Print out an indented concrete syntax tree, from parse().""" - lines = ["{name}".format(name=node[0])] + [ - " " + line for child in node[1] for line in format_node(child).split("\n") - ] - return "\n".join(lines) - - -############################################################################### -# Examples -############################################################################### -def examples(): - def dump_grammar(grammar): - for name, symbols in grammar: - print(f"{name} -> {symbols}") - print() - - # OK, this is a very simple LR0 grammar. - print("grammar_simple:") - grammar_simple = [ - ("E", ["E", "+", "T"]), - ("E", ["T"]), - ("T", ["(", "E", ")"]), - ("T", ["id"]), - ] - - gen = GenerateLR0("E", grammar_simple) - table = gen.gen_table() - print(table.format()) - tree = parse(table, ["id", "+", "(", "id", ")"]) - print(format_node(tree) + "\n") - print() - - # This one doesn't work with LR0, though, it has a shift/reduce conflict. - print("grammar_lr0_shift_reduce (LR0):") - grammar_lr0_shift_reduce = grammar_simple + [ - ("T", ["id", "[", "E", "]"]), - ] - try: - gen = GenerateLR0("E", grammar_lr0_shift_reduce) - table = gen.gen_table() - assert False - except ValueError as e: - print(e) - print() - - # Nor does this: it has a reduce/reduce conflict. - print("grammar_lr0_reduce_reduce (LR0):") - grammar_lr0_reduce_reduce = grammar_simple + [ - ("E", ["V", "=", "E"]), - ("V", ["id"]), - ] - try: - gen = GenerateLR0("E", grammar_lr0_reduce_reduce) - table = gen.gen_table() - assert False - except ValueError as e: - print(e) - print() - - # Nullable symbols just don't work with constructs like this, because you can't - # look ahead to figure out if you should reduce an empty 'F' or not. - print("grammar_nullable (LR0):") - grammar_nullable = [ - ("E", ["F", "boop"]), - ("F", ["beep"]), - ("F", []), - ] - try: - gen = GenerateLR0("E", grammar_nullable) - table = gen.gen_table() - assert False - except ValueError as e: - print(e) - print() - - print("grammar_lr0_shift_reduce (SLR1):") - dump_grammar(grammar_lr0_shift_reduce) - gen = GenerateSLR1("E", grammar_lr0_shift_reduce) - print(f"Follow('E'): {str([gen.alphabet[f] for f in gen.gen_follow(gen.symbol_key['E'])])}") - table = gen.gen_table() - print(table.format()) - tree = parse(table, ["id", "+", "(", "id", "[", "id", "]", ")"], trace=True) - print(format_node(tree) + "\n") - print() - - # SLR1 can't handle this. - print("grammar_aho_ullman_1 (SLR1):") - grammar_aho_ullman_1 = [ - ("S", ["L", "=", "R"]), - ("S", ["R"]), - ("L", ["*", "R"]), - ("L", ["id"]), - ("R", ["L"]), - ] - try: - gen = GenerateSLR1("S", grammar_aho_ullman_1) - table = gen.gen_table() - assert False - except ValueError as e: - print(e) - print() - - # Here's an example with a full LR1 grammar, though. - print("grammar_aho_ullman_2 (LR1):") - grammar_aho_ullman_2 = [ - ("S", ["X", "X"]), - ("X", ["a", "X"]), - ("X", ["b"]), - ] - gen = GenerateLR1("S", grammar_aho_ullman_2) - table = gen.gen_table() - print(table.format()) - parse(table, ["b", "a", "a", "b"], trace=True) - print() - - # What happens if we do LALR to it? - print("grammar_aho_ullman_2 (LALR):") - gen = GenerateLALR("S", grammar_aho_ullman_2) - table = gen.gen_table() - print(table.format()) - print() - - # A fun LALAR grammar. - print("grammar_lalr:") - grammar_lalr = [ - ("S", ["V", "E"]), - ("E", ["F"]), - ("E", ["E", "+", "F"]), - ("F", ["V"]), - ("F", ["int"]), - ("F", ["(", "E", ")"]), - ("V", ["id"]), - ] - gen = GenerateLALR("S", grammar_lalr) - table = gen.gen_table() - print(table.format()) - print() - - -if __name__ == "__main__": - examples() diff --git a/parser/runtime.py b/parser/runtime.py index dd0e170..f5be3a4 100644 --- a/parser/runtime.py +++ b/parser/runtime.py @@ -5,7 +5,7 @@ import logging import typing from dataclasses import dataclass -from . import parser # pyright: ignore # You're drunk. +from . import parser @dataclass @@ -267,17 +267,27 @@ def recover(table: parser.ParseTable, input: list[TokenValue], start: int, stack action_log = logging.getLogger("parser.action") +class TokenStream(typing.Protocol): + def tokens(self) -> list[typing.Tuple[parser.Terminal, int, int]]: + """The tokens in the stream, in the form (terminal, start, length).""" + ... + + def lines(self) -> list[int]: + """The offsets of line breaks in the tokens. (The end of line 0 is at + index 0, etc.)""" + ... + + class Parser: # Our stack is a stack of tuples, where the first entry is the state # number and the second entry is the 'value' that was generated when the # state was pushed. table: parser.ParseTable - def __init__(self, table, trace): - self.trace = trace + def __init__(self, table): self.table = table - def parse(self, tokens) -> typing.Tuple[Tree | None, list[str]]: + def parse(self, tokens: TokenStream) -> typing.Tuple[Tree | None, list[str]]: input_tokens = tokens.tokens() input: list[TokenValue] = [ TokenValue(kind=kind.value, start=start, end=start + length) @@ -406,15 +416,17 @@ class Parser: # All done. error_strings = [] - for parse_error in errors: - line_index = bisect.bisect_left(tokens.lines, parse_error.start) - if line_index == 0: - col_start = 0 - else: - col_start = tokens.lines[line_index - 1] + 1 - column_index = parse_error.start - col_start - line_index += 1 + if errors: + lines = tokens.lines() + for parse_error in errors: + line_index = bisect.bisect_left(lines, parse_error.start) + if line_index == 0: + col_start = 0 + else: + col_start = lines[line_index - 1] + 1 + column_index = parse_error.start - col_start + line_index += 1 - error_strings.append(f"{line_index}:{column_index}: {parse_error.message}") + error_strings.append(f"{line_index}:{column_index}: {parse_error.message}") return (result, error_strings) diff --git a/tests/test_grammar.py b/tests/test_grammar.py index 23bfee2..ba1525b 100644 --- a/tests/test_grammar.py +++ b/tests/test_grammar.py @@ -1,6 +1,67 @@ -import parser +import typing + import pytest +import parser +import parser.runtime as runtime + +from parser import Grammar, seq, rule, Terminal + +PLUS = Terminal("+") +LPAREN = Terminal("(") +RPAREN = Terminal(")") +IDENTIFIER = Terminal("id") + + +class Tokens: + def __init__(self, *toks: Terminal): + self._tokens = [(t, 0, 0) for t in toks] + self._lines = [] + + def tokens(self): + return self._tokens + + def lines(self): + return self._lines + + +def _tree(treeform) -> runtime.Tree | runtime.TokenValue: + if isinstance(treeform, str): + return runtime.TokenValue(treeform, 0, 0) + else: + assert isinstance(treeform, tuple) + name = treeform[0] + assert isinstance(name, str) + return runtime.Tree( + name=name, + start=0, + end=0, + children=tuple(_tree(x) for x in treeform[1:]), + ) + + +class LR0Grammar(Grammar): + start = "E" + generator = parser.GenerateLR0 + + @rule + def E(self): + return seq(self.E, PLUS, self.T) | self.T + + @rule + def T(self): + return seq(LPAREN, self.E, RPAREN) | IDENTIFIER + + +def test_lr0_lr0(): + """An LR0 grammar should work with an LR0 generator.""" + table = LR0Grammar().build_table() + parser = runtime.Parser(table) + tree, errors = parser.parse(Tokens(IDENTIFIER, PLUS, LPAREN, IDENTIFIER, RPAREN)) + + assert errors == [] + assert tree == _tree(("E", ("E", ("T", "id")), "+", ("T", "(", ("E", ("T", "id")), ")"))) + def test_conflicting_names(): """Terminals and nonterminals cannot have the same name. @@ -16,14 +77,135 @@ def test_conflicting_names(): to understand. """ - IDENTIFIER = parser.Terminal("Identifier") + IDENTIFIER = Terminal("Identifier") - class TestGrammar(parser.Grammar): + class TestGrammar(Grammar): start = "Identifier" - @parser.rule("Identifier") + @rule("Identifier") def identifier(self): return IDENTIFIER with pytest.raises(ValueError): TestGrammar().build_table() + + +############################################################################### +# Examples +############################################################################### +# def examples(): +# def dump_grammar(grammar): +# for name, symbols in grammar: +# print(f"{name} -> {symbols}") +# print() + + +# # This one doesn't work with LR0, though, it has a shift/reduce conflict. +# print("grammar_lr0_shift_reduce (LR0):") +# grammar_lr0_shift_reduce = grammar_simple + [ +# ("T", ["id", "[", "E", "]"]), +# ] +# try: +# gen = GenerateLR0("E", grammar_lr0_shift_reduce) +# table = gen.gen_table() +# assert False +# except ValueError as e: +# print(e) +# print() + +# # Nor does this: it has a reduce/reduce conflict. +# print("grammar_lr0_reduce_reduce (LR0):") +# grammar_lr0_reduce_reduce = grammar_simple + [ +# ("E", ["V", "=", "E"]), +# ("V", ["id"]), +# ] +# try: +# gen = GenerateLR0("E", grammar_lr0_reduce_reduce) +# table = gen.gen_table() +# assert False +# except ValueError as e: +# print(e) +# print() + +# # Nullable symbols just don't work with constructs like this, because you can't +# # look ahead to figure out if you should reduce an empty 'F' or not. +# print("grammar_nullable (LR0):") +# grammar_nullable = [ +# ("E", ["F", "boop"]), +# ("F", ["beep"]), +# ("F", []), +# ] +# try: +# gen = GenerateLR0("E", grammar_nullable) +# table = gen.gen_table() +# assert False +# except ValueError as e: +# print(e) +# print() + +# print("grammar_lr0_shift_reduce (SLR1):") +# dump_grammar(grammar_lr0_shift_reduce) +# gen = GenerateSLR1("E", grammar_lr0_shift_reduce) +# print(f"Follow('E'): {str([gen.alphabet[f] for f in gen.gen_follow(gen.symbol_key['E'])])}") +# table = gen.gen_table() +# print(table.format()) +# tree = parse(table, ["id", "+", "(", "id", "[", "id", "]", ")"], trace=True) +# print(format_node(tree) + "\n") +# print() + +# # SLR1 can't handle this. +# print("grammar_aho_ullman_1 (SLR1):") +# grammar_aho_ullman_1 = [ +# ("S", ["L", "=", "R"]), +# ("S", ["R"]), +# ("L", ["*", "R"]), +# ("L", ["id"]), +# ("R", ["L"]), +# ] +# try: +# gen = GenerateSLR1("S", grammar_aho_ullman_1) +# table = gen.gen_table() +# assert False +# except ValueError as e: +# print(e) +# print() + +# # Here's an example with a full LR1 grammar, though. +# print("grammar_aho_ullman_2 (LR1):") +# grammar_aho_ullman_2 = [ +# ("S", ["X", "X"]), +# ("X", ["a", "X"]), +# ("X", ["b"]), +# ] +# gen = GenerateLR1("S", grammar_aho_ullman_2) +# table = gen.gen_table() +# print(table.format()) +# parse(table, ["b", "a", "a", "b"], trace=True) +# print() + +# # What happens if we do LALR to it? +# print("grammar_aho_ullman_2 (LALR):") +# gen = GenerateLALR("S", grammar_aho_ullman_2) +# table = gen.gen_table() +# print(table.format()) +# print() + +# # A fun LALAR grammar. +# print("grammar_lalr:") +# grammar_lalr = [ +# ("S", ["V", "E"]), +# ("E", ["F"]), +# ("E", ["E", "+", "F"]), +# ("F", ["V"]), +# ("F", ["int"]), +# ("F", ["(", "E", ")"]), +# ("V", ["id"]), +# ] +# gen = GenerateLALR("S", grammar_lalr) +# table = gen.gen_table() +# print(table.format()) +# print() + + +# if __name__ == "__main__": +# examples()