diff --git a/grammar.py b/grammar.py index e95ccc0..c37405f 100644 --- a/grammar.py +++ b/grammar.py @@ -1,6 +1,4 @@ # This is an example grammar. -import re - from parser import Assoc, Grammar, Nothing, Token, rule, seq ARROW = Token("Arrow") @@ -121,7 +119,7 @@ class FineGrammar(Grammar): @rule def alternate_type(self): - return seq(self.type_expression, OR, self.type_identifier) + return seq(self.type_expression, BAR, self.type_identifier) @rule def type_identifier(self): @@ -172,7 +170,6 @@ class FineGrammar(Grammar): def block(self): return ( seq(LCURLY, RCURLY) - | seq(LCURLY, self.expression, RCURLY) | seq(LCURLY, self.statement_list, RCURLY) | seq(LCURLY, self.statement_list, self.expression, RCURLY) ) @@ -199,7 +196,7 @@ class FineGrammar(Grammar): @rule def return_statement(self): - return seq(RETURN, self.expression, SEMICOLON) | seq(RETURN, SEMICOLON) + return seq(RETURN, self.expression, SEMICOLON) @rule def for_statement(self): @@ -257,7 +254,6 @@ class FineGrammar(Grammar): | seq(self.relation_expression, LESSEQUAL, self.additive_expression) | seq(self.relation_expression, GREATER, self.additive_expression) | seq(self.relation_expression, GREATEREQUAL, self.additive_expression) - | self.additive_expression ) @rule @@ -292,7 +288,6 @@ class FineGrammar(Grammar): | self.list_constructor_expression | self.object_constructor_expression | self.match_expression - | seq(self.primary_expression, LPAREN, RPAREN) | seq(self.primary_expression, LPAREN, self.expression_list, RPAREN) | seq(self.primary_expression, DOT, IDENTIFIER) | seq(LPAREN, self.expression, RPAREN) @@ -320,7 +315,7 @@ class FineGrammar(Grammar): @rule def match_expression(self): - return seq(MATCH, self.expression, self.match_body) + return seq(MATCH, self.match_body) @rule def match_body(self): @@ -380,187 +375,15 @@ class FineGrammar(Grammar): return IDENTIFIER | seq(IDENTIFIER, COLON, self.expression) -# ----------------------------------------------------------------------------- -# DORKY LEXER -# ----------------------------------------------------------------------------- -NUMBER_RE = re.compile("[0-9]+(\\.[0-9]*([eE][-+]?[0-9]+)?)?") -IDENTIFIER_RE = re.compile("[_A-Za-z][_A-Za-z0-9]*") -KEYWORD_TABLE = { - "_": UNDERSCORE, - "and": AND, - "as": AS, - "class": CLASS, - "else": ELSE, - "export": EXPORT, - "false": FALSE, - "for": FOR, - "fun": FUN, - "if": IF, - "import": IMPORT, - "in": IN, - "is": IS, - "let": LET, - "match": MATCH, - "new": NEW, - "or": OR, - "return": RETURN, - "self": SELF, - "true": TRUE, - "while": WHILE, -} +grammar = FineGrammar() +table = grammar.build_table(start="file") +print(f"{len(table)} states") -def tokenize(src: str): - pos = 0 - while pos < len(src): - ch = src[pos] - if ch.isspace(): - pos += 1 - continue +average_entries = sum(len(row) for row in table) / len(table) +max_entries = max(len(row) for row in table) +print(f"{average_entries} average, {max_entries} max") - token = None - if ch == "-": - if src[pos : pos + 2] == "->": - token = (ARROW, pos, 2) - else: - token = (MINUS, pos, 1) - - elif ch == "|": - token = (BAR, pos, 1) - - elif ch == ":": - token = (COLON, pos, 1) - - elif ch == "{": - token = (LCURLY, pos, 1) - - elif ch == "}": - token = (RCURLY, pos, 1) - - elif ch == ";": - token = (SEMICOLON, pos, 1) - - elif ch == "=": - if src[pos : pos + 2] == "==": - token = (EQUALEQUAL, pos, 2) - else: - token = (EQUAL, pos, 1) - - elif ch == "(": - token = (LPAREN, pos, 1) - - elif ch == ")": - token = (RPAREN, pos, 1) - - elif ch == ",": - token = (COMMA, pos, 1) - - elif ch == "!": - if src[pos : pos + 2] == "!=": - token = (BANGEQUAL, pos, 2) - else: - token = (BANG, pos, 1) - - elif ch == "<": - if src[pos : pos + 2] == "<=": - token = (LESSEQUAL, pos, 2) - else: - token = (LESS, pos, 1) - - elif ch == ">": - if src[pos : pos + 2] == ">=": - token = (GREATEREQUAL, pos, 2) - else: - token = (GREATER, pos, 1) - - elif ch == "+": - token = (PLUS, pos, 1) - - elif ch == "*": - token = (STAR, pos, 1) - - elif ch == "/": - if src[pos : pos + 2] == "//": - while pos < len(src) and src[pos] != "\n": - pos = pos + 1 - continue - - token = (SLASH, pos, 1) - - elif ch == ".": - token = (DOT, pos, 1) - - elif ch == "[": - token = (LSQUARE, pos, 1) - - elif ch == "]": - token = (RSQUARE, pos, 1) - - elif ch == '"' or ch == "'": - end = pos + 1 - while end < len(src) and src[end] != ch: - if src[end] == "\\": - end += 1 - end += 1 - if end == len(src): - raise Exception(f"Unterminated string constant at {pos}") - end += 1 - token = (STRING, pos, end - pos) - - else: - number_match = NUMBER_RE.match(src, pos) - if number_match: - token = (NUMBER, pos, number_match.end() - pos) - else: - id_match = IDENTIFIER_RE.match(src, pos) - if id_match: - fragment = src[pos : id_match.end()] - keyword = KEYWORD_TABLE.get(fragment) - if keyword: - token = (keyword, pos, len(fragment)) - else: - token = (IDENTIFIER, pos, len(fragment)) - - if token is None: - raise Exception("Token error") - yield token - pos += token[2] - - -import bisect - - -class FineTokens: - def __init__(self, src: str): - self.src = src - self.tokens = list(tokenize(src)) - self.lines = [m.start() for m in re.finditer("\n", src)] - - def dump(self, *, start=None, end=None): - if start is None: - start = 0 - if end is None: - end = len(self.tokens) - - for token in self.tokens[start:end]: - (kind, start, length) = token - line_index = bisect.bisect_left(self.lines, start) - if line_index == 0: - col_start = 0 - else: - col_start = self.lines[line_index - 1] + 1 - column_index = start - col_start - print( - f"{start:04} {kind.value:12} {self.src[start:start+length]} ({line_index}, {column_index})" - ) - - -if __name__ == "__main__": - grammar = FineGrammar() - table = grammar.build_table(start="expression") - - print(f"{len(table)} states") - - average_entries = sum(len(row) for row in table) / len(table) - max_entries = max(len(row) for row in table) - print(f"{average_entries} average, {max_entries} max") +# print(parser_faster.format_table(gen, table)) +# print() +# tree = parse(table, ["id", "+", "(", "id", "[", "id", "]", ")"]) diff --git a/harness.py b/harness.py deleted file mode 100644 index 4a1b0a9..0000000 --- a/harness.py +++ /dev/null @@ -1,130 +0,0 @@ -import bisect -import typing - -import grammar -import parser - -# from parser import Token, Grammar, rule, seq - - -def trace_state(stack, input, input_index, action): - print( - "{stack: <20} {input: <50} {action: <5}".format( - stack=repr([s[0] for s in stack]), - input=repr(input[input_index : input_index + 4]), - action=repr(action), - ) - ) - - -def parse(table, tokens, trace=None): - """Parse the input with the generated parsing table and return the - concrete syntax tree. - - The parsing table can be generated by GenerateLR0.gen_table() or by any - of the other generators below. The parsing mechanism never changes, only - the table generation mechanism. - - input is a list of tokens. Don't stick an end-of-stream marker, I'll stick - one on for you. - - This is not a *great* parser, it's really just a demo for what you can - do with the table. - """ - input = [t.value for (t, _, _) in tokens.tokens] - - assert "$" not in input - input = input + ["$"] - input_index = 0 - - # Our stack is a stack of tuples, where the first entry is the state number - # and the second entry is the 'value' that was generated when the state was - # pushed. - stack: list[typing.Tuple[int, typing.Any]] = [(0, None)] - while True: - current_state = stack[-1][0] - current_token = input[input_index] - - action = table[current_state].get(current_token, ("error",)) - if trace: - trace(stack, input, input_index, action) - - if action[0] == "accept": - return (stack[-1][1], []) - - elif action[0] == "reduce": - name = action[1] - size = action[2] - - value = (name, tuple(s[1] for s in stack[-size:])) - stack = stack[:-size] - - goto = table[stack[-1][0]].get(name, ("error",)) - assert goto[0] == "goto" # Corrupt table? - stack.append((goto[1], value)) - - elif action[0] == "shift": - stack.append((action[1], (current_token, ()))) - input_index += 1 - - elif action[0] == "error": - if input_index >= len(tokens.tokens): - raise ValueError("Unexpected end of file") - else: - (_, start, _) = tokens.tokens[input_index] - line_index = bisect.bisect_left(tokens.lines, start) - if line_index == 0: - col_start = 0 - else: - col_start = tokens.lines[line_index - 1] + 1 - column_index = start - col_start - line_index += 1 - - return ( - None, - [ - f"{line_index}:{column_index}: Syntax error: unexpected symbol {current_token}" - ], - ) - - -def harness(lexer_func, grammar_func, start_rule, source_path): - # generator = parser.GenerateLR1 - generator = parser.GenerateLALR - table = grammar_func().build_table(start=start_rule, generator=generator) - print(f"{len(table)} states") - - average_entries = sum(len(row) for row in table) / len(table) - max_entries = max(len(row) for row in table) - print(f"{average_entries} average, {max_entries} max") - - if source_path: - with open(source_path, "r", encoding="utf-8") as f: - src = f.read() - tokens = lexer_func(src) - # print(f"{tokens.lines}") - # tokens.dump(end=5) - (_, errors) = parse(table, tokens) - if len(errors) > 0: - print(f"{len(errors)} errors:") - for error in errors: - print(f" {error}") - - -if __name__ == "__main__": - import sys - - source_path = None - if len(sys.argv) == 2: - source_path = sys.argv[1] - - harness( - lexer_func=grammar.FineTokens, - grammar_func=grammar.FineGrammar, - start_rule="file", - source_path=source_path, - ) - - # print(parser_faster.format_table(gen, table)) - # print() - # tree = parse(table, ["id", "+", "(", "id", "[", "id", "]", ")"]) diff --git a/parser.py b/parser.py index 7b24c49..8091fb7 100644 --- a/parser.py +++ b/parser.py @@ -257,14 +257,6 @@ class Configuration: lookahead=(), ) - def replace_lookahead(self, lookahead: typing.Tuple[int, ...]): - return Configuration( - name=self.name, - symbols=self.symbols, - position=self.position, - lookahead=lookahead, - ) - @property def rest(self): return self.symbols[(self.position + 1) :] @@ -1390,67 +1382,57 @@ class GenerateLALR(GenerateLR1): use a bunch of improvement, probably.) """ - def gen_sets(self, config_set: typing.Tuple[Configuration, ...]) -> ConfigurationSetInfo: + def merge_sets(self, config_set_a, config_set_b): + """Merge the two config sets, by keeping the item cores but merging + the lookahead sets for each item. + """ + assert len(config_set_a) == len(config_set_b) + merged = [] + for index, a in enumerate(config_set_a): + b = config_set_b[index] + assert a.clear_lookahead() == b.clear_lookahead() + + new_lookahead = a.lookahead + b.lookahead + new_lookahead = tuple(sorted(set(new_lookahead))) + merged.append(a.clear_lookahead()) + + return tuple(merged) + + def sets_equal(self, a, b): + a_no_la = tuple(s.clear_lookahead() for s in a) + b_no_la = tuple(s.clear_lookahead() for s in b) + return a_no_la == b_no_la + + def gen_sets(self, config_set) -> ConfigurationSetInfo: """Recursively generate all configuration sets starting from the - provided set. + provided set, and merge them with the provided set 'F'. The difference between this method and the one in GenerateLR0, where - this comes from, is that we're going to be keeping track of states - that we found that are equivalent in lookahead. + this comes from, is in the part that stops recursion. In LALR we + compare for set equality *ignoring lookahead*. If we find a match, + then instead of returning F unchanged, we merge the two equal sets + and replace the set in F, returning the modified set. """ - # - # First, do the actual walk. Don't merge yet: just keep track of all - # the config sets that need to be merged. - # F = {} - seen = set() successors = [] pending = [config_set] while len(pending) > 0: config_set = pending.pop() - if config_set in seen: - continue - seen.add(config_set) - config_set_no_la = tuple(s.clear_lookahead() for s in config_set) existing = F.get(config_set_no_la) if existing is not None: - existing.append(config_set) + F[config_set_no_la] = self.merge_sets(config_set, existing) else: - F[config_set_no_la] = [config_set] - - for symbol, successor in self.gen_all_successors(config_set): - successor_no_la = tuple(s.clear_lookahead() for s in successor) - successors.append((config_set_no_la, symbol, successor_no_la)) - pending.append(successor) - - # Now we gathered the sets, merge them all. - final_sets = {} - for key, config_sets in F.items(): - new_config_set = [] - config_groupings = [[] for _ in range(len(config_sets[0]))] - for config_set in config_sets: - for i, config in enumerate(config_set): - config_groupings[i].append(config) - - for config_group in config_groupings: - new_lookahead = [l for config in config_group for l in config.lookahead] - new_lookahead = tuple(sorted(set(new_lookahead))) - new_config_set.append( - Configuration( - name=config_group[0].name, - symbols=config_group[0].symbols, - position=config_group[0].position, - lookahead=new_lookahead, - ) - ) - - final_sets[key] = tuple(new_config_set) + F[config_set_no_la] = config_set + for symbol, successor in self.gen_all_successors(config_set): + successor_no_la = tuple(s.clear_lookahead() for s in successor) + successors.append((config_set_no_la, symbol, successor_no_la)) + pending.append(successor) # Register all the actually merged, final config sets. result = ConfigurationSetInfo() - for config_set in final_sets.values(): + for config_set in F.values(): result.register_config_set(config_set) # Now record all the successors that we found. Of course, the actual @@ -1461,10 +1443,10 @@ class GenerateLALR(GenerateLR1): # so we can find the final sets, then look them up in the registered # sets, and actually register the successor. for config_set_no_la, symbol, successor_no_la in successors: - actual_config_set = final_sets[config_set_no_la] + actual_config_set = F[config_set_no_la] from_index = result.config_set_key[actual_config_set] - actual_successor = final_sets[successor_no_la] + actual_successor = F[successor_no_la] to_index = result.config_set_key[actual_successor] result.add_successor(from_index, symbol, to_index) @@ -1517,7 +1499,7 @@ class Token(Rule): def __init__(self, value): self.value = sys.intern(value) - def flatten(self) -> typing.Generator[list["str | Token"], None, None]: + def flatten(self) -> typing.Generator[list[str], None, None]: # We are just ourselves when flattened. yield [self] @@ -1564,7 +1546,7 @@ class AlternativeRule(Rule): self.left = left self.right = right - def flatten(self) -> typing.Generator[list[str | Token], None, None]: + def flatten(self) -> typing.Generator[list[str], None, None]: # All the things from the left of the alternative, then all the things # from the right, never intermingled. yield from self.left.flatten() @@ -1580,7 +1562,7 @@ class SequenceRule(Rule): self.first = first self.second = second - def flatten(self) -> typing.Generator[list[str | Token], None, None]: + def flatten(self) -> typing.Generator[list[str], None, None]: # All the things in the prefix.... for first in self.first.flatten(): # ...potentially followed by all the things in the suffix. @@ -1593,7 +1575,7 @@ class NothingRule(Rule): these, you're probably better off just using the singleton `Nothing`. """ - def flatten(self) -> typing.Generator[list[str | Token], None, None]: + def flatten(self) -> typing.Generator[list[str], None, None]: # It's quiet in here. yield [] @@ -1601,7 +1583,7 @@ class NothingRule(Rule): Nothing = NothingRule() -def seq(*args: Rule) -> Rule: +def seq(*args: list[Rule]) -> Rule: """A rule that matches a sequence of rules. (A helper function that combines its arguments into nested sequences.) @@ -1612,15 +1594,17 @@ def seq(*args: Rule) -> Rule: return result -# @typing.overload -# def rule(f: None | str = None) -> typing.Callable[[typing.Callable], Rule]: ... +@typing.overload +def rule(name: None | str = None) -> typing.Callable[[typing.Callable], Rule]: ... -# @typing.overload -# def rule(f: typing.Callable) -> Rule: ... +@typing.overload +def rule(fn: typing.Callable) -> Rule: ... -def rule(f: typing.Callable) -> Rule: +def rule( + name_or_fn: None | str | typing.Callable = None, +) -> Rule | typing.Callable[[typing.Callable], Rule]: """The decorator that marks a method in a Grammar object as a nonterminal rule. @@ -1628,11 +1612,16 @@ def rule(f: typing.Callable) -> Rule: If called with one argument, that argument is a name that overrides the name of the nonterminal, which defaults to the name of the function. """ - name = f.__name__ - return NonTerminal(f, name) + def _rule(callable): + return NonTerminal(callable, name) -PrecedenceList = list[typing.Tuple[Assoc, list[Rule]]] + if callable(name_or_fn): + name = name_or_fn.__name__ + return _rule(name_or_fn) + else: + name = name_or_fn + return _rule class Grammar: @@ -1661,13 +1650,12 @@ class Grammar: Not very exciting, perhaps, but it's something. """ - def __init__(self, precedence: PrecedenceList | None = None): + def __init__(self, precedence: list[typing.Tuple[Assoc, list[Token | NonTerminal]]] = None): if precedence is None: precedence = getattr(self, "precedence", []) - assert precedence is not None precedence_table = {} - for prec, (associativity, symbols) in enumerate(precedence): + for precedence, (associativity, symbols) in enumerate(precedence): for symbol in symbols: if isinstance(symbol, Token): key = symbol.value @@ -1676,7 +1664,7 @@ class Grammar: else: raise ValueError(f"{symbol} must be either a Token or a NonTerminal") - precedence_table[key] = (associativity, prec + 1) + precedence_table[key] = (associativity, precedence + 1) self._precedence = precedence_table