diff --git a/grammar.py b/grammar.py index c37405f..e95ccc0 100644 --- a/grammar.py +++ b/grammar.py @@ -1,4 +1,6 @@ # This is an example grammar. +import re + from parser import Assoc, Grammar, Nothing, Token, rule, seq ARROW = Token("Arrow") @@ -119,7 +121,7 @@ class FineGrammar(Grammar): @rule def alternate_type(self): - return seq(self.type_expression, BAR, self.type_identifier) + return seq(self.type_expression, OR, self.type_identifier) @rule def type_identifier(self): @@ -170,6 +172,7 @@ class FineGrammar(Grammar): def block(self): return ( seq(LCURLY, RCURLY) + | seq(LCURLY, self.expression, RCURLY) | seq(LCURLY, self.statement_list, RCURLY) | seq(LCURLY, self.statement_list, self.expression, RCURLY) ) @@ -196,7 +199,7 @@ class FineGrammar(Grammar): @rule def return_statement(self): - return seq(RETURN, self.expression, SEMICOLON) + return seq(RETURN, self.expression, SEMICOLON) | seq(RETURN, SEMICOLON) @rule def for_statement(self): @@ -254,6 +257,7 @@ class FineGrammar(Grammar): | seq(self.relation_expression, LESSEQUAL, self.additive_expression) | seq(self.relation_expression, GREATER, self.additive_expression) | seq(self.relation_expression, GREATEREQUAL, self.additive_expression) + | self.additive_expression ) @rule @@ -288,6 +292,7 @@ class FineGrammar(Grammar): | self.list_constructor_expression | self.object_constructor_expression | self.match_expression + | seq(self.primary_expression, LPAREN, RPAREN) | seq(self.primary_expression, LPAREN, self.expression_list, RPAREN) | seq(self.primary_expression, DOT, IDENTIFIER) | seq(LPAREN, self.expression, RPAREN) @@ -315,7 +320,7 @@ class FineGrammar(Grammar): @rule def match_expression(self): - return seq(MATCH, self.match_body) + return seq(MATCH, self.expression, self.match_body) @rule def match_body(self): @@ -375,15 +380,187 @@ class FineGrammar(Grammar): return IDENTIFIER | seq(IDENTIFIER, COLON, self.expression) -grammar = FineGrammar() -table = grammar.build_table(start="file") +# 
# -----------------------------------------------------------------------------
# DORKY LEXER
# -----------------------------------------------------------------------------
NUMBER_RE = re.compile(r"[0-9]+(\.[0-9]*([eE][-+]?[0-9]+)?)?")
IDENTIFIER_RE = re.compile(r"[_A-Za-z][_A-Za-z0-9]*")
KEYWORD_TABLE = {
    "_": UNDERSCORE,
    "and": AND,
    "as": AS,
    "class": CLASS,
    "else": ELSE,
    "export": EXPORT,
    "false": FALSE,
    "for": FOR,
    "fun": FUN,
    "if": IF,
    "import": IMPORT,
    "in": IN,
    "is": IS,
    "let": LET,
    "match": MATCH,
    "new": NEW,
    "or": OR,
    "return": RETURN,
    "self": SELF,
    "true": TRUE,
    "while": WHILE,
}

# Two-character operators. These are looked up before the one-character table
# so that, for example, "->" is not lexed as MINUS followed by GREATER.
_TWO_CHAR_TOKENS = {
    "->": ARROW,
    "==": EQUALEQUAL,
    "!=": BANGEQUAL,
    "<=": LESSEQUAL,
    ">=": GREATEREQUAL,
}

# One-character punctuation and operators.
_ONE_CHAR_TOKENS = {
    "-": MINUS,
    "|": BAR,
    ":": COLON,
    "{": LCURLY,
    "}": RCURLY,
    ";": SEMICOLON,
    "=": EQUAL,
    "(": LPAREN,
    ")": RPAREN,
    ",": COMMA,
    "!": BANG,
    "<": LESS,
    ">": GREATER,
    "+": PLUS,
    "*": STAR,
    "/": SLASH,
    ".": DOT,
    "[": LSQUARE,
    "]": RSQUARE,
}


def tokenize(src: str):
    """Lex `src`, yielding one `(kind, start, length)` tuple per token.

    `kind` is one of the module's Token constants, `start` is the offset of
    the token's first character in `src`, and `length` is its size in
    characters. Whitespace is skipped, and so is everything from `//` up to
    the end of the line.

    String constants are delimited by matching `'` or `"`; a backslash makes
    the scanner step over the character that follows it. Identifiers that
    appear in KEYWORD_TABLE are reported as the corresponding keyword token.

    Raises Exception if a string constant is not closed before the end of
    the input, or if a character does not start any known token.
    """
    pos = 0
    size = len(src)
    while pos < size:
        ch = src[pos]
        if ch.isspace():
            pos += 1
            continue

        pair = src[pos : pos + 2]
        if pair == "//":
            # Line comment: stop *at* the newline; the whitespace branch
            # above consumes it on the next iteration.
            while pos < size and src[pos] != "\n":
                pos += 1
            continue

        token = None
        if pair in _TWO_CHAR_TOKENS:
            token = (_TWO_CHAR_TOKENS[pair], pos, 2)

        elif ch in _ONE_CHAR_TOKENS:
            token = (_ONE_CHAR_TOKENS[ch], pos, 1)

        elif ch == '"' or ch == "'":
            end = pos + 1
            while end < size and src[end] != ch:
                if src[end] == "\\":
                    end += 1  # step over the escaped character
                end += 1
            # `>=`, not `==`: a backslash as the very last character moves
            # `end` one past the end of the input, which is still an
            # unterminated string.
            if end >= size:
                raise Exception(f"Unterminated string constant at {pos}")
            end += 1  # include the closing quote
            token = (STRING, pos, end - pos)

        else:
            number_match = NUMBER_RE.match(src, pos)
            if number_match:
                token = (NUMBER, pos, number_match.end() - pos)
            else:
                id_match = IDENTIFIER_RE.match(src, pos)
                if id_match:
                    fragment = id_match.group()
                    kind = KEYWORD_TABLE.get(fragment, IDENTIFIER)
                    token = (kind, pos, len(fragment))

        if token is None:
            raise Exception(f"Token error at {pos}: unexpected character {ch!r}")
        yield token
        pos += token[2]


import bisect


class FineTokens:
    """The token stream for one source text, plus its newline offsets.

    Attributes:
        src: the source text that was lexed.
        tokens: list of `(kind, start, length)` tuples produced by `tokenize`.
        lines: offsets in `src` of every newline character, in ascending
            order; used to turn a token offset into a line and column.
    """

    def __init__(self, src: str):
        self.src = src
        self.tokens = list(tokenize(src))
        self.lines = [m.start() for m in re.finditer("\n", src)]

    def dump(self, *, start=None, end=None):
        """Print the tokens with indices in `[start, end)`, one per line.

        Each line shows the token's offset, its kind, its text, and its
        zero-based `(line, column)`. `start` defaults to the first token and
        `end` to one past the last.
        """
        if start is None:
            start = 0
        if end is None:
            end = len(self.tokens)

        for kind, offset, length in self.tokens[start:end]:
            # The number of newlines before the token is its zero-based line.
            line_index = bisect.bisect_left(self.lines, offset)
            if line_index == 0:
                col_start = 0
            else:
                col_start = self.lines[line_index - 1] + 1
            column_index = offset - col_start
            print(
                f"{offset:04} {kind.value:12} {self.src[offset:offset+length]} ({line_index}, {column_index})"
            )


if __name__ == "__main__":
    grammar = FineGrammar()
    table = grammar.build_table(start="expression")

    print(f"{len(table)} states")

    average_entries = sum(len(row) for row in table) / len(table)
    max_entries = max(len(row) for row in table)
    print(f"{average_entries} average, {max_entries} max")
import bisect
import typing


def trace_state(stack, input, input_index, action):
    """Print one trace line for a parser step.

    The line shows the state numbers currently on `stack`, the next (up to
    four) symbols of `input` starting at `input_index`, and the `action` the
    parser is about to take. Suitable for passing as `trace` to `parse`.
    """
    print(
        "{stack: <20} {input: <50} {action: <5}".format(
            stack=repr([s[0] for s in stack]),
            input=repr(input[input_index : input_index + 4]),
            action=repr(action),
        )
    )


def parse(table, tokens, trace=None):
    """Parse a token stream with a generated table; return `(tree, errors)`.

    Args:
        table: the parse table, indexed by state number. Each row maps a
            symbol name to an action tuple: `("shift", state)`,
            `("reduce", name, size)`, `("goto", state)` or `("accept",)`.
            A missing entry is treated as an error.
        tokens: an object with a `tokens` attribute (a list of
            `(kind, start, length)` tuples, where `kind.value` is the symbol
            name) and a `lines` attribute (ascending offsets of the newline
            characters in the source). Don't add an end-of-stream marker;
            `"$"` is appended here.
        trace: optional callable invoked as
            `trace(stack, symbols, input_index, action)` before every step.

    Returns:
        `(tree, [])` on success, where each tree node is `(name, children)`
        and a shifted token is `(symbol, ())`. On a syntax error at a token,
        `(None, [message])` with a one-based line and zero-based column.

    Raises:
        ValueError: if the error is at the end of the input, or if the table
            contains an action kind this function does not know.

    This is not a *great* parser, it's really just a demo for what you can
    do with the table.
    """
    symbols = [kind.value for (kind, _, _) in tokens.tokens]

    assert "$" not in symbols
    symbols.append("$")
    input_index = 0

    # Our stack is a stack of tuples, where the first entry is the state
    # number and the second entry is the 'value' that was generated when the
    # state was pushed.
    stack: list[typing.Tuple[int, typing.Any]] = [(0, None)]
    while True:
        current_state = stack[-1][0]
        current_token = symbols[input_index]

        action = table[current_state].get(current_token, ("error",))
        if trace:
            trace(stack, symbols, input_index, action)

        kind = action[0]
        if kind == "accept":
            return (stack[-1][1], [])

        elif kind == "reduce":
            name = action[1]
            size = action[2]

            # Slice from an explicit index rather than with `-size`: for an
            # empty production (size == 0) `stack[-0:]` would be the whole
            # stack and `stack[:-0]` would be empty.
            split = len(stack) - size
            value = (name, tuple(entry[1] for entry in stack[split:]))
            stack = stack[:split]

            goto = table[stack[-1][0]].get(name, ("error",))
            assert goto[0] == "goto"  # Corrupt table?
            stack.append((goto[1], value))

        elif kind == "shift":
            stack.append((action[1], (current_token, ())))
            input_index += 1

        elif kind == "error":
            if input_index >= len(tokens.tokens):
                raise ValueError("Unexpected end of file")

            (_, start, _) = tokens.tokens[input_index]
            # The number of newlines before the token is its zero-based line.
            line_index = bisect.bisect_left(tokens.lines, start)
            if line_index == 0:
                col_start = 0
            else:
                col_start = tokens.lines[line_index - 1] + 1
            column_index = start - col_start
            line_index += 1

            return (
                None,
                [
                    f"{line_index}:{column_index}: Syntax error: unexpected symbol {current_token}"
                ],
            )

        else:
            # Without this the loop would spin forever on the same state.
            raise ValueError(f"Unknown parse action {action!r}")


def harness(lexer_func, grammar_func, start_rule, source_path):
    """Build a parse table, print its size, and optionally parse a file.

    Args:
        lexer_func: called with the source text; must return the token
            object that `parse` expects.
        grammar_func: called with no arguments; the result's `build_table`
            is used to generate the table.
        start_rule: name of the start rule passed to `build_table`.
        source_path: path of a UTF-8 source file to lex and parse, or a
            false value to only build the table.
    """
    # Project-local module, imported here so that importing this file does
    # not require it.
    import parser

    # generator = parser.GenerateLR1
    generator = parser.GenerateLALR
    table = grammar_func().build_table(start=start_rule, generator=generator)
    print(f"{len(table)} states")

    average_entries = sum(len(row) for row in table) / len(table)
    max_entries = max(len(row) for row in table)
    print(f"{average_entries} average, {max_entries} max")

    if source_path:
        with open(source_path, "r", encoding="utf-8") as f:
            src = f.read()
        tokens = lexer_func(src)
        (_, errors) = parse(table, tokens)
        if errors:
            print(f"{len(errors)} errors:")
            for error in errors:
                print(f"  {error}")


if __name__ == "__main__":
    import sys

    import grammar

    source_path = None
    if len(sys.argv) == 2:
        source_path = sys.argv[1]

    harness(
        lexer_func=grammar.FineTokens,
        grammar_func=grammar.FineGrammar,
        start_rule="file",
        source_path=source_path,
    )
+ source_path = sys.argv[1] + + harness( + lexer_func=grammar.FineTokens, + grammar_func=grammar.FineGrammar, + start_rule="file", + source_path=source_path, + ) + + # print(parser_faster.format_table(gen, table)) + # print() + # tree = parse(table, ["id", "+", "(", "id", "[", "id", "]", ")"]) diff --git a/parser.py b/parser.py index 8091fb7..7b24c49 100644 --- a/parser.py +++ b/parser.py @@ -257,6 +257,14 @@ class Configuration: lookahead=(), ) + def replace_lookahead(self, lookahead: typing.Tuple[int, ...]): + return Configuration( + name=self.name, + symbols=self.symbols, + position=self.position, + lookahead=lookahead, + ) + @property def rest(self): return self.symbols[(self.position + 1) :] @@ -1382,57 +1390,67 @@ class GenerateLALR(GenerateLR1): use a bunch of improvement, probably.) """ - def merge_sets(self, config_set_a, config_set_b): - """Merge the two config sets, by keeping the item cores but merging - the lookahead sets for each item. - """ - assert len(config_set_a) == len(config_set_b) - merged = [] - for index, a in enumerate(config_set_a): - b = config_set_b[index] - assert a.clear_lookahead() == b.clear_lookahead() - - new_lookahead = a.lookahead + b.lookahead - new_lookahead = tuple(sorted(set(new_lookahead))) - merged.append(a.clear_lookahead()) - - return tuple(merged) - - def sets_equal(self, a, b): - a_no_la = tuple(s.clear_lookahead() for s in a) - b_no_la = tuple(s.clear_lookahead() for s in b) - return a_no_la == b_no_la - - def gen_sets(self, config_set) -> ConfigurationSetInfo: + def gen_sets(self, config_set: typing.Tuple[Configuration, ...]) -> ConfigurationSetInfo: """Recursively generate all configuration sets starting from the - provided set, and merge them with the provided set 'F'. + provided set. The difference between this method and the one in GenerateLR0, where - this comes from, is in the part that stops recursion. In LALR we - compare for set equality *ignoring lookahead*. 
If we find a match, - then instead of returning F unchanged, we merge the two equal sets - and replace the set in F, returning the modified set. + this comes from, is that we're going to be keeping track of states + that we found that are equivalent in lookahead. """ + # + # First, do the actual walk. Don't merge yet: just keep track of all + # the config sets that need to be merged. + # F = {} + seen = set() successors = [] pending = [config_set] while len(pending) > 0: config_set = pending.pop() + if config_set in seen: + continue + seen.add(config_set) + config_set_no_la = tuple(s.clear_lookahead() for s in config_set) existing = F.get(config_set_no_la) if existing is not None: - F[config_set_no_la] = self.merge_sets(config_set, existing) + existing.append(config_set) else: - F[config_set_no_la] = config_set - for symbol, successor in self.gen_all_successors(config_set): - successor_no_la = tuple(s.clear_lookahead() for s in successor) - successors.append((config_set_no_la, symbol, successor_no_la)) - pending.append(successor) + F[config_set_no_la] = [config_set] + + for symbol, successor in self.gen_all_successors(config_set): + successor_no_la = tuple(s.clear_lookahead() for s in successor) + successors.append((config_set_no_la, symbol, successor_no_la)) + pending.append(successor) + + # Now we gathered the sets, merge them all. 
+ final_sets = {} + for key, config_sets in F.items(): + new_config_set = [] + config_groupings = [[] for _ in range(len(config_sets[0]))] + for config_set in config_sets: + for i, config in enumerate(config_set): + config_groupings[i].append(config) + + for config_group in config_groupings: + new_lookahead = [l for config in config_group for l in config.lookahead] + new_lookahead = tuple(sorted(set(new_lookahead))) + new_config_set.append( + Configuration( + name=config_group[0].name, + symbols=config_group[0].symbols, + position=config_group[0].position, + lookahead=new_lookahead, + ) + ) + + final_sets[key] = tuple(new_config_set) # Register all the actually merged, final config sets. result = ConfigurationSetInfo() - for config_set in F.values(): + for config_set in final_sets.values(): result.register_config_set(config_set) # Now record all the successors that we found. Of course, the actual @@ -1443,10 +1461,10 @@ class GenerateLALR(GenerateLR1): # so we can find the final sets, then look them up in the registered # sets, and actually register the successor. for config_set_no_la, symbol, successor_no_la in successors: - actual_config_set = F[config_set_no_la] + actual_config_set = final_sets[config_set_no_la] from_index = result.config_set_key[actual_config_set] - actual_successor = F[successor_no_la] + actual_successor = final_sets[successor_no_la] to_index = result.config_set_key[actual_successor] result.add_successor(from_index, symbol, to_index) @@ -1499,7 +1517,7 @@ class Token(Rule): def __init__(self, value): self.value = sys.intern(value) - def flatten(self) -> typing.Generator[list[str], None, None]: + def flatten(self) -> typing.Generator[list["str | Token"], None, None]: # We are just ourselves when flattened. 
def rule(f: typing.Callable) -> Rule:
    """The decorator that marks a method in a Grammar object as a
    nonterminal rule.

    Use it bare, as `@rule`: the only argument it accepts is the function
    being decorated. The nonterminal is named after that function
    (`f.__name__`).
    """
    return NonTerminal(f, f.__name__)


PrecedenceList = list[typing.Tuple[Assoc, list[Rule]]]