A little faster but I think I might be doing this wrong.

Fix LALR. Small, but very very slow.
Fix grammar bugs, work on debugging harness.
2024-05-28 04:58:23 -07:00 · 2024-05-27 22:31:33 -07:00 · 2024-05-27 19:02:10 -07:00 · 2024-05-27 06:50:15 -07:00
3 changed files with 390 additions and 71 deletions
--- a/grammar.py
+++ b/grammar.py
@ -1,4 +1,6 @@
 # This is an example grammar.
 import re
 from parser import Assoc, Grammar, Nothing, Token, rule, seq
 ARROW = Token("Arrow")
@ -119,7 +121,7 @@ class FineGrammar(Grammar):
    @rule
    def alternate_type(self):
-        return seq(self.type_expression, BAR, self.type_identifier)
+        return seq(self.type_expression, OR, self.type_identifier)
    @rule
    def type_identifier(self):
@ -170,6 +172,7 @@ class FineGrammar(Grammar):
    def block(self):
        return (
            seq(LCURLY, RCURLY)
            | seq(LCURLY, self.expression, RCURLY)
            | seq(LCURLY, self.statement_list, RCURLY)
            | seq(LCURLY, self.statement_list, self.expression, RCURLY)
        )
@ -196,7 +199,7 @@ class FineGrammar(Grammar):
    @rule
    def return_statement(self):
-        return seq(RETURN, self.expression, SEMICOLON)
+        return seq(RETURN, self.expression, SEMICOLON) | seq(RETURN, SEMICOLON)
    @rule
    def for_statement(self):
@ -254,6 +257,7 @@ class FineGrammar(Grammar):
            | seq(self.relation_expression, LESSEQUAL, self.additive_expression)
            | seq(self.relation_expression, GREATER, self.additive_expression)
            | seq(self.relation_expression, GREATEREQUAL, self.additive_expression)
            | self.additive_expression
        )
    @rule
@ -288,6 +292,7 @@ class FineGrammar(Grammar):
            | self.list_constructor_expression
            | self.object_constructor_expression
            | self.match_expression
            | seq(self.primary_expression, LPAREN, RPAREN)
            | seq(self.primary_expression, LPAREN, self.expression_list, RPAREN)
            | seq(self.primary_expression, DOT, IDENTIFIER)
            | seq(LPAREN, self.expression, RPAREN)
@ -315,7 +320,7 @@ class FineGrammar(Grammar):
    @rule
    def match_expression(self):
-        return seq(MATCH, self.match_body)
+        return seq(MATCH, self.expression, self.match_body)
    @rule
    def match_body(self):
@ -375,15 +380,187 @@ class FineGrammar(Grammar):
        return IDENTIFIER | seq(IDENTIFIER, COLON, self.expression)
-grammar = FineGrammar()
+# -----------------------------------------------------------------------------
-table = grammar.build_table(start="file")
+# DORKY LEXER
 # -----------------------------------------------------------------------------
 # Numeric literal: one or more digits, optionally followed by "." and more
 # digits. Note that the exponent part is nested inside the "." group, so an
 # exponent is only recognized when the literal contains a ".".
 NUMBER_RE = re.compile("[0-9]+(\\.[0-9]*([eE][-+]?[0-9]+)?)?")
 # Identifier: an ASCII letter or "_" followed by ASCII letters, digits or "_".
 IDENTIFIER_RE = re.compile("[_A-Za-z][_A-Za-z0-9]*")
 # Reserved spellings (including the lone "_") mapped to their token.
 # tokenize() looks up every identifier-shaped fragment here and emits
 # IDENTIFIER when the fragment is not present.
 KEYWORD_TABLE = {
    "_": UNDERSCORE,
    "and": AND,
    "as": AS,
    "class": CLASS,
    "else": ELSE,
    "export": EXPORT,
    "false": FALSE,
    "for": FOR,
    "fun": FUN,
    "if": IF,
    "import": IMPORT,
    "in": IN,
    "is": IS,
    "let": LET,
    "match": MATCH,
    "new": NEW,
    "or": OR,
    "return": RETURN,
    "self": SELF,
    "true": TRUE,
    "while": WHILE,
 }
 print(f"{len(table)} states")
-average_entries = sum(len(row) for row in table) / len(table)
+def tokenize(src: str):
-max_entries = max(len(row) for row in table)
+    pos = 0
-print(f"{average_entries} average, {max_entries} max")
+    while pos < len(src):
        ch = src[pos]
        if ch.isspace():
            pos += 1
            continue
-# print(parser_faster.format_table(gen, table))
+        token = None
-# print()
+        if ch == "-":
-# tree = parse(table, ["id", "+", "(", "id", "[", "id", "]", ")"])
+            if src[pos : pos + 2] == "->":
                token = (ARROW, pos, 2)
            else:
                token = (MINUS, pos, 1)
        elif ch == "|":
            token = (BAR, pos, 1)
        elif ch == ":":
            token = (COLON, pos, 1)
        elif ch == "{":
            token = (LCURLY, pos, 1)
        elif ch == "}":
            token = (RCURLY, pos, 1)
        elif ch == ";":
            token = (SEMICOLON, pos, 1)
        elif ch == "=":
            if src[pos : pos + 2] == "==":
                token = (EQUALEQUAL, pos, 2)
            else:
                token = (EQUAL, pos, 1)
        elif ch == "(":
            token = (LPAREN, pos, 1)
        elif ch == ")":
            token = (RPAREN, pos, 1)
        elif ch == ",":
            token = (COMMA, pos, 1)
        elif ch == "!":
            if src[pos : pos + 2] == "!=":
                token = (BANGEQUAL, pos, 2)
            else:
                token = (BANG, pos, 1)
        elif ch == "<":
            if src[pos : pos + 2] == "<=":
                token = (LESSEQUAL, pos, 2)
            else:
                token = (LESS, pos, 1)
        elif ch == ">":
            if src[pos : pos + 2] == ">=":
                token = (GREATEREQUAL, pos, 2)
            else:
                token = (GREATER, pos, 1)
        elif ch == "+":
            token = (PLUS, pos, 1)
        elif ch == "*":
            token = (STAR, pos, 1)
        elif ch == "/":
            if src[pos : pos + 2] == "//":
                while pos < len(src) and src[pos] != "\n":
                    pos = pos + 1
                continue
            token = (SLASH, pos, 1)
        elif ch == ".":
            token = (DOT, pos, 1)
        elif ch == "[":
            token = (LSQUARE, pos, 1)
        elif ch == "]":
            token = (RSQUARE, pos, 1)
        elif ch == '"' or ch == "'":
            end = pos + 1
            while end < len(src) and src[end] != ch:
                if src[end] == "\\":
                    end += 1
                end += 1
            if end == len(src):
                raise Exception(f"Unterminated string constant at {pos}")
            end += 1
            token = (STRING, pos, end - pos)
        else:
            number_match = NUMBER_RE.match(src, pos)
            if number_match:
                token = (NUMBER, pos, number_match.end() - pos)
            else:
                id_match = IDENTIFIER_RE.match(src, pos)
                if id_match:
                    fragment = src[pos : id_match.end()]
                    keyword = KEYWORD_TABLE.get(fragment)
                    if keyword:
                        token = (keyword, pos, len(fragment))
                    else:
                        token = (IDENTIFIER, pos, len(fragment))
        if token is None:
            raise Exception("Token error")
        yield token
        pos += token[2]
 import bisect
 class FineTokens:
    """The tokens of one source string, plus newline offsets for positions.

    Attributes:
        src: the source text that was tokenized.
        tokens: list of (kind, start, length) triples produced by tokenize().
        lines: ascending offsets of every "\\n" in src.
    """

    def __init__(self, src: str):
        self.src = src
        self.lines = [match.start() for match in re.finditer("\n", src)]
        self.tokens = list(tokenize(src))

    def dump(self, *, start=None, end=None):
        """Print tokens[start:end], one per line, for debugging.

        Each row shows the zero-padded offset, the token kind's value, the
        source text of the token, and its (line, column) pair, both counted
        from zero.
        """
        first = 0 if start is None else start
        last = len(self.tokens) if end is None else end
        for kind, offset, length in self.tokens[first:last]:
            # The number of newlines before the offset is the line index.
            line_index = bisect.bisect_left(self.lines, offset)
            # A line begins just past the previous newline (or at 0).
            line_start = self.lines[line_index - 1] + 1 if line_index else 0
            column_index = offset - line_start
            text = self.src[offset : offset + length]
            print(f"{offset:04} {kind.value:12} {text} ({line_index}, {column_index})")
 if __name__ == "__main__":
    grammar = FineGrammar()
    table = grammar.build_table(start="expression")
    print(f"{len(table)} states")
    average_entries = sum(len(row) for row in table) / len(table)
    max_entries = max(len(row) for row in table)
    print(f"{average_entries} average, {max_entries} max")
--- a/harness.py
+++ b/harness.py
@ -0,0 +1,130 @@
 import bisect
 import typing
 import grammar
 import parser
 # from parser import Token, Grammar, rule, seq
 def trace_state(stack, input, input_index, action):
    """Print one aligned trace row for a parser step.

    The row shows the state numbers on the stack, the next four input
    symbols starting at input_index, and the action about to be taken.
    """
    state_numbers = [entry[0] for entry in stack]
    upcoming = input[input_index : input_index + 4]
    print(f"{state_numbers!r: <20}  {upcoming!r: <50}  {action!r: <5}")
 def parse(table, tokens, trace=None):
    """Parse the input with the generated parsing table and return the
    concrete syntax tree.

    `table` is indexed by state number; each row maps a symbol to an action
    tuple: ("shift", state), ("reduce", name, size), ("goto", state) or
    ("accept",). A missing entry is treated as a syntax error.

    `tokens` must provide `.tokens`, a sequence of (token, start, length)
    triples whose token has a `.value`, and `.lines`, the ascending offsets
    of the newlines in the source. Don't stick an end-of-stream marker on,
    "$" is appended here.

    `trace`, if given, is called as trace(stack, input, input_index, action)
    before each action is carried out.

    Returns (tree, errors). On success `tree` is a nested (name, children)
    tuple and `errors` is empty. On a syntax error `tree` is None and
    `errors` holds a single "line:column: ..." message (line counted from 1,
    column from 0). Raises ValueError if the error is at the end of input.

    This is not a *great* parser, it's really just a demo for what you can
    do with the table.
    """
    symbols = [t.value for (t, _, _) in tokens.tokens]
    assert "$" not in symbols
    symbols.append("$")
    input_index = 0
    # Our stack is a stack of tuples, where the first entry is the state number
    # and the second entry is the 'value' that was generated when the state was
    # pushed.
    stack: list[typing.Tuple[int, typing.Any]] = [(0, None)]
    while True:
        current_state = stack[-1][0]
        current_token = symbols[input_index]
        action = table[current_state].get(current_token, ("error",))
        if trace:
            trace(stack, symbols, input_index, action)
        if action[0] == "accept":
            return (stack[-1][1], [])
        elif action[0] == "reduce":
            name = action[1]
            size = action[2]
            # Split at an absolute index rather than with negative slices:
            # for an empty production size is 0, and stack[-0:] is the whole
            # stack while stack[:-0] is empty, which would corrupt the parse.
            split = len(stack) - size
            value = (name, tuple(s[1] for s in stack[split:]))
            stack = stack[:split]
            goto = table[stack[-1][0]].get(name, ("error",))
            assert goto[0] == "goto"  # Corrupt table?
            stack.append((goto[1], value))
        elif action[0] == "shift":
            stack.append((action[1], (current_token, ())))
            input_index += 1
        elif action[0] == "error":
            if input_index >= len(tokens.tokens):
                # The offending symbol is the synthetic "$": there is no
                # token to take a position from.
                raise ValueError("Unexpected end of file")
            else:
                (_, start, _) = tokens.tokens[input_index]
                # The number of newlines before the token is its line index.
                line_index = bisect.bisect_left(tokens.lines, start)
                if line_index == 0:
                    col_start = 0
                else:
                    col_start = tokens.lines[line_index - 1] + 1
                column_index = start - col_start
                line_index += 1
                return (
                    None,
                    [
                        f"{line_index}:{column_index}: Syntax error: unexpected symbol {current_token}"
                    ],
                )
 def harness(lexer_func, grammar_func, start_rule, source_path):
    """Build a parse table, report its size, and optionally parse a file.

    grammar_func() must return an object with build_table(start=, generator=);
    lexer_func(src) must return the token object that parse() consumes. When
    source_path is falsy only the table statistics are printed. Any syntax
    errors reported by parse() are printed, nothing is returned.
    """
    # The author left parser.GenerateLR1 commented out as the alternative.
    table = grammar_func().build_table(start=start_rule, generator=parser.GenerateLALR)

    state_count = len(table)
    row_sizes = [len(row) for row in table]
    print(f"{state_count} states")
    print(f"{sum(row_sizes) / state_count} average, {max(row_sizes)} max")

    if not source_path:
        return

    with open(source_path, "r", encoding="utf-8") as source_file:
        text = source_file.read()
    tokens = lexer_func(text)

    _tree, errors = parse(table, tokens)
    if errors:
        print(f"{len(errors)} errors:")
        for message in errors:
            print(f"  {message}")
 if __name__ == "__main__":
    import sys

    # Exactly one command-line argument names a source file to parse; with
    # any other argument count only the table statistics are printed.
    cli_args = sys.argv[1:]
    harness(
        lexer_func=grammar.FineTokens,
        grammar_func=grammar.FineGrammar,
        start_rule="file",
        source_path=cli_args[0] if len(cli_args) == 1 else None,
    )
    # print(parser_faster.format_table(gen, table))
    # print()
    # tree = parse(table, ["id", "+", "(", "id", "[", "id", "]", ")"])
--- a/parser.py
+++ b/parser.py
@ -257,6 +257,14 @@ class Configuration:
            lookahead=(),
        )
    def replace_lookahead(self, lookahead: typing.Tuple[int, ...]):
        """Return a new Configuration with the same name, symbols and
        position as this one, but carrying `lookahead`.
        """
        unchanged = {
            "name": self.name,
            "symbols": self.symbols,
            "position": self.position,
        }
        return Configuration(lookahead=lookahead, **unchanged)
    @property
    def rest(self):
        """The symbols that follow the one at the current position."""
        next_index = self.position + 1
        return self.symbols[next_index:]
@ -1382,57 +1390,67 @@ class GenerateLALR(GenerateLR1):
    use a bunch of improvement, probably.)
    """
-    def merge_sets(self, config_set_a, config_set_b):
+    def gen_sets(self, config_set: typing.Tuple[Configuration, ...]) -> ConfigurationSetInfo:
        """Merge the two config sets, by keeping the item cores but merging
        the lookahead sets for each item.
        """
        assert len(config_set_a) == len(config_set_b)
        merged = []
        for index, a in enumerate(config_set_a):
            b = config_set_b[index]
            assert a.clear_lookahead() == b.clear_lookahead()
            new_lookahead = a.lookahead + b.lookahead
            new_lookahead = tuple(sorted(set(new_lookahead)))
            merged.append(a.clear_lookahead())
        return tuple(merged)
    def sets_equal(self, a, b):
        a_no_la = tuple(s.clear_lookahead() for s in a)
        b_no_la = tuple(s.clear_lookahead() for s in b)
        return a_no_la == b_no_la
    def gen_sets(self, config_set) -> ConfigurationSetInfo:
        """Recursively generate all configuration sets starting from the
-        provided set, and merge them with the provided set 'F'.
+        provided set.
        The difference between this method and the one in GenerateLR0, where
-        this comes from, is in the part that stops recursion. In LALR we
+        this comes from, is that we're going to be keeping track of states
-        compare for set equality *ignoring lookahead*. If we find a match,
+        that we found that are equivalent in lookahead.
        then instead of returning F unchanged, we merge the two equal sets
        and replace the set in F, returning the modified set.
        """
        #
        # First, do the actual walk. Don't merge yet: just keep track of all
        # the config sets that need to be merged.
        #
        F = {}
        seen = set()
        successors = []
        pending = [config_set]
        while len(pending) > 0:
            config_set = pending.pop()
            if config_set in seen:
                continue
            seen.add(config_set)
            config_set_no_la = tuple(s.clear_lookahead() for s in config_set)
            existing = F.get(config_set_no_la)
            if existing is not None:
-                F[config_set_no_la] = self.merge_sets(config_set, existing)
+                existing.append(config_set)
            else:
-                F[config_set_no_la] = config_set
+                F[config_set_no_la] = [config_set]
-                for symbol, successor in self.gen_all_successors(config_set):
+
-                    successor_no_la = tuple(s.clear_lookahead() for s in successor)
+            for symbol, successor in self.gen_all_successors(config_set):
-                    successors.append((config_set_no_la, symbol, successor_no_la))
+                successor_no_la = tuple(s.clear_lookahead() for s in successor)
-                    pending.append(successor)
+                successors.append((config_set_no_la, symbol, successor_no_la))
                pending.append(successor)
        # Now we gathered the sets, merge them all.
        final_sets = {}
        for key, config_sets in F.items():
            new_config_set = []
            config_groupings = [[] for _ in range(len(config_sets[0]))]
            for config_set in config_sets:
                for i, config in enumerate(config_set):
                    config_groupings[i].append(config)
            for config_group in config_groupings:
                new_lookahead = [l for config in config_group for l in config.lookahead]
                new_lookahead = tuple(sorted(set(new_lookahead)))
                new_config_set.append(
                    Configuration(
                        name=config_group[0].name,
                        symbols=config_group[0].symbols,
                        position=config_group[0].position,
                        lookahead=new_lookahead,
                    )
                )
            final_sets[key] = tuple(new_config_set)
        # Register all the actually merged, final config sets.
        result = ConfigurationSetInfo()
-        for config_set in F.values():
+        for config_set in final_sets.values():
            result.register_config_set(config_set)
        # Now record all the successors that we found. Of course, the actual
@ -1443,10 +1461,10 @@ class GenerateLALR(GenerateLR1):
        # so we can find the final sets, then look them up in the registered
        # sets, and actually register the successor.
        for config_set_no_la, symbol, successor_no_la in successors:
-            actual_config_set = F[config_set_no_la]
+            actual_config_set = final_sets[config_set_no_la]
            from_index = result.config_set_key[actual_config_set]
-            actual_successor = F[successor_no_la]
+            actual_successor = final_sets[successor_no_la]
            to_index = result.config_set_key[actual_successor]
            result.add_successor(from_index, symbol, to_index)
@ -1499,7 +1517,7 @@ class Token(Rule):
    def __init__(self, value):
        self.value = sys.intern(value)
-    def flatten(self) -> typing.Generator[list[str], None, None]:
+    def flatten(self) -> typing.Generator[list["str | Token"], None, None]:
        # We are just ourselves when flattened.
        yield [self]
@ -1546,7 +1564,7 @@ class AlternativeRule(Rule):
        self.left = left
        self.right = right
-    def flatten(self) -> typing.Generator[list[str], None, None]:
+    def flatten(self) -> typing.Generator[list[str | Token], None, None]:
        # All the things from the left of the alternative, then all the things
        # from the right, never intermingled.
        yield from self.left.flatten()
@ -1562,7 +1580,7 @@ class SequenceRule(Rule):
        self.first = first
        self.second = second
-    def flatten(self) -> typing.Generator[list[str], None, None]:
+    def flatten(self) -> typing.Generator[list[str | Token], None, None]:
        # All the things in the prefix....
        for first in self.first.flatten():
            # ...potentially followed by all the things in the suffix.
@ -1575,7 +1593,7 @@ class NothingRule(Rule):
    these, you're probably better off just using the singleton `Nothing`.
    """
-    def flatten(self) -> typing.Generator[list[str], None, None]:
+    def flatten(self) -> typing.Generator[list[str | Token], None, None]:
        # It's quiet in here.
        yield []
@ -1583,7 +1601,7 @@ class NothingRule(Rule):
 Nothing = NothingRule()
-def seq(*args: list[Rule]) -> Rule:
+def seq(*args: Rule) -> Rule:
    """A rule that matches a sequence of rules.
    (A helper function that combines its arguments into nested sequences.)
@ -1594,17 +1612,15 @@ def seq(*args: list[Rule]) -> Rule:
    return result
-@typing.overload
+# @typing.overload
-def rule(name: None | str = None) -> typing.Callable[[typing.Callable], Rule]: ...
+# def rule(f: None | str = None) -> typing.Callable[[typing.Callable], Rule]: ...
-@typing.overload
+# @typing.overload
-def rule(fn: typing.Callable) -> Rule: ...
+# def rule(f: typing.Callable) -> Rule: ...
-def rule(
+def rule(f: typing.Callable) -> Rule:
    name_or_fn: None | str | typing.Callable = None,
 ) -> Rule | typing.Callable[[typing.Callable], Rule]:
    """The decorator that marks a method in a Grammar object as a nonterminal
    rule.
@ -1612,16 +1628,11 @@ def rule(
    If called with one argument, that argument is a name that overrides the name
    of the nonterminal, which defaults to the name of the function.
    """
    name = f.__name__
    return NonTerminal(f, name)
    def _rule(callable):
        return NonTerminal(callable, name)
-    if callable(name_or_fn):
+PrecedenceList = list[typing.Tuple[Assoc, list[Rule]]]
        name = name_or_fn.__name__
        return _rule(name_or_fn)
    else:
        name = name_or_fn
        return _rule
 class Grammar:
@ -1650,12 +1661,13 @@ class Grammar:
    Not very exciting, perhaps, but it's something.
    """
-    def __init__(self, precedence: list[typing.Tuple[Assoc, list[Token | NonTerminal]]] = None):
+    def __init__(self, precedence: PrecedenceList | None = None):
        if precedence is None:
            precedence = getattr(self, "precedence", [])
        assert precedence is not None
        precedence_table = {}
-        for precedence, (associativity, symbols) in enumerate(precedence):
+        for prec, (associativity, symbols) in enumerate(precedence):
            for symbol in symbols:
                if isinstance(symbol, Token):
                    key = symbol.value
@ -1664,7 +1676,7 @@ class Grammar:
                else:
                    raise ValueError(f"{symbol} must be either a Token or a NonTerminal")
-                precedence_table[key] = (associativity, precedence + 1)
+                precedence_table[key] = (associativity, prec + 1)
        self._precedence = precedence_table
Author	SHA1	Message	Date
John Doty	bde22a5c99	A little faster but I think I might be doing this wrong.	2024-05-28 04:58:23 -07:00
John Doty	8d58c64040	Fix LALR. Small, but very very slow.	2024-05-27 22:31:33 -07:00
John Doty	0fc04cf11e	Fix grammar bugs, work on debugging harness.	2024-05-27 19:02:10 -07:00
John Doty	797ec8cd76	Correct type annotations for pyright Work around microsoft/pyright#8008.	2024-05-27 06:50:15 -07:00