Fix LALR. Small, but very, very slow.

2024-05-27 22:31:33 -07:00 · 2024-05-27 22:31:33 -07:00 · 8d58c64040
commit 8d58c64040
parent 0fc04cf11e
2 changed files with 45 additions and 19 deletions
--- a/harness.py
+++ b/harness.py
@@ -2,10 +2,22 @@ import bisect
 import typing

 import grammar
-from parser import Token, Grammar, rule, seq
+import parser
+
+# from parser import Token, Grammar, rule, seq


-def parse(table, tokens, trace=False):
+def trace_state(stack, input, input_index, action):
+    print(
+        "{stack: <20}  {input: <50}  {action: <5}".format(
+            stack=repr([s[0] for s in stack]),
+            input=repr(input[input_index : input_index + 4]),
+            action=repr(action),
+        )
+    )
+
+
+def parse(table, tokens, trace=None):
    """Parse the input with the generated parsing table and return the
    concrete syntax tree.

@@ -35,13 +47,7 @@ def parse(table, tokens, trace=False):

        action = table[current_state].get(current_token, ("error",))
        if trace:
-            print(
-                "{stack: <20}  {input: <50}  {action: <5}".format(
-                    stack=repr([s[0] for s in stack]),
-                    input=repr(input[input_index : input_index + 4]),
-                    action=repr(action),
-                )
-            )
+            trace(stack, input, input_index, action)

        if action[0] == "accept":
            return (stack[-1][1], [])
@@ -83,7 +89,9 @@ def parse(table, tokens, trace=False):


 def harness(lexer_func, grammar_func, start_rule, source_path):
-    table = grammar_func().build_table(start=start_rule)
+    # generator = parser.GenerateLR1
+    generator = parser.GenerateLALR
+    table = grammar_func().build_table(start=start_rule, generator=generator)
    print(f"{len(table)} states")

    average_entries = sum(len(row) for row in table) / len(table)
@@ -96,7 +104,7 @@ def harness(lexer_func, grammar_func, start_rule, source_path):
        tokens = lexer_func(src)
        # print(f"{tokens.lines}")
        # tokens.dump(end=5)
-        (_, errors) = parse(table, tokens, trace=True)
+        (_, errors) = parse(table, tokens)
        if len(errors) > 0:
            print(f"{len(errors)} errors:")
            for error in errors:
--- a/parser.py
+++ b/parser.py
@@ -257,6 +257,14 @@ class Configuration:
            lookahead=(),
        )

+    def replace_lookahead(self, lookahead: typing.Tuple[int, ...]):
+        return Configuration(
+            name=self.name,
+            symbols=self.symbols,
+            position=self.position,
+            lookahead=lookahead,
+        )
+
    @property
    def rest(self):
        return self.symbols[(self.position + 1) :]
@@ -1382,7 +1390,11 @@ class GenerateLALR(GenerateLR1):
    use a bunch of improvement, probably.)
    """

-    def merge_sets(self, config_set_a, config_set_b):
+    def merge_sets(
+        self,
+        config_set_a: typing.Tuple[Configuration, ...],
+        config_set_b: typing.Tuple[Configuration, ...],
+    ):
        """Merge the two config sets, by keeping the item cores but merging
        the lookahead sets for each item.
        """
@@ -1394,7 +1406,7 @@ class GenerateLALR(GenerateLR1):

            new_lookahead = a.lookahead + b.lookahead
            new_lookahead = tuple(sorted(set(new_lookahead)))
-            merged.append(a.clear_lookahead())
+            merged.append(a.replace_lookahead(new_lookahead))

        return tuple(merged)

@@ -1403,7 +1415,7 @@ class GenerateLALR(GenerateLR1):
        b_no_la = tuple(s.clear_lookahead() for s in b)
        return a_no_la == b_no_la

-    def gen_sets(self, config_set) -> ConfigurationSetInfo:
+    def gen_sets(self, config_set: typing.Tuple[Configuration, ...]) -> ConfigurationSetInfo:
        """Recursively generate all configuration sets starting from the
        provided set, and merge them with the provided set 'F'.

@@ -1414,10 +1426,15 @@ class GenerateLALR(GenerateLR1):
        and replace the set in F, returning the modified set.
        """
        F = {}
+        seen = set()
        successors = []
        pending = [config_set]
        while len(pending) > 0:
            config_set = pending.pop()
+            if config_set in seen:
+                continue
+            seen.add(config_set)
+
            config_set_no_la = tuple(s.clear_lookahead() for s in config_set)

            existing = F.get(config_set_no_la)
@@ -1425,10 +1442,11 @@ class GenerateLALR(GenerateLR1):
                F[config_set_no_la] = self.merge_sets(config_set, existing)
            else:
                F[config_set_no_la] = config_set
-                for symbol, successor in self.gen_all_successors(config_set):
-                    successor_no_la = tuple(s.clear_lookahead() for s in successor)
-                    successors.append((config_set_no_la, symbol, successor_no_la))
-                    pending.append(successor)
+
+            for symbol, successor in self.gen_all_successors(config_set):
+                successor_no_la = tuple(s.clear_lookahead() for s in successor)
+                successors.append((config_set_no_la, symbol, successor_no_la))
+                pending.append(successor)

        # Register all the actually merged, final config sets.
        result = ConfigurationSetInfo()
@@ -1723,7 +1741,7 @@ class Grammar:

        return grammar

-    def build_table(self, start: str, generator=GenerateLR1):
+    def build_table(self, start: str, generator=GenerateLALR):
        """Construct a parse table for this grammar, starting at the named
        nonterminal rule.
        """