faster: Precedence support, necessary for IfStatement

2024-04-17 11:06:14 -07:00 · 2024-04-17 11:06:14 -07:00 · d0be3ea267
commit d0be3ea267
parent c100613ff5
1 changed files with 109 additions and 65 deletions
--- a/parser_faster.py
+++ b/parser_faster.py
@ -3,10 +3,13 @@ might expect the code did NOT work acceptibly.

 This version has some performance work done.

+It also supports precedence.
+
 2023
 """
 import collections
 import dataclasses
+import enum
 import typing


@ -196,6 +199,14 @@ class ConfigurationSetInfo:
        raise KeyError("Unable to find a path to the target set!")


+class Assoc(enum.Enum):
+    """Associativity of a rule."""
+    NONE = 0
+    LEFT = 1
+    RIGHT = 2
+
+
+
 class ErrorCollection:
    errors: dict[ConfigSet, dict[int, dict[Configuration, typing.Tuple]]]

@ -259,15 +270,17 @@ class ErrorCollection:


 class TableBuilder(object):
-    row: None | list[typing.Tuple[None | typing.Tuple, None | Configuration]]
-    table: list[dict[str, typing.Tuple]]
-    config_sets: dict[ConfigSet, int] # Map config sets to rows.
    errors: ErrorCollection
+    table: list[dict[str, typing.Tuple]]
+    alphabet: list[str]
+    precedence: typing.Tuple[typing.Tuple[Assoc, int], ...]
+    row: None | list[typing.Tuple[None | typing.Tuple, None | Configuration]]

-    def __init__(self, alphabet: list[str]):
+    def __init__(self, alphabet: list[str], precedence: typing.Tuple[typing.Tuple[Assoc, int], ...]):
        self.errors = ErrorCollection()
        self.table = []
        self.alphabet = alphabet
+        self.precedence = precedence
        self.row = None

    def flush(self, all_sets: ConfigurationSetInfo):
@ -322,13 +335,56 @@ class TableBuilder(object):
            assert existing_config is not None
            assert config is not None

-            # Record the conflicts.
-            self.errors.add_error(self.current_config_set, symbol_id, existing_config, existing)
-            self.errors.add_error(self.current_config_set, symbol_id, config, action)
+            # Maybe we can resolve the conflict with precedence?
+            existing_assoc, existing_prec = self.precedence[existing_config.name]
+            new_assoc, new_prec = self.precedence[config.name]
+
+            if existing_prec > new_prec:
+                # Precedence of the action in the table already wins, do nothing.
+                return
+
+            elif existing_prec == new_prec:
+                # It's an actual conflict, use associativity if we can.
+                # If there's a conflict in associativity then it's a real conflict!
+                assoc = Assoc.NONE
+                if existing_assoc == Assoc.NONE:
+                    assoc = new_assoc
+                elif new_assoc == Assoc.NONE:
+                    assoc = existing_assoc
+                elif new_assoc == existing_assoc:
+                    assoc = new_assoc
+
+                resolved = False
+                if assoc == Assoc.LEFT:
+                    # Prefer reduce over shift
+                    if action[0] == 'shift' and existing[0] == 'reduce':
+                        action = existing
+                        resolved = True
+                    elif action[0] == 'reduce' and existing[0] == 'shift':
+                        resolved = True
+
+                elif assoc == Assoc.RIGHT:
+                    # Prefer shift over reduce
+                    if action[0] == 'shift' and existing[0] == 'reduce':
+                        resolved = True
+                    elif action[0] == 'reduce' and existing[0] == 'shift':
+                        action = existing
+                        resolved = True
+
+                if not resolved:
+                    # Record the conflicts.
+                    self.errors.add_error(self.current_config_set, symbol_id, existing_config, existing)
+                    self.errors.add_error(self.current_config_set, symbol_id, config, action)
+
+            else:
+                # Precedence of the new action is greater than the existing
+                # action, just allow the overwrite with no change.
+                pass

        self.row[symbol_id] = (action, config)


+
 class GenerateLR0(object):
    """Generate parser tables for an LR0 parser.

@ -357,24 +413,13 @@ class GenerateLR0(object):
      ('O', []),

    means that O can be matched with nothing.
-
-    Implementation notes:
-    - This is implemented in the dumbest way possible, in order to be the
-      most understandable it can be. I built this to learn, and I want to
-      make sure I can keep learning with it.
-
-    - We tend to use tuples everywhere. This is because tuples can be
-      compared for equality and put into tables and all that jazz. They might
-      be a little bit slower in places but like I said, this is for
-      learning. (Also, if we need this to run faster we can probably go a
-      long way by memoizing results, which is much easier if we have tuples
-      everywhere.)
    """

    alphabet: list[str]
    grammar: list[list[typing.Tuple[int, ...]]]
-    nonterminals: typing.Tuple[bool, ...]
-    terminals: typing.Tuple[bool, ...]
+    nonterminal: typing.Tuple[bool, ...]
+    terminal: typing.Tuple[bool, ...]
+    precedence: typing.Tuple[typing.Tuple[Assoc, int], ...]

    symbol_key: dict[str, int]
    start_symbol: int
@ -384,7 +429,12 @@ class GenerateLR0(object):
    successors: list[set[int]]


-    def __init__(self, start: str, grammar: list[typing.Tuple[str, list[str]]]):
+    def __init__(
+        self,
+        start: str,
+        grammar: list[typing.Tuple[str, list[str]]],
+        precedence: None | dict[str, typing.Tuple[Assoc, int]] = None,
+    ):
        """Initialize the parser generator with the specified grammar and
        start symbol.
        """
@ -426,30 +476,34 @@ class GenerateLR0(object):
        # We count on python dictionaries retaining the insertion order, like
        # it or not.
        full_grammar = [list() for _ in self.alphabet]
-        terminals = [True for _ in self.alphabet]
-        assert terminals[end_symbol]
+        terminal = [True for _ in self.alphabet]
+        assert terminal[end_symbol]

-        nonterminals = [False for _ in self.alphabet]
+        nonterminal = [False for _ in self.alphabet]

        for name, rule in grammar:
            name_symbol = symbol_key[name]

-            terminals[name_symbol] = False
-            nonterminals[name_symbol] = True
+            terminal[name_symbol] = False
+            nonterminal[name_symbol] = True

            rules = full_grammar[name_symbol]
            rules.append(tuple(symbol_key[symbol] for symbol in rule))

        self.grammar = full_grammar
        self.grammar[start_symbol].append((symbol_key[start],))
-        terminals[start_symbol] = False
-        nonterminals[start_symbol] = True
+        terminal[start_symbol] = False
+        nonterminal[start_symbol] = True

-        self.terminals = tuple(terminals)
-        self.nonterminals = tuple(nonterminals)
+        self.terminal = tuple(terminal)
+        self.nonterminal = tuple(nonterminal)

-        assert self.terminals[end_symbol]
-        assert self.nonterminals[start_symbol]
+        assert self.terminal[end_symbol]
+        assert self.nonterminal[start_symbol]
+
+        if precedence is None:
+            precedence = {}
+        self.precedence = tuple(precedence.get(a, (Assoc.NONE, 0)) for a in self.alphabet)

        self.symbol_key = symbol_key
        self.start_symbol = start_symbol
@ -497,7 +551,7 @@ class GenerateLR0(object):

        return tuple(sorted(closure)) # TODO: Why tuple?

-    def gen_successor(self, config_set: typing.Iterable[Configuration], symbol: str) -> ConfigSet:
+    def gen_successor(self, config_set: typing.Iterable[Configuration], symbol: int) -> ConfigSet:
        """Compute the successor state for the given config set and the
        given symbol.

@ -564,7 +618,7 @@ class GenerateLR0(object):

        In an LR0 parser, this is just the set of all terminals."""
        del(config)
-        return [index for index, value in enumerate(self.terminals) if value]
+        return [index for index, value in enumerate(self.terminal) if value]

    def gen_table(self):
        """Generate the parse table.
@ -595,7 +649,7 @@ class GenerateLR0(object):
        Anything missing from the row indicates an error.
        """
        config_sets = self.gen_all_sets()
-        builder = TableBuilder(self.alphabet)
+        builder = TableBuilder(self.alphabet, self.precedence)

        for config_set_id, config_set in enumerate(config_sets.sets):
            builder.new_row(config_set)
@ -610,13 +664,13 @@ class GenerateLR0(object):
                    else:
                        builder.set_table_accept(self.end_symbol, config)

-                elif self.terminals[config_next]:
+                elif self.terminal[config_next]:
                    index = successors[config_next]
                    builder.set_table_shift(config_next, index, config)

            # Gotos
            for symbol, index in successors.items():
-                if self.nonterminals[symbol]:
+                if self.nonterminal[symbol]:
                    builder.set_table_goto(symbol, index)

        return builder.flush(config_sets)
@ -700,27 +754,22 @@ class FirstInfo:
    @classmethod
    def from_grammar(
        cls,
-        alphabet: list[str],
        grammar: list[list[typing.Tuple[int,...]]],
-        terminals: typing.Tuple[bool, ...],
+        terminal: typing.Tuple[bool, ...],
    ):
-        # print("******* GENERATING FIRSTS ********")
-
        # Add all terminals to their own firsts
        firsts = []
-        for index, is_terminal in enumerate(terminals):
+        for index, is_terminal in enumerate(terminal):
            firsts.append(set())
            if is_terminal:
                firsts[index].add(index)

-        epsilons = [False for _ in terminals]
+        epsilons = [False for _ in terminal]
        changed = True
        while changed:
-            # print("========= ITERATION")
            changed = False
            for name, rules in enumerate(grammar):
                f = firsts[name]
-                # print(f"    {alphabet[name]} -> {[alphabet[s] for s in f]}")
                for rule in rules:
                    if len(rule) == 0:
                        changed = changed or not epsilons[name]
@ -728,11 +777,7 @@ class FirstInfo:
                        continue

                    for index, symbol in enumerate(rule):
-                        # if terminals[symbol]:
-                        #     changed = add_changed(f, symbol) or changed
-                        # else:
                        other_firsts = firsts[symbol]
-                        # print(f"        adding {alphabet[symbol]} -> {[alphabet[s] for s in other_firsts]}")
                        changed = update_changed(f, other_firsts) or changed

                        is_last = index == len(rule) - 1
@ -750,7 +795,6 @@ class FirstInfo:
                            # looping through the symbols in this rule.
                            break

-        # print("******* DONE GENERATING FIRSTS ********")
        return FirstInfo(firsts=firsts, is_epsilon=epsilons)

@dataclasses.dataclass(frozen=True)
@ -761,7 +805,7 @@ class FollowInfo:
    def from_grammar(
        cls,
        grammar: list[list[typing.Tuple[int,...]]],
-        terminals: typing.Tuple[bool, ...],
+        terminal: typing.Tuple[bool, ...],
        start_symbol: int,
        end_symbol: int,
        firsts: FirstInfo,
@ -778,7 +822,7 @@ class FollowInfo:
                    prev_symbol = None
                    for symbol in reversed(rule):
                        f = follows[symbol]
-                        if terminals[symbol]:
+                        if terminal[symbol]:
                            # This particular rule can't produce epsilon.
                            epsilon = False
                            prev_symbol = symbol
@ -826,10 +870,10 @@ class GenerateSLR1(GenerateLR0):

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
-        self._firsts = FirstInfo.from_grammar(self.alphabet, self.grammar, self.terminals)
+        self._firsts = FirstInfo.from_grammar(self.grammar, self.terminal)
        self._follows = FollowInfo.from_grammar(
            self.grammar,
-            self.terminals,
+            self.terminal,
            self.start_symbol,
            self.end_symbol,
            self._firsts,
@ -1049,24 +1093,24 @@ def format_table(generator, table):
        elif action[0] == 'reduce':
            return 'r' + str(action[1])

-    terminals = [
+    terminals = list(sorted(
        generator.alphabet[i]
-        for i,v in enumerate(generator.terminals)
+        for i,v in enumerate(generator.terminal)
        if v
-    ]
-    nonterminals = [
+    ))
+    nonterminals = list(sorted(
        generator.alphabet[i]
-        for i,v in enumerate(generator.nonterminals)
+        for i,v in enumerate(generator.nonterminal)
        if v
-    ]
+    ))
    header = "    | {terms} | {nts}".format(
        terms=' '.join(
            '{0: <6}'.format(terminal)
-            for terminal in sorted(terminals)
+            for terminal in terminals
        ),
        nts=' '.join(
            '{0: <5}'.format(nt)
-            for nt in sorted(nonterminals)
+            for nt in nonterminals
        ),
    )

@ -1078,11 +1122,11 @@ def format_table(generator, table):
            index=i,
            actions=' '.join(
                '{0: <6}'.format(format_action(row, terminal))
-                for terminal in sorted(terminals)
+                for terminal in terminals
            ),
            gotos=' '.join(
                '{0: <5}'.format(row.get(nt, ('error', ''))[1])
-                for nt in sorted(nonterminals)
+                for nt in nonterminals
            ),
        )
        for i, row in enumerate(table)