Move terminals into grammar definition

Starting to work on machine-generated lexers too
2024-08-23 07:24:30 -07:00 · 2024-08-23 07:24:30 -07:00 · 58c3004702
commit 58c3004702
parent f6bc2ccea8
4 changed files with 917 additions and 267 deletions
--- a/parser/parser.py
+++ b/parser/parser.py
@ -21,19 +21,20 @@ To get started, create a grammar that derives from the `Grammar` class. Create
 one method per nonterminal, decorated with the `rule` decorator. Here's an
 example:

-    PLUS = Terminal('+')
-    LPAREN = Terminal('(')
-    RPAREN = Terminal(')')
-    ID = Terminal('id')

    class SimpleGrammar(Grammar):
        @rule
        def expression(self):
-            return seq(self.expression, PLUS, self.term) | self.term
+            return seq(self.expression, self.PLUS, self.term) | self.term

        @rule
        def term(self):
-            return seq(LPAREN, self.expression, RPAREN) | ID
+            return seq(self.LPAREN, self.expression, self.RPAREN) | self.ID
+
+        PLUS = Terminal('+')
+        LPAREN = Terminal('(')
+        RPAREN = Terminal(')')
+        ID = Terminal('id')


 ## Using grammars
@ -1605,10 +1606,14 @@ class Rule:
 class Terminal(Rule):
    """A token, or terminal symbol in the grammar."""

-    value: str
+    value: str | None
+    pattern: str
+    regex: bool

-    def __init__(self, value):
-        self.value = sys.intern(value)
+    def __init__(self, pattern, name=None, regex=False):
+        self.value = name
+        self.pattern = pattern
+        self.regex = regex

    def flatten(self) -> typing.Generator[list["str | Terminal"], None, None]:
        # We are just ourselves when flattened.
@ -1766,19 +1771,20 @@ class Grammar:

    Here's an example of a simple grammar:

-        PLUS = Terminal('+')
-        LPAREN = Terminal('(')
-        RPAREN = Terminal(')')
-        ID = Terminal('id')
-
        class SimpleGrammar(Grammar):
            @rule
            def expression(self):
-                return seq(self.expression, PLUS, self.term) | self.term
+                return seq(self.expression, self.PLUS, self.term) | self.term

            @rule
            def term(self):
-                return seq(LPAREN, self.expression, RPAREN) | ID
+                return seq(self.LPAREN, self.expression, self.RPAREN) | self.ID
+
+            PLUS = Terminal('+')
+            LPAREN = Terminal('(')
+            RPAREN = Terminal(')')
+            ID = Terminal('id')
+

    Not very exciting, perhaps, but it's something.
    """
@ -1786,6 +1792,7 @@ class Grammar:
    _precedence: dict[str, typing.Tuple[Assoc, int]]
    _start: str
    _generator: type[GenerateLR0]
+    _terminals: list[Terminal]

    def __init__(
        self,
@ -1809,6 +1816,14 @@ class Grammar:
            generator = getattr(self, "generator", GenerateLALR)
        assert generator is not None

+        # Fixup terminal names with the name of the member that declared it.
+        terminals = []
+        for n, t in inspect.getmembers(self, lambda x: isinstance(x, Terminal)):
+            if t.value is None:
+                t.value = n
+            terminals.append(t)
+
+        # Fix up the precedence table.
        precedence_table = {}
        for prec, (associativity, symbols) in enumerate(precedence):
            for symbol in symbols:
@ -1824,6 +1839,11 @@ class Grammar:
        self._precedence = precedence_table
        self._start = start
        self._generator = generator
+        self._terminals = terminals
+
+    @property
+    def terminals(self) -> list[Terminal]:
+        return self._terminals

    def generate_nonterminal_dict(
        self, start: str | None = None
@ -1911,3 +1931,149 @@ class Grammar:
        gen = generator(start, desugared, precedence=self._precedence, transparents=transparents)
        table = gen.gen_table()
        return table
+
+
+###############################################################################
+# Lexer support
+###############################################################################
+# For machine-generated lexers
+
+
+@dataclasses.dataclass(frozen=True, slots=True)
+class Span:
+    lower: int  # inclusive
+    upper: int  # exclusive
+
+    @classmethod
+    def from_str(cls, c: str) -> "Span":
+        return Span(lower=ord(c), upper=ord(c) + 1)
+
+    def intersects(self, other: "Span") -> bool:
+        return self.lower < other.upper and self.upper > other.lower
+
+    def split(self, other: "Span") -> tuple["Span|None", "Span", "Span|None"]:
+        assert self.intersects(other)
+
+        first = min(self.lower, other.lower)
+        second = max(self.lower, other.lower)
+        third = min(self.upper, other.upper)
+        fourth = max(self.upper, other.upper)
+
+        low = Span(first, second) if first != second else None
+        mid = Span(second, third)
+        hi = Span(third, fourth) if third != fourth else None
+
+        return (low, mid, hi)
+
+    def __str__(self) -> str:
+        if self.upper - self.lower == 1:
+            return str(self.lower)
+
+        lower = str(self.lower)
+        upper = str(self.upper)
+        return f"[{lower}-{upper})"
+
+    def __lt__(self, other: "Span") -> bool:
+        return self.lower < other.lower
+
+
+ET = typing.TypeVar("ET")
+
+
+class EdgeList[ET]:
+    """A list of edge transitions, keyed by *span*. A given span can have
+    multiple targets, because this supports NFAs."""
+
+    _edges: list[tuple[Span, list[ET]]]
+
+    def __init__(self):
+        self._edges = []
+
+    def __iter__(self) -> typing.Iterator[tuple[Span, list[ET]]]:
+        return iter(self._edges)
+
+    def __repr__(self) -> str:
+        return f"EdgeList[{','.join(str(s[0]) + '->' + repr(s[1]) for s in self._edges)}]"
+
+    def add_edge(self, c: Span, s: ET):
+        """Add an edge for the given span to the list. If there are already
+        spans that overlap this one, split and generating multiple distinct
+        edges.
+        """
+        # print(f"  Adding {c}->{s} to {self}...")
+        # Look to see where we would put this span based solely on a
+        # sort of lower bounds.
+        point = bisect.bisect_left(self._edges, c, key=lambda x: x[0])
+
+        # If this is not the first span in the list then we might
+        # overlap with the span to our left....
+        if point > 0:
+            left_point = point - 1
+            left_span, left_targets = self._edges[left_point]
+            if c.intersects(left_span):
+                # ...if we intersect with the span to our left then we
+                # must split the span to our left with regards to our
+                # span. Then we have three target spans:
+                #
+                #   - The lo one, which just has the targets from the old
+                #     left span. (This may be empty if we overlap the
+                #     left one completely on the left side.)
+                #
+                #   - The mid one, which has both the targets from the
+                #     old left and the new target.
+                #
+                #   - The hi one, which if it exists only has our target.
+                #     If it exists it basically replaces the current span
+                #     for our future processing. (If not, then our span
+                #     is completely subsumed into the left span and we
+                #     can stop.)
+                #
+                del self._edges[left_point]
+                lo, mid, hi = c.split(left_span)
+                # print(f"    <- {c} splits {left_span} -> {lo}, {mid}, {hi}  @{left_point}")
+                self._edges.insert(left_point, (mid, left_targets + [s]))
+                if lo is not None:
+                    self._edges.insert(left_point, (lo, left_targets))
+                if hi is None or not hi.intersects(c):
+                    # Yup, completely subsumed.
+                    # print(f"      result: {self} (left out)")
+                    return
+
+                # Continue processing with `c` as the hi split from the
+                # left. If the left and right spans abut each other then
+                # `c` will be subsumed in our right span.
+                c = hi
+
+        # If point is not at the very end of the list then it might
+        # overlap the span to our right...
+        if point < len(self._edges):
+            right_span, right_targets = self._edges[point]
+            if c.intersects(right_span):
+                # ...this is similar to the left case, above, except the
+                # lower bound has the targets that our only ours, etc.
+                del self._edges[point]
+                lo, mid, hi = c.split(right_span)
+                # print(f"    -> {c} splits {right_span} -> {lo}, {mid}, {hi}  @{point}")
+                if hi is not None:
+                    self._edges.insert(point, (hi, right_targets))
+                self._edges.insert(point, (mid, right_targets + [s]))
+                if lo is None or not lo.intersects(c):
+                    # Our span is completely subsumed on the lower side
+                    # of the range; there is no lower side that just has
+                    # our targets. Bail now.
+                    # print(f"      result: {self} (right out)")
+                    return
+
+                # Continue processing with `c` as the lo split, since
+                # that's the one that has only the specified state as the
+                # target.
+                c = lo
+
+        # If we made it here then either we have a point that does not
+        # intersect at all, or it only partially intersects on either the
+        # left or right. Either way, we have ensured that:
+        #
+        #  - c doesn't intersect with left or right (any more)
+        #  - point is where it should go
+        self._edges.insert(point, (c, [s]))
+        # print(f"      result: {self} (done)")