From 0a0f7b3612d35e6f75ed6f23e5ab4a27257a9658 Mon Sep 17 00:00:00 2001
From: John Doty
Date: Sun, 27 Oct 2024 08:36:16 -0700
Subject: [PATCH] [parser] Comment cleanup, documentation cleanup

---
 parser/parser.py  |  11 +--
 parser/runtime.py | 173 +++++++++++++++++++++++++++++-----------------
 2 files changed, 113 insertions(+), 71 deletions(-)

diff --git a/parser/parser.py b/parser/parser.py
index 2a7c872..e0c8c3f 100644
--- a/parser/parser.py
+++ b/parser/parser.py
@@ -142,9 +142,7 @@ import typing
 
 
 ###############################################################################
-# LR0
-#
-# We start with LR0 parsers, because they form the basis of everything else.
+# Parser Generator
 ###############################################################################
 class Configuration(typing.NamedTuple):
     """A core configuration, basically, a position within a rule.
@@ -1218,11 +1216,8 @@ class ParserGenerator:
         # token more than once.
         seen: set[int] = set()
 
-        # cnd_[rule|token]_weaklies represent which states are possible weakly
-        # compatible matches for a given symbol.
-        #
-        # DOTY: As with `seen`, we have a uniform space so we can have a
-        # uniform one of these too.
+        # cnd_weaklies represents which states are possible weakly compatible
+        # matches for a given symbol.
         cnd_weaklies: list[list[int]] = [[] for _ in range(len(self.alphabet))]
 
         todo = 1  # How many None values are there in closed_states?
diff --git a/parser/runtime.py b/parser/runtime.py
index 6749978..9f3d30e 100644
--- a/parser/runtime.py
+++ b/parser/runtime.py
@@ -113,6 +113,13 @@ class RepairStack(typing.NamedTuple):
     def handle_token(
         self, table: parser.ParseTable, token: str
     ) -> typing.Tuple["RepairStack | None", bool]:
+        """Pretend we received this token during a repair.
+
+        This is *incredibly* annoying: basically another implementation of the
+        shift/reduce machine. We need to do this in order to simulate the effect
+        of receiving a token of the given type, so that we know what state the
+        world will be in if we (hypothetically) take a given action.
+        """
         rl = recover_log
 
         stack = self
@@ -179,7 +186,8 @@ class Repair:
         table: parser.ParseTable,
         input: list[TokenValue],
         start: int,
-    ):
+    ) -> typing.Iterable["Repair"]:
+        """Generate all the possible next repairs from this one."""
         input_index = start + self.advance
         current_token = input[input_index].kind
 
@@ -191,6 +199,11 @@ class Repair:
 
         state = self.stack.state
 
+        # First, generate all the neighbors that involve either consuming the
+        # current token or generating a new one and consuming *that.* For each
+        # case, we need to run the shift-reduce machine to figure out what the
+        # new state will be after consuming the token.
+        #
         # For insert: go through all the actions and run all the possible
         # reduce/accepts on them. This will generate a *new stack* which we
         # then capture with an "Insert" repair action. Do not manipulate the
@@ -254,6 +267,11 @@ class Repair:
 
 
 def recover(table: parser.ParseTable, input: list[TokenValue], start: int, stack: ParseStack):
+    """An implementation of CPCT+ for automated error recovery.
+
+    Given a current parse state, attempt to produce a series of modifications to
+    the token stream such that the parse will continue successfully.
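+
+    Returns the sequence of repair steps to apply to the token stream on
+    success, or None if no repair sequence could be found.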
+    """
     rl = recover_log
     initial = Repair(
         repair=RepairAction.Base,
@@ -270,11 +288,16 @@
     while queue_index < len(queue):
         repair = queue[queue_index]
 
-        # NOTE: This is guaranteed to be the cheapest possible success-
-        #       there can be no success cheaper than this one. Since
-        #       we're going to pick one arbitrarily, this one might as
-        #       well be it.
         if repair.success:
+            # If the repair at the top of the queue indicates success, then
+            # we will just take it. This is guaranteed to be one of the
+            # cheapest repairs because we know that every repair on this level
+            # of the queue has the same cost and every repair on a
+            # subsequent level has a *higher* cost.
+            #
+            # (The CPCT+ paper gathers all repairs and asks the user to choose,
+            # but I want fully automated recovery so I'll be picking arbitrarily,
+            # and, well, picking *this* one meets the definition of arbitrary.)
             repairs: list[Repair] = []
             while repair is not None:
                 repairs.append(repair)
@@ -286,6 +309,11 @@
                 rl.info("  " + repr(repair))
             return repairs
 
+        # NOTE: a neighbor can be on the same queue level! As a result, we
+        #       must use this index + append scheme, and we must not "scan
+        #       for successes and then generate neighbors" because
+        #       generating neighbors might actually generate a success on
+        #       the current level.
         for neighbor in repair.neighbors(table, input, start):
             for _ in range((neighbor.cost - len(todo_queue)) + 1):
                 todo_queue.append([])
@@ -309,6 +337,55 @@ class TokenStream(typing.Protocol):
         ...
 
 
+def prepare_tokens(
+    input_tokens: list[typing.Tuple[parser.Terminal, int, int]],
+    trivia_tokens: set[str],
+) -> list[TokenValue]:
+    """Filter the list of input tokens into a list of non-trivia tokens, with
+    associated trivia lists. Also, stick an EOF on the end of the token list
+    to make *sure* the input is terminated.
+    """
+    input: list[TokenValue] = []
+    trivia: list[TokenValue] = []
+    for kind, start, length in input_tokens:
+        assert kind.name is not None
+        if kind.name in trivia_tokens:
+            trivia.append(
+                TokenValue(
+                    kind=kind.name,
+                    start=start,
+                    end=start + length,
+                    pre_trivia=[],
+                    post_trivia=[],
+                )
+            )
+        else:
+            prev_trivia = trivia
+            trivia = []
+
+            input.append(
+                TokenValue(
+                    kind=kind.name,
+                    start=start,
+                    end=start + length,
+                    pre_trivia=prev_trivia,
+                    post_trivia=trivia,
+                )
+            )
+
+    eof = 0 if len(input) == 0 else input[-1].end
+    input.append(
+        TokenValue(
+            kind="$",
+            start=eof,
+            end=eof,
+            pre_trivia=trivia,
+            post_trivia=[],
+        )
+    )
+    return input
+
+
 class Parser:
     table: parser.ParseTable
 
@@ -316,50 +393,16 @@
     def __init__(self, table: parser.ParseTable):
         self.table = table
 
     def parse(self, tokens: TokenStream) -> typing.Tuple[Tree | None, list[str]]:
-        input_tokens = tokens.tokens()
+        """Parse a token stream into a tree, returning both the root of the tree
+        (if any could be found) and a list of errors that were encountered during
+        the parse.
 
-        # Filter the input tokens, to generate a list of non-trivia tokens.
-        # In addition, track the trivia tokens we find along the way, and put
-        # them into a list attached to each non-trivia token, so we can
-        # actually recover the document *as written*.
-        input: list[TokenValue] = []
-        trivia: list[TokenValue] = []
-        for kind, start, length in input_tokens:
-            assert kind.name is not None
-            if kind.name in self.table.trivia:
-                trivia.append(
-                    TokenValue(
-                        kind=kind.name,
-                        start=start,
-                        end=start + length,
-                        pre_trivia=[],
-                        post_trivia=[],
-                    )
-                )
-            else:
-                prev_trivia = trivia
-                trivia = []
-
-                input.append(
-                    TokenValue(
-                        kind=kind.name,
-                        start=start,
-                        end=start + length,
-                        pre_trivia=prev_trivia,
-                        post_trivia=trivia,
-                    )
-                )
-
-        eof = 0 if len(input) == 0 else input[-1].end
-        input = input + [
-            TokenValue(
-                kind="$",
-                start=eof,
-                end=eof,
-                pre_trivia=trivia,
-                post_trivia=[],
-            )
-        ]
+        This parse method does automated error recovery. Tree nodes that were
+        generated as a result of error recovery will be noticeable because they
+        will be zero characters wide.
+        """
+        # Prepare the incoming token stream into only meaningful tokens.
+        input = prepare_tokens(tokens.tokens(), self.table.trivia)
 
         input_index = 0
 
         # Our stack is a stack of tuples, where the first entry is the state
@@ -386,12 +429,15 @@
             match action:
                 case parser.Accept():
+                    # We are at the end of the parse and we're done.
                     r = stack[-1][1]
                     assert isinstance(r, Tree)
                     result = r
                     break
 
                 case parser.Reduce(name=name, count=size, transparent=transparent):
+                    # Reduce a nonterminal: consume children from the stack, and
+                    # make a new tree node, then jump to the next state.
                     children: list[TokenValue | Tree] = []
                     if size > 0:
                         for _, c in stack[-size:]:
@@ -421,10 +467,13 @@
                 stack.append((goto, value))
 
             case parser.Shift():
+                # Consume a token.
                stack.append((action.state, current_token))
                input_index += 1
 
            case parser.Error():
+                # Oh no, something went wrong! Record the error, then
+                # attempt to repair the token sequence.
                if current_token.kind == "$":
                    message = "Syntax error: Unexpected end of file"
                else:
@@ -438,18 +487,22 @@
                        )
                    )
 
+                    # See if we can find a series of patches to the token stream
+                    # that will allow us to continue parsing.
                    repairs = recover(self.table, input, input_index, stack)
 
                    # If we were unable to find a repair sequence, then just
-                    # quit here; we have what we have. We *should* do our
-                    # best to generate a tree, but I'm not sure if we can?
+                    # quit here: we didn't even manage to make a tree. It would
+                    # be nice if we could create a tree in this case but I'm not
+                    # entirely sure how to do it.
                    if repairs is None:
                        break
 
                    # If we *were* able to find a repair, apply it to
-                    # the token stream and continue moving. It is guaranteed
-                    # that we will not generate an error until we get to the
-                    # end of the stream that we found.
+                    # the token stream. The repair is a series of insertions,
+                    # deletions, and consumptions of tokens in the stream. We
+                    # patch up the token stream inline with the repaired changes
+                    # so that we have a valid token stream again.
                    cursor = input_index
                    for repair in repairs:
                        match repair.repair:
@@ -485,6 +538,10 @@
                        case _:
                            typing.assert_never(repair.repair)
 
+                    # Now we can just keep running: don't change state or
+                    # position in the token stream or anything; the stream is
+                    # now good enough for us to keep parsing for a while.
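+                    # (If we hit another error further along, we will simply
+                    # land back in this case and repair again.)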
+
                case _:
                    typing.assert_never(action)
 
@@ -515,8 +572,6 @@ def generic_tokenize(
     last_accept = None
     last_accept_pos = 0
 
-    # print(f"LEXING: {src} ({len(src)})")
-
     while pos < len(src):
         while state is not None:
             accept, edges = table[state]
@@ -524,31 +579,24 @@ def generic_tokenize(
             if accept is not None:
                 last_accept = accept
                 last_accept_pos = pos
 
-            # print(f"  @ {pos} state: {state} ({accept})")
             if pos >= len(src):
                 break
 
             char = ord(src[pos])
-            # print(f"  -> char: {char} ({repr(src[pos])})")
 
             # Find the index of the span where the upper value is the tightest
             # bound on the character.
             state = None
             index = bisect.bisect_right(edges, char, key=lambda x: x[0].upper)
-            # print(f"  -> {index}")
             if index < len(edges):
                 span, target = edges[index]
-                # print(f"  -> {span}, {target}")
                 if char >= span.lower:
-                    # print(f"      -> target: {target}")
                     state = target
                     pos += 1
                 else:
-                    # print(f"    Nope (outside range)")
                     pass
             else:
-                # print(f"    Nope (at end)")
                 pass
 
         if last_accept is None:
@@ -556,7 +604,6 @@
 
         yield (last_accept, start, last_accept_pos - start)
 
-        # print(f"  Yield: {last_accept}, reset to {last_accept_pos}")
         last_accept = None
         pos = last_accept_pos
         start = pos
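
-- 
Reviewer note, not part of the patch: a minimal sketch of how the newly
extracted `prepare_tokens` helper behaves, in case it helps with review. The
`FakeTerminal` type and the import path are assumptions made for illustration;
all `prepare_tokens` actually relies on is a terminal object with a `.name`
attribute.

    import typing

    from parser.runtime import prepare_tokens  # import path assumed

    class FakeTerminal(typing.NamedTuple):
        name: str

    WS = FakeTerminal("WS")        # hypothetical trivia terminal
    IDENT = FakeTerminal("IDENT")  # hypothetical non-trivia terminal

    # (terminal, start, length) tuples, as a tokenizer would emit them.
    raw = [(IDENT, 0, 3), (WS, 3, 1), (IDENT, 4, 5)]
    tokens = prepare_tokens(raw, trivia_tokens={"WS"})

    # Trivia is filtered out of the main stream, attached to the tokens on
    # either side of it, and a zero-width EOF token is always appended.
    assert [t.kind for t in tokens] == ["IDENT", "IDENT", "$"]
    assert tokens[0].post_trivia[0].kind == "WS"
    assert tokens[1].pre_trivia[0].kind == "WS"
    assert tokens[-1].start == tokens[-1].end == 9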