Transparent rules

Better parsing/action types Good grief
Start working on the harness
2024-05-29 09:07:19 -07:00 · 2024-05-29 07:48:55 -07:00
3 changed files with 316 additions and 122 deletions
--- a/grammar.py
+++ b/grammar.py
@@ -78,11 +78,11 @@ class FineGrammar(Grammar):

    @rule
    def file(self):
-        return self.file_statement_list
+        return self._file_statement_list

    @rule
-    def file_statement_list(self):
-        return self.file_statement | (self.file_statement_list + self.file_statement)
+    def _file_statement_list(self):
+        return self.file_statement | (self._file_statement_list + self.file_statement)

    @rule
    def file_statement(self):
--- a/harness.py
+++ b/harness.py
@@ -1,4 +1,10 @@
 import bisect
+from dataclasses import dataclass
+import enum
+import select
+import sys
+import termios
+import tty
 import typing

 import grammar
@@ -17,7 +23,13 @@ def trace_state(stack, input, input_index, action):
    )


-def parse(table, tokens, trace=None):
+@dataclass
+class Tree:
+    name: str | None
+    children: typing.Tuple["Tree | str", ...]
+
+
+def parse(table: parser.ParseTable, tokens, trace=None) -> typing.Tuple[Tree | None, list[str]]:
    """Parse the input with the generated parsing table and return the
    concrete syntax tree.

@@ -31,7 +43,7 @@ def parse(table, tokens, trace=None):
    This is not a *great* parser, it's really just a demo for what you can
    do with the table.
    """
-    input = [t.value for (t, _, _) in tokens.tokens]
+    input: list[str] = [t.value for (t, _, _) in tokens.tokens]

    assert "$" not in input
    input = input + ["$"]
@@ -40,38 +52,50 @@ def parse(table, tokens, trace=None):
    # Our stack is a stack of tuples, where the first entry is the state number
    # and the second entry is the 'value' that was generated when the state was
    # pushed.
-    stack: list[typing.Tuple[int, typing.Any]] = [(0, None)]
+    stack: list[typing.Tuple[int, str | Tree | None]] = [(0, None)]
    while True:
        current_state = stack[-1][0]
        current_token = input[input_index]

-        action = table[current_state].get(current_token, ("error",))
+        action = table.states[current_state].get(current_token, parser.Error())
        if trace:
            trace(stack, input, input_index, action)

-        if action[0] == "accept":
-            return (stack[-1][1], [])
+        match action:
+            case parser.Accept():
+                result = stack[-1][1]
+                assert isinstance(result, Tree)
+                return (result, [])

-        elif action[0] == "reduce":
-            name = action[1]
-            size = action[2]
+            case parser.Reduce(name=name, count=size, transparent=transparent):
+                children: list[str | Tree] = []
+                for _, c in stack[-size:]:
+                    if c is None:
+                        continue
+                    elif isinstance(c, Tree) and c.name is None:
+                        children.extend(c.children)
+                    else:
+                        children.append(c)

-            value = (name, tuple(s[1] for s in stack[-size:]))
+                value = Tree(name=name if not transparent else None, children=tuple(children))
                stack = stack[:-size]

-            goto = table[stack[-1][0]].get(name, ("error",))
-            assert goto[0] == "goto"  # Corrupt table?
-            stack.append((goto[1], value))
+                goto = table.states[stack[-1][0]].get(name, parser.Error())
+                assert isinstance(goto, parser.Goto)
+                stack.append((goto.state, value))

-        elif action[0] == "shift":
-            stack.append((action[1], (current_token, ())))
+            case parser.Shift(state):
+                stack.append((state, current_token))
                input_index += 1

-        elif action[0] == "error":
+            case parser.Error():
                if input_index >= len(tokens.tokens):
-                raise ValueError("Unexpected end of file")
+                    message = "Unexpected end of file"
+                    start = tokens.tokens[-1][1]
                else:
+                    message = f"Syntax error: unexpected symbol {current_token}"
                    (_, start, _) = tokens.tokens[input_index]
+
                line_index = bisect.bisect_left(tokens.lines, start)
                if line_index == 0:
                    col_start = 0
@@ -80,54 +104,154 @@ def parse(table, tokens, trace=None):
                column_index = start - col_start
                line_index += 1

-                return (
-                    None,
-                    [
-                        f"{line_index}:{column_index}: Syntax error: unexpected symbol {current_token}"
-                    ],
+                error = f"{line_index}:{column_index}: {message}"
+                return (None, [error])
+
+            case _:
+                raise ValueError(f"Unknown action type: {action}")
+
+
+# https://en.wikipedia.org/wiki/ANSI_escape_code
+# https://gist.github.com/fnky/458719343aabd01cfb17a3a4f7296797
+
+
+class CharColor(enum.IntEnum):
+    CHAR_COLOR_DEFAULT = 0
+    CHAR_COLOR_BLACK = 30
+    CHAR_COLOR_RED = enum.auto()
+    CHAR_COLOR_GREEN = enum.auto()
+    CHAR_COLOR_YELLOW = enum.auto()
+    CHAR_COLOR_BLUE = enum.auto()
+    CHAR_COLOR_MAGENTA = enum.auto()
+    CHAR_COLOR_CYAN = enum.auto()
+    CHAR_COLOR_WHITE = enum.auto()  # Really light gray
+    CHAR_COLOR_BRIGHT_BLACK = 90  # Really dark gray
+    CHAR_COLOR_BRIGHT_RED = enum.auto()
+    CHAR_COLOR_BRIGHT_GREEN = enum.auto()
+    CHAR_COLOR_BRIGHT_YELLOW = enum.auto()
+    CHAR_COLOR_BRIGHT_BLUE = enum.auto()
+    CHAR_COLOR_BRIGHT_MAGENTA = enum.auto()
+    CHAR_COLOR_BRIGHT_CYAN = enum.auto()
+    CHAR_COLOR_BRIGHT_WHITE = enum.auto()
+
+
+def ESC(x: bytes) -> bytes:
+    return b"\033" + x
+
+
+def CSI(x: bytes) -> bytes:
+    return ESC(b"[" + x)
+
+
+CLEAR = CSI(b"H") + CSI(b"0m")
+
+
+def enter_alt_screen():
+    sys.stdout.buffer.write(CSI(b"?1049h"))
+
+
+def leave_alt_screen():
+    sys.stdout.buffer.write(CSI(b"?1049l"))
+
+
+class Harness:
+    source: str | None
+    table: parser.ParseTable | None
+    tree: Tree | None
+
+    def __init__(self, lexer_func, grammar_func, start_rule, source_path):
+        # self.generator = parser.GenerateLR1
+        self.generator = parser.GenerateLALR
+        self.lexer_func = lexer_func
+        self.grammar_func = grammar_func
+        self.start_rule = start_rule
+        self.source_path = source_path
+
+        self.source = None
+        self.table = None
+        self.tokens = None
+        self.tree = None
+        self.errors = None
+
+    def run(self):
+        while True:
+            i, _, _ = select.select([sys.stdin], [], [], 1)
+            if i:
+                k = sys.stdin.read(1)
+                print(f"Key {k}\r")
+                return
+
+            self.update()
+
+    def update(self):
+        if self.table is None:
+            self.table = self.grammar_func().build_table(
+                start=self.start_rule, generator=self.generator
            )
+        assert self.table is not None

+        if self.tokens is None:
+            with open(self.source_path, "r", encoding="utf-8") as f:
+                self.source = f.read()
+            self.tokens = self.lexer_func(self.source)

-def harness(lexer_func, grammar_func, start_rule, source_path):
-    # generator = parser.GenerateLR1
-    generator = parser.GenerateLALR
-
-    trace = None
-    # trace = trace_state
-
-    table = grammar_func().build_table(start=start_rule, generator=generator)
-    print(f"{len(table)} states")
-
-    average_entries = sum(len(row) for row in table) / len(table)
-    max_entries = max(len(row) for row in table)
-    print(f"{average_entries} average, {max_entries} max")
-
-    if source_path:
-        with open(source_path, "r", encoding="utf-8") as f:
-            src = f.read()
-        tokens = lexer_func(src)
        # print(f"{tokens.lines}")
        # tokens.dump(end=5)
-        (_, errors) = parse(table, tokens, trace=trace)
-        if len(errors) > 0:
-            print(f"{len(errors)} errors:")
-            for error in errors:
-                print(f"  {error}")
+        if self.tree is None and self.errors is None:
+            (tree, errors) = parse(self.table, self.tokens, trace=None)
+            self.tree = tree
+            self.errors = errors
+
+        sys.stdout.buffer.write(CLEAR)
+        rows, cols = termios.tcgetwinsize(sys.stdout.fileno())
+
+        states = self.table.states
+        average_entries = sum(len(row) for row in states) / len(states)
+        max_entries = max(len(row) for row in states)
+        print(f"{len(states)} states - {average_entries} average, {max_entries} max\r")
+
+        if self.tree is not None:
+            lines = []
+            self.format_node(lines, self.tree)
+            for line in lines[: rows - 2]:
+                print(line[:cols] + "\r")
+
+        sys.stdout.flush()
+        sys.stdout.buffer.flush()
+
+    def format_node(self, lines, node: Tree | str, indent=0):
+        """Print out an indented concrete syntax tree, from parse()."""
+        match node:
+            case Tree(name, children):
+                lines.append((" " * indent) + (name or "???"))
+                for child in children:
+                    self.format_node(lines, child, indent + 2)
+            case _:
+                lines.append((" " * indent) + str(node))


 if __name__ == "__main__":
-    import sys
-
    source_path = None
    if len(sys.argv) == 2:
        source_path = sys.argv[1]

-    harness(
+    fd = sys.stdin.fileno()
+    old_settings = termios.tcgetattr(fd)
+    try:
+        tty.setraw(fd)
+        enter_alt_screen()
+
+        h = Harness(
            lexer_func=grammar.FineTokens,
            grammar_func=grammar.FineGrammar,
            start_rule="file",
            source_path=source_path,
        )
+        h.run()
+
+    finally:
+        leave_alt_screen()
+        termios.tcsetattr(fd, termios.TCSADRAIN, old_settings)

    # print(parser_faster.format_table(gen, table))
    # print()
--- a/parser.py
+++ b/parser.py
@@ -393,13 +393,45 @@ class Assoc(enum.Enum):
    RIGHT = 2


+@dataclasses.dataclass
+class Action:
+    pass
+
+
+@dataclasses.dataclass
+class Reduce(Action):
+    name: str
+    count: int
+    transparent: bool
+
+
+@dataclasses.dataclass
+class Shift(Action):
+    state: int
+
+
+@dataclasses.dataclass
+class Goto(Action):
+    state: int
+
+
+@dataclasses.dataclass
+class Accept(Action):
+    pass
+
+
+@dataclasses.dataclass
+class Error(Action):
+    pass
+
+
 class ErrorCollection:
    """A collection of errors. The errors are grouped by config set and alphabet
    symbol, so that we can group the error strings appropriately when we format
    the error.
    """

-    errors: dict[ConfigSet, dict[int, dict[Configuration, typing.Tuple]]]
+    errors: dict[ConfigSet, dict[int, dict[Configuration, Action]]]

    def __init__(self):
        self.errors = {}
@@ -413,7 +445,7 @@ class ErrorCollection:
        config_set: ConfigSet,
        symbol: int,
        config: Configuration,
-        action: typing.Tuple,
+        action: Action,
    ):
        """Add an error to the collection.

@@ -470,14 +502,16 @@ class ErrorCollection:
                    if config.next is None:
                        rule += " *"

-                    if action[0] == "reduce":
-                        action_str = f"pop {action[2]} values off the stack and make a {action[1]}"
-                    elif action[0] == "shift":
+                    match action:
+                        case Reduce(name=name, count=count, transparent=transparent):
+                            name_str = name if not transparent else "transparent node"
+                            action_str = f"pop {count} values off the stack and make a {name_str}"
+                        case Shift():
                            action_str = "consume the token and keep going"
-                    elif action[0] == "accept":
+                        case Accept():
                            action_str = "accept the parse"
-                    else:
-                        assert action[0] == "goto", f"Unknown action {action[0]}"
+                        case _:
+                            assert isinstance(action, Goto)
                            raise Exception("Shouldn't conflict on goto ever")

                    lines.append(
@@ -489,6 +523,11 @@ class ErrorCollection:
        return "\n\n".join(errors)


+@dataclasses.dataclass
+class ParseTable:
+    states: list[dict[str, Action]]
+
+
 class TableBuilder(object):
    """A helper object to assemble actions into build parse tables.

@@ -497,23 +536,27 @@ class TableBuilder(object):
    """

    errors: ErrorCollection
-    table: list[dict[str, typing.Tuple]]
+    table: list[dict[str, Action]]
    alphabet: list[str]
    precedence: typing.Tuple[typing.Tuple[Assoc, int], ...]
-    row: None | list[typing.Tuple[None | typing.Tuple, None | Configuration]]
+    transparents: set[str]
+
+    row: None | list[typing.Tuple[None | Action, None | Configuration]]

    def __init__(
        self,
        alphabet: list[str],
        precedence: typing.Tuple[typing.Tuple[Assoc, int], ...],
+        transparents: set[str],
    ):
        self.errors = ErrorCollection()
        self.table = []
        self.alphabet = alphabet
        self.precedence = precedence
+        self.transparents = transparents
        self.row = None

-    def flush(self, all_sets: ConfigurationSetInfo) -> list[dict[str, typing.Tuple]]:
+    def flush(self, all_sets: ConfigurationSetInfo) -> ParseTable:
        """Finish building the table and return it.

        Raises ValueError if there were any conflicts during construction.
@@ -522,7 +565,7 @@ class TableBuilder(object):
        if self.errors.any():
            errors = self.errors.format(self.alphabet, all_sets)
            raise ValueError(f"Errors building the table:\n\n{errors}")
-        return self.table
+        return ParseTable(states=self.table)

    def new_row(self, config_set: ConfigSet):
        """Start a new row, processing the given config set. Call this before
@@ -541,36 +584,35 @@ class TableBuilder(object):
        """Mark a reduce of the given configuration for the given symbol in the
        current row.
        """
-        action = ("reduce", self.alphabet[config.name], len(config.symbols))
+        name = self.alphabet[config.name]
+        transparent = name in self.transparents
+        action = Reduce(name, len(config.symbols), transparent)
        self._set_table_action(symbol, action, config)

    def set_table_accept(self, symbol: int, config: Configuration):
        """Mark a accept of the given configuration for the given symbol in the
        current row.
        """
-        action = ("accept",)
-        self._set_table_action(symbol, action, config)
+        self._set_table_action(symbol, Accept(), config)

    def set_table_shift(self, symbol: int, index: int, config: Configuration):
        """Mark a shift in the current row of the given given symbol to the
        given index. The configuration here provides debugging informtion for
        conflicts.
        """
-        action = ("shift", index)
-        self._set_table_action(symbol, action, config)
+        self._set_table_action(symbol, Shift(index), config)

    def set_table_goto(self, symbol: int, index: int):
        """Set the goto for the given nonterminal symbol in the current row."""
-        action = ("goto", index)
-        self._set_table_action(symbol, action, None)
+        self._set_table_action(symbol, Goto(index), None)

-    def _action_precedence(self, symbol: int, action: typing.Tuple, config: Configuration):
-        if action[0] == "shift":
+    def _action_precedence(self, symbol: int, action: Action, config: Configuration):
+        if isinstance(action, Shift):
            return self.precedence[symbol]
        else:
            return self.precedence[config.name]

-    def _set_table_action(self, symbol_id: int, action: typing.Tuple, config: Configuration | None):
+    def _set_table_action(self, symbol_id: int, action: Action, config: Configuration | None):
        """Set the action for 'symbol' in the table row to 'action'.

        This is destructive; it changes the table. It records an error if
@@ -607,17 +649,17 @@ class TableBuilder(object):
                resolved = False
                if assoc == Assoc.LEFT:
                    # Prefer reduce over shift
-                    if action[0] == "shift" and existing[0] == "reduce":
+                    if isinstance(action, Shift) and isinstance(existing, Reduce):
                        action = existing
                        resolved = True
-                    elif action[0] == "reduce" and existing[0] == "shift":
+                    elif isinstance(action, Reduce) and isinstance(existing, Shift):
                        resolved = True

                elif assoc == Assoc.RIGHT:
                    # Prefer shift over reduce
-                    if action[0] == "shift" and existing[0] == "reduce":
+                    if isinstance(action, Shift) and isinstance(existing, Reduce):
                        resolved = True
-                    elif action[0] == "reduce" and existing[0] == "shift":
+                    elif isinstance(action, Reduce) and isinstance(existing, Shift):
                        action = existing
                        resolved = True

@@ -636,7 +678,7 @@ class TableBuilder(object):
        self.row[symbol_id] = (action, config)


-class GenerateLR0(object):
+class GenerateLR0:
    """Generate parser tables for an LR0 parser."""

    # Internally we use integers as symbols, not strings. Mostly this is fine,
@@ -659,6 +701,10 @@ class GenerateLR0(object):
    # for a symbol, then its entry in this tuple will be (NONE, 0).
    precedence: typing.Tuple[typing.Tuple[Assoc, int], ...]

+    # The set of symbols for which we should reduce "transparently." This doesn't
+    # affect state generation at all, only the generation of the final table.
+    transparents: set[str]
+
    # The lookup that maps a particular symbol to an integer. (Only really used
    # for debugging.)
    symbol_key: dict[str, int]
@@ -675,6 +721,7 @@ class GenerateLR0(object):
        start: str,
        grammar: list[typing.Tuple[str, list[str]]],
        precedence: None | dict[str, typing.Tuple[Assoc, int]] = None,
+        transparents: None | set[str] = None,
    ):
        """Initialize the parser generator with the specified grammar and
        start symbol.
@@ -777,6 +824,10 @@ class GenerateLR0(object):
            precedence = {}
        self.precedence = tuple(precedence.get(a, (Assoc.NONE, 0)) for a in self.alphabet)

+        if transparents is None:
+            transparents = set()
+        self.transparents = transparents
+
        self.symbol_key = symbol_key
        self.start_symbol = start_symbol
        self.end_symbol = end_symbol
@@ -903,7 +954,7 @@ class GenerateLR0(object):
        del config
        return [index for index, value in enumerate(self.terminal) if value]

-    def gen_table(self):
+    def gen_table(self) -> ParseTable:
        """Generate the parse table.

        The parse table is a list of states. The first state in the list is
@@ -932,7 +983,7 @@ class GenerateLR0(object):
        Anything missing from the row indicates an error.
        """
        config_sets = self.gen_all_sets()
-        builder = TableBuilder(self.alphabet, self.precedence)
+        builder = TableBuilder(self.alphabet, self.precedence, self.transparents)

        for config_set_id, config_set in enumerate(config_sets.sets):
            builder.new_row(config_set)
@@ -959,7 +1010,7 @@ class GenerateLR0(object):
        return builder.flush(config_sets)


-def parse(table, input, trace=False):
+def parse(table: ParseTable, input, trace=False):
    """Parse the input with the generated parsing table and return the
    concrete syntax tree.

@@ -985,7 +1036,7 @@ def parse(table, input, trace=False):
        current_state = stack[-1][0]
        current_token = input[input_index]

-        action = table[current_state].get(current_token, ("error",))
+        action = table.states[current_state].get(current_token, Error())
        if trace:
            print(
                "{stack: <20}  {input: <50}  {action: <5}".format(
@@ -995,25 +1046,30 @@ def parse(table, input, trace=False):
                )
            )

-        if action[0] == "accept":
+        match action:
+            case Accept():
                return stack[-1][1]

-        elif action[0] == "reduce":
-            name = action[1]
-            size = action[2]
+            case Reduce(name=name, count=size, transparent=transparent):
+                children = []
+                for _, c in stack[-size:]:
+                    if isinstance(c, tuple) and c[0] is None:
+                        children.extend(c[1])
+                    else:
+                        children.append(c)

-            value = (name, tuple(s[1] for s in stack[-size:]))
+                value = (name if not transparent else None, tuple(children))
                stack = stack[:-size]

-            goto = table[stack[-1][0]].get(name, ("error",))
-            assert goto[0] == "goto"  # Corrupt table?
-            stack.append((goto[1], value))
+                goto = table.states[stack[-1][0]].get(name, Error())
+                assert isinstance(goto, Goto)
+                stack.append((goto.state, value))

-        elif action[0] == "shift":
-            stack.append((action[1], (current_token, ())))
+            case Shift(state):
+                stack.append((state, (current_token, ())))
                input_index += 1

-        elif action[0] == "error":
+            case Error():
                raise ValueError(
                    "Syntax error: unexpected symbol {sym}".format(
                        sym=current_token,
@@ -1539,7 +1595,16 @@ class NonTerminal(Rule):
    grammar class.
    """

-    def __init__(self, fn: typing.Callable[["Grammar"], Rule], name: str | None = None):
+    fn: typing.Callable[["Grammar"], Rule]
+    name: str
+    transparent: bool
+
+    def __init__(
+        self,
+        fn: typing.Callable[["Grammar"], Rule],
+        name: str | None = None,
+        transparent: bool = False,
+    ):
        """Create a new NonTerminal.

        `fn` is the function that will yield the `Rule` which is the
@@ -1549,6 +1614,7 @@ class NonTerminal(Rule):
        """
        self.fn = fn
        self.name = name or fn.__name__
+        self.transparent = transparent

    def generate_body(self, grammar) -> list[list[str | Token]]:
        """Generate the body of the non-terminal.
@@ -1638,7 +1704,8 @@ def rule(f: typing.Callable) -> Rule:
    of the nonterminal, which defaults to the name of the function.
    """
    name = f.__name__
-    return NonTerminal(f, name)
+    transparent = name.startswith("_")
+    return NonTerminal(f, name, transparent)


 PrecedenceList = list[typing.Tuple[Assoc, list[Rule]]]
@@ -1689,7 +1756,9 @@ class Grammar:

        self._precedence = precedence_table

-    def generate_nonterminal_dict(self, start: str) -> dict[str, list[list[str | Token]]]:
+    def generate_nonterminal_dict(
+        self, start: str
+    ) -> typing.Tuple[dict[str, list[list[str | Token]]], set[str]]:
        """Convert the rules into a dictionary of productions.

        Our table generators work on a very flat set of productions. This is the
@@ -1700,6 +1769,7 @@ class Grammar:
        """
        rules = inspect.getmembers(self, lambda x: isinstance(x, NonTerminal))
        nonterminals = {rule.name: rule for _, rule in rules}
+        transparents = {rule.name for _, rule in rules if rule.transparent}

        grammar = {}

@@ -1724,9 +1794,9 @@ class Grammar:

            grammar[rule.name] = body

-        return grammar
+        return (grammar, transparents)

-    def desugar(self, start: str) -> list[typing.Tuple[str, list[str]]]:
+    def desugar(self, start: str) -> typing.Tuple[list[typing.Tuple[str, list[str]]], set[str]]:
        """Convert the rules into a flat list of productions.

        Our table generators work from a very flat set of productions. The form
@@ -1734,7 +1804,7 @@ class Grammar:
        generate_nonterminal_dict- less useful to people, probably, but it is
        the input form needed by the Generator.
        """
-        temp_grammar = self.generate_nonterminal_dict(start)
+        temp_grammar, transparents = self.generate_nonterminal_dict(start)

        grammar = []
        for rule_name, clauses in temp_grammar.items():
@@ -1748,15 +1818,15 @@ class Grammar:

                grammar.append((rule_name, new_clause))

-        return grammar
+        return grammar, transparents

    def build_table(self, start: str, generator=GenerateLALR):
        """Construct a parse table for this grammar, starting at the named
        nonterminal rule.
        """
-        desugared = self.desugar(start)
+        desugared, transparents = self.desugar(start)

-        gen = generator(start, desugared, precedence=self._precedence)
+        gen = generator(start, desugared, precedence=self._precedence, transparents=transparents)
        table = gen.gen_table()
        return table

@@ -1772,7 +1842,7 @@ def format_node(node):
    return "\n".join(lines)


-def format_table(generator, table):
+def format_table(generator, table: ParseTable):
    """Format a parser table so pretty."""

    def format_action(state, terminal):
@@ -1806,7 +1876,7 @@ def format_table(generator, table):
            ),
            gotos=" ".join("{0: <5}".format(row.get(nt, ("error", ""))[1]) for nt in nonterminals),
        )
-        for i, row in enumerate(table)
+        for i, row in enumerate(table.states)
    ]
    return "\n".join(lines)
Author	SHA1	Message	Date
John Doty	45a9303a27	Transparent rules Better parsing/action types Good grief	2024-05-29 09:07:19 -07:00
John Doty	4f8aef3f89	Start working on the harness	2024-05-29 07:48:55 -07:00