[wadler] Re-factor into multiple modules

Hard split between builder and runtime, as is proper.
2024-09-21 07:42:52 -07:00 · 2024-09-21 07:42:52 -07:00 · 1a3ce02d48
commit 1a3ce02d48
parent 1f84752538
4 changed files with 370 additions and 267 deletions
--- a/parser/wadler/runtime.py
+++ b/parser/wadler/runtime.py
@ -0,0 +1,604 @@
+import dataclasses
+import typing
+
+from . import builder
+from .. import parser
+from .. import runtime
+
+
+############################################################################
+# Documents
+############################################################################
+
+
+@dataclasses.dataclass(frozen=True)
+class Cons:
+    docs: list["Document"]
+
+
+@dataclasses.dataclass(frozen=True)
+class NewLine:
+    replace: str
+
+
+@dataclasses.dataclass(frozen=True)
+class ForceBreak:
+    silent: bool
+
+
+@dataclasses.dataclass(frozen=True)
+class Indent:
+    amount: int
+    doc: "Document"
+
+
+@dataclasses.dataclass(frozen=True)
+class Literal:
+    text: str
+
+
+@dataclasses.dataclass(frozen=True)
+class Group:
+    child: "Document"
+
+
+@dataclasses.dataclass(frozen=True)
+class Marker:
+    child: "Document"
+    meta: dict
+
+
+@dataclasses.dataclass(frozen=True)
+class Trivia:
+    child: "Document"
+
+
+@dataclasses.dataclass
+class Lazy:
+    value: typing.Callable[[], "Document"] | "Document"
+
+    def resolve(self) -> "Document":
+        if callable(self.value):
+            self.value = self.value()
+        return self.value
+
+    @classmethod
+    def from_tree(cls, tree: runtime.Tree, src: str, printer: "Printer") -> "Lazy":
+        return Lazy(lambda: printer.convert_tree_to_document(tree, src))
+
+
+Document = None | Literal | NewLine | ForceBreak | Cons | Indent | Group | Trivia | Marker | Lazy
+
+
+def cons(*documents: Document) -> Document:
+    if len(documents) == 0:
+        return None
+
+    # TODO: Merge adjacent trivia together?
+
+    result = []
+    for document in documents:
+        if isinstance(document, Cons):
+            result.extend(document.docs)
+        elif document is not None:
+            result.append(document)
+
+    if len(result) == 0:
+        return None
+    if len(result) == 1:
+        return result[0]
+
+    return Cons(result)
+
+
+def group(document: Document) -> Document:
+    if document is None:
+        return None
+
+    if isinstance(document, Cons):
+        children = list(document.docs)
+    else:
+        children = [document]
+
+    # Split the trivia off the left and right of the incoming group: trivia
+    # at the edges shouldn't affect the inside of the group.
+    right_trivia: list[Document] = []
+    while len(children) > 0 and isinstance(children[-1], Trivia):
+        right_trivia.append(children.pop())
+
+    children.reverse()
+    left_trivia: list[Document] = []
+    while len(children) > 0 and isinstance(children[-1], Trivia):
+        left_trivia.append(children.pop())
+
+    # IF we still have more than one child, *then* we can actually make a
+    # group. (A group with one child is a waste. A group with no children
+    # doubly so.)
+    children.reverse()
+    if len(children) > 1:
+        children = [Group(cons(*children))]
+
+    results = left_trivia + children + right_trivia
+    return cons(*results)
+
+
+def trivia(document: Document) -> Document:
+    if document is None:
+        return None
+
+    if isinstance(document, Trivia):
+        return document
+
+    return Trivia(document)
+
+
+############################################################################
+# Layouts
+############################################################################
+
+
+class DocumentLayout:
+    """A structure that is trivially convertable to a string; the result of
+    layout out a document."""
+
+    segments: list[str | tuple[int, int]]
+
+    def __init__(self, segments):
+        self.segments = segments
+
+    def apply_to_source(self, original: str) -> str:
+        """Convert this layout to a string by copying chunks of the source
+        text into the right place.
+        """
+        result = ""
+        for segment in self.segments:
+            if isinstance(segment, str):
+                result += segment
+            else:
+                start, end = segment
+                result += original[start:end]
+
+        return result
+
+
+def layout_document(doc: Document, width: int, indent: str) -> DocumentLayout:
+    """Lay out a document to fit within the given width.
+
+    The result of this function is a DocumentLayout which can trivially be
+    converted into a string given the original document.
+    """
+
+    @dataclasses.dataclass
+    class Chunk:
+        doc: Document
+        indent: int
+        flat: bool
+
+        def with_document(self, doc: Document, and_indent: int = 0) -> "Chunk":
+            return Chunk(doc=doc, indent=self.indent + and_indent, flat=self.flat)
+
+    column = 0
+    chunks: list[Chunk] = [
+        Chunk(
+            doc=doc,
+            indent=0,
+            flat=False,  # NOTE: Assume flat until we know how to break.
+        )
+    ]
+
+    def fits(chunk: Chunk) -> bool:
+        remaining = width - column
+        if remaining <= 0:
+            return False
+
+        stack = list(chunks)
+        stack.append(chunk)
+        while len(stack) > 0:
+            chunk = stack.pop()
+            match chunk.doc:
+                case None:
+                    pass
+
+                case Literal(text):
+                    remaining -= len(text)
+
+                case NewLine(replace):
+                    if chunk.flat:
+                        remaining -= len(replace)
+                    else:
+                        # These are newlines that are real, so it must have
+                        # all fit.
+                        return True
+
+                case ForceBreak():
+                    # If we're in a flattened chunk then force it to break by
+                    # returning false here, otherwise we're at the end of the
+                    # line and yes, whatever you were asking about has fit.
+                    return not chunk.flat
+
+                case Cons(docs):
+                    stack.extend(chunk.with_document(doc) for doc in reversed(docs))
+
+                case Lazy():
+                    stack.append(chunk.with_document(chunk.doc.resolve()))
+
+                case Indent(amount, child):
+                    stack.append(chunk.with_document(child, and_indent=amount))
+
+                case Group(child):
+                    # The difference between this approach and Justin's twist
+                    # is that we consider the flat variable in Newline(),
+                    # above, rather than here in Group. This makes us more
+                    # like Wadler's original formulation, I guess. The
+                    # grouping is an implicit transform over alternatives
+                    # represented by newline. (If we have other kinds of
+                    # alternatives we'll have to work those out elsewhere as
+                    # well.)
+                    stack.append(chunk.with_document(child))
+
+                case Marker():
+                    stack.append(chunk.with_document(chunk.doc.child))
+
+                case Trivia(child):
+                    stack.append(chunk.with_document(child))
+
+                case _:
+                    typing.assert_never(chunk.doc)
+
+            if remaining < 0:
+                return False
+
+        return True  # Everything must fit, so great!
+
+    output: list[str | tuple[int, int]] = []
+    while len(chunks) > 0:
+        chunk = chunks.pop()
+        match chunk.doc:
+            case None:
+                pass
+
+            case Literal(text):
+                output.append(text)
+                column += len(text)
+
+            case NewLine(replace):
+                if chunk.flat:
+                    output.append(replace)
+                    column += len(replace)
+                else:
+                    # TODO: Custom newline expansion, custom indent segments.
+                    output.append("\n" + (chunk.indent * indent))
+                    column = chunk.indent * len(indent)
+
+            case ForceBreak(silent):
+                # TODO: Custom newline expansion, custom indent segments.
+                if not silent:
+                    output.append("\n" + (chunk.indent * indent))
+                    column = chunk.indent * len(indent)
+
+            case Cons(docs):
+                chunks.extend(chunk.with_document(doc) for doc in reversed(docs))
+
+            case Indent(amount, doc):
+                chunks.append(chunk.with_document(doc, and_indent=amount))
+
+            case Lazy():
+                chunks.append(chunk.with_document(chunk.doc.resolve()))
+
+            case Group(child):
+                candidate = Chunk(doc=child, indent=chunk.indent, flat=True)
+                if chunk.flat or fits(candidate):
+                    chunks.append(candidate)
+                else:
+                    chunks.append(Chunk(doc=child, indent=chunk.indent, flat=False))
+
+            case Marker():
+                chunks.append(chunk.with_document(chunk.doc.child))
+
+            case Trivia(child):
+                chunks.append(chunk.with_document(child))
+
+            case _:
+                typing.assert_never(chunk)
+
+    return DocumentLayout(output)
+
+
+def resolve_document(doc: Document) -> Document:
+    match doc:
+        case Cons(docs):
+            docs = [resolve_document(d) for d in docs]
+            return cons(*docs)
+
+        case Lazy(_):
+            return resolve_document(doc.resolve())
+
+        case Group(doc):
+            return group(resolve_document(doc))
+
+        case Marker(child, meta):
+            return Marker(resolve_document(child), meta)
+
+        case Trivia(child):
+            return Trivia(resolve_document(child))
+
+        case Literal() | NewLine() | ForceBreak() | Indent() | None:
+            return doc
+
+        case _:
+            typing.assert_never(doc)
+
+
+def child_to_name(child: runtime.Tree | runtime.TokenValue) -> str:
+    if isinstance(child, runtime.Tree):
+        return f"tree_{child.name}"
+    else:
+        return f"token_{child.kind}"
+
+
+def slice_pre_post_trivia(
+    trivia_mode: dict[str, parser.TriviaMode],
+    trivia_tokens: list[runtime.TokenValue],
+) -> tuple[
+    list[tuple[parser.TriviaMode, runtime.TokenValue]],
+    list[tuple[parser.TriviaMode, runtime.TokenValue]],
+]:
+    tokens = [
+        (trivia_mode.get(token.kind, parser.TriviaMode.Blank), token) for token in trivia_tokens
+    ]
+
+    for index, (mode, token) in enumerate(tokens):
+        if token.start == 0:
+            # Everything is pre-trivia if we're at the start of the file.
+            return (tokens, [])
+
+        if mode == parser.TriviaMode.NewLine:
+            # This is the first newline; it belongs with the pre-trivia.
+            return (tokens[index:], tokens[:index])
+
+    # If we never found a new line then it's all post-trivia.
+    return ([], tokens)
+
+
+############################################################################
+# The Actual Pretty Printer
+############################################################################
+
+
+class Matcher:
+    table: builder.MatcherTable
+    trivia_mode: dict[str, parser.TriviaMode]
+
+    def __init__(self, table: builder.MatcherTable, trivia_mode: dict[str, parser.TriviaMode]):
+        self.table = table
+        self.trivia_mode = trivia_mode
+
+    def match(
+        self,
+        printer: "Printer",
+        items: list[runtime.Tree | runtime.TokenValue],
+        src: str,
+    ) -> Document:
+        stack: list[tuple[int, Document]] = [(0, None)]
+        table = self.table.table
+
+        # eof_trivia = []
+        # if len(items) > 0:
+        #     item = items[-1]
+        #     if isinstance(item, runtime.TokenValue):
+        #         eof_trivia = item.post_trivia
+
+        input = [(child_to_name(i), i) for i in items] + [
+            (
+                "$",
+                runtime.TokenValue(
+                    kind="$",
+                    start=0,
+                    end=0,
+                    pre_trivia=[],
+                    post_trivia=[],
+                ),
+            )
+        ]
+        input_index = 0
+
+        while True:
+            current_token = input[input_index]
+            current_state = stack[-1][0]
+            action = table.actions[current_state].get(current_token[0], parser.Error())
+
+            match action:
+                case parser.Accept():
+                    result = stack[-1][1]
+                    # result = cons(result, self.apply_trivia(eof_trivia))
+                    return result
+
+                case parser.Reduce(name=name, count=size):
+                    child: Document = None
+                    if size > 0:
+                        for _, c in stack[-size:]:
+                            if c is None:
+                                continue
+                            child = cons(child, c)
+                        del stack[-size:]
+
+                    if name[0] == "g":
+                        child = group(child)
+
+                    elif name[0] == "i":
+                        amount = self.table.indent_amounts[name]
+                        child = Indent(amount, child)
+
+                    elif name[0] == "n":
+                        replace = self.table.newline_replace[name]
+                        child = cons(child, NewLine(replace))
+
+                    elif name[0] == "p":
+                        replace = self.table.newline_replace[name]
+                        child = cons(NewLine(replace), child)
+
+                    elif name[0] == "f":
+                        child = cons(child, ForceBreak(False))
+
+                    elif name[0] == "d":
+                        child = cons(ForceBreak(False), child)
+
+                    else:
+                        pass  # Reducing a transparent rule probably.
+
+                    goto = table.gotos[stack[-1][0]].get(name)
+                    assert goto is not None
+                    stack.append((goto, child))
+
+                case parser.Shift():
+                    value = current_token[1]
+
+                    if isinstance(value, runtime.Tree):
+                        child = Lazy.from_tree(value, src, printer)
+                    else:
+                        child = cons(
+                            trivia(self.apply_pre_trivia(value.pre_trivia, src)),
+                            Literal(src[value.start : value.end]),
+                            trivia(self.apply_post_trivia(value.post_trivia, src)),
+                        )
+
+                    stack.append((action.state, child))
+                    input_index += 1
+
+                case parser.Error():
+                    raise Exception("How did I get a parse error here??")
+
+    def apply_pre_trivia(self, trivia_tokens: list[runtime.TokenValue], src: str) -> Document:
+        pre_trivia, _ = slice_pre_post_trivia(self.trivia_mode, trivia_tokens)
+        if len(pre_trivia) == 0:
+            return None
+
+        at_start_of_file = pre_trivia[0][1].start == 0
+
+        trivia_doc = None
+        new_line_count = 0
+        for mode, token in pre_trivia:
+            match mode:
+                case parser.TriviaMode.LineComment:
+                    trivia_doc = cons(
+                        trivia_doc,
+                        Literal(src[token.start : token.end]),
+                        ForceBreak(False),
+                    )
+                    new_line_count = 0  # There will be a newline after this.
+                    at_start_of_file = False
+
+                case parser.TriviaMode.Blank:
+                    pass
+
+                case parser.TriviaMode.NewLine:
+                    new_line_count += 1
+                    if new_line_count == 2 and not at_start_of_file:
+                        trivia_doc = cons(
+                            trivia_doc,
+                            ForceBreak(False),
+                        )
+
+                case _:
+                    typing.assert_never(mode)
+
+        return trivia_doc
+
+    def apply_post_trivia(self, trivia_tokens: list[runtime.TokenValue], src: str) -> Document:
+        if len(trivia_tokens) > 0 and trivia_tokens[-1].end == len(src):
+            return self.apply_eof_trivia(trivia_tokens, src)
+
+        _, post_trivia = slice_pre_post_trivia(self.trivia_mode, trivia_tokens)
+
+        trivia_doc = None
+        for mode, token in post_trivia:
+            match mode:
+                case parser.TriviaMode.Blank:
+                    pass
+
+                case parser.TriviaMode.NewLine:
+                    # Anything after a line break is not processed as post
+                    # trivia.
+                    break
+
+                case parser.TriviaMode.LineComment:
+                    # Because this is post-trivia, we know there's something
+                    # to our left, and we can force the space.
+                    trivia_doc = cons(
+                        Literal(" "),
+                        Literal(src[token.start : token.end]),
+                        ForceBreak(True),  # And the line needs to end.
+                    )
+                    break
+
+                case _:
+                    typing.assert_never(mode)
+
+        return trivia_doc
+
+    def apply_eof_trivia(self, trivia_tokens: list[runtime.TokenValue], src: str) -> Document:
+        # EOF trivia has weird rules, namely, it's like pre and post joined together but.
+        tokens = [
+            (self.trivia_mode.get(token.kind, parser.TriviaMode.Blank), token)
+            for token in trivia_tokens
+        ]
+
+        at_start = True
+        newline_count = 0
+        trivia_doc = None
+        for mode, token in tokens:
+            match mode:
+                case parser.TriviaMode.Blank:
+                    pass
+
+                case parser.TriviaMode.NewLine:
+                    at_start = False
+                    newline_count += 1
+                    if newline_count <= 2:
+                        trivia_doc = cons(trivia_doc, ForceBreak(False))
+
+                case parser.TriviaMode.LineComment:
+                    # Because this is post-trivia, we know there's something
+                    # to our left, and we can force the space.
+                    trivia_doc = cons(
+                        trivia_doc,
+                        Literal(" ") if at_start else None,
+                        Literal(src[token.start : token.end]),
+                    )
+                    newline_count = 0
+                    at_start = False
+
+                case _:
+                    typing.assert_never(mode)
+
+        return trivia_doc
+
+
+class Printer:
+    table: builder.PrettyTable
+    matchers: dict[str, Matcher]
+
+    def __init__(self, table: builder.PrettyTable):
+        self.table = table
+        self.matchers = {
+            name: Matcher(value, self.table.trivia_modes) for name, value in table.matchers.items()
+        }
+
+    def indent(self) -> str:
+        return self.table.indent
+
+    def convert_tree_to_document(self, tree: runtime.Tree, src: str) -> Document:
+        name = tree.name
+        assert name is not None, "Cannot format a tree if it still has transparent nodes inside"
+
+        matcher = self.matchers[name]
+        m = matcher.match(self, list(tree.children), src)
+        if m is None:
+            raise ValueError(
+                f"Could not match a valid tree for {tree.name} with {len(tree.children)} children:\n{tree.format()}"
+            )
+        return resolve_document(m)
+
+    def format_tree(self, tree: runtime.Tree, src: str, width: int) -> DocumentLayout:
+        doc = self.convert_tree_to_document(tree, src)
+        return layout_document(doc, width, self.table.indent)