Compare commits

...

3 commits

Author SHA1 Message Date
1d28c82007 Saving this for posterity, but it is doomed
Remember that tree levels are generated by context free languages, not
regular languages, and so they can only be recognized by push-down
automata, not finite state machines.

What happened was that I failed to account for transparent rules.
Without transparent rules the children of a tree node do not have any
recursion in them (by definition!) and so therefore *are* a regular
language. But transparent rules change that: there *can be* recursion
hidden on the same tree level, and it should have been clear from a
moment's reflection that the recursion there meant that tree levels
were once again a context free language.

Fortunately we have a recognizer for context free languages lying
around, so we can just use that I guess.
2024-09-09 06:23:25 -07:00
0cbf696303 The start rule cannot be transparent 2024-09-09 06:23:11 -07:00
49b76b9bcc Teach trees to format themselves. 2024-09-09 06:22:56 -07:00
3 changed files with 322 additions and 166 deletions

View file

@ -2815,10 +2815,12 @@ class Grammar:
def get_precedence(self, name: str) -> None | tuple[Assoc, int]:
return self._precedence.get(name)
# TODO: The flattened form should retain NonTerminal, not just str.
def generate_nonterminal_dict(
self, start: str | None = None
) -> typing.Tuple[dict[str, list[list[str | Terminal]]], set[str]]:
"""Convert the rules into a dictionary of productions.
"""Convert the rules into a dictionary of productions, and a set of
the names of transparent nonterminals.
Our table generators work on a very flat set of productions. This is the
first step in flattening the productions from the members: walk the rules
@ -2838,6 +2840,8 @@ class Grammar:
rule = nonterminals.get(start)
if rule is None:
raise ValueError(f"Cannot find a rule named '{start}'")
if rule.transparent:
raise ValueError("The start rule cannot be transparent")
queue = [rule]
while len(queue) > 0:
rule = queue.pop()

View file

@ -22,6 +22,29 @@ class Tree:
end: int
children: typing.Tuple["Tree | TokenValue", ...]
def format_lines(self, source: str | None = None) -> list[str]:
    """Render this tree as indented text, one node per line.

    Tree nodes print as ``name [start, end)`` (``???`` when the name is
    empty); token nodes print as ``kind [start, end)``, with the matched
    source text included when *source* is given. Children appear two
    spaces deeper than their parent, in pre-order.
    """
    out = []
    # Explicit work stack of (node, indent) pairs replaces the recursion.
    pending = [(self, 0)]
    while pending:
        node, depth = pending.pop()
        pad = " " * depth
        if isinstance(node, Tree):
            out.append(f"{pad}{node.name or '???'} [{node.start}, {node.end})")
            # Push children reversed so they pop in their original order.
            for child in reversed(node.children):
                pending.append((child, depth + 2))
        elif isinstance(node, TokenValue):
            if source is not None:
                shown = f":'{source[node.start:node.end]}'"
            else:
                shown = ""
            out.append(f"{pad}{node.kind}{shown} [{node.start}, {node.end})")
    return out
def format(self, source: str | None = None) -> str:
    """Return the whole tree rendering as one newline-joined string."""
    rendered = self.format_lines(source)
    return "\n".join(rendered)
@dataclass
class ParseError:
@ -278,13 +301,15 @@ class TokenStream(typing.Protocol):
...
# TODO: This runtime API sucks; the TokenStream is nice and all but I should
# also be able to have a function that takes a string and produces a
# tree directly, with caching intermediates for codegen and whatnot.
class Parser:
# Our stack is a stack of tuples, where the first entry is the state
# number and the second entry is the 'value' that was generated when the
# state was pushed.
table: parser.ParseTable
def __init__(self, table):
def __init__(self, table: parser.ParseTable):
self.table = table
def parse(self, tokens: TokenStream) -> typing.Tuple[Tree | None, list[str]]:
@ -301,6 +326,9 @@ class Parser:
input = input + [TokenValue(kind="$", start=eof, end=eof)]
input_index = 0
# Our stack is a stack of tuples, where the first entry is the state
# number and the second entry is the 'value' that was generated when
# the state was pushed.
stack: ParseStack = [(0, None)]
result: Tree | None = None
errors: list[ParseError] = []

View file

@ -1,5 +1,7 @@
# A prettier printer.
import abc
import dataclasses
import math
import typing
from . import parser
@ -12,6 +14,13 @@ class Cons:
right: "Document"
def cons(left: "Document", right: "Document") -> "Document":
    """Concatenate two documents, dropping an empty (falsy) side.

    Returns ``Cons(left, right)`` only when both sides are non-empty;
    otherwise returns whichever side is non-empty (or the shared empty
    value when both are).
    """
    if not (left and right):
        return left or right
    return Cons(left, right)
@dataclasses.dataclass(frozen=True)
class NewLine:
    """A line-break element; a member of the Document union."""
    pass
@ -47,144 +56,230 @@ class Lazy:
Document = None | Text | NewLine | Cons | Indent | Group | Lazy
def resolve_document(doc: Document) -> Document:
    """Recursively replace Lazy nodes in *doc* with their resolved values.

    Cons cells are rebuilt (via ``cons``) only when a child actually
    changed, so fully-resolved subtrees keep their identity.
    """
    if isinstance(doc, Cons):
        new_left = resolve_document(doc.left)
        new_right = resolve_document(doc.right)
        if new_left is doc.left and new_right is doc.right:
            return doc
        return cons(new_left, new_right)
    if isinstance(doc, Lazy):
        # NOTE: the resolved value is returned as-is, not re-resolved.
        return doc.resolve()
    return doc
def layout_document(doc: Document) -> typing.Generator[str, None, None]:
    """Lay out *doc* into output chunks. Not implemented yet."""
    del doc  # Explicitly discard the unused argument.
    raise NotImplementedError()
@dataclasses.dataclass
class Match:
    """The result of a successful match against a list of parse items."""
    # The document built from the items the matcher consumed.
    doc: Document
    # The suffix of the input items that was not consumed.
    remaining: list[runtime.Tree | runtime.TokenValue]
class Matcher:
def match(self, items: list[runtime.Tree | runtime.TokenValue]) -> Match | None:
raise NotImplementedError()
class NonTerminalMatcher(Matcher):
name: str
printer: "Printer"
def __init__(self, name: str, printer: "Printer"):
self.name = name
self.printer = printer
def match(self, items: list[runtime.Tree | runtime.TokenValue]) -> Match | None:
if len(items) == 0:
return None
item = items[0]
if isinstance(item, runtime.Tree) and item.name == self.name:
return Match(
doc=Lazy(value=lambda: self.printer.convert_tree_to_document(item)),
remaining=items[1:],
)
return None
class TerminalMatcher(Matcher):
@dataclasses.dataclass(frozen=True)
class MatchTerminal:
    """Instruction: consume one TokenValue whose kind is `name`, emitting a Text."""
    name: str
def __init__(self, name: str):
self.name = name
def match(self, items: list[runtime.Tree | runtime.TokenValue]) -> Match | None:
if len(items) == 0:
return None
item = items[0]
if isinstance(item, runtime.TokenValue) and item.kind == self.name:
return Match(
doc=Text(start=item.start, end=item.end),
remaining=items[1:],
)
return None
@dataclasses.dataclass(frozen=True)
class MatchNonTerminal:
    """Instruction: consume one Tree named `name`, emitting a Lazy conversion of it."""
    name: str
class IndentMatcher(Matcher):
@dataclasses.dataclass(frozen=True)
class Accept:
    """Instruction: succeed if the entire input has been consumed."""
    pass
@dataclasses.dataclass(frozen=True)
class StartGroup:
    """Instruction: mark the start of a Group on the result stack."""
    pass
@dataclasses.dataclass(frozen=True)
class EndGroup:
    """Instruction: pop results back to the matching StartGroup and wrap them in a Group."""
    pass
@dataclasses.dataclass(frozen=True)
class StartIndent:
    """Instruction: mark the start of an Indent region on the result stack."""
    pass
@dataclasses.dataclass(frozen=True)
class EndIndent:
    """Instruction: pop results back to the matching StartIndent and wrap them in Indent(amount)."""
    amount: int
child: Matcher
def __init__(self, amount: int, child: Matcher):
self.amount = amount
self.child = child
def match(self, items: list[runtime.Tree | runtime.TokenValue]) -> Match | None:
result = self.child.match(items)
if result is not None:
result.doc = Indent(amount=self.amount, doc=result.doc)
return result
class NewLineMatcher(Matcher):
def match(self, items: list[runtime.Tree | runtime.TokenValue]) -> Match | None:
return Match(
doc=NewLine(),
remaining=items,
)
@dataclasses.dataclass(frozen=True)
class Split:
    """Instruction: fork — the current thread continues at `left`, and a
    new thread is queued to start at `right`."""
    left: int
    right: int
class GroupMatcher(Matcher):
child: Matcher
def __init__(self, child: Matcher):
self.child = child
def match(self, items: list[runtime.Tree | runtime.TokenValue]) -> Match | None:
result = self.child.match(items)
if result is not None:
result.doc = Group(result.doc)
return result
@dataclasses.dataclass(frozen=True)
class Jump:
    """Instruction: unconditionally continue at instruction index `next`."""
    next: int
class CompleteMatcher(Matcher):
def match(self, items: list[runtime.Tree | runtime.TokenValue]) -> Match | None:
if len(items) == 0:
return Match(doc=None, remaining=[])
else:
return None
# The instruction set for the matching VM. Note that NewLine serves double
# duty: it is both a Document element and the emit-a-newline instruction.
MatchInstruction = (
    MatchTerminal
    | MatchNonTerminal
    | Accept
    | StartGroup
    | EndGroup
    | NewLine
    | StartIndent
    | EndIndent
    | Split
    | Jump
)
class AlternativeMatcher(Matcher):
children: list[Matcher]
### THIS DOESN'T WORK
###
### YOU CANNOT MATCH RULES WITH TRANSPARENT CHILDREN WITH A FSM, THIS IS NOT
### A REGULAR LANGUAGE IT IS CONTEXT FREE SO WE NEED TO RUN OUR REAL PARSER
### WHICH MEANS YES WE NEED TO GENERATE TABLES AGAIN OUT OF SUB-GRAMMARS FOR
### PRODUCTIONS BUT ALSO GENERATE NEW ONES FOR META AND ALSO RUN ACTIONS
###
### CHRIST.
###
class Matcher:
code: list[MatchInstruction]
def __init__(self, children: list[Matcher] | None = None):
self.children = children or []
def __init__(self):
self.code = []
def match(self, items: list[runtime.Tree | runtime.TokenValue]) -> Match | None:
for child in self.children:
m = child.match(items)
if m is not None:
return m
@dataclasses.dataclass
class ThreadState:
pc: int
position: int
count: int
results: list[Document | StartGroup | StartIndent]
def match(self, printer: "Printer", items: list[runtime.Tree | runtime.TokenValue]) -> Document:
threads: list[Matcher.ThreadState] = [
Matcher.ThreadState(pc=0, position=0, results=[], count=0)
]
while len(threads) > 0:
thread = threads.pop()
results = thread.results
while True:
thread.count += 1
if thread.count > 1000:
raise Exception("Too many steps!")
inst = self.code[thread.pc]
print(f"THREAD: {thread.pc}: {inst} ({thread.position})")
match inst:
case MatchTerminal(name):
if thread.position >= len(items):
break
item = items[thread.position]
if not isinstance(item, runtime.TokenValue):
break
if item.kind != name:
break
results.append(Text(item.start, item.end))
thread.pc += 1
thread.position += 1
case MatchNonTerminal(name):
if thread.position >= len(items):
break
item = items[thread.position]
if not isinstance(item, runtime.Tree):
break
if item.name != name:
break
def thunk(capture: runtime.Tree):
return lambda: printer.convert_tree_to_document(capture)
results.append(Lazy(thunk(item)))
thread.pc += 1
thread.position += 1
case Accept():
if thread.position != len(items):
break
result = None
for r in thread.results:
assert not isinstance(r, (StartGroup, StartIndent))
result = cons(result, r)
return result
case StartGroup():
results.append(inst)
thread.pc += 1
case EndGroup():
group_items = None
while not isinstance(results[-1], StartGroup):
item = typing.cast(Document, results.pop())
group_items = cons(item, group_items)
results.pop()
results.append(Group(group_items))
thread.pc += 1
case NewLine():
results.append(NewLine())
thread.pc += 1
case StartIndent():
results.append(inst)
thread.pc += 1
case EndIndent(amount):
indent_items = None
while not isinstance(results[-1], StartIndent):
item = typing.cast(Document, results.pop())
indent_items = cons(item, indent_items)
results.pop()
results.append(Indent(amount, indent_items))
thread.pc += 1
case Split(left, right):
new_thread = Matcher.ThreadState(
pc=right,
position=thread.position,
results=list(thread.results),
count=0,
)
threads.append(new_thread)
thread.pc = left
case Jump(where):
thread.pc = where
threads.append(thread)
case _:
typing.assert_never(inst)
return None
def format(self) -> str:
return "\n".join(self.format_lines())
class SequenceMatcher(Matcher):
children: list[Matcher]
def format_lines(self) -> list[str]:
lines = []
code_len = int(math.log10(len(self.code))) + 1
for i, inst in enumerate(self.code):
lines.append(f"{i: >{code_len}} {inst}")
return lines
def __init__(self, children: list[Matcher] | None = None):
self.children = children or []
def match(self, items: list[runtime.Tree | runtime.TokenValue]) -> Match | None:
doc = None
for child in self.children:
m = child.match(items)
if m is None:
return None
items = m.remaining
doc = Cons(doc, m.doc)
return Match(
doc=doc,
remaining=items,
)
@abc.abstractmethod
def format_into(self, lines: list[str], visited: dict["Matcher", int], indent: int = 0): ...
class PrettyMeta(parser.SyntaxMeta):
@ -195,68 +290,92 @@ class PrettyMeta(parser.SyntaxMeta):
class Printer:
grammar: parser.Grammar
matchers: dict[str, Matcher]
_matchers: dict[str, Matcher]
_nonterminals: dict[str, parser.NonTerminal]
def __init__(self, grammar: parser.Grammar):
self.grammar = grammar
self._nonterminals = {nt.name: nt for nt in grammar.non_terminals()}
self._matchers = {}
def lookup_nonterminal(self, name: str) -> parser.NonTerminal:
raise NotImplementedError()
return self._nonterminals[name]
def production_to_matcher(self, production: parser.FlattenedWithMetadata) -> Matcher:
results = []
for item in production:
if isinstance(item, str):
rule = self.lookup_nonterminal(item)
if rule.transparent:
# If it's transparent then we don't actually match a
# nonterminal here, we need to match against the contents
# of the rule, so we recurse.
results.append(self.rule_to_matcher(rule))
else:
results.append(NonTerminalMatcher(item, self))
elif isinstance(item, parser.Terminal):
name = item.name
assert name is not None
results.append(TerminalMatcher(name))
def compile_rule(self, rule: parser.NonTerminal) -> Matcher:
matcher = Matcher()
code = matcher.code
patcher: dict[str, int] = {}
def compile_nonterminal(rule: parser.NonTerminal):
sub_start = patcher.get(rule.name)
if sub_start is not None:
code.append(Jump(sub_start))
else:
meta, children = item
sub_start = len(code)
patcher[rule.name] = sub_start
tails = []
subs = list(rule.fn(self.grammar).flatten(with_metadata=True))
for sub in subs[:-1]:
split_pos = len(code)
code.append(Split(0, 0))
child = self.production_to_matcher(children)
compile_production(sub)
prettier = meta.get("prettier")
if isinstance(prettier, PrettyMeta):
if prettier.indent:
child = IndentMatcher(prettier.indent, child)
tails.append(len(code))
code.append(Jump(0))
if prettier.group:
child = GroupMatcher(child)
code[split_pos] = Split(sub_start + 1, len(code))
sub_start = len(code)
results.append(child)
compile_production(subs[-1])
if prettier.newline:
results.append(NewLineMatcher())
for tail in tails:
code[tail] = Jump(len(code))
def compile_production(production: parser.FlattenedWithMetadata):
for item in production:
if isinstance(item, str):
rule = self.lookup_nonterminal(item)
if rule.transparent:
# If it's transparent then we need to inline the pattern here.
compile_nonterminal(rule)
else:
code.append(MatchNonTerminal(item))
elif isinstance(item, parser.Terminal):
name = item.name
assert name is not None
code.append(MatchTerminal(name))
else:
results.append(child)
meta, children = item
return SequenceMatcher(results)
prettier = meta.get("prettier")
if isinstance(prettier, PrettyMeta):
if prettier.indent:
code.append(StartIndent())
if prettier.group:
code.append(StartGroup())
compile_production(children)
if isinstance(prettier, PrettyMeta):
if prettier.group:
code.append(EndGroup())
if prettier.indent:
code.append(EndIndent(prettier.indent))
if prettier.newline:
code.append(NewLine())
compile_nonterminal(rule)
code.append(Accept())
return matcher
def rule_to_matcher(self, rule: parser.NonTerminal) -> Matcher:
result = self.matchers.get(rule.name)
result = self._matchers.get(rule.name)
if result is None:
# Create the empty alternative, be sure to set up the
alts = AlternativeMatcher()
if rule.transparent:
result = alts
else:
result = SequenceMatcher(children=[alts, CompleteMatcher()])
self.matchers[rule.name] = result
for production in rule.fn(self.grammar).flatten(with_metadata=True):
alts.children.append(self.production_to_matcher(production))
result = self.compile_rule(rule)
self._matchers[rule.name] = result
return result
@ -266,11 +385,16 @@ class Printer:
rule = self.lookup_nonterminal(name)
matcher = self.rule_to_matcher(rule)
m = matcher.match(list(tree.children))
assert m is not None, "Could not match a valid tree" # TODO: Exception rather I think
return m.doc
print(f"--------")
print(f"Matching with:\n{matcher.format()}")
m = matcher.match(self, list(tree.children))
print(f"--------")
if m is None:
raise ValueError(
f"Could not match a valid tree for {tree.name} with {len(tree.children)} children:\n{tree.format()}\nMatcher:\n{matcher.format()}"
)
# return m
return resolve_document(m)
def format_tree(self, tree: runtime.Tree) -> str:
doc = self.convert_tree_to_document(tree)