Compare commits

...

3 commits

Author SHA1 Message Date
1d28c82007 Saving this for posterity, but it is doomed
Remember that tree levels are generated by context free languages, not
regular languages, and so they can only be recognized by push-down
automata, not finite state machines.

What happened was that I failed to account for transparent rules.
Without transparent rules the children of a tree node do not have any
recursion in them (by definition!) and so therefore *are* a regular
language. But transparent rules change that: there *can be* recursion
hidden on the same tree level, and it should have been clear from a
moment's reflection that the recursion there meant that tree levels
were once again a context free language.

Fortunately we have a recognizer for context free languages lying
around, so we can just use that I guess.
2024-09-09 06:23:25 -07:00
0cbf696303 The start rule cannot be transparent 2024-09-09 06:23:11 -07:00
49b76b9bcc Teach trees to format themselves. 2024-09-09 06:22:56 -07:00
3 changed files with 322 additions and 166 deletions

View file

@@ -2815,10 +2815,12 @@ class Grammar:
def get_precedence(self, name: str) -> None | tuple[Assoc, int]: def get_precedence(self, name: str) -> None | tuple[Assoc, int]:
return self._precedence.get(name) return self._precedence.get(name)
# TODO: The flattened form should retain NonTerminal, not just str.
def generate_nonterminal_dict( def generate_nonterminal_dict(
self, start: str | None = None self, start: str | None = None
) -> typing.Tuple[dict[str, list[list[str | Terminal]]], set[str]]: ) -> typing.Tuple[dict[str, list[list[str | Terminal]]], set[str]]:
"""Convert the rules into a dictionary of productions. """Convert the rules into a dictionary of productions, and a set of
the names of transparent nonterminals.
Our table generators work on a very flat set of productions. This is the Our table generators work on a very flat set of productions. This is the
first step in flattening the productions from the members: walk the rules first step in flattening the productions from the members: walk the rules
@@ -2838,6 +2840,8 @@ class Grammar:
rule = nonterminals.get(start) rule = nonterminals.get(start)
if rule is None: if rule is None:
raise ValueError(f"Cannot find a rule named '{start}'") raise ValueError(f"Cannot find a rule named '{start}'")
if rule.transparent:
raise ValueError("The start rule cannot be transparent")
queue = [rule] queue = [rule]
while len(queue) > 0: while len(queue) > 0:
rule = queue.pop() rule = queue.pop()

View file

@@ -22,6 +22,29 @@ class Tree:
end: int end: int
children: typing.Tuple["Tree | TokenValue", ...] children: typing.Tuple["Tree | TokenValue", ...]
def format_lines(self, source: str | None = None) -> list[str]:
lines = []
def format_node(node: Tree | TokenValue, indent: int):
match node:
case Tree(name=name, start=start, end=end, children=children):
lines.append((" " * indent) + f"{name or '???'} [{start}, {end})")
for child in children:
format_node(child, indent + 2)
case TokenValue(kind=kind, start=start, end=end):
if source is not None:
value = f":'{source[start:end]}'"
else:
value = ""
lines.append((" " * indent) + f"{kind}{value} [{start}, {end})")
format_node(self, 0)
return lines
def format(self, source: str | None = None) -> str:
return "\n".join(self.format_lines(source))
@dataclass @dataclass
class ParseError: class ParseError:
@@ -278,13 +301,15 @@ class TokenStream(typing.Protocol):
... ...
# TODO: This runtime API sucks; the TokenStream is nice and all but I should
# also be able to have a function that takes a string and produces a
# tree directly, with caching intermediates for codegen and whatnot.
class Parser: class Parser:
# Our stack is a stack of tuples, where the first entry is the state
# number and the second entry is the 'value' that was generated when the
# state was pushed.
table: parser.ParseTable table: parser.ParseTable
def __init__(self, table): def __init__(self, table: parser.ParseTable):
self.table = table self.table = table
def parse(self, tokens: TokenStream) -> typing.Tuple[Tree | None, list[str]]: def parse(self, tokens: TokenStream) -> typing.Tuple[Tree | None, list[str]]:
@@ -301,6 +326,9 @@ class Parser:
input = input + [TokenValue(kind="$", start=eof, end=eof)] input = input + [TokenValue(kind="$", start=eof, end=eof)]
input_index = 0 input_index = 0
# Our stack is a stack of tuples, where the first entry is the state
# number and the second entry is the 'value' that was generated when
# the state was pushed.
stack: ParseStack = [(0, None)] stack: ParseStack = [(0, None)]
result: Tree | None = None result: Tree | None = None
errors: list[ParseError] = [] errors: list[ParseError] = []

View file

@@ -1,5 +1,7 @@
# A prettier printer. # A prettier printer.
import abc
import dataclasses import dataclasses
import math
import typing import typing
from . import parser from . import parser
@@ -12,6 +14,13 @@ class Cons:
right: "Document" right: "Document"
def cons(left: "Document", right: "Document") -> "Document":
if left and right:
return Cons(left, right)
else:
return left or right
@dataclasses.dataclass(frozen=True) @dataclasses.dataclass(frozen=True)
class NewLine: class NewLine:
pass pass
@@ -47,144 +56,230 @@ class Lazy:
Document = None | Text | NewLine | Cons | Indent | Group | Lazy Document = None | Text | NewLine | Cons | Indent | Group | Lazy
def resolve_document(doc: Document) -> Document:
match doc:
case Cons(left, right):
lr = resolve_document(left)
rr = resolve_document(right)
if lr is not left or rr is not right:
return cons(lr, rr)
else:
return doc
case Lazy(_):
return doc.resolve()
case _:
return doc
def layout_document(doc: Document) -> typing.Generator[str, None, None]: def layout_document(doc: Document) -> typing.Generator[str, None, None]:
del doc
raise NotImplementedError() raise NotImplementedError()
@dataclasses.dataclass(frozen=True)
class MatchTerminal:
name: str
@dataclasses.dataclass(frozen=True)
class MatchNonTerminal:
name: str
@dataclasses.dataclass(frozen=True)
class Accept:
pass
@dataclasses.dataclass(frozen=True)
class StartGroup:
pass
@dataclasses.dataclass(frozen=True)
class EndGroup:
pass
@dataclasses.dataclass(frozen=True)
class StartIndent:
pass
@dataclasses.dataclass(frozen=True)
class EndIndent:
amount: int
@dataclasses.dataclass(frozen=True)
class Split:
left: int
right: int
@dataclasses.dataclass(frozen=True)
class Jump:
next: int
MatchInstruction = (
MatchTerminal
| MatchNonTerminal
| Accept
| StartGroup
| EndGroup
| NewLine
| StartIndent
| EndIndent
| Split
| Jump
)
### THIS DOESN'T WORK
###
### YOU CANNOT MATCH RULES WITH TRANSPARENT CHILDREN WITH A FSM, THIS IS NOT
### A REGULAR LANGUAGE IT IS CONTEXT FREE SO WE NEED TO RUN OUR REAL PARSER
### WHICH MEANS YES WE NEED TO GENERATE TABLES AGAIN OUT OF SUB-GRAMMARS FOR
### PRODUCTIONS BUT ALSO GENERATE NEW ONES FOR META AND ALSO RUN ACTIONS
###
### CHRIST.
###
class Matcher:
code: list[MatchInstruction]
def __init__(self):
self.code = []
@dataclasses.dataclass @dataclasses.dataclass
class Match: class ThreadState:
doc: Document pc: int
remaining: list[runtime.Tree | runtime.TokenValue] position: int
count: int
results: list[Document | StartGroup | StartIndent]
def match(self, printer: "Printer", items: list[runtime.Tree | runtime.TokenValue]) -> Document:
threads: list[Matcher.ThreadState] = [
Matcher.ThreadState(pc=0, position=0, results=[], count=0)
]
class Matcher: while len(threads) > 0:
def match(self, items: list[runtime.Tree | runtime.TokenValue]) -> Match | None: thread = threads.pop()
raise NotImplementedError() results = thread.results
while True:
thread.count += 1
if thread.count > 1000:
raise Exception("Too many steps!")
inst = self.code[thread.pc]
print(f"THREAD: {thread.pc}: {inst} ({thread.position})")
match inst:
case MatchTerminal(name):
if thread.position >= len(items):
break
class NonTerminalMatcher(Matcher): item = items[thread.position]
name: str if not isinstance(item, runtime.TokenValue):
printer: "Printer" break
def __init__(self, name: str, printer: "Printer"): if item.kind != name:
self.name = name break
self.printer = printer
def match(self, items: list[runtime.Tree | runtime.TokenValue]) -> Match | None: results.append(Text(item.start, item.end))
if len(items) == 0: thread.pc += 1
return None thread.position += 1
item = items[0] case MatchNonTerminal(name):
if isinstance(item, runtime.Tree) and item.name == self.name: if thread.position >= len(items):
return Match( break
doc=Lazy(value=lambda: self.printer.convert_tree_to_document(item)),
remaining=items[1:],
)
return None item = items[thread.position]
if not isinstance(item, runtime.Tree):
break
if item.name != name:
break
class TerminalMatcher(Matcher): def thunk(capture: runtime.Tree):
name: str return lambda: printer.convert_tree_to_document(capture)
def __init__(self, name: str): results.append(Lazy(thunk(item)))
self.name = name thread.pc += 1
thread.position += 1
def match(self, items: list[runtime.Tree | runtime.TokenValue]) -> Match | None: case Accept():
if len(items) == 0: if thread.position != len(items):
return None break
item = items[0]
if isinstance(item, runtime.TokenValue) and item.kind == self.name:
return Match(
doc=Text(start=item.start, end=item.end),
remaining=items[1:],
)
return None
class IndentMatcher(Matcher):
amount: int
child: Matcher
def __init__(self, amount: int, child: Matcher):
self.amount = amount
self.child = child
def match(self, items: list[runtime.Tree | runtime.TokenValue]) -> Match | None:
result = self.child.match(items)
if result is not None:
result.doc = Indent(amount=self.amount, doc=result.doc)
result = None
for r in thread.results:
assert not isinstance(r, (StartGroup, StartIndent))
result = cons(result, r)
return result return result
case StartGroup():
results.append(inst)
thread.pc += 1
class NewLineMatcher(Matcher): case EndGroup():
def match(self, items: list[runtime.Tree | runtime.TokenValue]) -> Match | None: group_items = None
return Match( while not isinstance(results[-1], StartGroup):
doc=NewLine(), item = typing.cast(Document, results.pop())
remaining=items, group_items = cons(item, group_items)
results.pop()
results.append(Group(group_items))
thread.pc += 1
case NewLine():
results.append(NewLine())
thread.pc += 1
case StartIndent():
results.append(inst)
thread.pc += 1
case EndIndent(amount):
indent_items = None
while not isinstance(results[-1], StartIndent):
item = typing.cast(Document, results.pop())
indent_items = cons(item, indent_items)
results.pop()
results.append(Indent(amount, indent_items))
thread.pc += 1
case Split(left, right):
new_thread = Matcher.ThreadState(
pc=right,
position=thread.position,
results=list(thread.results),
count=0,
) )
threads.append(new_thread)
thread.pc = left
case Jump(where):
thread.pc = where
threads.append(thread)
class GroupMatcher(Matcher): case _:
child: Matcher typing.assert_never(inst)
def __init__(self, child: Matcher):
self.child = child
def match(self, items: list[runtime.Tree | runtime.TokenValue]) -> Match | None:
result = self.child.match(items)
if result is not None:
result.doc = Group(result.doc)
return result
class CompleteMatcher(Matcher):
def match(self, items: list[runtime.Tree | runtime.TokenValue]) -> Match | None:
if len(items) == 0:
return Match(doc=None, remaining=[])
else:
return None
class AlternativeMatcher(Matcher):
children: list[Matcher]
def __init__(self, children: list[Matcher] | None = None):
self.children = children or []
def match(self, items: list[runtime.Tree | runtime.TokenValue]) -> Match | None:
for child in self.children:
m = child.match(items)
if m is not None:
return m
return None return None
def format(self) -> str:
return "\n".join(self.format_lines())
class SequenceMatcher(Matcher): def format_lines(self) -> list[str]:
children: list[Matcher] lines = []
code_len = int(math.log10(len(self.code))) + 1
for i, inst in enumerate(self.code):
lines.append(f"{i: >{code_len}} {inst}")
return lines
def __init__(self, children: list[Matcher] | None = None): @abc.abstractmethod
self.children = children or [] def format_into(self, lines: list[str], visited: dict["Matcher", int], indent: int = 0): ...
def match(self, items: list[runtime.Tree | runtime.TokenValue]) -> Match | None:
doc = None
for child in self.children:
m = child.match(items)
if m is None:
return None
items = m.remaining
doc = Cons(doc, m.doc)
return Match(
doc=doc,
remaining=items,
)
class PrettyMeta(parser.SyntaxMeta): class PrettyMeta(parser.SyntaxMeta):
@@ -195,68 +290,92 @@ class PrettyMeta(parser.SyntaxMeta):
class Printer: class Printer:
grammar: parser.Grammar grammar: parser.Grammar
matchers: dict[str, Matcher] _matchers: dict[str, Matcher]
_nonterminals: dict[str, parser.NonTerminal]
def __init__(self, grammar: parser.Grammar): def __init__(self, grammar: parser.Grammar):
self.grammar = grammar self.grammar = grammar
self._nonterminals = {nt.name: nt for nt in grammar.non_terminals()}
self._matchers = {}
def lookup_nonterminal(self, name: str) -> parser.NonTerminal: def lookup_nonterminal(self, name: str) -> parser.NonTerminal:
raise NotImplementedError() return self._nonterminals[name]
def production_to_matcher(self, production: parser.FlattenedWithMetadata) -> Matcher: def compile_rule(self, rule: parser.NonTerminal) -> Matcher:
results = [] matcher = Matcher()
code = matcher.code
patcher: dict[str, int] = {}
def compile_nonterminal(rule: parser.NonTerminal):
sub_start = patcher.get(rule.name)
if sub_start is not None:
code.append(Jump(sub_start))
else:
sub_start = len(code)
patcher[rule.name] = sub_start
tails = []
subs = list(rule.fn(self.grammar).flatten(with_metadata=True))
for sub in subs[:-1]:
split_pos = len(code)
code.append(Split(0, 0))
compile_production(sub)
tails.append(len(code))
code.append(Jump(0))
code[split_pos] = Split(sub_start + 1, len(code))
sub_start = len(code)
compile_production(subs[-1])
for tail in tails:
code[tail] = Jump(len(code))
def compile_production(production: parser.FlattenedWithMetadata):
for item in production: for item in production:
if isinstance(item, str): if isinstance(item, str):
rule = self.lookup_nonterminal(item) rule = self.lookup_nonterminal(item)
if rule.transparent: if rule.transparent:
# If it's transparent then we don't actually match a # If it's transparent then we need to inline the pattern here.
# nonterminal here, we need to match against the contents compile_nonterminal(rule)
# of the rule, so we recurse.
results.append(self.rule_to_matcher(rule))
else: else:
results.append(NonTerminalMatcher(item, self)) code.append(MatchNonTerminal(item))
elif isinstance(item, parser.Terminal): elif isinstance(item, parser.Terminal):
name = item.name name = item.name
assert name is not None assert name is not None
results.append(TerminalMatcher(name)) code.append(MatchTerminal(name))
else: else:
meta, children = item meta, children = item
child = self.production_to_matcher(children)
prettier = meta.get("prettier") prettier = meta.get("prettier")
if isinstance(prettier, PrettyMeta): if isinstance(prettier, PrettyMeta):
if prettier.indent: if prettier.indent:
child = IndentMatcher(prettier.indent, child) code.append(StartIndent())
if prettier.group: if prettier.group:
child = GroupMatcher(child) code.append(StartGroup())
results.append(child) compile_production(children)
if isinstance(prettier, PrettyMeta):
if prettier.group:
code.append(EndGroup())
if prettier.indent:
code.append(EndIndent(prettier.indent))
if prettier.newline: if prettier.newline:
results.append(NewLineMatcher()) code.append(NewLine())
else: compile_nonterminal(rule)
results.append(child) code.append(Accept())
return matcher
return SequenceMatcher(results)
def rule_to_matcher(self, rule: parser.NonTerminal) -> Matcher: def rule_to_matcher(self, rule: parser.NonTerminal) -> Matcher:
result = self.matchers.get(rule.name) result = self._matchers.get(rule.name)
if result is None: if result is None:
# Create the empty alternative, be sure to set up the result = self.compile_rule(rule)
alts = AlternativeMatcher() self._matchers[rule.name] = result
if rule.transparent:
result = alts
else:
result = SequenceMatcher(children=[alts, CompleteMatcher()])
self.matchers[rule.name] = result
for production in rule.fn(self.grammar).flatten(with_metadata=True):
alts.children.append(self.production_to_matcher(production))
return result return result
@@ -266,11 +385,16 @@ class Printer:
rule = self.lookup_nonterminal(name) rule = self.lookup_nonterminal(name)
matcher = self.rule_to_matcher(rule) matcher = self.rule_to_matcher(rule)
print(f"--------")
m = matcher.match(list(tree.children)) print(f"Matching with:\n{matcher.format()}")
assert m is not None, "Could not match a valid tree" # TODO: Exception rather I think m = matcher.match(self, list(tree.children))
print(f"--------")
return m.doc if m is None:
raise ValueError(
f"Could not match a valid tree for {tree.name} with {len(tree.children)} children:\n{tree.format()}\nMatcher:\n{matcher.format()}"
)
# return m
return resolve_document(m)
def format_tree(self, tree: runtime.Tree) -> str: def format_tree(self, tree: runtime.Tree) -> str:
doc = self.convert_tree_to_document(tree) doc = self.convert_tree_to_document(tree)