Rebuild the matcher on grammars

Well, that wasn't so bad now, was it? Eh? Nice to have a parser
generator lying around. Let's keep working to see if I can actually
finish it.
This commit is contained in:
John Doty 2024-09-09 11:40:14 -07:00
parent 1d28c82007
commit 7edf5e06bf

View file

@ -1,7 +1,5 @@
# A prettier printer. # A prettier printer.
import abc
import dataclasses import dataclasses
import math
import typing import typing
from . import parser from . import parser
@ -52,6 +50,10 @@ class Lazy:
self.value = self.value() self.value = self.value()
return self.value return self.value
@classmethod
def from_tree(cls, tree: runtime.Tree, printer: "Printer") -> "Lazy":
    """Build a deferred document for ``tree``.

    No conversion happens here. The returned instance wraps a
    zero-argument callable which, when invoked, calls
    ``printer.convert_tree_to_document(tree)``.

    The instance is constructed through ``cls`` rather than by naming
    ``Lazy`` directly, so a subclass that calls this alternate
    constructor gets an instance of itself.
    """
    return cls(lambda: printer.convert_tree_to_document(tree))
Document = None | Text | NewLine | Cons | Indent | Group | Lazy Document = None | Text | NewLine | Cons | Indent | Group | Lazy
@ -78,208 +80,88 @@ def layout_document(doc: Document) -> typing.Generator[str, None, None]:
raise NotImplementedError() raise NotImplementedError()
@dataclasses.dataclass(frozen=True) def child_to_name(child: runtime.Tree | runtime.TokenValue) -> str:
class MatchTerminal: if isinstance(child, runtime.Tree):
name: str return f"tree_{child.name}"
else:
return f"token_{child.kind}"
@dataclasses.dataclass(frozen=True)
class MatchNonTerminal:
name: str
@dataclasses.dataclass(frozen=True)
class Accept:
pass
@dataclasses.dataclass(frozen=True)
class StartGroup:
pass
@dataclasses.dataclass(frozen=True)
class EndGroup:
pass
@dataclasses.dataclass(frozen=True)
class StartIndent:
pass
@dataclasses.dataclass(frozen=True)
class EndIndent:
amount: int
@dataclasses.dataclass(frozen=True)
class Split:
left: int
right: int
@dataclasses.dataclass(frozen=True)
class Jump:
next: int
MatchInstruction = (
MatchTerminal
| MatchNonTerminal
| Accept
| StartGroup
| EndGroup
| NewLine
| StartIndent
| EndIndent
| Split
| Jump
)
### THIS DOESN'T WORK
###
### YOU CANNOT MATCH RULES WITH TRANSPARENT CHILDREN WITH A FSM, THIS IS NOT
### A REGULAR LANGUAGE IT IS CONTEXT FREE SO WE NEED TO RUN OUR REAL PARSER
### WHICH MEANS YES WE NEED TO GENERATE TABLES AGAIN OUT OF SUB-GRAMMARS FOR
### PRODUCTIONS BUT ALSO GENERATE NEW ONES FOR META AND ALSO RUN ACTIONS
###
### CHRIST.
###
class Matcher: class Matcher:
code: list[MatchInstruction] table: parser.ParseTable
indent_amounts: dict[str, int]
def __init__(self): def __init__(self, table: parser.ParseTable, indent_amounts):
self.code = [] self.table = table
self.indent_amounts = indent_amounts
@dataclasses.dataclass
class ThreadState:
pc: int
position: int
count: int
results: list[Document | StartGroup | StartIndent]
def match(self, printer: "Printer", items: list[runtime.Tree | runtime.TokenValue]) -> Document: def match(self, printer: "Printer", items: list[runtime.Tree | runtime.TokenValue]) -> Document:
threads: list[Matcher.ThreadState] = [ stack: list[tuple[int, Document]] = [(0, None)]
Matcher.ThreadState(pc=0, position=0, results=[], count=0) table = self.table
input = [(child_to_name(i), i) for i in items] + [
("$", runtime.TokenValue(kind="$", start=0, end=0))
] ]
input_index = 0
while len(threads) > 0: while True:
thread = threads.pop() current_token = input[input_index]
results = thread.results current_state = stack[-1][0]
while True: action = table.actions[current_state].get(current_token[0], parser.Error())
thread.count += 1
if thread.count > 1000:
raise Exception("Too many steps!")
inst = self.code[thread.pc] # print(
print(f"THREAD: {thread.pc}: {inst} ({thread.position})") # "{stack: <30} {input: <15} {action: <5}".format(
match inst: # stack=repr([s[0] for s in stack[-5:]]),
case MatchTerminal(name): # input=current_token[0],
if thread.position >= len(items): # action=repr(action),
break # )
# )
item = items[thread.position] match action:
if not isinstance(item, runtime.TokenValue): case parser.Accept():
break return stack[-1][1]
if item.kind != name: case parser.Reduce(name=name, count=size):
break child: Document = None
if size > 0:
for _, c in stack[-size:]:
if c is None:
continue
child = cons(child, c)
del stack[-size:]
results.append(Text(item.start, item.end)) if name[0] == "g":
thread.pc += 1 child = Group(child)
thread.position += 1
case MatchNonTerminal(name): elif name[0] == "i":
if thread.position >= len(items): amount = self.indent_amounts[name]
break child = Indent(amount, child)
item = items[thread.position] elif name[0] == "n":
if not isinstance(item, runtime.Tree): child = cons(child, NewLine())
break
if item.name != name: elif name[0] == "p":
break child = cons(NewLine(), child)
def thunk(capture: runtime.Tree): else:
return lambda: printer.convert_tree_to_document(capture) pass # ???
results.append(Lazy(thunk(item))) goto = self.table.gotos[stack[-1][0]].get(name)
thread.pc += 1 assert goto is not None
thread.position += 1 stack.append((goto, child))
case Accept(): case parser.Shift():
if thread.position != len(items): value = current_token[1]
break if isinstance(value, runtime.Tree):
child = Lazy.from_tree(value, printer)
else:
child = Text(value.start, value.end)
result = None stack.append((action.state, child))
for r in thread.results: input_index += 1
assert not isinstance(r, (StartGroup, StartIndent))
result = cons(result, r)
return result
case StartGroup(): case parser.Error():
results.append(inst) raise Exception("How did I get a parse error here??")
thread.pc += 1
case EndGroup():
group_items = None
while not isinstance(results[-1], StartGroup):
item = typing.cast(Document, results.pop())
group_items = cons(item, group_items)
results.pop()
results.append(Group(group_items))
thread.pc += 1
case NewLine():
results.append(NewLine())
thread.pc += 1
case StartIndent():
results.append(inst)
thread.pc += 1
case EndIndent(amount):
indent_items = None
while not isinstance(results[-1], StartIndent):
item = typing.cast(Document, results.pop())
indent_items = cons(item, indent_items)
results.pop()
results.append(Indent(amount, indent_items))
thread.pc += 1
case Split(left, right):
new_thread = Matcher.ThreadState(
pc=right,
position=thread.position,
results=list(thread.results),
count=0,
)
threads.append(new_thread)
thread.pc = left
case Jump(where):
thread.pc = where
threads.append(thread)
case _:
typing.assert_never(inst)
return None
def format(self) -> str:
return "\n".join(self.format_lines())
def format_lines(self) -> list[str]:
lines = []
code_len = int(math.log10(len(self.code))) + 1
for i, inst in enumerate(self.code):
lines.append(f"{i: >{code_len}} {inst}")
return lines
@abc.abstractmethod
def format_into(self, lines: list[str], visited: dict["Matcher", int], indent: int = 0): ...
class PrettyMeta(parser.SyntaxMeta): class PrettyMeta(parser.SyntaxMeta):
@ -302,74 +184,86 @@ class Printer:
return self._nonterminals[name] return self._nonterminals[name]
def compile_rule(self, rule: parser.NonTerminal) -> Matcher: def compile_rule(self, rule: parser.NonTerminal) -> Matcher:
matcher = Matcher() generated_grammar: list[typing.Tuple[str, list[str]]] = []
code = matcher.code visited: set[str] = set()
patcher: dict[str, int] = {} group_count = 0
indent_amounts: dict[str, int] = {}
done_newline = False
def compile_nonterminal(rule: parser.NonTerminal): def compile_nonterminal(name: str, rule: parser.NonTerminal):
sub_start = patcher.get(rule.name) if name not in visited:
if sub_start is not None: visited.add(name)
code.append(Jump(sub_start)) for production in rule.fn(self.grammar).flatten(with_metadata=True):
else: trans_prod = compile_production(production)
sub_start = len(code) generated_grammar.append((name, trans_prod))
patcher[rule.name] = sub_start
tails = []
subs = list(rule.fn(self.grammar).flatten(with_metadata=True))
for sub in subs[:-1]:
split_pos = len(code)
code.append(Split(0, 0))
compile_production(sub) def compile_production(production: parser.FlattenedWithMetadata) -> list[str]:
nonlocal group_count
nonlocal indent_amounts
nonlocal done_newline
tails.append(len(code)) result = []
code.append(Jump(0))
code[split_pos] = Split(sub_start + 1, len(code))
sub_start = len(code)
compile_production(subs[-1])
for tail in tails:
code[tail] = Jump(len(code))
def compile_production(production: parser.FlattenedWithMetadata):
for item in production: for item in production:
if isinstance(item, str): if isinstance(item, str):
rule = self.lookup_nonterminal(item) nt = self._nonterminals[item]
if rule.transparent: if nt.transparent:
# If it's transparent then we need to inline the pattern here. # If it's transparent then we make a new set of
compile_nonterminal(rule) # productions that covers the contents of the
# transparent nonterminal.
name = "xxx_" + nt.name
compile_nonterminal(name, nt)
result.append(name)
else: else:
code.append(MatchNonTerminal(item)) # Otherwise it's a "token" in our input, named
# "tree_{whatever}".
result.append(f"tree_{item}")
elif isinstance(item, parser.Terminal): elif isinstance(item, parser.Terminal):
name = item.name # If it's a terminal it will appear in our input as
assert name is not None # "token_{whatever}".
code.append(MatchTerminal(name)) result.append(f"token_{item.name}")
else: else:
meta, children = item meta, children = item
tx_children = compile_production(children)
prettier = meta.get("prettier") pretty = meta.get("prettier")
if isinstance(prettier, PrettyMeta): if isinstance(pretty, PrettyMeta):
if prettier.indent: if pretty.group:
code.append(StartIndent()) # Make a fake rule.
if prettier.group: rule_name = f"g_{group_count}"
code.append(StartGroup()) group_count += 1
generated_grammar.append((rule_name, tx_children))
tx_children = [rule_name]
compile_production(children) if pretty.indent:
rule_name = f"i_{len(indent_amounts)}"
indent_amounts[rule_name] = pretty.indent
generated_grammar.append((rule_name, tx_children))
tx_children = [rule_name]
if isinstance(prettier, PrettyMeta): if pretty.newline:
if prettier.group: if not done_newline:
code.append(EndGroup()) generated_grammar.append(("newline", []))
if prettier.indent: done_newline = True
code.append(EndIndent(prettier.indent)) tx_children.append("newline")
if prettier.newline:
code.append(NewLine())
compile_nonterminal(rule) # If it turned out to have formatting meta then we will
code.append(Accept()) # have replaced or augmented the translated children
return matcher # appropriately. Otherwise, if it's highlighting meta or
# something else, we'll have ignored it and the
# translated children should just be inserted inline.
result.extend(tx_children)
return result
compile_nonterminal(rule.name, rule)
gen = self.grammar._generator(rule.name, generated_grammar)
parse_table = gen.gen_table()
# print(parse_table.format())
return Matcher(parse_table, indent_amounts)
def rule_to_matcher(self, rule: parser.NonTerminal) -> Matcher: def rule_to_matcher(self, rule: parser.NonTerminal) -> Matcher:
result = self._matchers.get(rule.name) result = self._matchers.get(rule.name)
@ -385,15 +279,11 @@ class Printer:
rule = self.lookup_nonterminal(name) rule = self.lookup_nonterminal(name)
matcher = self.rule_to_matcher(rule) matcher = self.rule_to_matcher(rule)
print(f"--------")
print(f"Matching with:\n{matcher.format()}")
m = matcher.match(self, list(tree.children)) m = matcher.match(self, list(tree.children))
print(f"--------")
if m is None: if m is None:
raise ValueError( raise ValueError(
f"Could not match a valid tree for {tree.name} with {len(tree.children)} children:\n{tree.format()}\nMatcher:\n{matcher.format()}" f"Could not match a valid tree for {tree.name} with {len(tree.children)} children:\n{tree.format()}"
) )
# return m
return resolve_document(m) return resolve_document(m)
def format_tree(self, tree: runtime.Tree) -> str: def format_tree(self, tree: runtime.Tree) -> str: