Rebuild the matcher on grammars
Well, that wasn't so bad now, was it? Eh? Nice to have a parser generator lying around. Let's keep working to see if I can actually finish it.
This commit is contained in:
parent
1d28c82007
commit
7edf5e06bf
1 changed file with 132 additions and 242 deletions
374
parser/wadler.py
374
parser/wadler.py
|
|
@ -1,7 +1,5 @@
|
||||||
# A prettier printer.
|
# A prettier printer.
|
||||||
import abc
|
|
||||||
import dataclasses
|
import dataclasses
|
||||||
import math
|
|
||||||
import typing
|
import typing
|
||||||
|
|
||||||
from . import parser
|
from . import parser
|
||||||
|
|
@ -52,6 +50,10 @@ class Lazy:
|
||||||
self.value = self.value()
|
self.value = self.value()
|
||||||
return self.value
|
return self.value
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def from_tree(cls, tree: runtime.Tree, printer: "Printer") -> "Lazy":
|
||||||
|
return Lazy(lambda: printer.convert_tree_to_document(tree))
|
||||||
|
|
||||||
|
|
||||||
Document = None | Text | NewLine | Cons | Indent | Group | Lazy
|
Document = None | Text | NewLine | Cons | Indent | Group | Lazy
|
||||||
|
|
||||||
|
|
@ -78,208 +80,88 @@ def layout_document(doc: Document) -> typing.Generator[str, None, None]:
|
||||||
raise NotImplementedError()
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
|
||||||
@dataclasses.dataclass(frozen=True)
|
def child_to_name(child: runtime.Tree | runtime.TokenValue) -> str:
|
||||||
class MatchTerminal:
|
if isinstance(child, runtime.Tree):
|
||||||
name: str
|
return f"tree_{child.name}"
|
||||||
|
else:
|
||||||
|
return f"token_{child.kind}"
|
||||||
|
|
||||||
|
|
||||||
@dataclasses.dataclass(frozen=True)
|
|
||||||
class MatchNonTerminal:
|
|
||||||
name: str
|
|
||||||
|
|
||||||
|
|
||||||
@dataclasses.dataclass(frozen=True)
|
|
||||||
class Accept:
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
@dataclasses.dataclass(frozen=True)
|
|
||||||
class StartGroup:
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
@dataclasses.dataclass(frozen=True)
|
|
||||||
class EndGroup:
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
@dataclasses.dataclass(frozen=True)
|
|
||||||
class StartIndent:
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
@dataclasses.dataclass(frozen=True)
|
|
||||||
class EndIndent:
|
|
||||||
amount: int
|
|
||||||
|
|
||||||
|
|
||||||
@dataclasses.dataclass(frozen=True)
|
|
||||||
class Split:
|
|
||||||
left: int
|
|
||||||
right: int
|
|
||||||
|
|
||||||
|
|
||||||
@dataclasses.dataclass(frozen=True)
|
|
||||||
class Jump:
|
|
||||||
next: int
|
|
||||||
|
|
||||||
|
|
||||||
MatchInstruction = (
|
|
||||||
MatchTerminal
|
|
||||||
| MatchNonTerminal
|
|
||||||
| Accept
|
|
||||||
| StartGroup
|
|
||||||
| EndGroup
|
|
||||||
| NewLine
|
|
||||||
| StartIndent
|
|
||||||
| EndIndent
|
|
||||||
| Split
|
|
||||||
| Jump
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
### THIS DOESN'T WORK
|
|
||||||
###
|
|
||||||
### YOU CANNOT MATCH RULES WITH TRANSPARENT CHILDREN WITH A FSM, THIS IS NOT
|
|
||||||
### A REGULAR LANGUAGE IT IS CONTEXT FREE SO WE NEED TO RUN OUR REAL PARSER
|
|
||||||
### WHICH MEANS YES WE NEED TO GENERATE TABLES AGAIN OUT OF SUB-GRAMMARS FOR
|
|
||||||
### PRODUCTIONS BUT ALSO GENERATE NEW ONES FOR META AND ALSO RUN ACTIONS
|
|
||||||
###
|
|
||||||
### CHRIST.
|
|
||||||
###
|
|
||||||
class Matcher:
|
class Matcher:
|
||||||
code: list[MatchInstruction]
|
table: parser.ParseTable
|
||||||
|
indent_amounts: dict[str, int]
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self, table: parser.ParseTable, indent_amounts):
|
||||||
self.code = []
|
self.table = table
|
||||||
|
self.indent_amounts = indent_amounts
|
||||||
@dataclasses.dataclass
|
|
||||||
class ThreadState:
|
|
||||||
pc: int
|
|
||||||
position: int
|
|
||||||
count: int
|
|
||||||
results: list[Document | StartGroup | StartIndent]
|
|
||||||
|
|
||||||
def match(self, printer: "Printer", items: list[runtime.Tree | runtime.TokenValue]) -> Document:
|
def match(self, printer: "Printer", items: list[runtime.Tree | runtime.TokenValue]) -> Document:
|
||||||
threads: list[Matcher.ThreadState] = [
|
stack: list[tuple[int, Document]] = [(0, None)]
|
||||||
Matcher.ThreadState(pc=0, position=0, results=[], count=0)
|
table = self.table
|
||||||
|
|
||||||
|
input = [(child_to_name(i), i) for i in items] + [
|
||||||
|
("$", runtime.TokenValue(kind="$", start=0, end=0))
|
||||||
]
|
]
|
||||||
|
input_index = 0
|
||||||
|
|
||||||
while len(threads) > 0:
|
while True:
|
||||||
thread = threads.pop()
|
current_token = input[input_index]
|
||||||
results = thread.results
|
current_state = stack[-1][0]
|
||||||
while True:
|
action = table.actions[current_state].get(current_token[0], parser.Error())
|
||||||
thread.count += 1
|
|
||||||
if thread.count > 1000:
|
|
||||||
raise Exception("Too many steps!")
|
|
||||||
|
|
||||||
inst = self.code[thread.pc]
|
# print(
|
||||||
print(f"THREAD: {thread.pc}: {inst} ({thread.position})")
|
# "{stack: <30} {input: <15} {action: <5}".format(
|
||||||
match inst:
|
# stack=repr([s[0] for s in stack[-5:]]),
|
||||||
case MatchTerminal(name):
|
# input=current_token[0],
|
||||||
if thread.position >= len(items):
|
# action=repr(action),
|
||||||
break
|
# )
|
||||||
|
# )
|
||||||
|
|
||||||
item = items[thread.position]
|
match action:
|
||||||
if not isinstance(item, runtime.TokenValue):
|
case parser.Accept():
|
||||||
break
|
return stack[-1][1]
|
||||||
|
|
||||||
if item.kind != name:
|
case parser.Reduce(name=name, count=size):
|
||||||
break
|
child: Document = None
|
||||||
|
if size > 0:
|
||||||
|
for _, c in stack[-size:]:
|
||||||
|
if c is None:
|
||||||
|
continue
|
||||||
|
child = cons(child, c)
|
||||||
|
del stack[-size:]
|
||||||
|
|
||||||
results.append(Text(item.start, item.end))
|
if name[0] == "g":
|
||||||
thread.pc += 1
|
child = Group(child)
|
||||||
thread.position += 1
|
|
||||||
|
|
||||||
case MatchNonTerminal(name):
|
elif name[0] == "i":
|
||||||
if thread.position >= len(items):
|
amount = self.indent_amounts[name]
|
||||||
break
|
child = Indent(amount, child)
|
||||||
|
|
||||||
item = items[thread.position]
|
elif name[0] == "n":
|
||||||
if not isinstance(item, runtime.Tree):
|
child = cons(child, NewLine())
|
||||||
break
|
|
||||||
|
|
||||||
if item.name != name:
|
elif name[0] == "p":
|
||||||
break
|
child = cons(NewLine(), child)
|
||||||
|
|
||||||
def thunk(capture: runtime.Tree):
|
else:
|
||||||
return lambda: printer.convert_tree_to_document(capture)
|
pass # ???
|
||||||
|
|
||||||
results.append(Lazy(thunk(item)))
|
goto = self.table.gotos[stack[-1][0]].get(name)
|
||||||
thread.pc += 1
|
assert goto is not None
|
||||||
thread.position += 1
|
stack.append((goto, child))
|
||||||
|
|
||||||
case Accept():
|
case parser.Shift():
|
||||||
if thread.position != len(items):
|
value = current_token[1]
|
||||||
break
|
if isinstance(value, runtime.Tree):
|
||||||
|
child = Lazy.from_tree(value, printer)
|
||||||
|
else:
|
||||||
|
child = Text(value.start, value.end)
|
||||||
|
|
||||||
result = None
|
stack.append((action.state, child))
|
||||||
for r in thread.results:
|
input_index += 1
|
||||||
assert not isinstance(r, (StartGroup, StartIndent))
|
|
||||||
result = cons(result, r)
|
|
||||||
return result
|
|
||||||
|
|
||||||
case StartGroup():
|
case parser.Error():
|
||||||
results.append(inst)
|
raise Exception("How did I get a parse error here??")
|
||||||
thread.pc += 1
|
|
||||||
|
|
||||||
case EndGroup():
|
|
||||||
group_items = None
|
|
||||||
while not isinstance(results[-1], StartGroup):
|
|
||||||
item = typing.cast(Document, results.pop())
|
|
||||||
group_items = cons(item, group_items)
|
|
||||||
results.pop()
|
|
||||||
results.append(Group(group_items))
|
|
||||||
thread.pc += 1
|
|
||||||
|
|
||||||
case NewLine():
|
|
||||||
results.append(NewLine())
|
|
||||||
thread.pc += 1
|
|
||||||
|
|
||||||
case StartIndent():
|
|
||||||
results.append(inst)
|
|
||||||
thread.pc += 1
|
|
||||||
|
|
||||||
case EndIndent(amount):
|
|
||||||
indent_items = None
|
|
||||||
while not isinstance(results[-1], StartIndent):
|
|
||||||
item = typing.cast(Document, results.pop())
|
|
||||||
indent_items = cons(item, indent_items)
|
|
||||||
results.pop()
|
|
||||||
results.append(Indent(amount, indent_items))
|
|
||||||
thread.pc += 1
|
|
||||||
|
|
||||||
case Split(left, right):
|
|
||||||
new_thread = Matcher.ThreadState(
|
|
||||||
pc=right,
|
|
||||||
position=thread.position,
|
|
||||||
results=list(thread.results),
|
|
||||||
count=0,
|
|
||||||
)
|
|
||||||
threads.append(new_thread)
|
|
||||||
thread.pc = left
|
|
||||||
|
|
||||||
case Jump(where):
|
|
||||||
thread.pc = where
|
|
||||||
threads.append(thread)
|
|
||||||
|
|
||||||
case _:
|
|
||||||
typing.assert_never(inst)
|
|
||||||
|
|
||||||
return None
|
|
||||||
|
|
||||||
def format(self) -> str:
|
|
||||||
return "\n".join(self.format_lines())
|
|
||||||
|
|
||||||
def format_lines(self) -> list[str]:
|
|
||||||
lines = []
|
|
||||||
code_len = int(math.log10(len(self.code))) + 1
|
|
||||||
for i, inst in enumerate(self.code):
|
|
||||||
lines.append(f"{i: >{code_len}} {inst}")
|
|
||||||
return lines
|
|
||||||
|
|
||||||
@abc.abstractmethod
|
|
||||||
def format_into(self, lines: list[str], visited: dict["Matcher", int], indent: int = 0): ...
|
|
||||||
|
|
||||||
|
|
||||||
class PrettyMeta(parser.SyntaxMeta):
|
class PrettyMeta(parser.SyntaxMeta):
|
||||||
|
|
@ -302,74 +184,86 @@ class Printer:
|
||||||
return self._nonterminals[name]
|
return self._nonterminals[name]
|
||||||
|
|
||||||
def compile_rule(self, rule: parser.NonTerminal) -> Matcher:
|
def compile_rule(self, rule: parser.NonTerminal) -> Matcher:
|
||||||
matcher = Matcher()
|
generated_grammar: list[typing.Tuple[str, list[str]]] = []
|
||||||
code = matcher.code
|
visited: set[str] = set()
|
||||||
patcher: dict[str, int] = {}
|
group_count = 0
|
||||||
|
indent_amounts: dict[str, int] = {}
|
||||||
|
done_newline = False
|
||||||
|
|
||||||
def compile_nonterminal(rule: parser.NonTerminal):
|
def compile_nonterminal(name: str, rule: parser.NonTerminal):
|
||||||
sub_start = patcher.get(rule.name)
|
if name not in visited:
|
||||||
if sub_start is not None:
|
visited.add(name)
|
||||||
code.append(Jump(sub_start))
|
for production in rule.fn(self.grammar).flatten(with_metadata=True):
|
||||||
else:
|
trans_prod = compile_production(production)
|
||||||
sub_start = len(code)
|
generated_grammar.append((name, trans_prod))
|
||||||
patcher[rule.name] = sub_start
|
|
||||||
tails = []
|
|
||||||
subs = list(rule.fn(self.grammar).flatten(with_metadata=True))
|
|
||||||
for sub in subs[:-1]:
|
|
||||||
split_pos = len(code)
|
|
||||||
code.append(Split(0, 0))
|
|
||||||
|
|
||||||
compile_production(sub)
|
def compile_production(production: parser.FlattenedWithMetadata) -> list[str]:
|
||||||
|
nonlocal group_count
|
||||||
|
nonlocal indent_amounts
|
||||||
|
nonlocal done_newline
|
||||||
|
|
||||||
tails.append(len(code))
|
result = []
|
||||||
code.append(Jump(0))
|
|
||||||
|
|
||||||
code[split_pos] = Split(sub_start + 1, len(code))
|
|
||||||
sub_start = len(code)
|
|
||||||
|
|
||||||
compile_production(subs[-1])
|
|
||||||
|
|
||||||
for tail in tails:
|
|
||||||
code[tail] = Jump(len(code))
|
|
||||||
|
|
||||||
def compile_production(production: parser.FlattenedWithMetadata):
|
|
||||||
for item in production:
|
for item in production:
|
||||||
if isinstance(item, str):
|
if isinstance(item, str):
|
||||||
rule = self.lookup_nonterminal(item)
|
nt = self._nonterminals[item]
|
||||||
if rule.transparent:
|
if nt.transparent:
|
||||||
# If it's transparent then we need to inline the pattern here.
|
# If it's transparent then we make a new set of
|
||||||
compile_nonterminal(rule)
|
# productions that covers the contents of the
|
||||||
|
# transparent nonterminal.
|
||||||
|
name = "xxx_" + nt.name
|
||||||
|
compile_nonterminal(name, nt)
|
||||||
|
result.append(name)
|
||||||
else:
|
else:
|
||||||
code.append(MatchNonTerminal(item))
|
# Otherwise it's a "token" in our input, named
|
||||||
|
# "tree_{whatever}".
|
||||||
|
result.append(f"tree_{item}")
|
||||||
|
|
||||||
elif isinstance(item, parser.Terminal):
|
elif isinstance(item, parser.Terminal):
|
||||||
name = item.name
|
# If it's a terminal it will appear in our input as
|
||||||
assert name is not None
|
# "token_{whatever}".
|
||||||
code.append(MatchTerminal(name))
|
result.append(f"token_{item.name}")
|
||||||
|
|
||||||
else:
|
else:
|
||||||
meta, children = item
|
meta, children = item
|
||||||
|
tx_children = compile_production(children)
|
||||||
|
|
||||||
prettier = meta.get("prettier")
|
pretty = meta.get("prettier")
|
||||||
if isinstance(prettier, PrettyMeta):
|
if isinstance(pretty, PrettyMeta):
|
||||||
if prettier.indent:
|
if pretty.group:
|
||||||
code.append(StartIndent())
|
# Make a fake rule.
|
||||||
if prettier.group:
|
rule_name = f"g_{group_count}"
|
||||||
code.append(StartGroup())
|
group_count += 1
|
||||||
|
generated_grammar.append((rule_name, tx_children))
|
||||||
|
tx_children = [rule_name]
|
||||||
|
|
||||||
compile_production(children)
|
if pretty.indent:
|
||||||
|
rule_name = f"i_{len(indent_amounts)}"
|
||||||
|
indent_amounts[rule_name] = pretty.indent
|
||||||
|
generated_grammar.append((rule_name, tx_children))
|
||||||
|
tx_children = [rule_name]
|
||||||
|
|
||||||
if isinstance(prettier, PrettyMeta):
|
if pretty.newline:
|
||||||
if prettier.group:
|
if not done_newline:
|
||||||
code.append(EndGroup())
|
generated_grammar.append(("newline", []))
|
||||||
if prettier.indent:
|
done_newline = True
|
||||||
code.append(EndIndent(prettier.indent))
|
tx_children.append("newline")
|
||||||
if prettier.newline:
|
|
||||||
code.append(NewLine())
|
|
||||||
|
|
||||||
compile_nonterminal(rule)
|
# If it turned out to have formatting meta then we will
|
||||||
code.append(Accept())
|
# have replaced or augmented the translated children
|
||||||
return matcher
|
# appropriately. Otherwise, if it's highlighting meta or
|
||||||
|
# something else, we'll have ignored it and the
|
||||||
|
# translated children should just be inserted inline.
|
||||||
|
result.extend(tx_children)
|
||||||
|
|
||||||
|
return result
|
||||||
|
|
||||||
|
compile_nonterminal(rule.name, rule)
|
||||||
|
gen = self.grammar._generator(rule.name, generated_grammar)
|
||||||
|
parse_table = gen.gen_table()
|
||||||
|
|
||||||
|
# print(parse_table.format())
|
||||||
|
|
||||||
|
return Matcher(parse_table, indent_amounts)
|
||||||
|
|
||||||
def rule_to_matcher(self, rule: parser.NonTerminal) -> Matcher:
|
def rule_to_matcher(self, rule: parser.NonTerminal) -> Matcher:
|
||||||
result = self._matchers.get(rule.name)
|
result = self._matchers.get(rule.name)
|
||||||
|
|
@ -385,15 +279,11 @@ class Printer:
|
||||||
|
|
||||||
rule = self.lookup_nonterminal(name)
|
rule = self.lookup_nonterminal(name)
|
||||||
matcher = self.rule_to_matcher(rule)
|
matcher = self.rule_to_matcher(rule)
|
||||||
print(f"--------")
|
|
||||||
print(f"Matching with:\n{matcher.format()}")
|
|
||||||
m = matcher.match(self, list(tree.children))
|
m = matcher.match(self, list(tree.children))
|
||||||
print(f"--------")
|
|
||||||
if m is None:
|
if m is None:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"Could not match a valid tree for {tree.name} with {len(tree.children)} children:\n{tree.format()}\nMatcher:\n{matcher.format()}"
|
f"Could not match a valid tree for {tree.name} with {len(tree.children)} children:\n{tree.format()}"
|
||||||
)
|
)
|
||||||
# return m
|
|
||||||
return resolve_document(m)
|
return resolve_document(m)
|
||||||
|
|
||||||
def format_tree(self, tree: runtime.Tree) -> str:
|
def format_tree(self, tree: runtime.Tree) -> str:
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue