Finish annotating test grammar, forced breaks, fixes

A forced break always inserts a newline at a given spot, which is
sometimes what we want. (Some syntax should *never* be on a single line.)
This commit is contained in:
John Doty 2024-09-13 11:57:16 -07:00
parent 938f0e5c69
commit d7a6891519
6 changed files with 273 additions and 92 deletions

View file

@ -6,14 +6,16 @@ from parser import (
Rule,
Terminal,
alt,
br,
group,
highlight,
indent,
mark,
newline,
nl,
opt,
rule,
seq,
sp,
)
@ -53,7 +55,7 @@ class FineGrammar(Grammar):
def _file_statement_list(self) -> Rule:
return alt(
self._file_statement,
self._file_statement_list + newline() + self._file_statement,
self._file_statement_list + nl + self._file_statement,
)
@rule
@ -64,7 +66,9 @@ class FineGrammar(Grammar):
@rule
def import_statement(self) -> Rule:
return seq(self.IMPORT, self.STRING, self.AS, self.IDENTIFIER, self.SEMICOLON)
return group(
self.IMPORT, sp, self.STRING, sp, self.AS, sp, self.IDENTIFIER, sp, self.SEMICOLON
)
@rule("ClassDeclaration")
def class_declaration(self) -> Rule:
@ -72,16 +76,14 @@ class FineGrammar(Grammar):
group(
group(
self.CLASS,
newline(),
sp,
mark(self.IDENTIFIER, field="name", highlight=highlight.entity.name.type),
sp,
),
self.LCURLY,
),
indent(
newline(),
mark(opt(self.class_body), field="body"),
),
newline(),
indent(nl, mark(opt(self.class_body), field="body")),
nl,
self.RCURLY,
)
@ -99,7 +101,7 @@ class FineGrammar(Grammar):
@rule("FieldDecl")
def field_declaration(self) -> Rule:
return seq(self.IDENTIFIER, self.COLON, self.type_expression, self.SEMICOLON)
return group(self.IDENTIFIER, self.COLON, sp, self.type_expression, self.SEMICOLON) + nl
# Types
@rule("TypeExpression")
@ -108,7 +110,7 @@ class FineGrammar(Grammar):
@rule("AlternateType")
def alternate_type(self) -> Rule:
return seq(self.type_expression, self.OR, self.type_identifier)
return group(self.type_expression, sp, self.OR, sp, self.type_identifier)
@rule("TypeIdentifier")
def type_identifier(self) -> Rule:
@ -117,28 +119,28 @@ class FineGrammar(Grammar):
@rule
def export_statement(self) -> Rule:
return alt(
seq(self.EXPORT, self.class_declaration),
seq(self.EXPORT, self.function_declaration),
seq(self.EXPORT, self.let_statement),
seq(self.EXPORT, self.export_list, self.SEMICOLON),
group(self.EXPORT, sp, self.class_declaration),
group(self.EXPORT, sp, self.function_declaration),
group(self.EXPORT, sp, self.let_statement),
group(self.EXPORT, sp, self.export_list, self.SEMICOLON),
)
@rule
def export_list(self) -> Rule:
return (
self.IDENTIFIER
| seq(self.IDENTIFIER, self.COMMA)
| seq(self.IDENTIFIER, self.COMMA, self.export_list)
)
return self.IDENTIFIER | seq(self.IDENTIFIER, self.COMMA, sp, self.export_list)
# Functions
@rule("FunctionDecl")
def function_declaration(self) -> Rule:
return seq(
self.FUN,
mark(self.IDENTIFIER, field="name", highlight=highlight.entity.name.function),
mark(self.function_parameters, field="parameters"),
mark(opt(self.ARROW, self.type_expression), field="return_type"),
group(
self.FUN,
sp,
mark(self.IDENTIFIER, field="name", highlight=highlight.entity.name.function),
sp,
mark(self.function_parameters, field="parameters"),
mark(opt(sp, self.ARROW, sp, self.type_expression), field="return_type"),
),
mark(self.block, field="body"),
)
@ -147,14 +149,14 @@ class FineGrammar(Grammar):
return group(
self.LPAREN,
indent(
newline(),
nl,
opt(
self._first_parameter
| seq(self._first_parameter, self.COMMA)
| group(self._first_parameter, self.COMMA, newline(), self._parameter_list)
| group(self._first_parameter, self.COMMA, sp, self._parameter_list)
),
),
newline(),
nl,
self.RPAREN,
)
@ -164,18 +166,18 @@ class FineGrammar(Grammar):
@rule
def _parameter_list(self) -> Rule:
return self.parameter | seq(self.parameter, self.COMMA, newline(), self._parameter_list)
return self.parameter | seq(self.parameter, self.COMMA, sp, self._parameter_list)
@rule("Parameter")
def parameter(self) -> Rule:
return seq(self.IDENTIFIER, self.COLON, self.type_expression)
return group(self.IDENTIFIER, self.COLON, sp, self.type_expression)
# Block
@rule("Block")
def block(self) -> Rule:
return alt(
seq(self.LCURLY, self.RCURLY),
group(self.LCURLY, indent(newline(), self.block_body), newline(), self.RCURLY),
group(self.LCURLY, nl, self.RCURLY),
seq(self.LCURLY, indent(br, self.block_body), br, self.RCURLY),
)
@rule("BlockBody")
@ -183,12 +185,12 @@ class FineGrammar(Grammar):
return alt(
self.expression,
self._statement_list,
seq(self._statement_list, newline(), self.expression),
seq(self._statement_list, br, self.expression),
)
@rule
def _statement_list(self) -> Rule:
return self._statement | seq(self._statement_list, self._statement)
return self._statement | seq(self._statement_list, br, self._statement)
@rule
def _statement(self) -> Rule:
@ -204,15 +206,26 @@ class FineGrammar(Grammar):
@rule("LetStatement")
def let_statement(self) -> Rule:
return seq(self.LET, self.IDENTIFIER, self.EQUAL, self.expression, self.SEMICOLON)
return group(
self.LET,
sp,
self.IDENTIFIER,
indent(sp, self.EQUAL, indent(sp, group(self.expression, self.SEMICOLON))),
)
@rule("ReturnStatement")
def return_statement(self) -> Rule:
return seq(self.RETURN, self.expression, self.SEMICOLON) | seq(self.RETURN, self.SEMICOLON)
return alt(
group(self.RETURN, indent(sp, group(self.expression, self.SEMICOLON))),
group(self.RETURN, self.SEMICOLON),
)
@rule("ForStatement")
def for_statement(self) -> Rule:
return seq(self.FOR, self.iterator_variable, self.IN, self.expression, self.block)
return group(
group(self.FOR, sp, self.iterator_variable, sp, self.IN, sp, group(self.expression)),
self.block,
)
@rule("IteratorVariable")
def iterator_variable(self) -> Rule:
@ -224,7 +237,7 @@ class FineGrammar(Grammar):
@rule
def while_statement(self) -> Rule:
return seq(self.WHILE, self.expression, self.block)
return group(group(self.WHILE, sp, self.expression), sp, self.block)
@rule
def expression_statement(self) -> Rule:
@ -238,24 +251,24 @@ class FineGrammar(Grammar):
@rule("BinaryExpression")
def binary_expression(self) -> Rule:
return alt(
self.expression + self.EQUAL + self.expression,
self.expression + self.OR + self.expression,
self.expression + self.AND + self.expression,
self.expression + self.EQUALEQUAL + self.expression,
self.expression + self.BANGEQUAL + self.expression,
self.expression + self.LESS + self.expression,
self.expression + self.LESSEQUAL + self.expression,
self.expression + self.GREATER + self.expression,
self.expression + self.GREATEREQUAL + self.expression,
self.expression + self.PLUS + self.expression,
self.expression + self.MINUS + self.expression,
self.expression + self.STAR + self.expression,
self.expression + self.SLASH + self.expression,
group(self.expression, sp, self.EQUAL, sp, self.expression),
group(self.expression, sp, self.OR, sp, self.expression),
group(self.expression, sp, self.AND, sp, self.expression),
group(self.expression, sp, self.EQUALEQUAL, sp, self.expression),
group(self.expression, sp, self.BANGEQUAL, sp, self.expression),
group(self.expression, sp, self.LESS, sp, self.expression),
group(self.expression, sp, self.LESSEQUAL, sp, self.expression),
group(self.expression, sp, self.GREATER, sp, self.expression),
group(self.expression, sp, self.GREATEREQUAL, sp, self.expression),
group(self.expression, sp, self.PLUS, sp, self.expression),
group(self.expression, sp, self.MINUS, sp, self.expression),
group(self.expression, sp, self.STAR, sp, self.expression),
group(self.expression, sp, self.SLASH, sp, self.expression),
)
@rule("IsExpression")
def is_expression(self) -> Rule:
return seq(self.expression, self.IS, self.pattern)
return group(self.expression, sp, self.IS, indent(sp, self.pattern))
@rule
def primary_expression(self) -> Rule:
@ -271,9 +284,15 @@ class FineGrammar(Grammar):
| self.object_constructor_expression
| self.match_expression
| seq(self.primary_expression, self.LPAREN, self.RPAREN)
| seq(self.primary_expression, self.LPAREN, self._expression_list, self.RPAREN)
| seq(self.primary_expression, self.DOT, self.IDENTIFIER)
| seq(self.LPAREN, self.expression, self.RPAREN)
| group(
self.primary_expression,
self.LPAREN,
indent(nl, self._expression_list),
nl,
self.RPAREN,
)
| group(self.primary_expression, indent(nl, self.DOT, self.IDENTIFIER))
| group(self.LPAREN, indent(nl, self.expression), nl, self.RPAREN)
)
@rule("IdentifierExpression")
@ -287,15 +306,26 @@ class FineGrammar(Grammar):
@rule("ConditionalExpression")
def conditional_expression(self) -> Rule:
return (
seq(self.IF, self.expression, self.block)
| seq(self.IF, self.expression, self.block, self.ELSE, self.conditional_expression)
| seq(self.IF, self.expression, self.block, self.ELSE, self.block)
seq(group(self.IF, sp, self.expression), sp, self.block)
| seq(
group(self.IF, sp, self.expression),
sp,
self.block,
sp,
self.ELSE,
sp,
self.conditional_expression,
)
| seq(
group(self.IF, sp, self.expression), sp, self.block, sp, self.ELSE, sp, self.block
)
)
@rule
def list_constructor_expression(self) -> Rule:
return seq(self.LSQUARE, self.RSQUARE) | seq(
self.LSQUARE, self._expression_list, self.RSQUARE
return alt(
group(self.LSQUARE, nl, self.RSQUARE),
group(self.LSQUARE, indent(nl, self._expression_list), nl, self.RSQUARE),
)
@rule
@ -303,34 +333,37 @@ class FineGrammar(Grammar):
return (
self.expression
| seq(self.expression, self.COMMA)
| seq(self.expression, self.COMMA, self._expression_list)
| seq(self.expression, self.COMMA, sp, self._expression_list)
)
@rule
def match_expression(self) -> Rule:
return seq(self.MATCH, self.expression, self.match_body)
return group(group(self.MATCH, sp, self.expression), sp, self.match_body)
@rule("MatchBody")
def match_body(self) -> Rule:
return seq(self.LCURLY, self.RCURLY) | seq(self.LCURLY, self._match_arms, self.RCURLY)
return alt(
group(self.LCURLY, nl, self.RCURLY),
group(self.LCURLY, indent(nl, self._match_arms), nl, self.RCURLY),
)
@rule
def _match_arms(self) -> Rule:
return (
self.match_arm
| seq(self.match_arm, self.COMMA)
| seq(self.match_arm, self.COMMA, self._match_arms)
| seq(self.match_arm, self.COMMA, br, self._match_arms)
)
@rule("MatchArm")
def match_arm(self) -> Rule:
return seq(self.pattern, self.ARROW, self.expression)
return group(self.pattern, sp, self.ARROW, sp, self.expression)
@rule("Pattern")
def pattern(self) -> Rule:
return (
seq(self.variable_binding, self._pattern_core, self.AND, self.expression)
| seq(self.variable_binding, self._pattern_core)
group(self.variable_binding, self._pattern_core, sp, self.AND, sp, self.expression)
| group(self.variable_binding, self._pattern_core)
| self._pattern_core
)
@ -348,23 +381,26 @@ class FineGrammar(Grammar):
@rule
def object_constructor_expression(self) -> Rule:
return seq(self.NEW, self.type_identifier, self.field_list)
return group(self.NEW, sp, self.type_identifier, self.field_list)
@rule
def field_list(self) -> Rule:
return seq(self.LCURLY, self.RCURLY) | seq(self.LCURLY, self.field_values, self.RCURLY)
return alt(
seq(self.LCURLY, self.RCURLY),
group(self.LCURLY, indent(nl, self.field_values), nl, self.RCURLY),
)
@rule
def field_values(self) -> Rule:
return (
self.field_value
| seq(self.field_value, self.COMMA)
| seq(self.field_value, self.COMMA, self.field_values)
| seq(self.field_value, self.COMMA, sp, self.field_values)
)
@rule
def field_value(self) -> Rule:
return self.IDENTIFIER | seq(self.IDENTIFIER, self.COLON, self.expression)
return self.IDENTIFIER | group(self.IDENTIFIER, self.COLON, indent(sp, self.expression))
BLANKS = Terminal(Re.set(" ", "\t", "\r", "\n").plus())
COMMENT = Terminal(
@ -461,13 +497,17 @@ if __name__ == "__main__":
from parser.emacs import emit_emacs_major_mode
from parser.tree_sitter import emit_tree_sitter_grammar, emit_tree_sitter_queries
# TODO: Actually generate a lexer/parser for some runtime.
grammar = FineGrammar()
grammar.build_table()
lexer = grammar.compile_lexer()
dump_lexer_table(lexer)
# Generate tree-sitter parser and emacs mode.
ts_path = Path(__file__).parent / "tree-sitter-fine"
emit_tree_sitter_grammar(grammar, ts_path)
emit_tree_sitter_queries(grammar, ts_path)
emit_emacs_major_mode(grammar, ts_path / "fine.el")
# TODO: Generate pretty-printer code.

View file

@ -4,4 +4,4 @@
.PHONY: test
test:
python3 ./parser/parser.py
pdm run pytest
pdm run python3 -m pytest

View file

@ -540,7 +540,7 @@ class ErrorCollection:
match action:
case Reduce(name=name, count=count, transparent=transparent):
name_str = name if not transparent else f"transparent node ({name})"
action_str = f"pop {count} values off the stack and make a {name_str}"
action_str = f"use the {count} values to make a {name_str}"
case Shift():
action_str = "consume the token and keep going"
case Accept():
@ -2680,6 +2680,7 @@ highlight = _Highlight()
@dataclasses.dataclass
class FormatMeta(SyntaxMeta):
newline: str | None = None
forced_break: bool = False
indent: int | None = None
group: bool = False
@ -2717,6 +2718,17 @@ def newline(text: str | None = None) -> Rule:
return mark(Nothing, format=FormatMeta(newline=text))
nl = newline("")
sp = newline(" ")
def forced_break() -> Rule:
return mark(Nothing, format=FormatMeta(forced_break=True))
br = forced_break()
###############################################################################
# Finally, the base class for grammars
###############################################################################
@ -2753,7 +2765,8 @@ class Grammar:
_precedence: dict[str, typing.Tuple[Assoc, int]]
_generator: type[GenerateLR0]
_terminals: list[Terminal]
_terminals: dict[str, Terminal]
_nonterminals: dict[str, NonTerminal]
_trivia: list[Terminal]
def __init__(
@ -2794,6 +2807,19 @@ class Grammar:
raise ValueError(f"More than one terminal has the name '{n}'")
terminals[n] = t
# Get the nonterminals.
nonterminals = {}
for _, nt in inspect.getmembers(self, lambda x: isinstance(x, NonTerminal)):
if nt.name in nonterminals:
raise ValueError(f"More than one nonterminal found with the name '{nt.name}'")
if nt.name in terminals:
raise ValueError(
f"'{nt.name}' is the name of both a Terminal and a NonTerminal rule"
)
nonterminals[nt.name] = nt
# Resolve the trivia declarations correctly.
resolved_trivia: list[Terminal] = []
for t in trivia:
@ -2809,12 +2835,22 @@ class Grammar:
precedence_table = {}
for prec, (associativity, symbols) in enumerate(precedence):
for symbol in symbols:
key = None
if isinstance(symbol, Terminal):
key = symbol.name
if key is None:
raise ValueError(f"{symbol} is a terminal that has not had a name set yet")
elif isinstance(symbol, NonTerminal):
key = symbol.name
else:
raise ValueError(f"{symbol} must be either a Token or a NonTerminal")
elif isinstance(symbol, str):
key = terminals.get(symbol)
if key is None:
key = nonterminals.get(symbol)
if key is None:
raise ValueError(
f"{symbol} must be either a Token or a NonTerminal, or the name of one"
)
precedence_table[key] = (associativity, prec + 1)
@ -2826,18 +2862,19 @@ class Grammar:
self._precedence = precedence_table
self.start = start
self._generator = generator
self._terminals = list(terminals.values())
self._terminals = terminals
self._nonterminals = nonterminals
self._trivia = resolved_trivia
self.name = name
def terminals(self) -> list[Terminal]:
return self._terminals
return list(self._terminals.values())
def trivia_terminals(self) -> list[Terminal]:
return self._trivia
def non_terminals(self) -> list[NonTerminal]:
return [nt for _, nt in inspect.getmembers(self, lambda x: isinstance(x, NonTerminal))]
return list(self._nonterminals.values())
def get_precedence(self, name: str) -> None | tuple[Assoc, int]:
return self._precedence.get(name)
@ -2858,9 +2895,8 @@ class Grammar:
if start is None:
start = self.start
rules = self.non_terminals()
nonterminals = {rule.name: rule for rule in rules}
transparents = {rule.name for rule in rules if rule.transparent}
nonterminals = self._nonterminals
transparents = {rule.name for rule in nonterminals.values() if rule.transparent}
grammar = {}

View file

@ -5,6 +5,9 @@ import typing
from . import parser
from . import runtime
# TODO: I think I want a *force break*, i.e., a document which forces things
# to not fit on one line.
@dataclasses.dataclass(frozen=True)
class Cons:
@ -24,6 +27,11 @@ class NewLine:
replace: str
@dataclasses.dataclass(frozen=True)
class ForceBreak:
pass
@dataclasses.dataclass(frozen=True)
class Indent:
amount: int
@ -60,7 +68,7 @@ class Lazy:
return Lazy(lambda: printer.convert_tree_to_document(tree))
Document = None | Text | Literal | NewLine | Cons | Indent | Group | Lazy
Document = None | Text | Literal | NewLine | ForceBreak | Cons | Indent | Group | Lazy
class DocumentLayout:
@ -127,6 +135,12 @@ def layout_document(doc: Document, width: int) -> DocumentLayout:
# all fit.
return True
case ForceBreak():
# If we're in a flattened chunk then force it to break by
# returning false here, otherwise we're at the end of the
# line and yes, whatever you were asking about has fit.
return not chunk.flat
case Cons(left, right):
stack.append(chunk.with_document(right))
stack.append(chunk.with_document(left))
@ -180,6 +194,11 @@ def layout_document(doc: Document, width: int) -> DocumentLayout:
output.append("\n" + (chunk.indent * " "))
column = chunk.indent
case ForceBreak():
# TODO: Custom newline expansion, custom indent segments.
output.append("\n" + (chunk.indent * " "))
column = chunk.indent
case Cons(left, right):
chunks.append(chunk.with_document(right))
chunks.append(chunk.with_document(left))
@ -292,12 +311,14 @@ class Matcher:
elif name[0] == "n":
replace = self.newline_replace[name]
print(f"!!!! {name} -> {repr(replace)}")
child = cons(child, NewLine(replace))
elif name[0] == "p":
child = cons(NewLine(""), child)
elif name[0] == "f":
child = cons(child, ForceBreak())
else:
pass # Reducing a transparent rule probably.
@ -375,8 +396,8 @@ class Printer:
visited: set[str] = set()
group_count = 0
indent_amounts: dict[str, int] = {}
done_newline = False
newline_map: dict[str, str] = {}
done_forced_break = False
def compile_nonterminal(name: str, rule: parser.NonTerminal):
if name not in visited:
@ -388,7 +409,7 @@ class Printer:
def compile_production(production: parser.FlattenedWithMetadata) -> list[str]:
nonlocal group_count
nonlocal indent_amounts
nonlocal done_newline
nonlocal done_forced_break
result = []
for item in production:
@ -439,6 +460,13 @@ class Printer:
tx_children.append(newline_rule_name)
if pretty.forced_break:
if not done_forced_break:
generated_grammar.append(("forced_break", []))
done_forced_break = True
tx_children.append("forced_break")
# If it turned out to have formatting meta then we will
# have replaced or augmented the translated children
# appropriately. Otherwise, if it's highlighting meta or

View file

@ -359,7 +359,7 @@ def test_lexer_compile():
def foo(self):
return self.IS
start = foo
start = "foo"
IS = Terminal("is")
AS = Terminal("as")
@ -392,7 +392,7 @@ def test_lexer_numbers(n: float):
def number(self):
return self.NUMBER
start = number
start = "number"
NUMBER = Terminal(
Re.seq(

View file

@ -1,6 +1,22 @@
import typing
from parser.parser import Grammar, Re, Terminal, rule, opt, group, newline, alt, indent
from parser.parser import (
Grammar,
Re,
Terminal,
rule,
opt,
group,
newline,
alt,
indent,
seq,
Rule,
Assoc,
sp,
nl,
br,
)
import parser.runtime as runtime
import parser.wadler as wadler
@ -57,10 +73,7 @@ class JsonGrammar(Grammar):
self.value + self.COMMA + newline(" ") + self._array_items,
)
BLANKS = Terminal(
Re.set(" ", "\t", "\r", "\n").plus(),
is_format_blank=True,
)
BLANKS = Terminal(Re.set(" ", "\t", "\r", "\n").plus())
LCURLY = Terminal("{")
RCURLY = Terminal("}")
COMMA = Terminal(",")
@ -103,6 +116,8 @@ def flatten_document(doc: wadler.Document, src: str) -> list:
match doc:
case wadler.NewLine(replace):
return [f"<newline {repr(replace)}>"]
case wadler.ForceBreak():
return ["<forced break>"]
case wadler.Indent():
return [[f"<indent {doc.amount}>", flatten_document(doc.doc, src)]]
case wadler.Text(start, end):
@ -204,3 +219,65 @@ def test_layout_basic():
}
""".strip()
)
def test_forced_break():
class TG(Grammar):
start = "root"
trivia = ["BLANKS"]
@rule
def root(self):
return self._expression
@rule
def _expression(self):
return self.word | self.list
@rule
def list(self):
return group(self.LPAREN, indent(nl, self._expressions), nl, self.RPAREN)
@rule
def _expressions(self):
return self._expression | seq(self._expressions, sp, self._expression)
@rule
def word(self):
return self.OK | seq(self.BREAK, br, self.BREAK)
LPAREN = Terminal("(")
RPAREN = Terminal(")")
OK = Terminal("ok")
BREAK = Terminal("break")
BLANKS = Terminal(Re.set(" ", "\t", "\r", "\n").plus())
g = TG()
g_lexer = g.compile_lexer()
g_parser = runtime.Parser(g.build_table())
text = "((ok ok) (ok break break ok) (ok ok ok ok))"
tree, errors = g_parser.parse(runtime.GenericTokenStream(text, g_lexer))
assert errors == []
assert tree is not None
printer = wadler.Printer(g)
result = printer.format_tree(tree, 200).apply_to_source(text)
assert (
result
== """
(
(ok ok)
(
ok
break
break
ok
)
(ok ok ok ok)
)
""".strip()
)