Compare commits

..

No commits in common. "49ad7fdb523bceda06870fa2f09825d6ffdba321" and "2473ae713d023653d72917cafae22b15de43b011" have entirely different histories.

3 changed files with 95 additions and 181 deletions

View file

@ -23,20 +23,19 @@ To get started, create a grammar that derives from the `Grammar` class. Create
one method per nonterminal, decorated with the `rule` decorator. Here's an one method per nonterminal, decorated with the `rule` decorator. Here's an
example: example:
PLUS = Token('+')
LPAREN = Token('(')
RPAREN = Token(')')
ID = Token('id')
class SimpleGrammar(Grammar): class SimpleGrammar(Grammar):
@rule @rule
def expression(self): def expression(self):
return seq(self.expression, self.PLUS, self.term) | self.term return seq(self.expression, PLUS, self.term) | self.term
@rule @rule
def term(self): def term(self):
return seq(self.LPAREN, self.expression, self.RPAREN) | self.ID return seq(LPAREN, self.expression, RPAREN) | ID
PLUS = Terminal('+')
LPAREN = Terminal('(')
RPAREN = Terminal(')')
ID = Terminal('id')
## Using grammars ## Using grammars

View file

@ -2,7 +2,16 @@
import re import re
import typing import typing
from parser import Assoc, Grammar, Nothing, rule, seq, Rule, Terminal, Re, TerminalKind from parser import (
Assoc,
Grammar,
Nothing,
rule,
seq,
Rule,
Terminal,
Re,
)
class FineGrammar(Grammar): class FineGrammar(Grammar):
@ -324,34 +333,30 @@ class FineGrammar(Grammar):
return self.IDENTIFIER | seq(self.IDENTIFIER, self.COLON, self.expression) return self.IDENTIFIER | seq(self.IDENTIFIER, self.COLON, self.expression)
BLANKS = Terminal(Re.set(" ", "\t", "\r", "\n").plus()) BLANKS = Terminal(Re.set(" ", "\t", "\r", "\n").plus())
COMMENT = Terminal( COMMENT = Terminal(Re.seq(Re.literal("//"), Re.set("\n").invert().star()))
Re.seq(Re.literal("//"), Re.set("\n").invert().star()),
kind=TerminalKind.Comment.Line,
)
ARROW = Terminal("->", kind=TerminalKind.Keyword.Operator) ARROW = Terminal("->")
AS = Terminal("as", kind=TerminalKind.Keyword.Operator.Expression) AS = Terminal("as")
BAR = Terminal("|", kind=TerminalKind.Keyword.Operator.Expression) BAR = Terminal("bar")
CLASS = Terminal("class", kind=TerminalKind.Storage.Type.Class) CLASS = Terminal("class")
COLON = Terminal(":", kind=TerminalKind.Punctuation.Separator) COLON = Terminal("colon")
ELSE = Terminal("else", kind=TerminalKind.Keyword.Control.Conditional) ELSE = Terminal("else")
FOR = Terminal("for", kind=TerminalKind.Keyword.Control) FOR = Terminal("for")
FUN = Terminal("fun", kind=TerminalKind.Storage.Type.Function) FUN = Terminal("fun")
IDENTIFIER = Terminal( IDENTIFIER = Terminal(
Re.seq( Re.seq(
Re.set(("a", "z"), ("A", "Z"), "_"), Re.set(("a", "z"), ("A", "Z"), "_"),
Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(), Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(),
),
# kind=TerminalKind.Variable, #?
) )
IF = Terminal("if", kind=TerminalKind.Keyword.Control.Conditional) )
IMPORT = Terminal("import", kind=TerminalKind.Keyword.Other) IF = Terminal("if")
IN = Terminal("in", kind=TerminalKind.Keyword.Operator) IMPORT = Terminal("import")
LCURLY = Terminal("{", kind=TerminalKind.Punctuation.CurlyBrace.Open) IN = Terminal("in")
RCURLY = Terminal("}", kind=TerminalKind.Punctuation.CurlyBrace.Close) LCURLY = Terminal("{")
LET = Terminal("Let", kind=TerminalKind.Keyword.Other) LET = Terminal("Let")
RETURN = Terminal("return", kind=TerminalKind.Keyword.Control) RCURLY = Terminal("}")
SEMICOLON = Terminal(";", kind=TerminalKind.Punctuation.Separator) RETURN = Terminal("return")
SEMICOLON = Terminal(";")
STRING = Terminal( STRING = Terminal(
# Double-quoted string. # Double-quoted string.
Re.seq( Re.seq(
@ -364,28 +369,27 @@ class FineGrammar(Grammar):
Re.literal("'"), Re.literal("'"),
(~Re.set("'", "\\") | (Re.set("\\") + Re.any())).star(), (~Re.set("'", "\\") | (Re.set("\\") + Re.any())).star(),
Re.literal("'"), Re.literal("'"),
),
kind=TerminalKind.String.Quoted,
) )
WHILE = Terminal("while", kind=TerminalKind.Keyword.Control) )
EQUAL = Terminal("=", kind=TerminalKind.Keyword.Operator.Expression) WHILE = Terminal("while")
LPAREN = Terminal("(", kind=TerminalKind.Punctuation.Parenthesis.Open) EQUAL = Terminal("=")
RPAREN = Terminal(")", kind=TerminalKind.Punctuation.Parenthesis.Close) LPAREN = Terminal("(")
COMMA = Terminal(",", kind=TerminalKind.Punctuation.Separator) RPAREN = Terminal(")")
SELF = Terminal("self", name="SELFF", kind=TerminalKind.Variable.Language) COMMA = Terminal(",")
OR = Terminal("or", kind=TerminalKind.Keyword.Operator.Expression) SELF = Terminal("self", name="SELFF")
IS = Terminal("is", kind=TerminalKind.Keyword.Operator.Expression) OR = Terminal("or")
AND = Terminal("and", kind=TerminalKind.Keyword.Operator.Expression) IS = Terminal("is")
EQUALEQUAL = Terminal("==", kind=TerminalKind.Keyword.Operator.Expression) AND = Terminal("and")
BANGEQUAL = Terminal("!=", kind=TerminalKind.Keyword.Operator.Expression) EQUALEQUAL = Terminal("==")
LESS = Terminal("<", kind=TerminalKind.Keyword.Operator.Expression) BANGEQUAL = Terminal("!=")
GREATER = Terminal(">", kind=TerminalKind.Keyword.Operator.Expression) LESS = Terminal("<")
LESSEQUAL = Terminal("<=", kind=TerminalKind.Keyword.Operator.Expression) GREATER = Terminal(">")
GREATEREQUAL = Terminal(">=", kind=TerminalKind.Keyword.Operator.Expression) LESSEQUAL = Terminal("<=")
PLUS = Terminal("+", kind=TerminalKind.Keyword.Operator.Expression) GREATEREQUAL = Terminal(">=")
MINUS = Terminal("-", kind=TerminalKind.Keyword.Operator.Expression) PLUS = Terminal("+")
STAR = Terminal("*", kind=TerminalKind.Keyword.Operator.Expression) MINUS = Terminal("-")
SLASH = Terminal("/", kind=TerminalKind.Keyword.Operator.Expression) STAR = Terminal("*")
SLASH = Terminal("/")
NUMBER = Terminal( NUMBER = Terminal(
Re.seq( Re.seq(
Re.set(("0", "9")).plus(), Re.set(("0", "9")).plus(),
@ -398,19 +402,18 @@ class FineGrammar(Grammar):
Re.set("+", "-").question(), Re.set("+", "-").question(),
Re.set(("0", "9")).plus(), Re.set(("0", "9")).plus(),
).question(), ).question(),
),
kind=TerminalKind.Constant.Numeric,
) )
TRUE = Terminal("true", kind=TerminalKind.Constant.Language) )
FALSE = Terminal("false", kind=TerminalKind.Constant.Language) TRUE = Terminal("true")
BANG = Terminal("!", kind=TerminalKind.Keyword.Operator.Expression) FALSE = Terminal("false")
DOT = Terminal(".", kind=TerminalKind.Punctuation.Separator) BANG = Terminal("!")
MATCH = Terminal("match", kind=TerminalKind.Keyword.Other) DOT = Terminal(".")
EXPORT = Terminal("export", kind=TerminalKind.Keyword.Other) MATCH = Terminal("match")
UNDERSCORE = Terminal("_", kind=TerminalKind.Variable.Language) EXPORT = Terminal("export")
NEW = Terminal("new", kind=TerminalKind.Keyword.Operator) UNDERSCORE = Terminal("_")
LSQUARE = Terminal("[", kind=TerminalKind.Punctuation.SquareBracket.Open) NEW = Terminal("new")
RSQUARE = Terminal("]", kind=TerminalKind.Punctuation.SquareBracket.Close) LSQUARE = Terminal("[")
RSQUARE = Terminal("]")
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------

View file

@ -1609,14 +1609,10 @@ class Terminal(Rule):
value: str | None value: str | None
pattern: "str | Re" pattern: "str | Re"
meta: dict[str, typing.Any]
regex: bool
def __init__(self, pattern: "str|Re", *, name: str | None = None, **kwargs): def __init__(self, pattern, *, name=None):
self.value = name self.value = name
self.pattern = pattern self.pattern = pattern
self.meta = kwargs
self.regex = isinstance(pattern, Re)
def flatten(self) -> typing.Generator[list["str | Terminal"], None, None]: def flatten(self) -> typing.Generator[list["str | Terminal"], None, None]:
# We are just ourselves when flattened. # We are just ourselves when flattened.
@ -2153,15 +2149,15 @@ class EdgeList[ET]:
class NFAState: class NFAState:
"""An NFA state. A state can be an accept state if it has a Terminal """An NFA state. Each state can be the accept state, with one or more
associated with it.""" Terminals as the result."""
accept: Terminal | None accept: list[Terminal]
epsilons: list["NFAState"] epsilons: list["NFAState"]
_edges: EdgeList["NFAState"] _edges: EdgeList["NFAState"]
def __init__(self): def __init__(self):
self.accept = None self.accept = []
self.epsilons = [] self.epsilons = []
self._edges = EdgeList() self._edges = EdgeList()
@ -2187,7 +2183,7 @@ class NFAState:
continue continue
visited.add(state) visited.add(state)
label = state.accept.value if state.accept is not None else "" label = ", ".join([t.value for t in state.accept if t.value is not None])
f.write(f' {id(state)} [label="{label}"];\n') f.write(f' {id(state)} [label="{label}"];\n')
for target in state.epsilons: for target in state.epsilons:
stack.append(target) stack.append(target)
@ -2464,42 +2460,42 @@ class NFASuperState:
def accept_terminal(self) -> Terminal | None: def accept_terminal(self) -> Terminal | None:
accept = None accept = None
for st in self.states: for st in self.states:
if st.accept is None: for ac in st.accept:
continue
if accept is None: if accept is None:
accept = st.accept accept = ac
elif accept.value != st.accept.value: elif accept.value != ac.value:
if accept.regex and not st.accept.regex: accept_regex = isinstance(accept.pattern, Re)
accept = st.accept ac_regex = isinstance(ac.pattern, Re)
elif st.accept.regex and not accept.regex:
if accept_regex and not ac_regex:
accept = ac
elif ac_regex and not accept_regex:
pass pass
else: else:
raise ValueError( raise ValueError(
f"Lexer is ambiguous: cannot distinguish between {accept.value} ('{accept.pattern}') and {st.accept.value} ('{st.accept.pattern}')" f"Lexer is ambiguous: cannot distinguish between {accept.value} ('{accept.pattern}') and {ac.value} ('{ac.pattern}')"
) )
return accept return accept
def compile_lexer(grammar: Grammar) -> LexerTable: def compile_terminals(terminals: typing.Iterable[Terminal]) -> LexerTable:
# Parse the terminals all together into a big NFA rooted at `NFA`. # Parse the terminals all together into a big NFA rooted at `NFA`.
NFA = NFAState() NFA = NFAState()
for terminal in grammar.terminals: for terminal in terminals:
pattern = terminal.pattern pattern = terminal.pattern
if isinstance(pattern, Re): if isinstance(pattern, Re):
start, ends = pattern.to_nfa() start, ends = pattern.to_nfa()
for end in ends: for end in ends:
end.accept = terminal end.accept.append(terminal)
NFA.epsilons.append(start) NFA.epsilons.append(start)
else: else:
start = end = NFAState() start = end = NFAState()
for c in pattern: for c in pattern:
end = end.add_edge(Span.from_str(c), NFAState()) end = end.add_edge(Span.from_str(c), NFAState())
end.accept = terminal end.accept.append(terminal)
NFA.epsilons.append(start) NFA.epsilons.append(start)
NFA.dump_graph() NFA.dump_graph()
@ -2529,8 +2525,12 @@ def compile_lexer(grammar: Grammar) -> LexerTable:
] ]
def dump_lexer_table(table: LexerTable, name: str = "lexer.dot"): def compile_lexer(grammar: Grammar) -> LexerTable:
with open(name, "w", encoding="utf-8") as f: return compile_terminals(grammar.terminals)
def dump_lexer_table(table: LexerTable):
with open("lexer.dot", "w", encoding="utf-8") as f:
f.write("digraph G {\n") f.write("digraph G {\n")
for index, (accept, edges) in enumerate(table): for index, (accept, edges) in enumerate(table):
label = accept.value if accept is not None else "" label = accept.value if accept is not None else ""
@ -2541,91 +2541,3 @@ def dump_lexer_table(table: LexerTable, name: str = "lexer.dot"):
pass pass
f.write("}\n") f.write("}\n")
# NOTE: We have a rich metadata system, man. Wow, how cool are we?
#
# The whole point of this stuff here is to allow automatic
# generation/maintenance of syntax coloring for editors. And maybe some
# other stuff? This is *extremely provisional*, I'm not even sure it
# makes sense yet. Tree sitter works differently, for example, and it's
# not clear at all what we want to generate for any particular editor.
#
# This here might be enough to produce extremely basic TextMate
# grammars but anything more complicated will want tree patterns
# anyway, and we can only do tree patterns by influencing the grammar.
class TerminalMeta:
    """Base class for the marker classes used as terminal metadata.

    It defines no attributes or methods; subclasses exist only to be used
    as tags.
    """

    pass


class TerminalKind(TerminalMeta):
    """Namespace of nested marker classes naming the kind of a terminal.

    Members are reached by attribute path, for example
    ``TerminalKind.Keyword.Control.Conditional`` or
    ``TerminalKind.Punctuation.CurlyBrace.Open``. Every level derives from
    ``TerminalMeta`` and carries no behavior of its own.

    NOTE(review): the hierarchy looks like it follows TextMate-style scope
    names (comment.line, keyword.control, ...) -- confirm before relying on
    that correspondence.
    """

    class Comment(TerminalMeta):
        class Block(TerminalMeta):
            pass

        class Line(TerminalMeta):
            pass

    class Constant(TerminalMeta):
        class Language(TerminalMeta):
            pass

        class Numeric(TerminalMeta):
            pass

    class Keyword(TerminalMeta):
        # Control and Operator are usable as kinds themselves and also have
        # a more specific child.
        class Control(TerminalMeta):
            class Conditional(TerminalMeta):
                pass

        class Operator(TerminalMeta):
            class Expression(TerminalMeta):
                pass

        class Other(TerminalMeta):
            pass

    class Punctuation(TerminalMeta):
        class Separator(TerminalMeta):
            pass

        # Each bracket family has its own distinct Open/Close pair.
        class Parenthesis(TerminalMeta):
            class Open(TerminalMeta):
                pass

            class Close(TerminalMeta):
                pass

        class CurlyBrace(TerminalMeta):
            class Open(TerminalMeta):
                pass

            class Close(TerminalMeta):
                pass

        class SquareBracket(TerminalMeta):
            class Open(TerminalMeta):
                pass

            class Close(TerminalMeta):
                pass

    class Storage(TerminalMeta):
        class Type(TerminalMeta):
            class Class(TerminalMeta):
                pass

            class Function(TerminalMeta):
                pass

    class String(TerminalMeta):
        class Quoted(TerminalMeta):
            class Single(TerminalMeta):
                pass

            class Double(TerminalMeta):
                pass

    class Variable(TerminalMeta):
        # Distinct from Constant.Language despite sharing the short name.
        class Language(TerminalMeta):
            pass