Compare commits

...

3 commits

Author SHA1 Message Date
49ad7fdb52 Associate metadata with terminals
This is a half-assed attempt at doing syntax coloring which I think
will almost certainly turn out to be insufficient. I'm committing it
just to record some of the work I've done but. BUT.

Probably trying to match tree-sitter is a better way of doing
this. (But, like, emitting tree-sitter grammars? Really? Wow, dude.
Way to give up.)
2024-08-27 15:43:07 -07:00
76ef85483e Accept is single-valued, the multi-value thing didn't ever make sense
I mean, it did when we thought we were going to weave NFA states as we
were building them but we ended up not doing that and instead just
using the fancy EdgeList splitting magic when building DFAs from the
NFA. It has the same power and is simpler code, and also means that
we'll *never* be asked to have multiple Terminals be accepted from a
single NFA state.
2024-08-27 15:43:01 -07:00
208491d56e This was out of date 2024-08-26 08:05:01 -07:00
3 changed files with 182 additions and 96 deletions

View file

@ -23,19 +23,20 @@ To get started, create a grammar that derives from the `Grammar` class. Create
one method per nonterminal, decorated with the `rule` decorator. Here's an one method per nonterminal, decorated with the `rule` decorator. Here's an
example: example:
PLUS = Token('+')
LPAREN = Token('(')
RPAREN = Token(')')
ID = Token('id')
class SimpleGrammar(Grammar): class SimpleGrammar(Grammar):
@rule @rule
def expression(self): def expression(self):
return seq(self.expression, PLUS, self.term) | self.term return seq(self.expression, self.PLUS, self.term) | self.term
@rule @rule
def term(self): def term(self):
return seq(LPAREN, self.expression, RPAREN) | ID return seq(self.LPAREN, self.expression, self.RPAREN) | self.ID
PLUS = Terminal('+')
LPAREN = Terminal('(')
RPAREN = Terminal(')')
ID = Terminal('id')
## Using grammars ## Using grammars

View file

@ -2,16 +2,7 @@
import re import re
import typing import typing
from parser import ( from parser import Assoc, Grammar, Nothing, rule, seq, Rule, Terminal, Re, TerminalKind
Assoc,
Grammar,
Nothing,
rule,
seq,
Rule,
Terminal,
Re,
)
class FineGrammar(Grammar): class FineGrammar(Grammar):
@ -333,30 +324,34 @@ class FineGrammar(Grammar):
return self.IDENTIFIER | seq(self.IDENTIFIER, self.COLON, self.expression) return self.IDENTIFIER | seq(self.IDENTIFIER, self.COLON, self.expression)
BLANKS = Terminal(Re.set(" ", "\t", "\r", "\n").plus()) BLANKS = Terminal(Re.set(" ", "\t", "\r", "\n").plus())
COMMENT = Terminal(Re.seq(Re.literal("//"), Re.set("\n").invert().star())) COMMENT = Terminal(
Re.seq(Re.literal("//"), Re.set("\n").invert().star()),
kind=TerminalKind.Comment.Line,
)
ARROW = Terminal("->") ARROW = Terminal("->", kind=TerminalKind.Keyword.Operator)
AS = Terminal("as") AS = Terminal("as", kind=TerminalKind.Keyword.Operator.Expression)
BAR = Terminal("bar") BAR = Terminal("|", kind=TerminalKind.Keyword.Operator.Expression)
CLASS = Terminal("class") CLASS = Terminal("class", kind=TerminalKind.Storage.Type.Class)
COLON = Terminal("colon") COLON = Terminal(":", kind=TerminalKind.Punctuation.Separator)
ELSE = Terminal("else") ELSE = Terminal("else", kind=TerminalKind.Keyword.Control.Conditional)
FOR = Terminal("for") FOR = Terminal("for", kind=TerminalKind.Keyword.Control)
FUN = Terminal("fun") FUN = Terminal("fun", kind=TerminalKind.Storage.Type.Function)
IDENTIFIER = Terminal( IDENTIFIER = Terminal(
Re.seq( Re.seq(
Re.set(("a", "z"), ("A", "Z"), "_"), Re.set(("a", "z"), ("A", "Z"), "_"),
Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(), Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(),
),
# kind=TerminalKind.Variable, #?
) )
) IF = Terminal("if", kind=TerminalKind.Keyword.Control.Conditional)
IF = Terminal("if") IMPORT = Terminal("import", kind=TerminalKind.Keyword.Other)
IMPORT = Terminal("import") IN = Terminal("in", kind=TerminalKind.Keyword.Operator)
IN = Terminal("in") LCURLY = Terminal("{", kind=TerminalKind.Punctuation.CurlyBrace.Open)
LCURLY = Terminal("{") RCURLY = Terminal("}", kind=TerminalKind.Punctuation.CurlyBrace.Close)
LET = Terminal("Let") LET = Terminal("Let", kind=TerminalKind.Keyword.Other)
RCURLY = Terminal("}") RETURN = Terminal("return", kind=TerminalKind.Keyword.Control)
RETURN = Terminal("return") SEMICOLON = Terminal(";", kind=TerminalKind.Punctuation.Separator)
SEMICOLON = Terminal(";")
STRING = Terminal( STRING = Terminal(
# Double-quoted string. # Double-quoted string.
Re.seq( Re.seq(
@ -369,27 +364,28 @@ class FineGrammar(Grammar):
Re.literal("'"), Re.literal("'"),
(~Re.set("'", "\\") | (Re.set("\\") + Re.any())).star(), (~Re.set("'", "\\") | (Re.set("\\") + Re.any())).star(),
Re.literal("'"), Re.literal("'"),
),
kind=TerminalKind.String.Quoted,
) )
) WHILE = Terminal("while", kind=TerminalKind.Keyword.Control)
WHILE = Terminal("while") EQUAL = Terminal("=", kind=TerminalKind.Keyword.Operator.Expression)
EQUAL = Terminal("=") LPAREN = Terminal("(", kind=TerminalKind.Punctuation.Parenthesis.Open)
LPAREN = Terminal("(") RPAREN = Terminal(")", kind=TerminalKind.Punctuation.Parenthesis.Close)
RPAREN = Terminal(")") COMMA = Terminal(",", kind=TerminalKind.Punctuation.Separator)
COMMA = Terminal(",") SELF = Terminal("self", name="SELFF", kind=TerminalKind.Variable.Language)
SELF = Terminal("self", name="SELFF") OR = Terminal("or", kind=TerminalKind.Keyword.Operator.Expression)
OR = Terminal("or") IS = Terminal("is", kind=TerminalKind.Keyword.Operator.Expression)
IS = Terminal("is") AND = Terminal("and", kind=TerminalKind.Keyword.Operator.Expression)
AND = Terminal("and") EQUALEQUAL = Terminal("==", kind=TerminalKind.Keyword.Operator.Expression)
EQUALEQUAL = Terminal("==") BANGEQUAL = Terminal("!=", kind=TerminalKind.Keyword.Operator.Expression)
BANGEQUAL = Terminal("!=") LESS = Terminal("<", kind=TerminalKind.Keyword.Operator.Expression)
LESS = Terminal("<") GREATER = Terminal(">", kind=TerminalKind.Keyword.Operator.Expression)
GREATER = Terminal(">") LESSEQUAL = Terminal("<=", kind=TerminalKind.Keyword.Operator.Expression)
LESSEQUAL = Terminal("<=") GREATEREQUAL = Terminal(">=", kind=TerminalKind.Keyword.Operator.Expression)
GREATEREQUAL = Terminal(">=") PLUS = Terminal("+", kind=TerminalKind.Keyword.Operator.Expression)
PLUS = Terminal("+") MINUS = Terminal("-", kind=TerminalKind.Keyword.Operator.Expression)
MINUS = Terminal("-") STAR = Terminal("*", kind=TerminalKind.Keyword.Operator.Expression)
STAR = Terminal("*") SLASH = Terminal("/", kind=TerminalKind.Keyword.Operator.Expression)
SLASH = Terminal("/")
NUMBER = Terminal( NUMBER = Terminal(
Re.seq( Re.seq(
Re.set(("0", "9")).plus(), Re.set(("0", "9")).plus(),
@ -402,18 +398,19 @@ class FineGrammar(Grammar):
Re.set("+", "-").question(), Re.set("+", "-").question(),
Re.set(("0", "9")).plus(), Re.set(("0", "9")).plus(),
).question(), ).question(),
),
kind=TerminalKind.Constant.Numeric,
) )
) TRUE = Terminal("true", kind=TerminalKind.Constant.Language)
TRUE = Terminal("true") FALSE = Terminal("false", kind=TerminalKind.Constant.Language)
FALSE = Terminal("false") BANG = Terminal("!", kind=TerminalKind.Keyword.Operator.Expression)
BANG = Terminal("!") DOT = Terminal(".", kind=TerminalKind.Punctuation.Separator)
DOT = Terminal(".") MATCH = Terminal("match", kind=TerminalKind.Keyword.Other)
MATCH = Terminal("match") EXPORT = Terminal("export", kind=TerminalKind.Keyword.Other)
EXPORT = Terminal("export") UNDERSCORE = Terminal("_", kind=TerminalKind.Variable.Language)
UNDERSCORE = Terminal("_") NEW = Terminal("new", kind=TerminalKind.Keyword.Operator)
NEW = Terminal("new") LSQUARE = Terminal("[", kind=TerminalKind.Punctuation.SquareBracket.Open)
LSQUARE = Terminal("[") RSQUARE = Terminal("]", kind=TerminalKind.Punctuation.SquareBracket.Close)
RSQUARE = Terminal("]")
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------

View file

@ -1609,10 +1609,14 @@ class Terminal(Rule):
value: str | None value: str | None
pattern: "str | Re" pattern: "str | Re"
meta: dict[str, typing.Any]
regex: bool
def __init__(self, pattern, *, name=None): def __init__(self, pattern: "str|Re", *, name: str | None = None, **kwargs):
self.value = name self.value = name
self.pattern = pattern self.pattern = pattern
self.meta = kwargs
self.regex = isinstance(pattern, Re)
def flatten(self) -> typing.Generator[list["str | Terminal"], None, None]: def flatten(self) -> typing.Generator[list["str | Terminal"], None, None]:
# We are just ourselves when flattened. # We are just ourselves when flattened.
@ -2149,15 +2153,15 @@ class EdgeList[ET]:
class NFAState: class NFAState:
"""An NFA state. Each state can be the accept state, with one or more """An NFA state. A state can be an accept state if it has a Terminal
Terminals as the result.""" associated with it."""
accept: list[Terminal] accept: Terminal | None
epsilons: list["NFAState"] epsilons: list["NFAState"]
_edges: EdgeList["NFAState"] _edges: EdgeList["NFAState"]
def __init__(self): def __init__(self):
self.accept = [] self.accept = None
self.epsilons = [] self.epsilons = []
self._edges = EdgeList() self._edges = EdgeList()
@ -2183,7 +2187,7 @@ class NFAState:
continue continue
visited.add(state) visited.add(state)
label = ", ".join([t.value for t in state.accept if t.value is not None]) label = state.accept.value if state.accept is not None else ""
f.write(f' {id(state)} [label="{label}"];\n') f.write(f' {id(state)} [label="{label}"];\n')
for target in state.epsilons: for target in state.epsilons:
stack.append(target) stack.append(target)
@ -2460,42 +2464,42 @@ class NFASuperState:
def accept_terminal(self) -> Terminal | None: def accept_terminal(self) -> Terminal | None:
accept = None accept = None
for st in self.states:
for ac in st.accept:
if accept is None:
accept = ac
elif accept.value != ac.value:
accept_regex = isinstance(accept.pattern, Re)
ac_regex = isinstance(ac.pattern, Re)
if accept_regex and not ac_regex: for st in self.states:
accept = ac if st.accept is None:
elif ac_regex and not accept_regex: continue
if accept is None:
accept = st.accept
elif accept.value != st.accept.value:
if accept.regex and not st.accept.regex:
accept = st.accept
elif st.accept.regex and not accept.regex:
pass pass
else: else:
raise ValueError( raise ValueError(
f"Lexer is ambiguous: cannot distinguish between {accept.value} ('{accept.pattern}') and {ac.value} ('{ac.pattern}')" f"Lexer is ambiguous: cannot distinguish between {accept.value} ('{accept.pattern}') and {st.accept.value} ('{st.accept.pattern}')"
) )
return accept return accept
def compile_terminals(terminals: typing.Iterable[Terminal]) -> LexerTable: def compile_lexer(grammar: Grammar) -> LexerTable:
# Parse the terminals all together into a big NFA rooted at `NFA`. # Parse the terminals all together into a big NFA rooted at `NFA`.
NFA = NFAState() NFA = NFAState()
for terminal in terminals: for terminal in grammar.terminals:
pattern = terminal.pattern pattern = terminal.pattern
if isinstance(pattern, Re): if isinstance(pattern, Re):
start, ends = pattern.to_nfa() start, ends = pattern.to_nfa()
for end in ends: for end in ends:
end.accept.append(terminal) end.accept = terminal
NFA.epsilons.append(start) NFA.epsilons.append(start)
else: else:
start = end = NFAState() start = end = NFAState()
for c in pattern: for c in pattern:
end = end.add_edge(Span.from_str(c), NFAState()) end = end.add_edge(Span.from_str(c), NFAState())
end.accept.append(terminal) end.accept = terminal
NFA.epsilons.append(start) NFA.epsilons.append(start)
NFA.dump_graph() NFA.dump_graph()
@ -2525,12 +2529,8 @@ def compile_terminals(terminals: typing.Iterable[Terminal]) -> LexerTable:
] ]
def compile_lexer(grammar: Grammar) -> LexerTable: def dump_lexer_table(table: LexerTable, name: str = "lexer.dot"):
return compile_terminals(grammar.terminals) with open(name, "w", encoding="utf-8") as f:
def dump_lexer_table(table: LexerTable):
with open("lexer.dot", "w", encoding="utf-8") as f:
f.write("digraph G {\n") f.write("digraph G {\n")
for index, (accept, edges) in enumerate(table): for index, (accept, edges) in enumerate(table):
label = accept.value if accept is not None else "" label = accept.value if accept is not None else ""
@ -2541,3 +2541,91 @@ def dump_lexer_table(table: LexerTable):
pass pass
f.write("}\n") f.write("}\n")
# NOTE: We have a rich metadata system, man. Wow, how cool are we?
#
# The whole point of this stuff here is to allow automatic
# generation/maintenance of syntax coloring for editors. And maybe some
# other stuff? This is *extremely provisional*, I'm not even sure it
# makes sense yet. Tree-sitter works differently, for example, and it's
# not clear at all what we want to generate for any particular editor.
#
# This here might be enough to produce extremely basic TextMate
# grammars but anything more complicated will want tree patterns
# anyway, and we can only do tree patterns by influencing the grammar.
class TerminalMeta:
    """Common base class for metadata markers attached to terminals.

    It defines no behavior; it only gives every marker class below a shared
    ancestor.
    """


class TerminalKind(TerminalMeta):
    """Namespace of nested marker classes that classify a terminal.

    A grammar passes one of these classes as the ``kind=`` keyword when it
    builds a terminal, e.g. ``kind=TerminalKind.Keyword.Control.Conditional``.
    The classes are never instantiated here and carry no data; the dotted
    path of the class is the information.

    NOTE(review): the names look like TextMate scope names
    (``comment.line``, ``keyword.control.conditional``, ...) -- confirm
    before generating editor grammars from them.
    """

    class Comment(TerminalMeta):
        class Block(TerminalMeta):
            pass

        class Line(TerminalMeta):
            pass

    class Constant(TerminalMeta):
        class Language(TerminalMeta):
            pass

        class Numeric(TerminalMeta):
            pass

    class Keyword(TerminalMeta):
        # `Control` and `Operator` are used both directly and through their
        # more specific nested kinds.
        class Control(TerminalMeta):
            class Conditional(TerminalMeta):
                pass

        class Operator(TerminalMeta):
            class Expression(TerminalMeta):
                pass

        class Other(TerminalMeta):
            pass

    class Punctuation(TerminalMeta):
        class Separator(TerminalMeta):
            pass

        # Each bracket family has its own Open/Close pair so the opening and
        # closing delimiters can be told apart.
        class Parenthesis(TerminalMeta):
            class Open(TerminalMeta):
                pass

            class Close(TerminalMeta):
                pass

        class CurlyBrace(TerminalMeta):
            class Open(TerminalMeta):
                pass

            class Close(TerminalMeta):
                pass

        class SquareBracket(TerminalMeta):
            class Open(TerminalMeta):
                pass

            class Close(TerminalMeta):
                pass

    class Storage(TerminalMeta):
        class Type(TerminalMeta):
            class Class(TerminalMeta):
                pass

            class Function(TerminalMeta):
                pass

    class String(TerminalMeta):
        class Quoted(TerminalMeta):
            class Single(TerminalMeta):
                pass

            class Double(TerminalMeta):
                pass

    class Variable(TerminalMeta):
        class Language(TerminalMeta):
            pass