Compare commits
3 commits
2473ae713d
...
49ad7fdb52
| Author | SHA1 | Date | |
|---|---|---|---|
| | 49ad7fdb52 | | |
| | 76ef85483e | | |
| | 208491d56e | | |
3 changed files with 182 additions and 96 deletions
13
README.md
13
README.md
|
|
@@ -23,19 +23,20 @@ To get started, create a grammar that derives from the `Grammar` class. Create
|
||||||
one method per nonterminal, decorated with the `rule` decorator. Here's an
|
one method per nonterminal, decorated with the `rule` decorator. Here's an
|
||||||
example:
|
example:
|
||||||
|
|
||||||
PLUS = Token('+')
|
|
||||||
LPAREN = Token('(')
|
|
||||||
RPAREN = Token(')')
|
|
||||||
ID = Token('id')
|
|
||||||
|
|
||||||
class SimpleGrammar(Grammar):
|
class SimpleGrammar(Grammar):
|
||||||
@rule
|
@rule
|
||||||
def expression(self):
|
def expression(self):
|
||||||
return seq(self.expression, PLUS, self.term) | self.term
|
return seq(self.expression, self.PLUS, self.term) | self.term
|
||||||
|
|
||||||
@rule
|
@rule
|
||||||
def term(self):
|
def term(self):
|
||||||
return seq(LPAREN, self.expression, RPAREN) | ID
|
return seq(self.LPAREN, self.expression, self.RPAREN) | self.ID
|
||||||
|
|
||||||
|
PLUS = Terminal('+')
|
||||||
|
LPAREN = Terminal('(')
|
||||||
|
RPAREN = Terminal(')')
|
||||||
|
ID = Terminal('id')
|
||||||
|
|
||||||
|
|
||||||
## Using grammars
|
## Using grammars
|
||||||
|
|
|
||||||
115
grammar.py
115
grammar.py
|
|
@@ -2,16 +2,7 @@
|
||||||
import re
|
import re
|
||||||
import typing
|
import typing
|
||||||
|
|
||||||
from parser import (
|
from parser import Assoc, Grammar, Nothing, rule, seq, Rule, Terminal, Re, TerminalKind
|
||||||
Assoc,
|
|
||||||
Grammar,
|
|
||||||
Nothing,
|
|
||||||
rule,
|
|
||||||
seq,
|
|
||||||
Rule,
|
|
||||||
Terminal,
|
|
||||||
Re,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class FineGrammar(Grammar):
|
class FineGrammar(Grammar):
|
||||||
|
|
@@ -333,30 +324,34 @@ class FineGrammar(Grammar):
|
||||||
return self.IDENTIFIER | seq(self.IDENTIFIER, self.COLON, self.expression)
|
return self.IDENTIFIER | seq(self.IDENTIFIER, self.COLON, self.expression)
|
||||||
|
|
||||||
BLANKS = Terminal(Re.set(" ", "\t", "\r", "\n").plus())
|
BLANKS = Terminal(Re.set(" ", "\t", "\r", "\n").plus())
|
||||||
COMMENT = Terminal(Re.seq(Re.literal("//"), Re.set("\n").invert().star()))
|
COMMENT = Terminal(
|
||||||
|
Re.seq(Re.literal("//"), Re.set("\n").invert().star()),
|
||||||
|
kind=TerminalKind.Comment.Line,
|
||||||
|
)
|
||||||
|
|
||||||
ARROW = Terminal("->")
|
ARROW = Terminal("->", kind=TerminalKind.Keyword.Operator)
|
||||||
AS = Terminal("as")
|
AS = Terminal("as", kind=TerminalKind.Keyword.Operator.Expression)
|
||||||
BAR = Terminal("bar")
|
BAR = Terminal("|", kind=TerminalKind.Keyword.Operator.Expression)
|
||||||
CLASS = Terminal("class")
|
CLASS = Terminal("class", kind=TerminalKind.Storage.Type.Class)
|
||||||
COLON = Terminal("colon")
|
COLON = Terminal(":", kind=TerminalKind.Punctuation.Separator)
|
||||||
ELSE = Terminal("else")
|
ELSE = Terminal("else", kind=TerminalKind.Keyword.Control.Conditional)
|
||||||
FOR = Terminal("for")
|
FOR = Terminal("for", kind=TerminalKind.Keyword.Control)
|
||||||
FUN = Terminal("fun")
|
FUN = Terminal("fun", kind=TerminalKind.Storage.Type.Function)
|
||||||
IDENTIFIER = Terminal(
|
IDENTIFIER = Terminal(
|
||||||
Re.seq(
|
Re.seq(
|
||||||
Re.set(("a", "z"), ("A", "Z"), "_"),
|
Re.set(("a", "z"), ("A", "Z"), "_"),
|
||||||
Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(),
|
Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(),
|
||||||
|
),
|
||||||
|
# kind=TerminalKind.Variable, #?
|
||||||
)
|
)
|
||||||
)
|
IF = Terminal("if", kind=TerminalKind.Keyword.Control.Conditional)
|
||||||
IF = Terminal("if")
|
IMPORT = Terminal("import", kind=TerminalKind.Keyword.Other)
|
||||||
IMPORT = Terminal("import")
|
IN = Terminal("in", kind=TerminalKind.Keyword.Operator)
|
||||||
IN = Terminal("in")
|
LCURLY = Terminal("{", kind=TerminalKind.Punctuation.CurlyBrace.Open)
|
||||||
LCURLY = Terminal("{")
|
RCURLY = Terminal("}", kind=TerminalKind.Punctuation.CurlyBrace.Close)
|
||||||
LET = Terminal("Let")
|
LET = Terminal("Let", kind=TerminalKind.Keyword.Other)
|
||||||
RCURLY = Terminal("}")
|
RETURN = Terminal("return", kind=TerminalKind.Keyword.Control)
|
||||||
RETURN = Terminal("return")
|
SEMICOLON = Terminal(";", kind=TerminalKind.Punctuation.Separator)
|
||||||
SEMICOLON = Terminal(";")
|
|
||||||
STRING = Terminal(
|
STRING = Terminal(
|
||||||
# Double-quoted string.
|
# Double-quoted string.
|
||||||
Re.seq(
|
Re.seq(
|
||||||
|
|
@@ -369,27 +364,28 @@ class FineGrammar(Grammar):
|
||||||
Re.literal("'"),
|
Re.literal("'"),
|
||||||
(~Re.set("'", "\\") | (Re.set("\\") + Re.any())).star(),
|
(~Re.set("'", "\\") | (Re.set("\\") + Re.any())).star(),
|
||||||
Re.literal("'"),
|
Re.literal("'"),
|
||||||
|
),
|
||||||
|
kind=TerminalKind.String.Quoted,
|
||||||
)
|
)
|
||||||
)
|
WHILE = Terminal("while", kind=TerminalKind.Keyword.Control)
|
||||||
WHILE = Terminal("while")
|
EQUAL = Terminal("=", kind=TerminalKind.Keyword.Operator.Expression)
|
||||||
EQUAL = Terminal("=")
|
LPAREN = Terminal("(", kind=TerminalKind.Punctuation.Parenthesis.Open)
|
||||||
LPAREN = Terminal("(")
|
RPAREN = Terminal(")", kind=TerminalKind.Punctuation.Parenthesis.Close)
|
||||||
RPAREN = Terminal(")")
|
COMMA = Terminal(",", kind=TerminalKind.Punctuation.Separator)
|
||||||
COMMA = Terminal(",")
|
SELF = Terminal("self", name="SELFF", kind=TerminalKind.Variable.Language)
|
||||||
SELF = Terminal("self", name="SELFF")
|
OR = Terminal("or", kind=TerminalKind.Keyword.Operator.Expression)
|
||||||
OR = Terminal("or")
|
IS = Terminal("is", kind=TerminalKind.Keyword.Operator.Expression)
|
||||||
IS = Terminal("is")
|
AND = Terminal("and", kind=TerminalKind.Keyword.Operator.Expression)
|
||||||
AND = Terminal("and")
|
EQUALEQUAL = Terminal("==", kind=TerminalKind.Keyword.Operator.Expression)
|
||||||
EQUALEQUAL = Terminal("==")
|
BANGEQUAL = Terminal("!=", kind=TerminalKind.Keyword.Operator.Expression)
|
||||||
BANGEQUAL = Terminal("!=")
|
LESS = Terminal("<", kind=TerminalKind.Keyword.Operator.Expression)
|
||||||
LESS = Terminal("<")
|
GREATER = Terminal(">", kind=TerminalKind.Keyword.Operator.Expression)
|
||||||
GREATER = Terminal(">")
|
LESSEQUAL = Terminal("<=", kind=TerminalKind.Keyword.Operator.Expression)
|
||||||
LESSEQUAL = Terminal("<=")
|
GREATEREQUAL = Terminal(">=", kind=TerminalKind.Keyword.Operator.Expression)
|
||||||
GREATEREQUAL = Terminal(">=")
|
PLUS = Terminal("+", kind=TerminalKind.Keyword.Operator.Expression)
|
||||||
PLUS = Terminal("+")
|
MINUS = Terminal("-", kind=TerminalKind.Keyword.Operator.Expression)
|
||||||
MINUS = Terminal("-")
|
STAR = Terminal("*", kind=TerminalKind.Keyword.Operator.Expression)
|
||||||
STAR = Terminal("*")
|
SLASH = Terminal("/", kind=TerminalKind.Keyword.Operator.Expression)
|
||||||
SLASH = Terminal("/")
|
|
||||||
NUMBER = Terminal(
|
NUMBER = Terminal(
|
||||||
Re.seq(
|
Re.seq(
|
||||||
Re.set(("0", "9")).plus(),
|
Re.set(("0", "9")).plus(),
|
||||||
|
|
@@ -402,18 +398,19 @@ class FineGrammar(Grammar):
|
||||||
Re.set("+", "-").question(),
|
Re.set("+", "-").question(),
|
||||||
Re.set(("0", "9")).plus(),
|
Re.set(("0", "9")).plus(),
|
||||||
).question(),
|
).question(),
|
||||||
|
),
|
||||||
|
kind=TerminalKind.Constant.Numeric,
|
||||||
)
|
)
|
||||||
)
|
TRUE = Terminal("true", kind=TerminalKind.Constant.Language)
|
||||||
TRUE = Terminal("true")
|
FALSE = Terminal("false", kind=TerminalKind.Constant.Language)
|
||||||
FALSE = Terminal("false")
|
BANG = Terminal("!", kind=TerminalKind.Keyword.Operator.Expression)
|
||||||
BANG = Terminal("!")
|
DOT = Terminal(".", kind=TerminalKind.Punctuation.Separator)
|
||||||
DOT = Terminal(".")
|
MATCH = Terminal("match", kind=TerminalKind.Keyword.Other)
|
||||||
MATCH = Terminal("match")
|
EXPORT = Terminal("export", kind=TerminalKind.Keyword.Other)
|
||||||
EXPORT = Terminal("export")
|
UNDERSCORE = Terminal("_", kind=TerminalKind.Variable.Language)
|
||||||
UNDERSCORE = Terminal("_")
|
NEW = Terminal("new", kind=TerminalKind.Keyword.Operator)
|
||||||
NEW = Terminal("new")
|
LSQUARE = Terminal("[", kind=TerminalKind.Punctuation.SquareBracket.Open)
|
||||||
LSQUARE = Terminal("[")
|
RSQUARE = Terminal("]", kind=TerminalKind.Punctuation.SquareBracket.Close)
|
||||||
RSQUARE = Terminal("]")
|
|
||||||
|
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
|
|
|
||||||
142
parser/parser.py
142
parser/parser.py
|
|
@@ -1609,10 +1609,14 @@ class Terminal(Rule):
|
||||||
|
|
||||||
value: str | None
|
value: str | None
|
||||||
pattern: "str | Re"
|
pattern: "str | Re"
|
||||||
|
meta: dict[str, typing.Any]
|
||||||
|
regex: bool
|
||||||
|
|
||||||
def __init__(self, pattern, *, name=None):
|
def __init__(self, pattern: "str|Re", *, name: str | None = None, **kwargs):
|
||||||
self.value = name
|
self.value = name
|
||||||
self.pattern = pattern
|
self.pattern = pattern
|
||||||
|
self.meta = kwargs
|
||||||
|
self.regex = isinstance(pattern, Re)
|
||||||
|
|
||||||
def flatten(self) -> typing.Generator[list["str | Terminal"], None, None]:
|
def flatten(self) -> typing.Generator[list["str | Terminal"], None, None]:
|
||||||
# We are just ourselves when flattened.
|
# We are just ourselves when flattened.
|
||||||
|
|
@@ -2149,15 +2153,15 @@ class EdgeList[ET]:
|
||||||
|
|
||||||
|
|
||||||
class NFAState:
|
class NFAState:
|
||||||
"""An NFA state. Each state can be the accept state, with one or more
|
"""An NFA state. A state can be an accept state if it has a Terminal
|
||||||
Terminals as the result."""
|
associated with it."""
|
||||||
|
|
||||||
accept: list[Terminal]
|
accept: Terminal | None
|
||||||
epsilons: list["NFAState"]
|
epsilons: list["NFAState"]
|
||||||
_edges: EdgeList["NFAState"]
|
_edges: EdgeList["NFAState"]
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.accept = []
|
self.accept = None
|
||||||
self.epsilons = []
|
self.epsilons = []
|
||||||
self._edges = EdgeList()
|
self._edges = EdgeList()
|
||||||
|
|
||||||
|
|
@@ -2183,7 +2187,7 @@ class NFAState:
|
||||||
continue
|
continue
|
||||||
visited.add(state)
|
visited.add(state)
|
||||||
|
|
||||||
label = ", ".join([t.value for t in state.accept if t.value is not None])
|
label = state.accept.value if state.accept is not None else ""
|
||||||
f.write(f' {id(state)} [label="{label}"];\n')
|
f.write(f' {id(state)} [label="{label}"];\n')
|
||||||
for target in state.epsilons:
|
for target in state.epsilons:
|
||||||
stack.append(target)
|
stack.append(target)
|
||||||
|
|
@@ -2460,42 +2464,42 @@ class NFASuperState:
|
||||||
|
|
||||||
def accept_terminal(self) -> Terminal | None:
|
def accept_terminal(self) -> Terminal | None:
|
||||||
accept = None
|
accept = None
|
||||||
for st in self.states:
|
|
||||||
for ac in st.accept:
|
|
||||||
if accept is None:
|
|
||||||
accept = ac
|
|
||||||
elif accept.value != ac.value:
|
|
||||||
accept_regex = isinstance(accept.pattern, Re)
|
|
||||||
ac_regex = isinstance(ac.pattern, Re)
|
|
||||||
|
|
||||||
if accept_regex and not ac_regex:
|
for st in self.states:
|
||||||
accept = ac
|
if st.accept is None:
|
||||||
elif ac_regex and not accept_regex:
|
continue
|
||||||
|
|
||||||
|
if accept is None:
|
||||||
|
accept = st.accept
|
||||||
|
elif accept.value != st.accept.value:
|
||||||
|
if accept.regex and not st.accept.regex:
|
||||||
|
accept = st.accept
|
||||||
|
elif st.accept.regex and not accept.regex:
|
||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"Lexer is ambiguous: cannot distinguish between {accept.value} ('{accept.pattern}') and {ac.value} ('{ac.pattern}')"
|
f"Lexer is ambiguous: cannot distinguish between {accept.value} ('{accept.pattern}') and {st.accept.value} ('{st.accept.pattern}')"
|
||||||
)
|
)
|
||||||
|
|
||||||
return accept
|
return accept
|
||||||
|
|
||||||
|
|
||||||
def compile_terminals(terminals: typing.Iterable[Terminal]) -> LexerTable:
|
def compile_lexer(grammar: Grammar) -> LexerTable:
|
||||||
# Parse the terminals all together into a big NFA rooted at `NFA`.
|
# Parse the terminals all together into a big NFA rooted at `NFA`.
|
||||||
NFA = NFAState()
|
NFA = NFAState()
|
||||||
for terminal in terminals:
|
for terminal in grammar.terminals:
|
||||||
pattern = terminal.pattern
|
pattern = terminal.pattern
|
||||||
if isinstance(pattern, Re):
|
if isinstance(pattern, Re):
|
||||||
start, ends = pattern.to_nfa()
|
start, ends = pattern.to_nfa()
|
||||||
for end in ends:
|
for end in ends:
|
||||||
end.accept.append(terminal)
|
end.accept = terminal
|
||||||
NFA.epsilons.append(start)
|
NFA.epsilons.append(start)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
start = end = NFAState()
|
start = end = NFAState()
|
||||||
for c in pattern:
|
for c in pattern:
|
||||||
end = end.add_edge(Span.from_str(c), NFAState())
|
end = end.add_edge(Span.from_str(c), NFAState())
|
||||||
end.accept.append(terminal)
|
end.accept = terminal
|
||||||
NFA.epsilons.append(start)
|
NFA.epsilons.append(start)
|
||||||
|
|
||||||
NFA.dump_graph()
|
NFA.dump_graph()
|
||||||
|
|
@@ -2525,12 +2529,8 @@ def compile_terminals(terminals: typing.Iterable[Terminal]) -> LexerTable:
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
def compile_lexer(grammar: Grammar) -> LexerTable:
|
def dump_lexer_table(table: LexerTable, name: str = "lexer.dot"):
|
||||||
return compile_terminals(grammar.terminals)
|
with open(name, "w", encoding="utf-8") as f:
|
||||||
|
|
||||||
|
|
||||||
def dump_lexer_table(table: LexerTable):
|
|
||||||
with open("lexer.dot", "w", encoding="utf-8") as f:
|
|
||||||
f.write("digraph G {\n")
|
f.write("digraph G {\n")
|
||||||
for index, (accept, edges) in enumerate(table):
|
for index, (accept, edges) in enumerate(table):
|
||||||
label = accept.value if accept is not None else ""
|
label = accept.value if accept is not None else ""
|
||||||
|
|
@@ -2541,3 +2541,91 @@ def dump_lexer_table(table: LexerTable):
|
||||||
|
|
||||||
pass
|
pass
|
||||||
f.write("}\n")
|
f.write("}\n")
|
||||||
|
|
||||||
|
|
||||||
|
# NOTE: We have rich metadata system man, wow, how cool are we?
|
||||||
|
#
|
||||||
|
# The whole point of this stuff here is to allow automatic
|
||||||
|
# generation/maintenance of syntax coloring for editors. And maybe some
|
||||||
|
# other stuff? This is *extremely provisional*, I'm not even sure it
|
||||||
|
# makes sense yet. Tree sitter works differently, for example, and it's
|
||||||
|
# not clear at all what we want to generate for any particular editor.
|
||||||
|
#
|
||||||
|
# This here might be enough to produce extremely basic TextMate
|
||||||
|
# grammars but anything more complicated will want tree patterns
|
||||||
|
# anyway, and we can only do tree patterns by influencing the grammar.
|
||||||
|
class TerminalMeta:
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class TerminalKind(TerminalMeta):
|
||||||
|
class Comment(TerminalMeta):
|
||||||
|
class Block(TerminalMeta):
|
||||||
|
pass
|
||||||
|
|
||||||
|
class Line(TerminalMeta):
|
||||||
|
pass
|
||||||
|
|
||||||
|
class Constant(TerminalMeta):
|
||||||
|
class Language(TerminalMeta):
|
||||||
|
pass
|
||||||
|
|
||||||
|
class Numeric(TerminalMeta):
|
||||||
|
pass
|
||||||
|
|
||||||
|
class Keyword(TerminalMeta):
|
||||||
|
class Control(TerminalMeta):
|
||||||
|
class Conditional(TerminalMeta):
|
||||||
|
pass
|
||||||
|
|
||||||
|
class Operator(TerminalMeta):
|
||||||
|
class Expression(TerminalMeta):
|
||||||
|
pass
|
||||||
|
|
||||||
|
class Other(TerminalMeta):
|
||||||
|
pass
|
||||||
|
|
||||||
|
class Punctuation(TerminalMeta):
|
||||||
|
class Separator(TerminalMeta):
|
||||||
|
pass
|
||||||
|
|
||||||
|
class Parenthesis(TerminalMeta):
|
||||||
|
class Open(TerminalMeta):
|
||||||
|
pass
|
||||||
|
|
||||||
|
class Close(TerminalMeta):
|
||||||
|
pass
|
||||||
|
|
||||||
|
class CurlyBrace(TerminalMeta):
|
||||||
|
class Open(TerminalMeta):
|
||||||
|
pass
|
||||||
|
|
||||||
|
class Close(TerminalMeta):
|
||||||
|
pass
|
||||||
|
|
||||||
|
class SquareBracket(TerminalMeta):
|
||||||
|
class Open(TerminalMeta):
|
||||||
|
pass
|
||||||
|
|
||||||
|
class Close(TerminalMeta):
|
||||||
|
pass
|
||||||
|
|
||||||
|
class Storage(TerminalMeta):
|
||||||
|
class Type(TerminalMeta):
|
||||||
|
class Class(TerminalMeta):
|
||||||
|
pass
|
||||||
|
|
||||||
|
class Function(TerminalMeta):
|
||||||
|
pass
|
||||||
|
|
||||||
|
class String(TerminalMeta):
|
||||||
|
class Quoted(TerminalMeta):
|
||||||
|
class Single(TerminalMeta):
|
||||||
|
pass
|
||||||
|
|
||||||
|
class Double(TerminalMeta):
|
||||||
|
pass
|
||||||
|
|
||||||
|
class Variable(TerminalMeta):
|
||||||
|
class Language(TerminalMeta):
|
||||||
|
pass
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue