From 208491d56e72801b72081cbc57352f489cc87b59 Mon Sep 17 00:00:00 2001 From: John Doty Date: Mon, 26 Aug 2024 08:05:01 -0700 Subject: [PATCH 1/3] This was out of date --- README.md | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index b449b8e..c5a8020 100644 --- a/README.md +++ b/README.md @@ -23,19 +23,20 @@ To get started, create a grammar that derives from the `Grammar` class. Create one method per nonterminal, decorated with the `rule` decorator. Here's an example: - PLUS = Token('+') - LPAREN = Token('(') - RPAREN = Token(')') - ID = Token('id') class SimpleGrammar(Grammar): @rule def expression(self): - return seq(self.expression, PLUS, self.term) | self.term + return seq(self.expression, self.PLUS, self.term) | self.term @rule def term(self): - return seq(LPAREN, self.expression, RPAREN) | ID + return seq(self.LPAREN, self.expression, self.RPAREN) | self.ID + + PLUS = Terminal('+') + LPAREN = Terminal('(') + RPAREN = Terminal(')') + ID = Terminal('id') ## Using grammars From 76ef85483e2f9fdbd504fa620d68a7ba6f10f23f Mon Sep 17 00:00:00 2001 From: John Doty Date: Tue, 27 Aug 2024 15:40:37 -0700 Subject: [PATCH 2/3] Accept is single-valued, the multi-value thing didn't ever make sense I mean, it did when we thought we were going to weave NFA states as we were building them but we ended up not doing that and instead just using the fancy EdgeList splitting magic when building DFAs from the NFA. It has the same power and is simpler code, and also means that we'll *never* be asked to have multiple Terminals be accepted from a single NFA state. --- parser/parser.py | 56 ++++++++++++++++++++++-------------------------- 1 file changed, 26 insertions(+), 30 deletions(-) diff --git a/parser/parser.py b/parser/parser.py index 51cbe69..312dd2d 100644 --- a/parser/parser.py +++ b/parser/parser.py @@ -2149,15 +2149,15 @@ class EdgeList[ET]: class NFAState: - """An NFA state. 
Each state can be the accept state, with one or more - Terminals as the result.""" + """An NFA state. A state can be an accept state if it has a Terminal + associated with it.""" - accept: list[Terminal] + accept: Terminal | None epsilons: list["NFAState"] _edges: EdgeList["NFAState"] def __init__(self): - self.accept = [] + self.accept = None self.epsilons = [] self._edges = EdgeList() @@ -2183,7 +2183,7 @@ class NFAState: continue visited.add(state) - label = ", ".join([t.value for t in state.accept if t.value is not None]) + label = state.accept.value if state.accept is not None else "" f.write(f' {id(state)} [label="{label}"];\n') for target in state.epsilons: stack.append(target) @@ -2460,42 +2460,42 @@ class NFASuperState: def accept_terminal(self) -> Terminal | None: accept = None - for st in self.states: - for ac in st.accept: - if accept is None: - accept = ac - elif accept.value != ac.value: - accept_regex = isinstance(accept.pattern, Re) - ac_regex = isinstance(ac.pattern, Re) - if accept_regex and not ac_regex: - accept = ac - elif ac_regex and not accept_regex: - pass - else: - raise ValueError( - f"Lexer is ambiguous: cannot distinguish between {accept.value} ('{accept.pattern}') and {ac.value} ('{ac.pattern}')" - ) + for st in self.states: + if st.accept is None: + continue + + if accept is None: + accept = st.accept + elif accept.value != st.accept.value: + if accept.regex and not st.accept.regex: + accept = st.accept + elif st.accept.regex and not accept.regex: + pass + else: + raise ValueError( + f"Lexer is ambiguous: cannot distinguish between {accept.value} ('{accept.pattern}') and {st.accept.value} ('{st.accept.pattern}')" + ) return accept -def compile_terminals(terminals: typing.Iterable[Terminal]) -> LexerTable: +def compile_lexer(grammar: Grammar) -> LexerTable: # Parse the terminals all together into a big NFA rooted at `NFA`. 
NFA = NFAState() - for terminal in terminals: + for terminal in grammar.terminals: pattern = terminal.pattern if isinstance(pattern, Re): start, ends = pattern.to_nfa() for end in ends: - end.accept.append(terminal) + end.accept = terminal NFA.epsilons.append(start) else: start = end = NFAState() for c in pattern: end = end.add_edge(Span.from_str(c), NFAState()) - end.accept.append(terminal) + end.accept = terminal NFA.epsilons.append(start) NFA.dump_graph() @@ -2525,12 +2525,8 @@ def compile_terminals(terminals: typing.Iterable[Terminal]) -> LexerTable: ] -def compile_lexer(grammar: Grammar) -> LexerTable: - return compile_terminals(grammar.terminals) - - -def dump_lexer_table(table: LexerTable): - with open("lexer.dot", "w", encoding="utf-8") as f: +def dump_lexer_table(table: LexerTable, name: str = "lexer.dot"): + with open(name, "w", encoding="utf-8") as f: f.write("digraph G {\n") for index, (accept, edges) in enumerate(table): label = accept.value if accept is not None else "" From 49ad7fdb523bceda06870fa2f09825d6ffdba321 Mon Sep 17 00:00:00 2001 From: John Doty Date: Tue, 27 Aug 2024 15:43:07 -0700 Subject: [PATCH 3/3] Associate metadata with terminals This is a half-assed attempt at doing syntax coloring which I think will almost certainly turn out to be insufficient. I'm committing it just to record some of the work I've done but. BUT. Probably trying to match tree-sitter is a better way of doing this. (But, like, emitting tree-sitter grammars? Really? Wow, dude. Way to give up.) 
--- grammar.py | 115 +++++++++++++++++++++++------------------------ parser/parser.py | 94 +++++++++++++++++++++++++++++++++++++- 2 files changed, 149 insertions(+), 60 deletions(-) diff --git a/grammar.py b/grammar.py index aba6259..0912700 100644 --- a/grammar.py +++ b/grammar.py @@ -2,16 +2,7 @@ import re import typing -from parser import ( - Assoc, - Grammar, - Nothing, - rule, - seq, - Rule, - Terminal, - Re, -) +from parser import Assoc, Grammar, Nothing, rule, seq, Rule, Terminal, Re, TerminalKind class FineGrammar(Grammar): @@ -333,30 +324,34 @@ class FineGrammar(Grammar): return self.IDENTIFIER | seq(self.IDENTIFIER, self.COLON, self.expression) BLANKS = Terminal(Re.set(" ", "\t", "\r", "\n").plus()) - COMMENT = Terminal(Re.seq(Re.literal("//"), Re.set("\n").invert().star())) + COMMENT = Terminal( + Re.seq(Re.literal("//"), Re.set("\n").invert().star()), + kind=TerminalKind.Comment.Line, + ) - ARROW = Terminal("->") - AS = Terminal("as") - BAR = Terminal("bar") - CLASS = Terminal("class") - COLON = Terminal("colon") - ELSE = Terminal("else") - FOR = Terminal("for") - FUN = Terminal("fun") + ARROW = Terminal("->", kind=TerminalKind.Keyword.Operator) + AS = Terminal("as", kind=TerminalKind.Keyword.Operator.Expression) + BAR = Terminal("|", kind=TerminalKind.Keyword.Operator.Expression) + CLASS = Terminal("class", kind=TerminalKind.Storage.Type.Class) + COLON = Terminal(":", kind=TerminalKind.Punctuation.Separator) + ELSE = Terminal("else", kind=TerminalKind.Keyword.Control.Conditional) + FOR = Terminal("for", kind=TerminalKind.Keyword.Control) + FUN = Terminal("fun", kind=TerminalKind.Storage.Type.Function) IDENTIFIER = Terminal( Re.seq( Re.set(("a", "z"), ("A", "Z"), "_"), Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(), - ) + ), + # kind=TerminalKind.Variable, #? 
) - IF = Terminal("if") - IMPORT = Terminal("import") - IN = Terminal("in") - LCURLY = Terminal("{") - LET = Terminal("Let") - RCURLY = Terminal("}") - RETURN = Terminal("return") - SEMICOLON = Terminal(";") + IF = Terminal("if", kind=TerminalKind.Keyword.Control.Conditional) + IMPORT = Terminal("import", kind=TerminalKind.Keyword.Other) + IN = Terminal("in", kind=TerminalKind.Keyword.Operator) + LCURLY = Terminal("{", kind=TerminalKind.Punctuation.CurlyBrace.Open) + RCURLY = Terminal("}", kind=TerminalKind.Punctuation.CurlyBrace.Close) + LET = Terminal("Let", kind=TerminalKind.Keyword.Other) + RETURN = Terminal("return", kind=TerminalKind.Keyword.Control) + SEMICOLON = Terminal(";", kind=TerminalKind.Punctuation.Separator) STRING = Terminal( # Double-quoted string. Re.seq( @@ -369,27 +364,28 @@ class FineGrammar(Grammar): Re.literal("'"), (~Re.set("'", "\\") | (Re.set("\\") + Re.any())).star(), Re.literal("'"), - ) + ), + kind=TerminalKind.String.Quoted, ) - WHILE = Terminal("while") - EQUAL = Terminal("=") - LPAREN = Terminal("(") - RPAREN = Terminal(")") - COMMA = Terminal(",") - SELF = Terminal("self", name="SELFF") - OR = Terminal("or") - IS = Terminal("is") - AND = Terminal("and") - EQUALEQUAL = Terminal("==") - BANGEQUAL = Terminal("!=") - LESS = Terminal("<") - GREATER = Terminal(">") - LESSEQUAL = Terminal("<=") - GREATEREQUAL = Terminal(">=") - PLUS = Terminal("+") - MINUS = Terminal("-") - STAR = Terminal("*") - SLASH = Terminal("/") + WHILE = Terminal("while", kind=TerminalKind.Keyword.Control) + EQUAL = Terminal("=", kind=TerminalKind.Keyword.Operator.Expression) + LPAREN = Terminal("(", kind=TerminalKind.Punctuation.Parenthesis.Open) + RPAREN = Terminal(")", kind=TerminalKind.Punctuation.Parenthesis.Close) + COMMA = Terminal(",", kind=TerminalKind.Punctuation.Separator) + SELF = Terminal("self", name="SELFF", kind=TerminalKind.Variable.Language) + OR = Terminal("or", kind=TerminalKind.Keyword.Operator.Expression) + IS = Terminal("is", 
kind=TerminalKind.Keyword.Operator.Expression) + AND = Terminal("and", kind=TerminalKind.Keyword.Operator.Expression) + EQUALEQUAL = Terminal("==", kind=TerminalKind.Keyword.Operator.Expression) + BANGEQUAL = Terminal("!=", kind=TerminalKind.Keyword.Operator.Expression) + LESS = Terminal("<", kind=TerminalKind.Keyword.Operator.Expression) + GREATER = Terminal(">", kind=TerminalKind.Keyword.Operator.Expression) + LESSEQUAL = Terminal("<=", kind=TerminalKind.Keyword.Operator.Expression) + GREATEREQUAL = Terminal(">=", kind=TerminalKind.Keyword.Operator.Expression) + PLUS = Terminal("+", kind=TerminalKind.Keyword.Operator.Expression) + MINUS = Terminal("-", kind=TerminalKind.Keyword.Operator.Expression) + STAR = Terminal("*", kind=TerminalKind.Keyword.Operator.Expression) + SLASH = Terminal("/", kind=TerminalKind.Keyword.Operator.Expression) NUMBER = Terminal( Re.seq( Re.set(("0", "9")).plus(), @@ -402,18 +398,19 @@ class FineGrammar(Grammar): Re.set("+", "-").question(), Re.set(("0", "9")).plus(), ).question(), - ) + ), + kind=TerminalKind.Constant.Numeric, ) - TRUE = Terminal("true") - FALSE = Terminal("false") - BANG = Terminal("!") - DOT = Terminal(".") - MATCH = Terminal("match") - EXPORT = Terminal("export") - UNDERSCORE = Terminal("_") - NEW = Terminal("new") - LSQUARE = Terminal("[") - RSQUARE = Terminal("]") + TRUE = Terminal("true", kind=TerminalKind.Constant.Language) + FALSE = Terminal("false", kind=TerminalKind.Constant.Language) + BANG = Terminal("!", kind=TerminalKind.Keyword.Operator.Expression) + DOT = Terminal(".", kind=TerminalKind.Punctuation.Separator) + MATCH = Terminal("match", kind=TerminalKind.Keyword.Other) + EXPORT = Terminal("export", kind=TerminalKind.Keyword.Other) + UNDERSCORE = Terminal("_", kind=TerminalKind.Variable.Language) + NEW = Terminal("new", kind=TerminalKind.Keyword.Operator) + LSQUARE = Terminal("[", kind=TerminalKind.Punctuation.SquareBracket.Open) + RSQUARE = Terminal("]", kind=TerminalKind.Punctuation.SquareBracket.Close) 
# ----------------------------------------------------------------------------- diff --git a/parser/parser.py b/parser/parser.py index 312dd2d..98ac13b 100644 --- a/parser/parser.py +++ b/parser/parser.py @@ -1609,10 +1609,14 @@ class Terminal(Rule): value: str | None pattern: "str | Re" + meta: dict[str, typing.Any] + regex: bool - def __init__(self, pattern, *, name=None): + def __init__(self, pattern: "str|Re", *, name: str | None = None, **kwargs): self.value = name self.pattern = pattern + self.meta = kwargs + self.regex = isinstance(pattern, Re) def flatten(self) -> typing.Generator[list["str | Terminal"], None, None]: # We are just ourselves when flattened. @@ -2537,3 +2541,91 @@ def dump_lexer_table(table: LexerTable, name: str = "lexer.dot"): pass f.write("}\n") + + +# NOTE: We have a rich metadata system, man, wow, how cool are we? +# +# The whole point of this stuff here is to allow automatic +# generation/maintenance of syntax coloring for editors. And maybe some +# other stuff? This is *extremely provisional*, I'm not even sure it +# makes sense yet. Tree-sitter works differently, for example, and it's +# not clear at all what we want to generate for any particular editor. +# +# This here might be enough to produce extremely basic TextMate +# grammars but anything more complicated will want tree patterns +# anyway, and we can only do tree patterns by influencing the grammar.
+class TerminalMeta: + pass + + +class TerminalKind(TerminalMeta): + class Comment(TerminalMeta): + class Block(TerminalMeta): + pass + + class Line(TerminalMeta): + pass + + class Constant(TerminalMeta): + class Language(TerminalMeta): + pass + + class Numeric(TerminalMeta): + pass + + class Keyword(TerminalMeta): + class Control(TerminalMeta): + class Conditional(TerminalMeta): + pass + + class Operator(TerminalMeta): + class Expression(TerminalMeta): + pass + + class Other(TerminalMeta): + pass + + class Punctuation(TerminalMeta): + class Separator(TerminalMeta): + pass + + class Parenthesis(TerminalMeta): + class Open(TerminalMeta): + pass + + class Close(TerminalMeta): + pass + + class CurlyBrace(TerminalMeta): + class Open(TerminalMeta): + pass + + class Close(TerminalMeta): + pass + + class SquareBracket(TerminalMeta): + class Open(TerminalMeta): + pass + + class Close(TerminalMeta): + pass + + class Storage(TerminalMeta): + class Type(TerminalMeta): + class Class(TerminalMeta): + pass + + class Function(TerminalMeta): + pass + + class String(TerminalMeta): + class Quoted(TerminalMeta): + class Single(TerminalMeta): + pass + + class Double(TerminalMeta): + pass + + class Variable(TerminalMeta): + class Language(TerminalMeta): + pass