diff --git a/README.md b/README.md index b449b8e..c5a8020 100644 --- a/README.md +++ b/README.md @@ -23,19 +23,20 @@ To get started, create a grammar that derives from the `Grammar` class. Create one method per nonterminal, decorated with the `rule` decorator. Here's an example: - PLUS = Token('+') - LPAREN = Token('(') - RPAREN = Token(')') - ID = Token('id') class SimpleGrammar(Grammar): @rule def expression(self): - return seq(self.expression, PLUS, self.term) | self.term + return seq(self.expression, self.PLUS, self.term) | self.term @rule def term(self): - return seq(LPAREN, self.expression, RPAREN) | ID + return seq(self.LPAREN, self.expression, self.RPAREN) | self.ID + + PLUS = Terminal('+') + LPAREN = Terminal('(') + RPAREN = Terminal(')') + ID = Terminal('id') ## Using grammars diff --git a/grammar.py b/grammar.py index aba6259..0912700 100644 --- a/grammar.py +++ b/grammar.py @@ -2,16 +2,7 @@ import re import typing -from parser import ( - Assoc, - Grammar, - Nothing, - rule, - seq, - Rule, - Terminal, - Re, -) +from parser import Assoc, Grammar, Nothing, rule, seq, Rule, Terminal, Re, TerminalKind class FineGrammar(Grammar): @@ -333,30 +324,34 @@ class FineGrammar(Grammar): return self.IDENTIFIER | seq(self.IDENTIFIER, self.COLON, self.expression) BLANKS = Terminal(Re.set(" ", "\t", "\r", "\n").plus()) - COMMENT = Terminal(Re.seq(Re.literal("//"), Re.set("\n").invert().star())) + COMMENT = Terminal( + Re.seq(Re.literal("//"), Re.set("\n").invert().star()), + kind=TerminalKind.Comment.Line, + ) - ARROW = Terminal("->") - AS = Terminal("as") - BAR = Terminal("bar") - CLASS = Terminal("class") - COLON = Terminal("colon") - ELSE = Terminal("else") - FOR = Terminal("for") - FUN = Terminal("fun") + ARROW = Terminal("->", kind=TerminalKind.Keyword.Operator) + AS = Terminal("as", kind=TerminalKind.Keyword.Operator.Expression) + BAR = Terminal("|", kind=TerminalKind.Keyword.Operator.Expression) + CLASS = Terminal("class", kind=TerminalKind.Storage.Type.Class) + COLON = Terminal(":", kind=TerminalKind.Punctuation.Separator) + ELSE = Terminal("else", kind=TerminalKind.Keyword.Control.Conditional) + FOR = Terminal("for", kind=TerminalKind.Keyword.Control) + FUN = Terminal("fun", kind=TerminalKind.Storage.Type.Function) IDENTIFIER = Terminal( Re.seq( Re.set(("a", "z"), ("A", "Z"), "_"), Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(), - ) + ), + # kind=TerminalKind.Variable, #? ) - IF = Terminal("if") - IMPORT = Terminal("import") - IN = Terminal("in") - LCURLY = Terminal("{") - LET = Terminal("Let") - RCURLY = Terminal("}") - RETURN = Terminal("return") - SEMICOLON = Terminal(";") + IF = Terminal("if", kind=TerminalKind.Keyword.Control.Conditional) + IMPORT = Terminal("import", kind=TerminalKind.Keyword.Other) + IN = Terminal("in", kind=TerminalKind.Keyword.Operator) + LCURLY = Terminal("{", kind=TerminalKind.Punctuation.CurlyBrace.Open) + RCURLY = Terminal("}", kind=TerminalKind.Punctuation.CurlyBrace.Close) + LET = Terminal("Let", kind=TerminalKind.Keyword.Other) + RETURN = Terminal("return", kind=TerminalKind.Keyword.Control) + SEMICOLON = Terminal(";", kind=TerminalKind.Punctuation.Separator) STRING = Terminal( # Double-quoted string. Re.seq( @@ -369,27 +364,28 @@ class FineGrammar(Grammar): Re.literal("'"), (~Re.set("'", "\\") | (Re.set("\\") + Re.any())).star(), Re.literal("'"), - ) + ), + kind=TerminalKind.String.Quoted, ) - WHILE = Terminal("while") - EQUAL = Terminal("=") - LPAREN = Terminal("(") - RPAREN = Terminal(")") - COMMA = Terminal(",") - SELF = Terminal("self", name="SELFF") - OR = Terminal("or") - IS = Terminal("is") - AND = Terminal("and") - EQUALEQUAL = Terminal("==") - BANGEQUAL = Terminal("!=") - LESS = Terminal("<") - GREATER = Terminal(">") - LESSEQUAL = Terminal("<=") - GREATEREQUAL = Terminal(">=") - PLUS = Terminal("+") - MINUS = Terminal("-") - STAR = Terminal("*") - SLASH = Terminal("/") + WHILE = Terminal("while", kind=TerminalKind.Keyword.Control) + EQUAL = Terminal("=", kind=TerminalKind.Keyword.Operator.Expression) + LPAREN = Terminal("(", kind=TerminalKind.Punctuation.Parenthesis.Open) + RPAREN = Terminal(")", kind=TerminalKind.Punctuation.Parenthesis.Close) + COMMA = Terminal(",", kind=TerminalKind.Punctuation.Separator) + SELF = Terminal("self", name="SELFF", kind=TerminalKind.Variable.Language) + OR = Terminal("or", kind=TerminalKind.Keyword.Operator.Expression) + IS = Terminal("is", kind=TerminalKind.Keyword.Operator.Expression) + AND = Terminal("and", kind=TerminalKind.Keyword.Operator.Expression) + EQUALEQUAL = Terminal("==", kind=TerminalKind.Keyword.Operator.Expression) + BANGEQUAL = Terminal("!=", kind=TerminalKind.Keyword.Operator.Expression) + LESS = Terminal("<", kind=TerminalKind.Keyword.Operator.Expression) + GREATER = Terminal(">", kind=TerminalKind.Keyword.Operator.Expression) + LESSEQUAL = Terminal("<=", kind=TerminalKind.Keyword.Operator.Expression) + GREATEREQUAL = Terminal(">=", kind=TerminalKind.Keyword.Operator.Expression) + PLUS = Terminal("+", kind=TerminalKind.Keyword.Operator.Expression) + MINUS = Terminal("-", kind=TerminalKind.Keyword.Operator.Expression) + STAR = Terminal("*", kind=TerminalKind.Keyword.Operator.Expression) + SLASH = Terminal("/", kind=TerminalKind.Keyword.Operator.Expression) NUMBER = Terminal( Re.seq( Re.set(("0", "9")).plus(), @@ -402,18 +398,19 @@ class FineGrammar(Grammar): Re.set("+", "-").question(), Re.set(("0", "9")).plus(), ).question(), - ) + ), + kind=TerminalKind.Constant.Numeric, ) - TRUE = Terminal("true") - FALSE = Terminal("false") - BANG = Terminal("!") - DOT = Terminal(".") - MATCH = Terminal("match") - EXPORT = Terminal("export") - UNDERSCORE = Terminal("_") - NEW = Terminal("new") - LSQUARE = Terminal("[") - RSQUARE = Terminal("]") + TRUE = Terminal("true", kind=TerminalKind.Constant.Language) + FALSE = Terminal("false", kind=TerminalKind.Constant.Language) + BANG = Terminal("!", kind=TerminalKind.Keyword.Operator.Expression) + DOT = Terminal(".", kind=TerminalKind.Punctuation.Separator) + MATCH = Terminal("match", kind=TerminalKind.Keyword.Other) + EXPORT = Terminal("export", kind=TerminalKind.Keyword.Other) + UNDERSCORE = Terminal("_", kind=TerminalKind.Variable.Language) + NEW = Terminal("new", kind=TerminalKind.Keyword.Operator) + LSQUARE = Terminal("[", kind=TerminalKind.Punctuation.SquareBracket.Open) + RSQUARE = Terminal("]", kind=TerminalKind.Punctuation.SquareBracket.Close) # ----------------------------------------------------------------------------- diff --git a/parser/parser.py b/parser/parser.py index 51cbe69..98ac13b 100644 --- a/parser/parser.py +++ b/parser/parser.py @@ -1609,10 +1609,14 @@ class Terminal(Rule): value: str | None pattern: "str | Re" + meta: dict[str, typing.Any] + regex: bool - def __init__(self, pattern, *, name=None): + def __init__(self, pattern: "str|Re", *, name: str | None = None, **kwargs): self.value = name self.pattern = pattern + self.meta = kwargs + self.regex = isinstance(pattern, Re) def flatten(self) -> typing.Generator[list["str | Terminal"], None, None]: # We are just ourselves when flattened. @@ -2149,15 +2153,15 @@ class EdgeList[ET]: class NFAState: - """An NFA state. Each state can be the accept state, with one or more - Terminals as the result.""" + """An NFA state. A state can be an accept state if it has a Terminal + associated with it.""" - accept: list[Terminal] + accept: Terminal | None epsilons: list["NFAState"] _edges: EdgeList["NFAState"] def __init__(self): - self.accept = [] + self.accept = None self.epsilons = [] self._edges = EdgeList() @@ -2183,7 +2187,7 @@ class NFAState: continue visited.add(state) - label = ", ".join([t.value for t in state.accept if t.value is not None]) + label = state.accept.value if state.accept is not None else "" f.write(f' {id(state)} [label="{label}"];\n') for target in state.epsilons: stack.append(target) @@ -2460,42 +2464,42 @@ class NFASuperState: def accept_terminal(self) -> Terminal | None: accept = None - for st in self.states: - for ac in st.accept: - if accept is None: - accept = ac - elif accept.value != ac.value: - accept_regex = isinstance(accept.pattern, Re) - ac_regex = isinstance(ac.pattern, Re) - if accept_regex and not ac_regex: - accept = ac - elif ac_regex and not accept_regex: - pass - else: - raise ValueError( - f"Lexer is ambiguous: cannot distinguish between {accept.value} ('{accept.pattern}') and {ac.value} ('{ac.pattern}')" - ) + for st in self.states: + if st.accept is None: + continue + + if accept is None: + accept = st.accept + elif accept.value != st.accept.value: + if accept.regex and not st.accept.regex: + accept = st.accept + elif st.accept.regex and not accept.regex: + pass + else: + raise ValueError( + f"Lexer is ambiguous: cannot distinguish between {accept.value} ('{accept.pattern}') and {st.accept.value} ('{st.accept.pattern}')" + ) return accept -def compile_terminals(terminals: typing.Iterable[Terminal]) -> LexerTable: +def compile_lexer(grammar: Grammar) -> LexerTable: # Parse the terminals all together into a big NFA rooted at `NFA`. NFA = NFAState() - for terminal in terminals: + for terminal in grammar.terminals: pattern = terminal.pattern if isinstance(pattern, Re): start, ends = pattern.to_nfa() for end in ends: - end.accept.append(terminal) + end.accept = terminal NFA.epsilons.append(start) else: start = end = NFAState() for c in pattern: end = end.add_edge(Span.from_str(c), NFAState()) - end.accept.append(terminal) + end.accept = terminal NFA.epsilons.append(start) NFA.dump_graph() @@ -2525,12 +2529,8 @@ def compile_terminals(terminals: typing.Iterable[Terminal]) -> LexerTable: ] -def compile_lexer(grammar: Grammar) -> LexerTable: - return compile_terminals(grammar.terminals) - - -def dump_lexer_table(table: LexerTable): - with open("lexer.dot", "w", encoding="utf-8") as f: +def dump_lexer_table(table: LexerTable, name: str = "lexer.dot"): + with open(name, "w", encoding="utf-8") as f: f.write("digraph G {\n") for index, (accept, edges) in enumerate(table): label = accept.value if accept is not None else "" @@ -2541,3 +2541,91 @@ def dump_lexer_table(table: LexerTable): pass f.write("}\n") + + +# NOTE: We have rich metadata system man, wow, how cool are we? +# +# The whole point of this stuff here is to allow automatic +# generation/maintenance of syntax coloring for editors. And maybe some +# other stuff? This is *extremely provisional*, I'm not even sure it +# makes sense yet. Tree sitter works differently, for example, and it's +# not clear at all what we want to generate for any particular editor. +# +# This here might be enough to produce extremely basic TextMate +# grammars but anything more complicated will want tree patterns +# anyway, and we can only do tree patterns by influencing the grammar. +class TerminalMeta: + pass + + +class TerminalKind(TerminalMeta): + class Comment(TerminalMeta): + class Block(TerminalMeta): + pass + + class Line(TerminalMeta): + pass + + class Constant(TerminalMeta): + class Language(TerminalMeta): + pass + + class Numeric(TerminalMeta): + pass + + class Keyword(TerminalMeta): + class Control(TerminalMeta): + class Conditional(TerminalMeta): + pass + + class Operator(TerminalMeta): + class Expression(TerminalMeta): + pass + + class Other(TerminalMeta): + pass + + class Punctuation(TerminalMeta): + class Separator(TerminalMeta): + pass + + class Parenthesis(TerminalMeta): + class Open(TerminalMeta): + pass + + class Close(TerminalMeta): + pass + + class CurlyBrace(TerminalMeta): + class Open(TerminalMeta): + pass + + class Close(TerminalMeta): + pass + + class SquareBracket(TerminalMeta): + class Open(TerminalMeta): + pass + + class Close(TerminalMeta): + pass + + class Storage(TerminalMeta): + class Type(TerminalMeta): + class Class(TerminalMeta): + pass + + class Function(TerminalMeta): + pass + + class String(TerminalMeta): + class Quoted(TerminalMeta): + class Single(TerminalMeta): + pass + + class Double(TerminalMeta): + pass + + class Variable(TerminalMeta): + class Language(TerminalMeta): + pass