diff --git a/README.md b/README.md index c5a8020..b449b8e 100644 --- a/README.md +++ b/README.md @@ -23,20 +23,19 @@ To get started, create a grammar that derives from the `Grammar` class. Create one method per nonterminal, decorated with the `rule` decorator. Here's an example: + PLUS = Token('+') + LPAREN = Token('(') + RPAREN = Token(')') + ID = Token('id') class SimpleGrammar(Grammar): @rule def expression(self): - return seq(self.expression, self.PLUS, self.term) | self.term + return seq(self.expression, PLUS, self.term) | self.term @rule def term(self): - return seq(self.LPAREN, self.expression, self.RPAREN) | self.ID - - PLUS = Terminal('+') - LPAREN = Terminal('(') - RPAREN = Terminal(')') - ID = Terminal('id') + return seq(LPAREN, self.expression, RPAREN) | ID ## Using grammars diff --git a/grammar.py b/grammar.py index 0912700..aba6259 100644 --- a/grammar.py +++ b/grammar.py @@ -2,7 +2,16 @@ import re import typing -from parser import Assoc, Grammar, Nothing, rule, seq, Rule, Terminal, Re, TerminalKind +from parser import ( + Assoc, + Grammar, + Nothing, + rule, + seq, + Rule, + Terminal, + Re, +) class FineGrammar(Grammar): @@ -324,34 +333,30 @@ class FineGrammar(Grammar): return self.IDENTIFIER | seq(self.IDENTIFIER, self.COLON, self.expression) BLANKS = Terminal(Re.set(" ", "\t", "\r", "\n").plus()) - COMMENT = Terminal( - Re.seq(Re.literal("//"), Re.set("\n").invert().star()), - kind=TerminalKind.Comment.Line, - ) + COMMENT = Terminal(Re.seq(Re.literal("//"), Re.set("\n").invert().star())) - ARROW = Terminal("->", kind=TerminalKind.Keyword.Operator) - AS = Terminal("as", kind=TerminalKind.Keyword.Operator.Expression) - BAR = Terminal("|", kind=TerminalKind.Keyword.Operator.Expression) - CLASS = Terminal("class", kind=TerminalKind.Storage.Type.Class) - COLON = Terminal(":", kind=TerminalKind.Punctuation.Separator) - ELSE = Terminal("else", kind=TerminalKind.Keyword.Control.Conditional) - FOR = Terminal("for", kind=TerminalKind.Keyword.Control) - FUN = Terminal("fun", kind=TerminalKind.Storage.Type.Function) + ARROW = Terminal("->") + AS = Terminal("as") + BAR = Terminal("bar") + CLASS = Terminal("class") + COLON = Terminal("colon") + ELSE = Terminal("else") + FOR = Terminal("for") + FUN = Terminal("fun") IDENTIFIER = Terminal( Re.seq( Re.set(("a", "z"), ("A", "Z"), "_"), Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(), - ), - # kind=TerminalKind.Variable, #? + ) ) - IF = Terminal("if", kind=TerminalKind.Keyword.Control.Conditional) - IMPORT = Terminal("import", kind=TerminalKind.Keyword.Other) - IN = Terminal("in", kind=TerminalKind.Keyword.Operator) - LCURLY = Terminal("{", kind=TerminalKind.Punctuation.CurlyBrace.Open) - RCURLY = Terminal("}", kind=TerminalKind.Punctuation.CurlyBrace.Close) - LET = Terminal("Let", kind=TerminalKind.Keyword.Other) - RETURN = Terminal("return", kind=TerminalKind.Keyword.Control) - SEMICOLON = Terminal(";", kind=TerminalKind.Punctuation.Separator) + IF = Terminal("if") + IMPORT = Terminal("import") + IN = Terminal("in") + LCURLY = Terminal("{") + LET = Terminal("Let") + RCURLY = Terminal("}") + RETURN = Terminal("return") + SEMICOLON = Terminal(";") STRING = Terminal( # Double-quoted string. Re.seq( @@ -364,28 +369,27 @@ class FineGrammar(Grammar): Re.literal("'"), (~Re.set("'", "\\") | (Re.set("\\") + Re.any())).star(), Re.literal("'"), - ), - kind=TerminalKind.String.Quoted, + ) ) - WHILE = Terminal("while", kind=TerminalKind.Keyword.Control) - EQUAL = Terminal("=", kind=TerminalKind.Keyword.Operator.Expression) - LPAREN = Terminal("(", kind=TerminalKind.Punctuation.Parenthesis.Open) - RPAREN = Terminal(")", kind=TerminalKind.Punctuation.Parenthesis.Close) - COMMA = Terminal(",", kind=TerminalKind.Punctuation.Separator) - SELF = Terminal("self", name="SELFF", kind=TerminalKind.Variable.Language) - OR = Terminal("or", kind=TerminalKind.Keyword.Operator.Expression) - IS = Terminal("is", kind=TerminalKind.Keyword.Operator.Expression) - AND = Terminal("and", kind=TerminalKind.Keyword.Operator.Expression) - EQUALEQUAL = Terminal("==", kind=TerminalKind.Keyword.Operator.Expression) - BANGEQUAL = Terminal("!=", kind=TerminalKind.Keyword.Operator.Expression) - LESS = Terminal("<", kind=TerminalKind.Keyword.Operator.Expression) - GREATER = Terminal(">", kind=TerminalKind.Keyword.Operator.Expression) - LESSEQUAL = Terminal("<=", kind=TerminalKind.Keyword.Operator.Expression) - GREATEREQUAL = Terminal(">=", kind=TerminalKind.Keyword.Operator.Expression) - PLUS = Terminal("+", kind=TerminalKind.Keyword.Operator.Expression) - MINUS = Terminal("-", kind=TerminalKind.Keyword.Operator.Expression) - STAR = Terminal("*", kind=TerminalKind.Keyword.Operator.Expression) - SLASH = Terminal("/", kind=TerminalKind.Keyword.Operator.Expression) + WHILE = Terminal("while") + EQUAL = Terminal("=") + LPAREN = Terminal("(") + RPAREN = Terminal(")") + COMMA = Terminal(",") + SELF = Terminal("self", name="SELFF") + OR = Terminal("or") + IS = Terminal("is") + AND = Terminal("and") + EQUALEQUAL = Terminal("==") + BANGEQUAL = Terminal("!=") + LESS = Terminal("<") + GREATER = Terminal(">") + LESSEQUAL = Terminal("<=") + GREATEREQUAL = Terminal(">=") + PLUS = Terminal("+") + MINUS = Terminal("-") + STAR = Terminal("*") + SLASH = Terminal("/") NUMBER = Terminal( Re.seq( Re.set(("0", "9")).plus(), @@ -398,19 +402,18 @@ class FineGrammar(Grammar): Re.set("+", "-").question(), Re.set(("0", "9")).plus(), ).question(), - ), - kind=TerminalKind.Constant.Numeric, + ) ) - TRUE = Terminal("true", kind=TerminalKind.Constant.Language) - FALSE = Terminal("false", kind=TerminalKind.Constant.Language) - BANG = Terminal("!", kind=TerminalKind.Keyword.Operator.Expression) - DOT = Terminal(".", kind=TerminalKind.Punctuation.Separator) - MATCH = Terminal("match", kind=TerminalKind.Keyword.Other) - EXPORT = Terminal("export", kind=TerminalKind.Keyword.Other) - UNDERSCORE = Terminal("_", kind=TerminalKind.Variable.Language) - NEW = Terminal("new", kind=TerminalKind.Keyword.Operator) - LSQUARE = Terminal("[", kind=TerminalKind.Punctuation.SquareBracket.Open) - RSQUARE = Terminal("]", kind=TerminalKind.Punctuation.SquareBracket.Close) + TRUE = Terminal("true") + FALSE = Terminal("false") + BANG = Terminal("!") + DOT = Terminal(".") + MATCH = Terminal("match") + EXPORT = Terminal("export") + UNDERSCORE = Terminal("_") + NEW = Terminal("new") + LSQUARE = Terminal("[") + RSQUARE = Terminal("]") # ----------------------------------------------------------------------------- diff --git a/parser/parser.py b/parser/parser.py index 98ac13b..51cbe69 100644 --- a/parser/parser.py +++ b/parser/parser.py @@ -1609,14 +1609,10 @@ class Terminal(Rule): value: str | None pattern: "str | Re" - meta: dict[str, typing.Any] - regex: bool - def __init__(self, pattern: "str|Re", *, name: str | None = None, **kwargs): + def __init__(self, pattern, *, name=None): self.value = name self.pattern = pattern - self.meta = kwargs - self.regex = isinstance(pattern, Re) def flatten(self) -> typing.Generator[list["str | Terminal"], None, None]: # We are just ourselves when flattened. @@ -2153,15 +2149,15 @@ class EdgeList[ET]: class NFAState: - """An NFA state. A state can be an accept state if it has a Terminal - associated with it.""" + """An NFA state. Each state can be the accept state, with one or more + Terminals as the result.""" - accept: Terminal | None + accept: list[Terminal] epsilons: list["NFAState"] _edges: EdgeList["NFAState"] def __init__(self): - self.accept = None + self.accept = [] self.epsilons = [] self._edges = EdgeList() @@ -2187,7 +2183,7 @@ class NFAState: continue visited.add(state) - label = state.accept.value if state.accept is not None else "" + label = ", ".join([t.value for t in state.accept if t.value is not None]) f.write(f' {id(state)} [label="{label}"];\n') for target in state.epsilons: stack.append(target) @@ -2464,42 +2460,42 @@ class NFASuperState: def accept_terminal(self) -> Terminal | None: accept = None - for st in self.states: - if st.accept is None: - continue + for ac in st.accept: + if accept is None: + accept = ac + elif accept.value != ac.value: + accept_regex = isinstance(accept.pattern, Re) + ac_regex = isinstance(ac.pattern, Re) - if accept is None: - accept = st.accept - elif accept.value != st.accept.value: - if accept.regex and not st.accept.regex: - accept = st.accept - elif st.accept.regex and not accept.regex: - pass - else: - raise ValueError( - f"Lexer is ambiguous: cannot distinguish between {accept.value} ('{accept.pattern}') and {st.accept.value} ('{st.accept.pattern}')" - ) + if accept_regex and not ac_regex: + accept = ac + elif ac_regex and not accept_regex: + pass + else: + raise ValueError( + f"Lexer is ambiguous: cannot distinguish between {accept.value} ('{accept.pattern}') and {ac.value} ('{ac.pattern}')" + ) return accept -def compile_lexer(grammar: Grammar) -> LexerTable: +def compile_terminals(terminals: typing.Iterable[Terminal]) -> LexerTable: # Parse the terminals all together into a big NFA rooted at `NFA`. NFA = NFAState() - for terminal in grammar.terminals: + for terminal in terminals: pattern = terminal.pattern if isinstance(pattern, Re): start, ends = pattern.to_nfa() for end in ends: - end.accept = terminal + end.accept.append(terminal) NFA.epsilons.append(start) else: start = end = NFAState() for c in pattern: end = end.add_edge(Span.from_str(c), NFAState()) - end.accept = terminal + end.accept.append(terminal) NFA.epsilons.append(start) NFA.dump_graph() @@ -2529,8 +2525,12 @@ def compile_lexer(grammar: Grammar) -> LexerTable: ] -def dump_lexer_table(table: LexerTable, name: str = "lexer.dot"): - with open(name, "w", encoding="utf-8") as f: +def compile_lexer(grammar: Grammar) -> LexerTable: + return compile_terminals(grammar.terminals) + + +def dump_lexer_table(table: LexerTable): + with open("lexer.dot", "w", encoding="utf-8") as f: f.write("digraph G {\n") for index, (accept, edges) in enumerate(table): label = accept.value if accept is not None else "" @@ -2541,91 +2541,3 @@ def dump_lexer_table(table: LexerTable, name: str = "lexer.dot"): pass f.write("}\n") - - -# NOTE: We have rich metadata system man, wow, how cool are we? -# -# The whole point of this stuff here is to allow automatic -# generation/maintenance of syntax coloring for editors. And maybe some -# other stuff? This is *extremely provisional*, I'm not even sure it -# makes sense yet. Tree sitter works differently, for example, and it's -# not clear at all what we want to generate for any particular editor. -# -# This here might be enough to produce extremely basic TextMate -# grammars but anything more complicated will want tree patterns -# anyway, and we can only do tree patterns by influencing the grammar. -class TerminalMeta: - pass - - -class TerminalKind(TerminalMeta): - class Comment(TerminalMeta): - class Block(TerminalMeta): - pass - - class Line(TerminalMeta): - pass - - class Constant(TerminalMeta): - class Language(TerminalMeta): - pass - - class Numeric(TerminalMeta): - pass - - class Keyword(TerminalMeta): - class Control(TerminalMeta): - class Conditional(TerminalMeta): - pass - - class Operator(TerminalMeta): - class Expression(TerminalMeta): - pass - - class Other(TerminalMeta): - pass - - class Punctuation(TerminalMeta): - class Separator(TerminalMeta): - pass - - class Parenthesis(TerminalMeta): - class Open(TerminalMeta): - pass - - class Close(TerminalMeta): - pass - - class CurlyBrace(TerminalMeta): - class Open(TerminalMeta): - pass - - class Close(TerminalMeta): - pass - - class SquareBracket(TerminalMeta): - class Open(TerminalMeta): - pass - - class Close(TerminalMeta): - pass - - class Storage(TerminalMeta): - class Type(TerminalMeta): - class Class(TerminalMeta): - pass - - class Function(TerminalMeta): - pass - - class String(TerminalMeta): - class Quoted(TerminalMeta): - class Single(TerminalMeta): - pass - - class Double(TerminalMeta): - pass - - class Variable(TerminalMeta): - class Language(TerminalMeta): - pass