# E.g. "this is how machine-generated parsers know to skip blanks and comments". The runtime implementation could be better: we don't really want to just discard trivia, because it is useful for things like doc comments. But for now this is fine.
# 606 lines · 18 KiB · Python
# This is an example grammar.
import re
import typing

from parser import (
    Assoc,
    Grammar,
    Nothing,
    rule,
    seq,
    Rule,
    Terminal,
    Re,
)


class FineGrammar(Grammar):
    """Example grammar for a small expression-oriented language.

    Productions are methods decorated with ``@rule``; terminals are the
    upper-case ``Terminal`` class attributes at the bottom of the class.
    Terminal patterns are the literal source spelling of each token (or an
    ``Re`` expression for BLANKS, COMMENT, IDENTIFIER, STRING and NUMBER).
    """

    # generator = parser.GenerateLR1
    start = "File"

    # Names of the terminals that are trivia (whitespace and comments).
    trivia = ["BLANKS", "COMMENT"]

    def __init__(self):
        # NOTE(review): the entries are presumably ordered from loosest to
        # tightest binding -- confirm against parser.Grammar.
        super().__init__(
            precedence=[
                (Assoc.RIGHT, [self.EQUAL]),
                (Assoc.LEFT, [self.OR]),
                (Assoc.LEFT, [self.IS]),
                (Assoc.LEFT, [self.AND]),
                (Assoc.LEFT, [self.EQUALEQUAL, self.BANGEQUAL]),
                (Assoc.LEFT, [self.LESS, self.GREATER, self.GREATEREQUAL, self.LESSEQUAL]),
                (Assoc.LEFT, [self.PLUS, self.MINUS]),
                (Assoc.LEFT, [self.STAR, self.SLASH]),
                (Assoc.LEFT, [self.primary_expression]),
                (Assoc.LEFT, [self.LPAREN]),
                (Assoc.LEFT, [self.DOT]),
                #
                # If there's a confusion about whether to make an IF
                # statement or an expression, prefer the statement.
                #
                (Assoc.NONE, [self.if_statement]),
            ],
        )

    @rule("File")
    def file(self) -> Rule:
        return self._file_statement_list

    # One or more file-level statements (left-recursive list).
    @rule
    def _file_statement_list(self) -> Rule:
        return self._file_statement | (self._file_statement_list + self._file_statement)

    @rule
    def _file_statement(self) -> Rule:
        return (
            self.import_statement | self.class_declaration | self.export_statement | self._statement
        )

    @rule
    def import_statement(self) -> Rule:
        return seq(self.IMPORT, self.STRING, self.AS, self.IDENTIFIER, self.SEMICOLON)

    @rule("ClassDeclaration")
    def class_declaration(self) -> Rule:
        return seq(self.CLASS, self.IDENTIFIER, self._class_body)

    @rule
    def _class_body(self) -> Rule:
        return seq(self.LCURLY, self.RCURLY) | seq(self.LCURLY, self._class_members, self.RCURLY)

    @rule
    def _class_members(self) -> Rule:
        return self._class_member | seq(self._class_members, self._class_member)

    @rule
    def _class_member(self) -> Rule:
        return self.field_declaration | self.function_declaration

    @rule("FieldDecl")
    def field_declaration(self) -> Rule:
        return seq(self.IDENTIFIER, self.COLON, self.type_expression, self.SEMICOLON)

    # Types
    @rule("TypeExpression")
    def type_expression(self) -> Rule:
        return self.alternate_type | self.type_identifier

    # Alternatives are joined with the OR terminal (the word "or").
    @rule("AlternateType")
    def alternate_type(self) -> Rule:
        return seq(self.type_expression, self.OR, self.type_identifier)

    @rule("TypeIdentifier")
    def type_identifier(self) -> Rule:
        return self.IDENTIFIER

    @rule
    def export_statement(self) -> Rule:
        return (
            seq(self.EXPORT, self.class_declaration)
            | seq(self.EXPORT, self.function_declaration)
            | seq(self.EXPORT, self.let_statement)
            | seq(self.EXPORT, self.export_list, self.SEMICOLON)
        )

    # Possibly-empty, comma-separated identifiers; a trailing comma is allowed.
    @rule
    def export_list(self) -> Rule:
        return Nothing | self.IDENTIFIER | seq(self.IDENTIFIER, self.COMMA, self.export_list)

    # Functions
    @rule("FunctionDecl")
    def function_declaration(self) -> Rule:
        # Two forms: without and with an `-> type` return annotation.
        return seq(self.FUN, self.IDENTIFIER, self.function_parameters, self.block) | seq(
            self.FUN,
            self.IDENTIFIER,
            self.function_parameters,
            self.ARROW,
            self.type_expression,
            self.block,
        )

    @rule("ParamList")
    def function_parameters(self) -> Rule:
        return (
            seq(self.LPAREN, self.RPAREN)
            | seq(self.LPAREN, self._first_parameter, self.RPAREN)
            | seq(self.LPAREN, self._first_parameter, self.COMMA, self._parameter_list, self.RPAREN)
        )

    # Only the first parameter may be the bare `self` keyword.
    @rule
    def _first_parameter(self) -> Rule:
        return self.SELF | self.parameter

    @rule
    def _parameter_list(self) -> Rule:
        return Nothing | self.parameter | seq(self.parameter, self.COMMA, self._parameter_list)

    @rule("Parameter")
    def parameter(self) -> Rule:
        return seq(self.IDENTIFIER, self.COLON, self.type_expression)

    # Block
    @rule("Block")
    def block(self) -> Rule:
        # Statements optionally followed by a final expression with no
        # terminating semicolon.
        return (
            seq(self.LCURLY, self.RCURLY)
            | seq(self.LCURLY, self.expression, self.RCURLY)
            | seq(self.LCURLY, self._statement_list, self.RCURLY)
            | seq(self.LCURLY, self._statement_list, self.expression, self.RCURLY)
        )

    @rule
    def _statement_list(self) -> Rule:
        return self._statement | seq(self._statement_list, self._statement)

    @rule
    def _statement(self) -> Rule:
        return (
            self.function_declaration
            | self.let_statement
            | self.return_statement
            | self.for_statement
            | self.if_statement
            | self.while_statement
            | self.expression_statement
        )

    @rule("LetStatement")
    def let_statement(self) -> Rule:
        return seq(self.LET, self.IDENTIFIER, self.EQUAL, self.expression, self.SEMICOLON)

    @rule("ReturnStatement")
    def return_statement(self) -> Rule:
        return seq(self.RETURN, self.expression, self.SEMICOLON) | seq(self.RETURN, self.SEMICOLON)

    @rule("ForStatement")
    def for_statement(self) -> Rule:
        return seq(self.FOR, self.iterator_variable, self.IN, self.expression, self.block)

    @rule("IteratorVariable")
    def iterator_variable(self) -> Rule:
        return self.IDENTIFIER

    # An `if` in statement position is just a conditional expression; the
    # precedence table in __init__ names this rule to prefer the statement.
    @rule("IfStatement")
    def if_statement(self) -> Rule:
        return self.conditional_expression

    @rule
    def while_statement(self) -> Rule:
        return seq(self.WHILE, self.expression, self.block)

    @rule
    def expression_statement(self) -> Rule:
        return seq(self.expression, self.SEMICOLON)

    # Expressions
    @rule(transparent=True)
    def expression(self) -> Rule:
        return self.binary_expression | self.is_expression | self.primary_expression

    @rule("BinaryExpression")
    def binary_expression(self) -> Rule:
        return (
            seq(self.expression, self.EQUAL, self.expression)
            | seq(self.expression, self.OR, self.expression)
            | seq(self.expression, self.AND, self.expression)
            | seq(self.expression, self.EQUALEQUAL, self.expression)
            | seq(self.expression, self.BANGEQUAL, self.expression)
            | seq(self.expression, self.LESS, self.expression)
            | seq(self.expression, self.LESSEQUAL, self.expression)
            | seq(self.expression, self.GREATER, self.expression)
            | seq(self.expression, self.GREATEREQUAL, self.expression)
            | seq(self.expression, self.PLUS, self.expression)
            | seq(self.expression, self.MINUS, self.expression)
            | seq(self.expression, self.STAR, self.expression)
            | seq(self.expression, self.SLASH, self.expression)
        )

    @rule("IsExpression")
    def is_expression(self) -> Rule:
        return seq(self.expression, self.IS, self.pattern)

    @rule
    def primary_expression(self) -> Rule:
        return (
            self.identifier_expression
            | self.literal_expression
            | self.SELF
            | seq(self.BANG, self.primary_expression)
            | seq(self.MINUS, self.primary_expression)
            | self.block
            | self.conditional_expression
            | self.list_constructor_expression
            | self.object_constructor_expression
            | self.match_expression
            | seq(self.primary_expression, self.LPAREN, self.RPAREN)
            | seq(self.primary_expression, self.LPAREN, self._expression_list, self.RPAREN)
            | seq(self.primary_expression, self.DOT, self.IDENTIFIER)
            | seq(self.LPAREN, self.expression, self.RPAREN)
        )

    @rule("IdentifierExpression")
    def identifier_expression(self) -> Rule:
        return self.IDENTIFIER

    @rule("Literal")
    def literal_expression(self) -> Rule:
        return self.NUMBER | self.STRING | self.TRUE | self.FALSE

    @rule("ConditionalExpression")
    def conditional_expression(self) -> Rule:
        # `if`, `if ... else if ...` chains, and `if ... else { ... }`.
        return (
            seq(self.IF, self.expression, self.block)
            | seq(self.IF, self.expression, self.block, self.ELSE, self.conditional_expression)
            | seq(self.IF, self.expression, self.block, self.ELSE, self.block)
        )

    @rule
    def list_constructor_expression(self) -> Rule:
        return seq(self.LSQUARE, self.RSQUARE) | seq(
            self.LSQUARE, self._expression_list, self.RSQUARE
        )

    # One or more comma-separated expressions; a trailing comma is allowed.
    @rule
    def _expression_list(self) -> Rule:
        return (
            self.expression
            | seq(self.expression, self.COMMA)
            | seq(self.expression, self.COMMA, self._expression_list)
        )

    @rule
    def match_expression(self) -> Rule:
        return seq(self.MATCH, self.expression, self.match_body)

    @rule("MatchBody")
    def match_body(self) -> Rule:
        return seq(self.LCURLY, self.RCURLY) | seq(self.LCURLY, self._match_arms, self.RCURLY)

    # One or more comma-separated arms; a trailing comma is allowed.
    @rule
    def _match_arms(self) -> Rule:
        return (
            self.match_arm
            | seq(self.match_arm, self.COMMA)
            | seq(self.match_arm, self.COMMA, self._match_arms)
        )

    @rule("MatchArm")
    def match_arm(self) -> Rule:
        return seq(self.pattern, self.ARROW, self.expression)

    @rule("Pattern")
    def pattern(self) -> Rule:
        # Optional `name:` binding, a core pattern, and an optional
        # `and <expression>` predicate (only allowed after a binding).
        return (
            seq(self.variable_binding, self._pattern_core, self._pattern_predicate)
            | seq(self.variable_binding, self._pattern_core)
            | self._pattern_core
        )

    @rule
    def _pattern_predicate(self) -> Rule:
        return seq(self.AND, self.expression)

    @rule
    def _pattern_core(self) -> Rule:
        return self.type_expression | self.wildcard_pattern

    @rule("WildcardPattern")
    def wildcard_pattern(self) -> Rule:
        return self.UNDERSCORE

    @rule("VariableBinding")
    def variable_binding(self) -> Rule:
        return seq(self.IDENTIFIER, self.COLON)

    @rule
    def object_constructor_expression(self) -> Rule:
        return seq(self.NEW, self.type_identifier, self.field_list)

    @rule
    def field_list(self) -> Rule:
        return seq(self.LCURLY, self.RCURLY) | seq(self.LCURLY, self.field_values, self.RCURLY)

    # One or more comma-separated field values; a trailing comma is allowed.
    @rule
    def field_values(self) -> Rule:
        return (
            self.field_value
            | seq(self.field_value, self.COMMA)
            | seq(self.field_value, self.COMMA, self.field_values)
        )

    # Either the shorthand `name` or the explicit `name: expression`.
    @rule
    def field_value(self) -> Rule:
        return self.IDENTIFIER | seq(self.IDENTIFIER, self.COLON, self.expression)

    # Trivia terminals (see `trivia` above).
    BLANKS = Terminal(Re.set(" ", "\t", "\r", "\n").plus())
    COMMENT = Terminal(Re.seq(Re.literal("//"), Re.set("\n").invert().star()))

    ARROW = Terminal("->")
    AS = Terminal("as")
    # Spelled as it appears in source text; the hand-written tokenizer in
    # this file emits BAR for "|".
    BAR = Terminal("|")
    CLASS = Terminal("class")
    # Spelled as it appears in source text; the hand-written tokenizer in
    # this file emits COLON for ":".
    COLON = Terminal(":")
    ELSE = Terminal("else")
    FOR = Terminal("for")
    FUN = Terminal("fun")
    IDENTIFIER = Terminal(
        Re.seq(
            Re.set(("a", "z"), ("A", "Z"), "_"),
            Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(),
        )
    )
    IF = Terminal("if")
    IMPORT = Terminal("import")
    IN = Terminal("in")
    LCURLY = Terminal("{")
    # Lower case, matching the "let" entry of the keyword table used by the
    # hand-written tokenizer.
    LET = Terminal("let")
    RCURLY = Terminal("}")
    RETURN = Terminal("return")
    SEMICOLON = Terminal(";")
    STRING = Terminal(
        # Double-quoted string.
        Re.seq(
            Re.literal('"'),
            (~Re.set('"', "\\") | (Re.set("\\") + Re.any())).star(),
            Re.literal('"'),
        )
        # Single-quoted string.
        | Re.seq(
            Re.literal("'"),
            (~Re.set("'", "\\") | (Re.set("\\") + Re.any())).star(),
            Re.literal("'"),
        )
    )
    WHILE = Terminal("while")
    EQUAL = Terminal("=")
    LPAREN = Terminal("(")
    RPAREN = Terminal(")")
    COMMA = Terminal(",")
    # NOTE(review): the explicit name "SELFF" looks like either a typo or a
    # deliberate override of the default terminal name -- confirm before
    # changing it.
    SELF = Terminal("self", name="SELFF")
    OR = Terminal("or")
    IS = Terminal("is")
    AND = Terminal("and")
    EQUALEQUAL = Terminal("==")
    BANGEQUAL = Terminal("!=")
    LESS = Terminal("<")
    GREATER = Terminal(">")
    LESSEQUAL = Terminal("<=")
    GREATEREQUAL = Terminal(">=")
    PLUS = Terminal("+")
    MINUS = Terminal("-")
    STAR = Terminal("*")
    SLASH = Terminal("/")
    # digits, optional ".digits" fraction, optional exponent.
    NUMBER = Terminal(
        Re.seq(
            Re.set(("0", "9")).plus(),
            Re.seq(
                Re.literal("."),
                Re.set(("0", "9")).plus(),
            ).question(),
            Re.seq(
                Re.set("e", "E"),
                Re.set("+", "-").question(),
                Re.set(("0", "9")).plus(),
            ).question(),
        )
    )
    TRUE = Terminal("true")
    FALSE = Terminal("false")
    BANG = Terminal("!")
    DOT = Terminal(".")
    MATCH = Terminal("match")
    EXPORT = Terminal("export")
    UNDERSCORE = Terminal("_")
    NEW = Terminal("new")
    LSQUARE = Terminal("[")
    RSQUARE = Terminal("]")


# -----------------------------------------------------------------------------
# DORKY LEXER
# -----------------------------------------------------------------------------
import bisect


# Hand-written counterparts of the NUMBER and IDENTIFIER terminals of the
# grammar. NUMBER is: digits, an optional ".digits" fraction, and an optional
# exponent. The fraction requires at least one digit, so "1.foo" lexes as
# NUMBER DOT IDENTIFIER, and the exponent does not require a fraction, so
# "1e5" is a single number.
NUMBER_RE = re.compile(r"[0-9]+(\.[0-9]+)?([eE][-+]?[0-9]+)?")
IDENTIFIER_RE = re.compile(r"[_A-Za-z][_A-Za-z0-9]*")
# Maps the spelling of each reserved word to its terminal. The tokenizer
# consults this after matching IDENTIFIER_RE, so any identifier-shaped text
# not listed here is an ordinary IDENTIFIER. A lone "_" is identifier-shaped
# and is therefore listed here as the UNDERSCORE (wildcard) terminal.
KEYWORD_TABLE = {
    "_": FineGrammar.UNDERSCORE,
    "and": FineGrammar.AND,
    "as": FineGrammar.AS,
    "class": FineGrammar.CLASS,
    "else": FineGrammar.ELSE,
    "export": FineGrammar.EXPORT,
    "false": FineGrammar.FALSE,
    "for": FineGrammar.FOR,
    "fun": FineGrammar.FUN,
    "if": FineGrammar.IF,
    "import": FineGrammar.IMPORT,
    "in": FineGrammar.IN,
    "is": FineGrammar.IS,
    "let": FineGrammar.LET,
    "match": FineGrammar.MATCH,
    "new": FineGrammar.NEW,
    "or": FineGrammar.OR,
    "return": FineGrammar.RETURN,
    "self": FineGrammar.SELF,
    "true": FineGrammar.TRUE,
    "while": FineGrammar.WHILE,
}


def tokenize(src: str):
    """Lazily split ``src`` into tokens.

    Yields ``(terminal, start, length)`` tuples, where ``terminal`` is one of
    the ``FineGrammar`` terminals, ``start`` is the offset of the token in
    ``src`` and ``length`` is its length in characters. Whitespace and
    ``//`` line comments are skipped and produce no tokens.

    Raises:
        Exception: on an unterminated string constant, or on a character
            that does not start any known token.
    """
    pos = 0
    while pos < len(src):
        ch = src[pos]
        if ch.isspace():
            pos += 1
            continue

        token = None
        if ch == "-":
            if src[pos : pos + 2] == "->":
                token = (FineGrammar.ARROW, pos, 2)
            else:
                token = (FineGrammar.MINUS, pos, 1)

        elif ch == "|":
            token = (FineGrammar.BAR, pos, 1)

        elif ch == ":":
            token = (FineGrammar.COLON, pos, 1)

        elif ch == "{":
            token = (FineGrammar.LCURLY, pos, 1)

        elif ch == "}":
            token = (FineGrammar.RCURLY, pos, 1)

        elif ch == ";":
            token = (FineGrammar.SEMICOLON, pos, 1)

        elif ch == "=":
            if src[pos : pos + 2] == "==":
                token = (FineGrammar.EQUALEQUAL, pos, 2)
            else:
                token = (FineGrammar.EQUAL, pos, 1)

        elif ch == "(":
            token = (FineGrammar.LPAREN, pos, 1)

        elif ch == ")":
            token = (FineGrammar.RPAREN, pos, 1)

        elif ch == ",":
            token = (FineGrammar.COMMA, pos, 1)

        elif ch == "!":
            if src[pos : pos + 2] == "!=":
                token = (FineGrammar.BANGEQUAL, pos, 2)
            else:
                token = (FineGrammar.BANG, pos, 1)

        elif ch == "<":
            if src[pos : pos + 2] == "<=":
                token = (FineGrammar.LESSEQUAL, pos, 2)
            else:
                token = (FineGrammar.LESS, pos, 1)

        elif ch == ">":
            if src[pos : pos + 2] == ">=":
                token = (FineGrammar.GREATEREQUAL, pos, 2)
            else:
                token = (FineGrammar.GREATER, pos, 1)

        elif ch == "+":
            token = (FineGrammar.PLUS, pos, 1)

        elif ch == "*":
            token = (FineGrammar.STAR, pos, 1)

        elif ch == "/":
            if src[pos : pos + 2] == "//":
                # Line comment: skip up to (not including) the newline; the
                # whitespace branch above consumes the newline itself.
                while pos < len(src) and src[pos] != "\n":
                    pos = pos + 1
                continue

            token = (FineGrammar.SLASH, pos, 1)

        elif ch == ".":
            token = (FineGrammar.DOT, pos, 1)

        elif ch == "[":
            token = (FineGrammar.LSQUARE, pos, 1)

        elif ch == "]":
            token = (FineGrammar.RSQUARE, pos, 1)

        elif ch == '"' or ch == "'":
            # Scan to the matching quote; a backslash escapes the next
            # character, so skip both.
            end = pos + 1
            while end < len(src) and src[end] != ch:
                if src[end] == "\\":
                    end += 1
                end += 1
            # `>=`, not `==`: a backslash as the very last character moves
            # `end` one past len(src), which must still count as
            # unterminated.
            if end >= len(src):
                raise Exception(f"Unterminated string constant at {pos}")
            end += 1  # include the closing quote
            token = (FineGrammar.STRING, pos, end - pos)

        else:
            number_match = NUMBER_RE.match(src, pos)
            if number_match:
                token = (FineGrammar.NUMBER, pos, number_match.end() - pos)
            else:
                id_match = IDENTIFIER_RE.match(src, pos)
                if id_match:
                    fragment = src[pos : id_match.end()]
                    # Reserved words share the identifier shape; anything
                    # not in the table is a plain identifier.
                    kind = KEYWORD_TABLE.get(fragment, FineGrammar.IDENTIFIER)
                    token = (kind, pos, len(fragment))

        if token is None:
            raise Exception("Token error")
        yield token
        pos += token[2]


class FineTokens:
    """The fully tokenized form of a source string, plus line information.

    Attributes:
        src: The original source text.
    """

    def __init__(self, src: str):
        """Tokenize ``src`` eagerly and record the offset of every newline."""
        self.src = src
        # (terminal, start offset, length) for every token, in source order.
        self._tokens: list[typing.Tuple[Terminal, int, int]] = list(tokenize(src))
        # Offsets of each "\n" in src, ascending; used to map offsets to lines.
        self._lines = [m.start() for m in re.finditer("\n", src)]

    def tokens(self):
        """Return the list of ``(terminal, start, length)`` tuples."""
        return self._tokens

    def lines(self):
        """Return the offsets of the newline characters in the source."""
        return self._lines

    def dump(self, *, start=None, end=None):
        """Print one line per token in the token-index range ``[start, end)``.

        Each line shows the token's source offset, the terminal's ``value``,
        the token text, and its zero-based ``(line, column)`` position.
        ``start`` and ``end`` index into the token list and default to the
        whole list.
        """
        # Slicing already treats None as "from the beginning" / "to the end".
        for kind, offset, length in self._tokens[start:end]:
            # The number of newlines before the token is its zero-based line.
            line_index = bisect.bisect_left(self._lines, offset)
            if line_index == 0:
                line_start = 0
            else:
                # The line begins just after the preceding newline.
                line_start = self._lines[line_index - 1] + 1
            column_index = offset - line_start
            value = self.src[offset : offset + length]
            print(f"{offset:04} {kind.value:12} {value} ({line_index}, {column_index})")


if __name__ == "__main__":
    # Script entry point: build the grammar's table, compile a lexer from
    # the grammar, and dump the resulting lexer table.
    from parser.parser import compile_lexer, dump_lexer_table

    grammar = FineGrammar()
    grammar.build_table()

    lexer = compile_lexer(grammar)
    dump_lexer_table(lexer)