diff --git a/grammar.py b/grammar.py index 90d88ef..d674111 100644 --- a/grammar.py +++ b/grammar.py @@ -413,186 +413,6 @@ class FineGrammar(Grammar): RSQUARE = Terminal("]", kind=TerminalKind.Punctuation.SquareBracket.Close) -# ----------------------------------------------------------------------------- -# DORKY LEXER -# ----------------------------------------------------------------------------- -import bisect - - -NUMBER_RE = re.compile("[0-9]+(\\.[0-9]*([eE][-+]?[0-9]+)?)?") -IDENTIFIER_RE = re.compile("[_A-Za-z][_A-Za-z0-9]*") -KEYWORD_TABLE = { - "_": FineGrammar.UNDERSCORE, - "and": FineGrammar.AND, - "as": FineGrammar.AS, - "class": FineGrammar.CLASS, - "else": FineGrammar.ELSE, - "export": FineGrammar.EXPORT, - "false": FineGrammar.FALSE, - "for": FineGrammar.FOR, - "fun": FineGrammar.FUN, - "if": FineGrammar.IF, - "import": FineGrammar.IMPORT, - "in": FineGrammar.IN, - "is": FineGrammar.IS, - "let": FineGrammar.LET, - "match": FineGrammar.MATCH, - "new": FineGrammar.NEW, - "or": FineGrammar.OR, - "return": FineGrammar.RETURN, - "self": FineGrammar.SELF, - "true": FineGrammar.TRUE, - "while": FineGrammar.WHILE, -} - - -def tokenize(src: str): - pos = 0 - while pos < len(src): - ch = src[pos] - if ch.isspace(): - pos += 1 - continue - - token = None - if ch == "-": - if src[pos : pos + 2] == "->": - token = (FineGrammar.ARROW, pos, 2) - else: - token = (FineGrammar.MINUS, pos, 1) - - elif ch == "|": - token = (FineGrammar.BAR, pos, 1) - - elif ch == ":": - token = (FineGrammar.COLON, pos, 1) - - elif ch == "{": - token = (FineGrammar.LCURLY, pos, 1) - - elif ch == "}": - token = (FineGrammar.RCURLY, pos, 1) - - elif ch == ";": - token = (FineGrammar.SEMICOLON, pos, 1) - - elif ch == "=": - if src[pos : pos + 2] == "==": - token = (FineGrammar.EQUALEQUAL, pos, 2) - else: - token = (FineGrammar.EQUAL, pos, 1) - - elif ch == "(": - token = (FineGrammar.LPAREN, pos, 1) - - elif ch == ")": - token = (FineGrammar.RPAREN, pos, 1) - - elif ch == ",": - token = (FineGrammar.COMMA, pos, 1) - - elif ch == "!": - if src[pos : pos + 2] == "!=": - token = (FineGrammar.BANGEQUAL, pos, 2) - else: - token = (FineGrammar.BANG, pos, 1) - - elif ch == "<": - if src[pos : pos + 2] == "<=": - token = (FineGrammar.LESSEQUAL, pos, 2) - else: - token = (FineGrammar.LESS, pos, 1) - - elif ch == ">": - if src[pos : pos + 2] == ">=": - token = (FineGrammar.GREATEREQUAL, pos, 2) - else: - token = (FineGrammar.GREATER, pos, 1) - - elif ch == "+": - token = (FineGrammar.PLUS, pos, 1) - - elif ch == "*": - token = (FineGrammar.STAR, pos, 1) - - elif ch == "/": - if src[pos : pos + 2] == "//": - while pos < len(src) and src[pos] != "\n": - pos = pos + 1 - continue - - token = (FineGrammar.SLASH, pos, 1) - - elif ch == ".": - token = (FineGrammar.DOT, pos, 1) - - elif ch == "[": - token = (FineGrammar.LSQUARE, pos, 1) - - elif ch == "]": - token = (FineGrammar.RSQUARE, pos, 1) - - elif ch == '"' or ch == "'": - end = pos + 1 - while end < len(src) and src[end] != ch: - if src[end] == "\\": - end += 1 - end += 1 - if end == len(src): - raise Exception(f"Unterminated string constant at {pos}") - end += 1 - token = (FineGrammar.STRING, pos, end - pos) - - else: - number_match = NUMBER_RE.match(src, pos) - if number_match: - token = (FineGrammar.NUMBER, pos, number_match.end() - pos) - else: - id_match = IDENTIFIER_RE.match(src, pos) - if id_match: - fragment = src[pos : id_match.end()] - keyword = KEYWORD_TABLE.get(fragment) - if keyword: - token = (keyword, pos, len(fragment)) - else: - token = (FineGrammar.IDENTIFIER, pos, len(fragment)) - - if token is None: - raise Exception("Token error") - yield token - pos += token[2] - - -class FineTokens: - def __init__(self, src: str): - self.src = src - self._tokens: list[typing.Tuple[Terminal, int, int]] = list(tokenize(src)) - self._lines = [m.start() for m in re.finditer("\n", src)] - - def tokens(self): - return self._tokens - - def lines(self): - return self._lines - - def dump(self, *, start=None, end=None): - if start is None: - start = 0 - if end is None: - end = len(self._tokens) - - for token in self._tokens[start:end]: - (kind, start, length) = token - line_index = bisect.bisect_left(self._lines, start) - if line_index == 0: - col_start = 0 - else: - col_start = self._lines[line_index - 1] + 1 - column_index = start - col_start - value = self.src[start : start + length] - print(f"{start:04} {kind.value:12} {value} ({line_index}, {column_index})") - - if __name__ == "__main__": from parser.parser import compile_lexer, dump_lexer_table