Remove the old hand-lexer

The machine lexer is working now
This commit is contained in:
John Doty 2024-08-27 16:49:03 -07:00
parent d62076f3c4
commit cd62b65789

View file

@ -413,186 +413,6 @@ class FineGrammar(Grammar):
RSQUARE = Terminal("]", kind=TerminalKind.Punctuation.SquareBracket.Close)
# -----------------------------------------------------------------------------
# DORKY LEXER
# -----------------------------------------------------------------------------
import bisect
# Longest-match patterns for numeric and identifier tokens.
# Raw strings avoid double-escaping the regex metacharacters (same
# patterns as before, just written idiomatically).
NUMBER_RE = re.compile(r"[0-9]+(\.[0-9]*([eE][-+]?[0-9]+)?)?")
IDENTIFIER_RE = re.compile(r"[_A-Za-z][_A-Za-z0-9]*")
# Reserved words mapped to their terminals.  Every keyword's terminal
# attribute on FineGrammar is the upper-cased keyword, except "_",
# which maps to UNDERSCORE.
_KEYWORD_NAMES = (
    "and", "as", "class", "else", "export", "false", "for", "fun",
    "if", "import", "in", "is", "let", "match", "new", "or",
    "return", "self", "true", "while",
)
KEYWORD_TABLE = {"_": FineGrammar.UNDERSCORE}
KEYWORD_TABLE.update(
    (name, getattr(FineGrammar, name.upper())) for name in _KEYWORD_NAMES
)
def tokenize(src: str):
    """Yield (terminal, start_offset, length) triples for the tokens of src.

    Whitespace is skipped and ``//`` line comments are discarded entirely.
    Raises Exception on an unterminated string constant or any character
    that starts no known token.
    """
    pos = 0
    while pos < len(src):
        ch = src[pos]
        if ch.isspace():
            pos += 1
            continue
        token = None
        if ch == "-":
            if src[pos : pos + 2] == "->":
                token = (FineGrammar.ARROW, pos, 2)
            else:
                token = (FineGrammar.MINUS, pos, 1)
        elif ch == "|":
            token = (FineGrammar.BAR, pos, 1)
        elif ch == ":":
            token = (FineGrammar.COLON, pos, 1)
        elif ch == "{":
            token = (FineGrammar.LCURLY, pos, 1)
        elif ch == "}":
            token = (FineGrammar.RCURLY, pos, 1)
        elif ch == ";":
            token = (FineGrammar.SEMICOLON, pos, 1)
        elif ch == "=":
            if src[pos : pos + 2] == "==":
                token = (FineGrammar.EQUALEQUAL, pos, 2)
            else:
                token = (FineGrammar.EQUAL, pos, 1)
        elif ch == "(":
            token = (FineGrammar.LPAREN, pos, 1)
        elif ch == ")":
            token = (FineGrammar.RPAREN, pos, 1)
        elif ch == ",":
            token = (FineGrammar.COMMA, pos, 1)
        elif ch == "!":
            if src[pos : pos + 2] == "!=":
                token = (FineGrammar.BANGEQUAL, pos, 2)
            else:
                token = (FineGrammar.BANG, pos, 1)
        elif ch == "<":
            if src[pos : pos + 2] == "<=":
                token = (FineGrammar.LESSEQUAL, pos, 2)
            else:
                token = (FineGrammar.LESS, pos, 1)
        elif ch == ">":
            if src[pos : pos + 2] == ">=":
                token = (FineGrammar.GREATEREQUAL, pos, 2)
            else:
                token = (FineGrammar.GREATER, pos, 1)
        elif ch == "+":
            token = (FineGrammar.PLUS, pos, 1)
        elif ch == "*":
            token = (FineGrammar.STAR, pos, 1)
        elif ch == "/":
            if src[pos : pos + 2] == "//":
                # Line comment: consume to (but not past) the newline,
                # which the whitespace skip above will then absorb.
                while pos < len(src) and src[pos] != "\n":
                    pos = pos + 1
                continue
            token = (FineGrammar.SLASH, pos, 1)
        elif ch == ".":
            token = (FineGrammar.DOT, pos, 1)
        elif ch == "[":
            token = (FineGrammar.LSQUARE, pos, 1)
        elif ch == "]":
            token = (FineGrammar.RSQUARE, pos, 1)
        elif ch == '"' or ch == "'":
            # Scan for the matching quote, skipping backslash escapes.
            end = pos + 1
            while end < len(src) and src[end] != ch:
                if src[end] == "\\":
                    end += 1  # skip the escaped character
                end += 1
            # BUG FIX: a trailing backslash escape can leave `end` one
            # PAST len(src), so the old `end == len(src)` check missed
            # some unterminated strings; use >= instead.
            if end >= len(src):
                raise Exception(f"Unterminated string constant at {pos}")
            end += 1  # include the closing quote
            token = (FineGrammar.STRING, pos, end - pos)
        else:
            number_match = NUMBER_RE.match(src, pos)
            if number_match:
                token = (FineGrammar.NUMBER, pos, number_match.end() - pos)
            else:
                id_match = IDENTIFIER_RE.match(src, pos)
                if id_match:
                    fragment = src[pos : id_match.end()]
                    keyword = KEYWORD_TABLE.get(fragment)
                    # `is not None` rather than truthiness: don't depend
                    # on Terminal defining __bool__ sensibly.
                    if keyword is not None:
                        token = (keyword, pos, len(fragment))
                    else:
                        token = (FineGrammar.IDENTIFIER, pos, len(fragment))
        if token is None:
            raise Exception("Token error")
        yield token
        pos += token[2]
class FineTokens:
    """Tokenized source text plus line-offset bookkeeping for diagnostics."""

    def __init__(self, src: str):
        self.src = src
        # (terminal, start offset, length) for every token in src.
        # Use the builtin generic `tuple` for consistency with the
        # builtin `list[...]` already used here (file targets 3.9+).
        self._tokens: list[tuple[Terminal, int, int]] = list(tokenize(src))
        # Offsets of every "\n", in ascending order; used by dump() to
        # translate a byte offset into a (line, column) pair.
        self._lines = [m.start() for m in re.finditer("\n", src)]

    def tokens(self):
        """Return the list of (terminal, start, length) token triples."""
        return self._tokens

    def lines(self):
        """Return the ascending offsets of the newline characters in src."""
        return self._lines

    def dump(self, *, start=None, end=None):
        """Print tokens[start:end], one per line, with (line, column) info.

        start/end index into the token list (not the source text) and
        default to the full range.
        """
        if start is None:
            start = 0
        if end is None:
            end = len(self._tokens)
        for token in self._tokens[start:end]:
            # FIX: use a fresh name for the token offset — the original
            # unpacked into `start`, shadowing the slice parameter above.
            (kind, tok_start, length) = token
            # Number of newlines strictly before the token == line index.
            line_index = bisect.bisect_left(self._lines, tok_start)
            if line_index == 0:
                col_start = 0
            else:
                col_start = self._lines[line_index - 1] + 1
            column_index = tok_start - col_start
            value = self.src[tok_start : tok_start + length]
            print(f"{tok_start:04} {kind.value:12} {value} ({line_index}, {column_index})")
if __name__ == "__main__":
from parser.parser import compile_lexer, dump_lexer_table