Remove the old hand-lexer
The machine lexer is working now
This commit is contained in:
parent
d62076f3c4
commit
cd62b65789
1 changed file with 0 additions and 180 deletions
180
grammar.py
180
grammar.py
|
|
@ -413,186 +413,6 @@ class FineGrammar(Grammar):
|
|||
RSQUARE = Terminal("]", kind=TerminalKind.Punctuation.SquareBracket.Close)
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
# DORKY LEXER
# -----------------------------------------------------------------------------
import bisect


# Numbers: an integer part, then an optional fraction; an exponent is only
# matched after a fraction (so "1e5" does NOT lex as a single number here).
NUMBER_RE = re.compile("[0-9]+(\\.[0-9]*([eE][-+]?[0-9]+)?)?")
# C-style identifiers: underscore/letter first, then letters, digits, underscores.
IDENTIFIER_RE = re.compile("[_A-Za-z][_A-Za-z0-9]*")
# Reserved words mapped to their terminals.  tokenize() consults this AFTER
# IDENTIFIER_RE matches, so every keyword shadows the identifier it spells.
KEYWORD_TABLE = {
    "_": FineGrammar.UNDERSCORE,
    "and": FineGrammar.AND,
    "as": FineGrammar.AS,
    "class": FineGrammar.CLASS,
    "else": FineGrammar.ELSE,
    "export": FineGrammar.EXPORT,
    "false": FineGrammar.FALSE,
    "for": FineGrammar.FOR,
    "fun": FineGrammar.FUN,
    "if": FineGrammar.IF,
    "import": FineGrammar.IMPORT,
    "in": FineGrammar.IN,
    "is": FineGrammar.IS,
    "let": FineGrammar.LET,
    "match": FineGrammar.MATCH,
    "new": FineGrammar.NEW,
    "or": FineGrammar.OR,
    "return": FineGrammar.RETURN,
    "self": FineGrammar.SELF,
    "true": FineGrammar.TRUE,
    "while": FineGrammar.WHILE,
}
||||
def tokenize(src: str):
    """Yield ``(terminal, start, length)`` triples for the tokens of *src*.

    Whitespace is skipped, ``//`` comments run to end of line, and an
    ``Exception`` is raised on an unterminated string or on any character
    that starts no known token.

    Args:
        src: the complete source text to lex.
    Yields:
        ``(terminal, start_offset, length)`` for each token, in order.
    Raises:
        Exception: on an unterminated string constant or an unknown character.
    """
    pos = 0
    while pos < len(src):
        ch = src[pos]
        if ch.isspace():
            pos += 1
            continue

        token = None
        if ch == "-":
            # "->" must win over minus.
            if src[pos : pos + 2] == "->":
                token = (FineGrammar.ARROW, pos, 2)
            else:
                token = (FineGrammar.MINUS, pos, 1)

        elif ch == "|":
            token = (FineGrammar.BAR, pos, 1)

        elif ch == ":":
            token = (FineGrammar.COLON, pos, 1)

        elif ch == "{":
            token = (FineGrammar.LCURLY, pos, 1)

        elif ch == "}":
            token = (FineGrammar.RCURLY, pos, 1)

        elif ch == ";":
            token = (FineGrammar.SEMICOLON, pos, 1)

        elif ch == "=":
            if src[pos : pos + 2] == "==":
                token = (FineGrammar.EQUALEQUAL, pos, 2)
            else:
                token = (FineGrammar.EQUAL, pos, 1)

        elif ch == "(":
            token = (FineGrammar.LPAREN, pos, 1)

        elif ch == ")":
            token = (FineGrammar.RPAREN, pos, 1)

        elif ch == ",":
            token = (FineGrammar.COMMA, pos, 1)

        elif ch == "!":
            if src[pos : pos + 2] == "!=":
                token = (FineGrammar.BANGEQUAL, pos, 2)
            else:
                token = (FineGrammar.BANG, pos, 1)

        elif ch == "<":
            if src[pos : pos + 2] == "<=":
                token = (FineGrammar.LESSEQUAL, pos, 2)
            else:
                token = (FineGrammar.LESS, pos, 1)

        elif ch == ">":
            if src[pos : pos + 2] == ">=":
                token = (FineGrammar.GREATEREQUAL, pos, 2)
            else:
                token = (FineGrammar.GREATER, pos, 1)

        elif ch == "+":
            token = (FineGrammar.PLUS, pos, 1)

        elif ch == "*":
            token = (FineGrammar.STAR, pos, 1)

        elif ch == "/":
            if src[pos : pos + 2] == "//":
                # Line comment: consume to end of line (pos already advanced,
                # so skip the token-yield machinery entirely).
                while pos < len(src) and src[pos] != "\n":
                    pos = pos + 1
                continue

            token = (FineGrammar.SLASH, pos, 1)

        elif ch == '"' or ch == "'":
            end = pos + 1
            while end < len(src) and src[end] != ch:
                if src[end] == "\\":
                    end += 1  # also skip the escaped character
                end += 1
            # BUG FIX: a trailing backslash escape can advance `end` PAST
            # len(src), so the old `end == len(src)` test let an unterminated
            # string like '"a\' through as a bogus STRING token.  `>=` rejects
            # both ways of running off the end.
            if end >= len(src):
                raise Exception(f"Unterminated string constant at {pos}")
            end += 1  # include the closing quote
            token = (FineGrammar.STRING, pos, end - pos)

        else:
            # Not punctuation: try a number, then an identifier/keyword.
            number_match = NUMBER_RE.match(src, pos)
            if number_match:
                token = (FineGrammar.NUMBER, pos, number_match.end() - pos)
            else:
                id_match = IDENTIFIER_RE.match(src, pos)
                if id_match:
                    fragment = src[pos : id_match.end()]
                    # Keywords shadow identifiers.
                    keyword = KEYWORD_TABLE.get(fragment)
                    if keyword:
                        token = (keyword, pos, len(fragment))
                    else:
                        token = (FineGrammar.IDENTIFIER, pos, len(fragment))

        if token is None:
            raise Exception("Token error")
        yield token
        pos += token[2]
||||
class FineTokens:
    """The fully lexed token stream for a source string, with position helpers."""

    def __init__(self, src: str):
        self.src = src
        # Eagerly lex the whole input into (terminal, start offset, length).
        self._tokens: list[typing.Tuple[Terminal, int, int]] = list(tokenize(src))
        # Offsets of every "\n"; bisecting this maps an offset to a line index.
        self._lines = [m.start() for m in re.finditer("\n", src)]

    def tokens(self):
        """Return the list of (terminal, start, length) triples."""
        return self._tokens

    def lines(self):
        """Return the offsets of the newline characters in the source."""
        return self._lines

    def dump(self, *, start=None, end=None):
        """Print tokens[start:end], one per line, with (line, column) positions.

        BUG FIX: the loop used to rebind the `start` parameter while unpacking
        each token — harmless only because the slice was computed first.
        Distinct local names remove the shadowing; output is unchanged.
        """
        if start is None:
            start = 0
        if end is None:
            end = len(self._tokens)

        for kind, tok_start, length in self._tokens[start:end]:
            # Count of newlines before the token == zero-based line index.
            line_index = bisect.bisect_left(self._lines, tok_start)
            if line_index == 0:
                col_start = 0
            else:
                # Column 0 is the character just after the preceding newline.
                col_start = self._lines[line_index - 1] + 1
            column_index = tok_start - col_start
            value = self.src[tok_start : tok_start + length]
            print(f"{tok_start:04} {kind.value:12} {value} ({line_index}, {column_index})")
||||
if __name__ == "__main__":
|
||||
from parser.parser import compile_lexer, dump_lexer_table
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue