Simplify token handling in parsing

This commit is contained in:
John Doty 2024-06-06 08:16:38 -07:00
parent b60b38d78e
commit 68a8e9298b

View file

@ -26,12 +26,12 @@ import parser
############################################################################### ###############################################################################
def trace_state(id, stack, input, input_index, action): def trace_state(id, stack, token, action):
print( print(
"{id: <04}: {stack: <20} {input: <50} {action: <5}".format( "{id: <04}: {stack: <20} {input: <50} {action: <5}".format(
id=id, id=id,
stack=repr([s[0] for s in stack]), stack=repr([s[0] for s in stack]),
input=repr(input[input_index : input_index + 4]), input=token.kind,
action=repr(action), action=repr(action),
) )
) )
@ -74,27 +74,25 @@ class ParserThread:
# Our stack is a stack of tuples, where the first entry is the state # Our stack is a stack of tuples, where the first entry is the state
# number and the second entry is the 'value' that was generated when the # number and the second entry is the 'value' that was generated when the
# state was pushed. # state was pushed.
table: parser.ParseTable
stack: list[typing.Tuple[int, TokenValue | Tree | None]] stack: list[typing.Tuple[int, TokenValue | Tree | None]]
def __init__(self, id, trace, stack): def __init__(self, id, trace, table, stack):
self.id = id self.id = id
self.trace = trace self.trace = trace
self.table = table
self.stack = stack self.stack = stack
def step( def step(self, current_token: TokenValue) -> StepResult:
self,
table: parser.ParseTable,
current_token: str,
input_index: int,
input_tokens: list[typing.Tuple],
) -> StepResult:
stack = self.stack stack = self.stack
table = self.table
while True: while True:
current_state = stack[-1][0] current_state = stack[-1][0]
action = table.actions[current_state].get(current_token, parser.Error()) action = table.actions[current_state].get(current_token.kind, parser.Error())
if self.trace: if self.trace:
self.trace(self.id, stack, input, input_index, action) self.trace(self.id, stack, current_token, action)
match action: match action:
case parser.Accept(): case parser.Accept():
@ -126,9 +124,7 @@ class ParserThread:
continue continue
case parser.Shift(state): case parser.Shift(state):
(kind, start, length) = input_tokens[input_index] stack.append((state, current_token))
tval = TokenValue(kind=kind.value, start=start, end=start + length)
stack.append((state, tval))
return ContinueResult() return ContinueResult()
case parser.Error(): case parser.Error():
@ -138,25 +134,26 @@ class ParserThread:
raise ValueError(f"Unknown action type: {action}") raise ValueError(f"Unknown action type: {action}")
def parser_thread():
pass
def parse(table: parser.ParseTable, tokens, trace=None) -> typing.Tuple[Tree | None, list[str]]: def parse(table: parser.ParseTable, tokens, trace=None) -> typing.Tuple[Tree | None, list[str]]:
input_tokens = tokens.tokens() input_tokens = tokens.tokens()
input: list[str] = [t.value for (t, _, _) in input_tokens] input: list[TokenValue] = [
TokenValue(kind=kind.value, start=start, end=start + length)
for (kind, start, length) in input_tokens
]
assert "$" not in input assert "$" not in input
input = input + ["$"] input = input + [TokenValue(kind="$", start=-1, end=-1)]
input_index = 0 input_index = 0
threads = [ParserThread(0, trace, [(0, None)])] threads = [
ParserThread(0, trace, table, [(0, None)]),
]
while True: while True:
assert len(threads) > 0 assert len(threads) > 0
current_token = input[input_index] current_token = input[input_index]
for thread in threads: for thread in threads:
sr = thread.step(table, current_token, input_index, input_tokens) sr = thread.step(current_token)
match sr: match sr:
case AcceptResult(value): case AcceptResult(value):
return (value, []) return (value, [])
@ -169,8 +166,8 @@ def parse(table: parser.ParseTable, tokens, trace=None) -> typing.Tuple[Tree | N
message = "Unexpected end of file" message = "Unexpected end of file"
start = input_tokens[-1][1] start = input_tokens[-1][1]
else: else:
message = f"Syntax error: unexpected symbol {current_token}" message = f"Syntax error: unexpected symbol {current_token.kind}"
(_, start, _) = input_tokens[input_index] start = current_token.start
line_index = bisect.bisect_left(tokens.lines, start) line_index = bisect.bisect_left(tokens.lines, start)
if line_index == 0: if line_index == 0: