Fix grammar bugs, work on debugging harness.
This commit is contained in:
parent
797ec8cd76
commit
0fc04cf11e
3 changed files with 312 additions and 13 deletions
201
grammar.py
201
grammar.py
|
|
@ -1,4 +1,6 @@
|
|||
# This is an example grammar.
|
||||
import re
|
||||
|
||||
from parser import Assoc, Grammar, Nothing, Token, rule, seq
|
||||
|
||||
ARROW = Token("Arrow")
|
||||
|
|
@ -119,7 +121,7 @@ class FineGrammar(Grammar):
|
|||
|
||||
@rule
|
||||
def alternate_type(self):
|
||||
return seq(self.type_expression, BAR, self.type_identifier)
|
||||
return seq(self.type_expression, OR, self.type_identifier)
|
||||
|
||||
@rule
|
||||
def type_identifier(self):
|
||||
|
|
@ -170,6 +172,7 @@ class FineGrammar(Grammar):
|
|||
def block(self):
|
||||
return (
|
||||
seq(LCURLY, RCURLY)
|
||||
| seq(LCURLY, self.expression, RCURLY)
|
||||
| seq(LCURLY, self.statement_list, RCURLY)
|
||||
| seq(LCURLY, self.statement_list, self.expression, RCURLY)
|
||||
)
|
||||
|
|
@ -196,7 +199,7 @@ class FineGrammar(Grammar):
|
|||
|
||||
@rule
|
||||
def return_statement(self):
|
||||
return seq(RETURN, self.expression, SEMICOLON)
|
||||
return seq(RETURN, self.expression, SEMICOLON) | seq(RETURN, SEMICOLON)
|
||||
|
||||
@rule
|
||||
def for_statement(self):
|
||||
|
|
@ -254,6 +257,7 @@ class FineGrammar(Grammar):
|
|||
| seq(self.relation_expression, LESSEQUAL, self.additive_expression)
|
||||
| seq(self.relation_expression, GREATER, self.additive_expression)
|
||||
| seq(self.relation_expression, GREATEREQUAL, self.additive_expression)
|
||||
| self.additive_expression
|
||||
)
|
||||
|
||||
@rule
|
||||
|
|
@ -288,6 +292,7 @@ class FineGrammar(Grammar):
|
|||
| self.list_constructor_expression
|
||||
| self.object_constructor_expression
|
||||
| self.match_expression
|
||||
| seq(self.primary_expression, LPAREN, RPAREN)
|
||||
| seq(self.primary_expression, LPAREN, self.expression_list, RPAREN)
|
||||
| seq(self.primary_expression, DOT, IDENTIFIER)
|
||||
| seq(LPAREN, self.expression, RPAREN)
|
||||
|
|
@ -315,7 +320,7 @@ class FineGrammar(Grammar):
|
|||
|
||||
@rule
|
||||
def match_expression(self):
|
||||
return seq(MATCH, self.match_body)
|
||||
return seq(MATCH, self.expression, self.match_body)
|
||||
|
||||
@rule
|
||||
def match_body(self):
|
||||
|
|
@ -375,15 +380,187 @@ class FineGrammar(Grammar):
|
|||
return IDENTIFIER | seq(IDENTIFIER, COLON, self.expression)
|
||||
|
||||
|
||||
grammar = FineGrammar()
|
||||
table = grammar.build_table(start="file")
|
||||
# -----------------------------------------------------------------------------
|
||||
# DORKY LEXER
|
||||
# -----------------------------------------------------------------------------
|
||||
NUMBER_RE = re.compile("[0-9]+(\\.[0-9]*([eE][-+]?[0-9]+)?)?")
|
||||
IDENTIFIER_RE = re.compile("[_A-Za-z][_A-Za-z0-9]*")
|
||||
KEYWORD_TABLE = {
|
||||
"_": UNDERSCORE,
|
||||
"and": AND,
|
||||
"as": AS,
|
||||
"class": CLASS,
|
||||
"else": ELSE,
|
||||
"export": EXPORT,
|
||||
"false": FALSE,
|
||||
"for": FOR,
|
||||
"fun": FUN,
|
||||
"if": IF,
|
||||
"import": IMPORT,
|
||||
"in": IN,
|
||||
"is": IS,
|
||||
"let": LET,
|
||||
"match": MATCH,
|
||||
"new": NEW,
|
||||
"or": OR,
|
||||
"return": RETURN,
|
||||
"self": SELF,
|
||||
"true": TRUE,
|
||||
"while": WHILE,
|
||||
}
|
||||
|
||||
print(f"{len(table)} states")
|
||||
|
||||
average_entries = sum(len(row) for row in table) / len(table)
|
||||
max_entries = max(len(row) for row in table)
|
||||
print(f"{average_entries} average, {max_entries} max")
|
||||
def tokenize(src: str):
    """Lex `src` and yield one `(kind, start, length)` tuple per token.

    `kind` is one of the module-level token objects, `start` is the offset of
    the token's first character in `src` and `length` is its size in
    characters. Whitespace and `//` line comments are skipped and produce no
    tokens.

    Raises:
        Exception: if a string constant is not terminated, or if a character
            does not start any known token.
    """
    # Operators that may extend to two characters, keyed by first character:
    # (second character, two-character token, one-character token).
    compound_tokens = {
        "-": (">", ARROW, MINUS),
        "=": ("=", EQUALEQUAL, EQUAL),
        "!": ("=", BANGEQUAL, BANG),
        "<": ("=", LESSEQUAL, LESS),
        ">": ("=", GREATEREQUAL, GREATER),
    }

    # Tokens that are always exactly one character long.
    simple_tokens = {
        "|": BAR,
        ":": COLON,
        "{": LCURLY,
        "}": RCURLY,
        ";": SEMICOLON,
        "(": LPAREN,
        ")": RPAREN,
        ",": COMMA,
        "+": PLUS,
        "*": STAR,
        "/": SLASH,
        ".": DOT,
        "[": LSQUARE,
        "]": RSQUARE,
    }

    pos = 0
    length = len(src)
    while pos < length:
        ch = src[pos]
        if ch.isspace():
            pos += 1
            continue

        # Line comment: skip up to (not past) the newline; the newline itself
        # is consumed as whitespace on the next iteration.
        if src.startswith("//", pos):
            newline = src.find("\n", pos)
            pos = length if newline < 0 else newline
            continue

        token = None
        if ch in compound_tokens:
            second, double, single = compound_tokens[ch]
            if src.startswith(second, pos + 1):
                token = (double, pos, 2)
            else:
                token = (single, pos, 1)

        elif ch in simple_tokens:
            token = (simple_tokens[ch], pos, 1)

        elif ch == '"' or ch == "'":
            end = pos + 1
            while end < length and src[end] != ch:
                if src[end] == "\\":
                    # Skip the escaped character so an escaped quote does not
                    # terminate the string.
                    end += 1
                end += 1
            # `>=`, not `==`: a trailing backslash moves `end` one past the
            # end of the input, which must still count as unterminated.
            if end >= length:
                raise Exception(f"Unterminated string constant at {pos}")
            end += 1  # Include the closing quote.
            token = (STRING, pos, end - pos)

        else:
            number_match = NUMBER_RE.match(src, pos)
            if number_match:
                token = (NUMBER, pos, number_match.end() - pos)
            else:
                id_match = IDENTIFIER_RE.match(src, pos)
                if id_match:
                    fragment = src[pos : id_match.end()]
                    # Keywords lex like identifiers; the table tells them apart.
                    kind = KEYWORD_TABLE.get(fragment, IDENTIFIER)
                    token = (kind, pos, len(fragment))

        if token is None:
            raise Exception(f"Token error at {pos}: unexpected character {ch!r}")

        yield token
        pos += token[2]
|
||||
|
||||
|
||||
import bisect
|
||||
|
||||
|
||||
class FineTokens:
    """The token stream for one source string, plus line-start information.

    Attributes are plain data read by the parser harness:
      src    -- the original source text.
      tokens -- list of `(kind, start, length)` tuples produced by `tokenize`.
      lines  -- ascending offsets of every newline character in `src`.
    """

    def __init__(self, src: str):
        self.src = src
        self.tokens = list(tokenize(src))
        self.lines = [m.start() for m in re.finditer("\n", src)]

    def line_and_column(self, offset):
        """Return the zero-based `(line, column)` of character `offset`."""
        line_index = bisect.bisect_left(self.lines, offset)
        # The line begins just after the previous newline (or at 0 for the
        # first line).
        line_start = 0 if line_index == 0 else self.lines[line_index - 1] + 1
        return (line_index, offset - line_start)

    def dump(self, *, start=None, end=None):
        """Print the tokens with indices in `[start, end)`, one per line.

        Each line shows the token's offset, kind, text and zero-based
        `(line, column)`. `start` and `end` default to the whole stream.
        """
        # Slicing with None already means "from the beginning" / "to the end".
        for kind, offset, length in self.tokens[start:end]:
            line_index, column_index = self.line_and_column(offset)
            print(
                f"{offset:04} {kind.value:12} {self.src[offset:offset+length]} ({line_index}, {column_index})"
            )
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
grammar = FineGrammar()
|
||||
table = grammar.build_table(start="expression")
|
||||
|
||||
print(f"{len(table)} states")
|
||||
|
||||
average_entries = sum(len(row) for row in table) / len(table)
|
||||
max_entries = max(len(row) for row in table)
|
||||
print(f"{average_entries} average, {max_entries} max")
|
||||
|
|
|
|||
122
harness.py
Normal file
122
harness.py
Normal file
|
|
@ -0,0 +1,122 @@
|
|||
import bisect
|
||||
import typing
|
||||
|
||||
import grammar
|
||||
from parser import Token, Grammar, rule, seq
|
||||
|
||||
|
||||
def _token_position(tokens, index):
    """Return `(line, column)` of token `index`: line is 1-based, column 0-based."""
    (_, start, _) = tokens.tokens[index]
    line_index = bisect.bisect_left(tokens.lines, start)
    # The line begins just after the previous newline (or at 0 for line one).
    col_start = 0 if line_index == 0 else tokens.lines[line_index - 1] + 1
    return (line_index + 1, start - col_start)


def parse(table, tokens, trace=False):
    """Parse a token stream with a generated parsing table.

    Args:
        table: sequence of per-state mappings from a symbol to an action
            tuple: `("shift", state)`, `("reduce", name, size)`,
            `("goto", state)` or `("accept",)`. A missing entry is an error.
        tokens: object with a `tokens` list of `(kind, start, length)` tuples
            (each `kind` exposing `.value`) and a `lines` list of newline
            offsets. Do not append an end-of-stream marker; `"$"` is added
            here.
        trace: when true, print the state stack, upcoming input and chosen
            action before every step.

    Returns:
        `(tree, errors)`. On success `tree` is the concrete syntax tree of
        nested `(name, children)` tuples and `errors` is empty. On a syntax
        error `tree` is None and `errors` holds one `line:column` message.

    Raises:
        ValueError: if the input ends where more tokens are required, or if
            the table contains an unrecognized action kind.

    This is not a *great* parser, it's really just a demo for what you can
    do with the table.
    """
    symbols = [t.value for (t, _, _) in tokens.tokens]

    assert "$" not in symbols
    symbols.append("$")
    input_index = 0

    # Our stack is a stack of tuples, where the first entry is the state number
    # and the second entry is the 'value' that was generated when the state was
    # pushed.
    stack: list[typing.Tuple[int, typing.Any]] = [(0, None)]
    while True:
        current_state = stack[-1][0]
        current_token = symbols[input_index]

        action = table[current_state].get(current_token, ("error",))
        if trace:
            print(
                "{stack: <20} {input: <50} {action: <5}".format(
                    stack=repr([s[0] for s in stack]),
                    input=repr(symbols[input_index : input_index + 4]),
                    action=repr(action),
                )
            )

        kind = action[0]
        if kind == "accept":
            return (stack[-1][1], [])

        elif kind == "reduce":
            name, size = action[1], action[2]

            # An empty production has size 0. `stack[-0:]` would be the whole
            # stack and `stack[:-0]` would be empty, so handle it explicitly.
            if size:
                children = tuple(s[1] for s in stack[-size:])
                del stack[-size:]
            else:
                children = ()

            goto = table[stack[-1][0]].get(name, ("error",))
            assert goto[0] == "goto"  # Corrupt table?
            stack.append((goto[1], (name, children)))

        elif kind == "shift":
            stack.append((action[1], (current_token, ())))
            input_index += 1

        elif kind == "error":
            # Only the synthetic "$" lives past the end of tokens.tokens.
            if input_index >= len(tokens.tokens):
                raise ValueError("Unexpected end of file")

            line_index, column_index = _token_position(tokens, input_index)
            return (
                None,
                [
                    f"{line_index}:{column_index}: Syntax error: unexpected symbol {current_token}"
                ],
            )

        else:
            # Without this the loop would spin forever on a malformed table.
            raise ValueError(f"Unknown parser action: {action!r}")
|
||||
|
||||
|
||||
def harness(lexer_func, grammar_func, start_rule, source_path, *, trace=True):
    """Build a parse table, report its size and optionally parse a file.

    Args:
        lexer_func: callable taking the source text and returning the token
            object handed to `parse`.
        grammar_func: zero-argument callable returning an object with a
            `build_table(start=...)` method.
        start_rule: name of the start rule passed to `build_table`.
        source_path: path of a UTF-8 source file to parse; if falsy, only
            the table statistics are printed.
        trace: forwarded to `parse`; defaults to True, matching the previous
            hard-coded behavior.
    """
    table = grammar_func().build_table(start=start_rule)
    print(f"{len(table)} states")

    row_sizes = [len(row) for row in table]
    average_entries = sum(row_sizes) / len(table)
    max_entries = max(row_sizes)
    print(f"{average_entries} average, {max_entries} max")

    if not source_path:
        return

    with open(source_path, "r", encoding="utf-8") as f:
        src = f.read()
    tokens = lexer_func(src)
    (_, errors) = parse(table, tokens, trace=trace)
    if errors:
        print(f"{len(errors)} errors:")
        for error in errors:
            print(f"  {error}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import sys
|
||||
|
||||
source_path = None
|
||||
if len(sys.argv) == 2:
|
||||
source_path = sys.argv[1]
|
||||
|
||||
harness(
|
||||
lexer_func=grammar.FineTokens,
|
||||
grammar_func=grammar.FineGrammar,
|
||||
start_rule="file",
|
||||
source_path=source_path,
|
||||
)
|
||||
|
||||
# print(parser_faster.format_table(gen, table))
|
||||
# print()
|
||||
# tree = parse(table, ["id", "+", "(", "id", "[", "id", "]", ")"])
|
||||
|
|
@ -1723,7 +1723,7 @@ class Grammar:
|
|||
|
||||
return grammar
|
||||
|
||||
def build_table(self, start: str, generator=GenerateLALR):
|
||||
def build_table(self, start: str, generator=GenerateLR1):
|
||||
"""Construct a parse table for this grammar, starting at the named
|
||||
nonterminal rule.
|
||||
"""
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue