Compare commits

...

4 commits

3 changed files with 390 additions and 71 deletions

View file

@ -1,4 +1,6 @@
# This is an example grammar.
import re
from parser import Assoc, Grammar, Nothing, Token, rule, seq
ARROW = Token("Arrow")
@ -119,7 +121,7 @@ class FineGrammar(Grammar):
@rule
def alternate_type(self):
return seq(self.type_expression, BAR, self.type_identifier)
return seq(self.type_expression, OR, self.type_identifier)
@rule
def type_identifier(self):
@ -170,6 +172,7 @@ class FineGrammar(Grammar):
def block(self):
return (
seq(LCURLY, RCURLY)
| seq(LCURLY, self.expression, RCURLY)
| seq(LCURLY, self.statement_list, RCURLY)
| seq(LCURLY, self.statement_list, self.expression, RCURLY)
)
@ -196,7 +199,7 @@ class FineGrammar(Grammar):
@rule
def return_statement(self):
return seq(RETURN, self.expression, SEMICOLON)
return seq(RETURN, self.expression, SEMICOLON) | seq(RETURN, SEMICOLON)
@rule
def for_statement(self):
@ -254,6 +257,7 @@ class FineGrammar(Grammar):
| seq(self.relation_expression, LESSEQUAL, self.additive_expression)
| seq(self.relation_expression, GREATER, self.additive_expression)
| seq(self.relation_expression, GREATEREQUAL, self.additive_expression)
| self.additive_expression
)
@rule
@ -288,6 +292,7 @@ class FineGrammar(Grammar):
| self.list_constructor_expression
| self.object_constructor_expression
| self.match_expression
| seq(self.primary_expression, LPAREN, RPAREN)
| seq(self.primary_expression, LPAREN, self.expression_list, RPAREN)
| seq(self.primary_expression, DOT, IDENTIFIER)
| seq(LPAREN, self.expression, RPAREN)
@ -315,7 +320,7 @@ class FineGrammar(Grammar):
@rule
def match_expression(self):
return seq(MATCH, self.match_body)
return seq(MATCH, self.expression, self.match_body)
@rule
def match_body(self):
@ -375,15 +380,187 @@ class FineGrammar(Grammar):
return IDENTIFIER | seq(IDENTIFIER, COLON, self.expression)
grammar = FineGrammar()
table = grammar.build_table(start="file")
# -----------------------------------------------------------------------------
# DORKY LEXER
# -----------------------------------------------------------------------------

# Raw strings so the regex escapes read literally (r"\." rather than "\\.").
# NUMBER_RE: integer part, then an optional fraction with an optional exponent.
NUMBER_RE = re.compile(r"[0-9]+(\.[0-9]*([eE][-+]?[0-9]+)?)?")
# IDENTIFIER_RE: a letter or underscore, then letters/digits/underscores.
IDENTIFIER_RE = re.compile(r"[_A-Za-z][_A-Za-z0-9]*")

# Identifiers that are actually keywords, mapped to their token kinds.
# Checked by the lexer after an identifier match succeeds.
KEYWORD_TABLE = {
    "_": UNDERSCORE,
    "and": AND,
    "as": AS,
    "class": CLASS,
    "else": ELSE,
    "export": EXPORT,
    "false": FALSE,
    "for": FOR,
    "fun": FUN,
    "if": IF,
    "import": IMPORT,
    "in": IN,
    "is": IS,
    "let": LET,
    "match": MATCH,
    "new": NEW,
    "or": OR,
    "return": RETURN,
    "self": SELF,
    "true": TRUE,
    "while": WHILE,
}
print(f"{len(table)} states")
average_entries = sum(len(row) for row in table) / len(table)
max_entries = max(len(row) for row in table)
print(f"{average_entries} average, {max_entries} max")
def tokenize(src: str):
    """Yield (kind, start, length) triples for every token in *src*.

    kind is one of the module-level Token constants, start is the offset of
    the token in src, and length is its length in characters. Whitespace and
    line comments ("//" to end of line) are skipped and produce no tokens.

    Raises Exception for unterminated string constants and for characters
    that begin no known token.
    """
    pos = 0
    while pos < len(src):
        ch = src[pos]
        if ch.isspace():
            pos += 1
            continue

        token = None
        if ch == "-":
            # Two-character "->" wins over plain "-".
            if src[pos : pos + 2] == "->":
                token = (ARROW, pos, 2)
            else:
                token = (MINUS, pos, 1)
        elif ch == "|":
            token = (BAR, pos, 1)
        elif ch == ":":
            token = (COLON, pos, 1)
        elif ch == "{":
            token = (LCURLY, pos, 1)
        elif ch == "}":
            token = (RCURLY, pos, 1)
        elif ch == ";":
            token = (SEMICOLON, pos, 1)
        elif ch == "=":
            if src[pos : pos + 2] == "==":
                token = (EQUALEQUAL, pos, 2)
            else:
                token = (EQUAL, pos, 1)
        elif ch == "(":
            token = (LPAREN, pos, 1)
        elif ch == ")":
            token = (RPAREN, pos, 1)
        elif ch == ",":
            token = (COMMA, pos, 1)
        elif ch == "!":
            if src[pos : pos + 2] == "!=":
                token = (BANGEQUAL, pos, 2)
            else:
                token = (BANG, pos, 1)
        elif ch == "<":
            if src[pos : pos + 2] == "<=":
                token = (LESSEQUAL, pos, 2)
            else:
                token = (LESS, pos, 1)
        elif ch == ">":
            if src[pos : pos + 2] == ">=":
                token = (GREATEREQUAL, pos, 2)
            else:
                token = (GREATER, pos, 1)
        elif ch == "+":
            token = (PLUS, pos, 1)
        elif ch == "*":
            token = (STAR, pos, 1)
        elif ch == "/":
            # "//" starts a line comment: consume to end of line, no token.
            if src[pos : pos + 2] == "//":
                while pos < len(src) and src[pos] != "\n":
                    pos = pos + 1
                continue
            token = (SLASH, pos, 1)
        elif ch == ".":
            token = (DOT, pos, 1)
        elif ch == "[":
            token = (LSQUARE, pos, 1)
        elif ch == "]":
            token = (RSQUARE, pos, 1)
        elif ch == '"' or ch == "'":
            # String constant, closed by the same quote that opened it.
            # A backslash escapes the next character (including the quote).
            end = pos + 1
            while end < len(src) and src[end] != ch:
                if src[end] == "\\":
                    end += 1  # skip over the escaped character
                end += 1
            # BUG FIX: was `end == len(src)`. An escape at the very end of
            # input pushes `end` one *past* len(src), which let an
            # unterminated string slip through unreported; `>=` catches
            # both ways of running off the end.
            if end >= len(src):
                raise Exception(f"Unterminated string constant at {pos}")
            end += 1  # include the closing quote in the token
            token = (STRING, pos, end - pos)
        else:
            # Not punctuation: try a number first, then keyword/identifier.
            number_match = NUMBER_RE.match(src, pos)
            if number_match:
                token = (NUMBER, pos, number_match.end() - pos)
            else:
                id_match = IDENTIFIER_RE.match(src, pos)
                if id_match:
                    fragment = src[pos : id_match.end()]
                    keyword = KEYWORD_TABLE.get(fragment)
                    if keyword:
                        token = (keyword, pos, len(fragment))
                    else:
                        token = (IDENTIFIER, pos, len(fragment))

        if token is None:
            raise Exception("Token error")
        yield token
        pos += token[2]
import bisect
class FineTokens:
    """The tokenized form of a source file.

    Holds the raw source text, the full token list, and the offsets of every
    newline (for converting token offsets into line/column positions).
    """

    def __init__(self, src: str):
        self.src = src
        self.tokens = list(tokenize(src))
        self.lines = [m.start() for m in re.finditer("\n", src)]

    def dump(self, *, start=None, end=None):
        """Print tokens[start:end], one per line, with offset, kind, text,
        and (line, column) position. Defaults cover the whole token list."""
        lo = 0 if start is None else start
        hi = len(self.tokens) if end is None else end
        for kind, offset, length in self.tokens[lo:hi]:
            # Number of newlines before the token == its zero-based line.
            line_index = bisect.bisect_left(self.lines, offset)
            # Column is measured from the character after the previous newline.
            col_base = 0 if line_index == 0 else self.lines[line_index - 1] + 1
            column_index = offset - col_base
            print(
                f"{offset:04} {kind.value:12} {self.src[offset:offset+length]} ({line_index}, {column_index})"
            )
if __name__ == "__main__":
    # Quick self-check: build the table for the expression sub-grammar and
    # report its size statistics.
    g = FineGrammar()
    parse_table = g.build_table(start="expression")
    print(f"{len(parse_table)} states")
    row_sizes = [len(row) for row in parse_table]
    average_entries = sum(row_sizes) / len(parse_table)
    max_entries = max(row_sizes)
    print(f"{average_entries} average, {max_entries} max")

130
harness.py Normal file
View file

@ -0,0 +1,130 @@
import bisect
import typing
import grammar
import parser
# from parser import Token, Grammar, rule, seq
def trace_state(stack, input, input_index, action):
    """Debug hook for parse(): print the parser's state stack, a short
    window of upcoming input, and the action about to be taken."""
    states = repr([entry[0] for entry in stack])
    window = repr(input[input_index : input_index + 4])
    print(f"{states: <20} {window: <50} {repr(action): <5}")
def parse(table, tokens, trace=None):
    """Parse the input with the generated parsing table and return the
    concrete syntax tree.

    The parsing table can be generated by GenerateLR0.gen_table() or by any
    of the other generators below. The parsing mechanism never changes, only
    the table generation mechanism.

    tokens is a tokenized source (with .tokens and .lines attributes). Don't
    stick an end-of-stream marker on, I'll stick one on for you.

    Returns a (tree, errors) pair: on success the tree and an empty error
    list, on a syntax error (None, [message]).

    This is not a *great* parser, it's really just a demo for what you can
    do with the table.
    """
    # Work on the token kind names only; positions are looked up again for
    # error reporting below.
    input = [t.value for (t, _, _) in tokens.tokens]
    assert "$" not in input
    input = input + ["$"]

    input_index = 0

    # Our stack is a stack of tuples, where the first entry is the state number
    # and the second entry is the 'value' that was generated when the state was
    # pushed.
    stack: list[typing.Tuple[int, typing.Any]] = [(0, None)]
    while True:
        current_state = stack[-1][0]
        current_token = input[input_index]

        # Missing table entry means syntax error.
        action = table[current_state].get(current_token, ("error",))
        if trace:
            trace(stack, input, input_index, action)

        if action[0] == "accept":
            # Done: the finished value is on top of the stack.
            return (stack[-1][1], [])
        elif action[0] == "reduce":
            # Pop `size` values, wrap them in a (name, children) node, then
            # follow the goto for `name` from the newly exposed state.
            name = action[1]
            size = action[2]
            value = (name, tuple(s[1] for s in stack[-size:]))
            stack = stack[:-size]
            goto = table[stack[-1][0]].get(name, ("error",))
            assert goto[0] == "goto"  # Corrupt table?
            stack.append((goto[1], value))
        elif action[0] == "shift":
            # Push the token as a leaf node and advance the input.
            stack.append((action[1], (current_token, ())))
            input_index += 1
        elif action[0] == "error":
            # NOTE(review): an error at the synthetic "$" token raises
            # instead of returning an error list like the branch below —
            # confirm callers expect ValueError at end-of-file.
            if input_index >= len(tokens.tokens):
                raise ValueError("Unexpected end of file")
            else:
                # Convert the token's offset into a 1-based line and a
                # 0-based column using the recorded newline offsets.
                (_, start, _) = tokens.tokens[input_index]
                line_index = bisect.bisect_left(tokens.lines, start)
                if line_index == 0:
                    col_start = 0
                else:
                    col_start = tokens.lines[line_index - 1] + 1
                column_index = start - col_start
                line_index += 1
                return (
                    None,
                    [
                        f"{line_index}:{column_index}: Syntax error: unexpected symbol {current_token}"
                    ],
                )
def harness(lexer_func, grammar_func, start_rule, source_path):
    """Build the parse table for grammar_func, print table statistics, and,
    if source_path is given, lex and parse that file, printing any syntax
    errors that come back."""
    # generator = parser.GenerateLR1
    generator = parser.GenerateLALR
    table = grammar_func().build_table(start=start_rule, generator=generator)

    print(f"{len(table)} states")
    row_sizes = [len(row) for row in table]
    average_entries = sum(row_sizes) / len(table)
    max_entries = max(row_sizes)
    print(f"{average_entries} average, {max_entries} max")

    if source_path:
        with open(source_path, "r", encoding="utf-8") as f:
            src = f.read()
        tokens = lexer_func(src)
        _, errors = parse(table, tokens)
        if errors:
            print(f"{len(errors)} errors:")
            for error in errors:
                print(f"  {error}")
if __name__ == "__main__":
    import sys

    # An optional single argument names the source file to parse.
    path = sys.argv[1] if len(sys.argv) == 2 else None
    harness(
        lexer_func=grammar.FineTokens,
        grammar_func=grammar.FineGrammar,
        start_rule="file",
        source_path=path,
    )
# print(parser_faster.format_table(gen, table))
# print()
# tree = parse(table, ["id", "+", "(", "id", "[", "id", "]", ")"])

122
parser.py
View file

@ -257,6 +257,14 @@ class Configuration:
lookahead=(),
)
def replace_lookahead(self, lookahead: typing.Tuple[int, ...]) -> "Configuration":
    """Return a copy of this configuration with the same core (name,
    symbols, position) but with its lookahead replaced by *lookahead*."""
    return Configuration(
        name=self.name,
        symbols=self.symbols,
        position=self.position,
        lookahead=lookahead,
    )
@property
def rest(self):
    """The tail of the production: every symbol after the one at the
    current position."""
    return self.symbols[(self.position + 1) :]
@ -1382,57 +1390,67 @@ class GenerateLALR(GenerateLR1):
use a bunch of improvement, probably.)
"""
def merge_sets(self, config_set_a, config_set_b):
    """Merge the two config sets, by keeping the item cores but merging
    the lookahead sets for each item.

    Both sets must have the same length and pairwise-equal cores (equal
    ignoring lookahead). Returns a new tuple of configurations whose
    lookahead is the sorted union of the two inputs' lookaheads.
    """
    assert len(config_set_a) == len(config_set_b)
    merged = []
    for index, a in enumerate(config_set_a):
        b = config_set_b[index]
        assert a.clear_lookahead() == b.clear_lookahead()
        # Union the two lookahead sets; sort for a canonical ordering.
        new_lookahead = tuple(sorted(set(a.lookahead + b.lookahead)))
        # BUG FIX: previously appended a.clear_lookahead(), which silently
        # discarded the merged lookahead just computed above. Keep the core
        # but install the unioned lookahead.
        merged.append(a.replace_lookahead(new_lookahead))
    return tuple(merged)
def sets_equal(self, a, b):
    """Return True when the two configuration sets have identical cores,
    disregarding lookahead on every item."""
    core_a = tuple(config.clear_lookahead() for config in a)
    core_b = tuple(config.clear_lookahead() for config in b)
    return core_a == core_b
def gen_sets(self, config_set) -> ConfigurationSetInfo:
def gen_sets(self, config_set: typing.Tuple[Configuration, ...]) -> ConfigurationSetInfo:
"""Recursively generate all configuration sets starting from the
provided set, and merge them with the provided set 'F'.
provided set.
The difference between this method and the one in GenerateLR0, where
this comes from, is in the part that stops recursion. In LALR we
compare for set equality *ignoring lookahead*. If we find a match,
then instead of returning F unchanged, we merge the two equal sets
and replace the set in F, returning the modified set.
this comes from, is that we're going to be keeping track of states
that we found that are equivalent in lookahead.
"""
#
# First, do the actual walk. Don't merge yet: just keep track of all
# the config sets that need to be merged.
#
F = {}
seen = set()
successors = []
pending = [config_set]
while len(pending) > 0:
config_set = pending.pop()
if config_set in seen:
continue
seen.add(config_set)
config_set_no_la = tuple(s.clear_lookahead() for s in config_set)
existing = F.get(config_set_no_la)
if existing is not None:
F[config_set_no_la] = self.merge_sets(config_set, existing)
existing.append(config_set)
else:
F[config_set_no_la] = config_set
F[config_set_no_la] = [config_set]
for symbol, successor in self.gen_all_successors(config_set):
successor_no_la = tuple(s.clear_lookahead() for s in successor)
successors.append((config_set_no_la, symbol, successor_no_la))
pending.append(successor)
# Now we gathered the sets, merge them all.
final_sets = {}
for key, config_sets in F.items():
new_config_set = []
config_groupings = [[] for _ in range(len(config_sets[0]))]
for config_set in config_sets:
for i, config in enumerate(config_set):
config_groupings[i].append(config)
for config_group in config_groupings:
new_lookahead = [l for config in config_group for l in config.lookahead]
new_lookahead = tuple(sorted(set(new_lookahead)))
new_config_set.append(
Configuration(
name=config_group[0].name,
symbols=config_group[0].symbols,
position=config_group[0].position,
lookahead=new_lookahead,
)
)
final_sets[key] = tuple(new_config_set)
# Register all the actually merged, final config sets.
result = ConfigurationSetInfo()
for config_set in F.values():
for config_set in final_sets.values():
result.register_config_set(config_set)
# Now record all the successors that we found. Of course, the actual
@ -1443,10 +1461,10 @@ class GenerateLALR(GenerateLR1):
# so we can find the final sets, then look them up in the registered
# sets, and actually register the successor.
for config_set_no_la, symbol, successor_no_la in successors:
actual_config_set = F[config_set_no_la]
actual_config_set = final_sets[config_set_no_la]
from_index = result.config_set_key[actual_config_set]
actual_successor = F[successor_no_la]
actual_successor = final_sets[successor_no_la]
to_index = result.config_set_key[actual_successor]
result.add_successor(from_index, symbol, to_index)
@ -1499,7 +1517,7 @@ class Token(Rule):
def __init__(self, value):
self.value = sys.intern(value)
def flatten(self) -> typing.Generator[list[str], None, None]:
def flatten(self) -> typing.Generator[list["str | Token"], None, None]:
# We are just ourselves when flattened.
yield [self]
@ -1546,7 +1564,7 @@ class AlternativeRule(Rule):
self.left = left
self.right = right
def flatten(self) -> typing.Generator[list[str], None, None]:
def flatten(self) -> typing.Generator[list[str | Token], None, None]:
# All the things from the left of the alternative, then all the things
# from the right, never intermingled.
yield from self.left.flatten()
@ -1562,7 +1580,7 @@ class SequenceRule(Rule):
self.first = first
self.second = second
def flatten(self) -> typing.Generator[list[str], None, None]:
def flatten(self) -> typing.Generator[list[str | Token], None, None]:
# All the things in the prefix....
for first in self.first.flatten():
# ...potentially followed by all the things in the suffix.
@ -1575,7 +1593,7 @@ class NothingRule(Rule):
these, you're probably better off just using the singleton `Nothing`.
"""
def flatten(self) -> typing.Generator[list[str], None, None]:
def flatten(self) -> typing.Generator[list[str | Token], None, None]:
# It's quiet in here.
yield []
@ -1583,7 +1601,7 @@ class NothingRule(Rule):
Nothing = NothingRule()
def seq(*args: list[Rule]) -> Rule:
def seq(*args: Rule) -> Rule:
"""A rule that matches a sequence of rules.
(A helper function that combines its arguments into nested sequences.)
@ -1594,17 +1612,15 @@ def seq(*args: list[Rule]) -> Rule:
return result
@typing.overload
def rule(name: None | str = None) -> typing.Callable[[typing.Callable], Rule]: ...
# @typing.overload
# def rule(f: None | str = None) -> typing.Callable[[typing.Callable], Rule]: ...
@typing.overload
def rule(fn: typing.Callable) -> Rule: ...
# @typing.overload
# def rule(f: typing.Callable) -> Rule: ...
def rule(
name_or_fn: None | str | typing.Callable = None,
) -> Rule | typing.Callable[[typing.Callable], Rule]:
def rule(f: typing.Callable) -> Rule:
"""The decorator that marks a method in a Grammar object as a nonterminal
rule.
@ -1612,16 +1628,11 @@ def rule(
If called with one argument, that argument is a name that overrides the name
of the nonterminal, which defaults to the name of the function.
"""
name = f.__name__
return NonTerminal(f, name)
def _rule(callable):
return NonTerminal(callable, name)
if callable(name_or_fn):
name = name_or_fn.__name__
return _rule(name_or_fn)
else:
name = name_or_fn
return _rule
PrecedenceList = list[typing.Tuple[Assoc, list[Rule]]]
class Grammar:
@ -1650,12 +1661,13 @@ class Grammar:
Not very exciting, perhaps, but it's something.
"""
def __init__(self, precedence: list[typing.Tuple[Assoc, list[Token | NonTerminal]]] = None):
def __init__(self, precedence: PrecedenceList | None = None):
if precedence is None:
precedence = getattr(self, "precedence", [])
assert precedence is not None
precedence_table = {}
for precedence, (associativity, symbols) in enumerate(precedence):
for prec, (associativity, symbols) in enumerate(precedence):
for symbol in symbols:
if isinstance(symbol, Token):
key = symbol.value
@ -1664,7 +1676,7 @@ class Grammar:
else:
raise ValueError(f"{symbol} must be either a Token or a NonTerminal")
precedence_table[key] = (associativity, precedence + 1)
precedence_table[key] = (associativity, prec + 1)
self._precedence = precedence_table