[fine] Parser Table Generator?

Look I'm just thinking hard about converting to a parser generator
because I want to derive the pretty-printer from the parser without
having to repeat myself all over the place.

This parser.py is derived from my old LRParsers project, and should
go back there eventually, but for now I'm driving the work from here.
This commit is contained in:
John Doty 2024-05-04 16:46:36 -07:00
parent a2b3e8b74d
commit 25f9c3ecaf
6 changed files with 2384 additions and 1 deletions

3
.gitignore vendored
View file

@ -2,3 +2,6 @@
/oden-js/target
/oden-js-sys/target
/fine/target
.venv/
__pycache__/

388
fine/grammar/grammar.py Normal file
View file

@ -0,0 +1,388 @@
from parser import Assoc, Grammar, Nothing, Token, rule, seq
# Terminal symbols of the Fine grammar. The string passed to Token() is the
# token-kind name the lexer produces (and the name emitted into the generated
# Rust TokenKind enum).
#
# Keywords and punctuation.
ARROW = Token("Arrow")
AS = Token("As")
BAR = Token("Bar")
CLASS = Token("Class")
COLON = Token("Colon")
ELSE = Token("Else")
FOR = Token("For")
FUN = Token("Fun")
IDENTIFIER = Token("Identifier")
IF = Token("If")
IMPORT = Token("Import")
IN = Token("In")
LCURLY = Token("LeftBrace")
LET = Token("Let")
RCURLY = Token("RightBrace")
RETURN = Token("Return")
SEMICOLON = Token("Semicolon")
STRING = Token("String")
WHILE = Token("While")
EQUAL = Token("Equal")
LPAREN = Token("LeftParen")
RPAREN = Token("RightParen")
COMMA = Token("Comma")
# NOTE(review): "Selff" (double f) is presumably deliberate, to avoid the
# reserved `Self` identifier in the generated Rust enum -- confirm with the
# lexer before "fixing" the spelling.
SELF = Token("Selff")
# Binary and unary operators.
OR = Token("Or")
IS = Token("Is")
AND = Token("And")
EQUALEQUAL = Token("EqualEqual")
BANGEQUAL = Token("BangEqual")
LESS = Token("Less")
GREATER = Token("Greater")
LESSEQUAL = Token("LessEqual")
GREATEREQUAL = Token("GreaterEqual")
PLUS = Token("Plus")
MINUS = Token("Minus")
STAR = Token("Star")
SLASH = Token("Slash")
# Literals and remaining keywords/punctuation.
NUMBER = Token("Number")
TRUE = Token("True")
FALSE = Token("False")
BANG = Token("Bang")
DOT = Token("Dot")
MATCH = Token("Match")
EXPORT = Token("Export")
UNDERSCORE = Token("Underscore")
NEW = Token("New")
LSQUARE = Token("LeftBracket")
RSQUARE = Token("RightBracket")
class FineGrammar(Grammar):
    """The grammar of the Fine language, expressed as `@rule` methods.

    Each `@rule` method returns an alternation of `seq(...)` productions (or
    bare tokens/rules). The table generator resolves shift/reduce conflicts
    with the precedence list passed to `super().__init__`, lowest precedence
    first.
    """

    def __init__(self):
        super().__init__(
            precedence=[
                (Assoc.RIGHT, [EQUAL]),
                (Assoc.LEFT, [OR]),
                (Assoc.LEFT, [IS]),
                (Assoc.LEFT, [AND]),
                (Assoc.LEFT, [EQUALEQUAL, BANGEQUAL]),
                (Assoc.LEFT, [LESS, GREATER, GREATEREQUAL, LESSEQUAL]),
                (Assoc.LEFT, [PLUS, MINUS]),
                (Assoc.LEFT, [STAR, SLASH]),
                (Assoc.LEFT, [self.primary_expression]),
                (Assoc.LEFT, [LPAREN]),
                (Assoc.LEFT, [DOT]),
                #
                # If there's a confusion about whether to make an IF
                # statement or an expression, prefer the statement.
                #
                (Assoc.NONE, [self.if_statement]),
            ]
        )

    # File structure

    @rule
    def file(self):
        """A file is a non-empty list of file-level statements."""
        return self.file_statement_list

    @rule
    def file_statement_list(self):
        return self.file_statement | (self.file_statement_list + self.file_statement)

    @rule
    def file_statement(self):
        return (
            self.import_statement | self.class_declaration | self.export_statement | self.statement
        )

    @rule
    def import_statement(self):
        """`import "path" as name;`"""
        return seq(IMPORT, STRING, AS, IDENTIFIER, SEMICOLON)

    # Classes

    @rule
    def class_declaration(self):
        return seq(CLASS, IDENTIFIER, self.class_body)

    @rule
    def class_body(self):
        # The empty body is a distinct production rather than an empty
        # member list, which keeps the member list rules non-nullable.
        return seq(LCURLY, RCURLY) | seq(LCURLY, self.class_members, RCURLY)

    @rule
    def class_members(self):
        return self.class_member | seq(self.class_members, self.class_member)

    @rule
    def class_member(self):
        return self.field_declaration | self.function_declaration

    @rule
    def field_declaration(self):
        return seq(IDENTIFIER, COLON, self.type_expression, SEMICOLON)

    # Types

    @rule
    def type_expression(self):
        return self.alternate_type | self.type_identifier

    @rule
    def alternate_type(self):
        """Union types: `A | B | C` (left-recursive on type_expression)."""
        return seq(self.type_expression, BAR, self.type_identifier)

    @rule
    def type_identifier(self):
        return IDENTIFIER

    # Exports

    @rule
    def export_statement(self):
        return (
            seq(EXPORT, self.class_declaration)
            | seq(EXPORT, self.function_declaration)
            | seq(EXPORT, self.let_statement)
            | seq(EXPORT, self.export_list, SEMICOLON)
        )

    @rule
    def export_list(self):
        return Nothing | IDENTIFIER | seq(IDENTIFIER, COMMA, self.export_list)

    # Functions

    @rule
    def function_declaration(self):
        # Return type (`-> T`) is optional.
        return seq(FUN, IDENTIFIER, self.function_parameters, self.block) | seq(
            FUN, IDENTIFIER, self.function_parameters, ARROW, self.type_expression, self.block
        )

    @rule
    def function_parameters(self):
        return (
            seq(LPAREN, RPAREN)
            | seq(LPAREN, self.first_parameter, RPAREN)
            | seq(LPAREN, self.first_parameter, COMMA, self.parameter_list, RPAREN)
        )

    @rule
    def first_parameter(self):
        # Only the first parameter may be `self` (method receiver).
        return SELF | self.parameter

    @rule
    def parameter_list(self):
        return Nothing | self.parameter | seq(self.parameter, COMMA, self.parameter_list)

    @rule
    def parameter(self):
        return seq(IDENTIFIER, COLON, self.type_expression)

    # Block

    @rule
    def block(self):
        # A block may end with a trailing expression, which is the block's
        # value (Rust-style).
        return (
            seq(LCURLY, RCURLY)
            | seq(LCURLY, self.statement_list, RCURLY)
            | seq(LCURLY, self.statement_list, self.expression, RCURLY)
        )

    @rule
    def statement_list(self):
        return self.statement | seq(self.statement_list, self.statement)

    @rule
    def statement(self):
        return (
            self.function_declaration
            | self.let_statement
            | self.return_statement
            | self.for_statement
            | self.if_statement
            | self.while_statement
            | self.expression_statement
        )

    @rule
    def let_statement(self):
        return seq(LET, IDENTIFIER, EQUAL, self.expression, SEMICOLON)

    @rule
    def return_statement(self):
        return seq(RETURN, self.expression, SEMICOLON)

    @rule
    def for_statement(self):
        return seq(FOR, self.iterator_variable, IN, self.expression, self.block)

    @rule
    def iterator_variable(self):
        return IDENTIFIER

    @rule
    def if_statement(self):
        # An if statement is just a conditional expression in statement
        # position; the precedence declaration in __init__ prefers this
        # reading when both parses are possible.
        return self.conditional_expression

    @rule
    def while_statement(self):
        return seq(WHILE, self.expression, self.block)

    @rule
    def expression_statement(self):
        return seq(self.expression, SEMICOLON)

    # Expressions, from lowest to highest precedence.

    @rule
    def expression(self):
        return self.assignment_expression

    @rule
    def assignment_expression(self):
        # Right-associative via right recursion.
        return seq(self.or_expression, EQUAL, self.assignment_expression) | self.or_expression

    @rule
    def or_expression(self):
        return seq(self.or_expression, OR, self.is_expression) | self.is_expression

    @rule
    def is_expression(self):
        return seq(self.is_expression, IS, self.pattern) | self.and_expression

    @rule
    def and_expression(self):
        return seq(self.and_expression, AND, self.equality_expression) | self.equality_expression

    @rule
    def equality_expression(self):
        return (
            seq(self.equality_expression, EQUALEQUAL, self.relation_expression)
            | seq(self.equality_expression, BANGEQUAL, self.relation_expression)
            | self.relation_expression
        )

    @rule
    def relation_expression(self):
        # BUG FIX: the base alternative `self.additive_expression` was
        # missing, so the precedence chain could never pass through a
        # relational context down to additive expressions -- every other
        # level in the chain has this fall-through alternative.
        return (
            seq(self.relation_expression, LESS, self.additive_expression)
            | seq(self.relation_expression, LESSEQUAL, self.additive_expression)
            | seq(self.relation_expression, GREATER, self.additive_expression)
            | seq(self.relation_expression, GREATEREQUAL, self.additive_expression)
            | self.additive_expression
        )

    @rule
    def additive_expression(self):
        return (
            seq(self.additive_expression, PLUS, self.multiplication_expression)
            | seq(self.additive_expression, MINUS, self.multiplication_expression)
            | self.multiplication_expression
        )

    @rule
    def multiplication_expression(self):
        return (
            seq(self.multiplication_expression, STAR, self.primary_expression)
            | seq(self.multiplication_expression, SLASH, self.primary_expression)
            | self.primary_expression
        )

    @rule
    def primary_expression(self):
        return (
            IDENTIFIER
            | SELF
            | NUMBER
            | STRING
            | TRUE
            | FALSE
            | seq(BANG, self.primary_expression)
            | seq(MINUS, self.primary_expression)
            | self.block
            | self.conditional_expression
            | self.list_constructor_expression
            | self.object_constructor_expression
            | self.match_expression
            | seq(self.primary_expression, LPAREN, self.expression_list, RPAREN)
            | seq(self.primary_expression, DOT, IDENTIFIER)
            | seq(LPAREN, self.expression, RPAREN)
        )

    @rule
    def conditional_expression(self):
        # `else` may chain another conditional (else-if) or a plain block.
        return (
            seq(IF, self.expression, self.block)
            | seq(IF, self.expression, self.block, ELSE, self.conditional_expression)
            | seq(IF, self.expression, self.block, ELSE, self.block)
        )

    @rule
    def list_constructor_expression(self):
        return seq(LSQUARE, RSQUARE) | seq(LSQUARE, self.expression_list, RSQUARE)

    @rule
    def expression_list(self):
        # Comma-separated, trailing comma allowed.
        return (
            self.expression
            | seq(self.expression, COMMA)
            | seq(self.expression, COMMA, self.expression_list)
        )

    # Match

    @rule
    def match_expression(self):
        return seq(MATCH, self.match_body)

    @rule
    def match_body(self):
        return seq(LCURLY, RCURLY) | seq(LCURLY, self.match_arms, RCURLY)

    @rule
    def match_arms(self):
        # Comma-separated, trailing comma allowed.
        return (
            self.match_arm
            | seq(self.match_arm, COMMA)
            | seq(self.match_arm, COMMA, self.match_arms)
        )

    @rule
    def match_arm(self):
        return seq(self.pattern, ARROW, self.expression)

    @rule
    def pattern(self):
        # Optional variable binding, required core, optional guard
        # (`and <expr>`).
        return (
            seq(self.variable_binding, self.pattern_core, AND, self.and_expression)
            | seq(self.variable_binding, self.pattern_core)
            | seq(self.pattern_core, AND, self.and_expression)
            | self.pattern_core
        )

    @rule
    def pattern_core(self):
        return self.type_expression | self.wildcard_pattern

    @rule
    def wildcard_pattern(self):
        return UNDERSCORE

    @rule
    def variable_binding(self):
        return seq(IDENTIFIER, COLON)

    # Object construction

    @rule
    def object_constructor_expression(self):
        return seq(NEW, self.type_identifier, self.field_list)

    @rule
    def field_list(self):
        return seq(LCURLY, RCURLY) | seq(LCURLY, self.field_values, RCURLY)

    @rule
    def field_values(self):
        # Comma-separated, trailing comma allowed.
        return (
            self.field_value
            | seq(self.field_value, COMMA)
            | seq(self.field_value, COMMA, self.field_values)
        )

    @rule
    def field_value(self):
        # `name` alone is shorthand for `name: name`.
        return IDENTIFIER | seq(IDENTIFIER, COLON, self.expression)
# Build the parse table starting from the `file` rule and print some
# size/density statistics about it.
grammar = FineGrammar()
table = grammar.build_table(start="file")
print(f"{len(table)} states")
# Each row is one state's action map; report average and worst-case fanout.
average_entries = sum(len(row) for row in table) / len(table)
max_entries = max(len(row) for row in table)
print(f"{average_entries} average, {max_entries} max")
# print(parser_faster.format_table(gen, table))
# print()
# tree = parse(table, ["id", "+", "(", "id", "[", "id", "]", ")"])

422
fine/grammar/guessing.rs Normal file
View file

@ -0,0 +1,422 @@
// NOTE: Utterly Broken Ideas about Parse Tables.
//
// Committing this here so I can back it up.
use std::collections::HashSet;
/// Every grammar rule the parser can reduce by. Used to select a goto
/// target after a reduction (see `ParseThread::reduce`, which searches
/// `ParseTable::tree_rules` for a matching entry).
#[derive(Copy, Clone, Debug, Eq, PartialEq, Hash)]
pub enum ReduceRule {
    // Generated
    AlternateType,
    Argument,
    ArgumentList,
    BinaryExpression,
    Block,
    CallExpression,
    ClassDecl,
    ConditionalExpression,
    ExpressionStatement,
    FieldDecl,
    FieldList,
    FieldValue,
    File,
    ForStatement,
    FunctionDecl,
    GroupingExpression,
    Identifier,
    IfStatement,
    IsExpression,
    IteratorVariable,
    LetStatement,
    ListConstructor,
    ListConstructorElement,
    LiteralExpression,
    MatchArm,
    MatchBody,
    MatchExpression,
    MemberAccess,
    NewObjectExpression,
    ParamList,
    Parameter,
    Pattern,
    ReturnStatement,
    ReturnType,
    SelfParameter,
    SelfReference,
    TypeExpression,
    TypeIdentifier,
    TypeParameter,
    TypeParameterList,
    UnaryExpression,
    VariableBinding,
    WhileStatement,
    WildcardPattern,
    // Added after the generated batch above (hence out of sorted order).
    Import,
    Export,
    ExportList,
}
/// One entry in the action table: what to do when a given token kind is
/// seen in a given state.
#[derive(Eq, PartialEq, Hash, Copy, Clone)]
pub enum TokenAction {
    /// No action for this token in this state; triggers error recovery.
    Error,
    /// Reduce `u16` stack entries by the rule, building a named tree of
    /// the given `TreeKind`.
    Reduce(ReduceRule, TreeKind, u16),
    /// Reduce `u16` stack entries without creating a named tree; the
    /// children are spliced into the parent (see `StackEntry::AnonTree`).
    ReduceAnonymous(ReduceRule, u16),
    /// The parse is complete.
    Accept,
    /// Shift the current token and move to state `u16`.
    Shift(u16),
}
/// One parser state. The fields are half-open index ranges into the flat
/// `ParseTable` arrays: `[action_start, action_end)` indexes
/// `token_kind`/`token_action` in parallel, and `[goto_start, goto_end)`
/// indexes `tree_rules`/`tree_goto` in parallel.
pub struct ParseState {
    action_start: usize,
    action_end: usize,
    goto_start: usize,
    goto_end: usize,
}
/// A complete, borrowed parse table. Actions and gotos for all states are
/// flattened into shared arrays; each `ParseState` holds index ranges into
/// them (see `ParseState`). `token_kind[i]` is the lookahead that selects
/// `token_action[i]`; `tree_rules[i]` is the rule that selects goto state
/// `tree_goto[i]`.
pub struct ParseTable<'a> {
    state: &'a [ParseState],
    start_state: usize,
    token_action: &'a [TokenAction],
    token_kind: &'a [TokenKind],
    tree_goto: &'a [u16],
    tree_rules: &'a [ReduceRule],
}
/// The value half of a parse-stack entry (paired with a state index).
#[derive(Clone)]
enum StackEntry {
    /// Sentinel for the bottom of the stack; popping it is a bug.
    Nothing,
    /// A completed, named subtree.
    Tree(TreeRef),
    /// Children of an anonymous reduction, held loose so they can be
    /// spliced into the eventual named parent.
    AnonTree(Vec<Child>),
    /// A shifted token.
    Token(TokenRef),
    /// An error token; ignored when counting entries during reduction.
    Error(TokenRef),
}
/// One speculative parse attempt. Error recovery forks threads; the thread
/// with the highest `score` that accepts wins (see `table_parse`).
#[derive(Clone)]
struct ParseThread {
    /// LR stack of (state index, value) pairs.
    stack: Vec<(usize, StackEntry)>,
    /// Countdown of shifts remaining before we leave panic-mode recovery.
    panic_count: u8,
    /// Total errors hit; threads stop expanding at THREAD_ERROR_LIMIT.
    error_count: u8,
    /// Quality metric: +1 per successful action, -1 per error/idle tick.
    score: u32,
}
impl ParseThread {
    /// A fresh thread with only the start state (and a `Nothing` sentinel)
    /// on its stack.
    fn initial(start_state: usize) -> ParseThread {
        ParseThread {
            stack: vec![(start_state, StackEntry::Nothing)],
            error_count: 0,
            panic_count: 0,
            score: 0,
        }
    }

    /// Pop `count` real entries off the stack, wrap them as the children of
    /// a new tree (named if `kind` is `Some`, anonymous otherwise), and push
    /// the result in the goto state selected by `rule`.
    ///
    /// `Error` entries are popped but not counted toward `count`, so error
    /// tokens ride along inside whatever reduction encloses them.
    ///
    /// NOTE(review): children are pushed in pop order (rightmost child
    /// first) and never reversed before being stored -- confirm that
    /// consumers of `Tree::children` expect reversed order, or that this is
    /// part of why the file is marked "utterly broken".
    fn reduce(
        &mut self,
        table: &ParseTable,
        syntax: &mut SyntaxTree,
        count: u16,
        rule: ReduceRule,
        kind: Option<TreeKind>,
    ) {
        let mut children = Vec::new();
        let count: usize = count.into();
        let mut consumed = 0;
        while consumed < count {
            // Underflow exits the loop; the assert below turns it into a
            // panic with a useful message.
            let Some((_, value)) = self.stack.pop() else {
                break;
            };
            match value {
                StackEntry::Nothing => panic!("Popped nothing!"),
                StackEntry::Tree(t) => {
                    consumed += 1;
                    children.push(Child::Tree(t));
                }
                StackEntry::AnonTree(mut cs) => {
                    // Splice an anonymous reduction's children directly in.
                    consumed += 1;
                    children.append(&mut cs);
                }
                StackEntry::Token(t) => {
                    consumed += 1;
                    children.push(Child::Token(t));
                }
                StackEntry::Error(t) => {
                    // Do *not* increment consumed; these don't count!
                    children.push(Child::Token(t));
                }
            }
        }
        assert_eq!(consumed, count, "Stack underflow on reduce");
        let value = if let Some(kind) = kind {
            // Named reduction: record a real tree in the syntax arena.
            // self_ref/parent/positions are placeholders here; presumably
            // add_tree fixes them up -- not visible in this file.
            let tr = syntax.add_tree(Tree {
                kind,
                self_ref: TreeRef::from_index(0),
                parent: None,
                start_pos: 0,
                end_pos: 0,
                children,
            });
            StackEntry::Tree(tr)
        } else {
            StackEntry::AnonTree(children)
        };
        // Goto: find the target state for `rule` in the exposed state's
        // goto range (parallel arrays tree_rules/tree_goto).
        let (goto_index, _) = self.stack.last().unwrap();
        let goto_state = &table.state[*goto_index];
        let index: usize = (goto_state.goto_start..goto_state.goto_end)
            .find(|i| table.tree_rules[*i] == rule)
            .expect("Unable to goto target after reduction")
            .into();
        let target_state: usize = table.tree_goto[index].into();
        self.stack.push((target_state, value));
    }

    /// Push the shifted token and enter `state`.
    fn shift(&mut self, state: u16, tr: TokenRef) {
        let target_state: usize = state.into();
        self.stack.push((target_state, StackEntry::Token(tr)));
    }
}
// This is what we set the panic level to when we get an error; we require
// this many successful token shifts to decide we're not lost.
const PANIC_THRESHOLD: u8 = 3;

// This is the maximum number of failed states that we're going to go through
// before we just try to reduce all the way out of the tree.
const THREAD_ERROR_LIMIT: u8 = 20;
/// Parse `source` with the given table, using a multi-thread (GLR-ish)
/// strategy for error recovery: on an error, a thread forks once per
/// possible action in the current state and the fork with the highest score
/// that eventually accepts supplies the syntax tree root.
///
/// Returns the syntax tree (root set only if some thread accepted with a
/// named tree on top) and the line table from the tokenizer.
pub fn table_parse(source: &str, table: &ParseTable) -> (Rc<SyntaxTree>, Rc<Lines>) {
    let mut tokens = Tokens::new(source);
    let mut syntax = SyntaxTree::new();
    let mut threads = vec![ParseThread::initial(table.start_state)];
    let mut next_threads = vec![];
    let mut accepted_threads: Vec<ParseThread> = vec![];
    // Set when error recovery speculatively reduces, which may record trees
    // in `syntax` that the winning thread never uses; see cleanup at the end.
    let mut maybe_pushed_garbage = false;
    // While we still have threads to run....
    while threads.len() > 0 {
        // We've still got live threads running, which means we've still got
        // tokens to consume! Any thread that has accepted "early" should be
        // penalized here.
        for thread in accepted_threads.iter_mut() {
            if thread.score > 0 {
                thread.score -= 1;
            }
        }
        // Grab us the next token from the stream.
        // TODO: Collect ephemera before setting on the token.
        let token = tokens.next();
        let current_token = token.kind;
        let current_token_ref = syntax.add_token(token, vec![]);
        // Go over every thread in the list of threads to run. If a thread
        // needs to keep running on this token it can push itself back onto
        // the stack, and we'll re-consider it next time. (This is necessary
        // for both reduce and for error handling.)
        while let Some(mut thread) = threads.pop() {
            // Look up the action for (current state, current token);
            // a missing entry means Error.
            let (state, _) = thread.stack.last().unwrap();
            let state = &table.state[*state];
            let action = (state.action_start..state.action_end)
                .find(|i| table.token_kind[*i] == current_token)
                .map(|i| &table.token_action[i])
                .unwrap_or(&TokenAction::Error);
            match action {
                TokenAction::Reduce(rule, kind, count) => {
                    thread.reduce(table, &mut syntax, *count, *rule, Some(*kind));
                    thread.score += 1;
                    threads.push(thread); // Run me again, I can still work with this token.
                }
                TokenAction::ReduceAnonymous(rule, count) => {
                    thread.reduce(table, &mut syntax, *count, *rule, None);
                    thread.score += 1;
                    threads.push(thread); // Run me again, I can still work with this token.
                }
                TokenAction::Shift(state) => {
                    thread.shift(*state, current_token_ref);
                    thread.score += 1;
                    if thread.panic_count > 0 {
                        thread.panic_count -= 1;
                    } else if thread.error_count > 0 {
                        // TODO: We shifted a good number of tokens in a row,
                        // maybe we should consider reducing the error count
                        // here too, so that this thread might live for
                        // longer.
                    }
                    next_threads.push(thread);
                }
                TokenAction::Accept => {
                    thread.score += 1;
                    accepted_threads.push(thread);
                }
                // Error handling, the bane of LR parsers!
                //
                // In this parser, we borrow a trick from Tree-Sitter and
                // treat the parse error as if it were an ambiguity: we see a
                // token but don't know what to do with it, so we'll just try
                // to do *everything* with it and see what sticks.
                //
                // The tricky part here is not causing an enormous explosion
                // of threads, so we have certain conditions where we just
                // give up and refuse to consider any more tokens for a given
                // error thread.
                //
                TokenAction::Error => {
                    // First, report the error. (We use a pretty standard
                    // "panic" error recovery mode here to decide when to
                    // start showing new error messages, otherwise we would
                    // just generate *way* too many cascading errors.)
                    //
                    if thread.panic_count == 0 {
                        // TODO: Get a description for this state from the table somehow.
                        // TODO: Describe the error in an error message somehow.
                        let token = &syntax[current_token_ref];
                        let error_token = syntax.add_token(
                            Token::error(token.start(), token.end(), format!("PARSE ERROR")),
                            vec![],
                        );
                        // NOTE: `Error` stack entries are not counted when
                        // reducing, so we know this push here won't mess up
                        // the state machine.
                        thread.stack.push((0, StackEntry::Error(error_token)));
                    }
                    // Now mark the thread as panicing so that we don't
                    // produce too many random errors...
                    thread.panic_count = PANIC_THRESHOLD;
                    // Count the error.
                    // TODO: Check to see if this really does help thread explosion or not.
                    if thread.error_count < THREAD_ERROR_LIMIT {
                        thread.error_count += 1;
                    }
                    // Penalize this thread; this is not a great parse, we can tell.
                    if thread.score > 0 {
                        thread.score -= 1;
                    }
                    // Fork: try every distinct action available in this
                    // state as if it applied to this token.
                    let mut executed = HashSet::new();
                    for index in state.action_start..state.action_end {
                        // Make absolutely sure we don't do the same thing
                        // twice! It can happen, and it is hugely wasteful
                        // because it spawns duplicate threads.
                        let action = &table.token_action[index];
                        if executed.contains(action) {
                            continue;
                        }
                        executed.insert(action.clone());
                        match action {
                            TokenAction::Error => {
                                panic!("Literal error in the table; table is corrupt")
                            }
                            TokenAction::Reduce(rule, kind, count) => {
                                // Let's pretend that we're done with the
                                // current rule and see what happens.
                                let mut new_thread = thread.clone();
                                new_thread.reduce(&table, &mut syntax, *count, *rule, Some(*kind));
                                threads.push(new_thread);
                                // Mark that we might have to trim the syntax
                                // tree because we might not use this
                                // reduction.
                                maybe_pushed_garbage = true;
                            }
                            TokenAction::ReduceAnonymous(rule, count) => {
                                // Let's pretend that we're done with the
                                // current rule and see what happens.
                                let mut new_thread = thread.clone();
                                new_thread.reduce(&table, &mut syntax, *count, *rule, None);
                                threads.push(new_thread);
                            }
                            TokenAction::Shift(state) => {
                                // Let's just pretend the current token
                                // matched this thing that we were looking
                                // for, and shift it anyway, and see what
                                // happens.
                                //
                                // This represents an expansion of the search
                                // space and so we only want to do it if we
                                // haven't reached our error limit yet.
                                if thread.error_count < THREAD_ERROR_LIMIT {
                                    let mut new_thread = thread.clone();
                                    new_thread.shift(*state, current_token_ref);
                                    next_threads.push(new_thread);
                                }
                            }
                            TokenAction::Accept => accepted_threads.push(thread.clone()),
                        }
                    }
                    // Let's try to process the *next* token and see what
                    // happens with this same thread, unless we're giving up
                    // on the thread.
                    if thread.error_count < THREAD_ERROR_LIMIT {
                        next_threads.push(thread);
                    }
                }
            }
        }
        // Drain all the next_threads into the current stack and start again
        // on the next token!
        threads.append(&mut next_threads);
    }
    // OK no more threads, we're done. In theory at this point we should
    // penalize all accepted threads for remaining tokens but if we've got no
    // more threads and there are remaining tokens then they all hit their
    // error limit and are basically equivalent. (Why penalize all threads by
    // the same amount?)
    //
    // Let's just go through all the threads that "accepted" and pick the one
    // with the highest score that also wound up with a named tree at the top.
    let mut best_score = 0;
    for thread in accepted_threads {
        // `>=` plus the `+ 1` below means a later thread must strictly beat
        // the current winner to replace it.
        if thread.score >= best_score {
            if let Some((_, StackEntry::Tree(tr))) = thread.stack.last() {
                syntax.root = Some(*tr);
                best_score = thread.score + 1;
            }
        }
    }
    // Now, our syntax tree might have errors in it, and if it does we might
    // have pushed trees that we have no interest in ever seeing ever again.
    // That means that we need to rewrite the tree starting from the root, to
    // make sure that the trees in the syntax tree are for real for real.
    if maybe_pushed_garbage {
        // Mark: everything reachable from the root is valid...
        let mut valid = HashSet::new();
        let mut stack = Vec::new();
        if let Some(tr) = &syntax.root {
            stack.push(*tr);
        }
        while let Some(tr) = stack.pop() {
            valid.insert(tr);
            for x in syntax[tr].child_trees() {
                stack.push(x);
            }
        }
        // ...sweep: everything else is neutered to Ignore (not removed, so
        // existing TreeRef indices stay valid).
        for tr in syntax.trees.iter_mut() {
            if !valid.contains(&tr.self_ref) {
                tr.kind = TreeKind::Ignore;
            }
        }
    }
    (Rc::new(syntax), Rc::new(tokens.lines()))
}

1528
fine/grammar/parser.py Normal file

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,2 @@
[tool.black]
line-length=100

40
fine/grammar/rust.py Normal file
View file

@ -0,0 +1,40 @@
import io
import parser
def generate_rust_parser(output: io.TextIOBase, table: list[dict[str, parser.Action]]):
    """Emit Rust source for the given parse table into `output`.

    NOTE(review): the diff header says this file is 40 lines, but only the
    portion through the trailing `pass` is visible here -- the function
    appears unfinished (table emission is a stub). Confirm against the full
    file before relying on this docstring.
    """
    lines = []
    # Collect the distinct TreeKind names: the second element of every
    # reduce action, excluding anonymous rules (names starting with "_").
    tree_kinds = list(
        sorted(
            {
                action[1]
                for state in table
                for action in state.values()
                if action[0] == "reduce" and action[1][0] != "_"
            }
        )
    )
    # First, generate the treekind enumeration
    lines.extend(
        [
            "#[derive(Debug, Eq, PartialEq)]",
            "pub enum TreeKind {",
            " Error,",
            "",
        ]
    )
    lines.extend(f" {kind}," for kind in tree_kinds)
    lines.extend(
        [
            "}",
            "",
        ]
    )
    # Next generate the parse table
    lines.extend([])  # placeholder; table emission not implemented yet
    pass