diff --git a/.gitignore b/.gitignore index e5f2ac2a..8582096f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,7 @@ /target /oden-js/target /oden-js-sys/target -/fine/target \ No newline at end of file +/fine/target + +.venv/ +__pycache__/ \ No newline at end of file diff --git a/fine/grammar/grammar.py b/fine/grammar/grammar.py new file mode 100644 index 00000000..aa1985ca --- /dev/null +++ b/fine/grammar/grammar.py @@ -0,0 +1,388 @@ +from parser import Assoc, Grammar, Nothing, Token, rule, seq + +ARROW = Token("Arrow") +AS = Token("As") +BAR = Token("Bar") +CLASS = Token("Class") +COLON = Token("Colon") +ELSE = Token("Else") +FOR = Token("For") +FUN = Token("Fun") +IDENTIFIER = Token("Identifier") +IF = Token("If") +IMPORT = Token("Import") +IN = Token("In") +LCURLY = Token("LeftBrace") +LET = Token("Let") +RCURLY = Token("RightBrace") +RETURN = Token("Return") +SEMICOLON = Token("Semicolon") +STRING = Token("String") +WHILE = Token("While") +EQUAL = Token("Equal") +LPAREN = Token("LeftParen") +RPAREN = Token("RightParen") +COMMA = Token("Comma") +SELF = Token("Selff") +OR = Token("Or") +IS = Token("Is") +AND = Token("And") +EQUALEQUAL = Token("EqualEqual") +BANGEQUAL = Token("BangEqual") +LESS = Token("Less") +GREATER = Token("Greater") +LESSEQUAL = Token("LessEqual") +GREATEREQUAL = Token("GreaterEqual") +PLUS = Token("Plus") +MINUS = Token("Minus") +STAR = Token("Star") +SLASH = Token("Slash") +NUMBER = Token("Number") +TRUE = Token("True") +FALSE = Token("False") +BANG = Token("Bang") +DOT = Token("Dot") +MATCH = Token("Match") +EXPORT = Token("Export") +UNDERSCORE = Token("Underscore") +NEW = Token("New") +LSQUARE = Token("LeftBracket") +RSQUARE = Token("RightBracket") + + +class FineGrammar(Grammar): + def __init__(self): + super().__init__( + precedence=[ + (Assoc.RIGHT, [EQUAL]), + (Assoc.LEFT, [OR]), + (Assoc.LEFT, [IS]), + (Assoc.LEFT, [AND]), + (Assoc.LEFT, [EQUALEQUAL, BANGEQUAL]), + (Assoc.LEFT, [LESS, GREATER, GREATEREQUAL, LESSEQUAL]), + 
(Assoc.LEFT, [PLUS, MINUS]), + (Assoc.LEFT, [STAR, SLASH]), + (Assoc.LEFT, [self.primary_expression]), + (Assoc.LEFT, [LPAREN]), + (Assoc.LEFT, [DOT]), + # + # If there's a confusion about whether to make an IF + # statement or an expression, prefer the statement. + # + (Assoc.NONE, [self.if_statement]), + ] + ) + + @rule + def file(self): + return self.file_statement_list + + @rule + def file_statement_list(self): + return self.file_statement | (self.file_statement_list + self.file_statement) + + @rule + def file_statement(self): + return ( + self.import_statement | self.class_declaration | self.export_statement | self.statement + ) + + @rule + def import_statement(self): + return seq(IMPORT, STRING, AS, IDENTIFIER, SEMICOLON) + + @rule + def class_declaration(self): + return seq(CLASS, IDENTIFIER, self.class_body) + + @rule + def class_body(self): + return seq(LCURLY, RCURLY) | seq(LCURLY, self.class_members, RCURLY) + + @rule + def class_members(self): + return self.class_member | seq(self.class_members, self.class_member) + + @rule + def class_member(self): + return self.field_declaration | self.function_declaration + + @rule + def field_declaration(self): + return seq(IDENTIFIER, COLON, self.type_expression, SEMICOLON) + + # Types + @rule + def type_expression(self): + return self.alternate_type | self.type_identifier + + @rule + def alternate_type(self): + return seq(self.type_expression, BAR, self.type_identifier) + + @rule + def type_identifier(self): + return IDENTIFIER + + @rule + def export_statement(self): + return ( + seq(EXPORT, self.class_declaration) + | seq(EXPORT, self.function_declaration) + | seq(EXPORT, self.let_statement) + | seq(EXPORT, self.export_list, SEMICOLON) + ) + + @rule + def export_list(self): + return Nothing | IDENTIFIER | seq(IDENTIFIER, COMMA, self.export_list) + + # Functions + @rule + def function_declaration(self): + return seq(FUN, IDENTIFIER, self.function_parameters, self.block) | seq( + FUN, IDENTIFIER, 
self.function_parameters, ARROW, self.type_expression, self.block + ) + + @rule + def function_parameters(self): + return ( + seq(LPAREN, RPAREN) + | seq(LPAREN, self.first_parameter, RPAREN) + | seq(LPAREN, self.first_parameter, COMMA, self.parameter_list, RPAREN) + ) + + @rule + def first_parameter(self): + return SELF | self.parameter + + @rule + def parameter_list(self): + return Nothing | self.parameter | seq(self.parameter, COMMA, self.parameter_list) + + @rule + def parameter(self): + return seq(IDENTIFIER, COLON, self.type_expression) + + # Block + @rule + def block(self): + return ( + seq(LCURLY, RCURLY) + | seq(LCURLY, self.statement_list, RCURLY) + | seq(LCURLY, self.statement_list, self.expression, RCURLY) + ) + + @rule + def statement_list(self): + return self.statement | seq(self.statement_list, self.statement) + + @rule + def statement(self): + return ( + self.function_declaration + | self.let_statement + | self.return_statement + | self.for_statement + | self.if_statement + | self.while_statement + | self.expression_statement + ) + + @rule + def let_statement(self): + return seq(LET, IDENTIFIER, EQUAL, self.expression, SEMICOLON) + + @rule + def return_statement(self): + return seq(RETURN, self.expression, SEMICOLON) + + @rule + def for_statement(self): + return seq(FOR, self.iterator_variable, IN, self.expression, self.block) + + @rule + def iterator_variable(self): + return IDENTIFIER + + @rule + def if_statement(self): + return self.conditional_expression + + @rule + def while_statement(self): + return seq(WHILE, self.expression, self.block) + + @rule + def expression_statement(self): + return seq(self.expression, SEMICOLON) + + # Expressions + @rule + def expression(self): + return self.assignment_expression + + @rule + def assignment_expression(self): + return seq(self.or_expression, EQUAL, self.assignment_expression) | self.or_expression + + @rule + def or_expression(self): + return seq(self.or_expression, OR, self.is_expression) | 
self.is_expression + + @rule + def is_expression(self): + return seq(self.is_expression, IS, self.pattern) | self.and_expression + + @rule + def and_expression(self): + return seq(self.and_expression, AND, self.equality_expression) | self.equality_expression + + @rule + def equality_expression(self): + return ( + seq(self.equality_expression, EQUALEQUAL, self.relation_expression) + | seq(self.equality_expression, BANGEQUAL, self.relation_expression) + | self.relation_expression + ) + + @rule + def relation_expression(self): + return ( + seq(self.relation_expression, LESS, self.additive_expression) + | seq(self.relation_expression, LESSEQUAL, self.additive_expression) + | seq(self.relation_expression, GREATER, self.additive_expression) + | seq(self.relation_expression, GREATEREQUAL, self.additive_expression) + ) + + @rule + def additive_expression(self): + return ( + seq(self.additive_expression, PLUS, self.multiplication_expression) + | seq(self.additive_expression, MINUS, self.multiplication_expression) + | self.multiplication_expression + ) + + @rule + def multiplication_expression(self): + return ( + seq(self.multiplication_expression, STAR, self.primary_expression) + | seq(self.multiplication_expression, SLASH, self.primary_expression) + | self.primary_expression + ) + + @rule + def primary_expression(self): + return ( + IDENTIFIER + | SELF + | NUMBER + | STRING + | TRUE + | FALSE + | seq(BANG, self.primary_expression) + | seq(MINUS, self.primary_expression) + | self.block + | self.conditional_expression + | self.list_constructor_expression + | self.object_constructor_expression + | self.match_expression + | seq(self.primary_expression, LPAREN, self.expression_list, RPAREN) + | seq(self.primary_expression, DOT, IDENTIFIER) + | seq(LPAREN, self.expression, RPAREN) + ) + + @rule + def conditional_expression(self): + return ( + seq(IF, self.expression, self.block) + | seq(IF, self.expression, self.block, ELSE, self.conditional_expression) + | seq(IF, 
self.expression, self.block, ELSE, self.block) + ) + + @rule + def list_constructor_expression(self): + return seq(LSQUARE, RSQUARE) | seq(LSQUARE, self.expression_list, RSQUARE) + + @rule + def expression_list(self): + return ( + self.expression + | seq(self.expression, COMMA) + | seq(self.expression, COMMA, self.expression_list) + ) + + @rule + def match_expression(self): + return seq(MATCH, self.match_body) + + @rule + def match_body(self): + return seq(LCURLY, RCURLY) | seq(LCURLY, self.match_arms, RCURLY) + + @rule + def match_arms(self): + return ( + self.match_arm + | seq(self.match_arm, COMMA) + | seq(self.match_arm, COMMA, self.match_arms) + ) + + @rule + def match_arm(self): + return seq(self.pattern, ARROW, self.expression) + + @rule + def pattern(self): + return ( + seq(self.variable_binding, self.pattern_core, AND, self.and_expression) + | seq(self.variable_binding, self.pattern_core) + | seq(self.pattern_core, AND, self.and_expression) + | self.pattern_core + ) + + @rule + def pattern_core(self): + return self.type_expression | self.wildcard_pattern + + @rule + def wildcard_pattern(self): + return UNDERSCORE + + @rule + def variable_binding(self): + return seq(IDENTIFIER, COLON) + + @rule + def object_constructor_expression(self): + return seq(NEW, self.type_identifier, self.field_list) + + @rule + def field_list(self): + return seq(LCURLY, RCURLY) | seq(LCURLY, self.field_values, RCURLY) + + @rule + def field_values(self): + return ( + self.field_value + | seq(self.field_value, COMMA) + | seq(self.field_value, COMMA, self.field_values) + ) + + @rule + def field_value(self): + return IDENTIFIER | seq(IDENTIFIER, COLON, self.expression) + + +grammar = FineGrammar() +table = grammar.build_table(start="file") + +print(f"{len(table)} states") + +average_entries = sum(len(row) for row in table) / len(table) +max_entries = max(len(row) for row in table) +print(f"{average_entries} average, {max_entries} max") + +# print(parser_faster.format_table(gen, 
table)) +# print() +# tree = parse(table, ["id", "+", "(", "id", "[", "id", "]", ")"]) diff --git a/fine/grammar/guessing.rs b/fine/grammar/guessing.rs new file mode 100644 index 00000000..2788a0ad --- /dev/null +++ b/fine/grammar/guessing.rs @@ -0,0 +1,422 @@ +// NOTE: Utterly Broken Ideas about Parse Tables. +// +// Committing this here so I can back it up. +use std::collections::HashSet; + +#[derive(Copy, Clone, Debug, Eq, PartialEq, Hash)] +pub enum ReduceRule { + // Generated + AlternateType, + Argument, + ArgumentList, + BinaryExpression, + Block, + CallExpression, + ClassDecl, + ConditionalExpression, + ExpressionStatement, + FieldDecl, + FieldList, + FieldValue, + File, + ForStatement, + FunctionDecl, + GroupingExpression, + Identifier, + IfStatement, + IsExpression, + IteratorVariable, + LetStatement, + ListConstructor, + ListConstructorElement, + LiteralExpression, + MatchArm, + MatchBody, + MatchExpression, + MemberAccess, + NewObjectExpression, + ParamList, + Parameter, + Pattern, + ReturnStatement, + ReturnType, + SelfParameter, + SelfReference, + TypeExpression, + TypeIdentifier, + TypeParameter, + TypeParameterList, + UnaryExpression, + VariableBinding, + WhileStatement, + WildcardPattern, + Import, + Export, + ExportList, +} + +#[derive(Eq, PartialEq, Hash, Copy, Clone)] +pub enum TokenAction { + Error, + Reduce(ReduceRule, TreeKind, u16), + ReduceAnonymous(ReduceRule, u16), + Accept, + Shift(u16), +} + +pub struct ParseState { + action_start: usize, + action_end: usize, + goto_start: usize, + goto_end: usize, +} + +pub struct ParseTable<'a> { + state: &'a [ParseState], + start_state: usize, + + token_action: &'a [TokenAction], + token_kind: &'a [TokenKind], + + tree_goto: &'a [u16], + tree_rules: &'a [ReduceRule], +} + +#[derive(Clone)] +enum StackEntry { + Nothing, + Tree(TreeRef), + AnonTree(Vec), + Token(TokenRef), + Error(TokenRef), +} + +#[derive(Clone)] +struct ParseThread { + stack: Vec<(usize, StackEntry)>, + panic_count: u8, + error_count: 
u8, + score: u32, +} + +impl ParseThread { + fn initial(start_state: usize) -> ParseThread { + ParseThread { + stack: vec![(start_state, StackEntry::Nothing)], + error_count: 0, + panic_count: 0, + score: 0, + } + } + + fn reduce( + &mut self, + table: &ParseTable, + syntax: &mut SyntaxTree, + count: u16, + rule: ReduceRule, + kind: Option, + ) { + let mut children = Vec::new(); + let count: usize = count.into(); + + let mut consumed = 0; + while consumed < count { + let Some((_, value)) = self.stack.pop() else { + break; + }; + + match value { + StackEntry::Nothing => panic!("Popped nothing!"), + StackEntry::Tree(t) => { + consumed += 1; + children.push(Child::Tree(t)); + } + StackEntry::AnonTree(mut cs) => { + consumed += 1; + children.append(&mut cs); + } + StackEntry::Token(t) => { + consumed += 1; + children.push(Child::Token(t)); + } + StackEntry::Error(t) => { + // Do *not* increment consumed; these don't count! + children.push(Child::Token(t)); + } + } + } + assert_eq!(consumed, count, "Stack underflow on reduce"); + + let value = if let Some(kind) = kind { + let tr = syntax.add_tree(Tree { + kind, + self_ref: TreeRef::from_index(0), + parent: None, + start_pos: 0, + end_pos: 0, + children, + }); + StackEntry::Tree(tr) + } else { + StackEntry::AnonTree(children) + }; + + let (goto_index, _) = self.stack.last().unwrap(); + let goto_state = &table.state[*goto_index]; + + let index: usize = (goto_state.goto_start..goto_state.goto_end) + .find(|i| table.tree_rules[*i] == rule) + .expect("Unable to goto target after reduction") + .into(); + let target_state: usize = table.tree_goto[index].into(); + self.stack.push((target_state, value)); + } + + fn shift(&mut self, state: u16, tr: TokenRef) { + let target_state: usize = state.into(); + self.stack.push((target_state, StackEntry::Token(tr))); + } +} + +// This is what we set the panic level to when we get an error; we require +// this many successful token shifts to decide we're not lost. 
+const PANIC_THRESHOLD: u8 = 3; + +// This is the maximum number of failed states that we're going to go through +// before we just try to reduce all the way out of the tree. +const THREAD_ERROR_LIMIT: u8 = 20; + +pub fn table_parse(source: &str, table: &ParseTable) -> (Rc, Rc) { + let mut tokens = Tokens::new(source); + let mut syntax = SyntaxTree::new(); + + let mut threads = vec![ParseThread::initial(table.start_state)]; + let mut next_threads = vec![]; + + let mut accepted_threads: Vec = vec![]; + let mut maybe_pushed_garbage = false; + + // While we still have threads to run.... + while threads.len() > 0 { + // We've still got live threads running, which means we've still got + // tokens to consume! Any thread that has accepted "early" should be + // penalized here. + for thread in accepted_threads.iter_mut() { + if thread.score > 0 { + thread.score -= 1; + } + } + + // Grab us the next token from the stream. + // TODO: Collect ephemera before setting on the token. + let token = tokens.next(); + let current_token = token.kind; + let current_token_ref = syntax.add_token(token, vec![]); + + // Go over every thread in the list of threads to run. If a thread + // needs to keep running on this token it can push itself back onto + // the stack, and we'll re-consider it next time. (This is necessary + // for both reduce and for error handling.) + while let Some(mut thread) = threads.pop() { + let (state, _) = thread.stack.last().unwrap(); + let state = &table.state[*state]; + + let action = (state.action_start..state.action_end) + .find(|i| table.token_kind[*i] == current_token) + .map(|i| &table.token_action[i]) + .unwrap_or(&TokenAction::Error); + + match action { + TokenAction::Reduce(rule, kind, count) => { + thread.reduce(table, &mut syntax, *count, *rule, Some(*kind)); + thread.score += 1; + threads.push(thread); // Run me again, I can still work with this token. 
+ } + + TokenAction::ReduceAnonymous(rule, count) => { + thread.reduce(table, &mut syntax, *count, *rule, None); + thread.score += 1; + threads.push(thread); // Run me again, I can still work with this token. + } + + TokenAction::Shift(state) => { + thread.shift(*state, current_token_ref); + thread.score += 1; + if thread.panic_count > 0 { + thread.panic_count -= 1; + } else if thread.error_count > 0 { + // TODO: We shifted a good number of tokens in a row, + // maybe we should consider reducing the error count + // here too, so that this thread might live for + // longer. + } + + next_threads.push(thread); + } + + TokenAction::Accept => { + thread.score += 1; + accepted_threads.push(thread); + } + + // Error handling, the bane of LR parsers! + // + // In this parser, we borrow a trick from Tree-Sitter and + // treat the parse error as if it were an ambiguity: we see a + // token but don't know what to do with it, so we'll just try + // to do *everything* with it and see what sticks. + // + // The tricky part here is not causing an enormous explosion + // of threads, so we have certain conditions where we just + // give up and refuse to consider any more tokens for a given + // error thread. + // + TokenAction::Error => { + // First, report the error. (We use a pretty standard + // "panic" error recovery mode here to decide when to + // start showing new error messages, otherwise we would + // just generate *way* too many cascading errors.) + // + if thread.panic_count == 0 { + // TODO: Get a description for this state from the table somehow. + // TODO: Describe the error in an error message somehow. + + let token = &syntax[current_token_ref]; + let error_token = syntax.add_token( + Token::error(token.start(), token.end(), format!("PARSE ERROR")), + vec![], + ); + + // NOTE: `Error` stack entries are not counted when + // reducing, so we know this push here won't mess up + // the state machine. 
+ thread.stack.push((0, StackEntry::Error(error_token))); + } + + // Now mark the thread as panicing so that we don't + // produce too many random errors... + thread.panic_count = PANIC_THRESHOLD; + + // Count the error. + // TODO: Check to see if this really does help thread explosion or not. + if thread.error_count < THREAD_ERROR_LIMIT { + thread.error_count += 1; + } + // Penalize this thread; this is not a great parse, we can tell. + if thread.score > 0 { + thread.score -= 1; + } + + let mut executed = HashSet::new(); + for index in state.action_start..state.action_end { + // Make absolutely sure we don't do the same thing + // twice! It can happen, and it is hugely wasteful + // because it spawns duplicate threads. + let action = &table.token_action[index]; + if executed.contains(action) { + continue; + } + executed.insert(action.clone()); + + match action { + TokenAction::Error => { + panic!("Literal error in the table; table is corrupt") + } + TokenAction::Reduce(rule, kind, count) => { + // Let's pretend that we're done with the + // current rule and see what happens. + let mut new_thread = thread.clone(); + new_thread.reduce(&table, &mut syntax, *count, *rule, Some(*kind)); + threads.push(new_thread); + + // Mark that we might have to trim the syntax + // tree because we might not use this + // reduction. + maybe_pushed_garbage = true; + } + TokenAction::ReduceAnonymous(rule, count) => { + // Let's pretend that we're done with the + // current rule and see what happens. + let mut new_thread = thread.clone(); + new_thread.reduce(&table, &mut syntax, *count, *rule, None); + threads.push(new_thread); + } + TokenAction::Shift(state) => { + // Let's just pretend the current token + // matched this thing that we were looking + // for, and shift it anyway, and see what + // happens. + // + // This represents an expansion of the search + // space and so we only want to do it if we + // haven't reached our error limit yet. 
+ if thread.error_count < THREAD_ERROR_LIMIT { + let mut new_thread = thread.clone(); + new_thread.shift(*state, current_token_ref); + next_threads.push(new_thread); + } + } + TokenAction::Accept => accepted_threads.push(thread.clone()), + } + } + + // Let's try to process the *next* token and see what + // happens with this same thread, unless we're giving up + // on the thread. + if thread.error_count < THREAD_ERROR_LIMIT { + next_threads.push(thread); + } + } + } + } + + // Drain all the next_threads into the current stack and start again + // on the next token! + threads.append(&mut next_threads); + } + + // OK no more threads, we're done. In theory at this point we should + // penalize all accepted threads for remaining tokens but if we've got no + // more threads and there are remaining tokens then they all hit their + // error limit and are basically equivalent. (Why penalize all threads by + // the same amount?) + // + // Let's just go through all the threads that "accepted" and pick the one + // with the highest score that also wound up with a named tree at the top. + let mut best_score = 0; + for thread in accepted_threads { + if thread.score >= best_score { + if let Some((_, StackEntry::Tree(tr))) = thread.stack.last() { + syntax.root = Some(*tr); + best_score = thread.score + 1; + } + } + } + + // Now, our syntax tree might have errors in it, and if it does we might + // have pushed trees that we have no interest in ever seeing ever again. + // That means that we need to rewrite the tree starting from the root, to + // make sure that the trees in the syntax tree are for real for real. 
+ if maybe_pushed_garbage { + let mut valid = HashSet::new(); + let mut stack = Vec::new(); + if let Some(tr) = &syntax.root { + stack.push(*tr); + } + while let Some(tr) = stack.pop() { + valid.insert(tr); + for x in syntax[tr].child_trees() { + stack.push(x); + } + } + + for tr in syntax.trees.iter_mut() { + if !valid.contains(&tr.self_ref) { + tr.kind = TreeKind::Ignore; + } + } + } + + (Rc::new(syntax), Rc::new(tokens.lines())) +} diff --git a/fine/grammar/parser.py b/fine/grammar/parser.py new file mode 100644 index 00000000..b01a09c4 --- /dev/null +++ b/fine/grammar/parser.py @@ -0,0 +1,1528 @@ +"""This is a small helper library to generate LR parser tables. + +The primary inspiration for this library is tree-sitter, which also generates +LR parsers for grammars written in a turing-complete language. Like that, we +write grammars in a language, only we do it in Python instead of JavaScript. + +Why Python? Because Python 3 is widely pre-installed on MacOS and Unix. This +library requires nothing more than the basic standard library, and not even a +new version of it. Therefore, it turns out to be a pretty light dependency for +a rust or C++ or something kind of project. (Tree-sitter, on the other hand, +requires node, which is a far less stable and available runtime in 2024.) + +The parser tables can really be used to power anything. I prefer to make +concrete syntax trees (again, see tree-sitter), and there is no facility at all +for actions or custom ASTs or whatnot. Any such processing needs to be done by +the thing that processes the tables. + +## Making Grammars + +To get started, create a grammar that derives from the `Grammar` class. 
Create +one method per nonterminal, decorated with the `rule` decorator, as follows: + +``` +PLUS = Token("+") +IDENTIFIER = Token("Identifier") +NUMBER = Token("Number") + +class DumbGrammar(Grammar): + @rule + def expression(self): + return IDENTIFIER | NUMBER | seq(self.expression, PLUS, self.variable) +``` + +You get it. + +TODO: Obviously you need your own lexer, and there's... not really parsing. + + +## Some History + +The first version of this code was written as an idle exercise to learn how LR +parser table generation even worked. It was... very simple, fairly easy to +follow, and just *incredibly* slow. Like, mind-bogglingly slow. Unusably slow +for anything but the most trivial grammar. + +As a result, when I decided I wanted to use it for a larger grammar, I found that +I just couldn't. So this has been hacked and significantly improved from that +version, now capable of building tables for nontrivial grammars. It could still +be a lot faster, but it meets my needs for now. + +2024 +""" + +import abc +import collections +import dataclasses +import enum +import functools +import inspect +import sys +import typing + + +############################################################################### +# LR0 +# +# We start with LR0 parsers, because they form the basis of everything else. +############################################################################### +class Configuration: + """A rule being tracked in a state. + + (Note: technically, lookahead isn't used until we get to LR(1) parsers, + but if left at its default it's harmless. Ignore it until you get to + the part about LR(1).) + """ + + __slots__ = ( + "name", + "symbols", + "position", + "lookahead", + "next", + "at_end", + "_vals", + "_hash", + ) + + name: int + symbols: typing.Tuple[int, ...] + position: int + lookahead: typing.Tuple[int, ...] 
+ next: int | None + at_end: bool + + _vals: typing.Tuple + _hash: int + + def __init__(self, name, symbols, position, lookahead) -> None: + self.name = name + self.symbols = symbols + self.position = position + self.lookahead = lookahead + + at_end = position == len(symbols) + self.at_end = at_end + self.next = symbols[position] if not at_end else None + + self._vals = (name, symbols, position, lookahead) + self._hash = hash(self._vals) + + @classmethod + def from_rule(cls, name: int, symbols: typing.Tuple[int, ...], lookahead=()): + return Configuration( + name=name, + symbols=symbols, + position=0, + lookahead=lookahead, + ) + + def __hash__(self) -> int: + return self._hash + + def __eq__(self, value: object, /) -> bool: + if value is self: + return True + if not isinstance(value, Configuration): + return NotImplemented + + return ( + value._hash == self._hash + and value.name == self.name + and value.position == self.position + and value.symbols == self.symbols + and value.lookahead == self.lookahead + ) + + def __lt__(self, value) -> bool: + if not isinstance(value, Configuration): + return NotImplemented + return self._vals < value._vals + + def __gt__(self, value) -> bool: + if not isinstance(value, Configuration): + return NotImplemented + return self._vals > value._vals + + def __le__(self, value) -> bool: + if not isinstance(value, Configuration): + return NotImplemented + return self._vals <= value._vals + + def __ge__(self, value) -> bool: + if not isinstance(value, Configuration): + return NotImplemented + return self._vals >= value._vals + + def replace_position(self, new_position): + return Configuration( + name=self.name, + symbols=self.symbols, + position=new_position, + lookahead=self.lookahead, + ) + + def clear_lookahead(self): + return Configuration( + name=self.name, + symbols=self.symbols, + position=self.position, + lookahead=(), + ) + + @property + def rest(self): + return self.symbols[(self.position + 1) :] + + def format(self, alphabet: 
list[str]) -> str: + la = ", " + str(tuple(alphabet[i] for i in self.lookahead)) if self.lookahead != () else "" + return "{name} -> {bits}{lookahead}".format( + name=alphabet[self.name], + bits=" ".join( + [ + "* " + alphabet[sym] if i == self.position else alphabet[sym] + for i, sym in enumerate(self.symbols) + ] + ) + + (" *" if self.at_end else ""), + lookahead=la, + ) + + +ConfigSet = typing.Tuple[Configuration, ...] + + +class ConfigurationSetInfo: + """When we build a grammar into a table, the first thing we need to do is + generate all the configuration sets and their successors. This is the + structure that tracks the result of that computation. + + (Different generators vary in the details of how they generate this + structure, but they all compute this information.) + """ + + config_set_key: dict[ConfigSet, int] + sets: list[ConfigSet] + successors: list[dict[int, int]] + + def __init__(self): + self.config_set_key = {} + self.sets = [] + self.successors = [] + + def register_config_set(self, c: ConfigSet) -> typing.Tuple[int, bool]: + """Potentially add a new config set to the set of sets. Returns the + canonical ID of the set within this structure, along with a boolean + indicating whether the set was just added or not. + + (You can use this integer to get the set back, if you need it, and + also access the successors table.) 
+ """ + existing = self.config_set_key.get(c) + if existing is not None: + return existing, False + + index = len(self.sets) + self.sets.append(c) + self.successors.append({}) + self.config_set_key[c] = index + return index, True + + def add_successor(self, c_id: int, symbol: int, successor: int): + """Register sucessor(`c_id`, `symbol`) -> `successor`""" + self.successors[c_id][symbol] = successor + + def find_path_to_set(self, target_set: ConfigSet) -> list[int]: + target_index = self.config_set_key[target_set] + visited = set() + + queue: collections.deque = collections.deque() + queue.appendleft((0, [])) + while len(queue) > 0: + set_index, path = queue.pop() + if set_index == target_index: + return path + + if set_index in visited: + continue + visited.add(set_index) + + for symbol, successor in self.successors[set_index].items(): + queue.appendleft((successor, path + [symbol])) + + raise KeyError("Unable to find a path to the target set!") + + +class Assoc(enum.Enum): + """Associativity of a rule.""" + + NONE = 0 + LEFT = 1 + RIGHT = 2 + + +class ErrorCollection: + errors: dict[ConfigSet, dict[int, dict[Configuration, typing.Tuple]]] + + def __init__(self): + self.errors = {} + + def any(self) -> bool: + return len(self.errors) > 0 + + def add_error( + self, + config_set: ConfigSet, + symbol: int, + config: Configuration, + action: typing.Tuple, + ): + set_errors = self.errors.get(config_set) + if set_errors is None: + set_errors = {} + self.errors[config_set] = set_errors + + symbol_errors = set_errors.get(symbol) + if symbol_errors is None: + symbol_errors = {} + set_errors[symbol] = symbol_errors + + symbol_errors[config] = action + + def format( + self, + alphabet: list[str], + all_sets: ConfigurationSetInfo, + ) -> str | None: + if len(self.errors) is None: + return None + + errors = [] + for config_set, set_errors in self.errors.items(): + path = all_sets.find_path_to_set(config_set) + path_str = " ".join(alphabet[s] for s in path) + + for symbol, 
symbol_errors in set_errors.items(): + lines = [] + lines.append( + f"When we have parsed '{path_str}' and see '{alphabet[symbol]}' we don't know whether:" + ) + for config, action in symbol_errors.items(): + name = alphabet[config.name] + rule = " ".join( + f"{'* ' if config.position == i else ''}{alphabet[s]}" + for i, s in enumerate(config.symbols) + ) + if config.next is None: + rule += " *" + + if action[0] == "reduce": + action_str = f"pop {action[2]} values off the stack and make a {action[1]}" + elif action[0] == "shift": + action_str = "consume the token and keep going" + elif action[0] == "accept": + action_str = "accept the parse" + else: + assert action[0] == "goto", f"Unknown action {action[0]}" + raise Exception("Shouldn't conflict on goto ever") + + lines.append( + f" - We are in the rule `{name}: {rule}` and we should {action_str}" + ) + + errors.append("\n".join(lines)) + + return "\n\n".join(errors) + + +class TableBuilder(object): + errors: ErrorCollection + table: list[dict[str, typing.Tuple]] + alphabet: list[str] + precedence: typing.Tuple[typing.Tuple[Assoc, int], ...] 
+ row: None | list[typing.Tuple[None | typing.Tuple, None | Configuration]] + + def __init__( + self, + alphabet: list[str], + precedence: typing.Tuple[typing.Tuple[Assoc, int], ...], + ): + self.errors = ErrorCollection() + self.table = [] + self.alphabet = alphabet + self.precedence = precedence + self.row = None + + def flush(self, all_sets: ConfigurationSetInfo) -> list[dict[str, typing.Tuple]]: + self._flush_row() + if self.errors.any(): + errors = self.errors.format(self.alphabet, all_sets) + raise ValueError(f"Errors building the table:\n\n{errors}") + return self.table + + def new_row(self, config_set: ConfigSet): + self._flush_row() + self.row = [(None, None) for _ in self.alphabet] + self.current_config_set = config_set + + def _flush_row(self): + if self.row: + actions = {self.alphabet[k]: v[0] for k, v in enumerate(self.row) if v[0] is not None} + self.table.append(actions) + + def set_table_reduce(self, symbol: int, config: Configuration): + action = ("reduce", self.alphabet[config.name], len(config.symbols)) + self._set_table_action(symbol, action, config) + + def set_table_accept(self, symbol: int, config: Configuration): + action = ("accept",) + self._set_table_action(symbol, action, config) + + def set_table_shift(self, symbol: int, index: int, config: Configuration): + action = ("shift", index) + self._set_table_action(symbol, action, config) + + def set_table_goto(self, symbol: int, index: int): + action = ("goto", index) + self._set_table_action(symbol, action, None) + + def _action_precedence(self, symbol: int, action: typing.Tuple, config: Configuration): + if action[0] == "shift": + return self.precedence[symbol] + else: + return self.precedence[config.name] + + def _set_table_action(self, symbol_id: int, action: typing.Tuple, config: Configuration | None): + """Set the action for 'symbol' in the table row to 'action'. + + This is destructive; it changes the table. It raises an error if + there is already an action for the symbol in the row. 
+ """ + assert isinstance(symbol_id, int) + + assert self.row is not None + existing, existing_config = self.row[symbol_id] + if existing is not None and existing != action: + assert existing_config is not None + assert config is not None + + existing_assoc, existing_prec = self._action_precedence( + symbol_id, existing, existing_config + ) + new_assoc, new_prec = self._action_precedence(symbol_id, action, config) + + if existing_prec > new_prec: + # Precedence of the action in the table already wins, do nothing. + return + + elif existing_prec == new_prec: + # It's an actual conflict, use associativity if we can. + # If there's a conflict in associativity then it's a real conflict! + assoc = Assoc.NONE + if existing_assoc == Assoc.NONE: + assoc = new_assoc + elif new_assoc == Assoc.NONE: + assoc = existing_assoc + elif new_assoc == existing_assoc: + assoc = new_assoc + + resolved = False + if assoc == Assoc.LEFT: + # Prefer reduce over shift + if action[0] == "shift" and existing[0] == "reduce": + action = existing + resolved = True + elif action[0] == "reduce" and existing[0] == "shift": + resolved = True + + elif assoc == Assoc.RIGHT: + # Prefer shift over reduce + if action[0] == "shift" and existing[0] == "reduce": + resolved = True + elif action[0] == "reduce" and existing[0] == "shift": + action = existing + resolved = True + + if not resolved: + # Record the conflicts. + self.errors.add_error( + self.current_config_set, symbol_id, existing_config, existing + ) + self.errors.add_error(self.current_config_set, symbol_id, config, action) + + else: + # Precedence of the new action is greater than the existing + # action, just allow the overwrite with no change. + pass + + self.row[symbol_id] = (action, config) + + +class GenerateLR0(object): + """Generate parser tables for an LR0 parser. 
+ + The input grammars are of the form: + + grammar_simple = [ + ('E', ['E', '+', 'T']), + ('E', ['T']), + ('T', ['(', 'E', ')']), + ('T', ['id']), + ] + + Which is to say, they are a list of productions. Each production is a + tuple where the first element of the tuple is the name of the + non-terminal being added, and the second elment of the tuple is the + list of terminals and non-terminals that make up the production. + + There is currently no support for custom actions or alternation or + anything like that. If you want alternations that you'll have to lower + the grammar by hand into the simpler form first. + + Don't name anything with double-underscores; those are reserved for + the generator. Don't add '$' either, as it is reserved to mean + end-of-stream. Use an empty list to indicate nullability, that is: + + ('O', []), + + means that O can be matched with nothing. + """ + + alphabet: list[str] + grammar: list[list[typing.Tuple[int, ...]]] + nonterminal: typing.Tuple[bool, ...] + terminal: typing.Tuple[bool, ...] + precedence: typing.Tuple[typing.Tuple[Assoc, int], ...] + + symbol_key: dict[str, int] + start_symbol: int + end_symbol: int + + config_sets_key: dict[ConfigSet, int] + successors: list[set[int]] + + def __init__( + self, + start: str, + grammar: list[typing.Tuple[str, list[str]]], + precedence: None | dict[str, typing.Tuple[Assoc, int]] = None, + ): + """Initialize the parser generator with the specified grammar and + start symbol. + """ + + # Work out the alphabet. + alphabet = set() + for name, rule in grammar: + alphabet.add(name) + alphabet.update(symbol for symbol in rule) + + # Check to make sure they didn't use anything that will give us + # heartburn later. 
+ reserved = [a for a in alphabet if a.startswith("__") or a == "$"] + if reserved: + raise ValueError( + "Can't use {symbols} in grammars, {what} reserved.".format( + symbols=" or ".join(reserved), + what="it's" if len(reserved) == 1 else "they're", + ) + ) + + alphabet.add("__start") + alphabet.add("$") + self.alphabet = list(sorted(alphabet)) + + symbol_key = {symbol: index for index, symbol in enumerate(self.alphabet)} + + start_symbol = symbol_key["__start"] + end_symbol = symbol_key["$"] + + assert self.alphabet[start_symbol] == "__start" + assert self.alphabet[end_symbol] == "$" + + # Turn the incoming grammar into a dictionary, indexed by nonterminal. + # + # We count on python dictionaries retaining the insertion order, like + # it or not. + full_grammar: list[list] = [list() for _ in self.alphabet] + terminal: list[bool] = [True for _ in self.alphabet] + assert terminal[end_symbol] + + nonterminal = [False for _ in self.alphabet] + + for name, rule in grammar: + name_symbol = symbol_key[name] + + terminal[name_symbol] = False + nonterminal[name_symbol] = True + + rules = full_grammar[name_symbol] + rules.append(tuple(symbol_key[symbol] for symbol in rule)) + + self.grammar = full_grammar + self.grammar[start_symbol].append((symbol_key[start],)) + terminal[start_symbol] = False + nonterminal[start_symbol] = True + + self.terminal = tuple(terminal) + self.nonterminal = tuple(nonterminal) + + assert self.terminal[end_symbol] + assert self.nonterminal[start_symbol] + + if precedence is None: + precedence = {} + self.precedence = tuple(precedence.get(a, (Assoc.NONE, 0)) for a in self.alphabet) + + self.symbol_key = symbol_key + self.start_symbol = start_symbol + self.end_symbol = end_symbol + + @functools.cache + def gen_closure_next(self, config: Configuration): + """Return the next set of configurations in the closure for + config. 
+ + If the position for config is just before a non-terminal, then the + next set of configurations is configurations for all of the + productions for that non-terminal, with the position at the + beginning. (If the position for config is just before a terminal, + or at the end of the production, then the next set is empty.) + """ + next = config.next + if next is None: + return () + else: + return tuple(Configuration.from_rule(next, rule) for rule in self.grammar[next]) + + def gen_closure(self, seeds: typing.Iterable[Configuration]) -> ConfigSet: + """Compute the closure for the specified configs. The closure is all + of the configurations we could be in. Specifically, if the position + for a config is just before a non-terminal then we must also consider + configurations where the rule is the rule for the non-terminal and + the position is just before the beginning of the rule. + + (We have replaced a recursive version with an iterative one.) + """ + closure = set() + pending = list(seeds) + pending_next = [] + while len(pending) > 0: + for config in pending: + if config in closure: + continue + + closure.add(config) + for next_config in self.gen_closure_next(config): + pending_next.append(next_config) + + temp = pending + pending = pending_next + pending_next = temp + pending_next.clear() + + return tuple(sorted(closure)) # TODO: Why tuple? + + def gen_successor(self, config_set: typing.Iterable[Configuration], symbol: int) -> ConfigSet: + """Compute the successor state for the given config set and the + given symbol. + + The successor represents the next state of the parser after seeing + the symbol. 
+ """ + seeds = tuple( + config.replace_position(config.position + 1) + for config in config_set + if config.next == symbol + ) + + closure = self.gen_closure(seeds) + return closure + + def gen_all_successors( + self, config_set: typing.Iterable[Configuration] + ) -> list[typing.Tuple[int, ConfigSet]]: + """Return all of the non-empty successors for the given config set.""" + possible = tuple(sorted({config.next for config in config_set if config.next is not None})) + + next = [] + for symbol in possible: + successor = self.gen_successor(config_set, symbol) + if len(successor) > 0: + next.append((symbol, successor)) + + return next + + def gen_sets(self, config_set: typing.Tuple[Configuration, ...]) -> ConfigurationSetInfo: + """Generate all configuration sets starting from the provided set.""" + result = ConfigurationSetInfo() + + successors = [] + pending = [config_set] + pending_next = [] + while len(pending) > 0: + for config_set in pending: + id, is_new = result.register_config_set(config_set) + if is_new: + for symbol, successor in self.gen_all_successors(config_set): + successors.append((id, symbol, successor)) + pending_next.append(successor) + + temp = pending + pending = pending_next + pending_next = temp + pending_next.clear() + + for id, symbol, successor in successors: + result.add_successor(id, symbol, result.config_set_key[successor]) + + return result + + def gen_all_sets(self) -> ConfigurationSetInfo: + """Generate all of the configuration sets for the grammar.""" + seeds = tuple( + Configuration.from_rule(self.start_symbol, rule) + for rule in self.grammar[self.start_symbol] + ) + initial_set = self.gen_closure(seeds) + return self.gen_sets(initial_set) + + def gen_reduce_set(self, config: Configuration) -> typing.Iterable[int]: + """Return the set of symbols that indicate we should reduce the given + configuration. 
+ + In an LR0 parser, this is just the set of all terminals.""" + del config + return [index for index, value in enumerate(self.terminal) if value] + + def gen_table(self): + """Generate the parse table. + + The parse table is a list of states. The first state in the list is + the starting state. Each state is a dictionary that maps a symbol to an + action. Each action is a tuple. The first element of the tuple is a + string describing what to do: + + - 'shift': The second element of the tuple is the state + number. Consume the input and push that state onto the stack. + + - 'reduce': The second element is the name of the non-terminal being + reduced, and the third element is the number of states to remove + from the stack. Don't consume the input; just remove the specified + number of things from the stack, and then consult the table again, + this time using the new top-of-stack as the current state and the + name of the non-terminal to find out what to do. + + - 'goto': The second element is the state number to push onto the + stack. In the literature, these entries are treated distinctly from + the actions, but we mix them here because they never overlap with the + other actions. (These are always associated with non-terminals, and + the other actions are always associated with terminals.) + + - 'accept': Accept the result of the parse, it worked. + + Anything missing from the row indicates an error. 
def parse(table, input, trace=False):
    """Parse `input` with a generated parse table and return the syntax tree.

    `table` is a list of states as produced by `gen_table()`: each state maps
    a symbol name to an action tuple (`shift`, `reduce`, `goto` or `accept`).
    The parsing mechanism is the same for every generator; only the table
    changes.

    `input` is a sequence of token names. Do not append an end-of-stream
    marker; `$` is added here.

    The result is a tree of `(name, children)` tuples, where `children` is a
    tuple of sub-trees and tokens are leaves with no children. If `trace` is
    true, the stack, the remaining input and the chosen action are printed
    on every step.

    Raises ValueError on a syntax error, or if the table contains an action
    this function does not understand.
    """
    tokens = list(input)
    assert "$" not in tokens
    tokens.append("$")
    input_index = 0

    # Our stack is a stack of tuples, where the first entry is the state
    # number and the second entry is the 'value' that was generated when the
    # state was pushed.
    stack: list[typing.Tuple[int, typing.Any]] = [(0, None)]
    while True:
        current_state = stack[-1][0]
        current_token = tokens[input_index]

        action = table[current_state].get(current_token, ("error",))
        if trace:
            print(
                "{stack: <20} {input: <50} {action: <5}".format(
                    stack=repr([s[0] for s in stack]),
                    input=repr(tokens[input_index:]),
                    action=repr(action),
                )
            )

        kind = action[0]
        if kind == "accept":
            return stack[-1][1]

        elif kind == "reduce":
            name = action[1]
            size = action[2]

            # Split by absolute index rather than by negative slice: for an
            # empty production `size` is 0, and `stack[-0:]` would be the
            # whole stack while `stack[:-0]` would be empty.
            split = len(stack) - size
            value = (name, tuple(s[1] for s in stack[split:]))
            del stack[split:]

            goto = table[stack[-1][0]].get(name, ("error",))
            assert goto[0] == "goto"  # Corrupt table?
            stack.append((goto[1], value))

        elif kind == "shift":
            stack.append((action[1], (current_token, ())))
            input_index += 1

        elif kind == "error":
            raise ValueError(
                "Syntax error: unexpected symbol {sym}".format(
                    sym=current_token,
                ),
            )

        else:
            # Without this branch an unknown action would spin forever.
            raise ValueError(
                "Corrupt table: unexpected action {action!r}".format(action=action)
            )
@dataclasses.dataclass(frozen=True)
class FollowInfo:
    """The FOLLOW sets of a grammar.

    `follows[i]` is the set of terminals that can appear directly after
    symbol `i`. Only nonterminals get entries; the sets for terminals stay
    empty.
    """

    follows: list[set[int]]

    @classmethod
    def from_grammar(
        cls,
        grammar: list[list[typing.Tuple[int, ...]]],
        terminal: typing.Tuple[bool, ...],
        start_symbol: int,
        end_symbol: int,
        firsts: "FirstInfo",
    ):
        """Compute the FOLLOW sets by iterating to a fixed point.

        `grammar[i]` is the list of productions of symbol `i`, `terminal[i]`
        says whether symbol `i` is a terminal, and `firsts` supplies the
        FIRST sets (`firsts.firsts`) and nullability (`firsts.is_epsilon`)
        of every symbol. FOLLOW of the start symbol is seeded with
        `end_symbol`.

        For a production `A -> X1 ... Xn`, FOLLOW(Xi) receives FIRST of the
        whole remainder `Xi+1 ... Xn`, which looks past nullable symbols
        and not just at `Xi+1`. If that remainder can be empty, FOLLOW(Xi)
        also receives FOLLOW(A).
        """
        follows: list[set[int]] = [set() for _ in grammar]
        follows[start_symbol].add(end_symbol)

        changed = True
        while changed:
            changed = False
            for name, rules in enumerate(grammar):
                for rule in rules:
                    # We walk the production right to left. `trailer` is
                    # FIRST of everything to the right of the current
                    # symbol, and `tail_nullable` says whether all of that
                    # can derive the empty string.
                    trailer: set[int] = set()
                    tail_nullable = True
                    for symbol in reversed(rule):
                        if terminal[symbol]:
                            # A terminal blocks everything to its right.
                            trailer = set(firsts.firsts[symbol])
                            tail_nullable = False
                            continue

                        follow = follows[symbol]
                        size_before = len(follow)
                        follow |= trailer
                        if tail_nullable:
                            # Nothing (or only nullable symbols) comes after
                            # this one, so whatever follows the production
                            # follows this symbol too.
                            follow |= follows[name]
                        changed = changed or len(follow) != size_before

                        if firsts.is_epsilon[symbol]:
                            # A nullable symbol lets the symbols to its left
                            # see through it, so accumulate.
                            trailer = trailer | firsts.firsts[symbol]
                        else:
                            trailer = set(firsts.firsts[symbol])
                            tail_nullable = False

        return cls(follows=follows)
+ + The follow set for a nonterminal is the set of terminals that can + follow the nonterminal in a valid sentence. The resulting set never + contains epsilon and is never empty, since we should always at least + ground out at '$', which is the end-of-stream marker. + """ + return self._follows.follows[symbol] + + def gen_reduce_set(self, config: Configuration) -> typing.Iterable[int]: + """Return the set of symbols that indicate we should reduce the given + config. + + In an SLR1 parser, this is the follow set of the config nonterminal.""" + return self.gen_follow(config.name) + + +class GenerateLR1(GenerateSLR1): + """Generate parse tables for LR1, or "canonical LR" grammars. + + LR1 parsers can recognize more than SLR parsers. Like SLR parsers, they + are choosier about when they reduce. But unlike SLR parsers, they specify + the terminals on which they reduce by carrying a 'lookahead' terminal in + the configuration. The lookahead of a configuration is computed as the + closure of a configuration set is computed, so see gen_closure_next for + details. (Except for the start configuration, which has '$' as its + lookahead.) + """ + + def gen_reduce_set(self, config: Configuration) -> typing.Iterable[int]: + """Return the set of symbols that indicate we should reduce the given + config. + + In an LR1 parser, this is the lookahead of the configuration.""" + return config.lookahead + + @functools.cache + def gen_closure_next(self, config: Configuration): + """Return the next set of configurations in the closure for + config. + + In LR1 parsers, we must compute the lookahead for the configurations + we're adding to the closure. The lookahead for the new configurations + is the first() of the rest of this config's production. If that + contains epsilon, then the lookahead *also* contains the lookahead we + already have. 
class GenerateLALR(GenerateLR1):
    """Generate tables for LALR.

    LALR is smaller than LR(1) but bigger than SLR(1). It works by generating
    the LR(1) configuration sets, but merging configuration sets which are
    equal in everything but their lookaheads. This works in that it doesn't
    generate any shift/reduce conflicts that weren't already in the LR(1)
    grammar. It can, however, introduce new reduce/reduce conflicts, because
    it does lose information. The advantage is that the number of parser
    states is much much smaller in LALR than in LR(1).

    (Note that because we use immutable state everywhere this generator does
    a lot of copying and allocation.)
    """

    def _merge_by_core(self, configs):
        """Collapse `configs` to one configuration per item core.

        Configurations that differ only in their lookahead are combined into
        a single configuration whose lookahead is the sorted union of all of
        them. The result is sorted by core, so two collections that describe
        the same state produce equal tuples.
        """
        lookaheads = {}
        for config in configs:
            lookaheads.setdefault(config.clear_lookahead(), set()).update(config.lookahead)

        # Rebuild each configuration from its core: same production, same
        # position, merged lookahead.
        return tuple(
            Configuration.from_rule(
                core.name, core.symbols, lookahead=tuple(sorted(lookaheads[core]))
            ).replace_position(core.position)
            for core in sorted(lookaheads)
        )

    def merge_sets(self, config_set_a, config_set_b):
        """Merge the two config sets, by keeping the item cores but merging
        the lookahead sets for each item.

        Both sets must contain the same item cores. The result has one
        configuration per core, carrying the union of the lookaheads found
        for that core in either set.
        """
        assert self.set_without_lookahead(config_set_a) == self.set_without_lookahead(
            config_set_b
        )
        return self._merge_by_core((*config_set_a, *config_set_b))

    def sets_equal(self, a, b):
        """Return True if `a` and `b` hold the same item cores in the same order."""
        a_no_la = tuple(s.clear_lookahead() for s in a)
        b_no_la = tuple(s.clear_lookahead() for s in b)
        return a_no_la == b_no_la

    def gen_sets(self, config_set) -> ConfigurationSetInfo:
        """Generate all configuration sets reachable from `config_set`,
        merging the sets that are equal when lookahead is ignored.

        The difference between this method and the one in GenerateLR0 is
        what happens when a set has been seen before. Here sets are
        identified by their item cores only. If a set with the same cores
        already exists, the two are merged. If the merge added lookaheads,
        the successors are generated again from the merged set so that the
        new lookaheads reach the states that depend on them. Lookaheads only
        ever grow, so this terminates.
        """
        merged_sets = {}  # core key -> merged config set
        edges = {}  # (core key, symbol) -> core key of the successor
        pending = [config_set]
        while pending:
            config_set = pending.pop()
            key = self.set_without_lookahead(config_set)

            existing = merged_sets.get(key)
            if existing is None:
                current = self._merge_by_core(config_set)
            else:
                current = self.merge_sets(existing, config_set)
                if current == existing:
                    # Nothing new was learned; no need to look further.
                    continue

            merged_sets[key] = current
            for symbol, successor in self.gen_all_successors(current):
                # The cores of a successor depend only on the cores of its
                # source, so this edge is stable across re-expansion.
                edges[(key, symbol)] = self.set_without_lookahead(successor)
                pending.append(successor)

        # Register all the actually merged, final config sets. The initial
        # set was inserted first, so it is registered first.
        result = ConfigurationSetInfo()
        for merged in merged_sets.values():
            result.register_config_set(merged)

        # Now record all the successors that we found. The edges were
        # recorded by core key, because the sets themselves kept changing
        # while we were merging.
        for (from_key, symbol), to_key in edges.items():
            from_index = result.config_set_key[merged_sets[from_key]]
            to_index = result.config_set_key[merged_sets[to_key]]
            result.add_successor(from_index, symbol, to_index)

        return result

    def set_without_lookahead(self, config_set: ConfigSet) -> ConfigSet:
        """Return the sorted, de-duplicated item cores of `config_set`."""
        return tuple(sorted(set(c.clear_lookahead() for c in config_set)))
def seq(*args: Rule) -> Rule:
    """Combine the given rules into one rule that matches them in order.

    `seq(a, b, c)` is the same as `a + b + c`. A single argument is returned
    unchanged, and `seq()` with no arguments is the empty sequence, that is
    `Nothing`.
    """
    if not args:
        return Nothing

    result = args[0]
    for part in args[1:]:
        result = SequenceRule(result, part)
    return result
class Grammar:
    """Base class for grammars written with the `@rule` sugar.

    Subclasses declare their productions as members that are `NonTerminal`
    objects (normally created with the `@rule` decorator). `precedence` is a
    list of `(associativity, symbols)` entries; entries later in the list get
    a higher precedence level. Each symbol may be a `Token`, a `NonTerminal`
    or a plain symbol name.
    """

    def __init__(self, precedence=None):
        self._precedence = precedence or []

    def _queue(self, name: str, rule: Rule):
        # Intentionally does nothing at the moment.
        pass

    @staticmethod
    def _precedence_key(symbol):
        """Return the grammar symbol name that `symbol` stands for.

        The generators look precedence up by symbol name, so tokens are
        keyed by their value and nonterminals by their name.
        """
        if isinstance(symbol, Token):
            return symbol.value
        if isinstance(symbol, NonTerminal):
            return symbol.name
        return symbol

    def desugar(self, start):
        """Lower the rules reachable from `start` into a list of productions.

        The result is a list of `(name, [symbol, ...])` tuples in the format
        the generators take, with tokens replaced by their values. Only the
        nonterminals that can be reached from `start` are included.

        Raises ValueError if `start`, or any nonterminal mentioned in a rule,
        is not a member of this grammar.
        """
        members = inspect.getmembers(self, lambda member: isinstance(member, NonTerminal))
        nonterminals = {nonterminal.name: nonterminal for _, nonterminal in members}

        root = nonterminals.get(start)
        if root is None:
            raise ValueError(f"Cannot find a rule named '{start}'")

        # Walk the rules depth-first from the start rule, generating each
        # body exactly once.
        bodies = {}
        stack = [root]
        while stack:
            current = stack.pop()
            if current.name in bodies:
                continue

            body = current.generate_body(self)
            for clause in body:
                for symbol in clause:
                    if isinstance(symbol, Token):
                        continue
                    assert isinstance(symbol, str)
                    referenced = nonterminals.get(symbol)
                    if referenced is None:
                        raise ValueError(f"While processing {current.name}: cannot find {symbol}")
                    stack.append(referenced)

            bodies[current.name] = body

        return [
            (name, [symbol.value if isinstance(symbol, Token) else symbol for symbol in clause])
            for name, clauses in bodies.items()
            for clause in clauses
        ]

    def build_table(self, start, generator=GenerateLALR):
        """Desugar the grammar and generate its parse table.

        `start` is the name of the start rule and `generator` is the table
        generator class to use. Precedence levels start at 1, because the
        generators use level 0 for symbols with no declared precedence.
        """
        desugared = self.desugar(start)

        precedence = {}
        for level, (associativity, symbols) in enumerate(self._precedence, start=1):
            for symbol in symbols:
                precedence[self._precedence_key(symbol)] = (associativity, level)

        gen = generator(start, desugared, precedence=precedence)
        return gen.gen_table()
def examples():
    """Run the demonstration grammars through each generator and print the results.

    Grammars that a given generator is expected to reject are built inside
    ``expect_conflict``, which prints the ``ValueError`` the generator raises and
    fails loudly if no error is raised at all.
    """

    def dump_grammar(grammar):
        for name, symbols in grammar:
            print(f"{name} -> {symbols}")
        print()

    def expect_conflict(generator_class, start, grammar):
        # Build a table that must be rejected and print the reported conflict.
        try:
            generator_class(start, grammar).gen_table()
        except ValueError as e:
            print(e)
        else:
            # An explicit raise (rather than `assert False`) so the check still
            # fires when Python runs with -O, which strips assert statements.
            raise AssertionError("expected table generation to fail with a conflict")
        print()

    # OK, this is a very simple LR0 grammar.
    print("grammar_simple:")
    grammar_simple = [
        ("E", ["E", "+", "T"]),
        ("E", ["T"]),
        ("T", ["(", "E", ")"]),
        ("T", ["id"]),
    ]

    gen = GenerateLR0("E", grammar_simple)
    table = gen.gen_table()
    print(format_table(gen, table))
    tree = parse(table, ["id", "+", "(", "id", ")"])
    print(format_node(tree) + "\n")
    print()

    # This one doesn't work with LR0, though, it has a shift/reduce conflict.
    print("grammar_lr0_shift_reduce (LR0):")
    grammar_lr0_shift_reduce = grammar_simple + [
        ("T", ["id", "[", "E", "]"]),
    ]
    expect_conflict(GenerateLR0, "E", grammar_lr0_shift_reduce)

    # Nor does this: it has a reduce/reduce conflict.
    print("grammar_lr0_reduce_reduce (LR0):")
    grammar_lr0_reduce_reduce = grammar_simple + [
        ("E", ["V", "=", "E"]),
        ("V", ["id"]),
    ]
    expect_conflict(GenerateLR0, "E", grammar_lr0_reduce_reduce)

    # Nullable symbols just don't work with constructs like this, because you can't
    # look ahead to figure out if you should reduce an empty 'F' or not.
    print("grammar_nullable (LR0):")
    grammar_nullable = [
        ("E", ["F", "boop"]),
        ("F", ["beep"]),
        ("F", []),
    ]
    expect_conflict(GenerateLR0, "E", grammar_nullable)

    print("grammar_lr0_shift_reduce (SLR1):")
    dump_grammar(grammar_lr0_shift_reduce)
    gen = GenerateSLR1("E", grammar_lr0_shift_reduce)
    first, epsilon = gen.gen_first((gen.symbol_key["E"],))
    first_names = [gen.alphabet[f] for f in first]
    print(f"First('E'): {first_names} (epsilon={epsilon})")
    follow_names = [gen.alphabet[f] for f in gen.gen_follow(gen.symbol_key["E"])]
    print(f"Follow('E'): {follow_names}")
    table = gen.gen_table()
    print(format_table(gen, table))
    tree = parse(table, ["id", "+", "(", "id", "[", "id", "]", ")"], trace=True)
    print(format_node(tree) + "\n")
    print()

    # SLR1 can't handle this.
    print("grammar_aho_ullman_1 (SLR1):")
    grammar_aho_ullman_1 = [
        ("S", ["L", "=", "R"]),
        ("S", ["R"]),
        ("L", ["*", "R"]),
        ("L", ["id"]),
        ("R", ["L"]),
    ]
    expect_conflict(GenerateSLR1, "S", grammar_aho_ullman_1)

    # Here's an example with a full LR1 grammar, though.
    print("grammar_aho_ullman_2 (LR1):")
    grammar_aho_ullman_2 = [
        ("S", ["X", "X"]),
        ("X", ["a", "X"]),
        ("X", ["b"]),
    ]
    gen = GenerateLR1("S", grammar_aho_ullman_2)
    table = gen.gen_table()
    print(format_table(gen, table))
    parse(table, ["b", "a", "a", "b"], trace=True)
    print()

    # What happens if we do LALR to it?
    print("grammar_aho_ullman_2 (LALR):")
    gen = GenerateLALR("S", grammar_aho_ullman_2)
    table = gen.gen_table()
    print(format_table(gen, table))
    print()

    # A fun LALR grammar.
    print("grammar_lalr:")
    grammar_lalr = [
        ("S", ["V", "E"]),
        ("E", ["F"]),
        ("E", ["E", "+", "F"]),
        ("F", ["V"]),
        ("F", ["int"]),
        ("F", ["(", "E", ")"]),
        ("V", ["id"]),
    ]
    gen = GenerateLALR("S", grammar_lalr)
    table = gen.gen_table()
    print(format_table(gen, table))
    print()
# The table annotation is quoted so the `parser.Action` alias is only resolved by
# type checkers, not evaluated when the function is defined.
def generate_rust_parser(output: io.TextIOBase, table: "list[dict[str, parser.Action]]"):
    """Write Rust source derived from a parse table to ``output``.

    Only the ``TreeKind`` enum is generated so far: an ``Error`` variant followed
    by one variant for every distinct name that appears as the second element of
    a ``"reduce"`` action in ``table``, in sorted order. Names beginning with an
    underscore are left out.

    Args:
        output: Text stream that receives the generated Rust source.
        table: One mapping per parser state, from symbol to an action tuple whose
            first element is the action kind.
    """
    tree_kinds = sorted(
        {
            action[1]
            for state in table
            for action in state.values()
            if action[0] == "reduce" and action[1][0] != "_"
        }
    )

    # First, generate the treekind enumeration
    lines = [
        "#[derive(Debug, Eq, PartialEq)]",
        "pub enum TreeKind {",
        "    Error,",
        "",
    ]
    lines.extend(f"    {kind}," for kind in tree_kinds)
    lines.extend(
        [
            "}",
            "",
        ]
    )

    # TODO: generate the parse table itself; only the enum is emitted for now.

    # The trailing "" entry makes the joined text end with a newline.
    output.write("\n".join(lines))