// NOTE: Utterly Broken Ideas about Parse Tables. // // Committing this here so I can back it up. use std::collections::HashSet; #[derive(Copy, Clone, Debug, Eq, PartialEq, Hash)] pub enum ReduceRule { // Generated AlternateType, Argument, ArgumentList, BinaryExpression, Block, CallExpression, ClassDecl, ConditionalExpression, ExpressionStatement, FieldDecl, FieldList, FieldValue, File, ForStatement, FunctionDecl, GroupingExpression, Identifier, IfStatement, IsExpression, IteratorVariable, LetStatement, ListConstructor, ListConstructorElement, LiteralExpression, MatchArm, MatchBody, MatchExpression, MemberAccess, NewObjectExpression, ParamList, Parameter, Pattern, ReturnStatement, ReturnType, SelfParameter, SelfReference, TypeExpression, TypeIdentifier, TypeParameter, TypeParameterList, UnaryExpression, VariableBinding, WhileStatement, WildcardPattern, Import, Export, ExportList, } #[derive(Eq, PartialEq, Hash, Copy, Clone)] pub enum TokenAction { Error, Reduce(ReduceRule, TreeKind, u16), ReduceAnonymous(ReduceRule, u16), Accept, Shift(u16), } pub struct ParseState { action_start: usize, action_end: usize, goto_start: usize, goto_end: usize, } pub struct ParseTable<'a> { state: &'a [ParseState], start_state: usize, token_action: &'a [TokenAction], token_kind: &'a [TokenKind], tree_goto: &'a [u16], tree_rules: &'a [ReduceRule], } #[derive(Clone)] enum StackEntry { Nothing, Tree(TreeRef), AnonTree(Vec), Token(TokenRef), Error(TokenRef), } #[derive(Clone)] struct ParseThread { stack: Vec<(usize, StackEntry)>, panic_count: u8, error_count: u8, score: u32, } impl ParseThread { fn initial(start_state: usize) -> ParseThread { ParseThread { stack: vec![(start_state, StackEntry::Nothing)], error_count: 0, panic_count: 0, score: 0, } } fn reduce( &mut self, table: &ParseTable, syntax: &mut SyntaxTree, count: u16, rule: ReduceRule, kind: Option, ) { let mut children = Vec::new(); let count: usize = count.into(); let mut consumed = 0; while consumed < count { let Some((_, value)) = self.stack.pop() else { break; }; match value { StackEntry::Nothing => panic!("Popped nothing!"), StackEntry::Tree(t) => { consumed += 1; children.push(Child::Tree(t)); } StackEntry::AnonTree(mut cs) => { consumed += 1; children.append(&mut cs); } StackEntry::Token(t) => { consumed += 1; children.push(Child::Token(t)); } StackEntry::Error(t) => { // Do *not* increment consumed; these don't count! children.push(Child::Token(t)); } } } assert_eq!(consumed, count, "Stack underflow on reduce"); let value = if let Some(kind) = kind { let tr = syntax.add_tree(Tree { kind, self_ref: TreeRef::from_index(0), parent: None, start_pos: 0, end_pos: 0, children, }); StackEntry::Tree(tr) } else { StackEntry::AnonTree(children) }; let (goto_index, _) = self.stack.last().unwrap(); let goto_state = &table.state[*goto_index]; let index: usize = (goto_state.goto_start..goto_state.goto_end) .find(|i| table.tree_rules[*i] == rule) .expect("Unable to goto target after reduction") .into(); let target_state: usize = table.tree_goto[index].into(); self.stack.push((target_state, value)); } fn shift(&mut self, state: u16, tr: TokenRef) { let target_state: usize = state.into(); self.stack.push((target_state, StackEntry::Token(tr))); } } // This is what we set the panic level to when we get an error; we require // this many successful token shifts to decide we're not lost. const PANIC_THRESHOLD: u8 = 3; // This is the maximum number of failed states that we're going to go through // before we just try to reduce all the way out of the tree. const THREAD_ERROR_LIMIT: u8 = 20; pub fn table_parse(source: &str, table: &ParseTable) -> (Rc, Rc) { let mut tokens = Tokens::new(source); let mut syntax = SyntaxTree::new(); let mut threads = vec![ParseThread::initial(table.start_state)]; let mut next_threads = vec![]; let mut accepted_threads: Vec = vec![]; let mut maybe_pushed_garbage = false; // While we still have threads to run.... while threads.len() > 0 { // We've still got live threads running, which means we've still got // tokens to consume! Any thread that has accepted "early" should be // penalized here. for thread in accepted_threads.iter_mut() { if thread.score > 0 { thread.score -= 1; } } // Grab us the next token from the stream. // TODO: Collect ephemera before setting on the token. let token = tokens.next(); let current_token = token.kind; let current_token_ref = syntax.add_token(token, vec![]); // Go over every thread in the list of threads to run. If a thread // needs to keep running on this token it can push itself back onto // the stack, and we'll re-consider it next time. (This is necessary // for both reduce and for error handling.) while let Some(mut thread) = threads.pop() { let (state, _) = thread.stack.last().unwrap(); let state = &table.state[*state]; let action = (state.action_start..state.action_end) .find(|i| table.token_kind[*i] == current_token) .map(|i| &table.token_action[i]) .unwrap_or(&TokenAction::Error); match action { TokenAction::Reduce(rule, kind, count) => { thread.reduce(table, &mut syntax, *count, *rule, Some(*kind)); thread.score += 1; threads.push(thread); // Run me again, I can still work with this token. } TokenAction::ReduceAnonymous(rule, count) => { thread.reduce(table, &mut syntax, *count, *rule, None); thread.score += 1; threads.push(thread); // Run me again, I can still work with this token. } TokenAction::Shift(state) => { thread.shift(*state, current_token_ref); thread.score += 1; if thread.panic_count > 0 { thread.panic_count -= 1; } else if thread.error_count > 0 { // TODO: We shifted a good number of tokens in a row, // maybe we should consider reducing the error count // here too, so that this thread might live for // longer. } next_threads.push(thread); } TokenAction::Accept => { thread.score += 1; accepted_threads.push(thread); } // Error handling, the bane of LR parsers! // // In this parser, we borrow a trick from Tree-Sitter and // treat the parse error as if it were an ambiguity: we see a // token but don't know what to do with it, so we'll just try // to do *everything* with it and see what sticks. // // The tricky part here is not causing an enormous explosion // of threads, so we have certain conditions where we just // give up and refuse to consider any more tokens for a given // error thread. // TokenAction::Error => { // First, report the error. (We use a pretty standard // "panic" error recovery mode here to decide when to // start showing new error messages, otherwise we would // just generate *way* too many cascading errors.) // if thread.panic_count == 0 { // TODO: Get a description for this state from the table somehow. // TODO: Describe the error in an error message somehow. let token = &syntax[current_token_ref]; let error_token = syntax.add_token( Token::error(token.start(), token.end(), format!("PARSE ERROR")), vec![], ); // NOTE: `Error` stack entries are not counted when // reducing, so we know this push here won't mess up // the state machine. thread.stack.push((0, StackEntry::Error(error_token))); } // Now mark the thread as panicing so that we don't // produce too many random errors... thread.panic_count = PANIC_THRESHOLD; // Count the error. // TODO: Check to see if this really does help thread explosion or not. if thread.error_count < THREAD_ERROR_LIMIT { thread.error_count += 1; } // Penalize this thread; this is not a great parse, we can tell. if thread.score > 0 { thread.score -= 1; } let mut executed = HashSet::new(); for index in state.action_start..state.action_end { // Make absolutely sure we don't do the same thing // twice! It can happen, and it is hugely wasteful // because it spawns duplicate threads. let action = &table.token_action[index]; if executed.contains(action) { continue; } executed.insert(action.clone()); match action { TokenAction::Error => { panic!("Literal error in the table; table is corrupt") } TokenAction::Reduce(rule, kind, count) => { // Let's pretend that we're done with the // current rule and see what happens. let mut new_thread = thread.clone(); new_thread.reduce(&table, &mut syntax, *count, *rule, Some(*kind)); threads.push(new_thread); // Mark that we might have to trim the syntax // tree because we might not use this // reduction. maybe_pushed_garbage = true; } TokenAction::ReduceAnonymous(rule, count) => { // Let's pretend that we're done with the // current rule and see what happens. let mut new_thread = thread.clone(); new_thread.reduce(&table, &mut syntax, *count, *rule, None); threads.push(new_thread); } TokenAction::Shift(state) => { // Let's just pretend the current token // matched this thing that we were looking // for, and shift it anyway, and see what // happens. // // This represents an expansion of the search // space and so we only want to do it if we // haven't reached our error limit yet. if thread.error_count < THREAD_ERROR_LIMIT { let mut new_thread = thread.clone(); new_thread.shift(*state, current_token_ref); next_threads.push(new_thread); } } TokenAction::Accept => accepted_threads.push(thread.clone()), } } // Let's try to process the *next* token and see what // happens with this same thread, unless we're giving up // on the thread. if thread.error_count < THREAD_ERROR_LIMIT { next_threads.push(thread); } } } } // Drain all the next_threads into the current stack and start again // on the next token! threads.append(&mut next_threads); } // OK no more threads, we're done. In theory at this point we should // penalize all accepted threads for remaining tokens but if we've got no // more threads and there are remaining tokens then they all hit their // error limit and are basically equivalent. (Why penalize all threads by // the same amount?) // // Let's just go through all the threads that "accepted" and pick the one // with the highest score that also wound up with a named tree at the top. let mut best_score = 0; for thread in accepted_threads { if thread.score >= best_score { if let Some((_, StackEntry::Tree(tr))) = thread.stack.last() { syntax.root = Some(*tr); best_score = thread.score + 1; } } } // Now, our syntax tree might have errors in it, and if it does we might // have pushed trees that we have no interest in ever seeing ever again. // That means that we need to rewrite the tree starting from the root, to // make sure that the trees in the syntax tree are for real for real. if maybe_pushed_garbage { let mut valid = HashSet::new(); let mut stack = Vec::new(); if let Some(tr) = &syntax.root { stack.push(*tr); } while let Some(tr) = stack.pop() { valid.insert(tr); for x in syntax[tr].child_trees() { stack.push(x); } } for tr in syntax.trees.iter_mut() { if !valid.contains(&tr.self_ref) { tr.kind = TreeKind::Ignore; } } } (Rc::new(syntax), Rc::new(tokens.lines())) }