// NOTE: much of this parser structure derived from // https://matklad.github.io/2023/05/21/resilient-ll-parsing-tutorial.html use crate::tokens::{Lines, Token, TokenKind, Tokens}; use std::fmt::Write as _; use std::{cell::Cell, num::NonZeroU32}; pub struct SyntaxTree<'a> { trees: Vec>, root: Option, } impl<'a> SyntaxTree<'a> { pub fn new() -> Self { SyntaxTree { trees: vec![], root: None, } } pub fn root(&self) -> Option { self.root } pub fn add_tree(&mut self, mut t: Tree<'a>) -> TreeRef { assert!(t.parent.is_none()); let tr = TreeRef::from_index(self.trees.len()); t.start_pos = t .children .first() .map(|c| c.start_position(&self)) .unwrap_or(0); t.end_pos = t .children .last() .map(|c| c.end_position(&self)) .unwrap_or(t.start_pos); // NOTE: Because of the difficulty of holding multiple mutable // references it's this is our best chance to patch up parent // pointers. for child in t.children.iter() { if let Child::Tree(ct) = child { self[*ct].parent = Some(tr); } } self.trees.push(t); tr } pub fn dump(&self, with_positions: bool) -> String { let mut output = String::new(); if let Some(r) = self.root { self[r].dump(self, with_positions, &mut output); } output } pub fn start_position(&self, t: TreeRef) -> usize { self[t].start_pos } pub fn end_position(&self, t: TreeRef) -> usize { self[t].end_pos } pub fn trees(&self) -> impl Iterator { (0..self.trees.len()).map(|i| TreeRef::from_index(i)) } pub fn find_tree_at(&self, pos: usize) -> Option { let mut current = self.root?; let mut tree = &self[current]; if pos < tree.start_pos || pos >= tree.end_pos { return None; } loop { let mut found = false; for child in &tree.children { if let Child::Tree(next) = child { let next_tree = &self[*next]; if pos >= next_tree.start_pos && pos < next_tree.end_pos { found = true; current = *next; tree = next_tree; break; } } } if !found { return Some(current); } } } } impl<'a> std::ops::Index for SyntaxTree<'a> { type Output = Tree<'a>; fn index(&self, index: TreeRef) -> &Self::Output { &self.trees[index.index()] } } impl<'a> std::ops::IndexMut for SyntaxTree<'a> { fn index_mut(&mut self, index: TreeRef) -> &mut Self::Output { &mut self.trees[index.index()] } } #[derive(Debug, Eq, PartialEq)] pub enum TreeKind { Error, File, FunDecl, ParamList, Parameter, TypeExpression, Block, LetStatement, ReturnStatement, ExpressionStatement, LiteralExpression, GroupingExpression, UnaryExpression, ConditionalExpression, CallExpression, ArgumentList, Argument, BinaryExpression, IfStatement, Identifier, } pub struct Tree<'a> { pub kind: TreeKind, pub parent: Option, // TODO: Do we actually need this? pub start_pos: usize, pub end_pos: usize, pub children: Vec>, } impl<'a> Tree<'a> { pub fn nth_token(&self, index: usize) -> Option<&Token<'a>> { self.children .get(index) .map(|c| match c { Child::Token(t) => Some(t), _ => None, }) .flatten() } pub fn nth_tree(&self, index: usize) -> Option { self.children .get(index) .map(|c| match c { Child::Tree(t) => Some(*t), _ => None, }) .flatten() } } #[derive(Copy, Clone, Eq, PartialEq, Hash, Debug)] pub struct TreeRef(NonZeroU32); impl TreeRef { pub fn from_index(index: usize) -> TreeRef { let index: u32 = (index + 1).try_into().unwrap(); TreeRef(NonZeroU32::new(index).unwrap()) } pub fn index(&self) -> usize { let index: usize = self.0.get().try_into().unwrap(); index - 1 } } impl<'a> Tree<'a> { pub fn dump(&self, tree: &SyntaxTree<'a>, with_positions: bool, output: &mut String) { let _ = write!(output, "{:?}", self.kind); if with_positions { let _ = write!(output, " [{}, {})", self.start_pos, self.end_pos); } let _ = write!(output, "\n"); for child in self.children.iter() { child.dump_rec(2, tree, with_positions, output); } } } pub enum Child<'a> { Token(Token<'a>), Tree(TreeRef), } impl<'a> Child<'a> { fn dump_rec( &self, indent: usize, tree: &SyntaxTree<'a>, with_positions: bool, output: &mut String, ) { for _ in 0..indent { let _ = write!(output, " "); } match self { Child::Token(t) => { let _ = write!(output, "{:?}:'{:?}'", t.kind, t.as_str()); if with_positions { let _ = write!(output, " [{}, {})", t.start, t.start + t.as_str().len()); } let _ = write!(output, "\n"); } Child::Tree(t) => { let t = &tree[*t]; let _ = write!(output, "{:?}", t.kind); if with_positions { let _ = write!(output, " [{}, {})", t.start_pos, t.end_pos); } let _ = write!(output, "\n"); for child in t.children.iter() { child.dump_rec(indent + 2, tree, with_positions, output); } } } } pub fn start_position(&self, syntax_tree: &SyntaxTree) -> usize { match &self { Child::Token(t) => t.start, Child::Tree(t) => syntax_tree[*t].start_pos, } } pub fn end_position(&self, syntax_tree: &SyntaxTree) -> usize { match &self { Child::Token(t) => t.start + t.as_str().len(), Child::Tree(t) => syntax_tree[*t].end_pos, } } } enum ParseEvent<'a> { Start { kind: TreeKind }, End, Advance { token: Token<'a> }, } struct MarkStarted { index: usize, } struct MarkClosed { index: usize, } struct CParser<'a> { tokens: Tokens<'a>, current: Token<'a>, fuel: Cell, events: Vec>, } impl<'a> CParser<'a> { fn new(tokens: Tokens<'a>) -> Self { let mut parser = CParser { tokens, current: Token::new(TokenKind::EOF, 0, ""), fuel: Cell::new(256), events: Vec::new(), }; parser.current = parser.tokens.next(); parser.skip_ephemera(); parser } fn start(&mut self) -> MarkStarted { let mark = MarkStarted { index: self.events.len(), }; self.events.push(ParseEvent::Start { kind: TreeKind::Error, }); mark } fn end(&mut self, mark: MarkStarted, kind: TreeKind) -> MarkClosed { self.events[mark.index] = ParseEvent::Start { kind }; self.events.push(ParseEvent::End); MarkClosed { index: mark.index } } fn start_before(&mut self, mark: MarkClosed) -> MarkStarted { // TODO: Point backwards and pointer chase in tree build? let mark = MarkStarted { index: mark.index }; self.events.insert( mark.index, ParseEvent::Start { kind: TreeKind::Error, }, ); mark } fn advance(&mut self) { assert!(!self.eof()); // Don't try to advance past EOF self.fuel.set(256); // Consuming a token, rest stuck detector self.events.push(ParseEvent::Advance { token: self.current.clone(), }); self.current = self.tokens.next(); self.skip_ephemera(); } fn skip_ephemera(&mut self) { while self.current.kind == TokenKind::Whitespace || self.current.kind == TokenKind::Comment { self.current = self.tokens.next(); } } fn eof(&self) -> bool { self.current.kind == TokenKind::EOF } fn peek(&self) -> TokenKind { assert!(self.fuel.get() > 0, "parser is stuck!"); self.fuel.set(self.fuel.get() - 1); self.current.kind } fn at(&self, kind: TokenKind) -> bool { self.peek() == kind } fn eat(&mut self, kind: TokenKind) -> bool { if self.at(kind) { self.advance(); true } else { false } } fn expect(&mut self, kind: TokenKind, error: T) where T: Into, { if self.eat(kind) { return; } self.error(error); } fn advance_with_error(&mut self, error: T) -> MarkClosed where T: Into, { let m = self.start(); self.error(error); self.advance(); self.end(m, TreeKind::Error) } fn error(&mut self, message: T) where T: Into, { self.error_at(self.current.clone(), message) } fn error_at(&mut self, token: Token<'a>, message: T) where T: Into, { let message: String = message.into(); let mut final_message = "Error ".to_string(); if token.kind == TokenKind::EOF { final_message.push_str("at end") } else if token.kind != TokenKind::Error { final_message.push_str("at '"); final_message.push_str(token.as_str()); final_message.push_str("'"); } final_message.push_str(": "); final_message.push_str(&message); self.events.push(ParseEvent::Advance { token: Token::error(token.start, final_message), }); } fn build_tree(self) -> (SyntaxTree<'a>, Lines) { let mut events = self.events; let mut stack = Vec::new(); let mut result = SyntaxTree::new(); // The first element in our events vector must be a start; the whole // thing must be bracketed in a tree. assert!(matches!(events.get(0), Some(ParseEvent::Start { .. }))); // The last element in our events vector must be an end, otherwise // the parser has failed badly. We'll remove it here so that, after // processing the entire array, the stack retains the tree that we // start with the very first ::Start. assert!(matches!(events.pop(), Some(ParseEvent::End))); for event in events { match event { ParseEvent::Start { kind } => stack.push(Tree { kind, parent: None, start_pos: 0, end_pos: 0, children: Vec::new(), }), ParseEvent::End => { let t = result.add_tree(stack.pop().unwrap()); stack.last_mut().unwrap().children.push(Child::Tree(t)); } ParseEvent::Advance { token } => { stack.last_mut().unwrap().children.push(Child::Token(token)); } } } assert!(stack.len() == 1, "Not all trees were ended!"); let root = result.add_tree(stack.pop().unwrap()); result.root = Some(root); (result, self.tokens.lines()) } } pub fn parse(source: &str) -> (SyntaxTree, Lines) { let tokens = Tokens::new(source); let mut parser = CParser::new(tokens); file(&mut parser); parser.build_tree() } fn file(p: &mut CParser) { let m = p.start(); while !p.eof() { match p.peek() { TokenKind::Fun => function(p), _ => statement(p), } } p.end(m, TreeKind::File); } fn function(p: &mut CParser) { assert!(p.at(TokenKind::Fun)); let m = p.start(); p.expect(TokenKind::Fun, "expected a function to start with 'fun'"); p.expect(TokenKind::Identifier, "expected a function name"); if p.at(TokenKind::LeftParen) { param_list(p); } if p.eat(TokenKind::Arrow) { type_expr(p); } if p.at(TokenKind::LeftBrace) { block(p); } p.end(m, TreeKind::FunDecl); } fn param_list(p: &mut CParser) { assert!(p.at(TokenKind::LeftParen)); let m = p.start(); p.expect(TokenKind::LeftParen, "expect '(' to start a parameter list"); while !p.at(TokenKind::RightParen) && !p.eof() { if p.at(TokenKind::Identifier) { parameter(p); } else { break; } } p.expect(TokenKind::RightParen, "expect ')' to end a parameter list"); p.end(m, TreeKind::ParamList); } fn parameter(p: &mut CParser) { assert!(p.at(TokenKind::Identifier)); let m = p.start(); p.expect( TokenKind::Identifier, "expected an identifier for a parameter name", ); if p.eat(TokenKind::Colon) { type_expr(p); } if !p.at(TokenKind::RightParen) { p.expect(TokenKind::Comma, "expected a comma between parameters"); } p.end(m, TreeKind::Parameter); } fn type_expr(p: &mut CParser) { let m = p.start(); // TODO: Other kinds of type expressions probably! p.expect(TokenKind::Identifier, "expected the identifier of a type"); p.end(m, TreeKind::TypeExpression); } fn block(p: &mut CParser) { assert!(p.at(TokenKind::LeftBrace)); let m = p.start(); p.expect(TokenKind::LeftBrace, "expect '{' to start a block"); while !p.at(TokenKind::RightBrace) && !p.eof() { statement(p); } p.expect(TokenKind::RightBrace, "expect '}' to start a block"); p.end(m, TreeKind::Block); } fn statement(p: &mut CParser) { match p.peek() { TokenKind::LeftBrace => block(p), TokenKind::Let => statement_let(p), TokenKind::Return => statement_return(p), // NOTE: Technically 'if' is an expression, but `if` doesn't // require a semicolon at the end if it's all by itself. TokenKind::If => statement_if(p), _ => statement_expression(p), } } fn statement_if(p: &mut CParser) { assert!(p.at(TokenKind::If)); let m = p.start(); conditional(p); p.end(m, TreeKind::IfStatement); } fn statement_let(p: &mut CParser) { assert!(p.at(TokenKind::Let)); let m = p.start(); p.expect(TokenKind::Let, "expect 'let' to start a let statement"); p.expect(TokenKind::Identifier, "expected a name for the variable"); p.expect(TokenKind::Equal, "expected a '=' after the variable name"); expression(p); if !p.at(TokenKind::RightBrace) { p.expect(TokenKind::Semicolon, "expect ';' to end a let statement"); } p.end(m, TreeKind::LetStatement); } fn statement_return(p: &mut CParser) { assert!(p.at(TokenKind::Return)); let m = p.start(); p.expect( TokenKind::Return, "expect 'return' to start a return statement", ); expression(p); if !p.at(TokenKind::RightBrace) { p.expect(TokenKind::Semicolon, "expect ';' to end a return statement"); } p.end(m, TreeKind::ReturnStatement); } fn statement_expression(p: &mut CParser) { let m = p.start(); expression(p); if !p.at(TokenKind::RightBrace) { p.expect( TokenKind::Semicolon, "expect ';' to end an expression statement", ); } p.end(m, TreeKind::ExpressionStatement); } fn expression(p: &mut CParser) { expression_with_power(p, 0) } // BINDING POWERS. When parsing expressions we only accept expressions that // meet a minimum binding power. (This is like "precedence" but I just super // don't like that terminology.) const ASSIGNMENT_POWER: u8 = 0; // = const OR_POWER: u8 = 1; // or const AND_POWER: u8 = 2; // and const EQUALITY_POWER: u8 = 3; // == != const COMPARISON_POWER: u8 = 4; // < > <= >= const TERM_POWER: u8 = 5; // + - const FACTOR_POWER: u8 = 6; // * / const UNARY_POWER: u8 = 7; // ! - // const PRIMARY_POWER: u8 = 9; fn token_power<'a>(token: TokenKind) -> Option { match token { TokenKind::Equal => Some(ASSIGNMENT_POWER), TokenKind::Or => Some(OR_POWER), TokenKind::And => Some(AND_POWER), TokenKind::EqualEqual | TokenKind::BangEqual => Some(EQUALITY_POWER), TokenKind::Less | TokenKind::Greater | TokenKind::GreaterEqual | TokenKind::LessEqual => { Some(COMPARISON_POWER) } TokenKind::Plus | TokenKind::Minus => Some(TERM_POWER), TokenKind::Star | TokenKind::Slash => Some(FACTOR_POWER), _ => None, } } fn expression_with_power(p: &mut CParser, minimum_power: u8) { let mut expr = prefix_expression(p); while p.at(TokenKind::LeftParen) { let m = p.start_before(expr); argument_list(p); expr = p.end(m, TreeKind::CallExpression); } loop { let Some(power) = token_power(p.peek()) else { break; }; if power < minimum_power { break; } // TODO: I don't think this works for other "infix" types, but we'll // see won't we. let m = p.start_before(expr); p.advance(); // Consume the operator expression_with_power(p, power); expr = p.end(m, TreeKind::BinaryExpression); } } fn argument_list(p: &mut CParser) { assert!(p.at(TokenKind::LeftParen)); let m = p.start(); p.expect( TokenKind::LeftParen, "expect an argument list to start with '('", ); while !p.at(TokenKind::RightParen) && !p.eof() { argument(p); } p.expect( TokenKind::RightParen, "expect an argument list to start with '('", ); p.end(m, TreeKind::ArgumentList); } fn argument(p: &mut CParser) { let m = p.start(); expression(p); if !p.at(TokenKind::RightParen) { p.expect(TokenKind::Comma, "expect a ',' between arguments"); } p.end(m, TreeKind::Argument); } fn prefix_expression(p: &mut CParser) -> MarkClosed { match p.peek() { TokenKind::Number => literal(p), TokenKind::String => literal(p), TokenKind::True => literal(p), TokenKind::False => literal(p), TokenKind::LeftParen => grouping(p), TokenKind::Bang => unary(p), TokenKind::Minus => unary(p), TokenKind::If => conditional(p), TokenKind::Identifier => identifier(p), _ => p.advance_with_error("expected an expression"), } } fn literal(p: &mut CParser) -> MarkClosed { let m = p.start(); p.advance(); p.end(m, TreeKind::LiteralExpression) } fn grouping(p: &mut CParser) -> MarkClosed { assert!(p.at(TokenKind::LeftParen)); let m = p.start(); p.expect(TokenKind::LeftParen, "expected '(' to start grouping"); expression(p); p.expect(TokenKind::RightParen, "unmatched parentheses in expression"); p.end(m, TreeKind::GroupingExpression) } fn unary(p: &mut CParser) -> MarkClosed { let m = p.start(); p.advance(); // Past the operator expression_with_power(p, UNARY_POWER); p.end(m, TreeKind::UnaryExpression) } fn conditional(p: &mut CParser) -> MarkClosed { assert!(p.at(TokenKind::If)); let m = p.start(); p.expect(TokenKind::If, "expected conditional to start with 'if'"); expression(p); block(p); if p.eat(TokenKind::Else) { if p.at(TokenKind::If) { // Don't require another block, just jump right into the conditional. conditional(p); } else { block(p); } } p.end(m, TreeKind::ConditionalExpression) } fn identifier(p: &mut CParser) -> MarkClosed { assert!(p.at(TokenKind::Identifier)); let m = p.start(); p.advance(); p.end(m, TreeKind::Identifier) } #[cfg(test)] mod tests { use super::*; #[test] fn tree_ref_size() { // What's the point of doing all that work if the tree ref isn't nice // and "small"? // // TODO: This is a dumb optimization because tokens are // huge so Child is huge no matter what we do. If we retain // tokens out of line then we can re-visit this optimization. assert_eq!(4, std::mem::size_of::>()); } }