diff --git a/fine/src/parser.rs b/fine/src/parser.rs
index da900c5b..5ab842ca 100644
--- a/fine/src/parser.rs
+++ b/fine/src/parser.rs
@@ -1,5 +1,5 @@
 use crate::tokens::{Lines, Token, TokenKind, Tokens};
-use std::fmt;
+use std::{cell::Cell, fmt};
 
 // TODO: An error should have:
 //
@@ -157,7 +157,536 @@ impl std::fmt::Display for Type {
     }
 }
 
-pub struct TypeRef(Option);
+// NOTE: much of this parser structure derived from
+//   https://matklad.github.io/2023/05/21/resilient-ll-parsing-tutorial.html
+/// The kind of a node in the syntax tree built by `c_parse`.
+pub enum TreeKind {
+    Error,
+    File,
+    FunDecl,
+    ParamList,
+    Parameter,
+    TypeExpression,
+    Block,
+    LetStatement,
+    ReturnStatement,
+    ExpressionStatement,
+    LiteralExpression,
+    GroupingExpression,
+    UnaryExpression,
+    ConditionalExpression,
+    CallExpression,
+    ArgumentList,
+    Argument,
+    BinaryExpression,
+    IfStatement,
+}
+
+/// A syntax tree node: a kind plus its child tokens and subtrees, in the order they were parsed.
+pub struct Tree<'a> {
+    pub kind: TreeKind,
+    // TODO: Indirect reference? Flatness? Using a reference structure will
+    //       make caching and annotation easier if desired.
+    pub children: Vec<Child<'a>>,
+}
+
+/// A child of a `Tree`: either a single token or a nested subtree.
+pub enum Child<'a> {
+    Token(Token<'a>),
+    Tree(Tree<'a>),
+}
+
+/// Flat record of parser actions; replayed by `CParser::build_tree` to build the tree.
+enum ParseEvent<'a> {
+    Start { kind: TreeKind },
+    End,
+    Advance { token: Token<'a> },
+}
+
+/// Index of a `Start` event whose tree has not been ended yet.
+struct MarkStarted {
+    index: usize,
+}
+
+/// Index of the `Start` event of a tree that has already been ended.
+struct MarkClosed {
+    index: usize,
+}
+
+/// Parser state: token source, current token, stuck-detector fuel, and recorded events.
+struct CParser<'a> {
+    tokens: Tokens<'a>,
+    current: Token<'a>,
+    fuel: Cell<u32>,
+    events: Vec<ParseEvent<'a>>,
+}
+
+impl<'a> CParser<'a> {
+    /// Creates a parser and primes `current` with the first token.
+    fn new(tokens: Tokens<'a>) -> Self {
+        let mut parser = CParser {
+            tokens,
+            current: Token::new(TokenKind::EOF, 0, ""),
+            fuel: Cell::new(256),
+            events: Vec::new(),
+        };
+        parser.current = parser.tokens.next();
+        parser
+    }
+
+    /// Opens a tree, provisionally of kind `Error`, and returns a mark to pass to `end`.
+    fn start(&mut self) -> MarkStarted {
+        let mark = MarkStarted {
+            index: self.events.len(),
+        };
+        self.events.push(ParseEvent::Start {
+            kind: TreeKind::Error,
+        });
+        mark
+    }
+
+    /// Closes the tree opened at `mark`, recording its real `kind`.
+    fn end(&mut self, mark: MarkStarted, kind: TreeKind) -> MarkClosed {
+        self.events[mark.index] = ParseEvent::Start { kind };
+        self.events.push(ParseEvent::End);
+        MarkClosed { index: mark.index }
+    }
+
+    /// Opens a new tree that wraps the already-closed tree at `mark`.
+    fn start_before(&mut self, mark: MarkClosed) -> MarkStarted {
+        // TODO: Point backwards and pointer chase in tree build?
+        let mark = MarkStarted { index: mark.index };
+        self.events.insert(
+            mark.index,
+            ParseEvent::Start {
+                kind: TreeKind::Error,
+            },
+        );
+        mark
+    }
+
+    /// Records the current token and moves to the next one; panics at EOF.
+    fn advance(&mut self) {
+        assert!(!self.eof()); // Don't try to advance past EOF
+        self.fuel.set(256); // Consuming a token, reset stuck detector
+        self.events.push(ParseEvent::Advance {
+            token: self.current.clone(),
+        });
+        self.current = self.tokens.next();
+    }
+
+    /// Returns true when the current token is EOF.
+    fn eof(&self) -> bool {
+        self.current.kind == TokenKind::EOF
+    }
+
+    /// Returns the current token kind, burning one unit of fuel; panics when fuel runs out.
+    fn peek(&self) -> TokenKind {
+        assert!(self.fuel.get() > 0, "parser is stuck!");
+        self.fuel.set(self.fuel.get() - 1);
+        self.current.kind
+    }
+
+    /// Returns true when the current token has the given kind.
+    fn at(&self, kind: TokenKind) -> bool {
+        self.peek() == kind
+    }
+
+    /// Consumes the current token if it has the given kind; returns whether it did.
+    fn eat(&mut self, kind: TokenKind) -> bool {
+        if self.at(kind) {
+            self.advance();
+            true
+        } else {
+            false
+        }
+    }
+
+    /// Consumes a token of the given kind, or records `error` without consuming anything.
+    fn expect<T>(&mut self, kind: TokenKind, error: T)
+    where
+        T: Into<String>,
+    {
+        if self.eat(kind) {
+            return;
+        }
+        self.error(error);
+    }
+
+    /// Records `error` and wraps the offending token in an `Error` tree.
+    /// At EOF there is no token to skip, so the `Error` tree holds only the message.
+    fn advance_with_error<T>(&mut self, error: T) -> MarkClosed
+    where
+        T: Into<String>,
+    {
+        let m = self.start();
+        self.error(error);
+        if !self.eof() {
+            self.advance();
+        }
+        self.end(m, TreeKind::Error)
+    }
+
+    /// Records an error message located at the current token.
+    fn error<T>(&mut self, message: T)
+    where
+        T: Into<String>,
+    {
+        self.error_at(self.current.clone(), message)
+    }
+
+    /// Records a formatted error message as an error token positioned at `token.start`.
+    fn error_at<T>(&mut self, token: Token<'a>, message: T)
+    where
+        T: Into<String>,
+    {
+        let message: String = message.into();
+        let mut final_message = "Error ".to_string();
+
+        if token.kind == TokenKind::EOF {
+            final_message.push_str("at end")
+        } else if token.kind != TokenKind::Error {
+            final_message.push_str("at '");
+            final_message.push_str(token.as_str());
+            final_message.push_str("'");
+        }
+        final_message.push_str(": ");
+        final_message.push_str(&message);
+
+        self.events.push(ParseEvent::Advance {
+            token: Token::error(token.start, final_message),
+        });
+    }
+
+    /// Replays the recorded events into a single tree, paired with `Tokens::lines()`.
+    fn build_tree(self) -> (Tree<'a>, Lines) {
+        let mut events = self.events;
+        let mut stack = Vec::new();
+
+        // Special case: pop the last `End` event to ensure that the stack
+        // is non-empty inside the loop.
+        assert!(matches!(events.pop(), Some(ParseEvent::End)));
+
+        for event in events {
+            match event {
+                ParseEvent::Start { kind } => stack.push(Tree {
+                    kind,
+                    children: Vec::new(),
+                }),
+
+                ParseEvent::End => {
+                    let tree = stack.pop().unwrap();
+                    stack.last_mut().unwrap().children.push(Child::Tree(tree));
+                }
+
+                ParseEvent::Advance { token } => {
+                    stack.last_mut().unwrap().children.push(Child::Token(token));
+                }
+            }
+        }
+
+        assert!(stack.len() == 1, "Not all trees were ended!");
+        (stack.pop().unwrap(), self.tokens.lines())
+    }
+}
+
+/// Parses `source` into a syntax tree; syntax errors are recorded as error tokens in the tree.
+pub fn c_parse(source: &str) -> (Tree, Lines) {
+    let tokens = Tokens::new(source);
+    let mut parser = CParser::new(tokens);
+
+    file(&mut parser);
+
+    parser.build_tree()
+}
+
+/// File: a sequence of function declarations and statements up to EOF.
+fn file(p: &mut CParser) {
+    let m = p.start();
+    while !p.eof() {
+        match p.peek() {
+            TokenKind::Fun => function(p),
+            _ => statement(p),
+        }
+    }
+    p.end(m, TreeKind::File);
+}
+
+/// FunDecl: 'fun' name, then an optional parameter list, '->' return type, and body block.
+fn function(p: &mut CParser) {
+    assert!(p.at(TokenKind::Fun));
+    let m = p.start();
+
+    p.expect(TokenKind::Fun, "expected a function to start with 'fun'");
+    p.expect(TokenKind::Identifier, "expected a function name");
+    if p.at(TokenKind::LeftParen) {
+        param_list(p);
+    }
+    if p.eat(TokenKind::Arrow) {
+        type_expr(p);
+    }
+    if p.at(TokenKind::LeftBrace) {
+        block(p);
+    }
+
+    p.end(m, TreeKind::FunDecl);
+}
+
+/// ParamList: '(' parameters ')'; stops early at the first non-identifier token.
+fn param_list(p: &mut CParser) {
+    assert!(p.at(TokenKind::LeftParen));
+    let m = p.start();
+
+    p.expect(TokenKind::LeftParen, "expect '(' to start a parameter list");
+    while !p.at(TokenKind::RightParen) && !p.eof() {
+        if p.at(TokenKind::Identifier) {
+            parameter(p);
+        } else {
+            break;
+        }
+    }
+    p.expect(TokenKind::RightParen, "expect ')' to end a parameter list");
+
+    p.end(m, TreeKind::ParamList);
+}
+
+/// Parameter: name, optional ':' type, and a ',' unless ')' follows.
+fn parameter(p: &mut CParser) {
+    assert!(p.at(TokenKind::Identifier));
+    let m = p.start();
+    p.expect(
+        TokenKind::Identifier,
+        "expected an identifier for a parameter name",
+    );
+    if p.eat(TokenKind::Colon) {
+        type_expr(p);
+    }
+    if !p.at(TokenKind::RightParen) {
+        p.expect(TokenKind::Comma, "expected a comma between parameters");
+    }
+
+    p.end(m, TreeKind::Parameter);
+}
+
+/// TypeExpression: currently just a single identifier.
+fn type_expr(p: &mut CParser) {
+    let m = p.start();
+    // TODO: Other kinds of type expressions probably!
+    p.expect(TokenKind::Identifier, "expected the identifier of a type");
+    p.end(m, TreeKind::TypeExpression);
+}
+
+/// Block: '{' statements '}'.
+fn block(p: &mut CParser) {
+    assert!(p.at(TokenKind::LeftBrace));
+    let m = p.start();
+
+    p.expect(TokenKind::LeftBrace, "expect '{' to start a block");
+    while !p.at(TokenKind::RightBrace) && !p.eof() {
+        statement(p);
+    }
+    p.expect(TokenKind::RightBrace, "expect '}' to end a block");
+
+    p.end(m, TreeKind::Block);
+}
+
+/// Dispatches to the statement form selected by the current token.
+fn statement(p: &mut CParser) {
+    match p.peek() {
+        TokenKind::LeftBrace => block(p),
+        TokenKind::Let => statement_let(p),
+        TokenKind::Return => statement_return(p),
+
+        // NOTE: Technically 'if' is an expression, but `if` doesn't
+        //       require a semicolon at the end if it's all by itself.
+        TokenKind::If => statement_if(p),
+
+        _ => statement_expression(p),
+    }
+}
+
+/// IfStatement: a conditional used as a statement (no trailing ';').
+fn statement_if(p: &mut CParser) {
+    assert!(p.at(TokenKind::If));
+    let m = p.start();
+
+    conditional(p);
+
+    p.end(m, TreeKind::IfStatement);
+}
+
+/// LetStatement: 'let' name '=' expression ';'.
+fn statement_let(p: &mut CParser) {
+    assert!(p.at(TokenKind::Let));
+    let m = p.start();
+
+    p.expect(TokenKind::Let, "expect 'let' to start a let statement");
+    p.expect(TokenKind::Identifier, "expected a name for the variable");
+    p.expect(TokenKind::Equal, "expected a '=' after the variable name");
+    expression(p);
+    p.expect(TokenKind::Semicolon, "expect ';' to end a let statement");
+
+    p.end(m, TreeKind::LetStatement);
+}
+
+/// ReturnStatement: 'return' expression ';'.
+fn statement_return(p: &mut CParser) {
+    assert!(p.at(TokenKind::Return));
+    let m = p.start();
+
+    p.expect(
+        TokenKind::Return,
+        "expect 'return' to start a return statement",
+    );
+    expression(p);
+    p.expect(TokenKind::Semicolon, "expect ';' to end a return statement");
+
+    p.end(m, TreeKind::ReturnStatement);
+}
+
+/// ExpressionStatement: expression ';'.
+fn statement_expression(p: &mut CParser) {
+    let m = p.start();
+
+    expression(p);
+    p.expect(
+        TokenKind::Semicolon,
+        "expect ';' to end an expression statement",
+    );
+
+    p.end(m, TreeKind::ExpressionStatement);
+}
+
+/// Parses an expression with a minimum binding power of 0.
+fn expression(p: &mut CParser) {
+    expression_with_power(p, 0)
+}
+
+/// Parses a prefix expression, call suffixes, then operators binding at least `minimum_power`.
+fn expression_with_power(p: &mut CParser, minimum_power: u8) {
+    let mut expr = prefix_expression(p);
+    while p.at(TokenKind::LeftParen) {
+        let m = p.start_before(expr);
+        argument_list(p);
+        expr = p.end(m, TreeKind::CallExpression);
+    }
+
+    loop {
+        let Some(power) = token_power(p.peek()) else {
+            break;
+        };
+        if power < minimum_power {
+            break;
+        }
+
+        // TODO: I don't think this works for other "infix" types, but we'll
+        // see won't we.
+        let m = p.start_before(expr);
+        p.advance(); // Consume the operator
+        expression_with_power(p, power);
+        expr = p.end(m, TreeKind::BinaryExpression);
+    }
+}
+
+/// ArgumentList: '(' arguments ')'.
+fn argument_list(p: &mut CParser) {
+    assert!(p.at(TokenKind::LeftParen));
+    let m = p.start();
+
+    p.expect(
+        TokenKind::LeftParen,
+        "expect an argument list to start with '('",
+    );
+    while !p.at(TokenKind::RightParen) && !p.eof() {
+        argument(p);
+    }
+    p.expect(
+        TokenKind::RightParen,
+        "expect an argument list to end with ')'",
+    );
+
+    p.end(m, TreeKind::ArgumentList);
+}
+
+/// Argument: expression, then a ',' unless ')' follows.
+fn argument(p: &mut CParser) {
+    let m = p.start();
+
+    expression(p);
+    if !p.at(TokenKind::RightParen) {
+        p.expect(TokenKind::Comma, "expect a ',' between arguments");
+    }
+
+    p.end(m, TreeKind::Argument);
+}
+
+/// Parses a literal, grouping, unary, or conditional; anything else becomes an `Error` tree.
+fn prefix_expression(p: &mut CParser) -> MarkClosed {
+    match p.peek() {
+        TokenKind::Number => literal(p),
+        TokenKind::String => literal(p),
+        TokenKind::True => literal(p),
+        TokenKind::False => literal(p),
+
+        TokenKind::LeftParen => grouping(p),
+
+        TokenKind::Bang => unary(p),
+        TokenKind::Minus => unary(p),
+
+        TokenKind::If => conditional(p),
+
+        _ => p.advance_with_error("expected an expression"),
+    }
+}
+
+/// LiteralExpression: a single literal token.
+fn literal(p: &mut CParser) -> MarkClosed {
+    let m = p.start();
+    p.advance();
+    p.end(m, TreeKind::LiteralExpression)
+}
+
+/// GroupingExpression: '(' expression ')'.
+fn grouping(p: &mut CParser) -> MarkClosed {
+    assert!(p.at(TokenKind::LeftParen));
+    let m = p.start();
+
+    p.expect(TokenKind::LeftParen, "expected '(' to start grouping");
+    expression(p);
+    p.expect(TokenKind::RightParen, "unmatched parentheses in expression");
+
+    p.end(m, TreeKind::GroupingExpression)
+}
+
+/// UnaryExpression: an operator token followed by an operand parsed at `UNARY_POWER`.
+fn unary(p: &mut CParser) -> MarkClosed {
+    let m = p.start();
+
+    p.advance(); // Past the operator
+    expression_with_power(p, UNARY_POWER);
+
+    p.end(m, TreeKind::UnaryExpression)
+}
+
+/// ConditionalExpression: 'if' expression block ['else' (conditional | block)].
+fn conditional(p: &mut CParser) -> MarkClosed {
+    assert!(p.at(TokenKind::If));
+    let m = p.start();
+
+    p.expect(TokenKind::If, "expected conditional to start with 'if'");
+    expression(p);
+    block(p);
+    if p.eat(TokenKind::Else) {
+        if p.at(TokenKind::If) {
+            // Don't require another block, just jump right into the conditional.
+            conditional(p);
+        } else {
+            block(p);
+        }
+    }
+
+    p.end(m, TreeKind::ConditionalExpression)
+}
 
 pub struct SyntaxTree<'a> {
     pub errors: Vec,
@@ -431,8 +960,8 @@ const UNARY_POWER: u8 = 7; // ! -
 // const CALL_POWER: u8 = 8; // . ()
 // const PRIMARY_POWER: u8 = 9;
 
-fn token_power<'a>(token: &Token<'a>) -> Option<u8> {
-    match token.kind {
+fn token_power(token: TokenKind) -> Option<u8> {
+    match token {
         TokenKind::Equal => Some(ASSIGNMENT_POWER),
         TokenKind::Or => Some(OR_POWER),
         TokenKind::And => Some(AND_POWER),
@@ -483,7 +1012,7 @@ impl<'a> Parser<'a> {
         self.advance();
         let mut expr = self.prefix_expression();
         loop {
-            let power = match token_power(&self.current) {
+            let power = match token_power(self.current.kind) {
                 Some(p) => p,
                 None => break, // EOF, end of expression?
             };
diff --git a/fine/src/tokens.rs b/fine/src/tokens.rs
index 36a73ea6..ceaa22f0 100644
--- a/fine/src/tokens.rs
+++ b/fine/src/tokens.rs
@@ -17,6 +17,9 @@ pub enum TokenKind {
     Slash,
     Star,
 
+    Arrow,
+    Colon,
+
     Bang,
     BangEqual,
     Equal,
@@ -414,8 +417,15 @@ impl<'a> Tokens<'a> {
             ')' => self.token(pos, TokenKind::RightParen),
             ',' => self.token(pos, TokenKind::Comma),
             '.' => self.token(pos, TokenKind::Dot),
-            '-' => self.token(pos, TokenKind::Minus),
+            '-' => {
+                if self.matches('>') {
+                    self.token(pos, TokenKind::Arrow)
+                } else {
+                    self.token(pos, TokenKind::Minus)
+                }
+            }
             '+' => self.token(pos, TokenKind::Plus),
+            ':' => self.token(pos, TokenKind::Colon),
             ';' => self.token(pos, TokenKind::Semicolon),
             '/' => self.token(pos, TokenKind::Slash),
             '*' => self.token(pos, TokenKind::Star),
@@ -555,7 +565,7 @@ mod tests {
 
     test_tokens!(
         symbols,
-        "{ } ( ) [ ] . ! != < <= > >= = == , - + * / ;",
+        "{ } ( ) [ ] . ! != < <= > >= = == , - -> + * / ; :",
         (0, LeftBrace, "{"),
         (2, RightBrace, "}"),
         (4, LeftParen, "("),
@@ -573,9 +583,11 @@ mod tests {
         (31, EqualEqual, "=="),
         (34, Comma, ","),
         (36, Minus, "-"),
-        (38, Plus, "+"),
-        (40, Star, "*"),
-        (42, Slash, "/"),
-        (44, Semicolon, ";")
+        (38, Arrow, "->"),
+        (41, Plus, "+"),
+        (43, Star, "*"),
+        (45, Slash, "/"),
+        (47, Semicolon, ";"),
+        (49, Colon, ":")
     );
 }