diff --git a/Cargo.lock b/Cargo.lock index 4fd9bdb8..509dfde1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -150,7 +150,7 @@ dependencies = [ "proc-macro2", "quote", "swc_macros_common", - "syn 2.0.18", + "syn 2.0.47", ] [[package]] @@ -295,7 +295,7 @@ checksum = "fdde5c9cd29ebd706ce1b35600920a33550e402fc998a2e53ad3b42c3c47a192" dependencies = [ "proc-macro2", "quote", - "syn 2.0.18", + "syn 2.0.47", ] [[package]] @@ -672,6 +672,12 @@ dependencies = [ [[package]] name = "fine" version = "0.1.0" +dependencies = [ + "glob", + "prettyplease", + "quote", + "syn 2.0.47", +] [[package]] name = "flate2" @@ -720,7 +726,7 @@ checksum = "1a5c6c585bc94aaf2c7b51dd4c2ba22680844aba4c687be581871a6f518c5742" dependencies = [ "proc-macro2", "quote", - "syn 2.0.18", + "syn 2.0.47", ] [[package]] @@ -753,7 +759,7 @@ dependencies = [ "pmutil", "proc-macro2", "swc_macros_common", - "syn 2.0.18", + "syn 2.0.47", ] [[package]] @@ -1038,7 +1044,7 @@ dependencies = [ "pmutil", "proc-macro2", "quote", - "syn 2.0.18", + "syn 2.0.47", ] [[package]] @@ -1717,7 +1723,7 @@ checksum = "52a40bc70c2c58040d2d8b167ba9a5ff59fc9dab7ad44771cfde3dcfde7a09c6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.18", + "syn 2.0.47", ] [[package]] @@ -1751,6 +1757,16 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" +[[package]] +name = "prettyplease" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a41cf62165e97c7f814d2221421dbb9afcbcdb0a88068e5ea206e19951c2cbb5" +dependencies = [ + "proc-macro2", + "syn 2.0.47", +] + [[package]] name = "proc-macro-crate" version = "1.3.1" @@ -1769,9 +1785,9 @@ checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068" [[package]] name = "proc-macro2" -version = "1.0.59" +version = "1.0.75" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"6aeca18b86b413c660b781aa319e4e2648a3e6f9eadc9b47e9038e6fe9f3451b" +checksum = "907a61bd0f64c2f29cd1cf1dc34d05176426a3f504a78010f08416ddb7b13708" dependencies = [ "unicode-ident", ] @@ -1793,9 +1809,9 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.28" +version = "1.0.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b9ab9c7eadfd8df19006f1cf1a4aed13540ed5cbc047010ece5826e10825488" +checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef" dependencies = [ "proc-macro2", ] @@ -2011,7 +2027,7 @@ checksum = "d9735b638ccc51c28bf6914d90a2e9725b377144fc612c49a611fddd1b631d68" dependencies = [ "proc-macro2", "quote", - "syn 2.0.18", + "syn 2.0.47", ] [[package]] @@ -2216,7 +2232,7 @@ dependencies = [ "proc-macro2", "quote", "swc_macros_common", - "syn 2.0.18", + "syn 2.0.47", ] [[package]] @@ -2282,7 +2298,7 @@ dependencies = [ "proc-macro2", "quote", "swc_macros_common", - "syn 2.0.18", + "syn 2.0.47", ] [[package]] @@ -2331,7 +2347,7 @@ dependencies = [ "proc-macro2", "quote", "swc_macros_common", - "syn 2.0.18", + "syn 2.0.47", ] [[package]] @@ -2414,7 +2430,7 @@ dependencies = [ "proc-macro2", "quote", "swc_macros_common", - "syn 2.0.18", + "syn 2.0.47", ] [[package]] @@ -2518,7 +2534,7 @@ dependencies = [ "pmutil", "proc-macro2", "quote", - "syn 2.0.18", + "syn 2.0.47", ] [[package]] @@ -2530,7 +2546,7 @@ dependencies = [ "pmutil", "proc-macro2", "quote", - "syn 2.0.18", + "syn 2.0.47", ] [[package]] @@ -2554,7 +2570,7 @@ dependencies = [ "proc-macro2", "quote", "swc_macros_common", - "syn 2.0.18", + "syn 2.0.47", ] [[package]] @@ -2570,9 +2586,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.18" +version = "2.0.47" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32d41677bcbe24c20c52e7c70b0d8db04134c5d1066bf98662e2871ad200ea3e" +checksum = "1726efe18f42ae774cc644f330953a5e7b3c3003d3edcecf18850fe9d4dd9afb" dependencies = [ "proc-macro2", "quote", @@ -2614,7 
+2630,7 @@ checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f" dependencies = [ "proc-macro2", "quote", - "syn 2.0.18", + "syn 2.0.47", ] [[package]] @@ -2704,7 +2720,7 @@ checksum = "5f4f31f56159e98206da9efd823404b79b6ef3143b4a7ab76e67b1751b25a4ab" dependencies = [ "proc-macro2", "quote", - "syn 2.0.18", + "syn 2.0.47", ] [[package]] @@ -2912,7 +2928,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.18", + "syn 2.0.47", "wasm-bindgen-shared", ] @@ -2946,7 +2962,7 @@ checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.18", + "syn 2.0.47", "wasm-bindgen-backend", "wasm-bindgen-shared", ] diff --git a/fine/Cargo.lock b/fine/Cargo.lock index f3a8ee96..535a9196 100644 --- a/fine/Cargo.lock +++ b/fine/Cargo.lock @@ -12,9 +12,19 @@ checksum = "56254986775e3233ffa9c4d7d3faaf6d36a2c09d30b20687e9f88bc8bafc16c8" name = "fine" version = "0.1.0" dependencies = [ + "glob", "pretty_assertions", + "prettyplease", + "quote", + "syn", ] +[[package]] +name = "glob" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" + [[package]] name = "pretty_assertions" version = "1.4.0" @@ -25,6 +35,51 @@ dependencies = [ "yansi", ] +[[package]] +name = "prettyplease" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a41cf62165e97c7f814d2221421dbb9afcbcdb0a88068e5ea206e19951c2cbb5" +dependencies = [ + "proc-macro2", + "syn", +] + +[[package]] +name = "proc-macro2" +version = "1.0.75" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "907a61bd0f64c2f29cd1cf1dc34d05176426a3f504a78010f08416ddb7b13708" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "syn" +version = "2.0.47" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1726efe18f42ae774cc644f330953a5e7b3c3003d3edcecf18850fe9d4dd9afb" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "unicode-ident" +version = "1.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" + [[package]] name = "yansi" version = "0.5.1" diff --git a/fine/Cargo.toml b/fine/Cargo.toml index 3ee646e5..731a7d40 100644 --- a/fine/Cargo.toml +++ b/fine/Cargo.toml @@ -5,3 +5,9 @@ edition = "2021" [dev-dependencies] pretty_assertions = "1.4.0" + +[build-dependencies] +glob = "0.3.1" +prettyplease = "0.2.16" +quote = "1.0.35" +syn = "2.0.47" diff --git a/fine/build.rs b/fine/build.rs new file mode 100644 index 00000000..100b5229 --- /dev/null +++ b/fine/build.rs @@ -0,0 +1,96 @@ +use quote::{format_ident, quote}; +use std::env; +use std::fs; +use std::path::{Path, PathBuf}; + +fn generate_test_for_file(path: PathBuf) -> String { + let contents = fs::read_to_string(&path).expect("Unable to read input"); + + let mut concrete_stuff: Option = None; + + // Start iterating over lines and processing directives.... + let mut lines = contents.lines(); + while let Some(line) = lines.next() { + let line = match line.strip_prefix("//") { + Some(line) => line, + None => break, + }; + + let line = line.trim(); + if line == "concrete:" { + let mut concrete = String::new(); + while let Some(line) = lines.next() { + let line = match line.strip_prefix("// | ") { + Some(line) => line, + None => break, + }; + + concrete.push_str(line); + concrete.push_str("\n"); + } + concrete_stuff = Some(concrete); + } + } + + let concrete_comparison = if let Some(concrete) = concrete_stuff { + quote! 
{ + crate::assert_concrete(&_tree, #concrete) + } + } else { + quote! {} + }; + + let name = format_ident!("{}", path.file_stem().unwrap().to_string_lossy()); + let test_method = quote! { + fn #name() { + let (_tree, _lines) = fine::parser::concrete::parse_concrete(#contents); + #concrete_comparison; + } + }; + + let syntax_tree = syn::parse2(test_method).unwrap(); + prettyplease::unparse(&syntax_tree) +} + +fn process_directory(output: &mut String, path: T) +where + T: AsRef, +{ + let fine_ext: std::ffi::OsString = "fine".into(); + let path = path.as_ref(); + for entry in std::fs::read_dir(path).expect("Unable to read directory") { + match entry { + Ok(dirent) => { + let file_type = dirent.file_type().unwrap(); + if file_type.is_dir() { + let file_name = dirent.file_name(); + let file_name = file_name.to_string_lossy().to_owned(); + output.push_str(&format!("mod {file_name} {{\n")); + process_directory(output, dirent.path()); + output.push_str("}\n\n"); + } else if file_type.is_file() { + if dirent.path().extension() == Some(&fine_ext) { + output.push_str(&format!("// {}\n", dirent.path().display())); + output.push_str("#[test]\n"); + output.push_str(&generate_test_for_file(dirent.path())); + output.push_str("\n\n"); + } + } else { + eprintln!("Skipping symlink: {}", path.display()); + } + } + Err(e) => eprintln!("Unable to read directory entry: {:?}", e), + } + } +} + +fn main() { + println!("cargo:rerun-if-changed=./tests"); + + let mut test_source = String::new(); + process_directory(&mut test_source, "./tests"); + + let out_dir = env::var_os("OUT_DIR").unwrap(); + let dest_path = Path::new(&out_dir).join("generated_tests.rs"); + fs::write(dest_path, test_source).unwrap(); +} diff --git a/fine/src/parser.rs b/fine/src/parser.rs index 5ab842ca..ce8fd3e6 100644 --- a/fine/src/parser.rs +++ b/fine/src/parser.rs @@ -1,5 +1,7 @@ use crate::tokens::{Lines, Token, TokenKind, Tokens}; -use std::{cell::Cell, fmt}; +use std::fmt; + +pub mod concrete; // TODO: An error 
should have: // @@ -157,492 +159,6 @@ impl std::fmt::Display for Type { } } -// NOTE: much of this parser structure derived from -// https://matklad.github.io/2023/05/21/resilient-ll-parsing-tutorial.html -pub enum TreeKind { - Error, - File, - FunDecl, - ParamList, - Parameter, - TypeExpression, - Block, - LetStatement, - ReturnStatement, - ExpressionStatement, - LiteralExpression, - GroupingExpression, - UnaryExpression, - ConditionalExpression, - CallExpression, - ArgumentList, - Argument, - BinaryExpression, - IfStatement, -} - -pub struct Tree<'a> { - pub kind: TreeKind, - // TODO: Indirect reference? Flatness? Using a reference structure will - // make caching and annotation easier if desired. - pub children: Vec>, -} - -pub enum Child<'a> { - Token(Token<'a>), - Tree(Tree<'a>), -} - -enum ParseEvent<'a> { - Start { kind: TreeKind }, - End, - Advance { token: Token<'a> }, -} - -struct MarkStarted { - index: usize, -} - -struct MarkClosed { - index: usize, -} - -struct CParser<'a> { - tokens: Tokens<'a>, - current: Token<'a>, - fuel: Cell, - events: Vec>, -} - -impl<'a> CParser<'a> { - fn new(tokens: Tokens<'a>) -> Self { - let mut parser = CParser { - tokens, - current: Token::new(TokenKind::EOF, 0, ""), - fuel: Cell::new(256), - events: Vec::new(), - }; - parser.current = parser.tokens.next(); - parser - } - - fn start(&mut self) -> MarkStarted { - let mark = MarkStarted { - index: self.events.len(), - }; - self.events.push(ParseEvent::Start { - kind: TreeKind::Error, - }); - mark - } - - fn end(&mut self, mark: MarkStarted, kind: TreeKind) -> MarkClosed { - self.events[mark.index] = ParseEvent::Start { kind }; - self.events.push(ParseEvent::End); - MarkClosed { index: mark.index } - } - - fn start_before(&mut self, mark: MarkClosed) -> MarkStarted { - // TODO: Point backwards and pointer chase in tree build? 
- let mark = MarkStarted { index: mark.index }; - self.events.insert( - mark.index, - ParseEvent::Start { - kind: TreeKind::Error, - }, - ); - mark - } - - fn advance(&mut self) { - assert!(!self.eof()); // Don't try to advance past EOF - self.fuel.set(256); // Consuming a token, rest stuck detector - self.events.push(ParseEvent::Advance { - token: self.current.clone(), - }); - self.current = self.tokens.next(); - } - - fn eof(&self) -> bool { - self.current.kind == TokenKind::EOF - } - - fn peek(&self) -> TokenKind { - assert!(self.fuel.get() > 0, "parser is stuck!"); - self.fuel.set(self.fuel.get() - 1); - self.current.kind - } - - fn at(&self, kind: TokenKind) -> bool { - self.peek() == kind - } - - fn eat(&mut self, kind: TokenKind) -> bool { - if self.at(kind) { - self.advance(); - true - } else { - false - } - } - - fn expect(&mut self, kind: TokenKind, error: T) - where - T: Into, - { - if self.eat(kind) { - return; - } - self.error(error); - } - - fn advance_with_error(&mut self, error: T) -> MarkClosed - where - T: Into, - { - let m = self.start(); - self.error(error); - self.advance(); - self.end(m, TreeKind::Error) - } - - fn error(&mut self, message: T) - where - T: Into, - { - self.error_at(self.current.clone(), message) - } - - fn error_at(&mut self, token: Token<'a>, message: T) - where - T: Into, - { - let message: String = message.into(); - let mut final_message = "Error ".to_string(); - - if token.kind == TokenKind::EOF { - final_message.push_str("at end") - } else if token.kind != TokenKind::Error { - final_message.push_str("at '"); - final_message.push_str(token.as_str()); - final_message.push_str("'"); - } - final_message.push_str(": "); - final_message.push_str(&message); - - self.events.push(ParseEvent::Advance { - token: Token::error(token.start, final_message), - }); - } - - fn build_tree(self) -> (Tree<'a>, Lines) { - let mut events = self.events; - let mut stack = Vec::new(); - - // Special case: pop the last `Close` event to ensure that 
the stack - // is non-empty inside the loop. - assert!(matches!(events.pop(), Some(ParseEvent::End))); - - for event in events { - match event { - ParseEvent::Start { kind } => stack.push(Tree { - kind, - children: Vec::new(), - }), - - ParseEvent::End => { - let tree = stack.pop().unwrap(); - stack.last_mut().unwrap().children.push(Child::Tree(tree)); - } - - ParseEvent::Advance { token } => { - stack.last_mut().unwrap().children.push(Child::Token(token)); - } - } - } - - assert!(stack.len() == 1, "Not all trees were ended!"); - (stack.pop().unwrap(), self.tokens.lines()) - } -} - -pub fn c_parse(source: &str) -> (Tree, Lines) { - let tokens = Tokens::new(source); - let mut parser = CParser::new(tokens); - - file(&mut parser); - - parser.build_tree() -} - -fn file(p: &mut CParser) { - let m = p.start(); - while !p.eof() { - match p.peek() { - TokenKind::Fun => function(p), - _ => statement(p), - } - } - p.end(m, TreeKind::File); -} - -fn function(p: &mut CParser) { - assert!(p.at(TokenKind::Fun)); - let m = p.start(); - - p.expect(TokenKind::Fun, "expected a function to start with 'fun'"); - p.expect(TokenKind::Identifier, "expected a function name"); - if p.at(TokenKind::LeftParen) { - param_list(p); - } - if p.eat(TokenKind::Arrow) { - type_expr(p); - } - if p.at(TokenKind::LeftBrace) { - block(p); - } - - p.end(m, TreeKind::FunDecl); -} - -fn param_list(p: &mut CParser) { - assert!(p.at(TokenKind::LeftParen)); - let m = p.start(); - - p.expect(TokenKind::LeftParen, "expect '(' to start a parameter list"); - while !p.at(TokenKind::RightParen) && !p.eof() { - if p.at(TokenKind::Identifier) { - parameter(p); - } else { - break; - } - } - p.expect(TokenKind::RightParen, "expect ')' to end a parameter list"); - - p.end(m, TreeKind::ParamList); -} - -fn parameter(p: &mut CParser) { - assert!(p.at(TokenKind::Identifier)); - let m = p.start(); - p.expect( - TokenKind::Identifier, - "expected an identifier for a parameter name", - ); - if p.eat(TokenKind::Colon) { - 
type_expr(p); - } - if !p.at(TokenKind::RightParen) { - p.expect(TokenKind::Comma, "expected a comma between parameters"); - } - - p.end(m, TreeKind::Parameter); -} - -fn type_expr(p: &mut CParser) { - let m = p.start(); - // TODO: Other kinds of type expressions probably! - p.expect(TokenKind::Identifier, "expected the identifier of a type"); - p.end(m, TreeKind::TypeExpression); -} - -fn block(p: &mut CParser) { - assert!(p.at(TokenKind::LeftBrace)); - let m = p.start(); - - p.expect(TokenKind::LeftBrace, "expect '{' to start a block"); - while !p.at(TokenKind::RightBrace) && !p.eof() { - statement(p); - } - p.expect(TokenKind::RightBrace, "expect '}' to start a block"); - - p.end(m, TreeKind::Block); -} - -fn statement(p: &mut CParser) { - match p.peek() { - TokenKind::LeftBrace => block(p), - TokenKind::Let => statement_let(p), - TokenKind::Return => statement_return(p), - - // NOTE: Technically 'if' is an expression, but `if` doesn't - // require a semicolon at the end if it's all by itself. 
- TokenKind::If => statement_if(p), - - _ => statement_expression(p), - } -} - -fn statement_if(p: &mut CParser) { - assert!(p.at(TokenKind::If)); - let m = p.start(); - - conditional(p); - - p.end(m, TreeKind::IfStatement); -} - -fn statement_let(p: &mut CParser) { - assert!(p.at(TokenKind::Let)); - let m = p.start(); - - p.expect(TokenKind::Let, "expect 'let' to start a let statement"); - p.expect(TokenKind::Identifier, "expected a name for the variable"); - p.expect(TokenKind::Equal, "expected a '=' after the variable name"); - expression(p); - p.expect(TokenKind::Semicolon, "expect ';' to end a let statement"); - - p.end(m, TreeKind::LetStatement); -} - -fn statement_return(p: &mut CParser) { - assert!(p.at(TokenKind::Return)); - let m = p.start(); - - p.expect( - TokenKind::Return, - "expect 'return' to start a return statement", - ); - expression(p); - p.expect(TokenKind::Semicolon, "expect ';' to end a return statement"); - - p.end(m, TreeKind::ReturnStatement); -} - -fn statement_expression(p: &mut CParser) { - let m = p.start(); - - expression(p); - p.expect( - TokenKind::Semicolon, - "expect ';' to end an expression statement", - ); - - p.end(m, TreeKind::ExpressionStatement); -} - -fn expression(p: &mut CParser) { - expression_with_power(p, 0) -} - -fn expression_with_power(p: &mut CParser, minimum_power: u8) { - let mut expr = prefix_expression(p); - while p.at(TokenKind::LeftParen) { - let m = p.start_before(expr); - argument_list(p); - expr = p.end(m, TreeKind::CallExpression); - } - - loop { - let Some(power) = token_power(p.peek()) else { - break; - }; - if power < minimum_power { - break; - } - - // TODO: I don't think this works for other "infix" types, but we'll - // see won't we. 
- let m = p.start_before(expr); - p.advance(); // Consume the operator - expression_with_power(p, power); - expr = p.end(m, TreeKind::BinaryExpression); - } -} - -fn argument_list(p: &mut CParser) { - assert!(p.at(TokenKind::LeftParen)); - let m = p.start(); - - p.expect( - TokenKind::LeftParen, - "expect an argument list to start with '('", - ); - while !p.at(TokenKind::RightParen) && !p.eof() { - argument(p); - } - p.expect( - TokenKind::RightParen, - "expect an argument list to start with '('", - ); - - p.end(m, TreeKind::ArgumentList); -} - -fn argument(p: &mut CParser) { - let m = p.start(); - - expression(p); - if !p.at(TokenKind::RightParen) { - p.expect(TokenKind::Comma, "expect a ',' between arguments"); - } - - p.end(m, TreeKind::Argument); -} - -fn prefix_expression(p: &mut CParser) -> MarkClosed { - match p.peek() { - TokenKind::Number => literal(p), - TokenKind::String => literal(p), - TokenKind::True => literal(p), - TokenKind::False => literal(p), - - TokenKind::LeftParen => grouping(p), - - TokenKind::Bang => unary(p), - TokenKind::Minus => unary(p), - - TokenKind::If => conditional(p), - - _ => p.advance_with_error("expected an expression"), - } -} - -fn literal(p: &mut CParser) -> MarkClosed { - let m = p.start(); - p.advance(); - p.end(m, TreeKind::LiteralExpression) -} - -fn grouping(p: &mut CParser) -> MarkClosed { - assert!(p.at(TokenKind::LeftParen)); - let m = p.start(); - - p.expect(TokenKind::LeftParen, "expected '(' to start grouping"); - expression(p); - p.expect(TokenKind::RightParen, "unmatched parentheses in expression"); - - p.end(m, TreeKind::GroupingExpression) -} - -fn unary(p: &mut CParser) -> MarkClosed { - let m = p.start(); - - p.advance(); // Past the operator - expression_with_power(p, UNARY_POWER); - - p.end(m, TreeKind::UnaryExpression) -} - -fn conditional(p: &mut CParser) -> MarkClosed { - assert!(p.at(TokenKind::If)); - let m = p.start(); - - p.expect(TokenKind::If, "expected conditional to start with 'if'"); - 
expression(p); - block(p); - if p.eat(TokenKind::Else) { - if p.at(TokenKind::If) { - // Don't require another block, just jump right into the conditional. - conditional(p); - } else { - block(p); - } - } - - p.end(m, TreeKind::ConditionalExpression) -} - pub struct SyntaxTree<'a> { pub errors: Vec, expressions: Vec>, @@ -1130,8 +646,13 @@ impl<'a> Parser<'a> { fn advance(&mut self) { self.previous = self.current.clone(); self.current = self.tokens.next(); - while self.current.kind == TokenKind::Error { - self.error_at_current(self.current.to_string()); + while self.current.kind == TokenKind::Error + || self.current.kind == TokenKind::Whitespace + || self.current.kind == TokenKind::Comment + { + if self.current.kind == TokenKind::Error { + self.error_at_current(self.current.to_string()); + } self.current = self.tokens.next(); } } diff --git a/fine/src/parser/concrete.rs b/fine/src/parser/concrete.rs new file mode 100644 index 00000000..7e857af1 --- /dev/null +++ b/fine/src/parser/concrete.rs @@ -0,0 +1,618 @@ +// NOTE: much of this parser structure derived from +// https://matklad.github.io/2023/05/21/resilient-ll-parsing-tutorial.html +use crate::tokens::{Lines, Token, TokenKind, Tokens}; +use std::cell::Cell; + +// BINDING POWERS. When parsing expressions we only accept expressions that +// meet a minimum binding power. (This is like "precedence" but I just super +// don't like that terminology.) +const ASSIGNMENT_POWER: u8 = 0; // = +const OR_POWER: u8 = 1; // or +const AND_POWER: u8 = 2; // and +const EQUALITY_POWER: u8 = 3; // == != +const COMPARISON_POWER: u8 = 4; // < > <= >= +const TERM_POWER: u8 = 5; // + - +const FACTOR_POWER: u8 = 6; // * / +const UNARY_POWER: u8 = 7; // ! 
- + +// const PRIMARY_POWER: u8 = 9; + +fn token_power<'a>(token: TokenKind) -> Option { + match token { + TokenKind::Equal => Some(ASSIGNMENT_POWER), + TokenKind::Or => Some(OR_POWER), + TokenKind::And => Some(AND_POWER), + TokenKind::EqualEqual | TokenKind::BangEqual => Some(EQUALITY_POWER), + TokenKind::Less | TokenKind::Greater | TokenKind::GreaterEqual | TokenKind::LessEqual => { + Some(COMPARISON_POWER) + } + TokenKind::Plus | TokenKind::Minus => Some(TERM_POWER), + TokenKind::Star | TokenKind::Slash => Some(FACTOR_POWER), + _ => None, + } +} + +#[derive(Debug)] +pub enum TreeKind { + Error, + File, + FunDecl, + ParamList, + Parameter, + TypeExpression, + Block, + LetStatement, + ReturnStatement, + ExpressionStatement, + LiteralExpression, + GroupingExpression, + UnaryExpression, + ConditionalExpression, + CallExpression, + ArgumentList, + Argument, + BinaryExpression, + IfStatement, + Identifier, +} + +pub struct Tree<'a> { + pub kind: TreeKind, + // TODO: Indirect reference? Flatness? Using a reference structure will + // make caching and annotation easier if desired. 
+ pub children: Vec>, +} + +impl<'a> Tree<'a> { + pub fn dump(&self) -> String { + let mut output = String::new(); + output.push_str(&format!("{:?}\n", self.kind)); + for child in self.children.iter() { + child.dump_rec(2, &mut output); + } + output + } +} + +impl<'a> std::fmt::Debug for Tree<'a> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "[{:?}", self.kind)?; + for child in self.children.iter() { + match child { + Child::Token(t) => write!(f, " {:?}:'{}'", t.kind, t.as_str())?, + Child::Tree(t) => write!(f, " {t:?}")?, + } + } + write!(f, "]")?; + Ok(()) + } +} + +pub enum Child<'a> { + Token(Token<'a>), + Tree(Tree<'a>), +} + +impl<'a> Child<'a> { + fn dump_rec(&self, indent: usize, output: &mut String) { + for _ in 0..indent { + output.push(' '); + } + match self { + Child::Token(t) => output.push_str(&format!("{:?}:'{:?}'\n", t.kind, t.as_str())), + Child::Tree(t) => { + output.push_str(&format!("{:?}\n", t.kind)); + for child in t.children.iter() { + child.dump_rec(indent + 2, output); + } + } + } + } +} + +enum ParseEvent<'a> { + Start { kind: TreeKind }, + End, + Advance { token: Token<'a> }, +} + +struct MarkStarted { + index: usize, +} + +struct MarkClosed { + index: usize, +} + +struct CParser<'a> { + tokens: Tokens<'a>, + current: Token<'a>, + fuel: Cell, + events: Vec>, +} + +impl<'a> CParser<'a> { + fn new(tokens: Tokens<'a>) -> Self { + let mut parser = CParser { + tokens, + current: Token::new(TokenKind::EOF, 0, ""), + fuel: Cell::new(256), + events: Vec::new(), + }; + parser.current = parser.tokens.next(); + parser.skip_ephemera(); + parser + } + + fn start(&mut self) -> MarkStarted { + let mark = MarkStarted { + index: self.events.len(), + }; + self.events.push(ParseEvent::Start { + kind: TreeKind::Error, + }); + mark + } + + fn end(&mut self, mark: MarkStarted, kind: TreeKind) -> MarkClosed { + self.events[mark.index] = ParseEvent::Start { kind }; + self.events.push(ParseEvent::End); + MarkClosed { index: 
mark.index } + } + + fn start_before(&mut self, mark: MarkClosed) -> MarkStarted { + // TODO: Point backwards and pointer chase in tree build? + let mark = MarkStarted { index: mark.index }; + self.events.insert( + mark.index, + ParseEvent::Start { + kind: TreeKind::Error, + }, + ); + mark + } + + fn advance(&mut self) { + assert!(!self.eof()); // Don't try to advance past EOF + self.fuel.set(256); // Consuming a token, rest stuck detector + self.events.push(ParseEvent::Advance { + token: self.current.clone(), + }); + self.current = self.tokens.next(); + self.skip_ephemera(); + } + + fn skip_ephemera(&mut self) { + while self.current.kind == TokenKind::Whitespace || self.current.kind == TokenKind::Comment + { + self.current = self.tokens.next(); + } + } + + fn eof(&self) -> bool { + self.current.kind == TokenKind::EOF + } + + fn peek(&self) -> TokenKind { + assert!(self.fuel.get() > 0, "parser is stuck!"); + self.fuel.set(self.fuel.get() - 1); + self.current.kind + } + + fn at(&self, kind: TokenKind) -> bool { + self.peek() == kind + } + + fn eat(&mut self, kind: TokenKind) -> bool { + if self.at(kind) { + self.advance(); + true + } else { + false + } + } + + fn expect(&mut self, kind: TokenKind, error: T) + where + T: Into, + { + if self.eat(kind) { + return; + } + self.error(error); + } + + fn advance_with_error(&mut self, error: T) -> MarkClosed + where + T: Into, + { + let m = self.start(); + self.error(error); + self.advance(); + self.end(m, TreeKind::Error) + } + + fn error(&mut self, message: T) + where + T: Into, + { + self.error_at(self.current.clone(), message) + } + + fn error_at(&mut self, token: Token<'a>, message: T) + where + T: Into, + { + let message: String = message.into(); + let mut final_message = "Error ".to_string(); + + if token.kind == TokenKind::EOF { + final_message.push_str("at end") + } else if token.kind != TokenKind::Error { + final_message.push_str("at '"); + final_message.push_str(token.as_str()); + final_message.push_str("'"); + } 
+ final_message.push_str(": "); + final_message.push_str(&message); + + self.events.push(ParseEvent::Advance { + token: Token::error(token.start, final_message), + }); + } + + fn build_tree(self) -> (Tree<'a>, Lines) { + let mut events = self.events; + let mut stack = Vec::new(); + + // The first element in our events vector must be a start; the whole + // thing must be bracketed in a tree. + assert!(matches!(events.get(0), Some(ParseEvent::Start { .. }))); + + // The last element in our events vector must be an end, otherwise + // the parser has failed badly. We'll remove it here so that, after + // processing the entire array, the stack retains the tree that we + // start with the very first ::Start. + assert!(matches!(events.pop(), Some(ParseEvent::End))); + + for event in events { + match event { + ParseEvent::Start { kind } => stack.push(Tree { + kind, + children: Vec::new(), + }), + + ParseEvent::End => { + let tree = stack.pop().unwrap(); + stack.last_mut().unwrap().children.push(Child::Tree(tree)); + } + + ParseEvent::Advance { token } => { + stack.last_mut().unwrap().children.push(Child::Token(token)); + } + } + } + + assert!(stack.len() == 1, "Not all trees were ended!"); + (stack.pop().unwrap(), self.tokens.lines()) + } +} + +pub fn parse_concrete(source: &str) -> (Tree, Lines) { + let tokens = Tokens::new(source); + let mut parser = CParser::new(tokens); + + file(&mut parser); + + parser.build_tree() +} + +fn file(p: &mut CParser) { + let m = p.start(); + while !p.eof() { + match p.peek() { + TokenKind::Fun => function(p), + _ => statement(p), + } + } + p.end(m, TreeKind::File); +} + +fn function(p: &mut CParser) { + assert!(p.at(TokenKind::Fun)); + let m = p.start(); + + p.expect(TokenKind::Fun, "expected a function to start with 'fun'"); + p.expect(TokenKind::Identifier, "expected a function name"); + if p.at(TokenKind::LeftParen) { + param_list(p); + } + if p.eat(TokenKind::Arrow) { + type_expr(p); + } + if p.at(TokenKind::LeftBrace) { + block(p); + 
} + + p.end(m, TreeKind::FunDecl); +} + +fn param_list(p: &mut CParser) { + assert!(p.at(TokenKind::LeftParen)); + let m = p.start(); + + p.expect(TokenKind::LeftParen, "expect '(' to start a parameter list"); + while !p.at(TokenKind::RightParen) && !p.eof() { + if p.at(TokenKind::Identifier) { + parameter(p); + } else { + break; + } + } + p.expect(TokenKind::RightParen, "expect ')' to end a parameter list"); + + p.end(m, TreeKind::ParamList); +} + +fn parameter(p: &mut CParser) { + assert!(p.at(TokenKind::Identifier)); + let m = p.start(); + p.expect( + TokenKind::Identifier, + "expected an identifier for a parameter name", + ); + if p.eat(TokenKind::Colon) { + type_expr(p); + } + if !p.at(TokenKind::RightParen) { + p.expect(TokenKind::Comma, "expected a comma between parameters"); + } + + p.end(m, TreeKind::Parameter); +} + +fn type_expr(p: &mut CParser) { + let m = p.start(); + // TODO: Other kinds of type expressions probably! + p.expect(TokenKind::Identifier, "expected the identifier of a type"); + p.end(m, TreeKind::TypeExpression); +} + +fn block(p: &mut CParser) { + assert!(p.at(TokenKind::LeftBrace)); + let m = p.start(); + + p.expect(TokenKind::LeftBrace, "expect '{' to start a block"); + while !p.at(TokenKind::RightBrace) && !p.eof() { + statement(p); + } + p.expect(TokenKind::RightBrace, "expect '}' to start a block"); + + p.end(m, TreeKind::Block); +} + +fn statement(p: &mut CParser) { + match p.peek() { + TokenKind::LeftBrace => block(p), + TokenKind::Let => statement_let(p), + TokenKind::Return => statement_return(p), + + // NOTE: Technically 'if' is an expression, but `if` doesn't + // require a semicolon at the end if it's all by itself. 
+ TokenKind::If => statement_if(p), + + _ => statement_expression(p), + } +} + +fn statement_if(p: &mut CParser) { + assert!(p.at(TokenKind::If)); + let m = p.start(); + + conditional(p); + + p.end(m, TreeKind::IfStatement); +} + +fn statement_let(p: &mut CParser) { + assert!(p.at(TokenKind::Let)); + let m = p.start(); + + p.expect(TokenKind::Let, "expect 'let' to start a let statement"); + p.expect(TokenKind::Identifier, "expected a name for the variable"); + p.expect(TokenKind::Equal, "expected a '=' after the variable name"); + expression(p); + p.expect(TokenKind::Semicolon, "expect ';' to end a let statement"); + + p.end(m, TreeKind::LetStatement); +} + +fn statement_return(p: &mut CParser) { + assert!(p.at(TokenKind::Return)); + let m = p.start(); + + p.expect( + TokenKind::Return, + "expect 'return' to start a return statement", + ); + expression(p); + p.expect(TokenKind::Semicolon, "expect ';' to end a return statement"); + + p.end(m, TreeKind::ReturnStatement); +} + +fn statement_expression(p: &mut CParser) { + let m = p.start(); + + expression(p); + p.expect( + TokenKind::Semicolon, + "expect ';' to end an expression statement", + ); + + p.end(m, TreeKind::ExpressionStatement); +} + +fn expression(p: &mut CParser) { + expression_with_power(p, 0) +} + +fn expression_with_power(p: &mut CParser, minimum_power: u8) { + let mut expr = prefix_expression(p); + while p.at(TokenKind::LeftParen) { + let m = p.start_before(expr); + argument_list(p); + expr = p.end(m, TreeKind::CallExpression); + } + + loop { + let Some(power) = token_power(p.peek()) else { + break; + }; + if power < minimum_power { + break; + } + + // TODO: I don't think this works for other "infix" types, but we'll + // see won't we. 
+ let m = p.start_before(expr); + p.advance(); // Consume the operator + expression_with_power(p, power); + expr = p.end(m, TreeKind::BinaryExpression); + } +} + +fn argument_list(p: &mut CParser) { + assert!(p.at(TokenKind::LeftParen)); + let m = p.start(); + + p.expect( + TokenKind::LeftParen, + "expect an argument list to start with '('", + ); + while !p.at(TokenKind::RightParen) && !p.eof() { + argument(p); + } + p.expect( + TokenKind::RightParen, + "expect an argument list to end with ')'", + ); + + p.end(m, TreeKind::ArgumentList); +} + +fn argument(p: &mut CParser) { + let m = p.start(); + + expression(p); + if !p.at(TokenKind::RightParen) { + p.expect(TokenKind::Comma, "expect a ',' between arguments"); + } + + p.end(m, TreeKind::Argument); +} + +fn prefix_expression(p: &mut CParser) -> MarkClosed { + match p.peek() { + TokenKind::Number => literal(p), + TokenKind::String => literal(p), + TokenKind::True => literal(p), + TokenKind::False => literal(p), + + TokenKind::LeftParen => grouping(p), + + TokenKind::Bang => unary(p), + TokenKind::Minus => unary(p), + + TokenKind::If => conditional(p), + + TokenKind::Identifier => identifier(p), + + _ => p.advance_with_error("expected an expression"), + } +} + +fn literal(p: &mut CParser) -> MarkClosed { + let m = p.start(); + p.advance(); + p.end(m, TreeKind::LiteralExpression) +} + +fn grouping(p: &mut CParser) -> MarkClosed { + assert!(p.at(TokenKind::LeftParen)); + let m = p.start(); + + p.expect(TokenKind::LeftParen, "expected '(' to start grouping"); + expression(p); + p.expect(TokenKind::RightParen, "unmatched parentheses in expression"); + + p.end(m, TreeKind::GroupingExpression) +} + +fn unary(p: &mut CParser) -> MarkClosed { + let m = p.start(); + + p.advance(); // Past the operator + expression_with_power(p, UNARY_POWER); + + p.end(m, TreeKind::UnaryExpression) +} + +fn conditional(p: &mut CParser) -> MarkClosed { + assert!(p.at(TokenKind::If)); + let m = p.start(); + + p.expect(TokenKind::If, "expected 
conditional to start with 'if'"); + expression(p); + block(p); + if p.eat(TokenKind::Else) { + if p.at(TokenKind::If) { + // Don't require another block, just jump right into the conditional. + conditional(p); + } else { + block(p); + } + } + + p.end(m, TreeKind::ConditionalExpression) +} + +fn identifier(p: &mut CParser) -> MarkClosed { + assert!(p.at(TokenKind::Identifier)); + let m = p.start(); + + p.advance(); + + p.end(m, TreeKind::Identifier) +} + +#[cfg(test)] +mod tests { + use super::*; + use pretty_assertions::assert_eq; + + fn test_successful_expression_parse(source: &str, expected: &str) { + let tokens = Tokens::new(source); + let mut parser = CParser::new(tokens); + + expression(&mut parser); + + let (tree, _) = parser.build_tree(); + assert_eq!( + expected, + format!("{tree:?}"), + "The parse structure of the expressions did not match" + ); + } + + macro_rules! test_expr { + ($name:ident, $input:expr, $expected:expr) => { + #[test] + fn $name() { + test_successful_expression_parse($input, $expected); + } + }; + } + + test_expr!(number_expr, "12", "[LiteralExpression Number:'12']"); +} diff --git a/fine/src/tokens.rs b/fine/src/tokens.rs index ceaa22f0..a6ddccdf 100644 --- a/fine/src/tokens.rs +++ b/fine/src/tokens.rs @@ -3,6 +3,9 @@ pub enum TokenKind { EOF, Error, + Whitespace, + Comment, + LeftBrace, RightBrace, LeftBracket, @@ -390,7 +393,7 @@ impl<'a> Tokens<'a> { self.next_char.is_none() } - fn skip_whitespace(&mut self) { + fn whitespace(&mut self, pos: usize) -> Token<'a> { while let Some((pos, ch)) = self.next_char { if ch == '\n' { self.lines.add_line(pos); @@ -399,16 +402,27 @@ impl<'a> Tokens<'a> { } self.advance(); } + self.token(pos, TokenKind::Whitespace) + } + + fn comment(&mut self, pos: usize) -> Token<'a> { + while let Some((_, ch)) = self.next_char { + if ch == '\n' { + break; + } + self.advance(); + } + self.token(pos, TokenKind::Comment) } pub fn next(&mut self) -> Token<'a> { - self.skip_whitespace(); // TODO: Whitespace 
preserving/comment preserving let (pos, c) = match self.advance() { Some((p, c)) => (p, c), None => return self.token(self.source.len(), TokenKind::EOF), }; match c { + ' ' | '\t' | '\r' | '\n' => self.whitespace(pos), '{' => self.token(pos, TokenKind::LeftBrace), '}' => self.token(pos, TokenKind::RightBrace), '[' => self.token(pos, TokenKind::LeftBracket), @@ -427,7 +441,13 @@ impl<'a> Tokens<'a> { '+' => self.token(pos, TokenKind::Plus), ':' => self.token(pos, TokenKind::Colon), ';' => self.token(pos, TokenKind::Semicolon), - '/' => self.token(pos, TokenKind::Slash), + '/' => { + if self.matches('/') { + self.comment(pos) + } else { + self.token(pos, TokenKind::Slash) + } + } '*' => self.token(pos, TokenKind::Star), '!' => { if self.matches('=') { @@ -484,6 +504,9 @@ mod tests { while !is_eof { let token = tokens.next(); is_eof = token.kind == TokenKind::EOF; + if token.kind == TokenKind::Whitespace { + continue; + } result.push(token); } diff --git a/fine/tests/example_tests.rs b/fine/tests/example_tests.rs new file mode 100644 index 00000000..c61f26c6 --- /dev/null +++ b/fine/tests/example_tests.rs @@ -0,0 +1,8 @@ +use fine::parser::concrete::Tree; +use pretty_assertions::assert_eq; + +fn assert_concrete(tree: &Tree, expected: &str) { + assert_eq!(tree.dump(), expected, "concrete syntax trees did not match"); +} + +include!(concat!(env!("OUT_DIR"), "/generated_tests.rs")); diff --git a/fine/tests/expression/expressions.fine b/fine/tests/expression/expressions.fine new file mode 100644 index 00000000..b353d91a --- /dev/null +++ b/fine/tests/expression/expressions.fine @@ -0,0 +1,27 @@ +// concrete: +// | File +// | ExpressionStatement +// | LiteralExpression +// | Number:'"42"' +// | Semicolon:'";"' +// | ExpressionStatement +// | BinaryExpression +// | BinaryExpression +// | LiteralExpression +// | Number:'"1"' +// | Star:'"*"' +// | LiteralExpression +// | Number:'"2"' +// | Plus:'"+"' +// | BinaryExpression +// | UnaryExpression +// | Minus:'"-"' +// | 
LiteralExpression +// | Number:'"3"' +// | Star:'"*"' +// | LiteralExpression +// | Number:'"4"' +// | Semicolon:'";"' +// +42; +1 * 2 + -3 * 4;