diff --git a/fine/src/parser/concrete.rs b/fine/src/parser/concrete.rs index 3b6b25bf..4af5069b 100644 --- a/fine/src/parser/concrete.rs +++ b/fine/src/parser/concrete.rs @@ -1,56 +1,34 @@ // NOTE: much of this parser structure derived from // https://matklad.github.io/2023/05/21/resilient-ll-parsing-tutorial.html use crate::tokens::{Lines, Token, TokenKind, Tokens}; -use std::{cell::Cell, num::NonZeroU32}; +use std::cell::Cell; -pub struct ConcreteTree<'a> { - trees: Vec>, - root: Option, -} +// BINDING POWERS. When parsing expressions we only accept expressions that +// meet a minimum binding power. (This is like "precedence" but I just super +// don't like that terminology.) +const ASSIGNMENT_POWER: u8 = 0; // = +const OR_POWER: u8 = 1; // or +const AND_POWER: u8 = 2; // and +const EQUALITY_POWER: u8 = 3; // == != +const COMPARISON_POWER: u8 = 4; // < > <= >= +const TERM_POWER: u8 = 5; // + - +const FACTOR_POWER: u8 = 6; // * / +const UNARY_POWER: u8 = 7; // ! - -impl<'a> ConcreteTree<'a> { - pub fn new() -> Self { - ConcreteTree { - trees: vec![], - root: None, +// const PRIMARY_POWER: u8 = 9; + +fn token_power<'a>(token: TokenKind) -> Option { + match token { + TokenKind::Equal => Some(ASSIGNMENT_POWER), + TokenKind::Or => Some(OR_POWER), + TokenKind::And => Some(AND_POWER), + TokenKind::EqualEqual | TokenKind::BangEqual => Some(EQUALITY_POWER), + TokenKind::Less | TokenKind::Greater | TokenKind::GreaterEqual | TokenKind::LessEqual => { + Some(COMPARISON_POWER) } - } - - pub fn add_tree(&mut self, t: Tree<'a>) -> TreeRef { - assert!(t.parent.is_none()); - let tr = TreeRef::from_index(self.trees.len()); - - // NOTE: Because of the difficulty of holding multiple mutable - // references it's this is our best chance to patch up parent - // pointers. - for child in t.children.iter() { - if let Child::Tree(ct) = child { - self[*ct].parent = Some(tr); - } - } - self.trees.push(t); - tr - } - - pub fn dump(&self) -> String { - match self.root { - Some(r) => self[r].dump(self), - None => String::new(), - } - } -} - -impl<'a> std::ops::Index for ConcreteTree<'a> { - type Output = Tree<'a>; - - fn index(&self, index: TreeRef) -> &Self::Output { - &self.trees[index.index()] - } -} - -impl<'a> std::ops::IndexMut for ConcreteTree<'a> { - fn index_mut(&mut self, index: TreeRef) -> &mut Self::Output { - &mut self.trees[index.index()] + TokenKind::Plus | TokenKind::Minus => Some(TERM_POWER), + TokenKind::Star | TokenKind::Slash => Some(FACTOR_POWER), + _ => None, } } @@ -80,53 +58,52 @@ pub enum TreeKind { pub struct Tree<'a> { pub kind: TreeKind, - pub parent: Option, + // TODO: Indirect reference? Flatness? Using a reference structure will + // make caching and annotation easier if desired. pub children: Vec>, } -#[derive(Copy, Clone, Eq, PartialEq)] -pub struct TreeRef(NonZeroU32); - -impl TreeRef { - pub fn from_index(index: usize) -> TreeRef { - let index: u32 = (index + 1).try_into().unwrap(); - TreeRef(NonZeroU32::new(index).unwrap()) - } - - pub fn index(&self) -> usize { - let index: usize = self.0.get().try_into().unwrap(); - index - 1 - } -} - impl<'a> Tree<'a> { - pub fn dump(&self, tree: &ConcreteTree<'a>) -> String { + pub fn dump(&self) -> String { let mut output = String::new(); output.push_str(&format!("{:?}\n", self.kind)); for child in self.children.iter() { - child.dump_rec(2, tree, &mut output); + child.dump_rec(2, &mut output); } output } } +impl<'a> std::fmt::Debug for Tree<'a> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "[{:?}", self.kind)?; + for child in self.children.iter() { + match child { + Child::Token(t) => write!(f, " {:?}:'{}'", t.kind, t.as_str())?, + Child::Tree(t) => write!(f, " {t:?}")?, + } + } + write!(f, "]")?; + Ok(()) + } +} + pub enum Child<'a> { Token(Token<'a>), - Tree(TreeRef), + Tree(Tree<'a>), } impl<'a> Child<'a> { - fn dump_rec(&self, indent: usize, tree: &ConcreteTree<'a>, output: &mut String) { + fn dump_rec(&self, indent: usize, output: &mut String) { for _ in 0..indent { output.push(' '); } match self { Child::Token(t) => output.push_str(&format!("{:?}:'{:?}'\n", t.kind, t.as_str())), Child::Tree(t) => { - let t = &tree[*t]; output.push_str(&format!("{:?}\n", t.kind)); for child in t.children.iter() { - child.dump_rec(indent + 2, tree, output); + child.dump_rec(indent + 2, output); } } } @@ -284,12 +261,10 @@ impl<'a> CParser<'a> { }); } - fn build_tree(self) -> (ConcreteTree<'a>, Lines) { + fn build_tree(self) -> (Tree<'a>, Lines) { let mut events = self.events; let mut stack = Vec::new(); - let mut result = ConcreteTree::new(); - // The first element in our events vector must be a start; the whole // thing must be bracketed in a tree. assert!(matches!(events.get(0), Some(ParseEvent::Start { .. }))); @@ -304,13 +279,12 @@ impl<'a> CParser<'a> { match event { ParseEvent::Start { kind } => stack.push(Tree { kind, - parent: None, children: Vec::new(), }), ParseEvent::End => { - let t = result.add_tree(stack.pop().unwrap()); - stack.last_mut().unwrap().children.push(Child::Tree(t)); + let tree = stack.pop().unwrap(); + stack.last_mut().unwrap().children.push(Child::Tree(tree)); } ParseEvent::Advance { token } => { @@ -320,14 +294,11 @@ impl<'a> CParser<'a> { } assert!(stack.len() == 1, "Not all trees were ended!"); - let root = result.add_tree(stack.pop().unwrap()); - result.root = Some(root); - - (result, self.tokens.lines()) + (stack.pop().unwrap(), self.tokens.lines()) } } -pub fn parse_concrete(source: &str) -> (ConcreteTree, Lines) { +pub fn parse_concrete(source: &str) -> (Tree, Lines) { let tokens = Tokens::new(source); let mut parser = CParser::new(tokens); @@ -492,35 +463,6 @@ fn expression(p: &mut CParser) { expression_with_power(p, 0) } -// BINDING POWERS. When parsing expressions we only accept expressions that -// meet a minimum binding power. (This is like "precedence" but I just super -// don't like that terminology.) -const ASSIGNMENT_POWER: u8 = 0; // = -const OR_POWER: u8 = 1; // or -const AND_POWER: u8 = 2; // and -const EQUALITY_POWER: u8 = 3; // == != -const COMPARISON_POWER: u8 = 4; // < > <= >= -const TERM_POWER: u8 = 5; // + - -const FACTOR_POWER: u8 = 6; // * / -const UNARY_POWER: u8 = 7; // ! - - -// const PRIMARY_POWER: u8 = 9; - -fn token_power<'a>(token: TokenKind) -> Option { - match token { - TokenKind::Equal => Some(ASSIGNMENT_POWER), - TokenKind::Or => Some(OR_POWER), - TokenKind::And => Some(AND_POWER), - TokenKind::EqualEqual | TokenKind::BangEqual => Some(EQUALITY_POWER), - TokenKind::Less | TokenKind::Greater | TokenKind::GreaterEqual | TokenKind::LessEqual => { - Some(COMPARISON_POWER) - } - TokenKind::Plus | TokenKind::Minus => Some(TERM_POWER), - TokenKind::Star | TokenKind::Slash => Some(FACTOR_POWER), - _ => None, - } -} - fn expression_with_power(p: &mut CParser, minimum_power: u8) { let mut expr = prefix_expression(p); while p.at(TokenKind::LeftParen) { @@ -653,11 +595,30 @@ fn identifier(p: &mut CParser) -> MarkClosed { #[cfg(test)] mod tests { use super::*; + use pretty_assertions::assert_eq; - #[test] - fn tree_ref_size() { - // What's the point of doing all that work if the tree ref isn't nice - // and "small"? - assert_eq!(4, std::mem::size_of::>()); + fn test_successful_expression_parse(source: &str, expected: &str) { + let tokens = Tokens::new(source); + let mut parser = CParser::new(tokens); + + expression(&mut parser); + + let (tree, _) = parser.build_tree(); + assert_eq!( + expected, + format!("{tree:?}"), + "The parse structure of the expressions did not match" + ); } + + macro_rules! test_expr { + ($name:ident, $input:expr, $expected:expr) => { + #[test] + fn $name() { + test_successful_expression_parse($input, $expected); + } + }; + } + + test_expr!(number_expr, "12", "[LiteralExpression Number:'12']"); } diff --git a/fine/src/tokens.rs b/fine/src/tokens.rs index dedbf864..a6ddccdf 100644 --- a/fine/src/tokens.rs +++ b/fine/src/tokens.rs @@ -58,18 +58,11 @@ pub enum TokenKind { Yield, } -// NOTE: Tokens are kinda big (like 40 bytes?) and AFAICT the only way to go -// smaller would be to stop using string pointers and use smaller -// sizes/offsets instead, e.g., 32b for offset and 32b for size, and -// stop tracking the position independently from the start, and then -// require the source text when converting to line/col. I'm unwilling to -// give up the ergonomics of &str and String right now, so we're just -// not doing it. #[derive(Debug, PartialEq, Eq, Clone)] pub struct Token<'a> { pub kind: TokenKind, pub start: usize, - value: Result<&'a str, Box>, + value: Result<&'a str, String>, } impl<'a> Token<'a> { @@ -85,7 +78,7 @@ impl<'a> Token<'a> { Token { kind: TokenKind::Error, start, - value: Err(message.into()), + value: Err(message), } } diff --git a/fine/tests/example_tests.rs b/fine/tests/example_tests.rs index 4526a6b9..680ab3d5 100644 --- a/fine/tests/example_tests.rs +++ b/fine/tests/example_tests.rs @@ -1,4 +1,4 @@ -use fine::parser::concrete::ConcreteTree; +use fine::parser::concrete::Tree; use pretty_assertions::assert_eq; fn rebase_concrete(source_path: &str, dump: &str) { @@ -68,7 +68,7 @@ fn rebase_concrete(source_path: &str, dump: &str) { std::fs::write(source_path, result).expect("unable to write the new file!"); } -fn assert_concrete(tree: &ConcreteTree, expected: &str, source_path: &str) { +fn assert_concrete(tree: &Tree, expected: &str, source_path: &str) { let dump = tree.dump(); let rebase = std::env::var("FINE_TEST_REBASE") .unwrap_or(String::new())