From 7fccab8f592525cfc5212af877549f0a4ecf066e Mon Sep 17 00:00:00 2001 From: John Doty Date: Sat, 30 Dec 2023 17:15:05 -0800 Subject: [PATCH 1/9] [oden-script] Tokens --- oden-script/Cargo.lock | 25 ++ oden-script/Cargo.toml | 3 + oden-script/src/lib.rs | 459 +----------------------------- oden-script/src/main.rs | 2 - oden-script/src/tokens.rs | 569 ++++++++++++++++++++++++++++++++++++++ 5 files changed, 598 insertions(+), 460 deletions(-) create mode 100644 oden-script/src/tokens.rs diff --git a/oden-script/Cargo.lock b/oden-script/Cargo.lock index 459de42c..8839cda3 100644 --- a/oden-script/Cargo.lock +++ b/oden-script/Cargo.lock @@ -2,6 +2,31 @@ # It is not intended for manual editing. version = 3 +[[package]] +name = "diff" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56254986775e3233ffa9c4d7d3faaf6d36a2c09d30b20687e9f88bc8bafc16c8" + [[package]] name = "oden-script" version = "0.1.0" +dependencies = [ + "pretty_assertions", +] + +[[package]] +name = "pretty_assertions" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af7cee1a6c8a5b9208b3cb1061f10c0cb689087b3d8ce85fb9d2dd7a29b6ba66" +dependencies = [ + "diff", + "yansi", +] + +[[package]] +name = "yansi" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09041cd90cf85f7f8b2df60c646f853b7f535ce68f85244eb6731cf89fa498ec" diff --git a/oden-script/Cargo.toml b/oden-script/Cargo.toml index 87a1f807..25623ca0 100644 --- a/oden-script/Cargo.toml +++ b/oden-script/Cargo.toml @@ -2,3 +2,6 @@ name = "oden-script" version = "0.1.0" edition = "2021" + +[dev-dependencies] +pretty_assertions = "1.4.0" diff --git a/oden-script/src/lib.rs b/oden-script/src/lib.rs index e0d6d806..5c766355 100644 --- a/oden-script/src/lib.rs +++ b/oden-script/src/lib.rs @@ -1,458 +1 @@ -#[derive(Debug)] -pub enum TokenKind<'a> { - LeftBrace, - RightBrace, - LeftBracket, - RightBracket, - LeftParen, - RightParen, - Comma, - Dot, - Minus, - Plus, - Semicolon, - Slash, - Star, - - Bang, - BangEqual, - Equal, - EqualEqual, - Greater, - GreaterEqual, - Less, - LessEqual, - - Identifier(&'a str), // TODO - String(&'a str), - Number(&'a str), - - And, - Async, - Await, - Class, - Else, - False, - For, - From, - Fun, - If, - Let, - Or, - Print, - Return, - Select, - This, - True, - While, - Yield, - - Error(String), -} - -#[derive(Debug)] -pub struct Token<'a> { - kind: TokenKind<'a>, - start: usize, -} - -impl<'a> Token<'a> { - pub fn as_str<'b>(&'b self) -> &'a str - where - 'b: 'a, - { - use TokenKind::*; - match &self.kind { - LeftBrace => "{", - RightBrace => "}", - LeftBracket => "[", - RightBracket => "]", - - LeftParen => "(", - RightParen => ")", - Comma => ",", - Dot => ".", - Minus => "-", - - Plus => "+", - Semicolon => ";", - Slash => "/", - Star => "*", - - Bang => "+", - BangEqual => "!=", - Equal => "=", - EqualEqual => "==", - Greater => ">", - GreaterEqual => ">=", - Less => "<", - LessEqual => "<=", - - Identifier(v) => v, - String(v) => v, - Number(v) => v, - - And => "and", - Async => "async", - Await => "await", - Class => "class", - Else => "else", - False => "false", - For => "for", - From => "from", - Fun => "fun", - If => "if", - Let => "let", - Or => "or", - Print => "print", - Return => "return", - Select => "select", - This => "this", - True => "true", - While => "while", - Yield => "yield", - - Error(e) => e, - } - } -} - -pub struct Tokens<'a> { - source: &'a str, - chars: std::str::CharIndices<'a>, - next_char: Option<(usize, char)>, - newlines: Vec, -} - -impl<'a> Tokens<'a> { - pub fn new(source: &'a str) -> Self { - let mut chars = source.char_indices(); - let next_char = chars.next(); - Tokens { - source, - chars, - next_char, - newlines: Vec::new(), - } - } - - pub fn token_position(&self, token: &Token) -> (usize, usize) { - let line_end_index = match self.newlines.binary_search(&token.start) { - Ok(index) => index, - Err(index) => index, - }; - let line_start_pos = if line_end_index == 0 { - 0 - } else { - self.newlines[line_end_index - 1] + 1 - }; - let line_number = line_end_index + 1; - let column_offset = token.start - line_start_pos; - (line_number, column_offset) - } - - pub fn next_token(&mut self) -> Option> { - self.skip_whitespace(); // TODO: Whitespace preserving/comment preserving - let (pos, c) = match self.advance() { - Some((p, c)) => (p, c), - None => return None, - }; - - let token = match c { - '{' => TokenKind::LeftBrace, - '}' => TokenKind::RightBrace, - '[' => TokenKind::LeftBracket, - ']' => TokenKind::RightBracket, - '(' => TokenKind::LeftParen, - ')' => TokenKind::RightParen, - ',' => TokenKind::Comma, - '.' => TokenKind::Dot, - '-' => { - if self.matches_next(|c| c.is_ascii_digit()) { - self.number(pos) - } else { - TokenKind::Minus - } - } - '+' => { - if self.matches_next(|c| c.is_ascii_digit()) { - self.number(pos) - } else { - TokenKind::Plus - } - } - ';' => TokenKind::Semicolon, - '/' => TokenKind::Slash, - '*' => TokenKind::Star, - '!' => { - if self.matches('=') { - TokenKind::BangEqual - } else { - TokenKind::Bang - } - } - '=' => { - if self.matches('=') { - TokenKind::EqualEqual - } else { - TokenKind::Equal - } - } - '>' => { - if self.matches('=') { - TokenKind::GreaterEqual - } else { - TokenKind::Greater - } - } - '<' => { - if self.matches('=') { - TokenKind::LessEqual - } else { - TokenKind::Less - } - } - '\'' => self.string(pos, '\''), - '"' => self.string(pos, '"'), - _ => { - if self.matches_next(|c| c.is_ascii_digit()) { - self.number(pos) - } else if self.matches_next(|c| c.is_ascii_alphabetic() || c == '_') { - self.identifier(pos) - } else { - TokenKind::Error(format!("Unexpected character '{c}'")) - } - } - }; - let token = self.token(pos, token); - Some(token) - } - - fn token(&self, start: usize, kind: TokenKind<'a>) -> Token<'a> { - Token { kind, start } - } - - fn number(&mut self, start: usize) -> TokenKind<'a> { - // First, the main part. - loop { - if !self.matches_digit() { - break; - } - } - - // Now the fraction part. - // The thing that is bad here is that this is speculative... - let backup = self.chars.clone(); - if self.matches('.') { - let mut saw_digit = false; - loop { - if self.matches('_') { - } else if self.matches_next(|c| c.is_ascii_digit()) { - saw_digit = true; - } else { - break; - } - } - - if saw_digit { - // OK we're good to here! Check the scientific notation. - if self.matches('e') || self.matches('E') { - if self.matches('+') || self.matches('-') {} - let mut saw_digit = false; - loop { - if self.matches('_') { - } else if self.matches_next(|c| c.is_ascii_digit()) { - saw_digit = true; - } else { - break; - } - } - - if !saw_digit { - // This is just a broken number. - let slice = &self.source[start..self.pos()]; - return TokenKind::Error(format!( - "Invalid floating-point literal: {slice}" - )); - } - } - } else { - // Might be accessing a member on an integer. - self.chars = backup; - } - } - - TokenKind::Number(&self.source[start..self.pos()]) - } - - fn string(&mut self, start: usize, delimiter: char) -> TokenKind<'a> { - while !self.matches(delimiter) { - if self.eof() { - return TokenKind::Error("Unterminated string constant".to_string()); - } - if self.matches('\\') { - self.advance(); - } - } - - TokenKind::String(&self.source[start..self.pos()]) - } - - fn identifier(&mut self, start: usize) -> TokenKind<'a> { - loop { - // TODO: Use unicode identifier classes instead - if !self.matches_next(|c| c.is_ascii_alphanumeric() || c == '_') { - break; - } - } - - let ident = &self.source[start..self.pos()]; - match ident.chars().nth(0) { - Some('a') => { - if ident == "and" { - return TokenKind::And; - } - if ident == "async" { - return TokenKind::Async; - } - if ident == "await" { - return TokenKind::Await; - } - } - Some('c') => { - if ident == "class" { - return TokenKind::Class; - } - } - Some('e') => { - if ident == "else" { - return TokenKind::Else; - } - } - Some('f') => { - if ident == "for" { - return TokenKind::For; - } - if ident == "from" { - return TokenKind::From; - } - if ident == "fun" { - return TokenKind::Fun; - } - } - Some('i') => { - if ident == "if" { - return TokenKind::If; - } - } - Some('l') => { - if ident == "let" { - return TokenKind::Let; - } - } - Some('o') => { - if ident == "or" { - return TokenKind::Or; - } - } - Some('p') => { - if ident == "print" { - return TokenKind::Print; - } - } - Some('r') => { - if ident == "return" { - return TokenKind::Return; - } - } - Some('s') => { - if ident == "select" { - return TokenKind::Select; - } - } - Some('t') => { - if ident == "this" { - return TokenKind::This; - } - if ident == "true" { - return TokenKind::True; - } - } - Some('w') => { - if ident == "while" { - return TokenKind::While; - } - } - Some('y') => { - if ident == "yield" { - return TokenKind::Yield; - } - } - _ => (), - } - - TokenKind::Identifier(ident) - } - - fn matches(&mut self, ch: char) -> bool { - if let Some((_, next_ch)) = self.next_char { - if next_ch == ch { - self.advance(); - return true; - } - } - false - } - - fn matches_next(&mut self, f: F) -> bool - where - F: FnOnce(char) -> bool, - { - if let Some((_, next_ch)) = self.next_char { - if f(next_ch) { - self.advance(); - return true; - } - } - false - } - - fn matches_digit(&mut self) -> bool { - self.matches('_') || self.matches_next(|c| c.is_ascii_digit()) - } - - fn advance(&mut self) -> Option<(usize, char)> { - let result = self.next_char; - self.next_char = self.chars.next(); - result - } - - fn pos(&self) -> usize { - match self.next_char { - Some((p, _)) => p, - None => self.source.len(), - } - } - - fn eof(&self) -> bool { - self.next_char.is_none() - } - - fn skip_whitespace(&mut self) { - while let Some((pos, ch)) = self.next_char { - if ch == '\n' { - self.newlines.push(pos); - } else if !ch.is_whitespace() { - break; - } - self.advance(); - } - } -} - -pub fn tokenize(input: String) { - let mut tokens = Tokens::new(&input); - while let Some(token) = tokens.next_token() { - println!("{}: {}", token.start, token.as_str()); - } -} +pub mod tokens; diff --git a/oden-script/src/main.rs b/oden-script/src/main.rs index 7f158ec7..da0f5d92 100644 --- a/oden-script/src/main.rs +++ b/oden-script/src/main.rs @@ -1,3 +1 @@ -use oden_script; - pub fn main() {} diff --git a/oden-script/src/tokens.rs b/oden-script/src/tokens.rs new file mode 100644 index 00000000..02979fe5 --- /dev/null +++ b/oden-script/src/tokens.rs @@ -0,0 +1,569 @@ +#[derive(Debug, PartialEq, Eq)] +pub enum TokenKind<'a> { + LeftBrace, + RightBrace, + LeftBracket, + RightBracket, + LeftParen, + RightParen, + Comma, + Dot, + Minus, + Plus, + Semicolon, + Slash, + Star, + + Bang, + BangEqual, + Equal, + EqualEqual, + Greater, + GreaterEqual, + Less, + LessEqual, + + Identifier(&'a str), // TODO + String(&'a str), + Number(&'a str), + + And, + Async, + Await, + Class, + Else, + False, + For, + From, + Fun, + If, + Let, + Or, + Print, + Return, + Select, + This, + True, + While, + Yield, + + Error(String), +} + +#[derive(Debug, PartialEq, Eq)] +pub struct Token<'a> { + kind: TokenKind<'a>, + start: usize, +} + +impl<'a> Token<'a> { + pub fn new(start: usize, kind: TokenKind<'a>) -> Self { + Token { kind, start } + } + + pub fn as_str<'b>(&'b self) -> &'a str + where + 'b: 'a, + { + use TokenKind::*; + match &self.kind { + LeftBrace => "{", + RightBrace => "}", + LeftBracket => "[", + RightBracket => "]", + + LeftParen => "(", + RightParen => ")", + Comma => ",", + Dot => ".", + Minus => "-", + + Plus => "+", + Semicolon => ";", + Slash => "/", + Star => "*", + + Bang => "+", + BangEqual => "!=", + Equal => "=", + EqualEqual => "==", + Greater => ">", + GreaterEqual => ">=", + Less => "<", + LessEqual => "<=", + + Identifier(v) => v, + String(v) => v, + Number(v) => v, + + And => "and", + Async => "async", + Await => "await", + Class => "class", + Else => "else", + False => "false", + For => "for", + From => "from", + Fun => "fun", + If => "if", + Let => "let", + Or => "or", + Print => "print", + Return => "return", + Select => "select", + This => "this", + True => "true", + While => "while", + Yield => "yield", + + Error(e) => e, + } + } +} + +pub struct Tokens<'a> { + source: &'a str, + chars: std::str::CharIndices<'a>, + next_char: Option<(usize, char)>, + newlines: Vec, +} + +impl<'a> Tokens<'a> { + pub fn new(source: &'a str) -> Self { + let mut result = Tokens { + source, + chars: source.char_indices(), + next_char: None, + newlines: Vec::new(), + }; + result.advance(); // Prime the pump + result + } + + pub fn token_position(&self, token: &Token) -> (usize, usize) { + let line_end_index = match self.newlines.binary_search(&token.start) { + Ok(index) => index, + Err(index) => index, + }; + let line_start_pos = if line_end_index == 0 { + 0 + } else { + self.newlines[line_end_index - 1] + 1 + }; + let line_number = line_end_index + 1; + let column_offset = token.start - line_start_pos; + (line_number, column_offset) + } + + fn token(&self, start: usize, kind: TokenKind<'a>) -> Token<'a> { + Token::new(start, kind) + } + + fn number(&mut self, start: usize) -> TokenKind<'a> { + // First, the main part. + loop { + if !self.matches_digit() { + break; + } + } + + // Now the fraction part. + // The thing that is bad here is that this is speculative... + let backup = self.chars.clone(); + if self.matches('.') { + let mut saw_digit = false; + loop { + if self.matches('_') { + } else if self.matches_next(|c| c.is_ascii_digit()) { + saw_digit = true; + } else { + break; + } + } + + if saw_digit { + // OK we're good to here! Check the scientific notation. + if self.matches('e') || self.matches('E') { + if self.matches('+') || self.matches('-') {} + let mut saw_digit = false; + loop { + if self.matches('_') { + } else if self.matches_next(|c| c.is_ascii_digit()) { + saw_digit = true; + } else { + break; + } + } + + if !saw_digit { + // This is just a broken number. + let slice = &self.source[start..self.pos()]; + return TokenKind::Error(format!( + "Invalid floating-point literal: {slice}" + )); + } + } + } else { + // Might be accessing a member on an integer. + self.chars = backup; + } + } + + TokenKind::Number(&self.source[start..self.pos()]) + } + + fn string(&mut self, start: usize, delimiter: char) -> TokenKind<'a> { + while !self.matches(delimiter) { + if self.eof() { + return TokenKind::Error("Unterminated string constant".to_string()); + } + if self.matches('\\') { + self.advance(); + } else { + self.advance(); + } + } + + TokenKind::String(&self.source[start..self.pos()]) + } + + fn identifier(&mut self, start: usize) -> TokenKind<'a> { + loop { + // TODO: Use unicode identifier classes instead + if !self.matches_next(|c| c.is_ascii_alphanumeric() || c == '_') { + break; + } + } + + let ident = &self.source[start..self.pos()]; + match ident.chars().nth(0) { + Some('a') => { + if ident == "and" { + return TokenKind::And; + } + if ident == "async" { + return TokenKind::Async; + } + if ident == "await" { + return TokenKind::Await; + } + } + Some('c') => { + if ident == "class" { + return TokenKind::Class; + } + } + Some('e') => { + if ident == "else" { + return TokenKind::Else; + } + } + Some('f') => { + if ident == "false" { + return TokenKind::False; + } + if ident == "for" { + return TokenKind::For; + } + if ident == "from" { + return TokenKind::From; + } + if ident == "fun" { + return TokenKind::Fun; + } + } + Some('i') => { + if ident == "if" { + return TokenKind::If; + } + } + Some('l') => { + if ident == "let" { + return TokenKind::Let; + } + } + Some('o') => { + if ident == "or" { + return TokenKind::Or; + } + } + Some('p') => { + if ident == "print" { + return TokenKind::Print; + } + } + Some('r') => { + if ident == "return" { + return TokenKind::Return; + } + } + Some('s') => { + if ident == "select" { + return TokenKind::Select; + } + } + Some('t') => { + if ident == "this" { + return TokenKind::This; + } + if ident == "true" { + return TokenKind::True; + } + } + Some('w') => { + if ident == "while" { + return TokenKind::While; + } + } + Some('y') => { + if ident == "yield" { + return TokenKind::Yield; + } + } + _ => (), + } + + TokenKind::Identifier(ident) + } + + fn matches(&mut self, ch: char) -> bool { + if let Some((_, next_ch)) = self.next_char { + if next_ch == ch { + self.advance(); + return true; + } + } + false + } + + fn matches_next(&mut self, f: F) -> bool + where + F: FnOnce(char) -> bool, + { + if let Some((_, next_ch)) = self.next_char { + if f(next_ch) { + eprintln!("MATCHES NEXT: {next_ch}"); + self.advance(); + return true; + } else { + eprintln!("NOT MATCHES NEXT: {next_ch}"); + } + } else { + eprintln!("E O F"); + } + false + } + + fn matches_digit(&mut self) -> bool { + self.matches('_') || self.matches_next(|c| c.is_ascii_digit()) + } + + fn advance(&mut self) -> Option<(usize, char)> { + let result = self.next_char; + self.next_char = self.chars.next(); + eprintln!("NEXT: {:?}", self.next_char); + result + } + + fn pos(&self) -> usize { + match self.next_char { + Some((p, _)) => p, + None => self.source.len(), + } + } + + fn eof(&self) -> bool { + self.next_char.is_none() + } + + fn skip_whitespace(&mut self) { + while let Some((pos, ch)) = self.next_char { + if ch == '\n' { + self.newlines.push(pos); + } else if !ch.is_whitespace() { + break; + } + self.advance(); + } + } +} + +impl<'a> std::iter::Iterator for Tokens<'a> { + type Item = Token<'a>; + + fn next(&mut self) -> Option { + self.skip_whitespace(); // TODO: Whitespace preserving/comment preserving + let (pos, c) = match self.advance() { + Some((p, c)) => (p, c), + None => return None, + }; + + let token = match c { + '{' => TokenKind::LeftBrace, + '}' => TokenKind::RightBrace, + '[' => TokenKind::LeftBracket, + ']' => TokenKind::RightBracket, + '(' => TokenKind::LeftParen, + ')' => TokenKind::RightParen, + ',' => TokenKind::Comma, + '.' => TokenKind::Dot, + '-' => { + if self.matches_next(|c| c.is_ascii_digit()) { + self.number(pos) + } else { + TokenKind::Minus + } + } + '+' => { + if self.matches_next(|c| c.is_ascii_digit()) { + self.number(pos) + } else { + TokenKind::Plus + } + } + ';' => TokenKind::Semicolon, + '/' => TokenKind::Slash, + '*' => TokenKind::Star, + '!' => { + if self.matches('=') { + TokenKind::BangEqual + } else { + TokenKind::Bang + } + } + '=' => { + if self.matches('=') { + TokenKind::EqualEqual + } else { + TokenKind::Equal + } + } + '>' => { + if self.matches('=') { + TokenKind::GreaterEqual + } else { + TokenKind::Greater + } + } + '<' => { + if self.matches('=') { + TokenKind::LessEqual + } else { + TokenKind::Less + } + } + '\'' => self.string(pos, '\''), + '"' => self.string(pos, '"'), + _ => { + if c.is_ascii_digit() { + self.number(pos) + } else if c.is_ascii_alphabetic() || c == '_' { + self.identifier(pos) + } else { + TokenKind::Error(format!("Unexpected character '{c}'")) + } + } + }; + let token = self.token(pos, token); + Some(token) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use pretty_assertions::assert_eq; + + macro_rules! test_tokens { + ($name:ident, $input:expr, $($s:expr),+) => { + #[test] + fn $name() { + use TokenKind::*; + let tokens: Vec<_> = Tokens::new($input).collect(); + let expected = vec![$($s),*]; + assert_eq!(expected, tokens); + } + } + } + + test_tokens!( + numbers, + "1 1.0 1.2e7 2.3e+7 3.3E-06 7_6 8.0e_8", + Token::new(0, Number("1")), + Token::new(2, Number("1.0")), + Token::new(6, Number("1.2e7")), + Token::new(12, Number("2.3e+7")), + Token::new(19, Number("3.3E-06")), + Token::new(27, Number("7_6")), + Token::new(31, Number("8.0e_8")) + ); + + test_tokens!( + identifiers, + "asdf x _123 a_23 x3a and or yield async await class else false for from", + Token::new(0, Identifier("asdf")), + Token::new(5, Identifier("x")), + Token::new(7, Identifier("_123")), + Token::new(12, Identifier("a_23")), + Token::new(17, Identifier("x3a")), + Token::new(21, And), + Token::new(25, Or), + Token::new(28, Yield), + Token::new(34, Async), + Token::new(40, Await), + Token::new(46, Class), + Token::new(52, Else), + Token::new(57, False), + Token::new(63, For), + Token::new(67, From) + ); + + test_tokens!( + more_keywords, + "fun if let print return select this true while truewhile", + Token::new(0, Fun), + Token::new(4, If), + Token::new(7, Let), + Token::new(11, Print), + Token::new(17, Return), + Token::new(24, Select), + Token::new(31, This), + Token::new(36, True), + Token::new(41, While), + Token::new(47, Identifier("truewhile")) + ); + + test_tokens!( + strings, + r#"'this is a string that\'s great!\r\n' "foo's" 'bar"s' "#, + Token::new(0, String(r#"'this is a string that\'s great!\r\n'"#)), + Token::new(38, String(r#""foo's""#)), + Token::new(46, String("'bar\"s'")) + ); + + test_tokens!( + symbols, + "{ } ( ) [ ] . ! != < <= > >= = == , - + * / ;", + Token::new(0, LeftBrace), + Token::new(2, RightBrace), + Token::new(4, LeftParen), + Token::new(6, RightParen), + Token::new(8, LeftBracket), + Token::new(10, RightBracket), + Token::new(12, Dot), + Token::new(14, Bang), + Token::new(16, BangEqual), + Token::new(19, Less), + Token::new(21, LessEqual), + Token::new(24, Greater), + Token::new(26, GreaterEqual), + Token::new(29, Equal), + Token::new(31, EqualEqual), + Token::new(34, Comma), + Token::new(36, Minus), + Token::new(38, Plus), + Token::new(40, Star), + Token::new(42, Slash), + Token::new(44, Semicolon) + ); +} From ece5576fb2f0db79e1d04d7acb16368dfcf8b4c1 Mon Sep 17 00:00:00 2001 From: John Doty Date: Mon, 1 Jan 2024 08:07:29 -0800 Subject: [PATCH 2/9] [fine] Starting to parse (ugh) --- oden-script/src/lib.rs | 1 + oden-script/src/parser.rs | 360 ++++++++++++++++++++++++++++++++++++++ oden-script/src/tokens.rs | 358 ++++++++++++++++++------------------- 3 files changed, 534 insertions(+), 185 deletions(-) create mode 100644 oden-script/src/parser.rs diff --git a/oden-script/src/lib.rs b/oden-script/src/lib.rs index 5c766355..4144a208 100644 --- a/oden-script/src/lib.rs +++ b/oden-script/src/lib.rs @@ -1 +1,2 @@ +pub mod parser; pub mod tokens; diff --git a/oden-script/src/parser.rs b/oden-script/src/parser.rs new file mode 100644 index 00000000..61283f73 --- /dev/null +++ b/oden-script/src/parser.rs @@ -0,0 +1,360 @@ +use crate::tokens::{Token, TokenKind, Tokens}; +use std::fmt; + +#[derive(PartialEq, Eq)] +pub struct SyntaxError { + pub line: usize, + pub column: usize, + pub message: String, +} + +impl SyntaxError { + pub fn new(line: usize, column: usize, message: String) -> Self { + SyntaxError { + line, + column, + message, + } + } +} + +impl fmt::Debug for SyntaxError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}:{}: {}", self.line, self.column, self.message) + } +} + +impl fmt::Display for SyntaxError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}:{}: {}", self.line, self.column, self.message) + } +} + +pub enum Literal { + Float64(f64), +} + +pub enum UnaryOp { + Negate, +} + +pub enum BinaryOp { + Add, + Subtract, + Mutiply, + Divide, + And, + Or, +} + +pub enum Expr { + Literal(Literal), + Unary(UnaryOp, ExprRef), + Binary(BinaryOp, ExprRef, ExprRef), +} + +pub struct ExprRef(Option); + +impl ExprRef { + pub fn error() -> Self { + ExprRef(None) + } +} + +pub struct SyntaxTree { + pub errors: Vec, + expressions: Vec, +} + +impl SyntaxTree { + pub fn new() -> Self { + SyntaxTree { + errors: Vec::new(), + expressions: Vec::new(), + } + } + + pub fn add_error(&mut self, error: SyntaxError) { + self.errors.push(error); + } + + pub fn add_expr(&mut self, expr: Expr) -> ExprRef { + let index = self.expressions.len(); + self.expressions.push(expr); + ExprRef(Some(index)) + } + + pub fn dump_expr(&self, expr: &ExprRef) -> String { + match expr.0 { + Some(idx) => { + let expr = &self.expressions[idx]; + match expr { + Expr::Literal(lit) => match lit { + Literal::Float64(f) => f.to_string(), + }, + Expr::Unary(op, e) => { + let op = match op { + UnaryOp::Negate => "-", + }; + format!("({op} {})", self.dump_expr(e)) + } + Expr::Binary(op, l, r) => { + let op = match op { + BinaryOp::Add => "+", + BinaryOp::Subtract => "-", + BinaryOp::Mutiply => "*", + BinaryOp::Divide => "/", + BinaryOp::And => "and", + BinaryOp::Or => "or", + }; + format!("({op} {} {})", self.dump_expr(l), self.dump_expr(r)) + } + } + } + None => "<|EOF|>".to_string(), + } + } +} + +// BINDING POWERS. When parsing expressions we only accept expressions that +// meet a minimum binding power. (This is like "precedence" but I just super +// don't like that terminology.) +const ASSIGNMENT_POWER: u8 = 0; // = +const OR_POWER: u8 = 1; // or +const AND_POWER: u8 = 2; // and +const EQUALITY_POWER: u8 = 3; // == != +const COMPARISON_POWER: u8 = 4; // < > <= >= +const TERM_POWER: u8 = 5; // + - +const FACTOR_POWER: u8 = 6; // * / +const UNARY_POWER: u8 = 7; // ! - + +// const CALL_POWER: u8 = 8; // . () +// const PRIMARY_POWER: u8 = 9; + +fn token_power<'a>(token: &Option>) -> Option { + let token = match token { + Some(t) => t, + None => return None, + }; + + match token.kind() { + TokenKind::Equal => Some(ASSIGNMENT_POWER), + TokenKind::Or => Some(OR_POWER), + TokenKind::And => Some(AND_POWER), + TokenKind::EqualEqual | TokenKind::BangEqual => Some(EQUALITY_POWER), + TokenKind::Less | TokenKind::Greater | TokenKind::GreaterEqual | TokenKind::LessEqual => { + Some(COMPARISON_POWER) + } + TokenKind::Plus | TokenKind::Minus => Some(TERM_POWER), + TokenKind::Star | TokenKind::Slash => Some(FACTOR_POWER), + _ => None, + } +} + +pub struct Parser<'a> { + tokens: Tokens<'a>, + tree: SyntaxTree, + current: Option>, + previous: Option>, + + panic_mode: bool, +} + +impl<'a> Parser<'a> { + pub fn new(source: &'a str) -> Self { + let mut parser = Parser { + tokens: Tokens::new(source), + tree: SyntaxTree::new(), + current: None, + previous: None, + panic_mode: false, + }; + parser.advance(); + parser + } + + pub fn parse(mut self) -> (SyntaxTree, ExprRef) { + let expr = self.expression(); + self.consume(None, "expected end of expression"); + (self.tree, expr) + } + + fn expression(&mut self) -> ExprRef { + self.expression_with_power(0) + } + + fn expression_with_power(&mut self, minimum_power: u8) -> ExprRef { + self.advance(); + let mut expr = self.prefix_expression(); + loop { + let power = match token_power(&self.current) { + Some(p) => p, + None => break, // EOF, end of expression? + }; + + if power < minimum_power { + break; + } + + self.advance(); + expr = self.infix_expression(power, expr); + } + expr + } + + fn prefix_expression(&mut self) -> ExprRef { + let token = self.previous.as_ref(); + match token { + Some(token) => match token.kind() { + TokenKind::LeftParen => self.grouping(), + TokenKind::Number => self.number(), + TokenKind::Minus => self.unary(), + _ => { + self.error("expected an expression"); + ExprRef::error() + } + }, + None => { + self.error("expected an expression"); + ExprRef::error() + } + } + } + + fn infix_expression(&mut self, power: u8, left: ExprRef) -> ExprRef { + let kind = self.previous.as_ref().unwrap().kind(); + match kind { + TokenKind::Plus | TokenKind::Minus | TokenKind::Star | TokenKind::Slash => { + self.binary(power, left) + } + _ => panic!("Unknown infix operator, dispatch error?"), + } + } + + fn number(&mut self) -> ExprRef { + let token = self.previous.as_ref().unwrap(); + // What kind is it? For now let's just ... make it good. + + match token.as_str().parse::() { + Ok(v) => self.tree.add_expr(Expr::Literal(Literal::Float64(v))), + Err(e) => { + self.error(format!("invalid f64: {e}")); + ExprRef::error() + } + } + } + + fn grouping(&mut self) -> ExprRef { + let result = self.number(); + self.consume( + Some(TokenKind::RightParen), + "expected ')' after an expression", + ); + result + } + + fn unary(&mut self) -> ExprRef { + let kind = self.previous.as_ref().unwrap().kind(); + let expr = self.expression_with_power(UNARY_POWER); + let op = match kind { + TokenKind::Minus => UnaryOp::Negate, + _ => panic!("unsuitable unary: {:?}: no op", kind), + }; + self.tree.add_expr(Expr::Unary(op, expr)) + } + + fn binary(&mut self, power: u8, left: ExprRef) -> ExprRef { + let right = self.expression_with_power(power + 1); + let op = match self.previous.as_ref().unwrap().kind() { + TokenKind::Plus => BinaryOp::Add, + TokenKind::Minus => BinaryOp::Subtract, + TokenKind::Star => BinaryOp::Mutiply, + TokenKind::Slash => BinaryOp::Divide, + TokenKind::And => BinaryOp::And, + TokenKind::Or => BinaryOp::Or, + _ => panic!("unsuitable binary: {:?}: no op", self.previous), + }; + + self.tree.add_expr(Expr::Binary(op, left, right)) + } + + fn advance(&mut self) { + self.previous = self.current.take(); + loop { + self.current = self.tokens.next(); + match &self.current { + Some(token) if token.kind() == TokenKind::Error => { + self.error_at_current(token.clone()) + } + _ => break, + } + } + } + + fn consume(&mut self, kind: Option, error: &str) { + match (&self.current, kind) { + (Some(token), Some(kind)) if token.kind() == kind => self.advance(), + (None, None) => (), + _ => { + self.error_at_current(error); + } + } + } + + fn error(&mut self, message: T) + where + T: Into, + { + self.error_at(self.previous.clone(), message) + } + + fn error_at_current(&mut self, message: T) + where + T: Into, + { + self.error_at(self.current.clone(), message) + } + + fn error_at(&mut self, token: Option>, message: T) + where + T: Into, + { + if self.panic_mode { + return; + } + self.panic_mode = true; + + let message: String = message.into(); + let (line, column) = self.tokens.token_position(&token); + let mut final_message = "Error ".to_string(); + match token { + None => final_message.push_str("at end"), + Some(t) => { + if t.kind() != TokenKind::Error { + final_message.push_str("at '"); + final_message.push_str(t.as_str()); + final_message.push_str("'"); + } + } + } + final_message.push_str(": "); + final_message.push_str(&message); + + self.tree + .add_error(SyntaxError::new(line, column, final_message)); + } +} + +#[cfg(test)] +mod tests { + use super::*; + use pretty_assertions::assert_eq; + + #[test] + pub fn number_expressions() { + // How am I going to test this? + let (tree, expr) = Parser::new("23.5").parse(); + assert_eq!(Vec::::new(), tree.errors); + assert_eq!("23.5", tree.dump_expr(&expr)); + } +} diff --git a/oden-script/src/tokens.rs b/oden-script/src/tokens.rs index 02979fe5..d0df5685 100644 --- a/oden-script/src/tokens.rs +++ b/oden-script/src/tokens.rs @@ -1,5 +1,5 @@ -#[derive(Debug, PartialEq, Eq)] -pub enum TokenKind<'a> { +#[derive(Debug, PartialEq, Eq, Clone, Copy)] +pub enum TokenKind { LeftBrace, RightBrace, LeftBracket, @@ -23,9 +23,9 @@ pub enum TokenKind<'a> { Less, LessEqual, - Identifier(&'a str), // TODO - String(&'a str), - Number(&'a str), + Identifier, + String, + Number, And, Async, @@ -47,80 +47,54 @@ pub enum TokenKind<'a> { While, Yield, - Error(String), + Error, } -#[derive(Debug, PartialEq, Eq)] +#[derive(Debug, PartialEq, Eq, Clone)] pub struct Token<'a> { - kind: TokenKind<'a>, + kind: TokenKind, start: usize, + value: Result<&'a str, String>, } impl<'a> Token<'a> { - pub fn new(start: usize, kind: TokenKind<'a>) -> Self { - Token { kind, start } + pub fn new(kind: TokenKind, start: usize, value: &'a str) -> Self { + Token { + kind, + start, + value: Ok(value), + } + } + + pub fn error(start: usize, message: String) -> Self { + Token { + kind: TokenKind::Error, + start, + value: Err(message), + } + } + + pub fn kind(&self) -> TokenKind { + self.kind } pub fn as_str<'b>(&'b self) -> &'a str where 'b: 'a, { - use TokenKind::*; - match &self.kind { - LeftBrace => "{", - RightBrace => "}", - LeftBracket => "[", - RightBracket => "]", - - LeftParen => "(", - RightParen => ")", - Comma => ",", - Dot => ".", - Minus => "-", - - Plus => "+", - Semicolon => ";", - Slash => "/", - Star => "*", - - Bang => "+", - BangEqual => "!=", - Equal => "=", - EqualEqual => "==", - Greater => ">", - GreaterEqual => ">=", - Less => "<", - LessEqual => "<=", - - Identifier(v) => v, - String(v) => v, - Number(v) => v, - - And => "and", - Async => "async", - Await => "await", - Class => "class", - Else => "else", - False => "false", - For => "for", - From => "from", - Fun => "fun", - If => "if", - Let => "let", - Or => "or", - Print => "print", - Return => "return", - Select => "select", - This => "this", - True => "true", - While => "while", - Yield => "yield", - - Error(e) => e, + match &self.value { + Ok(v) => v, + Err(e) => &e, } } } +impl<'a> Into for Token<'a> { + fn into(self) -> String { + self.as_str().to_string() + } +} + pub struct Tokens<'a> { source: &'a str, chars: std::str::CharIndices<'a>, @@ -140,8 +114,17 @@ impl<'a> Tokens<'a> { result } - pub fn token_position(&self, token: &Token) -> (usize, usize) { - let line_end_index = match self.newlines.binary_search(&token.start) { + /// Return the position of the given token as a (line, column) pair. By + /// convention, lines are 1-based and columns are 0-based. Also, in + /// keeping with the iterator-nature of the tokenizer, `None` here + /// indicates end-of-file, and will return the position of the end of the + /// file. + pub fn token_position(&self, token: &Option) -> (usize, usize) { + let start = match token { + Some(t) => t.start, + None => self.source.len(), + }; + let line_end_index = match self.newlines.binary_search(&start) { Ok(index) => index, Err(index) => index, }; @@ -151,15 +134,16 @@ impl<'a> Tokens<'a> { self.newlines[line_end_index - 1] + 1 }; let line_number = line_end_index + 1; - let column_offset = token.start - line_start_pos; + let column_offset = start - line_start_pos; (line_number, column_offset) } - fn token(&self, start: usize, kind: TokenKind<'a>) -> Token<'a> { - Token::new(start, kind) + fn token(&self, start: usize, kind: TokenKind) -> Token<'a> { + let value = &self.source[start..self.pos()]; + Token::new(kind, start, value) } - fn number(&mut self, start: usize) -> TokenKind<'a> { + fn number(&mut self, start: usize) -> Token<'a> { // First, the main part. loop { if !self.matches_digit() { @@ -198,9 +182,10 @@ impl<'a> Tokens<'a> { if !saw_digit { // This is just a broken number. let slice = &self.source[start..self.pos()]; - return TokenKind::Error(format!( - "Invalid floating-point literal: {slice}" - )); + return Token::error( + start, + format!("Invalid floating-point literal: {slice}"), + ); } } } else { @@ -209,13 +194,13 @@ impl<'a> Tokens<'a> { } } - TokenKind::Number(&self.source[start..self.pos()]) + self.token(start, TokenKind::Number) } - fn string(&mut self, start: usize, delimiter: char) -> TokenKind<'a> { + fn string(&mut self, start: usize, delimiter: char) -> Token<'a> { while !self.matches(delimiter) { if self.eof() { - return TokenKind::Error("Unterminated string constant".to_string()); + return Token::error(start, "Unterminated string constant".to_string()); } if self.matches('\\') { self.advance(); @@ -224,20 +209,12 @@ impl<'a> Tokens<'a> { } } - TokenKind::String(&self.source[start..self.pos()]) + self.token(start, TokenKind::String) } - fn identifier(&mut self, start: usize) -> TokenKind<'a> { - loop { - // TODO: Use unicode identifier classes instead - if !self.matches_next(|c| c.is_ascii_alphanumeric() || c == '_') { - break; - } - } - - let ident = &self.source[start..self.pos()]; - match ident.chars().nth(0) { - Some('a') => { + fn identifier_token_kind(ident: &str) -> TokenKind { + match ident.chars().nth(0).unwrap() { + 'a' => { if ident == "and" { return TokenKind::And; } @@ -248,17 +225,17 @@ impl<'a> Tokens<'a> { return TokenKind::Await; } } - Some('c') => { + 'c' => { if ident == "class" { return TokenKind::Class; } } - Some('e') => { + 'e' => { if ident == "else" { return TokenKind::Else; } } - Some('f') => { + 'f' => { if ident == "false" { return TokenKind::False; } @@ -272,37 +249,37 @@ impl<'a> Tokens<'a> { return TokenKind::Fun; } } - Some('i') => { + 'i' => { if ident == "if" { return TokenKind::If; } } - Some('l') => { + 'l' => { if ident == "let" { return TokenKind::Let; } } - Some('o') => { + 'o' => { if ident == "or" { return TokenKind::Or; } } - Some('p') => { + 'p' => { if ident == "print" { return TokenKind::Print; } } - Some('r') => { + 'r' => { if ident == "return" { return TokenKind::Return; } } - Some('s') => { + 's' => { if ident == "select" { return TokenKind::Select; } } - Some('t') => { + 't' => { if ident == "this" { return TokenKind::This; } @@ -310,12 +287,12 @@ impl<'a> Tokens<'a> { return TokenKind::True; } } - Some('w') => { + 'w' => { if ident == "while" { return TokenKind::While; } } - Some('y') => { + 'y' => { if ident == "yield" { return TokenKind::Yield; } @@ -323,7 +300,20 @@ impl<'a> Tokens<'a> { _ => (), } - TokenKind::Identifier(ident) + TokenKind::Identifier + } + + fn identifier(&mut self, start: usize) -> Token<'a> { + loop { + // TODO: Use unicode identifier classes instead + if !self.matches_next(|c| c.is_ascii_alphanumeric() || c == '_') { + break; + } + } + + let ident = &self.source[start..self.pos()]; + let kind = Self::identifier_token_kind(ident); + Token::new(kind, start, ident) } fn matches(&mut self, ch: char) -> bool { @@ -342,14 +332,9 @@ impl<'a> Tokens<'a> { { if let Some((_, next_ch)) = self.next_char { if f(next_ch) { - eprintln!("MATCHES NEXT: {next_ch}"); self.advance(); return true; - } else { - eprintln!("NOT MATCHES NEXT: {next_ch}"); } - } else { - eprintln!("E O F"); } false } @@ -361,7 +346,6 @@ impl<'a> Tokens<'a> { fn advance(&mut self) -> Option<(usize, char)> { let result = self.next_char; self.next_char = self.chars.next(); - eprintln!("NEXT: {:?}", self.next_char); result } @@ -399,57 +383,57 @@ impl<'a> std::iter::Iterator for Tokens<'a> { }; let token = match c { - '{' => TokenKind::LeftBrace, - '}' => TokenKind::RightBrace, - '[' => TokenKind::LeftBracket, - ']' => TokenKind::RightBracket, - '(' => TokenKind::LeftParen, - ')' => TokenKind::RightParen, - ',' => TokenKind::Comma, - '.' => TokenKind::Dot, + '{' => self.token(pos, TokenKind::LeftBrace), + '}' => self.token(pos, TokenKind::RightBrace), + '[' => self.token(pos, TokenKind::LeftBracket), + ']' => self.token(pos, TokenKind::RightBracket), + '(' => self.token(pos, TokenKind::LeftParen), + ')' => self.token(pos, TokenKind::RightParen), + ',' => self.token(pos, TokenKind::Comma), + '.' => self.token(pos, TokenKind::Dot), '-' => { if self.matches_next(|c| c.is_ascii_digit()) { self.number(pos) } else { - TokenKind::Minus + self.token(pos, TokenKind::Minus) } } '+' => { if self.matches_next(|c| c.is_ascii_digit()) { self.number(pos) } else { - TokenKind::Plus + self.token(pos, TokenKind::Plus) } } - ';' => TokenKind::Semicolon, - '/' => TokenKind::Slash, - '*' => TokenKind::Star, + ';' => self.token(pos, TokenKind::Semicolon), + '/' => self.token(pos, TokenKind::Slash), + '*' => self.token(pos, TokenKind::Star), '!' => { if self.matches('=') { - TokenKind::BangEqual + self.token(pos, TokenKind::BangEqual) } else { - TokenKind::Bang + self.token(pos, TokenKind::Bang) } } '=' => { if self.matches('=') { - TokenKind::EqualEqual + self.token(pos, TokenKind::EqualEqual) } else { - TokenKind::Equal + self.token(pos, TokenKind::Equal) } } '>' => { if self.matches('=') { - TokenKind::GreaterEqual + self.token(pos, TokenKind::GreaterEqual) } else { - TokenKind::Greater + self.token(pos, TokenKind::Greater) } } '<' => { if self.matches('=') { - TokenKind::LessEqual + self.token(pos, TokenKind::LessEqual) } else { - TokenKind::Less + self.token(pos, TokenKind::Less) } } '\'' => self.string(pos, '\''), @@ -460,11 +444,10 @@ impl<'a> std::iter::Iterator for Tokens<'a> { } else if c.is_ascii_alphabetic() || c == '_' { self.identifier(pos) } else { - TokenKind::Error(format!("Unexpected character '{c}'")) + Token::error(pos, format!("Unexpected character '{c}'")) } } }; - let token = self.token(pos, token); Some(token) } } @@ -480,7 +463,12 @@ mod tests { fn $name() { use TokenKind::*; let tokens: Vec<_> = Tokens::new($input).collect(); - let expected = vec![$($s),*]; + + let expected: Vec = (vec![$($s),*]) + .into_iter() + .map(|t| Token::new(t.1, t.0, t.2)) + .collect(); + assert_eq!(expected, tokens); } } @@ -489,81 +477,81 @@ mod tests { test_tokens!( numbers, "1 1.0 1.2e7 2.3e+7 3.3E-06 7_6 8.0e_8", - Token::new(0, Number("1")), - Token::new(2, Number("1.0")), - Token::new(6, Number("1.2e7")), - Token::new(12, Number("2.3e+7")), - Token::new(19, Number("3.3E-06")), - Token::new(27, Number("7_6")), - Token::new(31, Number("8.0e_8")) + (0, Number, "1"), + (2, Number, "1.0"), + (6, Number, "1.2e7"), + (12, Number, "2.3e+7"), + (19, Number, "3.3E-06"), + (27, Number, "7_6"), + (31, Number, "8.0e_8") ); test_tokens!( identifiers, "asdf x _123 a_23 x3a and or yield async await class else false for from", - Token::new(0, Identifier("asdf")), - Token::new(5, Identifier("x")), - Token::new(7, Identifier("_123")), - Token::new(12, Identifier("a_23")), - Token::new(17, Identifier("x3a")), - Token::new(21, And), - Token::new(25, Or), - Token::new(28, Yield), - Token::new(34, Async), - Token::new(40, Await), - Token::new(46, Class), - Token::new(52, Else), - Token::new(57, False), - Token::new(63, For), - Token::new(67, From) + (0, Identifier, "asdf"), + (5, Identifier, "x"), + (7, Identifier, "_123"), + (12, Identifier, "a_23"), + (17, Identifier, "x3a"), + (21, And, "and"), + (25, Or, "or"), + (28, Yield, "yield"), + (34, Async, "async"), + (40, Await, "await"), + (46, Class, "class"), + (52, Else, "else"), + (57, False, "false"), + (63, For, "for"), + (67, From, "from") ); test_tokens!( more_keywords, "fun if let print return select this true while truewhile", - Token::new(0, Fun), - Token::new(4, If), - Token::new(7, Let), - Token::new(11, Print), - Token::new(17, Return), - Token::new(24, Select), - Token::new(31, This), - Token::new(36, True), - Token::new(41, While), - Token::new(47, Identifier("truewhile")) + (0, Fun, "fun"), + (4, If, "if"), + (7, Let, "let"), + (11, Print, "print"), + (17, Return, "return"), + (24, Select, "select"), + (31, This, "this"), + (36, True, "true"), + (41, While, "while"), + (47, Identifier, "truewhile") ); test_tokens!( strings, r#"'this is a string that\'s great!\r\n' "foo's" 'bar"s' "#, - Token::new(0, String(r#"'this is a string that\'s great!\r\n'"#)), - Token::new(38, String(r#""foo's""#)), - Token::new(46, String("'bar\"s'")) + (0, String, r#"'this is a string that\'s great!\r\n'"#), + (38, String, r#""foo's""#), + (46, String, "'bar\"s'") ); test_tokens!( symbols, "{ } ( ) [ ] . ! != < <= > >= = == , - + * / ;", - Token::new(0, LeftBrace), - Token::new(2, RightBrace), - Token::new(4, LeftParen), - Token::new(6, RightParen), - Token::new(8, LeftBracket), - Token::new(10, RightBracket), - Token::new(12, Dot), - Token::new(14, Bang), - Token::new(16, BangEqual), - Token::new(19, Less), - Token::new(21, LessEqual), - Token::new(24, Greater), - Token::new(26, GreaterEqual), - Token::new(29, Equal), - Token::new(31, EqualEqual), - Token::new(34, Comma), - Token::new(36, Minus), - Token::new(38, Plus), - Token::new(40, Star), - Token::new(42, Slash), - Token::new(44, Semicolon) + (0, LeftBrace, "{"), + (2, RightBrace, "}"), + (4, LeftParen, "("), + (6, RightParen, ")"), + (8, LeftBracket, "["), + (10, RightBracket, "]"), + (12, Dot, "."), + (14, Bang, "!"), + (16, BangEqual, "!="), + (19, Less, "<"), + (21, LessEqual, "<="), + (24, Greater, ">"), + (26, GreaterEqual, ">="), + (29, Equal, "="), + (31, EqualEqual, "=="), + (34, Comma, ","), + (36, Minus, "-"), + (38, Plus, "+"), + (40, Star, "*"), + (42, Slash, "/"), + (44, Semicolon, ";") ); } From 24d056b19844535fdaffd30313dc4bfe6177b950 Mon Sep 17 00:00:00 2001 From: John Doty Date: Mon, 1 Jan 2024 08:30:30 -0800 Subject: [PATCH 3/9] [fine] Fix bugs, testing --- oden-script/src/parser.rs | 53 +++++++++++++++++++++++++++++++++------ oden-script/src/tokens.rs | 16 ++---------- 2 files changed, 47 insertions(+), 22 deletions(-) diff --git a/oden-script/src/parser.rs b/oden-script/src/parser.rs index 61283f73..e05c9a84 100644 --- a/oden-script/src/parser.rs +++ b/oden-script/src/parser.rs @@ -184,6 +184,7 @@ impl<'a> Parser<'a> { } fn expression_with_power(&mut self, minimum_power: u8) -> ExprRef { + self.trace("expression with power"); self.advance(); let mut expr = self.prefix_expression(); loop { @@ -203,6 +204,7 @@ impl<'a> Parser<'a> { } fn prefix_expression(&mut self) -> ExprRef { + self.trace("prefix"); let token = self.previous.as_ref(); match token { Some(token) => match token.kind() { @@ -222,6 +224,7 @@ impl<'a> Parser<'a> { } fn infix_expression(&mut self, power: u8, left: ExprRef) -> ExprRef { + self.trace("infix"); let kind = self.previous.as_ref().unwrap().kind(); match kind { TokenKind::Plus | TokenKind::Minus | TokenKind::Star | TokenKind::Slash => { @@ -245,7 +248,7 @@ impl<'a> Parser<'a> { } fn grouping(&mut self) -> ExprRef { - let result = self.number(); + let result = self.expression(); self.consume( Some(TokenKind::RightParen), "expected ')' after an expression", @@ -264,7 +267,6 @@ impl<'a> Parser<'a> { } fn binary(&mut self, power: u8, left: ExprRef) -> ExprRef { - let right = self.expression_with_power(power + 1); let op = match self.previous.as_ref().unwrap().kind() { TokenKind::Plus => BinaryOp::Add, TokenKind::Minus => BinaryOp::Subtract, @@ -274,6 +276,7 @@ impl<'a> Parser<'a> { TokenKind::Or => BinaryOp::Or, _ => panic!("unsuitable binary: {:?}: no op", self.previous), }; + let right = self.expression_with_power(power + 1); self.tree.add_expr(Expr::Binary(op, left, right)) } @@ -343,6 +346,24 @@ impl<'a> Parser<'a> { self.tree .add_error(SyntaxError::new(line, column, final_message)); } + + fn trace(&self, _msg: &str) { + // let cpos = self.tokens.token_position(&self.current); + // let ppos = self.tokens.token_position(&self.previous); + + // eprintln!( + // "[{}:{}:{}] [{}:{}:{}]: {msg}", + // ppos.0, + // ppos.1, + // self.previous + // .as_ref() + // .map(|t| t.as_str()) + // .unwrap_or(""), + // cpos.0, + // cpos.1, + // self.current.as_ref().map(|t| t.as_str()).unwrap_or("") + // ); + } } #[cfg(test)] @@ -350,11 +371,27 @@ mod tests { use super::*; use pretty_assertions::assert_eq; - #[test] - pub fn number_expressions() { - // How am I going to test this? - let (tree, expr) = Parser::new("23.5").parse(); - assert_eq!(Vec::::new(), tree.errors); - assert_eq!("23.5", tree.dump_expr(&expr)); + fn test_successful_expression_parse(source: &str, expected: &str) { + let (tree, expr) = Parser::new(source).parse(); + assert_eq!( + Vec::::new(), + tree.errors, + "Expected successful parse" + ); + assert_eq!(expected, tree.dump_expr(&expr)); } + + macro_rules! test_expr { + ($name:ident, $input:expr, $expected:expr) => { + #[test] + fn $name() { + test_successful_expression_parse($input, $expected); + } + }; + } + + test_expr!(number_expr, "12", "12"); + test_expr!(add_expr, "1 + 2", "(+ 1 2)"); + test_expr!(prec_expr, "1 + 2 * 3 - 7 * 7", "(- (+ 1 (* 2 3)) (* 7 7))"); + test_expr!(unary, "-((23)) * 5", "(* (- 23) 5)"); } diff --git a/oden-script/src/tokens.rs b/oden-script/src/tokens.rs index d0df5685..2a09336e 100644 --- a/oden-script/src/tokens.rs +++ b/oden-script/src/tokens.rs @@ -391,20 +391,8 @@ impl<'a> std::iter::Iterator for Tokens<'a> { ')' => self.token(pos, TokenKind::RightParen), ',' => self.token(pos, TokenKind::Comma), '.' => self.token(pos, TokenKind::Dot), - '-' => { - if self.matches_next(|c| c.is_ascii_digit()) { - self.number(pos) - } else { - self.token(pos, TokenKind::Minus) - } - } - '+' => { - if self.matches_next(|c| c.is_ascii_digit()) { - self.number(pos) - } else { - self.token(pos, TokenKind::Plus) - } - } + '-' => self.token(pos, TokenKind::Minus), + '+' => self.token(pos, TokenKind::Plus), ';' => self.token(pos, TokenKind::Semicolon), '/' => self.token(pos, TokenKind::Slash), '*' => self.token(pos, TokenKind::Star), From 2d233244cf19196f2b8e21b20d4b9161d860cc58 Mon Sep 17 00:00:00 2001 From: John Doty Date: Mon, 1 Jan 2024 08:56:25 -0800 Subject: [PATCH 4/9] [fine] Tracking source locations --- oden-script/src/parser.rs | 57 ++++++++++++++++++++++++++++++--------- 1 file changed, 44 insertions(+), 13 deletions(-) diff --git a/oden-script/src/parser.rs b/oden-script/src/parser.rs index e05c9a84..262164f7 100644 --- a/oden-script/src/parser.rs +++ b/oden-script/src/parser.rs @@ -61,16 +61,18 @@ impl ExprRef { } } -pub struct SyntaxTree { +pub struct SyntaxTree<'a> { pub errors: Vec, expressions: Vec, + spans: Vec<(Option>, Option>)>, } -impl SyntaxTree { +impl<'a> SyntaxTree<'a> { pub fn new() -> Self { SyntaxTree { errors: Vec::new(), expressions: Vec::new(), + spans: Vec::new(), } } @@ -78,12 +80,27 @@ impl SyntaxTree { self.errors.push(error); } - pub fn add_expr(&mut self, expr: Expr) -> ExprRef { + pub fn add_expr( + &mut self, + expr: Expr, + start: Option>, + end: Option>, + ) -> ExprRef { let index = self.expressions.len(); self.expressions.push(expr); + self.spans.push((start, end)); ExprRef(Some(index)) } + pub fn span(&self, expr: &ExprRef) -> (Option>, Option>) { + if let ExprRef(Some(idx)) = expr { + let (start, end) = &self.spans[*idx]; + (start.clone(), end.clone()) + } else { + (None, None) + } + } + pub fn dump_expr(&self, expr: &ExprRef) -> String { match expr.0 { Some(idx) => { @@ -153,7 +170,7 @@ fn token_power<'a>(token: &Option>) -> Option { pub struct Parser<'a> { tokens: Tokens<'a>, - tree: SyntaxTree, + tree: SyntaxTree<'a>, current: Option>, previous: Option>, @@ -173,7 +190,7 @@ impl<'a> Parser<'a> { parser } - pub fn parse(mut self) -> (SyntaxTree, ExprRef) { + pub fn parse(mut self) -> (SyntaxTree<'a>, ExprRef) { let expr = self.expression(); self.consume(None, "expected end of expression"); (self.tree, expr) @@ -238,13 +255,19 @@ impl<'a> Parser<'a> { let token = self.previous.as_ref().unwrap(); // What kind is it? For now let's just ... make it good. - match token.as_str().parse::() { - Ok(v) => self.tree.add_expr(Expr::Literal(Literal::Float64(v))), + let literal = match token.as_str().parse::() { + Ok(v) => Literal::Float64(v), Err(e) => { self.error(format!("invalid f64: {e}")); - ExprRef::error() + return ExprRef::error(); } - } + }; + + self.tree.add_expr( + Expr::Literal(literal), + Some(token.clone()), + Some(token.clone()), + ) } fn grouping(&mut self) -> ExprRef { @@ -257,17 +280,21 @@ impl<'a> Parser<'a> { } fn unary(&mut self) -> ExprRef { - let kind = self.previous.as_ref().unwrap().kind(); + let token = self.previous.as_ref().unwrap().clone(); + let kind = token.kind(); let expr = self.expression_with_power(UNARY_POWER); let op = match kind { TokenKind::Minus => UnaryOp::Negate, _ => panic!("unsuitable unary: {:?}: no op", kind), }; - self.tree.add_expr(Expr::Unary(op, expr)) + + self.tree + .add_expr(Expr::Unary(op, expr), Some(token), self.previous.clone()) } fn binary(&mut self, power: u8, left: ExprRef) -> ExprRef { - let op = match self.previous.as_ref().unwrap().kind() { + let token = self.previous.as_ref().unwrap().clone(); + let op = match token.kind() { TokenKind::Plus => BinaryOp::Add, TokenKind::Minus => BinaryOp::Subtract, TokenKind::Star => BinaryOp::Mutiply, @@ -278,7 +305,11 @@ impl<'a> Parser<'a> { }; let right = self.expression_with_power(power + 1); - self.tree.add_expr(Expr::Binary(op, left, right)) + let (left_start, _) = self.tree.span(&left); + let (_, right_end) = self.tree.span(&right); + + self.tree + .add_expr(Expr::Binary(op, left, right), left_start, right_end) } fn advance(&mut self) { From 81a7b095555d5cf549848083a030067ce9b93b15 Mon Sep 17 00:00:00 2001 From: John Doty Date: Mon, 1 Jan 2024 09:05:13 -0800 Subject: [PATCH 5/9] [fine] Different source locations --- oden-script/src/parser.rs | 70 +++++++++------------------------------ oden-script/src/tokens.rs | 6 ++-- 2 files changed, 18 insertions(+), 58 deletions(-) diff --git a/oden-script/src/parser.rs b/oden-script/src/parser.rs index 262164f7..ce8c6bb7 100644 --- a/oden-script/src/parser.rs +++ b/oden-script/src/parser.rs @@ -47,10 +47,10 @@ pub enum BinaryOp { Or, } -pub enum Expr { - Literal(Literal), - Unary(UnaryOp, ExprRef), - Binary(BinaryOp, ExprRef, ExprRef), +pub enum Expr<'a> { + Literal(Literal, Token<'a>), + Unary(UnaryOp, Token<'a>, ExprRef), + Binary(BinaryOp, Token<'a>, ExprRef, ExprRef), } pub struct ExprRef(Option); @@ -63,8 +63,7 @@ impl ExprRef { pub struct SyntaxTree<'a> { pub errors: Vec, - expressions: Vec, - spans: Vec<(Option>, Option>)>, + expressions: Vec>, } impl<'a> SyntaxTree<'a> { @@ -72,7 +71,6 @@ impl<'a> SyntaxTree<'a> { SyntaxTree { errors: Vec::new(), expressions: Vec::new(), - spans: Vec::new(), } } @@ -80,51 +78,23 @@ impl<'a> SyntaxTree<'a> { self.errors.push(error); } - pub fn add_expr( - &mut self, - expr: Expr, - start: Option>, - end: Option>, - ) -> ExprRef { + pub fn add_expr(&mut self, expr: Expr<'a>) -> ExprRef { let index = self.expressions.len(); self.expressions.push(expr); - self.spans.push((start, end)); ExprRef(Some(index)) } - pub fn span(&self, expr: &ExprRef) -> (Option>, Option>) { - if let ExprRef(Some(idx)) = expr { - let (start, end) = &self.spans[*idx]; - (start.clone(), end.clone()) - } else { - (None, None) - } - } - pub fn dump_expr(&self, expr: &ExprRef) -> String { match expr.0 { Some(idx) => { let expr = &self.expressions[idx]; match expr { - Expr::Literal(lit) => match lit { - Literal::Float64(f) => f.to_string(), - }, - Expr::Unary(op, e) => { - let op = match op { - UnaryOp::Negate => "-", - }; - format!("({op} {})", self.dump_expr(e)) + Expr::Literal(_, tok) => tok.to_string(), + Expr::Unary(_, tok, e) => { + format!("({tok} {})", self.dump_expr(e)) } - Expr::Binary(op, l, r) => { - let op = match op { - BinaryOp::Add => "+", - BinaryOp::Subtract => "-", - BinaryOp::Mutiply => "*", - BinaryOp::Divide => "/", - BinaryOp::And => "and", - BinaryOp::Or => "or", - }; - format!("({op} {} {})", self.dump_expr(l), self.dump_expr(r)) + Expr::Binary(_, tok, l, r) => { + format!("({tok} {} {})", self.dump_expr(l), self.dump_expr(r)) } } } @@ -263,11 +233,7 @@ impl<'a> Parser<'a> { } }; - self.tree.add_expr( - Expr::Literal(literal), - Some(token.clone()), - Some(token.clone()), - ) + self.tree.add_expr(Expr::Literal(literal, token.clone())) } fn grouping(&mut self) -> ExprRef { @@ -288,8 +254,7 @@ impl<'a> Parser<'a> { _ => panic!("unsuitable unary: {:?}: no op", kind), }; - self.tree - .add_expr(Expr::Unary(op, expr), Some(token), self.previous.clone()) + self.tree.add_expr(Expr::Unary(op, token, expr)) } fn binary(&mut self, power: u8, left: ExprRef) -> ExprRef { @@ -304,12 +269,7 @@ impl<'a> Parser<'a> { _ => panic!("unsuitable binary: {:?}: no op", self.previous), }; let right = self.expression_with_power(power + 1); - - let (left_start, _) = self.tree.span(&left); - let (_, right_end) = self.tree.span(&right); - - self.tree - .add_expr(Expr::Binary(op, left, right), left_start, right_end) + self.tree.add_expr(Expr::Binary(op, token, left, right)) } fn advance(&mut self) { @@ -318,7 +278,7 @@ impl<'a> Parser<'a> { self.current = self.tokens.next(); match &self.current { Some(token) if token.kind() == TokenKind::Error => { - self.error_at_current(token.clone()) + self.error_at_current(token.to_string()) } _ => break, } diff --git a/oden-script/src/tokens.rs b/oden-script/src/tokens.rs index 2a09336e..c989dab9 100644 --- a/oden-script/src/tokens.rs +++ b/oden-script/src/tokens.rs @@ -89,9 +89,9 @@ impl<'a> Token<'a> { } } -impl<'a> Into for Token<'a> { - fn into(self) -> String { - self.as_str().to_string() +impl<'a> std::fmt::Display for Token<'a> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.as_str()) } } From 633ce898173f4e82ca907ac996e5041f0d634b23 Mon Sep 17 00:00:00 2001 From: John Doty Date: Mon, 1 Jan 2024 09:18:42 -0800 Subject: [PATCH 6/9] [fine] Strings --- oden-script/src/parser.rs | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/oden-script/src/parser.rs b/oden-script/src/parser.rs index ce8c6bb7..bcbf0823 100644 --- a/oden-script/src/parser.rs +++ b/oden-script/src/parser.rs @@ -32,6 +32,7 @@ impl fmt::Display for SyntaxError { pub enum Literal { Float64(f64), + String(String), } pub enum UnaryOp { @@ -198,6 +199,7 @@ impl<'a> Parser<'a> { TokenKind::LeftParen => self.grouping(), TokenKind::Number => self.number(), TokenKind::Minus => self.unary(), + TokenKind::String => self.string(), _ => { self.error("expected an expression"); ExprRef::error() @@ -236,6 +238,30 @@ impl<'a> Parser<'a> { self.tree.add_expr(Expr::Literal(literal, token.clone())) } + fn string(&mut self) -> ExprRef { + let token = self.previous.as_ref().unwrap(); + + let mut result = String::new(); + let mut input = token.as_str().chars(); + + assert!(input.next().is_some()); // Delimiter + while let Some(ch) = input.next() { + match ch { + '\\' => match input.next().unwrap() { + 'n' => result.push('\n'), + 'r' => result.push('\r'), + 't' => result.push('\t'), + ch => result.push(ch), + }, + _ => result.push(ch), + } + } + result.pop(); // We pushed the other delimiter on, whoops. + + let literal = Literal::String(result); + self.tree.add_expr(Expr::Literal(literal, token.clone())) + } + fn grouping(&mut self) -> ExprRef { let result = self.expression(); self.consume( @@ -385,4 +411,9 @@ mod tests { test_expr!(add_expr, "1 + 2", "(+ 1 2)"); test_expr!(prec_expr, "1 + 2 * 3 - 7 * 7", "(- (+ 1 (* 2 3)) (* 7 7))"); test_expr!(unary, "-((23)) * 5", "(* (- 23) 5)"); + test_expr!( + strings, + r#" "Hello " + "world!" "#, + r#"(+ "Hello " "world!")"# + ); } From cc6f77daf4f9addab1ef9c04c747b3fc3b215a95 Mon Sep 17 00:00:00 2001 From: John Doty Date: Tue, 2 Jan 2024 09:29:52 -0800 Subject: [PATCH 7/9] [fine] Type checking --- oden-script/src/parser.rs | 227 ++++++++++++++++++++++++++++++++++---- oden-script/src/tokens.rs | 73 +++++++++--- 2 files changed, 264 insertions(+), 36 deletions(-) diff --git a/oden-script/src/parser.rs b/oden-script/src/parser.rs index bcbf0823..2753a872 100644 --- a/oden-script/src/parser.rs +++ b/oden-script/src/parser.rs @@ -1,4 +1,4 @@ -use crate::tokens::{Token, TokenKind, Tokens}; +use crate::tokens::{Lines, Token, TokenKind, Tokens}; use std::fmt; #[derive(PartialEq, Eq)] @@ -30,30 +30,37 @@ impl fmt::Display for SyntaxError { } } +#[derive(Clone)] pub enum Literal { Float64(f64), String(String), + Bool(bool), } +#[derive(Copy, Clone)] pub enum UnaryOp { Negate, + Not, } +#[derive(Copy, Clone)] pub enum BinaryOp { Add, Subtract, - Mutiply, + Multiply, Divide, And, Or, } +#[derive(Clone)] pub enum Expr<'a> { Literal(Literal, Token<'a>), Unary(UnaryOp, Token<'a>, ExprRef), Binary(BinaryOp, Token<'a>, ExprRef, ExprRef), } +#[derive(Clone)] pub struct ExprRef(Option); impl ExprRef { @@ -62,6 +69,39 @@ impl ExprRef { } } +// TODO: Eventually we will be unable to use Eq and PartialEq here, and will +// need to do out own thing. +#[derive(Clone, Eq, PartialEq)] +pub enum Type { + Error, + + // TODO: Numeric literals should be implicitly convertable unlike other + // types. + F64, + String, + Bool, +} + +impl std::fmt::Debug for Type { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{self}") + } +} + +impl std::fmt::Display for Type { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + use Type::*; + match self { + Error => write!(f, "<< INTERNAL ERROR >>"), + F64 => write!(f, "f64"), + String => write!(f, "string"), + Bool => write!(f, "bool"), + } + } +} + +pub struct TypeRef(Option); + pub struct SyntaxTree<'a> { pub errors: Vec, expressions: Vec>, @@ -102,6 +142,76 @@ impl<'a> SyntaxTree<'a> { None => "<|EOF|>".to_string(), } } + + pub fn expr_type(&mut self, expr: &ExprRef, lines: &Lines) -> Type { + // TODO: Cache and work on demand? Or is this just fine? + + let expr = match expr.0 { + Some(idx) => &self.expressions[idx], + None => return Type::Error, + }; + match expr { + Expr::Literal(lit, _) => match lit { + Literal::Float64(_) => Type::F64, + Literal::String(_) => Type::String, + Literal::Bool(_) => Type::Bool, + }, + + // Figure out the main thing. Check for a... trait? + Expr::Unary(op, tok, arg) => { + let op = op.clone(); + let arg = arg.clone(); + let tok = tok.clone(); + let arg_type = self.expr_type(&arg, lines); + match (op, arg_type) { + (UnaryOp::Negate, Type::F64) => Type::F64, + (UnaryOp::Not, Type::Bool) => Type::Bool, + + // Propagate existing errors without additional complaint. + (_, Type::Error) => Type::Error, + + // Missed the whole table, must be an error. + (_, arg_type) => { + let (line, col) = lines.position(tok.start()); + self.errors.push(SyntaxError::new(line, col, format!("cannot apply unary operator '{tok}' to expression of type '{arg_type}'"))); + Type::Error + } + } + } + + Expr::Binary(op, tok, left, right) => { + let op = op.clone(); + let tok = tok.clone(); + let left = left.clone(); + let right = right.clone(); + let left_type = self.expr_type(&left, lines); + let right_type = self.expr_type(&right, lines); + + match (op, left_type, right_type) { + ( + BinaryOp::Add | BinaryOp::Subtract | BinaryOp::Multiply | BinaryOp::Divide, + Type::F64, + Type::F64, + ) => Type::F64, + + (BinaryOp::Add, Type::String, Type::String) => Type::String, + + (BinaryOp::And | BinaryOp::Or, Type::Bool, Type::Bool) => Type::Bool, + + // Propagate existing errors without additional complaint. + (_, Type::Error, _) => Type::Error, + (_, _, Type::Error) => Type::Error, + + // Missed the whole table, it must be an error. + (_, left_type, right_type) => { + let (line, col) = lines.position(tok.start()); + self.errors.push(SyntaxError::new(line, col, format!("cannot apply binary operator '{tok}' to expressions of type '{left_type}' (on the left) and '{right_type}' (on the right)"))); + Type::Error + } + } + } + } + } } // BINDING POWERS. When parsing expressions we only accept expressions that @@ -161,10 +271,10 @@ impl<'a> Parser<'a> { parser } - pub fn parse(mut self) -> (SyntaxTree<'a>, ExprRef) { + pub fn parse(mut self) -> (SyntaxTree<'a>, ExprRef, Lines) { let expr = self.expression(); self.consume(None, "expected end of expression"); - (self.tree, expr) + (self.tree, expr, self.tokens.lines()) } fn expression(&mut self) -> ExprRef { @@ -196,10 +306,19 @@ impl<'a> Parser<'a> { let token = self.previous.as_ref(); match token { Some(token) => match token.kind() { + TokenKind::Bang => self.unary(), TokenKind::LeftParen => self.grouping(), TokenKind::Number => self.number(), TokenKind::Minus => self.unary(), TokenKind::String => self.string(), + + TokenKind::True => self + .tree + .add_expr(Expr::Literal(Literal::Bool(true), token.clone())), + TokenKind::False => self + .tree + .add_expr(Expr::Literal(Literal::Bool(false), token.clone())), + _ => { self.error("expected an expression"); ExprRef::error() @@ -216,9 +335,12 @@ impl<'a> Parser<'a> { self.trace("infix"); let kind = self.previous.as_ref().unwrap().kind(); match kind { - TokenKind::Plus | TokenKind::Minus | TokenKind::Star | TokenKind::Slash => { - self.binary(power, left) - } + TokenKind::Plus + | TokenKind::Minus + | TokenKind::Star + | TokenKind::Slash + | TokenKind::And + | TokenKind::Or => self.binary(power, left), _ => panic!("Unknown infix operator, dispatch error?"), } } @@ -277,6 +399,7 @@ impl<'a> Parser<'a> { let expr = self.expression_with_power(UNARY_POWER); let op = match kind { TokenKind::Minus => UnaryOp::Negate, + TokenKind::Bang => UnaryOp::Not, _ => panic!("unsuitable unary: {:?}: no op", kind), }; @@ -288,7 +411,7 @@ impl<'a> Parser<'a> { let op = match token.kind() { TokenKind::Plus => BinaryOp::Add, TokenKind::Minus => BinaryOp::Subtract, - TokenKind::Star => BinaryOp::Mutiply, + TokenKind::Star => BinaryOp::Multiply, TokenKind::Slash => BinaryOp::Divide, TokenKind::And => BinaryOp::And, TokenKind::Or => BinaryOp::Or, @@ -388,32 +511,98 @@ mod tests { use super::*; use pretty_assertions::assert_eq; - fn test_successful_expression_parse(source: &str, expected: &str) { - let (tree, expr) = Parser::new(source).parse(); + fn test_successful_expression_parse(source: &str, expected: &str, expected_type: Type) { + let (mut tree, expr, lines) = Parser::new(source).parse(); assert_eq!( Vec::::new(), tree.errors, "Expected successful parse" ); - assert_eq!(expected, tree.dump_expr(&expr)); + assert_eq!( + expected, + tree.dump_expr(&expr), + "The parse structure of the expressions did not match" + ); + + // TODO: 'assert_eq' is probably wrong here + let expr_type = tree.expr_type(&expr, &lines); + assert_eq!( + expected_type, expr_type, + "The type of the expression did not match" + ); } macro_rules! test_expr { - ($name:ident, $input:expr, $expected:expr) => { + ($name:ident, $input:expr, $expected:expr, $type:expr) => { #[test] fn $name() { - test_successful_expression_parse($input, $expected); + test_successful_expression_parse($input, $expected, $type); } }; } - test_expr!(number_expr, "12", "12"); - test_expr!(add_expr, "1 + 2", "(+ 1 2)"); - test_expr!(prec_expr, "1 + 2 * 3 - 7 * 7", "(- (+ 1 (* 2 3)) (* 7 7))"); - test_expr!(unary, "-((23)) * 5", "(* (- 23) 5)"); + test_expr!(number_expr, "12", "12", Type::F64); + test_expr!(add_expr, "1 + 2", "(+ 1 2)", Type::F64); + test_expr!( + prec_expr, + "1 + 2 * 3 - 7 * 7", + "(- (+ 1 (* 2 3)) (* 7 7))", + Type::F64 + ); + test_expr!(unary, "-((23)) * 5", "(* (- 23) 5)", Type::F64); test_expr!( strings, - r#" "Hello " + "world!" "#, - r#"(+ "Hello " "world!")"# + r#" "Hello " + 'world!' "#, + r#"(+ "Hello " 'world!')"#, + Type::String + ); + + test_expr!( + booleans, + "true and false or false and !true", + "(or (and true false) (and false (! true)))", + Type::Bool + ); + + fn test_type_error_expression(source: &str, expected_errors: Vec<&str>) { + let (mut tree, expr, lines) = Parser::new(source).parse(); + assert_eq!( + Vec::::new(), + tree.errors, + "Expected successful parse" + ); + + let expr_type = tree.expr_type(&expr, &lines); + assert_eq!(Type::Error, expr_type, "expected to have a type error"); + + let actual_errors = tree + .errors + .iter() + .map(|e| e.message.as_str()) + .collect::>(); + assert_eq!(expected_errors, actual_errors); + } + + macro_rules! test_type_error_expr { + ($name:ident, $input:expr, $($s:expr),+) => { + #[test] + fn $name() { + let expected_errors: Vec<&str> = (vec![$($s),*]); + test_type_error_expression($input, expected_errors); + } + } + } + + test_type_error_expr!( + negate_string, + "-('what?')", + "cannot apply unary operator '-' to expression of type 'string'" + ); + + test_type_error_expr!( + errors_propagate_do_not_duplicate, + "!'hello' / 27 * -('what?') + 23", + "cannot apply unary operator '!' to expression of type 'string'", + "cannot apply unary operator '-' to expression of type 'string'" ); } diff --git a/oden-script/src/tokens.rs b/oden-script/src/tokens.rs index c989dab9..c2bccfb9 100644 --- a/oden-script/src/tokens.rs +++ b/oden-script/src/tokens.rs @@ -74,6 +74,10 @@ impl<'a> Token<'a> { } } + pub fn start(&self) -> usize { + self.start + } + pub fn kind(&self) -> TokenKind { self.kind } @@ -95,23 +99,22 @@ impl<'a> std::fmt::Display for Token<'a> { } } -pub struct Tokens<'a> { - source: &'a str, - chars: std::str::CharIndices<'a>, - next_char: Option<(usize, char)>, +pub struct Lines { newlines: Vec, + eof: usize, } -impl<'a> Tokens<'a> { - pub fn new(source: &'a str) -> Self { - let mut result = Tokens { - source, - chars: source.char_indices(), - next_char: None, +impl Lines { + fn new(eof: usize) -> Self { + Lines { newlines: Vec::new(), - }; - result.advance(); // Prime the pump - result + eof, + } + } + + /// Record the position of a newline in the source. + pub fn add_line(&mut self, pos: usize) { + self.newlines.push(pos) } /// Return the position of the given token as a (line, column) pair. By @@ -122,9 +125,15 @@ impl<'a> Tokens<'a> { pub fn token_position(&self, token: &Option) -> (usize, usize) { let start = match token { Some(t) => t.start, - None => self.source.len(), + None => self.eof, }; - let line_end_index = match self.newlines.binary_search(&start) { + self.position(start) + } + + /// Return the position of the given character offset as a (line,column) + /// pair. By convention, lines are 1-based and columns are 0-based. + pub fn position(&self, offset: usize) -> (usize, usize) { + let line_end_index = match self.newlines.binary_search(&offset) { Ok(index) => index, Err(index) => index, }; @@ -134,9 +143,39 @@ impl<'a> Tokens<'a> { self.newlines[line_end_index - 1] + 1 }; let line_number = line_end_index + 1; - let column_offset = start - line_start_pos; + let column_offset = offset - line_start_pos; (line_number, column_offset) } +} + +pub struct Tokens<'a> { + source: &'a str, + chars: std::str::CharIndices<'a>, + next_char: Option<(usize, char)>, + lines: Lines, +} + +impl<'a> Tokens<'a> { + pub fn new(source: &'a str) -> Self { + let mut result = Tokens { + source, + chars: source.char_indices(), + next_char: None, + lines: Lines::new(source.len()), + }; + result.advance(); // Prime the pump + result + } + + pub fn lines(self) -> Lines { + self.lines + } + + /// Return the position of the given token as a (line, column) pair. See + /// `Lines::token_position` for more information about the range, etc. + pub fn token_position(&self, token: &Option) -> (usize, usize) { + self.lines.token_position(token) + } fn token(&self, start: usize, kind: TokenKind) -> Token<'a> { let value = &self.source[start..self.pos()]; @@ -363,7 +402,7 @@ impl<'a> Tokens<'a> { fn skip_whitespace(&mut self) { while let Some((pos, ch)) = self.next_char { if ch == '\n' { - self.newlines.push(pos); + self.lines.add_line(pos); } else if !ch.is_whitespace() { break; } From dd6b673615dc9c7cc8e093e411131667d3ea250e Mon Sep 17 00:00:00 2001 From: John Doty Date: Tue, 2 Jan 2024 09:35:48 -0800 Subject: [PATCH 8/9] [fine] A few more tests --- oden-script/src/parser.rs | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/oden-script/src/parser.rs b/oden-script/src/parser.rs index 2753a872..bc98d520 100644 --- a/oden-script/src/parser.rs +++ b/oden-script/src/parser.rs @@ -599,6 +599,18 @@ mod tests { "cannot apply unary operator '-' to expression of type 'string'" ); + test_type_error_expr!( + add_string_number, + "'what?' + 5", + "cannot apply binary operator '+' to expressions of type 'string' (on the left) and 'f64' (on the right)" + ); + + test_type_error_expr!( + add_number_string, + "5 + 'what?'", + "cannot apply binary operator '+' to expressions of type 'f64' (on the left) and 'string' (on the right)" + ); + test_type_error_expr!( errors_propagate_do_not_duplicate, "!'hello' / 27 * -('what?') + 23", From 652fe18f57fde7d011b7609d77ea877bcc44481e Mon Sep 17 00:00:00 2001 From: John Doty Date: Tue, 2 Jan 2024 13:49:31 -0800 Subject: [PATCH 9/9] [fine] More type checking; if and whatnot --- oden-script/src/parser.rs | 328 +++++++++++++++++++++++++++++++++++--- 1 file changed, 307 insertions(+), 21 deletions(-) diff --git a/oden-script/src/parser.rs b/oden-script/src/parser.rs index bc98d520..a962dcd3 100644 --- a/oden-script/src/parser.rs +++ b/oden-script/src/parser.rs @@ -1,32 +1,54 @@ use crate::tokens::{Lines, Token, TokenKind, Tokens}; use std::fmt; +// TODO: An error should have: +// +// - a start +// - an end +// - a focus +// - descriptive messages +// +// that will have to wait for now #[derive(PartialEq, Eq)] pub struct SyntaxError { - pub line: usize, - pub column: usize, + pub start: (usize, usize), + pub end: (usize, usize), pub message: String, } impl SyntaxError { - pub fn new(line: usize, column: usize, message: String) -> Self { + pub fn new(line: usize, column: usize, message: T) -> Self + where + T: ToString, + { SyntaxError { - line, - column, - message, + start: (line, column), + end: (line, column), + message: message.to_string(), + } + } + + pub fn new_spanned(start: (usize, usize), end: (usize, usize), message: T) -> Self + where + T: ToString, + { + SyntaxError { + start, + end, + message: message.to_string(), } } } impl fmt::Debug for SyntaxError { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "{}:{}: {}", self.line, self.column, self.message) + write!(f, "{self}") } } impl fmt::Display for SyntaxError { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "{}:{}: {}", self.line, self.column, self.message) + write!(f, "{}:{}: {}", self.start.0, self.end.0, self.message) } } @@ -58,6 +80,7 @@ pub enum Expr<'a> { Literal(Literal, Token<'a>), Unary(UnaryOp, Token<'a>, ExprRef), Binary(BinaryOp, Token<'a>, ExprRef, ExprRef), + Conditional(Token<'a>, ExprRef, ExprRef, Option, Token<'a>), } #[derive(Clone)] @@ -71,17 +94,50 @@ impl ExprRef { // TODO: Eventually we will be unable to use Eq and PartialEq here, and will // need to do out own thing. -#[derive(Clone, Eq, PartialEq)] +#[derive(Copy, Clone)] pub enum Type { + // Signals a type error. If you receive this then you know that an error + // has already been reported; if you produce this be sure to also note + // the error in the errors collection. Error, - // TODO: Numeric literals should be implicitly convertable unlike other - // types. + // Signals that the expression has a control-flow side-effect and that no + // value will ever result from this expression. Usually this means + // everything's fine. + Unreachable, + + // TODO: Numeric literals should be implicitly convertable, unlike other + // types. Maybe just "numeric literal" type? F64, String, Bool, } +impl Type { + pub fn is_error(&self) -> bool { + match self { + Type::Error => true, + _ => false, + } + } + + pub fn compatible_with(&self, other: &Type) -> bool { + // TODO: This is wrong; we because of numeric literals etc. + match (self, other) { + (Type::F64, Type::F64) => true, + (Type::String, Type::String) => true, + (Type::Bool, Type::Bool) => true, + (Type::Unreachable, Type::Unreachable) => true, + + // Avoid introducing more errors + (Type::Error, _) => true, + (_, Type::Error) => true, + + (_, _) => false, + } + } +} + impl std::fmt::Debug for Type { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "{self}") @@ -93,6 +149,7 @@ impl std::fmt::Display for Type { use Type::*; match self { Error => write!(f, "<< INTERNAL ERROR >>"), + Unreachable => write!(f, "<< UNREACHABLE >>"), F64 => write!(f, "f64"), String => write!(f, "string"), Bool => write!(f, "bool"), @@ -137,15 +194,56 @@ impl<'a> SyntaxTree<'a> { Expr::Binary(_, tok, l, r) => { format!("({tok} {} {})", self.dump_expr(l), self.dump_expr(r)) } + Expr::Conditional(tok, cond, t, e, _) => { + if let Some(e) = e { + format!( + "({tok} {} {} {})", + self.dump_expr(cond), + self.dump_expr(t), + self.dump_expr(e) + ) + } else { + format!("({tok} {} {})", self.dump_expr(cond), self.dump_expr(t)) + } + } } } None => "<|EOF|>".to_string(), } } - pub fn expr_type(&mut self, expr: &ExprRef, lines: &Lines) -> Type { + pub fn expr_span(&self, expr: &ExprRef) -> Option<(Token<'a>, Token<'a>)> { + let expr = match expr.0 { + Some(idx) => &self.expressions[idx], + None => return None, + }; + + match expr { + Expr::Literal(_, tok) => Some((tok.clone(), tok.clone())), + Expr::Unary(_, tok, arg) => { + let arg = self.expr_span(arg); + match arg { + None => None, + Some((_, end)) => Some((tok.clone(), end)), + } + } + Expr::Binary(_, _, left, right) => { + let left = self.expr_span(left); + let right = self.expr_span(right); + match (left, right) { + (None, _) => None, + (_, None) => None, + (Some((start, _)), Some((_, end))) => Some((start, end)), + } + } + Expr::Conditional(head, _, _, _, tail) => Some((head.clone(), tail.clone())), + } + } + + pub fn expr_type(&mut self, expr: &ExprRef, lines: &Lines, value_required: bool) -> Type { // TODO: Cache and work on demand? Or is this just fine? + let exr = expr.clone(); let expr = match expr.0 { Some(idx) => &self.expressions[idx], None => return Type::Error, @@ -162,11 +260,18 @@ impl<'a> SyntaxTree<'a> { let op = op.clone(); let arg = arg.clone(); let tok = tok.clone(); - let arg_type = self.expr_type(&arg, lines); + let arg_type = self.expr_type(&arg, lines, true); match (op, arg_type) { (UnaryOp::Negate, Type::F64) => Type::F64, (UnaryOp::Not, Type::Bool) => Type::Bool, + // This is dumb and should be punished, probably. + (_, Type::Unreachable) => { + let (line, col) = lines.position(tok.start()); + self.errors.push(SyntaxError::new(line, col, format!("cannot apply a unary operator to something that doesn't yield a value"))); + Type::Error + } + // Propagate existing errors without additional complaint. (_, Type::Error) => Type::Error, @@ -184,8 +289,8 @@ impl<'a> SyntaxTree<'a> { let tok = tok.clone(); let left = left.clone(); let right = right.clone(); - let left_type = self.expr_type(&left, lines); - let right_type = self.expr_type(&right, lines); + let left_type = self.expr_type(&left, lines, true); + let right_type = self.expr_type(&right, lines, true); match (op, left_type, right_type) { ( @@ -198,6 +303,30 @@ impl<'a> SyntaxTree<'a> { (BinaryOp::And | BinaryOp::Or, Type::Bool, Type::Bool) => Type::Bool, + // This is dumb and should be punished, probably. + (_, _, Type::Unreachable) => { + let (line, col) = lines.position(tok.start()); + self.errors.push(SyntaxError::new( + line, + col, + format!( + "cannot apply '{tok}' to an argument that doesn't yield a value (on the right)" + ), + )); + Type::Error + } + (_, Type::Unreachable, _) => { + let (line, col) = lines.position(tok.start()); + self.errors.push(SyntaxError::new( + line, + col, + format!( + "cannot apply '{tok}' to an argument that doesn't yield a value (on the left)" + ), + )); + Type::Error + } + // Propagate existing errors without additional complaint. (_, Type::Error, _) => Type::Error, (_, _, Type::Error) => Type::Error, @@ -210,6 +339,79 @@ impl<'a> SyntaxTree<'a> { } } } + + Expr::Conditional(_, cond, then_exp, else_exp, _) => { + let cond = cond.clone(); + let then_exp = then_exp.clone(); + let else_exp = else_exp.clone(); + + let cond_type = self.expr_type(&cond, lines, true); + let then_type = self.expr_type(&then_exp, lines, value_required); + let else_type = else_exp.map(|e| self.expr_type(&e, lines, value_required)); + if !cond_type.compatible_with(&Type::Bool) { + if !cond_type.is_error() { + let span = self + .expr_span(&cond) + .expect("If the expression has a type it must have a span"); + + let start = lines.position(span.0.start()); + let end = lines.position(span.1.start()); + self.errors.push(SyntaxError::new_spanned( + start, + end, + "the condition of an `if` expression must be a boolean", + )); + } + return Type::Error; + } + + match (then_type, else_type) { + (Type::Error, _) => Type::Error, + (_, Some(Type::Error)) => Type::Error, + + // It's an error to have a missing else branch if the value is required + (_, None) if value_required => { + let span = self + .expr_span(&exr) + .expect("How did I get this far with a broken parse?"); + let start = lines.position(span.0.start()); + let end = lines.position(span.1.start()); + self.errors.push(SyntaxError::new_spanned( + start, + end, + "this `if` expression must have both a `then` clause and an `else` clause, so it can produce a value", + )); + Type::Error + } + + // If the value is required then the branches must be + // compatible, and the type of the expression is the type + // of the `then` branch. + (then_type, Some(else_type)) if value_required => { + if !then_type.compatible_with(&else_type) { + let span = self + .expr_span(&exr) + .expect("How did I get this far with a broken parse?"); + let start = lines.position(span.0.start()); + let end = lines.position(span.1.start()); + self.errors.push(SyntaxError::new_spanned( + start, + end, + format!("the type of the `then` branch ({then_type}) must match the type of the `else` branch ({else_type})"), + )); + Type::Error + } else { + then_type + } + } + + // The value must not be required, just mark this as unreachable. + (_, _) => { + assert!(!value_required); + Type::Unreachable + } + } + } } } } @@ -319,6 +521,8 @@ impl<'a> Parser<'a> { .tree .add_expr(Expr::Literal(Literal::Bool(false), token.clone())), + TokenKind::If => self.conditional(), + _ => { self.error("expected an expression"); ExprRef::error() @@ -393,6 +597,53 @@ impl<'a> Parser<'a> { result } + fn conditional(&mut self) -> ExprRef { + let token = self.previous.as_ref().unwrap().clone(); + let condition_expr = self.expression(); + self.consume( + Some(TokenKind::LeftBrace), + "expected '{' to start an 'if' block", + ); + let then_expr = self.expression(); + self.consume( + Some(TokenKind::RightBrace), + "expected '}' to end an 'if' block", + ); + let else_expr = match &self.current { + Some(token) if token.kind() == TokenKind::Else => { + self.advance(); + match &self.current { + // Allow `else if` without another `{`. + Some(token) if token.kind() == TokenKind::If => { + self.advance(); + Some(self.conditional()) + } + _ => { + self.consume( + Some(TokenKind::LeftBrace), + "expected '{' to start an 'else' block", + ); + let else_expr = self.expression(); + self.consume( + Some(TokenKind::RightBrace), + "Expected '}' to end an 'else' block", + ); + Some(else_expr) + } + } + } + _ => None, + }; + let tail = self.previous.as_ref().unwrap().clone(); + self.tree.add_expr(Expr::Conditional( + token, + condition_expr, + then_expr, + else_expr, + tail, + )) + } + fn unary(&mut self) -> ExprRef { let token = self.previous.as_ref().unwrap().clone(); let kind = token.kind(); @@ -525,10 +776,10 @@ mod tests { ); // TODO: 'assert_eq' is probably wrong here - let expr_type = tree.expr_type(&expr, &lines); - assert_eq!( - expected_type, expr_type, - "The type of the expression did not match" + let expr_type = tree.expr_type(&expr, &lines, true); + assert!( + expected_type.compatible_with(&expr_type), + "The type of the expression did not match. expected: {expected_type}, actual: {expr_type}" ); } @@ -564,6 +815,23 @@ mod tests { Type::Bool ); + test_expr!( + if_expression, + "if true { 23 } else { 45 }", + "(if true 23 45)", + Type::F64 + ); + // test_expr!( + // if_with_return, + // "if true { 23 } else { return 'nothing' }", + // "", + // Type::F64 + // ); + + // ======================================================================== + // Type Error Tests + // ======================================================================== + fn test_type_error_expression(source: &str, expected_errors: Vec<&str>) { let (mut tree, expr, lines) = Parser::new(source).parse(); assert_eq!( @@ -572,8 +840,8 @@ mod tests { "Expected successful parse" ); - let expr_type = tree.expr_type(&expr, &lines); - assert_eq!(Type::Error, expr_type, "expected to have a type error"); + let expr_type = tree.expr_type(&expr, &lines, true); + assert!(expr_type.is_error()); let actual_errors = tree .errors @@ -617,4 +885,22 @@ mod tests { "cannot apply unary operator '!' to expression of type 'string'", "cannot apply unary operator '-' to expression of type 'string'" ); + + test_type_error_expr!( + if_not_bool, + "if 23 { 1 } else { 2 }", + "the condition of an `if` expression must be a boolean" + ); + + test_type_error_expr!( + if_arm_mismatch, + "if true { 1 } else { '1' }", + "the type of the `then` branch (f64) must match the type of the `else` branch (string)" + ); + + test_type_error_expr!( + if_no_else, + "if true { 1 }", + "this `if` expression must have both a `then` clause and an `else` clause, so it can produce a value" + ); }