diff --git a/oden-script/Cargo.lock b/oden-script/Cargo.lock index 459de42c..8839cda3 100644 --- a/oden-script/Cargo.lock +++ b/oden-script/Cargo.lock @@ -2,6 +2,31 @@ # It is not intended for manual editing. version = 3 +[[package]] +name = "diff" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56254986775e3233ffa9c4d7d3faaf6d36a2c09d30b20687e9f88bc8bafc16c8" + [[package]] name = "oden-script" version = "0.1.0" +dependencies = [ + "pretty_assertions", +] + +[[package]] +name = "pretty_assertions" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af7cee1a6c8a5b9208b3cb1061f10c0cb689087b3d8ce85fb9d2dd7a29b6ba66" +dependencies = [ + "diff", + "yansi", +] + +[[package]] +name = "yansi" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09041cd90cf85f7f8b2df60c646f853b7f535ce68f85244eb6731cf89fa498ec" diff --git a/oden-script/Cargo.toml b/oden-script/Cargo.toml index 87a1f807..25623ca0 100644 --- a/oden-script/Cargo.toml +++ b/oden-script/Cargo.toml @@ -2,3 +2,6 @@ name = "oden-script" version = "0.1.0" edition = "2021" + +[dev-dependencies] +pretty_assertions = "1.4.0" diff --git a/oden-script/src/lib.rs b/oden-script/src/lib.rs index e0d6d806..5c766355 100644 --- a/oden-script/src/lib.rs +++ b/oden-script/src/lib.rs @@ -1,458 +1 @@ -#[derive(Debug)] -pub enum TokenKind<'a> { - LeftBrace, - RightBrace, - LeftBracket, - RightBracket, - LeftParen, - RightParen, - Comma, - Dot, - Minus, - Plus, - Semicolon, - Slash, - Star, - - Bang, - BangEqual, - Equal, - EqualEqual, - Greater, - GreaterEqual, - Less, - LessEqual, - - Identifier(&'a str), // TODO - String(&'a str), - Number(&'a str), - - And, - Async, - Await, - Class, - Else, - False, - For, - From, - Fun, - If, - Let, - Or, - Print, - Return, - Select, - This, - True, - While, - Yield, - - Error(String), -} - -#[derive(Debug)] -pub struct Token<'a> { - kind: TokenKind<'a>, - start: usize, -} - -impl<'a> Token<'a> { - pub fn as_str<'b>(&'b self) -> &'a str - where - 'b: 'a, - { - use TokenKind::*; - match &self.kind { - LeftBrace => "{", - RightBrace => "}", - LeftBracket => "[", - RightBracket => "]", - - LeftParen => "(", - RightParen => ")", - Comma => ",", - Dot => ".", - Minus => "-", - - Plus => "+", - Semicolon => ";", - Slash => "/", - Star => "*", - - Bang => "+", - BangEqual => "!=", - Equal => "=", - EqualEqual => "==", - Greater => ">", - GreaterEqual => ">=", - Less => "<", - LessEqual => "<=", - - Identifier(v) => v, - String(v) => v, - Number(v) => v, - - And => "and", - Async => "async", - Await => "await", - Class => "class", - Else => "else", - False => "false", - For => "for", - From => "from", - Fun => "fun", - If => "if", - Let => "let", - Or => "or", - Print => "print", - Return => "return", - Select => "select", - This => "this", - True => "true", - While => "while", - Yield => "yield", - - Error(e) => e, - } - } -} - -pub struct Tokens<'a> { - source: &'a str, - chars: std::str::CharIndices<'a>, - next_char: Option<(usize, char)>, - newlines: Vec, -} - -impl<'a> Tokens<'a> { - pub fn new(source: &'a str) -> Self { - let mut chars = source.char_indices(); - let next_char = chars.next(); - Tokens { - source, - chars, - next_char, - newlines: Vec::new(), - } - } - - pub fn token_position(&self, token: &Token) -> (usize, usize) { - let line_end_index = match self.newlines.binary_search(&token.start) { - Ok(index) => index, - Err(index) => index, - }; - let line_start_pos = if line_end_index == 0 { - 0 - } else { - self.newlines[line_end_index - 1] + 1 - }; - let line_number = line_end_index + 1; - let column_offset = token.start - line_start_pos; - (line_number, column_offset) - } - - pub fn next_token(&mut self) -> Option> { - self.skip_whitespace(); // TODO: Whitespace preserving/comment preserving - let (pos, c) = match self.advance() { - Some((p, c)) => (p, c), - None => return None, - }; - - let token = match c { - '{' => TokenKind::LeftBrace, - '}' => TokenKind::RightBrace, - '[' => TokenKind::LeftBracket, - ']' => TokenKind::RightBracket, - '(' => TokenKind::LeftParen, - ')' => TokenKind::RightParen, - ',' => TokenKind::Comma, - '.' => TokenKind::Dot, - '-' => { - if self.matches_next(|c| c.is_ascii_digit()) { - self.number(pos) - } else { - TokenKind::Minus - } - } - '+' => { - if self.matches_next(|c| c.is_ascii_digit()) { - self.number(pos) - } else { - TokenKind::Plus - } - } - ';' => TokenKind::Semicolon, - '/' => TokenKind::Slash, - '*' => TokenKind::Star, - '!' => { - if self.matches('=') { - TokenKind::BangEqual - } else { - TokenKind::Bang - } - } - '=' => { - if self.matches('=') { - TokenKind::EqualEqual - } else { - TokenKind::Equal - } - } - '>' => { - if self.matches('=') { - TokenKind::GreaterEqual - } else { - TokenKind::Greater - } - } - '<' => { - if self.matches('=') { - TokenKind::LessEqual - } else { - TokenKind::Less - } - } - '\'' => self.string(pos, '\''), - '"' => self.string(pos, '"'), - _ => { - if self.matches_next(|c| c.is_ascii_digit()) { - self.number(pos) - } else if self.matches_next(|c| c.is_ascii_alphabetic() || c == '_') { - self.identifier(pos) - } else { - TokenKind::Error(format!("Unexpected character '{c}'")) - } - } - }; - let token = self.token(pos, token); - Some(token) - } - - fn token(&self, start: usize, kind: TokenKind<'a>) -> Token<'a> { - Token { kind, start } - } - - fn number(&mut self, start: usize) -> TokenKind<'a> { - // First, the main part. - loop { - if !self.matches_digit() { - break; - } - } - - // Now the fraction part. - // The thing that is bad here is that this is speculative... - let backup = self.chars.clone(); - if self.matches('.') { - let mut saw_digit = false; - loop { - if self.matches('_') { - } else if self.matches_next(|c| c.is_ascii_digit()) { - saw_digit = true; - } else { - break; - } - } - - if saw_digit { - // OK we're good to here! Check the scientific notation. - if self.matches('e') || self.matches('E') { - if self.matches('+') || self.matches('-') {} - let mut saw_digit = false; - loop { - if self.matches('_') { - } else if self.matches_next(|c| c.is_ascii_digit()) { - saw_digit = true; - } else { - break; - } - } - - if !saw_digit { - // This is just a broken number. - let slice = &self.source[start..self.pos()]; - return TokenKind::Error(format!( - "Invalid floating-point literal: {slice}" - )); - } - } - } else { - // Might be accessing a member on an integer. - self.chars = backup; - } - } - - TokenKind::Number(&self.source[start..self.pos()]) - } - - fn string(&mut self, start: usize, delimiter: char) -> TokenKind<'a> { - while !self.matches(delimiter) { - if self.eof() { - return TokenKind::Error("Unterminated string constant".to_string()); - } - if self.matches('\\') { - self.advance(); - } - } - - TokenKind::String(&self.source[start..self.pos()]) - } - - fn identifier(&mut self, start: usize) -> TokenKind<'a> { - loop { - // TODO: Use unicode identifier classes instead - if !self.matches_next(|c| c.is_ascii_alphanumeric() || c == '_') { - break; - } - } - - let ident = &self.source[start..self.pos()]; - match ident.chars().nth(0) { - Some('a') => { - if ident == "and" { - return TokenKind::And; - } - if ident == "async" { - return TokenKind::Async; - } - if ident == "await" { - return TokenKind::Await; - } - } - Some('c') => { - if ident == "class" { - return TokenKind::Class; - } - } - Some('e') => { - if ident == "else" { - return TokenKind::Else; - } - } - Some('f') => { - if ident == "for" { - return TokenKind::For; - } - if ident == "from" { - return TokenKind::From; - } - if ident == "fun" { - return TokenKind::Fun; - } - } - Some('i') => { - if ident == "if" { - return TokenKind::If; - } - } - Some('l') => { - if ident == "let" { - return TokenKind::Let; - } - } - Some('o') => { - if ident == "or" { - return TokenKind::Or; - } - } - Some('p') => { - if ident == "print" { - return TokenKind::Print; - } - } - Some('r') => { - if ident == "return" { - return TokenKind::Return; - } - } - Some('s') => { - if ident == "select" { - return TokenKind::Select; - } - } - Some('t') => { - if ident == "this" { - return TokenKind::This; - } - if ident == "true" { - return TokenKind::True; - } - } - Some('w') => { - if ident == "while" { - return TokenKind::While; - } - } - Some('y') => { - if ident == "yield" { - return TokenKind::Yield; - } - } - _ => (), - } - - TokenKind::Identifier(ident) - } - - fn matches(&mut self, ch: char) -> bool { - if let Some((_, next_ch)) = self.next_char { - if next_ch == ch { - self.advance(); - return true; - } - } - false - } - - fn matches_next(&mut self, f: F) -> bool - where - F: FnOnce(char) -> bool, - { - if let Some((_, next_ch)) = self.next_char { - if f(next_ch) { - self.advance(); - return true; - } - } - false - } - - fn matches_digit(&mut self) -> bool { - self.matches('_') || self.matches_next(|c| c.is_ascii_digit()) - } - - fn advance(&mut self) -> Option<(usize, char)> { - let result = self.next_char; - self.next_char = self.chars.next(); - result - } - - fn pos(&self) -> usize { - match self.next_char { - Some((p, _)) => p, - None => self.source.len(), - } - } - - fn eof(&self) -> bool { - self.next_char.is_none() - } - - fn skip_whitespace(&mut self) { - while let Some((pos, ch)) = self.next_char { - if ch == '\n' { - self.newlines.push(pos); - } else if !ch.is_whitespace() { - break; - } - self.advance(); - } - } -} - -pub fn tokenize(input: String) { - let mut tokens = Tokens::new(&input); - while let Some(token) = tokens.next_token() { - println!("{}: {}", token.start, token.as_str()); - } -} +pub mod tokens; diff --git a/oden-script/src/main.rs b/oden-script/src/main.rs index 7f158ec7..da0f5d92 100644 --- a/oden-script/src/main.rs +++ b/oden-script/src/main.rs @@ -1,3 +1 @@ -use oden_script; - pub fn main() {} diff --git a/oden-script/src/tokens.rs b/oden-script/src/tokens.rs new file mode 100644 index 00000000..02979fe5 --- /dev/null +++ b/oden-script/src/tokens.rs @@ -0,0 +1,569 @@ +#[derive(Debug, PartialEq, Eq)] +pub enum TokenKind<'a> { + LeftBrace, + RightBrace, + LeftBracket, + RightBracket, + LeftParen, + RightParen, + Comma, + Dot, + Minus, + Plus, + Semicolon, + Slash, + Star, + + Bang, + BangEqual, + Equal, + EqualEqual, + Greater, + GreaterEqual, + Less, + LessEqual, + + Identifier(&'a str), // TODO + String(&'a str), + Number(&'a str), + + And, + Async, + Await, + Class, + Else, + False, + For, + From, + Fun, + If, + Let, + Or, + Print, + Return, + Select, + This, + True, + While, + Yield, + + Error(String), +} + +#[derive(Debug, PartialEq, Eq)] +pub struct Token<'a> { + kind: TokenKind<'a>, + start: usize, +} + +impl<'a> Token<'a> { + pub fn new(start: usize, kind: TokenKind<'a>) -> Self { + Token { kind, start } + } + + pub fn as_str<'b>(&'b self) -> &'a str + where + 'b: 'a, + { + use TokenKind::*; + match &self.kind { + LeftBrace => "{", + RightBrace => "}", + LeftBracket => "[", + RightBracket => "]", + + LeftParen => "(", + RightParen => ")", + Comma => ",", + Dot => ".", + Minus => "-", + + Plus => "+", + Semicolon => ";", + Slash => "/", + Star => "*", + + Bang => "+", + BangEqual => "!=", + Equal => "=", + EqualEqual => "==", + Greater => ">", + GreaterEqual => ">=", + Less => "<", + LessEqual => "<=", + + Identifier(v) => v, + String(v) => v, + Number(v) => v, + + And => "and", + Async => "async", + Await => "await", + Class => "class", + Else => "else", + False => "false", + For => "for", + From => "from", + Fun => "fun", + If => "if", + Let => "let", + Or => "or", + Print => "print", + Return => "return", + Select => "select", + This => "this", + True => "true", + While => "while", + Yield => "yield", + + Error(e) => e, + } + } +} + +pub struct Tokens<'a> { + source: &'a str, + chars: std::str::CharIndices<'a>, + next_char: Option<(usize, char)>, + newlines: Vec, +} + +impl<'a> Tokens<'a> { + pub fn new(source: &'a str) -> Self { + let mut result = Tokens { + source, + chars: source.char_indices(), + next_char: None, + newlines: Vec::new(), + }; + result.advance(); // Prime the pump + result + } + + pub fn token_position(&self, token: &Token) -> (usize, usize) { + let line_end_index = match self.newlines.binary_search(&token.start) { + Ok(index) => index, + Err(index) => index, + }; + let line_start_pos = if line_end_index == 0 { + 0 + } else { + self.newlines[line_end_index - 1] + 1 + }; + let line_number = line_end_index + 1; + let column_offset = token.start - line_start_pos; + (line_number, column_offset) + } + + fn token(&self, start: usize, kind: TokenKind<'a>) -> Token<'a> { + Token::new(start, kind) + } + + fn number(&mut self, start: usize) -> TokenKind<'a> { + // First, the main part. + loop { + if !self.matches_digit() { + break; + } + } + + // Now the fraction part. + // The thing that is bad here is that this is speculative... + let backup = self.chars.clone(); + if self.matches('.') { + let mut saw_digit = false; + loop { + if self.matches('_') { + } else if self.matches_next(|c| c.is_ascii_digit()) { + saw_digit = true; + } else { + break; + } + } + + if saw_digit { + // OK we're good to here! Check the scientific notation. + if self.matches('e') || self.matches('E') { + if self.matches('+') || self.matches('-') {} + let mut saw_digit = false; + loop { + if self.matches('_') { + } else if self.matches_next(|c| c.is_ascii_digit()) { + saw_digit = true; + } else { + break; + } + } + + if !saw_digit { + // This is just a broken number. + let slice = &self.source[start..self.pos()]; + return TokenKind::Error(format!( + "Invalid floating-point literal: {slice}" + )); + } + } + } else { + // Might be accessing a member on an integer. + self.chars = backup; + } + } + + TokenKind::Number(&self.source[start..self.pos()]) + } + + fn string(&mut self, start: usize, delimiter: char) -> TokenKind<'a> { + while !self.matches(delimiter) { + if self.eof() { + return TokenKind::Error("Unterminated string constant".to_string()); + } + if self.matches('\\') { + self.advance(); + } else { + self.advance(); + } + } + + TokenKind::String(&self.source[start..self.pos()]) + } + + fn identifier(&mut self, start: usize) -> TokenKind<'a> { + loop { + // TODO: Use unicode identifier classes instead + if !self.matches_next(|c| c.is_ascii_alphanumeric() || c == '_') { + break; + } + } + + let ident = &self.source[start..self.pos()]; + match ident.chars().nth(0) { + Some('a') => { + if ident == "and" { + return TokenKind::And; + } + if ident == "async" { + return TokenKind::Async; + } + if ident == "await" { + return TokenKind::Await; + } + } + Some('c') => { + if ident == "class" { + return TokenKind::Class; + } + } + Some('e') => { + if ident == "else" { + return TokenKind::Else; + } + } + Some('f') => { + if ident == "false" { + return TokenKind::False; + } + if ident == "for" { + return TokenKind::For; + } + if ident == "from" { + return TokenKind::From; + } + if ident == "fun" { + return TokenKind::Fun; + } + } + Some('i') => { + if ident == "if" { + return TokenKind::If; + } + } + Some('l') => { + if ident == "let" { + return TokenKind::Let; + } + } + Some('o') => { + if ident == "or" { + return TokenKind::Or; + } + } + Some('p') => { + if ident == "print" { + return TokenKind::Print; + } + } + Some('r') => { + if ident == "return" { + return TokenKind::Return; + } + } + Some('s') => { + if ident == "select" { + return TokenKind::Select; + } + } + Some('t') => { + if ident == "this" { + return TokenKind::This; + } + if ident == "true" { + return TokenKind::True; + } + } + Some('w') => { + if ident == "while" { + return TokenKind::While; + } + } + Some('y') => { + if ident == "yield" { + return TokenKind::Yield; + } + } + _ => (), + } + + TokenKind::Identifier(ident) + } + + fn matches(&mut self, ch: char) -> bool { + if let Some((_, next_ch)) = self.next_char { + if next_ch == ch { + self.advance(); + return true; + } + } + false + } + + fn matches_next(&mut self, f: F) -> bool + where + F: FnOnce(char) -> bool, + { + if let Some((_, next_ch)) = self.next_char { + if f(next_ch) { + eprintln!("MATCHES NEXT: {next_ch}"); + self.advance(); + return true; + } else { + eprintln!("NOT MATCHES NEXT: {next_ch}"); + } + } else { + eprintln!("E O F"); + } + false + } + + fn matches_digit(&mut self) -> bool { + self.matches('_') || self.matches_next(|c| c.is_ascii_digit()) + } + + fn advance(&mut self) -> Option<(usize, char)> { + let result = self.next_char; + self.next_char = self.chars.next(); + eprintln!("NEXT: {:?}", self.next_char); + result + } + + fn pos(&self) -> usize { + match self.next_char { + Some((p, _)) => p, + None => self.source.len(), + } + } + + fn eof(&self) -> bool { + self.next_char.is_none() + } + + fn skip_whitespace(&mut self) { + while let Some((pos, ch)) = self.next_char { + if ch == '\n' { + self.newlines.push(pos); + } else if !ch.is_whitespace() { + break; + } + self.advance(); + } + } +} + +impl<'a> std::iter::Iterator for Tokens<'a> { + type Item = Token<'a>; + + fn next(&mut self) -> Option { + self.skip_whitespace(); // TODO: Whitespace preserving/comment preserving + let (pos, c) = match self.advance() { + Some((p, c)) => (p, c), + None => return None, + }; + + let token = match c { + '{' => TokenKind::LeftBrace, + '}' => TokenKind::RightBrace, + '[' => TokenKind::LeftBracket, + ']' => TokenKind::RightBracket, + '(' => TokenKind::LeftParen, + ')' => TokenKind::RightParen, + ',' => TokenKind::Comma, + '.' => TokenKind::Dot, + '-' => { + if self.matches_next(|c| c.is_ascii_digit()) { + self.number(pos) + } else { + TokenKind::Minus + } + } + '+' => { + if self.matches_next(|c| c.is_ascii_digit()) { + self.number(pos) + } else { + TokenKind::Plus + } + } + ';' => TokenKind::Semicolon, + '/' => TokenKind::Slash, + '*' => TokenKind::Star, + '!' => { + if self.matches('=') { + TokenKind::BangEqual + } else { + TokenKind::Bang + } + } + '=' => { + if self.matches('=') { + TokenKind::EqualEqual + } else { + TokenKind::Equal + } + } + '>' => { + if self.matches('=') { + TokenKind::GreaterEqual + } else { + TokenKind::Greater + } + } + '<' => { + if self.matches('=') { + TokenKind::LessEqual + } else { + TokenKind::Less + } + } + '\'' => self.string(pos, '\''), + '"' => self.string(pos, '"'), + _ => { + if c.is_ascii_digit() { + self.number(pos) + } else if c.is_ascii_alphabetic() || c == '_' { + self.identifier(pos) + } else { + TokenKind::Error(format!("Unexpected character '{c}'")) + } + } + }; + let token = self.token(pos, token); + Some(token) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use pretty_assertions::assert_eq; + + macro_rules! test_tokens { + ($name:ident, $input:expr, $($s:expr),+) => { + #[test] + fn $name() { + use TokenKind::*; + let tokens: Vec<_> = Tokens::new($input).collect(); + let expected = vec![$($s),*]; + assert_eq!(expected, tokens); + } + } + } + + test_tokens!( + numbers, + "1 1.0 1.2e7 2.3e+7 3.3E-06 7_6 8.0e_8", + Token::new(0, Number("1")), + Token::new(2, Number("1.0")), + Token::new(6, Number("1.2e7")), + Token::new(12, Number("2.3e+7")), + Token::new(19, Number("3.3E-06")), + Token::new(27, Number("7_6")), + Token::new(31, Number("8.0e_8")) + ); + + test_tokens!( + identifiers, + "asdf x _123 a_23 x3a and or yield async await class else false for from", + Token::new(0, Identifier("asdf")), + Token::new(5, Identifier("x")), + Token::new(7, Identifier("_123")), + Token::new(12, Identifier("a_23")), + Token::new(17, Identifier("x3a")), + Token::new(21, And), + Token::new(25, Or), + Token::new(28, Yield), + Token::new(34, Async), + Token::new(40, Await), + Token::new(46, Class), + Token::new(52, Else), + Token::new(57, False), + Token::new(63, For), + Token::new(67, From) + ); + + test_tokens!( + more_keywords, + "fun if let print return select this true while truewhile", + Token::new(0, Fun), + Token::new(4, If), + Token::new(7, Let), + Token::new(11, Print), + Token::new(17, Return), + Token::new(24, Select), + Token::new(31, This), + Token::new(36, True), + Token::new(41, While), + Token::new(47, Identifier("truewhile")) + ); + + test_tokens!( + strings, + r#"'this is a string that\'s great!\r\n' "foo's" 'bar"s' "#, + Token::new(0, String(r#"'this is a string that\'s great!\r\n'"#)), + Token::new(38, String(r#""foo's""#)), + Token::new(46, String("'bar\"s'")) + ); + + test_tokens!( + symbols, + "{ } ( ) [ ] . ! != < <= > >= = == , - + * / ;", + Token::new(0, LeftBrace), + Token::new(2, RightBrace), + Token::new(4, LeftParen), + Token::new(6, RightParen), + Token::new(8, LeftBracket), + Token::new(10, RightBracket), + Token::new(12, Dot), + Token::new(14, Bang), + Token::new(16, BangEqual), + Token::new(19, Less), + Token::new(21, LessEqual), + Token::new(24, Greater), + Token::new(26, GreaterEqual), + Token::new(29, Equal), + Token::new(31, EqualEqual), + Token::new(34, Comma), + Token::new(36, Minus), + Token::new(38, Plus), + Token::new(40, Star), + Token::new(42, Slash), + Token::new(44, Semicolon) + ); +}