// Lexer for oden-script.
//
// `Tokens` walks a source string and produces `Token`s one at a time. Every
// token borrows its text directly from the source (no allocation), except for
// `TokenKind::Error`, which owns its diagnostic message.

/// The kind of a lexed token.
///
/// `Identifier`, `String` and `Number` carry the exact slice of source text
/// they were lexed from (for strings this includes both delimiters). `Error`
/// carries a human-readable description of what went wrong.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TokenKind<'a> {
    LeftBrace,
    RightBrace,
    LeftBracket,
    RightBracket,
    LeftParen,
    RightParen,
    Comma,
    Dot,
    Minus,
    Plus,
    Semicolon,
    Slash,
    Star,

    Bang,
    BangEqual,
    Equal,
    EqualEqual,
    Greater,
    GreaterEqual,
    Less,
    LessEqual,

    Identifier(&'a str), // TODO
    String(&'a str),
    Number(&'a str),

    And,
    Async,
    Await,
    Class,
    Else,
    False,
    For,
    From,
    Fun,
    If,
    Let,
    Or,
    Print,
    Return,
    Select,
    This,
    True,
    While,
    Yield,

    Error(String),
}

/// A single token: its kind plus the byte offset in the source where it starts.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Token<'a> {
    kind: TokenKind<'a>,
    start: usize,
}

impl<'a> Token<'a> {
    /// The kind of this token.
    pub fn kind(&self) -> &TokenKind<'a> {
        &self.kind
    }

    /// Byte offset of the first character of this token in the source.
    pub fn start(&self) -> usize {
        self.start
    }

    /// The textual form of this token.
    ///
    /// Punctuation and keywords return their fixed spelling; identifiers,
    /// strings and numbers return the source slice they were lexed from; error
    /// tokens return their message. The result borrows from `self` because an
    /// error message is owned by the token.
    pub fn as_str(&self) -> &str {
        match self.kind {
            TokenKind::LeftBrace => "{",
            TokenKind::RightBrace => "}",
            TokenKind::LeftBracket => "[",
            TokenKind::RightBracket => "]",

            TokenKind::LeftParen => "(",
            TokenKind::RightParen => ")",
            TokenKind::Comma => ",",
            TokenKind::Dot => ".",
            TokenKind::Minus => "-",

            TokenKind::Plus => "+",
            TokenKind::Semicolon => ";",
            TokenKind::Slash => "/",
            TokenKind::Star => "*",

            TokenKind::Bang => "!",
            TokenKind::BangEqual => "!=",
            TokenKind::Equal => "=",
            TokenKind::EqualEqual => "==",
            TokenKind::Greater => ">",
            TokenKind::GreaterEqual => ">=",
            TokenKind::Less => "<",
            TokenKind::LessEqual => "<=",

            TokenKind::Identifier(text) | TokenKind::String(text) | TokenKind::Number(text) => {
                text
            }

            TokenKind::And => "and",
            TokenKind::Async => "async",
            TokenKind::Await => "await",
            TokenKind::Class => "class",
            TokenKind::Else => "else",
            TokenKind::False => "false",
            TokenKind::For => "for",
            TokenKind::From => "from",
            TokenKind::Fun => "fun",
            TokenKind::If => "if",
            TokenKind::Let => "let",
            TokenKind::Or => "or",
            TokenKind::Print => "print",
            TokenKind::Return => "return",
            TokenKind::Select => "select",
            TokenKind::This => "this",
            TokenKind::True => "true",
            TokenKind::While => "while",
            TokenKind::Yield => "yield",

            TokenKind::Error(ref message) => message.as_str(),
        }
    }
}

/// A streaming lexer over a source string.
///
/// Holds a one-character lookahead (`next_char`) on top of a `CharIndices`
/// cursor, and remembers the byte offset of every newline it has consumed so
/// that token offsets can later be turned into line/column pairs.
pub struct Tokens<'a> {
    source: &'a str,
    chars: std::str::CharIndices<'a>,
    next_char: Option<(usize, char)>,
    newlines: Vec<usize>,
}

impl<'a> Tokens<'a> {
    /// Creates a lexer positioned at the start of `source`.
    pub fn new(source: &'a str) -> Self {
        let mut chars = source.char_indices();
        let next_char = chars.next();
        Tokens {
            source,
            chars,
            next_char,
            newlines: Vec::new(),
        }
    }

    /// Returns `(line, column)` for `token`.
    ///
    /// The line number is 1-based; the column is the 0-based byte offset of the
    /// token from the start of its line. Only newlines the lexer has already
    /// consumed are known, so this is meaningful for tokens that this lexer
    /// has produced.
    pub fn token_position(&self, token: &Token) -> (usize, usize) {
        // `newlines` is sorted (the lexer only moves forward), so the number of
        // recorded newlines before the token is the index of its line.
        let line_index = self.newlines.partition_point(|&newline| newline < token.start);
        let line_start = match line_index {
            0 => 0,
            n => self.newlines[n - 1] + 1,
        };
        (line_index + 1, token.start - line_start)
    }

    /// Lexes and returns the next token, or `None` at end of input.
    ///
    /// Leading whitespace is skipped. Malformed input never aborts lexing; it
    /// is reported as a `TokenKind::Error` token and lexing continues after it.
    pub fn next_token(&mut self) -> Option<Token<'a>> {
        self.skip_whitespace(); // TODO: Whitespace preserving/comment preserving
        let (pos, c) = self.advance()?;

        let kind = match c {
            '{' => TokenKind::LeftBrace,
            '}' => TokenKind::RightBrace,
            '[' => TokenKind::LeftBracket,
            ']' => TokenKind::RightBracket,
            '(' => TokenKind::LeftParen,
            ')' => TokenKind::RightParen,
            ',' => TokenKind::Comma,
            '.' => TokenKind::Dot,
            // A sign immediately followed by a digit is lexed as part of the number.
            '-' => self.sign_or_number(pos, TokenKind::Minus),
            '+' => self.sign_or_number(pos, TokenKind::Plus),
            ';' => TokenKind::Semicolon,
            '/' => TokenKind::Slash,
            '*' => TokenKind::Star,
            '!' => self.with_equal(TokenKind::Bang, TokenKind::BangEqual),
            '=' => self.with_equal(TokenKind::Equal, TokenKind::EqualEqual),
            '>' => self.with_equal(TokenKind::Greater, TokenKind::GreaterEqual),
            '<' => self.with_equal(TokenKind::Less, TokenKind::LessEqual),
            '\'' | '"' => self.string(pos, c),
            // Classify on the character we just consumed, not on the lookahead.
            _ if c.is_ascii_digit() => self.number(pos),
            _ if c.is_ascii_alphabetic() || c == '_' => self.identifier(pos),
            _ => TokenKind::Error(format!("Unexpected character '{c}'")),
        };
        Some(self.token(pos, kind))
    }

    /// Builds a token of `kind` starting at byte offset `start`.
    fn token(&self, start: usize, kind: TokenKind<'a>) -> Token<'a> {
        Token { kind, start }
    }

    /// Returns `with_eq` if the next character is `=` (consuming it), else `plain`.
    fn with_equal(&mut self, plain: TokenKind<'a>, with_eq: TokenKind<'a>) -> TokenKind<'a> {
        if self.matches('=') {
            with_eq
        } else {
            plain
        }
    }

    /// Having consumed a `+`/`-` at `start`: lexes a signed number if a digit
    /// follows, otherwise returns `operator`.
    fn sign_or_number(&mut self, start: usize, operator: TokenKind<'a>) -> TokenKind<'a> {
        if self.matches_next(|c| c.is_ascii_digit()) {
            self.number(start)
        } else {
            operator
        }
    }

    /// Lexes the remainder of a numeric literal whose first character(s)
    /// (optional sign and first digit) have already been consumed.
    ///
    /// Accepts `_` separators, an optional `.fraction`, and (only after a
    /// fraction) an optional exponent `e[+-]digits`.
    fn number(&mut self, start: usize) -> TokenKind<'a> {
        // First, the main part.
        while self.matches_digit() {}

        // Now the fraction part. This is speculative: `1.foo` is a member
        // access on an integer, so we must be able to rewind to before the dot.
        // Both halves of the cursor (iterator and lookahead) are saved so the
        // rewind leaves the lexer exactly where it was.
        let saved_chars = self.chars.clone();
        let saved_next = self.next_char;
        if self.matches('.') {
            if self.digit_run() {
                // OK we're good to here! Check the scientific notation.
                if self.matches('e') || self.matches('E') {
                    // Optional exponent sign.
                    let _ = self.matches('+') || self.matches('-');
                    if !self.digit_run() {
                        // This is just a broken number.
                        let slice = &self.source[start..self.pos()];
                        return TokenKind::Error(format!(
                            "Invalid floating-point literal: {slice}"
                        ));
                    }
                }
            } else {
                // Might be accessing a member on an integer.
                self.chars = saved_chars;
                self.next_char = saved_next;
            }
        }

        TokenKind::Number(&self.source[start..self.pos()])
    }

    /// Consumes a run of digits and `_` separators; returns whether at least
    /// one actual digit was seen.
    fn digit_run(&mut self) -> bool {
        let mut saw_digit = false;
        loop {
            if self.matches('_') {
                continue;
            }
            if self.matches_next(|c| c.is_ascii_digit()) {
                saw_digit = true;
            } else {
                return saw_digit;
            }
        }
    }

    /// Lexes the remainder of a string literal whose opening `delimiter` (at
    /// `start`) has already been consumed.
    ///
    /// A backslash escapes the following character, so an escaped delimiter
    /// does not end the string. The resulting slice includes both delimiters.
    /// Reaching end of input first yields an error token.
    fn string(&mut self, start: usize, delimiter: char) -> TokenKind<'a> {
        loop {
            match self.advance() {
                None => return TokenKind::Error("Unterminated string constant".to_string()),
                Some((_, c)) if c == delimiter => break,
                Some((_, '\\')) => {
                    // Skip whatever is being escaped; if that hits end of
                    // input the next iteration reports the error.
                    self.advance();
                }
                Some(_) => {}
            }
        }

        TokenKind::String(&self.source[start..self.pos()])
    }

    /// Lexes the remainder of an identifier or keyword whose first character
    /// (at `start`) has already been consumed.
    fn identifier(&mut self, start: usize) -> TokenKind<'a> {
        // TODO: Use unicode identifier classes instead
        while self.matches_next(|c| c.is_ascii_alphanumeric() || c == '_') {}

        let ident = &self.source[start..self.pos()];
        match ident {
            "and" => TokenKind::And,
            "async" => TokenKind::Async,
            "await" => TokenKind::Await,
            "class" => TokenKind::Class,
            "else" => TokenKind::Else,
            "false" => TokenKind::False,
            "for" => TokenKind::For,
            "from" => TokenKind::From,
            "fun" => TokenKind::Fun,
            "if" => TokenKind::If,
            "let" => TokenKind::Let,
            "or" => TokenKind::Or,
            "print" => TokenKind::Print,
            "return" => TokenKind::Return,
            "select" => TokenKind::Select,
            "this" => TokenKind::This,
            "true" => TokenKind::True,
            "while" => TokenKind::While,
            "yield" => TokenKind::Yield,
            _ => TokenKind::Identifier(ident),
        }
    }

    /// Consumes the next character if it equals `ch`; reports whether it did.
    fn matches(&mut self, ch: char) -> bool {
        self.matches_next(|next| next == ch)
    }

    /// Consumes the next character if `f` accepts it; reports whether it did.
    fn matches_next(&mut self, f: impl FnOnce(char) -> bool) -> bool {
        match self.next_char {
            Some((_, next)) if f(next) => {
                self.advance();
                true
            }
            _ => false,
        }
    }

    /// Consumes one digit or `_` separator if present.
    fn matches_digit(&mut self) -> bool {
        self.matches('_') || self.matches_next(|c| c.is_ascii_digit())
    }

    /// Consumes and returns the lookahead character (with its byte offset).
    ///
    /// Every consumed newline is recorded here, rather than only while
    /// skipping whitespace, so that newlines inside string literals still
    /// count toward line numbers.
    fn advance(&mut self) -> Option<(usize, char)> {
        let result = self.next_char;
        if let Some((pos, '\n')) = result {
            self.newlines.push(pos);
        }
        self.next_char = self.chars.next();
        result
    }

    /// Byte offset of the lookahead character, or the source length at EOF.
    fn pos(&self) -> usize {
        match self.next_char {
            Some((p, _)) => p,
            None => self.source.len(),
        }
    }

    /// Whether the whole source has been consumed.
    #[allow(dead_code)] // Kept as part of the lexer's private cursor API.
    fn eof(&self) -> bool {
        self.next_char.is_none()
    }

    /// Consumes characters up to (not including) the next non-whitespace one.
    fn skip_whitespace(&mut self) {
        while self.matches_next(char::is_whitespace) {}
    }
}

impl<'a> Iterator for Tokens<'a> {
    type Item = Token<'a>;

    /// Equivalent to [`Tokens::next_token`].
    fn next(&mut self) -> Option<Self::Item> {
        self.next_token()
    }
}

/// Lexes `input` and prints each token as `<start offset>: <text>` on stdout.
pub fn tokenize(input: String) {
    for token in Tokens::new(&input) {
        println!("{}: {}", token.start, token.as_str());
    }
}
a962dcd3..00000000 --- a/oden-script/src/parser.rs +++ /dev/null @@ -1,906 +0,0 @@ -use crate::tokens::{Lines, Token, TokenKind, Tokens}; -use std::fmt; - -// TODO: An error should have: -// -// - a start -// - an end -// - a focus -// - descriptive messages -// -// that will have to wait for now -#[derive(PartialEq, Eq)] -pub struct SyntaxError { - pub start: (usize, usize), - pub end: (usize, usize), - pub message: String, -} - -impl SyntaxError { - pub fn new(line: usize, column: usize, message: T) -> Self - where - T: ToString, - { - SyntaxError { - start: (line, column), - end: (line, column), - message: message.to_string(), - } - } - - pub fn new_spanned(start: (usize, usize), end: (usize, usize), message: T) -> Self - where - T: ToString, - { - SyntaxError { - start, - end, - message: message.to_string(), - } - } -} - -impl fmt::Debug for SyntaxError { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "{self}") - } -} - -impl fmt::Display for SyntaxError { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "{}:{}: {}", self.start.0, self.end.0, self.message) - } -} - -#[derive(Clone)] -pub enum Literal { - Float64(f64), - String(String), - Bool(bool), -} - -#[derive(Copy, Clone)] -pub enum UnaryOp { - Negate, - Not, -} - -#[derive(Copy, Clone)] -pub enum BinaryOp { - Add, - Subtract, - Multiply, - Divide, - And, - Or, -} - -#[derive(Clone)] -pub enum Expr<'a> { - Literal(Literal, Token<'a>), - Unary(UnaryOp, Token<'a>, ExprRef), - Binary(BinaryOp, Token<'a>, ExprRef, ExprRef), - Conditional(Token<'a>, ExprRef, ExprRef, Option, Token<'a>), -} - -#[derive(Clone)] -pub struct ExprRef(Option); - -impl ExprRef { - pub fn error() -> Self { - ExprRef(None) - } -} - -// TODO: Eventually we will be unable to use Eq and PartialEq here, and will -// need to do out own thing. -#[derive(Copy, Clone)] -pub enum Type { - // Signals a type error. 
If you receive this then you know that an error - // has already been reported; if you produce this be sure to also note - // the error in the errors collection. - Error, - - // Signals that the expression has a control-flow side-effect and that no - // value will ever result from this expression. Usually this means - // everything's fine. - Unreachable, - - // TODO: Numeric literals should be implicitly convertable, unlike other - // types. Maybe just "numeric literal" type? - F64, - String, - Bool, -} - -impl Type { - pub fn is_error(&self) -> bool { - match self { - Type::Error => true, - _ => false, - } - } - - pub fn compatible_with(&self, other: &Type) -> bool { - // TODO: This is wrong; we because of numeric literals etc. - match (self, other) { - (Type::F64, Type::F64) => true, - (Type::String, Type::String) => true, - (Type::Bool, Type::Bool) => true, - (Type::Unreachable, Type::Unreachable) => true, - - // Avoid introducing more errors - (Type::Error, _) => true, - (_, Type::Error) => true, - - (_, _) => false, - } - } -} - -impl std::fmt::Debug for Type { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "{self}") - } -} - -impl std::fmt::Display for Type { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - use Type::*; - match self { - Error => write!(f, "<< INTERNAL ERROR >>"), - Unreachable => write!(f, "<< UNREACHABLE >>"), - F64 => write!(f, "f64"), - String => write!(f, "string"), - Bool => write!(f, "bool"), - } - } -} - -pub struct TypeRef(Option); - -pub struct SyntaxTree<'a> { - pub errors: Vec, - expressions: Vec>, -} - -impl<'a> SyntaxTree<'a> { - pub fn new() -> Self { - SyntaxTree { - errors: Vec::new(), - expressions: Vec::new(), - } - } - - pub fn add_error(&mut self, error: SyntaxError) { - self.errors.push(error); - } - - pub fn add_expr(&mut self, expr: Expr<'a>) -> ExprRef { - let index = self.expressions.len(); - self.expressions.push(expr); - ExprRef(Some(index)) - } - - pub fn dump_expr(&self, 
expr: &ExprRef) -> String { - match expr.0 { - Some(idx) => { - let expr = &self.expressions[idx]; - match expr { - Expr::Literal(_, tok) => tok.to_string(), - Expr::Unary(_, tok, e) => { - format!("({tok} {})", self.dump_expr(e)) - } - Expr::Binary(_, tok, l, r) => { - format!("({tok} {} {})", self.dump_expr(l), self.dump_expr(r)) - } - Expr::Conditional(tok, cond, t, e, _) => { - if let Some(e) = e { - format!( - "({tok} {} {} {})", - self.dump_expr(cond), - self.dump_expr(t), - self.dump_expr(e) - ) - } else { - format!("({tok} {} {})", self.dump_expr(cond), self.dump_expr(t)) - } - } - } - } - None => "<|EOF|>".to_string(), - } - } - - pub fn expr_span(&self, expr: &ExprRef) -> Option<(Token<'a>, Token<'a>)> { - let expr = match expr.0 { - Some(idx) => &self.expressions[idx], - None => return None, - }; - - match expr { - Expr::Literal(_, tok) => Some((tok.clone(), tok.clone())), - Expr::Unary(_, tok, arg) => { - let arg = self.expr_span(arg); - match arg { - None => None, - Some((_, end)) => Some((tok.clone(), end)), - } - } - Expr::Binary(_, _, left, right) => { - let left = self.expr_span(left); - let right = self.expr_span(right); - match (left, right) { - (None, _) => None, - (_, None) => None, - (Some((start, _)), Some((_, end))) => Some((start, end)), - } - } - Expr::Conditional(head, _, _, _, tail) => Some((head.clone(), tail.clone())), - } - } - - pub fn expr_type(&mut self, expr: &ExprRef, lines: &Lines, value_required: bool) -> Type { - // TODO: Cache and work on demand? Or is this just fine? - - let exr = expr.clone(); - let expr = match expr.0 { - Some(idx) => &self.expressions[idx], - None => return Type::Error, - }; - match expr { - Expr::Literal(lit, _) => match lit { - Literal::Float64(_) => Type::F64, - Literal::String(_) => Type::String, - Literal::Bool(_) => Type::Bool, - }, - - // Figure out the main thing. Check for a... trait? 
- Expr::Unary(op, tok, arg) => { - let op = op.clone(); - let arg = arg.clone(); - let tok = tok.clone(); - let arg_type = self.expr_type(&arg, lines, true); - match (op, arg_type) { - (UnaryOp::Negate, Type::F64) => Type::F64, - (UnaryOp::Not, Type::Bool) => Type::Bool, - - // This is dumb and should be punished, probably. - (_, Type::Unreachable) => { - let (line, col) = lines.position(tok.start()); - self.errors.push(SyntaxError::new(line, col, format!("cannot apply a unary operator to something that doesn't yield a value"))); - Type::Error - } - - // Propagate existing errors without additional complaint. - (_, Type::Error) => Type::Error, - - // Missed the whole table, must be an error. - (_, arg_type) => { - let (line, col) = lines.position(tok.start()); - self.errors.push(SyntaxError::new(line, col, format!("cannot apply unary operator '{tok}' to expression of type '{arg_type}'"))); - Type::Error - } - } - } - - Expr::Binary(op, tok, left, right) => { - let op = op.clone(); - let tok = tok.clone(); - let left = left.clone(); - let right = right.clone(); - let left_type = self.expr_type(&left, lines, true); - let right_type = self.expr_type(&right, lines, true); - - match (op, left_type, right_type) { - ( - BinaryOp::Add | BinaryOp::Subtract | BinaryOp::Multiply | BinaryOp::Divide, - Type::F64, - Type::F64, - ) => Type::F64, - - (BinaryOp::Add, Type::String, Type::String) => Type::String, - - (BinaryOp::And | BinaryOp::Or, Type::Bool, Type::Bool) => Type::Bool, - - // This is dumb and should be punished, probably. 
- (_, _, Type::Unreachable) => { - let (line, col) = lines.position(tok.start()); - self.errors.push(SyntaxError::new( - line, - col, - format!( - "cannot apply '{tok}' to an argument that doesn't yield a value (on the right)" - ), - )); - Type::Error - } - (_, Type::Unreachable, _) => { - let (line, col) = lines.position(tok.start()); - self.errors.push(SyntaxError::new( - line, - col, - format!( - "cannot apply '{tok}' to an argument that doesn't yield a value (on the left)" - ), - )); - Type::Error - } - - // Propagate existing errors without additional complaint. - (_, Type::Error, _) => Type::Error, - (_, _, Type::Error) => Type::Error, - - // Missed the whole table, it must be an error. - (_, left_type, right_type) => { - let (line, col) = lines.position(tok.start()); - self.errors.push(SyntaxError::new(line, col, format!("cannot apply binary operator '{tok}' to expressions of type '{left_type}' (on the left) and '{right_type}' (on the right)"))); - Type::Error - } - } - } - - Expr::Conditional(_, cond, then_exp, else_exp, _) => { - let cond = cond.clone(); - let then_exp = then_exp.clone(); - let else_exp = else_exp.clone(); - - let cond_type = self.expr_type(&cond, lines, true); - let then_type = self.expr_type(&then_exp, lines, value_required); - let else_type = else_exp.map(|e| self.expr_type(&e, lines, value_required)); - if !cond_type.compatible_with(&Type::Bool) { - if !cond_type.is_error() { - let span = self - .expr_span(&cond) - .expect("If the expression has a type it must have a span"); - - let start = lines.position(span.0.start()); - let end = lines.position(span.1.start()); - self.errors.push(SyntaxError::new_spanned( - start, - end, - "the condition of an `if` expression must be a boolean", - )); - } - return Type::Error; - } - - match (then_type, else_type) { - (Type::Error, _) => Type::Error, - (_, Some(Type::Error)) => Type::Error, - - // It's an error to have a missing else branch if the value is required - (_, None) if value_required => { 
- let span = self - .expr_span(&exr) - .expect("How did I get this far with a broken parse?"); - let start = lines.position(span.0.start()); - let end = lines.position(span.1.start()); - self.errors.push(SyntaxError::new_spanned( - start, - end, - "this `if` expression must have both a `then` clause and an `else` clause, so it can produce a value", - )); - Type::Error - } - - // If the value is required then the branches must be - // compatible, and the type of the expression is the type - // of the `then` branch. - (then_type, Some(else_type)) if value_required => { - if !then_type.compatible_with(&else_type) { - let span = self - .expr_span(&exr) - .expect("How did I get this far with a broken parse?"); - let start = lines.position(span.0.start()); - let end = lines.position(span.1.start()); - self.errors.push(SyntaxError::new_spanned( - start, - end, - format!("the type of the `then` branch ({then_type}) must match the type of the `else` branch ({else_type})"), - )); - Type::Error - } else { - then_type - } - } - - // The value must not be required, just mark this as unreachable. - (_, _) => { - assert!(!value_required); - Type::Unreachable - } - } - } - } - } -} - -// BINDING POWERS. When parsing expressions we only accept expressions that -// meet a minimum binding power. (This is like "precedence" but I just super -// don't like that terminology.) -const ASSIGNMENT_POWER: u8 = 0; // = -const OR_POWER: u8 = 1; // or -const AND_POWER: u8 = 2; // and -const EQUALITY_POWER: u8 = 3; // == != -const COMPARISON_POWER: u8 = 4; // < > <= >= -const TERM_POWER: u8 = 5; // + - -const FACTOR_POWER: u8 = 6; // * / -const UNARY_POWER: u8 = 7; // ! - - -// const CALL_POWER: u8 = 8; // . 
() -// const PRIMARY_POWER: u8 = 9; - -fn token_power<'a>(token: &Option>) -> Option { - let token = match token { - Some(t) => t, - None => return None, - }; - - match token.kind() { - TokenKind::Equal => Some(ASSIGNMENT_POWER), - TokenKind::Or => Some(OR_POWER), - TokenKind::And => Some(AND_POWER), - TokenKind::EqualEqual | TokenKind::BangEqual => Some(EQUALITY_POWER), - TokenKind::Less | TokenKind::Greater | TokenKind::GreaterEqual | TokenKind::LessEqual => { - Some(COMPARISON_POWER) - } - TokenKind::Plus | TokenKind::Minus => Some(TERM_POWER), - TokenKind::Star | TokenKind::Slash => Some(FACTOR_POWER), - _ => None, - } -} - -pub struct Parser<'a> { - tokens: Tokens<'a>, - tree: SyntaxTree<'a>, - current: Option>, - previous: Option>, - - panic_mode: bool, -} - -impl<'a> Parser<'a> { - pub fn new(source: &'a str) -> Self { - let mut parser = Parser { - tokens: Tokens::new(source), - tree: SyntaxTree::new(), - current: None, - previous: None, - panic_mode: false, - }; - parser.advance(); - parser - } - - pub fn parse(mut self) -> (SyntaxTree<'a>, ExprRef, Lines) { - let expr = self.expression(); - self.consume(None, "expected end of expression"); - (self.tree, expr, self.tokens.lines()) - } - - fn expression(&mut self) -> ExprRef { - self.expression_with_power(0) - } - - fn expression_with_power(&mut self, minimum_power: u8) -> ExprRef { - self.trace("expression with power"); - self.advance(); - let mut expr = self.prefix_expression(); - loop { - let power = match token_power(&self.current) { - Some(p) => p, - None => break, // EOF, end of expression? 
- }; - - if power < minimum_power { - break; - } - - self.advance(); - expr = self.infix_expression(power, expr); - } - expr - } - - fn prefix_expression(&mut self) -> ExprRef { - self.trace("prefix"); - let token = self.previous.as_ref(); - match token { - Some(token) => match token.kind() { - TokenKind::Bang => self.unary(), - TokenKind::LeftParen => self.grouping(), - TokenKind::Number => self.number(), - TokenKind::Minus => self.unary(), - TokenKind::String => self.string(), - - TokenKind::True => self - .tree - .add_expr(Expr::Literal(Literal::Bool(true), token.clone())), - TokenKind::False => self - .tree - .add_expr(Expr::Literal(Literal::Bool(false), token.clone())), - - TokenKind::If => self.conditional(), - - _ => { - self.error("expected an expression"); - ExprRef::error() - } - }, - None => { - self.error("expected an expression"); - ExprRef::error() - } - } - } - - fn infix_expression(&mut self, power: u8, left: ExprRef) -> ExprRef { - self.trace("infix"); - let kind = self.previous.as_ref().unwrap().kind(); - match kind { - TokenKind::Plus - | TokenKind::Minus - | TokenKind::Star - | TokenKind::Slash - | TokenKind::And - | TokenKind::Or => self.binary(power, left), - _ => panic!("Unknown infix operator, dispatch error?"), - } - } - - fn number(&mut self) -> ExprRef { - let token = self.previous.as_ref().unwrap(); - // What kind is it? For now let's just ... make it good. 
- - let literal = match token.as_str().parse::() { - Ok(v) => Literal::Float64(v), - Err(e) => { - self.error(format!("invalid f64: {e}")); - return ExprRef::error(); - } - }; - - self.tree.add_expr(Expr::Literal(literal, token.clone())) - } - - fn string(&mut self) -> ExprRef { - let token = self.previous.as_ref().unwrap(); - - let mut result = String::new(); - let mut input = token.as_str().chars(); - - assert!(input.next().is_some()); // Delimiter - while let Some(ch) = input.next() { - match ch { - '\\' => match input.next().unwrap() { - 'n' => result.push('\n'), - 'r' => result.push('\r'), - 't' => result.push('\t'), - ch => result.push(ch), - }, - _ => result.push(ch), - } - } - result.pop(); // We pushed the other delimiter on, whoops. - - let literal = Literal::String(result); - self.tree.add_expr(Expr::Literal(literal, token.clone())) - } - - fn grouping(&mut self) -> ExprRef { - let result = self.expression(); - self.consume( - Some(TokenKind::RightParen), - "expected ')' after an expression", - ); - result - } - - fn conditional(&mut self) -> ExprRef { - let token = self.previous.as_ref().unwrap().clone(); - let condition_expr = self.expression(); - self.consume( - Some(TokenKind::LeftBrace), - "expected '{' to start an 'if' block", - ); - let then_expr = self.expression(); - self.consume( - Some(TokenKind::RightBrace), - "expected '}' to end an 'if' block", - ); - let else_expr = match &self.current { - Some(token) if token.kind() == TokenKind::Else => { - self.advance(); - match &self.current { - // Allow `else if` without another `{`. 
- Some(token) if token.kind() == TokenKind::If => { - self.advance(); - Some(self.conditional()) - } - _ => { - self.consume( - Some(TokenKind::LeftBrace), - "expected '{' to start an 'else' block", - ); - let else_expr = self.expression(); - self.consume( - Some(TokenKind::RightBrace), - "Expected '}' to end an 'else' block", - ); - Some(else_expr) - } - } - } - _ => None, - }; - let tail = self.previous.as_ref().unwrap().clone(); - self.tree.add_expr(Expr::Conditional( - token, - condition_expr, - then_expr, - else_expr, - tail, - )) - } - - fn unary(&mut self) -> ExprRef { - let token = self.previous.as_ref().unwrap().clone(); - let kind = token.kind(); - let expr = self.expression_with_power(UNARY_POWER); - let op = match kind { - TokenKind::Minus => UnaryOp::Negate, - TokenKind::Bang => UnaryOp::Not, - _ => panic!("unsuitable unary: {:?}: no op", kind), - }; - - self.tree.add_expr(Expr::Unary(op, token, expr)) - } - - fn binary(&mut self, power: u8, left: ExprRef) -> ExprRef { - let token = self.previous.as_ref().unwrap().clone(); - let op = match token.kind() { - TokenKind::Plus => BinaryOp::Add, - TokenKind::Minus => BinaryOp::Subtract, - TokenKind::Star => BinaryOp::Multiply, - TokenKind::Slash => BinaryOp::Divide, - TokenKind::And => BinaryOp::And, - TokenKind::Or => BinaryOp::Or, - _ => panic!("unsuitable binary: {:?}: no op", self.previous), - }; - let right = self.expression_with_power(power + 1); - self.tree.add_expr(Expr::Binary(op, token, left, right)) - } - - fn advance(&mut self) { - self.previous = self.current.take(); - loop { - self.current = self.tokens.next(); - match &self.current { - Some(token) if token.kind() == TokenKind::Error => { - self.error_at_current(token.to_string()) - } - _ => break, - } - } - } - - fn consume(&mut self, kind: Option, error: &str) { - match (&self.current, kind) { - (Some(token), Some(kind)) if token.kind() == kind => self.advance(), - (None, None) => (), - _ => { - self.error_at_current(error); - } - } - } - - 
fn error(&mut self, message: T) - where - T: Into, - { - self.error_at(self.previous.clone(), message) - } - - fn error_at_current(&mut self, message: T) - where - T: Into, - { - self.error_at(self.current.clone(), message) - } - - fn error_at(&mut self, token: Option>, message: T) - where - T: Into, - { - if self.panic_mode { - return; - } - self.panic_mode = true; - - let message: String = message.into(); - let (line, column) = self.tokens.token_position(&token); - let mut final_message = "Error ".to_string(); - match token { - None => final_message.push_str("at end"), - Some(t) => { - if t.kind() != TokenKind::Error { - final_message.push_str("at '"); - final_message.push_str(t.as_str()); - final_message.push_str("'"); - } - } - } - final_message.push_str(": "); - final_message.push_str(&message); - - self.tree - .add_error(SyntaxError::new(line, column, final_message)); - } - - fn trace(&self, _msg: &str) { - // let cpos = self.tokens.token_position(&self.current); - // let ppos = self.tokens.token_position(&self.previous); - - // eprintln!( - // "[{}:{}:{}] [{}:{}:{}]: {msg}", - // ppos.0, - // ppos.1, - // self.previous - // .as_ref() - // .map(|t| t.as_str()) - // .unwrap_or(""), - // cpos.0, - // cpos.1, - // self.current.as_ref().map(|t| t.as_str()).unwrap_or("") - // ); - } -} - -#[cfg(test)] -mod tests { - use super::*; - use pretty_assertions::assert_eq; - - fn test_successful_expression_parse(source: &str, expected: &str, expected_type: Type) { - let (mut tree, expr, lines) = Parser::new(source).parse(); - assert_eq!( - Vec::::new(), - tree.errors, - "Expected successful parse" - ); - assert_eq!( - expected, - tree.dump_expr(&expr), - "The parse structure of the expressions did not match" - ); - - // TODO: 'assert_eq' is probably wrong here - let expr_type = tree.expr_type(&expr, &lines, true); - assert!( - expected_type.compatible_with(&expr_type), - "The type of the expression did not match. 
expected: {expected_type}, actual: {expr_type}" - ); - } - - macro_rules! test_expr { - ($name:ident, $input:expr, $expected:expr, $type:expr) => { - #[test] - fn $name() { - test_successful_expression_parse($input, $expected, $type); - } - }; - } - - test_expr!(number_expr, "12", "12", Type::F64); - test_expr!(add_expr, "1 + 2", "(+ 1 2)", Type::F64); - test_expr!( - prec_expr, - "1 + 2 * 3 - 7 * 7", - "(- (+ 1 (* 2 3)) (* 7 7))", - Type::F64 - ); - test_expr!(unary, "-((23)) * 5", "(* (- 23) 5)", Type::F64); - test_expr!( - strings, - r#" "Hello " + 'world!' "#, - r#"(+ "Hello " 'world!')"#, - Type::String - ); - - test_expr!( - booleans, - "true and false or false and !true", - "(or (and true false) (and false (! true)))", - Type::Bool - ); - - test_expr!( - if_expression, - "if true { 23 } else { 45 }", - "(if true 23 45)", - Type::F64 - ); - // test_expr!( - // if_with_return, - // "if true { 23 } else { return 'nothing' }", - // "", - // Type::F64 - // ); - - // ======================================================================== - // Type Error Tests - // ======================================================================== - - fn test_type_error_expression(source: &str, expected_errors: Vec<&str>) { - let (mut tree, expr, lines) = Parser::new(source).parse(); - assert_eq!( - Vec::::new(), - tree.errors, - "Expected successful parse" - ); - - let expr_type = tree.expr_type(&expr, &lines, true); - assert!(expr_type.is_error()); - - let actual_errors = tree - .errors - .iter() - .map(|e| e.message.as_str()) - .collect::>(); - assert_eq!(expected_errors, actual_errors); - } - - macro_rules! 
test_type_error_expr { - ($name:ident, $input:expr, $($s:expr),+) => { - #[test] - fn $name() { - let expected_errors: Vec<&str> = (vec![$($s),*]); - test_type_error_expression($input, expected_errors); - } - } - } - - test_type_error_expr!( - negate_string, - "-('what?')", - "cannot apply unary operator '-' to expression of type 'string'" - ); - - test_type_error_expr!( - add_string_number, - "'what?' + 5", - "cannot apply binary operator '+' to expressions of type 'string' (on the left) and 'f64' (on the right)" - ); - - test_type_error_expr!( - add_number_string, - "5 + 'what?'", - "cannot apply binary operator '+' to expressions of type 'f64' (on the left) and 'string' (on the right)" - ); - - test_type_error_expr!( - errors_propagate_do_not_duplicate, - "!'hello' / 27 * -('what?') + 23", - "cannot apply unary operator '!' to expression of type 'string'", - "cannot apply unary operator '-' to expression of type 'string'" - ); - - test_type_error_expr!( - if_not_bool, - "if 23 { 1 } else { 2 }", - "the condition of an `if` expression must be a boolean" - ); - - test_type_error_expr!( - if_arm_mismatch, - "if true { 1 } else { '1' }", - "the type of the `then` branch (f64) must match the type of the `else` branch (string)" - ); - - test_type_error_expr!( - if_no_else, - "if true { 1 }", - "this `if` expression must have both a `then` clause and an `else` clause, so it can produce a value" - ); -} diff --git a/oden-script/src/tokens.rs b/oden-script/src/tokens.rs deleted file mode 100644 index c2bccfb9..00000000 --- a/oden-script/src/tokens.rs +++ /dev/null @@ -1,584 +0,0 @@ -#[derive(Debug, PartialEq, Eq, Clone, Copy)] -pub enum TokenKind { - LeftBrace, - RightBrace, - LeftBracket, - RightBracket, - LeftParen, - RightParen, - Comma, - Dot, - Minus, - Plus, - Semicolon, - Slash, - Star, - - Bang, - BangEqual, - Equal, - EqualEqual, - Greater, - GreaterEqual, - Less, - LessEqual, - - Identifier, - String, - Number, - - And, - Async, - Await, - Class, - Else, - 
False, - For, - From, - Fun, - If, - Let, - Or, - Print, - Return, - Select, - This, - True, - While, - Yield, - - Error, -} - -#[derive(Debug, PartialEq, Eq, Clone)] -pub struct Token<'a> { - kind: TokenKind, - start: usize, - value: Result<&'a str, String>, -} - -impl<'a> Token<'a> { - pub fn new(kind: TokenKind, start: usize, value: &'a str) -> Self { - Token { - kind, - start, - value: Ok(value), - } - } - - pub fn error(start: usize, message: String) -> Self { - Token { - kind: TokenKind::Error, - start, - value: Err(message), - } - } - - pub fn start(&self) -> usize { - self.start - } - - pub fn kind(&self) -> TokenKind { - self.kind - } - - pub fn as_str<'b>(&'b self) -> &'a str - where - 'b: 'a, - { - match &self.value { - Ok(v) => v, - Err(e) => &e, - } - } -} - -impl<'a> std::fmt::Display for Token<'a> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{}", self.as_str()) - } -} - -pub struct Lines { - newlines: Vec, - eof: usize, -} - -impl Lines { - fn new(eof: usize) -> Self { - Lines { - newlines: Vec::new(), - eof, - } - } - - /// Record the position of a newline in the source. - pub fn add_line(&mut self, pos: usize) { - self.newlines.push(pos) - } - - /// Return the position of the given token as a (line, column) pair. By - /// convention, lines are 1-based and columns are 0-based. Also, in - /// keeping with the iterator-nature of the tokenizer, `None` here - /// indicates end-of-file, and will return the position of the end of the - /// file. - pub fn token_position(&self, token: &Option) -> (usize, usize) { - let start = match token { - Some(t) => t.start, - None => self.eof, - }; - self.position(start) - } - - /// Return the position of the given character offset as a (line,column) - /// pair. By convention, lines are 1-based and columns are 0-based. 
- pub fn position(&self, offset: usize) -> (usize, usize) { - let line_end_index = match self.newlines.binary_search(&offset) { - Ok(index) => index, - Err(index) => index, - }; - let line_start_pos = if line_end_index == 0 { - 0 - } else { - self.newlines[line_end_index - 1] + 1 - }; - let line_number = line_end_index + 1; - let column_offset = offset - line_start_pos; - (line_number, column_offset) - } -} - -pub struct Tokens<'a> { - source: &'a str, - chars: std::str::CharIndices<'a>, - next_char: Option<(usize, char)>, - lines: Lines, -} - -impl<'a> Tokens<'a> { - pub fn new(source: &'a str) -> Self { - let mut result = Tokens { - source, - chars: source.char_indices(), - next_char: None, - lines: Lines::new(source.len()), - }; - result.advance(); // Prime the pump - result - } - - pub fn lines(self) -> Lines { - self.lines - } - - /// Return the position of the given token as a (line, column) pair. See - /// `Lines::token_position` for more information about the range, etc. - pub fn token_position(&self, token: &Option) -> (usize, usize) { - self.lines.token_position(token) - } - - fn token(&self, start: usize, kind: TokenKind) -> Token<'a> { - let value = &self.source[start..self.pos()]; - Token::new(kind, start, value) - } - - fn number(&mut self, start: usize) -> Token<'a> { - // First, the main part. - loop { - if !self.matches_digit() { - break; - } - } - - // Now the fraction part. - // The thing that is bad here is that this is speculative... - let backup = self.chars.clone(); - if self.matches('.') { - let mut saw_digit = false; - loop { - if self.matches('_') { - } else if self.matches_next(|c| c.is_ascii_digit()) { - saw_digit = true; - } else { - break; - } - } - - if saw_digit { - // OK we're good to here! Check the scientific notation. 
- if self.matches('e') || self.matches('E') { - if self.matches('+') || self.matches('-') {} - let mut saw_digit = false; - loop { - if self.matches('_') { - } else if self.matches_next(|c| c.is_ascii_digit()) { - saw_digit = true; - } else { - break; - } - } - - if !saw_digit { - // This is just a broken number. - let slice = &self.source[start..self.pos()]; - return Token::error( - start, - format!("Invalid floating-point literal: {slice}"), - ); - } - } - } else { - // Might be accessing a member on an integer. - self.chars = backup; - } - } - - self.token(start, TokenKind::Number) - } - - fn string(&mut self, start: usize, delimiter: char) -> Token<'a> { - while !self.matches(delimiter) { - if self.eof() { - return Token::error(start, "Unterminated string constant".to_string()); - } - if self.matches('\\') { - self.advance(); - } else { - self.advance(); - } - } - - self.token(start, TokenKind::String) - } - - fn identifier_token_kind(ident: &str) -> TokenKind { - match ident.chars().nth(0).unwrap() { - 'a' => { - if ident == "and" { - return TokenKind::And; - } - if ident == "async" { - return TokenKind::Async; - } - if ident == "await" { - return TokenKind::Await; - } - } - 'c' => { - if ident == "class" { - return TokenKind::Class; - } - } - 'e' => { - if ident == "else" { - return TokenKind::Else; - } - } - 'f' => { - if ident == "false" { - return TokenKind::False; - } - if ident == "for" { - return TokenKind::For; - } - if ident == "from" { - return TokenKind::From; - } - if ident == "fun" { - return TokenKind::Fun; - } - } - 'i' => { - if ident == "if" { - return TokenKind::If; - } - } - 'l' => { - if ident == "let" { - return TokenKind::Let; - } - } - 'o' => { - if ident == "or" { - return TokenKind::Or; - } - } - 'p' => { - if ident == "print" { - return TokenKind::Print; - } - } - 'r' => { - if ident == "return" { - return TokenKind::Return; - } - } - 's' => { - if ident == "select" { - return TokenKind::Select; - } - } - 't' => { - if ident == "this" 
{ - return TokenKind::This; - } - if ident == "true" { - return TokenKind::True; - } - } - 'w' => { - if ident == "while" { - return TokenKind::While; - } - } - 'y' => { - if ident == "yield" { - return TokenKind::Yield; - } - } - _ => (), - } - - TokenKind::Identifier - } - - fn identifier(&mut self, start: usize) -> Token<'a> { - loop { - // TODO: Use unicode identifier classes instead - if !self.matches_next(|c| c.is_ascii_alphanumeric() || c == '_') { - break; - } - } - - let ident = &self.source[start..self.pos()]; - let kind = Self::identifier_token_kind(ident); - Token::new(kind, start, ident) - } - - fn matches(&mut self, ch: char) -> bool { - if let Some((_, next_ch)) = self.next_char { - if next_ch == ch { - self.advance(); - return true; - } - } - false - } - - fn matches_next(&mut self, f: F) -> bool - where - F: FnOnce(char) -> bool, - { - if let Some((_, next_ch)) = self.next_char { - if f(next_ch) { - self.advance(); - return true; - } - } - false - } - - fn matches_digit(&mut self) -> bool { - self.matches('_') || self.matches_next(|c| c.is_ascii_digit()) - } - - fn advance(&mut self) -> Option<(usize, char)> { - let result = self.next_char; - self.next_char = self.chars.next(); - result - } - - fn pos(&self) -> usize { - match self.next_char { - Some((p, _)) => p, - None => self.source.len(), - } - } - - fn eof(&self) -> bool { - self.next_char.is_none() - } - - fn skip_whitespace(&mut self) { - while let Some((pos, ch)) = self.next_char { - if ch == '\n' { - self.lines.add_line(pos); - } else if !ch.is_whitespace() { - break; - } - self.advance(); - } - } -} - -impl<'a> std::iter::Iterator for Tokens<'a> { - type Item = Token<'a>; - - fn next(&mut self) -> Option { - self.skip_whitespace(); // TODO: Whitespace preserving/comment preserving - let (pos, c) = match self.advance() { - Some((p, c)) => (p, c), - None => return None, - }; - - let token = match c { - '{' => self.token(pos, TokenKind::LeftBrace), - '}' => self.token(pos, 
TokenKind::RightBrace), - '[' => self.token(pos, TokenKind::LeftBracket), - ']' => self.token(pos, TokenKind::RightBracket), - '(' => self.token(pos, TokenKind::LeftParen), - ')' => self.token(pos, TokenKind::RightParen), - ',' => self.token(pos, TokenKind::Comma), - '.' => self.token(pos, TokenKind::Dot), - '-' => self.token(pos, TokenKind::Minus), - '+' => self.token(pos, TokenKind::Plus), - ';' => self.token(pos, TokenKind::Semicolon), - '/' => self.token(pos, TokenKind::Slash), - '*' => self.token(pos, TokenKind::Star), - '!' => { - if self.matches('=') { - self.token(pos, TokenKind::BangEqual) - } else { - self.token(pos, TokenKind::Bang) - } - } - '=' => { - if self.matches('=') { - self.token(pos, TokenKind::EqualEqual) - } else { - self.token(pos, TokenKind::Equal) - } - } - '>' => { - if self.matches('=') { - self.token(pos, TokenKind::GreaterEqual) - } else { - self.token(pos, TokenKind::Greater) - } - } - '<' => { - if self.matches('=') { - self.token(pos, TokenKind::LessEqual) - } else { - self.token(pos, TokenKind::Less) - } - } - '\'' => self.string(pos, '\''), - '"' => self.string(pos, '"'), - _ => { - if c.is_ascii_digit() { - self.number(pos) - } else if c.is_ascii_alphabetic() || c == '_' { - self.identifier(pos) - } else { - Token::error(pos, format!("Unexpected character '{c}'")) - } - } - }; - Some(token) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use pretty_assertions::assert_eq; - - macro_rules! 
test_tokens { - ($name:ident, $input:expr, $($s:expr),+) => { - #[test] - fn $name() { - use TokenKind::*; - let tokens: Vec<_> = Tokens::new($input).collect(); - - let expected: Vec = (vec![$($s),*]) - .into_iter() - .map(|t| Token::new(t.1, t.0, t.2)) - .collect(); - - assert_eq!(expected, tokens); - } - } - } - - test_tokens!( - numbers, - "1 1.0 1.2e7 2.3e+7 3.3E-06 7_6 8.0e_8", - (0, Number, "1"), - (2, Number, "1.0"), - (6, Number, "1.2e7"), - (12, Number, "2.3e+7"), - (19, Number, "3.3E-06"), - (27, Number, "7_6"), - (31, Number, "8.0e_8") - ); - - test_tokens!( - identifiers, - "asdf x _123 a_23 x3a and or yield async await class else false for from", - (0, Identifier, "asdf"), - (5, Identifier, "x"), - (7, Identifier, "_123"), - (12, Identifier, "a_23"), - (17, Identifier, "x3a"), - (21, And, "and"), - (25, Or, "or"), - (28, Yield, "yield"), - (34, Async, "async"), - (40, Await, "await"), - (46, Class, "class"), - (52, Else, "else"), - (57, False, "false"), - (63, For, "for"), - (67, From, "from") - ); - - test_tokens!( - more_keywords, - "fun if let print return select this true while truewhile", - (0, Fun, "fun"), - (4, If, "if"), - (7, Let, "let"), - (11, Print, "print"), - (17, Return, "return"), - (24, Select, "select"), - (31, This, "this"), - (36, True, "true"), - (41, While, "while"), - (47, Identifier, "truewhile") - ); - - test_tokens!( - strings, - r#"'this is a string that\'s great!\r\n' "foo's" 'bar"s' "#, - (0, String, r#"'this is a string that\'s great!\r\n'"#), - (38, String, r#""foo's""#), - (46, String, "'bar\"s'") - ); - - test_tokens!( - symbols, - "{ } ( ) [ ] . ! 
!= < <= > >= = == , - + * / ;", - (0, LeftBrace, "{"), - (2, RightBrace, "}"), - (4, LeftParen, "("), - (6, RightParen, ")"), - (8, LeftBracket, "["), - (10, RightBracket, "]"), - (12, Dot, "."), - (14, Bang, "!"), - (16, BangEqual, "!="), - (19, Less, "<"), - (21, LessEqual, "<="), - (24, Greater, ">"), - (26, GreaterEqual, ">="), - (29, Equal, "="), - (31, EqualEqual, "=="), - (34, Comma, ","), - (36, Minus, "-"), - (38, Plus, "+"), - (40, Star, "*"), - (42, Slash, "/"), - (44, Semicolon, ";") - ); -}