diff --git a/oden-script/Cargo.lock b/oden-script/Cargo.lock index 459de42c..8839cda3 100644 --- a/oden-script/Cargo.lock +++ b/oden-script/Cargo.lock @@ -2,6 +2,31 @@ # It is not intended for manual editing. version = 3 +[[package]] +name = "diff" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56254986775e3233ffa9c4d7d3faaf6d36a2c09d30b20687e9f88bc8bafc16c8" + [[package]] name = "oden-script" version = "0.1.0" +dependencies = [ + "pretty_assertions", +] + +[[package]] +name = "pretty_assertions" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af7cee1a6c8a5b9208b3cb1061f10c0cb689087b3d8ce85fb9d2dd7a29b6ba66" +dependencies = [ + "diff", + "yansi", +] + +[[package]] +name = "yansi" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09041cd90cf85f7f8b2df60c646f853b7f535ce68f85244eb6731cf89fa498ec" diff --git a/oden-script/Cargo.toml b/oden-script/Cargo.toml index 87a1f807..25623ca0 100644 --- a/oden-script/Cargo.toml +++ b/oden-script/Cargo.toml @@ -2,3 +2,6 @@ name = "oden-script" version = "0.1.0" edition = "2021" + +[dev-dependencies] +pretty_assertions = "1.4.0" diff --git a/oden-script/src/lib.rs b/oden-script/src/lib.rs index e0d6d806..4144a208 100644 --- a/oden-script/src/lib.rs +++ b/oden-script/src/lib.rs @@ -1,458 +1,2 @@ -#[derive(Debug)] -pub enum TokenKind<'a> { - LeftBrace, - RightBrace, - LeftBracket, - RightBracket, - LeftParen, - RightParen, - Comma, - Dot, - Minus, - Plus, - Semicolon, - Slash, - Star, - - Bang, - BangEqual, - Equal, - EqualEqual, - Greater, - GreaterEqual, - Less, - LessEqual, - - Identifier(&'a str), // TODO - String(&'a str), - Number(&'a str), - - And, - Async, - Await, - Class, - Else, - False, - For, - From, - Fun, - If, - Let, - Or, - Print, - Return, - Select, - This, - True, - While, - Yield, - - Error(String), -} - -#[derive(Debug)] -pub struct Token<'a> { - kind: 
TokenKind<'a>, - start: usize, -} - -impl<'a> Token<'a> { - pub fn as_str<'b>(&'b self) -> &'a str - where - 'b: 'a, - { - use TokenKind::*; - match &self.kind { - LeftBrace => "{", - RightBrace => "}", - LeftBracket => "[", - RightBracket => "]", - - LeftParen => "(", - RightParen => ")", - Comma => ",", - Dot => ".", - Minus => "-", - - Plus => "+", - Semicolon => ";", - Slash => "/", - Star => "*", - - Bang => "+", - BangEqual => "!=", - Equal => "=", - EqualEqual => "==", - Greater => ">", - GreaterEqual => ">=", - Less => "<", - LessEqual => "<=", - - Identifier(v) => v, - String(v) => v, - Number(v) => v, - - And => "and", - Async => "async", - Await => "await", - Class => "class", - Else => "else", - False => "false", - For => "for", - From => "from", - Fun => "fun", - If => "if", - Let => "let", - Or => "or", - Print => "print", - Return => "return", - Select => "select", - This => "this", - True => "true", - While => "while", - Yield => "yield", - - Error(e) => e, - } - } -} - -pub struct Tokens<'a> { - source: &'a str, - chars: std::str::CharIndices<'a>, - next_char: Option<(usize, char)>, - newlines: Vec, -} - -impl<'a> Tokens<'a> { - pub fn new(source: &'a str) -> Self { - let mut chars = source.char_indices(); - let next_char = chars.next(); - Tokens { - source, - chars, - next_char, - newlines: Vec::new(), - } - } - - pub fn token_position(&self, token: &Token) -> (usize, usize) { - let line_end_index = match self.newlines.binary_search(&token.start) { - Ok(index) => index, - Err(index) => index, - }; - let line_start_pos = if line_end_index == 0 { - 0 - } else { - self.newlines[line_end_index - 1] + 1 - }; - let line_number = line_end_index + 1; - let column_offset = token.start - line_start_pos; - (line_number, column_offset) - } - - pub fn next_token(&mut self) -> Option> { - self.skip_whitespace(); // TODO: Whitespace preserving/comment preserving - let (pos, c) = match self.advance() { - Some((p, c)) => (p, c), - None => return None, - }; - - let 
token = match c { - '{' => TokenKind::LeftBrace, - '}' => TokenKind::RightBrace, - '[' => TokenKind::LeftBracket, - ']' => TokenKind::RightBracket, - '(' => TokenKind::LeftParen, - ')' => TokenKind::RightParen, - ',' => TokenKind::Comma, - '.' => TokenKind::Dot, - '-' => { - if self.matches_next(|c| c.is_ascii_digit()) { - self.number(pos) - } else { - TokenKind::Minus - } - } - '+' => { - if self.matches_next(|c| c.is_ascii_digit()) { - self.number(pos) - } else { - TokenKind::Plus - } - } - ';' => TokenKind::Semicolon, - '/' => TokenKind::Slash, - '*' => TokenKind::Star, - '!' => { - if self.matches('=') { - TokenKind::BangEqual - } else { - TokenKind::Bang - } - } - '=' => { - if self.matches('=') { - TokenKind::EqualEqual - } else { - TokenKind::Equal - } - } - '>' => { - if self.matches('=') { - TokenKind::GreaterEqual - } else { - TokenKind::Greater - } - } - '<' => { - if self.matches('=') { - TokenKind::LessEqual - } else { - TokenKind::Less - } - } - '\'' => self.string(pos, '\''), - '"' => self.string(pos, '"'), - _ => { - if self.matches_next(|c| c.is_ascii_digit()) { - self.number(pos) - } else if self.matches_next(|c| c.is_ascii_alphabetic() || c == '_') { - self.identifier(pos) - } else { - TokenKind::Error(format!("Unexpected character '{c}'")) - } - } - }; - let token = self.token(pos, token); - Some(token) - } - - fn token(&self, start: usize, kind: TokenKind<'a>) -> Token<'a> { - Token { kind, start } - } - - fn number(&mut self, start: usize) -> TokenKind<'a> { - // First, the main part. - loop { - if !self.matches_digit() { - break; - } - } - - // Now the fraction part. - // The thing that is bad here is that this is speculative... - let backup = self.chars.clone(); - if self.matches('.') { - let mut saw_digit = false; - loop { - if self.matches('_') { - } else if self.matches_next(|c| c.is_ascii_digit()) { - saw_digit = true; - } else { - break; - } - } - - if saw_digit { - // OK we're good to here! Check the scientific notation. 
- if self.matches('e') || self.matches('E') { - if self.matches('+') || self.matches('-') {} - let mut saw_digit = false; - loop { - if self.matches('_') { - } else if self.matches_next(|c| c.is_ascii_digit()) { - saw_digit = true; - } else { - break; - } - } - - if !saw_digit { - // This is just a broken number. - let slice = &self.source[start..self.pos()]; - return TokenKind::Error(format!( - "Invalid floating-point literal: {slice}" - )); - } - } - } else { - // Might be accessing a member on an integer. - self.chars = backup; - } - } - - TokenKind::Number(&self.source[start..self.pos()]) - } - - fn string(&mut self, start: usize, delimiter: char) -> TokenKind<'a> { - while !self.matches(delimiter) { - if self.eof() { - return TokenKind::Error("Unterminated string constant".to_string()); - } - if self.matches('\\') { - self.advance(); - } - } - - TokenKind::String(&self.source[start..self.pos()]) - } - - fn identifier(&mut self, start: usize) -> TokenKind<'a> { - loop { - // TODO: Use unicode identifier classes instead - if !self.matches_next(|c| c.is_ascii_alphanumeric() || c == '_') { - break; - } - } - - let ident = &self.source[start..self.pos()]; - match ident.chars().nth(0) { - Some('a') => { - if ident == "and" { - return TokenKind::And; - } - if ident == "async" { - return TokenKind::Async; - } - if ident == "await" { - return TokenKind::Await; - } - } - Some('c') => { - if ident == "class" { - return TokenKind::Class; - } - } - Some('e') => { - if ident == "else" { - return TokenKind::Else; - } - } - Some('f') => { - if ident == "for" { - return TokenKind::For; - } - if ident == "from" { - return TokenKind::From; - } - if ident == "fun" { - return TokenKind::Fun; - } - } - Some('i') => { - if ident == "if" { - return TokenKind::If; - } - } - Some('l') => { - if ident == "let" { - return TokenKind::Let; - } - } - Some('o') => { - if ident == "or" { - return TokenKind::Or; - } - } - Some('p') => { - if ident == "print" { - return TokenKind::Print; - } - 
} - Some('r') => { - if ident == "return" { - return TokenKind::Return; - } - } - Some('s') => { - if ident == "select" { - return TokenKind::Select; - } - } - Some('t') => { - if ident == "this" { - return TokenKind::This; - } - if ident == "true" { - return TokenKind::True; - } - } - Some('w') => { - if ident == "while" { - return TokenKind::While; - } - } - Some('y') => { - if ident == "yield" { - return TokenKind::Yield; - } - } - _ => (), - } - - TokenKind::Identifier(ident) - } - - fn matches(&mut self, ch: char) -> bool { - if let Some((_, next_ch)) = self.next_char { - if next_ch == ch { - self.advance(); - return true; - } - } - false - } - - fn matches_next(&mut self, f: F) -> bool - where - F: FnOnce(char) -> bool, - { - if let Some((_, next_ch)) = self.next_char { - if f(next_ch) { - self.advance(); - return true; - } - } - false - } - - fn matches_digit(&mut self) -> bool { - self.matches('_') || self.matches_next(|c| c.is_ascii_digit()) - } - - fn advance(&mut self) -> Option<(usize, char)> { - let result = self.next_char; - self.next_char = self.chars.next(); - result - } - - fn pos(&self) -> usize { - match self.next_char { - Some((p, _)) => p, - None => self.source.len(), - } - } - - fn eof(&self) -> bool { - self.next_char.is_none() - } - - fn skip_whitespace(&mut self) { - while let Some((pos, ch)) = self.next_char { - if ch == '\n' { - self.newlines.push(pos); - } else if !ch.is_whitespace() { - break; - } - self.advance(); - } - } -} - -pub fn tokenize(input: String) { - let mut tokens = Tokens::new(&input); - while let Some(token) = tokens.next_token() { - println!("{}: {}", token.start, token.as_str()); - } -} +pub mod parser; +pub mod tokens; diff --git a/oden-script/src/main.rs b/oden-script/src/main.rs index 7f158ec7..da0f5d92 100644 --- a/oden-script/src/main.rs +++ b/oden-script/src/main.rs @@ -1,3 +1 @@ -use oden_script; - pub fn main() {} diff --git a/oden-script/src/parser.rs b/oden-script/src/parser.rs new file mode 100644 index 
00000000..a962dcd3 --- /dev/null +++ b/oden-script/src/parser.rs @@ -0,0 +1,906 @@ +use crate::tokens::{Lines, Token, TokenKind, Tokens}; +use std::fmt; + +// TODO: An error should have: +// +// - a start +// - an end +// - a focus +// - descriptive messages +// +// that will have to wait for now +#[derive(PartialEq, Eq)] +pub struct SyntaxError { + pub start: (usize, usize), + pub end: (usize, usize), + pub message: String, +} + +impl SyntaxError { + pub fn new(line: usize, column: usize, message: T) -> Self + where + T: ToString, + { + SyntaxError { + start: (line, column), + end: (line, column), + message: message.to_string(), + } + } + + pub fn new_spanned(start: (usize, usize), end: (usize, usize), message: T) -> Self + where + T: ToString, + { + SyntaxError { + start, + end, + message: message.to_string(), + } + } +} + +impl fmt::Debug for SyntaxError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{self}") + } +} + +impl fmt::Display for SyntaxError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}:{}: {}", self.start.0, self.end.0, self.message) + } +} + +#[derive(Clone)] +pub enum Literal { + Float64(f64), + String(String), + Bool(bool), +} + +#[derive(Copy, Clone)] +pub enum UnaryOp { + Negate, + Not, +} + +#[derive(Copy, Clone)] +pub enum BinaryOp { + Add, + Subtract, + Multiply, + Divide, + And, + Or, +} + +#[derive(Clone)] +pub enum Expr<'a> { + Literal(Literal, Token<'a>), + Unary(UnaryOp, Token<'a>, ExprRef), + Binary(BinaryOp, Token<'a>, ExprRef, ExprRef), + Conditional(Token<'a>, ExprRef, ExprRef, Option, Token<'a>), +} + +#[derive(Clone)] +pub struct ExprRef(Option); + +impl ExprRef { + pub fn error() -> Self { + ExprRef(None) + } +} + +// TODO: Eventually we will be unable to use Eq and PartialEq here, and will +// need to do our own thing. +#[derive(Copy, Clone)] +pub enum Type { + // Signals a type error. 
If you receive this then you know that an error + // has already been reported; if you produce this be sure to also note + // the error in the errors collection. + Error, + + // Signals that the expression has a control-flow side-effect and that no + // value will ever result from this expression. Usually this means + // everything's fine. + Unreachable, + + // TODO: Numeric literals should be implicitly convertible, unlike other + // types. Maybe just "numeric literal" type? + F64, + String, + Bool, +} + +impl Type { + pub fn is_error(&self) -> bool { + match self { + Type::Error => true, + _ => false, + } + } + + pub fn compatible_with(&self, other: &Type) -> bool { + // TODO: This is wrong because of numeric literals etc. + match (self, other) { + (Type::F64, Type::F64) => true, + (Type::String, Type::String) => true, + (Type::Bool, Type::Bool) => true, + (Type::Unreachable, Type::Unreachable) => true, + + // Avoid introducing more errors + (Type::Error, _) => true, + (_, Type::Error) => true, + + (_, _) => false, + } + } +} + +impl std::fmt::Debug for Type { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{self}") + } +} + +impl std::fmt::Display for Type { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + use Type::*; + match self { + Error => write!(f, "<< INTERNAL ERROR >>"), + Unreachable => write!(f, "<< UNREACHABLE >>"), + F64 => write!(f, "f64"), + String => write!(f, "string"), + Bool => write!(f, "bool"), + } + } +} + +pub struct TypeRef(Option); + +pub struct SyntaxTree<'a> { + pub errors: Vec, + expressions: Vec>, +} + +impl<'a> SyntaxTree<'a> { + pub fn new() -> Self { + SyntaxTree { + errors: Vec::new(), + expressions: Vec::new(), + } + } + + pub fn add_error(&mut self, error: SyntaxError) { + self.errors.push(error); + } + + pub fn add_expr(&mut self, expr: Expr<'a>) -> ExprRef { + let index = self.expressions.len(); + self.expressions.push(expr); + ExprRef(Some(index)) + } + + pub fn dump_expr(&self, 
expr: &ExprRef) -> String { + match expr.0 { + Some(idx) => { + let expr = &self.expressions[idx]; + match expr { + Expr::Literal(_, tok) => tok.to_string(), + Expr::Unary(_, tok, e) => { + format!("({tok} {})", self.dump_expr(e)) + } + Expr::Binary(_, tok, l, r) => { + format!("({tok} {} {})", self.dump_expr(l), self.dump_expr(r)) + } + Expr::Conditional(tok, cond, t, e, _) => { + if let Some(e) = e { + format!( + "({tok} {} {} {})", + self.dump_expr(cond), + self.dump_expr(t), + self.dump_expr(e) + ) + } else { + format!("({tok} {} {})", self.dump_expr(cond), self.dump_expr(t)) + } + } + } + } + None => "<|EOF|>".to_string(), + } + } + + pub fn expr_span(&self, expr: &ExprRef) -> Option<(Token<'a>, Token<'a>)> { + let expr = match expr.0 { + Some(idx) => &self.expressions[idx], + None => return None, + }; + + match expr { + Expr::Literal(_, tok) => Some((tok.clone(), tok.clone())), + Expr::Unary(_, tok, arg) => { + let arg = self.expr_span(arg); + match arg { + None => None, + Some((_, end)) => Some((tok.clone(), end)), + } + } + Expr::Binary(_, _, left, right) => { + let left = self.expr_span(left); + let right = self.expr_span(right); + match (left, right) { + (None, _) => None, + (_, None) => None, + (Some((start, _)), Some((_, end))) => Some((start, end)), + } + } + Expr::Conditional(head, _, _, _, tail) => Some((head.clone(), tail.clone())), + } + } + + pub fn expr_type(&mut self, expr: &ExprRef, lines: &Lines, value_required: bool) -> Type { + // TODO: Cache and work on demand? Or is this just fine? + + let exr = expr.clone(); + let expr = match expr.0 { + Some(idx) => &self.expressions[idx], + None => return Type::Error, + }; + match expr { + Expr::Literal(lit, _) => match lit { + Literal::Float64(_) => Type::F64, + Literal::String(_) => Type::String, + Literal::Bool(_) => Type::Bool, + }, + + // Figure out the main thing. Check for a... trait? 
+ Expr::Unary(op, tok, arg) => { + let op = op.clone(); + let arg = arg.clone(); + let tok = tok.clone(); + let arg_type = self.expr_type(&arg, lines, true); + match (op, arg_type) { + (UnaryOp::Negate, Type::F64) => Type::F64, + (UnaryOp::Not, Type::Bool) => Type::Bool, + + // This is dumb and should be punished, probably. + (_, Type::Unreachable) => { + let (line, col) = lines.position(tok.start()); + self.errors.push(SyntaxError::new(line, col, format!("cannot apply a unary operator to something that doesn't yield a value"))); + Type::Error + } + + // Propagate existing errors without additional complaint. + (_, Type::Error) => Type::Error, + + // Missed the whole table, must be an error. + (_, arg_type) => { + let (line, col) = lines.position(tok.start()); + self.errors.push(SyntaxError::new(line, col, format!("cannot apply unary operator '{tok}' to expression of type '{arg_type}'"))); + Type::Error + } + } + } + + Expr::Binary(op, tok, left, right) => { + let op = op.clone(); + let tok = tok.clone(); + let left = left.clone(); + let right = right.clone(); + let left_type = self.expr_type(&left, lines, true); + let right_type = self.expr_type(&right, lines, true); + + match (op, left_type, right_type) { + ( + BinaryOp::Add | BinaryOp::Subtract | BinaryOp::Multiply | BinaryOp::Divide, + Type::F64, + Type::F64, + ) => Type::F64, + + (BinaryOp::Add, Type::String, Type::String) => Type::String, + + (BinaryOp::And | BinaryOp::Or, Type::Bool, Type::Bool) => Type::Bool, + + // This is dumb and should be punished, probably. 
+ (_, _, Type::Unreachable) => { + let (line, col) = lines.position(tok.start()); + self.errors.push(SyntaxError::new( + line, + col, + format!( + "cannot apply '{tok}' to an argument that doesn't yield a value (on the right)" + ), + )); + Type::Error + } + (_, Type::Unreachable, _) => { + let (line, col) = lines.position(tok.start()); + self.errors.push(SyntaxError::new( + line, + col, + format!( + "cannot apply '{tok}' to an argument that doesn't yield a value (on the left)" + ), + )); + Type::Error + } + + // Propagate existing errors without additional complaint. + (_, Type::Error, _) => Type::Error, + (_, _, Type::Error) => Type::Error, + + // Missed the whole table, it must be an error. + (_, left_type, right_type) => { + let (line, col) = lines.position(tok.start()); + self.errors.push(SyntaxError::new(line, col, format!("cannot apply binary operator '{tok}' to expressions of type '{left_type}' (on the left) and '{right_type}' (on the right)"))); + Type::Error + } + } + } + + Expr::Conditional(_, cond, then_exp, else_exp, _) => { + let cond = cond.clone(); + let then_exp = then_exp.clone(); + let else_exp = else_exp.clone(); + + let cond_type = self.expr_type(&cond, lines, true); + let then_type = self.expr_type(&then_exp, lines, value_required); + let else_type = else_exp.map(|e| self.expr_type(&e, lines, value_required)); + if !cond_type.compatible_with(&Type::Bool) { + if !cond_type.is_error() { + let span = self + .expr_span(&cond) + .expect("If the expression has a type it must have a span"); + + let start = lines.position(span.0.start()); + let end = lines.position(span.1.start()); + self.errors.push(SyntaxError::new_spanned( + start, + end, + "the condition of an `if` expression must be a boolean", + )); + } + return Type::Error; + } + + match (then_type, else_type) { + (Type::Error, _) => Type::Error, + (_, Some(Type::Error)) => Type::Error, + + // It's an error to have a missing else branch if the value is required + (_, None) if value_required => { 
+ let span = self + .expr_span(&exr) + .expect("How did I get this far with a broken parse?"); + let start = lines.position(span.0.start()); + let end = lines.position(span.1.start()); + self.errors.push(SyntaxError::new_spanned( + start, + end, + "this `if` expression must have both a `then` clause and an `else` clause, so it can produce a value", + )); + Type::Error + } + + // If the value is required then the branches must be + // compatible, and the type of the expression is the type + // of the `then` branch. + (then_type, Some(else_type)) if value_required => { + if !then_type.compatible_with(&else_type) { + let span = self + .expr_span(&exr) + .expect("How did I get this far with a broken parse?"); + let start = lines.position(span.0.start()); + let end = lines.position(span.1.start()); + self.errors.push(SyntaxError::new_spanned( + start, + end, + format!("the type of the `then` branch ({then_type}) must match the type of the `else` branch ({else_type})"), + )); + Type::Error + } else { + then_type + } + } + + // The value must not be required, just mark this as unreachable. + (_, _) => { + assert!(!value_required); + Type::Unreachable + } + } + } + } + } +} + +// BINDING POWERS. When parsing expressions we only accept expressions that +// meet a minimum binding power. (This is like "precedence" but I just super +// don't like that terminology.) +const ASSIGNMENT_POWER: u8 = 0; // = +const OR_POWER: u8 = 1; // or +const AND_POWER: u8 = 2; // and +const EQUALITY_POWER: u8 = 3; // == != +const COMPARISON_POWER: u8 = 4; // < > <= >= +const TERM_POWER: u8 = 5; // + - +const FACTOR_POWER: u8 = 6; // * / +const UNARY_POWER: u8 = 7; // ! - + +// const CALL_POWER: u8 = 8; // . 
() +// const PRIMARY_POWER: u8 = 9; + +fn token_power<'a>(token: &Option>) -> Option { + let token = match token { + Some(t) => t, + None => return None, + }; + + match token.kind() { + TokenKind::Equal => Some(ASSIGNMENT_POWER), + TokenKind::Or => Some(OR_POWER), + TokenKind::And => Some(AND_POWER), + TokenKind::EqualEqual | TokenKind::BangEqual => Some(EQUALITY_POWER), + TokenKind::Less | TokenKind::Greater | TokenKind::GreaterEqual | TokenKind::LessEqual => { + Some(COMPARISON_POWER) + } + TokenKind::Plus | TokenKind::Minus => Some(TERM_POWER), + TokenKind::Star | TokenKind::Slash => Some(FACTOR_POWER), + _ => None, + } +} + +pub struct Parser<'a> { + tokens: Tokens<'a>, + tree: SyntaxTree<'a>, + current: Option>, + previous: Option>, + + panic_mode: bool, +} + +impl<'a> Parser<'a> { + pub fn new(source: &'a str) -> Self { + let mut parser = Parser { + tokens: Tokens::new(source), + tree: SyntaxTree::new(), + current: None, + previous: None, + panic_mode: false, + }; + parser.advance(); + parser + } + + pub fn parse(mut self) -> (SyntaxTree<'a>, ExprRef, Lines) { + let expr = self.expression(); + self.consume(None, "expected end of expression"); + (self.tree, expr, self.tokens.lines()) + } + + fn expression(&mut self) -> ExprRef { + self.expression_with_power(0) + } + + fn expression_with_power(&mut self, minimum_power: u8) -> ExprRef { + self.trace("expression with power"); + self.advance(); + let mut expr = self.prefix_expression(); + loop { + let power = match token_power(&self.current) { + Some(p) => p, + None => break, // EOF, end of expression? 
+ }; + + if power < minimum_power { + break; + } + + self.advance(); + expr = self.infix_expression(power, expr); + } + expr + } + + fn prefix_expression(&mut self) -> ExprRef { + self.trace("prefix"); + let token = self.previous.as_ref(); + match token { + Some(token) => match token.kind() { + TokenKind::Bang => self.unary(), + TokenKind::LeftParen => self.grouping(), + TokenKind::Number => self.number(), + TokenKind::Minus => self.unary(), + TokenKind::String => self.string(), + + TokenKind::True => self + .tree + .add_expr(Expr::Literal(Literal::Bool(true), token.clone())), + TokenKind::False => self + .tree + .add_expr(Expr::Literal(Literal::Bool(false), token.clone())), + + TokenKind::If => self.conditional(), + + _ => { + self.error("expected an expression"); + ExprRef::error() + } + }, + None => { + self.error("expected an expression"); + ExprRef::error() + } + } + } + + fn infix_expression(&mut self, power: u8, left: ExprRef) -> ExprRef { + self.trace("infix"); + let kind = self.previous.as_ref().unwrap().kind(); + match kind { + TokenKind::Plus + | TokenKind::Minus + | TokenKind::Star + | TokenKind::Slash + | TokenKind::And + | TokenKind::Or => self.binary(power, left), + _ => panic!("Unknown infix operator, dispatch error?"), + } + } + + fn number(&mut self) -> ExprRef { + let token = self.previous.as_ref().unwrap(); + // What kind is it? For now let's just ... make it good. 
+ + let literal = match token.as_str().parse::() { + Ok(v) => Literal::Float64(v), + Err(e) => { + self.error(format!("invalid f64: {e}")); + return ExprRef::error(); + } + }; + + self.tree.add_expr(Expr::Literal(literal, token.clone())) + } + + fn string(&mut self) -> ExprRef { + let token = self.previous.as_ref().unwrap(); + + let mut result = String::new(); + let mut input = token.as_str().chars(); + + assert!(input.next().is_some()); // Delimiter + while let Some(ch) = input.next() { + match ch { + '\\' => match input.next().unwrap() { + 'n' => result.push('\n'), + 'r' => result.push('\r'), + 't' => result.push('\t'), + ch => result.push(ch), + }, + _ => result.push(ch), + } + } + result.pop(); // We pushed the other delimiter on, whoops. + + let literal = Literal::String(result); + self.tree.add_expr(Expr::Literal(literal, token.clone())) + } + + fn grouping(&mut self) -> ExprRef { + let result = self.expression(); + self.consume( + Some(TokenKind::RightParen), + "expected ')' after an expression", + ); + result + } + + fn conditional(&mut self) -> ExprRef { + let token = self.previous.as_ref().unwrap().clone(); + let condition_expr = self.expression(); + self.consume( + Some(TokenKind::LeftBrace), + "expected '{' to start an 'if' block", + ); + let then_expr = self.expression(); + self.consume( + Some(TokenKind::RightBrace), + "expected '}' to end an 'if' block", + ); + let else_expr = match &self.current { + Some(token) if token.kind() == TokenKind::Else => { + self.advance(); + match &self.current { + // Allow `else if` without another `{`. 
+ Some(token) if token.kind() == TokenKind::If => { + self.advance(); + Some(self.conditional()) + } + _ => { + self.consume( + Some(TokenKind::LeftBrace), + "expected '{' to start an 'else' block", + ); + let else_expr = self.expression(); + self.consume( + Some(TokenKind::RightBrace), + "Expected '}' to end an 'else' block", + ); + Some(else_expr) + } + } + } + _ => None, + }; + let tail = self.previous.as_ref().unwrap().clone(); + self.tree.add_expr(Expr::Conditional( + token, + condition_expr, + then_expr, + else_expr, + tail, + )) + } + + fn unary(&mut self) -> ExprRef { + let token = self.previous.as_ref().unwrap().clone(); + let kind = token.kind(); + let expr = self.expression_with_power(UNARY_POWER); + let op = match kind { + TokenKind::Minus => UnaryOp::Negate, + TokenKind::Bang => UnaryOp::Not, + _ => panic!("unsuitable unary: {:?}: no op", kind), + }; + + self.tree.add_expr(Expr::Unary(op, token, expr)) + } + + fn binary(&mut self, power: u8, left: ExprRef) -> ExprRef { + let token = self.previous.as_ref().unwrap().clone(); + let op = match token.kind() { + TokenKind::Plus => BinaryOp::Add, + TokenKind::Minus => BinaryOp::Subtract, + TokenKind::Star => BinaryOp::Multiply, + TokenKind::Slash => BinaryOp::Divide, + TokenKind::And => BinaryOp::And, + TokenKind::Or => BinaryOp::Or, + _ => panic!("unsuitable binary: {:?}: no op", self.previous), + }; + let right = self.expression_with_power(power + 1); + self.tree.add_expr(Expr::Binary(op, token, left, right)) + } + + fn advance(&mut self) { + self.previous = self.current.take(); + loop { + self.current = self.tokens.next(); + match &self.current { + Some(token) if token.kind() == TokenKind::Error => { + self.error_at_current(token.to_string()) + } + _ => break, + } + } + } + + fn consume(&mut self, kind: Option, error: &str) { + match (&self.current, kind) { + (Some(token), Some(kind)) if token.kind() == kind => self.advance(), + (None, None) => (), + _ => { + self.error_at_current(error); + } + } + } + + 
fn error(&mut self, message: T) + where + T: Into, + { + self.error_at(self.previous.clone(), message) + } + + fn error_at_current(&mut self, message: T) + where + T: Into, + { + self.error_at(self.current.clone(), message) + } + + fn error_at(&mut self, token: Option>, message: T) + where + T: Into, + { + if self.panic_mode { + return; + } + self.panic_mode = true; + + let message: String = message.into(); + let (line, column) = self.tokens.token_position(&token); + let mut final_message = "Error ".to_string(); + match token { + None => final_message.push_str("at end"), + Some(t) => { + if t.kind() != TokenKind::Error { + final_message.push_str("at '"); + final_message.push_str(t.as_str()); + final_message.push_str("'"); + } + } + } + final_message.push_str(": "); + final_message.push_str(&message); + + self.tree + .add_error(SyntaxError::new(line, column, final_message)); + } + + fn trace(&self, _msg: &str) { + // let cpos = self.tokens.token_position(&self.current); + // let ppos = self.tokens.token_position(&self.previous); + + // eprintln!( + // "[{}:{}:{}] [{}:{}:{}]: {msg}", + // ppos.0, + // ppos.1, + // self.previous + // .as_ref() + // .map(|t| t.as_str()) + // .unwrap_or(""), + // cpos.0, + // cpos.1, + // self.current.as_ref().map(|t| t.as_str()).unwrap_or("") + // ); + } +} + +#[cfg(test)] +mod tests { + use super::*; + use pretty_assertions::assert_eq; + + fn test_successful_expression_parse(source: &str, expected: &str, expected_type: Type) { + let (mut tree, expr, lines) = Parser::new(source).parse(); + assert_eq!( + Vec::::new(), + tree.errors, + "Expected successful parse" + ); + assert_eq!( + expected, + tree.dump_expr(&expr), + "The parse structure of the expressions did not match" + ); + + // TODO: 'assert_eq' is probably wrong here + let expr_type = tree.expr_type(&expr, &lines, true); + assert!( + expected_type.compatible_with(&expr_type), + "The type of the expression did not match. 
expected: {expected_type}, actual: {expr_type}" + ); + } + + macro_rules! test_expr { + ($name:ident, $input:expr, $expected:expr, $type:expr) => { + #[test] + fn $name() { + test_successful_expression_parse($input, $expected, $type); + } + }; + } + + test_expr!(number_expr, "12", "12", Type::F64); + test_expr!(add_expr, "1 + 2", "(+ 1 2)", Type::F64); + test_expr!( + prec_expr, + "1 + 2 * 3 - 7 * 7", + "(- (+ 1 (* 2 3)) (* 7 7))", + Type::F64 + ); + test_expr!(unary, "-((23)) * 5", "(* (- 23) 5)", Type::F64); + test_expr!( + strings, + r#" "Hello " + 'world!' "#, + r#"(+ "Hello " 'world!')"#, + Type::String + ); + + test_expr!( + booleans, + "true and false or false and !true", + "(or (and true false) (and false (! true)))", + Type::Bool + ); + + test_expr!( + if_expression, + "if true { 23 } else { 45 }", + "(if true 23 45)", + Type::F64 + ); + // test_expr!( + // if_with_return, + // "if true { 23 } else { return 'nothing' }", + // "", + // Type::F64 + // ); + + // ======================================================================== + // Type Error Tests + // ======================================================================== + + fn test_type_error_expression(source: &str, expected_errors: Vec<&str>) { + let (mut tree, expr, lines) = Parser::new(source).parse(); + assert_eq!( + Vec::::new(), + tree.errors, + "Expected successful parse" + ); + + let expr_type = tree.expr_type(&expr, &lines, true); + assert!(expr_type.is_error()); + + let actual_errors = tree + .errors + .iter() + .map(|e| e.message.as_str()) + .collect::>(); + assert_eq!(expected_errors, actual_errors); + } + + macro_rules! 
test_type_error_expr { + ($name:ident, $input:expr, $($s:expr),+) => { + #[test] + fn $name() { + let expected_errors: Vec<&str> = (vec![$($s),*]); + test_type_error_expression($input, expected_errors); + } + } + } + + test_type_error_expr!( + negate_string, + "-('what?')", + "cannot apply unary operator '-' to expression of type 'string'" + ); + + test_type_error_expr!( + add_string_number, + "'what?' + 5", + "cannot apply binary operator '+' to expressions of type 'string' (on the left) and 'f64' (on the right)" + ); + + test_type_error_expr!( + add_number_string, + "5 + 'what?'", + "cannot apply binary operator '+' to expressions of type 'f64' (on the left) and 'string' (on the right)" + ); + + test_type_error_expr!( + errors_propagate_do_not_duplicate, + "!'hello' / 27 * -('what?') + 23", + "cannot apply unary operator '!' to expression of type 'string'", + "cannot apply unary operator '-' to expression of type 'string'" + ); + + test_type_error_expr!( + if_not_bool, + "if 23 { 1 } else { 2 }", + "the condition of an `if` expression must be a boolean" + ); + + test_type_error_expr!( + if_arm_mismatch, + "if true { 1 } else { '1' }", + "the type of the `then` branch (f64) must match the type of the `else` branch (string)" + ); + + test_type_error_expr!( + if_no_else, + "if true { 1 }", + "this `if` expression must have both a `then` clause and an `else` clause, so it can produce a value" + ); +} diff --git a/oden-script/src/tokens.rs b/oden-script/src/tokens.rs new file mode 100644 index 00000000..c2bccfb9 --- /dev/null +++ b/oden-script/src/tokens.rs @@ -0,0 +1,584 @@ +#[derive(Debug, PartialEq, Eq, Clone, Copy)] +pub enum TokenKind { + LeftBrace, + RightBrace, + LeftBracket, + RightBracket, + LeftParen, + RightParen, + Comma, + Dot, + Minus, + Plus, + Semicolon, + Slash, + Star, + + Bang, + BangEqual, + Equal, + EqualEqual, + Greater, + GreaterEqual, + Less, + LessEqual, + + Identifier, + String, + Number, + + And, + Async, + Await, + Class, + Else, + False, + 
For, + From, + Fun, + If, + Let, + Or, + Print, + Return, + Select, + This, + True, + While, + Yield, + + Error, +} + +#[derive(Debug, PartialEq, Eq, Clone)] +pub struct Token<'a> { + kind: TokenKind, + start: usize, + value: Result<&'a str, String>, +} + +impl<'a> Token<'a> { + pub fn new(kind: TokenKind, start: usize, value: &'a str) -> Self { + Token { + kind, + start, + value: Ok(value), + } + } + + pub fn error(start: usize, message: String) -> Self { + Token { + kind: TokenKind::Error, + start, + value: Err(message), + } + } + + pub fn start(&self) -> usize { + self.start + } + + pub fn kind(&self) -> TokenKind { + self.kind + } + + pub fn as_str<'b>(&'b self) -> &'a str + where + 'b: 'a, + { + match &self.value { + Ok(v) => v, + Err(e) => &e, + } + } +} + +impl<'a> std::fmt::Display for Token<'a> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.as_str()) + } +} + +pub struct Lines { + newlines: Vec, + eof: usize, +} + +impl Lines { + fn new(eof: usize) -> Self { + Lines { + newlines: Vec::new(), + eof, + } + } + + /// Record the position of a newline in the source. + pub fn add_line(&mut self, pos: usize) { + self.newlines.push(pos) + } + + /// Return the position of the given token as a (line, column) pair. By + /// convention, lines are 1-based and columns are 0-based. Also, in + /// keeping with the iterator-nature of the tokenizer, `None` here + /// indicates end-of-file, and will return the position of the end of the + /// file. + pub fn token_position(&self, token: &Option) -> (usize, usize) { + let start = match token { + Some(t) => t.start, + None => self.eof, + }; + self.position(start) + } + + /// Return the position of the given character offset as a (line,column) + /// pair. By convention, lines are 1-based and columns are 0-based. 
+ pub fn position(&self, offset: usize) -> (usize, usize) { + let line_end_index = match self.newlines.binary_search(&offset) { + Ok(index) => index, + Err(index) => index, + }; + let line_start_pos = if line_end_index == 0 { + 0 + } else { + self.newlines[line_end_index - 1] + 1 + }; + let line_number = line_end_index + 1; + let column_offset = offset - line_start_pos; + (line_number, column_offset) + } +} + +pub struct Tokens<'a> { + source: &'a str, + chars: std::str::CharIndices<'a>, + next_char: Option<(usize, char)>, + lines: Lines, +} + +impl<'a> Tokens<'a> { + pub fn new(source: &'a str) -> Self { + let mut result = Tokens { + source, + chars: source.char_indices(), + next_char: None, + lines: Lines::new(source.len()), + }; + result.advance(); // Prime the pump + result + } + + pub fn lines(self) -> Lines { + self.lines + } + + /// Return the position of the given token as a (line, column) pair. See + /// `Lines::token_position` for more information about the range, etc. + pub fn token_position(&self, token: &Option) -> (usize, usize) { + self.lines.token_position(token) + } + + fn token(&self, start: usize, kind: TokenKind) -> Token<'a> { + let value = &self.source[start..self.pos()]; + Token::new(kind, start, value) + } + + fn number(&mut self, start: usize) -> Token<'a> { + // First, the main part. + loop { + if !self.matches_digit() { + break; + } + } + + // Now the fraction part. + // The thing that is bad here is that this is speculative... + let backup = self.chars.clone(); + if self.matches('.') { + let mut saw_digit = false; + loop { + if self.matches('_') { + } else if self.matches_next(|c| c.is_ascii_digit()) { + saw_digit = true; + } else { + break; + } + } + + if saw_digit { + // OK we're good to here! Check the scientific notation. 
+ if self.matches('e') || self.matches('E') { + if self.matches('+') || self.matches('-') {} + let mut saw_digit = false; + loop { + if self.matches('_') { + } else if self.matches_next(|c| c.is_ascii_digit()) { + saw_digit = true; + } else { + break; + } + } + + if !saw_digit { + // This is just a broken number. + let slice = &self.source[start..self.pos()]; + return Token::error( + start, + format!("Invalid floating-point literal: {slice}"), + ); + } + } + } else { + // Might be accessing a member on an integer. + self.chars = backup; + } + } + + self.token(start, TokenKind::Number) + } + + fn string(&mut self, start: usize, delimiter: char) -> Token<'a> { + while !self.matches(delimiter) { + if self.eof() { + return Token::error(start, "Unterminated string constant".to_string()); + } + if self.matches('\\') { + self.advance(); + } else { + self.advance(); + } + } + + self.token(start, TokenKind::String) + } + + fn identifier_token_kind(ident: &str) -> TokenKind { + match ident.chars().nth(0).unwrap() { + 'a' => { + if ident == "and" { + return TokenKind::And; + } + if ident == "async" { + return TokenKind::Async; + } + if ident == "await" { + return TokenKind::Await; + } + } + 'c' => { + if ident == "class" { + return TokenKind::Class; + } + } + 'e' => { + if ident == "else" { + return TokenKind::Else; + } + } + 'f' => { + if ident == "false" { + return TokenKind::False; + } + if ident == "for" { + return TokenKind::For; + } + if ident == "from" { + return TokenKind::From; + } + if ident == "fun" { + return TokenKind::Fun; + } + } + 'i' => { + if ident == "if" { + return TokenKind::If; + } + } + 'l' => { + if ident == "let" { + return TokenKind::Let; + } + } + 'o' => { + if ident == "or" { + return TokenKind::Or; + } + } + 'p' => { + if ident == "print" { + return TokenKind::Print; + } + } + 'r' => { + if ident == "return" { + return TokenKind::Return; + } + } + 's' => { + if ident == "select" { + return TokenKind::Select; + } + } + 't' => { + if ident == "this" 
{ + return TokenKind::This; + } + if ident == "true" { + return TokenKind::True; + } + } + 'w' => { + if ident == "while" { + return TokenKind::While; + } + } + 'y' => { + if ident == "yield" { + return TokenKind::Yield; + } + } + _ => (), + } + + TokenKind::Identifier + } + + fn identifier(&mut self, start: usize) -> Token<'a> { + loop { + // TODO: Use unicode identifier classes instead + if !self.matches_next(|c| c.is_ascii_alphanumeric() || c == '_') { + break; + } + } + + let ident = &self.source[start..self.pos()]; + let kind = Self::identifier_token_kind(ident); + Token::new(kind, start, ident) + } + + fn matches(&mut self, ch: char) -> bool { + if let Some((_, next_ch)) = self.next_char { + if next_ch == ch { + self.advance(); + return true; + } + } + false + } + + fn matches_next(&mut self, f: F) -> bool + where + F: FnOnce(char) -> bool, + { + if let Some((_, next_ch)) = self.next_char { + if f(next_ch) { + self.advance(); + return true; + } + } + false + } + + fn matches_digit(&mut self) -> bool { + self.matches('_') || self.matches_next(|c| c.is_ascii_digit()) + } + + fn advance(&mut self) -> Option<(usize, char)> { + let result = self.next_char; + self.next_char = self.chars.next(); + result + } + + fn pos(&self) -> usize { + match self.next_char { + Some((p, _)) => p, + None => self.source.len(), + } + } + + fn eof(&self) -> bool { + self.next_char.is_none() + } + + fn skip_whitespace(&mut self) { + while let Some((pos, ch)) = self.next_char { + if ch == '\n' { + self.lines.add_line(pos); + } else if !ch.is_whitespace() { + break; + } + self.advance(); + } + } +} + +impl<'a> std::iter::Iterator for Tokens<'a> { + type Item = Token<'a>; + + fn next(&mut self) -> Option { + self.skip_whitespace(); // TODO: Whitespace preserving/comment preserving + let (pos, c) = match self.advance() { + Some((p, c)) => (p, c), + None => return None, + }; + + let token = match c { + '{' => self.token(pos, TokenKind::LeftBrace), + '}' => self.token(pos, 
TokenKind::RightBrace), + '[' => self.token(pos, TokenKind::LeftBracket), + ']' => self.token(pos, TokenKind::RightBracket), + '(' => self.token(pos, TokenKind::LeftParen), + ')' => self.token(pos, TokenKind::RightParen), + ',' => self.token(pos, TokenKind::Comma), + '.' => self.token(pos, TokenKind::Dot), + '-' => self.token(pos, TokenKind::Minus), + '+' => self.token(pos, TokenKind::Plus), + ';' => self.token(pos, TokenKind::Semicolon), + '/' => self.token(pos, TokenKind::Slash), + '*' => self.token(pos, TokenKind::Star), + '!' => { + if self.matches('=') { + self.token(pos, TokenKind::BangEqual) + } else { + self.token(pos, TokenKind::Bang) + } + } + '=' => { + if self.matches('=') { + self.token(pos, TokenKind::EqualEqual) + } else { + self.token(pos, TokenKind::Equal) + } + } + '>' => { + if self.matches('=') { + self.token(pos, TokenKind::GreaterEqual) + } else { + self.token(pos, TokenKind::Greater) + } + } + '<' => { + if self.matches('=') { + self.token(pos, TokenKind::LessEqual) + } else { + self.token(pos, TokenKind::Less) + } + } + '\'' => self.string(pos, '\''), + '"' => self.string(pos, '"'), + _ => { + if c.is_ascii_digit() { + self.number(pos) + } else if c.is_ascii_alphabetic() || c == '_' { + self.identifier(pos) + } else { + Token::error(pos, format!("Unexpected character '{c}'")) + } + } + }; + Some(token) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use pretty_assertions::assert_eq; + + macro_rules! 
test_tokens { + ($name:ident, $input:expr, $($s:expr),+) => { + #[test] + fn $name() { + use TokenKind::*; + let tokens: Vec<_> = Tokens::new($input).collect(); + + let expected: Vec = (vec![$($s),*]) + .into_iter() + .map(|t| Token::new(t.1, t.0, t.2)) + .collect(); + + assert_eq!(expected, tokens); + } + } + } + + test_tokens!( + numbers, + "1 1.0 1.2e7 2.3e+7 3.3E-06 7_6 8.0e_8", + (0, Number, "1"), + (2, Number, "1.0"), + (6, Number, "1.2e7"), + (12, Number, "2.3e+7"), + (19, Number, "3.3E-06"), + (27, Number, "7_6"), + (31, Number, "8.0e_8") + ); + + test_tokens!( + identifiers, + "asdf x _123 a_23 x3a and or yield async await class else false for from", + (0, Identifier, "asdf"), + (5, Identifier, "x"), + (7, Identifier, "_123"), + (12, Identifier, "a_23"), + (17, Identifier, "x3a"), + (21, And, "and"), + (25, Or, "or"), + (28, Yield, "yield"), + (34, Async, "async"), + (40, Await, "await"), + (46, Class, "class"), + (52, Else, "else"), + (57, False, "false"), + (63, For, "for"), + (67, From, "from") + ); + + test_tokens!( + more_keywords, + "fun if let print return select this true while truewhile", + (0, Fun, "fun"), + (4, If, "if"), + (7, Let, "let"), + (11, Print, "print"), + (17, Return, "return"), + (24, Select, "select"), + (31, This, "this"), + (36, True, "true"), + (41, While, "while"), + (47, Identifier, "truewhile") + ); + + test_tokens!( + strings, + r#"'this is a string that\'s great!\r\n' "foo's" 'bar"s' "#, + (0, String, r#"'this is a string that\'s great!\r\n'"#), + (38, String, r#""foo's""#), + (46, String, "'bar\"s'") + ); + + test_tokens!( + symbols, + "{ } ( ) [ ] . ! 
!= < <= > >= = == , - + * / ;", + (0, LeftBrace, "{"), + (2, RightBrace, "}"), + (4, LeftParen, "("), + (6, RightParen, ")"), + (8, LeftBracket, "["), + (10, RightBracket, "]"), + (12, Dot, "."), + (14, Bang, "!"), + (16, BangEqual, "!="), + (19, Less, "<"), + (21, LessEqual, "<="), + (24, Greater, ">"), + (26, GreaterEqual, ">="), + (29, Equal, "="), + (31, EqualEqual, "=="), + (34, Comma, ","), + (36, Minus, "-"), + (38, Plus, "+"), + (40, Star, "*"), + (42, Slash, "/"), + (44, Semicolon, ";") + ); +}