//! oden/fine/src/tokens.rs — token definitions and streaming lexer (Rust).
/// The kind of a lexed token: keywords, punctuation, literals, and the
/// special `EOF`/`Error`/trivia kinds all share this one enum.
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub enum TokenKind {
    EOF,
    Error,
    Whitespace,
    Comment,
    LeftBrace,    // TODO: LeftCurly
    RightBrace,   // TODO: RightCurly
    LeftBracket,  // TODO: LeftSquare
    RightBracket, // TODO: RightSquare
    LeftParen,
    RightParen,
    Comma,
    Dot,
    Minus,
    Plus,
    Semicolon,
    Slash,
    Star,
    Arrow,
    Colon,
    Bang,
    BangEqual,
    Equal,
    EqualEqual,
    Greater,
    GreaterEqual,
    Less,
    LessEqual,
    Identifier,
    String,
    Number,
    And,
    As,
    Async,
    Await,
    Class,
    Else,
    Export,
    False,
    For,
    From,
    Fun,
    If,
    Import,
    In,
    Is,
    Let,
    Match,
    New,
    Or,
    Return,
    Select,
    Selff,
    True,
    Underscore,
    While,
    Yield,
}

// NOTE: Tokens are kinda big (like 40 bytes?) and AFAICT the only way to go
// smaller would be to find some other way to represent the error in an
// error token, but I'm kinda unwilling to do that.
//
/// A single token: its kind plus the half-open byte range `[start, end)` in
/// the source text. `Error` tokens additionally carry a message; for those,
/// `as_str` yields the message instead of the source slice.
#[derive(Debug, PartialEq, Eq, Clone)]
pub struct Token {
    pub kind: TokenKind,
    start: usize,
    end: usize,
    error: Option<Box<str>>,
}

impl Token {
    /// Create a non-error token of `kind` spanning `start..end`.
    pub fn new(kind: TokenKind, start: usize, end: usize) -> Self {
        Token {
            kind,
            start,
            end,
            error: None,
        }
    }

    /// Create an `Error` token spanning `start..end`, carrying `message`.
    pub fn error(start: usize, end: usize, message: String) -> Self {
        Token {
            kind: TokenKind::Error,
            start,
            end,
            error: Some(message.into()),
        }
    }

    /// Byte offset of the first character of the token.
    pub fn start(&self) -> usize {
        self.start
    }

    /// Byte offset one past the last character of the token.
    pub fn end(&self) -> usize {
        self.end
    }

    /// Length of the token's source span, in bytes.
    pub fn len(&self) -> usize {
        self.end() - self.start()
    }

    /// True when the token spans no source text (e.g. the `EOF` token).
    /// (Companion to `len`, per the usual `len`/`is_empty` convention.)
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// The token's text: the stored message for `Error` tokens, otherwise
    /// the slice of `source` this token covers. The result may borrow from
    /// either `self` or `source`, hence the `'b: 'a` bound.
    pub fn as_str<'a, 'b>(&'a self, source: &'b str) -> &'a str
    where
        'b: 'a,
    {
        match &self.error {
            // `&Box<str>` deref-coerces to `&str`; no extra borrow needed.
            Some(error) => error,
            None => &source[self.start()..self.end()],
        }
    }
}
/// Records the byte offsets of newline characters so that byte offsets can
/// later be translated into (line, column) positions.
pub struct Lines {
    /// Strictly increasing byte offsets of every `'\n'` seen.
    newlines: Vec<usize>,
}

impl Lines {
    fn new() -> Self {
        Lines {
            newlines: Vec::new(),
        }
    }

    /// Record the position of a newline in the source.
    pub fn add_line(&mut self, pos: usize) {
        self.newlines.push(pos)
    }

    /// Return the position of the given token as a (line, column) pair. By
    /// convention, lines are 1-based and columns are 0-based.
    pub fn token_position(&self, token: &Token) -> (usize, usize) {
        self.position(token.start)
    }

    /// Return the position of the given character offset as a (line,column)
    /// pair. By convention, lines are 1-based and columns are 0-based.
    pub fn position(&self, offset: usize) -> (usize, usize) {
        // Number of newlines strictly before `offset`; this is also the
        // index of the newline that would terminate the current line.
        let preceding = self.newlines.partition_point(|&nl| nl < offset);
        // The current line starts right after the previous newline (or at
        // the very beginning of the source for the first line).
        let line_start = match preceding {
            0 => 0,
            n => self.newlines[n - 1] + 1,
        };
        (preceding + 1, offset - line_start)
    }
}
pub struct Tokens<'a> {
source: &'a str,
chars: std::str::CharIndices<'a>,
next_char: Option<(usize, char)>,
lines: Lines,
}
impl<'a> Tokens<'a> {
pub fn new(source: &'a str) -> Self {
let mut result = Tokens {
source,
chars: source.char_indices(),
next_char: None,
lines: Lines::new(),
};
result.advance(); // Prime the pump
result
}
pub fn source(&self) -> &'a str {
self.source
}
pub fn lines(self) -> Lines {
self.lines
}
/// Return the position of the given token as a (line, column) pair. See
/// `Lines::token_position` for more information about the range, etc.
pub fn token_position(&self, token: &Token) -> (usize, usize) {
self.lines.token_position(token)
}
fn token(&self, start: usize, kind: TokenKind) -> Token {
Token::new(kind, start, self.pos())
}
fn number(&mut self, start: usize) -> Token {
// First, the main part.
loop {
if !self.matches_digit() {
break;
}
}
// Now the fraction part.
// The thing that is bad here is that this is speculative...
let backup = self.chars.clone();
if self.matches('.') {
let mut saw_digit = false;
loop {
if self.matches('_') {
} else if self.matches_next(|c| c.is_ascii_digit()) {
saw_digit = true;
} else {
break;
}
}
if saw_digit {
// OK we're good to here! Check the scientific notation.
if self.matches('e') || self.matches('E') {
if self.matches('+') || self.matches('-') {}
let mut saw_digit = false;
loop {
if self.matches('_') {
} else if self.matches_next(|c| c.is_ascii_digit()) {
saw_digit = true;
} else {
break;
}
}
if !saw_digit {
// This is just a broken number.
let slice = &self.source[start..self.pos()];
return Token::error(
start,
self.pos(),
format!("Invalid floating-point literal: {slice}"),
);
}
}
} else {
// Might be accessing a member on an integer.
self.chars = backup;
}
}
self.token(start, TokenKind::Number)
}
fn string(&mut self, start: usize, delimiter: char) -> Token {
while !self.matches(delimiter) {
if self.eof() {
return Token::error(
start,
self.pos(),
"Unterminated string constant".to_string(),
);
}
if self.matches('\\') {
self.advance();
} else {
self.advance();
}
}
self.token(start, TokenKind::String)
}
fn identifier_token_kind(ident: &str) -> TokenKind {
match ident.chars().nth(0).unwrap() {
'a' => {
if ident == "and" {
return TokenKind::And;
}
if ident == "as" {
return TokenKind::As;
}
if ident == "async" {
return TokenKind::Async;
}
if ident == "await" {
return TokenKind::Await;
}
}
'c' => {
if ident == "class" {
return TokenKind::Class;
}
}
'e' => {
if ident == "else" {
return TokenKind::Else;
}
if ident == "export" {
return TokenKind::Export;
}
}
'f' => {
if ident == "false" {
return TokenKind::False;
}
if ident == "for" {
return TokenKind::For;
}
if ident == "from" {
return TokenKind::From;
}
if ident == "fun" {
return TokenKind::Fun;
}
}
'i' => {
if ident == "if" {
return TokenKind::If;
}
if ident == "import" {
return TokenKind::Import;
}
if ident == "in" {
return TokenKind::In;
}
if ident == "is" {
return TokenKind::Is;
}
}
'l' => {
if ident == "let" {
return TokenKind::Let;
}
}
'm' => {
if ident == "match" {
return TokenKind::Match;
}
}
'n' => {
if ident == "new" {
return TokenKind::New;
}
}
'o' => {
if ident == "or" {
return TokenKind::Or;
}
}
'r' => {
if ident == "return" {
return TokenKind::Return;
}
}
's' => {
if ident == "select" {
return TokenKind::Select;
}
if ident == "self" {
return TokenKind::Selff;
}
}
't' => {
if ident == "true" {
return TokenKind::True;
}
}
'w' => {
if ident == "while" {
return TokenKind::While;
}
}
'y' => {
if ident == "yield" {
return TokenKind::Yield;
}
}
'_' => {
if ident == "_" {
return TokenKind::Underscore;
}
}
_ => (),
}
TokenKind::Identifier
}
fn identifier(&mut self, start: usize) -> Token {
loop {
// TODO: Use unicode identifier classes instead
if !self.matches_next(|c| c.is_ascii_alphanumeric() || c == '_') {
break;
}
}
let ident = &self.source[start..self.pos()];
let kind = Self::identifier_token_kind(ident);
Token::new(kind, start, self.pos())
}
fn matches(&mut self, ch: char) -> bool {
if let Some((_, next_ch)) = self.next_char {
if next_ch == ch {
self.advance();
return true;
}
}
false
}
fn matches_next<F>(&mut self, f: F) -> bool
where
F: FnOnce(char) -> bool,
{
if let Some((_, next_ch)) = self.next_char {
if f(next_ch) {
self.advance();
return true;
}
}
false
}
fn matches_digit(&mut self) -> bool {
self.matches('_') || self.matches_next(|c| c.is_ascii_digit())
}
fn advance(&mut self) -> Option<(usize, char)> {
let result = self.next_char;
self.next_char = self.chars.next();
result
}
fn pos(&self) -> usize {
match self.next_char {
Some((p, _)) => p,
None => self.source.len(),
}
}
fn eof(&self) -> bool {
self.next_char.is_none()
}
fn whitespace(&mut self, pos: usize) -> Token {
while let Some((pos, ch)) = self.next_char {
if ch == '\n' {
self.lines.add_line(pos);
} else if !ch.is_whitespace() {
break;
}
self.advance();
}
self.token(pos, TokenKind::Whitespace)
}
fn comment(&mut self, pos: usize) -> Token {
while let Some((_, ch)) = self.next_char {
if ch == '\n' {
break;
}
self.advance();
}
self.token(pos, TokenKind::Comment)
}
pub fn next(&mut self) -> Token {
let (pos, c) = match self.advance() {
Some((p, c)) => (p, c),
None => return self.token(self.source.len(), TokenKind::EOF),
};
match c {
' ' | '\t' | '\r' => self.whitespace(pos),
'\n' => {
self.lines.add_line(pos);
self.whitespace(pos)
}
'{' => self.token(pos, TokenKind::LeftBrace),
'}' => self.token(pos, TokenKind::RightBrace),
'[' => self.token(pos, TokenKind::LeftBracket),
']' => self.token(pos, TokenKind::RightBracket),
'(' => self.token(pos, TokenKind::LeftParen),
')' => self.token(pos, TokenKind::RightParen),
',' => self.token(pos, TokenKind::Comma),
'.' => self.token(pos, TokenKind::Dot),
'-' => {
if self.matches('>') {
self.token(pos, TokenKind::Arrow)
} else {
self.token(pos, TokenKind::Minus)
}
}
'+' => self.token(pos, TokenKind::Plus),
':' => self.token(pos, TokenKind::Colon),
';' => self.token(pos, TokenKind::Semicolon),
'/' => {
if self.matches('/') {
self.comment(pos)
} else {
self.token(pos, TokenKind::Slash)
}
}
'*' => self.token(pos, TokenKind::Star),
'!' => {
if self.matches('=') {
self.token(pos, TokenKind::BangEqual)
} else {
self.token(pos, TokenKind::Bang)
}
}
'=' => {
if self.matches('=') {
self.token(pos, TokenKind::EqualEqual)
} else {
self.token(pos, TokenKind::Equal)
}
}
'>' => {
if self.matches('=') {
self.token(pos, TokenKind::GreaterEqual)
} else {
self.token(pos, TokenKind::Greater)
}
}
'<' => {
if self.matches('=') {
self.token(pos, TokenKind::LessEqual)
} else {
self.token(pos, TokenKind::Less)
}
}
'\'' => self.string(pos, '\''),
'"' => self.string(pos, '"'),
_ => {
if c.is_ascii_digit() {
self.number(pos)
} else if c.is_ascii_alphabetic() || c == '_' {
self.identifier(pos)
} else {
Token::error(pos, self.pos(), format!("Unexpected character '{c}'"))
}
}
}
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use pretty_assertions::assert_eq;

    /// Lex `input` to completion and compare the resulting tokens — and
    /// their text, as rendered by `Token::as_str` — against the expected
    /// values. Whitespace tokens are dropped; the trailing EOF token is
    /// kept (it renders as the empty string).
    fn test_tokens_impl(input: &str, expected: Vec<Token>, expected_text: Vec<String>) {
        let mut result = Vec::new();
        let mut result_text = Vec::new();
        let mut tokens = Tokens::new(input);
        let mut is_eof = false;
        while !is_eof {
            let token = tokens.next();
            is_eof = token.kind == TokenKind::EOF;
            if token.kind == TokenKind::Whitespace {
                continue;
            }
            result_text.push(token.as_str(input).to_string());
            result.push(token);
        }
        assert_eq!(expected, result);
        assert_eq!(expected_text, result_text);
    }

    /// Define a test named `$name` that lexes `$input` and expects the
    /// given `(start_offset, kind, text)` tuples, in order. Each token's
    /// end offset is derived from its text length, and an EOF token is
    /// appended automatically.
    macro_rules! test_tokens {
        ($name:ident, $input:expr, $($s:expr),+) => {
            #[test]
            fn $name() {
                use TokenKind::*;
                // Expected tokens: (offset, kind, text) -> Token.
                let mut expected: Vec<Token> = (vec![$($s),*])
                    .into_iter()
                    .map(|t| Token::new(t.1, t.0, t.0 + t.2.len()))
                    .collect();
                expected.push(Token::new(TokenKind::EOF, $input.len(), $input.len()));
                // Expected rendered text, in the same order.
                let mut expected_text: Vec<_> = (vec![$($s),*])
                    .into_iter()
                    .map(|t| t.2.to_string())
                    .collect();
                expected_text.push("".to_string());
                test_tokens_impl($input, expected, expected_text);
            }
        }
    }

    // Number literals: integers, fractions, scientific notation (with
    // optional sign), and `_` separators.
    test_tokens!(
        numbers,
        "1 1.0 1.2e7 2.3e+7 3.3E-06 7_6 8.0e_8",
        (0, Number, "1"),
        (2, Number, "1.0"),
        (6, Number, "1.2e7"),
        (12, Number, "2.3e+7"),
        (19, Number, "3.3E-06"),
        (27, Number, "7_6"),
        (31, Number, "8.0e_8")
    );

    // Plain identifiers vs. keywords starting with the same letters.
    test_tokens!(
        identifiers,
        "asdf x _123 a_23 x3a and or yield async await class else false for from",
        (0, Identifier, "asdf"),
        (5, Identifier, "x"),
        (7, Identifier, "_123"),
        (12, Identifier, "a_23"),
        (17, Identifier, "x3a"),
        (21, And, "and"),
        (25, Or, "or"),
        (28, Yield, "yield"),
        (34, Async, "async"),
        (40, Await, "await"),
        (46, Class, "class"),
        (52, Else, "else"),
        (57, False, "false"),
        (63, For, "for"),
        (67, From, "from")
    );

    // Keywords must match whole identifiers: `truewhile` stays an
    // Identifier.
    test_tokens!(
        more_keywords,
        "fun if import let return select self true while truewhile new",
        (0, Fun, "fun"),
        (4, If, "if"),
        (7, Import, "import"),
        (14, Let, "let"),
        (18, Return, "return"),
        (25, Select, "select"),
        (32, Selff, "self"),
        (37, True, "true"),
        (42, While, "while"),
        (48, Identifier, "truewhile"),
        (58, New, "new")
    );

    // Remaining keywords, including the bare `_` wildcard.
    test_tokens!(
        more_more_keywords,
        "in is match _ as export",
        (0, In, "in"),
        (3, Is, "is"),
        (6, Match, "match"),
        (12, Underscore, "_"),
        (14, As, "as"),
        (17, Export, "export")
    );

    // Strings: both quote styles, backslash escapes, and embedded quotes
    // of the other style. Tokens include the delimiters; escapes are kept
    // verbatim (not decoded).
    test_tokens!(
        strings,
        r#"'this is a string that\'s great!\r\n' "foo's" 'bar"s' "#,
        (0, String, r#"'this is a string that\'s great!\r\n'"#),
        (38, String, r#""foo's""#),
        (46, String, "'bar\"s'")
    );

    // Punctuation, including the two-character operators.
    test_tokens!(
        symbols,
        "{ } ( ) [ ] . ! != < <= > >= = == , - -> + * / ; :",
        (0, LeftBrace, "{"),
        (2, RightBrace, "}"),
        (4, LeftParen, "("),
        (6, RightParen, ")"),
        (8, LeftBracket, "["),
        (10, RightBracket, "]"),
        (12, Dot, "."),
        (14, Bang, "!"),
        (16, BangEqual, "!="),
        (19, Less, "<"),
        (21, LessEqual, "<="),
        (24, Greater, ">"),
        (26, GreaterEqual, ">="),
        (29, Equal, "="),
        (31, EqualEqual, "=="),
        (34, Comma, ","),
        (36, Minus, "-"),
        (38, Arrow, "->"),
        (41, Plus, "+"),
        (43, Star, "*"),
        (45, Slash, "/"),
        (47, Semicolon, ";"),
        (49, Colon, ":")
    );
}