690 lines
18 KiB
Rust
690 lines
18 KiB
Rust
/// The lexical category of a [`Token`].
///
/// The enum is a plain tag: it is `Copy`, comparable, and hashable so it can
/// be used directly as a key in lookup tables (for example, parser dispatch
/// tables keyed by token kind).
#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy)]
pub enum TokenKind {
    /// End of input.
    EOF,
    /// A lexing error.
    Error,

    // Trivia.
    /// A run of whitespace.
    Whitespace,
    /// A `//` comment.
    Comment,

    // Single-character punctuation.
    /// `{`
    LeftBrace, // TODO: LeftCurly
    /// `}`
    RightBrace, // TODO: RightCurly
    /// `[`
    LeftBracket, // TODO: LeftSquare
    /// `]`
    RightBracket, // TODO: RightSquare
    /// `(`
    LeftParen,
    /// `)`
    RightParen,
    /// `,`
    Comma,
    /// `.`
    Dot,
    /// `-`
    Minus,
    /// `+`
    Plus,
    /// `;`
    Semicolon,
    /// `/`
    Slash,
    /// `*`
    Star,

    /// `->`
    Arrow,
    /// `:`
    Colon,

    // Operators that are one or two characters long.
    /// `!`
    Bang,
    /// `!=`
    BangEqual,
    /// `=`
    Equal,
    /// `==`
    EqualEqual,
    /// `>`
    Greater,
    /// `>=`
    GreaterEqual,
    /// `<`
    Less,
    /// `<=`
    LessEqual,

    // Literals and names.
    /// A name that is not one of the keywords below.
    Identifier,
    /// A quoted string literal.
    String,
    /// A numeric literal.
    Number,

    // Keywords.
    /// `and`
    And,
    /// `as`
    As,
    /// `async`
    Async,
    /// `await`
    Await,
    /// `class`
    Class,
    /// `else`
    Else,
    /// `export`
    Export,
    /// `false`
    False,
    /// `for`
    For,
    /// `from`
    From,
    /// `fun`
    Fun,
    /// `if`
    If,
    /// `import`
    Import,
    /// `in`
    In,
    /// `is`
    Is,
    /// `let`
    Let,
    /// `match`
    Match,
    /// `new`
    New,
    /// `or`
    Or,
    /// `return`
    Return,
    /// `select`
    Select,
    /// `self` (spelled `Selff` because `Self` is a reserved word in Rust).
    Selff,
    /// `true`
    True,
    /// A lone `_`.
    Underscore,
    /// `while`
    While,
    /// `yield`
    Yield,
}
|
|
|
|
// NOTE: Tokens are kinda big (like 40 bytes?) and AFAICT the only way to go
|
|
// smaller would be to find some other way to represent the error in an
|
|
// error token, but I'm kinda unwilling to do that.
|
|
//
|
|
#[derive(Debug, PartialEq, Eq, Clone)]
|
|
pub struct Token {
|
|
pub kind: TokenKind,
|
|
start: usize,
|
|
end: usize,
|
|
error: Option<Box<str>>,
|
|
}
|
|
|
|
impl Token {
|
|
pub fn new(kind: TokenKind, start: usize, end: usize) -> Self {
|
|
Token {
|
|
kind,
|
|
start,
|
|
end,
|
|
error: None,
|
|
}
|
|
}
|
|
|
|
pub fn error(start: usize, end: usize, message: String) -> Self {
|
|
Token {
|
|
kind: TokenKind::Error,
|
|
start,
|
|
end,
|
|
error: Some(message.into()),
|
|
}
|
|
}
|
|
|
|
pub fn start(&self) -> usize {
|
|
self.start
|
|
}
|
|
|
|
pub fn end(&self) -> usize {
|
|
self.end
|
|
}
|
|
|
|
pub fn len(&self) -> usize {
|
|
self.end() - self.start()
|
|
}
|
|
|
|
pub fn as_str<'a, 'b>(&'a self, source: &'b str) -> &'a str
|
|
where
|
|
'b: 'a,
|
|
{
|
|
if let Some(error) = &self.error {
|
|
&error
|
|
} else {
|
|
&source[self.start()..self.end()]
|
|
}
|
|
}
|
|
}
|
|
|
|
pub struct Lines {
|
|
newlines: Vec<usize>,
|
|
}
|
|
|
|
impl Lines {
|
|
fn new() -> Self {
|
|
Lines {
|
|
newlines: Vec::new(),
|
|
}
|
|
}
|
|
|
|
/// Record the position of a newline in the source.
|
|
pub fn add_line(&mut self, pos: usize) {
|
|
self.newlines.push(pos)
|
|
}
|
|
|
|
/// Return the position of the given token as a (line, column) pair. By
|
|
/// convention, lines are 1-based and columns are 0-based.
|
|
pub fn token_position(&self, token: &Token) -> (usize, usize) {
|
|
self.position(token.start)
|
|
}
|
|
|
|
/// Return the position of the given character offset as a (line,column)
|
|
/// pair. By convention, lines are 1-based and columns are 0-based.
|
|
pub fn position(&self, offset: usize) -> (usize, usize) {
|
|
let line_end_index = match self.newlines.binary_search(&offset) {
|
|
Ok(index) => index,
|
|
Err(index) => index,
|
|
};
|
|
let line_start_pos = if line_end_index == 0 {
|
|
0
|
|
} else {
|
|
self.newlines[line_end_index - 1] + 1
|
|
};
|
|
let line_number = line_end_index + 1;
|
|
let column_offset = offset - line_start_pos;
|
|
(line_number, column_offset)
|
|
}
|
|
}
|
|
|
|
pub struct Tokens<'a> {
|
|
source: &'a str,
|
|
chars: std::str::CharIndices<'a>,
|
|
next_char: Option<(usize, char)>,
|
|
lines: Lines,
|
|
}
|
|
|
|
impl<'a> Tokens<'a> {
|
|
pub fn new(source: &'a str) -> Self {
|
|
let mut result = Tokens {
|
|
source,
|
|
chars: source.char_indices(),
|
|
next_char: None,
|
|
lines: Lines::new(),
|
|
};
|
|
result.advance(); // Prime the pump
|
|
result
|
|
}
|
|
|
|
pub fn source(&self) -> &'a str {
|
|
self.source
|
|
}
|
|
|
|
pub fn lines(self) -> Lines {
|
|
self.lines
|
|
}
|
|
|
|
/// Return the position of the given token as a (line, column) pair. See
|
|
/// `Lines::token_position` for more information about the range, etc.
|
|
pub fn token_position(&self, token: &Token) -> (usize, usize) {
|
|
self.lines.token_position(token)
|
|
}
|
|
|
|
fn token(&self, start: usize, kind: TokenKind) -> Token {
|
|
Token::new(kind, start, self.pos())
|
|
}
|
|
|
|
fn number(&mut self, start: usize) -> Token {
|
|
// First, the main part.
|
|
loop {
|
|
if !self.matches_digit() {
|
|
break;
|
|
}
|
|
}
|
|
|
|
// Now the fraction part.
|
|
// The thing that is bad here is that this is speculative...
|
|
let backup = self.chars.clone();
|
|
if self.matches('.') {
|
|
let mut saw_digit = false;
|
|
loop {
|
|
if self.matches('_') {
|
|
} else if self.matches_next(|c| c.is_ascii_digit()) {
|
|
saw_digit = true;
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
|
|
if saw_digit {
|
|
// OK we're good to here! Check the scientific notation.
|
|
if self.matches('e') || self.matches('E') {
|
|
if self.matches('+') || self.matches('-') {}
|
|
let mut saw_digit = false;
|
|
loop {
|
|
if self.matches('_') {
|
|
} else if self.matches_next(|c| c.is_ascii_digit()) {
|
|
saw_digit = true;
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
|
|
if !saw_digit {
|
|
// This is just a broken number.
|
|
let slice = &self.source[start..self.pos()];
|
|
return Token::error(
|
|
start,
|
|
self.pos(),
|
|
format!("Invalid floating-point literal: {slice}"),
|
|
);
|
|
}
|
|
}
|
|
} else {
|
|
// Might be accessing a member on an integer.
|
|
self.chars = backup;
|
|
}
|
|
}
|
|
|
|
self.token(start, TokenKind::Number)
|
|
}
|
|
|
|
fn string(&mut self, start: usize, delimiter: char) -> Token {
|
|
while !self.matches(delimiter) {
|
|
if self.eof() {
|
|
return Token::error(
|
|
start,
|
|
self.pos(),
|
|
"Unterminated string constant".to_string(),
|
|
);
|
|
}
|
|
if self.matches('\\') {
|
|
self.advance();
|
|
} else {
|
|
self.advance();
|
|
}
|
|
}
|
|
|
|
self.token(start, TokenKind::String)
|
|
}
|
|
|
|
fn identifier_token_kind(ident: &str) -> TokenKind {
|
|
match ident.chars().nth(0).unwrap() {
|
|
'a' => {
|
|
if ident == "and" {
|
|
return TokenKind::And;
|
|
}
|
|
if ident == "as" {
|
|
return TokenKind::As;
|
|
}
|
|
if ident == "async" {
|
|
return TokenKind::Async;
|
|
}
|
|
if ident == "await" {
|
|
return TokenKind::Await;
|
|
}
|
|
}
|
|
'c' => {
|
|
if ident == "class" {
|
|
return TokenKind::Class;
|
|
}
|
|
}
|
|
'e' => {
|
|
if ident == "else" {
|
|
return TokenKind::Else;
|
|
}
|
|
if ident == "export" {
|
|
return TokenKind::Export;
|
|
}
|
|
}
|
|
'f' => {
|
|
if ident == "false" {
|
|
return TokenKind::False;
|
|
}
|
|
if ident == "for" {
|
|
return TokenKind::For;
|
|
}
|
|
if ident == "from" {
|
|
return TokenKind::From;
|
|
}
|
|
if ident == "fun" {
|
|
return TokenKind::Fun;
|
|
}
|
|
}
|
|
'i' => {
|
|
if ident == "if" {
|
|
return TokenKind::If;
|
|
}
|
|
if ident == "import" {
|
|
return TokenKind::Import;
|
|
}
|
|
if ident == "in" {
|
|
return TokenKind::In;
|
|
}
|
|
if ident == "is" {
|
|
return TokenKind::Is;
|
|
}
|
|
}
|
|
'l' => {
|
|
if ident == "let" {
|
|
return TokenKind::Let;
|
|
}
|
|
}
|
|
'm' => {
|
|
if ident == "match" {
|
|
return TokenKind::Match;
|
|
}
|
|
}
|
|
'n' => {
|
|
if ident == "new" {
|
|
return TokenKind::New;
|
|
}
|
|
}
|
|
'o' => {
|
|
if ident == "or" {
|
|
return TokenKind::Or;
|
|
}
|
|
}
|
|
'r' => {
|
|
if ident == "return" {
|
|
return TokenKind::Return;
|
|
}
|
|
}
|
|
's' => {
|
|
if ident == "select" {
|
|
return TokenKind::Select;
|
|
}
|
|
if ident == "self" {
|
|
return TokenKind::Selff;
|
|
}
|
|
}
|
|
't' => {
|
|
if ident == "true" {
|
|
return TokenKind::True;
|
|
}
|
|
}
|
|
'w' => {
|
|
if ident == "while" {
|
|
return TokenKind::While;
|
|
}
|
|
}
|
|
'y' => {
|
|
if ident == "yield" {
|
|
return TokenKind::Yield;
|
|
}
|
|
}
|
|
'_' => {
|
|
if ident == "_" {
|
|
return TokenKind::Underscore;
|
|
}
|
|
}
|
|
_ => (),
|
|
}
|
|
|
|
TokenKind::Identifier
|
|
}
|
|
|
|
fn identifier(&mut self, start: usize) -> Token {
|
|
loop {
|
|
// TODO: Use unicode identifier classes instead
|
|
if !self.matches_next(|c| c.is_ascii_alphanumeric() || c == '_') {
|
|
break;
|
|
}
|
|
}
|
|
|
|
let ident = &self.source[start..self.pos()];
|
|
let kind = Self::identifier_token_kind(ident);
|
|
Token::new(kind, start, self.pos())
|
|
}
|
|
|
|
fn matches(&mut self, ch: char) -> bool {
|
|
if let Some((_, next_ch)) = self.next_char {
|
|
if next_ch == ch {
|
|
self.advance();
|
|
return true;
|
|
}
|
|
}
|
|
false
|
|
}
|
|
|
|
fn matches_next<F>(&mut self, f: F) -> bool
|
|
where
|
|
F: FnOnce(char) -> bool,
|
|
{
|
|
if let Some((_, next_ch)) = self.next_char {
|
|
if f(next_ch) {
|
|
self.advance();
|
|
return true;
|
|
}
|
|
}
|
|
false
|
|
}
|
|
|
|
fn matches_digit(&mut self) -> bool {
|
|
self.matches('_') || self.matches_next(|c| c.is_ascii_digit())
|
|
}
|
|
|
|
fn advance(&mut self) -> Option<(usize, char)> {
|
|
let result = self.next_char;
|
|
self.next_char = self.chars.next();
|
|
result
|
|
}
|
|
|
|
fn pos(&self) -> usize {
|
|
match self.next_char {
|
|
Some((p, _)) => p,
|
|
None => self.source.len(),
|
|
}
|
|
}
|
|
|
|
fn eof(&self) -> bool {
|
|
self.next_char.is_none()
|
|
}
|
|
|
|
fn whitespace(&mut self, pos: usize) -> Token {
|
|
while let Some((pos, ch)) = self.next_char {
|
|
if ch == '\n' {
|
|
self.lines.add_line(pos);
|
|
} else if !ch.is_whitespace() {
|
|
break;
|
|
}
|
|
self.advance();
|
|
}
|
|
self.token(pos, TokenKind::Whitespace)
|
|
}
|
|
|
|
fn comment(&mut self, pos: usize) -> Token {
|
|
while let Some((_, ch)) = self.next_char {
|
|
if ch == '\n' {
|
|
break;
|
|
}
|
|
self.advance();
|
|
}
|
|
self.token(pos, TokenKind::Comment)
|
|
}
|
|
|
|
pub fn next(&mut self) -> Token {
|
|
let (pos, c) = match self.advance() {
|
|
Some((p, c)) => (p, c),
|
|
None => return self.token(self.source.len(), TokenKind::EOF),
|
|
};
|
|
|
|
match c {
|
|
' ' | '\t' | '\r' => self.whitespace(pos),
|
|
'\n' => {
|
|
self.lines.add_line(pos);
|
|
self.whitespace(pos)
|
|
}
|
|
'{' => self.token(pos, TokenKind::LeftBrace),
|
|
'}' => self.token(pos, TokenKind::RightBrace),
|
|
'[' => self.token(pos, TokenKind::LeftBracket),
|
|
']' => self.token(pos, TokenKind::RightBracket),
|
|
'(' => self.token(pos, TokenKind::LeftParen),
|
|
')' => self.token(pos, TokenKind::RightParen),
|
|
',' => self.token(pos, TokenKind::Comma),
|
|
'.' => self.token(pos, TokenKind::Dot),
|
|
'-' => {
|
|
if self.matches('>') {
|
|
self.token(pos, TokenKind::Arrow)
|
|
} else {
|
|
self.token(pos, TokenKind::Minus)
|
|
}
|
|
}
|
|
'+' => self.token(pos, TokenKind::Plus),
|
|
':' => self.token(pos, TokenKind::Colon),
|
|
';' => self.token(pos, TokenKind::Semicolon),
|
|
'/' => {
|
|
if self.matches('/') {
|
|
self.comment(pos)
|
|
} else {
|
|
self.token(pos, TokenKind::Slash)
|
|
}
|
|
}
|
|
'*' => self.token(pos, TokenKind::Star),
|
|
'!' => {
|
|
if self.matches('=') {
|
|
self.token(pos, TokenKind::BangEqual)
|
|
} else {
|
|
self.token(pos, TokenKind::Bang)
|
|
}
|
|
}
|
|
'=' => {
|
|
if self.matches('=') {
|
|
self.token(pos, TokenKind::EqualEqual)
|
|
} else {
|
|
self.token(pos, TokenKind::Equal)
|
|
}
|
|
}
|
|
'>' => {
|
|
if self.matches('=') {
|
|
self.token(pos, TokenKind::GreaterEqual)
|
|
} else {
|
|
self.token(pos, TokenKind::Greater)
|
|
}
|
|
}
|
|
'<' => {
|
|
if self.matches('=') {
|
|
self.token(pos, TokenKind::LessEqual)
|
|
} else {
|
|
self.token(pos, TokenKind::Less)
|
|
}
|
|
}
|
|
'\'' => self.string(pos, '\''),
|
|
'"' => self.string(pos, '"'),
|
|
_ => {
|
|
if c.is_ascii_digit() {
|
|
self.number(pos)
|
|
} else if c.is_ascii_alphabetic() || c == '_' {
|
|
self.identifier(pos)
|
|
} else {
|
|
Token::error(pos, self.pos(), format!("Unexpected character '{c}'"))
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;
    use pretty_assertions::assert_eq;

    /// Tokenizes `input` through to the end-of-input token, drops whitespace
    /// tokens, and checks both the tokens and their text against the
    /// expectations.
    fn test_tokens_impl(input: &str, expected: Vec<Token>, expected_text: Vec<String>) {
        let mut tokens = Tokens::new(input);
        let mut result = Vec::new();
        let mut result_text = Vec::new();

        loop {
            let token = tokens.next();
            let kind = token.kind;
            if kind != TokenKind::Whitespace {
                result_text.push(token.as_str(input).to_string());
                result.push(token);
            }
            if kind == TokenKind::EOF {
                break;
            }
        }

        assert_eq!(expected, result);
        assert_eq!(expected_text, result_text);
    }

    /// Declares a test that tokenizes `$input` and expects the listed
    /// `(start, kind, text)` triples, in order, followed by an end-of-input
    /// token at the end of the input.
    macro_rules! test_tokens {
        ($name:ident, $input:expr, $($s:expr),+) => {
            #[test]
            fn $name() {
                use TokenKind::*;

                let cases = [$($s),+];
                let input_len = $input.len();

                let expected: Vec<Token> = cases
                    .iter()
                    .map(|&(start, kind, text)| Token::new(kind, start, start + text.len()))
                    .chain(std::iter::once(Token::new(TokenKind::EOF, input_len, input_len)))
                    .collect();

                let expected_text: Vec<_> = cases
                    .iter()
                    .map(|&(_, _, text)| text.to_string())
                    .chain(std::iter::once("".to_string()))
                    .collect();

                test_tokens_impl($input, expected, expected_text);
            }
        };
    }

    test_tokens!(
        numbers,
        "1 1.0 1.2e7 2.3e+7 3.3E-06 7_6 8.0e_8",
        (0, Number, "1"),
        (2, Number, "1.0"),
        (6, Number, "1.2e7"),
        (12, Number, "2.3e+7"),
        (19, Number, "3.3E-06"),
        (27, Number, "7_6"),
        (31, Number, "8.0e_8")
    );

    test_tokens!(
        identifiers,
        "asdf x _123 a_23 x3a and or yield async await class else false for from",
        (0, Identifier, "asdf"),
        (5, Identifier, "x"),
        (7, Identifier, "_123"),
        (12, Identifier, "a_23"),
        (17, Identifier, "x3a"),
        (21, And, "and"),
        (25, Or, "or"),
        (28, Yield, "yield"),
        (34, Async, "async"),
        (40, Await, "await"),
        (46, Class, "class"),
        (52, Else, "else"),
        (57, False, "false"),
        (63, For, "for"),
        (67, From, "from")
    );

    test_tokens!(
        more_keywords,
        "fun if import let return select self true while truewhile new",
        (0, Fun, "fun"),
        (4, If, "if"),
        (7, Import, "import"),
        (14, Let, "let"),
        (18, Return, "return"),
        (25, Select, "select"),
        (32, Selff, "self"),
        (37, True, "true"),
        (42, While, "while"),
        (48, Identifier, "truewhile"),
        (58, New, "new")
    );

    test_tokens!(
        more_more_keywords,
        "in is match _ as export",
        (0, In, "in"),
        (3, Is, "is"),
        (6, Match, "match"),
        (12, Underscore, "_"),
        (14, As, "as"),
        (17, Export, "export")
    );

    test_tokens!(
        strings,
        r#"'this is a string that\'s great!\r\n' "foo's" 'bar"s' "#,
        (0, String, r#"'this is a string that\'s great!\r\n'"#),
        (38, String, r#""foo's""#),
        (46, String, "'bar\"s'")
    );

    test_tokens!(
        symbols,
        "{ } ( ) [ ] . ! != < <= > >= = == , - -> + * / ; :",
        (0, LeftBrace, "{"),
        (2, RightBrace, "}"),
        (4, LeftParen, "("),
        (6, RightParen, ")"),
        (8, LeftBracket, "["),
        (10, RightBracket, "]"),
        (12, Dot, "."),
        (14, Bang, "!"),
        (16, BangEqual, "!="),
        (19, Less, "<"),
        (21, LessEqual, "<="),
        (24, Greater, ">"),
        (26, GreaterEqual, ">="),
        (29, Equal, "="),
        (31, EqualEqual, "=="),
        (34, Comma, ","),
        (36, Minus, "-"),
        (38, Arrow, "->"),
        (41, Plus, "+"),
        (43, Star, "*"),
        (45, Slash, "/"),
        (47, Semicolon, ";"),
        (49, Colon, ":")
    );
}
|