[oden-script] Tokens

This commit is contained in:
John Doty 2023-12-30 17:15:05 -08:00
parent 8a7cee1c82
commit 7fccab8f59
5 changed files with 598 additions and 460 deletions

25
oden-script/Cargo.lock generated
View file

@ -2,6 +2,31 @@
# It is not intended for manual editing.
version = 3
[[package]]
name = "diff"
version = "0.1.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "56254986775e3233ffa9c4d7d3faaf6d36a2c09d30b20687e9f88bc8bafc16c8"
[[package]]
name = "oden-script"
version = "0.1.0"
dependencies = [
"pretty_assertions",
]
[[package]]
name = "pretty_assertions"
version = "1.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "af7cee1a6c8a5b9208b3cb1061f10c0cb689087b3d8ce85fb9d2dd7a29b6ba66"
dependencies = [
"diff",
"yansi",
]
[[package]]
name = "yansi"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09041cd90cf85f7f8b2df60c646f853b7f535ce68f85244eb6731cf89fa498ec"

View file

@ -2,3 +2,6 @@
name = "oden-script"
version = "0.1.0"
edition = "2021"
[dev-dependencies]
pretty_assertions = "1.4.0"

View file

@ -1,458 +1 @@
/// The kind of a lexical token, together with any text it carries.
///
/// Variants holding a `&'a str` borrow their text from the source string
/// handed to the tokenizer, so they cannot outlive it.
#[derive(Debug)]
pub enum TokenKind<'a> {
// Single-character punctuation.
LeftBrace,
RightBrace,
LeftBracket,
RightBracket,
LeftParen,
RightParen,
Comma,
Dot,
Minus,
Plus,
Semicolon,
Slash,
Star,
// Operators that are one or two characters long.
Bang,
BangEqual,
Equal,
EqualEqual,
Greater,
GreaterEqual,
Less,
LessEqual,
// Tokens that carry a slice of the source text.
Identifier(&'a str), // TODO
// The slice includes the opening and closing quote characters.
String(&'a str),
// The slice includes a leading `+`/`-` when the sign is directly followed by a digit.
Number(&'a str),
// Reserved words.
And,
Async,
Await,
Class,
Else,
False,
For,
From,
Fun,
If,
Let,
Or,
Print,
Return,
Select,
This,
True,
While,
Yield,
// A tokenizing failure; the payload is a human-readable message.
Error(String),
}
/// A single token: what it is and where it begins in the source.
#[derive(Debug)]
pub struct Token<'a> {
// What was recognised, plus any borrowed text.
kind: TokenKind<'a>,
// Byte offset of the token's first character within the source string.
start: usize,
}
impl<'a> Token<'a> {
    /// Returns the text of this token.
    ///
    /// Punctuation, operators and keywords yield their fixed spelling.
    /// Identifiers, strings and numbers yield the slice of source text they
    /// carry, and error tokens yield their message.
    ///
    /// The `'b: 'a` bound is needed because an error message is owned by the
    /// token itself, so the returned slice may borrow from `self`.
    pub fn as_str<'b>(&'b self) -> &'a str
    where
        'b: 'a,
    {
        use TokenKind::*;
        match &self.kind {
            LeftBrace => "{",
            RightBrace => "}",
            LeftBracket => "[",
            RightBracket => "]",
            LeftParen => "(",
            RightParen => ")",
            Comma => ",",
            Dot => ".",
            Minus => "-",
            Plus => "+",
            Semicolon => ";",
            Slash => "/",
            Star => "*",
            // Fixed: this used to render as "+", which is the spelling of `Plus`.
            Bang => "!",
            BangEqual => "!=",
            Equal => "=",
            EqualEqual => "==",
            Greater => ">",
            GreaterEqual => ">=",
            Less => "<",
            LessEqual => "<=",
            Identifier(v) => v,
            String(v) => v,
            Number(v) => v,
            And => "and",
            Async => "async",
            Await => "await",
            Class => "class",
            Else => "else",
            False => "false",
            For => "for",
            From => "from",
            Fun => "fun",
            If => "if",
            Let => "let",
            Or => "or",
            Print => "print",
            Return => "return",
            Select => "select",
            This => "this",
            True => "true",
            While => "while",
            Yield => "yield",
            Error(e) => e,
        }
    }
}
/// Tokenizer state over a borrowed source string, with one character of lookahead.
pub struct Tokens<'a> {
// The complete text being tokenized; token slices are taken from it.
source: &'a str,
// Iterator over the characters that come after `next_char`.
chars: std::str::CharIndices<'a>,
// The lookahead: byte offset and value of the next unconsumed character, or `None` at end of input.
next_char: Option<(usize, char)>,
// Byte offsets of the newline characters seen so far while skipping whitespace.
newlines: Vec<usize>,
}
impl<'a> Tokens<'a> {
pub fn new(source: &'a str) -> Self {
let mut chars = source.char_indices();
let next_char = chars.next();
Tokens {
source,
chars,
next_char,
newlines: Vec::new(),
}
}
pub fn token_position(&self, token: &Token) -> (usize, usize) {
let line_end_index = match self.newlines.binary_search(&token.start) {
Ok(index) => index,
Err(index) => index,
};
let line_start_pos = if line_end_index == 0 {
0
} else {
self.newlines[line_end_index - 1] + 1
};
let line_number = line_end_index + 1;
let column_offset = token.start - line_start_pos;
(line_number, column_offset)
}
pub fn next_token(&mut self) -> Option<Token<'a>> {
self.skip_whitespace(); // TODO: Whitespace preserving/comment preserving
let (pos, c) = match self.advance() {
Some((p, c)) => (p, c),
None => return None,
};
let token = match c {
'{' => TokenKind::LeftBrace,
'}' => TokenKind::RightBrace,
'[' => TokenKind::LeftBracket,
']' => TokenKind::RightBracket,
'(' => TokenKind::LeftParen,
')' => TokenKind::RightParen,
',' => TokenKind::Comma,
'.' => TokenKind::Dot,
'-' => {
if self.matches_next(|c| c.is_ascii_digit()) {
self.number(pos)
} else {
TokenKind::Minus
}
}
'+' => {
if self.matches_next(|c| c.is_ascii_digit()) {
self.number(pos)
} else {
TokenKind::Plus
}
}
';' => TokenKind::Semicolon,
'/' => TokenKind::Slash,
'*' => TokenKind::Star,
'!' => {
if self.matches('=') {
TokenKind::BangEqual
} else {
TokenKind::Bang
}
}
'=' => {
if self.matches('=') {
TokenKind::EqualEqual
} else {
TokenKind::Equal
}
}
'>' => {
if self.matches('=') {
TokenKind::GreaterEqual
} else {
TokenKind::Greater
}
}
'<' => {
if self.matches('=') {
TokenKind::LessEqual
} else {
TokenKind::Less
}
}
'\'' => self.string(pos, '\''),
'"' => self.string(pos, '"'),
_ => {
if self.matches_next(|c| c.is_ascii_digit()) {
self.number(pos)
} else if self.matches_next(|c| c.is_ascii_alphabetic() || c == '_') {
self.identifier(pos)
} else {
TokenKind::Error(format!("Unexpected character '{c}'"))
}
}
};
let token = self.token(pos, token);
Some(token)
}
fn token(&self, start: usize, kind: TokenKind<'a>) -> Token<'a> {
Token { kind, start }
}
fn number(&mut self, start: usize) -> TokenKind<'a> {
// First, the main part.
loop {
if !self.matches_digit() {
break;
}
}
// Now the fraction part.
// The thing that is bad here is that this is speculative...
let backup = self.chars.clone();
if self.matches('.') {
let mut saw_digit = false;
loop {
if self.matches('_') {
} else if self.matches_next(|c| c.is_ascii_digit()) {
saw_digit = true;
} else {
break;
}
}
if saw_digit {
// OK we're good to here! Check the scientific notation.
if self.matches('e') || self.matches('E') {
if self.matches('+') || self.matches('-') {}
let mut saw_digit = false;
loop {
if self.matches('_') {
} else if self.matches_next(|c| c.is_ascii_digit()) {
saw_digit = true;
} else {
break;
}
}
if !saw_digit {
// This is just a broken number.
let slice = &self.source[start..self.pos()];
return TokenKind::Error(format!(
"Invalid floating-point literal: {slice}"
));
}
}
} else {
// Might be accessing a member on an integer.
self.chars = backup;
}
}
TokenKind::Number(&self.source[start..self.pos()])
}
fn string(&mut self, start: usize, delimiter: char) -> TokenKind<'a> {
while !self.matches(delimiter) {
if self.eof() {
return TokenKind::Error("Unterminated string constant".to_string());
}
if self.matches('\\') {
self.advance();
}
}
TokenKind::String(&self.source[start..self.pos()])
}
fn identifier(&mut self, start: usize) -> TokenKind<'a> {
loop {
// TODO: Use unicode identifier classes instead
if !self.matches_next(|c| c.is_ascii_alphanumeric() || c == '_') {
break;
}
}
let ident = &self.source[start..self.pos()];
match ident.chars().nth(0) {
Some('a') => {
if ident == "and" {
return TokenKind::And;
}
if ident == "async" {
return TokenKind::Async;
}
if ident == "await" {
return TokenKind::Await;
}
}
Some('c') => {
if ident == "class" {
return TokenKind::Class;
}
}
Some('e') => {
if ident == "else" {
return TokenKind::Else;
}
}
Some('f') => {
if ident == "for" {
return TokenKind::For;
}
if ident == "from" {
return TokenKind::From;
}
if ident == "fun" {
return TokenKind::Fun;
}
}
Some('i') => {
if ident == "if" {
return TokenKind::If;
}
}
Some('l') => {
if ident == "let" {
return TokenKind::Let;
}
}
Some('o') => {
if ident == "or" {
return TokenKind::Or;
}
}
Some('p') => {
if ident == "print" {
return TokenKind::Print;
}
}
Some('r') => {
if ident == "return" {
return TokenKind::Return;
}
}
Some('s') => {
if ident == "select" {
return TokenKind::Select;
}
}
Some('t') => {
if ident == "this" {
return TokenKind::This;
}
if ident == "true" {
return TokenKind::True;
}
}
Some('w') => {
if ident == "while" {
return TokenKind::While;
}
}
Some('y') => {
if ident == "yield" {
return TokenKind::Yield;
}
}
_ => (),
}
TokenKind::Identifier(ident)
}
fn matches(&mut self, ch: char) -> bool {
if let Some((_, next_ch)) = self.next_char {
if next_ch == ch {
self.advance();
return true;
}
}
false
}
fn matches_next<F>(&mut self, f: F) -> bool
where
F: FnOnce(char) -> bool,
{
if let Some((_, next_ch)) = self.next_char {
if f(next_ch) {
self.advance();
return true;
}
}
false
}
fn matches_digit(&mut self) -> bool {
self.matches('_') || self.matches_next(|c| c.is_ascii_digit())
}
fn advance(&mut self) -> Option<(usize, char)> {
let result = self.next_char;
self.next_char = self.chars.next();
result
}
fn pos(&self) -> usize {
match self.next_char {
Some((p, _)) => p,
None => self.source.len(),
}
}
fn eof(&self) -> bool {
self.next_char.is_none()
}
fn skip_whitespace(&mut self) {
while let Some((pos, ch)) = self.next_char {
if ch == '\n' {
self.newlines.push(pos);
} else if !ch.is_whitespace() {
break;
}
self.advance();
}
}
}
/// Tokenizes `input` and prints one line per token to standard output, in the
/// form `<start offset>: <token text>`.
pub fn tokenize(input: String) {
    let mut scanner = Tokens::new(&input);
    loop {
        match scanner.next_token() {
            Some(tok) => println!("{}: {}", tok.start, tok.as_str()),
            None => break,
        }
    }
}
pub mod tokens;

View file

@ -1,3 +1 @@
use oden_script;
/// Binary entry point; the body is empty, so running it does nothing.
pub fn main() {}

569
oden-script/src/tokens.rs Normal file
View file

@ -0,0 +1,569 @@
/// The kind of a lexical token, together with any text it carries.
///
/// Variants holding a `&'a str` borrow their text from the source string
/// handed to the tokenizer, so they cannot outlive it.
#[derive(Debug, PartialEq, Eq)]
pub enum TokenKind<'a> {
// Single-character punctuation.
LeftBrace,
RightBrace,
LeftBracket,
RightBracket,
LeftParen,
RightParen,
Comma,
Dot,
Minus,
Plus,
Semicolon,
Slash,
Star,
// Operators that are one or two characters long.
Bang,
BangEqual,
Equal,
EqualEqual,
Greater,
GreaterEqual,
Less,
LessEqual,
// Tokens that carry a slice of the source text.
Identifier(&'a str), // TODO
// The slice includes the opening and closing quote characters.
String(&'a str),
// The slice includes a leading `+`/`-` when the sign is directly followed by a digit.
Number(&'a str),
// Reserved words.
And,
Async,
Await,
Class,
Else,
False,
For,
From,
Fun,
If,
Let,
Or,
Print,
Return,
Select,
This,
True,
While,
Yield,
// A tokenizing failure; the payload is a human-readable message.
Error(String),
}
/// A single token: what it is and where it begins in the source.
#[derive(Debug, PartialEq, Eq)]
pub struct Token<'a> {
// What was recognised, plus any borrowed text.
kind: TokenKind<'a>,
// Byte offset of the token's first character within the source string.
start: usize,
}
impl<'a> Token<'a> {
    /// Builds a token of the given `kind` that begins at byte offset `start`.
    pub fn new(start: usize, kind: TokenKind<'a>) -> Self {
        Token { kind, start }
    }

    /// Returns the text of this token.
    ///
    /// Punctuation, operators and keywords yield their fixed spelling.
    /// Identifiers, strings and numbers yield the slice of source text they
    /// carry, and error tokens yield their message.
    ///
    /// The `'b: 'a` bound is needed because an error message is owned by the
    /// token itself, so the returned slice may borrow from `self`.
    pub fn as_str<'b>(&'b self) -> &'a str
    where
        'b: 'a,
    {
        use TokenKind::*;
        match &self.kind {
            LeftBrace => "{",
            RightBrace => "}",
            LeftBracket => "[",
            RightBracket => "]",
            LeftParen => "(",
            RightParen => ")",
            Comma => ",",
            Dot => ".",
            Minus => "-",
            Plus => "+",
            Semicolon => ";",
            Slash => "/",
            Star => "*",
            // Fixed: this used to render as "+", which is the spelling of `Plus`.
            Bang => "!",
            BangEqual => "!=",
            Equal => "=",
            EqualEqual => "==",
            Greater => ">",
            GreaterEqual => ">=",
            Less => "<",
            LessEqual => "<=",
            Identifier(v) => v,
            String(v) => v,
            Number(v) => v,
            And => "and",
            Async => "async",
            Await => "await",
            Class => "class",
            Else => "else",
            False => "false",
            For => "for",
            From => "from",
            Fun => "fun",
            If => "if",
            Let => "let",
            Or => "or",
            Print => "print",
            Return => "return",
            Select => "select",
            This => "this",
            True => "true",
            While => "while",
            Yield => "yield",
            Error(e) => e,
        }
    }
}
/// Tokenizer state over a borrowed source string, with one character of lookahead.
pub struct Tokens<'a> {
// The complete text being tokenized; token slices are taken from it.
source: &'a str,
// Iterator over the characters that come after `next_char`.
chars: std::str::CharIndices<'a>,
// The lookahead: byte offset and value of the next unconsumed character, or `None` at end of input.
next_char: Option<(usize, char)>,
// Byte offsets of the newline characters seen so far while skipping whitespace.
newlines: Vec<usize>,
}
impl<'a> Tokens<'a> {
    /// Creates a tokenizer over `source` with the first character already
    /// loaded as lookahead.
    pub fn new(source: &'a str) -> Self {
        let mut result = Tokens {
            source,
            chars: source.char_indices(),
            next_char: None,
            newlines: Vec::new(),
        };
        result.advance(); // Prime the pump
        result
    }

    /// Returns `(line, column)` for `token`: the line is 1-based and the
    /// column is the 0-based byte offset from the start of that line.
    ///
    /// Only newlines already passed by whitespace skipping are known, so the
    /// answer is meaningful for tokens this tokenizer has already produced.
    pub fn token_position(&self, token: &Token) -> (usize, usize) {
        // The number of recorded newlines before the token is its zero-based line index.
        let line_index = match self.newlines.binary_search(&token.start) {
            Ok(i) | Err(i) => i,
        };
        let line_start = match line_index {
            0 => 0,
            n => self.newlines[n - 1] + 1,
        };
        (line_index + 1, token.start - line_start)
    }

    /// Wraps `kind` in a token starting at byte offset `start`.
    fn token(&self, start: usize, kind: TokenKind<'a>) -> Token<'a> {
        Token::new(start, kind)
    }

    /// Consumes a run of ASCII digits and `_` separators, returning whether at
    /// least one digit was seen.
    fn digit_run(&mut self) -> bool {
        let mut saw_digit = false;
        loop {
            if self.matches('_') {
                continue;
            }
            if !self.matches_next(|c| c.is_ascii_digit()) {
                return saw_digit;
            }
            saw_digit = true;
        }
    }

    /// Scans the rest of a numeric literal whose first character (and sign,
    /// if any) has already been consumed; `start` is the literal's offset.
    fn number(&mut self, start: usize) -> TokenKind<'a> {
        // First, the main part.
        while self.matches_digit() {}

        // The fraction is speculative: a `.` with no digit after it may be a
        // member access on an integer. Snapshot BOTH the iterator and the
        // lookahead so the rewind is exact (fixed: only `chars` was saved,
        // which swallowed the `.` and replayed the character after it).
        let saved_chars = self.chars.clone();
        let saved_next = self.next_char;
        if self.matches('.') {
            if self.digit_run() {
                // OK we're good to here! Check the scientific notation.
                if self.matches('e') || self.matches('E') {
                    // Optional exponent sign.
                    let _ = self.matches('+') || self.matches('-');
                    if !self.digit_run() {
                        // This is just a broken number.
                        let slice = &self.source[start..self.pos()];
                        return TokenKind::Error(format!(
                            "Invalid floating-point literal: {slice}"
                        ));
                    }
                }
            } else {
                // Might be accessing a member on an integer.
                self.chars = saved_chars;
                self.next_char = saved_next;
            }
        }
        TokenKind::Number(&self.source[start..self.pos()])
    }

    /// Scans a string literal whose opening `delimiter` (at offset `start`)
    /// has been consumed. The resulting slice includes both quote characters.
    fn string(&mut self, start: usize, delimiter: char) -> TokenKind<'a> {
        while !self.matches(delimiter) {
            if self.eof() {
                return TokenKind::Error("Unterminated string constant".to_string());
            }
            // Skip a backslash, if present, then the character it escapes;
            // an ordinary character is simply consumed.
            self.matches('\\');
            self.advance();
        }
        TokenKind::String(&self.source[start..self.pos()])
    }

    /// Scans the rest of an identifier starting at `start` and maps reserved
    /// words to their keyword kinds.
    fn identifier(&mut self, start: usize) -> TokenKind<'a> {
        // TODO: Use unicode identifier classes instead
        while self.matches_next(|c| c.is_ascii_alphanumeric() || c == '_') {}
        let ident = &self.source[start..self.pos()];
        match ident {
            "and" => TokenKind::And,
            "async" => TokenKind::Async,
            "await" => TokenKind::Await,
            "class" => TokenKind::Class,
            "else" => TokenKind::Else,
            "false" => TokenKind::False,
            "for" => TokenKind::For,
            "from" => TokenKind::From,
            "fun" => TokenKind::Fun,
            "if" => TokenKind::If,
            "let" => TokenKind::Let,
            "or" => TokenKind::Or,
            "print" => TokenKind::Print,
            "return" => TokenKind::Return,
            "select" => TokenKind::Select,
            "this" => TokenKind::This,
            "true" => TokenKind::True,
            "while" => TokenKind::While,
            "yield" => TokenKind::Yield,
            _ => TokenKind::Identifier(ident),
        }
    }

    /// Consumes the lookahead character if it equals `ch`.
    fn matches(&mut self, ch: char) -> bool {
        self.matches_next(|next| next == ch)
    }

    /// Consumes the lookahead character if `f` accepts it; returns whether it did.
    ///
    /// Fixed: the leftover `eprintln!` tracing was removed so tokenizing no
    /// longer writes to stderr for every character.
    fn matches_next<F>(&mut self, f: F) -> bool
    where
        F: FnOnce(char) -> bool,
    {
        let hit = match self.next_char {
            Some((_, ch)) => f(ch),
            None => false,
        };
        if hit {
            self.advance();
        }
        hit
    }

    /// Consumes one `_` separator or one ASCII digit.
    fn matches_digit(&mut self) -> bool {
        self.matches('_') || self.matches_next(|c| c.is_ascii_digit())
    }

    /// Returns the current lookahead and loads the following character into it.
    fn advance(&mut self) -> Option<(usize, char)> {
        let result = self.next_char;
        self.next_char = self.chars.next();
        result
    }

    /// Byte offset of the lookahead character, or the source length at end of input.
    fn pos(&self) -> usize {
        match self.next_char {
            Some((p, _)) => p,
            None => self.source.len(),
        }
    }

    /// True once every character has been consumed.
    fn eof(&self) -> bool {
        self.next_char.is_none()
    }

    /// Consumes whitespace, recording the offset of each newline for
    /// `token_position`.
    fn skip_whitespace(&mut self) {
        while let Some((pos, ch)) = self.next_char {
            if ch == '\n' {
                self.newlines.push(pos);
            } else if !ch.is_whitespace() {
                break;
            }
            self.advance();
        }
    }
}
impl<'a> std::iter::Iterator for Tokens<'a> {
type Item = Token<'a>;
fn next(&mut self) -> Option<Self::Item> {
self.skip_whitespace(); // TODO: Whitespace preserving/comment preserving
let (pos, c) = match self.advance() {
Some((p, c)) => (p, c),
None => return None,
};
let token = match c {
'{' => TokenKind::LeftBrace,
'}' => TokenKind::RightBrace,
'[' => TokenKind::LeftBracket,
']' => TokenKind::RightBracket,
'(' => TokenKind::LeftParen,
')' => TokenKind::RightParen,
',' => TokenKind::Comma,
'.' => TokenKind::Dot,
'-' => {
if self.matches_next(|c| c.is_ascii_digit()) {
self.number(pos)
} else {
TokenKind::Minus
}
}
'+' => {
if self.matches_next(|c| c.is_ascii_digit()) {
self.number(pos)
} else {
TokenKind::Plus
}
}
';' => TokenKind::Semicolon,
'/' => TokenKind::Slash,
'*' => TokenKind::Star,
'!' => {
if self.matches('=') {
TokenKind::BangEqual
} else {
TokenKind::Bang
}
}
'=' => {
if self.matches('=') {
TokenKind::EqualEqual
} else {
TokenKind::Equal
}
}
'>' => {
if self.matches('=') {
TokenKind::GreaterEqual
} else {
TokenKind::Greater
}
}
'<' => {
if self.matches('=') {
TokenKind::LessEqual
} else {
TokenKind::Less
}
}
'\'' => self.string(pos, '\''),
'"' => self.string(pos, '"'),
_ => {
if c.is_ascii_digit() {
self.number(pos)
} else if c.is_ascii_alphabetic() || c == '_' {
self.identifier(pos)
} else {
TokenKind::Error(format!("Unexpected character '{c}'"))
}
}
};
let token = self.token(pos, token);
Some(token)
}
}
// Tokenizer tests. Each case tokenizes a fixed input and compares the whole
// token stream against a hand-written expectation.
#[cfg(test)]
mod tests {
use super::*;
use pretty_assertions::assert_eq;
// Declares one `#[test]` function named `$name` that collects every token of
// `$input` and asserts the list equals the expected tokens that follow.
// `TokenKind::*` is imported inside the generated function so expectations
// can name variants directly.
macro_rules! test_tokens {
($name:ident, $input:expr, $($s:expr),+) => {
#[test]
fn $name() {
use TokenKind::*;
let tokens: Vec<_> = Tokens::new($input).collect();
let expected = vec![$($s),*];
assert_eq!(expected, tokens);
}
}
}
// Integers, fractions, exponents (with and without sign) and `_` separators.
// The first argument of each `Token::new` is the token's byte offset in the input.
test_tokens!(
numbers,
"1 1.0 1.2e7 2.3e+7 3.3E-06 7_6 8.0e_8",
Token::new(0, Number("1")),
Token::new(2, Number("1.0")),
Token::new(6, Number("1.2e7")),
Token::new(12, Number("2.3e+7")),
Token::new(19, Number("3.3E-06")),
Token::new(27, Number("7_6")),
Token::new(31, Number("8.0e_8"))
);
// Plain identifiers followed by the first batch of keywords.
test_tokens!(
identifiers,
"asdf x _123 a_23 x3a and or yield async await class else false for from",
Token::new(0, Identifier("asdf")),
Token::new(5, Identifier("x")),
Token::new(7, Identifier("_123")),
Token::new(12, Identifier("a_23")),
Token::new(17, Identifier("x3a")),
Token::new(21, And),
Token::new(25, Or),
Token::new(28, Yield),
Token::new(34, Async),
Token::new(40, Await),
Token::new(46, Class),
Token::new(52, Else),
Token::new(57, False),
Token::new(63, For),
Token::new(67, From)
);
// Remaining keywords; `truewhile` checks that keywords only match whole identifiers.
test_tokens!(
more_keywords,
"fun if let print return select this true while truewhile",
Token::new(0, Fun),
Token::new(4, If),
Token::new(7, Let),
Token::new(11, Print),
Token::new(17, Return),
Token::new(24, Select),
Token::new(31, This),
Token::new(36, True),
Token::new(41, While),
Token::new(47, Identifier("truewhile"))
);
// Both quote styles, backslash escapes, and the other quote character inside a string.
// Expected slices include the surrounding quotes.
test_tokens!(
strings,
r#"'this is a string that\'s great!\r\n' "foo's" 'bar"s' "#,
Token::new(0, String(r#"'this is a string that\'s great!\r\n'"#)),
Token::new(38, String(r#""foo's""#)),
Token::new(46, String("'bar\"s'"))
);
// Every punctuation and operator token, including the two-character forms.
test_tokens!(
symbols,
"{ } ( ) [ ] . ! != < <= > >= = == , - + * / ;",
Token::new(0, LeftBrace),
Token::new(2, RightBrace),
Token::new(4, LeftParen),
Token::new(6, RightParen),
Token::new(8, LeftBracket),
Token::new(10, RightBracket),
Token::new(12, Dot),
Token::new(14, Bang),
Token::new(16, BangEqual),
Token::new(19, Less),
Token::new(21, LessEqual),
Token::new(24, Greater),
Token::new(26, GreaterEqual),
Token::new(29, Equal),
Token::new(31, EqualEqual),
Token::new(34, Comma),
Token::new(36, Minus),
Token::new(38, Plus),
Token::new(40, Star),
Token::new(42, Slash),
Token::new(44, Semicolon)
);
}