[fine] Starting to parse (ugh)

This commit is contained in:
John Doty 2024-01-01 08:07:29 -08:00
parent 7fccab8f59
commit ece5576fb2
3 changed files with 534 additions and 185 deletions

View file

@ -1,5 +1,5 @@
#[derive(Debug, PartialEq, Eq)]
pub enum TokenKind<'a> {
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub enum TokenKind {
LeftBrace,
RightBrace,
LeftBracket,
@ -23,9 +23,9 @@ pub enum TokenKind<'a> {
Less,
LessEqual,
Identifier(&'a str), // TODO
String(&'a str),
Number(&'a str),
Identifier,
String,
Number,
And,
Async,
@ -47,80 +47,54 @@ pub enum TokenKind<'a> {
While,
Yield,
Error(String),
Error,
}
#[derive(Debug, PartialEq, Eq)]
#[derive(Debug, PartialEq, Eq, Clone)]
pub struct Token<'a> {
kind: TokenKind<'a>,
kind: TokenKind,
start: usize,
value: Result<&'a str, String>,
}
impl<'a> Token<'a> {
pub fn new(start: usize, kind: TokenKind<'a>) -> Self {
Token { kind, start }
pub fn new(kind: TokenKind, start: usize, value: &'a str) -> Self {
Token {
kind,
start,
value: Ok(value),
}
}
pub fn error(start: usize, message: String) -> Self {
Token {
kind: TokenKind::Error,
start,
value: Err(message),
}
}
/// Returns the kind of this token.
///
/// The kind is handed back by value (a copy of the stored field) rather
/// than as a reference into the token.
pub fn kind(&self) -> TokenKind {
self.kind
}
pub fn as_str<'b>(&'b self) -> &'a str
where
'b: 'a,
{
use TokenKind::*;
match &self.kind {
LeftBrace => "{",
RightBrace => "}",
LeftBracket => "[",
RightBracket => "]",
LeftParen => "(",
RightParen => ")",
Comma => ",",
Dot => ".",
Minus => "-",
Plus => "+",
Semicolon => ";",
Slash => "/",
Star => "*",
Bang => "+",
BangEqual => "!=",
Equal => "=",
EqualEqual => "==",
Greater => ">",
GreaterEqual => ">=",
Less => "<",
LessEqual => "<=",
Identifier(v) => v,
String(v) => v,
Number(v) => v,
And => "and",
Async => "async",
Await => "await",
Class => "class",
Else => "else",
False => "false",
For => "for",
From => "from",
Fun => "fun",
If => "if",
Let => "let",
Or => "or",
Print => "print",
Return => "return",
Select => "select",
This => "this",
True => "true",
While => "while",
Yield => "yield",
Error(e) => e,
match &self.value {
Ok(v) => v,
Err(e) => &e,
}
}
}
impl<'a> Into<String> for Token<'a> {
fn into(self) -> String {
self.as_str().to_string()
}
}
pub struct Tokens<'a> {
source: &'a str,
chars: std::str::CharIndices<'a>,
@ -140,8 +114,17 @@ impl<'a> Tokens<'a> {
result
}
pub fn token_position(&self, token: &Token) -> (usize, usize) {
let line_end_index = match self.newlines.binary_search(&token.start) {
/// Return the position of the given token as a (line, column) pair. By
/// convention, lines are 1-based and columns are 0-based. In keeping with
/// the iterator nature of the tokenizer, a `None` token stands for
/// end-of-file, in which case the position of the end of the file is
/// returned.
pub fn token_position(&self, token: &Option<Token>) -> (usize, usize) {
let start = match token {
Some(t) => t.start,
None => self.source.len(),
};
let line_end_index = match self.newlines.binary_search(&start) {
Ok(index) => index,
Err(index) => index,
};
@ -151,15 +134,16 @@ impl<'a> Tokens<'a> {
self.newlines[line_end_index - 1] + 1
};
let line_number = line_end_index + 1;
let column_offset = token.start - line_start_pos;
let column_offset = start - line_start_pos;
(line_number, column_offset)
}
fn token(&self, start: usize, kind: TokenKind<'a>) -> Token<'a> {
Token::new(start, kind)
fn token(&self, start: usize, kind: TokenKind) -> Token<'a> {
let value = &self.source[start..self.pos()];
Token::new(kind, start, value)
}
fn number(&mut self, start: usize) -> TokenKind<'a> {
fn number(&mut self, start: usize) -> Token<'a> {
// First, the main part.
loop {
if !self.matches_digit() {
@ -198,9 +182,10 @@ impl<'a> Tokens<'a> {
if !saw_digit {
// This is just a broken number.
let slice = &self.source[start..self.pos()];
return TokenKind::Error(format!(
"Invalid floating-point literal: {slice}"
));
return Token::error(
start,
format!("Invalid floating-point literal: {slice}"),
);
}
}
} else {
@ -209,13 +194,13 @@ impl<'a> Tokens<'a> {
}
}
TokenKind::Number(&self.source[start..self.pos()])
self.token(start, TokenKind::Number)
}
fn string(&mut self, start: usize, delimiter: char) -> TokenKind<'a> {
fn string(&mut self, start: usize, delimiter: char) -> Token<'a> {
while !self.matches(delimiter) {
if self.eof() {
return TokenKind::Error("Unterminated string constant".to_string());
return Token::error(start, "Unterminated string constant".to_string());
}
if self.matches('\\') {
self.advance();
@ -224,20 +209,12 @@ impl<'a> Tokens<'a> {
}
}
TokenKind::String(&self.source[start..self.pos()])
self.token(start, TokenKind::String)
}
fn identifier(&mut self, start: usize) -> TokenKind<'a> {
loop {
// TODO: Use unicode identifier classes instead
if !self.matches_next(|c| c.is_ascii_alphanumeric() || c == '_') {
break;
}
}
let ident = &self.source[start..self.pos()];
match ident.chars().nth(0) {
Some('a') => {
fn identifier_token_kind(ident: &str) -> TokenKind {
match ident.chars().nth(0).unwrap() {
'a' => {
if ident == "and" {
return TokenKind::And;
}
@ -248,17 +225,17 @@ impl<'a> Tokens<'a> {
return TokenKind::Await;
}
}
Some('c') => {
'c' => {
if ident == "class" {
return TokenKind::Class;
}
}
Some('e') => {
'e' => {
if ident == "else" {
return TokenKind::Else;
}
}
Some('f') => {
'f' => {
if ident == "false" {
return TokenKind::False;
}
@ -272,37 +249,37 @@ impl<'a> Tokens<'a> {
return TokenKind::Fun;
}
}
Some('i') => {
'i' => {
if ident == "if" {
return TokenKind::If;
}
}
Some('l') => {
'l' => {
if ident == "let" {
return TokenKind::Let;
}
}
Some('o') => {
'o' => {
if ident == "or" {
return TokenKind::Or;
}
}
Some('p') => {
'p' => {
if ident == "print" {
return TokenKind::Print;
}
}
Some('r') => {
'r' => {
if ident == "return" {
return TokenKind::Return;
}
}
Some('s') => {
's' => {
if ident == "select" {
return TokenKind::Select;
}
}
Some('t') => {
't' => {
if ident == "this" {
return TokenKind::This;
}
@ -310,12 +287,12 @@ impl<'a> Tokens<'a> {
return TokenKind::True;
}
}
Some('w') => {
'w' => {
if ident == "while" {
return TokenKind::While;
}
}
Some('y') => {
'y' => {
if ident == "yield" {
return TokenKind::Yield;
}
@ -323,7 +300,20 @@ impl<'a> Tokens<'a> {
_ => (),
}
TokenKind::Identifier(ident)
TokenKind::Identifier
}
/// Scans the rest of an identifier-shaped word and builds its token.
///
/// `start` is the source offset where the word begins (presumably its first
/// character has already been consumed by the caller — confirm at the call
/// site). The scanned text is classified by `identifier_token_kind`, so the
/// resulting token is either a keyword kind or `TokenKind::Identifier`; in
/// both cases the token's value is the scanned source slice.
fn identifier(&mut self, start: usize) -> Token<'a> {
    // TODO: Use unicode identifier classes instead
    // `matches_next` consumes one character each time the predicate accepts
    // it, so keep calling until it declines (or the input runs out).
    while self.matches_next(|c| c.is_ascii_alphanumeric() || c == '_') {}

    let text = &self.source[start..self.pos()];
    Token::new(Self::identifier_token_kind(text), start, text)
}
fn matches(&mut self, ch: char) -> bool {
@ -342,14 +332,9 @@ impl<'a> Tokens<'a> {
{
if let Some((_, next_ch)) = self.next_char {
if f(next_ch) {
eprintln!("MATCHES NEXT: {next_ch}");
self.advance();
return true;
} else {
eprintln!("NOT MATCHES NEXT: {next_ch}");
}
} else {
eprintln!("E O F");
}
false
}
@ -361,7 +346,6 @@ impl<'a> Tokens<'a> {
fn advance(&mut self) -> Option<(usize, char)> {
let result = self.next_char;
self.next_char = self.chars.next();
eprintln!("NEXT: {:?}", self.next_char);
result
}
@ -399,57 +383,57 @@ impl<'a> std::iter::Iterator for Tokens<'a> {
};
let token = match c {
'{' => TokenKind::LeftBrace,
'}' => TokenKind::RightBrace,
'[' => TokenKind::LeftBracket,
']' => TokenKind::RightBracket,
'(' => TokenKind::LeftParen,
')' => TokenKind::RightParen,
',' => TokenKind::Comma,
'.' => TokenKind::Dot,
'{' => self.token(pos, TokenKind::LeftBrace),
'}' => self.token(pos, TokenKind::RightBrace),
'[' => self.token(pos, TokenKind::LeftBracket),
']' => self.token(pos, TokenKind::RightBracket),
'(' => self.token(pos, TokenKind::LeftParen),
')' => self.token(pos, TokenKind::RightParen),
',' => self.token(pos, TokenKind::Comma),
'.' => self.token(pos, TokenKind::Dot),
'-' => {
if self.matches_next(|c| c.is_ascii_digit()) {
self.number(pos)
} else {
TokenKind::Minus
self.token(pos, TokenKind::Minus)
}
}
'+' => {
if self.matches_next(|c| c.is_ascii_digit()) {
self.number(pos)
} else {
TokenKind::Plus
self.token(pos, TokenKind::Plus)
}
}
';' => TokenKind::Semicolon,
'/' => TokenKind::Slash,
'*' => TokenKind::Star,
';' => self.token(pos, TokenKind::Semicolon),
'/' => self.token(pos, TokenKind::Slash),
'*' => self.token(pos, TokenKind::Star),
'!' => {
if self.matches('=') {
TokenKind::BangEqual
self.token(pos, TokenKind::BangEqual)
} else {
TokenKind::Bang
self.token(pos, TokenKind::Bang)
}
}
'=' => {
if self.matches('=') {
TokenKind::EqualEqual
self.token(pos, TokenKind::EqualEqual)
} else {
TokenKind::Equal
self.token(pos, TokenKind::Equal)
}
}
'>' => {
if self.matches('=') {
TokenKind::GreaterEqual
self.token(pos, TokenKind::GreaterEqual)
} else {
TokenKind::Greater
self.token(pos, TokenKind::Greater)
}
}
'<' => {
if self.matches('=') {
TokenKind::LessEqual
self.token(pos, TokenKind::LessEqual)
} else {
TokenKind::Less
self.token(pos, TokenKind::Less)
}
}
'\'' => self.string(pos, '\''),
@ -460,11 +444,10 @@ impl<'a> std::iter::Iterator for Tokens<'a> {
} else if c.is_ascii_alphabetic() || c == '_' {
self.identifier(pos)
} else {
TokenKind::Error(format!("Unexpected character '{c}'"))
Token::error(pos, format!("Unexpected character '{c}'"))
}
}
};
let token = self.token(pos, token);
Some(token)
}
}
@ -480,7 +463,12 @@ mod tests {
fn $name() {
use TokenKind::*;
let tokens: Vec<_> = Tokens::new($input).collect();
let expected = vec![$($s),*];
let expected: Vec<Token> = (vec![$($s),*])
.into_iter()
.map(|t| Token::new(t.1, t.0, t.2))
.collect();
assert_eq!(expected, tokens);
}
}
@ -489,81 +477,81 @@ mod tests {
test_tokens!(
numbers,
"1 1.0 1.2e7 2.3e+7 3.3E-06 7_6 8.0e_8",
Token::new(0, Number("1")),
Token::new(2, Number("1.0")),
Token::new(6, Number("1.2e7")),
Token::new(12, Number("2.3e+7")),
Token::new(19, Number("3.3E-06")),
Token::new(27, Number("7_6")),
Token::new(31, Number("8.0e_8"))
(0, Number, "1"),
(2, Number, "1.0"),
(6, Number, "1.2e7"),
(12, Number, "2.3e+7"),
(19, Number, "3.3E-06"),
(27, Number, "7_6"),
(31, Number, "8.0e_8")
);
test_tokens!(
identifiers,
"asdf x _123 a_23 x3a and or yield async await class else false for from",
Token::new(0, Identifier("asdf")),
Token::new(5, Identifier("x")),
Token::new(7, Identifier("_123")),
Token::new(12, Identifier("a_23")),
Token::new(17, Identifier("x3a")),
Token::new(21, And),
Token::new(25, Or),
Token::new(28, Yield),
Token::new(34, Async),
Token::new(40, Await),
Token::new(46, Class),
Token::new(52, Else),
Token::new(57, False),
Token::new(63, For),
Token::new(67, From)
(0, Identifier, "asdf"),
(5, Identifier, "x"),
(7, Identifier, "_123"),
(12, Identifier, "a_23"),
(17, Identifier, "x3a"),
(21, And, "and"),
(25, Or, "or"),
(28, Yield, "yield"),
(34, Async, "async"),
(40, Await, "await"),
(46, Class, "class"),
(52, Else, "else"),
(57, False, "false"),
(63, For, "for"),
(67, From, "from")
);
test_tokens!(
more_keywords,
"fun if let print return select this true while truewhile",
Token::new(0, Fun),
Token::new(4, If),
Token::new(7, Let),
Token::new(11, Print),
Token::new(17, Return),
Token::new(24, Select),
Token::new(31, This),
Token::new(36, True),
Token::new(41, While),
Token::new(47, Identifier("truewhile"))
(0, Fun, "fun"),
(4, If, "if"),
(7, Let, "let"),
(11, Print, "print"),
(17, Return, "return"),
(24, Select, "select"),
(31, This, "this"),
(36, True, "true"),
(41, While, "while"),
(47, Identifier, "truewhile")
);
test_tokens!(
strings,
r#"'this is a string that\'s great!\r\n' "foo's" 'bar"s' "#,
Token::new(0, String(r#"'this is a string that\'s great!\r\n'"#)),
Token::new(38, String(r#""foo's""#)),
Token::new(46, String("'bar\"s'"))
(0, String, r#"'this is a string that\'s great!\r\n'"#),
(38, String, r#""foo's""#),
(46, String, "'bar\"s'")
);
test_tokens!(
symbols,
"{ } ( ) [ ] . ! != < <= > >= = == , - + * / ;",
Token::new(0, LeftBrace),
Token::new(2, RightBrace),
Token::new(4, LeftParen),
Token::new(6, RightParen),
Token::new(8, LeftBracket),
Token::new(10, RightBracket),
Token::new(12, Dot),
Token::new(14, Bang),
Token::new(16, BangEqual),
Token::new(19, Less),
Token::new(21, LessEqual),
Token::new(24, Greater),
Token::new(26, GreaterEqual),
Token::new(29, Equal),
Token::new(31, EqualEqual),
Token::new(34, Comma),
Token::new(36, Minus),
Token::new(38, Plus),
Token::new(40, Star),
Token::new(42, Slash),
Token::new(44, Semicolon)
(0, LeftBrace, "{"),
(2, RightBrace, "}"),
(4, LeftParen, "("),
(6, RightParen, ")"),
(8, LeftBracket, "["),
(10, RightBracket, "]"),
(12, Dot, "."),
(14, Bang, "!"),
(16, BangEqual, "!="),
(19, Less, "<"),
(21, LessEqual, "<="),
(24, Greater, ">"),
(26, GreaterEqual, ">="),
(29, Equal, "="),
(31, EqualEqual, "=="),
(34, Comma, ","),
(36, Minus, "-"),
(38, Plus, "+"),
(40, Star, "*"),
(42, Slash, "/"),
(44, Semicolon, ";")
);
}