[fine] Lifetime garbage, big refactor
So it turns out that I can't hold `&str` in a token because it makes it impossible to encapsulate a source file in the larger context — self-referential structure problems again. Everything gets rebuilt so that the source can be passed through. While we're at it, more things become Rc<> because, man..... life is too short. Semantics in particular has become a giant hub of the module state: we can basically just hold an Rc<Semantics> and have everything we could possibly want to know about a source file, computed lazily if necessary.
This commit is contained in:
parent
d5059dd450
commit
2dbdbb3957
7 changed files with 502 additions and 329 deletions
|
|
@ -37,6 +37,7 @@ pub enum TokenKind {
|
|||
Number,
|
||||
|
||||
And,
|
||||
As,
|
||||
Async,
|
||||
Await,
|
||||
Class,
|
||||
|
|
@ -63,53 +64,60 @@ pub enum TokenKind {
|
|||
}
|
||||
|
||||
// NOTE: Tokens are kinda big (like 40 bytes?) and AFAICT the only way to go
|
||||
// smaller would be to stop using string pointers and use smaller
|
||||
// sizes/offsets instead, e.g., 32b for offset and 32b for size, and
|
||||
// stop tracking the position independently from the start, and then
|
||||
// require the source text when converting to line/col. I'm unwilling to
|
||||
// give up the ergonomics of &str and String right now, so we're just
|
||||
// not doing it.
|
||||
// smaller would be to find some other way to represent the error in an
|
||||
// error token, but I'm kinda unwilling to do that.
|
||||
//
|
||||
#[derive(Debug, PartialEq, Eq, Clone)]
|
||||
pub struct Token<'a> {
|
||||
pub struct Token {
|
||||
pub kind: TokenKind,
|
||||
pub start: usize,
|
||||
value: Result<&'a str, Box<str>>,
|
||||
start: usize,
|
||||
end: usize,
|
||||
error: Option<Box<str>>,
|
||||
}
|
||||
|
||||
impl<'a> Token<'a> {
|
||||
pub fn new(kind: TokenKind, start: usize, value: &'a str) -> Self {
|
||||
impl Token {
|
||||
pub fn new(kind: TokenKind, start: usize, end: usize) -> Self {
|
||||
Token {
|
||||
kind,
|
||||
start,
|
||||
value: Ok(value),
|
||||
end,
|
||||
error: None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn error(start: usize, message: String) -> Self {
|
||||
pub fn error(start: usize, end: usize, message: String) -> Self {
|
||||
Token {
|
||||
kind: TokenKind::Error,
|
||||
start,
|
||||
value: Err(message.into()),
|
||||
end,
|
||||
error: Some(message.into()),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn as_str<'b>(&'b self) -> &'a str
|
||||
pub fn start(&self) -> usize {
|
||||
self.start
|
||||
}
|
||||
|
||||
pub fn end(&self) -> usize {
|
||||
self.end
|
||||
}
|
||||
|
||||
pub fn len(&self) -> usize {
|
||||
self.end() - self.start()
|
||||
}
|
||||
|
||||
pub fn as_str<'a, 'b>(&'a self, source: &'b str) -> &'a str
|
||||
where
|
||||
'b: 'a,
|
||||
{
|
||||
match &self.value {
|
||||
Ok(v) => v,
|
||||
Err(e) => &e,
|
||||
if let Some(error) = &self.error {
|
||||
&error
|
||||
} else {
|
||||
&source[self.start()..self.end()]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> std::fmt::Display for Token<'a> {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}", self.as_str())
|
||||
}
|
||||
}
|
||||
|
||||
pub struct Lines {
|
||||
newlines: Vec<usize>,
|
||||
}
|
||||
|
|
@ -169,6 +177,10 @@ impl<'a> Tokens<'a> {
|
|||
result
|
||||
}
|
||||
|
||||
pub fn source(&self) -> &'a str {
|
||||
self.source
|
||||
}
|
||||
|
||||
pub fn lines(self) -> Lines {
|
||||
self.lines
|
||||
}
|
||||
|
|
@ -179,12 +191,11 @@ impl<'a> Tokens<'a> {
|
|||
self.lines.token_position(token)
|
||||
}
|
||||
|
||||
fn token(&self, start: usize, kind: TokenKind) -> Token<'a> {
|
||||
let value = &self.source[start..self.pos()];
|
||||
Token::new(kind, start, value)
|
||||
fn token(&self, start: usize, kind: TokenKind) -> Token {
|
||||
Token::new(kind, start, self.pos())
|
||||
}
|
||||
|
||||
fn number(&mut self, start: usize) -> Token<'a> {
|
||||
fn number(&mut self, start: usize) -> Token {
|
||||
// First, the main part.
|
||||
loop {
|
||||
if !self.matches_digit() {
|
||||
|
|
@ -225,6 +236,7 @@ impl<'a> Tokens<'a> {
|
|||
let slice = &self.source[start..self.pos()];
|
||||
return Token::error(
|
||||
start,
|
||||
self.pos(),
|
||||
format!("Invalid floating-point literal: {slice}"),
|
||||
);
|
||||
}
|
||||
|
|
@ -238,10 +250,14 @@ impl<'a> Tokens<'a> {
|
|||
self.token(start, TokenKind::Number)
|
||||
}
|
||||
|
||||
fn string(&mut self, start: usize, delimiter: char) -> Token<'a> {
|
||||
fn string(&mut self, start: usize, delimiter: char) -> Token {
|
||||
while !self.matches(delimiter) {
|
||||
if self.eof() {
|
||||
return Token::error(start, "Unterminated string constant".to_string());
|
||||
return Token::error(
|
||||
start,
|
||||
self.pos(),
|
||||
"Unterminated string constant".to_string(),
|
||||
);
|
||||
}
|
||||
if self.matches('\\') {
|
||||
self.advance();
|
||||
|
|
@ -259,6 +275,9 @@ impl<'a> Tokens<'a> {
|
|||
if ident == "and" {
|
||||
return TokenKind::And;
|
||||
}
|
||||
if ident == "as" {
|
||||
return TokenKind::As;
|
||||
}
|
||||
if ident == "async" {
|
||||
return TokenKind::Async;
|
||||
}
|
||||
|
|
@ -363,7 +382,7 @@ impl<'a> Tokens<'a> {
|
|||
TokenKind::Identifier
|
||||
}
|
||||
|
||||
fn identifier(&mut self, start: usize) -> Token<'a> {
|
||||
fn identifier(&mut self, start: usize) -> Token {
|
||||
loop {
|
||||
// TODO: Use unicode identifier classes instead
|
||||
if !self.matches_next(|c| c.is_ascii_alphanumeric() || c == '_') {
|
||||
|
|
@ -373,7 +392,7 @@ impl<'a> Tokens<'a> {
|
|||
|
||||
let ident = &self.source[start..self.pos()];
|
||||
let kind = Self::identifier_token_kind(ident);
|
||||
Token::new(kind, start, ident)
|
||||
Token::new(kind, start, self.pos())
|
||||
}
|
||||
|
||||
fn matches(&mut self, ch: char) -> bool {
|
||||
|
|
@ -420,7 +439,7 @@ impl<'a> Tokens<'a> {
|
|||
self.next_char.is_none()
|
||||
}
|
||||
|
||||
fn whitespace(&mut self, pos: usize) -> Token<'a> {
|
||||
fn whitespace(&mut self, pos: usize) -> Token {
|
||||
while let Some((pos, ch)) = self.next_char {
|
||||
if ch == '\n' {
|
||||
self.lines.add_line(pos);
|
||||
|
|
@ -432,7 +451,7 @@ impl<'a> Tokens<'a> {
|
|||
self.token(pos, TokenKind::Whitespace)
|
||||
}
|
||||
|
||||
fn comment(&mut self, pos: usize) -> Token<'a> {
|
||||
fn comment(&mut self, pos: usize) -> Token {
|
||||
while let Some((_, ch)) = self.next_char {
|
||||
if ch == '\n' {
|
||||
break;
|
||||
|
|
@ -442,7 +461,7 @@ impl<'a> Tokens<'a> {
|
|||
self.token(pos, TokenKind::Comment)
|
||||
}
|
||||
|
||||
pub fn next(&mut self) -> Token<'a> {
|
||||
pub fn next(&mut self) -> Token {
|
||||
let (pos, c) = match self.advance() {
|
||||
Some((p, c)) => (p, c),
|
||||
None => return self.token(self.source.len(), TokenKind::EOF),
|
||||
|
|
@ -516,7 +535,7 @@ impl<'a> Tokens<'a> {
|
|||
} else if c.is_ascii_alphabetic() || c == '_' {
|
||||
self.identifier(pos)
|
||||
} else {
|
||||
Token::error(pos, format!("Unexpected character '{c}'"))
|
||||
Token::error(pos, self.pos(), format!("Unexpected character '{c}'"))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -552,9 +571,9 @@ mod tests {
|
|||
|
||||
let mut expected: Vec<Token> = (vec![$($s),*])
|
||||
.into_iter()
|
||||
.map(|t| Token::new(t.1, t.0, t.2))
|
||||
.map(|t| Token::new(t.1, t.0, t.0 + t.2.len()))
|
||||
.collect();
|
||||
expected.push(Token::new(TokenKind::EOF, $input.len(), ""));
|
||||
expected.push(Token::new(TokenKind::EOF, $input.len(), $input.len()));
|
||||
|
||||
test_tokens_impl($input, expected);
|
||||
}
|
||||
|
|
@ -611,11 +630,12 @@ mod tests {
|
|||
|
||||
test_tokens!(
|
||||
more_more_keywords,
|
||||
"in is match _",
|
||||
"in is match _ as",
|
||||
(0, In, "in"),
|
||||
(3, Is, "is"),
|
||||
(6, Match, "match"),
|
||||
(12, Underscore, "_")
|
||||
(12, Underscore, "_"),
|
||||
(14, As, "as")
|
||||
);
|
||||
|
||||
test_tokens!(
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue