[fine] Lifetime garbage, big refactor

So it turns out that I can't hold `&str` in a token because it makes it
impossible to encapsulate a source file in the larger context — self-
referential structure problems again. Everything gets rebuilt so that
the source can be passed through. While we're at it, more things
become Rc<> because, man..... life is too short.

Semantics in particular has become a giant hub of the module state: we
can basically just hold an Rc<Semantics> and have everything we could
possibly want to know about a source file, computed lazily if
necessary.
This commit is contained in:
John Doty 2024-02-11 09:31:51 -08:00
parent d5059dd450
commit 2dbdbb3957
7 changed files with 502 additions and 329 deletions

View file

@ -37,6 +37,7 @@ pub enum TokenKind {
Number,
And,
As,
Async,
Await,
Class,
@ -63,53 +64,60 @@ pub enum TokenKind {
}
// NOTE: Tokens are kinda big (like 40 bytes?) and AFAICT the only way to go
// smaller would be to stop using string pointers and use smaller
// sizes/offsets instead, e.g., 32b for offset and 32b for size, and
// stop tracking the position independently from the start, and then
// require the source text when converting to line/col. I'm unwilling to
// give up the ergonomics of &str and String right now, so we're just
// not doing it.
// smaller would be to find some other way to represent the error in an
// error token, but I'm kinda unwilling to do that.
//
#[derive(Debug, PartialEq, Eq, Clone)]
pub struct Token<'a> {
pub struct Token {
pub kind: TokenKind,
pub start: usize,
value: Result<&'a str, Box<str>>,
start: usize,
end: usize,
error: Option<Box<str>>,
}
impl<'a> Token<'a> {
pub fn new(kind: TokenKind, start: usize, value: &'a str) -> Self {
impl Token {
pub fn new(kind: TokenKind, start: usize, end: usize) -> Self {
Token {
kind,
start,
value: Ok(value),
end,
error: None,
}
}
pub fn error(start: usize, message: String) -> Self {
pub fn error(start: usize, end: usize, message: String) -> Self {
Token {
kind: TokenKind::Error,
start,
value: Err(message.into()),
end,
error: Some(message.into()),
}
}
pub fn as_str<'b>(&'b self) -> &'a str
pub fn start(&self) -> usize {
self.start
}
pub fn end(&self) -> usize {
self.end
}
pub fn len(&self) -> usize {
self.end() - self.start()
}
pub fn as_str<'a, 'b>(&'a self, source: &'b str) -> &'a str
where
'b: 'a,
{
match &self.value {
Ok(v) => v,
Err(e) => &e,
if let Some(error) = &self.error {
&error
} else {
&source[self.start()..self.end()]
}
}
}
impl<'a> std::fmt::Display for Token<'a> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.as_str())
}
}
pub struct Lines {
newlines: Vec<usize>,
}
@ -169,6 +177,10 @@ impl<'a> Tokens<'a> {
result
}
pub fn source(&self) -> &'a str {
self.source
}
pub fn lines(self) -> Lines {
self.lines
}
@ -179,12 +191,11 @@ impl<'a> Tokens<'a> {
self.lines.token_position(token)
}
fn token(&self, start: usize, kind: TokenKind) -> Token<'a> {
let value = &self.source[start..self.pos()];
Token::new(kind, start, value)
fn token(&self, start: usize, kind: TokenKind) -> Token {
Token::new(kind, start, self.pos())
}
fn number(&mut self, start: usize) -> Token<'a> {
fn number(&mut self, start: usize) -> Token {
// First, the main part.
loop {
if !self.matches_digit() {
@ -225,6 +236,7 @@ impl<'a> Tokens<'a> {
let slice = &self.source[start..self.pos()];
return Token::error(
start,
self.pos(),
format!("Invalid floating-point literal: {slice}"),
);
}
@ -238,10 +250,14 @@ impl<'a> Tokens<'a> {
self.token(start, TokenKind::Number)
}
fn string(&mut self, start: usize, delimiter: char) -> Token<'a> {
fn string(&mut self, start: usize, delimiter: char) -> Token {
while !self.matches(delimiter) {
if self.eof() {
return Token::error(start, "Unterminated string constant".to_string());
return Token::error(
start,
self.pos(),
"Unterminated string constant".to_string(),
);
}
if self.matches('\\') {
self.advance();
@ -259,6 +275,9 @@ impl<'a> Tokens<'a> {
if ident == "and" {
return TokenKind::And;
}
if ident == "as" {
return TokenKind::As;
}
if ident == "async" {
return TokenKind::Async;
}
@ -363,7 +382,7 @@ impl<'a> Tokens<'a> {
TokenKind::Identifier
}
fn identifier(&mut self, start: usize) -> Token<'a> {
fn identifier(&mut self, start: usize) -> Token {
loop {
// TODO: Use unicode identifier classes instead
if !self.matches_next(|c| c.is_ascii_alphanumeric() || c == '_') {
@ -373,7 +392,7 @@ impl<'a> Tokens<'a> {
let ident = &self.source[start..self.pos()];
let kind = Self::identifier_token_kind(ident);
Token::new(kind, start, ident)
Token::new(kind, start, self.pos())
}
fn matches(&mut self, ch: char) -> bool {
@ -420,7 +439,7 @@ impl<'a> Tokens<'a> {
self.next_char.is_none()
}
fn whitespace(&mut self, pos: usize) -> Token<'a> {
fn whitespace(&mut self, pos: usize) -> Token {
while let Some((pos, ch)) = self.next_char {
if ch == '\n' {
self.lines.add_line(pos);
@ -432,7 +451,7 @@ impl<'a> Tokens<'a> {
self.token(pos, TokenKind::Whitespace)
}
fn comment(&mut self, pos: usize) -> Token<'a> {
fn comment(&mut self, pos: usize) -> Token {
while let Some((_, ch)) = self.next_char {
if ch == '\n' {
break;
@ -442,7 +461,7 @@ impl<'a> Tokens<'a> {
self.token(pos, TokenKind::Comment)
}
pub fn next(&mut self) -> Token<'a> {
pub fn next(&mut self) -> Token {
let (pos, c) = match self.advance() {
Some((p, c)) => (p, c),
None => return self.token(self.source.len(), TokenKind::EOF),
@ -516,7 +535,7 @@ impl<'a> Tokens<'a> {
} else if c.is_ascii_alphabetic() || c == '_' {
self.identifier(pos)
} else {
Token::error(pos, format!("Unexpected character '{c}'"))
Token::error(pos, self.pos(), format!("Unexpected character '{c}'"))
}
}
}
@ -552,9 +571,9 @@ mod tests {
let mut expected: Vec<Token> = (vec![$($s),*])
.into_iter()
.map(|t| Token::new(t.1, t.0, t.2))
.map(|t| Token::new(t.1, t.0, t.0 + t.2.len()))
.collect();
expected.push(Token::new(TokenKind::EOF, $input.len(), ""));
expected.push(Token::new(TokenKind::EOF, $input.len(), $input.len()));
test_tokens_impl($input, expected);
}
@ -611,11 +630,12 @@ mod tests {
test_tokens!(
more_more_keywords,
"in is match _",
"in is match _ as",
(0, In, "in"),
(3, Is, "is"),
(6, Match, "match"),
(12, Underscore, "_")
(12, Underscore, "_"),
(14, As, "as")
);
test_tokens!(