oden/fine/src/parser.rs

784 lines
20 KiB
Rust

// NOTE: much of this parser structure derived from
// https://matklad.github.io/2023/05/21/resilient-ll-parsing-tutorial.html
use crate::tokens::{Lines, Token, TokenKind, Tokens};
use std::fmt::Write as _;
use std::{cell::Cell, num::NonZeroU32};
/// Flat storage for every [`Tree`] produced by a parse.
///
/// Trees refer to each other through [`TreeRef`] handles, which are indices
/// into `trees`; indexing a `SyntaxTree` with a `TreeRef` yields the `Tree`.
pub struct SyntaxTree<'a> {
/// Every tree, in the order it was handed to `add_tree`.
trees: Vec<Tree<'a>>,
/// Handle of the top-level tree; `None` until `build_tree` assigns it.
root: Option<TreeRef>,
}
impl<'a> SyntaxTree<'a> {
    /// Creates an empty syntax tree: no trees and no root.
    pub fn new() -> Self {
        SyntaxTree {
            trees: Vec::new(),
            root: None,
        }
    }

    /// Returns the handle of the root tree, if one has been set.
    pub fn root(&self) -> Option<TreeRef> {
        self.root
    }

    /// Stores `t` and returns the handle that now refers to it.
    ///
    /// The tree's span is derived from its children: it starts where the
    /// first child starts and ends where the last child ends. A tree with no
    /// children gets the span `[0, 0)`. Every child tree of `t` has its
    /// `parent` set to the new handle.
    ///
    /// # Panics
    ///
    /// Panics if `t` already has a parent.
    pub fn add_tree(&mut self, mut t: Tree<'a>) -> TreeRef {
        assert!(t.parent.is_none());
        let handle = TreeRef::from_index(self.trees.len());

        let start = match t.children.first() {
            Some(child) => child.start_position(&*self),
            None => 0,
        };
        let end = match t.children.last() {
            Some(child) => child.end_position(&*self),
            None => start,
        };
        t.start_pos = start;
        t.end_pos = end;

        // NOTE: Because of the difficulty of holding multiple mutable
        // references, this is our best chance to patch up parent pointers.
        for child in &t.children {
            if let Child::Tree(child_ref) = child {
                self[*child_ref].parent = Some(handle);
            }
        }

        self.trees.push(t);
        handle
    }

    /// Renders the tree rooted at `root` as indented text, optionally
    /// annotating each node with its `[start, end)` span. Returns an empty
    /// string when there is no root.
    pub fn dump(&self, with_positions: bool) -> String {
        let mut rendered = String::new();
        match self.root {
            Some(root) => self[root].dump(self, with_positions, &mut rendered),
            None => {}
        }
        rendered
    }

    /// Returns the start position recorded for tree `t`.
    pub fn start_position(&self, t: TreeRef) -> usize {
        self[t].start_pos
    }

    /// Returns the (exclusive) end position recorded for tree `t`.
    pub fn end_position(&self, t: TreeRef) -> usize {
        self[t].end_pos
    }

    /// Iterates over the handles of every stored tree, in insertion order.
    pub fn trees(&self) -> impl Iterator<Item = TreeRef> {
        let count = self.trees.len();
        (0..count).map(TreeRef::from_index)
    }

    /// Finds the innermost tree whose `[start_pos, end_pos)` span contains
    /// `pos`.
    ///
    /// Returns `None` when there is no root or when `pos` lies outside the
    /// root's span. When several child trees of a node contain `pos`, the
    /// first one in child order is followed.
    pub fn find_tree_at(&self, pos: usize) -> Option<TreeRef> {
        let covers = |t: &Tree<'a>| t.start_pos <= pos && pos < t.end_pos;

        let mut current = self.root?;
        if !covers(&self[current]) {
            return None;
        }

        loop {
            let deeper = self[current]
                .children
                .iter()
                .find_map(|child| match child {
                    Child::Tree(next) if covers(&self[*next]) => Some(*next),
                    _ => None,
                });
            match deeper {
                Some(next) => current = next,
                // No child tree contains `pos`, so `current` is the innermost.
                None => return Some(current),
            }
        }
    }
}
impl<'a> std::ops::Index<TreeRef> for SyntaxTree<'a> {
    type Output = Tree<'a>;

    /// Borrows the tree that `index` refers to.
    ///
    /// Panics if `index` does not refer to a tree stored in this
    /// `SyntaxTree` (slice indexing is bounds-checked).
    fn index(&self, index: TreeRef) -> &Self::Output {
        let slot = index.index();
        &self.trees[slot]
    }
}
impl<'a> std::ops::IndexMut<TreeRef> for SyntaxTree<'a> {
    /// Mutably borrows the tree that `index` refers to.
    ///
    /// Panics if `index` does not refer to a tree stored in this
    /// `SyntaxTree` (slice indexing is bounds-checked).
    fn index_mut(&mut self, index: TreeRef) -> &mut Self::Output {
        let slot = index.index();
        &mut self.trees[slot]
    }
}
/// The syntactic category of a [`Tree`] node.
#[derive(Debug, Eq, PartialEq)]
pub enum TreeKind {
/// Placeholder kind of a tree that has been started but not yet ended, and
/// the kind of the trees created by `advance_with_error`.
Error,
/// The whole input; produced by `file`.
File,
/// A `fun` declaration; produced by `function`.
FunDecl,
/// A parenthesized parameter list; produced by `param_list`.
ParamList,
/// A single parameter (name, optional type, optional trailing comma);
/// produced by `parameter`.
Parameter,
/// A type expression (currently a single identifier); produced by
/// `type_expr`.
TypeExpression,
/// A brace-delimited sequence of statements; produced by `block`.
Block,
/// A `let` statement; produced by `statement_let`.
LetStatement,
/// A `return` statement; produced by `statement_return`.
ReturnStatement,
/// An expression used as a statement; produced by `statement_expression`.
ExpressionStatement,
/// A number, string, `true` or `false` token; produced by `literal`.
LiteralExpression,
/// A parenthesized expression; produced by `grouping`.
GroupingExpression,
/// A prefix operator applied to an operand; produced by `unary`.
UnaryExpression,
/// An `if` with optional `else`; produced by `conditional`.
ConditionalExpression,
/// A callee followed by an argument list; produced by
/// `expression_with_power`.
CallExpression,
/// A parenthesized argument list; produced by `argument_list`.
ArgumentList,
/// A single argument (expression, optional trailing comma); produced by
/// `argument`.
Argument,
/// Left operand, operator token, right operand; produced by
/// `expression_with_power`.
BinaryExpression,
/// A conditional appearing in statement position; produced by
/// `statement_if`.
IfStatement,
/// A bare identifier used as an expression; produced by `identifier`.
Identifier,
}
/// One node of the syntax tree: a kind plus an ordered list of children,
/// each of which is either a token or a reference to another tree.
pub struct Tree<'a> {
/// What this node represents.
pub kind: TreeKind,
/// The tree that lists this one as a child. `SyntaxTree::add_tree` sets it
/// when the parent is added; it must be `None` when this tree is added.
pub parent: Option<TreeRef>, // TODO: Do we actually need this?
/// Start position of the first child (0 if there are no children);
/// computed by `SyntaxTree::add_tree`.
pub start_pos: usize,
/// End position of the last child (`start_pos` if there are no children);
/// computed by `SyntaxTree::add_tree`.
pub end_pos: usize,
/// Tokens and sub-trees, in the order the parser recorded them.
pub children: Vec<Child<'a>>,
}
impl<'a> Tree<'a> {
    /// Returns the child at `index` if it exists and is a token.
    ///
    /// `index` counts all children (tokens and trees alike); `None` is
    /// returned when the index is out of range or the child is a tree.
    pub fn nth_token(&self, index: usize) -> Option<&Token<'a>> {
        match self.children.get(index)? {
            Child::Token(token) => Some(token),
            Child::Tree(_) => None,
        }
    }

    /// Returns the child at `index` if it exists and is a tree.
    ///
    /// `index` counts all children (tokens and trees alike); `None` is
    /// returned when the index is out of range or the child is a token.
    pub fn nth_tree(&self, index: usize) -> Option<TreeRef> {
        match self.children.get(index)? {
            Child::Tree(tree) => Some(*tree),
            Child::Token(_) => None,
        }
    }
}
/// Compact handle to a [`Tree`] stored in a `SyntaxTree`.
///
/// The zero-based index is stored one-based inside a `NonZeroU32`, so that
/// `Option<TreeRef>` needs no extra space for its `None` case.
#[derive(Copy, Clone, Eq, PartialEq, Hash, Debug)]
pub struct TreeRef(NonZeroU32);

impl TreeRef {
    /// Builds a handle from a zero-based index.
    ///
    /// # Panics
    ///
    /// Panics if `index + 1` does not fit in a `u32`.
    pub fn from_index(index: usize) -> TreeRef {
        let one_based = u32::try_from(index + 1).unwrap();
        TreeRef(NonZeroU32::new(one_based).unwrap())
    }

    /// Returns the zero-based index this handle was built from.
    pub fn index(&self) -> usize {
        let one_based = usize::try_from(self.0.get()).unwrap();
        one_based - 1
    }
}
impl<'a> Tree<'a> {
    /// Appends a textual rendering of this tree and all of its descendants
    /// to `output`.
    ///
    /// The first line holds this tree's kind (followed by its `[start, end)`
    /// span when `with_positions` is set); children follow, one per line,
    /// indented by two spaces per nesting level. `tree` is the `SyntaxTree`
    /// used to resolve child tree references.
    pub fn dump(&self, tree: &SyntaxTree<'a>, with_positions: bool, output: &mut String) {
        // Direct children of the dumped tree start at this indentation.
        const CHILD_INDENT: usize = 2;

        // Writing into a `String` cannot fail, so the results are discarded.
        let _ = write!(output, "{:?}", self.kind);
        if with_positions {
            let _ = write!(output, " [{}, {})", self.start_pos, self.end_pos);
        }
        let _ = write!(output, "\n");

        self.children
            .iter()
            .for_each(|child| child.dump_rec(CHILD_INDENT, tree, with_positions, output));
    }
}
/// One entry in a [`Tree`]'s child list.
pub enum Child<'a> {
/// A token stored directly in the tree.
Token(Token<'a>),
/// A reference to a sub-tree stored in the owning `SyntaxTree`.
Tree(TreeRef),
}
impl<'a> Child<'a> {
    /// Appends this child (and, for a tree, all of its descendants) to
    /// `output`, one node per line.
    ///
    /// Each line is prefixed with `indent` spaces; nested children are
    /// indented two further spaces. Tokens are rendered as
    /// `Kind:'"text"'`; trees as their kind. When `with_positions` is set
    /// the `[start, end)` span is appended to every line.
    fn dump_rec(
        &self,
        indent: usize,
        tree: &SyntaxTree<'a>,
        with_positions: bool,
        output: &mut String,
    ) {
        // Pad with `indent` spaces. Writing into a `String` cannot fail, so
        // the results are discarded throughout.
        let _ = write!(output, "{:width$}", "", width = indent);

        match self {
            Child::Token(token) => {
                let _ = write!(output, "{:?}:'{:?}'", token.kind, token.as_str());
                if with_positions {
                    let end = token.start + token.as_str().len();
                    let _ = write!(output, " [{}, {})", token.start, end);
                }
                let _ = write!(output, "\n");
            }
            Child::Tree(tree_ref) => {
                let subtree = &tree[*tree_ref];
                let _ = write!(output, "{:?}", subtree.kind);
                if with_positions {
                    let _ = write!(output, " [{}, {})", subtree.start_pos, subtree.end_pos);
                }
                let _ = write!(output, "\n");
                for nested in &subtree.children {
                    nested.dump_rec(indent + 2, tree, with_positions, output);
                }
            }
        }
    }

    /// Returns where this child starts: the token's `start`, or the
    /// referenced tree's `start_pos`.
    pub fn start_position(&self, syntax_tree: &SyntaxTree) -> usize {
        match self {
            Child::Token(token) => token.start,
            Child::Tree(tree_ref) => syntax_tree[*tree_ref].start_pos,
        }
    }

    /// Returns where this child ends (exclusive): the token's `start` plus
    /// the length of its text, or the referenced tree's `end_pos`.
    ///
    /// NOTE(review): for synthetic error tokens the text is presumably the
    /// error message rather than source text, which would make this end
    /// position overshoot -- confirm against `Token::error` / `as_str`.
    pub fn end_position(&self, syntax_tree: &SyntaxTree) -> usize {
        match self {
            Child::Token(token) => token.start + token.as_str().len(),
            Child::Tree(tree_ref) => syntax_tree[*tree_ref].end_pos,
        }
    }
}
/// One entry in the flat event log the parser records while it runs.
/// `CParser::build_tree` replays the log to construct the actual trees.
enum ParseEvent<'a> {
/// Opens a tree of the given kind; everything up to the matching `End`
/// becomes its children.
Start { kind: TreeKind },
/// Closes the most recently opened tree.
End,
/// Adds `token` as a child of the currently open tree. Also used to
/// record error tokens (see `CParser::error_at`).
Advance { token: Token<'a> },
}
/// Marker for a tree that has been started but not yet ended. It is consumed
/// by `CParser::end`.
struct MarkStarted {
/// Index in the event log of the tree's `ParseEvent::Start`.
index: usize,
}
/// Marker for a tree that has been ended. `CParser::start_before` consumes it
/// to wrap that tree in a new enclosing tree.
struct MarkClosed {
/// Index in the event log of the ended tree's `ParseEvent::Start`.
index: usize,
}
/// Parser state: the token source, one token of lookahead, and the event log
/// that `build_tree` later turns into a `SyntaxTree`.
struct CParser<'a> {
/// Token source; `next()` is called on it to fetch each token.
tokens: Tokens<'a>,
/// The token currently under the cursor. `skip_ephemera` moves it past
/// whitespace and comment tokens.
current: Token<'a>,
/// Number of `peek` calls still allowed before the "parser is stuck!"
/// assertion fires; `advance` resets it to 256. A `Cell` so that `peek`
/// can decrement it through `&self`.
fuel: Cell<u32>,
/// Flat log of start/end/advance events recorded so far.
events: Vec<ParseEvent<'a>>,
}
impl<'a> CParser<'a> {
    /// Creates a parser over `tokens`, positioned on the first token that is
    /// neither whitespace nor a comment.
    fn new(tokens: Tokens<'a>) -> Self {
        let mut parser = CParser {
            tokens,
            // Placeholder only; replaced by the first real token just below.
            current: Token::new(TokenKind::EOF, 0, ""),
            fuel: Cell::new(256),
            events: Vec::new(),
        };
        parser.current = parser.tokens.next();
        parser.skip_ephemera();
        parser
    }

    /// Opens a new tree at the current position in the event log.
    ///
    /// The tree is recorded with the placeholder kind `TreeKind::Error`
    /// until `end` supplies the real kind.
    fn start(&mut self) -> MarkStarted {
        let mark = MarkStarted {
            index: self.events.len(),
        };
        self.events.push(ParseEvent::Start {
            kind: TreeKind::Error,
        });
        mark
    }

    /// Closes the tree opened at `mark`, giving it its final `kind`.
    ///
    /// The returned marker can be passed to `start_before` to wrap the
    /// just-closed tree in a new parent.
    fn end(&mut self, mark: MarkStarted, kind: TreeKind) -> MarkClosed {
        self.events[mark.index] = ParseEvent::Start { kind };
        self.events.push(ParseEvent::End);
        MarkClosed { index: mark.index }
    }

    /// Opens a new tree that starts immediately before the already-closed
    /// tree at `mark`, so that the closed tree becomes its first child.
    fn start_before(&mut self, mark: MarkClosed) -> MarkStarted {
        // TODO: Point backwards and pointer chase in tree build?
        let mark = MarkStarted { index: mark.index };
        self.events.insert(
            mark.index,
            ParseEvent::Start {
                kind: TreeKind::Error,
            },
        );
        mark
    }

    /// Records the current token as a child of the open tree and moves on to
    /// the next token that is neither whitespace nor a comment.
    ///
    /// # Panics
    ///
    /// Panics if the parser is already at EOF.
    fn advance(&mut self) {
        assert!(!self.eof()); // Don't try to advance past EOF
        self.fuel.set(256); // Consuming a token, reset stuck detector
        self.events.push(ParseEvent::Advance {
            token: self.current.clone(),
        });
        self.current = self.tokens.next();
        self.skip_ephemera();
    }

    /// Skips whitespace and comment tokens without recording them.
    fn skip_ephemera(&mut self) {
        while self.current.kind == TokenKind::Whitespace || self.current.kind == TokenKind::Comment
        {
            self.current = self.tokens.next();
        }
    }

    /// Returns `true` when the current token is EOF.
    fn eof(&self) -> bool {
        self.current.kind == TokenKind::EOF
    }

    /// Returns the kind of the current token.
    ///
    /// Every call burns one unit of fuel; `advance` refills it.
    ///
    /// # Panics
    ///
    /// Panics with "parser is stuck!" when the fuel is exhausted, i.e. after
    /// 256 peeks without consuming a token.
    fn peek(&self) -> TokenKind {
        assert!(self.fuel.get() > 0, "parser is stuck!");
        self.fuel.set(self.fuel.get() - 1);
        self.current.kind
    }

    /// Returns `true` when the current token has the given `kind`.
    fn at(&self, kind: TokenKind) -> bool {
        self.peek() == kind
    }

    /// Consumes the current token if it has the given `kind`; reports
    /// whether it did.
    fn eat(&mut self, kind: TokenKind) -> bool {
        if self.at(kind) {
            self.advance();
            true
        } else {
            false
        }
    }

    /// Consumes a token of the given `kind`, or records `error` (without
    /// consuming anything) if the current token is of a different kind.
    fn expect<T>(&mut self, kind: TokenKind, error: T)
    where
        T: Into<String>,
    {
        if self.eat(kind) {
            return;
        }
        self.error(error);
    }

    /// Records `error` and wraps the offending token in a `TreeKind::Error`
    /// tree, consuming it so that the parser always makes progress.
    ///
    /// At EOF there is no token to consume, so only the error is recorded.
    /// (Unconditionally advancing here would trip the `!eof()` assertion in
    /// `advance` on truncated input such as `let x =`.)
    fn advance_with_error<T>(&mut self, error: T) -> MarkClosed
    where
        T: Into<String>,
    {
        let m = self.start();
        self.error(error);
        if !self.eof() {
            self.advance();
        }
        self.end(m, TreeKind::Error)
    }

    /// Records an error message located at the current token.
    fn error<T>(&mut self, message: T)
    where
        T: Into<String>,
    {
        self.error_at(self.current.clone(), message)
    }

    /// Records an error message located at `token`.
    ///
    /// The error is stored in the event log as a synthetic error token, so
    /// it ends up as a child of whichever tree is currently open. The text
    /// is `Error at end: <message>` for EOF and `Error at '<token>': <message>`
    /// for ordinary tokens.
    ///
    /// NOTE(review): for a token whose kind is `TokenKind::Error` no
    /// location is appended, which yields `Error : <message>` with a space
    /// before the colon -- probably unintended, left as-is to keep the
    /// emitted text stable.
    fn error_at<T>(&mut self, token: Token<'a>, message: T)
    where
        T: Into<String>,
    {
        let message: String = message.into();
        let mut final_message = "Error ".to_string();
        if token.kind == TokenKind::EOF {
            final_message.push_str("at end")
        } else if token.kind != TokenKind::Error {
            final_message.push_str("at '");
            final_message.push_str(token.as_str());
            final_message.push('\'');
        }
        final_message.push_str(": ");
        final_message.push_str(&message);
        self.events.push(ParseEvent::Advance {
            token: Token::error(token.start, final_message),
        });
    }

    /// Replays the recorded events into a `SyntaxTree` and returns it along
    /// with the line information obtained from the token source.
    ///
    /// # Panics
    ///
    /// Panics if the event log is not a single, properly bracketed tree:
    /// it must begin with a `Start`, finish with an `End`, and every
    /// `Start` must have a matching `End`.
    fn build_tree(self) -> (SyntaxTree<'a>, Lines) {
        let mut events = self.events;
        let mut stack = Vec::new();
        let mut result = SyntaxTree::new();

        // The first element in our events vector must be a start; the whole
        // thing must be bracketed in a tree.
        assert!(matches!(events.first(), Some(ParseEvent::Start { .. })));

        // The last element in our events vector must be an end, otherwise
        // the parser has failed badly. We remove it here so that, after
        // processing the entire array, the stack still holds the tree that
        // was opened by the very first ::Start.
        assert!(matches!(events.pop(), Some(ParseEvent::End)));

        for event in events {
            match event {
                ParseEvent::Start { kind } => stack.push(Tree {
                    kind,
                    parent: None,
                    start_pos: 0,
                    end_pos: 0,
                    children: Vec::new(),
                }),
                ParseEvent::End => {
                    // The finished tree becomes a child of the tree that was
                    // open around it.
                    let t = result.add_tree(stack.pop().unwrap());
                    stack.last_mut().unwrap().children.push(Child::Tree(t));
                }
                ParseEvent::Advance { token } => {
                    stack.last_mut().unwrap().children.push(Child::Token(token));
                }
            }
        }

        assert!(stack.len() == 1, "Not all trees were ended!");
        let root = result.add_tree(stack.pop().unwrap());
        result.root = Some(root);
        (result, self.tokens.lines())
    }
}
/// Parses `source` as a whole file.
///
/// Returns the resulting syntax tree together with the line information
/// obtained from the tokenizer. Syntax errors are recorded inside the tree
/// as error tokens rather than being returned separately.
pub fn parse(source: &str) -> (SyntaxTree, Lines) {
    let mut parser = CParser::new(Tokens::new(source));
    file(&mut parser);
    parser.build_tree()
}
/// Parses the entire input as a `TreeKind::File`: a sequence of function
/// declarations and statements, up to EOF.
fn file(p: &mut CParser) {
    let root = p.start();
    while !p.eof() {
        if p.at(TokenKind::Fun) {
            function(p)
        } else {
            statement(p)
        }
    }
    p.end(root, TreeKind::File);
}
/// Parses a function declaration into a `TreeKind::FunDecl`:
/// `fun` name, then an optional parameter list, an optional `->` return
/// type, and an optional body block.
///
/// Must be called with the parser positioned on `fun`.
fn function(p: &mut CParser) {
    assert!(p.at(TokenKind::Fun));
    let decl = p.start();

    p.expect(TokenKind::Fun, "expected a function to start with 'fun'");
    p.expect(TokenKind::Identifier, "expected a function name");

    if p.at(TokenKind::LeftParen) {
        param_list(p);
    }

    let has_return_type = p.eat(TokenKind::Arrow);
    if has_return_type {
        type_expr(p);
    }

    if p.at(TokenKind::LeftBrace) {
        block(p);
    }

    p.end(decl, TreeKind::FunDecl);
}
/// Parses a parenthesized parameter list into a `TreeKind::ParamList`.
///
/// Parameters are consumed for as long as the next token is an identifier;
/// anything else ends the list, and a missing `)` is reported as an error.
/// Must be called with the parser positioned on `(`.
fn param_list(p: &mut CParser) {
    assert!(p.at(TokenKind::LeftParen));
    let list = p.start();

    p.expect(TokenKind::LeftParen, "expect '(' to start a parameter list");
    while !p.at(TokenKind::RightParen) && !p.eof() && p.at(TokenKind::Identifier) {
        parameter(p);
    }
    p.expect(TokenKind::RightParen, "expect ')' to end a parameter list");

    p.end(list, TreeKind::ParamList);
}
/// Parses one parameter into a `TreeKind::Parameter`: a name, an optional
/// `: type`, and a comma unless the closing `)` follows immediately.
///
/// Must be called with the parser positioned on an identifier.
fn parameter(p: &mut CParser) {
    assert!(p.at(TokenKind::Identifier));
    let param = p.start();

    p.expect(
        TokenKind::Identifier,
        "expected an identifier for a parameter name",
    );

    let has_type = p.eat(TokenKind::Colon);
    if has_type {
        type_expr(p);
    }

    // The last parameter does not need a trailing comma.
    let is_last = p.at(TokenKind::RightParen);
    if !is_last {
        p.expect(TokenKind::Comma, "expected a comma between parameters");
    }

    p.end(param, TreeKind::Parameter);
}
/// Parses a type expression into a `TreeKind::TypeExpression`. For now a
/// type is just a single identifier.
fn type_expr(p: &mut CParser) {
    let ty = p.start();
    // TODO: Other kinds of type expressions probably!
    p.expect(TokenKind::Identifier, "expected the identifier of a type");
    p.end(ty, TreeKind::TypeExpression);
}
/// Parses a brace-delimited block into a `TreeKind::Block`: `{`, any number
/// of statements, then `}`.
///
/// Statements are parsed until a `}` or EOF is reached; a missing `}` is
/// reported as an error. Must be called with the parser positioned on `{`
/// (this is asserted).
fn block(p: &mut CParser) {
    assert!(p.at(TokenKind::LeftBrace));
    let m = p.start();
    p.expect(TokenKind::LeftBrace, "expect '{' to start a block");
    while !p.at(TokenKind::RightBrace) && !p.eof() {
        statement(p);
    }
    // This is the closing brace, so the message must say "end" (it used to
    // be a copy of the opening-brace message).
    p.expect(TokenKind::RightBrace, "expect '}' to end a block");
    p.end(m, TreeKind::Block);
}
/// Parses a single statement, choosing the statement form from the current
/// token: a block, `let`, `return`, `if`, or (for anything else) an
/// expression statement.
fn statement(p: &mut CParser) {
    let next = p.peek();
    match next {
        TokenKind::LeftBrace => block(p),
        TokenKind::Let => statement_let(p),
        TokenKind::Return => statement_return(p),
        // NOTE: Technically 'if' is an expression, but `if` doesn't
        // require a semicolon at the end if it's all by itself.
        TokenKind::If => statement_if(p),
        _ => statement_expression(p),
    }
}
/// Parses an `if` in statement position: a conditional wrapped in a
/// `TreeKind::IfStatement`, with no trailing semicolon required.
///
/// Must be called with the parser positioned on `if`.
fn statement_if(p: &mut CParser) {
    assert!(p.at(TokenKind::If));
    let stmt = p.start();
    conditional(p);
    p.end(stmt, TreeKind::IfStatement);
}
/// Parses `let name = expression;` into a `TreeKind::LetStatement`.
///
/// The terminating `;` may be omitted when the statement is immediately
/// followed by `}`. Must be called with the parser positioned on `let`.
fn statement_let(p: &mut CParser) {
    assert!(p.at(TokenKind::Let));
    let stmt = p.start();

    p.expect(TokenKind::Let, "expect 'let' to start a let statement");
    p.expect(TokenKind::Identifier, "expected a name for the variable");
    p.expect(TokenKind::Equal, "expected a '=' after the variable name");
    expression(p);

    let closes_block = p.at(TokenKind::RightBrace);
    if !closes_block {
        p.expect(TokenKind::Semicolon, "expect ';' to end a let statement");
    }

    p.end(stmt, TreeKind::LetStatement);
}
/// Parses `return expression;` into a `TreeKind::ReturnStatement`.
///
/// The terminating `;` may be omitted when the statement is immediately
/// followed by `}`. Must be called with the parser positioned on `return`.
fn statement_return(p: &mut CParser) {
    assert!(p.at(TokenKind::Return));
    let stmt = p.start();

    p.expect(
        TokenKind::Return,
        "expect 'return' to start a return statement",
    );
    expression(p);

    let closes_block = p.at(TokenKind::RightBrace);
    if !closes_block {
        p.expect(TokenKind::Semicolon, "expect ';' to end a return statement");
    }

    p.end(stmt, TreeKind::ReturnStatement);
}
/// Parses `expression;` into a `TreeKind::ExpressionStatement`.
///
/// The terminating `;` may be omitted when the statement is immediately
/// followed by `}`.
fn statement_expression(p: &mut CParser) {
    let stmt = p.start();
    expression(p);

    let closes_block = p.at(TokenKind::RightBrace);
    if !closes_block {
        p.expect(
            TokenKind::Semicolon,
            "expect ';' to end an expression statement",
        );
    }

    p.end(stmt, TreeKind::ExpressionStatement);
}
/// Parses a full expression, accepting operators of any binding power.
fn expression(p: &mut CParser) {
    // Zero is the lowest binding power, so no operator is rejected.
    const LOWEST_POWER: u8 = 0;
    expression_with_power(p, LOWEST_POWER)
}
// BINDING POWERS. When parsing expressions we only accept operators that
// meet a minimum binding power. (This is like "precedence", but I really
// don't like that terminology.)
//
// Larger numbers bind more tightly. `token_power` maps infix operator tokens
// to these values; `unary` parses its operand with `UNARY_POWER` as the
// minimum.
const ASSIGNMENT_POWER: u8 = 0; // =
const OR_POWER: u8 = 1; // or
const AND_POWER: u8 = 2; // and
const EQUALITY_POWER: u8 = 3; // == !=
const COMPARISON_POWER: u8 = 4; // < > <= >=
const TERM_POWER: u8 = 5; // + -
const FACTOR_POWER: u8 = 6; // * /
const UNARY_POWER: u8 = 7; // ! -
// const PRIMARY_POWER: u8 = 9;
/// Returns the binding power of `token` when it is used as an infix
/// operator, or `None` if the token is not an infix operator.
fn token_power<'a>(token: TokenKind) -> Option<u8> {
    let power = match token {
        TokenKind::Equal => ASSIGNMENT_POWER,
        TokenKind::Or => OR_POWER,
        TokenKind::And => AND_POWER,
        TokenKind::EqualEqual | TokenKind::BangEqual => EQUALITY_POWER,
        TokenKind::Less
        | TokenKind::Greater
        | TokenKind::GreaterEqual
        | TokenKind::LessEqual => COMPARISON_POWER,
        TokenKind::Plus | TokenKind::Minus => TERM_POWER,
        TokenKind::Star | TokenKind::Slash => FACTOR_POWER,
        _ => return None,
    };
    Some(power)
}
fn expression_with_power(p: &mut CParser, minimum_power: u8) {
let mut expr = prefix_expression(p);
while p.at(TokenKind::LeftParen) {
let m = p.start_before(expr);
argument_list(p);
expr = p.end(m, TreeKind::CallExpression);
}
loop {
let Some(power) = token_power(p.peek()) else {
break;
};
if power < minimum_power {
break;
}
// TODO: I don't think this works for other "infix" types, but we'll
// see won't we.
let m = p.start_before(expr);
p.advance(); // Consume the operator
expression_with_power(p, power);
expr = p.end(m, TreeKind::BinaryExpression);
}
}
/// Parses a parenthesized call argument list into a
/// `TreeKind::ArgumentList`.
///
/// Arguments are parsed until a `)` or EOF is reached; a missing `)` is
/// reported as an error. Must be called with the parser positioned on `(`
/// (this is asserted).
fn argument_list(p: &mut CParser) {
    assert!(p.at(TokenKind::LeftParen));
    let m = p.start();
    p.expect(
        TokenKind::LeftParen,
        "expect an argument list to start with '('",
    );
    while !p.at(TokenKind::RightParen) && !p.eof() {
        argument(p);
    }
    // This is the closing parenthesis, so report it as such (the message
    // used to be a copy of the opening-parenthesis one).
    p.expect(
        TokenKind::RightParen,
        "expect an argument list to end with ')'",
    );
    p.end(m, TreeKind::ArgumentList);
}
/// Parses one call argument into a `TreeKind::Argument`: an expression,
/// followed by a comma unless the closing `)` comes next.
fn argument(p: &mut CParser) {
    let arg = p.start();
    expression(p);

    // The last argument does not need a trailing comma.
    let is_last = p.at(TokenKind::RightParen);
    if !is_last {
        p.expect(TokenKind::Comma, "expect a ',' between arguments");
    }

    p.end(arg, TreeKind::Argument);
}
/// Parses the leading part of an expression, selected by the current token:
/// a literal, a parenthesized group, a unary operation, a conditional, or
/// an identifier.
///
/// Any other token is reported through `advance_with_error` with the
/// message "expected an expression". Returns the marker of the tree that
/// was produced.
fn prefix_expression(p: &mut CParser) -> MarkClosed {
    match p.peek() {
        TokenKind::Number | TokenKind::String | TokenKind::True | TokenKind::False => literal(p),
        TokenKind::LeftParen => grouping(p),
        TokenKind::Bang | TokenKind::Minus => unary(p),
        TokenKind::If => conditional(p),
        TokenKind::Identifier => identifier(p),
        _ => p.advance_with_error("expected an expression"),
    }
}
/// Consumes the current token as a `TreeKind::LiteralExpression`.
fn literal(p: &mut CParser) -> MarkClosed {
    let lit = p.start();
    p.advance();
    p.end(lit, TreeKind::LiteralExpression)
}
/// Parses `( expression )` into a `TreeKind::GroupingExpression`.
///
/// A missing `)` is reported as an error. Must be called with the parser
/// positioned on `(`.
fn grouping(p: &mut CParser) -> MarkClosed {
    assert!(p.at(TokenKind::LeftParen));
    let group = p.start();

    p.expect(TokenKind::LeftParen, "expected '(' to start grouping");
    expression(p);
    p.expect(TokenKind::RightParen, "unmatched parentheses in expression");

    p.end(group, TreeKind::GroupingExpression)
}
/// Parses a prefix operator and its operand into a
/// `TreeKind::UnaryExpression`.
///
/// The current token is consumed as the operator; the operand is parsed
/// with `UNARY_POWER` as the minimum binding power.
fn unary(p: &mut CParser) -> MarkClosed {
    let op = p.start();
    p.advance(); // Past the operator
    expression_with_power(p, UNARY_POWER);
    p.end(op, TreeKind::UnaryExpression)
}
/// Parses `if condition { ... }` with an optional `else { ... }` or
/// `else if ...` into a `TreeKind::ConditionalExpression`.
///
/// A missing block after the condition or after `else` is recorded as an
/// error rather than handed to `block`, which asserts that it starts on
/// `{` and would otherwise panic on input such as `if x;`.
///
/// Must be called with the parser positioned on `if` (this is asserted).
fn conditional(p: &mut CParser) -> MarkClosed {
    assert!(p.at(TokenKind::If));
    let m = p.start();
    p.expect(TokenKind::If, "expected conditional to start with 'if'");
    expression(p);

    if p.at(TokenKind::LeftBrace) {
        block(p);
    } else {
        p.error("expected a block after the condition of an 'if'");
    }

    if p.eat(TokenKind::Else) {
        if p.at(TokenKind::If) {
            // Don't require another block, just jump right into the conditional.
            conditional(p);
        } else if p.at(TokenKind::LeftBrace) {
            block(p);
        } else {
            p.error("expected a block or 'if' after 'else'");
        }
    }

    p.end(m, TreeKind::ConditionalExpression)
}
/// Consumes the current identifier token as a `TreeKind::Identifier`.
///
/// Must be called with the parser positioned on an identifier.
fn identifier(p: &mut CParser) -> MarkClosed {
    assert!(p.at(TokenKind::Identifier));
    let ident = p.start();
    p.advance();
    p.end(ident, TreeKind::Identifier)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// `Option<TreeRef>` must stay as small as a bare `u32`.
    #[test]
    fn tree_ref_size() {
        // What's the point of doing all that work if the tree ref isn't nice
        // and "small"?
        //
        // TODO: This is a dumb optimization because tokens are
        // huge so Child is huge no matter what we do. If we retain
        // tokens out of line then we can re-visit this optimization.
        let optional_ref_size = std::mem::size_of::<Option<TreeRef>>();
        assert_eq!(4, optional_ref_size);
    }
}