[fine] Parent pointers in trees

This commit is contained in:
John Doty 2024-01-05 11:10:38 -08:00
parent 4f3536ea50
commit 7abb8eafc2
3 changed files with 132 additions and 55 deletions

View file

@ -1,34 +1,56 @@
// NOTE: much of this parser structure derived from
// https://matklad.github.io/2023/05/21/resilient-ll-parsing-tutorial.html
use crate::tokens::{Lines, Token, TokenKind, Tokens};
use std::cell::Cell;
use std::{cell::Cell, num::NonZeroU32};
// BINDING POWERS. When parsing expressions we only accept expressions that
// meet a minimum binding power. (This is like "precedence" but I just super
// don't like that terminology.)
const ASSIGNMENT_POWER: u8 = 0; // =
const OR_POWER: u8 = 1; // or
const AND_POWER: u8 = 2; // and
const EQUALITY_POWER: u8 = 3; // == !=
const COMPARISON_POWER: u8 = 4; // < > <= >=
const TERM_POWER: u8 = 5; // + -
const FACTOR_POWER: u8 = 6; // * /
const UNARY_POWER: u8 = 7; // ! -
/// Flat storage for a concrete syntax tree: every `Tree` node lives in one
/// vector and is addressed by a `TreeRef` instead of being nested by value.
pub struct ConcreteTree<'a> {
// Every node, in the order it was handed to `add_tree`; a `TreeRef`
// resolves to a position in this vector.
trees: Vec<Tree<'a>>,
// The top-most node. `None` for a freshly created tree (see `new`).
root: Option<TreeRef>,
}
// const PRIMARY_POWER: u8 = 9;
fn token_power<'a>(token: TokenKind) -> Option<u8> {
match token {
TokenKind::Equal => Some(ASSIGNMENT_POWER),
TokenKind::Or => Some(OR_POWER),
TokenKind::And => Some(AND_POWER),
TokenKind::EqualEqual | TokenKind::BangEqual => Some(EQUALITY_POWER),
TokenKind::Less | TokenKind::Greater | TokenKind::GreaterEqual | TokenKind::LessEqual => {
Some(COMPARISON_POWER)
impl<'a> ConcreteTree<'a> {
pub fn new() -> Self {
ConcreteTree {
trees: vec![],
root: None,
}
TokenKind::Plus | TokenKind::Minus => Some(TERM_POWER),
TokenKind::Star | TokenKind::Slash => Some(FACTOR_POWER),
_ => None,
}
/// Stores `t` as a new node and returns the reference that now names it.
///
/// Every subtree already listed in `t.children` gets its `parent` set to
/// the returned reference.
///
/// # Panics
///
/// Panics if `t` already has a parent.
pub fn add_tree(&mut self, t: Tree<'a>) -> TreeRef {
    assert!(t.parent.is_none());
    let new_ref = TreeRef::from_index(self.trees.len());

    // NOTE: Because holding multiple mutable references at once is
    //       difficult, this is our best chance to patch up the parent
    //       pointers of the children.
    let subtrees = t.children.iter().filter_map(|child| match child {
        Child::Tree(child_ref) => Some(*child_ref),
        Child::Token(_) => None,
    });
    for child_ref in subtrees {
        self[child_ref].parent = Some(new_ref);
    }

    self.trees.push(t);
    new_ref
}
/// Renders the tree as text starting from the root node. Returns an empty
/// string when no root has been set.
pub fn dump(&self) -> String {
    self.root
        .map_or_else(String::new, |root| self[root].dump(self))
}
}
/// Read-only node lookup: `tree[r]` yields the `Tree` that `r` refers to.
/// Panics if `r` does not name a stored node.
impl<'a> std::ops::Index<TreeRef> for ConcreteTree<'a> {
    type Output = Tree<'a>;

    fn index(&self, index: TreeRef) -> &Self::Output {
        let slot = index.index();
        &self.trees[slot]
    }
}
/// Mutable node lookup: `tree[r]` yields the `Tree` that `r` refers to.
/// Panics if `r` does not name a stored node.
impl<'a> std::ops::IndexMut<TreeRef> for ConcreteTree<'a> {
    fn index_mut(&mut self, index: TreeRef) -> &mut Self::Output {
        let slot = index.index();
        &mut self.trees[slot]
    }
}
@ -58,52 +80,53 @@ pub enum TreeKind {
/// One node of the concrete syntax tree.
pub struct Tree<'a> {
/// The kind of syntax this node represents.
pub kind: TreeKind,
// TODO: Indirect reference? Flatness? Using a reference structure will
// make caching and annotation easier if desired.
/// The node that lists this one among its children. It stays `None` until
/// that parent is stored with `ConcreteTree::add_tree`, which fills it in.
pub parent: Option<TreeRef>,
/// The tokens and subtrees that make up this node.
pub children: Vec<Child<'a>>,
}
/// A compact handle naming one node inside a `ConcreteTree`.
///
/// The wrapped value is the node's position plus one, so it is never zero
/// and `Option<TreeRef>` takes no more space than a plain `u32`.
#[derive(Copy, Clone, Eq, PartialEq)]
pub struct TreeRef(NonZeroU32);

impl TreeRef {
    /// Builds the handle for the node stored at position `index`.
    ///
    /// # Panics
    ///
    /// Panics if `index + 1` does not fit in a `u32`.
    pub fn from_index(index: usize) -> TreeRef {
        let biased = u32::try_from(index + 1).unwrap();
        TreeRef(NonZeroU32::new(biased).unwrap())
    }

    /// Returns the zero-based position this handle refers to.
    pub fn index(&self) -> usize {
        // Undo the +1 bias applied in `from_index`.
        usize::try_from(self.0.get()).unwrap() - 1
    }
}
// Text rendering for a single node. `dump` writes this node's kind on the
// first line, then asks each child to render itself with an indent of two
// spaces, and returns the accumulated text.
impl<'a> Tree<'a> {
pub fn dump(&self) -> String {
pub fn dump(&self, tree: &ConcreteTree<'a>) -> String {
let mut output = String::new();
output.push_str(&format!("{:?}\n", self.kind));
for child in self.children.iter() {
child.dump_rec(2, &mut output);
child.dump_rec(2, tree, &mut output);
}
output
}
}
/// Single-line bracketed rendering: `[Kind child child ...]`, where a token
/// child prints as `Kind:'text'` and a subtree child prints via its own
/// `Debug` output.
impl<'a> std::fmt::Debug for Tree<'a> {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "[{:?}", self.kind)?;
        for child in &self.children {
            match child {
                Child::Token(token) => write!(f, " {:?}:'{}'", token.kind, token.as_str())?,
                Child::Tree(subtree) => write!(f, " {subtree:?}")?,
            }
        }
        write!(f, "]")
    }
}
/// One entry in a `Tree`'s `children` list: either a single token or a
/// nested subtree.
pub enum Child<'a> {
Token(Token<'a>),
Tree(Tree<'a>),
Tree(TreeRef),
}
impl<'a> Child<'a> {
fn dump_rec(&self, indent: usize, output: &mut String) {
fn dump_rec(&self, indent: usize, tree: &ConcreteTree<'a>, output: &mut String) {
for _ in 0..indent {
output.push(' ');
}
match self {
Child::Token(t) => output.push_str(&format!("{:?}:'{:?}'\n", t.kind, t.as_str())),
Child::Tree(t) => {
let t = &tree[*t];
output.push_str(&format!("{:?}\n", t.kind));
for child in t.children.iter() {
child.dump_rec(indent + 2, output);
child.dump_rec(indent + 2, tree, output);
}
}
}
@ -261,10 +284,12 @@ impl<'a> CParser<'a> {
});
}
fn build_tree(self) -> (Tree<'a>, Lines) {
fn build_tree(self) -> (ConcreteTree<'a>, Lines) {
let mut events = self.events;
let mut stack = Vec::new();
let mut result = ConcreteTree::new();
// The first element in our events vector must be a start; the whole
// thing must be bracketed in a tree.
assert!(matches!(events.get(0), Some(ParseEvent::Start { .. })));
@ -279,12 +304,13 @@ impl<'a> CParser<'a> {
match event {
ParseEvent::Start { kind } => stack.push(Tree {
kind,
parent: None,
children: Vec::new(),
}),
ParseEvent::End => {
let tree = stack.pop().unwrap();
stack.last_mut().unwrap().children.push(Child::Tree(tree));
let t = result.add_tree(stack.pop().unwrap());
stack.last_mut().unwrap().children.push(Child::Tree(t));
}
ParseEvent::Advance { token } => {
@ -294,11 +320,14 @@ impl<'a> CParser<'a> {
}
assert!(stack.len() == 1, "Not all trees were ended!");
(stack.pop().unwrap(), self.tokens.lines())
let root = result.add_tree(stack.pop().unwrap());
result.root = Some(root);
(result, self.tokens.lines())
}
}
pub fn parse_concrete(source: &str) -> (Tree, Lines) {
pub fn parse_concrete(source: &str) -> (ConcreteTree, Lines) {
let tokens = Tokens::new(source);
let mut parser = CParser::new(tokens);
@ -463,6 +492,35 @@ fn expression(p: &mut CParser) {
expression_with_power(p, 0)
}
// BINDING POWERS. When parsing expressions we only accept expressions that
// meet a minimum binding power. (This is like "precedence" but I just super
// don't like that terminology.)
//
// NOTE(review): the values rise from assignment up to unary; presumably a
// larger value binds more tightly -- confirm against `expression_with_power`.
const ASSIGNMENT_POWER: u8 = 0; // =
const OR_POWER: u8 = 1; // or
const AND_POWER: u8 = 2; // and
const EQUALITY_POWER: u8 = 3; // == !=
const COMPARISON_POWER: u8 = 4; // < > <= >=
const TERM_POWER: u8 = 5; // + -
const FACTOR_POWER: u8 = 6; // * /
const UNARY_POWER: u8 = 7; // ! -
// const PRIMARY_POWER: u8 = 9;
/// Looks up the binding power associated with an operator token.
///
/// Returns `Some(power)` for the operator kinds listed in the match below
/// (`=`, `or`, `and`, equality, comparison, `+`/`-`, `*`/`/`) and `None` for
/// every other token kind.
///
/// The function previously declared a lifetime parameter that nothing in the
/// signature or body used; it has been dropped.
fn token_power(token: TokenKind) -> Option<u8> {
    match token {
        TokenKind::Equal => Some(ASSIGNMENT_POWER),
        TokenKind::Or => Some(OR_POWER),
        TokenKind::And => Some(AND_POWER),
        TokenKind::EqualEqual | TokenKind::BangEqual => Some(EQUALITY_POWER),
        TokenKind::Less | TokenKind::Greater | TokenKind::GreaterEqual | TokenKind::LessEqual => {
            Some(COMPARISON_POWER)
        }
        TokenKind::Plus | TokenKind::Minus => Some(TERM_POWER),
        TokenKind::Star | TokenKind::Slash => Some(FACTOR_POWER),
        // Anything else has no binding power in this table.
        _ => None,
    }
}
fn expression_with_power(p: &mut CParser, minimum_power: u8) {
let mut expr = prefix_expression(p);
while p.at(TokenKind::LeftParen) {
@ -591,3 +649,15 @@ fn identifier(p: &mut CParser) -> MarkClosed {
p.end(m, TreeKind::Identifier)
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::mem::size_of;

    /// Guards the space saving that motivates `TreeRef`: an optional
    /// reference must occupy exactly four bytes.
    #[test]
    fn tree_ref_size() {
        // What's the point of doing all that work if the tree ref isn't nice
        // and "small"?
        let optional_ref_bytes = size_of::<Option<TreeRef>>();
        assert_eq!(4, optional_ref_bytes);
    }
}

View file

@ -58,11 +58,18 @@ pub enum TokenKind {
Yield,
}
// NOTE: Tokens are kinda big (like 40 bytes?) and AFAICT the only way to go
// smaller would be to stop using string pointers and use smaller
// sizes/offsets instead, e.g., 32b for offset and 32b for size, and
// stop tracking the position independently from the start, and then
// require the source text when converting to line/col. I'm unwilling to
// give up the ergonomics of &str and String right now, so we're just
// not doing it.
#[derive(Debug, PartialEq, Eq, Clone)]
pub struct Token<'a> {
pub kind: TokenKind,
pub start: usize,
value: Result<&'a str, String>,
value: Result<&'a str, Box<str>>,
}
impl<'a> Token<'a> {
@ -78,7 +85,7 @@ impl<'a> Token<'a> {
Token {
kind: TokenKind::Error,
start,
value: Err(message),
value: Err(message.into()),
}
}

View file

@ -1,4 +1,4 @@
use fine::parser::concrete::Tree;
use fine::parser::concrete::ConcreteTree;
use pretty_assertions::assert_eq;
fn rebase_concrete(source_path: &str, dump: &str) {
@ -68,7 +68,7 @@ fn rebase_concrete(source_path: &str, dump: &str) {
std::fs::write(source_path, result).expect("unable to write the new file!");
}
fn assert_concrete(tree: &Tree, expected: &str, source_path: &str) {
fn assert_concrete(tree: &ConcreteTree, expected: &str, source_path: &str) {
let dump = tree.dump();
let rebase = std::env::var("FINE_TEST_REBASE")
.unwrap_or(String::new())