Compare commits
No commits in common. "7abb8eafc2050cfc3020315c8ddba2b53aa07a98" and "f203da328b5e1af80360dafa1b5439600aa0cbdc" have entirely different histories.
7abb8eafc2
...
f203da328b
3 changed files with 79 additions and 125 deletions
|
|
@ -1,56 +1,34 @@
|
||||||
// NOTE: much of this parser structure derived from
|
// NOTE: much of this parser structure derived from
|
||||||
// https://matklad.github.io/2023/05/21/resilient-ll-parsing-tutorial.html
|
// https://matklad.github.io/2023/05/21/resilient-ll-parsing-tutorial.html
|
||||||
use crate::tokens::{Lines, Token, TokenKind, Tokens};
|
use crate::tokens::{Lines, Token, TokenKind, Tokens};
|
||||||
use std::{cell::Cell, num::NonZeroU32};
|
use std::cell::Cell;
|
||||||
|
|
||||||
pub struct ConcreteTree<'a> {
|
// BINDING POWERS. When parsing expressions we only accept expressions that
|
||||||
trees: Vec<Tree<'a>>,
|
// meet a minimum binding power. (This is like "precedence" but I just super
|
||||||
root: Option<TreeRef>,
|
// don't like that terminology.)
|
||||||
}
|
const ASSIGNMENT_POWER: u8 = 0; // =
|
||||||
|
const OR_POWER: u8 = 1; // or
|
||||||
|
const AND_POWER: u8 = 2; // and
|
||||||
|
const EQUALITY_POWER: u8 = 3; // == !=
|
||||||
|
const COMPARISON_POWER: u8 = 4; // < > <= >=
|
||||||
|
const TERM_POWER: u8 = 5; // + -
|
||||||
|
const FACTOR_POWER: u8 = 6; // * /
|
||||||
|
const UNARY_POWER: u8 = 7; // ! -
|
||||||
|
|
||||||
impl<'a> ConcreteTree<'a> {
|
// const PRIMARY_POWER: u8 = 9;
|
||||||
pub fn new() -> Self {
|
|
||||||
ConcreteTree {
|
|
||||||
trees: vec![],
|
|
||||||
root: None,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn add_tree(&mut self, t: Tree<'a>) -> TreeRef {
|
fn token_power<'a>(token: TokenKind) -> Option<u8> {
|
||||||
assert!(t.parent.is_none());
|
match token {
|
||||||
let tr = TreeRef::from_index(self.trees.len());
|
TokenKind::Equal => Some(ASSIGNMENT_POWER),
|
||||||
|
TokenKind::Or => Some(OR_POWER),
|
||||||
// NOTE: Because of the difficulty of holding multiple mutable
|
TokenKind::And => Some(AND_POWER),
|
||||||
// references, this is our best chance to patch up parent
|
TokenKind::EqualEqual | TokenKind::BangEqual => Some(EQUALITY_POWER),
|
||||||
// pointers.
|
TokenKind::Less | TokenKind::Greater | TokenKind::GreaterEqual | TokenKind::LessEqual => {
|
||||||
for child in t.children.iter() {
|
Some(COMPARISON_POWER)
|
||||||
if let Child::Tree(ct) = child {
|
|
||||||
self[*ct].parent = Some(tr);
|
|
||||||
}
|
}
|
||||||
}
|
TokenKind::Plus | TokenKind::Minus => Some(TERM_POWER),
|
||||||
self.trees.push(t);
|
TokenKind::Star | TokenKind::Slash => Some(FACTOR_POWER),
|
||||||
tr
|
_ => None,
|
||||||
}
|
|
||||||
|
|
||||||
pub fn dump(&self) -> String {
|
|
||||||
match self.root {
|
|
||||||
Some(r) => self[r].dump(self),
|
|
||||||
None => String::new(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<'a> std::ops::Index<TreeRef> for ConcreteTree<'a> {
|
|
||||||
type Output = Tree<'a>;
|
|
||||||
|
|
||||||
fn index(&self, index: TreeRef) -> &Self::Output {
|
|
||||||
&self.trees[index.index()]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<'a> std::ops::IndexMut<TreeRef> for ConcreteTree<'a> {
|
|
||||||
fn index_mut(&mut self, index: TreeRef) -> &mut Self::Output {
|
|
||||||
&mut self.trees[index.index()]
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -80,53 +58,52 @@ pub enum TreeKind {
|
||||||
|
|
||||||
pub struct Tree<'a> {
|
pub struct Tree<'a> {
|
||||||
pub kind: TreeKind,
|
pub kind: TreeKind,
|
||||||
pub parent: Option<TreeRef>,
|
// TODO: Indirect reference? Flatness? Using a reference structure will
|
||||||
|
// make caching and annotation easier if desired.
|
||||||
pub children: Vec<Child<'a>>,
|
pub children: Vec<Child<'a>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Copy, Clone, Eq, PartialEq)]
|
|
||||||
pub struct TreeRef(NonZeroU32);
|
|
||||||
|
|
||||||
impl TreeRef {
|
|
||||||
pub fn from_index(index: usize) -> TreeRef {
|
|
||||||
let index: u32 = (index + 1).try_into().unwrap();
|
|
||||||
TreeRef(NonZeroU32::new(index).unwrap())
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn index(&self) -> usize {
|
|
||||||
let index: usize = self.0.get().try_into().unwrap();
|
|
||||||
index - 1
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<'a> Tree<'a> {
|
impl<'a> Tree<'a> {
|
||||||
pub fn dump(&self, tree: &ConcreteTree<'a>) -> String {
|
pub fn dump(&self) -> String {
|
||||||
let mut output = String::new();
|
let mut output = String::new();
|
||||||
output.push_str(&format!("{:?}\n", self.kind));
|
output.push_str(&format!("{:?}\n", self.kind));
|
||||||
for child in self.children.iter() {
|
for child in self.children.iter() {
|
||||||
child.dump_rec(2, tree, &mut output);
|
child.dump_rec(2, &mut output);
|
||||||
}
|
}
|
||||||
output
|
output
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl<'a> std::fmt::Debug for Tree<'a> {
|
||||||
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
|
write!(f, "[{:?}", self.kind)?;
|
||||||
|
for child in self.children.iter() {
|
||||||
|
match child {
|
||||||
|
Child::Token(t) => write!(f, " {:?}:'{}'", t.kind, t.as_str())?,
|
||||||
|
Child::Tree(t) => write!(f, " {t:?}")?,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
write!(f, "]")?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pub enum Child<'a> {
|
pub enum Child<'a> {
|
||||||
Token(Token<'a>),
|
Token(Token<'a>),
|
||||||
Tree(TreeRef),
|
Tree(Tree<'a>),
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> Child<'a> {
|
impl<'a> Child<'a> {
|
||||||
fn dump_rec(&self, indent: usize, tree: &ConcreteTree<'a>, output: &mut String) {
|
fn dump_rec(&self, indent: usize, output: &mut String) {
|
||||||
for _ in 0..indent {
|
for _ in 0..indent {
|
||||||
output.push(' ');
|
output.push(' ');
|
||||||
}
|
}
|
||||||
match self {
|
match self {
|
||||||
Child::Token(t) => output.push_str(&format!("{:?}:'{:?}'\n", t.kind, t.as_str())),
|
Child::Token(t) => output.push_str(&format!("{:?}:'{:?}'\n", t.kind, t.as_str())),
|
||||||
Child::Tree(t) => {
|
Child::Tree(t) => {
|
||||||
let t = &tree[*t];
|
|
||||||
output.push_str(&format!("{:?}\n", t.kind));
|
output.push_str(&format!("{:?}\n", t.kind));
|
||||||
for child in t.children.iter() {
|
for child in t.children.iter() {
|
||||||
child.dump_rec(indent + 2, tree, output);
|
child.dump_rec(indent + 2, output);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -284,12 +261,10 @@ impl<'a> CParser<'a> {
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
fn build_tree(self) -> (ConcreteTree<'a>, Lines) {
|
fn build_tree(self) -> (Tree<'a>, Lines) {
|
||||||
let mut events = self.events;
|
let mut events = self.events;
|
||||||
let mut stack = Vec::new();
|
let mut stack = Vec::new();
|
||||||
|
|
||||||
let mut result = ConcreteTree::new();
|
|
||||||
|
|
||||||
// The first element in our events vector must be a start; the whole
|
// The first element in our events vector must be a start; the whole
|
||||||
// thing must be bracketed in a tree.
|
// thing must be bracketed in a tree.
|
||||||
assert!(matches!(events.get(0), Some(ParseEvent::Start { .. })));
|
assert!(matches!(events.get(0), Some(ParseEvent::Start { .. })));
|
||||||
|
|
@ -304,13 +279,12 @@ impl<'a> CParser<'a> {
|
||||||
match event {
|
match event {
|
||||||
ParseEvent::Start { kind } => stack.push(Tree {
|
ParseEvent::Start { kind } => stack.push(Tree {
|
||||||
kind,
|
kind,
|
||||||
parent: None,
|
|
||||||
children: Vec::new(),
|
children: Vec::new(),
|
||||||
}),
|
}),
|
||||||
|
|
||||||
ParseEvent::End => {
|
ParseEvent::End => {
|
||||||
let t = result.add_tree(stack.pop().unwrap());
|
let tree = stack.pop().unwrap();
|
||||||
stack.last_mut().unwrap().children.push(Child::Tree(t));
|
stack.last_mut().unwrap().children.push(Child::Tree(tree));
|
||||||
}
|
}
|
||||||
|
|
||||||
ParseEvent::Advance { token } => {
|
ParseEvent::Advance { token } => {
|
||||||
|
|
@ -320,14 +294,11 @@ impl<'a> CParser<'a> {
|
||||||
}
|
}
|
||||||
|
|
||||||
assert!(stack.len() == 1, "Not all trees were ended!");
|
assert!(stack.len() == 1, "Not all trees were ended!");
|
||||||
let root = result.add_tree(stack.pop().unwrap());
|
(stack.pop().unwrap(), self.tokens.lines())
|
||||||
result.root = Some(root);
|
|
||||||
|
|
||||||
(result, self.tokens.lines())
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn parse_concrete(source: &str) -> (ConcreteTree, Lines) {
|
pub fn parse_concrete(source: &str) -> (Tree, Lines) {
|
||||||
let tokens = Tokens::new(source);
|
let tokens = Tokens::new(source);
|
||||||
let mut parser = CParser::new(tokens);
|
let mut parser = CParser::new(tokens);
|
||||||
|
|
||||||
|
|
@ -492,35 +463,6 @@ fn expression(p: &mut CParser) {
|
||||||
expression_with_power(p, 0)
|
expression_with_power(p, 0)
|
||||||
}
|
}
|
||||||
|
|
||||||
// BINDING POWERS. When parsing expressions we only accept expressions that
|
|
||||||
// meet a minimum binding power. (This is like "precedence" but I just super
|
|
||||||
// don't like that terminology.)
|
|
||||||
const ASSIGNMENT_POWER: u8 = 0; // =
|
|
||||||
const OR_POWER: u8 = 1; // or
|
|
||||||
const AND_POWER: u8 = 2; // and
|
|
||||||
const EQUALITY_POWER: u8 = 3; // == !=
|
|
||||||
const COMPARISON_POWER: u8 = 4; // < > <= >=
|
|
||||||
const TERM_POWER: u8 = 5; // + -
|
|
||||||
const FACTOR_POWER: u8 = 6; // * /
|
|
||||||
const UNARY_POWER: u8 = 7; // ! -
|
|
||||||
|
|
||||||
// const PRIMARY_POWER: u8 = 9;
|
|
||||||
|
|
||||||
fn token_power<'a>(token: TokenKind) -> Option<u8> {
|
|
||||||
match token {
|
|
||||||
TokenKind::Equal => Some(ASSIGNMENT_POWER),
|
|
||||||
TokenKind::Or => Some(OR_POWER),
|
|
||||||
TokenKind::And => Some(AND_POWER),
|
|
||||||
TokenKind::EqualEqual | TokenKind::BangEqual => Some(EQUALITY_POWER),
|
|
||||||
TokenKind::Less | TokenKind::Greater | TokenKind::GreaterEqual | TokenKind::LessEqual => {
|
|
||||||
Some(COMPARISON_POWER)
|
|
||||||
}
|
|
||||||
TokenKind::Plus | TokenKind::Minus => Some(TERM_POWER),
|
|
||||||
TokenKind::Star | TokenKind::Slash => Some(FACTOR_POWER),
|
|
||||||
_ => None,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn expression_with_power(p: &mut CParser, minimum_power: u8) {
|
fn expression_with_power(p: &mut CParser, minimum_power: u8) {
|
||||||
let mut expr = prefix_expression(p);
|
let mut expr = prefix_expression(p);
|
||||||
while p.at(TokenKind::LeftParen) {
|
while p.at(TokenKind::LeftParen) {
|
||||||
|
|
@ -653,11 +595,30 @@ fn identifier(p: &mut CParser) -> MarkClosed {
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use super::*;
|
use super::*;
|
||||||
|
use pretty_assertions::assert_eq;
|
||||||
|
|
||||||
#[test]
|
fn test_successful_expression_parse(source: &str, expected: &str) {
|
||||||
fn tree_ref_size() {
|
let tokens = Tokens::new(source);
|
||||||
// What's the point of doing all that work if the tree ref isn't nice
|
let mut parser = CParser::new(tokens);
|
||||||
// and "small"?
|
|
||||||
assert_eq!(4, std::mem::size_of::<Option<TreeRef>>());
|
expression(&mut parser);
|
||||||
|
|
||||||
|
let (tree, _) = parser.build_tree();
|
||||||
|
assert_eq!(
|
||||||
|
expected,
|
||||||
|
format!("{tree:?}"),
|
||||||
|
"The parse structure of the expressions did not match"
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
macro_rules! test_expr {
|
||||||
|
($name:ident, $input:expr, $expected:expr) => {
|
||||||
|
#[test]
|
||||||
|
fn $name() {
|
||||||
|
test_successful_expression_parse($input, $expected);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
test_expr!(number_expr, "12", "[LiteralExpression Number:'12']");
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -58,18 +58,11 @@ pub enum TokenKind {
|
||||||
Yield,
|
Yield,
|
||||||
}
|
}
|
||||||
|
|
||||||
// NOTE: Tokens are kinda big (like 40 bytes?) and AFAICT the only way to go
|
|
||||||
// smaller would be to stop using string pointers and use smaller
|
|
||||||
// sizes/offsets instead, e.g., 32b for offset and 32b for size, and
|
|
||||||
// stop tracking the position independently from the start, and then
|
|
||||||
// require the source text when converting to line/col. I'm unwilling to
|
|
||||||
// give up the ergonomics of &str and String right now, so we're just
|
|
||||||
// not doing it.
|
|
||||||
#[derive(Debug, PartialEq, Eq, Clone)]
|
#[derive(Debug, PartialEq, Eq, Clone)]
|
||||||
pub struct Token<'a> {
|
pub struct Token<'a> {
|
||||||
pub kind: TokenKind,
|
pub kind: TokenKind,
|
||||||
pub start: usize,
|
pub start: usize,
|
||||||
value: Result<&'a str, Box<str>>,
|
value: Result<&'a str, String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> Token<'a> {
|
impl<'a> Token<'a> {
|
||||||
|
|
@ -85,7 +78,7 @@ impl<'a> Token<'a> {
|
||||||
Token {
|
Token {
|
||||||
kind: TokenKind::Error,
|
kind: TokenKind::Error,
|
||||||
start,
|
start,
|
||||||
value: Err(message.into()),
|
value: Err(message),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,4 @@
|
||||||
use fine::parser::concrete::ConcreteTree;
|
use fine::parser::concrete::Tree;
|
||||||
use pretty_assertions::assert_eq;
|
use pretty_assertions::assert_eq;
|
||||||
|
|
||||||
fn rebase_concrete(source_path: &str, dump: &str) {
|
fn rebase_concrete(source_path: &str, dump: &str) {
|
||||||
|
|
@ -68,7 +68,7 @@ fn rebase_concrete(source_path: &str, dump: &str) {
|
||||||
std::fs::write(source_path, result).expect("unable to write the new file!");
|
std::fs::write(source_path, result).expect("unable to write the new file!");
|
||||||
}
|
}
|
||||||
|
|
||||||
fn assert_concrete(tree: &ConcreteTree, expected: &str, source_path: &str) {
|
fn assert_concrete(tree: &Tree, expected: &str, source_path: &str) {
|
||||||
let dump = tree.dump();
|
let dump = tree.dump();
|
||||||
let rebase = std::env::var("FINE_TEST_REBASE")
|
let rebase = std::env::var("FINE_TEST_REBASE")
|
||||||
.unwrap_or(String::new())
|
.unwrap_or(String::new())
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue