rkgk/crates/haku/src/parser.rs
Commit "syntax v2" (liquidev, 2595bf0d82, 2024-09-01): introduce a new, more ergonomic syntax for haku. Not all features are implemented just yet; still missing:

- custom tags (non-True/False)
- color literals
- lists

use core::cell::Cell;
use alloc::vec::Vec;
use crate::{
ast::{Ast, NodeAllocError, NodeId, NodeKind},
diagnostic::Diagnostic,
source::Span,
token::{Lexis, TokenKind, TokenKindSet},
};
#[derive(Debug, Clone, Copy)]
pub struct ParserLimits {
pub max_events: usize,
}
pub struct Parser<'a> {
tokens: &'a Lexis,
events: Vec<Event>,
position: u32,
fuel: Cell<u32>,
pub diagnostics: Vec<Diagnostic>,
}
#[derive(Debug)]
enum Event {
Open { kind: NodeKind },
Close,
Advance,
}
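
// The parser does not build a tree directly; it records a flat stream of the
// events above and replays it in `into_ast`: `Open` starts a node, `Advance`
// attaches the current token to the innermost open node, and `Close` finishes
// it. Roughly, the input `1 + 2` produces this stream (leaving out the
// surrounding `Toplevel` node emitted by `toplevel`):
//
//   Open(Binary)
//     Open(Number) Advance(`1`) Close
//     Open(Op)     Advance(`+`) Close
//     Open(Number) Advance(`2`) Close
//   Close
//
// `open_before` is what allows the `Binary` open to be inserted in front of
// the already-parsed left operand.
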
struct Open {
index: Option<usize>,
}
struct Closed {
index: Option<usize>,
}
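
// `Open` and `Closed` are markers pointing back into the event buffer. The
// index is an `Option` because the buffer is allocated with a fixed capacity
// (`ParserLimits::max_events`) and never grows: once it is full, `event`
// returns `None` and the matching `close`/`open_before` become no-ops, so an
// oversized program typically surfaces as a `NodeAllocError` from `into_ast`
// rather than as a reallocation here.
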
impl<'a> Parser<'a> {
const FUEL: u32 = 256;
pub fn new(input: &'a Lexis, limits: &ParserLimits) -> Self {
assert!(limits.max_events < u32::MAX as usize);
Self {
tokens: input,
events: Vec::with_capacity(limits.max_events),
position: 0,
diagnostics: Vec::with_capacity(16),
fuel: Cell::new(Self::FUEL),
}
}
fn event(&mut self, event: Event) -> Option<usize> {
if self.events.len() < self.events.capacity() {
let index = self.events.len();
self.events.push(event);
Some(index)
} else {
None
}
}
fn open(&mut self) -> Open {
Open {
index: self.event(Event::Open {
kind: NodeKind::Error,
}),
}
}
fn open_before(&mut self, closed: Closed) -> Open {
if let Some(index) = closed.index {
if self.events.len() < self.events.capacity() {
self.events.insert(
index,
Event::Open {
kind: NodeKind::Error,
},
);
return Open { index: Some(index) };
}
}
Open { index: None }
}
fn close(&mut self, open: Open, kind: NodeKind) -> Closed {
if let Some(index) = open.index {
self.events[index] = Event::Open { kind };
self.event(Event::Close);
Closed { index: Some(index) }
} else {
Closed { index: None }
}
}
fn is_eof(&self) -> bool {
self.peek() == TokenKind::Eof
}
fn advance(&mut self) {
if !self.is_eof() {
self.position += 1;
self.event(Event::Advance);
self.fuel.set(Self::FUEL);
}
}
#[track_caller]
fn peek(&self) -> TokenKind {
assert_ne!(self.fuel.get(), 0, "parser is stuck");
self.fuel.set(self.fuel.get() - 1);
self.tokens.kind(self.position)
}
fn span(&self) -> Span {
self.tokens.span(self.position)
}
fn emit(&mut self, diagnostic: Diagnostic) {
if self.diagnostics.len() < self.diagnostics.capacity() {
self.diagnostics.push(diagnostic);
}
}
fn advance_with_error(&mut self) -> Closed {
let opened = self.open();
self.advance();
self.close(opened, NodeKind::Error)
}
fn optional_newline(&mut self) -> bool {
if self.peek() == TokenKind::Newline {
self.advance();
true
} else {
false
}
}
pub fn into_ast(self, ast: &mut Ast) -> Result<(NodeId, Vec<Diagnostic>), NodeAllocError> {
let mut token = 0;
let mut events = self.events;
let mut stack = Vec::new();
struct StackEntry {
node_id: NodeId,
// TODO: This should probably be optimized to use a shared stack.
children: Vec<NodeId>,
}
        // Remove the last Close so that the root node stays on the stack;
        // we attach its children and extend its span manually after the loop.
assert!(matches!(events.pop(), Some(Event::Close)));
for event in events {
match event {
Event::Open { kind } => {
stack.push(StackEntry {
node_id: ast.alloc(kind, self.tokens.span(token))?,
children: Vec::new(),
});
}
Event::Close => {
let end_span = self.tokens.span(token.saturating_sub(1));
let stack_entry = stack.pop().unwrap();
ast.alloc_children(stack_entry.node_id, &stack_entry.children);
ast.extend_span(stack_entry.node_id, end_span.end);
stack.last_mut().unwrap().children.push(stack_entry.node_id);
}
Event::Advance => {
let span = self.tokens.span(token);
let node_id = ast.alloc(NodeKind::Token, span)?;
stack
.last_mut()
.expect("advance() may only be used in an open node")
.children
.push(node_id);
token += 1;
}
}
}
if stack.len() != 1 {
            // The event stream is unbalanced (an Open was left without a matching Close);
            // this happens when we run out of event capacity partway through a node.
return Err(NodeAllocError);
}
// assert_eq!(token, self.tokens.len());
let end_span = self.tokens.span(token.saturating_sub(1));
let stack_entry = stack.pop().unwrap();
ast.alloc_children(stack_entry.node_id, &stack_entry.children);
ast.extend_span(stack_entry.node_id, end_span.end);
Ok((stack_entry.node_id, self.diagnostics))
}
}
impl<'a> core::fmt::Debug for Parser<'a> {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
f.debug_struct("Parser")
.field("events", &self.events)
.finish_non_exhaustive()
}
}
enum Tighter {
Left,
Right,
}
fn tighter(left: TokenKind, right: TokenKind) -> Tighter {
fn tightness(kind: TokenKind) -> Option<usize> {
match kind {
TokenKind::Equal => Some(0),
TokenKind::EqualEqual
| TokenKind::NotEqual
| TokenKind::Less
| TokenKind::LessEqual
| TokenKind::Greater
| TokenKind::GreaterEqual => Some(1),
TokenKind::Plus | TokenKind::Minus => Some(2),
TokenKind::Star | TokenKind::Slash => Some(3),
_ if PREFIX_TOKENS.contains(kind) => Some(4),
_ => None,
}
}
let Some(right_tightness) = tightness(right) else {
return Tighter::Left;
};
let Some(left_tightness) = tightness(left) else {
assert!(left == TokenKind::Eof);
return Tighter::Right;
};
if right_tightness > left_tightness {
Tighter::Right
} else {
Tighter::Left
}
}
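
// `tighter` encodes the binding powers used by `precedence_parse` below: `=`
// binds loosest, then comparisons, then `+`/`-`, then `*`/`/`, and anything
// that can start an expression (i.e. a function call argument) binds tightest.
// Because the right-hand operator only wins with *strictly* greater tightness,
// operators on the same level associate to the left. Resulting groupings:
//
//   a = b + c * d   parses as   a = (b + (c * d))
//   1 - 2 - 3       parses as   (1 - 2) - 3
//   f x + 1         parses as   (f x) + 1
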
fn precedence_parse(p: &mut Parser, left: TokenKind) {
let mut lhs = prefix(p);
loop {
let right = p.peek();
match tighter(left, right) {
Tighter::Left => break,
Tighter::Right => {
let o = p.open_before(lhs);
let kind = infix(p, right);
lhs = p.close(o, kind);
}
}
}
}
fn one(p: &mut Parser, kind: NodeKind) -> Closed {
let o = p.open();
p.advance();
p.close(o, kind)
}
fn list(p: &mut Parser) -> Closed {
let o = p.open();
let lspan = p.span();
p.advance(); // [
p.optional_newline();
loop {
match p.peek() {
TokenKind::Eof => {
p.emit(Diagnostic::error(lspan, "missing `]` to close this list"));
break;
}
TokenKind::RBrack => {
p.advance();
break;
}
_ => (),
}
expr(p);
match p.peek() {
TokenKind::Comma | TokenKind::Newline => {
p.advance();
continue;
}
TokenKind::RBrack => {
p.advance();
break;
}
_ => {
let span = p.span();
p.emit(Diagnostic::error(
span,
"comma `,` or new line expected after list element",
));
p.advance_with_error();
}
}
}
p.close(o, NodeKind::List)
}
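
// `list` accepts square bracket literals whose elements are separated by
// commas or newlines; a newline right after `[` and a trailing separator
// before `]` are both allowed. Both of the following produce the same `List`
// node with three `Number` children:
//
//   [1, 2, 3]
//
//   [
//     1
//     2
//     3
//   ]
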
fn unary(p: &mut Parser) -> Closed {
let o = p.open();
let op = p.open();
p.advance();
p.close(op, NodeKind::Op);
prefix(p);
p.close(o, NodeKind::Unary)
}
fn paren(p: &mut Parser) -> Closed {
let o = p.open();
let lspan = p.span();
p.advance(); // (
if p.peek() == TokenKind::RParen {
p.advance(); // )
p.close(o, NodeKind::ParenEmpty)
} else {
p.optional_newline();
expr(p);
p.optional_newline();
        if p.peek() != TokenKind::RParen {
            p.emit(Diagnostic::error(lspan, "missing closing parenthesis `)`"));
            // Skip the offending token (if any), but still close the node so that
            // the event stream stays balanced.
            p.advance_with_error();
            p.close(o, NodeKind::Paren)
        } else {
            p.advance();
            p.close(o, NodeKind::Paren)
        }
}
}
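
// `paren` produces two different node kinds: `()` becomes `ParenEmpty`, while
// a parenthesized expression such as `(1 + 2)` becomes a `Paren` node wrapping
// the inner `Binary`. A newline is allowed right after `(` and before `)`.
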
fn param(p: &mut Parser) {
let o = p.open();
if let TokenKind::Ident | TokenKind::Underscore = p.peek() {
p.advance();
} else {
let span = p.span();
p.emit(Diagnostic::error(
span,
"parameter names must be identifiers or `_`",
));
p.advance_with_error();
}
p.close(o, NodeKind::Param);
}
fn lambda(p: &mut Parser) -> Closed {
let o = p.open();
p.advance(); // backslash
let params = p.open();
loop {
param(p);
match p.peek() {
TokenKind::Comma => {
p.advance();
continue;
}
TokenKind::RArrow => break,
_ => {
let span = p.span();
p.emit(Diagnostic::error(
span,
"`,` or `->` expected after function parameter",
));
p.advance_with_error();
break;
}
}
}
p.close(params, NodeKind::Params);
    // NOTE: The check below can be false if there are some stray tokens in place of the `->`.
    // We prefer to bail early and let the rest of the program parse.
if p.peek() == TokenKind::RArrow {
p.advance();
p.optional_newline();
expr(p);
}
p.close(o, NodeKind::Lambda)
}
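
// `lambda` parses backslash functions such as `\x, y -> x + y`, producing a
// `Lambda` node that contains a `Params` node (one `Param` per name) followed
// by the body expression. As written, at least one parameter name (or `_`) is
// required, and the body may start on the line after `->`.
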
fn if_expr(p: &mut Parser) -> Closed {
let o = p.open();
p.advance(); // if
if p.peek() != TokenKind::LParen {
let span = p.span();
p.emit(Diagnostic::error(
span,
"the condition in an `if` expression must be surrounded with parentheses",
));
        // NOTE: Don't advance; it's more likely the programmer expected no parentheses
        // to be needed, so whatever follows is treated as the condition directly.
    } else {
        p.advance(); // (
    }
expr(p); // Condition
if p.peek() != TokenKind::RParen {
let span = p.span();
p.emit(Diagnostic::error(
span,
"missing closing parenthesis after `if` condition",
));
}
p.advance();
p.optional_newline();
expr(p); // True branch
p.optional_newline();
if p.peek() != TokenKind::Else {
let span = p.span();
p.emit(Diagnostic::error(
span,
"`if` expression is missing an `else` clause",
));
}
p.advance();
p.optional_newline();
expr(p); // False branch
p.close(o, NodeKind::If)
}
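
// `if_expr` expects the shape `if (condition) true-branch else false-branch`,
// with optional newlines around the branches and after `else`, e.g.:
//
//   if (a > b)
//     a
//   else
//     b
//
// Missing parentheses or a missing `else` are reported as diagnostics, but
// parsing continues so that the rest of the program is still checked.
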
fn let_expr(p: &mut Parser) -> Closed {
let o = p.open();
p.advance(); // let
if p.peek() == TokenKind::Ident {
let ident = p.open();
p.advance();
p.close(ident, NodeKind::Ident);
} else {
let span = p.span();
p.emit(Diagnostic::error(span, "`let` variable name expected"));
p.advance_with_error();
}
if p.peek() == TokenKind::Equal {
p.advance();
} else {
let span = p.span();
p.emit(Diagnostic::error(span, "`=` expected after variable name"));
p.advance_with_error();
}
expr(p);
if p.peek() == TokenKind::Newline {
p.advance();
} else {
let span = p.span();
p.emit(Diagnostic::error(
span,
"new line expected after `let` expression",
));
p.advance_with_error();
}
expr(p);
p.close(o, NodeKind::Let)
}
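
// `let_expr` parses a binding followed by the expression it scopes over: the
// value comes after `=`, a newline ends it, and everything that follows
// becomes the body child of the `Let` node. Consecutive `let`s therefore nest
// to the right:
//
//   let x = 1
//   let y = 2
//   x + y
//
// parses as `Let(x, 1, Let(y, 2, x + y))`.
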
const PREFIX_TOKENS: TokenKindSet = TokenKindSet::new(&[
TokenKind::Ident,
TokenKind::Tag,
TokenKind::Number,
TokenKind::Color,
    // NOTE: This is ambiguous in function calls.
    // In that case, the infix operator takes precedence (because the `match` arms for the
    // infix op come first).
TokenKind::Minus,
TokenKind::Not,
TokenKind::LParen,
TokenKind::Backslash,
TokenKind::If,
TokenKind::Let,
TokenKind::LBrack,
]);
fn prefix(p: &mut Parser) -> Closed {
match p.peek() {
TokenKind::Ident => one(p, NodeKind::Ident),
TokenKind::Tag => one(p, NodeKind::Tag),
TokenKind::Number => one(p, NodeKind::Number),
TokenKind::Color => one(p, NodeKind::Color),
TokenKind::LBrack => list(p),
TokenKind::Minus | TokenKind::Not => unary(p),
TokenKind::LParen => paren(p),
TokenKind::Backslash => lambda(p),
TokenKind::If => if_expr(p),
TokenKind::Let => let_expr(p),
_ => {
assert!(
!PREFIX_TOKENS.contains(p.peek()),
"{:?} found in PREFIX_TOKENS",
p.peek()
);
let span = p.span();
p.emit(Diagnostic::error(
span,
"an expression was expected, but this token does not start one",
));
p.advance_with_error()
}
}
}
fn infix(p: &mut Parser, op: TokenKind) -> NodeKind {
match op {
TokenKind::Plus
| TokenKind::Minus
| TokenKind::Star
| TokenKind::Slash
| TokenKind::EqualEqual
| TokenKind::NotEqual
| TokenKind::Less
| TokenKind::LessEqual
| TokenKind::Greater
| TokenKind::GreaterEqual
| TokenKind::Equal => infix_binary(p, op),
_ if PREFIX_TOKENS.contains(op) => infix_call(p),
_ => panic!("unhandled infix operator {op:?}"),
}
}
fn infix_binary(p: &mut Parser, op: TokenKind) -> NodeKind {
let o = p.open();
p.advance();
p.close(o, NodeKind::Op);
    p.optional_newline();
precedence_parse(p, op);
NodeKind::Binary
}
fn infix_call(p: &mut Parser) -> NodeKind {
while PREFIX_TOKENS.contains(p.peek()) {
prefix(p);
}
NodeKind::Call
}
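
// `infix_call` implements juxtaposition-style function calls: any following
// tokens that could start an expression are collected as arguments, so `f x y`
// becomes a single `Call` node with `f`, `x`, and `y` as children. Arguments
// bind tighter than every binary operator (`f x + 1` is `(f x) + 1`), and, as
// the NOTE above `PREFIX_TOKENS` explains, `f -x` is parsed as the subtraction
// `f - x`.
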
pub fn expr(p: &mut Parser) {
precedence_parse(p, TokenKind::Eof)
}
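
// `expr` starts precedence parsing with `Eof` as the pretend operator to the
// left: `tighter` gives it no tightness, so any operator that can continue an
// expression wins against it, and any token that cannot ends the expression.
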
pub fn toplevel(p: &mut Parser) {
let o = p.open();
p.optional_newline();
while p.peek() != TokenKind::Eof {
expr(p);
match p.peek() {
TokenKind::Newline => {
p.advance();
continue;
}
TokenKind::Eof => break,
_ => {
let span = p.span();
p.emit(Diagnostic::error(
span,
"newline expected after toplevel expression",
))
}
}
}
p.close(o, NodeKind::Toplevel);
}
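
// A rough sketch of how a caller drives this parser. The lexer entry point and
// the `Ast` constructor used below (`lex`, `Ast::new`) are stand-ins for
// illustration and may not match the crate's actual API:
//
//   let lexis = lex(source)?;                                    // assumed
//   let mut parser = Parser::new(&lexis, &ParserLimits { max_events: 1024 });
//   toplevel(&mut parser);
//   let mut ast = Ast::new();                                    // assumed
//   let (root, diagnostics) = parser.into_ast(&mut ast)?;
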
#[cfg(test)]
mod tests;