diff --git a/crates/haku-cli/src/main.rs b/crates/haku-cli/src/main.rs
index 5c506c0..d1c39d9 100644
--- a/crates/haku-cli/src/main.rs
+++ b/crates/haku-cli/src/main.rs
@@ -1,71 +1,31 @@
-// NOTE: This is a very bad CLI.
-// Sorry!
+// NOTE: This is a very bad CLI. I only use it for debugging haku with LLDB.
+// Sorry that it doesn't actually do anything!
 
 use std::{error::Error, fmt::Display, io::BufRead};
 
 use haku::{
-    bytecode::{Chunk, Defs},
-    compiler::{compile_expr, Compiler, Source},
-    sexp::{parse_toplevel, Ast, Parser, SourceCode},
-    system::System,
-    value::{BytecodeLoc, Closure, FunctionName, Ref, Value},
-    vm::{Vm, VmLimits},
+    ast::{dump::dump, Ast},
+    lexer::{lex, Lexer},
+    parser::{expr, Parser, ParserLimits},
+    source::SourceCode,
+    token::Lexis,
+    value::Value,
 };
 
 fn eval(code: &str) -> Result<Value, Box<dyn Error>> {
-    let mut system = System::new(1);
-
-    let ast = Ast::new(1024);
     let code = SourceCode::unlimited_len(code);
-    let mut parser = Parser::new(ast, code);
-    let root = parse_toplevel(&mut parser);
-    let ast = parser.ast;
-    let src = Source {
-        code,
-        ast: &ast,
-        system: &system,
-    };
+    let mut lexer = Lexer::new(Lexis::new(1024), code);
+    lex(&mut lexer).expect("too many tokens");
 
-    let mut defs = Defs::new(256);
-    let mut chunk = Chunk::new(65536).unwrap();
-    let mut compiler = Compiler::new(&mut defs, &mut chunk);
-    compile_expr(&mut compiler, &src, root)?;
-    let diagnostics = compiler.diagnostics;
-    let defs = compiler.defs;
-    println!("{chunk:?}");
+    let mut parser = Parser::new(&lexer.lexis, &ParserLimits { max_events: 1024 });
+    expr(&mut parser);
 
-    for diagnostic in &diagnostics {
-        eprintln!(
-            "{}..{}: {}",
-            diagnostic.span.start, diagnostic.span.end, diagnostic.message
-        );
-    }
+    let mut ast = Ast::new(1024);
+    let (root, _) = parser.into_ast(&mut ast).unwrap();
 
-    if !diagnostics.is_empty() {
-        return Err(Box::new(DiagnosticsEmitted));
-    }
+    eprintln!("{}", dump(&ast, root, Some(code)));
 
-    let mut vm = Vm::new(
-        defs,
-        &VmLimits {
-            stack_capacity: 256,
-            call_stack_capacity: 256,
-            ref_capacity: 256,
-            fuel: 32768,
-            memory: 1024,
-        },
-    );
-    let chunk_id = system.add_chunk(chunk)?;
-    let closure = vm.create_ref(Ref::Closure(Closure {
-        start: BytecodeLoc {
-            chunk_id,
-            offset: 0,
-        },
-        name: FunctionName::Anonymous,
-        param_count: 0,
-        captures: Vec::new(),
-    }))?;
-    Ok(vm.run(&system, closure)?)
+ Ok(Value::Nil) } #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] diff --git a/crates/haku-wasm/src/lib.rs b/crates/haku-wasm/src/lib.rs index 5f37ab5..4562f94 100644 --- a/crates/haku-wasm/src/lib.rs +++ b/crates/haku-wasm/src/lib.rs @@ -2,18 +2,23 @@ extern crate alloc; -use core::{alloc::Layout, slice}; +use core::{alloc::Layout, num::Saturating, slice}; use alloc::{boxed::Box, vec::Vec}; use haku::{ + ast::Ast, bytecode::{Chunk, Defs, DefsImage}, - compiler::{compile_expr, CompileError, Compiler, Diagnostic, Source}, + compiler::{compile_expr, CompileError, Compiler, Source}, + diagnostic::Diagnostic, + lexer::{lex, Lexer}, + parser::{self, Parser}, render::{ tiny_skia::{Pixmap, PremultipliedColorU8}, Renderer, RendererLimits, }, - sexp::{parse_toplevel, Ast, Parser, SourceCode}, + source::SourceCode, system::{ChunkId, System, SystemImage}, + token::Lexis, value::{BytecodeLoc, Closure, FunctionName, Ref, Value}, vm::{Exception, Vm, VmImage, VmLimits}, }; @@ -41,6 +46,8 @@ struct Limits { max_source_code_len: usize, max_chunks: usize, max_defs: usize, + max_tokens: usize, + max_parser_events: usize, ast_capacity: usize, chunk_capacity: usize, stack_capacity: usize, @@ -58,6 +65,8 @@ impl Default for Limits { max_source_code_len: 65536, max_chunks: 2, max_defs: 256, + max_tokens: 1024, + max_parser_events: 1024, ast_capacity: 1024, chunk_capacity: 65536, stack_capacity: 1024, @@ -101,6 +110,8 @@ macro_rules! limit_setter { limit_setter!(max_source_code_len); limit_setter!(max_chunks); limit_setter!(max_defs); +limit_setter!(max_tokens); +limit_setter!(max_parser_events); limit_setter!(ast_capacity); limit_setter!(chunk_capacity); limit_setter!(stack_capacity); @@ -207,6 +218,8 @@ unsafe extern "C" fn haku_exception_message_len(instance: *const Instance) -> u3 enum StatusCode { Ok, SourceCodeTooLong, + TooManyTokens, + TooManyAstNodes, ChunkTooBig, DiagnosticsEmitted, TooManyChunks, @@ -238,6 +251,8 @@ extern "C" fn haku_status_string(code: StatusCode) -> *const i8 { match code { StatusCode::Ok => c"ok", StatusCode::SourceCodeTooLong => c"source code is too long", + StatusCode::TooManyTokens => c"source code has too many tokens", + StatusCode::TooManyAstNodes => c"source code has too many AST nodes", StatusCode::ChunkTooBig => c"compiled bytecode is too large", StatusCode::DiagnosticsEmitted => c"diagnostics were emitted", StatusCode::TooManyChunks => c"too many registered bytecode chunks", @@ -281,22 +296,22 @@ unsafe extern "C" fn haku_num_diagnostics(brush: *const Brush) -> u32 { #[no_mangle] unsafe extern "C" fn haku_diagnostic_start(brush: *const Brush, index: u32) -> u32 { - (*brush).diagnostics[index as usize].span.start as u32 + (*brush).diagnostics[index as usize].span().start } #[no_mangle] unsafe extern "C" fn haku_diagnostic_end(brush: *const Brush, index: u32) -> u32 { - (*brush).diagnostics[index as usize].span.end as u32 + (*brush).diagnostics[index as usize].span().end } #[no_mangle] unsafe extern "C" fn haku_diagnostic_message(brush: *const Brush, index: u32) -> *const u8 { - (*brush).diagnostics[index as usize].message.as_ptr() + (*brush).diagnostics[index as usize].message().as_ptr() } #[no_mangle] unsafe extern "C" fn haku_diagnostic_message_len(brush: *const Brush, index: u32) -> u32 { - (*brush).diagnostics[index as usize].message.len() as u32 + (*brush).diagnostics[index as usize].message().len() as u32 } #[no_mangle] @@ -315,15 +330,27 @@ unsafe extern "C" fn haku_compile_brush( let code = core::str::from_utf8(slice::from_raw_parts(code, code_len as usize)) 
.expect("invalid UTF-8"); - let code = match SourceCode::limited_len(code, instance.limits.max_source_code_len) { - Some(code) => code, - None => return StatusCode::SourceCodeTooLong, + let Some(code) = SourceCode::limited_len(code, instance.limits.max_source_code_len as u32) + else { + return StatusCode::SourceCodeTooLong; }; - let ast = Ast::new(instance.limits.ast_capacity); - let mut parser = Parser::new(ast, code); - let root = parse_toplevel(&mut parser); - let ast = parser.ast; + let mut lexer = Lexer::new(Lexis::new(instance.limits.max_tokens), code); + if lex(&mut lexer).is_err() { + return StatusCode::TooManyTokens; + }; + + let mut ast = Ast::new(instance.limits.ast_capacity); + let mut parser = Parser::new( + &lexer.lexis, + &haku::parser::ParserLimits { + max_events: instance.limits.max_parser_events, + }, + ); + parser::toplevel(&mut parser); + let Ok((root, mut parser_diagnostics)) = parser.into_ast(&mut ast) else { + return StatusCode::TooManyAstNodes; + }; let src = Source { code, @@ -339,8 +366,11 @@ unsafe extern "C" fn haku_compile_brush( } } - if !compiler.diagnostics.is_empty() { - brush.diagnostics = compiler.diagnostics; + let mut diagnostics = lexer.diagnostics; + diagnostics.append(&mut parser_diagnostics); + diagnostics.append(&mut compiler.diagnostics); + if !diagnostics.is_empty() { + brush.diagnostics = diagnostics; return StatusCode::DiagnosticsEmitted; } diff --git a/crates/haku/src/ast.rs b/crates/haku/src/ast.rs new file mode 100644 index 0000000..843708c --- /dev/null +++ b/crates/haku/src/ast.rs @@ -0,0 +1,125 @@ +use core::{error::Error, fmt::Display}; + +use alloc::vec::Vec; + +use crate::source::Span; + +pub mod dump; +pub mod walk; + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub struct NodeId(u32); + +impl NodeId { + pub const NIL: NodeId = NodeId(0); +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum NodeKind { + Nil, + + Token, + + Ident, + Tag, + Number, + Color, + List, + + Op, + Unary, + Binary, + Call, + ParenEmpty, + Paren, + Lambda, + Params, + Param, + If, + Let, + + Toplevel, + + Error, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct Node { + pub span: Span, + pub kind: NodeKind, +} + +#[derive(Debug, Clone)] +pub struct Ast { + kinds: Vec, + spans: Vec, + children_spans: Vec<(u32, u32)>, + children: Vec, +} + +impl Ast { + pub fn new(capacity: usize) -> Self { + assert!(capacity >= 1, "there must be space for at least a nil node"); + assert!(capacity <= u32::MAX as usize); + + let mut ast = Self { + kinds: Vec::with_capacity(capacity), + spans: Vec::with_capacity(capacity), + children_spans: Vec::with_capacity(capacity), + children: Vec::new(), + }; + + ast.alloc(NodeKind::Nil, Span::new(0, 0)).unwrap(); + + ast + } + + pub fn alloc(&mut self, kind: NodeKind, span: Span) -> Result { + if self.kinds.len() >= self.kinds.capacity() { + return Err(NodeAllocError); + } + + let index = self.kinds.len() as u32; + self.kinds.push(kind); + self.spans.push(span); + self.children_spans.push((0, 0)); + Ok(NodeId(index)) + } + + // NOTE: This never produces a NodeAllocError, because there can more or less only ever be as many children for + // nodes as there are nodes. 
+    pub fn alloc_children(&mut self, for_node: NodeId, children: &[NodeId]) {
+        let start = self.children.len();
+        self.children.extend_from_slice(children);
+        let end = self.children.len();
+        self.children_spans[for_node.0 as usize] = (start as u32, end as u32);
+    }
+
+    pub fn extend_span(&mut self, in_node: NodeId, end: u32) {
+        self.spans[in_node.0 as usize].end = end;
+    }
+
+    pub fn kind(&self, id: NodeId) -> NodeKind {
+        self.kinds[id.0 as usize]
+    }
+
+    pub fn span(&self, id: NodeId) -> Span {
+        self.spans[id.0 as usize]
+    }
+
+    pub fn children(&self, id: NodeId) -> &[NodeId] {
+        let (start, end) = self.children_spans[id.0 as usize];
+        &self.children[start as usize..end as usize]
+    }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub struct NodeAllocError;
+
+impl Display for NodeAllocError {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        f.write_str("too many nodes")
+    }
+}
+
+impl Error for NodeAllocError {}
diff --git a/crates/haku/src/ast/dump.rs b/crates/haku/src/ast/dump.rs
new file mode 100644
index 0000000..32ed89a
--- /dev/null
+++ b/crates/haku/src/ast/dump.rs
@@ -0,0 +1,34 @@
+use alloc::string::String;
+use core::fmt::Write;
+
+use crate::{ast::NodeKind, source::SourceCode};
+
+use super::{Ast, NodeId};
+
+pub fn dump(ast: &Ast, node: NodeId, code: Option<&SourceCode>) -> String {
+    let mut result = String::new();
+
+    fn rec(ast: &Ast, node: NodeId, code: Option<&SourceCode>, result: &mut String, depth: usize) {
+        for _ in 0..depth {
+            result.push_str("  ");
+        }
+
+        write!(result, "{:?} @ {:?}", ast.kind(node), ast.span(node)).unwrap();
+        if let Some(code) = code {
+            if ast.kind(node) == NodeKind::Token {
+                write!(result, " {:?}", ast.span(node).slice(code)).unwrap();
+            }
+        }
+        writeln!(result).unwrap();
+        for &child in ast.children(node) {
+            rec(ast, child, code, result, depth + 1);
+        }
+    }
+
+    rec(ast, node, code, &mut result, 0);
+
+    // Remove the trailing newline.
+    result.pop();
+
+    result
+}
diff --git a/crates/haku/src/ast/walk.rs b/crates/haku/src/ast/walk.rs
new file mode 100644
index 0000000..d76623b
--- /dev/null
+++ b/crates/haku/src/ast/walk.rs
@@ -0,0 +1,73 @@
+use super::{Ast, NodeId, NodeKind};
+
+impl Ast {
+    pub fn child(&self, parent: NodeId, kind: NodeKind) -> Option<NodeId> {
+        self.children(parent)
+            .iter()
+            .find(|&&child| self.kind(child) == kind)
+            .copied()
+    }
+
+    pub fn walk(&self, parent: NodeId) -> Walk<'_> {
+        Walk {
+            ast: self,
+            parent,
+            index: 0,
+        }
+    }
+}
+
+/// An iterator over a node's children, with convenience methods for accessing those children.
+#[derive(Clone)]
+pub struct Walk<'a> {
+    ast: &'a Ast,
+    parent: NodeId,
+    index: usize,
+}
+
+impl<'a> Walk<'a> {
+    /// Walk to the first non-Nil, non-Error, non-Token node.
+    pub fn node(&mut self) -> Option<NodeId> {
+        while let Some(id) = self.next() {
+            if !matches!(
+                self.ast.kind(id),
+                NodeKind::Nil | NodeKind::Token | NodeKind::Error
+            ) {
+                return Some(id);
+            }
+        }
+
+        None
+    }
+
+    /// Walk to the next [`node`][`Self::node`] of the given kind.
+    pub fn node_of(&mut self, kind: NodeKind) -> Option<NodeId> {
+        while let Some(id) = self.node() {
+            if self.ast.kind(id) == kind {
+                return Some(id);
+            }
+        }
+
+        None
+    }
+
+    /// Find the first node of the given kind. This does not advance the iterator.
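+    ///
+    /// For example, `walk.get(NodeKind::Op)` finds a Binary node's operator without
+    /// consuming the left-hand side that comes before it.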
+    pub fn get(&self, kind: NodeKind) -> Option<NodeId> {
+        self.clone().find(|&id| self.ast.kind(id) == kind)
+    }
+}
+
+impl<'a> Iterator for Walk<'a> {
+    type Item = NodeId;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        let children = self.ast.children(self.parent);
+        if self.index < children.len() {
+            let index = self.index;
+            self.index += 1;
+            Some(children[index])
+        } else {
+            None
+        }
+    }
+}
diff --git a/crates/haku/src/bytecode.rs b/crates/haku/src/bytecode.rs
index 82932bc..a98285b 100644
--- a/crates/haku/src/bytecode.rs
+++ b/crates/haku/src/bytecode.rs
@@ -17,6 +17,8 @@ pub enum Opcode {
     // Duplicate existing values.
     /// Push a value relative to the bottom of the current stack window.
     Local, // (index: u8)
+    /// Set the value of a local relative to the bottom of the current stack window.
+    SetLocal, // (index: u8)
     /// Push a captured value.
     Capture, // (index: u8)
     /// Get the value of a definition.
@@ -24,12 +26,8 @@ pub enum Opcode {
     /// Set the value of a definition.
     SetDef, // (index: u16)
 
-    /// Drop `number` values from the stack.
-    ///
-    DropLet, // (number: u8)
-
     // Create literal functions.
-    Function, // (params: u8, then: u16), at `then`: (capture_count: u8, captures: [(source: u8, index: u8); capture_count])
+    Function, // (params: u8, then: u16), at `then`: (local_count: u8, capture_count: u8, captures: [(source: u8, index: u8); capture_count])
 
     // Control flow.
     Jump, // (offset: u16)
diff --git a/crates/haku/src/compiler.rs b/crates/haku/src/compiler.rs
index a6966f3..a3b82e2 100644
--- a/crates/haku/src/compiler.rs
+++ b/crates/haku/src/compiler.rs
@@ -6,9 +6,11 @@ use core::{
 use alloc::vec::Vec;
 
 use crate::{
+    ast::{Ast, NodeId, NodeKind},
     bytecode::{Chunk, DefError, Defs, EmitError, Opcode, CAPTURE_CAPTURE, CAPTURE_LOCAL},
-    sexp::{Ast, NodeId, NodeKind, SourceCode, Span},
-    system::System,
+    diagnostic::Diagnostic,
+    source::SourceCode,
+    system::{System, SystemFnArity},
 };
 
 pub struct Source<'a> {
@@ -17,12 +19,6 @@ pub struct Source<'a> {
     pub system: &'a System,
 }
 
-#[derive(Debug, Clone, Copy)]
-pub struct Diagnostic {
-    pub span: Span,
-    pub message: &'static str,
-}
-
 #[derive(Debug, Clone, Copy)]
 struct Local<'a> {
     name: &'a str,
@@ -46,6 +42,11 @@ pub struct Compiler<'a, 'b> {
     scopes: Vec<Scope<'a>>,
 }
 
+#[derive(Debug, Clone, Copy)]
+pub struct ClosureSpec {
+    pub(crate) local_count: u8,
+}
+
 impl<'a, 'b> Compiler<'a, 'b> {
     pub fn new(defs: &'a mut Defs, chunk: &'b mut Chunk) -> Self {
         Self {
@@ -59,20 +60,24 @@ impl<'a, 'b> Compiler<'a, 'b> {
         }
     }
 
-    pub fn diagnose(&mut self, diagnostic: Diagnostic) {
-        if self.diagnostics.len() >= self.diagnostics.capacity() {
-            return;
-        }
-
-        if self.diagnostics.len() == self.diagnostics.capacity() - 1 {
-            self.diagnostics.push(Diagnostic {
-                span: Span::new(0, 0),
-                message: "too many diagnostics emitted, stopping", // hello clangd!
-            })
-        } else {
+    fn emit(&mut self, diagnostic: Diagnostic) {
+        if self.diagnostics.len() < self.diagnostics.capacity() {
             self.diagnostics.push(diagnostic);
         }
     }
+
+    pub fn closure_spec(&self) -> ClosureSpec {
+        ClosureSpec {
+            local_count: self
+                .scopes
+                .last()
+                .unwrap()
+                .locals
+                .len()
+                .try_into()
+                .unwrap_or_default(),
+        }
+    }
 }
 
 type CompileResult<T = ()> = Result<T, CompileError>;
 
 pub fn compile_expr<'a>(
     c: &mut Compiler<'a, '_>,
     src: &Source<'a>,
     node_id: NodeId,
 ) -> CompileResult {
-    let node = src.ast.get(node_id);
-    match node.kind {
-        NodeKind::Eof => unreachable!("eof node should never be emitted"),
+    match src.ast.kind(node_id) {
+        // The nil node is special, as it inhabits node ID 0.
+        NodeKind::Nil => {
+            unreachable!("Nil node should never be emitted (ParenEmpty is used for nil literals)")
+        }
+        // Tokens are trivia and should never be emitted---they're only useful for error reporting.
+        NodeKind::Token => unreachable!("Token node should never be emitted"),
+        // Op nodes are only used to provide a searching anchor for the operator in Unary and Binary.
+        NodeKind::Op => unreachable!("Op node should never be emitted"),
+        // Params nodes are only used to provide a searching anchor for Lambda parameters.
+        NodeKind::Params => unreachable!("Params node should never be emitted"),
+        // Param nodes are only used to provide a searching anchor for identifiers in Params nodes,
+        // as they may also contain commas and other trivia.
+        NodeKind::Param => unreachable!("Param node should never be emitted"),
+
+        NodeKind::Color => unsupported(c, src, node_id, "color literals are not implemented yet"),
 
-        NodeKind::Nil => compile_nil(c),
         NodeKind::Ident => compile_ident(c, src, node_id),
         NodeKind::Number => compile_number(c, src, node_id),
-        NodeKind::List(_, _) => compile_list(c, src, node_id),
-        NodeKind::Toplevel(_) => compile_toplevel(c, src, node_id),
+        NodeKind::Tag => compile_tag(c, src, node_id),
+        NodeKind::List => unsupported(c, src, node_id, "list literals are not implemented yet"),
 
-        NodeKind::Error(message) => {
-            c.diagnose(Diagnostic {
-                span: node.span,
-                message,
-            });
-            Ok(())
-        }
+        NodeKind::Unary => compile_unary(c, src, node_id),
+        NodeKind::Binary => compile_binary(c, src, node_id),
+        NodeKind::Call => compile_call(c, src, node_id),
+        NodeKind::Paren => compile_paren(c, src, node_id),
+        NodeKind::ParenEmpty => compile_nil(c),
+        NodeKind::Lambda => compile_lambda(c, src, node_id),
+        NodeKind::If => compile_if(c, src, node_id),
+        NodeKind::Let => compile_let(c, src, node_id),
+
+        NodeKind::Toplevel => compile_toplevel(c, src, node_id),
+
+        // Error nodes are ignored, because for each error node an appropriate parser
+        // diagnostic is emitted anyway.
+ NodeKind::Error => Ok(()), } } -fn compile_nil(c: &mut Compiler<'_, '_>) -> CompileResult { +fn unsupported(c: &mut Compiler, src: &Source, node_id: NodeId, message: &str) -> CompileResult { + c.emit(Diagnostic::error(src.ast.span(node_id), message)); + Ok(()) +} + +fn compile_nil(c: &mut Compiler) -> CompileResult { c.chunk.emit_opcode(Opcode::Nil)?; Ok(()) @@ -144,48 +173,39 @@ fn find_variable( } fn compile_ident<'a>(c: &mut Compiler<'a, '_>, src: &Source<'a>, node_id: NodeId) -> CompileResult { - let ident = src.ast.get(node_id); - let name = ident.span.slice(src.code); + let span = src.ast.span(node_id); + let name = span.slice(src.code); - match name { - "false" => _ = c.chunk.emit_opcode(Opcode::False)?, - "true" => _ = c.chunk.emit_opcode(Opcode::True)?, - _ => match find_variable(c, name, c.scopes.len() - 1) { - Ok(Some(Variable::Local(index))) => { - c.chunk.emit_opcode(Opcode::Local)?; - c.chunk.emit_u8(index)?; + match find_variable(c, name, c.scopes.len() - 1) { + Ok(Some(Variable::Local(index))) => { + c.chunk.emit_opcode(Opcode::Local)?; + c.chunk.emit_u8(index)?; + } + Ok(Some(Variable::Captured(index))) => { + c.chunk.emit_opcode(Opcode::Capture)?; + c.chunk.emit_u8(index)?; + } + Ok(None) => { + if let Some(def_id) = c.defs.get(name) { + c.chunk.emit_opcode(Opcode::Def)?; + c.chunk.emit_u16(def_id.to_u16())?; + } else { + c.emit(Diagnostic::error(span, "undefined variable")); } - Ok(Some(Variable::Captured(index))) => { - c.chunk.emit_opcode(Opcode::Capture)?; - c.chunk.emit_u8(index)?; - } - Ok(None) => { - if let Some(def_id) = c.defs.get(name) { - c.chunk.emit_opcode(Opcode::Def)?; - c.chunk.emit_u16(def_id.to_u16())?; - } else { - c.diagnose(Diagnostic { - span: ident.span, - message: "undefined variable", - }); - } - } - Err(CaptureError) => { - c.diagnose(Diagnostic { - span: ident.span, - message: "too many variables captured from outer functions in this scope", - }); - } - }, - } + } + Err(CaptureError) => { + c.emit(Diagnostic::error( + span, + "too many variables captured from outer functions in this scope", + )); + } + }; Ok(()) } fn compile_number(c: &mut Compiler<'_, '_>, src: &Source<'_>, node_id: NodeId) -> CompileResult { - let node = src.ast.get(node_id); - - let literal = node.span.slice(src.code); + let literal = src.ast.span(node_id).slice(src.code); let float: f32 = literal .parse() .expect("the parser should've gotten us a string parsable by the stdlib"); @@ -196,48 +216,130 @@ fn compile_number(c: &mut Compiler<'_, '_>, src: &Source<'_>, node_id: NodeId) - Ok(()) } -fn compile_list<'a>(c: &mut Compiler<'a, '_>, src: &Source<'a>, node_id: NodeId) -> CompileResult { - let NodeKind::List(function_id, args) = src.ast.get(node_id).kind else { - unreachable!("compile_list expects a List"); - }; +fn compile_tag(c: &mut Compiler<'_, '_>, src: &Source, node_id: NodeId) -> CompileResult { + let tag = src.ast.span(node_id).slice(src.code); - let function = src.ast.get(function_id); - let name = function.span.slice(src.code); - - if function.kind == NodeKind::Ident { - match name { - "fn" => return compile_fn(c, src, args), - "if" => return compile_if(c, src, args), - "let" => return compile_let(c, src, args), - _ => (), - }; + match tag { + "False" => { + c.chunk.emit_opcode(Opcode::False)?; + } + "True" => { + c.chunk.emit_opcode(Opcode::True)?; + } + _ => { + c.emit(Diagnostic::error(src.ast.span(node_id), "uppercased identifiers are reserved for future use; please start your identifiers with a lowercase letter instead")); + } } + Ok(()) +} + +fn 
compile_unary<'a>(c: &mut Compiler<'a, '_>, src: &Source<'a>, node_id: NodeId) -> CompileResult {
+    let mut walk = src.ast.walk(node_id);
+    let Some(op) = walk.node() else { return Ok(()) };
+    let Some(expr) = walk.node() else {
+        return Ok(());
+    };
+
+    if src.ast.kind(op) != NodeKind::Op {
+        return Ok(());
+    }
+    let name = src.ast.span(op).slice(src.code);
+
+    compile_expr(c, src, expr)?;
+    if let Some(index) = (src.system.resolve_fn)(SystemFnArity::Unary, name) {
+        let argument_count = 1;
+        c.chunk.emit_opcode(Opcode::System)?;
+        c.chunk.emit_u8(index)?;
+        c.chunk.emit_u8(argument_count)?;
+    } else {
+        c.emit(Diagnostic::error(
+            src.ast.span(op),
+            "this unary operator is currently unimplemented",
+        ));
+    }
+
+    Ok(())
+}
+
+fn compile_binary<'a>(
+    c: &mut Compiler<'a, '_>,
+    src: &Source<'a>,
+    node_id: NodeId,
+) -> CompileResult {
+    let mut walk = src.ast.walk(node_id);
+    let Some(left) = walk.node() else {
+        return Ok(());
+    };
+    let Some(op) = walk.node() else { return Ok(()) };
+    let Some(right) = walk.node() else {
+        return Ok(());
+    };
+
+    if src.ast.kind(op) != NodeKind::Op {
+        return Ok(());
+    }
+    let name = src.ast.span(op).slice(src.code);
+
+    if name == "=" {
+        c.emit(Diagnostic::error(
+            src.ast.span(op),
+            "defs `a = b` may only appear at the top level",
+        ));
+        return Ok(());
+    }
+
+    compile_expr(c, src, left)?;
+    compile_expr(c, src, right)?;
+    if let Some(index) = (src.system.resolve_fn)(SystemFnArity::Binary, name) {
+        let argument_count = 2;
+        c.chunk.emit_opcode(Opcode::System)?;
+        c.chunk.emit_u8(index)?;
+        c.chunk.emit_u8(argument_count)?;
+    } else {
+        c.emit(Diagnostic::error(
+            src.ast.span(op),
+            "this binary operator is currently unimplemented",
+        ));
+    }
+
+    Ok(())
+}
+
+fn compile_call<'a>(c: &mut Compiler<'a, '_>, src: &Source<'a>, node_id: NodeId) -> CompileResult {
+    let mut walk = src.ast.walk(node_id);
+    let Some(func) = walk.node() else {
+        return Ok(());
+    };
+    let name = src.ast.span(func).slice(src.code);
+
     let mut argument_count = 0;
-    let mut args = args;
-    while let NodeKind::List(head, tail) = src.ast.get(args).kind {
-        compile_expr(c, src, head)?;
+    while let Some(arg) = walk.node() {
+        compile_expr(c, src, arg)?;
         argument_count += 1;
-        args = tail;
     }
 
     let argument_count = u8::try_from(argument_count).unwrap_or_else(|_| {
-        c.diagnose(Diagnostic {
-            span: src.ast.get(args).span,
-            message: "function call has too many arguments",
-        });
+        c.emit(Diagnostic::error(
+            src.ast.span(node_id),
+            "function call has too many arguments",
+        ));
         0
     });
 
-    if let (NodeKind::Ident, Some(index)) = (function.kind, (src.system.resolve_fn)(name)) {
+    if let (NodeKind::Ident, Some(index)) = (
+        src.ast.kind(func),
+        (src.system.resolve_fn)(SystemFnArity::Nary, name),
+    ) {
         c.chunk.emit_opcode(Opcode::System)?;
         c.chunk.emit_u8(index)?;
         c.chunk.emit_u8(argument_count)?;
     } else {
         // This is a bit of an oddity: we only emit the function expression _after_ the arguments,
         // but since the language is effectless this doesn't matter in practice.
-        // It makes for less code in the compiler and the VM.
-        compile_expr(c, src, function_id)?;
+        // It makes for a bit less code in the VM, since there's no need to find the function
+        // down the stack - it's always on top.
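+        // For example, `f x y` emits the code for `x` and `y` first, then for `f`,
+        // and finally a `Call` with an argument count of 2.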
+ compile_expr(c, src, func)?; c.chunk.emit_opcode(Opcode::Call)?; c.chunk.emit_u8(argument_count)?; } @@ -245,67 +347,28 @@ fn compile_list<'a>(c: &mut Compiler<'a, '_>, src: &Source<'a>, node_id: NodeId) Ok(()) } -struct WalkList { - current: NodeId, - ok: bool, -} - -impl WalkList { - fn new(start: NodeId) -> Self { - Self { - current: start, - ok: true, - } - } - - fn expect_arg( - &mut self, - c: &mut Compiler<'_, '_>, - src: &Source<'_>, - message: &'static str, - ) -> NodeId { - if !self.ok { - return NodeId::NIL; - } - - if let NodeKind::List(expr, tail) = src.ast.get(self.current).kind { - self.current = tail; - expr - } else { - c.diagnose(Diagnostic { - span: src.ast.get(self.current).span, - message, - }); - self.ok = false; - NodeId::NIL - } - } - - fn expect_nil(&mut self, c: &mut Compiler<'_, '_>, src: &Source<'_>, message: &'static str) { - if src.ast.get(self.current).kind != NodeKind::Nil { - c.diagnose(Diagnostic { - span: src.ast.get(self.current).span, - message, - }); - // NOTE: Don't set self.ok to false, since this is not a fatal error. - // The nodes returned previously are valid and therefore it's safe to operate on them. - // Just having extra arguments shouldn't inhibit emitting additional diagnostics in - // the expression. - } - } -} - -fn compile_if<'a>(c: &mut Compiler<'a, '_>, src: &Source<'a>, args: NodeId) -> CompileResult { - let mut list = WalkList::new(args); - - let condition = list.expect_arg(c, src, "missing `if` condition"); - let if_true = list.expect_arg(c, src, "missing `if` true branch"); - let if_false = list.expect_arg(c, src, "missing `if` false branch"); - list.expect_nil(c, src, "extra arguments after `if` false branch"); - - if !list.ok { +fn compile_paren<'a>(c: &mut Compiler<'a, '_>, src: &Source<'a>, node_id: NodeId) -> CompileResult { + let Some(inner) = src.ast.walk(node_id).node() else { return Ok(()); - } + }; + + compile_expr(c, src, inner)?; + + Ok(()) +} + +fn compile_if<'a>(c: &mut Compiler<'a, '_>, src: &Source<'a>, node_id: NodeId) -> CompileResult { + let mut walk = src.ast.walk(node_id); + + let Some(condition) = walk.node() else { + return Ok(()); + }; + let Some(if_true) = walk.node() else { + return Ok(()); + }; + let Some(if_false) = walk.node() else { + return Ok(()); + }; compile_expr(c, src, condition)?; @@ -328,113 +391,70 @@ fn compile_if<'a>(c: &mut Compiler<'a, '_>, src: &Source<'a>, args: NodeId) -> C Ok(()) } -fn compile_let<'a>(c: &mut Compiler<'a, '_>, src: &Source<'a>, args: NodeId) -> CompileResult { - let mut list = WalkList::new(args); +fn compile_let<'a>(c: &mut Compiler<'a, '_>, src: &Source<'a>, node_id: NodeId) -> CompileResult { + let mut walk = src.ast.walk(node_id); - let binding_list = list.expect_arg(c, src, "missing `let` binding list ((x 1) (y 2) ...)"); - let expr = list.expect_arg(c, src, "missing expression to `let` names into"); - list.expect_nil(c, src, "extra arguments after `let` expression"); - - if !list.ok { + let Some(ident) = walk.node() else { return Ok(()); - } - - // NOTE: Our `let` behaves like `let*` from Lisps. - // This is because this is generally the more intuitive behaviour with how variable declarations - // work in traditional imperative languages. - // We do not offer an alternative to Lisp `let` to be as minimal as possible. 
- - let mut current = binding_list; - let mut local_count: usize = 0; - while let NodeKind::List(head, tail) = src.ast.get(current).kind { - if !matches!(src.ast.get(head).kind, NodeKind::List(_, _)) { - c.diagnose(Diagnostic { - span: src.ast.get(head).span, - message: "`let` binding expected, like (x 1)", - }); - current = tail; - continue; - } - - let mut list = WalkList::new(head); - let ident = list.expect_arg(c, src, "binding name expected"); - let value = list.expect_arg(c, src, "binding value expected"); - list.expect_nil(c, src, "extra expressions after `let` binding value"); - - if src.ast.get(ident).kind != NodeKind::Ident { - c.diagnose(Diagnostic { - span: src.ast.get(ident).span, - message: "binding name must be an identifier", - }); - } - - // NOTE: Compile expression _before_ putting the value into scope. - // This is so that the variable cannot refer to itself, as it is yet to be declared. - compile_expr(c, src, value)?; - - let name = src.ast.get(ident).span.slice(src.code); - let scope = c.scopes.last_mut().unwrap(); - if scope.locals.len() >= u8::MAX as usize { - c.diagnose(Diagnostic { - span: src.ast.get(ident).span, - message: "too many names bound in this function at a single time", - }); - } else { - scope.locals.push(Local { name }); - } - - local_count += 1; - current = tail; - } + }; + let Some(expr) = walk.node() else { + return Ok(()); + }; + let Some(then) = walk.node() else { + return Ok(()); + }; compile_expr(c, src, expr)?; - + let name = src.ast.span(ident).slice(src.code); let scope = c.scopes.last_mut().unwrap(); - scope - .locals - .resize_with(scope.locals.len() - local_count, || unreachable!()); + let index = if scope.locals.len() >= u8::MAX as usize { + c.emit(Diagnostic::error( + src.ast.span(ident), + "too many names bound in this function at a single time", + )); - // NOTE: If we reach more than 255 locals declared in our `let`, we should've gotten - // a diagnostic emitted in the `while` loop beforehand. - let local_count = u8::try_from(local_count).unwrap_or(0); - c.chunk.emit_opcode(Opcode::DropLet)?; - c.chunk.emit_u8(local_count)?; + // Don't emit the expression, because it will most likely contain errors due to this + // `let` failing. 
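+        // ("The expression" here is `then` - everything that comes after this `let`.)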
+ return Ok(()); + } else { + let index = scope.locals.len(); + scope.locals.push(Local { name }); + index as u8 + }; + c.chunk.emit_opcode(Opcode::SetLocal)?; + c.chunk.emit_u8(index)?; + + compile_expr(c, src, then)?; Ok(()) } -fn compile_fn<'a>(c: &mut Compiler<'a, '_>, src: &Source<'a>, args: NodeId) -> CompileResult { - let mut list = WalkList::new(args); - - let param_list = list.expect_arg(c, src, "missing function parameters"); - let body = list.expect_arg(c, src, "missing function body"); - list.expect_nil(c, src, "extra arguments after function body"); - - if !list.ok { +fn compile_lambda<'a>( + c: &mut Compiler<'a, '_>, + src: &Source<'a>, + node_id: NodeId, +) -> CompileResult { + let mut walk = src.ast.walk(node_id); + let Some(params) = walk.node() else { return Ok(()); - } + }; + let Some(body) = walk.node() else { + return Ok(()); + }; let mut locals = Vec::new(); - let mut current = param_list; - while let NodeKind::List(ident, tail) = src.ast.get(current).kind { - if let NodeKind::Ident = src.ast.get(ident).kind { - locals.push(Local { - name: src.ast.get(ident).span.slice(src.code), - }) - } else { - c.diagnose(Diagnostic { - span: src.ast.get(ident).span, - message: "function parameters must be identifiers", - }) - } - current = tail; + let mut params_walk = src.ast.walk(params); + while let Some(param) = params_walk.node() { + locals.push(Local { + name: src.ast.span(param).slice(src.code), + }); } let param_count = u8::try_from(locals.len()).unwrap_or_else(|_| { - c.diagnose(Diagnostic { - span: src.ast.get(param_list).span, - message: "too many function parameters", - }); + c.emit(Diagnostic::error( + src.ast.span(params), + "too many function parameters", + )); 0 }); @@ -453,13 +473,21 @@ fn compile_fn<'a>(c: &mut Compiler<'a, '_>, src: &Source<'a>, args: NodeId) -> C c.chunk.patch_u16(after_offset, after); let scope = c.scopes.pop().unwrap(); - let capture_count = u8::try_from(scope.captures.len()).unwrap_or_else(|_| { - c.diagnose(Diagnostic { - span: src.ast.get(body).span, - message: "function refers to too many variables from the outer function", - }); + let local_count = u8::try_from(scope.locals.len()).unwrap_or_else(|_| { + c.emit(Diagnostic::error( + src.ast.span(body), + "function contains too many local variables", + )); 0 }); + let capture_count = u8::try_from(scope.captures.len()).unwrap_or_else(|_| { + c.emit(Diagnostic::error( + src.ast.span(body), + "function refers to too many variables from its outer functions", + )); + 0 + }); + c.chunk.emit_u8(local_count)?; c.chunk.emit_u8(capture_count)?; for capture in scope.captures { match capture { @@ -484,31 +512,27 @@ fn compile_toplevel<'a>( src: &Source<'a>, node_id: NodeId, ) -> CompileResult { - let NodeKind::Toplevel(mut current) = src.ast.get(node_id).kind else { - unreachable!("compile_toplevel expects a Toplevel"); - }; + def_prepass(c, src, node_id)?; - def_prepass(c, src, current)?; + let mut walk = src.ast.walk(node_id); + let mut result_expr = None; + while let Some(toplevel_expr) = walk.node() { + if let Some(result_expr) = result_expr { + // TODO: This diagnostic should show you the expression after the result. + c.emit(Diagnostic::error( + src.ast.span(result_expr), + "the result value must be the last thing in the program", + )); + } - let mut had_result = false; - while let NodeKind::List(expr, tail) = src.ast.get(current).kind { - match compile_toplevel_expr(c, src, expr)? { + match compile_toplevel_expr(c, src, toplevel_expr)? 
{ ToplevelExpr::Def => (), - ToplevelExpr::Result => had_result = true, + ToplevelExpr::Result if result_expr.is_none() => result_expr = Some(toplevel_expr), + ToplevelExpr::Result => (), } - - if had_result && src.ast.get(tail).kind != NodeKind::Nil { - c.diagnose(Diagnostic { - span: src.ast.get(tail).span, - message: "result value may not be followed by anything else", - }); - break; - } - - current = tail; } - if !had_result { + if result_expr.is_none() { c.chunk.emit_opcode(Opcode::Nil)?; } c.chunk.emit_opcode(Opcode::Return)?; @@ -516,36 +540,28 @@ fn compile_toplevel<'a>( Ok(()) } -fn def_prepass<'a>(c: &mut Compiler<'a, '_>, src: &Source<'a>, node_id: NodeId) -> CompileResult { +fn def_prepass<'a>(c: &mut Compiler<'a, '_>, src: &Source<'a>, toplevel: NodeId) -> CompileResult { + let mut walk = src.ast.walk(toplevel); + // This is a bit of a pattern matching tapeworm, but Rust unfortunately doesn't have `if let` // chains yet to make this more readable. - let mut current = node_id; - while let NodeKind::List(expr, tail) = src.ast.get(current).kind { - if let NodeKind::List(head_id, tail_id) = src.ast.get(expr).kind { - let head = src.ast.get(head_id); - let name = head.span.slice(src.code); - if head.kind == NodeKind::Ident && name == "def" { - if let NodeKind::List(ident_id, _) = src.ast.get(tail_id).kind { - let ident = src.ast.get(ident_id); - if ident.kind == NodeKind::Ident { - let name = ident.span.slice(src.code); - match c.defs.add(name) { - Ok(_) => (), - Err(DefError::Exists) => c.diagnose(Diagnostic { - span: ident.span, - message: "redefinitions of defs are not allowed", - }), - Err(DefError::OutOfSpace) => c.diagnose(Diagnostic { - span: ident.span, - message: "too many defs", - }), - } + while let Some(binary) = walk.node_of(NodeKind::Binary) { + let mut binary_walk = src.ast.walk(binary); + if let (Some(ident), Some(op)) = (binary_walk.node(), binary_walk.get(NodeKind::Op)) { + if src.ast.span(op).slice(src.code) == "=" { + let name = src.ast.span(ident).slice(src.code); + match c.defs.add(name) { + Ok(_) => (), + Err(DefError::Exists) => c.emit(Diagnostic::error( + src.ast.span(ident), + "a def with this name already exists", + )), + Err(DefError::OutOfSpace) => { + c.emit(Diagnostic::error(src.ast.span(binary), "too many defs")) } } } } - - current = tail; } Ok(()) @@ -562,14 +578,10 @@ fn compile_toplevel_expr<'a>( src: &Source<'a>, node_id: NodeId, ) -> CompileResult { - let node = src.ast.get(node_id); - - if let NodeKind::List(head_id, tail_id) = node.kind { - let head = src.ast.get(head_id); - if head.kind == NodeKind::Ident { - let name = head.span.slice(src.code); - if name == "def" { - compile_def(c, src, tail_id)?; + if src.ast.kind(node_id) == NodeKind::Binary { + if let Some(op) = src.ast.walk(node_id).get(NodeKind::Op) { + if src.ast.span(op).slice(src.code) == "=" { + compile_def(c, src, node_id)?; return Ok(ToplevelExpr::Def); } } @@ -579,24 +591,32 @@ fn compile_toplevel_expr<'a>( Ok(ToplevelExpr::Result) } -fn compile_def<'a>(c: &mut Compiler<'a, '_>, src: &Source<'a>, args: NodeId) -> CompileResult { - let mut list = WalkList::new(args); - - let ident = list.expect_arg(c, src, "missing definition name"); - let value = list.expect_arg(c, src, "missing definition value"); - list.expect_nil(c, src, "extra arguments after definition"); - - if !list.ok { +fn compile_def<'a>(c: &mut Compiler<'a, '_>, src: &Source<'a>, node_id: NodeId) -> CompileResult { + let mut walk = src.ast.walk(node_id); + let Some(left) = walk.node() else { return Ok(()); + }; + let 
Some(_op) = walk.node() else {
+        return Ok(());
+    };
+    let Some(right) = walk.node() else {
+        return Ok(());
+    };
+
+    if src.ast.kind(left) != NodeKind::Ident {
+        c.emit(Diagnostic::error(
+            src.ast.span(left),
+            "def name (identifier) expected",
+        ));
+    }
 
-    let name = src.ast.get(ident).span.slice(src.code);
+    let name = src.ast.span(left).slice(src.code);
 
     // NOTE: def_prepass collects all definitions beforehand.
     // In case a def ends up not existing, that means we ran out of space for defs - so emit a
     // zero def instead.
     let def_id = c.defs.get(name).unwrap_or_default();
 
-    compile_expr(c, src, value)?;
+    compile_expr(c, src, right)?;
     c.chunk.emit_opcode(Opcode::SetDef)?;
     c.chunk.emit_u16(def_id.to_u16())?;
 
diff --git a/crates/haku/src/diagnostic.rs b/crates/haku/src/diagnostic.rs
new file mode 100644
index 0000000..3ce6917
--- /dev/null
+++ b/crates/haku/src/diagnostic.rs
@@ -0,0 +1,26 @@
+use alloc::string::String;
+
+use crate::source::Span;
+
+#[derive(Debug, Clone)]
+pub struct Diagnostic {
+    span: Span,
+    message: String,
+}
+
+impl Diagnostic {
+    pub fn error(span: Span, message: impl Into<String>) -> Self {
+        Self {
+            span,
+            message: message.into(),
+        }
+    }
+
+    pub fn span(&self) -> Span {
+        self.span
+    }
+
+    pub fn message(&self) -> &str {
+        &self.message
+    }
+}
diff --git a/crates/haku/src/lexer.rs b/crates/haku/src/lexer.rs
new file mode 100644
index 0000000..32aa8a5
--- /dev/null
+++ b/crates/haku/src/lexer.rs
@@ -0,0 +1,237 @@
+use alloc::vec::Vec;
+
+use crate::{
+    diagnostic::Diagnostic,
+    source::{SourceCode, Span},
+    token::{Lexis, TokenAllocError, TokenKind},
+};
+
+pub struct Lexer<'a> {
+    pub lexis: Lexis,
+    pub diagnostics: Vec<Diagnostic>,
+    input: &'a SourceCode,
+    position: u32,
+}
+
+impl<'a> Lexer<'a> {
+    pub fn new(lexis: Lexis, input: &'a SourceCode) -> Self {
+        Self {
+            lexis,
+            // NOTE: emit() only pushes while there is spare capacity, so the vec must be
+            // created with some; otherwise no diagnostic would ever be recorded.
+            diagnostics: Vec::with_capacity(16),
+            input,
+            position: 0,
+        }
+    }
+
+    fn current(&self) -> char {
+        self.input[self.position as usize..]
+            .chars()
+            .next()
+            .unwrap_or('\0')
+    }
+
+    fn advance(&mut self) {
+        self.position += self.current().len_utf8() as u32;
+    }
+
+    fn emit(&mut self, diagnostic: Diagnostic) {
+        if self.diagnostics.len() < self.diagnostics.capacity() {
+            self.diagnostics.push(diagnostic);
+        }
+    }
+}
+
+fn one(l: &mut Lexer<'_>, kind: TokenKind) -> TokenKind {
+    l.advance();
+    kind
+}
+
+fn one_or_two(l: &mut Lexer<'_>, kind1: TokenKind, c2: char, kind2: TokenKind) -> TokenKind {
+    l.advance();
+    if l.current() == c2 {
+        l.advance();
+        kind2
+    } else {
+        kind1
+    }
+}
+
+fn is_ident_char(c: char) -> bool {
+    matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '_')
+}
+
+fn ident(l: &mut Lexer<'_>) -> TokenKind {
+    let start = l.position;
+    while is_ident_char(l.current()) {
+        l.advance();
+    }
+    let end = l.position;
+
+    match Span::new(start, end).slice(l.input) {
+        "_" => TokenKind::Underscore,
+        "and" => TokenKind::And,
+        "or" => TokenKind::Or,
+        "if" => TokenKind::If,
+        "else" => TokenKind::Else,
+        "let" => TokenKind::Let,
+        _ => TokenKind::Ident,
+    }
+}
+
+fn tag(l: &mut Lexer<'_>) -> TokenKind {
+    while is_ident_char(l.current()) {
+        l.advance();
+    }
+    TokenKind::Tag
+}
+
+// NOTE: You shouldn't expect that the numbers produced by the lexer are parsable.
+fn number(l: &mut Lexer<'_>) -> TokenKind {
+    while l.current().is_ascii_digit() {
+        l.advance();
+    }
+
+    if l.current() == '.' {
+        let dot = l.position;
+        l.advance();
+        if !l.current().is_ascii_digit() {
+            l.emit(Diagnostic::error(
+                Span::new(dot, l.position),
+                "there must be at least a single digit after the decimal point",
+            ));
+        }
+        while l.current().is_ascii_digit() {
+            l.advance();
+        }
+    }
+
+    TokenKind::Number
+}
+
+// NOTE: You shouldn't expect that the color literals produced by the lexer are parsable.
+fn color(l: &mut Lexer<'_>) -> TokenKind {
+    let hash = l.position;
+    l.advance(); // #
+
+    if !l.current().is_ascii_hexdigit() {
+        l.emit(Diagnostic::error(
+            Span::new(hash, l.position),
+            "hex digits expected after `#` (color literal)",
+        ));
+    }
+
+    let start = l.position;
+    while l.current().is_ascii_hexdigit() {
+        l.advance();
+    }
+    let len = l.position - start;
+
+    if !matches!(len, 3 | 4 | 6 | 8) {
+        l.emit(Diagnostic::error(Span::new(hash, l.position), "incorrect number of digits in color literal (must be #RGB, #RGBA, #RRGGBB, or #RRGGBBAA)"));
+    }
+
+    TokenKind::Color
+}
+
+fn whitespace_and_comments(l: &mut Lexer<'_>) {
+    loop {
+        match l.current() {
+            '-' => {
+                let position = l.position;
+                l.advance();
+                if l.current() == '-' {
+                    // A comment runs until the end of the line, or the end of input
+                    // if the comment sits on the last line.
+                    while l.current() != '\n' && l.current() != '\0' {
+                        l.advance();
+                    }
+                } else {
+                    // An unfortunate little bit of backtracking here;
+                    // this seems like the simplest possible solution though.
+                    // We don't treat comments as a separate token to simplify the parsing phase,
+                    // and because of this, handling this at the "real" token level would complicate
+                    // things quite a bit.
+                    l.position = position;
+                    break;
+                }
+            }
+
+            ' ' | '\r' | '\t' => l.advance(),
+
+            _ => break,
+        }
+    }
+}
+
+fn newline(l: &mut Lexer<'_>) -> (TokenKind, Span) {
+    let start = l.position;
+    l.advance(); // skip the initial newline
+    let end = l.position;
+
+    // Skip additional newlines after this one, to only produce one token.
+    // These do not count into this newline's span though.
+    loop {
+        whitespace_and_comments(l);
+        if l.current() == '\n' {
+            l.advance();
+            continue;
+        } else {
+            break;
+        }
+    }
+
+    (TokenKind::Newline, Span::new(start, end))
+}
+
+fn token(l: &mut Lexer<'_>) -> (TokenKind, Span) {
+    whitespace_and_comments(l);
+
+    let start = l.position;
+    let kind = match l.current() {
+        '\0' => TokenKind::Eof,
+
+        // NOTE: Order matters here. Numbers and tags take priority over identifiers.
+        c if c.is_ascii_uppercase() => tag(l),
+        c if c.is_ascii_digit() => number(l),
+        c if is_ident_char(c) => ident(l),
+
+        '#' => color(l),
+
+        '+' => one(l, TokenKind::Plus),
+        '-' => one_or_two(l, TokenKind::Minus, '>', TokenKind::RArrow),
+        '*' => one(l, TokenKind::Star),
+        '/' => one(l, TokenKind::Slash),
+        '=' => one_or_two(l, TokenKind::Equal, '=', TokenKind::EqualEqual),
+        '!' => one_or_two(l, TokenKind::Not, '=', TokenKind::NotEqual),
+        '<' => one_or_two(l, TokenKind::Less, '=', TokenKind::LessEqual),
+        '>' => one_or_two(l, TokenKind::Greater, '=', TokenKind::GreaterEqual),
+
+        '\n' => return newline(l),
+        '(' => one(l, TokenKind::LParen),
+        ')' => one(l, TokenKind::RParen),
+        '[' => one(l, TokenKind::LBrack),
+        ']' => one(l, TokenKind::RBrack),
+        ',' => one(l, TokenKind::Comma),
+        '\\' => one(l, TokenKind::Backslash),
+
+        _ => {
+            l.advance();
+            l.emit(Diagnostic::error(
+                Span::new(start, l.position),
+                "unexpected character",
+            ));
+            TokenKind::Error
+        }
+    };
+    let end = l.position;
+    (kind, Span::new(start, end))
+}
+
+pub fn lex(l: &mut Lexer<'_>) -> Result<(), TokenAllocError> {
+    loop {
+        let (kind, span) = token(l);
+        l.lexis.push(kind, span)?;
+        if kind == TokenKind::Eof {
+            break;
+        }
+    }
+    Ok(())
+}
diff --git a/crates/haku/src/lib.rs b/crates/haku/src/lib.rs
index 22aecc8..81e69f2 100644
--- a/crates/haku/src/lib.rs
+++ b/crates/haku/src/lib.rs
@@ -2,10 +2,15 @@
 
 extern crate alloc;
 
+pub mod ast;
 pub mod bytecode;
 pub mod compiler;
+pub mod diagnostic;
+pub mod lexer;
+pub mod parser;
 pub mod render;
-pub mod sexp;
+pub mod source;
 pub mod system;
+pub mod token;
 pub mod value;
 pub mod vm;
diff --git a/crates/haku/src/parser.rs b/crates/haku/src/parser.rs
new file mode 100644
index 0000000..df487ee
--- /dev/null
+++ b/crates/haku/src/parser.rs
@@ -0,0 +1,607 @@
+use core::cell::Cell;
+
+use alloc::vec::Vec;
+
+use crate::{
+    ast::{Ast, NodeAllocError, NodeId, NodeKind},
+    diagnostic::Diagnostic,
+    source::Span,
+    token::{Lexis, TokenKind, TokenKindSet},
+};
+
+#[derive(Debug, Clone, Copy)]
+pub struct ParserLimits {
+    pub max_events: usize,
+}
+
+pub struct Parser<'a> {
+    tokens: &'a Lexis,
+    events: Vec<Event>,
+    position: u32,
+    fuel: Cell<u32>,
+    pub diagnostics: Vec<Diagnostic>,
+}
+
+#[derive(Debug)]
+enum Event {
+    Open { kind: NodeKind },
+    Close,
+    Advance,
+}
+
+struct Open {
+    index: Option<usize>,
+}
+
+struct Closed {
+    index: Option<usize>,
+}
+
+impl<'a> Parser<'a> {
+    const FUEL: u32 = 256;
+
+    pub fn new(input: &'a Lexis, limits: &ParserLimits) -> Self {
+        assert!(limits.max_events < u32::MAX as usize);
+
+        Self {
+            tokens: input,
+            events: Vec::with_capacity(limits.max_events),
+            position: 0,
+            diagnostics: Vec::with_capacity(16),
+            fuel: Cell::new(Self::FUEL),
+        }
+    }
+
+    fn event(&mut self, event: Event) -> Option<usize> {
+        if self.events.len() < self.events.capacity() {
+            let index = self.events.len();
+            self.events.push(event);
+            Some(index)
+        } else {
+            None
+        }
+    }
+
+    fn open(&mut self) -> Open {
+        Open {
+            index: self.event(Event::Open {
+                kind: NodeKind::Error,
+            }),
+        }
+    }
+
+    fn open_before(&mut self, closed: Closed) -> Open {
+        if let Some(index) = closed.index {
+            if self.events.len() < self.events.capacity() {
+                self.events.insert(
+                    index,
+                    Event::Open {
+                        kind: NodeKind::Error,
+                    },
+                );
+                return Open { index: Some(index) };
+            }
+        }
+        Open { index: None }
+    }
+
+    fn close(&mut self, open: Open, kind: NodeKind) -> Closed {
+        if let Some(index) = open.index {
+            self.events[index] = Event::Open { kind };
+            self.event(Event::Close);
+            Closed { index: Some(index) }
+        } else {
+            Closed { index: None }
+        }
+    }
+
+    fn is_eof(&self) -> bool {
+        self.peek() == TokenKind::Eof
+    }
+
+    fn advance(&mut self) {
+        if !self.is_eof() {
+            self.position += 1;
+            self.event(Event::Advance);
+            self.fuel.set(Self::FUEL);
+        }
+    }
+
+    #[track_caller]
+    fn peek(&self) -> TokenKind {
+        assert_ne!(self.fuel.get(), 0, "parser is stuck");
+        self.fuel.set(self.fuel.get() - 1);
+
+        self.tokens.kind(self.position)
+    }
+
+    fn span(&self) -> Span {
+        self.tokens.span(self.position)
+    }
+
+    fn emit(&mut self, diagnostic: Diagnostic) {
+        if self.diagnostics.len() < self.diagnostics.capacity() {
+            self.diagnostics.push(diagnostic);
+        }
+    }
+
+    fn advance_with_error(&mut self) -> Closed {
+        let opened = self.open();
+        self.advance();
+        self.close(opened, NodeKind::Error)
+    }
+
+    fn optional_newline(&mut self) -> bool {
+        if self.peek() == TokenKind::Newline {
+            self.advance();
+            true
+        } else {
+            false
+        }
+    }
+
+    pub fn into_ast(self, ast: &mut Ast) -> Result<(NodeId, Vec<Diagnostic>), NodeAllocError> {
+        let mut token = 0;
+        let mut events = self.events;
+        let mut stack = Vec::new();
+
+        struct StackEntry {
+            node_id: NodeId,
+            // TODO: This should probably be optimized to use a shared stack.
+            children: Vec<NodeId>,
+        }
+
+        // Remove the last Close to keep a single node on the stack.
+        assert!(matches!(events.pop(), Some(Event::Close)));
+
+        for event in events {
+            match event {
+                Event::Open { kind } => {
+                    stack.push(StackEntry {
+                        node_id: ast.alloc(kind, self.tokens.span(token))?,
+                        children: Vec::new(),
+                    });
+                }
+                Event::Close => {
+                    let end_span = self.tokens.span(token.saturating_sub(1));
+                    let stack_entry = stack.pop().unwrap();
+                    ast.alloc_children(stack_entry.node_id, &stack_entry.children);
+                    ast.extend_span(stack_entry.node_id, end_span.end);
+                    stack.last_mut().unwrap().children.push(stack_entry.node_id);
+                }
+                Event::Advance => {
+                    let span = self.tokens.span(token);
+                    let node_id = ast.alloc(NodeKind::Token, span)?;
+                    stack
+                        .last_mut()
+                        .expect("advance() may only be used in an open node")
+                        .children
+                        .push(node_id);
+                    token += 1;
+                }
+            }
+        }
+
+        if stack.len() != 1 {
+            // This means we had too many events emitted and they are no longer balanced.
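+            // (Once the event buffer fills up to max_events, further events are silently
+            // dropped, which leaves unmatched Open/Close pairs behind.)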
+            return Err(NodeAllocError);
+        }
+        // assert_eq!(token, self.tokens.len());
+
+        let end_span = self.tokens.span(token.saturating_sub(1));
+        let stack_entry = stack.pop().unwrap();
+        ast.alloc_children(stack_entry.node_id, &stack_entry.children);
+        ast.extend_span(stack_entry.node_id, end_span.end);
+
+        Ok((stack_entry.node_id, self.diagnostics))
+    }
+}
+
+impl<'a> core::fmt::Debug for Parser<'a> {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        f.debug_struct("Parser")
+            .field("events", &self.events)
+            .finish_non_exhaustive()
+    }
+}
+
+enum Tighter {
+    Left,
+    Right,
+}
+
+fn tighter(left: TokenKind, right: TokenKind) -> Tighter {
+    fn tightness(kind: TokenKind) -> Option<u8> {
+        match kind {
+            TokenKind::Equal => Some(0),
+            TokenKind::EqualEqual
+            | TokenKind::NotEqual
+            | TokenKind::Less
+            | TokenKind::LessEqual
+            | TokenKind::Greater
+            | TokenKind::GreaterEqual => Some(1),
+            TokenKind::Plus | TokenKind::Minus => Some(2),
+            TokenKind::Star | TokenKind::Slash => Some(3),
+            _ if PREFIX_TOKENS.contains(kind) => Some(4),
+            _ => None,
+        }
+    }
+
+    let Some(right_tightness) = tightness(right) else {
+        return Tighter::Left;
+    };
+    let Some(left_tightness) = tightness(left) else {
+        assert!(left == TokenKind::Eof);
+        return Tighter::Right;
+    };
+
+    if right_tightness > left_tightness {
+        Tighter::Right
+    } else {
+        Tighter::Left
+    }
+}
+
+fn precedence_parse(p: &mut Parser, left: TokenKind) {
+    let mut lhs = prefix(p);
+
+    loop {
+        let right = p.peek();
+        match tighter(left, right) {
+            Tighter::Left => break,
+            Tighter::Right => {
+                let o = p.open_before(lhs);
+                let kind = infix(p, right);
+                lhs = p.close(o, kind);
+            }
+        }
+    }
+}
+
+fn one(p: &mut Parser, kind: NodeKind) -> Closed {
+    let o = p.open();
+    p.advance();
+    p.close(o, kind)
+}
+
+fn list(p: &mut Parser) -> Closed {
+    let o = p.open();
+    let lspan = p.span();
+    p.advance(); // [
+    p.optional_newline();
+
+    loop {
+        match p.peek() {
+            TokenKind::Eof => {
+                p.emit(Diagnostic::error(lspan, "missing `]` to close this list"));
+                break;
+            }
+
+            TokenKind::RBrack => {
+                p.advance();
+                break;
+            }
+
+            _ => (),
+        }
+
+        expr(p);
+
+        match p.peek() {
+            TokenKind::Comma | TokenKind::Newline => {
+                p.advance();
+                continue;
+            }
+
+            TokenKind::RBrack => {
+                p.advance();
+                break;
+            }
+
+            _ => {
+                let span = p.span();
+                p.emit(Diagnostic::error(
+                    span,
+                    "comma `,` or new line expected after list element",
+                ));
+                p.advance_with_error();
+            }
+        }
+    }
+
+    p.close(o, NodeKind::List)
+}
+
+fn unary(p: &mut Parser) -> Closed {
+    let o = p.open();
+
+    let op = p.open();
+    p.advance();
+    p.close(op, NodeKind::Op);
+
+    prefix(p);
+
+    p.close(o, NodeKind::Unary)
+}
+
+fn paren(p: &mut Parser) -> Closed {
+    let o = p.open();
+    let lspan = p.span();
+    p.advance(); // (
+    if p.peek() == TokenKind::RParen {
+        p.advance(); // )
+        p.close(o, NodeKind::ParenEmpty)
+    } else {
+        p.optional_newline();
+        expr(p);
+        p.optional_newline();
+        if p.peek() != TokenKind::RParen {
+            p.emit(Diagnostic::error(lspan, "missing closing parenthesis `)`"));
+            p.advance_with_error()
+        } else {
+            p.advance();
+            p.close(o, NodeKind::Paren)
+        }
+    }
+}
+
+fn param(p: &mut Parser) {
+    let o = p.open();
+
+    if let TokenKind::Ident | TokenKind::Underscore = p.peek() {
+        p.advance();
+    } else {
+        let span = p.span();
+        p.emit(Diagnostic::error(
+            span,
+            "parameter names must be identifiers or `_`",
+        ));
+        p.advance_with_error();
+    }
+
+    p.close(o, NodeKind::Param);
+}
+
+fn lambda(p: &mut Parser) -> Closed {
+    let o = p.open();
+    p.advance(); // backslash
+
+    let params = p.open();
+    loop {
+        param(p);
+        match p.peek() {
+            TokenKind::Comma => {
+                p.advance();
+                continue;
+            }
+
+            TokenKind::RArrow => break,
+
+            _ => {
+                let span = p.span();
+                p.emit(Diagnostic::error(
+                    span,
+                    "`,` or `->` expected after function parameter",
+                ));
+                p.advance_with_error();
+                break;
+            }
+        }
+    }
+    p.close(params, NodeKind::Params);
+
+    // NOTE: Can be false if there are some stray tokens.
+    // We prefer to bail early and let the rest of the program parse.
+    if p.peek() == TokenKind::RArrow {
+        p.advance();
+        p.optional_newline();
+        expr(p);
+    }
+
+    p.close(o, NodeKind::Lambda)
+}
+
+fn if_expr(p: &mut Parser) -> Closed {
+    let o = p.open();
+
+    p.advance(); // if
+    if p.peek() == TokenKind::LParen {
+        p.advance();
+    } else {
+        let span = p.span();
+        p.emit(Diagnostic::error(
+            span,
+            "the condition in an `if` expression must be surrounded with parentheses",
+        ));
+        // NOTE: Don't advance, it's more likely the programmer expected no parentheses to be needed.
+    }
+    expr(p); // Condition
+    if p.peek() != TokenKind::RParen {
+        let span = p.span();
+        p.emit(Diagnostic::error(
+            span,
+            "missing closing parenthesis after `if` condition",
+        ));
+    }
+    p.advance();
+    p.optional_newline();
+
+    expr(p); // True branch
+    p.optional_newline();
+
+    if p.peek() != TokenKind::Else {
+        let span = p.span();
+        p.emit(Diagnostic::error(
+            span,
+            "`if` expression is missing an `else` clause",
+        ));
+    }
+    p.advance();
+    p.optional_newline();
+
+    expr(p); // False branch
+
+    p.close(o, NodeKind::If)
+}
+
+fn let_expr(p: &mut Parser) -> Closed {
+    let o = p.open();
+
+    p.advance(); // let
+
+    if p.peek() == TokenKind::Ident {
+        let ident = p.open();
+        p.advance();
+        p.close(ident, NodeKind::Ident);
+    } else {
+        let span = p.span();
+        p.emit(Diagnostic::error(span, "`let` variable name expected"));
+        p.advance_with_error();
+    }
+
+    if p.peek() == TokenKind::Equal {
+        p.advance();
+    } else {
+        let span = p.span();
+        p.emit(Diagnostic::error(span, "`=` expected after variable name"));
+        p.advance_with_error();
+    }
+
+    expr(p);
+
+    if p.peek() == TokenKind::Newline {
+        p.advance();
+    } else {
+        let span = p.span();
+        p.emit(Diagnostic::error(
+            span,
+            "new line expected after `let` expression",
+        ));
+        p.advance_with_error();
+    }
+
+    expr(p);
+
+    p.close(o, NodeKind::Let)
+}
+
+const PREFIX_TOKENS: TokenKindSet = TokenKindSet::new(&[
+    TokenKind::Ident,
+    TokenKind::Tag,
+    TokenKind::Number,
+    TokenKind::Color,
+    // NOTE: This is ambiguous in function calls.
+    // In that case, the infix operator takes precedence (because the `match` arms for the infix op
+    // come first.)
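+    // For example, `f -1` parses as the subtraction `f - 1`, not as the call `f (-1)`.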
+ TokenKind::Minus, + TokenKind::Not, + TokenKind::LParen, + TokenKind::Backslash, + TokenKind::If, + TokenKind::Let, + TokenKind::LBrack, +]); + +fn prefix(p: &mut Parser) -> Closed { + match p.peek() { + TokenKind::Ident => one(p, NodeKind::Ident), + TokenKind::Tag => one(p, NodeKind::Tag), + TokenKind::Number => one(p, NodeKind::Number), + TokenKind::Color => one(p, NodeKind::Color), + TokenKind::LBrack => list(p), + + TokenKind::Minus | TokenKind::Not => unary(p), + TokenKind::LParen => paren(p), + TokenKind::Backslash => lambda(p), + TokenKind::If => if_expr(p), + TokenKind::Let => let_expr(p), + + _ => { + assert!( + !PREFIX_TOKENS.contains(p.peek()), + "{:?} found in PREFIX_TOKENS", + p.peek() + ); + + let span = p.span(); + p.emit(Diagnostic::error( + span, + "an expression was expected, but this token does not start one", + )); + p.advance_with_error() + } + } +} + +fn infix(p: &mut Parser, op: TokenKind) -> NodeKind { + match op { + TokenKind::Plus + | TokenKind::Minus + | TokenKind::Star + | TokenKind::Slash + | TokenKind::EqualEqual + | TokenKind::NotEqual + | TokenKind::Less + | TokenKind::LessEqual + | TokenKind::Greater + | TokenKind::GreaterEqual + | TokenKind::Equal => infix_binary(p, op), + + _ if PREFIX_TOKENS.contains(op) => infix_call(p), + + _ => panic!("unhandled infix operator {op:?}"), + } +} + +fn infix_binary(p: &mut Parser, op: TokenKind) -> NodeKind { + let o = p.open(); + p.advance(); + p.close(o, NodeKind::Op); + + if p.peek() == TokenKind::Newline { + p.advance(); + } + + precedence_parse(p, op); + NodeKind::Binary +} + +fn infix_call(p: &mut Parser) -> NodeKind { + while PREFIX_TOKENS.contains(p.peek()) { + prefix(p); + } + + NodeKind::Call +} + +pub fn expr(p: &mut Parser) { + precedence_parse(p, TokenKind::Eof) +} + +pub fn toplevel(p: &mut Parser) { + let o = p.open(); + p.optional_newline(); + while p.peek() != TokenKind::Eof { + expr(p); + + match p.peek() { + TokenKind::Newline => { + p.advance(); + continue; + } + + TokenKind::Eof => break, + + _ => { + let span = p.span(); + p.emit(Diagnostic::error( + span, + "newline expected after toplevel expression", + )) + } + } + } + p.close(o, NodeKind::Toplevel); +} + +#[cfg(test)] +mod tests; diff --git a/crates/haku/src/parser/tests.rs b/crates/haku/src/parser/tests.rs new file mode 100644 index 0000000..381bc1a --- /dev/null +++ b/crates/haku/src/parser/tests.rs @@ -0,0 +1,912 @@ +use alloc::{format, string::String}; + +use crate::{ + ast::{dump::dump, Ast, NodeId}, + lexer::{lex, Lexer}, + parser::expr, + source::SourceCode, + token::Lexis, +}; + +use super::{toplevel, Parser, ParserLimits}; + +fn parse(s: &str, f: fn(&mut Parser)) -> (Ast, NodeId) { + let mut lexer = Lexer::new(Lexis::new(1024), SourceCode::unlimited_len(s)); + lex(&mut lexer).expect("too many tokens"); + + let mut parser = Parser::new(&lexer.lexis, &ParserLimits { max_events: 1024 }); + f(&mut parser); + + if !parser.diagnostics.is_empty() { + panic!("parser emitted diagnostics: {:#?}", parser.diagnostics); + } + + let mut ast = Ast::new(1024); + let (root, _) = parser.into_ast(&mut ast).unwrap(); + (ast, root) +} + +fn ast(s: &str, f: fn(&mut Parser)) -> String { + let (ast, root) = parse(s, f); + // The extra newline is mostly so that it's easier to make the string literals look nice. + format!("\n{}", dump(&ast, root, None)) +} + +#[track_caller] +fn assert_ast_eq(s: &str, f: fn(&mut Parser), ast_s: &str) { + let got = ast(s, f); + if ast_s != got { + panic!("AST mismatch. 
expected:\n{ast_s}\n\ngot:\n{got}\n"); + } +} + +#[test] +fn one_literals() { + assert_ast_eq( + "1", + expr, + " +Number @ 0..1 + Token @ 0..1", + ); + + assert_ast_eq( + "ExampleTag123", + expr, + " +Tag @ 0..13 + Token @ 0..13", + ); + + assert_ast_eq( + "example_ident123", + expr, + " +Ident @ 0..16 + Token @ 0..16", + ); + + assert_ast_eq( + "#000", + expr, + " +Color @ 0..4 + Token @ 0..4", + ); + + assert_ast_eq( + "#000F", + expr, + " +Color @ 0..5 + Token @ 0..5", + ); + + assert_ast_eq( + "#058EF0", + expr, + " +Color @ 0..7 + Token @ 0..7", + ); + + assert_ast_eq( + "#058EF0FF", + expr, + " +Color @ 0..9 + Token @ 0..9", + ); +} + +#[test] +fn list() { + assert_ast_eq( + "[]", + expr, + " +List @ 0..2 + Token @ 0..1 + Token @ 1..2", + ); + + assert_ast_eq( + "[1]", + expr, + " +List @ 0..3 + Token @ 0..1 + Number @ 1..2 + Token @ 1..2 + Token @ 2..3", + ); + + assert_ast_eq( + "[1, 2]", + expr, + " +List @ 0..6 + Token @ 0..1 + Number @ 1..2 + Token @ 1..2 + Token @ 2..3 + Number @ 4..5 + Token @ 4..5 + Token @ 5..6", + ); + + assert_ast_eq( + "[ + 1 + 2 + ]", + expr, + " +List @ 0..42 + Token @ 0..1 + Token @ 1..2 + Number @ 15..16 + Token @ 15..16 + Token @ 16..17 + Number @ 30..31 + Token @ 30..31 + Token @ 31..32 + Token @ 41..42", + ); +} + +#[test] +fn unary() { + assert_ast_eq( + "-1", + expr, + " +Unary @ 0..2 + Op @ 0..1 + Token @ 0..1 + Number @ 1..2 + Token @ 1..2", + ); + + assert_ast_eq( + "!1", + expr, + " +Unary @ 0..2 + Op @ 0..1 + Token @ 0..1 + Number @ 1..2 + Token @ 1..2", + ); +} + +#[test] +fn binary_single() { + assert_ast_eq( + "1 + 1", + expr, + " +Binary @ 0..5 + Number @ 0..1 + Token @ 0..1 + Op @ 2..3 + Token @ 2..3 + Number @ 4..5 + Token @ 4..5", + ); + + assert_ast_eq( + "1 - 1", + expr, + " +Binary @ 0..5 + Number @ 0..1 + Token @ 0..1 + Op @ 2..3 + Token @ 2..3 + Number @ 4..5 + Token @ 4..5", + ); + + assert_ast_eq( + "1 * 1", + expr, + " +Binary @ 0..5 + Number @ 0..1 + Token @ 0..1 + Op @ 2..3 + Token @ 2..3 + Number @ 4..5 + Token @ 4..5", + ); + + assert_ast_eq( + "1 / 1", + expr, + " +Binary @ 0..5 + Number @ 0..1 + Token @ 0..1 + Op @ 2..3 + Token @ 2..3 + Number @ 4..5 + Token @ 4..5", + ); + + assert_ast_eq( + "1 < 1", + expr, + " +Binary @ 0..5 + Number @ 0..1 + Token @ 0..1 + Op @ 2..3 + Token @ 2..3 + Number @ 4..5 + Token @ 4..5", + ); + + assert_ast_eq( + "1 > 1", + expr, + " +Binary @ 0..5 + Number @ 0..1 + Token @ 0..1 + Op @ 2..3 + Token @ 2..3 + Number @ 4..5 + Token @ 4..5", + ); + + assert_ast_eq( + "1 == 1", + expr, + " +Binary @ 0..6 + Number @ 0..1 + Token @ 0..1 + Op @ 2..4 + Token @ 2..4 + Number @ 5..6 + Token @ 5..6", + ); + + assert_ast_eq( + "1 != 1", + expr, + " +Binary @ 0..6 + Number @ 0..1 + Token @ 0..1 + Op @ 2..4 + Token @ 2..4 + Number @ 5..6 + Token @ 5..6", + ); + + assert_ast_eq( + "1 <= 1", + expr, + " +Binary @ 0..6 + Number @ 0..1 + Token @ 0..1 + Op @ 2..4 + Token @ 2..4 + Number @ 5..6 + Token @ 5..6", + ); + + assert_ast_eq( + "1 >= 1", + expr, + " +Binary @ 0..6 + Number @ 0..1 + Token @ 0..1 + Op @ 2..4 + Token @ 2..4 + Number @ 5..6 + Token @ 5..6", + ); + + assert_ast_eq( + "1 = 1", + expr, + " +Binary @ 0..5 + Number @ 0..1 + Token @ 0..1 + Op @ 2..3 + Token @ 2..3 + Number @ 4..5 + Token @ 4..5", + ); +} + +#[test] +fn binary_precedence() { + assert_ast_eq( + "1 + 1 + 1", + expr, + " +Binary @ 0..9 + Binary @ 0..5 + Number @ 0..1 + Token @ 0..1 + Op @ 2..3 + Token @ 2..3 + Number @ 4..5 + Token @ 4..5 + Op @ 6..7 + Token @ 6..7 + Number @ 8..9 + Token @ 8..9", + ); + + assert_ast_eq( + "1 * 1 + 
1", + expr, + " +Binary @ 0..9 + Binary @ 0..5 + Number @ 0..1 + Token @ 0..1 + Op @ 2..3 + Token @ 2..3 + Number @ 4..5 + Token @ 4..5 + Op @ 6..7 + Token @ 6..7 + Number @ 8..9 + Token @ 8..9", + ); + + assert_ast_eq( + "1 + 1 * 1", + expr, + " +Binary @ 0..9 + Number @ 0..1 + Token @ 0..1 + Op @ 2..3 + Token @ 2..3 + Binary @ 4..9 + Number @ 4..5 + Token @ 4..5 + Op @ 6..7 + Token @ 6..7 + Number @ 8..9 + Token @ 8..9", + ); + + assert_ast_eq( + "1 < 1 + 1", + expr, + " +Binary @ 0..9 + Number @ 0..1 + Token @ 0..1 + Op @ 2..3 + Token @ 2..3 + Binary @ 4..9 + Number @ 4..5 + Token @ 4..5 + Op @ 6..7 + Token @ 6..7 + Number @ 8..9 + Token @ 8..9", + ); + + assert_ast_eq( + "1 + 1 < 1", + expr, + " +Binary @ 0..9 + Binary @ 0..5 + Number @ 0..1 + Token @ 0..1 + Op @ 2..3 + Token @ 2..3 + Number @ 4..5 + Token @ 4..5 + Op @ 6..7 + Token @ 6..7 + Number @ 8..9 + Token @ 8..9", + ); + + assert_ast_eq( + "1 + 1 * 1 < 1", + expr, + " +Binary @ 0..13 + Binary @ 0..9 + Number @ 0..1 + Token @ 0..1 + Op @ 2..3 + Token @ 2..3 + Binary @ 4..9 + Number @ 4..5 + Token @ 4..5 + Op @ 6..7 + Token @ 6..7 + Number @ 8..9 + Token @ 8..9 + Op @ 10..11 + Token @ 10..11 + Number @ 12..13 + Token @ 12..13", + ); + + assert_ast_eq( + "1 * 1 + 1 < 1", + expr, + " +Binary @ 0..13 + Binary @ 0..9 + Binary @ 0..5 + Number @ 0..1 + Token @ 0..1 + Op @ 2..3 + Token @ 2..3 + Number @ 4..5 + Token @ 4..5 + Op @ 6..7 + Token @ 6..7 + Number @ 8..9 + Token @ 8..9 + Op @ 10..11 + Token @ 10..11 + Number @ 12..13 + Token @ 12..13", + ); +} + +#[test] +fn binary_cont() { + assert_ast_eq( + "1 + + 1", + expr, + " +Binary @ 0..16 + Number @ 0..1 + Token @ 0..1 + Op @ 2..3 + Token @ 2..3 + Token @ 3..4 + Number @ 15..16 + Token @ 15..16", + ); + + assert_ast_eq( + "1 + + + 1", + expr, + " +Binary @ 0..17 + Number @ 0..1 + Token @ 0..1 + Op @ 2..3 + Token @ 2..3 + Token @ 3..4 + Number @ 16..17 + Token @ 16..17", + ); +} + +#[test] +fn paren_empty() { + assert_ast_eq( + "()", + expr, + " +ParenEmpty @ 0..2 + Token @ 0..1 + Token @ 1..2", + ); +} + +#[test] +fn paren() { + assert_ast_eq( + "(1)", + expr, + " +Paren @ 0..3 + Token @ 0..1 + Number @ 1..2 + Token @ 1..2 + Token @ 2..3", + ); + + assert_ast_eq( + "(1 + 1) * 1", + expr, + " +Binary @ 0..11 + Paren @ 0..7 + Token @ 0..1 + Binary @ 1..6 + Number @ 1..2 + Token @ 1..2 + Op @ 3..4 + Token @ 3..4 + Number @ 5..6 + Token @ 5..6 + Token @ 6..7 + Op @ 8..9 + Token @ 8..9 + Number @ 10..11 + Token @ 10..11", + ); + + assert_ast_eq( + "1 * (1 + 1)", + expr, + " +Binary @ 0..11 + Number @ 0..1 + Token @ 0..1 + Op @ 2..3 + Token @ 2..3 + Paren @ 4..11 + Token @ 4..5 + Binary @ 5..10 + Number @ 5..6 + Token @ 5..6 + Op @ 7..8 + Token @ 7..8 + Number @ 9..10 + Token @ 9..10 + Token @ 10..11", + ); + + assert_ast_eq( + "( + 1 + + 1 + )", + expr, + " +Paren @ 0..47 + Token @ 0..1 + Token @ 1..2 + Binary @ 15..33 + Number @ 15..16 + Token @ 15..16 + Op @ 17..18 + Token @ 17..18 + Token @ 18..19 + Number @ 32..33 + Token @ 32..33 + Token @ 36..37 + Token @ 46..47", + ); +} + +#[test] +fn infix_call() { + assert_ast_eq( + "f x y", + toplevel, + " +Toplevel @ 0..5 + Call @ 0..5 + Ident @ 0..1 + Token @ 0..1 + Ident @ 2..3 + Token @ 2..3 + Ident @ 4..5 + Token @ 4..5", + ); + + assert_ast_eq( + "sin 1 + cos 2", + toplevel, + " +Toplevel @ 0..13 + Binary @ 0..13 + Call @ 0..5 + Ident @ 0..3 + Token @ 0..3 + Number @ 4..5 + Token @ 4..5 + Op @ 6..7 + Token @ 6..7 + Call @ 8..13 + Ident @ 8..11 + Token @ 8..11 + Number @ 12..13 + Token @ 12..13", + ); +} + +#[test] +fn 
infix_call_unary_arg() { + assert_ast_eq( + // NOTE: The whitespace here is misleading. + // This is a binary `-`. + "f -1", + toplevel, + " +Toplevel @ 0..4 + Binary @ 0..4 + Ident @ 0..1 + Token @ 0..1 + Op @ 2..3 + Token @ 2..3 + Number @ 3..4 + Token @ 3..4", + ); + + assert_ast_eq( + "f (-1)", + toplevel, + " +Toplevel @ 0..6 + Call @ 0..6 + Ident @ 0..1 + Token @ 0..1 + Paren @ 2..6 + Token @ 2..3 + Unary @ 3..5 + Op @ 3..4 + Token @ 3..4 + Number @ 4..5 + Token @ 4..5 + Token @ 5..6", + ); +} + +#[test] +fn lambda() { + assert_ast_eq( + r#" \_ -> () "#, + toplevel, + " +Toplevel @ 1..9 + Lambda @ 1..9 + Token @ 1..2 + Params @ 2..3 + Param @ 2..3 + Token @ 2..3 + Token @ 4..6 + ParenEmpty @ 7..9 + Token @ 7..8 + Token @ 8..9", + ); + + assert_ast_eq( + r#" \x -> x "#, + toplevel, + " +Toplevel @ 1..8 + Lambda @ 1..8 + Token @ 1..2 + Params @ 2..3 + Param @ 2..3 + Token @ 2..3 + Token @ 4..6 + Ident @ 7..8 + Token @ 7..8", + ); + + assert_ast_eq( + r#" \x, y -> x + y "#, + toplevel, + " +Toplevel @ 1..15 + Lambda @ 1..15 + Token @ 1..2 + Params @ 2..6 + Param @ 2..3 + Token @ 2..3 + Token @ 3..4 + Param @ 5..6 + Token @ 5..6 + Token @ 7..9 + Binary @ 10..15 + Ident @ 10..11 + Token @ 10..11 + Op @ 12..13 + Token @ 12..13 + Ident @ 14..15 + Token @ 14..15", + ); + + assert_ast_eq( + r#" \x, y -> + x + y "#, + toplevel, + " +Toplevel @ 1..29 + Lambda @ 1..29 + Token @ 1..2 + Params @ 2..6 + Param @ 2..3 + Token @ 2..3 + Token @ 3..4 + Param @ 5..6 + Token @ 5..6 + Token @ 7..9 + Token @ 9..10 + Binary @ 24..29 + Ident @ 24..25 + Token @ 24..25 + Op @ 26..27 + Token @ 26..27 + Ident @ 28..29 + Token @ 28..29", + ); + + assert_ast_eq( + r#" f \x -> g \y -> x + y "#, + toplevel, + " +Toplevel @ 1..22 + Call @ 1..22 + Ident @ 1..2 + Token @ 1..2 + Lambda @ 3..22 + Token @ 3..4 + Params @ 4..5 + Param @ 4..5 + Token @ 4..5 + Token @ 6..8 + Call @ 9..22 + Ident @ 9..10 + Token @ 9..10 + Lambda @ 11..22 + Token @ 11..12 + Params @ 12..13 + Param @ 12..13 + Token @ 12..13 + Token @ 14..16 + Binary @ 17..22 + Ident @ 17..18 + Token @ 17..18 + Op @ 19..20 + Token @ 19..20 + Ident @ 21..22 + Token @ 21..22", + ); + + assert_ast_eq( + r#" f \x -> + g \y -> + x + y "#, + toplevel, + " +Toplevel @ 1..48 + Call @ 1..48 + Ident @ 1..2 + Token @ 1..2 + Lambda @ 3..48 + Token @ 3..4 + Params @ 4..5 + Param @ 4..5 + Token @ 4..5 + Token @ 6..8 + Token @ 8..9 + Call @ 21..48 + Ident @ 21..22 + Token @ 21..22 + Lambda @ 23..48 + Token @ 23..24 + Params @ 24..25 + Param @ 24..25 + Token @ 24..25 + Token @ 26..28 + Token @ 28..29 + Binary @ 43..48 + Ident @ 43..44 + Token @ 43..44 + Op @ 45..46 + Token @ 45..46 + Ident @ 47..48 + Token @ 47..48", + ); +} + +#[test] +fn if_expr() { + assert_ast_eq( + r#" if (true) 1 else 2 "#, + toplevel, + " +Toplevel @ 1..19 + If @ 1..19 + Token @ 1..3 + Token @ 4..5 + Ident @ 5..9 + Token @ 5..9 + Token @ 9..10 + Number @ 11..12 + Token @ 11..12 + Token @ 13..17 + Number @ 18..19 + Token @ 18..19", + ); + + assert_ast_eq( + r#" if (true) + 1 + else + 2 "#, + toplevel, + " +Toplevel @ 1..63 + If @ 1..63 + Token @ 1..3 + Token @ 4..5 + Ident @ 5..9 + Token @ 5..9 + Token @ 9..10 + Token @ 10..11 + Number @ 27..28 + Token @ 27..28 + Token @ 28..29 + Token @ 41..45 + Token @ 45..46 + Number @ 62..63 + Token @ 62..63", + ); +} + +#[test] +fn let_expr() { + assert_ast_eq( + r#" let x = 1 + x "#, + toplevel, + " +Toplevel @ 1..24 + Let @ 1..24 + Token @ 1..4 + Ident @ 5..6 + Token @ 5..6 + Token @ 7..8 + Number @ 9..10 + Token @ 9..10 + Token @ 10..11 + Ident @ 23..24 + Token @ 
23..24", + ); + + assert_ast_eq( + r#" let x = 1 + let y = 2 + x + y "#, + toplevel, + " +Toplevel @ 1..50 + Let @ 1..50 + Token @ 1..4 + Ident @ 5..6 + Token @ 5..6 + Token @ 7..8 + Number @ 9..10 + Token @ 9..10 + Token @ 10..11 + Let @ 23..50 + Token @ 23..26 + Ident @ 27..28 + Token @ 27..28 + Token @ 29..30 + Number @ 31..32 + Token @ 31..32 + Token @ 32..33 + Binary @ 45..50 + Ident @ 45..46 + Token @ 45..46 + Op @ 47..48 + Token @ 47..48 + Ident @ 49..50 + Token @ 49..50", + ) +} diff --git a/crates/haku/src/sexp.rs b/crates/haku/src/sexp.rs deleted file mode 100644 index 9a2b8b2..0000000 --- a/crates/haku/src/sexp.rs +++ /dev/null @@ -1,510 +0,0 @@ -use core::{cell::Cell, fmt, ops::Deref}; - -use alloc::vec::Vec; - -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub struct Span { - pub start: usize, - pub end: usize, -} - -impl Span { - pub fn new(start: usize, end: usize) -> Self { - Self { start, end } - } - - pub fn slice<'a>(&self, source: &'a SourceCode) -> &'a str { - &source.code[self.start..self.end] - } -} - -/// Source code string with a verified size limit. -/// An exact size limit is not enforced by this type - it only ensures the string isn't longer than -/// intended, to not stall the parser for an unexpected amount of time. -#[derive(Debug, PartialEq, Eq)] -#[repr(transparent)] -pub struct SourceCode { - code: str, -} - -impl SourceCode { - pub fn limited_len(code: &str, max_len: usize) -> Option<&Self> { - if code.len() <= max_len { - Some(Self::unlimited_len(code)) - } else { - None - } - } - - pub fn unlimited_len(code: &str) -> &Self { - // SAFETY: SourceCode is a transparent wrapper around str, so converting between them is safe. - unsafe { core::mem::transmute(code) } - } -} - -impl Deref for SourceCode { - type Target = str; - - fn deref(&self) -> &Self::Target { - &self.code - } -} - -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -pub struct NodeId(usize); - -impl NodeId { - pub const NIL: NodeId = NodeId(0); -} - -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum NodeKind { - Nil, - Eof, - - // Atoms - Ident, - Number, - - List(NodeId, NodeId), - Toplevel(NodeId), - - Error(&'static str), -} - -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub struct Node { - pub span: Span, - pub kind: NodeKind, -} - -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct Ast { - pub nodes: Vec, -} - -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum AstWriteMode { - Compact, - Spans, -} - -impl Ast { - pub fn new(capacity: usize) -> Self { - assert!(capacity >= 1, "there must be space for at least a nil node"); - - let mut ast = Self { - nodes: Vec::with_capacity(capacity), - }; - - ast.alloc(Node { - span: Span::new(0, 0), - kind: NodeKind::Nil, - }) - .unwrap(); - - ast - } - - pub fn alloc(&mut self, node: Node) -> Result { - if self.nodes.len() >= self.nodes.capacity() { - return Err(NodeAllocError); - } - - let index = self.nodes.len(); - self.nodes.push(node); - Ok(NodeId(index)) - } - - pub fn get(&self, node_id: NodeId) -> &Node { - &self.nodes[node_id.0] - } - - pub fn get_mut(&mut self, node_id: NodeId) -> &mut Node { - &mut self.nodes[node_id.0] - } - - pub fn write( - &self, - source: &SourceCode, - node_id: NodeId, - w: &mut dyn fmt::Write, - mode: AstWriteMode, - ) -> fmt::Result { - #[allow(clippy::too_many_arguments)] - fn write_list( - ast: &Ast, - source: &SourceCode, - w: &mut dyn fmt::Write, - mode: AstWriteMode, - mut head: NodeId, - mut tail: NodeId, - sep_element: &str, - sep_tail: &str, - ) -> fmt::Result { - loop { - write_rec(ast, 
source, w, mode, head)?; - match ast.get(tail).kind { - NodeKind::Nil => break, - NodeKind::List(head2, tail2) => { - w.write_str(sep_element)?; - (head, tail) = (head2, tail2); - } - _ => { - w.write_str(sep_tail)?; - write_rec(ast, source, w, mode, tail)?; - break; - } - } - } - Ok(()) - } - - // NOTE: Separated out to a separate function in case we ever want to introduce auto-indentation. - fn write_rec( - ast: &Ast, - source: &SourceCode, - w: &mut dyn fmt::Write, - mode: AstWriteMode, - node_id: NodeId, - ) -> fmt::Result { - let node = ast.get(node_id); - match &node.kind { - NodeKind::Nil => write!(w, "()")?, - NodeKind::Eof => write!(w, "")?, - NodeKind::Ident | NodeKind::Number => write!(w, "{}", node.span.slice(source))?, - - NodeKind::List(head, tail) => { - w.write_char('(')?; - write_list(ast, source, w, mode, *head, *tail, " ", " . ")?; - w.write_char(')')?; - } - - NodeKind::Toplevel(list) => { - let NodeKind::List(head, tail) = ast.get(*list).kind else { - unreachable!("child of Toplevel must be a List"); - }; - - write_list(ast, source, w, mode, head, tail, "\n", " . ")?; - } - - NodeKind::Error(message) => write!(w, "#error({message})")?, - } - - if mode == AstWriteMode::Spans { - write!(w, "@{}..{}", node.span.start, node.span.end)?; - } - - Ok(()) - } - - write_rec(self, source, w, mode, node_id)?; - - Ok(()) - } -} - -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -pub struct NodeAllocError; - -pub struct Parser<'a> { - pub ast: Ast, - input: &'a SourceCode, - position: usize, - fuel: Cell, - alloc_error: NodeId, -} - -impl<'a> Parser<'a> { - const FUEL: usize = 256; - - pub fn new(mut ast: Ast, input: &'a SourceCode) -> Self { - let alloc_error = ast - .alloc(Node { - span: Span::new(0, 0), - kind: NodeKind::Error("program is too big"), - }) - .expect("there is not enough space in the arena for an error node"); - - Self { - ast, - input, - position: 0, - fuel: Cell::new(Self::FUEL), - alloc_error, - } - } - - #[track_caller] - pub fn current(&self) -> char { - assert_ne!(self.fuel.get(), 0, "parser is stuck"); - self.fuel.set(self.fuel.get() - 1); - - self.input[self.position..].chars().next().unwrap_or('\0') - } - - pub fn advance(&mut self) { - self.position += self.current().len_utf8(); - self.fuel.set(Self::FUEL); - } - - pub fn alloc(&mut self, expr: Node) -> NodeId { - self.ast.alloc(expr).unwrap_or(self.alloc_error) - } -} - -pub fn skip_whitespace_and_comments(p: &mut Parser<'_>) { - loop { - match p.current() { - ' ' | '\t' | '\n' => { - p.advance(); - continue; - } - ';' => { - while p.current() != '\n' && p.current() != '\0' { - p.advance(); - } - } - _ => break, - } - } -} - -fn is_decimal_digit(c: char) -> bool { - c.is_ascii_digit() -} - -pub fn parse_number(p: &mut Parser<'_>) -> NodeKind { - while is_decimal_digit(p.current()) { - p.advance(); - } - if p.current() == '.' { - p.advance(); - if !is_decimal_digit(p.current()) { - return NodeKind::Error("missing digits after decimal point '.' in number literal"); - } - while is_decimal_digit(p.current()) { - p.advance(); - } - } - - NodeKind::Number -} - -fn is_ident(c: char) -> bool { - // The identifier character set is quite limited to help with easy expansion in the future. - // Rationale: - // - alphabet and digits are pretty obvious - // - '-' and '_' can be used for identifier separators, whichever you prefer. - // - '+', '-', '*', '/', '^' are for arithmetic. - // - '=', '!', '<', '>' are fore comparison. - // - '\' is for builtin string constants, such as \n. 
- // For other operators, it's generally clearer to use words (such as `and` and `or`.) - matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_' | '+' | '*' | '/' | '\\' | '^' | '!' | '=' | '<' | '>') -} - -pub fn parse_ident(p: &mut Parser<'_>) -> NodeKind { - while is_ident(p.current()) { - p.advance(); - } - - NodeKind::Ident -} - -struct List { - head: NodeId, - tail: NodeId, -} - -impl List { - fn new() -> Self { - Self { - head: NodeId::NIL, - tail: NodeId::NIL, - } - } - - fn append(&mut self, p: &mut Parser<'_>, node: NodeId) { - let node_span = p.ast.get(node).span; - - let new_tail = p.alloc(Node { - span: node_span, - kind: NodeKind::List(node, NodeId::NIL), - }); - if self.head == NodeId::NIL { - self.head = new_tail; - self.tail = new_tail; - } else { - let old_tail = p.ast.get_mut(self.tail); - let NodeKind::List(expr_before, _) = old_tail.kind else { - return; - }; - *old_tail = Node { - span: Span::new(old_tail.span.start, node_span.end), - kind: NodeKind::List(expr_before, new_tail), - }; - self.tail = new_tail; - } - } -} - -pub fn parse_list(p: &mut Parser<'_>) -> NodeId { - // This could've been a lot simpler if Rust supported tail recursion. - - let start = p.position; - - p.advance(); // skip past opening parenthesis - skip_whitespace_and_comments(p); - - let mut list = List::new(); - - while p.current() != ')' { - if p.current() == '\0' { - return p.alloc(Node { - span: Span::new(start, p.position), - kind: NodeKind::Error("missing ')' to close '('"), - }); - } - - let expr = parse_expr(p); - skip_whitespace_and_comments(p); - - list.append(p, expr); - } - p.advance(); // skip past closing parenthesis - - // If we didn't have any elements, we must not modify the initial Nil with ID 0. - if list.head == NodeId::NIL { - list.head = p.alloc(Node { - span: Span::new(0, 0), - kind: NodeKind::Nil, - }); - } - - let end = p.position; - p.ast.get_mut(list.head).span = Span::new(start, end); - - list.head -} - -pub fn parse_expr(p: &mut Parser<'_>) -> NodeId { - let start = p.position; - let kind = match p.current() { - '\0' => NodeKind::Eof, - c if is_decimal_digit(c) => parse_number(p), - // NOTE: Because of the `match` order, this prevents identifiers from starting with a digit. 
- c if is_ident(c) => parse_ident(p), - '(' => return parse_list(p), - _ => { - p.advance(); - NodeKind::Error("unexpected character") - } - }; - let end = p.position; - - p.alloc(Node { - span: Span::new(start, end), - kind, - }) -} - -pub fn parse_toplevel(p: &mut Parser<'_>) -> NodeId { - let start = p.position; - - let mut nodes = List::new(); - - skip_whitespace_and_comments(p); - while p.current() != '\0' { - let expr = parse_expr(p); - skip_whitespace_and_comments(p); - - nodes.append(p, expr); - } - - let end = p.position; - - p.alloc(Node { - span: Span::new(start, end), - kind: NodeKind::Toplevel(nodes.head), - }) -} - -#[cfg(test)] -mod tests { - use core::error::Error; - - use alloc::{boxed::Box, string::String}; - - use super::*; - - #[track_caller] - fn parse( - f: fn(&mut Parser<'_>) -> NodeId, - source: &str, - expected: &str, - ) -> Result<(), Box> { - let ast = Ast::new(16); - let code = SourceCode::unlimited_len(source); - let mut p = Parser::new(ast, code); - let node = f(&mut p); - let ast = p.ast; - - let mut s = String::new(); - ast.write(code, node, &mut s, AstWriteMode::Spans)?; - - assert_eq!(s, expected); - - Ok(()) - } - - #[test] - fn parse_number() -> Result<(), Box> { - parse(parse_expr, "123", "123@0..3")?; - parse(parse_expr, "123.456", "123.456@0..7")?; - Ok(()) - } - - #[test] - fn parse_ident() -> Result<(), Box> { - parse(parse_expr, "abc", "abc@0..3")?; - parse(parse_expr, "abcABC_01234", "abcABC_01234@0..12")?; - parse(parse_expr, "+-*/\\^!=<>", "+-*/\\^!=<>@0..10")?; - Ok(()) - } - - #[test] - fn parse_list() -> Result<(), Box> { - parse(parse_expr, "()", "()@0..2")?; - parse(parse_expr, "(a a)", "(a@1..2 a@3..4)@0..5")?; - parse(parse_expr, "(a a a)", "(a@1..2 a@3..4 a@5..6)@0..7")?; - parse(parse_expr, "(() ())", "(()@1..3 ()@4..6)@0..7")?; - parse( - parse_expr, - "(nestedy (nest OwO))", - "(nestedy@1..8 (nest@10..14 OwO@15..18)@9..19)@0..20", - )?; - Ok(()) - } - - #[test] - fn oom() -> Result<(), Box> { - parse(parse_expr, "(a a a a a a a a)", "(a@1..2 a@3..4 a@5..6 a@7..8 a@9..10 a@11..12 a@13..14 . #error(program is too big)@0..0)@0..17")?; - parse(parse_expr, "(a a a a a a a a a)", "(a@1..2 a@3..4 a@5..6 a@7..8 a@9..10 a@11..12 a@13..14 . #error(program is too big)@0..0)@0..19")?; - parse(parse_expr, "(a a a a a a a a a a)", "(a@1..2 a@3..4 a@5..6 a@7..8 a@9..10 a@11..12 a@13..14 . #error(program is too big)@0..0)@0..21")?; - parse(parse_expr, "(a a a a a a a a a a a)", "(a@1..2 a@3..4 a@5..6 a@7..8 a@9..10 a@11..12 a@13..14 . #error(program is too big)@0..0)@0..23")?; - Ok(()) - } - - #[test] - fn toplevel() -> Result<(), Box> { - parse( - parse_toplevel, - r#" - (hello world) - (abc) - "#, - "(hello@18..23 world@24..29)@17..30\n(abc@48..51)@47..52@0..65", - )?; - Ok(()) - } -} diff --git a/crates/haku/src/source.rs b/crates/haku/src/source.rs new file mode 100644 index 0000000..3eac60c --- /dev/null +++ b/crates/haku/src/source.rs @@ -0,0 +1,55 @@ +use core::{fmt, ops::Deref}; + +#[derive(Clone, Copy, PartialEq, Eq)] +pub struct Span { + pub start: u32, + pub end: u32, +} + +impl Span { + pub fn new(start: u32, end: u32) -> Self { + Self { start, end } + } + + pub fn slice<'a>(&self, source: &'a SourceCode) -> &'a str { + &source.code[self.start as usize..self.end as usize] + } +} + +impl fmt::Debug for Span { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}..{}", self.start, self.end) + } +} + +/// Source code string with a verified size limit. 
+/// An exact size limit is not enforced by this type - it only ensures the string isn't longer than +/// intended, to not stall the parser for an unexpected amount of time. +#[derive(Debug, PartialEq, Eq)] +#[repr(transparent)] +pub struct SourceCode { + code: str, +} + +impl SourceCode { + pub fn limited_len(code: &str, max_len: u32) -> Option<&Self> { + if code.len() <= max_len as usize { + Some(Self::unlimited_len(code)) + } else { + None + } + } + + pub fn unlimited_len(code: &str) -> &Self { + // SAFETY: SourceCode is a transparent wrapper around str, so converting between them is safe. + unsafe { core::mem::transmute(code) } + } +} + +impl Deref for SourceCode { + type Target = str; + + fn deref(&self) -> &Self::Target { + &self.code + } +} diff --git a/crates/haku/src/system.rs b/crates/haku/src/system.rs index ef5686e..7b04bcd 100644 --- a/crates/haku/src/system.rs +++ b/crates/haku/src/system.rs @@ -16,10 +16,17 @@ pub type SystemFn = fn(&mut Vm, FnArgs) -> Result; #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub struct ChunkId(u32); +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum SystemFnArity { + Unary, + Binary, + Nary, +} + #[derive(Debug, Clone)] pub struct System { /// Resolves a system function name to an index into `fn`s. - pub resolve_fn: fn(&str) -> Option, + pub resolve_fn: fn(SystemFnArity, &str) -> Option, pub fns: [Option; 256], pub chunks: Vec, } @@ -30,7 +37,7 @@ pub struct SystemImage { } macro_rules! def_fns { - ($($index:tt $name:tt => $fnref:expr),* $(,)?) => { + ($($index:tt $arity:tt $name:tt => $fnref:expr),* $(,)?) => { pub(crate) fn init_fns(system: &mut System) { $( debug_assert!(system.fns[$index].is_none()); @@ -38,9 +45,9 @@ macro_rules! def_fns { )* } - pub(crate) fn resolve(name: &str) -> Option { - match name { - $($name => Some($index),)* + pub(crate) fn resolve(arity: SystemFnArity, name: &str) -> Option { + match (arity, name){ + $((SystemFnArity::$arity, $name) => Some($index),)* _ => None, } } @@ -106,43 +113,44 @@ pub mod fns { vm::{Exception, FnArgs, Vm}, }; - use super::System; + use super::{System, SystemFnArity}; impl System { def_fns! { - 0x00 "+" => add, - 0x01 "-" => sub, - 0x02 "*" => mul, - 0x03 "/" => div, + 0x00 Binary "+" => add, + 0x01 Binary "-" => sub, + 0x02 Binary "*" => mul, + 0x03 Binary "/" => div, + 0x04 Unary "-" => neg, - 0x40 "not" => not, - 0x41 "=" => eq, - 0x42 "<>" => neq, - 0x43 "<" => lt, - 0x44 "<=" => leq, - 0x45 ">" => gt, - 0x46 ">=" => geq, + 0x40 Unary "!" 
=> not, + 0x41 Binary "==" => eq, + 0x42 Binary "!=" => neq, + 0x43 Binary "<" => lt, + 0x44 Binary "<=" => leq, + 0x45 Binary ">" => gt, + 0x46 Binary ">=" => geq, - 0x80 "vec" => vec, - 0x81 ".x" => vec_x, - 0x82 ".y" => vec_y, - 0x83 ".z" => vec_z, - 0x84 ".w" => vec_w, + 0x80 Nary "vec" => vec, + 0x81 Nary "vecX" => vec_x, + 0x82 Nary "vecY" => vec_y, + 0x83 Nary "vecZ" => vec_z, + 0x84 Nary "vecW" => vec_w, - 0x85 "rgba" => rgba, - 0x86 ".r" => rgba_r, - 0x87 ".g" => rgba_g, - 0x88 ".b" => rgba_b, - 0x89 ".a" => rgba_a, + 0x85 Nary "rgba" => rgba, + 0x86 Nary "rgbaR" => rgba_r, + 0x87 Nary "rgbaG" => rgba_g, + 0x88 Nary "rgbaB" => rgba_b, + 0x89 Nary "rgbaA" => rgba_a, - 0x90 "list" => list, + 0x90 Nary "list" => list, - 0xc0 "to-shape" => to_shape_f, - 0xc1 "line" => line, - 0xc2 "rect" => rect, - 0xc3 "circle" => circle, - 0xe0 "stroke" => stroke, - 0xe1 "fill" => fill, + 0xc0 Nary "toShape" => to_shape_f, + 0xc1 Nary "line" => line, + 0xc2 Nary "rect" => rect, + 0xc3 Nary "circle" => circle, + 0xe0 Nary "stroke" => stroke, + 0xe1 Nary "fill" => fill, } } @@ -196,6 +204,11 @@ pub mod fns { Ok(Value::Number(result)) } + pub fn neg(vm: &mut Vm, args: FnArgs) -> Result { + let x = args.get_number(vm, 0, "`-` can only work with numbers")?; + Ok(Value::Number(-x)) + } + pub fn not(vm: &mut Vm, args: FnArgs) -> Result { if args.num() != 1 { return Err(vm.create_exception("(not) expects a single argument to negate")); diff --git a/crates/haku/src/token.rs b/crates/haku/src/token.rs new file mode 100644 index 0000000..d8d621d --- /dev/null +++ b/crates/haku/src/token.rs @@ -0,0 +1,143 @@ +use core::{error::Error, fmt::Display}; + +use alloc::vec::Vec; + +use crate::source::Span; + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum TokenKind { + Eof, + + Ident, + Tag, + Number, + Color, + + // Operators + Plus, + Minus, + Star, + Slash, + EqualEqual, + NotEqual, + Less, + LessEqual, + Greater, + GreaterEqual, + Not, + + // Punctuation + Newline, + LParen, + RParen, + LBrack, + RBrack, + Comma, + Equal, + Backslash, + RArrow, + + // Keywords + Underscore, + And, + Or, + If, + Else, + Let, + + // NOTE: This must be kept last for TokenSet to work correctly. 
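+    // (`TokenKindSet` below sizes its bit array from `TokenKind::Error as u32`, so any
+    // variant added after `Error` would index out of bounds of `bits`.)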
+ Error, +} + +#[derive(Debug, Clone)] +pub struct Lexis { + pub kinds: Vec, + pub spans: Vec, +} + +impl Lexis { + pub fn new(capacity: usize) -> Self { + assert!(capacity < u32::MAX as usize); + + Self { + kinds: Vec::with_capacity(capacity), + spans: Vec::with_capacity(capacity), + } + } + + pub fn len(&self) -> u32 { + self.kinds.len() as u32 + } + + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + pub fn push(&mut self, kind: TokenKind, span: Span) -> Result<(), TokenAllocError> { + if self.kinds.len() >= self.kinds.capacity() { + return Err(TokenAllocError); + } + + self.kinds.push(kind); + self.spans.push(span); + + Ok(()) + } + + pub fn kind(&self, position: u32) -> TokenKind { + self.kinds[position as usize] + } + + pub fn span(&self, position: u32) -> Span { + self.spans[position as usize] + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct TokenAllocError; + +impl Display for TokenAllocError { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + f.write_str("too many tokens") + } +} + +impl Error for TokenAllocError {} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct TokenKindSet { + bits: [u32; Self::WORDS], +} + +impl TokenKindSet { + const WORDS: usize = ((TokenKind::Error as u32 + u32::BITS - 1) / (u32::BITS)) as usize; + + const fn word(kind: TokenKind) -> usize { + (kind as u32 / u32::BITS) as usize + } + + const fn bit(kind: TokenKind) -> u32 { + 1 << (kind as u32 % u32::BITS) + } + + pub const fn new(elems: &[TokenKind]) -> Self { + let mut set = Self { + bits: [0; Self::WORDS], + }; + let mut i = 0; + while i < elems.len() { + set = set.include(elems[i]); + i += 1; + } + set + } + + pub const fn include(mut self, kind: TokenKind) -> Self { + self.bits[Self::word(kind)] |= Self::bit(kind); + self + } + + pub fn contains(&self, kind: TokenKind) -> bool { + self.bits[Self::word(kind)] & Self::bit(kind) != 0 + } +} diff --git a/crates/haku/src/value.rs b/crates/haku/src/value.rs index 97b8113..b169e81 100644 --- a/crates/haku/src/value.rs +++ b/crates/haku/src/value.rs @@ -1,6 +1,6 @@ use alloc::vec::Vec; -use crate::system::ChunkId; +use crate::{compiler::ClosureSpec, system::ChunkId}; // TODO: Probably needs some pretty hardcore space optimization. // Maybe when we have static typing. @@ -156,9 +156,25 @@ pub struct Closure { pub start: BytecodeLoc, pub name: FunctionName, pub param_count: u8, + pub local_count: u8, pub captures: Vec, } +impl Closure { + pub fn chunk(chunk_id: ChunkId, spec: ClosureSpec) -> Self { + Self { + start: BytecodeLoc { + chunk_id, + offset: 0, + }, + name: FunctionName::Anonymous, + param_count: 0, + local_count: spec.local_count, + captures: Vec::new(), + } + } +} + #[derive(Debug, Clone, PartialEq)] pub struct List { pub elements: Vec, diff --git a/crates/haku/src/vm.rs b/crates/haku/src/vm.rs index d9cb816..549988e 100644 --- a/crates/haku/src/vm.rs +++ b/crates/haku/src/vm.rs @@ -123,8 +123,9 @@ impl Vm { fn push(&mut self, value: Value) -> Result<(), Exception> { if self.stack.len() >= self.stack.capacity() { - // TODO: can this error message be made clearer? 
- return Err(self.create_exception("too many local variables")); + return Err(self.create_exception( + "too many temporary values (local variables and expression operands)", + )); } self.stack.push(value); Ok(()) @@ -136,6 +137,14 @@ impl Vm { }) } + fn get_mut(&mut self, index: usize) -> Result<&mut Value, Exception> { + if self.stack.get(index).is_some() { + Ok(&mut self.stack[index]) + } else { + Err(self.create_exception("corrupted bytecode (set local variable out of bounds)")) + } + } + fn pop(&mut self) -> Result { self.stack .pop() @@ -168,6 +177,11 @@ impl Vm { let mut bottom = self.stack.len(); let mut fuel = self.fuel; + let init_bottom = bottom; + for _ in 0..closure.local_count { + self.push(Value::Nil)?; + } + #[allow(unused)] let closure = (); // Do not use `closure` after this! Use `get_ref` on `closure_id` instead. @@ -200,6 +214,12 @@ impl Vm { self.push(value)?; } + Opcode::SetLocal => { + let index = chunk.read_u8(&mut pc)? as usize; + let new_value = self.pop()?; + *self.get_mut(index)? = new_value; + } + Opcode::Capture => { let index = chunk.read_u8(&mut pc)? as usize; let closure = self.get_ref(closure_id).as_closure().unwrap(); @@ -226,26 +246,14 @@ impl Vm { } } - Opcode::DropLet => { - let count = chunk.read_u8(&mut pc)? as usize; - if count != 0 { - let new_len = self.stack.len().checked_sub(count).ok_or_else(|| { - self.create_exception( - "corrupted bytecode (Drop tried to drop too many values off the stack)", - ) - })?; - let value = self.pop()?; - self.stack.resize_with(new_len, || unreachable!()); - self.push(value)?; - } - } - Opcode::Function => { let param_count = chunk.read_u8(&mut pc)?; let then = chunk.read_u16(&mut pc)? as usize; let body = pc; pc = then; + let local_count = chunk.read_u8(&mut pc)?; + let capture_count = chunk.read_u8(&mut pc)? as usize; let mut captures = Vec::with_capacity(capture_count); for _ in 0..capture_count { @@ -272,6 +280,7 @@ impl Vm { }, name: FunctionName::Anonymous, param_count, + local_count, captures, }))?; self.push(Value::Ref(id))?; @@ -327,6 +336,11 @@ impl Vm { ) })?; + // NOTE: Locals are only pushed _after_ we do any stack calculations. 
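+                    // This mirrors the `init_bottom` setup at the top of `run`: the
+                    // callee's local slots are reserved as `Nil` above its arguments,
+                    // so that `SetLocal` can later overwrite them by index.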
+ for _ in 0..closure.local_count { + self.push(Value::Nil)?; + } + self.push_call(frame)?; } @@ -381,10 +395,13 @@ impl Vm { } } - Ok(self + let result = self .stack .pop() - .expect("there should be a result at the top of the stack")) + .expect("there should be a result at the top of the stack"); + self.stack.resize_with(init_bottom, || unreachable!()); + + Ok(result) } fn store_context(&mut self, context: Context) { diff --git a/crates/haku/tests/language.rs b/crates/haku/tests/language.rs index 787c46f..71e8423 100644 --- a/crates/haku/tests/language.rs +++ b/crates/haku/tests/language.rs @@ -1,10 +1,14 @@ use std::error::Error; use haku::{ + ast::{dump::dump, Ast}, bytecode::{Chunk, Defs}, compiler::{compile_expr, Compiler, Source}, - sexp::{self, Ast, Parser, SourceCode}, + lexer::{lex, Lexer}, + parser::{self, Parser, ParserLimits}, + source::SourceCode, system::System, + token::Lexis, value::{BytecodeLoc, Closure, FunctionName, Ref, RefId, Value}, vm::{Vm, VmLimits}, }; @@ -12,11 +16,16 @@ use haku::{ fn eval(code: &str) -> Result> { let mut system = System::new(1); - let ast = Ast::new(1024); let code = SourceCode::unlimited_len(code); - let mut parser = Parser::new(ast, code); - let root = sexp::parse_toplevel(&mut parser); - let ast = parser.ast; + + let mut lexer = Lexer::new(Lexis::new(1024), code); + lex(&mut lexer)?; + + let mut ast = Ast::new(1024); + let mut parser = Parser::new(&lexer.lexis, &ParserLimits { max_events: 1024 }); + parser::toplevel(&mut parser); + let (root, mut parser_diagnostics) = parser.into_ast(&mut ast)?; + println!("{}", dump(&ast, root, Some(code))); let src = Source { code, ast: &ast, @@ -27,21 +36,29 @@ fn eval(code: &str) -> Result> { let mut chunk = Chunk::new(65536).unwrap(); let mut compiler = Compiler::new(&mut defs, &mut chunk); compile_expr(&mut compiler, &src, root)?; + let closure_spec = compiler.closure_spec(); let defs = compiler.defs; - for diagnostic in &compiler.diagnostics { + let mut diagnostics = lexer.diagnostics; + diagnostics.append(&mut parser_diagnostics); + diagnostics.append(&mut compiler.diagnostics); + + for diagnostic in &diagnostics { println!( - "{}..{}: {}", - diagnostic.span.start, diagnostic.span.end, diagnostic.message + "{}..{} {:?}: {}", + diagnostic.span().start, + diagnostic.span().end, + diagnostic.span().slice(code), + diagnostic.message() ); } - if !compiler.diagnostics.is_empty() { - panic!("compiler diagnostics were emitted") + if !diagnostics.is_empty() { + panic!("diagnostics were emitted") } let limits = VmLimits { - stack_capacity: 256, + stack_capacity: 1024, call_stack_capacity: 256, ref_capacity: 256, fuel: 32768, @@ -50,16 +67,9 @@ fn eval(code: &str) -> Result> { let mut vm = Vm::new(defs, &limits); let chunk_id = system.add_chunk(chunk)?; println!("bytecode: {:?}", system.chunk(chunk_id)); + println!("closure spec: {closure_spec:?}"); - let closure = vm.create_ref(Ref::Closure(Closure { - start: BytecodeLoc { - chunk_id, - offset: 0, - }, - name: FunctionName::Anonymous, - param_count: 0, - captures: Vec::new(), - }))?; + let closure = vm.create_ref(Ref::Closure(Closure::chunk(chunk_id, closure_spec)))?; let result = vm.run(&system, closure)?; println!("used fuel: {}", limits.fuel - vm.remaining_fuel()); @@ -87,49 +97,52 @@ fn literal_number() { #[test] fn literal_bool() { - assert_eq!(eval("false").unwrap(), Value::False); - assert_eq!(eval("true").unwrap(), Value::True); + assert_eq!(eval("False").unwrap(), Value::False); + assert_eq!(eval("True").unwrap(), Value::True); } #[test] fn 
function_nil() { - assert_eq!(eval("(fn () ())").unwrap(), Value::Ref(RefId::from_u32(1))); + assert_eq!( + eval(r#" \_ -> () "#).unwrap(), + Value::Ref(RefId::from_u32(1)) + ); } #[test] fn function_nil_call() { - assert_eq!(eval("((fn () ()))").unwrap(), Value::Nil); + assert_eq!(eval(r#"(\_ -> ()) ()"#).unwrap(), Value::Nil); } #[test] fn function_arithmetic() { - expect_number("((fn (x) (+ x 2)) 2)", 4.0, 0.0001); + expect_number(r#"(\x -> x + 2) 2"#, 4.0, 0.0001); } #[test] fn function_let() { - expect_number("((fn (add-two) (add-two 2)) (fn (x) (+ x 2)))", 4.0, 0.0001); + expect_number(r#"(\addTwo -> addTwo 2) \x -> x + 2"#, 4.0, 0.0001); } #[test] fn function_closure() { - expect_number("(((fn (x) (fn (y) (+ x y))) 2) 2)", 4.0, 0.0001); + expect_number(r#"((\x -> \y -> x + y) 2) 2"#, 4.0, 0.0001); } #[test] fn if_literal() { - expect_number("(if 1 1 2)", 1.0, 0.0001); - expect_number("(if () 1 2)", 2.0, 0.0001); - expect_number("(if false 1 2)", 2.0, 0.0001); - expect_number("(if true 1 2)", 1.0, 0.0001); + expect_number("if (1) 1 else 2", 1.0, 0.0001); + expect_number("if (()) 1 else 2", 2.0, 0.0001); + expect_number("if (False) 1 else 2", 2.0, 0.0001); + expect_number("if (True) 1 else 2", 1.0, 0.0001); } #[test] fn def_simple() { let code = r#" - (def x 1) - (def y 2) - (+ x y) + x = 1 + y = 2 + x + y "#; expect_number(code, 3.0, 0.0001); } @@ -137,13 +150,13 @@ fn def_simple() { #[test] fn def_fib_recursive() { let code = r#" - (def fib - (fn (n) - (if (< n 2) - n - (+ (fib (- n 1)) (fib (- n 2)))))) - - (fib 10) + fib = \n -> + if (n < 2) + n + else + fib (n - 1) + fib (n - 2) + + fib 10 "#; expect_number(code, 55.0, 0.0001); } @@ -151,27 +164,30 @@ fn def_fib_recursive() { #[test] fn def_mutually_recursive() { let code = r#" - (def f - (fn (x) - (if (< x 10) - (g (+ x 1)) - x))) + f = \x -> + if (x < 10) + g (x + 1) + else + x - (def g - (fn (x) - (if (< x 10) - (f (* x 2)) - x))) + g = \x -> + if (x < 10) + f (x * 2) + else + x - (f 0) + f 0 "#; expect_number(code, 14.0, 0.0001); } #[test] fn def_botsbuildbots() { - let result = eval("(def botsbuildbots (fn () (botsbuildbots))) (botsbuildbots)"); - if let Err(error) = result { + let code = r#" + botsbuildbots = \_ -> botsbuildbots () + botsbuildbots () + "#; + if let Err(error) = eval(code) { assert_eq!( error.to_string(), "Exception {\n message: \"too much recursion\",\n}" @@ -184,8 +200,8 @@ fn def_botsbuildbots() { #[test] fn let_single() { let code = r#" - (let ((x 1)) - (+ x 1)) + let x = 1 + x + 1 "#; expect_number(code, 2.0, 0.0001); } @@ -193,9 +209,9 @@ fn let_single() { #[test] fn let_many() { let code = r#" - (let ((x 1) - (y 2)) - (+ x y)) + let x = 1 + let y = 2 + x + y "#; expect_number(code, 3.0, 0.0001); } @@ -203,9 +219,9 @@ fn let_many() { #[test] fn let_sequence() { let code = r#" - (let ((x 1) - (y (+ x 1))) - (+ x y)) + let x = 1 + let y = x + 1 + x + y "#; expect_number(code, 3.0, 0.0001); } @@ -213,59 +229,40 @@ fn let_sequence() { #[test] fn let_subexpr() { let code = r#" - (+ - (let ((x 1) - (y 2)) - (* x y))) + (let x = 1 + let y = 2 + x * y) + 2 "#; - expect_number(code, 2.0, 0.0001); + expect_number(code, 4.0, 0.0001); } #[test] -fn let_empty() { +fn let_subexpr_two() { let code = r#" - (let () 1) - "#; - expect_number(code, 1.0, 0.0001); -} - -#[test] -fn let_subexpr_empty() { - let code = r#" - (+ (let () 1) (let () 1)) - "#; - expect_number(code, 2.0, 0.0001); -} - -#[test] -fn let_subexpr_many() { - let code = r#" - (+ - (let ((x 1) - (y 2)) - (* x y)) - (let () 1) - (let ((x 1)) x)) + (let x = 
1 + 2) + + (let x = 1 + x) "#; expect_number(code, 3.0, 0.0001); } #[test] -fn system_arithmetic() { - expect_number("(+ 1 2 3 4)", 10.0, 0.0001); - expect_number("(+ (* 2 1) 1 (/ 6 2) (- 10 3))", 13.0, 0.0001); +fn let_subexpr_many() { + let code = r#" + (let x = 1 + let y = 2 + x * y) + + (let x = 1 + 2) + + (let x = 1 + x) + "#; + expect_number(code, 5.0, 0.0001); } #[test] -fn practical_fib_recursive() { - let code = r#" - ((fn (fib) - (fib fib 10)) - - (fn (fib n) - (if (< n 2) - n - (+ (fib fib (- n 1)) (fib fib (- n 2)))))) - "#; - expect_number(code, 55.0, 0.0001); +fn system_arithmetic() { + expect_number("1 + 2 + 3 + 4", 10.0, 0.0001); + expect_number("(2 * 1) + 1 + (6 / 2) + (10 - 3)", 13.0, 0.0001); } diff --git a/crates/rkgk/src/haku.rs b/crates/rkgk/src/haku.rs index 81b88a9..58a0e90 100644 --- a/crates/rkgk/src/haku.rs +++ b/crates/rkgk/src/haku.rs @@ -5,11 +5,15 @@ use eyre::{bail, Context, OptionExt}; use haku::{ + ast::Ast, bytecode::{Chunk, Defs, DefsImage}, compiler::{Compiler, Source}, + lexer::{lex, Lexer}, + parser::{self, Parser, ParserLimits}, render::{tiny_skia::Pixmap, Renderer, RendererLimits}, - sexp::{Ast, Parser, SourceCode}, + source::SourceCode, system::{ChunkId, System, SystemImage}, + token::Lexis, value::{BytecodeLoc, Closure, FunctionName, Ref, Value}, vm::{Vm, VmImage, VmLimits}, }; @@ -22,9 +26,11 @@ use crate::schema::Vec2; // because we do some dynamic typing magic over on the JavaScript side to automatically call all // the appropriate functions for setting these limits on the client side. pub struct Limits { - pub max_source_code_len: usize, + pub max_source_code_len: u32, pub max_chunks: usize, pub max_defs: usize, + pub max_tokens: usize, + pub max_parser_events: usize, pub ast_capacity: usize, pub chunk_capacity: usize, pub stack_capacity: usize, @@ -88,12 +94,21 @@ impl Haku { pub fn set_brush(&mut self, code: &str) -> eyre::Result<()> { self.reset(); - let ast = Ast::new(self.limits.ast_capacity); let code = SourceCode::limited_len(code, self.limits.max_source_code_len) .ok_or_eyre("source code is too long")?; - let mut parser = Parser::new(ast, code); - let root = haku::sexp::parse_toplevel(&mut parser); - let ast = parser.ast; + + let mut lexer = Lexer::new(Lexis::new(self.limits.max_tokens), code); + lex(&mut lexer)?; + + let mut parser = Parser::new( + &lexer.lexis, + &ParserLimits { + max_events: self.limits.max_parser_events, + }, + ); + parser::toplevel(&mut parser); + let mut ast = Ast::new(self.limits.ast_capacity); + let (root, parser_diagnostics) = parser.into_ast(&mut ast)?; let src = Source { code, @@ -107,7 +122,10 @@ impl Haku { haku::compiler::compile_expr(&mut compiler, &src, root) .context("failed to compile the chunk")?; - if !compiler.diagnostics.is_empty() { + if !lexer.diagnostics.is_empty() + || !parser_diagnostics.is_empty() + || !compiler.diagnostics.is_empty() + { bail!("diagnostics were emitted"); } diff --git a/rkgk.toml b/rkgk.toml index 8a0be66..ef0107d 100644 --- a/rkgk.toml +++ b/rkgk.toml @@ -61,6 +61,12 @@ max_chunks = 2 # Maximum amount of defs across all source code chunks. max_defs = 256 +# Maximum amount of tokens a single chunk can have. +max_tokens = 4096 + +# Maximum amount of events that the parser may emit in a single chunk. +max_parser_events = 4096 + # Maximum amount of AST nodes in a single parse. ast_capacity = 4096
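
To tie these settings back to the code, a minimal sketch of how the new front end consumes them, mirroring the `set_brush` wiring above. `parse_with_limits` is a hypothetical helper, the `65536` source-length cap is an arbitrary example value, and the error handling is simplified; only `haku` APIs introduced in this diff are used.

use haku::{
    ast::Ast,
    lexer::{lex, Lexer},
    parser::{self, Parser, ParserLimits},
    source::SourceCode,
    token::Lexis,
};

fn parse_with_limits(code: &str) -> Result<(), &'static str> {
    // max_source_code_len: reject oversized input before lexing.
    let code = SourceCode::limited_len(code, 65536).ok_or("source code too long")?;

    // max_tokens: the token arena is allocated up front, and lexing
    // fails once it is full instead of growing unboundedly.
    let mut lexer = Lexer::new(Lexis::new(4096), code);
    lex(&mut lexer).map_err(|_| "too many tokens")?;

    // max_parser_events: bounds the parser's internal event buffer.
    let mut parser = Parser::new(&lexer.lexis, &ParserLimits { max_events: 4096 });
    parser::toplevel(&mut parser);

    // ast_capacity: bounds the tree the parse events are materialized into.
    let mut ast = Ast::new(4096);
    let (_root, _parser_diagnostics) = parser
        .into_ast(&mut ast)
        .map_err(|_| "too many AST nodes")?;

    Ok(())
}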