add SourceCode wrapping str for enforcing source code length limits at parsing boundaries
I'm actually quite happy with this API design: a little zero-cost wrapper that makes you _think_ when you need to think.
parent 3913254215
commit 6c88a041ea
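For context, every parsing boundary in the diff below now follows the same pattern: check the length once when crossing from untrusted input into the parser, or opt out explicitly for trusted input. A rough sketch against the new API (the helper functions and the `1024` AST capacity here are illustrative, not code from this commit):

```rust
use haku::sexp::{parse_toplevel, Ast, NodeId, Parser, SourceCode};

// Untrusted input (e.g. a brush coming in over the wire): enforce the configured
// limit up front, so an oversized string never reaches the parser.
fn parse_untrusted(code: &str, max_len: usize) -> Option<(Ast, NodeId)> {
    let code = SourceCode::limited_len(code, max_len)?; // None => too long
    let mut parser = Parser::new(Ast::new(1024), code);
    let root = parse_toplevel(&mut parser);
    Some((parser.ast, root))
}

// Trusted input (tests, built-in chunks): opting out of the limit is explicit,
// which is the "makes you think" part of the API.
fn parse_trusted(code: &str) -> (Ast, NodeId) {
    let code = SourceCode::unlimited_len(code);
    let mut parser = Parser::new(Ast::new(1024), code);
    let root = parse_toplevel(&mut parser);
    (parser.ast, root)
}
```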
@@ -6,7 +6,7 @@ use std::{error::Error, fmt::Display, io::BufRead};
 use haku::{
     bytecode::{Chunk, Defs},
     compiler::{compile_expr, Compiler, Source},
-    sexp::{parse_toplevel, Ast, Parser},
+    sexp::{parse_toplevel, Ast, Parser, SourceCode},
     system::System,
     value::{BytecodeLoc, Closure, FunctionName, Ref, Value},
     vm::{Vm, VmLimits},
@@ -16,6 +16,7 @@ fn eval(code: &str) -> Result<Value, Box<dyn Error>> {
     let mut system = System::new(1);
 
     let ast = Ast::new(1024);
+    let code = SourceCode::unlimited_len(code);
     let mut parser = Parser::new(ast, code);
     let root = parse_toplevel(&mut parser);
     let ast = parser.ast;
@@ -12,7 +12,7 @@ use haku::{
         tiny_skia::{Pixmap, PremultipliedColorU8},
         Renderer, RendererLimits,
     },
-    sexp::{parse_toplevel, Ast, Parser},
+    sexp::{parse_toplevel, Ast, Parser, SourceCode},
     system::{ChunkId, System, SystemImage},
     value::{BytecodeLoc, Closure, FunctionName, Ref, Value},
     vm::{Exception, Vm, VmImage, VmLimits},
@@ -37,6 +37,7 @@ unsafe extern "C" fn haku_free(ptr: *mut u8, size: usize, align: usize) {
 
 #[derive(Debug, Clone, Copy)]
 struct Limits {
+    max_source_code_len: usize,
     max_chunks: usize,
     max_defs: usize,
     ast_capacity: usize,
@@ -53,6 +54,7 @@ struct Limits {
 impl Default for Limits {
     fn default() -> Self {
         Self {
+            max_source_code_len: 65536,
             max_chunks: 2,
             max_defs: 256,
             ast_capacity: 1024,
@@ -92,6 +94,7 @@ macro_rules! limit_setter {
     };
 }
 
+limit_setter!(max_source_code_len);
 limit_setter!(max_chunks);
 limit_setter!(max_defs);
 limit_setter!(ast_capacity);
@@ -193,6 +196,7 @@ unsafe extern "C" fn haku_exception_message_len(instance: *const Instance) -> u3
 #[repr(C)]
 enum StatusCode {
     Ok,
+    SourceCodeTooLong,
     ChunkTooBig,
     DiagnosticsEmitted,
     TooManyChunks,
@@ -223,6 +227,7 @@ extern "C" fn haku_is_exception(code: StatusCode) -> bool {
 extern "C" fn haku_status_string(code: StatusCode) -> *const i8 {
     match code {
         StatusCode::Ok => c"ok",
+        StatusCode::SourceCodeTooLong => c"source code is too long",
         StatusCode::ChunkTooBig => c"compiled bytecode is too large",
         StatusCode::DiagnosticsEmitted => c"diagnostics were emitted",
         StatusCode::TooManyChunks => c"too many registered bytecode chunks",
@@ -297,6 +302,10 @@ unsafe extern "C" fn haku_compile_brush(
 
     let code = core::str::from_utf8(slice::from_raw_parts(code, code_len as usize))
         .expect("invalid UTF-8");
+    let code = match SourceCode::limited_len(code, instance.limits.max_source_code_len) {
+        Some(code) => code,
+        None => return StatusCode::SourceCodeTooLong,
+    };
 
     let ast = Ast::new(instance.limits.ast_capacity);
     let mut parser = Parser::new(ast, code);
@@ -7,12 +7,12 @@ use alloc::vec::Vec;
 
 use crate::{
     bytecode::{Chunk, DefError, Defs, EmitError, Opcode, CAPTURE_CAPTURE, CAPTURE_LOCAL},
-    sexp::{Ast, NodeId, NodeKind, Span},
+    sexp::{Ast, NodeId, NodeKind, SourceCode, Span},
     system::System,
 };
 
 pub struct Source<'a> {
-    pub code: &'a str,
+    pub code: &'a SourceCode,
     pub ast: &'a Ast,
     pub system: &'a System,
 }
@@ -1,4 +1,4 @@
-use core::{cell::Cell, fmt};
+use core::{cell::Cell, fmt, ops::Deref};
 
 use alloc::vec::Vec;
 
@@ -13,8 +13,40 @@ impl Span {
         Self { start, end }
     }
 
-    pub fn slice<'a>(&self, source: &'a str) -> &'a str {
-        &source[self.start..self.end]
+    pub fn slice<'a>(&self, source: &'a SourceCode) -> &'a str {
+        &source.code[self.start..self.end]
     }
 }
 
+/// Source code string with a verified size limit.
+/// An exact size limit is not enforced by this type - it only ensures the string isn't longer than
+/// intended, to not stall the parser for an unexpected amount of time.
+#[derive(Debug, PartialEq, Eq)]
+#[repr(transparent)]
+pub struct SourceCode {
+    code: str,
+}
+
+impl SourceCode {
+    pub fn limited_len(code: &str, max_len: usize) -> Option<&Self> {
+        if code.len() <= max_len {
+            Some(Self::unlimited_len(code))
+        } else {
+            None
+        }
+    }
+
+    pub fn unlimited_len(code: &str) -> &Self {
+        // SAFETY: SourceCode is a transparent wrapper around str, so converting between them is safe.
+        unsafe { core::mem::transmute(code) }
+    }
+}
+
+impl Deref for SourceCode {
+    type Target = str;
+
+    fn deref(&self) -> &Self::Target {
+        &self.code
+    }
+}
+
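Aside, not part of the diff: `#[repr(transparent)]` is what makes the transmute from `&str` to `&SourceCode` in `unlimited_len` sound, and the `Deref` impl lets downstream code keep calling plain `str` methods, so the length check really happens only once, at construction. An illustrative use (assuming a crate that depends on `haku`):

```rust
use haku::sexp::SourceCode;

fn main() {
    // The check happens once, when the wrapper is constructed.
    let src = SourceCode::limited_len("(vec 1 2)", 64).expect("9 bytes fits the limit");
    assert_eq!(src.len(), 9); // str::len, through Deref<Target = str>

    // Anything over the limit is rejected before the parser ever sees it.
    assert!(SourceCode::limited_len(&"x".repeat(65_537), 65_536).is_none());
}
```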
@@ -94,7 +126,7 @@ impl Ast {
 
     pub fn write(
         &self,
-        source: &str,
+        source: &SourceCode,
         node_id: NodeId,
         w: &mut dyn fmt::Write,
         mode: AstWriteMode,
@@ -102,7 +134,7 @@ impl Ast {
         #[allow(clippy::too_many_arguments)]
         fn write_list(
             ast: &Ast,
-            source: &str,
+            source: &SourceCode,
             w: &mut dyn fmt::Write,
             mode: AstWriteMode,
             mut head: NodeId,
@@ -131,7 +163,7 @@ impl Ast {
         // NOTE: Separated out to a separate function in case we ever want to introduce auto-indentation.
         fn write_rec(
             ast: &Ast,
-            source: &str,
+            source: &SourceCode,
             w: &mut dyn fmt::Write,
             mode: AstWriteMode,
             node_id: NodeId,
@@ -177,7 +209,7 @@ pub struct NodeAllocError;
 
 pub struct Parser<'a> {
     pub ast: Ast,
-    input: &'a str,
+    input: &'a SourceCode,
     position: usize,
     fuel: Cell<usize>,
     alloc_error: NodeId,
@@ -186,7 +218,7 @@ pub struct Parser<'a> {
 impl<'a> Parser<'a> {
     const FUEL: usize = 256;
 
-    pub fn new(mut ast: Ast, input: &'a str) -> Self {
+    pub fn new(mut ast: Ast, input: &'a SourceCode) -> Self {
         let alloc_error = ast
             .alloc(Node {
                 span: Span::new(0, 0),
@@ -412,12 +444,13 @@ mod tests {
         expected: &str,
     ) -> Result<(), Box<dyn Error>> {
         let ast = Ast::new(16);
-        let mut p = Parser::new(ast, source);
+        let code = SourceCode::unlimited_len(source);
+        let mut p = Parser::new(ast, code);
         let node = f(&mut p);
         let ast = p.ast;
 
         let mut s = String::new();
-        ast.write(source, node, &mut s, AstWriteMode::Spans)?;
+        ast.write(code, node, &mut s, AstWriteMode::Spans)?;
 
         assert_eq!(s, expected);
 
@@ -3,7 +3,7 @@ use std::error::Error;
 use haku::{
     bytecode::{Chunk, Defs},
     compiler::{compile_expr, Compiler, Source},
-    sexp::{self, Ast, Parser},
+    sexp::{self, Ast, Parser, SourceCode},
     system::System,
     value::{BytecodeLoc, Closure, FunctionName, Ref, RefId, Value},
     vm::{Vm, VmLimits},
@@ -13,6 +13,7 @@ fn eval(code: &str) -> Result<Value, Box<dyn Error>> {
     let mut system = System::new(1);
 
     let ast = Ast::new(1024);
+    let code = SourceCode::unlimited_len(code);
     let mut parser = Parser::new(ast, code);
     let root = sexp::parse_toplevel(&mut parser);
     let ast = parser.ast;
@@ -8,7 +8,7 @@ use haku::{
     bytecode::{Chunk, Defs, DefsImage},
     compiler::{Compiler, Source},
     render::{tiny_skia::Pixmap, Renderer, RendererLimits},
-    sexp::{Ast, Parser},
+    sexp::{Ast, Parser, SourceCode},
     system::{ChunkId, System, SystemImage},
     value::{BytecodeLoc, Closure, FunctionName, Ref, Value},
     vm::{Vm, VmImage, VmLimits},
@@ -22,6 +22,7 @@ use crate::schema::Vec2;
 // because we do some dynamic typing magic over on the JavaScript side to automatically call all
 // the appropriate functions for setting these limits on the client side.
 pub struct Limits {
+    pub max_source_code_len: usize,
     pub max_chunks: usize,
     pub max_defs: usize,
     pub ast_capacity: usize,
@@ -88,6 +89,8 @@ impl Haku {
         self.reset();
 
         let ast = Ast::new(self.limits.ast_capacity);
+        let code = SourceCode::limited_len(code, self.limits.max_source_code_len)
+            .ok_or_eyre("source code is too long")?;
         let mut parser = Parser::new(ast, code);
         let root = haku::sexp::parse_toplevel(&mut parser);
         let ast = parser.ast;
@@ -33,6 +33,9 @@ interval_seconds = 10
 # Technically clients may override these settings with some hackery, but then the server may not
 # register changes they make to the canvas.
 
+# Maximum length of source code.
+max_source_code_len = 65536
+
 # Maximum amount of source code chunks.
 # This should be at least 2, to allow for loading in a standard library chunk.
 max_chunks = 2