static syntax highlighting WIP

This commit is contained in:
liquidex 2024-03-10 23:23:50 +01:00
parent 7fd2d18b69
commit 5ab8ffdba2
12 changed files with 523 additions and 63 deletions

13
Cargo.lock generated
View file

@ -1160,9 +1160,9 @@ dependencies = [
[[package]] [[package]]
name = "regex" name = "regex"
version = "1.9.3" version = "1.10.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "81bc1d4caf89fac26a70747fe603c130093b53c773888797a6329091246d651a" checksum = "b62dbe01f0b06f9d8dc7d49e05a0785f153b00b2c227856282f671e0318c9b15"
dependencies = [ dependencies = [
"aho-corasick", "aho-corasick",
"memchr", "memchr",
@ -1172,9 +1172,9 @@ dependencies = [
[[package]] [[package]]
name = "regex-automata" name = "regex-automata"
version = "0.3.6" version = "0.4.6"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fed1ceff11a1dddaee50c9dc8e4938bd106e9d89ae372f192311e7da498e3b69" checksum = "86b83b8b9847f9bf95ef68afb0b8e6cdb80f498442f5179a29fad448fcc1eaea"
dependencies = [ dependencies = [
"aho-corasick", "aho-corasick",
"memchr", "memchr",
@ -1183,9 +1183,9 @@ dependencies = [
[[package]] [[package]]
name = "regex-syntax" name = "regex-syntax"
version = "0.7.4" version = "0.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e5ea92a5b6195c6ef2a0295ea818b312502c6fc94dde986c5553242e18fd4ce2" checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f"
[[package]] [[package]]
name = "rustc-demangle" name = "rustc-demangle"
@ -1580,6 +1580,7 @@ dependencies = [
"log", "log",
"pulldown-cmark", "pulldown-cmark",
"rand", "rand",
"regex",
"serde", "serde",
"serde_json", "serde_json",
"tokio", "tokio",

View file

@ -0,0 +1,36 @@
%% title = "syntax highlighting gallery"
- this is a page demonstrating syntaxes supported by the treehouse
- really there's not much more to it, but I use it for debugging + with it you can get a general feel for how I highlight things in the treehouse
- `javascript`
```javascript
// t is an existing tile index; variable name is short for brevity
export function removeRedundancies(t) {
if (isSet(t, SE) && (!isSet(t, S) || !isSet(t, E))) {
t &= ~SE;
}
if (isSet(t, SW) && (!isSet(t, S) || !isSet(t, W))) {
t &= ~SW;
}
if (isSet(t, NW) && (!isSet(t, N) || !isSet(t, W))) {
t &= ~NW;
}
if (isSet(t, NE) && (!isSet(t, N) || !isSet(t, E))) {
t &= ~NE;
}
return t;
}
/* This is
a multiline comment. */
ident Class CONSTANT funciton()
0b1010 0o01234567 0x0123456789ABCDEF
01234567
1.41e-3
'string' /**/ "string" /**/ `string`
+ - * / == != <= >= ! ~ || && . ? :
, ;
```

View file

@ -29,3 +29,4 @@ ulid = "1.0.0"
url = "2.5.0" url = "2.5.0"
base64 = "0.21.7" base64 = "0.21.7"
chrono = "0.4.35" chrono = "0.4.35"
regex = "1.10.3"

View file

@ -361,6 +361,7 @@ pub fn generate(paths: &Paths<'_>) -> anyhow::Result<(Config, Treehouse)> {
config.site = std::env::var("TREEHOUSE_SITE").unwrap_or(config.site); config.site = std::env::var("TREEHOUSE_SITE").unwrap_or(config.site);
config.autopopulate_emoji(&paths.static_dir.join("emoji"))?; config.autopopulate_emoji(&paths.static_dir.join("emoji"))?;
config.autopopulate_pics(&paths.static_dir.join("pic"))?; config.autopopulate_pics(&paths.static_dir.join("pic"))?;
config.load_syntaxes(&paths.static_dir.join("syntax"))?;
info!("cleaning target directory"); info!("cleaning target directory");
let _ = std::fs::remove_dir_all(paths.target_dir); let _ = std::fs::remove_dir_all(paths.target_dir);

View file

@ -1,9 +1,15 @@
use std::{collections::HashMap, ffi::OsStr, fs::File, io::BufReader, path::Path}; use std::{collections::HashMap, ffi::OsStr, fs::File, io::BufReader, path::Path};
use anyhow::Context; use anyhow::Context;
use log::debug;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use walkdir::WalkDir; use walkdir::WalkDir;
use crate::html::highlight::{
compiled::{compile_syntax, CompiledSyntax},
Syntax,
};
#[derive(Debug, Clone, Deserialize, Serialize)] #[derive(Debug, Clone, Deserialize, Serialize)]
pub struct Config { pub struct Config {
/// Website root; used when generating links. /// Website root; used when generating links.
@ -48,6 +54,13 @@ pub struct Config {
/// On top of this, pics are autodiscovered by walking the `static/pic` directory. /// On top of this, pics are autodiscovered by walking the `static/pic` directory.
/// Only the part before the first dash is treated as the pic's id. /// Only the part before the first dash is treated as the pic's id.
pub pics: HashMap<String, String>, pub pics: HashMap<String, String>,
/// Syntax definitions.
///
/// These are not part of the config file, but are loaded as part of site configuration from
/// `static/syntax`.
#[serde(skip)]
pub syntaxes: HashMap<String, CompiledSyntax>,
} }
#[derive(Debug, Clone, Deserialize, Serialize)] #[derive(Debug, Clone, Deserialize, Serialize)]
@ -138,6 +151,30 @@ impl Config {
self.pics.get(id).map(|x| &**x).unwrap_or("404.png") self.pics.get(id).map(|x| &**x).unwrap_or("404.png")
) )
} }
/// Loads all syntax definition files.
pub fn load_syntaxes(&mut self, dir: &Path) -> anyhow::Result<()> {
for entry in WalkDir::new(dir) {
let entry = entry?;
if entry.path().extension() == Some(OsStr::new("json")) {
let name = entry
.path()
.file_stem()
.expect("syntax file name should have a stem")
.to_string_lossy();
debug!("loading syntax {name:?}");
let syntax: Syntax = serde_json::from_reader(BufReader::new(
File::open(entry.path()).context("could not open syntax file")?,
))
.context("could not deserialize syntax file")?;
let compiled = compile_syntax(&syntax);
self.syntaxes.insert(name.into_owned(), compiled);
}
}
Ok(())
}
} }
/// Data derived from the config. /// Data derived from the config.

View file

@ -1,6 +1,7 @@
use std::fmt::{self, Display, Write}; use std::fmt::{self, Display, Write};
pub mod breadcrumbs; pub mod breadcrumbs;
pub mod highlight;
mod markdown; mod markdown;
pub mod navmap; pub mod navmap;
pub mod tree; pub mod tree;

View file

@ -0,0 +1,94 @@
//! Tokenizer and syntax highlighter inspired by the one found in rxi's lite.
//! I highly recommend checking it out!
//! <https://github.com/rxi/lite/blob/master/data/core/tokenizer.lua>
//! There's also a mirror of it in JavaScript, used to power dynamically editable code blocks.
//!
//! Both of these syntax highlighters use the same JSON syntax definitions; however this one is
//! more limited, in that patterns do not support backtracking.
//! This is effectively enforced in the dynamic highlighter because this highlighter reports any
//! regex syntax errors upon site compilation.
pub mod compiled;
pub mod tokenize;
use std::{collections::HashMap, io};
use pulldown_cmark::escape::{escape_html, StrWrite};
use serde::{Deserialize, Serialize};
use self::compiled::CompiledSyntax;
/// Syntax definition.
///
/// Deserialized from the JSON files under `static/syntax`; the same files also drive the
/// JavaScript highlighter (see the module docs.)
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct Syntax {
    /// Patterns, matched sequentially (patterns at the beginning of the list take precedence.)
    pub patterns: Vec<Pattern>,
    /// Map of replacements to use if a pattern matches a string exactly.
    pub keywords: HashMap<String, Keyword>,
}
/// A pattern in a syntax definition.
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct Pattern {
    /// Regular expression to match.
    ///
    /// During compilation the regex is anchored to the current tokenization position (a `^` is
    /// prepended; a user-written leading `^` is stripped first so it does not break compilation.)
    pub regex: String,
    /// Flags to pass to the regex engine to alter how strings are matched.
    #[serde(default)]
    pub flags: Vec<RegexFlag>,
    /// Type to assign to the token. This can be any string, but only a select few have colors
    /// assigned.
    pub is: TokenTypes,
}
/// Assignable token types.
///
/// Because of `#[serde(untagged)]`, this deserializes either from a plain string (full-match
/// form) or from an object with `default` and `captures` fields (per-capture form.)
#[derive(Debug, Clone, Deserialize, Serialize)]
#[serde(untagged)]
pub enum TokenTypes {
    /// Assign a single token type to the entire match.
    FullMatch(String),
    /// Assign individual token types to each capture.
    Captures(CaptureTokenTypes),
}
/// Per-capture token type assignment, used by the [`TokenTypes::Captures`] form.
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct CaptureTokenTypes {
    /// Token type to use outside captures.
    pub default: String,
    /// Token type to use inside captures.
    pub captures: Vec<String>,
}
/// Flag passed to the regex engine.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Deserialize, Serialize)]
#[serde(rename_all = "camelCase")]
pub enum RegexFlag {
    /// Make `.` match line separators.
    /// (Spelled `dotMatchesNewline` in the JSON files, due to `rename_all = "camelCase"`.)
    DotMatchesNewline,
}
/// Keyword replacement.
///
/// After tokenization, any token whose text is an exact key of [`Syntax::keywords`] gets its
/// token type replaced according to this.
#[derive(Debug, Clone, Deserialize, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct Keyword {
    /// What to replace the token type with.
    pub into: String,
    /// Only replace the token type if it matches this one. If this is not present, any token type
    /// is replaced.
    /// (Spelled `onlyReplaces` in the JSON files, due to `rename_all = "camelCase"`.)
    pub only_replaces: Option<String>,
}
/// Highlights `code` with the given compiled syntax, writing HTML to `w`.
///
/// Each token is emitted as `<span class="NAME">TEXT</span>`, with both the token name and the
/// token's source text HTML-escaped.
pub fn highlight(mut w: impl StrWrite, syntax: &CompiledSyntax, code: &str) -> io::Result<()> {
    for token in syntax.tokenize(code) {
        let class = &syntax.token_names[token.id];
        w.write_str("<span class=\"")?;
        escape_html(&mut w, class)?;
        w.write_str("\">")?;
        escape_html(&mut w, &code[token.range])?;
        w.write_str("</span>")?;
    }
    Ok(())
}

View file

@ -0,0 +1,118 @@
use std::collections::HashMap;
use log::error;
use regex::{Regex, RegexBuilder};
use super::{RegexFlag, Syntax, TokenTypes};
/// During compilation, token names are converted to numeric IDs for performance.
pub type TokenId = usize;
/// ID of the built-in "default" token type; always index 0 of `CompiledSyntax::token_names`.
pub const TOKEN_ID_DEFAULT: TokenId = 0;
/// A [`Syntax`] compiled into its executable form by [`compile_syntax`].
#[derive(Debug, Clone)]
pub struct CompiledSyntax {
    /// Lookup table which maps numeric IDs to token names.
    pub token_names: Vec<String>,
    /// Compiled patterns, tried in declaration order during tokenization.
    pub patterns: Vec<CompiledPattern>,
    /// Keyword replacements, keyed by the exact token text they apply to.
    pub keywords: HashMap<String, CompiledKeyword>,
}
/// Compiled version of [`TokenTypes`], with token names resolved to numeric IDs.
#[derive(Debug, Clone)]
pub enum CompiledTokenTypes {
    /// Assign a single token type to the entire match.
    FullMatch(TokenId),
    /// Assign individual token types to each capture.
    Captures(CompiledCaptureTokenTypes),
}
/// Compiled version of `CaptureTokenTypes`, with token names resolved to numeric IDs.
#[derive(Debug, Clone)]
pub struct CompiledCaptureTokenTypes {
    /// Token type to use outside captures.
    pub default: TokenId,
    /// Token type to use inside captures.
    pub captures: Vec<TokenId>,
}
/// A single compiled pattern: the regex to try, plus the token types it assigns on a match.
#[derive(Debug, Clone)]
pub struct CompiledPattern {
    /// The compiled regex, anchored with `^` by [`compile_syntax`].
    pub regex: Regex,
    /// Token type(s) assigned when the regex matches.
    pub is: CompiledTokenTypes,
}
/// Compiled version of `Keyword`, with token names resolved to numeric IDs.
#[derive(Debug, Clone)]
pub struct CompiledKeyword {
    /// Token ID to replace the matching token's type with.
    pub into: TokenId,
    /// If present, only replace tokens whose current type ID equals this one.
    pub only_replaces: Option<TokenId>,
}
pub fn compile_syntax(syntax: &Syntax) -> CompiledSyntax {
let mut token_names = vec!["default".into()];
let mut get_token_id = |name: &str| -> TokenId {
if let Some(id) = token_names.iter().position(|n| n == name) {
id
} else {
let id = token_names.len();
token_names.push(name.to_owned());
id
}
};
let patterns = syntax
.patterns
.iter()
.filter_map(|pattern| {
// NOTE: `regex` has no support for sticky flags, so we need to anchor the match to the
// start ourselves.
let regex = RegexBuilder::new(&format!(
"^{}",
// If there's an existing `^`, it should not cause compilation errors for the user.
pattern.regex.strip_prefix('^').unwrap_or(&pattern.regex)
))
.dot_matches_new_line(pattern.flags.contains(&RegexFlag::DotMatchesNewline))
.build()
.map_err(|e| {
// NOTE: This could probably use better diagnostics, but it's pretty much
// impossible to get a source span out of serde's output (because it forgoes
// source information, rightfully so.) Therefore we have to settle on
// a poor man's error log.
error!("regex compilation error in pattern {pattern:?}: {e}");
})
.ok()?;
Some(CompiledPattern {
regex,
is: match &pattern.is {
TokenTypes::FullMatch(name) => {
CompiledTokenTypes::FullMatch(get_token_id(name))
}
TokenTypes::Captures(types) => {
CompiledTokenTypes::Captures(CompiledCaptureTokenTypes {
default: get_token_id(&types.default),
captures: types
.captures
.iter()
.map(|name| get_token_id(name))
.collect(),
})
}
},
})
})
.collect();
let keywords = syntax
.keywords
.iter()
.map(|(text, keyword)| {
(
text.clone(),
CompiledKeyword {
into: get_token_id(&keyword.into),
only_replaces: keyword.only_replaces.as_deref().map(&mut get_token_id),
},
)
})
.collect();
CompiledSyntax {
token_names,
patterns,
keywords,
}
}

View file

@ -0,0 +1,57 @@
use std::ops::Range;
use super::compiled::{CompiledSyntax, CompiledTokenTypes, TokenId, TOKEN_ID_DEFAULT};
/// A single token produced by tokenization: a token type ID plus the byte range it covers.
pub struct Token {
    /// Token type; an index into `CompiledSyntax::token_names`.
    pub id: TokenId,
    /// Byte range of the token within the tokenized string.
    pub range: Range<usize>,
}
impl CompiledSyntax {
    /// Splits `text` into a flat list of [`Token`]s covering the whole string.
    ///
    /// At each position, patterns are tried in order and the first match wins. Text that no
    /// pattern matches becomes `TOKEN_ID_DEFAULT` tokens (adjacent ones are merged by
    /// `push_token`.) Afterwards, tokens whose text is a keyword get their type replaced.
    pub fn tokenize(&self, text: &str) -> Vec<Token> {
        let mut tokens = vec![];

        let mut i = 0;
        while i < text.len() {
            let mut had_match = false;
            for pattern in &self.patterns {
                match &pattern.is {
                    CompiledTokenTypes::FullMatch(id) => {
                        if let Some(regex_match) = pattern.regex.find(&text[i..]) {
                            // Ignore zero-length matches (e.g. from a pattern like `a*`);
                            // accepting one would advance `i` by zero bytes and loop forever.
                            if regex_match.range().is_empty() {
                                continue;
                            }
                            push_token(&mut tokens, *id, i..i + regex_match.range().end);
                            i += regex_match.range().end;
                            had_match = true;
                            break;
                        }
                    }
                    CompiledTokenTypes::Captures(_types) => { /* TODO */ }
                }
            }
            if !had_match {
                // Advance by a whole character, not a single byte; otherwise `i` can land in
                // the middle of a multi-byte UTF-8 sequence, and the string slicing above (and
                // in the keyword pass below) would panic.
                let char_len = text[i..].chars().next().map_or(1, char::len_utf8);
                push_token(&mut tokens, TOKEN_ID_DEFAULT, i..i + char_len);
                i += char_len;
            }
        }

        for token in &mut tokens {
            if let Some(keyword) = self.keywords.get(&text[token.range.clone()]) {
                if keyword.only_replaces.is_none() || Some(token.id) == keyword.only_replaces {
                    token.id = keyword.into;
                }
            }
        }

        tokens
    }
}
/// Appends a token to `tokens`, coalescing it into the previous token when both share an ID.
fn push_token(tokens: &mut Vec<Token>, id: TokenId, range: Range<usize>) {
    match tokens.last_mut() {
        Some(previous) if previous.id == id => previous.range.end = range.end,
        _ => tokens.push(Token { id, range }),
    }
}

View file

@ -23,6 +23,7 @@
//! HTML renderer that takes an iterator of events as input. //! HTML renderer that takes an iterator of events as input.
use std::borrow::Borrow;
use std::collections::HashMap; use std::collections::HashMap;
use std::io; use std::io;
@ -31,6 +32,7 @@ use pulldown_cmark::{Alignment, CodeBlockKind, Event, LinkType, Tag};
use pulldown_cmark::{CowStr, Event::*}; use pulldown_cmark::{CowStr, Event::*};
use crate::config::{Config, ConfigDerivedData, PicSize}; use crate::config::{Config, ConfigDerivedData, PicSize};
use crate::html::highlight::highlight;
use crate::state::Treehouse; use crate::state::Treehouse;
enum TableState { enum TableState {
@ -38,6 +40,12 @@ enum TableState {
Body, Body,
} }
/// Tracks whether the HTML writer is currently inside a code block.
#[derive(Debug, Clone, PartialEq, Eq)]
enum CodeBlockState<'a> {
    /// Not inside any code block.
    NotInCodeBlock,
    /// Inside a code block. The payload is `Some` once a fenced block's language string has been
    /// recorded, `None` otherwise (e.g. for indented code blocks.)
    InCodeBlock(Option<CowStr<'a>>),
}
struct HtmlWriter<'a, I, W> { struct HtmlWriter<'a, I, W> {
treehouse: &'a Treehouse, treehouse: &'a Treehouse,
config: &'a Config, config: &'a Config,
@ -58,7 +66,7 @@ struct HtmlWriter<'a, I, W> {
table_cell_index: usize, table_cell_index: usize,
numbers: HashMap<CowStr<'a>, usize>, numbers: HashMap<CowStr<'a>, usize>,
in_code_block: bool, code_block_state: CodeBlockState<'a>,
} }
impl<'a, I, W> HtmlWriter<'a, I, W> impl<'a, I, W> HtmlWriter<'a, I, W>
@ -87,7 +95,7 @@ where
table_alignments: vec![], table_alignments: vec![],
table_cell_index: 0, table_cell_index: 0,
numbers: HashMap::new(), numbers: HashMap::new(),
in_code_block: false, code_block_state: CodeBlockState::NotInCodeBlock,
} }
} }
@ -234,16 +242,21 @@ where
} }
} }
Tag::CodeBlock(info) => { Tag::CodeBlock(info) => {
self.in_code_block = true; self.code_block_state = CodeBlockState::InCodeBlock(None);
if !self.end_newline { if !self.end_newline {
self.write_newline()?; self.write_newline()?;
} }
match info { match info {
CodeBlockKind::Fenced(language) => match CodeBlockMode::parse(&language) { CodeBlockKind::Fenced(language) => {
self.code_block_state = CodeBlockState::InCodeBlock(Some(language.clone()));
match CodeBlockMode::parse(&language) {
CodeBlockMode::PlainText => self.write("<pre><code>"), CodeBlockMode::PlainText => self.write("<pre><code>"),
CodeBlockMode::SyntaxHighlightOnly { language } => { CodeBlockMode::SyntaxHighlightOnly { language } => {
self.write("<pre><code class=\"language-")?; self.write("<pre><code class=\"language-")?;
escape_html(&mut self.writer, language)?; escape_html(&mut self.writer, language)?;
if self.config.syntaxes.contains_key(language) {
self.write(" th-syntax-highlighting")?;
}
self.write("\">") self.write("\">")
} }
CodeBlockMode::LiterateProgram { CodeBlockMode::LiterateProgram {
@ -292,7 +305,8 @@ where
self.write("<pre class=\"placeholder-console\">")?; self.write("<pre class=\"placeholder-console\">")?;
Ok(()) Ok(())
} }
}, }
}
CodeBlockKind::Indented => self.write("<pre><code>"), CodeBlockKind::Indented => self.write("<pre><code>"),
} }
} }
@ -416,7 +430,7 @@ where
}, },
_ => "</code></pre>\n", _ => "</code></pre>\n",
})?; })?;
self.in_code_block = false; self.code_block_state = CodeBlockState::NotInCodeBlock;
} }
Tag::List(Some(_)) => { Tag::List(Some(_)) => {
self.write("</ol>\n")?; self.write("</ol>\n")?;
@ -505,8 +519,20 @@ where
} }
} }
if self.in_code_block { if let CodeBlockState::InCodeBlock(language) = &self.code_block_state {
let code_block_mode = language
.as_ref()
.map(|language| CodeBlockMode::parse(language));
let highlighting_language = code_block_mode
.as_ref()
.and_then(|mode| mode.highlighting_language());
let syntax =
highlighting_language.and_then(|language| self.config.syntaxes.get(language));
if let Some(syntax) = syntax {
highlight(&mut self.writer, syntax, text)?;
} else {
escape_html(&mut self.writer, text)?; escape_html(&mut self.writer, text)?;
}
} else { } else {
let mut parser = EmojiParser { text, position: 0 }; let mut parser = EmojiParser { text, position: 0 };
while let Some(token) = parser.next_token() { while let Some(token) = parser.next_token() {
@ -623,6 +649,16 @@ impl<'a> CodeBlockMode<'a> {
CodeBlockMode::SyntaxHighlightOnly { language } CodeBlockMode::SyntaxHighlightOnly { language }
} }
} }
/// Returns the language this mode should be syntax-highlighted as, if any.
fn highlighting_language(&self) -> Option<&str> {
    match self {
        CodeBlockMode::LiterateProgram { language, .. }
        | CodeBlockMode::SyntaxHighlightOnly { language } => Some(language),
        _ => None,
    }
}
} }
/// Iterate over an `Iterator` of `Event`s, generate HTML for each `Event`, and /// Iterate over an `Iterator` of `Event`s, generate HTML for each `Event`, and

View file

@ -1,6 +1,8 @@
// This tokenizer is highly inspired by the one found in rxi's lite. // This tokenizer is highly inspired by the one found in rxi's lite.
// I highly recommend checking it out! // I highly recommend checking it out!
// https://github.com/rxi/lite/blob/master/data/core/tokenizer.lua // https://github.com/rxi/lite/blob/master/data/core/tokenizer.lua
// There's also a mirror of it in the static generator, to enable highlighting of code blocks which
// are *not* JavaScript-powered.
export function compileSyntax(def) { export function compileSyntax(def) {
for (let pattern of def.patterns) { for (let pattern of def.patterns) {
@ -32,7 +34,7 @@ function tokenize(text, syntax) {
let match; let match;
pattern.regex.lastIndex = i; pattern.regex.lastIndex = i;
if ((match = pattern.regex.exec(text)) != null) { if ((match = pattern.regex.exec(text)) != null) {
pushToken(tokens, pattern.as, match[0]); // TODO pushToken(tokens, pattern.is, match[0]); // TODO
i = pattern.regex.lastIndex; i = pattern.regex.lastIndex;
hadMatch = true; hadMatch = true;
break; break;

View file

@ -0,0 +1,76 @@
{
"patterns": [
{ "regex": "\\/\\/.*", "is": "comment" },
{
"regex": "\\/\\*.*?\\*\\/",
"flags": ["dotMatchesNewline"],
"is": "comment"
},
{ "regex": "[A-Z_][a-zA-Z0-9_]*", "is": "keyword2" },
{
"regex": "[a-zA-Z_][a-zA-Z0-9_]*(\\()",
"is": { "default": "function", "captures": ["default"] }
},
{ "regex": "[a-zA-Z_][a-zA-Z0-9_]*", "is": "identifier" },
{ "regex": "0[bB][01_]+n?", "is": "literal" },
{ "regex": "0[oO][0-7_]+n?", "is": "literal" },
{ "regex": "0[xX][0-9a-fA-F_]+n?", "is": "literal" },
{ "regex": "[0-9_]+n", "is": "literal" },
{ "regex": "[0-9_]+(\\.[0-9_]*([eE][-+]?[0-9_]+)?)?", "is": "literal" },
{ "regex": "'(\\\\'|[^'])*'", "is": "string" },
{ "regex": "\"(\\\\\"|[^\"])*\"", "is": "string" },
{ "regex": "`(\\\\`|[^`])*`", "is": "string" },
{ "regex": "[+=/*^%<>!~|&\\.?:-]+", "is": "operator" },
{ "regex": "[,;]", "is": "punct" }
],
"keywords": {
"as": { "into": "keyword1", "onlyReplaces": "identifier" },
"async": { "into": "keyword1", "onlyReplaces": "identifier" },
"await": { "into": "keyword1" },
"break": { "into": "keyword1" },
"case": { "into": "keyword1" },
"catch": { "into": "keyword1" },
"class": { "into": "keyword1" },
"const": { "into": "keyword1" },
"continue": { "into": "keyword1" },
"debugger": { "into": "keyword1" },
"default": { "into": "keyword1" },
"delete": { "into": "keyword1" },
"do": { "into": "keyword1" },
"else": { "into": "keyword1" },
"export": { "into": "keyword1" },
"extends": { "into": "keyword1" },
"finally": { "into": "keyword1" },
"for": { "into": "keyword1" },
"from": { "into": "keyword1", "onlyReplaces": "identifier" },
"function": { "into": "keyword1" },
"get": { "into": "keyword1", "onlyReplaces": "identifier" },
"if": { "into": "keyword1" },
"import": { "into": "keyword1" },
"in": { "into": "keyword1" },
"instanceof": { "into": "keyword1" },
"let": { "into": "keyword1" },
"new": { "into": "keyword1" },
"of": { "into": "keyword1", "onlyReplaces": "identifier" },
"return": { "into": "keyword1" },
"set": { "into": "keyword1", "onlyReplaces": "identifier" },
"static": { "into": "keyword1" },
"switch": { "into": "keyword1" },
"throw": { "into": "keyword1" },
"try": { "into": "keyword1" },
"typeof": { "into": "keyword1" },
"var": { "into": "keyword1" },
"void": { "into": "keyword1" },
"while": { "into": "keyword1" },
"with": { "into": "keyword1" },
"yield": { "into": "keyword1" },
"super": { "into": "keyword2" },
"this": { "into": "keyword2" },
"false": { "into": "literal" },
"true": { "into": "literal" },
"undefined": { "into": "literal" },
"null": { "into": "literal" }
}
}