static syntax highlighting WIP

2024-03-10 23:23:50 +01:00 · 2024-03-10 23:23:50 +01:00 · 5ab8ffdba2
commit 5ab8ffdba2
parent 7fd2d18b69
12 changed files with 523 additions and 63 deletions
--- a/crates/treehouse/Cargo.toml
+++ b/crates/treehouse/Cargo.toml
@ -29,3 +29,4 @@ ulid = "1.0.0"
 url = "2.5.0"
 base64 = "0.21.7"
 chrono = "0.4.35"
+regex = "1.10.3"
--- a/crates/treehouse/src/cli/generate.rs
+++ b/crates/treehouse/src/cli/generate.rs
@ -361,6 +361,7 @@ pub fn generate(paths: &Paths<'_>) -> anyhow::Result<(Config, Treehouse)> {
    config.site = std::env::var("TREEHOUSE_SITE").unwrap_or(config.site);
    config.autopopulate_emoji(&paths.static_dir.join("emoji"))?;
    config.autopopulate_pics(&paths.static_dir.join("pic"))?;
+    config.load_syntaxes(&paths.static_dir.join("syntax"))?;

    info!("cleaning target directory");
    let _ = std::fs::remove_dir_all(paths.target_dir);
--- a/crates/treehouse/src/config.rs
+++ b/crates/treehouse/src/config.rs
@ -1,9 +1,15 @@
 use std::{collections::HashMap, ffi::OsStr, fs::File, io::BufReader, path::Path};

 use anyhow::Context;
+use log::debug;
 use serde::{Deserialize, Serialize};
 use walkdir::WalkDir;

+use crate::html::highlight::{
+    compiled::{compile_syntax, CompiledSyntax},
+    Syntax,
+};
+
 #[derive(Debug, Clone, Deserialize, Serialize)]
 pub struct Config {
    /// Website root; used when generating links.
@ -48,6 +54,13 @@ pub struct Config {
    /// On top of this, pics are autodiscovered by walking the `static/pic` directory.
    /// Only the part before the first dash is treated as the pic's id.
    pub pics: HashMap<String, String>,
+
+    /// Syntax definitions.
+    ///
+    /// These are not part of the config file, but are loaded as part of site configuration from
+    /// `static/syntax`.
+    #[serde(skip)]
+    pub syntaxes: HashMap<String, CompiledSyntax>,
 }

 #[derive(Debug, Clone, Deserialize, Serialize)]
@ -138,6 +151,30 @@ impl Config {
            self.pics.get(id).map(|x| &**x).unwrap_or("404.png")
        )
    }
+
+    /// Loads all syntax definition files.
+    ///
+    /// Recursively walks `dir` and, for every file with a `.json` extension, deserializes it as
+    /// a [`Syntax`], compiles it, and stores the result in `self.syntaxes` under the file's stem
+    /// (its name without the extension). If two files share a stem, the one visited later
+    /// replaces the earlier one.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if walking the directory fails, or if a syntax file cannot be opened or
+    /// deserialized. The error context names the offending file.
+    pub fn load_syntaxes(&mut self, dir: &Path) -> anyhow::Result<()> {
+        for entry in WalkDir::new(dir) {
+            let entry = entry?;
+            let path = entry.path();
+            if path.extension() != Some(OsStr::new("json")) {
+                continue;
+            }
+
+            let name = path
+                .file_stem()
+                .expect("syntax file name should have a stem")
+                .to_string_lossy();
+            debug!("loading syntax {name:?}");
+
+            // Include the path in the context: with several syntax files in the directory,
+            // a bare "could not deserialize" does not say which one is broken.
+            let file = File::open(path)
+                .with_context(|| format!("could not open syntax file {path:?}"))?;
+            let syntax: Syntax = serde_json::from_reader(BufReader::new(file))
+                .with_context(|| format!("could not deserialize syntax file {path:?}"))?;
+            self.syntaxes
+                .insert(name.into_owned(), compile_syntax(&syntax));
+        }
+
+        Ok(())
+    }
 }

 /// Data derived from the config.
--- a/crates/treehouse/src/html.rs
+++ b/crates/treehouse/src/html.rs
@ -1,6 +1,7 @@
 use std::fmt::{self, Display, Write};

 pub mod breadcrumbs;
+pub mod highlight;
 mod markdown;
 pub mod navmap;
 pub mod tree;
--- a/crates/treehouse/src/html/highlight.rs
+++ b/crates/treehouse/src/html/highlight.rs
@ -0,0 +1,94 @@
+//! Tokenizer and syntax highlighter inspired by the one found in rxi's lite.
+//! I highly recommend checking it out!
+//! https://github.com/rxi/lite/blob/master/data/core/tokenizer.lua
+//! There's also a mirror of it in the JavaScript, used to power dynamically editable code blocks.
+//!
+//! Both of these syntax highlighters use the same JSON syntax definitions; however this one is
+//! more limited, in that patterns do not support backtracking.
+//! This is effectively enforced in the dynamic highlighter because this highlighter reports any
+//! regex syntax errors upon site compilation.
+
+pub mod compiled;
+pub mod tokenize;
+
+use std::{collections::HashMap, io};
+
+use pulldown_cmark::escape::{escape_html, StrWrite};
+use serde::{Deserialize, Serialize};
+
+use self::compiled::CompiledSyntax;
+
+/// Syntax definition.
+///
+/// This is the deserialized form of a JSON syntax file. It is turned into a
+/// [`CompiledSyntax`] by [`compiled::compile_syntax`] before being used for tokenization.
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct Syntax {
+    /// Patterns, matched sequentially (patterns at the beginning of the list take precedence.)
+    pub patterns: Vec<Pattern>,
+
+    /// Map of replacements to use if a pattern matches a string exactly.
+    ///
+    /// Keys are the exact token text to look up.
+    pub keywords: HashMap<String, Keyword>,
+}
+
+/// A pattern in a syntax definition.
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct Pattern {
+    /// Regular expression to match.
+    ///
+    /// When compiled, the expression is anchored to the start of the text being matched; a single
+    /// leading `^` is accepted and stripped before the anchor is added.
+    pub regex: String,
+
+    /// Flags to pass to the regex engine to alter how strings are matched.
+    ///
+    /// Optional in the syntax file; defaults to no flags.
+    #[serde(default)]
+    pub flags: Vec<RegexFlag>,
+
+    /// Type to assign to the token. This can be any string, but only a select few have colors
+    /// assigned.
+    pub is: TokenTypes,
+}
+
+/// Assignable token types.
+///
+/// Deserialized untagged: a plain JSON string selects [`TokenTypes::FullMatch`], while an object
+/// with `default` and `captures` fields selects [`TokenTypes::Captures`].
+#[derive(Debug, Clone, Deserialize, Serialize)]
+#[serde(untagged)]
+pub enum TokenTypes {
+    /// Assign a single token type to the entire match.
+    FullMatch(String),
+    /// Assign individual token types to each capture.
+    Captures(CaptureTokenTypes),
+}
+
+/// Token types for a pattern whose captures are assigned types individually.
+///
+/// NOTE(review): the tokenizer does not handle capture-based patterns yet, so how `captures`
+/// lines up with the regex's capture groups is not established here. Presumably entry `n`
+/// belongs to capture group `n + 1`; confirm once implemented.
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct CaptureTokenTypes {
+    /// Token type to use outside captures.
+    pub default: String,
+    /// Token type to use inside captures.
+    pub captures: Vec<String>,
+}
+
+/// Flag passed to the regex engine.
+///
+/// Variants are spelled in camelCase in the syntax file (e.g. `"dotMatchesNewline"`).
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Deserialize, Serialize)]
+#[serde(rename_all = "camelCase")]
+pub enum RegexFlag {
+    /// Make `.` match line separators.
+    DotMatchesNewline,
+}
+
+/// Keyword replacement.
+///
+/// Fields are spelled in camelCase in the syntax file (`into`, `onlyReplaces`).
+#[derive(Debug, Clone, Deserialize, Serialize)]
+#[serde(rename_all = "camelCase")]
+pub struct Keyword {
+    /// What to replace the token type with.
+    pub into: String,
+
+    /// Only replace the token type if it matches this one. If this is not present, any token type
+    /// is replaced.
+    pub only_replaces: Option<String>,
+}
+
+/// Writes `code` to `w` as syntax-highlighted HTML.
+///
+/// `code` is tokenized with `syntax`, and every token is emitted as a `<span>` element whose
+/// `class` attribute is the token's type name. Both the class name and the token text are
+/// HTML-escaped.
+///
+/// # Errors
+///
+/// Propagates any error reported by the underlying writer.
+pub fn highlight(mut w: impl StrWrite, syntax: &CompiledSyntax, code: &str) -> io::Result<()> {
+    syntax
+        .tokenize(code)
+        .into_iter()
+        .try_for_each(|token| -> io::Result<()> {
+            let class_name = &syntax.token_names[token.id];
+            let token_text = &code[token.range];
+
+            w.write_str("<span class=\"")?;
+            escape_html(&mut w, class_name)?;
+            w.write_str("\">")?;
+            escape_html(&mut w, token_text)?;
+            w.write_str("</span>")?;
+            Ok(())
+        })
+}
--- a/crates/treehouse/src/html/highlight/compiled.rs
+++ b/crates/treehouse/src/html/highlight/compiled.rs
@ -0,0 +1,118 @@
+use std::collections::HashMap;
+
+use log::error;
+use regex::{Regex, RegexBuilder};
+
+use super::{RegexFlag, Syntax, TokenTypes};
+
+/// During compilation, token names are converted to numeric IDs for performance.
+///
+/// An ID is an index into [`CompiledSyntax::token_names`].
+pub type TokenId = usize;
+
+/// ID of the `"default"` token name, which [`compile_syntax`] always registers first.
+///
+/// The tokenizer assigns it to text that no pattern matches.
+pub const TOKEN_ID_DEFAULT: TokenId = 0;
+
+/// A [`Syntax`] with token names converted to numeric IDs and regexes compiled.
+///
+/// Produced by [`compile_syntax`].
+#[derive(Debug, Clone)]
+pub struct CompiledSyntax {
+    /// Lookup table which maps numeric IDs to token names.
+    pub token_names: Vec<String>,
+
+    /// Compiled patterns, in the same relative order as in the source [`Syntax`]. Patterns whose
+    /// regex failed to compile are absent.
+    pub patterns: Vec<CompiledPattern>,
+    /// Keyword replacements, keyed by the exact token text.
+    pub keywords: HashMap<String, CompiledKeyword>,
+}
+
+/// Compiled counterpart of [`TokenTypes`], with token names replaced by [`TokenId`]s.
+#[derive(Debug, Clone)]
+pub enum CompiledTokenTypes {
+    /// A single token ID for the entire match.
+    FullMatch(TokenId),
+    /// Individual token IDs for each capture.
+    Captures(CompiledCaptureTokenTypes),
+}
+
+/// Compiled counterpart of `CaptureTokenTypes`, with token names replaced by [`TokenId`]s.
+#[derive(Debug, Clone)]
+pub struct CompiledCaptureTokenTypes {
+    /// Token ID compiled from the definition's `default` name.
+    pub default: TokenId,
+    /// Token IDs compiled from the definition's `captures` names, in the same order.
+    pub captures: Vec<TokenId>,
+}
+
+/// Compiled counterpart of a syntax definition's pattern.
+#[derive(Debug, Clone)]
+pub struct CompiledPattern {
+    /// Compiled regular expression, built by [`compile_syntax`] with a `^` anchor prepended.
+    pub regex: Regex,
+    /// Token ID(s) to assign when the pattern matches.
+    pub is: CompiledTokenTypes,
+}
+
+/// Compiled counterpart of a keyword replacement, with token names replaced by [`TokenId`]s.
+#[derive(Debug, Clone)]
+pub struct CompiledKeyword {
+    /// Token ID to replace the matching token's ID with.
+    pub into: TokenId,
+    /// If present, the replacement only applies to tokens that currently have this ID.
+    pub only_replaces: Option<TokenId>,
+}
+
+/// Compiles a [`Syntax`] definition into a [`CompiledSyntax`] ready for tokenization.
+///
+/// Token type names are converted to numeric IDs, which are indices into the returned
+/// `token_names`. Index 0 is always `"default"` ([`TOKEN_ID_DEFAULT`]); other names are assigned
+/// IDs in the order they are first encountered.
+///
+/// Every pattern's regex is anchored to the start of the haystack, because the tokenizer matches
+/// against the remaining text at its current position.
+///
+/// Patterns whose regex fails to compile are reported through `error!` and left out of the
+/// result; the remaining patterns are still compiled.
+pub fn compile_syntax(syntax: &Syntax) -> CompiledSyntax {
+    let mut token_names = vec!["default".into()];
+    // Returns the ID of `name`, registering it first if it has not been seen yet.
+    let mut get_token_id = |name: &str| -> TokenId {
+        if let Some(id) = token_names.iter().position(|n| n == name) {
+            id
+        } else {
+            let id = token_names.len();
+            token_names.push(name.to_owned());
+            id
+        }
+    };
+
+    let patterns = syntax
+        .patterns
+        .iter()
+        .filter_map(|pattern| {
+            // If there's an existing `^`, it should not cause compilation errors for the user.
+            let source = pattern.regex.strip_prefix('^').unwrap_or(&pattern.regex);
+            // NOTE: `regex` has no support for sticky flags, so we need to anchor the match to the
+            // start ourselves. The pattern is wrapped in a non-capturing group because alternation
+            // binds loosest: a bare `^a|b` would only anchor `a` and let `b` match anywhere.
+            let regex = RegexBuilder::new(&format!("^(?:{source})"))
+                .dot_matches_new_line(pattern.flags.contains(&RegexFlag::DotMatchesNewline))
+                .build()
+                .map_err(|e| {
+                    // NOTE: This could probably use better diagnostics, but it's pretty much
+                    // impossible to get a source span out of serde's output (because it forgoes
+                    // source information, rightfully so.) Therefore we have to settle on
+                    // a poor man's error log.
+                    error!("regex compilation error in pattern {pattern:?}: {e}");
+                })
+                .ok()?;
+
+            let is = match &pattern.is {
+                TokenTypes::FullMatch(name) => CompiledTokenTypes::FullMatch(get_token_id(name)),
+                TokenTypes::Captures(types) => {
+                    CompiledTokenTypes::Captures(CompiledCaptureTokenTypes {
+                        default: get_token_id(&types.default),
+                        captures: types
+                            .captures
+                            .iter()
+                            .map(|name| get_token_id(name))
+                            .collect(),
+                    })
+                }
+            };
+
+            Some(CompiledPattern { regex, is })
+        })
+        .collect();
+    let keywords = syntax
+        .keywords
+        .iter()
+        .map(|(text, keyword)| {
+            (
+                text.clone(),
+                CompiledKeyword {
+                    into: get_token_id(&keyword.into),
+                    only_replaces: keyword.only_replaces.as_deref().map(&mut get_token_id),
+                },
+            )
+        })
+        .collect();
+
+    CompiledSyntax {
+        token_names,
+        patterns,
+        keywords,
+    }
+}
--- a/crates/treehouse/src/html/highlight/tokenize.rs
+++ b/crates/treehouse/src/html/highlight/tokenize.rs
@ -0,0 +1,57 @@
+use std::ops::Range;
+
+use super::compiled::{CompiledSyntax, CompiledTokenTypes, TokenId, TOKEN_ID_DEFAULT};
+
+/// A span of the tokenized text together with the token type assigned to it.
+pub struct Token {
+    /// Token type, as an index into the syntax's `token_names`.
+    pub id: TokenId,
+    /// Byte range of the token within the text that was tokenized.
+    pub range: Range<usize>,
+}
+
+impl CompiledSyntax {
+    /// Splits `text` into tokens that together cover it from start to end.
+    ///
+    /// At each position the patterns are tried in order and the first one that matches wins. If
+    /// none matches, a single character is emitted as [`TOKEN_ID_DEFAULT`]. Adjacent tokens with
+    /// the same ID are merged into one by `push_token`.
+    ///
+    /// Once the whole text is tokenized, every token whose text is a key of `self.keywords` has
+    /// its ID replaced by the keyword's `into`, provided `only_replaces` is absent or equal to
+    /// the token's current ID.
+    ///
+    /// Token ranges are byte ranges into `text` and always fall on `char` boundaries.
+    pub fn tokenize(&self, text: &str) -> Vec<Token> {
+        let mut tokens = vec![];
+
+        let mut i = 0;
+        while i < text.len() {
+            let rest = &text[i..];
+            let mut had_match = false;
+            for pattern in &self.patterns {
+                match &pattern.is {
+                    CompiledTokenTypes::FullMatch(id) => {
+                        // Only accept a match that starts at the current position and consumes
+                        // at least one byte. A match further ahead would mislabel the text
+                        // before it, and an empty match would never advance `i`.
+                        let regex_match = pattern
+                            .regex
+                            .find(rest)
+                            .filter(|m| m.start() == 0 && m.end() > 0);
+                        if let Some(regex_match) = regex_match {
+                            push_token(&mut tokens, *id, i..i + regex_match.end());
+                            i += regex_match.end();
+                            had_match = true;
+                            break;
+                        }
+                    }
+                    // TODO: Capture-based token types are not implemented yet, so such patterns
+                    // never produce a match.
+                    CompiledTokenTypes::Captures(_) => {}
+                }
+            }
+
+            if !had_match {
+                // Step over a whole character rather than a single byte, so that `i` stays on
+                // a `char` boundary and slicing `text[i..]` cannot panic on multi-byte UTF-8.
+                let step = rest.chars().next().map_or(1, char::len_utf8);
+                push_token(&mut tokens, TOKEN_ID_DEFAULT, i..i + step);
+                i += step;
+            }
+        }
+
+        for token in &mut tokens {
+            if let Some(keyword) = self.keywords.get(&text[token.range.clone()]) {
+                if keyword.only_replaces.is_none() || Some(token.id) == keyword.only_replaces {
+                    token.id = keyword.into;
+                }
+            }
+        }
+
+        tokens
+    }
+}
+
+/// Appends a token to `tokens`, merging it into the last token when both share the same ID.
+///
+/// When merging, only the end of the last token's range is moved to `range.end`; its start is
+/// left untouched.
+fn push_token(tokens: &mut Vec<Token>, id: TokenId, range: Range<usize>) {
+    match tokens.last_mut() {
+        Some(last) if last.id == id => last.range.end = range.end,
+        _ => tokens.push(Token { id, range }),
+    }
+}
--- a/crates/treehouse/src/html/markdown.rs
+++ b/crates/treehouse/src/html/markdown.rs
@ -23,6 +23,7 @@

 //! HTML renderer that takes an iterator of events as input.

+use std::borrow::Borrow;
 use std::collections::HashMap;
 use std::io;

@ -31,6 +32,7 @@ use pulldown_cmark::{Alignment, CodeBlockKind, Event, LinkType, Tag};
 use pulldown_cmark::{CowStr, Event::*};

 use crate::config::{Config, ConfigDerivedData, PicSize};
+use crate::html::highlight::highlight;
 use crate::state::Treehouse;

 enum TableState {
@ -38,6 +40,12 @@ enum TableState {
    Body,
 }

+/// Tracks whether the writer is currently inside a code block.
+#[derive(Debug, Clone, PartialEq, Eq)]
+enum CodeBlockState<'a> {
+    /// Not inside a code block.
+    NotInCodeBlock,
+    /// Inside a code block. Holds the info string of a fenced block (the same text that is passed
+    /// to `CodeBlockMode::parse`), or `None` for an indented block.
+    InCodeBlock(Option<CowStr<'a>>),
+}
+
 struct HtmlWriter<'a, I, W> {
    treehouse: &'a Treehouse,
    config: &'a Config,
@ -58,7 +66,7 @@ struct HtmlWriter<'a, I, W> {
    table_cell_index: usize,
    numbers: HashMap<CowStr<'a>, usize>,

-    in_code_block: bool,
+    code_block_state: CodeBlockState<'a>,
 }

 impl<'a, I, W> HtmlWriter<'a, I, W>
@ -87,7 +95,7 @@ where
            table_alignments: vec![],
            table_cell_index: 0,
            numbers: HashMap::new(),
-            in_code_block: false,
+            code_block_state: CodeBlockState::NotInCodeBlock,
        }
    }

@ -234,65 +242,71 @@ where
                }
            }
            Tag::CodeBlock(info) => {
-                self.in_code_block = true;
+                self.code_block_state = CodeBlockState::InCodeBlock(None);
                if !self.end_newline {
                    self.write_newline()?;
                }
                match info {
-                    CodeBlockKind::Fenced(language) => match CodeBlockMode::parse(&language) {
-                        CodeBlockMode::PlainText => self.write("<pre><code>"),
-                        CodeBlockMode::SyntaxHighlightOnly { language } => {
-                            self.write("<pre><code class=\"language-")?;
-                            escape_html(&mut self.writer, language)?;
-                            self.write("\">")
-                        }
-                        CodeBlockMode::LiterateProgram {
-                            language,
-                            kind,
-                            program_name,
-                        } => {
-                            self.write(match &kind {
-                                LiterateCodeKind::Input => {
-                                    "<th-literate-program data-mode=\"input\" "
-                                }
-                                LiterateCodeKind::Output { .. } => {
-                                    "<th-literate-program data-mode=\"output\" "
-                                }
-                            })?;
-                            self.write("data-program=\"")?;
-                            escape_href(&mut self.writer, self.page_id)?;
-                            self.write(":")?;
-                            escape_html(&mut self.writer, program_name)?;
-                            self.write("\" data-language=\"")?;
-                            escape_html(&mut self.writer, language)?;
-                            self.write("\" role=\"code\">")?;
-
-                            if let LiterateCodeKind::Output { placeholder_pic_id } = kind {
-                                if !placeholder_pic_id.is_empty() {
-                                    self.write(
-                                        "<img class=\"placeholder-image\" loading=\"lazy\" src=\"",
-                                    )?;
-                                    escape_html(
-                                        &mut self.writer,
-                                        &self.config.pic_url(placeholder_pic_id),
-                                    )?;
-                                    self.write("\"")?;
-                                    if let Some(PicSize { width, height }) = self
-                                        .config_derived_data
-                                        .pic_size(self.config, placeholder_pic_id)
-                                    {
-                                        self.write(&format!(
-                                            " width=\"{width}\" height=\"{height}\""
-                                        ))?;
-                                    }
-                                    self.write(">")?;
+                    CodeBlockKind::Fenced(language) => {
+                        self.code_block_state = CodeBlockState::InCodeBlock(Some(language.clone()));
+                        match CodeBlockMode::parse(&language) {
+                            CodeBlockMode::PlainText => self.write("<pre><code>"),
+                            CodeBlockMode::SyntaxHighlightOnly { language } => {
+                                self.write("<pre><code class=\"language-")?;
+                                escape_html(&mut self.writer, language)?;
+                                if self.config.syntaxes.contains_key(language) {
+                                    self.write(" th-syntax-highlighting")?;
                                }
+                                self.write("\">")
                            }
+                            CodeBlockMode::LiterateProgram {
+                                language,
+                                kind,
+                                program_name,
+                            } => {
+                                self.write(match &kind {
+                                    LiterateCodeKind::Input => {
+                                        "<th-literate-program data-mode=\"input\" "
+                                    }
+                                    LiterateCodeKind::Output { .. } => {
+                                        "<th-literate-program data-mode=\"output\" "
+                                    }
+                                })?;
+                                self.write("data-program=\"")?;
+                                escape_href(&mut self.writer, self.page_id)?;
+                                self.write(":")?;
+                                escape_html(&mut self.writer, program_name)?;
+                                self.write("\" data-language=\"")?;
+                                escape_html(&mut self.writer, language)?;
+                                self.write("\" role=\"code\">")?;

-                            self.write("<pre class=\"placeholder-console\">")?;
-                            Ok(())
+                                if let LiterateCodeKind::Output { placeholder_pic_id } = kind {
+                                    if !placeholder_pic_id.is_empty() {
+                                        self.write(
+                                                            "<img class=\"placeholder-image\" loading=\"lazy\" src=\"",
+                                                        )?;
+                                        escape_html(
+                                            &mut self.writer,
+                                            &self.config.pic_url(placeholder_pic_id),
+                                        )?;
+                                        self.write("\"")?;
+                                        if let Some(PicSize { width, height }) = self
+                                            .config_derived_data
+                                            .pic_size(self.config, placeholder_pic_id)
+                                        {
+                                            self.write(&format!(
+                                                " width=\"{width}\" height=\"{height}\""
+                                            ))?;
+                                        }
+                                        self.write(">")?;
+                                    }
+                                }
+
+                                self.write("<pre class=\"placeholder-console\">")?;
+                                Ok(())
+                            }
                        }
-                    },
+                    }
                    CodeBlockKind::Indented => self.write("<pre><code>"),
                }
            }
@ -416,7 +430,7 @@ where
                    },
                    _ => "</code></pre>\n",
                })?;
-                self.in_code_block = false;
+                self.code_block_state = CodeBlockState::NotInCodeBlock;
            }
            Tag::List(Some(_)) => {
                self.write("</ol>\n")?;
@ -505,8 +519,20 @@ where
            }
        }

-        if self.in_code_block {
-            escape_html(&mut self.writer, text)?;
+        if let CodeBlockState::InCodeBlock(language) = &self.code_block_state {
+            let code_block_mode = language
+                .as_ref()
+                .map(|language| CodeBlockMode::parse(language));
+            let highlighting_language = code_block_mode
+                .as_ref()
+                .and_then(|mode| mode.highlighting_language());
+            let syntax =
+                highlighting_language.and_then(|language| self.config.syntaxes.get(language));
+            if let Some(syntax) = syntax {
+                highlight(&mut self.writer, syntax, text)?;
+            } else {
+                escape_html(&mut self.writer, text)?;
+            }
        } else {
            let mut parser = EmojiParser { text, position: 0 };
            while let Some(token) = parser.next_token() {
@ -623,6 +649,16 @@ impl<'a> CodeBlockMode<'a> {
            CodeBlockMode::SyntaxHighlightOnly { language }
        }
    }
+
+    /// Returns the language name carried by this mode, or `None` for modes without one.
+    ///
+    /// Both `LiterateProgram` and `SyntaxHighlightOnly` carry a language.
+    fn highlighting_language(&self) -> Option<&str> {
+        match self {
+            CodeBlockMode::LiterateProgram { language, .. }
+            | CodeBlockMode::SyntaxHighlightOnly { language } => Some(language),
+            _ => None,
+        }
+    }
 }

 /// Iterate over an `Iterator` of `Event`s, generate HTML for each `Event`, and