diff --git a/Cargo.lock b/Cargo.lock
index 734c92f..b7c4132 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1160,9 +1160,9 @@ dependencies = [
 
 [[package]]
 name = "regex"
-version = "1.9.3"
+version = "1.10.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "81bc1d4caf89fac26a70747fe603c130093b53c773888797a6329091246d651a"
+checksum = "b62dbe01f0b06f9d8dc7d49e05a0785f153b00b2c227856282f671e0318c9b15"
 dependencies = [
  "aho-corasick",
  "memchr",
@@ -1172,9 +1172,9 @@ dependencies = [
 
 [[package]]
 name = "regex-automata"
-version = "0.3.6"
+version = "0.4.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fed1ceff11a1dddaee50c9dc8e4938bd106e9d89ae372f192311e7da498e3b69"
+checksum = "86b83b8b9847f9bf95ef68afb0b8e6cdb80f498442f5179a29fad448fcc1eaea"
 dependencies = [
  "aho-corasick",
  "memchr",
@@ -1183,9 +1183,9 @@ dependencies = [
 
 [[package]]
 name = "regex-syntax"
-version = "0.7.4"
+version = "0.8.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e5ea92a5b6195c6ef2a0295ea818b312502c6fc94dde986c5553242e18fd4ce2"
+checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f"
 
 [[package]]
 name = "rustc-demangle"
@@ -1580,6 +1580,7 @@ dependencies = [
  "log",
  "pulldown-cmark",
  "rand",
+ "regex",
  "serde",
  "serde_json",
  "tokio",
diff --git a/content/treehouse/dev/syntax-highlighting.tree b/content/treehouse/dev/syntax-highlighting.tree
new file mode 100644
index 0000000..df30aaf
--- /dev/null
+++ b/content/treehouse/dev/syntax-highlighting.tree
@@ -0,0 +1,36 @@
+%% title = "syntax highlighting gallery"
+
+- this is a page demonstrating syntaxes supported by the treehouse
+
+  - really there's not much more to it, but I use it for debugging
+    with it you can get a general feel for how I highlight things in the treehouse
+
+- `javascript`
+```javascript
+// t is an existing tile index; variable name is short for brevity
+export function removeRedundancies(t) {
+    if (isSet(t, SE) && (!isSet(t, S) || !isSet(t, E))) {
+        t &= ~SE;
+    }
+    if (isSet(t, SW) && (!isSet(t, S) || !isSet(t, W))) {
+        t &= ~SW;
+    }
+    if (isSet(t, NW) && (!isSet(t, N) || !isSet(t, W))) {
+        t &= ~NW;
+    }
+    if (isSet(t, NE) && (!isSet(t, N) || !isSet(t, E))) {
+        t &= ~NE;
+    }
+    return t;
+}
+
+/* This is
+   a multiline comment. */
+
+ident Class CONSTANT funciton()
+0b1010 0o01234567 0x0123456789ABCDEF
+01234567
+1.41e-3
+'string' /**/ "string" /**/ `string`
++ - * / == != <= >= ! ~ || && . ? :
+, ;
+```
diff --git a/crates/treehouse/Cargo.toml b/crates/treehouse/Cargo.toml
index b2facbf..cdd3438 100644
--- a/crates/treehouse/Cargo.toml
+++ b/crates/treehouse/Cargo.toml
@@ -29,3 +29,4 @@ ulid = "1.0.0"
 url = "2.5.0"
 base64 = "0.21.7"
 chrono = "0.4.35"
+regex = "1.10.3"
diff --git a/crates/treehouse/src/cli/generate.rs b/crates/treehouse/src/cli/generate.rs
index 7370c20..de3001e 100644
--- a/crates/treehouse/src/cli/generate.rs
+++ b/crates/treehouse/src/cli/generate.rs
@@ -361,6 +361,7 @@ pub fn generate(paths: &Paths<'_>) -> anyhow::Result<(Config, Treehouse)> {
     config.site = std::env::var("TREEHOUSE_SITE").unwrap_or(config.site);
     config.autopopulate_emoji(&paths.static_dir.join("emoji"))?;
     config.autopopulate_pics(&paths.static_dir.join("pic"))?;
+    config.load_syntaxes(&paths.static_dir.join("syntax"))?;
 
     info!("cleaning target directory");
     let _ = std::fs::remove_dir_all(paths.target_dir);
diff --git a/crates/treehouse/src/config.rs b/crates/treehouse/src/config.rs
index 6dbe936..184b2e0 100644
--- a/crates/treehouse/src/config.rs
+++ b/crates/treehouse/src/config.rs
@@ -1,9 +1,15 @@
 use std::{collections::HashMap, ffi::OsStr, fs::File, io::BufReader, path::Path};
 
 use anyhow::Context;
+use log::debug;
 use serde::{Deserialize, Serialize};
 use walkdir::WalkDir;
 
+use crate::html::highlight::{
+    compiled::{compile_syntax, CompiledSyntax},
+    Syntax,
+};
+
 #[derive(Debug, Clone, Deserialize, Serialize)]
 pub struct Config {
     /// Website root; used when generating links.
@@ -48,6 +54,13 @@ pub struct Config {
     /// On top of this, pics are autodiscovered by walking the `static/pic` directory.
     /// Only the part before the first dash is treated as the pic's id.
     pub pics: HashMap<String, String>,
+
+    /// Syntax definitions.
+    ///
+    /// These are not part of the config file, but are loaded as part of site configuration from
+    /// `static/syntax`.
+    #[serde(skip)]
+    pub syntaxes: HashMap<String, CompiledSyntax>,
 }
 
 #[derive(Debug, Clone, Deserialize, Serialize)]
@@ -138,6 +151,30 @@ impl Config {
             self.pics.get(id).map(|x| &**x).unwrap_or("404.png")
         )
     }
+
+    /// Loads all syntax definition files.
+    pub fn load_syntaxes(&mut self, dir: &Path) -> anyhow::Result<()> {
+        for entry in WalkDir::new(dir) {
+            let entry = entry?;
+            if entry.path().extension() == Some(OsStr::new("json")) {
+                let name = entry
+                    .path()
+                    .file_stem()
+                    .expect("syntax file name should have a stem")
+                    .to_string_lossy();
+                debug!("loading syntax {name:?}");
+
+                let syntax: Syntax = serde_json::from_reader(BufReader::new(
+                    File::open(entry.path()).context("could not open syntax file")?,
+                ))
+                .context("could not deserialize syntax file")?;
+                let compiled = compile_syntax(&syntax);
+                self.syntaxes.insert(name.into_owned(), compiled);
+            }
+        }
+
+        Ok(())
+    }
 }
 
 /// Data derived from the config.
diff --git a/crates/treehouse/src/html.rs b/crates/treehouse/src/html.rs
index 78a78d7..12da43f 100644
--- a/crates/treehouse/src/html.rs
+++ b/crates/treehouse/src/html.rs
@@ -1,6 +1,7 @@
 use std::fmt::{self, Display, Write};
 
 pub mod breadcrumbs;
+pub mod highlight;
 mod markdown;
 pub mod navmap;
 pub mod tree;
diff --git a/crates/treehouse/src/html/highlight.rs b/crates/treehouse/src/html/highlight.rs
new file mode 100644
index 0000000..a2aed8c
--- /dev/null
+++ b/crates/treehouse/src/html/highlight.rs
@@ -0,0 +1,94 @@
+//! Tokenizer and syntax highlighter inspired by the one found in rxi's lite.
+//! I highly recommend checking it out!
+//! https://github.com/rxi/lite/blob/master/data/core/tokenizer.lua
+//! There's also a mirror of it on the JavaScript side, used to power dynamically editable code
+//! blocks.
+//!
+//! Both of these syntax highlighters use the same JSON syntax definitions; however, this one is
+//! more limited, in that patterns do not support backtracking.
+//! This limitation is effectively enforced for the dynamic highlighter too, because this
+//! highlighter reports any regex syntax errors upon site compilation.
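+//!
+//! For reference, a syntax definition is a JSON file shaped roughly like this (see
+//! `static/syntax/javascript.json` introduced in this commit for a full example):
+//!
+//! ```json
+//! {
+//!     "patterns": [
+//!         { "regex": "\\/\\/.*", "is": "comment" },
+//!         { "regex": "[a-zA-Z_][a-zA-Z0-9_]*", "is": "identifier" }
+//!     ],
+//!     "keywords": {
+//!         "let": { "into": "keyword1", "onlyReplaces": "identifier" }
+//!     }
+//! }
+//! ```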
+
+pub mod compiled;
+pub mod tokenize;
+
+use std::{collections::HashMap, io};
+
+use pulldown_cmark::escape::{escape_html, StrWrite};
+use serde::{Deserialize, Serialize};
+
+use self::compiled::CompiledSyntax;
+
+/// Syntax definition.
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct Syntax {
+    /// Patterns, matched sequentially (patterns at the beginning of the list take precedence).
+    pub patterns: Vec<Pattern>,
+
+    /// Map of replacements to use if a pattern matches a string exactly.
+    pub keywords: HashMap<String, Keyword>,
+}
+
+/// A pattern in a syntax definition.
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct Pattern {
+    /// Regular expression to match.
+    pub regex: String,
+
+    /// Flags to pass to the regex engine to alter how strings are matched.
+    #[serde(default)]
+    pub flags: Vec<RegexFlag>,
+
+    /// Type to assign to the token. This can be any string, but only a select few have colors
+    /// assigned.
+    pub is: TokenTypes,
+}
+
+/// Assignable token types.
+#[derive(Debug, Clone, Deserialize, Serialize)]
+#[serde(untagged)]
+pub enum TokenTypes {
+    /// Assign a single token type to the entire match.
+    FullMatch(String),
+    /// Assign individual token types to each capture.
+    Captures(CaptureTokenTypes),
+}
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct CaptureTokenTypes {
+    /// Token type to use outside captures.
+    pub default: String,
+    /// Token type to use inside captures.
+    pub captures: Vec<String>,
+}
+
+/// Flag passed to the regex engine.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Deserialize, Serialize)]
+#[serde(rename_all = "camelCase")]
+pub enum RegexFlag {
+    /// Make `.` match line separators.
+    DotMatchesNewline,
+}
+
+/// Keyword replacement.
+#[derive(Debug, Clone, Deserialize, Serialize)]
+#[serde(rename_all = "camelCase")]
+pub struct Keyword {
+    /// What to replace the token type with.
+    pub into: String,
+
+    /// Only replace the token type if it matches this one. If this is not present, any token type
+    /// is replaced.
+    pub only_replaces: Option<String>,
+}
+
+pub fn highlight(mut w: impl StrWrite, syntax: &CompiledSyntax, code: &str) -> io::Result<()> {
+    let tokens = syntax.tokenize(code);
+    for token in tokens {
+        w.write_str("<span class=\"")?;
+        escape_html(&mut w, &syntax.token_names[token.id])?;
+        w.write_str("\">")?;
+        escape_html(&mut w, &code[token.range])?;
+        w.write_str("</span>")?;
+    }
+    Ok(())
+}
diff --git a/crates/treehouse/src/html/highlight/compiled.rs b/crates/treehouse/src/html/highlight/compiled.rs
new file mode 100644
index 0000000..5552ab2
--- /dev/null
+++ b/crates/treehouse/src/html/highlight/compiled.rs
@@ -0,0 +1,118 @@
+use std::collections::HashMap;
+
+use log::error;
+use regex::{Regex, RegexBuilder};
+
+use super::{RegexFlag, Syntax, TokenTypes};
+
+/// During compilation, token names are converted to numeric IDs for performance.
+pub type TokenId = usize;
+
+pub const TOKEN_ID_DEFAULT: TokenId = 0;
+
+#[derive(Debug, Clone)]
+pub struct CompiledSyntax {
+    /// Lookup table which maps numeric IDs to token names.
+    pub token_names: Vec<String>,
+
+    pub patterns: Vec<CompiledPattern>,
+    pub keywords: HashMap<String, CompiledKeyword>,
+}
+
+#[derive(Debug, Clone)]
+pub enum CompiledTokenTypes {
+    FullMatch(TokenId),
+    Captures(CompiledCaptureTokenTypes),
+}
+
+#[derive(Debug, Clone)]
+pub struct CompiledCaptureTokenTypes {
+    pub default: TokenId,
+    pub captures: Vec<TokenId>,
+}
+
+#[derive(Debug, Clone)]
+pub struct CompiledPattern {
+    pub regex: Regex,
+    pub is: CompiledTokenTypes,
+}
+
+#[derive(Debug, Clone)]
+pub struct CompiledKeyword {
+    pub into: TokenId,
+    pub only_replaces: Option<TokenId>,
+}
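+
+/// Compiles a syntax definition into an executable form: token type names are interned into
+/// numeric [`TokenId`]s, and each pattern's regex is built upfront, anchored to the start of the
+/// haystack. Patterns whose regexes fail to compile are logged and skipped rather than failing
+/// site generation.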
+pub fn compile_syntax(syntax: &Syntax) -> CompiledSyntax {
+    let mut token_names = vec!["default".into()];
+    let mut get_token_id = |name: &str| -> TokenId {
+        if let Some(id) = token_names.iter().position(|n| n == name) {
+            id
+        } else {
+            let id = token_names.len();
+            token_names.push(name.to_owned());
+            id
+        }
+    };
+
+    let patterns = syntax
+        .patterns
+        .iter()
+        .filter_map(|pattern| {
+            // NOTE: `regex` has no support for sticky flags, so we need to anchor the match to the
+            // start ourselves.
+            let regex = RegexBuilder::new(&format!(
+                "^{}",
+                // If there's an existing `^`, it should not cause compilation errors for the user.
+                pattern.regex.strip_prefix('^').unwrap_or(&pattern.regex)
+            ))
+            .dot_matches_new_line(pattern.flags.contains(&RegexFlag::DotMatchesNewline))
+            .build()
+            .map_err(|e| {
+                // NOTE: This could probably use better diagnostics, but it's pretty much
+                // impossible to get a source span out of serde's output (because it forgoes
+                // source information, rightfully so). Therefore we have to settle on
+                // a poor man's error log.
+                error!("regex compilation error in pattern {pattern:?}: {e}");
+            })
+            .ok()?;
+            Some(CompiledPattern {
+                regex,
+                is: match &pattern.is {
+                    TokenTypes::FullMatch(name) => {
+                        CompiledTokenTypes::FullMatch(get_token_id(name))
+                    }
+                    TokenTypes::Captures(types) => {
+                        CompiledTokenTypes::Captures(CompiledCaptureTokenTypes {
+                            default: get_token_id(&types.default),
+                            captures: types
+                                .captures
+                                .iter()
+                                .map(|name| get_token_id(name))
+                                .collect(),
+                        })
+                    }
+                },
+            })
+        })
+        .collect();
+    let keywords = syntax
+        .keywords
+        .iter()
+        .map(|(text, keyword)| {
+            (
+                text.clone(),
+                CompiledKeyword {
+                    into: get_token_id(&keyword.into),
+                    only_replaces: keyword.only_replaces.as_deref().map(&mut get_token_id),
+                },
+            )
+        })
+        .collect();
+
+    CompiledSyntax {
+        token_names,
+        patterns,
+        keywords,
+    }
+}
diff --git a/crates/treehouse/src/html/highlight/tokenize.rs b/crates/treehouse/src/html/highlight/tokenize.rs
new file mode 100644
index 0000000..3ced033
--- /dev/null
+++ b/crates/treehouse/src/html/highlight/tokenize.rs
@@ -0,0 +1,57 @@
+use std::ops::Range;
+
+use super::compiled::{CompiledSyntax, CompiledTokenTypes, TokenId, TOKEN_ID_DEFAULT};
+
+pub struct Token {
+    pub id: TokenId,
+    pub range: Range<usize>,
+}
+
+impl CompiledSyntax {
+    pub fn tokenize(&self, text: &str) -> Vec<Token> {
+        let mut tokens = vec![];
+
+        let mut i = 0;
+        while i < text.len() {
+            let mut had_match = false;
+            for pattern in &self.patterns {
+                match &pattern.is {
+                    CompiledTokenTypes::FullMatch(id) => {
+                        if let Some(regex_match) = pattern.regex.find(&text[i..]) {
+                            push_token(&mut tokens, *id, i..i + regex_match.range().end);
+                            i += regex_match.range().end;
+                            had_match = true;
+                            break;
+                        }
+                    }
+                    CompiledTokenTypes::Captures(types) => { /* TODO */ }
+                }
+            }
+
+            if !had_match {
+                push_token(&mut tokens, TOKEN_ID_DEFAULT, i..i + 1);
+                i += 1;
+            }
+        }
+
+        // Second pass: retype any token whose text matches a keyword exactly.
+        for token in &mut tokens {
+            if let Some(keyword) = self.keywords.get(&text[token.range.clone()]) {
+                if keyword.only_replaces.is_none() || Some(token.id) == keyword.only_replaces {
+                    token.id = keyword.into;
+                }
+            }
+        }
+
+        tokens
+    }
+}
+
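+/// Pushes a token into the list, extending the previous token instead if it has the same type.
+/// This way runs of unmatched characters coalesce into a single `default` token rather than one
+/// token per character.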
"),
-                        CodeBlockMode::SyntaxHighlightOnly { language } => {
-                            self.write("
")
-                        }
-                        CodeBlockMode::LiterateProgram {
-                            language,
-                            kind,
-                            program_name,
-                        } => {
-                            self.write(match &kind {
-                                LiterateCodeKind::Input => {
-                                    " {
-                                    "")?;
-
-                            if let LiterateCodeKind::Output { placeholder_pic_id } = kind {
-                                if !placeholder_pic_id.is_empty() {
-                                    self.write(
-                                        "")?;
+                    CodeBlockKind::Fenced(language) => {
+                        self.code_block_state = CodeBlockState::InCodeBlock(Some(language.clone()));
+                        match CodeBlockMode::parse(&language) {
+                            CodeBlockMode::PlainText => self.write("
"),
+                            CodeBlockMode::SyntaxHighlightOnly { language } => {
+                                self.write("
")
                             }
+                            CodeBlockMode::LiterateProgram {
+                                language,
+                                kind,
+                                program_name,
+                            } => {
+                                self.write(match &kind {
+                                    LiterateCodeKind::Input => {
+                                        " {
+                                        "")?;
 
-                            self.write("
")?;
-                            Ok(())
+                                if let LiterateCodeKind::Output { placeholder_pic_id } = kind {
+                                    if !placeholder_pic_id.is_empty() {
+                                        self.write(
+                                                            "")?;
+                                    }
+                                }
+
+                                self.write("
")?;
+                                Ok(())
+                            }
                         }
-                    },
+                    }
                     CodeBlockKind::Indented => self.write("
"),
                 }
             }
@@ -416,7 +430,7 @@ where
                     },
                     _ => "</code></pre>\n",
                 })?;
-                self.in_code_block = false;
+                self.code_block_state = CodeBlockState::NotInCodeBlock;
             }
             Tag::List(Some(_)) => {
                 self.write("</ol>\n")?;
@@ -505,8 +519,20 @@ where
             }
         }
 
-        if self.in_code_block {
-            escape_html(&mut self.writer, text)?;
+        if let CodeBlockState::InCodeBlock(language) = &self.code_block_state {
+            let code_block_mode = language
+                .as_ref()
+                .map(|language| CodeBlockMode::parse(language));
+            let highlighting_language = code_block_mode
+                .as_ref()
+                .and_then(|mode| mode.highlighting_language());
+            let syntax =
+                highlighting_language.and_then(|language| self.config.syntaxes.get(language));
+            if let Some(syntax) = syntax {
+                highlight(&mut self.writer, syntax, text)?;
+            } else {
+                escape_html(&mut self.writer, text)?;
+            }
         } else {
             let mut parser = EmojiParser { text, position: 0 };
             while let Some(token) = parser.next_token() {
@@ -623,6 +649,16 @@ impl<'a> CodeBlockMode<'a> {
             CodeBlockMode::SyntaxHighlightOnly { language }
         }
     }
+
+    fn highlighting_language(&self) -> Option<&str> {
+        if let CodeBlockMode::LiterateProgram { language, .. }
+        | CodeBlockMode::SyntaxHighlightOnly { language } = self
+        {
+            Some(language)
+        } else {
+            None
+        }
+    }
 }
 
 /// Iterate over an `Iterator` of `Event`s, generate HTML for each `Event`, and
diff --git a/static/js/components/literate-programming/highlight.js b/static/js/components/literate-programming/highlight.js
index d3ec00d..59af6a1 100644
--- a/static/js/components/literate-programming/highlight.js
+++ b/static/js/components/literate-programming/highlight.js
@@ -1,6 +1,8 @@
 // This tokenizer is highly inspired by the one found in rxi's lite.
 // I highly recommend checking it out!
 // https://github.com/rxi/lite/blob/master/data/core/tokenizer.lua
+// There's also a mirror of it in the static generator, to enable highlighting of code blocks which
+// are *not* JavaScript-powered.
 
 export function compileSyntax(def) {
     for (let pattern of def.patterns) {
@@ -32,7 +34,7 @@ function tokenize(text, syntax) {
         let match;
         pattern.regex.lastIndex = i;
         if ((match = pattern.regex.exec(text)) != null) {
-            pushToken(tokens, pattern.as, match[0]); // TODO
+            pushToken(tokens, pattern.is, match[0]); // TODO
            i = pattern.regex.lastIndex;
            hadMatch = true;
            break;
diff --git a/static/syntax/javascript.json b/static/syntax/javascript.json
new file mode 100644
index 0000000..06dcde9
--- /dev/null
+++ b/static/syntax/javascript.json
@@ -0,0 +1,76 @@
+{
+    "patterns": [
+        { "regex": "\\/\\/.*", "is": "comment" },
+        {
+            "regex": "\\/\\*.*?\\*\\/",
+            "flags": ["dotMatchesNewline"],
+            "is": "comment"
+        },
+        { "regex": "[A-Z_][a-zA-Z0-9_]*", "is": "keyword2" },
+        {
+            "regex": "[a-zA-Z_][a-zA-Z0-9_]*(\\()",
+            "is": { "default": "function", "captures": ["default"] }
+        },
+        { "regex": "[a-zA-Z_][a-zA-Z0-9_]*", "is": "identifier" },
+        { "regex": "0[bB][01_]+n?", "is": "literal" },
+        { "regex": "0[oO][0-7_]+n?", "is": "literal" },
+        { "regex": "0[xX][0-9a-fA-F_]+n?", "is": "literal" },
+        { "regex": "[0-9_]+n", "is": "literal" },
+        { "regex": "[0-9_]+(\\.[0-9_]*([eE][-+]?[0-9_]+)?)?", "is": "literal" },
+        { "regex": "'(\\'|[^'])*'", "is": "string" },
+        { "regex": "\"(\\\"|[^\"])*\"", "is": "string" },
+        { "regex": "`(\\`|[^`])*`", "is": "string" },
+        { "regex": "[+=/*^%<>!~|&\\.?:-]+", "is": "operator" },
+        { "regex": "[,;]", "is": "punct" }
+    ],
+    "keywords": {
+        "as": { "into": "keyword1", "onlyReplaces": "identifier" },
+        "async": { "into": "keyword1", "onlyReplaces": "identifier" },
+        "await": { "into": "keyword1" },
+        "break": { "into": "keyword1" },
+        "case": { "into": "keyword1" },
+        "catch": { "into": "keyword1" },
+        "class": { "into": "keyword1" },
+        "const": { "into": "keyword1" },
+        "continue": { "into": "keyword1" },
+        "debugger": { "into": "keyword1" },
+        "default": { "into": "keyword1" },
+        "delete": { "into": "keyword1" },
+        "do": { "into": "keyword1" },
+        "else": { "into": "keyword1" },
+        "export": { "into": "keyword1" },
+        "extends": { "into": "keyword1" },
+        "finally": { "into": "keyword1" },
+        "for": { "into": "keyword1" },
+        "from": { "into": "keyword1", "onlyReplaces": "identifier" },
+        "function": { "into": "keyword1" },
+        "get": { "into": "keyword1", "onlyReplaces": "identifier" },
+        "if": { "into": "keyword1" },
+        "import": { "into": "keyword1" },
+        "in": { "into": "keyword1" },
+        "instanceof": { "into": "keyword1" },
+        "let": { "into": "keyword1" },
+        "new": { "into": "keyword1" },
+        "of": { "into": "keyword1", "onlyReplaces": "identifier" },
+        "return": { "into": "keyword1" },
+        "set": { "into": "keyword1", "onlyReplaces": "identifier" },
+        "static": { "into": "keyword1" },
+        "switch": { "into": "keyword1" },
+        "throw": { "into": "keyword1" },
+        "try": { "into": "keyword1" },
+        "typeof": { "into": "keyword1" },
+        "var": { "into": "keyword1" },
+        "void": { "into": "keyword1" },
+        "while": { "into": "keyword1" },
+        "with": { "into": "keyword1" },
+        "yield": { "into": "keyword1" },
+
+        "super": { "into": "keyword2" },
+        "this": { "into": "keyword2" },
+
+        "false": { "into": "literal" },
+        "true": { "into": "literal" },
+        "undefined": { "into": "literal" },
+        "null": { "into": "literal" }
+    }
+}