static syntax highlighting WIP

This commit is contained in:
liquidex 2024-03-10 23:23:50 +01:00
parent 7fd2d18b69
commit 5ab8ffdba2
12 changed files with 523 additions and 63 deletions

13
Cargo.lock generated
View file

@ -1160,9 +1160,9 @@ dependencies = [
[[package]] [[package]]
name = "regex" name = "regex"
version = "1.9.3" version = "1.10.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "81bc1d4caf89fac26a70747fe603c130093b53c773888797a6329091246d651a" checksum = "b62dbe01f0b06f9d8dc7d49e05a0785f153b00b2c227856282f671e0318c9b15"
dependencies = [ dependencies = [
"aho-corasick", "aho-corasick",
"memchr", "memchr",
@ -1172,9 +1172,9 @@ dependencies = [
[[package]] [[package]]
name = "regex-automata" name = "regex-automata"
version = "0.3.6" version = "0.4.6"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fed1ceff11a1dddaee50c9dc8e4938bd106e9d89ae372f192311e7da498e3b69" checksum = "86b83b8b9847f9bf95ef68afb0b8e6cdb80f498442f5179a29fad448fcc1eaea"
dependencies = [ dependencies = [
"aho-corasick", "aho-corasick",
"memchr", "memchr",
@ -1183,9 +1183,9 @@ dependencies = [
[[package]] [[package]]
name = "regex-syntax" name = "regex-syntax"
version = "0.7.4" version = "0.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e5ea92a5b6195c6ef2a0295ea818b312502c6fc94dde986c5553242e18fd4ce2" checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f"
[[package]] [[package]]
name = "rustc-demangle" name = "rustc-demangle"
@ -1580,6 +1580,7 @@ dependencies = [
"log", "log",
"pulldown-cmark", "pulldown-cmark",
"rand", "rand",
"regex",
"serde", "serde",
"serde_json", "serde_json",
"tokio", "tokio",

View file

@ -0,0 +1,36 @@
%% title = "syntax highlighting gallery"
- this is a page demonstrating syntaxes supported by the treehouse
- really there's not much more to it, but I use it for debugging + with it you can get a general feel for how I highlight things in the treehouse
- `javascript`
```javascript
// t is an existing tile index; variable name is short for brevity
export function removeRedundancies(t) {
if (isSet(t, SE) && (!isSet(t, S) || !isSet(t, E))) {
t &= ~SE;
}
if (isSet(t, SW) && (!isSet(t, S) || !isSet(t, W))) {
t &= ~SW;
}
if (isSet(t, NW) && (!isSet(t, N) || !isSet(t, W))) {
t &= ~NW;
}
if (isSet(t, NE) && (!isSet(t, N) || !isSet(t, E))) {
t &= ~NE;
}
return t;
}
/* This is
a multiline comment. */
ident Class CONSTANT funciton()
0b1010 0o01234567 0x0123456789ABCDEF
01234567
1.41e-3
'string' /**/ "string" /**/ `string`
+ - * / == != <= >= ! ~ || && . ? :
, ;
```

View file

@ -29,3 +29,4 @@ ulid = "1.0.0"
url = "2.5.0" url = "2.5.0"
base64 = "0.21.7" base64 = "0.21.7"
chrono = "0.4.35" chrono = "0.4.35"
regex = "1.10.3"

View file

@ -361,6 +361,7 @@ pub fn generate(paths: &Paths<'_>) -> anyhow::Result<(Config, Treehouse)> {
config.site = std::env::var("TREEHOUSE_SITE").unwrap_or(config.site); config.site = std::env::var("TREEHOUSE_SITE").unwrap_or(config.site);
config.autopopulate_emoji(&paths.static_dir.join("emoji"))?; config.autopopulate_emoji(&paths.static_dir.join("emoji"))?;
config.autopopulate_pics(&paths.static_dir.join("pic"))?; config.autopopulate_pics(&paths.static_dir.join("pic"))?;
config.load_syntaxes(&paths.static_dir.join("syntax"))?;
info!("cleaning target directory"); info!("cleaning target directory");
let _ = std::fs::remove_dir_all(paths.target_dir); let _ = std::fs::remove_dir_all(paths.target_dir);

View file

@ -1,9 +1,15 @@
use std::{collections::HashMap, ffi::OsStr, fs::File, io::BufReader, path::Path}; use std::{collections::HashMap, ffi::OsStr, fs::File, io::BufReader, path::Path};
use anyhow::Context; use anyhow::Context;
use log::debug;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use walkdir::WalkDir; use walkdir::WalkDir;
use crate::html::highlight::{
compiled::{compile_syntax, CompiledSyntax},
Syntax,
};
#[derive(Debug, Clone, Deserialize, Serialize)] #[derive(Debug, Clone, Deserialize, Serialize)]
pub struct Config { pub struct Config {
/// Website root; used when generating links. /// Website root; used when generating links.
@ -48,6 +54,13 @@ pub struct Config {
/// On top of this, pics are autodiscovered by walking the `static/pic` directory. /// On top of this, pics are autodiscovered by walking the `static/pic` directory.
/// Only the part before the first dash is treated as the pic's id. /// Only the part before the first dash is treated as the pic's id.
pub pics: HashMap<String, String>, pub pics: HashMap<String, String>,
/// Syntax definitions.
///
/// These are not part of the config file, but are loaded as part of site configuration from
/// `static/syntax`.
#[serde(skip)]
pub syntaxes: HashMap<String, CompiledSyntax>,
} }
#[derive(Debug, Clone, Deserialize, Serialize)] #[derive(Debug, Clone, Deserialize, Serialize)]
@ -138,6 +151,30 @@ impl Config {
self.pics.get(id).map(|x| &**x).unwrap_or("404.png") self.pics.get(id).map(|x| &**x).unwrap_or("404.png")
) )
} }
/// Loads all syntax definition files.
pub fn load_syntaxes(&mut self, dir: &Path) -> anyhow::Result<()> {
for entry in WalkDir::new(dir) {
let entry = entry?;
if entry.path().extension() == Some(OsStr::new("json")) {
let name = entry
.path()
.file_stem()
.expect("syntax file name should have a stem")
.to_string_lossy();
debug!("loading syntax {name:?}");
let syntax: Syntax = serde_json::from_reader(BufReader::new(
File::open(entry.path()).context("could not open syntax file")?,
))
.context("could not deserialize syntax file")?;
let compiled = compile_syntax(&syntax);
self.syntaxes.insert(name.into_owned(), compiled);
}
}
Ok(())
}
} }
/// Data derived from the config. /// Data derived from the config.

View file

@ -1,6 +1,7 @@
use std::fmt::{self, Display, Write}; use std::fmt::{self, Display, Write};
pub mod breadcrumbs; pub mod breadcrumbs;
pub mod highlight;
mod markdown; mod markdown;
pub mod navmap; pub mod navmap;
pub mod tree; pub mod tree;

View file

@ -0,0 +1,94 @@
//! Tokenizer and syntax highlighter inspired by the one found in rxi's lite.
//! I highly recommend checking it out!
//! <https://github.com/rxi/lite/blob/master/data/core/tokenizer.lua>
//! There's also a mirror of it in JavaScript, used to power dynamically editable code blocks.
//!
//! Both of these syntax highlighters use the same JSON syntax definitions; however this one is
//! more limited, in that patterns do not support backtracking.
//! This is effectively enforced in the dynamic highlighter because this highlighter reports any
//! regex syntax errors upon site compilation.
pub mod compiled;
pub mod tokenize;
use std::{collections::HashMap, io};
use pulldown_cmark::escape::{escape_html, StrWrite};
use serde::{Deserialize, Serialize};
use self::compiled::CompiledSyntax;
/// Syntax definition.
///
/// Deserialized from the JSON files under `static/syntax`; the same files also drive the
/// JavaScript highlighter (see the module docs.)
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct Syntax {
    /// Patterns, matched sequentially (patterns at the beginning of the list take precedence.)
    pub patterns: Vec<Pattern>,
    /// Map of replacements to use if a pattern matches a string exactly.
    pub keywords: HashMap<String, Keyword>,
}
/// A pattern in a syntax definition.
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct Pattern {
    /// Regular expression to match.
    ///
    /// During compilation the regex is anchored to the current tokenization position (a `^` is
    /// prepended; a user-written leading `^` is stripped first so it does not break compilation.)
    pub regex: String,
    /// Flags to pass to the regex engine to alter how strings are matched.
    #[serde(default)]
    pub flags: Vec<RegexFlag>,
    /// Type to assign to the token. This can be any string, but only a select few have colors
    /// assigned.
    pub is: TokenTypes,
}
/// Assignable token types.
///
/// Because of `#[serde(untagged)]`, this deserializes either from a plain string (full-match
/// form) or from an object with `default` and `captures` fields (per-capture form.)
#[derive(Debug, Clone, Deserialize, Serialize)]
#[serde(untagged)]
pub enum TokenTypes {
    /// Assign a single token type to the entire match.
    FullMatch(String),
    /// Assign individual token types to each capture.
    Captures(CaptureTokenTypes),
}
/// Per-capture token type assignment, used by the [`TokenTypes::Captures`] form.
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct CaptureTokenTypes {
    /// Token type to use outside captures.
    pub default: String,
    /// Token type to use inside captures.
    pub captures: Vec<String>,
}
/// Flag passed to the regex engine.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Deserialize, Serialize)]
#[serde(rename_all = "camelCase")]
pub enum RegexFlag {
    /// Make `.` match line separators.
    /// (Spelled `dotMatchesNewline` in the JSON files, due to `rename_all = "camelCase"`.)
    DotMatchesNewline,
}
/// Keyword replacement.
///
/// After tokenization, any token whose text is an exact key of [`Syntax::keywords`] gets its
/// token type replaced according to this.
#[derive(Debug, Clone, Deserialize, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct Keyword {
    /// What to replace the token type with.
    pub into: String,
    /// Only replace the token type if it matches this one. If this is not present, any token type
    /// is replaced.
    /// (Spelled `onlyReplaces` in the JSON files, due to `rename_all = "camelCase"`.)
    pub only_replaces: Option<String>,
}
/// Highlights `code` with the given compiled syntax, writing HTML to `w`.
///
/// Each token is emitted as `<span class="NAME">TEXT</span>`, with both the token name and the
/// token's source text HTML-escaped.
pub fn highlight(mut w: impl StrWrite, syntax: &CompiledSyntax, code: &str) -> io::Result<()> {
    for token in syntax.tokenize(code) {
        let class = &syntax.token_names[token.id];
        w.write_str("<span class=\"")?;
        escape_html(&mut w, class)?;
        w.write_str("\">")?;
        escape_html(&mut w, &code[token.range])?;
        w.write_str("</span>")?;
    }
    Ok(())
}

View file

@ -0,0 +1,118 @@
use std::collections::HashMap;
use log::error;
use regex::{Regex, RegexBuilder};
use super::{RegexFlag, Syntax, TokenTypes};
/// During compilation, token names are converted to numeric IDs for performance.
pub type TokenId = usize;
/// ID of the built-in "default" token type; always index 0 of `CompiledSyntax::token_names`.
pub const TOKEN_ID_DEFAULT: TokenId = 0;
/// A [`Syntax`] compiled into its executable form by [`compile_syntax`].
#[derive(Debug, Clone)]
pub struct CompiledSyntax {
    /// Lookup table which maps numeric IDs to token names.
    pub token_names: Vec<String>,
    /// Compiled patterns, tried in declaration order during tokenization.
    pub patterns: Vec<CompiledPattern>,
    /// Keyword replacements, keyed by the exact token text they apply to.
    pub keywords: HashMap<String, CompiledKeyword>,
}
/// Compiled version of [`TokenTypes`], with token names resolved to numeric IDs.
#[derive(Debug, Clone)]
pub enum CompiledTokenTypes {
    /// Assign a single token type to the entire match.
    FullMatch(TokenId),
    /// Assign individual token types to each capture.
    Captures(CompiledCaptureTokenTypes),
}
/// Compiled version of `CaptureTokenTypes`, with token names resolved to numeric IDs.
#[derive(Debug, Clone)]
pub struct CompiledCaptureTokenTypes {
    /// Token type to use outside captures.
    pub default: TokenId,
    /// Token type to use inside captures.
    pub captures: Vec<TokenId>,
}
/// A single compiled pattern: the regex to try, plus the token types it assigns on a match.
#[derive(Debug, Clone)]
pub struct CompiledPattern {
    /// The compiled regex, anchored with `^` by [`compile_syntax`].
    pub regex: Regex,
    /// Token type(s) assigned when the regex matches.
    pub is: CompiledTokenTypes,
}
/// Compiled version of `Keyword`, with token names resolved to numeric IDs.
#[derive(Debug, Clone)]
pub struct CompiledKeyword {
    /// Token ID to replace the matching token's type with.
    pub into: TokenId,
    /// If present, only replace tokens whose current type ID equals this one.
    pub only_replaces: Option<TokenId>,
}
pub fn compile_syntax(syntax: &Syntax) -> CompiledSyntax {
let mut token_names = vec!["default".into()];
let mut get_token_id = |name: &str| -> TokenId {
if let Some(id) = token_names.iter().position(|n| n == name) {
id
} else {
let id = token_names.len();
token_names.push(name.to_owned());
id
}
};
let patterns = syntax
.patterns
.iter()
.filter_map(|pattern| {
// NOTE: `regex` has no support for sticky flags, so we need to anchor the match to the
// start ourselves.
let regex = RegexBuilder::new(&format!(
"^{}",
// If there's an existing `^`, it should not cause compilation errors for the user.
pattern.regex.strip_prefix('^').unwrap_or(&pattern.regex)
))
.dot_matches_new_line(pattern.flags.contains(&RegexFlag::DotMatchesNewline))
.build()
.map_err(|e| {
// NOTE: This could probably use better diagnostics, but it's pretty much
// impossible to get a source span out of serde's output (because it forgoes
// source information, rightfully so.) Therefore we have to settle on
// a poor man's error log.
error!("regex compilation error in pattern {pattern:?}: {e}");
})
.ok()?;
Some(CompiledPattern {
regex,
is: match &pattern.is {
TokenTypes::FullMatch(name) => {
CompiledTokenTypes::FullMatch(get_token_id(name))
}
TokenTypes::Captures(types) => {
CompiledTokenTypes::Captures(CompiledCaptureTokenTypes {
default: get_token_id(&types.default),
captures: types
.captures
.iter()
.map(|name| get_token_id(name))
.collect(),
})
}
},
})
})
.collect();
let keywords = syntax
.keywords
.iter()
.map(|(text, keyword)| {
(
text.clone(),
CompiledKeyword {
into: get_token_id(&keyword.into),
only_replaces: keyword.only_replaces.as_deref().map(&mut get_token_id),
},
)
})
.collect();
CompiledSyntax {
token_names,
patterns,
keywords,
}
}

View file

@ -0,0 +1,57 @@
use std::ops::Range;
use super::compiled::{CompiledSyntax, CompiledTokenTypes, TokenId, TOKEN_ID_DEFAULT};
/// A single token produced by tokenization: a token type ID plus the byte range it covers.
pub struct Token {
    /// Token type; an index into `CompiledSyntax::token_names`.
    pub id: TokenId,
    /// Byte range of the token within the tokenized string.
    pub range: Range<usize>,
}
impl CompiledSyntax {
    /// Splits `text` into a flat list of [`Token`]s covering the whole string.
    ///
    /// At each position, patterns are tried in order and the first match wins. Text that no
    /// pattern matches becomes `TOKEN_ID_DEFAULT` tokens (adjacent ones are merged by
    /// `push_token`.) Afterwards, tokens whose text is a keyword get their type replaced.
    pub fn tokenize(&self, text: &str) -> Vec<Token> {
        let mut tokens = vec![];

        let mut i = 0;
        while i < text.len() {
            let mut had_match = false;
            for pattern in &self.patterns {
                match &pattern.is {
                    CompiledTokenTypes::FullMatch(id) => {
                        if let Some(regex_match) = pattern.regex.find(&text[i..]) {
                            // Ignore zero-length matches (e.g. from a pattern like `a*`);
                            // accepting one would advance `i` by zero bytes and loop forever.
                            if regex_match.range().is_empty() {
                                continue;
                            }
                            push_token(&mut tokens, *id, i..i + regex_match.range().end);
                            i += regex_match.range().end;
                            had_match = true;
                            break;
                        }
                    }
                    CompiledTokenTypes::Captures(_types) => { /* TODO */ }
                }
            }
            if !had_match {
                // Advance by a whole character, not a single byte; otherwise `i` can land in
                // the middle of a multi-byte UTF-8 sequence, and the string slicing above (and
                // in the keyword pass below) would panic.
                let char_len = text[i..].chars().next().map_or(1, char::len_utf8);
                push_token(&mut tokens, TOKEN_ID_DEFAULT, i..i + char_len);
                i += char_len;
            }
        }

        for token in &mut tokens {
            if let Some(keyword) = self.keywords.get(&text[token.range.clone()]) {
                if keyword.only_replaces.is_none() || Some(token.id) == keyword.only_replaces {
                    token.id = keyword.into;
                }
            }
        }

        tokens
    }
}
/// Appends a token to `tokens`, coalescing it into the previous token when both share an ID.
fn push_token(tokens: &mut Vec<Token>, id: TokenId, range: Range<usize>) {
    match tokens.last_mut() {
        Some(previous) if previous.id == id => previous.range.end = range.end,
        _ => tokens.push(Token { id, range }),
    }
}

View file

@ -23,6 +23,7 @@
//! HTML renderer that takes an iterator of events as input. //! HTML renderer that takes an iterator of events as input.
use std::borrow::Borrow;
use std::collections::HashMap; use std::collections::HashMap;
use std::io; use std::io;
@ -31,6 +32,7 @@ use pulldown_cmark::{Alignment, CodeBlockKind, Event, LinkType, Tag};
use pulldown_cmark::{CowStr, Event::*}; use pulldown_cmark::{CowStr, Event::*};
use crate::config::{Config, ConfigDerivedData, PicSize}; use crate::config::{Config, ConfigDerivedData, PicSize};
use crate::html::highlight::highlight;
use crate::state::Treehouse; use crate::state::Treehouse;
enum TableState { enum TableState {
@ -38,6 +40,12 @@ enum TableState {
Body, Body,
} }
/// Tracks whether the HTML writer is currently inside a code block.
#[derive(Debug, Clone, PartialEq, Eq)]
enum CodeBlockState<'a> {
    /// Not inside any code block.
    NotInCodeBlock,
    /// Inside a code block. The payload is `Some` once a fenced block's language string has been
    /// recorded, `None` otherwise (e.g. for indented code blocks.)
    InCodeBlock(Option<CowStr<'a>>),
}
struct HtmlWriter<'a, I, W> { struct HtmlWriter<'a, I, W> {
treehouse: &'a Treehouse, treehouse: &'a Treehouse,
config: &'a Config, config: &'a Config,
@ -58,7 +66,7 @@ struct HtmlWriter<'a, I, W> {
table_cell_index: usize, table_cell_index: usize,
numbers: HashMap<CowStr<'a>, usize>, numbers: HashMap<CowStr<'a>, usize>,
in_code_block: bool, code_block_state: CodeBlockState<'a>,
} }
impl<'a, I, W> HtmlWriter<'a, I, W> impl<'a, I, W> HtmlWriter<'a, I, W>
@ -87,7 +95,7 @@ where
table_alignments: vec![], table_alignments: vec![],
table_cell_index: 0, table_cell_index: 0,
numbers: HashMap::new(), numbers: HashMap::new(),
in_code_block: false, code_block_state: CodeBlockState::NotInCodeBlock,
} }
} }
@ -234,16 +242,21 @@ where
} }
} }
Tag::CodeBlock(info) => { Tag::CodeBlock(info) => {
self.in_code_block = true; self.code_block_state = CodeBlockState::InCodeBlock(None);
if !self.end_newline { if !self.end_newline {
self.write_newline()?; self.write_newline()?;
} }
match info { match info {
CodeBlockKind::Fenced(language) => match CodeBlockMode::parse(&language) { CodeBlockKind::Fenced(language) => {
self.code_block_state = CodeBlockState::InCodeBlock(Some(language.clone()));
match CodeBlockMode::parse(&language) {
CodeBlockMode::PlainText => self.write("<pre><code>"), CodeBlockMode::PlainText => self.write("<pre><code>"),
CodeBlockMode::SyntaxHighlightOnly { language } => { CodeBlockMode::SyntaxHighlightOnly { language } => {
self.write("<pre><code class=\"language-")?; self.write("<pre><code class=\"language-")?;
escape_html(&mut self.writer, language)?; escape_html(&mut self.writer, language)?;
if self.config.syntaxes.contains_key(language) {
self.write(" th-syntax-highlighting")?;
}
self.write("\">") self.write("\">")
} }
CodeBlockMode::LiterateProgram { CodeBlockMode::LiterateProgram {
@ -292,7 +305,8 @@ where
self.write("<pre class=\"placeholder-console\">")?; self.write("<pre class=\"placeholder-console\">")?;
Ok(()) Ok(())
} }
}, }
}
CodeBlockKind::Indented => self.write("<pre><code>"), CodeBlockKind::Indented => self.write("<pre><code>"),
} }
} }
@ -416,7 +430,7 @@ where
}, },
_ => "</code></pre>\n", _ => "</code></pre>\n",
})?; })?;
self.in_code_block = false; self.code_block_state = CodeBlockState::NotInCodeBlock;
} }
Tag::List(Some(_)) => { Tag::List(Some(_)) => {
self.write("</ol>\n")?; self.write("</ol>\n")?;
@ -505,8 +519,20 @@ where
} }
} }
if self.in_code_block { if let CodeBlockState::InCodeBlock(language) = &self.code_block_state {
let code_block_mode = language
.as_ref()
.map(|language| CodeBlockMode::parse(language));
let highlighting_language = code_block_mode
.as_ref()
.and_then(|mode| mode.highlighting_language());
let syntax =
highlighting_language.and_then(|language| self.config.syntaxes.get(language));
if let Some(syntax) = syntax {
highlight(&mut self.writer, syntax, text)?;
} else {
escape_html(&mut self.writer, text)?; escape_html(&mut self.writer, text)?;
}
} else { } else {
let mut parser = EmojiParser { text, position: 0 }; let mut parser = EmojiParser { text, position: 0 };
while let Some(token) = parser.next_token() { while let Some(token) = parser.next_token() {
@ -623,6 +649,16 @@ impl<'a> CodeBlockMode<'a> {
CodeBlockMode::SyntaxHighlightOnly { language } CodeBlockMode::SyntaxHighlightOnly { language }
} }
} }
/// Returns the language this mode should be syntax-highlighted as, if any.
fn highlighting_language(&self) -> Option<&str> {
    match self {
        CodeBlockMode::LiterateProgram { language, .. }
        | CodeBlockMode::SyntaxHighlightOnly { language } => Some(language),
        _ => None,
    }
}
} }
/// Iterate over an `Iterator` of `Event`s, generate HTML for each `Event`, and /// Iterate over an `Iterator` of `Event`s, generate HTML for each `Event`, and

View file

@ -1,6 +1,8 @@
// This tokenizer is highly inspired by the one found in rxi's lite. // This tokenizer is highly inspired by the one found in rxi's lite.
// I highly recommend checking it out! // I highly recommend checking it out!
// https://github.com/rxi/lite/blob/master/data/core/tokenizer.lua // https://github.com/rxi/lite/blob/master/data/core/tokenizer.lua
// There's also a mirror of it in the static generator, to enable highlighting of code blocks which
// are *not* JavaScript-powered.
export function compileSyntax(def) { export function compileSyntax(def) {
for (let pattern of def.patterns) { for (let pattern of def.patterns) {
@ -32,7 +34,7 @@ function tokenize(text, syntax) {
let match; let match;
pattern.regex.lastIndex = i; pattern.regex.lastIndex = i;
if ((match = pattern.regex.exec(text)) != null) { if ((match = pattern.regex.exec(text)) != null) {
pushToken(tokens, pattern.as, match[0]); // TODO pushToken(tokens, pattern.is, match[0]); // TODO
i = pattern.regex.lastIndex; i = pattern.regex.lastIndex;
hadMatch = true; hadMatch = true;
break; break;

View file

@ -0,0 +1,76 @@
{
"patterns": [
{ "regex": "\\/\\/.*", "is": "comment" },
{
"regex": "\\/\\*.*?\\*\\/",
"flags": ["dotMatchesNewline"],
"is": "comment"
},
{ "regex": "[A-Z_][a-zA-Z0-9_]*", "is": "keyword2" },
{
"regex": "[a-zA-Z_][a-zA-Z0-9_]*(\\()",
"is": { "default": "function", "captures": ["default"] }
},
{ "regex": "[a-zA-Z_][a-zA-Z0-9_]*", "is": "identifier" },
{ "regex": "0[bB][01_]+n?", "is": "literal" },
{ "regex": "0[oO][0-7_]+n?", "is": "literal" },
{ "regex": "0[xX][0-9a-fA-F_]+n?", "is": "literal" },
{ "regex": "[0-9_]+n", "is": "literal" },
{ "regex": "[0-9_]+(\\.[0-9_]*([eE][-+]?[0-9_]+)?)?", "is": "literal" },
{ "regex": "'(\\\\'|[^'])*'", "is": "string" },
{ "regex": "\"(\\\\\"|[^\"])*\"", "is": "string" },
{ "regex": "`(\\\\`|[^`])*`", "is": "string" },
{ "regex": "[+=/*^%<>!~|&\\.?:-]+", "is": "operator" },
{ "regex": "[,;]", "is": "punct" }
],
"keywords": {
"as": { "into": "keyword1", "onlyReplaces": "identifier" },
"async": { "into": "keyword1", "onlyReplaces": "identifier" },
"await": { "into": "keyword1" },
"break": { "into": "keyword1" },
"case": { "into": "keyword1" },
"catch": { "into": "keyword1" },
"class": { "into": "keyword1" },
"const": { "into": "keyword1" },
"continue": { "into": "keyword1" },
"debugger": { "into": "keyword1" },
"default": { "into": "keyword1" },
"delete": { "into": "keyword1" },
"do": { "into": "keyword1" },
"else": { "into": "keyword1" },
"export": { "into": "keyword1" },
"extends": { "into": "keyword1" },
"finally": { "into": "keyword1" },
"for": { "into": "keyword1" },
"from": { "into": "keyword1", "onlyReplaces": "identifier" },
"function": { "into": "keyword1" },
"get": { "into": "keyword1", "onlyReplaces": "identifier" },
"if": { "into": "keyword1" },
"import": { "into": "keyword1" },
"in": { "into": "keyword1" },
"instanceof": { "into": "keyword1" },
"let": { "into": "keyword1" },
"new": { "into": "keyword1" },
"of": { "into": "keyword1", "onlyReplaces": "identifier" },
"return": { "into": "keyword1" },
"set": { "into": "keyword1", "onlyReplaces": "identifier" },
"static": { "into": "keyword1" },
"switch": { "into": "keyword1" },
"throw": { "into": "keyword1" },
"try": { "into": "keyword1" },
"typeof": { "into": "keyword1" },
"var": { "into": "keyword1" },
"void": { "into": "keyword1" },
"while": { "into": "keyword1" },
"with": { "into": "keyword1" },
"yield": { "into": "keyword1" },
"super": { "into": "keyword2" },
"this": { "into": "keyword2" },
"false": { "into": "literal" },
"true": { "into": "literal" },
"undefined": { "into": "literal" },
"null": { "into": "literal" }
}
}