static syntax highlighting WIP

This commit is contained in:
liquidex 2024-03-10 23:23:50 +01:00
parent 7fd2d18b69
commit 5ab8ffdba2
12 changed files with 523 additions and 63 deletions

13
Cargo.lock generated
View file

@ -1160,9 +1160,9 @@ dependencies = [
[[package]]
name = "regex"
version = "1.9.3"
version = "1.10.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "81bc1d4caf89fac26a70747fe603c130093b53c773888797a6329091246d651a"
checksum = "b62dbe01f0b06f9d8dc7d49e05a0785f153b00b2c227856282f671e0318c9b15"
dependencies = [
"aho-corasick",
"memchr",
@ -1172,9 +1172,9 @@ dependencies = [
[[package]]
name = "regex-automata"
version = "0.3.6"
version = "0.4.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fed1ceff11a1dddaee50c9dc8e4938bd106e9d89ae372f192311e7da498e3b69"
checksum = "86b83b8b9847f9bf95ef68afb0b8e6cdb80f498442f5179a29fad448fcc1eaea"
dependencies = [
"aho-corasick",
"memchr",
@ -1183,9 +1183,9 @@ dependencies = [
[[package]]
name = "regex-syntax"
version = "0.7.4"
version = "0.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e5ea92a5b6195c6ef2a0295ea818b312502c6fc94dde986c5553242e18fd4ce2"
checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f"
[[package]]
name = "rustc-demangle"
@ -1580,6 +1580,7 @@ dependencies = [
"log",
"pulldown-cmark",
"rand",
"regex",
"serde",
"serde_json",
"tokio",

View file

@ -0,0 +1,36 @@
%% title = "syntax highlighting gallery"
- this is a page demonstrating syntaxes supported by the treehouse
- really there's not much more to it, but I use it for debugging + with it you can get a general feel for how I highlight things in the treehouse
- `javascript`
```javascript
// t is an existing tile index; variable name is short for brevity
export function removeRedundancies(t) {
if (isSet(t, SE) && (!isSet(t, S) || !isSet(t, E))) {
t &= ~SE;
}
if (isSet(t, SW) && (!isSet(t, S) || !isSet(t, W))) {
t &= ~SW;
}
if (isSet(t, NW) && (!isSet(t, N) || !isSet(t, W))) {
t &= ~NW;
}
if (isSet(t, NE) && (!isSet(t, N) || !isSet(t, E))) {
t &= ~NE;
}
return t;
}
/* This is
a multiline comment. */
ident Class CONSTANT funciton()
0b1010 0o01234567 0x0123456789ABCDEF
01234567
1.41e-3
'string' /**/ "string" /**/ `string`
+ - * / == != <= >= ! ~ || && . ? :
, ;
```

View file

@ -29,3 +29,4 @@ ulid = "1.0.0"
url = "2.5.0"
base64 = "0.21.7"
chrono = "0.4.35"
regex = "1.10.3"

View file

@ -361,6 +361,7 @@ pub fn generate(paths: &Paths<'_>) -> anyhow::Result<(Config, Treehouse)> {
config.site = std::env::var("TREEHOUSE_SITE").unwrap_or(config.site);
config.autopopulate_emoji(&paths.static_dir.join("emoji"))?;
config.autopopulate_pics(&paths.static_dir.join("pic"))?;
config.load_syntaxes(&paths.static_dir.join("syntax"))?;
info!("cleaning target directory");
let _ = std::fs::remove_dir_all(paths.target_dir);

View file

@ -1,9 +1,15 @@
use std::{collections::HashMap, ffi::OsStr, fs::File, io::BufReader, path::Path};
use anyhow::Context;
use log::debug;
use serde::{Deserialize, Serialize};
use walkdir::WalkDir;
use crate::html::highlight::{
compiled::{compile_syntax, CompiledSyntax},
Syntax,
};
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct Config {
/// Website root; used when generating links.
@ -48,6 +54,13 @@ pub struct Config {
/// On top of this, pics are autodiscovered by walking the `static/pic` directory.
/// Only the part before the first dash is treated as the pic's id.
pub pics: HashMap<String, String>,
/// Syntax definitions.
///
/// These are not part of the config file, but are loaded as part of site configuration from
/// `static/syntax`.
#[serde(skip)]
pub syntaxes: HashMap<String, CompiledSyntax>,
}
#[derive(Debug, Clone, Deserialize, Serialize)]
@ -138,6 +151,30 @@ impl Config {
self.pics.get(id).map(|x| &**x).unwrap_or("404.png")
)
}
/// Loads all syntax definition files.
pub fn load_syntaxes(&mut self, dir: &Path) -> anyhow::Result<()> {
for entry in WalkDir::new(dir) {
let entry = entry?;
if entry.path().extension() == Some(OsStr::new("json")) {
let name = entry
.path()
.file_stem()
.expect("syntax file name should have a stem")
.to_string_lossy();
debug!("loading syntax {name:?}");
let syntax: Syntax = serde_json::from_reader(BufReader::new(
File::open(entry.path()).context("could not open syntax file")?,
))
.context("could not deserialize syntax file")?;
let compiled = compile_syntax(&syntax);
self.syntaxes.insert(name.into_owned(), compiled);
}
}
Ok(())
}
}
/// Data derived from the config.

View file

@ -1,6 +1,7 @@
use std::fmt::{self, Display, Write};
pub mod breadcrumbs;
pub mod highlight;
mod markdown;
pub mod navmap;
pub mod tree;

View file

@ -0,0 +1,94 @@
//! Tokenizer and syntax highlighter inspired by the one found in rxi's lite.
//! I highly recommend checking it out!
//! https://github.com/rxi/lite/blob/master/data/core/tokenizer.lua
//! There's also a mirror of it in JavaScript, used to power dynamically editable code blocks.
//!
//! Both of these syntax highlighters use the same JSON syntax definitions; however this one is
//! more limited, in that patterns do not support backtracking.
//! This is effectively enforced in the dynamic highlighter because this highlighter reports any
//! regex syntax errors upon site compilation.
pub mod compiled;
pub mod tokenize;
use std::{collections::HashMap, io};
use pulldown_cmark::escape::{escape_html, StrWrite};
use serde::{Deserialize, Serialize};
use self::compiled::CompiledSyntax;
/// Syntax definition.
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct Syntax {
    /// Patterns, matched sequentially (patterns at the beginning of the list take precedence.)
    pub patterns: Vec<Pattern>,

    /// Map of replacements to use if a pattern matches a string exactly.
    pub keywords: HashMap<String, Keyword>,
}

/// A pattern in a syntax definition.
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct Pattern {
    /// Regular expression to match.
    pub regex: String,

    /// Flags to pass to the regex engine to alter how strings are matched.
    #[serde(default)]
    pub flags: Vec<RegexFlag>,

    /// Type to assign to the token. This can be any string, but only a select few have colors
    /// assigned.
    pub is: TokenTypes,
}

/// Assignable token types.
#[derive(Debug, Clone, Deserialize, Serialize)]
#[serde(untagged)]
pub enum TokenTypes {
    /// Assign a single token type to the entire match.
    FullMatch(String),
    /// Assign individual token types to each capture.
    Captures(CaptureTokenTypes),
}

/// Per-capture token type assignment, used by [`TokenTypes::Captures`].
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct CaptureTokenTypes {
    /// Token type to use outside captures.
    pub default: String,
    /// Token type to use inside captures.
    pub captures: Vec<String>,
}

/// Flag passed to the regex engine.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Deserialize, Serialize)]
#[serde(rename_all = "camelCase")]
pub enum RegexFlag {
    /// Make `.` match line separators.
    DotMatchesNewline,
}

/// Keyword replacement.
#[derive(Debug, Clone, Deserialize, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct Keyword {
    /// What to replace the token type with.
    pub into: String,

    /// Only replace the token type if it matches this one. If this is not present, any token type
    /// is replaced.
    pub only_replaces: Option<String>,
}
/// Renders `code` into `w` as HTML: one `<span>` per token, with the token's name as the
/// span's class (both the class and the token text are HTML-escaped.)
pub fn highlight(mut w: impl StrWrite, syntax: &CompiledSyntax, code: &str) -> io::Result<()> {
    for token in syntax.tokenize(code) {
        let class = &syntax.token_names[token.id];
        let text = &code[token.range];
        w.write_str("<span class=\"")?;
        escape_html(&mut w, class)?;
        w.write_str("\">")?;
        escape_html(&mut w, text)?;
        w.write_str("</span>")?;
    }
    Ok(())
}

View file

@ -0,0 +1,118 @@
use std::collections::HashMap;
use log::error;
use regex::{Regex, RegexBuilder};
use super::{RegexFlag, Syntax, TokenTypes};
/// During compilation, token names are converted to numeric IDs for performance.
pub type TokenId = usize;

/// ID of the fallback `"default"` token type; it is always interned first by `compile_syntax`.
pub const TOKEN_ID_DEFAULT: TokenId = 0;

/// A [`Syntax`] compiled into directly matchable form.
#[derive(Debug, Clone)]
pub struct CompiledSyntax {
    /// Lookup table which maps numeric IDs to token names.
    pub token_names: Vec<String>,
    /// Patterns in matching priority order (earlier patterns win.)
    pub patterns: Vec<CompiledPattern>,
    /// Keyword replacements, keyed by the exact token text they apply to.
    pub keywords: HashMap<String, CompiledKeyword>,
}

/// Compiled form of [`TokenTypes`], with token names replaced by interned IDs.
#[derive(Debug, Clone)]
pub enum CompiledTokenTypes {
    /// Assign a single token type to the entire match.
    FullMatch(TokenId),
    /// Assign individual token types to each capture.
    Captures(CompiledCaptureTokenTypes),
}

/// Compiled form of [`CaptureTokenTypes`].
#[derive(Debug, Clone)]
pub struct CompiledCaptureTokenTypes {
    /// Token type to use outside captures.
    pub default: TokenId,
    /// Token type to use inside captures, one per capture group.
    pub captures: Vec<TokenId>,
}

/// A single compiled pattern: its anchored regex and the token types it produces.
#[derive(Debug, Clone)]
pub struct CompiledPattern {
    pub regex: Regex,
    pub is: CompiledTokenTypes,
}

/// Compiled form of [`Keyword`].
#[derive(Debug, Clone)]
pub struct CompiledKeyword {
    /// Token type to replace the matched token's type with.
    pub into: TokenId,
    /// If present, only tokens of this type are replaced.
    pub only_replaces: Option<TokenId>,
}
/// Compiles a [`Syntax`] definition into its directly matchable form.
///
/// Token names are interned into numeric IDs (ID 0 is reserved for `"default"`.) Patterns
/// whose regex fails to compile are skipped with an error logged, rather than failing the
/// whole compilation.
pub fn compile_syntax(syntax: &Syntax) -> CompiledSyntax {
    // Pre-seed the table so that "default" gets TOKEN_ID_DEFAULT (0).
    let mut token_names = vec!["default".into()];
    // Interns `name`: returns its existing ID, or appends it and returns the new ID.
    // Linear search is fine here; syntaxes only have a handful of distinct token names.
    let mut get_token_id = |name: &str| -> TokenId {
        if let Some(id) = token_names.iter().position(|n| n == name) {
            id
        } else {
            let id = token_names.len();
            token_names.push(name.to_owned());
            id
        }
    };

    let patterns = syntax
        .patterns
        .iter()
        .filter_map(|pattern| {
            // NOTE: `regex` has no support for sticky flags, so we need to anchor the match to the
            // start ourselves.
            let regex = RegexBuilder::new(&format!(
                "^{}",
                // If there's an existing `^`, it should not cause compilation errors for the user.
                pattern.regex.strip_prefix('^').unwrap_or(&pattern.regex)
            ))
            .dot_matches_new_line(pattern.flags.contains(&RegexFlag::DotMatchesNewline))
            .build()
            .map_err(|e| {
                // NOTE: This could probably use better diagnostics, but it's pretty much
                // impossible to get a source span out of serde's output (because it forgoes
                // source information, rightfully so.) Therefore we have to settle on
                // a poor man's error log.
                error!("regex compilation error in pattern {pattern:?}: {e}");
            })
            .ok()?;
            Some(CompiledPattern {
                regex,
                is: match &pattern.is {
                    TokenTypes::FullMatch(name) => {
                        CompiledTokenTypes::FullMatch(get_token_id(name))
                    }
                    TokenTypes::Captures(types) => {
                        CompiledTokenTypes::Captures(CompiledCaptureTokenTypes {
                            default: get_token_id(&types.default),
                            captures: types
                                .captures
                                .iter()
                                .map(|name| get_token_id(name))
                                .collect(),
                        })
                    }
                },
            })
        })
        .collect();

    let keywords = syntax
        .keywords
        .iter()
        .map(|(text, keyword)| {
            (
                text.clone(),
                CompiledKeyword {
                    into: get_token_id(&keyword.into),
                    only_replaces: keyword.only_replaces.as_deref().map(&mut get_token_id),
                },
            )
        })
        .collect();

    CompiledSyntax {
        token_names,
        patterns,
        keywords,
    }
}

View file

@ -0,0 +1,57 @@
use std::ops::Range;
use super::compiled::{CompiledSyntax, CompiledTokenTypes, TokenId, TOKEN_ID_DEFAULT};
/// A single token produced by [`CompiledSyntax::tokenize`].
pub struct Token {
    /// Index into [`CompiledSyntax::token_names`], identifying the token's type.
    pub id: TokenId,
    /// Byte range of the token within the tokenized string.
    pub range: Range<usize>,
}
impl CompiledSyntax {
    /// Splits `text` into a flat list of non-overlapping tokens covering the whole string.
    ///
    /// At each position, patterns are tried in order and the first (non-empty) match wins.
    /// Text matched by no pattern becomes `TOKEN_ID_DEFAULT` tokens; adjacent tokens of the
    /// same type are merged by `push_token`. Finally, tokens whose text appears in the
    /// keyword table get their type replaced.
    pub fn tokenize(&self, text: &str) -> Vec<Token> {
        let mut tokens = vec![];

        let mut i = 0;
        while i < text.len() {
            let mut had_match = false;
            for pattern in &self.patterns {
                match &pattern.is {
                    CompiledTokenTypes::FullMatch(id) => {
                        if let Some(regex_match) = pattern.regex.find(&text[i..]) {
                            let len = regex_match.range().end;
                            // Reject zero-length matches: accepting one would not advance
                            // `i`, so the loop would never terminate.
                            if len > 0 {
                                push_token(&mut tokens, *id, i..i + len);
                                i += len;
                                had_match = true;
                                break;
                            }
                        }
                    }
                    CompiledTokenTypes::Captures(_) => { /* TODO */ }
                }
            }
            if !had_match {
                // Advance by a whole character, not a single byte; a mid-character byte
                // offset would make the keyword slicing below panic on non-ASCII text.
                let width = text[i..].chars().next().map_or(1, char::len_utf8);
                push_token(&mut tokens, TOKEN_ID_DEFAULT, i..i + width);
                i += width;
            }
        }

        for token in &mut tokens {
            if let Some(keyword) = self.keywords.get(&text[token.range.clone()]) {
                if keyword.only_replaces.is_none() || Some(token.id) == keyword.only_replaces {
                    token.id = keyword.into;
                }
            }
        }

        tokens
    }
}
/// Appends a token, merging it into the previous token when both share the same id.
fn push_token(tokens: &mut Vec<Token>, id: TokenId, range: Range<usize>) {
    match tokens.last_mut() {
        Some(previous) if previous.id == id => previous.range.end = range.end,
        _ => tokens.push(Token { id, range }),
    }
}

View file

@ -23,6 +23,7 @@
//! HTML renderer that takes an iterator of events as input.
use std::borrow::Borrow;
use std::collections::HashMap;
use std::io;
@ -31,6 +32,7 @@ use pulldown_cmark::{Alignment, CodeBlockKind, Event, LinkType, Tag};
use pulldown_cmark::{CowStr, Event::*};
use crate::config::{Config, ConfigDerivedData, PicSize};
use crate::html::highlight::highlight;
use crate::state::Treehouse;
enum TableState {
@ -38,6 +40,12 @@ enum TableState {
Body,
}
// Tracks whether the renderer is currently inside a code block, and if so, which language
// (fence info string) it was opened with.
#[derive(Debug, Clone, PartialEq, Eq)]
enum CodeBlockState<'a> {
    /// Not currently inside a code block.
    NotInCodeBlock,
    /// Inside a code block; the payload is the fenced block's language, `None` for
    /// indented (language-less) code blocks.
    InCodeBlock(Option<CowStr<'a>>),
}
struct HtmlWriter<'a, I, W> {
treehouse: &'a Treehouse,
config: &'a Config,
@ -58,7 +66,7 @@ struct HtmlWriter<'a, I, W> {
table_cell_index: usize,
numbers: HashMap<CowStr<'a>, usize>,
in_code_block: bool,
code_block_state: CodeBlockState<'a>,
}
impl<'a, I, W> HtmlWriter<'a, I, W>
@ -87,7 +95,7 @@ where
table_alignments: vec![],
table_cell_index: 0,
numbers: HashMap::new(),
in_code_block: false,
code_block_state: CodeBlockState::NotInCodeBlock,
}
}
@ -234,65 +242,71 @@ where
}
}
Tag::CodeBlock(info) => {
self.in_code_block = true;
self.code_block_state = CodeBlockState::InCodeBlock(None);
if !self.end_newline {
self.write_newline()?;
}
match info {
CodeBlockKind::Fenced(language) => match CodeBlockMode::parse(&language) {
CodeBlockMode::PlainText => self.write("<pre><code>"),
CodeBlockMode::SyntaxHighlightOnly { language } => {
self.write("<pre><code class=\"language-")?;
escape_html(&mut self.writer, language)?;
self.write("\">")
}
CodeBlockMode::LiterateProgram {
language,
kind,
program_name,
} => {
self.write(match &kind {
LiterateCodeKind::Input => {
"<th-literate-program data-mode=\"input\" "
}
LiterateCodeKind::Output { .. } => {
"<th-literate-program data-mode=\"output\" "
}
})?;
self.write("data-program=\"")?;
escape_href(&mut self.writer, self.page_id)?;
self.write(":")?;
escape_html(&mut self.writer, program_name)?;
self.write("\" data-language=\"")?;
escape_html(&mut self.writer, language)?;
self.write("\" role=\"code\">")?;
if let LiterateCodeKind::Output { placeholder_pic_id } = kind {
if !placeholder_pic_id.is_empty() {
self.write(
"<img class=\"placeholder-image\" loading=\"lazy\" src=\"",
)?;
escape_html(
&mut self.writer,
&self.config.pic_url(placeholder_pic_id),
)?;
self.write("\"")?;
if let Some(PicSize { width, height }) = self
.config_derived_data
.pic_size(self.config, placeholder_pic_id)
{
self.write(&format!(
" width=\"{width}\" height=\"{height}\""
))?;
}
self.write(">")?;
CodeBlockKind::Fenced(language) => {
self.code_block_state = CodeBlockState::InCodeBlock(Some(language.clone()));
match CodeBlockMode::parse(&language) {
CodeBlockMode::PlainText => self.write("<pre><code>"),
CodeBlockMode::SyntaxHighlightOnly { language } => {
self.write("<pre><code class=\"language-")?;
escape_html(&mut self.writer, language)?;
if self.config.syntaxes.contains_key(language) {
self.write(" th-syntax-highlighting")?;
}
self.write("\">")
}
CodeBlockMode::LiterateProgram {
language,
kind,
program_name,
} => {
self.write(match &kind {
LiterateCodeKind::Input => {
"<th-literate-program data-mode=\"input\" "
}
LiterateCodeKind::Output { .. } => {
"<th-literate-program data-mode=\"output\" "
}
})?;
self.write("data-program=\"")?;
escape_href(&mut self.writer, self.page_id)?;
self.write(":")?;
escape_html(&mut self.writer, program_name)?;
self.write("\" data-language=\"")?;
escape_html(&mut self.writer, language)?;
self.write("\" role=\"code\">")?;
self.write("<pre class=\"placeholder-console\">")?;
Ok(())
if let LiterateCodeKind::Output { placeholder_pic_id } = kind {
if !placeholder_pic_id.is_empty() {
self.write(
"<img class=\"placeholder-image\" loading=\"lazy\" src=\"",
)?;
escape_html(
&mut self.writer,
&self.config.pic_url(placeholder_pic_id),
)?;
self.write("\"")?;
if let Some(PicSize { width, height }) = self
.config_derived_data
.pic_size(self.config, placeholder_pic_id)
{
self.write(&format!(
" width=\"{width}\" height=\"{height}\""
))?;
}
self.write(">")?;
}
}
self.write("<pre class=\"placeholder-console\">")?;
Ok(())
}
}
},
}
CodeBlockKind::Indented => self.write("<pre><code>"),
}
}
@ -416,7 +430,7 @@ where
},
_ => "</code></pre>\n",
})?;
self.in_code_block = false;
self.code_block_state = CodeBlockState::NotInCodeBlock;
}
Tag::List(Some(_)) => {
self.write("</ol>\n")?;
@ -505,8 +519,20 @@ where
}
}
if self.in_code_block {
escape_html(&mut self.writer, text)?;
if let CodeBlockState::InCodeBlock(language) = &self.code_block_state {
let code_block_mode = language
.as_ref()
.map(|language| CodeBlockMode::parse(language));
let highlighting_language = code_block_mode
.as_ref()
.and_then(|mode| mode.highlighting_language());
let syntax =
highlighting_language.and_then(|language| self.config.syntaxes.get(language));
if let Some(syntax) = syntax {
highlight(&mut self.writer, syntax, text)?;
} else {
escape_html(&mut self.writer, text)?;
}
} else {
let mut parser = EmojiParser { text, position: 0 };
while let Some(token) = parser.next_token() {
@ -623,6 +649,16 @@ impl<'a> CodeBlockMode<'a> {
CodeBlockMode::SyntaxHighlightOnly { language }
}
}
/// Returns the language to use for syntax highlighting, or `None` for plain-text blocks.
fn highlighting_language(&self) -> Option<&str> {
    match self {
        CodeBlockMode::LiterateProgram { language, .. }
        | CodeBlockMode::SyntaxHighlightOnly { language } => Some(language),
        _ => None,
    }
}
}
/// Iterate over an `Iterator` of `Event`s, generate HTML for each `Event`, and

View file

@ -1,6 +1,8 @@
// This tokenizer is highly inspired by the one found in rxi's lite.
// I highly recommend checking it out!
// https://github.com/rxi/lite/blob/master/data/core/tokenizer.lua
// There's also a mirror of it in the static generator, to enable highlighting of code blocks which
// are *not* JavaScript-powered.
export function compileSyntax(def) {
for (let pattern of def.patterns) {
@ -32,7 +34,7 @@ function tokenize(text, syntax) {
let match;
pattern.regex.lastIndex = i;
if ((match = pattern.regex.exec(text)) != null) {
pushToken(tokens, pattern.as, match[0]); // TODO
pushToken(tokens, pattern.is, match[0]); // TODO
i = pattern.regex.lastIndex;
hadMatch = true;
break;

View file

@ -0,0 +1,76 @@
{
"patterns": [
{ "regex": "\\/\\/.*", "is": "comment" },
{
"regex": "\\/\\*.*?\\*\\/",
"flags": ["dotMatchesNewline"],
"is": "comment"
},
{ "regex": "[A-Z_][a-zA-Z0-9_]*", "is": "keyword2" },
{
"regex": "[a-zA-Z_][a-zA-Z0-9_]*(\\()",
"is": { "default": "function", "captures": ["default"] }
},
{ "regex": "[a-zA-Z_][a-zA-Z0-9_]*", "is": "identifier" },
{ "regex": "0[bB][01_]+n?", "is": "literal" },
{ "regex": "0[oO][0-7_]+n?", "is": "literal" },
{ "regex": "0[xX][0-9a-fA-F_]+n?", "is": "literal" },
{ "regex": "[0-9_]+n", "is": "literal" },
{ "regex": "[0-9_]+(\\.[0-9_]*([eE][-+]?[0-9_]+)?)?", "is": "literal" },
{ "regex": "'(\\\\'|[^'])*'", "is": "string" },
{ "regex": "\"(\\\\\"|[^\"])*\"", "is": "string" },
{ "regex": "`(\\\\`|[^`])*`", "is": "string" },
{ "regex": "[+=/*^%<>!~|&\\.?:-]+", "is": "operator" },
{ "regex": "[,;]", "is": "punct" }
],
"keywords": {
"as": { "into": "keyword1", "onlyReplaces": "identifier" },
"async": { "into": "keyword1", "onlyReplaces": "identifier" },
"await": { "into": "keyword1" },
"break": { "into": "keyword1" },
"case": { "into": "keyword1" },
"catch": { "into": "keyword1" },
"class": { "into": "keyword1" },
"const": { "into": "keyword1" },
"continue": { "into": "keyword1" },
"debugger": { "into": "keyword1" },
"default": { "into": "keyword1" },
"delete": { "into": "keyword1" },
"do": { "into": "keyword1" },
"else": { "into": "keyword1" },
"export": { "into": "keyword1" },
"extends": { "into": "keyword1" },
"finally": { "into": "keyword1" },
"for": { "into": "keyword1" },
"from": { "into": "keyword1", "onlyReplaces": "identifier" },
"function": { "into": "keyword1" },
"get": { "into": "keyword1", "onlyReplaces": "identifier" },
"if": { "into": "keyword1" },
"import": { "into": "keyword1" },
"in": { "into": "keyword1" },
"instanceof": { "into": "keyword1" },
"let": { "into": "keyword1" },
"new": { "into": "keyword1" },
"of": { "into": "keyword1", "onlyReplaces": "identifier" },
"return": { "into": "keyword1" },
"set": { "into": "keyword1", "onlyReplaces": "identifier" },
"static": { "into": "keyword1" },
"switch": { "into": "keyword1" },
"throw": { "into": "keyword1" },
"try": { "into": "keyword1" },
"typeof": { "into": "keyword1" },
"var": { "into": "keyword1" },
"void": { "into": "keyword1" },
"while": { "into": "keyword1" },
"with": { "into": "keyword1" },
"yield": { "into": "keyword1" },
"super": { "into": "keyword2" },
"this": { "into": "keyword2" },
"false": { "into": "literal" },
"true": { "into": "literal" },
"undefined": { "into": "literal" },
"null": { "into": "literal" }
}
}