static syntax highlighting WIP
This commit is contained in:
parent
7fd2d18b69
commit
5ab8ffdba2
12 changed files with 523 additions and 63 deletions
|
@ -29,3 +29,4 @@ ulid = "1.0.0"
|
|||
url = "2.5.0"
|
||||
base64 = "0.21.7"
|
||||
chrono = "0.4.35"
|
||||
regex = "1.10.3"
|
||||
|
|
|
@ -361,6 +361,7 @@ pub fn generate(paths: &Paths<'_>) -> anyhow::Result<(Config, Treehouse)> {
|
|||
config.site = std::env::var("TREEHOUSE_SITE").unwrap_or(config.site);
|
||||
config.autopopulate_emoji(&paths.static_dir.join("emoji"))?;
|
||||
config.autopopulate_pics(&paths.static_dir.join("pic"))?;
|
||||
config.load_syntaxes(&paths.static_dir.join("syntax"))?;
|
||||
|
||||
info!("cleaning target directory");
|
||||
let _ = std::fs::remove_dir_all(paths.target_dir);
|
||||
|
|
|
@ -1,9 +1,15 @@
|
|||
use std::{collections::HashMap, ffi::OsStr, fs::File, io::BufReader, path::Path};
|
||||
|
||||
use anyhow::Context;
|
||||
use log::debug;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use walkdir::WalkDir;
|
||||
|
||||
use crate::html::highlight::{
|
||||
compiled::{compile_syntax, CompiledSyntax},
|
||||
Syntax,
|
||||
};
|
||||
|
||||
#[derive(Debug, Clone, Deserialize, Serialize)]
|
||||
pub struct Config {
|
||||
/// Website root; used when generating links.
|
||||
|
@ -48,6 +54,13 @@ pub struct Config {
|
|||
/// On top of this, pics are autodiscovered by walking the `static/pic` directory.
|
||||
/// Only the part before the first dash is treated as the pic's id.
|
||||
pub pics: HashMap<String, String>,
|
||||
|
||||
/// Syntax definitions.
|
||||
///
|
||||
/// These are not part of the config file, but are loaded as part of site configuration from
|
||||
/// `static/syntax`.
|
||||
#[serde(skip)]
|
||||
pub syntaxes: HashMap<String, CompiledSyntax>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Deserialize, Serialize)]
|
||||
|
@ -138,6 +151,30 @@ impl Config {
|
|||
self.pics.get(id).map(|x| &**x).unwrap_or("404.png")
|
||||
)
|
||||
}
|
||||
|
||||
/// Loads all syntax definition files.
|
||||
pub fn load_syntaxes(&mut self, dir: &Path) -> anyhow::Result<()> {
|
||||
for entry in WalkDir::new(dir) {
|
||||
let entry = entry?;
|
||||
if entry.path().extension() == Some(OsStr::new("json")) {
|
||||
let name = entry
|
||||
.path()
|
||||
.file_stem()
|
||||
.expect("syntax file name should have a stem")
|
||||
.to_string_lossy();
|
||||
debug!("loading syntax {name:?}");
|
||||
|
||||
let syntax: Syntax = serde_json::from_reader(BufReader::new(
|
||||
File::open(entry.path()).context("could not open syntax file")?,
|
||||
))
|
||||
.context("could not deserialize syntax file")?;
|
||||
let compiled = compile_syntax(&syntax);
|
||||
self.syntaxes.insert(name.into_owned(), compiled);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Data derived from the config.
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
use std::fmt::{self, Display, Write};
|
||||
|
||||
pub mod breadcrumbs;
|
||||
pub mod highlight;
|
||||
mod markdown;
|
||||
pub mod navmap;
|
||||
pub mod tree;
|
||||
|
|
94
crates/treehouse/src/html/highlight.rs
Normal file
94
crates/treehouse/src/html/highlight.rs
Normal file
|
@ -0,0 +1,94 @@
|
|||
//! Tokenizer and syntax highlighter inspired by the one found in rxi's lite.
|
||||
//! I highly recommend checking it out!
|
||||
//! https://github.com/rxi/lite/blob/master/data/core/tokenizer.lua
|
||||
//! There's also a mirror of it in the JavaScript, used to power dynamically editable code blocks.
|
||||
//!
|
||||
//! Both of these syntax highlighters use the same JSON syntax definitions; however this one is
|
||||
//! more limited, in that patterns do not support backtracking.
|
||||
//! This is effectively enforced in the dynamic highlighter because this highlighter reports any
|
||||
//! regex syntax errors upon site compilation.
|
||||
|
||||
pub mod compiled;
|
||||
pub mod tokenize;
|
||||
|
||||
use std::{collections::HashMap, io};
|
||||
|
||||
use pulldown_cmark::escape::{escape_html, StrWrite};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use self::compiled::CompiledSyntax;
|
||||
|
||||
/// Syntax definition.
|
||||
#[derive(Debug, Clone, Deserialize, Serialize)]
|
||||
pub struct Syntax {
|
||||
/// Patterns, matched sequentially (patterns at the beginning of the list take precedence.)
|
||||
pub patterns: Vec<Pattern>,
|
||||
|
||||
/// Map of replacements to use if a pattern matches a string exactly.
|
||||
pub keywords: HashMap<String, Keyword>,
|
||||
}
|
||||
|
||||
/// A pattern in a syntax definition.
|
||||
#[derive(Debug, Clone, Deserialize, Serialize)]
|
||||
pub struct Pattern {
|
||||
/// Regular expression to match.
|
||||
pub regex: String,
|
||||
|
||||
/// Flags to pass to the regex engine to alter how strings are matched.
|
||||
#[serde(default)]
|
||||
pub flags: Vec<RegexFlag>,
|
||||
|
||||
/// Type to assign to the token. This can be any string, but only a select few have colors
|
||||
/// assigned.
|
||||
pub is: TokenTypes,
|
||||
}
|
||||
|
||||
/// Assignable token types.
|
||||
#[derive(Debug, Clone, Deserialize, Serialize)]
|
||||
#[serde(untagged)]
|
||||
pub enum TokenTypes {
|
||||
/// Assign a single token type to the entire match.
|
||||
FullMatch(String),
|
||||
/// Assign individual token types to each capture.
|
||||
Captures(CaptureTokenTypes),
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Deserialize, Serialize)]
|
||||
pub struct CaptureTokenTypes {
|
||||
/// Token type to use outside captures.
|
||||
pub default: String,
|
||||
/// Token type to use inside captures.
|
||||
pub captures: Vec<String>,
|
||||
}
|
||||
|
||||
/// Flag passed to the regex engine.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Deserialize, Serialize)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub enum RegexFlag {
|
||||
/// Make `.` match line separators.
|
||||
DotMatchesNewline,
|
||||
}
|
||||
|
||||
/// Keyword replacement.
|
||||
#[derive(Debug, Clone, Deserialize, Serialize)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct Keyword {
|
||||
/// What to replace the token type with.
|
||||
pub into: String,
|
||||
|
||||
/// Only replace the token type if it matches this one. If this is not present, any token type
|
||||
/// is replaced.
|
||||
pub only_replaces: Option<String>,
|
||||
}
|
||||
|
||||
pub fn highlight(mut w: impl StrWrite, syntax: &CompiledSyntax, code: &str) -> io::Result<()> {
|
||||
let tokens = syntax.tokenize(code);
|
||||
for token in tokens {
|
||||
w.write_str("<span class=\"")?;
|
||||
escape_html(&mut w, &syntax.token_names[token.id])?;
|
||||
w.write_str("\">")?;
|
||||
escape_html(&mut w, &code[token.range])?;
|
||||
w.write_str("</span>")?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
118
crates/treehouse/src/html/highlight/compiled.rs
Normal file
118
crates/treehouse/src/html/highlight/compiled.rs
Normal file
|
@ -0,0 +1,118 @@
|
|||
use std::collections::HashMap;
|
||||
|
||||
use log::error;
|
||||
use regex::{Regex, RegexBuilder};
|
||||
|
||||
use super::{RegexFlag, Syntax, TokenTypes};
|
||||
|
||||
/// Numeric ID of a token type.
///
/// Token names are converted to these IDs during compilation, for performance.
pub type TokenId = usize;

/// ID of the token type used for text that no pattern matched.
pub const TOKEN_ID_DEFAULT: TokenId = 0;
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct CompiledSyntax {
|
||||
/// Lookup table which maps numeric IDs to token names.
|
||||
pub token_names: Vec<String>,
|
||||
|
||||
pub patterns: Vec<CompiledPattern>,
|
||||
pub keywords: HashMap<String, CompiledKeyword>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub enum CompiledTokenTypes {
|
||||
FullMatch(TokenId),
|
||||
Captures(CompiledCaptureTokenTypes),
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct CompiledCaptureTokenTypes {
|
||||
pub default: TokenId,
|
||||
pub captures: Vec<TokenId>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct CompiledPattern {
|
||||
pub regex: Regex,
|
||||
pub is: CompiledTokenTypes,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct CompiledKeyword {
|
||||
pub into: TokenId,
|
||||
pub only_replaces: Option<TokenId>,
|
||||
}
|
||||
|
||||
pub fn compile_syntax(syntax: &Syntax) -> CompiledSyntax {
|
||||
let mut token_names = vec!["default".into()];
|
||||
let mut get_token_id = |name: &str| -> TokenId {
|
||||
if let Some(id) = token_names.iter().position(|n| n == name) {
|
||||
id
|
||||
} else {
|
||||
let id = token_names.len();
|
||||
token_names.push(name.to_owned());
|
||||
id
|
||||
}
|
||||
};
|
||||
|
||||
let patterns = syntax
|
||||
.patterns
|
||||
.iter()
|
||||
.filter_map(|pattern| {
|
||||
// NOTE: `regex` has no support for sticky flags, so we need to anchor the match to the
|
||||
// start ourselves.
|
||||
let regex = RegexBuilder::new(&format!(
|
||||
"^{}",
|
||||
// If there's an existing `^`, it should not cause compilation errors for the user.
|
||||
pattern.regex.strip_prefix('^').unwrap_or(&pattern.regex)
|
||||
))
|
||||
.dot_matches_new_line(pattern.flags.contains(&RegexFlag::DotMatchesNewline))
|
||||
.build()
|
||||
.map_err(|e| {
|
||||
// NOTE: This could probably use better diagnostics, but it's pretty much
|
||||
// impossible to get a source span out of serde's output (because it forgoes
|
||||
// source information, rightfully so.) Therefore we have to settle on
|
||||
// a poor man's error log.
|
||||
error!("regex compilation error in pattern {pattern:?}: {e}");
|
||||
})
|
||||
.ok()?;
|
||||
Some(CompiledPattern {
|
||||
regex,
|
||||
is: match &pattern.is {
|
||||
TokenTypes::FullMatch(name) => {
|
||||
CompiledTokenTypes::FullMatch(get_token_id(name))
|
||||
}
|
||||
TokenTypes::Captures(types) => {
|
||||
CompiledTokenTypes::Captures(CompiledCaptureTokenTypes {
|
||||
default: get_token_id(&types.default),
|
||||
captures: types
|
||||
.captures
|
||||
.iter()
|
||||
.map(|name| get_token_id(name))
|
||||
.collect(),
|
||||
})
|
||||
}
|
||||
},
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
let keywords = syntax
|
||||
.keywords
|
||||
.iter()
|
||||
.map(|(text, keyword)| {
|
||||
(
|
||||
text.clone(),
|
||||
CompiledKeyword {
|
||||
into: get_token_id(&keyword.into),
|
||||
only_replaces: keyword.only_replaces.as_deref().map(&mut get_token_id),
|
||||
},
|
||||
)
|
||||
})
|
||||
.collect();
|
||||
|
||||
CompiledSyntax {
|
||||
token_names,
|
||||
patterns,
|
||||
keywords,
|
||||
}
|
||||
}
|
57
crates/treehouse/src/html/highlight/tokenize.rs
Normal file
57
crates/treehouse/src/html/highlight/tokenize.rs
Normal file
|
@ -0,0 +1,57 @@
|
|||
use std::ops::Range;
|
||||
|
||||
use super::compiled::{CompiledSyntax, CompiledTokenTypes, TokenId, TOKEN_ID_DEFAULT};
|
||||
|
||||
pub struct Token {
|
||||
pub id: TokenId,
|
||||
pub range: Range<usize>,
|
||||
}
|
||||
|
||||
impl CompiledSyntax {
|
||||
pub fn tokenize(&self, text: &str) -> Vec<Token> {
|
||||
let mut tokens = vec![];
|
||||
|
||||
let mut i = 0;
|
||||
while i < text.len() {
|
||||
let mut had_match = false;
|
||||
for pattern in &self.patterns {
|
||||
match &pattern.is {
|
||||
CompiledTokenTypes::FullMatch(id) => {
|
||||
if let Some(regex_match) = pattern.regex.find(&text[i..]) {
|
||||
push_token(&mut tokens, *id, i..i + regex_match.range().end);
|
||||
i += regex_match.range().end;
|
||||
had_match = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
CompiledTokenTypes::Captures(types) => { /* TODO */ }
|
||||
}
|
||||
}
|
||||
|
||||
if !had_match {
|
||||
push_token(&mut tokens, TOKEN_ID_DEFAULT, i..i + 1);
|
||||
i += 1;
|
||||
}
|
||||
}
|
||||
|
||||
for token in &mut tokens {
|
||||
if let Some(keyword) = self.keywords.get(&text[token.range.clone()]) {
|
||||
if keyword.only_replaces.is_none() || Some(token.id) == keyword.only_replaces {
|
||||
token.id = keyword.into;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
tokens
|
||||
}
|
||||
}
|
||||
|
||||
fn push_token(tokens: &mut Vec<Token>, id: TokenId, range: Range<usize>) {
|
||||
if let Some(previous_token) = tokens.last_mut() {
|
||||
if previous_token.id == id {
|
||||
previous_token.range.end = range.end;
|
||||
return;
|
||||
}
|
||||
}
|
||||
tokens.push(Token { id, range });
|
||||
}
|
|
@ -23,6 +23,7 @@
|
|||
|
||||
//! HTML renderer that takes an iterator of events as input.
|
||||
|
||||
use std::borrow::Borrow;
|
||||
use std::collections::HashMap;
|
||||
use std::io;
|
||||
|
||||
|
@ -31,6 +32,7 @@ use pulldown_cmark::{Alignment, CodeBlockKind, Event, LinkType, Tag};
|
|||
use pulldown_cmark::{CowStr, Event::*};
|
||||
|
||||
use crate::config::{Config, ConfigDerivedData, PicSize};
|
||||
use crate::html::highlight::highlight;
|
||||
use crate::state::Treehouse;
|
||||
|
||||
enum TableState {
|
||||
|
@ -38,6 +40,12 @@ enum TableState {
|
|||
Body,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
enum CodeBlockState<'a> {
|
||||
NotInCodeBlock,
|
||||
InCodeBlock(Option<CowStr<'a>>),
|
||||
}
|
||||
|
||||
struct HtmlWriter<'a, I, W> {
|
||||
treehouse: &'a Treehouse,
|
||||
config: &'a Config,
|
||||
|
@ -58,7 +66,7 @@ struct HtmlWriter<'a, I, W> {
|
|||
table_cell_index: usize,
|
||||
numbers: HashMap<CowStr<'a>, usize>,
|
||||
|
||||
in_code_block: bool,
|
||||
code_block_state: CodeBlockState<'a>,
|
||||
}
|
||||
|
||||
impl<'a, I, W> HtmlWriter<'a, I, W>
|
||||
|
@ -87,7 +95,7 @@ where
|
|||
table_alignments: vec![],
|
||||
table_cell_index: 0,
|
||||
numbers: HashMap::new(),
|
||||
in_code_block: false,
|
||||
code_block_state: CodeBlockState::NotInCodeBlock,
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -234,65 +242,71 @@ where
|
|||
}
|
||||
}
|
||||
Tag::CodeBlock(info) => {
|
||||
self.in_code_block = true;
|
||||
self.code_block_state = CodeBlockState::InCodeBlock(None);
|
||||
if !self.end_newline {
|
||||
self.write_newline()?;
|
||||
}
|
||||
match info {
|
||||
CodeBlockKind::Fenced(language) => match CodeBlockMode::parse(&language) {
|
||||
CodeBlockMode::PlainText => self.write("<pre><code>"),
|
||||
CodeBlockMode::SyntaxHighlightOnly { language } => {
|
||||
self.write("<pre><code class=\"language-")?;
|
||||
escape_html(&mut self.writer, language)?;
|
||||
self.write("\">")
|
||||
}
|
||||
CodeBlockMode::LiterateProgram {
|
||||
language,
|
||||
kind,
|
||||
program_name,
|
||||
} => {
|
||||
self.write(match &kind {
|
||||
LiterateCodeKind::Input => {
|
||||
"<th-literate-program data-mode=\"input\" "
|
||||
}
|
||||
LiterateCodeKind::Output { .. } => {
|
||||
"<th-literate-program data-mode=\"output\" "
|
||||
}
|
||||
})?;
|
||||
self.write("data-program=\"")?;
|
||||
escape_href(&mut self.writer, self.page_id)?;
|
||||
self.write(":")?;
|
||||
escape_html(&mut self.writer, program_name)?;
|
||||
self.write("\" data-language=\"")?;
|
||||
escape_html(&mut self.writer, language)?;
|
||||
self.write("\" role=\"code\">")?;
|
||||
|
||||
if let LiterateCodeKind::Output { placeholder_pic_id } = kind {
|
||||
if !placeholder_pic_id.is_empty() {
|
||||
self.write(
|
||||
"<img class=\"placeholder-image\" loading=\"lazy\" src=\"",
|
||||
)?;
|
||||
escape_html(
|
||||
&mut self.writer,
|
||||
&self.config.pic_url(placeholder_pic_id),
|
||||
)?;
|
||||
self.write("\"")?;
|
||||
if let Some(PicSize { width, height }) = self
|
||||
.config_derived_data
|
||||
.pic_size(self.config, placeholder_pic_id)
|
||||
{
|
||||
self.write(&format!(
|
||||
" width=\"{width}\" height=\"{height}\""
|
||||
))?;
|
||||
}
|
||||
self.write(">")?;
|
||||
CodeBlockKind::Fenced(language) => {
|
||||
self.code_block_state = CodeBlockState::InCodeBlock(Some(language.clone()));
|
||||
match CodeBlockMode::parse(&language) {
|
||||
CodeBlockMode::PlainText => self.write("<pre><code>"),
|
||||
CodeBlockMode::SyntaxHighlightOnly { language } => {
|
||||
self.write("<pre><code class=\"language-")?;
|
||||
escape_html(&mut self.writer, language)?;
|
||||
if self.config.syntaxes.contains_key(language) {
|
||||
self.write(" th-syntax-highlighting")?;
|
||||
}
|
||||
self.write("\">")
|
||||
}
|
||||
CodeBlockMode::LiterateProgram {
|
||||
language,
|
||||
kind,
|
||||
program_name,
|
||||
} => {
|
||||
self.write(match &kind {
|
||||
LiterateCodeKind::Input => {
|
||||
"<th-literate-program data-mode=\"input\" "
|
||||
}
|
||||
LiterateCodeKind::Output { .. } => {
|
||||
"<th-literate-program data-mode=\"output\" "
|
||||
}
|
||||
})?;
|
||||
self.write("data-program=\"")?;
|
||||
escape_href(&mut self.writer, self.page_id)?;
|
||||
self.write(":")?;
|
||||
escape_html(&mut self.writer, program_name)?;
|
||||
self.write("\" data-language=\"")?;
|
||||
escape_html(&mut self.writer, language)?;
|
||||
self.write("\" role=\"code\">")?;
|
||||
|
||||
self.write("<pre class=\"placeholder-console\">")?;
|
||||
Ok(())
|
||||
if let LiterateCodeKind::Output { placeholder_pic_id } = kind {
|
||||
if !placeholder_pic_id.is_empty() {
|
||||
self.write(
|
||||
"<img class=\"placeholder-image\" loading=\"lazy\" src=\"",
|
||||
)?;
|
||||
escape_html(
|
||||
&mut self.writer,
|
||||
&self.config.pic_url(placeholder_pic_id),
|
||||
)?;
|
||||
self.write("\"")?;
|
||||
if let Some(PicSize { width, height }) = self
|
||||
.config_derived_data
|
||||
.pic_size(self.config, placeholder_pic_id)
|
||||
{
|
||||
self.write(&format!(
|
||||
" width=\"{width}\" height=\"{height}\""
|
||||
))?;
|
||||
}
|
||||
self.write(">")?;
|
||||
}
|
||||
}
|
||||
|
||||
self.write("<pre class=\"placeholder-console\">")?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
},
|
||||
}
|
||||
CodeBlockKind::Indented => self.write("<pre><code>"),
|
||||
}
|
||||
}
|
||||
|
@ -416,7 +430,7 @@ where
|
|||
},
|
||||
_ => "</code></pre>\n",
|
||||
})?;
|
||||
self.in_code_block = false;
|
||||
self.code_block_state = CodeBlockState::NotInCodeBlock;
|
||||
}
|
||||
Tag::List(Some(_)) => {
|
||||
self.write("</ol>\n")?;
|
||||
|
@ -505,8 +519,20 @@ where
|
|||
}
|
||||
}
|
||||
|
||||
if self.in_code_block {
|
||||
escape_html(&mut self.writer, text)?;
|
||||
if let CodeBlockState::InCodeBlock(language) = &self.code_block_state {
|
||||
let code_block_mode = language
|
||||
.as_ref()
|
||||
.map(|language| CodeBlockMode::parse(language));
|
||||
let highlighting_language = code_block_mode
|
||||
.as_ref()
|
||||
.and_then(|mode| mode.highlighting_language());
|
||||
let syntax =
|
||||
highlighting_language.and_then(|language| self.config.syntaxes.get(language));
|
||||
if let Some(syntax) = syntax {
|
||||
highlight(&mut self.writer, syntax, text)?;
|
||||
} else {
|
||||
escape_html(&mut self.writer, text)?;
|
||||
}
|
||||
} else {
|
||||
let mut parser = EmojiParser { text, position: 0 };
|
||||
while let Some(token) = parser.next_token() {
|
||||
|
@ -623,6 +649,16 @@ impl<'a> CodeBlockMode<'a> {
|
|||
CodeBlockMode::SyntaxHighlightOnly { language }
|
||||
}
|
||||
}
|
||||
|
||||
fn highlighting_language(&self) -> Option<&str> {
|
||||
if let CodeBlockMode::LiterateProgram { language, .. }
|
||||
| CodeBlockMode::SyntaxHighlightOnly { language } = self
|
||||
{
|
||||
Some(language)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Iterate over an `Iterator` of `Event`s, generate HTML for each `Event`, and
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue