// treehouse/src/html/highlight.rs

//! Tokenizer and syntax highlighter inspired by the one found in rxi's lite.
//! I highly recommend checking it out!
//! <https://github.com/rxi/lite/blob/master/data/core/tokenizer.lua>
//!
//! There's also a JavaScript mirror of this highlighter, used to power dynamically editable
//! code blocks.
//!
//! Both syntax highlighters use the same JSON syntax definitions; however, this one is more
//! limited, in that patterns do not support backtracking.
//! That limitation is effectively enforced on the dynamic highlighter as well, because this
//! highlighter reports any regex syntax errors during site compilation.
pub mod compiled;
pub mod tokenize;

use std::{collections::HashMap, fmt::Write};

use serde::{Deserialize, Serialize};

use crate::html::highlight::tokenize::Token;

use self::compiled::CompiledSyntax;
use super::EscapeHtml;
/// Syntax definition.
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct Syntax {
    /// Patterns, matched sequentially (patterns at the beginning of the list take precedence).
    pub patterns: Vec<Pattern>,
    /// Map of replacements to use if a pattern matches a string exactly.
    pub keywords: HashMap<String, Keyword>,
}

/// A pattern in a syntax definition.
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct Pattern {
    /// Regular expression to match.
    pub regex: String,
    /// Flags to pass to the regex engine to alter how strings are matched.
    #[serde(default)]
    pub flags: Vec<RegexFlag>,
    /// Type to assign to the token. This can be any string, but only a select few have colors
    /// assigned.
    pub is: TokenTypes,
}

/// Assignable token types.
#[derive(Debug, Clone, Deserialize, Serialize)]
#[serde(untagged)]
pub enum TokenTypes {
    /// Assign a single token type to the entire match.
    FullMatch(String),
    /// Assign individual token types to each capture.
    Captures(CaptureTokenTypes),
}

/// Token types assigned to a pattern's captures.
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct CaptureTokenTypes {
    /// Token type to use outside captures.
    pub default: String,
    /// Token type to use inside captures.
    pub captures: Vec<String>,
}

/// Flag passed to the regex engine.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Deserialize, Serialize)]
#[serde(rename_all = "camelCase")]
pub enum RegexFlag {
    /// Make `.` match line separators.
    DotMatchesNewline,
}

/// Keyword replacement.
#[derive(Debug, Clone, Deserialize, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct Keyword {
    /// What to replace the token type with.
    pub into: String,
    /// Only replace the token type if it matches this one. If this is not present, any token
    /// type is replaced.
    pub only_replaces: Option<String>,
}
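
// For illustration, a minimal sketch of the JSON shape these types deserialize
// from. The patterns and keywords below are made up for this example, and the
// test assumes `serde_json` is available as a (dev-)dependency.
#[cfg(test)]
mod example {
    use super::*;

    #[test]
    fn deserializes_a_minimal_syntax_definition() {
        let json = r#"{
            "patterns": [
                { "regex": "//.*", "is": "comment" },
                { "regex": "/\\*.*?\\*/", "flags": ["dotMatchesNewline"], "is": "comment" },
                { "regex": "'([^']*)'", "is": { "default": "string", "captures": ["literal"] } },
                { "regex": "[a-zA-Z_][a-zA-Z0-9_]*", "is": "identifier" }
            ],
            "keywords": {
                "fn": { "into": "keyword", "onlyReplaces": "identifier" }
            }
        }"#;
        let syntax: Syntax = serde_json::from_str(json).expect("sample definition should parse");
        assert_eq!(syntax.patterns.len(), 4);
        assert!(syntax.keywords.contains_key("fn"));
    }
}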
/// Writes a stream of tokens to `out` as HTML. Each token is wrapped in a
/// `<span>` whose class is the token's (HTML-escaped) name; e.g. a token of
/// type `keyword` spanning `fn` becomes `<span class="keyword">fn</span>`.
fn write_tokens(
    out: &mut String,
    syntax: &CompiledSyntax,
    code: &str,
    tokens: impl Iterator<Item = Token>,
) {
    for token in tokens {
        let str = &code[token.range.clone()];
        out.push_str("<span class=\"");
        _ = write!(out, "{}", EscapeHtml(&syntax.token_names[token.id]));
        out.push_str("\">");
        _ = write!(out, "{}", EscapeHtml(str));
        out.push_str("</span>");
    }
}

/// Highlights `code` into HTML, writing the result to `out`. Runs of lines that
/// end with a line comment are wrapped in a `<th-comment-columns>` element, with
/// each line's code and its trailing comment emitted as separate children, so
/// that the comments can be laid out in a column of their own.
pub fn highlight(out: &mut String, syntax: &CompiledSyntax, code: &str) {
    let tokens = syntax.tokenize(code);
    let mut line = vec![];
    let mut in_columns = false;
    for token in tokens {
        let str = &code[token.range.clone()];
        line.push(token);
        if str.ends_with('\n') {
            // The line is complete. If its last token is a line comment, pull it
            // out of the line so it can be emitted in a separate column.
            let line_comment = if line.last().is_some_and(|token| {
                Some(token.id) == syntax.comment_token_id
                    && code[token.range.clone()].ends_with('\n')
            }) {
                line.pop()
            } else {
                None
            };
            if let Some(line_comment) = line_comment {
                if !in_columns {
                    out.push_str("<th-comment-columns>");
                    in_columns = true;
                }
                out.push_str("<span>");
                write_tokens(out, syntax, code, line.drain(..));
                out.push_str("</span>");
                write_tokens(out, syntax, code, [line_comment].into_iter());
            } else {
                if in_columns {
                    out.push_str("</th-comment-columns>");
                    in_columns = false;
                }
                write_tokens(out, syntax, code, line.drain(..));
            }
        }
    }

    // Flush any remaining tokens in case the code does not end with a newline,
    // and close the comment column layout if the last line ended with a comment.
    write_tokens(out, syntax, code, line.drain(..));
    if in_columns {
        out.push_str("</th-comment-columns>");
    }
}
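
// For illustration (token names depend entirely on the syntax definition, so
// `keyword` and `comment` here are assumptions): highlighting
//
//     let x = 1; // note
//
// with a syntax whose line comment pattern matches `// note` produces markup
// along the lines of:
//
//     <th-comment-columns>
//       <span><span class="keyword">let</span> ...rest of the line...</span>
//       <span class="comment">// note
//     </span></th-comment-columns>
//
// The trailing newline stays inside the comment's span, the real output has no
// whitespace between elements, and consecutive comment-ended lines share one
// `<th-comment-columns>` wrapper.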