//! Tokenizer and syntax highlighter inspired by the one found in rxi's lite.
//! I highly recommend checking it out!
//! https://github.com/rxi/lite/blob/master/data/core/tokenizer.lua
//! There's also a mirror of it in JavaScript, used to power dynamically editable code blocks.
//!
//! Both of these syntax highlighters use the same JSON syntax definitions; however, this one is
//! more limited, in that patterns do not support backtracking.
//! This limitation is effectively enforced in the dynamic highlighter as well, because this
//! highlighter reports any regex syntax errors upon site compilation.

pub mod compiled;
pub mod tokenize;

use std::{collections::HashMap, fmt::Write};

use serde::{Deserialize, Serialize};

use crate::html::highlight::tokenize::Token;

use self::compiled::CompiledSyntax;

use super::EscapeHtml;

/// Syntax definition.
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct Syntax {
    /// Patterns, matched sequentially (patterns at the beginning of the list take precedence).
    pub patterns: Vec<Pattern>,

    /// Map of replacements to use if a pattern matches a string exactly.
    pub keywords: HashMap<String, Keyword>,
}
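
// For illustration, given the serde derives above, a complete syntax definition in JSON might
// look like this (the token type names are examples; any string is allowed):
//
//     {
//         "patterns": [
//             { "regex": "//.*", "is": "comment" },
//             { "regex": "[A-Za-z_][A-Za-z0-9_]*", "is": "identifier" }
//         ],
//         "keywords": {
//             "fn": { "into": "keyword", "onlyReplaces": "identifier" }
//         }
//     }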

/// A pattern in a syntax definition.
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct Pattern {
    /// Regular expression to match.
    pub regex: String,

    /// Flags to pass to the regex engine to alter how strings are matched.
    #[serde(default)]
    pub flags: Vec<RegexFlag>,

    /// Type to assign to the token. This can be any string, but only a select few have colors
    /// assigned.
    pub is: TokenTypes,
}
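
// For illustration: patterns that need to span line breaks must opt into `dotMatchesNewline`,
// since `flags` defaults to empty. A sketch (assuming the engine supports lazy `.*?`, and with
// an example token type name):
//
//     { "regex": "<!--.*?-->", "flags": ["dotMatchesNewline"], "is": "comment" }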

/// Assignable token types.
#[derive(Debug, Clone, Deserialize, Serialize)]
#[serde(untagged)]
pub enum TokenTypes {
    /// Assign a single token type to the entire match.
    FullMatch(String),

    /// Assign individual token types to each capture.
    Captures(CaptureTokenTypes),
}
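
// For illustration, the two shapes `is` can take thanks to `#[serde(untagged)]` (token type
// names are examples):
//
//     "is": "string"
//
//     "is": { "default": "text", "captures": ["keyword", "function"] }
//
// In the second shape, text outside the capture groups gets the `default` type, and each
// capture group gets the corresponding entry of `captures`.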

/// Token types for a pattern with capture groups.
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct CaptureTokenTypes {
    /// Token type to use outside captures.
    pub default: String,

    /// Token type to use inside captures.
    pub captures: Vec<String>,
}

/// Flag passed to the regex engine.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Deserialize, Serialize)]
#[serde(rename_all = "camelCase")]
pub enum RegexFlag {
    /// Make `.` match line separators.
    DotMatchesNewline,
}

/// Keyword replacement.
#[derive(Debug, Clone, Deserialize, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct Keyword {
    /// What to replace the token type with.
    pub into: String,

    /// Only replace the token type if it matches this one. If this is not present, any token
    /// type is replaced.
    pub only_replaces: Option<String>,
}
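
// For illustration (token type names are examples): with the entries below, a token whose text
// is exactly `fn` is re-tagged as `keyword`, but only if the pattern that produced it assigned
// the `identifier` type; `true` is re-tagged no matter what type it matched as.
//
//     "keywords": {
//         "fn":   { "into": "keyword", "onlyReplaces": "identifier" },
//         "true": { "into": "literal" }
//     }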

/// Writes `tokens` out as HTML, wrapping each token in a `<span>` whose class is the token's
/// type name.
fn write_tokens(
    out: &mut String,
    syntax: &CompiledSyntax,
    code: &str,
    tokens: impl Iterator<Item = Token>,
) {
    for token in tokens {
        let str = &code[token.range.clone()];
        out.push_str("<span class=\"");
        _ = write!(out, "{}", EscapeHtml(&syntax.token_names[token.id]));
        out.push_str("\">");
        _ = write!(out, "{}", EscapeHtml(str));
        out.push_str("</span>");
    }
}
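
// For example, a token whose type name is `keyword` and whose text is `fn` is written out as
// `<span class="keyword">fn</span>`; both the class name and the text pass through `EscapeHtml`.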

pub fn highlight(out: &mut String, syntax: &CompiledSyntax, code: &str) {
    let tokens = syntax.tokenize(code);

    // Tokens are buffered into lines, so that a line's trailing comment can be split off and
    // grouped into comment columns.
    let mut line = vec![];
    let mut in_columns = false;
    for token in tokens {
        let str = &code[token.range.clone()];
        line.push(token);

        if str.ends_with('\n') {
            // If the last token on the line is a line comment, split it off from the rest.
            let line_comment = if line.last().is_some_and(|token| {
                Some(token.id) == syntax.comment_token_id
                    && code[token.range.clone()].ends_with('\n')
            }) {
                line.pop()
            } else {
                None
            };

            if let Some(line_comment) = line_comment {
                if !in_columns {
                    out.push_str("<th-comment-columns>");
                    in_columns = true;
                }

                out.push_str("<span>");
                write_tokens(out, syntax, code, line.drain(..));
                out.push_str("</span>");
                write_tokens(out, syntax, code, [line_comment].into_iter());
            } else {
                if in_columns {
                    out.push_str("</th-comment-columns>");
                    in_columns = false;
                }

                write_tokens(out, syntax, code, line.drain(..));
            }
        }
    }

    // Flush any tokens left over from a final line that does not end with a newline, and close
    // the comment column wrapper if it is still open.
    write_tokens(out, syntax, code, line.drain(..));
    if in_columns {
        out.push_str("</th-comment-columns>");
    }
}
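
// For illustration, assuming `//` comments produce the `comment` token: the input
//
//     let x = 1; // one
//     let y = 2; // two
//
// comes out as a single `<th-comment-columns>` element in which each line's code tokens are
// wrapped in a plain `<span>`, followed by the line's comment token:
//
//     <th-comment-columns>
//         <span>…highlighted code…</span><span class="comment">// one</span>
//         <span>…highlighted code…</span><span class="comment">// two</span>
//     </th-comment-columns>
//
// (line breaks and indentation added here for readability; the comment tokens keep their
// trailing newlines, and the real output contains no extra whitespace).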