//! Tokenizer and syntax highlighter inspired by the one found in rxi's lite.
//! I highly recommend checking it out!
//! https://github.com/rxi/lite/blob/master/data/core/tokenizer.lua
//! There's also a mirror of it written in JavaScript, used to power dynamically editable code
//! blocks.
//!
//! Both of these syntax highlighters use the same JSON syntax definitions; however this one is
//! more limited, in that patterns do not support backtracking.
//! This is effectively enforced for the dynamic highlighter as well, because this highlighter
//! reports any regex syntax errors upon site compilation.

pub mod compiled;
pub mod tokenize;

use std::{collections::HashMap, fmt::Write};

use serde::{Deserialize, Serialize};

use crate::html::highlight::tokenize::Token;

use self::compiled::CompiledSyntax;

use super::EscapeHtml;

/// Syntax definition.
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct Syntax {
    /// Patterns, matched sequentially (patterns at the beginning of the list take precedence).
    pub patterns: Vec<Pattern>,

    /// Map of replacements to use if a pattern matches a string exactly.
    pub keywords: HashMap<String, Keyword>,
}

/// A pattern in a syntax definition.
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct Pattern {
    /// Regular expression to match.
    pub regex: String,

    /// Flags to pass to the regex engine to alter how strings are matched.
    #[serde(default)]
    pub flags: Vec<RegexFlag>,

    /// Type to assign to the token. This can be any string, but only a select few have colors
    /// assigned.
    pub is: TokenTypes,
}

/// Assignable token types.
#[derive(Debug, Clone, Deserialize, Serialize)]
#[serde(untagged)]
pub enum TokenTypes {
    /// Assign a single token type to the entire match.
    FullMatch(String),
    /// Assign individual token types to each capture.
    Captures(CaptureTokenTypes),
}

#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct CaptureTokenTypes {
    /// Token type to use outside captures.
    pub default: String,
    /// Token type to use inside captures.
    pub captures: Vec<String>,
}

/// Flag passed to the regex engine.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Deserialize, Serialize)]
#[serde(rename_all = "camelCase")]
pub enum RegexFlag {
    /// Make `.` match line separators.
    DotMatchesNewline,
}

/// Keyword replacement.
#[derive(Debug, Clone, Deserialize, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct Keyword {
    /// What to replace the token type with.
    pub into: String,

    /// Only replace the token type if it matches this one. If this is not present, any token type
    /// is replaced.
    pub only_replaces: Option<String>,
}
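// For illustration, a hypothetical syntax definition (not one shipped with the site) would
// deserialize from JSON shaped like this:
//
//     {
//       "patterns": [
//         { "regex": "//.*", "is": "comment" },
//         { "regex": "\"\"\".*\"\"\"", "flags": ["dotMatchesNewline"], "is": "string" },
//         { "regex": "[0-9]+", "is": "number" },
//         { "regex": "[a-zA-Z_][a-zA-Z0-9_]*", "is": "identifier" },
//         {
//           "regex": "(@)([a-zA-Z_]+)",
//           "is": { "default": "default", "captures": ["punct", "attribute"] }
//         }
//       ],
//       "keywords": {
//         "let": { "into": "keyword", "onlyReplaces": "identifier" }
//       }
//     }
//
// `onlyReplaces` maps to `Keyword::only_replaces` (camelCase, per the serde attribute): here
// `let` is only promoted to a keyword if the pattern pass first classified it as an identifier.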
fn write_tokens(
    out: &mut String,
    syntax: &CompiledSyntax,
    code: &str,
    tokens: impl Iterator<Item = Token>,
) {
    for token in tokens {
        let str = &code[token.range.clone()];
        // Wrap each token in a <span> classed after the token's name, with the text
        // HTML-escaped. (The concrete markup here is a reconstruction; the token name is
        // assumed to be looked up through the compiled syntax.)
        out.push_str("<span class=\"");
        _ = write!(out, "{}", EscapeHtml(&syntax.token_names[token.id]));
        out.push_str("\">");
        _ = write!(out, "{}", EscapeHtml(str));
        out.push_str("</span>");
    }
}

pub fn highlight(out: &mut String, syntax: &CompiledSyntax, code: &str) {
    let tokens = syntax.tokenize(code);

    // Tokens are buffered line by line, so that lines ending with a line comment can be laid
    // out in two columns: code on the left, comment on the right.
    let mut line = vec![];
    let mut in_columns = false;
    for token in tokens {
        let str = &code[token.range.clone()];
        line.push(token);
        if str.ends_with('\n') {
            // If the line ends with a line comment, pop it off the line so that it can be
            // emitted into its own column.
            let line_comment = if line.last().is_some_and(|token| {
                Some(token.id) == syntax.comment_token_id
                    && code[token.range.clone()].ends_with('\n')
            }) {
                line.pop()
            } else {
                None
            };
            if let Some(line_comment) = line_comment {
                // (The tags and class names below are reconstructed; the original markup was
                // lost. `<tr>` and `<td>` are left to auto-close, as in the original.)
                if !in_columns {
                    out.push_str("<table class=\"comment-columns\">");
                    in_columns = true;
                }
                out.push_str("<tr><td>");
                write_tokens(out, syntax, code, line.drain(..));
                out.push_str("<td>");
                write_tokens(out, syntax, code, [line_comment].into_iter());
            } else {
                if in_columns {
                    out.push_str("</table>");
                    in_columns = false;
                }
                write_tokens(out, syntax, code, line.drain(..));
            }
        }
    }

    // Flush the last line if the code does not end with a newline, and close the column
    // layout if it is still open.
    write_tokens(out, syntax, code, line.drain(..));
    if in_columns {
        out.push_str("</table>");
    }
}
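// A minimal usage sketch, assuming a `CompiledSyntax` has been produced elsewhere (compiling a
// `Syntax` is the job of the `compiled` module):
//
//     let mut out = String::new();
//     highlight(&mut out, &syntax, "let x = 1; // the loneliest number\n");
//
// For that input, the code up to `;` and the trailing line comment land in separate columns,
// with every token wrapped in its own <span>.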