use std::ops::Range;

use super::compiled::{CompiledSyntax, CompiledTokenTypes, TOKEN_ID_DEFAULT, TokenId};
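
/// A single tokenized region of text: the id of the matched token type and
/// its byte range within the source string.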
pub struct Token {
    pub id: TokenId,
    pub range: Range<usize>,
}

impl CompiledSyntax {
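    /// Breaks `text` into a flat list of tokens by matching the compiled
    /// patterns against the remaining text, falling back to a default token
    /// where nothing matches, and finally retagging tokens whose text is a
    /// registered keyword.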
    pub fn tokenize(&self, text: &str) -> Vec<Token> {
        let mut tokens = vec![];

        let mut i = 0;
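        // Scan forward through the text, trying each pattern in turn against
        // the remaining input.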
        while i < text.len() {
            let mut had_match = false;
            for pattern in &self.patterns {
                match &pattern.is {
                    CompiledTokenTypes::FullMatch(id) => {
                        if let Some(regex_match) = pattern.regex.find(&text[i..]) {
                            push_token(&mut tokens, *id, i..i + regex_match.range().end);
                            i += regex_match.range().end;
                            had_match = true;
                            break;
                        }
                    }
                    CompiledTokenTypes::Captures(types) => {
                        if let Some(captures) = pattern.regex.captures(&text[i..]) {
                            let whole_match = captures.get(0).unwrap();
                            let mut last_match_end = 0;
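                            // Emit the text between consecutive captures with the
                            // pattern's default id and each capture group with its own id.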
                            for (index, capture) in captures
                                .iter()
                                .skip(1)
                                .enumerate()
                                .filter_map(|(i, m)| m.map(|m| (i, m)))
                            {
                                let id = types
                                    .captures
                                    .get(index)
                                    .copied()
                                    .unwrap_or(TOKEN_ID_DEFAULT);
                                push_token(
                                    &mut tokens,
                                    types.default,
                                    i + last_match_end..i + capture.range().start,
                                );
                                push_token(
                                    &mut tokens,
                                    id,
                                    i + capture.range().start..i + capture.range().end,
                                );
                                last_match_end = capture.range().end;
                            }
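                            // Emit whatever remains between the last capture and the
                            // end of the whole match with the default id.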
                            push_token(
                                &mut tokens,
                                types.default,
                                i + last_match_end..i + whole_match.range().end,
                            );
                            i += whole_match.range().end;
                            had_match = true;
                            break;
                        }
                    }
                }
            }
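
            // No pattern matched at this position: emit a single default
            // byte and advance.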
            if !had_match {
                push_token(&mut tokens, TOKEN_ID_DEFAULT, i..i + 1);
                i += 1;
            }
        }
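
        // Second pass: retag tokens whose text matches a registered keyword,
        // respecting the keyword's only_replaces restriction.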
        for token in &mut tokens {
            if let Some(keyword) = self.keywords.get(&text[token.range.clone()])
                && (keyword.only_replaces.is_none() || Some(token.id) == keyword.only_replaces)
            {
                token.id = keyword.into;
            }
        }

        tokens
    }
}
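
/// Pushes a token onto `tokens`, skipping empty ranges and merging the new
/// range into the previous token when both share the same id.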
fn push_token(tokens: &mut Vec<Token>, id: TokenId, range: Range<usize>) {
    if range.is_empty() {
        return;
    }
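
    // Extend the previous token instead of pushing a new one when the ids match.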
    if let Some(previous_token) = tokens.last_mut()
        && previous_token.id == id
    {
        previous_token.range.end = range.end;
        return;
    }

    tokens.push(Token { id, range });
}