liquidex
2595bf0d82
introduce a new, more ergonomic syntax for haku. not all features are implemented just yet. still missing: custom tags (non-True/False), color literals, lists
238 lines
6.3 KiB
Rust
238 lines
6.3 KiB
Rust
use alloc::vec::Vec;
|
|
|
|
use crate::{
|
|
diagnostic::Diagnostic,
|
|
source::{SourceCode, Span},
|
|
token::{Lexis, TokenAllocError, TokenKind},
|
|
};
|
|
|
|
pub struct Lexer<'a> {
|
|
pub lexis: Lexis,
|
|
pub diagnostics: Vec<Diagnostic>,
|
|
input: &'a SourceCode,
|
|
position: u32,
|
|
}
|
|
|
|
impl<'a> Lexer<'a> {
|
|
pub fn new(lexis: Lexis, input: &'a SourceCode) -> Self {
|
|
Self {
|
|
lexis,
|
|
diagnostics: Vec::new(),
|
|
input,
|
|
position: 0,
|
|
}
|
|
}
|
|
|
|
fn current(&self) -> char {
|
|
self.input[self.position as usize..]
|
|
.chars()
|
|
.next()
|
|
.unwrap_or('\0')
|
|
}
|
|
|
|
fn advance(&mut self) {
|
|
self.position += self.current().len_utf8() as u32;
|
|
}
|
|
|
|
fn emit(&mut self, diagnostic: Diagnostic) {
|
|
if self.diagnostics.len() < self.diagnostics.capacity() {
|
|
self.diagnostics.push(diagnostic);
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Lexes a single-character token: skips the current character and yields `kind` unchanged.
fn one(l: &mut Lexer<'_>, kind: TokenKind) -> TokenKind {
    l.advance();

    kind
}
|
|
|
|
/// Lexes a token that is either one or two characters long.
///
/// The current character is always consumed. If the character after it is `c2`, that one is
/// consumed as well and `kind2` is returned; otherwise the result is `kind1`.
fn one_or_two(l: &mut Lexer<'_>, kind1: TokenKind, c2: char, kind2: TokenKind) -> TokenKind {
    l.advance();

    if l.current() != c2 {
        return kind1;
    }

    l.advance();
    kind2
}
|
|
|
|
/// Returns whether `c` may appear inside an identifier or tag: an ASCII letter, an ASCII digit,
/// or an underscore.
fn is_ident_char(c: char) -> bool {
    c == '_' || c.is_ascii_alphanumeric()
}
|
|
|
|
fn ident(l: &mut Lexer<'_>) -> TokenKind {
|
|
let start = l.position;
|
|
while is_ident_char(l.current()) {
|
|
l.advance();
|
|
}
|
|
let end = l.position;
|
|
|
|
match Span::new(start, end).slice(l.input) {
|
|
"_" => TokenKind::Underscore,
|
|
"and" => TokenKind::And,
|
|
"or" => TokenKind::Or,
|
|
"if" => TokenKind::If,
|
|
"else" => TokenKind::Else,
|
|
"let" => TokenKind::Let,
|
|
_ => TokenKind::Ident,
|
|
}
|
|
}
|
|
|
|
fn tag(l: &mut Lexer<'_>) -> TokenKind {
|
|
while is_ident_char(l.current()) {
|
|
l.advance();
|
|
}
|
|
TokenKind::Tag
|
|
}
|
|
|
|
// NOTE: You shouldn't expect that the numbers produced by the lexer are parsable.
|
|
fn number(l: &mut Lexer<'_>) -> TokenKind {
|
|
while l.current().is_ascii_digit() {
|
|
l.advance();
|
|
}
|
|
|
|
if l.current() == '.' {
|
|
let dot = l.position;
|
|
l.advance();
|
|
if !l.current().is_ascii_digit() {
|
|
l.emit(Diagnostic::error(
|
|
Span::new(dot, l.position),
|
|
"there must be at least a single digit after the decimal point",
|
|
));
|
|
}
|
|
while l.current().is_ascii_digit() {
|
|
l.advance();
|
|
}
|
|
}
|
|
|
|
TokenKind::Number
|
|
}
|
|
|
|
// NOTE: You shouldn't expect that the color literals produced by the lexer are parsable.
|
|
fn color(l: &mut Lexer<'_>) -> TokenKind {
|
|
let hash = l.position;
|
|
l.advance(); // #
|
|
|
|
if !l.current().is_ascii_hexdigit() {
|
|
l.emit(Diagnostic::error(
|
|
Span::new(hash, l.position),
|
|
"hex digits expected after `#` (color literal)",
|
|
));
|
|
}
|
|
|
|
let start = l.position;
|
|
while l.current().is_ascii_hexdigit() {
|
|
l.advance();
|
|
}
|
|
let len = l.position - start;
|
|
|
|
if !matches!(len, 3 | 4 | 6 | 8) {
|
|
l.emit(Diagnostic::error(Span::new(hash, l.position), "incorrect number of digits in color literal (must be #RGB, #RGBA, #RRGGBB, or #RRGGBBAA)"));
|
|
}
|
|
|
|
TokenKind::Color
|
|
}
|
|
|
|
fn whitespace_and_comments(l: &mut Lexer<'_>) {
|
|
loop {
|
|
match l.current() {
|
|
'-' => {
|
|
let position = l.position;
|
|
l.advance();
|
|
if l.current() == '-' {
|
|
while l.current() != '\n' {
|
|
l.advance();
|
|
}
|
|
} else {
|
|
// An unfortunate little bit of backtracking here;
|
|
// This seems like the simplest possible solution though.
|
|
// We don't treat comments as a separate token to simplify the parsing phase,
|
|
// and because of this, handling this at the "real" token level would complicate
|
|
// things quite a bit.
|
|
l.position = position;
|
|
break;
|
|
}
|
|
}
|
|
|
|
' ' | '\r' | '\t' => l.advance(),
|
|
|
|
_ => break,
|
|
}
|
|
}
|
|
}
|
|
|
|
fn newline(l: &mut Lexer<'_>) -> (TokenKind, Span) {
|
|
let start = l.position;
|
|
l.advance(); // skip the initial newline
|
|
let end = l.position;
|
|
|
|
// Skip additional newlines after this one, to only produce one token.
|
|
// These do not count into this newline's span though.
|
|
loop {
|
|
whitespace_and_comments(l);
|
|
if l.current() == '\n' {
|
|
l.advance();
|
|
continue;
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
|
|
(TokenKind::Newline, Span::new(start, end))
|
|
}
|
|
|
|
fn token(l: &mut Lexer<'_>) -> (TokenKind, Span) {
|
|
whitespace_and_comments(l);
|
|
|
|
let start = l.position;
|
|
let kind = match l.current() {
|
|
'\0' => TokenKind::Eof,
|
|
|
|
// NOTE: Order matters here. Numbers and tags take priority over identifers.
|
|
c if c.is_ascii_uppercase() => tag(l),
|
|
c if c.is_ascii_digit() => number(l),
|
|
c if is_ident_char(c) => ident(l),
|
|
|
|
'#' => color(l),
|
|
|
|
'+' => one(l, TokenKind::Plus),
|
|
'-' => one_or_two(l, TokenKind::Minus, '>', TokenKind::RArrow),
|
|
'*' => one(l, TokenKind::Star),
|
|
'/' => one(l, TokenKind::Slash),
|
|
'=' => one_or_two(l, TokenKind::Equal, '=', TokenKind::EqualEqual),
|
|
'!' => one_or_two(l, TokenKind::Not, '=', TokenKind::NotEqual),
|
|
'<' => one_or_two(l, TokenKind::Less, '=', TokenKind::LessEqual),
|
|
'>' => one_or_two(l, TokenKind::Greater, '=', TokenKind::GreaterEqual),
|
|
|
|
'\n' => return newline(l),
|
|
'(' => one(l, TokenKind::LParen),
|
|
')' => one(l, TokenKind::RParen),
|
|
'[' => one(l, TokenKind::LBrack),
|
|
']' => one(l, TokenKind::RBrack),
|
|
',' => one(l, TokenKind::Comma),
|
|
'\\' => one(l, TokenKind::Backslash),
|
|
|
|
_ => {
|
|
l.advance();
|
|
l.emit(Diagnostic::error(
|
|
Span::new(start, l.position),
|
|
"unexpected character",
|
|
));
|
|
TokenKind::Error
|
|
}
|
|
};
|
|
let end = l.position;
|
|
(kind, Span::new(start, end))
|
|
}
|
|
|
|
pub fn lex(l: &mut Lexer<'_>) -> Result<(), TokenAllocError> {
|
|
loop {
|
|
let (kind, span) = token(l);
|
|
l.lexis.push(kind, span)?;
|
|
if kind == TokenKind::Eof {
|
|
break;
|
|
}
|
|
}
|
|
Ok(())
|
|
}
|