rkgk/crates/haku/src/lexer.rs

use alloc::vec::Vec;

use crate::{
    diagnostic::Diagnostic,
    source::{SourceCode, Span},
    token::{Lexis, TokenAllocError, TokenKind},
};

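/// The lexer for haku source code. Tokens are pushed into `lexis` as they are
/// produced, and any errors encountered along the way are collected into
/// `diagnostics`.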
pub struct Lexer<'a> {
    pub lexis: Lexis,
    pub diagnostics: Vec<Diagnostic>,

    input: &'a SourceCode,
    position: u32,
}

impl<'a> Lexer<'a> {
    pub fn new(lexis: Lexis, input: &'a SourceCode) -> Self {
        Self {
            lexis,
            diagnostics: Vec::new(),
            input,
            position: 0,
        }
    }

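    /// Returns the character at the current position, or `'\0'` once the end
    /// of input is reached.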
    fn current(&self) -> char {
        self.input[self.position as usize..]
            .chars()
            .next()
            .unwrap_or('\0')
    }

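    /// Advances the position past the current character.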
    fn advance(&mut self) {
        self.position += self.current().len_utf8() as u32;
    }

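    /// Records a diagnostic. Diagnostics that would grow `diagnostics` past
    /// its preallocated capacity are dropped silently.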
    fn emit(&mut self, diagnostic: Diagnostic) {
        if self.diagnostics.len() < self.diagnostics.capacity() {
            self.diagnostics.push(diagnostic);
        }
    }
}

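/// Consumes a single character and produces a token of the given kind.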
fn one(l: &mut Lexer<'_>, kind: TokenKind) -> TokenKind {
    l.advance();
    kind
}

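/// Consumes one character and produces `kind1`, unless the character after it
/// is `c2`, in which case both characters are consumed and `kind2` is produced.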
fn one_or_two(l: &mut Lexer<'_>, kind1: TokenKind, c2: char, kind2: TokenKind) -> TokenKind {
    l.advance();
    if l.current() == c2 {
        l.advance();
        kind2
    } else {
        kind1
    }
}

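/// Returns whether `c` may appear in an identifier.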
fn is_ident_char(c: char) -> bool {
    matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '_')
}

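/// Lexes an identifier, mapping reserved words to their keyword token kinds.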
fn ident(l: &mut Lexer<'_>) -> TokenKind {
    let start = l.position;
    while is_ident_char(l.current()) {
        l.advance();
    }
    let end = l.position;

    match Span::new(start, end).slice(l.input) {
        "_" => TokenKind::Underscore,
        "and" => TokenKind::And,
        "or" => TokenKind::Or,
        "if" => TokenKind::If,
        "else" => TokenKind::Else,
        "let" => TokenKind::Let,
        _ => TokenKind::Ident,
    }
}

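/// Lexes a tag: an identifier starting with an uppercase ASCII letter, such as
/// `True` or `False`.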
fn tag(l: &mut Lexer<'_>) -> TokenKind {
    while is_ident_char(l.current()) {
        l.advance();
    }
    TokenKind::Tag
}

// NOTE: You shouldn't expect that the numbers produced by the lexer are parsable.
fn number(l: &mut Lexer<'_>) -> TokenKind {
    while l.current().is_ascii_digit() {
        l.advance();
    }

    if l.current() == '.' {
        let dot = l.position;
        l.advance();
        if !l.current().is_ascii_digit() {
            l.emit(Diagnostic::error(
                Span::new(dot, l.position),
                "there must be at least a single digit after the decimal point",
            ));
        }
        while l.current().is_ascii_digit() {
            l.advance();
        }
    }

    TokenKind::Number
}

// NOTE: You shouldn't expect that the color literals produced by the lexer are parsable.
fn color(l: &mut Lexer<'_>) -> TokenKind {
    let hash = l.position;
    l.advance(); // #

    if !l.current().is_ascii_hexdigit() {
        l.emit(Diagnostic::error(
            Span::new(hash, l.position),
            "hex digits expected after `#` (color literal)",
        ));
    }

    let start = l.position;
    while l.current().is_ascii_hexdigit() {
        l.advance();
    }
    let len = l.position - start;

    if !matches!(len, 3 | 4 | 6 | 8) {
        l.emit(Diagnostic::error(
            Span::new(hash, l.position),
            "incorrect number of digits in color literal (must be #RGB, #RGBA, #RRGGBB, or #RRGGBBAA)",
        ));
    }

    TokenKind::Color
}

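/// Skips past whitespace and `--` line comments, which do not produce any
/// tokens.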
fn whitespace_and_comments(l: &mut Lexer<'_>) {
    loop {
        match l.current() {
            '-' => {
                let position = l.position;
                l.advance();
                if l.current() == '-' {
                    // A `--` line comment; skip to the end of the line.
                    // Also stop at '\0', so that a comment at the very end of
                    // input does not advance the lexer past the input's length.
                    while l.current() != '\n' && l.current() != '\0' {
                        l.advance();
                    }
                } else {
                    // An unfortunate little bit of backtracking here;
                    // this seems like the simplest possible solution though.
                    // We don't treat comments as a separate token to simplify the parsing phase,
                    // and because of this, handling this at the "real" token level would complicate
                    // things quite a bit.
                    l.position = position;
                    break;
                }
            }
            ' ' | '\r' | '\t' => l.advance(),
            _ => break,
        }
    }
}

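/// Lexes a newline token. Consecutive newlines (and any whitespace or comments
/// between them) are collapsed into a single token.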
fn newline(l: &mut Lexer<'_>) -> (TokenKind, Span) {
    let start = l.position;
    l.advance(); // skip the initial newline
    let end = l.position;

    // Skip additional newlines after this one, to only produce one token.
    // These do not count into this newline's span though.
    loop {
        whitespace_and_comments(l);
        if l.current() == '\n' {
            l.advance();
        } else {
            break;
        }
    }

    (TokenKind::Newline, Span::new(start, end))
}

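/// Lexes a single token, returning its kind together with its span in the
/// input.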
fn token(l: &mut Lexer<'_>) -> (TokenKind, Span) {
    whitespace_and_comments(l);

    let start = l.position;
    let kind = match l.current() {
        '\0' => TokenKind::Eof,

        // NOTE: Order matters here. Numbers and tags take priority over identifiers.
        c if c.is_ascii_uppercase() => tag(l),
        c if c.is_ascii_digit() => number(l),
        c if is_ident_char(c) => ident(l),
        '#' => color(l),

        '+' => one(l, TokenKind::Plus),
        '-' => one_or_two(l, TokenKind::Minus, '>', TokenKind::RArrow),
        '*' => one(l, TokenKind::Star),
        '/' => one(l, TokenKind::Slash),
        '=' => one_or_two(l, TokenKind::Equal, '=', TokenKind::EqualEqual),
        '!' => one_or_two(l, TokenKind::Not, '=', TokenKind::NotEqual),
        '<' => one_or_two(l, TokenKind::Less, '=', TokenKind::LessEqual),
        '>' => one_or_two(l, TokenKind::Greater, '=', TokenKind::GreaterEqual),

        '\n' => return newline(l),

        '(' => one(l, TokenKind::LParen),
        ')' => one(l, TokenKind::RParen),
        '[' => one(l, TokenKind::LBrack),
        ']' => one(l, TokenKind::RBrack),
        ',' => one(l, TokenKind::Comma),
        '\\' => one(l, TokenKind::Backslash),

        _ => {
            l.advance();
            l.emit(Diagnostic::error(
                Span::new(start, l.position),
                "unexpected character",
            ));
            TokenKind::Error
        }
    };
    let end = l.position;

    (kind, Span::new(start, end))
}

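/// Runs the lexer until end of input, pushing each token into `l.lexis`.
/// Fails with a `TokenAllocError` if the token buffer runs out of space.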
pub fn lex(l: &mut Lexer<'_>) -> Result<(), TokenAllocError> {
    loop {
        let (kind, span) = token(l);
        l.lexis.push(kind, span)?;
        if kind == TokenKind::Eof {
            break;
        }
    }
    Ok(())
}