syntax v2
introduce a new, more ergonomic syntax for haku. Not all features are implemented just yet. Still missing: - custom tags (non-True/False) - color literals - lists
This commit is contained in:
parent
a3e5e8bd10
commit
2595bf0d82
21 changed files with 2844 additions and 1062 deletions
237
crates/haku/src/lexer.rs
Normal file
237
crates/haku/src/lexer.rs
Normal file
|
@ -0,0 +1,237 @@
|
|||
use alloc::vec::Vec;
|
||||
|
||||
use crate::{
|
||||
diagnostic::Diagnostic,
|
||||
source::{SourceCode, Span},
|
||||
token::{Lexis, TokenAllocError, TokenKind},
|
||||
};
|
||||
|
||||
/// Lexer state: reads characters from `input` and produces tokens into
/// `lexis`, collecting `diagnostics` along the way.
pub struct Lexer<'a> {
    /// Destination for lexed tokens; `lex` pushes into this.
    pub lexis: Lexis,
    /// Diagnostics reported so far. `emit` only pushes while `len < capacity`,
    /// so it never allocates — presumably callers reserve capacity up front
    /// before lexing; verify against callers.
    pub diagnostics: Vec<Diagnostic>,
    /// The source text being lexed.
    input: &'a SourceCode,
    /// Byte offset of the next character to read (advanced by UTF-8 width
    /// in `advance`); `u32` to match `Span`'s offsets.
    position: u32,
}
|
||||
|
||||
impl<'a> Lexer<'a> {
|
||||
pub fn new(lexis: Lexis, input: &'a SourceCode) -> Self {
|
||||
Self {
|
||||
lexis,
|
||||
diagnostics: Vec::new(),
|
||||
input,
|
||||
position: 0,
|
||||
}
|
||||
}
|
||||
|
||||
fn current(&self) -> char {
|
||||
self.input[self.position as usize..]
|
||||
.chars()
|
||||
.next()
|
||||
.unwrap_or('\0')
|
||||
}
|
||||
|
||||
fn advance(&mut self) {
|
||||
self.position += self.current().len_utf8() as u32;
|
||||
}
|
||||
|
||||
fn emit(&mut self, diagnostic: Diagnostic) {
|
||||
if self.diagnostics.len() < self.diagnostics.capacity() {
|
||||
self.diagnostics.push(diagnostic);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Consumes exactly one character and returns `kind`; helper for
/// single-character tokens.
fn one(l: &mut Lexer<'_>, kind: TokenKind) -> TokenKind {
    l.advance();
    kind
}
|
||||
|
||||
/// Consumes one character and produces `kind1`, unless the following
/// character is `c2`, in which case both are consumed and `kind2` is
/// produced. Helper for tokens like `=`/`==` and `-`/`->`.
fn one_or_two(l: &mut Lexer<'_>, kind1: TokenKind, c2: char, kind2: TokenKind) -> TokenKind {
    l.advance();
    if l.current() != c2 {
        return kind1;
    }
    l.advance();
    kind2
}
|
||||
|
||||
/// Returns whether `c` may appear in an identifier: ASCII alphanumerics
/// and `_`.
fn is_ident_char(c: char) -> bool {
    c.is_ascii_alphanumeric() || c == '_'
}
|
||||
|
||||
fn ident(l: &mut Lexer<'_>) -> TokenKind {
|
||||
let start = l.position;
|
||||
while is_ident_char(l.current()) {
|
||||
l.advance();
|
||||
}
|
||||
let end = l.position;
|
||||
|
||||
match Span::new(start, end).slice(l.input) {
|
||||
"_" => TokenKind::Underscore,
|
||||
"and" => TokenKind::And,
|
||||
"or" => TokenKind::Or,
|
||||
"if" => TokenKind::If,
|
||||
"else" => TokenKind::Else,
|
||||
"let" => TokenKind::Let,
|
||||
_ => TokenKind::Ident,
|
||||
}
|
||||
}
|
||||
|
||||
fn tag(l: &mut Lexer<'_>) -> TokenKind {
|
||||
while is_ident_char(l.current()) {
|
||||
l.advance();
|
||||
}
|
||||
TokenKind::Tag
|
||||
}
|
||||
|
||||
// NOTE: You shouldn't expect that the numbers produced by the lexer are parsable.
|
||||
fn number(l: &mut Lexer<'_>) -> TokenKind {
|
||||
while l.current().is_ascii_digit() {
|
||||
l.advance();
|
||||
}
|
||||
|
||||
if l.current() == '.' {
|
||||
let dot = l.position;
|
||||
l.advance();
|
||||
if !l.current().is_ascii_digit() {
|
||||
l.emit(Diagnostic::error(
|
||||
Span::new(dot, l.position),
|
||||
"there must be at least a single digit after the decimal point",
|
||||
));
|
||||
}
|
||||
while l.current().is_ascii_digit() {
|
||||
l.advance();
|
||||
}
|
||||
}
|
||||
|
||||
TokenKind::Number
|
||||
}
|
||||
|
||||
// NOTE: You shouldn't expect that the color literals produced by the lexer are parsable.
|
||||
fn color(l: &mut Lexer<'_>) -> TokenKind {
|
||||
let hash = l.position;
|
||||
l.advance(); // #
|
||||
|
||||
if !l.current().is_ascii_hexdigit() {
|
||||
l.emit(Diagnostic::error(
|
||||
Span::new(hash, l.position),
|
||||
"hex digits expected after `#` (color literal)",
|
||||
));
|
||||
}
|
||||
|
||||
let start = l.position;
|
||||
while l.current().is_ascii_hexdigit() {
|
||||
l.advance();
|
||||
}
|
||||
let len = l.position - start;
|
||||
|
||||
if !matches!(len, 3 | 4 | 6 | 8) {
|
||||
l.emit(Diagnostic::error(Span::new(hash, l.position), "incorrect number of digits in color literal (must be #RGB, #RGBA, #RRGGBB, or #RRGGBBAA)"));
|
||||
}
|
||||
|
||||
TokenKind::Color
|
||||
}
|
||||
|
||||
fn whitespace_and_comments(l: &mut Lexer<'_>) {
|
||||
loop {
|
||||
match l.current() {
|
||||
'-' => {
|
||||
let position = l.position;
|
||||
l.advance();
|
||||
if l.current() == '-' {
|
||||
while l.current() != '\n' {
|
||||
l.advance();
|
||||
}
|
||||
} else {
|
||||
// An unfortunate little bit of backtracking here;
|
||||
// This seems like the simplest possible solution though.
|
||||
// We don't treat comments as a separate token to simplify the parsing phase,
|
||||
// and because of this, handling this at the "real" token level would complicate
|
||||
// things quite a bit.
|
||||
l.position = position;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
' ' | '\r' | '\t' => l.advance(),
|
||||
|
||||
_ => break,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn newline(l: &mut Lexer<'_>) -> (TokenKind, Span) {
|
||||
let start = l.position;
|
||||
l.advance(); // skip the initial newline
|
||||
let end = l.position;
|
||||
|
||||
// Skip additional newlines after this one, to only produce one token.
|
||||
// These do not count into this newline's span though.
|
||||
loop {
|
||||
whitespace_and_comments(l);
|
||||
if l.current() == '\n' {
|
||||
l.advance();
|
||||
continue;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
(TokenKind::Newline, Span::new(start, end))
|
||||
}
|
||||
|
||||
/// Lexes a single token: skips any leading whitespace and comments, then
/// dispatches on the first character. Returns the token kind together with
/// its span. At end of input this yields `TokenKind::Eof` (with an empty
/// span), and an unrecognized character yields `TokenKind::Error` plus a
/// diagnostic.
fn token(l: &mut Lexer<'_>) -> (TokenKind, Span) {
    whitespace_and_comments(l);

    let start = l.position;
    let kind = match l.current() {
        // `current()` returns '\0' once the input is exhausted.
        '\0' => TokenKind::Eof,

        // NOTE: Order matters here. Numbers and tags take priority over identifiers.
        c if c.is_ascii_uppercase() => tag(l),
        c if c.is_ascii_digit() => number(l),
        c if is_ident_char(c) => ident(l),

        '#' => color(l),

        '+' => one(l, TokenKind::Plus),
        // `-` here cannot start a comment: `whitespace_and_comments` above
        // already consumed any `--` comments, so this is minus or `->`.
        '-' => one_or_two(l, TokenKind::Minus, '>', TokenKind::RArrow),
        '*' => one(l, TokenKind::Star),
        '/' => one(l, TokenKind::Slash),
        '=' => one_or_two(l, TokenKind::Equal, '=', TokenKind::EqualEqual),
        '!' => one_or_two(l, TokenKind::Not, '=', TokenKind::NotEqual),
        '<' => one_or_two(l, TokenKind::Less, '=', TokenKind::LessEqual),
        '>' => one_or_two(l, TokenKind::Greater, '=', TokenKind::GreaterEqual),

        // Newlines are significant; `newline` computes its own span because
        // it collapses a run of newlines into a single token.
        '\n' => return newline(l),
        '(' => one(l, TokenKind::LParen),
        ')' => one(l, TokenKind::RParen),
        '[' => one(l, TokenKind::LBrack),
        ']' => one(l, TokenKind::RBrack),
        ',' => one(l, TokenKind::Comma),
        '\\' => one(l, TokenKind::Backslash),

        _ => {
            // Consume the offending character so lexing makes progress.
            l.advance();
            l.emit(Diagnostic::error(
                Span::new(start, l.position),
                "unexpected character",
            ));
            TokenKind::Error
        }
    };
    let end = l.position;
    (kind, Span::new(start, end))
}
|
||||
|
||||
pub fn lex(l: &mut Lexer<'_>) -> Result<(), TokenAllocError> {
|
||||
loop {
|
||||
let (kind, span) = token(l);
|
||||
l.lexis.push(kind, span)?;
|
||||
if kind == TokenKind::Eof {
|
||||
break;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue