rkgk/crates/haku/src/lexer.rs

use alloc::vec::Vec;

use crate::{
    diagnostic::Diagnostic,
    source::{SourceCode, Span},
    token::{Lexis, TokenAllocError, TokenKind},
};

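/// The lexer for haku source code. Tokens are pushed into `lexis` as they are
/// produced, and any errors encountered along the way are collected into
/// `diagnostics`.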
pub struct Lexer<'a> {
    pub lexis: Lexis,
    pub diagnostics: Vec<Diagnostic>,

    input: &'a SourceCode,
    position: u32,
}

impl<'a> Lexer<'a> {
    pub fn new(lexis: Lexis, input: &'a SourceCode) -> Self {
        Self {
            lexis,
            diagnostics: Vec::new(),
            input,
            position: 0,
        }
    }

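    /// Returns the character at the current position, or `'\0'` once the end
    /// of input is reached.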
    fn current(&self) -> char {
        self.input[self.position as usize..]
            .chars()
            .next()
            .unwrap_or('\0')
    }

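    /// Advances the position past the current character.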
    fn advance(&mut self) {
        self.position += self.current().len_utf8() as u32;
    }

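    /// Records a diagnostic. Diagnostics that would grow `diagnostics` past
    /// its preallocated capacity are dropped silently.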
    fn emit(&mut self, diagnostic: Diagnostic) {
        if self.diagnostics.len() < self.diagnostics.capacity() {
            self.diagnostics.push(diagnostic);
        }
    }
}

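/// Consumes a single character and produces a token of the given kind.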
fn one(l: &mut Lexer<'_>, kind: TokenKind) -> TokenKind {
    l.advance();
    kind
}

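/// Consumes one character and produces `kind1`, unless the character after it
/// is `c2`, in which case both characters are consumed and `kind2` is produced.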
fn one_or_two(l: &mut Lexer<'_>, kind1: TokenKind, c2: char, kind2: TokenKind) -> TokenKind {
    l.advance();
    if l.current() == c2 {
        l.advance();
        kind2
    } else {
        kind1
    }
}

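/// Returns whether `c` may appear in an identifier.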
fn is_ident_char(c: char) -> bool {
    matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '_')
}

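/// Lexes an identifier, mapping reserved words to their keyword token kinds.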
fn ident(l: &mut Lexer<'_>) -> TokenKind {
    let start = l.position;
    while is_ident_char(l.current()) {
        l.advance();
    }
    let end = l.position;

    match Span::new(start, end).slice(l.input) {
        "_" => TokenKind::Underscore,
        "and" => TokenKind::And,
        "or" => TokenKind::Or,
        "if" => TokenKind::If,
        "else" => TokenKind::Else,
        "let" => TokenKind::Let,
        _ => TokenKind::Ident,
    }
}

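/// Lexes a tag: an identifier starting with an uppercase ASCII letter, such as
/// `True` or `False`.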
fn tag(l: &mut Lexer<'_>) -> TokenKind {
    while is_ident_char(l.current()) {
        l.advance();
    }
    TokenKind::Tag
}

// NOTE: You shouldn't expect that the numbers produced by the lexer are parsable.
fn number(l: &mut Lexer<'_>) -> TokenKind {
    while l.current().is_ascii_digit() {
        l.advance();
    }

    if l.current() == '.' {
        let dot = l.position;
        l.advance();
        if !l.current().is_ascii_digit() {
            l.emit(Diagnostic::error(
                Span::new(dot, l.position),
                "there must be at least a single digit after the decimal point",
            ));
        }
        while l.current().is_ascii_digit() {
            l.advance();
        }
    }

    TokenKind::Number
}

// NOTE: You shouldn't expect that the color literals produced by the lexer are parsable.
fn color(l: &mut Lexer<'_>) -> TokenKind {
    let hash = l.position;
    l.advance(); // #

    if !l.current().is_ascii_hexdigit() {
        l.emit(Diagnostic::error(
            Span::new(hash, l.position),
            "hex digits expected after `#` (color literal)",
        ));
    }

    let start = l.position;
    while l.current().is_ascii_hexdigit() {
        l.advance();
    }
    let len = l.position - start;

    if !matches!(len, 3 | 4 | 6 | 8) {
        l.emit(Diagnostic::error(
            Span::new(hash, l.position),
            "incorrect number of digits in color literal (must be #RGB, #RGBA, #RRGGBB, or #RRGGBBAA)",
        ));
    }

    TokenKind::Color
}

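/// Skips past whitespace and `--` line comments, which do not produce any
/// tokens.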
fn whitespace_and_comments(l: &mut Lexer<'_>) {
    loop {
        match l.current() {
            '-' => {
                let position = l.position;
                l.advance();
                if l.current() == '-' {
                    // A `--` line comment; skip to the end of the line.
                    // Also stop at '\0', so that a comment at the very end of
                    // input does not advance the lexer past the input's length.
                    while l.current() != '\n' && l.current() != '\0' {
                        l.advance();
                    }
                } else {
                    // An unfortunate little bit of backtracking here;
                    // this seems like the simplest possible solution though.
                    // We don't treat comments as a separate token to simplify the parsing phase,
                    // and because of this, handling this at the "real" token level would complicate
                    // things quite a bit.
                    l.position = position;
                    break;
                }
            }
            ' ' | '\r' | '\t' => l.advance(),
            _ => break,
        }
    }
}

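/// Lexes a newline token. Consecutive newlines (and any whitespace or comments
/// between them) are collapsed into a single token.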
fn newline(l: &mut Lexer<'_>) -> (TokenKind, Span) {
    let start = l.position;
    l.advance(); // skip the initial newline
    let end = l.position;

    // Skip additional newlines after this one, to only produce one token.
    // These do not count into this newline's span though.
    loop {
        whitespace_and_comments(l);
        if l.current() == '\n' {
            l.advance();
        } else {
            break;
        }
    }

    (TokenKind::Newline, Span::new(start, end))
}

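/// Lexes a single token, returning its kind together with its span in the
/// input.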
fn token(l: &mut Lexer<'_>) -> (TokenKind, Span) {
    whitespace_and_comments(l);

    let start = l.position;
    let kind = match l.current() {
        '\0' => TokenKind::Eof,

        // NOTE: Order matters here. Numbers and tags take priority over identifiers.
        c if c.is_ascii_uppercase() => tag(l),
        c if c.is_ascii_digit() => number(l),
        c if is_ident_char(c) => ident(l),
        '#' => color(l),

        '+' => one(l, TokenKind::Plus),
        '-' => one_or_two(l, TokenKind::Minus, '>', TokenKind::RArrow),
        '*' => one(l, TokenKind::Star),
        '/' => one(l, TokenKind::Slash),
        '=' => one_or_two(l, TokenKind::Equal, '=', TokenKind::EqualEqual),
        '!' => one_or_two(l, TokenKind::Not, '=', TokenKind::NotEqual),
        '<' => one_or_two(l, TokenKind::Less, '=', TokenKind::LessEqual),
        '>' => one_or_two(l, TokenKind::Greater, '=', TokenKind::GreaterEqual),

        '\n' => return newline(l),

        '(' => one(l, TokenKind::LParen),
        ')' => one(l, TokenKind::RParen),
        '[' => one(l, TokenKind::LBrack),
        ']' => one(l, TokenKind::RBrack),
        ',' => one(l, TokenKind::Comma),
        '\\' => one(l, TokenKind::Backslash),

        _ => {
            l.advance();
            l.emit(Diagnostic::error(
                Span::new(start, l.position),
                "unexpected character",
            ));
            TokenKind::Error
        }
    };
    let end = l.position;

    (kind, Span::new(start, end))
}

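/// Runs the lexer until end of input, pushing each token into `l.lexis`.
/// Fails with a `TokenAllocError` if the token buffer runs out of space.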
pub fn lex(l: &mut Lexer<'_>) -> Result<(), TokenAllocError> {
    loop {
        let (kind, span) = token(l);
        l.lexis.push(kind, span)?;
        if kind == TokenKind::Eof {
            break;
        }
    }
    Ok(())
}