rutile/compiler/lex.c
#include <stdio.h> /* feof, ferror, fread, FILE, EOF */
#include <stdlib.h> /* malloc calloc free */
#include <string.h> /* memset */
#include "lex.h"
#include "messages.h"
#include "pre.h"
#include "libs/stb_ds.h"
#define LEX_BUFFER_SIZE 8192
#define LEX_HALF_BUFFER_SIZE (LEX_BUFFER_SIZE / 2)
#define LEX_BUFFER_SENTINEL '\0'
#define MAX_IDENT_SIZE 1024u
#define STRING_LITERAL_BASE_SIZE 255
#define STRING_LITERAL_MAX_SIZE 4096
#define at_buffer_end(ls) (*(ls)->fwd == LEX_BUFFER_SENTINEL)
#define ascii_isident(c) ((c) == '_' || (c) == '?' || (c) == '!' || ascii_isalnum(c))
#define ascii_isident_start(c) ((c) == '_' || ascii_isalpha(c))
#define lex_error(ls, ...) do { \
error((ls)->cm, &(ls)->cur_loc, __VA_ARGS__); \
} while(0)
#define lex_fatal(ls, ...) do { \
fatal((ls)->cm, &(ls)->cur_loc, __VA_ARGS__); \
} while(0)
typedef Optional(u8) MaybeChr;
const char *TokenIdStr[T_TOKEN_COUNT] = {
[T_INVALID] = "(invalid token)",
[T_PLUS] = "+",
[T_MINUS] = "-",
[T_STAR] = "*",
[T_BAR] = "/",
[T_EXCLAMATION] = "!",
[T_LPAREN] = "(",
[T_RPAREN] = ")",
[T_COMMA] = ",",
[T_LESSTHAN] = "<",
[T_GREATTHAN] = ">",
[T_LOGAND] = "and",
[T_LOGOR] = "or",
[T_EQUAL] = "=",
[T_LOGICEQUAL] = "==",
[T_NOTEQUAL] = "!=",
[T_HASH] = "#",
[T_COLON] = ":",
[T_SEMICOLON] = ";",
[T_LBRACKET] = "[",
[T_RBRACKET] = "]",
[T_LBRACE] = "{",
[T_RBRACE] = "}",
[T_IDENT] = "(identifier)",
[T_STRING] = "(string literal)",
[T_NUMBER] = "(number)",
[T_DECNUMBER] = "(decimal number)",
[T_CONST] = "const",
[T_DISCARD] = "discard",
[T_ELIF] = "elif",
[T_ELSE] = "else",
[T_END] = "end",
[T_IF] = "if",
[T_LET] = "let",
[T_PROC] = "proc",
[T_RETURN] = "return",
[T_VAR] = "var",
[T_WHILE] = "while",
[T_STRUCT] = "struct",
[T_USE] = "use",
[T_BREAK] = "break",
[T_NEXT] = "next",
[T_EOF] = "(EOF)",
[T_ERROR] = "(error)",
};
/* Simple, locale-independent ASCII character class checks */
static bool
ascii_isdigit(u32 c)
{
return c >= '0' && c <= '9';
}
static bool
ascii_isalpha(u32 c)
{
return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
}
static bool
ascii_isspace(u32 c)
{
return c == ' ' || (c >= '\t' && c <= '\r');
}
static bool
ascii_isalnum(u32 c)
{
return ascii_isalpha(c) || ascii_isdigit(c);
}
static void
update_line_count(LexState *ls, u8 chr)
{
switch (chr) {
case '\n':
ls->cur_loc.column = 1;
++ls->cur_loc.line;
break;
case '\t': /* fallthrough */
default:
++ls->cur_loc.column;
}
}
static u8
peek(LexState *ls)
{
return *ls->fwd;
}
/* Moves the forward pointer and the column count by one */
static void
next(LexState *ls)
{
++ls->fwd;
++ls->cur_loc.column;
}
static void
backup(LexState *ls, int n)
{
ls->fwd -= n;
if (*ls->fwd == '\n')
--ls->cur_loc.line;
/* not quite right if fwd is \n... */
--ls->cur_loc.column;
}
static bool
read_buf(LexState *ls, u8 *buf, isize n, isize *ar)
{
if (feof(ls->input_fp))
return false;
const isize rb = fread(buf, sizeof(*buf), n, ls->input_fp);
if (ferror(ls->input_fp)) {
fatal(ls->cm, nil, "could not read input file\n");
}
*ar = rb;
buf[rb] = LEX_BUFFER_SENTINEL;
return true;
}
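/* The input is double-buffered: `buf` holds two halves of LEX_HALF_BUFFER_SIZE
* bytes each, and read_buf() writes a LEX_BUFFER_SENTINEL right after the bytes
* it actually read. Hitting '\0' at the forward pointer therefore means
* "possibly the end of the current half": reload_buffers() refills the other
* half and moves `fwd` (and `lbegin`) to its start. If `fwd` was not sitting at
* the end of either half, the '\0' was a literal NUL byte in the source and
* scanning simply continues past it.
*/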
static bool
reload_buffers(LexState *ls)
{
//if ((ls->fwd != ls->buf + ls->buflen) || (ls->fwd != ls->buf + ls->buflen2))
// lex_fatal(ls, "invalid nil byte in middle of source file");
const u8 *end_of_buf1 = ls->buf + ls->buflen;
const u8 *end_of_buf2 = ls->buf + LEX_HALF_BUFFER_SIZE + ls->buflen2;
if (ls->fwd == end_of_buf1) { /* end of first buffer */
u8 *buf2 = ls->buf + LEX_HALF_BUFFER_SIZE;
if (!read_buf(ls, buf2, LEX_HALF_BUFFER_SIZE, &ls->buflen2))
return false; /* reached EOF, no more data */
ls->fwd = buf2;
} else if (ls->fwd == end_of_buf2) { /* end of second buffer */
u8 *buf1 = ls->buf;
if (!read_buf(ls, buf1, LEX_HALF_BUFFER_SIZE, &ls->buflen))
return false; /* reached EOF, no more data */
ls->fwd = buf1;
}
/* reset pointers back to the beginning of the buffer */
ls->lbegin = ls->fwd;
return true;
}
static MaybeChr
read_chr(LexState *ls)
{
u8 chr = peek(ls);
if (chr == LEX_BUFFER_SENTINEL) { /* maybe end of buffer */
if (!reload_buffers(ls))
return None(MaybeChr);
chr = peek(ls); /* fwd now points into the freshly filled half */
}
update_line_count(ls, chr);
return Some(MaybeChr, *ls->fwd++);
}
static MaybeChr
skip_whitespace(LexState *ls)
{
/*
* [ abc = 2*9 - 1 ]
* ^-fwd, lbegin
* [ abc = 2*9 - 1 ]
* lbegin-^^-fwd
*/
MaybeChr c;
for (;;) {
c = read_chr(ls);
if (!c.ok)
return None(MaybeChr);
if (!ascii_isspace(c.val))
break;
++ls->lbegin;
}
return c;
}
static LexToken
make_error(void)
{
return (LexToken){ .id = T_ERROR };
}
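/* Returns the canonical copy of `ident` stored in the identifier set,
* inserting it first if this spelling has not been seen before. Every
* occurrence of the same identifier thus shares one arena-owned key.
*/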
static u8 *
intern_identifier(LexState *ls, u8 *ident)
{
IdentsBucket *entry;
if ((entry = shgetp_null(ls->idents, ident)) == nil) {
shput(ls->idents, ident, 0);
return (u8 *)shgets(ls->idents, ident).key;
}
return (u8 *)entry->key;
}
/*
* *──┬(ident)┬──*
* ╰───<───╯
*/
static LexToken
identifier(LexState *ls)
{
LexToken token = { .loc = ls->cur_loc };
/* this gets copied to the hash table arena, no problem */
u8 ident_buf[MAX_IDENT_SIZE];
usize i = 0;
MaybeChr chr = { .val = *ls->lbegin, .ok = true };
while (chr.ok && ascii_isident(chr.val)) {
if (i + 1 == MAX_IDENT_SIZE) {
lex_error(ls, "identifier is too long (max: %u)\n", MAX_IDENT_SIZE);
return make_error();
}
ident_buf[i++] = chr.val;
chr = read_chr(ls);
}
ident_buf[i] = '\0';
/* ate 1 extra character, give it back */
if (chr.ok)
backup(ls, 1);
token.id = T_IDENT;
token.ident = Str_from_buf(intern_identifier(ls, ident_buf), i);
token.len = i;
return token;
}
static LexToken
string_literal(LexState *ls)
{
LexToken token = { .loc = ls->cur_loc };
isize str_buf_len = STRING_LITERAL_BASE_SIZE;
u8 *str_buf = malloc(str_buf_len);
isize i = 0;
/* the opening " was already consumed by lex_scan; read the first content byte */
MaybeChr chr = read_chr(ls);
while (chr.ok && chr.val != '"') {
if (chr.val == '\n') {
free(str_buf);
lex_error(ls, "unterminated string literal");
return make_error();
}
if (i + 1 == STRING_LITERAL_MAX_SIZE) {
free(str_buf);
lex_error(ls, "string literal length exceeds maximum of %d bytes", STRING_LITERAL_MAX_SIZE);
return make_error();
}
if (i + 1 >= str_buf_len) { /* keep room for the terminating NUL */
str_buf = realloc(str_buf, str_buf_len *= 2);
}
str_buf[i++] = chr.val;
chr = read_chr(ls);
}
if (!chr.ok) { /* hit EOF before the closing " */
free(str_buf);
lex_error(ls, "unterminated string literal");
return make_error();
}
if (i > 0) {
str_buf[i] = '\0';
} else { /* empty literal */
free(str_buf); /* we wasted our time */
str_buf = nil;
}
token.id = T_STRING;
token.str = Str_from_buf(str_buf, i);
token.len = i;
return token;
}
/* Identifies a numeric literal that may have a prefix:
*
* ('0')─┬──────────────────────┬─*
* ├('b')╭──┬(digit)┬─────╯
* ├('o')┤ ╰───<───╯
* ╰('x')╯
* Indirectly based on a BSD (?) implementation.
*/
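/* A few literals as the scanner below handles them (values land in
* token.inumber):
*   42      base 10 -> 42
*   0b1010  base 2  -> 10
*   0o17    base 8  -> 15
*   0xff    base 16 -> 255 (hex digits must be lowercase)
*   017     error: "use '0o' for an octal literal"
*/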
static LexToken
number_literal(LexState *ls)
{
LexToken token = { .id = T_NUMBER, .loc = ls->cur_loc };
u64 number = 0;
u8 base = 10;
MaybeChr chr = { .val = *ls->lbegin, .ok = true };
if (chr.val == '0') {
chr = read_chr(ls); /* skip 0 prefix */
if (!chr.ok) { /* EOF edge case */
return token; /* 0 */
}
switch (chr.val) {
case 'b':
base = 2;
break;
case 'o':
base = 8;
break;
case 'x':
base = 16;
break;
default:
if (ascii_isdigit(chr.val)) {
lex_error(ls, "use '0o' for an octal literal");
return make_error();
}
//lex_error(ls, "unknown numeric prefix '0%c'", chr.val);
/* start of another token */
return token; /* 0 */
}
chr = read_chr(ls);
if (!chr.ok) {
lex_error(ls, "expected a digit after the base prefix");
return make_error();
}
}
const u64 mmax = U64_MAX / base;
static const u8 digits[] = "0123456789abcdef";
while (chr.ok) {
u8 *digitp = memchr(digits, chr.val, lengthof(digits));
if (digitp == nil)
break;
u8 digit = digitp - digits;
if (digit >= base) {
lex_error(ls, "invalid literal");
return make_error();
}
if (number > mmax)
goto overflow;
number *= base;
/* overflow for adding the digit */
if (U64_MAX - digit < number)
goto overflow;
number += digit;
chr = read_chr(ls);
}
if (chr.ok)
backup(ls, 1);
token.inumber = number;
return token;
overflow:
lex_error(ls, "integer literal is too big (2^64 max)");
return make_error();
}
static LexToken
keyword(LexToken *t)
{
#define kwcmp(ident, kw, tid) \
{if (Str_equal(ident, kw)) return (LexToken){ .id = tid, .len = kw.len, .loc = t->loc };}
Str ident = t->ident;
--ident.len;
switch (*ident.s++) {
case 'a':
kwcmp(ident, Sl("nd"), T_LOGAND);
break;
case 'b':
kwcmp(ident, Sl("reak"), T_BREAK);
break;
case 'c':
kwcmp(ident, Sl("onst"), T_CONST);
break;
case 'd':
kwcmp(ident, Sl("iscard"), T_DISCARD);
break;
case 'e':
kwcmp(ident, Sl("nd"), T_END);
kwcmp(ident, Sl("lse"), T_ELSE);
kwcmp(ident, Sl("lif"), T_ELIF);
break;
case 'i':
kwcmp(ident, Sl("f"), T_IF);
break;
case 'l':
kwcmp(ident, Sl("et"), T_LET);
break;
case 'n':
kwcmp(ident, Sl("ot"), T_LOGNOT);
kwcmp(ident, Sl("ext"), T_NEXT);
break;
case 'o':
kwcmp(ident, Sl("r"), T_LOGOR);
break;
case 'p':
kwcmp(ident, Sl("roc"), T_PROC);
break;
case 'r':
kwcmp(ident, Sl("eturn"), T_RETURN);
break;
case 's':
kwcmp(ident, Sl("truct"), T_STRUCT);
break;
case 'v':
kwcmp(ident, Sl("ar"), T_VAR);
break;
case 'w':
kwcmp(ident, Sl("hile"), T_WHILE);
break;
case 'u':
kwcmp(ident, Sl("se"), T_USE);
break;
}
return *t;
#undef kwcmp
}
LexToken
lex_scan(LexState *ls)
{
#define TOKEN(chr, t) case chr: token.id = t; break;
if (arrlen(ls->backlist) > 0) {
return arrpop(ls->backlist);
}
/* lexeme start pointer */
ls->lbegin = ls->fwd;
LexToken token = {0};
MaybeChr c = skip_whitespace(ls);
if (!c.ok) {
token.id = T_EOF;
ls->eof = true;
return token;
}
token.loc = ls->cur_loc;
//trace("token now: '%c'\n", c.val);
//trace("lp: <%s>\n", ls->lbegin);
//trace("fwd: <%s>\n", ls->fwd);
switch (c.val) {
case '!':
if (peek(ls) == '=') {
token.id = T_NOTEQUAL;
next(ls);
} else {
token.id = T_EXCLAMATION;
}
break;
TOKEN('+', T_PLUS)
TOKEN('-', T_MINUS)
TOKEN('*', T_STAR)
TOKEN('/', T_BAR)
TOKEN('(', T_LPAREN)
TOKEN(')', T_RPAREN)
TOKEN(',', T_COMMA)
TOKEN('<', T_LESSTHAN)
TOKEN('>', T_GREATTHAN)
TOKEN('#', T_HASH)
TOKEN(':', T_COLON)
TOKEN(';', T_SEMICOLON)
TOKEN('[', T_LBRACKET)
TOKEN(']', T_RBRACKET)
TOKEN('{', T_LBRACE)
TOKEN('}', T_RBRACE)
case '=':
if (peek(ls) == '=') {
token.id = T_LOGICEQUAL;
next(ls);
} else {
token.id = T_EQUAL;
}
break;
case '"':
return string_literal(ls);
case '0' ... '9':
return number_literal(ls);
default: {
const u8 uc = c.val;
if (ascii_isident_start(uc)) {
LexToken ident_or_keyword = identifier(ls);
if (ident_or_keyword.id != T_IDENT)
return make_error();
return keyword(&ident_or_keyword);
}
if (uc > 0x7f) /* past DEL (0x7F), the last ASCII character */
lex_error(ls, "unicode tokens aren't allowed yet");
else
lex_error(ls, "unknown token '%c' (\\x%02x)", uc, uc);
return make_error();
}
}
return token;
#undef TOKEN
}
/* Put a token into the backlist. The next call to `lex_scan` will return this
* token. The backlist is a stack of tokens, so technically you can have unlimited
* look-ahead at the cost of memory.
*/
void
lex_backup(LexState *ls, LexToken token)
{
arrput(ls->backlist, token);
}
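/* Illustrative single-token lookahead (a sketch, not an actual call site):
*
*     LexToken t = lex_scan(ls);
*     if (t.id != T_COMMA) {
*         lex_backup(ls, t); // not ours to consume, hand it back
*         return;
*     }
*/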
/* Checks whether `token`'s type equals `exp_tok`, reporting an error if it does not. This does not consume any token. */
bool
lex_match(LexState *ls, LexToken *token, enum LexTokenId exp_tok)
{
if (token->id != exp_tok) {
lex_error(ls, "expected '%s' but got '%s' instead\n",
TokenIdStr[exp_tok], TokenIdStr[token->id]);
return false;
}
return true;
}
LexState *
lex_new(Compiler *cm, FILE *input_fp, Str file_name, usize tabsize)
{
LexState *ls = calloc(1, sizeof(*ls));
ls->buf = calloc(LEX_BUFFER_SIZE + 1, sizeof(*ls->buf));
ls->lbegin = ls->fwd = ls->buf;
ls->tabsize = tabsize;
ls->input_fp = input_fp;
ls->cur_loc.line = 1;
ls->cur_loc.source = file_name;
ls->cm = cm;
/* We use a hash table with string keys as a set of all identifiers in a
* compilation unit, to avoid duplicate allocations of the same spelling.
*/
sh_new_arena(ls->idents);
/* We provide our own buffering scheme */
setbuf(input_fp, nil);
/* Initial fill of the first buffer half.
* Any read error is caught inside read_buf(); the only thing that can
* happen here is that the file is empty, which means an immediate EOF.
*/
read_buf(ls, ls->buf, LEX_HALF_BUFFER_SIZE, &ls->buflen);
return ls;
}
/* Destroys a lexing context and frees its allocated memory.
* Note that this will also deallocate the identifier arena.
*/
void
lex_destroy(LexState *ls)
{
shfree(ls->idents);
arrfree(ls->backlist);
free(ls->buf);
free(ls);
}
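/* Minimal driver sketch. The Compiler object `cm`, the source path, and the
* tab size are assumptions supplied by the caller; error handling is omitted:
*
*     FILE *fp = fopen("main.rt", "rb");
*     LexState *ls = lex_new(cm, fp, Sl("main.rt"), 8);
*     for (;;) {
*         LexToken t = lex_scan(ls);
*         if (t.id == T_EOF || t.id == T_ERROR)
*             break;
*         // feed `t` to the parser
*     }
*     lex_destroy(ls);
*     fclose(fp);
*/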