rutile/compiler/lex.c

#include <stdio.h> /* feof, ferror, fread, FILE, EOF */
#include <stdlib.h> /* malloc calloc free */
#include <string.h> /* memset */

#include "lex.h"
#include "messages.h"
#include "pre.h"
#include "libs/stb_ds.h"

/* Total size of the double buffer; each half holds LEX_HALF_BUFFER_SIZE bytes.
 * The half size is parenthesized so it expands safely inside any expression
 * (e.g. `x % LEX_HALF_BUFFER_SIZE`).
 */
#define LEX_BUFFER_SIZE 8192
#define LEX_HALF_BUFFER_SIZE (LEX_BUFFER_SIZE / 2)
/* Byte written right after the valid data of a buffer half */
#define LEX_BUFFER_SENTINEL '\0'

/* Size of the on-stack identifier buffer, terminator included */
#define MAX_IDENT_SIZE 1024u
/* Initial and maximum sizes (in bytes) of a string literal buffer */
#define STRING_LITERAL_BASE_SIZE 255
#define STRING_LITERAL_MAX_SIZE 4096

/* True when the forward pointer sits on a sentinel byte */
#define at_buffer_end(ls) (*(ls)->fwd == LEX_BUFFER_SENTINEL)
/* Identifier character classes. NOTE: `c` is evaluated more than once, do not
 * pass an expression with side effects.
 */
#define ascii_isident(c) ((c) == '_' || (c) == '?' || (c) == '!' || ascii_isalnum(c))
#define ascii_isident_start(c) ((c) == '_' || ascii_isalpha(c))

/* Reports a diagnostic at the lexer's current location: forwards the compiler
 * handle, the address of `cur_loc` and the printf-style arguments to error().
 * Callers in this file keep running afterwards and return an error token.
 */
#define lex_error(ls, ...) do {							\
		error((ls)->cm, &(ls)->cur_loc, __VA_ARGS__);	\
	} while(0)

/* Same forwarding as lex_error, but to fatal(). */
#define lex_fatal(ls, ...) do {							\
		fatal((ls)->cm, &(ls)->cur_loc, __VA_ARGS__);	\
	} while(0)

/* Optional byte produced by the character readers: `.ok` tells whether a byte
 * was read and `.val` holds it. Values are built with Some()/None().
 */
typedef Optional(u8) MaybeChr;

const char *TokenIdStr[T_TOKEN_COUNT] = {
	[T_INVALID] = "(invalid token)",
	[T_PLUS] = "+",
	[T_MINUS] = "-",
	[T_STAR] = "*",
	[T_BAR] = "/",
	[T_EXCLAMATION] = "!",
	[T_LPAREN] = "(",
	[T_RPAREN] = ")",
	[T_COMMA] = ",",
	[T_LESSTHAN] = "<",
	[T_GREATTHAN] = ">",
	[T_LOGAND] = "and",
	[T_LOGOR] = "or",
	[T_EQUAL] = "=",
	[T_LOGICEQUAL] = "==",
	[T_NOTEQUAL] = "!=",
	[T_HASH] = "#",
	[T_COLON] = ":",
	[T_SEMICOLON] = ";",
	[T_LBRACKET] = "[",
	[T_RBRACKET] = "]",
	[T_LBRACE] = "{",
	[T_RBRACE] = "}",
	[T_IDENT] = "(identifier)",
	[T_STRING] = "(string literal)",
	[T_NUMBER] = "(number)",
	[T_DECNUMBER] = "(decimal number)",
	[T_CONST] = "const",
	[T_DISCARD] = "discard",
	[T_ELIF] = "elif",
	[T_ELSE] = "else",
	[T_END] = "end",
	[T_IF] = "if",
	[T_LET] = "let",
	[T_PROC] = "proc",
	[T_RETURN] = "return",
	[T_VAR] = "var",
	[T_WHILE] = "while",
	[T_STRUCT] = "struct",
	[T_USE] = "use",
	[T_BREAK] = "break",
	[T_NEXT] = "next",
	[T_EOF] = "(EOF)",
	[T_ERROR] = "(error)",
};

/* Locale-independent ASCII character class predicates */
/* True when `c` is one of the ASCII decimal digits '0'..'9'. */
static bool
ascii_isdigit(u32 c)
{
	if (c < '0')
		return false;
	return c <= '9';
}

/* True when `c` is an ASCII letter, lower or upper case. */
static bool
ascii_isalpha(u32 c)
{
	const bool is_lower = c >= 'a' && c <= 'z';
	const bool is_upper = c >= 'A' && c <= 'Z';

	return is_lower || is_upper;
}

/* True for the ASCII blanks: space and the control characters from
 * horizontal tab (0x09) through carriage return (0x0d).
 */
static bool
ascii_isspace(u32 c)
{
	switch (c) {
	case ' ':
	case '\t':
	case '\n':
	case '\v':
	case '\f':
	case '\r':
		return true;
	default:
		return false;
	}
}

/* True when `c` is an ASCII letter or an ASCII decimal digit. */
static bool
ascii_isalnum(u32 c)
{
	if (ascii_isalpha(c))
		return true;
	return ascii_isdigit(c);
}

/* Advances the current source location past `chr`.
 * A newline moves to the next line and resets the column to 1; any other
 * byte (tabs included) advances the column by one.
 */
static void
update_line_count(LexState *ls, u8 chr)
{
	if (chr == '\n') {
		ls->cur_loc.column = 1;
		++ls->cur_loc.line;
		return;
	}
	/* tabs are not special-cased: one column like everything else */
	++ls->cur_loc.column;
}

/* Returns the byte under the forward pointer without consuming it.
 * No buffer reload happens here, so the result may be the sentinel.
 */
static u8
peek(LexState *ls)
{
	return ls->fwd[0];
}

/* Moves the forward pointer back by `n` bytes and adjusts the location.
 * The column is always decremented by one; the line is decremented when the
 * byte now under the forward pointer is a newline (the resulting column is
 * not quite right in that case).
 */
static void
backup(LexState *ls, int n)
{
	ls->fwd = ls->fwd - n;
	if (ls->fwd[0] == '\n')
		ls->cur_loc.line -= 1;
	ls->cur_loc.column -= 1;
}

/* Reads up to `n` bytes from the input file into `buf`.
 *
 * The number of bytes read is stored in `*ar` and a sentinel byte is written
 * at `buf[*ar]`, so `buf` must have room for `n + 1` bytes.
 * A read error is reported through fatal().
 *
 * Returns true only when at least one byte was read. A read that hits EOF
 * without delivering data (file size is an exact multiple of the chunk size)
 * returns false, so callers never mistake the sentinel for input.
 */
static bool
read_buf(LexState *ls, u8 *buf, isize n, isize *ar)
{
	if (feof(ls->input_fp))
		return false;

	const isize rb = fread(buf, sizeof(*buf), n, ls->input_fp);
	if (ferror(ls->input_fp)) {
		fatal(ls->cm, nil, "could not read input file\n");
	}
	*ar = rb;
	buf[rb] = LEX_BUFFER_SENTINEL;
	return rb > 0;
}

/* Refills the buffer half that is not currently being read.
 *
 * When the forward pointer sits at the end of the first half, the second half
 * is filled and the forward pointer jumps to its start, and vice versa.
 * Returns false when read_buf reports that no more data is available; in that
 * case the pointers are left untouched.
 *
 * NOTE: when the forward pointer is at neither end (a nil byte in the middle
 * of the source), nothing is read: only the lexeme start is reset and true is
 * returned.
 */
static bool
reload_buffers(LexState *ls)
{
	u8 *const first_half = ls->buf;
	u8 *const second_half = ls->buf + LEX_HALF_BUFFER_SIZE;

	/* both ends are computed up front, before any length is updated */
	const bool at_first_end = ls->fwd == first_half + ls->buflen;
	const bool at_second_end = ls->fwd == second_half + ls->buflen2;

	if (at_first_end) {
		if (!read_buf(ls, second_half, LEX_HALF_BUFFER_SIZE, &ls->buflen2))
			return false; /* reached EOF, no more data */
		ls->fwd = second_half;
	} else if (at_second_end) {
		if (!read_buf(ls, first_half, LEX_HALF_BUFFER_SIZE, &ls->buflen))
			return false; /* reached EOF, no more data */
		ls->fwd = first_half;
	}

	/* the lexeme now starts where reading resumes */
	ls->lbegin = ls->fwd;
	return true;
}

/* Consumes and returns the next input byte, reloading the buffers when the
 * forward pointer sits on the sentinel.
 *
 * Returns None when reload_buffers reports that no more data is available.
 * The source location is advanced according to the byte actually returned.
 */
static MaybeChr
read_chr(LexState *ls)
{
	u8 chr = peek(ls);
	if (chr == LEX_BUFFER_SENTINEL) { /* maybe end of buffer */
		if (!reload_buffers(ls))
			return None(MaybeChr);
		/* the forward pointer may have moved to a fresh half: look again,
		 * otherwise the stale sentinel would be used for line counting and
		 * a newline opening the new half would never be counted */
		chr = peek(ls);
	}
	update_line_count(ls, chr);
	++ls->fwd;
	return Some(MaybeChr, chr);
}

/* Consumes bytes until a non-blank one is found and returns it.
 *
 * Every blank skipped also advances the lexeme start, so on return `lbegin`
 * points at the returned byte and `fwd` right after it:
 * [ abc =    2*9  -  1 ]
 *     lbegin-^^-fwd
 * Returns None when the input ends first.
 */
static MaybeChr
skip_whitespace(LexState *ls)
{
	MaybeChr cur = read_chr(ls);

	while (cur.ok && ascii_isspace(cur.val)) {
		++ls->lbegin;
		cur = read_chr(ls);
	}
	if (!cur.ok)
		return None(MaybeChr);
	return cur;
}

static LexToken
make_error(void)
{
	return (LexToken){ .id = T_ERROR };
}

/* Looks `ident` up in the identifier set, inserting it when absent, and
 * returns the key pointer held by the table entry.
 */
static u8 *
intern_identifier(LexState *ls, u8 *ident)
{
	IdentsBucket *found = shgetp_null(ls->idents, ident);

	if (found != nil)
		return (u8 *)found->key;

	/* first occurrence: insert it, then fetch the key stored in the table */
	shput(ls->idents, ident, 0);
	return (u8 *)shgets(ls->idents, ident).key;
}

/*
 * *──┬(ident)┬──*
 *    ╰───<───╯
 *
 * Scans an identifier whose first byte is at `lbegin` (already consumed).
 * The text is gathered in a stack buffer, interned, and returned as a
 * T_IDENT token. An identifier that does not fit in the buffer is reported
 * and yields an error token.
 */
static LexToken
identifier(LexState *ls)
{
	LexToken tok = { .loc = ls->cur_loc };
	/* scratch copy only: interning stores its own copy of the name */
	u8 name[MAX_IDENT_SIZE];
	usize len = 0;
	MaybeChr cur = { *ls->lbegin, true };

	for (;;) {
		if (!cur.ok || !ascii_isident(cur.val))
			break;
		if (len + 1 == MAX_IDENT_SIZE) {
			lex_error(ls, "identifier is too long (max: %u)\n", MAX_IDENT_SIZE);
			return make_error();
		}
		name[len++] = cur.val;
		cur = read_chr(ls);
	}
	name[len] = '\0';

	/* the byte that ended the identifier belongs to the next token */
	if (cur.ok)
		backup(ls, 1);

	tok.id = T_IDENT;
	tok.ident = (Str){intern_identifier(ls, name), len};
	tok.len = len;
	return tok;
}

/* Scans a string literal; the opening '"' has already been consumed.
 *
 * Bytes are copied into a heap buffer that grows by doubling, up to
 * STRING_LITERAL_MAX_SIZE. The literal ends at the closing '"'; reaching a
 * newline or the end of input first is reported as an unterminated literal.
 *
 * Returns a T_STRING token built with Str_from_buf (a nil buffer for the
 * empty literal), or an error token. The buffer is released on every error
 * path.
 */
static LexToken
string_literal(LexState *ls)
{
	LexToken token = { .loc = ls->cur_loc };
	isize str_buf_len = STRING_LITERAL_BASE_SIZE;
	isize i = 0;
	u8 *str_buf = malloc(str_buf_len);

	if (str_buf == nil) {
		lex_fatal(ls, "out of memory while reading a string literal");
		goto err;
	}

	for (;;) {
		const MaybeChr chr = read_chr(ls);
		/* checked for every byte, including the one right after the quote */
		if (!chr.ok || chr.val == '\n') {
			lex_error(ls, "unterminated string literal");
			goto err;
		}
		if (chr.val == '"')
			break;
		if (i + 1 == STRING_LITERAL_MAX_SIZE) {
			lex_error(ls, "string literal length exceeds maximum of %d bytes", STRING_LITERAL_MAX_SIZE);
			goto err;
		}
		/* keep room for this byte AND the terminator written below */
		if (i + 2 > str_buf_len) {
			const isize grown_len = str_buf_len * 2;
			u8 *grown = realloc(str_buf, grown_len);
			if (grown == nil) {
				lex_fatal(ls, "out of memory while reading a string literal");
				goto err;
			}
			str_buf = grown;
			str_buf_len = grown_len;
		}
		str_buf[i++] = chr.val;
	}

	if (i > 0) {
		str_buf[i] = '\0';
	} else { /* empty literal */
		free(str_buf); /* we wasted our time */
		str_buf = nil;
	}

	token.id = T_STRING;
	token.str = Str_from_buf(str_buf, i);
	token.len = i;
	return token;
err:
	free(str_buf);
	return make_error();
}

/*	Identifies a numeric literal that may have a prefix:
 *
 *	('0')─┬──────────────────────┬─*
 *	      ├('b')╭──┬(digit)┬─────╯
 *	      ├('o')┤  ╰───<───╯
 *	      ╰('x')╯
 * Indirectly based on a BSD (?) implementation.
 *
 * The first digit is at `lbegin` and has already been consumed. Returns a
 * T_NUMBER token carrying the value in `inumber`, or an error token when:
 *  - a leading 0 is followed by another decimal digit,
 *  - a base prefix is not followed by at least one digit,
 *  - a digit is not valid in the selected base,
 *  - the value does not fit in 64 bits.
 * The byte that ends the literal is given back to the input.
 */
static LexToken
number_literal(LexState *ls)
{
	LexToken token = { .id = T_NUMBER, .loc = ls->cur_loc };
	u64 number = 0;
	u8 base = 10;
	bool prefixed = false;
	usize ndigits = 0;

	MaybeChr chr = { *ls->lbegin, true };

	if (chr.val == '0') {
		chr = read_chr(ls); /* byte following the leading 0 */
		if (!chr.ok) { /* EOF edge case */
			return token; /* 0 */
		}
		switch (chr.val) {
		case 'b':
			base = 2;
			break;
		case 'o':
			base = 8;
			break;
		case 'x':
			base = 16;
			break;
		default:
			if (ascii_isdigit(chr.val)) {
				lex_error(ls, "use '0o' for an octal literal");
				return make_error();
			}
			/* start of another token: give it back, otherwise the
			 * token following a lone 0 loses its first byte */
			backup(ls, 1);
			return token; /* 0 */
		}
		prefixed = true;
		chr = read_chr(ls);
		if (!chr.ok) {
			lex_error(ls, "expected a digit after the base prefix");
			return make_error();
		}
	}

	const u64 mmax = U64_MAX / base;
	static const u8 digits[] = "0123456789abcdef";

	while (chr.ok) {
		/* search the 16 digit characters only, not the terminating nil */
		const u8 *digitp = memchr(digits, chr.val, sizeof(digits) - 1);
		if (digitp == nil)
			break;

		const u8 digit = (u8)(digitp - digits);
		if (digit >= base) {
			lex_error(ls, "invalid literal");
			return make_error();
		}
		if (number > mmax)
			goto overflow;
		number *= base;
		/* overflow for adding the digit */
		if (U64_MAX - digit < number)
			goto overflow;

		number += digit;
		++ndigits;
		chr = read_chr(ls);
	}
	/* ate 1 extra character, give it back */
	if (chr.ok)
		backup(ls, 1);

	/* "0x", "0b" or "0o" alone is not a number */
	if (prefixed && ndigits == 0) {
		lex_error(ls, "expected a digit after the base prefix");
		return make_error();
	}

	token.inumber = number;
	return token;
overflow:
	lex_error(ls, "integer literal is too big (2^64 max)");
	return make_error();
}

/* Turns an identifier token into the matching keyword token, if any.
 *
 * The first byte of the identifier selects a switch case and the rest of the
 * identifier is compared with the keyword suffixes starting with that byte.
 * A keyword token keeps the location and the full length of the identifier
 * it was recognized from. When nothing matches, `*t` is returned unchanged.
 */
static LexToken
keyword(LexToken *t)
{
	/* .len is the length of the whole lexeme (t->len), not of the suffix
	 * literal, which is one byte short of the keyword */
#define kwcmp(ident, kw, tid) \
	{if (Str_equal(ident, kw)) return (LexToken){ .id = tid, .len = t->len, .loc = t->loc };}

	/* work on a copy: strip the first byte, it is dispatched by the switch */
	Str ident = t->ident;
	--ident.len;
	switch (*ident.s++) {
	case 'a':
		kwcmp(ident, Sl("nd"), T_LOGAND);
		break;
	case 'b':
		kwcmp(ident, Sl("reak"), T_BREAK);
		break;
	case 'c':
		kwcmp(ident, Sl("onst"), T_CONST);
		break;
	case 'd':
		kwcmp(ident, Sl("iscard"), T_DISCARD);
		break;
	case 'e':
		kwcmp(ident, Sl("nd"), T_END);
		kwcmp(ident, Sl("lse"), T_ELSE);
		kwcmp(ident, Sl("lif"), T_ELIF);
		break;
	case 'i':
		kwcmp(ident, Sl("f"), T_IF);
		break;
	case 'l':
		kwcmp(ident, Sl("et"), T_LET);
		break;
	case 'n':
		kwcmp(ident, Sl("ot"), T_LOGNOT);
		kwcmp(ident, Sl("ext"), T_NEXT);
		break;
	case 'o':
		kwcmp(ident, Sl("r"), T_LOGOR);
		break;
	case 'p':
		kwcmp(ident, Sl("roc"), T_PROC);
		break;
	case 'r':
		kwcmp(ident, Sl("eturn"), T_RETURN);
		break;
	case 's':
		kwcmp(ident, Sl("truct"), T_STRUCT);
		break;
	case 'v':
		kwcmp(ident, Sl("ar"), T_VAR);
		break;
	case 'w':
		kwcmp(ident, Sl("hile"), T_WHILE);
		break;
	case 'u':
		kwcmp(ident, Sl("se"), T_USE);
		break;
	default: /* no keyword starts with this byte */
		break;
	}
	return *t;
#undef kwcmp
}

LexToken
lex_scan(LexState *ls)
{
#define TOKEN(chr, t) case chr: token.id = t; break;

	if (arrlen(ls->backlist) > 0) {
		return arrpop(ls->backlist);
	}
	/* lexeme start pointer */
	ls->lbegin = ls->fwd;

	LexToken token = {0};
	MaybeChr c = skip_whitespace(ls);
	if (!c.ok) {
		token.id = T_EOF;
		ls->eof = true;
		return token;
	}
	token.loc = ls->cur_loc;

	//trace("token now: '%c'\n", c.val);
	//trace("lp: <%s>\n", ls->lbegin);
	//trace("fwd: <%s>\n", ls->fwd);
	switch (c.val) {
	case '!':
		if (peek(ls) == '=') {
			token.id = T_NOTEQUAL;
			++ls->fwd;
			++ls->cur_loc.column;
		} else {
			token.id = T_EXCLAMATION;
		}
		break;
	TOKEN('+', T_PLUS)
	TOKEN('-', T_MINUS)
	TOKEN('*', T_STAR)
	TOKEN('/', T_BAR)
	TOKEN('(', T_LPAREN)
	TOKEN(')', T_RPAREN)
	TOKEN(',', T_COMMA)
	TOKEN('<', T_LESSTHAN)
	TOKEN('>', T_GREATTHAN)
	TOKEN('#', T_HASH)
	TOKEN(':', T_COLON)
	TOKEN(';', T_SEMICOLON)
	TOKEN('[', T_LBRACKET)
	TOKEN(']', T_RBRACKET)
	TOKEN('{', T_LBRACE)
	TOKEN('}', T_RBRACE)
	case '=':
		if (peek(ls) == '=') {
			token.id = T_LOGICEQUAL;
			++ls->fwd;
			++ls->cur_loc.column;
		} else {
			token.id = T_EQUAL;
		}
		break;
	case '"':
		return string_literal(ls);
	case '0' ... '9':
		return number_literal(ls);
	default: {
		const u8 uc = c.val;
		if (ascii_isident_start(uc)) {
			LexToken ident_or_keyword = identifier(ls);
			if (ident_or_keyword.id != T_IDENT)
				return make_error();
		   	return keyword(&ident_or_keyword);
		}

		if (uc > 0x7f) /* DEL, the last ASCII character */
			lex_error(ls, "unicode tokens aren't allowed yet");
		else
			lex_error(ls, "unknown token '%c' (\\x%02x)", uc, uc);
		return make_error();
	}
	}
	return token;
#undef TOKEN
}

/* Pushes `token` onto the backlist.
 * lex_scan pops from the backlist before reading any input, so the token
 * pushed last is the one returned by the next scan. The backlist grows on
 * demand, which gives arbitrary look-ahead at the cost of memory.
 */
void
lex_backup(LexState *ls, LexToken token)
{
	arrput(ls->backlist, token);
}

/* Tells whether `token` has the id `exp_tok`. No token is consumed.
 * On a mismatch a diagnostic naming both the expected and the actual token
 * is reported at the lexer's current location and false is returned.
 */
bool
lex_match(LexState *ls, LexToken *token, enum LexTokenId exp_tok)
{
	if (token->id == exp_tok)
		return true;

	lex_error(ls, "expected '%s' but got '%s' instead\n",
			TokenIdStr[exp_tok], TokenIdStr[token->id]);
	return false;
}

/* Creates a lexing context reading from `input_fp`.
 *
 * `cm` is used for diagnostics, `file_name` becomes the source of reported
 * locations and `tabsize` is stored in the state. The stream is switched to
 * unbuffered mode and the first buffer half is filled before returning.
 *
 * An allocation failure is reported through fatal(); should fatal() return,
 * nil is returned and nothing is leaked. Release the context with
 * lex_destroy.
 */
LexState *
lex_new(Compiler *cm, FILE *input_fp, Str file_name, usize tabsize)
{
	LexState *ls = calloc(1, sizeof(*ls));
	if (ls == nil) {
		fatal(cm, nil, "out of memory\n");
		return nil;
	}
	/* one extra byte for the sentinel following a full second half */
	ls->buf = calloc(LEX_BUFFER_SIZE + 1, sizeof(*ls->buf));
	if (ls->buf == nil) {
		free(ls);
		fatal(cm, nil, "out of memory\n");
		return nil;
	}
	ls->lbegin = ls->fwd = ls->buf;
	ls->tabsize = tabsize;
	ls->input_fp = input_fp;
	ls->cur_loc.line = 1;
	ls->cur_loc.source = file_name;
	ls->cm = cm;
	/* We use a hash table with string keys as a set containing all identifiers
	 * in a compilation unit, to avoid duplicate allocations.
	 */
	sh_new_arena(ls->idents);
	/* We provide our own buffering scheme */
	setbuf(input_fp, nil);
	/* Initial fill of first buffer.
	 * Any file error gets caught in the function, only thing that can happen
	 * here is that the file is actually empty, so instant EOF.
	 */
	read_buf(ls, ls->buf, LEX_HALF_BUFFER_SIZE, &ls->buflen);
	return ls;
}

/* Releases a lexing context: the input buffer, the token backlist, the
 * identifier table and finally the state itself.
 * Identifier strings handed out in tokens live in that table, so they must
 * not be used afterwards.
 */
void
lex_destroy(LexState *ls)
{
	/* the members are independent, only the state itself must go last */
	free(ls->buf);
	arrfree(ls->backlist);
	shfree(ls->idents);
	free(ls);
}