578 lines
12 KiB
C
578 lines
12 KiB
C
#include <stdio.h> /* feof, ferror, fread, FILE, EOF */
|
|
#include <stdlib.h> /* malloc calloc free */
|
|
#include <string.h> /* memset */
|
|
|
|
#include "lex.h"
|
|
#include "messages.h"
|
|
#include "pre.h"
|
|
#include "libs/stb_ds.h"
|
|
|
|
/* Total size in bytes of the lexer input buffer; it is used as two halves. */
#define LEX_BUFFER_SIZE 8192
/* Size of one buffer half. Parenthesized so that the macro behaves as a single
 * operand inside larger expressions (e.g. `x / LEX_HALF_BUFFER_SIZE`). */
#define LEX_HALF_BUFFER_SIZE (LEX_BUFFER_SIZE / 2)
/* Byte written right after the valid data of a buffer half. */
#define LEX_BUFFER_SENTINEL '\0'

/* Capacity of the identifier scratch buffer, terminator included. */
#define MAX_IDENT_SIZE 1024u
/* Initial allocation size for a string literal buffer. */
#define STRING_LITERAL_BASE_SIZE 255
/* Upper bound used when checking the length of a string literal. */
#define STRING_LITERAL_MAX_SIZE 4096

/* True when the forward pointer sits on a sentinel byte. */
#define at_buffer_end(ls) (*(ls)->fwd == LEX_BUFFER_SENTINEL)
/* Character class helpers. NOTE: the argument is evaluated more than once, so
 * it must not have side effects. */
#define ascii_isident(c) ((c) == '_' || (c) == '?' || (c) == '!' || ascii_isalnum(c))
#define ascii_isident_start(c) ((c) == '_' || ascii_isalpha(c))
|
|
|
|
/* Reports a diagnostic through error(), passing the lexer's compiler handle
 * and its current source location; the remaining arguments are forwarded
 * unchanged (format string first). */
#define lex_error(ls, ...) do { \
	error((ls)->cm, &(ls)->cur_loc, __VA_ARGS__); \
} while(0)

/* Same as lex_error, but routes the diagnostic through fatal(). */
#define lex_fatal(ls, ...) do { \
	fatal((ls)->cm, &(ls)->cur_loc, __VA_ARGS__); \
} while(0)
|
|
|
|
/* Optional input byte: `.ok` tells whether `.val` holds a character. Values
 * are built with Some(MaybeChr, c) / None(MaybeChr) in this file. */
typedef Optional(u8) MaybeChr;
|
|
|
|
const char *TokenIdStr[T_TOKEN_COUNT] = {
|
|
[T_INVALID] = "(invalid token)",
|
|
[T_PLUS] = "+",
|
|
[T_MINUS] = "-",
|
|
[T_STAR] = "*",
|
|
[T_BAR] = "/",
|
|
[T_EXCLAMATION] = "!",
|
|
[T_LPAREN] = "(",
|
|
[T_RPAREN] = ")",
|
|
[T_COMMA] = ",",
|
|
[T_LESSTHAN] = "<",
|
|
[T_GREATTHAN] = ">",
|
|
[T_LOGAND] = "and",
|
|
[T_LOGOR] = "or",
|
|
[T_EQUAL] = "=",
|
|
[T_LOGICEQUAL] = "==",
|
|
[T_NOTEQUAL] = "!=",
|
|
[T_HASH] = "#",
|
|
[T_COLON] = ":",
|
|
[T_SEMICOLON] = ";",
|
|
[T_LBRACKET] = "[",
|
|
[T_RBRACKET] = "]",
|
|
[T_LBRACE] = "{",
|
|
[T_RBRACE] = "}",
|
|
[T_IDENT] = "(identifier)",
|
|
[T_STRING] = "(string literal)",
|
|
[T_NUMBER] = "(number)",
|
|
[T_DECNUMBER] = "(decimal number)",
|
|
[T_CONST] = "const",
|
|
[T_DISCARD] = "discard",
|
|
[T_ELIF] = "elif",
|
|
[T_ELSE] = "else",
|
|
[T_END] = "end",
|
|
[T_IF] = "if",
|
|
[T_LET] = "let",
|
|
[T_PROC] = "proc",
|
|
[T_RETURN] = "return",
|
|
[T_VAR] = "var",
|
|
[T_WHILE] = "while",
|
|
[T_STRUCT] = "struct",
|
|
[T_USE] = "use",
|
|
[T_BREAK] = "break",
|
|
[T_NEXT] = "next",
|
|
[T_EOF] = "(EOF)",
|
|
[T_ERROR] = "(error)",
|
|
};
|
|
|
|
/* Non retarded ASCII character class comparison */
|
|
static bool
|
|
ascii_isdigit(u32 c)
|
|
{
|
|
return c >= '0' && c <= '9';
|
|
}
|
|
|
|
static bool
|
|
ascii_isalpha(u32 c)
|
|
{
|
|
return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
|
|
}
|
|
|
|
static bool
|
|
ascii_isspace(u32 c)
|
|
{
|
|
return c == ' ' || (c >= '\t' && c <= '\r');
|
|
}
|
|
|
|
static bool
|
|
ascii_isalnum(u32 c)
|
|
{
|
|
return ascii_isalpha(c) || ascii_isdigit(c);
|
|
}
|
|
|
|
static void
|
|
update_line_count(LexState *ls, u8 chr)
|
|
{
|
|
switch (chr) {
|
|
case '\n':
|
|
ls->cur_loc.column = 1;
|
|
++ls->cur_loc.line;
|
|
break;
|
|
case '\t': /* fallthrough */
|
|
default:
|
|
++ls->cur_loc.column;
|
|
}
|
|
}
|
|
|
|
static u8
|
|
peek(LexState *ls)
|
|
{
|
|
return *ls->fwd;
|
|
}
|
|
|
|
static void
|
|
backup(LexState *ls, int n)
|
|
{
|
|
ls->fwd -= n;
|
|
if (*ls->fwd == '\n')
|
|
--ls->cur_loc.line;
|
|
/* not quite right if fwd is \n... */
|
|
--ls->cur_loc.column;
|
|
}
|
|
|
|
static bool
|
|
read_buf(LexState *ls, u8 *buf, isize n, isize *ar)
|
|
{
|
|
if (feof(ls->input_fp))
|
|
return false;
|
|
const isize rb = fread(buf, sizeof(*buf), n, ls->input_fp);
|
|
if (ferror(ls->input_fp)) {
|
|
fatal(ls->cm, nil, "could not read input file\n");
|
|
}
|
|
*ar = rb;
|
|
buf[rb] = LEX_BUFFER_SENTINEL;
|
|
return true;
|
|
}
|
|
|
|
static bool
|
|
reload_buffers(LexState *ls)
|
|
{
|
|
//if ((ls->fwd != ls->buf + ls->buflen) || (ls->fwd != ls->buf + ls->buflen2))
|
|
// lex_fatal(ls, "invalid nil byte in middle of source file");
|
|
|
|
const u8 *end_of_buf1 = ls->buf + ls->buflen;
|
|
const u8 *end_of_buf2 = ls->buf + LEX_HALF_BUFFER_SIZE + ls->buflen2;
|
|
|
|
if (ls->fwd == end_of_buf1) { /* end of first buffer */
|
|
u8 *buf2 = ls->buf + LEX_HALF_BUFFER_SIZE;
|
|
if (!read_buf(ls, buf2, LEX_HALF_BUFFER_SIZE, &ls->buflen2))
|
|
return false; /* reached EOF, no more data */
|
|
ls->fwd = buf2;
|
|
} else if (ls->fwd == end_of_buf2) { /* end of second buffer */
|
|
u8 *buf1 = ls->buf;
|
|
if (!read_buf(ls, buf1, LEX_HALF_BUFFER_SIZE, &ls->buflen))
|
|
return false; /* reached EOF, no more data */
|
|
ls->fwd = buf1;
|
|
}
|
|
/* reset pointers back to the beginning of the buffer */
|
|
ls->lbegin = ls->fwd;
|
|
return true;
|
|
}
|
|
|
|
static MaybeChr
|
|
read_chr(LexState *ls)
|
|
{
|
|
u8 chr = peek(ls);
|
|
if (chr == LEX_BUFFER_SENTINEL) { /* maybe end of buffer */
|
|
if (!reload_buffers(ls))
|
|
return None(MaybeChr);
|
|
}
|
|
update_line_count(ls, chr);
|
|
return Some(MaybeChr, *ls->fwd++);
|
|
}
|
|
|
|
static MaybeChr
|
|
skip_whitespace(LexState *ls)
|
|
{
|
|
/* skip any whitespace
|
|
* [ abc = 2*9 - 1 ]
|
|
* ^-fwd, lbegin
|
|
* [ abc = 2*9 - 1 ]
|
|
* lbegin-^^-fwd
|
|
* */
|
|
MaybeChr c;
|
|
for (;;) {
|
|
c = read_chr(ls);
|
|
if (!c.ok) {
|
|
return None(MaybeChr);
|
|
}
|
|
if (!ascii_isspace(c.val))
|
|
break;
|
|
++ls->lbegin;
|
|
}
|
|
return c;
|
|
}
|
|
|
|
static LexToken
|
|
make_error(void)
|
|
{
|
|
return (LexToken){ .id = T_ERROR };
|
|
}
|
|
|
|
static u8 *
|
|
intern_identifier(LexState *ls, u8 *ident)
|
|
{
|
|
IdentsBucket *entry;
|
|
if ((entry = shgetp_null(ls->idents, ident)) == nil) {
|
|
shput(ls->idents, ident, 0);
|
|
return (u8 *)shgets(ls->idents, ident).key;
|
|
}
|
|
return (u8 *)entry->key;
|
|
}
|
|
|
|
/*
|
|
* *──┬(ident)┬──*
|
|
* ╰───<───╯
|
|
*/
|
|
static LexToken
|
|
identifier(LexState *ls)
|
|
{
|
|
LexToken token = { .loc = ls->cur_loc };
|
|
/* this gets copied to the hash table arena, no problem */
|
|
u8 ident_buf[MAX_IDENT_SIZE];
|
|
usize i = 0;
|
|
|
|
MaybeChr chr = { *ls->lbegin, true };
|
|
while (chr.ok && ascii_isident(chr.val)) {
|
|
if (i + 1 == MAX_IDENT_SIZE) {
|
|
lex_error(ls, "identifier is too long (max: %u)\n", MAX_IDENT_SIZE);
|
|
return make_error();
|
|
}
|
|
ident_buf[i++] = chr.val;
|
|
chr = read_chr(ls);
|
|
}
|
|
ident_buf[i] = '\0';
|
|
/* ate 1 extra character, give it back */
|
|
if (chr.ok)
|
|
backup(ls, 1);
|
|
|
|
token.id = T_IDENT;
|
|
token.ident = (Str){intern_identifier(ls, ident_buf), i};
|
|
token.len = i;
|
|
return token;
|
|
}
|
|
|
|
static LexToken
|
|
string_literal(LexState *ls)
|
|
{
|
|
LexToken token = { .loc = ls->cur_loc };
|
|
isize str_buf_len = STRING_LITERAL_BASE_SIZE;
|
|
u8 *str_buf = malloc(str_buf_len);
|
|
isize i = 0;
|
|
|
|
/* skip past " */
|
|
MaybeChr chr = read_chr(ls);
|
|
while (chr.val != '"') {
|
|
if (i + 1 == STRING_LITERAL_MAX_SIZE) {
|
|
lex_error(ls, "string literal length exceeds maximum of %d bytes", STRING_LITERAL_MAX_SIZE);
|
|
goto err;
|
|
}
|
|
if (i + 1 > str_buf_len) {
|
|
str_buf = realloc(str_buf, str_buf_len *= 2);
|
|
}
|
|
str_buf[i++] = chr.val;
|
|
chr = read_chr(ls);
|
|
if (!chr.ok || chr.val == '\n') {
|
|
lex_error(ls, "unterminated string literal");
|
|
goto err;
|
|
}
|
|
}
|
|
if (i > 0) {
|
|
str_buf[i] = '\0';
|
|
} else { /* empty literal */
|
|
free(str_buf); /* we wasted our time */
|
|
str_buf = nil;
|
|
}
|
|
|
|
token.id = T_STRING;
|
|
token.str = Str_from_buf(str_buf, i);
|
|
token.len = i;
|
|
return token;
|
|
err:
|
|
return make_error();
|
|
}
|
|
|
|
/* Identifies a numeric literal that may have a prefix:
|
|
*
|
|
* ('0')─┬──────────────────────┬─*
|
|
* ├('b')╭──┬(digit)┬─────╯
|
|
* ├('o')┤ ╰───<───╯
|
|
* ╰('x')╯
|
|
* Indirectly based on a BSD (?) implementation.
|
|
*/
|
|
static LexToken
|
|
number_literal(LexState *ls)
|
|
{
|
|
LexToken token = { .id = T_NUMBER, .loc = ls->cur_loc };
|
|
u64 number = 0;
|
|
u8 base = 10;
|
|
|
|
MaybeChr chr = { *ls->lbegin, true };
|
|
|
|
if (chr.val == '0') {
|
|
chr = read_chr(ls); /* skip 0 prefix */
|
|
if (!chr.ok) { /* EOF edge case */
|
|
return token; /* 0 */
|
|
}
|
|
switch (chr.val) {
|
|
case 'b':
|
|
base = 2;
|
|
break;
|
|
case 'o':
|
|
base = 8;
|
|
break;
|
|
case 'x':
|
|
base = 16;
|
|
break;
|
|
default:
|
|
if (ascii_isdigit(chr.val)) {
|
|
lex_error(ls, "use '0o' for an octal literal");
|
|
return make_error();
|
|
}
|
|
//lex_error(ls, "unknown numeric prefix '0%c'", chr.val);
|
|
/* start of another token */
|
|
return token; /* 0 */
|
|
}
|
|
chr = read_chr(ls);
|
|
if (!chr.ok) {
|
|
lex_error(ls, "expected a digit after the base prefix");
|
|
return make_error();
|
|
}
|
|
}
|
|
|
|
const u64 mmax = U64_MAX / base;
|
|
static const u8 digits[] = "0123456789abcdef";
|
|
|
|
while (chr.ok) {
|
|
u8 *digitp = memchr(digits, chr.val, lengthof(digits));
|
|
if (digitp == nil)
|
|
break;
|
|
|
|
u8 digit = digitp - digits;
|
|
if (digit >= base) {
|
|
lex_error(ls, "invalid literal");
|
|
return make_error();
|
|
}
|
|
if (number > mmax)
|
|
goto overflow;
|
|
number *= base;
|
|
/* overflow for adding the digit */
|
|
if (U64_MAX - digit < number)
|
|
goto overflow;
|
|
|
|
number += digit;
|
|
chr = read_chr(ls);
|
|
}
|
|
if (chr.ok)
|
|
backup(ls, 1);
|
|
|
|
token.inumber = number;
|
|
return token;
|
|
overflow:
|
|
lex_error(ls, "integer literal is too big (2^64 max)");
|
|
return make_error();
|
|
}
|
|
|
|
static LexToken
|
|
keyword(LexToken *t)
|
|
{
|
|
#define kwcmp(ident, kw, tid) \
|
|
{if (Str_equal(ident, kw)) return (LexToken){ .id = tid, .len = kw.len, .loc = t->loc };}
|
|
|
|
Str ident = t->ident;
|
|
--ident.len;
|
|
switch (*ident.s++) {
|
|
case 'a':
|
|
kwcmp(ident, Sl("nd"), T_LOGAND);
|
|
break;
|
|
case 'b':
|
|
kwcmp(ident, Sl("reak"), T_BREAK);
|
|
break;
|
|
case 'c':
|
|
kwcmp(ident, Sl("onst"), T_CONST);
|
|
break;
|
|
case 'd':
|
|
kwcmp(ident, Sl("iscard"), T_DISCARD);
|
|
break;
|
|
case 'e':
|
|
kwcmp(ident, Sl("nd"), T_END);
|
|
kwcmp(ident, Sl("lse"), T_ELSE);
|
|
kwcmp(ident, Sl("lif"), T_ELIF);
|
|
break;
|
|
case 'i':
|
|
kwcmp(ident, Sl("f"), T_IF);
|
|
break;
|
|
case 'l':
|
|
kwcmp(ident, Sl("et"), T_LET);
|
|
break;
|
|
case 'n':
|
|
kwcmp(ident, Sl("ot"), T_LOGNOT);
|
|
kwcmp(ident, Sl("ext"), T_NEXT);
|
|
break;
|
|
case 'o':
|
|
kwcmp(ident, Sl("r"), T_LOGOR);
|
|
break;
|
|
case 'p':
|
|
kwcmp(ident, Sl("roc"), T_PROC);
|
|
break;
|
|
case 'r':
|
|
kwcmp(ident, Sl("eturn"), T_RETURN);
|
|
break;
|
|
case 's':
|
|
kwcmp(ident, Sl("truct"), T_STRUCT);
|
|
break;
|
|
case 'v':
|
|
kwcmp(ident, Sl("ar"), T_VAR);
|
|
break;
|
|
case 'w':
|
|
kwcmp(ident, Sl("hile"), T_WHILE);
|
|
break;
|
|
case 'u':
|
|
kwcmp(ident, Sl("se"), T_USE);
|
|
break;
|
|
}
|
|
return *t;
|
|
#undef kwcmp
|
|
}
|
|
|
|
LexToken
|
|
lex_scan(LexState *ls)
|
|
{
|
|
#define TOKEN(chr, t) case chr: token.id = t; break;
|
|
|
|
if (arrlen(ls->backlist) > 0) {
|
|
return arrpop(ls->backlist);
|
|
}
|
|
/* lexeme start pointer */
|
|
ls->lbegin = ls->fwd;
|
|
|
|
LexToken token = {0};
|
|
MaybeChr c = skip_whitespace(ls);
|
|
if (!c.ok) {
|
|
token.id = T_EOF;
|
|
ls->eof = true;
|
|
return token;
|
|
}
|
|
token.loc = ls->cur_loc;
|
|
|
|
//trace("token now: '%c'\n", c.val);
|
|
//trace("lp: <%s>\n", ls->lbegin);
|
|
//trace("fwd: <%s>\n", ls->fwd);
|
|
switch (c.val) {
|
|
case '!':
|
|
if (peek(ls) == '=') {
|
|
token.id = T_NOTEQUAL;
|
|
++ls->fwd;
|
|
++ls->cur_loc.column;
|
|
} else {
|
|
token.id = T_EXCLAMATION;
|
|
}
|
|
break;
|
|
TOKEN('+', T_PLUS)
|
|
TOKEN('-', T_MINUS)
|
|
TOKEN('*', T_STAR)
|
|
TOKEN('/', T_BAR)
|
|
TOKEN('(', T_LPAREN)
|
|
TOKEN(')', T_RPAREN)
|
|
TOKEN(',', T_COMMA)
|
|
TOKEN('<', T_LESSTHAN)
|
|
TOKEN('>', T_GREATTHAN)
|
|
TOKEN('#', T_HASH)
|
|
TOKEN(':', T_COLON)
|
|
TOKEN(';', T_SEMICOLON)
|
|
TOKEN('[', T_LBRACKET)
|
|
TOKEN(']', T_RBRACKET)
|
|
TOKEN('{', T_LBRACE)
|
|
TOKEN('}', T_RBRACE)
|
|
case '=':
|
|
if (peek(ls) == '=') {
|
|
token.id = T_LOGICEQUAL;
|
|
++ls->fwd;
|
|
++ls->cur_loc.column;
|
|
} else {
|
|
token.id = T_EQUAL;
|
|
}
|
|
break;
|
|
case '"':
|
|
return string_literal(ls);
|
|
case '0' ... '9':
|
|
return number_literal(ls);
|
|
default: {
|
|
const u8 uc = c.val;
|
|
if (ascii_isident_start(uc)) {
|
|
LexToken ident_or_keyword = identifier(ls);
|
|
if (ident_or_keyword.id != T_IDENT)
|
|
return make_error();
|
|
return keyword(&ident_or_keyword);
|
|
}
|
|
|
|
if (uc > 0x7f) /* DEL, the last ASCII character */
|
|
lex_error(ls, "unicode tokens aren't allowed yet");
|
|
else
|
|
lex_error(ls, "unknown token '%c' (\\x%02x)", uc, uc);
|
|
return make_error();
|
|
}
|
|
}
|
|
return token;
|
|
#undef TOKEN
|
|
}
|
|
|
|
/* Pushes `token` onto the backlist. lex_scan() pops from the backlist before
 * reading any new input, so the most recently pushed token is the next one
 * returned. The backlist is a stack of tokens, so technically you can have
 * unlimited look-ahead at the cost of memory.
 */
void
lex_backup(LexState *ls, LexToken token)
{
	arrput(ls->backlist, token);
}
|
|
|
|
/* Checks if `t` token type is equal to `exp_tok`. This does not eat any token. */
|
|
bool
|
|
lex_match(LexState *ls, LexToken *token, enum LexTokenId exp_tok)
|
|
{
|
|
if (token->id != exp_tok) {
|
|
lex_error(ls, "expected '%s' but got '%s' instead\n",
|
|
TokenIdStr[exp_tok], TokenIdStr[token->id]);
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
LexState *
|
|
lex_new(Compiler *cm, FILE *input_fp, Str file_name, usize tabsize)
|
|
{
|
|
LexState *ls = calloc(1, sizeof(*ls));
|
|
ls->buf = calloc(LEX_BUFFER_SIZE + 1, sizeof(*ls->buf));
|
|
ls->lbegin = ls->fwd = ls->buf;
|
|
ls->tabsize = tabsize;
|
|
ls->input_fp = input_fp;
|
|
ls->cur_loc.line = 1;
|
|
ls->cur_loc.source = file_name;
|
|
ls->cm = cm;
|
|
/* We use a hash table with string keys as a set containing all identifiers
|
|
* in a compilation unit, to avoid dupplicate allocations.
|
|
*/
|
|
sh_new_arena(ls->idents);
|
|
/* We provide our own buffering scheme */
|
|
setbuf(input_fp, nil);
|
|
/* Initial fill of first buffer.
|
|
* Any file error gets caught in the function, only thing that can happen
|
|
* here is that the file is actually empty, so instant EOF.
|
|
*/
|
|
read_buf(ls, ls->buf, LEX_HALF_BUFFER_SIZE, &ls->buflen);
|
|
return ls;
|
|
}
|
|
|
|
/* Destroys a lexing context and frees its allocated memory.
|
|
* Note that this will also deallocate the identifier arena.
|
|
*/
|
|
void
|
|
lex_destroy(LexState *ls)
|
|
{
|
|
shfree(ls->idents);
|
|
arrfree(ls->backlist);
|
|
free(ls->buf);
|
|
free(ls);
|
|
}
|