#include /* feof, ferror, fread, FILE, EOF */ #include /* malloc calloc free */ #include /* memset */ #include "lex.h" #include "messages.h" #include "pre.h" #include "libs/stb_ds.h" #define LEX_BUFFER_SIZE 8192 #define LEX_HALF_BUFFER_SIZE LEX_BUFFER_SIZE / 2 #define LEX_BUFFER_SENTINEL '\0' #define MAX_IDENT_SIZE 1024u #define STRING_LITERAL_BASE_SIZE 255 #define STRING_LITERAL_MAX_SIZE 4096 #define at_buffer_end(ls) (*(ls)->fwd == '\0') #define ascii_isident(c) (c == '_' || c == '?' || c == '!' || ascii_isalnum(c)) #define ascii_isident_start(c) (c == '_' || ascii_isalpha(c)) #define lex_error(ls, ...) do { \ error((ls)->cm, &(ls)->cur_loc, __VA_ARGS__); \ } while(0) #define lex_fatal(ls, ...) do { \ fatal((ls)->cm, &(ls)->cur_loc, __VA_ARGS__); \ } while(0) typedef Optional(u8) MaybeChr; const char *TokenIdStr[T_TOKEN_COUNT] = { [T_INVALID] = "(invalid token)", [T_PLUS] = "+", [T_MINUS] = "-", [T_STAR] = "*", [T_BAR] = "/", [T_EXCLAMATION] = "!", [T_LPAREN] = "(", [T_RPAREN] = ")", [T_COMMA] = ",", [T_LESSTHAN] = "<", [T_GREATTHAN] = ">", [T_LOGAND] = "and", [T_LOGOR] = "or", [T_EQUAL] = "=", [T_LOGICEQUAL] = "==", [T_NOTEQUAL] = "!=", [T_HASH] = "#", [T_COLON] = ":", [T_SEMICOLON] = ";", [T_LBRACKET] = "[", [T_RBRACKET] = "]", [T_LBRACE] = "{", [T_RBRACE] = "}", [T_IDENT] = "(identifier)", [T_STRING] = "(string literal)", [T_NUMBER] = "(number)", [T_DECNUMBER] = "(decimal number)", [T_CONST] = "const", [T_DISCARD] = "discard", [T_ELIF] = "elif", [T_ELSE] = "else", [T_END] = "end", [T_IF] = "if", [T_LET] = "let", [T_PROC] = "proc", [T_RETURN] = "return", [T_VAR] = "var", [T_WHILE] = "while", [T_STRUCT] = "struct", [T_USE] = "use", [T_BREAK] = "break", [T_NEXT] = "next", [T_EOF] = "(EOF)", [T_ERROR] = "(error)", }; /* Non retarded ASCII character class comparison */ static bool ascii_isdigit(u32 c) { return c >= '0' && c <= '9'; } static bool ascii_isalpha(u32 c) { return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'); } static bool ascii_isspace(u32 c) { return c == ' ' || (c >= '\t' && c <= '\r'); } static bool ascii_isalnum(u32 c) { return ascii_isalpha(c) || ascii_isdigit(c); } static void update_line_count(LexState *ls, u8 chr) { switch (chr) { case '\n': ls->cur_loc.column = 1; ++ls->cur_loc.line; break; case '\t': /* fallthrough */ default: ++ls->cur_loc.column; } } static u8 peek(LexState *ls) { return *ls->fwd; } static void backup(LexState *ls, int n) { ls->fwd -= n; if (*ls->fwd == '\n') --ls->cur_loc.line; /* not quite right if fwd is \n... */ --ls->cur_loc.column; } static bool read_buf(LexState *ls, u8 *buf, isize n, isize *ar) { if (feof(ls->input_fp)) return false; const isize rb = fread(buf, sizeof(*buf), n, ls->input_fp); if (ferror(ls->input_fp)) { fatal(ls->cm, nil, "could not read input file\n"); } *ar = rb; buf[rb] = LEX_BUFFER_SENTINEL; return true; } static bool reload_buffers(LexState *ls) { //if ((ls->fwd != ls->buf + ls->buflen) || (ls->fwd != ls->buf + ls->buflen2)) // lex_fatal(ls, "invalid nil byte in middle of source file"); const u8 *end_of_buf1 = ls->buf + ls->buflen; const u8 *end_of_buf2 = ls->buf + LEX_HALF_BUFFER_SIZE + ls->buflen2; if (ls->fwd == end_of_buf1) { /* end of first buffer */ u8 *buf2 = ls->buf + LEX_HALF_BUFFER_SIZE; if (!read_buf(ls, buf2, LEX_HALF_BUFFER_SIZE, &ls->buflen2)) return false; /* reached EOF, no more data */ ls->fwd = buf2; } else if (ls->fwd == end_of_buf2) { /* end of second buffer */ u8 *buf1 = ls->buf; if (!read_buf(ls, buf1, LEX_HALF_BUFFER_SIZE, &ls->buflen)) return false; /* reached EOF, no more data */ ls->fwd = buf1; } /* reset pointers back to the beginning of the buffer */ ls->lbegin = ls->fwd; return true; } static MaybeChr read_chr(LexState *ls) { u8 chr = peek(ls); if (chr == LEX_BUFFER_SENTINEL) { /* maybe end of buffer */ if (!reload_buffers(ls)) return None(MaybeChr); } update_line_count(ls, chr); return Some(MaybeChr, *ls->fwd++); } static MaybeChr skip_whitespace(LexState *ls) { /* skip any whitespace * [ abc = 2*9 - 1 ] * ^-fwd, lbegin * [ abc = 2*9 - 1 ] * lbegin-^^-fwd * */ MaybeChr c; for (;;) { c = read_chr(ls); if (!c.ok) { return None(MaybeChr); } if (!ascii_isspace(c.val)) break; ++ls->lbegin; } return c; } static LexToken make_error(void) { return (LexToken){ .id = T_ERROR }; } static u8 * intern_identifier(LexState *ls, u8 *ident) { IdentsBucket *entry; if ((entry = shgetp_null(ls->idents, ident)) == nil) { shput(ls->idents, ident, 0); return (u8 *)shgets(ls->idents, ident).key; } return (u8 *)entry->key; } /* * *──┬(ident)┬──* * ╰───<───╯ */ static LexToken identifier(LexState *ls) { LexToken token = { .loc = ls->cur_loc }; /* this gets copied to the hash table arena, no problem */ u8 ident_buf[MAX_IDENT_SIZE]; usize i = 0; MaybeChr chr = { *ls->lbegin, true }; while (chr.ok && ascii_isident(chr.val)) { if (i + 1 == MAX_IDENT_SIZE) { lex_error(ls, "identifier is too long (max: %u)\n", MAX_IDENT_SIZE); return make_error(); } ident_buf[i++] = chr.val; chr = read_chr(ls); } ident_buf[i] = '\0'; /* ate 1 extra character, give it back */ if (chr.ok) backup(ls, 1); token.id = T_IDENT; token.ident = (Str){intern_identifier(ls, ident_buf), i}; token.len = i; return token; } static LexToken string_literal(LexState *ls) { LexToken token = { .loc = ls->cur_loc }; isize str_buf_len = STRING_LITERAL_BASE_SIZE; u8 *str_buf = malloc(str_buf_len); isize i = 0; /* skip past " */ MaybeChr chr = read_chr(ls); while (chr.val != '"') { if (i + 1 == STRING_LITERAL_MAX_SIZE) { lex_error(ls, "string literal length exceeds maximum of %d bytes", STRING_LITERAL_MAX_SIZE); goto err; } if (i + 1 > str_buf_len) { str_buf = realloc(str_buf, str_buf_len *= 2); } str_buf[i++] = chr.val; chr = read_chr(ls); if (!chr.ok || chr.val == '\n') { lex_error(ls, "unterminated string literal"); goto err; } } if (i > 0) { str_buf[i] = '\0'; } else { /* empty literal */ free(str_buf); /* we wasted our time */ str_buf = nil; } token.id = T_STRING; token.str = Str_from_buf(str_buf, i); token.len = i; return token; err: return make_error(); } /* Identifies a numeric literal that may have a prefix: * * ('0')─┬──────────────────────┬─* * ├('b')╭──┬(digit)┬─────╯ * ├('o')┤ ╰───<───╯ * ╰('x')╯ * Indirectly based on a BSD (?) implementation. */ static LexToken number_literal(LexState *ls) { LexToken token = { .id = T_NUMBER, .loc = ls->cur_loc }; u64 number = 0; u8 base = 10; MaybeChr chr = { *ls->lbegin, true }; if (chr.val == '0') { chr = read_chr(ls); /* skip 0 prefix */ if (!chr.ok) { /* EOF edge case */ return token; /* 0 */ } switch (chr.val) { case 'b': base = 2; break; case 'o': base = 8; break; case 'x': base = 16; break; default: if (ascii_isdigit(chr.val)) { lex_error(ls, "use '0o' for an octal literal"); return make_error(); } //lex_error(ls, "unknown numeric prefix '0%c'", chr.val); /* start of another token */ return token; /* 0 */ } chr = read_chr(ls); if (!chr.ok) { lex_error(ls, "expected a digit after the base prefix"); return make_error(); } } const u64 mmax = U64_MAX / base; static const u8 digits[] = "0123456789abcdef"; while (chr.ok) { u8 *digitp = memchr(digits, chr.val, lengthof(digits)); if (digitp == nil) break; u8 digit = digitp - digits; if (digit >= base) { lex_error(ls, "invalid literal"); return make_error(); } if (number > mmax) goto overflow; number *= base; /* overflow for adding the digit */ if (U64_MAX - digit < number) goto overflow; number += digit; chr = read_chr(ls); } if (chr.ok) backup(ls, 1); token.inumber = number; return token; overflow: lex_error(ls, "integer literal is too big (2^64 max)"); return make_error(); } static LexToken keyword(LexToken *t) { #define kwcmp(ident, kw, tid) \ {if (Str_equal(ident, kw)) return (LexToken){ .id = tid, .len = kw.len, .loc = t->loc };} Str ident = t->ident; --ident.len; switch (*ident.s++) { case 'a': kwcmp(ident, Sl("nd"), T_LOGAND); break; case 'b': kwcmp(ident, Sl("reak"), T_BREAK); break; case 'c': kwcmp(ident, Sl("onst"), T_CONST); break; case 'd': kwcmp(ident, Sl("iscard"), T_DISCARD); break; case 'e': kwcmp(ident, Sl("nd"), T_END); kwcmp(ident, Sl("lse"), T_ELSE); kwcmp(ident, Sl("lif"), T_ELIF); break; case 'i': kwcmp(ident, Sl("f"), T_IF); break; case 'l': kwcmp(ident, Sl("et"), T_LET); break; case 'n': kwcmp(ident, Sl("ot"), T_LOGNOT); kwcmp(ident, Sl("ext"), T_NEXT); break; case 'o': kwcmp(ident, Sl("r"), T_LOGOR); break; case 'p': kwcmp(ident, Sl("roc"), T_PROC); break; case 'r': kwcmp(ident, Sl("eturn"), T_RETURN); break; case 's': kwcmp(ident, Sl("truct"), T_STRUCT); break; case 'v': kwcmp(ident, Sl("ar"), T_VAR); break; case 'w': kwcmp(ident, Sl("hile"), T_WHILE); break; case 'u': kwcmp(ident, Sl("se"), T_USE); break; } return *t; #undef kwcmp } LexToken lex_scan(LexState *ls) { #define TOKEN(chr, t) case chr: token.id = t; break; if (arrlen(ls->backlist) > 0) { return arrpop(ls->backlist); } /* lexeme start pointer */ ls->lbegin = ls->fwd; LexToken token = {0}; MaybeChr c = skip_whitespace(ls); if (!c.ok) { token.id = T_EOF; ls->eof = true; return token; } token.loc = ls->cur_loc; //trace("token now: '%c'\n", c.val); //trace("lp: <%s>\n", ls->lbegin); //trace("fwd: <%s>\n", ls->fwd); switch (c.val) { case '!': if (peek(ls) == '=') { token.id = T_NOTEQUAL; ++ls->fwd; ++ls->cur_loc.column; } else { token.id = T_EXCLAMATION; } break; TOKEN('+', T_PLUS) TOKEN('-', T_MINUS) TOKEN('*', T_STAR) TOKEN('/', T_BAR) TOKEN('(', T_LPAREN) TOKEN(')', T_RPAREN) TOKEN(',', T_COMMA) TOKEN('<', T_LESSTHAN) TOKEN('>', T_GREATTHAN) TOKEN('#', T_HASH) TOKEN(':', T_COLON) TOKEN(';', T_SEMICOLON) TOKEN('[', T_LBRACKET) TOKEN(']', T_RBRACKET) TOKEN('{', T_LBRACE) TOKEN('}', T_RBRACE) case '=': if (peek(ls) == '=') { token.id = T_LOGICEQUAL; ++ls->fwd; ++ls->cur_loc.column; } else { token.id = T_EQUAL; } break; case '"': return string_literal(ls); case '0' ... '9': return number_literal(ls); default: { const u8 uc = c.val; if (ascii_isident_start(uc)) { LexToken ident_or_keyword = identifier(ls); if (ident_or_keyword.id != T_IDENT) return make_error(); return keyword(&ident_or_keyword); } if (uc > 0x7f) /* DEL, the last ASCII character */ lex_error(ls, "unicode tokens aren't allowed yet"); else lex_error(ls, "unknown token '%c' (\\x%02x)", uc, uc); return make_error(); } } return token; #undef TOKEN } /* Put a token into the backlist. The next call to `lex_scan` will return this * token. The backlist is a stack of tokens, so technically you can have unlimited * look-ahead at the cost of memory. */ void lex_backup(LexState *ls, LexToken token) { arrput(ls->backlist, token); } /* Checks if `t` token type is equal to `exp_tok`. This does not eat any token. */ bool lex_match(LexState *ls, LexToken *token, enum LexTokenId exp_tok) { if (token->id != exp_tok) { lex_error(ls, "expected '%s' but got '%s' instead\n", TokenIdStr[exp_tok], TokenIdStr[token->id]); return false; } return true; } LexState * lex_new(Compiler *cm, FILE *input_fp, Str file_name, usize tabsize) { LexState *ls = calloc(1, sizeof(*ls)); ls->buf = calloc(LEX_BUFFER_SIZE + 1, sizeof(*ls->buf)); ls->lbegin = ls->fwd = ls->buf; ls->tabsize = tabsize; ls->input_fp = input_fp; ls->cur_loc.line = 1; ls->cur_loc.source = file_name; ls->cm = cm; /* We use a hash table with string keys as a set containing all identifiers * in a compilation unit, to avoid dupplicate allocations. */ sh_new_arena(ls->idents); /* We provide our own buffering scheme */ setbuf(input_fp, nil); /* Initial fill of first buffer. * Any file error gets caught in the function, only thing that can happen * here is that the file is actually empty, so instant EOF. */ read_buf(ls, ls->buf, LEX_HALF_BUFFER_SIZE, &ls->buflen); return ls; } /* Destroys a lexing context and frees its allocated memory. * Note that this will also deallocate the identifier arena. */ void lex_destroy(LexState *ls) { shfree(ls->idents); arrfree(ls->backlist); free(ls->buf); free(ls); }