Initial commit

This commit is contained in:
tocariimaa 2025-01-12 18:20:42 -03:00
commit bba597f7bf
28 changed files with 6029 additions and 0 deletions

38
Makefile Normal file
View file

@ -0,0 +1,38 @@
# This makefile should work for both GNU and BSD Make I think...
# NOTE(review): `!=` needs GNU Make >= 4.0, and the `%.o: %.c` pattern rule and
# `$^` used below are GNU Make features -- BSD make support should be confirmed.
#
# Directory that is searched recursively for the compiler's C sources.
SRCDIR = ./compiler
COMPILER_SRCS != find $(SRCDIR) -type f -name '*.c'
# One object file and one compiler-generated dependency file (.d) per source.
COMPILER_OBJS := $(COMPILER_SRCS:.c=.o)
COMPILER_DEPS := $(COMPILER_OBJS:.o=.d)
# Sanitizer flags; they are part of both the default CFLAGS and LDFLAGS.
ASAN = -fsanitize=address,undefined
# Development defaults: no optimisation, debug info and sanitizers enabled.
# Override CFLAGS/LDFLAGS/ASAN on the command line for other build types.
CFLAGS := -Wall -Wextra -Wstrict-prototypes -Wold-style-definition -Wvla -Wwrite-strings \
-Wnull-dereference -pipe -O0 -ggdb3 -std=c11 $(ASAN)
LDFLAGS := $(ASAN)
# Default goal: build the compiler and the ast2dot tool.
all: rutilec ast2dot
# Link every compiler object into the `rutilec` binary ($(EXE) is an optional
# executable suffix that is empty unless set by the caller).
rutilec: $(COMPILER_OBJS)
$(CC) $(LDFLAGS) $^ -o $@$(EXE)
# ast2dot is compiled from its single source and linked with all compiler objects.
ast2dot: tools/ast2dot.c $(COMPILER_OBJS)
$(CC) $(CFLAGS) $(LDFLAGS) $(COMPILER_OBJS) $< -o $@$(EXE)
# Remove objects, dependency files and both binaries.
clean:
rm -f $(COMPILER_OBJS) $(COMPILER_DEPS) ./rutilec$(EXE) ./ast2dot$(EXE)
# Print the effective build configuration.
options:
@echo "Build options:"
@echo "CC = $(CC)"
@echo "CFLAGS = $(CFLAGS)"
@echo "LDFLAGS = $(LDFLAGS)"
@echo "ASan flags = $(ASAN)"
@echo "SRCS = $(COMPILER_SRCS)"
@echo "OBJS = $(COMPILER_OBJS)"
.PHONY: all clean options
# Pull in header dependencies written by -MMD; the leading `-` ignores files
# that do not exist yet (first build).
-include $(COMPILER_DEPS)
# Objects also depend on the Makefile so that flag changes trigger a rebuild.
# -MMD -MP writes the matching .d file next to each object.
%.o: %.c Makefile
$(CC) $(CFLAGS) -MMD -MP -c -o $@ $<

39
README.md Normal file
View file

@ -0,0 +1,39 @@
# Rutile
Yet another compiled programming language.
Very unstable and in early development.
```
proc main*(): cint
puts("Hello, world!")
return 0
end
```
## Building
### Build time dependencies
- C11 C compiler (tested on GCC, Clang and TCC)
- Libc
- BSD or GNU Make
- POSIX shell
- `find` command
### Build time dependencies (single header libraries)
These are contained in `compiler/libs`.
- [stb_ds.h](https://github.com/nothings/stb)
- [optparse](https://github.com/skeeto/optparse)
### Procedure
Note that the default `CFLAGS` and `LDFLAGS` are meant for development builds.
```sh
# debug build (uses default flags):
make -j$(nproc) rutilec
# for release:
make -j$(nproc) CFLAGS='-O2 -march=native -DNDEBUG' LDFLAGS='' ASAN=''
```
## License
GNU GPLv3 for the compiler and BSD 3-Clause for the standard library.
## Acknowledgements
- Christopher Wellons, for his public domain libraries.
- Sean Barrett's `stb_ds.h`.

151
compiler/ast.h Normal file
View file

@ -0,0 +1,151 @@
#ifndef _ast_h_
#define _ast_h_
#include "pre.h"
#include "datatype.h"
#include "symbol.h"
#include "location.h"
/* Node-kind classification helpers; `nk` is an `enum AstType` value.
 *
 * - atom:  identifier, number, string literal or procedure call
 * - unary: an AST_UNARY node or any atom
 * - expr:  an AST_BINEXPR node or anything unary
 *
 * Every use of the argument is parenthesised so that any expression can be
 * passed safely. `nk` is still evaluated more than once, so it must not have
 * side effects. */
#define ast_node_is_atom(nk) \
	((nk) == AST_IDENT || (nk) == AST_NUMBER || (nk) == AST_STRLIT || (nk) == AST_PROCCALL)
#define ast_node_is_unary(nk) \
	((nk) == AST_UNARY || ast_node_is_atom(nk))
#define ast_node_is_expr(nk) \
	((nk) == AST_BINEXPR || ast_node_is_unary(nk))
/* Kind tag stored in `Ast.type`. Where a trailing comment is present it names
 * the `struct Ast` union member that carries the payload for that kind. */
enum AstType
{
AST_INVALID, /* For use as a placeholder until the actual type is decided */
AST_NUMBER, /* number */
AST_IDENT, /* ident */
AST_STRLIT, /* strlit */
AST_PROCDEF, /* proc */
AST_PROCCALL, /* call */
AST_PROCCALL_ARGS, /* NOTE(review): payload member not documented; the C backend treats this kind as unreachable */
AST_VARDECL, /* var */
AST_VARASSIGN, /* varassgn */
AST_IF, /* ifse */
AST_RETURN, /* ret */
AST_BREAK, /* no payload is read for this kind by the C backend */
AST_LOOP, /* loop */
AST_STMTS, /* stmts */
AST_EXPRS, /* exprs */
AST_BINEXPR, /* bin */
AST_UNARY, /* unary */
AST_ATTRIBUTE, /* attribute */
AST_DISCARD, /* discard */
};
/* Forward declaration: node payloads below point at other nodes. */
typedef struct Ast Ast;
/* Payload of a binary expression node (`Ast.bin`). */
typedef struct {
Str op; /* operator text */
Ast *left, *right; /* operands */
DataType *type; /* filled in by sema */
} AstBinop;
/* Payload of a unary operator node (`Ast.unary`). */
typedef struct {
Str op; /* operator text */
Ast *atom; /* the operand */
DataType *type; /* filled in by sema */
} AstUnary;
/* A named, typed entry; used for procedure parameters (see `AstProc.args`). */
typedef struct {
Str ident; /* parameter name */
Str dtype; /* name of the parameter's data type */
/* Symbol kind for this parameter, `SymVar` would represent a mutable
* parameter and `SymLet` an immutable one. */
enum SymbolKind kind;
Location loc, dtype_loc; /* source locations of the name and of the type */
} AstIdentTypePair;
/* Payload of a procedure definition node (`Ast.proc`). */
typedef struct {
Str name;
bool ispublic; /* the C backend emits `static` when this is false */
Ast *body; /* may be nil (the C backend checks before emitting it) */
Vec(AstIdentTypePair) args; /* parameters, in declaration order */
Ast *rettype;
DataType *type; /* the C backend reads `type->proc.rettype` from this */
} AstProc;
/* Payload of a procedure call node (`Ast.call`). */
typedef struct {
Str name; /* name of the called procedure */
Ast *args; /* argument list node; nil when the call has no arguments */
} AstProcCall;
/* Payload of a variable declaration node (`Ast.var`). */
typedef struct {
Str name;
/* Data type, nil if no type was explicitly stated, meaning that
* type deduction must be made from the expression, also implying that
* if this field is nil, `expr` MUSTN'T be nil. */
Ast *datatype;
Ast *expr; /* if the declaration assigns a value */
enum SymbolKind kind; /* whether is a let, var or const... */
DataType *type; /* filled in by sema */
} AstVarDecl;
/* Payload of a variable assignment node (`Ast.varassgn`). */
typedef struct {
Str name; /* name of the assigned variable */
Ast *expr; /* the assigned value */
} AstVarAssign;
/* Payload of a numeric literal node (`Ast.number`). */
typedef struct {
u64 n; /* the literal's value */
DataType *type; /* filled in by the sema */
} AstNumber;
/* One `elif` branch: a condition and the body to run when it holds. */
typedef struct {
Ast *cond;
Ast *body;
} AstElif;
/* Payload of an if node (`Ast.ifse`). */
typedef struct {
Ast *cond;
Ast *true_body;
Ast *false_body; /* nil when there is no else branch (checked by the C backend) */
Vec(AstElif) elifs; /* elif branches */
} AstIf;
/* Abstract representation of a loop, providing a pre and post condition.
* `while` loops are modelled as a loop with a precondition only.
* For infinite loops both `precond` and `postcond` are nil.
* This is the payload of a loop node (`Ast.loop`). */
typedef struct {
Ast *precond, *postcond, *body;
} AstLoop;
/* Payload of an attribute node (`Ast.attribute`): a list of attributes
* attached to another node. */
typedef struct {
/* Attributes for now can only be identifiers */
Vec(Str) attrs;
Ast *node; /* The decorated node */
} AstAttribute;
/* Payload of a discard node (`Ast.discard`). */
typedef struct {
Ast *expr; /* the node whose value is discarded */
} AstDiscard;
/* A single AST node: a kind tag, a payload union selected by that tag (see
* `enum AstType`) and the node's source location. */
struct Ast {
enum AstType type;
union {
AstBinop bin; /* binary expression */
AstUnary unary; /* unary operator */
AstNumber number; /* number (this is an atom) */
Str ident; /* identifier (this is an atom too) */
AstProc proc; /* procedure definition */
AstProcCall call; /* procedure call */
AstVarDecl var; /* variable declaration */
AstVarAssign varassgn; /* variable assignment */
Ast *ret; /* return statement, this points to its expression (if any) */
AstIf ifse; /* if statement/expression */
AstLoop loop; /* loop */
Vec(Ast *) stmts; /* statement list */
Vec(Ast *) exprs; /* expression list */
Str strlit; /* String literal */
AstAttribute attribute; /* attribute list plus decorated node */
AstDiscard discard; /* discard statement */
};
Location loc; /* location in the source code of this node */
};
/* Compile-time guard against the node structure growing past 512 bytes. */
_Static_assert(sizeof(Ast) <= 512, "AST node got too bloated");
#endif

10
compiler/cgBackends.h Normal file
View file

@ -0,0 +1,10 @@
#ifndef _cgbackends_
#define _cgbackends_
/* Code generation backends selectable through codegen_new(). */
enum CodegenBackends
{
CgBackendC, /* C source backend (cgC) */
CgBackendLibGccJit, /* libgccjit backend */
};
#endif

382
compiler/cgC.c Normal file
View file

@ -0,0 +1,382 @@
#include <stdio.h>
#include "pre.h"
#include "codegen.h"
#include "cgC.h"
#include "ast.h"
#include "libs/stb_ds.h"
/* Write ";\n" / "}\n" to the stream `out`. */
#define EMIT_SEMICOLON_NL(out) fputs(";\n", out)
#define EMIT_RB_NL(out) fputs("}\n", out)
/* Forward declarations: the emitters below are mutually recursive. */
static void
emit_expr(CodegenC *cgc, const Ast *expr);
static void
emit_expr_list(CodegenC *cgc, const Vec(Ast *) exprs, bool sep);
static void
emit_node(CodegenC *cgc, const Ast *node);
/* Writes `cgc->indent` tab characters to the output stream. A zero or
 * negative indentation level writes nothing. */
static void
indent(CodegenC *cgc)
{
	for (int remaining = cgc->indent; remaining > 0; --remaining)
		fputc('\t', cgc->cgctx->out);
}
/* Looks up `str` in the code generator's string table and returns its ID.
 * A string seen for the first time is added to the table under the next free
 * ID (`strlit_no`), which is then advanced. */
static i64
intern_strlit(CodegenC *cgc, const Str *str)
{
	i64 id = shget(cgc->cgctx->strings, str->s);

	if (id == -1) { /* -1 is the table's "not found" default */
		id = cgc->cgctx->strlit_no++;
		shput(cgc->cgctx->strings, str->s, id);
	}
	return id;
}
/* Emits `comment` as a C block comment, followed by a newline when
 * `nl_after` is true.
 *
 * The newline is written separately: formatting it through "%c" with '\0' as
 * the "no newline" value would put a literal NUL byte into the generated
 * source. */
static void
emit_comment(CodegenC *cgc, Str comment, bool nl_after)
{
	fprintf(cgc->cgctx->out, "/* %s */", comment.s);
	if (nl_after)
		fputc('\n', cgc->cgctx->out);
}
/* Emits an `#include` line for `path`: quoted when `local` is true,
 * angle-bracketed otherwise. */
static void
emit_include(CodegenC *cgc, Str path, bool local)
{
	const char opening = local ? '"' : '<';
	const char closing = local ? '"' : '>';

	fprintf(cgc->cgctx->out, "#include %c%s%c\n", opening, path.s, closing);
}
/* Maps a basic or void data type to the name of the matching C type.
 *
 * Basic types are mapped by size in bytes to the <stdint.h> fixed-width
 * integer types, honouring `dt->sign` (signed types map to `intN_t`, unsigned
 * ones to `uintN_t`). A size of 0 and the `DtkVoid` kind map to "void".
 *
 * Returns nil for any kind or size that has no mapping; callers must handle
 * that case. `cgc` is currently unused. */
static const char *
basic_datatype_to_c(CodegenC *cgc, const DataType *dt)
{
	(void)cgc;
	switch (dt->kind) {
	case DtkBasic:
		switch (dt->size) {
		case 0: return "void";
		case 1: return dt->sign ? "int8_t" : "uint8_t";
		case 2: return dt->sign ? "int16_t" : "uint16_t";
		case 4: return dt->sign ? "int32_t" : "uint32_t";
		case 8: return dt->sign ? "int64_t" : "uint64_t";
		default: break; /* no C type of this size */
		}
		break;
	case DtkVoid:
		return "void";
	default: /* not a basic type */
		break;
	}
	return nil;
}
/* Writes the C spelling of `dt`: `struct <name>` for struct types, the name
 * returned by basic_datatype_to_c() for basic and void types. Other kinds
 * produce no output. */
static void
emit_datatype(CodegenC *cgc, const DataType *dt)
{
	if (dt->kind == DtkStruct) {
		fprintf(cgc->cgctx->out, "struct %s", dt->name.s);
		return;
	}
	if (dt->kind == DtkBasic || dt->kind == DtkVoid)
		fputs(basic_datatype_to_c(cgc, dt), cgc->cgctx->out);
}
/* Writes `attr` wrapped in the compiler attribute syntax
 * `__attribute((...))`. */
static void
emit_c_attribute(CodegenC *cgc, Str attr)
{
	FILE *const stream = cgc->cgctx->out;

	fprintf(stream, "__attribute((%s))", attr.s);
}
/* Emits a C struct declaration for the compound type `dt`, adding the
 * `packed` attribute when `dt->compound.packed` is set.
 *
 * The header is written with fprintf(): fputs() does not interpret "%s" and
 * would print the conversion literally instead of the struct's name.
 *
 * NOTE(review): only the type of each field is emitted; `DataTypeCompound`
 * stores no field names in the visible code, so the output is not yet a
 * complete C declaration. */
static void
emit_structdecl(CodegenC *cgc, const DataType *dt)
{
	FILE *const out = cgc->cgctx->out;
	const isize nfields = arrlen(dt->compound.fields);

	fprintf(out, "struct %s {\n", dt->name.s);
	for (isize i = 0; i < nfields; ++i) {
		emit_datatype(cgc, dt->compound.fields[i]);
		EMIT_SEMICOLON_NL(out);
	}
	fputc('}', out);
	if (dt->compound.packed)
		emit_c_attribute(cgc, Sl("packed"));
	EMIT_SEMICOLON_NL(out);
}
/* Emits a C variable declaration: an optional `const` qualifier (for
 * `SymConst` declarations), the type, the name and, when the declaration
 * carries an expression, an initializer. */
static void
emit_vardecl(CodegenC *cgc, const AstVarDecl *decl)
{
	FILE *const out = cgc->cgctx->out;
	const bool is_const = decl->kind == SymConst;
	const bool has_init = decl->expr != nil;

	if (is_const)
		fputs("const ", out);
	emit_datatype(cgc, decl->type);
	fprintf(out, " %s", decl->name.s);
	if (has_init) {
		fputc('=', out);
		emit_expr(cgc, decl->expr);
	}
	fputs(";\n", out);
}
/* Emits the assignment statement `<name> = <expr>;`. */
static void
emit_varassign(CodegenC *cgc, const AstVarAssign *assign)
{
	FILE *const out = cgc->cgctx->out;

	fprintf(out, "%s = ", assign->name.s);
	emit_expr(cgc, assign->expr);
	fputs(";\n", out);
}
/* Emits a C function definition for `proc`.
 *
 * Non-public procedures are declared `static`. The return type comes from
 * `proc->type->proc.rettype`. An empty parameter list is spelled `(void)`.
 * The body is emitted only when present. */
static void
emit_proc(CodegenC *cgc, const AstProc *proc)
{
	FILE *const out = cgc->cgctx->out;
	const isize nparams = arrlen(proc->args);

	if (!proc->ispublic)
		fputs("static ", out);
	emit_datatype(cgc, proc->type->proc.rettype);
	fprintf(out, " %s(", proc->name.s);

	if (nparams == 0) {
		fputs("void", out);
	} else {
		for (isize i = 0; i < nparams; ++i) {
			const AstIdentTypePair *param = &proc->args[i];

			if (i > 0)
				fputc(',', out);
			/* TODO: emit the parameter's real data type; every
			 * parameter is currently declared as uint64_t. */
			fputs("uint64_t ", out);
			fputs((char *)param->ident.s, out);
		}
	}

	fputs(")\n{\n", out);
	if (proc->body != nil)
		emit_node(cgc, proc->body);
	fputs("}\n", out);
}
/* Emits a call expression `<name>(<args>)` with comma separated arguments.
 * No statement terminator is written. */
static void
emit_proccall(CodegenC *cgc, const AstProcCall *call)
{
	FILE *const out = cgc->cgctx->out;
	const Ast *arglist = call->args;

	fprintf(out, "%s(", call->name.s);
	if (arglist != nil)
		emit_expr_list(cgc, (const Vec(Ast *))arglist->stmts, true);
	fputc(')', out);
}
/* Emits an if statement: the main branch, one `else if` per entry of
 * `ift->elifs` (in order) and the `else` branch when `false_body` is set.
 *
 * Previously the elif branches were dropped from the generated code.
 * Branch bodies that are nil are emitted as empty blocks. */
static void
emit_if(CodegenC *cgc, const AstIf *ift)
{
	FILE *const out = cgc->cgctx->out;
	const isize nelifs = arrlen(ift->elifs);

	fputs("if (", out);
	emit_expr(cgc, ift->cond);
	fputs("){\n", out);
	if (ift->true_body != nil)
		emit_node(cgc, ift->true_body);
	fputc('}', out);

	for (isize i = 0; i < nelifs; ++i) {
		const AstElif *branch = &ift->elifs[i];

		fputs("else if (", out);
		emit_expr(cgc, branch->cond);
		fputs("){\n", out);
		if (branch->body != nil)
			emit_node(cgc, branch->body);
		fputc('}', out);
	}

	if (ift->false_body != nil) {
		fputs("else", out);
		fputs("{\n", out);
		emit_node(cgc, ift->false_body);
		fputc('}', out);
	}
	fputc('\n', out);
}
/* Emits a C `while` loop driven by the loop's precondition. */
static void
emit_whileLoop(CodegenC *cgc, const AstLoop *whl)
{
	FILE *const out = cgc->cgctx->out;

	fputs("while (", out);
	emit_expr(cgc, whl->precond);
	fputs("){\n", out);
	emit_node(cgc, whl->body);
	EMIT_RB_NL(out);
}
/* Emits a loop node.
 *
 * - precondition set:      a `while` loop (see emit_whileLoop)
 * - only postcondition set: not implemented yet (unreachable)
 * - no condition at all:   an infinite `for (;;)` loop
 *
 * The infinite case used to emit nothing, which silently dropped the loop
 * and its body from the generated code. */
static void
emit_loop(CodegenC *cgc, const AstLoop *loop)
{
	if (loop->precond != nil) {
		emit_whileLoop(cgc, loop);
	} else if (loop->postcond != nil) {
		unreachable();
	} else {
		FILE *const out = cgc->cgctx->out;

		fputs("for (;;){\n", out);
		if (loop->body != nil)
			emit_node(cgc, loop->body);
		fputs("}\n", out);
	}
}
/* Emits `return <expr>;`. emit_expr() writes nothing for a nil expression,
 * so a bare return comes out as `return ;`. */
static void
emit_return(CodegenC *cgc, const Ast *ret_expr)
{
	FILE *const out = cgc->cgctx->out;

	fputs("return ", out);
	emit_expr(cgc, ret_expr);
	fputs(";\n", out);
}
/* Emits a `break;` statement. The node argument is not used. */
static void
emit_break(CodegenC *cgc, const Ast *unused)
{
	FILE *const out = cgc->cgctx->out;

	(void)unused;
	fputs("break", out);
	EMIT_SEMICOLON_NL(out);
}
/* Emits a discard statement.
 *
 * An expression is emitted as `(void)<expr>;` so the generated C is a
 * complete statement that explicitly ignores the value. Anything that is not
 * an expression node is handed to emit_node() unchanged. A nil operand emits
 * nothing instead of being dereferenced. */
static void
emit_discard(CodegenC *cgc, const Ast *expr)
{
	if (expr == nil)
		return;
	if (!ast_node_is_expr(expr->type)) {
		emit_node(cgc, expr);
		return;
	}
	fputs("(void)", cgc->cgctx->out);
	emit_expr(cgc, expr);
	EMIT_SEMICOLON_NL(cgc->cgctx->out);
}
/* Emits an integer literal in decimal.
 *
 * The value is converted to `unsigned long long` and printed with "%llu":
 * "%lu" only matches a 64-bit value on platforms where `unsigned long` is
 * 64 bits wide. */
static void
emit_expr_number(CodegenC *cgc, const AstNumber *num)
{
	fprintf(cgc->cgctx->out, "%llu", (unsigned long long)num->n);
}
/* Emits a string literal enclosed in double quotes.
 *
 * The lexer represents an empty literal with a nil data pointer; that case
 * is emitted as `""` rather than passing a null pointer to "%s". */
static void
emit_expr_strlit(CodegenC *cgc, const Str *strlit)
{
	const char *text = strlit->s != nil ? (const char *)strlit->s : "";

	fprintf(cgc->cgctx->out, "\"%s\"", text);
}
/* Emits an identifier verbatim. */
static void
emit_expr_ident(CodegenC *cgc, const Str *ident)
{
	const char *name = (char *)ident->s;

	fputs(name, cgc->cgctx->out);
}
/* Emits a unary expression as `(<op><operand>)`.
 *
 * The operator used to be dropped, so e.g. a negation was emitted as its bare
 * operand. Operators with a direct C spelling (`-`, `+`, `!`, `~`) are
 * written as-is and the keyword `not` is translated to `!`. For an empty or
 * unrecognised operator only the operand is emitted, which is the previous
 * behaviour. */
static void
emit_expr_unary(CodegenC *cgc, const AstUnary *unary)
{
	FILE *const out = cgc->cgctx->out;
	const Str op = unary->op;
	char c_op = '\0';

	if (op.s != nil && op.len == 1) {
		switch (op.s[0]) {
		case '-': case '+': case '!': case '~':
			c_op = (char)op.s[0];
			break;
		default:
			break;
		}
	} else if (op.s != nil && op.len == 3
			&& op.s[0] == 'n' && op.s[1] == 'o' && op.s[2] == 't') {
		c_op = '!';
	}

	if (c_op == '\0') { /* no usable operator */
		emit_expr(cgc, unary->atom);
		return;
	}
	fputc('(', out);
	fputc(c_op, out);
	emit_expr(cgc, unary->atom);
	fputc(')', out);
}
/* Emits a binary expression as `(<left><op><right>)`.
 *
 * The operator text stored in the node is used instead of a hard-coded '+',
 * which made every binary expression an addition. The keywords `and` and `or`
 * are translated to `&&` and `||`; any other operator text is written
 * verbatim. An empty operator falls back to '+', the previous behaviour. */
static void
emit_expr_binop(CodegenC *cgc, const AstBinop *expr)
{
	FILE *const out = cgc->cgctx->out;
	const Str op = expr->op;

	/* guard binops with parenthesis, even if they are redundant */
	fputc('(', out);
	emit_expr(cgc, expr->left);
	if (op.s == nil || op.len == 0)
		fputc('+', out);
	else if (op.len == 3 && op.s[0] == 'a' && op.s[1] == 'n' && op.s[2] == 'd')
		fputs("&&", out);
	else if (op.len == 2 && op.s[0] == 'o' && op.s[1] == 'r')
		fputs("||", out);
	else
		fwrite(op.s, 1, (size_t)op.len, out);
	emit_expr(cgc, expr->right);
	fputc(')', out);
}
/* Dispatches an expression node to the emitter for its kind. A nil
 * expression emits nothing; a node that is not an expression is treated as
 * unreachable. */
static void
emit_expr(CodegenC *cgc, const Ast *expr)
{
	if (expr == nil)
		return;

	const enum AstType kind = expr->type;

	if (kind == AST_BINEXPR)
		emit_expr_binop(cgc, &expr->bin);
	else if (kind == AST_UNARY)
		emit_expr_unary(cgc, &expr->unary);
	else if (kind == AST_NUMBER)
		emit_expr_number(cgc, &expr->number);
	else if (kind == AST_STRLIT)
		emit_expr_strlit(cgc, &expr->strlit);
	else if (kind == AST_IDENT)
		emit_expr_ident(cgc, &expr->ident);
	else if (kind == AST_PROCCALL)
		emit_proccall(cgc, &expr->call);
	else
		unreachable();
}
/* Emits every expression of `exprs` in order. When `sep` is true a comma is
 * written between consecutive expressions (never after the last one). */
static void
emit_expr_list(CodegenC *cgc, const Vec(Ast *) exprs, bool sep)
{
	const isize count = arrlen(exprs);

	for (isize idx = 0; idx < count; ++idx) {
		if (sep && idx > 0)
			fputc(',', cgc->cgctx->out);
		emit_expr(cgc, exprs[idx]);
	}
}
/* Emits every statement of `stmts` in order. */
static void
emit_stmt_list(CodegenC *cgc, Vec(Ast *) stmts)
{
	const isize count = arrlen(stmts);
	isize idx = 0;

	while (idx < count)
		emit_node(cgc, stmts[idx++]);
}
/* Emits one statement-level node by dispatching on its kind.
 *
 * Changes over the earlier version:
 * - a nil node emits nothing instead of being dereferenced;
 * - an expression used as a statement is terminated with ";", which the
 *   generated C needs to be valid;
 * - an attribute node emits the node it decorates, where it used to emit
 *   nothing at all.
 *
 * Kinds that must never reach statement level are unreachable. */
static void
emit_node(CodegenC *cgc, const Ast *node)
{
	if (node == nil)
		return;
	switch (node->type) {
	case AST_STMTS:
		emit_stmt_list(cgc, node->stmts);
		break;
	case AST_PROCDEF:
		emit_proc(cgc, &node->proc);
		break;
	case AST_PROCCALL:
		emit_proccall(cgc, &node->call);
		EMIT_SEMICOLON_NL(cgc->cgctx->out);
		break;
	case AST_IF:
		emit_if(cgc, &node->ifse);
		break;
	case AST_LOOP:
		emit_loop(cgc, &node->loop);
		break;
	case AST_RETURN:
		emit_return(cgc, node->ret);
		break;
	case AST_BREAK:
		emit_break(cgc, nil);
		break;
	case AST_DISCARD:
		emit_discard(cgc, node->discard.expr);
		break;
	case AST_VARDECL:
		emit_vardecl(cgc, &node->var);
		break;
	case AST_VARASSIGN:
		emit_varassign(cgc, &node->varassgn);
		break;
	case AST_BINEXPR:
	case AST_UNARY:
	case AST_NUMBER:
	case AST_STRLIT:
	case AST_IDENT:
		/* expression statement */
		emit_expr(cgc, node);
		EMIT_SEMICOLON_NL(cgc->cgctx->out);
		break;
	case AST_ATTRIBUTE:
		/* the attributes themselves have no C output yet */
		emit_node(cgc, node->attribute.node);
		break;
	case AST_PROCCALL_ARGS:
	case AST_EXPRS:
	case AST_INVALID:
		unreachable();
	}
}
/* Entry point of the C backend: writes the generated C for `program` to
 * stdout, preceded by a comment naming the source file and an include of
 * <stdint.h>. */
void
cgC(CodegenC *cgc, const Ast *program)
{
	char banner[255] = {0};

	cgc->cgctx->out = stdout;

	snprintf(banner, sizeof(banner),
		"generated C IR from %s", cgc->cgctx->cctx->current_filename.s
	);
	emit_comment(cgc, Str_from_c(banner), true);

	emit_include(cgc, Sl("stdint.h"), false);
	fputc('\n', cgc->cgctx->out);

	emit_node(cgc, program);
}

15
compiler/cgC.h Normal file
View file

@ -0,0 +1,15 @@
#ifndef _cgC_h_
#define _cgC_h_
#include "codegen.h"
#include "ast.h"
/* State of the C backend. */
typedef struct {
CodegenCtx *cgctx; /* shared code generation context (output stream, string table, ...) */
int indent; /* number of tab characters written by the backend's indent helper */
} CodegenC;
/* Generates C code for `program`; the output is written to stdout. */
void
cgC(CodegenC *cgc, const Ast *program);
#endif

101
compiler/codegen.c Normal file
View file

@ -0,0 +1,101 @@
#define _POSIX_C_SOURCE 200809L
#include <unistd.h>
#include <spawn.h>
#include <sys/wait.h>
#include "codegen.h"
#include "cgC.h"
#include "messages.h"
#include "libs/stb_ds.h"
/* (Std)In --> process --> (Std)Out
*
* Spawns the program at `path` with arguments `argv` and optionally connects
* pipes to its standard streams:
* - when `in` is not nil, `*in` receives a stream opened for writing whose
*   data arrives on the child's stdin;
* - when `out` is not nil, `*out` receives a stream opened for reading that
*   yields what the child writes to stdout.
* The child's process ID is stored in `*pid`. Every failure is reported
* through fatal().
* NOTE(review): nil is passed as the child's environment; confirm this is
* intended for the spawned tools. */
void
spawn_with_iofp(const char *path, char *const *argv,
pid_t *pid, FILE **in, FILE **out)
{
int irp[2], asmp[2];
posix_spawn_file_actions_t fileacts;
posix_spawn_file_actions_init(&fileacts);
if (in != nil) {
/* the "in" pipe */
if (pipe(irp) < 0)
fatal(nil, nil, "could not open pipe");
/* in the child: drop the write end, make the read end its stdin */
posix_spawn_file_actions_addclose(&fileacts, irp[1]);
posix_spawn_file_actions_adddup2(&fileacts, irp[0], STDIN_FILENO);
}
if (out != nil) {
/* the "out" pipe */
if (pipe(asmp) < 0)
fatal(nil, nil, "could not open pipe");
/* in the child: drop the read end, make the write end its stdout */
posix_spawn_file_actions_addclose(&fileacts, asmp[0]);
posix_spawn_file_actions_adddup2(&fileacts, asmp[1], STDOUT_FILENO);
}
if (posix_spawn(pid, path, &fileacts, nil, argv, nil) != 0)
fatal(nil, nil, "posix_spawn failed");
posix_spawn_file_actions_destroy(&fileacts);
if (in != nil) {
/* the parent keeps only the write end of the "in" pipe */
close(irp[0]);
if ((*in = fdopen(irp[1], "wb")) == nil)
fatal(nil, nil, "fdopen fail");
}
if (out != nil) {
/* the parent keeps only the read end of the "out" pipe */
close(asmp[1]);
if ((*out = fdopen(asmp[0], "rb")) == nil)
fatal(nil, nil, "fdopen fail");
}
}
/* Waits for the child process `pid` to terminate and reports a fatal error
 * when waiting fails, when the child did not exit normally, or when it
 * exited with a non-zero status.
 *
 * The result of waitpid() is checked so the status word is never read
 * uninitialised. The messages refer to a generic child process because this
 * function is used for every spawned helper, not only for qbe. */
void
process_wait(pid_t pid)
{
	int pstat = 0;

	if (waitpid(pid, &pstat, 0) < 0)
		fatal(nil, nil, "could not wait for child process %ld", (long)pid);
	if (!WIFEXITED(pstat))
		fatal(nil, nil, "child process crashed");
	/* did not crash, read return status */
	const int exitc = WEXITSTATUS(pstat);
	if (exitc != 0)
		fatal(nil, nil, "child process exited with non-zero status %d", exitc);
}
/* Allocates and initialises a code generation context for compiler `cm`
 * using `backend`.
 *
 * Both child process IDs start as -1 ("no process"), and the string table is
 * created with -1 as its "not found" default. Allocation failure is reported
 * through fatal(); nil is returned should fatal() ever return.
 * Release the context with codegen_destroy(). */
CodegenCtx *
codegen_new(Compiler *cm, enum CodegenBackends backend)
{
	CodegenCtx *ctx = calloc(1, sizeof(*ctx));

	if (ctx == nil) {
		fatal(cm, nil, "out of memory");
		return nil;
	}
	ctx->ext_pid = ctx->ld_pid = -1;
	ctx->backend = backend;
	ctx->cctx = cm;
	sh_new_arena(ctx->strings);
	shdefault(ctx->strings, -1);
	return ctx;
}
/* Waits for any child process recorded in the context (a PID of -1 means
 * "none"), then frees the string table and the context itself. */
void
codegen_destroy(CodegenCtx *cgctx)
{
	const pid_t children[2] = { cgctx->ext_pid, cgctx->ld_pid };

	for (int i = 0; i < 2; ++i) {
		if (children[i] != -1)
			process_wait(children[i]);
	}
	shfree(cgctx->strings);
	free(cgctx);
}
/* Runs the backend selected in `cgctx` over `program`. The C backend is
 * started with an indentation level of 2; the libgccjit backend is not
 * implemented and reports a fatal error. */
void
codegen(CodegenCtx *cgctx, Ast *program)
{
	if (cgctx->backend == CgBackendC) {
		CodegenC c_backend = { .cgctx = cgctx, .indent = 2 };

		cgC(&c_backend, program);
	} else if (cgctx->backend == CgBackendLibGccJit) {
		fatal(nil, nil, "libgccjit backend not implemented yet");
	}
}

37
compiler/codegen.h Normal file
View file

@ -0,0 +1,37 @@
#ifndef _codegen_h_
#define _codegen_h_
#include <stdio.h> /* FILE */
#include <sys/types.h> /* for pid_t */
#include "pre.h"
#include "ast.h"
#include "state.h"
#include "cgBackends.h"
/* Shared state of a code generation run; created by codegen_new() and
* released by codegen_destroy(). */
typedef struct {
FILE *out; /* Stream the backend writes generated code to (originally described as "QBE IR"; the C backend points it at stdout) */
FILE *asm_out; /* NOTE(review): not referenced by the code visible here */
i64 strlit_no; /* next ID to hand out when a new string literal is interned */
i64 internal_label; /* NOTE(review): not referenced by the code visible here */
/* Hash map acting as a set, which contains all strings in a compilation
* unit. Strings get interned on this hash map to remove duplicates.
*/
HashMapStr(i64) *strings;
pid_t ext_pid, ld_pid; /* child processes waited for on destroy; -1 means none */
enum CodegenBackends backend; /* backend selected at creation */
Compiler *cctx; /* owning compiler state */
} CodegenCtx;
/* Spawns `path` with `argv`; when `in`/`out` are not nil they receive streams
* connected to the child's stdin/stdout. The child's PID is stored in `*pid`. */
void
spawn_with_iofp(const char *path, char *const *argv, pid_t *pid, FILE **in, FILE **out);
/* Waits for `pid`; reports a fatal error unless it exited with status 0. */
void
process_wait(pid_t pid);
/* Creates a code generation context for compiler `cm` using `backend`. */
CodegenCtx *
codegen_new(Compiler *cm, enum CodegenBackends backend);
/* Waits for pending child processes and frees the context. */
void
codegen_destroy(CodegenCtx *cgctx);
/* Generates code for `program` with the context's backend. */
void
codegen(CodegenCtx *cgctx, Ast *program);
#endif

53
compiler/datatype.h Normal file
View file

@ -0,0 +1,53 @@
#ifndef _datatype_h_
#define _datatype_h_
#include "pre.h"
/* Category of a data type; stored in `DataType.kind`. */
enum DataTypeKind
{
DtkInvalid = 0, /* zero value, so a zero-initialised DataType is invalid */
DtkVoid,
DtkBasic, /* the C backend maps these to fixed-width integer types by size */
DtkStruct,
DtkUnion,
DtkProc,
DtkArray,
DtkBool,
};
/* Forward declaration: data types refer to other data types. */
typedef struct DataType DataType;
/* Description of a compound (struct or union) type. */
typedef struct {
bool packed; /* the C backend adds the `packed` attribute when set */
Vec(DataType *) fields; /* types of the fields, in order */
} DataTypeCompound;
/* A data type. Which member of the anonymous union is meaningful depends on
* `kind`. */
struct DataType
{
enum DataTypeKind kind;
u16 size; /* size in bytes of the data type */
bool builtin; /* if this type is defined in compilerland */
bool sign; /* if the type is numerical and has a sign or not */
Str name;
union {
DataTypeCompound compound; /* Represents either a struct or union type */
/* Procedure type */
struct {
DataType *rettype; /* return type */
Vec(DataType *) argtypes; /* parameter types */
bool public;
bool extern_lnk; /* external linkage */
bool c_varargs; /* C-style varargs (for FFI) */
} proc;
/* Array type */
struct {
DataType *base; /* element type */
isize len; /* number of elements */
} array;
};
};
/* Result of a type check. */
typedef struct {
bool ok; /* whether the type checking succeeded */
Str msg; /* message describing the type error */
} DataTypeCheck;
#endif

581
compiler/lex.c Normal file
View file

@ -0,0 +1,581 @@
#include <stdio.h> /* feof, ferror, fread, FILE, EOF */
#include <stdlib.h> /* malloc calloc free */
#include <string.h> /* memset */
#include "lex.h"
#include "messages.h"
#include "pre.h"
#include "libs/stb_ds.h"
/* Total size of the lexing buffer; it is used as two halves. */
#define LEX_BUFFER_SIZE 8192
/* Parenthesised so the division cannot be re-associated by the operators
 * surrounding a use of the macro. */
#define LEX_HALF_BUFFER_SIZE (LEX_BUFFER_SIZE / 2)
/* Byte written after the last valid byte of each buffer half. */
#define LEX_BUFFER_SENTINEL '\0'
/* Size of the identifier scratch buffer, terminator included. */
#define MAX_IDENT_SIZE 1024u
/* Initial and maximum size of the string literal buffer. */
#define STRING_LITERAL_BASE_SIZE 255
#define STRING_LITERAL_MAX_SIZE 4096
#define at_buffer_end(ls) (*(ls)->fwd == LEX_BUFFER_SENTINEL)
/* Character classes for identifiers. Arguments are parenthesised; they are
 * evaluated more than once, so they must not have side effects. */
#define ascii_isident(c) ((c) == '_' || (c) == '?' || (c) == '!' || ascii_isalnum(c))
#define ascii_isident_start(c) ((c) == '_' || ascii_isalpha(c))
/* Report a (fatal) error at the lexer's current source location. */
#define lex_error(ls, ...) do { \
	error((ls)->cm, &(ls)->cur_loc, __VA_ARGS__); \
} while(0)
#define lex_fatal(ls, ...) do { \
	fatal((ls)->cm, &(ls)->cur_loc, __VA_ARGS__); \
} while(0)
/* Optional character: `ok` tells whether `val` holds a character that was
* read; a value without `ok` is used to signal the end of input. */
typedef Optional(u8) MaybeChr;
const char *TokenIdStr[T_TOKEN_COUNT] = {
[T_INVALID] = "(invalid token)",
[T_PLUS] = "+",
[T_MINUS] = "-",
[T_STAR] = "*",
[T_BAR] = "/",
[T_EXCLAMATION] = "!",
[T_LPAREN] = "(",
[T_RPAREN] = ")",
[T_COMMA] = ",",
[T_LESSTHAN] = "<",
[T_GREATTHAN] = ">",
[T_LOGAND] = "and",
[T_LOGOR] = "or",
[T_EQUAL] = "=",
[T_LOGICEQUAL] = "==",
[T_NOTEQUAL] = "!=",
[T_HASH] = "#",
[T_COLON] = ":",
[T_SEMICOLON] = ";",
[T_LBRACKET] = "[",
[T_RBRACKET] = "]",
[T_LBRACE] = "{",
[T_RBRACE] = "}",
[T_IDENT] = "(identifier)",
[T_STRING] = "(string literal)",
[T_NUMBER] = "(number)",
[T_DECNUMBER] = "(decimal number)",
[T_CONST] = "const",
[T_DISCARD] = "discard",
[T_ELIF] = "elif",
[T_ELSE] = "else",
[T_END] = "end",
[T_IF] = "if",
[T_LET] = "let",
[T_PROC] = "proc",
[T_RETURN] = "return",
[T_VAR] = "var",
[T_WHILE] = "while",
[T_STRUCT] = "struct",
[T_USE] = "use",
[T_BREAK] = "break",
[T_NEXT] = "next",
[T_EOF] = "(EOF)",
[T_ERROR] = "(error)",
};
/* ASCII-only character classification helpers.
 *
 * Returns true when `c` is one of the decimal digits '0'..'9'. */
static bool
ascii_isdigit(u32 c)
{
	if (c < '0')
		return false;
	return c <= '9';
}
/* Returns true when `c` is an ASCII letter, lower or upper case. */
static bool
ascii_isalpha(u32 c)
{
	const bool lower = c >= 'a' && c <= 'z';
	const bool upper = c >= 'A' && c <= 'Z';

	return lower || upper;
}
/* Returns true when `c` is a space or one of the control characters in the
 * range '\t'..'\r'. */
static bool
ascii_isspace(u32 c)
{
	if (c == ' ')
		return true;
	return c >= '\t' && c <= '\r';
}
/* Returns true when `c` is an ASCII letter or a decimal digit. */
static bool
ascii_isalnum(u32 c)
{
	if (ascii_isalpha(c))
		return true;
	return ascii_isdigit(c);
}
/* Advances the current source location past `chr`: a newline moves to the
 * next line and sets the column to 1, any other character (tabs included)
 * advances the column by one. */
static void
update_line_count(LexState *ls, u8 chr)
{
	if (chr == '\n') {
		ls->cur_loc.column = 1;
		++ls->cur_loc.line;
		return;
	}
	++ls->cur_loc.column;
}
/* Returns the byte under the scan pointer without consuming it. */
static u8
peek(LexState *ls)
{
	return ls->fwd[0];
}
/* Moves the scan pointer back by `n` bytes and adjusts the source location:
 * the line is decremented when the byte now under the pointer is a newline,
 * and the column is always decremented by one (which is not quite right when
 * that byte is a newline). */
static void
backup(LexState *ls, int n)
{
	u8 *const pos = ls->fwd - n;

	ls->fwd = pos;
	if (pos[0] == '\n')
		ls->cur_loc.line -= 1;
	ls->cur_loc.column -= 1;
}
/* Reads up to `n` bytes from the input file into `buf`, stores the number of
 * bytes read in `*ar` and writes the sentinel right after them.
 *
 * Returns false, without touching `buf`, when the file is already at EOF.
 * A read error is fatal. */
static bool
read_buf(LexState *ls, u8 *buf, isize n, isize *ar)
{
	FILE *const fp = ls->input_fp;

	if (feof(fp))
		return false;

	const isize nread = fread(buf, sizeof(*buf), n, fp);
	if (ferror(fp)) {
		fatal(ls->cm, nil, "could not read input file\n");
	}
	buf[nread] = LEX_BUFFER_SENTINEL;
	*ar = nread;
	return true;
}
/* Called when the scan pointer sits on a sentinel byte. If that position is
* the end of the valid data of one buffer half, the other half is refilled
* from the input file and the scan pointer is moved to its start.
* Returns false when the refill found the file already at EOF, true
* otherwise. When the pointer is at neither end (a sentinel-valued byte in
* the middle of the data) nothing is read and true is returned. */
static bool
reload_buffers(LexState *ls)
{
//if ((ls->fwd != ls->buf + ls->buflen) || (ls->fwd != ls->buf + ls->buflen2))
// lex_fatal(ls, "invalid nil byte in middle of source file");
/* one past the last valid byte of each half */
const u8 *end_of_buf1 = ls->buf + ls->buflen;
const u8 *end_of_buf2 = ls->buf + LEX_HALF_BUFFER_SIZE + ls->buflen2;
if (ls->fwd == end_of_buf1) { /* end of first buffer */
u8 *buf2 = ls->buf + LEX_HALF_BUFFER_SIZE;
if (!read_buf(ls, buf2, LEX_HALF_BUFFER_SIZE, &ls->buflen2))
return false; /* reached EOF, no more data */
ls->fwd = buf2;
} else if (ls->fwd == end_of_buf2) { /* end of second buffer */
u8 *buf1 = ls->buf;
if (!read_buf(ls, buf1, LEX_HALF_BUFFER_SIZE, &ls->buflen))
return false; /* reached EOF, no more data */
ls->fwd = buf1;
}
/* the lexeme start now coincides with the scan pointer */
ls->lbegin = ls->fwd;
return true;
}
/* Consumes and returns the next character of the input, refilling the
 * buffers when the end of the current one is reached. Returns an empty
 * value at the end of the input.
 *
 * Two defects of the earlier version are handled here:
 * - after a refill the character is read again, so the source location is
 *   updated with the real character instead of the stale sentinel;
 * - a refill that delivered no data (the input ended exactly at a buffer
 *   boundary) is reported as end of input instead of returning the sentinel
 *   as a character and stepping past it. */
static MaybeChr
read_chr(LexState *ls)
{
	u8 chr = peek(ls);
	if (chr == LEX_BUFFER_SENTINEL) { /* maybe end of buffer */
		if (!reload_buffers(ls))
			return None(MaybeChr);
		/* the scan pointer is at the start of a half that holds no data */
		const bool empty_reload =
			(ls->fwd == ls->buf && ls->buflen == 0) ||
			(ls->fwd == ls->buf + LEX_HALF_BUFFER_SIZE && ls->buflen2 == 0);
		if (empty_reload)
			return None(MaybeChr);
		chr = peek(ls);
	}
	update_line_count(ls, chr);
	++ls->fwd;
	return Some(MaybeChr, chr);
}
/* Consumes whitespace and returns the first character that is not
 * whitespace (already consumed), or an empty value when the input ends
 * first. The lexeme start pointer is advanced once per skipped character:
 *
 * [ abc = 2*9 - 1 ]
 *  ^-fwd, lbegin
 * [ abc = 2*9 - 1 ]
 *   lbegin-^^-fwd
 */
static MaybeChr
skip_whitespace(LexState *ls)
{
	MaybeChr cur = read_chr(ls);

	while (cur.ok && ascii_isspace(cur.val)) {
		++ls->lbegin;
		cur = read_chr(ls);
	}
	if (!cur.ok)
		return None(MaybeChr);
	return cur;
}
static LexToken
make_error(void)
{
return (LexToken){ .id = T_ERROR };
}
/* Returns the copy of `ident` stored in the identifier table, adding it
 * first when it is not there yet. */
static u8 *
intern_identifier(LexState *ls, u8 *ident)
{
	IdentsBucket *found = shgetp_null(ls->idents, ident);

	if (found != nil)
		return (u8 *)found->key;

	shput(ls->idents, ident, 0);
	return (u8 *)shgets(ls->idents, ident).key;
}
/* Scans an identifier whose first character is at the lexeme start (and has
 * already been consumed) and returns it as a T_IDENT token holding the
 * interned name.
 *
 * An identifier that does not fit the scratch buffer is reported as an error
 * and yields an error token. */
static LexToken
identifier(LexState *ls)
{
	/* this gets copied to the hash table arena, no problem */
	u8 ident_buf[MAX_IDENT_SIZE];
	usize len = 0;
	MaybeChr cur = { *ls->lbegin, true };

	for (; cur.ok && ascii_isident(cur.val); cur = read_chr(ls)) {
		if (len + 1 == MAX_IDENT_SIZE) {
			lex_error(ls, "identifier is too long (max: %u)\n", MAX_IDENT_SIZE);
			return make_error();
		}
		ident_buf[len++] = cur.val;
	}
	ident_buf[len] = '\0';

	/* the character that ended the identifier was consumed; give it back */
	if (cur.ok)
		backup(ls, 1);

	LexToken tok = {
		.id = T_IDENT,
		.ident = {intern_identifier(ls, ident_buf), len},
		.len = len,
	};
	return tok;
}
/* Scans a string literal. The opening quote has already been consumed by the
 * caller; scanning stops at the closing quote.
 *
 * Returns a T_STRING token owning a heap allocated, NUL-terminated copy of
 * the contents. An empty literal is returned with a nil data pointer and a
 * length of 0. On error (unterminated literal, literal too long, allocation
 * failure) the error is reported and an error token is returned.
 *
 * Fixes over the earlier version:
 * - the buffer always keeps room for the terminator (a literal whose length
 *   equalled the buffer size wrote one byte past the allocation);
 * - malloc()/realloc() results are checked;
 * - the buffer is freed on every error path;
 * - end of input or a newline directly after the opening quote is detected. */
static LexToken
string_literal(LexState *ls)
{
	isize str_buf_len = STRING_LITERAL_BASE_SIZE;
	isize i = 0;
	u8 *str_buf = malloc(str_buf_len);

	if (str_buf == nil) {
		lex_error(ls, "out of memory while reading string literal");
		return make_error();
	}

	for (;;) {
		const MaybeChr chr = read_chr(ls);

		if (!chr.ok || chr.val == '\n') {
			lex_error(ls, "unterminated string literal");
			goto err;
		}
		if (chr.val == '"')
			break;
		if (i + 1 == STRING_LITERAL_MAX_SIZE) {
			lex_error(ls, "string literal length exceeds maximum of %d bytes", STRING_LITERAL_MAX_SIZE);
			goto err;
		}
		/* room for this character plus the terminator */
		if (i + 2 > str_buf_len) {
			u8 *grown = realloc(str_buf, str_buf_len * 2);

			if (grown == nil) {
				lex_error(ls, "out of memory while reading string literal");
				goto err;
			}
			str_buf = grown;
			str_buf_len *= 2;
		}
		str_buf[i++] = chr.val;
	}

	if (i > 0) {
		str_buf[i] = '\0';
	} else { /* empty literal */
		free(str_buf); /* we wasted our time */
		str_buf = nil;
	}
	return (LexToken) {
		.id = T_STRING,
		.str = {str_buf, i},
		.len = i,
	};
err:
	free(str_buf);
	return make_error();
}
/* Scans an integer literal that may carry a base prefix:
 *
 *   0b<digits>  binary
 *   0o<digits>  octal
 *   0x<digits>  hexadecimal (digits may be lower or upper case)
 *   <digits>    decimal
 *
 * The first digit is at the lexeme start and has already been consumed.
 * Returns a T_NUMBER token, or an error token after reporting the problem
 * (digit invalid for the base, missing digits after a prefix, value that does
 * not fit in 64 bits, leading zero on a decimal literal).
 *
 * Fixes over the earlier version:
 * - the character following a lone `0` is given back to the input (it was
 *   consumed and lost, e.g. the `)` in `f(0)`);
 * - a base prefix that is not followed by any digit is an error instead of
 *   silently producing 0;
 * - upper case hexadecimal digits are accepted after `0x`;
 * - the digit lookup no longer considers the table's NUL terminator. */
static LexToken
number_literal(LexState *ls)
{
	LexToken t = { .id = T_NUMBER };
	u64 number = 0;
	u8 base = 10;
	bool prefixed = false;
	isize ndigits = 0;
	MaybeChr chr = { *ls->lbegin, true };

	if (chr.val == '0') {
		chr = read_chr(ls); /* skip 0 prefix */
		if (!chr.ok) { /* EOF edge case */
			return t; /* 0 */
		}
		switch (chr.val) {
		case 'b':
			base = 2;
			break;
		case 'o':
			base = 8;
			break;
		case 'x':
			base = 16;
			break;
		default:
			if (ascii_isdigit(chr.val)) {
				lex_error(ls, "use '0o' for an octal literal");
				return make_error();
			}
			/* start of another token: give it back */
			backup(ls, 1);
			return t; /* 0 */
		}
		prefixed = true;
		chr = read_chr(ls);
		if (!chr.ok) {
			lex_error(ls, "expected a digit after the base prefix");
			return make_error();
		}
	}

	const u64 mmax = U64_MAX / base;
	static const u8 digits[] = "0123456789abcdef";

	while (chr.ok) {
		u8 c = chr.val;

		if (base == 16 && c >= 'A' && c <= 'F')
			c = (u8)(c - 'A' + 'a');
		const u8 *digitp = memchr(digits, c, sizeof(digits) - 1);
		if (digitp == nil)
			break;
		const u8 digit = (u8)(digitp - digits);
		if (digit >= base) {
			lex_error(ls, "invalid literal");
			return make_error();
		}
		if (number > mmax)
			goto overflow;
		number *= base;
		/* overflow for adding the digit */
		if (U64_MAX - digit < number)
			goto overflow;
		number += digit;
		++ndigits;
		chr = read_chr(ls);
	}
	/* ate 1 extra character, give it back */
	if (chr.ok)
		backup(ls, 1);
	if (prefixed && ndigits == 0) {
		lex_error(ls, "expected a digit after the base prefix");
		return make_error();
	}
	t.inumber = number;
	return t;
overflow:
	lex_error(ls, "integer literal is too big (2^64 max)");
	return make_error();
}
/* Turns an identifier token into the matching keyword token, or returns the
 * identifier token unchanged when it is not a keyword.
 *
 * The lookup switches on the first character and compares the rest of the
 * identifier against the rest of each candidate keyword.
 *
 * The returned keyword token carries the full length of the keyword; the
 * earlier version stored the length of the compared remainder, which is one
 * byte short. */
static LexToken
keyword(LexToken *t)
{
#define kwcmp(ident, kw, tid) \
	{if (Str_equal(ident, kw)) return (LexToken){ .id = tid, .len = t->len };}
	Str ident = t->ident;
	/* drop the first character: the switch below consumes it */
	--ident.len;
	switch (*ident.s++) {
	case 'a':
		kwcmp(ident, Sl("nd"), T_LOGAND);
		break;
	case 'b':
		kwcmp(ident, Sl("reak"), T_BREAK);
		break;
	case 'c':
		kwcmp(ident, Sl("onst"), T_CONST);
		break;
	case 'd':
		kwcmp(ident, Sl("iscard"), T_DISCARD);
		break;
	case 'e':
		kwcmp(ident, Sl("nd"), T_END);
		kwcmp(ident, Sl("lse"), T_ELSE);
		kwcmp(ident, Sl("lif"), T_ELIF);
		break;
	case 'i':
		kwcmp(ident, Sl("f"), T_IF);
		break;
	case 'l':
		kwcmp(ident, Sl("et"), T_LET);
		break;
	case 'n':
		kwcmp(ident, Sl("ot"), T_LOGNOT);
		kwcmp(ident, Sl("ext"), T_NEXT);
		break;
	case 'o':
		kwcmp(ident, Sl("r"), T_LOGOR);
		break;
	case 'p':
		kwcmp(ident, Sl("roc"), T_PROC);
		break;
	case 'r':
		kwcmp(ident, Sl("eturn"), T_RETURN);
		break;
	case 's':
		kwcmp(ident, Sl("truct"), T_STRUCT);
		break;
	case 'v':
		kwcmp(ident, Sl("ar"), T_VAR);
		break;
	case 'w':
		kwcmp(ident, Sl("hile"), T_WHILE);
		break;
	case 'u':
		kwcmp(ident, Sl("se"), T_USE);
		break;
	}
	return *t;
#undef kwcmp
}
LexToken
lex_scan(LexState *ls)
{
if (arrlen(ls->backlist) > 0) {
return arrpop(ls->backlist);
}
/* lexeme start pointer */
ls->lbegin = ls->fwd;
LexToken token = {0};
MaybeChr c = skip_whitespace(ls);
if (!c.ok) {
token.id = T_EOF;
ls->eof = true;
return token;
}
#define TOKEN(chr, t) case chr: token.id = t; break;
//trace("token now: '%c'\n", c.val);
//trace("lp: <%s>\n", ls->lbegin);
//trace("fwd: <%s>\n", ls->fwd);
switch (c.val) {
case '!':
if (peek(ls) == '=') {
token.id = T_NOTEQUAL;
++ls->fwd;
} else {
token.id = T_EXCLAMATION;
}
break;
TOKEN('+', T_PLUS)
TOKEN('-', T_MINUS)
TOKEN('*', T_STAR)
TOKEN('/', T_BAR)
TOKEN('(', T_LPAREN)
TOKEN(')', T_RPAREN)
TOKEN(',', T_COMMA)
TOKEN('<', T_LESSTHAN)
TOKEN('>', T_GREATTHAN)
TOKEN('#', T_HASH)
TOKEN(':', T_COLON)
TOKEN(';', T_SEMICOLON)
TOKEN('[', T_LBRACKET)
TOKEN(']', T_RBRACKET)
TOKEN('{', T_LBRACE)
TOKEN('}', T_RBRACE)
case '=':
if (peek(ls) == '=') {
token.id = T_LOGICEQUAL;
++ls->fwd;
} else {
token.id = T_EQUAL;
}
break;
case '"':
return string_literal(ls);
case '0' ... '9':
return number_literal(ls);
default: {
const u8 uc = c.val;
if (ascii_isident_start(uc)) {
LexToken ident_or_keyword = identifier(ls);
if (ident_or_keyword.id != T_IDENT)
return make_error();
return keyword(&ident_or_keyword);
}
if (uc > 0x7f) /* DEL, the last ASCII character */
lex_error(ls, "unicode tokens aren't allowed yet");
else
lex_error(ls, "unknown token '%c' (\\x%02x)", uc, uc);
return make_error();
}
}
return token;
#undef TOKEN
}
/* Pushes `token` onto the backlist so that the next lex_scan() call returns
 * it again. The backlist is a stack, which gives unlimited look-ahead at the
 * cost of memory.
 *
 * The current column is moved back by the token's length; when that would
 * leave the line (column below 1) the line number is decremented instead,
 * unless already on the first line. */
void
lex_backup(LexState *ls, LexToken token)
{
	arrput(ls->backlist, token);

	const i64 rewound = ls->cur_loc.column - token.len;
	if (rewound >= 1) {
		ls->cur_loc.column = rewound;
		return;
	}
	if (ls->cur_loc.line > 1)
		--ls->cur_loc.line;
}
/* Returns true when `token` is of kind `exp_tok`. Otherwise an error naming
 * the expected and the actual token is reported and false is returned.
 * No token is consumed. */
bool
lex_match(LexState *ls, LexToken *token, enum LexTokenId exp_tok)
{
	if (token->id == exp_tok)
		return true;

	lex_error(ls, "expected '%s' but got '%s' instead\n",
		TokenIdStr[exp_tok], TokenIdStr[token->id]);
	return false;
}
/* Creates a lexer reading from `input_fp`.
 *
 * `file_name` is recorded as the source of reported locations and `tabsize`
 * is stored in the state. The stream's stdio buffering is switched off
 * because the lexer buffers the input itself, and the first buffer half is
 * filled right away.
 *
 * Allocation failures are reported through fatal(); nil is returned should
 * fatal() ever return. Release the lexer with lex_destroy(). */
LexState *
lex_new(Compiler *cm, FILE *input_fp, Str file_name, usize tabsize)
{
	LexState *ls = calloc(1, sizeof(*ls));
	if (ls == nil) {
		fatal(cm, nil, "out of memory");
		return nil;
	}
	/* one extra byte for the sentinel after a completely filled buffer */
	ls->buf = calloc(LEX_BUFFER_SIZE + 1, sizeof(*ls->buf));
	if (ls->buf == nil) {
		free(ls);
		fatal(cm, nil, "out of memory");
		return nil;
	}
	ls->lbegin = ls->fwd = ls->buf;
	ls->tabsize = tabsize;
	ls->input_fp = input_fp;
	ls->cur_loc.line = 1;
	ls->cur_loc.source = file_name;
	ls->cm = cm;
	/* We use a hash table with string keys as a set containing all identifiers
	 * in a compilation unit, to avoid duplicate allocations.
	 */
	sh_new_arena(ls->idents);
	/* We provide our own buffering scheme */
	setbuf(input_fp, nil);
	/* Initial fill of first buffer.
	 * Any file error gets caught in the function, only thing that can happen
	 * here is that the file is actually empty, so instant EOF.
	 */
	read_buf(ls, ls->buf, LEX_HALF_BUFFER_SIZE, &ls->buflen);
	return ls;
}
/* Releases everything owned by the lexer: the input buffer, the token
 * backlist, the identifier table (and with it every interned identifier) and
 * the state itself. */
void
lex_destroy(LexState *ls)
{
	free(ls->buf);
	arrfree(ls->backlist);
	shfree(ls->idents);
	free(ls);
}

93
compiler/lex.h Normal file
View file

@ -0,0 +1,93 @@
#ifndef _lex_h_
#define _lex_h_
#include <stdio.h>
#include "pre.h"
#include "location.h"
#include "state.h"
#include "libs/stb_ds.h"
/* Kinds of tokens produced by the lexer. The values index `TokenIdStr`, so
* every kind added here needs an entry in that table. */
enum LexTokenId {
T_INVALID = 0, /* zero value of a token that was never assigned a kind */
/* Unary and binary operators */
T_PLUS, T_MINUS, T_STAR, T_BAR,
T_LESSTHAN, T_GREATTHAN, T_LOGNOT, T_LOGAND, T_LOGOR, T_LOGICEQUAL, T_NOTEQUAL,
T_HASH,
/* Others */
T_EQUAL, T_EXCLAMATION, T_LPAREN, T_RPAREN, T_COMMA,
T_COLON, T_SEMICOLON, T_LBRACKET, T_RBRACKET, T_LBRACE, T_RBRACE,
/* Atoms */
T_IDENT, T_STRING, T_NUMBER, T_DECNUMBER,
/* Keywords */
T_CONST,
T_ELSE,
T_END,
T_ELIF,
T_IF,
T_LET,
T_PROC,
T_RETURN,
T_VAR,
T_DISCARD,
T_WHILE,
T_STRUCT,
T_USE,
T_BREAK,
T_NEXT,
/* Control */
T_EOF, T_ERROR,
T_TOKEN_COUNT, /* does not represent an actual token */
};
/* Table mapping a `LexTokenId` to a string name of the token */
extern const char *TokenIdStr[];
/* A token: its kind, a payload whose valid member depends on the kind, and
* its length. */
typedef struct {
enum LexTokenId id;
union {
/* Text payload: the interned name for identifiers, the literal's
* contents for string literals */
Str ident, str, keyword;
/* XXX: Defer number parsing until it is actually needed?
* So we can move number parsing out of the lexer. */
/* Integer literal, it's the parser problem to tell
* whether the literal is negative or not.
*/
u64 inumber;
/* Floating point literal */
double floatn;
};
isize len; /* Size in bytes of this token */
} LexToken;
/* Entry type of the identifier table (string keys; the value is unused). */
typedef HashMapStr(i8) IdentsBucket;
/* State of a lexer; created by lex_new() and released by lex_destroy(). */
typedef struct {
FILE *input_fp; /* source being read */
/* Lexing buffer. This is actually split into two buffers, providing
* a double-buffering scheme */
u8 *buf;
/* Actual length of each buffer (fread may read less than half of LEX_BUFFER_SIZE) */
isize buflen, buflen2;
u8 *lbegin; /* marks the begin of the current lexeme */
u8 *fwd; /* this pointer is the scanner */
Vec(LexToken) backlist; /* stack of backed up tokens */
int tabsize; /* stored by lex_new() */
bool eof; /* set once lex_scan() has returned T_EOF */
Location cur_loc; /* current position in the source, used for diagnostics */
Compiler *cm; /* owning compiler state, passed to the error reporters */
IdentsBucket *idents; /* set of all identifiers seen, owning their storage */
} LexState;
/* Returns the next token (backed up tokens first); T_EOF at end of input. */
LexToken
lex_scan(LexState *ls);
/* Pushes `token` back so that the next lex_scan() returns it. */
void
lex_backup(LexState *ls, LexToken token);
/* Returns whether `t` is of kind `exp_tok`, reporting an error if not. */
bool
lex_match(LexState *ls, LexToken *t, enum LexTokenId exp_tok);
/* Creates a lexer reading from `input_fp`. */
LexState *
lex_new(Compiler *cm, FILE *input_fp, Str file_name, usize tabsize);
/* Frees a lexer, including the identifiers it interned. */
void
lex_destroy(LexState *l);
#endif

403
compiler/libs/optparse.h Normal file
View file

@ -0,0 +1,403 @@
/* Optparse --- portable, reentrant, embeddable, getopt-like option parser
*
* This is free and unencumbered software released into the public domain.
*
* To get the implementation, define OPTPARSE_IMPLEMENTATION.
* Optionally define OPTPARSE_API to control the API's visibility
* and/or linkage (static, __attribute__, __declspec).
*
* The POSIX getopt() option parser has three fatal flaws. These flaws
* are solved by Optparse.
*
* 1) Parser state is stored entirely in global variables, some of
* which are static and inaccessible. This means only one thread can
* use getopt(). It also means it's not possible to recursively parse
* nested sub-arguments while in the middle of argument parsing.
* Optparse fixes this by storing all state on a local struct.
*
* 2) The POSIX standard provides no way to properly reset the parser.
* This means for portable code that getopt() is only good for one
* run, over one argv with one option string. It also means subcommand
* options cannot be processed with getopt(). Most implementations
* provide a method to reset the parser, but it's not portable.
* Optparse provides an optparse_arg() function for stepping over
* subcommands and continuing parsing of options with another option
* string. The Optparse struct itself can be passed around to
* subcommand handlers for additional subcommand option parsing. A
* full reset can be achieved with an additional optparse_init().
*
* 3) Error messages are printed to stderr. This can be disabled with
* opterr, but the messages themselves are still inaccessible.
* Optparse solves this by writing an error message in its errmsg
* field. The downside to Optparse is that this error message will
* always be in English rather than the current locale.
*
* Optparse should be familiar to anyone accustomed to getopt(), and
* it could be a nearly drop-in replacement. The option string is the
* same and the fields have the same names as the getopt() global
* variables (optarg, optind, optopt).
*
* Optparse also supports GNU-style long options with optparse_long().
* The interface is slightly different and simpler than getopt_long().
*
* By default, argv is permuted as it is parsed, moving non-option
* arguments to the end. This can be disabled by setting the `permute`
* field to 0 after initialization.
*/
#ifndef OPTPARSE_H
#define OPTPARSE_H
#ifndef OPTPARSE_API
# define OPTPARSE_API
#endif
/* Parser state. Initialize with optparse_init() before use. */
struct optparse {
char **argv; /* argument vector being parsed (terminated by a null pointer) */
int permute; /* non-zero: move non-option arguments to the end (set to 1 by optparse_init) */
int optind; /* index into argv of the next element to examine */
int optopt; /* option character most recently examined */
char *optarg; /* argument of the most recent option, or 0 when it has none */
char errmsg[64]; /* message written by the parser when it returns '?' */
int subopt; /* offset of the current option inside a cluster of short options */
};
/* Whether an option takes an argument. The numeric values (0, 1, 2) double
 * as the number of ':' characters emitted for the option when a getopt-style
 * option string is built from a long-option table. */
enum optparse_argtype {
OPTPARSE_NONE,
OPTPARSE_REQUIRED,
OPTPARSE_OPTIONAL
};
/* One entry of a long-option table. The table is terminated by an entry
 * whose `longname` and `shortname` are both zero. */
struct optparse_long {
const char *longname; /* name matched after "--"; may be 0 for short-only options */
int shortname; /* value returned for this option; also its short form when below 127 */
enum optparse_argtype argtype;
};
/**
* Initializes the parser state.
*/
OPTPARSE_API
void optparse_init(struct optparse *options, char **argv);
/**
* Read the next option in the argv array.
* @param optstring a getopt()-formatted option string.
* @return the next option character, -1 for done, or '?' for error
*
* Just like getopt(), a character followed by no colons means no
* argument. One colon means the option has a required argument. Two
* colons means the option takes an optional argument.
*/
OPTPARSE_API
int optparse(struct optparse *options, const char *optstring);
/**
* Handles GNU-style long options in addition to getopt() options.
* This works a lot like GNU's getopt_long(). The last option in
* longopts must be all zeros, marking the end of the array. The
* longindex argument may be NULL.
*/
OPTPARSE_API
int optparse_long(struct optparse *options,
const struct optparse_long *longopts,
int *longindex);
/**
* Used for stepping over non-option arguments.
* @return the next non-option argument, or NULL for no more arguments
*
* Argument parsing can continue with optparse() after using this
* function. That would be used to parse the options for the
* subcommand returned by optparse_arg(). This function allows you to
* ignore the value of optind.
*/
OPTPARSE_API
char *optparse_arg(struct optparse *options);
/* Implementation */
#ifdef OPTPARSE_IMPLEMENTATION
#define OPTPARSE_MSG_INVALID "invalid option"
#define OPTPARSE_MSG_MISSING "option requires an argument"
#define OPTPARSE_MSG_TOOMANY "option takes no arguments"
/* Formats "<msg> -- '<data>'" into options->errmsg and returns '?'.
 * Only `data` is truncated to fit; `msg` is expected to be one of the short
 * OPTPARSE_MSG_* literals. */
static int
optparse_error(struct optparse *options, const char *msg, const char *data)
{
    static const char separator[] = " -- '";
    char *out = options->errmsg;
    const char *src;
    unsigned pos = 0;

    for (src = msg; *src; src++)
        out[pos++] = *src;
    for (src = separator; *src; src++)
        out[pos++] = *src;
    /* leave room for the closing quote and the terminator */
    for (src = data; pos < sizeof(options->errmsg) - 2 && *src; src++)
        out[pos++] = *src;
    out[pos++] = '\'';
    out[pos++] = '\0';
    return '?';
}
OPTPARSE_API
void
optparse_init(struct optparse *options, char **argv)
{
    /* Start after argv[0] (the program name) when it is present. */
    int first = argv[0] ? 1 : 0;

    options->errmsg[0] = '\0';
    options->optarg = 0;
    options->subopt = 0;
    options->optind = first;
    options->permute = 1;
    options->argv = argv;
}
/* Returns 1 when `arg` is exactly "--", 0 otherwise (including for null). */
static int
optparse_is_dashdash(const char *arg)
{
    if (arg == 0)
        return 0;
    if (arg[0] != '-' || arg[1] != '-')
        return 0;
    return arg[2] == '\0';
}
/* Returns 1 when `arg` looks like "-x..." (a dash followed by something
 * other than another dash), 0 otherwise. */
static int
optparse_is_shortopt(const char *arg)
{
    if (arg == 0 || arg[0] != '-')
        return 0;
    return !(arg[1] == '-' || arg[1] == '\0');
}
/* Returns 1 when `arg` starts with "--" and has at least one more
 * character, 0 otherwise. */
static int
optparse_is_longopt(const char *arg)
{
    if (arg == 0)
        return 0;
    if (arg[0] != '-' || arg[1] != '-')
        return 0;
    return arg[2] != '\0';
}
/* Moves argv[index] to position optind - 1, shifting the elements in
 * between one slot towards the front. */
static void
optparse_permute(struct optparse *options, int index)
{
    char **argv = options->argv;
    char *moved = argv[index];
    int last = options->optind - 1;
    int pos = index;

    while (pos < last) {
        argv[pos] = argv[pos + 1];
        pos++;
    }
    argv[last] = moved;
}
/* Looks up option character `c` in a getopt-style `optstring`.
 * Returns -1 when `c` is ':' or not listed; otherwise the number of colons
 * following it (0, 1 or 2), which matches the optparse_argtype values. */
static int
optparse_argtype(const char *optstring, char c)
{
    const char *hit = optstring;

    if (c == ':')
        return -1;
    while (*hit != '\0' && *hit != c)
        hit++;
    if (*hit == '\0')
        return -1;
    if (hit[1] != ':')
        return OPTPARSE_NONE;
    return OPTPARSE_NONE + (hit[2] == ':' ? 2 : 1);
}
/* Parses the next short option from options->argv according to `optstring`.
 * Returns the option character, -1 when option parsing is finished, or '?'
 * on error (options->errmsg then holds the message). */
OPTPARSE_API
int
optparse(struct optparse *options, const char *optstring)
{
int type;
char *next;
char *option = options->argv[options->optind];
/* Reset the per-call outputs. */
options->errmsg[0] = '\0';
options->optopt = 0;
options->optarg = 0;
if (option == 0) {
return -1;
} else if (optparse_is_dashdash(option)) {
options->optind++; /* consume "--" */
return -1;
} else if (!optparse_is_shortopt(option)) {
if (options->permute) {
/* Skip the non-option, parse what follows, then move the
 * non-option behind the arguments that parse consumed. */
int index = options->optind++;
int r = optparse(options, optstring);
optparse_permute(options, index);
options->optind--;
return r;
} else {
return -1;
}
}
/* Step past the '-' and past any options of this cluster already handled. */
option += options->subopt + 1;
options->optopt = option[0];
type = optparse_argtype(optstring, option[0]);
next = options->argv[options->optind + 1];
switch (type) {
case -1: {
/* Option character not present in optstring. */
char str[2] = {0, 0};
str[0] = option[0];
options->optind++;
return optparse_error(options, OPTPARSE_MSG_INVALID, str);
}
case OPTPARSE_NONE:
if (option[1]) {
/* More options follow in the same argv element. */
options->subopt++;
} else {
options->subopt = 0;
options->optind++;
}
return option[0];
case OPTPARSE_REQUIRED:
options->subopt = 0;
options->optind++;
if (option[1]) {
/* Argument attached to the option: "-ovalue". */
options->optarg = option + 1;
} else if (next != 0) {
/* Argument is the following argv element: "-o value". */
options->optarg = next;
options->optind++;
} else {
char str[2] = {0, 0};
str[0] = option[0];
options->optarg = 0;
return optparse_error(options, OPTPARSE_MSG_MISSING, str);
}
return option[0];
case OPTPARSE_OPTIONAL:
/* An optional argument is only taken when attached: "-ovalue". */
options->subopt = 0;
options->optind++;
if (option[1])
options->optarg = option + 1;
else
options->optarg = 0;
return option[0];
}
return 0;
}
OPTPARSE_API
char *
optparse_arg(struct optparse *options)
{
    char *current = options->argv[options->optind];

    /* Any partially parsed short-option cluster is abandoned. */
    options->subopt = 0;
    if (current == 0)
        return 0;
    options->optind++;
    return current;
}
/* Returns non-zero when entry `i` is the all-zero terminator of `longopts`. */
static int
optparse_longopts_end(const struct optparse_long *longopts, int i)
{
    const struct optparse_long *entry = &longopts[i];

    return entry->longname == 0 && entry->shortname == 0;
}
/* Builds a getopt-style option string in `optstring` from the short names
 * in `longopts`. Entries whose shortname is 0 or >= 127 are skipped. */
static void
optparse_from_long(const struct optparse_long *longopts, char *optstring)
{
    char *out = optstring;
    int idx = 0;

    while (!optparse_longopts_end(longopts, idx)) {
        const struct optparse_long *entry = &longopts[idx];
        if (entry->shortname && entry->shortname < 127) {
            /* argtype's numeric value is the number of colons to emit */
            int colons = (int)entry->argtype;
            *out++ = (char)entry->shortname;
            while (colons-- > 0)
                *out++ = ':';
        }
        idx++;
    }
    *out = '\0';
}
/* Returns 1 when `option` names `longname`, where `option` may carry a
 * trailing "=value" part that is ignored; 0 otherwise or when `longname`
 * is null. */
static int
optparse_longopts_match(const char *longname, const char *option)
{
    int i = 0;

    if (longname == 0)
        return 0;
    while (option[i] != '\0' && longname[i] != '\0' && option[i] != '=') {
        if (option[i] != longname[i])
            return 0;
        i++;
    }
    if (longname[i] != '\0')
        return 0;
    return option[i] == '\0' || option[i] == '=';
}
/* Returns a pointer just past the first '=' in `option`, or null when
 * there is no '='. */
static char *
optparse_longopts_arg(char *option)
{
    char *cursor = option;

    while (*cursor != '\0') {
        if (*cursor == '=')
            return cursor + 1;
        cursor++;
    }
    return 0;
}
/* Handles a short option on behalf of optparse_long(): derives an option
 * string from `longopts`, delegates to optparse(), and reports through
 * `longindex` (when non-null) which table entry has the parsed short name,
 * or -1 when none does. */
static int
optparse_long_fallback(struct optparse *options,
                       const struct optparse_long *longopts,
                       int *longindex)
{
    char optstring[96 * 3 + 1]; /* 96 ASCII printable characters */
    int result;
    int i;

    optparse_from_long(longopts, optstring);
    result = optparse(options, optstring);
    if (longindex == 0)
        return result;

    *longindex = -1;
    if (result == -1)
        return result;
    /* the last matching entry wins, as no break is taken */
    for (i = 0; !optparse_longopts_end(longopts, i); i++) {
        if (longopts[i].shortname == options->optopt)
            *longindex = i;
    }
    return result;
}
/* Parses the next option, accepting both "--long" options from `longopts`
 * and short options. Returns the matched entry's shortname, -1 when option
 * parsing is finished, or '?' on error (options->errmsg holds the message).
 * When `longindex` is non-null it receives the index of the matched entry. */
OPTPARSE_API
int
optparse_long(struct optparse *options,
const struct optparse_long *longopts,
int *longindex)
{
int i;
char *option = options->argv[options->optind];
if (option == 0) {
return -1;
} else if (optparse_is_dashdash(option)) {
options->optind++; /* consume "--" */
return -1;
} else if (optparse_is_shortopt(option)) {
return optparse_long_fallback(options, longopts, longindex);
} else if (!optparse_is_longopt(option)) {
if (options->permute) {
/* Skip the non-option, parse what follows, then move the
 * non-option behind the arguments that parse consumed. */
int index = options->optind++;
int r = optparse_long(options, longopts, longindex);
optparse_permute(options, index);
options->optind--;
return r;
} else {
return -1;
}
}
/* Parse as long option. */
options->errmsg[0] = '\0';
options->optopt = 0;
options->optarg = 0;
option += 2; /* skip "--" */
options->optind++;
for (i = 0; !optparse_longopts_end(longopts, i); i++) {
const char *name = longopts[i].longname;
if (optparse_longopts_match(name, option)) {
char *arg;
if (longindex)
*longindex = i;
options->optopt = longopts[i].shortname;
/* `arg` is the text after '=', if the option was written "--name=value". */
arg = optparse_longopts_arg(option);
if (longopts[i].argtype == OPTPARSE_NONE && arg != 0) {
return optparse_error(options, OPTPARSE_MSG_TOOMANY, name);
} if (arg != 0) {
options->optarg = arg;
} else if (longopts[i].argtype == OPTPARSE_REQUIRED) {
/* A required argument not given with '=' is the next argv element. */
options->optarg = options->argv[options->optind];
if (options->optarg == 0)
return optparse_error(options, OPTPARSE_MSG_MISSING, name);
else
options->optind++;
}
return options->optopt;
}
}
/* No table entry matched. */
return optparse_error(options, OPTPARSE_MSG_INVALID, option);
}
#endif /* OPTPARSE_IMPLEMENTATION */
#endif /* OPTPARSE_H */

View file

@ -0,0 +1,3 @@
/* This file holds the implementation of the optparse library functionality */
#define OPTPARSE_IMPLEMENTATION
#include "optparse.h"

1895
compiler/libs/stb_ds.h Normal file

File diff suppressed because it is too large Load diff

3
compiler/libs/stb_impl.c Normal file
View file

@ -0,0 +1,3 @@
/* This file holds the implementation of stb library functionality */
#define STB_DS_IMPLEMENTATION
#include "stb_ds.h"

11
compiler/location.h Normal file
View file

@ -0,0 +1,11 @@
#ifndef _location_h_
#define _location_h_
#include "pre.h"
/* A position in a source file, attached to AST nodes and diagnostics. */
typedef struct {
Str source; /* printed with "%s" by the diagnostics code, so it must be NUL-terminated */
i64 line, column;
} Location;
#endif

58
compiler/messages.c Normal file
View file

@ -0,0 +1,58 @@
#include <stdio.h>
#include <stdarg.h>
#include <stdlib.h>
#include "messages.h"
#include "location.h"
/* SGR (Select Graphic Rendition) sequence */
#define ANSI_C(c) "\x1b["c"m"
/* 8bit palette color */
#define ANSI_8C(c) "\x1b[38;5;"c"m"
#define ANSI_8CB(c) "\x1b[48;5;"c"m"
/* True color */
#define ANSI_RC(r,g,b) "\x1b[38;2;"r";"g";"b"m"
#define ANSI_RCB(r,g,b) "\x1b[48;2;"r";"g";"b"m"
#define ANSI_RESET "\x1b[0m"
#define ANSI_BOLD "\x1b[1m"
#define ANSI_IF(cond, seq) (cond ? (seq) : "")
/* Defines a printf-style diagnostic function `name` that forwards its
 * arguments to `print_diagnostic` with severity `diagtype`, then executes
 * the statement `after` (which may be empty). */
#define make_diag_func(name, diagtype, after) \
void name(Compiler *cm, const Location *loc, const char *s, ...) { \
va_list args; \
va_start(args, s); \
print_diagnostic(cm, loc, diagtype, s, args); \
va_end(args); \
after; \
}
/* Formats and prints one diagnostic to stderr.
 *
 * cm:   compiler state, may be nil. When present it supplies the color
 *       option and receives the error count (capped at opts.max_errors).
 * loc:  source position to prefix the message with, or nil for none.
 * dt:   severity; indexes the label and color tables below, so their
 *       order must match `DiagType`.
 * msg/args: printf-style format and its arguments. The formatted text is
 *       truncated to the size of the local buffer.
 */
void
print_diagnostic(Compiler *cm, const Location *loc, DiagType dt, const char *msg, va_list args)
{
    static const char *ds[] = {"fatal", "error", "warning", "note"};
    static const char *dsc[] = {
        ANSI_C("1;90"), ANSI_C("1;31"), ANSI_C("1;35"), ANSI_C("1;34")
    };
    char fmsg[4096] = {0};
    char dmsg[32] = {0};
    const bool color = cm != nil ? cm->opts.color : false;

    /* `cm` is optional (see `color` above), so it must be checked here too. */
    if (cm != nil && dt == diag_error && cm->error_count < cm->opts.max_errors)
        ++cm->error_count;

    vsnprintf(fmsg, sizeof(fmsg), msg, args);
    snprintf(dmsg, sizeof(dmsg), "%s%s:%s", ANSI_IF(color, dsc[dt]), ds[dt], ANSI_IF(color, ANSI_RESET));

    if (loc != nil) {
        /* i64 is not `long` on every target; print through long long. */
        fprintf(stderr, "(%s:%lli:%lli) %s %s\n",
                (const char *)loc->source.s,
                (long long)loc->line, (long long)loc->column,
                dmsg, fmsg);
    } else {
        fprintf(stderr, "%s %s\n", dmsg, fmsg);
    }
}
/* Public diagnostic entry points. Only `fatal` terminates the process
 * (with EXIT_FAILURE) after printing. */
make_diag_func(fatal, diag_fatal, exit(EXIT_FAILURE))
make_diag_func(error, diag_error, )
make_diag_func(warning, diag_warning, )
make_diag_func(note, diag_note, )

30
compiler/messages.h Normal file
View file

@ -0,0 +1,30 @@
#ifndef _messages_h_
#define _messages_h_
#include <stdarg.h>
#include "state.h"
#include "location.h"
/* fmtattr(archetype, fmt_index, first_arg_index): asks compilers that define
 * __GNUC__ to type-check printf-style arguments; expands to nothing elsewhere. */
#ifdef __GNUC__
# define fmtattr(archt, fmtsi, ftchk) __attribute((format(archt, fmtsi, ftchk)))
#else
# define fmtattr(a, b, c)
#endif
/* Severity of a diagnostic. The values index the label and color tables in
 * `print_diagnostic` (messages.c); keep both in the same order. */
typedef enum
{
diag_fatal = 0,
diag_error,
diag_warning,
diag_note,
} DiagType;
/* printf-style diagnostics printed to stderr. `loc` may be nil, in which
 * case no source position is printed. */

/* Prints a "fatal" diagnostic, then exits with EXIT_FAILURE. */
void
fatal(Compiler *cm, const Location *loc, const char *s, ...) fmtattr(printf, 3, 4);
/* Prints an "error" diagnostic and counts it in `cm->error_count`. */
void
error(Compiler *cm, const Location *loc, const char *s, ...) fmtattr(printf, 3, 4);
/* Prints a "warning" diagnostic. */
void
warning(Compiler *cm, const Location *loc, const char *s, ...) fmtattr(printf, 3, 4);
/* Prints a "note" diagnostic. */
void
note(Compiler *cm, const Location *loc, const char *s, ...) fmtattr(printf, 3, 4);
#endif

665
compiler/parse.c Normal file
View file

@ -0,0 +1,665 @@
/* Recursive descent parser + Pratt parser (for expressions)
* TODO:
* - DRY code that handle list of tokens, I have like three almost identical functions for that.
* - Use an arena for the AST nodes. Nuke all of them with a single call
* when we no longer need the AST.
*/
#include <stdlib.h>
#include "ast.h"
#include "pre.h"
#include "parse.h"
#include "lex.h"
#include "state.h"
#include "messages.h"
#include "libs/stb_ds.h"
#define MAX_STMTS_IN_BLOCK 2000
#define MAX_PROC_ARG_COUNT 127
#define EXPR_INIT_PREC 1
/* Consume a token and match it */
#define next_match(lexer, tokt) \
do { LexToken t = lex_scan(lexer); lex_match(lexer, &t, tokt); } while (0)
/* Scans a token (mutating `t`), and if its id matches `ttype`,
 * it executes the code block that follows the macro. Otherwise, the scanned
 * token gets put back (so a next call to `lex_scan` can pick it up).
 *
 * This expands to an `if ... else` whose else-branch is the block written
 * after the macro, so always follow it with a braced block. Every argument
 * is parenthesized so that expressions can be passed safely.
 */
#define matchopt(t, ttype, ps) \
    if (((t) = lex_scan((ps)->lexer)).id != (ttype)) { \
        lex_backup((ps)->lexer, (t)); \
    } else
/* Token classification helpers. The range checks rely on the declaration
 * order of `enum LexTokenId`. Arguments are parenthesized; note that the
 * range macros still evaluate `t` twice, so pass side-effect-free values. */
#define token_is_binop(t) ((t) >= T_PLUS && (t) <= T_NOTEQUAL)
#define token_is_atom(t) ((t) >= T_IDENT && (t) <= T_DECNUMBER)
#define token_is_unary(t) ((t) == T_MINUS || (t) == T_LOGNOT)
#define token_is_expr_start(t) (token_is_unary(t) || token_is_atom(t))
#define parse_error(ctx, ...) \
do { error((ctx)->cm, &((ctx)->lexer->cur_loc), __VA_ARGS__); (ctx)->ok = false; } while (0)
typedef Optional(AstIdentTypePair) OptAstIdentTypePair;
/* Precedence and associativity of one binary operator. */
typedef struct {
int pred; /* precedence level (presumably short for "precedence"); higher binds tighter */
bool left_assoc; /* false if right assoc... */
} OperatorPrec;
/* Operator table specifying the precedence and associativeness
* of each operator, used by the expression parser.
* The precedence goes from lower to higher.
*
* Indexed by `enum LexTokenId`. Tokens inside the binary-operator range that
* are not listed (e.g. T_LOGNOT) are zero-initialized, so their precedence 0
* is below EXPR_INIT_PREC and `expr` never treats them as binary operators.
*/
const OperatorPrec OperatorTable[] = {
[T_LOGOR] = {1, true},
[T_LOGAND] = {2, true},
[T_LESSTHAN] = {3, true},
[T_GREATTHAN] = {3, true},
[T_LOGICEQUAL] = {3, true},
[T_NOTEQUAL] = {3, true},
[T_PLUS] = {4, true},
[T_MINUS] = {4, true},
[T_STAR] = {5, true},
[T_BAR] = {5, true},
};
static Ast *
expr(ParserState *ps, int minprec);
static Ast *
expr_comma_list(ParserState *ps);
static Ast *
stmt(ParserState *ps, LexToken token);
static Ast *
stmt_list_until(ParserState *ps, bool putback, const enum LexTokenId *end_markers, isize len);
/* Allocates a zero-initialized AST node of kind `type` positioned at `loc`.
 * The node is heap allocated and owned by the caller. Aborts when memory is
 * exhausted, so the result is never nil. */
static Ast *
make_tree(enum AstType type, Location loc)
{
    Ast *tree = calloc(1, sizeof(*tree));
    if (tree == nil)
        abort(); /* previously this fell through to a nil dereference */
    tree->type = type;
    tree->loc = loc;
    return tree;
}
/* Builds an AST_BINEXPR node for `lhs <op> rhs`. The operator is stored as
 * the token's name taken from `TokenIdStr`. */
static Ast *
make_binop(enum LexTokenId op, Location loc, Ast *lhs, Ast *rhs)
{
    Ast *node = make_tree(AST_BINEXPR, loc);
    const char *op_name = TokenIdStr[op];

    node->bin.left = lhs;
    node->bin.right = rhs;
    node->bin.op = Str_from_c(op_name);
    return node;
}
/* Builds an AST_IDENT node naming `ident`, positioned at `loc`. */
static Ast *
make_ident_node(Str ident, Location loc)
{
    Ast *node = make_tree(AST_IDENT, loc);

    node->ident = ident;
    return node;
}
/* Parses `ident ':' ['var'] type`.
 * Returns the filled pair, or an empty optional (after reporting an error)
 * when the type is not an identifier. Without the `var` qualifier the pair
 * is of kind SymLet. */
static OptAstIdentTypePair
ident_type_pair(ParserState *ps)
{
    AstIdentTypePair pair = { .loc = ps->lexer->cur_loc };

    /* ident */
    LexToken tok = lex_scan(ps->lexer);
    lex_match(ps->lexer, &tok, T_IDENT);
    pair.ident = tok.ident;

    /* ':' */
    next_match(ps->lexer, T_COLON);

    /* optional qualifier */
    tok = lex_scan(ps->lexer);
    const bool is_var = tok.id == T_VAR;
    pair.kind = is_var ? SymVar : SymLet;
    if (!is_var)
        lex_backup(ps->lexer, tok);

    /* type */
    pair.dtype_loc = ps->lexer->cur_loc;
    tok = lex_scan(ps->lexer);
    if (tok.id == T_IDENT) {
        pair.dtype = tok.ident;
        return Some(OptAstIdentTypePair, pair);
    }

    parse_error(ps, "expected a type, got %s instead", TokenIdStr[tok.id]);
    return None(OptAstIdentTypePair);
}
/* Parses a comma separated list of `ident: type` pairs (a trailing comma is
 * allowed). The token that ends the list is put back for the caller.
 * Returns the argument vector (owned by the caller), or nil on a parse
 * error; in that case the partially built vector is released here. */
static Vec(AstIdentTypePair)
proc_arglist(ParserState *ps)
{
    Vec(AstIdentTypePair) args = nil;
    LexToken next;

    for (;;) {
        OptAstIdentTypePair oitp = ident_type_pair(ps);
        if (!oitp.ok) {
            arrfree(args); /* do not leak the arguments collected so far */
            return nil;
        }

        if (arrlen(args) + 1 > MAX_PROC_ARG_COUNT) {
            parse_error(ps, "more than %d (implementation limit) proc arguments", MAX_PROC_ARG_COUNT);
            arrfree(args);
            return nil;
        }
        arrput(args, oitp.val);

        next = lex_scan(ps->lexer);
        /* do we have a comma? if not, we reached the end of the list */
        if (next.id != T_COMMA)
            break;
        /* check if there is another argument after this comma; we do this
         * to allow a trailing comma */
        next = lex_scan(ps->lexer);
        if (next.id != T_IDENT)
            break;
        lex_backup(ps->lexer, next);
    }
    trace("token in arglist out: %s\n", TokenIdStr[next.id]);
    lex_backup(ps->lexer, next);

    if (arrlen(args) == 0) {
        arrfree(args);
        return nil;
    }
    return args;
}
/* Parses a procedure definition after the `proc` keyword has been consumed:
 *   name ['*'] '(' [arglist] ')' [':' rettype] stmts 'end'
 * A '*' after the name marks the procedure as public.
 * Returns an AST_PROCDEF node. Results of `lex_match` are not checked here. */
static Ast *
proc_decl(ParserState *ps)
{
LexToken proc_name = lex_scan(ps->lexer);
lex_match(ps->lexer, &proc_name, T_IDENT);
Ast *proc = make_tree(AST_PROCDEF, ps->lexer->cur_loc);
proc->proc.name = proc_name.ident;
trace("proc name: %s\n", proc->proc.name.s);
LexToken token = lex_scan(ps->lexer);
/* optional export marker */
if (token.id == T_STAR) {
proc->proc.ispublic = true;
token = lex_scan(ps->lexer);
}
lex_match(ps->lexer, &token, T_LPAREN);
token = lex_scan(ps->lexer);
/* anything other than ')' starts the argument list */
if (token.id != T_RPAREN) {
lex_backup(ps->lexer, token);
proc->proc.args = proc_arglist(ps);
token = lex_scan(ps->lexer);
}
lex_match(ps->lexer, &token, T_RPAREN);
/* return type */
token = lex_scan(ps->lexer);
if (token.id == T_COLON) {
token = lex_scan(ps->lexer);
lex_match(ps->lexer, &token, T_IDENT);
proc->proc.rettype = make_ident_node(token.ident, ps->lexer->cur_loc);
} else {
lex_backup(ps->lexer, token);
}
/* body: the terminating `end` is consumed (putback is false) */
proc->proc.body = stmt_list_until(ps, false, (enum LexTokenId[]){T_END}, 1);
return proc;
}
/* Parses the argument part of a call to `ident`: '(' [expr-list] ')'.
 * `ate_lp` tells whether the caller already consumed the '('.
 * Returns an AST_PROCCALL node; `call.args` stays nil for an empty list. */
static Ast *
function_call(ParserState *ps, Str ident, bool ate_lp)
{
    Ast *node = make_tree(AST_PROCCALL, ps->lexer->cur_loc);
    node->call = (AstProcCall){ .name = ident };

    if (!ate_lp)
        next_match(ps->lexer, T_LPAREN);

    /* Peek at the next token: only parse arguments if an expression starts. */
    const LexToken peek = lex_scan(ps->lexer);
    lex_backup(ps->lexer, peek);
    if (token_is_expr_start(peek.id))
        node->call.args = expr_comma_list(ps);

    next_match(ps->lexer, T_RPAREN);
    trace("function call to: %s\n", ident.s);
    return node;
}
/* Parses the right-hand side of `ident = expr` (the '=' is already
 * consumed) and returns an AST_VARASSIGN node positioned at `loc`. */
static Ast *
variable_assign(ParserState *ps, Str ident, Location loc)
{
    Ast *node = make_tree(AST_VARASSIGN, loc);

    node->varassgn.name = ident;
    node->varassgn.expr = expr(ps, EXPR_INIT_PREC);
    return node;
}
/* A statement starting with an identifier is an assignment when '=' follows
 * and a procedure call otherwise. */
static Ast *
funccall_or_assignment(ParserState *ps, Str ident)
{
    LexToken tok = lex_scan(ps->lexer);

    if (tok.id == T_EQUAL)
        return variable_assign(ps, ident, ps->lexer->cur_loc);

    lex_backup(ps->lexer, tok);
    return function_call(ps, ident, false);
}
/* Parses a variable declaration after its keyword has been consumed:
 *   name [':' type] ['=' expr]
 * `decl_kind` is the keyword token (T_LET, T_VAR or T_CONST).
 * Returns an AST_VARDECL node, or nil after reporting an error; the
 * partially built node is released on the error paths. */
static Ast *
variable_decl(ParserState *ps, enum LexTokenId decl_kind)
{
    static const enum SymbolKind Token2SemaVarKind[] = {
        [T_LET] = SymLet,
        [T_VAR] = SymVar,
        [T_CONST] = SymConst,
    };
    Assert(decl_kind == T_LET || decl_kind == T_VAR || decl_kind == T_CONST);

    LexToken token = lex_scan(ps->lexer);
    lex_match(ps->lexer, &token, T_IDENT);

    Ast *decl = make_tree(AST_VARDECL, ps->lexer->cur_loc);
    decl->var = (AstVarDecl) {
        .name = token.ident,
        .kind = Token2SemaVarKind[decl_kind],
    };

    /* type */
    matchopt(token, T_COLON, ps) {
        token = lex_scan(ps->lexer);
        if (token.id != T_IDENT) {
            parse_error(ps, "expected a type, got %s instead", TokenIdStr[token.id]);
            free(decl); /* no children attached yet */
            return nil;
        }
        decl->var.datatype = make_ident_node(token.ident, ps->lexer->cur_loc);
    }

    /* assignment expression */
    matchopt(token, T_EQUAL, ps) {
        trace("assignment of decl here\n");
        decl->var.expr = expr(ps, EXPR_INIT_PREC);
    }

    trace(
        "var decl %s %s: %s\n",
        TokenIdStr[decl_kind],
        decl->var.name.s,
        decl->var.datatype != nil ? (char *)decl->var.datatype->ident.s : "(no type)"
    );

    /* if there's no type there must be an expr */
    /* TODO: move to semantic analysis phase? */
    if (decl->var.datatype == nil && decl->var.expr == nil) {
        parse_error(
            ps,
            "'%s' declaration must have an assignment expression if no type is specified, "
            "but neither a type nor expression was supplied",
            TokenIdStr[decl_kind]
        );
        free(decl); /* both children are nil here, so this frees everything */
        return nil;
    }
    return decl;
}
/* Parses `return [expr]` (the keyword is already consumed). `ret` stays nil
 * when no expression follows. */
static Ast *
return_stmt(ParserState *ps)
{
    Ast *node = make_tree(AST_RETURN, ps->lexer->cur_loc);

    /* Peek: the token is always handed back, expr() re-reads it if needed. */
    const LexToken peek = lex_scan(ps->lexer);
    lex_backup(ps->lexer, peek);
    if (token_is_expr_start(peek.id))
        node->ret = expr(ps, EXPR_INIT_PREC);
    return node;
}
/* Builds the AST_BREAK node for a `break` statement; consumes no tokens. */
static Ast *
break_stmt(ParserState *ps)
{
    const Location here = ps->lexer->cur_loc;

    return make_tree(AST_BREAK, here);
}
/* Parses `discard expr` (the keyword is already consumed). */
static Ast *
discard_stmt(ParserState *ps)
{
    Ast *node = make_tree(AST_DISCARD, ps->lexer->cur_loc);

    node->discard.expr = expr(ps, EXPR_INIT_PREC);
    return node;
}
/* Parses an attribute after '#'. Currently only the empty form `[` `]` is
 * accepted; nothing between the brackets is parsed. */
static Ast *
parse_attribute(ParserState *ps)
{
    Ast *node = make_tree(AST_ATTRIBUTE, ps->lexer->cur_loc);

    next_match(ps->lexer, T_LBRACKET);
    next_match(ps->lexer, T_RBRACKET);
    return node;
}
/* A declaration "decorated" with an attribute.
 * Parses the attribute, then the `proc`, `const`, `let` or `var` declaration
 * it applies to. Returns the attribute node wrapping the declaration, or nil
 * (after reporting an error) when something else follows; the attribute node
 * is released in that case. */
static Ast *
decorated_decl(ParserState *ps)
{
    Ast *attr = parse_attribute(ps);
    LexToken next = lex_scan(ps->lexer);

    switch (next.id) {
    case T_PROC:
        attr->attribute.node = proc_decl(ps);
        break;
    case T_CONST:
    case T_LET:
    case T_VAR:
        attr->attribute.node = variable_decl(ps, next.id);
        break;
    default:
        parse_error(ps, "node of kind '%s' cannot be attributed", TokenIdStr[next.id]);
        free(attr); /* nothing was attached to it yet */
        return nil;
    }
    return attr;
}
/* Parses an `if` construct after the `if` keyword has been consumed:
 *   cond stmts { 'elif' cond stmts } [ 'else' stmts ] 'end'
 * Returns an AST_IF node, or nil after reporting an error.
 *
 * Each body is parsed with putback enabled, so the token that ended it is
 * re-scanned here and decides what comes next. A token that cannot continue
 * the construct (for instance EOF in the middle of an `elif` chain) is
 * reported as a parse error rather than asserted on, because it comes from
 * user input. */
static Ast *
if_stmt_expr(ParserState *ps)
{
    const enum LexTokenId if_block_ends[] = {T_ELSE, T_ELIF, T_END};
    Ast *tree = make_tree(AST_IF, ps->lexer->cur_loc);

    /* parse `if` */
    tree->ifse.cond = expr(ps, EXPR_INIT_PREC);
    tree->ifse.true_body = stmt_list_until(ps, true, if_block_ends, countof(if_block_ends));
    tree->ifse.false_body = nil;

    LexToken next = lex_scan(ps->lexer);
    /* parse `elif`s and else */
    for (;;) {
        switch (next.id) {
        case T_END: /* no (more) branches */
            return tree;
        case T_ELSE:
            /* once we see an `else` block, we assume the end of the `if` block,
             * enforcing that `else` must be the last. */
            trace("we got else\n");
            tree->ifse.false_body = stmt_list_until(ps, true, (enum LexTokenId[]){T_ELIF, T_END}, 2);
            next = lex_scan(ps->lexer);
            if (next.id == T_ELIF) {
                parse_error(ps, "'elif' branch after 'else' branch not allowed");
                lex_backup(ps->lexer, next);
                return nil;
            }
            return tree;
        case T_ELIF: {
            AstElif elif_tree = {0};
            trace("we got elif\n");
            elif_tree.cond = expr(ps, EXPR_INIT_PREC);
            elif_tree.body = stmt_list_until(ps, true, if_block_ends, countof(if_block_ends));
            next = lex_scan(ps->lexer);
            arrput(tree->ifse.elifs, elif_tree);
            /* the loop dispatches on `next`: T_END finishes, T_ELSE/T_ELIF
             * continue, anything else is diagnosed by the default case */
            break;
        }
        default: /* e.g. EOF before the closing `end` */
            lex_backup(ps->lexer, next);
            parse_error(ps, "huh?: %s", TokenIdStr[next.id]);
            return nil;
        }
    }
}
/* Parses `while cond stmts end` (the keyword is already consumed); the
 * closing `end` is consumed as well. */
static Ast *
while_stmt(ParserState *ps)
{
    const enum LexTokenId body_end[] = {T_END};
    Ast *node = make_tree(AST_LOOP, ps->lexer->cur_loc);

    node->loop.precond = expr(ps, EXPR_INIT_PREC);
    node->loop.body = stmt_list_until(ps, false, body_end, 1);
    return node;
}
/* Parses an atom: an integer literal, a string literal, an identifier or a
 * procedure call. Returns the node, or nil after reporting an error.
 * Note that T_DECNUMBER is not handled here and ends up in the error path.
 *
 * The node location is the lexer position on entry. Nodes are only
 * allocated once the token kind is known, so nothing has to be freed on the
 * call and error paths. */
static Ast *
atom(ParserState *ps)
{
    const Location loc = ps->lexer->cur_loc;
    LexToken t = lex_scan(ps->lexer);
    LexToken next;
    Ast *tree;

    switch (t.id) {
    case T_NUMBER:
        tree = make_tree(AST_NUMBER, loc);
        tree->number.n = t.inumber;
        /* u64 is not `unsigned long` on every target */
        trace("number in atom: %llu\n", (unsigned long long)t.inumber);
        return tree;
    case T_STRING:
        tree = make_tree(AST_STRLIT, loc);
        tree->strlit = t.str;
        return tree;
    case T_IDENT:
        next = lex_scan(ps->lexer);
        /* Is it a plain symbol or a function call? */
        if (next.id == T_LPAREN)
            return function_call(ps, t.ident, true);
        lex_backup(ps->lexer, next);
        tree = make_tree(AST_IDENT, loc);
        tree->ident = t.ident;
        return tree;
    default:
        parse_error(ps, "expected a number, identifier or expression, not '%s'", TokenIdStr[t.id]);
        return nil;
    }
}
/* Parses an optional single unary operator ('-' or logical not) followed by
 * an atom. Without an operator the atom is returned directly. */
static Ast *
unary(ParserState *ps)
{
    const LexToken tok = lex_scan(ps->lexer);

    if (!token_is_unary(tok.id)) {
        lex_backup(ps->lexer, tok);
        return atom(ps);
    }

    Ast *node = make_tree(AST_UNARY, ps->lexer->cur_loc);
    node->unary.op = Str_from_c(TokenIdStr[tok.id]);
    node->unary.atom = atom(ps);
    return node;
}
/* Parse a binary expression or an atom. This implements the Pratt parser algorithm.
 * See also:
 * - https://eli.thegreenplace.net/2012/08/02/parsing-expressions-by-precedence-climbing
 * - https://www.oilshell.org/blog/2016/11/01.html
 * XXX: Mutate to the shunting yard variation? Since it uses an explicit stack instead of the call
 * stack, guard against deeply nested expressions.
 *
 * `minprec` is the lowest operator precedence this call may consume; the
 * first token that is not a binary operator of at least that precedence is
 * put back and ends the expression.
 *
 * The operator's location is captured before the right operand is parsed.
 * Reading `cur_loc` in the same argument list as the recursive call would
 * leave the result to the compiler's (unspecified) evaluation order.
 */
static Ast *
expr(ParserState *ps, int minprec)
{
    Ast *tree = unary(ps);

    for (;;) {
        LexToken t = lex_scan(ps->lexer);
        /* token_is_binop guards the table lookup (and already excludes T_END) */
        if (!token_is_binop(t.id) || OperatorTable[t.id].pred < minprec) {
            lex_backup(ps->lexer, t);
            break;
        }
        const OperatorPrec op = OperatorTable[t.id];
        const int next_prec = op.left_assoc ? op.pred + 1 : op.pred;
        const Location op_loc = ps->lexer->cur_loc;
        Ast *rhs = expr(ps, next_prec);
        tree = make_binop(t.id, op_loc, tree, rhs);
    }
    return tree;
}
/* Placeholder for a generic separated-list parser (see the DRY item in the
 * TODO at the top of this file). Not implemented: ignores both arguments and
 * always returns nil. */
static Vec(Ast *)
sep_list(ParserState *ps, Ast *(*prod_fn)(Compiler *, void *))
{
(void)ps, (void)prod_fn;
Vec(Ast *) prod = nil;
return prod;
}
/* Parses one or more comma separated expressions (a trailing comma is
 * allowed) into an AST_EXPRS node. The token that ends the list is put back
 * for the caller. */
static Ast *
expr_comma_list(ParserState *ps)
{
Ast *tree = make_tree(AST_EXPRS, ps->lexer->cur_loc);
Vec(Ast *) exprs = nil;
LexToken next;
for (;;) {
/* NOTE: expr() may return nil after an error; the nil is stored as-is */
arrput(exprs, expr(ps, EXPR_INIT_PREC));
next = lex_scan(ps->lexer);
trace("commalist tok: %s\n", TokenIdStr[next.id]);
/* do we have a comma? if not, we reached the end of the list */
if (next.id != T_COMMA)
break;
next = lex_scan(ps->lexer);
/* check if we have an expression next to this comma, we do this
* to allow a trailing comma
*/
if (!token_is_expr_start(next.id))
break;
lex_backup(ps->lexer, next);
}
/* hand the terminating token back to the caller */
lex_backup(ps->lexer, next);
/* the loop stores at least one element, so this is only a safety net */
if (arrlen(exprs) == 0) {
free(tree);
arrfree(exprs);
return nil;
}
tree->exprs = exprs;
return tree;
}
/* Returns true when `c` equals one of the first `len` entries of `toks`. */
static bool
token_id_in_list(enum LexTokenId c, const enum LexTokenId *toks, isize len)
{
    isize idx = 0;

    while (idx < len) {
        if (toks[idx] == c)
            return true;
        idx++;
    }
    return false;
}
/* Parses statements until one of the `len` tokens in `end_markers` (or EOF,
 * which is reported as an error) is found.
 *
 * putback: when true the terminating token is handed back to the lexer so
 *          the caller can inspect it; otherwise it is consumed.
 * A single ';' after a statement is skipped.
 *
 * Returns an AST_STMTS node, or nil when the list is empty or the block
 * exceeds MAX_STMTS_IN_BLOCK statements. On that error the node and the
 * statement vector are released here (the statements already parsed are
 * not, as there is no AST destructor yet). */
static Ast *
stmt_list_until(ParserState *ps, bool putback, const enum LexTokenId *end_markers, isize len)
{
    LexToken token = lex_scan(ps->lexer);
    Vec(Ast *) stmts = nil;
    Ast *body = make_tree(AST_STMTS, ps->lexer->cur_loc);

    /* stmt* */
    while (!token_id_in_list(token.id, end_markers, len)) {
        trace("stmt list token: %s\n", TokenIdStr[token.id]);
        if (arrlen(stmts) + 1 > MAX_STMTS_IN_BLOCK) {
            parse_error(ps, "more than %d (implementation limit) statements in block", MAX_STMTS_IN_BLOCK);
            free(body);
            arrfree(stmts);
            return nil;
        }
        arrput(stmts, stmt(ps, token));

        token = lex_scan(ps->lexer);
        if (token.id == T_EOF) {
            parse_error(ps, "unexpected EOF, expected a statement or `end`");
            break;
        }
        if (token.id == T_SEMICOLON)
            token = lex_scan(ps->lexer);
    }

    trace("token before end next_match: %s\n", TokenIdStr[token.id]);
    if (putback)
        lex_backup(ps->lexer, token);

    /* empty list, just return nil instead of wasting space on a 0-length
     * vector */
    if (arrlen(stmts) == 0) {
        free(body);
        arrfree(stmts);
        return nil;
    }
    body->stmts = stmts;
    return body;
}
/* Parses one statement whose first token `token` was already scanned.
 * Returns the statement node, or nil after reporting an error. A token
 * that cannot start any statement is reported and terminates the process. */
static Ast *
stmt(ParserState *ps, LexToken token)
{
    const enum LexTokenId kind = token.id;

    switch (kind) {
    /* declarations */
    case T_CONST:
    case T_LET:
    case T_VAR:
        return variable_decl(ps, kind);
    case T_PROC:
        return proc_decl(ps);
    case T_HASH:
        return decorated_decl(ps);

    /* simple statements */
    case T_IDENT:
        return funccall_or_assignment(ps, token.ident);
    case T_RETURN:
        return return_stmt(ps);
    case T_BREAK:
        return break_stmt(ps);
    case T_DISCARD:
        return discard_stmt(ps);

    /* control flow */
    case T_IF:
        return if_stmt_expr(ps);
    case T_WHILE:
        return while_stmt(ps);

    /* tokens that are only valid inside another construct */
    case T_ELIF:
        parse_error(ps, "stray 'elif'");
        break;
    case T_ELSE:
        parse_error(ps, "'else' with no accompanying 'if'");
        break;
    case T_END:
        parse_error(ps, "stray 'end' keyword");
        break;
    case T_EOF:
        parse_error(ps, "unexpected EOF while parsing a statement");
        break;

    default:
        parse_error(ps, "invalid statement '%s'", TokenIdStr[kind]);
        exit(1);
    }
    return nil;
}
/* Parses top-level statements until EOF and collects them in an AST_STMTS
 * node. Statements that failed to parse are stored as nil. */
static Ast *
stmt_list(ParserState *ps)
{
    Ast *root = make_tree(AST_STMTS, ps->lexer->cur_loc);
    LexToken tok = lex_scan(ps->lexer);

    while (tok.id != T_EOF) {
        arrput(root->stmts, stmt(ps, tok));
        tok = lex_scan(ps->lexer);
    }
    return root;
}
/* Creates a parser reading tokens from `ls` on behalf of compiler `cm`.
 * The parser starts in the `ok` state. Returns nil when memory is
 * exhausted. Release the result with `parse_destroy`. */
ParserState *
parse_new(Compiler *cm, LexState *ls)
{
    ParserState *ps = calloc(1, sizeof(*ps));
    if (ps == nil)
        return nil; /* previously dereferenced unconditionally */
    ps->cm = cm;
    ps->lexer = ls;
    ps->ok = true;
    return ps;
}
/* Releases a parser created by `parse_new`. Only the parser state itself is
 * freed; the lexer and compiler it points to are left untouched. */
void
parse_destroy(ParserState *ps)
{
    void *state = ps;

    free(state);
}
/* Parses the whole input of `ps` and returns the root AST_STMTS node. */
Ast *
parse(ParserState *ps)
{
    Ast *root = stmt_list(ps);

    return root;
}

21
compiler/parse.h Normal file
View file

@ -0,0 +1,21 @@
#ifndef _parse_h_
#define _parse_h_
#include "ast.h"
#include "state.h"
#include "lex.h"
/* State of one parser instance (see `parse_new`). */
typedef struct {
Compiler *cm; /* compiler state that diagnostics are reported against */
LexState *lexer; /* token source; not freed by `parse_destroy` */
bool ok; /* starts true; cleared by the parser whenever it reports a parse error */
} ParserState;
/* Allocates a parser that reads tokens from `ls`. */
ParserState *
parse_new(Compiler *cm, LexState *ls);
/* Frees a parser created by `parse_new` (the lexer is not freed). */
void
parse_destroy(ParserState *ps);
/* Parses statements until end of input and returns the root of the AST.
 * Check `ps->ok` afterwards to learn whether any parse error was reported. */
Ast *
parse(ParserState *ps);
#endif

158
compiler/pre.h Normal file
View file

@ -0,0 +1,158 @@
#ifndef _pre_h_
#define _pre_h_
/* Prelude file, containing some useful macros and types. */
#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <stdlib.h>
#include <stdarg.h>
typedef int8_t i8;
typedef uint8_t u8;
typedef int16_t i16;
typedef uint16_t u16;
typedef int32_t i32;
typedef uint32_t u32;
typedef int64_t i64;
typedef uint64_t u64;
typedef float f32;
typedef double f64;
typedef uintptr_t uptr;
typedef ptrdiff_t isize;
typedef size_t usize;
typedef _Bool bool;
#define true ((bool)1)
#define false ((bool)0)
#define nil ((void *)0)
#define U64_MAX ((u64)-1)
#define Slice(T) \
struct { \
T *s; \
isize len; \
}
typedef Slice(u8) Str;
#define Optional(T) struct {T val; bool ok;}
#define Some(T, v) (T){v, true}
#define None(T) (T){.ok = false}
/* Meant for use with stb_ds */
#define Vec(T) T *
#define HashMap(K, V) struct { K key; V value; }
#define HashMapStr(V) struct { char *key; V value; }
/* Length of an array */
#define countof(arr) (isize)(sizeof(arr) / sizeof(*(arr)))
/* Length of string literal */
#define lengthof(s) (countof(s) - 1)
#define ViewMem(T, arr, lo, hi) ((Slice(T)){.s = arr+lo, .len = hi - lo})
#define View(sl, lo, hi) ((sl).s += lo, (sl).len = hi - lo, (sl))
/* Iterates over an stb_ds array: `val` is declared by the macro and holds a
 * copy of each element in turn. An empty (or nil) array runs zero
 * iterations. The element is only read after the bounds check, so every
 * element is visited exactly once and nothing past the end is touched.
 * `arr` is evaluated more than once; pass a side-effect-free expression. */
#define foreach(val, arr) \
    for (__typeof__(*(arr)) *__p = (arr), *__end = __p + arrlen((arr)), \
         (val) = (__p < __end ? *__p : (__typeof__(*(arr))){0}); \
         __p < __end; \
         (val) = (++__p < __end ? *__p : (val)))
/* Index of the element currently held in `val`; only valid inside the body
 * of a `foreach` over the same `arr` (it relies on the loop's cursor, since
 * `val` itself is a copy and has no position in the array). */
#define foreach_getindex(val, arr) ((void)sizeof(val), (isize)(__p - (arr)))
/* Useful integer operations good to have. */
#define max(x, y) ((x) > (y) ? (x) : (y))
#define min(x, y) ((x) < (y) ? (x) : (y))
#define clamp(x, lo, hi) max(lo, min(x, hi))
#define BitPos(pos) (1 << (pos))
#if defined(__GNUC__) || defined(__clang__)
# define debugtrap() __builtin_trap()
# define unreachable() __builtin_unreachable()
#else /* not optimal... */
# define debugtrap() abort()
# define unreachable() abort()
#endif
/* Debug helpers, compiled out when NDEBUG is defined.
 *
 * Assert(pred): traps (or writes through a null pointer on other compilers)
 * when `pred` is false.
 * NOTE(review): it expands to a bare `if` statement, so using it as the
 * body of an unbraced `if ... else` changes which `if` the `else` binds to.
 *
 * trace(...): prints "file:line" followed by a printf-style message to
 * stderr. It needs <stdio.h>, which this header does not include itself.
 */
#ifndef NDEBUG
# if defined (__GNUC__) || defined(__clang__)
# define Assert(pred) if (!(pred)) { __builtin_trap(); }
# else
# define Assert(pred) if (!(pred)) { *(volatile int *)0 = 0; }
# endif
# define trace(...) do { \
fprintf(stderr, "%s:%-5i", __FILE__, __LINE__); \
fprintf(stderr, __VA_ARGS__); \
} while (0)
#else
# define Assert(pred)
# define trace(...)
#endif
/* Creates a `Str` from a string literal. The length excludes the terminating NUL. */
#define Sl(s) ((Str){ (u8 *)s, (isize)lengthof(s) })
/* Creates a `Str` from a buffer of size `len` */
#define Sb(s, len) ((Str){ (u8 *)(s), (isize)(len) })
/* Creates a `Str` from a C string. A nil pointer gives a `Str` with length 0.
 * The length excludes the terminating NUL. `s` is evaluated more than once. */
#define Str_from_c(s) ((Str){ (u8 *)(s), (isize)((s) != nil ? strlen(s) : 0) })
/* True when the string has no bytes. */
#define Str_empty(s) ((s).len == 0)
/* Yields `s` unless it is empty, in which case it yields the fallback `sor`. */
#define Str_default(s, sor) (!Str_empty(s) ? (s) : (sor))
/* Local declaration of the C library `vsnprintf` (presumably so that this header does
 * not need to include <stdio.h>). The size parameter is `size_t`, matching the standard
 * prototype; spelling it `unsigned long` conflicts with <stdio.h> on targets where
 * `size_t` is a different type. */
int
vsnprintf(char *, size_t, const char *, va_list);
/* "Converts" a `Str` into a C string. Since `Str` are meant to be
 * null-terminated already, no conversion is made, but ensures that the
 * null terminator is present.
 * Returns nil for an empty `Str` or one with a nil pointer. */
static inline char *
Str_to_c(Str s)
{
	if (s.len == 0 || s.s == nil)
		return nil;
	/* `len` does not count the terminator (see `Sl` and `Str_from_c`), so the
	 * NUL sits at index `len`, right after the last character. */
	Assert(s.s[s.len] == '\0');
	return (char *)s.s;
}
/* Byte-wise equality of two strings: same length and same contents. */
static inline bool
Str_equal(Str s1, Str s2)
{
	if (s1.len != s2.len)
		return false;
	/* Two empty strings are equal without looking at the pointers; handing nil
	 * to memcmp is UB even for a size of 0. */
	if (s1.len == 0)
		return true;
	return memcmp(s1.s, s2.s, s1.len) == 0;
}
/* Heap allocates a new `Str` of `len` bytes plus a terminating NUL (not counted in
 * `len`). When `data` is not nil its first `len` bytes are copied in, otherwise the
 * contents are all zero.
 * Returns an empty `Str` (nil pointer, length 0) if the allocation fails. */
static inline Str
Str_new(const u8 *data, isize len)
{
	Assert(len >= 0);
	Str s = {0};
	/* calloc zeroes the buffer, so the terminator at index `len` is already there */
	u8 *buf = calloc((usize)len + 1, sizeof(*buf));
	if (buf == nil)
		return s;
	if (data != nil && len > 0)
		memcpy(buf, data, (usize)len);
	buf[len] = '\0'; /* ensure; index `len` is the last byte of the allocation */
	s.s = buf;
	s.len = len;
	return s;
}
/* Returns a formatted string (heap allocated) of the exact required size. */
static inline Str
Strafmt(const char *fmt, ...)
{
Str s = {0};
va_list args;
va_start(args, fmt);
/* Calculate buffer size required to hold the formatted string */
int reqs = vsnprintf(nil, 0, fmt, args);
va_end(args);
if (reqs < 0)
return s;
s = Str_new(nil, reqs);
va_start(args, fmt); /* `vsnprintf` touched the arg list, reinitialize it */
/* the nil terminator is guaranteed by `Str_new` */
vsnprintf((char *)s.s, s.len + 1, fmt, args);
va_end(args);
return s;
}
#endif

200
compiler/rutilec.c Normal file
View file

@ -0,0 +1,200 @@
#define _POSIX_C_SOURCE 200809L
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include "pre.h"
#include "lex.h"
#include "parse.h"
#include "sema.h"
#include "state.h"
#include "codegen.h"
#include "messages.h"
#include "libs/optparse.h"
#include "libs/stb_ds.h"
/* Build-time configuration. Each macro can be predefined (e.g. with -D on the compiler
 * command line); the values below are the fallbacks. */
/* Commit identifier printed by -v. */
#ifndef GIT_HASH
# define GIT_HASH "<no hash>"
#endif
/* Where -v tells the user to report bugs. */
#ifndef BUG_REPORT_URL
# define BUG_REPORT_URL "https://codeberg.org/tocariimaa/rutile"
#endif
/* Extension given to generated executables, as a `Str`. Empty means the target has no
 * executable extension. */
#ifndef TARGET_EXE_EXT /* without prefix dot! */
# define TARGET_EXE_EXT Sl("")
#endif
/* Text printed by -h after the usage line. It lists only the common options; -b and -E
 * are accepted by the option parser but not described here. */
static const char *HelpMessage = \
"Summary of common options:\n" \
" -c\tCompile only. Don't link, output an object file instead.\n" \
" -d\tDefine a constant with the specified value.\n" \
" -h\tPrint this help message.\n" \
" -o\tSet output file name of the executable/object file.\n" \
" -v\tPrint the version of this compiler, plus other relevant information.\n" \
" -S\tEmit intermediate code.\n" \
" -R\tSet the code generation mode, 'release' for an optimized build,\n" \
" \t'debug' for a debug build.\n" \
;
/* Creates the output binary file name from the source file name: the source extension
 * is replaced with the platform executable extension, or simply removed if the platform
 * has no binary extension (i.e. UNIX-like OSes).
 *
 * cm           - compiler state, only used for error reporting.
 * src_filename - source file name, must not be empty.
 * exe_ext      - executable extension WITHOUT the leading dot; may be empty.
 *
 * Only the last path component is searched for an extension, and a leading dot (hidden
 * file) is not treated as one. When the source name has no extension and `exe_ext` is
 * empty, no distinct output name can be derived and a fatal error is raised.
 * Returns a heap allocated, NUL-terminated `Str`. */
static Str
make_binary_filename(Compiler *cm, Str src_filename, const Str exe_ext)
{
	const bool exe_has_ext = exe_ext.len > 0;
	const size_t ss = src_filename.len;
	Assert(ss != 0);
	/* Worst case: whole source name + '.' + extension + terminator */
	char *buf = malloc(ss + 2 + (exe_has_ext ? (size_t)exe_ext.len : 0));
	if (buf == nil)
		fatal(cm, nil, "out of memory");
	memcpy(buf, src_filename.s, ss);
	buf[ss] = '\0';

	/* Search backwards for the last '.', without leaving the last path component
	 * and without looking at the very first character of the name. */
	char *dot = nil;
	for (char *p = buf + (ss - 1); p != buf && *p != '/'; --p) {
		if (*p == '.') {
			if (p[-1] != '/') /* "dir/.hidden" has no extension */
				dot = p;
			break;
		}
	}

	/* No extension in filename, edge case really */
	if (dot == nil) {
		if (!exe_has_ext)
			fatal(cm, nil, "output file name required in this case (host OS binary format lacks extension)");
		/* append extension then */
		dot = buf + ss;
	}

	char *end = dot;
	if (exe_has_ext) {
		*end++ = '.';
		memcpy(end, exe_ext.s, exe_ext.len);
		end += exe_ext.len; /* skip over the whole extension, whatever its length */
	}
	*end = '\0';
	return Str_from_c(buf);
}
/* Maps a backend name given on the command line to its `CodegenBackends` value:
 * "c" or "C" selects the C backend, "gcc" the libgccjit one. Any other name is
 * reported through `fatal`. */
static enum CodegenBackends
backend_from_str(Compiler *cm, Str s)
{
	const bool names_c_backend = s.len == 1 && (s.s[0] == 'c' || s.s[0] == 'C');
	if (names_c_backend)
		return CgBackendC;
	if (Str_equal(s, Sl("gcc")))
		return CgBackendLibGccJit;

	fatal(cm, nil, "unknown backend '%s'", s.s);
	unreachable();
}
/* Parses the command line in `argv` and stores the recognized options in `cm->opts`.
 * Also decides whether diagnostics are colored: only when stderr is a terminal and the
 * NO_COLOR environment variable is unset or empty.
 *
 * -h and -v print their text and exit with status 0. An unknown option, a malformed
 * --max-errors value or a missing source file is reported through `fatal`.
 * Returns the first non-option argument (the source file name) as a `Str`. */
static Str
cli_boilerplate(char **argv, Compiler *cm)
{
	const struct optparse_long longopts[] = {
		{"backend", 'b', OPTPARSE_REQUIRED},
		{"compile-only", 'c', OPTPARSE_NONE},
		{"define", 'd', OPTPARSE_REQUIRED},
		{"max-errors", 'E', OPTPARSE_REQUIRED},
		{"release", 'R', OPTPARSE_REQUIRED},
		{"emit-ir", 'S', OPTPARSE_OPTIONAL},
		{"exe", 'o', OPTPARSE_REQUIRED},
		{"version", 'v', OPTPARSE_NONE},
		{"help", 'h', OPTPARSE_NONE},
		{0},
	};
	const char *no_fun_env = getenv("NO_COLOR");
	cm->opts.color = isatty(STDERR_FILENO) && !(no_fun_env != nil && *no_fun_env != '\0');

	struct optparse opts;
	optparse_init(&opts, argv);
	int opt; /* same type as the return value of `optparse_long` */
	while ((opt = optparse_long(&opts, longopts, nil)) != -1) {
		switch (opt) {
		case 'b':
			cm->opts.backend = backend_from_str(cm, Str_from_c(opts.optarg));
			break;
		case 'c':
			cm->opts.compile_only = true;
			break;
		case 'd':
			trace("define: %s\n", opts.optarg);
			arrput(cm->opts.defines, Str_from_c(opts.optarg));
			break;
		case 'h':
			printf("Usage: %s [options...] files...\n\n%s\n", *argv, HelpMessage);
			exit(0);
		case 'E': {
			/* strtol instead of atoi, so that garbage is rejected rather
			 * than silently read as 0 */
			char *end = nil;
			const long limit = strtol(opts.optarg, &end, 10);
			if (end == opts.optarg || *end != '\0')
				fatal(cm, nil, "invalid number for max-errors: '%s'", opts.optarg);
			cm->opts.max_errors = (isize)limit;
			break;
		}
		case 'R':
			trace("release: %s\n", opts.optarg);
			cm->opts.release_mode = Str_from_c(opts.optarg);
			break;
		case 'S':
			/* accepted, but currently has no effect */
			break;
		case 'o':
			cm->opts.exe_out = Str_from_c(opts.optarg);
			break;
		case 'v':
			printf("Rutile compiler v0.0.1\n");
			printf("git commit: %s\nReport bugs here: %s\n", GIT_HASH, BUG_REPORT_URL);
			exit(0);
		case '?':
			fatal(cm, nil, "%s: %s", *argv, opts.errmsg);
		}
	}

	const char *src_filename = optparse_arg(&opts);
	if (src_filename == nil)
		fatal(cm, nil, "no input files specified");
	return Str_from_c(src_filename);
}
/* Compiler driver: parses the command line, opens the source ("-" means stdin), then
 * runs lexer -> parser -> semantic analysis -> code generation.
 * Returns EXIT_SUCCESS only when every stage ran; a parse or semantic error gives
 * EXIT_FAILURE so that scripts and build systems can detect the failure. */
int
main(int argc, char **argv)
{
	(void)argc;
	int status = EXIT_FAILURE;
	Compiler cm = {
		.opts = {
			.backend = CgBackendC,
			.max_errors = 20,
		}
	};
	Str src_filename = cli_boilerplate(argv, &cm);

	FILE *src_in = nil;
	if (src_filename.s[0] == '-' && src_filename.s[1] == '\0') {
		src_in = stdin;
		src_filename = Sl("<stdin>");
	} else {
		if ((src_in = fopen((char *)src_filename.s, "rb")) == nil) {
			fatal(&cm, nil, "can't open: %s", src_filename.s);
		}
	}
	cm.current_filename = src_filename;

	/* No -o given: derive the output name from the source name */
	if (cm.opts.exe_out.len == 0)
		cm.opts.exe_out = make_binary_filename(&cm, src_filename, TARGET_EXE_EXT);
	if (Str_equal(cm.opts.exe_out, cm.current_filename)) {
		fatal(&cm, nil, "input source file and output file are the same");
	}

	/* Compiler pipeline */
	LexState *ls = lex_new(&cm, src_in, src_filename, 4);
	ParserState *ps = parse_new(&cm, ls);
	SemaCtx *ss = sema_new(&cm);

	Ast *program = parse(ps);
	if (ps->ok) {
		sema(ss, program);
		/* Code generation only runs on a program that passed every check */
		if (ss->ok) {
			CodegenCtx *cgctx = codegen_new(&cm, cm.opts.backend);
			codegen(cgctx, program);
			codegen_destroy(cgctx);
			status = EXIT_SUCCESS;
		}
	}

	sema_destroy(ss);
	parse_destroy(ps);
	lex_destroy(ls);
	fclose(src_in);
	return status;
}

980
compiler/sema.c Normal file
View file

@ -0,0 +1,980 @@
/* Semantic analyzer and type checker */
#include <stdlib.h>
#include <string.h>
#include "pre.h"
#include "sema.h"
#include "datatype.h"
#include "location.h"
#include "symbol.h"
#include "ast.h"
#include "state.h"
#include "messages.h"
#include "libs/stb_ds.h"
/* Reports an error through `error` and marks the context as failed. */
#define sema_error(ctx, loc, ...) do { \
	error((ctx)->cm, loc, __VA_ARGS__); \
	(ctx)->ok = false; \
} while (0)
/* Warnings and notes are reported without affecting the `ok` flag. */
#define sema_warning(ctx, loc, ...) warning((ctx)->cm, loc, __VA_ARGS__)
#define sema_note(ctx, loc, ...) note((ctx)->cm, loc, __VA_ARGS__)
/* True for statements after which the rest of a statement list cannot run.
 * `s` is a pointer to an AST node; it is evaluated twice. */
#define sema_is_stmt_terminal(s) ((s)->type == AST_RETURN || (s)->type == AST_BREAK)
/* Inserts (or replaces) the symbol `v` under the name `k` in the stb_ds string map `syms`. */
#define sym_insert(syms, k, v) shput(syms, k, v)
/* Bit flags describing where in the program the analyzer currently is. They are OR-ed
 * into `SemaCtx.flags` (a `u64`); each enumerator itself is an `int` built with `BitPos`. */
enum SemaCtxFlags
{
SctxInsideProc = BitPos(0), /* set while analyzing a proc body */
SctxInsideLoop = BitPos(1), /* set while analyzing a loop body */
SctxInsideIf = BitPos(2),
SctxInTopLevel = BitPos(3),
SctxInExpr = BitPos(4), /* set while analyzing an expression */
SctxInDiscard = BitPos(5), /* set while analyzing the operand of `discard` */
SctxInStmtBlock = BitPos(6),
};
/* One entry of a scope's symbol table. */
typedef struct {
enum SymbolKind kind;
/* The data type associated with the symbol. */
DataType *dtype;
bool used; /* set once the symbol is referenced; drives the "unused" warnings */
bool procparm; /* if its a proc parameter */
Location loc; /* where the symbol was declared */
} Symbol;
/* stb_ds string hash map entry: symbol name -> Symbol */
typedef HashMapStr(Symbol) SymbolEntry;
/* A lexical scope. Scopes form a singly linked stack through `prev`; symbol lookup
 * walks from the innermost scope outwards. */
struct Scope
{
Scope *prev; /* Previous scope in the stack */
SymbolEntry *symbols; /* All the symbols in this scope */
};
typedef struct {
bool ok;
} SemaStatus;
/* Default value of every symbol table: a lookup that finds nothing yields this symbol,
 * recognizable by its `SymInvalid` kind. */
static const Symbol InvalidSymbol = {.kind = SymInvalid};
/* Sentinel type returned when a type could not be resolved; compared by address. */
static const DataType *InvalidDataType = &(DataType){.kind = DtkInvalid};
/* Forward declarations for functions that are used before their definition
 * (several of them are mutually recursive). */
static DataTypeCheck
datatype_struct_cmp(SemaCtx *sctx, DataType *s1, DataType *s2);
static DataTypeCheck
datatype_cmp(SemaCtx *sctx, DataType *dt1, DataType *dt2);
static DataType *
sema_expr(SemaCtx *sctx, Ast *expr, Location loc);
static void
sema_expr_list(SemaCtx *sctx, Vec(Ast *) exprs, Location loc);
static void
sema_node(SemaCtx *sctx, Ast *node);
static void
sema_stmts(SemaCtx *sctx, Vec(Ast *) stmts);
static DataType *
resolve_datatype(SemaCtx *sctx, const Str ident, Location loc);
/* Allocates an empty scope stacked on top of `prev`. Its symbol table is an arena
 * backed string map whose "not found" value is `InvalidSymbol`. */
static Scope *
make_scope(Scope *prev)
{
	Scope *scope = malloc(sizeof(Scope));
	scope->symbols = nil;
	scope->prev = prev;
	sh_new_arena(scope->symbols);
	shdefault(scope->symbols, InvalidSymbol);
	return scope;
}
/* Allocates a zeroed context frame bound to the compiler state `cm` and linked to the
 * frame `prev` (nil for the bottom frame). */
static SemaCtx *
make_semactx(Compiler *cm, SemaCtx *prev)
{
	SemaCtx *frame = calloc(1, sizeof(SemaCtx));
	frame->prev = prev;
	frame->cm = cm;
	return frame;
}
/* Allocates a data type descriptor with the given kind, size, builtin flag and
 * signedness; every other field is zero. */
static DataType *
make_data_type(enum DataTypeKind kind, u16 size, bool builtin, bool sign)
{
	DataType *type = calloc(1, sizeof(DataType));
	type->sign = sign;
	type->builtin = builtin;
	type->size = size;
	type->kind = kind;
	return type;
}
/* Allocates a proc type descriptor with the given return type and argument type list;
 * every other field is zero. */
static DataType *
make_proc_type(bool builtin, DataType *rettype, Vec(DataType *) argtypes)
{
	DataType *proc_type = calloc(1, sizeof(DataType));
	proc_type->proc.argtypes = argtypes;
	proc_type->proc.rettype = rettype;
	proc_type->builtin = builtin;
	proc_type->kind = DtkProc;
	return proc_type;
}
/* Resolves the declared type of every identifier/type pair in `idents`, in order, and
 * returns the results as a new stb_ds array (nil when `idents` is nil or empty).
 * Types that cannot be resolved are reported by `resolve_datatype` and appear in the
 * list as its invalid-type sentinel. */
static Vec(DataType *)
make_type_list_from_idents(SemaCtx *sctx, Vec(AstIdentTypePair) idents)
{
	Vec(DataType *) dts = nil;
	/* Plain index loop: every element is visited exactly once, in order */
	const isize count = arrlen(idents);
	for (isize i = 0; i < count; ++i)
		arrput(dts, resolve_datatype(sctx, idents[i].dtype, idents[i].dtype_loc));
	return dts;
}
/* Copies the `len` type pointers of the plain C array `a` into a new stb_ds array.
 * Returns nil when `len` is zero or negative. */
static Vec(DataType *)
make_proc_args(DataType *a[], isize len)
{
	Vec(DataType *) args = nil;
	if (len <= 0)
		return args;
	arrsetlen(args, len);
	/* memcpy counts bytes, not elements */
	memcpy(args, a, (usize)len * sizeof(*args));
	return args;
}
/* Pushes a new context frame and makes `*sctx` point to it. The new frame inherits the
 * whole state of the previous one: flags, scopes, status, the cached builtin types and
 * the `main_defined` flag.
 * XXX: could rather only push flags...
 */
static void
push_semactx(SemaCtx **sctx)
{
	SemaCtx *parent = *sctx;
	SemaCtx *frame = make_semactx(parent->cm, parent);
	frame->flags = parent->flags;
	frame->current_scope = parent->current_scope;
	frame->top_scope = parent->top_scope;
	frame->ok = parent->ok;
	/* Without these, code running in a pushed frame would see nil builtin types
	 * (e.g. the `void` type used to check ignored call results). */
	frame->builtintypes = parent->builtintypes;
	frame->main_defined = parent->main_defined;
	*sctx = frame;
}
/* Discards the current context frame and makes `*sctx` point to the previous one again.
 * The `ok` status of the discarded frame is carried over, so errors found inside it are
 * not lost. Popping the bottom frame is a compiler bug. */
static void
pop_semactx(SemaCtx **sctx)
{
	SemaCtx *top = *sctx;
	SemaCtx *below = top->prev;
	compiler_assert(top->cm, below != nil);
	below->ok = top->ok;
	free(top);
	*sctx = below;
}
/* Opens a new innermost scope on top of the current one. */
static void
enter_scope(SemaCtx *sctx)
{
	Scope *inner = make_scope(sctx->current_scope);
	sctx->current_scope = inner;
}
/* Leaves the innermost scope, making its parent current again. The scope being left is
 * not freed. Leaving the outermost scope is a compiler bug. */
static void
exit_scope(SemaCtx *sctx)
{
	Scope *outer = sctx->current_scope->prev;
	compiler_assert(sctx->cm, outer != nil);
	sctx->current_scope = outer;
}
/* Looks `name` up in `scope` only, ignoring enclosing scopes.
 * Returns the symbol, or nil when the scope has no symbol with that name. */
Symbol *
sym_search_oncurrent(Scope *scope, const Str name)
{
	/* A miss yields the table default, which has kind `SymInvalid` */
	Symbol *sym = &shget(scope->symbols, name.s);
	return sym->kind == SymInvalid ? nil : sym;
}
/* Looks `name` up starting at `scope` and continuing through the enclosing scopes.
 * Returns the innermost match, or nil when no scope declares the name. */
Symbol *
sym_search(Scope *scope, const Str name)
{
	Symbol *found = nil;
	for (Scope *sp = scope; sp != nil && found == nil; sp = sp->prev)
		found = sym_search_oncurrent(sp, name);
	return found;
}
/* Warns about every var-like binding (proc parameters included) of the current scope
 * that was never marked as used. */
static void
sema_check_unused_vars(SemaCtx *sctx)
{
	const SymbolEntry *entries = sctx->current_scope->symbols;
	const isize count = shlen(entries);
	for (isize i = 0; i < count; ++i) {
		const Symbol sym = entries[i].value;
		if (sym.used || !symbol_is_var_binding(sym.kind))
			continue;
		const char *what = sym.procparm ? "proc parameter" : "variable";
		sema_warning(sctx, &sym.loc, "unused %s '%s'", what, entries[i].key);
	}
}
/* Placeholder for dead statement detection: currently does nothing with its arguments. */
static void
sema_check_dead_stmts(SemaCtx *sctx, Vec(Ast *) stmts)
{
(void)sctx, (void)stmts;
/* those who forsake the CFG are doomed to implement it badly without even
 * noticing... */
}
/* Reports an error when the symbol `fsym`, called under the name `fident`, does not
 * have a proc type. Does nothing otherwise. */
static void
sema_match_proc_type(SemaCtx *sctx, Symbol *fsym, Str fident)
{
	if (fsym->dtype->kind == DtkProc)
		return;
	sema_error(
		sctx, nil,
		"cannot call '%s' because has non-proc type '%s'",
		fident.s, "uh"
	);
}
/* Checks a proc call: the callee must be declared and have a proc type, the number of
 * arguments must match, every argument expression is analyzed, and a non-void result
 * may only be dropped inside an expression or under `discard`.
 *
 * Returns the type of the call expression, i.e. the callee's return type, or nil when
 * the call is invalid (an error has been reported in that case). */
static DataType *
sema_proccall(SemaCtx *sctx, const AstProcCall *call, Location loc)
{
	Symbol *fsym = sym_search(sctx->current_scope, call->name);
	if (fsym == nil) {
		sema_error(sctx, &loc, "call to undeclared proc '%s'", call->name.s);
		return nil;
	}
	fsym->used = true;
	sema_match_proc_type(sctx, fsym, call->name);
	/* The proc fields are meaningless for any other kind of type; the error was
	 * already reported above. */
	if (fsym->dtype->kind != DtkProc)
		return nil;

	/* check call arguments */
	const isize proc_arglen = arrlen(fsym->dtype->proc.argtypes);
	if (call->args != nil) {
		compiler_assert(sctx->cm, call->args->type == AST_EXPRS);
		const isize call_arglen = arrlen(call->args->exprs);
		if (call_arglen != proc_arglen) {
			const char *at_most = call_arglen > proc_arglen ? "s at most" : "";
			/* counts are cast to match the `%li` conversions */
			sema_error(
				sctx, &loc,
				"argument length mismatch: given %li arguments to '%s' but it expects %li argument%s",
				(long)call_arglen, call->name.s, (long)proc_arglen, at_most
			);
			return nil;
		}
		sema_expr_list(sctx, call->args->exprs, loc); /* now sema-check the args */
	} else if (proc_arglen != 0) {
		sema_error(sctx, &loc, "'%s' proc takes %li argument(s), but none given",
			call->name.s, (long)proc_arglen);
		return nil;
	}

	if (fsym->dtype->proc.rettype != sctx->builtintypes.void_t
	    && (~sctx->flags & SctxInDiscard)
	    && (~sctx->flags & SctxInExpr)) {
		sema_error(sctx, &loc, "result of function call with non-void type ignored");
		sema_note(sctx, &loc, "use 'discard' if this was intentional");
		return nil;
	}
	/* TODO: the argument types are not compared against the parameter types yet */
	return fsym->dtype->proc.rettype;
}
/************ Semantic and type checking of expressions ************/
/* Type checking for expressions is done inside-out */

/* Types a number literal. Type rule axiom: every number literal gets the builtin `u64`
 * type, which is also stored in the node. */
static DataType *
sema_expr_number(SemaCtx *sctx, AstNumber *num)
{
	Symbol *u64_sym = sym_search_oncurrent(sctx->top_scope, Sl("u64"));
	num->type = u64_sym->dtype;
	return num->type;
}
/* Types a string literal. Type rule axiom: every string literal has the builtin
 * `string` type; the literal itself is not inspected. */
static DataType *
sema_expr_strlit(SemaCtx *sctx, const Str *strlit)
{
	(void)strlit;
	(void)sctx;
	Symbol *string_sym = sym_search_oncurrent(sctx->top_scope, Sl("string"));
	return string_sym->dtype;
}
/* Resolves an identifier used inside an expression and marks its symbol as used.
 * Returns nil, after reporting an error, when the identifier is undeclared or names a
 * data type. */
static Symbol *
sema_expr_ident(SemaCtx *sctx, const Str ident)
{
	Symbol *sym = sym_search(sctx->current_scope, ident);
	if (sym != nil && sym->kind != SymType) {
		sym->used = true;
		return sym;
	}
	if (sym == nil)
		sema_error(sctx, nil, "undeclared identifier '%s'", ident.s);
	else
		sema_error(sctx, nil, "data type '%s' used as identifier in expression", ident.s);
	return nil;
}
/* Analyzes a unary expression. Only the operand is analyzed for now and its type is
 * returned as the type of the whole expression; the operator is not checked yet
 * (e.g. applying it to a string literal, or negating an unsigned number). */
static DataType *
sema_expr_unary(SemaCtx *sctx, AstUnary *unary, Location loc)
{
	Ast *operand = unary->atom;
	compiler_assert(sctx->cm, ast_node_is_expr(operand->type));
	DataType *operand_type = sema_expr(sctx, operand, loc);
	return operand_type;
}
/* Analyzes a binary expression: the operator must exist in the top scope as a proc
 * taking two arguments, both operands are analyzed and their types must be compatible.
 *
 * Returns the type of the left operand on success, the invalid-type sentinel when an
 * operand already failed to type (so the error is not reported twice), or nil when this
 * expression itself is in error. */
static DataType *
sema_binop(SemaCtx *sctx, const AstBinop *expr, Location loc)
{
	Symbol *opsym = sym_search_oncurrent(sctx->top_scope, expr->op);
	if (opsym == nil) {
		sema_error(sctx, nil, "no operator '%s'", expr->op.s);
		return nil;
	}
	if (arrlen(opsym->dtype->proc.argtypes) != 2) {
		sema_error(sctx, nil, "no binary operator for '%s'", expr->op.s);
		return nil;
	}

	DataType *ldt = sema_expr(sctx, expr->left, loc);
	DataType *rdt = sema_expr(sctx, expr->right, loc);
	/* Skip typechecking if either ldt or rdt have `InvalidDataType` and propagate
	 * it up the call stack. */
	if (ldt == InvalidDataType || rdt == InvalidDataType)
		return (DataType *)InvalidDataType;

	DataTypeCheck tchk = datatype_cmp(sctx, ldt, rdt);
	if (!tchk.ok) {
		/* Not every mismatch carries a message (`msg.s` may be nil), and
		 * a nil pointer must never reach `%s`. */
		const char *why = tchk.msg.len > 0 ? (const char *)tchk.msg.s : "mismatched operand types";
		sema_error(sctx, &loc, "type error: %s", why);
		return nil;
	}
	return ldt;
}
/* Analyzes the expression `expr` and returns its data type. The analysis runs in its
 * own context frame with the `SctxInExpr` flag set.
 * The result is nil or the invalid-type sentinel when the expression is in error; the
 * error itself is reported by the function handling that kind of node. */
static DataType *
sema_expr(SemaCtx *sctx, Ast *expr, Location loc)
{
	compiler_assert(sctx->cm, ast_node_is_expr(expr->type));
	push_semactx(&sctx);
	sctx->flags |= SctxInExpr;

	DataType *dt = nil;
	switch (expr->type) {
	case AST_BINEXPR:
		dt = sema_binop(sctx, &expr->bin, loc);
		break;
	case AST_UNARY:
		dt = sema_expr_unary(sctx, &expr->unary, loc);
		break;
	case AST_NUMBER:
		dt = sema_expr_number(sctx, &expr->number);
		break;
	case AST_STRLIT:
		dt = sema_expr_strlit(sctx, &expr->strlit);
		break;
	case AST_IDENT: {
		/* nil means the identifier could not be resolved (already reported);
		 * hand back the invalid type so that callers skip further checks */
		Symbol *sym = sema_expr_ident(sctx, expr->ident);
		dt = sym != nil ? sym->dtype : (DataType *)InvalidDataType;
		break;
	}
	case AST_PROCCALL:
		dt = sema_proccall(sctx, &expr->call, expr->loc);
		break;
	default:
		unreachable();
	}

	pop_semactx(&sctx);
	return dt;
}
/* Analyzes every expression of the list `exprs`, in order. A nil or empty list is fine. */
static void
sema_expr_list(SemaCtx *sctx, Vec(Ast *) exprs, Location loc)
{
	/* Plain index loop: every element is visited exactly once */
	const isize count = arrlen(exprs);
	for (isize i = 0; i < count; ++i)
		sema_expr(sctx, exprs[i], loc);
}
/************ Type checking ************/
/* Structurally compares two compound data types: both structs or both unions. They
 * match when they agree on packing, have the same number of fields and every pair of
 * fields matches in declaration order. Returns the first failing check, or an `ok`
 * result. */
static DataTypeCheck
datatype_struct_cmp(SemaCtx *sctx, DataType *s1, DataType *s2)
{
	/* `datatype_cmp` sends unions here as well as structs */
	compiler_assert(sctx->cm,
		(s1->kind == DtkStruct || s1->kind == DtkUnion) && s1->kind == s2->kind);
	const DataTypeCompound *s1s = &s1->compound;
	const DataTypeCompound *s2s = &s2->compound;

	if (s1s->packed != s2s->packed)
		return (DataTypeCheck){false, Sl("")};
	if (arrlen(s1s->fields) != arrlen(s2s->fields))
		return (DataTypeCheck){false, Sl("")};

	for (isize i = 0; i < arrlen(s1s->fields); ++i) {
		DataTypeCheck tchk;
		if (!(tchk = datatype_cmp(sctx, s1s->fields[i], s2s->fields[i])).ok)
			return tchk;
	}
	return (DataTypeCheck){.ok = true};
}
/* Compares two array types: they match when they have the same length and their
 * element types match. */
static DataTypeCheck
datatype_array_cmp(SemaCtx *sctx, DataType *a1, DataType *a2)
{
	const bool same_len = a1->array.len == a2->array.len;
	if (!same_len)
		return (DataTypeCheck){false, Sl("")};
	/* Whatever the element comparison says is the final verdict */
	return datatype_cmp(sctx, a1->array.base, a2->array.base);
}
/* Compares two proc types: they match when they agree on visibility, linkage, C
 * varargs and argument count, and when the return types and every pair of argument
 * types match. Returns the first failing check, or the last successful one. */
static DataTypeCheck
datatype_proc_cmp(SemaCtx *sctx, DataType *pc1, DataType *pc2)
{
	const bool signature_differs =
		pc1->proc.public != pc2->proc.public
		|| pc1->proc.extern_lnk != pc2->proc.extern_lnk
		|| pc1->proc.c_varargs != pc2->proc.c_varargs
		|| arrlen(pc1->proc.argtypes) != arrlen(pc2->proc.argtypes);
	if (signature_differs)
		return (DataTypeCheck){false, Sl("")};

	DataTypeCheck tchk = datatype_cmp(sctx, pc1->proc.rettype, pc2->proc.rettype);
	/* Stop at the first mismatch and hand its result back */
	for (isize i = 0; tchk.ok && i < arrlen(pc1->proc.argtypes); ++i)
		tchk = datatype_cmp(sctx, pc1->proc.argtypes[i], pc2->proc.argtypes[i]);
	return tchk;
}
/* Compares two basic types: `dt1` is accepted when it is not larger than `dt2` and both
 * have the same signedness. */
static DataTypeCheck
datatype_basic_cmp(SemaCtx *sctx, DataType *dt1, DataType *dt2)
{
	(void)sctx;
	const bool fits = dt1->size <= dt2->size;
	if (!fits)
		return (DataTypeCheck){false, Sl("")};
	const bool same_sign = dt1->sign == dt2->sign;
	if (!same_sign)
		return (DataTypeCheck){false, Strafmt("integers with different sign")};
	return (DataTypeCheck){.ok = true};
}
/* Compares two data types, returning an `ok` result when they match. A nil type never
 * matches; the same object always does; different kinds never do. Beyond that the
 * comparison is delegated by kind. Kinds without a rule do not match. */
static DataTypeCheck
datatype_cmp(SemaCtx *sctx, DataType *dt1, DataType *dt2)
{
	if (dt1 == nil || dt2 == nil)
		return (DataTypeCheck){false, Sl("")};
	/* TODO: return more information in case of a mismatch... */
	if (dt1 == dt2) /* shallow */
		return (DataTypeCheck){.ok = true};
	if (dt1->kind != dt2->kind)
		return (DataTypeCheck){.ok = false};

	DataTypeCheck verdict = {.ok = false};
	switch (dt1->kind) {
	case DtkBasic:
		verdict = datatype_basic_cmp(sctx, dt1, dt2);
		break;
	case DtkStruct:
	case DtkUnion:
		verdict = datatype_struct_cmp(sctx, dt1, dt2);
		break;
	case DtkProc:
		verdict = datatype_proc_cmp(sctx, dt1, dt2);
		break;
	case DtkArray:
		verdict = datatype_array_cmp(sctx, dt1, dt2);
		break;
	case DtkBool:
	case DtkVoid:
		/* nothing else to compare once the kinds are equal */
		verdict.ok = true;
		break;
	default:
		break;
	}
	return verdict;
}
/* Returns the data type of an expression node: the type stored in the node for binary,
 * unary and number nodes, the builtin `string` type for string literals, and a symbol
 * table lookup for identifiers and calls (a call yields the callee's return type).
 * The lookups are not checked for failure. */
static DataType *
expr_get_datatype(SemaCtx *sctx, Ast *expr)
{
	compiler_assert(sctx->cm, ast_node_is_expr(expr->type));
	DataType *result = nil;
	switch (expr->type) {
	case AST_BINEXPR:
		result = expr->bin.type;
		break;
	case AST_UNARY:
		result = expr->unary.type;
		break;
	case AST_NUMBER:
		result = expr->number.type;
		break;
	case AST_STRLIT:
		result = sym_search_oncurrent(sctx->top_scope, Sl("string"))->dtype;
		break;
	/* XXX: for these two we could attach the type in the ast... */
	case AST_IDENT:
		result = sym_search(sctx->current_scope, expr->ident)->dtype;
		break;
	case AST_PROCCALL:
		result = sym_search(sctx->current_scope, expr->call.name)->dtype->proc.rettype;
		break;
	default:
		unreachable();
	}
	return result;
}
/* Resolves the type name `ident` through the symbol table. Returns the data type, or
 * the invalid-type sentinel (after reporting an error at `loc`) when the name is
 * undeclared or does not name a type. */
static DataType *
resolve_datatype(SemaCtx *sctx, const Str ident, Location loc)
{
	Symbol *dtsym = sym_search(sctx->current_scope, ident);
	if (dtsym != nil && dtsym->kind == SymType)
		return dtsym->dtype;

	if (dtsym == nil)
		sema_error(sctx, &loc, "no such type '%s'", ident.s);
	else
		sema_error(sctx, &loc, "'%s' is not a type but a %s", ident.s, SymbolKindStr[dtsym->kind]);
	return (DataType *)InvalidDataType;
}
/* Analyzes a proc definition: rejects a name that is already declared, resolves the
 * return and parameter types, registers the proc in the current scope and, when there
 * is a body, analyzes it in a fresh context frame and scope that contains the
 * parameters. `loc` is the location of the definition. */
static void
sema_procdef(SemaCtx *sctx, AstProc *proc, Location loc)
{
/* The name must not be visible from here, whatever kind of symbol holds it */
Symbol *sym_prev;
if ((sym_prev = sym_search(sctx->current_scope, proc->name)) != nil) {
sema_error(
sctx, nil,
"'%s' was already declared as a %s",
proc->name.s, SymbolKindStr[sym_prev->kind]
);
sema_note(sctx, &sym_prev->loc, "'%s' previously declared here", proc->name.s);
return;
}
/* Remember that an entry point exists; it also has to be public */
if (Str_equal(proc->name, Sl("main"))) {
sctx->main_defined = true;
if (!proc->ispublic) {
sema_error(sctx, &loc, "'main' has to be declared as a public proc");
}
}
const Ast *rettype_node = proc->rettype;
DataType *proc_rettype = nil;
if (rettype_node != nil) {
compiler_assert(sctx->cm, rettype_node->type == AST_IDENT);
proc_rettype = resolve_datatype(sctx, proc->rettype->ident, rettype_node->loc);
/* the error was reported by `resolve_datatype`; the proc is not registered */
if (proc_rettype == InvalidDataType)
return;
} else {
/* return type node is nil, we infer that as a `void` type */
proc_rettype = sctx->builtintypes.void_t;
}
Vec(DataType *) procargs = make_type_list_from_idents(sctx, proc->args);
DataType *procdtype = make_proc_type(false, proc_rettype, procargs);
procdtype->proc.public = proc->ispublic;
Symbol proc_sym = {
.kind = SymProc,
.dtype = procdtype,
.loc = loc
};
/* Registered before the body is analyzed, so the body can refer to the proc itself */
sym_insert(sctx->current_scope->symbols, proc->name.s, proc_sym);
proc->type = procdtype;
/* proc has no body at all */
if (proc->body == nil)
return;
/* analyze the body */
compiler_assert(sctx->cm, proc->body->type == AST_STMTS);
push_semactx(&sctx);
enter_scope(sctx);
compiler_assert(sctx->cm, arrlen(proc->args) == arrlen(procargs));
/* Inject proc parameters into the proc body top scope */
for (isize i = 0; i < arrlen(proc->args); ++i) {
DataType *argdtype = procargs[i];
enum SymbolKind argsymkind = proc->args[i].kind;
compiler_assert(sctx->cm, argdtype != nil);
compiler_assert(sctx->cm, argsymkind == SymLet || argsymkind == SymVar);
Symbol argsym = {
.kind = argsymkind,
.dtype = argdtype,
.procparm = true,
.loc = proc->args[i].loc
};
sym_insert(sctx->current_scope->symbols, proc->args[i].ident.s, argsym);
}
sctx->flags |= SctxInsideProc;
sema_stmts(sctx, proc->body->stmts);
/* The unused check has to run while the body scope is still the current one */
sema_check_unused_vars(sctx);
exit_scope(sctx);
pop_semactx(&sctx);
}
/* Checks a `return` statement: it must appear inside a proc. The returned expression,
 * if any, is analyzed either way. */
static void
sema_return(SemaCtx *sctx, Ast *ret_expr, Location loc)
{
	const bool inside_proc = (sctx->flags & SctxInsideProc) != 0;
	if (!inside_proc)
		sema_error(sctx, &loc, "'return' outside of proc");
	if (ret_expr == nil)
		return;
	sema_expr(sctx, ret_expr, loc);
}
/* Checks a `break` statement: it must appear inside a loop. The second parameter is
 * ignored. */
static void
sema_break(SemaCtx *sctx, Ast *unused, Location loc)
{
	(void)unused;
	const bool inside_loop = (sctx->flags & SctxInsideLoop) != 0;
	if (!inside_loop)
		sema_error(sctx, &loc, "'break' used outside of a loop");
}
/* Analyzes the operand of a `discard` statement with the `SctxInDiscard` flag raised,
 * which permits dropping the result of a non-void call. The flag is cleared again
 * afterwards. */
static void
sema_discard(SemaCtx *sctx, Ast *expr, Location loc)
{
	const u64 discard_bit = SctxInDiscard;
	sctx->flags |= discard_bit;
	sema_expr(sctx, expr, loc);
	sctx->flags &= ~discard_bit;
}
/* Analyzes the node an attribute is attached to. The attribute itself is not
 * inspected. */
static void
sema_attribute(SemaCtx *sctx, AstAttribute *attr)
{
	Ast *target = attr->node;
	sema_node(sctx, target);
}
/* Analyzes a variable-like declaration (`let`, `var` or `const`): checks for a clash
 * with a visible symbol of a different kind, analyzes the initializer if there is one,
 * resolves the declared type and inserts the binding into the current scope.
 * `loc` is the location of the declaration. */
static void
sema_var_decl(SemaCtx *sctx, AstVarDecl *decl, Location loc)
{
	compiler_assert(sctx->cm, symbol_is_var_binding(decl->kind));
	const Symbol *symp = sym_search(sctx->current_scope, decl->name);
	if (symp != nil && symp->kind != decl->kind) {
		switch (symp->kind) {
		case SymLet:
			sema_error(sctx, &symp->loc, "'%s' was already declared as 'let'", decl->name.s);
			return;
		case SymVar:
			sema_error(sctx, &symp->loc, "'%s' was already declared as 'var'", decl->name.s);
			return;
		case SymConst:
			sema_error(
				sctx, &symp->loc,
				"declaration of '%s' shadows previously declared constant with the same name",
				decl->name.s
			);
			return;
		case SymType:
			sema_error(sctx, &symp->loc, "'%s' was already declared as a type", decl->name.s);
			return;
		default:
			break;
		}
		sema_note(sctx, &symp->loc, "'%s' was declared in this line", decl->name.s);
	}

	Ast *dexpr = decl->expr;
	if (dexpr != nil) {
		sema_expr(sctx, dexpr, loc); /* check the assignment expression */
	} else {
		sema_warning(sctx, &loc, "variable is uninitialized");
	}

	if (decl->datatype == nil) {
		/* point at the declaration so the user can find it */
		sema_error(sctx, &loc, "we don't do type inference yet sorry");
		return;
	}
	compiler_assert(sctx->cm, decl->datatype->type == AST_IDENT);
	DataType *dtype = resolve_datatype(sctx, decl->datatype->ident, decl->datatype->loc);
	/* Note that we ignore whether `resolve_datatype` return an invalid type,
	 * since we still want to insert the variable into the symbol table,
	 * otherwise we would have spurious "undeclared identifier" errors. */
	decl->type = dtype;
	Symbol sym = {
		.kind = decl->kind,
		.dtype = dtype,
		.loc = loc,
	};
	/* Insert the variable to the symbol table */
	sym_insert(sctx->current_scope->symbols, decl->name.s, sym);
}
/* Analyzes an assignment: the target must be a declared, mutable (`var`) binding. The
 * assigned expression is analyzed even when the target is invalid, so that its own
 * errors are reported too. Exactly one error is reported for a bad target.
 * TODO: the type of the expression is not compared against the variable's type yet. */
static void
sema_var_assign(SemaCtx *sctx, AstVarAssign *assign, Location loc)
{
	Symbol *decl = sym_search(sctx->current_scope, assign->name);
	if (decl == nil)
		sema_error(sctx, &loc, "assign to undeclared variable '%s'", assign->name.s);
	else if (decl->kind != SymType)
		decl->used = true; /* an assignment counts as a use of the target */

	sema_expr(sctx, assign->expr, loc);
	if (decl == nil)
		return;

	if (!symbol_is_var_binding(decl->kind)) {
		sema_error(
			sctx, &loc,
			"assign to non-variable symbol ('%s' is a '%s')",
			assign->name.s, SymbolKindStr[decl->kind]
		);
		return;
	}
	if (decl->kind != SymVar) {
		sema_error(
			sctx, &loc,
			"assign to immutable symbol ('%s' was declared as '%s')",
			assign->name.s, SymbolKindStr[decl->kind]
		);
		return;
	}
}
/* Analyzes an `if`: the condition, the true and false bodies, and then the condition
 * and body of every `elif` branch. */
static void
sema_ifstmtexpr(SemaCtx *sctx, AstIf *ift, Location loc)
{
	sema_expr(sctx, ift->cond, loc);
	sema_node(sctx, ift->true_body);
	sema_node(sctx, ift->false_body);

	const isize branch_count = arrlen(ift->elifs);
	for (isize i = 0; i < branch_count; ++i) {
		AstElif *branch = &ift->elifs[i];
		sema_expr(sctx, branch->cond, loc);
		sema_node(sctx, branch->body);
	}
}
/* Analyzes a loop: its pre- and post-condition (when present) and then its body. The
 * body runs in its own context frame with the `SctxInsideLoop` flag set. */
static void
sema_loop(SemaCtx *sctx, AstLoop *loop, Location loc)
{
	Ast *conds[] = {loop->precond, loop->postcond};
	for (isize i = 0; i < countof(conds); ++i) {
		if (conds[i] != nil)
			sema_expr(sctx, conds[i], loc);
	}

	push_semactx(&sctx);
	sctx->flags |= SctxInsideLoop;
	sema_node(sctx, loop->body);
	pop_semactx(&sctx);
}
/* Analyzes every statement of the list `stmts`, in order, in the CURRENT scope (no
 * scope is opened here). Warns when a `return` or `break` is followed by more
 * statements, naming the statement that cuts the list short. */
static void
sema_stmts(SemaCtx *sctx, Vec(Ast *) stmts)
{
	const isize stmts_len = arrlen(stmts);
	for (isize i = 0; i < stmts_len; ++i) {
		Ast *stmt = stmts[i];
		sema_node(sctx, stmt);
		/* `sema_node` accepts nil nodes, so this has to as well */
		if (stmt == nil || !sema_is_stmt_terminal(stmt))
			continue;
		if (i + 1 != stmts_len) {
			const char *keyword = stmt->type == AST_BREAK ? "break" : "return";
			sema_warning(sctx, &stmts[i + 1]->loc, "dead code after '%s'", keyword);
		}
	}
}
/* Analyzes a statement block in a scope of its own and then warns about the bindings
 * of that scope that were never used. */
static void
sema_stmt_block(SemaCtx *sctx, Vec(Ast *) stmts)
{
	enter_scope(sctx);
	sema_stmts(sctx, stmts);
	/* check for unused bindings declared in this scope; this must happen BEFORE
	 * leaving it, afterwards the current scope is the enclosing one */
	sema_check_unused_vars(sctx);
	exit_scope(sctx);
}
/* Dispatches the analysis of one AST node to the function handling its type. A nil
 * node is silently accepted. Node types that can only occur nested inside other nodes
 * must never reach this function. */
static void
sema_node(SemaCtx *sctx, Ast *node)
{
	if (node == nil)
		return;

	const Location loc = node->loc;
	switch (node->type) {
	/* control flow */
	case AST_IF:
		sema_ifstmtexpr(sctx, &node->ifse, loc);
		break;
	case AST_LOOP:
		sema_loop(sctx, &node->loop, loc);
		break;
	case AST_RETURN:
		sema_return(sctx, node->ret, loc);
		break;
	case AST_BREAK:
		sema_break(sctx, nil, loc);
		break;
	/* blocks, procs and bindings */
	case AST_STMTS:
		sema_stmt_block(sctx, node->stmts);
		break;
	case AST_PROCDEF:
		sema_procdef(sctx, &node->proc, loc);
		break;
	case AST_PROCCALL:
		sema_proccall(sctx, &node->call, loc);
		break;
	case AST_VARDECL:
		sema_var_decl(sctx, &node->var, loc);
		break;
	case AST_VARASSIGN:
		sema_var_assign(sctx, &node->varassgn, loc);
		break;
	case AST_DISCARD:
		sema_discard(sctx, node->discard.expr, loc);
		break;
	case AST_ATTRIBUTE:
		sema_attribute(sctx, &node->attribute);
		break;
	/* bare expressions */
	case AST_BINEXPR:
	case AST_UNARY:
	case AST_NUMBER:
	case AST_STRLIT:
	case AST_IDENT:
		sema_expr(sctx, node, loc);
		break;
	/* never valid as a standalone node */
	case AST_INVALID:
	case AST_EXPRS:
	case AST_PROCCALL_ARGS:
		unreachable();
	}
}
/* Populates the current scope of `sctx` with the builtin symbols: the basic types, the
 * `+`, `-` and `==` operators (declared as procs over `u64`) and the external `puts`
 * proc. Also caches the `u64` and `void` types in `sctx->builtintypes`. */
static void
sema_make_builtin_types(SemaCtx *sctx)
{
/* A symbol together with the name it is registered under */
typedef struct {
const char *name;
Symbol sym;
} NameSym;
DataType *void_type = make_data_type(DtkVoid, 0, true, false);
DataType *str_type = make_data_type(DtkStruct, 0, false, false);
/* `puts`: takes one `string`, returns `void`, external linkage */
DataType *puts_proto = make_data_type(DtkProc, 0, false, false);
puts_proto->proc.rettype = void_type;
puts_proto->proc.argtypes = make_proc_args((DataType *[]){str_type}, 1);
puts_proto->proc.extern_lnk = true;
/* NOTE: the order matters, entries 1 (`u64`) and 5 (`bool`) are picked by index below */
NameSym builtin_basic_types[] = {
{"void", {.kind = SymType, .dtype = void_type}},
{"u64", {.kind = SymType, .dtype = make_data_type(DtkBasic, 8, true, false)}},
{"i64", {.kind = SymType, .dtype = make_data_type(DtkBasic, 8, true, true)}},
{"cint", {.kind = SymType, .dtype = make_data_type(DtkBasic, sizeof(int), true, true)}},
{"string", {.kind = SymType, .dtype = str_type}},
{"bool", {.kind = SymType, .dtype = make_data_type(DtkBool, 1, true, false)}},
};
DataType *u64_dt = builtin_basic_types[1].sym.dtype;
DataType *bool_dt = builtin_basic_types[5].sym.dtype;
/* Operators are ordinary builtin procs named after the operator */
NameSym builtin_procs[] = {
{
"+",
{
.kind = SymProc,
.dtype = make_proc_type(
true,
u64_dt,
make_proc_args((DataType *[]){u64_dt, u64_dt}, 2)
)
}
},
{
"-",
{
.kind = SymProc,
.dtype = make_proc_type(
true,
u64_dt,
make_proc_args((DataType *[]){u64_dt, u64_dt}, 2)
)
}
},
{
"==",
{
.kind = SymProc,
.dtype = make_proc_type(
true,
bool_dt,
make_proc_args((DataType *[]){u64_dt, u64_dt}, 2)
)
}
},
};
for (isize i = 0; i < countof(builtin_basic_types); ++i) {
const char *name = builtin_basic_types[i].name;
Symbol sym = builtin_basic_types[i].sym;
sym_insert(sctx->current_scope->symbols, name, sym);
}
for (isize i = 0; i < countof(builtin_procs); ++i) {
sym_insert(sctx->current_scope->symbols,
builtin_procs[i].name, builtin_procs[i].sym);
}
sctx->builtintypes.tyu64 = builtin_basic_types[1].sym.dtype;
sctx->builtintypes.void_t = void_type;
Symbol puts_sym = {.kind = SymProc, .dtype = puts_proto};
sym_insert(sctx->current_scope->symbols, "puts", puts_sym);
}
/* Creates the top-level analysis context for the compiler state `cm`: a bottom context
 * frame whose only scope holds the builtin symbols. The context starts in the `ok`
 * state. Release it with `sema_destroy`. */
SemaCtx *
sema_new(Compiler *cm)
{
	SemaCtx *root = make_semactx(cm, nil);
	Scope *global_scope = make_scope(nil);
	root->current_scope = global_scope;
	sema_make_builtin_types(root);
	root->top_scope = root->current_scope;
	root->ok = true;
	return root;
}
/* Releases a context created by `sema_new`.
 * NOTE(review): only the context frame itself is freed here; the scopes, symbol tables
 * and data types allocated during the analysis are not released by this function. */
void
sema_destroy(SemaCtx *sctx)
{
free(sctx);
}
/* Runs the semantic analysis over the whole program, which must be a statement list.
 * After the top-level statements were analyzed, a missing `main` is an error (unless
 * compiling only) and every private, non-builtin, non-external proc of the current
 * scope that was never called is reported with a warning.
 * The outcome is left in `sctx->ok`. */
void
sema(SemaCtx *sctx, Ast *program)
{
	/* Analyze toplevel */
	/* XXX: DRY it */
	compiler_assert(sctx->cm, program->type == AST_STMTS);
	for (isize i = 0; i < arrlen(program->stmts); ++i)
		sema_node(sctx, program->stmts[i]);

	const bool needs_main = !sctx->cm->opts.compile_only;
	if (needs_main && !sctx->main_defined)
		sema_error(sctx, nil, "missing 'main' entrypoint proc");

	/* check unused local procedures */
	const SymbolEntry *entries = sctx->current_scope->symbols;
	for (isize i = 0; i < shlen(entries); ++i) {
		const Symbol fsym = entries[i].value;
		if (fsym.kind != SymProc)
			continue;
		/* builtin, public and external procs are allowed to go uncalled */
		if (fsym.dtype->builtin || fsym.dtype->proc.public || fsym.dtype->proc.extern_lnk)
			continue;
		if (fsym.used)
			continue;
		sema_warning(
			sctx, &fsym.loc,
			"defined proc '%s' is never called in this module", entries[i].key
		);
	}
}

32
compiler/sema.h Normal file
View file

@ -0,0 +1,32 @@
#ifndef _sema_h_
#define _sema_h_
#include "ast.h"
#include "state.h"
/* `Scope` is opaque outside of the semantic analyzer. */
typedef struct Scope Scope;
typedef struct SemaCtx SemaCtx;
/* One frame of the semantic analysis context. Frames form a stack through `prev`. */
struct SemaCtx
{
SemaCtx *prev; /* Previous frame, nil for the bottom one */
Scope *current_scope; /* Innermost scope, where new symbols are inserted */
Scope *top_scope; /* Outermost scope, holds the builtin symbols */
Compiler *cm;
u64 flags; /* Bit field storing context flags */
/* Builtin types that are looked up often */
struct {
DataType *tyu64;
DataType *void_t;
} builtintypes;
bool ok; /* true as long as no semantic error has been reported */
bool main_defined; /* set once a proc named `main` has been analyzed */
};
/* Creates the top-level analysis context, with the builtin symbols already declared. */
SemaCtx *
sema_new(Compiler *cm);
/* Releases a context created by `sema_new`. */
void
sema_destroy(SemaCtx *sctx);
/* Analyzes the whole `program`; `sctx->ok` tells afterwards whether it passed. */
void
sema(SemaCtx *sctx, Ast *program);
#endif

13
compiler/state.c Normal file
View file

@ -0,0 +1,13 @@
#include <stdio.h>
#include "pre.h"
#include "state.h"
/* Failure path of `compiler_assert`: prints the failed predicate `pred_s` and a bug
 * report request to stderr, flushes it, and stops the program through `debugtrap`.
 * `cm` is currently unused. */
void
compiler_assert_impl(Compiler *cm, const char *pred_s)
{
	(void)cm;
	FILE *out = stderr;
	fprintf(out, "Bug check fail: `%s`\n", pred_s);
	fputs("This is a compiler bug, please report! (run with -v for bug reporting instructions)\n\n", out);
	fflush(out);
	debugtrap();
}

30
compiler/state.h Normal file
View file

@ -0,0 +1,30 @@
#ifndef _state_h_
#define _state_h_
#include "pre.h"
#include "cgBackends.h"
/* Assert meant to catch compiler bugs. The difference with a normal assert is that
 * this one stays on release builds. Better to crash than to deal with some weird bug
 * seeping through codegen.
 *
 * Expands to a single statement, so it is safe inside unbraced if/else bodies; always
 * terminate the call with a semicolon. `pred` is evaluated exactly once.
 */
#define compiler_assert(cm, pred) \
	do { \
		if (!(pred)) \
			compiler_assert_impl((cm), #pred); \
	} while (0)
/* Global state of one compiler run. */
typedef struct {
/* Settings taken from the command line */
struct {
bool color; /* colored diagnostics */
bool compile_only; /* -c: don't link */
Str exe_out; /* output file name (-o), derived from the source name when not given */
Str release_mode; /* argument of -R, stored as given */
Vec(Str) defines; /* arguments of every -d, in order */
enum CodegenBackends backend; /* code generation backend (-b) */
isize max_errors; /* argument of -E */
} opts;
Str current_filename; /* name of the source being compiled */
isize error_count; /* presumably the number of errors reported so far -- confirm in messages.c */
} Compiler;
/* Reports the failed predicate `pred_s` as a compiler bug and traps. Called by
 * `compiler_assert`, not meant to be called directly. */
void
compiler_assert_impl(Compiler *cm, const char *pred_s);
#endif

10
compiler/symbol.c Normal file
View file

@ -0,0 +1,10 @@
#include "symbol.h"
/* Name of every symbol kind as shown in diagnostics, indexed by `enum SymbolKind`. */
const char *SymbolKindStr[] = {
[SymInvalid] = "",
[SymLet] = "let",
[SymVar] = "var",
[SymConst] = "const",
[SymProc] = "proc",
[SymType] = "type definition",
};

17
compiler/symbol.h Normal file
View file

@ -0,0 +1,17 @@
#ifndef _symbol_h_
#define _symbol_h_
/* True when the symbol kind `sk` is one of `SymLet`, `SymVar` or `SymConst`. This
 * relies on those three being consecutive in `enum SymbolKind`. `sk` is evaluated
 * twice. */
#define symbol_is_var_binding(sk) ((sk) >= SymLet && (sk) <= SymConst)
/* What a name in the symbol table stands for. */
enum SymbolKind
{
SymInvalid, /* no symbol; also the zero value */
SymLet,
SymVar,
SymConst,
SymProc,
SymType, /* a data type that is */
};
/* Printable name of each kind, indexed by the enumerators above. */
extern const char *SymbolKindStr[];
#endif