commit bba597f7bfa7550feafbf5f34e7a63aada24d5e7 Author: tocariimaa Date: Sun Jan 12 18:20:42 2025 -0300 Initial commit diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..46d2175 --- /dev/null +++ b/Makefile @@ -0,0 +1,38 @@ +# This makefile should work for both GNU and BSD Make I think... + +SRCDIR = ./compiler +COMPILER_SRCS != find $(SRCDIR) -type f -name '*.c' +COMPILER_OBJS := $(COMPILER_SRCS:.c=.o) +COMPILER_DEPS := $(COMPILER_OBJS:.o=.d) + +ASAN = -fsanitize=address,undefined +CFLAGS := -Wall -Wextra -Wstrict-prototypes -Wold-style-definition -Wvla -Wwrite-strings \ + -Wnull-dereference -pipe -O0 -ggdb3 -std=c11 $(ASAN) +LDFLAGS := $(ASAN) + +all: rutilec ast2dot + +rutilec: $(COMPILER_OBJS) + $(CC) $(LDFLAGS) $^ -o $@$(EXE) + +ast2dot: tools/ast2dot.c $(COMPILER_OBJS) + $(CC) $(CFLAGS) $(LDFLAGS) $(COMPILER_OBJS) $< -o $@$(EXE) + +clean: + rm -f $(COMPILER_OBJS) $(COMPILER_DEPS) ./rutilec$(EXE) ./ast2dot$(EXE) + +options: + @echo "Build options:" + @echo "CC = $(CC)" + @echo "CFLAGS = $(CFLAGS)" + @echo "LDFLAGS = $(LDFLAGS)" + @echo "ASan flags = $(ASAN)" + @echo "SRCS = $(COMPILER_SRCS)" + @echo "OBJS = $(COMPILER_OBJS)" + +.PHONY: all clean options + +-include $(COMPILER_DEPS) + +%.o: %.c Makefile + $(CC) $(CFLAGS) -MMD -MP -c -o $@ $< diff --git a/README.md b/README.md new file mode 100644 index 0000000..604deba --- /dev/null +++ b/README.md @@ -0,0 +1,39 @@ +# Rutile +Yet another compiled programming language. +Very unstable and in early development. + +``` +proc main*(): cint + puts("Hello, world!") + return 0 +end +``` + +## Building +### Build time dependencies +- C11 C compiler (tested on GCC, Clang and TCC) +- Libc +- BSD or GNU Make +- POSIX shell +- `find` command + +### Build time dependencies (single header libraries) +These are contained in `compiler/libs`. 
+- [stb_ds.h](https://github.com/nothings/stb) +- [optparse](https://github.com/skeeto/optparse) + +### Procedure +Note that the default `CFLAGS` and `LDFLAGS` are meant for development builds. +```sh +# debug build (uses default flags): +make -j$(nproc) +# for release: +make -j$(nproc) CFLAGS='-O2 -march=native -DNDEBUG' LDFLAGS='' ASAN='' +``` + +## License +GNU GPLv3 for the compiler and BSD 3-Clause for the standard library. + +## Acknowledgements +- Christopher Wellons, for his public domain libraries. +- Sean Barrett's `stb_ds.h`. diff --git a/compiler/ast.h b/compiler/ast.h new file mode 100644 index 0000000..cdf8d3a --- /dev/null +++ b/compiler/ast.h @@ -0,0 +1,151 @@ +#ifndef _ast_h_ +#define _ast_h_ + +#include "pre.h" +#include "datatype.h" +#include "symbol.h" +#include "location.h" + +#define ast_node_is_atom(nk) \ + (nk == AST_IDENT || nk == AST_NUMBER || nk == AST_STRLIT || nk == AST_PROCCALL) +#define ast_node_is_unary(nk) \ + (nk == AST_UNARY || ast_node_is_atom(nk)) +#define ast_node_is_expr(nk) \ + (nk == AST_BINEXPR || ast_node_is_unary(nk)) + +enum AstType +{ + AST_INVALID, /* For use as a placeholder until the actual type is decided */ + AST_NUMBER, /* number */ + AST_IDENT, /* ident */ + AST_STRLIT, /* strlit */ + AST_PROCDEF, /* proc */ + AST_PROCCALL, /* call */ + AST_PROCCALL_ARGS, /* */ + AST_VARDECL, /* var */ + AST_VARASSIGN, /* varassgn */ + AST_IF, /* ifse */ + AST_RETURN, /* ret */ + AST_BREAK, + AST_LOOP, /* loop */ + AST_STMTS, /* stmts */ + AST_EXPRS, /* exprs */ + AST_BINEXPR, /* bin */ + AST_UNARY, /* unary */ + AST_ATTRIBUTE, /* attribute */ + AST_DISCARD, +}; + +typedef struct Ast Ast; + +typedef struct { + Str op; + Ast *left, *right; + DataType *type; /* filled in by sema */ +} AstBinop; + +typedef struct { + Str op; + Ast *atom; + DataType *type; /* filled in by sema */ +} AstUnary; + +typedef struct { + Str ident; + Str dtype; + /* Symbol kind for this parameter, `SymVar` would represent a mutable + * parameter and 
`SymLet` an immutable one. */ + enum SymbolKind kind; + Location loc, dtype_loc; +} AstIdentTypePair; + +typedef struct { + Str name; + bool ispublic; + Ast *body; + Vec(AstIdentTypePair) args; + Ast *rettype; + + DataType *type; +} AstProc; + +typedef struct { + Str name; + Ast *args; +} AstProcCall; + +typedef struct { + Str name; + /* Data type, nil if no type was explicitly stated, meaning that + * type deduction must be made from the expression, also implying that + * if this field is nil, `expr` MUSTN'T be nil. */ + Ast *datatype; + Ast *expr; /* if the declaration assigns a value */ + enum SymbolKind kind; /* whether this is a let, var or const... */ + DataType *type; /* filled in by sema */ +} AstVarDecl; + +typedef struct { + Str name; + Ast *expr; +} AstVarAssign; + +typedef struct { + u64 n; + DataType *type; /* filled in by the sema */ +} AstNumber; + +typedef struct { + Ast *cond; + Ast *body; +} AstElif; + +typedef struct { + Ast *cond; + Ast *true_body; + Ast *false_body; + Vec(AstElif) elifs; +} AstIf; + +/* Abstract representation of a loop, providing a pre and post condition. + * `while` loops are modelled as a loop with a precondition only. + * For infinite loops both `precond` and `postcond` are nil. 
*/ +typedef struct { + Ast *precond, *postcond, *body; +} AstLoop; + +typedef struct { + /* Attributes for now can only be identifiers */ + Vec(Str) attrs; + Ast *node; /* The decorated node */ +} AstAttribute; + +typedef struct { + Ast *expr; +} AstDiscard; + +struct Ast { + enum AstType type; + union { + AstBinop bin; /* binary expression */ + AstUnary unary; /* unary operator */ + AstNumber number; /* number (this is an atom) */ + Str ident; /* identifier (this is an atom too) */ + AstProc proc; /* procedure definition */ + AstProcCall call; /* procedure call */ + AstVarDecl var; /* variable declaration */ + AstVarAssign varassgn; + Ast *ret; /* return statement, this points to its expression (if any) */ + AstIf ifse; /* if statement/expression */ + AstLoop loop; + Vec(Ast *) stmts; + Vec(Ast *) exprs; + Str strlit; /* String literal */ + AstAttribute attribute; + AstDiscard discard; + }; + Location loc; /* location in the source code of this node */ +}; +_Static_assert(sizeof(Ast) <= 512, "AST node got too bloated"); + +#endif diff --git a/compiler/cgBackends.h b/compiler/cgBackends.h new file mode 100644 index 0000000..d7206af --- /dev/null +++ b/compiler/cgBackends.h @@ -0,0 +1,10 @@ +#ifndef _cgbackends_ +#define _cgbackends_ + +enum CodegenBackends +{ + CgBackendC, + CgBackendLibGccJit, /* libgccjit backend */ +}; + +#endif diff --git a/compiler/cgC.c b/compiler/cgC.c new file mode 100644 index 0000000..aac892c --- /dev/null +++ b/compiler/cgC.c @@ -0,0 +1,382 @@ +#include + +#include "pre.h" +#include "codegen.h" +#include "cgC.h" +#include "ast.h" +#include "libs/stb_ds.h" + +#define EMIT_SEMICOLON_NL(out) fputs(";\n", out) +#define EMIT_RB_NL(out) fputs("}\n", out) + +static void +emit_expr(CodegenC *cgc, const Ast *expr); +static void +emit_expr_list(CodegenC *cgc, const Vec(Ast *) exprs, bool sep); +static void +emit_node(CodegenC *cgc, const Ast *node); + +static void +indent(CodegenC *cgc) +{ +#define INDENT(out) fputc('\t', out) + switch 
(cgc->indent) { + case 8: INDENT(cgc->cgctx->out); /* fallthrough */ + case 7: INDENT(cgc->cgctx->out); /* fallthrough */ + case 6: INDENT(cgc->cgctx->out); /* fallthrough */ + case 5: INDENT(cgc->cgctx->out); /* fallthrough */ + case 4: INDENT(cgc->cgctx->out); /* fallthrough */ + case 3: INDENT(cgc->cgctx->out); /* fallthrough */ + case 2: INDENT(cgc->cgctx->out); /* fallthrough */ + case 1: INDENT(cgc->cgctx->out); /* fallthrough */ + case 0: break; + default: + for (isize left = 0; left < cgc->indent; ++left) + INDENT(cgc->cgctx->out); + } +#undef INDENT +} + +/* Interns a string literal into the string table, returning its ID */ +static i64 +intern_strlit(CodegenC *cgc, const Str *str) +{ + const i64 strno = shget(cgc->cgctx->strings, str->s); + if (strno != -1) /* string already exists, return its index number */ + return strno; + + shput(cgc->cgctx->strings, str->s, cgc->cgctx->strlit_no); + return cgc->cgctx->strlit_no++; +} + +/* Emits `comment` wrapped in a C block comment, followed by a newline + * when `nl_after` is set. */ +static void +emit_comment(CodegenC *cgc, Str comment, bool nl_after) +{ + fprintf(cgc->cgctx->out, "/* %s */", comment.s); + /* printing '\0' through %c would put a NUL byte into the generated C */ + if (nl_after) + fputc('\n', cgc->cgctx->out); +} + +static void +emit_include(CodegenC *cgc, Str path, bool local) +{ + fprintf( + cgc->cgctx->out, "#include %c%s%c\n", + local ? '"' : '<', path.s, local ? 
'"' : '>' + ); +} + +static const char * +basic_datatype_to_c(CodegenC *cgc, const DataType *dt) +{ + switch (dt->kind) { + case DtkBasic: + switch (dt->size) { + case 0: return "void"; + case 1: return "uint8_t"; + case 2: return "uint16_t"; + case 4: return "uint32_t"; + case 8: return "uint64_t"; + } + break; + case DtkVoid: + return "void"; + break; + } + return nil; +} + +static void +emit_datatype(CodegenC *cgc, const DataType *dt) +{ + switch (dt->kind) { + case DtkBasic: + case DtkVoid: + fputs(basic_datatype_to_c(cgc, dt), cgc->cgctx->out); + break; + case DtkStruct: + fprintf(cgc->cgctx->out, "struct %s", dt->name.s); + break; + } +} + +static void +emit_c_attribute(CodegenC *cgc, Str attr) +{ + fprintf(cgc->cgctx->out, "__attribute((%s))", attr.s); +} + +/* Emits the C declaration of the struct type `dt`: the tag, one line per + * field type, and a `packed` attribute when the type requests it. */ +static void +emit_structdecl(CodegenC *cgc, const DataType *dt) +{ + /* fprintf, not fputs: the struct tag has to be substituted for %s */ + fprintf(cgc->cgctx->out, "struct %s {\n", dt->name.s); + for (isize i = 0; i < arrlen(dt->compound.fields); ++i) { + emit_datatype(cgc, dt->compound.fields[i]); + EMIT_SEMICOLON_NL(cgc->cgctx->out); + } + fputc('}', cgc->cgctx->out); + if (dt->compound.packed) + emit_c_attribute(cgc, Sl("packed")); + EMIT_SEMICOLON_NL(cgc->cgctx->out); +} + +static void +emit_vardecl(CodegenC *cgc, const AstVarDecl *decl) +{ + if (decl->kind == SymConst) + fputs("const ", cgc->cgctx->out); + + emit_datatype(cgc, decl->type); + fprintf(cgc->cgctx->out, " %s", decl->name.s); + if (decl->expr != nil) { + fputc('=', cgc->cgctx->out); + emit_expr(cgc, decl->expr); + } + EMIT_SEMICOLON_NL(cgc->cgctx->out); +} + +static void +emit_varassign(CodegenC *cgc, const AstVarAssign *assign) +{ + fprintf(cgc->cgctx->out, "%s = ", assign->name.s); + emit_expr(cgc, assign->expr); + EMIT_SEMICOLON_NL(cgc->cgctx->out); +} + +static void +emit_proc(CodegenC *cgc, const AstProc *proc) +{ + if (!proc->ispublic) + fputs("static ", cgc->cgctx->out); + + emit_datatype(cgc, proc->type->proc.rettype); + fprintf(cgc->cgctx->out, " %s(", proc->name.s); + + const isize arglen = arrlen(proc->args); 
+ if (arglen == 0) + fputs("void", cgc->cgctx->out); + for (isize i = 0; i < arglen; ++i) { + AstIdentTypePair arg = proc->args[i]; + //emit_datatype(cgc, arg.dtype); + fputs("uint64_t ", cgc->cgctx->out); + fputs((char *)arg.ident.s, cgc->cgctx->out); + if (i + 1 < arglen) + fputc(',', cgc->cgctx->out); + } + fputs(")\n{\n", cgc->cgctx->out); + if (proc->body != nil) + emit_node(cgc, proc->body); + EMIT_RB_NL(cgc->cgctx->out); +} + +static void +emit_proccall(CodegenC *cgc, const AstProcCall *call) +{ + fprintf(cgc->cgctx->out, "%s(", call->name.s); + if (call->args != nil) + emit_expr_list(cgc, (const Vec(Ast *))call->args->stmts, true); + fputs(")", cgc->cgctx->out); +} + +static void +emit_if(CodegenC *cgc, const AstIf *ift) +{ + fputs("if (", cgc->cgctx->out); + emit_expr(cgc, ift->cond); + fputs("){\n", cgc->cgctx->out); + emit_node(cgc, ift->true_body); + fputc('}', cgc->cgctx->out); + if (ift->false_body != nil) { + fputs("else", cgc->cgctx->out); + fputs("{\n", cgc->cgctx->out); + emit_node(cgc, ift->false_body); + fputc('}', cgc->cgctx->out); + } + fputc('\n', cgc->cgctx->out); +} + +static void +emit_whileLoop(CodegenC *cgc, const AstLoop *whl) +{ + fputs("while (", cgc->cgctx->out); + emit_expr(cgc, whl->precond); + fputs("){\n", cgc->cgctx->out); + emit_node(cgc, whl->body); + fputs("}\n", cgc->cgctx->out); +} + +static void +emit_loop(CodegenC *cgc, const AstLoop *loop) +{ + if (loop->precond != nil) + emit_whileLoop(cgc, loop); + else if (loop->postcond != nil) + unreachable(); +} + +static void +emit_return(CodegenC *cgc, const Ast *ret_expr) +{ + fputs("return ", cgc->cgctx->out); + emit_expr(cgc, ret_expr); + EMIT_SEMICOLON_NL(cgc->cgctx->out); +} + +static void +emit_break(CodegenC *cgc, const Ast *unused) +{ + (void)unused; + fputs("break;\n", cgc->cgctx->out); +} + +static void +emit_discard(CodegenC *cgc, const Ast *expr) +{ + emit_node(cgc, expr); +} + +static void +emit_expr_number(CodegenC *cgc, const AstNumber *num) +{ + 
fprintf(cgc->cgctx->out, "%llu", (unsigned long long)num->n); /* u64 is not `unsigned long` on every target, so %lu would mismatch */ +} + +static void +emit_expr_strlit(CodegenC *cgc, const Str *strlit) +{ + fprintf(cgc->cgctx->out, "\"%s\"", strlit->s); +} + +static void +emit_expr_ident(CodegenC *cgc, const Str *ident) +{ + fputs((char *)ident->s, cgc->cgctx->out); +} + +static void +emit_expr_unary(CodegenC *cgc, const AstUnary *unary) +{ + emit_expr(cgc, unary->atom); +} + +static void +emit_expr_binop(CodegenC *cgc, const AstBinop *expr) +{ + /* guard binops with parenthesis, even if they are redundant */ + fputc('(', cgc->cgctx->out); + emit_expr(cgc, expr->left); + /* NOTE(review): the operator is hard-coded to '+' and `expr->op` is + * ignored; presumably a placeholder, confirm how the parser fills `op` + * before emitting it here. */ + fputc('+', cgc->cgctx->out); + emit_expr(cgc, expr->right); + fputc(')', cgc->cgctx->out); +} + +static void +emit_expr(CodegenC *cgc, const Ast *expr) +{ + if (expr == nil) + return; + switch (expr->type) { + case AST_BINEXPR: + emit_expr_binop(cgc, &expr->bin); + break; + case AST_UNARY: + emit_expr_unary(cgc, &expr->unary); + break; + case AST_NUMBER: + emit_expr_number(cgc, &expr->number); + break; + case AST_STRLIT: + emit_expr_strlit(cgc, &expr->strlit); + break; + case AST_IDENT: + emit_expr_ident(cgc, &expr->ident); + break; + case AST_PROCCALL: + emit_proccall(cgc, &expr->call); + break; + default: + unreachable(); + } +} + +static void +emit_expr_list(CodegenC *cgc, const Vec(Ast *) exprs, bool sep) +{ + const isize exprs_len = arrlen(exprs); + for (isize i = 0; i < exprs_len; ++i) { + emit_expr(cgc, exprs[i]); + if (sep && i + 1 < exprs_len) /* no trailing separator */ + fputc(',', cgc->cgctx->out); + } +} + +static void +emit_stmt_list(CodegenC *cgc, Vec(Ast *) stmts) +{ + for (isize i = 0; i < arrlen(stmts); ++i) { + emit_node(cgc, stmts[i]); + } +} + +static void +emit_node(CodegenC *cgc, const Ast *node) +{ + switch (node->type) { + case AST_STMTS: + emit_stmt_list(cgc, node->stmts); + break; + case AST_PROCDEF: + emit_proc(cgc, &node->proc); + break; + case AST_PROCCALL: + emit_proccall(cgc, &node->call); + EMIT_SEMICOLON_NL(cgc->cgctx->out); + break; + case AST_IF: + 
emit_if(cgc, &node->ifse); + break; + case AST_LOOP: + emit_loop(cgc, &node->loop); + break; + case AST_RETURN: + emit_return(cgc, node->ret); + break; + case AST_BREAK: + emit_break(cgc, nil); + break; + case AST_DISCARD: + emit_discard(cgc, node->discard.expr); + break; + case AST_VARDECL: + emit_vardecl(cgc, &node->var); + break; + case AST_VARASSIGN: + emit_varassign(cgc, &node->varassgn); + break; + case AST_BINEXPR: + case AST_UNARY: + case AST_NUMBER: + case AST_STRLIT: + case AST_IDENT: + emit_expr(cgc, node); + break; + case AST_PROCCALL_ARGS: + case AST_EXPRS: + case AST_INVALID: + unreachable(); + } +} + +void +cgC(CodegenC *cgc, const Ast *program) +{ + cgc->cgctx->out = stdout; + + char note_buf[255] = {0}; + snprintf(note_buf, sizeof(note_buf), + "generated C IR from %s", cgc->cgctx->cctx->current_filename.s + ); + emit_comment(cgc, Str_from_c(note_buf), true); + + emit_include(cgc, Sl("stdint.h"), false); + fputc('\n', cgc->cgctx->out); + emit_node(cgc, program); +} diff --git a/compiler/cgC.h b/compiler/cgC.h new file mode 100644 index 0000000..9473734 --- /dev/null +++ b/compiler/cgC.h @@ -0,0 +1,15 @@ +#ifndef _cgC_h_ +#define _cgC_h_ + +#include "codegen.h" +#include "ast.h" + +typedef struct { + CodegenCtx *cgctx; + int indent; +} CodegenC; + +void +cgC(CodegenC *cgc, const Ast *program); + +#endif diff --git a/compiler/codegen.c b/compiler/codegen.c new file mode 100644 index 0000000..00606e3 --- /dev/null +++ b/compiler/codegen.c @@ -0,0 +1,101 @@ +#define _POSIX_C_SOURCE 200809L +#include +#include +#include + +#include "codegen.h" +#include "cgC.h" +#include "messages.h" +#include "libs/stb_ds.h" + +/* (Std)In --> process --> (Std)Out */ +void +spawn_with_iofp(const char *path, char *const *argv, + pid_t *pid, FILE **in, FILE **out) +{ + int irp[2], asmp[2]; + posix_spawn_file_actions_t fileacts; + posix_spawn_file_actions_init(&fileacts); + + if (in != nil) { + /* the "in" pipe */ + if (pipe(irp) < 0) + fatal(nil, nil, "could not open 
pipe"); + posix_spawn_file_actions_addclose(&fileacts, irp[1]); + posix_spawn_file_actions_adddup2(&fileacts, irp[0], STDIN_FILENO); + } + if (out != nil) { + /* the "out" pipe */ + if (pipe(asmp) < 0) + fatal(nil, nil, "could not open pipe"); + posix_spawn_file_actions_addclose(&fileacts, asmp[0]); + posix_spawn_file_actions_adddup2(&fileacts, asmp[1], STDOUT_FILENO); + } + + if (posix_spawn(pid, path, &fileacts, nil, argv, nil) != 0) + fatal(nil, nil, "posix_spawn failed"); + + posix_spawn_file_actions_destroy(&fileacts); + + if (in != nil) { + close(irp[0]); + if ((*in = fdopen(irp[1], "wb")) == nil) + fatal(nil, nil, "fdopen fail"); + } + if (out != nil) { + close(asmp[1]); + if ((*out = fdopen(asmp[0], "rb")) == nil) + fatal(nil, nil, "fdopen fail"); + } +} + +void +process_wait(pid_t pid) +{ + int pstat; + waitpid(pid, &pstat, 0); + if (!WIFEXITED(pstat)) + fatal(nil, nil, "qbe crashed"); + /* did not crash, read return status */ + int exitc; + if ((exitc = WEXITSTATUS(pstat)) != 0) + fatal(nil, nil, "qbe exited with non-zero status %d", exitc); +} + + +CodegenCtx * +codegen_new(Compiler *cm, enum CodegenBackends backend) +{ + CodegenCtx *ctx = calloc(1, sizeof(*ctx)); + ctx->ext_pid = ctx->ld_pid = -1; + ctx->backend = backend; + ctx->cctx = cm; + + sh_new_arena(ctx->strings); + shdefault(ctx->strings, -1); + return ctx; +} + +void +codegen_destroy(CodegenCtx *cgctx) +{ + if (cgctx->ext_pid != -1) + process_wait(cgctx->ext_pid); + if (cgctx->ld_pid != -1) + process_wait(cgctx->ld_pid); + shfree(cgctx->strings); + free(cgctx); +} + +void +codegen(CodegenCtx *cgctx, Ast *program) +{ + switch (cgctx->backend) { + case CgBackendC: + cgC(&(CodegenC){.cgctx = cgctx, .indent = 2}, program); + break; + case CgBackendLibGccJit: + fatal(nil, nil, "libgccjit backend not implemented yet"); + break; + } +} diff --git a/compiler/codegen.h b/compiler/codegen.h new file mode 100644 index 0000000..f7ae184 --- /dev/null +++ b/compiler/codegen.h @@ -0,0 +1,37 @@ +#ifndef 
_codegen_h_ +#define _codegen_h_ +#include /* FILE */ +#include /* for pid_t */ + +#include "pre.h" +#include "ast.h" +#include "state.h" +#include "cgBackends.h" + +typedef struct { + FILE *out; /* File where to output QBE IR */ + FILE *asm_out; + + i64 strlit_no; + i64 internal_label; + /* Hash map acting as a set, which contains all strings in a compilation + * unit. Strings get interned on this hash map to remove duplicates. + */ + HashMapStr(i64) *strings; + pid_t ext_pid, ld_pid; + enum CodegenBackends backend; + Compiler *cctx; +} CodegenCtx; + +void +spawn_with_iofp(const char *path, char *const *argv, pid_t *pid, FILE **in, FILE **out); +void +process_wait(pid_t pid); +CodegenCtx * +codegen_new(Compiler *cm, enum CodegenBackends backend); +void +codegen_destroy(CodegenCtx *cgctx); +void +codegen(CodegenCtx *cgctx, Ast *program); + +#endif diff --git a/compiler/datatype.h b/compiler/datatype.h new file mode 100644 index 0000000..94f369d --- /dev/null +++ b/compiler/datatype.h @@ -0,0 +1,53 @@ +#ifndef _datatype_h_ +#define _datatype_h_ +#include "pre.h" + +enum DataTypeKind +{ + DtkInvalid = 0, + DtkVoid, + DtkBasic, + DtkStruct, + DtkUnion, + DtkProc, + DtkArray, + DtkBool, +}; + +typedef struct DataType DataType; + +typedef struct { + bool packed; + Vec(DataType *) fields; +} DataTypeCompound; + +struct DataType +{ + enum DataTypeKind kind; + u16 size; /* size in bytes of the data type */ + bool builtin; /* if this type is defined in compilerland */ + bool sign; /* if the type is numerical and has a sign or not */ + Str name; + + union { + DataTypeCompound compound; /* Represents either a struct or union type */ + struct { + DataType *rettype; + Vec(DataType *) argtypes; + bool public; + bool extern_lnk; /* external linkage */ + bool c_varargs; /* C-style varargs (for FFI) */ + } proc; + struct { + DataType *base; + isize len; + } array; + }; +}; + +typedef struct { + bool ok; /* whether the type checking succeeded */ + Str msg; /* message describing the 
type error */ +} DataTypeCheck; + +#endif diff --git a/compiler/lex.c b/compiler/lex.c new file mode 100644 index 0000000..30fc41a --- /dev/null +++ b/compiler/lex.c @@ -0,0 +1,581 @@ +#include /* feof, ferror, fread, FILE, EOF */ +#include /* malloc calloc free */ +#include /* memset */ + +#include "lex.h" +#include "messages.h" +#include "pre.h" +#include "libs/stb_ds.h" + +#define LEX_BUFFER_SIZE 8192 +#define LEX_HALF_BUFFER_SIZE LEX_BUFFER_SIZE / 2 +#define LEX_BUFFER_SENTINEL '\0' + +#define MAX_IDENT_SIZE 1024u +#define STRING_LITERAL_BASE_SIZE 255 +#define STRING_LITERAL_MAX_SIZE 4096 + +#define at_buffer_end(ls) (*(ls)->fwd == '\0') +#define ascii_isident(c) (c == '_' || c == '?' || c == '!' || ascii_isalnum(c)) +#define ascii_isident_start(c) (c == '_' || ascii_isalpha(c)) + +#define lex_error(ls, ...) do { \ + error((ls)->cm, &(ls)->cur_loc, __VA_ARGS__); \ + } while(0) + +#define lex_fatal(ls, ...) do { \ + fatal((ls)->cm, &(ls)->cur_loc, __VA_ARGS__); \ + } while(0) + +typedef Optional(u8) MaybeChr; + +const char *TokenIdStr[T_TOKEN_COUNT] = { + [T_INVALID] = "(invalid token)", + [T_PLUS] = "+", + [T_MINUS] = "-", + [T_STAR] = "*", + [T_BAR] = "/", + [T_EXCLAMATION] = "!", + [T_LPAREN] = "(", + [T_RPAREN] = ")", + [T_COMMA] = ",", + [T_LESSTHAN] = "<", + [T_GREATTHAN] = ">", + [T_LOGAND] = "and", + [T_LOGOR] = "or", + [T_EQUAL] = "=", + [T_LOGICEQUAL] = "==", + [T_NOTEQUAL] = "!=", + [T_HASH] = "#", + [T_COLON] = ":", + [T_SEMICOLON] = ";", + [T_LBRACKET] = "[", + [T_RBRACKET] = "]", + [T_LBRACE] = "{", + [T_RBRACE] = "}", + [T_IDENT] = "(identifier)", + [T_STRING] = "(string literal)", + [T_NUMBER] = "(number)", + [T_DECNUMBER] = "(decimal number)", + [T_CONST] = "const", + [T_DISCARD] = "discard", + [T_ELIF] = "elif", + [T_ELSE] = "else", + [T_END] = "end", + [T_IF] = "if", + [T_LET] = "let", + [T_PROC] = "proc", + [T_RETURN] = "return", + [T_VAR] = "var", + [T_WHILE] = "while", + [T_STRUCT] = "struct", + [T_USE] = "use", + [T_BREAK] = "break", + 
[T_NEXT] = "next", + [T_EOF] = "(EOF)", + [T_ERROR] = "(error)", +}; + +/* Non retarded ASCII character class comparison */ +static bool +ascii_isdigit(u32 c) +{ + return c >= '0' && c <= '9'; +} + +static bool +ascii_isalpha(u32 c) +{ + return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'); +} + +static bool +ascii_isspace(u32 c) +{ + return c == ' ' || (c >= '\t' && c <= '\r'); +} + +static bool +ascii_isalnum(u32 c) +{ + return ascii_isalpha(c) || ascii_isdigit(c); +} + +static void +update_line_count(LexState *ls, u8 chr) +{ + switch (chr) { + case '\n': + ls->cur_loc.column = 1; + ++ls->cur_loc.line; + break; + case '\t': /* fallthrough */ + default: + ++ls->cur_loc.column; + } +} + +static u8 +peek(LexState *ls) +{ + return *ls->fwd; +} + +static void +backup(LexState *ls, int n) +{ + ls->fwd -= n; + if (*ls->fwd == '\n') + --ls->cur_loc.line; + /* not quite right if fwd is \n... */ + --ls->cur_loc.column; +} + +static bool +read_buf(LexState *ls, u8 *buf, isize n, isize *ar) +{ + if (feof(ls->input_fp)) + return false; + const isize rb = fread(buf, sizeof(*buf), n, ls->input_fp); + if (ferror(ls->input_fp)) { + fatal(ls->cm, nil, "could not read input file\n"); + } + *ar = rb; + buf[rb] = LEX_BUFFER_SENTINEL; + return true; +} + +static bool +reload_buffers(LexState *ls) +{ + //if ((ls->fwd != ls->buf + ls->buflen) || (ls->fwd != ls->buf + ls->buflen2)) + // lex_fatal(ls, "invalid nil byte in middle of source file"); + + const u8 *end_of_buf1 = ls->buf + ls->buflen; + const u8 *end_of_buf2 = ls->buf + LEX_HALF_BUFFER_SIZE + ls->buflen2; + + if (ls->fwd == end_of_buf1) { /* end of first buffer */ + u8 *buf2 = ls->buf + LEX_HALF_BUFFER_SIZE; + if (!read_buf(ls, buf2, LEX_HALF_BUFFER_SIZE, &ls->buflen2)) + return false; /* reached EOF, no more data */ + ls->fwd = buf2; + } else if (ls->fwd == end_of_buf2) { /* end of second buffer */ + u8 *buf1 = ls->buf; + if (!read_buf(ls, buf1, LEX_HALF_BUFFER_SIZE, &ls->buflen)) + return false; /* reached EOF, no more 
data */ + ls->fwd = buf1; + } + /* reset pointers back to the beginning of the buffer */ + ls->lbegin = ls->fwd; + return true; +} + +/* Consumes and returns the next input character, refilling the buffers + * when the sentinel is hit. Returns None when no more input is available. */ +static MaybeChr +read_chr(LexState *ls) +{ + u8 chr = peek(ls); + if (chr == LEX_BUFFER_SENTINEL) { /* maybe end of buffer */ + if (!reload_buffers(ls)) + return None(MaybeChr); + /* fwd may now point into a freshly filled buffer: account for the + * character that is actually returned, not for the sentinel */ + chr = peek(ls); + } + update_line_count(ls, chr); + return Some(MaybeChr, *ls->fwd++); +} + +static MaybeChr +skip_whitespace(LexState *ls) +{ + /* skip any whitespace + * [ abc = 2*9 - 1 ] + * ^-fwd, lbegin + * [ abc = 2*9 - 1 ] + * lbegin-^^-fwd + * */ + MaybeChr c; + for (;;) { + c = read_chr(ls); + if (!c.ok) { + return None(MaybeChr); + } + if (!ascii_isspace(c.val)) + break; + ++ls->lbegin; + } + return c; +} + +static LexToken +make_error(void) +{ + return (LexToken){ .id = T_ERROR }; +} + +static u8 * +intern_identifier(LexState *ls, u8 *ident) +{ + IdentsBucket *entry; + if ((entry = shgetp_null(ls->idents, ident)) == nil) { + shput(ls->idents, ident, 0); + return (u8 *)shgets(ls->idents, ident).key; + } + return (u8 *)entry->key; +} + +/* + * *──┬(ident)┬──* + * ╰───<───╯ + */ +static LexToken +identifier(LexState *ls) +{ + /* this gets copied to the hash table arena, no problem */ + u8 ident_buf[MAX_IDENT_SIZE]; + usize i = 0; + + MaybeChr chr = { *ls->lbegin, true }; + while (chr.ok && ascii_isident(chr.val)) { + if (i + 1 == MAX_IDENT_SIZE) { + lex_error(ls, "identifier is too long (max: %u)\n", MAX_IDENT_SIZE); + return make_error(); + } + ident_buf[i++] = chr.val; + chr = read_chr(ls); + } + ident_buf[i] = '\0'; + /* ate 1 extra character, give it back */ + if (chr.ok) + backup(ls, 1); + + return (LexToken) { + .id = T_IDENT, + .ident = {intern_identifier(ls, ident_buf), i}, + .len = i, + }; +} + +static LexToken +string_literal(LexState *ls) +{ + isize str_buf_len = STRING_LITERAL_BASE_SIZE; + u8 *str_buf = malloc(str_buf_len); + isize i = 0; + + /* skip past " */ + MaybeChr chr = read_chr(ls); + while (chr.val != '"') { + if (i + 1 == 
STRING_LITERAL_MAX_SIZE) { + lex_error(ls, "string literal length exceeds maximum of %d bytes", STRING_LITERAL_MAX_SIZE); + goto err; + } + /* keep one spare byte for the terminating NUL written after the loop */ + if (i + 1 >= str_buf_len) { + str_buf = realloc(str_buf, str_buf_len *= 2); + } + str_buf[i++] = chr.val; + chr = read_chr(ls); + if (!chr.ok || chr.val == '\n') { + lex_error(ls, "unterminated string literal"); + goto err; + } + } + if (i > 0) { + str_buf[i] = '\0'; + } else { /* empty literal */ + free(str_buf); /* we wasted our time */ + str_buf = nil; + } + + return (LexToken) { + .id = T_STRING, + .str = {str_buf, i}, + .len = i, + }; +err: + free(str_buf); /* the partial literal is not handed to the caller */ + return make_error(); +} + +/* Identifies a numeric literal that may have a prefix: + * + * ('0')─┬──────────────────────┬─* + * ├('b')╭──┬(digit)┬─────╯ + * ├('o')┤ ╰───<───╯ + * ╰('x')╯ + * Indirectly based on a BSD (?) implementation. + */ +static LexToken +number_literal(LexState *ls) +{ + LexToken t = { .id = T_NUMBER }; + u64 number = 0; + u8 base = 10; + + MaybeChr chr = { *ls->lbegin, true }; + + if (chr.val == '0') { + chr = read_chr(ls); /* read the character following the 0 */ + if (!chr.ok) { /* EOF edge case */ + return t; /* 0 */ + } + switch (chr.val) { + case 'b': + base = 2; + break; + case 'o': + base = 8; + break; + case 'x': + base = 16; + break; + default: + if (ascii_isdigit(chr.val)) { + lex_error(ls, "use '0o' for an octal literal"); + return make_error(); + } + //lex_error(ls, "unknown numeric prefix '0%c'", chr.val); + /* start of another token: give the character back so the + * next scan sees it */ + backup(ls, 1); + return t; /* 0 */ + } + chr = read_chr(ls); + if (!chr.ok) { + lex_error(ls, "expected a digit after the base prefix"); + return make_error(); + } + } + + const u64 mmax = U64_MAX / base; + static const u8 digits[] = "0123456789abcdef"; + + while (chr.ok) { + u8 *digitp = memchr(digits, chr.val, lengthof(digits)); + if (digitp == nil) + break; + + u8 digit = digitp - digits; + if (digit >= base) { + lex_error(ls, "invalid literal"); + return make_error(); + } + if (number > mmax) + goto overflow; + number *= base; + /* overflow for 
adding the digit */ + if (U64_MAX - digit < number) + goto overflow; + + number += digit; + chr = read_chr(ls); + } + if (chr.ok) + backup(ls, 1); + + t.inumber = number; + return t; +overflow: + lex_error(ls, "integer literal is too big (2^64 max)"); + return make_error(); +} + +static LexToken +keyword(LexToken *t) +{ +#define kwcmp(ident, kw, tid) \ + {if (Str_equal(ident, kw)) return (LexToken){ .id = tid, .len = kw.len };} + + Str ident = t->ident; + --ident.len; + switch (*ident.s++) { + case 'a': + kwcmp(ident, Sl("nd"), T_LOGAND); + break; + case 'b': + kwcmp(ident, Sl("reak"), T_BREAK); + break; + case 'c': + kwcmp(ident, Sl("onst"), T_CONST); + break; + case 'd': + kwcmp(ident, Sl("iscard"), T_DISCARD); + break; + case 'e': + kwcmp(ident, Sl("nd"), T_END); + kwcmp(ident, Sl("lse"), T_ELSE); + kwcmp(ident, Sl("lif"), T_ELIF); + break; + case 'i': + kwcmp(ident, Sl("f"), T_IF); + break; + case 'l': + kwcmp(ident, Sl("et"), T_LET); + break; + case 'n': + kwcmp(ident, Sl("ot"), T_LOGNOT); + kwcmp(ident, Sl("ext"), T_NEXT); + break; + case 'o': + kwcmp(ident, Sl("r"), T_LOGOR); + break; + case 'p': + kwcmp(ident, Sl("roc"), T_PROC); + break; + case 'r': + kwcmp(ident, Sl("eturn"), T_RETURN); + break; + case 's': + kwcmp(ident, Sl("truct"), T_STRUCT); + break; + case 'v': + kwcmp(ident, Sl("ar"), T_VAR); + break; + case 'w': + kwcmp(ident, Sl("hile"), T_WHILE); + break; + case 'u': + kwcmp(ident, Sl("se"), T_USE); + break; + } + return *t; +#undef kwcmp +} + +LexToken +lex_scan(LexState *ls) +{ + if (arrlen(ls->backlist) > 0) { + return arrpop(ls->backlist); + } + /* lexeme start pointer */ + ls->lbegin = ls->fwd; + + LexToken token = {0}; + MaybeChr c = skip_whitespace(ls); + if (!c.ok) { + token.id = T_EOF; + ls->eof = true; + return token; + } + +#define TOKEN(chr, t) case chr: token.id = t; break; + //trace("token now: '%c'\n", c.val); + //trace("lp: <%s>\n", ls->lbegin); + //trace("fwd: <%s>\n", ls->fwd); + switch (c.val) { + case '!': + if (peek(ls) == 
'=') { + token.id = T_NOTEQUAL; + ++ls->fwd; + } else { + token.id = T_EXCLAMATION; + } + break; + TOKEN('+', T_PLUS) + TOKEN('-', T_MINUS) + TOKEN('*', T_STAR) + TOKEN('/', T_BAR) + TOKEN('(', T_LPAREN) + TOKEN(')', T_RPAREN) + TOKEN(',', T_COMMA) + TOKEN('<', T_LESSTHAN) + TOKEN('>', T_GREATTHAN) + TOKEN('#', T_HASH) + TOKEN(':', T_COLON) + TOKEN(';', T_SEMICOLON) + TOKEN('[', T_LBRACKET) + TOKEN(']', T_RBRACKET) + TOKEN('{', T_LBRACE) + TOKEN('}', T_RBRACE) + case '=': + if (peek(ls) == '=') { + token.id = T_LOGICEQUAL; + ++ls->fwd; + } else { + token.id = T_EQUAL; + } + break; + case '"': + return string_literal(ls); + case '0' ... '9': + return number_literal(ls); + default: { + const u8 uc = c.val; + if (ascii_isident_start(uc)) { + LexToken ident_or_keyword = identifier(ls); + if (ident_or_keyword.id != T_IDENT) + return make_error(); + return keyword(&ident_or_keyword); + } + + if (uc > 0x7f) /* DEL, the last ASCII character */ + lex_error(ls, "unicode tokens aren't allowed yet"); + else + lex_error(ls, "unknown token '%c' (\\x%02x)", uc, uc); + return make_error(); + } + } + return token; +#undef TOKEN +} + +/* Put a token into the backlist. The next call to `lex_scan` will return this + * token. The backlist is a stack of tokens, so technically you can have unlimited + * look-ahead at the cost of memory. + */ +void +lex_backup(LexState *ls, LexToken token) +{ + arrput(ls->backlist, token); + i64 col = ls->cur_loc.column - token.len; + if (col < 1) { + if (ls->cur_loc.line > 1) + --ls->cur_loc.line; + } else { + ls->cur_loc.column = col; + } +} + +/* Checks if `t` token type is equal to `exp_tok`. This does not eat any token. 
*/ +bool +lex_match(LexState *ls, LexToken *token, enum LexTokenId exp_tok) +{ + if (token->id != exp_tok) { + lex_error(ls, "expected '%s' but got '%s' instead\n", + TokenIdStr[exp_tok], TokenIdStr[token->id]); + return false; + } + return true; +} + +LexState * +lex_new(Compiler *cm, FILE *input_fp, Str file_name, usize tabsize) +{ + LexState *ls = calloc(1, sizeof(*ls)); + ls->buf = calloc(LEX_BUFFER_SIZE + 1, sizeof(*ls->buf)); + ls->lbegin = ls->fwd = ls->buf; + ls->tabsize = tabsize; + ls->input_fp = input_fp; + ls->cur_loc.line = 1; + ls->cur_loc.source = file_name; + ls->cm = cm; + /* We use a hash table with string keys as a set containing all identifiers + * in a compilation unit, to avoid dupplicate allocations. + */ + sh_new_arena(ls->idents); + /* We provide our own buffering scheme */ + setbuf(input_fp, nil); + /* Initial fill of first buffer. + * Any file error gets caught in the function, only thing that can happen + * here is that the file is actually empty, so instant EOF. + */ + read_buf(ls, ls->buf, LEX_HALF_BUFFER_SIZE, &ls->buflen); + return ls; +} + +/* Destroys a lexing context and frees its allocated memory. + * Note that this will also deallocate the identifier arena. 
+ */ +void +lex_destroy(LexState *ls) +{ + shfree(ls->idents); /* `ls` is dereferenced unconditionally, so it must not be nil */ + arrfree(ls->backlist); + free(ls->buf); + free(ls); +} diff --git a/compiler/lex.h b/compiler/lex.h new file mode 100644 index 0000000..6d46a5f --- /dev/null +++ b/compiler/lex.h @@ -0,0 +1,93 @@ +#ifndef _lex_h_ +#define _lex_h_ +#include <stdio.h> /* FILE, used by LexState and lex_new */ + +#include "pre.h" +#include "location.h" +#include "state.h" +#include "libs/stb_ds.h" + +enum LexTokenId { + T_INVALID = 0, + /* Unary and binary operators (T_BAR is the '/' token) */ + T_PLUS, T_MINUS, T_STAR, T_BAR, + T_LESSTHAN, T_GREATTHAN, T_LOGNOT, T_LOGAND, T_LOGOR, T_LOGICEQUAL, T_NOTEQUAL, + T_HASH, + /* Others */ + T_EQUAL, T_EXCLAMATION, T_LPAREN, T_RPAREN, T_COMMA, + T_COLON, T_SEMICOLON, T_LBRACKET, T_RBRACKET, T_LBRACE, T_RBRACE, + /* Atoms */ + T_IDENT, T_STRING, T_NUMBER, T_DECNUMBER, + /* Keywords */ + T_CONST, + T_ELSE, + T_END, + T_ELIF, + T_IF, + T_LET, + T_PROC, + T_RETURN, + T_VAR, + T_DISCARD, + T_WHILE, + T_STRUCT, + T_USE, + T_BREAK, + T_NEXT, + /* Control */ + T_EOF, T_ERROR, + T_TOKEN_COUNT, /* does not represent an actual token */ +}; + +/* Table mapping a `LexTokenId` to a string name of the token */ +extern const char *TokenIdStr[]; + +typedef struct { + enum LexTokenId id; + union { + Str ident, str, keyword; + /* XXX: Defer number parsing until it is actually needed? + * So we can move number parsing out of the lexer. */ + /* Integer literal, it's the parser's problem to tell + * whether the literal is negative or not. + */ + u64 inumber; + /* Floating point literal */ + double floatn; + }; + isize len; /* Size in bytes of this token */ +} LexToken; + +typedef HashMapStr(i8) IdentsBucket; + +typedef struct { + FILE *input_fp; + /* Lexing buffer. 
This is actually split into two buffers, providing + * a double-buffering scheme */ + u8 *buf; + /* Number of bytes actually held in each buffer; lex_new asks read_buf for LEX_HALF_BUFFER_SIZE bytes for the first one and passes &buflen to receive the count (a read may deliver less) */ + isize buflen, buflen2; + + u8 *lbegin; /* marks the beginning of the current lexeme */ + u8 *fwd; /* scan position; advanced as input is consumed (e.g. `++ls->fwd` in lex_scan) */ + Vec(LexToken) backlist; /* stack of backed-up tokens, pushed by lex_backup */ + + int tabsize; /* set from the `tabsize` argument of lex_new */ + bool eof; + Location cur_loc; /* lex_new starts it at line 1 with `source` set to the file name */ + Compiler *cm; + IdentsBucket *idents; /* string-keyed map used as the set of identifiers (arena-backed, created in lex_new) */ +} LexState; + +LexToken +lex_scan(LexState *ls); +void +lex_backup(LexState *ls, LexToken token); +bool +lex_match(LexState *ls, LexToken *t, enum LexTokenId exp_tok); +LexState * +lex_new(Compiler *cm, FILE *input_fp, Str file_name, usize tabsize); +void +lex_destroy(LexState *l); + +#endif diff --git a/compiler/libs/optparse.h b/compiler/libs/optparse.h new file mode 100644 index 0000000..9b7c7f8 --- /dev/null +++ b/compiler/libs/optparse.h @@ -0,0 +1,403 @@ +/* Optparse --- portable, reentrant, embeddable, getopt-like option parser + * + * This is free and unencumbered software released into the public domain. + * + * To get the implementation, define OPTPARSE_IMPLEMENTATION. + * Optionally define OPTPARSE_API to control the API's visibility + * and/or linkage (static, __attribute__, __declspec). + * + * The POSIX getopt() option parser has three fatal flaws. These flaws + * are solved by Optparse. + * + * 1) Parser state is stored entirely in global variables, some of + * which are static and inaccessible. This means only one thread can + * use getopt(). It also means it's not possible to recursively parse + * nested sub-arguments while in the middle of argument parsing. + * Optparse fixes this by storing all state on a local struct. + * + * 2) The POSIX standard provides no way to properly reset the parser. + * This means for portable code that getopt() is only good for one + * run, over one argv with one option string. It also means subcommand + * options cannot be processed with getopt(). 
Most implementations + * provide a method to reset the parser, but it's not portable. + * Optparse provides an optparse_arg() function for stepping over + * subcommands and continuing parsing of options with another option + * string. The Optparse struct itself can be passed around to + * subcommand handlers for additional subcommand option parsing. A + * full reset can be achieved with an additional optparse_init(). + * + * 3) Error messages are printed to stderr. This can be disabled with + * opterr, but the messages themselves are still inaccessible. + * Optparse solves this by writing an error message in its errmsg + * field. The downside to Optparse is that this error message will + * always be in English rather than the current locale. + * + * Optparse should be familiar to anyone accustomed to getopt(), and + * it could be a nearly drop-in replacement. The option string is the + * same and the fields have the same names as the getopt() global + * variables (optarg, optind, optopt). + * + * Optparse also supports GNU-style long options with optparse_long(). + * The interface is slightly different and simpler than getopt_long(). + * + * By default, argv is permuted as it is parsed, moving non-option + * arguments to the end. This can be disabled by setting the `permute` + * field to 0 after initialization. + */ +#ifndef OPTPARSE_H +#define OPTPARSE_H + +#ifndef OPTPARSE_API +# define OPTPARSE_API +#endif + +struct optparse { + char **argv; + int permute; + int optind; + int optopt; + char *optarg; + char errmsg[64]; + int subopt; +}; + +enum optparse_argtype { + OPTPARSE_NONE, + OPTPARSE_REQUIRED, + OPTPARSE_OPTIONAL +}; + +struct optparse_long { + const char *longname; + int shortname; + enum optparse_argtype argtype; +}; + +/** + * Initializes the parser state. + */ +OPTPARSE_API +void optparse_init(struct optparse *options, char **argv); + +/** + * Read the next option in the argv array. + * @param optstring a getopt()-formatted option string. 
+ * @return the next option character, -1 for done, or '?' for error + * + * Just like getopt(), a character followed by no colons means no + * argument. One colon means the option has a required argument. Two + * colons means the option takes an optional argument. + */ +OPTPARSE_API +int optparse(struct optparse *options, const char *optstring); + +/** + * Handles GNU-style long options in addition to getopt() options. + * This works a lot like GNU's getopt_long(). The last option in + * longopts must be all zeros, marking the end of the array. The + * longindex argument may be NULL. + */ +OPTPARSE_API +int optparse_long(struct optparse *options, + const struct optparse_long *longopts, + int *longindex); + +/** + * Used for stepping over non-option arguments. + * @return the next non-option argument, or NULL for no more arguments + * + * Argument parsing can continue with optparse() after using this + * function. That would be used to parse the options for the + * subcommand returned by optparse_arg(). This function allows you to + * ignore the value of optind. 
+ */ +OPTPARSE_API +char *optparse_arg(struct optparse *options); + +/* Implementation */ +#ifdef OPTPARSE_IMPLEMENTATION + +#define OPTPARSE_MSG_INVALID "invalid option" +#define OPTPARSE_MSG_MISSING "option requires an argument" +#define OPTPARSE_MSG_TOOMANY "option takes no arguments" + +static int +optparse_error(struct optparse *options, const char *msg, const char *data) +{ + unsigned p = 0; + const char *sep = " -- '"; + while (*msg) + options->errmsg[p++] = *msg++; + while (*sep) + options->errmsg[p++] = *sep++; + while (p < sizeof(options->errmsg) - 2 && *data) + options->errmsg[p++] = *data++; + options->errmsg[p++] = '\''; + options->errmsg[p++] = '\0'; + return '?'; +} + +OPTPARSE_API +void +optparse_init(struct optparse *options, char **argv) +{ + options->argv = argv; + options->permute = 1; + options->optind = argv[0] != 0; + options->subopt = 0; + options->optarg = 0; + options->errmsg[0] = '\0'; +} + +static int +optparse_is_dashdash(const char *arg) +{ + return arg != 0 && arg[0] == '-' && arg[1] == '-' && arg[2] == '\0'; +} + +static int +optparse_is_shortopt(const char *arg) +{ + return arg != 0 && arg[0] == '-' && arg[1] != '-' && arg[1] != '\0'; +} + +static int +optparse_is_longopt(const char *arg) +{ + return arg != 0 && arg[0] == '-' && arg[1] == '-' && arg[2] != '\0'; +} + +static void +optparse_permute(struct optparse *options, int index) +{ + char *nonoption = options->argv[index]; + int i; + for (i = index; i < options->optind - 1; i++) + options->argv[i] = options->argv[i + 1]; + options->argv[options->optind - 1] = nonoption; +} + +static int +optparse_argtype(const char *optstring, char c) +{ + int count = OPTPARSE_NONE; + if (c == ':') + return -1; + for (; *optstring && c != *optstring; optstring++); + if (!*optstring) + return -1; + if (optstring[1] == ':') + count += optstring[2] == ':' ? 
2 : 1; + return count; +} + +OPTPARSE_API +int +optparse(struct optparse *options, const char *optstring) +{ + int type; + char *next; + char *option = options->argv[options->optind]; + options->errmsg[0] = '\0'; + options->optopt = 0; + options->optarg = 0; + if (option == 0) { + return -1; + } else if (optparse_is_dashdash(option)) { + options->optind++; /* consume "--" */ + return -1; + } else if (!optparse_is_shortopt(option)) { + if (options->permute) { + int index = options->optind++; + int r = optparse(options, optstring); + optparse_permute(options, index); + options->optind--; + return r; + } else { + return -1; + } + } + option += options->subopt + 1; + options->optopt = option[0]; + type = optparse_argtype(optstring, option[0]); + next = options->argv[options->optind + 1]; + switch (type) { + case -1: { + char str[2] = {0, 0}; + str[0] = option[0]; + options->optind++; + return optparse_error(options, OPTPARSE_MSG_INVALID, str); + } + case OPTPARSE_NONE: + if (option[1]) { + options->subopt++; + } else { + options->subopt = 0; + options->optind++; + } + return option[0]; + case OPTPARSE_REQUIRED: + options->subopt = 0; + options->optind++; + if (option[1]) { + options->optarg = option + 1; + } else if (next != 0) { + options->optarg = next; + options->optind++; + } else { + char str[2] = {0, 0}; + str[0] = option[0]; + options->optarg = 0; + return optparse_error(options, OPTPARSE_MSG_MISSING, str); + } + return option[0]; + case OPTPARSE_OPTIONAL: + options->subopt = 0; + options->optind++; + if (option[1]) + options->optarg = option + 1; + else + options->optarg = 0; + return option[0]; + } + return 0; +} + +OPTPARSE_API +char * +optparse_arg(struct optparse *options) +{ + char *option = options->argv[options->optind]; + options->subopt = 0; + if (option != 0) + options->optind++; + return option; +} + +static int +optparse_longopts_end(const struct optparse_long *longopts, int i) +{ + return !longopts[i].longname && !longopts[i].shortname; +} + 
+static void +optparse_from_long(const struct optparse_long *longopts, char *optstring) +{ + char *p = optstring; + int i; + for (i = 0; !optparse_longopts_end(longopts, i); i++) { + if (longopts[i].shortname && longopts[i].shortname < 127) { + int a; + *p++ = (char)longopts[i].shortname; + for (a = 0; a < (int)longopts[i].argtype; a++) + *p++ = ':'; + } + } + *p = '\0'; +} + +/* Unlike strcmp(), handles options containing "=". */ +static int +optparse_longopts_match(const char *longname, const char *option) +{ + const char *a = option, *n = longname; + if (longname == 0) + return 0; + for (; *a && *n && *a != '='; a++, n++) + if (*a != *n) + return 0; + return *n == '\0' && (*a == '\0' || *a == '='); +} + +/* Return the part after "=", or NULL. */ +static char * +optparse_longopts_arg(char *option) +{ + for (; *option && *option != '='; option++); + if (*option == '=') + return option + 1; + else + return 0; +} + +static int +optparse_long_fallback(struct optparse *options, + const struct optparse_long *longopts, + int *longindex) +{ + int result; + char optstring[96 * 3 + 1]; /* 96 ASCII printable characters */ + optparse_from_long(longopts, optstring); + result = optparse(options, optstring); + if (longindex != 0) { + *longindex = -1; + if (result != -1) { + int i; + for (i = 0; !optparse_longopts_end(longopts, i); i++) + if (longopts[i].shortname == options->optopt) + *longindex = i; + } + } + return result; +} + +OPTPARSE_API +int +optparse_long(struct optparse *options, + const struct optparse_long *longopts, + int *longindex) +{ + int i; + char *option = options->argv[options->optind]; + if (option == 0) { + return -1; + } else if (optparse_is_dashdash(option)) { + options->optind++; /* consume "--" */ + return -1; + } else if (optparse_is_shortopt(option)) { + return optparse_long_fallback(options, longopts, longindex); + } else if (!optparse_is_longopt(option)) { + if (options->permute) { + int index = options->optind++; + int r = optparse_long(options, 
longopts, longindex); + optparse_permute(options, index); + options->optind--; + return r; + } else { + return -1; + } + } + + /* Parse as long option. */ + options->errmsg[0] = '\0'; + options->optopt = 0; + options->optarg = 0; + option += 2; /* skip "--" */ + options->optind++; + for (i = 0; !optparse_longopts_end(longopts, i); i++) { + const char *name = longopts[i].longname; + if (optparse_longopts_match(name, option)) { + char *arg; + if (longindex) + *longindex = i; + options->optopt = longopts[i].shortname; + arg = optparse_longopts_arg(option); + if (longopts[i].argtype == OPTPARSE_NONE && arg != 0) { + return optparse_error(options, OPTPARSE_MSG_TOOMANY, name); + } if (arg != 0) { + options->optarg = arg; + } else if (longopts[i].argtype == OPTPARSE_REQUIRED) { + options->optarg = options->argv[options->optind]; + if (options->optarg == 0) + return optparse_error(options, OPTPARSE_MSG_MISSING, name); + else + options->optind++; + } + return options->optopt; + } + } + return optparse_error(options, OPTPARSE_MSG_INVALID, option); +} + +#endif /* OPTPARSE_IMPLEMENTATION */ +#endif /* OPTPARSE_H */ diff --git a/compiler/libs/optparse_impl.c b/compiler/libs/optparse_impl.c new file mode 100644 index 0000000..e41bf8b --- /dev/null +++ b/compiler/libs/optparse_impl.c @@ -0,0 +1,3 @@ +/* This file holds the implementation of the optparse library functionality */ +#define OPTPARSE_IMPLEMENTATION +#include "optparse.h" diff --git a/compiler/libs/stb_ds.h b/compiler/libs/stb_ds.h new file mode 100644 index 0000000..e84c82d --- /dev/null +++ b/compiler/libs/stb_ds.h @@ -0,0 +1,1895 @@ +/* stb_ds.h - v0.67 - public domain data structures - Sean Barrett 2019 + + This is a single-header-file library that provides easy-to-use + dynamic arrays and hash tables for C (also works in C++). 
+ + For a gentle introduction: + http://nothings.org/stb_ds + + To use this library, do this in *one* C or C++ file: + #define STB_DS_IMPLEMENTATION + #include "stb_ds.h" + +TABLE OF CONTENTS + + Table of Contents + Compile-time options + License + Documentation + Notes + Notes - Dynamic arrays + Notes - Hash maps + Credits + +COMPILE-TIME OPTIONS + + #define STBDS_NO_SHORT_NAMES + + This flag needs to be set globally. + + By default stb_ds exposes shorter function names that are not qualified + with the "stbds_" prefix. If these names conflict with the names in your + code, define this flag. + + #define STBDS_SIPHASH_2_4 + + This flag only needs to be set in the file containing #define STB_DS_IMPLEMENTATION. + + By default stb_ds.h hashes using a weaker variant of SipHash and a custom hash for + 4- and 8-byte keys. On 64-bit platforms, you can define the above flag to force + stb_ds.h to use specification-compliant SipHash-2-4 for all keys. Doing so makes + hash table insertion about 20% slower on 4- and 8-byte keys, 5% slower on + 64-byte keys, and 10% slower on 256-byte keys on my test computer. + + #define STBDS_REALLOC(context,ptr,size) better_realloc + #define STBDS_FREE(context,ptr) better_free + + These defines only need to be set in the file containing #define STB_DS_IMPLEMENTATION. + + By default stb_ds uses stdlib realloc() and free() for memory management. You can + substitute your own functions instead by defining these symbols. You must either + define both, or neither. Note that at the moment, 'context' will always be NULL. + @TODO add an array/hash initialization function that takes a memory context pointer. + + #define STBDS_UNIT_TESTS + + Defines a function stbds_unit_tests() that checks the functioning of the data structures. + + Note that on older versions of gcc (e.g. 5.x.x) you may need to build with '-std=c++0x' + (or equivalently '-std=c++11') when using anonymous structures as seen on the web + page or in STBDS_UNIT_TESTS. 
+ +LICENSE + + Placed in the public domain and also MIT licensed. + See end of file for detailed license information. + +DOCUMENTATION + + Dynamic Arrays + + Non-function interface: + + Declare an empty dynamic array of type T + T* foo = NULL; + + Access the i'th item of a dynamic array 'foo' of type T, T* foo: + foo[i] + + Functions (actually macros) + + arrfree: + void arrfree(T*); + Frees the array. + + arrlen: + ptrdiff_t arrlen(T*); + Returns the number of elements in the array. + + arrlenu: + size_t arrlenu(T*); + Returns the number of elements in the array as an unsigned type. + + arrpop: + T arrpop(T* a) + Removes the final element of the array and returns it. + + arrput: + T arrput(T* a, T b); + Appends the item b to the end of array a. Returns b. + + arrins: + T arrins(T* a, int p, T b); + Inserts the item b into the middle of array a, into a[p], + moving the rest of the array over. Returns b. + + arrinsn: + void arrinsn(T* a, int p, int n); + Inserts n uninitialized items into array a starting at a[p], + moving the rest of the array over. + + arraddnptr: + T* arraddnptr(T* a, int n) + Appends n uninitialized items onto array at the end. + Returns a pointer to the first uninitialized item added. + + arraddnindex: + size_t arraddnindex(T* a, int n) + Appends n uninitialized items onto array at the end. + Returns the index of the first uninitialized item added. + + arrdel: + void arrdel(T* a, int p); + Deletes the element at a[p], moving the rest of the array over. + + arrdeln: + void arrdeln(T* a, int p, int n); + Deletes n elements starting at a[p], moving the rest of the array over. + + arrdelswap: + void arrdelswap(T* a, int p); + Deletes the element at a[p], replacing it with the element from + the end of the array. O(1) performance. + + arrsetlen: + void arrsetlen(T* a, int n); + Changes the length of the array to n. Allocates uninitialized + slots at the end if necessary. 
+ + arrsetcap: + size_t arrsetcap(T* a, int n); + Sets the length of allocated storage to at least n. It will not + change the length of the array. + + arrcap: + size_t arrcap(T* a); + Returns the number of total elements the array can contain without + needing to be reallocated. + + Hash maps & String hash maps + + Given T is a structure type: struct { TK key; TV value; }. Note that some + functions do not require TV value and can have other fields. For string + hash maps, TK must be 'char *'. + + Special interface: + + stbds_rand_seed: + void stbds_rand_seed(size_t seed); + For security against adversarially chosen data, you should seed the + library with a strong random number. Or at least seed it with time(). + + stbds_hash_string: + size_t stbds_hash_string(char *str, size_t seed); + Returns a hash value for a string. + + stbds_hash_bytes: + size_t stbds_hash_bytes(void *p, size_t len, size_t seed); + These functions hash an arbitrary number of bytes. The function + uses a custom hash for 4- and 8-byte data, and a weakened version + of SipHash for everything else. On 64-bit platforms you can get + specification-compliant SipHash-2-4 on all data by defining + STBDS_SIPHASH_2_4, at a significant cost in speed. + + Non-function interface: + + Declare an empty hash map of type T + T* foo = NULL; + + Access the i'th entry in a hash table T* foo: + foo[i] + + Function interface (actually macros): + + hmfree + shfree + void hmfree(T*); + void shfree(T*); + Frees the hashmap and sets the pointer to NULL. + + hmlen + shlen + ptrdiff_t hmlen(T*) + ptrdiff_t shlen(T*) + Returns the number of elements in the hashmap. + + hmlenu + shlenu + size_t hmlenu(T*) + size_t shlenu(T*) + Returns the number of elements in the hashmap. + + hmgeti + shgeti + hmgeti_ts + ptrdiff_t hmgeti(T*, TK key) + ptrdiff_t shgeti(T*, char* key) + ptrdiff_t hmgeti_ts(T*, TK key, ptrdiff_t tempvar) + Returns the index in the hashmap which has the key 'key', or -1 + if the key is not present. 
+ + hmget + hmget_ts + shget + TV hmget(T*, TK key) + TV shget(T*, char* key) + TV hmget_ts(T*, TK key, ptrdiff_t tempvar) + Returns the value corresponding to 'key' in the hashmap. + The structure must have a 'value' field + + hmgets + shgets + T hmgets(T*, TK key) + T shgets(T*, char* key) + Returns the structure corresponding to 'key' in the hashmap. + + hmgetp + shgetp + hmgetp_ts + hmgetp_null + shgetp_null + T* hmgetp(T*, TK key) + T* shgetp(T*, char* key) + T* hmgetp_ts(T*, TK key, ptrdiff_t tempvar) + T* hmgetp_null(T*, TK key) + T* shgetp_null(T*, char *key) + Returns a pointer to the structure corresponding to 'key' in + the hashmap. Functions ending in "_null" return NULL if the key + is not present in the hashmap; the others return a pointer to a + structure holding the default value (but not the searched-for key). + + hmdefault + shdefault + TV hmdefault(T*, TV value) + TV shdefault(T*, TV value) + Sets the default value for the hashmap, the value which will be + returned by hmget/shget if the key is not present. + + hmdefaults + shdefaults + TV hmdefaults(T*, T item) + TV shdefaults(T*, T item) + Sets the default struct for the hashmap, the contents which will be + returned by hmgets/shgets if the key is not present. + + hmput + shput + TV hmput(T*, TK key, TV value) + TV shput(T*, char* key, TV value) + Inserts a pair into the hashmap. If the key is already + present in the hashmap, updates its value. + + hmputs + shputs + T hmputs(T*, T item) + T shputs(T*, T item) + Inserts a struct with T.key into the hashmap. If the struct is already + present in the hashmap, updates it. + + hmdel + shdel + int hmdel(T*, TK key) + int shdel(T*, char* key) + If 'key' is in the hashmap, deletes its entry and returns 1. + Otherwise returns 0. 
+ + Function interface (actually macros) for strings only: + + sh_new_strdup + void sh_new_strdup(T*); + Overwrites the existing pointer with a newly allocated + string hashmap which will automatically allocate and free + each string key using realloc/free + + sh_new_arena + void sh_new_arena(T*); + Overwrites the existing pointer with a newly allocated + string hashmap which will automatically allocate each string + key to a string arena. Every string key ever used by this + hash table remains in the arena until the arena is freed. + Additionally, any key which is deleted and reinserted will + be allocated multiple times in the string arena. + +NOTES + + * These data structures are realloc'd when they grow, and the macro + "functions" write to the provided pointer. This means: (a) the pointer + must be an lvalue, and (b) the pointer to the data structure is not + stable, and you must maintain it the same as you would a realloc'd + pointer. For example, if you pass a pointer to a dynamic array to a + function which updates it, the function must return back the new + pointer to the caller. This is the price of trying to do this in C. + + * The following are the only functions that are thread-safe on a single data + structure, i.e. can be run in multiple threads simultaneously on the same + data structure + hmlen shlen + hmlenu shlenu + hmget_ts shget_ts + hmgeti_ts shgeti_ts + hmgets_ts shgets_ts + + * You iterate over the contents of a dynamic array and a hashmap in exactly + the same way, using arrlen/hmlen/shlen: + + for (i=0; i < arrlen(foo); ++i) + ... foo[i] ... + + * All operations except arrins/arrdel are O(1) amortized, but individual + operations can be slow, so these data structures may not be suitable + for real time use. Dynamic arrays double in capacity as needed, so + elements are copied an average of once. Hash tables double/halve + their size as needed, with appropriate hysteresis to maintain O(1) + performance. 
+ +NOTES - DYNAMIC ARRAY + + * If you know how long a dynamic array is going to be in advance, you can avoid + extra memory allocations by using arrsetlen to allocate it to that length in + advance and use foo[n] while filling it out, or arrsetcap to allocate the memory + for that length and use arrput/arrpush as normal. + + * Unlike some other versions of the dynamic array, this version should + be safe to use with strict-aliasing optimizations. + +NOTES - HASH MAP + + * For compilers other than GCC and clang (e.g. Visual Studio), for hmput/hmget/hmdel + and variants, the key must be an lvalue (so the macro can take the address of it). + Extensions are used that eliminate this requirement if you're using C99 and later + in GCC or clang, or if you're using C++ in GCC. But note that this can make your + code less portable. + + * To test for presence of a key in a hashmap, just do 'hmgeti(foo,key) >= 0'. + + * The iteration order of your data in the hashmap is determined solely by the + order of insertions and deletions. In particular, if you never delete, new + keys are always added at the end of the array. This will be consistent + across all platforms and versions of the library. However, you should not + attempt to serialize the internal hash table, as the hash is not consistent + between different platforms, and may change with future versions of the library. + + * Use sh_new_arena() for string hashmaps that you never delete from. Initialize + with NULL if you're managing the memory for your strings, or your strings are + never freed (at least until the hashmap is freed). Otherwise, use sh_new_strdup(). + @TODO: make an arena variant that garbage collects the strings with a trivial + copy collector into a new arena whenever the table shrinks / rebuilds. Since + current arena recommendation is to only use arena if it never deletes, then + this can just replace current arena implementation. 
+ + * If adversarial input is a serious concern and you're on a 64-bit platform, + enable STBDS_SIPHASH_2_4 (see the 'Compile-time options' section), and pass + a strong random number to stbds_rand_seed. + + * The default value for the hash table is stored in foo[-1], so if you + use code like 'hmget(T,k)->value = 5' you can accidentally overwrite + the value stored by hmdefault if 'k' is not present. + +CREDITS + + Sean Barrett -- library, idea for dynamic array API/implementation + Per Vognsen -- idea for hash table API/implementation + Rafael Sachetto -- arrpop() + github:HeroicKatora -- arraddn() reworking + + Bugfixes: + Andy Durdin + Shane Liesegang + Vinh Truong + Andreas Molzer + github:hashitaku + github:srdjanstipic + Macoy Madson + Andreas Vennstrom + Tobias Mansfield-Williams +*/ + +#ifdef STBDS_UNIT_TESTS +#define _CRT_SECURE_NO_WARNINGS +#endif + +#ifndef INCLUDE_STB_DS_H +#define INCLUDE_STB_DS_H + +#include <stddef.h> +#include <string.h> + +#ifndef STBDS_NO_SHORT_NAMES +#define arrlen stbds_arrlen +#define arrlenu stbds_arrlenu +#define arrput stbds_arrput +#define arrpush stbds_arrput +#define arrpop stbds_arrpop +#define arrfree stbds_arrfree +#define arraddn stbds_arraddn // deprecated, use one of the following instead: +#define arraddnptr stbds_arraddnptr +#define arraddnindex stbds_arraddnindex +#define arrsetlen stbds_arrsetlen +#define arrlast stbds_arrlast +#define arrins stbds_arrins +#define arrinsn stbds_arrinsn +#define arrdel stbds_arrdel +#define arrdeln stbds_arrdeln +#define arrdelswap stbds_arrdelswap +#define arrcap stbds_arrcap +#define arrsetcap stbds_arrsetcap + +#define hmput stbds_hmput +#define hmputs stbds_hmputs +#define hmget stbds_hmget +#define hmget_ts stbds_hmget_ts +#define hmgets stbds_hmgets +#define hmgetp stbds_hmgetp +#define hmgetp_ts stbds_hmgetp_ts +#define hmgetp_null stbds_hmgetp_null +#define hmgeti stbds_hmgeti +#define hmgeti_ts stbds_hmgeti_ts +#define hmdel stbds_hmdel +#define hmlen stbds_hmlen +#define hmlenu stbds_hmlenu 
+#define hmfree stbds_hmfree +#define hmdefault stbds_hmdefault +#define hmdefaults stbds_hmdefaults + +#define shput stbds_shput +#define shputi stbds_shputi +#define shputs stbds_shputs +#define shget stbds_shget +#define shgeti stbds_shgeti +#define shgets stbds_shgets +#define shgetp stbds_shgetp +#define shgetp_null stbds_shgetp_null +#define shdel stbds_shdel +#define shlen stbds_shlen +#define shlenu stbds_shlenu +#define shfree stbds_shfree +#define shdefault stbds_shdefault +#define shdefaults stbds_shdefaults +#define sh_new_arena stbds_sh_new_arena +#define sh_new_strdup stbds_sh_new_strdup + +#define stralloc stbds_stralloc +#define strreset stbds_strreset +#endif + +#if defined(STBDS_REALLOC) && !defined(STBDS_FREE) || !defined(STBDS_REALLOC) && defined(STBDS_FREE) +#error "You must define both STBDS_REALLOC and STBDS_FREE, or neither." +#endif +#if !defined(STBDS_REALLOC) && !defined(STBDS_FREE) +#include +#define STBDS_REALLOC(c,p,s) realloc(p,s) +#define STBDS_FREE(c,p) free(p) +#endif + +#ifdef _MSC_VER +#define STBDS_NOTUSED(v) (void)(v) +#else +#define STBDS_NOTUSED(v) (void)sizeof(v) +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +// for security against attackers, seed the library with a random number, at least time() but stronger is better +extern void stbds_rand_seed(size_t seed); + +// these are the hash functions used internally if you want to test them or use them for other purposes +extern size_t stbds_hash_bytes(void *p, size_t len, size_t seed); +extern size_t stbds_hash_string(char *str, size_t seed); + +// this is a simple string arena allocator, initialize with e.g. 'stbds_string_arena my_arena={0}'. 
+typedef struct stbds_string_arena stbds_string_arena; +extern char * stbds_stralloc(stbds_string_arena *a, char *str); +extern void stbds_strreset(stbds_string_arena *a); + +// have to #define STBDS_UNIT_TESTS to call this +extern void stbds_unit_tests(void); + +/////////////// +// +// Everything below here is implementation details +// + +extern void * stbds_arrgrowf(void *a, size_t elemsize, size_t addlen, size_t min_cap); +extern void stbds_arrfreef(void *a); +extern void stbds_hmfree_func(void *p, size_t elemsize); +extern void * stbds_hmget_key(void *a, size_t elemsize, void *key, size_t keysize, int mode); +extern void * stbds_hmget_key_ts(void *a, size_t elemsize, void *key, size_t keysize, ptrdiff_t *temp, int mode); +extern void * stbds_hmput_default(void *a, size_t elemsize); +extern void * stbds_hmput_key(void *a, size_t elemsize, void *key, size_t keysize, int mode); +extern void * stbds_hmdel_key(void *a, size_t elemsize, void *key, size_t keysize, size_t keyoffset, int mode); +extern void * stbds_shmode_func(size_t elemsize, int mode); + +#ifdef __cplusplus +} +#endif + +#if defined(__GNUC__) || defined(__clang__) +#define STBDS_HAS_TYPEOF +#ifdef __cplusplus +//#define STBDS_HAS_LITERAL_ARRAY // this is currently broken for clang +#endif +#endif + +#if !defined(__cplusplus) +#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L +#define STBDS_HAS_LITERAL_ARRAY +#endif +#endif + +// this macro takes the address of the argument, but on gcc/clang can accept rvalues +#if defined(STBDS_HAS_LITERAL_ARRAY) && defined(STBDS_HAS_TYPEOF) + #if __clang__ + #define STBDS_ADDRESSOF(typevar, value) ((__typeof__(typevar)[1]){value}) // literal array decays to pointer to value + #else + #define STBDS_ADDRESSOF(typevar, value) ((typeof(typevar)[1]){value}) // literal array decays to pointer to value + #endif +#else +#define STBDS_ADDRESSOF(typevar, value) &(value) +#endif + +#define STBDS_OFFSETOF(var,field) ((char *) &(var)->field - (char *) (var)) + 
// The bookkeeping header (stbds_array_header) sits immediately before element 0;
// these helpers step back from the user-visible pointer to reach it.
#define stbds_header(t)  ((stbds_array_header *) (t) - 1)
#define stbds_temp(t)    stbds_header(t)->temp
// reads the first pointer-sized field of the hash index (its temp_key member)
#define stbds_temp_key(t) (*(char **) stbds_header(t)->hash_table)

//
// Dynamic array macros. Any macro that can grow the array reassigns `a`,
// so `a` must be an lvalue. Most macros evaluate their arguments more than once.
//
#define stbds_arrsetcap(a,n)   (stbds_arrgrow(a,0,n))
#define stbds_arrsetlen(a,n)   ((stbds_arrcap(a) < (size_t) (n) ? stbds_arrsetcap((a),(size_t)(n)),0 : 0), (a) ? stbds_header(a)->length = (size_t) (n) : 0)
#define stbds_arrcap(a)        ((a) ? stbds_header(a)->capacity : 0)
#define stbds_arrlen(a)        ((a) ? (ptrdiff_t) stbds_header(a)->length : 0)
#define stbds_arrlenu(a)       ((a) ?             stbds_header(a)->length : 0)
#define stbds_arrput(a,v)      (stbds_arrmaybegrow(a,1), (a)[stbds_header(a)->length++] = (v))
#define stbds_arrpush          stbds_arrput  // synonym
// NOTE: pop/last/del* dereference the header without a NULL check, unlike arrlen/arrcap
#define stbds_arrpop(a)        (stbds_header(a)->length--, (a)[stbds_header(a)->length])
#define stbds_arraddn(a,n)     ((void)(stbds_arraddnindex(a, n)))    // deprecated, use one of the following instead:
#define stbds_arraddnptr(a,n)  (stbds_arrmaybegrow(a,n), (n) ? (stbds_header(a)->length += (n), &(a)[stbds_header(a)->length-(n)]) : (a))
#define stbds_arraddnindex(a,n)(stbds_arrmaybegrow(a,n), (n) ? (stbds_header(a)->length += (n), stbds_header(a)->length-(n)) : stbds_arrlen(a))
#define stbds_arraddnoff       stbds_arraddnindex
#define stbds_arrlast(a)       ((a)[stbds_header(a)->length-1])
#define stbds_arrfree(a)       ((void) ((a) ? STBDS_FREE(NULL,stbds_header(a)) : (void)0), (a)=NULL)
#define stbds_arrdel(a,i)      stbds_arrdeln(a,i,1)
#define stbds_arrdeln(a,i,n)   (memmove(&(a)[i], &(a)[(i)+(n)], sizeof *(a) * (stbds_header(a)->length-(n)-(i))), stbds_header(a)->length -= (n))
#define stbds_arrdelswap(a,i)  ((a)[i] = stbds_arrlast(a), stbds_header(a)->length -= 1)
#define stbds_arrinsn(a,i,n)   (stbds_arraddn((a),(n)), memmove(&(a)[(i)+(n)], &(a)[i], sizeof *(a) * (stbds_header(a)->length-(n)-(i))))
#define stbds_arrins(a,i,v)    (stbds_arrinsn((a),(i),1), (a)[i]=(v))

// grow only when `a` is NULL or length+n would exceed the current capacity
#define stbds_arrmaybegrow(a,n)  ((!(a) || stbds_header(a)->length + (n) > stbds_header(a)->capacity) \
                                  ? (stbds_arrgrow(a,n,0),0) : 0)

#define stbds_arrgrow(a,b,c)   ((a) = stbds_arrgrowf_wrapper((a), sizeof *(a), (b), (c)))

//
// Hash map macros (binary keys). `t` points at element 0 of the user-visible
// table; the slot at (t)-1 is the default element, and the array header in front
// of that slot carries `temp`, the index produced by the most recent operation.
//
#define stbds_hmput(t, k, v) \
    ((t) = stbds_hmput_key_wrapper((t), sizeof *(t), (void*) STBDS_ADDRESSOF((t)->key, (k)), sizeof (t)->key, 0),   \
     (t)[stbds_temp((t)-1)].key = (k),    \
     (t)[stbds_temp((t)-1)].value = (v))

#define stbds_hmputs(t, s) \
    ((t) = stbds_hmput_key_wrapper((t), sizeof *(t), &(s).key, sizeof (s).key, STBDS_HM_BINARY), \
     (t)[stbds_temp((t)-1)] = (s))

#define stbds_hmgeti(t,k) \
    ((t) = stbds_hmget_key_wrapper((t), sizeof *(t), (void*) STBDS_ADDRESSOF((t)->key, (k)), sizeof (t)->key, STBDS_HM_BINARY), \
      stbds_temp((t)-1))

// _ts variants return the index through a caller-supplied `temp` instead of the shared header field
#define stbds_hmgeti_ts(t,k,temp) \
    ((t) = stbds_hmget_key_ts_wrapper((t), sizeof *(t), (void*) STBDS_ADDRESSOF((t)->key, (k)), sizeof (t)->key, &(temp), STBDS_HM_BINARY), \
      (temp))

#define stbds_hmgetp(t, k) \
    ((void) stbds_hmgeti(t,k), &(t)[stbds_temp((t)-1)])

#define stbds_hmgetp_ts(t, k, temp) \
    ((void) stbds_hmgeti_ts(t,k,temp), &(t)[temp])

// evaluates to the header's temp field after the call, or 0 when the table pointer is NULL
#define stbds_hmdel(t,k) \
    (((t) = stbds_hmdel_key_wrapper((t),sizeof *(t), (void*) STBDS_ADDRESSOF((t)->key, (k)), sizeof (t)->key, STBDS_OFFSETOF((t),key), STBDS_HM_BINARY)),(t)?stbds_temp((t)-1):0)

// the default element lives at index -1
#define stbds_hmdefault(t, v) \
    ((t) = stbds_hmput_default_wrapper((t), sizeof *(t)), (t)[-1].value = (v))

#define stbds_hmdefaults(t, s) \
    ((t) = stbds_hmput_default_wrapper((t), sizeof *(t)), (t)[-1] = (s))

#define stbds_hmfree(p)        \
    ((void) ((p) != NULL ? stbds_hmfree_func((p)-1,sizeof*(p)),0 : 0),(p)=NULL)

#define stbds_hmgets(t, k)    (*stbds_hmgetp(t,k))
#define stbds_hmget(t, k)     (stbds_hmgetp(t,k)->value)
#define stbds_hmget_ts(t, k, temp)  (stbds_hmgetp_ts(t,k,temp)->value)
// stored length includes the default element, hence the -1
#define stbds_hmlen(t)        ((t) ? (ptrdiff_t) stbds_header((t)-1)->length-1 : 0)
#define stbds_hmlenu(t)       ((t) ?             stbds_header((t)-1)->length-1 : 0)
#define stbds_hmgetp_null(t,k)  (stbds_hmgeti(t,k) == -1 ? NULL : &(t)[stbds_temp((t)-1)])

//
// String-keyed hash map macros: the key argument is passed through as a char*
// and hashed/compared as a NUL-terminated string (STBDS_HM_STRING).
//
#define stbds_shput(t, k, v) \
    ((t) = stbds_hmput_key_wrapper((t), sizeof *(t), (void*) (k), sizeof (t)->key, STBDS_HM_STRING),   \
     (t)[stbds_temp((t)-1)].value = (v))

#define stbds_shputi(t, k, v) \
    ((t) = stbds_hmput_key_wrapper((t), sizeof *(t), (void*) (k), sizeof (t)->key, STBDS_HM_STRING),   \
     (t)[stbds_temp((t)-1)].value = (v), stbds_temp((t)-1))

#define stbds_shputs(t, s) \
    ((t) = stbds_hmput_key_wrapper((t), sizeof *(t), (void*) (s).key, sizeof (s).key, STBDS_HM_STRING), \
     (t)[stbds_temp((t)-1)] = (s), \
     (t)[stbds_temp((t)-1)].key = stbds_temp_key((t)-1)) // above line overwrites whole structure, so must rewrite key here if it was allocated internally

// NOTE(review): the psh* macros reference STBDS_HM_PTR_TO_STRING, which is not
// defined anywhere in the visible part of this header (only STBDS_HM_BINARY and
// STBDS_HM_STRING are) -- confirm it exists before relying on them.
#define stbds_pshput(t, p) \
    ((t) = stbds_hmput_key_wrapper((t), sizeof *(t), (void*) (p)->key, sizeof (p)->key, STBDS_HM_PTR_TO_STRING), \
     (t)[stbds_temp((t)-1)] = (p))

#define stbds_shgeti(t,k) \
     ((t) = stbds_hmget_key_wrapper((t), sizeof *(t), (void*) (k), sizeof (t)->key, STBDS_HM_STRING), \
      stbds_temp((t)-1))

#define stbds_pshgeti(t,k) \
     ((t) = stbds_hmget_key_wrapper((t), sizeof *(t), (void*) (k), sizeof (*(t))->key, STBDS_HM_PTR_TO_STRING), \
      stbds_temp((t)-1))

#define stbds_shgetp(t, k) \
    ((void) stbds_shgeti(t,k), &(t)[stbds_temp((t)-1)])

#define stbds_pshget(t, k) \
    ((void) stbds_pshgeti(t,k), (t)[stbds_temp((t)-1)])

#define stbds_shdel(t,k) \
    (((t) = stbds_hmdel_key_wrapper((t),sizeof *(t), (void*) (k), sizeof (t)->key, STBDS_OFFSETOF((t),key), STBDS_HM_STRING)),(t)?stbds_temp((t)-1):0)
#define stbds_pshdel(t,k) \
    (((t) = stbds_hmdel_key_wrapper((t),sizeof *(t), (void*) (k), sizeof (*(t))->key, STBDS_OFFSETOF(*(t),key), STBDS_HM_PTR_TO_STRING)),(t)?stbds_temp((t)-1):0)

// create an empty string table whose keys are stored in an arena / strdup'd
#define stbds_sh_new_arena(t)  \
    ((t) = stbds_shmode_func_wrapper(t, sizeof *(t), STBDS_SH_ARENA))
#define stbds_sh_new_strdup(t) \
    ((t) = stbds_shmode_func_wrapper(t, sizeof *(t), STBDS_SH_STRDUP))

#define stbds_shdefault(t, v)  stbds_hmdefault(t,v)
#define stbds_shdefaults(t, s) stbds_hmdefaults(t,s)

#define stbds_shfree       stbds_hmfree
#define stbds_shlenu       stbds_hmlenu

#define stbds_shgets(t, k) (*stbds_shgetp(t,k))
#define stbds_shget(t, k)  (stbds_shgetp(t,k)->value)
#define stbds_shgetp_null(t,k)  (stbds_shgeti(t,k) == -1 ? NULL : &(t)[stbds_temp((t)-1)])
#define stbds_shlen        stbds_hmlen

// Bookkeeping block stored directly in front of every array / hash map.
typedef struct
{
  size_t      length;      // number of elements in use
  size_t      capacity;    // number of elements allocated
  void      * hash_table;  // hash index for hash maps; set to 0 when a plain array is first allocated
  ptrdiff_t   temp;        // scratch result of the most recent hash map operation (see stbds_temp)
} stbds_array_header;

// One block of string-arena storage; blocks are chained through `next`.
typedef struct stbds_string_block
{
  struct stbds_string_block *next;
  char storage[8];         // allocated larger than 8 by stbds_stralloc (sizeof(*sb)-8 + size)
} stbds_string_block;

struct stbds_string_arena
{
  stbds_string_block *storage;  // most recently allocated block
  size_t remaining;             // bytes still free in that block
  unsigned char block;          // counter used to pick the size of the next block
  unsigned char mode;  // this isn't used by the string arena itself
};

// key modes passed as the `mode` argument of the stbds_hm*_key functions
#define STBDS_HM_BINARY  0
#define STBDS_HM_STRING  1

// string-key storage policies kept in stbds_string_arena.mode
enum
{
   STBDS_SH_NONE,
   STBDS_SH_DEFAULT,
   STBDS_SH_STRDUP,
   STBDS_SH_ARENA
};

#ifdef __cplusplus
// in C we use implicit assignment from these void*-returning functions to T*.
+// in C++ these templates make the same code work +template static T * stbds_arrgrowf_wrapper(T *a, size_t elemsize, size_t addlen, size_t min_cap) { + return (T*)stbds_arrgrowf((void *)a, elemsize, addlen, min_cap); +} +template static T * stbds_hmget_key_wrapper(T *a, size_t elemsize, void *key, size_t keysize, int mode) { + return (T*)stbds_hmget_key((void*)a, elemsize, key, keysize, mode); +} +template static T * stbds_hmget_key_ts_wrapper(T *a, size_t elemsize, void *key, size_t keysize, ptrdiff_t *temp, int mode) { + return (T*)stbds_hmget_key_ts((void*)a, elemsize, key, keysize, temp, mode); +} +template static T * stbds_hmput_default_wrapper(T *a, size_t elemsize) { + return (T*)stbds_hmput_default((void *)a, elemsize); +} +template static T * stbds_hmput_key_wrapper(T *a, size_t elemsize, void *key, size_t keysize, int mode) { + return (T*)stbds_hmput_key((void*)a, elemsize, key, keysize, mode); +} +template static T * stbds_hmdel_key_wrapper(T *a, size_t elemsize, void *key, size_t keysize, size_t keyoffset, int mode){ + return (T*)stbds_hmdel_key((void*)a, elemsize, key, keysize, keyoffset, mode); +} +template static T * stbds_shmode_func_wrapper(T *, size_t elemsize, int mode) { + return (T*)stbds_shmode_func(elemsize, mode); +} +#else +#define stbds_arrgrowf_wrapper stbds_arrgrowf +#define stbds_hmget_key_wrapper stbds_hmget_key +#define stbds_hmget_key_ts_wrapper stbds_hmget_key_ts +#define stbds_hmput_default_wrapper stbds_hmput_default +#define stbds_hmput_key_wrapper stbds_hmput_key +#define stbds_hmdel_key_wrapper stbds_hmdel_key +#define stbds_shmode_func_wrapper(t,e,m) stbds_shmode_func(e,m) +#endif + +#endif // INCLUDE_STB_DS_H + + +////////////////////////////////////////////////////////////////////////////// +// +// IMPLEMENTATION +// + +#ifdef STB_DS_IMPLEMENTATION +#include +#include + +#ifndef STBDS_ASSERT +#define STBDS_ASSERT_WAS_UNDEFINED +#define STBDS_ASSERT(x) ((void) 0) +#endif + +#ifdef STBDS_STATISTICS +#define STBDS_STATS(x) x 
+size_t stbds_array_grow; +size_t stbds_hash_grow; +size_t stbds_hash_shrink; +size_t stbds_hash_rebuild; +size_t stbds_hash_probes; +size_t stbds_hash_alloc; +size_t stbds_rehash_probes; +size_t stbds_rehash_items; +#else +#define STBDS_STATS(x) +#endif + +// +// stbds_arr implementation +// + +//int *prev_allocs[65536]; +//int num_prev; + +void *stbds_arrgrowf(void *a, size_t elemsize, size_t addlen, size_t min_cap) +{ + stbds_array_header temp={0}; // force debugging + void *b; + size_t min_len = stbds_arrlen(a) + addlen; + (void) sizeof(temp); + + // compute the minimum capacity needed + if (min_len > min_cap) + min_cap = min_len; + + if (min_cap <= stbds_arrcap(a)) + return a; + + // increase needed capacity to guarantee O(1) amortized + if (min_cap < 2 * stbds_arrcap(a)) + min_cap = 2 * stbds_arrcap(a); + else if (min_cap < 4) + min_cap = 4; + + //if (num_prev < 65536) if (a) prev_allocs[num_prev++] = (int *) ((char *) a+1); + //if (num_prev == 2201) + // num_prev = num_prev; + b = STBDS_REALLOC(NULL, (a) ? stbds_header(a) : 0, elemsize * min_cap + sizeof(stbds_array_header)); + //if (num_prev < 65536) prev_allocs[num_prev++] = (int *) (char *) b; + b = (char *) b + sizeof(stbds_array_header); + if (a == NULL) { + stbds_header(b)->length = 0; + stbds_header(b)->hash_table = 0; + stbds_header(b)->temp = 0; + } else { + STBDS_STATS(++stbds_array_grow); + } + stbds_header(b)->capacity = min_cap; + + return b; +} + +void stbds_arrfreef(void *a) +{ + STBDS_FREE(NULL, stbds_header(a)); +} + +// +// stbds_hm hash table implementation +// + +#ifdef STBDS_INTERNAL_SMALL_BUCKET +#define STBDS_BUCKET_LENGTH 4 +#else +#define STBDS_BUCKET_LENGTH 8 +#endif + +#define STBDS_BUCKET_SHIFT (STBDS_BUCKET_LENGTH == 8 ? 
3 : 2) +#define STBDS_BUCKET_MASK (STBDS_BUCKET_LENGTH-1) +#define STBDS_CACHE_LINE_SIZE 64 + +#define STBDS_ALIGN_FWD(n,a) (((n) + (a) - 1) & ~((a)-1)) + +typedef struct +{ + size_t hash [STBDS_BUCKET_LENGTH]; + ptrdiff_t index[STBDS_BUCKET_LENGTH]; +} stbds_hash_bucket; // in 32-bit, this is one 64-byte cache line; in 64-bit, each array is one 64-byte cache line + +typedef struct +{ + char * temp_key; // this MUST be the first field of the hash table + size_t slot_count; + size_t used_count; + size_t used_count_threshold; + size_t used_count_shrink_threshold; + size_t tombstone_count; + size_t tombstone_count_threshold; + size_t seed; + size_t slot_count_log2; + stbds_string_arena string; + stbds_hash_bucket *storage; // not a separate allocation, just 64-byte aligned storage after this struct +} stbds_hash_index; + +#define STBDS_INDEX_EMPTY -1 +#define STBDS_INDEX_DELETED -2 +#define STBDS_INDEX_IN_USE(x) ((x) >= 0) + +#define STBDS_HASH_EMPTY 0 +#define STBDS_HASH_DELETED 1 + +static size_t stbds_hash_seed=0x31415926; + +void stbds_rand_seed(size_t seed) +{ + stbds_hash_seed = seed; +} + +#define stbds_load_32_or_64(var, temp, v32, v64_hi, v64_lo) \ + temp = v64_lo ^ v32, temp <<= 16, temp <<= 16, temp >>= 16, temp >>= 16, /* discard if 32-bit */ \ + var = v64_hi, var <<= 16, var <<= 16, /* discard if 32-bit */ \ + var ^= temp ^ v32 + +#define STBDS_SIZE_T_BITS ((sizeof (size_t)) * 8) + +static size_t stbds_probe_position(size_t hash, size_t slot_count, size_t slot_log2) +{ + size_t pos; + STBDS_NOTUSED(slot_log2); + pos = hash & (slot_count-1); + #ifdef STBDS_INTERNAL_BUCKET_START + pos &= ~STBDS_BUCKET_MASK; + #endif + return pos; +} + +static size_t stbds_log2(size_t slot_count) +{ + size_t n=0; + while (slot_count > 1) { + slot_count >>= 1; + ++n; + } + return n; +} + +static stbds_hash_index *stbds_make_hash_index(size_t slot_count, stbds_hash_index *ot) +{ + stbds_hash_index *t; + t = (stbds_hash_index *) STBDS_REALLOC(NULL,0,(slot_count >> 
STBDS_BUCKET_SHIFT) * sizeof(stbds_hash_bucket) + sizeof(stbds_hash_index) + STBDS_CACHE_LINE_SIZE-1); + t->storage = (stbds_hash_bucket *) STBDS_ALIGN_FWD((size_t) (t+1), STBDS_CACHE_LINE_SIZE); + t->slot_count = slot_count; + t->slot_count_log2 = stbds_log2(slot_count); + t->tombstone_count = 0; + t->used_count = 0; + + #if 0 // A1 + t->used_count_threshold = slot_count*12/16; // if 12/16th of table is occupied, grow + t->tombstone_count_threshold = slot_count* 2/16; // if tombstones are 2/16th of table, rebuild + t->used_count_shrink_threshold = slot_count* 4/16; // if table is only 4/16th full, shrink + #elif 1 // A2 + //t->used_count_threshold = slot_count*12/16; // if 12/16th of table is occupied, grow + //t->tombstone_count_threshold = slot_count* 3/16; // if tombstones are 3/16th of table, rebuild + //t->used_count_shrink_threshold = slot_count* 4/16; // if table is only 4/16th full, shrink + + // compute without overflowing + t->used_count_threshold = slot_count - (slot_count>>2); + t->tombstone_count_threshold = (slot_count>>3) + (slot_count>>4); + t->used_count_shrink_threshold = slot_count >> 2; + + #elif 0 // B1 + t->used_count_threshold = slot_count*13/16; // if 13/16th of table is occupied, grow + t->tombstone_count_threshold = slot_count* 2/16; // if tombstones are 2/16th of table, rebuild + t->used_count_shrink_threshold = slot_count* 5/16; // if table is only 5/16th full, shrink + #else // C1 + t->used_count_threshold = slot_count*14/16; // if 14/16th of table is occupied, grow + t->tombstone_count_threshold = slot_count* 2/16; // if tombstones are 2/16th of table, rebuild + t->used_count_shrink_threshold = slot_count* 6/16; // if table is only 6/16th full, shrink + #endif + // Following statistics were measured on a Core i7-6700 @ 4.00Ghz, compiled with clang 7.0.1 -O2 + // Note that the larger tables have high variance as they were run fewer times + // A1 A2 B1 C1 + // 0.10ms : 0.10ms : 0.10ms : 0.11ms : 2,000 inserts creating 2K table + // 
0.96ms : 0.95ms : 0.97ms : 1.04ms : 20,000 inserts creating 20K table + // 14.48ms : 14.46ms : 10.63ms : 11.00ms : 200,000 inserts creating 200K table + // 195.74ms : 196.35ms : 203.69ms : 214.92ms : 2,000,000 inserts creating 2M table + // 2193.88ms : 2209.22ms : 2285.54ms : 2437.17ms : 20,000,000 inserts creating 20M table + // 65.27ms : 53.77ms : 65.33ms : 65.47ms : 500,000 inserts & deletes in 2K table + // 72.78ms : 62.45ms : 71.95ms : 72.85ms : 500,000 inserts & deletes in 20K table + // 89.47ms : 77.72ms : 96.49ms : 96.75ms : 500,000 inserts & deletes in 200K table + // 97.58ms : 98.14ms : 97.18ms : 97.53ms : 500,000 inserts & deletes in 2M table + // 118.61ms : 119.62ms : 120.16ms : 118.86ms : 500,000 inserts & deletes in 20M table + // 192.11ms : 194.39ms : 196.38ms : 195.73ms : 500,000 inserts & deletes in 200M table + + if (slot_count <= STBDS_BUCKET_LENGTH) + t->used_count_shrink_threshold = 0; + // to avoid infinite loop, we need to guarantee that at least one slot is empty and will terminate probes + STBDS_ASSERT(t->used_count_threshold + t->tombstone_count_threshold < t->slot_count); + STBDS_STATS(++stbds_hash_alloc); + if (ot) { + t->string = ot->string; + // reuse old seed so we can reuse old hashes so below "copy out old data" doesn't do any hashing + t->seed = ot->seed; + } else { + size_t a,b,temp; + memset(&t->string, 0, sizeof(t->string)); + t->seed = stbds_hash_seed; + // LCG + // in 32-bit, a = 2147001325 b = 715136305 + // in 64-bit, a = 2862933555777941757 b = 3037000493 + stbds_load_32_or_64(a,temp, 2147001325, 0x27bb2ee6, 0x87b0b0fd); + stbds_load_32_or_64(b,temp, 715136305, 0, 0xb504f32d); + stbds_hash_seed = stbds_hash_seed * a + b; + } + + { + size_t i,j; + for (i=0; i < slot_count >> STBDS_BUCKET_SHIFT; ++i) { + stbds_hash_bucket *b = &t->storage[i]; + for (j=0; j < STBDS_BUCKET_LENGTH; ++j) + b->hash[j] = STBDS_HASH_EMPTY; + for (j=0; j < STBDS_BUCKET_LENGTH; ++j) + b->index[j] = STBDS_INDEX_EMPTY; + } + } + + // copy out the old 
data, if any + if (ot) { + size_t i,j; + t->used_count = ot->used_count; + for (i=0; i < ot->slot_count >> STBDS_BUCKET_SHIFT; ++i) { + stbds_hash_bucket *ob = &ot->storage[i]; + for (j=0; j < STBDS_BUCKET_LENGTH; ++j) { + if (STBDS_INDEX_IN_USE(ob->index[j])) { + size_t hash = ob->hash[j]; + size_t pos = stbds_probe_position(hash, t->slot_count, t->slot_count_log2); + size_t step = STBDS_BUCKET_LENGTH; + STBDS_STATS(++stbds_rehash_items); + for (;;) { + size_t limit,z; + stbds_hash_bucket *bucket; + bucket = &t->storage[pos >> STBDS_BUCKET_SHIFT]; + STBDS_STATS(++stbds_rehash_probes); + + for (z=pos & STBDS_BUCKET_MASK; z < STBDS_BUCKET_LENGTH; ++z) { + if (bucket->hash[z] == 0) { + bucket->hash[z] = hash; + bucket->index[z] = ob->index[j]; + goto done; + } + } + + limit = pos & STBDS_BUCKET_MASK; + for (z = 0; z < limit; ++z) { + if (bucket->hash[z] == 0) { + bucket->hash[z] = hash; + bucket->index[z] = ob->index[j]; + goto done; + } + } + + pos += step; // quadratic probing + step += STBDS_BUCKET_LENGTH; + pos &= (t->slot_count-1); + } + } + done: + ; + } + } + } + + return t; +} + +#define STBDS_ROTATE_LEFT(val, n) (((val) << (n)) | ((val) >> (STBDS_SIZE_T_BITS - (n)))) +#define STBDS_ROTATE_RIGHT(val, n) (((val) >> (n)) | ((val) << (STBDS_SIZE_T_BITS - (n)))) + +size_t stbds_hash_string(char *str, size_t seed) +{ + size_t hash = seed; + while (*str) + hash = STBDS_ROTATE_LEFT(hash, 9) + (unsigned char) *str++; + + // Thomas Wang 64-to-32 bit mix function, hopefully also works in 32 bits + hash ^= seed; + hash = (~hash) + (hash << 18); + hash ^= hash ^ STBDS_ROTATE_RIGHT(hash,31); + hash = hash * 21; + hash ^= hash ^ STBDS_ROTATE_RIGHT(hash,11); + hash += (hash << 6); + hash ^= STBDS_ROTATE_RIGHT(hash,22); + return hash+seed; +} + +#ifdef STBDS_SIPHASH_2_4 +#define STBDS_SIPHASH_C_ROUNDS 2 +#define STBDS_SIPHASH_D_ROUNDS 4 +typedef int STBDS_SIPHASH_2_4_can_only_be_used_in_64_bit_builds[sizeof(size_t) == 8 ? 
1 : -1]; +#endif + +#ifndef STBDS_SIPHASH_C_ROUNDS +#define STBDS_SIPHASH_C_ROUNDS 1 +#endif +#ifndef STBDS_SIPHASH_D_ROUNDS +#define STBDS_SIPHASH_D_ROUNDS 1 +#endif + +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable:4127) // conditional expression is constant, for do..while(0) and sizeof()== +#endif + +static size_t stbds_siphash_bytes(void *p, size_t len, size_t seed) +{ + unsigned char *d = (unsigned char *) p; + size_t i,j; + size_t v0,v1,v2,v3, data; + + // hash that works on 32- or 64-bit registers without knowing which we have + // (computes different results on 32-bit and 64-bit platform) + // derived from siphash, but on 32-bit platforms very different as it uses 4 32-bit state not 4 64-bit + v0 = ((((size_t) 0x736f6d65 << 16) << 16) + 0x70736575) ^ seed; + v1 = ((((size_t) 0x646f7261 << 16) << 16) + 0x6e646f6d) ^ ~seed; + v2 = ((((size_t) 0x6c796765 << 16) << 16) + 0x6e657261) ^ seed; + v3 = ((((size_t) 0x74656462 << 16) << 16) + 0x79746573) ^ ~seed; + + #ifdef STBDS_TEST_SIPHASH_2_4 + // hardcoded with key material in the siphash test vectors + v0 ^= 0x0706050403020100ull ^ seed; + v1 ^= 0x0f0e0d0c0b0a0908ull ^ ~seed; + v2 ^= 0x0706050403020100ull ^ seed; + v3 ^= 0x0f0e0d0c0b0a0908ull ^ ~seed; + #endif + + #define STBDS_SIPROUND() \ + do { \ + v0 += v1; v1 = STBDS_ROTATE_LEFT(v1, 13); v1 ^= v0; v0 = STBDS_ROTATE_LEFT(v0,STBDS_SIZE_T_BITS/2); \ + v2 += v3; v3 = STBDS_ROTATE_LEFT(v3, 16); v3 ^= v2; \ + v2 += v1; v1 = STBDS_ROTATE_LEFT(v1, 17); v1 ^= v2; v2 = STBDS_ROTATE_LEFT(v2,STBDS_SIZE_T_BITS/2); \ + v0 += v3; v3 = STBDS_ROTATE_LEFT(v3, 21); v3 ^= v0; \ + } while (0) + + for (i=0; i+sizeof(size_t) <= len; i += sizeof(size_t), d += sizeof(size_t)) { + data = d[0] | (d[1] << 8) | (d[2] << 16) | (d[3] << 24); + data |= (size_t) (d[4] | (d[5] << 8) | (d[6] << 16) | (d[7] << 24)) << 16 << 16; // discarded if size_t == 4 + + v3 ^= data; + for (j=0; j < STBDS_SIPHASH_C_ROUNDS; ++j) + STBDS_SIPROUND(); + v0 ^= data; + } + data = len << 
(STBDS_SIZE_T_BITS-8); + switch (len - i) { + case 7: data |= ((size_t) d[6] << 24) << 24; // fall through + case 6: data |= ((size_t) d[5] << 20) << 20; // fall through + case 5: data |= ((size_t) d[4] << 16) << 16; // fall through + case 4: data |= (d[3] << 24); // fall through + case 3: data |= (d[2] << 16); // fall through + case 2: data |= (d[1] << 8); // fall through + case 1: data |= d[0]; // fall through + case 0: break; + } + v3 ^= data; + for (j=0; j < STBDS_SIPHASH_C_ROUNDS; ++j) + STBDS_SIPROUND(); + v0 ^= data; + v2 ^= 0xff; + for (j=0; j < STBDS_SIPHASH_D_ROUNDS; ++j) + STBDS_SIPROUND(); + +#ifdef STBDS_SIPHASH_2_4 + return v0^v1^v2^v3; +#else + return v1^v2^v3; // slightly stronger since v0^v3 in above cancels out final round operation? I tweeted at the authors of SipHash about this but they didn't reply +#endif +} + +size_t stbds_hash_bytes(void *p, size_t len, size_t seed) +{ +#ifdef STBDS_SIPHASH_2_4 + return stbds_siphash_bytes(p,len,seed); +#else + unsigned char *d = (unsigned char *) p; + + if (len == 4) { + unsigned int hash = d[0] | (d[1] << 8) | (d[2] << 16) | (d[3] << 24); + #if 0 + // HASH32-A Bob Jenkin's hash function w/o large constants + hash ^= seed; + hash -= (hash<<6); + hash ^= (hash>>17); + hash -= (hash<<9); + hash ^= seed; + hash ^= (hash<<4); + hash -= (hash<<3); + hash ^= (hash<<10); + hash ^= (hash>>15); + #elif 1 + // HASH32-BB Bob Jenkin's presumably-accidental version of Thomas Wang hash with rotates turned into shifts. + // Note that converting these back to rotates makes it run a lot slower, presumably due to collisions, so I'm + // not really sure what's going on. 
+ hash ^= seed; + hash = (hash ^ 61) ^ (hash >> 16); + hash = hash + (hash << 3); + hash = hash ^ (hash >> 4); + hash = hash * 0x27d4eb2d; + hash ^= seed; + hash = hash ^ (hash >> 15); + #else // HASH32-C - Murmur3 + hash ^= seed; + hash *= 0xcc9e2d51; + hash = (hash << 17) | (hash >> 15); + hash *= 0x1b873593; + hash ^= seed; + hash = (hash << 19) | (hash >> 13); + hash = hash*5 + 0xe6546b64; + hash ^= hash >> 16; + hash *= 0x85ebca6b; + hash ^= seed; + hash ^= hash >> 13; + hash *= 0xc2b2ae35; + hash ^= hash >> 16; + #endif + // Following statistics were measured on a Core i7-6700 @ 4.00Ghz, compiled with clang 7.0.1 -O2 + // Note that the larger tables have high variance as they were run fewer times + // HASH32-A // HASH32-BB // HASH32-C + // 0.10ms // 0.10ms // 0.10ms : 2,000 inserts creating 2K table + // 0.96ms // 0.95ms // 0.99ms : 20,000 inserts creating 20K table + // 14.69ms // 14.43ms // 14.97ms : 200,000 inserts creating 200K table + // 199.99ms // 195.36ms // 202.05ms : 2,000,000 inserts creating 2M table + // 2234.84ms // 2187.74ms // 2240.38ms : 20,000,000 inserts creating 20M table + // 55.68ms // 53.72ms // 57.31ms : 500,000 inserts & deletes in 2K table + // 63.43ms // 61.99ms // 65.73ms : 500,000 inserts & deletes in 20K table + // 80.04ms // 77.96ms // 81.83ms : 500,000 inserts & deletes in 200K table + // 100.42ms // 97.40ms // 102.39ms : 500,000 inserts & deletes in 2M table + // 119.71ms // 120.59ms // 121.63ms : 500,000 inserts & deletes in 20M table + // 185.28ms // 195.15ms // 187.74ms : 500,000 inserts & deletes in 200M table + // 15.58ms // 14.79ms // 15.52ms : 200,000 inserts creating 200K table with varying key spacing + + return (((size_t) hash << 16 << 16) | hash) ^ seed; + } else if (len == 8 && sizeof(size_t) == 8) { + size_t hash = d[0] | (d[1] << 8) | (d[2] << 16) | (d[3] << 24); + hash |= (size_t) (d[4] | (d[5] << 8) | (d[6] << 16) | (d[7] << 24)) << 16 << 16; // avoid warning if size_t == 4 + hash ^= seed; + hash = (~hash) + 
(hash << 21); + hash ^= STBDS_ROTATE_RIGHT(hash,24); + hash *= 265; + hash ^= STBDS_ROTATE_RIGHT(hash,14); + hash ^= seed; + hash *= 21; + hash ^= STBDS_ROTATE_RIGHT(hash,28); + hash += (hash << 31); + hash = (~hash) + (hash << 18); + return hash; + } else { + return stbds_siphash_bytes(p,len,seed); + } +#endif +} +#ifdef _MSC_VER +#pragma warning(pop) +#endif + + +static int stbds_is_key_equal(void *a, size_t elemsize, void *key, size_t keysize, size_t keyoffset, int mode, size_t i) +{ + if (mode >= STBDS_HM_STRING) + return 0==strcmp((char *) key, * (char **) ((char *) a + elemsize*i + keyoffset)); + else + return 0==memcmp(key, (char *) a + elemsize*i + keyoffset, keysize); +} + +#define STBDS_HASH_TO_ARR(x,elemsize) ((char*) (x) - (elemsize)) +#define STBDS_ARR_TO_HASH(x,elemsize) ((char*) (x) + (elemsize)) + +#define stbds_hash_table(a) ((stbds_hash_index *) stbds_header(a)->hash_table) + +void stbds_hmfree_func(void *a, size_t elemsize) +{ + if (a == NULL) return; + if (stbds_hash_table(a) != NULL) { + if (stbds_hash_table(a)->string.mode == STBDS_SH_STRDUP) { + size_t i; + // skip 0th element, which is default + for (i=1; i < stbds_header(a)->length; ++i) + STBDS_FREE(NULL, *(char**) ((char *) a + elemsize*i)); + } + stbds_strreset(&stbds_hash_table(a)->string); + } + STBDS_FREE(NULL, stbds_header(a)->hash_table); + STBDS_FREE(NULL, stbds_header(a)); +} + +static ptrdiff_t stbds_hm_find_slot(void *a, size_t elemsize, void *key, size_t keysize, size_t keyoffset, int mode) +{ + void *raw_a = STBDS_HASH_TO_ARR(a,elemsize); + stbds_hash_index *table = stbds_hash_table(raw_a); + size_t hash = mode >= STBDS_HM_STRING ? 
stbds_hash_string((char*)key,table->seed) : stbds_hash_bytes(key, keysize,table->seed); + size_t step = STBDS_BUCKET_LENGTH; + size_t limit,i; + size_t pos; + stbds_hash_bucket *bucket; + + if (hash < 2) hash += 2; // stored hash values are forbidden from being 0, so we can detect empty slots + + pos = stbds_probe_position(hash, table->slot_count, table->slot_count_log2); + + for (;;) { + STBDS_STATS(++stbds_hash_probes); + bucket = &table->storage[pos >> STBDS_BUCKET_SHIFT]; + + // start searching from pos to end of bucket, this should help performance on small hash tables that fit in cache + for (i=pos & STBDS_BUCKET_MASK; i < STBDS_BUCKET_LENGTH; ++i) { + if (bucket->hash[i] == hash) { + if (stbds_is_key_equal(a, elemsize, key, keysize, keyoffset, mode, bucket->index[i])) { + return (pos & ~STBDS_BUCKET_MASK)+i; + } + } else if (bucket->hash[i] == STBDS_HASH_EMPTY) { + return -1; + } + } + + // search from beginning of bucket to pos + limit = pos & STBDS_BUCKET_MASK; + for (i = 0; i < limit; ++i) { + if (bucket->hash[i] == hash) { + if (stbds_is_key_equal(a, elemsize, key, keysize, keyoffset, mode, bucket->index[i])) { + return (pos & ~STBDS_BUCKET_MASK)+i; + } + } else if (bucket->hash[i] == STBDS_HASH_EMPTY) { + return -1; + } + } + + // quadratic probing + pos += step; + step += STBDS_BUCKET_LENGTH; + pos &= (table->slot_count-1); + } + /* NOTREACHED */ +} + +void * stbds_hmget_key_ts(void *a, size_t elemsize, void *key, size_t keysize, ptrdiff_t *temp, int mode) +{ + size_t keyoffset = 0; + if (a == NULL) { + // make it non-empty so we can return a temp + a = stbds_arrgrowf(0, elemsize, 0, 1); + stbds_header(a)->length += 1; + memset(a, 0, elemsize); + *temp = STBDS_INDEX_EMPTY; + // adjust a to point after the default element + return STBDS_ARR_TO_HASH(a,elemsize); + } else { + stbds_hash_index *table; + void *raw_a = STBDS_HASH_TO_ARR(a,elemsize); + // adjust a to point to the default element + table = (stbds_hash_index *) stbds_header(raw_a)->hash_table; 
+ if (table == 0) { + *temp = -1; + } else { + ptrdiff_t slot = stbds_hm_find_slot(a, elemsize, key, keysize, keyoffset, mode); + if (slot < 0) { + *temp = STBDS_INDEX_EMPTY; + } else { + stbds_hash_bucket *b = &table->storage[slot >> STBDS_BUCKET_SHIFT]; + *temp = b->index[slot & STBDS_BUCKET_MASK]; + } + } + return a; + } +} + +void * stbds_hmget_key(void *a, size_t elemsize, void *key, size_t keysize, int mode) +{ + ptrdiff_t temp; + void *p = stbds_hmget_key_ts(a, elemsize, key, keysize, &temp, mode); + stbds_temp(STBDS_HASH_TO_ARR(p,elemsize)) = temp; + return p; +} + +void * stbds_hmput_default(void *a, size_t elemsize) +{ + // three cases: + // a is NULL <- allocate + // a has a hash table but no entries, because of shmode <- grow + // a has entries <- do nothing + if (a == NULL || stbds_header(STBDS_HASH_TO_ARR(a,elemsize))->length == 0) { + a = stbds_arrgrowf(a ? STBDS_HASH_TO_ARR(a,elemsize) : NULL, elemsize, 0, 1); + stbds_header(a)->length += 1; + memset(a, 0, elemsize); + a=STBDS_ARR_TO_HASH(a,elemsize); + } + return a; +} + +static char *stbds_strdup(char *str); + +void *stbds_hmput_key(void *a, size_t elemsize, void *key, size_t keysize, int mode) +{ + size_t keyoffset=0; + void *raw_a; + stbds_hash_index *table; + + if (a == NULL) { + a = stbds_arrgrowf(0, elemsize, 0, 1); + memset(a, 0, elemsize); + stbds_header(a)->length += 1; + // adjust a to point AFTER the default element + a = STBDS_ARR_TO_HASH(a,elemsize); + } + + // adjust a to point to the default element + raw_a = a; + a = STBDS_HASH_TO_ARR(a,elemsize); + + table = (stbds_hash_index *) stbds_header(a)->hash_table; + + if (table == NULL || table->used_count >= table->used_count_threshold) { + stbds_hash_index *nt; + size_t slot_count; + + slot_count = (table == NULL) ? STBDS_BUCKET_LENGTH : table->slot_count*2; + nt = stbds_make_hash_index(slot_count, table); + if (table) + STBDS_FREE(NULL, table); + else + nt->string.mode = mode >= STBDS_HM_STRING ? 
STBDS_SH_DEFAULT : 0; + stbds_header(a)->hash_table = table = nt; + STBDS_STATS(++stbds_hash_grow); + } + + // we iterate hash table explicitly because we want to track if we saw a tombstone + { + size_t hash = mode >= STBDS_HM_STRING ? stbds_hash_string((char*)key,table->seed) : stbds_hash_bytes(key, keysize,table->seed); + size_t step = STBDS_BUCKET_LENGTH; + size_t pos; + ptrdiff_t tombstone = -1; + stbds_hash_bucket *bucket; + + // stored hash values are forbidden from being 0, so we can detect empty slots to early out quickly + if (hash < 2) hash += 2; + + pos = stbds_probe_position(hash, table->slot_count, table->slot_count_log2); + + for (;;) { + size_t limit, i; + STBDS_STATS(++stbds_hash_probes); + bucket = &table->storage[pos >> STBDS_BUCKET_SHIFT]; + + // start searching from pos to end of bucket + for (i=pos & STBDS_BUCKET_MASK; i < STBDS_BUCKET_LENGTH; ++i) { + if (bucket->hash[i] == hash) { + if (stbds_is_key_equal(raw_a, elemsize, key, keysize, keyoffset, mode, bucket->index[i])) { + stbds_temp(a) = bucket->index[i]; + if (mode >= STBDS_HM_STRING) + stbds_temp_key(a) = * (char **) ((char *) raw_a + elemsize*bucket->index[i] + keyoffset); + return STBDS_ARR_TO_HASH(a,elemsize); + } + } else if (bucket->hash[i] == 0) { + pos = (pos & ~STBDS_BUCKET_MASK) + i; + goto found_empty_slot; + } else if (tombstone < 0) { + if (bucket->index[i] == STBDS_INDEX_DELETED) + tombstone = (ptrdiff_t) ((pos & ~STBDS_BUCKET_MASK) + i); + } + } + + // search from beginning of bucket to pos + limit = pos & STBDS_BUCKET_MASK; + for (i = 0; i < limit; ++i) { + if (bucket->hash[i] == hash) { + if (stbds_is_key_equal(raw_a, elemsize, key, keysize, keyoffset, mode, bucket->index[i])) { + stbds_temp(a) = bucket->index[i]; + return STBDS_ARR_TO_HASH(a,elemsize); + } + } else if (bucket->hash[i] == 0) { + pos = (pos & ~STBDS_BUCKET_MASK) + i; + goto found_empty_slot; + } else if (tombstone < 0) { + if (bucket->index[i] == STBDS_INDEX_DELETED) + tombstone = (ptrdiff_t) ((pos & 
~STBDS_BUCKET_MASK) + i); + } + } + + // quadratic probing + pos += step; + step += STBDS_BUCKET_LENGTH; + pos &= (table->slot_count-1); + } + found_empty_slot: + if (tombstone >= 0) { + pos = tombstone; + --table->tombstone_count; + } + ++table->used_count; + + { + ptrdiff_t i = (ptrdiff_t) stbds_arrlen(a); + // we want to do stbds_arraddn(1), but we can't use the macros since we don't have something of the right type + if ((size_t) i+1 > stbds_arrcap(a)) + *(void **) &a = stbds_arrgrowf(a, elemsize, 1, 0); + raw_a = STBDS_ARR_TO_HASH(a,elemsize); + + STBDS_ASSERT((size_t) i+1 <= stbds_arrcap(a)); + stbds_header(a)->length = i+1; + bucket = &table->storage[pos >> STBDS_BUCKET_SHIFT]; + bucket->hash[pos & STBDS_BUCKET_MASK] = hash; + bucket->index[pos & STBDS_BUCKET_MASK] = i-1; + stbds_temp(a) = i-1; + + switch (table->string.mode) { + case STBDS_SH_STRDUP: stbds_temp_key(a) = *(char **) ((char *) a + elemsize*i) = stbds_strdup((char*) key); break; + case STBDS_SH_ARENA: stbds_temp_key(a) = *(char **) ((char *) a + elemsize*i) = stbds_stralloc(&table->string, (char*)key); break; + case STBDS_SH_DEFAULT: stbds_temp_key(a) = *(char **) ((char *) a + elemsize*i) = (char *) key; break; + default: memcpy((char *) a + elemsize*i, key, keysize); break; + } + } + return STBDS_ARR_TO_HASH(a,elemsize); + } +} + +void * stbds_shmode_func(size_t elemsize, int mode) +{ + void *a = stbds_arrgrowf(0, elemsize, 0, 1); + stbds_hash_index *h; + memset(a, 0, elemsize); + stbds_header(a)->length = 1; + stbds_header(a)->hash_table = h = (stbds_hash_index *) stbds_make_hash_index(STBDS_BUCKET_LENGTH, NULL); + h->string.mode = (unsigned char) mode; + return STBDS_ARR_TO_HASH(a,elemsize); +} + +void * stbds_hmdel_key(void *a, size_t elemsize, void *key, size_t keysize, size_t keyoffset, int mode) +{ + if (a == NULL) { + return 0; + } else { + stbds_hash_index *table; + void *raw_a = STBDS_HASH_TO_ARR(a,elemsize); + table = (stbds_hash_index *) stbds_header(raw_a)->hash_table; + 
stbds_temp(raw_a) = 0; + if (table == 0) { + return a; + } else { + ptrdiff_t slot; + slot = stbds_hm_find_slot(a, elemsize, key, keysize, keyoffset, mode); + if (slot < 0) + return a; + else { + stbds_hash_bucket *b = &table->storage[slot >> STBDS_BUCKET_SHIFT]; + int i = slot & STBDS_BUCKET_MASK; + ptrdiff_t old_index = b->index[i]; + ptrdiff_t final_index = (ptrdiff_t) stbds_arrlen(raw_a)-1-1; // minus one for the raw_a vs a, and minus one for 'last' + STBDS_ASSERT(slot < (ptrdiff_t) table->slot_count); + --table->used_count; + ++table->tombstone_count; + stbds_temp(raw_a) = 1; + STBDS_ASSERT(table->used_count >= 0); + //STBDS_ASSERT(table->tombstone_count < table->slot_count/4); + b->hash[i] = STBDS_HASH_DELETED; + b->index[i] = STBDS_INDEX_DELETED; + + if (mode == STBDS_HM_STRING && table->string.mode == STBDS_SH_STRDUP) + STBDS_FREE(NULL, *(char**) ((char *) a+elemsize*old_index)); + + // if indices are the same, memcpy is a no-op, but back-pointer-fixup will fail, so skip + if (old_index != final_index) { + // swap delete + memmove((char*) a + elemsize*old_index, (char*) a + elemsize*final_index, elemsize); + + // now find the slot for the last element + if (mode == STBDS_HM_STRING) + slot = stbds_hm_find_slot(a, elemsize, *(char**) ((char *) a+elemsize*old_index + keyoffset), keysize, keyoffset, mode); + else + slot = stbds_hm_find_slot(a, elemsize, (char* ) a+elemsize*old_index + keyoffset, keysize, keyoffset, mode); + STBDS_ASSERT(slot >= 0); + b = &table->storage[slot >> STBDS_BUCKET_SHIFT]; + i = slot & STBDS_BUCKET_MASK; + STBDS_ASSERT(b->index[i] == final_index); + b->index[i] = old_index; + } + stbds_header(raw_a)->length -= 1; + + if (table->used_count < table->used_count_shrink_threshold && table->slot_count > STBDS_BUCKET_LENGTH) { + stbds_header(raw_a)->hash_table = stbds_make_hash_index(table->slot_count>>1, table); + STBDS_FREE(NULL, table); + STBDS_STATS(++stbds_hash_shrink); + } else if (table->tombstone_count > 
// Returns a copy of the NUL-terminated string `str`, allocated through
// STBDS_REALLOC so that a user-replaced allocator is honoured. The library
// deliberately avoids strdup (and the strdup vs _strdup naming problem).
static char *stbds_strdup(char *str)
{
  size_t nbytes = strlen(str) + 1;   // include the terminator
  char *copy = (char*) STBDS_REALLOC(NULL, 0, nbytes);
  return (char*) memcpy(copy, str, nbytes);
}
static char buffer[256];

// Formats the test key "test_<n>" into the shared static `buffer` and returns
// that buffer. Every call overwrites the previous result, so callers that need
// to keep a key must copy it first. The write is bounded by sizeof(buffer).
char *strkey(int n)
{
#if defined(_WIN32) && defined(__STDC_WANT_SECURE_LIB__)
   sprintf_s(buffer, sizeof(buffer), "test_%d", n);
#else
   snprintf(buffer, sizeof(buffer), "test_%d", n);
#endif
   return buffer;
}
+ STBDS_ASSERT(0); +#else + const int testsize = 100000; + const int testsize2 = testsize/20; + int *arr=NULL; + struct { int key; int value; } *intmap = NULL; + struct { char *key; int value; } *strmap = NULL, s; + struct { stbds_struct key; int value; } *map = NULL; + stbds_struct *map2 = NULL; + stbds_struct2 *map3 = NULL; + stbds_string_arena sa = { 0 }; + int key3[2] = { 1,2 }; + ptrdiff_t temp; + + int i,j; + + STBDS_ASSERT(arrlen(arr)==0); + for (i=0; i < 20000; i += 50) { + for (j=0; j < i; ++j) + arrpush(arr,j); + arrfree(arr); + } + + for (i=0; i < 4; ++i) { + arrpush(arr,1); arrpush(arr,2); arrpush(arr,3); arrpush(arr,4); + arrdel(arr,i); + arrfree(arr); + arrpush(arr,1); arrpush(arr,2); arrpush(arr,3); arrpush(arr,4); + arrdelswap(arr,i); + arrfree(arr); + } + + for (i=0; i < 5; ++i) { + arrpush(arr,1); arrpush(arr,2); arrpush(arr,3); arrpush(arr,4); + stbds_arrins(arr,i,5); + STBDS_ASSERT(arr[i] == 5); + if (i < 4) + STBDS_ASSERT(arr[4] == 4); + arrfree(arr); + } + + i = 1; + STBDS_ASSERT(hmgeti(intmap,i) == -1); + hmdefault(intmap, -2); + STBDS_ASSERT(hmgeti(intmap, i) == -1); + STBDS_ASSERT(hmget (intmap, i) == -2); + for (i=0; i < testsize; i+=2) + hmput(intmap, i, i*5); + for (i=0; i < testsize; i+=1) { + if (i & 1) STBDS_ASSERT(hmget(intmap, i) == -2 ); + else STBDS_ASSERT(hmget(intmap, i) == i*5); + if (i & 1) STBDS_ASSERT(hmget_ts(intmap, i, temp) == -2 ); + else STBDS_ASSERT(hmget_ts(intmap, i, temp) == i*5); + } + for (i=0; i < testsize; i+=2) + hmput(intmap, i, i*3); + for (i=0; i < testsize; i+=1) + if (i & 1) STBDS_ASSERT(hmget(intmap, i) == -2 ); + else STBDS_ASSERT(hmget(intmap, i) == i*3); + for (i=2; i < testsize; i+=4) + hmdel(intmap, i); // delete half the entries + for (i=0; i < testsize; i+=1) + if (i & 3) STBDS_ASSERT(hmget(intmap, i) == -2 ); + else STBDS_ASSERT(hmget(intmap, i) == i*3); + for (i=0; i < testsize; i+=1) + hmdel(intmap, i); // delete the rest of the entries + for (i=0; i < testsize; i+=1) + 
STBDS_ASSERT(hmget(intmap, i) == -2 ); + hmfree(intmap); + for (i=0; i < testsize; i+=2) + hmput(intmap, i, i*3); + hmfree(intmap); + + #if defined(__clang__) || defined(__GNUC__) + #ifndef __cplusplus + intmap = NULL; + hmput(intmap, 15, 7); + hmput(intmap, 11, 3); + hmput(intmap, 9, 5); + STBDS_ASSERT(hmget(intmap, 9) == 5); + STBDS_ASSERT(hmget(intmap, 11) == 3); + STBDS_ASSERT(hmget(intmap, 15) == 7); + #endif + #endif + + for (i=0; i < testsize; ++i) + stralloc(&sa, strkey(i)); + strreset(&sa); + + { + s.key = "a", s.value = 1; + shputs(strmap, s); + STBDS_ASSERT(*strmap[0].key == 'a'); + STBDS_ASSERT(strmap[0].key == s.key); + STBDS_ASSERT(strmap[0].value == s.value); + shfree(strmap); + } + + { + s.key = "a", s.value = 1; + sh_new_strdup(strmap); + shputs(strmap, s); + STBDS_ASSERT(*strmap[0].key == 'a'); + STBDS_ASSERT(strmap[0].key != s.key); + STBDS_ASSERT(strmap[0].value == s.value); + shfree(strmap); + } + + { + s.key = "a", s.value = 1; + sh_new_arena(strmap); + shputs(strmap, s); + STBDS_ASSERT(*strmap[0].key == 'a'); + STBDS_ASSERT(strmap[0].key != s.key); + STBDS_ASSERT(strmap[0].value == s.value); + shfree(strmap); + } + + for (j=0; j < 2; ++j) { + STBDS_ASSERT(shgeti(strmap,"foo") == -1); + if (j == 0) + sh_new_strdup(strmap); + else + sh_new_arena(strmap); + STBDS_ASSERT(shgeti(strmap,"foo") == -1); + shdefault(strmap, -2); + STBDS_ASSERT(shgeti(strmap,"foo") == -1); + for (i=0; i < testsize; i+=2) + shput(strmap, strkey(i), i*3); + for (i=0; i < testsize; i+=1) + if (i & 1) STBDS_ASSERT(shget(strmap, strkey(i)) == -2 ); + else STBDS_ASSERT(shget(strmap, strkey(i)) == i*3); + for (i=2; i < testsize; i+=4) + shdel(strmap, strkey(i)); // delete half the entries + for (i=0; i < testsize; i+=1) + if (i & 3) STBDS_ASSERT(shget(strmap, strkey(i)) == -2 ); + else STBDS_ASSERT(shget(strmap, strkey(i)) == i*3); + for (i=0; i < testsize; i+=1) + shdel(strmap, strkey(i)); // delete the rest of the entries + for (i=0; i < testsize; i+=1) + 
STBDS_ASSERT(shget(strmap, strkey(i)) == -2 ); + shfree(strmap); + } + + { + struct { char *key; char value; } *hash = NULL; + char name[4] = "jen"; + shput(hash, "bob" , 'h'); + shput(hash, "sally" , 'e'); + shput(hash, "fred" , 'l'); + shput(hash, "jen" , 'x'); + shput(hash, "doug" , 'o'); + + shput(hash, name , 'l'); + shfree(hash); + } + + for (i=0; i < testsize; i += 2) { + stbds_struct s = { i,i*2,i*3,i*4 }; + hmput(map, s, i*5); + } + + for (i=0; i < testsize; i += 1) { + stbds_struct s = { i,i*2,i*3 ,i*4 }; + stbds_struct t = { i,i*2,i*3+1,i*4 }; + if (i & 1) STBDS_ASSERT(hmget(map, s) == 0); + else STBDS_ASSERT(hmget(map, s) == i*5); + if (i & 1) STBDS_ASSERT(hmget_ts(map, s, temp) == 0); + else STBDS_ASSERT(hmget_ts(map, s, temp) == i*5); + //STBDS_ASSERT(hmget(map, t.key) == 0); + } + + for (i=0; i < testsize; i += 2) { + stbds_struct s = { i,i*2,i*3,i*4 }; + hmputs(map2, s); + } + hmfree(map); + + for (i=0; i < testsize; i += 1) { + stbds_struct s = { i,i*2,i*3,i*4 }; + stbds_struct t = { i,i*2,i*3+1,i*4 }; + if (i & 1) STBDS_ASSERT(hmgets(map2, s.key).d == 0); + else STBDS_ASSERT(hmgets(map2, s.key).d == i*4); + //STBDS_ASSERT(hmgetp(map2, t.key) == 0); + } + hmfree(map2); + + for (i=0; i < testsize; i += 2) { + stbds_struct2 s = { { i,i*2 }, i*3,i*4, i*5 }; + hmputs(map3, s); + } + for (i=0; i < testsize; i += 1) { + stbds_struct2 s = { { i,i*2}, i*3, i*4, i*5 }; + stbds_struct2 t = { { i,i*2}, i*3+1, i*4, i*5 }; + if (i & 1) STBDS_ASSERT(hmgets(map3, s.key).d == 0); + else STBDS_ASSERT(hmgets(map3, s.key).d == i*5); + //STBDS_ASSERT(hmgetp(map3, t.key) == 0); + } +#endif +} +#endif + + +/* +------------------------------------------------------------------------------ +This software is available under 2 licenses -- choose whichever you prefer. 
+------------------------------------------------------------------------------ +ALTERNATIVE A - MIT License +Copyright (c) 2019 Sean Barrett +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +------------------------------------------------------------------------------ +ALTERNATIVE B - Public Domain (www.unlicense.org) +This is free and unencumbered software released into the public domain. +Anyone is free to copy, modify, publish, use, compile, sell, or distribute this +software, either in source code form or as a compiled binary, for any purpose, +commercial or non-commercial, and by any means. +In jurisdictions that recognize copyright laws, the author or authors of this +software dedicate any and all copyright interest in the software to the public +domain. We make this dedication for the benefit of the public at large and to +the detriment of our heirs and successors. 
/* SGR (Select Graphic Rendition) sequence; `c` must be a string literal */
#define ANSI_C(c) "\x1b[" c "m"
/* 8bit palette color (foreground / background) */
#define ANSI_8C(c) "\x1b[38;5;" c "m"
#define ANSI_8CB(c) "\x1b[48;5;" c "m"
/* True color (foreground / background); r, g, b are string literals */
#define ANSI_RC(r,g,b) "\x1b[38;2;" r ";" g ";" b "m"
#define ANSI_RCB(r,g,b) "\x1b[48;2;" r ";" g ";" b "m"
#define ANSI_RESET "\x1b[0m"
#define ANSI_BOLD "\x1b[1m"
/* Yields `seq` when `cond` is true and the empty string otherwise.
 * `cond` is parenthesized so that conditional or assignment expressions
 * passed as the condition keep their intended grouping. */
#define ANSI_IF(cond, seq) ((cond) ? (seq) : "")
{ \ + va_list args; \ + va_start(args, s); \ + print_diagnostic(cm, loc, diagtype, s, args); \ + va_end(args); \ + after; \ + } + +void +print_diagnostic(Compiler *cm, const Location *loc, DiagType dt, const char *msg, va_list args) +{ + static const char *ds[] = {"fatal", "error", "warning", "note"}; + static const char *dsc[] = { + ANSI_C("1;90"), ANSI_C("1;31"), ANSI_C("1;35"), ANSI_C("1;34") + }; + + char fmsg[4096] = {0}; + char dmsg[32] = {0}; + bool color = cm != nil ? cm->opts.color : false; + + if (dt == diag_error && cm->error_count < cm->opts.max_errors) + ++cm->error_count; + + vsnprintf(fmsg, sizeof(fmsg), msg, args); + snprintf(dmsg, sizeof(dmsg), "%s%s:%s", ANSI_IF(color, dsc[dt]), ds[dt], ANSI_IF(color, ANSI_RESET)); + + if (loc != nil) { + fprintf(stderr, "(%s:%li:%li) %s %s\n", + loc->source.s, loc->line, loc->column, dmsg, fmsg); + } else { + fprintf(stderr, "%s %s\n", dmsg, fmsg); + } +} + +make_diag_func(fatal, diag_fatal, exit(EXIT_FAILURE)) +make_diag_func(error, diag_error, ) +make_diag_func(warning, diag_warning, ) +make_diag_func(note, diag_note, ) diff --git a/compiler/messages.h b/compiler/messages.h new file mode 100644 index 0000000..4f0160b --- /dev/null +++ b/compiler/messages.h @@ -0,0 +1,30 @@ +#ifndef _messages_h_ +#define _messages_h_ + +#include +#include "state.h" +#include "location.h" + +#ifdef __GNUC__ +# define fmtattr(archt, fmtsi, ftchk) __attribute((format(archt, fmtsi, ftchk))) +#else +# define fmtattr(a, b, c) +#endif + +typedef enum +{ + diag_fatal = 0, + diag_error, + diag_warning, + diag_note, +} DiagType; + +void +fatal(Compiler *cm, const Location *loc, const char *s, ...) fmtattr(printf, 3, 4); +void +error(Compiler *cm, const Location *loc, const char *s, ...) fmtattr(printf, 3, 4); +void +warning(Compiler *cm, const Location *loc, const char *s, ...) fmtattr(printf, 3, 4); +void +note(Compiler *cm, const Location *loc, const char *s, ...) 
/* Consume a token and match it against `tokt`.
 * Note: `lexer` is evaluated twice. */
#define next_match(lexer, tokt) \
	do { \
		LexToken next_match_tok_ = lex_scan((lexer)); \
		lex_match((lexer), &next_match_tok_, (tokt)); \
	} while (0)

/* Scans a token (mutating `t`), and if its id matches `ttype`,
 * it executes the code block that follows the macro. Otherwise, the scanned
 * token gets put back (so a next call to `lex_scan` can pick it up).
 * The macro ends in a dangling `else`, so it must be followed by a statement.
 */
#define matchopt(t, ttype, ps) \
	if (((t) = lex_scan((ps)->lexer)).id != (ttype)) { \
		lex_backup((ps)->lexer, (t)); \
	} else

/* Token classification. The argument is parenthesized so that any expression
 * can be passed; it is still evaluated more than once. */
#define token_is_binop(t) ((t) >= T_PLUS && (t) <= T_NOTEQUAL)
#define token_is_atom(t) ((t) >= T_IDENT && (t) <= T_DECNUMBER)
#define token_is_unary(t) ((t) == T_MINUS || (t) == T_LOGNOT)
#define token_is_expr_start(t) (token_is_unary(t) || token_is_atom(t))
/* Reports an error at the lexer's current location and marks the parse as failed. */
#define parse_error(ctx, ...) \
	do { error((ctx)->cm, &((ctx)->lexer->cur_loc), __VA_ARGS__); (ctx)->ok = false; } while (0)
+ */ +const OperatorPrec OperatorTable[] = { + [T_LOGOR] = {1, true}, + [T_LOGAND] = {2, true}, + [T_LESSTHAN] = {3, true}, + [T_GREATTHAN] = {3, true}, + [T_LOGICEQUAL] = {3, true}, + [T_NOTEQUAL] = {3, true}, + [T_PLUS] = {4, true}, + [T_MINUS] = {4, true}, + [T_STAR] = {5, true}, + [T_BAR] = {5, true}, +}; + +static Ast * +expr(ParserState *ps, int minprec); +static Ast * +expr_comma_list(ParserState *ps); +static Ast * +stmt(ParserState *ps, LexToken token); +static Ast * +stmt_list_until(ParserState *ps, bool putback, const enum LexTokenId *end_markers, isize len); + + +static Ast * +make_tree(enum AstType type, Location loc) +{ + Ast *tree = calloc(1, sizeof(Ast)); + tree->type = type; + tree->loc = loc; + return tree; +} + +static Ast * +make_binop(enum LexTokenId op, Location loc, Ast *lhs, Ast *rhs) +{ + Ast *tree = make_tree(AST_BINEXPR, loc); + tree->bin.op = Str_from_c(TokenIdStr[op]); + tree->bin.left = lhs; + tree->bin.right = rhs; + return tree; +} + +static Ast * +make_ident_node(Str ident, Location loc) +{ + Ast *tree = make_tree(AST_IDENT, loc); + tree->ident = ident; + return tree; +} + +static OptAstIdentTypePair +ident_type_pair(ParserState *ps) +{ + AstIdentTypePair itp = { .loc = ps->lexer->cur_loc }; + /* ident */ + LexToken token = lex_scan(ps->lexer); + lex_match(ps->lexer, &token, T_IDENT); + itp.ident = token.ident; + /* type */ + next_match(ps->lexer, T_COLON); + /* optional qualifier */ + token = lex_scan(ps->lexer); + if (token.id == T_VAR) { + itp.kind = SymVar; + } else { + itp.kind = SymLet; + lex_backup(ps->lexer, token); + } + itp.dtype_loc = ps->lexer->cur_loc; + token = lex_scan(ps->lexer); + if (token.id != T_IDENT) { + parse_error(ps, "expected a type, got %s instead", TokenIdStr[token.id]); + return None(OptAstIdentTypePair); + } + itp.dtype = token.ident; + return Some(OptAstIdentTypePair, itp); +} + +static Vec(AstIdentTypePair) +proc_arglist(ParserState *ps) +{ + Vec(AstIdentTypePair) args = nil; + LexToken next; + + for 
(;;) { + OptAstIdentTypePair oitp = ident_type_pair(ps); + if (!oitp.ok) + return nil; + if (arrlen(args) + 1 > MAX_PROC_ARG_COUNT) { + parse_error(ps, "more than %d (implementation limit) proc arguments", MAX_PROC_ARG_COUNT); + return nil; + } + + arrput(args, oitp.val); + next = lex_scan(ps->lexer); + /* do we have a comma? if not, we reached the end of the list */ + if (next.id != T_COMMA) + break; + /* check if we have an expression next to this comma, we do this + * to allow a trailling comma + */ + next = lex_scan(ps->lexer); + if (next.id != T_IDENT) + break; + lex_backup(ps->lexer, next); + } + trace("token in arglist out: %s\n", TokenIdStr[next.id]); + lex_backup(ps->lexer, next); + + if (arrlen(args) == 0) { + arrfree(args); + return nil; + } + return args; +} + +static Ast * +proc_decl(ParserState *ps) +{ + LexToken proc_name = lex_scan(ps->lexer); + lex_match(ps->lexer, &proc_name, T_IDENT); + + Ast *proc = make_tree(AST_PROCDEF, ps->lexer->cur_loc); + proc->proc.name = proc_name.ident; + trace("proc name: %s\n", proc->proc.name.s); + + LexToken token = lex_scan(ps->lexer); + if (token.id == T_STAR) { + proc->proc.ispublic = true; + token = lex_scan(ps->lexer); + } + + lex_match(ps->lexer, &token, T_LPAREN); + token = lex_scan(ps->lexer); + if (token.id != T_RPAREN) { + lex_backup(ps->lexer, token); + proc->proc.args = proc_arglist(ps); + token = lex_scan(ps->lexer); + } + lex_match(ps->lexer, &token, T_RPAREN); + + /* return type */ + token = lex_scan(ps->lexer); + if (token.id == T_COLON) { + token = lex_scan(ps->lexer); + lex_match(ps->lexer, &token, T_IDENT); + proc->proc.rettype = make_ident_node(token.ident, ps->lexer->cur_loc); + } else { + lex_backup(ps->lexer, token); + } + /* body */ + proc->proc.body = stmt_list_until(ps, false, (enum LexTokenId[]){T_END}, 1); + return proc; +} + +static Ast * +function_call(ParserState *ps, Str ident, bool ate_lp) +{ + Ast *funcc = make_tree(AST_PROCCALL, ps->lexer->cur_loc); + funcc->call = (AstProcCall){ 
.name = ident }; + + if (!ate_lp) + next_match(ps->lexer, T_LPAREN); + + LexToken next = lex_scan(ps->lexer); + if (token_is_expr_start(next.id)) { + lex_backup(ps->lexer, next); + funcc->call.args = expr_comma_list(ps); + } else { + lex_backup(ps->lexer, next); + } + next_match(ps->lexer, T_RPAREN); + trace("function call to: %s\n", ident.s); + + return funcc; +} + +static Ast * +variable_assign(ParserState *ps, Str ident, Location loc) +{ + Ast *tree = make_tree(AST_VARASSIGN, loc); + tree->varassgn.name = ident; + tree->varassgn.expr = expr(ps, EXPR_INIT_PREC); + return tree; +} + +static Ast * +funccall_or_assignment(ParserState *ps, Str ident) +{ + LexToken token; + matchopt(token, T_EQUAL, ps) { + return variable_assign(ps, ident, ps->lexer->cur_loc); + } + return function_call(ps, ident, false); +} + +static Ast * +variable_decl(ParserState *ps, enum LexTokenId decl_kind) +{ + static const enum SymbolKind Token2SemaVarKind[] = { + [T_LET] = SymLet, + [T_VAR] = SymVar, + [T_CONST] = SymConst, + }; + Assert(decl_kind == T_LET || decl_kind == T_VAR || decl_kind == T_CONST); + + LexToken token = lex_scan(ps->lexer); + lex_match(ps->lexer, &token, T_IDENT); + + Ast *decl = make_tree(AST_VARDECL, ps->lexer->cur_loc); + decl->var = (AstVarDecl) { + .name = token.ident, + .kind = Token2SemaVarKind[decl_kind], + }; + + /* type */ + matchopt(token, T_COLON, ps) { + token = lex_scan(ps->lexer); + if (token.id != T_IDENT) { + parse_error(ps, "expected a type, got %s instead", TokenIdStr[token.id]); + return nil; + } + decl->var.datatype = make_ident_node(token.ident, ps->lexer->cur_loc); + } + + /* assignment expression */ + matchopt(token, T_EQUAL, ps) { + trace("assignment of decl here\n"); + decl->var.expr = expr(ps, EXPR_INIT_PREC); + } + trace( + "var decl %s %s: %s\n", + TokenIdStr[decl_kind], + decl->var.name.s, + decl->var.datatype != nil ? 
(char *)decl->var.datatype->ident.s : "(no type)" + ); + /* if there's no type there must be an expr */ + /* TODO: move to semantic analysis phase? */ + if (decl->var.datatype == nil && decl->var.expr == nil) { + parse_error( + ps, + "'%s' declaration must have an assignment expression if no type is specified, " + "but neither a type nor expression was supplied", + TokenIdStr[decl_kind] + ); + return nil; + } + return decl; +} + +static Ast * +return_stmt(ParserState *ps) +{ + Ast *tree = make_tree(AST_RETURN, ps->lexer->cur_loc); + + LexToken next = lex_scan(ps->lexer); + if (token_is_expr_start(next.id)) { + lex_backup(ps->lexer, next); + tree->ret = expr(ps, EXPR_INIT_PREC); + } else { + lex_backup(ps->lexer, next); + } + return tree; +} + +static Ast * +break_stmt(ParserState *ps) +{ + return make_tree(AST_BREAK, ps->lexer->cur_loc); +} + +static Ast * +discard_stmt(ParserState *ps) +{ + Ast *tree = make_tree(AST_DISCARD, ps->lexer->cur_loc); + tree->discard.expr = expr(ps, EXPR_INIT_PREC); + return tree; +} + +static Ast * +parse_attribute(ParserState *ps) +{ + Ast *tree = make_tree(AST_ATTRIBUTE, ps->lexer->cur_loc); + LexToken next = lex_scan(ps->lexer); + lex_match(ps->lexer, &next, T_LBRACKET); + next = lex_scan(ps->lexer); + lex_match(ps->lexer, &next, T_RBRACKET); + return tree; +} + +/* A declaration "decorated" with an attribute */ +static Ast * +decorated_decl(ParserState *ps) +{ + Ast *attr = parse_attribute(ps); + LexToken next = lex_scan(ps->lexer); + switch (next.id) { + case T_PROC: + attr->attribute.node = proc_decl(ps); + break; + case T_CONST: + case T_LET: + case T_VAR: + attr->attribute.node = variable_decl(ps, next.id); + break; + default: + parse_error(ps, "node of kind '%s' cannot be attributed", TokenIdStr[next.id]); + return nil; + } + return attr; +} + +static Ast * +if_stmt_expr(ParserState *ps) +{ + const enum LexTokenId if_block_ends[] = {T_ELSE, T_ELIF, T_END}; + Ast *tree = make_tree(AST_IF, ps->lexer->cur_loc); + /* parse `if` */ 
+ tree->ifse.cond = expr(ps, EXPR_INIT_PREC); + tree->ifse.true_body = stmt_list_until(ps, true, if_block_ends, countof(if_block_ends)); + tree->ifse.false_body = nil; + + LexToken next = lex_scan(ps->lexer); + AstElif elif_tree; + /* parse `elif`s and else */ + for (;;) { + switch (next.id) { + case T_END: /* only has true branch */ + return tree; + case T_ELSE: + /* once we see an `else` block, we assume the end of the `if` block, + * enforcing that `else` must be the last. */ + trace("we got else\n"); + tree->ifse.false_body = stmt_list_until(ps, true, (enum LexTokenId[]){T_ELIF, T_END}, 2); + next = lex_scan(ps->lexer); + if (next.id == T_ELIF) { + parse_error(ps, "'elif' branch after 'else' branch not allowed"); + lex_backup(ps->lexer, next); + return nil; + } + return tree; + case T_ELIF: + trace("we got elif\n"); + elif_tree.cond = expr(ps, EXPR_INIT_PREC); + elif_tree.body = stmt_list_until(ps, true, if_block_ends, countof(if_block_ends)); + next = lex_scan(ps->lexer); + arrput(tree->ifse.elifs, elif_tree); + /* no more `elif` blocks neither an `else` block next */ + if (next.id == T_END) + return tree; + Assert(next.id == T_ELSE || next.id == T_ELIF); + break; + default: /* shouldn't happen */ + lex_backup(ps->lexer, next); + parse_error(ps, "huh?: %s", TokenIdStr[next.id]); + return nil; + } + } + return tree; +} + +static Ast * +while_stmt(ParserState *ps) +{ + Ast *tree = make_tree(AST_LOOP, ps->lexer->cur_loc); + tree->loop.precond = expr(ps, EXPR_INIT_PREC); + tree->loop.body = stmt_list_until(ps, false, (enum LexTokenId[]){T_END}, 1); + return tree; +} + +static Ast * +atom(ParserState *ps) +{ + Ast *tree = make_tree(AST_INVALID, ps->lexer->cur_loc); + LexToken t = lex_scan(ps->lexer); + LexToken next; + + switch (t.id) { + case T_NUMBER: + tree->type = AST_NUMBER; + tree->number.n = t.inumber; + trace("number in atom: %lu\n", t.inumber); + return tree; + case T_STRING: + tree->type = AST_STRLIT; + tree->strlit = t.str; + return tree; + case T_IDENT: 
+ next = lex_scan(ps->lexer); + /* It is a plain symbol or a function call? */ + if (next.id == T_LPAREN) { + free(tree); + tree = function_call(ps, t.ident, true); + } else { + lex_backup(ps->lexer, next); + tree->type = AST_IDENT; + tree->ident = t.ident; + } + return tree; + default: + parse_error(ps, "expected a number, identifier or expression, not '%s'", TokenIdStr[t.id]); + free(tree); + } + return nil; +} + +static Ast * +unary(ParserState *ps) +{ + LexToken next = lex_scan(ps->lexer); + if (token_is_unary(next.id)) { + Ast *unt = make_tree(AST_UNARY, ps->lexer->cur_loc); + unt->unary.op = Str_from_c(TokenIdStr[next.id]); + unt->unary.atom = atom(ps); + return unt; + } + lex_backup(ps->lexer, next); + return atom(ps); +} + +/* Parse a binary expression or an atom. This implements the Pratt parser algorithm. + * See also: + * - https://eli.thegreenplace.net/2012/08/02/parsing-expressions-by-precedence-climbing + * - https://www.oilshell.org/blog/2016/11/01.html + * XXX: Mutate to the shunting yard variation? Since it uses an explicit stack instead of the call + * stack, guard against deeply nested expressions. + */ +static Ast * +expr(ParserState *ps, int minprec) +{ + Ast *tree = unary(ps); + for (;;) { + LexToken t = lex_scan(ps->lexer); + if (!token_is_binop(t.id) + || t.id == T_END + || OperatorTable[t.id].pred < minprec) { + lex_backup(ps->lexer, t); + break; + } + const OperatorPrec op = OperatorTable[t.id]; + const int next_prec = op.left_assoc ? 
op.pred + 1 : op.pred; + tree = make_binop(t.id, ps->lexer->cur_loc, tree, expr(ps, next_prec)); + } + return tree; +} + +static Vec(Ast *) +sep_list(ParserState *ps, Ast *(*prod_fn)(Compiler *, void *)) +{ + (void)ps, (void)prod_fn; + Vec(Ast *) prod = nil; + return prod; +} + +static Ast * +expr_comma_list(ParserState *ps) +{ + Ast *tree = make_tree(AST_EXPRS, ps->lexer->cur_loc); + Vec(Ast *) exprs = nil; + + LexToken next; + for (;;) { + arrput(exprs, expr(ps, EXPR_INIT_PREC)); + next = lex_scan(ps->lexer); + trace("commalist tok: %s\n", TokenIdStr[next.id]); + /* do we have a comma? if not, we reached the end of the list */ + if (next.id != T_COMMA) + break; + next = lex_scan(ps->lexer); + /* check if we have an expression next to this comma, we do this + * to allow a trailling comma + */ + if (!token_is_expr_start(next.id)) + break; + lex_backup(ps->lexer, next); + } + lex_backup(ps->lexer, next); + + if (arrlen(exprs) == 0) { + free(tree); + arrfree(exprs); + return nil; + } + tree->exprs = exprs; + return tree; +} + +static bool +token_id_in_list(enum LexTokenId c, const enum LexTokenId *toks, isize len) +{ + for (isize i = 0; i < len; ++i) + if (c == toks[i]) + return true; + return false; +} + +/* Parses a statement list until the token `end_marker`. Returns `nil` if the statement list + * is empty. 
*/ +static Ast * +stmt_list_until(ParserState *ps, bool putback, const enum LexTokenId *end_markers, isize len) +{ + LexToken token = lex_scan(ps->lexer); + Vec(Ast *) stmts = nil; + Ast *body = make_tree(AST_STMTS, ps->lexer->cur_loc); + + /* stmt* */ + while (!token_id_in_list(token.id, end_markers, len)) { + trace("stmt list token: %s\n", TokenIdStr[token.id]); + if (arrlen(stmts) + 1 > MAX_STMTS_IN_BLOCK) { + parse_error(ps, "more than %d (implementation limit) statements in block", MAX_STMTS_IN_BLOCK); + return nil; + } + arrput(stmts, stmt(ps, token)); + + token = lex_scan(ps->lexer); + if (token.id == T_EOF) { + parse_error(ps, "unexpected EOF, expected a statement or `end`"); + break; + } + if (token.id == T_SEMICOLON) + token = lex_scan(ps->lexer); + } + //lex_match(ps->lexer, &token, end_marker); + trace("token before end next_match: %s\n", TokenIdStr[token.id]); + if (putback) + lex_backup(ps->lexer, token); + /* empty list, just return nil instead of wasting space on a 0-length + * vector */ + if (arrlen(stmts) == 0) { + free(body); + arrfree(stmts); + return nil; + } + body->stmts = stmts; + return body; +} + +static Ast * +stmt(ParserState *ps, LexToken token) +{ + switch (token.id) { + case T_IDENT: + return funccall_or_assignment(ps, token.ident); + case T_CONST: + case T_LET: + case T_VAR: + return variable_decl(ps, token.id); + case T_PROC: + return proc_decl(ps); + case T_HASH: + return decorated_decl(ps); + case T_RETURN: + return return_stmt(ps); + case T_BREAK: + return break_stmt(ps); + case T_DISCARD: + return discard_stmt(ps); + case T_IF: + return if_stmt_expr(ps); + case T_ELIF: + parse_error(ps, "stray 'elif'"); + return nil; + case T_WHILE: + return while_stmt(ps); + case T_ELSE: + parse_error(ps, "'else' with no accompanying 'if'"); + return nil; + case T_END: + parse_error(ps, "stray 'end' keyword"); + return nil; + case T_EOF: + parse_error(ps, "unexpected EOF while parsing a statement"); + return nil; + default: + parse_error(ps, 
"invalid statement '%s'", TokenIdStr[token.id]); + exit(1); + } + return nil; +} + +/* Parse statements until EOF. */ +static Ast * +stmt_list(ParserState *ps) +{ + Ast *tree = make_tree(AST_STMTS, ps->lexer->cur_loc); + for (;;) { + const LexToken next = lex_scan(ps->lexer); + if (next.id == T_EOF) + break; + arrput(tree->stmts, stmt(ps, next)); + } + return tree; +} + +ParserState * +parse_new(Compiler *cm, LexState *ls) +{ + ParserState *ps = calloc(1, sizeof(*ps)); + ps->cm = cm; + ps->lexer = ls; + ps->ok = true; + return ps; +} + +void +parse_destroy(ParserState *ps) +{ + free(ps); +} + +Ast * +parse(ParserState *ps) +{ + return stmt_list(ps); +} diff --git a/compiler/parse.h b/compiler/parse.h new file mode 100644 index 0000000..5343dbc --- /dev/null +++ b/compiler/parse.h @@ -0,0 +1,21 @@ +#ifndef _parse_h_ +#define _parse_h_ + +#include "ast.h" +#include "state.h" +#include "lex.h" + +typedef struct { + Compiler *cm; + LexState *lexer; + bool ok; +} ParserState; + +ParserState * +parse_new(Compiler *cm, LexState *ls); +void +parse_destroy(ParserState *ps); +Ast * +parse(ParserState *ps); + +#endif diff --git a/compiler/pre.h b/compiler/pre.h new file mode 100644 index 0000000..743f2ca --- /dev/null +++ b/compiler/pre.h @@ -0,0 +1,158 @@ +#ifndef _pre_h_ +#define _pre_h_ +/* Prelude file, containing some useful macros and types. 
/* Iterates over an stb_ds-style vector `arr`, copying each element in turn
 * into a fresh variable named `val` that is scoped to the loop.
 *
 * - Every element is visited exactly once, in order.
 * - A nil vector is treated as empty; the loop condition short-circuits
 *   before any pointer arithmetic or dereference takes place.
 * - `arr` is evaluated more than once, so it must be free of side effects.
 * - `arrlen` must be visible at the point of use.
 */
#define foreach(val, arr) \
	for (__typeof__(*(arr)) *foreach_it_ = (arr), (val); \
	     foreach_it_ != 0 \
	         && foreach_it_ < (arr) + arrlen(arr) \
	         && ((val) = *foreach_it_, 1); \
	     ++foreach_it_)
/* Index of the element currently held in `val`. Only valid inside the body of
 * a `foreach` over the same `arr`; in nested loops it refers to the innermost
 * one. `val` is a copy, so the index is derived from the loop cursor. */
#define foreach_getindex(val, arr) ((void)sizeof(val), (isize)(foreach_it_ - (arr)))
do { \ + fprintf(stderr, "%s:%-5i", __FILE__, __LINE__); \ + fprintf(stderr, __VA_ARGS__); \ + } while (0) +#else +# define Assert(pred) +# define trace(...) +#endif + +/* Creates a `Str` from a string literal */ +#define Sl(s) ((Str){ (u8 *)s, (isize)lengthof(s) }) +/* Creates a `Str` from a buffer of size `len` */ +#define Sb(s, len) ((Str){ (u8 *)s, (isize)len }) +/* Creates a `Str` from a C string. */ +#define Str_from_c(s) ((Str){ (u8 *)s, (isize)(s != nil ? strlen(s) : 0) }) +#define Str_empty(s) ((s).len == 0) +#define Str_default(s, sor) (!Str_empty(s) ? (s) : (sor)) + +int +vsnprintf(char *, unsigned long, const char *, va_list); + +/* "Converts" a `Str` into a C string. Since `Str` are meant to be + * null-terminated already, no conversion is made, but ensures that the + * null terminator is present. */ +static inline char * +Str_to_c(Str s) +{ + if (s.len == 0 || s.s == nil) + return nil; + Assert(s.s[s.len - 1] == '\0'); + return (char *)s.s; +} + +/* Returns `true` if both strings are equal. */ +static inline bool +Str_equal(Str s1, Str s2) +{ + /* because passing nil to mem* is UB even if size == 0... */ + return (s1.len == s2.len) && (s1.len == 0 || memcmp(s1.s, s2.s, s1.len) == 0); +} + +/* Heaps allocates a new `Str` of size `len`, with contents from `data` if it is + * not `nil`.*/ +static inline Str +Str_new(const u8 *data, isize len) +{ + Assert(len >= 0); + Str s; + s.s = calloc(len + 1, sizeof(*s.s)); + s.len = len; + if (data != nil) { + memcpy(s.s, data, len); + s.s[len + 1] = '\0'; /* ensure */ + } + return s; +} + +/* Returns a formatted string (heap allocated) of the exact required size. */ +static inline Str +Strafmt(const char *fmt, ...) 
+{ + Str s = {0}; + va_list args; + + va_start(args, fmt); + /* Calculate buffer size required to hold the formatted string */ + int reqs = vsnprintf(nil, 0, fmt, args); + va_end(args); + if (reqs < 0) + return s; + + s = Str_new(nil, reqs); + va_start(args, fmt); /* `vsnprintf` touched the arg list, reinitialize it */ + /* the nil terminator is guaranteed by `Str_new` */ + vsnprintf((char *)s.s, s.len + 1, fmt, args); + va_end(args); + + return s; +} + +#endif diff --git a/compiler/rutilec.c b/compiler/rutilec.c new file mode 100644 index 0000000..fb6eb8d --- /dev/null +++ b/compiler/rutilec.c @@ -0,0 +1,200 @@ +#define _POSIX_C_SOURCE 200809L +#include +#include +#include +#include + +#include "pre.h" +#include "lex.h" +#include "parse.h" +#include "sema.h" +#include "state.h" +#include "codegen.h" +#include "messages.h" + +#include "libs/optparse.h" +#include "libs/stb_ds.h" + +#ifndef GIT_HASH +# define GIT_HASH "" +#endif +#ifndef BUG_REPORT_URL +# define BUG_REPORT_URL "https://codeberg.org/tocariimaa/rutile" +#endif +#ifndef TARGET_EXE_EXT /* without prefix dot! */ +# define TARGET_EXE_EXT Sl("") +#endif + +static const char *HelpMessage = \ + "Summary of common options:\n" \ + " -c\tCompile only. Don't link, output an object file instead.\n" \ + " -d\tDefine a constant with the specified value.\n" \ + " -h\tPrint this help message.\n" \ + " -o\tSet output file name of the executable/object file.\n" \ + " -v\tPrint the version of this compiler, plus other relevant information.\n" \ + " -S\tEmit intermediate code.\n" \ + " -R\tSet the code generation mode, 'release' for an optimized build,\n" \ + " \t'debug' for a debug build.\n" \ + ; + +/* Creates the output binary file name, changing the extension to the current platform + * executable file extension, or it simply removes the original extension if the platform + * has no binary extension, (i.e UNIX-likes OSes). 
*/ +static Str +make_binary_filename(Compiler *cm, Str src_filename, const Str exe_ext) +{ + bool exe_has_ext = exe_ext.len > 0; + const size_t ss = src_filename.len; + Assert(ss != 0); + char *buf = malloc(ss + 2 + (exe_has_ext ? exe_ext.len : 0)); + memcpy(buf, src_filename.s, ss); + buf[ss] = '\0'; + + char *p = buf + (ss - 1); + while (p != buf && *p != '.') /* Search for the first '.' backwards */ + --p; + /* No extension in filename, egde case really */ + if (p == buf) { + if (!exe_has_ext) + fatal(cm, nil, "output file name required in this case (host OS binary format lacks extension)"); + /* append extension then */ + p = buf + ss; + *p = '.'; + } + if (exe_has_ext) { + memcpy(++p, exe_ext.s, exe_ext.len); + p += 3; + } + *p = '\0'; + return Str_from_c(buf); +} + +static enum CodegenBackends +backend_from_str(Compiler *cm, Str s) +{ + if (s.len == 1 && (s.s[0] == 'c' || s.s[0] == 'C')) + return CgBackendC; + else if (Str_equal(s, Sl("gcc"))) + return CgBackendLibGccJit; + fatal(cm, nil, "unknown backend '%s'", s.s); + unreachable(); +} + +static Str +cli_boilerplate(char **argv, Compiler *cm) +{ + const struct optparse_long longopts[] = { + {"backend", 'b', OPTPARSE_REQUIRED}, + {"compile-only", 'c', OPTPARSE_NONE}, + {"define", 'd', OPTPARSE_REQUIRED}, + {"max-errors", 'E', OPTPARSE_REQUIRED}, + {"release", 'R', OPTPARSE_REQUIRED}, + {"emit-ir", 'S', OPTPARSE_OPTIONAL}, + {"exe", 'o', OPTPARSE_REQUIRED}, + {"version", 'v', OPTPARSE_NONE}, + {"help", 'h', OPTPARSE_NONE}, + {0}, + }; + + const char *no_fun_env = getenv("NO_COLOR"); + cm->opts.color = isatty(STDERR_FILENO) && !(no_fun_env != nil && *no_fun_env != '\0'); + + struct optparse opts; + optparse_init(&opts, argv); + + i8 opt; + while ((opt = optparse_long(&opts, longopts, nil)) != -1) { + switch (opt) { + case 'b': + cm->opts.backend = backend_from_str(cm, Str_from_c(opts.optarg)); + break; + case 'c': + cm->opts.compile_only = true; + break; + case 'd': + trace("define: %s\n", opts.optarg); + 
arrput(cm->opts.defines, Str_from_c(opts.optarg)); + break; + case 'h': + printf("Usage: %s [options...] files...\n\n%s\n", *argv, HelpMessage); + exit(0); + case 'E': + cm->opts.max_errors = atoi(opts.optarg); /* XXX: atoi LOL */ + break; + case 'R': + trace("release: %s\n", opts.optarg); + cm->opts.release_mode = Str_from_c(opts.optarg); + break; + case 'S': + break; + case 'o': + cm->opts.exe_out = Str_from_c(opts.optarg); + break; + case 'v': + printf("Rutile compiler v0.0.1\n"); + printf("git commit: %s\nReport bugs here: %s\n", GIT_HASH, BUG_REPORT_URL); + exit(0); + case '?': + fatal(cm, nil, "%s: %s", *argv, opts.errmsg); + } + } + + const char *src_filename = optparse_arg(&opts); + if (src_filename == nil) + fatal(cm, nil, "no input files specified"); + return Str_from_c(src_filename); +} + +int +main(int argc, char **argv) +{ + (void)argc; + Compiler cm = { + .opts = { + .backend = CgBackendC, + .max_errors = 20, + } + }; + + Str src_filename = cli_boilerplate(argv, &cm); + FILE *src_in = nil; + + if (src_filename.s[0] == '-' && src_filename.s[1] == '\0') { + src_in = stdin; + src_filename = Sl(""); + } else { + if ((src_in = fopen((char *)src_filename.s, "rb")) == nil) { + fatal(&cm, nil, "can't open: %s", src_filename.s); + } + } + + cm.current_filename = src_filename; + if (cm.opts.exe_out.len == 0) + cm.opts.exe_out = make_binary_filename(&cm, src_filename, TARGET_EXE_EXT); + + if (Str_equal(cm.opts.exe_out, cm.current_filename)) { + fatal(&cm, nil, "input source file and output file are the same"); + } + + /* Compiler pipeline */ + LexState *ls = lex_new(&cm, src_in, src_filename, 4); + ParserState *ps = parse_new(&cm, ls); + SemaCtx *ss = sema_new(&cm); + Ast *program = parse(ps); + if (!ps->ok) + goto err; + sema(ss, program); + if (!ss->ok) + goto err; + + CodegenCtx *cgctx = codegen_new(&cm, cm.opts.backend); + codegen(cgctx, program); + codegen_destroy(cgctx); +err: + sema_destroy(ss); + parse_destroy(ps); + lex_destroy(ls); + fclose(src_in); + 
+ return 0; +} diff --git a/compiler/sema.c b/compiler/sema.c new file mode 100644 index 0000000..913a845 --- /dev/null +++ b/compiler/sema.c @@ -0,0 +1,980 @@ +/* Semantic analyzer and type checker */ +#include +#include + +#include "pre.h" +#include "sema.h" +#include "datatype.h" +#include "location.h" +#include "symbol.h" +#include "ast.h" +#include "state.h" +#include "messages.h" +#include "libs/stb_ds.h" + +#define sema_error(ctx, loc, ...) do { \ + error((ctx)->cm, loc, __VA_ARGS__); \ + (ctx)->ok = false; \ + } while (0) +#define sema_warning(ctx, loc, ...) warning((ctx)->cm, loc, __VA_ARGS__) +#define sema_note(ctx, loc, ...) note((ctx)->cm, loc, __VA_ARGS__) +#define sema_is_stmt_terminal(s) (s->type == AST_RETURN || s->type == AST_BREAK) +#define sym_insert(syms, k, v) shput(syms, k, v) + +enum SemaCtxFlags /* 64 bits */ +{ + SctxInsideProc = BitPos(0), + SctxInsideLoop = BitPos(1), + SctxInsideIf = BitPos(2), + SctxInTopLevel = BitPos(3), + SctxInExpr = BitPos(4), + SctxInDiscard = BitPos(5), + SctxInStmtBlock = BitPos(6), +}; + +typedef struct { + enum SymbolKind kind; + /* The data type associated with the symbol. 
*/ + DataType *dtype; + bool used; + bool procparm; /* if its a proc parameter */ + Location loc; +} Symbol; + +typedef HashMapStr(Symbol) SymbolEntry; + +struct Scope +{ + Scope *prev; /* Previous scope in the stack */ + SymbolEntry *symbols; /* All the symbols in this scope */ +}; + +typedef struct { + bool ok; +} SemaStatus; + +static const Symbol InvalidSymbol = {.kind = SymInvalid}; +static const DataType *InvalidDataType = &(DataType){.kind = DtkInvalid}; + +static DataTypeCheck +datatype_struct_cmp(SemaCtx *sctx, DataType *s1, DataType *s2); +static DataTypeCheck +datatype_cmp(SemaCtx *sctx, DataType *dt1, DataType *dt2); +static DataType * +sema_expr(SemaCtx *sctx, Ast *expr, Location loc); +static void +sema_expr_list(SemaCtx *sctx, Vec(Ast *) exprs, Location loc); +static void +sema_node(SemaCtx *sctx, Ast *node); +static void +sema_stmts(SemaCtx *sctx, Vec(Ast *) stmts); +static DataType * +resolve_datatype(SemaCtx *sctx, const Str ident, Location loc); + +static Scope * +make_scope(Scope *prev) +{ + Scope *sc = malloc(sizeof(*sc)); + sc->prev = prev; + sc->symbols = nil; + sh_new_arena(sc->symbols); + shdefault(sc->symbols, InvalidSymbol); + return sc; +} + +static SemaCtx * +make_semactx(Compiler *cm, SemaCtx *prev) +{ + SemaCtx *smc = calloc(1, sizeof(*smc)); + smc->cm = cm; + smc->prev = prev; + return smc; +} + +static DataType * +make_data_type(enum DataTypeKind kind, u16 size, bool builtin, bool sign) +{ + DataType *dt = calloc(1, sizeof(*dt)); + dt->kind = kind; + dt->size = size; + dt->builtin = builtin; + dt->sign = sign; + return dt; +} + +static DataType * +make_proc_type(bool builtin, DataType *rettype, Vec(DataType *) argtypes) +{ + DataType *pdt = calloc(1, sizeof(*pdt)); + pdt->kind = DtkProc; + pdt->builtin = builtin; + pdt->proc.rettype = rettype; + pdt->proc.argtypes = argtypes; + return pdt; +} + +static Vec(DataType *) +make_type_list_from_idents(SemaCtx *sctx, Vec(AstIdentTypePair) idents) +{ + if (idents == nil) + return nil; + + 
/* Builds a vector of argument types from the plain array `a` holding `len`
 * `DataType *` entries. Returns nil for an empty or nil array. */
static Vec(DataType *)
make_proc_args(DataType *a[], isize len)
{
	Vec(DataType *) args = nil;
	if (a == nil || len <= 0)
		return args;
	arrsetlen(args, len);
	/* memcpy counts bytes: copy `len` pointers, not `len` bytes */
	memcpy(args, a, (size_t)len * sizeof(*a));
	return args;
}
*/ + const SymbolEntry *syms = sctx->current_scope->symbols; + for (isize i = 0; i < shlen(syms); ++i) { + const Symbol sym = syms[i].value; + if (!sym.used && symbol_is_var_binding(sym.kind)) { + const char *bind_kind_name = !sym.procparm ? "variable" : "proc parameter"; // SymbolKindStr[sym.kind] : "proc parameter"; + sema_warning( + sctx, &sym.loc, + "unused %s '%s'", bind_kind_name, syms[i].key + ); + } + } +} + +static void +sema_check_dead_stmts(SemaCtx *sctx, Vec(Ast *) stmts) +{ + (void)sctx, (void)stmts; + /* those who forsake the CFG are doomed to implement it badly without even + * noticing... */ +} + +static void +sema_match_proc_type(SemaCtx *sctx, Symbol *fsym, Str fident) +{ + if (fsym->dtype->kind != DtkProc) { + sema_error( + sctx, nil, + "cannot call '%s' because has non-proc type '%s'", + fident.s, "uh" + ); + return; + } +} + +static DataType * +sema_proccall(SemaCtx *sctx, const AstProcCall *call, Location loc) +{ + Symbol *fsym = sym_search(sctx->current_scope, call->name); + if (fsym == nil) { + sema_error(sctx, &loc, "call to undeclared proc '%s'", call->name.s); + return nil; + } + + fsym->used = true; + sema_match_proc_type(sctx, fsym, call->name); + + /* check call arguments */ + const isize proc_arglen = arrlen(fsym->dtype->proc.argtypes); + if (call->args != nil) { + compiler_assert(sctx->cm, call->args->type == AST_EXPRS); + const isize call_arglen = arrlen(call->args->exprs); + + if (call_arglen != proc_arglen) { + const char *at_most = call_arglen > proc_arglen ? 
"s at most" : ""; + sema_error( + sctx, &loc, + "argument length mismatch: given %li arguments to '%s' but it expects %li argument%s", + call_arglen, call->name.s, proc_arglen, at_most + ); + return nil; + } + sema_expr_list(sctx, call->args->exprs, loc); /* now sema-check the args */ + } else if (call->args == nil && proc_arglen != 0) { + sema_error(sctx, &loc, "'%s' proc takes %li argument(s), but none given", + call->name.s, proc_arglen); + return nil; + } + + if (fsym->dtype->proc.rettype != sctx->builtintypes.void_t + && (~sctx->flags & SctxInDiscard) + && (~sctx->flags & SctxInExpr)) { + sema_error(sctx, &loc, "result of function call with non-void type ignored"); + sema_note(sctx, &loc, "use 'discard' if this was intentional"); + return nil; + } + + for (isize i = 0; i < proc_arglen; ++i) { + ; + } + return fsym->dtype; +} + +/************ Semantic and type checking of expressions ************/ +/* Type checking for expressions is done inside-out */ + +static DataType * +sema_expr_number(SemaCtx *sctx, AstNumber *num) +{ +#define pow2(exp) (2 << (exp - 1)) + /* type rule axiom */ + num->type = sym_search_oncurrent(sctx->top_scope, Sl("u64"))->dtype; + return num->type; +#undef pow2 +} + +static DataType * +sema_expr_strlit(SemaCtx *sctx, const Str *strlit) +{ + (void)sctx, (void)strlit; + /* type rule axiom */ + return sym_search_oncurrent(sctx->top_scope, Sl("string"))->dtype; +} + +static Symbol * +sema_expr_ident(SemaCtx *sctx, const Str ident) +{ + Symbol *ident_sym = sym_search(sctx->current_scope, ident); + if (ident_sym == nil) { + sema_error(sctx, nil, "undeclared identifier '%s'", ident.s); + return nil; + } + if (ident_sym->kind == SymType) { + sema_error(sctx, nil, "data type '%s' used as identifier in expression", ident.s); + return nil; + } + ident_sym->used = true; + return ident_sym; +} + +static DataType * +sema_expr_unary(SemaCtx *sctx, AstUnary *unary, Location loc) +{ + Ast *expr = unary->atom; + compiler_assert(sctx->cm, 
ast_node_is_expr(expr->type)); + //if (expr->type == AST_STRLIT) { + // sema_error(sctx, nil, "%s with a string literal makes no sense\n", TokenIdStr[unary->op]); + // return; + //} + + //if (expr->type == AST_NUMBER) { + // if (unary->op == T_MINUS && !expr->number.type->sign) { + // } + //} + return sema_expr(sctx, expr, loc); +} + +static DataType * +sema_binop(SemaCtx *sctx, const AstBinop *expr, Location loc) +{ + Symbol *opsym = sym_search_oncurrent(sctx->top_scope, expr->op); + if (opsym == nil) { + sema_error(sctx, nil, "no operator '%s'", expr->op.s); + return nil; + } + if (arrlen(opsym->dtype->proc.argtypes) != 2) { + sema_error(sctx, nil, "no binary operator for '%s'", expr->op.s); + return nil; + } + + DataType *ldt = sema_expr(sctx, expr->left, loc); + DataType *rdt = sema_expr(sctx, expr->right, loc); + /* Skip typechecking if either ldt or rdt have `InvalidDataType` and propagate + * it up the call stack. */ + if (ldt == InvalidDataType || rdt == InvalidDataType) + return (DataType *)InvalidDataType; + + DataTypeCheck tchk; + if (!(tchk = datatype_cmp(sctx, ldt, rdt)).ok) { + sema_error(sctx, &loc, "type error: %s", tchk.msg.s); + return nil; + } + return ldt; +} + +static DataType * +sema_expr(SemaCtx *sctx, Ast *expr, Location loc) +{ + compiler_assert(sctx->cm, ast_node_is_expr(expr->type)); + push_semactx(&sctx); + sctx->flags |= SctxInExpr; + + DataType *dt = nil; + switch (expr->type) { + case AST_BINEXPR: + dt = sema_binop(sctx, &expr->bin, loc); + break; + case AST_UNARY: + dt = sema_expr_unary(sctx, &expr->unary, loc); + break; + case AST_NUMBER: + dt = sema_expr_number(sctx, &expr->number); + break; + case AST_STRLIT: + dt = sema_expr_strlit(sctx, &expr->strlit); + break; + case AST_IDENT: + dt = sema_expr_ident(sctx, expr->ident)->dtype; + break; + case AST_PROCCALL: + dt = sema_proccall(sctx, &expr->call, expr->loc); + break; + default: + unreachable(); + } + + pop_semactx(&sctx); + return dt; +} + +static void +sema_expr_list(SemaCtx 
*sctx, Vec(Ast *) exprs, Location loc) +{ + foreach (expr, exprs) { + sema_expr(sctx, expr, loc); + } +} + +/************ Type checking ************/ + +/* Structurally compare two structural data types. */ +static DataTypeCheck +datatype_struct_cmp(SemaCtx *sctx, DataType *s1, DataType *s2) +{ + compiler_assert(sctx->cm, s1->kind == DtkStruct && s2->kind == DtkStruct); + const DataTypeCompound *s1s = &s1->compound; + const DataTypeCompound *s2s = &s2->compound; + + if (s1s->packed != s2s->packed) + return (DataTypeCheck){false, Sl("")}; + if (arrlen(s1s->fields) != arrlen(s2s->fields)) + return (DataTypeCheck){false, Sl("")}; + for (isize i = 0; i < arrlen(s1s->fields); ++i) { + DataTypeCheck tchk; + if (!(tchk = datatype_cmp(sctx, s1s->fields[i], s2s->fields[i])).ok) + return tchk; + } + return (DataTypeCheck){.ok = true}; +} + +static DataTypeCheck +datatype_array_cmp(SemaCtx *sctx, DataType *a1, DataType *a2) +{ + DataTypeCheck tchk = {.ok = true}; + if (a1->array.len != a2->array.len) + return (DataTypeCheck){false, Sl("")}; + if (!(tchk = datatype_cmp(sctx, a1->array.base, a2->array.base)).ok) + return tchk; + return tchk; +} + +static DataTypeCheck +datatype_proc_cmp(SemaCtx *sctx, DataType *pc1, DataType *pc2) +{ + DataTypeCheck tchk = {.ok = true}; + + if (pc1->proc.public != pc2->proc.public) + return (DataTypeCheck){false, Sl("")}; + if (pc1->proc.extern_lnk != pc2->proc.extern_lnk) + return (DataTypeCheck){false, Sl("")}; + if (pc1->proc.c_varargs != pc2->proc.c_varargs) + return (DataTypeCheck){false, Sl("")}; + if (arrlen(pc1->proc.argtypes) != arrlen(pc2->proc.argtypes)) + return (DataTypeCheck){false, Sl("")}; + if (!(tchk = datatype_cmp(sctx, pc1->proc.rettype, pc2->proc.rettype)).ok) + return tchk; + + for (isize i = 0; i < arrlen(pc1->proc.argtypes); ++i) { + if (!(tchk = datatype_cmp(sctx, pc1->proc.argtypes[i], pc2->proc.argtypes[i])).ok) + return tchk; + } + return tchk; +} + +static DataTypeCheck +datatype_basic_cmp(SemaCtx *sctx, DataType 
*dt1, DataType *dt2) +{ + (void)sctx; + if (dt1->size > dt2->size) /* if it has a size equal or less than dt2 */ + return (DataTypeCheck){false, Sl("")}; + if (dt1->sign != dt2->sign) + return (DataTypeCheck){false, Strafmt("integers with different sign")}; + return (DataTypeCheck){.ok = true}; +} + +/* Compares two datatype objects, returning true if they are equal. */ +static DataTypeCheck +datatype_cmp(SemaCtx *sctx, DataType *dt1, DataType *dt2) +{ + if (dt1 == nil || dt2 == nil) + return (DataTypeCheck){false, Sl("")}; + /* TODO: return more information in case of a mismatch... */ + if (dt1 == dt2) /* shallow */ + return (DataTypeCheck){.ok = true}; + if (dt1->kind != dt2->kind) + return (DataTypeCheck){.ok = false}; + + switch (dt1->kind) { + case DtkBasic: + return datatype_basic_cmp(sctx, dt1, dt2); + case DtkStruct: + case DtkUnion: + return datatype_struct_cmp(sctx, dt1, dt2); + case DtkProc: + return datatype_proc_cmp(sctx, dt1, dt2); + case DtkArray: + return datatype_array_cmp(sctx, dt1, dt2); + case DtkBool: + case DtkVoid: + return (DataTypeCheck){.ok = true}; + } + return (DataTypeCheck){.ok = false}; +} + +static DataType * +expr_get_datatype(SemaCtx *sctx, Ast *expr) +{ + compiler_assert(sctx->cm, ast_node_is_expr(expr->type)); + switch (expr->type) { + case AST_BINEXPR: + return expr->bin.type; + case AST_UNARY: + return expr->unary.type; + case AST_NUMBER: + return expr->number.type; + case AST_STRLIT: + return sym_search_oncurrent(sctx->top_scope, Sl("string"))->dtype; + /* XXX: for these two we could attach the type in the ast... */ + case AST_IDENT: + return sym_search(sctx->current_scope, expr->ident)->dtype; + case AST_PROCCALL: + return sym_search(sctx->current_scope, expr->call.name)->dtype->proc.rettype; + default: + unreachable(); + } + return nil; +} + +/* Search for the type in the symbol table, asserting that is a data type. 
*/ +static DataType * +resolve_datatype(SemaCtx *sctx, const Str ident, Location loc) +{ + Symbol *dtsym = sym_search(sctx->current_scope, ident); + if (dtsym == nil) { + sema_error(sctx, &loc, "no such type '%s'", ident.s); + return (DataType *)InvalidDataType; + } + if (dtsym->kind != SymType) { + sema_error(sctx, &loc, "'%s' is not a type but a %s", ident.s, SymbolKindStr[dtsym->kind]); + return (DataType *)InvalidDataType; + } + return dtsym->dtype; +} + +static void +sema_procdef(SemaCtx *sctx, AstProc *proc, Location loc) +{ + Symbol *sym_prev; + if ((sym_prev = sym_search(sctx->current_scope, proc->name)) != nil) { + sema_error( + sctx, nil, + "'%s' was already declared as a %s", + proc->name.s, SymbolKindStr[sym_prev->kind] + ); + sema_note(sctx, &sym_prev->loc, "'%s' previously declared here", proc->name.s); + return; + } + + if (Str_equal(proc->name, Sl("main"))) { + sctx->main_defined = true; + if (!proc->ispublic) { + sema_error(sctx, &loc, "'main' has to be declared as a public proc"); + } + } + + const Ast *rettype_node = proc->rettype; + DataType *proc_rettype = nil; + if (rettype_node != nil) { + compiler_assert(sctx->cm, rettype_node->type == AST_IDENT); + proc_rettype = resolve_datatype(sctx, proc->rettype->ident, rettype_node->loc); + if (proc_rettype == InvalidDataType) + return; + } else { + /* return type node is nil, we infer that as a `void` type */ + proc_rettype = sctx->builtintypes.void_t; + } + + Vec(DataType *) procargs = make_type_list_from_idents(sctx, proc->args); + DataType *procdtype = make_proc_type(false, proc_rettype, procargs); + procdtype->proc.public = proc->ispublic; + Symbol proc_sym = { + .kind = SymProc, + .dtype = procdtype, + .loc = loc + }; + + sym_insert(sctx->current_scope->symbols, proc->name.s, proc_sym); + proc->type = procdtype; + + /* proc has no body at all */ + if (proc->body == nil) + return; + + /* analyze the body */ + compiler_assert(sctx->cm, proc->body->type == AST_STMTS); + push_semactx(&sctx); + 
enter_scope(sctx); + + compiler_assert(sctx->cm, arrlen(proc->args) == arrlen(procargs)); + /* Inject proc parameters into the proc body top scope */ + for (isize i = 0; i < arrlen(proc->args); ++i) { + DataType *argdtype = procargs[i]; + enum SymbolKind argsymkind = proc->args[i].kind; + + compiler_assert(sctx->cm, argdtype != nil); + compiler_assert(sctx->cm, argsymkind == SymLet || argsymkind == SymVar); + + Symbol argsym = { + .kind = argsymkind, + .dtype = argdtype, + .procparm = true, + .loc = proc->args[i].loc + }; + sym_insert(sctx->current_scope->symbols, proc->args[i].ident.s, argsym); + } + sctx->flags |= SctxInsideProc; + sema_stmts(sctx, proc->body->stmts); + sema_check_unused_vars(sctx); + exit_scope(sctx); + pop_semactx(&sctx); +} + +static void +sema_return(SemaCtx *sctx, Ast *ret_expr, Location loc) +{ + if (~sctx->flags & SctxInsideProc) { + sema_error(sctx, &loc, "'return' outside of proc"); + } + if (ret_expr != nil) + sema_expr(sctx, ret_expr, loc); +} + +static void +sema_break(SemaCtx *sctx, Ast *unused, Location loc) +{ + (void)unused; + if (~sctx->flags & SctxInsideLoop) { + sema_error(sctx, &loc, "'break' used outside of a loop"); + } +} + +static void +sema_discard(SemaCtx *sctx, Ast *expr, Location loc) +{ + sctx->flags |= SctxInDiscard; + sema_expr(sctx, expr, loc); + sctx->flags &= ~SctxInDiscard; +} + +static void +sema_attribute(SemaCtx *sctx, AstAttribute *attr) +{ + sema_node(sctx, attr->node); +} + +static void +sema_var_decl(SemaCtx *sctx, AstVarDecl *decl, Location loc) +{ + compiler_assert(sctx->cm, symbol_is_var_binding(decl->kind)); + + const Symbol *symp = sym_search(sctx->current_scope, decl->name); + if (symp != nil && symp->kind != decl->kind) { + switch (symp->kind) { + case SymLet: + sema_error(sctx, &symp->loc, "'%s' was already declared as 'let'", decl->name.s); + return; + case SymVar: + sema_error(sctx, &symp->loc, "'%s' was already declared as 'var'", decl->name.s); + return; + case SymConst: + sema_error( + sctx, 
&symp->loc, + "declaration of '%s' shadows previously declared constant with the same name", + decl->name.s + ); + return; + case SymType: + sema_error(sctx, &symp->loc, "'%s' was already declared as a type", decl->name.s); + return; + default: + break; + } + sema_note(sctx, &symp->loc, "'%s' was declared in this line", decl->name.s); + } + + Ast *dexpr = decl->expr; + if (dexpr != nil) { + sema_expr(sctx, dexpr, loc); /* check the assignment expression */ + } else { + sema_warning(sctx, &loc, "variable is unitialized"); + } + + if (decl->datatype == nil) { + sema_error(sctx, nil, "we don't do type inference yet sorry"); + return; + } + + compiler_assert(sctx->cm, decl->datatype->type == AST_IDENT); + DataType *dtype = resolve_datatype(sctx, decl->datatype->ident, decl->datatype->loc); + /* Note that we ignore whether `resolve_datatype` return an invalid type, + * since we still want to insert the variable into the symbol table, + * otherwise we would have spurious "undeclared identifier" errors. 
*/ + decl->type = dtype; + + Symbol sym = { + .kind = decl->kind, + .dtype = dtype, + .loc = loc, + }; + /* Insert the variable to the symbol table */ + sym_insert(sctx->current_scope->symbols, decl->name.s, sym); +} + +static void +sema_var_assign(SemaCtx *sctx, AstVarAssign *assign, Location loc) +{ + sema_expr_ident(sctx, assign->name); + sema_expr(sctx, assign->expr, loc); + + Symbol *decl = sym_search(sctx->current_scope, assign->name); + if (decl == nil) { + sema_error(sctx, &loc, "assign to undeclared variable '%s'", assign->name.s); + return; + } + if (!symbol_is_var_binding(decl->kind)) { + sema_error( + sctx, &loc, + "assign to non-variable symbol ('%s' is a '%s')", + assign->name.s, SymbolKindStr[decl->kind] + ); + return; + } + if (decl->kind != SymVar) { + sema_error( + sctx, &loc, + "assign to immutable symbol ('%s' was declared as '%s')", + assign->name.s, SymbolKindStr[decl->kind] + ); + return; + } + //datatype_cmp(sctx, nil, decl->dtype); +} + +static void +sema_ifstmtexpr(SemaCtx *sctx, AstIf *ift, Location loc) +{ + sema_expr(sctx, ift->cond, loc); + sema_node(sctx, ift->true_body); + sema_node(sctx, ift->false_body); + + const isize elifs_len = arrlen(ift->elifs); + if (elifs_len > 0) { + for (isize i = 0; i < elifs_len; ++i) { + AstElif *elif = &ift->elifs[i]; + sema_expr(sctx, elif->cond, loc); + sema_node(sctx, elif->body); + } + } +} + +static void +sema_loop(SemaCtx *sctx, AstLoop *loop, Location loc) +{ + if (loop->precond != nil) { + sema_expr(sctx, loop->precond, loc); + } + if (loop->postcond != nil) { + sema_expr(sctx, loop->postcond, loc); + } + push_semactx(&sctx); + sctx->flags |= SctxInsideLoop; + sema_node(sctx, loop->body); + pop_semactx(&sctx); +} + +static void +sema_stmts(SemaCtx *sctx, Vec(Ast *) stmts) +{ + /* AST_STMTS imply the opening of a new scope */ + const isize stmts_len = arrlen(stmts); + for (isize i = 0; i < stmts_len; ++i) { + sema_node(sctx, stmts[i]); + if (sema_is_stmt_terminal(stmts[i]) && i + 1 != 
stmts_len) { + sema_warning(sctx, &stmts[i + 1]->loc, "dead code after 'return'"); + } + } +} + +static void +sema_stmt_block(SemaCtx *sctx, Vec(Ast *) stmts) +{ + enter_scope(sctx); + sema_stmts(sctx, stmts); + exit_scope(sctx); + /* check for unused bindings declared in this scope */ + sema_check_unused_vars(sctx); +} + +static void +sema_node(SemaCtx *sctx, Ast *node) +{ + if (node == nil) + return; + switch (node->type) { + case AST_IF: + sema_ifstmtexpr(sctx, &node->ifse, node->loc); + break; + case AST_LOOP: + sema_loop(sctx, &node->loop, node->loc); + break; + case AST_STMTS: + sema_stmt_block(sctx, node->stmts); + break; + case AST_PROCDEF: + sema_procdef(sctx, &node->proc, node->loc); + break; + case AST_PROCCALL: + sema_proccall(sctx, &node->call, node->loc); + break; + case AST_VARDECL: + sema_var_decl(sctx, &node->var, node->loc); + break; + case AST_VARASSIGN: + sema_var_assign(sctx, &node->varassgn, node->loc); + break; + case AST_RETURN: + sema_return(sctx, node->ret, node->loc); + break; + case AST_BREAK: + sema_break(sctx, nil, node->loc); + break; + case AST_DISCARD: + sema_discard(sctx, node->discard.expr, node->loc); + break; + case AST_ATTRIBUTE: + sema_attribute(sctx, &node->attribute); + break; + case AST_BINEXPR: + case AST_UNARY: + case AST_NUMBER: + case AST_STRLIT: + case AST_IDENT: + sema_expr(sctx, node, node->loc); + break; + case AST_INVALID: + case AST_EXPRS: + case AST_PROCCALL_ARGS: + unreachable(); + } +} + +static void +sema_make_builtin_types(SemaCtx *sctx) +{ + typedef struct { + const char *name; + Symbol sym; + } NameSym; + + DataType *void_type = make_data_type(DtkVoid, 0, true, false); + DataType *str_type = make_data_type(DtkStruct, 0, false, false); + DataType *puts_proto = make_data_type(DtkProc, 0, false, false); + puts_proto->proc.rettype = void_type; + puts_proto->proc.argtypes = make_proc_args((DataType *[]){str_type}, 1); + puts_proto->proc.extern_lnk = true; + + NameSym builtin_basic_types[] = { + {"void", {.kind = 
SymType, .dtype = void_type}}, + {"u64", {.kind = SymType, .dtype = make_data_type(DtkBasic, 8, true, false)}}, + {"i64", {.kind = SymType, .dtype = make_data_type(DtkBasic, 8, true, true)}}, + {"cint", {.kind = SymType, .dtype = make_data_type(DtkBasic, sizeof(int), true, true)}}, + {"string", {.kind = SymType, .dtype = str_type}}, + {"bool", {.kind = SymType, .dtype = make_data_type(DtkBool, 1, true, false)}}, + }; + DataType *u64_dt = builtin_basic_types[1].sym.dtype; /* index 1 is the u64 entry of the table above; keep in sync when reordering */ + DataType *bool_dt = builtin_basic_types[5].sym.dtype; /* index 5 is the bool entry of the table above */ + + NameSym builtin_procs[] = { /* built-in binary operators, all taking two u64 arguments */ + { + "+", + { + .kind = SymProc, + .dtype = make_proc_type( + true, + u64_dt, + make_proc_args((DataType *[]){u64_dt, u64_dt}, 2) + ) + } + }, + { + "-", + { + .kind = SymProc, + .dtype = make_proc_type( + true, + u64_dt, + make_proc_args((DataType *[]){u64_dt, u64_dt}, 2) + ) + } + }, + { + "==", + { + .kind = SymProc, + .dtype = make_proc_type( + true, + bool_dt, + make_proc_args((DataType *[]){u64_dt, u64_dt}, 2) + ) + } + }, + }; + + for (isize i = 0; i < countof(builtin_basic_types); ++i) { + const char *name = builtin_basic_types[i].name; + Symbol sym = builtin_basic_types[i].sym; + sym_insert(sctx->current_scope->symbols, name, sym); + } + for (isize i = 0; i < countof(builtin_procs); ++i) { + sym_insert(sctx->current_scope->symbols, + builtin_procs[i].name, builtin_procs[i].sym); + } + + sctx->builtintypes.tyu64 = builtin_basic_types[1].sym.dtype; /* same DataType as u64_dt */ + sctx->builtintypes.void_t = void_type; + + Symbol puts_sym = {.kind = SymProc, .dtype = puts_proto}; + sym_insert(sctx->current_scope->symbols, "puts", puts_sym); +} + +SemaCtx * +sema_new(Compiler *cm) /* Create the top-level context: a fresh scope with the built-ins registered, top_scope pointing at it, and ok set to true. */ +{ + SemaCtx *toplevel_context = make_semactx(cm, nil); + toplevel_context->current_scope = make_scope(nil); + + sema_make_builtin_types(toplevel_context); + toplevel_context->top_scope = toplevel_context->current_scope; + toplevel_context->ok = true; + return toplevel_context; +} + +void +sema_destroy(SemaCtx *sctx) /* Frees only the SemaCtx itself. NOTE(review): the scope obtained from make_scope() in sema_new() is not released here; confirm whether that is intentional. */ +{ + free(sctx); +} + +void +sema(SemaCtx
*sctx, Ast *program) +{ + /* Analyze toplevel */ + /* XXX: DRY it */ + compiler_assert(sctx->cm, program->type == AST_STMTS); + for (isize i = 0; i < arrlen(program->stmts); ++i) + sema_node(sctx, program->stmts[i]); + + if (!sctx->cm->opts.compile_only && !sctx->main_defined) + sema_error(sctx, nil, "missing 'main' entrypoint proc"); + + /* check unused local procedures */ + const SymbolEntry *syms = sctx->current_scope->symbols; + for (isize i = 0; i < shlen(syms); ++i) { + const Symbol fsym = syms[i].value; + if (fsym.kind == SymProc + && !fsym.dtype->builtin + && !fsym.dtype->proc.public + && !fsym.dtype->proc.extern_lnk + && !fsym.used) { + sema_warning( + sctx, &fsym.loc, + "defined proc '%s' is never called in this module", syms[i].key + ); + } + } +} diff --git a/compiler/sema.h b/compiler/sema.h new file mode 100644 index 0000000..a3e95b9 --- /dev/null +++ b/compiler/sema.h @@ -0,0 +1,32 @@ +#ifndef _sema_h_ +#define _sema_h_ + +#include "ast.h" +#include "state.h" + +typedef struct Scope Scope; +typedef struct SemaCtx SemaCtx; + +struct SemaCtx +{ + SemaCtx *prev; + Scope *current_scope; + Scope *top_scope; + Compiler *cm; + u64 flags; /* Bit field storing context flags */ + struct { + DataType *tyu64; + DataType *void_t; + } builtintypes; + bool ok; /* did the semantic check fail */ + bool main_defined; +}; + +SemaCtx * +sema_new(Compiler *cm); +void +sema_destroy(SemaCtx *sctx); +void +sema(SemaCtx *sctx, Ast *program); + +#endif diff --git a/compiler/state.c b/compiler/state.c new file mode 100644 index 0000000..301579a --- /dev/null +++ b/compiler/state.c @@ -0,0 +1,13 @@ +#include +#include "pre.h" +#include "state.h" + +void +compiler_assert_impl(Compiler *cm, const char *pred_s) +{ + (void)cm; + fprintf(stderr, "Bug check fail: `%s`\n", pred_s); + fputs("This is a compiler bug, please report! 
(run with -v for bug reporting instructions)\n\n", stderr); + fflush(stderr); + debugtrap(); +} diff --git a/compiler/state.h b/compiler/state.h new file mode 100644 index 0000000..477389e --- /dev/null +++ b/compiler/state.h @@ -0,0 +1,30 @@ +#ifndef _state_h_ +#define _state_h_ + +#include "pre.h" +#include "cgBackends.h" + +/* Assert meant to catch compiler bugs. The difference with a normal assert is that + * this one stays on release builds. Better to crash than to deal with some weird bug + * seeping through codegen. Wrapped in do/while (0) so it expands to exactly one statement and cannot capture a following else. + */ +#define compiler_assert(cm, pred) do { if (!(pred)) {compiler_assert_impl(cm, #pred);} } while (0) + +typedef struct { + struct { + bool color; /* colored diagnostics */ + bool compile_only; /* when set, sema() does not require a main proc */ + Str exe_out; + Str release_mode; + Vec(Str) defines; + enum CodegenBackends backend; + isize max_errors; + } opts; + Str current_filename; + isize error_count; +} Compiler; + +void +compiler_assert_impl(Compiler *cm, const char *pred_s); + +#endif diff --git a/compiler/symbol.c b/compiler/symbol.c new file mode 100644 index 0000000..033011c --- /dev/null +++ b/compiler/symbol.c @@ -0,0 +1,10 @@ +#include "symbol.h" + +const char *SymbolKindStr[] = { + [SymInvalid] = "", + [SymLet] = "let", + [SymVar] = "var", + [SymConst] = "const", + [SymProc] = "proc", + [SymType] = "type definition", +}; diff --git a/compiler/symbol.h b/compiler/symbol.h new file mode 100644 index 0000000..7baf061 --- /dev/null +++ b/compiler/symbol.h @@ -0,0 +1,17 @@ +#ifndef _symbol_h_ +#define _symbol_h_ + +#define symbol_is_var_binding(sk) ((sk) >= SymLet && (sk) <= SymConst) + +enum SymbolKind /* symbol_is_var_binding() relies on SymLet, SymVar and SymConst staying contiguous and in this order */ +{ + SymInvalid, + SymLet, + SymVar, + SymConst, + SymProc, + SymType, /* a data type definition */ +}; +extern const char *SymbolKindStr[]; + +#endif