harec

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | README | LICENSE

commit 4e83713538860a3cacc809d120577e72e79dd200
parent d4f105b90fb92e4db0e0947200af156d9a24e2b7
Author: Eyal Sawady <ecs@d2evs.net>
Date:   Fri, 11 Dec 2020 17:56:54 -0500

parse: implement declarations

Also parse a minimal subset of types and expressions

Diffstat:
M include/ast.h | 144 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
M include/identifier.h | 1 +
M include/types.h | 29 +++++++++++++++++++++--------
M src/lex.c | 12 ++++++++++++
M src/parse.c | 457 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
5 files changed, 623 insertions(+), 20 deletions(-)

diff --git a/include/ast.h b/include/ast.h @@ -1,6 +1,9 @@ #ifndef HARE_AST_H #define HARE_AST_H +#include <stdbool.h> +#include <stdint.h> #include "identifier.h" +#include "types.h" enum ast_import_mode { AST_IMPORT_IDENTIFIER, // use foo::bar; @@ -18,25 +21,150 @@ struct ast_imports { struct ast_imports *next; }; -enum ast_declaration_type { +struct ast_list_type { + struct ast_expression *length; // NULL for slices and unbounded arrays + struct type *members; +}; + +struct ast_enum_field { + const char *name; + struct ast_expression *value; + struct enum_field *next; +}; + +struct ast_enum_type { + enum type_storage storage; + struct ast_enum_field *values; +}; + +enum variadism { + VARIADISM_NONE, + VARIADISM_C, + VARIADISM_HARE, +}; + +struct ast_function_parameters { + char *name; + struct ast_type *type; + struct ast_function_parameters *next; +}; + +struct ast_function_type { + bool noreturn; + enum variadism variadism; + struct ast_type *result; + struct ast_function_parameters *parameters; +}; + +struct ast_pointer_type { + bool nullable; + struct ast_type *referent; +}; + +struct ast_tagged_union_type { + struct ast_type *type; + struct ast_tagged_union_type *next; +}; + +struct ast_struct_union_type { + const char *name; + struct ast_type *type; + struct ast_struct_union_type *next; +}; + +struct ast_type { + enum type_storage storage; + bool constant; + union { + struct identifier alias; + struct ast_list_type array; + struct ast_enum_type _enum; + struct ast_function_type function; + struct ast_pointer_type pointer; + struct ast_list_type slice; + struct ast_struct_union_type _struct; + struct ast_tagged_union_type tagged_union; + struct ast_struct_union_type _union; + }; +}; + +enum expression_type { + EXPR_CONSTANT, +}; + +struct ast_constant_expression { + enum type_storage storage; + union { + intmax_t _signed; + uintmax_t _unsigned; + struct { + size_t len; + char *value; + } string; + }; +}; + +struct ast_expression { + enum expression_type 
type; + union { + struct ast_constant_expression constant; + }; +}; + +struct ast_global_decl { + char *symbol; + struct identifier ident; + struct ast_type type; + struct ast_expression init; + struct ast_global_decl *next; +}; + +struct ast_type_decl { + struct identifier ident; + struct ast_type type; + struct ast_type_decl *next; +}; + +enum function_flags { + FN_FINI = 1 << 0, + FN_INIT = 1 << 1, + FN_TEST = 1 << 2, +}; + +struct ast_function_decl { + char *symbol; + uint32_t flags; // enum function_flags + struct identifier ident; + struct ast_function_type prototype; + struct ast_expression body; +}; + +enum ast_decl_type { AST_DECL_FUNC, AST_DECL_TYPE, - AST_DECL_VAR, + AST_DECL_GLOBAL, AST_DECL_CONST, }; -struct ast_declaration { - enum ast_declaration_type type; +struct ast_decl { + enum ast_decl_type decl_type; + bool exported; + union { + struct ast_global_decl global; + struct ast_global_decl constant; + struct ast_type_decl type; + struct ast_function_decl function; + }; }; -struct ast_declarations { - struct ast_declaration decl; - struct ast_declarations *next; +struct ast_decls { + struct ast_decl decl; + struct ast_decls *next; }; struct ast_subunit { struct ast_imports *imports; - struct ast_declarations decls; + struct ast_decls decls; struct ast_subunit *next; }; diff --git a/include/identifier.h b/include/identifier.h @@ -1,5 +1,6 @@ #ifndef HARE_IDENTIFIER_H #define HARE_IDENTIFIER_H +#include <stddef.h> struct identifier { char *name; diff --git a/include/types.h b/include/types.h @@ -1,25 +1,38 @@ #ifndef HARE_TYPES_H #define HARE_TYPES_H +#include <stdbool.h> +#include "identifier.h" enum type_storage { // Scalar types - TYPE_STORAGE_U8, - TYPE_STORAGE_U16, - TYPE_STORAGE_U32, - TYPE_STORAGE_U64, - TYPE_STORAGE_I8, + TYPE_STORAGE_BOOL, + TYPE_STORAGE_CHAR, + TYPE_STORAGE_F32, + TYPE_STORAGE_F64, TYPE_STORAGE_I16, TYPE_STORAGE_I32, TYPE_STORAGE_I64, + TYPE_STORAGE_I8, TYPE_STORAGE_INT, TYPE_STORAGE_RUNE, + TYPE_STORAGE_SIZE, + 
TYPE_STORAGE_U16, + TYPE_STORAGE_U32, + TYPE_STORAGE_U64, + TYPE_STORAGE_U8, TYPE_STORAGE_UINT, TYPE_STORAGE_UINTPTR, - TYPE_STORAGE_SIZE, - TYPE_STORAGE_F32, - TYPE_STORAGE_F64, + TYPE_STORAGE_VOID, // Aggregate types + TYPE_STORAGE_ALIAS, + TYPE_STORAGE_ARRAY, + TYPE_STORAGE_FUNCTION, + TYPE_STORAGE_POINTER, + TYPE_STORAGE_SLICE, TYPE_STORAGE_STRING, + TYPE_STORAGE_STRUCT, + TYPE_STORAGE_TAGGED_UNION, + TYPE_STORAGE_UNION, }; const char *type_storage_unparse(enum type_storage storage); diff --git a/src/lex.c b/src/lex.c @@ -989,6 +989,18 @@ token_str(const struct token *tok) break; case TYPE_STORAGE_STRING: return string_unparse(tok); + case TYPE_STORAGE_ALIAS: + case TYPE_STORAGE_ARRAY: + case TYPE_STORAGE_BOOL: + case TYPE_STORAGE_CHAR: + case TYPE_STORAGE_FUNCTION: + case TYPE_STORAGE_POINTER: + case TYPE_STORAGE_SLICE: + case TYPE_STORAGE_STRUCT: + case TYPE_STORAGE_TAGGED_UNION: + case TYPE_STORAGE_UNION: + case TYPE_STORAGE_VOID: + assert(0); } return buf; default:; diff --git a/src/parse.c b/src/parse.c @@ -1,4 +1,5 @@ #include <assert.h> +#include <ctype.h> #include <stdarg.h> #include <stdbool.h> #include <stdio.h> @@ -17,6 +18,17 @@ struct parser { }; static void +synassert_msg(bool cond, const char *msg, struct token *tok) +{ + if (!cond) { + fprintf(stderr, "Syntax error: %s at %s:%d:%d ('%s')\n", msg, + tok->loc.path, tok->loc.lineno, tok->loc.colno, + token_str(tok)); + exit(1); + } +} + +static void synassert(bool cond, struct token *tok, ...) { if (!cond) { @@ -24,11 +36,10 @@ synassert(bool cond, struct token *tok, ...) va_start(ap, tok); enum lexical_token t = va_arg(ap, enum lexical_token); - const char *s = token_str(tok); fprintf(stderr, - "Syntax error: unexpected '%s' at %s:%d:%d%s", s, - tok->loc.path, tok->loc.lineno, tok->loc.colno, - t == T_EOF ? "\n" : ", expected " ); + "Syntax error: unexpected '%s' at %s:%d:%d%s", + token_str(tok), tok->loc.path, tok->loc.lineno, + tok->loc.colno, t == T_EOF ? 
"\n" : ", expected " ); while (t != T_EOF) { if (t == T_LITERAL || t == T_NAME) { fprintf(stderr, "%s", lexical_token_str(t)); @@ -142,6 +153,442 @@ parse_imports(struct parser *par, struct ast_subunit *subunit) trleave(TR_PARSE, NULL); } +static void parse_type(struct parser *par, struct ast_type *type); + +static void +parse_parameter_list(struct parser *par, struct ast_function_type *type) +{ + trenter(TR_PARSE, "parameter-list"); + struct token tok = {0}; + bool more = true; + struct ast_function_parameters **next = &type->parameters; + while (more) { + *next = calloc(1, sizeof(struct ast_function_parameters)); + (*next)->type = calloc(1, sizeof(struct ast_type)); + want(par, T_NAME, &tok); + (*next)->name = tok.name; + want(par, T_COLON, NULL); + parse_type(par, (*next)->type); + switch (lex(par->lex, &tok)) { + case T_COMMA: + switch (lex(par->lex, &tok)) { + case T_ELLIPSIS: + type->variadism = VARIADISM_HARE; + if (lex(par->lex, &tok) != T_COMMA) { + unlex(par->lex, &tok); + } + more = false; + break; + default: + unlex(par->lex, &tok); + next = &(*next)->next; + break; + } + break; + case T_ELLIPSIS: + type->variadism = VARIADISM_C; + if (lex(par->lex, &tok) != T_COMMA) { + unlex(par->lex, &tok); + } + more = false; + break; + default: + more = false; + unlex(par->lex, &tok); + break; + } + } + trleave(TR_PARSE, NULL); +} + +static void +parse_prototype(struct parser *par, struct ast_function_type *type) +{ + trenter(TR_PARSE, "prototype"); + want(par, T_LPAREN, NULL); + struct token tok = {0}; + if (lex(par->lex, &tok) != T_RPAREN) { + unlex(par->lex, &tok); + parse_parameter_list(par, type); + want(par, T_RPAREN, NULL); + } + type->result = calloc(1, sizeof(struct ast_type)); + parse_type(par, type->result); + // TODO: unparse prototype + trleave(TR_PARSE, NULL); +} + +static void +parse_type(struct parser *par, struct ast_type *type) +{ + trenter(TR_PARSE, "type"); + struct token tok = {0}; + switch (lex(par->lex, &tok)) { + case T_CONST: + 
type->constant = true; + break; + default: + unlex(par->lex, &tok); + break; + } + switch (lex(par->lex, &tok)) { + case T_I8: + type->storage = TYPE_STORAGE_I8; + break; + case T_I16: + type->storage = TYPE_STORAGE_I16; + break; + case T_I32: + type->storage = TYPE_STORAGE_I32; + break; + case T_I64: + type->storage = TYPE_STORAGE_I64; + break; + case T_U8: + type->storage = TYPE_STORAGE_U8; + break; + case T_U16: + type->storage = TYPE_STORAGE_U16; + break; + case T_U32: + type->storage = TYPE_STORAGE_U32; + break; + case T_U64: + type->storage = TYPE_STORAGE_U64; + break; + case T_INT: + type->storage = TYPE_STORAGE_INT; + break; + case T_UINT: + type->storage = TYPE_STORAGE_UINT; + break; + case T_SIZE: + type->storage = TYPE_STORAGE_SIZE; + break; + case T_UINTPTR: + type->storage = TYPE_STORAGE_UINTPTR; + break; + case T_CHAR: + type->storage = TYPE_STORAGE_CHAR; + break; + case T_RUNE: + type->storage = TYPE_STORAGE_RUNE; + break; + case T_STR: + type->storage = TYPE_STORAGE_STRING; + break; + case T_F32: + type->storage = TYPE_STORAGE_F32; + break; + case T_F64: + type->storage = TYPE_STORAGE_F64; + break; + case T_BOOL: + type->storage = TYPE_STORAGE_BOOL; + break; + case T_VOID: + type->storage = TYPE_STORAGE_VOID; + break; + case T_ENUM: + assert(0); // TODO: Enums + case T_NULLABLE: + type->pointer.nullable = true; + want(par, T_TIMES, NULL); + /* fallthrough */ + case T_TIMES: + type->storage = TYPE_STORAGE_POINTER; + type->pointer.referent = calloc(1, sizeof(struct ast_type)); + parse_type(par, type->pointer.referent); + break; + case T_STRUCT: + case T_UNION: + assert(0); // TODO: Structs/unions + case T_LPAREN: + assert(0); // TODO: Tagged unions + case T_LBRACKET: + assert(0); // TODO: Slices/arrays + case T_ATTR_NORETURN: + type->function.noreturn = true; + want(par, T_FN, NULL); + /* fallthrough */ + case T_FN: + type->storage = TYPE_STORAGE_FUNCTION; + parse_prototype(par, &type->function); + break; + default: + unlex(par->lex, &tok); + 
type->storage = TYPE_STORAGE_ALIAS; + parse_identifier(par, &type->alias); + break; + } + // TODO: unparse type + trleave(TR_PARSE, NULL); +} + +static void +parse_simple_expression(struct parser *par, struct ast_expression *exp) +{ + struct token tok = {0}; + lex(par->lex, &tok); + assert(tok.token == T_LITERAL); // TODO: other simple expressions + exp->type = EXPR_CONSTANT; + exp->constant.storage = tok.storage; + switch (tok.storage) { + case TYPE_STORAGE_CHAR: + case TYPE_STORAGE_U8: + case TYPE_STORAGE_U16: + case TYPE_STORAGE_U32: + case TYPE_STORAGE_U64: + case TYPE_STORAGE_UINT: + case TYPE_STORAGE_UINTPTR: + case TYPE_STORAGE_SIZE: + exp->constant._unsigned = (uintmax_t)tok._unsigned; + break; + case TYPE_STORAGE_I8: + case TYPE_STORAGE_I16: + case TYPE_STORAGE_I32: + case TYPE_STORAGE_I64: + case TYPE_STORAGE_INT: + exp->constant._signed = (intmax_t)tok._signed; + break; + case TYPE_STORAGE_STRING: + exp->constant.string.len = tok.string.len; + exp->constant.string.value = tok.string.value; + break; + default: + assert(0); // TODO + } +} + +static void +parse_complex_expression(struct parser *par, struct ast_expression *exp) +{ + // TODO: other complex expressions + parse_simple_expression(par, exp); +} + +static char * +parse_attr_symbol(struct parser *par) +{ + struct token tok = {0}; + want(par, T_LPAREN, NULL); + want(par, T_LITERAL, &tok); + synassert_msg(tok.storage == TYPE_STORAGE_STRING, + "expected string literal", &tok); + for (size_t i = 0; i < tok.string.len; i++) { + uint32_t c = tok.string.value[i]; + synassert_msg(c <= 0x7F && (isalnum(c) || c == '_' || c == '$' + || c == '.'), "invalid symbol", &tok); + synassert_msg(i != 0 || (!isdigit(c) && c != '$'), + "invalid symbol", &tok); + } + want(par, T_RPAREN, NULL); + return tok.string.value; +} + +static void +parse_global_decl(struct parser *par, enum lexical_token mode, + struct ast_global_decl *decl) +{ + trenter(TR_PARSE, "global"); + struct token tok = {0}; + struct ast_global_decl *i = 
decl; + assert(mode == T_LET || mode == T_CONST || mode == T_DEF); + bool more = true; + while (more) { + if (mode == T_LET || mode == T_CONST) { + switch (lex(par->lex, &tok)) { + case T_ATTR_SYMBOL: + i->symbol = parse_attr_symbol(par); + break; + default: + unlex(par->lex, &tok); + break; + } + } + parse_identifier(par, &i->ident); + want(par, T_COLON, NULL); + parse_type(par, &i->type); + if (mode == T_CONST) { + i->type.constant = true; + } + want(par, T_EQUAL, NULL); + parse_simple_expression(par, &i->init); + switch (lex(par->lex, &tok)) { + case T_COMMA: + lex(par->lex, &tok); + if (tok.token == T_NAME || tok.token == T_ATTR_SYMBOL) { + i->next = calloc(1, sizeof(struct ast_global_decl)); + i = i->next; + unlex(par->lex, &tok); + break; + } + /* fallthrough */ + default: + more = false; + unlex(par->lex, &tok); + break; + } + } + + for (struct ast_global_decl *i = decl; i; i = i->next) { + char ibuf[1024], tbuf[1024], ebuf[1024]; + identifier_unparse_static(&i->ident, ibuf, sizeof(ibuf)); + strncpy(tbuf, "[type]", sizeof(tbuf)); // TODO: unparse type + strncpy(ebuf, "[expr]", sizeof(ebuf)); // TODO: unparse expr + if (decl->symbol) { + trace(TR_PARSE, "%s @symbol(\"%s\") %s: %s = %s", + lexical_token_str(mode), decl->symbol, ibuf, + tbuf, ebuf); + } else { + trace(TR_PARSE, "%s %s: %s = [expr]", + lexical_token_str(mode), ibuf, tbuf); + } + } + trleave(TR_PARSE, NULL); +} + +static void +parse_type_decl(struct parser *par, struct ast_type_decl *decl) +{ + trenter(TR_PARSE, "typedef"); + struct token tok = {0}; + struct ast_type_decl *i = decl; + bool more = true; + while (more) { + parse_identifier(par, &i->ident); + want(par, T_EQUAL, NULL); + parse_type(par, &i->type); + switch (lex(par->lex, &tok)) { + case T_COMMA: + lex(par->lex, &tok); + if (lex(par->lex, &tok) == T_NAME) { + i->next = calloc(1, sizeof(struct ast_type_decl)); + i = i->next; + unlex(par->lex, &tok); + break; + } + /* fallthrough */ + default: + more = false; + unlex(par->lex, &tok); + 
break; + } + } + + for (struct ast_type_decl *i = decl; i; i = i->next) { + char ibuf[1024], tbuf[1024]; + identifier_unparse_static(&i->ident, ibuf, sizeof(ibuf)); + strncpy(tbuf, "[type]", sizeof(tbuf)); // TODO: unparse type + trace(TR_PARSE, "def %s = %s", ibuf, tbuf); + } + trleave(TR_PARSE, NULL); +} + +static void +parse_fn_decl(struct parser *par, struct ast_function_decl *decl) +{ + trenter(TR_PARSE, "fn"); + struct token tok = {0}; + bool more = true; + while (more) { + switch (lex(par->lex, &tok)) { + case T_ATTR_FINI: + decl->flags |= FN_FINI; + break; + case T_ATTR_INIT: + decl->flags |= FN_INIT; + break; + case T_ATTR_SYMBOL: + decl->symbol = parse_attr_symbol(par); + break; + case T_ATTR_TEST: + decl->flags |= FN_TEST; + break; + case T_ATTR_NORETURN: + decl->prototype.noreturn = true; + break; + default: + more = false; + unlex(par->lex, &tok); + break; + } + } + want(par, T_FN, NULL); + parse_identifier(par, &decl->ident); + parse_prototype(par, &decl->prototype); + want(par, T_EQUAL, NULL); + parse_complex_expression(par, &decl->body); + + char symbol[1024], buf[1024]; + if (decl->symbol) { + snprintf(symbol, sizeof(symbol), "@symbol(\"%s\") ", decl->symbol); + } + identifier_unparse_static(&decl->ident, buf, sizeof(buf)); + trace(TR_PARSE, "%s%s%s%s%sfn %s %s = %s", + decl->flags & FN_FINI ? "@fini " : "", + decl->flags & FN_INIT ? "@init " : "", + decl->prototype.noreturn ? "@noreturn " : "", + decl->flags & FN_TEST ? "@test " : "", + decl->symbol ? 
symbol : "", buf, "[prototype]", "[expr]"); + trleave(TR_PARSE, NULL); +} + +static void +parse_decl(struct parser *par, struct ast_decl *decl) +{ + struct token tok = {0}; + switch (lex(par->lex, &tok)) { + case T_CONST: + case T_LET: + decl->decl_type = AST_DECL_GLOBAL; + parse_global_decl(par, tok.token, &decl->global); + break; + case T_DEF: + decl->decl_type = AST_DECL_CONST; + parse_global_decl(par, tok.token, &decl->constant); + break; + case T_TYPE: + decl->decl_type = AST_DECL_TYPE; + parse_type_decl(par, &decl->type); + break; + default: + unlex(par->lex, &tok); + decl->decl_type = AST_DECL_FUNC; + parse_fn_decl(par, &decl->function); + break; + } +} + +static void +parse_decls(struct parser *par, struct ast_decls *decls) +{ + trenter(TR_PARSE, "decls"); + struct token tok = {0}; + struct ast_decls **next = &decls; + while (tok.token != T_EOF) { + switch (lex(par->lex, &tok)) { + case T_EXPORT: + (*next)->decl.exported = true; + trace(TR_PARSE, "export"); + break; + default: + unlex(par->lex, &tok); + break; + } + parse_decl(par, &(*next)->decl); + next = &(*next)->next; + *next = calloc(1, sizeof(struct ast_decls)); + want(par, T_SEMICOLON, NULL); + if (lex(par->lex, &tok) != T_EOF) { + unlex(par->lex, &tok); + } + } + free(*next); + *next = 0; + trleave(TR_PARSE, NULL); +} + void parse(struct lexer *lex, struct ast_subunit *subunit) { @@ -149,4 +596,6 @@ parse(struct lexer *lex, struct ast_subunit *subunit) .lex = lex, }; parse_imports(&par, subunit); + parse_decls(&par, &subunit->decls); + want(&par, T_EOF, NULL); }