harec

[hare] Hare compiler, written in C11 for POSIX OSs
Log | Files | Refs | README | LICENSE

commit cd8e44cbb0d975160cf6bd755798793e080a164c
parent 7d1f4cf5f3b34969dda64eeec345c1f21e58967b
Author: Bor Grošelj Simić <bgs@turminal.net>
Date:   Thu, 25 Aug 2022 21:36:09 +0200

lex.c: report lexer errors

Lexer used T_ERROR for error reporting, but the rest of the code just
ignored those tokens. Lex errors now get reported immediately and abort
execution.

Invalid UTF-8 sequences are reported in next(), changing (c !=
UTF8_INVALID) checks in the rest of the code to eof checks.

Signed-off-by: Bor Grošelj Simić <bgs@turminal.net>

Diffstat:
Minclude/lex.h | 3+++
Msrc/lex.c | 131++++++++++++++++++++++++++++++++++++++++---------------------------------------
2 files changed, 69 insertions(+), 65 deletions(-)

diff --git a/include/lex.h b/include/lex.h @@ -3,6 +3,9 @@ #include <stdint.h> #include <stdio.h> #include "types.h" +#include "utf8.h" + +#define C_EOF UTF8_INVALID // Keep sorted enum lexical_token { diff --git a/src/lex.c b/src/lex.c @@ -2,11 +2,13 @@ #include <ctype.h> #include <errno.h> #include <inttypes.h> +#include <stdarg.h> #include <stdbool.h> #include <stddef.h> #include <stdint.h> #include <stdio.h> #include <stdlib.h> +#include <stdnoreturn.h> #include <string.h> #include "lex.h" #include "utf8.h" @@ -135,6 +137,21 @@ static const char *tokens[] = { [T_BXOREQ] = "^=", }; +static noreturn void +error(struct location *loc, char *fmt, ...) +{ + fprintf(stderr, "Syntax error at %s:%d:%d: ", sources[loc->file], + loc->lineno, loc->colno); + + va_list ap; + va_start(ap, fmt); + vfprintf(stderr, fmt, ap); + va_end(ap); + + fputc('\n', stderr); + exit(EXIT_FAILURE); +} + void lex_init(struct lexer *lexer, FILE *f, int fileid) { @@ -181,6 +198,9 @@ next(struct lexer *lexer, struct location *loc, bool buffer) } else { c = utf8_get(lexer->in); update_lineno(&lexer->loc, c); + if (c == UTF8_INVALID && !feof(lexer->in)) { + error(&lexer->loc, "Invalid UTF-8 sequence encountered"); + } } if (loc != NULL) { *loc = lexer->loc; @@ -188,7 +208,7 @@ next(struct lexer *lexer, struct location *loc, bool buffer) update_lineno(&lexer->loc, lexer->c[i]); } } - if (c == UTF8_INVALID || !buffer) { + if (c == C_EOF || !buffer) { return c; } if (lexer->buflen + utf8_cpsize(c) >= lexer->bufsz) { @@ -213,7 +233,7 @@ static uint32_t wgetc(struct lexer *lexer, struct location *loc) { uint32_t c; - while ((c = next(lexer, loc, false)) != UTF8_INVALID && isharespace(c)) ; + while ((c = next(lexer, loc, false)) != C_EOF && isharespace(c)) ; return c; } @@ -252,8 +272,8 @@ static uint32_t lex_name(struct lexer *lexer, struct token *out) { uint32_t c = next(lexer, &out->loc, true); - assert(c != UTF8_INVALID && c <= 0x7F && (isalpha(c) || c == '_' || c == '@')); - while ((c = next(lexer, NULL, true)) != UTF8_INVALID) { + assert(c != C_EOF && c <= 0x7F && (isalpha(c) || c == '_' || c == '@')); + while ((c = next(lexer, NULL, true)) != C_EOF) { if (c > 0x7F || (!isalnum(c) && c != '_')) { push(lexer, c, true); break; @@ -264,9 +284,7 @@ lex_name(struct lexer *lexer, struct token *out) sizeof(tokens[0]), cmp_keyword); if (!token) { if (lexer->buf[0] == '@') { - out->token = T_ERROR; - consume(lexer, -1); - return out->token; + error(&out->loc, "Unknown attribute %s", lexer->buf); } out->token = T_NAME; out->name = strdup(lexer->buf); @@ -281,7 +299,7 @@ static uint32_t lex_literal(struct lexer *lexer, struct token *out) { uint32_t c = next(lexer, &out->loc, true); - assert(c != UTF8_INVALID && c <= 0x7F && isdigit(c)); + assert(c != C_EOF && c <= 0x7F && isdigit(c)); bool started = false; int base = 10; @@ -315,7 +333,7 @@ lex_literal(struct lexer *lexer, struct token *out) char *suff = NULL; char *exp = NULL; bool isfloat = false; - while ((c = next(lexer, NULL, true)) != UTF8_INVALID) { + while ((c = next(lexer, NULL, true)) != C_EOF) { if (!strchr(basechrs, c)) { switch (c) { case '.': @@ -418,16 +436,18 @@ finalize: } } if (!isvalid) { - out->token = T_ERROR; - consume(lexer, -1); - return out->token; + error(&out->loc, "Invalid numeric suffix"); } } intmax_t exponent = 0; if (exp) { char *endptr = NULL; + errno = 0; exponent = strtoimax(exp, &endptr, 10); + if (errno == ERANGE) { + error(&out->loc, "Numerical exponent overflow"); + } // integers can't have negative exponents if (exponent < 0 && !suff) { out->storage = STORAGE_FCONST; @@ -438,9 +458,7 @@ finalize: || s == STORAGE_F64 || s == STORAGE_FCONST; if (endptr == exp || !valid) { - out->token = T_ERROR; - consume(lexer, -1); - return out->token; + error(&out->loc, "Integers cannot have negative exponents"); } } @@ -451,9 +469,7 @@ finalize: case STORAGE_FCONST: break; default: - out->token = T_ERROR; - consume(lexer, -1); - return out->token; + error(&out->loc, "Unexpected decimal point in integer literal"); } } @@ -508,7 +524,7 @@ finalize: assert(0); } if (errno == ERANGE && !isfloat) { - out->token = T_ERROR; + error(&out->loc, "Integer literal overflow"); } consume(lexer, -1); return out->token; @@ -520,7 +536,7 @@ lex_rune(struct lexer *lexer) char buf[9]; char *endptr; uint32_t c = next(lexer, NULL, false); - assert(c != UTF8_INVALID); + assert(c != C_EOF); switch (c) { case '\\': @@ -554,11 +570,7 @@ lex_rune(struct lexer *lexer) buf[2] = '\0'; c = strtoul(&buf[0], &endptr, 16); if (*endptr != '\0') { - fprintf(stderr, - "Error: invalid hex literal at %s:%d:%d\n", - sources[lexer->loc.file], lexer->loc.lineno, - lexer->loc.colno); - exit(EXIT_FAILURE); + error(&lexer->loc, "Invalid hex literal"); } return c; case 'u': @@ -569,11 +581,7 @@ lex_rune(struct lexer *lexer) buf[4] = '\0'; c = strtoul(&buf[0], &endptr, 16); if (*endptr != '\0') { - fprintf(stderr, - "Error: invalid hex literal at %s:%d:%d\n", - sources[lexer->loc.file], lexer->loc.lineno, - lexer->loc.colno); - exit(EXIT_FAILURE); + error(&lexer->loc, "Invalid hex literal"); } return c; case 'U': @@ -588,19 +596,13 @@ lex_rune(struct lexer *lexer) buf[8] = '\0'; c = strtoul(&buf[0], &endptr, 16); if (*endptr != '\0') { - fprintf(stderr, - "Error: invalid hex literal at %s:%d:%d\n", - sources[lexer->loc.file], lexer->loc.lineno, - lexer->loc.colno); - exit(EXIT_FAILURE); + error(&lexer->loc, "Invalid hex literal"); } return c; + case C_EOF: + error(&lexer->loc, "Unexpected end of file"); default: - fprintf(stderr, - "Error: invalid escape '\\%c' at %s:%d:%d\n", - c, sources[lexer->loc.file], lexer->loc.lineno, - lexer->loc.colno); - exit(EXIT_FAILURE); + error(&lexer->loc, "Invalid escape '\\%c'", c); } assert(0); default: @@ -614,36 +616,34 @@ lex_string(struct lexer *lexer, struct token *out) { uint32_t c = next(lexer, &out->loc, false); uint32_t delim; - assert(c != UTF8_INVALID); switch (c) { case '"': case '`': delim = c; - while ((c = next(lexer, NULL, false)) != UTF8_INVALID) { - if (c == delim) { - char *buf = xcalloc(lexer->buflen + 1, 1); - memcpy(buf, lexer->buf, lexer->buflen); - out->token = T_LITERAL; - out->storage = STORAGE_STRING; - out->string.len = lexer->buflen; - out->string.value = buf; - consume(lexer, -1); - return out->token; - } else { - push(lexer, c, false); - if (delim == '"') { - push(lexer, lex_rune(lexer), false); - } - next(lexer, NULL, true); + while ((c = next(lexer, NULL, false)) != delim) { + if (c == C_EOF) { + error(&lexer->loc, "Unexpected end of file"); + } + push(lexer, c, false); + if (delim == '"') { + push(lexer, lex_rune(lexer), false); } + next(lexer, NULL, true); } - assert(0); // Invariant + char *buf = xcalloc(lexer->buflen + 1, 1); + memcpy(buf, lexer->buf, lexer->buflen); + out->token = T_LITERAL; + out->storage = STORAGE_STRING; + out->string.len = lexer->buflen; + out->string.value = buf; + consume(lexer, -1); + return out->token; case '\'': c = next(lexer, NULL, false); switch (c) { case '\'': - assert(0); // Invariant + error(&out->loc, "Expected rune before trailing single quote"); case '\\': push(lexer, c, false); out->rune = lex_rune(lexer); @@ -651,8 +651,9 @@ lex_string(struct lexer *lexer, struct token *out) default: out->rune = c; } - c = next(lexer, NULL, false); - assert(c == '\''); + if (next(lexer, NULL, false) != '\'') { + error(&out->loc, "Expected trailing single quote"); + } out->token = T_LITERAL; out->storage = STORAGE_RCONST; return out->token; @@ -665,7 +666,7 @@ lex_string(struct lexer *lexer, struct token *out) static enum lexical_token lex3(struct lexer *lexer, struct token *out, uint32_t c) { - assert(c != UTF8_INVALID); + assert(c != C_EOF); switch (c) { case '.': @@ -810,7 +811,7 @@ static enum lexical_token _lex(struct lexer *lexer, struct token *out); static enum lexical_token lex2(struct lexer *lexer, struct token *out, uint32_t c) { - assert(c != UTF8_INVALID); + assert(c != C_EOF); switch (c) { case '*': @@ -841,7 +842,7 @@ lex2(struct lexer *lexer, struct token *out, uint32_t c) out->token = T_DIVEQ; break; case '/': - while ((c = next(lexer, NULL, false)) != UTF8_INVALID && c != '\n') ; + while ((c = next(lexer, NULL, false)) != C_EOF && c != '\n') ; return _lex(lexer, out); default: push(lexer, c, false); @@ -924,7 +925,7 @@ _lex(struct lexer *lexer, struct token *out) } uint32_t c = wgetc(lexer, &out->loc); - if (c == UTF8_INVALID) { + if (c == C_EOF) { out->token = T_EOF; return out->token; }