lex.c: report lexer errors - harec - [hare] Hare compiler, written in C11 for POSIX OSs

commit cd8e44cbb0d975160cf6bd755798793e080a164c
parent 7d1f4cf5f3b34969dda64eeec345c1f21e58967b
Author: Bor Grošelj Simić <bgs@turminal.net>
Date:   Thu, 25 Aug 2022 21:36:09 +0200

lex.c: report lexer errors

Lexer used T_ERROR for error reporting, but the rest of the code just
ignored those tokens. Lex errors now get reported immediately and abort
execution.

Invalid UTF-8 sequences are reported in next(), changing (c !=
UTF8_INVALID) checks in the rest of the code to eof checks.

Signed-off-by: Bor Grošelj Simić <bgs@turminal.net>

Diffstat:
M include/lex.h  | 3 +++
M src/lex.c  | 131 ++++++++++++++++++++++++++++++++++++++++---------------------------------------

2 files changed, 69 insertions(+), 65 deletions(-)
diff --git a/include/lex.h b/include/lex.h
@@ -3,6 +3,9 @@
 #include <stdint.h>
 #include <stdio.h>
 #include "types.h"
+#include "utf8.h"
+
+#define C_EOF UTF8_INVALID
 
 // Keep sorted
 enum lexical_token {
diff --git a/src/lex.c b/src/lex.c
@@ -2,11 +2,13 @@
 #include <ctype.h>
 #include <errno.h>
 #include <inttypes.h>
+#include <stdarg.h>
 #include <stdbool.h>
 #include <stddef.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <stdnoreturn.h>
 #include <string.h>
 #include "lex.h"
 #include "utf8.h"
@@ -135,6 +137,21 @@ static const char *tokens[] = {
 	[T_BXOREQ] = "^=",
 };
 
+static noreturn void
+error(struct location *loc, char *fmt, ...)
+{
+	fprintf(stderr, "Syntax error at %s:%d:%d: ", sources[loc->file],
+			loc->lineno, loc->colno);
+
+	va_list ap;
+	va_start(ap, fmt);
+	vfprintf(stderr, fmt, ap);
+	va_end(ap);
+
+	fputc('\n', stderr);
+	exit(EXIT_FAILURE);
+}
+
 void
 lex_init(struct lexer *lexer, FILE *f, int fileid)
 {
@@ -181,6 +198,9 @@ next(struct lexer *lexer, struct location *loc, bool buffer)
 	} else {
 		c = utf8_get(lexer->in);
 		update_lineno(&lexer->loc, c);
+		if (c == UTF8_INVALID && !feof(lexer->in)) {
+			error(&lexer->loc, "Invalid UTF-8 sequence encountered");
+		}
 	}
 	if (loc != NULL) {
 		*loc = lexer->loc;
@@ -188,7 +208,7 @@ next(struct lexer *lexer, struct location *loc, bool buffer)
 			update_lineno(&lexer->loc, lexer->c[i]);
 		}
 	}
-	if (c == UTF8_INVALID || !buffer) {
+	if (c == C_EOF || !buffer) {
 		return c;
 	}
 	if (lexer->buflen + utf8_cpsize(c) >= lexer->bufsz) {
@@ -213,7 +233,7 @@ static uint32_t
 wgetc(struct lexer *lexer, struct location *loc)
 {
 	uint32_t c;
-	while ((c = next(lexer, loc, false)) != UTF8_INVALID && isharespace(c)) ;
+	while ((c = next(lexer, loc, false)) != C_EOF && isharespace(c)) ;
 	return c;
 }
 
@@ -252,8 +272,8 @@ static uint32_t
 lex_name(struct lexer *lexer, struct token *out)
 {
 	uint32_t c = next(lexer, &out->loc, true);
-	assert(c != UTF8_INVALID && c <= 0x7F && (isalpha(c) || c == '_' || c == '@'));
-	while ((c = next(lexer, NULL, true)) != UTF8_INVALID) {
+	assert(c != C_EOF && c <= 0x7F && (isalpha(c) || c == '_' || c == '@'));
+	while ((c = next(lexer, NULL, true)) != C_EOF) {
 		if (c > 0x7F || (!isalnum(c) && c != '_')) {
 			push(lexer, c, true);
 			break;
@@ -264,9 +284,7 @@ lex_name(struct lexer *lexer, struct token *out)
 			sizeof(tokens[0]), cmp_keyword);
 	if (!token) {
 		if (lexer->buf[0] == '@') {
-			out->token = T_ERROR;
-			consume(lexer, -1);
-			return out->token;
+			error(&out->loc, "Unknown attribute %s", lexer->buf);
 		}
 		out->token = T_NAME;
 		out->name = strdup(lexer->buf);
@@ -281,7 +299,7 @@ static uint32_t
 lex_literal(struct lexer *lexer, struct token *out)
 {
 	uint32_t c = next(lexer, &out->loc, true);
-	assert(c != UTF8_INVALID && c <= 0x7F && isdigit(c));
+	assert(c != C_EOF && c <= 0x7F && isdigit(c));
 
 	bool started = false;
 	int base = 10;
@@ -315,7 +333,7 @@ lex_literal(struct lexer *lexer, struct token *out)
 	char *suff = NULL;
 	char *exp = NULL;
 	bool isfloat = false;
-	while ((c = next(lexer, NULL, true)) != UTF8_INVALID) {
+	while ((c = next(lexer, NULL, true)) != C_EOF) {
 		if (!strchr(basechrs, c)) {
 			switch (c) {
 			case '.':
@@ -418,16 +436,18 @@ finalize:
 			}
 		}
 		if (!isvalid) {
-			out->token = T_ERROR;
-			consume(lexer, -1);
-			return out->token;
+			error(&out->loc, "Invalid numeric suffix");
 		}
 	}
 
 	intmax_t exponent = 0;
 	if (exp) {
 		char *endptr = NULL;
+		errno = 0;
 		exponent = strtoimax(exp, &endptr, 10);
+		if (errno == ERANGE) {
+			error(&out->loc, "Numerical exponent overflow");
+		}
 		// integers can't have negative exponents
 		if (exponent < 0 && !suff) {
 			out->storage = STORAGE_FCONST;
@@ -438,9 +458,7 @@ finalize:
 			|| s == STORAGE_F64
 			|| s == STORAGE_FCONST;
 		if (endptr == exp || !valid) {
-			out->token = T_ERROR;
-			consume(lexer, -1);
-			return out->token;
+			error(&out->loc, "Integers cannot have negative exponents");
 		}
 	}
 
@@ -451,9 +469,7 @@ finalize:
 		case STORAGE_FCONST:
 			break;
 		default:
-			out->token = T_ERROR;
-			consume(lexer, -1);
-			return out->token;
+			error(&out->loc, "Unexpected decimal point in integer literal");
 		}
 	}
 
@@ -508,7 +524,7 @@ finalize:
 		assert(0);
 	}
 	if (errno == ERANGE && !isfloat) {
-		out->token = T_ERROR;
+		error(&out->loc, "Integer literal overflow");
 	}
 	consume(lexer, -1);
 	return out->token;
@@ -520,7 +536,7 @@ lex_rune(struct lexer *lexer)
 	char buf[9];
 	char *endptr;
 	uint32_t c = next(lexer, NULL, false);
-	assert(c != UTF8_INVALID);
+	assert(c != C_EOF);
 
 	switch (c) {
 	case '\\':
@@ -554,11 +570,7 @@ lex_rune(struct lexer *lexer)
 			buf[2] = '\0';
 			c = strtoul(&buf[0], &endptr, 16);
 			if (*endptr != '\0') {
-				fprintf(stderr,
-					"Error: invalid hex literal at %s:%d:%d\n",
-					sources[lexer->loc.file], lexer->loc.lineno,
-					lexer->loc.colno);
-				exit(EXIT_FAILURE);
+				error(&lexer->loc, "Invalid hex literal");
 			}
 			return c;
 		case 'u':
@@ -569,11 +581,7 @@ lex_rune(struct lexer *lexer)
 			buf[4] = '\0';
 			c = strtoul(&buf[0], &endptr, 16);
 			if (*endptr != '\0') {
-				fprintf(stderr,
-					"Error: invalid hex literal at %s:%d:%d\n",
-					sources[lexer->loc.file], lexer->loc.lineno,
-					lexer->loc.colno);
-				exit(EXIT_FAILURE);
+				error(&lexer->loc, "Invalid hex literal");
 			}
 			return c;
 		case 'U':
@@ -588,19 +596,13 @@ lex_rune(struct lexer *lexer)
 			buf[8] = '\0';
 			c = strtoul(&buf[0], &endptr, 16);
 			if (*endptr != '\0') {
-				fprintf(stderr,
-					"Error: invalid hex literal at %s:%d:%d\n",
-					sources[lexer->loc.file], lexer->loc.lineno,
-					lexer->loc.colno);
-				exit(EXIT_FAILURE);
+				error(&lexer->loc, "Invalid hex literal");
 			}
 			return c;
+		case C_EOF:
+			error(&lexer->loc, "Unexpected end of file");
 		default:
-			fprintf(stderr,
-				"Error: invalid escape '\\%c' at %s:%d:%d\n",
-				c, sources[lexer->loc.file], lexer->loc.lineno,
-				lexer->loc.colno);
-			exit(EXIT_FAILURE);
+			error(&lexer->loc, "Invalid escape '\\%c'", c);
 		}
 		assert(0);
 	default:
@@ -614,36 +616,34 @@ lex_string(struct lexer *lexer, struct token *out)
 {
 	uint32_t c = next(lexer, &out->loc, false);
 	uint32_t delim;
-	assert(c != UTF8_INVALID);
 
 	switch (c) {
 	case '"':
 	case '`':
 		delim = c;
-		while ((c = next(lexer, NULL, false)) != UTF8_INVALID) {
-			if (c == delim) {
-				char *buf = xcalloc(lexer->buflen + 1, 1);
-				memcpy(buf, lexer->buf, lexer->buflen);
-				out->token = T_LITERAL;
-				out->storage = STORAGE_STRING;
-				out->string.len = lexer->buflen;
-				out->string.value = buf;
-				consume(lexer, -1);
-				return out->token;
-			} else {
-				push(lexer, c, false);
-				if (delim == '"') {
-					push(lexer, lex_rune(lexer), false);
-				}
-				next(lexer, NULL, true);
+		while ((c = next(lexer, NULL, false)) != delim) {
+			if (c == C_EOF) {
+				error(&lexer->loc, "Unexpected end of file");
+			}
+			push(lexer, c, false);
+			if (delim == '"') {
+				push(lexer, lex_rune(lexer), false);
 			}
+			next(lexer, NULL, true);
 		}
-		assert(0); // Invariant
+		char *buf = xcalloc(lexer->buflen + 1, 1);
+		memcpy(buf, lexer->buf, lexer->buflen);
+		out->token = T_LITERAL;
+		out->storage = STORAGE_STRING;
+		out->string.len = lexer->buflen;
+		out->string.value = buf;
+		consume(lexer, -1);
+		return out->token;
 	case '\'':
 		c = next(lexer, NULL, false);
 		switch (c) {
 		case '\'':
-			assert(0); // Invariant
+			error(&out->loc, "Expected rune before trailing single quote");
 		case '\\':
 			push(lexer, c, false);
 			out->rune = lex_rune(lexer);
@@ -651,8 +651,9 @@ lex_string(struct lexer *lexer, struct token *out)
 		default:
 			out->rune = c;
 		}
-		c = next(lexer, NULL, false);
-		assert(c == '\'');
+		if (next(lexer, NULL, false) != '\'') {
+			error(&out->loc, "Expected trailing single quote");
+		}
 		out->token = T_LITERAL;
 		out->storage = STORAGE_RCONST;
 		return out->token;
@@ -665,7 +666,7 @@ lex_string(struct lexer *lexer, struct token *out)
 static enum lexical_token
 lex3(struct lexer *lexer, struct token *out, uint32_t c)
 {
-	assert(c != UTF8_INVALID);
+	assert(c != C_EOF);
 
 	switch (c) {
 	case '.':
@@ -810,7 +811,7 @@ static enum lexical_token _lex(struct lexer *lexer, struct token *out);
 static enum lexical_token
 lex2(struct lexer *lexer, struct token *out, uint32_t c)
 {
-	assert(c != UTF8_INVALID);
+	assert(c != C_EOF);
 
 	switch (c) {
 	case '*':
@@ -841,7 +842,7 @@ lex2(struct lexer *lexer, struct token *out, uint32_t c)
 			out->token = T_DIVEQ;
 			break;
 		case '/':
-			while ((c = next(lexer, NULL, false)) != UTF8_INVALID && c != '\n') ;
+			while ((c = next(lexer, NULL, false)) != C_EOF && c != '\n') ;
 			return _lex(lexer, out);
 		default:
 			push(lexer, c, false);
@@ -924,7 +925,7 @@ _lex(struct lexer *lexer, struct token *out)
 	}
 
 	uint32_t c = wgetc(lexer, &out->loc);
-	if (c == UTF8_INVALID) {
+	if (c == C_EOF) {
 		out->token = T_EOF;
 		return out->token;
 	}

	harec [hare] Hare compiler, written in C11 for POSIX OSs
	Log \| Files \| Refs \| README \| LICENSE

M	include/lex.h	\|	3	+++
M	src/lex.c	\|	131	++++++++++++++++++++++++++++++++++++++++---------------------------------------