harec

[hare] Hare compiler, written in C11 for POSIX OSs
Log | Files | Refs | README | LICENSE

lex.c (24685B)


      1 #include <assert.h>
      2 #include <ctype.h>
      3 #include <errno.h>
      4 #include <inttypes.h>
      5 #include <stdarg.h>
      6 #include <stdbool.h>
      7 #include <stddef.h>
      8 #include <stdint.h>
      9 #include <stdio.h>
     10 #include <stdlib.h>
     11 #include <stdnoreturn.h>
     12 #include <string.h>
     13 #include "lex.h"
     14 #include "utf8.h"
     15 #include "util.h"
     16 
     17 static const char *tokens[] = {
     18 	// Must be alpha sorted and match lex.h
     19 	[T_ATTR_FINI] = "@fini",
     20 	[T_ATTR_INIT] = "@init",
     21 	[T_ATTR_NORETURN] = "@noreturn",
     22 	[T_ATTR_OFFSET] = "@offset",
     23 	[T_ATTR_PACKED] = "@packed",
     24 	[T_ATTR_SYMBOL] = "@symbol",
     25 	[T_ATTR_TEST] = "@test",
     26 	[T_ATTR_THREADLOCAL] = "@threadlocal",
     27 	[T_UNDERSCORE] = "_",
     28 	[T_ABORT] = "abort",
     29 	[T_ALLOC] = "alloc",
     30 	[T_APPEND] = "append",
     31 	[T_AS] = "as",
     32 	[T_ASSERT] = "assert",
     33 	[T_BOOL] = "bool",
     34 	[T_BREAK] = "break",
     35 	[T_CASE] = "case",
     36 	[T_CHAR] = "char",
     37 	[T_CONST] = "const",
     38 	[T_CONTINUE] = "continue",
     39 	[T_DEFER] = "defer",
     40 	[T_DEF] = "def",
     41 	[T_DELETE] = "delete",
     42 	[T_ELSE] = "else",
     43 	[T_ENUM] = "enum",
     44 	[T_EXPORT] = "export",
     45 	[T_F32] = "f32",
     46 	[T_F64] = "f64",
     47 	[T_FALSE] = "false",
     48 	[T_FN] = "fn",
     49 	[T_FOR] = "for",
     50 	[T_FREE] = "free",
     51 	[T_I16] = "i16",
     52 	[T_I32] = "i32",
     53 	[T_I64] = "i64",
     54 	[T_I8] = "i8",
     55 	[T_IF] = "if",
     56 	[T_INSERT] = "insert",
     57 	[T_INT] = "int",
     58 	[T_IS] = "is",
     59 	[T_LEN] = "len",
     60 	[T_LET] = "let",
     61 	[T_MATCH] = "match",
     62 	[T_NULL] = "null",
     63 	[T_NULLABLE] = "nullable",
     64 	[T_OFFSET] = "offset",
     65 	[T_RETURN] = "return",
     66 	[T_RUNE] = "rune",
     67 	[T_SIZE] = "size",
     68 	[T_STATIC] = "static",
     69 	[T_STR] = "str",
     70 	[T_STRUCT] = "struct",
     71 	[T_SWITCH] = "switch",
     72 	[T_TRUE] = "true",
     73 	[T_TYPE] = "type",
     74 	[T_U16] = "u16",
     75 	[T_U32] = "u32",
     76 	[T_U64] = "u64",
     77 	[T_U8] = "u8",
     78 	[T_UINT] = "uint",
     79 	[T_UINTPTR] = "uintptr",
     80 	[T_UNION] = "union",
     81 	[T_USE] = "use",
     82 	[T_VAARG] = "vaarg",
     83 	[T_VAEND] = "vaend",
     84 	[T_VALIST] = "valist",
     85 	[T_VASTART] = "vastart",
     86 	[T_VOID] = "void",
     87 	[T_YIELD] = "yield",
     88 
     89 	// Operators
     90 	[T_ARROW] = "=>",
     91 	[T_BANDEQ] = "&=",
     92 	[T_BAND] = "&",
     93 	[T_BNOT] = "~",
     94 	[T_BOR] = "|",
     95 	[T_COLON] = ":",
     96 	[T_COMMA] = ",",
     97 	[T_DIV] = "/",
     98 	[T_DIVEQ] = "/=",
     99 	[T_DOT] = ".",
    100 	[T_DOUBLE_COLON] = "::",
    101 	[T_ELLIPSIS] = "...",
    102 	[T_EQUAL] = "=",
    103 	[T_GREATER] = ">",
    104 	[T_GREATEREQ] = ">=",
    105 	[T_LAND] = "&&",
    106 	[T_LANDEQ] = "&&=",
    107 	[T_LBRACE] = "{",
    108 	[T_LBRACKET] = "[",
    109 	[T_LEQUAL] = "==",
    110 	[T_LESS] = "<",
    111 	[T_LESSEQ] = "<=",
    112 	[T_LNOT] = "!",
    113 	[T_LOR] = "||",
    114 	[T_LOREQ] = "||=",
    115 	[T_LPAREN] = "(",
    116 	[T_LSHIFT] = "<<",
    117 	[T_LSHIFTEQ] = "<<=",
    118 	[T_LXOR] = "^^",
    119 	[T_LXOREQ] = "^^=",
    120 	[T_MINUS] = "-",
    121 	[T_MINUSEQ] = "-=",
    122 	[T_MODEQ] = "%=",
    123 	[T_MODULO] = "%",
    124 	[T_NEQUAL] = "!=",
    125 	[T_BOREQ] = "|=",
    126 	[T_PLUS] = "+",
    127 	[T_PLUSEQ] = "+=",
    128 	[T_QUESTION] = "?",
    129 	[T_RBRACE] = "}",
    130 	[T_RBRACKET] = "]",
    131 	[T_RPAREN] = ")",
    132 	[T_RSHIFT] = ">>",
    133 	[T_RSHIFTEQ] = ">>=",
    134 	[T_SEMICOLON] = ";",
    135 	[T_SLICE] = "..",
    136 	[T_TIMES] = "*",
    137 	[T_TIMESEQ] = "*=",
    138 	[T_BXOR] = "^",
    139 	[T_BXOREQ] = "^=",
    140 };
    141 
    142 static noreturn void
    143 error(struct location *loc, char *fmt, ...)
    144 {
    145 	fprintf(stderr, "Syntax error at %s:%d:%d: ", sources[loc->file],
    146 			loc->lineno, loc->colno);
    147 
    148 	va_list ap;
    149 	va_start(ap, fmt);
    150 	vfprintf(stderr, fmt, ap);
    151 	va_end(ap);
    152 
    153 	fputc('\n', stderr);
    154 	errline(sources[loc->file], loc->lineno, loc->colno);
    155 	exit(EXIT_FAILURE);
    156 }
    157 
    158 void
    159 lex_init(struct lexer *lexer, FILE *f, int fileid)
    160 {
    161 	memset(lexer, 0, sizeof(*lexer));
    162 	lexer->in = f;
    163 	lexer->bufsz = 256;
    164 	lexer->buf = xcalloc(1, lexer->bufsz);
    165 	lexer->un.token = T_ERROR;
    166 	lexer->loc.lineno = 1;
    167 	lexer->loc.colno = 0;
    168 	lexer->loc.file = fileid;
    169 	lexer->c[0] = UINT32_MAX;
    170 	lexer->c[1] = UINT32_MAX;
    171 }
    172 
    173 void
    174 lex_finish(struct lexer *lexer)
    175 {
    176 	fclose(lexer->in);
    177 	free(lexer->buf);
    178 }
    179 
    180 static void
    181 update_lineno(struct location *loc, uint32_t c)
    182 {
    183 	if (c == '\n') {
    184 		loc->lineno++;
    185 		loc->colno = 0;
    186 	} else if (c == '\t') {
    187 		loc->colno += 8;
    188 	} else {
    189 		loc->colno++;
    190 	}
    191 }
    192 
    193 static uint32_t
    194 next(struct lexer *lexer, struct location *loc, bool buffer)
    195 {
    196 	uint32_t c;
    197 	if (lexer->c[0] != UINT32_MAX) {
    198 		c = lexer->c[0];
    199 		lexer->c[0] = lexer->c[1];
    200 		lexer->c[1] = UINT32_MAX;
    201 	} else {
    202 		c = utf8_get(lexer->in);
    203 		update_lineno(&lexer->loc, c);
    204 		if (c == UTF8_INVALID && !feof(lexer->in)) {
    205 			error(&lexer->loc, "Invalid UTF-8 sequence encountered");
    206 		}
    207 	}
    208 	if (loc != NULL) {
    209 		*loc = lexer->loc;
    210 		for (size_t i = 0; i < 2 && lexer->c[i] != UINT32_MAX; i++) {
    211 			update_lineno(&lexer->loc, lexer->c[i]);
    212 		}
    213 	}
    214 	if (c == C_EOF || !buffer) {
    215 		return c;
    216 	}
    217 	if (lexer->buflen + utf8_cpsize(c) >= lexer->bufsz) {
    218 		lexer->bufsz *= 2;
    219 		lexer->buf = xrealloc(lexer->buf, lexer->bufsz);
    220 	}
    221 	char buf[UTF8_MAX_SIZE];
    222 	size_t sz = utf8_encode(&buf[0], c);
    223 	memcpy(lexer->buf + lexer->buflen, buf, sz);
    224 	lexer->buflen += sz;
    225 	lexer->buf[lexer->buflen] = '\0';
    226 	return c;
    227 }
    228 
    229 static bool
    230 isharespace(uint32_t c)
    231 {
    232 	return c == '\t' || c == '\n' || c == ' ';
    233 }
    234 
    235 static uint32_t
    236 wgetc(struct lexer *lexer, struct location *loc)
    237 {
    238 	uint32_t c;
    239 	while ((c = next(lexer, loc, false)) != C_EOF && isharespace(c)) ;
    240 	return c;
    241 }
    242 
    243 static void
    244 consume(struct lexer *lexer, ssize_t n)
    245 {
    246 	if (n == -1) {
    247 		lexer->buflen = 0;
    248 		lexer->buf[0] = 0;
    249 		return;
    250 	}
    251 	for (ssize_t i = 0; i < n; i++) {
    252 		while ((lexer->buf[--lexer->buflen] & 0xC0) == 0x80) ;
    253 	}
    254 	lexer->buf[lexer->buflen] = 0;
    255 }
    256 
    257 static void
    258 push(struct lexer *lexer, uint32_t c, bool buffer)
    259 {
    260 	assert(lexer->c[1] == UINT32_MAX);
    261 	lexer->c[1] = lexer->c[0];
    262 	lexer->c[0] = c;
    263 	if (buffer) {
    264 		consume(lexer, 1);
    265 	}
    266 }
    267 
    268 static int
    269 cmp_keyword(const void *va, const void *vb)
    270 {
    271 	return strcmp(*(const char **)va, *(const char **)vb);
    272 }
    273 
    274 static uint32_t
    275 lex_name(struct lexer *lexer, struct token *out)
    276 {
    277 	uint32_t c = next(lexer, &out->loc, true);
    278 	assert(c != C_EOF && c <= 0x7F && (isalpha(c) || c == '_' || c == '@'));
    279 	while ((c = next(lexer, NULL, true)) != C_EOF) {
    280 		if (c > 0x7F || (!isalnum(c) && c != '_')) {
    281 			push(lexer, c, true);
    282 			break;
    283 		}
    284 	}
    285 
    286 	void *token = bsearch(&lexer->buf, tokens, T_LAST_KEYWORD + 1,
    287 			sizeof(tokens[0]), cmp_keyword);
    288 	if (!token) {
    289 		if (lexer->buf[0] == '@') {
    290 			error(&out->loc, "Unknown attribute %s", lexer->buf);
    291 		}
    292 		out->token = T_NAME;
    293 		out->name = xstrdup(lexer->buf);
    294 	} else {
    295 		out->token = (const char **)token - tokens;
    296 	}
    297 	consume(lexer, -1);
    298 	return out->token;
    299 }
    300 
    301 static uintmax_t
    302 compute_exp(uintmax_t n, int exponent, bool _signed, struct location *loc)
    303 {
    304 	if (n == 0) {
    305 		return 0;
    306 	}
    307 	for (int i = 0; i < exponent; i++) {
    308 		uintmax_t old = n;
    309 		n *= 10;
    310 		if (n / 10 != old) {
    311 			error(loc, "Integer literal overflow");
    312 		}
    313 	}
    314 	if (_signed && n > (uintmax_t)INT64_MIN) {
    315 		error(loc, "Integer literal overflow");
    316 	}
    317 	return n;
    318 }
    319 
    320 static uint32_t
    321 lex_literal(struct lexer *lexer, struct token *out)
    322 {
    323 	uint32_t c = next(lexer, &out->loc, true);
    324 	assert(c != C_EOF && c <= 0x7F && isdigit(c));
    325 
    326 	bool started = false, leadingzero = false;
    327 	int base = 10;
    328 	const char *basechrs = "0123456789";
    329 	if (c == '0') {
    330 		switch ((c = next(lexer, NULL, true))) {
    331 		case 'b':
    332 			base = 2;
    333 			basechrs = "01";
    334 			consume(lexer, 2);
    335 			break;
    336 		case 'o':
    337 			base = 8;
    338 			basechrs = "01234567";
    339 			consume(lexer, 2);
    340 			break;
    341 		case 'x':
    342 			base = 16;
    343 			basechrs = "0123456789ABCDEFabcdef";
    344 			consume(lexer, 2);
    345 			break;
    346 		default:
    347 			started = true;
    348 			leadingzero = true;
    349 			push(lexer, c, true);
    350 			break;
    351 		}
    352 	} else {
    353 		started = true;
    354 	}
    355 
    356 	char *suff = NULL;
    357 	char *exp = NULL;
    358 	bool isfloat = false;
    359 	while ((c = next(lexer, NULL, true)) != C_EOF) {
    360 		if (!strchr(basechrs, c)) {
    361 			switch (c) {
    362 			case '.':
    363 				if (!started) {
    364 					push(lexer, c, true);
    365 					goto finalize;
    366 				}
    367 				if (lexer->require_int) {
    368 					push(lexer, '.', true);
    369 					goto finalize;
    370 				}
    371 				if (isfloat || suff || exp) {
    372 					push(lexer, c, true);
    373 					goto finalize;
    374 				}
    375 				if (!strchr(basechrs, c = next(lexer, NULL, false))) {
    376 					push(lexer, c, false);
    377 					push(lexer, '.', true);
    378 					goto finalize;
    379 				} else {
    380 					push(lexer, c, false);
    381 				}
    382 				isfloat = true;
    383 				break;
    384 			case 'e':
    385 				if (!started) {
    386 					push(lexer, c, true);
    387 					goto finalize;
    388 				}
    389 				if (exp || suff) {
    390 					push(lexer, c, true);
    391 					goto finalize;
    392 				}
    393 				// exponent is always in base 10
    394 				basechrs = "0123456789";
    395 				c = next(lexer, NULL, true);
    396 				if (c != '-' && c != '+' && !strchr(basechrs, c)) {
    397 					push(lexer, c, true);
    398 					push(lexer, 'e', true);
    399 					goto finalize;
    400 				};
    401 				exp = &lexer->buf[lexer->buflen - 1];
    402 				break;
    403 			case 'f':
    404 				if (base != 10) {
    405 					push(lexer, c, true);
    406 					goto finalize;
    407 				}
    408 				// Fallthrough
    409 			case 'i':
    410 			case 'u':
    411 			case 'z':
    412 				if (suff || !started) {
    413 					push(lexer, c, true);
    414 					goto finalize;
    415 				}
    416 				suff = &lexer->buf[lexer->buflen - 1];
    417 				basechrs = "0123456789";
    418 				break;
    419 			default:
    420 				push(lexer, c, true);
    421 				goto finalize;
    422 			}
    423 		}
    424 		started = true;
    425 	}
    426 
    427 finalize:
    428 	if (!started) {
    429 		error(&out->loc, "Invalid literal");
    430 	}
    431 	if (leadingzero && lexer->buflen >= 2 && strchr(basechrs, lexer->buf[1])) {
    432 		error(&out->loc, "Leading zero in base 10 literal");
    433 	}
    434 	lexer->require_int = false;
    435 	out->token = T_LITERAL;
    436 	if (isfloat) {
    437 		out->storage = STORAGE_FCONST;
    438 	} else {
    439 		out->storage = STORAGE_ICONST;
    440 	}
    441 	if (suff) {
    442 		const char *suffs[] = {
    443 			[STORAGE_U8] = "u8",
    444 			[STORAGE_U16] = "u16",
    445 			[STORAGE_U32] = "u32",
    446 			[STORAGE_U64] = "u64",
    447 			[STORAGE_I8] = "i8",
    448 			[STORAGE_I16] = "i16",
    449 			[STORAGE_I32] = "i32",
    450 			[STORAGE_I64] = "i64",
    451 
    452 			[STORAGE_UINT] = "u",
    453 			[STORAGE_INT] = "i",
    454 			[STORAGE_SIZE] = "z",
    455 			[STORAGE_F32] = "f32",
    456 			[STORAGE_F64] = "f64",
    457 		};
    458 		bool isvalid = false;
    459 		for (enum type_storage i = 0;
    460 				i < sizeof(suffs) / sizeof(suffs[0]); ++i) {
    461 			if (suffs[i] && strcmp(suff, suffs[i]) == 0) {
    462 				isvalid = true;
    463 				out->storage = i;
    464 				break;
    465 			}
    466 		}
    467 		if (!isvalid) {
    468 			error(&out->loc, "Invalid numeric suffix");
    469 		}
    470 	}
    471 
    472 	intmax_t exponent = 0;
    473 	if (exp) {
    474 		char *endptr = NULL;
    475 		errno = 0;
    476 		exponent = strtoimax(exp, &endptr, 10);
    477 		if (errno == ERANGE) {
    478 			error(&out->loc, "Numerical exponent overflow");
    479 		}
    480 		// integers can't have negative exponents
    481 		if (exponent < 0 && !suff) {
    482 			out->storage = STORAGE_FCONST;
    483 		}
    484 		enum type_storage s = out->storage;
    485 		bool valid = exponent >= 0
    486 			|| s == STORAGE_F32
    487 			|| s == STORAGE_F64
    488 			|| s == STORAGE_FCONST;
    489 		if (endptr == exp || !valid) {
    490 			error(&out->loc, "Integers cannot have negative exponents");
    491 		}
    492 	}
    493 
    494 	if (isfloat) {
    495 		switch (out->storage) {
    496 		case STORAGE_F32:
    497 		case STORAGE_F64:
    498 		case STORAGE_FCONST:
    499 			break;
    500 		default:
    501 			error(&out->loc, "Unexpected decimal point in integer literal");
    502 		}
    503 	}
    504 
    505 	errno = 0;
    506 	switch (out->storage) {
    507 	case STORAGE_U8:
    508 	case STORAGE_U16:
    509 	case STORAGE_U32:
    510 	case STORAGE_UINT:
    511 	case STORAGE_U64:
    512 	case STORAGE_SIZE:
    513 		out->uval = compute_exp(strtoumax(lexer->buf, NULL, base),
    514 				exponent, false, &out->loc);
    515 		break;
    516 	case STORAGE_ICONST:
    517 		out->uval = compute_exp(strtoumax(lexer->buf, NULL, base),
    518 				exponent, false, &out->loc);
    519 		if (out->uval > (uintmax_t)INT64_MAX) {
    520 			out->storage = STORAGE_U64;
    521 			break;
    522 		}
    523 		// Fallthrough
    524 	case STORAGE_I8:
    525 	case STORAGE_I16:
    526 	case STORAGE_I32:
    527 	case STORAGE_INT:
    528 	case STORAGE_I64:
    529 		out->uval = compute_exp(strtoumax(lexer->buf, NULL, base),
    530 				exponent, true, &out->loc);
    531 		if (out->uval == (uintmax_t)INT64_MIN) {
    532 			// XXX: Hack
    533 			out->ival = INT64_MIN;
    534 		} else {
    535 			out->ival = (intmax_t)out->uval;
    536 		}
    537 		break;
    538 	case STORAGE_F32:
    539 	case STORAGE_F64:
    540 	case STORAGE_FCONST:
    541 		out->fval = strtod(lexer->buf, NULL);
    542 		break;
    543 	default:
    544 		assert(0);
    545 	}
    546 	if (errno == ERANGE && !isfloat) {
    547 		error(&out->loc, "Integer literal overflow");
    548 	}
    549 	consume(lexer, -1);
    550 	return out->token;
    551 }
    552 
    553 static uint32_t
    554 lex_rune(struct lexer *lexer)
    555 {
    556 	char buf[9];
    557 	char *endptr;
    558 	uint32_t c = next(lexer, NULL, false);
    559 	assert(c != C_EOF);
    560 
    561 	switch (c) {
    562 	case '\\':
    563 		c = next(lexer, NULL, false);
    564 		switch (c) {
    565 		case '0':
    566 			return '\0';
    567 		case 'a':
    568 			return '\a';
    569 		case 'b':
    570 			return '\b';
    571 		case 'f':
    572 			return '\f';
    573 		case 'n':
    574 			return '\n';
    575 		case 'r':
    576 			return '\r';
    577 		case 't':
    578 			return '\t';
    579 		case 'v':
    580 			return '\v';
    581 		case '\\':
    582 			return '\\';
    583 		case '\'':
    584 			return '\'';
    585 		case '"':
    586 			return '\"';
    587 		case 'x':
    588 			buf[0] = next(lexer, NULL, false);
    589 			buf[1] = next(lexer, NULL, false);
    590 			buf[2] = '\0';
    591 			c = strtoul(&buf[0], &endptr, 16);
    592 			if (*endptr != '\0') {
    593 				error(&lexer->loc, "Invalid hex literal");
    594 			}
    595 			return c;
    596 		case 'u':
    597 			buf[0] = next(lexer, NULL, false);
    598 			buf[1] = next(lexer, NULL, false);
    599 			buf[2] = next(lexer, NULL, false);
    600 			buf[3] = next(lexer, NULL, false);
    601 			buf[4] = '\0';
    602 			c = strtoul(&buf[0], &endptr, 16);
    603 			if (*endptr != '\0') {
    604 				error(&lexer->loc, "Invalid hex literal");
    605 			}
    606 			return c;
    607 		case 'U':
    608 			buf[0] = next(lexer, NULL, false);
    609 			buf[1] = next(lexer, NULL, false);
    610 			buf[2] = next(lexer, NULL, false);
    611 			buf[3] = next(lexer, NULL, false);
    612 			buf[4] = next(lexer, NULL, false);
    613 			buf[5] = next(lexer, NULL, false);
    614 			buf[6] = next(lexer, NULL, false);
    615 			buf[7] = next(lexer, NULL, false);
    616 			buf[8] = '\0';
    617 			c = strtoul(&buf[0], &endptr, 16);
    618 			if (*endptr != '\0') {
    619 				error(&lexer->loc, "Invalid hex literal");
    620 			}
    621 			return c;
    622 		case C_EOF:
    623 			error(&lexer->loc, "Unexpected end of file");
    624 		default:
    625 			error(&lexer->loc, "Invalid escape '\\%c'", c);
    626 		}
    627 		assert(0);
    628 	default:
    629 		return c;
    630 	}
    631 	assert(0);
    632 }
    633 
    634 static enum lexical_token
    635 lex_string(struct lexer *lexer, struct token *out)
    636 {
    637 	uint32_t c = next(lexer, &out->loc, false);
    638 	uint32_t delim;
    639 
    640 	switch (c) {
    641 	case '"':
    642 	case '`':
    643 		delim = c;
    644 		while ((c = next(lexer, NULL, false)) != delim) {
    645 			if (c == C_EOF) {
    646 				error(&lexer->loc, "Unexpected end of file");
    647 			}
    648 			push(lexer, c, false);
    649 			if (delim == '"') {
    650 				push(lexer, lex_rune(lexer), false);
    651 			}
    652 			next(lexer, NULL, true);
    653 		}
    654 		char *buf = xcalloc(lexer->buflen + 1, 1);
    655 		memcpy(buf, lexer->buf, lexer->buflen);
    656 		out->token = T_LITERAL;
    657 		out->storage = STORAGE_STRING;
    658 		out->string.len = lexer->buflen;
    659 		out->string.value = buf;
    660 		consume(lexer, -1);
    661 		return out->token;
    662 	case '\'':
    663 		c = next(lexer, NULL, false);
    664 		switch (c) {
    665 		case '\'':
    666 			error(&out->loc, "Expected rune before trailing single quote");
    667 		case '\\':
    668 			push(lexer, c, false);
    669 			out->rune = lex_rune(lexer);
    670 			break;
    671 		default:
    672 			out->rune = c;
    673 		}
    674 		if (next(lexer, NULL, false) != '\'') {
    675 			error(&out->loc, "Expected trailing single quote");
    676 		}
    677 		out->token = T_LITERAL;
    678 		out->storage = STORAGE_RCONST;
    679 		return out->token;
    680 	default:
    681 		assert(0); // Invariant
    682 	}
    683 	assert(0);
    684 }
    685 
    686 static enum lexical_token
    687 lex3(struct lexer *lexer, struct token *out, uint32_t c)
    688 {
    689 	assert(c != C_EOF);
    690 
    691 	switch (c) {
    692 	case '.':
    693 		switch ((c = next(lexer, NULL, false))) {
    694 		case '.':
    695 			switch ((c = next(lexer, NULL, false))) {
    696 			case '.':
    697 				out->token = T_ELLIPSIS;
    698 				break;
    699 			default:
    700 				push(lexer, c, false);
    701 				out->token = T_SLICE;
    702 				break;
    703 			}
    704 			break;
    705 		default:
    706 			push(lexer, c, false);
    707 			out->token = T_DOT;
    708 			lexer->require_int = true;
    709 			break;
    710 		}
    711 		break;
    712 	case '<':
    713 		switch ((c = next(lexer, NULL, false))) {
    714 		case '<':
    715 			switch ((c = next(lexer, NULL, false))) {
    716 			case '=':
    717 				out->token = T_LSHIFTEQ;
    718 				break;
    719 			default:
    720 				push(lexer, c, false);
    721 				out->token = T_LSHIFT;
    722 				break;
    723 			}
    724 			break;
    725 		case '=':
    726 			out->token = T_LESSEQ;
    727 			break;
    728 		default:
    729 			push(lexer, c, false);
    730 			out->token = T_LESS;
    731 			break;
    732 		}
    733 		break;
    734 	case '>':
    735 		switch ((c = next(lexer, NULL, false))) {
    736 		case '>':
    737 			switch ((c = next(lexer, NULL, false))) {
    738 			case '=':
    739 				out->token = T_RSHIFTEQ;
    740 				break;
    741 			default:
    742 				push(lexer, c, false);
    743 				out->token = T_RSHIFT;
    744 				break;
    745 			}
    746 			break;
    747 		case '=':
    748 			out->token = T_GREATEREQ;
    749 			break;
    750 		default:
    751 			push(lexer, c, false);
    752 			out->token = T_GREATER;
    753 			break;
    754 		}
    755 		break;
    756 	case '&':
    757 		switch ((c = next(lexer, NULL, false))) {
    758 		case '&':
    759 			switch ((c = next(lexer, NULL, false))) {
    760 			case '=':
    761 				out->token = T_LANDEQ;
    762 				break;
    763 			default:
    764 				push(lexer, c, false);
    765 				out->token = T_LAND;
    766 				break;
    767 			}
    768 			break;
    769 		case '=':
    770 			out->token = T_BANDEQ;
    771 			break;
    772 		default:
    773 			push(lexer, c, false);
    774 			out->token = T_BAND;
    775 			break;
    776 		}
    777 		break;
    778 	case '|':
    779 		switch ((c = next(lexer, NULL, false))) {
    780 		case '|':
    781 			switch ((c = next(lexer, NULL, false))) {
    782 			case '=':
    783 				out->token = T_LOREQ;
    784 				break;
    785 			default:
    786 				push(lexer, c, false);
    787 				out->token = T_LOR;
    788 				break;
    789 			}
    790 			break;
    791 		case '=':
    792 			out->token = T_BOREQ;
    793 			break;
    794 		default:
    795 			push(lexer, c, false);
    796 			out->token = T_BOR;
    797 			break;
    798 		}
    799 		break;
    800 	case '^':
    801 		switch ((c = next(lexer, NULL, false))) {
    802 		case '^':
    803 			switch ((c = next(lexer, NULL, false))) {
    804 			case '=':
    805 				out->token = T_LXOREQ;
    806 				break;
    807 			default:
    808 				push(lexer, c, false);
    809 				out->token = T_LXOR;
    810 				break;
    811 			}
    812 			break;
    813 		case '=':
    814 			out->token = T_BXOREQ;
    815 			break;
    816 		default:
    817 			push(lexer, c, false);
    818 			out->token = T_BXOR;
    819 			break;
    820 		}
    821 		break;
    822 	default:
    823 		assert(0); // Invariant
    824 	}
    825 
    826 	return out->token;
    827 }
    828 
    829 static enum lexical_token _lex(struct lexer *lexer, struct token *out);
    830 
    831 static enum lexical_token
    832 lex2(struct lexer *lexer, struct token *out, uint32_t c)
    833 {
    834 	assert(c != C_EOF);
    835 
    836 	switch (c) {
    837 	case '*':
    838 		switch ((c = next(lexer, NULL, false))) {
    839 		case '=':
    840 			out->token = T_TIMESEQ;
    841 			break;
    842 		default:
    843 			push(lexer, c, false);
    844 			out->token = T_TIMES;
    845 			break;
    846 		}
    847 		break;
    848 	case '%':
    849 		switch ((c = next(lexer, NULL, false))) {
    850 		case '=':
    851 			out->token = T_MODEQ;
    852 			break;
    853 		default:
    854 			push(lexer, c, false);
    855 			out->token = T_MODULO;
    856 			break;
    857 		}
    858 		break;
    859 	case '/':
    860 		switch ((c = next(lexer, NULL, false))) {
    861 		case '=':
    862 			out->token = T_DIVEQ;
    863 			break;
    864 		case '/':
    865 			while ((c = next(lexer, NULL, false)) != C_EOF && c != '\n') ;
    866 			return _lex(lexer, out);
    867 		default:
    868 			push(lexer, c, false);
    869 			out->token = T_DIV;
    870 			break;
    871 		}
    872 		break;
    873 	case '+':
    874 		switch ((c = next(lexer, NULL, false))) {
    875 		case '=':
    876 			out->token = T_PLUSEQ;
    877 			break;
    878 		default:
    879 			push(lexer, c, false);
    880 			out->token = T_PLUS;
    881 			break;
    882 		}
    883 		break;
    884 	case '-':
    885 		switch ((c = next(lexer, NULL, false))) {
    886 		case '=':
    887 			out->token = T_MINUSEQ;
    888 			break;
    889 		default:
    890 			push(lexer, c, false);
    891 			out->token = T_MINUS;
    892 			break;
    893 		}
    894 		break;
    895 	case ':':
    896 		switch ((c = next(lexer, NULL, false))) {
    897 		case ':':
    898 			out->token = T_DOUBLE_COLON;
    899 			break;
    900 		default:
    901 			push(lexer, c, false);
    902 			out->token = T_COLON;
    903 			break;
    904 		}
    905 		break;
    906 	case '!':
    907 		switch ((c = next(lexer, NULL, false))) {
    908 		case '=':
    909 			out->token = T_NEQUAL;
    910 			break;
    911 		default:
    912 			push(lexer, c, false);
    913 			out->token = T_LNOT;
    914 			break;
    915 		}
    916 		break;
    917 	case '=':
    918 		switch ((c = next(lexer, NULL, false))) {
    919 		case '=':
    920 			out->token = T_LEQUAL;
    921 			break;
    922 		case '>':
    923 			out->token = T_ARROW;
    924 			break;
    925 		default:
    926 			push(lexer, c, false);
    927 			out->token = T_EQUAL;
    928 			break;
    929 		}
    930 		break;
    931 	default:
    932 		assert(0); // Invariant
    933 	}
    934 
    935 	return out->token;
    936 }
    937 
    938 static enum lexical_token
    939 _lex(struct lexer *lexer, struct token *out)
    940 {
    941 	if (lexer->un.token != T_ERROR) {
    942 		*out = lexer->un;
    943 		lexer->un.token = T_ERROR;
    944 		return out->token;
    945 	}
    946 
    947 	uint32_t c = wgetc(lexer, &out->loc);
    948 	if (c == C_EOF) {
    949 		out->token = T_EOF;
    950 		return out->token;
    951 	}
    952 
    953 	if (c <= 0x7F && isdigit(c)) {
    954 		push(lexer, c, false);
    955 		return lex_literal(lexer, out);
    956 	}
    957 
    958 	lexer->require_int = false;
    959 
    960 	if (c <= 0x7F && (isalpha(c) || c == '_' || c == '@')) {
    961 		push(lexer, c, false);
    962 		return lex_name(lexer, out);
    963 	}
    964 
    965 	char p[5];
    966 	switch (c) {
    967 	case '"':
    968 	case '`':
    969 	case '\'':
    970 		push(lexer, c, false);
    971 		return lex_string(lexer, out);
    972 	case '.': // . .. ...
    973 	case '<': // < << <= <<=
    974 	case '>': // > >> >= >>=
    975 	case '&': // & && &= &&=
    976 	case '|': // | || |= ||=
    977 	case '^': // ^ ^^ ^= ^^=
    978 		return lex3(lexer, out, c);
    979 	case '*': // * *=
    980 	case '%': // % %=
    981 	case '/': // / /= //
    982 	case '+': // + +=
    983 	case '-': // - -=
    984 	case ':': // : ::
    985 	case '!': // ! !=
    986 	case '=': // = == =>
    987 		return lex2(lexer, out, c);
    988 	case '~':
    989 		out->token = T_BNOT;
    990 		break;
    991 	case ',':
    992 		out->token = T_COMMA;
    993 		break;
    994 	case '{':
    995 		out->token = T_LBRACE;
    996 		break;
    997 	case '[':
    998 		out->token = T_LBRACKET;
    999 		break;
   1000 	case '(':
   1001 		out->token = T_LPAREN;
   1002 		break;
   1003 	case '}':
   1004 		out->token = T_RBRACE;
   1005 		break;
   1006 	case ']':
   1007 		out->token = T_RBRACKET;
   1008 		break;
   1009 	case ')':
   1010 		out->token = T_RPAREN;
   1011 		break;
   1012 	case ';':
   1013 		out->token = T_SEMICOLON;
   1014 		break;
   1015 	case '?':
   1016 		out->token = T_QUESTION;
   1017 		break;
   1018 	default:
   1019 		p[utf8_encode(p, c)] = '\0';
   1020 		fprintf(stderr, "Error: unexpected code point '%s' at %s:%d:%d\n",
   1021 			p, sources[lexer->loc.file], lexer->loc.lineno,
   1022 			lexer->loc.colno);
   1023 		exit(EXIT_FAILURE);
   1024 	}
   1025 
   1026 	return out->token;
   1027 }
   1028 
   1029 enum lexical_token
   1030 lex(struct lexer *lexer, struct token *out)
   1031 {
   1032 	return _lex(lexer, out);
   1033 }
   1034 
   1035 void
   1036 token_finish(struct token *tok)
   1037 {
   1038 	switch (tok->token) {
   1039 	case T_NAME:
   1040 		free(tok->name);
   1041 		break;
   1042 	case T_LITERAL:
   1043 		switch (tok->storage) {
   1044 		case STORAGE_STRING:
   1045 			free(tok->string.value);
   1046 			break;
   1047 		default:
   1048 			break;
   1049 		}
   1050 		break;
   1051 	default:
   1052 		break;
   1053 	}
   1054 	tok->token = 0;
   1055 	tok->storage = 0;
   1056 	tok->loc.file = 0;
   1057 	tok->loc.colno = 0;
   1058 	tok->loc.lineno = 0;
   1059 }
   1060 
   1061 const char *
   1062 lexical_token_str(enum lexical_token tok)
   1063 {
   1064 	switch (tok) {
   1065 	case T_NAME:
   1066 		return "name";
   1067 	case T_LITERAL:
   1068 		return "literal";
   1069 	case T_EOF:
   1070 		return "end of file";
   1071 	case T_ERROR:
   1072 		return "error";
   1073 	default:
   1074 		assert(tok < sizeof(tokens) / sizeof(tokens[0]));
   1075 		return tokens[tok];
   1076 	}
   1077 }
   1078 
   1079 static const char *
   1080 rune_unparse(uint32_t c)
   1081 {
   1082 	static char buf[11];
   1083 	switch (c) {
   1084 	case '\0':
   1085 		snprintf(buf, sizeof(buf), "\\0");
   1086 		break;
   1087 	case '\a':
   1088 		snprintf(buf, sizeof(buf), "\\a");
   1089 		break;
   1090 	case '\b':
   1091 		snprintf(buf, sizeof(buf), "\\b");
   1092 		break;
   1093 	case '\f':
   1094 		snprintf(buf, sizeof(buf), "\\f");
   1095 		break;
   1096 	case '\n':
   1097 		snprintf(buf, sizeof(buf), "\\n");
   1098 		break;
   1099 	case '\r':
   1100 		snprintf(buf, sizeof(buf), "\\r");
   1101 		break;
   1102 	case '\t':
   1103 		snprintf(buf, sizeof(buf), "\\t");
   1104 		break;
   1105 	case '\v':
   1106 		snprintf(buf, sizeof(buf), "\\v");
   1107 		break;
   1108 	case '\\':
   1109 		snprintf(buf, sizeof(buf), "\\\\");
   1110 		break;
   1111 	case '\'':
   1112 		snprintf(buf, sizeof(buf), "\\'");
   1113 		break;
   1114 	case '"':
   1115 		snprintf(buf, sizeof(buf), "\\\"");
   1116 		break;
   1117 	default:
   1118 		if (c > 0xffff) {
   1119 			snprintf(buf, sizeof(buf), "\\U%08x", c);
   1120 		} else if (c > 0x7F) {
   1121 			snprintf(buf, sizeof(buf), "\\u%04x", c);
   1122 		} else if (!isprint(c)) {
   1123 			snprintf(buf, sizeof(buf), "\\x%02x", c);
   1124 		} else {
   1125 			assert(utf8_cpsize(c) < sizeof(buf));
   1126 			buf[utf8_encode(buf, c)] = '\0';
   1127 		}
   1128 		break;
   1129 	}
   1130 	return buf;
   1131 }
   1132 
   1133 static const char *
   1134 string_unparse(const struct token *tok)
   1135 {
   1136 	static char buf[1024];
   1137 	assert(tok->token == T_LITERAL && tok->storage == STORAGE_STRING);
   1138 	int bytes = 0;
   1139 	memset(buf, 0, sizeof(buf));
   1140 	bytes += snprintf(&buf[bytes], sizeof(buf) - bytes, "\"");
   1141 	const char *s = tok->string.value;
   1142 	for (uint32_t c = utf8_decode(&s);
   1143 			s - tok->string.value <= (ptrdiff_t)tok->string.len;
   1144 			c = utf8_decode(&s)) {
   1145 		bytes += snprintf(&buf[bytes], sizeof(buf) - bytes, "%s",
   1146 			rune_unparse(c));
   1147 	}
   1148 	bytes += snprintf(&buf[bytes], sizeof(buf) - bytes, "\"");
   1149 	return buf;
   1150 }
   1151 
   1152 const char *
   1153 token_str(const struct token *tok)
   1154 {
   1155 	static char buf[1024];
   1156 	int bytes = 0;
   1157 	switch (tok->token) {
   1158 	case T_NAME:
   1159 		snprintf(buf, sizeof(buf), "name %s", tok->name);
   1160 		return buf;
   1161 	case T_LITERAL:
   1162 		switch (tok->storage) {
   1163 		case STORAGE_U8:
   1164 		case STORAGE_U16:
   1165 		case STORAGE_U32:
   1166 		case STORAGE_U64:
   1167 		case STORAGE_UINT:
   1168 		case STORAGE_UINTPTR:
   1169 		case STORAGE_SIZE:
   1170 			snprintf(buf, sizeof(buf), "%ju", tok->uval);
   1171 			break;
   1172 		case STORAGE_I8:
   1173 		case STORAGE_I16:
   1174 		case STORAGE_I32:
   1175 		case STORAGE_I64:
   1176 		case STORAGE_ICONST:
   1177 		case STORAGE_INT:
   1178 			snprintf(buf, sizeof(buf), "%jd", tok->ival);
   1179 			break;
   1180 		case STORAGE_F32:
   1181 		case STORAGE_F64:
   1182 		case STORAGE_FCONST:
   1183 			snprintf(buf, sizeof(buf), "%lf", tok->fval);
   1184 			break;
   1185 		case STORAGE_RCONST:
   1186 			bytes += snprintf(&buf[bytes], sizeof(buf) - bytes, "'");
   1187 			bytes += snprintf(&buf[bytes], sizeof(buf) - bytes, "%s",
   1188 				rune_unparse(tok->rune));
   1189 			bytes += snprintf(&buf[bytes], sizeof(buf) - bytes, "'");
   1190 			break;
   1191 		case STORAGE_STRING:
   1192 			return string_unparse(tok);
   1193 		case STORAGE_ALIAS:
   1194 		case STORAGE_ARRAY:
   1195 		case STORAGE_BOOL:
   1196 		case STORAGE_CHAR:
   1197 		case STORAGE_ENUM:
   1198 		case STORAGE_ERROR:
   1199 		case STORAGE_FUNCTION:
   1200 		case STORAGE_POINTER:
   1201 		case STORAGE_NULL:
   1202 		case STORAGE_RUNE:
   1203 		case STORAGE_SLICE:
   1204 		case STORAGE_STRUCT:
   1205 		case STORAGE_TAGGED:
   1206 		case STORAGE_TUPLE:
   1207 		case STORAGE_UNION:
   1208 		case STORAGE_VALIST:
   1209 		case STORAGE_VOID:
   1210 			assert(0);
   1211 		}
   1212 		return buf;
   1213 	default:;
   1214 		const char *out = lexical_token_str(tok->token);
   1215 		return out;
   1216 	}
   1217 }
   1218 
   1219 void
   1220 unlex(struct lexer *lexer, struct token *in)
   1221 {
   1222 	assert(lexer->un.token == T_ERROR && "Only one unlex is supported");
   1223 	lexer->un = *in;
   1224 }