harec

[hare] Hare compiler, written in C11 for POSIX OSs
Log | Files | Refs | README | LICENSE

lex.c (23669B)


      1 #include <assert.h>
      2 #include <ctype.h>
      3 #include <errno.h>
      4 #include <inttypes.h>
      5 #include <stdbool.h>
      6 #include <stddef.h>
      7 #include <stdint.h>
      8 #include <stdio.h>
      9 #include <stdlib.h>
     10 #include <string.h>
     11 #include "lex.h"
     12 #include "utf8.h"
     13 #include "util.h"
     14 
     15 static const char *tokens[] = {
     16 	// Must be alpha sorted and match lex.h
     17 	[T_ATTR_FINI] = "@fini",
     18 	[T_ATTR_INIT] = "@init",
     19 	[T_ATTR_NORETURN] = "@noreturn",
     20 	[T_ATTR_OFFSET] = "@offset",
     21 	[T_ATTR_SYMBOL] = "@symbol",
     22 	[T_ATTR_TEST] = "@test",
     23 	[T_UNDERSCORE] = "_",
     24 	[T_ABORT] = "abort",
     25 	[T_ALLOC] = "alloc",
     26 	[T_APPEND] = "append",
     27 	[T_AS] = "as",
     28 	[T_ASSERT] = "assert",
     29 	[T_BOOL] = "bool",
     30 	[T_BREAK] = "break",
     31 	[T_CASE] = "case",
     32 	[T_CHAR] = "char",
     33 	[T_CONST] = "const",
     34 	[T_CONTINUE] = "continue",
     35 	[T_DEFER] = "defer",
     36 	[T_DEF] = "def",
     37 	[T_DELETE] = "delete",
     38 	[T_ELSE] = "else",
     39 	[T_ENUM] = "enum",
     40 	[T_EXPORT] = "export",
     41 	[T_F32] = "f32",
     42 	[T_F64] = "f64",
     43 	[T_FALSE] = "false",
     44 	[T_FN] = "fn",
     45 	[T_FOR] = "for",
     46 	[T_FREE] = "free",
     47 	[T_I16] = "i16",
     48 	[T_I32] = "i32",
     49 	[T_I64] = "i64",
     50 	[T_I8] = "i8",
     51 	[T_IF] = "if",
     52 	[T_INSERT] = "insert",
     53 	[T_INT] = "int",
     54 	[T_IS] = "is",
     55 	[T_LEN] = "len",
     56 	[T_LET] = "let",
     57 	[T_MATCH] = "match",
     58 	[T_NULL] = "null",
     59 	[T_NULLABLE] = "nullable",
     60 	[T_OFFSET] = "offset",
     61 	[T_RETURN] = "return",
     62 	[T_RUNE] = "rune",
     63 	[T_SIZE] = "size",
     64 	[T_STATIC] = "static",
     65 	[T_STR] = "str",
     66 	[T_STRUCT] = "struct",
     67 	[T_SWITCH] = "switch",
     68 	[T_TRUE] = "true",
     69 	[T_TYPE] = "type",
     70 	[T_U16] = "u16",
     71 	[T_U32] = "u32",
     72 	[T_U64] = "u64",
     73 	[T_U8] = "u8",
     74 	[T_UINT] = "uint",
     75 	[T_UINTPTR] = "uintptr",
     76 	[T_UNION] = "union",
     77 	[T_USE] = "use",
     78 	[T_VAARG] = "vaarg",
     79 	[T_VAEND] = "vaend",
     80 	[T_VALIST] = "valist",
     81 	[T_VASTART] = "vastart",
     82 	[T_VOID] = "void",
     83 	[T_YIELD] = "yield",
     84 
     85 	// Operators
     86 	[T_ARROW] = "=>",
     87 	[T_BANDEQ] = "&=",
     88 	[T_BAND] = "&",
     89 	[T_BNOT] = "~",
     90 	[T_BOR] = "|",
     91 	[T_COLON] = ":",
     92 	[T_COMMA] = ",",
     93 	[T_DIV] = "/",
     94 	[T_DIVEQ] = "/=",
     95 	[T_DOT] = ".",
     96 	[T_DOUBLE_COLON] = "::",
     97 	[T_ELLIPSIS] = "...",
     98 	[T_EQUAL] = "=",
     99 	[T_GREATER] = ">",
    100 	[T_GREATEREQ] = ">=",
    101 	[T_LAND] = "&&",
    102 	[T_LANDEQ] = "&&=",
    103 	[T_LBRACE] = "{",
    104 	[T_LBRACKET] = "[",
    105 	[T_LEQUAL] = "==",
    106 	[T_LESS] = "<",
    107 	[T_LESSEQ] = "<=",
    108 	[T_LNOT] = "!",
    109 	[T_LOR] = "||",
    110 	[T_LOREQ] = "||=",
    111 	[T_LPAREN] = "(",
    112 	[T_LSHIFT] = "<<",
    113 	[T_LSHIFTEQ] = "<<=",
    114 	[T_LXOR] = "^^",
    115 	[T_LXOREQ] = "^^=",
    116 	[T_MINUS] = "-",
    117 	[T_MINUSEQ] = "-=",
    118 	[T_MODEQ] = "%=",
    119 	[T_MODULO] = "%",
    120 	[T_NEQUAL] = "!=",
    121 	[T_BOREQ] = "|=",
    122 	[T_PLUS] = "+",
    123 	[T_PLUSEQ] = "+=",
    124 	[T_QUESTION] = "?",
    125 	[T_RBRACE] = "}",
    126 	[T_RBRACKET] = "]",
    127 	[T_RPAREN] = ")",
    128 	[T_RSHIFT] = ">>",
    129 	[T_RSHIFTEQ] = ">>=",
    130 	[T_SEMICOLON] = ";",
    131 	[T_SLICE] = "..",
    132 	[T_TIMES] = "*",
    133 	[T_TIMESEQ] = "*=",
    134 	[T_BXOR] = "^",
    135 	[T_BXOREQ] = "^=",
    136 };
    137 
    138 void
    139 lex_init(struct lexer *lexer, FILE *f, int fileid)
    140 {
    141 	memset(lexer, 0, sizeof(*lexer));
    142 	lexer->in = f;
    143 	lexer->bufsz = 256;
    144 	lexer->buf = xcalloc(1, lexer->bufsz);
    145 	lexer->un.token = T_ERROR;
    146 	lexer->loc.lineno = 1;
    147 	lexer->loc.colno = 0;
    148 	lexer->loc.file = fileid;
    149 	lexer->c[0] = UINT32_MAX;
    150 	lexer->c[1] = UINT32_MAX;
    151 }
    152 
    153 void
    154 lex_finish(struct lexer *lexer)
    155 {
    156 	fclose(lexer->in);
    157 	free(lexer->buf);
    158 }
    159 
    160 static void
    161 update_lineno(struct location *loc, uint32_t c)
    162 {
    163 	if (c == '\n') {
    164 		loc->lineno++;
    165 		loc->colno = 0;
    166 	} else if (c == '\t') {
    167 		loc->colno += 8;
    168 	} else {
    169 		loc->colno++;
    170 	}
    171 }
    172 
    173 static uint32_t
    174 next(struct lexer *lexer, struct location *loc, bool buffer)
    175 {
    176 	uint32_t c;
    177 	if (lexer->c[0] != UINT32_MAX) {
    178 		c = lexer->c[0];
    179 		lexer->c[0] = lexer->c[1];
    180 		lexer->c[1] = UINT32_MAX;
    181 	} else {
    182 		c = utf8_get(lexer->in);
    183 		update_lineno(&lexer->loc, c);
    184 	}
    185 	if (loc != NULL) {
    186 		*loc = lexer->loc;
    187 		for (size_t i = 0; i < 2 && lexer->c[i] != UINT32_MAX; i++) {
    188 			update_lineno(&lexer->loc, lexer->c[i]);
    189 		}
    190 	}
    191 	if (c == UTF8_INVALID || !buffer) {
    192 		return c;
    193 	}
    194 	if (lexer->buflen + utf8_cpsize(c) >= lexer->bufsz) {
    195 		lexer->bufsz *= 2;
    196 		lexer->buf = xrealloc(lexer->buf, lexer->bufsz);
    197 	}
    198 	char buf[UTF8_MAX_SIZE];
    199 	size_t sz = utf8_encode(&buf[0], c);
    200 	memcpy(lexer->buf + lexer->buflen, buf, sz);
    201 	lexer->buflen += sz;
    202 	lexer->buf[lexer->buflen] = '\0';
    203 	return c;
    204 }
    205 
    206 static bool
    207 isharespace(uint32_t c)
    208 {
    209 	return c == '\t' || c == '\n' || c == ' ';
    210 }
    211 
    212 static uint32_t
    213 wgetc(struct lexer *lexer, struct location *loc)
    214 {
    215 	uint32_t c;
    216 	while ((c = next(lexer, loc, false)) != UTF8_INVALID && isharespace(c)) ;
    217 	return c;
    218 }
    219 
    220 static void
    221 consume(struct lexer *lexer, ssize_t n)
    222 {
    223 	if (n == -1) {
    224 		lexer->buflen = 0;
    225 		lexer->buf[0] = 0;
    226 		return;
    227 	}
    228 	for (ssize_t i = 0; i < n; i++) {
    229 		while ((lexer->buf[--lexer->buflen] & 0xC0) == 0x80) ;
    230 	}
    231 	lexer->buf[lexer->buflen] = 0;
    232 }
    233 
    234 static void
    235 push(struct lexer *lexer, uint32_t c, bool buffer)
    236 {
    237 	assert(lexer->c[1] == UINT32_MAX);
    238 	lexer->c[1] = lexer->c[0];
    239 	lexer->c[0] = c;
    240 	if (buffer) {
    241 		consume(lexer, 1);
    242 	}
    243 }
    244 
    245 static int
    246 cmp_keyword(const void *va, const void *vb)
    247 {
    248 	return strcmp(*(const char **)va, *(const char **)vb);
    249 }
    250 
    251 static uint32_t
    252 lex_name(struct lexer *lexer, struct token *out)
    253 {
    254 	uint32_t c = next(lexer, &out->loc, true);
    255 	assert(c != UTF8_INVALID && c <= 0x7F && (isalpha(c) || c == '_' || c == '@'));
    256 	while ((c = next(lexer, NULL, true)) != UTF8_INVALID) {
    257 		if (c > 0x7F || (!isalnum(c) && c != '_')) {
    258 			push(lexer, c, true);
    259 			break;
    260 		}
    261 	}
    262 
    263 	void *token = bsearch(&lexer->buf, tokens, T_LAST_KEYWORD + 1,
    264 			sizeof(tokens[0]), cmp_keyword);
    265 	if (!token) {
    266 		if (lexer->buf[0] == '@') {
    267 			out->token = T_ERROR;
    268 			consume(lexer, -1);
    269 			return out->token;
    270 		}
    271 		out->token = T_NAME;
    272 		out->name = strdup(lexer->buf);
    273 	} else {
    274 		out->token = (const char **)token - tokens;
    275 	}
    276 	consume(lexer, -1);
    277 	return out->token;
    278 }
    279 
    280 static uint32_t
    281 lex_literal(struct lexer *lexer, struct token *out)
    282 {
    283 	uint32_t c = next(lexer, &out->loc, true);
    284 	assert(c != UTF8_INVALID && c <= 0x7F && isdigit(c));
    285 
    286 	int base = 10;
    287 	const char *basechrs = "0123456789";
    288 	if (c == '0') {
    289 		switch ((c = next(lexer, NULL, true))) {
    290 		case 'b':
    291 			base = 2;
    292 			basechrs = "01";
    293 			consume(lexer, 2);
    294 			break;
    295 		case 'o':
    296 			base = 8;
    297 			basechrs = "01234567";
    298 			consume(lexer, 2);
    299 			break;
    300 		case 'x':
    301 			base = 16;
    302 			basechrs = "0123456789ABCDEFabcdef";
    303 			consume(lexer, 2);
    304 			break;
    305 		default:
    306 			push(lexer, c, true);
    307 			break;
    308 		}
    309 	}
    310 
    311 	char *suff = NULL;
    312 	char *exp = NULL;
    313 	bool isfloat = false;
    314 	while ((c = next(lexer, NULL, true)) != UTF8_INVALID) {
    315 		if (!strchr(basechrs, c)) {
    316 			switch (c) {
    317 			case '.':
    318 				if (lexer->require_int) {
    319 					push(lexer, '.', true);
    320 					goto finalize;
    321 				}
    322 				if (isfloat || suff || exp) {
    323 					push(lexer, c, true);
    324 					goto finalize;
    325 				}
    326 				if (!strchr(basechrs, c = next(lexer, NULL, false))) {
    327 					push(lexer, c, false);
    328 					push(lexer, '.', true);
    329 					goto finalize;
    330 				} else {
    331 					push(lexer, c, false);
    332 				}
    333 				isfloat = true;
    334 				break;
    335 			case 'e':
    336 				if (exp || suff) {
    337 					push(lexer, c, true);
    338 					goto finalize;
    339 				}
    340 				// exponent is always in base 10
    341 				basechrs = "0123456789";
    342 				c = next(lexer, NULL, true);
    343 				if (c != '-' && c != '+' && !strchr(basechrs, c)) {
    344 					push(lexer, c, true);
    345 					push(lexer, 'e', true);
    346 					goto finalize;
    347 				};
    348 				exp = &lexer->buf[lexer->buflen - 1];
    349 				break;
    350 			case 'i':
    351 			case 'u':
    352 			case 'f':
    353 			case 'z':
    354 				if (suff) {
    355 					push(lexer, c, true);
    356 					goto finalize;
    357 				}
    358 				suff = &lexer->buf[lexer->buflen - 1];
    359 				basechrs = "0123456789";
    360 				break;
    361 			default:
    362 				push(lexer, c, true);
    363 				goto finalize;
    364 			}
    365 		}
    366 	}
    367 
    368 finalize:
    369 	lexer->require_int = false;
    370 	out->token = T_LITERAL;
    371 	if (isfloat) {
    372 		out->storage = STORAGE_FCONST;
    373 	} else {
    374 		out->storage = STORAGE_ICONST;
    375 	}
    376 	if (suff) {
    377 		const char *suffs[] = {
    378 			[STORAGE_U8] = "u8",
    379 			[STORAGE_U16] = "u16",
    380 			[STORAGE_U32] = "u32",
    381 			[STORAGE_U64] = "u64",
    382 			[STORAGE_I8] = "i8",
    383 			[STORAGE_I16] = "i16",
    384 			[STORAGE_I32] = "i32",
    385 			[STORAGE_I64] = "i64",
    386 
    387 			[STORAGE_UINT] = "u",
    388 			[STORAGE_INT] = "i",
    389 			[STORAGE_SIZE] = "z",
    390 			[STORAGE_F32] = "f32",
    391 			[STORAGE_F64] = "f64",
    392 		};
    393 		bool isvalid = false;
    394 		for (enum type_storage i = 0;
    395 				i < sizeof(suffs) / sizeof(suffs[0]); ++i) {
    396 			if (suffs[i] && strcmp(suff, suffs[i]) == 0) {
    397 				isvalid = true;
    398 				out->storage = i;
    399 				break;
    400 			}
    401 		}
    402 		if (!isvalid) {
    403 			out->token = T_ERROR;
    404 			consume(lexer, -1);
    405 			return out->token;
    406 		}
    407 	}
    408 
    409 	intmax_t exponent = 0;
    410 	if (exp) {
    411 		char *endptr = NULL;
    412 		exponent = strtoimax(exp, &endptr, 10);
    413 		// integers can't have negative exponents
    414 		if (exponent < 0 && !suff) {
    415 			out->storage = STORAGE_FCONST;
    416 		}
    417 		enum type_storage s = out->storage;
    418 		bool valid = exponent >= 0
    419 			|| s == STORAGE_F32
    420 			|| s == STORAGE_F64
    421 			|| s == STORAGE_FCONST;
    422 		if (endptr == exp || !valid) {
    423 			out->token = T_ERROR;
    424 			consume(lexer, -1);
    425 			return out->token;
    426 		}
    427 	}
    428 
    429 	if (isfloat) {
    430 		switch (out->storage) {
    431 		case STORAGE_F32:
    432 		case STORAGE_F64:
    433 		case STORAGE_FCONST:
    434 			break;
    435 		default:
    436 			out->token = T_ERROR;
    437 			consume(lexer, -1);
    438 			return out->token;
    439 		}
    440 	}
    441 
    442 	errno = 0;
    443 	switch (out->storage) {
    444 	case STORAGE_U8:
    445 	case STORAGE_U16:
    446 	case STORAGE_U32:
    447 	case STORAGE_UINT:
    448 	case STORAGE_U64:
    449 	case STORAGE_SIZE:
    450 		out->uval = strtoumax(lexer->buf, NULL, base);
    451 		for (intmax_t i = 0; i < exponent; i++) {
    452 			out->uval *= 10;
    453 		}
    454 		break;
    455 	case STORAGE_ICONST:
    456 		out->uval = strtoumax(lexer->buf, NULL, base);
    457 		for (intmax_t i = 0; i < exponent; i++) {
    458 			out->uval *= 10;
    459 		}
    460 		if (out->uval > (uintmax_t)INT64_MAX) {
    461 			out->storage = STORAGE_U64;
    462 			break;
    463 		}
    464 		// Fallthrough
    465 	case STORAGE_I8:
    466 	case STORAGE_I16:
    467 	case STORAGE_I32:
    468 	case STORAGE_INT:
    469 	case STORAGE_I64:
    470 		out->uval = strtoumax(lexer->buf, NULL, base);
    471 		for (intmax_t i = 0; i < exponent; i++) {
    472 			out->uval *= 10;
    473 		}
    474 		if (out->uval == (uintmax_t)INT64_MIN) {
    475 			// XXX: Hack
    476 			out->ival = INT64_MIN;
    477 			break;
    478 		}
    479 		out->ival = strtoimax(lexer->buf, NULL, base);
    480 		for (intmax_t i = 0; i < exponent; i++) {
    481 			out->ival *= 10;
    482 		}
    483 		break;
    484 	case STORAGE_F32:
    485 	case STORAGE_F64:
    486 	case STORAGE_FCONST:
    487 		out->fval = strtod(lexer->buf, NULL);
    488 		break;
    489 	default:
    490 		assert(0);
    491 	}
    492 	if (errno == ERANGE && !isfloat) {
    493 		out->token = T_ERROR;
    494 	}
    495 	consume(lexer, -1);
    496 	return out->token;
    497 }
    498 
    499 static uint32_t
    500 lex_rune(struct lexer *lexer)
    501 {
    502 	char buf[9];
    503 	char *endptr;
    504 	uint32_t c = next(lexer, NULL, false);
    505 	assert(c != UTF8_INVALID);
    506 
    507 	switch (c) {
    508 	case '\\':
    509 		c = next(lexer, NULL, false);
    510 		switch (c) {
    511 		case '0':
    512 			return '\0';
    513 		case 'a':
    514 			return '\a';
    515 		case 'b':
    516 			return '\b';
    517 		case 'f':
    518 			return '\f';
    519 		case 'n':
    520 			return '\n';
    521 		case 'r':
    522 			return '\r';
    523 		case 't':
    524 			return '\t';
    525 		case 'v':
    526 			return '\v';
    527 		case '\\':
    528 			return '\\';
    529 		case '\'':
    530 			return '\'';
    531 		case '"':
    532 			return '\"';
    533 		case 'x':
    534 			buf[0] = next(lexer, NULL, false);
    535 			buf[1] = next(lexer, NULL, false);
    536 			buf[2] = '\0';
    537 			c = strtoul(&buf[0], &endptr, 16);
    538 			if (*endptr != '\0') {
    539 				fprintf(stderr,
    540 					"Error: invalid hex literal at %s:%d:%d\n",
    541 					sources[lexer->loc.file], lexer->loc.lineno,
    542 					lexer->loc.colno);
    543 				exit(EXIT_FAILURE);
    544 			}
    545 			return c;
    546 		case 'u':
    547 			buf[0] = next(lexer, NULL, false);
    548 			buf[1] = next(lexer, NULL, false);
    549 			buf[2] = next(lexer, NULL, false);
    550 			buf[3] = next(lexer, NULL, false);
    551 			buf[4] = '\0';
    552 			c = strtoul(&buf[0], &endptr, 16);
    553 			if (*endptr != '\0') {
    554 				fprintf(stderr,
    555 					"Error: invalid hex literal at %s:%d:%d\n",
    556 					sources[lexer->loc.file], lexer->loc.lineno,
    557 					lexer->loc.colno);
    558 				exit(EXIT_FAILURE);
    559 			}
    560 			return c;
    561 		case 'U':
    562 			buf[0] = next(lexer, NULL, false);
    563 			buf[1] = next(lexer, NULL, false);
    564 			buf[2] = next(lexer, NULL, false);
    565 			buf[3] = next(lexer, NULL, false);
    566 			buf[4] = next(lexer, NULL, false);
    567 			buf[5] = next(lexer, NULL, false);
    568 			buf[6] = next(lexer, NULL, false);
    569 			buf[7] = next(lexer, NULL, false);
    570 			buf[8] = '\0';
    571 			c = strtoul(&buf[0], &endptr, 16);
    572 			if (*endptr != '\0') {
    573 				fprintf(stderr,
    574 					"Error: invalid hex literal at %s:%d:%d\n",
    575 					sources[lexer->loc.file], lexer->loc.lineno,
    576 					lexer->loc.colno);
    577 				exit(EXIT_FAILURE);
    578 			}
    579 			return c;
    580 		default:
    581 			fprintf(stderr,
    582 				"Error: invalid escape '\\%c' at %s:%d:%d\n",
    583 				c, sources[lexer->loc.file], lexer->loc.lineno,
    584 				lexer->loc.colno);
    585 			exit(EXIT_FAILURE);
    586 		}
    587 		assert(0);
    588 	default:
    589 		return c;
    590 	}
    591 	assert(0);
    592 }
    593 
    594 static enum lexical_token
    595 lex_string(struct lexer *lexer, struct token *out)
    596 {
    597 	uint32_t c = next(lexer, &out->loc, false);
    598 	uint32_t delim;
    599 	assert(c != UTF8_INVALID);
    600 
    601 	switch (c) {
    602 	case '"':
    603 	case '`':
    604 		delim = c;
    605 		while ((c = next(lexer, NULL, false)) != UTF8_INVALID) {
    606 			if (c == delim) {
    607 				char *buf = xcalloc(lexer->buflen + 1, 1);
    608 				memcpy(buf, lexer->buf, lexer->buflen);
    609 				out->token = T_LITERAL;
    610 				out->storage = STORAGE_STRING;
    611 				out->string.len = lexer->buflen;
    612 				out->string.value = buf;
    613 				consume(lexer, -1);
    614 				return out->token;
    615 			} else {
    616 				push(lexer, c, false);
    617 				if (delim == '"') {
    618 					push(lexer, lex_rune(lexer), false);
    619 				}
    620 				next(lexer, NULL, true);
    621 			}
    622 		}
    623 		assert(0); // Invariant
    624 	case '\'':
    625 		c = next(lexer, NULL, false);
    626 		switch (c) {
    627 		case '\'':
    628 			assert(0); // Invariant
    629 		case '\\':
    630 			push(lexer, c, false);
    631 			out->rune = lex_rune(lexer);
    632 			break;
    633 		default:
    634 			out->rune = c;
    635 		}
    636 		c = next(lexer, NULL, false);
    637 		assert(c == '\'');
    638 		out->token = T_LITERAL;
    639 		out->storage = STORAGE_RCONST;
    640 		return out->token;
    641 	default:
    642 		assert(0); // Invariant
    643 	}
    644 	assert(0);
    645 }
    646 
    647 static enum lexical_token
    648 lex3(struct lexer *lexer, struct token *out, uint32_t c)
    649 {
    650 	assert(c != UTF8_INVALID);
    651 
    652 	switch (c) {
    653 	case '.':
    654 		switch ((c = next(lexer, NULL, false))) {
    655 		case '.':
    656 			switch ((c = next(lexer, NULL, false))) {
    657 			case '.':
    658 				out->token = T_ELLIPSIS;
    659 				break;
    660 			default:
    661 				push(lexer, c, false);
    662 				out->token = T_SLICE;
    663 				break;
    664 			}
    665 			break;
    666 		default:
    667 			push(lexer, c, false);
    668 			out->token = T_DOT;
    669 			lexer->require_int = true;
    670 			break;
    671 		}
    672 		break;
    673 	case '<':
    674 		switch ((c = next(lexer, NULL, false))) {
    675 		case '<':
    676 			switch ((c = next(lexer, NULL, false))) {
    677 			case '=':
    678 				out->token = T_LSHIFTEQ;
    679 				break;
    680 			default:
    681 				push(lexer, c, false);
    682 				out->token = T_LSHIFT;
    683 				break;
    684 			}
    685 			break;
    686 		case '=':
    687 			out->token = T_LESSEQ;
    688 			break;
    689 		default:
    690 			push(lexer, c, false);
    691 			out->token = T_LESS;
    692 			break;
    693 		}
    694 		break;
    695 	case '>':
    696 		switch ((c = next(lexer, NULL, false))) {
    697 		case '>':
    698 			switch ((c = next(lexer, NULL, false))) {
    699 			case '=':
    700 				out->token = T_RSHIFTEQ;
    701 				break;
    702 			default:
    703 				push(lexer, c, false);
    704 				out->token = T_RSHIFT;
    705 				break;
    706 			}
    707 			break;
    708 		case '=':
    709 			out->token = T_GREATEREQ;
    710 			break;
    711 		default:
    712 			push(lexer, c, false);
    713 			out->token = T_GREATER;
    714 			break;
    715 		}
    716 		break;
    717 	case '&':
    718 		switch ((c = next(lexer, NULL, false))) {
    719 		case '&':
    720 			switch ((c = next(lexer, NULL, false))) {
    721 			case '=':
    722 				out->token = T_LANDEQ;
    723 				break;
    724 			default:
    725 				push(lexer, c, false);
    726 				out->token = T_LAND;
    727 				break;
    728 			}
    729 			break;
    730 		case '=':
    731 			out->token = T_BANDEQ;
    732 			break;
    733 		default:
    734 			push(lexer, c, false);
    735 			out->token = T_BAND;
    736 			break;
    737 		}
    738 		break;
    739 	case '|':
    740 		switch ((c = next(lexer, NULL, false))) {
    741 		case '|':
    742 			switch ((c = next(lexer, NULL, false))) {
    743 			case '=':
    744 				out->token = T_LOREQ;
    745 				break;
    746 			default:
    747 				push(lexer, c, false);
    748 				out->token = T_LOR;
    749 				break;
    750 			}
    751 			break;
    752 		case '=':
    753 			out->token = T_BOREQ;
    754 			break;
    755 		default:
    756 			push(lexer, c, false);
    757 			out->token = T_BOR;
    758 			break;
    759 		}
    760 		break;
    761 	case '^':
    762 		switch ((c = next(lexer, NULL, false))) {
    763 		case '^':
    764 			switch ((c = next(lexer, NULL, false))) {
    765 			case '=':
    766 				out->token = T_LXOREQ;
    767 				break;
    768 			default:
    769 				push(lexer, c, false);
    770 				out->token = T_LXOR;
    771 				break;
    772 			}
    773 			break;
    774 		case '=':
    775 			out->token = T_BXOREQ;
    776 			break;
    777 		default:
    778 			push(lexer, c, false);
    779 			out->token = T_BXOR;
    780 			break;
    781 		}
    782 		break;
    783 	default:
    784 		assert(0); // Invariant
    785 	}
    786 
    787 	return out->token;
    788 }
    789 
    790 static enum lexical_token _lex(struct lexer *lexer, struct token *out);
    791 
    792 static enum lexical_token
    793 lex2(struct lexer *lexer, struct token *out, uint32_t c)
    794 {
    795 	assert(c != UTF8_INVALID);
    796 
    797 	switch (c) {
    798 	case '*':
    799 		switch ((c = next(lexer, NULL, false))) {
    800 		case '=':
    801 			out->token = T_TIMESEQ;
    802 			break;
    803 		default:
    804 			push(lexer, c, false);
    805 			out->token = T_TIMES;
    806 			break;
    807 		}
    808 		break;
    809 	case '%':
    810 		switch ((c = next(lexer, NULL, false))) {
    811 		case '=':
    812 			out->token = T_MODEQ;
    813 			break;
    814 		default:
    815 			push(lexer, c, false);
    816 			out->token = T_MODULO;
    817 			break;
    818 		}
    819 		break;
    820 	case '/':
    821 		switch ((c = next(lexer, NULL, false))) {
    822 		case '=':
    823 			out->token = T_DIVEQ;
    824 			break;
    825 		case '/':
    826 			while ((c = next(lexer, NULL, false)) != UTF8_INVALID && c != '\n') ;
    827 			return _lex(lexer, out);
    828 		default:
    829 			push(lexer, c, false);
    830 			out->token = T_DIV;
    831 			break;
    832 		}
    833 		break;
    834 	case '+':
    835 		switch ((c = next(lexer, NULL, false))) {
    836 		case '=':
    837 			out->token = T_PLUSEQ;
    838 			break;
    839 		default:
    840 			push(lexer, c, false);
    841 			out->token = T_PLUS;
    842 			break;
    843 		}
    844 		break;
    845 	case '-':
    846 		switch ((c = next(lexer, NULL, false))) {
    847 		case '=':
    848 			out->token = T_MINUSEQ;
    849 			break;
    850 		default:
    851 			push(lexer, c, false);
    852 			out->token = T_MINUS;
    853 			break;
    854 		}
    855 		break;
    856 	case ':':
    857 		switch ((c = next(lexer, NULL, false))) {
    858 		case ':':
    859 			out->token = T_DOUBLE_COLON;
    860 			break;
    861 		default:
    862 			push(lexer, c, false);
    863 			out->token = T_COLON;
    864 			break;
    865 		}
    866 		break;
    867 	case '!':
    868 		switch ((c = next(lexer, NULL, false))) {
    869 		case '=':
    870 			out->token = T_NEQUAL;
    871 			break;
    872 		default:
    873 			push(lexer, c, false);
    874 			out->token = T_LNOT;
    875 			break;
    876 		}
    877 		break;
    878 	case '=':
    879 		switch ((c = next(lexer, NULL, false))) {
    880 		case '=':
    881 			out->token = T_LEQUAL;
    882 			break;
    883 		case '>':
    884 			out->token = T_ARROW;
    885 			break;
    886 		default:
    887 			push(lexer, c, false);
    888 			out->token = T_EQUAL;
    889 			break;
    890 		}
    891 		break;
    892 	default:
    893 		assert(0); // Invariant
    894 	}
    895 
    896 	return out->token;
    897 }
    898 
    899 static enum lexical_token
    900 _lex(struct lexer *lexer, struct token *out)
    901 {
    902 	if (lexer->un.token != T_ERROR) {
    903 		*out = lexer->un;
    904 		lexer->un.token = T_ERROR;
    905 		return out->token;
    906 	}
    907 
    908 	uint32_t c = wgetc(lexer, &out->loc);
    909 	if (c == UTF8_INVALID) {
    910 		out->token = T_EOF;
    911 		return out->token;
    912 	}
    913 
    914 	if (c <= 0x7F && isdigit(c)) {
    915 		push(lexer, c, false);
    916 		return lex_literal(lexer, out);
    917 	}
    918 
    919 	lexer->require_int = false;
    920 
    921 	if (c <= 0x7F && (isalpha(c) || c == '_' || c == '@')) {
    922 		push(lexer, c, false);
    923 		return lex_name(lexer, out);
    924 	}
    925 
    926 	char p[5];
    927 	switch (c) {
    928 	case '"':
    929 	case '`':
    930 	case '\'':
    931 		push(lexer, c, false);
    932 		return lex_string(lexer, out);
    933 	case '.': // . .. ...
    934 	case '<': // < << <= <<=
    935 	case '>': // > >> >= >>=
    936 	case '&': // & && &= &&=
    937 	case '|': // | || |= ||=
    938 	case '^': // ^ ^^ ^= ^^=
    939 		return lex3(lexer, out, c);
    940 	case '*': // * *=
    941 	case '%': // % %=
    942 	case '/': // / /= //
    943 	case '+': // + +=
    944 	case '-': // - -=
    945 	case ':': // : ::
    946 	case '!': // ! !=
    947 	case '=': // = == =>
    948 		return lex2(lexer, out, c);
    949 	case '~':
    950 		out->token = T_BNOT;
    951 		break;
    952 	case ',':
    953 		out->token = T_COMMA;
    954 		break;
    955 	case '{':
    956 		out->token = T_LBRACE;
    957 		break;
    958 	case '[':
    959 		out->token = T_LBRACKET;
    960 		break;
    961 	case '(':
    962 		out->token = T_LPAREN;
    963 		break;
    964 	case '}':
    965 		out->token = T_RBRACE;
    966 		break;
    967 	case ']':
    968 		out->token = T_RBRACKET;
    969 		break;
    970 	case ')':
    971 		out->token = T_RPAREN;
    972 		break;
    973 	case ';':
    974 		out->token = T_SEMICOLON;
    975 		break;
    976 	case '?':
    977 		out->token = T_QUESTION;
    978 		break;
    979 	default:
    980 		p[utf8_encode(p, c)] = '\0';
    981 		fprintf(stderr, "Error: unexpected code point '%s' at %s:%d:%d\n",
    982 			p, sources[lexer->loc.file], lexer->loc.lineno,
    983 			lexer->loc.colno);
    984 		exit(EXIT_FAILURE);
    985 	}
    986 
    987 	return out->token;
    988 }
    989 
    990 enum lexical_token
    991 lex(struct lexer *lexer, struct token *out)
    992 {
    993 	return _lex(lexer, out);
    994 }
    995 
    996 void
    997 token_finish(struct token *tok)
    998 {
    999 	switch (tok->token) {
   1000 	case T_NAME:
   1001 		free(tok->name);
   1002 		break;
   1003 	case T_LITERAL:
   1004 		switch (tok->storage) {
   1005 		case STORAGE_STRING:
   1006 			free(tok->string.value);
   1007 			break;
   1008 		default:
   1009 			break;
   1010 		}
   1011 		break;
   1012 	default:
   1013 		break;
   1014 	}
   1015 	tok->token = 0;
   1016 	tok->storage = 0;
   1017 	tok->loc.file = 0;
   1018 	tok->loc.colno = 0;
   1019 	tok->loc.lineno = 0;
   1020 }
   1021 
   1022 const char *
   1023 lexical_token_str(enum lexical_token tok)
   1024 {
   1025 	switch (tok) {
   1026 	case T_NAME:
   1027 		return "name";
   1028 	case T_LITERAL:
   1029 		return "literal";
   1030 	case T_EOF:
   1031 		return "end of file";
   1032 	case T_ERROR:
   1033 		return "error";
   1034 	default:
   1035 		assert(tok < sizeof(tokens) / sizeof(tokens[0]));
   1036 		return tokens[tok];
   1037 	}
   1038 }
   1039 
   1040 static const char *
   1041 rune_unparse(uint32_t c)
   1042 {
   1043 	static char buf[11];
   1044 	switch (c) {
   1045 	case '\0':
   1046 		snprintf(buf, sizeof(buf), "\\0");
   1047 		break;
   1048 	case '\a':
   1049 		snprintf(buf, sizeof(buf), "\\a");
   1050 		break;
   1051 	case '\b':
   1052 		snprintf(buf, sizeof(buf), "\\b");
   1053 		break;
   1054 	case '\f':
   1055 		snprintf(buf, sizeof(buf), "\\f");
   1056 		break;
   1057 	case '\n':
   1058 		snprintf(buf, sizeof(buf), "\\n");
   1059 		break;
   1060 	case '\r':
   1061 		snprintf(buf, sizeof(buf), "\\r");
   1062 		break;
   1063 	case '\t':
   1064 		snprintf(buf, sizeof(buf), "\\t");
   1065 		break;
   1066 	case '\v':
   1067 		snprintf(buf, sizeof(buf), "\\v");
   1068 		break;
   1069 	case '\\':
   1070 		snprintf(buf, sizeof(buf), "\\\\");
   1071 		break;
   1072 	case '\'':
   1073 		snprintf(buf, sizeof(buf), "\\'");
   1074 		break;
   1075 	case '"':
   1076 		snprintf(buf, sizeof(buf), "\\\"");
   1077 		break;
   1078 	default:
   1079 		if (c > 0xffff) {
   1080 			snprintf(buf, sizeof(buf), "\\U%08x", c);
   1081 		} else if (c > 0x7F) {
   1082 			snprintf(buf, sizeof(buf), "\\u%04x", c);
   1083 		} else if (!isprint(c)) {
   1084 			snprintf(buf, sizeof(buf), "\\x%02x", c);
   1085 		} else {
   1086 			assert(utf8_cpsize(c) < sizeof(buf));
   1087 			buf[utf8_encode(buf, c)] = '\0';
   1088 		}
   1089 		break;
   1090 	}
   1091 	return buf;
   1092 }
   1093 
   1094 static const char *
   1095 string_unparse(const struct token *tok)
   1096 {
   1097 	static char buf[1024];
   1098 	assert(tok->token == T_LITERAL && tok->storage == STORAGE_STRING);
   1099 	int bytes = 0;
   1100 	memset(buf, 0, sizeof(buf));
   1101 	bytes += snprintf(&buf[bytes], sizeof(buf) - bytes, "\"");
   1102 	const char *s = tok->string.value;
   1103 	for (uint32_t c = utf8_decode(&s);
   1104 			s - tok->string.value <= (ptrdiff_t)tok->string.len;
   1105 			c = utf8_decode(&s)) {
   1106 		bytes += snprintf(&buf[bytes], sizeof(buf) - bytes, "%s",
   1107 			rune_unparse(c));
   1108 	}
   1109 	bytes += snprintf(&buf[bytes], sizeof(buf) - bytes, "\"");
   1110 	return buf;
   1111 }
   1112 
   1113 const char *
   1114 token_str(const struct token *tok)
   1115 {
   1116 	static char buf[1024];
   1117 	int bytes = 0;
   1118 	switch (tok->token) {
   1119 	case T_NAME:
   1120 		snprintf(buf, sizeof(buf), "name %s", tok->name);
   1121 		return buf;
   1122 	case T_LITERAL:
   1123 		switch (tok->storage) {
   1124 		case STORAGE_U8:
   1125 		case STORAGE_U16:
   1126 		case STORAGE_U32:
   1127 		case STORAGE_U64:
   1128 		case STORAGE_UINT:
   1129 		case STORAGE_UINTPTR:
   1130 		case STORAGE_SIZE:
   1131 			snprintf(buf, sizeof(buf), "%ju", tok->uval);
   1132 			break;
   1133 		case STORAGE_I8:
   1134 		case STORAGE_I16:
   1135 		case STORAGE_I32:
   1136 		case STORAGE_I64:
   1137 		case STORAGE_ICONST:
   1138 		case STORAGE_INT:
   1139 			snprintf(buf, sizeof(buf), "%jd", tok->ival);
   1140 			break;
   1141 		case STORAGE_F32:
   1142 		case STORAGE_F64:
   1143 		case STORAGE_FCONST:
   1144 			snprintf(buf, sizeof(buf), "%lf", tok->fval);
   1145 			break;
   1146 		case STORAGE_RCONST:
   1147 			bytes += snprintf(&buf[bytes], sizeof(buf) - bytes, "'");
   1148 			bytes += snprintf(&buf[bytes], sizeof(buf) - bytes, "%s",
   1149 				rune_unparse(tok->rune));
   1150 			bytes += snprintf(&buf[bytes], sizeof(buf) - bytes, "'");
   1151 			break;
   1152 		case STORAGE_STRING:
   1153 			return string_unparse(tok);
   1154 		case STORAGE_ALIAS:
   1155 		case STORAGE_ARRAY:
   1156 		case STORAGE_BOOL:
   1157 		case STORAGE_CHAR:
   1158 		case STORAGE_ENUM:
   1159 		case STORAGE_FUNCTION:
   1160 		case STORAGE_POINTER:
   1161 		case STORAGE_NULL:
   1162 		case STORAGE_RUNE:
   1163 		case STORAGE_SLICE:
   1164 		case STORAGE_STRUCT:
   1165 		case STORAGE_TAGGED:
   1166 		case STORAGE_TUPLE:
   1167 		case STORAGE_UNION:
   1168 		case STORAGE_VALIST:
   1169 		case STORAGE_VOID:
   1170 			assert(0);
   1171 		}
   1172 		return buf;
   1173 	default:;
   1174 		const char *out = lexical_token_str(tok->token);
   1175 		return out;
   1176 	}
   1177 }
   1178 
   1179 void
   1180 unlex(struct lexer *lexer, struct token *in)
   1181 {
   1182 	assert(lexer->un.token == T_ERROR && "Only one unlex is supported");
   1183 	lexer->un = *in;
   1184 }