lex.c (24064B)
1 #include <assert.h> 2 #include <ctype.h> 3 #include <errno.h> 4 #include <inttypes.h> 5 #include <stdarg.h> 6 #include <stdbool.h> 7 #include <stddef.h> 8 #include <stdint.h> 9 #include <stdio.h> 10 #include <stdlib.h> 11 #include <stdnoreturn.h> 12 #include <string.h> 13 #include "lex.h" 14 #include "utf8.h" 15 #include "util.h" 16 17 static const char *tokens[] = { 18 // Must be alpha sorted and match lex.h 19 [T_ATTR_FINI] = "@fini", 20 [T_ATTR_INIT] = "@init", 21 [T_ATTR_NORETURN] = "@noreturn", 22 [T_ATTR_OFFSET] = "@offset", 23 [T_ATTR_PACKED] = "@packed", 24 [T_ATTR_SYMBOL] = "@symbol", 25 [T_ATTR_TEST] = "@test", 26 [T_ATTR_THREADLOCAL] = "@threadlocal", 27 [T_UNDERSCORE] = "_", 28 [T_ABORT] = "abort", 29 [T_ALIGN] = "align", 30 [T_ALLOC] = "alloc", 31 [T_APPEND] = "append", 32 [T_AS] = "as", 33 [T_ASSERT] = "assert", 34 [T_BOOL] = "bool", 35 [T_BREAK] = "break", 36 [T_CASE] = "case", 37 [T_CHAR] = "char", 38 [T_CONST] = "const", 39 [T_CONTINUE] = "continue", 40 [T_DEFER] = "defer", 41 [T_DEF] = "def", 42 [T_DELETE] = "delete", 43 [T_ELSE] = "else", 44 [T_ENUM] = "enum", 45 [T_EXPORT] = "export", 46 [T_F32] = "f32", 47 [T_F64] = "f64", 48 [T_FALSE] = "false", 49 [T_FN] = "fn", 50 [T_FOR] = "for", 51 [T_FREE] = "free", 52 [T_I16] = "i16", 53 [T_I32] = "i32", 54 [T_I64] = "i64", 55 [T_I8] = "i8", 56 [T_IF] = "if", 57 [T_INSERT] = "insert", 58 [T_INT] = "int", 59 [T_IS] = "is", 60 [T_LEN] = "len", 61 [T_LET] = "let", 62 [T_MATCH] = "match", 63 [T_NULL] = "null", 64 [T_NULLABLE] = "nullable", 65 [T_OFFSET] = "offset", 66 [T_RETURN] = "return", 67 [T_RUNE] = "rune", 68 [T_SIZE] = "size", 69 [T_STATIC] = "static", 70 [T_STR] = "str", 71 [T_STRUCT] = "struct", 72 [T_SWITCH] = "switch", 73 [T_TRUE] = "true", 74 [T_TYPE] = "type", 75 [T_U16] = "u16", 76 [T_U32] = "u32", 77 [T_U64] = "u64", 78 [T_U8] = "u8", 79 [T_UINT] = "uint", 80 [T_UINTPTR] = "uintptr", 81 [T_UNION] = "union", 82 [T_USE] = "use", 83 [T_VAARG] = "vaarg", 84 [T_VAEND] = "vaend", 85 [T_VALIST] = "valist", 86 [T_VASTART] = "vastart", 87 [T_VOID] = "void", 88 [T_YIELD] = "yield", 89 90 // Operators 91 [T_ARROW] = "=>", 92 [T_BANDEQ] = "&=", 93 [T_BAND] = "&", 94 [T_BNOT] = "~", 95 [T_BOR] = "|", 96 [T_COLON] = ":", 97 [T_COMMA] = ",", 98 [T_DIV] = "/", 99 [T_DIVEQ] = "/=", 100 [T_DOT] = ".", 101 [T_DOUBLE_COLON] = "::", 102 [T_ELLIPSIS] = "...", 103 [T_EQUAL] = "=", 104 [T_GREATER] = ">", 105 [T_GREATEREQ] = ">=", 106 [T_LAND] = "&&", 107 [T_LANDEQ] = "&&=", 108 [T_LBRACE] = "{", 109 [T_LBRACKET] = "[", 110 [T_LEQUAL] = "==", 111 [T_LESS] = "<", 112 [T_LESSEQ] = "<=", 113 [T_LNOT] = "!", 114 [T_LOR] = "||", 115 [T_LOREQ] = "||=", 116 [T_LPAREN] = "(", 117 [T_LSHIFT] = "<<", 118 [T_LSHIFTEQ] = "<<=", 119 [T_LXOR] = "^^", 120 [T_LXOREQ] = "^^=", 121 [T_MINUS] = "-", 122 [T_MINUSEQ] = "-=", 123 [T_MODEQ] = "%=", 124 [T_MODULO] = "%", 125 [T_NEQUAL] = "!=", 126 [T_BOREQ] = "|=", 127 [T_PLUS] = "+", 128 [T_PLUSEQ] = "+=", 129 [T_QUESTION] = "?", 130 [T_RBRACE] = "}", 131 [T_RBRACKET] = "]", 132 [T_RPAREN] = ")", 133 [T_RSHIFT] = ">>", 134 [T_RSHIFTEQ] = ">>=", 135 [T_SEMICOLON] = ";", 136 [T_SLICE] = "..", 137 [T_TIMES] = "*", 138 [T_TIMESEQ] = "*=", 139 [T_BXOR] = "^", 140 [T_BXOREQ] = "^=", 141 }; 142 143 static noreturn void 144 error(struct location *loc, char *fmt, ...) 145 { 146 fprintf(stderr, "Syntax error at %s:%d:%d: ", sources[loc->file], 147 loc->lineno, loc->colno); 148 149 va_list ap; 150 va_start(ap, fmt); 151 vfprintf(stderr, fmt, ap); 152 va_end(ap); 153 154 fputc('\n', stderr); 155 errline(sources[loc->file], loc->lineno, loc->colno); 156 exit(EXIT_FAILURE); 157 } 158 159 void 160 lex_init(struct lexer *lexer, FILE *f, int fileid) 161 { 162 memset(lexer, 0, sizeof(*lexer)); 163 lexer->in = f; 164 lexer->bufsz = 256; 165 lexer->buf = xcalloc(1, lexer->bufsz); 166 lexer->un.token = T_NONE; 167 lexer->loc.lineno = 1; 168 lexer->loc.colno = 0; 169 lexer->loc.file = fileid; 170 lexer->c[0] = UINT32_MAX; 171 lexer->c[1] = UINT32_MAX; 172 } 173 174 void 175 lex_finish(struct lexer *lexer) 176 { 177 fclose(lexer->in); 178 free(lexer->buf); 179 } 180 181 static void 182 update_lineno(struct location *loc, uint32_t c) 183 { 184 if (c == '\n') { 185 loc->lineno++; 186 loc->colno = 0; 187 } else if (c == '\t') { 188 loc->colno += 8; 189 } else { 190 loc->colno++; 191 } 192 } 193 194 static uint32_t 195 next(struct lexer *lexer, struct location *loc, bool buffer) 196 { 197 uint32_t c; 198 if (lexer->c[0] != UINT32_MAX) { 199 c = lexer->c[0]; 200 lexer->c[0] = lexer->c[1]; 201 lexer->c[1] = UINT32_MAX; 202 } else { 203 c = utf8_get(lexer->in); 204 update_lineno(&lexer->loc, c); 205 if (c == UTF8_INVALID && !feof(lexer->in)) { 206 error(&lexer->loc, "Invalid UTF-8 sequence encountered"); 207 } 208 } 209 if (loc != NULL) { 210 *loc = lexer->loc; 211 for (size_t i = 0; i < 2 && lexer->c[i] != UINT32_MAX; i++) { 212 update_lineno(&lexer->loc, lexer->c[i]); 213 } 214 } 215 if (c == C_EOF || !buffer) { 216 return c; 217 } 218 if (lexer->buflen + utf8_cpsize(c) >= lexer->bufsz) { 219 lexer->bufsz *= 2; 220 lexer->buf = xrealloc(lexer->buf, lexer->bufsz); 221 } 222 char buf[UTF8_MAX_SIZE]; 223 size_t sz = utf8_encode(&buf[0], c); 224 memcpy(lexer->buf + lexer->buflen, buf, sz); 225 lexer->buflen += sz; 226 lexer->buf[lexer->buflen] = '\0'; 227 return c; 228 } 229 230 static bool 231 isharespace(uint32_t c) 232 { 233 return c == '\t' || c == '\n' || c == ' '; 234 } 235 236 static uint32_t 237 wgetc(struct lexer *lexer, struct location *loc) 238 { 239 uint32_t c; 240 while ((c = next(lexer, loc, false)) != C_EOF && isharespace(c)) ; 241 return c; 242 } 243 244 static void 245 consume(struct lexer *lexer, ssize_t n) 246 { 247 if (n == -1) { 248 lexer->buflen = 0; 249 lexer->buf[0] = 0; 250 return; 251 } 252 for (ssize_t i = 0; i < n; i++) { 253 while ((lexer->buf[--lexer->buflen] & 0xC0) == 0x80) ; 254 } 255 lexer->buf[lexer->buflen] = 0; 256 } 257 258 static void 259 push(struct lexer *lexer, uint32_t c, bool buffer) 260 { 261 assert(lexer->c[1] == UINT32_MAX); 262 lexer->c[1] = lexer->c[0]; 263 lexer->c[0] = c; 264 if (buffer) { 265 consume(lexer, 1); 266 } 267 } 268 269 static int 270 cmp_keyword(const void *va, const void *vb) 271 { 272 return strcmp(*(const char **)va, *(const char **)vb); 273 } 274 275 static uint32_t 276 lex_name(struct lexer *lexer, struct token *out) 277 { 278 uint32_t c = next(lexer, &out->loc, true); 279 assert(c != C_EOF && c <= 0x7F && (isalpha(c) || c == '_' || c == '@')); 280 while ((c = next(lexer, NULL, true)) != C_EOF) { 281 if (c > 0x7F || (!isalnum(c) && c != '_')) { 282 push(lexer, c, true); 283 break; 284 } 285 } 286 287 void *token = bsearch(&lexer->buf, tokens, T_LAST_KEYWORD + 1, 288 sizeof(tokens[0]), cmp_keyword); 289 if (!token) { 290 if (lexer->buf[0] == '@') { 291 error(&out->loc, "Unknown attribute %s", lexer->buf); 292 } 293 out->token = T_NAME; 294 out->name = xstrdup(lexer->buf); 295 } else { 296 out->token = (const char **)token - tokens; 297 } 298 consume(lexer, -1); 299 return out->token; 300 } 301 302 static uintmax_t 303 compute_exp(uintmax_t n, int exponent, bool _signed) 304 { 305 if (n == 0) { 306 return 0; 307 } 308 for (int i = 0; i < exponent; i++) { 309 uintmax_t old = n; 310 n *= 10; 311 if (n / 10 != old) { 312 errno = ERANGE; 313 return INT64_MAX; 314 } 315 } 316 if (_signed && n > (uintmax_t)INT64_MIN) { 317 errno = ERANGE; 318 return INT64_MAX; 319 } 320 return n; 321 } 322 323 static void 324 lex_literal(struct lexer *lexer, struct token *out) 325 { 326 enum bases { 327 BIN = 1, OCT, HEX, DEC = 0x07, MASK = DEC 328 }; 329 static_assert((BIN | OCT | HEX | DEC) == DEC, "DEC bits must be a superset of all other bases"); 330 enum flags { 331 FLT = 3, EXP, SUFF, DIG, 332 }; 333 334 static const char chrs[][24] = { 335 [BIN] = "01", 336 [OCT] = "01234567", 337 [DEC] = "0123456789", 338 [HEX] = "0123456789abcdefABCDEF", 339 }; 340 341 static const char matching_states[0x80][6] = { 342 ['.'] = {DEC, /*HEX,*/ 0}, 343 ['e'] = {DEC, DEC | 1<<FLT, 0}, 344 ['+'] = {DEC | 1<<EXP | 1<<DIG, DEC | 1<<FLT | 1<<EXP | 1<<DIG, 0}, 345 ['-'] = {DEC | 1<<EXP | 1<<DIG, DEC | 1<<FLT | 1<<EXP | 1<<DIG, 0}, 346 ['i'] = {BIN, OCT, HEX, DEC, DEC | 1<<EXP, 0}, 347 ['u'] = {BIN, OCT, HEX, DEC, DEC | 1<<EXP, 0}, 348 ['z'] = {BIN, OCT, HEX, DEC, DEC | 1<<EXP, 0}, 349 ['f'] = {DEC, DEC | 1<<FLT, DEC | 1<<EXP, DEC | 1<<FLT | 1<<EXP, 0}, 350 }; 351 int state = DEC, base = 10, oldstate = DEC; 352 uint32_t c = next(lexer, &out->loc, true), last = 0; 353 assert(c != C_EOF && c <= 0x7F && isdigit(c)); 354 if (c == '0') { 355 c = next(lexer, NULL, true); 356 if (c <= 0x7F && isdigit(c)) { 357 error(&out->loc, "Leading zero in base 10 literal"); 358 } else if (c == 'b') { 359 state = BIN | 1 << DIG; 360 base = 2; 361 } else if (c == 'o') { 362 state = OCT | 1 << DIG; 363 base = 8; 364 } else if (c == 'x') { 365 state = HEX | 1 << DIG; 366 base = 16; 367 } 368 } 369 if (state != DEC) { 370 last = c; 371 c = next(lexer, NULL, true); 372 } 373 size_t exp = 0, suff = 0; 374 do { 375 if (strchr(chrs[state & MASK], c)) { 376 state &= ~(1 << DIG); 377 last = c; 378 continue; 379 } else if (c > 0x7f || !strchr(matching_states[c], state)) { 380 goto end; 381 } 382 oldstate = state; 383 switch (c) { 384 case '.': 385 if (lexer->require_int) { 386 goto want_int; 387 } 388 state |= 1 << FLT; 389 break; 390 case '-': 391 state |= 1 << FLT; 392 /* fallthrough */ 393 case 'e': 394 case '+': 395 state |= 1 << EXP; 396 exp = lexer->buflen - 1; 397 break; 398 case 'f': 399 state |= 1 << FLT; 400 /* fallthrough */ 401 case 'i': 402 case 'u': 403 case 'z': 404 state |= DEC | 1 << SUFF; 405 suff = lexer->buflen - 1; 406 break; 407 default: 408 goto end; 409 } 410 if (state & 1 << FLT && lexer->require_int) { 411 error(&out->loc, "Expected integer literal"); 412 } 413 last = c; 414 state |= 1 << DIG; 415 } while ((c = next(lexer, NULL, true)) != C_EOF); 416 last = 0; 417 end: 418 if (last && !strchr("iuz", last) && !strchr(chrs[state & MASK], last)) { 419 state = oldstate; 420 push(lexer, c, true); 421 push(lexer, last, true); 422 } else if (c != C_EOF) { 423 want_int: 424 push(lexer, c, true); 425 } 426 out->token = T_LITERAL; 427 lexer->require_int = false; 428 429 enum kind { 430 UNKNOWN = -1, 431 ICONST, SIGNED, UNSIGNED, FLOAT 432 } kind = UNKNOWN; 433 static const struct { 434 const char suff[4]; 435 enum kind kind; 436 enum type_storage storage; 437 } storages[] = { 438 {"f32", FLOAT, STORAGE_F32}, 439 {"f64", FLOAT, STORAGE_F64}, 440 {"i", SIGNED, STORAGE_INT}, 441 {"i16", SIGNED, STORAGE_I16}, 442 {"i32", SIGNED, STORAGE_I32}, 443 {"i64", SIGNED, STORAGE_I64}, 444 {"i8", SIGNED, STORAGE_I8}, 445 {"u", UNSIGNED, STORAGE_UINT}, 446 {"u16", UNSIGNED, STORAGE_U16}, 447 {"u32", UNSIGNED, STORAGE_U32}, 448 {"u64", UNSIGNED, STORAGE_U64}, 449 {"u8", UNSIGNED, STORAGE_U8}, 450 {"z", UNSIGNED, STORAGE_SIZE}, 451 }; 452 if (suff) { 453 for (size_t i = 0; i < sizeof storages / sizeof storages[0]; i++) { 454 if (!strcmp(storages[i].suff, lexer->buf + suff)) { 455 out->storage = storages[i].storage; 456 kind = storages[i].kind; 457 break; 458 } 459 } 460 if (kind == UNKNOWN) { 461 error(&out->loc, "Invalid suffix '%s'", lexer->buf + suff); 462 } 463 } 464 if (state & 1 << FLT) { 465 if (kind == UNKNOWN) { 466 out->storage = STORAGE_FCONST; 467 } else if (kind != FLOAT) { 468 error(&out->loc, "Unexpected decimal point in integer literal"); 469 } 470 out->fval = strtod(lexer->buf, NULL); 471 consume(lexer, -1); 472 return; 473 } 474 475 if (kind == UNKNOWN) { 476 kind = ICONST; 477 out->storage = STORAGE_ICONST; 478 } 479 uintmax_t exponent = 0; 480 errno = 0; 481 if (exp != 0) { 482 exponent = strtoumax(lexer->buf + exp + 1, NULL, 10); 483 } 484 out->uval = strtoumax(lexer->buf + (base == 10 ? 0 : 2), NULL, base); 485 out->uval = compute_exp(out->uval, exponent, kind == SIGNED); 486 if (errno == ERANGE) { 487 error(&out->loc, "Integer literal overflow"); 488 } 489 if (kind == ICONST && out->uval > (uintmax_t)INT64_MAX) { 490 out->storage = STORAGE_U64; 491 } else if (kind == SIGNED && out->uval == (uintmax_t)INT64_MIN) { 492 // XXX: Hack 493 out->ival = INT64_MIN; 494 } else if (kind != UNSIGNED) { 495 out->ival = (intmax_t)out->uval; 496 } 497 consume(lexer, -1); 498 } 499 500 static uint32_t 501 lex_rune(struct lexer *lexer) 502 { 503 char buf[9]; 504 char *endptr; 505 uint32_t c = next(lexer, NULL, false); 506 assert(c != C_EOF); 507 508 switch (c) { 509 case '\\': 510 c = next(lexer, NULL, false); 511 switch (c) { 512 case '0': 513 return '\0'; 514 case 'a': 515 return '\a'; 516 case 'b': 517 return '\b'; 518 case 'f': 519 return '\f'; 520 case 'n': 521 return '\n'; 522 case 'r': 523 return '\r'; 524 case 't': 525 return '\t'; 526 case 'v': 527 return '\v'; 528 case '\\': 529 return '\\'; 530 case '\'': 531 return '\''; 532 case '"': 533 return '\"'; 534 case 'x': 535 buf[0] = next(lexer, NULL, false); 536 buf[1] = next(lexer, NULL, false); 537 buf[2] = '\0'; 538 c = strtoul(&buf[0], &endptr, 16); 539 if (*endptr != '\0') { 540 error(&lexer->loc, "Invalid hex literal"); 541 } 542 return c; 543 case 'u': 544 buf[0] = next(lexer, NULL, false); 545 buf[1] = next(lexer, NULL, false); 546 buf[2] = next(lexer, NULL, false); 547 buf[3] = next(lexer, NULL, false); 548 buf[4] = '\0'; 549 c = strtoul(&buf[0], &endptr, 16); 550 if (*endptr != '\0') { 551 error(&lexer->loc, "Invalid hex literal"); 552 } 553 return c; 554 case 'U': 555 buf[0] = next(lexer, NULL, false); 556 buf[1] = next(lexer, NULL, false); 557 buf[2] = next(lexer, NULL, false); 558 buf[3] = next(lexer, NULL, false); 559 buf[4] = next(lexer, NULL, false); 560 buf[5] = next(lexer, NULL, false); 561 buf[6] = next(lexer, NULL, false); 562 buf[7] = next(lexer, NULL, false); 563 buf[8] = '\0'; 564 c = strtoul(&buf[0], &endptr, 16); 565 if (*endptr != '\0') { 566 error(&lexer->loc, "Invalid hex literal"); 567 } 568 return c; 569 case C_EOF: 570 error(&lexer->loc, "Unexpected end of file"); 571 default: 572 error(&lexer->loc, "Invalid escape '\\%c'", c); 573 } 574 assert(0); 575 default: 576 return c; 577 } 578 assert(0); 579 } 580 581 static enum lexical_token 582 lex_string(struct lexer *lexer, struct token *out) 583 { 584 uint32_t c = next(lexer, &out->loc, false); 585 uint32_t delim; 586 587 switch (c) { 588 case '"': 589 case '`': 590 delim = c; 591 while ((c = next(lexer, NULL, false)) != delim) { 592 if (c == C_EOF) { 593 error(&lexer->loc, "Unexpected end of file"); 594 } 595 push(lexer, c, false); 596 if (delim == '"') { 597 push(lexer, lex_rune(lexer), false); 598 } 599 next(lexer, NULL, true); 600 } 601 char *buf = xcalloc(lexer->buflen + 1, 1); 602 memcpy(buf, lexer->buf, lexer->buflen); 603 out->token = T_LITERAL; 604 out->storage = STORAGE_STRING; 605 out->string.len = lexer->buflen; 606 out->string.value = buf; 607 consume(lexer, -1); 608 return out->token; 609 case '\'': 610 c = next(lexer, NULL, false); 611 switch (c) { 612 case '\'': 613 error(&out->loc, "Expected rune before trailing single quote"); 614 case '\\': 615 push(lexer, c, false); 616 out->rune = lex_rune(lexer); 617 break; 618 default: 619 out->rune = c; 620 } 621 if (next(lexer, NULL, false) != '\'') { 622 error(&out->loc, "Expected trailing single quote"); 623 } 624 out->token = T_LITERAL; 625 out->storage = STORAGE_RCONST; 626 return out->token; 627 default: 628 assert(0); // Invariant 629 } 630 assert(0); 631 } 632 633 static enum lexical_token 634 lex3(struct lexer *lexer, struct token *out, uint32_t c) 635 { 636 assert(c != C_EOF); 637 638 switch (c) { 639 case '.': 640 switch ((c = next(lexer, NULL, false))) { 641 case '.': 642 switch ((c = next(lexer, NULL, false))) { 643 case '.': 644 out->token = T_ELLIPSIS; 645 break; 646 default: 647 push(lexer, c, false); 648 out->token = T_SLICE; 649 break; 650 } 651 break; 652 default: 653 push(lexer, c, false); 654 out->token = T_DOT; 655 lexer->require_int = true; 656 break; 657 } 658 break; 659 case '<': 660 switch ((c = next(lexer, NULL, false))) { 661 case '<': 662 switch ((c = next(lexer, NULL, false))) { 663 case '=': 664 out->token = T_LSHIFTEQ; 665 break; 666 default: 667 push(lexer, c, false); 668 out->token = T_LSHIFT; 669 break; 670 } 671 break; 672 case '=': 673 out->token = T_LESSEQ; 674 break; 675 default: 676 push(lexer, c, false); 677 out->token = T_LESS; 678 break; 679 } 680 break; 681 case '>': 682 switch ((c = next(lexer, NULL, false))) { 683 case '>': 684 switch ((c = next(lexer, NULL, false))) { 685 case '=': 686 out->token = T_RSHIFTEQ; 687 break; 688 default: 689 push(lexer, c, false); 690 out->token = T_RSHIFT; 691 break; 692 } 693 break; 694 case '=': 695 out->token = T_GREATEREQ; 696 break; 697 default: 698 push(lexer, c, false); 699 out->token = T_GREATER; 700 break; 701 } 702 break; 703 case '&': 704 switch ((c = next(lexer, NULL, false))) { 705 case '&': 706 switch ((c = next(lexer, NULL, false))) { 707 case '=': 708 out->token = T_LANDEQ; 709 break; 710 default: 711 push(lexer, c, false); 712 out->token = T_LAND; 713 break; 714 } 715 break; 716 case '=': 717 out->token = T_BANDEQ; 718 break; 719 default: 720 push(lexer, c, false); 721 out->token = T_BAND; 722 break; 723 } 724 break; 725 case '|': 726 switch ((c = next(lexer, NULL, false))) { 727 case '|': 728 switch ((c = next(lexer, NULL, false))) { 729 case '=': 730 out->token = T_LOREQ; 731 break; 732 default: 733 push(lexer, c, false); 734 out->token = T_LOR; 735 break; 736 } 737 break; 738 case '=': 739 out->token = T_BOREQ; 740 break; 741 default: 742 push(lexer, c, false); 743 out->token = T_BOR; 744 break; 745 } 746 break; 747 case '^': 748 switch ((c = next(lexer, NULL, false))) { 749 case '^': 750 switch ((c = next(lexer, NULL, false))) { 751 case '=': 752 out->token = T_LXOREQ; 753 break; 754 default: 755 push(lexer, c, false); 756 out->token = T_LXOR; 757 break; 758 } 759 break; 760 case '=': 761 out->token = T_BXOREQ; 762 break; 763 default: 764 push(lexer, c, false); 765 out->token = T_BXOR; 766 break; 767 } 768 break; 769 default: 770 assert(0); // Invariant 771 } 772 773 return out->token; 774 } 775 776 static enum lexical_token _lex(struct lexer *lexer, struct token *out); 777 778 static enum lexical_token 779 lex2(struct lexer *lexer, struct token *out, uint32_t c) 780 { 781 assert(c != C_EOF); 782 783 switch (c) { 784 case '*': 785 switch ((c = next(lexer, NULL, false))) { 786 case '=': 787 out->token = T_TIMESEQ; 788 break; 789 default: 790 push(lexer, c, false); 791 out->token = T_TIMES; 792 break; 793 } 794 break; 795 case '%': 796 switch ((c = next(lexer, NULL, false))) { 797 case '=': 798 out->token = T_MODEQ; 799 break; 800 default: 801 push(lexer, c, false); 802 out->token = T_MODULO; 803 break; 804 } 805 break; 806 case '/': 807 switch ((c = next(lexer, NULL, false))) { 808 case '=': 809 out->token = T_DIVEQ; 810 break; 811 case '/': 812 while ((c = next(lexer, NULL, false)) != C_EOF && c != '\n') ; 813 return _lex(lexer, out); 814 default: 815 push(lexer, c, false); 816 out->token = T_DIV; 817 break; 818 } 819 break; 820 case '+': 821 switch ((c = next(lexer, NULL, false))) { 822 case '=': 823 out->token = T_PLUSEQ; 824 break; 825 default: 826 push(lexer, c, false); 827 out->token = T_PLUS; 828 break; 829 } 830 break; 831 case '-': 832 switch ((c = next(lexer, NULL, false))) { 833 case '=': 834 out->token = T_MINUSEQ; 835 break; 836 default: 837 push(lexer, c, false); 838 out->token = T_MINUS; 839 break; 840 } 841 break; 842 case ':': 843 switch ((c = next(lexer, NULL, false))) { 844 case ':': 845 out->token = T_DOUBLE_COLON; 846 break; 847 default: 848 push(lexer, c, false); 849 out->token = T_COLON; 850 break; 851 } 852 break; 853 case '!': 854 switch ((c = next(lexer, NULL, false))) { 855 case '=': 856 out->token = T_NEQUAL; 857 break; 858 default: 859 push(lexer, c, false); 860 out->token = T_LNOT; 861 break; 862 } 863 break; 864 case '=': 865 switch ((c = next(lexer, NULL, false))) { 866 case '=': 867 out->token = T_LEQUAL; 868 break; 869 case '>': 870 out->token = T_ARROW; 871 break; 872 default: 873 push(lexer, c, false); 874 out->token = T_EQUAL; 875 break; 876 } 877 break; 878 default: 879 assert(0); // Invariant 880 } 881 882 return out->token; 883 } 884 885 static enum lexical_token 886 _lex(struct lexer *lexer, struct token *out) 887 { 888 if (lexer->un.token != T_NONE) { 889 *out = lexer->un; 890 lexer->un.token = T_NONE; 891 return out->token; 892 } 893 894 uint32_t c = wgetc(lexer, &out->loc); 895 if (c == C_EOF) { 896 out->token = T_EOF; 897 return out->token; 898 } 899 900 if (c <= 0x7F && isdigit(c)) { 901 push(lexer, c, false); 902 lex_literal(lexer, out); 903 return T_LITERAL; 904 } 905 906 lexer->require_int = false; 907 908 if (c <= 0x7F && (isalpha(c) || c == '_' || c == '@')) { 909 push(lexer, c, false); 910 return lex_name(lexer, out); 911 } 912 913 char p[5]; 914 switch (c) { 915 case '"': 916 case '`': 917 case '\'': 918 push(lexer, c, false); 919 return lex_string(lexer, out); 920 case '.': // . .. ... 921 case '<': // < << <= <<= 922 case '>': // > >> >= >>= 923 case '&': // & && &= &&= 924 case '|': // | || |= ||= 925 case '^': // ^ ^^ ^= ^^= 926 return lex3(lexer, out, c); 927 case '*': // * *= 928 case '%': // % %= 929 case '/': // / /= // 930 case '+': // + += 931 case '-': // - -= 932 case ':': // : :: 933 case '!': // ! != 934 case '=': // = == => 935 return lex2(lexer, out, c); 936 case '~': 937 out->token = T_BNOT; 938 break; 939 case ',': 940 out->token = T_COMMA; 941 break; 942 case '{': 943 out->token = T_LBRACE; 944 break; 945 case '[': 946 out->token = T_LBRACKET; 947 break; 948 case '(': 949 out->token = T_LPAREN; 950 break; 951 case '}': 952 out->token = T_RBRACE; 953 break; 954 case ']': 955 out->token = T_RBRACKET; 956 break; 957 case ')': 958 out->token = T_RPAREN; 959 break; 960 case ';': 961 out->token = T_SEMICOLON; 962 break; 963 case '?': 964 out->token = T_QUESTION; 965 break; 966 default: 967 p[utf8_encode(p, c)] = '\0'; 968 fprintf(stderr, "Error: unexpected code point '%s' at %s:%d:%d\n", 969 p, sources[lexer->loc.file], lexer->loc.lineno, 970 lexer->loc.colno); 971 exit(EXIT_FAILURE); 972 } 973 974 return out->token; 975 } 976 977 enum lexical_token 978 lex(struct lexer *lexer, struct token *out) 979 { 980 return _lex(lexer, out); 981 } 982 983 void 984 token_finish(struct token *tok) 985 { 986 switch (tok->token) { 987 case T_NAME: 988 free(tok->name); 989 break; 990 case T_LITERAL: 991 switch (tok->storage) { 992 case STORAGE_STRING: 993 free(tok->string.value); 994 break; 995 default: 996 break; 997 } 998 break; 999 default: 1000 break; 1001 } 1002 tok->token = 0; 1003 tok->storage = 0; 1004 tok->loc.file = 0; 1005 tok->loc.colno = 0; 1006 tok->loc.lineno = 0; 1007 } 1008 1009 const char * 1010 lexical_token_str(enum lexical_token tok) 1011 { 1012 switch (tok) { 1013 case T_NAME: 1014 return "name"; 1015 case T_LITERAL: 1016 return "literal"; 1017 case T_EOF: 1018 return "end of file"; 1019 case T_NONE: 1020 abort(); 1021 default: 1022 assert(tok < sizeof(tokens) / sizeof(tokens[0])); 1023 return tokens[tok]; 1024 } 1025 } 1026 1027 static const char * 1028 rune_unparse(uint32_t c) 1029 { 1030 static char buf[11]; 1031 switch (c) { 1032 case '\0': 1033 snprintf(buf, sizeof(buf), "\\0"); 1034 break; 1035 case '\a': 1036 snprintf(buf, sizeof(buf), "\\a"); 1037 break; 1038 case '\b': 1039 snprintf(buf, sizeof(buf), "\\b"); 1040 break; 1041 case '\f': 1042 snprintf(buf, sizeof(buf), "\\f"); 1043 break; 1044 case '\n': 1045 snprintf(buf, sizeof(buf), "\\n"); 1046 break; 1047 case '\r': 1048 snprintf(buf, sizeof(buf), "\\r"); 1049 break; 1050 case '\t': 1051 snprintf(buf, sizeof(buf), "\\t"); 1052 break; 1053 case '\v': 1054 snprintf(buf, sizeof(buf), "\\v"); 1055 break; 1056 case '\\': 1057 snprintf(buf, sizeof(buf), "\\\\"); 1058 break; 1059 case '\'': 1060 snprintf(buf, sizeof(buf), "\\'"); 1061 break; 1062 case '"': 1063 snprintf(buf, sizeof(buf), "\\\""); 1064 break; 1065 default: 1066 if (c > 0xffff) { 1067 snprintf(buf, sizeof(buf), "\\U%08x", c); 1068 } else if (c > 0x7F) { 1069 snprintf(buf, sizeof(buf), "\\u%04x", c); 1070 } else if (!isprint(c)) { 1071 snprintf(buf, sizeof(buf), "\\x%02x", c); 1072 } else { 1073 assert(utf8_cpsize(c) < sizeof(buf)); 1074 buf[utf8_encode(buf, c)] = '\0'; 1075 } 1076 break; 1077 } 1078 return buf; 1079 } 1080 1081 static const char * 1082 string_unparse(const struct token *tok) 1083 { 1084 static char buf[1024]; 1085 assert(tok->token == T_LITERAL && tok->storage == STORAGE_STRING); 1086 int bytes = 0; 1087 memset(buf, 0, sizeof(buf)); 1088 bytes += snprintf(&buf[bytes], sizeof(buf) - bytes, "\""); 1089 const char *s = tok->string.value; 1090 for (uint32_t c = utf8_decode(&s); 1091 s - tok->string.value <= (ptrdiff_t)tok->string.len; 1092 c = utf8_decode(&s)) { 1093 bytes += snprintf(&buf[bytes], sizeof(buf) - bytes, "%s", 1094 rune_unparse(c)); 1095 } 1096 bytes += snprintf(&buf[bytes], sizeof(buf) - bytes, "\""); 1097 return buf; 1098 } 1099 1100 const char * 1101 token_str(const struct token *tok) 1102 { 1103 static char buf[1024]; 1104 int bytes = 0; 1105 switch (tok->token) { 1106 case T_NAME: 1107 snprintf(buf, sizeof(buf), "name %s", tok->name); 1108 return buf; 1109 case T_LITERAL: 1110 switch (tok->storage) { 1111 case STORAGE_U8: 1112 case STORAGE_U16: 1113 case STORAGE_U32: 1114 case STORAGE_U64: 1115 case STORAGE_UINT: 1116 case STORAGE_UINTPTR: 1117 case STORAGE_SIZE: 1118 snprintf(buf, sizeof(buf), "%ju", tok->uval); 1119 break; 1120 case STORAGE_I8: 1121 case STORAGE_I16: 1122 case STORAGE_I32: 1123 case STORAGE_I64: 1124 case STORAGE_ICONST: 1125 case STORAGE_INT: 1126 snprintf(buf, sizeof(buf), "%jd", tok->ival); 1127 break; 1128 case STORAGE_F32: 1129 case STORAGE_F64: 1130 case STORAGE_FCONST: 1131 snprintf(buf, sizeof(buf), "%lf", tok->fval); 1132 break; 1133 case STORAGE_RCONST: 1134 bytes += snprintf(&buf[bytes], sizeof(buf) - bytes, "'"); 1135 bytes += snprintf(&buf[bytes], sizeof(buf) - bytes, "%s", 1136 rune_unparse(tok->rune)); 1137 bytes += snprintf(&buf[bytes], sizeof(buf) - bytes, "'"); 1138 break; 1139 case STORAGE_STRING: 1140 return string_unparse(tok); 1141 case STORAGE_ALIAS: 1142 case STORAGE_ARRAY: 1143 case STORAGE_BOOL: 1144 case STORAGE_CHAR: 1145 case STORAGE_ENUM: 1146 case STORAGE_ERROR: 1147 case STORAGE_FUNCTION: 1148 case STORAGE_POINTER: 1149 case STORAGE_NULL: 1150 case STORAGE_RUNE: 1151 case STORAGE_SLICE: 1152 case STORAGE_STRUCT: 1153 case STORAGE_TAGGED: 1154 case STORAGE_TUPLE: 1155 case STORAGE_UNION: 1156 case STORAGE_VALIST: 1157 case STORAGE_VOID: 1158 assert(0); 1159 } 1160 return buf; 1161 default:; 1162 const char *out = lexical_token_str(tok->token); 1163 return out; 1164 } 1165 } 1166 1167 void 1168 unlex(struct lexer *lexer, struct token *in) 1169 { 1170 assert(lexer->un.token == T_NONE); 1171 lexer->un = *in; 1172 }