lex.c (24685B)
1 #include <assert.h> 2 #include <ctype.h> 3 #include <errno.h> 4 #include <inttypes.h> 5 #include <stdarg.h> 6 #include <stdbool.h> 7 #include <stddef.h> 8 #include <stdint.h> 9 #include <stdio.h> 10 #include <stdlib.h> 11 #include <stdnoreturn.h> 12 #include <string.h> 13 #include "lex.h" 14 #include "utf8.h" 15 #include "util.h" 16 17 static const char *tokens[] = { 18 // Must be alpha sorted and match lex.h 19 [T_ATTR_FINI] = "@fini", 20 [T_ATTR_INIT] = "@init", 21 [T_ATTR_NORETURN] = "@noreturn", 22 [T_ATTR_OFFSET] = "@offset", 23 [T_ATTR_PACKED] = "@packed", 24 [T_ATTR_SYMBOL] = "@symbol", 25 [T_ATTR_TEST] = "@test", 26 [T_ATTR_THREADLOCAL] = "@threadlocal", 27 [T_UNDERSCORE] = "_", 28 [T_ABORT] = "abort", 29 [T_ALLOC] = "alloc", 30 [T_APPEND] = "append", 31 [T_AS] = "as", 32 [T_ASSERT] = "assert", 33 [T_BOOL] = "bool", 34 [T_BREAK] = "break", 35 [T_CASE] = "case", 36 [T_CHAR] = "char", 37 [T_CONST] = "const", 38 [T_CONTINUE] = "continue", 39 [T_DEFER] = "defer", 40 [T_DEF] = "def", 41 [T_DELETE] = "delete", 42 [T_ELSE] = "else", 43 [T_ENUM] = "enum", 44 [T_EXPORT] = "export", 45 [T_F32] = "f32", 46 [T_F64] = "f64", 47 [T_FALSE] = "false", 48 [T_FN] = "fn", 49 [T_FOR] = "for", 50 [T_FREE] = "free", 51 [T_I16] = "i16", 52 [T_I32] = "i32", 53 [T_I64] = "i64", 54 [T_I8] = "i8", 55 [T_IF] = "if", 56 [T_INSERT] = "insert", 57 [T_INT] = "int", 58 [T_IS] = "is", 59 [T_LEN] = "len", 60 [T_LET] = "let", 61 [T_MATCH] = "match", 62 [T_NULL] = "null", 63 [T_NULLABLE] = "nullable", 64 [T_OFFSET] = "offset", 65 [T_RETURN] = "return", 66 [T_RUNE] = "rune", 67 [T_SIZE] = "size", 68 [T_STATIC] = "static", 69 [T_STR] = "str", 70 [T_STRUCT] = "struct", 71 [T_SWITCH] = "switch", 72 [T_TRUE] = "true", 73 [T_TYPE] = "type", 74 [T_U16] = "u16", 75 [T_U32] = "u32", 76 [T_U64] = "u64", 77 [T_U8] = "u8", 78 [T_UINT] = "uint", 79 [T_UINTPTR] = "uintptr", 80 [T_UNION] = "union", 81 [T_USE] = "use", 82 [T_VAARG] = "vaarg", 83 [T_VAEND] = "vaend", 84 [T_VALIST] = "valist", 85 [T_VASTART] = "vastart", 86 [T_VOID] = "void", 87 [T_YIELD] = "yield", 88 89 // Operators 90 [T_ARROW] = "=>", 91 [T_BANDEQ] = "&=", 92 [T_BAND] = "&", 93 [T_BNOT] = "~", 94 [T_BOR] = "|", 95 [T_COLON] = ":", 96 [T_COMMA] = ",", 97 [T_DIV] = "/", 98 [T_DIVEQ] = "/=", 99 [T_DOT] = ".", 100 [T_DOUBLE_COLON] = "::", 101 [T_ELLIPSIS] = "...", 102 [T_EQUAL] = "=", 103 [T_GREATER] = ">", 104 [T_GREATEREQ] = ">=", 105 [T_LAND] = "&&", 106 [T_LANDEQ] = "&&=", 107 [T_LBRACE] = "{", 108 [T_LBRACKET] = "[", 109 [T_LEQUAL] = "==", 110 [T_LESS] = "<", 111 [T_LESSEQ] = "<=", 112 [T_LNOT] = "!", 113 [T_LOR] = "||", 114 [T_LOREQ] = "||=", 115 [T_LPAREN] = "(", 116 [T_LSHIFT] = "<<", 117 [T_LSHIFTEQ] = "<<=", 118 [T_LXOR] = "^^", 119 [T_LXOREQ] = "^^=", 120 [T_MINUS] = "-", 121 [T_MINUSEQ] = "-=", 122 [T_MODEQ] = "%=", 123 [T_MODULO] = "%", 124 [T_NEQUAL] = "!=", 125 [T_BOREQ] = "|=", 126 [T_PLUS] = "+", 127 [T_PLUSEQ] = "+=", 128 [T_QUESTION] = "?", 129 [T_RBRACE] = "}", 130 [T_RBRACKET] = "]", 131 [T_RPAREN] = ")", 132 [T_RSHIFT] = ">>", 133 [T_RSHIFTEQ] = ">>=", 134 [T_SEMICOLON] = ";", 135 [T_SLICE] = "..", 136 [T_TIMES] = "*", 137 [T_TIMESEQ] = "*=", 138 [T_BXOR] = "^", 139 [T_BXOREQ] = "^=", 140 }; 141 142 static noreturn void 143 error(struct location *loc, char *fmt, ...) 144 { 145 fprintf(stderr, "Syntax error at %s:%d:%d: ", sources[loc->file], 146 loc->lineno, loc->colno); 147 148 va_list ap; 149 va_start(ap, fmt); 150 vfprintf(stderr, fmt, ap); 151 va_end(ap); 152 153 fputc('\n', stderr); 154 errline(sources[loc->file], loc->lineno, loc->colno); 155 exit(EXIT_FAILURE); 156 } 157 158 void 159 lex_init(struct lexer *lexer, FILE *f, int fileid) 160 { 161 memset(lexer, 0, sizeof(*lexer)); 162 lexer->in = f; 163 lexer->bufsz = 256; 164 lexer->buf = xcalloc(1, lexer->bufsz); 165 lexer->un.token = T_ERROR; 166 lexer->loc.lineno = 1; 167 lexer->loc.colno = 0; 168 lexer->loc.file = fileid; 169 lexer->c[0] = UINT32_MAX; 170 lexer->c[1] = UINT32_MAX; 171 } 172 173 void 174 lex_finish(struct lexer *lexer) 175 { 176 fclose(lexer->in); 177 free(lexer->buf); 178 } 179 180 static void 181 update_lineno(struct location *loc, uint32_t c) 182 { 183 if (c == '\n') { 184 loc->lineno++; 185 loc->colno = 0; 186 } else if (c == '\t') { 187 loc->colno += 8; 188 } else { 189 loc->colno++; 190 } 191 } 192 193 static uint32_t 194 next(struct lexer *lexer, struct location *loc, bool buffer) 195 { 196 uint32_t c; 197 if (lexer->c[0] != UINT32_MAX) { 198 c = lexer->c[0]; 199 lexer->c[0] = lexer->c[1]; 200 lexer->c[1] = UINT32_MAX; 201 } else { 202 c = utf8_get(lexer->in); 203 update_lineno(&lexer->loc, c); 204 if (c == UTF8_INVALID && !feof(lexer->in)) { 205 error(&lexer->loc, "Invalid UTF-8 sequence encountered"); 206 } 207 } 208 if (loc != NULL) { 209 *loc = lexer->loc; 210 for (size_t i = 0; i < 2 && lexer->c[i] != UINT32_MAX; i++) { 211 update_lineno(&lexer->loc, lexer->c[i]); 212 } 213 } 214 if (c == C_EOF || !buffer) { 215 return c; 216 } 217 if (lexer->buflen + utf8_cpsize(c) >= lexer->bufsz) { 218 lexer->bufsz *= 2; 219 lexer->buf = xrealloc(lexer->buf, lexer->bufsz); 220 } 221 char buf[UTF8_MAX_SIZE]; 222 size_t sz = utf8_encode(&buf[0], c); 223 memcpy(lexer->buf + lexer->buflen, buf, sz); 224 lexer->buflen += sz; 225 lexer->buf[lexer->buflen] = '\0'; 226 return c; 227 } 228 229 static bool 230 isharespace(uint32_t c) 231 { 232 return c == '\t' || c == '\n' || c == ' '; 233 } 234 235 static uint32_t 236 wgetc(struct lexer *lexer, struct location *loc) 237 { 238 uint32_t c; 239 while ((c = next(lexer, loc, false)) != C_EOF && isharespace(c)) ; 240 return c; 241 } 242 243 static void 244 consume(struct lexer *lexer, ssize_t n) 245 { 246 if (n == -1) { 247 lexer->buflen = 0; 248 lexer->buf[0] = 0; 249 return; 250 } 251 for (ssize_t i = 0; i < n; i++) { 252 while ((lexer->buf[--lexer->buflen] & 0xC0) == 0x80) ; 253 } 254 lexer->buf[lexer->buflen] = 0; 255 } 256 257 static void 258 push(struct lexer *lexer, uint32_t c, bool buffer) 259 { 260 assert(lexer->c[1] == UINT32_MAX); 261 lexer->c[1] = lexer->c[0]; 262 lexer->c[0] = c; 263 if (buffer) { 264 consume(lexer, 1); 265 } 266 } 267 268 static int 269 cmp_keyword(const void *va, const void *vb) 270 { 271 return strcmp(*(const char **)va, *(const char **)vb); 272 } 273 274 static uint32_t 275 lex_name(struct lexer *lexer, struct token *out) 276 { 277 uint32_t c = next(lexer, &out->loc, true); 278 assert(c != C_EOF && c <= 0x7F && (isalpha(c) || c == '_' || c == '@')); 279 while ((c = next(lexer, NULL, true)) != C_EOF) { 280 if (c > 0x7F || (!isalnum(c) && c != '_')) { 281 push(lexer, c, true); 282 break; 283 } 284 } 285 286 void *token = bsearch(&lexer->buf, tokens, T_LAST_KEYWORD + 1, 287 sizeof(tokens[0]), cmp_keyword); 288 if (!token) { 289 if (lexer->buf[0] == '@') { 290 error(&out->loc, "Unknown attribute %s", lexer->buf); 291 } 292 out->token = T_NAME; 293 out->name = xstrdup(lexer->buf); 294 } else { 295 out->token = (const char **)token - tokens; 296 } 297 consume(lexer, -1); 298 return out->token; 299 } 300 301 static uintmax_t 302 compute_exp(uintmax_t n, int exponent, bool _signed, struct location *loc) 303 { 304 if (n == 0) { 305 return 0; 306 } 307 for (int i = 0; i < exponent; i++) { 308 uintmax_t old = n; 309 n *= 10; 310 if (n / 10 != old) { 311 error(loc, "Integer literal overflow"); 312 } 313 } 314 if (_signed && n > (uintmax_t)INT64_MIN) { 315 error(loc, "Integer literal overflow"); 316 } 317 return n; 318 } 319 320 static uint32_t 321 lex_literal(struct lexer *lexer, struct token *out) 322 { 323 uint32_t c = next(lexer, &out->loc, true); 324 assert(c != C_EOF && c <= 0x7F && isdigit(c)); 325 326 bool started = false, leadingzero = false; 327 int base = 10; 328 const char *basechrs = "0123456789"; 329 if (c == '0') { 330 switch ((c = next(lexer, NULL, true))) { 331 case 'b': 332 base = 2; 333 basechrs = "01"; 334 consume(lexer, 2); 335 break; 336 case 'o': 337 base = 8; 338 basechrs = "01234567"; 339 consume(lexer, 2); 340 break; 341 case 'x': 342 base = 16; 343 basechrs = "0123456789ABCDEFabcdef"; 344 consume(lexer, 2); 345 break; 346 default: 347 started = true; 348 leadingzero = true; 349 push(lexer, c, true); 350 break; 351 } 352 } else { 353 started = true; 354 } 355 356 char *suff = NULL; 357 char *exp = NULL; 358 bool isfloat = false; 359 while ((c = next(lexer, NULL, true)) != C_EOF) { 360 if (!strchr(basechrs, c)) { 361 switch (c) { 362 case '.': 363 if (!started) { 364 push(lexer, c, true); 365 goto finalize; 366 } 367 if (lexer->require_int) { 368 push(lexer, '.', true); 369 goto finalize; 370 } 371 if (isfloat || suff || exp) { 372 push(lexer, c, true); 373 goto finalize; 374 } 375 if (!strchr(basechrs, c = next(lexer, NULL, false))) { 376 push(lexer, c, false); 377 push(lexer, '.', true); 378 goto finalize; 379 } else { 380 push(lexer, c, false); 381 } 382 isfloat = true; 383 break; 384 case 'e': 385 if (!started) { 386 push(lexer, c, true); 387 goto finalize; 388 } 389 if (exp || suff) { 390 push(lexer, c, true); 391 goto finalize; 392 } 393 // exponent is always in base 10 394 basechrs = "0123456789"; 395 c = next(lexer, NULL, true); 396 if (c != '-' && c != '+' && !strchr(basechrs, c)) { 397 push(lexer, c, true); 398 push(lexer, 'e', true); 399 goto finalize; 400 }; 401 exp = &lexer->buf[lexer->buflen - 1]; 402 break; 403 case 'f': 404 if (base != 10) { 405 push(lexer, c, true); 406 goto finalize; 407 } 408 // Fallthrough 409 case 'i': 410 case 'u': 411 case 'z': 412 if (suff || !started) { 413 push(lexer, c, true); 414 goto finalize; 415 } 416 suff = &lexer->buf[lexer->buflen - 1]; 417 basechrs = "0123456789"; 418 break; 419 default: 420 push(lexer, c, true); 421 goto finalize; 422 } 423 } 424 started = true; 425 } 426 427 finalize: 428 if (!started) { 429 error(&out->loc, "Invalid literal"); 430 } 431 if (leadingzero && lexer->buflen >= 2 && strchr(basechrs, lexer->buf[1])) { 432 error(&out->loc, "Leading zero in base 10 literal"); 433 } 434 lexer->require_int = false; 435 out->token = T_LITERAL; 436 if (isfloat) { 437 out->storage = STORAGE_FCONST; 438 } else { 439 out->storage = STORAGE_ICONST; 440 } 441 if (suff) { 442 const char *suffs[] = { 443 [STORAGE_U8] = "u8", 444 [STORAGE_U16] = "u16", 445 [STORAGE_U32] = "u32", 446 [STORAGE_U64] = "u64", 447 [STORAGE_I8] = "i8", 448 [STORAGE_I16] = "i16", 449 [STORAGE_I32] = "i32", 450 [STORAGE_I64] = "i64", 451 452 [STORAGE_UINT] = "u", 453 [STORAGE_INT] = "i", 454 [STORAGE_SIZE] = "z", 455 [STORAGE_F32] = "f32", 456 [STORAGE_F64] = "f64", 457 }; 458 bool isvalid = false; 459 for (enum type_storage i = 0; 460 i < sizeof(suffs) / sizeof(suffs[0]); ++i) { 461 if (suffs[i] && strcmp(suff, suffs[i]) == 0) { 462 isvalid = true; 463 out->storage = i; 464 break; 465 } 466 } 467 if (!isvalid) { 468 error(&out->loc, "Invalid numeric suffix"); 469 } 470 } 471 472 intmax_t exponent = 0; 473 if (exp) { 474 char *endptr = NULL; 475 errno = 0; 476 exponent = strtoimax(exp, &endptr, 10); 477 if (errno == ERANGE) { 478 error(&out->loc, "Numerical exponent overflow"); 479 } 480 // integers can't have negative exponents 481 if (exponent < 0 && !suff) { 482 out->storage = STORAGE_FCONST; 483 } 484 enum type_storage s = out->storage; 485 bool valid = exponent >= 0 486 || s == STORAGE_F32 487 || s == STORAGE_F64 488 || s == STORAGE_FCONST; 489 if (endptr == exp || !valid) { 490 error(&out->loc, "Integers cannot have negative exponents"); 491 } 492 } 493 494 if (isfloat) { 495 switch (out->storage) { 496 case STORAGE_F32: 497 case STORAGE_F64: 498 case STORAGE_FCONST: 499 break; 500 default: 501 error(&out->loc, "Unexpected decimal point in integer literal"); 502 } 503 } 504 505 errno = 0; 506 switch (out->storage) { 507 case STORAGE_U8: 508 case STORAGE_U16: 509 case STORAGE_U32: 510 case STORAGE_UINT: 511 case STORAGE_U64: 512 case STORAGE_SIZE: 513 out->uval = compute_exp(strtoumax(lexer->buf, NULL, base), 514 exponent, false, &out->loc); 515 break; 516 case STORAGE_ICONST: 517 out->uval = compute_exp(strtoumax(lexer->buf, NULL, base), 518 exponent, false, &out->loc); 519 if (out->uval > (uintmax_t)INT64_MAX) { 520 out->storage = STORAGE_U64; 521 break; 522 } 523 // Fallthrough 524 case STORAGE_I8: 525 case STORAGE_I16: 526 case STORAGE_I32: 527 case STORAGE_INT: 528 case STORAGE_I64: 529 out->uval = compute_exp(strtoumax(lexer->buf, NULL, base), 530 exponent, true, &out->loc); 531 if (out->uval == (uintmax_t)INT64_MIN) { 532 // XXX: Hack 533 out->ival = INT64_MIN; 534 } else { 535 out->ival = (intmax_t)out->uval; 536 } 537 break; 538 case STORAGE_F32: 539 case STORAGE_F64: 540 case STORAGE_FCONST: 541 out->fval = strtod(lexer->buf, NULL); 542 break; 543 default: 544 assert(0); 545 } 546 if (errno == ERANGE && !isfloat) { 547 error(&out->loc, "Integer literal overflow"); 548 } 549 consume(lexer, -1); 550 return out->token; 551 } 552 553 static uint32_t 554 lex_rune(struct lexer *lexer) 555 { 556 char buf[9]; 557 char *endptr; 558 uint32_t c = next(lexer, NULL, false); 559 assert(c != C_EOF); 560 561 switch (c) { 562 case '\\': 563 c = next(lexer, NULL, false); 564 switch (c) { 565 case '0': 566 return '\0'; 567 case 'a': 568 return '\a'; 569 case 'b': 570 return '\b'; 571 case 'f': 572 return '\f'; 573 case 'n': 574 return '\n'; 575 case 'r': 576 return '\r'; 577 case 't': 578 return '\t'; 579 case 'v': 580 return '\v'; 581 case '\\': 582 return '\\'; 583 case '\'': 584 return '\''; 585 case '"': 586 return '\"'; 587 case 'x': 588 buf[0] = next(lexer, NULL, false); 589 buf[1] = next(lexer, NULL, false); 590 buf[2] = '\0'; 591 c = strtoul(&buf[0], &endptr, 16); 592 if (*endptr != '\0') { 593 error(&lexer->loc, "Invalid hex literal"); 594 } 595 return c; 596 case 'u': 597 buf[0] = next(lexer, NULL, false); 598 buf[1] = next(lexer, NULL, false); 599 buf[2] = next(lexer, NULL, false); 600 buf[3] = next(lexer, NULL, false); 601 buf[4] = '\0'; 602 c = strtoul(&buf[0], &endptr, 16); 603 if (*endptr != '\0') { 604 error(&lexer->loc, "Invalid hex literal"); 605 } 606 return c; 607 case 'U': 608 buf[0] = next(lexer, NULL, false); 609 buf[1] = next(lexer, NULL, false); 610 buf[2] = next(lexer, NULL, false); 611 buf[3] = next(lexer, NULL, false); 612 buf[4] = next(lexer, NULL, false); 613 buf[5] = next(lexer, NULL, false); 614 buf[6] = next(lexer, NULL, false); 615 buf[7] = next(lexer, NULL, false); 616 buf[8] = '\0'; 617 c = strtoul(&buf[0], &endptr, 16); 618 if (*endptr != '\0') { 619 error(&lexer->loc, "Invalid hex literal"); 620 } 621 return c; 622 case C_EOF: 623 error(&lexer->loc, "Unexpected end of file"); 624 default: 625 error(&lexer->loc, "Invalid escape '\\%c'", c); 626 } 627 assert(0); 628 default: 629 return c; 630 } 631 assert(0); 632 } 633 634 static enum lexical_token 635 lex_string(struct lexer *lexer, struct token *out) 636 { 637 uint32_t c = next(lexer, &out->loc, false); 638 uint32_t delim; 639 640 switch (c) { 641 case '"': 642 case '`': 643 delim = c; 644 while ((c = next(lexer, NULL, false)) != delim) { 645 if (c == C_EOF) { 646 error(&lexer->loc, "Unexpected end of file"); 647 } 648 push(lexer, c, false); 649 if (delim == '"') { 650 push(lexer, lex_rune(lexer), false); 651 } 652 next(lexer, NULL, true); 653 } 654 char *buf = xcalloc(lexer->buflen + 1, 1); 655 memcpy(buf, lexer->buf, lexer->buflen); 656 out->token = T_LITERAL; 657 out->storage = STORAGE_STRING; 658 out->string.len = lexer->buflen; 659 out->string.value = buf; 660 consume(lexer, -1); 661 return out->token; 662 case '\'': 663 c = next(lexer, NULL, false); 664 switch (c) { 665 case '\'': 666 error(&out->loc, "Expected rune before trailing single quote"); 667 case '\\': 668 push(lexer, c, false); 669 out->rune = lex_rune(lexer); 670 break; 671 default: 672 out->rune = c; 673 } 674 if (next(lexer, NULL, false) != '\'') { 675 error(&out->loc, "Expected trailing single quote"); 676 } 677 out->token = T_LITERAL; 678 out->storage = STORAGE_RCONST; 679 return out->token; 680 default: 681 assert(0); // Invariant 682 } 683 assert(0); 684 } 685 686 static enum lexical_token 687 lex3(struct lexer *lexer, struct token *out, uint32_t c) 688 { 689 assert(c != C_EOF); 690 691 switch (c) { 692 case '.': 693 switch ((c = next(lexer, NULL, false))) { 694 case '.': 695 switch ((c = next(lexer, NULL, false))) { 696 case '.': 697 out->token = T_ELLIPSIS; 698 break; 699 default: 700 push(lexer, c, false); 701 out->token = T_SLICE; 702 break; 703 } 704 break; 705 default: 706 push(lexer, c, false); 707 out->token = T_DOT; 708 lexer->require_int = true; 709 break; 710 } 711 break; 712 case '<': 713 switch ((c = next(lexer, NULL, false))) { 714 case '<': 715 switch ((c = next(lexer, NULL, false))) { 716 case '=': 717 out->token = T_LSHIFTEQ; 718 break; 719 default: 720 push(lexer, c, false); 721 out->token = T_LSHIFT; 722 break; 723 } 724 break; 725 case '=': 726 out->token = T_LESSEQ; 727 break; 728 default: 729 push(lexer, c, false); 730 out->token = T_LESS; 731 break; 732 } 733 break; 734 case '>': 735 switch ((c = next(lexer, NULL, false))) { 736 case '>': 737 switch ((c = next(lexer, NULL, false))) { 738 case '=': 739 out->token = T_RSHIFTEQ; 740 break; 741 default: 742 push(lexer, c, false); 743 out->token = T_RSHIFT; 744 break; 745 } 746 break; 747 case '=': 748 out->token = T_GREATEREQ; 749 break; 750 default: 751 push(lexer, c, false); 752 out->token = T_GREATER; 753 break; 754 } 755 break; 756 case '&': 757 switch ((c = next(lexer, NULL, false))) { 758 case '&': 759 switch ((c = next(lexer, NULL, false))) { 760 case '=': 761 out->token = T_LANDEQ; 762 break; 763 default: 764 push(lexer, c, false); 765 out->token = T_LAND; 766 break; 767 } 768 break; 769 case '=': 770 out->token = T_BANDEQ; 771 break; 772 default: 773 push(lexer, c, false); 774 out->token = T_BAND; 775 break; 776 } 777 break; 778 case '|': 779 switch ((c = next(lexer, NULL, false))) { 780 case '|': 781 switch ((c = next(lexer, NULL, false))) { 782 case '=': 783 out->token = T_LOREQ; 784 break; 785 default: 786 push(lexer, c, false); 787 out->token = T_LOR; 788 break; 789 } 790 break; 791 case '=': 792 out->token = T_BOREQ; 793 break; 794 default: 795 push(lexer, c, false); 796 out->token = T_BOR; 797 break; 798 } 799 break; 800 case '^': 801 switch ((c = next(lexer, NULL, false))) { 802 case '^': 803 switch ((c = next(lexer, NULL, false))) { 804 case '=': 805 out->token = T_LXOREQ; 806 break; 807 default: 808 push(lexer, c, false); 809 out->token = T_LXOR; 810 break; 811 } 812 break; 813 case '=': 814 out->token = T_BXOREQ; 815 break; 816 default: 817 push(lexer, c, false); 818 out->token = T_BXOR; 819 break; 820 } 821 break; 822 default: 823 assert(0); // Invariant 824 } 825 826 return out->token; 827 } 828 829 static enum lexical_token _lex(struct lexer *lexer, struct token *out); 830 831 static enum lexical_token 832 lex2(struct lexer *lexer, struct token *out, uint32_t c) 833 { 834 assert(c != C_EOF); 835 836 switch (c) { 837 case '*': 838 switch ((c = next(lexer, NULL, false))) { 839 case '=': 840 out->token = T_TIMESEQ; 841 break; 842 default: 843 push(lexer, c, false); 844 out->token = T_TIMES; 845 break; 846 } 847 break; 848 case '%': 849 switch ((c = next(lexer, NULL, false))) { 850 case '=': 851 out->token = T_MODEQ; 852 break; 853 default: 854 push(lexer, c, false); 855 out->token = T_MODULO; 856 break; 857 } 858 break; 859 case '/': 860 switch ((c = next(lexer, NULL, false))) { 861 case '=': 862 out->token = T_DIVEQ; 863 break; 864 case '/': 865 while ((c = next(lexer, NULL, false)) != C_EOF && c != '\n') ; 866 return _lex(lexer, out); 867 default: 868 push(lexer, c, false); 869 out->token = T_DIV; 870 break; 871 } 872 break; 873 case '+': 874 switch ((c = next(lexer, NULL, false))) { 875 case '=': 876 out->token = T_PLUSEQ; 877 break; 878 default: 879 push(lexer, c, false); 880 out->token = T_PLUS; 881 break; 882 } 883 break; 884 case '-': 885 switch ((c = next(lexer, NULL, false))) { 886 case '=': 887 out->token = T_MINUSEQ; 888 break; 889 default: 890 push(lexer, c, false); 891 out->token = T_MINUS; 892 break; 893 } 894 break; 895 case ':': 896 switch ((c = next(lexer, NULL, false))) { 897 case ':': 898 out->token = T_DOUBLE_COLON; 899 break; 900 default: 901 push(lexer, c, false); 902 out->token = T_COLON; 903 break; 904 } 905 break; 906 case '!': 907 switch ((c = next(lexer, NULL, false))) { 908 case '=': 909 out->token = T_NEQUAL; 910 break; 911 default: 912 push(lexer, c, false); 913 out->token = T_LNOT; 914 break; 915 } 916 break; 917 case '=': 918 switch ((c = next(lexer, NULL, false))) { 919 case '=': 920 out->token = T_LEQUAL; 921 break; 922 case '>': 923 out->token = T_ARROW; 924 break; 925 default: 926 push(lexer, c, false); 927 out->token = T_EQUAL; 928 break; 929 } 930 break; 931 default: 932 assert(0); // Invariant 933 } 934 935 return out->token; 936 } 937 938 static enum lexical_token 939 _lex(struct lexer *lexer, struct token *out) 940 { 941 if (lexer->un.token != T_ERROR) { 942 *out = lexer->un; 943 lexer->un.token = T_ERROR; 944 return out->token; 945 } 946 947 uint32_t c = wgetc(lexer, &out->loc); 948 if (c == C_EOF) { 949 out->token = T_EOF; 950 return out->token; 951 } 952 953 if (c <= 0x7F && isdigit(c)) { 954 push(lexer, c, false); 955 return lex_literal(lexer, out); 956 } 957 958 lexer->require_int = false; 959 960 if (c <= 0x7F && (isalpha(c) || c == '_' || c == '@')) { 961 push(lexer, c, false); 962 return lex_name(lexer, out); 963 } 964 965 char p[5]; 966 switch (c) { 967 case '"': 968 case '`': 969 case '\'': 970 push(lexer, c, false); 971 return lex_string(lexer, out); 972 case '.': // . .. ... 973 case '<': // < << <= <<= 974 case '>': // > >> >= >>= 975 case '&': // & && &= &&= 976 case '|': // | || |= ||= 977 case '^': // ^ ^^ ^= ^^= 978 return lex3(lexer, out, c); 979 case '*': // * *= 980 case '%': // % %= 981 case '/': // / /= // 982 case '+': // + += 983 case '-': // - -= 984 case ':': // : :: 985 case '!': // ! != 986 case '=': // = == => 987 return lex2(lexer, out, c); 988 case '~': 989 out->token = T_BNOT; 990 break; 991 case ',': 992 out->token = T_COMMA; 993 break; 994 case '{': 995 out->token = T_LBRACE; 996 break; 997 case '[': 998 out->token = T_LBRACKET; 999 break; 1000 case '(': 1001 out->token = T_LPAREN; 1002 break; 1003 case '}': 1004 out->token = T_RBRACE; 1005 break; 1006 case ']': 1007 out->token = T_RBRACKET; 1008 break; 1009 case ')': 1010 out->token = T_RPAREN; 1011 break; 1012 case ';': 1013 out->token = T_SEMICOLON; 1014 break; 1015 case '?': 1016 out->token = T_QUESTION; 1017 break; 1018 default: 1019 p[utf8_encode(p, c)] = '\0'; 1020 fprintf(stderr, "Error: unexpected code point '%s' at %s:%d:%d\n", 1021 p, sources[lexer->loc.file], lexer->loc.lineno, 1022 lexer->loc.colno); 1023 exit(EXIT_FAILURE); 1024 } 1025 1026 return out->token; 1027 } 1028 1029 enum lexical_token 1030 lex(struct lexer *lexer, struct token *out) 1031 { 1032 return _lex(lexer, out); 1033 } 1034 1035 void 1036 token_finish(struct token *tok) 1037 { 1038 switch (tok->token) { 1039 case T_NAME: 1040 free(tok->name); 1041 break; 1042 case T_LITERAL: 1043 switch (tok->storage) { 1044 case STORAGE_STRING: 1045 free(tok->string.value); 1046 break; 1047 default: 1048 break; 1049 } 1050 break; 1051 default: 1052 break; 1053 } 1054 tok->token = 0; 1055 tok->storage = 0; 1056 tok->loc.file = 0; 1057 tok->loc.colno = 0; 1058 tok->loc.lineno = 0; 1059 } 1060 1061 const char * 1062 lexical_token_str(enum lexical_token tok) 1063 { 1064 switch (tok) { 1065 case T_NAME: 1066 return "name"; 1067 case T_LITERAL: 1068 return "literal"; 1069 case T_EOF: 1070 return "end of file"; 1071 case T_ERROR: 1072 return "error"; 1073 default: 1074 assert(tok < sizeof(tokens) / sizeof(tokens[0])); 1075 return tokens[tok]; 1076 } 1077 } 1078 1079 static const char * 1080 rune_unparse(uint32_t c) 1081 { 1082 static char buf[11]; 1083 switch (c) { 1084 case '\0': 1085 snprintf(buf, sizeof(buf), "\\0"); 1086 break; 1087 case '\a': 1088 snprintf(buf, sizeof(buf), "\\a"); 1089 break; 1090 case '\b': 1091 snprintf(buf, sizeof(buf), "\\b"); 1092 break; 1093 case '\f': 1094 snprintf(buf, sizeof(buf), "\\f"); 1095 break; 1096 case '\n': 1097 snprintf(buf, sizeof(buf), "\\n"); 1098 break; 1099 case '\r': 1100 snprintf(buf, sizeof(buf), "\\r"); 1101 break; 1102 case '\t': 1103 snprintf(buf, sizeof(buf), "\\t"); 1104 break; 1105 case '\v': 1106 snprintf(buf, sizeof(buf), "\\v"); 1107 break; 1108 case '\\': 1109 snprintf(buf, sizeof(buf), "\\\\"); 1110 break; 1111 case '\'': 1112 snprintf(buf, sizeof(buf), "\\'"); 1113 break; 1114 case '"': 1115 snprintf(buf, sizeof(buf), "\\\""); 1116 break; 1117 default: 1118 if (c > 0xffff) { 1119 snprintf(buf, sizeof(buf), "\\U%08x", c); 1120 } else if (c > 0x7F) { 1121 snprintf(buf, sizeof(buf), "\\u%04x", c); 1122 } else if (!isprint(c)) { 1123 snprintf(buf, sizeof(buf), "\\x%02x", c); 1124 } else { 1125 assert(utf8_cpsize(c) < sizeof(buf)); 1126 buf[utf8_encode(buf, c)] = '\0'; 1127 } 1128 break; 1129 } 1130 return buf; 1131 } 1132 1133 static const char * 1134 string_unparse(const struct token *tok) 1135 { 1136 static char buf[1024]; 1137 assert(tok->token == T_LITERAL && tok->storage == STORAGE_STRING); 1138 int bytes = 0; 1139 memset(buf, 0, sizeof(buf)); 1140 bytes += snprintf(&buf[bytes], sizeof(buf) - bytes, "\""); 1141 const char *s = tok->string.value; 1142 for (uint32_t c = utf8_decode(&s); 1143 s - tok->string.value <= (ptrdiff_t)tok->string.len; 1144 c = utf8_decode(&s)) { 1145 bytes += snprintf(&buf[bytes], sizeof(buf) - bytes, "%s", 1146 rune_unparse(c)); 1147 } 1148 bytes += snprintf(&buf[bytes], sizeof(buf) - bytes, "\""); 1149 return buf; 1150 } 1151 1152 const char * 1153 token_str(const struct token *tok) 1154 { 1155 static char buf[1024]; 1156 int bytes = 0; 1157 switch (tok->token) { 1158 case T_NAME: 1159 snprintf(buf, sizeof(buf), "name %s", tok->name); 1160 return buf; 1161 case T_LITERAL: 1162 switch (tok->storage) { 1163 case STORAGE_U8: 1164 case STORAGE_U16: 1165 case STORAGE_U32: 1166 case STORAGE_U64: 1167 case STORAGE_UINT: 1168 case STORAGE_UINTPTR: 1169 case STORAGE_SIZE: 1170 snprintf(buf, sizeof(buf), "%ju", tok->uval); 1171 break; 1172 case STORAGE_I8: 1173 case STORAGE_I16: 1174 case STORAGE_I32: 1175 case STORAGE_I64: 1176 case STORAGE_ICONST: 1177 case STORAGE_INT: 1178 snprintf(buf, sizeof(buf), "%jd", tok->ival); 1179 break; 1180 case STORAGE_F32: 1181 case STORAGE_F64: 1182 case STORAGE_FCONST: 1183 snprintf(buf, sizeof(buf), "%lf", tok->fval); 1184 break; 1185 case STORAGE_RCONST: 1186 bytes += snprintf(&buf[bytes], sizeof(buf) - bytes, "'"); 1187 bytes += snprintf(&buf[bytes], sizeof(buf) - bytes, "%s", 1188 rune_unparse(tok->rune)); 1189 bytes += snprintf(&buf[bytes], sizeof(buf) - bytes, "'"); 1190 break; 1191 case STORAGE_STRING: 1192 return string_unparse(tok); 1193 case STORAGE_ALIAS: 1194 case STORAGE_ARRAY: 1195 case STORAGE_BOOL: 1196 case STORAGE_CHAR: 1197 case STORAGE_ENUM: 1198 case STORAGE_ERROR: 1199 case STORAGE_FUNCTION: 1200 case STORAGE_POINTER: 1201 case STORAGE_NULL: 1202 case STORAGE_RUNE: 1203 case STORAGE_SLICE: 1204 case STORAGE_STRUCT: 1205 case STORAGE_TAGGED: 1206 case STORAGE_TUPLE: 1207 case STORAGE_UNION: 1208 case STORAGE_VALIST: 1209 case STORAGE_VOID: 1210 assert(0); 1211 } 1212 return buf; 1213 default:; 1214 const char *out = lexical_token_str(tok->token); 1215 return out; 1216 } 1217 } 1218 1219 void 1220 unlex(struct lexer *lexer, struct token *in) 1221 { 1222 assert(lexer->un.token == T_ERROR && "Only one unlex is supported"); 1223 lexer->un = *in; 1224 }