lex.c (23669B)
1 #include <assert.h> 2 #include <ctype.h> 3 #include <errno.h> 4 #include <inttypes.h> 5 #include <stdbool.h> 6 #include <stddef.h> 7 #include <stdint.h> 8 #include <stdio.h> 9 #include <stdlib.h> 10 #include <string.h> 11 #include "lex.h" 12 #include "utf8.h" 13 #include "util.h" 14 15 static const char *tokens[] = { 16 // Must be alpha sorted and match lex.h 17 [T_ATTR_FINI] = "@fini", 18 [T_ATTR_INIT] = "@init", 19 [T_ATTR_NORETURN] = "@noreturn", 20 [T_ATTR_OFFSET] = "@offset", 21 [T_ATTR_SYMBOL] = "@symbol", 22 [T_ATTR_TEST] = "@test", 23 [T_UNDERSCORE] = "_", 24 [T_ABORT] = "abort", 25 [T_ALLOC] = "alloc", 26 [T_APPEND] = "append", 27 [T_AS] = "as", 28 [T_ASSERT] = "assert", 29 [T_BOOL] = "bool", 30 [T_BREAK] = "break", 31 [T_CASE] = "case", 32 [T_CHAR] = "char", 33 [T_CONST] = "const", 34 [T_CONTINUE] = "continue", 35 [T_DEFER] = "defer", 36 [T_DEF] = "def", 37 [T_DELETE] = "delete", 38 [T_ELSE] = "else", 39 [T_ENUM] = "enum", 40 [T_EXPORT] = "export", 41 [T_F32] = "f32", 42 [T_F64] = "f64", 43 [T_FALSE] = "false", 44 [T_FN] = "fn", 45 [T_FOR] = "for", 46 [T_FREE] = "free", 47 [T_I16] = "i16", 48 [T_I32] = "i32", 49 [T_I64] = "i64", 50 [T_I8] = "i8", 51 [T_IF] = "if", 52 [T_INSERT] = "insert", 53 [T_INT] = "int", 54 [T_IS] = "is", 55 [T_LEN] = "len", 56 [T_LET] = "let", 57 [T_MATCH] = "match", 58 [T_NULL] = "null", 59 [T_NULLABLE] = "nullable", 60 [T_OFFSET] = "offset", 61 [T_RETURN] = "return", 62 [T_RUNE] = "rune", 63 [T_SIZE] = "size", 64 [T_STATIC] = "static", 65 [T_STR] = "str", 66 [T_STRUCT] = "struct", 67 [T_SWITCH] = "switch", 68 [T_TRUE] = "true", 69 [T_TYPE] = "type", 70 [T_U16] = "u16", 71 [T_U32] = "u32", 72 [T_U64] = "u64", 73 [T_U8] = "u8", 74 [T_UINT] = "uint", 75 [T_UINTPTR] = "uintptr", 76 [T_UNION] = "union", 77 [T_USE] = "use", 78 [T_VAARG] = "vaarg", 79 [T_VAEND] = "vaend", 80 [T_VALIST] = "valist", 81 [T_VASTART] = "vastart", 82 [T_VOID] = "void", 83 [T_YIELD] = "yield", 84 85 // Operators 86 [T_ARROW] = "=>", 87 [T_BANDEQ] = "&=", 88 [T_BAND] = "&", 89 [T_BNOT] = "~", 90 [T_BOR] = "|", 91 [T_COLON] = ":", 92 [T_COMMA] = ",", 93 [T_DIV] = "/", 94 [T_DIVEQ] = "/=", 95 [T_DOT] = ".", 96 [T_DOUBLE_COLON] = "::", 97 [T_ELLIPSIS] = "...", 98 [T_EQUAL] = "=", 99 [T_GREATER] = ">", 100 [T_GREATEREQ] = ">=", 101 [T_LAND] = "&&", 102 [T_LANDEQ] = "&&=", 103 [T_LBRACE] = "{", 104 [T_LBRACKET] = "[", 105 [T_LEQUAL] = "==", 106 [T_LESS] = "<", 107 [T_LESSEQ] = "<=", 108 [T_LNOT] = "!", 109 [T_LOR] = "||", 110 [T_LOREQ] = "||=", 111 [T_LPAREN] = "(", 112 [T_LSHIFT] = "<<", 113 [T_LSHIFTEQ] = "<<=", 114 [T_LXOR] = "^^", 115 [T_LXOREQ] = "^^=", 116 [T_MINUS] = "-", 117 [T_MINUSEQ] = "-=", 118 [T_MODEQ] = "%=", 119 [T_MODULO] = "%", 120 [T_NEQUAL] = "!=", 121 [T_BOREQ] = "|=", 122 [T_PLUS] = "+", 123 [T_PLUSEQ] = "+=", 124 [T_QUESTION] = "?", 125 [T_RBRACE] = "}", 126 [T_RBRACKET] = "]", 127 [T_RPAREN] = ")", 128 [T_RSHIFT] = ">>", 129 [T_RSHIFTEQ] = ">>=", 130 [T_SEMICOLON] = ";", 131 [T_SLICE] = "..", 132 [T_TIMES] = "*", 133 [T_TIMESEQ] = "*=", 134 [T_BXOR] = "^", 135 [T_BXOREQ] = "^=", 136 }; 137 138 void 139 lex_init(struct lexer *lexer, FILE *f, int fileid) 140 { 141 memset(lexer, 0, sizeof(*lexer)); 142 lexer->in = f; 143 lexer->bufsz = 256; 144 lexer->buf = xcalloc(1, lexer->bufsz); 145 lexer->un.token = T_ERROR; 146 lexer->loc.lineno = 1; 147 lexer->loc.colno = 0; 148 lexer->loc.file = fileid; 149 lexer->c[0] = UINT32_MAX; 150 lexer->c[1] = UINT32_MAX; 151 } 152 153 void 154 lex_finish(struct lexer *lexer) 155 { 156 fclose(lexer->in); 157 free(lexer->buf); 158 } 159 160 static void 161 update_lineno(struct location *loc, uint32_t c) 162 { 163 if (c == '\n') { 164 loc->lineno++; 165 loc->colno = 0; 166 } else if (c == '\t') { 167 loc->colno += 8; 168 } else { 169 loc->colno++; 170 } 171 } 172 173 static uint32_t 174 next(struct lexer *lexer, struct location *loc, bool buffer) 175 { 176 uint32_t c; 177 if (lexer->c[0] != UINT32_MAX) { 178 c = lexer->c[0]; 179 lexer->c[0] = lexer->c[1]; 180 lexer->c[1] = UINT32_MAX; 181 } else { 182 c = utf8_get(lexer->in); 183 update_lineno(&lexer->loc, c); 184 } 185 if (loc != NULL) { 186 *loc = lexer->loc; 187 for (size_t i = 0; i < 2 && lexer->c[i] != UINT32_MAX; i++) { 188 update_lineno(&lexer->loc, lexer->c[i]); 189 } 190 } 191 if (c == UTF8_INVALID || !buffer) { 192 return c; 193 } 194 if (lexer->buflen + utf8_cpsize(c) >= lexer->bufsz) { 195 lexer->bufsz *= 2; 196 lexer->buf = xrealloc(lexer->buf, lexer->bufsz); 197 } 198 char buf[UTF8_MAX_SIZE]; 199 size_t sz = utf8_encode(&buf[0], c); 200 memcpy(lexer->buf + lexer->buflen, buf, sz); 201 lexer->buflen += sz; 202 lexer->buf[lexer->buflen] = '\0'; 203 return c; 204 } 205 206 static bool 207 isharespace(uint32_t c) 208 { 209 return c == '\t' || c == '\n' || c == ' '; 210 } 211 212 static uint32_t 213 wgetc(struct lexer *lexer, struct location *loc) 214 { 215 uint32_t c; 216 while ((c = next(lexer, loc, false)) != UTF8_INVALID && isharespace(c)) ; 217 return c; 218 } 219 220 static void 221 consume(struct lexer *lexer, ssize_t n) 222 { 223 if (n == -1) { 224 lexer->buflen = 0; 225 lexer->buf[0] = 0; 226 return; 227 } 228 for (ssize_t i = 0; i < n; i++) { 229 while ((lexer->buf[--lexer->buflen] & 0xC0) == 0x80) ; 230 } 231 lexer->buf[lexer->buflen] = 0; 232 } 233 234 static void 235 push(struct lexer *lexer, uint32_t c, bool buffer) 236 { 237 assert(lexer->c[1] == UINT32_MAX); 238 lexer->c[1] = lexer->c[0]; 239 lexer->c[0] = c; 240 if (buffer) { 241 consume(lexer, 1); 242 } 243 } 244 245 static int 246 cmp_keyword(const void *va, const void *vb) 247 { 248 return strcmp(*(const char **)va, *(const char **)vb); 249 } 250 251 static uint32_t 252 lex_name(struct lexer *lexer, struct token *out) 253 { 254 uint32_t c = next(lexer, &out->loc, true); 255 assert(c != UTF8_INVALID && c <= 0x7F && (isalpha(c) || c == '_' || c == '@')); 256 while ((c = next(lexer, NULL, true)) != UTF8_INVALID) { 257 if (c > 0x7F || (!isalnum(c) && c != '_')) { 258 push(lexer, c, true); 259 break; 260 } 261 } 262 263 void *token = bsearch(&lexer->buf, tokens, T_LAST_KEYWORD + 1, 264 sizeof(tokens[0]), cmp_keyword); 265 if (!token) { 266 if (lexer->buf[0] == '@') { 267 out->token = T_ERROR; 268 consume(lexer, -1); 269 return out->token; 270 } 271 out->token = T_NAME; 272 out->name = strdup(lexer->buf); 273 } else { 274 out->token = (const char **)token - tokens; 275 } 276 consume(lexer, -1); 277 return out->token; 278 } 279 280 static uint32_t 281 lex_literal(struct lexer *lexer, struct token *out) 282 { 283 uint32_t c = next(lexer, &out->loc, true); 284 assert(c != UTF8_INVALID && c <= 0x7F && isdigit(c)); 285 286 int base = 10; 287 const char *basechrs = "0123456789"; 288 if (c == '0') { 289 switch ((c = next(lexer, NULL, true))) { 290 case 'b': 291 base = 2; 292 basechrs = "01"; 293 consume(lexer, 2); 294 break; 295 case 'o': 296 base = 8; 297 basechrs = "01234567"; 298 consume(lexer, 2); 299 break; 300 case 'x': 301 base = 16; 302 basechrs = "0123456789ABCDEFabcdef"; 303 consume(lexer, 2); 304 break; 305 default: 306 push(lexer, c, true); 307 break; 308 } 309 } 310 311 char *suff = NULL; 312 char *exp = NULL; 313 bool isfloat = false; 314 while ((c = next(lexer, NULL, true)) != UTF8_INVALID) { 315 if (!strchr(basechrs, c)) { 316 switch (c) { 317 case '.': 318 if (lexer->require_int) { 319 push(lexer, '.', true); 320 goto finalize; 321 } 322 if (isfloat || suff || exp) { 323 push(lexer, c, true); 324 goto finalize; 325 } 326 if (!strchr(basechrs, c = next(lexer, NULL, false))) { 327 push(lexer, c, false); 328 push(lexer, '.', true); 329 goto finalize; 330 } else { 331 push(lexer, c, false); 332 } 333 isfloat = true; 334 break; 335 case 'e': 336 if (exp || suff) { 337 push(lexer, c, true); 338 goto finalize; 339 } 340 // exponent is always in base 10 341 basechrs = "0123456789"; 342 c = next(lexer, NULL, true); 343 if (c != '-' && c != '+' && !strchr(basechrs, c)) { 344 push(lexer, c, true); 345 push(lexer, 'e', true); 346 goto finalize; 347 }; 348 exp = &lexer->buf[lexer->buflen - 1]; 349 break; 350 case 'i': 351 case 'u': 352 case 'f': 353 case 'z': 354 if (suff) { 355 push(lexer, c, true); 356 goto finalize; 357 } 358 suff = &lexer->buf[lexer->buflen - 1]; 359 basechrs = "0123456789"; 360 break; 361 default: 362 push(lexer, c, true); 363 goto finalize; 364 } 365 } 366 } 367 368 finalize: 369 lexer->require_int = false; 370 out->token = T_LITERAL; 371 if (isfloat) { 372 out->storage = STORAGE_FCONST; 373 } else { 374 out->storage = STORAGE_ICONST; 375 } 376 if (suff) { 377 const char *suffs[] = { 378 [STORAGE_U8] = "u8", 379 [STORAGE_U16] = "u16", 380 [STORAGE_U32] = "u32", 381 [STORAGE_U64] = "u64", 382 [STORAGE_I8] = "i8", 383 [STORAGE_I16] = "i16", 384 [STORAGE_I32] = "i32", 385 [STORAGE_I64] = "i64", 386 387 [STORAGE_UINT] = "u", 388 [STORAGE_INT] = "i", 389 [STORAGE_SIZE] = "z", 390 [STORAGE_F32] = "f32", 391 [STORAGE_F64] = "f64", 392 }; 393 bool isvalid = false; 394 for (enum type_storage i = 0; 395 i < sizeof(suffs) / sizeof(suffs[0]); ++i) { 396 if (suffs[i] && strcmp(suff, suffs[i]) == 0) { 397 isvalid = true; 398 out->storage = i; 399 break; 400 } 401 } 402 if (!isvalid) { 403 out->token = T_ERROR; 404 consume(lexer, -1); 405 return out->token; 406 } 407 } 408 409 intmax_t exponent = 0; 410 if (exp) { 411 char *endptr = NULL; 412 exponent = strtoimax(exp, &endptr, 10); 413 // integers can't have negative exponents 414 if (exponent < 0 && !suff) { 415 out->storage = STORAGE_FCONST; 416 } 417 enum type_storage s = out->storage; 418 bool valid = exponent >= 0 419 || s == STORAGE_F32 420 || s == STORAGE_F64 421 || s == STORAGE_FCONST; 422 if (endptr == exp || !valid) { 423 out->token = T_ERROR; 424 consume(lexer, -1); 425 return out->token; 426 } 427 } 428 429 if (isfloat) { 430 switch (out->storage) { 431 case STORAGE_F32: 432 case STORAGE_F64: 433 case STORAGE_FCONST: 434 break; 435 default: 436 out->token = T_ERROR; 437 consume(lexer, -1); 438 return out->token; 439 } 440 } 441 442 errno = 0; 443 switch (out->storage) { 444 case STORAGE_U8: 445 case STORAGE_U16: 446 case STORAGE_U32: 447 case STORAGE_UINT: 448 case STORAGE_U64: 449 case STORAGE_SIZE: 450 out->uval = strtoumax(lexer->buf, NULL, base); 451 for (intmax_t i = 0; i < exponent; i++) { 452 out->uval *= 10; 453 } 454 break; 455 case STORAGE_ICONST: 456 out->uval = strtoumax(lexer->buf, NULL, base); 457 for (intmax_t i = 0; i < exponent; i++) { 458 out->uval *= 10; 459 } 460 if (out->uval > (uintmax_t)INT64_MAX) { 461 out->storage = STORAGE_U64; 462 break; 463 } 464 // Fallthrough 465 case STORAGE_I8: 466 case STORAGE_I16: 467 case STORAGE_I32: 468 case STORAGE_INT: 469 case STORAGE_I64: 470 out->uval = strtoumax(lexer->buf, NULL, base); 471 for (intmax_t i = 0; i < exponent; i++) { 472 out->uval *= 10; 473 } 474 if (out->uval == (uintmax_t)INT64_MIN) { 475 // XXX: Hack 476 out->ival = INT64_MIN; 477 break; 478 } 479 out->ival = strtoimax(lexer->buf, NULL, base); 480 for (intmax_t i = 0; i < exponent; i++) { 481 out->ival *= 10; 482 } 483 break; 484 case STORAGE_F32: 485 case STORAGE_F64: 486 case STORAGE_FCONST: 487 out->fval = strtod(lexer->buf, NULL); 488 break; 489 default: 490 assert(0); 491 } 492 if (errno == ERANGE && !isfloat) { 493 out->token = T_ERROR; 494 } 495 consume(lexer, -1); 496 return out->token; 497 } 498 499 static uint32_t 500 lex_rune(struct lexer *lexer) 501 { 502 char buf[9]; 503 char *endptr; 504 uint32_t c = next(lexer, NULL, false); 505 assert(c != UTF8_INVALID); 506 507 switch (c) { 508 case '\\': 509 c = next(lexer, NULL, false); 510 switch (c) { 511 case '0': 512 return '\0'; 513 case 'a': 514 return '\a'; 515 case 'b': 516 return '\b'; 517 case 'f': 518 return '\f'; 519 case 'n': 520 return '\n'; 521 case 'r': 522 return '\r'; 523 case 't': 524 return '\t'; 525 case 'v': 526 return '\v'; 527 case '\\': 528 return '\\'; 529 case '\'': 530 return '\''; 531 case '"': 532 return '\"'; 533 case 'x': 534 buf[0] = next(lexer, NULL, false); 535 buf[1] = next(lexer, NULL, false); 536 buf[2] = '\0'; 537 c = strtoul(&buf[0], &endptr, 16); 538 if (*endptr != '\0') { 539 fprintf(stderr, 540 "Error: invalid hex literal at %s:%d:%d\n", 541 sources[lexer->loc.file], lexer->loc.lineno, 542 lexer->loc.colno); 543 exit(EXIT_FAILURE); 544 } 545 return c; 546 case 'u': 547 buf[0] = next(lexer, NULL, false); 548 buf[1] = next(lexer, NULL, false); 549 buf[2] = next(lexer, NULL, false); 550 buf[3] = next(lexer, NULL, false); 551 buf[4] = '\0'; 552 c = strtoul(&buf[0], &endptr, 16); 553 if (*endptr != '\0') { 554 fprintf(stderr, 555 "Error: invalid hex literal at %s:%d:%d\n", 556 sources[lexer->loc.file], lexer->loc.lineno, 557 lexer->loc.colno); 558 exit(EXIT_FAILURE); 559 } 560 return c; 561 case 'U': 562 buf[0] = next(lexer, NULL, false); 563 buf[1] = next(lexer, NULL, false); 564 buf[2] = next(lexer, NULL, false); 565 buf[3] = next(lexer, NULL, false); 566 buf[4] = next(lexer, NULL, false); 567 buf[5] = next(lexer, NULL, false); 568 buf[6] = next(lexer, NULL, false); 569 buf[7] = next(lexer, NULL, false); 570 buf[8] = '\0'; 571 c = strtoul(&buf[0], &endptr, 16); 572 if (*endptr != '\0') { 573 fprintf(stderr, 574 "Error: invalid hex literal at %s:%d:%d\n", 575 sources[lexer->loc.file], lexer->loc.lineno, 576 lexer->loc.colno); 577 exit(EXIT_FAILURE); 578 } 579 return c; 580 default: 581 fprintf(stderr, 582 "Error: invalid escape '\\%c' at %s:%d:%d\n", 583 c, sources[lexer->loc.file], lexer->loc.lineno, 584 lexer->loc.colno); 585 exit(EXIT_FAILURE); 586 } 587 assert(0); 588 default: 589 return c; 590 } 591 assert(0); 592 } 593 594 static enum lexical_token 595 lex_string(struct lexer *lexer, struct token *out) 596 { 597 uint32_t c = next(lexer, &out->loc, false); 598 uint32_t delim; 599 assert(c != UTF8_INVALID); 600 601 switch (c) { 602 case '"': 603 case '`': 604 delim = c; 605 while ((c = next(lexer, NULL, false)) != UTF8_INVALID) { 606 if (c == delim) { 607 char *buf = xcalloc(lexer->buflen + 1, 1); 608 memcpy(buf, lexer->buf, lexer->buflen); 609 out->token = T_LITERAL; 610 out->storage = STORAGE_STRING; 611 out->string.len = lexer->buflen; 612 out->string.value = buf; 613 consume(lexer, -1); 614 return out->token; 615 } else { 616 push(lexer, c, false); 617 if (delim == '"') { 618 push(lexer, lex_rune(lexer), false); 619 } 620 next(lexer, NULL, true); 621 } 622 } 623 assert(0); // Invariant 624 case '\'': 625 c = next(lexer, NULL, false); 626 switch (c) { 627 case '\'': 628 assert(0); // Invariant 629 case '\\': 630 push(lexer, c, false); 631 out->rune = lex_rune(lexer); 632 break; 633 default: 634 out->rune = c; 635 } 636 c = next(lexer, NULL, false); 637 assert(c == '\''); 638 out->token = T_LITERAL; 639 out->storage = STORAGE_RCONST; 640 return out->token; 641 default: 642 assert(0); // Invariant 643 } 644 assert(0); 645 } 646 647 static enum lexical_token 648 lex3(struct lexer *lexer, struct token *out, uint32_t c) 649 { 650 assert(c != UTF8_INVALID); 651 652 switch (c) { 653 case '.': 654 switch ((c = next(lexer, NULL, false))) { 655 case '.': 656 switch ((c = next(lexer, NULL, false))) { 657 case '.': 658 out->token = T_ELLIPSIS; 659 break; 660 default: 661 push(lexer, c, false); 662 out->token = T_SLICE; 663 break; 664 } 665 break; 666 default: 667 push(lexer, c, false); 668 out->token = T_DOT; 669 lexer->require_int = true; 670 break; 671 } 672 break; 673 case '<': 674 switch ((c = next(lexer, NULL, false))) { 675 case '<': 676 switch ((c = next(lexer, NULL, false))) { 677 case '=': 678 out->token = T_LSHIFTEQ; 679 break; 680 default: 681 push(lexer, c, false); 682 out->token = T_LSHIFT; 683 break; 684 } 685 break; 686 case '=': 687 out->token = T_LESSEQ; 688 break; 689 default: 690 push(lexer, c, false); 691 out->token = T_LESS; 692 break; 693 } 694 break; 695 case '>': 696 switch ((c = next(lexer, NULL, false))) { 697 case '>': 698 switch ((c = next(lexer, NULL, false))) { 699 case '=': 700 out->token = T_RSHIFTEQ; 701 break; 702 default: 703 push(lexer, c, false); 704 out->token = T_RSHIFT; 705 break; 706 } 707 break; 708 case '=': 709 out->token = T_GREATEREQ; 710 break; 711 default: 712 push(lexer, c, false); 713 out->token = T_GREATER; 714 break; 715 } 716 break; 717 case '&': 718 switch ((c = next(lexer, NULL, false))) { 719 case '&': 720 switch ((c = next(lexer, NULL, false))) { 721 case '=': 722 out->token = T_LANDEQ; 723 break; 724 default: 725 push(lexer, c, false); 726 out->token = T_LAND; 727 break; 728 } 729 break; 730 case '=': 731 out->token = T_BANDEQ; 732 break; 733 default: 734 push(lexer, c, false); 735 out->token = T_BAND; 736 break; 737 } 738 break; 739 case '|': 740 switch ((c = next(lexer, NULL, false))) { 741 case '|': 742 switch ((c = next(lexer, NULL, false))) { 743 case '=': 744 out->token = T_LOREQ; 745 break; 746 default: 747 push(lexer, c, false); 748 out->token = T_LOR; 749 break; 750 } 751 break; 752 case '=': 753 out->token = T_BOREQ; 754 break; 755 default: 756 push(lexer, c, false); 757 out->token = T_BOR; 758 break; 759 } 760 break; 761 case '^': 762 switch ((c = next(lexer, NULL, false))) { 763 case '^': 764 switch ((c = next(lexer, NULL, false))) { 765 case '=': 766 out->token = T_LXOREQ; 767 break; 768 default: 769 push(lexer, c, false); 770 out->token = T_LXOR; 771 break; 772 } 773 break; 774 case '=': 775 out->token = T_BXOREQ; 776 break; 777 default: 778 push(lexer, c, false); 779 out->token = T_BXOR; 780 break; 781 } 782 break; 783 default: 784 assert(0); // Invariant 785 } 786 787 return out->token; 788 } 789 790 static enum lexical_token _lex(struct lexer *lexer, struct token *out); 791 792 static enum lexical_token 793 lex2(struct lexer *lexer, struct token *out, uint32_t c) 794 { 795 assert(c != UTF8_INVALID); 796 797 switch (c) { 798 case '*': 799 switch ((c = next(lexer, NULL, false))) { 800 case '=': 801 out->token = T_TIMESEQ; 802 break; 803 default: 804 push(lexer, c, false); 805 out->token = T_TIMES; 806 break; 807 } 808 break; 809 case '%': 810 switch ((c = next(lexer, NULL, false))) { 811 case '=': 812 out->token = T_MODEQ; 813 break; 814 default: 815 push(lexer, c, false); 816 out->token = T_MODULO; 817 break; 818 } 819 break; 820 case '/': 821 switch ((c = next(lexer, NULL, false))) { 822 case '=': 823 out->token = T_DIVEQ; 824 break; 825 case '/': 826 while ((c = next(lexer, NULL, false)) != UTF8_INVALID && c != '\n') ; 827 return _lex(lexer, out); 828 default: 829 push(lexer, c, false); 830 out->token = T_DIV; 831 break; 832 } 833 break; 834 case '+': 835 switch ((c = next(lexer, NULL, false))) { 836 case '=': 837 out->token = T_PLUSEQ; 838 break; 839 default: 840 push(lexer, c, false); 841 out->token = T_PLUS; 842 break; 843 } 844 break; 845 case '-': 846 switch ((c = next(lexer, NULL, false))) { 847 case '=': 848 out->token = T_MINUSEQ; 849 break; 850 default: 851 push(lexer, c, false); 852 out->token = T_MINUS; 853 break; 854 } 855 break; 856 case ':': 857 switch ((c = next(lexer, NULL, false))) { 858 case ':': 859 out->token = T_DOUBLE_COLON; 860 break; 861 default: 862 push(lexer, c, false); 863 out->token = T_COLON; 864 break; 865 } 866 break; 867 case '!': 868 switch ((c = next(lexer, NULL, false))) { 869 case '=': 870 out->token = T_NEQUAL; 871 break; 872 default: 873 push(lexer, c, false); 874 out->token = T_LNOT; 875 break; 876 } 877 break; 878 case '=': 879 switch ((c = next(lexer, NULL, false))) { 880 case '=': 881 out->token = T_LEQUAL; 882 break; 883 case '>': 884 out->token = T_ARROW; 885 break; 886 default: 887 push(lexer, c, false); 888 out->token = T_EQUAL; 889 break; 890 } 891 break; 892 default: 893 assert(0); // Invariant 894 } 895 896 return out->token; 897 } 898 899 static enum lexical_token 900 _lex(struct lexer *lexer, struct token *out) 901 { 902 if (lexer->un.token != T_ERROR) { 903 *out = lexer->un; 904 lexer->un.token = T_ERROR; 905 return out->token; 906 } 907 908 uint32_t c = wgetc(lexer, &out->loc); 909 if (c == UTF8_INVALID) { 910 out->token = T_EOF; 911 return out->token; 912 } 913 914 if (c <= 0x7F && isdigit(c)) { 915 push(lexer, c, false); 916 return lex_literal(lexer, out); 917 } 918 919 lexer->require_int = false; 920 921 if (c <= 0x7F && (isalpha(c) || c == '_' || c == '@')) { 922 push(lexer, c, false); 923 return lex_name(lexer, out); 924 } 925 926 char p[5]; 927 switch (c) { 928 case '"': 929 case '`': 930 case '\'': 931 push(lexer, c, false); 932 return lex_string(lexer, out); 933 case '.': // . .. ... 934 case '<': // < << <= <<= 935 case '>': // > >> >= >>= 936 case '&': // & && &= &&= 937 case '|': // | || |= ||= 938 case '^': // ^ ^^ ^= ^^= 939 return lex3(lexer, out, c); 940 case '*': // * *= 941 case '%': // % %= 942 case '/': // / /= // 943 case '+': // + += 944 case '-': // - -= 945 case ':': // : :: 946 case '!': // ! != 947 case '=': // = == => 948 return lex2(lexer, out, c); 949 case '~': 950 out->token = T_BNOT; 951 break; 952 case ',': 953 out->token = T_COMMA; 954 break; 955 case '{': 956 out->token = T_LBRACE; 957 break; 958 case '[': 959 out->token = T_LBRACKET; 960 break; 961 case '(': 962 out->token = T_LPAREN; 963 break; 964 case '}': 965 out->token = T_RBRACE; 966 break; 967 case ']': 968 out->token = T_RBRACKET; 969 break; 970 case ')': 971 out->token = T_RPAREN; 972 break; 973 case ';': 974 out->token = T_SEMICOLON; 975 break; 976 case '?': 977 out->token = T_QUESTION; 978 break; 979 default: 980 p[utf8_encode(p, c)] = '\0'; 981 fprintf(stderr, "Error: unexpected code point '%s' at %s:%d:%d\n", 982 p, sources[lexer->loc.file], lexer->loc.lineno, 983 lexer->loc.colno); 984 exit(EXIT_FAILURE); 985 } 986 987 return out->token; 988 } 989 990 enum lexical_token 991 lex(struct lexer *lexer, struct token *out) 992 { 993 return _lex(lexer, out); 994 } 995 996 void 997 token_finish(struct token *tok) 998 { 999 switch (tok->token) { 1000 case T_NAME: 1001 free(tok->name); 1002 break; 1003 case T_LITERAL: 1004 switch (tok->storage) { 1005 case STORAGE_STRING: 1006 free(tok->string.value); 1007 break; 1008 default: 1009 break; 1010 } 1011 break; 1012 default: 1013 break; 1014 } 1015 tok->token = 0; 1016 tok->storage = 0; 1017 tok->loc.file = 0; 1018 tok->loc.colno = 0; 1019 tok->loc.lineno = 0; 1020 } 1021 1022 const char * 1023 lexical_token_str(enum lexical_token tok) 1024 { 1025 switch (tok) { 1026 case T_NAME: 1027 return "name"; 1028 case T_LITERAL: 1029 return "literal"; 1030 case T_EOF: 1031 return "end of file"; 1032 case T_ERROR: 1033 return "error"; 1034 default: 1035 assert(tok < sizeof(tokens) / sizeof(tokens[0])); 1036 return tokens[tok]; 1037 } 1038 } 1039 1040 static const char * 1041 rune_unparse(uint32_t c) 1042 { 1043 static char buf[11]; 1044 switch (c) { 1045 case '\0': 1046 snprintf(buf, sizeof(buf), "\\0"); 1047 break; 1048 case '\a': 1049 snprintf(buf, sizeof(buf), "\\a"); 1050 break; 1051 case '\b': 1052 snprintf(buf, sizeof(buf), "\\b"); 1053 break; 1054 case '\f': 1055 snprintf(buf, sizeof(buf), "\\f"); 1056 break; 1057 case '\n': 1058 snprintf(buf, sizeof(buf), "\\n"); 1059 break; 1060 case '\r': 1061 snprintf(buf, sizeof(buf), "\\r"); 1062 break; 1063 case '\t': 1064 snprintf(buf, sizeof(buf), "\\t"); 1065 break; 1066 case '\v': 1067 snprintf(buf, sizeof(buf), "\\v"); 1068 break; 1069 case '\\': 1070 snprintf(buf, sizeof(buf), "\\\\"); 1071 break; 1072 case '\'': 1073 snprintf(buf, sizeof(buf), "\\'"); 1074 break; 1075 case '"': 1076 snprintf(buf, sizeof(buf), "\\\""); 1077 break; 1078 default: 1079 if (c > 0xffff) { 1080 snprintf(buf, sizeof(buf), "\\U%08x", c); 1081 } else if (c > 0x7F) { 1082 snprintf(buf, sizeof(buf), "\\u%04x", c); 1083 } else if (!isprint(c)) { 1084 snprintf(buf, sizeof(buf), "\\x%02x", c); 1085 } else { 1086 assert(utf8_cpsize(c) < sizeof(buf)); 1087 buf[utf8_encode(buf, c)] = '\0'; 1088 } 1089 break; 1090 } 1091 return buf; 1092 } 1093 1094 static const char * 1095 string_unparse(const struct token *tok) 1096 { 1097 static char buf[1024]; 1098 assert(tok->token == T_LITERAL && tok->storage == STORAGE_STRING); 1099 int bytes = 0; 1100 memset(buf, 0, sizeof(buf)); 1101 bytes += snprintf(&buf[bytes], sizeof(buf) - bytes, "\""); 1102 const char *s = tok->string.value; 1103 for (uint32_t c = utf8_decode(&s); 1104 s - tok->string.value <= (ptrdiff_t)tok->string.len; 1105 c = utf8_decode(&s)) { 1106 bytes += snprintf(&buf[bytes], sizeof(buf) - bytes, "%s", 1107 rune_unparse(c)); 1108 } 1109 bytes += snprintf(&buf[bytes], sizeof(buf) - bytes, "\""); 1110 return buf; 1111 } 1112 1113 const char * 1114 token_str(const struct token *tok) 1115 { 1116 static char buf[1024]; 1117 int bytes = 0; 1118 switch (tok->token) { 1119 case T_NAME: 1120 snprintf(buf, sizeof(buf), "name %s", tok->name); 1121 return buf; 1122 case T_LITERAL: 1123 switch (tok->storage) { 1124 case STORAGE_U8: 1125 case STORAGE_U16: 1126 case STORAGE_U32: 1127 case STORAGE_U64: 1128 case STORAGE_UINT: 1129 case STORAGE_UINTPTR: 1130 case STORAGE_SIZE: 1131 snprintf(buf, sizeof(buf), "%ju", tok->uval); 1132 break; 1133 case STORAGE_I8: 1134 case STORAGE_I16: 1135 case STORAGE_I32: 1136 case STORAGE_I64: 1137 case STORAGE_ICONST: 1138 case STORAGE_INT: 1139 snprintf(buf, sizeof(buf), "%jd", tok->ival); 1140 break; 1141 case STORAGE_F32: 1142 case STORAGE_F64: 1143 case STORAGE_FCONST: 1144 snprintf(buf, sizeof(buf), "%lf", tok->fval); 1145 break; 1146 case STORAGE_RCONST: 1147 bytes += snprintf(&buf[bytes], sizeof(buf) - bytes, "'"); 1148 bytes += snprintf(&buf[bytes], sizeof(buf) - bytes, "%s", 1149 rune_unparse(tok->rune)); 1150 bytes += snprintf(&buf[bytes], sizeof(buf) - bytes, "'"); 1151 break; 1152 case STORAGE_STRING: 1153 return string_unparse(tok); 1154 case STORAGE_ALIAS: 1155 case STORAGE_ARRAY: 1156 case STORAGE_BOOL: 1157 case STORAGE_CHAR: 1158 case STORAGE_ENUM: 1159 case STORAGE_FUNCTION: 1160 case STORAGE_POINTER: 1161 case STORAGE_NULL: 1162 case STORAGE_RUNE: 1163 case STORAGE_SLICE: 1164 case STORAGE_STRUCT: 1165 case STORAGE_TAGGED: 1166 case STORAGE_TUPLE: 1167 case STORAGE_UNION: 1168 case STORAGE_VALIST: 1169 case STORAGE_VOID: 1170 assert(0); 1171 } 1172 return buf; 1173 default:; 1174 const char *out = lexical_token_str(tok->token); 1175 return out; 1176 } 1177 } 1178 1179 void 1180 unlex(struct lexer *lexer, struct token *in) 1181 { 1182 assert(lexer->un.token == T_ERROR && "Only one unlex is supported"); 1183 lexer->un = *in; 1184 }