harec

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | README | LICENSE

commit e85e5df216cbd26cc9a9b2ab79bf7638251d6ebd
parent 576fe8c851fb6eaa01b37c214752aa83e6cd86ad
Author: Eyal Sawady <ecs@d2evs.net>
Date:   Sat, 24 Oct 2020 15:28:05 -0400

lex: handle string and rune constants

Diffstat:
M include/lex.h |   8 ++++++++
M src/lex.c     | 109 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
M src/main.c    |  10 ++++++++++
3 files changed, 125 insertions(+), 2 deletions(-)

diff --git a/include/lex.h b/include/lex.h
@@ -1,5 +1,6 @@
 #ifndef HAREC_LEX_H
 #define HAREC_LEX_H
+#include <stdint.h>
 
 // Keep sorted
 enum lexical_token {
@@ -103,6 +104,8 @@ enum lexical_token {
 	// Tokens with additional information
 	T_NAME,
 	T_LITERAL,
+	T_RUNE,
+	T_STRING,
 
 	// Magic tokens
 	T_EOF,
@@ -114,6 +117,11 @@ struct token {
 	union {
 		char *name;
 		char *literal;
+		uint32_t rune;
+		struct {
+			size_t len;
+			char *value;
+		} string;
 	};
 };
 
diff --git a/src/lex.c b/src/lex.c
@@ -1,6 +1,7 @@
 #include <assert.h>
 #include <ctype.h>
 #include <stdbool.h>
+#include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -297,10 +298,111 @@ finalize:
 	return c;
 }
 
-static int
+static uint32_t
+lex_rune(struct lexer *lexer)
+{
+	char buf[5];
+	char *endptr;
+	uint32_t c = next(lexer, false);
+	assert(c != UTF8_INVALID);
+
+	switch (c) {
+	case '\\':
+		c = next(lexer, false);
+		switch (c) {
+		case '0':
+			return '\0';
+		case 'a':
+			return '\a';
+		case 'b':
+			return '\b';
+		case 'f':
+			return '\f';
+		case 'n':
+			return '\n';
+		case 'r':
+			return '\r';
+		case 't':
+			return '\t';
+		case 'v':
+			return '\v';
+		case '\\':
+			return '\\';
+		case '\'':
+			return '\'';
+		case '"':
+			return '\"';
+		case 'x':
+			buf[0] = next(lexer, false);
+			buf[1] = next(lexer, false);
+			buf[2] = '\0';
+			c = strtoul(&buf[0], &endptr, 16);
+			assert(*endptr == '\0');
+			return c;
+		case 'u':
+			buf[0] = next(lexer, false);
+			buf[1] = next(lexer, false);
+			buf[2] = next(lexer, false);
+			buf[3] = next(lexer, false);
+			buf[4] = '\0';
+			c = strtoul(&buf[0], &endptr, 16);
+			assert(*endptr == '\0');
+			return c;
+		default:
+			assert(0); // Invariant
+		}
+		assert(0);
+	default:
+		return c;
+	}
+	assert(0);
+}
+
+static uint32_t
 lex_string(struct lexer *lexer, struct token *out)
 {
-	assert(0); // TODO
+	uint32_t c = next(lexer, false);
+	assert(c != UTF8_INVALID);
+
+	switch (c) {
+	case '"':
+		while ((c = next(lexer, false)) != UTF8_INVALID) {
+			switch (c) {
+			case '"':;
+				char *buf = malloc(lexer->buflen);
+				memcpy(buf, lexer->buf, lexer->buflen);
+				out->token = T_STRING;
+				out->string.len = lexer->buflen;
+				out->string.value = buf;
+				consume(lexer, -1);
+				return c;
+			default:
+				push(lexer, c, false);
+				push(lexer, lex_rune(lexer), false);
+				next(lexer, true);
+			}
+		}
+		assert(0); // Invariant
+	case '\'':
+		c = next(lexer, false);
+		switch (c) {
+		case '\'':
+			assert(0); // Invariant
+		case '\\':
+			push(lexer, c, false);
+			out->rune = lex_rune(lexer);
+			break;
+		default:
+			out->rune = c;
+		}
+		c = next(lexer, false);
+		assert(c == '\'');
+		out->token = T_RUNE;
+		return c;
+	default:
+		assert(0); // Invariant
+	}
+	assert(0);
 }
 
 static uint32_t
@@ -609,6 +711,9 @@ token_finish(struct token *tok)
 	case T_NAME:
 		free(tok->name);
 		break;
+	case T_STRING:
+		free(tok->string.value);
+		break;
 	default:
 		break;
 	}
diff --git a/src/main.c b/src/main.c
@@ -19,6 +19,16 @@ main(int argc, char *argv[])
 		case T_LITERAL:
 			fprintf(stderr, "(%s)\n", tok.literal);
 			break;
+		case T_RUNE:
+			putc('\'', stderr);
+			utf8_fputch(stderr, tok.rune);
+			putc('\'', stderr);
+			putc('\n', stderr);
+			break;
+		case T_STRING:
+			fprintf(stderr, "\"%*s\"\n", (int)tok.string.len,
+				tok.string.value);
+			break;
 		case T_ERROR:
 			fprintf(stderr, "ERROR\n");
 			break;