commit e85e5df216cbd26cc9a9b2ab79bf7638251d6ebd
parent 576fe8c851fb6eaa01b37c214752aa83e6cd86ad
Author: Eyal Sawady <ecs@d2evs.net>
Date: Sat, 24 Oct 2020 15:28:05 -0400
lex: handle string and rune constants
Diffstat:
3 files changed, 125 insertions(+), 2 deletions(-)
diff --git a/include/lex.h b/include/lex.h
@@ -1,5 +1,6 @@
#ifndef HAREC_LEX_H
#define HAREC_LEX_H
+#include <stdint.h>
// Keep sorted
enum lexical_token {
@@ -103,6 +104,8 @@ enum lexical_token {
// Tokens with additional information
T_NAME,
T_LITERAL,
+ T_RUNE,
+ T_STRING,
// Magic tokens
T_EOF,
@@ -114,6 +117,11 @@ struct token {
union {
char *name;
char *literal;
+ uint32_t rune;
+ struct {
+ size_t len;
+ char *value;
+ } string;
};
};
diff --git a/src/lex.c b/src/lex.c
@@ -1,6 +1,7 @@
#include <assert.h>
#include <ctype.h>
#include <stdbool.h>
+#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
@@ -297,10 +298,111 @@ finalize:
return c;
}
-static int
+/*
+ * Reads exactly `ndigits` hexadecimal digits from the lexer and returns the
+ * value they spell.
+ *
+ * Every digit is validated individually *before* conversion. strtoul() on its
+ * own is too permissive for this job: it skips leading whitespace and accepts
+ * a sign and a "0x" prefix, so inputs such as "\x+1" or "\u0x1f" would be
+ * lexed as valid escapes. Checking the full 32-bit code point (rather than the
+ * value after narrowing to char) also rejects EOF and non-ASCII characters
+ * whose low byte happens to look like a hex digit.
+ */
+static uint32_t
+lex_hex_escape(struct lexer *lexer, size_t ndigits)
+{
+	char digits[5];
+	assert(ndigits < sizeof(digits));
+
+	for (size_t i = 0; i < ndigits; ++i) {
+		uint32_t c = next(lexer, false);
+		assert(c != UTF8_INVALID && c < 0x80 && isxdigit((int)c));
+		digits[i] = (char)c;
+	}
+	digits[ndigits] = '\0';
+
+	return (uint32_t)strtoul(digits, NULL, 16);
+}
+
+/*
+ * Reads one rune from the lexer and returns its code point. If the rune is a
+ * backslash, the escape sequence that follows is decoded and the value it
+ * denotes is returned instead:
+ *
+ *   \0 \a \b \f \n \r \t \v \\ \' \"   single-character escapes
+ *   \xNN                               two hexadecimal digits
+ *   \uNNNN                             four hexadecimal digits
+ *
+ * End of input and unknown or malformed escapes trip an assertion. Any
+ * surrounding quote characters are the caller's responsibility.
+ */
+static uint32_t
+lex_rune(struct lexer *lexer)
+{
+	uint32_t c = next(lexer, false);
+	assert(c != UTF8_INVALID);
+
+	if (c != '\\') {
+		return c;
+	}
+
+	c = next(lexer, false);
+	switch (c) {
+	case '0':
+		return '\0';
+	case 'a':
+		return '\a';
+	case 'b':
+		return '\b';
+	case 'f':
+		return '\f';
+	case 'n':
+		return '\n';
+	case 'r':
+		return '\r';
+	case 't':
+		return '\t';
+	case 'v':
+		return '\v';
+	case '\\':
+		return '\\';
+	case '\'':
+		return '\'';
+	case '"':
+		return '\"';
+	case 'x':
+		return lex_hex_escape(lexer, 2);
+	case 'u':
+		return lex_hex_escape(lexer, 4);
+	default:
+		assert(0); // Invariant
+	}
+
+	// Only reachable when assertions are compiled out: hand back the
+	// character that followed the backslash, as before.
+	return c;
+}
+
+/*
+ * Lexes a string constant ("...") or a rune constant ('...'), starting at the
+ * opening quote.
+ *
+ * For a string, out->token is set to T_STRING, out->string.len to the number
+ * of bytes in the decoded string and out->string.value to a heap-allocated
+ * copy of those bytes. The copy carries one extra trailing NUL byte (not
+ * counted in len) so that it can also be handed to C string routines; the
+ * string itself may still contain embedded NULs. token_finish() frees it.
+ *
+ * For a rune, out->token is set to T_RUNE and out->rune to its code point.
+ *
+ * Returns the closing quote character. Malformed input (EOF, an empty or
+ * unterminated rune, an unterminated string) trips an assertion.
+ */
+static uint32_t
lex_string(struct lexer *lexer, struct token *out)
{
- assert(0); // TODO
+	uint32_t c = next(lexer, false);
+	assert(c != UTF8_INVALID);
+
+	switch (c) {
+	case '"':
+		while ((c = next(lexer, false)) != UTF8_INVALID) {
+			if (c != '"') {
+				// Hand the character back so lex_rune() can
+				// decode a possible escape sequence, then push
+				// the decoded rune and read it once more with
+				// the second argument set (presumably this is
+				// what appends it to lexer->buf).
+				push(lexer, c, false);
+				push(lexer, lex_rune(lexer), false);
+				next(lexer, true);
+				continue;
+			}
+
+			// Closing quote: copy the accumulated bytes out of the
+			// lexer. One extra byte is reserved for a terminator.
+			size_t len = lexer->buflen;
+			char *value = malloc(len + 1);
+			if (value == NULL) {
+				fprintf(stderr, "Out of memory\n");
+				abort();
+			}
+			if (len > 0) {
+				memcpy(value, lexer->buf, len);
+			}
+			value[len] = '\0';
+
+			out->token = T_STRING;
+			out->string.len = len;
+			out->string.value = value;
+			consume(lexer, -1);
+			return c;
+		}
+		assert(0); // Invariant
+	case '\'':
+		c = next(lexer, false);
+		switch (c) {
+		case '\'':
+			assert(0); // Invariant
+		case '\\':
+			// Escape sequence: let lex_rune() see the backslash.
+			push(lexer, c, false);
+			out->rune = lex_rune(lexer);
+			break;
+		default:
+			out->rune = c;
+		}
+		c = next(lexer, false);
+		assert(c == '\'');
+		out->token = T_RUNE;
+		return c;
+	default:
+		assert(0); // Invariant
+	}
+	assert(0);
}
static uint32_t
@@ -609,6 +711,9 @@ token_finish(struct token *tok)
case T_NAME:
free(tok->name);
break;
+ case T_STRING:
+ free(tok->string.value);
+ break;
default:
break;
}
diff --git a/src/main.c b/src/main.c
@@ -19,6 +19,16 @@ main(int argc, char *argv[])
case T_LITERAL:
fprintf(stderr, "(%s)\n", tok.literal);
break;
+ case T_RUNE:
+ putc('\'', stderr);
+ utf8_fputch(stderr, tok.rune);
+ putc('\'', stderr);
+ putc('\n', stderr);
+ break;
+ case T_STRING:
+ fprintf(stderr, "\"%*s\"\n", (int)tok.string.len,
+ tok.string.value);
+ break;
case T_ERROR:
fprintf(stderr, "ERROR\n");
break;