commit 576fe8c851fb6eaa01b37c214752aa83e6cd86ad
parent 2890510b07d458352206d97a82a8769ad434a829
Author: Eyal Sawady <ecs@d2evs.net>
Date: Sat, 24 Oct 2020 15:28:04 -0400
lex: support UTF-8
Diffstat:
6 files changed, 228 insertions(+), 37 deletions(-)
diff --git a/configure b/configure
@@ -5,7 +5,8 @@ eval ". $srcdir/config.sh"
+# Source files passed to genrules for the harec target. genrules itself is
+# defined outside this hunk (presumably in the sourced config.sh - confirm).
harec() {
genrules harec \
src/lex.c \
- src/main.c
+ src/main.c \
+ src/utf8.c
}
all="harec"
diff --git a/include/lex.h b/include/lex.h
@@ -121,12 +121,12 @@ struct lexer {
FILE *in;
char *buf;
size_t bufsz, buflen;
- int c;
+ uint32_t c;
};
void lex_init(struct lexer *lexer, FILE *f);
void lex_finish(struct lexer *lexer);
-int lex(struct lexer *lexer, struct token *out);
+uint32_t lex(struct lexer *lexer, struct token *out);
void token_finish(struct token *tok);
const char *token_str(const struct token *tok);
diff --git a/include/utf8.h b/include/utf8.h
@@ -0,0 +1,40 @@
+#ifndef HAREC_UTF8_H
+#define HAREC_UTF8_H
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+
+// Longest sequence this module reads or writes. The original UTF-8 design
+// allowed sequences of up to 6 bytes, but every Unicode code point (up to
+// U+10FFFF) fits in 4.
+#define UTF8_MAX_SIZE 4
+
+// Error value returned by utf8_decode and utf8_fgetch. It lies outside the
+// Unicode range so that it cannot be mistaken for a decoded character (a
+// small value such as 0x80 would collide with U+0080).
+#define UTF8_INVALID UINT32_MAX
+
+/**
+ * Decodes the UTF-8 sequence at *str and advances *str past the bytes it
+ * consumed. Returns the code point, or UTF8_INVALID if the first byte cannot
+ * start a sequence.
+ */
+uint32_t utf8_decode(const char **str);
+
+/**
+ * Encodes ch as UTF-8 into str, which must have room for UTF8_MAX_SIZE
+ * bytes, and returns the number of bytes written (1 to 4). No NUL terminator
+ * is added.
+ */
+size_t utf8_encode(char *str, uint32_t ch);
+
+/**
+ * Returns the length in bytes of the sequence introduced by the byte at str,
+ * judged from that byte alone, or -1 if it is not a valid lead byte. The
+ * result can exceed UTF8_MAX_SIZE for lead bytes of the legacy longer forms.
+ */
+int utf8_size(const char *str);
+
+/**
+ * Returns the number of bytes utf8_encode produces for ch (1 to 4).
+ */
+size_t utf8_chsize(uint32_t ch);
+
+/**
+ * Reads the next character from f. Returns UTF8_INVALID at end of file, on a
+ * short read, or when the sequence is invalid or longer than UTF8_MAX_SIZE.
+ */
+uint32_t utf8_fgetch(FILE *f);
+
+/**
+ * Encodes ch as UTF-8, writes it to f and returns the number of bytes
+ * written.
+ */
+size_t utf8_fputch(FILE *f, uint32_t ch);
+
+#endif
diff --git a/src/lex.c b/src/lex.c
@@ -5,6 +5,7 @@
#include <stdlib.h>
#include <string.h>
#include "lex.h"
+#include "utf8.h"
static const char *tokens[] = {
// Must be alpha sorted
@@ -120,34 +121,37 @@ lex_finish(struct lexer *lexer)
free(lexer->buf);
}
+// Returns the next code point: the pushed-back one held in lexer->c if that
+// slot is non-zero, otherwise a fresh read from lexer->in. When buffer is
+// true and the value is not UTF8_INVALID, its UTF-8 encoding is appended to
+// lexer->buf, which is kept NUL-terminated.
-static int
+static uint32_t
next(struct lexer *lexer, bool buffer)
{
- int c;
+ uint32_t c;
+ // A non-zero lexer->c is a pushed-back code point: hand it out once and
+ // clear the slot.
if (lexer->c != 0) {
c = lexer->c;
lexer->c = 0;
} else {
- c = fgetc(lexer->in);
+ c = utf8_fgetch(lexer->in);
}
- if (c == EOF || !buffer) {
+ if (c == UTF8_INVALID || !buffer) {
return c;
}
- if (lexer->buflen + 1 >= lexer->bufsz) {
+ // Grow when the encoded bytes would reach bufsz; ">=" keeps one byte
+ // free for the terminator written below.
+ // NOTE(review): the buffer is doubled only once - this assumes bufsz is
+ // never smaller than UTF8_MAX_SIZE; confirm against lex_init.
+ if (lexer->buflen + utf8_chsize(c) >= lexer->bufsz) {
lexer->bufsz *= 2;
lexer->buf = realloc(lexer->buf, lexer->bufsz);
assert(lexer->buf);
}
- lexer->buf[lexer->buflen++] = c;
+ // Re-encode the code point so the buffer holds UTF-8 bytes.
+ char buf[UTF8_MAX_SIZE];
+ size_t sz = utf8_encode(&buf[0], c);
+ memcpy(lexer->buf + lexer->buflen, buf, sz);
+ lexer->buflen += sz;
lexer->buf[lexer->buflen] = '\0';
return c;
}
+// Skips whitespace and returns the first code point that is not whitespace,
+// or UTF8_INVALID once next() reports it. Nothing is added to lexer->buf.
-static int
+static uint32_t
wgetc(struct lexer *lexer)
{
- int c;
- while ((c = next(lexer, false)) != EOF && isspace(c)) ;
+ uint32_t c;
+ // isspace() is only defined for unsigned char values and EOF, so code
+ // points above ASCII must not reach it; they count as non-whitespace.
+ while ((c = next(lexer, false)) != UTF8_INVALID && c <= 0x7F && isspace(c)) ;
return c;
}
@@ -159,16 +163,18 @@ consume(struct lexer *lexer, ssize_t n)
lexer->buf[0] = 0;
return;
}
- memmove(lexer->buf, &lexer->buf[lexer->buflen], lexer->buflen - n);
- lexer->buflen -= n;
+ for (ssize_t i = 0; i < n; i++) {
+ while ((lexer->buf[--lexer->buflen] & 0xC0) == 0x80) ;
+ }
+ lexer->buf[lexer->buflen] = 0;
}
+// Pushes c back so that the following next() call returns it again (next()
+// checks lexer->c before reading the file). With buffer set, the character
+// is also taken back out of lexer->buf through consume(lexer, 1), which
+// steps over UTF-8 continuation bytes rather than dropping a single byte.
+// NOTE(review): next() treats lexer->c == 0 as "nothing pushed back", so a
+// pushed-back code point 0 would be lost.
static void
-push(struct lexer *lexer, int c, bool buffer)
+push(struct lexer *lexer, uint32_t c, bool buffer)
{
lexer->c = c;
if (buffer) {
- lexer->buf[--lexer->buflen] = 0;
+ consume(lexer, 1);
}
}
@@ -178,13 +184,13 @@ cmp_keyword(const void *va, const void *vb)
return strcmp(*(const char **)va, *(const char **)vb);
}
-static int
+static uint32_t
lex_name(struct lexer *lexer, struct token *out)
{
- int c = next(lexer, true);
- assert(c != EOF && (isalpha(c) || c == '_'));
- while ((c = next(lexer, true)) != EOF) {
- if (!isalnum(c) && c != '_') {
+ uint32_t c = next(lexer, true);
+ assert(c != UTF8_INVALID && c <= 0x7F && (isalpha(c) || c == '_'));
+ while ((c = next(lexer, true)) != UTF8_INVALID) {
+ if (c > 0x7F || (!isalnum(c) && c != '_')) {
push(lexer, c, true);
goto lookup;
}
@@ -205,11 +211,11 @@ lookup:;
return c;
}
-static int
+static uint32_t
lex_literal(struct lexer *lexer, struct token *out)
{
- int c = next(lexer, true);
- assert(c != EOF && isdigit(c));
+ uint32_t c = next(lexer, true);
+ assert(c != UTF8_INVALID && c <= 0x7F && isdigit(c));
const char *base = "0123456789";
switch ((c = next(lexer, true))) {
@@ -229,7 +235,7 @@ lex_literal(struct lexer *lexer, struct token *out)
char *suff = NULL;
bool isfloat = false, isexp = false, issuff = false;
- while ((c = next(lexer, true)) != EOF) {
+ while ((c = next(lexer, true)) != UTF8_INVALID) {
if (!strchr(base, c)) {
switch (c) {
case '.':
@@ -297,10 +303,10 @@ lex_string(struct lexer *lexer, struct token *out)
assert(0); // TODO
}
-static int
-lex3(struct lexer *lexer, struct token *out, int c)
+static uint32_t
+lex3(struct lexer *lexer, struct token *out, uint32_t c)
{
- assert(c != EOF);
+ assert(c != UTF8_INVALID);
switch (c) {
case '.':
@@ -373,10 +379,10 @@ lex3(struct lexer *lexer, struct token *out, int c)
return c;
}
-static int
-lex2(struct lexer *lexer, struct token *out, int c)
+static uint32_t
+lex2(struct lexer *lexer, struct token *out, uint32_t c)
{
- assert(c != EOF);
+ assert(c != UTF8_INVALID);
switch (c) {
case '^':
@@ -418,7 +424,7 @@ lex2(struct lexer *lexer, struct token *out, int c)
out->token = T_DIVEQ;
break;
case '/':
- while ((c = next(lexer, false)) != EOF && c != '\n') ;
+ while ((c = next(lexer, false)) != UTF8_INVALID && c != '\n') ;
return lex(lexer, out);
default:
push(lexer, c, false);
@@ -522,21 +528,21 @@ lex2(struct lexer *lexer, struct token *out, int c)
return c;
}
-int
+uint32_t
lex(struct lexer *lexer, struct token *out)
{
- int c = wgetc(lexer);
- if (c == EOF) {
+ uint32_t c = wgetc(lexer);
+ if (c == UTF8_INVALID) {
out->token = T_EOF;
return c;
}
- if (isalpha(c) || c == '_') {
+ if (c <= 0x7F && (isalpha(c) || c == '_')) {
push(lexer, c, false);
return lex_name(lexer, out);
}
- if (isdigit(c)) {
+ if (c <= 0x7F && isdigit(c)) {
push(lexer, c, false);
return lex_literal(lexer, out);
}
diff --git a/src/main.c b/src/main.c
@@ -1,5 +1,6 @@
#include <stdio.h>
#include "lex.h"
+#include "utf8.h"
int
main(int argc, char *argv[])
diff --git a/src/utf8.c b/src/utf8.c
@@ -0,0 +1,143 @@
+#include <stdint.h>
+#include <stdio.h>
+#include "utf8.h"
+
+// Payload bits carried by the lead byte of a sequence, indexed by
+// (sequence length - 1). Private to this file and never modified.
+static const uint8_t masks[] = {
+	0x7F,
+	0x1F,
+	0x0F,
+	0x07,
+	0x03,
+	0x01,
+};
+
+// Lead-byte classification, tried in order: a byte b starts a sequence of
+// `octets` bytes when (b & mask) == result. The last row matches every
+// remaining byte at or above 0x80 and reports it as invalid (-1).
+static const struct {
+	uint8_t mask;
+	uint8_t result;
+	int octets;
+} sizes[] = {
+	{ 0x80, 0x00, 1 },	// 0xxxxxxx
+	{ 0xE0, 0xC0, 2 },	// 110xxxxx
+	{ 0xF0, 0xE0, 3 },	// 1110xxxx
+	{ 0xF8, 0xF0, 4 },	// 11110xxx
+	{ 0xFC, 0xF8, 5 },	// 111110xx
+	{ 0xFE, 0xFC, 6 },	// 1111110x
+	{ 0x80, 0x80, -1 },
+};
+
+// Number of bytes the UTF-8 encoding of ch occupies: 1 below 0x80, 2 below
+// 0x800, 3 below 0x10000 and 4 for everything else.
+size_t
+utf8_chsize(uint32_t ch)
+{
+	// Exclusive upper bounds of the 1-, 2- and 3-byte ranges.
+	static const uint32_t limits[] = { 0x80, 0x800, 0x10000 };
+
+	size_t nbytes = 1;
+	for (size_t i = 0; i < sizeof(limits) / sizeof(limits[0]); ++i) {
+		if (ch < limits[i]) {
+			break;
+		}
+		++nbytes;
+	}
+	return nbytes;
+}
+
+// Decodes the UTF-8 sequence at *char_str and advances *char_str past the
+// bytes it consumed.
+//
+// Returns the code point, or UTF8_INVALID when the first byte cannot start a
+// sequence (one byte is consumed) or when a following byte is not a
+// continuation byte (10xxxxxx). In the latter case *char_str is left pointing
+// at the offending byte, so a NUL terminator inside a truncated sequence is
+// never stepped over.
+uint32_t
+utf8_decode(const char **char_str)
+{
+	const uint8_t *s = (const uint8_t *)*char_str;
+
+	if (*s < 0x80) {
+		// ASCII shortcut
+		uint32_t ascii = *s;
+		*char_str = (const char *)(s + 1);
+		return ascii;
+	}
+
+	int size = utf8_size((const char *)s);
+	if (size == -1) {
+		*char_str = (const char *)(s + 1);
+		return UTF8_INVALID;
+	}
+
+	// The lead byte contributes the high bits, every continuation byte
+	// six more.
+	uint32_t cp = *s & masks[size - 1];
+	++s;
+	for (int i = 1; i < size; ++i) {
+		if ((*s & 0xC0) != 0x80) {
+			*char_str = (const char *)s;
+			return UTF8_INVALID;
+		}
+		cp = (cp << 6) | (uint32_t)(*s & 0x3f);
+		++s;
+	}
+
+	*char_str = (const char *)s;
+	return cp;
+}
+
+// Writes the UTF-8 encoding of ch to str and returns its length in bytes
+// (1 to 4). str is not NUL-terminated.
+size_t
+utf8_encode(char *str, uint32_t ch)
+{
+	// Sequence length and the marker bits carried by the lead byte.
+	size_t nbytes;
+	uint8_t lead;
+	if (ch >= 0x10000) {
+		nbytes = 4;
+		lead = 0xf0;
+	} else if (ch >= 0x800) {
+		nbytes = 3;
+		lead = 0xe0;
+	} else if (ch >= 0x80) {
+		nbytes = 2;
+		lead = 0xc0;
+	} else {
+		nbytes = 1;
+		lead = 0;
+	}
+
+	// Fill the continuation bytes back to front, six payload bits each.
+	char *tail = str + nbytes;
+	while (--tail > str) {
+		*tail = (ch & 0x3f) | 0x80;
+		ch >>= 6;
+	}
+
+	// Whatever is left of ch goes into the lead byte.
+	str[0] = ch | lead;
+	return nbytes;
+}
+
+// Returns the length in bytes of the sequence introduced by the byte at s,
+// judged from that byte alone, or -1 if it cannot start a sequence.
+int
+utf8_size(const char *s)
+{
+	uint8_t c = (uint8_t)*s;
+	// Walk exactly the rows of the table; the first matching row wins.
+	for (size_t i = 0; i < sizeof(sizes) / sizeof(sizes[0]); ++i) {
+		if ((c & sizes[i].mask) == sizes[i].result) {
+			return sizes[i].octets;
+		}
+	}
+	return -1;
+}
+
+// Reads one UTF-8 encoded character from f and returns its code point.
+//
+// Returns UTF8_INVALID at end of file, when the first byte cannot start a
+// sequence, when the sequence is longer than UTF8_MAX_SIZE (its remaining
+// bytes are read and discarded), or when the file ends in the middle of a
+// sequence.
+uint32_t
+utf8_fgetch(FILE *f)
+{
+	char buffer[UTF8_MAX_SIZE];
+	int c = fgetc(f);
+	if (c == EOF) {
+		return UTF8_INVALID;
+	}
+	buffer[0] = (char)c;
+	int size = utf8_size(buffer);
+
+	if (size < 0) {
+		return UTF8_INVALID;
+	}
+
+	if (size > UTF8_MAX_SIZE) {
+		// Skip the rest of the sequence by reading it: unlike fseek,
+		// this also works on pipes and terminals.
+		for (int i = 1; i < size; ++i) {
+			if (fgetc(f) == EOF) {
+				break;
+			}
+		}
+		return UTF8_INVALID;
+	}
+
+	if (size > 1) {
+		size_t want = (size_t)size - 1;
+		if (fread(&buffer[1], 1, want, f) != want) {
+			return UTF8_INVALID;
+		}
+	}
+	const char *ptr = buffer;
+	return utf8_decode(&ptr);
+}
+
+// Encodes ch as UTF-8 and writes the bytes to f. Returns what fwrite
+// reports, i.e. the number of bytes actually written.
+size_t
+utf8_fputch(FILE *f, uint32_t ch)
+{
+	char encoded[UTF8_MAX_SIZE];
+	const size_t nbytes = utf8_encode(encoded, ch);
+	return fwrite(encoded, sizeof(encoded[0]), nbytes, f);
+}