lex: support UTF-8 - harec - Unnamed repository; edit this file 'description' to name the repository.

commit 576fe8c851fb6eaa01b37c214752aa83e6cd86ad
parent 2890510b07d458352206d97a82a8769ad434a829
Author: Eyal Sawady <ecs@d2evs.net>
Date:   Sat, 24 Oct 2020 15:28:04 -0400

lex: support UTF-8

Diffstat:
M configure  | 3 ++-
M include/lex.h  | 4 ++--
A include/utf8.h  | 40 ++++++++++++++++++++++++++++++++++++++++
M src/lex.c  | 74 ++++++++++++++++++++++++++++++++++++++++----------------------------------
M src/main.c  | 1 +
A src/utf8.c  | 143 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

6 files changed, 228 insertions(+), 37 deletions(-)
diff --git a/configure b/configure
@@ -5,7 +5,8 @@ eval ". $srcdir/config.sh"
 harec() {
 	genrules harec \
 		src/lex.c \
-		src/main.c
+		src/main.c \
+		src/utf8.c
 }
 
 all="harec"
diff --git a/include/lex.h b/include/lex.h
@@ -121,12 +121,12 @@ struct lexer {
 	FILE *in;
 	char *buf;
 	size_t bufsz, buflen;
-	int c;
+	uint32_t c;
 };
 
 void lex_init(struct lexer *lexer, FILE *f);
 void lex_finish(struct lexer *lexer);
-int lex(struct lexer *lexer, struct token *out);
+uint32_t lex(struct lexer *lexer, struct token *out);
 
 void token_finish(struct token *tok);
 const char *token_str(const struct token *tok);
diff --git a/include/utf8.h b/include/utf8.h
@@ -0,0 +1,40 @@
+#ifndef HAREC_UTF8_H
+#define HAREC_UTF8_H
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+
+// UTF-8 as first specified allowed sequences of up to 6 bytes, but every
+// Unicode code point (at most U+10FFFF) fits in 4.
+#define UTF8_MAX_SIZE 4
+
+// Returned when no character could be produced (end of input or a malformed
+// sequence). Deliberately outside the Unicode range: 0x80 is itself a valid
+// code point (U+0080) and must not double as the error value.
+#define UTF8_INVALID UINT32_MAX
+
+/**
+ * Decodes the UTF-8 sequence at *str, advances *str past it and returns the
+ * code point. If the first byte cannot start a sequence, *str advances by one
+ * byte and UTF8_INVALID is returned.
+ */
+uint32_t utf8_decode(const char **str);
+
+/**
+ * Writes the UTF-8 encoding of ch to str and returns the number of bytes
+ * written (1 to UTF8_MAX_SIZE). str must have room for UTF8_MAX_SIZE bytes;
+ * no NUL terminator is written.
+ */
+size_t utf8_encode(char *str, uint32_t ch);
+
+/**
+ * Returns the length in bytes of the sequence introduced by the byte at str,
+ * judged from that byte alone, or -1 if it cannot start a sequence. The result
+ * may exceed UTF8_MAX_SIZE for the legacy long forms.
+ */
+int utf8_size(const char *str);
+
+/**
+ * Returns the number of bytes (1 to 4) needed to encode ch as UTF-8.
+ */
+size_t utf8_chsize(uint32_t ch);
+
+/**
+ * Reads one UTF-8 character from f and returns its code point. Returns
+ * UTF8_INVALID at end of file, on a short read, or when the sequence is
+ * malformed or longer than UTF8_MAX_SIZE.
+ */
+uint32_t utf8_fgetch(FILE *f);
+
+/**
+ * Writes ch to f as UTF-8 and returns the number of bytes written.
+ */
+size_t utf8_fputch(FILE *f, uint32_t ch);
+
+#endif
diff --git a/src/lex.c b/src/lex.c
@@ -5,6 +5,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include "lex.h"
+#include "utf8.h"
 
 static const char *tokens[] = {
 	// Must be alpha sorted
@@ -120,34 +121,37 @@ lex_finish(struct lexer *lexer)
 	free(lexer->buf);
 }
 
-static int
+// Returns the next code point: the pushed-back character held in lexer->c if
+// one is pending (0 means none), otherwise a fresh one read from lexer->in.
+// When `buffer` is true and the result is not UTF8_INVALID, the character's
+// UTF-8 encoding is appended to lexer->buf, which is kept NUL-terminated.
+static uint32_t
 next(struct lexer *lexer, bool buffer)
 {
-	int c;
+	uint32_t c;
 	if (lexer->c != 0) {
 		c = lexer->c;
 		lexer->c = 0;
 	} else {
-		c = fgetc(lexer->in);
+		c = utf8_fgetch(lexer->in);
 	}
-	if (c == EOF || !buffer) {
+	if (c == UTF8_INVALID || !buffer) {
 		return c;
 	}
-	if (lexer->buflen + 1 >= lexer->bufsz) {
+	// `>=` rather than `>` keeps one byte free for the terminating NUL.
+	if (lexer->buflen + utf8_chsize(c) >= lexer->bufsz) {
 		lexer->bufsz *= 2;
 		lexer->buf = realloc(lexer->buf, lexer->bufsz);
 		assert(lexer->buf);
 	}
-	lexer->buf[lexer->buflen++] = c;
+	// Re-encode the code point so the buffer holds the token's UTF-8 bytes.
+	char buf[UTF8_MAX_SIZE];
+	size_t sz = utf8_encode(&buf[0], c);
+	memcpy(lexer->buf + lexer->buflen, buf, sz);
+	lexer->buflen += sz;
 	lexer->buf[lexer->buflen] = '\0';
 	return c;
 }
 
-static int
+// Skips whitespace and returns the first non-whitespace code point, or
+// UTF8_INVALID once the input is exhausted. Nothing is added to the token
+// buffer.
+static uint32_t
 wgetc(struct lexer *lexer)
 {
-	int c;
-	while ((c = next(lexer, false)) != EOF && isspace(c)) ;
+	uint32_t c;
+	// <ctype.h> functions are only defined for values representable as
+	// unsigned char (or EOF), so restrict the test to ASCII exactly as
+	// lex() does for isalpha/isdigit.
+	while ((c = next(lexer, false)) != UTF8_INVALID
+			&& c <= 0x7F && isspace(c)) ;
 	return c;
 }
 
@@ -159,16 +163,18 @@ consume(struct lexer *lexer, ssize_t n)
 		lexer->buf[0] = 0;
 		return;
 	}
-	memmove(lexer->buf, &lexer->buf[lexer->buflen], lexer->buflen - n);
-	lexer->buflen -= n;
+	for (ssize_t i = 0; i < n; i++) {
+		while ((lexer->buf[--lexer->buflen] & 0xC0) == 0x80) ;
+	}
+	lexer->buf[lexer->buflen] = 0;
 }
 
+// Stores c in lexer->c so that the following next() call returns it again.
+// With `buffer` set, consume(lexer, 1) is also called; presumably this drops
+// the character's bytes from the token buffer again (consume's guard clause
+// is not visible in this hunk -- confirm).
 static void
-push(struct lexer *lexer, int c, bool buffer)
+push(struct lexer *lexer, uint32_t c, bool buffer)
 {
 	lexer->c = c;
 	if (buffer) {
-		lexer->buf[--lexer->buflen] = 0;
+		consume(lexer, 1);
 	}
 }
 
@@ -178,13 +184,13 @@ cmp_keyword(const void *va, const void *vb)
 	return strcmp(*(const char **)va, *(const char **)vb);
 }
 
-static int
+static uint32_t
 lex_name(struct lexer *lexer, struct token *out)
 {
-	int c = next(lexer, true);
-	assert(c != EOF && (isalpha(c) || c == '_'));
-	while ((c = next(lexer, true)) != EOF) {
-		if (!isalnum(c) && c != '_') {
+	uint32_t c = next(lexer, true);
+	assert(c != UTF8_INVALID && c <= 0x7F && (isalpha(c) || c == '_'));
+	while ((c = next(lexer, true)) != UTF8_INVALID) {
+		if (c > 0x7F || (!isalnum(c) && c != '_')) {
 			push(lexer, c, true);
 			goto lookup;
 		}
@@ -205,11 +211,11 @@ lookup:;
 	return c;
 }
 
-static int
+static uint32_t
 lex_literal(struct lexer *lexer, struct token *out)
 {
-	int c = next(lexer, true);
-	assert(c != EOF && isdigit(c));
+	uint32_t c = next(lexer, true);
+	assert(c != UTF8_INVALID && c <= 0x7F && isdigit(c));
 
 	const char *base = "0123456789";
 	switch ((c = next(lexer, true))) {
@@ -229,7 +235,7 @@ lex_literal(struct lexer *lexer, struct token *out)
 
 	char *suff = NULL;
 	bool isfloat = false, isexp = false, issuff = false;
-	while ((c = next(lexer, true)) != EOF) {
+	while ((c = next(lexer, true)) != UTF8_INVALID) {
 		if (!strchr(base, c)) {
 			switch (c) {
 			case '.':
@@ -297,10 +303,10 @@ lex_string(struct lexer *lexer, struct token *out)
 	assert(0); // TODO
 }
 
-static int
-lex3(struct lexer *lexer, struct token *out, int c)
+static uint32_t
+lex3(struct lexer *lexer, struct token *out, uint32_t c)
 {
-	assert(c != EOF);
+	assert(c != UTF8_INVALID);
 
 	switch (c) {
 	case '.':
@@ -373,10 +379,10 @@ lex3(struct lexer *lexer, struct token *out, int c)
 	return c;
 }
 
-static int
-lex2(struct lexer *lexer, struct token *out, int c)
+static uint32_t
+lex2(struct lexer *lexer, struct token *out, uint32_t c)
 {
-	assert(c != EOF);
+	assert(c != UTF8_INVALID);
 
 	switch (c) {
 	case '^':
@@ -418,7 +424,7 @@ lex2(struct lexer *lexer, struct token *out, int c)
 			out->token = T_DIVEQ;
 			break;
 		case '/':
-			while ((c = next(lexer, false)) != EOF && c != '\n') ;
+			while ((c = next(lexer, false)) != UTF8_INVALID && c != '\n') ;
 			return lex(lexer, out);
 		default:
 			push(lexer, c, false);
@@ -522,21 +528,21 @@ lex2(struct lexer *lexer, struct token *out, int c)
 	return c;
 }
 
-int
+uint32_t
 lex(struct lexer *lexer, struct token *out)
 {
-	int c = wgetc(lexer);
-	if (c == EOF) {
+	uint32_t c = wgetc(lexer);
+	if (c == UTF8_INVALID) {
 		out->token = T_EOF;
 		return c;
 	}
 
-	if (isalpha(c) || c == '_') {
+	if (c <= 0x7F && (isalpha(c) || c == '_')) {
 		push(lexer, c, false);
 		return lex_name(lexer, out);
 	}
 
-	if (isdigit(c)) {
+	if (c <= 0x7F && isdigit(c)) {
 		push(lexer, c, false);
 		return lex_literal(lexer, out);
 	}
diff --git a/src/main.c b/src/main.c
@@ -1,5 +1,6 @@
 #include <stdio.h>
 #include "lex.h"
+#include "utf8.h"
 
 int
 main(int argc, char *argv[])
diff --git a/src/utf8.c b/src/utf8.c
@@ -0,0 +1,143 @@
+#include <stdint.h>
+#include <stdio.h>
+#include "utf8.h"
+
+// Payload-bit masks for the leading byte of a sequence, indexed by
+// (sequence length - 1): a 1-byte sequence keeps 7 bits of its first byte,
+// a 2-byte sequence 5 bits, and so on. File-local and read-only.
+static const uint8_t masks[] = {
+	0x7F,
+	0x1F,
+	0x0F,
+	0x07,
+	0x03,
+	0x01
+};
+
+// Leading-byte classification table. A byte b introduces a sequence of
+// `octets` bytes when (b & mask) == result. utf8_size tries the entries in
+// order; the final entry matches every remaining byte with the high bit set
+// (continuation bytes, 0xFE, 0xFF), none of which can start a sequence.
+static const struct {
+	uint8_t mask;
+	uint8_t result;
+	int octets;
+} sizes[] = {
+	{ 0x80, 0x00, 1 },	// 0xxxxxxx
+	{ 0xE0, 0xC0, 2 },	// 110xxxxx
+	{ 0xF0, 0xE0, 3 },	// 1110xxxx
+	{ 0xF8, 0xF0, 4 },	// 11110xxx
+	{ 0xFC, 0xF8, 5 },	// 111110xx
+	{ 0xFE, 0xFC, 6 },	// 1111110x
+	{ 0x80, 0x80, -1 },
+};
+
+// Returns how many bytes the UTF-8 encoding of ch occupies (1 to 4).
+size_t
+utf8_chsize(uint32_t ch)
+{
+	// First code point that no longer fits in 1, 2 and 3 bytes.
+	static const uint32_t limits[] = { 0x80, 0x800, 0x10000 };
+	size_t extra = 0;
+	while (extra < sizeof(limits) / sizeof(limits[0])
+			&& ch >= limits[extra]) {
+		++extra;
+	}
+	return extra + 1;
+}
+
+// Decodes the UTF-8 sequence at *char_str and returns its code point,
+// advancing *char_str past the bytes consumed.
+//
+// Returns UTF8_INVALID when the first byte cannot start a sequence (the
+// pointer then advances by one byte) or when an expected continuation byte
+// is missing (the pointer is left on the offending byte, so a NUL terminator
+// is never skipped over).
+uint32_t
+utf8_decode(const char **char_str)
+{
+	const uint8_t *s = (const uint8_t *)*char_str;
+
+	if (*s < 0x80) {
+		// Fast path: ASCII encodes as itself.
+		*char_str += 1;
+		return *s;
+	}
+	int size = utf8_size((const char *)s);
+	if (size == -1) {
+		*char_str += 1;
+		return UTF8_INVALID;
+	}
+	// The leading byte carries the high-order payload bits.
+	uint32_t cp = *s++ & masks[size - 1];
+	while (--size) {
+		if ((*s & 0xC0) != 0x80) {
+			// Truncated sequence: stop before reading any further.
+			*char_str = (const char *)s;
+			return UTF8_INVALID;
+		}
+		// Each continuation byte (10xxxxxx) adds six more bits.
+		cp = (cp << 6) | (uint32_t)(*s++ & 0x3F);
+	}
+	*char_str = (const char *)s;
+	return cp;
+}
+
+// Writes the UTF-8 encoding of ch into str and returns the number of bytes
+// written (1 to 4). No NUL terminator is appended.
+size_t
+utf8_encode(char *str, uint32_t ch)
+{
+	// Marker bits of the leading byte, indexed by (length - 1).
+	static const uint8_t lead[] = { 0x00, 0xc0, 0xe0, 0xf0 };
+	size_t len;
+
+	if (ch >= 0x10000) {
+		len = 4;
+	} else if (ch >= 0x800) {
+		len = 3;
+	} else if (ch >= 0x80) {
+		len = 2;
+	} else {
+		len = 1;
+	}
+
+	// Fill the continuation bytes back to front, six payload bits each.
+	char *p = str + len;
+	while (--p > str) {
+		*p = (ch & 0x3f) | 0x80;
+		ch >>= 6;
+	}
+
+	// Whatever is left of ch belongs in the leading byte.
+	*p = ch | lead[len - 1];
+	return len;
+}
+
+// Returns the length in bytes of the sequence introduced by the byte at s,
+// judged from that byte alone, or -1 if the byte cannot start a sequence.
+int
+utf8_size(const char *s)
+{
+	uint8_t c = (uint8_t)*s;
+	// Bound the scan by the element count of the table, not its byte size.
+	for (size_t i = 0; i < sizeof(sizes) / sizeof(sizes[0]); ++i) {
+		if ((c & sizes[i].mask) == sizes[i].result) {
+			return sizes[i].octets;
+		}
+	}
+	return -1;
+}
+
+// Reads one UTF-8 encoded character from f and returns its code point.
+// Returns UTF8_INVALID at end of file, on a short read, when the first byte
+// cannot start a sequence, or when the sequence is longer than
+// UTF8_MAX_SIZE.
+uint32_t
+utf8_fgetch(FILE *f)
+{
+	char buffer[UTF8_MAX_SIZE];
+	int c = fgetc(f);
+	if (c == EOF) {
+		return UTF8_INVALID;
+	}
+	buffer[0] = (char)c;
+	int size = utf8_size(buffer);
+	if (size < 0) {
+		// Not a sequence start (e.g. a stray continuation byte).
+		return UTF8_INVALID;
+	}
+
+	if (size > UTF8_MAX_SIZE) {
+		// Legacy long form: discard its remaining bytes. Reading them,
+		// rather than seeking, also works on pipes and terminals.
+		for (int i = 1; i < size; ++i) {
+			if (fgetc(f) == EOF) {
+				break;
+			}
+		}
+		return UTF8_INVALID;
+	}
+
+	if (size > 1) {
+		size_t want = (size_t)size - 1;
+		if (fread(&buffer[1], 1, want, f) != want) {
+			return UTF8_INVALID;
+		}
+	}
+	const char *ptr = buffer;
+	return utf8_decode(&ptr);
+}
+
+// Encodes ch as UTF-8, writes it to f and returns the number of bytes that
+// fwrite reports as written.
+size_t
+utf8_fputch(FILE *f, uint32_t ch)
+{
+	char encoded[UTF8_MAX_SIZE];
+	const size_t len = utf8_encode(encoded, ch);
+	return fwrite(encoded, 1, len, f);
+}

	harec Unnamed repository; edit this file 'description' to name the repository.
	Log \| Files \| Refs \| README \| LICENSE

M	configure	\|	3	++-
M	include/lex.h	\|	4	++--
A	include/utf8.h	\|	40	++++++++++++++++++++++++++++++++++++++++
M	src/lex.c	\|	74	++++++++++++++++++++++++++++++++++++++++----------------------------------
M	src/main.c	\|	1	+
A	src/utf8.c	\|	143	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++