harec

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | README | LICENSE

commit 576fe8c851fb6eaa01b37c214752aa83e6cd86ad
parent 2890510b07d458352206d97a82a8769ad434a829
Author: Eyal Sawady <ecs@d2evs.net>
Date:   Sat, 24 Oct 2020 15:28:04 -0400

lex: support UTF-8

Diffstat:
Mconfigure | 3++-
Minclude/lex.h | 4++--
Ainclude/utf8.h | 40++++++++++++++++++++++++++++++++++++++++
Msrc/lex.c | 74++++++++++++++++++++++++++++++++++++++++----------------------------------
Msrc/main.c | 1+
Asrc/utf8.c | 143+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
6 files changed, 228 insertions(+), 37 deletions(-)

/* Commit diff: switches the lexer from int characters read with fgetc to uint32_t code points read with utf8_fgetch, and adds include/utf8.h and src/utf8.c. Block comments in this text are review annotations; the diff content itself is unchanged. */ diff --git a/configure b/configure @@ -5,7 +5,8 @@ eval ". $srcdir/config.sh" harec() { genrules harec \ src/lex.c \ - src/main.c + src/main.c \ + src/utf8.c } all="harec" diff --git a/include/lex.h b/include/lex.h @@ -121,12 +121,12 @@ struct lexer { FILE *in; char *buf; size_t bufsz, buflen; - int c; + uint32_t c; }; void lex_init(struct lexer *lexer, FILE *f); void lex_finish(struct lexer *lexer); -int lex(struct lexer *lexer, struct token *out); +uint32_t lex(struct lexer *lexer, struct token *out); void token_finish(struct token *tok); const char *token_str(const struct token *tok); diff --git a/include/utf8.h b/include/utf8.h @@ -0,0 +1,40 @@ /* NOTE(review): this header uses uint32_t, size_t and FILE but has no #include of its own, so every includer must first include headers that define them - confirm that is intended. */ +#ifndef HAREC_UTF8_H +#define HAREC_UTF8_H + +// Technically UTF-8 supports up to 6 byte codepoints, but Unicode itself +// doesn't really bother with more than 4. +#define UTF8_MAX_SIZE 4 + /* NOTE(review): utf8_decode also yields 0x80 for the two-byte sequence C2 80 (U+0080), so callers cannot tell that character from this sentinel - confirm. */ +#define UTF8_INVALID 0x80 + +/** + * Grabs the next UTF-8 character and advances the string pointer + */ +uint32_t utf8_decode(const char **str); + +/** + * Encodes a character as UTF-8 and returns the length of that character. + */ +size_t utf8_encode(char *str, uint32_t ch); + +/** + * Returns the size of the next UTF-8 character + */ +int utf8_size(const char *str); + +/** + * Returns the size of a UTF-8 character + */ +size_t utf8_chsize(uint32_t ch); + +/** + * Reads and returns the next character from the file. + */ +uint32_t utf8_fgetch(FILE *f); + +/** + * Writes this character to the file and returns the number of bytes written. 
+ */ +size_t utf8_fputch(FILE *f, uint32_t ch); + +#endif diff --git a/src/lex.c b/src/lex.c @@ -5,6 +5,7 @@ #include <stdlib.h> #include <string.h> #include "lex.h" +#include "utf8.h" static const char *tokens[] = { // Must be alpha sorted @@ -120,34 +121,37 @@ lex_finish(struct lexer *lexer) free(lexer->buf); } /* next: returns the pushed-back character when lexer->c is non-zero, otherwise reads one code point with utf8_fgetch; when buffer is true the character is re-encoded as UTF-8 and appended to lexer->buf, which is then NUL-terminated. NOTE(review): the buffer is doubled at most once per call - presumably bufsz is always large enough for one doubling to cover a 4-byte append; verify in lex_init. */ -static int +static uint32_t next(struct lexer *lexer, bool buffer) { - int c; + uint32_t c; if (lexer->c != 0) { c = lexer->c; lexer->c = 0; } else { - c = fgetc(lexer->in); + c = utf8_fgetch(lexer->in); } - if (c == EOF || !buffer) { + if (c == UTF8_INVALID || !buffer) { return c; } - if (lexer->buflen + 1 >= lexer->bufsz) { + if (lexer->buflen + utf8_chsize(c) >= lexer->bufsz) { lexer->bufsz *= 2; lexer->buf = realloc(lexer->buf, lexer->bufsz); assert(lexer->buf); } - lexer->buf[lexer->buflen++] = c; + char buf[UTF8_MAX_SIZE]; + size_t sz = utf8_encode(&buf[0], c); + memcpy(lexer->buf + lexer->buflen, buf, sz); + lexer->buflen += sz; lexer->buf[lexer->buflen] = '\0'; return c; } /* wgetc: skips unbuffered characters for which isspace is true and returns the first other character, or UTF8_INVALID. NOTE(review): isspace receives code points above 0x7F here without the c <= 0x7F guard that lex and lex_name apply before their ctype calls - confirm that is safe. */ -static int +static uint32_t wgetc(struct lexer *lexer) { - int c; - while ((c = next(lexer, false)) != EOF && isspace(c)) ; + uint32_t c; + while ((c = next(lexer, false)) != UTF8_INVALID && isspace(c)) ; return c; } @@ -159,16 +163,18 @@ consume(struct lexer *lexer, ssize_t n) lexer->buf[0] = 0; return; } /* New tail of consume: for each of the n characters, buflen is stepped back past any 10xxxxxx continuation bytes to the lead byte of the last buffered character, then the buffer is NUL-terminated at that position. The start of consume lies outside this hunk. */ - memmove(lexer->buf, &lexer->buf[lexer->buflen], lexer->buflen - n); - lexer->buflen -= n; + for (ssize_t i = 0; i < n; i++) { + while ((lexer->buf[--lexer->buflen] & 0xC0) == 0x80) ; + } + lexer->buf[lexer->buflen] = 0; } /* push: stores c as the pending character that the following call to next will return; when buffer is true the last buffered character is removed with consume(lexer, 1). */ static void -push(struct lexer *lexer, int c, bool buffer) +push(struct lexer *lexer, uint32_t c, bool buffer) { lexer->c = c; if (buffer) { - lexer->buf[--lexer->buflen] = 0; + consume(lexer, 1); } } @@ -178,13 +184,13 @@ cmp_keyword(const void *va, const void *vb) return strcmp(*(const char **)va, *(const char **)vb); } -static int +static uint32_t lex_name(struct lexer *lexer, struct token *out) { - int c = next(lexer, true); - assert(c != EOF && 
(isalpha(c) || c == '_')); - while ((c = next(lexer, true)) != EOF) { - if (!isalnum(c) && c != '_') { + uint32_t c = next(lexer, true); + assert(c != UTF8_INVALID && c <= 0x7F && (isalpha(c) || c == '_')); + while ((c = next(lexer, true)) != UTF8_INVALID) { + if (c > 0x7F || (!isalnum(c) && c != '_')) { push(lexer, c, true); goto lookup; } @@ -205,11 +211,11 @@ lookup:; return c; } -static int +static uint32_t lex_literal(struct lexer *lexer, struct token *out) { - int c = next(lexer, true); - assert(c != EOF && isdigit(c)); + uint32_t c = next(lexer, true); + assert(c != UTF8_INVALID && c <= 0x7F && isdigit(c)); const char *base = "0123456789"; switch ((c = next(lexer, true))) { @@ -229,7 +235,7 @@ lex_literal(struct lexer *lexer, struct token *out) char *suff = NULL; bool isfloat = false, isexp = false, issuff = false; - while ((c = next(lexer, true)) != EOF) { + while ((c = next(lexer, true)) != UTF8_INVALID) { if (!strchr(base, c)) { switch (c) { case '.': @@ -297,10 +303,10 @@ lex_string(struct lexer *lexer, struct token *out) assert(0); // TODO } -static int -lex3(struct lexer *lexer, struct token *out, int c) +static uint32_t +lex3(struct lexer *lexer, struct token *out, uint32_t c) { - assert(c != EOF); + assert(c != UTF8_INVALID); switch (c) { case '.': @@ -373,10 +379,10 @@ lex3(struct lexer *lexer, struct token *out, int c) return c; } -static int -lex2(struct lexer *lexer, struct token *out, int c) +static uint32_t +lex2(struct lexer *lexer, struct token *out, uint32_t c) { - assert(c != EOF); + assert(c != UTF8_INVALID); switch (c) { case '^': @@ -418,7 +424,7 @@ lex2(struct lexer *lexer, struct token *out, int c) out->token = T_DIVEQ; break; case '/': - while ((c = next(lexer, false)) != EOF && c != '\n') ; + while ((c = next(lexer, false)) != UTF8_INVALID && c != '\n') ; return lex(lexer, out); default: push(lexer, c, false); @@ -522,21 +528,21 @@ lex2(struct lexer *lexer, struct token *out, int c) return c; } /* lex: a UTF8_INVALID result from wgetc is reported as T_EOF, so end of file and undecodable input are not distinguished at this point. */ -int +uint32_t lex(struct lexer *lexer, 
struct token *out) { - int c = wgetc(lexer); - if (c == EOF) { + uint32_t c = wgetc(lexer); + if (c == UTF8_INVALID) { out->token = T_EOF; return c; } - if (isalpha(c) || c == '_') { + if (c <= 0x7F && (isalpha(c) || c == '_')) { push(lexer, c, false); return lex_name(lexer, out); } - if (isdigit(c)) { + if (c <= 0x7F && isdigit(c)) { push(lexer, c, false); return lex_literal(lexer, out); } diff --git a/src/main.c b/src/main.c @@ -1,5 +1,6 @@ #include <stdio.h> #include "lex.h" +#include "utf8.h" int main(int argc, char *argv[]) diff --git a/src/utf8.c b/src/utf8.c @@ -0,0 +1,143 @@ +#include <stdint.h> +#include <stdio.h> +#include "utf8.h" + /* masks[size - 1] selects the payload bits of the lead byte of a size-octet sequence; it is read by utf8_decode. */ +uint8_t masks[] = { + 0x7F, + 0x1F, + 0x0F, + 0x07, + 0x03, + 0x01 +}; + /* sizes: utf8_size returns octets from the first row where (byte & mask) == result; the final row maps any remaining byte with the top bit set to -1. NOTE(review): the 6-octet row repeats result 0xF8 from the 5-octet row, so it can never be the first match - presumably 0xFC was meant; confirm. */ +struct { + uint8_t mask; + uint8_t result; + int octets; +} sizes[] = { + { 0x80, 0x00, 1 }, + { 0xE0, 0xC0, 2 }, + { 0xF0, 0xE0, 3 }, + { 0xF8, 0xF0, 4 }, + { 0xFC, 0xF8, 5 }, + { 0xFE, 0xF8, 6 }, + { 0x80, 0x80, -1 }, +}; + /* utf8_chsize: number of octets utf8_encode emits for ch - 1 below 0x80, 2 below 0x800, 3 below 0x10000, otherwise 4. */ +size_t +utf8_chsize(uint32_t ch) +{ + if (ch < 0x80) { + return 1; + } else if (ch < 0x800) { + return 2; + } else if (ch < 0x10000) { + return 3; + } + return 4; +} + /* utf8_decode: decodes one code point at *char_str and advances the pointer past it; a byte for which utf8_size returns -1 is skipped and reported as UTF8_INVALID. NOTE(review): trailing bytes are merged without checking for the 10xxxxxx form or for a terminating NUL - confirm callers guarantee well-formed input. */ +uint32_t +utf8_decode(const char **char_str) +{ + uint8_t **s = (uint8_t **)char_str; + + uint32_t cp = 0; + if (**s < 128) { + // shortcut + cp = **s; + ++*s; + return cp; + } + int size = utf8_size((char *)*s); + if (size == -1) { + ++*s; + return UTF8_INVALID; + } + uint8_t mask = masks[size - 1]; + cp = **s & mask; + ++*s; + while (--size) { + cp <<= 6; + cp |= **s & 0x3f; + ++*s; + } + return cp; +} + /* utf8_encode: writes ch to str as 1 to 4 octets, filling the continuation bytes from last to first, and returns the length written; no NUL terminator is added, so str needs room for up to UTF8_MAX_SIZE bytes. */ +size_t +utf8_encode(char *str, uint32_t ch) +{ + size_t len = 0; + uint8_t first; + + if (ch < 0x80) { + first = 0; + len = 1; + } else if (ch < 0x800) { + first = 0xc0; + len = 2; + } else if (ch < 0x10000) { + first = 0xe0; + len = 3; + } else { + first = 0xf0; + len = 4; + } + + for (size_t i = len - 1; i > 0; --i) { + str[i] = (ch & 0x3f) | 0x80; + ch >>= 6; + } + + str[0] = ch | first; + return len; +} + /* utf8_size: classifies the lead byte at s against the sizes table and returns the sequence length, or -1 for a byte that only the last table row matches. NOTE(review): sizeof(sizes) / 2 is a byte count divided by two, not the number of rows; the loop still returns in range because the first and last rows together match every byte value - confirm and consider an element count. */ +int +utf8_size(const char *s) +{ + uint8_t c = 
(uint8_t)*s; + for (size_t i = 0; i < sizeof(sizes) / 2; ++i) { + if ((c & sizes[i].mask) == sizes[i].result) { + return sizes[i].octets; + } + } + return -1; +} + /* utf8_fgetch: reads one lead byte from f, then the remaining size - 1 bytes, and decodes them with utf8_decode. Returns UTF8_INVALID at EOF, on a short read, or when the lead byte announces more than UTF8_MAX_SIZE octets (after seeking past the rest). A lead byte with size -1 skips both branches and utf8_decode returns UTF8_INVALID for it. */ +uint32_t +utf8_fgetch(FILE *f) +{ + char buffer[UTF8_MAX_SIZE]; + int c = fgetc(f); + if (c == EOF) { + return UTF8_INVALID; + } + buffer[0] = (char)c; + int size = utf8_size(buffer); + + if (size > UTF8_MAX_SIZE) { + fseek(f, size - 1, SEEK_CUR); + return UTF8_INVALID; + } + + if (size > 1) { + int amt = fread(&buffer[1], 1, size - 1, f); + if (amt != size - 1) { + return UTF8_INVALID; + } + } + const char *ptr = buffer; + return utf8_decode(&ptr); +} + /* utf8_fputch: encodes ch into a local UTF8_MAX_SIZE buffer with utf8_encode and writes it to f with fwrite, returning the byte count fwrite reports. */ +size_t +utf8_fputch(FILE *f, uint32_t ch) +{ + char buffer[UTF8_MAX_SIZE]; + char *ptr = buffer; + size_t size = utf8_encode(ptr, ch); + return fwrite(&buffer, 1, size, f); +}