harec

[hare] Hare compiler, written in C11 for POSIX OSs
Log | Files | Refs | README | LICENSE

utf8.c (1924B)


      1 #include <stdint.h>
      2 #include <stdio.h>
      3 #include "utf8.h"
      4 
      5 uint8_t masks[] = {
      6 	0x7F,
      7 	0x1F,
      8 	0x0F,
      9 	0x07,
     10 	0x03,
     11 	0x01
     12 };
     13 
     14 struct {
     15 	uint8_t mask;
     16 	uint8_t result;
     17 	int octets;
     18 } sizes[] = {
     19 	{ 0x80, 0x00, 1 },
     20 	{ 0xE0, 0xC0, 2 },
     21 	{ 0xF0, 0xE0, 3 },
     22 	{ 0xF8, 0xF0, 4 },
     23 	{ 0xFC, 0xF8, 5 },
     24 	{ 0xFE, 0xF8, 6 },
     25 	{ 0x80, 0x80, -1 },
     26 };
     27 
     28 size_t
     29 utf8_cpsize(uint32_t ch)
     30 {
     31 	if (ch < 0x80) {
     32 		return 1;
     33 	} else if (ch < 0x800) {
     34 		return 2;
     35 	} else if (ch < 0x10000) {
     36 		return 3;
     37 	}
     38 	return 4;
     39 }
     40 
     41 uint32_t
     42 utf8_decode(const char **char_str)
     43 {
     44 	uint8_t **s = (uint8_t **)char_str;
     45 
     46 	uint32_t cp = 0;
     47 	if (**s < 128) {
     48 		// shortcut
     49 		cp = **s;
     50 		++*s;
     51 		return cp;
     52 	}
     53 	int size = utf8_size((char *)*s);
     54 	if (size == -1) {
     55 		++*s;
     56 		return UTF8_INVALID;
     57 	}
     58 	uint8_t mask = masks[size - 1];
     59 	cp = **s & mask;
     60 	++*s;
     61 	while (--size) {
     62 		cp <<= 6;
     63 		cp |= **s & 0x3f;
     64 		++*s;
     65 	}
     66 	return cp;
     67 }
     68 
     69 size_t
     70 utf8_encode(char *str, uint32_t ch)
     71 {
     72 	size_t len = 0;
     73 	uint8_t first;
     74 
     75 	if (ch < 0x80) {
     76 		first = 0;
     77 		len = 1;
     78 	} else if (ch < 0x800) {
     79 		first = 0xc0;
     80 		len = 2;
     81 	} else if (ch < 0x10000) {
     82 		first = 0xe0;
     83 		len = 3;
     84 	} else {
     85 		first = 0xf0;
     86 		len = 4;
     87 	}
     88 
     89 	for (size_t i = len - 1; i > 0; --i) {
     90 		str[i] = (ch & 0x3f) | 0x80;
     91 		ch >>= 6;
     92 	}
     93 
     94 	str[0] = ch | first;
     95 	return len;
     96 }
     97 
     98 int
     99 utf8_size(const char *s)
    100 {
    101 	uint8_t c = (uint8_t)*s;
    102 	for (size_t i = 0; i < sizeof(sizes) / 2; ++i) {
    103 		if ((c & sizes[i].mask) == sizes[i].result) {
    104 			return sizes[i].octets;
    105 		}
    106 	}
    107 	return -1;
    108 }
    109 
    110 uint32_t
    111 utf8_get(FILE *f)
    112 {
    113 	char buffer[UTF8_MAX_SIZE];
    114 	int c = fgetc(f);
    115 	if (c == EOF) {
    116 		return UTF8_INVALID;
    117 	}
    118 	buffer[0] = (char)c;
    119 	int size = utf8_size(buffer);
    120 
    121 	if (size > UTF8_MAX_SIZE) {
    122 		fseek(f, size - 1, SEEK_CUR);
    123 		return UTF8_INVALID;
    124 	}
    125 
    126 	if (size > 1) {
    127 		int amt = fread(&buffer[1], 1, size - 1, f);
    128 		if (amt != size - 1) {
    129 			return UTF8_INVALID;
    130 		}
    131 	}
    132 	const char *ptr = buffer;
    133 	return utf8_decode(&ptr);
    134 }