harec

[hare] Hare compiler, written in C11 for POSIX OSs
Log | Files | Refs | README | LICENSE

utf8.c (2007B)


      1 #include <stdint.h>
      2 #include <stdio.h>
      3 #include "utf8.h"
      4 
      5 uint8_t masks[] = {
      6 	0x7F,
      7 	0x1F,
      8 	0x0F,
      9 	0x07,
     10 	0x03,
     11 	0x01
     12 };
     13 
     14 struct {
     15 	uint8_t mask;
     16 	uint8_t result;
     17 	int octets;
     18 } sizes[] = {
     19 	{ 0x80, 0x00, 1 },
     20 	{ 0xE0, 0xC0, 2 },
     21 	{ 0xF0, 0xE0, 3 },
     22 	{ 0xF8, 0xF0, 4 },
     23 	{ 0xFC, 0xF8, 5 },
     24 	{ 0xFE, 0xFC, 6 },
     25 	{ 0x80, 0x80, -1 },
     26 };
     27 
     28 size_t
     29 utf8_cpsize(uint32_t ch)
     30 {
     31 	if (ch < 0x80) {
     32 		return 1;
     33 	} else if (ch < 0x800) {
     34 		return 2;
     35 	} else if (ch < 0x10000) {
     36 		return 3;
     37 	}
     38 	return 4;
     39 }
     40 
     41 uint32_t
     42 utf8_decode(const char **char_str)
     43 {
     44 	uint8_t **s = (uint8_t **)char_str;
     45 
     46 	uint32_t cp = 0;
     47 	if (**s < 128) {
     48 		// shortcut
     49 		cp = **s;
     50 		++*s;
     51 		return cp;
     52 	}
     53 	int size = utf8_size((char *)*s);
     54 	if (size == -1) {
     55 		++*s;
     56 		return UTF8_INVALID;
     57 	}
     58 	uint8_t mask = masks[size - 1];
     59 	cp = **s & mask;
     60 	++*s;
     61 	while (--size) {
     62 		uint8_t c = **s;
     63 
     64 		++*s;
     65 
     66 		if ((c >> 6) != 0x02)
     67 			return UTF8_INVALID;
     68 
     69 		cp <<= 6;
     70 		cp |= c & 0x3f;
     71 	}
     72 	return cp;
     73 }
     74 
     75 size_t
     76 utf8_encode(char *str, uint32_t ch)
     77 {
     78 	size_t len = 0;
     79 	uint8_t first;
     80 
     81 	if (ch < 0x80) {
     82 		first = 0;
     83 		len = 1;
     84 	} else if (ch < 0x800) {
     85 		first = 0xc0;
     86 		len = 2;
     87 	} else if (ch < 0x10000) {
     88 		first = 0xe0;
     89 		len = 3;
     90 	} else {
     91 		first = 0xf0;
     92 		len = 4;
     93 	}
     94 
     95 	for (size_t i = len - 1; i > 0; --i) {
     96 		str[i] = (ch & 0x3f) | 0x80;
     97 		ch >>= 6;
     98 	}
     99 
    100 	str[0] = ch | first;
    101 	return len;
    102 }
    103 
    104 int
    105 utf8_size(const char *s)
    106 {
    107 	uint8_t c = (uint8_t)*s;
    108 	for (size_t i = 0; i < sizeof(sizes) / sizeof(sizes[0]); ++i) {
    109 		if ((c & sizes[i].mask) == sizes[i].result) {
    110 			return sizes[i].octets;
    111 		}
    112 	}
    113 	return -1;
    114 }
    115 
    116 uint32_t
    117 utf8_get(FILE *f)
    118 {
    119 	char buffer[UTF8_MAX_SIZE];
    120 	int c = fgetc(f);
    121 	if (c == EOF) {
    122 		return UTF8_INVALID;
    123 	}
    124 	buffer[0] = (char)c;
    125 	int size = utf8_size(buffer);
    126 
    127 	if (size > UTF8_MAX_SIZE) {
    128 		fseek(f, size - 1, SEEK_CUR);
    129 		return UTF8_INVALID;
    130 	}
    131 
    132 	if (size > 1) {
    133 		int amt = fread(&buffer[1], 1, size - 1, f);
    134 		if (amt != size - 1) {
    135 			return UTF8_INVALID;
    136 		}
    137 	}
    138 	const char *ptr = buffer;
    139 	return utf8_decode(&ptr);
    140 }