utf8.c (2007B)
1 #include <stdint.h> 2 #include <stdio.h> 3 #include "utf8.h" 4 5 uint8_t masks[] = { 6 0x7F, 7 0x1F, 8 0x0F, 9 0x07, 10 0x03, 11 0x01 12 }; 13 14 struct { 15 uint8_t mask; 16 uint8_t result; 17 int octets; 18 } sizes[] = { 19 { 0x80, 0x00, 1 }, 20 { 0xE0, 0xC0, 2 }, 21 { 0xF0, 0xE0, 3 }, 22 { 0xF8, 0xF0, 4 }, 23 { 0xFC, 0xF8, 5 }, 24 { 0xFE, 0xFC, 6 }, 25 { 0x80, 0x80, -1 }, 26 }; 27 28 size_t 29 utf8_cpsize(uint32_t ch) 30 { 31 if (ch < 0x80) { 32 return 1; 33 } else if (ch < 0x800) { 34 return 2; 35 } else if (ch < 0x10000) { 36 return 3; 37 } 38 return 4; 39 } 40 41 uint32_t 42 utf8_decode(const char **char_str) 43 { 44 uint8_t **s = (uint8_t **)char_str; 45 46 uint32_t cp = 0; 47 if (**s < 128) { 48 // shortcut 49 cp = **s; 50 ++*s; 51 return cp; 52 } 53 int size = utf8_size((char *)*s); 54 if (size == -1) { 55 ++*s; 56 return UTF8_INVALID; 57 } 58 uint8_t mask = masks[size - 1]; 59 cp = **s & mask; 60 ++*s; 61 while (--size) { 62 uint8_t c = **s; 63 64 ++*s; 65 66 if ((c >> 6) != 0x02) 67 return UTF8_INVALID; 68 69 cp <<= 6; 70 cp |= c & 0x3f; 71 } 72 return cp; 73 } 74 75 size_t 76 utf8_encode(char *str, uint32_t ch) 77 { 78 size_t len = 0; 79 uint8_t first; 80 81 if (ch < 0x80) { 82 first = 0; 83 len = 1; 84 } else if (ch < 0x800) { 85 first = 0xc0; 86 len = 2; 87 } else if (ch < 0x10000) { 88 first = 0xe0; 89 len = 3; 90 } else { 91 first = 0xf0; 92 len = 4; 93 } 94 95 for (size_t i = len - 1; i > 0; --i) { 96 str[i] = (ch & 0x3f) | 0x80; 97 ch >>= 6; 98 } 99 100 str[0] = ch | first; 101 return len; 102 } 103 104 int 105 utf8_size(const char *s) 106 { 107 uint8_t c = (uint8_t)*s; 108 for (size_t i = 0; i < sizeof(sizes) / sizeof(sizes[0]); ++i) { 109 if ((c & sizes[i].mask) == sizes[i].result) { 110 return sizes[i].octets; 111 } 112 } 113 return -1; 114 } 115 116 uint32_t 117 utf8_get(FILE *f) 118 { 119 char buffer[UTF8_MAX_SIZE]; 120 int c = fgetc(f); 121 if (c == EOF) { 122 return UTF8_INVALID; 123 } 124 buffer[0] = (char)c; 125 int size = utf8_size(buffer); 126 127 if (size > UTF8_MAX_SIZE) { 128 fseek(f, size - 1, SEEK_CUR); 129 return UTF8_INVALID; 130 } 131 132 if (size > 1) { 133 int amt = fread(&buffer[1], 1, size - 1, f); 134 if (amt != size - 1) { 135 return UTF8_INVALID; 136 } 137 } 138 const char *ptr = buffer; 139 return utf8_decode(&ptr); 140 }