decode.ha (4375B)
1 // SPDX-License-Identifier: MPL-2.0 2 // (c) Hare authors <https://harelang.org> 3 4 export type decoder = struct { 5 offs: size, 6 src: []u8, 7 }; 8 9 // Initializes a new UTF-8 decoder. 10 export fn decode(src: []u8) decoder = decoder { 11 src = src, 12 offs = 0, 13 }; 14 15 const masks: [2][8]u8 = [ 16 [0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f], 17 [0x7f, 0x1f, 0x0f, 0x0f, 0x0f, 0x07, 0x07, 0x07], 18 ]; 19 20 // Returns the next rune from a decoder. void is returned when there are no 21 // remaining codepoints. 22 export fn next(d: *decoder) (rune | void | more | invalid) = { 23 if (d.offs == len(d.src)) { 24 return; 25 }; 26 27 // from https://github.com/skeeto/scratch/blob/master/parsers/utf8_decode.c 28 // See https://bjoern.hoehrmann.de/utf-8/decoder/dfa/ 29 // and https://nullprogram.com/blog/2020/12/31/ for an explanation of 30 // the algorithm. 31 let next = 0, state = 0; 32 let r = 0u32; 33 for (d.offs < len(d.src); d.offs += 1) { 34 next = table[state][d.src[d.offs]]; 35 r = r << 6 | d.src[d.offs] & masks[(state - 1): uint >> 31][next & 7]; 36 if (next <= 0) { 37 d.offs += 1; 38 return if (next == 0) r: rune else invalid; 39 }; 40 state = next; 41 }; 42 return more; 43 }; 44 45 // Returns the previous rune from a decoder. void is returned when there are no 46 // previous codepoints. 47 export fn prev(d: *decoder) (rune | void | more | invalid) = { 48 if (d.offs == 0) { 49 return; 50 }; 51 let n = d.offs; 52 d.offs -= 1; 53 for (d.offs < len(d.src); d.offs -= 1) { 54 if (table[0][d.src[d.offs]] != -1) { 55 let t = d.offs; 56 defer d.offs = t; 57 let r = next(d); 58 return if (n != d.offs || r is more) invalid else r; 59 }; 60 if (n - d.offs == 4) { 61 // Too many continuation bytes in a row 62 return invalid; 63 }; 64 }; 65 return more; 66 }; 67 68 @test fn decode() void = { 69 const input: [_]u8 = [ 70 0xE3, 0x81, 0x93, 0xE3, 0x82, 0x93, 0xE3, 0x81, 71 0xAB, 0xE3, 0x81, 0xA1, 0xE3, 0x81, 0xAF, 0x00, 72 ]; 73 assert(validate(input) is void); 74 const expected = ['こ', 'ん', 'に', 'ち', 'は', '\0']; 75 let decoder = decode(input); 76 for (let i = 0z; i < len(expected); i += 1) { 77 match (next(&decoder)) { 78 case (invalid | more | void) => 79 abort(); 80 case let r: rune => 81 assert(r == expected[i]); 82 }; 83 }; 84 assert(next(&decoder) is void); 85 assert(decoder.offs == len(decoder.src)); 86 for (let i = 0z; i < len(expected); i += 1) { 87 match (prev(&decoder)) { 88 case (invalid | more | void) => 89 abort(); 90 case let r: rune => 91 assert(r == expected[len(expected) - i - 1]); 92 }; 93 }; 94 assert(prev(&decoder) is void); 95 96 const inv: [_]u8 = [0xA0, 0xA1]; 97 decoder = decode(inv); 98 assert(next(&decoder) is invalid); 99 decoder.offs = 2; 100 assert(prev(&decoder) is more); 101 assert(validate(inv) is invalid); 102 103 const incomplete: [_]u8 = [0xE3, 0x81]; 104 decoder = decode(incomplete); 105 assert(next(&decoder) is more); 106 decoder.offs = 2; 107 assert(prev(&decoder) is invalid); 108 assert(validate(incomplete) is invalid); 109 110 const surrogate: [_]u8 = [0xED, 0xA0, 0x80]; 111 decoder = decode(surrogate); 112 assert(next(&decoder) is invalid); 113 decoder.offs = 3; 114 assert(prev(&decoder) is invalid); 115 assert(validate(surrogate) is invalid); 116 117 const overlong: [_]u8 = [0xF0, 0x82, 0x82, 0xAC]; 118 decoder = decode(overlong); 119 assert(next(&decoder) is invalid); 120 decoder.offs = 4; 121 assert(prev(&decoder) is invalid); 122 assert(validate(overlong) is invalid); 123 124 const badcont: [_]u8 = [0xC2, 0xFF]; 125 decoder = decode(badcont); 126 assert(next(&decoder) is invalid); 127 assert(validate(badcont) is invalid); 128 129 const extracont: [_]u8 = [0xC2, 0xA3, 0x95]; 130 decoder = decode(extracont); 131 decoder.offs = 3; 132 assert(prev(&decoder) is invalid); 133 assert(validate(extracont) is invalid); 134 const maxinrange: [_]u8 = [0xF4, 0x8F, 0xBF, 0xBF]; 135 decoder = decode(maxinrange); 136 match (next(&decoder)) { 137 case let r: rune => 138 assert(r == 0x10FFFFu32: rune); 139 case => abort(); 140 }; 141 decoder.offs = 4; 142 match (prev(&decoder)) { 143 case let r: rune => 144 assert(r == 0x10FFFFu32: rune); 145 case => abort(); 146 }; 147 148 const minoutofrange: [_]u8 = [0xF5, 0x94, 0x80, 0x80]; 149 decoder = decode(minoutofrange); 150 assert(next(&decoder) is invalid); 151 decoder.offs = 4; 152 assert(prev(&decoder) is invalid); 153 }; 154 155 // Returns void if a given byte slice contains only valid UTF-8 sequences, 156 // otherwise returns invalid. 157 export fn validate(src: []u8) (void | invalid) = { 158 let state = 0; 159 for (let i = 0z; i < len(src) && state >= 0; i += 1) { 160 state = table[state][src[i]]; 161 }; 162 return if (state == 0) void else invalid; 163 };