decode.ha (5769B)
1 // SPDX-License-Identifier: MPL-2.0 2 // (c) Hare authors <https://harelang.org> 3 4 use bytes; 5 6 export type decoder = struct { 7 offs: size, 8 src: []u8, 9 }; 10 11 // Initializes a new UTF-8 decoder. 12 export fn decode(src: []u8) decoder = decoder { 13 src = src, 14 offs = 0, 15 }; 16 17 const masks: [2][8]u8 = [ 18 [0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f], 19 [0x7f, 0x1f, 0x0f, 0x0f, 0x0f, 0x07, 0x07, 0x07], 20 ]; 21 22 // Returns the next rune from a decoder. done is returned when there are no 23 // remaining codepoints. 24 export fn next(d: *decoder) (rune | done | more | invalid) = { 25 if (d.offs == len(d.src)) { 26 return done; 27 }; 28 29 // from https://github.com/skeeto/scratch/blob/master/parsers/utf8_decode.c 30 // See https://bjoern.hoehrmann.de/utf-8/decoder/dfa/ 31 // and https://nullprogram.com/blog/2020/12/31/ for an explanation of 32 // the algorithm. 33 let next = 0, state = 0; 34 let r = 0u32; 35 for (d.offs < len(d.src); d.offs += 1) { 36 next = table[state][d.src[d.offs]]; 37 r = r << 6 | d.src[d.offs] & masks[(state - 1): uint >> 31][next & 7]; 38 if (next <= 0) { 39 d.offs += 1; 40 return if (next == 0) r: rune else invalid; 41 }; 42 state = next; 43 }; 44 return more; 45 }; 46 47 // Returns the previous rune from a decoder. done is returned when there are no 48 // previous codepoints. 49 export fn prev(d: *decoder) (rune | done | more | invalid) = { 50 if (d.offs == 0) { 51 return done; 52 }; 53 let n = d.offs; 54 d.offs -= 1; 55 for (d.offs < len(d.src); d.offs -= 1) { 56 if (table[0][d.src[d.offs]] != -1) { 57 let t = d.offs; 58 defer d.offs = t; 59 let r = next(d); 60 return if (n != d.offs || r is more) invalid else r; 61 }; 62 if (n - d.offs == 4) { 63 // Too many continuation bytes in a row 64 return invalid; 65 }; 66 }; 67 return more; 68 }; 69 70 // Returns a subslice from the next byte in the decoder to the end of the slice. 71 export fn remaining(d: *decoder) []u8 = d.src[d.offs..]; 72 73 // Return a subslice from the position of the first decoder to the position of 74 // the second decoder. The decoders must originate from the same slice and the 75 // position of the second decoder must not be before the position of the first 76 // one. 77 export fn slice(begin: *decoder, end: *decoder) []u8 = { 78 assert(begin.src: *[*]u8 == end.src: *[*]u8 && begin.offs <= end.offs); 79 return begin.src[begin.offs..end.offs]; 80 }; 81 82 @test fn decode() void = { 83 const input: [_]u8 = [ 84 0xE3, 0x81, 0x93, 0xE3, 0x82, 0x93, 0xE3, 0x81, 85 0xAB, 0xE3, 0x81, 0xA1, 0xE3, 0x81, 0xAF, 0x00, 86 ]; 87 assert(validate(input) is void); 88 const expected = ['こ', 'ん', 'に', 'ち', 'は', '\0']; 89 let decoder = decode(input); 90 for (let i = 0z; i < len(expected); i += 1) { 91 match (next(&decoder)) { 92 case (invalid | more | done) => 93 abort(); 94 case let r: rune => 95 assert(r == expected[i]); 96 }; 97 }; 98 assert(next(&decoder) is done); 99 assert(decoder.offs == len(decoder.src)); 100 for (let i = 0z; i < len(expected); i += 1) { 101 match (prev(&decoder)) { 102 case (invalid | more | done) => 103 abort(); 104 case let r: rune => 105 assert(r == expected[len(expected) - i - 1]); 106 }; 107 }; 108 assert(prev(&decoder) is done); 109 110 const inv: [_]u8 = [0xA0, 0xA1]; 111 decoder = decode(inv); 112 assert(next(&decoder) is invalid); 113 decoder.offs = 2; 114 assert(prev(&decoder) is more); 115 assert(validate(inv) is invalid); 116 117 const incomplete: [_]u8 = [0xE3, 0x81]; 118 decoder = decode(incomplete); 119 assert(next(&decoder) is more); 120 decoder.offs = 2; 121 assert(prev(&decoder) is invalid); 122 assert(validate(incomplete) is invalid); 123 124 const surrogate: [_]u8 = [0xED, 0xA0, 0x80]; 125 decoder = decode(surrogate); 126 assert(next(&decoder) is invalid); 127 decoder.offs = 3; 128 assert(prev(&decoder) is invalid); 129 assert(validate(surrogate) is invalid); 130 131 const overlong: [_]u8 = [0xF0, 0x82, 0x82, 0xAC]; 132 decoder = decode(overlong); 133 assert(next(&decoder) is invalid); 134 decoder.offs = 4; 135 assert(prev(&decoder) is invalid); 136 assert(validate(overlong) is invalid); 137 138 const badcont: [_]u8 = [0xC2, 0xFF]; 139 decoder = decode(badcont); 140 assert(next(&decoder) is invalid); 141 assert(validate(badcont) is invalid); 142 143 const extracont: [_]u8 = [0xC2, 0xA3, 0x95]; 144 decoder = decode(extracont); 145 decoder.offs = 3; 146 assert(prev(&decoder) is invalid); 147 assert(validate(extracont) is invalid); 148 const maxinrange: [_]u8 = [0xF4, 0x8F, 0xBF, 0xBF]; 149 decoder = decode(maxinrange); 150 match (next(&decoder)) { 151 case let r: rune => 152 assert(r == 0x10FFFFu32: rune); 153 case => abort(); 154 }; 155 decoder.offs = 4; 156 match (prev(&decoder)) { 157 case let r: rune => 158 assert(r == 0x10FFFFu32: rune); 159 case => abort(); 160 }; 161 162 const minoutofrange: [_]u8 = [0xF5, 0x94, 0x80, 0x80]; 163 decoder = decode(minoutofrange); 164 assert(next(&decoder) is invalid); 165 decoder.offs = 4; 166 assert(prev(&decoder) is invalid); 167 }; 168 169 @test fn slice() void = { 170 const input: [_]u8 = [ 171 0xE3, 0x81, 0x93, 0xE3, 0x82, 0x93, 0xE3, 0x81, 172 0xAB, 0xE3, 0x81, 0xA1, 0xE3, 0x81, 0xAF, 0x00, 173 ]; 174 let d1 = decode(input); 175 let d2 = d1; 176 assert(bytes::equal(remaining(&d1), input)); 177 assert(len(slice(&d1, &d2)) == 0 && len(slice(&d2, &d1)) == 0); 178 for (let i = 0; i < 2; i += 1) { 179 next(&d1)!; 180 next(&d2)!; 181 }; 182 assert(bytes::equal(remaining(&d1), input[6..])); 183 assert(len(slice(&d1, &d2)) == 0 && len(slice(&d2, &d1)) == 0); 184 for (let i = 0; i < 3; i += 1) { 185 next(&d2)!; 186 }; 187 assert(bytes::equal(remaining(&d2), input[15..])); 188 assert(bytes::equal(slice(&d1, &d2), input[6..15])); 189 for (let i = 0; i < 3; i += 1) { 190 next(&d1)!; 191 }; 192 assert(len(slice(&d1, &d2)) == 0 && len(slice(&d2, &d1)) == 0); 193 next(&d1)!; 194 assert(len(remaining(&d1)) == 0); 195 }; 196 197 // Returns void if a given byte slice contains only valid UTF-8 sequences, 198 // otherwise returns invalid. 199 export fn validate(src: []u8) (void | invalid) = { 200 let state = 0; 201 for (let i = 0z; i < len(src) && state >= 0; i += 1) { 202 state = table[state][src[i]]; 203 }; 204 return if (state == 0) void else invalid; 205 };