hare

[hare] The Hare programming language
git clone https://git.torresjrjr.com/hare.git
Log | Files | Refs | README | LICENSE

decode.ha (5769B)


      1 // SPDX-License-Identifier: MPL-2.0
      2 // (c) Hare authors <https://harelang.org>
      3 
      4 use bytes;
      5 
      6 export type decoder = struct {
      7 	offs: size,
      8 	src: []u8,
      9 };
     10 
     11 // Initializes a new UTF-8 decoder.
     12 export fn decode(src: []u8) decoder = decoder {
     13 	src = src,
     14 	offs = 0,
     15 };
     16 
     17 const masks: [2][8]u8 = [
     18 	[0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f],
     19 	[0x7f, 0x1f, 0x0f, 0x0f, 0x0f, 0x07, 0x07, 0x07],
     20 ];
     21 
     22 // Returns the next rune from a decoder. done is returned when there are no
     23 // remaining codepoints.
     24 export fn next(d: *decoder) (rune | done | more | invalid) = {
     25 	if (d.offs == len(d.src)) {
     26 		return done;
     27 	};
     28 
     29 	// from https://github.com/skeeto/scratch/blob/master/parsers/utf8_decode.c
     30 	// See https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
     31 	// and https://nullprogram.com/blog/2020/12/31/ for an explanation of
     32 	// the algorithm.
     33 	let next = 0, state = 0;
     34 	let r = 0u32;
     35 	for (d.offs < len(d.src); d.offs += 1) {
     36 		next = table[state][d.src[d.offs]];
     37 		r = r << 6 | d.src[d.offs] & masks[(state - 1): uint >> 31][next & 7];
     38 		if (next <= 0) {
     39 			d.offs += 1;
     40 			return if (next == 0) r: rune else invalid;
     41 		};
     42 		state = next;
     43 	};
     44 	return more;
     45 };
     46 
     47 // Returns the previous rune from a decoder. done is returned when there are no
     48 // previous codepoints.
     49 export fn prev(d: *decoder) (rune | done | more | invalid) = {
     50 	if (d.offs == 0) {
     51 		return done;
     52 	};
     53 	let n = d.offs;
     54 	d.offs -= 1;
     55 	for (d.offs < len(d.src); d.offs -= 1) {
     56 		if (table[0][d.src[d.offs]] != -1) {
     57 			let t = d.offs;
     58 			defer d.offs = t;
     59 			let r = next(d);
     60 			return if (n != d.offs || r is more) invalid else r;
     61 		};
     62 		if (n - d.offs == 4) {
     63 			// Too many continuation bytes in a row
     64 			return invalid;
     65 		};
     66 	};
     67 	return more;
     68 };
     69 
     70 // Returns a subslice from the next byte in the decoder to the end of the slice.
     71 export fn remaining(d: *decoder) []u8 = d.src[d.offs..];
     72 
     73 // Return a subslice from the position of the first decoder to the position of
     74 // the second decoder. The decoders must originate from the same slice and the
     75 // position of the second decoder must not be before the position of the first
     76 // one.
     77 export fn slice(begin: *decoder, end: *decoder) []u8 = {
     78 	assert(begin.src: *[*]u8 == end.src: *[*]u8 && begin.offs <= end.offs);
     79 	return begin.src[begin.offs..end.offs];
     80 };
     81 
     82 @test fn decode() void = {
     83 	const input: [_]u8 = [
     84 		0xE3, 0x81, 0x93, 0xE3, 0x82, 0x93, 0xE3, 0x81,
     85 		0xAB, 0xE3, 0x81, 0xA1, 0xE3, 0x81, 0xAF, 0x00,
     86 	];
     87 	assert(validate(input) is void);
     88 	const expected = ['こ', 'ん', 'に', 'ち', 'は', '\0'];
     89 	let decoder = decode(input);
     90 	for (let i = 0z; i < len(expected); i += 1) {
     91 		match (next(&decoder)) {
     92 		case (invalid | more | done) =>
     93 			abort();
     94 		case let r: rune =>
     95 			assert(r == expected[i]);
     96 		};
     97 	};
     98 	assert(next(&decoder) is done);
     99 	assert(decoder.offs == len(decoder.src));
    100 	for (let i = 0z; i < len(expected); i += 1) {
    101 		match (prev(&decoder)) {
    102 		case (invalid | more | done) =>
    103 			abort();
    104 		case let r: rune =>
    105 			assert(r == expected[len(expected) - i - 1]);
    106 		};
    107 	};
    108 	assert(prev(&decoder) is done);
    109 
    110 	const inv: [_]u8 = [0xA0, 0xA1];
    111 	decoder = decode(inv);
    112 	assert(next(&decoder) is invalid);
    113 	decoder.offs = 2;
    114 	assert(prev(&decoder) is more);
    115 	assert(validate(inv) is invalid);
    116 
    117 	const incomplete: [_]u8 = [0xE3, 0x81];
    118 	decoder = decode(incomplete);
    119 	assert(next(&decoder) is more);
    120 	decoder.offs = 2;
    121 	assert(prev(&decoder) is invalid);
    122 	assert(validate(incomplete) is invalid);
    123 
    124 	const surrogate: [_]u8 = [0xED, 0xA0, 0x80];
    125 	decoder = decode(surrogate);
    126 	assert(next(&decoder) is invalid);
    127 	decoder.offs = 3;
    128 	assert(prev(&decoder) is invalid);
    129 	assert(validate(surrogate) is invalid);
    130 
    131 	const overlong: [_]u8 = [0xF0, 0x82, 0x82, 0xAC];
    132 	decoder = decode(overlong);
    133 	assert(next(&decoder) is invalid);
    134 	decoder.offs = 4;
    135 	assert(prev(&decoder) is invalid);
    136 	assert(validate(overlong) is invalid);
    137 
    138 	const badcont: [_]u8 = [0xC2, 0xFF];
    139 	decoder = decode(badcont);
    140 	assert(next(&decoder) is invalid);
    141 	assert(validate(badcont) is invalid);
    142 
    143 	const extracont: [_]u8 = [0xC2, 0xA3, 0x95];
    144 	decoder = decode(extracont);
    145 	decoder.offs = 3;
    146 	assert(prev(&decoder) is invalid);
    147 	assert(validate(extracont) is invalid);
    148 	const maxinrange: [_]u8 = [0xF4, 0x8F, 0xBF, 0xBF];
    149 	decoder = decode(maxinrange);
    150 	match (next(&decoder)) {
    151 	case let r: rune =>
    152 		assert(r == 0x10FFFFu32: rune);
    153 	case => abort();
    154 	};
    155 	decoder.offs = 4;
    156 	match (prev(&decoder)) {
    157 	case let r: rune =>
    158 		assert(r == 0x10FFFFu32: rune);
    159 	case => abort();
    160 	};
    161 
    162 	const minoutofrange: [_]u8 = [0xF5, 0x94, 0x80, 0x80];
    163 	decoder = decode(minoutofrange);
    164 	assert(next(&decoder) is invalid);
    165 	decoder.offs = 4;
    166 	assert(prev(&decoder) is invalid);
    167 };
    168 
    169 @test fn slice() void = {
    170 	const input: [_]u8 = [
    171 		0xE3, 0x81, 0x93, 0xE3, 0x82, 0x93, 0xE3, 0x81,
    172 		0xAB, 0xE3, 0x81, 0xA1, 0xE3, 0x81, 0xAF, 0x00,
    173 	];
    174 	let d1 = decode(input);
    175 	let d2 = d1;
    176 	assert(bytes::equal(remaining(&d1), input));
    177 	assert(len(slice(&d1, &d2)) == 0 && len(slice(&d2, &d1)) == 0);
    178 	for (let i = 0; i < 2; i += 1) {
    179 		next(&d1)!;
    180 		next(&d2)!;
    181 	};
    182 	assert(bytes::equal(remaining(&d1), input[6..]));
    183 	assert(len(slice(&d1, &d2)) == 0 && len(slice(&d2, &d1)) == 0);
    184 	for (let i = 0; i < 3; i += 1) {
    185 		next(&d2)!;
    186 	};
    187 	assert(bytes::equal(remaining(&d2), input[15..]));
    188 	assert(bytes::equal(slice(&d1, &d2), input[6..15]));
    189 	for (let i = 0; i < 3; i += 1) {
    190 		next(&d1)!;
    191 	};
    192 	assert(len(slice(&d1, &d2)) == 0 && len(slice(&d2, &d1)) == 0);
    193 	next(&d1)!;
    194 	assert(len(remaining(&d1)) == 0);
    195 };
    196 
    197 // Returns void if a given byte slice contains only valid UTF-8 sequences,
    198 // otherwise returns invalid.
    199 export fn validate(src: []u8) (void | invalid) = {
    200 	let state = 0;
    201 	for (let i = 0z; i < len(src) && state >= 0; i += 1) {
    202 		state = table[state][src[i]];
    203 	};
    204 	return if (state == 0) void else invalid;
    205 };