hare

[hare] The Hare programming language
git clone https://git.torresjrjr.com/hare.git
Log | Files | Refs | README | LICENSE

decode.ha (4375B)


      1 // SPDX-License-Identifier: MPL-2.0
      2 // (c) Hare authors <https://harelang.org>
      3 
      4 export type decoder = struct {
      5 	offs: size,
      6 	src: []u8,
      7 };
      8 
      9 // Initializes a new UTF-8 decoder.
     10 export fn decode(src: []u8) decoder = decoder {
     11 	src = src,
     12 	offs = 0,
     13 };
     14 
     15 const masks: [2][8]u8 = [
     16 	[0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f],
     17 	[0x7f, 0x1f, 0x0f, 0x0f, 0x0f, 0x07, 0x07, 0x07],
     18 ];
     19 
     20 // Returns the next rune from a decoder. void is returned when there are no
     21 // remaining codepoints.
     22 export fn next(d: *decoder) (rune | void | more | invalid) = {
     23 	if (d.offs == len(d.src)) {
     24 		return;
     25 	};
     26 
     27 	// from https://github.com/skeeto/scratch/blob/master/parsers/utf8_decode.c
     28 	// See https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
     29 	// and https://nullprogram.com/blog/2020/12/31/ for an explanation of
     30 	// the algorithm.
     31 	let next = 0, state = 0;
     32 	let r = 0u32;
     33 	for (d.offs < len(d.src); d.offs += 1) {
     34 		next = table[state][d.src[d.offs]];
     35 		r = r << 6 | d.src[d.offs] & masks[(state - 1): uint >> 31][next & 7];
     36 		if (next <= 0) {
     37 			d.offs += 1;
     38 			return if (next == 0) r: rune else invalid;
     39 		};
     40 		state = next;
     41 	};
     42 	return more;
     43 };
     44 
     45 // Returns the previous rune from a decoder. void is returned when there are no
     46 // previous codepoints.
     47 export fn prev(d: *decoder) (rune | void | more | invalid) = {
     48 	if (d.offs == 0) {
     49 		return;
     50 	};
     51 	let n = d.offs;
     52 	d.offs -= 1;
     53 	for (d.offs < len(d.src); d.offs -= 1) {
     54 		if (table[0][d.src[d.offs]] != -1) {
     55 			let t = d.offs;
     56 			defer d.offs = t;
     57 			let r = next(d);
     58 			return if (n != d.offs || r is more) invalid else r;
     59 		};
     60 		if (n - d.offs == 4) {
     61 			// Too many continuation bytes in a row
     62 			return invalid;
     63 		};
     64 	};
     65 	return more;
     66 };
     67 
     68 @test fn decode() void = {
     69 	const input: [_]u8 = [
     70 		0xE3, 0x81, 0x93, 0xE3, 0x82, 0x93, 0xE3, 0x81,
     71 		0xAB, 0xE3, 0x81, 0xA1, 0xE3, 0x81, 0xAF, 0x00,
     72 	];
     73 	assert(validate(input) is void);
     74 	const expected = ['こ', 'ん', 'に', 'ち', 'は', '\0'];
     75 	let decoder = decode(input);
     76 	for (let i = 0z; i < len(expected); i += 1) {
     77 		match (next(&decoder)) {
     78 		case (invalid | more | void) =>
     79 			abort();
     80 		case let r: rune =>
     81 			assert(r == expected[i]);
     82 		};
     83 	};
     84 	assert(next(&decoder) is void);
     85 	assert(decoder.offs == len(decoder.src));
     86 	for (let i = 0z; i < len(expected); i += 1) {
     87 		match (prev(&decoder)) {
     88 		case (invalid | more | void) =>
     89 			abort();
     90 		case let r: rune =>
     91 			assert(r == expected[len(expected) - i - 1]);
     92 		};
     93 	};
     94 	assert(prev(&decoder) is void);
     95 
     96 	const inv: [_]u8 = [0xA0, 0xA1];
     97 	decoder = decode(inv);
     98 	assert(next(&decoder) is invalid);
     99 	decoder.offs = 2;
    100 	assert(prev(&decoder) is more);
    101 	assert(validate(inv) is invalid);
    102 
    103 	const incomplete: [_]u8 = [0xE3, 0x81];
    104 	decoder = decode(incomplete);
    105 	assert(next(&decoder) is more);
    106 	decoder.offs = 2;
    107 	assert(prev(&decoder) is invalid);
    108 	assert(validate(incomplete) is invalid);
    109 
    110 	const surrogate: [_]u8 = [0xED, 0xA0, 0x80];
    111 	decoder = decode(surrogate);
    112 	assert(next(&decoder) is invalid);
    113 	decoder.offs = 3;
    114 	assert(prev(&decoder) is invalid);
    115 	assert(validate(surrogate) is invalid);
    116 
    117 	const overlong: [_]u8 = [0xF0, 0x82, 0x82, 0xAC];
    118 	decoder = decode(overlong);
    119 	assert(next(&decoder) is invalid);
    120 	decoder.offs = 4;
    121 	assert(prev(&decoder) is invalid);
    122 	assert(validate(overlong) is invalid);
    123 
    124 	const badcont: [_]u8 = [0xC2, 0xFF];
    125 	decoder = decode(badcont);
    126 	assert(next(&decoder) is invalid);
    127 	assert(validate(badcont) is invalid);
    128 
    129 	const extracont: [_]u8 = [0xC2, 0xA3, 0x95];
    130 	decoder = decode(extracont);
    131 	decoder.offs = 3;
    132 	assert(prev(&decoder) is invalid);
    133 	assert(validate(extracont) is invalid);
    134 	const maxinrange: [_]u8 = [0xF4, 0x8F, 0xBF, 0xBF];
    135 	decoder = decode(maxinrange);
    136 	match (next(&decoder)) {
    137 	case let r: rune =>
    138 		assert(r == 0x10FFFFu32: rune);
    139 	case => abort();
    140 	};
    141 	decoder.offs = 4;
    142 	match (prev(&decoder)) {
    143 	case let r: rune =>
    144 		assert(r == 0x10FFFFu32: rune);
    145 	case => abort();
    146 	};
    147 
    148 	const minoutofrange: [_]u8 = [0xF5, 0x94, 0x80, 0x80];
    149 	decoder = decode(minoutofrange);
    150 	assert(next(&decoder) is invalid);
    151 	decoder.offs = 4;
    152 	assert(prev(&decoder) is invalid);
    153 };
    154 
    155 // Returns void if a given byte slice contains only valid UTF-8 sequences,
    156 // otherwise returns invalid.
    157 export fn validate(src: []u8) (void | invalid) = {
    158 	let state = 0;
    159 	for (let i = 0z; i < len(src) && state >= 0; i += 1) {
    160 		state = table[state][src[i]];
    161 	};
    162 	return if (state == 0) void else invalid;
    163 };