encoding::utf8: improve decoder - hare - [hare] The Hare programming language

commit 081a021cc699985aeaf6abeac94a9a01679368bd
parent a2e9cfc788855bb1fb636e90b9ddbf19a87564f9
Author: Kirill Primak <vyivel@eclair.cafe>
Date:   Sun, 18 Sep 2022 15:39:47 +0300

encoding::utf8: improve decoder

This commit:
- introduces checks for surrogates;
- adds checks for continuation bytes' high order two bits in
  utf8::next();
- fixes possible out-of-bounds slice access in utf8::prev()
  (with e.g. [0xFF, 0xA0, 0xA0, 0xA0, 0xA0, 0xA0, 0xA0], scanning from
  the end);
- adds more invalid byte sequences for tests.

Additionally, `use types` statements are removed.

Signed-off-by: Kirill Primak <vyivel@eclair.cafe>

Diffstat:
M encoding/utf8/decode.ha  | 118 +++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------------
M encoding/utf8/rune.ha  | 3 ---

2 files changed, 82 insertions(+), 39 deletions(-)
diff --git a/encoding/utf8/decode.ha b/encoding/utf8/decode.ha
@@ -2,7 +2,6 @@
 // (c) 2021 Bor Grošelj Simić <bor.groseljsimic@telemach.net>
 // (c) 2021 Drew DeVault <sir@cmpwn.com>
 // (c) 2021 Eyal Sawady <ecs@d2evs.net>
-use types;
 
 fn toutf8(in: str) []u8 = *(&in: *[]u8);
 
@@ -26,6 +25,18 @@ export type more = void;
 // Returned when an invalid UTF-8 sequence was found.
 export type invalid = !void;
 
+const leader_masks: [_]u8 = [0x7F, 0x3F, 0x1F, 0x0F];
+
+fn decode_leader(c: u8) ((size, u8) | invalid) = {
+	for (let i = 0z; i < len(sizes); i += 1) {
+		if (c & sizes[i].mask == sizes[i].result) {
+			return (sizes[i].octets, c & leader_masks[i]);
+		};
+	};
+	// Bad leading byte
+	return invalid;
+};
+
 // Returns the next rune from a decoder. void is returned when there are no
 // remaining codepoints.
 export fn next(d: *decoder) (rune | void | more | invalid) = {
@@ -34,31 +45,32 @@ export fn next(d: *decoder) (rune | void | more | invalid) = {
 		return;
 	};
 
-	// XXX: It would be faster if we decoded and measured at the same time.
-	const n = match (utf8sz(d.src[d.offs])) {
-	case let z: size =>
-		yield z;
-	case void =>
-		return invalid;
-	};
+	const (n, leader) = decode_leader(d.src[d.offs])?;
 	if (d.offs + n > len(d.src)) {
 		return more;
 	};
-	let bytes = d.src[d.offs..d.offs+n];
-	d.offs += n;
 
-	let r = 0u32;
-	if (bytes[0] < 128) {
-		// ASCII
-		return bytes[0]: u32: rune;
-	};
-
-	const mask = masks[n - 1];
-	r = bytes[0] & mask;
-	for (let i = 1z; i < len(bytes); i += 1) {
-		r <<= 6;
-		r |= bytes[i] & 0x3F;
+	let r = leader: u32;
+	if (n > 1) {
+		for (let i = 1z; i < n; i += 1) {
+			let byte = d.src[d.offs + i];
+			if ((byte & 0xC0) != 0x80) {
+				// Bad continuation byte
+				return invalid;
+			};
+			r <<= 6;
+			r |= byte & 0x3F;
+		};
+		if (r >= 0xD800 && r <= 0xDFFF) {
+			// UTF-16 surrogates
+			return invalid;
+		};
+		if (runesz(r: rune) != n) {
+			// Overlong encoding
+			return invalid;
+		};
 	};
+	d.offs += n;
 	return r: rune;
 };
 
@@ -69,30 +81,44 @@ export fn prev(d: *decoder) (rune | void | more | invalid) = {
 		return;
 	};
 
-	let n = 0z;
 	let r = 0u32;
-
+	let n = 0z;
 	for (let i = 0z; i < d.offs; i += 1) {
-		if ((d.src[d.offs - i - 1] & 0xC0) == 0x80) {
-			let tmp: u32 = d.src[d.offs - i - 1] & 0x3F;
-			r |= tmp << (i * 6): u32;
+		let byte = d.src[d.offs - i - 1];
+		if ((byte & 0xC0) == 0x80) {
+			if (i == 3) {
+				// Too many continuation bytes in a row
+				return invalid;
+			};
+			byte &= 0x3F;
+			r |= byte << (i * 6): u32;
 		} else {
-			n = i + 1;
-			let tmp: u32 = d.src[d.offs - i - 1] & masks[i];
-			r |=  tmp << (i * 6): u32;
+			const nl = decode_leader(byte)?;
+			n = nl.0;
+			if (i + 1 != n) {
+				// Trailing continuation bytes
+				return invalid;
+			};
+			r |= nl.1 << (i * 6): u32;
 			break;
 		};
 	};
+
 	if (n == 0) {
 		return more;
+	} else if (n > 1) {
+		if (r >= 0xD800 && r <= 0xDFFF) {
+			// UTF-16 surrogates
+			return invalid;
+		};
+		if (runesz(r: rune) != n) {
+			// Overlong encoding
+			return invalid;
+		};
 	};
+
 	d.offs -= n;
-	match (utf8sz(d.src[d.offs])) {
-	case let z: size =>
-		return if (n == z) r: rune else invalid;
-	case void =>
-		return invalid;
-	};
+	return r: rune;
 };
 
 @test fn decode() void = {
@@ -122,7 +148,6 @@ export fn prev(d: *decoder) (rune | void | more | invalid) = {
 	};
 	assert(prev(&decoder) is void);
 
-	// TODO: Test more invalid sequences
 	const inv: [_]u8 = [0xA0, 0xA1];
 	decoder = decode(inv);
 	assert(next(&decoder) is invalid);
@@ -134,6 +159,27 @@ export fn prev(d: *decoder) (rune | void | more | invalid) = {
 	assert(next(&decoder) is more);
 	decoder.offs = 2;
 	assert(prev(&decoder) is invalid);
+
+	const surrogate: [_]u8 = [0xED, 0xA0, 0x80];
+	decoder = decode(surrogate);
+	assert(next(&decoder) is invalid);
+	decoder.offs = 3;
+	assert(prev(&decoder) is invalid);
+
+	const overlong: [_]u8 = [0xF0, 0x82, 0x82, 0xAC];
+	decoder = decode(overlong);
+	assert(next(&decoder) is invalid);
+	decoder.offs = 4;
+	assert(prev(&decoder) is invalid);
+
+	const badcont: [_]u8 = [0xC2, 0xFF];
+	decoder = decode(badcont);
+	assert(next(&decoder) is invalid);
+
+	const extracont: [_]u8 = [0xC2, 0xA3, 0x95];
+	decoder = decode(extracont);
+	decoder.offs = 3;
+	assert(prev(&decoder) is invalid);
 };
 
 // Returns true if a given string or byte slice contains only valid UTF-8
diff --git a/encoding/utf8/rune.ha b/encoding/utf8/rune.ha
@@ -1,8 +1,5 @@
 // License: MPL-2.0
 // (c) 2021 Drew DeVault <sir@cmpwn.com>
-use types;
-
-const masks: [_]u8 = [0x7F, 0x1F, 0x0F, 0x07, 0x03, 0x01];
 
 type rsize = struct {
 	mask: u8,

	hare [hare] The Hare programming language
	git clone https://git.torresjrjr.com/hare.git
	Log \| Files \| Refs \| README \| LICENSE

M	encoding/utf8/decode.ha	\|	118	+++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------------
M	encoding/utf8/rune.ha	\|	3	---