commit 081a021cc699985aeaf6abeac94a9a01679368bd
parent a2e9cfc788855bb1fb636e90b9ddbf19a87564f9
Author: Kirill Primak <vyivel@eclair.cafe>
Date: Sun, 18 Sep 2022 15:39:47 +0300
encoding::utf8: improve decoder
This commit:
- introduces checks for surrogates;
- adds checks for continuation bytes' high order two bits in
utf8::next();
- fixes possible out-of-bounds slice access in utf8::prev()
(with e.g. [0xFF, 0xA0, 0xA0, 0xA0, 0xA0, 0xA0, 0xA0], scanning from
the end);
- adds more invalid byte sequences for tests.
Additionally, `use types` statements are removed.
Signed-off-by: Kirill Primak <vyivel@eclair.cafe>
Diffstat:
2 files changed, 82 insertions(+), 39 deletions(-)
diff --git a/encoding/utf8/decode.ha b/encoding/utf8/decode.ha
@@ -2,7 +2,6 @@
// (c) 2021 Bor Grošelj Simić <bor.groseljsimic@telemach.net>
// (c) 2021 Drew DeVault <sir@cmpwn.com>
// (c) 2021 Eyal Sawady <ecs@d2evs.net>
-use types;
fn toutf8(in: str) []u8 = *(&in: *[]u8);
@@ -26,6 +25,18 @@ export type more = void;
// Returned when an invalid UTF-8 sequence was found.
export type invalid = !void;
+const leader_masks: [_]u8 = [0x7F, 0x3F, 0x1F, 0x0F];
+
+fn decode_leader(c: u8) ((size, u8) | invalid) = {
+ for (let i = 0z; i < len(sizes); i += 1) {
+ if (c & sizes[i].mask == sizes[i].result) {
+ return (sizes[i].octets, c & leader_masks[i]);
+ };
+ };
+ // Bad leading byte
+ return invalid;
+};
+
// Returns the next rune from a decoder. void is returned when there are no
// remaining codepoints.
export fn next(d: *decoder) (rune | void | more | invalid) = {
@@ -34,31 +45,32 @@ export fn next(d: *decoder) (rune | void | more | invalid) = {
return;
};
- // XXX: It would be faster if we decoded and measured at the same time.
- const n = match (utf8sz(d.src[d.offs])) {
- case let z: size =>
- yield z;
- case void =>
- return invalid;
- };
+ const (n, leader) = decode_leader(d.src[d.offs])?;
if (d.offs + n > len(d.src)) {
return more;
};
- let bytes = d.src[d.offs..d.offs+n];
- d.offs += n;
- let r = 0u32;
- if (bytes[0] < 128) {
- // ASCII
- return bytes[0]: u32: rune;
- };
-
- const mask = masks[n - 1];
- r = bytes[0] & mask;
- for (let i = 1z; i < len(bytes); i += 1) {
- r <<= 6;
- r |= bytes[i] & 0x3F;
+ let r = leader: u32;
+ if (n > 1) {
+ for (let i = 1z; i < n; i += 1) {
+ let byte = d.src[d.offs + i];
+ if ((byte & 0xC0) != 0x80) {
+ // Bad continuation byte
+ return invalid;
+ };
+ r <<= 6;
+ r |= byte & 0x3F;
+ };
+ if (r >= 0xD800 && r <= 0xDFFF) {
+ // UTF-16 surrogates
+ return invalid;
+ };
+ if (runesz(r: rune) != n) {
+ // Overlong encoding
+ return invalid;
+ };
};
+ d.offs += n;
return r: rune;
};
@@ -69,30 +81,44 @@ export fn prev(d: *decoder) (rune | void | more | invalid) = {
return;
};
- let n = 0z;
let r = 0u32;
-
+ let n = 0z;
for (let i = 0z; i < d.offs; i += 1) {
- if ((d.src[d.offs - i - 1] & 0xC0) == 0x80) {
- let tmp: u32 = d.src[d.offs - i - 1] & 0x3F;
- r |= tmp << (i * 6): u32;
+ let byte = d.src[d.offs - i - 1];
+ if ((byte & 0xC0) == 0x80) {
+ if (i == 3) {
+ // Too many continuation bytes in a row
+ return invalid;
+ };
+ byte &= 0x3F;
+ r |= byte << (i * 6): u32;
} else {
- n = i + 1;
- let tmp: u32 = d.src[d.offs - i - 1] & masks[i];
- r |= tmp << (i * 6): u32;
+ const nl = decode_leader(byte)?;
+ n = nl.0;
+ if (i + 1 != n) {
+ // Trailing continuation bytes
+ return invalid;
+ };
+ r |= nl.1 << (i * 6): u32;
break;
};
};
+
if (n == 0) {
return more;
+ } else if (n > 1) {
+ if (r >= 0xD800 && r <= 0xDFFF) {
+ // UTF-16 surrogates
+ return invalid;
+ };
+ if (runesz(r: rune) != n) {
+ // Overlong encoding
+ return invalid;
+ };
};
+
d.offs -= n;
- match (utf8sz(d.src[d.offs])) {
- case let z: size =>
- return if (n == z) r: rune else invalid;
- case void =>
- return invalid;
- };
+ return r: rune;
};
@test fn decode() void = {
@@ -122,7 +148,6 @@ export fn prev(d: *decoder) (rune | void | more | invalid) = {
};
assert(prev(&decoder) is void);
- // TODO: Test more invalid sequences
const inv: [_]u8 = [0xA0, 0xA1];
decoder = decode(inv);
assert(next(&decoder) is invalid);
@@ -134,6 +159,27 @@ export fn prev(d: *decoder) (rune | void | more | invalid) = {
assert(next(&decoder) is more);
decoder.offs = 2;
assert(prev(&decoder) is invalid);
+
+ const surrogate: [_]u8 = [0xED, 0xA0, 0x80];
+ decoder = decode(surrogate);
+ assert(next(&decoder) is invalid);
+ decoder.offs = 3;
+ assert(prev(&decoder) is invalid);
+
+ const overlong: [_]u8 = [0xF0, 0x82, 0x82, 0xAC];
+ decoder = decode(overlong);
+ assert(next(&decoder) is invalid);
+ decoder.offs = 4;
+ assert(prev(&decoder) is invalid);
+
+ const badcont: [_]u8 = [0xC2, 0xFF];
+ decoder = decode(badcont);
+ assert(next(&decoder) is invalid);
+
+ const extracont: [_]u8 = [0xC2, 0xA3, 0x95];
+ decoder = decode(extracont);
+ decoder.offs = 3;
+ assert(prev(&decoder) is invalid);
};
// Returns true if a given string or byte slice contains only valid UTF-8
diff --git a/encoding/utf8/rune.ha b/encoding/utf8/rune.ha
@@ -1,8 +1,5 @@
// License: MPL-2.0
// (c) 2021 Drew DeVault <sir@cmpwn.com>
-use types;
-
-const masks: [_]u8 = [0x7F, 0x1F, 0x0F, 0x07, 0x03, 0x01];
type rsize = struct {
mask: u8,