commit f45e052d2c7e6e8792a8de6e0ef8f19cb723a78f
parent 503a2f4dd116a7babe1d400a5ad841461a0efa17
Author: Bor Grošelj Simić <bgs@turminal.net>
Date: Fri, 3 Feb 2023 04:43:23 +0100
use improved forward decode in backward decode
Signed-off-by: Bor Grošelj Simić <bgs@turminal.net>
Diffstat:
1 file changed, 11 insertions(+), 47 deletions(-)
diff --git a/encoding/utf8/decode.ha b/encoding/utf8/decode.ha
@@ -25,23 +25,11 @@ export type more = void;
// Returned when an invalid UTF-8 sequence was found.
export type invalid = !void;
-const leader_masks: [_]u8 = [0x7F, 0x3F, 0x1F, 0x0F];
-
const masks: [2][8]u8 = [
[0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f],
[0x7f, 0x1f, 0x0f, 0x0f, 0x0f, 0x07, 0x07, 0x07],
];
-fn decode_leader(c: u8) ((size, u8) | invalid) = {
- for (let i = 0z; i < len(sizes); i += 1) {
- if (c & sizes[i].mask == sizes[i].result) {
- return (sizes[i].octets, c & leader_masks[i]);
- };
- };
- // Bad leading byte
- return invalid;
-};
-
// Returns the next rune from a decoder. void is returned when there are no
// remaining codepoints.
export fn next(d: *decoder) (rune | void | more | invalid) = {
@@ -73,45 +61,21 @@ export fn prev(d: *decoder) (rune | void | more | invalid) = {
if (d.offs == 0) {
return;
};
-
- let r = 0u32;
- let n = 0z;
- for (let i = 0z; i < d.offs; i += 1) {
- let byte = d.src[d.offs - i - 1];
- if ((byte & 0xC0) == 0x80) {
- if (i == 3) {
- // Too many continuation bytes in a row
- return invalid;
- };
- byte &= 0x3F;
- r |= byte << (i * 6): u32;
- } else {
- const nl = decode_leader(byte)?;
- n = nl.0;
- if (i + 1 != n) {
- // Trailing continuation bytes
- return invalid;
- };
- r |= nl.1 << (i * 6): u32;
- break;
+ let n = d.offs;
+ d.offs -= 1;
+ for (d.offs < len(d.src); d.offs -= 1) {
+ if (table[0][d.src[d.offs]] != -1) {
+ let t = d.offs;
+ defer d.offs = t;
+ let r = next(d);
+ return if (n != d.offs || r is more) invalid else r;
};
- };
-
- if (n == 0) {
- return more;
- } else if (n > 1) {
- if (r >= 0xD800 && r <= 0xDFFF) {
- // UTF-16 surrogates
- return invalid;
- };
- if (runesz(r: rune) != n) {
- // Overlong encoding
+ if (n - d.offs == 4) {
+ // Too many continuation bytes in a row
return invalid;
};
};
-
- d.offs -= n;
- return r: rune;
+ return more;
};
@test fn decode() void = {