commit 5740e48104f9e3d18f62e599af4cb22ffe6531a2
parent f45e052d2c7e6e8792a8de6e0ef8f19cb723a78f
Author: Bor Grošelj Simić <bgs@turminal.net>
Date: Fri, 3 Feb 2023 04:43:24 +0100
rewrite encoding::utf8::valid using the new decoder table
Signed-off-by: Bor Grošelj Simić <bgs@turminal.net>
Diffstat:
1 file changed, 22 insertions(+), 17 deletions(-)
diff --git a/encoding/utf8/decode.ha b/encoding/utf8/decode.ha
@@ -5,17 +5,22 @@
fn toutf8(in: str) []u8 = *(&in: *[]u8);
+fn fromtagged(in: (str | []u8)) []u8 = match (in) {
+case let s: str =>
+ return toutf8(s);
+case let b: []u8 =>
+ return b;
+};
+
export type decoder = struct {
offs: size,
src: []u8,
};
// Initializes a new UTF-8 decoder.
-export fn decode(src: (str | []u8)) decoder = match (src) {
-case let s: str =>
- yield decoder { src = toutf8(s), ... };
-case let b: []u8 =>
- yield decoder { src = b, ... };
+export fn decode(src: (str | []u8)) decoder = decoder {
+ src = fromtagged(src),
+ offs = 0,
};
// Returned when more data is needed, i.e. when an incomplete UTF-8 sequence is
@@ -83,6 +88,7 @@ export fn prev(d: *decoder) (rune | void | more | invalid) = {
0xE3, 0x81, 0x93, 0xE3, 0x82, 0x93, 0xE3, 0x81,
0xAB, 0xE3, 0x81, 0xA1, 0xE3, 0x81, 0xAF, 0x00,
];
+ assert(valid(input));
const expected = ['こ', 'ん', 'に', 'ち', 'は', '\0'];
let decoder = decode(input);
for (let i = 0z; i < len(expected); i += 1) {
@@ -110,33 +116,39 @@ export fn prev(d: *decoder) (rune | void | more | invalid) = {
assert(next(&decoder) is invalid);
decoder.offs = 2;
assert(prev(&decoder) is more);
+ assert(!valid(inv));
const incomplete: [_]u8 = [0xE3, 0x81];
decoder = decode(incomplete);
assert(next(&decoder) is more);
decoder.offs = 2;
assert(prev(&decoder) is invalid);
+ assert(!valid(incomplete));
const surrogate: [_]u8 = [0xED, 0xA0, 0x80];
decoder = decode(surrogate);
assert(next(&decoder) is invalid);
decoder.offs = 3;
assert(prev(&decoder) is invalid);
+ assert(!valid(surrogate));
const overlong: [_]u8 = [0xF0, 0x82, 0x82, 0xAC];
decoder = decode(overlong);
assert(next(&decoder) is invalid);
decoder.offs = 4;
assert(prev(&decoder) is invalid);
+ assert(!valid(overlong));
const badcont: [_]u8 = [0xC2, 0xFF];
decoder = decode(badcont);
assert(next(&decoder) is invalid);
+ assert(!valid(badcont));
const extracont: [_]u8 = [0xC2, 0xA3, 0x95];
decoder = decode(extracont);
decoder.offs = 3;
assert(prev(&decoder) is invalid);
+ assert(!valid(extracont));
const regression: []u8 = [0xf5, 0x94, 0x80, 0x80];
assert(!valid(regression));
@@ -146,17 +158,10 @@ export fn prev(d: *decoder) (rune | void | more | invalid) = {
// sequences. Note that Hare strings (str) are always valid UTF-8 - if this
// returns false for a str type, something funny is going on.
export fn valid(src: (str | []u8)) bool = {
- let decoder = decode(src);
- for (true) {
- match (next(&decoder)) {
- case void =>
- return true;
- case invalid =>
- return false;
- case more =>
- return false;
- case rune => void;
- };
+ let src = fromtagged(src);
+ let state = 0;
+ for (let i = 0z; i < len(src) && state >= 0; i += 1) {
+ state = table[state][src[i]];
};
- abort();
+ return state == 0;
};