hare

[hare] The Hare programming language
git clone https://git.torresjrjr.com/hare.git
Log | Files | Refs | README | LICENSE

commit 5740e48104f9e3d18f62e599af4cb22ffe6531a2
parent f45e052d2c7e6e8792a8de6e0ef8f19cb723a78f
Author: Bor Grošelj Simić <bgs@turminal.net>
Date:   Fri,  3 Feb 2023 04:43:24 +0100

rewrite encoding::utf8::valid using the new decoder table

Signed-off-by: Bor Grošelj Simić <bgs@turminal.net>

Diffstat:
M encoding/utf8/decode.ha | 39 ++++++++++++++++++++++-----------------
1 file changed, 22 insertions(+), 17 deletions(-)

diff --git a/encoding/utf8/decode.ha b/encoding/utf8/decode.ha @@ -5,17 +5,22 @@ fn toutf8(in: str) []u8 = *(&in: *[]u8); +fn fromtagged(in: (str | []u8)) []u8 = match (in) { +case let s: str => + return toutf8(s); +case let b: []u8 => + return b; +}; + export type decoder = struct { offs: size, src: []u8, }; // Initializes a new UTF-8 decoder. -export fn decode(src: (str | []u8)) decoder = match (src) { -case let s: str => - yield decoder { src = toutf8(s), ... }; -case let b: []u8 => - yield decoder { src = b, ... }; +export fn decode(src: (str | []u8)) decoder = decoder { + src = fromtagged(src), + offs = 0, }; // Returned when more data is needed, i.e. when an incomplete UTF-8 sequence is @@ -83,6 +88,7 @@ export fn prev(d: *decoder) (rune | void | more | invalid) = { 0xE3, 0x81, 0x93, 0xE3, 0x82, 0x93, 0xE3, 0x81, 0xAB, 0xE3, 0x81, 0xA1, 0xE3, 0x81, 0xAF, 0x00, ]; + assert(valid(input)); const expected = ['こ', 'ん', 'に', 'ち', 'は', '\0']; let decoder = decode(input); for (let i = 0z; i < len(expected); i += 1) { @@ -110,33 +116,39 @@ export fn prev(d: *decoder) (rune | void | more | invalid) = { assert(next(&decoder) is invalid); decoder.offs = 2; assert(prev(&decoder) is more); + assert(!valid(inv)); const incomplete: [_]u8 = [0xE3, 0x81]; decoder = decode(incomplete); assert(next(&decoder) is more); decoder.offs = 2; assert(prev(&decoder) is invalid); + assert(!valid(incomplete)); const surrogate: [_]u8 = [0xED, 0xA0, 0x80]; decoder = decode(surrogate); assert(next(&decoder) is invalid); decoder.offs = 3; assert(prev(&decoder) is invalid); + assert(!valid(surrogate)); const overlong: [_]u8 = [0xF0, 0x82, 0x82, 0xAC]; decoder = decode(overlong); assert(next(&decoder) is invalid); decoder.offs = 4; assert(prev(&decoder) is invalid); + assert(!valid(overlong)); const badcont: [_]u8 = [0xC2, 0xFF]; decoder = decode(badcont); assert(next(&decoder) is invalid); + assert(!valid(badcont)); const extracont: [_]u8 = [0xC2, 0xA3, 0x95]; decoder = decode(extracont); 
decoder.offs = 3; assert(prev(&decoder) is invalid); + assert(!valid(extracont)); const regression: []u8 = [0xf5, 0x94, 0x80, 0x80]; assert(!valid(regression)); @@ -146,17 +158,10 @@ export fn prev(d: *decoder) (rune | void | more | invalid) = { // sequences. Note that Hare strings (str) are always valid UTF-8 - if this // returns false for a str type, something funny is going on. export fn valid(src: (str | []u8)) bool = { - let decoder = decode(src); - for (true) { - match (next(&decoder)) { - case void => - return true; - case invalid => - return false; - case more => - return false; - case rune => void; - }; + let src = fromtagged(src); + let state = 0; + for (let i = 0z; i < len(src) && state >= 0; i += 1) { + state = table[state][src[i]]; }; - abort(); + return state == 0; };