hare

The Hare programming language
git clone https://git.torresjrjr.com/hare.git
Log | Files | Refs | README | LICENSE

commit 2f2a662f44bc948871ddaa2bea6f74da9a3ecbb2
parent a9a776a5d05269ec4d61472aa68226b1b53f3c45
Author: Bor Grošelj Simić <bor.groseljsimic@telemach.net>
Date:   Mon, 22 Feb 2021 20:46:17 +0100

encoding::utf8: implement backwards decoder

Diffstat:
M encoding/utf8/decode.ha | 49 +++++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 47 insertions(+), 2 deletions(-)

diff --git a/encoding/utf8/decode.ha b/encoding/utf8/decode.ha
@@ -23,9 +23,10 @@ export type invalid = void;
 
 // Returns the next rune from a decoder. If the slice ends with a complete UTF-8
 // sequence, void is returned. If an incomplete sequence is encountered, more is
-// returned. And if an invalid sequence is encountered, invalid returned.
+// returned. And if an invalid sequence is encountered, invalid is returned.
 export fn next(d: *decoder) (rune | void | more | invalid) = {
-	if (d.offs >= len(d.src)) {
+	assert(d.offs <= len(d.src));
+	if (d.offs == len(d.src)) {
 		return;
 	};
 
@@ -54,6 +55,38 @@ export fn next(d: *decoder) (rune | void | more | invalid) = {
 	return r: rune;
 };
 
+// Returns the previous rune from a decoder. If the slice starts with a complete UTF-8
+// sequence, void is returned. If an incomplete sequence is encountered, more is
+// returned. And if an invalid sequence is encountered, invalid is returned.
+export fn prev(d: *decoder) (rune | void | more | invalid) = {
+	if (d.offs == 0) {
+		return;
+	};
+
+	let n = 0z;
+	let r = 0u32;
+
+	for (let i = 0z; i < d.offs; i += 1) {
+		if ((d.src[d.offs - i - 1] & 0xC0) == 0x80) {
+			let tmp: u32 = d.src[d.offs - i - 1] & 0x3F;
+			r |= tmp << (i * 6): u32;
+		} else {
+			n = i + 1;
+			let tmp: u32 = d.src[d.offs - i - 1] & masks[i];
+			r |= tmp << (i * 6): u32;
+			break;
+		};
+	};
+	if (n == 0) {
+		return more;
+	};
+	d.offs -= n;
+	if (n != utf8sz(d.src[d.offs])) {
+		return invalid;
+	};
+	return r: rune;
+};
+
 @test fn decode() void = {
 	const input: [_]u8 = [
 		0xE3, 0x81, 0x93, 0xE3, 0x82, 0x93, 0xE3, 0x81,
@@ -68,15 +101,27 @@ export fn next(d: *decoder) (rune | void | more | invalid) = {
 		};
 	};
 	assert(next(&decoder) is void);
+	assert(decoder.offs == len(decoder.src));
+	for (let i = 0z; i < len(expected); i += 1) {
+		match (prev(&decoder)) {
+			(invalid | more | void ) => abort(),
+			r: rune => assert(r == expected[len(expected) - i - 1]),
+		};
+	};
+	assert(prev(&decoder) is void);
 
 	// TODO: Test more invalid sequences
 	const invalid: [_]u8 = [0xA0, 0xA1];
 	decoder = decode(invalid);
 	assert(next(&decoder) is invalid);
+	decoder.offs = 2;
+	assert(prev(&decoder) is more);
 
 	const incomplete: [_]u8 = [0xE3, 0x81];
 	decoder = decode(incomplete);
 	assert(next(&decoder) is more);
+	decoder.offs = 2;
+	assert(prev(&decoder) is invalid);
 };
 
 // Returns true if a given string or byte slice contains only valid UTF-8