hare

The Hare programming language
git clone https://git.torresjrjr.com/hare.git
Log | Files | Refs | README | LICENSE

commit 8d8c605613c577a876b7889592eff6eb3fac2f77
parent 3ab69790dd41093a2a3ae7e35a7497f9b3f51c88
Author: Drew DeVault <sir@cmpwn.com>
Date:   Mon,  1 Feb 2021 16:27:49 -0500

encoding::utf8: new module

Diffstat:
A encoding/utf8/decode.ha | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A encoding/utf8/rune.ha | 31 +++++++++++++++++++++++++++++++
2 files changed, 100 insertions(+), 0 deletions(-)

diff --git a/encoding/utf8/decode.ha b/encoding/utf8/decode.ha
@@ -0,0 +1,84 @@
+use strings;
+use types;
+
+// The state for the UTF-8 decoder.
+export type decoder = struct {
+	offs: size,
+	src: []u8,
+};
+
+// Initializes a new UTF-8 decoder for a string, reading the bytes that
+// strings::to_utf8 returns for it.
+export fn decode(src: str) decoder = decoder {
+	offs = 0z,
+	src = strings::to_utf8(src),
+};
+
+// Initializes a new UTF-8 decoder for a byte slice.
+export fn decode_bytes(src: []u8) decoder = decoder {
+	offs = 0z,
+	src = src,
+};
+
+// Indicates that more data is needed, or that a partial UTF-8 sequence was
+// encountered.
+export type more = void;
+
+// Returns the next rune from a decoder. If the decoder has already consumed
+// all of its input, void is returned. If the input ends partway through a
+// sequence, more is returned. If the leading byte cannot begin a sequence, or
+// a following byte is not a continuation byte, invalid is returned. The
+// decoder only advances past a sequence when a rune is returned for it.
+export fn next(d: *decoder) (rune | void | more | invalid) = {
+	if (d.offs >= len(d.src)) {
+		return void;
+	};
+
+	// XXX: It would be faster if we decoded and measured at the same time.
+	const n = utf8sz(d.src[d.offs..]);
+	if (n == types::SIZE_MAX) {
+		return invalid;
+	} else if (d.offs + n > len(d.src)) {
+		return more;
+	};
+	const bytes = d.src[d.offs..d.offs + n];
+
+	if (bytes[0] < 128u8) {
+		// ASCII
+		d.offs += n;
+		return bytes[0]: u32: rune;
+	};
+
+	// The leading byte holds the high bits of the code point; the mask for
+	// an n-octet sequence strips the length prefix from it.
+	let cp: u32 = bytes[0] & masks[n - 1z];
+	for (let i = 1z; i < len(bytes); i += 1z) {
+		// Every byte after the first must have the form 10xxxxxx.
+		// Check before consuming so a bad sequence is not decoded into
+		// a bogus rune.
+		if ((bytes[i] & 0xC0u8) != 0x80u8) {
+			return invalid;
+		};
+		cp <<= 6u8;
+		cp |= bytes[i] & 0x3Fu8;
+	};
+	d.offs += n;
+	return cp: rune;
+};
+
+// Returns the length in octets of the UTF-8 sequence introduced by the first
+// byte of src, or types::SIZE_MAX if that byte cannot begin a sequence. src
+// must not be empty.
+fn utf8sz(src: []u8) size = {
+	// The table is expected to list the single-octet (ASCII) class first.
+	assert(sizes[0].octets == 1z);
+	assert(len(src) > 0z);
+
+	const c = src[0];
+	for (let i = 0z; i < len(sizes); i += 1z) {
+		if ((c & sizes[i].mask) == sizes[i].result) {
+			return sizes[i].octets;
+		};
+	};
+	return types::SIZE_MAX;
+};
diff --git a/encoding/utf8/rune.ha b/encoding/utf8/rune.ha
@@ -0,0 +1,41 @@
+use types;
+
+// An error indicating that an invalid UTF-8 sequence was found.
+export type invalid = void;
+
+// masks[n - 1] selects the code point bits held in the leading byte of an
+// n-octet sequence.
+const masks: [_]u8 = [0x7Fu8, 0x1Fu8, 0x0Fu8, 0x07u8, 0x03u8, 0x01u8];
+
+// A class of leading byte: a byte b is in the class when b & mask == result,
+// and it then introduces a sequence of the given number of octets.
+type rsize = struct {
+	mask: u8,
+	result: u8,
+	octets: size,
+};
+
+// Leading byte classes, checked in order. The 5- and 6-octet forms come from
+// the original UTF-8 definition (RFC 2279); RFC 3629 later restricted UTF-8
+// to 4 octets. The final entry catches every remaining byte with the high
+// bit set (continuation bytes, 0xFE and 0xFF), none of which can begin a
+// sequence.
+const sizes: [_]rsize = [
+	rsize { mask = 0x80u8, result = 0x00u8, octets = 1z },
+	rsize { mask = 0xE0u8, result = 0xC0u8, octets = 2z },
+	rsize { mask = 0xF0u8, result = 0xE0u8, octets = 3z },
+	rsize { mask = 0xF8u8, result = 0xF0u8, octets = 4z },
+	rsize { mask = 0xFCu8, result = 0xF8u8, octets = 5z },
+	rsize { mask = 0xFEu8, result = 0xFCu8, octets = 6z },
+	rsize { mask = 0x80u8, result = 0x80u8, octets = types::SIZE_MAX },
+];
+
+// Returns the size of a rune, in octets, when encoded as UTF-8. Every code
+// point at or above 0x10000 is reported as 4 octets.
+export fn runesz(r: rune) size = {
+	const ch = r: u32;
+	return if (ch < 0x80u32) 1z
+		else if (ch < 0x800u32) 2z
+		else if (ch < 0x10000u32) 3z
+		else 4z;
+};