commit 8d8c605613c577a876b7889592eff6eb3fac2f77
parent 3ab69790dd41093a2a3ae7e35a7497f9b3f51c88
Author: Drew DeVault <sir@cmpwn.com>
Date: Mon, 1 Feb 2021 16:27:49 -0500
encoding::utf8: new module
Diffstat:
2 files changed, 100 insertions(+), 0 deletions(-)
diff --git a/encoding/utf8/decode.ha b/encoding/utf8/decode.ha
@@ -0,0 +1,69 @@
+use strings;
+use types;
+
+// The state for the UTF-8 decoder. Obtain one from decode or decode_bytes and
+// pass it to next to read runes one at a time.
+export type decoder = struct {
+ // Index into src of the first byte that has not been decoded yet.
+ offs: size,
+ // The UTF-8 encoded input being decoded.
+ src: []u8,
+};
+
+// Initializes a new UTF-8 decoder for a string. Decoding starts at the first
+// byte of the slice returned by strings::to_utf8 for that string.
+export fn decode(src: str) decoder = {
+	const bytes = strings::to_utf8(src);
+	return decoder {
+		offs = 0z,
+		src = bytes,
+	};
+};
+
+// Creates a UTF-8 decoder that reads from the given byte slice, starting at
+// its first byte. The slice is referenced by the decoder, not copied.
+export fn decode_bytes(src: []u8) decoder = {
+	return decoder {
+		offs = 0z,
+		src = src,
+	};
+};
+
+// Returned by next when the input ends part-way through a multi-byte UTF-8
+// sequence, i.e. more data is needed before the rune can be decoded.
+export type more = void;
+
+// Returns the next rune from a decoder. If the end of the input has been
+// reached, void is returned. If the input ends part-way through a UTF-8
+// sequence, more is returned. If an invalid sequence is encountered (a byte
+// which cannot begin a sequence, or a malformed continuation byte), invalid is
+// returned. The decoder only advances past a sequence when a rune is returned;
+// void, more and invalid leave its position unchanged.
+export fn next(d: *decoder) (rune | void | more | invalid) = {
+	if (d.offs >= len(d.src)) {
+		return void;
+	};
+
+	// XXX: It would be faster if we decoded and measured at the same time.
+	const n = utf8sz(d.src[d.offs..]);
+	if (n == types::SIZE_MAX) {
+		return invalid;
+	} else if (d.offs + n > len(d.src)) {
+		return more;
+	};
+	let bytes = d.src[d.offs..d.offs+n];
+
+	if (bytes[0] < 128u8) {
+		// ASCII
+		d.offs += n;
+		return bytes[0]: u32: rune;
+	};
+
+	// The first byte carries the high bits of the code point; the mask
+	// for an n-octet sequence strips its length marker bits.
+	const mask = masks[n - 1z];
+	let cp: u32 = bytes[0] & mask;
+	for (let i = 1z; i < len(bytes); i += 1z) {
+		// Every byte after the first must have the form 10xxxxxx.
+		// Reject the sequence before consuming it otherwise.
+		if ((bytes[i] & 0xC0u8) != 0x80u8) {
+			return invalid;
+		};
+		cp <<= 6u8;
+		cp |= bytes[i] & 0x3Fu8;
+	};
+
+	// Only consume the sequence once it is known to be well-formed.
+	d.offs += n;
+	return cp: rune;
+};
+
+// Determines the length, in octets, of the UTF-8 sequence which begins with
+// the first byte of src, by matching that byte against the sizes table in
+// order. Returns types::SIZE_MAX if the matching entry says so or if no entry
+// matches. src must not be empty.
+fn utf8sz(src: []u8) size = {
+	// Sanity check: the table is expected to start with the one-octet
+	// entry.
+	assert(sizes[0].octets == 1z);
+
+	assert(len(src) > 0z);
+	const lead = src[0];
+	for (let idx = 0z; idx < len(sizes); idx += 1z) {
+		if (lead & sizes[idx].mask == sizes[idx].result) {
+			return sizes[idx].octets;
+		};
+	};
+	return types::SIZE_MAX;
+};
diff --git a/encoding/utf8/rune.ha b/encoding/utf8/rune.ha
@@ -0,0 +1,31 @@
+use types;
+
+// An error indicating that an invalid UTF-8 sequence was found. Returned by
+// next when the input cannot be decoded.
+export type invalid = void;
+
+// Bit masks applied to the first byte of an encoded sequence to extract the
+// code point bits it carries. The mask for an n-octet sequence is stored at
+// index n - 1.
+const masks: [_]u8 = [0x7Fu8, 0x1Fu8, 0x0Fu8, 0x07u8, 0x03u8, 0x01u8];
+
+// Describes how to recognise the first byte of an encoded sequence: a byte b
+// matches an entry when b & mask == result, in which case the sequence is
+// octets bytes long.
+type rsize = struct {
+ // Selects the bits of the first byte that are compared.
+ mask: u8,
+ // The value the selected bits must equal for this entry to match.
+ result: u8,
+ // Sequence length reported for a matching first byte.
+ octets: size,
+};
+
+// First-byte classification table, scanned in order by utf8sz; the first
+// entry whose mask/result pair matches the byte determines the sequence
+// length. The final entry catches bytes of the form 10xxxxxx (continuation
+// bytes) and any other byte with the high bit set that matched nothing above,
+// and maps them to types::SIZE_MAX.
+const sizes: [_]rsize = [
+	rsize { mask = 0x80u8, result = 0x00u8, octets = 1z }, // 0xxxxxxx
+	rsize { mask = 0xE0u8, result = 0xC0u8, octets = 2z }, // 110xxxxx
+	rsize { mask = 0xF0u8, result = 0xE0u8, octets = 3z }, // 1110xxxx
+	rsize { mask = 0xF8u8, result = 0xF0u8, octets = 4z }, // 11110xxx
+	rsize { mask = 0xFCu8, result = 0xF8u8, octets = 5z }, // 111110xx
+	// 1111110x. The result must be 0xFC: with 0xF8 this entry could only
+	// match bytes already claimed by the 5-octet entry above, leaving it
+	// unreachable.
+	rsize { mask = 0xFEu8, result = 0xFCu8, octets = 6z },
+	rsize { mask = 0x80u8, result = 0x80u8, octets = types::SIZE_MAX },
+];
+
+// Computes how many octets are needed to encode the rune r as UTF-8: 1 for
+// code points below 0x80, 2 below 0x800, 3 below 0x10000, and 4 otherwise.
+export fn runesz(r: rune) size = {
+	const cp = r: u32;
+	if (cp < 0x80u32) {
+		return 1z;
+	};
+	if (cp < 0x800u32) {
+		return 2z;
+	};
+	if (cp < 0x10000u32) {
+		return 3z;
+	};
+	return 4z;
+};