commit 8e99e59ae9f1e41efa432bfab6d2818c954bab07
parent c66756b8e192480a8d837f60fcd4dc9a57aeac79
Author: Drew DeVault <sir@cmpwn.com>
Date: Wed, 10 Feb 2021 14:58:51 -0500
io: add io::getrune
Diffstat:
3 files changed, 84 insertions(+), 6 deletions(-)
diff --git a/encoding/utf8/decode.ha b/encoding/utf8/decode.ha
@@ -30,7 +30,7 @@ export fn next(d: *decoder) (rune | void | more | invalid) = {
};
// XXX: It would be faster if we decoded and measured at the same time.
- const n = utf8sz(d.src[d.offs..]);
+ const n = utf8sz(d.src[d.offs]);
if (n == types::SIZE_MAX) {
return invalid;
} else if (d.offs + n > len(d.src)) {
@@ -97,15 +97,12 @@ export fn valid(src: (str | []u8)) bool = {
abort();
};
-fn utf8sz(src: []u8) size = {
- assert(len(src) > 0);
-
- let c = src[0];
+// Returns the expected length in bytes of the UTF-8 sequence whose leading
+// byte is c, found by testing c against each entry of the sizes table.
+// Returns types::SIZE_MAX if c matches no entry.
+export fn utf8sz(c: u8) size = {
for (let i = 0z; i < len(sizes); i += 1) {
if (c & sizes[i].mask == sizes[i].result) {
return sizes[i].octets;
};
};
-
return types::SIZE_MAX;
};
diff --git a/io/+test/strings.ha b/io/+test/strings.ha
@@ -0,0 +1,45 @@
+use encoding::utf8;
+use rt;
+
+// A test stream whose reads are served from an in-memory byte buffer by
+// bs_read.
+type bufstream = struct {
+ // Embedded stream; bs_read casts a pointer to it back to *bufstream.
+ stream: stream,
+ // Bytes that have not yet been handed out by bs_read.
+ buf: []u8,
+};
+
+// Feeds five three-byte UTF-8 characters followed by a NUL byte through a
+// bufstream and checks that getrune yields each expected rune in order,
+// then EOF.
+@test fn getrune() void = {
+ let bs = bufstream {
+ stream = stream {
+ name = "buffer",
+ reader = &bs_read,
+ ...
+ },
+ buf = [
+ 0xE3, 0x81, 0x93, 0xE3, 0x82, 0x93, 0xE3, 0x81,
+ 0xAB, 0xE3, 0x81, 0xA1, 0xE3, 0x81, 0xAF, 0x00,
+ ],
+ };
+ let in = &bs.stream;
+ const expected: [_](rune | utf8::invalid | EOF | error) = [
+ 'こ', 'ん', 'に', 'ち', 'は', '\0', EOF,
+ ];
+ for (let i = 0z; i < len(expected); i += 1) {
+ let want = expected[i];
+ match (getrune(in)) {
+ r: rune => assert(want is rune && want as rune == r),
+ EOF => assert(want is EOF),
+ // Any other result (utf8::invalid or error) fails the test.
+ * => abort(),
+ };
+ };
+};
+
+// Reader for bufstream: copies as many bytes as fit from the front of the
+// remaining buffer into buf, drops them from the remaining buffer, and
+// returns the number of bytes copied. Returns EOF once the buffer is empty.
+fn bs_read(s: *stream, buf: []u8) (size | error | EOF) = {
+ // Cast the stream pointer back to the bufstream that embeds it as its
+ // first field.
+ let stream = s: *bufstream;
+ if (len(stream.buf) == 0) {
+ return EOF;
+ };
+ // Copy no more than the smaller of the two buffer lengths.
+ const n = if (len(buf) > len(stream.buf)) len(stream.buf) else len(buf);
+ // TODO: Fix me up once slice copying is in
+ rt::memcpy(buf: *[*]u8, stream.buf: *[*]u8, n);
+ // Advance past the bytes that were just handed out.
+ stream.buf = stream.buf[n..];
+ return n;
+};
diff --git a/io/strings.ha b/io/strings.ha
@@ -0,0 +1,36 @@
+use encoding::utf8;
+use types;
+
+// TODO: Do we want some kind of io::text_stream?
+
+// Reads a single rune from a UTF-8 stream.
+//
+// The first byte is read on its own and passed to utf8::utf8sz to learn how
+// long the encoded sequence is; the remaining bytes are then read and the
+// whole sequence is decoded. Returns utf8::invalid if the leading byte or the
+// decoded sequence is not valid UTF-8, EOF if the stream ends before a
+// complete sequence has been read, or any error reported by the underlying
+// read.
+export fn getrune(in: *io::stream) (rune | utf8::invalid | EOF | error) = {
+	let b: [4]u8 = [0...];
+	match (read(in, b[..1])) {
+		n: size => assert(n == 1),
+		err: error => return err,
+		EOF => return EOF,
+	};
+
+	const sz = utf8::utf8sz(b[0]);
+	if (sz == types::SIZE_MAX) {
+		return utf8::invalid;
+	};
+
+	// Single-byte sequences need no further reads or decoding.
+	if (sz == 1) {
+		return b[0]: u32: rune;
+	};
+
+	// A stream may legitimately return fewer bytes than requested, so keep
+	// reading until the whole sequence has arrived instead of asserting
+	// that a single read delivers all of it.
+	let nread = 1z;
+	for (nread < sz) {
+		match (read(in, b[nread..sz])) {
+			n: size => {
+				nread += n;
+			},
+			err: error => return err,
+			EOF => return EOF,
+		};
+	};
+
+	let dec = utf8::decode(b[..sz]);
+	return match (utf8::next(&dec)) {
+		r: rune => r,
+		utf8::invalid => utf8::invalid,
+		(void | utf8::more) => EOF,
+	};
+};