commit 4e47d429b45c0431b75c34804eaded1029a35e74
parent daff63ea85e347be3fde00292d4ff7f5a433a737
Author: Sebastian <sebastian@sebsite.pw>
Date: Mon, 27 Nov 2023 01:18:05 -0500
encoding::utf8: operate exclusively on byte slices
Previously, utf8::decode and utf8::valid accepted (str | []u8). This has
been changed, so they now only accept []u8.
utf8::decode with a string operand duplicates the functionality of
strings::iter.
utf8::valid only makes sense on byte slices, since str can't hold
invalid UTF-8. If code really wants to check if the contents of a string
are valid, it's simple enough to just call strings::toutf8 on the str
first.
Signed-off-by: Sebastian <sebastian@sebsite.pw>
Diffstat:
5 files changed, 15 insertions(+), 32 deletions(-)
diff --git a/encoding/utf8/decode.ha b/encoding/utf8/decode.ha
@@ -1,23 +1,14 @@
// SPDX-License-Identifier: MPL-2.0
// (c) Hare authors <https://harelang.org>
-fn toutf8(in: str) []u8 = *(&in: *[]u8);
-
-fn fromtagged(in: (str | []u8)) []u8 = match (in) {
-case let s: str =>
- return toutf8(s);
-case let b: []u8 =>
- return b;
-};
-
export type decoder = struct {
offs: size,
src: []u8,
};
// Initializes a new UTF-8 decoder.
-export fn decode(src: (str | []u8)) decoder = decoder {
- src = fromtagged(src),
+export fn decode(src: []u8) decoder = decoder {
+ src = src,
offs = 0,
};
@@ -161,11 +152,8 @@ export fn prev(d: *decoder) (rune | void | more | invalid) = {
assert(prev(&decoder) is invalid);
};
-// Returns true if a given string or byte slice contains only valid UTF-8
-// sequences. Note that Hare strings (str) are always valid UTF-8 - if this
-// returns false for a str type, something funny is going on.
-export fn valid(src: (str | []u8)) bool = {
- let src = fromtagged(src);
+// Returns true if a given byte slice contains only valid UTF-8 sequences.
+export fn valid(src: []u8) bool = {
let state = 0;
for (let i = 0z; i < len(src) && state >= 0; i += 1) {
state = table[state][src[i]];
diff --git a/hare/parse/+test/loc.ha b/hare/parse/+test/loc.ha
@@ -2,7 +2,6 @@
// (c) Hare authors <https://harelang.org>
use bufio;
-use encoding::utf8;
use fmt;
use hare::ast;
use hare::lex;
@@ -26,11 +25,8 @@ fn expr_testloc(srcs: str...) void = for (let i = 0z; i < len(srcs); i += 1) {
};
defer ast::expr_finish(&exp);
let runes = 0z;
- let d = utf8::decode(srcs[i]);
- for (true) match (utf8::next(&d)!) {
- case void =>
- break;
- case rune =>
+ let it = strings::iter(srcs[i]);
+ for (strings::next(&it) is rune) {
runes += 1;
};
assert(exp.start.line == 1 && exp.start.col == 1);
@@ -117,11 +113,8 @@ fn type_testloc(srcs: str...) void = for (let i = 0z; i < len(srcs); i += 1) {
};
defer ast::type_finish(&typ);
let runes = 0z;
- let d = utf8::decode(srcs[i]);
- for (true) match (utf8::next(&d)!) {
- case void =>
- break;
- case rune =>
+ let it = strings::iter(srcs[i]);
+ for (strings::next(&it) is rune) {
runes += 1;
};
assert(typ.start.line == 1 && typ.start.col == 1);
diff --git a/strings/iter.ha b/strings/iter.ha
@@ -22,7 +22,7 @@ export type iterator = struct {
// strings::next(&dup); // '!'
// strings::next(&dup); // void
export fn iter(src: str) iterator = iterator {
- dec = utf8::decode(src),
+ dec = utf8::decode(toutf8(src)),
reverse = false,
};
@@ -30,7 +30,7 @@ export fn iter(src: str) iterator = iterator {
// backwards with each call to [[next]].
export fn riter(src: str) iterator = {
let ret = iterator {
- dec = utf8::decode(src),
+ dec = utf8::decode(toutf8(src)),
reverse = true,
};
ret.dec.offs = len(src);
diff --git a/strings/utf8.ha b/strings/utf8.ha
@@ -21,7 +21,7 @@ export fn fromutf8_unsafe(in: []u8) str = {
// [[encoding::utf8::invalid]] is returned instead.
export fn fromutf8(in: []u8) (str | utf8::invalid) = {
let s = fromutf8_unsafe(in);
- if (!utf8::valid(s)) {
+ if (!utf8::valid(in)) {
return utf8::invalid;
};
return s;
diff --git a/types/c/strings.ha b/types/c/strings.ha
@@ -44,8 +44,10 @@ export fn tostr(cstr: *const char) (const str | utf8::invalid) = {
// Converts a C string with a given length to a Hare string. If the string is
// not valid UTF-8, return [[encoding::utf8::invalid]].
export fn tostrn(cstr: *const char, length: size) (const str | utf8::invalid) = {
- let s = tostrn_unsafe(cstr, length);
- return if (utf8::valid(s)) s else utf8::invalid;
+ if (!utf8::valid((cstr: *[*]u8)[..length])) {
+ return utf8::invalid;
+ };
+ return tostrn_unsafe(cstr, length);
};
// Converts a Hare string to a C string. The result is allocated; the caller